1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1989, 1993, 1995 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Poul-Henning Kamp of the FreeBSD Project. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
33 * 34 * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95 35 */ 36 37 #include <sys/cdefs.h> 38 __FBSDID("$FreeBSD$"); 39 40 #include "opt_ddb.h" 41 #include "opt_ktrace.h" 42 43 #include <sys/param.h> 44 #include <sys/systm.h> 45 #include <sys/counter.h> 46 #include <sys/filedesc.h> 47 #include <sys/fnv_hash.h> 48 #include <sys/kernel.h> 49 #include <sys/ktr.h> 50 #include <sys/lock.h> 51 #include <sys/malloc.h> 52 #include <sys/fcntl.h> 53 #include <sys/mount.h> 54 #include <sys/namei.h> 55 #include <sys/proc.h> 56 #include <sys/rwlock.h> 57 #include <sys/sdt.h> 58 #include <sys/smp.h> 59 #include <sys/syscallsubr.h> 60 #include <sys/sysctl.h> 61 #include <sys/sysproto.h> 62 #include <sys/vnode.h> 63 #ifdef KTRACE 64 #include <sys/ktrace.h> 65 #endif 66 67 #ifdef DDB 68 #include <ddb/ddb.h> 69 #endif 70 71 #include <vm/uma.h> 72 73 SDT_PROVIDER_DECLARE(vfs); 74 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *", 75 "struct vnode *"); 76 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *", 77 "char *"); 78 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *"); 79 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *", 80 "char *", "struct vnode *"); 81 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *"); 82 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int", 83 "struct vnode *", "char *"); 84 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *", 85 "struct vnode *"); 86 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative, 87 "struct vnode *", "char *"); 88 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *", 89 "char *"); 90 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *"); 91 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *"); 92 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *"); 93 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *", 94 "struct vnode *"); 95 SDT_PROBE_DEFINE3(vfs, namecache, zap_negative, done, "struct vnode *", 96 "char *", "int"); 97 SDT_PROBE_DEFINE3(vfs, namecache, shrink_negative, done, "struct vnode *", 98 "char *", "int"); 99 100 /* 101 * This structure describes the elements in the cache of recent 102 * names looked up by namei. 103 */ 104 105 struct namecache { 106 LIST_ENTRY(namecache) nc_hash; /* hash chain */ 107 LIST_ENTRY(namecache) nc_src; /* source vnode list */ 108 TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */ 109 struct vnode *nc_dvp; /* vnode of parent of name */ 110 union { 111 struct vnode *nu_vp; /* vnode the name refers to */ 112 u_int nu_neghits; /* negative entry hits */ 113 } n_un; 114 u_char nc_flag; /* flag bits */ 115 u_char nc_nlen; /* length of name */ 116 char nc_name[0]; /* segment name + nul */ 117 }; 118 119 /* 120 * struct namecache_ts repeats struct namecache layout up to the 121 * nc_nlen member. 122 * struct namecache_ts is used in place of struct namecache when time(s) need 123 * to be stored. The nc_dotdottime field is used when a cache entry is mapping 124 * both a non-dotdot directory name plus dotdot for the directory's 125 * parent. 
126 */ 127 struct namecache_ts { 128 struct timespec nc_time; /* timespec provided by fs */ 129 struct timespec nc_dotdottime; /* dotdot timespec provided by fs */ 130 int nc_ticks; /* ticks value when entry was added */ 131 struct namecache nc_nc; 132 }; 133 134 #define nc_vp n_un.nu_vp 135 #define nc_neghits n_un.nu_neghits 136 137 /* 138 * Flags in namecache.nc_flag 139 */ 140 #define NCF_WHITE 0x01 141 #define NCF_ISDOTDOT 0x02 142 #define NCF_TS 0x04 143 #define NCF_DTS 0x08 144 #define NCF_DVDROP 0x10 145 #define NCF_NEGATIVE 0x20 146 #define NCF_HOTNEGATIVE 0x40 147 148 /* 149 * Name caching works as follows: 150 * 151 * Names found by directory scans are retained in a cache 152 * for future reference. It is managed LRU, so frequently 153 * used names will hang around. Cache is indexed by hash value 154 * obtained from (dvp, name) where dvp refers to the directory 155 * containing name. 156 * 157 * If it is a "negative" entry, (i.e. for a name that is known NOT to 158 * exist) the vnode pointer will be NULL. 159 * 160 * Upon reaching the last segment of a path, if the reference 161 * is for DELETE, or NOCACHE is set (rewrite), and the 162 * name is located in the cache, it will be dropped. 163 * 164 * These locks are used (in the order in which they can be taken): 165 * NAME TYPE ROLE 166 * vnodelock mtx vnode lists and v_cache_dd field protection 167 * bucketlock rwlock for access to given set of hash buckets 168 * neglist mtx negative entry LRU management 169 * 170 * Additionally, ncneg_shrink_lock mtx is used to have at most one thread 171 * shrinking the LRU list. 172 * 173 * It is legal to take multiple vnodelock and bucketlock locks. The locking 174 * order is lower address first. Both are recursive. 175 * 176 * "." lookups are lockless. 177 * 178 * ".." and vnode -> name lookups require vnodelock. 179 * 180 * name -> vnode lookup requires the relevant bucketlock to be held for reading. 181 * 182 * Insertions and removals of entries require involved vnodes and bucketlocks 183 * to be write-locked to prevent other threads from seeing the entry. 184 * 185 * Some lookups result in removal of the found entry (e.g. getting rid of a 186 * negative entry with the intent to create a positive one), which poses a 187 * problem when multiple threads reach the state. Similarly, two different 188 * threads can purge two different vnodes and try to remove the same name. 189 * 190 * If the already held vnode lock is lower than the second required lock, we 191 * can just take the other lock. However, in the opposite case, this could 192 * deadlock. As such, this is resolved by trylocking and if that fails unlocking 193 * the first node, locking everything in order and revalidating the state. 194 */ 195 196 /* 197 * Structures associated with name caching. 
198 */ 199 #define NCHHASH(hash) \ 200 (&nchashtbl[(hash) & nchash]) 201 static __read_mostly LIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */ 202 static u_long __read_mostly nchash; /* size of hash table */ 203 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, 204 "Size of namecache hash table"); 205 static u_long __read_mostly ncnegfactor = 12; /* ratio of negative entries */ 206 SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, 207 "Ratio of negative namecache entries"); 208 static u_long __exclusive_cache_line numneg; /* number of negative entries allocated */ 209 SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0, 210 "Number of negative entries in namecache"); 211 static u_long __exclusive_cache_line numcache;/* number of cache entries allocated */ 212 SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0, 213 "Number of namecache entries"); 214 static u_long __exclusive_cache_line numcachehv;/* number of cache entries with vnodes held */ 215 SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0, 216 "Number of namecache entries with vnodes held"); 217 u_int __read_mostly ncsizefactor = 2; 218 SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0, 219 "Size factor for namecache"); 220 static u_int __read_mostly ncpurgeminvnodes; 221 SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0, 222 "Number of vnodes below which purgevfs ignores the request"); 223 static u_int __read_mostly ncneghitsrequeue = 8; 224 SYSCTL_UINT(_vfs, OID_AUTO, ncneghitsrequeue, CTLFLAG_RW, &ncneghitsrequeue, 0, 225 "Number of hits to requeue a negative entry in the LRU list"); 226 227 struct nchstats nchstats; /* cache effectiveness statistics */ 228 229 static struct mtx ncneg_shrink_lock; 230 static int shrink_list_turn; 231 232 struct neglist { 233 struct mtx nl_lock; 234 TAILQ_HEAD(, namecache) nl_list; 235 } __aligned(CACHE_LINE_SIZE); 236 237 static struct neglist __read_mostly *neglists; 238 static struct neglist ncneg_hot; 239 240 #define numneglists (ncneghash + 1) 241 static u_int __read_mostly ncneghash; 242 static inline struct neglist * 243 NCP2NEGLIST(struct namecache *ncp) 244 { 245 246 return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]); 247 } 248 249 #define numbucketlocks (ncbuckethash + 1) 250 static u_int __read_mostly ncbuckethash; 251 static struct rwlock_padalign __read_mostly *bucketlocks; 252 #define HASH2BUCKETLOCK(hash) \ 253 ((struct rwlock *)(&bucketlocks[((hash) & ncbuckethash)])) 254 255 #define numvnodelocks (ncvnodehash + 1) 256 static u_int __read_mostly ncvnodehash; 257 static struct mtx __read_mostly *vnodelocks; 258 static inline struct mtx * 259 VP2VNODELOCK(struct vnode *vp) 260 { 261 262 return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]); 263 } 264 265 /* 266 * UMA zones for the VFS cache. 267 * 268 * The small cache is used for entries with short names, which are the 269 * most common. The large cache is used for entries which are too big to 270 * fit in the small cache. 
271 */ 272 static uma_zone_t __read_mostly cache_zone_small; 273 static uma_zone_t __read_mostly cache_zone_small_ts; 274 static uma_zone_t __read_mostly cache_zone_large; 275 static uma_zone_t __read_mostly cache_zone_large_ts; 276 277 #define CACHE_PATH_CUTOFF 35 278 279 static struct namecache * 280 cache_alloc(int len, int ts) 281 { 282 struct namecache_ts *ncp_ts; 283 struct namecache *ncp; 284 285 if (__predict_false(ts)) { 286 if (len <= CACHE_PATH_CUTOFF) 287 ncp_ts = uma_zalloc(cache_zone_small_ts, M_WAITOK); 288 else 289 ncp_ts = uma_zalloc(cache_zone_large_ts, M_WAITOK); 290 ncp = &ncp_ts->nc_nc; 291 } else { 292 if (len <= CACHE_PATH_CUTOFF) 293 ncp = uma_zalloc(cache_zone_small, M_WAITOK); 294 else 295 ncp = uma_zalloc(cache_zone_large, M_WAITOK); 296 } 297 return (ncp); 298 } 299 300 static void 301 cache_free(struct namecache *ncp) 302 { 303 struct namecache_ts *ncp_ts; 304 305 if (ncp == NULL) 306 return; 307 if ((ncp->nc_flag & NCF_DVDROP) != 0) 308 vdrop(ncp->nc_dvp); 309 if (__predict_false(ncp->nc_flag & NCF_TS)) { 310 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 311 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 312 uma_zfree(cache_zone_small_ts, ncp_ts); 313 else 314 uma_zfree(cache_zone_large_ts, ncp_ts); 315 } else { 316 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 317 uma_zfree(cache_zone_small, ncp); 318 else 319 uma_zfree(cache_zone_large, ncp); 320 } 321 } 322 323 static void 324 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp) 325 { 326 struct namecache_ts *ncp_ts; 327 328 KASSERT((ncp->nc_flag & NCF_TS) != 0 || 329 (tsp == NULL && ticksp == NULL), 330 ("No NCF_TS")); 331 332 if (tsp == NULL && ticksp == NULL) 333 return; 334 335 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 336 if (tsp != NULL) 337 *tsp = ncp_ts->nc_time; 338 if (ticksp != NULL) 339 *ticksp = ncp_ts->nc_ticks; 340 } 341 342 static int __read_mostly doingcache = 1; /* 1 => enable the cache */ 343 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, 344 "VFS namecache enabled"); 345 346 /* Export size information to userland */ 347 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 348 sizeof(struct namecache), "sizeof(struct namecache)"); 349 350 /* 351 * The new name cache statistics 352 */ 353 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, 354 "Name cache statistics"); 355 #define STATNODE_ULONG(name, descr) \ 356 SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr); 357 #define STATNODE_COUNTER(name, descr) \ 358 static counter_u64_t __read_mostly name; \ 359 SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, descr); 360 STATNODE_ULONG(numneg, "Number of negative cache entries"); 361 STATNODE_ULONG(numcache, "Number of cache entries"); 362 STATNODE_COUNTER(numcalls, "Number of cache lookups"); 363 STATNODE_COUNTER(dothits, "Number of '.' hits"); 364 STATNODE_COUNTER(dotdothits, "Number of '..' hits"); 365 STATNODE_COUNTER(numchecks, "Number of checks in lookup"); 366 STATNODE_COUNTER(nummiss, "Number of cache misses"); 367 STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache"); 368 STATNODE_COUNTER(numposzaps, 369 "Number of cache hits (positive) we do not want to cache"); 370 STATNODE_COUNTER(numposhits, "Number of cache hits (positive)"); 371 STATNODE_COUNTER(numnegzaps, 372 "Number of cache hits (negative) we do not want to cache"); 373 STATNODE_COUNTER(numneghits, "Number of cache hits (negative)"); 374 /* These count for kern___getcwd(), too. 
*/ 375 STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls"); 376 STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)"); 377 STATNODE_COUNTER(numfullpathfail2, 378 "Number of fullpath search errors (VOP_VPTOCNP failures)"); 379 STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); 380 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls"); 381 static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail, 382 "Number of times zap_and_exit failed to lock"); 383 static long cache_lock_vnodes_cel_3_failures; 384 STATNODE_ULONG(cache_lock_vnodes_cel_3_failures, 385 "Number of times 3-way vnode locking failed"); 386 387 static void cache_zap_locked(struct namecache *ncp, bool neg_locked); 388 static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir, 389 char *buf, char **retbuf, u_int buflen); 390 391 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); 392 393 static int cache_yield; 394 SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0, 395 "Number of times cache called yield"); 396 397 static void 398 cache_maybe_yield(void) 399 { 400 401 if (should_yield()) { 402 cache_yield++; 403 kern_yield(PRI_USER); 404 } 405 } 406 407 static inline void 408 cache_assert_vlp_locked(struct mtx *vlp) 409 { 410 411 if (vlp != NULL) 412 mtx_assert(vlp, MA_OWNED); 413 } 414 415 static inline void 416 cache_assert_vnode_locked(struct vnode *vp) 417 { 418 struct mtx *vlp; 419 420 vlp = VP2VNODELOCK(vp); 421 cache_assert_vlp_locked(vlp); 422 } 423 424 static uint32_t 425 cache_get_hash(char *name, u_char len, struct vnode *dvp) 426 { 427 uint32_t hash; 428 429 hash = fnv_32_buf(name, len, FNV1_32_INIT); 430 hash = fnv_32_buf(&dvp, sizeof(dvp), hash); 431 return (hash); 432 } 433 434 static inline struct rwlock * 435 NCP2BUCKETLOCK(struct namecache *ncp) 436 { 437 uint32_t hash; 438 439 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); 440 return (HASH2BUCKETLOCK(hash)); 441 } 442 443 #ifdef INVARIANTS 444 static void 445 cache_assert_bucket_locked(struct namecache *ncp, int mode) 446 { 447 struct rwlock *blp; 448 449 blp = NCP2BUCKETLOCK(ncp); 450 rw_assert(blp, mode); 451 } 452 #else 453 #define cache_assert_bucket_locked(x, y) do { } while (0) 454 #endif 455 456 #define cache_sort(x, y) _cache_sort((void **)(x), (void **)(y)) 457 static void 458 _cache_sort(void **p1, void **p2) 459 { 460 void *tmp; 461 462 if (*p1 > *p2) { 463 tmp = *p2; 464 *p2 = *p1; 465 *p1 = tmp; 466 } 467 } 468 469 static void 470 cache_lock_all_buckets(void) 471 { 472 u_int i; 473 474 for (i = 0; i < numbucketlocks; i++) 475 rw_wlock(&bucketlocks[i]); 476 } 477 478 static void 479 cache_unlock_all_buckets(void) 480 { 481 u_int i; 482 483 for (i = 0; i < numbucketlocks; i++) 484 rw_wunlock(&bucketlocks[i]); 485 } 486 487 static void 488 cache_lock_all_vnodes(void) 489 { 490 u_int i; 491 492 for (i = 0; i < numvnodelocks; i++) 493 mtx_lock(&vnodelocks[i]); 494 } 495 496 static void 497 cache_unlock_all_vnodes(void) 498 { 499 u_int i; 500 501 for (i = 0; i < numvnodelocks; i++) 502 mtx_unlock(&vnodelocks[i]); 503 } 504 505 static int 506 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 507 { 508 509 cache_sort(&vlp1, &vlp2); 510 MPASS(vlp2 != NULL); 511 512 if (vlp1 != NULL) { 513 if (!mtx_trylock(vlp1)) 514 return (EAGAIN); 515 } 516 if (!mtx_trylock(vlp2)) { 517 if (vlp1 != NULL) 518 mtx_unlock(vlp1); 519 return (EAGAIN); 520 } 521 522 return (0); 523 } 
524 525 static void 526 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 527 { 528 529 MPASS(vlp1 != NULL || vlp2 != NULL); 530 531 if (vlp1 != NULL) 532 mtx_unlock(vlp1); 533 if (vlp2 != NULL) 534 mtx_unlock(vlp2); 535 } 536 537 static int 538 sysctl_nchstats(SYSCTL_HANDLER_ARGS) 539 { 540 struct nchstats snap; 541 542 if (req->oldptr == NULL) 543 return (SYSCTL_OUT(req, 0, sizeof(snap))); 544 545 snap = nchstats; 546 snap.ncs_goodhits = counter_u64_fetch(numposhits); 547 snap.ncs_neghits = counter_u64_fetch(numneghits); 548 snap.ncs_badhits = counter_u64_fetch(numposzaps) + 549 counter_u64_fetch(numnegzaps); 550 snap.ncs_miss = counter_u64_fetch(nummisszap) + 551 counter_u64_fetch(nummiss); 552 553 return (SYSCTL_OUT(req, &snap, sizeof(snap))); 554 } 555 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD | 556 CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU", 557 "VFS cache effectiveness statistics"); 558 559 #ifdef DIAGNOSTIC 560 /* 561 * Grab an atomic snapshot of the name cache hash chain lengths 562 */ 563 static SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL, 564 "hash table stats"); 565 566 static int 567 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS) 568 { 569 struct nchashhead *ncpp; 570 struct namecache *ncp; 571 int i, error, n_nchash, *cntbuf; 572 573 retry: 574 n_nchash = nchash + 1; /* nchash is max index, not count */ 575 if (req->oldptr == NULL) 576 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int)); 577 cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK); 578 cache_lock_all_buckets(); 579 if (n_nchash != nchash + 1) { 580 cache_unlock_all_buckets(); 581 free(cntbuf, M_TEMP); 582 goto retry; 583 } 584 /* Scan hash tables counting entries */ 585 for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++) 586 LIST_FOREACH(ncp, ncpp, nc_hash) 587 cntbuf[i]++; 588 cache_unlock_all_buckets(); 589 for (error = 0, i = 0; i < n_nchash; i++) 590 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0) 591 break; 592 free(cntbuf, M_TEMP); 593 return (error); 594 } 595 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD| 596 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int", 597 "nchash chain lengths"); 598 599 static int 600 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS) 601 { 602 int error; 603 struct nchashhead *ncpp; 604 struct namecache *ncp; 605 int n_nchash; 606 int count, maxlength, used, pct; 607 608 if (!req->oldptr) 609 return SYSCTL_OUT(req, 0, 4 * sizeof(int)); 610 611 cache_lock_all_buckets(); 612 n_nchash = nchash + 1; /* nchash is max index, not count */ 613 used = 0; 614 maxlength = 0; 615 616 /* Scan hash tables for applicable entries */ 617 for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { 618 count = 0; 619 LIST_FOREACH(ncp, ncpp, nc_hash) { 620 count++; 621 } 622 if (count) 623 used++; 624 if (maxlength < count) 625 maxlength = count; 626 } 627 n_nchash = nchash + 1; 628 cache_unlock_all_buckets(); 629 pct = (used * 100) / (n_nchash / 100); 630 error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash)); 631 if (error) 632 return (error); 633 error = SYSCTL_OUT(req, &used, sizeof(used)); 634 if (error) 635 return (error); 636 error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength)); 637 if (error) 638 return (error); 639 error = SYSCTL_OUT(req, &pct, sizeof(pct)); 640 if (error) 641 return (error); 642 return (0); 643 } 644 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD| 645 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I", 646 "nchash 
statistics (number of total/used buckets, maximum chain length, usage percentage)"); 647 #endif 648 649 /* 650 * Negative entries management 651 * 652 * A variation of LRU scheme is used. New entries are hashed into one of 653 * numneglists cold lists. Entries get promoted to the hot list on first hit. 654 * Partial LRU for the hot list is maintained by requeueing them every 655 * ncneghitsrequeue hits. 656 * 657 * The shrinker will demote hot list head and evict from the cold list in a 658 * round-robin manner. 659 */ 660 static void 661 cache_negative_hit(struct namecache *ncp) 662 { 663 struct neglist *neglist; 664 u_int hits; 665 666 MPASS(ncp->nc_flag & NCF_NEGATIVE); 667 hits = atomic_fetchadd_int(&ncp->nc_neghits, 1); 668 if (ncp->nc_flag & NCF_HOTNEGATIVE) { 669 if ((hits % ncneghitsrequeue) != 0) 670 return; 671 mtx_lock(&ncneg_hot.nl_lock); 672 if (ncp->nc_flag & NCF_HOTNEGATIVE) { 673 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); 674 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst); 675 mtx_unlock(&ncneg_hot.nl_lock); 676 return; 677 } 678 /* 679 * The shrinker cleared the flag and removed the entry from 680 * the hot list. Put it back. 681 */ 682 } else { 683 mtx_lock(&ncneg_hot.nl_lock); 684 } 685 neglist = NCP2NEGLIST(ncp); 686 mtx_lock(&neglist->nl_lock); 687 if (!(ncp->nc_flag & NCF_HOTNEGATIVE)) { 688 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 689 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst); 690 ncp->nc_flag |= NCF_HOTNEGATIVE; 691 } 692 mtx_unlock(&neglist->nl_lock); 693 mtx_unlock(&ncneg_hot.nl_lock); 694 } 695 696 static void 697 cache_negative_insert(struct namecache *ncp, bool neg_locked) 698 { 699 struct neglist *neglist; 700 701 MPASS(ncp->nc_flag & NCF_NEGATIVE); 702 cache_assert_bucket_locked(ncp, RA_WLOCKED); 703 neglist = NCP2NEGLIST(ncp); 704 if (!neg_locked) { 705 mtx_lock(&neglist->nl_lock); 706 } else { 707 mtx_assert(&neglist->nl_lock, MA_OWNED); 708 } 709 TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst); 710 if (!neg_locked) 711 mtx_unlock(&neglist->nl_lock); 712 atomic_add_rel_long(&numneg, 1); 713 } 714 715 static void 716 cache_negative_remove(struct namecache *ncp, bool neg_locked) 717 { 718 struct neglist *neglist; 719 bool hot_locked = false; 720 bool list_locked = false; 721 722 MPASS(ncp->nc_flag & NCF_NEGATIVE); 723 cache_assert_bucket_locked(ncp, RA_WLOCKED); 724 neglist = NCP2NEGLIST(ncp); 725 if (!neg_locked) { 726 if (ncp->nc_flag & NCF_HOTNEGATIVE) { 727 hot_locked = true; 728 mtx_lock(&ncneg_hot.nl_lock); 729 if (!(ncp->nc_flag & NCF_HOTNEGATIVE)) { 730 list_locked = true; 731 mtx_lock(&neglist->nl_lock); 732 } 733 } else { 734 list_locked = true; 735 mtx_lock(&neglist->nl_lock); 736 } 737 } 738 if (ncp->nc_flag & NCF_HOTNEGATIVE) { 739 mtx_assert(&ncneg_hot.nl_lock, MA_OWNED); 740 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); 741 } else { 742 mtx_assert(&neglist->nl_lock, MA_OWNED); 743 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 744 } 745 if (list_locked) 746 mtx_unlock(&neglist->nl_lock); 747 if (hot_locked) 748 mtx_unlock(&ncneg_hot.nl_lock); 749 atomic_subtract_rel_long(&numneg, 1); 750 } 751 752 static void 753 cache_negative_shrink_select(int start, struct namecache **ncpp, 754 struct neglist **neglistpp) 755 { 756 struct neglist *neglist; 757 struct namecache *ncp; 758 int i; 759 760 *ncpp = ncp = NULL; 761 neglist = NULL; 762 763 for (i = start; i < numneglists; i++) { 764 neglist = &neglists[i]; 765 if (TAILQ_FIRST(&neglist->nl_list) == NULL) 766 continue; 767 mtx_lock(&neglist->nl_lock); 768 ncp = 
TAILQ_FIRST(&neglist->nl_list); 769 if (ncp != NULL) 770 break; 771 mtx_unlock(&neglist->nl_lock); 772 } 773 774 *neglistpp = neglist; 775 *ncpp = ncp; 776 } 777 778 static void 779 cache_negative_zap_one(void) 780 { 781 struct namecache *ncp, *ncp2; 782 struct neglist *neglist; 783 struct mtx *dvlp; 784 struct rwlock *blp; 785 786 if (!mtx_trylock(&ncneg_shrink_lock)) 787 return; 788 789 mtx_lock(&ncneg_hot.nl_lock); 790 ncp = TAILQ_FIRST(&ncneg_hot.nl_list); 791 if (ncp != NULL) { 792 neglist = NCP2NEGLIST(ncp); 793 mtx_lock(&neglist->nl_lock); 794 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); 795 TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst); 796 ncp->nc_flag &= ~NCF_HOTNEGATIVE; 797 mtx_unlock(&neglist->nl_lock); 798 } 799 800 cache_negative_shrink_select(shrink_list_turn, &ncp, &neglist); 801 shrink_list_turn++; 802 if (shrink_list_turn == numneglists) 803 shrink_list_turn = 0; 804 if (ncp == NULL && shrink_list_turn == 0) 805 cache_negative_shrink_select(shrink_list_turn, &ncp, &neglist); 806 if (ncp == NULL) { 807 mtx_unlock(&ncneg_hot.nl_lock); 808 goto out; 809 } 810 811 MPASS(ncp->nc_flag & NCF_NEGATIVE); 812 dvlp = VP2VNODELOCK(ncp->nc_dvp); 813 blp = NCP2BUCKETLOCK(ncp); 814 mtx_unlock(&neglist->nl_lock); 815 mtx_unlock(&ncneg_hot.nl_lock); 816 mtx_lock(dvlp); 817 rw_wlock(blp); 818 mtx_lock(&neglist->nl_lock); 819 ncp2 = TAILQ_FIRST(&neglist->nl_list); 820 if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) || 821 blp != NCP2BUCKETLOCK(ncp2) || !(ncp2->nc_flag & NCF_NEGATIVE)) { 822 ncp = NULL; 823 goto out_unlock_all; 824 } 825 SDT_PROBE3(vfs, namecache, shrink_negative, done, ncp->nc_dvp, 826 ncp->nc_name, ncp->nc_neghits); 827 828 cache_zap_locked(ncp, true); 829 out_unlock_all: 830 mtx_unlock(&neglist->nl_lock); 831 rw_wunlock(blp); 832 mtx_unlock(dvlp); 833 out: 834 mtx_unlock(&ncneg_shrink_lock); 835 cache_free(ncp); 836 } 837 838 /* 839 * cache_zap_locked(): 840 * 841 * Removes a namecache entry from cache, whether it contains an actual 842 * pointer to a vnode or if it is just a negative cache entry. 843 */ 844 static void 845 cache_zap_locked(struct namecache *ncp, bool neg_locked) 846 { 847 848 if (!(ncp->nc_flag & NCF_NEGATIVE)) 849 cache_assert_vnode_locked(ncp->nc_vp); 850 cache_assert_vnode_locked(ncp->nc_dvp); 851 cache_assert_bucket_locked(ncp, RA_WLOCKED); 852 853 CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, 854 (ncp->nc_flag & NCF_NEGATIVE) ? 
NULL : ncp->nc_vp); 855 LIST_REMOVE(ncp, nc_hash); 856 if (!(ncp->nc_flag & NCF_NEGATIVE)) { 857 SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp, 858 ncp->nc_name, ncp->nc_vp); 859 TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst); 860 if (ncp == ncp->nc_vp->v_cache_dd) 861 ncp->nc_vp->v_cache_dd = NULL; 862 } else { 863 SDT_PROBE3(vfs, namecache, zap_negative, done, ncp->nc_dvp, 864 ncp->nc_name, ncp->nc_neghits); 865 cache_negative_remove(ncp, neg_locked); 866 } 867 if (ncp->nc_flag & NCF_ISDOTDOT) { 868 if (ncp == ncp->nc_dvp->v_cache_dd) 869 ncp->nc_dvp->v_cache_dd = NULL; 870 } else { 871 LIST_REMOVE(ncp, nc_src); 872 if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) { 873 ncp->nc_flag |= NCF_DVDROP; 874 atomic_subtract_rel_long(&numcachehv, 1); 875 } 876 } 877 atomic_subtract_rel_long(&numcache, 1); 878 } 879 880 static void 881 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp) 882 { 883 struct rwlock *blp; 884 885 MPASS(ncp->nc_dvp == vp); 886 MPASS(ncp->nc_flag & NCF_NEGATIVE); 887 cache_assert_vnode_locked(vp); 888 889 blp = NCP2BUCKETLOCK(ncp); 890 rw_wlock(blp); 891 cache_zap_locked(ncp, false); 892 rw_wunlock(blp); 893 } 894 895 static bool 896 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp, 897 struct mtx **vlpp) 898 { 899 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; 900 struct rwlock *blp; 901 902 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); 903 cache_assert_vnode_locked(vp); 904 905 if (ncp->nc_flag & NCF_NEGATIVE) { 906 if (*vlpp != NULL) { 907 mtx_unlock(*vlpp); 908 *vlpp = NULL; 909 } 910 cache_zap_negative_locked_vnode_kl(ncp, vp); 911 return (true); 912 } 913 914 pvlp = VP2VNODELOCK(vp); 915 blp = NCP2BUCKETLOCK(ncp); 916 vlp1 = VP2VNODELOCK(ncp->nc_dvp); 917 vlp2 = VP2VNODELOCK(ncp->nc_vp); 918 919 if (*vlpp == vlp1 || *vlpp == vlp2) { 920 to_unlock = *vlpp; 921 *vlpp = NULL; 922 } else { 923 if (*vlpp != NULL) { 924 mtx_unlock(*vlpp); 925 *vlpp = NULL; 926 } 927 cache_sort(&vlp1, &vlp2); 928 if (vlp1 == pvlp) { 929 mtx_lock(vlp2); 930 to_unlock = vlp2; 931 } else { 932 if (!mtx_trylock(vlp1)) 933 goto out_relock; 934 to_unlock = vlp1; 935 } 936 } 937 rw_wlock(blp); 938 cache_zap_locked(ncp, false); 939 rw_wunlock(blp); 940 if (to_unlock != NULL) 941 mtx_unlock(to_unlock); 942 return (true); 943 944 out_relock: 945 mtx_unlock(vlp2); 946 mtx_lock(vlp1); 947 mtx_lock(vlp2); 948 MPASS(*vlpp == NULL); 949 *vlpp = vlp1; 950 return (false); 951 } 952 953 static int 954 cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp) 955 { 956 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; 957 struct rwlock *blp; 958 int error = 0; 959 960 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); 961 cache_assert_vnode_locked(vp); 962 963 pvlp = VP2VNODELOCK(vp); 964 if (ncp->nc_flag & NCF_NEGATIVE) { 965 cache_zap_negative_locked_vnode_kl(ncp, vp); 966 goto out; 967 } 968 969 blp = NCP2BUCKETLOCK(ncp); 970 vlp1 = VP2VNODELOCK(ncp->nc_dvp); 971 vlp2 = VP2VNODELOCK(ncp->nc_vp); 972 cache_sort(&vlp1, &vlp2); 973 if (vlp1 == pvlp) { 974 mtx_lock(vlp2); 975 to_unlock = vlp2; 976 } else { 977 if (!mtx_trylock(vlp1)) { 978 error = EAGAIN; 979 goto out; 980 } 981 to_unlock = vlp1; 982 } 983 rw_wlock(blp); 984 cache_zap_locked(ncp, false); 985 rw_wunlock(blp); 986 mtx_unlock(to_unlock); 987 out: 988 mtx_unlock(pvlp); 989 return (error); 990 } 991 992 static int 993 cache_zap_wlocked_bucket(struct namecache *ncp, struct rwlock *blp) 994 { 995 struct mtx *dvlp, *vlp; 996 997 cache_assert_bucket_locked(ncp, RA_WLOCKED); 998 999 dvlp = 
VP2VNODELOCK(ncp->nc_dvp); 1000 vlp = NULL; 1001 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1002 vlp = VP2VNODELOCK(ncp->nc_vp); 1003 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1004 cache_zap_locked(ncp, false); 1005 rw_wunlock(blp); 1006 cache_unlock_vnodes(dvlp, vlp); 1007 return (0); 1008 } 1009 1010 rw_wunlock(blp); 1011 return (EAGAIN); 1012 } 1013 1014 static int 1015 cache_zap_rlocked_bucket(struct namecache *ncp, struct rwlock *blp) 1016 { 1017 struct mtx *dvlp, *vlp; 1018 1019 cache_assert_bucket_locked(ncp, RA_RLOCKED); 1020 1021 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1022 vlp = NULL; 1023 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1024 vlp = VP2VNODELOCK(ncp->nc_vp); 1025 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1026 rw_runlock(blp); 1027 rw_wlock(blp); 1028 cache_zap_locked(ncp, false); 1029 rw_wunlock(blp); 1030 cache_unlock_vnodes(dvlp, vlp); 1031 return (0); 1032 } 1033 1034 rw_runlock(blp); 1035 return (EAGAIN); 1036 } 1037 1038 static int 1039 cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp, 1040 struct mtx **vlpp1, struct mtx **vlpp2) 1041 { 1042 struct mtx *dvlp, *vlp; 1043 1044 cache_assert_bucket_locked(ncp, RA_WLOCKED); 1045 1046 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1047 vlp = NULL; 1048 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1049 vlp = VP2VNODELOCK(ncp->nc_vp); 1050 cache_sort(&dvlp, &vlp); 1051 1052 if (*vlpp1 == dvlp && *vlpp2 == vlp) { 1053 cache_zap_locked(ncp, false); 1054 cache_unlock_vnodes(dvlp, vlp); 1055 *vlpp1 = NULL; 1056 *vlpp2 = NULL; 1057 return (0); 1058 } 1059 1060 if (*vlpp1 != NULL) 1061 mtx_unlock(*vlpp1); 1062 if (*vlpp2 != NULL) 1063 mtx_unlock(*vlpp2); 1064 *vlpp1 = NULL; 1065 *vlpp2 = NULL; 1066 1067 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1068 cache_zap_locked(ncp, false); 1069 cache_unlock_vnodes(dvlp, vlp); 1070 return (0); 1071 } 1072 1073 rw_wunlock(blp); 1074 *vlpp1 = dvlp; 1075 *vlpp2 = vlp; 1076 if (*vlpp1 != NULL) 1077 mtx_lock(*vlpp1); 1078 mtx_lock(*vlpp2); 1079 rw_wlock(blp); 1080 return (EAGAIN); 1081 } 1082 1083 static void 1084 cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp) 1085 { 1086 1087 if (blp != NULL) { 1088 rw_runlock(blp); 1089 } else { 1090 mtx_unlock(vlp); 1091 } 1092 } 1093 1094 static int __noinline 1095 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1096 struct timespec *tsp, int *ticksp) 1097 { 1098 int ltype; 1099 1100 *vpp = dvp; 1101 CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .", 1102 dvp, cnp->cn_nameptr); 1103 counter_u64_add(dothits, 1); 1104 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp); 1105 if (tsp != NULL) 1106 timespecclear(tsp); 1107 if (ticksp != NULL) 1108 *ticksp = ticks; 1109 vrefact(*vpp); 1110 /* 1111 * When we lookup "." we still can be asked to lock it 1112 * differently... 1113 */ 1114 ltype = cnp->cn_lkflags & LK_TYPE_MASK; 1115 if (ltype != VOP_ISLOCKED(*vpp)) { 1116 if (ltype == LK_EXCLUSIVE) { 1117 vn_lock(*vpp, LK_UPGRADE | LK_RETRY); 1118 if ((*vpp)->v_iflag & VI_DOOMED) { 1119 /* forced unmount */ 1120 vrele(*vpp); 1121 *vpp = NULL; 1122 return (ENOENT); 1123 } 1124 } else 1125 vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY); 1126 } 1127 return (-1); 1128 } 1129 1130 static __noinline int 1131 cache_lookup_nomakeentry(struct vnode *dvp, struct vnode **vpp, 1132 struct componentname *cnp, struct timespec *tsp, int *ticksp) 1133 { 1134 struct namecache *ncp; 1135 struct rwlock *blp; 1136 struct mtx *dvlp, *dvlp2; 1137 uint32_t hash; 1138 int error; 1139 1140 if (cnp->cn_namelen == 2 && 1141 cnp->cn_nameptr[0] == '.' 
&& cnp->cn_nameptr[1] == '.') { 1142 counter_u64_add(dotdothits, 1); 1143 dvlp = VP2VNODELOCK(dvp); 1144 dvlp2 = NULL; 1145 mtx_lock(dvlp); 1146 retry_dotdot: 1147 ncp = dvp->v_cache_dd; 1148 if (ncp == NULL) { 1149 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, 1150 "..", NULL); 1151 mtx_unlock(dvlp); 1152 if (dvlp2 != NULL) 1153 mtx_unlock(dvlp2); 1154 return (0); 1155 } 1156 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1157 if (ncp->nc_dvp != dvp) 1158 panic("dvp %p v_cache_dd %p\n", dvp, ncp); 1159 if (!cache_zap_locked_vnode_kl2(ncp, 1160 dvp, &dvlp2)) 1161 goto retry_dotdot; 1162 MPASS(dvp->v_cache_dd == NULL); 1163 mtx_unlock(dvlp); 1164 if (dvlp2 != NULL) 1165 mtx_unlock(dvlp2); 1166 cache_free(ncp); 1167 } else { 1168 dvp->v_cache_dd = NULL; 1169 mtx_unlock(dvlp); 1170 if (dvlp2 != NULL) 1171 mtx_unlock(dvlp2); 1172 } 1173 return (0); 1174 } 1175 1176 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1177 blp = HASH2BUCKETLOCK(hash); 1178 retry: 1179 if (LIST_EMPTY(NCHHASH(hash))) 1180 goto out_no_entry; 1181 1182 rw_wlock(blp); 1183 1184 LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1185 counter_u64_add(numchecks, 1); 1186 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1187 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1188 break; 1189 } 1190 1191 /* We failed to find an entry */ 1192 if (ncp == NULL) { 1193 rw_wunlock(blp); 1194 goto out_no_entry; 1195 } 1196 1197 counter_u64_add(numposzaps, 1); 1198 1199 error = cache_zap_wlocked_bucket(ncp, blp); 1200 if (error != 0) { 1201 zap_and_exit_bucket_fail++; 1202 cache_maybe_yield(); 1203 goto retry; 1204 } 1205 cache_free(ncp); 1206 return (0); 1207 out_no_entry: 1208 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, NULL); 1209 counter_u64_add(nummisszap, 1); 1210 return (0); 1211 } 1212 1213 /** 1214 * Lookup a name in the name cache 1215 * 1216 * # Arguments 1217 * 1218 * - dvp: Parent directory in which to search. 1219 * - vpp: Return argument. Will contain desired vnode on cache hit. 1220 * - cnp: Parameters of the name search. The most interesting bits of 1221 * the cn_flags field have the following meanings: 1222 * - MAKEENTRY: If clear, free an entry from the cache rather than look 1223 * it up. 1224 * - ISDOTDOT: Must be set if and only if cn_nameptr == ".." 1225 * - tsp: Return storage for cache timestamp. On a successful (positive 1226 * or negative) lookup, tsp will be filled with any timespec that 1227 * was stored when this cache entry was created. However, it will 1228 * be clear for "." entries. 1229 * - ticks: Return storage for alternate cache timestamp. On a successful 1230 * (positive or negative) lookup, it will contain the ticks value 1231 * that was current when the cache entry was created, unless cnp 1232 * was ".". 1233 * 1234 * # Returns 1235 * 1236 * - -1: A positive cache hit. vpp will contain the desired vnode. 1237 * - ENOENT: A negative cache hit, or dvp was recycled out from under us due 1238 * to a forced unmount. vpp will not be modified. If the entry 1239 * is a whiteout, then the ISWHITEOUT flag will be set in 1240 * cnp->cn_flags. 1241 * - 0: A cache miss. vpp will not be modified. 1242 * 1243 * # Locking 1244 * 1245 * On a cache hit, vpp will be returned locked and ref'd. If we're looking up 1246 * .., dvp is unlocked. If we're looking up . an extra ref is taken, but the 1247 * lock is not recursively acquired. 
1248 */ 1249 int 1250 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1251 struct timespec *tsp, int *ticksp) 1252 { 1253 struct namecache_ts *ncp_ts; 1254 struct namecache *ncp; 1255 struct rwlock *blp; 1256 struct mtx *dvlp; 1257 uint32_t hash; 1258 enum vgetstate vs; 1259 int error, ltype; 1260 1261 if (__predict_false(!doingcache)) { 1262 cnp->cn_flags &= ~MAKEENTRY; 1263 return (0); 1264 } 1265 1266 counter_u64_add(numcalls, 1); 1267 1268 if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) 1269 return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp)); 1270 1271 if ((cnp->cn_flags & MAKEENTRY) == 0) 1272 return (cache_lookup_nomakeentry(dvp, vpp, cnp, tsp, ticksp)); 1273 1274 retry: 1275 blp = NULL; 1276 dvlp = NULL; 1277 error = 0; 1278 if (cnp->cn_namelen == 2 && 1279 cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') { 1280 counter_u64_add(dotdothits, 1); 1281 dvlp = VP2VNODELOCK(dvp); 1282 mtx_lock(dvlp); 1283 ncp = dvp->v_cache_dd; 1284 if (ncp == NULL) { 1285 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, 1286 "..", NULL); 1287 mtx_unlock(dvlp); 1288 return (0); 1289 } 1290 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1291 if (ncp->nc_flag & NCF_NEGATIVE) 1292 *vpp = NULL; 1293 else 1294 *vpp = ncp->nc_vp; 1295 } else 1296 *vpp = ncp->nc_dvp; 1297 /* Return failure if negative entry was found. */ 1298 if (*vpp == NULL) 1299 goto negative_success; 1300 CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..", 1301 dvp, cnp->cn_nameptr, *vpp); 1302 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", 1303 *vpp); 1304 cache_out_ts(ncp, tsp, ticksp); 1305 if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) == 1306 NCF_DTS && tsp != NULL) { 1307 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 1308 *tsp = ncp_ts->nc_dotdottime; 1309 } 1310 goto success; 1311 } 1312 1313 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1314 blp = HASH2BUCKETLOCK(hash); 1315 rw_rlock(blp); 1316 1317 LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1318 counter_u64_add(numchecks, 1); 1319 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1320 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1321 break; 1322 } 1323 1324 /* We failed to find an entry */ 1325 if (ncp == NULL) { 1326 rw_runlock(blp); 1327 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, 1328 NULL); 1329 counter_u64_add(nummiss, 1); 1330 return (0); 1331 } 1332 1333 /* We found a "positive" match, return the vnode */ 1334 if (!(ncp->nc_flag & NCF_NEGATIVE)) { 1335 counter_u64_add(numposhits, 1); 1336 *vpp = ncp->nc_vp; 1337 CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p", 1338 dvp, cnp->cn_nameptr, *vpp, ncp); 1339 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, 1340 *vpp); 1341 cache_out_ts(ncp, tsp, ticksp); 1342 goto success; 1343 } 1344 1345 negative_success: 1346 /* We found a negative match, and want to create it, so purge */ 1347 if (cnp->cn_nameiop == CREATE) { 1348 counter_u64_add(numnegzaps, 1); 1349 goto zap_and_exit; 1350 } 1351 1352 counter_u64_add(numneghits, 1); 1353 cache_negative_hit(ncp); 1354 if (ncp->nc_flag & NCF_WHITE) 1355 cnp->cn_flags |= ISWHITEOUT; 1356 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, 1357 ncp->nc_name); 1358 cache_out_ts(ncp, tsp, ticksp); 1359 cache_lookup_unlock(blp, dvlp); 1360 return (ENOENT); 1361 1362 success: 1363 /* 1364 * On success we return a locked and ref'd vnode as per the lookup 1365 * protocol. 
1366 */ 1367 MPASS(dvp != *vpp); 1368 ltype = 0; /* silence gcc warning */ 1369 if (cnp->cn_flags & ISDOTDOT) { 1370 ltype = VOP_ISLOCKED(dvp); 1371 VOP_UNLOCK(dvp, 0); 1372 } 1373 vs = vget_prep(*vpp); 1374 cache_lookup_unlock(blp, dvlp); 1375 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 1376 if (cnp->cn_flags & ISDOTDOT) { 1377 vn_lock(dvp, ltype | LK_RETRY); 1378 if (dvp->v_iflag & VI_DOOMED) { 1379 if (error == 0) 1380 vput(*vpp); 1381 *vpp = NULL; 1382 return (ENOENT); 1383 } 1384 } 1385 if (error) { 1386 *vpp = NULL; 1387 goto retry; 1388 } 1389 if ((cnp->cn_flags & ISLASTCN) && 1390 (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) { 1391 ASSERT_VOP_ELOCKED(*vpp, "cache_lookup"); 1392 } 1393 return (-1); 1394 1395 zap_and_exit: 1396 if (blp != NULL) 1397 error = cache_zap_rlocked_bucket(ncp, blp); 1398 else 1399 error = cache_zap_locked_vnode(ncp, dvp); 1400 if (error != 0) { 1401 zap_and_exit_bucket_fail++; 1402 cache_maybe_yield(); 1403 goto retry; 1404 } 1405 cache_free(ncp); 1406 return (0); 1407 } 1408 1409 struct celockstate { 1410 struct mtx *vlp[3]; 1411 struct rwlock *blp[2]; 1412 }; 1413 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3)); 1414 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2)); 1415 1416 static inline void 1417 cache_celockstate_init(struct celockstate *cel) 1418 { 1419 1420 bzero(cel, sizeof(*cel)); 1421 } 1422 1423 static void 1424 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp, 1425 struct vnode *dvp) 1426 { 1427 struct mtx *vlp1, *vlp2; 1428 1429 MPASS(cel->vlp[0] == NULL); 1430 MPASS(cel->vlp[1] == NULL); 1431 MPASS(cel->vlp[2] == NULL); 1432 1433 MPASS(vp != NULL || dvp != NULL); 1434 1435 vlp1 = VP2VNODELOCK(vp); 1436 vlp2 = VP2VNODELOCK(dvp); 1437 cache_sort(&vlp1, &vlp2); 1438 1439 if (vlp1 != NULL) { 1440 mtx_lock(vlp1); 1441 cel->vlp[0] = vlp1; 1442 } 1443 mtx_lock(vlp2); 1444 cel->vlp[1] = vlp2; 1445 } 1446 1447 static void 1448 cache_unlock_vnodes_cel(struct celockstate *cel) 1449 { 1450 1451 MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL); 1452 1453 if (cel->vlp[0] != NULL) 1454 mtx_unlock(cel->vlp[0]); 1455 if (cel->vlp[1] != NULL) 1456 mtx_unlock(cel->vlp[1]); 1457 if (cel->vlp[2] != NULL) 1458 mtx_unlock(cel->vlp[2]); 1459 } 1460 1461 static bool 1462 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp) 1463 { 1464 struct mtx *vlp; 1465 bool ret; 1466 1467 cache_assert_vlp_locked(cel->vlp[0]); 1468 cache_assert_vlp_locked(cel->vlp[1]); 1469 MPASS(cel->vlp[2] == NULL); 1470 1471 MPASS(vp != NULL); 1472 vlp = VP2VNODELOCK(vp); 1473 1474 ret = true; 1475 if (vlp >= cel->vlp[1]) { 1476 mtx_lock(vlp); 1477 } else { 1478 if (mtx_trylock(vlp)) 1479 goto out; 1480 cache_lock_vnodes_cel_3_failures++; 1481 cache_unlock_vnodes_cel(cel); 1482 if (vlp < cel->vlp[0]) { 1483 mtx_lock(vlp); 1484 mtx_lock(cel->vlp[0]); 1485 mtx_lock(cel->vlp[1]); 1486 } else { 1487 if (cel->vlp[0] != NULL) 1488 mtx_lock(cel->vlp[0]); 1489 mtx_lock(vlp); 1490 mtx_lock(cel->vlp[1]); 1491 } 1492 ret = false; 1493 } 1494 out: 1495 cel->vlp[2] = vlp; 1496 return (ret); 1497 } 1498 1499 static void 1500 cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1, 1501 struct rwlock *blp2) 1502 { 1503 1504 MPASS(cel->blp[0] == NULL); 1505 MPASS(cel->blp[1] == NULL); 1506 1507 cache_sort(&blp1, &blp2); 1508 1509 if (blp1 != NULL) { 1510 rw_wlock(blp1); 1511 cel->blp[0] = blp1; 1512 } 1513 rw_wlock(blp2); 1514 cel->blp[1] = blp2; 1515 } 1516 1517 static void 1518 cache_unlock_buckets_cel(struct celockstate *cel) 1519 { 1520 1521 
if (cel->blp[0] != NULL) 1522 rw_wunlock(cel->blp[0]); 1523 rw_wunlock(cel->blp[1]); 1524 } 1525 1526 /* 1527 * Lock part of the cache affected by the insertion. 1528 * 1529 * This means vnodelocks for dvp, vp and the relevant bucketlock. 1530 * However, insertion can result in removal of an old entry. In this 1531 * case we have an additional vnode and bucketlock pair to lock. If the 1532 * entry is negative, ncelock is locked instead of the vnode. 1533 * 1534 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while 1535 * preserving the locking order (smaller address first). 1536 */ 1537 static void 1538 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 1539 uint32_t hash) 1540 { 1541 struct namecache *ncp; 1542 struct rwlock *blps[2]; 1543 1544 blps[0] = HASH2BUCKETLOCK(hash); 1545 for (;;) { 1546 blps[1] = NULL; 1547 cache_lock_vnodes_cel(cel, dvp, vp); 1548 if (vp == NULL || vp->v_type != VDIR) 1549 break; 1550 ncp = vp->v_cache_dd; 1551 if (ncp == NULL) 1552 break; 1553 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 1554 break; 1555 MPASS(ncp->nc_dvp == vp); 1556 blps[1] = NCP2BUCKETLOCK(ncp); 1557 if (ncp->nc_flag & NCF_NEGATIVE) 1558 break; 1559 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 1560 break; 1561 /* 1562 * All vnodes got re-locked. Re-validate the state and if 1563 * nothing changed we are done. Otherwise restart. 1564 */ 1565 if (ncp == vp->v_cache_dd && 1566 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 1567 blps[1] == NCP2BUCKETLOCK(ncp) && 1568 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 1569 break; 1570 cache_unlock_vnodes_cel(cel); 1571 cel->vlp[0] = NULL; 1572 cel->vlp[1] = NULL; 1573 cel->vlp[2] = NULL; 1574 } 1575 cache_lock_buckets_cel(cel, blps[0], blps[1]); 1576 } 1577 1578 static void 1579 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 1580 uint32_t hash) 1581 { 1582 struct namecache *ncp; 1583 struct rwlock *blps[2]; 1584 1585 blps[0] = HASH2BUCKETLOCK(hash); 1586 for (;;) { 1587 blps[1] = NULL; 1588 cache_lock_vnodes_cel(cel, dvp, vp); 1589 ncp = dvp->v_cache_dd; 1590 if (ncp == NULL) 1591 break; 1592 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 1593 break; 1594 MPASS(ncp->nc_dvp == dvp); 1595 blps[1] = NCP2BUCKETLOCK(ncp); 1596 if (ncp->nc_flag & NCF_NEGATIVE) 1597 break; 1598 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 1599 break; 1600 if (ncp == dvp->v_cache_dd && 1601 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 1602 blps[1] == NCP2BUCKETLOCK(ncp) && 1603 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 1604 break; 1605 cache_unlock_vnodes_cel(cel); 1606 cel->vlp[0] = NULL; 1607 cel->vlp[1] = NULL; 1608 cel->vlp[2] = NULL; 1609 } 1610 cache_lock_buckets_cel(cel, blps[0], blps[1]); 1611 } 1612 1613 static void 1614 cache_enter_unlock(struct celockstate *cel) 1615 { 1616 1617 cache_unlock_buckets_cel(cel); 1618 cache_unlock_vnodes_cel(cel); 1619 } 1620 1621 /* 1622 * Add an entry to the cache. 
1623 */ 1624 void 1625 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, 1626 struct timespec *tsp, struct timespec *dtsp) 1627 { 1628 struct celockstate cel; 1629 struct namecache *ncp, *n2, *ndd; 1630 struct namecache_ts *ncp_ts, *n2_ts; 1631 struct nchashhead *ncpp; 1632 struct neglist *neglist; 1633 uint32_t hash; 1634 int flag; 1635 int len; 1636 bool neg_locked; 1637 u_long lnumcache; 1638 1639 CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr); 1640 VNASSERT(vp == NULL || (vp->v_iflag & VI_DOOMED) == 0, vp, 1641 ("cache_enter: Adding a doomed vnode")); 1642 VNASSERT(dvp == NULL || (dvp->v_iflag & VI_DOOMED) == 0, dvp, 1643 ("cache_enter: Doomed vnode used as src")); 1644 1645 if (__predict_false(!doingcache)) 1646 return; 1647 1648 /* 1649 * Avoid blowout in namecache entries. 1650 */ 1651 lnumcache = atomic_fetchadd_long(&numcache, 1) + 1; 1652 if (__predict_false(lnumcache >= desiredvnodes * ncsizefactor)) { 1653 atomic_add_long(&numcache, -1); 1654 return; 1655 } 1656 1657 cache_celockstate_init(&cel); 1658 ndd = NULL; 1659 ncp_ts = NULL; 1660 flag = 0; 1661 if (cnp->cn_nameptr[0] == '.') { 1662 if (cnp->cn_namelen == 1) 1663 return; 1664 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { 1665 len = cnp->cn_namelen; 1666 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 1667 cache_enter_lock_dd(&cel, dvp, vp, hash); 1668 /* 1669 * If dotdot entry already exists, just retarget it 1670 * to new parent vnode, otherwise continue with new 1671 * namecache entry allocation. 1672 */ 1673 if ((ncp = dvp->v_cache_dd) != NULL && 1674 ncp->nc_flag & NCF_ISDOTDOT) { 1675 KASSERT(ncp->nc_dvp == dvp, 1676 ("wrong isdotdot parent")); 1677 neg_locked = false; 1678 if (ncp->nc_flag & NCF_NEGATIVE || vp == NULL) { 1679 neglist = NCP2NEGLIST(ncp); 1680 mtx_lock(&ncneg_hot.nl_lock); 1681 mtx_lock(&neglist->nl_lock); 1682 neg_locked = true; 1683 } 1684 if (!(ncp->nc_flag & NCF_NEGATIVE)) { 1685 TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, 1686 ncp, nc_dst); 1687 } else { 1688 cache_negative_remove(ncp, true); 1689 } 1690 if (vp != NULL) { 1691 TAILQ_INSERT_HEAD(&vp->v_cache_dst, 1692 ncp, nc_dst); 1693 ncp->nc_flag &= ~(NCF_NEGATIVE|NCF_HOTNEGATIVE); 1694 } else { 1695 ncp->nc_flag &= ~(NCF_HOTNEGATIVE); 1696 ncp->nc_flag |= NCF_NEGATIVE; 1697 cache_negative_insert(ncp, true); 1698 } 1699 if (neg_locked) { 1700 mtx_unlock(&neglist->nl_lock); 1701 mtx_unlock(&ncneg_hot.nl_lock); 1702 } 1703 ncp->nc_vp = vp; 1704 cache_enter_unlock(&cel); 1705 return; 1706 } 1707 dvp->v_cache_dd = NULL; 1708 cache_enter_unlock(&cel); 1709 cache_celockstate_init(&cel); 1710 SDT_PROBE3(vfs, namecache, enter, done, dvp, "..", vp); 1711 flag = NCF_ISDOTDOT; 1712 } 1713 } 1714 1715 /* 1716 * Calculate the hash key and setup as much of the new 1717 * namecache entry as possible before acquiring the lock. 
1718 */ 1719 ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); 1720 ncp->nc_flag = flag; 1721 ncp->nc_vp = vp; 1722 if (vp == NULL) 1723 ncp->nc_flag |= NCF_NEGATIVE; 1724 ncp->nc_dvp = dvp; 1725 if (tsp != NULL) { 1726 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 1727 ncp_ts->nc_time = *tsp; 1728 ncp_ts->nc_ticks = ticks; 1729 ncp_ts->nc_nc.nc_flag |= NCF_TS; 1730 if (dtsp != NULL) { 1731 ncp_ts->nc_dotdottime = *dtsp; 1732 ncp_ts->nc_nc.nc_flag |= NCF_DTS; 1733 } 1734 } 1735 len = ncp->nc_nlen = cnp->cn_namelen; 1736 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 1737 strlcpy(ncp->nc_name, cnp->cn_nameptr, len + 1); 1738 cache_enter_lock(&cel, dvp, vp, hash); 1739 1740 /* 1741 * See if this vnode or negative entry is already in the cache 1742 * with this name. This can happen with concurrent lookups of 1743 * the same path name. 1744 */ 1745 ncpp = NCHHASH(hash); 1746 LIST_FOREACH(n2, ncpp, nc_hash) { 1747 if (n2->nc_dvp == dvp && 1748 n2->nc_nlen == cnp->cn_namelen && 1749 !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) { 1750 if (tsp != NULL) { 1751 KASSERT((n2->nc_flag & NCF_TS) != 0, 1752 ("no NCF_TS")); 1753 n2_ts = __containerof(n2, struct namecache_ts, nc_nc); 1754 n2_ts->nc_time = ncp_ts->nc_time; 1755 n2_ts->nc_ticks = ncp_ts->nc_ticks; 1756 if (dtsp != NULL) { 1757 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; 1758 if (ncp->nc_flag & NCF_NEGATIVE) 1759 mtx_lock(&ncneg_hot.nl_lock); 1760 n2_ts->nc_nc.nc_flag |= NCF_DTS; 1761 if (ncp->nc_flag & NCF_NEGATIVE) 1762 mtx_unlock(&ncneg_hot.nl_lock); 1763 } 1764 } 1765 goto out_unlock_free; 1766 } 1767 } 1768 1769 if (flag == NCF_ISDOTDOT) { 1770 /* 1771 * See if we are trying to add .. entry, but some other lookup 1772 * has populated v_cache_dd pointer already. 1773 */ 1774 if (dvp->v_cache_dd != NULL) 1775 goto out_unlock_free; 1776 KASSERT(vp == NULL || vp->v_type == VDIR, 1777 ("wrong vnode type %p", vp)); 1778 dvp->v_cache_dd = ncp; 1779 } 1780 1781 if (vp != NULL) { 1782 if (vp->v_type == VDIR) { 1783 if (flag != NCF_ISDOTDOT) { 1784 /* 1785 * For this case, the cache entry maps both the 1786 * directory name in it and the name ".." for the 1787 * directory's parent. 1788 */ 1789 if ((ndd = vp->v_cache_dd) != NULL) { 1790 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) 1791 cache_zap_locked(ndd, false); 1792 else 1793 ndd = NULL; 1794 } 1795 vp->v_cache_dd = ncp; 1796 } 1797 } else { 1798 vp->v_cache_dd = NULL; 1799 } 1800 } 1801 1802 if (flag != NCF_ISDOTDOT) { 1803 if (LIST_EMPTY(&dvp->v_cache_src)) { 1804 vhold(dvp); 1805 atomic_add_rel_long(&numcachehv, 1); 1806 } 1807 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); 1808 } 1809 1810 /* 1811 * Insert the new namecache entry into the appropriate chain 1812 * within the cache entries table. 1813 */ 1814 LIST_INSERT_HEAD(ncpp, ncp, nc_hash); 1815 1816 /* 1817 * If the entry is "negative", we place it into the 1818 * "negative" cache queue, otherwise, we place it into the 1819 * destination vnode's cache entries queue. 
1820 */ 1821 if (vp != NULL) { 1822 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); 1823 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, 1824 vp); 1825 } else { 1826 if (cnp->cn_flags & ISWHITEOUT) 1827 ncp->nc_flag |= NCF_WHITE; 1828 cache_negative_insert(ncp, false); 1829 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, 1830 ncp->nc_name); 1831 } 1832 cache_enter_unlock(&cel); 1833 if (numneg * ncnegfactor > lnumcache) 1834 cache_negative_zap_one(); 1835 cache_free(ndd); 1836 return; 1837 out_unlock_free: 1838 cache_enter_unlock(&cel); 1839 cache_free(ncp); 1840 return; 1841 } 1842 1843 static u_int 1844 cache_roundup_2(u_int val) 1845 { 1846 u_int res; 1847 1848 for (res = 1; res <= val; res <<= 1) 1849 continue; 1850 1851 return (res); 1852 } 1853 1854 /* 1855 * Name cache initialization, from vfs_init() when we are booting 1856 */ 1857 static void 1858 nchinit(void *dummy __unused) 1859 { 1860 u_int i; 1861 1862 cache_zone_small = uma_zcreate("S VFS Cache", 1863 sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1, 1864 NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache), 1865 UMA_ZONE_ZINIT); 1866 cache_zone_small_ts = uma_zcreate("STS VFS Cache", 1867 sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1, 1868 NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts), 1869 UMA_ZONE_ZINIT); 1870 cache_zone_large = uma_zcreate("L VFS Cache", 1871 sizeof(struct namecache) + NAME_MAX + 1, 1872 NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache), 1873 UMA_ZONE_ZINIT); 1874 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", 1875 sizeof(struct namecache_ts) + NAME_MAX + 1, 1876 NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts), 1877 UMA_ZONE_ZINIT); 1878 1879 nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash); 1880 ncbuckethash = cache_roundup_2(mp_ncpus * 64) - 1; 1881 if (ncbuckethash > nchash) 1882 ncbuckethash = nchash; 1883 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, 1884 M_WAITOK | M_ZERO); 1885 for (i = 0; i < numbucketlocks; i++) 1886 rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE); 1887 ncvnodehash = cache_roundup_2(mp_ncpus * 64) - 1; 1888 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, 1889 M_WAITOK | M_ZERO); 1890 for (i = 0; i < numvnodelocks; i++) 1891 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); 1892 ncpurgeminvnodes = numbucketlocks; 1893 1894 ncneghash = 3; 1895 neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE, 1896 M_WAITOK | M_ZERO); 1897 for (i = 0; i < numneglists; i++) { 1898 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); 1899 TAILQ_INIT(&neglists[i].nl_list); 1900 } 1901 mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF); 1902 TAILQ_INIT(&ncneg_hot.nl_list); 1903 1904 mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF); 1905 1906 numcalls = counter_u64_alloc(M_WAITOK); 1907 dothits = counter_u64_alloc(M_WAITOK); 1908 dotdothits = counter_u64_alloc(M_WAITOK); 1909 numchecks = counter_u64_alloc(M_WAITOK); 1910 nummiss = counter_u64_alloc(M_WAITOK); 1911 nummisszap = counter_u64_alloc(M_WAITOK); 1912 numposzaps = counter_u64_alloc(M_WAITOK); 1913 numposhits = counter_u64_alloc(M_WAITOK); 1914 numnegzaps = counter_u64_alloc(M_WAITOK); 1915 numneghits = counter_u64_alloc(M_WAITOK); 1916 numfullpathcalls = counter_u64_alloc(M_WAITOK); 1917 numfullpathfail1 = counter_u64_alloc(M_WAITOK); 1918 numfullpathfail2 = counter_u64_alloc(M_WAITOK); 1919 numfullpathfail4 = counter_u64_alloc(M_WAITOK); 1920 numfullpathfound = 

void
cache_changesize(int newmaxvnodes)
{
        struct nchashhead *new_nchashtbl, *old_nchashtbl;
        u_long new_nchash, old_nchash;
        struct namecache *ncp;
        uint32_t hash;
        int i;

        newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
        if (newmaxvnodes < numbucketlocks)
                newmaxvnodes = numbucketlocks;

        new_nchashtbl = hashinit(newmaxvnodes, M_VFSCACHE, &new_nchash);
        /* If same hash table size, nothing to do */
        if (nchash == new_nchash) {
                free(new_nchashtbl, M_VFSCACHE);
                return;
        }
        /*
         * Move everything from the old hash table to the new table.
         * None of the namecache entries in the table can be removed
         * because to do so, they have to be removed from the hash table.
         */
        cache_lock_all_vnodes();
        cache_lock_all_buckets();
        old_nchashtbl = nchashtbl;
        old_nchash = nchash;
        nchashtbl = new_nchashtbl;
        nchash = new_nchash;
        for (i = 0; i <= old_nchash; i++) {
                while ((ncp = LIST_FIRST(&old_nchashtbl[i])) != NULL) {
                        hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
                            ncp->nc_dvp);
                        LIST_REMOVE(ncp, nc_hash);
                        LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
                }
        }
        cache_unlock_all_buckets();
        cache_unlock_all_vnodes();
        free(old_nchashtbl, M_VFSCACHE);
}

/*
 * Invalidate all entries from and to a particular vnode.
 */
void
cache_purge(struct vnode *vp)
{
        TAILQ_HEAD(, namecache) ncps;
        struct namecache *ncp, *nnp;
        struct mtx *vlp, *vlp2;

        CTR1(KTR_VFS, "cache_purge(%p)", vp);
        SDT_PROBE1(vfs, namecache, purge, done, vp);
        if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
            vp->v_cache_dd == NULL)
                return;
        TAILQ_INIT(&ncps);
        vlp = VP2VNODELOCK(vp);
        vlp2 = NULL;
        mtx_lock(vlp);
retry:
        while (!LIST_EMPTY(&vp->v_cache_src)) {
                ncp = LIST_FIRST(&vp->v_cache_src);
                if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
                        goto retry;
                TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
        }
        while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
                ncp = TAILQ_FIRST(&vp->v_cache_dst);
                if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
                        goto retry;
                TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
        }
        ncp = vp->v_cache_dd;
        if (ncp != NULL) {
                KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
                    ("lost dotdot link"));
                if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
                        goto retry;
                TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
        }
        KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
        mtx_unlock(vlp);
        if (vlp2 != NULL)
                mtx_unlock(vlp2);
        TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
                cache_free(ncp);
        }
}
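
/*
 * Illustrative note (not part of the original file): cache_purge() is the
 * hammer used by filesystem code and the generic vnode layer whenever a
 * vnode's name mappings go stale, e.g. when a rename or remove invalidates
 * the old name:
 *
 *	cache_purge(fvp);
 *
 * cache_purge_negative() below is the narrower variant for dropping only the
 * negative entries under a directory, typically after a successful create.
 */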

/*
 * Invalidate all negative entries for a particular directory vnode.
 */
void
cache_purge_negative(struct vnode *vp)
{
        TAILQ_HEAD(, namecache) ncps;
        struct namecache *ncp, *nnp;
        struct mtx *vlp;

        CTR1(KTR_VFS, "cache_purge_negative(%p)", vp);
        SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
        if (LIST_EMPTY(&vp->v_cache_src))
                return;
        TAILQ_INIT(&ncps);
        vlp = VP2VNODELOCK(vp);
        mtx_lock(vlp);
        LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
                if (!(ncp->nc_flag & NCF_NEGATIVE))
                        continue;
                cache_zap_negative_locked_vnode_kl(ncp, vp);
                TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
        }
        mtx_unlock(vlp);
        TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
                cache_free(ncp);
        }
}

/*
 * Flush all entries referencing a particular filesystem.
 */
void
cache_purgevfs(struct mount *mp, bool force)
{
        TAILQ_HEAD(, namecache) ncps;
        struct mtx *vlp1, *vlp2;
        struct rwlock *blp;
        struct nchashhead *bucket;
        struct namecache *ncp, *nnp;
        u_long i, j, n_nchash;
        int error;

        /* Scan hash tables for applicable entries */
        SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
        if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes)
                return;
        TAILQ_INIT(&ncps);
        n_nchash = nchash + 1;
        vlp1 = vlp2 = NULL;
        for (i = 0; i < numbucketlocks; i++) {
                blp = (struct rwlock *)&bucketlocks[i];
                rw_wlock(blp);
                for (j = i; j < n_nchash; j += numbucketlocks) {
retry:
                        bucket = &nchashtbl[j];
                        LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) {
                                cache_assert_bucket_locked(ncp, RA_WLOCKED);
                                if (ncp->nc_dvp->v_mount != mp)
                                        continue;
                                error = cache_zap_wlocked_bucket_kl(ncp, blp,
                                    &vlp1, &vlp2);
                                if (error != 0)
                                        goto retry;
                                TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst);
                        }
                }
                rw_wunlock(blp);
                if (vlp1 == NULL && vlp2 == NULL)
                        cache_maybe_yield();
        }
        if (vlp1 != NULL)
                mtx_unlock(vlp1);
        if (vlp2 != NULL)
                mtx_unlock(vlp2);

        TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
                cache_free(ncp);
        }
}

/*
 * Perform canonical checks and cache lookup and pass on to the filesystem
 * through the vop_cachedlookup only if needed.
 */

int
vfs_cache_lookup(struct vop_lookup_args *ap)
{
        struct vnode *dvp;
        int error;
        struct vnode **vpp = ap->a_vpp;
        struct componentname *cnp = ap->a_cnp;
        struct ucred *cred = cnp->cn_cred;
        int flags = cnp->cn_flags;
        struct thread *td = cnp->cn_thread;

        *vpp = NULL;
        dvp = ap->a_dvp;

        if (dvp->v_type != VDIR)
                return (ENOTDIR);

        if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
            (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
                return (EROFS);

        error = VOP_ACCESS(dvp, VEXEC, cred, td);
        if (error)
                return (error);

        error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
        if (error == 0)
                return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
        if (error == -1)
                return (0);
        return (error);
}
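
/*
 * Illustrative sketch (not part of the original file): a filesystem opts into
 * the shared name cache by pointing its vop_lookup slot at vfs_cache_lookup()
 * and supplying its real lookup routine as vop_cachedlookup, roughly:
 *
 *	struct vop_vector foofs_vnodeops = {
 *		.vop_default		= &default_vnodeops,
 *		.vop_lookup		= vfs_cache_lookup,
 *		.vop_cachedlookup	= foofs_lookup,
 *		...
 *	};
 *
 * "foofs" is a placeholder name; UFS and most local filesystems follow this
 * pattern, so VOP_CACHEDLOOKUP is only reached on a cache miss.
 */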

/*
 * XXX All of these sysctls would probably be more productive dead.
 */
static int __read_mostly disablecwd;
SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
    "Disable the getcwd syscall");

/* Implementation of the getcwd syscall. */
int
sys___getcwd(struct thread *td, struct __getcwd_args *uap)
{

        return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen,
            MAXPATHLEN));
}

int
kern___getcwd(struct thread *td, char *buf, enum uio_seg bufseg, size_t buflen,
    size_t path_max)
{
        char *bp, *tmpbuf;
        struct filedesc *fdp;
        struct vnode *cdir, *rdir;
        int error;

        if (__predict_false(disablecwd))
                return (ENODEV);
        if (__predict_false(buflen < 2))
                return (EINVAL);
        if (buflen > path_max)
                buflen = path_max;

        tmpbuf = malloc(buflen, M_TEMP, M_WAITOK);
        fdp = td->td_proc->p_fd;
        FILEDESC_SLOCK(fdp);
        cdir = fdp->fd_cdir;
        vrefact(cdir);
        rdir = fdp->fd_rdir;
        vrefact(rdir);
        FILEDESC_SUNLOCK(fdp);
        error = vn_fullpath1(td, cdir, rdir, tmpbuf, &bp, buflen);
        vrele(rdir);
        vrele(cdir);

        if (!error) {
                if (bufseg == UIO_SYSSPACE)
                        bcopy(bp, buf, strlen(bp) + 1);
                else
                        error = copyout(bp, buf, strlen(bp) + 1);
#ifdef KTRACE
                if (KTRPOINT(curthread, KTR_NAMEI))
                        ktrnamei(bp);
#endif
        }
        free(tmpbuf, M_TEMP);
        return (error);
}

/*
 * Thus begins the fullpath magic.
 */

static int __read_mostly disablefullpath;
SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0,
    "Disable the vn_fullpath function");

/*
 * Retrieve the full filesystem path that corresponds to a vnode from the name
 * cache (if available).
 */
int
vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
{
        char *buf;
        struct filedesc *fdp;
        struct vnode *rdir;
        int error;

        if (__predict_false(disablefullpath))
                return (ENODEV);
        if (__predict_false(vn == NULL))
                return (EINVAL);

        buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
        fdp = td->td_proc->p_fd;
        FILEDESC_SLOCK(fdp);
        rdir = fdp->fd_rdir;
        vrefact(rdir);
        FILEDESC_SUNLOCK(fdp);
        error = vn_fullpath1(td, vn, rdir, buf, retbuf, MAXPATHLEN);
        vrele(rdir);

        if (!error)
                *freebuf = buf;
        else
                free(buf, M_TEMP);
        return (error);
}
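
/*
 * Illustrative usage sketch (not part of the original file): on success the
 * formatted path is returned via *retbuf, which points into a separate
 * allocation handed back through *freebuf that the caller must release:
 *
 *	char *fullpath, *freebuf;
 *
 *	if (vn_fullpath(curthread, vp, &fullpath, &freebuf) == 0) {
 *		printf("vnode resolves to %s\n", fullpath);
 *		free(freebuf, M_TEMP);
 *	}
 */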

/*
 * This function is similar to vn_fullpath, but it attempts to look up the
 * pathname relative to the global root mount point.  This is required for the
 * auditing sub-system, as audited pathnames must be absolute, relative to the
 * global root mount point.
 */
int
vn_fullpath_global(struct thread *td, struct vnode *vn,
    char **retbuf, char **freebuf)
{
        char *buf;
        int error;

        if (__predict_false(disablefullpath))
                return (ENODEV);
        if (__predict_false(vn == NULL))
                return (EINVAL);
        buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
        error = vn_fullpath1(td, vn, rootvnode, buf, retbuf, MAXPATHLEN);
        if (!error)
                *freebuf = buf;
        else
                free(buf, M_TEMP);
        return (error);
}

int
vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, u_int *buflen)
{
        struct vnode *dvp;
        struct namecache *ncp;
        struct mtx *vlp;
        int error;

        vlp = VP2VNODELOCK(*vp);
        mtx_lock(vlp);
        TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) {
                if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
                        break;
        }
        if (ncp != NULL) {
                if (*buflen < ncp->nc_nlen) {
                        mtx_unlock(vlp);
                        vrele(*vp);
                        counter_u64_add(numfullpathfail4, 1);
                        error = ENOMEM;
                        SDT_PROBE3(vfs, namecache, fullpath, return, error,
                            vp, NULL);
                        return (error);
                }
                *buflen -= ncp->nc_nlen;
                memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
                SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
                    ncp->nc_name, vp);
                dvp = *vp;
                *vp = ncp->nc_dvp;
                vref(*vp);
                mtx_unlock(vlp);
                vrele(dvp);
                return (0);
        }
        SDT_PROBE1(vfs, namecache, fullpath, miss, vp);

        mtx_unlock(vlp);
        vn_lock(*vp, LK_SHARED | LK_RETRY);
        error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen);
        vput(*vp);
        if (error) {
                counter_u64_add(numfullpathfail2, 1);
                SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
                return (error);
        }

        *vp = dvp;
        if (dvp->v_iflag & VI_DOOMED) {
                /* forced unmount */
                vrele(dvp);
                error = ENOENT;
                SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
                return (error);
        }
        /*
         * *vp has its use count incremented still.
         */

        return (0);
}
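
/*
 * Illustrative note (not part of the original file): vn_vptocnp() consumes
 * the caller's reference on *vp and, on success, replaces *vp with the
 * referenced parent while prepending the name component at the tail of the
 * buffer.  One step of the walk toward the root, mirroring what
 * vn_fullpath1() does below, looks roughly like:
 *
 *	u_int buflen = MAXPATHLEN;
 *	char *buf = malloc(buflen, M_TEMP, M_WAITOK);
 *
 *	vref(vp);
 *	error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen);
 *
 * On success, buf + buflen holds the last path component (not
 * NUL-terminated) and vp now refers to the parent, still referenced.
 */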

/*
 * The magic behind kern___getcwd() and vn_fullpath().
 */
static int
vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
    char *buf, char **retbuf, u_int buflen)
{
        int error, slash_prefixed;
#ifdef KDTRACE_HOOKS
        struct vnode *startvp = vp;
#endif
        struct vnode *vp1;

        buflen--;
        buf[buflen] = '\0';
        error = 0;
        slash_prefixed = 0;

        SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
        counter_u64_add(numfullpathcalls, 1);
        vref(vp);
        if (vp->v_type != VDIR) {
                error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen);
                if (error)
                        return (error);
                if (buflen == 0) {
                        vrele(vp);
                        return (ENOMEM);
                }
                buf[--buflen] = '/';
                slash_prefixed = 1;
        }
        while (vp != rdir && vp != rootvnode) {
                /*
                 * The vp vnode must be already fully constructed,
                 * since it is either found in namecache or obtained
                 * from VOP_VPTOCNP().  We may test for VV_ROOT safely
                 * without obtaining the vnode lock.
                 */
                if ((vp->v_vflag & VV_ROOT) != 0) {
                        vn_lock(vp, LK_RETRY | LK_SHARED);

                        /*
                         * With the vnode locked, check for races with
                         * unmount, forced or not.  Note that we
                         * already verified that vp is not equal to
                         * the root vnode, which means that
                         * mnt_vnodecovered can be NULL only for the
                         * case of unmount.
                         */
                        if ((vp->v_iflag & VI_DOOMED) != 0 ||
                            (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
                            vp1->v_mountedhere != vp->v_mount) {
                                vput(vp);
                                error = ENOENT;
                                SDT_PROBE3(vfs, namecache, fullpath, return,
                                    error, vp, NULL);
                                break;
                        }

                        vref(vp1);
                        vput(vp);
                        vp = vp1;
                        continue;
                }
                if (vp->v_type != VDIR) {
                        vrele(vp);
                        counter_u64_add(numfullpathfail1, 1);
                        error = ENOTDIR;
                        SDT_PROBE3(vfs, namecache, fullpath, return,
                            error, vp, NULL);
                        break;
                }
                error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen);
                if (error)
                        break;
                if (buflen == 0) {
                        vrele(vp);
                        error = ENOMEM;
                        SDT_PROBE3(vfs, namecache, fullpath, return, error,
                            startvp, NULL);
                        break;
                }
                buf[--buflen] = '/';
                slash_prefixed = 1;
        }
        if (error)
                return (error);
        if (!slash_prefixed) {
                if (buflen == 0) {
                        vrele(vp);
                        counter_u64_add(numfullpathfail4, 1);
                        SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
                            startvp, NULL);
                        return (ENOMEM);
                }
                buf[--buflen] = '/';
        }
        counter_u64_add(numfullpathfound, 1);
        vrele(vp);

        SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, buf + buflen);
        *retbuf = buf + buflen;
        return (0);
}

struct vnode *
vn_dir_dd_ino(struct vnode *vp)
{
        struct namecache *ncp;
        struct vnode *ddvp;
        struct mtx *vlp;
        enum vgetstate vs;

        ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
        vlp = VP2VNODELOCK(vp);
        mtx_lock(vlp);
        TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
                if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
                        continue;
                ddvp = ncp->nc_dvp;
                vs = vget_prep(ddvp);
                mtx_unlock(vlp);
                if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
                        return (NULL);
                return (ddvp);
        }
        mtx_unlock(vlp);
        return (NULL);
}

int
vn_commname(struct vnode *vp, char *buf, u_int buflen)
{
        struct namecache *ncp;
        struct mtx *vlp;
        int l;

        vlp = VP2VNODELOCK(vp);
        mtx_lock(vlp);
        TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
                if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
                        break;
        if (ncp == NULL) {
                mtx_unlock(vlp);
                return (ENOENT);
        }
        l = min(ncp->nc_nlen, buflen - 1);
        memcpy(buf, ncp->nc_name, l);
        mtx_unlock(vlp);
        buf[l] = '\0';
        return (0);
}
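
/*
 * Illustrative usage sketch (not part of the original file): vn_commname()
 * copies the last cached (non-dotdot) name of a vnode into a caller-supplied
 * buffer and NUL-terminates it, e.g.:
 *
 *	char name[NAME_MAX + 1];
 *
 *	if (vn_commname(vp, name, sizeof(name)) == 0)
 *		printf("cached name: %s\n", name);
 */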

/*
 * This function updates the path string to the vnode's full global path
 * and checks the size of the new path string against the pathlen argument.
 *
 * Requires a locked, referenced vnode.
 * Vnode is re-locked on success or ENODEV, otherwise unlocked.
 *
 * If sysctl debug.disablefullpath is set, ENODEV is returned and the
 * vnode is left locked and the path remains untouched.
 *
 * If vp is a directory, the call to vn_fullpath_global() always succeeds
 * because it falls back to the ".." lookup if the namecache lookup fails.
 */
int
vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
    u_int pathlen)
{
        struct nameidata nd;
        struct vnode *vp1;
        char *rpath, *fbuf;
        int error;

        ASSERT_VOP_ELOCKED(vp, __func__);

        /* Return ENODEV if sysctl debug.disablefullpath==1 */
        if (__predict_false(disablefullpath))
                return (ENODEV);

        /* Construct global filesystem path from vp. */
        VOP_UNLOCK(vp, 0);
        error = vn_fullpath_global(td, vp, &rpath, &fbuf);

        if (error != 0) {
                vrele(vp);
                return (error);
        }

        if (strlen(rpath) >= pathlen) {
                vrele(vp);
                error = ENAMETOOLONG;
                goto out;
        }

        /*
         * Re-lookup the vnode by path to detect a possible rename.
         * As a side effect, the vnode is relocked.
         * If the vnode was renamed, return ENOENT.
         */
        NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
            UIO_SYSSPACE, path, td);
        error = namei(&nd);
        if (error != 0) {
                vrele(vp);
                goto out;
        }
        NDFREE(&nd, NDF_ONLY_PNBUF);
        vp1 = nd.ni_vp;
        vrele(vp);
        if (vp1 == vp)
                strcpy(path, rpath);
        else {
                vput(vp1);
                error = ENOENT;
        }

out:
        free(fbuf, M_TEMP);
        return (error);
}

#ifdef DDB
static void
db_print_vpath(struct vnode *vp)
{

        while (vp != NULL) {
                db_printf("%p: ", vp);
                if (vp == rootvnode) {
                        db_printf("/");
                        vp = NULL;
                } else {
                        if (vp->v_vflag & VV_ROOT) {
                                db_printf("<mount point>");
                                vp = vp->v_mount->mnt_vnodecovered;
                        } else {
                                struct namecache *ncp;
                                char *ncn;
                                int i;

                                ncp = TAILQ_FIRST(&vp->v_cache_dst);
                                if (ncp != NULL) {
                                        ncn = ncp->nc_name;
                                        for (i = 0; i < ncp->nc_nlen; i++)
                                                db_printf("%c", *ncn++);
                                        vp = ncp->nc_dvp;
                                } else {
                                        vp = NULL;
                                }
                        }
                }
                db_printf("\n");
        }

        return;
}

DB_SHOW_COMMAND(vpath, db_show_vpath)
{
        struct vnode *vp;

        if (!have_addr) {
                db_printf("usage: show vpath <struct vnode *>\n");
                return;
        }

        vp = (struct vnode *)addr;
        db_print_vpath(vp);
}

#endif