/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Poul-Henning Kamp of the FreeBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/counter.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sdt.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vnode.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#ifdef DDB
#include <ddb/ddb.h>
#endif

#include <vm/uma.h>

SDT_PROVIDER_DECLARE(vfs);
SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
    "char *", "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
    "struct vnode *", "char *");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
    "struct vnode *", "char *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, zap_negative, done, "struct vnode *",
    "char *", "int");
SDT_PROBE_DEFINE3(vfs, namecache, shrink_negative, done, "struct vnode *",
    "char *", "int");

/*
 * This structure describes the elements in the cache of recent
 * names looked up by namei.
 */

struct namecache {
	LIST_ENTRY(namecache) nc_hash;	/* hash chain */
	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
	struct	vnode *nc_dvp;		/* vnode of parent of name */
	union {
		struct	vnode *nu_vp;	/* vnode the name refers to */
		u_int	nu_neghits;	/* negative entry hits */
	} n_un;
	u_char	nc_flag;		/* flag bits */
	u_char	nc_nlen;		/* length of name */
	char	nc_name[0];		/* segment name + nul */
};

/*
 * struct namecache_ts repeats struct namecache layout up to the
 * nc_nlen member.
 * struct namecache_ts is used in place of struct namecache when time(s) need
 * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
 * both a non-dotdot directory name plus dotdot for the directory's
 * parent.
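 *
 * Entries carrying timestamps have NCF_TS set in nc_flag (and NCF_DTS when
 * nc_dotdottime is valid); code below such as cache_free() and cache_out_ts()
 * recovers the enclosing structure with
 * __containerof(ncp, struct namecache_ts, nc_nc).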
 */
struct	namecache_ts {
	struct	timespec nc_time;	/* timespec provided by fs */
	struct	timespec nc_dotdottime;	/* dotdot timespec provided by fs */
	int	nc_ticks;		/* ticks value when entry was added */
	struct namecache nc_nc;
};

#define	nc_vp		n_un.nu_vp
#define	nc_neghits	n_un.nu_neghits

/*
 * Flags in namecache.nc_flag
 */
#define NCF_WHITE	0x01
#define NCF_ISDOTDOT	0x02
#define	NCF_TS		0x04
#define	NCF_DTS		0x08
#define	NCF_DVDROP	0x10
#define	NCF_NEGATIVE	0x20
#define	NCF_HOTNEGATIVE	0x40

/*
 * Name caching works as follows:
 *
 * Names found by directory scans are retained in a cache
 * for future reference.  It is managed LRU, so frequently
 * used names will hang around.  Cache is indexed by hash value
 * obtained from (dvp, name) where dvp refers to the directory
 * containing name.
 *
 * If it is a "negative" entry, (i.e. for a name that is known NOT to
 * exist) the vnode pointer will be NULL.
 *
 * Upon reaching the last segment of a path, if the reference
 * is for DELETE, or NOCACHE is set (rewrite), and the
 * name is located in the cache, it will be dropped.
 *
 * These locks are used (in the order in which they can be taken):
 * NAME		TYPE	ROLE
 * vnodelock	mtx	vnode lists and v_cache_dd field protection
 * bucketlock	rwlock	for access to given set of hash buckets
 * neglist	mtx	negative entry LRU management
 *
 * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
 * shrinking the LRU list.
 *
 * It is legal to take multiple vnodelock and bucketlock locks. The locking
 * order is lower address first. Both are recursive.
 *
 * "." lookups are lockless.
 *
 * ".." and vnode -> name lookups require vnodelock.
 *
 * name -> vnode lookup requires the relevant bucketlock to be held for reading.
 *
 * Insertions and removals of entries require involved vnodes and bucketlocks
 * to be write-locked to prevent other threads from seeing the entry.
 *
 * Some lookups result in removal of the found entry (e.g. getting rid of a
 * negative entry with the intent to create a positive one), which poses a
 * problem when multiple threads reach the state. Similarly, two different
 * threads can purge two different vnodes and try to remove the same name.
 *
 * If the already held vnode lock is lower than the second required lock, we
 * can just take the other lock. However, in the opposite case, this could
 * deadlock. As such, this is resolved by trylocking and if that fails unlocking
 * the first node, locking everything in order and revalidating the state.
 */

/*
 * Structures associated with name caching.
 */
#define NCHHASH(hash) \
	(&nchashtbl[(hash) & nchash])
static __read_mostly LIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
static u_long __read_mostly	nchash;			/* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
    "Size of namecache hash table");
static u_long __read_mostly	ncnegfactor = 12; /* ratio of negative entries */
SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of negative namecache entries");
static u_long __exclusive_cache_line	numneg;	/* number of negative entries allocated */
SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0,
    "Number of negative entries in namecache");
static u_long __exclusive_cache_line	numcache;/* number of cache entries allocated */
SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0,
    "Number of namecache entries");
static u_long __exclusive_cache_line	numcachehv;/* number of cache entries with vnodes held */
SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0,
    "Number of namecache entries with vnodes held");
u_int __read_mostly	ncsizefactor = 2;
SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
    "Size factor for namecache");
static u_int __read_mostly	ncpurgeminvnodes;
SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0,
    "Number of vnodes below which purgevfs ignores the request");
static u_int __read_mostly	ncneghitsrequeue = 8;
SYSCTL_UINT(_vfs, OID_AUTO, ncneghitsrequeue, CTLFLAG_RW, &ncneghitsrequeue, 0,
    "Number of hits to requeue a negative entry in the LRU list");

struct nchstats	nchstats;		/* cache effectiveness statistics */

static struct mtx       ncneg_shrink_lock;
static int	shrink_list_turn;

struct neglist {
	struct mtx		nl_lock;
	TAILQ_HEAD(, namecache) nl_list;
} __aligned(CACHE_LINE_SIZE);

static struct neglist __read_mostly	*neglists;
static struct neglist ncneg_hot;

#define	numneglists (ncneghash + 1)
static u_int __read_mostly	ncneghash;
static inline struct neglist *
NCP2NEGLIST(struct namecache *ncp)
{

	return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
}

#define	numbucketlocks (ncbuckethash + 1)
static u_int __read_mostly  ncbuckethash;
static struct rwlock_padalign __read_mostly  *bucketlocks;
#define	HASH2BUCKETLOCK(hash) \
	((struct rwlock *)(&bucketlocks[((hash) & ncbuckethash)]))

#define	numvnodelocks (ncvnodehash + 1)
static u_int __read_mostly  ncvnodehash;
static struct mtx __read_mostly *vnodelocks;
static inline struct mtx *
VP2VNODELOCK(struct vnode *vp)
{

	return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
}

/*
 * UMA zones for the VFS cache.
 *
 * The small cache is used for entries with short names, which are the
 * most common.  The large cache is used for entries which are too big to
 * fit in the small cache.
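 *
 * The split point between the two is CACHE_PATH_CUTOFF bytes of name, and the
 * "_ts" variants below back struct namecache_ts allocations for entries that
 * also carry timestamps.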
 */
static uma_zone_t __read_mostly cache_zone_small;
static uma_zone_t __read_mostly cache_zone_small_ts;
static uma_zone_t __read_mostly cache_zone_large;
static uma_zone_t __read_mostly cache_zone_large_ts;

#define	CACHE_PATH_CUTOFF	35

static struct namecache *
cache_alloc(int len, int ts)
{
	struct namecache_ts *ncp_ts;
	struct namecache *ncp;

	if (__predict_false(ts)) {
		if (len <= CACHE_PATH_CUTOFF)
			ncp_ts = uma_zalloc(cache_zone_small_ts, M_WAITOK);
		else
			ncp_ts = uma_zalloc(cache_zone_large_ts, M_WAITOK);
		ncp = &ncp_ts->nc_nc;
	} else {
		if (len <= CACHE_PATH_CUTOFF)
			ncp = uma_zalloc(cache_zone_small, M_WAITOK);
		else
			ncp = uma_zalloc(cache_zone_large, M_WAITOK);
	}
	return (ncp);
}

static void
cache_free(struct namecache *ncp)
{
	struct namecache_ts *ncp_ts;

	if (ncp == NULL)
		return;
	if ((ncp->nc_flag & NCF_DVDROP) != 0)
		vdrop(ncp->nc_dvp);
	if (__predict_false(ncp->nc_flag & NCF_TS)) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
			uma_zfree(cache_zone_small_ts, ncp_ts);
		else
			uma_zfree(cache_zone_large_ts, ncp_ts);
	} else {
		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
			uma_zfree(cache_zone_small, ncp);
		else
			uma_zfree(cache_zone_large, ncp);
	}
}

static void
cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
{
	struct namecache_ts *ncp_ts;

	KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
	    (tsp == NULL && ticksp == NULL),
	    ("No NCF_TS"));

	if (tsp == NULL && ticksp == NULL)
		return;

	ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
	if (tsp != NULL)
		*tsp = ncp_ts->nc_time;
	if (ticksp != NULL)
		*ticksp = ncp_ts->nc_ticks;
}

static int __read_mostly	doingcache = 1;	/* 1 => enable the cache */
SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
    "VFS namecache enabled");

/* Export size information to userland */
SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
    sizeof(struct namecache), "sizeof(struct namecache)");

/*
 * The new name cache statistics
 */
static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0,
    "Name cache statistics");
#define STATNODE_ULONG(name, descr)					\
	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr);
#define STATNODE_COUNTER(name, descr)					\
	static counter_u64_t __read_mostly name;			\
	SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, descr);
STATNODE_ULONG(numneg, "Number of negative cache entries");
STATNODE_ULONG(numcache, "Number of cache entries");
STATNODE_COUNTER(numcalls, "Number of cache lookups");
STATNODE_COUNTER(dothits, "Number of '.' hits");
STATNODE_COUNTER(dotdothits, "Number of '..' hits");
STATNODE_COUNTER(numchecks, "Number of checks in lookup");
STATNODE_COUNTER(nummiss, "Number of cache misses");
STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache");
STATNODE_COUNTER(numposzaps,
    "Number of cache hits (positive) we do not want to cache");
STATNODE_COUNTER(numposhits, "Number of cache hits (positive)");
STATNODE_COUNTER(numnegzaps,
    "Number of cache hits (negative) we do not want to cache");
STATNODE_COUNTER(numneghits, "Number of cache hits (negative)");
/* These count for kern___getcwd(), too. */
STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls");
STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
STATNODE_COUNTER(numfullpathfail2,
    "Number of fullpath search errors (VOP_VPTOCNP failures)");
STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls");
static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail,
    "Number of times zap_and_exit failed to lock");
static long cache_lock_vnodes_cel_3_failures;
STATNODE_ULONG(cache_lock_vnodes_cel_3_failures,
    "Number of times 3-way vnode locking failed");

static void cache_zap_locked(struct namecache *ncp, bool neg_locked);
static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
    char *buf, char **retbuf, u_int buflen);

static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");

static int cache_yield;
SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0,
    "Number of times cache called yield");

static void
cache_maybe_yield(void)
{

	if (should_yield()) {
		cache_yield++;
		kern_yield(PRI_USER);
	}
}

static inline void
cache_assert_vlp_locked(struct mtx *vlp)
{

	if (vlp != NULL)
		mtx_assert(vlp, MA_OWNED);
}

static inline void
cache_assert_vnode_locked(struct vnode *vp)
{
	struct mtx *vlp;

	vlp = VP2VNODELOCK(vp);
	cache_assert_vlp_locked(vlp);
}

static uint32_t
cache_get_hash(char *name, u_char len, struct vnode *dvp)
{
	uint32_t hash;

	hash = fnv_32_buf(name, len, FNV1_32_INIT);
	hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
	return (hash);
}

static inline struct rwlock *
NCP2BUCKETLOCK(struct namecache *ncp)
{
	uint32_t hash;

	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
	return (HASH2BUCKETLOCK(hash));
}

#ifdef INVARIANTS
static void
cache_assert_bucket_locked(struct namecache *ncp, int mode)
{
	struct rwlock *blp;

	blp = NCP2BUCKETLOCK(ncp);
	rw_assert(blp, mode);
}
#else
#define cache_assert_bucket_locked(x, y) do { } while (0)
#endif

#define cache_sort(x, y)	_cache_sort((void **)(x), (void **)(y))
static void
_cache_sort(void **p1, void **p2)
{
	void *tmp;

	if (*p1 > *p2) {
		tmp = *p2;
		*p2 = *p1;
		*p1 = tmp;
	}
}

static void
cache_lock_all_buckets(void)
{
	u_int i;

	for (i = 0; i < numbucketlocks; i++)
		rw_wlock(&bucketlocks[i]);
}

static void
cache_unlock_all_buckets(void)
{
	u_int i;

	for (i = 0; i < numbucketlocks; i++)
		rw_wunlock(&bucketlocks[i]);
}

static void
cache_lock_all_vnodes(void)
{
	u_int i;

	for (i = 0; i < numvnodelocks; i++)
		mtx_lock(&vnodelocks[i]);
}

static void
cache_unlock_all_vnodes(void)
{
	u_int i;

	for (i = 0; i < numvnodelocks; i++)
		mtx_unlock(&vnodelocks[i]);
}

static int
cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	cache_sort(&vlp1, &vlp2);
	MPASS(vlp2 != NULL);

	if (vlp1 != NULL) {
		if (!mtx_trylock(vlp1))
			return (EAGAIN);
	}
	if (!mtx_trylock(vlp2)) {
		if (vlp1 != NULL)
			mtx_unlock(vlp1);
		return (EAGAIN);
	}

	return (0);
}
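
/*
 * Illustrative sketch (not part of the original code): the zapping helpers
 * below use cache_trylock_vnodes() and cache_unlock_vnodes() by computing
 * both vnode locks, attempting to take them without blocking and backing off
 * on failure, roughly:
 *
 *	dvlp = VP2VNODELOCK(ncp->nc_dvp);
 *	vlp = (ncp->nc_flag & NCF_NEGATIVE) ? NULL : VP2VNODELOCK(ncp->nc_vp);
 *	if (cache_trylock_vnodes(dvlp, vlp) != 0)
 *		return (EAGAIN);	(the caller unwinds and retries)
 *	(... remove the entry ...)
 *	cache_unlock_vnodes(dvlp, vlp);
 *
 * See cache_zap_wlocked_bucket() and cache_zap_rlocked_bucket() below for the
 * real versions.
 */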

static void
cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	MPASS(vlp1 != NULL || vlp2 != NULL);

	if (vlp1 != NULL)
		mtx_unlock(vlp1);
	if (vlp2 != NULL)
		mtx_unlock(vlp2);
}

static int
sysctl_nchstats(SYSCTL_HANDLER_ARGS)
{
	struct nchstats snap;

	if (req->oldptr == NULL)
		return (SYSCTL_OUT(req, 0, sizeof(snap)));

	snap = nchstats;
	snap.ncs_goodhits = counter_u64_fetch(numposhits);
	snap.ncs_neghits = counter_u64_fetch(numneghits);
	snap.ncs_badhits = counter_u64_fetch(numposzaps) +
	    counter_u64_fetch(numnegzaps);
	snap.ncs_miss = counter_u64_fetch(nummisszap) +
	    counter_u64_fetch(nummiss);

	return (SYSCTL_OUT(req, &snap, sizeof(snap)));
}
SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
    CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
    "VFS cache effectiveness statistics");

#ifdef DIAGNOSTIC
/*
 * Grab an atomic snapshot of the name cache hash chain lengths
 */
static SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL,
    "hash table stats");

static int
sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
{
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int i, error, n_nchash, *cntbuf;

retry:
	n_nchash = nchash + 1;	/* nchash is max index, not count */
	if (req->oldptr == NULL)
		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
	cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
	cache_lock_all_buckets();
	if (n_nchash != nchash + 1) {
		cache_unlock_all_buckets();
		free(cntbuf, M_TEMP);
		goto retry;
	}
	/* Scan hash tables counting entries */
	for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
		LIST_FOREACH(ncp, ncpp, nc_hash)
			cntbuf[i]++;
	cache_unlock_all_buckets();
	for (error = 0, i = 0; i < n_nchash; i++)
		if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
			break;
	free(cntbuf, M_TEMP);
	return (error);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
    "nchash chain lengths");

static int
sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int n_nchash;
	int count, maxlength, used, pct;

	if (!req->oldptr)
		return SYSCTL_OUT(req, 0, 4 * sizeof(int));

	cache_lock_all_buckets();
	n_nchash = nchash + 1;	/* nchash is max index, not count */
	used = 0;
	maxlength = 0;

	/* Scan hash tables for applicable entries */
	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
		count = 0;
		LIST_FOREACH(ncp, ncpp, nc_hash) {
			count++;
		}
		if (count)
			used++;
		if (maxlength < count)
			maxlength = count;
	}
	n_nchash = nchash + 1;
	cache_unlock_all_buckets();
	pct = (used * 100) / (n_nchash / 100);
	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &used, sizeof(used));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &pct, sizeof(pct));
	if (error)
		return (error);
	return (0);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
    "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
#endif

/*
 * Negative entries management
 *
 * A variation of LRU scheme is used. New entries are hashed into one of
 * numneglists cold lists. Entries get promoted to the hot list on first hit.
 * Partial LRU for the hot list is maintained by requeueing them every
 * ncneghitsrequeue hits.
 *
 * The shrinker will demote hot list head and evict from the cold list in a
 * round-robin manner.
 */
static void
cache_negative_hit(struct namecache *ncp)
{
	struct neglist *neglist;
	u_int hits;

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	hits = atomic_fetchadd_int(&ncp->nc_neghits, 1);
	if (ncp->nc_flag & NCF_HOTNEGATIVE) {
		if ((hits % ncneghitsrequeue) != 0)
			return;
		mtx_lock(&ncneg_hot.nl_lock);
		if (ncp->nc_flag & NCF_HOTNEGATIVE) {
			TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
			TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
			mtx_unlock(&ncneg_hot.nl_lock);
			return;
		}
		/*
		 * The shrinker cleared the flag and removed the entry from
		 * the hot list. Put it back.
		 */
	} else {
		mtx_lock(&ncneg_hot.nl_lock);
	}
	neglist = NCP2NEGLIST(ncp);
	mtx_lock(&neglist->nl_lock);
	if (!(ncp->nc_flag & NCF_HOTNEGATIVE)) {
		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
		TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
		ncp->nc_flag |= NCF_HOTNEGATIVE;
	}
	mtx_unlock(&neglist->nl_lock);
	mtx_unlock(&ncneg_hot.nl_lock);
}

static void
cache_negative_insert(struct namecache *ncp, bool neg_locked)
{
	struct neglist *neglist;

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_bucket_locked(ncp, RA_WLOCKED);
	neglist = NCP2NEGLIST(ncp);
	if (!neg_locked) {
		mtx_lock(&neglist->nl_lock);
	} else {
		mtx_assert(&neglist->nl_lock, MA_OWNED);
	}
	TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
	if (!neg_locked)
		mtx_unlock(&neglist->nl_lock);
	atomic_add_rel_long(&numneg, 1);
}

static void
cache_negative_remove(struct namecache *ncp, bool neg_locked)
{
	struct neglist *neglist;
	bool hot_locked = false;
	bool list_locked = false;

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_bucket_locked(ncp, RA_WLOCKED);
	neglist = NCP2NEGLIST(ncp);
	if (!neg_locked) {
		if (ncp->nc_flag & NCF_HOTNEGATIVE) {
			hot_locked = true;
			mtx_lock(&ncneg_hot.nl_lock);
			if (!(ncp->nc_flag & NCF_HOTNEGATIVE)) {
				list_locked = true;
				mtx_lock(&neglist->nl_lock);
			}
		} else {
			list_locked = true;
			mtx_lock(&neglist->nl_lock);
		}
	}
	if (ncp->nc_flag & NCF_HOTNEGATIVE) {
		mtx_assert(&ncneg_hot.nl_lock, MA_OWNED);
		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
	} else {
		mtx_assert(&neglist->nl_lock, MA_OWNED);
		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
	}
	if (list_locked)
		mtx_unlock(&neglist->nl_lock);
	if (hot_locked)
		mtx_unlock(&ncneg_hot.nl_lock);
	atomic_subtract_rel_long(&numneg, 1);
}

static void
cache_negative_shrink_select(int start, struct namecache **ncpp,
    struct neglist **neglistpp)
{
	struct neglist *neglist;
	struct namecache *ncp;
	int i;

	*ncpp = ncp = NULL;
	neglist = NULL;

	for (i = start; i < numneglists; i++) {
		neglist = &neglists[i];
		if (TAILQ_FIRST(&neglist->nl_list) == NULL)
			continue;
		mtx_lock(&neglist->nl_lock);
		ncp = TAILQ_FIRST(&neglist->nl_list);
		if (ncp != NULL)
			break;
		mtx_unlock(&neglist->nl_lock);
	}

	*neglistpp = neglist;
	*ncpp = ncp;
}

static void
cache_negative_zap_one(void)
{
	struct namecache *ncp, *ncp2;
	struct neglist *neglist;
	struct mtx *dvlp;
	struct rwlock *blp;

	if (!mtx_trylock(&ncneg_shrink_lock))
		return;

	mtx_lock(&ncneg_hot.nl_lock);
	ncp = TAILQ_FIRST(&ncneg_hot.nl_list);
	if (ncp != NULL) {
		neglist = NCP2NEGLIST(ncp);
		mtx_lock(&neglist->nl_lock);
		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
		TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
		ncp->nc_flag &= ~NCF_HOTNEGATIVE;
		mtx_unlock(&neglist->nl_lock);
	}

	cache_negative_shrink_select(shrink_list_turn, &ncp, &neglist);
	shrink_list_turn++;
	if (shrink_list_turn == numneglists)
		shrink_list_turn = 0;
	if (ncp == NULL && shrink_list_turn == 0)
		cache_negative_shrink_select(shrink_list_turn, &ncp, &neglist);
	if (ncp == NULL) {
		mtx_unlock(&ncneg_hot.nl_lock);
		goto out;
	}

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	blp = NCP2BUCKETLOCK(ncp);
	mtx_unlock(&neglist->nl_lock);
	mtx_unlock(&ncneg_hot.nl_lock);
	mtx_lock(dvlp);
	rw_wlock(blp);
	mtx_lock(&neglist->nl_lock);
	ncp2 = TAILQ_FIRST(&neglist->nl_list);
	if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) ||
	    blp != NCP2BUCKETLOCK(ncp2) || !(ncp2->nc_flag & NCF_NEGATIVE)) {
		ncp = NULL;
		goto out_unlock_all;
	}
	SDT_PROBE3(vfs, namecache, shrink_negative, done, ncp->nc_dvp,
	    ncp->nc_name, ncp->nc_neghits);

	cache_zap_locked(ncp, true);
out_unlock_all:
	mtx_unlock(&neglist->nl_lock);
	rw_wunlock(blp);
	mtx_unlock(dvlp);
out:
	mtx_unlock(&ncneg_shrink_lock);
	cache_free(ncp);
}

/*
 * cache_zap_locked():
 *
 *   Removes a namecache entry from cache, whether it contains an actual
 *   pointer to a vnode or if it is just a negative cache entry.
 */
static void
cache_zap_locked(struct namecache *ncp, bool neg_locked)
{

	if (!(ncp->nc_flag & NCF_NEGATIVE))
		cache_assert_vnode_locked(ncp->nc_vp);
	cache_assert_vnode_locked(ncp->nc_dvp);
	cache_assert_bucket_locked(ncp, RA_WLOCKED);

	CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp,
	    (ncp->nc_flag & NCF_NEGATIVE) ? NULL : ncp->nc_vp);
	LIST_REMOVE(ncp, nc_hash);
	if (!(ncp->nc_flag & NCF_NEGATIVE)) {
		SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
		    ncp->nc_name, ncp->nc_vp);
		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
		if (ncp == ncp->nc_vp->v_cache_dd)
			ncp->nc_vp->v_cache_dd = NULL;
	} else {
		SDT_PROBE3(vfs, namecache, zap_negative, done, ncp->nc_dvp,
		    ncp->nc_name, ncp->nc_neghits);
		cache_negative_remove(ncp, neg_locked);
	}
	if (ncp->nc_flag & NCF_ISDOTDOT) {
		if (ncp == ncp->nc_dvp->v_cache_dd)
			ncp->nc_dvp->v_cache_dd = NULL;
	} else {
		LIST_REMOVE(ncp, nc_src);
		if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
			ncp->nc_flag |= NCF_DVDROP;
			atomic_subtract_rel_long(&numcachehv, 1);
		}
	}
	atomic_subtract_rel_long(&numcache, 1);
}

static void
cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
{
	struct rwlock *blp;

	MPASS(ncp->nc_dvp == vp);
	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_vnode_locked(vp);

	blp = NCP2BUCKETLOCK(ncp);
	rw_wlock(blp);
	cache_zap_locked(ncp, false);
	rw_wunlock(blp);
}

static bool
cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
    struct mtx **vlpp)
{
	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
	struct rwlock *blp;

	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
	cache_assert_vnode_locked(vp);

	if (ncp->nc_flag & NCF_NEGATIVE) {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_zap_negative_locked_vnode_kl(ncp, vp);
		return (true);
	}

	pvlp = VP2VNODELOCK(vp);
	blp = NCP2BUCKETLOCK(ncp);
	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
	vlp2 = VP2VNODELOCK(ncp->nc_vp);

	if (*vlpp == vlp1 || *vlpp == vlp2) {
		to_unlock = *vlpp;
		*vlpp = NULL;
	} else {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_sort(&vlp1, &vlp2);
		if (vlp1 == pvlp) {
			mtx_lock(vlp2);
			to_unlock = vlp2;
		} else {
			if (!mtx_trylock(vlp1))
				goto out_relock;
			to_unlock = vlp1;
		}
	}
	rw_wlock(blp);
	cache_zap_locked(ncp, false);
	rw_wunlock(blp);
	if (to_unlock != NULL)
		mtx_unlock(to_unlock);
	return (true);

out_relock:
	mtx_unlock(vlp2);
	mtx_lock(vlp1);
	mtx_lock(vlp2);
	MPASS(*vlpp == NULL);
	*vlpp = vlp1;
	return (false);
}

static int
cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp)
{
	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
	struct rwlock *blp;
	int error = 0;

	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
	cache_assert_vnode_locked(vp);

	pvlp = VP2VNODELOCK(vp);
	if (ncp->nc_flag & NCF_NEGATIVE) {
		cache_zap_negative_locked_vnode_kl(ncp, vp);
		goto out;
	}

	blp = NCP2BUCKETLOCK(ncp);
	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
	vlp2 = VP2VNODELOCK(ncp->nc_vp);
	cache_sort(&vlp1, &vlp2);
	if (vlp1 == pvlp) {
		mtx_lock(vlp2);
		to_unlock = vlp2;
	} else {
		if (!mtx_trylock(vlp1)) {
			error = EAGAIN;
			goto out;
		}
		to_unlock = vlp1;
	}
	rw_wlock(blp);
	cache_zap_locked(ncp, false);
	rw_wunlock(blp);
	mtx_unlock(to_unlock);
out:
	mtx_unlock(pvlp);
	return (error);
}

static int
cache_zap_wlocked_bucket(struct namecache *ncp, struct rwlock *blp)
{
	struct mtx *dvlp, *vlp;

	cache_assert_bucket_locked(ncp, RA_WLOCKED);

	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	vlp = NULL;
	if (!(ncp->nc_flag & NCF_NEGATIVE))
		vlp = VP2VNODELOCK(ncp->nc_vp);
	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
		cache_zap_locked(ncp, false);
		rw_wunlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		return (0);
	}

	rw_wunlock(blp);
	return (EAGAIN);
}

static int
cache_zap_rlocked_bucket(struct namecache *ncp, struct rwlock *blp)
{
	struct mtx *dvlp, *vlp;

	cache_assert_bucket_locked(ncp, RA_RLOCKED);

	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	vlp = NULL;
	if (!(ncp->nc_flag & NCF_NEGATIVE))
		vlp = VP2VNODELOCK(ncp->nc_vp);
	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
		rw_runlock(blp);
		rw_wlock(blp);
		cache_zap_locked(ncp, false);
		rw_wunlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		return (0);
	}

	rw_runlock(blp);
	return (EAGAIN);
}

static int
cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp,
    struct mtx **vlpp1, struct mtx **vlpp2)
{
	struct mtx *dvlp, *vlp;

	cache_assert_bucket_locked(ncp, RA_WLOCKED);

	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	vlp = NULL;
	if (!(ncp->nc_flag & NCF_NEGATIVE))
		vlp = VP2VNODELOCK(ncp->nc_vp);
	cache_sort(&dvlp, &vlp);

	if (*vlpp1 == dvlp && *vlpp2 == vlp) {
		cache_zap_locked(ncp, false);
		cache_unlock_vnodes(dvlp, vlp);
		*vlpp1 = NULL;
		*vlpp2 = NULL;
		return (0);
	}

	if (*vlpp1 != NULL)
		mtx_unlock(*vlpp1);
	if (*vlpp2 != NULL)
		mtx_unlock(*vlpp2);
	*vlpp1 = NULL;
	*vlpp2 = NULL;

	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
		cache_zap_locked(ncp, false);
		cache_unlock_vnodes(dvlp, vlp);
		return (0);
	}

	rw_wunlock(blp);
	*vlpp1 = dvlp;
	*vlpp2 = vlp;
	if (*vlpp1 != NULL)
		mtx_lock(*vlpp1);
	mtx_lock(*vlpp2);
	rw_wlock(blp);
	return (EAGAIN);
}

static void
cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp)
{

	if (blp != NULL) {
		rw_runlock(blp);
	} else {
		mtx_unlock(vlp);
	}
}

static int __noinline
cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	int ltype;

	*vpp = dvp;
	CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .",
	    dvp, cnp->cn_nameptr);
	counter_u64_add(dothits, 1);
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
	if (tsp != NULL)
		timespecclear(tsp);
	if (ticksp != NULL)
		*ticksp = ticks;
	vrefact(*vpp);
	/*
	 * When we lookup "." we still can be asked to lock it
	 * differently...
	 */
	ltype = cnp->cn_lkflags & LK_TYPE_MASK;
	if (ltype != VOP_ISLOCKED(*vpp)) {
		if (ltype == LK_EXCLUSIVE) {
			vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
			if ((*vpp)->v_iflag & VI_DOOMED) {
				/* forced unmount */
				vrele(*vpp);
				*vpp = NULL;
				return (ENOENT);
			}
		} else
			vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
	}
	return (-1);
}

static __noinline int
cache_lookup_nomakeentry(struct vnode *dvp, struct vnode **vpp,
    struct componentname *cnp, struct timespec *tsp, int *ticksp)
{
	struct namecache *ncp;
	struct rwlock *blp;
	struct mtx *dvlp, *dvlp2;
	uint32_t hash;
	int error;

	if (cnp->cn_namelen == 2 &&
	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
		counter_u64_add(dotdothits, 1);
		dvlp = VP2VNODELOCK(dvp);
		dvlp2 = NULL;
		mtx_lock(dvlp);
retry_dotdot:
		ncp = dvp->v_cache_dd;
		if (ncp == NULL) {
			SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
			    "..", NULL);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			return (0);
		}
		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
			if (ncp->nc_dvp != dvp)
				panic("dvp %p v_cache_dd %p\n", dvp, ncp);
			if (!cache_zap_locked_vnode_kl2(ncp,
			    dvp, &dvlp2))
				goto retry_dotdot;
			MPASS(dvp->v_cache_dd == NULL);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			cache_free(ncp);
		} else {
			dvp->v_cache_dd = NULL;
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
		}
		return (0);
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	blp = HASH2BUCKETLOCK(hash);
retry:
	if (LIST_EMPTY(NCHHASH(hash)))
		goto out_no_entry;

	rw_wlock(blp);

	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		counter_u64_add(numchecks, 1);
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	/* We failed to find an entry */
	if (ncp == NULL) {
		rw_wunlock(blp);
		goto out_no_entry;
	}

	counter_u64_add(numposzaps, 1);

	error = cache_zap_wlocked_bucket(ncp, blp);
	if (error != 0) {
		zap_and_exit_bucket_fail++;
		cache_maybe_yield();
		goto retry;
	}
	cache_free(ncp);
	return (0);
out_no_entry:
	SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, NULL);
	counter_u64_add(nummisszap, 1);
	return (0);
}

/**
 * Lookup a name in the name cache
 *
 * # Arguments
 *
 * - dvp:	Parent directory in which to search.
 * - vpp:	Return argument.  Will contain desired vnode on cache hit.
 * - cnp:	Parameters of the name search.  The most interesting bits of
 *		the cn_flags field have the following meanings:
 *	- MAKEENTRY:	If clear, free an entry from the cache rather than look
 *			it up.
 *	- ISDOTDOT:	Must be set if and only if cn_nameptr == ".."
 * - tsp:	Return storage for cache timestamp.  On a successful (positive
 *		or negative) lookup, tsp will be filled with any timespec that
 *		was stored when this cache entry was created.  However, it will
 *		be clear for "." entries.
 * - ticks:	Return storage for alternate cache timestamp.  On a successful
 *		(positive or negative) lookup, it will contain the ticks value
 *		that was current when the cache entry was created, unless cnp
 *		was ".".
 *
 * # Returns
 *
 * - -1:	A positive cache hit.  vpp will contain the desired vnode.
 * - ENOENT:	A negative cache hit, or dvp was recycled out from under us due
 *		to a forced unmount.  vpp will not be modified.  If the entry
 *		is a whiteout, then the ISWHITEOUT flag will be set in
 *		cnp->cn_flags.
 * - 0:		A cache miss.  vpp will not be modified.
 *
 * # Locking
 *
 * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
 * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
 * lock is not recursively acquired.
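 *
 * # Example
 *
 * An illustrative sketch of how a caller consumes the return value;
 * vfs_cache_lookup() at the bottom of this file is the canonical user:
 *
 *	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
 *	if (error == 0)
 *		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));   (cache miss)
 *	if (error == -1)
 *		return (0);		(positive hit, *vpp is locked and ref'd)
 *	return (error);			(e.g. ENOENT for a negative hit)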
 */
int
cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct namecache_ts *ncp_ts;
	struct namecache *ncp;
	struct rwlock *blp;
	struct mtx *dvlp;
	uint32_t hash;
	int error, ltype;

	if (__predict_false(!doingcache)) {
		cnp->cn_flags &= ~MAKEENTRY;
		return (0);
	}

	counter_u64_add(numcalls, 1);

	if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.'))
		return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));

	if ((cnp->cn_flags & MAKEENTRY) == 0)
		return (cache_lookup_nomakeentry(dvp, vpp, cnp, tsp, ticksp));

retry:
	blp = NULL;
	dvlp = NULL;
	error = 0;
	if (cnp->cn_namelen == 2 &&
	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
		counter_u64_add(dotdothits, 1);
		dvlp = VP2VNODELOCK(dvp);
		mtx_lock(dvlp);
		ncp = dvp->v_cache_dd;
		if (ncp == NULL) {
			SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
			    "..", NULL);
			mtx_unlock(dvlp);
			return (0);
		}
		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
			if (ncp->nc_flag & NCF_NEGATIVE)
				*vpp = NULL;
			else
				*vpp = ncp->nc_vp;
		} else
			*vpp = ncp->nc_dvp;
		/* Return failure if negative entry was found. */
		if (*vpp == NULL)
			goto negative_success;
		CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..",
		    dvp, cnp->cn_nameptr, *vpp);
		SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..",
		    *vpp);
		cache_out_ts(ncp, tsp, ticksp);
		if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
		    NCF_DTS && tsp != NULL) {
			ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
			*tsp = ncp_ts->nc_dotdottime;
		}
		goto success;
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	blp = HASH2BUCKETLOCK(hash);
	rw_rlock(blp);

	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		counter_u64_add(numchecks, 1);
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	/* We failed to find an entry */
	if (ncp == NULL) {
		rw_runlock(blp);
		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
		    NULL);
		counter_u64_add(nummiss, 1);
		return (0);
	}

	/* We found a "positive" match, return the vnode */
	if (!(ncp->nc_flag & NCF_NEGATIVE)) {
		counter_u64_add(numposhits, 1);
		*vpp = ncp->nc_vp;
		CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
		    dvp, cnp->cn_nameptr, *vpp, ncp);
		SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name,
		    *vpp);
		cache_out_ts(ncp, tsp, ticksp);
		goto success;
	}

negative_success:
	/* We found a negative match, and want to create it, so purge */
	if (cnp->cn_nameiop == CREATE) {
		counter_u64_add(numnegzaps, 1);
		goto zap_and_exit;
	}

	counter_u64_add(numneghits, 1);
	cache_negative_hit(ncp);
	if (ncp->nc_flag & NCF_WHITE)
		cnp->cn_flags |= ISWHITEOUT;
	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
	    ncp->nc_name);
	cache_out_ts(ncp, tsp, ticksp);
	cache_lookup_unlock(blp, dvlp);
	return (ENOENT);

success:
	/*
	 * On success we return a locked and ref'd vnode as per the lookup
	 * protocol.
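	 * For ".." lookups dvp is unlocked around the vget() below to
	 * preserve the lock order and relocked afterwards; if dvp was doomed
	 * in the meantime the lookup fails with ENOENT.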
	 */
	MPASS(dvp != *vpp);
	ltype = 0;	/* silence gcc warning */
	if (cnp->cn_flags & ISDOTDOT) {
		ltype = VOP_ISLOCKED(dvp);
		VOP_UNLOCK(dvp, 0);
	}
	vhold(*vpp);
	cache_lookup_unlock(blp, dvlp);
	error = vget(*vpp, cnp->cn_lkflags | LK_VNHELD, cnp->cn_thread);
	if (cnp->cn_flags & ISDOTDOT) {
		vn_lock(dvp, ltype | LK_RETRY);
		if (dvp->v_iflag & VI_DOOMED) {
			if (error == 0)
				vput(*vpp);
			*vpp = NULL;
			return (ENOENT);
		}
	}
	if (error) {
		*vpp = NULL;
		goto retry;
	}
	if ((cnp->cn_flags & ISLASTCN) &&
	    (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) {
		ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
	}
	return (-1);

zap_and_exit:
	if (blp != NULL)
		error = cache_zap_rlocked_bucket(ncp, blp);
	else
		error = cache_zap_locked_vnode(ncp, dvp);
	if (error != 0) {
		zap_and_exit_bucket_fail++;
		cache_maybe_yield();
		goto retry;
	}
	cache_free(ncp);
	return (0);
}

struct celockstate {
	struct mtx *vlp[3];
	struct rwlock *blp[2];
};
CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));

static inline void
cache_celockstate_init(struct celockstate *cel)
{

	bzero(cel, sizeof(*cel));
}

static void
cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
    struct vnode *dvp)
{
	struct mtx *vlp1, *vlp2;

	MPASS(cel->vlp[0] == NULL);
	MPASS(cel->vlp[1] == NULL);
	MPASS(cel->vlp[2] == NULL);

	MPASS(vp != NULL || dvp != NULL);

	vlp1 = VP2VNODELOCK(vp);
	vlp2 = VP2VNODELOCK(dvp);
	cache_sort(&vlp1, &vlp2);

	if (vlp1 != NULL) {
		mtx_lock(vlp1);
		cel->vlp[0] = vlp1;
	}
	mtx_lock(vlp2);
	cel->vlp[1] = vlp2;
}

static void
cache_unlock_vnodes_cel(struct celockstate *cel)
{

	MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);

	if (cel->vlp[0] != NULL)
		mtx_unlock(cel->vlp[0]);
	if (cel->vlp[1] != NULL)
		mtx_unlock(cel->vlp[1]);
	if (cel->vlp[2] != NULL)
		mtx_unlock(cel->vlp[2]);
}

static bool
cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
{
	struct mtx *vlp;
	bool ret;

	cache_assert_vlp_locked(cel->vlp[0]);
	cache_assert_vlp_locked(cel->vlp[1]);
	MPASS(cel->vlp[2] == NULL);

	MPASS(vp != NULL);
	vlp = VP2VNODELOCK(vp);

	ret = true;
	if (vlp >= cel->vlp[1]) {
		mtx_lock(vlp);
	} else {
		if (mtx_trylock(vlp))
			goto out;
		cache_lock_vnodes_cel_3_failures++;
		cache_unlock_vnodes_cel(cel);
		if (vlp < cel->vlp[0]) {
			mtx_lock(vlp);
			mtx_lock(cel->vlp[0]);
			mtx_lock(cel->vlp[1]);
		} else {
			if (cel->vlp[0] != NULL)
				mtx_lock(cel->vlp[0]);
			mtx_lock(vlp);
			mtx_lock(cel->vlp[1]);
		}
		ret = false;
	}
out:
	cel->vlp[2] = vlp;
	return (ret);
}

static void
cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1,
    struct rwlock *blp2)
{

	MPASS(cel->blp[0] == NULL);
	MPASS(cel->blp[1] == NULL);

	cache_sort(&blp1, &blp2);

	if (blp1 != NULL) {
		rw_wlock(blp1);
		cel->blp[0] = blp1;
	}
	rw_wlock(blp2);
	cel->blp[1] = blp2;
}

static void
cache_unlock_buckets_cel(struct celockstate *cel)
{

	if (cel->blp[0] != NULL)
		rw_wunlock(cel->blp[0]);
	rw_wunlock(cel->blp[1]);
}

/*
 * Lock part of the cache affected by the insertion.
 *
 * This means vnodelocks for dvp, vp and the relevant bucketlock.
 * However, insertion can result in removal of an old entry. In this
 * case we have an additional vnode and bucketlock pair to lock. If the
 * entry is negative, ncelock is locked instead of the vnode.
 *
 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
 * preserving the locking order (smaller address first).
 */
static void
cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
    uint32_t hash)
{
	struct namecache *ncp;
	struct rwlock *blps[2];

	blps[0] = HASH2BUCKETLOCK(hash);
	for (;;) {
		blps[1] = NULL;
		cache_lock_vnodes_cel(cel, dvp, vp);
		if (vp == NULL || vp->v_type != VDIR)
			break;
		ncp = vp->v_cache_dd;
		if (ncp == NULL)
			break;
		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
			break;
		MPASS(ncp->nc_dvp == vp);
		blps[1] = NCP2BUCKETLOCK(ncp);
		if (ncp->nc_flag & NCF_NEGATIVE)
			break;
		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
			break;
		/*
		 * All vnodes got re-locked. Re-validate the state and if
		 * nothing changed we are done. Otherwise restart.
		 */
		if (ncp == vp->v_cache_dd &&
		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
		    blps[1] == NCP2BUCKETLOCK(ncp) &&
		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
			break;
		cache_unlock_vnodes_cel(cel);
		cel->vlp[0] = NULL;
		cel->vlp[1] = NULL;
		cel->vlp[2] = NULL;
	}
	cache_lock_buckets_cel(cel, blps[0], blps[1]);
}

static void
cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
    uint32_t hash)
{
	struct namecache *ncp;
	struct rwlock *blps[2];

	blps[0] = HASH2BUCKETLOCK(hash);
	for (;;) {
		blps[1] = NULL;
		cache_lock_vnodes_cel(cel, dvp, vp);
		ncp = dvp->v_cache_dd;
		if (ncp == NULL)
			break;
		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
			break;
		MPASS(ncp->nc_dvp == dvp);
		blps[1] = NCP2BUCKETLOCK(ncp);
		if (ncp->nc_flag & NCF_NEGATIVE)
			break;
		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
			break;
		if (ncp == dvp->v_cache_dd &&
		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
		    blps[1] == NCP2BUCKETLOCK(ncp) &&
		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
			break;
		cache_unlock_vnodes_cel(cel);
		cel->vlp[0] = NULL;
		cel->vlp[1] = NULL;
		cel->vlp[2] = NULL;
	}
	cache_lock_buckets_cel(cel, blps[0], blps[1]);
}

static void
cache_enter_unlock(struct celockstate *cel)
{

	cache_unlock_buckets_cel(cel);
	cache_unlock_vnodes_cel(cel);
}

/*
 * Add an entry to the cache.
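 *
 * Illustrative sketch of the expected caller (a filesystem lookup routine
 * caching a freshly resolved component); cache_enter() is assumed to be the
 * usual wrapper that passes NULL timestamps:
 *
 *	if (cnp->cn_flags & MAKEENTRY)
 *		cache_enter(dvp, vp, cnp);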
 */
void
cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
    struct timespec *tsp, struct timespec *dtsp)
{
	struct celockstate cel;
	struct namecache *ncp, *n2, *ndd;
	struct namecache_ts *ncp_ts, *n2_ts;
	struct nchashhead *ncpp;
	struct neglist *neglist;
	uint32_t hash;
	int flag;
	int len;
	bool neg_locked;
	u_long lnumcache;

	CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr);
	VNASSERT(vp == NULL || (vp->v_iflag & VI_DOOMED) == 0, vp,
	    ("cache_enter: Adding a doomed vnode"));
	VNASSERT(dvp == NULL || (dvp->v_iflag & VI_DOOMED) == 0, dvp,
	    ("cache_enter: Doomed vnode used as src"));

	if (__predict_false(!doingcache))
		return;

	/*
	 * Avoid blowout in namecache entries.
	 */
	lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
	if (__predict_false(lnumcache >= desiredvnodes * ncsizefactor)) {
		atomic_add_long(&numcache, -1);
		return;
	}

	cache_celockstate_init(&cel);
	ndd = NULL;
	ncp_ts = NULL;
	flag = 0;
	if (cnp->cn_nameptr[0] == '.') {
		if (cnp->cn_namelen == 1)
			return;
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
			len = cnp->cn_namelen;
			hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
			cache_enter_lock_dd(&cel, dvp, vp, hash);
			/*
			 * If dotdot entry already exists, just retarget it
			 * to new parent vnode, otherwise continue with new
			 * namecache entry allocation.
			 */
			if ((ncp = dvp->v_cache_dd) != NULL &&
			    ncp->nc_flag & NCF_ISDOTDOT) {
				KASSERT(ncp->nc_dvp == dvp,
				    ("wrong isdotdot parent"));
				neg_locked = false;
				if (ncp->nc_flag & NCF_NEGATIVE || vp == NULL) {
					neglist = NCP2NEGLIST(ncp);
					mtx_lock(&ncneg_hot.nl_lock);
					mtx_lock(&neglist->nl_lock);
					neg_locked = true;
				}
				if (!(ncp->nc_flag & NCF_NEGATIVE)) {
					TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst,
					    ncp, nc_dst);
				} else {
					cache_negative_remove(ncp, true);
				}
				if (vp != NULL) {
					TAILQ_INSERT_HEAD(&vp->v_cache_dst,
					    ncp, nc_dst);
					ncp->nc_flag &= ~(NCF_NEGATIVE|NCF_HOTNEGATIVE);
				} else {
					ncp->nc_flag &= ~(NCF_HOTNEGATIVE);
					ncp->nc_flag |= NCF_NEGATIVE;
					cache_negative_insert(ncp, true);
				}
				if (neg_locked) {
					mtx_unlock(&neglist->nl_lock);
					mtx_unlock(&ncneg_hot.nl_lock);
				}
				ncp->nc_vp = vp;
				cache_enter_unlock(&cel);
				return;
			}
			dvp->v_cache_dd = NULL;
			cache_enter_unlock(&cel);
			cache_celockstate_init(&cel);
			SDT_PROBE3(vfs, namecache, enter, done, dvp, "..", vp);
			flag = NCF_ISDOTDOT;
		}
	}

	/*
	 * Calculate the hash key and setup as much of the new
	 * namecache entry as possible before acquiring the lock.
	 */
	ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
	ncp->nc_flag = flag;
	ncp->nc_vp = vp;
	if (vp == NULL)
		ncp->nc_flag |= NCF_NEGATIVE;
	ncp->nc_dvp = dvp;
	if (tsp != NULL) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		ncp_ts->nc_time = *tsp;
		ncp_ts->nc_ticks = ticks;
		ncp_ts->nc_nc.nc_flag |= NCF_TS;
		if (dtsp != NULL) {
			ncp_ts->nc_dotdottime = *dtsp;
			ncp_ts->nc_nc.nc_flag |= NCF_DTS;
		}
	}
	len = ncp->nc_nlen = cnp->cn_namelen;
	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
	strlcpy(ncp->nc_name, cnp->cn_nameptr, len + 1);
	cache_enter_lock(&cel, dvp, vp, hash);

	/*
	 * See if this vnode or negative entry is already in the cache
	 * with this name.  This can happen with concurrent lookups of
	 * the same path name.
	 */
	ncpp = NCHHASH(hash);
	LIST_FOREACH(n2, ncpp, nc_hash) {
		if (n2->nc_dvp == dvp &&
		    n2->nc_nlen == cnp->cn_namelen &&
		    !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
			if (tsp != NULL) {
				KASSERT((n2->nc_flag & NCF_TS) != 0,
				    ("no NCF_TS"));
				n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
				n2_ts->nc_time = ncp_ts->nc_time;
				n2_ts->nc_ticks = ncp_ts->nc_ticks;
				if (dtsp != NULL) {
					n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
					if (ncp->nc_flag & NCF_NEGATIVE)
						mtx_lock(&ncneg_hot.nl_lock);
					n2_ts->nc_nc.nc_flag |= NCF_DTS;
					if (ncp->nc_flag & NCF_NEGATIVE)
						mtx_unlock(&ncneg_hot.nl_lock);
				}
			}
			goto out_unlock_free;
		}
	}

	if (flag == NCF_ISDOTDOT) {
		/*
		 * See if we are trying to add .. entry, but some other lookup
		 * has populated v_cache_dd pointer already.
		 */
		if (dvp->v_cache_dd != NULL)
			goto out_unlock_free;
		KASSERT(vp == NULL || vp->v_type == VDIR,
		    ("wrong vnode type %p", vp));
		dvp->v_cache_dd = ncp;
	}

	if (vp != NULL) {
		if (vp->v_type == VDIR) {
			if (flag != NCF_ISDOTDOT) {
				/*
				 * For this case, the cache entry maps both the
				 * directory name in it and the name ".." for the
				 * directory's parent.
				 */
				if ((ndd = vp->v_cache_dd) != NULL) {
					if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
						cache_zap_locked(ndd, false);
					else
						ndd = NULL;
				}
				vp->v_cache_dd = ncp;
			}
		} else {
			vp->v_cache_dd = NULL;
		}
	}

	if (flag != NCF_ISDOTDOT) {
		if (LIST_EMPTY(&dvp->v_cache_src)) {
			vhold(dvp);
			atomic_add_rel_long(&numcachehv, 1);
		}
		LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
	}

	/*
	 * Insert the new namecache entry into the appropriate chain
	 * within the cache entries table.
	 */
	LIST_INSERT_HEAD(ncpp, ncp, nc_hash);

	/*
	 * If the entry is "negative", we place it into the
	 * "negative" cache queue, otherwise, we place it into the
	 * destination vnode's cache entries queue.
	 */
	if (vp != NULL) {
		TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
		SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
		    vp);
	} else {
		if (cnp->cn_flags & ISWHITEOUT)
			ncp->nc_flag |= NCF_WHITE;
		cache_negative_insert(ncp, false);
		SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
		    ncp->nc_name);
	}
	cache_enter_unlock(&cel);
	if (numneg * ncnegfactor > lnumcache)
		cache_negative_zap_one();
	cache_free(ndd);
	return;
out_unlock_free:
	cache_enter_unlock(&cel);
	cache_free(ncp);
	return;
}

static u_int
cache_roundup_2(u_int val)
{
	u_int res;

	for (res = 1; res <= val; res <<= 1)
		continue;

	return (res);
}

/*
 * Name cache initialization, from vfs_init() when we are booting
 */
static void
nchinit(void *dummy __unused)
{
	u_int i;

	cache_zone_small = uma_zcreate("S VFS Cache",
	    sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1,
	    NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache),
	    UMA_ZONE_ZINIT);
	cache_zone_small_ts = uma_zcreate("STS VFS Cache",
	    sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1,
	    NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts),
	    UMA_ZONE_ZINIT);
	cache_zone_large = uma_zcreate("L VFS Cache",
	    sizeof(struct namecache) + NAME_MAX + 1,
	    NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache),
	    UMA_ZONE_ZINIT);
	cache_zone_large_ts = uma_zcreate("LTS VFS Cache",
	    sizeof(struct namecache_ts) + NAME_MAX + 1,
	    NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts),
	    UMA_ZONE_ZINIT);

	nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
	ncbuckethash = cache_roundup_2(mp_ncpus * 64) - 1;
	if (ncbuckethash > nchash)
		ncbuckethash = nchash;
	bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
	    M_WAITOK | M_ZERO);
	for (i = 0; i < numbucketlocks; i++)
		rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE);
	ncvnodehash = cache_roundup_2(mp_ncpus * 64) - 1;
	vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
	    M_WAITOK | M_ZERO);
	for (i = 0; i < numvnodelocks; i++)
		mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
	ncpurgeminvnodes = numbucketlocks;

	ncneghash = 3;
	neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE,
	    M_WAITOK | M_ZERO);
	for (i = 0; i < numneglists; i++) {
		mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
		TAILQ_INIT(&neglists[i].nl_list);
	}
	mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF);
	TAILQ_INIT(&ncneg_hot.nl_list);

	mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF);

	numcalls = counter_u64_alloc(M_WAITOK);
	dothits = counter_u64_alloc(M_WAITOK);
	dotdothits = counter_u64_alloc(M_WAITOK);
	numchecks = counter_u64_alloc(M_WAITOK);
	nummiss = counter_u64_alloc(M_WAITOK);
	nummisszap = counter_u64_alloc(M_WAITOK);
	numposzaps = counter_u64_alloc(M_WAITOK);
	numposhits = counter_u64_alloc(M_WAITOK);
	numnegzaps = counter_u64_alloc(M_WAITOK);
	numneghits = counter_u64_alloc(M_WAITOK);
	numfullpathcalls = counter_u64_alloc(M_WAITOK);
	numfullpathfail1 = counter_u64_alloc(M_WAITOK);
	numfullpathfail2 = counter_u64_alloc(M_WAITOK);
	numfullpathfail4 = counter_u64_alloc(M_WAITOK);
	numfullpathfound = counter_u64_alloc(M_WAITOK);

/*
 * Name cache initialization, from vfs_init() when we are booting
 */
static void
nchinit(void *dummy __unused)
{
        u_int i;

        cache_zone_small = uma_zcreate("S VFS Cache",
            sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1,
            NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache),
            UMA_ZONE_ZINIT);
        cache_zone_small_ts = uma_zcreate("STS VFS Cache",
            sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1,
            NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts),
            UMA_ZONE_ZINIT);
        cache_zone_large = uma_zcreate("L VFS Cache",
            sizeof(struct namecache) + NAME_MAX + 1,
            NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache),
            UMA_ZONE_ZINIT);
        cache_zone_large_ts = uma_zcreate("LTS VFS Cache",
            sizeof(struct namecache_ts) + NAME_MAX + 1,
            NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts),
            UMA_ZONE_ZINIT);

        nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
        ncbuckethash = cache_roundup_2(mp_ncpus * 64) - 1;
        if (ncbuckethash > nchash)
                ncbuckethash = nchash;
        bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
            M_WAITOK | M_ZERO);
        for (i = 0; i < numbucketlocks; i++)
                rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE);
        ncvnodehash = cache_roundup_2(mp_ncpus * 64) - 1;
        vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
            M_WAITOK | M_ZERO);
        for (i = 0; i < numvnodelocks; i++)
                mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
        ncpurgeminvnodes = numbucketlocks;

        ncneghash = 3;
        neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE,
            M_WAITOK | M_ZERO);
        for (i = 0; i < numneglists; i++) {
                mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
                TAILQ_INIT(&neglists[i].nl_list);
        }
        mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF);
        TAILQ_INIT(&ncneg_hot.nl_list);

        mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF);

        numcalls = counter_u64_alloc(M_WAITOK);
        dothits = counter_u64_alloc(M_WAITOK);
        dotdothits = counter_u64_alloc(M_WAITOK);
        numchecks = counter_u64_alloc(M_WAITOK);
        nummiss = counter_u64_alloc(M_WAITOK);
        nummisszap = counter_u64_alloc(M_WAITOK);
        numposzaps = counter_u64_alloc(M_WAITOK);
        numposhits = counter_u64_alloc(M_WAITOK);
        numnegzaps = counter_u64_alloc(M_WAITOK);
        numneghits = counter_u64_alloc(M_WAITOK);
        numfullpathcalls = counter_u64_alloc(M_WAITOK);
        numfullpathfail1 = counter_u64_alloc(M_WAITOK);
        numfullpathfail2 = counter_u64_alloc(M_WAITOK);
        numfullpathfail4 = counter_u64_alloc(M_WAITOK);
        numfullpathfound = counter_u64_alloc(M_WAITOK);
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);

void
cache_changesize(int newmaxvnodes)
{
        struct nchashhead *new_nchashtbl, *old_nchashtbl;
        u_long new_nchash, old_nchash;
        struct namecache *ncp;
        uint32_t hash;
        int i;

        newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
        if (newmaxvnodes < numbucketlocks)
                newmaxvnodes = numbucketlocks;

        new_nchashtbl = hashinit(newmaxvnodes, M_VFSCACHE, &new_nchash);
        /* If same hash table size, nothing to do */
        if (nchash == new_nchash) {
                free(new_nchashtbl, M_VFSCACHE);
                return;
        }
        /*
         * Move everything from the old hash table to the new table.
         * None of the namecache entries in the table can be removed
         * because to do so, they have to be removed from the hash table.
         */
        cache_lock_all_vnodes();
        cache_lock_all_buckets();
        old_nchashtbl = nchashtbl;
        old_nchash = nchash;
        nchashtbl = new_nchashtbl;
        nchash = new_nchash;
        for (i = 0; i <= old_nchash; i++) {
                while ((ncp = LIST_FIRST(&old_nchashtbl[i])) != NULL) {
                        hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
                            ncp->nc_dvp);
                        LIST_REMOVE(ncp, nc_hash);
                        LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
                }
        }
        cache_unlock_all_buckets();
        cache_unlock_all_vnodes();
        free(old_nchashtbl, M_VFSCACHE);
}

/*
 * Invalidate all entries to a particular vnode.
 */
void
cache_purge(struct vnode *vp)
{
        TAILQ_HEAD(, namecache) ncps;
        struct namecache *ncp, *nnp;
        struct mtx *vlp, *vlp2;

        CTR1(KTR_VFS, "cache_purge(%p)", vp);
        SDT_PROBE1(vfs, namecache, purge, done, vp);
        if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
            vp->v_cache_dd == NULL)
                return;
        TAILQ_INIT(&ncps);
        vlp = VP2VNODELOCK(vp);
        vlp2 = NULL;
        mtx_lock(vlp);
retry:
        while (!LIST_EMPTY(&vp->v_cache_src)) {
                ncp = LIST_FIRST(&vp->v_cache_src);
                if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
                        goto retry;
                TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
        }
        while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
                ncp = TAILQ_FIRST(&vp->v_cache_dst);
                if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
                        goto retry;
                TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
        }
        ncp = vp->v_cache_dd;
        if (ncp != NULL) {
                KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
                    ("lost dotdot link"));
                if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
                        goto retry;
                TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
        }
        KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
        mtx_unlock(vlp);
        if (vlp2 != NULL)
                mtx_unlock(vlp2);
        TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
                cache_free(ncp);
        }
}
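
/*
 * Usage sketch: a filesystem typically calls cache_purge() when the names
 * cached for a vnode become stale, e.g. from its rmdir or rename paths.
 * The "examplefs" routine below is hypothetical and only shows the calling
 * convention.
 *
 *	static int
 *	examplefs_rmdir(struct vop_rmdir_args *ap)
 *	{
 *		...
 *		cache_purge(ap->a_dvp);
 *		cache_purge(ap->a_vp);
 *		...
 *	}
 */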

/*
 * Invalidate all negative entries for a particular directory vnode.
 */
void
cache_purge_negative(struct vnode *vp)
{
        TAILQ_HEAD(, namecache) ncps;
        struct namecache *ncp, *nnp;
        struct mtx *vlp;

        CTR1(KTR_VFS, "cache_purge_negative(%p)", vp);
        SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
        if (LIST_EMPTY(&vp->v_cache_src))
                return;
        TAILQ_INIT(&ncps);
        vlp = VP2VNODELOCK(vp);
        mtx_lock(vlp);
        LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
                if (!(ncp->nc_flag & NCF_NEGATIVE))
                        continue;
                cache_zap_negative_locked_vnode_kl(ncp, vp);
                TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
        }
        mtx_unlock(vlp);
        TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
                cache_free(ncp);
        }
}

/*
 * Flush all entries referencing a particular filesystem.
 */
void
cache_purgevfs(struct mount *mp, bool force)
{
        TAILQ_HEAD(, namecache) ncps;
        struct mtx *vlp1, *vlp2;
        struct rwlock *blp;
        struct nchashhead *bucket;
        struct namecache *ncp, *nnp;
        u_long i, j, n_nchash;
        int error;

        /* Scan hash tables for applicable entries */
        SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
        if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes)
                return;
        TAILQ_INIT(&ncps);
        n_nchash = nchash + 1;
        vlp1 = vlp2 = NULL;
        for (i = 0; i < numbucketlocks; i++) {
                blp = (struct rwlock *)&bucketlocks[i];
                rw_wlock(blp);
                for (j = i; j < n_nchash; j += numbucketlocks) {
retry:
                        bucket = &nchashtbl[j];
                        LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) {
                                cache_assert_bucket_locked(ncp, RA_WLOCKED);
                                if (ncp->nc_dvp->v_mount != mp)
                                        continue;
                                error = cache_zap_wlocked_bucket_kl(ncp, blp,
                                    &vlp1, &vlp2);
                                if (error != 0)
                                        goto retry;
                                TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst);
                        }
                }
                rw_wunlock(blp);
                if (vlp1 == NULL && vlp2 == NULL)
                        cache_maybe_yield();
        }
        if (vlp1 != NULL)
                mtx_unlock(vlp1);
        if (vlp2 != NULL)
                mtx_unlock(vlp2);

        TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
                cache_free(ncp);
        }
}

/*
 * Perform canonical checks and cache lookup and pass on to filesystem
 * through the vop_cachedlookup only if needed.
 */

int
vfs_cache_lookup(struct vop_lookup_args *ap)
{
        struct vnode *dvp;
        int error;
        struct vnode **vpp = ap->a_vpp;
        struct componentname *cnp = ap->a_cnp;
        struct ucred *cred = cnp->cn_cred;
        int flags = cnp->cn_flags;
        struct thread *td = cnp->cn_thread;

        *vpp = NULL;
        dvp = ap->a_dvp;

        if (dvp->v_type != VDIR)
                return (ENOTDIR);

        if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
            (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
                return (EROFS);

        error = VOP_ACCESS(dvp, VEXEC, cred, td);
        if (error)
                return (error);

        error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
        if (error == 0)
                return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
        if (error == -1)
                return (0);
        return (error);
}
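
/*
 * Usage sketch: a filesystem opts into the name cache by installing
 * vfs_cache_lookup() in its vop_lookup slot and providing the uncached
 * lookup routine via vop_cachedlookup.  The "examplefs" vector below is
 * hypothetical and only shows the wiring.
 *
 *	struct vop_vector examplefs_vnodeops = {
 *		.vop_default		= &default_vnodeops,
 *		.vop_lookup		= vfs_cache_lookup,
 *		.vop_cachedlookup	= examplefs_lookup,
 *		...
 *	};
 */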

/*
 * XXX All of these sysctls would probably be more productive dead.
 */
static int __read_mostly disablecwd;
SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
    "Disable the getcwd syscall");

/* Implementation of the getcwd syscall. */
int
sys___getcwd(struct thread *td, struct __getcwd_args *uap)
{

        return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen,
            MAXPATHLEN));
}

int
kern___getcwd(struct thread *td, char *buf, enum uio_seg bufseg, size_t buflen,
    size_t path_max)
{
        char *bp, *tmpbuf;
        struct filedesc *fdp;
        struct vnode *cdir, *rdir;
        int error;

        if (__predict_false(disablecwd))
                return (ENODEV);
        if (__predict_false(buflen < 2))
                return (EINVAL);
        if (buflen > path_max)
                buflen = path_max;

        tmpbuf = malloc(buflen, M_TEMP, M_WAITOK);
        fdp = td->td_proc->p_fd;
        FILEDESC_SLOCK(fdp);
        cdir = fdp->fd_cdir;
        vrefact(cdir);
        rdir = fdp->fd_rdir;
        vrefact(rdir);
        FILEDESC_SUNLOCK(fdp);
        error = vn_fullpath1(td, cdir, rdir, tmpbuf, &bp, buflen);
        vrele(rdir);
        vrele(cdir);

        if (!error) {
                if (bufseg == UIO_SYSSPACE)
                        bcopy(bp, buf, strlen(bp) + 1);
                else
                        error = copyout(bp, buf, strlen(bp) + 1);
#ifdef KTRACE
                if (KTRPOINT(curthread, KTR_NAMEI))
                        ktrnamei(bp);
#endif
        }
        free(tmpbuf, M_TEMP);
        return (error);
}

/*
 * Thus begins the fullpath magic.
 */

static int __read_mostly disablefullpath;
SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0,
    "Disable the vn_fullpath function");

/*
 * Retrieve the full filesystem path that corresponds to a vnode from the name
 * cache (if available).
 */
int
vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
{
        char *buf;
        struct filedesc *fdp;
        struct vnode *rdir;
        int error;

        if (__predict_false(disablefullpath))
                return (ENODEV);
        if (__predict_false(vn == NULL))
                return (EINVAL);

        buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
        fdp = td->td_proc->p_fd;
        FILEDESC_SLOCK(fdp);
        rdir = fdp->fd_rdir;
        vrefact(rdir);
        FILEDESC_SUNLOCK(fdp);
        error = vn_fullpath1(td, vn, rdir, buf, retbuf, MAXPATHLEN);
        vrele(rdir);

        if (!error)
                *freebuf = buf;
        else
                free(buf, M_TEMP);
        return (error);
}
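
/*
 * Usage sketch: on success the resolved path is read through "retbuf" while
 * the separately returned "freebuf" must be released with free(9); the two
 * differ because the path is assembled at the tail of the allocation.
 *
 *	char *fullpath, *freebuf;
 *
 *	if (vn_fullpath(td, vp, &fullpath, &freebuf) == 0) {
 *		printf("%s\n", fullpath);
 *		free(freebuf, M_TEMP);
 *	}
 */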

/*
 * This function is similar to vn_fullpath, but it attempts to look up the
 * pathname relative to the global root mount point.  This is required for the
 * auditing sub-system, as audited pathnames must be absolute, relative to the
 * global root mount point.
 */
int
vn_fullpath_global(struct thread *td, struct vnode *vn,
    char **retbuf, char **freebuf)
{
        char *buf;
        int error;

        if (__predict_false(disablefullpath))
                return (ENODEV);
        if (__predict_false(vn == NULL))
                return (EINVAL);
        buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
        error = vn_fullpath1(td, vn, rootvnode, buf, retbuf, MAXPATHLEN);
        if (!error)
                *freebuf = buf;
        else
                free(buf, M_TEMP);
        return (error);
}

int
vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, u_int *buflen)
{
        struct vnode *dvp;
        struct namecache *ncp;
        struct mtx *vlp;
        int error;

        vlp = VP2VNODELOCK(*vp);
        mtx_lock(vlp);
        TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) {
                if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
                        break;
        }
        if (ncp != NULL) {
                if (*buflen < ncp->nc_nlen) {
                        mtx_unlock(vlp);
                        vrele(*vp);
                        counter_u64_add(numfullpathfail4, 1);
                        error = ENOMEM;
                        SDT_PROBE3(vfs, namecache, fullpath, return, error,
                            vp, NULL);
                        return (error);
                }
                *buflen -= ncp->nc_nlen;
                memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
                SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
                    ncp->nc_name, vp);
                dvp = *vp;
                *vp = ncp->nc_dvp;
                vref(*vp);
                mtx_unlock(vlp);
                vrele(dvp);
                return (0);
        }
        SDT_PROBE1(vfs, namecache, fullpath, miss, vp);

        mtx_unlock(vlp);
        vn_lock(*vp, LK_SHARED | LK_RETRY);
        error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen);
        vput(*vp);
        if (error) {
                counter_u64_add(numfullpathfail2, 1);
                SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
                return (error);
        }

        *vp = dvp;
        if (dvp->v_iflag & VI_DOOMED) {
                /* forced unmount */
                vrele(dvp);
                error = ENOENT;
                SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
                return (error);
        }
        /*
         * *vp has its use count incremented still.
         */

        return (0);
}
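
/*
 * Illustrative sketch, assuming a caller that already holds a reference on
 * "vp" (the variable names here are hypothetical): vn_vptocnp() fills the
 * buffer from the end towards the front and exchanges the referenced vnode
 * for its parent, which is how vn_fullpath1() below assembles a path while
 * walking towards the root.
 *
 *	char buf[MAXPATHLEN], *name;
 *	u_int buflen;
 *	int error;
 *
 *	buflen = sizeof(buf) - 1;
 *	buf[buflen] = '\0';
 *	vref(vp);
 *	error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen);
 *	if (error == 0) {
 *		name = buf + buflen;	last component, NUL-terminated
 *		vrele(vp);		vp now references the parent
 *	}
 */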

/*
 * The magic behind kern___getcwd() and vn_fullpath().
 */
static int
vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
    char *buf, char **retbuf, u_int buflen)
{
        int error, slash_prefixed;
#ifdef KDTRACE_HOOKS
        struct vnode *startvp = vp;
#endif
        struct vnode *vp1;

        buflen--;
        buf[buflen] = '\0';
        error = 0;
        slash_prefixed = 0;

        SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
        counter_u64_add(numfullpathcalls, 1);
        vref(vp);
        if (vp->v_type != VDIR) {
                error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen);
                if (error)
                        return (error);
                if (buflen == 0) {
                        vrele(vp);
                        return (ENOMEM);
                }
                buf[--buflen] = '/';
                slash_prefixed = 1;
        }
        while (vp != rdir && vp != rootvnode) {
                /*
                 * The vp vnode must be already fully constructed,
                 * since it is either found in namecache or obtained
                 * from VOP_VPTOCNP().  We may test for VV_ROOT safely
                 * without obtaining the vnode lock.
                 */
                if ((vp->v_vflag & VV_ROOT) != 0) {
                        vn_lock(vp, LK_RETRY | LK_SHARED);

                        /*
                         * With the vnode locked, check for races with
                         * unmount, forced or not.  Note that we
                         * already verified that vp is not equal to
                         * the root vnode, which means that
                         * mnt_vnodecovered can be NULL only for the
                         * case of unmount.
                         */
                        if ((vp->v_iflag & VI_DOOMED) != 0 ||
                            (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
                            vp1->v_mountedhere != vp->v_mount) {
                                vput(vp);
                                error = ENOENT;
                                SDT_PROBE3(vfs, namecache, fullpath, return,
                                    error, vp, NULL);
                                break;
                        }

                        vref(vp1);
                        vput(vp);
                        vp = vp1;
                        continue;
                }
                if (vp->v_type != VDIR) {
                        vrele(vp);
                        counter_u64_add(numfullpathfail1, 1);
                        error = ENOTDIR;
                        SDT_PROBE3(vfs, namecache, fullpath, return,
                            error, vp, NULL);
                        break;
                }
                error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen);
                if (error)
                        break;
                if (buflen == 0) {
                        vrele(vp);
                        error = ENOMEM;
                        SDT_PROBE3(vfs, namecache, fullpath, return, error,
                            startvp, NULL);
                        break;
                }
                buf[--buflen] = '/';
                slash_prefixed = 1;
        }
        if (error)
                return (error);
        if (!slash_prefixed) {
                if (buflen == 0) {
                        vrele(vp);
                        counter_u64_add(numfullpathfail4, 1);
                        SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
                            startvp, NULL);
                        return (ENOMEM);
                }
                buf[--buflen] = '/';
        }
        counter_u64_add(numfullpathfound, 1);
        vrele(vp);

        SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, buf + buflen);
        *retbuf = buf + buflen;
        return (0);
}

struct vnode *
vn_dir_dd_ino(struct vnode *vp)
{
        struct namecache *ncp;
        struct vnode *ddvp;
        struct mtx *vlp;

        ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
        vlp = VP2VNODELOCK(vp);
        mtx_lock(vlp);
        TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
                if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
                        continue;
                ddvp = ncp->nc_dvp;
                vhold(ddvp);
                mtx_unlock(vlp);
                if (vget(ddvp, LK_SHARED | LK_NOWAIT | LK_VNHELD, curthread))
                        return (NULL);
                return (ddvp);
        }
        mtx_unlock(vlp);
        return (NULL);
}

int
vn_commname(struct vnode *vp, char *buf, u_int buflen)
{
        struct namecache *ncp;
        struct mtx *vlp;
        int l;

        vlp = VP2VNODELOCK(vp);
        mtx_lock(vlp);
        TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
                if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
                        break;
        if (ncp == NULL) {
                mtx_unlock(vlp);
                return (ENOENT);
        }
        l = min(ncp->nc_nlen, buflen - 1);
        memcpy(buf, ncp->nc_name, l);
        mtx_unlock(vlp);
        buf[l] = '\0';
        return (0);
}

/*
 * This function updates the path string to the vnode's full global path
 * and checks the size of the new path string against the pathlen argument.
 *
 * Requires a locked, referenced vnode.
 * Vnode is re-locked on success or ENODEV, otherwise unlocked.
 *
 * If sysctl debug.disablefullpath is set, ENODEV is returned and the
 * vnode is left locked and the path remains untouched.
 *
 * If vp is a directory, the call to vn_fullpath_global() always succeeds
 * because it falls back to the ".." lookup if the namecache lookup fails.
 */
int
vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
    u_int pathlen)
{
        struct nameidata nd;
        struct vnode *vp1;
        char *rpath, *fbuf;
        int error;

        ASSERT_VOP_ELOCKED(vp, __func__);

        /* Return ENODEV if sysctl debug.disablefullpath==1 */
        if (__predict_false(disablefullpath))
                return (ENODEV);

        /* Construct global filesystem path from vp. */
        VOP_UNLOCK(vp, 0);
        error = vn_fullpath_global(td, vp, &rpath, &fbuf);

        if (error != 0) {
                vrele(vp);
                return (error);
        }

        if (strlen(rpath) >= pathlen) {
                vrele(vp);
                error = ENAMETOOLONG;
                goto out;
        }

        /*
         * Re-lookup the vnode by path to detect a possible rename.
         * As a side effect, the vnode is relocked.
         * If vnode was renamed, return ENOENT.
         */
        NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
            UIO_SYSSPACE, path, td);
        error = namei(&nd);
        if (error != 0) {
                vrele(vp);
                goto out;
        }
        NDFREE(&nd, NDF_ONLY_PNBUF);
        vp1 = nd.ni_vp;
        vrele(vp);
        if (vp1 == vp)
                strcpy(path, rpath);
        else {
                vput(vp1);
                error = ENOENT;
        }

out:
        free(fbuf, M_TEMP);
        return (error);
}

#ifdef DDB
static void
db_print_vpath(struct vnode *vp)
{

        while (vp != NULL) {
                db_printf("%p: ", vp);
                if (vp == rootvnode) {
                        db_printf("/");
                        vp = NULL;
                } else {
                        if (vp->v_vflag & VV_ROOT) {
                                db_printf("<mount point>");
                                vp = vp->v_mount->mnt_vnodecovered;
                        } else {
                                struct namecache *ncp;
                                char *ncn;
                                int i;

                                ncp = TAILQ_FIRST(&vp->v_cache_dst);
                                if (ncp != NULL) {
                                        ncn = ncp->nc_name;
                                        for (i = 0; i < ncp->nc_nlen; i++)
                                                db_printf("%c", *ncn++);
                                        vp = ncp->nc_dvp;
                                } else {
                                        vp = NULL;
                                }
                        }
                }
                db_printf("\n");
        }

        return;
}

DB_SHOW_COMMAND(vpath, db_show_vpath)
{
        struct vnode *vp;

        if (!have_addr) {
                db_printf("usage: show vpath <struct vnode *>\n");
                return;
        }

        vp = (struct vnode *)addr;
        db_print_vpath(vp);
}

#endif
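
/*
 * Usage note: with DDB compiled in, the command defined above prints the
 * cached path leading from a vnode back to the root, e.g.:
 *
 *	db> show vpath <address of struct vnode>
 */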