/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Poul-Henning Kamp of the FreeBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/counter.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sdt.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vnode.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#ifdef DDB
#include <ddb/ddb.h>
#endif

#include <vm/uma.h>

SDT_PROVIDER_DECLARE(vfs);
SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
    "char *", "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
    "struct vnode *", "char *");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
    "struct vnode *", "char *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, zap_negative, done, "struct vnode *",
    "char *", "int");
SDT_PROBE_DEFINE3(vfs, namecache, shrink_negative, done, "struct vnode *",
    "char *", "int");

/*
 * This structure describes the elements in the cache of recent
 * names looked up by namei.
 */

struct namecache {
	LIST_ENTRY(namecache) nc_hash;	/* hash chain */
	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
	struct vnode *nc_dvp;		/* vnode of parent of name */
	union {
		struct vnode *nu_vp;	/* vnode the name refers to */
		u_int nu_neghits;	/* negative entry hits */
	} n_un;
	u_char	nc_flag;		/* flag bits */
	u_char	nc_nlen;		/* length of name */
	char	nc_name[0];		/* segment name + nul */
};

/*
 * struct namecache_ts embeds a struct namecache at its end and is used in
 * place of struct namecache when time(s) need to be stored.  The
 * nc_dotdottime field is used when a cache entry is mapping both a
 * non-dotdot directory name plus dotdot for the directory's parent.
 */
struct namecache_ts {
	struct timespec	nc_time;	/* timespec provided by fs */
	struct timespec	nc_dotdottime;	/* dotdot timespec provided by fs */
	int	nc_ticks;		/* ticks value when entry was added */
	struct namecache nc_nc;
};

#define	nc_vp		n_un.nu_vp
#define	nc_neghits	n_un.nu_neghits

/*
 * Flags in namecache.nc_flag
 */
#define	NCF_WHITE	0x01
#define	NCF_ISDOTDOT	0x02
#define	NCF_TS		0x04
#define	NCF_DTS		0x08
#define	NCF_DVDROP	0x10
#define	NCF_NEGATIVE	0x20
#define	NCF_HOTNEGATIVE	0x40

/*
 * Name caching works as follows:
 *
 * Names found by directory scans are retained in a cache
 * for future reference.  The cache is managed LRU, so frequently
 * used names will hang around.  The cache is indexed by hash value
 * obtained from (dvp, name), where dvp refers to the directory
 * containing the name.
 *
 * If the entry is a "negative" entry (i.e. for a name that is known NOT to
 * exist), the vnode pointer will be NULL.
 *
 * Upon reaching the last segment of a path, if the reference
 * is for DELETE, or NOCACHE is set (rewrite), and the
 * name is located in the cache, it will be dropped.
 *
 * These locks are used (in the order in which they can be taken):
 * NAME		TYPE	ROLE
 * vnodelock	mtx	vnode lists and v_cache_dd field protection
 * bucketlock	rwlock	for access to given set of hash buckets
 * neglist	mtx	negative entry LRU management
 *
 * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
 * shrinking the LRU list.
 *
 * It is legal to take multiple vnodelock and bucketlock locks.  The locking
 * order is lower address first.  Both are recursive.
 *
 * "." lookups are lockless.
 *
 * ".." and vnode -> name lookups require vnodelock.
 *
 * name -> vnode lookup requires the relevant bucketlock to be held for
 * reading.
 *
 * Insertions and removals of entries require involved vnodes and bucketlocks
 * to be write-locked to prevent other threads from seeing the entry.
 *
 * Some lookups result in removal of the found entry (e.g. getting rid of a
 * negative entry with the intent to create a positive one), which poses a
 * problem when multiple threads reach the same state.  Similarly, two
 * different threads can purge two different vnodes and try to remove the
 * same name.
 *
 * If the already held vnode lock is lower than the second required lock, we
 * can just take the other lock.  However, in the opposite case this could
 * deadlock.  As such, this is resolved by trylocking and, if that fails,
 * unlocking the first node, locking everything in order and revalidating
 * the state.
 */

/*
 * Structures associated with name caching.
 */
#define	NCHHASH(hash) \
	(&nchashtbl[(hash) & nchash])
static __read_mostly LIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
static u_long __read_mostly	nchash;			/* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
    "Size of namecache hash table");
static u_long __read_mostly	ncnegfactor = 12; /* ratio of negative entries */
SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of negative namecache entries");
static u_long __exclusive_cache_line	numneg;	/* number of negative entries allocated */
SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0,
    "Number of negative entries in namecache");
static u_long __exclusive_cache_line	numcache;/* number of cache entries allocated */
SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0,
    "Number of namecache entries");
static u_long __exclusive_cache_line	numcachehv;/* number of cache entries with vnodes held */
SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0,
    "Number of namecache entries with vnodes held");
u_int __read_mostly	ncsizefactor = 2;
SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
    "Size factor for namecache");
static u_int __read_mostly	ncpurgeminvnodes;
SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0,
    "Number of vnodes below which purgevfs ignores the request");
static u_int __read_mostly	ncneghitsrequeue = 8;
SYSCTL_UINT(_vfs, OID_AUTO, ncneghitsrequeue, CTLFLAG_RW, &ncneghitsrequeue, 0,
    "Number of hits to requeue a negative entry in the LRU list");

struct nchstats	nchstats;		/* cache effectiveness statistics */

static struct mtx	ncneg_shrink_lock;
static int	shrink_list_turn;

struct neglist {
	struct mtx		nl_lock;
	TAILQ_HEAD(, namecache) nl_list;
} __aligned(CACHE_LINE_SIZE);

static struct neglist __read_mostly	*neglists;
static struct neglist ncneg_hot;

#define	numneglists	(ncneghash + 1)
static u_int __read_mostly	ncneghash;
static inline struct neglist *
NCP2NEGLIST(struct namecache *ncp)
{

	return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
}

#define	numbucketlocks (ncbuckethash + 1)
static u_int __read_mostly	ncbuckethash;
static struct rwlock_padalign __read_mostly	*bucketlocks;
#define	HASH2BUCKETLOCK(hash) \
	((struct rwlock *)(&bucketlocks[((hash) & ncbuckethash)]))

#define	numvnodelocks (ncvnodehash + 1)
static u_int __read_mostly	ncvnodehash;
static struct mtx __read_mostly *vnodelocks;
static inline struct mtx *
VP2VNODELOCK(struct vnode *vp)
{

	return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
}

/*
 * UMA zones for the VFS cache.
 *
 * The small cache is used for entries with short names, which are the
 * most common.  The large cache is used for entries which are too big to
 * fit in the small cache.
 */
static uma_zone_t __read_mostly cache_zone_small;
static uma_zone_t __read_mostly cache_zone_small_ts;
static uma_zone_t __read_mostly cache_zone_large;
static uma_zone_t __read_mostly cache_zone_large_ts;

#define	CACHE_PATH_CUTOFF	35

static struct namecache *
cache_alloc(int len, int ts)
{
	struct namecache_ts *ncp_ts;
	struct namecache *ncp;

	if (__predict_false(ts)) {
		if (len <= CACHE_PATH_CUTOFF)
			ncp_ts = uma_zalloc(cache_zone_small_ts, M_WAITOK);
		else
			ncp_ts = uma_zalloc(cache_zone_large_ts, M_WAITOK);
		ncp = &ncp_ts->nc_nc;
	} else {
		if (len <= CACHE_PATH_CUTOFF)
			ncp = uma_zalloc(cache_zone_small, M_WAITOK);
		else
			ncp = uma_zalloc(cache_zone_large, M_WAITOK);
	}
	return (ncp);
}

static void
cache_free(struct namecache *ncp)
{
	struct namecache_ts *ncp_ts;

	if (ncp == NULL)
		return;
	if ((ncp->nc_flag & NCF_DVDROP) != 0)
		vdrop(ncp->nc_dvp);
	if (__predict_false(ncp->nc_flag & NCF_TS)) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
			uma_zfree(cache_zone_small_ts, ncp_ts);
		else
			uma_zfree(cache_zone_large_ts, ncp_ts);
	} else {
		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
			uma_zfree(cache_zone_small, ncp);
		else
			uma_zfree(cache_zone_large, ncp);
	}
}

static void
cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
{
	struct namecache_ts *ncp_ts;

	KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
	    (tsp == NULL && ticksp == NULL),
	    ("No NCF_TS"));

	if (tsp == NULL && ticksp == NULL)
		return;

	ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
	if (tsp != NULL)
		*tsp = ncp_ts->nc_time;
	if (ticksp != NULL)
		*ticksp = ncp_ts->nc_ticks;
}

static int __read_mostly	doingcache = 1;	/* 1 => enable the cache */
SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
    "VFS namecache enabled");

/* Export size information to userland */
SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
    sizeof(struct namecache), "sizeof(struct namecache)");

/*
 * The new name cache statistics
 */
static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0,
    "Name cache statistics");
#define	STATNODE_ULONG(name, descr)					\
	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr);
#define	STATNODE_COUNTER(name, descr)					\
	static counter_u64_t __read_mostly name;			\
	SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, descr);
STATNODE_ULONG(numneg, "Number of negative cache entries");
STATNODE_ULONG(numcache, "Number of cache entries");
STATNODE_COUNTER(numcalls, "Number of cache lookups");
STATNODE_COUNTER(dothits, "Number of '.' hits");
STATNODE_COUNTER(dotdothits, "Number of '..' hits");
STATNODE_COUNTER(numchecks, "Number of checks in lookup");
STATNODE_COUNTER(nummiss, "Number of cache misses");
STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache");
STATNODE_COUNTER(numposzaps,
    "Number of cache hits (positive) we do not want to cache");
STATNODE_COUNTER(numposhits, "Number of cache hits (positive)");
STATNODE_COUNTER(numnegzaps,
    "Number of cache hits (negative) we do not want to cache");
STATNODE_COUNTER(numneghits, "Number of cache hits (negative)");
/* These count for kern___getcwd(), too. */
STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls");
STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
STATNODE_COUNTER(numfullpathfail2,
    "Number of fullpath search errors (VOP_VPTOCNP failures)");
STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls");
STATNODE_COUNTER(zap_and_exit_bucket_relock_success,
    "Number of successful removals after relocking");
static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail,
    "Number of times zap_and_exit failed to lock");
static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2,
    "Number of times zap_and_exit failed to lock");
static long cache_lock_vnodes_cel_3_failures;
STATNODE_ULONG(cache_lock_vnodes_cel_3_failures,
    "Number of times 3-way vnode locking failed");

static void cache_zap_locked(struct namecache *ncp, bool neg_locked);
static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
    char *buf, char **retbuf, u_int buflen);

static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");

static int cache_yield;
SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0,
    "Number of times cache called yield");

static void __noinline
cache_maybe_yield(void)
{

	if (should_yield()) {
		cache_yield++;
		kern_yield(PRI_USER);
	}
}

static inline void
cache_assert_vlp_locked(struct mtx *vlp)
{

	if (vlp != NULL)
		mtx_assert(vlp, MA_OWNED);
}

static inline void
cache_assert_vnode_locked(struct vnode *vp)
{
	struct mtx *vlp;

	vlp = VP2VNODELOCK(vp);
	cache_assert_vlp_locked(vlp);
}

static uint32_t
cache_get_hash(char *name, u_char len, struct vnode *dvp)
{
	uint32_t hash;

	hash = fnv_32_buf(name, len, FNV1_32_INIT);
	hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
	return (hash);
}

static inline struct rwlock *
NCP2BUCKETLOCK(struct namecache *ncp)
{
	uint32_t hash;

	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
	return (HASH2BUCKETLOCK(hash));
}

#ifdef INVARIANTS
static void
cache_assert_bucket_locked(struct namecache *ncp, int mode)
{
	struct rwlock *blp;

	blp = NCP2BUCKETLOCK(ncp);
	rw_assert(blp, mode);
}
#else
#define	cache_assert_bucket_locked(x, y) do { } while (0)
#endif

#define	cache_sort_vnodes(x, y)	_cache_sort_vnodes((void **)(x), (void **)(y))
static void
_cache_sort_vnodes(void **p1, void **p2)
{
	void *tmp;

	MPASS(*p1 != NULL || *p2 != NULL);

	if (*p1 > *p2) {
		tmp = *p2;
		*p2 = *p1;
		*p1 = tmp;
	}
}

static void
cache_lock_all_buckets(void)
{
	u_int i;

	for (i = 0; i < numbucketlocks; i++)
		rw_wlock(&bucketlocks[i]);
}

static void
cache_unlock_all_buckets(void)
{
	u_int i;

	for (i = 0; i < numbucketlocks; i++)
		rw_wunlock(&bucketlocks[i]);
}

static void
cache_lock_all_vnodes(void)
{
	u_int i;

	for (i = 0; i < numvnodelocks; i++)
		mtx_lock(&vnodelocks[i]);
}

static void
cache_unlock_all_vnodes(void)
{
	u_int i;

	for (i = 0; i < numvnodelocks; i++)
		mtx_unlock(&vnodelocks[i]);
}
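/*
 * Editorial note on the helpers below (descriptive comment, not part of the
 * original interface documentation): cache_lock_vnodes() expects its two
 * arguments already sorted by address (lower first) and blocks as needed,
 * while cache_trylock_vnodes() sorts them itself and never sleeps, returning
 * EAGAIN with neither lock held so the caller can back off and relock in
 * order, as described in the locking comment near the top of the file.
 */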
static int
cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	cache_sort_vnodes(&vlp1, &vlp2);

	if (vlp1 != NULL) {
		if (!mtx_trylock(vlp1))
			return (EAGAIN);
	}
	if (!mtx_trylock(vlp2)) {
		if (vlp1 != NULL)
			mtx_unlock(vlp1);
		return (EAGAIN);
	}

	return (0);
}

static void
cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	MPASS(vlp1 != NULL || vlp2 != NULL);
	MPASS(vlp1 <= vlp2);

	if (vlp1 != NULL)
		mtx_lock(vlp1);
	if (vlp2 != NULL)
		mtx_lock(vlp2);
}

static void
cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	MPASS(vlp1 != NULL || vlp2 != NULL);

	if (vlp1 != NULL)
		mtx_unlock(vlp1);
	if (vlp2 != NULL)
		mtx_unlock(vlp2);
}
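/*
 * Export the legacy nchstats structure.  The fields that are maintained as
 * per-CPU counter(9) counters are folded into the snapshot at read time.
 */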
static int
sysctl_nchstats(SYSCTL_HANDLER_ARGS)
{
	struct nchstats snap;

	if (req->oldptr == NULL)
		return (SYSCTL_OUT(req, 0, sizeof(snap)));

	snap = nchstats;
	snap.ncs_goodhits = counter_u64_fetch(numposhits);
	snap.ncs_neghits = counter_u64_fetch(numneghits);
	snap.ncs_badhits = counter_u64_fetch(numposzaps) +
	    counter_u64_fetch(numnegzaps);
	snap.ncs_miss = counter_u64_fetch(nummisszap) +
	    counter_u64_fetch(nummiss);

	return (SYSCTL_OUT(req, &snap, sizeof(snap)));
}
SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
    CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
    "VFS cache effectiveness statistics");

#ifdef DIAGNOSTIC
/*
 * Grab an atomic snapshot of the name cache hash chain lengths
 */
static SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL,
    "hash table stats");

static int
sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
{
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int i, error, n_nchash, *cntbuf;

retry:
	n_nchash = nchash + 1;	/* nchash is max index, not count */
	if (req->oldptr == NULL)
		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
	cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
	cache_lock_all_buckets();
	if (n_nchash != nchash + 1) {
		cache_unlock_all_buckets();
		free(cntbuf, M_TEMP);
		goto retry;
	}
	/* Scan hash tables counting entries */
	for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
		LIST_FOREACH(ncp, ncpp, nc_hash)
			cntbuf[i]++;
	cache_unlock_all_buckets();
	for (error = 0, i = 0; i < n_nchash; i++)
		if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
			break;
	free(cntbuf, M_TEMP);
	return (error);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
    "nchash chain lengths");

static int
sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int n_nchash;
	int count, maxlength, used, pct;

	if (!req->oldptr)
		return SYSCTL_OUT(req, 0, 4 * sizeof(int));

	cache_lock_all_buckets();
	n_nchash = nchash + 1;	/* nchash is max index, not count */
	used = 0;
	maxlength = 0;

	/* Scan hash tables for applicable entries */
	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
		count = 0;
		LIST_FOREACH(ncp, ncpp, nc_hash) {
			count++;
		}
		if (count)
			used++;
		if (maxlength < count)
			maxlength = count;
	}
	n_nchash = nchash + 1;
	cache_unlock_all_buckets();
	pct = (used * 100) / (n_nchash / 100);
	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &used, sizeof(used));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &pct, sizeof(pct));
	if (error)
		return (error);
	return (0);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
    "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
#endif

/*
 * Negative entries management
 *
 * A variation of the LRU scheme is used.  New entries are hashed into one of
 * numneglists cold lists.  Entries get promoted to the hot list on first hit.
 * Partial LRU for the hot list is maintained by requeueing entries every
 * ncneghitsrequeue hits.
 *
 * The shrinker will demote the hot list head and evict from the cold list in
 * a round-robin manner.
 */
static void
cache_negative_hit(struct namecache *ncp)
{
	struct neglist *neglist;
	u_int hits;

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	hits = atomic_fetchadd_int(&ncp->nc_neghits, 1);
	if (ncp->nc_flag & NCF_HOTNEGATIVE) {
		if ((hits % ncneghitsrequeue) != 0)
			return;
		mtx_lock(&ncneg_hot.nl_lock);
		if (ncp->nc_flag & NCF_HOTNEGATIVE) {
			TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
			TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
			mtx_unlock(&ncneg_hot.nl_lock);
			return;
		}
		/*
		 * The shrinker cleared the flag and removed the entry from
		 * the hot list.  Put it back.
		 */
	} else {
		mtx_lock(&ncneg_hot.nl_lock);
	}
	neglist = NCP2NEGLIST(ncp);
	mtx_lock(&neglist->nl_lock);
	if (!(ncp->nc_flag & NCF_HOTNEGATIVE)) {
		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
		TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
		ncp->nc_flag |= NCF_HOTNEGATIVE;
	}
	mtx_unlock(&neglist->nl_lock);
	mtx_unlock(&ncneg_hot.nl_lock);
}

static void
cache_negative_insert(struct namecache *ncp, bool neg_locked)
{
	struct neglist *neglist;

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_bucket_locked(ncp, RA_WLOCKED);
	neglist = NCP2NEGLIST(ncp);
	if (!neg_locked) {
		mtx_lock(&neglist->nl_lock);
	} else {
		mtx_assert(&neglist->nl_lock, MA_OWNED);
	}
	TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
	if (!neg_locked)
		mtx_unlock(&neglist->nl_lock);
	atomic_add_rel_long(&numneg, 1);
}

static void
cache_negative_remove(struct namecache *ncp, bool neg_locked)
{
	struct neglist *neglist;
	bool hot_locked = false;
	bool list_locked = false;

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_bucket_locked(ncp, RA_WLOCKED);
	neglist = NCP2NEGLIST(ncp);
	if (!neg_locked) {
		if (ncp->nc_flag & NCF_HOTNEGATIVE) {
			hot_locked = true;
			mtx_lock(&ncneg_hot.nl_lock);
			if (!(ncp->nc_flag & NCF_HOTNEGATIVE)) {
				list_locked = true;
				mtx_lock(&neglist->nl_lock);
			}
		} else {
			list_locked = true;
			mtx_lock(&neglist->nl_lock);
		}
	}
	if (ncp->nc_flag & NCF_HOTNEGATIVE) {
		mtx_assert(&ncneg_hot.nl_lock, MA_OWNED);
		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
	} else {
		mtx_assert(&neglist->nl_lock, MA_OWNED);
		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
	}
	if (list_locked)
		mtx_unlock(&neglist->nl_lock);
	if (hot_locked)
		mtx_unlock(&ncneg_hot.nl_lock);
	atomic_subtract_rel_long(&numneg, 1);
}
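/*
 * Select a victim for negative entry eviction: starting at cold list index
 * "start", find the first non-empty cold list and return its head entry and
 * the list itself, with the list lock held.  *ncpp is left NULL when no
 * entry was found.
 */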
static void
cache_negative_shrink_select(int start, struct namecache **ncpp,
    struct neglist **neglistpp)
{
	struct neglist *neglist;
	struct namecache *ncp;
	int i;

	*ncpp = ncp = NULL;
	neglist = NULL;

	for (i = start; i < numneglists; i++) {
		neglist = &neglists[i];
		if (TAILQ_FIRST(&neglist->nl_list) == NULL)
			continue;
		mtx_lock(&neglist->nl_lock);
		ncp = TAILQ_FIRST(&neglist->nl_list);
		if (ncp != NULL)
			break;
		mtx_unlock(&neglist->nl_lock);
	}

	*neglistpp = neglist;
	*ncpp = ncp;
}

static void
cache_negative_zap_one(void)
{
	struct namecache *ncp, *ncp2;
	struct neglist *neglist;
	struct mtx *dvlp;
	struct rwlock *blp;

	if (!mtx_trylock(&ncneg_shrink_lock))
		return;

	mtx_lock(&ncneg_hot.nl_lock);
	ncp = TAILQ_FIRST(&ncneg_hot.nl_list);
	if (ncp != NULL) {
		neglist = NCP2NEGLIST(ncp);
		mtx_lock(&neglist->nl_lock);
		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
		TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
		ncp->nc_flag &= ~NCF_HOTNEGATIVE;
		mtx_unlock(&neglist->nl_lock);
	}

	cache_negative_shrink_select(shrink_list_turn, &ncp, &neglist);
	shrink_list_turn++;
	if (shrink_list_turn == numneglists)
		shrink_list_turn = 0;
	if (ncp == NULL && shrink_list_turn == 0)
		cache_negative_shrink_select(shrink_list_turn, &ncp, &neglist);
	if (ncp == NULL) {
		mtx_unlock(&ncneg_hot.nl_lock);
		goto out;
	}

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	blp = NCP2BUCKETLOCK(ncp);
	mtx_unlock(&neglist->nl_lock);
	mtx_unlock(&ncneg_hot.nl_lock);
	mtx_lock(dvlp);
	rw_wlock(blp);
	mtx_lock(&neglist->nl_lock);
	ncp2 = TAILQ_FIRST(&neglist->nl_list);
	if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) ||
	    blp != NCP2BUCKETLOCK(ncp2) || !(ncp2->nc_flag & NCF_NEGATIVE)) {
		ncp = NULL;
		goto out_unlock_all;
	}
	SDT_PROBE3(vfs, namecache, shrink_negative, done, ncp->nc_dvp,
	    ncp->nc_name, ncp->nc_neghits);

	cache_zap_locked(ncp, true);
out_unlock_all:
	mtx_unlock(&neglist->nl_lock);
	rw_wunlock(blp);
	mtx_unlock(dvlp);
out:
	mtx_unlock(&ncneg_shrink_lock);
	cache_free(ncp);
}
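/*
 * Entry removal ("zapping") helpers.  They differ in which locks the caller
 * already holds: cache_zap_locked() requires all relevant vnode locks and
 * the bucket lock, the *_vnode_kl*() variants start from a held vnodelock,
 * and the *_bucket() variants start from a held bucketlock; the latter
 * trylock the remaining locks and fall back to relocking everything in
 * order (or returning EAGAIN) when that fails.
 */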
/*
 * cache_zap_locked():
 *
 *   Removes a namecache entry from cache, whether it contains an actual
 *   pointer to a vnode or if it is just a negative cache entry.
 */
static void
cache_zap_locked(struct namecache *ncp, bool neg_locked)
{

	if (!(ncp->nc_flag & NCF_NEGATIVE))
		cache_assert_vnode_locked(ncp->nc_vp);
	cache_assert_vnode_locked(ncp->nc_dvp);
	cache_assert_bucket_locked(ncp, RA_WLOCKED);

	CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp,
	    (ncp->nc_flag & NCF_NEGATIVE) ? NULL : ncp->nc_vp);
	LIST_REMOVE(ncp, nc_hash);
	if (!(ncp->nc_flag & NCF_NEGATIVE)) {
		SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
		    ncp->nc_name, ncp->nc_vp);
		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
		if (ncp == ncp->nc_vp->v_cache_dd)
			ncp->nc_vp->v_cache_dd = NULL;
	} else {
		SDT_PROBE3(vfs, namecache, zap_negative, done, ncp->nc_dvp,
		    ncp->nc_name, ncp->nc_neghits);
		cache_negative_remove(ncp, neg_locked);
	}
	if (ncp->nc_flag & NCF_ISDOTDOT) {
		if (ncp == ncp->nc_dvp->v_cache_dd)
			ncp->nc_dvp->v_cache_dd = NULL;
	} else {
		LIST_REMOVE(ncp, nc_src);
		if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
			ncp->nc_flag |= NCF_DVDROP;
			atomic_subtract_rel_long(&numcachehv, 1);
		}
	}
	atomic_subtract_rel_long(&numcache, 1);
}

static void
cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
{
	struct rwlock *blp;

	MPASS(ncp->nc_dvp == vp);
	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_vnode_locked(vp);

	blp = NCP2BUCKETLOCK(ncp);
	rw_wlock(blp);
	cache_zap_locked(ncp, false);
	rw_wunlock(blp);
}

static bool
cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
    struct mtx **vlpp)
{
	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
	struct rwlock *blp;

	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
	cache_assert_vnode_locked(vp);

	if (ncp->nc_flag & NCF_NEGATIVE) {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_zap_negative_locked_vnode_kl(ncp, vp);
		return (true);
	}

	pvlp = VP2VNODELOCK(vp);
	blp = NCP2BUCKETLOCK(ncp);
	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
	vlp2 = VP2VNODELOCK(ncp->nc_vp);

	if (*vlpp == vlp1 || *vlpp == vlp2) {
		to_unlock = *vlpp;
		*vlpp = NULL;
	} else {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_sort_vnodes(&vlp1, &vlp2);
		if (vlp1 == pvlp) {
			mtx_lock(vlp2);
			to_unlock = vlp2;
		} else {
			if (!mtx_trylock(vlp1))
				goto out_relock;
			to_unlock = vlp1;
		}
	}
	rw_wlock(blp);
	cache_zap_locked(ncp, false);
	rw_wunlock(blp);
	if (to_unlock != NULL)
		mtx_unlock(to_unlock);
	return (true);

out_relock:
	mtx_unlock(vlp2);
	mtx_lock(vlp1);
	mtx_lock(vlp2);
	MPASS(*vlpp == NULL);
	*vlpp = vlp1;
	return (false);
}

static int __noinline
cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp)
{
	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
	struct rwlock *blp;
	int error = 0;

	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
	cache_assert_vnode_locked(vp);

	pvlp = VP2VNODELOCK(vp);
	if (ncp->nc_flag & NCF_NEGATIVE) {
		cache_zap_negative_locked_vnode_kl(ncp, vp);
		goto out;
	}

	blp = NCP2BUCKETLOCK(ncp);
	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
	vlp2 = VP2VNODELOCK(ncp->nc_vp);
	cache_sort_vnodes(&vlp1, &vlp2);
	if (vlp1 == pvlp) {
		mtx_lock(vlp2);
		to_unlock = vlp2;
	} else {
		if (!mtx_trylock(vlp1)) {
			error = EAGAIN;
			goto out;
		}
		to_unlock = vlp1;
	}
	rw_wlock(blp);
	cache_zap_locked(ncp, false);
	rw_wunlock(blp);
	mtx_unlock(to_unlock);
out:
	mtx_unlock(pvlp);
	return (error);
}

/*
 * If trylocking failed we can get here.  We know enough to take all needed
 * locks in the right order and re-lookup the entry.
 */
static int
cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
    struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
    struct rwlock *blp)
{
	struct namecache *rncp;

	cache_assert_bucket_locked(ncp, RA_UNLOCKED);

	cache_sort_vnodes(&dvlp, &vlp);
	cache_lock_vnodes(dvlp, vlp);
	rw_wlock(blp);
	LIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
		if (rncp == ncp && rncp->nc_dvp == dvp &&
		    rncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
			break;
	}
	if (rncp != NULL) {
		cache_zap_locked(rncp, false);
		rw_wunlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		counter_u64_add(zap_and_exit_bucket_relock_success, 1);
		return (0);
	}

	rw_wunlock(blp);
	cache_unlock_vnodes(dvlp, vlp);
	return (EAGAIN);
}

static int __noinline
cache_zap_wlocked_bucket(struct namecache *ncp, struct componentname *cnp,
    uint32_t hash, struct rwlock *blp)
{
	struct mtx *dvlp, *vlp;
	struct vnode *dvp;

	cache_assert_bucket_locked(ncp, RA_WLOCKED);

	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	vlp = NULL;
	if (!(ncp->nc_flag & NCF_NEGATIVE))
		vlp = VP2VNODELOCK(ncp->nc_vp);
	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
		cache_zap_locked(ncp, false);
		rw_wunlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		return (0);
	}

	dvp = ncp->nc_dvp;
	rw_wunlock(blp);
	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
}

static int __noinline
cache_zap_rlocked_bucket(struct namecache *ncp, struct componentname *cnp,
    uint32_t hash, struct rwlock *blp)
{
	struct mtx *dvlp, *vlp;
	struct vnode *dvp;

	cache_assert_bucket_locked(ncp, RA_RLOCKED);

	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	vlp = NULL;
	if (!(ncp->nc_flag & NCF_NEGATIVE))
		vlp = VP2VNODELOCK(ncp->nc_vp);
	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
		rw_runlock(blp);
		rw_wlock(blp);
		cache_zap_locked(ncp, false);
		rw_wunlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		return (0);
	}

	dvp = ncp->nc_dvp;
	rw_runlock(blp);
	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
}

static int
cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp,
    struct mtx **vlpp1, struct mtx **vlpp2)
{
	struct mtx *dvlp, *vlp;

	cache_assert_bucket_locked(ncp, RA_WLOCKED);

	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	vlp = NULL;
	if (!(ncp->nc_flag & NCF_NEGATIVE))
		vlp = VP2VNODELOCK(ncp->nc_vp);
	cache_sort_vnodes(&dvlp, &vlp);

	if (*vlpp1 == dvlp && *vlpp2 == vlp) {
		cache_zap_locked(ncp, false);
		cache_unlock_vnodes(dvlp, vlp);
		*vlpp1 = NULL;
		*vlpp2 = NULL;
		return (0);
	}

	if (*vlpp1 != NULL)
		mtx_unlock(*vlpp1);
	if (*vlpp2 != NULL)
		mtx_unlock(*vlpp2);
	*vlpp1 = NULL;
	*vlpp2 = NULL;

	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
		cache_zap_locked(ncp, false);
		cache_unlock_vnodes(dvlp, vlp);
		return (0);
	}

	rw_wunlock(blp);
	*vlpp1 = dvlp;
	*vlpp2 = vlp;
	if (*vlpp1 != NULL)
		mtx_lock(*vlpp1);
	mtx_lock(*vlpp2);
	rw_wlock(blp);
	return (EAGAIN);
}

static void
cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp)
{

	if (blp != NULL) {
		rw_runlock(blp);
	} else {
		mtx_unlock(vlp);
	}
}

static int __noinline
cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	int ltype;

	*vpp = dvp;
	CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .",
	    dvp, cnp->cn_nameptr);
	counter_u64_add(dothits, 1);
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
	if (tsp != NULL)
		timespecclear(tsp);
	if (ticksp != NULL)
		*ticksp = ticks;
	vrefact(*vpp);
	/*
	 * When we lookup "." we still can be asked to lock it
	 * differently...
	 */
	ltype = cnp->cn_lkflags & LK_TYPE_MASK;
	if (ltype != VOP_ISLOCKED(*vpp)) {
		if (ltype == LK_EXCLUSIVE) {
			vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
			if ((*vpp)->v_iflag & VI_DOOMED) {
				/* forced unmount */
				vrele(*vpp);
				*vpp = NULL;
				return (ENOENT);
			}
		} else
			vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
	}
	return (-1);
}

static __noinline int
cache_lookup_nomakeentry(struct vnode *dvp, struct vnode **vpp,
    struct componentname *cnp, struct timespec *tsp, int *ticksp)
{
	struct namecache *ncp;
	struct rwlock *blp;
	struct mtx *dvlp, *dvlp2;
	uint32_t hash;
	int error;

	if (cnp->cn_namelen == 2 &&
	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
		counter_u64_add(dotdothits, 1);
		dvlp = VP2VNODELOCK(dvp);
		dvlp2 = NULL;
		mtx_lock(dvlp);
retry_dotdot:
		ncp = dvp->v_cache_dd;
		if (ncp == NULL) {
			SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
			    "..", NULL);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			return (0);
		}
		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
			if (ncp->nc_dvp != dvp)
				panic("dvp %p v_cache_dd %p\n", dvp, ncp);
			if (!cache_zap_locked_vnode_kl2(ncp,
			    dvp, &dvlp2))
				goto retry_dotdot;
			MPASS(dvp->v_cache_dd == NULL);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			cache_free(ncp);
		} else {
			dvp->v_cache_dd = NULL;
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
		}
		return (0);
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	blp = HASH2BUCKETLOCK(hash);
retry:
	if (LIST_EMPTY(NCHHASH(hash)))
		goto out_no_entry;

	rw_wlock(blp);

	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		counter_u64_add(numchecks, 1);
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	/* We failed to find an entry */
	if (ncp == NULL) {
		rw_wunlock(blp);
		goto out_no_entry;
	}

	error = cache_zap_wlocked_bucket(ncp, cnp, hash, blp);
	if (__predict_false(error != 0)) {
		zap_and_exit_bucket_fail++;
		cache_maybe_yield();
		goto retry;
	}
	counter_u64_add(numposzaps, 1);
	cache_free(ncp);
	return (0);
out_no_entry:
	SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, NULL);
	counter_u64_add(nummisszap, 1);
	return (0);
}
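/*
 * The namei lookup path below is split into fast paths: "." is handled by
 * cache_lookup_dot() and lookups with MAKEENTRY cleared by
 * cache_lookup_nomakeentry() above; everything else falls through to the
 * hashed lookup in cache_lookup().  A caller such as vfs_cache_lookup()
 * interprets the result roughly as follows (illustrative sketch only):
 *
 *	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
 *	error == 0	miss: ask the filesystem
 *	error == -1	positive hit: *vpp is locked and ref'd
 *	otherwise	ENOENT: negative hit (or dvp was doomed)
 *
 * See the documentation block below for the full contract.
 */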
/**
 * Lookup a name in the name cache
 *
 * # Arguments
 *
 * - dvp:	Parent directory in which to search.
 * - vpp:	Return argument.  Will contain desired vnode on cache hit.
 * - cnp:	Parameters of the name search.  The most interesting bits of
 *		the cn_flags field have the following meanings:
 *	- MAKEENTRY:	If clear, free an entry from the cache rather than
 *			look it up.
 *	- ISDOTDOT:	Must be set if and only if cn_nameptr == ".."
 * - tsp:	Return storage for cache timestamp.  On a successful (positive
 *		or negative) lookup, tsp will be filled with any timespec that
 *		was stored when this cache entry was created.  However, it
 *		will be clear for "." entries.
 * - ticks:	Return storage for alternate cache timestamp.  On a successful
 *		(positive or negative) lookup, it will contain the ticks value
 *		that was current when the cache entry was created, unless cnp
 *		was ".".
 *
 * # Returns
 *
 * - -1:	A positive cache hit.  vpp will contain the desired vnode.
 * - ENOENT:	A negative cache hit, or dvp was recycled out from under us
 *		due to a forced unmount.  vpp will not be modified.  If the
 *		entry is a whiteout, then the ISWHITEOUT flag will be set in
 *		cnp->cn_flags.
 * - 0:		A cache miss.  vpp will not be modified.
 *
 * # Locking
 *
 * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
 * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
 * lock is not recursively acquired.
 */
int
cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct namecache_ts *ncp_ts;
	struct namecache *ncp;
	struct rwlock *blp;
	struct mtx *dvlp;
	uint32_t hash;
	enum vgetstate vs;
	int error, ltype;

	if (__predict_false(!doingcache)) {
		cnp->cn_flags &= ~MAKEENTRY;
		return (0);
	}

	counter_u64_add(numcalls, 1);

	if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.'))
		return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));

	if ((cnp->cn_flags & MAKEENTRY) == 0)
		return (cache_lookup_nomakeentry(dvp, vpp, cnp, tsp, ticksp));

retry:
	blp = NULL;
	dvlp = NULL;
	error = 0;
	if (cnp->cn_namelen == 2 &&
	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
		counter_u64_add(dotdothits, 1);
		dvlp = VP2VNODELOCK(dvp);
		mtx_lock(dvlp);
		ncp = dvp->v_cache_dd;
		if (ncp == NULL) {
			SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
			    "..", NULL);
			mtx_unlock(dvlp);
			return (0);
		}
		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
			if (ncp->nc_flag & NCF_NEGATIVE)
				*vpp = NULL;
			else
				*vpp = ncp->nc_vp;
		} else
			*vpp = ncp->nc_dvp;
		/* Return failure if negative entry was found. */
		if (*vpp == NULL)
			goto negative_success;
		CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..",
		    dvp, cnp->cn_nameptr, *vpp);
		SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..",
		    *vpp);
		cache_out_ts(ncp, tsp, ticksp);
		if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
		    NCF_DTS && tsp != NULL) {
			ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
			*tsp = ncp_ts->nc_dotdottime;
		}
		goto success;
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	blp = HASH2BUCKETLOCK(hash);
	rw_rlock(blp);

	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		counter_u64_add(numchecks, 1);
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	/* We failed to find an entry */
	if (ncp == NULL) {
		rw_runlock(blp);
		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
		    NULL);
		counter_u64_add(nummiss, 1);
		return (0);
	}

	/* We found a "positive" match, return the vnode */
	if (!(ncp->nc_flag & NCF_NEGATIVE)) {
		counter_u64_add(numposhits, 1);
		*vpp = ncp->nc_vp;
		CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
		    dvp, cnp->cn_nameptr, *vpp, ncp);
		SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name,
		    *vpp);
		cache_out_ts(ncp, tsp, ticksp);
		goto success;
	}

negative_success:
	/* We found a negative match, and want to create it, so purge */
	if (cnp->cn_nameiop == CREATE) {
		counter_u64_add(numnegzaps, 1);
		goto zap_and_exit;
	}

	counter_u64_add(numneghits, 1);
	cache_negative_hit(ncp);
	if (ncp->nc_flag & NCF_WHITE)
		cnp->cn_flags |= ISWHITEOUT;
	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
	    ncp->nc_name);
	cache_out_ts(ncp, tsp, ticksp);
	cache_lookup_unlock(blp, dvlp);
	return (ENOENT);

success:
	/*
	 * On success we return a locked and ref'd vnode as per the lookup
	 * protocol.
	 */
	MPASS(dvp != *vpp);
	ltype = 0;	/* silence gcc warning */
	if (cnp->cn_flags & ISDOTDOT) {
		ltype = VOP_ISLOCKED(dvp);
		VOP_UNLOCK(dvp, 0);
	}
	vs = vget_prep(*vpp);
	cache_lookup_unlock(blp, dvlp);
	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
	if (cnp->cn_flags & ISDOTDOT) {
		vn_lock(dvp, ltype | LK_RETRY);
		if (dvp->v_iflag & VI_DOOMED) {
			if (error == 0)
				vput(*vpp);
			*vpp = NULL;
			return (ENOENT);
		}
	}
	if (error) {
		*vpp = NULL;
		goto retry;
	}
	if ((cnp->cn_flags & ISLASTCN) &&
	    (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) {
		ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
	}
	return (-1);

zap_and_exit:
	if (blp != NULL)
		error = cache_zap_rlocked_bucket(ncp, cnp, hash, blp);
	else
		error = cache_zap_locked_vnode(ncp, dvp);
	if (__predict_false(error != 0)) {
		zap_and_exit_bucket_fail2++;
		cache_maybe_yield();
		goto retry;
	}
	cache_free(ncp);
	return (0);
}

struct celockstate {
	struct mtx *vlp[3];
	struct rwlock *blp[2];
};
CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));

static inline void
cache_celockstate_init(struct celockstate *cel)
{

	bzero(cel, sizeof(*cel));
}

static void
cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
    struct vnode *dvp)
{
	struct mtx *vlp1, *vlp2;

	MPASS(cel->vlp[0] == NULL);
	MPASS(cel->vlp[1] == NULL);
	MPASS(cel->vlp[2] == NULL);

	MPASS(vp != NULL || dvp != NULL);

	vlp1 = VP2VNODELOCK(vp);
	vlp2 = VP2VNODELOCK(dvp);
	cache_sort_vnodes(&vlp1, &vlp2);

	if (vlp1 != NULL) {
		mtx_lock(vlp1);
		cel->vlp[0] = vlp1;
	}
	mtx_lock(vlp2);
	cel->vlp[1] = vlp2;
}

static void
cache_unlock_vnodes_cel(struct celockstate *cel)
{

	MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);

	if (cel->vlp[0] != NULL)
		mtx_unlock(cel->vlp[0]);
	if (cel->vlp[1] != NULL)
		mtx_unlock(cel->vlp[1]);
	if (cel->vlp[2] != NULL)
		mtx_unlock(cel->vlp[2]);
}

static bool
cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
{
	struct mtx *vlp;
	bool ret;

	cache_assert_vlp_locked(cel->vlp[0]);
	cache_assert_vlp_locked(cel->vlp[1]);
	MPASS(cel->vlp[2] == NULL);

	MPASS(vp != NULL);
	vlp = VP2VNODELOCK(vp);

	ret = true;
	if (vlp >= cel->vlp[1]) {
		mtx_lock(vlp);
	} else {
		if (mtx_trylock(vlp))
			goto out;
		cache_lock_vnodes_cel_3_failures++;
		cache_unlock_vnodes_cel(cel);
		if (vlp < cel->vlp[0]) {
			mtx_lock(vlp);
			mtx_lock(cel->vlp[0]);
			mtx_lock(cel->vlp[1]);
		} else {
			if (cel->vlp[0] != NULL)
				mtx_lock(cel->vlp[0]);
			mtx_lock(vlp);
			mtx_lock(cel->vlp[1]);
		}
		ret = false;
	}
out:
	cel->vlp[2] = vlp;
	return (ret);
}

static void
cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1,
    struct rwlock *blp2)
{

	MPASS(cel->blp[0] == NULL);
	MPASS(cel->blp[1] == NULL);

	cache_sort_vnodes(&blp1, &blp2);

	if (blp1 != NULL) {
		rw_wlock(blp1);
		cel->blp[0] = blp1;
	}
	rw_wlock(blp2);
	cel->blp[1] = blp2;
}

static void
cache_unlock_buckets_cel(struct celockstate *cel)
{

	if (cel->blp[0] != NULL)
		rw_wunlock(cel->blp[0]);
	rw_wunlock(cel->blp[1]);
}

/*
 * Lock part of the cache affected by the insertion.
 *
 * This means vnodelocks for dvp, vp and the relevant bucketlock.
 * However, insertion can result in removal of an old entry.  In this
 * case we have an additional vnode and bucketlock pair to lock.  If the
 * entry is negative, ncelock is locked instead of the vnode.
 *
 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks,
 * while preserving the locking order (smaller address first).
 */
static void
cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
    uint32_t hash)
{
	struct namecache *ncp;
	struct rwlock *blps[2];

	blps[0] = HASH2BUCKETLOCK(hash);
	for (;;) {
		blps[1] = NULL;
		cache_lock_vnodes_cel(cel, dvp, vp);
		if (vp == NULL || vp->v_type != VDIR)
			break;
		ncp = vp->v_cache_dd;
		if (ncp == NULL)
			break;
		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
			break;
		MPASS(ncp->nc_dvp == vp);
		blps[1] = NCP2BUCKETLOCK(ncp);
		if (ncp->nc_flag & NCF_NEGATIVE)
			break;
		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
			break;
		/*
		 * All vnodes got re-locked.  Re-validate the state and if
		 * nothing changed we are done.  Otherwise restart.
		 */
		if (ncp == vp->v_cache_dd &&
		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
		    blps[1] == NCP2BUCKETLOCK(ncp) &&
		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
			break;
		cache_unlock_vnodes_cel(cel);
		cel->vlp[0] = NULL;
		cel->vlp[1] = NULL;
		cel->vlp[2] = NULL;
	}
	cache_lock_buckets_cel(cel, blps[0], blps[1]);
}

static void
cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
    uint32_t hash)
{
	struct namecache *ncp;
	struct rwlock *blps[2];

	blps[0] = HASH2BUCKETLOCK(hash);
	for (;;) {
		blps[1] = NULL;
		cache_lock_vnodes_cel(cel, dvp, vp);
		ncp = dvp->v_cache_dd;
		if (ncp == NULL)
			break;
		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
			break;
		MPASS(ncp->nc_dvp == dvp);
		blps[1] = NCP2BUCKETLOCK(ncp);
		if (ncp->nc_flag & NCF_NEGATIVE)
			break;
		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
			break;
		if (ncp == dvp->v_cache_dd &&
		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
		    blps[1] == NCP2BUCKETLOCK(ncp) &&
		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
			break;
		cache_unlock_vnodes_cel(cel);
		cel->vlp[0] = NULL;
		cel->vlp[1] = NULL;
		cel->vlp[2] = NULL;
	}
	cache_lock_buckets_cel(cel, blps[0], blps[1]);
}

static void
cache_enter_unlock(struct celockstate *cel)
{

	cache_unlock_buckets_cel(cel);
	cache_unlock_vnodes_cel(cel);
}

/*
 * Add an entry to the cache.
 */
void
cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
    struct timespec *tsp, struct timespec *dtsp)
{
	struct celockstate cel;
	struct namecache *ncp, *n2, *ndd;
	struct namecache_ts *ncp_ts, *n2_ts;
	struct nchashhead *ncpp;
	struct neglist *neglist;
	uint32_t hash;
	int flag;
	int len;
	bool neg_locked;
	u_long lnumcache;

	CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr);
	VNASSERT(vp == NULL || (vp->v_iflag & VI_DOOMED) == 0, vp,
	    ("cache_enter: Adding a doomed vnode"));
	VNASSERT(dvp == NULL || (dvp->v_iflag & VI_DOOMED) == 0, dvp,
	    ("cache_enter: Doomed vnode used as src"));

	if (__predict_false(!doingcache))
		return;

	/*
	 * Avoid blowout in namecache entries.
	 */
	lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
	if (__predict_false(lnumcache >= desiredvnodes * ncsizefactor)) {
		atomic_add_long(&numcache, -1);
		return;
	}

	cache_celockstate_init(&cel);
	ndd = NULL;
	ncp_ts = NULL;
	flag = 0;
	if (cnp->cn_nameptr[0] == '.') {
		if (cnp->cn_namelen == 1)
			return;
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
			len = cnp->cn_namelen;
			hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
			cache_enter_lock_dd(&cel, dvp, vp, hash);
			/*
			 * If dotdot entry already exists, just retarget it
			 * to new parent vnode, otherwise continue with new
			 * namecache entry allocation.
			 */
			if ((ncp = dvp->v_cache_dd) != NULL &&
			    ncp->nc_flag & NCF_ISDOTDOT) {
				KASSERT(ncp->nc_dvp == dvp,
				    ("wrong isdotdot parent"));
				neg_locked = false;
				if (ncp->nc_flag & NCF_NEGATIVE || vp == NULL) {
					neglist = NCP2NEGLIST(ncp);
					mtx_lock(&ncneg_hot.nl_lock);
					mtx_lock(&neglist->nl_lock);
					neg_locked = true;
				}
				if (!(ncp->nc_flag & NCF_NEGATIVE)) {
					TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst,
					    ncp, nc_dst);
				} else {
					cache_negative_remove(ncp, true);
				}
				if (vp != NULL) {
					TAILQ_INSERT_HEAD(&vp->v_cache_dst,
					    ncp, nc_dst);
					ncp->nc_flag &= ~(NCF_NEGATIVE|NCF_HOTNEGATIVE);
				} else {
					ncp->nc_flag &= ~(NCF_HOTNEGATIVE);
					ncp->nc_flag |= NCF_NEGATIVE;
					cache_negative_insert(ncp, true);
				}
				if (neg_locked) {
					mtx_unlock(&neglist->nl_lock);
					mtx_unlock(&ncneg_hot.nl_lock);
				}
				ncp->nc_vp = vp;
				cache_enter_unlock(&cel);
				return;
			}
			dvp->v_cache_dd = NULL;
			cache_enter_unlock(&cel);
			cache_celockstate_init(&cel);
			SDT_PROBE3(vfs, namecache, enter, done, dvp, "..", vp);
			flag = NCF_ISDOTDOT;
		}
	}

	/*
	 * Calculate the hash key and setup as much of the new
	 * namecache entry as possible before acquiring the lock.
	 */
	ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
	ncp->nc_flag = flag;
	ncp->nc_vp = vp;
	if (vp == NULL)
		ncp->nc_flag |= NCF_NEGATIVE;
	ncp->nc_dvp = dvp;
	if (tsp != NULL) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		ncp_ts->nc_time = *tsp;
		ncp_ts->nc_ticks = ticks;
		ncp_ts->nc_nc.nc_flag |= NCF_TS;
		if (dtsp != NULL) {
			ncp_ts->nc_dotdottime = *dtsp;
			ncp_ts->nc_nc.nc_flag |= NCF_DTS;
		}
	}
	len = ncp->nc_nlen = cnp->cn_namelen;
	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
	strlcpy(ncp->nc_name, cnp->cn_nameptr, len + 1);
	cache_enter_lock(&cel, dvp, vp, hash);

	/*
	 * See if this vnode or negative entry is already in the cache
	 * with this name.  This can happen with concurrent lookups of
	 * the same path name.
	 */
	ncpp = NCHHASH(hash);
	LIST_FOREACH(n2, ncpp, nc_hash) {
		if (n2->nc_dvp == dvp &&
		    n2->nc_nlen == cnp->cn_namelen &&
		    !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
			if (tsp != NULL) {
				KASSERT((n2->nc_flag & NCF_TS) != 0,
				    ("no NCF_TS"));
				n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
				n2_ts->nc_time = ncp_ts->nc_time;
				n2_ts->nc_ticks = ncp_ts->nc_ticks;
				if (dtsp != NULL) {
					n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
					if (ncp->nc_flag & NCF_NEGATIVE)
						mtx_lock(&ncneg_hot.nl_lock);
					n2_ts->nc_nc.nc_flag |= NCF_DTS;
					if (ncp->nc_flag & NCF_NEGATIVE)
						mtx_unlock(&ncneg_hot.nl_lock);
				}
			}
			goto out_unlock_free;
		}
	}

	if (flag == NCF_ISDOTDOT) {
		/*
		 * See if we are trying to add .. entry, but some other lookup
		 * has populated v_cache_dd pointer already.
		 */
		if (dvp->v_cache_dd != NULL)
			goto out_unlock_free;
		KASSERT(vp == NULL || vp->v_type == VDIR,
		    ("wrong vnode type %p", vp));
		dvp->v_cache_dd = ncp;
	}

	if (vp != NULL) {
		if (vp->v_type == VDIR) {
			if (flag != NCF_ISDOTDOT) {
				/*
				 * For this case, the cache entry maps both the
				 * directory name in it and the name ".." for
				 * the directory's parent.
				 */
				if ((ndd = vp->v_cache_dd) != NULL) {
					if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
						cache_zap_locked(ndd, false);
					else
						ndd = NULL;
				}
				vp->v_cache_dd = ncp;
			}
		} else {
			vp->v_cache_dd = NULL;
		}
	}

	if (flag != NCF_ISDOTDOT) {
		if (LIST_EMPTY(&dvp->v_cache_src)) {
			vhold(dvp);
			atomic_add_rel_long(&numcachehv, 1);
		}
		LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
	}

	/*
	 * Insert the new namecache entry into the appropriate chain
	 * within the cache entries table.
	 */
	LIST_INSERT_HEAD(ncpp, ncp, nc_hash);

	/*
	 * If the entry is "negative", we place it into the
	 * "negative" cache queue, otherwise, we place it into the
	 * destination vnode's cache entries queue.
	 */
	if (vp != NULL) {
		TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
		SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
		    vp);
	} else {
		if (cnp->cn_flags & ISWHITEOUT)
			ncp->nc_flag |= NCF_WHITE;
		cache_negative_insert(ncp, false);
		SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
		    ncp->nc_name);
	}
	cache_enter_unlock(&cel);
	if (numneg * ncnegfactor > lnumcache)
		cache_negative_zap_one();
	cache_free(ndd);
	return;
out_unlock_free:
	cache_enter_unlock(&cel);
	cache_free(ncp);
	return;
}

static u_int
cache_roundup_2(u_int val)
{
	u_int res;

	for (res = 1; res <= val; res <<= 1)
		continue;

	return (res);
}

/*
 * Name cache initialization, from vfs_init() when we are booting
 */
static void
nchinit(void *dummy __unused)
{
	u_int i;

	cache_zone_small = uma_zcreate("S VFS Cache",
	    sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1,
	    NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache),
	    UMA_ZONE_ZINIT);
	cache_zone_small_ts = uma_zcreate("STS VFS Cache",
	    sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1,
	    NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts),
	    UMA_ZONE_ZINIT);
	cache_zone_large = uma_zcreate("L VFS Cache",
	    sizeof(struct namecache) + NAME_MAX + 1,
	    NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache),
	    UMA_ZONE_ZINIT);
	cache_zone_large_ts = uma_zcreate("LTS VFS Cache",
	    sizeof(struct namecache_ts) + NAME_MAX + 1,
	    NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts),
	    UMA_ZONE_ZINIT);

	nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
	ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
	if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
		ncbuckethash = 7;
	if (ncbuckethash > nchash)
		ncbuckethash = nchash;
	bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
	    M_WAITOK | M_ZERO);
	for (i = 0; i < numbucketlocks; i++)
		rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE);
	ncvnodehash = ncbuckethash;
	vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
	    M_WAITOK | M_ZERO);
	for (i = 0; i < numvnodelocks; i++)
		mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
	ncpurgeminvnodes = numbucketlocks * 2;

	ncneghash = 3;
	neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE,
	    M_WAITOK | M_ZERO);
	for (i = 0; i < numneglists; i++) {
		mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
		TAILQ_INIT(&neglists[i].nl_list);
	}
	mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF);
	TAILQ_INIT(&ncneg_hot.nl_list);

	mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF);

	numcalls = counter_u64_alloc(M_WAITOK);
	dothits = counter_u64_alloc(M_WAITOK);
	dotdothits = counter_u64_alloc(M_WAITOK);
	numchecks = counter_u64_alloc(M_WAITOK);
	nummiss = counter_u64_alloc(M_WAITOK);
	nummisszap = counter_u64_alloc(M_WAITOK);
	numposzaps = counter_u64_alloc(M_WAITOK);
	numposhits = counter_u64_alloc(M_WAITOK);
	numnegzaps = counter_u64_alloc(M_WAITOK);
	numneghits = counter_u64_alloc(M_WAITOK);
	numfullpathcalls = counter_u64_alloc(M_WAITOK);
	numfullpathfail1 = counter_u64_alloc(M_WAITOK);
	numfullpathfail2 = counter_u64_alloc(M_WAITOK);
	numfullpathfail4 = counter_u64_alloc(M_WAITOK);
	numfullpathfound = counter_u64_alloc(M_WAITOK);
	zap_and_exit_bucket_relock_success = counter_u64_alloc(M_WAITOK);
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);

void
cache_changesize(int newmaxvnodes)
{
	struct nchashhead *new_nchashtbl, *old_nchashtbl;
	u_long new_nchash, old_nchash;
	struct namecache *ncp;
	uint32_t hash;
	int i;

	newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
	if (newmaxvnodes < numbucketlocks)
		newmaxvnodes = numbucketlocks;

	new_nchashtbl = hashinit(newmaxvnodes, M_VFSCACHE, &new_nchash);
	/* If same hash table size, nothing to do */
	if (nchash == new_nchash) {
		free(new_nchashtbl, M_VFSCACHE);
		return;
	}
	/*
	 * Move everything from the old hash table to the new table.
	 * None of the namecache entries in the table can be removed
	 * because to do so, they have to be removed from the hash table.
	 */
	cache_lock_all_vnodes();
	cache_lock_all_buckets();
	old_nchashtbl = nchashtbl;
	old_nchash = nchash;
	nchashtbl = new_nchashtbl;
	nchash = new_nchash;
	for (i = 0; i <= old_nchash; i++) {
		while ((ncp = LIST_FIRST(&old_nchashtbl[i])) != NULL) {
			hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
			    ncp->nc_dvp);
			LIST_REMOVE(ncp, nc_hash);
			LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
		}
	}
	cache_unlock_all_buckets();
	cache_unlock_all_vnodes();
	free(old_nchashtbl, M_VFSCACHE);
}

/*
 * Invalidate all entries from and to a particular vnode.
 */
void
cache_purge(struct vnode *vp)
{
	TAILQ_HEAD(, namecache) ncps;
	struct namecache *ncp, *nnp;
	struct mtx *vlp, *vlp2;

	CTR1(KTR_VFS, "cache_purge(%p)", vp);
	SDT_PROBE1(vfs, namecache, purge, done, vp);
	if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
	    vp->v_cache_dd == NULL)
		return;
	TAILQ_INIT(&ncps);
	vlp = VP2VNODELOCK(vp);
	vlp2 = NULL;
	mtx_lock(vlp);
retry:
	while (!LIST_EMPTY(&vp->v_cache_src)) {
		ncp = LIST_FIRST(&vp->v_cache_src);
		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
			goto retry;
		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
	}
	while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
		ncp = TAILQ_FIRST(&vp->v_cache_dst);
		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
			goto retry;
		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
	}
	ncp = vp->v_cache_dd;
	if (ncp != NULL) {
		KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
		    ("lost dotdot link"));
		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
			goto retry;
		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
	}
	KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
	mtx_unlock(vlp);
	if (vlp2 != NULL)
		mtx_unlock(vlp2);
	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
		cache_free(ncp);
	}
}
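
/*
 * Example (sketch, hypothetical filesystem code): keeping the cache coherent
 * is largely the filesystem's job.  The VFS purges a vnode's entries when the
 * vnode is recycled, and a filesystem would typically also purge on
 * operations that change a name/vnode association, e.g. in a rename
 * implementation ("fvp" and "tvp" are placeholder names for the source vnode
 * and the vnode being replaced at the target, if any):
 *
 *	cache_purge(fvp);
 *	if (tvp != NULL)
 *		cache_purge(tvp);
 */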

/*
 * Invalidate all negative entries for a particular directory vnode.
 */
void
cache_purge_negative(struct vnode *vp)
{
	TAILQ_HEAD(, namecache) ncps;
	struct namecache *ncp, *nnp;
	struct mtx *vlp;

	CTR1(KTR_VFS, "cache_purge_negative(%p)", vp);
	SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
	if (LIST_EMPTY(&vp->v_cache_src))
		return;
	TAILQ_INIT(&ncps);
	vlp = VP2VNODELOCK(vp);
	mtx_lock(vlp);
	LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
		if (!(ncp->nc_flag & NCF_NEGATIVE))
			continue;
		cache_zap_negative_locked_vnode_kl(ncp, vp);
		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
	}
	mtx_unlock(vlp);
	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
		cache_free(ncp);
	}
}

/*
 * Flush all entries referencing a particular filesystem.
 */
void
cache_purgevfs(struct mount *mp, bool force)
{
	TAILQ_HEAD(, namecache) ncps;
	struct mtx *vlp1, *vlp2;
	struct rwlock *blp;
	struct nchashhead *bucket;
	struct namecache *ncp, *nnp;
	u_long i, j, n_nchash;
	int error;

	/* Scan hash tables for applicable entries */
	SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
	if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes)
		return;
	TAILQ_INIT(&ncps);
	n_nchash = nchash + 1;
	vlp1 = vlp2 = NULL;
	for (i = 0; i < numbucketlocks; i++) {
		blp = (struct rwlock *)&bucketlocks[i];
		rw_wlock(blp);
		for (j = i; j < n_nchash; j += numbucketlocks) {
retry:
			bucket = &nchashtbl[j];
			LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) {
				cache_assert_bucket_locked(ncp, RA_WLOCKED);
				if (ncp->nc_dvp->v_mount != mp)
					continue;
				error = cache_zap_wlocked_bucket_kl(ncp, blp,
				    &vlp1, &vlp2);
				if (error != 0)
					goto retry;
				TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst);
			}
		}
		rw_wunlock(blp);
		if (vlp1 == NULL && vlp2 == NULL)
			cache_maybe_yield();
	}
	if (vlp1 != NULL)
		mtx_unlock(vlp1);
	if (vlp2 != NULL)
		mtx_unlock(vlp2);

	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
		cache_free(ncp);
	}
}

/*
 * Perform canonical checks and cache lookup and pass on to the filesystem
 * through vop_cachedlookup only if needed.
 */

int
vfs_cache_lookup(struct vop_lookup_args *ap)
{
	struct vnode *dvp;
	int error;
	struct vnode **vpp = ap->a_vpp;
	struct componentname *cnp = ap->a_cnp;
	struct ucred *cred = cnp->cn_cred;
	int flags = cnp->cn_flags;
	struct thread *td = cnp->cn_thread;

	*vpp = NULL;
	dvp = ap->a_dvp;

	if (dvp->v_type != VDIR)
		return (ENOTDIR);

	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
		return (EROFS);

	error = VOP_ACCESS(dvp, VEXEC, cred, td);
	if (error)
		return (error);

	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
	if (error == 0)
		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
	if (error == -1)
		return (0);
	return (error);
}
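
/*
 * Example (sketch): a filesystem opts into the name cache by installing
 * vfs_cache_lookup() as its lookup routine and providing its real lookup
 * as vop_cachedlookup; "foofs" and foofs_lookup are placeholder names:
 *
 *	struct vop_vector foofs_vnodeops = {
 *		.vop_default		= &default_vnodeops,
 *		.vop_lookup		= vfs_cache_lookup,
 *		.vop_cachedlookup	= foofs_lookup,
 *		...
 *	};
 *
 * The cached lookup routine is then only invoked on a cache miss and is
 * expected to call cache_enter() itself when MAKEENTRY is set in cn_flags.
 */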

/*
 * XXX All of these sysctls would probably be more productive dead.
 */
static int __read_mostly disablecwd;
SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
    "Disable the getcwd syscall");

/* Implementation of the getcwd syscall. */
int
sys___getcwd(struct thread *td, struct __getcwd_args *uap)
{

	return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen,
	    MAXPATHLEN));
}

int
kern___getcwd(struct thread *td, char *buf, enum uio_seg bufseg, size_t buflen,
    size_t path_max)
{
	char *bp, *tmpbuf;
	struct filedesc *fdp;
	struct vnode *cdir, *rdir;
	int error;

	if (__predict_false(disablecwd))
		return (ENODEV);
	if (__predict_false(buflen < 2))
		return (EINVAL);
	if (buflen > path_max)
		buflen = path_max;

	tmpbuf = malloc(buflen, M_TEMP, M_WAITOK);
	fdp = td->td_proc->p_fd;
	FILEDESC_SLOCK(fdp);
	cdir = fdp->fd_cdir;
	vrefact(cdir);
	rdir = fdp->fd_rdir;
	vrefact(rdir);
	FILEDESC_SUNLOCK(fdp);
	error = vn_fullpath1(td, cdir, rdir, tmpbuf, &bp, buflen);
	vrele(rdir);
	vrele(cdir);

	if (!error) {
		if (bufseg == UIO_SYSSPACE)
			bcopy(bp, buf, strlen(bp) + 1);
		else
			error = copyout(bp, buf, strlen(bp) + 1);
#ifdef KTRACE
		if (KTRPOINT(curthread, KTR_NAMEI))
			ktrnamei(bp);
#endif
	}
	free(tmpbuf, M_TEMP);
	return (error);
}

/*
 * Thus begins the fullpath magic.
 */

static int __read_mostly disablefullpath;
SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0,
    "Disable the vn_fullpath function");

/*
 * Retrieve the full filesystem path that corresponds to a vnode from the name
 * cache (if available).
 */
int
vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
{
	char *buf;
	struct filedesc *fdp;
	struct vnode *rdir;
	int error;

	if (__predict_false(disablefullpath))
		return (ENODEV);
	if (__predict_false(vn == NULL))
		return (EINVAL);

	buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
	fdp = td->td_proc->p_fd;
	FILEDESC_SLOCK(fdp);
	rdir = fdp->fd_rdir;
	vrefact(rdir);
	FILEDESC_SUNLOCK(fdp);
	error = vn_fullpath1(td, vn, rdir, buf, retbuf, MAXPATHLEN);
	vrele(rdir);

	if (!error)
		*freebuf = buf;
	else
		free(buf, M_TEMP);
	return (error);
}
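
/*
 * Example (sketch, hypothetical caller): on success *retbuf points into a
 * separately allocated buffer returned via *freebuf, so it is the freebuf
 * pointer that must be released, with the M_TEMP malloc type:
 *
 *	char *fullpath, *freebuf;
 *
 *	error = vn_fullpath(curthread, vp, &fullpath, &freebuf);
 *	if (error == 0) {
 *		printf("vnode %p resolves to %s\n", vp, fullpath);
 *		free(freebuf, M_TEMP);
 *	}
 *
 * Since the path is reconstructed from the name cache (with a VOP_VPTOCNP()
 * fallback), the call can fail even for a perfectly valid vnode.
 */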

/*
 * This function is similar to vn_fullpath, but it attempts to look up the
 * pathname relative to the global root mount point.  This is required for the
 * auditing sub-system, as audited pathnames must be absolute, relative to the
 * global root mount point.
 */
int
vn_fullpath_global(struct thread *td, struct vnode *vn,
    char **retbuf, char **freebuf)
{
	char *buf;
	int error;

	if (__predict_false(disablefullpath))
		return (ENODEV);
	if (__predict_false(vn == NULL))
		return (EINVAL);
	buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
	error = vn_fullpath1(td, vn, rootvnode, buf, retbuf, MAXPATHLEN);
	if (!error)
		*freebuf = buf;
	else
		free(buf, M_TEMP);
	return (error);
}

int
vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, u_int *buflen)
{
	struct vnode *dvp;
	struct namecache *ncp;
	struct mtx *vlp;
	int error;

	vlp = VP2VNODELOCK(*vp);
	mtx_lock(vlp);
	TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) {
		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
			break;
	}
	if (ncp != NULL) {
		if (*buflen < ncp->nc_nlen) {
			mtx_unlock(vlp);
			vrele(*vp);
			counter_u64_add(numfullpathfail4, 1);
			error = ENOMEM;
			SDT_PROBE3(vfs, namecache, fullpath, return, error,
			    vp, NULL);
			return (error);
		}
		*buflen -= ncp->nc_nlen;
		memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
		SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
		    ncp->nc_name, vp);
		dvp = *vp;
		*vp = ncp->nc_dvp;
		vref(*vp);
		mtx_unlock(vlp);
		vrele(dvp);
		return (0);
	}
	SDT_PROBE1(vfs, namecache, fullpath, miss, vp);

	mtx_unlock(vlp);
	vn_lock(*vp, LK_SHARED | LK_RETRY);
	error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen);
	vput(*vp);
	if (error) {
		counter_u64_add(numfullpathfail2, 1);
		SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
		return (error);
	}

	*vp = dvp;
	if (dvp->v_iflag & VI_DOOMED) {
		/* forced unmount */
		vrele(dvp);
		error = ENOENT;
		SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
		return (error);
	}
	/*
	 * *vp has its use count incremented still.
	 */

	return (0);
}
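
/*
 * Example (sketch, hypothetical caller): vn_vptocnp() resolves a single
 * child-to-parent step.  The caller passes a referenced, unlocked vnode and
 * a buffer that is filled from its end:
 *
 *	char buf[MAXPATHLEN];
 *	u_int buflen = sizeof(buf);
 *
 *	vref(vp);
 *	error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen);
 *
 * On success the (unterminated, not slash-prefixed) name of the passed-in
 * vnode has been copied to buf + buflen and *vp now holds a referenced parent
 * vnode which the caller must vrele(); on failure the reference on the
 * passed-in vnode has already been dropped.
 */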

/*
 * The magic behind kern___getcwd() and vn_fullpath().
 */
static int
vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
    char *buf, char **retbuf, u_int buflen)
{
	int error, slash_prefixed;
#ifdef KDTRACE_HOOKS
	struct vnode *startvp = vp;
#endif
	struct vnode *vp1;

	buflen--;
	buf[buflen] = '\0';
	error = 0;
	slash_prefixed = 0;

	SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
	counter_u64_add(numfullpathcalls, 1);
	vref(vp);
	if (vp->v_type != VDIR) {
		error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen);
		if (error)
			return (error);
		if (buflen == 0) {
			vrele(vp);
			return (ENOMEM);
		}
		buf[--buflen] = '/';
		slash_prefixed = 1;
	}
	while (vp != rdir && vp != rootvnode) {
		/*
		 * The vp vnode must be already fully constructed,
		 * since it is either found in namecache or obtained
		 * from VOP_VPTOCNP().  We may test for VV_ROOT safely
		 * without obtaining the vnode lock.
		 */
		if ((vp->v_vflag & VV_ROOT) != 0) {
			vn_lock(vp, LK_RETRY | LK_SHARED);

			/*
			 * With the vnode locked, check for races with
			 * unmount, forced or not.  Note that we
			 * already verified that vp is not equal to
			 * the root vnode, which means that
			 * mnt_vnodecovered can be NULL only for the
			 * case of unmount.
			 */
			if ((vp->v_iflag & VI_DOOMED) != 0 ||
			    (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
			    vp1->v_mountedhere != vp->v_mount) {
				vput(vp);
				error = ENOENT;
				SDT_PROBE3(vfs, namecache, fullpath, return,
				    error, vp, NULL);
				break;
			}

			vref(vp1);
			vput(vp);
			vp = vp1;
			continue;
		}
		if (vp->v_type != VDIR) {
			vrele(vp);
			counter_u64_add(numfullpathfail1, 1);
			error = ENOTDIR;
			SDT_PROBE3(vfs, namecache, fullpath, return,
			    error, vp, NULL);
			break;
		}
		error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen);
		if (error)
			break;
		if (buflen == 0) {
			vrele(vp);
			error = ENOMEM;
			SDT_PROBE3(vfs, namecache, fullpath, return, error,
			    startvp, NULL);
			break;
		}
		buf[--buflen] = '/';
		slash_prefixed = 1;
	}
	if (error)
		return (error);
	if (!slash_prefixed) {
		if (buflen == 0) {
			vrele(vp);
			counter_u64_add(numfullpathfail4, 1);
			SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
			    startvp, NULL);
			return (ENOMEM);
		}
		buf[--buflen] = '/';
	}
	counter_u64_add(numfullpathfound, 1);
	vrele(vp);

	SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, buf + buflen);
	*retbuf = buf + buflen;
	return (0);
}

/*
 * Look up vp's parent directory via the name cache.  The parent is returned
 * referenced and share-locked, or NULL if it is not cached or cannot be
 * locked without sleeping.
 */
struct vnode *
vn_dir_dd_ino(struct vnode *vp)
{
	struct namecache *ncp;
	struct vnode *ddvp;
	struct mtx *vlp;
	enum vgetstate vs;

	ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
	vlp = VP2VNODELOCK(vp);
	mtx_lock(vlp);
	TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
			continue;
		ddvp = ncp->nc_dvp;
		vs = vget_prep(ddvp);
		mtx_unlock(vlp);
		if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
			return (NULL);
		return (ddvp);
	}
	mtx_unlock(vlp);
	return (NULL);
}

/*
 * Copy one of vp's cached names into buf; the result is nul-terminated and
 * truncated to fit.  Returns ENOENT if no name is cached.
 */
int
vn_commname(struct vnode *vp, char *buf, u_int buflen)
{
	struct namecache *ncp;
	struct mtx *vlp;
	int l;

	vlp = VP2VNODELOCK(vp);
	mtx_lock(vlp);
	TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
			break;
	if (ncp == NULL) {
		mtx_unlock(vlp);
		return (ENOENT);
	}
	l = min(ncp->nc_nlen, buflen - 1);
	memcpy(buf, ncp->nc_name, l);
	mtx_unlock(vlp);
	buf[l] = '\0';
	return (0);
}
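
/*
 * Example (sketch, hypothetical caller): fetching a human-readable name for
 * diagnostics; the copy is truncated to the buffer size and nul-terminated:
 *
 *	char name[NAME_MAX + 1];
 *
 *	if (vn_commname(vp, name, sizeof(name)) == 0)
 *		printf("vnode %p is known as \"%s\"\n", vp, name);
 */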

/*
 * This function updates the path string to the vnode's full global path
 * and checks the size of the new path string against the pathlen argument.
 *
 * Requires a locked, referenced vnode.
 * Vnode is re-locked on success or ENODEV, otherwise unlocked.
 *
 * If sysctl debug.disablefullpath is set, ENODEV is returned, the
 * vnode is left locked and the path remains untouched.
 *
 * If vp is a directory, the call to vn_fullpath_global() always succeeds
 * because it falls back to the ".." lookup if the namecache lookup fails.
 */
int
vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
    u_int pathlen)
{
	struct nameidata nd;
	struct vnode *vp1;
	char *rpath, *fbuf;
	int error;

	ASSERT_VOP_ELOCKED(vp, __func__);

	/* Return ENODEV if sysctl debug.disablefullpath==1 */
	if (__predict_false(disablefullpath))
		return (ENODEV);

	/* Construct global filesystem path from vp. */
	VOP_UNLOCK(vp, 0);
	error = vn_fullpath_global(td, vp, &rpath, &fbuf);

	if (error != 0) {
		vrele(vp);
		return (error);
	}

	if (strlen(rpath) >= pathlen) {
		vrele(vp);
		error = ENAMETOOLONG;
		goto out;
	}

	/*
	 * Re-lookup the vnode by path to detect a possible rename.
	 * As a side effect, the vnode is relocked.
	 * If the vnode was renamed, return ENOENT.
	 */
	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
	    UIO_SYSSPACE, path, td);
	error = namei(&nd);
	if (error != 0) {
		vrele(vp);
		goto out;
	}
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vp1 = nd.ni_vp;
	vrele(vp);
	if (vp1 == vp)
		strcpy(path, rpath);
	else {
		vput(vp1);
		error = ENOENT;
	}

out:
	free(fbuf, M_TEMP);
	return (error);
}

#ifdef DDB
static void
db_print_vpath(struct vnode *vp)
{

	while (vp != NULL) {
		db_printf("%p: ", vp);
		if (vp == rootvnode) {
			db_printf("/");
			vp = NULL;
		} else {
			if (vp->v_vflag & VV_ROOT) {
				db_printf("<mount point>");
				vp = vp->v_mount->mnt_vnodecovered;
			} else {
				struct namecache *ncp;
				char *ncn;
				int i;

				ncp = TAILQ_FIRST(&vp->v_cache_dst);
				if (ncp != NULL) {
					ncn = ncp->nc_name;
					for (i = 0; i < ncp->nc_nlen; i++)
						db_printf("%c", *ncn++);
					vp = ncp->nc_dvp;
				} else {
					vp = NULL;
				}
			}
		}
		db_printf("\n");
	}

	return;
}

DB_SHOW_COMMAND(vpath, db_show_vpath)
{
	struct vnode *vp;

	if (!have_addr) {
		db_printf("usage: show vpath <struct vnode *>\n");
		return;
	}

	vp = (struct vnode *)addr;
	db_print_vpath(vp);
}

#endif