/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Poul-Henning Kamp of the FreeBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/counter.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sdt.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vnode.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#ifdef DDB
#include <ddb/ddb.h>
#endif

#include <vm/uma.h>

SDT_PROVIDER_DECLARE(vfs);
SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
    "char *", "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
    "struct vnode *", "char *");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
    "struct vnode *", "char *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
    "char *");

/*
 * This structure describes the elements in the cache of recent
 * names looked up by namei.
 */

struct namecache {
	LIST_ENTRY(namecache) nc_hash;	/* hash chain */
	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
	struct vnode *nc_dvp;		/* vnode of parent of name */
	union {
		struct vnode *nu_vp;	/* vnode the name refers to */
	} n_un;
	u_char	nc_flag;		/* flag bits */
	u_char	nc_nlen;		/* length of name */
	char	nc_name[0];		/* segment name + nul */
};

/*
 * struct namecache_ts repeats struct namecache layout up to the
 * nc_nlen member.
 * struct namecache_ts is used in place of struct namecache when time(s) need
 * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
 * both a non-dotdot directory name plus dotdot for the directory's
 * parent.
 */
struct namecache_ts {
	struct timespec	nc_time;	/* timespec provided by fs */
	struct timespec	nc_dotdottime;	/* dotdot timespec provided by fs */
	int	nc_ticks;		/* ticks value when entry was added */
	struct namecache nc_nc;
};

#define	nc_vp		n_un.nu_vp

/*
 * Flags in namecache.nc_flag
 */
#define	NCF_WHITE	0x01
#define	NCF_ISDOTDOT	0x02
#define	NCF_TS		0x04
#define	NCF_DTS		0x08
#define	NCF_DVDROP	0x10
#define	NCF_NEGATIVE	0x20
#define	NCF_HOTNEGATIVE	0x40

/*
 * Name caching works as follows:
 *
 * Names found by directory scans are retained in a cache
 * for future reference.  It is managed LRU, so frequently
 * used names will hang around.  Cache is indexed by hash value
 * obtained from (dvp, name) where dvp refers to the directory
 * containing name.
 *
 * If it is a "negative" entry, (i.e. for a name that is known NOT to
 * exist) the vnode pointer will be NULL.
 *
 * Upon reaching the last segment of a path, if the reference
 * is for DELETE, or NOCACHE is set (rewrite), and the
 * name is located in the cache, it will be dropped.
 *
 * These locks are used (in the order in which they can be taken):
 * NAME		TYPE	ROLE
 * vnodelock	mtx	vnode lists and v_cache_dd field protection
 * bucketlock	rwlock	for access to given set of hash buckets
 * neglist	mtx	negative entry LRU management
 *
 * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
 * shrinking the LRU list.
 *
 * It is legal to take multiple vnodelock and bucketlock locks.  The locking
 * order is lower address first.  Both are recursive.
 *
 * "." lookups are lockless.
 *
 * ".." and vnode -> name lookups require vnodelock.
 *
 * name -> vnode lookup requires the relevant bucketlock to be held for reading.
 *
 * Insertions and removals of entries require involved vnodes and bucketlocks
 * to be write-locked to prevent other threads from seeing the entry.
 *
 * Some lookups result in removal of the found entry (e.g. getting rid of a
 * negative entry with the intent to create a positive one), which poses a
 * problem when multiple threads reach the state.  Similarly, two different
 * threads can purge two different vnodes and try to remove the same name.
 *
 * If the already held vnode lock is lower than the second required lock, we
 * can just take the other lock.  However, in the opposite case, this could
 * deadlock.  As such, this is resolved by trylocking and if that fails
 * unlocking the first lock, locking everything in order and revalidating
 * the state.
 */

/*
 * Structures associated with name caching.
 */
#define	NCHHASH(hash) \
	(&nchashtbl[(hash) & nchash])
static __read_mostly LIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
static u_long __read_mostly	nchash;			/* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
    "Size of namecache hash table");
static u_long __read_mostly	ncnegfactor = 5;	/* ratio of negative entries */
SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of negative namecache entries");
static u_long __exclusive_cache_line	numneg;	/* number of negative entries allocated */
static u_long __exclusive_cache_line	numcache;/* number of cache entries allocated */
u_int	ncsizefactor = 2;
SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
    "Size factor for namecache");
static u_int __read_mostly	ncpurgeminvnodes;
SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0,
    "Number of vnodes below which purgevfs ignores the request");
static u_int __read_mostly	ncsize;	/* the size as computed on creation or resizing */

struct nchstats	nchstats;		/* cache effectiveness statistics */

static struct mtx __exclusive_cache_line	ncneg_shrink_lock;
static int	shrink_list_turn;

struct neglist {
	struct mtx		nl_lock;
	TAILQ_HEAD(, namecache) nl_list;
} __aligned(CACHE_LINE_SIZE);

static struct neglist __read_mostly	*neglists;
static struct neglist ncneg_hot;
static u_long numhotneg;

#define	numneglists	(ncneghash + 1)
static u_int __read_mostly	ncneghash;
static inline struct neglist *
NCP2NEGLIST(struct namecache *ncp)
{

	return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
}

#define	numbucketlocks (ncbuckethash + 1)
static u_int __read_mostly	ncbuckethash;
static struct rwlock_padalign __read_mostly	*bucketlocks;
#define	HASH2BUCKETLOCK(hash) \
	((struct rwlock *)(&bucketlocks[((hash) & ncbuckethash)]))

#define	numvnodelocks (ncvnodehash + 1)
static u_int __read_mostly	ncvnodehash;
static struct mtx __read_mostly	*vnodelocks;
static inline struct mtx *
VP2VNODELOCK(struct vnode *vp)
{

	return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
}

/*
 * UMA zones for the VFS cache.
 *
 * The small cache is used for entries with short names, which are the
 * most common.  The large cache is used for entries which are too big to
 * fit in the small cache.
262 */ 263 static uma_zone_t __read_mostly cache_zone_small; 264 static uma_zone_t __read_mostly cache_zone_small_ts; 265 static uma_zone_t __read_mostly cache_zone_large; 266 static uma_zone_t __read_mostly cache_zone_large_ts; 267 268 #define CACHE_PATH_CUTOFF 35 269 270 static struct namecache * 271 cache_alloc(int len, int ts) 272 { 273 struct namecache_ts *ncp_ts; 274 struct namecache *ncp; 275 276 if (__predict_false(ts)) { 277 if (len <= CACHE_PATH_CUTOFF) 278 ncp_ts = uma_zalloc(cache_zone_small_ts, M_WAITOK); 279 else 280 ncp_ts = uma_zalloc(cache_zone_large_ts, M_WAITOK); 281 ncp = &ncp_ts->nc_nc; 282 } else { 283 if (len <= CACHE_PATH_CUTOFF) 284 ncp = uma_zalloc(cache_zone_small, M_WAITOK); 285 else 286 ncp = uma_zalloc(cache_zone_large, M_WAITOK); 287 } 288 return (ncp); 289 } 290 291 static void 292 cache_free(struct namecache *ncp) 293 { 294 struct namecache_ts *ncp_ts; 295 296 if (ncp == NULL) 297 return; 298 if ((ncp->nc_flag & NCF_DVDROP) != 0) 299 vdrop(ncp->nc_dvp); 300 if (__predict_false(ncp->nc_flag & NCF_TS)) { 301 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 302 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 303 uma_zfree(cache_zone_small_ts, ncp_ts); 304 else 305 uma_zfree(cache_zone_large_ts, ncp_ts); 306 } else { 307 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 308 uma_zfree(cache_zone_small, ncp); 309 else 310 uma_zfree(cache_zone_large, ncp); 311 } 312 } 313 314 static void 315 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp) 316 { 317 struct namecache_ts *ncp_ts; 318 319 KASSERT((ncp->nc_flag & NCF_TS) != 0 || 320 (tsp == NULL && ticksp == NULL), 321 ("No NCF_TS")); 322 323 if (tsp == NULL && ticksp == NULL) 324 return; 325 326 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 327 if (tsp != NULL) 328 *tsp = ncp_ts->nc_time; 329 if (ticksp != NULL) 330 *ticksp = ncp_ts->nc_ticks; 331 } 332 333 #ifdef DEBUG_CACHE 334 static int __read_mostly doingcache = 1; /* 1 => enable the cache */ 335 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, 336 "VFS namecache enabled"); 337 #endif 338 339 /* Export size information to userland */ 340 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 341 sizeof(struct namecache), "sizeof(struct namecache)"); 342 343 /* 344 * The new name cache statistics 345 */ 346 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 347 "Name cache statistics"); 348 #define STATNODE_ULONG(name, descr) \ 349 SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr); 350 #define STATNODE_COUNTER(name, descr) \ 351 static counter_u64_t __read_mostly name; \ 352 SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, descr); 353 STATNODE_ULONG(numneg, "Number of negative cache entries"); 354 STATNODE_ULONG(numcache, "Number of cache entries"); 355 STATNODE_COUNTER(numcachehv, "Number of namecache entries with vnodes held"); 356 STATNODE_COUNTER(numcalls, "Number of cache lookups"); 357 STATNODE_COUNTER(dothits, "Number of '.' hits"); 358 STATNODE_COUNTER(dotdothits, "Number of '..' 
hits"); 359 STATNODE_COUNTER(numchecks, "Number of checks in lookup"); 360 STATNODE_COUNTER(nummiss, "Number of cache misses"); 361 STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache"); 362 STATNODE_COUNTER(numposzaps, 363 "Number of cache hits (positive) we do not want to cache"); 364 STATNODE_COUNTER(numposhits, "Number of cache hits (positive)"); 365 STATNODE_COUNTER(numnegzaps, 366 "Number of cache hits (negative) we do not want to cache"); 367 STATNODE_COUNTER(numneghits, "Number of cache hits (negative)"); 368 /* These count for vn_getcwd(), too. */ 369 STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls"); 370 STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)"); 371 STATNODE_COUNTER(numfullpathfail2, 372 "Number of fullpath search errors (VOP_VPTOCNP failures)"); 373 STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); 374 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls"); 375 STATNODE_COUNTER(zap_and_exit_bucket_relock_success, 376 "Number of successful removals after relocking"); 377 static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail, 378 "Number of times zap_and_exit failed to lock"); 379 static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2, 380 "Number of times zap_and_exit failed to lock"); 381 static long cache_lock_vnodes_cel_3_failures; 382 STATNODE_ULONG(cache_lock_vnodes_cel_3_failures, 383 "Number of times 3-way vnode locking failed"); 384 STATNODE_ULONG(numhotneg, "Number of hot negative entries"); 385 STATNODE_COUNTER(numneg_evicted, 386 "Number of negative entries evicted when adding a new entry"); 387 STATNODE_COUNTER(shrinking_skipped, 388 "Number of times shrinking was already in progress"); 389 390 static void cache_zap_locked(struct namecache *ncp, bool neg_locked); 391 static int vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf, 392 char **freebuf, size_t *buflen); 393 static int vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir, 394 char *buf, char **retbuf, size_t *buflen); 395 static int vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir, 396 char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend); 397 398 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); 399 400 static int cache_yield; 401 SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0, 402 "Number of times cache called yield"); 403 404 static void __noinline 405 cache_maybe_yield(void) 406 { 407 408 if (should_yield()) { 409 cache_yield++; 410 kern_yield(PRI_USER); 411 } 412 } 413 414 static inline void 415 cache_assert_vlp_locked(struct mtx *vlp) 416 { 417 418 if (vlp != NULL) 419 mtx_assert(vlp, MA_OWNED); 420 } 421 422 static inline void 423 cache_assert_vnode_locked(struct vnode *vp) 424 { 425 struct mtx *vlp; 426 427 vlp = VP2VNODELOCK(vp); 428 cache_assert_vlp_locked(vlp); 429 } 430 431 static uint32_t 432 cache_get_hash(char *name, u_char len, struct vnode *dvp) 433 { 434 uint32_t hash; 435 436 hash = fnv_32_buf(name, len, FNV1_32_INIT); 437 hash = fnv_32_buf(&dvp, sizeof(dvp), hash); 438 return (hash); 439 } 440 441 static inline struct rwlock * 442 NCP2BUCKETLOCK(struct namecache *ncp) 443 { 444 uint32_t hash; 445 446 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); 447 return (HASH2BUCKETLOCK(hash)); 448 } 449 450 #ifdef INVARIANTS 451 static void 452 
cache_assert_bucket_locked(struct namecache *ncp, int mode) 453 { 454 struct rwlock *blp; 455 456 blp = NCP2BUCKETLOCK(ncp); 457 rw_assert(blp, mode); 458 } 459 #else 460 #define cache_assert_bucket_locked(x, y) do { } while (0) 461 #endif 462 463 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y)) 464 static void 465 _cache_sort_vnodes(void **p1, void **p2) 466 { 467 void *tmp; 468 469 MPASS(*p1 != NULL || *p2 != NULL); 470 471 if (*p1 > *p2) { 472 tmp = *p2; 473 *p2 = *p1; 474 *p1 = tmp; 475 } 476 } 477 478 static void 479 cache_lock_all_buckets(void) 480 { 481 u_int i; 482 483 for (i = 0; i < numbucketlocks; i++) 484 rw_wlock(&bucketlocks[i]); 485 } 486 487 static void 488 cache_unlock_all_buckets(void) 489 { 490 u_int i; 491 492 for (i = 0; i < numbucketlocks; i++) 493 rw_wunlock(&bucketlocks[i]); 494 } 495 496 static void 497 cache_lock_all_vnodes(void) 498 { 499 u_int i; 500 501 for (i = 0; i < numvnodelocks; i++) 502 mtx_lock(&vnodelocks[i]); 503 } 504 505 static void 506 cache_unlock_all_vnodes(void) 507 { 508 u_int i; 509 510 for (i = 0; i < numvnodelocks; i++) 511 mtx_unlock(&vnodelocks[i]); 512 } 513 514 static int 515 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 516 { 517 518 cache_sort_vnodes(&vlp1, &vlp2); 519 520 if (vlp1 != NULL) { 521 if (!mtx_trylock(vlp1)) 522 return (EAGAIN); 523 } 524 if (!mtx_trylock(vlp2)) { 525 if (vlp1 != NULL) 526 mtx_unlock(vlp1); 527 return (EAGAIN); 528 } 529 530 return (0); 531 } 532 533 static void 534 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 535 { 536 537 MPASS(vlp1 != NULL || vlp2 != NULL); 538 MPASS(vlp1 <= vlp2); 539 540 if (vlp1 != NULL) 541 mtx_lock(vlp1); 542 if (vlp2 != NULL) 543 mtx_lock(vlp2); 544 } 545 546 static void 547 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 548 { 549 550 MPASS(vlp1 != NULL || vlp2 != NULL); 551 552 if (vlp1 != NULL) 553 mtx_unlock(vlp1); 554 if (vlp2 != NULL) 555 mtx_unlock(vlp2); 556 } 557 558 static int 559 sysctl_nchstats(SYSCTL_HANDLER_ARGS) 560 { 561 struct nchstats snap; 562 563 if (req->oldptr == NULL) 564 return (SYSCTL_OUT(req, 0, sizeof(snap))); 565 566 snap = nchstats; 567 snap.ncs_goodhits = counter_u64_fetch(numposhits); 568 snap.ncs_neghits = counter_u64_fetch(numneghits); 569 snap.ncs_badhits = counter_u64_fetch(numposzaps) + 570 counter_u64_fetch(numnegzaps); 571 snap.ncs_miss = counter_u64_fetch(nummisszap) + 572 counter_u64_fetch(nummiss); 573 574 return (SYSCTL_OUT(req, &snap, sizeof(snap))); 575 } 576 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD | 577 CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU", 578 "VFS cache effectiveness statistics"); 579 580 #ifdef DIAGNOSTIC 581 /* 582 * Grab an atomic snapshot of the name cache hash chain lengths 583 */ 584 static SYSCTL_NODE(_debug, OID_AUTO, hashstat, 585 CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 586 "hash table stats"); 587 588 static int 589 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS) 590 { 591 struct nchashhead *ncpp; 592 struct namecache *ncp; 593 int i, error, n_nchash, *cntbuf; 594 595 retry: 596 n_nchash = nchash + 1; /* nchash is max index, not count */ 597 if (req->oldptr == NULL) 598 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int)); 599 cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK); 600 cache_lock_all_buckets(); 601 if (n_nchash != nchash + 1) { 602 cache_unlock_all_buckets(); 603 free(cntbuf, M_TEMP); 604 goto retry; 605 } 606 /* Scan hash tables counting entries */ 607 for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, 
i++) 608 LIST_FOREACH(ncp, ncpp, nc_hash) 609 cntbuf[i]++; 610 cache_unlock_all_buckets(); 611 for (error = 0, i = 0; i < n_nchash; i++) 612 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0) 613 break; 614 free(cntbuf, M_TEMP); 615 return (error); 616 } 617 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD| 618 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int", 619 "nchash chain lengths"); 620 621 static int 622 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS) 623 { 624 int error; 625 struct nchashhead *ncpp; 626 struct namecache *ncp; 627 int n_nchash; 628 int count, maxlength, used, pct; 629 630 if (!req->oldptr) 631 return SYSCTL_OUT(req, 0, 4 * sizeof(int)); 632 633 cache_lock_all_buckets(); 634 n_nchash = nchash + 1; /* nchash is max index, not count */ 635 used = 0; 636 maxlength = 0; 637 638 /* Scan hash tables for applicable entries */ 639 for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { 640 count = 0; 641 LIST_FOREACH(ncp, ncpp, nc_hash) { 642 count++; 643 } 644 if (count) 645 used++; 646 if (maxlength < count) 647 maxlength = count; 648 } 649 n_nchash = nchash + 1; 650 cache_unlock_all_buckets(); 651 pct = (used * 100) / (n_nchash / 100); 652 error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash)); 653 if (error) 654 return (error); 655 error = SYSCTL_OUT(req, &used, sizeof(used)); 656 if (error) 657 return (error); 658 error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength)); 659 if (error) 660 return (error); 661 error = SYSCTL_OUT(req, &pct, sizeof(pct)); 662 if (error) 663 return (error); 664 return (0); 665 } 666 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD| 667 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I", 668 "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)"); 669 #endif 670 671 /* 672 * Negative entries management 673 * 674 * A variation of LRU scheme is used. New entries are hashed into one of 675 * numneglists cold lists. Entries get promoted to the hot list on first hit. 676 * 677 * The shrinker will demote hot list head and evict from the cold list in a 678 * round-robin manner. 
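 *
 * Both promotion (cache_negative_hit()) and demotion take the hot list
 * lock together with the relevant cold list lock, so an entry is always
 * on exactly one of the lists.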
679 */ 680 static void 681 cache_negative_hit(struct namecache *ncp) 682 { 683 struct neglist *neglist; 684 685 MPASS(ncp->nc_flag & NCF_NEGATIVE); 686 if (ncp->nc_flag & NCF_HOTNEGATIVE) 687 return; 688 neglist = NCP2NEGLIST(ncp); 689 mtx_lock(&ncneg_hot.nl_lock); 690 mtx_lock(&neglist->nl_lock); 691 if (!(ncp->nc_flag & NCF_HOTNEGATIVE)) { 692 numhotneg++; 693 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 694 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst); 695 ncp->nc_flag |= NCF_HOTNEGATIVE; 696 } 697 mtx_unlock(&neglist->nl_lock); 698 mtx_unlock(&ncneg_hot.nl_lock); 699 } 700 701 static void 702 cache_negative_insert(struct namecache *ncp, bool neg_locked) 703 { 704 struct neglist *neglist; 705 706 MPASS(ncp->nc_flag & NCF_NEGATIVE); 707 cache_assert_bucket_locked(ncp, RA_WLOCKED); 708 neglist = NCP2NEGLIST(ncp); 709 if (!neg_locked) { 710 mtx_lock(&neglist->nl_lock); 711 } else { 712 mtx_assert(&neglist->nl_lock, MA_OWNED); 713 } 714 TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst); 715 if (!neg_locked) 716 mtx_unlock(&neglist->nl_lock); 717 atomic_add_rel_long(&numneg, 1); 718 } 719 720 static void 721 cache_negative_remove(struct namecache *ncp, bool neg_locked) 722 { 723 struct neglist *neglist; 724 bool hot_locked = false; 725 bool list_locked = false; 726 727 MPASS(ncp->nc_flag & NCF_NEGATIVE); 728 cache_assert_bucket_locked(ncp, RA_WLOCKED); 729 neglist = NCP2NEGLIST(ncp); 730 if (!neg_locked) { 731 if (ncp->nc_flag & NCF_HOTNEGATIVE) { 732 hot_locked = true; 733 mtx_lock(&ncneg_hot.nl_lock); 734 if (!(ncp->nc_flag & NCF_HOTNEGATIVE)) { 735 list_locked = true; 736 mtx_lock(&neglist->nl_lock); 737 } 738 } else { 739 list_locked = true; 740 mtx_lock(&neglist->nl_lock); 741 } 742 } 743 if (ncp->nc_flag & NCF_HOTNEGATIVE) { 744 mtx_assert(&ncneg_hot.nl_lock, MA_OWNED); 745 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); 746 numhotneg--; 747 } else { 748 mtx_assert(&neglist->nl_lock, MA_OWNED); 749 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 750 } 751 if (list_locked) 752 mtx_unlock(&neglist->nl_lock); 753 if (hot_locked) 754 mtx_unlock(&ncneg_hot.nl_lock); 755 atomic_subtract_rel_long(&numneg, 1); 756 } 757 758 static void 759 cache_negative_shrink_select(int start, struct namecache **ncpp, 760 struct neglist **neglistpp) 761 { 762 struct neglist *neglist; 763 struct namecache *ncp; 764 int i; 765 766 *ncpp = ncp = NULL; 767 neglist = NULL; 768 769 for (i = start; i < numneglists; i++) { 770 neglist = &neglists[i]; 771 if (TAILQ_FIRST(&neglist->nl_list) == NULL) 772 continue; 773 mtx_lock(&neglist->nl_lock); 774 ncp = TAILQ_FIRST(&neglist->nl_list); 775 if (ncp != NULL) 776 break; 777 mtx_unlock(&neglist->nl_lock); 778 } 779 780 *neglistpp = neglist; 781 *ncpp = ncp; 782 } 783 784 static void 785 cache_negative_zap_one(void) 786 { 787 struct namecache *ncp, *ncp2; 788 struct neglist *neglist; 789 struct mtx *dvlp; 790 struct rwlock *blp; 791 792 if (mtx_owner(&ncneg_shrink_lock) != NULL || 793 !mtx_trylock(&ncneg_shrink_lock)) { 794 counter_u64_add(shrinking_skipped, 1); 795 return; 796 } 797 798 mtx_lock(&ncneg_hot.nl_lock); 799 ncp = TAILQ_FIRST(&ncneg_hot.nl_list); 800 if (ncp != NULL) { 801 neglist = NCP2NEGLIST(ncp); 802 mtx_lock(&neglist->nl_lock); 803 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); 804 TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst); 805 ncp->nc_flag &= ~NCF_HOTNEGATIVE; 806 numhotneg--; 807 mtx_unlock(&neglist->nl_lock); 808 } 809 mtx_unlock(&ncneg_hot.nl_lock); 810 811 cache_negative_shrink_select(shrink_list_turn, &ncp, &neglist); 812 
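	/*
	 * Advance the round-robin cursor; if nothing was found and the
	 * cursor wrapped, scan once more starting from the first cold list.
	 */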
shrink_list_turn++; 813 if (shrink_list_turn == numneglists) 814 shrink_list_turn = 0; 815 if (ncp == NULL && shrink_list_turn == 0) 816 cache_negative_shrink_select(shrink_list_turn, &ncp, &neglist); 817 mtx_unlock(&ncneg_shrink_lock); 818 if (ncp == NULL) 819 return; 820 821 MPASS(ncp->nc_flag & NCF_NEGATIVE); 822 dvlp = VP2VNODELOCK(ncp->nc_dvp); 823 blp = NCP2BUCKETLOCK(ncp); 824 mtx_unlock(&neglist->nl_lock); 825 mtx_lock(dvlp); 826 rw_wlock(blp); 827 mtx_lock(&neglist->nl_lock); 828 ncp2 = TAILQ_FIRST(&neglist->nl_list); 829 if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) || 830 blp != NCP2BUCKETLOCK(ncp2) || !(ncp2->nc_flag & NCF_NEGATIVE)) { 831 ncp = NULL; 832 } else { 833 SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp, 834 ncp->nc_name); 835 836 cache_zap_locked(ncp, true); 837 counter_u64_add(numneg_evicted, 1); 838 } 839 mtx_unlock(&neglist->nl_lock); 840 rw_wunlock(blp); 841 mtx_unlock(dvlp); 842 cache_free(ncp); 843 } 844 845 /* 846 * cache_zap_locked(): 847 * 848 * Removes a namecache entry from cache, whether it contains an actual 849 * pointer to a vnode or if it is just a negative cache entry. 850 */ 851 static void 852 cache_zap_locked(struct namecache *ncp, bool neg_locked) 853 { 854 855 if (!(ncp->nc_flag & NCF_NEGATIVE)) 856 cache_assert_vnode_locked(ncp->nc_vp); 857 cache_assert_vnode_locked(ncp->nc_dvp); 858 cache_assert_bucket_locked(ncp, RA_WLOCKED); 859 860 CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, 861 (ncp->nc_flag & NCF_NEGATIVE) ? NULL : ncp->nc_vp); 862 LIST_REMOVE(ncp, nc_hash); 863 if (!(ncp->nc_flag & NCF_NEGATIVE)) { 864 SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp, 865 ncp->nc_name, ncp->nc_vp); 866 TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst); 867 if (ncp == ncp->nc_vp->v_cache_dd) 868 ncp->nc_vp->v_cache_dd = NULL; 869 } else { 870 SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp, 871 ncp->nc_name); 872 cache_negative_remove(ncp, neg_locked); 873 } 874 if (ncp->nc_flag & NCF_ISDOTDOT) { 875 if (ncp == ncp->nc_dvp->v_cache_dd) 876 ncp->nc_dvp->v_cache_dd = NULL; 877 } else { 878 LIST_REMOVE(ncp, nc_src); 879 if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) { 880 ncp->nc_flag |= NCF_DVDROP; 881 counter_u64_add(numcachehv, -1); 882 } 883 } 884 atomic_subtract_rel_long(&numcache, 1); 885 } 886 887 static void 888 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp) 889 { 890 struct rwlock *blp; 891 892 MPASS(ncp->nc_dvp == vp); 893 MPASS(ncp->nc_flag & NCF_NEGATIVE); 894 cache_assert_vnode_locked(vp); 895 896 blp = NCP2BUCKETLOCK(ncp); 897 rw_wlock(blp); 898 cache_zap_locked(ncp, false); 899 rw_wunlock(blp); 900 } 901 902 static bool 903 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp, 904 struct mtx **vlpp) 905 { 906 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; 907 struct rwlock *blp; 908 909 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); 910 cache_assert_vnode_locked(vp); 911 912 if (ncp->nc_flag & NCF_NEGATIVE) { 913 if (*vlpp != NULL) { 914 mtx_unlock(*vlpp); 915 *vlpp = NULL; 916 } 917 cache_zap_negative_locked_vnode_kl(ncp, vp); 918 return (true); 919 } 920 921 pvlp = VP2VNODELOCK(vp); 922 blp = NCP2BUCKETLOCK(ncp); 923 vlp1 = VP2VNODELOCK(ncp->nc_dvp); 924 vlp2 = VP2VNODELOCK(ncp->nc_vp); 925 926 if (*vlpp == vlp1 || *vlpp == vlp2) { 927 to_unlock = *vlpp; 928 *vlpp = NULL; 929 } else { 930 if (*vlpp != NULL) { 931 mtx_unlock(*vlpp); 932 *vlpp = NULL; 933 } 934 cache_sort_vnodes(&vlp1, &vlp2); 935 if (vlp1 == pvlp) { 936 mtx_lock(vlp2); 937 to_unlock = vlp2; 938 } else { 
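			/*
			 * The already held lock sorts after vlp1, so taking
			 * vlp1 outright would violate the lock order.
			 * Trylock it and fall back to a full relock sequence
			 * if that fails.
			 */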
939 if (!mtx_trylock(vlp1)) 940 goto out_relock; 941 to_unlock = vlp1; 942 } 943 } 944 rw_wlock(blp); 945 cache_zap_locked(ncp, false); 946 rw_wunlock(blp); 947 if (to_unlock != NULL) 948 mtx_unlock(to_unlock); 949 return (true); 950 951 out_relock: 952 mtx_unlock(vlp2); 953 mtx_lock(vlp1); 954 mtx_lock(vlp2); 955 MPASS(*vlpp == NULL); 956 *vlpp = vlp1; 957 return (false); 958 } 959 960 static int __noinline 961 cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp) 962 { 963 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; 964 struct rwlock *blp; 965 int error = 0; 966 967 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); 968 cache_assert_vnode_locked(vp); 969 970 pvlp = VP2VNODELOCK(vp); 971 if (ncp->nc_flag & NCF_NEGATIVE) { 972 cache_zap_negative_locked_vnode_kl(ncp, vp); 973 goto out; 974 } 975 976 blp = NCP2BUCKETLOCK(ncp); 977 vlp1 = VP2VNODELOCK(ncp->nc_dvp); 978 vlp2 = VP2VNODELOCK(ncp->nc_vp); 979 cache_sort_vnodes(&vlp1, &vlp2); 980 if (vlp1 == pvlp) { 981 mtx_lock(vlp2); 982 to_unlock = vlp2; 983 } else { 984 if (!mtx_trylock(vlp1)) { 985 error = EAGAIN; 986 goto out; 987 } 988 to_unlock = vlp1; 989 } 990 rw_wlock(blp); 991 cache_zap_locked(ncp, false); 992 rw_wunlock(blp); 993 mtx_unlock(to_unlock); 994 out: 995 mtx_unlock(pvlp); 996 return (error); 997 } 998 999 /* 1000 * If trylocking failed we can get here. We know enough to take all needed locks 1001 * in the right order and re-lookup the entry. 1002 */ 1003 static int 1004 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1005 struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash, 1006 struct rwlock *blp) 1007 { 1008 struct namecache *rncp; 1009 1010 cache_assert_bucket_locked(ncp, RA_UNLOCKED); 1011 1012 cache_sort_vnodes(&dvlp, &vlp); 1013 cache_lock_vnodes(dvlp, vlp); 1014 rw_wlock(blp); 1015 LIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) { 1016 if (rncp == ncp && rncp->nc_dvp == dvp && 1017 rncp->nc_nlen == cnp->cn_namelen && 1018 !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen)) 1019 break; 1020 } 1021 if (rncp != NULL) { 1022 cache_zap_locked(rncp, false); 1023 rw_wunlock(blp); 1024 cache_unlock_vnodes(dvlp, vlp); 1025 counter_u64_add(zap_and_exit_bucket_relock_success, 1); 1026 return (0); 1027 } 1028 1029 rw_wunlock(blp); 1030 cache_unlock_vnodes(dvlp, vlp); 1031 return (EAGAIN); 1032 } 1033 1034 static int __noinline 1035 cache_zap_wlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1036 uint32_t hash, struct rwlock *blp) 1037 { 1038 struct mtx *dvlp, *vlp; 1039 struct vnode *dvp; 1040 1041 cache_assert_bucket_locked(ncp, RA_WLOCKED); 1042 1043 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1044 vlp = NULL; 1045 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1046 vlp = VP2VNODELOCK(ncp->nc_vp); 1047 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1048 cache_zap_locked(ncp, false); 1049 rw_wunlock(blp); 1050 cache_unlock_vnodes(dvlp, vlp); 1051 return (0); 1052 } 1053 1054 dvp = ncp->nc_dvp; 1055 rw_wunlock(blp); 1056 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp)); 1057 } 1058 1059 static int __noinline 1060 cache_zap_rlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1061 uint32_t hash, struct rwlock *blp) 1062 { 1063 struct mtx *dvlp, *vlp; 1064 struct vnode *dvp; 1065 1066 cache_assert_bucket_locked(ncp, RA_RLOCKED); 1067 1068 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1069 vlp = NULL; 1070 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1071 vlp = VP2VNODELOCK(ncp->nc_vp); 1072 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1073 rw_runlock(blp); 1074 rw_wlock(blp); 
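		/*
		 * The bucket lock was dropped and re-taken for writing.
		 * The entry cannot have been zapped in the window because
		 * zapping requires the vnode locks taken above.
		 */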
1075 cache_zap_locked(ncp, false); 1076 rw_wunlock(blp); 1077 cache_unlock_vnodes(dvlp, vlp); 1078 return (0); 1079 } 1080 1081 dvp = ncp->nc_dvp; 1082 rw_runlock(blp); 1083 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp)); 1084 } 1085 1086 static int 1087 cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp, 1088 struct mtx **vlpp1, struct mtx **vlpp2) 1089 { 1090 struct mtx *dvlp, *vlp; 1091 1092 cache_assert_bucket_locked(ncp, RA_WLOCKED); 1093 1094 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1095 vlp = NULL; 1096 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1097 vlp = VP2VNODELOCK(ncp->nc_vp); 1098 cache_sort_vnodes(&dvlp, &vlp); 1099 1100 if (*vlpp1 == dvlp && *vlpp2 == vlp) { 1101 cache_zap_locked(ncp, false); 1102 cache_unlock_vnodes(dvlp, vlp); 1103 *vlpp1 = NULL; 1104 *vlpp2 = NULL; 1105 return (0); 1106 } 1107 1108 if (*vlpp1 != NULL) 1109 mtx_unlock(*vlpp1); 1110 if (*vlpp2 != NULL) 1111 mtx_unlock(*vlpp2); 1112 *vlpp1 = NULL; 1113 *vlpp2 = NULL; 1114 1115 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1116 cache_zap_locked(ncp, false); 1117 cache_unlock_vnodes(dvlp, vlp); 1118 return (0); 1119 } 1120 1121 rw_wunlock(blp); 1122 *vlpp1 = dvlp; 1123 *vlpp2 = vlp; 1124 if (*vlpp1 != NULL) 1125 mtx_lock(*vlpp1); 1126 mtx_lock(*vlpp2); 1127 rw_wlock(blp); 1128 return (EAGAIN); 1129 } 1130 1131 static void 1132 cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp) 1133 { 1134 1135 if (blp != NULL) { 1136 rw_runlock(blp); 1137 } else { 1138 mtx_unlock(vlp); 1139 } 1140 } 1141 1142 static int __noinline 1143 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1144 struct timespec *tsp, int *ticksp) 1145 { 1146 int ltype; 1147 1148 *vpp = dvp; 1149 CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .", 1150 dvp, cnp->cn_nameptr); 1151 counter_u64_add(dothits, 1); 1152 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp); 1153 if (tsp != NULL) 1154 timespecclear(tsp); 1155 if (ticksp != NULL) 1156 *ticksp = ticks; 1157 vrefact(*vpp); 1158 /* 1159 * When we lookup "." we still can be asked to lock it 1160 * differently... 1161 */ 1162 ltype = cnp->cn_lkflags & LK_TYPE_MASK; 1163 if (ltype != VOP_ISLOCKED(*vpp)) { 1164 if (ltype == LK_EXCLUSIVE) { 1165 vn_lock(*vpp, LK_UPGRADE | LK_RETRY); 1166 if (VN_IS_DOOMED((*vpp))) { 1167 /* forced unmount */ 1168 vrele(*vpp); 1169 *vpp = NULL; 1170 return (ENOENT); 1171 } 1172 } else 1173 vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY); 1174 } 1175 return (-1); 1176 } 1177 1178 static __noinline int 1179 cache_lookup_nomakeentry(struct vnode *dvp, struct vnode **vpp, 1180 struct componentname *cnp, struct timespec *tsp, int *ticksp) 1181 { 1182 struct namecache *ncp; 1183 struct rwlock *blp; 1184 struct mtx *dvlp, *dvlp2; 1185 uint32_t hash; 1186 int error; 1187 1188 if (cnp->cn_namelen == 2 && 1189 cnp->cn_nameptr[0] == '.' 
&& cnp->cn_nameptr[1] == '.') { 1190 counter_u64_add(dotdothits, 1); 1191 dvlp = VP2VNODELOCK(dvp); 1192 dvlp2 = NULL; 1193 mtx_lock(dvlp); 1194 retry_dotdot: 1195 ncp = dvp->v_cache_dd; 1196 if (ncp == NULL) { 1197 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, 1198 "..", NULL); 1199 mtx_unlock(dvlp); 1200 if (dvlp2 != NULL) 1201 mtx_unlock(dvlp2); 1202 return (0); 1203 } 1204 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1205 if (ncp->nc_dvp != dvp) 1206 panic("dvp %p v_cache_dd %p\n", dvp, ncp); 1207 if (!cache_zap_locked_vnode_kl2(ncp, 1208 dvp, &dvlp2)) 1209 goto retry_dotdot; 1210 MPASS(dvp->v_cache_dd == NULL); 1211 mtx_unlock(dvlp); 1212 if (dvlp2 != NULL) 1213 mtx_unlock(dvlp2); 1214 cache_free(ncp); 1215 } else { 1216 dvp->v_cache_dd = NULL; 1217 mtx_unlock(dvlp); 1218 if (dvlp2 != NULL) 1219 mtx_unlock(dvlp2); 1220 } 1221 return (0); 1222 } 1223 1224 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1225 blp = HASH2BUCKETLOCK(hash); 1226 retry: 1227 if (LIST_EMPTY(NCHHASH(hash))) 1228 goto out_no_entry; 1229 1230 rw_wlock(blp); 1231 1232 LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1233 counter_u64_add(numchecks, 1); 1234 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1235 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1236 break; 1237 } 1238 1239 /* We failed to find an entry */ 1240 if (ncp == NULL) { 1241 rw_wunlock(blp); 1242 goto out_no_entry; 1243 } 1244 1245 error = cache_zap_wlocked_bucket(ncp, cnp, hash, blp); 1246 if (__predict_false(error != 0)) { 1247 zap_and_exit_bucket_fail++; 1248 cache_maybe_yield(); 1249 goto retry; 1250 } 1251 counter_u64_add(numposzaps, 1); 1252 cache_free(ncp); 1253 return (0); 1254 out_no_entry: 1255 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, NULL); 1256 counter_u64_add(nummisszap, 1); 1257 return (0); 1258 } 1259 1260 /** 1261 * Lookup a name in the name cache 1262 * 1263 * # Arguments 1264 * 1265 * - dvp: Parent directory in which to search. 1266 * - vpp: Return argument. Will contain desired vnode on cache hit. 1267 * - cnp: Parameters of the name search. The most interesting bits of 1268 * the cn_flags field have the following meanings: 1269 * - MAKEENTRY: If clear, free an entry from the cache rather than look 1270 * it up. 1271 * - ISDOTDOT: Must be set if and only if cn_nameptr == ".." 1272 * - tsp: Return storage for cache timestamp. On a successful (positive 1273 * or negative) lookup, tsp will be filled with any timespec that 1274 * was stored when this cache entry was created. However, it will 1275 * be clear for "." entries. 1276 * - ticks: Return storage for alternate cache timestamp. On a successful 1277 * (positive or negative) lookup, it will contain the ticks value 1278 * that was current when the cache entry was created, unless cnp 1279 * was ".". 1280 * 1281 * # Returns 1282 * 1283 * - -1: A positive cache hit. vpp will contain the desired vnode. 1284 * - ENOENT: A negative cache hit, or dvp was recycled out from under us due 1285 * to a forced unmount. vpp will not be modified. If the entry 1286 * is a whiteout, then the ISWHITEOUT flag will be set in 1287 * cnp->cn_flags. 1288 * - 0: A cache miss. vpp will not be modified. 1289 * 1290 * # Locking 1291 * 1292 * On a cache hit, vpp will be returned locked and ref'd. If we're looking up 1293 * .., dvp is unlocked. If we're looking up . an extra ref is taken, but the 1294 * lock is not recursively acquired. 
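 *
 *   For typical usage see vfs_cache_lookup() below: the filesystem falls
 *   back to VOP_CACHEDLOOKUP() (a real directory scan) only when this
 *   function returns 0, i.e. on a cache miss.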
1295 */ 1296 int 1297 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1298 struct timespec *tsp, int *ticksp) 1299 { 1300 struct namecache_ts *ncp_ts; 1301 struct namecache *ncp; 1302 struct rwlock *blp; 1303 struct mtx *dvlp; 1304 uint32_t hash; 1305 enum vgetstate vs; 1306 int error, ltype; 1307 1308 #ifdef DEBUG_CACHE 1309 if (__predict_false(!doingcache)) { 1310 cnp->cn_flags &= ~MAKEENTRY; 1311 return (0); 1312 } 1313 #endif 1314 1315 counter_u64_add(numcalls, 1); 1316 1317 if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) 1318 return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp)); 1319 1320 if ((cnp->cn_flags & MAKEENTRY) == 0) 1321 return (cache_lookup_nomakeentry(dvp, vpp, cnp, tsp, ticksp)); 1322 1323 retry: 1324 blp = NULL; 1325 dvlp = NULL; 1326 error = 0; 1327 if (cnp->cn_namelen == 2 && 1328 cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') { 1329 counter_u64_add(dotdothits, 1); 1330 dvlp = VP2VNODELOCK(dvp); 1331 mtx_lock(dvlp); 1332 ncp = dvp->v_cache_dd; 1333 if (ncp == NULL) { 1334 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, 1335 "..", NULL); 1336 mtx_unlock(dvlp); 1337 return (0); 1338 } 1339 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1340 if (ncp->nc_flag & NCF_NEGATIVE) 1341 *vpp = NULL; 1342 else 1343 *vpp = ncp->nc_vp; 1344 } else 1345 *vpp = ncp->nc_dvp; 1346 /* Return failure if negative entry was found. */ 1347 if (*vpp == NULL) 1348 goto negative_success; 1349 CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..", 1350 dvp, cnp->cn_nameptr, *vpp); 1351 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", 1352 *vpp); 1353 cache_out_ts(ncp, tsp, ticksp); 1354 if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) == 1355 NCF_DTS && tsp != NULL) { 1356 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 1357 *tsp = ncp_ts->nc_dotdottime; 1358 } 1359 goto success; 1360 } 1361 1362 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1363 blp = HASH2BUCKETLOCK(hash); 1364 rw_rlock(blp); 1365 1366 LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1367 counter_u64_add(numchecks, 1); 1368 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1369 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1370 break; 1371 } 1372 1373 /* We failed to find an entry */ 1374 if (__predict_false(ncp == NULL)) { 1375 rw_runlock(blp); 1376 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, 1377 NULL); 1378 counter_u64_add(nummiss, 1); 1379 return (0); 1380 } 1381 1382 if (ncp->nc_flag & NCF_NEGATIVE) 1383 goto negative_success; 1384 1385 /* We found a "positive" match, return the vnode */ 1386 counter_u64_add(numposhits, 1); 1387 *vpp = ncp->nc_vp; 1388 CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p", 1389 dvp, cnp->cn_nameptr, *vpp, ncp); 1390 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, 1391 *vpp); 1392 cache_out_ts(ncp, tsp, ticksp); 1393 success: 1394 /* 1395 * On success we return a locked and ref'd vnode as per the lookup 1396 * protocol. 
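	 * The reference is taken in two steps (vget_prep()/vget_finish())
	 * so that the namecache locks can be dropped before potentially
	 * sleeping for the vnode lock.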
1397 */ 1398 MPASS(dvp != *vpp); 1399 ltype = 0; /* silence gcc warning */ 1400 if (cnp->cn_flags & ISDOTDOT) { 1401 ltype = VOP_ISLOCKED(dvp); 1402 VOP_UNLOCK(dvp); 1403 } 1404 vs = vget_prep(*vpp); 1405 cache_lookup_unlock(blp, dvlp); 1406 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 1407 if (cnp->cn_flags & ISDOTDOT) { 1408 vn_lock(dvp, ltype | LK_RETRY); 1409 if (VN_IS_DOOMED(dvp)) { 1410 if (error == 0) 1411 vput(*vpp); 1412 *vpp = NULL; 1413 return (ENOENT); 1414 } 1415 } 1416 if (error) { 1417 *vpp = NULL; 1418 goto retry; 1419 } 1420 if ((cnp->cn_flags & ISLASTCN) && 1421 (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) { 1422 ASSERT_VOP_ELOCKED(*vpp, "cache_lookup"); 1423 } 1424 return (-1); 1425 1426 negative_success: 1427 /* We found a negative match, and want to create it, so purge */ 1428 if (cnp->cn_nameiop == CREATE) { 1429 counter_u64_add(numnegzaps, 1); 1430 goto zap_and_exit; 1431 } 1432 1433 counter_u64_add(numneghits, 1); 1434 cache_negative_hit(ncp); 1435 if (ncp->nc_flag & NCF_WHITE) 1436 cnp->cn_flags |= ISWHITEOUT; 1437 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, 1438 ncp->nc_name); 1439 cache_out_ts(ncp, tsp, ticksp); 1440 cache_lookup_unlock(blp, dvlp); 1441 return (ENOENT); 1442 1443 zap_and_exit: 1444 if (blp != NULL) 1445 error = cache_zap_rlocked_bucket(ncp, cnp, hash, blp); 1446 else 1447 error = cache_zap_locked_vnode(ncp, dvp); 1448 if (__predict_false(error != 0)) { 1449 zap_and_exit_bucket_fail2++; 1450 cache_maybe_yield(); 1451 goto retry; 1452 } 1453 cache_free(ncp); 1454 return (0); 1455 } 1456 1457 struct celockstate { 1458 struct mtx *vlp[3]; 1459 struct rwlock *blp[2]; 1460 }; 1461 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3)); 1462 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2)); 1463 1464 static inline void 1465 cache_celockstate_init(struct celockstate *cel) 1466 { 1467 1468 bzero(cel, sizeof(*cel)); 1469 } 1470 1471 static void 1472 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp, 1473 struct vnode *dvp) 1474 { 1475 struct mtx *vlp1, *vlp2; 1476 1477 MPASS(cel->vlp[0] == NULL); 1478 MPASS(cel->vlp[1] == NULL); 1479 MPASS(cel->vlp[2] == NULL); 1480 1481 MPASS(vp != NULL || dvp != NULL); 1482 1483 vlp1 = VP2VNODELOCK(vp); 1484 vlp2 = VP2VNODELOCK(dvp); 1485 cache_sort_vnodes(&vlp1, &vlp2); 1486 1487 if (vlp1 != NULL) { 1488 mtx_lock(vlp1); 1489 cel->vlp[0] = vlp1; 1490 } 1491 mtx_lock(vlp2); 1492 cel->vlp[1] = vlp2; 1493 } 1494 1495 static void 1496 cache_unlock_vnodes_cel(struct celockstate *cel) 1497 { 1498 1499 MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL); 1500 1501 if (cel->vlp[0] != NULL) 1502 mtx_unlock(cel->vlp[0]); 1503 if (cel->vlp[1] != NULL) 1504 mtx_unlock(cel->vlp[1]); 1505 if (cel->vlp[2] != NULL) 1506 mtx_unlock(cel->vlp[2]); 1507 } 1508 1509 static bool 1510 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp) 1511 { 1512 struct mtx *vlp; 1513 bool ret; 1514 1515 cache_assert_vlp_locked(cel->vlp[0]); 1516 cache_assert_vlp_locked(cel->vlp[1]); 1517 MPASS(cel->vlp[2] == NULL); 1518 1519 MPASS(vp != NULL); 1520 vlp = VP2VNODELOCK(vp); 1521 1522 ret = true; 1523 if (vlp >= cel->vlp[1]) { 1524 mtx_lock(vlp); 1525 } else { 1526 if (mtx_trylock(vlp)) 1527 goto out; 1528 cache_lock_vnodes_cel_3_failures++; 1529 cache_unlock_vnodes_cel(cel); 1530 if (vlp < cel->vlp[0]) { 1531 mtx_lock(vlp); 1532 mtx_lock(cel->vlp[0]); 1533 mtx_lock(cel->vlp[1]); 1534 } else { 1535 if (cel->vlp[0] != NULL) 1536 mtx_lock(cel->vlp[0]); 1537 mtx_lock(vlp); 1538 mtx_lock(cel->vlp[1]); 1539 } 
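		/*
		 * The ordered relock above may have let the protected state
		 * change; returning false makes the caller revalidate.
		 */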
1540 ret = false; 1541 } 1542 out: 1543 cel->vlp[2] = vlp; 1544 return (ret); 1545 } 1546 1547 static void 1548 cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1, 1549 struct rwlock *blp2) 1550 { 1551 1552 MPASS(cel->blp[0] == NULL); 1553 MPASS(cel->blp[1] == NULL); 1554 1555 cache_sort_vnodes(&blp1, &blp2); 1556 1557 if (blp1 != NULL) { 1558 rw_wlock(blp1); 1559 cel->blp[0] = blp1; 1560 } 1561 rw_wlock(blp2); 1562 cel->blp[1] = blp2; 1563 } 1564 1565 static void 1566 cache_unlock_buckets_cel(struct celockstate *cel) 1567 { 1568 1569 if (cel->blp[0] != NULL) 1570 rw_wunlock(cel->blp[0]); 1571 rw_wunlock(cel->blp[1]); 1572 } 1573 1574 /* 1575 * Lock part of the cache affected by the insertion. 1576 * 1577 * This means vnodelocks for dvp, vp and the relevant bucketlock. 1578 * However, insertion can result in removal of an old entry. In this 1579 * case we have an additional vnode and bucketlock pair to lock. If the 1580 * entry is negative, ncelock is locked instead of the vnode. 1581 * 1582 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while 1583 * preserving the locking order (smaller address first). 1584 */ 1585 static void 1586 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 1587 uint32_t hash) 1588 { 1589 struct namecache *ncp; 1590 struct rwlock *blps[2]; 1591 1592 blps[0] = HASH2BUCKETLOCK(hash); 1593 for (;;) { 1594 blps[1] = NULL; 1595 cache_lock_vnodes_cel(cel, dvp, vp); 1596 if (vp == NULL || vp->v_type != VDIR) 1597 break; 1598 ncp = vp->v_cache_dd; 1599 if (ncp == NULL) 1600 break; 1601 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 1602 break; 1603 MPASS(ncp->nc_dvp == vp); 1604 blps[1] = NCP2BUCKETLOCK(ncp); 1605 if (ncp->nc_flag & NCF_NEGATIVE) 1606 break; 1607 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 1608 break; 1609 /* 1610 * All vnodes got re-locked. Re-validate the state and if 1611 * nothing changed we are done. Otherwise restart. 
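		 * (The ".." entry hanging off vp->v_cache_dd may have been
		 * replaced or removed while the locks were dropped, hence
		 * the checks below.)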
1612 */ 1613 if (ncp == vp->v_cache_dd && 1614 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 1615 blps[1] == NCP2BUCKETLOCK(ncp) && 1616 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 1617 break; 1618 cache_unlock_vnodes_cel(cel); 1619 cel->vlp[0] = NULL; 1620 cel->vlp[1] = NULL; 1621 cel->vlp[2] = NULL; 1622 } 1623 cache_lock_buckets_cel(cel, blps[0], blps[1]); 1624 } 1625 1626 static void 1627 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 1628 uint32_t hash) 1629 { 1630 struct namecache *ncp; 1631 struct rwlock *blps[2]; 1632 1633 blps[0] = HASH2BUCKETLOCK(hash); 1634 for (;;) { 1635 blps[1] = NULL; 1636 cache_lock_vnodes_cel(cel, dvp, vp); 1637 ncp = dvp->v_cache_dd; 1638 if (ncp == NULL) 1639 break; 1640 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 1641 break; 1642 MPASS(ncp->nc_dvp == dvp); 1643 blps[1] = NCP2BUCKETLOCK(ncp); 1644 if (ncp->nc_flag & NCF_NEGATIVE) 1645 break; 1646 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 1647 break; 1648 if (ncp == dvp->v_cache_dd && 1649 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 1650 blps[1] == NCP2BUCKETLOCK(ncp) && 1651 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 1652 break; 1653 cache_unlock_vnodes_cel(cel); 1654 cel->vlp[0] = NULL; 1655 cel->vlp[1] = NULL; 1656 cel->vlp[2] = NULL; 1657 } 1658 cache_lock_buckets_cel(cel, blps[0], blps[1]); 1659 } 1660 1661 static void 1662 cache_enter_unlock(struct celockstate *cel) 1663 { 1664 1665 cache_unlock_buckets_cel(cel); 1666 cache_unlock_vnodes_cel(cel); 1667 } 1668 1669 static void __noinline 1670 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp, 1671 struct componentname *cnp) 1672 { 1673 struct celockstate cel; 1674 struct namecache *ncp; 1675 uint32_t hash; 1676 int len; 1677 1678 if (dvp->v_cache_dd == NULL) 1679 return; 1680 len = cnp->cn_namelen; 1681 cache_celockstate_init(&cel); 1682 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 1683 cache_enter_lock_dd(&cel, dvp, vp, hash); 1684 ncp = dvp->v_cache_dd; 1685 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) { 1686 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent")); 1687 cache_zap_locked(ncp, false); 1688 } else { 1689 ncp = NULL; 1690 } 1691 dvp->v_cache_dd = NULL; 1692 cache_enter_unlock(&cel); 1693 cache_free(ncp); 1694 } 1695 1696 /* 1697 * Add an entry to the cache. 1698 */ 1699 void 1700 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, 1701 struct timespec *tsp, struct timespec *dtsp) 1702 { 1703 struct celockstate cel; 1704 struct namecache *ncp, *n2, *ndd; 1705 struct namecache_ts *ncp_ts, *n2_ts; 1706 struct nchashhead *ncpp; 1707 uint32_t hash; 1708 int flag; 1709 int len; 1710 u_long lnumcache; 1711 1712 CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr); 1713 VNASSERT(vp == NULL || !VN_IS_DOOMED(vp), vp, 1714 ("cache_enter: Adding a doomed vnode")); 1715 VNASSERT(dvp == NULL || !VN_IS_DOOMED(dvp), dvp, 1716 ("cache_enter: Doomed vnode used as src")); 1717 1718 #ifdef DEBUG_CACHE 1719 if (__predict_false(!doingcache)) 1720 return; 1721 #endif 1722 1723 flag = 0; 1724 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 1725 if (cnp->cn_namelen == 1) 1726 return; 1727 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { 1728 cache_enter_dotdot_prep(dvp, vp, cnp); 1729 flag = NCF_ISDOTDOT; 1730 } 1731 } 1732 1733 /* 1734 * Avoid blowout in namecache entries. 
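	 *
	 * The limit is enforced only approximately: the counter is bumped
	 * first and the increment is backed out if ncsize was exceeded.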
1735 */ 1736 lnumcache = atomic_fetchadd_long(&numcache, 1) + 1; 1737 if (__predict_false(lnumcache >= ncsize)) { 1738 atomic_add_long(&numcache, -1); 1739 return; 1740 } 1741 1742 cache_celockstate_init(&cel); 1743 ndd = NULL; 1744 ncp_ts = NULL; 1745 1746 /* 1747 * Calculate the hash key and setup as much of the new 1748 * namecache entry as possible before acquiring the lock. 1749 */ 1750 ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); 1751 ncp->nc_flag = flag; 1752 ncp->nc_vp = vp; 1753 if (vp == NULL) 1754 ncp->nc_flag |= NCF_NEGATIVE; 1755 ncp->nc_dvp = dvp; 1756 if (tsp != NULL) { 1757 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 1758 ncp_ts->nc_time = *tsp; 1759 ncp_ts->nc_ticks = ticks; 1760 ncp_ts->nc_nc.nc_flag |= NCF_TS; 1761 if (dtsp != NULL) { 1762 ncp_ts->nc_dotdottime = *dtsp; 1763 ncp_ts->nc_nc.nc_flag |= NCF_DTS; 1764 } 1765 } 1766 len = ncp->nc_nlen = cnp->cn_namelen; 1767 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 1768 strlcpy(ncp->nc_name, cnp->cn_nameptr, len + 1); 1769 cache_enter_lock(&cel, dvp, vp, hash); 1770 1771 /* 1772 * See if this vnode or negative entry is already in the cache 1773 * with this name. This can happen with concurrent lookups of 1774 * the same path name. 1775 */ 1776 ncpp = NCHHASH(hash); 1777 LIST_FOREACH(n2, ncpp, nc_hash) { 1778 if (n2->nc_dvp == dvp && 1779 n2->nc_nlen == cnp->cn_namelen && 1780 !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) { 1781 if (tsp != NULL) { 1782 KASSERT((n2->nc_flag & NCF_TS) != 0, 1783 ("no NCF_TS")); 1784 n2_ts = __containerof(n2, struct namecache_ts, nc_nc); 1785 n2_ts->nc_time = ncp_ts->nc_time; 1786 n2_ts->nc_ticks = ncp_ts->nc_ticks; 1787 if (dtsp != NULL) { 1788 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; 1789 if (ncp->nc_flag & NCF_NEGATIVE) 1790 mtx_lock(&ncneg_hot.nl_lock); 1791 n2_ts->nc_nc.nc_flag |= NCF_DTS; 1792 if (ncp->nc_flag & NCF_NEGATIVE) 1793 mtx_unlock(&ncneg_hot.nl_lock); 1794 } 1795 } 1796 goto out_unlock_free; 1797 } 1798 } 1799 1800 if (flag == NCF_ISDOTDOT) { 1801 /* 1802 * See if we are trying to add .. entry, but some other lookup 1803 * has populated v_cache_dd pointer already. 1804 */ 1805 if (dvp->v_cache_dd != NULL) 1806 goto out_unlock_free; 1807 KASSERT(vp == NULL || vp->v_type == VDIR, 1808 ("wrong vnode type %p", vp)); 1809 dvp->v_cache_dd = ncp; 1810 } 1811 1812 if (vp != NULL) { 1813 if (vp->v_type == VDIR) { 1814 if (flag != NCF_ISDOTDOT) { 1815 /* 1816 * For this case, the cache entry maps both the 1817 * directory name in it and the name ".." for the 1818 * directory's parent. 1819 */ 1820 if ((ndd = vp->v_cache_dd) != NULL) { 1821 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) 1822 cache_zap_locked(ndd, false); 1823 else 1824 ndd = NULL; 1825 } 1826 vp->v_cache_dd = ncp; 1827 } 1828 } else { 1829 vp->v_cache_dd = NULL; 1830 } 1831 } 1832 1833 if (flag != NCF_ISDOTDOT) { 1834 if (LIST_EMPTY(&dvp->v_cache_src)) { 1835 vhold(dvp); 1836 counter_u64_add(numcachehv, 1); 1837 } 1838 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); 1839 } 1840 1841 /* 1842 * Insert the new namecache entry into the appropriate chain 1843 * within the cache entries table. 1844 */ 1845 LIST_INSERT_HEAD(ncpp, ncp, nc_hash); 1846 1847 /* 1848 * If the entry is "negative", we place it into the 1849 * "negative" cache queue, otherwise, we place it into the 1850 * destination vnode's cache entries queue. 
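	 * A negative entry also records whether the name is a whiteout, so
	 * that later lookups can set ISWHITEOUT without consulting the
	 * filesystem.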
1851 */ 1852 if (vp != NULL) { 1853 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); 1854 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, 1855 vp); 1856 } else { 1857 if (cnp->cn_flags & ISWHITEOUT) 1858 ncp->nc_flag |= NCF_WHITE; 1859 cache_negative_insert(ncp, false); 1860 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, 1861 ncp->nc_name); 1862 } 1863 cache_enter_unlock(&cel); 1864 if (numneg * ncnegfactor > lnumcache) 1865 cache_negative_zap_one(); 1866 cache_free(ndd); 1867 return; 1868 out_unlock_free: 1869 cache_enter_unlock(&cel); 1870 cache_free(ncp); 1871 return; 1872 } 1873 1874 static u_int 1875 cache_roundup_2(u_int val) 1876 { 1877 u_int res; 1878 1879 for (res = 1; res <= val; res <<= 1) 1880 continue; 1881 1882 return (res); 1883 } 1884 1885 /* 1886 * Name cache initialization, from vfs_init() when we are booting 1887 */ 1888 static void 1889 nchinit(void *dummy __unused) 1890 { 1891 u_int i; 1892 1893 cache_zone_small = uma_zcreate("S VFS Cache", 1894 sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1, 1895 NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache), 1896 UMA_ZONE_ZINIT); 1897 cache_zone_small_ts = uma_zcreate("STS VFS Cache", 1898 sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1, 1899 NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts), 1900 UMA_ZONE_ZINIT); 1901 cache_zone_large = uma_zcreate("L VFS Cache", 1902 sizeof(struct namecache) + NAME_MAX + 1, 1903 NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache), 1904 UMA_ZONE_ZINIT); 1905 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", 1906 sizeof(struct namecache_ts) + NAME_MAX + 1, 1907 NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts), 1908 UMA_ZONE_ZINIT); 1909 1910 ncsize = desiredvnodes * ncsizefactor; 1911 nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash); 1912 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; 1913 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ 1914 ncbuckethash = 7; 1915 if (ncbuckethash > nchash) 1916 ncbuckethash = nchash; 1917 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, 1918 M_WAITOK | M_ZERO); 1919 for (i = 0; i < numbucketlocks; i++) 1920 rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE); 1921 ncvnodehash = ncbuckethash; 1922 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, 1923 M_WAITOK | M_ZERO); 1924 for (i = 0; i < numvnodelocks; i++) 1925 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); 1926 ncpurgeminvnodes = numbucketlocks * 2; 1927 1928 ncneghash = 3; 1929 neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE, 1930 M_WAITOK | M_ZERO); 1931 for (i = 0; i < numneglists; i++) { 1932 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); 1933 TAILQ_INIT(&neglists[i].nl_list); 1934 } 1935 mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF); 1936 TAILQ_INIT(&ncneg_hot.nl_list); 1937 1938 mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF); 1939 1940 numcachehv = counter_u64_alloc(M_WAITOK); 1941 numcalls = counter_u64_alloc(M_WAITOK); 1942 dothits = counter_u64_alloc(M_WAITOK); 1943 dotdothits = counter_u64_alloc(M_WAITOK); 1944 numchecks = counter_u64_alloc(M_WAITOK); 1945 nummiss = counter_u64_alloc(M_WAITOK); 1946 nummisszap = counter_u64_alloc(M_WAITOK); 1947 numposzaps = counter_u64_alloc(M_WAITOK); 1948 numposhits = counter_u64_alloc(M_WAITOK); 1949 numnegzaps = counter_u64_alloc(M_WAITOK); 1950 numneghits = counter_u64_alloc(M_WAITOK); 1951 numfullpathcalls = counter_u64_alloc(M_WAITOK); 1952 

void
cache_changesize(u_long newmaxvnodes)
{
	struct nchashhead *new_nchashtbl, *old_nchashtbl;
	u_long new_nchash, old_nchash;
	struct namecache *ncp;
	uint32_t hash;
	u_long newncsize;
	int i;

	newncsize = newmaxvnodes * ncsizefactor;
	newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
	if (newmaxvnodes < numbucketlocks)
		newmaxvnodes = numbucketlocks;

	new_nchashtbl = hashinit(newmaxvnodes, M_VFSCACHE, &new_nchash);
	/* If same hash table size, nothing to do */
	if (nchash == new_nchash) {
		free(new_nchashtbl, M_VFSCACHE);
		return;
	}
	/*
	 * Move everything from the old hash table to the new table.
	 * While we hold all of the vnode and bucket locks, no namecache
	 * entry can be added or removed behind our back, so the wholesale
	 * move is safe.
	 */
	cache_lock_all_vnodes();
	cache_lock_all_buckets();
	old_nchashtbl = nchashtbl;
	old_nchash = nchash;
	nchashtbl = new_nchashtbl;
	nchash = new_nchash;
	for (i = 0; i <= old_nchash; i++) {
		while ((ncp = LIST_FIRST(&old_nchashtbl[i])) != NULL) {
			hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
			    ncp->nc_dvp);
			LIST_REMOVE(ncp, nc_hash);
			LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
		}
	}
	ncsize = newncsize;
	cache_unlock_all_buckets();
	cache_unlock_all_vnodes();
	free(old_nchashtbl, M_VFSCACHE);
}

/*
 * Invalidate all entries from and to a particular vnode.
 */
void
cache_purge(struct vnode *vp)
{
	TAILQ_HEAD(, namecache) ncps;
	struct namecache *ncp, *nnp;
	struct mtx *vlp, *vlp2;

	CTR1(KTR_VFS, "cache_purge(%p)", vp);
	SDT_PROBE1(vfs, namecache, purge, done, vp);
	if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
	    vp->v_cache_dd == NULL)
		return;
	TAILQ_INIT(&ncps);
	vlp = VP2VNODELOCK(vp);
	vlp2 = NULL;
	mtx_lock(vlp);
retry:
	while (!LIST_EMPTY(&vp->v_cache_src)) {
		ncp = LIST_FIRST(&vp->v_cache_src);
		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
			goto retry;
		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
	}
	while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
		ncp = TAILQ_FIRST(&vp->v_cache_dst);
		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
			goto retry;
		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
	}
	ncp = vp->v_cache_dd;
	if (ncp != NULL) {
		KASSERT(ncp->nc_flag & NCF_ISDOTDOT, ("lost dotdot link"));
		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
			goto retry;
		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
	}
	KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
	mtx_unlock(vlp);
	if (vlp2 != NULL)
		mtx_unlock(vlp2);
	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
		cache_free(ncp);
	}
}
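
/*
 * Example (illustrative sketch): cache_purge() is what a filesystem reaches
 * for when a vnode's identity changes and every cached name involving it
 * must go, e.g. in a hypothetical rename implementation:
 *
 *	error = myfs_rename_locked(fdvp, fvp, tdvp, tvp, fcnp, tcnp);
 *	if (error == 0) {
 *		cache_purge(fvp);		(old name of the moved vnode)
 *		if (tvp != NULL)
 *			cache_purge(tvp);	(name of the replaced vnode)
 *	}
 *
 * myfs_rename_locked() is made up for the example; the cache_purge() calls
 * mirror the pattern in-tree filesystems follow once a rename commits.
 */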

/*
 * Invalidate all negative entries for a particular directory vnode.
 */
void
cache_purge_negative(struct vnode *vp)
{
	TAILQ_HEAD(, namecache) ncps;
	struct namecache *ncp, *nnp;
	struct mtx *vlp;

	CTR1(KTR_VFS, "cache_purge_negative(%p)", vp);
	SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
	if (LIST_EMPTY(&vp->v_cache_src))
		return;
	TAILQ_INIT(&ncps);
	vlp = VP2VNODELOCK(vp);
	mtx_lock(vlp);
	LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
		if (!(ncp->nc_flag & NCF_NEGATIVE))
			continue;
		cache_zap_negative_locked_vnode_kl(ncp, vp);
		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
	}
	mtx_unlock(vlp);
	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
		cache_free(ncp);
	}
}

/*
 * Flush all entries referencing a particular filesystem.
 */
void
cache_purgevfs(struct mount *mp, bool force)
{
	TAILQ_HEAD(, namecache) ncps;
	struct mtx *vlp1, *vlp2;
	struct rwlock *blp;
	struct nchashhead *bucket;
	struct namecache *ncp, *nnp;
	u_long i, j, n_nchash;
	int error;

	/* Scan hash tables for applicable entries */
	SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
	if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes)
		return;
	TAILQ_INIT(&ncps);
	n_nchash = nchash + 1;
	vlp1 = vlp2 = NULL;
	for (i = 0; i < numbucketlocks; i++) {
		blp = (struct rwlock *)&bucketlocks[i];
		rw_wlock(blp);
		for (j = i; j < n_nchash; j += numbucketlocks) {
retry:
			bucket = &nchashtbl[j];
			LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) {
				cache_assert_bucket_locked(ncp, RA_WLOCKED);
				if (ncp->nc_dvp->v_mount != mp)
					continue;
				error = cache_zap_wlocked_bucket_kl(ncp, blp,
				    &vlp1, &vlp2);
				if (error != 0)
					goto retry;
				TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst);
			}
		}
		rw_wunlock(blp);
		if (vlp1 == NULL && vlp2 == NULL)
			cache_maybe_yield();
	}
	if (vlp1 != NULL)
		mtx_unlock(vlp1);
	if (vlp2 != NULL)
		mtx_unlock(vlp2);

	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
		cache_free(ncp);
	}
}

/*
 * Perform canonical checks and cache lookup and pass on to the filesystem
 * through VOP_CACHEDLOOKUP() only if needed.
 */
int
vfs_cache_lookup(struct vop_lookup_args *ap)
{
	struct vnode *dvp;
	int error;
	struct vnode **vpp = ap->a_vpp;
	struct componentname *cnp = ap->a_cnp;
	int flags = cnp->cn_flags;

	*vpp = NULL;
	dvp = ap->a_dvp;

	if (dvp->v_type != VDIR)
		return (ENOTDIR);

	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
		return (EROFS);

	error = vn_dir_check_exec(dvp, cnp);
	if (error != 0)
		return (error);

	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
	if (error == 0)
		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
	if (error == -1)
		return (0);
	return (error);
}
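
/*
 * Example (illustrative): a filesystem opts into the name cache by routing
 * VOP_LOOKUP() through vfs_cache_lookup() and providing its real lookup
 * routine as VOP_CACHEDLOOKUP(), roughly:
 *
 *	struct vop_vector myfs_vnodeops = {
 *		.vop_default		= &default_vnodeops,
 *		.vop_lookup		= vfs_cache_lookup,
 *		.vop_cachedlookup	= myfs_lookup,
 *		(...)
 *	};
 *
 * myfs_vnodeops and myfs_lookup are placeholders; the pairing of the two
 * lookup entries reflects the pattern used by in-tree filesystems such as
 * UFS.
 */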

/*
 * Implementation of the getcwd syscall.
 */
int
sys___getcwd(struct thread *td, struct __getcwd_args *uap)
{
	char *buf, *retbuf;
	size_t buflen;
	int error;

	buflen = uap->buflen;
	if (__predict_false(buflen < 2))
		return (EINVAL);
	if (buflen > MAXPATHLEN)
		buflen = MAXPATHLEN;

	buf = malloc(buflen, M_TEMP, M_WAITOK);
	error = vn_getcwd(td, buf, &retbuf, &buflen);
	if (error == 0)
		error = copyout(retbuf, uap->buf, buflen);
	free(buf, M_TEMP);
	return (error);
}

int
vn_getcwd(struct thread *td, char *buf, char **retbuf, size_t *buflen)
{
	struct pwd *pwd;
	int error;

	pwd = pwd_hold(td);
	error = vn_fullpath_any(td, pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf,
	    buflen);
	pwd_drop(pwd);

#ifdef KTRACE
	if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
		ktrnamei(*retbuf);
#endif
	return (error);
}

static int
kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
    size_t size, int flags, enum uio_seg pathseg)
{
	struct nameidata nd;
	char *retbuf, *freebuf;
	int error;

	if (flags != 0)
		return (EINVAL);
	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1,
	    pathseg, path, fd, &cap_fstat_rights, td);
	if ((error = namei(&nd)) != 0)
		return (error);
	error = vn_fullpath_hardlink(td, &nd, &retbuf, &freebuf, &size);
	if (error == 0) {
		error = copyout(retbuf, buf, size);
		free(freebuf, M_TEMP);
	}
	NDFREE(&nd, 0);
	return (error);
}

int
sys___realpathat(struct thread *td, struct __realpathat_args *uap)
{

	return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
	    uap->flags, UIO_USERSPACE));
}

/*
 * Retrieve the full filesystem path that corresponds to a vnode from the name
 * cache (if available).
 */
int
vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
{
	struct pwd *pwd;
	char *buf;
	size_t buflen;
	int error;

	if (__predict_false(vn == NULL))
		return (EINVAL);

	buflen = MAXPATHLEN;
	buf = malloc(buflen, M_TEMP, M_WAITOK);
	pwd = pwd_hold(td);
	error = vn_fullpath_any(td, vn, pwd->pwd_rdir, buf, retbuf, &buflen);
	pwd_drop(pwd);

	if (!error)
		*freebuf = buf;
	else
		free(buf, M_TEMP);
	return (error);
}
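
/*
 * Example (illustrative): the common calling pattern tolerates failure by
 * pre-loading a fallback and only freeing the buffer when one was handed
 * back, e.g. when formatting a diagnostic message:
 *
 *	char *fullpath, *freepath;
 *
 *	fullpath = "unknown";
 *	freepath = NULL;
 *	vn_fullpath(curthread, vp, &fullpath, &freepath);
 *	printf("accessing %s\n", fullpath);
 *	if (freepath != NULL)
 *		free(freepath, M_TEMP);
 *
 * The printf() target is invented for the example; the fallback-and-free
 * shape follows callers elsewhere in the tree.
 */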

/*
 * This function is similar to vn_fullpath, but it attempts to look up the
 * pathname relative to the global root mount point.  This is required for the
 * auditing sub-system, as audited pathnames must be absolute, relative to the
 * global root mount point.
 */
int
vn_fullpath_global(struct thread *td, struct vnode *vn,
    char **retbuf, char **freebuf)
{
	char *buf;
	size_t buflen;
	int error;

	if (__predict_false(vn == NULL))
		return (EINVAL);
	buflen = MAXPATHLEN;
	buf = malloc(buflen, M_TEMP, M_WAITOK);
	error = vn_fullpath_any(td, vn, rootvnode, buf, retbuf, &buflen);
	if (!error)
		*freebuf = buf;
	else
		free(buf, M_TEMP);
	return (error);
}

int
vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, size_t *buflen)
{
	struct vnode *dvp;
	struct namecache *ncp;
	struct mtx *vlp;
	int error;

	vlp = VP2VNODELOCK(*vp);
	mtx_lock(vlp);
	TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) {
		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
			break;
	}
	if (ncp != NULL) {
		if (*buflen < ncp->nc_nlen) {
			mtx_unlock(vlp);
			vrele(*vp);
			counter_u64_add(numfullpathfail4, 1);
			error = ENOMEM;
			SDT_PROBE3(vfs, namecache, fullpath, return, error,
			    vp, NULL);
			return (error);
		}
		*buflen -= ncp->nc_nlen;
		memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
		SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
		    ncp->nc_name, vp);
		dvp = *vp;
		*vp = ncp->nc_dvp;
		vref(*vp);
		mtx_unlock(vlp);
		vrele(dvp);
		return (0);
	}
	SDT_PROBE1(vfs, namecache, fullpath, miss, vp);

	mtx_unlock(vlp);
	vn_lock(*vp, LK_SHARED | LK_RETRY);
	error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen);
	vput(*vp);
	if (error) {
		counter_u64_add(numfullpathfail2, 1);
		SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
		return (error);
	}

	*vp = dvp;
	if (VN_IS_DOOMED(dvp)) {
		/* forced unmount */
		vrele(dvp);
		error = ENOENT;
		SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
		return (error);
	}
	/*
	 * *vp has its use count incremented still.
	 */

	return (0);
}
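
/*
 * vn_vptocnp() fills the caller's buffer from the end: on entry *buflen is
 * the amount of free space, on a hit it is decremented by the name length
 * and the component is copied to buf + *buflen, while *vp is swapped (with
 * the reference moved) to the parent.  Worked example (hypothetical sizes):
 * with a 16-byte buffer, *buflen == 16 and a cached name "bin", the call
 * returns with *buflen == 13 and buf[13..15] holding 'b' 'i' 'n'; the
 * callers below then prepend '/' themselves as they walk toward the root.
 */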

/*
 * Resolve a directory to a pathname.
 *
 * The name of the directory can always be found in the namecache or fetched
 * from the filesystem.  There is also guaranteed to be only one parent,
 * meaning we can just follow vnodes up until we find the root.
 *
 * The vnode must be referenced.
 */
static int
vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir,
    char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend)
{
#ifdef KDTRACE_HOOKS
	struct vnode *startvp = vp;
#endif
	struct vnode *vp1;
	size_t buflen;
	int error;

	VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
	VNPASS(vp->v_usecount > 0, vp);

	buflen = *len;

	if (!slash_prefixed) {
		MPASS(*len >= 2);
		buflen--;
		buf[buflen] = '\0';
	}

	error = 0;

	SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
	counter_u64_add(numfullpathcalls, 1);
	while (vp != rdir && vp != rootvnode) {
		/*
		 * The vp vnode must be already fully constructed,
		 * since it is either found in namecache or obtained
		 * from VOP_VPTOCNP().  We may test for VV_ROOT safely
		 * without obtaining the vnode lock.
		 */
		if ((vp->v_vflag & VV_ROOT) != 0) {
			vn_lock(vp, LK_RETRY | LK_SHARED);

			/*
			 * With the vnode locked, check for races with
			 * unmount, forced or not.  Note that we
			 * already verified that vp is not equal to
			 * the root vnode, which means that
			 * mnt_vnodecovered can be NULL only for the
			 * case of unmount.
			 */
			if (VN_IS_DOOMED(vp) ||
			    (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
			    vp1->v_mountedhere != vp->v_mount) {
				vput(vp);
				error = ENOENT;
				SDT_PROBE3(vfs, namecache, fullpath, return,
				    error, vp, NULL);
				break;
			}

			vref(vp1);
			vput(vp);
			vp = vp1;
			continue;
		}
		if (vp->v_type != VDIR) {
			vrele(vp);
			counter_u64_add(numfullpathfail1, 1);
			error = ENOTDIR;
			SDT_PROBE3(vfs, namecache, fullpath, return,
			    error, vp, NULL);
			break;
		}
		error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen);
		if (error)
			break;
		if (buflen == 0) {
			vrele(vp);
			error = ENOMEM;
			SDT_PROBE3(vfs, namecache, fullpath, return, error,
			    startvp, NULL);
			break;
		}
		buf[--buflen] = '/';
		slash_prefixed = true;
	}
	if (error)
		return (error);
	if (!slash_prefixed) {
		if (buflen == 0) {
			vrele(vp);
			counter_u64_add(numfullpathfail4, 1);
			SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
			    startvp, NULL);
			return (ENOMEM);
		}
		buf[--buflen] = '/';
	}
	counter_u64_add(numfullpathfound, 1);
	vrele(vp);

	*retbuf = buf + buflen;
	SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
	*len -= buflen;
	*len += addend;
	return (0);
}

/*
 * Resolve an arbitrary vnode to a pathname.
 *
 * Note two caveats:
 * - hardlinks are not tracked, thus if the vnode is not a directory this can
 *   resolve to a different path than the one used to find it
 * - namecache is not mandatory, meaning names are not guaranteed to be added
 *   (in which case resolving fails)
 */
static int
vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir,
    char *buf, char **retbuf, size_t *buflen)
{
	size_t orig_buflen;
	bool slash_prefixed;
	int error;

	if (*buflen < 2)
		return (EINVAL);

	orig_buflen = *buflen;

	vref(vp);
	slash_prefixed = false;
	if (vp->v_type != VDIR) {
		*buflen -= 1;
		buf[*buflen] = '\0';
		error = vn_vptocnp(&vp, td->td_ucred, buf, buflen);
		if (error)
			return (error);
		if (*buflen == 0) {
			vrele(vp);
			return (ENOMEM);
		}
		*buflen -= 1;
		buf[*buflen] = '/';
		slash_prefixed = true;
	}

	return (vn_fullpath_dir(td, vp, rdir, buf, retbuf, buflen, slash_prefixed,
	    orig_buflen - *buflen));
}
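
/*
 * Worked example (hypothetical path and sizes): resolving a regular file
 * cached as "vi" under "bin" under "usr" builds the string backwards.
 * vn_fullpath_any() first stores the terminating nul and "vi", then a '/',
 * and vn_fullpath_dir() keeps prepending "bin", '/', "usr" and the leading
 * '/' until the root is reached, so *retbuf ends up pointing at
 * "/usr/bin/vi" near the end of the buffer and *buflen comes back as 12
 * (11 characters plus the nul).
 */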

/*
 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
 *
 * Since the namecache does not track hardlinks, the caller is expected to
 * first look up the target vnode with SAVENAME | WANTPARENT flags passed to
 * namei.
 *
 * Then we have two cases:
 * - if the found vnode is a directory, the path can be constructed just by
 *   following names up the chain
 * - otherwise we populate the buffer with the saved name and start resolving
 *   from the parent
 */
static int
vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf,
    char **freebuf, size_t *buflen)
{
	char *buf, *tmpbuf;
	struct pwd *pwd;
	struct componentname *cnp;
	struct vnode *vp;
	size_t addend;
	int error;
	bool slash_prefixed;

	if (*buflen < 2)
		return (EINVAL);
	if (*buflen > MAXPATHLEN)
		*buflen = MAXPATHLEN;

	slash_prefixed = false;

	buf = malloc(*buflen, M_TEMP, M_WAITOK);
	pwd = pwd_hold(td);

	addend = 0;
	vp = ndp->ni_vp;
	if (vp->v_type != VDIR) {
		cnp = &ndp->ni_cnd;
		addend = cnp->cn_namelen + 2;
		if (*buflen < addend) {
			error = ENOMEM;
			goto out_bad;
		}
		*buflen -= addend;
		tmpbuf = buf + *buflen;
		tmpbuf[0] = '/';
		memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen);
		tmpbuf[addend - 1] = '\0';
		slash_prefixed = true;
		vp = ndp->ni_dvp;
	}

	vref(vp);
	error = vn_fullpath_dir(td, vp, pwd->pwd_rdir, buf, retbuf, buflen,
	    slash_prefixed, addend);
	if (error != 0)
		goto out_bad;

	pwd_drop(pwd);
	*freebuf = buf;

	return (0);
out_bad:
	pwd_drop(pwd);
	free(buf, M_TEMP);
	return (error);
}

struct vnode *
vn_dir_dd_ino(struct vnode *vp)
{
	struct namecache *ncp;
	struct vnode *ddvp;
	struct mtx *vlp;
	enum vgetstate vs;

	ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
	vlp = VP2VNODELOCK(vp);
	mtx_lock(vlp);
	TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
			continue;
		ddvp = ncp->nc_dvp;
		vs = vget_prep(ddvp);
		mtx_unlock(vlp);
		if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
			return (NULL);
		return (ddvp);
	}
	mtx_unlock(vlp);
	return (NULL);
}

int
vn_commname(struct vnode *vp, char *buf, u_int buflen)
{
	struct namecache *ncp;
	struct mtx *vlp;
	int l;

	vlp = VP2VNODELOCK(vp);
	mtx_lock(vlp);
	TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
			break;
	if (ncp == NULL) {
		mtx_unlock(vlp);
		return (ENOENT);
	}
	l = min(ncp->nc_nlen, buflen - 1);
	memcpy(buf, ncp->nc_name, l);
	mtx_unlock(vlp);
	buf[l] = '\0';
	return (0);
}
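
/*
 * Example (illustrative): vn_commname() is handy when only the last
 * component is wanted and a full path walk would be overkill, e.g. for a
 * debugging printf:
 *
 *	char name[NAME_MAX + 1];
 *
 *	if (vn_commname(vp, name, sizeof(name)) == 0)
 *		printf("vnode %p is named \"%s\"\n", vp, name);
 *	else
 *		printf("vnode %p has no cached name\n", vp);
 *
 * The printf()s are invented for the example; note the function can only
 * report a name that is still present in the cache.
 */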

/*
 * This function updates the path string to the vnode's full global path
 * and checks the size of the new path string against the pathlen argument.
 *
 * Requires a locked, referenced vnode.
 * Vnode is re-locked on success or ENODEV, otherwise unlocked.
 *
 * If vp is a directory, the call to vn_fullpath_global() always succeeds
 * because it falls back to the ".." lookup if the namecache lookup fails.
 */
int
vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
    u_int pathlen)
{
	struct nameidata nd;
	struct vnode *vp1;
	char *rpath, *fbuf;
	int error;

	ASSERT_VOP_ELOCKED(vp, __func__);

	/* Construct global filesystem path from vp. */
	VOP_UNLOCK(vp);
	error = vn_fullpath_global(td, vp, &rpath, &fbuf);

	if (error != 0) {
		vrele(vp);
		return (error);
	}

	if (strlen(rpath) >= pathlen) {
		vrele(vp);
		error = ENAMETOOLONG;
		goto out;
	}

	/*
	 * Re-lookup the vnode by path to detect a possible rename.
	 * As a side effect, the vnode is relocked.
	 * If vnode was renamed, return ENOENT.
	 */
	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
	    UIO_SYSSPACE, path, td);
	error = namei(&nd);
	if (error != 0) {
		vrele(vp);
		goto out;
	}
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vp1 = nd.ni_vp;
	vrele(vp);
	if (vp1 == vp)
		strcpy(path, rpath);
	else {
		vput(vp1);
		error = ENOENT;
	}

out:
	free(fbuf, M_TEMP);
	return (error);
}

#ifdef DDB
static void
db_print_vpath(struct vnode *vp)
{

	while (vp != NULL) {
		db_printf("%p: ", vp);
		if (vp == rootvnode) {
			db_printf("/");
			vp = NULL;
		} else {
			if (vp->v_vflag & VV_ROOT) {
				db_printf("<mount point>");
				vp = vp->v_mount->mnt_vnodecovered;
			} else {
				struct namecache *ncp;
				char *ncn;
				int i;

				ncp = TAILQ_FIRST(&vp->v_cache_dst);
				if (ncp != NULL) {
					ncn = ncp->nc_name;
					for (i = 0; i < ncp->nc_nlen; i++)
						db_printf("%c", *ncn++);
					vp = ncp->nc_dvp;
				} else {
					vp = NULL;
				}
			}
		}
		db_printf("\n");
	}

	return;
}

DB_SHOW_COMMAND(vpath, db_show_vpath)
{
	struct vnode *vp;

	if (!have_addr) {
		db_printf("usage: show vpath <struct vnode *>\n");
		return;
	}

	vp = (struct vnode *)addr;
	db_print_vpath(vp);
}

#endif
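
/*
 * Example DDB session (illustrative addresses): given a vnode pointer, the
 * command above prints one cached name per line while walking toward the
 * root, e.g.:
 *
 *	db> show vpath 0xfffff80012345678
 *	0xfffff80012345678: vi
 *	0xfffff800deadbeef: bin
 *	0xfffff800cafebabe: usr
 *	0xfffff800aaaaaaaa: /
 *
 * Output stops early, with nothing after the colon, when no entry is cached
 * for an intermediate vnode.
 */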