/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Poul-Henning Kamp of the FreeBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/counter.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sdt.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vnode.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#ifdef DDB
#include <ddb/ddb.h>
#endif

#include <vm/uma.h>

SDT_PROVIDER_DECLARE(vfs);
SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
    "char *", "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
    "struct vnode *", "char *");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
    "struct vnode *", "char *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
    "char *");

/*
 * This structure describes the elements in the cache of recent
 * names looked up by namei.
 */

struct namecache {
	LIST_ENTRY(namecache) nc_hash;	/* hash chain */
	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
	struct vnode *nc_dvp;		/* vnode of parent of name */
	union {
		struct vnode *nu_vp;	/* vnode the name refers to */
	} n_un;
	u_char	nc_flag;		/* flag bits */
	u_char	nc_nlen;		/* length of name */
	char	nc_name[0];		/* segment name + nul */
};

/*
 * struct namecache_ts repeats struct namecache layout up to the
 * nc_nlen member.
 * struct namecache_ts is used in place of struct namecache when time(s) need
 * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
 * both a non-dotdot directory name plus dotdot for the directory's
 * parent.
 */
struct namecache_ts {
	struct timespec	nc_time;	/* timespec provided by fs */
	struct timespec	nc_dotdottime;	/* dotdot timespec provided by fs */
	int		nc_ticks;	/* ticks value when entry was added */
	struct namecache nc_nc;
};

#define	nc_vp		n_un.nu_vp

/*
 * Flags in namecache.nc_flag
 */
#define	NCF_WHITE	0x01
#define	NCF_ISDOTDOT	0x02
#define	NCF_TS		0x04
#define	NCF_DTS		0x08
#define	NCF_DVDROP	0x10
#define	NCF_NEGATIVE	0x20
#define	NCF_HOTNEGATIVE	0x40

/*
 * Name caching works as follows:
 *
 * Names found by directory scans are retained in a cache
 * for future reference.  It is managed LRU, so frequently
 * used names will hang around.  Cache is indexed by hash value
 * obtained from (dvp, name) where dvp refers to the directory
 * containing name.
 *
 * If it is a "negative" entry, (i.e. for a name that is known NOT to
 * exist) the vnode pointer will be NULL.
 *
 * Upon reaching the last segment of a path, if the reference
 * is for DELETE, or NOCACHE is set (rewrite), and the
 * name is located in the cache, it will be dropped.
 *
 * These locks are used (in the order in which they can be taken):
 * NAME		TYPE	ROLE
 * vnodelock	mtx	vnode lists and v_cache_dd field protection
 * bucketlock	rwlock	for access to given set of hash buckets
 * neglist	mtx	negative entry LRU management
 *
 * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
 * shrinking the LRU list.
 *
 * It is legal to take multiple vnodelock and bucketlock locks.  The locking
 * order is lower address first.  Both are recursive.
 *
 * "." lookups are lockless.
 *
 * ".." and vnode -> name lookups require vnodelock.
 *
 * name -> vnode lookup requires the relevant bucketlock to be held for reading.
 *
 * Insertions and removals of entries require involved vnodes and bucketlocks
 * to be write-locked to prevent other threads from seeing the entry.
 *
 * Some lookups result in removal of the found entry (e.g. getting rid of a
 * negative entry with the intent to create a positive one), which poses a
 * problem when multiple threads reach the state.  Similarly, two different
 * threads can purge two different vnodes and try to remove the same name.
 *
 * If the already held vnode lock is lower than the second required lock, we
 * can just take the other lock.  However, in the opposite case, this could
 * deadlock.  As such, this is resolved by trylocking and if that fails
 * unlocking the first node, locking everything in order and revalidating the
 * state.
 */

/*
 * Structures associated with name caching.
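 *
 * The cache proper is the nchashtbl hash table, keyed by (dvp, name) via
 * NCHHASH().  Access to the buckets is striped across the bucketlocks
 * array, while the per-vnode lists and the negative entry lists have their
 * own lock pools (vnodelocks and neglists below); see the locking notes
 * above.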
 */
#define	NCHHASH(hash) \
	(&nchashtbl[(hash) & nchash])
static __read_mostly LIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
static u_long __read_mostly	nchash;			/* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
    "Size of namecache hash table");
static u_long __read_mostly	ncnegfactor = 5;	/* ratio of negative entries */
SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of negative namecache entries");
static u_long __exclusive_cache_line	numneg;	/* number of negative entries allocated */
static u_long __exclusive_cache_line	numcache;/* number of cache entries allocated */
u_int ncsizefactor = 2;
SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
    "Size factor for namecache");
static u_int __read_mostly	ncpurgeminvnodes;
SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0,
    "Number of vnodes below which purgevfs ignores the request");
static u_int __read_mostly	ncsize;	/* the size as computed on creation or resizing */

struct nchstats	nchstats;		/* cache effectiveness statistics */

static struct mtx __exclusive_cache_line	ncneg_shrink_lock;
static int	shrink_list_turn;

struct neglist {
	struct mtx		nl_lock;
	TAILQ_HEAD(, namecache) nl_list;
} __aligned(CACHE_LINE_SIZE);

static struct neglist __read_mostly	*neglists;
static struct neglist ncneg_hot;
static u_long numhotneg;

#define	numneglists	(ncneghash + 1)
static u_int __read_mostly	ncneghash;
static inline struct neglist *
NCP2NEGLIST(struct namecache *ncp)
{

	return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
}

#define	numbucketlocks (ncbuckethash + 1)
static u_int __read_mostly	ncbuckethash;
static struct rwlock_padalign __read_mostly	*bucketlocks;
#define	HASH2BUCKETLOCK(hash) \
	((struct rwlock *)(&bucketlocks[((hash) & ncbuckethash)]))

#define	numvnodelocks (ncvnodehash + 1)
static u_int __read_mostly	ncvnodehash;
static struct mtx __read_mostly *vnodelocks;
static inline struct mtx *
VP2VNODELOCK(struct vnode *vp)
{

	return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
}

/*
 * UMA zones for the VFS cache.
 *
 * The small cache is used for entries with short names, which are the
 * most common.  The large cache is used for entries which are too big to
 * fit in the small cache.
 */
static uma_zone_t __read_mostly cache_zone_small;
static uma_zone_t __read_mostly cache_zone_small_ts;
static uma_zone_t __read_mostly cache_zone_large;
static uma_zone_t __read_mostly cache_zone_large_ts;

#define	CACHE_PATH_CUTOFF	35

static struct namecache *
cache_alloc(int len, int ts)
{
	struct namecache_ts *ncp_ts;
	struct namecache *ncp;

	if (__predict_false(ts)) {
		if (len <= CACHE_PATH_CUTOFF)
			ncp_ts = uma_zalloc(cache_zone_small_ts, M_WAITOK);
		else
			ncp_ts = uma_zalloc(cache_zone_large_ts, M_WAITOK);
		ncp = &ncp_ts->nc_nc;
	} else {
		if (len <= CACHE_PATH_CUTOFF)
			ncp = uma_zalloc(cache_zone_small, M_WAITOK);
		else
			ncp = uma_zalloc(cache_zone_large, M_WAITOK);
	}
	return (ncp);
}

static void
cache_free(struct namecache *ncp)
{
	struct namecache_ts *ncp_ts;

	if (ncp == NULL)
		return;
	if ((ncp->nc_flag & NCF_DVDROP) != 0)
		vdrop(ncp->nc_dvp);
	if (__predict_false(ncp->nc_flag & NCF_TS)) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
			uma_zfree(cache_zone_small_ts, ncp_ts);
		else
			uma_zfree(cache_zone_large_ts, ncp_ts);
	} else {
		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
			uma_zfree(cache_zone_small, ncp);
		else
			uma_zfree(cache_zone_large, ncp);
	}
}

static void
cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
{
	struct namecache_ts *ncp_ts;

	KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
	    (tsp == NULL && ticksp == NULL),
	    ("No NCF_TS"));

	if (tsp == NULL && ticksp == NULL)
		return;

	ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
	if (tsp != NULL)
		*tsp = ncp_ts->nc_time;
	if (ticksp != NULL)
		*ticksp = ncp_ts->nc_ticks;
}

#ifdef DEBUG_CACHE
static int __read_mostly	doingcache = 1;	/* 1 => enable the cache */
SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
    "VFS namecache enabled");
#endif

/* Export size information to userland */
SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
    sizeof(struct namecache), "sizeof(struct namecache)");

/*
 * The new name cache statistics
 */
static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0,
    "Name cache statistics");
#define	STATNODE_ULONG(name, descr) \
	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr);
#define	STATNODE_COUNTER(name, descr) \
	static counter_u64_t __read_mostly name; \
	SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, descr);
STATNODE_ULONG(numneg, "Number of negative cache entries");
STATNODE_ULONG(numcache, "Number of cache entries");
STATNODE_COUNTER(numcachehv, "Number of namecache entries with vnodes held");
STATNODE_COUNTER(numcalls, "Number of cache lookups");
STATNODE_COUNTER(dothits, "Number of '.' hits");
STATNODE_COUNTER(dotdothits, "Number of '..' hits");
hits"); 359 STATNODE_COUNTER(numchecks, "Number of checks in lookup"); 360 STATNODE_COUNTER(nummiss, "Number of cache misses"); 361 STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache"); 362 STATNODE_COUNTER(numposzaps, 363 "Number of cache hits (positive) we do not want to cache"); 364 STATNODE_COUNTER(numposhits, "Number of cache hits (positive)"); 365 STATNODE_COUNTER(numnegzaps, 366 "Number of cache hits (negative) we do not want to cache"); 367 STATNODE_COUNTER(numneghits, "Number of cache hits (negative)"); 368 /* These count for vn_getcwd(), too. */ 369 STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls"); 370 STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)"); 371 STATNODE_COUNTER(numfullpathfail2, 372 "Number of fullpath search errors (VOP_VPTOCNP failures)"); 373 STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); 374 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls"); 375 STATNODE_COUNTER(zap_and_exit_bucket_relock_success, 376 "Number of successful removals after relocking"); 377 static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail, 378 "Number of times zap_and_exit failed to lock"); 379 static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2, 380 "Number of times zap_and_exit failed to lock"); 381 static long cache_lock_vnodes_cel_3_failures; 382 STATNODE_ULONG(cache_lock_vnodes_cel_3_failures, 383 "Number of times 3-way vnode locking failed"); 384 STATNODE_ULONG(numhotneg, "Number of hot negative entries"); 385 STATNODE_COUNTER(numneg_evicted, 386 "Number of negative entries evicted when adding a new entry"); 387 STATNODE_COUNTER(shrinking_skipped, 388 "Number of times shrinking was already in progress"); 389 390 static void cache_zap_locked(struct namecache *ncp, bool neg_locked); 391 static int vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf, 392 char **freebuf, size_t *buflen); 393 static int vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir, 394 char *buf, char **retbuf, size_t *buflen); 395 static int vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir, 396 char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend); 397 398 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); 399 400 static int cache_yield; 401 SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0, 402 "Number of times cache called yield"); 403 404 static void __noinline 405 cache_maybe_yield(void) 406 { 407 408 if (should_yield()) { 409 cache_yield++; 410 kern_yield(PRI_USER); 411 } 412 } 413 414 static inline void 415 cache_assert_vlp_locked(struct mtx *vlp) 416 { 417 418 if (vlp != NULL) 419 mtx_assert(vlp, MA_OWNED); 420 } 421 422 static inline void 423 cache_assert_vnode_locked(struct vnode *vp) 424 { 425 struct mtx *vlp; 426 427 vlp = VP2VNODELOCK(vp); 428 cache_assert_vlp_locked(vlp); 429 } 430 431 static uint32_t 432 cache_get_hash(char *name, u_char len, struct vnode *dvp) 433 { 434 uint32_t hash; 435 436 hash = fnv_32_buf(name, len, FNV1_32_INIT); 437 hash = fnv_32_buf(&dvp, sizeof(dvp), hash); 438 return (hash); 439 } 440 441 static inline struct rwlock * 442 NCP2BUCKETLOCK(struct namecache *ncp) 443 { 444 uint32_t hash; 445 446 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); 447 return (HASH2BUCKETLOCK(hash)); 448 } 449 450 #ifdef INVARIANTS 451 static void 452 
cache_assert_bucket_locked(struct namecache *ncp, int mode)
{
	struct rwlock *blp;

	blp = NCP2BUCKETLOCK(ncp);
	rw_assert(blp, mode);
}
#else
#define	cache_assert_bucket_locked(x, y) do { } while (0)
#endif

#define	cache_sort_vnodes(x, y)	_cache_sort_vnodes((void **)(x), (void **)(y))
static void
_cache_sort_vnodes(void **p1, void **p2)
{
	void *tmp;

	MPASS(*p1 != NULL || *p2 != NULL);

	if (*p1 > *p2) {
		tmp = *p2;
		*p2 = *p1;
		*p1 = tmp;
	}
}

static void
cache_lock_all_buckets(void)
{
	u_int i;

	for (i = 0; i < numbucketlocks; i++)
		rw_wlock(&bucketlocks[i]);
}

static void
cache_unlock_all_buckets(void)
{
	u_int i;

	for (i = 0; i < numbucketlocks; i++)
		rw_wunlock(&bucketlocks[i]);
}

static void
cache_lock_all_vnodes(void)
{
	u_int i;

	for (i = 0; i < numvnodelocks; i++)
		mtx_lock(&vnodelocks[i]);
}

static void
cache_unlock_all_vnodes(void)
{
	u_int i;

	for (i = 0; i < numvnodelocks; i++)
		mtx_unlock(&vnodelocks[i]);
}

static int
cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	cache_sort_vnodes(&vlp1, &vlp2);

	if (vlp1 != NULL) {
		if (!mtx_trylock(vlp1))
			return (EAGAIN);
	}
	if (!mtx_trylock(vlp2)) {
		if (vlp1 != NULL)
			mtx_unlock(vlp1);
		return (EAGAIN);
	}

	return (0);
}

static void
cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	MPASS(vlp1 != NULL || vlp2 != NULL);
	MPASS(vlp1 <= vlp2);

	if (vlp1 != NULL)
		mtx_lock(vlp1);
	if (vlp2 != NULL)
		mtx_lock(vlp2);
}

static void
cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	MPASS(vlp1 != NULL || vlp2 != NULL);

	if (vlp1 != NULL)
		mtx_unlock(vlp1);
	if (vlp2 != NULL)
		mtx_unlock(vlp2);
}

static int
sysctl_nchstats(SYSCTL_HANDLER_ARGS)
{
	struct nchstats snap;

	if (req->oldptr == NULL)
		return (SYSCTL_OUT(req, 0, sizeof(snap)));

	snap = nchstats;
	snap.ncs_goodhits = counter_u64_fetch(numposhits);
	snap.ncs_neghits = counter_u64_fetch(numneghits);
	snap.ncs_badhits = counter_u64_fetch(numposzaps) +
	    counter_u64_fetch(numnegzaps);
	snap.ncs_miss = counter_u64_fetch(nummisszap) +
	    counter_u64_fetch(nummiss);

	return (SYSCTL_OUT(req, &snap, sizeof(snap)));
}
SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
    CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
    "VFS cache effectiveness statistics");

#ifdef DIAGNOSTIC
/*
 * Grab an atomic snapshot of the name cache hash chain lengths
 */
static SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL,
    "hash table stats");

static int
sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
{
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int i, error, n_nchash, *cntbuf;

retry:
	n_nchash = nchash + 1;	/* nchash is max index, not count */
	if (req->oldptr == NULL)
		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
	cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
	cache_lock_all_buckets();
	if (n_nchash != nchash + 1) {
		cache_unlock_all_buckets();
		free(cntbuf, M_TEMP);
		goto retry;
	}
	/* Scan hash tables counting entries */
	for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
		LIST_FOREACH(ncp, ncpp, nc_hash)
			cntbuf[i]++;
	cache_unlock_all_buckets();
	for (error = 0, i = 0; i < n_nchash; i++)
		if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
			break;
	free(cntbuf, M_TEMP);
	return (error);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
    "nchash chain lengths");

static int
sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int n_nchash;
	int count, maxlength, used, pct;

	if (!req->oldptr)
		return SYSCTL_OUT(req, 0, 4 * sizeof(int));

	cache_lock_all_buckets();
	n_nchash = nchash + 1;	/* nchash is max index, not count */
	used = 0;
	maxlength = 0;

	/* Scan hash tables for applicable entries */
	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
		count = 0;
		LIST_FOREACH(ncp, ncpp, nc_hash) {
			count++;
		}
		if (count)
			used++;
		if (maxlength < count)
			maxlength = count;
	}
	n_nchash = nchash + 1;
	cache_unlock_all_buckets();
	pct = (used * 100) / (n_nchash / 100);
	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &used, sizeof(used));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &pct, sizeof(pct));
	if (error)
		return (error);
	return (0);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
    "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
#endif

/*
 * Negative entries management
 *
 * A variation of LRU scheme is used.  New entries are hashed into one of
 * numneglists cold lists.  Entries get promoted to the hot list on first hit.
 *
 * The shrinker will demote hot list head and evict from the cold list in a
 * round-robin manner.
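 *
 * cache_negative_hit() performs the promotion, cache_negative_zap_one()
 * demotes the hot list head and evicts one cold entry, and cache_enter_time()
 * triggers the latter once numneg * ncnegfactor exceeds the number of cache
 * entries.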
 */
static void
cache_negative_hit(struct namecache *ncp)
{
	struct neglist *neglist;

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	if (ncp->nc_flag & NCF_HOTNEGATIVE)
		return;
	neglist = NCP2NEGLIST(ncp);
	mtx_lock(&ncneg_hot.nl_lock);
	mtx_lock(&neglist->nl_lock);
	if (!(ncp->nc_flag & NCF_HOTNEGATIVE)) {
		numhotneg++;
		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
		TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
		ncp->nc_flag |= NCF_HOTNEGATIVE;
	}
	mtx_unlock(&neglist->nl_lock);
	mtx_unlock(&ncneg_hot.nl_lock);
}

static void
cache_negative_insert(struct namecache *ncp, bool neg_locked)
{
	struct neglist *neglist;

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_bucket_locked(ncp, RA_WLOCKED);
	neglist = NCP2NEGLIST(ncp);
	if (!neg_locked) {
		mtx_lock(&neglist->nl_lock);
	} else {
		mtx_assert(&neglist->nl_lock, MA_OWNED);
	}
	TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
	if (!neg_locked)
		mtx_unlock(&neglist->nl_lock);
	atomic_add_rel_long(&numneg, 1);
}

static void
cache_negative_remove(struct namecache *ncp, bool neg_locked)
{
	struct neglist *neglist;
	bool hot_locked = false;
	bool list_locked = false;

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_bucket_locked(ncp, RA_WLOCKED);
	neglist = NCP2NEGLIST(ncp);
	if (!neg_locked) {
		if (ncp->nc_flag & NCF_HOTNEGATIVE) {
			hot_locked = true;
			mtx_lock(&ncneg_hot.nl_lock);
			if (!(ncp->nc_flag & NCF_HOTNEGATIVE)) {
				list_locked = true;
				mtx_lock(&neglist->nl_lock);
			}
		} else {
			list_locked = true;
			mtx_lock(&neglist->nl_lock);
		}
	}
	if (ncp->nc_flag & NCF_HOTNEGATIVE) {
		mtx_assert(&ncneg_hot.nl_lock, MA_OWNED);
		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
		numhotneg--;
	} else {
		mtx_assert(&neglist->nl_lock, MA_OWNED);
		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
	}
	if (list_locked)
		mtx_unlock(&neglist->nl_lock);
	if (hot_locked)
		mtx_unlock(&ncneg_hot.nl_lock);
	atomic_subtract_rel_long(&numneg, 1);
}

static void
cache_negative_shrink_select(int start, struct namecache **ncpp,
    struct neglist **neglistpp)
{
	struct neglist *neglist;
	struct namecache *ncp;
	int i;

	*ncpp = ncp = NULL;
	neglist = NULL;

	for (i = start; i < numneglists; i++) {
		neglist = &neglists[i];
		if (TAILQ_FIRST(&neglist->nl_list) == NULL)
			continue;
		mtx_lock(&neglist->nl_lock);
		ncp = TAILQ_FIRST(&neglist->nl_list);
		if (ncp != NULL)
			break;
		mtx_unlock(&neglist->nl_lock);
	}

	*neglistpp = neglist;
	*ncpp = ncp;
}

static void
cache_negative_zap_one(void)
{
	struct namecache *ncp, *ncp2;
	struct neglist *neglist;
	struct mtx *dvlp;
	struct rwlock *blp;

	if (mtx_owner(&ncneg_shrink_lock) != NULL ||
	    !mtx_trylock(&ncneg_shrink_lock)) {
		counter_u64_add(shrinking_skipped, 1);
		return;
	}

	mtx_lock(&ncneg_hot.nl_lock);
	ncp = TAILQ_FIRST(&ncneg_hot.nl_list);
	if (ncp != NULL) {
		neglist = NCP2NEGLIST(ncp);
		mtx_lock(&neglist->nl_lock);
		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
		TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
		ncp->nc_flag &= ~NCF_HOTNEGATIVE;
		numhotneg--;
		mtx_unlock(&neglist->nl_lock);
	}
	mtx_unlock(&ncneg_hot.nl_lock);

	cache_negative_shrink_select(shrink_list_turn, &ncp, &neglist);
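	/*
	 * Advance the round-robin cursor.  If nothing was found on the
	 * remaining lists and we wrapped, scan once more from the start.
	 */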
	shrink_list_turn++;
	if (shrink_list_turn == numneglists)
		shrink_list_turn = 0;
	if (ncp == NULL && shrink_list_turn == 0)
		cache_negative_shrink_select(shrink_list_turn, &ncp, &neglist);
	mtx_unlock(&ncneg_shrink_lock);
	if (ncp == NULL)
		return;

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	blp = NCP2BUCKETLOCK(ncp);
	mtx_unlock(&neglist->nl_lock);
	mtx_lock(dvlp);
	rw_wlock(blp);
	mtx_lock(&neglist->nl_lock);
	ncp2 = TAILQ_FIRST(&neglist->nl_list);
	if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) ||
	    blp != NCP2BUCKETLOCK(ncp2) || !(ncp2->nc_flag & NCF_NEGATIVE)) {
		ncp = NULL;
	} else {
		SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp,
		    ncp->nc_name);

		cache_zap_locked(ncp, true);
		counter_u64_add(numneg_evicted, 1);
	}
	mtx_unlock(&neglist->nl_lock);
	rw_wunlock(blp);
	mtx_unlock(dvlp);
	cache_free(ncp);
}

/*
 * cache_zap_locked():
 *
 *   Removes a namecache entry from cache, whether it contains an actual
 *   pointer to a vnode or if it is just a negative cache entry.
 */
static void
cache_zap_locked(struct namecache *ncp, bool neg_locked)
{

	if (!(ncp->nc_flag & NCF_NEGATIVE))
		cache_assert_vnode_locked(ncp->nc_vp);
	cache_assert_vnode_locked(ncp->nc_dvp);
	cache_assert_bucket_locked(ncp, RA_WLOCKED);

	CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp,
	    (ncp->nc_flag & NCF_NEGATIVE) ? NULL : ncp->nc_vp);
	LIST_REMOVE(ncp, nc_hash);
	if (!(ncp->nc_flag & NCF_NEGATIVE)) {
		SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
		    ncp->nc_name, ncp->nc_vp);
		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
		if (ncp == ncp->nc_vp->v_cache_dd)
			ncp->nc_vp->v_cache_dd = NULL;
	} else {
		SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
		    ncp->nc_name);
		cache_negative_remove(ncp, neg_locked);
	}
	if (ncp->nc_flag & NCF_ISDOTDOT) {
		if (ncp == ncp->nc_dvp->v_cache_dd)
			ncp->nc_dvp->v_cache_dd = NULL;
	} else {
		LIST_REMOVE(ncp, nc_src);
		if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
			ncp->nc_flag |= NCF_DVDROP;
			counter_u64_add(numcachehv, -1);
		}
	}
	atomic_subtract_rel_long(&numcache, 1);
}

static void
cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
{
	struct rwlock *blp;

	MPASS(ncp->nc_dvp == vp);
	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_vnode_locked(vp);

	blp = NCP2BUCKETLOCK(ncp);
	rw_wlock(blp);
	cache_zap_locked(ncp, false);
	rw_wunlock(blp);
}

static bool
cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
    struct mtx **vlpp)
{
	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
	struct rwlock *blp;

	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
	cache_assert_vnode_locked(vp);

	if (ncp->nc_flag & NCF_NEGATIVE) {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_zap_negative_locked_vnode_kl(ncp, vp);
		return (true);
	}

	pvlp = VP2VNODELOCK(vp);
	blp = NCP2BUCKETLOCK(ncp);
	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
	vlp2 = VP2VNODELOCK(ncp->nc_vp);

	if (*vlpp == vlp1 || *vlpp == vlp2) {
		to_unlock = *vlpp;
		*vlpp = NULL;
	} else {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_sort_vnodes(&vlp1, &vlp2);
		if (vlp1 == pvlp) {
			mtx_lock(vlp2);
			to_unlock = vlp2;
		} else {
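			/*
			 * The lock to acquire sorts below the vnode lock
			 * already held by the caller, so a blocking
			 * acquisition could deadlock.  Trylock and fall back
			 * to relocking everything in address order.
			 */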
			if (!mtx_trylock(vlp1))
				goto out_relock;
			to_unlock = vlp1;
		}
	}
	rw_wlock(blp);
	cache_zap_locked(ncp, false);
	rw_wunlock(blp);
	if (to_unlock != NULL)
		mtx_unlock(to_unlock);
	return (true);

out_relock:
	mtx_unlock(vlp2);
	mtx_lock(vlp1);
	mtx_lock(vlp2);
	MPASS(*vlpp == NULL);
	*vlpp = vlp1;
	return (false);
}

static int __noinline
cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp)
{
	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
	struct rwlock *blp;
	int error = 0;

	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
	cache_assert_vnode_locked(vp);

	pvlp = VP2VNODELOCK(vp);
	if (ncp->nc_flag & NCF_NEGATIVE) {
		cache_zap_negative_locked_vnode_kl(ncp, vp);
		goto out;
	}

	blp = NCP2BUCKETLOCK(ncp);
	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
	vlp2 = VP2VNODELOCK(ncp->nc_vp);
	cache_sort_vnodes(&vlp1, &vlp2);
	if (vlp1 == pvlp) {
		mtx_lock(vlp2);
		to_unlock = vlp2;
	} else {
		if (!mtx_trylock(vlp1)) {
			error = EAGAIN;
			goto out;
		}
		to_unlock = vlp1;
	}
	rw_wlock(blp);
	cache_zap_locked(ncp, false);
	rw_wunlock(blp);
	mtx_unlock(to_unlock);
out:
	mtx_unlock(pvlp);
	return (error);
}

/*
 * If trylocking failed we can get here.  We know enough to take all needed locks
 * in the right order and re-lookup the entry.
 */
static int
cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
    struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
    struct rwlock *blp)
{
	struct namecache *rncp;

	cache_assert_bucket_locked(ncp, RA_UNLOCKED);

	cache_sort_vnodes(&dvlp, &vlp);
	cache_lock_vnodes(dvlp, vlp);
	rw_wlock(blp);
	LIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
		if (rncp == ncp && rncp->nc_dvp == dvp &&
		    rncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
			break;
	}
	if (rncp != NULL) {
		cache_zap_locked(rncp, false);
		rw_wunlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		counter_u64_add(zap_and_exit_bucket_relock_success, 1);
		return (0);
	}

	rw_wunlock(blp);
	cache_unlock_vnodes(dvlp, vlp);
	return (EAGAIN);
}

static int __noinline
cache_zap_wlocked_bucket(struct namecache *ncp, struct componentname *cnp,
    uint32_t hash, struct rwlock *blp)
{
	struct mtx *dvlp, *vlp;
	struct vnode *dvp;

	cache_assert_bucket_locked(ncp, RA_WLOCKED);

	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	vlp = NULL;
	if (!(ncp->nc_flag & NCF_NEGATIVE))
		vlp = VP2VNODELOCK(ncp->nc_vp);
	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
		cache_zap_locked(ncp, false);
		rw_wunlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		return (0);
	}

	dvp = ncp->nc_dvp;
	rw_wunlock(blp);
	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
}

static int __noinline
cache_zap_rlocked_bucket(struct namecache *ncp, struct componentname *cnp,
    uint32_t hash, struct rwlock *blp)
{
	struct mtx *dvlp, *vlp;
	struct vnode *dvp;

	cache_assert_bucket_locked(ncp, RA_RLOCKED);

	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	vlp = NULL;
	if (!(ncp->nc_flag & NCF_NEGATIVE))
		vlp = VP2VNODELOCK(ncp->nc_vp);
	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
		rw_runlock(blp);
		rw_wlock(blp);
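		/*
		 * The entry cannot go away while the bucket lock is dropped
		 * above: both vnode locks are held and zapping requires
		 * them.
		 */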
		cache_zap_locked(ncp, false);
		rw_wunlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		return (0);
	}

	dvp = ncp->nc_dvp;
	rw_runlock(blp);
	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
}

static int
cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp,
    struct mtx **vlpp1, struct mtx **vlpp2)
{
	struct mtx *dvlp, *vlp;

	cache_assert_bucket_locked(ncp, RA_WLOCKED);

	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	vlp = NULL;
	if (!(ncp->nc_flag & NCF_NEGATIVE))
		vlp = VP2VNODELOCK(ncp->nc_vp);
	cache_sort_vnodes(&dvlp, &vlp);

	if (*vlpp1 == dvlp && *vlpp2 == vlp) {
		cache_zap_locked(ncp, false);
		cache_unlock_vnodes(dvlp, vlp);
		*vlpp1 = NULL;
		*vlpp2 = NULL;
		return (0);
	}

	if (*vlpp1 != NULL)
		mtx_unlock(*vlpp1);
	if (*vlpp2 != NULL)
		mtx_unlock(*vlpp2);
	*vlpp1 = NULL;
	*vlpp2 = NULL;

	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
		cache_zap_locked(ncp, false);
		cache_unlock_vnodes(dvlp, vlp);
		return (0);
	}

	rw_wunlock(blp);
	*vlpp1 = dvlp;
	*vlpp2 = vlp;
	if (*vlpp1 != NULL)
		mtx_lock(*vlpp1);
	mtx_lock(*vlpp2);
	rw_wlock(blp);
	return (EAGAIN);
}

static void
cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp)
{

	if (blp != NULL) {
		rw_runlock(blp);
	} else {
		mtx_unlock(vlp);
	}
}

static int __noinline
cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	int ltype;

	*vpp = dvp;
	CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .",
	    dvp, cnp->cn_nameptr);
	counter_u64_add(dothits, 1);
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
	if (tsp != NULL)
		timespecclear(tsp);
	if (ticksp != NULL)
		*ticksp = ticks;
	vrefact(*vpp);
	/*
	 * When we lookup "." we still can be asked to lock it
	 * differently...
	 */
	ltype = cnp->cn_lkflags & LK_TYPE_MASK;
	if (ltype != VOP_ISLOCKED(*vpp)) {
		if (ltype == LK_EXCLUSIVE) {
			vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
			if (VN_IS_DOOMED((*vpp))) {
				/* forced unmount */
				vrele(*vpp);
				*vpp = NULL;
				return (ENOENT);
			}
		} else
			vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
	}
	return (-1);
}

static __noinline int
cache_lookup_nomakeentry(struct vnode *dvp, struct vnode **vpp,
    struct componentname *cnp, struct timespec *tsp, int *ticksp)
{
	struct namecache *ncp;
	struct rwlock *blp;
	struct mtx *dvlp, *dvlp2;
	uint32_t hash;
	int error;

	if (cnp->cn_namelen == 2 &&
	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
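		/*
		 * ".." entries are tracked via the parent's v_cache_dd
		 * pointer and are handled under the vnode lock rather than
		 * the hash buckets.
		 */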
		counter_u64_add(dotdothits, 1);
		dvlp = VP2VNODELOCK(dvp);
		dvlp2 = NULL;
		mtx_lock(dvlp);
retry_dotdot:
		ncp = dvp->v_cache_dd;
		if (ncp == NULL) {
			SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
			    "..", NULL);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			return (0);
		}
		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
			if (ncp->nc_dvp != dvp)
				panic("dvp %p v_cache_dd %p\n", dvp, ncp);
			if (!cache_zap_locked_vnode_kl2(ncp,
			    dvp, &dvlp2))
				goto retry_dotdot;
			MPASS(dvp->v_cache_dd == NULL);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			cache_free(ncp);
		} else {
			dvp->v_cache_dd = NULL;
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
		}
		return (0);
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	blp = HASH2BUCKETLOCK(hash);
retry:
	if (LIST_EMPTY(NCHHASH(hash)))
		goto out_no_entry;

	rw_wlock(blp);

	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		counter_u64_add(numchecks, 1);
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	/* We failed to find an entry */
	if (ncp == NULL) {
		rw_wunlock(blp);
		goto out_no_entry;
	}

	error = cache_zap_wlocked_bucket(ncp, cnp, hash, blp);
	if (__predict_false(error != 0)) {
		zap_and_exit_bucket_fail++;
		cache_maybe_yield();
		goto retry;
	}
	counter_u64_add(numposzaps, 1);
	cache_free(ncp);
	return (0);
out_no_entry:
	SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, NULL);
	counter_u64_add(nummisszap, 1);
	return (0);
}

/**
 * Lookup a name in the name cache
 *
 * # Arguments
 *
 * - dvp:	Parent directory in which to search.
 * - vpp:	Return argument.  Will contain desired vnode on cache hit.
 * - cnp:	Parameters of the name search.  The most interesting bits of
 *		the cn_flags field have the following meanings:
 *	- MAKEENTRY:	If clear, free an entry from the cache rather than look
 *			it up.
 *	- ISDOTDOT:	Must be set if and only if cn_nameptr == ".."
 * - tsp:	Return storage for cache timestamp.  On a successful (positive
 *		or negative) lookup, tsp will be filled with any timespec that
 *		was stored when this cache entry was created.  However, it will
 *		be clear for "." entries.
 * - ticks:	Return storage for alternate cache timestamp.  On a successful
 *		(positive or negative) lookup, it will contain the ticks value
 *		that was current when the cache entry was created, unless cnp
 *		was ".".
 *
 * # Returns
 *
 * - -1:	A positive cache hit.  vpp will contain the desired vnode.
 * - ENOENT:	A negative cache hit, or dvp was recycled out from under us due
 *		to a forced unmount.  vpp will not be modified.  If the entry
 *		is a whiteout, then the ISWHITEOUT flag will be set in
 *		cnp->cn_flags.
 * - 0:		A cache miss.  vpp will not be modified.
 *
 * # Locking
 *
 * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
 * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
 * lock is not recursively acquired.
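 *
 * Filesystems typically reach this function through vfs_cache_lookup()
 * (below), which they install as their VOP_LOOKUP and which falls back to
 * VOP_CACHEDLOOKUP on a miss.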
 */
int
cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct namecache_ts *ncp_ts;
	struct namecache *ncp;
	struct rwlock *blp;
	struct mtx *dvlp;
	uint32_t hash;
	enum vgetstate vs;
	int error, ltype;

#ifdef DEBUG_CACHE
	if (__predict_false(!doingcache)) {
		cnp->cn_flags &= ~MAKEENTRY;
		return (0);
	}
#endif

	counter_u64_add(numcalls, 1);

	if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.'))
		return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));

	if ((cnp->cn_flags & MAKEENTRY) == 0)
		return (cache_lookup_nomakeentry(dvp, vpp, cnp, tsp, ticksp));

retry:
	blp = NULL;
	dvlp = NULL;
	error = 0;
	if (cnp->cn_namelen == 2 &&
	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
		counter_u64_add(dotdothits, 1);
		dvlp = VP2VNODELOCK(dvp);
		mtx_lock(dvlp);
		ncp = dvp->v_cache_dd;
		if (ncp == NULL) {
			SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
			    "..", NULL);
			mtx_unlock(dvlp);
			return (0);
		}
		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
			if (ncp->nc_flag & NCF_NEGATIVE)
				*vpp = NULL;
			else
				*vpp = ncp->nc_vp;
		} else
			*vpp = ncp->nc_dvp;
		/* Return failure if negative entry was found. */
		if (*vpp == NULL)
			goto negative_success;
		CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..",
		    dvp, cnp->cn_nameptr, *vpp);
		SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..",
		    *vpp);
		cache_out_ts(ncp, tsp, ticksp);
		if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
		    NCF_DTS && tsp != NULL) {
			ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
			*tsp = ncp_ts->nc_dotdottime;
		}
		goto success;
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	blp = HASH2BUCKETLOCK(hash);
	rw_rlock(blp);

	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		counter_u64_add(numchecks, 1);
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	/* We failed to find an entry */
	if (__predict_false(ncp == NULL)) {
		rw_runlock(blp);
		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
		    NULL);
		counter_u64_add(nummiss, 1);
		return (0);
	}

	if (ncp->nc_flag & NCF_NEGATIVE)
		goto negative_success;

	/* We found a "positive" match, return the vnode */
	counter_u64_add(numposhits, 1);
	*vpp = ncp->nc_vp;
	CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
	    dvp, cnp->cn_nameptr, *vpp, ncp);
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name,
	    *vpp);
	cache_out_ts(ncp, tsp, ticksp);
success:
	/*
	 * On success we return a locked and ref'd vnode as per the lookup
	 * protocol.
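	 *
	 * vget_prep() is called before the name cache lock is dropped;
	 * vget_finish() then acquires the vnode lock and completes the
	 * reference.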
	 */
	MPASS(dvp != *vpp);
	ltype = 0;	/* silence gcc warning */
	if (cnp->cn_flags & ISDOTDOT) {
		ltype = VOP_ISLOCKED(dvp);
		VOP_UNLOCK(dvp);
	}
	vs = vget_prep(*vpp);
	cache_lookup_unlock(blp, dvlp);
	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
	if (cnp->cn_flags & ISDOTDOT) {
		vn_lock(dvp, ltype | LK_RETRY);
		if (VN_IS_DOOMED(dvp)) {
			if (error == 0)
				vput(*vpp);
			*vpp = NULL;
			return (ENOENT);
		}
	}
	if (error) {
		*vpp = NULL;
		goto retry;
	}
	if ((cnp->cn_flags & ISLASTCN) &&
	    (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) {
		ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
	}
	return (-1);

negative_success:
	/* We found a negative match, and want to create it, so purge */
	if (cnp->cn_nameiop == CREATE) {
		counter_u64_add(numnegzaps, 1);
		goto zap_and_exit;
	}

	counter_u64_add(numneghits, 1);
	cache_negative_hit(ncp);
	if (ncp->nc_flag & NCF_WHITE)
		cnp->cn_flags |= ISWHITEOUT;
	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
	    ncp->nc_name);
	cache_out_ts(ncp, tsp, ticksp);
	cache_lookup_unlock(blp, dvlp);
	return (ENOENT);

zap_and_exit:
	if (blp != NULL)
		error = cache_zap_rlocked_bucket(ncp, cnp, hash, blp);
	else
		error = cache_zap_locked_vnode(ncp, dvp);
	if (__predict_false(error != 0)) {
		zap_and_exit_bucket_fail2++;
		cache_maybe_yield();
		goto retry;
	}
	cache_free(ncp);
	return (0);
}

struct celockstate {
	struct	mtx *vlp[3];
	struct	rwlock *blp[2];
};
CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));

static inline void
cache_celockstate_init(struct celockstate *cel)
{

	bzero(cel, sizeof(*cel));
}

static void
cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
    struct vnode *dvp)
{
	struct mtx *vlp1, *vlp2;

	MPASS(cel->vlp[0] == NULL);
	MPASS(cel->vlp[1] == NULL);
	MPASS(cel->vlp[2] == NULL);

	MPASS(vp != NULL || dvp != NULL);

	vlp1 = VP2VNODELOCK(vp);
	vlp2 = VP2VNODELOCK(dvp);
	cache_sort_vnodes(&vlp1, &vlp2);

	if (vlp1 != NULL) {
		mtx_lock(vlp1);
		cel->vlp[0] = vlp1;
	}
	mtx_lock(vlp2);
	cel->vlp[1] = vlp2;
}

static void
cache_unlock_vnodes_cel(struct celockstate *cel)
{

	MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);

	if (cel->vlp[0] != NULL)
		mtx_unlock(cel->vlp[0]);
	if (cel->vlp[1] != NULL)
		mtx_unlock(cel->vlp[1]);
	if (cel->vlp[2] != NULL)
		mtx_unlock(cel->vlp[2]);
}

static bool
cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
{
	struct mtx *vlp;
	bool ret;

	cache_assert_vlp_locked(cel->vlp[0]);
	cache_assert_vlp_locked(cel->vlp[1]);
	MPASS(cel->vlp[2] == NULL);

	MPASS(vp != NULL);
	vlp = VP2VNODELOCK(vp);

	ret = true;
	if (vlp >= cel->vlp[1]) {
		mtx_lock(vlp);
	} else {
		if (mtx_trylock(vlp))
			goto out;
		cache_lock_vnodes_cel_3_failures++;
		cache_unlock_vnodes_cel(cel);
		if (vlp < cel->vlp[0]) {
			mtx_lock(vlp);
			mtx_lock(cel->vlp[0]);
			mtx_lock(cel->vlp[1]);
		} else {
			if (cel->vlp[0] != NULL)
				mtx_lock(cel->vlp[0]);
			mtx_lock(vlp);
			mtx_lock(cel->vlp[1]);
		}
		ret = false;
	}
out:
	cel->vlp[2] = vlp;
	return (ret);
}

static void
cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1,
    struct rwlock *blp2)
{

	MPASS(cel->blp[0] == NULL);
	MPASS(cel->blp[1] == NULL);

	cache_sort_vnodes(&blp1, &blp2);

	if (blp1 != NULL) {
		rw_wlock(blp1);
		cel->blp[0] = blp1;
	}
	rw_wlock(blp2);
	cel->blp[1] = blp2;
}

static void
cache_unlock_buckets_cel(struct celockstate *cel)
{

	if (cel->blp[0] != NULL)
		rw_wunlock(cel->blp[0]);
	rw_wunlock(cel->blp[1]);
}

/*
 * Lock part of the cache affected by the insertion.
 *
 * This means vnodelocks for dvp, vp and the relevant bucketlock.
 * However, insertion can result in removal of an old entry.  In this
 * case we have an additional vnode and bucketlock pair to lock.  If the
 * entry is negative, ncelock is locked instead of the vnode.
 *
 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
 * preserving the locking order (smaller address first).
 */
static void
cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
    uint32_t hash)
{
	struct namecache *ncp;
	struct rwlock *blps[2];

	blps[0] = HASH2BUCKETLOCK(hash);
	for (;;) {
		blps[1] = NULL;
		cache_lock_vnodes_cel(cel, dvp, vp);
		if (vp == NULL || vp->v_type != VDIR)
			break;
		ncp = vp->v_cache_dd;
		if (ncp == NULL)
			break;
		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
			break;
		MPASS(ncp->nc_dvp == vp);
		blps[1] = NCP2BUCKETLOCK(ncp);
		if (ncp->nc_flag & NCF_NEGATIVE)
			break;
		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
			break;
		/*
		 * All vnodes got re-locked. Re-validate the state and if
		 * nothing changed we are done. Otherwise restart.
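		 *
		 * Re-validation is needed because cache_lock_vnodes_cel_3()
		 * may have dropped and re-acquired the first two vnode locks
		 * while establishing the lock order.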
		 */
		if (ncp == vp->v_cache_dd &&
		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
		    blps[1] == NCP2BUCKETLOCK(ncp) &&
		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
			break;
		cache_unlock_vnodes_cel(cel);
		cel->vlp[0] = NULL;
		cel->vlp[1] = NULL;
		cel->vlp[2] = NULL;
	}
	cache_lock_buckets_cel(cel, blps[0], blps[1]);
}

static void
cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
    uint32_t hash)
{
	struct namecache *ncp;
	struct rwlock *blps[2];

	blps[0] = HASH2BUCKETLOCK(hash);
	for (;;) {
		blps[1] = NULL;
		cache_lock_vnodes_cel(cel, dvp, vp);
		ncp = dvp->v_cache_dd;
		if (ncp == NULL)
			break;
		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
			break;
		MPASS(ncp->nc_dvp == dvp);
		blps[1] = NCP2BUCKETLOCK(ncp);
		if (ncp->nc_flag & NCF_NEGATIVE)
			break;
		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
			break;
		if (ncp == dvp->v_cache_dd &&
		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
		    blps[1] == NCP2BUCKETLOCK(ncp) &&
		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
			break;
		cache_unlock_vnodes_cel(cel);
		cel->vlp[0] = NULL;
		cel->vlp[1] = NULL;
		cel->vlp[2] = NULL;
	}
	cache_lock_buckets_cel(cel, blps[0], blps[1]);
}

static void
cache_enter_unlock(struct celockstate *cel)
{

	cache_unlock_buckets_cel(cel);
	cache_unlock_vnodes_cel(cel);
}

static void __noinline
cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
    struct componentname *cnp)
{
	struct celockstate cel;
	struct namecache *ncp;
	uint32_t hash;
	int len;

	if (dvp->v_cache_dd == NULL)
		return;
	len = cnp->cn_namelen;
	cache_celockstate_init(&cel);
	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
	cache_enter_lock_dd(&cel, dvp, vp, hash);
	ncp = dvp->v_cache_dd;
	if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
		KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
		cache_zap_locked(ncp, false);
	} else {
		ncp = NULL;
	}
	dvp->v_cache_dd = NULL;
	cache_enter_unlock(&cel);
	cache_free(ncp);
}

/*
 * Add an entry to the cache.
 */
void
cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
    struct timespec *tsp, struct timespec *dtsp)
{
	struct celockstate cel;
	struct namecache *ncp, *n2, *ndd;
	struct namecache_ts *ncp_ts, *n2_ts;
	struct nchashhead *ncpp;
	uint32_t hash;
	int flag;
	int len;
	u_long lnumcache;

	CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr);
	VNASSERT(vp == NULL || !VN_IS_DOOMED(vp), vp,
	    ("cache_enter: Adding a doomed vnode"));
	VNASSERT(dvp == NULL || !VN_IS_DOOMED(dvp), dvp,
	    ("cache_enter: Doomed vnode used as src"));

#ifdef DEBUG_CACHE
	if (__predict_false(!doingcache))
		return;
#endif

	flag = 0;
	if (__predict_false(cnp->cn_nameptr[0] == '.')) {
		if (cnp->cn_namelen == 1)
			return;
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
			cache_enter_dotdot_prep(dvp, vp, cnp);
			flag = NCF_ISDOTDOT;
		}
	}

	/*
	 * Avoid blowout in namecache entries.
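	 *
	 * ncsize is derived from desiredvnodes * ncsizefactor in nchinit()
	 * and re-computed in cache_changesize().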
	 */
	lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
	if (__predict_false(lnumcache >= ncsize)) {
		atomic_add_long(&numcache, -1);
		return;
	}

	cache_celockstate_init(&cel);
	ndd = NULL;
	ncp_ts = NULL;

	/*
	 * Calculate the hash key and setup as much of the new
	 * namecache entry as possible before acquiring the lock.
	 */
	ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
	ncp->nc_flag = flag;
	ncp->nc_vp = vp;
	if (vp == NULL)
		ncp->nc_flag |= NCF_NEGATIVE;
	ncp->nc_dvp = dvp;
	if (tsp != NULL) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		ncp_ts->nc_time = *tsp;
		ncp_ts->nc_ticks = ticks;
		ncp_ts->nc_nc.nc_flag |= NCF_TS;
		if (dtsp != NULL) {
			ncp_ts->nc_dotdottime = *dtsp;
			ncp_ts->nc_nc.nc_flag |= NCF_DTS;
		}
	}
	len = ncp->nc_nlen = cnp->cn_namelen;
	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
	strlcpy(ncp->nc_name, cnp->cn_nameptr, len + 1);
	cache_enter_lock(&cel, dvp, vp, hash);

	/*
	 * See if this vnode or negative entry is already in the cache
	 * with this name.  This can happen with concurrent lookups of
	 * the same path name.
	 */
	ncpp = NCHHASH(hash);
	LIST_FOREACH(n2, ncpp, nc_hash) {
		if (n2->nc_dvp == dvp &&
		    n2->nc_nlen == cnp->cn_namelen &&
		    !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
			if (tsp != NULL) {
				KASSERT((n2->nc_flag & NCF_TS) != 0,
				    ("no NCF_TS"));
				n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
				n2_ts->nc_time = ncp_ts->nc_time;
				n2_ts->nc_ticks = ncp_ts->nc_ticks;
				if (dtsp != NULL) {
					n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
					if (ncp->nc_flag & NCF_NEGATIVE)
						mtx_lock(&ncneg_hot.nl_lock);
					n2_ts->nc_nc.nc_flag |= NCF_DTS;
					if (ncp->nc_flag & NCF_NEGATIVE)
						mtx_unlock(&ncneg_hot.nl_lock);
				}
			}
			goto out_unlock_free;
		}
	}

	if (flag == NCF_ISDOTDOT) {
		/*
		 * See if we are trying to add .. entry, but some other lookup
		 * has populated v_cache_dd pointer already.
		 */
		if (dvp->v_cache_dd != NULL)
			goto out_unlock_free;
		KASSERT(vp == NULL || vp->v_type == VDIR,
		    ("wrong vnode type %p", vp));
		dvp->v_cache_dd = ncp;
	}

	if (vp != NULL) {
		if (vp->v_type == VDIR) {
			if (flag != NCF_ISDOTDOT) {
				/*
				 * For this case, the cache entry maps both the
				 * directory name in it and the name ".." for the
				 * directory's parent.
				 */
				if ((ndd = vp->v_cache_dd) != NULL) {
					if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
						cache_zap_locked(ndd, false);
					else
						ndd = NULL;
				}
				vp->v_cache_dd = ncp;
			}
		} else {
			vp->v_cache_dd = NULL;
		}
	}

	if (flag != NCF_ISDOTDOT) {
		if (LIST_EMPTY(&dvp->v_cache_src)) {
			vhold(dvp);
			counter_u64_add(numcachehv, 1);
		}
		LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
	}

	/*
	 * Insert the new namecache entry into the appropriate chain
	 * within the cache entries table.
	 */
	LIST_INSERT_HEAD(ncpp, ncp, nc_hash);

	/*
	 * If the entry is "negative", we place it into the
	 * "negative" cache queue, otherwise, we place it into the
	 * destination vnode's cache entries queue.
	 */
	if (vp != NULL) {
		TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
		SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
		    vp);
	} else {
		if (cnp->cn_flags & ISWHITEOUT)
			ncp->nc_flag |= NCF_WHITE;
		cache_negative_insert(ncp, false);
		SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
		    ncp->nc_name);
	}
	cache_enter_unlock(&cel);
	if (numneg * ncnegfactor > lnumcache)
		cache_negative_zap_one();
	cache_free(ndd);
	return;
out_unlock_free:
	cache_enter_unlock(&cel);
	cache_free(ncp);
	return;
}

static u_int
cache_roundup_2(u_int val)
{
	u_int res;

	for (res = 1; res <= val; res <<= 1)
		continue;

	return (res);
}

/*
 * Name cache initialization, from vfs_init() when we are booting
 */
static void
nchinit(void *dummy __unused)
{
	u_int i;

	cache_zone_small = uma_zcreate("S VFS Cache",
	    sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1,
	    NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache),
	    UMA_ZONE_ZINIT);
	cache_zone_small_ts = uma_zcreate("STS VFS Cache",
	    sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1,
	    NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts),
	    UMA_ZONE_ZINIT);
	cache_zone_large = uma_zcreate("L VFS Cache",
	    sizeof(struct namecache) + NAME_MAX + 1,
	    NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache),
	    UMA_ZONE_ZINIT);
	cache_zone_large_ts = uma_zcreate("LTS VFS Cache",
	    sizeof(struct namecache_ts) + NAME_MAX + 1,
	    NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts),
	    UMA_ZONE_ZINIT);

	ncsize = desiredvnodes * ncsizefactor;
	nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
	ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
	if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
		ncbuckethash = 7;
	if (ncbuckethash > nchash)
		ncbuckethash = nchash;
	bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
	    M_WAITOK | M_ZERO);
	for (i = 0; i < numbucketlocks; i++)
		rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE);
	ncvnodehash = ncbuckethash;
	vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
	    M_WAITOK | M_ZERO);
	for (i = 0; i < numvnodelocks; i++)
		mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
	ncpurgeminvnodes = numbucketlocks * 2;

	ncneghash = 3;
	neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE,
	    M_WAITOK | M_ZERO);
	for (i = 0; i < numneglists; i++) {
		mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
		TAILQ_INIT(&neglists[i].nl_list);
	}
	mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF);
	TAILQ_INIT(&ncneg_hot.nl_list);

	mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF);

	numcachehv = counter_u64_alloc(M_WAITOK);
	numcalls = counter_u64_alloc(M_WAITOK);
	dothits = counter_u64_alloc(M_WAITOK);
	dotdothits = counter_u64_alloc(M_WAITOK);
	numchecks = counter_u64_alloc(M_WAITOK);
	nummiss = counter_u64_alloc(M_WAITOK);
	nummisszap = counter_u64_alloc(M_WAITOK);
	numposzaps = counter_u64_alloc(M_WAITOK);
	numposhits = counter_u64_alloc(M_WAITOK);
	numnegzaps = counter_u64_alloc(M_WAITOK);
	numneghits = counter_u64_alloc(M_WAITOK);
	numfullpathcalls = counter_u64_alloc(M_WAITOK);

/*
 * Name cache initialization, from vfs_init() when we are booting
 */
static void
nchinit(void *dummy __unused)
{
	u_int i;

	cache_zone_small = uma_zcreate("S VFS Cache",
	    sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1,
	    NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache),
	    UMA_ZONE_ZINIT);
	cache_zone_small_ts = uma_zcreate("STS VFS Cache",
	    sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1,
	    NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts),
	    UMA_ZONE_ZINIT);
	cache_zone_large = uma_zcreate("L VFS Cache",
	    sizeof(struct namecache) + NAME_MAX + 1,
	    NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache),
	    UMA_ZONE_ZINIT);
	cache_zone_large_ts = uma_zcreate("LTS VFS Cache",
	    sizeof(struct namecache_ts) + NAME_MAX + 1,
	    NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts),
	    UMA_ZONE_ZINIT);

	ncsize = desiredvnodes * ncsizefactor;
	nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
	ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
	if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
		ncbuckethash = 7;
	if (ncbuckethash > nchash)
		ncbuckethash = nchash;
	bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
	    M_WAITOK | M_ZERO);
	for (i = 0; i < numbucketlocks; i++)
		rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE);
	ncvnodehash = ncbuckethash;
	vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
	    M_WAITOK | M_ZERO);
	for (i = 0; i < numvnodelocks; i++)
		mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
	ncpurgeminvnodes = numbucketlocks * 2;

	ncneghash = 3;
	neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE,
	    M_WAITOK | M_ZERO);
	for (i = 0; i < numneglists; i++) {
		mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
		TAILQ_INIT(&neglists[i].nl_list);
	}
	mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF);
	TAILQ_INIT(&ncneg_hot.nl_list);

	mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF);

	numcachehv = counter_u64_alloc(M_WAITOK);
	numcalls = counter_u64_alloc(M_WAITOK);
	dothits = counter_u64_alloc(M_WAITOK);
	dotdothits = counter_u64_alloc(M_WAITOK);
	numchecks = counter_u64_alloc(M_WAITOK);
	nummiss = counter_u64_alloc(M_WAITOK);
	nummisszap = counter_u64_alloc(M_WAITOK);
	numposzaps = counter_u64_alloc(M_WAITOK);
	numposhits = counter_u64_alloc(M_WAITOK);
	numnegzaps = counter_u64_alloc(M_WAITOK);
	numneghits = counter_u64_alloc(M_WAITOK);
	numfullpathcalls = counter_u64_alloc(M_WAITOK);
	numfullpathfail1 = counter_u64_alloc(M_WAITOK);
	numfullpathfail2 = counter_u64_alloc(M_WAITOK);
	numfullpathfail4 = counter_u64_alloc(M_WAITOK);
	numfullpathfound = counter_u64_alloc(M_WAITOK);
	zap_and_exit_bucket_relock_success = counter_u64_alloc(M_WAITOK);
	numneg_evicted = counter_u64_alloc(M_WAITOK);
	shrinking_skipped = counter_u64_alloc(M_WAITOK);
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);

void
cache_changesize(u_long newmaxvnodes)
{
	struct nchashhead *new_nchashtbl, *old_nchashtbl;
	u_long new_nchash, old_nchash;
	struct namecache *ncp;
	uint32_t hash;
	u_long newncsize;
	int i;

	newncsize = newmaxvnodes * ncsizefactor;
	newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
	if (newmaxvnodes < numbucketlocks)
		newmaxvnodes = numbucketlocks;

	new_nchashtbl = hashinit(newmaxvnodes, M_VFSCACHE, &new_nchash);
	/* If same hash table size, nothing to do */
	if (nchash == new_nchash) {
		free(new_nchashtbl, M_VFSCACHE);
		return;
	}
	/*
	 * Move everything from the old hash table to the new table.
	 * None of the namecache entries in the table can be removed
	 * because to do so, they have to be removed from the hash table.
	 */
	cache_lock_all_vnodes();
	cache_lock_all_buckets();
	old_nchashtbl = nchashtbl;
	old_nchash = nchash;
	nchashtbl = new_nchashtbl;
	nchash = new_nchash;
	for (i = 0; i <= old_nchash; i++) {
		while ((ncp = LIST_FIRST(&old_nchashtbl[i])) != NULL) {
			hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
			    ncp->nc_dvp);
			LIST_REMOVE(ncp, nc_hash);
			LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
		}
	}
	ncsize = newncsize;
	cache_unlock_all_buckets();
	cache_unlock_all_vnodes();
	free(old_nchashtbl, M_VFSCACHE);
}
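
/*
 * Sketch of the expected caller (an assumption about code outside this
 * file): the kern.maxvnodes sysctl handler adjusts desiredvnodes and then
 * asks the cache to follow suit, roughly:
 *
 *	desiredvnodes = val;
 *	cache_changesize(desiredvnodes);
 *
 * which is why the resize is done under the "lock everything" scheme above
 * rather than assuming a quiescent cache.
 */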

/*
 * Invalidate all entries from and to a particular vnode.
 */
void
cache_purge(struct vnode *vp)
{
	TAILQ_HEAD(, namecache) ncps;
	struct namecache *ncp, *nnp;
	struct mtx *vlp, *vlp2;

	CTR1(KTR_VFS, "cache_purge(%p)", vp);
	SDT_PROBE1(vfs, namecache, purge, done, vp);
	if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
	    vp->v_cache_dd == NULL)
		return;
	TAILQ_INIT(&ncps);
	vlp = VP2VNODELOCK(vp);
	vlp2 = NULL;
	mtx_lock(vlp);
retry:
	while (!LIST_EMPTY(&vp->v_cache_src)) {
		ncp = LIST_FIRST(&vp->v_cache_src);
		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
			goto retry;
		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
	}
	while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
		ncp = TAILQ_FIRST(&vp->v_cache_dst);
		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
			goto retry;
		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
	}
	ncp = vp->v_cache_dd;
	if (ncp != NULL) {
		KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
		    ("lost dotdot link"));
		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
			goto retry;
		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
	}
	KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
	mtx_unlock(vlp);
	if (vlp2 != NULL)
		mtx_unlock(vlp2);
	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
		cache_free(ncp);
	}
}

/*
 * Invalidate all negative entries for a particular directory vnode.
 */
void
cache_purge_negative(struct vnode *vp)
{
	TAILQ_HEAD(, namecache) ncps;
	struct namecache *ncp, *nnp;
	struct mtx *vlp;

	CTR1(KTR_VFS, "cache_purge_negative(%p)", vp);
	SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
	if (LIST_EMPTY(&vp->v_cache_src))
		return;
	TAILQ_INIT(&ncps);
	vlp = VP2VNODELOCK(vp);
	mtx_lock(vlp);
	LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
		if (!(ncp->nc_flag & NCF_NEGATIVE))
			continue;
		cache_zap_negative_locked_vnode_kl(ncp, vp);
		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
	}
	mtx_unlock(vlp);
	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
		cache_free(ncp);
	}
}
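
/*
 * Usage note (a sketch of expected call sites, not an exhaustive list):
 * cache_purge() is typically called when a vnode is reclaimed or when a
 * filesystem renames or removes a name, while cache_purge_negative() is
 * what a filesystem would call right after creating a new entry in dvp:
 *
 *	cache_purge_negative(dvp);
 *
 * so that a previously cached ENOENT for the just-created name cannot be
 * served by a later lookup.
 */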

/*
 * Flush all entries referencing a particular filesystem.
 */
void
cache_purgevfs(struct mount *mp, bool force)
{
	TAILQ_HEAD(, namecache) ncps;
	struct mtx *vlp1, *vlp2;
	struct rwlock *blp;
	struct nchashhead *bucket;
	struct namecache *ncp, *nnp;
	u_long i, j, n_nchash;
	int error;

	/* Scan hash tables for applicable entries */
	SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
	if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes)
		return;
	TAILQ_INIT(&ncps);
	n_nchash = nchash + 1;
	vlp1 = vlp2 = NULL;
	for (i = 0; i < numbucketlocks; i++) {
		blp = (struct rwlock *)&bucketlocks[i];
		rw_wlock(blp);
		for (j = i; j < n_nchash; j += numbucketlocks) {
retry:
			bucket = &nchashtbl[j];
			LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) {
				cache_assert_bucket_locked(ncp, RA_WLOCKED);
				if (ncp->nc_dvp->v_mount != mp)
					continue;
				error = cache_zap_wlocked_bucket_kl(ncp, blp,
				    &vlp1, &vlp2);
				if (error != 0)
					goto retry;
				TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst);
			}
		}
		rw_wunlock(blp);
		if (vlp1 == NULL && vlp2 == NULL)
			cache_maybe_yield();
	}
	if (vlp1 != NULL)
		mtx_unlock(vlp1);
	if (vlp2 != NULL)
		mtx_unlock(vlp2);

	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
		cache_free(ncp);
	}
}

/*
 * Perform canonical checks and cache lookup and pass on to filesystem
 * through the vop_cachedlookup only if needed.
 */

int
vfs_cache_lookup(struct vop_lookup_args *ap)
{
	struct vnode *dvp;
	int error;
	struct vnode **vpp = ap->a_vpp;
	struct componentname *cnp = ap->a_cnp;
	int flags = cnp->cn_flags;

	*vpp = NULL;
	dvp = ap->a_dvp;

	if (dvp->v_type != VDIR)
		return (ENOTDIR);

	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
		return (EROFS);

	error = vn_dir_check_exec(dvp, cnp);
	if (error != 0)
		return (error);

	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
	if (error == 0)
		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
	if (error == -1)
		return (0);
	return (error);
}
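
/*
 * Sketch of how a filesystem plugs into the name cache: it installs
 * vfs_cache_lookup() as its vop_lookup and provides the real directory
 * scan as vop_cachedlookup, e.g. (UFS-style, details vary per filesystem):
 *
 *	.vop_lookup =		vfs_cache_lookup,
 *	.vop_cachedlookup =	ufs_lookup,
 *
 * so that VOP_LOOKUP() consults the cache first and only falls back to the
 * filesystem on a miss.
 */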

/* Implementation of the getcwd syscall. */
int
sys___getcwd(struct thread *td, struct __getcwd_args *uap)
{
	char *buf, *retbuf;
	size_t buflen;
	int error;

	buflen = uap->buflen;
	if (__predict_false(buflen < 2))
		return (EINVAL);
	if (buflen > MAXPATHLEN)
		buflen = MAXPATHLEN;

	buf = malloc(buflen, M_TEMP, M_WAITOK);
	error = vn_getcwd(td, buf, &retbuf, &buflen);
	if (error == 0)
		error = copyout(retbuf, uap->buf, buflen);
	free(buf, M_TEMP);
	return (error);
}

int
vn_getcwd(struct thread *td, char *buf, char **retbuf, size_t *buflen)
{
	struct filedesc *fdp;
	struct vnode *cdir, *rdir;
	int error;

	fdp = td->td_proc->p_fd;
	FILEDESC_SLOCK(fdp);
	cdir = fdp->fd_cdir;
	vrefact(cdir);
	rdir = fdp->fd_rdir;
	vrefact(rdir);
	FILEDESC_SUNLOCK(fdp);
	error = vn_fullpath_any(td, cdir, rdir, buf, retbuf, buflen);
	vrele(rdir);
	vrele(cdir);

#ifdef KTRACE
	if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
		ktrnamei(*retbuf);
#endif
	return (error);
}

static int
kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
    size_t size, int flags, enum uio_seg pathseg)
{
	struct nameidata nd;
	char *retbuf, *freebuf;
	int error;

	if (flags != 0)
		return (EINVAL);
	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1,
	    pathseg, path, fd, &cap_fstat_rights, td);
	if ((error = namei(&nd)) != 0)
		return (error);
	error = vn_fullpath_hardlink(td, &nd, &retbuf, &freebuf, &size);
	if (error == 0) {
		error = copyout(retbuf, buf, size);
		free(freebuf, M_TEMP);
	}
	NDFREE(&nd, 0);
	return (error);
}

int
sys___realpathat(struct thread *td, struct __realpathat_args *uap)
{

	return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
	    uap->flags, UIO_USERSPACE));
}

/*
 * Retrieve the full filesystem path that corresponds to a vnode from the name
 * cache (if available).
 */
int
vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
{
	char *buf;
	struct filedesc *fdp;
	struct vnode *rdir;
	size_t buflen;
	int error;

	if (__predict_false(vn == NULL))
		return (EINVAL);

	buflen = MAXPATHLEN;
	buf = malloc(buflen, M_TEMP, M_WAITOK);
	fdp = td->td_proc->p_fd;
	FILEDESC_SLOCK(fdp);
	rdir = fdp->fd_rdir;
	vrefact(rdir);
	FILEDESC_SUNLOCK(fdp);
	error = vn_fullpath_any(td, vn, rdir, buf, retbuf, &buflen);
	vrele(rdir);

	if (!error)
		*freebuf = buf;
	else
		free(buf, M_TEMP);
	return (error);
}
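
/*
 * Usage sketch for the interface above: callers receive two pointers, one
 * into the path and one to free, roughly:
 *
 *	char *fullpath, *freepath;
 *
 *	if (vn_fullpath(td, vp, &fullpath, &freepath) == 0) {
 *		... use fullpath ...
 *		free(freepath, M_TEMP);
 *	}
 *
 * The split exists because the path is built from the end of the buffer
 * backwards, so *retbuf normally points into the middle of *freebuf.
 */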

/*
 * This function is similar to vn_fullpath, but it attempts to lookup the
 * pathname relative to the global root mount point.  This is required for the
 * auditing sub-system, as audited pathnames must be absolute, relative to the
 * global root mount point.
 */
int
vn_fullpath_global(struct thread *td, struct vnode *vn,
    char **retbuf, char **freebuf)
{
	char *buf;
	size_t buflen;
	int error;

	if (__predict_false(vn == NULL))
		return (EINVAL);
	buflen = MAXPATHLEN;
	buf = malloc(buflen, M_TEMP, M_WAITOK);
	error = vn_fullpath_any(td, vn, rootvnode, buf, retbuf, &buflen);
	if (!error)
		*freebuf = buf;
	else
		free(buf, M_TEMP);
	return (error);
}

int
vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, size_t *buflen)
{
	struct vnode *dvp;
	struct namecache *ncp;
	struct mtx *vlp;
	int error;

	vlp = VP2VNODELOCK(*vp);
	mtx_lock(vlp);
	TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) {
		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
			break;
	}
	if (ncp != NULL) {
		if (*buflen < ncp->nc_nlen) {
			mtx_unlock(vlp);
			vrele(*vp);
			counter_u64_add(numfullpathfail4, 1);
			error = ENOMEM;
			SDT_PROBE3(vfs, namecache, fullpath, return, error,
			    vp, NULL);
			return (error);
		}
		*buflen -= ncp->nc_nlen;
		memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
		SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
		    ncp->nc_name, vp);
		dvp = *vp;
		*vp = ncp->nc_dvp;
		vref(*vp);
		mtx_unlock(vlp);
		vrele(dvp);
		return (0);
	}
	SDT_PROBE1(vfs, namecache, fullpath, miss, vp);

	mtx_unlock(vlp);
	vn_lock(*vp, LK_SHARED | LK_RETRY);
	error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen);
	vput(*vp);
	if (error) {
		counter_u64_add(numfullpathfail2, 1);
		SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
		return (error);
	}

	*vp = dvp;
	if (VN_IS_DOOMED(dvp)) {
		/* forced unmount */
		vrele(dvp);
		error = ENOENT;
		SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
		return (error);
	}
	/*
	 * *vp has its use count incremented still.
	 */

	return (0);
}
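
/*
 * Note on the contract above: on success vn_vptocnp() consumes the caller's
 * reference on *vp, replaces *vp with a referenced parent directory vnode
 * and prepends the component name at buf + *buflen (the buffer is filled
 * from the end towards the beginning).  On failure the reference on *vp is
 * released as well, which is why the fullpath loops below simply stop on
 * error without another vrele().
 */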

/*
 * Resolve a directory to a pathname.
 *
 * The name of the directory can always be found in the namecache or fetched
 * from the filesystem.  There is also guaranteed to be only one parent, meaning
 * we can just follow vnodes up until we find the root.
 *
 * The vnode must be referenced.
 */
static int
vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir,
    char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend)
{
#ifdef KDTRACE_HOOKS
	struct vnode *startvp = vp;
#endif
	struct vnode *vp1;
	size_t buflen;
	int error;

	VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
	VNPASS(vp->v_usecount > 0, vp);

	buflen = *len;

	if (!slash_prefixed) {
		MPASS(*len >= 2);
		buflen--;
		buf[buflen] = '\0';
	}

	error = 0;

	SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
	counter_u64_add(numfullpathcalls, 1);
	while (vp != rdir && vp != rootvnode) {
		/*
		 * The vp vnode must be already fully constructed,
		 * since it is either found in namecache or obtained
		 * from VOP_VPTOCNP().  We may test for VV_ROOT safely
		 * without obtaining the vnode lock.
		 */
		if ((vp->v_vflag & VV_ROOT) != 0) {
			vn_lock(vp, LK_RETRY | LK_SHARED);

			/*
			 * With the vnode locked, check for races with
			 * unmount, forced or not.  Note that we
			 * already verified that vp is not equal to
			 * the root vnode, which means that
			 * mnt_vnodecovered can be NULL only for the
			 * case of unmount.
			 */
			if (VN_IS_DOOMED(vp) ||
			    (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
			    vp1->v_mountedhere != vp->v_mount) {
				vput(vp);
				error = ENOENT;
				SDT_PROBE3(vfs, namecache, fullpath, return,
				    error, vp, NULL);
				break;
			}

			vref(vp1);
			vput(vp);
			vp = vp1;
			continue;
		}
		if (vp->v_type != VDIR) {
			vrele(vp);
			counter_u64_add(numfullpathfail1, 1);
			error = ENOTDIR;
			SDT_PROBE3(vfs, namecache, fullpath, return,
			    error, vp, NULL);
			break;
		}
		error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen);
		if (error)
			break;
		if (buflen == 0) {
			vrele(vp);
			error = ENOMEM;
			SDT_PROBE3(vfs, namecache, fullpath, return, error,
			    startvp, NULL);
			break;
		}
		buf[--buflen] = '/';
		slash_prefixed = true;
	}
	if (error)
		return (error);
	if (!slash_prefixed) {
		if (buflen == 0) {
			vrele(vp);
			counter_u64_add(numfullpathfail4, 1);
			SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
			    startvp, NULL);
			return (ENOMEM);
		}
		buf[--buflen] = '/';
	}
	counter_u64_add(numfullpathfound, 1);
	vrele(vp);

	*retbuf = buf + buflen;
	SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
	*len -= buflen;
	*len += addend;
	return (0);
}

/*
 * Resolve an arbitrary vnode to a pathname.
 *
 * Note 2 caveats:
 * - hardlinks are not tracked, thus if the vnode is not a directory this can
 *   resolve to a different path than the one used to find it
 * - namecache is not mandatory, meaning names are not guaranteed to be added
 *   (in which case resolving fails)
 */
static int
vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir,
    char *buf, char **retbuf, size_t *buflen)
{
	size_t orig_buflen;
	bool slash_prefixed;
	int error;

	if (*buflen < 2)
		return (EINVAL);

	orig_buflen = *buflen;

	vref(vp);
	slash_prefixed = false;
	if (vp->v_type != VDIR) {
		*buflen -= 1;
		buf[*buflen] = '\0';
		error = vn_vptocnp(&vp, td->td_ucred, buf, buflen);
		if (error)
			return (error);
		if (*buflen == 0) {
			vrele(vp);
			return (ENOMEM);
		}
		*buflen -= 1;
		buf[*buflen] = '/';
		slash_prefixed = true;
	}

	return (vn_fullpath_dir(td, vp, rdir, buf, retbuf, buflen, slash_prefixed,
	    orig_buflen - *buflen));
}

/*
 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
 *
 * Since the namecache does not track hardlinks, the caller is expected to
 * first look up the target vnode with SAVENAME | WANTPARENT flags passed to
 * namei.
 *
 * Then we have 2 cases:
 * - if the found vnode is a directory, the path can be constructed just by
 *   following names up the chain
 * - otherwise we populate the buffer with the saved name and start resolving
 *   from the parent
 */
static int
vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf,
    char **freebuf, size_t *buflen)
{
	char *buf, *tmpbuf;
	struct filedesc *fdp;
	struct vnode *rdir;
	struct componentname *cnp;
	struct vnode *vp;
	size_t addend;
	int error;
	bool slash_prefixed;

	if (*buflen < 2)
		return (EINVAL);
	if (*buflen > MAXPATHLEN)
		*buflen = MAXPATHLEN;

	slash_prefixed = false;

	buf = malloc(*buflen, M_TEMP, M_WAITOK);
	fdp = td->td_proc->p_fd;
	FILEDESC_SLOCK(fdp);
	rdir = fdp->fd_rdir;
	vrefact(rdir);
	FILEDESC_SUNLOCK(fdp);

	addend = 0;
	vp = ndp->ni_vp;
	if (vp->v_type != VDIR) {
		cnp = &ndp->ni_cnd;
		addend = cnp->cn_namelen + 2;
		if (*buflen < addend) {
			error = ENOMEM;
			goto out_bad;
		}
		*buflen -= addend;
		tmpbuf = buf + *buflen;
		tmpbuf[0] = '/';
		memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen);
		tmpbuf[addend - 1] = '\0';
		slash_prefixed = true;
		vp = ndp->ni_dvp;
	}

	vref(vp);
	error = vn_fullpath_dir(td, vp, rdir, buf, retbuf, buflen, slash_prefixed, addend);
	if (error != 0)
		goto out_bad;

	vrele(rdir);
	*freebuf = buf;

	return (0);
out_bad:
	vrele(rdir);
	free(buf, M_TEMP);
	return (error);
}
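
/*
 * Taken together with kern___realpathat() above, the call chain behind
 * realpath(3) is roughly:
 *
 *	sys___realpathat() -> kern___realpathat() ->
 *	    namei(SAVENAME | WANTPARENT) -> vn_fullpath_hardlink()
 *
 * which is why both the saved component name and the parent vnode are
 * expected to be present in the nameidata passed in here.
 */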

struct vnode *
vn_dir_dd_ino(struct vnode *vp)
{
	struct namecache *ncp;
	struct vnode *ddvp;
	struct mtx *vlp;
	enum vgetstate vs;

	ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
	vlp = VP2VNODELOCK(vp);
	mtx_lock(vlp);
	TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
			continue;
		ddvp = ncp->nc_dvp;
		vs = vget_prep(ddvp);
		mtx_unlock(vlp);
		if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
			return (NULL);
		return (ddvp);
	}
	mtx_unlock(vlp);
	return (NULL);
}

int
vn_commname(struct vnode *vp, char *buf, u_int buflen)
{
	struct namecache *ncp;
	struct mtx *vlp;
	int l;

	vlp = VP2VNODELOCK(vp);
	mtx_lock(vlp);
	TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
			break;
	if (ncp == NULL) {
		mtx_unlock(vlp);
		return (ENOENT);
	}
	l = min(ncp->nc_nlen, buflen - 1);
	memcpy(buf, ncp->nc_name, l);
	mtx_unlock(vlp);
	buf[l] = '\0';
	return (0);
}

/*
 * This function updates path string to vnode's full global path
 * and checks the size of the new path string against the pathlen argument.
 *
 * Requires a locked, referenced vnode.
 * Vnode is re-locked on success or ENODEV, otherwise unlocked.
 *
 * If vp is a directory, the call to vn_fullpath_global() always succeeds
 * because it falls back to the ".." lookup if the namecache lookup fails.
 */
int
vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
    u_int pathlen)
{
	struct nameidata nd;
	struct vnode *vp1;
	char *rpath, *fbuf;
	int error;

	ASSERT_VOP_ELOCKED(vp, __func__);

	/* Construct global filesystem path from vp. */
	VOP_UNLOCK(vp);
	error = vn_fullpath_global(td, vp, &rpath, &fbuf);

	if (error != 0) {
		vrele(vp);
		return (error);
	}

	if (strlen(rpath) >= pathlen) {
		vrele(vp);
		error = ENAMETOOLONG;
		goto out;
	}

	/*
	 * Re-lookup the vnode by path to detect a possible rename.
	 * As a side effect, the vnode is relocked.
	 * If vnode was renamed, return ENOENT.
	 */
	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
	    UIO_SYSSPACE, path, td);
	error = namei(&nd);
	if (error != 0) {
		vrele(vp);
		goto out;
	}
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vp1 = nd.ni_vp;
	vrele(vp);
	if (vp1 == vp)
		strcpy(path, rpath);
	else {
		vput(vp1);
		error = ENOENT;
	}

out:
	free(fbuf, M_TEMP);
	return (error);
}

#ifdef DDB
static void
db_print_vpath(struct vnode *vp)
{

	while (vp != NULL) {
		db_printf("%p: ", vp);
		if (vp == rootvnode) {
			db_printf("/");
			vp = NULL;
		} else {
			if (vp->v_vflag & VV_ROOT) {
				db_printf("<mount point>");
				vp = vp->v_mount->mnt_vnodecovered;
			} else {
				struct namecache *ncp;
				char *ncn;
				int i;

				ncp = TAILQ_FIRST(&vp->v_cache_dst);
				if (ncp != NULL) {
					ncn = ncp->nc_name;
					for (i = 0; i < ncp->nc_nlen; i++)
						db_printf("%c", *ncn++);
					vp = ncp->nc_dvp;
				} else {
					vp = NULL;
				}
			}
		}
		db_printf("\n");
	}

	return;
}

DB_SHOW_COMMAND(vpath, db_show_vpath)
{
	struct vnode *vp;

	if (!have_addr) {
		db_printf("usage: show vpath <struct vnode *>\n");
		return;
	}

	vp = (struct vnode *)addr;
	db_print_vpath(vp);
}

#endif
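
/*
 * Example use from the ddb prompt, with a placeholder vnode address (any
 * vnode pointer printed by another ddb command can be used):
 *
 *	db> show vpath 0xfffff80004a12000
 *
 * Each output line corresponds to one step of the walk performed by
 * db_print_vpath() above.
 */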