1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1989, 1993, 1995 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Poul-Henning Kamp of the FreeBSD Project. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95 35 */ 36 37 #include <sys/cdefs.h> 38 __FBSDID("$FreeBSD$"); 39 40 #include "opt_ddb.h" 41 #include "opt_ktrace.h" 42 43 #include <sys/param.h> 44 #include <sys/systm.h> 45 #include <sys/capsicum.h> 46 #include <sys/counter.h> 47 #include <sys/filedesc.h> 48 #include <sys/fnv_hash.h> 49 #include <sys/kernel.h> 50 #include <sys/ktr.h> 51 #include <sys/lock.h> 52 #include <sys/malloc.h> 53 #include <sys/fcntl.h> 54 #include <sys/mount.h> 55 #include <sys/namei.h> 56 #include <sys/proc.h> 57 #include <sys/rwlock.h> 58 #include <sys/sdt.h> 59 #include <sys/smp.h> 60 #include <sys/syscallsubr.h> 61 #include <sys/sysctl.h> 62 #include <sys/sysproto.h> 63 #include <sys/vnode.h> 64 #ifdef KTRACE 65 #include <sys/ktrace.h> 66 #endif 67 68 #ifdef DDB 69 #include <ddb/ddb.h> 70 #endif 71 72 #include <vm/uma.h> 73 74 SDT_PROVIDER_DECLARE(vfs); 75 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *", 76 "struct vnode *"); 77 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *", 78 "char *"); 79 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *"); 80 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *", 81 "char *", "struct vnode *"); 82 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *"); 83 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int", 84 "struct vnode *", "char *"); 85 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *", 86 "struct vnode *"); 87 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative, 88 "struct vnode *", "char *"); 89 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *", 90 "char *"); 91 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *"); 92 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *"); 93 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *"); 94 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *", 95 "struct vnode *"); 96 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *", 97 "char *"); 98 SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *", 99 "char *"); 100 101 /* 102 * This structure describes the elements in the cache of recent 103 * names looked up by namei. 104 */ 105 106 struct namecache { 107 LIST_ENTRY(namecache) nc_hash; /* hash chain */ 108 LIST_ENTRY(namecache) nc_src; /* source vnode list */ 109 TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */ 110 struct vnode *nc_dvp; /* vnode of parent of name */ 111 union { 112 struct vnode *nu_vp; /* vnode the name refers to */ 113 } n_un; 114 u_char nc_flag; /* flag bits */ 115 u_char nc_nlen; /* length of name */ 116 char nc_name[0]; /* segment name + nul */ 117 }; 118 119 /* 120 * struct namecache_ts repeats struct namecache layout up to the 121 * nc_nlen member. 122 * struct namecache_ts is used in place of struct namecache when time(s) need 123 * to be stored. The nc_dotdottime field is used when a cache entry is mapping 124 * both a non-dotdot directory name plus dotdot for the directory's 125 * parent. 126 */ 127 struct namecache_ts { 128 struct timespec nc_time; /* timespec provided by fs */ 129 struct timespec nc_dotdottime; /* dotdot timespec provided by fs */ 130 int nc_ticks; /* ticks value when entry was added */ 131 struct namecache nc_nc; 132 }; 133 134 #define nc_vp n_un.nu_vp 135 136 /* 137 * Flags in namecache.nc_flag 138 */ 139 #define NCF_WHITE 0x01 140 #define NCF_ISDOTDOT 0x02 141 #define NCF_TS 0x04 142 #define NCF_DTS 0x08 143 #define NCF_DVDROP 0x10 144 #define NCF_NEGATIVE 0x20 145 #define NCF_HOTNEGATIVE 0x40 146 147 /* 148 * Name caching works as follows: 149 * 150 * Names found by directory scans are retained in a cache 151 * for future reference. It is managed LRU, so frequently 152 * used names will hang around. Cache is indexed by hash value 153 * obtained from (dvp, name) where dvp refers to the directory 154 * containing name. 155 * 156 * If it is a "negative" entry, (i.e. for a name that is known NOT to 157 * exist) the vnode pointer will be NULL. 158 * 159 * Upon reaching the last segment of a path, if the reference 160 * is for DELETE, or NOCACHE is set (rewrite), and the 161 * name is located in the cache, it will be dropped. 162 * 163 * These locks are used (in the order in which they can be taken): 164 * NAME TYPE ROLE 165 * vnodelock mtx vnode lists and v_cache_dd field protection 166 * bucketlock rwlock for access to given set of hash buckets 167 * neglist mtx negative entry LRU management 168 * 169 * Additionally, ncneg_shrink_lock mtx is used to have at most one thread 170 * shrinking the LRU list. 171 * 172 * It is legal to take multiple vnodelock and bucketlock locks. The locking 173 * order is lower address first. Both are recursive. 174 * 175 * "." lookups are lockless. 176 * 177 * ".." and vnode -> name lookups require vnodelock. 178 * 179 * name -> vnode lookup requires the relevant bucketlock to be held for reading. 180 * 181 * Insertions and removals of entries require involved vnodes and bucketlocks 182 * to be write-locked to prevent other threads from seeing the entry. 183 * 184 * Some lookups result in removal of the found entry (e.g. getting rid of a 185 * negative entry with the intent to create a positive one), which poses a 186 * problem when multiple threads reach the state. Similarly, two different 187 * threads can purge two different vnodes and try to remove the same name. 188 * 189 * If the already held vnode lock is lower than the second required lock, we 190 * can just take the other lock. However, in the opposite case, this could 191 * deadlock. As such, this is resolved by trylocking and if that fails unlocking 192 * the first node, locking everything in order and revalidating the state. 193 */ 194 195 /* 196 * Structures associated with name caching. 197 */ 198 #define NCHHASH(hash) \ 199 (&nchashtbl[(hash) & nchash]) 200 static __read_mostly LIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */ 201 static u_long __read_mostly nchash; /* size of hash table */ 202 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, 203 "Size of namecache hash table"); 204 static u_long __read_mostly ncnegfactor = 5; /* ratio of negative entries */ 205 SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, 206 "Ratio of negative namecache entries"); 207 static u_long __exclusive_cache_line numneg; /* number of negative entries allocated */ 208 static u_long __exclusive_cache_line numcache;/* number of cache entries allocated */ 209 u_int ncsizefactor = 2; 210 SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0, 211 "Size factor for namecache"); 212 static u_int __read_mostly ncpurgeminvnodes; 213 SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0, 214 "Number of vnodes below which purgevfs ignores the request"); 215 static u_int __read_mostly ncsize; /* the size as computed on creation or resizing */ 216 217 struct nchstats nchstats; /* cache effectiveness statistics */ 218 219 static struct mtx __exclusive_cache_line ncneg_shrink_lock; 220 static int shrink_list_turn; 221 222 struct neglist { 223 struct mtx nl_lock; 224 TAILQ_HEAD(, namecache) nl_list; 225 } __aligned(CACHE_LINE_SIZE); 226 227 static struct neglist __read_mostly *neglists; 228 static struct neglist ncneg_hot; 229 static u_long numhotneg; 230 231 #define numneglists (ncneghash + 1) 232 static u_int __read_mostly ncneghash; 233 static inline struct neglist * 234 NCP2NEGLIST(struct namecache *ncp) 235 { 236 237 return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]); 238 } 239 240 #define numbucketlocks (ncbuckethash + 1) 241 static u_int __read_mostly ncbuckethash; 242 static struct rwlock_padalign __read_mostly *bucketlocks; 243 #define HASH2BUCKETLOCK(hash) \ 244 ((struct rwlock *)(&bucketlocks[((hash) & ncbuckethash)])) 245 246 #define numvnodelocks (ncvnodehash + 1) 247 static u_int __read_mostly ncvnodehash; 248 static struct mtx __read_mostly *vnodelocks; 249 static inline struct mtx * 250 VP2VNODELOCK(struct vnode *vp) 251 { 252 253 return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]); 254 } 255 256 /* 257 * UMA zones for the VFS cache. 258 * 259 * The small cache is used for entries with short names, which are the 260 * most common. The large cache is used for entries which are too big to 261 * fit in the small cache. 262 */ 263 static uma_zone_t __read_mostly cache_zone_small; 264 static uma_zone_t __read_mostly cache_zone_small_ts; 265 static uma_zone_t __read_mostly cache_zone_large; 266 static uma_zone_t __read_mostly cache_zone_large_ts; 267 268 #define CACHE_PATH_CUTOFF 35 269 270 static struct namecache * 271 cache_alloc(int len, int ts) 272 { 273 struct namecache_ts *ncp_ts; 274 struct namecache *ncp; 275 276 if (__predict_false(ts)) { 277 if (len <= CACHE_PATH_CUTOFF) 278 ncp_ts = uma_zalloc(cache_zone_small_ts, M_WAITOK); 279 else 280 ncp_ts = uma_zalloc(cache_zone_large_ts, M_WAITOK); 281 ncp = &ncp_ts->nc_nc; 282 } else { 283 if (len <= CACHE_PATH_CUTOFF) 284 ncp = uma_zalloc(cache_zone_small, M_WAITOK); 285 else 286 ncp = uma_zalloc(cache_zone_large, M_WAITOK); 287 } 288 return (ncp); 289 } 290 291 static void 292 cache_free(struct namecache *ncp) 293 { 294 struct namecache_ts *ncp_ts; 295 296 if (ncp == NULL) 297 return; 298 if ((ncp->nc_flag & NCF_DVDROP) != 0) 299 vdrop(ncp->nc_dvp); 300 if (__predict_false(ncp->nc_flag & NCF_TS)) { 301 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 302 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 303 uma_zfree(cache_zone_small_ts, ncp_ts); 304 else 305 uma_zfree(cache_zone_large_ts, ncp_ts); 306 } else { 307 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 308 uma_zfree(cache_zone_small, ncp); 309 else 310 uma_zfree(cache_zone_large, ncp); 311 } 312 } 313 314 static void 315 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp) 316 { 317 struct namecache_ts *ncp_ts; 318 319 KASSERT((ncp->nc_flag & NCF_TS) != 0 || 320 (tsp == NULL && ticksp == NULL), 321 ("No NCF_TS")); 322 323 if (tsp == NULL && ticksp == NULL) 324 return; 325 326 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 327 if (tsp != NULL) 328 *tsp = ncp_ts->nc_time; 329 if (ticksp != NULL) 330 *ticksp = ncp_ts->nc_ticks; 331 } 332 333 #ifdef DEBUG_CACHE 334 static int __read_mostly doingcache = 1; /* 1 => enable the cache */ 335 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, 336 "VFS namecache enabled"); 337 #endif 338 339 /* Export size information to userland */ 340 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 341 sizeof(struct namecache), "sizeof(struct namecache)"); 342 343 /* 344 * The new name cache statistics 345 */ 346 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 347 "Name cache statistics"); 348 #define STATNODE_ULONG(name, descr) \ 349 SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr); 350 #define STATNODE_COUNTER(name, descr) \ 351 static COUNTER_U64_DEFINE_EARLY(name); \ 352 SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, \ 353 descr); 354 STATNODE_ULONG(numneg, "Number of negative cache entries"); 355 STATNODE_ULONG(numcache, "Number of cache entries"); 356 STATNODE_COUNTER(numcachehv, "Number of namecache entries with vnodes held"); 357 STATNODE_COUNTER(numcalls, "Number of cache lookups"); 358 STATNODE_COUNTER(dothits, "Number of '.' hits"); 359 STATNODE_COUNTER(dotdothits, "Number of '..' hits"); 360 STATNODE_COUNTER(numchecks, "Number of checks in lookup"); 361 STATNODE_COUNTER(nummiss, "Number of cache misses"); 362 STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache"); 363 STATNODE_COUNTER(numposzaps, 364 "Number of cache hits (positive) we do not want to cache"); 365 STATNODE_COUNTER(numposhits, "Number of cache hits (positive)"); 366 STATNODE_COUNTER(numnegzaps, 367 "Number of cache hits (negative) we do not want to cache"); 368 STATNODE_COUNTER(numneghits, "Number of cache hits (negative)"); 369 /* These count for vn_getcwd(), too. */ 370 STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls"); 371 STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)"); 372 STATNODE_COUNTER(numfullpathfail2, 373 "Number of fullpath search errors (VOP_VPTOCNP failures)"); 374 STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); 375 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls"); 376 STATNODE_COUNTER(zap_and_exit_bucket_relock_success, 377 "Number of successful removals after relocking"); 378 static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail, 379 "Number of times zap_and_exit failed to lock"); 380 static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2, 381 "Number of times zap_and_exit failed to lock"); 382 static long cache_lock_vnodes_cel_3_failures; 383 STATNODE_ULONG(cache_lock_vnodes_cel_3_failures, 384 "Number of times 3-way vnode locking failed"); 385 STATNODE_ULONG(numhotneg, "Number of hot negative entries"); 386 STATNODE_COUNTER(numneg_evicted, 387 "Number of negative entries evicted when adding a new entry"); 388 STATNODE_COUNTER(shrinking_skipped, 389 "Number of times shrinking was already in progress"); 390 391 static void cache_zap_locked(struct namecache *ncp, bool neg_locked); 392 static int vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf, 393 char **freebuf, size_t *buflen); 394 static int vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir, 395 char *buf, char **retbuf, size_t *buflen); 396 static int vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir, 397 char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend); 398 399 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); 400 401 static int cache_yield; 402 SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0, 403 "Number of times cache called yield"); 404 405 static void __noinline 406 cache_maybe_yield(void) 407 { 408 409 if (should_yield()) { 410 cache_yield++; 411 kern_yield(PRI_USER); 412 } 413 } 414 415 static inline void 416 cache_assert_vlp_locked(struct mtx *vlp) 417 { 418 419 if (vlp != NULL) 420 mtx_assert(vlp, MA_OWNED); 421 } 422 423 static inline void 424 cache_assert_vnode_locked(struct vnode *vp) 425 { 426 struct mtx *vlp; 427 428 vlp = VP2VNODELOCK(vp); 429 cache_assert_vlp_locked(vlp); 430 } 431 432 static uint32_t 433 cache_get_hash(char *name, u_char len, struct vnode *dvp) 434 { 435 uint32_t hash; 436 437 hash = fnv_32_buf(name, len, FNV1_32_INIT); 438 hash = fnv_32_buf(&dvp, sizeof(dvp), hash); 439 return (hash); 440 } 441 442 static inline struct rwlock * 443 NCP2BUCKETLOCK(struct namecache *ncp) 444 { 445 uint32_t hash; 446 447 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); 448 return (HASH2BUCKETLOCK(hash)); 449 } 450 451 #ifdef INVARIANTS 452 static void 453 cache_assert_bucket_locked(struct namecache *ncp, int mode) 454 { 455 struct rwlock *blp; 456 457 blp = NCP2BUCKETLOCK(ncp); 458 rw_assert(blp, mode); 459 } 460 #else 461 #define cache_assert_bucket_locked(x, y) do { } while (0) 462 #endif 463 464 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y)) 465 static void 466 _cache_sort_vnodes(void **p1, void **p2) 467 { 468 void *tmp; 469 470 MPASS(*p1 != NULL || *p2 != NULL); 471 472 if (*p1 > *p2) { 473 tmp = *p2; 474 *p2 = *p1; 475 *p1 = tmp; 476 } 477 } 478 479 static void 480 cache_lock_all_buckets(void) 481 { 482 u_int i; 483 484 for (i = 0; i < numbucketlocks; i++) 485 rw_wlock(&bucketlocks[i]); 486 } 487 488 static void 489 cache_unlock_all_buckets(void) 490 { 491 u_int i; 492 493 for (i = 0; i < numbucketlocks; i++) 494 rw_wunlock(&bucketlocks[i]); 495 } 496 497 static void 498 cache_lock_all_vnodes(void) 499 { 500 u_int i; 501 502 for (i = 0; i < numvnodelocks; i++) 503 mtx_lock(&vnodelocks[i]); 504 } 505 506 static void 507 cache_unlock_all_vnodes(void) 508 { 509 u_int i; 510 511 for (i = 0; i < numvnodelocks; i++) 512 mtx_unlock(&vnodelocks[i]); 513 } 514 515 static int 516 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 517 { 518 519 cache_sort_vnodes(&vlp1, &vlp2); 520 521 if (vlp1 != NULL) { 522 if (!mtx_trylock(vlp1)) 523 return (EAGAIN); 524 } 525 if (!mtx_trylock(vlp2)) { 526 if (vlp1 != NULL) 527 mtx_unlock(vlp1); 528 return (EAGAIN); 529 } 530 531 return (0); 532 } 533 534 static void 535 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 536 { 537 538 MPASS(vlp1 != NULL || vlp2 != NULL); 539 MPASS(vlp1 <= vlp2); 540 541 if (vlp1 != NULL) 542 mtx_lock(vlp1); 543 if (vlp2 != NULL) 544 mtx_lock(vlp2); 545 } 546 547 static void 548 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 549 { 550 551 MPASS(vlp1 != NULL || vlp2 != NULL); 552 553 if (vlp1 != NULL) 554 mtx_unlock(vlp1); 555 if (vlp2 != NULL) 556 mtx_unlock(vlp2); 557 } 558 559 static int 560 sysctl_nchstats(SYSCTL_HANDLER_ARGS) 561 { 562 struct nchstats snap; 563 564 if (req->oldptr == NULL) 565 return (SYSCTL_OUT(req, 0, sizeof(snap))); 566 567 snap = nchstats; 568 snap.ncs_goodhits = counter_u64_fetch(numposhits); 569 snap.ncs_neghits = counter_u64_fetch(numneghits); 570 snap.ncs_badhits = counter_u64_fetch(numposzaps) + 571 counter_u64_fetch(numnegzaps); 572 snap.ncs_miss = counter_u64_fetch(nummisszap) + 573 counter_u64_fetch(nummiss); 574 575 return (SYSCTL_OUT(req, &snap, sizeof(snap))); 576 } 577 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD | 578 CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU", 579 "VFS cache effectiveness statistics"); 580 581 #ifdef DIAGNOSTIC 582 /* 583 * Grab an atomic snapshot of the name cache hash chain lengths 584 */ 585 static SYSCTL_NODE(_debug, OID_AUTO, hashstat, 586 CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 587 "hash table stats"); 588 589 static int 590 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS) 591 { 592 struct nchashhead *ncpp; 593 struct namecache *ncp; 594 int i, error, n_nchash, *cntbuf; 595 596 retry: 597 n_nchash = nchash + 1; /* nchash is max index, not count */ 598 if (req->oldptr == NULL) 599 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int)); 600 cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK); 601 cache_lock_all_buckets(); 602 if (n_nchash != nchash + 1) { 603 cache_unlock_all_buckets(); 604 free(cntbuf, M_TEMP); 605 goto retry; 606 } 607 /* Scan hash tables counting entries */ 608 for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++) 609 LIST_FOREACH(ncp, ncpp, nc_hash) 610 cntbuf[i]++; 611 cache_unlock_all_buckets(); 612 for (error = 0, i = 0; i < n_nchash; i++) 613 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0) 614 break; 615 free(cntbuf, M_TEMP); 616 return (error); 617 } 618 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD| 619 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int", 620 "nchash chain lengths"); 621 622 static int 623 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS) 624 { 625 int error; 626 struct nchashhead *ncpp; 627 struct namecache *ncp; 628 int n_nchash; 629 int count, maxlength, used, pct; 630 631 if (!req->oldptr) 632 return SYSCTL_OUT(req, 0, 4 * sizeof(int)); 633 634 cache_lock_all_buckets(); 635 n_nchash = nchash + 1; /* nchash is max index, not count */ 636 used = 0; 637 maxlength = 0; 638 639 /* Scan hash tables for applicable entries */ 640 for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { 641 count = 0; 642 LIST_FOREACH(ncp, ncpp, nc_hash) { 643 count++; 644 } 645 if (count) 646 used++; 647 if (maxlength < count) 648 maxlength = count; 649 } 650 n_nchash = nchash + 1; 651 cache_unlock_all_buckets(); 652 pct = (used * 100) / (n_nchash / 100); 653 error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash)); 654 if (error) 655 return (error); 656 error = SYSCTL_OUT(req, &used, sizeof(used)); 657 if (error) 658 return (error); 659 error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength)); 660 if (error) 661 return (error); 662 error = SYSCTL_OUT(req, &pct, sizeof(pct)); 663 if (error) 664 return (error); 665 return (0); 666 } 667 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD| 668 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I", 669 "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)"); 670 #endif 671 672 /* 673 * Negative entries management 674 * 675 * A variation of LRU scheme is used. New entries are hashed into one of 676 * numneglists cold lists. Entries get promoted to the hot list on first hit. 677 * 678 * The shrinker will demote hot list head and evict from the cold list in a 679 * round-robin manner. 680 */ 681 static void 682 cache_negative_hit(struct namecache *ncp) 683 { 684 struct neglist *neglist; 685 686 MPASS(ncp->nc_flag & NCF_NEGATIVE); 687 if (ncp->nc_flag & NCF_HOTNEGATIVE) 688 return; 689 neglist = NCP2NEGLIST(ncp); 690 mtx_lock(&ncneg_hot.nl_lock); 691 mtx_lock(&neglist->nl_lock); 692 if (!(ncp->nc_flag & NCF_HOTNEGATIVE)) { 693 numhotneg++; 694 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 695 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst); 696 ncp->nc_flag |= NCF_HOTNEGATIVE; 697 } 698 mtx_unlock(&neglist->nl_lock); 699 mtx_unlock(&ncneg_hot.nl_lock); 700 } 701 702 static void 703 cache_negative_insert(struct namecache *ncp, bool neg_locked) 704 { 705 struct neglist *neglist; 706 707 MPASS(ncp->nc_flag & NCF_NEGATIVE); 708 cache_assert_bucket_locked(ncp, RA_WLOCKED); 709 neglist = NCP2NEGLIST(ncp); 710 if (!neg_locked) { 711 mtx_lock(&neglist->nl_lock); 712 } else { 713 mtx_assert(&neglist->nl_lock, MA_OWNED); 714 } 715 TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst); 716 if (!neg_locked) 717 mtx_unlock(&neglist->nl_lock); 718 atomic_add_rel_long(&numneg, 1); 719 } 720 721 static void 722 cache_negative_remove(struct namecache *ncp, bool neg_locked) 723 { 724 struct neglist *neglist; 725 bool hot_locked = false; 726 bool list_locked = false; 727 728 MPASS(ncp->nc_flag & NCF_NEGATIVE); 729 cache_assert_bucket_locked(ncp, RA_WLOCKED); 730 neglist = NCP2NEGLIST(ncp); 731 if (!neg_locked) { 732 if (ncp->nc_flag & NCF_HOTNEGATIVE) { 733 hot_locked = true; 734 mtx_lock(&ncneg_hot.nl_lock); 735 if (!(ncp->nc_flag & NCF_HOTNEGATIVE)) { 736 list_locked = true; 737 mtx_lock(&neglist->nl_lock); 738 } 739 } else { 740 list_locked = true; 741 mtx_lock(&neglist->nl_lock); 742 } 743 } 744 if (ncp->nc_flag & NCF_HOTNEGATIVE) { 745 mtx_assert(&ncneg_hot.nl_lock, MA_OWNED); 746 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); 747 numhotneg--; 748 } else { 749 mtx_assert(&neglist->nl_lock, MA_OWNED); 750 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 751 } 752 if (list_locked) 753 mtx_unlock(&neglist->nl_lock); 754 if (hot_locked) 755 mtx_unlock(&ncneg_hot.nl_lock); 756 atomic_subtract_rel_long(&numneg, 1); 757 } 758 759 static void 760 cache_negative_shrink_select(int start, struct namecache **ncpp, 761 struct neglist **neglistpp) 762 { 763 struct neglist *neglist; 764 struct namecache *ncp; 765 int i; 766 767 *ncpp = ncp = NULL; 768 neglist = NULL; 769 770 for (i = start; i < numneglists; i++) { 771 neglist = &neglists[i]; 772 if (TAILQ_FIRST(&neglist->nl_list) == NULL) 773 continue; 774 mtx_lock(&neglist->nl_lock); 775 ncp = TAILQ_FIRST(&neglist->nl_list); 776 if (ncp != NULL) 777 break; 778 mtx_unlock(&neglist->nl_lock); 779 } 780 781 *neglistpp = neglist; 782 *ncpp = ncp; 783 } 784 785 static void 786 cache_negative_zap_one(void) 787 { 788 struct namecache *ncp, *ncp2; 789 struct neglist *neglist; 790 struct mtx *dvlp; 791 struct rwlock *blp; 792 793 if (mtx_owner(&ncneg_shrink_lock) != NULL || 794 !mtx_trylock(&ncneg_shrink_lock)) { 795 counter_u64_add(shrinking_skipped, 1); 796 return; 797 } 798 799 mtx_lock(&ncneg_hot.nl_lock); 800 ncp = TAILQ_FIRST(&ncneg_hot.nl_list); 801 if (ncp != NULL) { 802 neglist = NCP2NEGLIST(ncp); 803 mtx_lock(&neglist->nl_lock); 804 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); 805 TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst); 806 ncp->nc_flag &= ~NCF_HOTNEGATIVE; 807 numhotneg--; 808 mtx_unlock(&neglist->nl_lock); 809 } 810 mtx_unlock(&ncneg_hot.nl_lock); 811 812 cache_negative_shrink_select(shrink_list_turn, &ncp, &neglist); 813 shrink_list_turn++; 814 if (shrink_list_turn == numneglists) 815 shrink_list_turn = 0; 816 if (ncp == NULL && shrink_list_turn == 0) 817 cache_negative_shrink_select(shrink_list_turn, &ncp, &neglist); 818 mtx_unlock(&ncneg_shrink_lock); 819 if (ncp == NULL) 820 return; 821 822 MPASS(ncp->nc_flag & NCF_NEGATIVE); 823 dvlp = VP2VNODELOCK(ncp->nc_dvp); 824 blp = NCP2BUCKETLOCK(ncp); 825 mtx_unlock(&neglist->nl_lock); 826 mtx_lock(dvlp); 827 rw_wlock(blp); 828 mtx_lock(&neglist->nl_lock); 829 ncp2 = TAILQ_FIRST(&neglist->nl_list); 830 if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) || 831 blp != NCP2BUCKETLOCK(ncp2) || !(ncp2->nc_flag & NCF_NEGATIVE)) { 832 ncp = NULL; 833 } else { 834 SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp, 835 ncp->nc_name); 836 837 cache_zap_locked(ncp, true); 838 counter_u64_add(numneg_evicted, 1); 839 } 840 mtx_unlock(&neglist->nl_lock); 841 rw_wunlock(blp); 842 mtx_unlock(dvlp); 843 cache_free(ncp); 844 } 845 846 /* 847 * cache_zap_locked(): 848 * 849 * Removes a namecache entry from cache, whether it contains an actual 850 * pointer to a vnode or if it is just a negative cache entry. 851 */ 852 static void 853 cache_zap_locked(struct namecache *ncp, bool neg_locked) 854 { 855 856 if (!(ncp->nc_flag & NCF_NEGATIVE)) 857 cache_assert_vnode_locked(ncp->nc_vp); 858 cache_assert_vnode_locked(ncp->nc_dvp); 859 cache_assert_bucket_locked(ncp, RA_WLOCKED); 860 861 CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, 862 (ncp->nc_flag & NCF_NEGATIVE) ? NULL : ncp->nc_vp); 863 LIST_REMOVE(ncp, nc_hash); 864 if (!(ncp->nc_flag & NCF_NEGATIVE)) { 865 SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp, 866 ncp->nc_name, ncp->nc_vp); 867 TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst); 868 if (ncp == ncp->nc_vp->v_cache_dd) 869 ncp->nc_vp->v_cache_dd = NULL; 870 } else { 871 SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp, 872 ncp->nc_name); 873 cache_negative_remove(ncp, neg_locked); 874 } 875 if (ncp->nc_flag & NCF_ISDOTDOT) { 876 if (ncp == ncp->nc_dvp->v_cache_dd) 877 ncp->nc_dvp->v_cache_dd = NULL; 878 } else { 879 LIST_REMOVE(ncp, nc_src); 880 if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) { 881 ncp->nc_flag |= NCF_DVDROP; 882 counter_u64_add(numcachehv, -1); 883 } 884 } 885 atomic_subtract_rel_long(&numcache, 1); 886 } 887 888 static void 889 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp) 890 { 891 struct rwlock *blp; 892 893 MPASS(ncp->nc_dvp == vp); 894 MPASS(ncp->nc_flag & NCF_NEGATIVE); 895 cache_assert_vnode_locked(vp); 896 897 blp = NCP2BUCKETLOCK(ncp); 898 rw_wlock(blp); 899 cache_zap_locked(ncp, false); 900 rw_wunlock(blp); 901 } 902 903 static bool 904 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp, 905 struct mtx **vlpp) 906 { 907 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; 908 struct rwlock *blp; 909 910 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); 911 cache_assert_vnode_locked(vp); 912 913 if (ncp->nc_flag & NCF_NEGATIVE) { 914 if (*vlpp != NULL) { 915 mtx_unlock(*vlpp); 916 *vlpp = NULL; 917 } 918 cache_zap_negative_locked_vnode_kl(ncp, vp); 919 return (true); 920 } 921 922 pvlp = VP2VNODELOCK(vp); 923 blp = NCP2BUCKETLOCK(ncp); 924 vlp1 = VP2VNODELOCK(ncp->nc_dvp); 925 vlp2 = VP2VNODELOCK(ncp->nc_vp); 926 927 if (*vlpp == vlp1 || *vlpp == vlp2) { 928 to_unlock = *vlpp; 929 *vlpp = NULL; 930 } else { 931 if (*vlpp != NULL) { 932 mtx_unlock(*vlpp); 933 *vlpp = NULL; 934 } 935 cache_sort_vnodes(&vlp1, &vlp2); 936 if (vlp1 == pvlp) { 937 mtx_lock(vlp2); 938 to_unlock = vlp2; 939 } else { 940 if (!mtx_trylock(vlp1)) 941 goto out_relock; 942 to_unlock = vlp1; 943 } 944 } 945 rw_wlock(blp); 946 cache_zap_locked(ncp, false); 947 rw_wunlock(blp); 948 if (to_unlock != NULL) 949 mtx_unlock(to_unlock); 950 return (true); 951 952 out_relock: 953 mtx_unlock(vlp2); 954 mtx_lock(vlp1); 955 mtx_lock(vlp2); 956 MPASS(*vlpp == NULL); 957 *vlpp = vlp1; 958 return (false); 959 } 960 961 static int __noinline 962 cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp) 963 { 964 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; 965 struct rwlock *blp; 966 int error = 0; 967 968 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); 969 cache_assert_vnode_locked(vp); 970 971 pvlp = VP2VNODELOCK(vp); 972 if (ncp->nc_flag & NCF_NEGATIVE) { 973 cache_zap_negative_locked_vnode_kl(ncp, vp); 974 goto out; 975 } 976 977 blp = NCP2BUCKETLOCK(ncp); 978 vlp1 = VP2VNODELOCK(ncp->nc_dvp); 979 vlp2 = VP2VNODELOCK(ncp->nc_vp); 980 cache_sort_vnodes(&vlp1, &vlp2); 981 if (vlp1 == pvlp) { 982 mtx_lock(vlp2); 983 to_unlock = vlp2; 984 } else { 985 if (!mtx_trylock(vlp1)) { 986 error = EAGAIN; 987 goto out; 988 } 989 to_unlock = vlp1; 990 } 991 rw_wlock(blp); 992 cache_zap_locked(ncp, false); 993 rw_wunlock(blp); 994 mtx_unlock(to_unlock); 995 out: 996 mtx_unlock(pvlp); 997 return (error); 998 } 999 1000 /* 1001 * If trylocking failed we can get here. We know enough to take all needed locks 1002 * in the right order and re-lookup the entry. 1003 */ 1004 static int 1005 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1006 struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash, 1007 struct rwlock *blp) 1008 { 1009 struct namecache *rncp; 1010 1011 cache_assert_bucket_locked(ncp, RA_UNLOCKED); 1012 1013 cache_sort_vnodes(&dvlp, &vlp); 1014 cache_lock_vnodes(dvlp, vlp); 1015 rw_wlock(blp); 1016 LIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) { 1017 if (rncp == ncp && rncp->nc_dvp == dvp && 1018 rncp->nc_nlen == cnp->cn_namelen && 1019 !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen)) 1020 break; 1021 } 1022 if (rncp != NULL) { 1023 cache_zap_locked(rncp, false); 1024 rw_wunlock(blp); 1025 cache_unlock_vnodes(dvlp, vlp); 1026 counter_u64_add(zap_and_exit_bucket_relock_success, 1); 1027 return (0); 1028 } 1029 1030 rw_wunlock(blp); 1031 cache_unlock_vnodes(dvlp, vlp); 1032 return (EAGAIN); 1033 } 1034 1035 static int __noinline 1036 cache_zap_wlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1037 uint32_t hash, struct rwlock *blp) 1038 { 1039 struct mtx *dvlp, *vlp; 1040 struct vnode *dvp; 1041 1042 cache_assert_bucket_locked(ncp, RA_WLOCKED); 1043 1044 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1045 vlp = NULL; 1046 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1047 vlp = VP2VNODELOCK(ncp->nc_vp); 1048 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1049 cache_zap_locked(ncp, false); 1050 rw_wunlock(blp); 1051 cache_unlock_vnodes(dvlp, vlp); 1052 return (0); 1053 } 1054 1055 dvp = ncp->nc_dvp; 1056 rw_wunlock(blp); 1057 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp)); 1058 } 1059 1060 static int __noinline 1061 cache_zap_rlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1062 uint32_t hash, struct rwlock *blp) 1063 { 1064 struct mtx *dvlp, *vlp; 1065 struct vnode *dvp; 1066 1067 cache_assert_bucket_locked(ncp, RA_RLOCKED); 1068 1069 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1070 vlp = NULL; 1071 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1072 vlp = VP2VNODELOCK(ncp->nc_vp); 1073 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1074 rw_runlock(blp); 1075 rw_wlock(blp); 1076 cache_zap_locked(ncp, false); 1077 rw_wunlock(blp); 1078 cache_unlock_vnodes(dvlp, vlp); 1079 return (0); 1080 } 1081 1082 dvp = ncp->nc_dvp; 1083 rw_runlock(blp); 1084 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp)); 1085 } 1086 1087 static int 1088 cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp, 1089 struct mtx **vlpp1, struct mtx **vlpp2) 1090 { 1091 struct mtx *dvlp, *vlp; 1092 1093 cache_assert_bucket_locked(ncp, RA_WLOCKED); 1094 1095 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1096 vlp = NULL; 1097 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1098 vlp = VP2VNODELOCK(ncp->nc_vp); 1099 cache_sort_vnodes(&dvlp, &vlp); 1100 1101 if (*vlpp1 == dvlp && *vlpp2 == vlp) { 1102 cache_zap_locked(ncp, false); 1103 cache_unlock_vnodes(dvlp, vlp); 1104 *vlpp1 = NULL; 1105 *vlpp2 = NULL; 1106 return (0); 1107 } 1108 1109 if (*vlpp1 != NULL) 1110 mtx_unlock(*vlpp1); 1111 if (*vlpp2 != NULL) 1112 mtx_unlock(*vlpp2); 1113 *vlpp1 = NULL; 1114 *vlpp2 = NULL; 1115 1116 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1117 cache_zap_locked(ncp, false); 1118 cache_unlock_vnodes(dvlp, vlp); 1119 return (0); 1120 } 1121 1122 rw_wunlock(blp); 1123 *vlpp1 = dvlp; 1124 *vlpp2 = vlp; 1125 if (*vlpp1 != NULL) 1126 mtx_lock(*vlpp1); 1127 mtx_lock(*vlpp2); 1128 rw_wlock(blp); 1129 return (EAGAIN); 1130 } 1131 1132 static void 1133 cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp) 1134 { 1135 1136 if (blp != NULL) { 1137 rw_runlock(blp); 1138 } else { 1139 mtx_unlock(vlp); 1140 } 1141 } 1142 1143 static int __noinline 1144 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1145 struct timespec *tsp, int *ticksp) 1146 { 1147 int ltype; 1148 1149 *vpp = dvp; 1150 CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .", 1151 dvp, cnp->cn_nameptr); 1152 counter_u64_add(dothits, 1); 1153 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp); 1154 if (tsp != NULL) 1155 timespecclear(tsp); 1156 if (ticksp != NULL) 1157 *ticksp = ticks; 1158 vrefact(*vpp); 1159 /* 1160 * When we lookup "." we still can be asked to lock it 1161 * differently... 1162 */ 1163 ltype = cnp->cn_lkflags & LK_TYPE_MASK; 1164 if (ltype != VOP_ISLOCKED(*vpp)) { 1165 if (ltype == LK_EXCLUSIVE) { 1166 vn_lock(*vpp, LK_UPGRADE | LK_RETRY); 1167 if (VN_IS_DOOMED((*vpp))) { 1168 /* forced unmount */ 1169 vrele(*vpp); 1170 *vpp = NULL; 1171 return (ENOENT); 1172 } 1173 } else 1174 vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY); 1175 } 1176 return (-1); 1177 } 1178 1179 static __noinline int 1180 cache_lookup_nomakeentry(struct vnode *dvp, struct vnode **vpp, 1181 struct componentname *cnp, struct timespec *tsp, int *ticksp) 1182 { 1183 struct namecache *ncp; 1184 struct rwlock *blp; 1185 struct mtx *dvlp, *dvlp2; 1186 uint32_t hash; 1187 int error; 1188 1189 if (cnp->cn_namelen == 2 && 1190 cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') { 1191 counter_u64_add(dotdothits, 1); 1192 dvlp = VP2VNODELOCK(dvp); 1193 dvlp2 = NULL; 1194 mtx_lock(dvlp); 1195 retry_dotdot: 1196 ncp = dvp->v_cache_dd; 1197 if (ncp == NULL) { 1198 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, 1199 "..", NULL); 1200 mtx_unlock(dvlp); 1201 if (dvlp2 != NULL) 1202 mtx_unlock(dvlp2); 1203 return (0); 1204 } 1205 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1206 if (ncp->nc_dvp != dvp) 1207 panic("dvp %p v_cache_dd %p\n", dvp, ncp); 1208 if (!cache_zap_locked_vnode_kl2(ncp, 1209 dvp, &dvlp2)) 1210 goto retry_dotdot; 1211 MPASS(dvp->v_cache_dd == NULL); 1212 mtx_unlock(dvlp); 1213 if (dvlp2 != NULL) 1214 mtx_unlock(dvlp2); 1215 cache_free(ncp); 1216 } else { 1217 dvp->v_cache_dd = NULL; 1218 mtx_unlock(dvlp); 1219 if (dvlp2 != NULL) 1220 mtx_unlock(dvlp2); 1221 } 1222 return (0); 1223 } 1224 1225 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1226 blp = HASH2BUCKETLOCK(hash); 1227 retry: 1228 if (LIST_EMPTY(NCHHASH(hash))) 1229 goto out_no_entry; 1230 1231 rw_wlock(blp); 1232 1233 LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1234 counter_u64_add(numchecks, 1); 1235 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1236 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1237 break; 1238 } 1239 1240 /* We failed to find an entry */ 1241 if (ncp == NULL) { 1242 rw_wunlock(blp); 1243 goto out_no_entry; 1244 } 1245 1246 error = cache_zap_wlocked_bucket(ncp, cnp, hash, blp); 1247 if (__predict_false(error != 0)) { 1248 zap_and_exit_bucket_fail++; 1249 cache_maybe_yield(); 1250 goto retry; 1251 } 1252 counter_u64_add(numposzaps, 1); 1253 cache_free(ncp); 1254 return (0); 1255 out_no_entry: 1256 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, NULL); 1257 counter_u64_add(nummisszap, 1); 1258 return (0); 1259 } 1260 1261 /** 1262 * Lookup a name in the name cache 1263 * 1264 * # Arguments 1265 * 1266 * - dvp: Parent directory in which to search. 1267 * - vpp: Return argument. Will contain desired vnode on cache hit. 1268 * - cnp: Parameters of the name search. The most interesting bits of 1269 * the cn_flags field have the following meanings: 1270 * - MAKEENTRY: If clear, free an entry from the cache rather than look 1271 * it up. 1272 * - ISDOTDOT: Must be set if and only if cn_nameptr == ".." 1273 * - tsp: Return storage for cache timestamp. On a successful (positive 1274 * or negative) lookup, tsp will be filled with any timespec that 1275 * was stored when this cache entry was created. However, it will 1276 * be clear for "." entries. 1277 * - ticks: Return storage for alternate cache timestamp. On a successful 1278 * (positive or negative) lookup, it will contain the ticks value 1279 * that was current when the cache entry was created, unless cnp 1280 * was ".". 1281 * 1282 * # Returns 1283 * 1284 * - -1: A positive cache hit. vpp will contain the desired vnode. 1285 * - ENOENT: A negative cache hit, or dvp was recycled out from under us due 1286 * to a forced unmount. vpp will not be modified. If the entry 1287 * is a whiteout, then the ISWHITEOUT flag will be set in 1288 * cnp->cn_flags. 1289 * - 0: A cache miss. vpp will not be modified. 1290 * 1291 * # Locking 1292 * 1293 * On a cache hit, vpp will be returned locked and ref'd. If we're looking up 1294 * .., dvp is unlocked. If we're looking up . an extra ref is taken, but the 1295 * lock is not recursively acquired. 1296 */ 1297 int 1298 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1299 struct timespec *tsp, int *ticksp) 1300 { 1301 struct namecache_ts *ncp_ts; 1302 struct namecache *ncp; 1303 struct rwlock *blp; 1304 struct mtx *dvlp; 1305 uint32_t hash; 1306 enum vgetstate vs; 1307 int error, ltype; 1308 1309 #ifdef DEBUG_CACHE 1310 if (__predict_false(!doingcache)) { 1311 cnp->cn_flags &= ~MAKEENTRY; 1312 return (0); 1313 } 1314 #endif 1315 1316 counter_u64_add(numcalls, 1); 1317 1318 if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) 1319 return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp)); 1320 1321 if ((cnp->cn_flags & MAKEENTRY) == 0) 1322 return (cache_lookup_nomakeentry(dvp, vpp, cnp, tsp, ticksp)); 1323 1324 retry: 1325 blp = NULL; 1326 dvlp = NULL; 1327 error = 0; 1328 if (cnp->cn_namelen == 2 && 1329 cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') { 1330 counter_u64_add(dotdothits, 1); 1331 dvlp = VP2VNODELOCK(dvp); 1332 mtx_lock(dvlp); 1333 ncp = dvp->v_cache_dd; 1334 if (ncp == NULL) { 1335 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, 1336 "..", NULL); 1337 mtx_unlock(dvlp); 1338 return (0); 1339 } 1340 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1341 if (ncp->nc_flag & NCF_NEGATIVE) 1342 *vpp = NULL; 1343 else 1344 *vpp = ncp->nc_vp; 1345 } else 1346 *vpp = ncp->nc_dvp; 1347 /* Return failure if negative entry was found. */ 1348 if (*vpp == NULL) 1349 goto negative_success; 1350 CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..", 1351 dvp, cnp->cn_nameptr, *vpp); 1352 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", 1353 *vpp); 1354 cache_out_ts(ncp, tsp, ticksp); 1355 if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) == 1356 NCF_DTS && tsp != NULL) { 1357 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 1358 *tsp = ncp_ts->nc_dotdottime; 1359 } 1360 goto success; 1361 } 1362 1363 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1364 blp = HASH2BUCKETLOCK(hash); 1365 rw_rlock(blp); 1366 1367 LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1368 counter_u64_add(numchecks, 1); 1369 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1370 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1371 break; 1372 } 1373 1374 /* We failed to find an entry */ 1375 if (__predict_false(ncp == NULL)) { 1376 rw_runlock(blp); 1377 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, 1378 NULL); 1379 counter_u64_add(nummiss, 1); 1380 return (0); 1381 } 1382 1383 if (ncp->nc_flag & NCF_NEGATIVE) 1384 goto negative_success; 1385 1386 /* We found a "positive" match, return the vnode */ 1387 counter_u64_add(numposhits, 1); 1388 *vpp = ncp->nc_vp; 1389 CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p", 1390 dvp, cnp->cn_nameptr, *vpp, ncp); 1391 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, 1392 *vpp); 1393 cache_out_ts(ncp, tsp, ticksp); 1394 success: 1395 /* 1396 * On success we return a locked and ref'd vnode as per the lookup 1397 * protocol. 1398 */ 1399 MPASS(dvp != *vpp); 1400 ltype = 0; /* silence gcc warning */ 1401 if (cnp->cn_flags & ISDOTDOT) { 1402 ltype = VOP_ISLOCKED(dvp); 1403 VOP_UNLOCK(dvp); 1404 } 1405 vs = vget_prep(*vpp); 1406 cache_lookup_unlock(blp, dvlp); 1407 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 1408 if (cnp->cn_flags & ISDOTDOT) { 1409 vn_lock(dvp, ltype | LK_RETRY); 1410 if (VN_IS_DOOMED(dvp)) { 1411 if (error == 0) 1412 vput(*vpp); 1413 *vpp = NULL; 1414 return (ENOENT); 1415 } 1416 } 1417 if (error) { 1418 *vpp = NULL; 1419 goto retry; 1420 } 1421 if ((cnp->cn_flags & ISLASTCN) && 1422 (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) { 1423 ASSERT_VOP_ELOCKED(*vpp, "cache_lookup"); 1424 } 1425 return (-1); 1426 1427 negative_success: 1428 /* We found a negative match, and want to create it, so purge */ 1429 if (cnp->cn_nameiop == CREATE) { 1430 counter_u64_add(numnegzaps, 1); 1431 goto zap_and_exit; 1432 } 1433 1434 counter_u64_add(numneghits, 1); 1435 cache_negative_hit(ncp); 1436 if (ncp->nc_flag & NCF_WHITE) 1437 cnp->cn_flags |= ISWHITEOUT; 1438 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, 1439 ncp->nc_name); 1440 cache_out_ts(ncp, tsp, ticksp); 1441 cache_lookup_unlock(blp, dvlp); 1442 return (ENOENT); 1443 1444 zap_and_exit: 1445 if (blp != NULL) 1446 error = cache_zap_rlocked_bucket(ncp, cnp, hash, blp); 1447 else 1448 error = cache_zap_locked_vnode(ncp, dvp); 1449 if (__predict_false(error != 0)) { 1450 zap_and_exit_bucket_fail2++; 1451 cache_maybe_yield(); 1452 goto retry; 1453 } 1454 cache_free(ncp); 1455 return (0); 1456 } 1457 1458 struct celockstate { 1459 struct mtx *vlp[3]; 1460 struct rwlock *blp[2]; 1461 }; 1462 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3)); 1463 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2)); 1464 1465 static inline void 1466 cache_celockstate_init(struct celockstate *cel) 1467 { 1468 1469 bzero(cel, sizeof(*cel)); 1470 } 1471 1472 static void 1473 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp, 1474 struct vnode *dvp) 1475 { 1476 struct mtx *vlp1, *vlp2; 1477 1478 MPASS(cel->vlp[0] == NULL); 1479 MPASS(cel->vlp[1] == NULL); 1480 MPASS(cel->vlp[2] == NULL); 1481 1482 MPASS(vp != NULL || dvp != NULL); 1483 1484 vlp1 = VP2VNODELOCK(vp); 1485 vlp2 = VP2VNODELOCK(dvp); 1486 cache_sort_vnodes(&vlp1, &vlp2); 1487 1488 if (vlp1 != NULL) { 1489 mtx_lock(vlp1); 1490 cel->vlp[0] = vlp1; 1491 } 1492 mtx_lock(vlp2); 1493 cel->vlp[1] = vlp2; 1494 } 1495 1496 static void 1497 cache_unlock_vnodes_cel(struct celockstate *cel) 1498 { 1499 1500 MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL); 1501 1502 if (cel->vlp[0] != NULL) 1503 mtx_unlock(cel->vlp[0]); 1504 if (cel->vlp[1] != NULL) 1505 mtx_unlock(cel->vlp[1]); 1506 if (cel->vlp[2] != NULL) 1507 mtx_unlock(cel->vlp[2]); 1508 } 1509 1510 static bool 1511 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp) 1512 { 1513 struct mtx *vlp; 1514 bool ret; 1515 1516 cache_assert_vlp_locked(cel->vlp[0]); 1517 cache_assert_vlp_locked(cel->vlp[1]); 1518 MPASS(cel->vlp[2] == NULL); 1519 1520 MPASS(vp != NULL); 1521 vlp = VP2VNODELOCK(vp); 1522 1523 ret = true; 1524 if (vlp >= cel->vlp[1]) { 1525 mtx_lock(vlp); 1526 } else { 1527 if (mtx_trylock(vlp)) 1528 goto out; 1529 cache_lock_vnodes_cel_3_failures++; 1530 cache_unlock_vnodes_cel(cel); 1531 if (vlp < cel->vlp[0]) { 1532 mtx_lock(vlp); 1533 mtx_lock(cel->vlp[0]); 1534 mtx_lock(cel->vlp[1]); 1535 } else { 1536 if (cel->vlp[0] != NULL) 1537 mtx_lock(cel->vlp[0]); 1538 mtx_lock(vlp); 1539 mtx_lock(cel->vlp[1]); 1540 } 1541 ret = false; 1542 } 1543 out: 1544 cel->vlp[2] = vlp; 1545 return (ret); 1546 } 1547 1548 static void 1549 cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1, 1550 struct rwlock *blp2) 1551 { 1552 1553 MPASS(cel->blp[0] == NULL); 1554 MPASS(cel->blp[1] == NULL); 1555 1556 cache_sort_vnodes(&blp1, &blp2); 1557 1558 if (blp1 != NULL) { 1559 rw_wlock(blp1); 1560 cel->blp[0] = blp1; 1561 } 1562 rw_wlock(blp2); 1563 cel->blp[1] = blp2; 1564 } 1565 1566 static void 1567 cache_unlock_buckets_cel(struct celockstate *cel) 1568 { 1569 1570 if (cel->blp[0] != NULL) 1571 rw_wunlock(cel->blp[0]); 1572 rw_wunlock(cel->blp[1]); 1573 } 1574 1575 /* 1576 * Lock part of the cache affected by the insertion. 1577 * 1578 * This means vnodelocks for dvp, vp and the relevant bucketlock. 1579 * However, insertion can result in removal of an old entry. In this 1580 * case we have an additional vnode and bucketlock pair to lock. If the 1581 * entry is negative, ncelock is locked instead of the vnode. 1582 * 1583 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while 1584 * preserving the locking order (smaller address first). 1585 */ 1586 static void 1587 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 1588 uint32_t hash) 1589 { 1590 struct namecache *ncp; 1591 struct rwlock *blps[2]; 1592 1593 blps[0] = HASH2BUCKETLOCK(hash); 1594 for (;;) { 1595 blps[1] = NULL; 1596 cache_lock_vnodes_cel(cel, dvp, vp); 1597 if (vp == NULL || vp->v_type != VDIR) 1598 break; 1599 ncp = vp->v_cache_dd; 1600 if (ncp == NULL) 1601 break; 1602 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 1603 break; 1604 MPASS(ncp->nc_dvp == vp); 1605 blps[1] = NCP2BUCKETLOCK(ncp); 1606 if (ncp->nc_flag & NCF_NEGATIVE) 1607 break; 1608 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 1609 break; 1610 /* 1611 * All vnodes got re-locked. Re-validate the state and if 1612 * nothing changed we are done. Otherwise restart. 1613 */ 1614 if (ncp == vp->v_cache_dd && 1615 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 1616 blps[1] == NCP2BUCKETLOCK(ncp) && 1617 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 1618 break; 1619 cache_unlock_vnodes_cel(cel); 1620 cel->vlp[0] = NULL; 1621 cel->vlp[1] = NULL; 1622 cel->vlp[2] = NULL; 1623 } 1624 cache_lock_buckets_cel(cel, blps[0], blps[1]); 1625 } 1626 1627 static void 1628 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 1629 uint32_t hash) 1630 { 1631 struct namecache *ncp; 1632 struct rwlock *blps[2]; 1633 1634 blps[0] = HASH2BUCKETLOCK(hash); 1635 for (;;) { 1636 blps[1] = NULL; 1637 cache_lock_vnodes_cel(cel, dvp, vp); 1638 ncp = dvp->v_cache_dd; 1639 if (ncp == NULL) 1640 break; 1641 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 1642 break; 1643 MPASS(ncp->nc_dvp == dvp); 1644 blps[1] = NCP2BUCKETLOCK(ncp); 1645 if (ncp->nc_flag & NCF_NEGATIVE) 1646 break; 1647 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 1648 break; 1649 if (ncp == dvp->v_cache_dd && 1650 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 1651 blps[1] == NCP2BUCKETLOCK(ncp) && 1652 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 1653 break; 1654 cache_unlock_vnodes_cel(cel); 1655 cel->vlp[0] = NULL; 1656 cel->vlp[1] = NULL; 1657 cel->vlp[2] = NULL; 1658 } 1659 cache_lock_buckets_cel(cel, blps[0], blps[1]); 1660 } 1661 1662 static void 1663 cache_enter_unlock(struct celockstate *cel) 1664 { 1665 1666 cache_unlock_buckets_cel(cel); 1667 cache_unlock_vnodes_cel(cel); 1668 } 1669 1670 static void __noinline 1671 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp, 1672 struct componentname *cnp) 1673 { 1674 struct celockstate cel; 1675 struct namecache *ncp; 1676 uint32_t hash; 1677 int len; 1678 1679 if (dvp->v_cache_dd == NULL) 1680 return; 1681 len = cnp->cn_namelen; 1682 cache_celockstate_init(&cel); 1683 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 1684 cache_enter_lock_dd(&cel, dvp, vp, hash); 1685 ncp = dvp->v_cache_dd; 1686 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) { 1687 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent")); 1688 cache_zap_locked(ncp, false); 1689 } else { 1690 ncp = NULL; 1691 } 1692 dvp->v_cache_dd = NULL; 1693 cache_enter_unlock(&cel); 1694 cache_free(ncp); 1695 } 1696 1697 /* 1698 * Add an entry to the cache. 1699 */ 1700 void 1701 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, 1702 struct timespec *tsp, struct timespec *dtsp) 1703 { 1704 struct celockstate cel; 1705 struct namecache *ncp, *n2, *ndd; 1706 struct namecache_ts *ncp_ts, *n2_ts; 1707 struct nchashhead *ncpp; 1708 uint32_t hash; 1709 int flag; 1710 int len; 1711 u_long lnumcache; 1712 1713 CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr); 1714 VNASSERT(vp == NULL || !VN_IS_DOOMED(vp), vp, 1715 ("cache_enter: Adding a doomed vnode")); 1716 VNASSERT(dvp == NULL || !VN_IS_DOOMED(dvp), dvp, 1717 ("cache_enter: Doomed vnode used as src")); 1718 1719 #ifdef DEBUG_CACHE 1720 if (__predict_false(!doingcache)) 1721 return; 1722 #endif 1723 1724 flag = 0; 1725 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 1726 if (cnp->cn_namelen == 1) 1727 return; 1728 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { 1729 cache_enter_dotdot_prep(dvp, vp, cnp); 1730 flag = NCF_ISDOTDOT; 1731 } 1732 } 1733 1734 /* 1735 * Avoid blowout in namecache entries. 1736 */ 1737 lnumcache = atomic_fetchadd_long(&numcache, 1) + 1; 1738 if (__predict_false(lnumcache >= ncsize)) { 1739 atomic_add_long(&numcache, -1); 1740 return; 1741 } 1742 1743 cache_celockstate_init(&cel); 1744 ndd = NULL; 1745 ncp_ts = NULL; 1746 1747 /* 1748 * Calculate the hash key and setup as much of the new 1749 * namecache entry as possible before acquiring the lock. 1750 */ 1751 ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); 1752 ncp->nc_flag = flag; 1753 ncp->nc_vp = vp; 1754 if (vp == NULL) 1755 ncp->nc_flag |= NCF_NEGATIVE; 1756 ncp->nc_dvp = dvp; 1757 if (tsp != NULL) { 1758 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 1759 ncp_ts->nc_time = *tsp; 1760 ncp_ts->nc_ticks = ticks; 1761 ncp_ts->nc_nc.nc_flag |= NCF_TS; 1762 if (dtsp != NULL) { 1763 ncp_ts->nc_dotdottime = *dtsp; 1764 ncp_ts->nc_nc.nc_flag |= NCF_DTS; 1765 } 1766 } 1767 len = ncp->nc_nlen = cnp->cn_namelen; 1768 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 1769 strlcpy(ncp->nc_name, cnp->cn_nameptr, len + 1); 1770 cache_enter_lock(&cel, dvp, vp, hash); 1771 1772 /* 1773 * See if this vnode or negative entry is already in the cache 1774 * with this name. This can happen with concurrent lookups of 1775 * the same path name. 1776 */ 1777 ncpp = NCHHASH(hash); 1778 LIST_FOREACH(n2, ncpp, nc_hash) { 1779 if (n2->nc_dvp == dvp && 1780 n2->nc_nlen == cnp->cn_namelen && 1781 !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) { 1782 if (tsp != NULL) { 1783 KASSERT((n2->nc_flag & NCF_TS) != 0, 1784 ("no NCF_TS")); 1785 n2_ts = __containerof(n2, struct namecache_ts, nc_nc); 1786 n2_ts->nc_time = ncp_ts->nc_time; 1787 n2_ts->nc_ticks = ncp_ts->nc_ticks; 1788 if (dtsp != NULL) { 1789 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; 1790 if (ncp->nc_flag & NCF_NEGATIVE) 1791 mtx_lock(&ncneg_hot.nl_lock); 1792 n2_ts->nc_nc.nc_flag |= NCF_DTS; 1793 if (ncp->nc_flag & NCF_NEGATIVE) 1794 mtx_unlock(&ncneg_hot.nl_lock); 1795 } 1796 } 1797 goto out_unlock_free; 1798 } 1799 } 1800 1801 if (flag == NCF_ISDOTDOT) { 1802 /* 1803 * See if we are trying to add .. entry, but some other lookup 1804 * has populated v_cache_dd pointer already. 1805 */ 1806 if (dvp->v_cache_dd != NULL) 1807 goto out_unlock_free; 1808 KASSERT(vp == NULL || vp->v_type == VDIR, 1809 ("wrong vnode type %p", vp)); 1810 dvp->v_cache_dd = ncp; 1811 } 1812 1813 if (vp != NULL) { 1814 if (vp->v_type == VDIR) { 1815 if (flag != NCF_ISDOTDOT) { 1816 /* 1817 * For this case, the cache entry maps both the 1818 * directory name in it and the name ".." for the 1819 * directory's parent. 1820 */ 1821 if ((ndd = vp->v_cache_dd) != NULL) { 1822 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) 1823 cache_zap_locked(ndd, false); 1824 else 1825 ndd = NULL; 1826 } 1827 vp->v_cache_dd = ncp; 1828 } 1829 } else { 1830 vp->v_cache_dd = NULL; 1831 } 1832 } 1833 1834 if (flag != NCF_ISDOTDOT) { 1835 if (LIST_EMPTY(&dvp->v_cache_src)) { 1836 vhold(dvp); 1837 counter_u64_add(numcachehv, 1); 1838 } 1839 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); 1840 } 1841 1842 /* 1843 * Insert the new namecache entry into the appropriate chain 1844 * within the cache entries table. 1845 */ 1846 LIST_INSERT_HEAD(ncpp, ncp, nc_hash); 1847 1848 /* 1849 * If the entry is "negative", we place it into the 1850 * "negative" cache queue, otherwise, we place it into the 1851 * destination vnode's cache entries queue. 1852 */ 1853 if (vp != NULL) { 1854 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); 1855 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, 1856 vp); 1857 } else { 1858 if (cnp->cn_flags & ISWHITEOUT) 1859 ncp->nc_flag |= NCF_WHITE; 1860 cache_negative_insert(ncp, false); 1861 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, 1862 ncp->nc_name); 1863 } 1864 cache_enter_unlock(&cel); 1865 if (numneg * ncnegfactor > lnumcache) 1866 cache_negative_zap_one(); 1867 cache_free(ndd); 1868 return; 1869 out_unlock_free: 1870 cache_enter_unlock(&cel); 1871 cache_free(ncp); 1872 return; 1873 } 1874 1875 static u_int 1876 cache_roundup_2(u_int val) 1877 { 1878 u_int res; 1879 1880 for (res = 1; res <= val; res <<= 1) 1881 continue; 1882 1883 return (res); 1884 } 1885 1886 /* 1887 * Name cache initialization, from vfs_init() when we are booting 1888 */ 1889 static void 1890 nchinit(void *dummy __unused) 1891 { 1892 u_int i; 1893 1894 cache_zone_small = uma_zcreate("S VFS Cache", 1895 sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1, 1896 NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache), 1897 UMA_ZONE_ZINIT); 1898 cache_zone_small_ts = uma_zcreate("STS VFS Cache", 1899 sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1, 1900 NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts), 1901 UMA_ZONE_ZINIT); 1902 cache_zone_large = uma_zcreate("L VFS Cache", 1903 sizeof(struct namecache) + NAME_MAX + 1, 1904 NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache), 1905 UMA_ZONE_ZINIT); 1906 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", 1907 sizeof(struct namecache_ts) + NAME_MAX + 1, 1908 NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts), 1909 UMA_ZONE_ZINIT); 1910 1911 ncsize = desiredvnodes * ncsizefactor; 1912 nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash); 1913 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; 1914 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ 1915 ncbuckethash = 7; 1916 if (ncbuckethash > nchash) 1917 ncbuckethash = nchash; 1918 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, 1919 M_WAITOK | M_ZERO); 1920 for (i = 0; i < numbucketlocks; i++) 1921 rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE); 1922 ncvnodehash = ncbuckethash; 1923 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, 1924 M_WAITOK | M_ZERO); 1925 for (i = 0; i < numvnodelocks; i++) 1926 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); 1927 ncpurgeminvnodes = numbucketlocks * 2; 1928 1929 ncneghash = 3; 1930 neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE, 1931 M_WAITOK | M_ZERO); 1932 for (i = 0; i < numneglists; i++) { 1933 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); 1934 TAILQ_INIT(&neglists[i].nl_list); 1935 } 1936 mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF); 1937 TAILQ_INIT(&ncneg_hot.nl_list); 1938 1939 mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF); 1940 } 1941 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); 1942 1943 void 1944 cache_changesize(u_long newmaxvnodes) 1945 { 1946 struct nchashhead *new_nchashtbl, *old_nchashtbl; 1947 u_long new_nchash, old_nchash; 1948 struct namecache *ncp; 1949 uint32_t hash; 1950 u_long newncsize; 1951 int i; 1952 1953 newncsize = newmaxvnodes * ncsizefactor; 1954 newmaxvnodes = cache_roundup_2(newmaxvnodes * 2); 1955 if (newmaxvnodes < numbucketlocks) 1956 newmaxvnodes = numbucketlocks; 1957 1958 new_nchashtbl = hashinit(newmaxvnodes, M_VFSCACHE, &new_nchash); 1959 /* If same hash table size, nothing to do */ 1960 if (nchash == new_nchash) { 1961 free(new_nchashtbl, M_VFSCACHE); 1962 return; 1963 } 1964 /* 1965 * Move everything from the old hash table to the new table. 1966 * None of the namecache entries in the table can be removed 1967 * because to do so, they have to be removed from the hash table. 1968 */ 1969 cache_lock_all_vnodes(); 1970 cache_lock_all_buckets(); 1971 old_nchashtbl = nchashtbl; 1972 old_nchash = nchash; 1973 nchashtbl = new_nchashtbl; 1974 nchash = new_nchash; 1975 for (i = 0; i <= old_nchash; i++) { 1976 while ((ncp = LIST_FIRST(&old_nchashtbl[i])) != NULL) { 1977 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, 1978 ncp->nc_dvp); 1979 LIST_REMOVE(ncp, nc_hash); 1980 LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); 1981 } 1982 } 1983 ncsize = newncsize; 1984 cache_unlock_all_buckets(); 1985 cache_unlock_all_vnodes(); 1986 free(old_nchashtbl, M_VFSCACHE); 1987 } 1988 1989 /* 1990 * Invalidate all entries from and to a particular vnode. 1991 */ 1992 void 1993 cache_purge(struct vnode *vp) 1994 { 1995 TAILQ_HEAD(, namecache) ncps; 1996 struct namecache *ncp, *nnp; 1997 struct mtx *vlp, *vlp2; 1998 1999 CTR1(KTR_VFS, "cache_purge(%p)", vp); 2000 SDT_PROBE1(vfs, namecache, purge, done, vp); 2001 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2002 vp->v_cache_dd == NULL) 2003 return; 2004 TAILQ_INIT(&ncps); 2005 vlp = VP2VNODELOCK(vp); 2006 vlp2 = NULL; 2007 mtx_lock(vlp); 2008 retry: 2009 while (!LIST_EMPTY(&vp->v_cache_src)) { 2010 ncp = LIST_FIRST(&vp->v_cache_src); 2011 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2012 goto retry; 2013 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2014 } 2015 while (!TAILQ_EMPTY(&vp->v_cache_dst)) { 2016 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2017 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2018 goto retry; 2019 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2020 } 2021 ncp = vp->v_cache_dd; 2022 if (ncp != NULL) { 2023 KASSERT(ncp->nc_flag & NCF_ISDOTDOT, 2024 ("lost dotdot link")); 2025 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2026 goto retry; 2027 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2028 } 2029 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); 2030 mtx_unlock(vlp); 2031 if (vlp2 != NULL) 2032 mtx_unlock(vlp2); 2033 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2034 cache_free(ncp); 2035 } 2036 } 2037 2038 /* 2039 * Invalidate all negative entries for a particular directory vnode. 2040 */ 2041 void 2042 cache_purge_negative(struct vnode *vp) 2043 { 2044 TAILQ_HEAD(, namecache) ncps; 2045 struct namecache *ncp, *nnp; 2046 struct mtx *vlp; 2047 2048 CTR1(KTR_VFS, "cache_purge_negative(%p)", vp); 2049 SDT_PROBE1(vfs, namecache, purge_negative, done, vp); 2050 if (LIST_EMPTY(&vp->v_cache_src)) 2051 return; 2052 TAILQ_INIT(&ncps); 2053 vlp = VP2VNODELOCK(vp); 2054 mtx_lock(vlp); 2055 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { 2056 if (!(ncp->nc_flag & NCF_NEGATIVE)) 2057 continue; 2058 cache_zap_negative_locked_vnode_kl(ncp, vp); 2059 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2060 } 2061 mtx_unlock(vlp); 2062 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2063 cache_free(ncp); 2064 } 2065 } 2066 2067 /* 2068 * Flush all entries referencing a particular filesystem. 2069 */ 2070 void 2071 cache_purgevfs(struct mount *mp, bool force) 2072 { 2073 TAILQ_HEAD(, namecache) ncps; 2074 struct mtx *vlp1, *vlp2; 2075 struct rwlock *blp; 2076 struct nchashhead *bucket; 2077 struct namecache *ncp, *nnp; 2078 u_long i, j, n_nchash; 2079 int error; 2080 2081 /* Scan hash tables for applicable entries */ 2082 SDT_PROBE1(vfs, namecache, purgevfs, done, mp); 2083 if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes) 2084 return; 2085 TAILQ_INIT(&ncps); 2086 n_nchash = nchash + 1; 2087 vlp1 = vlp2 = NULL; 2088 for (i = 0; i < numbucketlocks; i++) { 2089 blp = (struct rwlock *)&bucketlocks[i]; 2090 rw_wlock(blp); 2091 for (j = i; j < n_nchash; j += numbucketlocks) { 2092 retry: 2093 bucket = &nchashtbl[j]; 2094 LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) { 2095 cache_assert_bucket_locked(ncp, RA_WLOCKED); 2096 if (ncp->nc_dvp->v_mount != mp) 2097 continue; 2098 error = cache_zap_wlocked_bucket_kl(ncp, blp, 2099 &vlp1, &vlp2); 2100 if (error != 0) 2101 goto retry; 2102 TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst); 2103 } 2104 } 2105 rw_wunlock(blp); 2106 if (vlp1 == NULL && vlp2 == NULL) 2107 cache_maybe_yield(); 2108 } 2109 if (vlp1 != NULL) 2110 mtx_unlock(vlp1); 2111 if (vlp2 != NULL) 2112 mtx_unlock(vlp2); 2113 2114 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2115 cache_free(ncp); 2116 } 2117 } 2118 2119 /* 2120 * Perform canonical checks and cache lookup and pass on to filesystem 2121 * through the vop_cachedlookup only if needed. 2122 */ 2123 2124 int 2125 vfs_cache_lookup(struct vop_lookup_args *ap) 2126 { 2127 struct vnode *dvp; 2128 int error; 2129 struct vnode **vpp = ap->a_vpp; 2130 struct componentname *cnp = ap->a_cnp; 2131 int flags = cnp->cn_flags; 2132 2133 *vpp = NULL; 2134 dvp = ap->a_dvp; 2135 2136 if (dvp->v_type != VDIR) 2137 return (ENOTDIR); 2138 2139 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && 2140 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) 2141 return (EROFS); 2142 2143 error = vn_dir_check_exec(dvp, cnp); 2144 if (error != 0) 2145 return (error); 2146 2147 error = cache_lookup(dvp, vpp, cnp, NULL, NULL); 2148 if (error == 0) 2149 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); 2150 if (error == -1) 2151 return (0); 2152 return (error); 2153 } 2154 2155 /* Implementation of the getcwd syscall. */ 2156 int 2157 sys___getcwd(struct thread *td, struct __getcwd_args *uap) 2158 { 2159 char *buf, *retbuf; 2160 size_t buflen; 2161 int error; 2162 2163 buflen = uap->buflen; 2164 if (__predict_false(buflen < 2)) 2165 return (EINVAL); 2166 if (buflen > MAXPATHLEN) 2167 buflen = MAXPATHLEN; 2168 2169 buf = malloc(buflen, M_TEMP, M_WAITOK); 2170 error = vn_getcwd(td, buf, &retbuf, &buflen); 2171 if (error == 0) 2172 error = copyout(retbuf, uap->buf, buflen); 2173 free(buf, M_TEMP); 2174 return (error); 2175 } 2176 2177 int 2178 vn_getcwd(struct thread *td, char *buf, char **retbuf, size_t *buflen) 2179 { 2180 struct pwd *pwd; 2181 int error; 2182 2183 pwd = pwd_hold(td); 2184 error = vn_fullpath_any(td, pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, buflen); 2185 pwd_drop(pwd); 2186 2187 #ifdef KTRACE 2188 if (KTRPOINT(curthread, KTR_NAMEI) && error == 0) 2189 ktrnamei(*retbuf); 2190 #endif 2191 return (error); 2192 } 2193 2194 static int 2195 kern___realpathat(struct thread *td, int fd, const char *path, char *buf, 2196 size_t size, int flags, enum uio_seg pathseg) 2197 { 2198 struct nameidata nd; 2199 char *retbuf, *freebuf; 2200 int error; 2201 2202 if (flags != 0) 2203 return (EINVAL); 2204 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1, 2205 pathseg, path, fd, &cap_fstat_rights, td); 2206 if ((error = namei(&nd)) != 0) 2207 return (error); 2208 error = vn_fullpath_hardlink(td, &nd, &retbuf, &freebuf, &size); 2209 if (error == 0) { 2210 error = copyout(retbuf, buf, size); 2211 free(freebuf, M_TEMP); 2212 } 2213 NDFREE(&nd, 0); 2214 return (error); 2215 } 2216 2217 int 2218 sys___realpathat(struct thread *td, struct __realpathat_args *uap) 2219 { 2220 2221 return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size, 2222 uap->flags, UIO_USERSPACE)); 2223 } 2224 2225 /* 2226 * Retrieve the full filesystem path that correspond to a vnode from the name 2227 * cache (if available) 2228 */ 2229 int 2230 vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf) 2231 { 2232 struct pwd *pwd; 2233 char *buf; 2234 size_t buflen; 2235 int error; 2236 2237 if (__predict_false(vn == NULL)) 2238 return (EINVAL); 2239 2240 buflen = MAXPATHLEN; 2241 buf = malloc(buflen, M_TEMP, M_WAITOK); 2242 pwd = pwd_hold(td); 2243 error = vn_fullpath_any(td, vn, pwd->pwd_rdir, buf, retbuf, &buflen); 2244 pwd_drop(pwd); 2245 2246 if (!error) 2247 *freebuf = buf; 2248 else 2249 free(buf, M_TEMP); 2250 return (error); 2251 } 2252 2253 /* 2254 * This function is similar to vn_fullpath, but it attempts to lookup the 2255 * pathname relative to the global root mount point. This is required for the 2256 * auditing sub-system, as audited pathnames must be absolute, relative to the 2257 * global root mount point. 2258 */ 2259 int 2260 vn_fullpath_global(struct thread *td, struct vnode *vn, 2261 char **retbuf, char **freebuf) 2262 { 2263 char *buf; 2264 size_t buflen; 2265 int error; 2266 2267 if (__predict_false(vn == NULL)) 2268 return (EINVAL); 2269 buflen = MAXPATHLEN; 2270 buf = malloc(buflen, M_TEMP, M_WAITOK); 2271 error = vn_fullpath_any(td, vn, rootvnode, buf, retbuf, &buflen); 2272 if (!error) 2273 *freebuf = buf; 2274 else 2275 free(buf, M_TEMP); 2276 return (error); 2277 } 2278 2279 int 2280 vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, size_t *buflen) 2281 { 2282 struct vnode *dvp; 2283 struct namecache *ncp; 2284 struct mtx *vlp; 2285 int error; 2286 2287 vlp = VP2VNODELOCK(*vp); 2288 mtx_lock(vlp); 2289 TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) { 2290 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2291 break; 2292 } 2293 if (ncp != NULL) { 2294 if (*buflen < ncp->nc_nlen) { 2295 mtx_unlock(vlp); 2296 vrele(*vp); 2297 counter_u64_add(numfullpathfail4, 1); 2298 error = ENOMEM; 2299 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2300 vp, NULL); 2301 return (error); 2302 } 2303 *buflen -= ncp->nc_nlen; 2304 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 2305 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, 2306 ncp->nc_name, vp); 2307 dvp = *vp; 2308 *vp = ncp->nc_dvp; 2309 vref(*vp); 2310 mtx_unlock(vlp); 2311 vrele(dvp); 2312 return (0); 2313 } 2314 SDT_PROBE1(vfs, namecache, fullpath, miss, vp); 2315 2316 mtx_unlock(vlp); 2317 vn_lock(*vp, LK_SHARED | LK_RETRY); 2318 error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen); 2319 vput(*vp); 2320 if (error) { 2321 counter_u64_add(numfullpathfail2, 1); 2322 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2323 return (error); 2324 } 2325 2326 *vp = dvp; 2327 if (VN_IS_DOOMED(dvp)) { 2328 /* forced unmount */ 2329 vrele(dvp); 2330 error = ENOENT; 2331 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2332 return (error); 2333 } 2334 /* 2335 * *vp has its use count incremented still. 2336 */ 2337 2338 return (0); 2339 } 2340 2341 /* 2342 * Resolve a directory to a pathname. 2343 * 2344 * The name of the directory can always be found in the namecache or fetched 2345 * from the filesystem. There is also guaranteed to be only one parent, meaning 2346 * we can just follow vnodes up until we find the root. 2347 * 2348 * The vnode must be referenced. 2349 */ 2350 static int 2351 vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir, 2352 char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend) 2353 { 2354 #ifdef KDTRACE_HOOKS 2355 struct vnode *startvp = vp; 2356 #endif 2357 struct vnode *vp1; 2358 size_t buflen; 2359 int error; 2360 2361 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); 2362 VNPASS(vp->v_usecount > 0, vp); 2363 2364 buflen = *len; 2365 2366 if (!slash_prefixed) { 2367 MPASS(*len >= 2); 2368 buflen--; 2369 buf[buflen] = '\0'; 2370 } 2371 2372 error = 0; 2373 2374 SDT_PROBE1(vfs, namecache, fullpath, entry, vp); 2375 counter_u64_add(numfullpathcalls, 1); 2376 while (vp != rdir && vp != rootvnode) { 2377 /* 2378 * The vp vnode must be already fully constructed, 2379 * since it is either found in namecache or obtained 2380 * from VOP_VPTOCNP(). We may test for VV_ROOT safely 2381 * without obtaining the vnode lock. 2382 */ 2383 if ((vp->v_vflag & VV_ROOT) != 0) { 2384 vn_lock(vp, LK_RETRY | LK_SHARED); 2385 2386 /* 2387 * With the vnode locked, check for races with 2388 * unmount, forced or not. Note that we 2389 * already verified that vp is not equal to 2390 * the root vnode, which means that 2391 * mnt_vnodecovered can be NULL only for the 2392 * case of unmount. 2393 */ 2394 if (VN_IS_DOOMED(vp) || 2395 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || 2396 vp1->v_mountedhere != vp->v_mount) { 2397 vput(vp); 2398 error = ENOENT; 2399 SDT_PROBE3(vfs, namecache, fullpath, return, 2400 error, vp, NULL); 2401 break; 2402 } 2403 2404 vref(vp1); 2405 vput(vp); 2406 vp = vp1; 2407 continue; 2408 } 2409 if (vp->v_type != VDIR) { 2410 vrele(vp); 2411 counter_u64_add(numfullpathfail1, 1); 2412 error = ENOTDIR; 2413 SDT_PROBE3(vfs, namecache, fullpath, return, 2414 error, vp, NULL); 2415 break; 2416 } 2417 error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen); 2418 if (error) 2419 break; 2420 if (buflen == 0) { 2421 vrele(vp); 2422 error = ENOMEM; 2423 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2424 startvp, NULL); 2425 break; 2426 } 2427 buf[--buflen] = '/'; 2428 slash_prefixed = true; 2429 } 2430 if (error) 2431 return (error); 2432 if (!slash_prefixed) { 2433 if (buflen == 0) { 2434 vrele(vp); 2435 counter_u64_add(numfullpathfail4, 1); 2436 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, 2437 startvp, NULL); 2438 return (ENOMEM); 2439 } 2440 buf[--buflen] = '/'; 2441 } 2442 counter_u64_add(numfullpathfound, 1); 2443 vrele(vp); 2444 2445 *retbuf = buf + buflen; 2446 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf); 2447 *len -= buflen; 2448 *len += addend; 2449 return (0); 2450 } 2451 2452 /* 2453 * Resolve an arbitrary vnode to a pathname. 2454 * 2455 * Note 2 caveats: 2456 * - hardlinks are not tracked, thus if the vnode is not a directory this can 2457 * resolve to a different path than the one used to find it 2458 * - namecache is not mandatory, meaning names are not guaranteed to be added 2459 * (in which case resolving fails) 2460 */ 2461 static int 2462 vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir, 2463 char *buf, char **retbuf, size_t *buflen) 2464 { 2465 size_t orig_buflen; 2466 bool slash_prefixed; 2467 int error; 2468 2469 if (*buflen < 2) 2470 return (EINVAL); 2471 2472 orig_buflen = *buflen; 2473 2474 vref(vp); 2475 slash_prefixed = false; 2476 if (vp->v_type != VDIR) { 2477 *buflen -= 1; 2478 buf[*buflen] = '\0'; 2479 error = vn_vptocnp(&vp, td->td_ucred, buf, buflen); 2480 if (error) 2481 return (error); 2482 if (*buflen == 0) { 2483 vrele(vp); 2484 return (ENOMEM); 2485 } 2486 *buflen -= 1; 2487 buf[*buflen] = '/'; 2488 slash_prefixed = true; 2489 } 2490 2491 return (vn_fullpath_dir(td, vp, rdir, buf, retbuf, buflen, slash_prefixed, 2492 orig_buflen - *buflen)); 2493 } 2494 2495 /* 2496 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks). 2497 * 2498 * Since the namecache does not track handlings, the caller is expected to first 2499 * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei. 2500 * 2501 * Then we have 2 cases: 2502 * - if the found vnode is a directory, the path can be constructed just by 2503 * fullowing names up the chain 2504 * - otherwise we populate the buffer with the saved name and start resolving 2505 * from the parent 2506 */ 2507 static int 2508 vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf, 2509 char **freebuf, size_t *buflen) 2510 { 2511 char *buf, *tmpbuf; 2512 struct pwd *pwd; 2513 struct componentname *cnp; 2514 struct vnode *vp; 2515 size_t addend; 2516 int error; 2517 bool slash_prefixed; 2518 2519 if (*buflen < 2) 2520 return (EINVAL); 2521 if (*buflen > MAXPATHLEN) 2522 *buflen = MAXPATHLEN; 2523 2524 slash_prefixed = false; 2525 2526 buf = malloc(*buflen, M_TEMP, M_WAITOK); 2527 pwd = pwd_hold(td); 2528 2529 addend = 0; 2530 vp = ndp->ni_vp; 2531 if (vp->v_type != VDIR) { 2532 cnp = &ndp->ni_cnd; 2533 addend = cnp->cn_namelen + 2; 2534 if (*buflen < addend) { 2535 error = ENOMEM; 2536 goto out_bad; 2537 } 2538 *buflen -= addend; 2539 tmpbuf = buf + *buflen; 2540 tmpbuf[0] = '/'; 2541 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen); 2542 tmpbuf[addend - 1] = '\0'; 2543 slash_prefixed = true; 2544 vp = ndp->ni_dvp; 2545 } 2546 2547 vref(vp); 2548 error = vn_fullpath_dir(td, vp, pwd->pwd_rdir, buf, retbuf, buflen, 2549 slash_prefixed, addend); 2550 if (error != 0) 2551 goto out_bad; 2552 2553 pwd_drop(pwd); 2554 *freebuf = buf; 2555 2556 return (0); 2557 out_bad: 2558 pwd_drop(pwd); 2559 free(buf, M_TEMP); 2560 return (error); 2561 } 2562 2563 struct vnode * 2564 vn_dir_dd_ino(struct vnode *vp) 2565 { 2566 struct namecache *ncp; 2567 struct vnode *ddvp; 2568 struct mtx *vlp; 2569 enum vgetstate vs; 2570 2571 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); 2572 vlp = VP2VNODELOCK(vp); 2573 mtx_lock(vlp); 2574 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { 2575 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) 2576 continue; 2577 ddvp = ncp->nc_dvp; 2578 vs = vget_prep(ddvp); 2579 mtx_unlock(vlp); 2580 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs)) 2581 return (NULL); 2582 return (ddvp); 2583 } 2584 mtx_unlock(vlp); 2585 return (NULL); 2586 } 2587 2588 int 2589 vn_commname(struct vnode *vp, char *buf, u_int buflen) 2590 { 2591 struct namecache *ncp; 2592 struct mtx *vlp; 2593 int l; 2594 2595 vlp = VP2VNODELOCK(vp); 2596 mtx_lock(vlp); 2597 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) 2598 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2599 break; 2600 if (ncp == NULL) { 2601 mtx_unlock(vlp); 2602 return (ENOENT); 2603 } 2604 l = min(ncp->nc_nlen, buflen - 1); 2605 memcpy(buf, ncp->nc_name, l); 2606 mtx_unlock(vlp); 2607 buf[l] = '\0'; 2608 return (0); 2609 } 2610 2611 /* 2612 * This function updates path string to vnode's full global path 2613 * and checks the size of the new path string against the pathlen argument. 2614 * 2615 * Requires a locked, referenced vnode. 2616 * Vnode is re-locked on success or ENODEV, otherwise unlocked. 2617 * 2618 * If vp is a directory, the call to vn_fullpath_global() always succeeds 2619 * because it falls back to the ".." lookup if the namecache lookup fails. 2620 */ 2621 int 2622 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, 2623 u_int pathlen) 2624 { 2625 struct nameidata nd; 2626 struct vnode *vp1; 2627 char *rpath, *fbuf; 2628 int error; 2629 2630 ASSERT_VOP_ELOCKED(vp, __func__); 2631 2632 /* Construct global filesystem path from vp. */ 2633 VOP_UNLOCK(vp); 2634 error = vn_fullpath_global(td, vp, &rpath, &fbuf); 2635 2636 if (error != 0) { 2637 vrele(vp); 2638 return (error); 2639 } 2640 2641 if (strlen(rpath) >= pathlen) { 2642 vrele(vp); 2643 error = ENAMETOOLONG; 2644 goto out; 2645 } 2646 2647 /* 2648 * Re-lookup the vnode by path to detect a possible rename. 2649 * As a side effect, the vnode is relocked. 2650 * If vnode was renamed, return ENOENT. 2651 */ 2652 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, 2653 UIO_SYSSPACE, path, td); 2654 error = namei(&nd); 2655 if (error != 0) { 2656 vrele(vp); 2657 goto out; 2658 } 2659 NDFREE(&nd, NDF_ONLY_PNBUF); 2660 vp1 = nd.ni_vp; 2661 vrele(vp); 2662 if (vp1 == vp) 2663 strcpy(path, rpath); 2664 else { 2665 vput(vp1); 2666 error = ENOENT; 2667 } 2668 2669 out: 2670 free(fbuf, M_TEMP); 2671 return (error); 2672 } 2673 2674 #ifdef DDB 2675 static void 2676 db_print_vpath(struct vnode *vp) 2677 { 2678 2679 while (vp != NULL) { 2680 db_printf("%p: ", vp); 2681 if (vp == rootvnode) { 2682 db_printf("/"); 2683 vp = NULL; 2684 } else { 2685 if (vp->v_vflag & VV_ROOT) { 2686 db_printf("<mount point>"); 2687 vp = vp->v_mount->mnt_vnodecovered; 2688 } else { 2689 struct namecache *ncp; 2690 char *ncn; 2691 int i; 2692 2693 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2694 if (ncp != NULL) { 2695 ncn = ncp->nc_name; 2696 for (i = 0; i < ncp->nc_nlen; i++) 2697 db_printf("%c", *ncn++); 2698 vp = ncp->nc_dvp; 2699 } else { 2700 vp = NULL; 2701 } 2702 } 2703 } 2704 db_printf("\n"); 2705 } 2706 2707 return; 2708 } 2709 2710 DB_SHOW_COMMAND(vpath, db_show_vpath) 2711 { 2712 struct vnode *vp; 2713 2714 if (!have_addr) { 2715 db_printf("usage: show vpath <struct vnode *>\n"); 2716 return; 2717 } 2718 2719 vp = (struct vnode *)addr; 2720 db_print_vpath(vp); 2721 } 2722 2723 #endif 2724