/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Poul-Henning Kamp of the FreeBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/counter.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sdt.h>
#include <sys/smr.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vnode.h>
#include <ck_queue.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#ifdef DDB
#include <ddb/ddb.h>
#endif

#include <vm/uma.h>

SDT_PROVIDER_DECLARE(vfs);
SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
    "char *", "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
    "struct vnode *", "char *");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
    "struct vnode *", "char *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
    "char *");

/*
 * This structure describes the elements in the cache of recent
 * names looked up by namei.
 */

struct namecache {
	CK_LIST_ENTRY(namecache) nc_hash;/* hash chain */
	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
	struct	vnode *nc_dvp;		/* vnode of parent of name */
	union {
		struct	vnode *nu_vp;	/* vnode the name refers to */
	} n_un;
	u_char	nc_flag;		/* flag bits */
	u_char	nc_nlen;		/* length of name */
	char	nc_name[0];		/* segment name + nul */
};

/*
 * struct namecache_ts repeats struct namecache layout up to the
 * nc_nlen member.
 * struct namecache_ts is used in place of struct namecache when time(s) need
 * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
 * both a non-dotdot directory name plus dotdot for the directory's
 * parent.
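 *
 * Because the two layouts agree up to nc_nlen, code that only holds a
 * struct namecache pointer can recover the timestamped variant with
 * __containerof() once NCF_TS is seen, e.g. (sketch of the pattern used
 * by cache_out_ts() below):
 *
 *	struct namecache_ts *ncp_ts;
 *
 *	if ((ncp->nc_flag & NCF_TS) != 0)
 *		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);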
 */
struct	namecache_ts {
	struct	timespec nc_time;	/* timespec provided by fs */
	struct	timespec nc_dotdottime;	/* dotdot timespec provided by fs */
	int	nc_ticks;		/* ticks value when entry was added */
	struct namecache nc_nc;
};

#define	nc_vp		n_un.nu_vp

/*
 * Flags in namecache.nc_flag
 */
#define	NCF_WHITE	0x01
#define	NCF_ISDOTDOT	0x02
#define	NCF_TS		0x04
#define	NCF_DTS		0x08
#define	NCF_DVDROP	0x10
#define	NCF_NEGATIVE	0x20
#define	NCF_HOTNEGATIVE	0x40
#define	NCF_INVALID	0x80

/*
 * Mark an entry as invalid.
 *
 * This is called before it starts getting deconstructed.
 */
static void
cache_ncp_invalidate(struct namecache *ncp)
{

	KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
	    ("%s: entry %p already invalid", __func__, ncp));
	ncp->nc_flag |= NCF_INVALID;
	atomic_thread_fence_rel();
}

/*
 * Verify validity of an entry.
 *
 * All places which elide locks are supposed to call this after they are
 * done with reading from an entry.
 */
static bool
cache_ncp_invalid(struct namecache *ncp)
{

	atomic_thread_fence_acq();
	return ((ncp->nc_flag & NCF_INVALID) != 0);
}

/*
 * Name caching works as follows:
 *
 * Names found by directory scans are retained in a cache
 * for future reference.  It is managed LRU, so frequently
 * used names will hang around.  Cache is indexed by hash value
 * obtained from (dvp, name) where dvp refers to the directory
 * containing name.
 *
 * If it is a "negative" entry, (i.e. for a name that is known NOT to
 * exist) the vnode pointer will be NULL.
 *
 * Upon reaching the last segment of a path, if the reference
 * is for DELETE, or NOCACHE is set (rewrite), and the
 * name is located in the cache, it will be dropped.
 *
 * These locks are used (in the order in which they can be taken):
 * NAME		TYPE	ROLE
 * vnodelock	mtx	vnode lists and v_cache_dd field protection
 * bucketlock	rwlock	for access to given set of hash buckets
 * neglist	mtx	negative entry LRU management
 *
 * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
 * shrinking the LRU list.
 *
 * It is legal to take multiple vnodelock and bucketlock locks.  The locking
 * order is lower address first.  Both are recursive.
 *
 * "." lookups are lockless.
 *
 * ".." and vnode -> name lookups require vnodelock.
 *
 * name -> vnode lookup requires the relevant bucketlock to be held for reading.
 *
 * Insertions and removals of entries require involved vnodes and bucketlocks
 * to be write-locked to prevent other threads from seeing the entry.
 *
 * Some lookups result in removal of the found entry (e.g. getting rid of a
 * negative entry with the intent to create a positive one), which poses a
 * problem when multiple threads reach the state.  Similarly, two different
 * threads can purge two different vnodes and try to remove the same name.
 *
 * If the already held vnode lock is lower than the second required lock, we
 * can just take the other lock.  However, in the opposite case, this could
 * deadlock.  As such, this is resolved by trylocking and if that fails
 * unlocking the first node, locking everything in order and revalidating
 * the state.
 */

VFS_SMR_DECLARE;

/*
 * Structures associated with name caching.
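 *
 * The hash machinery below is used roughly as follows (a sketch mirroring
 * the read path of cache_lookup()): hash the (name, dvp) pair once and use
 * the value to pick both the chain head and the lock covering it:
 *
 *	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
 *	blp = HASH2BUCKETLOCK(hash);
 *	rw_rlock(blp);
 *	CK_LIST_FOREACH(ncp, NCHHASH(hash), nc_hash) {
 *		... compare nc_dvp, nc_nlen and nc_name ...
 *	}
 *	rw_runlock(blp);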
 */
#define	NCHHASH(hash) \
	(&nchashtbl[(hash) & nchash])
static __read_mostly CK_LIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
static u_long __read_mostly	nchash;			/* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
    "Size of namecache hash table");
static u_long __read_mostly	ncnegfactor = 5;	/* ratio of negative entries */
SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of negative namecache entries");
static u_long __exclusive_cache_line	numneg;	/* number of negative entries allocated */
static u_long __exclusive_cache_line	numcache;/* number of cache entries allocated */
u_int ncsizefactor = 2;
SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
    "Size factor for namecache");
static u_int __read_mostly	ncpurgeminvnodes;
SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0,
    "Number of vnodes below which purgevfs ignores the request");
static u_int __read_mostly	ncsize;	/* the size as computed on creation or resizing */

struct nchstats	nchstats;		/* cache effectiveness statistics */

static struct mtx __exclusive_cache_line	ncneg_shrink_lock;
static int	shrink_list_turn;

struct neglist {
	struct mtx		nl_lock;
	TAILQ_HEAD(, namecache) nl_list;
} __aligned(CACHE_LINE_SIZE);

static struct neglist __read_mostly	*neglists;
static struct neglist ncneg_hot;
static u_long numhotneg;

#define	numneglists (ncneghash + 1)
static u_int __read_mostly	ncneghash;
static inline struct neglist *
NCP2NEGLIST(struct namecache *ncp)
{

	return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
}

#define	numbucketlocks (ncbuckethash + 1)
static u_int __read_mostly	ncbuckethash;
static struct rwlock_padalign __read_mostly	*bucketlocks;
#define	HASH2BUCKETLOCK(hash) \
	((struct rwlock *)(&bucketlocks[((hash) & ncbuckethash)]))

#define	numvnodelocks (ncvnodehash + 1)
static u_int __read_mostly	ncvnodehash;
static struct mtx __read_mostly	*vnodelocks;
static inline struct mtx *
VP2VNODELOCK(struct vnode *vp)
{

	return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
}

/*
 * UMA zones for the VFS cache.
 *
 * The small cache is used for entries with short names, which are the
 * most common.  The large cache is used for entries which are too big to
 * fit in the small cache.
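 *
 * Allocation and free must agree on which zone an entry came from, so both
 * sides key off the same name-length cutoff (a sketch of the rule used by
 * cache_alloc() and cache_free() below):
 *
 *	zone = (len <= CACHE_PATH_CUTOFF) ? cache_zone_small : cache_zone_large;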
296 */ 297 static uma_zone_t __read_mostly cache_zone_small; 298 static uma_zone_t __read_mostly cache_zone_small_ts; 299 static uma_zone_t __read_mostly cache_zone_large; 300 static uma_zone_t __read_mostly cache_zone_large_ts; 301 302 #define CACHE_PATH_CUTOFF 35 303 304 static struct namecache * 305 cache_alloc(int len, int ts) 306 { 307 struct namecache_ts *ncp_ts; 308 struct namecache *ncp; 309 310 if (__predict_false(ts)) { 311 if (len <= CACHE_PATH_CUTOFF) 312 ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK); 313 else 314 ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK); 315 ncp = &ncp_ts->nc_nc; 316 } else { 317 if (len <= CACHE_PATH_CUTOFF) 318 ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK); 319 else 320 ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK); 321 } 322 return (ncp); 323 } 324 325 static void 326 cache_free(struct namecache *ncp) 327 { 328 struct namecache_ts *ncp_ts; 329 330 if (ncp == NULL) 331 return; 332 if ((ncp->nc_flag & NCF_DVDROP) != 0) 333 vdrop(ncp->nc_dvp); 334 if (__predict_false(ncp->nc_flag & NCF_TS)) { 335 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 336 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 337 uma_zfree_smr(cache_zone_small_ts, ncp_ts); 338 else 339 uma_zfree_smr(cache_zone_large_ts, ncp_ts); 340 } else { 341 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 342 uma_zfree_smr(cache_zone_small, ncp); 343 else 344 uma_zfree_smr(cache_zone_large, ncp); 345 } 346 } 347 348 static void 349 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp) 350 { 351 struct namecache_ts *ncp_ts; 352 353 KASSERT((ncp->nc_flag & NCF_TS) != 0 || 354 (tsp == NULL && ticksp == NULL), 355 ("No NCF_TS")); 356 357 if (tsp == NULL && ticksp == NULL) 358 return; 359 360 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 361 if (tsp != NULL) 362 *tsp = ncp_ts->nc_time; 363 if (ticksp != NULL) 364 *ticksp = ncp_ts->nc_ticks; 365 } 366 367 #ifdef DEBUG_CACHE 368 static int __read_mostly doingcache = 1; /* 1 => enable the cache */ 369 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, 370 "VFS namecache enabled"); 371 #endif 372 373 /* Export size information to userland */ 374 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 375 sizeof(struct namecache), "sizeof(struct namecache)"); 376 377 /* 378 * The new name cache statistics 379 */ 380 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 381 "Name cache statistics"); 382 #define STATNODE_ULONG(name, descr) \ 383 SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr); 384 #define STATNODE_COUNTER(name, descr) \ 385 static COUNTER_U64_DEFINE_EARLY(name); \ 386 SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, \ 387 descr); 388 STATNODE_ULONG(numneg, "Number of negative cache entries"); 389 STATNODE_ULONG(numcache, "Number of cache entries"); 390 STATNODE_COUNTER(numcachehv, "Number of namecache entries with vnodes held"); 391 STATNODE_COUNTER(numcalls, "Number of cache lookups"); 392 STATNODE_COUNTER(dothits, "Number of '.' hits"); 393 STATNODE_COUNTER(dotdothits, "Number of '..' 
hits"); 394 STATNODE_COUNTER(numchecks, "Number of checks in lookup"); 395 STATNODE_COUNTER(nummiss, "Number of cache misses"); 396 STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache"); 397 STATNODE_COUNTER(numposzaps, 398 "Number of cache hits (positive) we do not want to cache"); 399 STATNODE_COUNTER(numposhits, "Number of cache hits (positive)"); 400 STATNODE_COUNTER(numnegzaps, 401 "Number of cache hits (negative) we do not want to cache"); 402 STATNODE_COUNTER(numneghits, "Number of cache hits (negative)"); 403 /* These count for vn_getcwd(), too. */ 404 STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls"); 405 STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)"); 406 STATNODE_COUNTER(numfullpathfail2, 407 "Number of fullpath search errors (VOP_VPTOCNP failures)"); 408 STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); 409 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls"); 410 STATNODE_COUNTER(zap_and_exit_bucket_relock_success, 411 "Number of successful removals after relocking"); 412 static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail, 413 "Number of times zap_and_exit failed to lock"); 414 static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2, 415 "Number of times zap_and_exit failed to lock"); 416 static long cache_lock_vnodes_cel_3_failures; 417 STATNODE_ULONG(cache_lock_vnodes_cel_3_failures, 418 "Number of times 3-way vnode locking failed"); 419 STATNODE_ULONG(numhotneg, "Number of hot negative entries"); 420 STATNODE_COUNTER(numneg_evicted, 421 "Number of negative entries evicted when adding a new entry"); 422 STATNODE_COUNTER(shrinking_skipped, 423 "Number of times shrinking was already in progress"); 424 425 static void cache_zap_locked(struct namecache *ncp, bool neg_locked); 426 static int vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf, 427 char **freebuf, size_t *buflen); 428 static int vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir, 429 char *buf, char **retbuf, size_t *buflen); 430 static int vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir, 431 char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend); 432 433 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); 434 435 static int cache_yield; 436 SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0, 437 "Number of times cache called yield"); 438 439 static void __noinline 440 cache_maybe_yield(void) 441 { 442 443 if (should_yield()) { 444 cache_yield++; 445 kern_yield(PRI_USER); 446 } 447 } 448 449 static inline void 450 cache_assert_vlp_locked(struct mtx *vlp) 451 { 452 453 if (vlp != NULL) 454 mtx_assert(vlp, MA_OWNED); 455 } 456 457 static inline void 458 cache_assert_vnode_locked(struct vnode *vp) 459 { 460 struct mtx *vlp; 461 462 vlp = VP2VNODELOCK(vp); 463 cache_assert_vlp_locked(vlp); 464 } 465 466 static uint32_t 467 cache_get_hash(char *name, u_char len, struct vnode *dvp) 468 { 469 uint32_t hash; 470 471 hash = fnv_32_buf(name, len, FNV1_32_INIT); 472 hash = fnv_32_buf(&dvp, sizeof(dvp), hash); 473 return (hash); 474 } 475 476 static inline struct rwlock * 477 NCP2BUCKETLOCK(struct namecache *ncp) 478 { 479 uint32_t hash; 480 481 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); 482 return (HASH2BUCKETLOCK(hash)); 483 } 484 485 #ifdef INVARIANTS 486 static void 487 
cache_assert_bucket_locked(struct namecache *ncp, int mode) 488 { 489 struct rwlock *blp; 490 491 blp = NCP2BUCKETLOCK(ncp); 492 rw_assert(blp, mode); 493 } 494 #else 495 #define cache_assert_bucket_locked(x, y) do { } while (0) 496 #endif 497 498 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y)) 499 static void 500 _cache_sort_vnodes(void **p1, void **p2) 501 { 502 void *tmp; 503 504 MPASS(*p1 != NULL || *p2 != NULL); 505 506 if (*p1 > *p2) { 507 tmp = *p2; 508 *p2 = *p1; 509 *p1 = tmp; 510 } 511 } 512 513 static void 514 cache_lock_all_buckets(void) 515 { 516 u_int i; 517 518 for (i = 0; i < numbucketlocks; i++) 519 rw_wlock(&bucketlocks[i]); 520 } 521 522 static void 523 cache_unlock_all_buckets(void) 524 { 525 u_int i; 526 527 for (i = 0; i < numbucketlocks; i++) 528 rw_wunlock(&bucketlocks[i]); 529 } 530 531 static void 532 cache_lock_all_vnodes(void) 533 { 534 u_int i; 535 536 for (i = 0; i < numvnodelocks; i++) 537 mtx_lock(&vnodelocks[i]); 538 } 539 540 static void 541 cache_unlock_all_vnodes(void) 542 { 543 u_int i; 544 545 for (i = 0; i < numvnodelocks; i++) 546 mtx_unlock(&vnodelocks[i]); 547 } 548 549 static int 550 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 551 { 552 553 cache_sort_vnodes(&vlp1, &vlp2); 554 555 if (vlp1 != NULL) { 556 if (!mtx_trylock(vlp1)) 557 return (EAGAIN); 558 } 559 if (!mtx_trylock(vlp2)) { 560 if (vlp1 != NULL) 561 mtx_unlock(vlp1); 562 return (EAGAIN); 563 } 564 565 return (0); 566 } 567 568 static void 569 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 570 { 571 572 MPASS(vlp1 != NULL || vlp2 != NULL); 573 MPASS(vlp1 <= vlp2); 574 575 if (vlp1 != NULL) 576 mtx_lock(vlp1); 577 if (vlp2 != NULL) 578 mtx_lock(vlp2); 579 } 580 581 static void 582 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 583 { 584 585 MPASS(vlp1 != NULL || vlp2 != NULL); 586 587 if (vlp1 != NULL) 588 mtx_unlock(vlp1); 589 if (vlp2 != NULL) 590 mtx_unlock(vlp2); 591 } 592 593 static int 594 sysctl_nchstats(SYSCTL_HANDLER_ARGS) 595 { 596 struct nchstats snap; 597 598 if (req->oldptr == NULL) 599 return (SYSCTL_OUT(req, 0, sizeof(snap))); 600 601 snap = nchstats; 602 snap.ncs_goodhits = counter_u64_fetch(numposhits); 603 snap.ncs_neghits = counter_u64_fetch(numneghits); 604 snap.ncs_badhits = counter_u64_fetch(numposzaps) + 605 counter_u64_fetch(numnegzaps); 606 snap.ncs_miss = counter_u64_fetch(nummisszap) + 607 counter_u64_fetch(nummiss); 608 609 return (SYSCTL_OUT(req, &snap, sizeof(snap))); 610 } 611 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD | 612 CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU", 613 "VFS cache effectiveness statistics"); 614 615 #ifdef DIAGNOSTIC 616 /* 617 * Grab an atomic snapshot of the name cache hash chain lengths 618 */ 619 static SYSCTL_NODE(_debug, OID_AUTO, hashstat, 620 CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 621 "hash table stats"); 622 623 static int 624 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS) 625 { 626 struct nchashhead *ncpp; 627 struct namecache *ncp; 628 int i, error, n_nchash, *cntbuf; 629 630 retry: 631 n_nchash = nchash + 1; /* nchash is max index, not count */ 632 if (req->oldptr == NULL) 633 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int)); 634 cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK); 635 cache_lock_all_buckets(); 636 if (n_nchash != nchash + 1) { 637 cache_unlock_all_buckets(); 638 free(cntbuf, M_TEMP); 639 goto retry; 640 } 641 /* Scan hash tables counting entries */ 642 for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, 
i++) 643 CK_LIST_FOREACH(ncp, ncpp, nc_hash) 644 cntbuf[i]++; 645 cache_unlock_all_buckets(); 646 for (error = 0, i = 0; i < n_nchash; i++) 647 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0) 648 break; 649 free(cntbuf, M_TEMP); 650 return (error); 651 } 652 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD| 653 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int", 654 "nchash chain lengths"); 655 656 static int 657 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS) 658 { 659 int error; 660 struct nchashhead *ncpp; 661 struct namecache *ncp; 662 int n_nchash; 663 int count, maxlength, used, pct; 664 665 if (!req->oldptr) 666 return SYSCTL_OUT(req, 0, 4 * sizeof(int)); 667 668 cache_lock_all_buckets(); 669 n_nchash = nchash + 1; /* nchash is max index, not count */ 670 used = 0; 671 maxlength = 0; 672 673 /* Scan hash tables for applicable entries */ 674 for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { 675 count = 0; 676 CK_LIST_FOREACH(ncp, ncpp, nc_hash) { 677 count++; 678 } 679 if (count) 680 used++; 681 if (maxlength < count) 682 maxlength = count; 683 } 684 n_nchash = nchash + 1; 685 cache_unlock_all_buckets(); 686 pct = (used * 100) / (n_nchash / 100); 687 error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash)); 688 if (error) 689 return (error); 690 error = SYSCTL_OUT(req, &used, sizeof(used)); 691 if (error) 692 return (error); 693 error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength)); 694 if (error) 695 return (error); 696 error = SYSCTL_OUT(req, &pct, sizeof(pct)); 697 if (error) 698 return (error); 699 return (0); 700 } 701 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD| 702 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I", 703 "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)"); 704 #endif 705 706 /* 707 * Negative entries management 708 * 709 * A variation of LRU scheme is used. New entries are hashed into one of 710 * numneglists cold lists. Entries get promoted to the hot list on first hit. 711 * 712 * The shrinker will demote hot list head and evict from the cold list in a 713 * round-robin manner. 
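 *
 * Promotion takes the hot list lock first and the entry's cold list lock
 * second, then rechecks NCF_HOTNEGATIVE under both locks (a sketch of the
 * order used by cache_negative_hit() below); the recheck is what makes
 * racing promoters harmless:
 *
 *	mtx_lock(&ncneg_hot.nl_lock);
 *	mtx_lock(&neglist->nl_lock);
 *	if ((ncp->nc_flag & NCF_HOTNEGATIVE) == 0) {
 *		... move ncp from neglist to ncneg_hot and set the flag ...
 *	}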
714 */ 715 static void 716 cache_negative_hit(struct namecache *ncp) 717 { 718 struct neglist *neglist; 719 720 MPASS(ncp->nc_flag & NCF_NEGATIVE); 721 if (ncp->nc_flag & NCF_HOTNEGATIVE) 722 return; 723 neglist = NCP2NEGLIST(ncp); 724 mtx_lock(&ncneg_hot.nl_lock); 725 mtx_lock(&neglist->nl_lock); 726 if (!(ncp->nc_flag & NCF_HOTNEGATIVE)) { 727 numhotneg++; 728 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 729 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst); 730 ncp->nc_flag |= NCF_HOTNEGATIVE; 731 } 732 mtx_unlock(&neglist->nl_lock); 733 mtx_unlock(&ncneg_hot.nl_lock); 734 } 735 736 static void 737 cache_negative_insert(struct namecache *ncp, bool neg_locked) 738 { 739 struct neglist *neglist; 740 741 MPASS(ncp->nc_flag & NCF_NEGATIVE); 742 cache_assert_bucket_locked(ncp, RA_WLOCKED); 743 neglist = NCP2NEGLIST(ncp); 744 if (!neg_locked) { 745 mtx_lock(&neglist->nl_lock); 746 } else { 747 mtx_assert(&neglist->nl_lock, MA_OWNED); 748 } 749 TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst); 750 if (!neg_locked) 751 mtx_unlock(&neglist->nl_lock); 752 atomic_add_rel_long(&numneg, 1); 753 } 754 755 static void 756 cache_negative_remove(struct namecache *ncp, bool neg_locked) 757 { 758 struct neglist *neglist; 759 bool hot_locked = false; 760 bool list_locked = false; 761 762 MPASS(ncp->nc_flag & NCF_NEGATIVE); 763 cache_assert_bucket_locked(ncp, RA_WLOCKED); 764 neglist = NCP2NEGLIST(ncp); 765 if (!neg_locked) { 766 if (ncp->nc_flag & NCF_HOTNEGATIVE) { 767 hot_locked = true; 768 mtx_lock(&ncneg_hot.nl_lock); 769 if (!(ncp->nc_flag & NCF_HOTNEGATIVE)) { 770 list_locked = true; 771 mtx_lock(&neglist->nl_lock); 772 } 773 } else { 774 list_locked = true; 775 mtx_lock(&neglist->nl_lock); 776 } 777 } 778 if (ncp->nc_flag & NCF_HOTNEGATIVE) { 779 mtx_assert(&ncneg_hot.nl_lock, MA_OWNED); 780 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); 781 numhotneg--; 782 } else { 783 mtx_assert(&neglist->nl_lock, MA_OWNED); 784 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 785 } 786 if (list_locked) 787 mtx_unlock(&neglist->nl_lock); 788 if (hot_locked) 789 mtx_unlock(&ncneg_hot.nl_lock); 790 atomic_subtract_rel_long(&numneg, 1); 791 } 792 793 static void 794 cache_negative_shrink_select(int start, struct namecache **ncpp, 795 struct neglist **neglistpp) 796 { 797 struct neglist *neglist; 798 struct namecache *ncp; 799 int i; 800 801 *ncpp = ncp = NULL; 802 neglist = NULL; 803 804 for (i = start; i < numneglists; i++) { 805 neglist = &neglists[i]; 806 if (TAILQ_FIRST(&neglist->nl_list) == NULL) 807 continue; 808 mtx_lock(&neglist->nl_lock); 809 ncp = TAILQ_FIRST(&neglist->nl_list); 810 if (ncp != NULL) 811 break; 812 mtx_unlock(&neglist->nl_lock); 813 } 814 815 *neglistpp = neglist; 816 *ncpp = ncp; 817 } 818 819 static void 820 cache_negative_zap_one(void) 821 { 822 struct namecache *ncp, *ncp2; 823 struct neglist *neglist; 824 struct mtx *dvlp; 825 struct rwlock *blp; 826 827 if (mtx_owner(&ncneg_shrink_lock) != NULL || 828 !mtx_trylock(&ncneg_shrink_lock)) { 829 counter_u64_add(shrinking_skipped, 1); 830 return; 831 } 832 833 mtx_lock(&ncneg_hot.nl_lock); 834 ncp = TAILQ_FIRST(&ncneg_hot.nl_list); 835 if (ncp != NULL) { 836 neglist = NCP2NEGLIST(ncp); 837 mtx_lock(&neglist->nl_lock); 838 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); 839 TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst); 840 ncp->nc_flag &= ~NCF_HOTNEGATIVE; 841 numhotneg--; 842 mtx_unlock(&neglist->nl_lock); 843 } 844 mtx_unlock(&ncneg_hot.nl_lock); 845 846 cache_negative_shrink_select(shrink_list_turn, &ncp, &neglist); 847 
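
	/*
	 * Advance the round-robin cursor; if the first pass came up empty
	 * and the cursor wrapped, scan once more from the beginning (see
	 * the second cache_negative_shrink_select() call below).
	 */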
shrink_list_turn++; 848 if (shrink_list_turn == numneglists) 849 shrink_list_turn = 0; 850 if (ncp == NULL && shrink_list_turn == 0) 851 cache_negative_shrink_select(shrink_list_turn, &ncp, &neglist); 852 mtx_unlock(&ncneg_shrink_lock); 853 if (ncp == NULL) 854 return; 855 856 MPASS(ncp->nc_flag & NCF_NEGATIVE); 857 dvlp = VP2VNODELOCK(ncp->nc_dvp); 858 blp = NCP2BUCKETLOCK(ncp); 859 mtx_unlock(&neglist->nl_lock); 860 mtx_lock(dvlp); 861 rw_wlock(blp); 862 mtx_lock(&neglist->nl_lock); 863 ncp2 = TAILQ_FIRST(&neglist->nl_list); 864 if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) || 865 blp != NCP2BUCKETLOCK(ncp2) || !(ncp2->nc_flag & NCF_NEGATIVE)) { 866 ncp = NULL; 867 } else { 868 SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp, 869 ncp->nc_name); 870 871 cache_zap_locked(ncp, true); 872 counter_u64_add(numneg_evicted, 1); 873 } 874 mtx_unlock(&neglist->nl_lock); 875 rw_wunlock(blp); 876 mtx_unlock(dvlp); 877 cache_free(ncp); 878 } 879 880 /* 881 * cache_zap_locked(): 882 * 883 * Removes a namecache entry from cache, whether it contains an actual 884 * pointer to a vnode or if it is just a negative cache entry. 885 */ 886 static void 887 cache_zap_locked(struct namecache *ncp, bool neg_locked) 888 { 889 890 if (!(ncp->nc_flag & NCF_NEGATIVE)) 891 cache_assert_vnode_locked(ncp->nc_vp); 892 cache_assert_vnode_locked(ncp->nc_dvp); 893 cache_assert_bucket_locked(ncp, RA_WLOCKED); 894 895 CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, 896 (ncp->nc_flag & NCF_NEGATIVE) ? NULL : ncp->nc_vp); 897 898 cache_ncp_invalidate(ncp); 899 900 CK_LIST_REMOVE(ncp, nc_hash); 901 if (!(ncp->nc_flag & NCF_NEGATIVE)) { 902 SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp, 903 ncp->nc_name, ncp->nc_vp); 904 TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst); 905 if (ncp == ncp->nc_vp->v_cache_dd) 906 ncp->nc_vp->v_cache_dd = NULL; 907 } else { 908 SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp, 909 ncp->nc_name); 910 cache_negative_remove(ncp, neg_locked); 911 } 912 if (ncp->nc_flag & NCF_ISDOTDOT) { 913 if (ncp == ncp->nc_dvp->v_cache_dd) 914 ncp->nc_dvp->v_cache_dd = NULL; 915 } else { 916 LIST_REMOVE(ncp, nc_src); 917 if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) { 918 ncp->nc_flag |= NCF_DVDROP; 919 counter_u64_add(numcachehv, -1); 920 } 921 } 922 atomic_subtract_rel_long(&numcache, 1); 923 } 924 925 static void 926 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp) 927 { 928 struct rwlock *blp; 929 930 MPASS(ncp->nc_dvp == vp); 931 MPASS(ncp->nc_flag & NCF_NEGATIVE); 932 cache_assert_vnode_locked(vp); 933 934 blp = NCP2BUCKETLOCK(ncp); 935 rw_wlock(blp); 936 cache_zap_locked(ncp, false); 937 rw_wunlock(blp); 938 } 939 940 static bool 941 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp, 942 struct mtx **vlpp) 943 { 944 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; 945 struct rwlock *blp; 946 947 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); 948 cache_assert_vnode_locked(vp); 949 950 if (ncp->nc_flag & NCF_NEGATIVE) { 951 if (*vlpp != NULL) { 952 mtx_unlock(*vlpp); 953 *vlpp = NULL; 954 } 955 cache_zap_negative_locked_vnode_kl(ncp, vp); 956 return (true); 957 } 958 959 pvlp = VP2VNODELOCK(vp); 960 blp = NCP2BUCKETLOCK(ncp); 961 vlp1 = VP2VNODELOCK(ncp->nc_dvp); 962 vlp2 = VP2VNODELOCK(ncp->nc_vp); 963 964 if (*vlpp == vlp1 || *vlpp == vlp2) { 965 to_unlock = *vlpp; 966 *vlpp = NULL; 967 } else { 968 if (*vlpp != NULL) { 969 mtx_unlock(*vlpp); 970 *vlpp = NULL; 971 } 972 cache_sort_vnodes(&vlp1, &vlp2); 973 if (vlp1 == pvlp) { 974 
mtx_lock(vlp2); 975 to_unlock = vlp2; 976 } else { 977 if (!mtx_trylock(vlp1)) 978 goto out_relock; 979 to_unlock = vlp1; 980 } 981 } 982 rw_wlock(blp); 983 cache_zap_locked(ncp, false); 984 rw_wunlock(blp); 985 if (to_unlock != NULL) 986 mtx_unlock(to_unlock); 987 return (true); 988 989 out_relock: 990 mtx_unlock(vlp2); 991 mtx_lock(vlp1); 992 mtx_lock(vlp2); 993 MPASS(*vlpp == NULL); 994 *vlpp = vlp1; 995 return (false); 996 } 997 998 static int __noinline 999 cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp) 1000 { 1001 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; 1002 struct rwlock *blp; 1003 int error = 0; 1004 1005 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); 1006 cache_assert_vnode_locked(vp); 1007 1008 pvlp = VP2VNODELOCK(vp); 1009 if (ncp->nc_flag & NCF_NEGATIVE) { 1010 cache_zap_negative_locked_vnode_kl(ncp, vp); 1011 goto out; 1012 } 1013 1014 blp = NCP2BUCKETLOCK(ncp); 1015 vlp1 = VP2VNODELOCK(ncp->nc_dvp); 1016 vlp2 = VP2VNODELOCK(ncp->nc_vp); 1017 cache_sort_vnodes(&vlp1, &vlp2); 1018 if (vlp1 == pvlp) { 1019 mtx_lock(vlp2); 1020 to_unlock = vlp2; 1021 } else { 1022 if (!mtx_trylock(vlp1)) { 1023 error = EAGAIN; 1024 goto out; 1025 } 1026 to_unlock = vlp1; 1027 } 1028 rw_wlock(blp); 1029 cache_zap_locked(ncp, false); 1030 rw_wunlock(blp); 1031 mtx_unlock(to_unlock); 1032 out: 1033 mtx_unlock(pvlp); 1034 return (error); 1035 } 1036 1037 /* 1038 * If trylocking failed we can get here. We know enough to take all needed locks 1039 * in the right order and re-lookup the entry. 1040 */ 1041 static int 1042 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1043 struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash, 1044 struct rwlock *blp) 1045 { 1046 struct namecache *rncp; 1047 1048 cache_assert_bucket_locked(ncp, RA_UNLOCKED); 1049 1050 cache_sort_vnodes(&dvlp, &vlp); 1051 cache_lock_vnodes(dvlp, vlp); 1052 rw_wlock(blp); 1053 CK_LIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) { 1054 if (rncp == ncp && rncp->nc_dvp == dvp && 1055 rncp->nc_nlen == cnp->cn_namelen && 1056 !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen)) 1057 break; 1058 } 1059 if (rncp != NULL) { 1060 cache_zap_locked(rncp, false); 1061 rw_wunlock(blp); 1062 cache_unlock_vnodes(dvlp, vlp); 1063 counter_u64_add(zap_and_exit_bucket_relock_success, 1); 1064 return (0); 1065 } 1066 1067 rw_wunlock(blp); 1068 cache_unlock_vnodes(dvlp, vlp); 1069 return (EAGAIN); 1070 } 1071 1072 static int __noinline 1073 cache_zap_wlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1074 uint32_t hash, struct rwlock *blp) 1075 { 1076 struct mtx *dvlp, *vlp; 1077 struct vnode *dvp; 1078 1079 cache_assert_bucket_locked(ncp, RA_WLOCKED); 1080 1081 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1082 vlp = NULL; 1083 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1084 vlp = VP2VNODELOCK(ncp->nc_vp); 1085 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1086 cache_zap_locked(ncp, false); 1087 rw_wunlock(blp); 1088 cache_unlock_vnodes(dvlp, vlp); 1089 return (0); 1090 } 1091 1092 dvp = ncp->nc_dvp; 1093 rw_wunlock(blp); 1094 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp)); 1095 } 1096 1097 static int __noinline 1098 cache_zap_rlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1099 uint32_t hash, struct rwlock *blp) 1100 { 1101 struct mtx *dvlp, *vlp; 1102 struct vnode *dvp; 1103 1104 cache_assert_bucket_locked(ncp, RA_RLOCKED); 1105 1106 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1107 vlp = NULL; 1108 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1109 vlp = VP2VNODELOCK(ncp->nc_vp); 
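	/*
	 * If the vnode locks are obtained below, no other thread can zap
	 * this entry (zapping requires those locks), so it is safe to trade
	 * the bucket read lock for a write lock before removing it.
	 */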
1110 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1111 rw_runlock(blp); 1112 rw_wlock(blp); 1113 cache_zap_locked(ncp, false); 1114 rw_wunlock(blp); 1115 cache_unlock_vnodes(dvlp, vlp); 1116 return (0); 1117 } 1118 1119 dvp = ncp->nc_dvp; 1120 rw_runlock(blp); 1121 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp)); 1122 } 1123 1124 static int 1125 cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp, 1126 struct mtx **vlpp1, struct mtx **vlpp2) 1127 { 1128 struct mtx *dvlp, *vlp; 1129 1130 cache_assert_bucket_locked(ncp, RA_WLOCKED); 1131 1132 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1133 vlp = NULL; 1134 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1135 vlp = VP2VNODELOCK(ncp->nc_vp); 1136 cache_sort_vnodes(&dvlp, &vlp); 1137 1138 if (*vlpp1 == dvlp && *vlpp2 == vlp) { 1139 cache_zap_locked(ncp, false); 1140 cache_unlock_vnodes(dvlp, vlp); 1141 *vlpp1 = NULL; 1142 *vlpp2 = NULL; 1143 return (0); 1144 } 1145 1146 if (*vlpp1 != NULL) 1147 mtx_unlock(*vlpp1); 1148 if (*vlpp2 != NULL) 1149 mtx_unlock(*vlpp2); 1150 *vlpp1 = NULL; 1151 *vlpp2 = NULL; 1152 1153 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1154 cache_zap_locked(ncp, false); 1155 cache_unlock_vnodes(dvlp, vlp); 1156 return (0); 1157 } 1158 1159 rw_wunlock(blp); 1160 *vlpp1 = dvlp; 1161 *vlpp2 = vlp; 1162 if (*vlpp1 != NULL) 1163 mtx_lock(*vlpp1); 1164 mtx_lock(*vlpp2); 1165 rw_wlock(blp); 1166 return (EAGAIN); 1167 } 1168 1169 static void 1170 cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp) 1171 { 1172 1173 if (blp != NULL) { 1174 rw_runlock(blp); 1175 } else { 1176 mtx_unlock(vlp); 1177 } 1178 } 1179 1180 static int __noinline 1181 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1182 struct timespec *tsp, int *ticksp) 1183 { 1184 int ltype; 1185 1186 *vpp = dvp; 1187 CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .", 1188 dvp, cnp->cn_nameptr); 1189 counter_u64_add(dothits, 1); 1190 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp); 1191 if (tsp != NULL) 1192 timespecclear(tsp); 1193 if (ticksp != NULL) 1194 *ticksp = ticks; 1195 vrefact(*vpp); 1196 /* 1197 * When we lookup "." we still can be asked to lock it 1198 * differently... 1199 */ 1200 ltype = cnp->cn_lkflags & LK_TYPE_MASK; 1201 if (ltype != VOP_ISLOCKED(*vpp)) { 1202 if (ltype == LK_EXCLUSIVE) { 1203 vn_lock(*vpp, LK_UPGRADE | LK_RETRY); 1204 if (VN_IS_DOOMED((*vpp))) { 1205 /* forced unmount */ 1206 vrele(*vpp); 1207 *vpp = NULL; 1208 return (ENOENT); 1209 } 1210 } else 1211 vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY); 1212 } 1213 return (-1); 1214 } 1215 1216 static __noinline int 1217 cache_lookup_nomakeentry(struct vnode *dvp, struct vnode **vpp, 1218 struct componentname *cnp, struct timespec *tsp, int *ticksp) 1219 { 1220 struct namecache *ncp; 1221 struct rwlock *blp; 1222 struct mtx *dvlp, *dvlp2; 1223 uint32_t hash; 1224 int error; 1225 1226 if (cnp->cn_namelen == 2 && 1227 cnp->cn_nameptr[0] == '.' 
	    && cnp->cn_nameptr[1] == '.') {
		counter_u64_add(dotdothits, 1);
		dvlp = VP2VNODELOCK(dvp);
		dvlp2 = NULL;
		mtx_lock(dvlp);
retry_dotdot:
		ncp = dvp->v_cache_dd;
		if (ncp == NULL) {
			SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
			    "..", NULL);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			return (0);
		}
		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
			if (ncp->nc_dvp != dvp)
				panic("dvp %p v_cache_dd %p\n", dvp, ncp);
			if (!cache_zap_locked_vnode_kl2(ncp,
			    dvp, &dvlp2))
				goto retry_dotdot;
			MPASS(dvp->v_cache_dd == NULL);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			cache_free(ncp);
		} else {
			dvp->v_cache_dd = NULL;
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
		}
		return (0);
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	blp = HASH2BUCKETLOCK(hash);
retry:
	if (CK_LIST_EMPTY(NCHHASH(hash)))
		goto out_no_entry;

	rw_wlock(blp);

	CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		counter_u64_add(numchecks, 1);
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	/* We failed to find an entry */
	if (ncp == NULL) {
		rw_wunlock(blp);
		goto out_no_entry;
	}

	error = cache_zap_wlocked_bucket(ncp, cnp, hash, blp);
	if (__predict_false(error != 0)) {
		zap_and_exit_bucket_fail++;
		cache_maybe_yield();
		goto retry;
	}
	counter_u64_add(numposzaps, 1);
	cache_free(ncp);
	return (0);
out_no_entry:
	SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, NULL);
	counter_u64_add(nummisszap, 1);
	return (0);
}

/**
 * Lookup a name in the name cache
 *
 * # Arguments
 *
 * - dvp:	Parent directory in which to search.
 * - vpp:	Return argument.  Will contain desired vnode on cache hit.
 * - cnp:	Parameters of the name search.  The most interesting bits of
 *		the cn_flags field have the following meanings:
 *		- MAKEENTRY:	If clear, free an entry from the cache rather
 *				than look it up.
 *		- ISDOTDOT:	Must be set if and only if cn_nameptr == ".."
 * - tsp:	Return storage for cache timestamp.  On a successful (positive
 *		or negative) lookup, tsp will be filled with any timespec that
 *		was stored when this cache entry was created.  However, it will
 *		be clear for "." entries.
 * - ticks:	Return storage for alternate cache timestamp.  On a successful
 *		(positive or negative) lookup, it will contain the ticks value
 *		that was current when the cache entry was created, unless cnp
 *		was ".".
 *
 * # Returns
 *
 * - -1:	A positive cache hit.  vpp will contain the desired vnode.
 * - ENOENT:	A negative cache hit, or dvp was recycled out from under us due
 *		to a forced unmount.  vpp will not be modified.  If the entry
 *		is a whiteout, then the ISWHITEOUT flag will be set in
 *		cnp->cn_flags.
 * - 0:		A cache miss.  vpp will not be modified.
 *
 * # Locking
 *
 * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
 * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
 * lock is not recursively acquired.
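 *
 * Typical use by a filesystem lookup routine (a sketch; vfs_cache_lookup()
 * below is the canonical caller):
 *
 *	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
 *	if (error == 0)
 *		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));	... cache miss
 *	if (error == -1)
 *		return (0);					... positive hit
 *	return (error);						... negative hit (ENOENT)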
1333 */ 1334 int 1335 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1336 struct timespec *tsp, int *ticksp) 1337 { 1338 struct namecache_ts *ncp_ts; 1339 struct namecache *ncp; 1340 struct rwlock *blp; 1341 struct mtx *dvlp; 1342 uint32_t hash; 1343 enum vgetstate vs; 1344 int error, ltype; 1345 bool try_smr, doing_smr, whiteout; 1346 1347 #ifdef DEBUG_CACHE 1348 if (__predict_false(!doingcache)) { 1349 cnp->cn_flags &= ~MAKEENTRY; 1350 return (0); 1351 } 1352 #endif 1353 1354 counter_u64_add(numcalls, 1); 1355 1356 if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) 1357 return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp)); 1358 1359 if ((cnp->cn_flags & MAKEENTRY) == 0) 1360 return (cache_lookup_nomakeentry(dvp, vpp, cnp, tsp, ticksp)); 1361 1362 try_smr = true; 1363 if (cnp->cn_nameiop == CREATE) 1364 try_smr = false; 1365 retry: 1366 doing_smr = false; 1367 blp = NULL; 1368 dvlp = NULL; 1369 error = 0; 1370 if (cnp->cn_namelen == 2 && 1371 cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') { 1372 counter_u64_add(dotdothits, 1); 1373 dvlp = VP2VNODELOCK(dvp); 1374 mtx_lock(dvlp); 1375 ncp = dvp->v_cache_dd; 1376 if (ncp == NULL) { 1377 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, 1378 "..", NULL); 1379 mtx_unlock(dvlp); 1380 return (0); 1381 } 1382 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1383 if (ncp->nc_flag & NCF_NEGATIVE) 1384 *vpp = NULL; 1385 else 1386 *vpp = ncp->nc_vp; 1387 } else 1388 *vpp = ncp->nc_dvp; 1389 /* Return failure if negative entry was found. */ 1390 if (*vpp == NULL) 1391 goto negative_success; 1392 CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..", 1393 dvp, cnp->cn_nameptr, *vpp); 1394 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", 1395 *vpp); 1396 cache_out_ts(ncp, tsp, ticksp); 1397 if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) == 1398 NCF_DTS && tsp != NULL) { 1399 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 1400 *tsp = ncp_ts->nc_dotdottime; 1401 } 1402 goto success; 1403 } 1404 1405 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1406 retry_hashed: 1407 if (try_smr) { 1408 vfs_smr_enter(); 1409 doing_smr = true; 1410 try_smr = false; 1411 } else { 1412 blp = HASH2BUCKETLOCK(hash); 1413 rw_rlock(blp); 1414 } 1415 1416 CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1417 counter_u64_add(numchecks, 1); 1418 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1419 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1420 break; 1421 } 1422 1423 /* We failed to find an entry */ 1424 if (__predict_false(ncp == NULL)) { 1425 if (doing_smr) 1426 vfs_smr_exit(); 1427 else 1428 rw_runlock(blp); 1429 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, 1430 NULL); 1431 counter_u64_add(nummiss, 1); 1432 return (0); 1433 } 1434 1435 if (ncp->nc_flag & NCF_NEGATIVE) 1436 goto negative_success; 1437 1438 /* We found a "positive" match, return the vnode */ 1439 counter_u64_add(numposhits, 1); 1440 *vpp = ncp->nc_vp; 1441 CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p", 1442 dvp, cnp->cn_nameptr, *vpp, ncp); 1443 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, 1444 *vpp); 1445 cache_out_ts(ncp, tsp, ticksp); 1446 success: 1447 /* 1448 * On success we return a locked and ref'd vnode as per the lookup 1449 * protocol. 
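 *
 * In the lockless (SMR) case the entry is revalidated with
 * cache_ncp_invalid() and the vnode is acquired via vget_prep_smr() before
 * the SMR section is left; any failure restarts the whole lookup.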
1450 */ 1451 MPASS(dvp != *vpp); 1452 ltype = 0; /* silence gcc warning */ 1453 if (cnp->cn_flags & ISDOTDOT) { 1454 ltype = VOP_ISLOCKED(dvp); 1455 VOP_UNLOCK(dvp); 1456 } 1457 if (doing_smr) { 1458 if (cache_ncp_invalid(ncp)) { 1459 vfs_smr_exit(); 1460 *vpp = NULL; 1461 goto retry; 1462 } 1463 vs = vget_prep_smr(*vpp); 1464 vfs_smr_exit(); 1465 if (vs == VGET_NONE) { 1466 *vpp = NULL; 1467 goto retry; 1468 } 1469 } else { 1470 vs = vget_prep(*vpp); 1471 cache_lookup_unlock(blp, dvlp); 1472 } 1473 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 1474 if (cnp->cn_flags & ISDOTDOT) { 1475 vn_lock(dvp, ltype | LK_RETRY); 1476 if (VN_IS_DOOMED(dvp)) { 1477 if (error == 0) 1478 vput(*vpp); 1479 *vpp = NULL; 1480 return (ENOENT); 1481 } 1482 } 1483 if (error) { 1484 *vpp = NULL; 1485 goto retry; 1486 } 1487 if ((cnp->cn_flags & ISLASTCN) && 1488 (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) { 1489 ASSERT_VOP_ELOCKED(*vpp, "cache_lookup"); 1490 } 1491 return (-1); 1492 1493 negative_success: 1494 /* We found a negative match, and want to create it, so purge */ 1495 if (cnp->cn_nameiop == CREATE) { 1496 MPASS(!doing_smr); 1497 counter_u64_add(numnegzaps, 1); 1498 goto zap_and_exit; 1499 } 1500 1501 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name); 1502 cache_out_ts(ncp, tsp, ticksp); 1503 counter_u64_add(numneghits, 1); 1504 whiteout = (ncp->nc_flag & NCF_WHITE); 1505 1506 if (doing_smr) { 1507 /* 1508 * We need to take locks to promote an entry. 1509 */ 1510 if ((ncp->nc_flag & NCF_HOTNEGATIVE) == 0 || 1511 cache_ncp_invalid(ncp)) { 1512 vfs_smr_exit(); 1513 doing_smr = false; 1514 goto retry_hashed; 1515 } 1516 vfs_smr_exit(); 1517 } else { 1518 cache_negative_hit(ncp); 1519 cache_lookup_unlock(blp, dvlp); 1520 } 1521 if (whiteout) 1522 cnp->cn_flags |= ISWHITEOUT; 1523 return (ENOENT); 1524 1525 zap_and_exit: 1526 MPASS(!doing_smr); 1527 if (blp != NULL) 1528 error = cache_zap_rlocked_bucket(ncp, cnp, hash, blp); 1529 else 1530 error = cache_zap_locked_vnode(ncp, dvp); 1531 if (__predict_false(error != 0)) { 1532 zap_and_exit_bucket_fail2++; 1533 cache_maybe_yield(); 1534 goto retry; 1535 } 1536 cache_free(ncp); 1537 return (0); 1538 } 1539 1540 struct celockstate { 1541 struct mtx *vlp[3]; 1542 struct rwlock *blp[2]; 1543 }; 1544 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3)); 1545 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2)); 1546 1547 static inline void 1548 cache_celockstate_init(struct celockstate *cel) 1549 { 1550 1551 bzero(cel, sizeof(*cel)); 1552 } 1553 1554 static void 1555 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp, 1556 struct vnode *dvp) 1557 { 1558 struct mtx *vlp1, *vlp2; 1559 1560 MPASS(cel->vlp[0] == NULL); 1561 MPASS(cel->vlp[1] == NULL); 1562 MPASS(cel->vlp[2] == NULL); 1563 1564 MPASS(vp != NULL || dvp != NULL); 1565 1566 vlp1 = VP2VNODELOCK(vp); 1567 vlp2 = VP2VNODELOCK(dvp); 1568 cache_sort_vnodes(&vlp1, &vlp2); 1569 1570 if (vlp1 != NULL) { 1571 mtx_lock(vlp1); 1572 cel->vlp[0] = vlp1; 1573 } 1574 mtx_lock(vlp2); 1575 cel->vlp[1] = vlp2; 1576 } 1577 1578 static void 1579 cache_unlock_vnodes_cel(struct celockstate *cel) 1580 { 1581 1582 MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL); 1583 1584 if (cel->vlp[0] != NULL) 1585 mtx_unlock(cel->vlp[0]); 1586 if (cel->vlp[1] != NULL) 1587 mtx_unlock(cel->vlp[1]); 1588 if (cel->vlp[2] != NULL) 1589 mtx_unlock(cel->vlp[2]); 1590 } 1591 1592 static bool 1593 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp) 1594 { 1595 struct mtx *vlp; 1596 bool 
ret; 1597 1598 cache_assert_vlp_locked(cel->vlp[0]); 1599 cache_assert_vlp_locked(cel->vlp[1]); 1600 MPASS(cel->vlp[2] == NULL); 1601 1602 MPASS(vp != NULL); 1603 vlp = VP2VNODELOCK(vp); 1604 1605 ret = true; 1606 if (vlp >= cel->vlp[1]) { 1607 mtx_lock(vlp); 1608 } else { 1609 if (mtx_trylock(vlp)) 1610 goto out; 1611 cache_lock_vnodes_cel_3_failures++; 1612 cache_unlock_vnodes_cel(cel); 1613 if (vlp < cel->vlp[0]) { 1614 mtx_lock(vlp); 1615 mtx_lock(cel->vlp[0]); 1616 mtx_lock(cel->vlp[1]); 1617 } else { 1618 if (cel->vlp[0] != NULL) 1619 mtx_lock(cel->vlp[0]); 1620 mtx_lock(vlp); 1621 mtx_lock(cel->vlp[1]); 1622 } 1623 ret = false; 1624 } 1625 out: 1626 cel->vlp[2] = vlp; 1627 return (ret); 1628 } 1629 1630 static void 1631 cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1, 1632 struct rwlock *blp2) 1633 { 1634 1635 MPASS(cel->blp[0] == NULL); 1636 MPASS(cel->blp[1] == NULL); 1637 1638 cache_sort_vnodes(&blp1, &blp2); 1639 1640 if (blp1 != NULL) { 1641 rw_wlock(blp1); 1642 cel->blp[0] = blp1; 1643 } 1644 rw_wlock(blp2); 1645 cel->blp[1] = blp2; 1646 } 1647 1648 static void 1649 cache_unlock_buckets_cel(struct celockstate *cel) 1650 { 1651 1652 if (cel->blp[0] != NULL) 1653 rw_wunlock(cel->blp[0]); 1654 rw_wunlock(cel->blp[1]); 1655 } 1656 1657 /* 1658 * Lock part of the cache affected by the insertion. 1659 * 1660 * This means vnodelocks for dvp, vp and the relevant bucketlock. 1661 * However, insertion can result in removal of an old entry. In this 1662 * case we have an additional vnode and bucketlock pair to lock. If the 1663 * entry is negative, ncelock is locked instead of the vnode. 1664 * 1665 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while 1666 * preserving the locking order (smaller address first). 1667 */ 1668 static void 1669 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 1670 uint32_t hash) 1671 { 1672 struct namecache *ncp; 1673 struct rwlock *blps[2]; 1674 1675 blps[0] = HASH2BUCKETLOCK(hash); 1676 for (;;) { 1677 blps[1] = NULL; 1678 cache_lock_vnodes_cel(cel, dvp, vp); 1679 if (vp == NULL || vp->v_type != VDIR) 1680 break; 1681 ncp = vp->v_cache_dd; 1682 if (ncp == NULL) 1683 break; 1684 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 1685 break; 1686 MPASS(ncp->nc_dvp == vp); 1687 blps[1] = NCP2BUCKETLOCK(ncp); 1688 if (ncp->nc_flag & NCF_NEGATIVE) 1689 break; 1690 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 1691 break; 1692 /* 1693 * All vnodes got re-locked. Re-validate the state and if 1694 * nothing changed we are done. Otherwise restart. 
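 * The recheck below repeats exactly what was read before the third vnode
 * lock was taken: the ".." entry identity, its bucket lock, and the vnode
 * lock of the entry's target vnode.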
1695 */ 1696 if (ncp == vp->v_cache_dd && 1697 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 1698 blps[1] == NCP2BUCKETLOCK(ncp) && 1699 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 1700 break; 1701 cache_unlock_vnodes_cel(cel); 1702 cel->vlp[0] = NULL; 1703 cel->vlp[1] = NULL; 1704 cel->vlp[2] = NULL; 1705 } 1706 cache_lock_buckets_cel(cel, blps[0], blps[1]); 1707 } 1708 1709 static void 1710 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 1711 uint32_t hash) 1712 { 1713 struct namecache *ncp; 1714 struct rwlock *blps[2]; 1715 1716 blps[0] = HASH2BUCKETLOCK(hash); 1717 for (;;) { 1718 blps[1] = NULL; 1719 cache_lock_vnodes_cel(cel, dvp, vp); 1720 ncp = dvp->v_cache_dd; 1721 if (ncp == NULL) 1722 break; 1723 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 1724 break; 1725 MPASS(ncp->nc_dvp == dvp); 1726 blps[1] = NCP2BUCKETLOCK(ncp); 1727 if (ncp->nc_flag & NCF_NEGATIVE) 1728 break; 1729 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 1730 break; 1731 if (ncp == dvp->v_cache_dd && 1732 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 1733 blps[1] == NCP2BUCKETLOCK(ncp) && 1734 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 1735 break; 1736 cache_unlock_vnodes_cel(cel); 1737 cel->vlp[0] = NULL; 1738 cel->vlp[1] = NULL; 1739 cel->vlp[2] = NULL; 1740 } 1741 cache_lock_buckets_cel(cel, blps[0], blps[1]); 1742 } 1743 1744 static void 1745 cache_enter_unlock(struct celockstate *cel) 1746 { 1747 1748 cache_unlock_buckets_cel(cel); 1749 cache_unlock_vnodes_cel(cel); 1750 } 1751 1752 static void __noinline 1753 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp, 1754 struct componentname *cnp) 1755 { 1756 struct celockstate cel; 1757 struct namecache *ncp; 1758 uint32_t hash; 1759 int len; 1760 1761 if (dvp->v_cache_dd == NULL) 1762 return; 1763 len = cnp->cn_namelen; 1764 cache_celockstate_init(&cel); 1765 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 1766 cache_enter_lock_dd(&cel, dvp, vp, hash); 1767 ncp = dvp->v_cache_dd; 1768 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) { 1769 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent")); 1770 cache_zap_locked(ncp, false); 1771 } else { 1772 ncp = NULL; 1773 } 1774 dvp->v_cache_dd = NULL; 1775 cache_enter_unlock(&cel); 1776 cache_free(ncp); 1777 } 1778 1779 /* 1780 * Add an entry to the cache. 1781 */ 1782 void 1783 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, 1784 struct timespec *tsp, struct timespec *dtsp) 1785 { 1786 struct celockstate cel; 1787 struct namecache *ncp, *n2, *ndd; 1788 struct namecache_ts *ncp_ts, *n2_ts; 1789 struct nchashhead *ncpp; 1790 uint32_t hash; 1791 int flag; 1792 int len; 1793 u_long lnumcache; 1794 1795 CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr); 1796 VNASSERT(vp == NULL || !VN_IS_DOOMED(vp), vp, 1797 ("cache_enter: Adding a doomed vnode")); 1798 VNASSERT(dvp == NULL || !VN_IS_DOOMED(dvp), dvp, 1799 ("cache_enter: Doomed vnode used as src")); 1800 1801 #ifdef DEBUG_CACHE 1802 if (__predict_false(!doingcache)) 1803 return; 1804 #endif 1805 1806 flag = 0; 1807 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 1808 if (cnp->cn_namelen == 1) 1809 return; 1810 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { 1811 cache_enter_dotdot_prep(dvp, vp, cnp); 1812 flag = NCF_ISDOTDOT; 1813 } 1814 } 1815 1816 /* 1817 * Avoid blowout in namecache entries. 
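 * The counter is bumped first and rolled back if the limit was exceeded;
 * a brief overshoot is tolerated so that the common path stays a single
 * atomic operation.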
1818 */ 1819 lnumcache = atomic_fetchadd_long(&numcache, 1) + 1; 1820 if (__predict_false(lnumcache >= ncsize)) { 1821 atomic_add_long(&numcache, -1); 1822 return; 1823 } 1824 1825 cache_celockstate_init(&cel); 1826 ndd = NULL; 1827 ncp_ts = NULL; 1828 1829 /* 1830 * Calculate the hash key and setup as much of the new 1831 * namecache entry as possible before acquiring the lock. 1832 */ 1833 ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); 1834 ncp->nc_flag = flag; 1835 ncp->nc_vp = vp; 1836 if (vp == NULL) 1837 ncp->nc_flag |= NCF_NEGATIVE; 1838 ncp->nc_dvp = dvp; 1839 if (tsp != NULL) { 1840 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 1841 ncp_ts->nc_time = *tsp; 1842 ncp_ts->nc_ticks = ticks; 1843 ncp_ts->nc_nc.nc_flag |= NCF_TS; 1844 if (dtsp != NULL) { 1845 ncp_ts->nc_dotdottime = *dtsp; 1846 ncp_ts->nc_nc.nc_flag |= NCF_DTS; 1847 } 1848 } 1849 len = ncp->nc_nlen = cnp->cn_namelen; 1850 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 1851 strlcpy(ncp->nc_name, cnp->cn_nameptr, len + 1); 1852 cache_enter_lock(&cel, dvp, vp, hash); 1853 1854 /* 1855 * See if this vnode or negative entry is already in the cache 1856 * with this name. This can happen with concurrent lookups of 1857 * the same path name. 1858 */ 1859 ncpp = NCHHASH(hash); 1860 CK_LIST_FOREACH(n2, ncpp, nc_hash) { 1861 if (n2->nc_dvp == dvp && 1862 n2->nc_nlen == cnp->cn_namelen && 1863 !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) { 1864 if (tsp != NULL) { 1865 KASSERT((n2->nc_flag & NCF_TS) != 0, 1866 ("no NCF_TS")); 1867 n2_ts = __containerof(n2, struct namecache_ts, nc_nc); 1868 n2_ts->nc_time = ncp_ts->nc_time; 1869 n2_ts->nc_ticks = ncp_ts->nc_ticks; 1870 if (dtsp != NULL) { 1871 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; 1872 if (ncp->nc_flag & NCF_NEGATIVE) 1873 mtx_lock(&ncneg_hot.nl_lock); 1874 n2_ts->nc_nc.nc_flag |= NCF_DTS; 1875 if (ncp->nc_flag & NCF_NEGATIVE) 1876 mtx_unlock(&ncneg_hot.nl_lock); 1877 } 1878 } 1879 goto out_unlock_free; 1880 } 1881 } 1882 1883 if (flag == NCF_ISDOTDOT) { 1884 /* 1885 * See if we are trying to add .. entry, but some other lookup 1886 * has populated v_cache_dd pointer already. 1887 */ 1888 if (dvp->v_cache_dd != NULL) 1889 goto out_unlock_free; 1890 KASSERT(vp == NULL || vp->v_type == VDIR, 1891 ("wrong vnode type %p", vp)); 1892 dvp->v_cache_dd = ncp; 1893 } 1894 1895 if (vp != NULL) { 1896 if (vp->v_type == VDIR) { 1897 if (flag != NCF_ISDOTDOT) { 1898 /* 1899 * For this case, the cache entry maps both the 1900 * directory name in it and the name ".." for the 1901 * directory's parent. 1902 */ 1903 if ((ndd = vp->v_cache_dd) != NULL) { 1904 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) 1905 cache_zap_locked(ndd, false); 1906 else 1907 ndd = NULL; 1908 } 1909 vp->v_cache_dd = ncp; 1910 } 1911 } else { 1912 vp->v_cache_dd = NULL; 1913 } 1914 } 1915 1916 if (flag != NCF_ISDOTDOT) { 1917 if (LIST_EMPTY(&dvp->v_cache_src)) { 1918 vhold(dvp); 1919 counter_u64_add(numcachehv, 1); 1920 } 1921 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); 1922 } 1923 1924 /* 1925 * If the entry is "negative", we place it into the 1926 * "negative" cache queue, otherwise, we place it into the 1927 * destination vnode's cache entries queue. 
1928 */ 1929 if (vp != NULL) { 1930 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); 1931 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, 1932 vp); 1933 } else { 1934 if (cnp->cn_flags & ISWHITEOUT) 1935 ncp->nc_flag |= NCF_WHITE; 1936 cache_negative_insert(ncp, false); 1937 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, 1938 ncp->nc_name); 1939 } 1940 1941 atomic_thread_fence_rel(); 1942 /* 1943 * Insert the new namecache entry into the appropriate chain 1944 * within the cache entries table. 1945 */ 1946 CK_LIST_INSERT_HEAD(ncpp, ncp, nc_hash); 1947 1948 cache_enter_unlock(&cel); 1949 if (numneg * ncnegfactor > lnumcache) 1950 cache_negative_zap_one(); 1951 cache_free(ndd); 1952 return; 1953 out_unlock_free: 1954 cache_enter_unlock(&cel); 1955 cache_free(ncp); 1956 return; 1957 } 1958 1959 static u_int 1960 cache_roundup_2(u_int val) 1961 { 1962 u_int res; 1963 1964 for (res = 1; res <= val; res <<= 1) 1965 continue; 1966 1967 return (res); 1968 } 1969 1970 /* 1971 * Name cache initialization, from vfs_init() when we are booting 1972 */ 1973 static void 1974 nchinit(void *dummy __unused) 1975 { 1976 u_int i; 1977 1978 cache_zone_small = uma_zcreate("S VFS Cache", 1979 sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1, 1980 NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache), 1981 UMA_ZONE_ZINIT); 1982 cache_zone_small_ts = uma_zcreate("STS VFS Cache", 1983 sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1, 1984 NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts), 1985 UMA_ZONE_ZINIT); 1986 cache_zone_large = uma_zcreate("L VFS Cache", 1987 sizeof(struct namecache) + NAME_MAX + 1, 1988 NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache), 1989 UMA_ZONE_ZINIT); 1990 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", 1991 sizeof(struct namecache_ts) + NAME_MAX + 1, 1992 NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts), 1993 UMA_ZONE_ZINIT); 1994 1995 VFS_SMR_ZONE_SET(cache_zone_small); 1996 VFS_SMR_ZONE_SET(cache_zone_small_ts); 1997 VFS_SMR_ZONE_SET(cache_zone_large); 1998 VFS_SMR_ZONE_SET(cache_zone_large_ts); 1999 2000 ncsize = desiredvnodes * ncsizefactor; 2001 nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash); 2002 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; 2003 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ 2004 ncbuckethash = 7; 2005 if (ncbuckethash > nchash) 2006 ncbuckethash = nchash; 2007 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, 2008 M_WAITOK | M_ZERO); 2009 for (i = 0; i < numbucketlocks; i++) 2010 rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE); 2011 ncvnodehash = ncbuckethash; 2012 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, 2013 M_WAITOK | M_ZERO); 2014 for (i = 0; i < numvnodelocks; i++) 2015 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); 2016 ncpurgeminvnodes = numbucketlocks * 2; 2017 2018 ncneghash = 3; 2019 neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE, 2020 M_WAITOK | M_ZERO); 2021 for (i = 0; i < numneglists; i++) { 2022 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); 2023 TAILQ_INIT(&neglists[i].nl_list); 2024 } 2025 mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF); 2026 TAILQ_INIT(&ncneg_hot.nl_list); 2027 2028 mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF); 2029 } 2030 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); 2031 2032 void 2033 cache_changesize(u_long newmaxvnodes) 2034 { 2035 struct nchashhead *new_nchashtbl, *old_nchashtbl; 2036 u_long 

void
cache_changesize(u_long newmaxvnodes)
{
	struct nchashhead *new_nchashtbl, *old_nchashtbl;
	u_long new_nchash, old_nchash;
	struct namecache *ncp;
	uint32_t hash;
	u_long newncsize;
	int i;

	newncsize = newmaxvnodes * ncsizefactor;
	newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
	if (newmaxvnodes < numbucketlocks)
		newmaxvnodes = numbucketlocks;

	new_nchashtbl = hashinit(newmaxvnodes, M_VFSCACHE, &new_nchash);
	/* If same hash table size, nothing to do */
	if (nchash == new_nchash) {
		free(new_nchashtbl, M_VFSCACHE);
		return;
	}
	/*
	 * Move everything from the old hash table to the new table.
	 * None of the namecache entries in the table can be removed
	 * because to do so, they have to be removed from the hash table.
	 */
	cache_lock_all_vnodes();
	cache_lock_all_buckets();
	old_nchashtbl = nchashtbl;
	old_nchash = nchash;
	nchashtbl = new_nchashtbl;
	nchash = new_nchash;
	for (i = 0; i <= old_nchash; i++) {
		while ((ncp = CK_LIST_FIRST(&old_nchashtbl[i])) != NULL) {
			hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
			    ncp->nc_dvp);
			CK_LIST_REMOVE(ncp, nc_hash);
			CK_LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
		}
	}
	ncsize = newncsize;
	cache_unlock_all_buckets();
	cache_unlock_all_vnodes();
	free(old_nchashtbl, M_VFSCACHE);
}

/*
 * Invalidate all entries from and to a particular vnode.
 */
void
cache_purge(struct vnode *vp)
{
	TAILQ_HEAD(, namecache) ncps;
	struct namecache *ncp, *nnp;
	struct mtx *vlp, *vlp2;

	CTR1(KTR_VFS, "cache_purge(%p)", vp);
	SDT_PROBE1(vfs, namecache, purge, done, vp);
	if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
	    vp->v_cache_dd == NULL)
		return;
	TAILQ_INIT(&ncps);
	vlp = VP2VNODELOCK(vp);
	vlp2 = NULL;
	mtx_lock(vlp);
retry:
	while (!LIST_EMPTY(&vp->v_cache_src)) {
		ncp = LIST_FIRST(&vp->v_cache_src);
		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
			goto retry;
		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
	}
	while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
		ncp = TAILQ_FIRST(&vp->v_cache_dst);
		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
			goto retry;
		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
	}
	ncp = vp->v_cache_dd;
	if (ncp != NULL) {
		KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
		    ("lost dotdot link"));
		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
			goto retry;
		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
	}
	KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
	mtx_unlock(vlp);
	if (vlp2 != NULL)
		mtx_unlock(vlp2);
	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
		cache_free(ncp);
	}
}
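
/*
 * Illustrative sketch (editorial, not part of the original file): filesystem
 * code is expected to call cache_purge() once a vnode's name bindings go
 * stale, for instance at the end of a remove or rename operation or when the
 * vnode is being reclaimed.  The caller below is hypothetical:
 *
 *	error = myfs_remove_dirent(dvp, vp, cnp);
 *	if (error == 0)
 *		cache_purge(vp);
 */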

/*
 * Invalidate all negative entries for a particular directory vnode.
 */
void
cache_purge_negative(struct vnode *vp)
{
	TAILQ_HEAD(, namecache) ncps;
	struct namecache *ncp, *nnp;
	struct mtx *vlp;

	CTR1(KTR_VFS, "cache_purge_negative(%p)", vp);
	SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
	if (LIST_EMPTY(&vp->v_cache_src))
		return;
	TAILQ_INIT(&ncps);
	vlp = VP2VNODELOCK(vp);
	mtx_lock(vlp);
	LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
		if (!(ncp->nc_flag & NCF_NEGATIVE))
			continue;
		cache_zap_negative_locked_vnode_kl(ncp, vp);
		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
	}
	mtx_unlock(vlp);
	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
		cache_free(ncp);
	}
}

/*
 * Flush all entries referencing a particular filesystem.
 */
void
cache_purgevfs(struct mount *mp, bool force)
{
	TAILQ_HEAD(, namecache) ncps;
	struct mtx *vlp1, *vlp2;
	struct rwlock *blp;
	struct nchashhead *bucket;
	struct namecache *ncp, *nnp;
	u_long i, j, n_nchash;
	int error;

	/* Scan hash tables for applicable entries */
	SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
	if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes)
		return;
	TAILQ_INIT(&ncps);
	n_nchash = nchash + 1;
	vlp1 = vlp2 = NULL;
	for (i = 0; i < numbucketlocks; i++) {
		blp = (struct rwlock *)&bucketlocks[i];
		rw_wlock(blp);
		for (j = i; j < n_nchash; j += numbucketlocks) {
retry:
			bucket = &nchashtbl[j];
			CK_LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) {
				cache_assert_bucket_locked(ncp, RA_WLOCKED);
				if (ncp->nc_dvp->v_mount != mp)
					continue;
				error = cache_zap_wlocked_bucket_kl(ncp, blp,
				    &vlp1, &vlp2);
				if (error != 0)
					goto retry;
				TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst);
			}
		}
		rw_wunlock(blp);
		if (vlp1 == NULL && vlp2 == NULL)
			cache_maybe_yield();
	}
	if (vlp1 != NULL)
		mtx_unlock(vlp1);
	if (vlp2 != NULL)
		mtx_unlock(vlp2);

	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
		cache_free(ncp);
	}
}

/*
 * Perform canonical checks and cache lookup and pass on to the filesystem
 * through vop_cachedlookup only if needed.
 */
int
vfs_cache_lookup(struct vop_lookup_args *ap)
{
	struct vnode *dvp;
	int error;
	struct vnode **vpp = ap->a_vpp;
	struct componentname *cnp = ap->a_cnp;
	int flags = cnp->cn_flags;

	*vpp = NULL;
	dvp = ap->a_dvp;

	if (dvp->v_type != VDIR)
		return (ENOTDIR);

	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
		return (EROFS);

	error = vn_dir_check_exec(dvp, cnp);
	if (error != 0)
		return (error);

	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
	if (error == 0)
		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
	if (error == -1)
		return (0);
	return (error);
}
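
/*
 * Illustrative sketch (editorial): a filesystem opts into the fast path above
 * by pointing vop_lookup at vfs_cache_lookup() and supplying its real
 * directory scan as vop_cachedlookup.  The "myfs" names are hypothetical:
 *
 *	struct vop_vector myfs_vnodeops = {
 *		.vop_default		= &default_vnodeops,
 *		.vop_lookup		= vfs_cache_lookup,
 *		.vop_cachedlookup	= myfs_lookup,
 *	};
 */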

/* Implementation of the getcwd syscall. */
int
sys___getcwd(struct thread *td, struct __getcwd_args *uap)
{
	char *buf, *retbuf;
	size_t buflen;
	int error;

	buflen = uap->buflen;
	if (__predict_false(buflen < 2))
		return (EINVAL);
	if (buflen > MAXPATHLEN)
		buflen = MAXPATHLEN;

	buf = malloc(buflen, M_TEMP, M_WAITOK);
	error = vn_getcwd(td, buf, &retbuf, &buflen);
	if (error == 0)
		error = copyout(retbuf, uap->buf, buflen);
	free(buf, M_TEMP);
	return (error);
}

int
vn_getcwd(struct thread *td, char *buf, char **retbuf, size_t *buflen)
{
	struct pwd *pwd;
	int error;

	pwd = pwd_hold(td);
	error = vn_fullpath_any(td, pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, buflen);
	pwd_drop(pwd);

#ifdef KTRACE
	if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
		ktrnamei(*retbuf);
#endif
	return (error);
}

static int
kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
    size_t size, int flags, enum uio_seg pathseg)
{
	struct nameidata nd;
	char *retbuf, *freebuf;
	int error;

	if (flags != 0)
		return (EINVAL);
	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1,
	    pathseg, path, fd, &cap_fstat_rights, td);
	if ((error = namei(&nd)) != 0)
		return (error);
	error = vn_fullpath_hardlink(td, &nd, &retbuf, &freebuf, &size);
	if (error == 0) {
		error = copyout(retbuf, buf, size);
		free(freebuf, M_TEMP);
	}
	NDFREE(&nd, 0);
	return (error);
}

int
sys___realpathat(struct thread *td, struct __realpathat_args *uap)
{

	return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
	    uap->flags, UIO_USERSPACE));
}

/*
 * Retrieve the full filesystem path that corresponds to a vnode from the name
 * cache (if available)
 */
int
vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
{
	struct pwd *pwd;
	char *buf;
	size_t buflen;
	int error;

	if (__predict_false(vn == NULL))
		return (EINVAL);

	buflen = MAXPATHLEN;
	buf = malloc(buflen, M_TEMP, M_WAITOK);
	pwd = pwd_hold(td);
	error = vn_fullpath_any(td, vn, pwd->pwd_rdir, buf, retbuf, &buflen);
	pwd_drop(pwd);

	if (!error)
		*freebuf = buf;
	else
		free(buf, M_TEMP);
	return (error);
}
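
/*
 * Illustrative sketch (editorial, not part of the original file): the
 * expected calling convention is to consume *retbuf and then free the
 * backing allocation through *freebuf:
 *
 *	char *fullpath, *freepath;
 *	int error;
 *
 *	freepath = NULL;
 *	error = vn_fullpath(curthread, vp, &fullpath, &freepath);
 *	if (error == 0)
 *		printf("vnode path: %s\n", fullpath);
 *	free(freepath, M_TEMP);
 */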

/*
 * This function is similar to vn_fullpath, but it attempts to look up the
 * pathname relative to the global root mount point.  This is required for the
 * auditing sub-system, as audited pathnames must be absolute, relative to the
 * global root mount point.
 */
int
vn_fullpath_global(struct thread *td, struct vnode *vn,
    char **retbuf, char **freebuf)
{
	char *buf;
	size_t buflen;
	int error;

	if (__predict_false(vn == NULL))
		return (EINVAL);
	buflen = MAXPATHLEN;
	buf = malloc(buflen, M_TEMP, M_WAITOK);
	error = vn_fullpath_any(td, vn, rootvnode, buf, retbuf, &buflen);
	if (!error)
		*freebuf = buf;
	else
		free(buf, M_TEMP);
	return (error);
}

int
vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, size_t *buflen)
{
	struct vnode *dvp;
	struct namecache *ncp;
	struct mtx *vlp;
	int error;

	vlp = VP2VNODELOCK(*vp);
	mtx_lock(vlp);
	TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) {
		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
			break;
	}
	if (ncp != NULL) {
		if (*buflen < ncp->nc_nlen) {
			mtx_unlock(vlp);
			vrele(*vp);
			counter_u64_add(numfullpathfail4, 1);
			error = ENOMEM;
			SDT_PROBE3(vfs, namecache, fullpath, return, error,
			    vp, NULL);
			return (error);
		}
		*buflen -= ncp->nc_nlen;
		memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
		SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
		    ncp->nc_name, vp);
		dvp = *vp;
		*vp = ncp->nc_dvp;
		vref(*vp);
		mtx_unlock(vlp);
		vrele(dvp);
		return (0);
	}
	SDT_PROBE1(vfs, namecache, fullpath, miss, vp);

	mtx_unlock(vlp);
	vn_lock(*vp, LK_SHARED | LK_RETRY);
	error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen);
	vput(*vp);
	if (error) {
		counter_u64_add(numfullpathfail2, 1);
		SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
		return (error);
	}

	*vp = dvp;
	if (VN_IS_DOOMED(dvp)) {
		/* forced unmount */
		vrele(dvp);
		error = ENOENT;
		SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
		return (error);
	}
	/*
	 * *vp still has its use count incremented.
	 */

	return (0);
}
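
/*
 * Editorial note (sketch derived from the code above): vn_vptocnp() fills the
 * caller's buffer from the end towards the front and shrinks *buflen
 * accordingly, e.g. when the current component is "etc":
 *
 *	before:  *buflen == 1024, buffer tail: "............"
 *	after:   *buflen == 1021, buffer tail: ".........etc"
 *
 * On success *vp is replaced by a referenced parent vnode, which lets the
 * callers below walk towards the root while prepending "/" separators.
 */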

/*
 * Resolve a directory to a pathname.
 *
 * The name of the directory can always be found in the namecache or fetched
 * from the filesystem.  There is also guaranteed to be only one parent,
 * meaning we can just follow vnodes up until we find the root.
 *
 * The vnode must be referenced.
 */
static int
vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir,
    char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend)
{
#ifdef KDTRACE_HOOKS
	struct vnode *startvp = vp;
#endif
	struct vnode *vp1;
	size_t buflen;
	int error;

	VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
	VNPASS(vp->v_usecount > 0, vp);

	buflen = *len;

	if (!slash_prefixed) {
		MPASS(*len >= 2);
		buflen--;
		buf[buflen] = '\0';
	}

	error = 0;

	SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
	counter_u64_add(numfullpathcalls, 1);
	while (vp != rdir && vp != rootvnode) {
		/*
		 * The vp vnode must be already fully constructed,
		 * since it is either found in namecache or obtained
		 * from VOP_VPTOCNP().  We may test for VV_ROOT safely
		 * without obtaining the vnode lock.
		 */
		if ((vp->v_vflag & VV_ROOT) != 0) {
			vn_lock(vp, LK_RETRY | LK_SHARED);

			/*
			 * With the vnode locked, check for races with
			 * unmount, forced or not.  Note that we
			 * already verified that vp is not equal to
			 * the root vnode, which means that
			 * mnt_vnodecovered can be NULL only for the
			 * case of unmount.
			 */
			if (VN_IS_DOOMED(vp) ||
			    (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
			    vp1->v_mountedhere != vp->v_mount) {
				vput(vp);
				error = ENOENT;
				SDT_PROBE3(vfs, namecache, fullpath, return,
				    error, vp, NULL);
				break;
			}

			vref(vp1);
			vput(vp);
			vp = vp1;
			continue;
		}
		if (vp->v_type != VDIR) {
			vrele(vp);
			counter_u64_add(numfullpathfail1, 1);
			error = ENOTDIR;
			SDT_PROBE3(vfs, namecache, fullpath, return,
			    error, vp, NULL);
			break;
		}
		error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen);
		if (error)
			break;
		if (buflen == 0) {
			vrele(vp);
			error = ENOMEM;
			SDT_PROBE3(vfs, namecache, fullpath, return, error,
			    startvp, NULL);
			break;
		}
		buf[--buflen] = '/';
		slash_prefixed = true;
	}
	if (error)
		return (error);
	if (!slash_prefixed) {
		if (buflen == 0) {
			vrele(vp);
			counter_u64_add(numfullpathfail4, 1);
			SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
			    startvp, NULL);
			return (ENOMEM);
		}
		buf[--buflen] = '/';
	}
	counter_u64_add(numfullpathfound, 1);
	vrele(vp);

	*retbuf = buf + buflen;
	SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
	*len -= buflen;
	*len += addend;
	return (0);
}

/*
 * Resolve an arbitrary vnode to a pathname.
 *
 * Note 2 caveats:
 * - hardlinks are not tracked, thus if the vnode is not a directory this can
 *   resolve to a different path than the one used to find it
 * - namecache is not mandatory, meaning names are not guaranteed to be added
 *   (in which case resolving fails)
 */
static int
vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir,
    char *buf, char **retbuf, size_t *buflen)
{
	size_t orig_buflen;
	bool slash_prefixed;
	int error;

	if (*buflen < 2)
		return (EINVAL);

	orig_buflen = *buflen;

	vref(vp);
	slash_prefixed = false;
	if (vp->v_type != VDIR) {
		*buflen -= 1;
		buf[*buflen] = '\0';
		error = vn_vptocnp(&vp, td->td_ucred, buf, buflen);
		if (error)
			return (error);
		if (*buflen == 0) {
			vrele(vp);
			return (ENOMEM);
		}
		*buflen -= 1;
		buf[*buflen] = '/';
		slash_prefixed = true;
	}

	return (vn_fullpath_dir(td, vp, rdir, buf, retbuf, buflen, slash_prefixed,
	    orig_buflen - *buflen));
}
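
/*
 * Editorial note (sketch derived from the code above): for a non-directory
 * vnode the prologue in vn_fullpath_any() leaves the buffer looking roughly
 * like this (leaf name "passwd" chosen purely for illustration):
 *
 *	buf: [ ......................../passwd\0 ]
 *	                              ^ buf + *buflen
 *
 * after which vn_fullpath_dir() keeps prepending further "/name" components
 * while walking the vnode chain up to rdir or the global root.
 */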
2589 * 2590 * Then we have 2 cases: 2591 * - if the found vnode is a directory, the path can be constructed just by 2592 * fullowing names up the chain 2593 * - otherwise we populate the buffer with the saved name and start resolving 2594 * from the parent 2595 */ 2596 static int 2597 vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf, 2598 char **freebuf, size_t *buflen) 2599 { 2600 char *buf, *tmpbuf; 2601 struct pwd *pwd; 2602 struct componentname *cnp; 2603 struct vnode *vp; 2604 size_t addend; 2605 int error; 2606 bool slash_prefixed; 2607 2608 if (*buflen < 2) 2609 return (EINVAL); 2610 if (*buflen > MAXPATHLEN) 2611 *buflen = MAXPATHLEN; 2612 2613 slash_prefixed = false; 2614 2615 buf = malloc(*buflen, M_TEMP, M_WAITOK); 2616 pwd = pwd_hold(td); 2617 2618 addend = 0; 2619 vp = ndp->ni_vp; 2620 if (vp->v_type != VDIR) { 2621 cnp = &ndp->ni_cnd; 2622 addend = cnp->cn_namelen + 2; 2623 if (*buflen < addend) { 2624 error = ENOMEM; 2625 goto out_bad; 2626 } 2627 *buflen -= addend; 2628 tmpbuf = buf + *buflen; 2629 tmpbuf[0] = '/'; 2630 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen); 2631 tmpbuf[addend - 1] = '\0'; 2632 slash_prefixed = true; 2633 vp = ndp->ni_dvp; 2634 } 2635 2636 vref(vp); 2637 error = vn_fullpath_dir(td, vp, pwd->pwd_rdir, buf, retbuf, buflen, 2638 slash_prefixed, addend); 2639 if (error != 0) 2640 goto out_bad; 2641 2642 pwd_drop(pwd); 2643 *freebuf = buf; 2644 2645 return (0); 2646 out_bad: 2647 pwd_drop(pwd); 2648 free(buf, M_TEMP); 2649 return (error); 2650 } 2651 2652 struct vnode * 2653 vn_dir_dd_ino(struct vnode *vp) 2654 { 2655 struct namecache *ncp; 2656 struct vnode *ddvp; 2657 struct mtx *vlp; 2658 enum vgetstate vs; 2659 2660 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); 2661 vlp = VP2VNODELOCK(vp); 2662 mtx_lock(vlp); 2663 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { 2664 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) 2665 continue; 2666 ddvp = ncp->nc_dvp; 2667 vs = vget_prep(ddvp); 2668 mtx_unlock(vlp); 2669 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs)) 2670 return (NULL); 2671 return (ddvp); 2672 } 2673 mtx_unlock(vlp); 2674 return (NULL); 2675 } 2676 2677 int 2678 vn_commname(struct vnode *vp, char *buf, u_int buflen) 2679 { 2680 struct namecache *ncp; 2681 struct mtx *vlp; 2682 int l; 2683 2684 vlp = VP2VNODELOCK(vp); 2685 mtx_lock(vlp); 2686 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) 2687 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2688 break; 2689 if (ncp == NULL) { 2690 mtx_unlock(vlp); 2691 return (ENOENT); 2692 } 2693 l = min(ncp->nc_nlen, buflen - 1); 2694 memcpy(buf, ncp->nc_name, l); 2695 mtx_unlock(vlp); 2696 buf[l] = '\0'; 2697 return (0); 2698 } 2699 2700 /* 2701 * This function updates path string to vnode's full global path 2702 * and checks the size of the new path string against the pathlen argument. 2703 * 2704 * Requires a locked, referenced vnode. 2705 * Vnode is re-locked on success or ENODEV, otherwise unlocked. 2706 * 2707 * If vp is a directory, the call to vn_fullpath_global() always succeeds 2708 * because it falls back to the ".." lookup if the namecache lookup fails. 2709 */ 2710 int 2711 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, 2712 u_int pathlen) 2713 { 2714 struct nameidata nd; 2715 struct vnode *vp1; 2716 char *rpath, *fbuf; 2717 int error; 2718 2719 ASSERT_VOP_ELOCKED(vp, __func__); 2720 2721 /* Construct global filesystem path from vp. 

/*
 * This function updates the path string to the vnode's full global path
 * and checks the size of the new path string against the pathlen argument.
 *
 * Requires a locked, referenced vnode.
 * The vnode is re-locked on success or ENODEV, otherwise it is unlocked.
 *
 * If vp is a directory, the call to vn_fullpath_global() always succeeds
 * because it falls back to the ".." lookup if the namecache lookup fails.
 */
int
vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
    u_int pathlen)
{
	struct nameidata nd;
	struct vnode *vp1;
	char *rpath, *fbuf;
	int error;

	ASSERT_VOP_ELOCKED(vp, __func__);

	/* Construct global filesystem path from vp. */
	VOP_UNLOCK(vp);
	error = vn_fullpath_global(td, vp, &rpath, &fbuf);

	if (error != 0) {
		vrele(vp);
		return (error);
	}

	if (strlen(rpath) >= pathlen) {
		vrele(vp);
		error = ENAMETOOLONG;
		goto out;
	}

	/*
	 * Re-lookup the vnode by path to detect a possible rename.
	 * As a side effect, the vnode is relocked.
	 * If the vnode was renamed, return ENOENT.
	 */
	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
	    UIO_SYSSPACE, path, td);
	error = namei(&nd);
	if (error != 0) {
		vrele(vp);
		goto out;
	}
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vp1 = nd.ni_vp;
	vrele(vp);
	if (vp1 == vp)
		strcpy(path, rpath);
	else {
		vput(vp1);
		error = ENOENT;
	}

out:
	free(fbuf, M_TEMP);
	return (error);
}

#ifdef DDB
static void
db_print_vpath(struct vnode *vp)
{

	while (vp != NULL) {
		db_printf("%p: ", vp);
		if (vp == rootvnode) {
			db_printf("/");
			vp = NULL;
		} else {
			if (vp->v_vflag & VV_ROOT) {
				db_printf("<mount point>");
				vp = vp->v_mount->mnt_vnodecovered;
			} else {
				struct namecache *ncp;
				char *ncn;
				int i;

				ncp = TAILQ_FIRST(&vp->v_cache_dst);
				if (ncp != NULL) {
					ncn = ncp->nc_name;
					for (i = 0; i < ncp->nc_nlen; i++)
						db_printf("%c", *ncn++);
					vp = ncp->nc_dvp;
				} else {
					vp = NULL;
				}
			}
		}
		db_printf("\n");
	}

	return;
}

DB_SHOW_COMMAND(vpath, db_show_vpath)
{
	struct vnode *vp;

	if (!have_addr) {
		db_printf("usage: show vpath <struct vnode *>\n");
		return;
	}

	vp = (struct vnode *)addr;
	db_print_vpath(vp);
}

#endif
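
/*
 * Editorial note: the DDB command registered above is invoked as
 * "show vpath <vnode address>" and prints one component per line while
 * walking towards the root, roughly (addresses invented for illustration):
 *
 *	db> show vpath 0xfffff80012345678
 *	0xfffff80012345678: passwd
 *	0xfffff80012340000: etc
 *	0xfffff80012330000: /
 */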