/*-
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Poul-Henning Kamp of the FreeBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/counter.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sdt.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vnode.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <vm/uma.h>

SDT_PROVIDER_DECLARE(vfs);
SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
    "char *", "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
    "struct vnode *", "char *");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
    "struct vnode *", "char *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
    "char *");

/*
 * This structure describes the elements in the cache of recent
 * names looked up by namei.
 */

struct namecache {
	LIST_ENTRY(namecache) nc_hash;	/* hash chain */
	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
	struct vnode *nc_dvp;		/* vnode of parent of name */
	struct vnode *nc_vp;		/* vnode the name refers to */
	u_char nc_flag;			/* flag bits */
	u_char nc_nlen;			/* length of name */
	char nc_name[0];		/* segment name + nul */
};

/*
 * struct namecache_ts repeats struct namecache layout up to the
 * nc_nlen member.
 * struct namecache_ts is used in place of struct namecache when time(s) need
 * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
 * both a non-dotdot directory name plus dotdot for the directory's
 * parent.
 */
struct namecache_ts {
	LIST_ENTRY(namecache) nc_hash;	/* hash chain */
	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
	struct vnode *nc_dvp;		/* vnode of parent of name */
	struct vnode *nc_vp;		/* vnode the name refers to */
	u_char nc_flag;			/* flag bits */
	u_char nc_nlen;			/* length of name */
	struct timespec nc_time;	/* timespec provided by fs */
	struct timespec nc_dotdottime;	/* dotdot timespec provided by fs */
	int nc_ticks;			/* ticks value when entry was added */
	char nc_name[0];		/* segment name + nul */
};

/*
 * Flags in namecache.nc_flag
 */
#define	NCF_WHITE	0x01
#define	NCF_ISDOTDOT	0x02
#define	NCF_TS		0x04
#define	NCF_DTS		0x08
#define	NCF_DVDROP	0x10

/*
 * Name caching works as follows:
 *
 * Names found by directory scans are retained in a cache
 * for future reference.  It is managed LRU, so frequently
 * used names will hang around.  Cache is indexed by hash value
 * obtained from (vp, name) where vp refers to the directory
 * containing name.
 *
 * If it is a "negative" entry, (i.e. for a name that is known NOT to
 * exist) the vnode pointer will be NULL.
 *
 * Upon reaching the last segment of a path, if the reference
 * is for DELETE, or NOCACHE is set (rewrite), and the
 * name is located in the cache, it will be dropped.
 *
 * These locks are used (in the order in which they can be taken):
 * NAME		TYPE	ROLE
 * cache_lock	rwlock	global, needed for all modifications
 * bucketlock	rwlock	for access to given hash bucket
 * ncneg_mtx	mtx	negative entry LRU management
 *
 * A name -> vnode lookup can be safely performed by either locking cache_lock
 * or the relevant hash bucket.
 *
 * ".." and vnode -> name lookups require cache_lock.
 *
 * Modifications require both cache_lock and relevant bucketlock taken for
 * writing.
 *
 * Negative entry LRU management requires ncneg_mtx taken on top of either
 * cache_lock or bucketlock.
 */

/*
 * Structures associated with name caching.
 */
#define	NCHHASH(hash) \
	(&nchashtbl[(hash) & nchash])
static LIST_HEAD(nchashhead, namecache) *nchashtbl;	/* Hash Table */
static TAILQ_HEAD(, namecache) ncneg;	/* Negative entry LRU queue */
static u_long	nchash;			/* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
    "Size of namecache hash table");
static u_long	ncnegfactor = 16;	/* ratio of negative entries */
SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of negative namecache entries");
static u_long	numneg;		/* number of negative entries allocated */
SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0,
    "Number of negative entries in namecache");
static u_long	numcache;	/* number of cache entries allocated */
SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0,
    "Number of namecache entries");
static u_long	numcachehv;	/* number of cache entries with vnodes held */
SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0,
    "Number of namecache entries with vnodes held");
u_int	ncsizefactor = 2;
SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
    "Size factor for namecache");

struct nchstats	nchstats;		/* cache effectiveness statistics */

static struct rwlock cache_lock;
RW_SYSINIT(vfscache, &cache_lock, "ncglobal");

#define	CACHE_TRY_WLOCK()	rw_try_wlock(&cache_lock)
#define	CACHE_UPGRADE_LOCK()	rw_try_upgrade(&cache_lock)
#define	CACHE_RLOCK()		rw_rlock(&cache_lock)
#define	CACHE_RUNLOCK()		rw_runlock(&cache_lock)
#define	CACHE_WLOCK()		rw_wlock(&cache_lock)
#define	CACHE_WUNLOCK()		rw_wunlock(&cache_lock)

static struct mtx_padalign ncneg_mtx;
MTX_SYSINIT(vfscache_neg, &ncneg_mtx, "ncneg", MTX_DEF);

static u_int	numbucketlocks;
static struct rwlock_padalign *bucketlocks;
#define	HASH2BUCKETLOCK(hash) \
	((struct rwlock *)(&bucketlocks[((hash) % numbucketlocks)]))

/*
 * UMA zones for the VFS cache.
 *
 * The small cache is used for entries with short names, which are the
 * most common.  The large cache is used for entries which are too big to
 * fit in the small cache.
 */
static uma_zone_t cache_zone_small;
static uma_zone_t cache_zone_small_ts;
static uma_zone_t cache_zone_large;
static uma_zone_t cache_zone_large_ts;

#define	CACHE_PATH_CUTOFF	35

static struct namecache *
cache_alloc(int len, int ts)
{

	if (len > CACHE_PATH_CUTOFF) {
		if (ts)
			return (uma_zalloc(cache_zone_large_ts, M_WAITOK));
		else
			return (uma_zalloc(cache_zone_large, M_WAITOK));
	}
	if (ts)
		return (uma_zalloc(cache_zone_small_ts, M_WAITOK));
	else
		return (uma_zalloc(cache_zone_small, M_WAITOK));
}

static void
cache_free(struct namecache *ncp)
{
	int ts;

	if (ncp == NULL)
		return;
	ts = ncp->nc_flag & NCF_TS;
	if ((ncp->nc_flag & NCF_DVDROP) != 0)
		vdrop(ncp->nc_dvp);
	if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) {
		if (ts)
			uma_zfree(cache_zone_small_ts, ncp);
		else
			uma_zfree(cache_zone_small, ncp);
	} else if (ts)
		uma_zfree(cache_zone_large_ts, ncp);
	else
		uma_zfree(cache_zone_large, ncp);
}

static char *
nc_get_name(struct namecache *ncp)
{
	struct namecache_ts *ncp_ts;

	if ((ncp->nc_flag & NCF_TS) == 0)
		return (ncp->nc_name);
	ncp_ts = (struct namecache_ts *)ncp;
	return (ncp_ts->nc_name);
}

static void
cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
{

	KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
	    (tsp == NULL && ticksp == NULL),
	    ("No NCF_TS"));

	if (tsp != NULL)
		*tsp = ((struct namecache_ts *)ncp)->nc_time;
	if (ticksp != NULL)
		*ticksp = ((struct namecache_ts *)ncp)->nc_ticks;
}

static int	doingcache = 1;		/* 1 => enable the cache */
SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
    "VFS namecache enabled");

/* Export size information to userland */
SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
    sizeof(struct namecache), "sizeof(struct namecache)");

/*
 * The new name cache statistics
 */
static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0,
    "Name cache statistics");
#define STATNODE_ULONG(name, descr)	\
	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr);
#define STATNODE_COUNTER(name, descr)	\
	static counter_u64_t name;	\
	SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, descr);
STATNODE_ULONG(numneg, "Number of negative cache entries");
STATNODE_ULONG(numcache, "Number of cache entries");
STATNODE_COUNTER(numcalls, "Number of cache lookups");
STATNODE_COUNTER(dothits, "Number of '.' hits");
STATNODE_COUNTER(dotdothits, "Number of '..' hits");
STATNODE_COUNTER(numchecks, "Number of checks in lookup");
STATNODE_COUNTER(nummiss, "Number of cache misses");
STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache");
STATNODE_COUNTER(numposzaps,
    "Number of cache hits (positive) we do not want to cache");
STATNODE_COUNTER(numposhits, "Number of cache hits (positive)");
STATNODE_COUNTER(numnegzaps,
    "Number of cache hits (negative) we do not want to cache");
STATNODE_COUNTER(numneghits, "Number of cache hits (negative)");
/* These count for kern___getcwd(), too. */
STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls");
STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
STATNODE_COUNTER(numfullpathfail2,
    "Number of fullpath search errors (VOP_VPTOCNP failures)");
STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls");
static long numupgrades; STATNODE_ULONG(numupgrades,
    "Number of updates of the cache after lookup (write lock + retry)");
static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail,
    "Number of times bucketlocked zap_and_exit case failed to writelock");

static void cache_zap(struct namecache *ncp);
static int vn_vptocnp_locked(struct vnode **vp, struct ucred *cred, char *buf,
    u_int *buflen);
static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
    char *buf, char **retbuf, u_int buflen);

static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");

static uint32_t
cache_get_hash(char *name, u_char len, struct vnode *dvp)
{
	uint32_t hash;

	hash = fnv_32_buf(name, len, FNV1_32_INIT);
	hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
	return (hash);
}

#ifdef INVARIANTS
static void
cache_assert_bucket_locked(struct namecache *ncp, int mode)
{
	struct rwlock *bucketlock;
	uint32_t hash;

	hash = cache_get_hash(nc_get_name(ncp), ncp->nc_nlen, ncp->nc_dvp);
	bucketlock = HASH2BUCKETLOCK(hash);
	rw_assert(bucketlock, mode);
}
#else
#define cache_assert_bucket_locked(x, y) do { } while (0)
#endif

static void
cache_lock_all_buckets(void)
{
	u_int i;

	for (i = 0; i < numbucketlocks; i++)
		rw_wlock(&bucketlocks[i]);
}

static void
cache_unlock_all_buckets(void)
{
	u_int i;

	for (i = 0; i < numbucketlocks; i++)
		rw_wunlock(&bucketlocks[i]);
}

static int
sysctl_nchstats(SYSCTL_HANDLER_ARGS)
{
	struct nchstats snap;

	if (req->oldptr == NULL)
		return (SYSCTL_OUT(req, 0, sizeof(snap)));

	snap = nchstats;
	snap.ncs_goodhits = counter_u64_fetch(numposhits);
	snap.ncs_neghits = counter_u64_fetch(numneghits);
	snap.ncs_badhits = counter_u64_fetch(numposzaps) +
	    counter_u64_fetch(numnegzaps);
	snap.ncs_miss = counter_u64_fetch(nummisszap) +
	    counter_u64_fetch(nummiss);

	return (SYSCTL_OUT(req, &snap, sizeof(snap)));
}
SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
    CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
    "VFS cache effectiveness statistics");

#ifdef DIAGNOSTIC
/*
 * Grab an atomic snapshot of the name cache hash chain lengths
 */
static SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL,
    "hash table stats");

static int
sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
{
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int i, error, n_nchash, *cntbuf;

retry:
	n_nchash = nchash + 1;	/* nchash is max index, not count */
	if (req->oldptr == NULL)
		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
	cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
	CACHE_RLOCK();
	if (n_nchash != nchash + 1) {
		CACHE_RUNLOCK();
		free(cntbuf, M_TEMP);
		goto retry;
	}
	/* Scan hash tables counting entries */
	for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
		LIST_FOREACH(ncp, ncpp, nc_hash)
			cntbuf[i]++;
	CACHE_RUNLOCK();
	for (error = 0, i = 0; i < n_nchash; i++)
		if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
			break;
	free(cntbuf, M_TEMP);
	return (error);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
    "nchash chain lengths");

static int
sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int n_nchash;
	int count, maxlength, used, pct;

	if (!req->oldptr)
		return SYSCTL_OUT(req, 0, 4 * sizeof(int));

	CACHE_RLOCK();
	n_nchash = nchash + 1;	/* nchash is max index, not count */
	used = 0;
	maxlength = 0;

	/* Scan hash tables for applicable entries */
	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
		count = 0;
		LIST_FOREACH(ncp, ncpp, nc_hash) {
			count++;
		}
		if (count)
			used++;
		if (maxlength < count)
			maxlength = count;
	}
	n_nchash = nchash + 1;
	CACHE_RUNLOCK();
	pct = (used * 100) / (n_nchash / 100);
	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &used, sizeof(used));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &pct, sizeof(pct));
	if (error)
		return (error);
	return (0);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
    "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
#endif

/*
 * Negative entries management
 */
static void
cache_negative_hit(struct namecache *ncp)
{

	mtx_lock(&ncneg_mtx);
	TAILQ_REMOVE(&ncneg, ncp, nc_dst);
	TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
	mtx_unlock(&ncneg_mtx);
}

static void
cache_negative_insert(struct namecache *ncp)
{

	rw_assert(&cache_lock, RA_WLOCKED);
	cache_assert_bucket_locked(ncp, RA_WLOCKED);
	MPASS(ncp->nc_vp == NULL);
	mtx_lock(&ncneg_mtx);
	TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
	numneg++;
	mtx_unlock(&ncneg_mtx);
}

static void
cache_negative_remove(struct namecache *ncp)
{

	rw_assert(&cache_lock, RA_WLOCKED);
	cache_assert_bucket_locked(ncp, RA_WLOCKED);
	MPASS(ncp->nc_vp == NULL);
	mtx_lock(&ncneg_mtx);
	TAILQ_REMOVE(&ncneg, ncp, nc_dst);
	numneg--;
	mtx_unlock(&ncneg_mtx);
}

static struct namecache *
cache_negative_zap_one(void)
{
	struct namecache *ncp;

	rw_assert(&cache_lock, RA_WLOCKED);
	ncp = TAILQ_FIRST(&ncneg);
	KASSERT(ncp->nc_vp == NULL, ("ncp %p vp %p on ncneg",
	    ncp, ncp->nc_vp));
	cache_zap(ncp);
	return (ncp);
}

/*
 * cache_zap():
 *
 *   Removes a namecache entry from cache, whether it contains an actual
 *   pointer to a vnode or if it is just a negative cache entry.
 */
static void
cache_zap_locked(struct namecache *ncp)
{

	rw_assert(&cache_lock, RA_WLOCKED);
	cache_assert_bucket_locked(ncp, RA_WLOCKED);
	CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, ncp->nc_vp);
	if (ncp->nc_vp != NULL) {
		SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
		    nc_get_name(ncp), ncp->nc_vp);
	} else {
		SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
		    nc_get_name(ncp));
	}
	LIST_REMOVE(ncp, nc_hash);
	if (ncp->nc_flag & NCF_ISDOTDOT) {
		if (ncp == ncp->nc_dvp->v_cache_dd)
			ncp->nc_dvp->v_cache_dd = NULL;
	} else {
		LIST_REMOVE(ncp, nc_src);
		if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
			ncp->nc_flag |= NCF_DVDROP;
			numcachehv--;
		}
	}
	if (ncp->nc_vp) {
		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
		if (ncp == ncp->nc_vp->v_cache_dd)
			ncp->nc_vp->v_cache_dd = NULL;
	} else {
		cache_negative_remove(ncp);
	}
	numcache--;
}

static void
cache_zap(struct namecache *ncp)
{
	struct rwlock *bucketlock;
	uint32_t hash;

	rw_assert(&cache_lock, RA_WLOCKED);

	hash = cache_get_hash(nc_get_name(ncp), ncp->nc_nlen, ncp->nc_dvp);
	bucketlock = HASH2BUCKETLOCK(hash);
	rw_wlock(bucketlock);
	cache_zap_locked(ncp);
	rw_wunlock(bucketlock);
}

/*
 * Lookup an entry in the cache
 *
 * Lookup is called with dvp pointing to the directory to search,
 * cnp pointing to the name of the entry being sought. If the lookup
 * succeeds, the vnode is returned in *vpp, and a status of -1 is
 * returned. If the lookup determines that the name does not exist
 * (negative caching), a status of ENOENT is returned. If the lookup
 * fails, a status of zero is returned.  If the directory vnode is
 * recycled out from under us due to a forced unmount, a status of
 * ENOENT is returned.
 *
 * vpp is locked and ref'd on return.  If we're looking up DOTDOT, dvp is
 * unlocked.  If we're looking up . an extra ref is taken, but the lock is
 * not recursively acquired.
 */

enum { UNLOCKED, WLOCKED, RLOCKED };

static void
cache_unlock(int cache_locked)
{

	switch (cache_locked) {
	case UNLOCKED:
		break;
	case WLOCKED:
		CACHE_WUNLOCK();
		break;
	case RLOCKED:
		CACHE_RUNLOCK();
		break;
	}
}

int
cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct rwlock *bucketlock;
	struct namecache *ncp;
	uint32_t hash;
	int error, ltype, cache_locked;

	if (!doingcache) {
		cnp->cn_flags &= ~MAKEENTRY;
		return (0);
	}
retry:
	bucketlock = NULL;
	cache_locked = UNLOCKED;
	error = 0;
	counter_u64_add(numcalls, 1);

retry_wlocked:
	if (cnp->cn_nameptr[0] == '.') {
		if (cnp->cn_namelen == 1) {
			*vpp = dvp;
			CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .",
			    dvp, cnp->cn_nameptr);
			counter_u64_add(dothits, 1);
			SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
			if (tsp != NULL)
				timespecclear(tsp);
			if (ticksp != NULL)
				*ticksp = ticks;
			VREF(*vpp);
			/*
			 * When we lookup "." we still can be asked to lock it
			 * differently...
			 */
			ltype = cnp->cn_lkflags & LK_TYPE_MASK;
			if (ltype != VOP_ISLOCKED(*vpp)) {
				if (ltype == LK_EXCLUSIVE) {
					vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
					if ((*vpp)->v_iflag & VI_DOOMED) {
						/* forced unmount */
						vrele(*vpp);
						*vpp = NULL;
						return (ENOENT);
					}
				} else
					vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
			}
			return (-1);
		}
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
			counter_u64_add(dotdothits, 1);
			if (cache_locked == UNLOCKED) {
				CACHE_RLOCK();
				cache_locked = RLOCKED;
			}

			if (dvp->v_cache_dd == NULL) {
				SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
				    "..", NULL);
				goto unlock;
			}
			if ((cnp->cn_flags & MAKEENTRY) == 0) {
				if (cache_locked != WLOCKED &&
				    !CACHE_UPGRADE_LOCK())
					goto wlock;
				ncp = NULL;
				if (dvp->v_cache_dd->nc_flag & NCF_ISDOTDOT) {
					ncp = dvp->v_cache_dd;
					cache_zap(ncp);
				}
				dvp->v_cache_dd = NULL;
				CACHE_WUNLOCK();
				cache_free(ncp);
				return (0);
			}
			ncp = dvp->v_cache_dd;
			if (ncp->nc_flag & NCF_ISDOTDOT)
				*vpp = ncp->nc_vp;
			else
				*vpp = ncp->nc_dvp;
			/* Return failure if negative entry was found. */
			if (*vpp == NULL)
				goto negative_success;
			CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..",
			    dvp, cnp->cn_nameptr, *vpp);
			SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..",
			    *vpp);
			cache_out_ts(ncp, tsp, ticksp);
			if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
			    NCF_DTS && tsp != NULL)
				*tsp = ((struct namecache_ts *)ncp)->
				    nc_dotdottime;
			goto success;
		}
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	if (cache_locked == UNLOCKED) {
		bucketlock = HASH2BUCKETLOCK(hash);
		rw_rlock(bucketlock);
	}

	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		counter_u64_add(numchecks, 1);
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(nc_get_name(ncp), cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	/* We failed to find an entry */
	if (ncp == NULL) {
		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
		    NULL);
		if ((cnp->cn_flags & MAKEENTRY) == 0) {
			counter_u64_add(nummisszap, 1);
		} else {
			counter_u64_add(nummiss, 1);
		}
		goto unlock;
	}

	/* We don't want to have an entry, so dump it */
	if ((cnp->cn_flags & MAKEENTRY) == 0) {
		counter_u64_add(numposzaps, 1);
		goto zap_and_exit;
	}

	/* We found a "positive" match, return the vnode */
	if (ncp->nc_vp) {
		counter_u64_add(numposhits, 1);
		*vpp = ncp->nc_vp;
		CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
		    dvp, cnp->cn_nameptr, *vpp, ncp);
		SDT_PROBE3(vfs, namecache, lookup, hit, dvp, nc_get_name(ncp),
		    *vpp);
		cache_out_ts(ncp, tsp, ticksp);
		goto success;
	}

negative_success:
	/* We found a negative match, and want to create it, so purge */
	if (cnp->cn_nameiop == CREATE) {
		counter_u64_add(numnegzaps, 1);
		goto zap_and_exit;
	}

	counter_u64_add(numneghits, 1);
	cache_negative_hit(ncp);
	if (ncp->nc_flag & NCF_WHITE)
		cnp->cn_flags |= ISWHITEOUT;
	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
	    nc_get_name(ncp));
	cache_out_ts(ncp, tsp, ticksp);
	MPASS(bucketlock != NULL || cache_locked != UNLOCKED);
	if (bucketlock != NULL)
		rw_runlock(bucketlock);
	cache_unlock(cache_locked);
	return (ENOENT);

wlock:
	/*
	 * We need to update the cache after our lookup, so upgrade to
	 * a write lock and retry the operation.
	 */
	CACHE_RUNLOCK();
wlock_unlocked:
	CACHE_WLOCK();
	numupgrades++;
	cache_locked = WLOCKED;
	goto retry_wlocked;

success:
	/*
	 * On success we return a locked and ref'd vnode as per the lookup
	 * protocol.
	 */
	MPASS(dvp != *vpp);
	ltype = 0;	/* silence gcc warning */
	if (cnp->cn_flags & ISDOTDOT) {
		ltype = VOP_ISLOCKED(dvp);
		VOP_UNLOCK(dvp, 0);
	}
	vhold(*vpp);
	MPASS(bucketlock != NULL || cache_locked != UNLOCKED);
	if (bucketlock != NULL)
		rw_runlock(bucketlock);
	cache_unlock(cache_locked);
	error = vget(*vpp, cnp->cn_lkflags | LK_VNHELD, cnp->cn_thread);
	if (cnp->cn_flags & ISDOTDOT) {
		vn_lock(dvp, ltype | LK_RETRY);
		if (dvp->v_iflag & VI_DOOMED) {
			if (error == 0)
				vput(*vpp);
			*vpp = NULL;
			return (ENOENT);
		}
	}
	if (error) {
		*vpp = NULL;
		goto retry;
	}
	if ((cnp->cn_flags & ISLASTCN) &&
	    (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) {
		ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
	}
	return (-1);

unlock:
	MPASS(bucketlock != NULL || cache_locked != UNLOCKED);
	if (bucketlock != NULL)
		rw_runlock(bucketlock);
	cache_unlock(cache_locked);
	return (0);

zap_and_exit:
	if (bucketlock != NULL) {
		rw_assert(&cache_lock, RA_UNLOCKED);
		if (!CACHE_TRY_WLOCK()) {
			rw_runlock(bucketlock);
			bucketlock = NULL;
			zap_and_exit_bucket_fail++;
			goto wlock_unlocked;
		}
		cache_locked = WLOCKED;
		rw_runlock(bucketlock);
		bucketlock = NULL;
	} else if (cache_locked != WLOCKED && !CACHE_UPGRADE_LOCK())
		goto wlock;
	cache_zap(ncp);
	CACHE_WUNLOCK();
	cache_free(ncp);
	return (0);
}

/*
 * Add an entry to the cache.
 */
void
cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
    struct timespec *tsp, struct timespec *dtsp)
{
	struct rwlock *bucketlock;
	struct namecache *ncp, *n2, *ndd, *nneg;
	struct namecache_ts *n3;
	struct nchashhead *ncpp;
	uint32_t hash;
	int flag;
	int len;

	CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr);
	VNASSERT(vp == NULL || (vp->v_iflag & VI_DOOMED) == 0, vp,
	    ("cache_enter: Adding a doomed vnode"));
	VNASSERT(dvp == NULL || (dvp->v_iflag & VI_DOOMED) == 0, dvp,
	    ("cache_enter: Doomed vnode used as src"));

	if (!doingcache)
		return;

	/*
	 * Avoid blowout in namecache entries.
	 */
	if (numcache >= desiredvnodes * ncsizefactor)
		return;

	ndd = nneg = NULL;
	flag = 0;
	if (cnp->cn_nameptr[0] == '.') {
		if (cnp->cn_namelen == 1)
			return;
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
			CACHE_WLOCK();
			/*
			 * If dotdot entry already exists, just retarget it
			 * to new parent vnode, otherwise continue with new
			 * namecache entry allocation.
			 */
			if ((ncp = dvp->v_cache_dd) != NULL &&
			    ncp->nc_flag & NCF_ISDOTDOT) {
				KASSERT(ncp->nc_dvp == dvp,
				    ("wrong isdotdot parent"));
				if (ncp->nc_vp != NULL) {
					TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst,
					    ncp, nc_dst);
				} else {
					cache_negative_remove(ncp);
				}
				if (vp != NULL) {
					TAILQ_INSERT_HEAD(&vp->v_cache_dst,
					    ncp, nc_dst);
				} else {
					cache_negative_insert(ncp);
				}
				ncp->nc_vp = vp;
				CACHE_WUNLOCK();
				return;
			}
			dvp->v_cache_dd = NULL;
			SDT_PROBE3(vfs, namecache, enter, done, dvp, "..", vp);
			CACHE_WUNLOCK();
			flag = NCF_ISDOTDOT;
		}
	}

	/*
	 * Calculate the hash key and setup as much of the new
	 * namecache entry as possible before acquiring the lock.
	 */
	ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
	ncp->nc_vp = vp;
	ncp->nc_dvp = dvp;
	ncp->nc_flag = flag;
	if (tsp != NULL) {
		n3 = (struct namecache_ts *)ncp;
		n3->nc_time = *tsp;
		n3->nc_ticks = ticks;
		n3->nc_flag |= NCF_TS;
		if (dtsp != NULL) {
			n3->nc_dotdottime = *dtsp;
			n3->nc_flag |= NCF_DTS;
		}
	}
	len = ncp->nc_nlen = cnp->cn_namelen;
	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
	strlcpy(nc_get_name(ncp), cnp->cn_nameptr, len + 1);
	CACHE_WLOCK();

	/*
	 * See if this vnode or negative entry is already in the cache
	 * with this name.  This can happen with concurrent lookups of
	 * the same path name.
	 */
	ncpp = NCHHASH(hash);
	LIST_FOREACH(n2, ncpp, nc_hash) {
		if (n2->nc_dvp == dvp &&
		    n2->nc_nlen == cnp->cn_namelen &&
		    !bcmp(nc_get_name(n2), cnp->cn_nameptr, n2->nc_nlen)) {
			if (tsp != NULL) {
				KASSERT((n2->nc_flag & NCF_TS) != 0,
				    ("no NCF_TS"));
				n3 = (struct namecache_ts *)n2;
				n3->nc_time =
				    ((struct namecache_ts *)ncp)->nc_time;
				n3->nc_ticks =
				    ((struct namecache_ts *)ncp)->nc_ticks;
				if (dtsp != NULL) {
					n3->nc_dotdottime =
					    ((struct namecache_ts *)ncp)->
					    nc_dotdottime;
					n3->nc_flag |= NCF_DTS;
				}
			}
			CACHE_WUNLOCK();
			cache_free(ncp);
			return;
		}
	}

	if (flag == NCF_ISDOTDOT) {
		/*
		 * See if we are trying to add .. entry, but some other lookup
		 * has populated v_cache_dd pointer already.
		 */
		if (dvp->v_cache_dd != NULL) {
			CACHE_WUNLOCK();
			cache_free(ncp);
			return;
		}
		KASSERT(vp == NULL || vp->v_type == VDIR,
		    ("wrong vnode type %p", vp));
		dvp->v_cache_dd = ncp;
	}

	numcache++;
	if (vp != NULL) {
		if (vp->v_type == VDIR) {
			if (flag != NCF_ISDOTDOT) {
				/*
				 * For this case, the cache entry maps both the
				 * directory name in it and the name ".." for the
				 * directory's parent.
				 */
				if ((ndd = vp->v_cache_dd) != NULL) {
					if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
						cache_zap(ndd);
					else
						ndd = NULL;
				}
				vp->v_cache_dd = ncp;
			}
		} else {
			vp->v_cache_dd = NULL;
		}
	}

	if (flag != NCF_ISDOTDOT) {
		if (LIST_EMPTY(&dvp->v_cache_src)) {
			vhold(dvp);
			numcachehv++;
		}
		LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
	}

	bucketlock = HASH2BUCKETLOCK(hash);
	rw_wlock(bucketlock);

	/*
	 * Insert the new namecache entry into the appropriate chain
	 * within the cache entries table.
	 */
	LIST_INSERT_HEAD(ncpp, ncp, nc_hash);

	/*
	 * If the entry is "negative", we place it into the
	 * "negative" cache queue, otherwise, we place it into the
	 * destination vnode's cache entries queue.
	 */
	if (vp != NULL) {
		TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
		SDT_PROBE3(vfs, namecache, enter, done, dvp, nc_get_name(ncp),
		    vp);
	} else {
		if (cnp->cn_flags & ISWHITEOUT)
			ncp->nc_flag |= NCF_WHITE;
		cache_negative_insert(ncp);
		SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
		    nc_get_name(ncp));
	}
	rw_wunlock(bucketlock);
	if (numneg * ncnegfactor > numcache)
		nneg = cache_negative_zap_one();
	CACHE_WUNLOCK();
	cache_free(ndd);
	cache_free(nneg);
}

static u_int
cache_roundup_2(u_int val)
{
	u_int res;

	for (res = 1; res <= val; res <<= 1)
		continue;

	return (res);
}

/*
 * Name cache initialization, from vfs_init() when we are booting
 */
static void
nchinit(void *dummy __unused)
{
	u_int i;

	TAILQ_INIT(&ncneg);

	cache_zone_small = uma_zcreate("S VFS Cache",
	    sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1,
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
	cache_zone_small_ts = uma_zcreate("STS VFS Cache",
	    sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1,
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
	cache_zone_large = uma_zcreate("L VFS Cache",
	    sizeof(struct namecache) + NAME_MAX + 1,
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
	cache_zone_large_ts = uma_zcreate("LTS VFS Cache",
	    sizeof(struct namecache_ts) + NAME_MAX + 1,
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);

	nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
	numbucketlocks = cache_roundup_2(mp_ncpus * 16);
	if (numbucketlocks > nchash)
		numbucketlocks = nchash;
	bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
	    M_WAITOK | M_ZERO);
	for (i = 0; i < numbucketlocks; i++)
		rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK);

	numcalls = counter_u64_alloc(M_WAITOK);
	dothits = counter_u64_alloc(M_WAITOK);
	dotdothits = counter_u64_alloc(M_WAITOK);
	numchecks = counter_u64_alloc(M_WAITOK);
	nummiss = counter_u64_alloc(M_WAITOK);
	nummisszap = counter_u64_alloc(M_WAITOK);
	numposzaps = counter_u64_alloc(M_WAITOK);
	numposhits = counter_u64_alloc(M_WAITOK);
	numnegzaps = counter_u64_alloc(M_WAITOK);
	numneghits = counter_u64_alloc(M_WAITOK);
	numfullpathcalls = counter_u64_alloc(M_WAITOK);
	numfullpathfail1 = counter_u64_alloc(M_WAITOK);
	numfullpathfail2 = counter_u64_alloc(M_WAITOK);
	numfullpathfail4 = counter_u64_alloc(M_WAITOK);
	numfullpathfound = counter_u64_alloc(M_WAITOK);
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);

void
cache_changesize(int newmaxvnodes)
{
	struct nchashhead *new_nchashtbl, *old_nchashtbl;
	u_long new_nchash, old_nchash;
	struct namecache *ncp;
	uint32_t hash;
	int i;

	new_nchashtbl = hashinit(newmaxvnodes * 2, M_VFSCACHE, &new_nchash);
	/* If same hash table size, nothing to do */
	if (nchash == new_nchash) {
		free(new_nchashtbl, M_VFSCACHE);
		return;
	}
	/*
	 * Move everything from the old hash table to the new table.
	 * None of the namecache entries in the table can be removed
	 * because to do so, they have to be removed from the hash table.
	 */
	CACHE_WLOCK();
	cache_lock_all_buckets();
	old_nchashtbl = nchashtbl;
	old_nchash = nchash;
	nchashtbl = new_nchashtbl;
	nchash = new_nchash;
	for (i = 0; i <= old_nchash; i++) {
		while ((ncp = LIST_FIRST(&old_nchashtbl[i])) != NULL) {
			hash = cache_get_hash(nc_get_name(ncp), ncp->nc_nlen,
			    ncp->nc_dvp);
			LIST_REMOVE(ncp, nc_hash);
			LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
		}
	}
	cache_unlock_all_buckets();
	CACHE_WUNLOCK();
	free(old_nchashtbl, M_VFSCACHE);
}

/*
 * Invalidate all entries to a particular vnode.
 */
void
cache_purge(struct vnode *vp)
{
	TAILQ_HEAD(, namecache) ncps;
	struct namecache *ncp, *nnp;

	CTR1(KTR_VFS, "cache_purge(%p)", vp);
	SDT_PROBE1(vfs, namecache, purge, done, vp);
	TAILQ_INIT(&ncps);
	CACHE_WLOCK();
	while (!LIST_EMPTY(&vp->v_cache_src)) {
		ncp = LIST_FIRST(&vp->v_cache_src);
		cache_zap(ncp);
		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
	}
	while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
		ncp = TAILQ_FIRST(&vp->v_cache_dst);
		cache_zap(ncp);
		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
	}
	if (vp->v_cache_dd != NULL) {
		ncp = vp->v_cache_dd;
		KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
		    ("lost dotdot link"));
		cache_zap(ncp);
		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
	}
	KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
	CACHE_WUNLOCK();
	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
		cache_free(ncp);
	}
}

/*
 * Invalidate all negative entries for a particular directory vnode.
 */
void
cache_purge_negative(struct vnode *vp)
{
	TAILQ_HEAD(, namecache) ncps;
	struct namecache *ncp, *nnp;

	CTR1(KTR_VFS, "cache_purge_negative(%p)", vp);
	SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
	TAILQ_INIT(&ncps);
	CACHE_WLOCK();
	LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
		if (ncp->nc_vp != NULL)
			continue;
		cache_zap(ncp);
		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
	}
	CACHE_WUNLOCK();
	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
		cache_free(ncp);
	}
}

/*
 * Flush all entries referencing a particular filesystem.
 */
void
cache_purgevfs(struct mount *mp)
{
	TAILQ_HEAD(, namecache) ncps;
	struct rwlock *bucketlock;
	struct nchashhead *bucket;
	struct namecache *ncp, *nnp;
	u_long i, j, n_nchash;

	/* Scan hash tables for applicable entries */
	SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
	TAILQ_INIT(&ncps);
	CACHE_WLOCK();
	n_nchash = nchash + 1;
	for (i = 0; i < numbucketlocks; i++) {
		bucketlock = (struct rwlock *)&bucketlocks[i];
		rw_wlock(bucketlock);
		for (j = i; j < n_nchash; j += numbucketlocks) {
			bucket = &nchashtbl[j];
			LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) {
				cache_assert_bucket_locked(ncp, RA_WLOCKED);
				if (ncp->nc_dvp->v_mount != mp)
					continue;
				cache_zap_locked(ncp);
				TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst);
			}
		}
		rw_wunlock(bucketlock);
	}
	CACHE_WUNLOCK();
	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
		cache_free(ncp);
	}
}

/*
 * Perform canonical checks and cache lookup and pass on to filesystem
 * through the vop_cachedlookup only if needed.
 */

int
vfs_cache_lookup(struct vop_lookup_args *ap)
{
	struct vnode *dvp;
	int error;
	struct vnode **vpp = ap->a_vpp;
	struct componentname *cnp = ap->a_cnp;
	struct ucred *cred = cnp->cn_cred;
	int flags = cnp->cn_flags;
	struct thread *td = cnp->cn_thread;

	*vpp = NULL;
	dvp = ap->a_dvp;

	if (dvp->v_type != VDIR)
		return (ENOTDIR);

	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
		return (EROFS);

	error = VOP_ACCESS(dvp, VEXEC, cred, td);
	if (error)
		return (error);

	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
	if (error == 0)
		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
	if (error == -1)
		return (0);
	return (error);
}

/*
 * XXX All of these sysctls would probably be more productive dead.
 */
static int disablecwd;
SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
    "Disable the getcwd syscall");

/* Implementation of the getcwd syscall. */
int
sys___getcwd(struct thread *td, struct __getcwd_args *uap)
{

	return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen,
	    MAXPATHLEN));
}

int
kern___getcwd(struct thread *td, char *buf, enum uio_seg bufseg, u_int buflen,
    u_int path_max)
{
	char *bp, *tmpbuf;
	struct filedesc *fdp;
	struct vnode *cdir, *rdir;
	int error;

	if (disablecwd)
		return (ENODEV);
	if (buflen < 2)
		return (EINVAL);
	if (buflen > path_max)
		buflen = path_max;

	tmpbuf = malloc(buflen, M_TEMP, M_WAITOK);
	fdp = td->td_proc->p_fd;
	FILEDESC_SLOCK(fdp);
	cdir = fdp->fd_cdir;
	VREF(cdir);
	rdir = fdp->fd_rdir;
	VREF(rdir);
	FILEDESC_SUNLOCK(fdp);
	error = vn_fullpath1(td, cdir, rdir, tmpbuf, &bp, buflen);
	vrele(rdir);
	vrele(cdir);

	if (!error) {
		if (bufseg == UIO_SYSSPACE)
			bcopy(bp, buf, strlen(bp) + 1);
		else
			error = copyout(bp, buf, strlen(bp) + 1);
#ifdef KTRACE
		if (KTRPOINT(curthread, KTR_NAMEI))
			ktrnamei(bp);
#endif
	}
	free(tmpbuf, M_TEMP);
	return (error);
}

/*
 * Thus begins the fullpath magic.
 */

static int disablefullpath;
SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0,
    "Disable the vn_fullpath function");

/*
 * Retrieve the full filesystem path that corresponds to a vnode from the name
 * cache (if available)
 */
int
vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
{
	char *buf;
	struct filedesc *fdp;
	struct vnode *rdir;
	int error;

	if (disablefullpath)
		return (ENODEV);
	if (vn == NULL)
		return (EINVAL);

	buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
	fdp = td->td_proc->p_fd;
	FILEDESC_SLOCK(fdp);
	rdir = fdp->fd_rdir;
	VREF(rdir);
	FILEDESC_SUNLOCK(fdp);
	error = vn_fullpath1(td, vn, rdir, buf, retbuf, MAXPATHLEN);
	vrele(rdir);

	if (!error)
		*freebuf = buf;
	else
		free(buf, M_TEMP);
	return (error);
}

/*
 * This function is similar to vn_fullpath, but it attempts to lookup the
 * pathname relative to the global root mount point.  This is required for the
 * auditing sub-system, as audited pathnames must be absolute, relative to the
 * global root mount point.
 */
int
vn_fullpath_global(struct thread *td, struct vnode *vn,
    char **retbuf, char **freebuf)
{
	char *buf;
	int error;

	if (disablefullpath)
		return (ENODEV);
	if (vn == NULL)
		return (EINVAL);
	buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
	error = vn_fullpath1(td, vn, rootvnode, buf, retbuf, MAXPATHLEN);
	if (!error)
		*freebuf = buf;
	else
		free(buf, M_TEMP);
	return (error);
}

int
vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, u_int *buflen)
{
	int error;

	CACHE_RLOCK();
	error = vn_vptocnp_locked(vp, cred, buf, buflen);
	if (error == 0)
		CACHE_RUNLOCK();
	return (error);
}

static int
vn_vptocnp_locked(struct vnode **vp, struct ucred *cred, char *buf,
    u_int *buflen)
{
	struct vnode *dvp;
	struct namecache *ncp;
	int error;

	TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) {
		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
			break;
	}
	if (ncp != NULL) {
		if (*buflen < ncp->nc_nlen) {
			CACHE_RUNLOCK();
			vrele(*vp);
			counter_u64_add(numfullpathfail4, 1);
			error = ENOMEM;
			SDT_PROBE3(vfs, namecache, fullpath, return, error,
			    vp, NULL);
			return (error);
		}
		*buflen -= ncp->nc_nlen;
		memcpy(buf + *buflen, nc_get_name(ncp), ncp->nc_nlen);
		SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
		    nc_get_name(ncp), vp);
		dvp = *vp;
		*vp = ncp->nc_dvp;
		vref(*vp);
		CACHE_RUNLOCK();
		vrele(dvp);
		CACHE_RLOCK();
		return (0);
	}
	SDT_PROBE1(vfs, namecache, fullpath, miss, vp);

	CACHE_RUNLOCK();
	vn_lock(*vp, LK_SHARED | LK_RETRY);
	error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen);
	vput(*vp);
	if (error) {
		counter_u64_add(numfullpathfail2, 1);
		SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
		return (error);
	}

	*vp = dvp;
	CACHE_RLOCK();
	if (dvp->v_iflag & VI_DOOMED) {
		/* forced unmount */
		CACHE_RUNLOCK();
		vrele(dvp);
		error = ENOENT;
		SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
		return (error);
	}
	/*
	 * *vp has its use count incremented still.
	 */

	return (0);
}

/*
 * The magic behind kern___getcwd() and vn_fullpath().
 */
static int
vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
    char *buf, char **retbuf, u_int buflen)
{
	int error, slash_prefixed;
#ifdef KDTRACE_HOOKS
	struct vnode *startvp = vp;
#endif
	struct vnode *vp1;

	buflen--;
	buf[buflen] = '\0';
	error = 0;
	slash_prefixed = 0;

	SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
	counter_u64_add(numfullpathcalls, 1);
	vref(vp);
	CACHE_RLOCK();
	if (vp->v_type != VDIR) {
		error = vn_vptocnp_locked(&vp, td->td_ucred, buf, &buflen);
		if (error)
			return (error);
		if (buflen == 0) {
			CACHE_RUNLOCK();
			vrele(vp);
			return (ENOMEM);
		}
		buf[--buflen] = '/';
		slash_prefixed = 1;
	}
	while (vp != rdir && vp != rootvnode) {
		if (vp->v_vflag & VV_ROOT) {
			if (vp->v_iflag & VI_DOOMED) {	/* forced unmount */
				CACHE_RUNLOCK();
				vrele(vp);
				error = ENOENT;
				SDT_PROBE3(vfs, namecache, fullpath, return,
				    error, vp, NULL);
				break;
			}
			vp1 = vp->v_mount->mnt_vnodecovered;
			vref(vp1);
			CACHE_RUNLOCK();
			vrele(vp);
			vp = vp1;
			CACHE_RLOCK();
			continue;
		}
		if (vp->v_type != VDIR) {
			CACHE_RUNLOCK();
			vrele(vp);
			counter_u64_add(numfullpathfail1, 1);
			error = ENOTDIR;
			SDT_PROBE3(vfs, namecache, fullpath, return,
			    error, vp, NULL);
			break;
		}
		error = vn_vptocnp_locked(&vp, td->td_ucred, buf, &buflen);
		if (error)
			break;
		if (buflen == 0) {
			CACHE_RUNLOCK();
			vrele(vp);
			error = ENOMEM;
			SDT_PROBE3(vfs, namecache, fullpath, return, error,
			    startvp, NULL);
			break;
		}
		buf[--buflen] = '/';
		slash_prefixed = 1;
	}
	if (error)
		return (error);
	if (!slash_prefixed) {
		if (buflen == 0) {
			CACHE_RUNLOCK();
			vrele(vp);
			counter_u64_add(numfullpathfail4, 1);
			SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
			    startvp, NULL);
			return (ENOMEM);
		}
		buf[--buflen] = '/';
	}
	counter_u64_add(numfullpathfound, 1);
	CACHE_RUNLOCK();
	vrele(vp);

	SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, buf + buflen);
	*retbuf = buf + buflen;
	return (0);
}

struct vnode *
vn_dir_dd_ino(struct vnode *vp)
{
	struct namecache *ncp;
	struct vnode *ddvp;

	ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
	CACHE_RLOCK();
	TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
			continue;
		ddvp = ncp->nc_dvp;
		vhold(ddvp);
		CACHE_RUNLOCK();
		if (vget(ddvp, LK_SHARED | LK_NOWAIT | LK_VNHELD, curthread))
			return (NULL);
		return (ddvp);
	}
	CACHE_RUNLOCK();
	return (NULL);
}

int
vn_commname(struct vnode *vp, char *buf, u_int buflen)
{
	struct namecache *ncp;
	int l;

	CACHE_RLOCK();
	TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
			break;
	if (ncp == NULL) {
		CACHE_RUNLOCK();
		return (ENOENT);
	}
	l = min(ncp->nc_nlen, buflen - 1);
	memcpy(buf, nc_get_name(ncp), l);
	CACHE_RUNLOCK();
	buf[l] = '\0';
	return (0);
}

/* ABI compat shims for old kernel modules. */
#undef cache_enter

void	cache_enter(struct vnode *dvp, struct vnode *vp,
	    struct componentname *cnp);

void
cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
{

	cache_enter_time(dvp, vp, cnp, NULL, NULL);
}

/*
 * This function updates path string to vnode's full global path
 * and checks the size of the new path string against the pathlen argument.
 *
 * Requires a locked, referenced vnode.
 * Vnode is re-locked on success or ENODEV, otherwise unlocked.
 *
 * If sysctl debug.disablefullpath is set, ENODEV is returned,
 * vnode is left locked and path remains untouched.
 *
 * If vp is a directory, the call to vn_fullpath_global() always succeeds
 * because it falls back to the ".." lookup if the namecache lookup fails.
 */
int
vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
    u_int pathlen)
{
	struct nameidata nd;
	struct vnode *vp1;
	char *rpath, *fbuf;
	int error;

	ASSERT_VOP_ELOCKED(vp, __func__);

	/* Return ENODEV if sysctl debug.disablefullpath==1 */
	if (disablefullpath)
		return (ENODEV);

	/* Construct global filesystem path from vp. */
	VOP_UNLOCK(vp, 0);
	error = vn_fullpath_global(td, vp, &rpath, &fbuf);

	if (error != 0) {
		vrele(vp);
		return (error);
	}

	if (strlen(rpath) >= pathlen) {
		vrele(vp);
		error = ENAMETOOLONG;
		goto out;
	}

	/*
	 * Re-lookup the vnode by path to detect a possible rename.
	 * As a side effect, the vnode is relocked.
	 * If vnode was renamed, return ENOENT.
	 */
	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
	    UIO_SYSSPACE, path, td);
	error = namei(&nd);
	if (error != 0) {
		vrele(vp);
		goto out;
	}
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vp1 = nd.ni_vp;
	vrele(vp);
	if (vp1 == vp)
		strcpy(path, rpath);
	else {
		vput(vp1);
		error = ENOENT;
	}

out:
	free(fbuf, M_TEMP);
	return (error);
}
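
/*
 * Illustrative sketch (editor's note, not part of the original file): a
 * filesystem typically consumes this cache by pointing its vop_lookup at
 * vfs_cache_lookup() and supplying the uncached path via vop_cachedlookup,
 * roughly:
 *
 *	struct vop_vector xxfs_vnodeops = {
 *		.vop_lookup		= vfs_cache_lookup,
 *		.vop_cachedlookup	= xxfs_lookup,
 *		...
 *	};
 *
 * The filesystem's own lookup routine then calls cache_enter(dvp, vp, cnp)
 * (or cache_enter_time() when it has timestamps) for positive and negative
 * results when MAKEENTRY is set, and cache_purge(vp) on remove/rename.
 * The "xxfs" names above are placeholders, not an existing filesystem.
 */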