1 /*- 2 * Copyright (c) 1989, 1993, 1995 3 * The Regents of the University of California. All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * Poul-Henning Kamp of the FreeBSD Project. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 4. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 
31 * 32 * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95 33 */ 34 35 #include <sys/cdefs.h> 36 __FBSDID("$FreeBSD$"); 37 38 #include "opt_ktrace.h" 39 40 #include <sys/param.h> 41 #include <sys/systm.h> 42 #include <sys/counter.h> 43 #include <sys/filedesc.h> 44 #include <sys/fnv_hash.h> 45 #include <sys/kernel.h> 46 #include <sys/lock.h> 47 #include <sys/malloc.h> 48 #include <sys/fcntl.h> 49 #include <sys/mount.h> 50 #include <sys/namei.h> 51 #include <sys/proc.h> 52 #include <sys/rwlock.h> 53 #include <sys/sdt.h> 54 #include <sys/syscallsubr.h> 55 #include <sys/sysctl.h> 56 #include <sys/sysproto.h> 57 #include <sys/vnode.h> 58 #ifdef KTRACE 59 #include <sys/ktrace.h> 60 #endif 61 62 #include <vm/uma.h> 63 64 SDT_PROVIDER_DECLARE(vfs); 65 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *", 66 "struct vnode *"); 67 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *", 68 "char *"); 69 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *"); 70 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *", 71 "char *", "struct vnode *"); 72 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *"); 73 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int", 74 "struct vnode *", "char *"); 75 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *", 76 "struct vnode *"); 77 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative, 78 "struct vnode *", "char *"); 79 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *", 80 "char *"); 81 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *"); 82 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *"); 83 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *"); 84 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *", 85 "struct vnode *"); 86 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *", 87 "char *"); 88 89 /* 90 * This structure describes the 
elements in the cache of recent 91 * names looked up by namei. 92 */ 93 94 struct namecache { 95 LIST_ENTRY(namecache) nc_hash; /* hash chain */ 96 LIST_ENTRY(namecache) nc_src; /* source vnode list */ 97 TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */ 98 struct vnode *nc_dvp; /* vnode of parent of name */ 99 struct vnode *nc_vp; /* vnode the name refers to */ 100 u_char nc_flag; /* flag bits */ 101 u_char nc_nlen; /* length of name */ 102 char nc_name[0]; /* segment name + nul */ 103 }; 104 105 /* 106 * struct namecache_ts repeats struct namecache layout up to the 107 * nc_nlen member. 108 * struct namecache_ts is used in place of struct namecache when time(s) need 109 * to be stored. The nc_dotdottime field is used when a cache entry is mapping 110 * both a non-dotdot directory name plus dotdot for the directory's 111 * parent. 112 */ 113 struct namecache_ts { 114 LIST_ENTRY(namecache) nc_hash; /* hash chain */ 115 LIST_ENTRY(namecache) nc_src; /* source vnode list */ 116 TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */ 117 struct vnode *nc_dvp; /* vnode of parent of name */ 118 struct vnode *nc_vp; /* vnode the name refers to */ 119 u_char nc_flag; /* flag bits */ 120 u_char nc_nlen; /* length of name */ 121 struct timespec nc_time; /* timespec provided by fs */ 122 struct timespec nc_dotdottime; /* dotdot timespec provided by fs */ 123 int nc_ticks; /* ticks value when entry was added */ 124 char nc_name[0]; /* segment name + nul */ 125 }; 126 127 /* 128 * Flags in namecache.nc_flag 129 */ 130 #define NCF_WHITE 0x01 131 #define NCF_ISDOTDOT 0x02 132 #define NCF_TS 0x04 133 #define NCF_DTS 0x08 134 #define NCF_DVDROP 0x10 135 136 /* 137 * Name caching works as follows: 138 * 139 * Names found by directory scans are retained in a cache 140 * for future reference. It is managed LRU, so frequently 141 * used names will hang around. 
Cache is indexed by hash value
 * obtained from (vp, name) where vp refers to the directory
 * containing name.
 *
 * If it is a "negative" entry, (i.e. for a name that is known NOT to
 * exist) the vnode pointer will be NULL.
 *
 * Upon reaching the last segment of a path, if the reference
 * is for DELETE, or NOCACHE is set (rewrite), and the
 * name is located in the cache, it will be dropped.
 */

/*
 * Structures associated with name caching.
 */
#define NCHHASH(hash) \
	(&nchashtbl[(hash) & nchash])
static LIST_HEAD(nchashhead, namecache) *nchashtbl;	/* Hash Table */
static TAILQ_HEAD(, namecache) ncneg;	/* LRU queue of negative entries */
static u_long	nchash;			/* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
    "Size of namecache hash table");
static u_long	ncnegfactor = 16;	/* ratio of negative entries */
SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of negative namecache entries");
static u_long	numneg;			/* number of negative entries allocated */
SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0,
    "Number of negative entries in namecache");
static u_long	numcache;		/* number of cache entries allocated */
SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0,
    "Number of namecache entries");
static u_long	numcachehv;		/* number of cache entries with vnodes held */
SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0,
    "Number of namecache entries with vnodes held");
u_int	ncsizefactor = 2;	/* cap: numcache <= desiredvnodes * ncsizefactor */
SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
    "Size factor for namecache");

struct nchstats	nchstats;		/* cache effectiveness statistics */

/*
 * Single global rwlock protecting the whole cache: lookups take it
 * shared, modifications take it exclusive (or try-upgrade from shared).
 */
static struct rwlock cache_lock;
RW_SYSINIT(vfscache, &cache_lock, "Name Cache");

#define	CACHE_UPGRADE_LOCK()	rw_try_upgrade(&cache_lock)
#define	CACHE_RLOCK()
rw_rlock(&cache_lock)
#define	CACHE_RUNLOCK()		rw_runlock(&cache_lock)
#define	CACHE_WLOCK()		rw_wlock(&cache_lock)
#define	CACHE_WUNLOCK()		rw_wunlock(&cache_lock)

/*
 * Serializes ncneg LRU reordering when only the shared cache lock is
 * held (see cache_negative_hit()).
 */
static struct mtx_padalign ncneg_mtx;
MTX_SYSINIT(vfscache_neg, &ncneg_mtx, "Name Cache neg", MTX_DEF);

/*
 * UMA zones for the VFS cache.
 *
 * The small cache is used for entries with short names, which are the
 * most common.  The large cache is used for entries which are too big to
 * fit in the small cache.
 */
static uma_zone_t cache_zone_small;
static uma_zone_t cache_zone_small_ts;
static uma_zone_t cache_zone_large;
static uma_zone_t cache_zone_large_ts;

#define	CACHE_PATH_CUTOFF	35	/* max name length served by small zones */

/*
 * Allocate a cache entry from the zone matching the name length and
 * whether timestamps (struct namecache_ts) are required.
 */
static struct namecache *
cache_alloc(int len, int ts)
{

	if (len > CACHE_PATH_CUTOFF) {
		if (ts)
			return (uma_zalloc(cache_zone_large_ts, M_WAITOK));
		else
			return (uma_zalloc(cache_zone_large, M_WAITOK));
	}
	if (ts)
		return (uma_zalloc(cache_zone_small_ts, M_WAITOK));
	else
		return (uma_zalloc(cache_zone_small, M_WAITOK));
}

/*
 * Release an entry back to the zone it came from (selected by nc_nlen
 * and NCF_TS, mirroring cache_alloc()).  NULL is accepted and ignored.
 * Performs the deferred vdrop() of the parent when NCF_DVDROP was set
 * by cache_zap(); callers therefore invoke this only after dropping the
 * cache lock.
 */
static void
cache_free(struct namecache *ncp)
{
	int ts;

	if (ncp == NULL)
		return;
	ts = ncp->nc_flag & NCF_TS;
	if ((ncp->nc_flag & NCF_DVDROP) != 0)
		vdrop(ncp->nc_dvp);
	if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) {
		if (ts)
			uma_zfree(cache_zone_small_ts, ncp);
		else
			uma_zfree(cache_zone_small, ncp);
	} else if (ts)
		uma_zfree(cache_zone_small, ncp);
	else
		uma_zfree(cache_zone_large, ncp);
}

/*
 * Return a pointer to the inline name, which lives at a different
 * offset depending on whether the entry carries timestamps.
 */
static char *
nc_get_name(struct namecache *ncp)
{
	struct namecache_ts *ncp_ts;

	if ((ncp->nc_flag & NCF_TS) == 0)
		return (ncp->nc_name);
	ncp_ts = (struct namecache_ts *)ncp;
	return (ncp_ts->nc_name);
}

/*
 * Copy the entry's timestamp/ticks out to the caller-supplied pointers;
 * callers must pass NULL pointers for entries lacking NCF_TS.
 */
static void
cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
{

	KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
	    (tsp == NULL && ticksp == NULL),
	    ("No NCF_TS"));

	if (tsp != NULL)
		*tsp = ((struct namecache_ts *)ncp)->nc_time;
	if (ticksp != NULL)
		*ticksp = ((struct namecache_ts *)ncp)->nc_ticks;
}

static int	doingcache = 1;		/* 1 => enable the cache */
SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
    "VFS namecache enabled");

/* Export size information to userland */
SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
    sizeof(struct namecache), "sizeof(struct namecache)");

/*
 * The new name cache statistics
 */
static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0,
    "Name cache statistics");
#define STATNODE_ULONG(name, descr)	\
	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr);
#define STATNODE_COUNTER(name, descr)	\
	static counter_u64_t name;	\
	SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, descr);
STATNODE_ULONG(numneg, "Number of negative cache entries");
STATNODE_ULONG(numcache, "Number of cache entries");
STATNODE_COUNTER(numcalls, "Number of cache lookups");
STATNODE_COUNTER(dothits, "Number of '.' hits");
STATNODE_COUNTER(dotdothits, "Number of '..' hits");
STATNODE_COUNTER(numchecks, "Number of checks in lookup");
STATNODE_COUNTER(nummiss, "Number of cache misses");
STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache");
STATNODE_COUNTER(numposzaps,
    "Number of cache hits (positive) we do not want to cache");
STATNODE_COUNTER(numposhits, "Number of cache hits (positive)");
STATNODE_COUNTER(numnegzaps,
    "Number of cache hits (negative) we do not want to cache");
STATNODE_COUNTER(numneghits, "Number of cache hits (negative)");
/* These count for kern___getcwd(), too.
 */
STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls");
STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
STATNODE_COUNTER(numfullpathfail2,
    "Number of fullpath search errors (VOP_VPTOCNP failures)");
STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls");
static long numupgrades; STATNODE_ULONG(numupgrades,
    "Number of updates of the cache after lookup (write lock + retry)");

static void cache_zap(struct namecache *ncp);
static int vn_vptocnp_locked(struct vnode **vp, struct ucred *cred, char *buf,
    u_int *buflen);
static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
    char *buf, char **retbuf, u_int buflen);

static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");

/*
 * Hash a (name, parent directory vnode) pair: FNV-1 over the name
 * bytes, then folded with the dvp pointer value itself.
 */
static uint32_t
cache_get_hash(char *name, u_char len, struct vnode *dvp)
{
	uint32_t hash;

	hash = fnv_32_buf(name, len, FNV1_32_INIT);
	hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
	return (hash);
}

/*
 * sysctl handler exporting struct nchstats; hit/miss fields are
 * synthesized from the per-CPU counters at read time.
 */
static int
sysctl_nchstats(SYSCTL_HANDLER_ARGS)
{
	struct nchstats snap;

	/* Size probe only: report the buffer length required. */
	if (req->oldptr == NULL)
		return (SYSCTL_OUT(req, 0, sizeof(snap)));

	snap = nchstats;
	snap.ncs_goodhits = counter_u64_fetch(numposhits);
	snap.ncs_neghits = counter_u64_fetch(numneghits);
	snap.ncs_badhits = counter_u64_fetch(numposzaps) +
	    counter_u64_fetch(numnegzaps);
	snap.ncs_miss = counter_u64_fetch(nummisszap) +
	    counter_u64_fetch(nummiss);

	return (SYSCTL_OUT(req, &snap, sizeof(snap)));
}
SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
    CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
    "VFS cache effectiveness statistics");

#ifdef DIAGNOSTIC
/*
 * Grab an atomic snapshot of the name cache hash chain lengths
 */
static
SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL, 356 "hash table stats"); 357 358 static int 359 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS) 360 { 361 struct nchashhead *ncpp; 362 struct namecache *ncp; 363 int i, error, n_nchash, *cntbuf; 364 365 retry: 366 n_nchash = nchash + 1; /* nchash is max index, not count */ 367 if (req->oldptr == NULL) 368 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int)); 369 cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK); 370 CACHE_RLOCK(); 371 if (n_nchash != nchash + 1) { 372 CACHE_RUNLOCK(); 373 free(cntbuf, M_TEMP); 374 goto retry; 375 } 376 /* Scan hash tables counting entries */ 377 for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++) 378 LIST_FOREACH(ncp, ncpp, nc_hash) 379 cntbuf[i]++; 380 CACHE_RUNLOCK(); 381 for (error = 0, i = 0; i < n_nchash; i++) 382 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0) 383 break; 384 free(cntbuf, M_TEMP); 385 return (error); 386 } 387 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD| 388 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int", 389 "nchash chain lengths"); 390 391 static int 392 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS) 393 { 394 int error; 395 struct nchashhead *ncpp; 396 struct namecache *ncp; 397 int n_nchash; 398 int count, maxlength, used, pct; 399 400 if (!req->oldptr) 401 return SYSCTL_OUT(req, 0, 4 * sizeof(int)); 402 403 CACHE_RLOCK(); 404 n_nchash = nchash + 1; /* nchash is max index, not count */ 405 used = 0; 406 maxlength = 0; 407 408 /* Scan hash tables for applicable entries */ 409 for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { 410 count = 0; 411 LIST_FOREACH(ncp, ncpp, nc_hash) { 412 count++; 413 } 414 if (count) 415 used++; 416 if (maxlength < count) 417 maxlength = count; 418 } 419 n_nchash = nchash + 1; 420 CACHE_RUNLOCK(); 421 pct = (used * 100) / (n_nchash / 100); 422 error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash)); 423 if (error) 424 return (error); 
425 error = SYSCTL_OUT(req, &used, sizeof(used)); 426 if (error) 427 return (error); 428 error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength)); 429 if (error) 430 return (error); 431 error = SYSCTL_OUT(req, &pct, sizeof(pct)); 432 if (error) 433 return (error); 434 return (0); 435 } 436 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD| 437 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I", 438 "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)"); 439 #endif 440 441 /* 442 * Negative entries management 443 */ 444 static void 445 cache_negative_hit(struct namecache *ncp, int wlocked) 446 { 447 448 if (!wlocked) { 449 rw_assert(&cache_lock, RA_RLOCKED); 450 mtx_lock(&ncneg_mtx); 451 } else { 452 rw_assert(&cache_lock, RA_WLOCKED); 453 } 454 455 TAILQ_REMOVE(&ncneg, ncp, nc_dst); 456 TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst); 457 458 if (!wlocked) 459 mtx_unlock(&ncneg_mtx); 460 } 461 462 static void 463 cache_negative_insert(struct namecache *ncp) 464 { 465 466 rw_assert(&cache_lock, RA_WLOCKED); 467 MPASS(ncp->nc_vp == NULL); 468 TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst); 469 numneg++; 470 } 471 472 static void 473 cache_negative_remove(struct namecache *ncp) 474 { 475 476 rw_assert(&cache_lock, RA_WLOCKED); 477 MPASS(ncp->nc_vp == NULL); 478 TAILQ_REMOVE(&ncneg, ncp, nc_dst); 479 numneg--; 480 } 481 482 static struct namecache * 483 cache_negative_zap_one(void) 484 { 485 struct namecache *ncp; 486 487 rw_assert(&cache_lock, RA_WLOCKED); 488 ncp = TAILQ_FIRST(&ncneg); 489 KASSERT(ncp->nc_vp == NULL, ("ncp %p vp %p on ncneg", 490 ncp, ncp->nc_vp)); 491 cache_zap(ncp); 492 return (ncp); 493 } 494 495 /* 496 * cache_zap(): 497 * 498 * Removes a namecache entry from cache, whether it contains an actual 499 * pointer to a vnode or if it is just a negative cache entry. 
 */
static void
cache_zap(struct namecache *ncp)
{

	rw_assert(&cache_lock, RA_WLOCKED);
	CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, ncp->nc_vp);
	if (ncp->nc_vp != NULL) {
		SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
		    nc_get_name(ncp), ncp->nc_vp);
	} else {
		SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
		    nc_get_name(ncp));
	}
	LIST_REMOVE(ncp, nc_hash);
	if (ncp->nc_flag & NCF_ISDOTDOT) {
		/* ".." entries are linked via v_cache_dd, not nc_src. */
		if (ncp == ncp->nc_dvp->v_cache_dd)
			ncp->nc_dvp->v_cache_dd = NULL;
	} else {
		LIST_REMOVE(ncp, nc_src);
		if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
			/*
			 * Last entry sourced from this directory: mark it
			 * so cache_free() performs the vdrop() after the
			 * cache lock is released (entry is NOT freed here).
			 */
			ncp->nc_flag |= NCF_DVDROP;
			numcachehv--;
		}
	}
	if (ncp->nc_vp) {
		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
		if (ncp == ncp->nc_vp->v_cache_dd)
			ncp->nc_vp->v_cache_dd = NULL;
	} else {
		cache_negative_remove(ncp);
	}
	numcache--;
}

/*
 * Lookup an entry in the cache
 *
 * Lookup is called with dvp pointing to the directory to search,
 * cnp pointing to the name of the entry being sought. If the lookup
 * succeeds, the vnode is returned in *vpp, and a status of -1 is
 * returned. If the lookup determines that the name does not exist
 * (negative caching), a status of ENOENT is returned. If the lookup
 * fails, a status of zero is returned. If the directory vnode is
 * recycled out from under us due to a forced unmount, a status of
 * ENOENT is returned.
 *
 * vpp is locked and ref'd on return. If we're looking up DOTDOT, dvp is
 * unlocked. If we're looking up . an extra ref is taken, but the lock is
 * not recursively acquired.
 */

int
cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct namecache *ncp;
	uint32_t hash;
	int error, ltype, wlocked;

	if (!doingcache) {
		cnp->cn_flags &= ~MAKEENTRY;
		return (0);
	}
retry:
	/*
	 * wlocked tracks which mode the cache lock is (about to be) held
	 * in; lookups start shared and upgrade only when they must modify
	 * the cache (see the "wlock" label below).
	 */
	wlocked = 0;
	counter_u64_add(numcalls, 1);
	error = 0;

retry_wlocked:
	if (cnp->cn_nameptr[0] == '.') {
		if (cnp->cn_namelen == 1) {
			/* "." resolves to dvp itself; no cache lock needed. */
			*vpp = dvp;
			CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .",
			    dvp, cnp->cn_nameptr);
			counter_u64_add(dothits, 1);
			SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
			if (tsp != NULL)
				timespecclear(tsp);
			if (ticksp != NULL)
				*ticksp = ticks;
			VREF(*vpp);
			/*
			 * When we lookup "." we still can be asked to lock it
			 * differently...
			 */
			ltype = cnp->cn_lkflags & LK_TYPE_MASK;
			if (ltype != VOP_ISLOCKED(*vpp)) {
				if (ltype == LK_EXCLUSIVE) {
					vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
					if ((*vpp)->v_iflag & VI_DOOMED) {
						/* forced unmount */
						vrele(*vpp);
						*vpp = NULL;
						return (ENOENT);
					}
				} else
					vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
			}
			return (-1);
		}
		if (!wlocked)
			CACHE_RLOCK();
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
			counter_u64_add(dotdothits, 1);
			if (dvp->v_cache_dd == NULL) {
				SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
				    "..", NULL);
				goto unlock;
			}
			if ((cnp->cn_flags & MAKEENTRY) == 0) {
				/*
				 * Caller does not want the entry cached;
				 * zapping requires the write lock.
				 */
				if (!wlocked && !CACHE_UPGRADE_LOCK())
					goto wlock;
				ncp = NULL;
				if (dvp->v_cache_dd->nc_flag & NCF_ISDOTDOT) {
					ncp = dvp->v_cache_dd;
					cache_zap(ncp);
				}
				dvp->v_cache_dd = NULL;
				CACHE_WUNLOCK();
				/* Free only after the lock is dropped. */
				cache_free(ncp);
				return (0);
			}
			ncp = dvp->v_cache_dd;
			/*
			 * v_cache_dd either points at a true ".." entry
			 * (NCF_ISDOTDOT) or at the entry naming this
			 * directory from its parent, whose nc_dvp is then
			 * the parent we want.
			 */
			if (ncp->nc_flag & NCF_ISDOTDOT)
				*vpp = ncp->nc_vp;
			else
				*vpp = ncp->nc_dvp;
			/* Return failure if negative entry was found. */
			if (*vpp == NULL)
				goto negative_success;
			CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..",
			    dvp, cnp->cn_nameptr, *vpp);
			SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..",
			    *vpp);
			cache_out_ts(ncp, tsp, ticksp);
			if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
			    NCF_DTS && tsp != NULL)
				*tsp = ((struct namecache_ts *)ncp)->
				    nc_dotdottime;
			goto success;
		}
	} else if (!wlocked)
		CACHE_RLOCK();

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		counter_u64_add(numchecks, 1);
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(nc_get_name(ncp), cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	/* We failed to find an entry */
	if (ncp == NULL) {
		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
		    NULL);
		if ((cnp->cn_flags & MAKEENTRY) == 0) {
			counter_u64_add(nummisszap, 1);
		} else {
			counter_u64_add(nummiss, 1);
		}
		goto unlock;
	}

	/* We don't want to have an entry, so dump it */
	if ((cnp->cn_flags & MAKEENTRY) == 0) {
		counter_u64_add(numposzaps, 1);
		if (!wlocked && !CACHE_UPGRADE_LOCK())
			goto wlock;
		cache_zap(ncp);
		CACHE_WUNLOCK();
		cache_free(ncp);
		return (0);
	}

	/* We found a "positive" match, return the vnode */
	if (ncp->nc_vp) {
		counter_u64_add(numposhits, 1);
		*vpp = ncp->nc_vp;
		CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
		    dvp, cnp->cn_nameptr, *vpp, ncp);
		SDT_PROBE3(vfs, namecache, lookup, hit, dvp, nc_get_name(ncp),
		    *vpp);
		cache_out_ts(ncp, tsp, ticksp);
		goto success;
	}

negative_success:
	/* We found a negative match, and want to create it, so purge */
	if (cnp->cn_nameiop == CREATE) {
		counter_u64_add(numnegzaps, 1);
		if (!wlocked && !CACHE_UPGRADE_LOCK())
			goto wlock;
		cache_zap(ncp);
		CACHE_WUNLOCK();
		cache_free(ncp);
		return (0);
	}

	counter_u64_add(numneghits, 1);
	/* Refresh the entry's position in the negative-entry LRU. */
	cache_negative_hit(ncp, wlocked);
	if (ncp->nc_flag & NCF_WHITE)
		cnp->cn_flags |= ISWHITEOUT;
	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
	    nc_get_name(ncp));
	cache_out_ts(ncp, tsp, ticksp);
	if (wlocked)
		CACHE_WUNLOCK();
	else
		CACHE_RUNLOCK();
	return (ENOENT);

wlock:
	/*
	 * We need to update the cache after our lookup, so upgrade to
	 * a write lock and retry the operation.
	 */
	CACHE_RUNLOCK();
	CACHE_WLOCK();
	numupgrades++;
	wlocked = 1;
	goto retry_wlocked;

success:
	/*
	 * On success we return a locked and ref'd vnode as per the lookup
	 * protocol.
	 */
	MPASS(dvp != *vpp);
	ltype = 0;	/* silence gcc warning */
	if (cnp->cn_flags & ISDOTDOT) {
		/* dvp is unlocked across vget() to respect lock order. */
		ltype = VOP_ISLOCKED(dvp);
		VOP_UNLOCK(dvp, 0);
	}
	/* Hold the vnode so it survives dropping the cache lock. */
	vhold(*vpp);
	if (wlocked)
		CACHE_WUNLOCK();
	else
		CACHE_RUNLOCK();
	error = vget(*vpp, cnp->cn_lkflags | LK_VNHELD, cnp->cn_thread);
	if (cnp->cn_flags & ISDOTDOT) {
		vn_lock(dvp, ltype | LK_RETRY);
		if (dvp->v_iflag & VI_DOOMED) {
			if (error == 0)
				vput(*vpp);
			*vpp = NULL;
			return (ENOENT);
		}
	}
	if (error) {
		*vpp = NULL;
		goto retry;
	}
	if ((cnp->cn_flags & ISLASTCN) &&
	    (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) {
		ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
	}
	return (-1);

unlock:
	if (wlocked)
		CACHE_WUNLOCK();
	else
		CACHE_RUNLOCK();
	return (0);
}

/*
 * Add an entry to the cache.
 */
void
cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
    struct timespec *tsp, struct timespec *dtsp)
{
	struct namecache *ncp, *n2, *ndd, *nneg;
	struct namecache_ts *n3;
	struct nchashhead *ncpp;
	uint32_t hash;
	int flag;
	int len;

	CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr);
	VNASSERT(vp == NULL || (vp->v_iflag & VI_DOOMED) == 0, vp,
	    ("cache_enter: Adding a doomed vnode"));
	VNASSERT(dvp == NULL || (dvp->v_iflag & VI_DOOMED) == 0, dvp,
	    ("cache_enter: Doomed vnode used as src"));

	if (!doingcache)
		return;

	/*
	 * Avoid blowout in namecache entries.
	 */
	if (numcache >= desiredvnodes * ncsizefactor)
		return;

	/* ndd/nneg collect entries to be freed after the lock is dropped. */
	ndd = nneg = NULL;
	flag = 0;
	if (cnp->cn_nameptr[0] == '.') {
		/* "." is never cached. */
		if (cnp->cn_namelen == 1)
			return;
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
			CACHE_WLOCK();
			/*
			 * If dotdot entry already exists, just retarget it
			 * to new parent vnode, otherwise continue with new
			 * namecache entry allocation.
			 */
			if ((ncp = dvp->v_cache_dd) != NULL &&
			    ncp->nc_flag & NCF_ISDOTDOT) {
				KASSERT(ncp->nc_dvp == dvp,
				    ("wrong isdotdot parent"));
				if (ncp->nc_vp != NULL) {
					TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst,
					    ncp, nc_dst);
				} else {
					cache_negative_remove(ncp);
				}
				if (vp != NULL) {
					TAILQ_INSERT_HEAD(&vp->v_cache_dst,
					    ncp, nc_dst);
				} else {
					cache_negative_insert(ncp);
				}
				ncp->nc_vp = vp;
				CACHE_WUNLOCK();
				return;
			}
			dvp->v_cache_dd = NULL;
			SDT_PROBE3(vfs, namecache, enter, done, dvp, "..", vp);
			CACHE_WUNLOCK();
			flag = NCF_ISDOTDOT;
		}
	}

	/*
	 * Calculate the hash key and setup as much of the new
	 * namecache entry as possible before acquiring the lock.
	 */
	ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
	ncp->nc_vp = vp;
	ncp->nc_dvp = dvp;
	ncp->nc_flag = flag;
	if (tsp != NULL) {
		n3 = (struct namecache_ts *)ncp;
		n3->nc_time = *tsp;
		n3->nc_ticks = ticks;
		n3->nc_flag |= NCF_TS;
		if (dtsp != NULL) {
			n3->nc_dotdottime = *dtsp;
			n3->nc_flag |= NCF_DTS;
		}
	}
	len = ncp->nc_nlen = cnp->cn_namelen;
	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
	strlcpy(nc_get_name(ncp), cnp->cn_nameptr, len + 1);
	CACHE_WLOCK();

	/*
	 * See if this vnode or negative entry is already in the cache
	 * with this name.  This can happen with concurrent lookups of
	 * the same path name.
	 */
	ncpp = NCHHASH(hash);
	LIST_FOREACH(n2, ncpp, nc_hash) {
		if (n2->nc_dvp == dvp &&
		    n2->nc_nlen == cnp->cn_namelen &&
		    !bcmp(nc_get_name(n2), cnp->cn_nameptr, n2->nc_nlen)) {
			/* Duplicate found: refresh its timestamps only. */
			if (tsp != NULL) {
				KASSERT((n2->nc_flag & NCF_TS) != 0,
				    ("no NCF_TS"));
				n3 = (struct namecache_ts *)n2;
				n3->nc_time =
				    ((struct namecache_ts *)ncp)->nc_time;
				n3->nc_ticks =
				    ((struct namecache_ts *)ncp)->nc_ticks;
				if (dtsp != NULL) {
					n3->nc_dotdottime =
					    ((struct namecache_ts *)ncp)->
					    nc_dotdottime;
					n3->nc_flag |= NCF_DTS;
				}
			}
			CACHE_WUNLOCK();
			cache_free(ncp);
			return;
		}
	}

	if (flag == NCF_ISDOTDOT) {
		/*
		 * See if we are trying to add .. entry, but some other lookup
		 * has populated v_cache_dd pointer already.
		 */
		if (dvp->v_cache_dd != NULL) {
			CACHE_WUNLOCK();
			cache_free(ncp);
			return;
		}
		KASSERT(vp == NULL || vp->v_type == VDIR,
		    ("wrong vnode type %p", vp));
		dvp->v_cache_dd = ncp;
	}

	numcache++;
	if (vp != NULL) {
		if (vp->v_type == VDIR) {
			if (flag != NCF_ISDOTDOT) {
				/*
				 * For this case, the cache entry maps both the
				 * directory name in it and the name ".." for the
				 * directory's parent.
				 */
				if ((ndd = vp->v_cache_dd) != NULL) {
					if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
						cache_zap(ndd);
					else
						ndd = NULL;
				}
				vp->v_cache_dd = ncp;
			}
		} else {
			vp->v_cache_dd = NULL;
		}
	}

	/*
	 * Insert the new namecache entry into the appropriate chain
	 * within the cache entries table.
	 */
	LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
	if (flag != NCF_ISDOTDOT) {
		if (LIST_EMPTY(&dvp->v_cache_src)) {
			/* First entry from this directory: hold the parent. */
			vhold(dvp);
			numcachehv++;
		}
		LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
	}

	/*
	 * If the entry is "negative", we place it into the
	 * "negative" cache queue, otherwise, we place it into the
	 * destination vnode's cache entries queue.
	 */
	if (vp != NULL) {
		TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
		SDT_PROBE3(vfs, namecache, enter, done, dvp, nc_get_name(ncp),
		    vp);
	} else {
		if (cnp->cn_flags & ISWHITEOUT)
			ncp->nc_flag |= NCF_WHITE;
		cache_negative_insert(ncp);
		SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
		    nc_get_name(ncp));
	}
	/* Keep the negative-entry population below 1/ncnegfactor of total. */
	if (numneg * ncnegfactor > numcache)
		nneg = cache_negative_zap_one();
	CACHE_WUNLOCK();
	cache_free(ndd);
	cache_free(nneg);
}

/*
 * Name cache initialization, from vfs_init() when we are booting
 */
static void
nchinit(void *dummy __unused)
{

	TAILQ_INIT(&ncneg);

	/* One zone per (size class, timestamped) combination. */
	cache_zone_small = uma_zcreate("S VFS Cache",
	    sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1,
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
	cache_zone_small_ts = uma_zcreate("STS VFS Cache",
	    sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1,
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
	cache_zone_large = uma_zcreate("L VFS Cache",
	    sizeof(struct namecache) + NAME_MAX + 1,
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
	cache_zone_large_ts = uma_zcreate("LTS VFS Cache",
	    sizeof(struct namecache_ts) +
	    NAME_MAX + 1,
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);

	nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);

	numcalls = counter_u64_alloc(M_WAITOK);
	dothits = counter_u64_alloc(M_WAITOK);
	dotdothits = counter_u64_alloc(M_WAITOK);
	numchecks = counter_u64_alloc(M_WAITOK);
	nummiss = counter_u64_alloc(M_WAITOK);
	nummisszap = counter_u64_alloc(M_WAITOK);
	numposzaps = counter_u64_alloc(M_WAITOK);
	numposhits = counter_u64_alloc(M_WAITOK);
	numnegzaps = counter_u64_alloc(M_WAITOK);
	numneghits = counter_u64_alloc(M_WAITOK);
	numfullpathcalls = counter_u64_alloc(M_WAITOK);
	numfullpathfail1 = counter_u64_alloc(M_WAITOK);
	numfullpathfail2 = counter_u64_alloc(M_WAITOK);
	numfullpathfail4 = counter_u64_alloc(M_WAITOK);
	numfullpathfound = counter_u64_alloc(M_WAITOK);
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);

/*
 * Resize the hash table in response to a new vnode limit; called with
 * the new desiredvnodes value.  Allocates the replacement table before
 * taking the cache lock, then rehashes every entry in place.
 */
void
cache_changesize(int newmaxvnodes)
{
	struct nchashhead *new_nchashtbl, *old_nchashtbl;
	u_long new_nchash, old_nchash;
	struct namecache *ncp;
	uint32_t hash;
	int i;

	new_nchashtbl = hashinit(newmaxvnodes * 2, M_VFSCACHE, &new_nchash);
	/* If same hash table size, nothing to do */
	if (nchash == new_nchash) {
		free(new_nchashtbl, M_VFSCACHE);
		return;
	}
	/*
	 * Move everything from the old hash table to the new table.
	 * None of the namecache entries in the table can be removed
	 * because to do so, they have to be removed from the hash table.
	 */
	CACHE_WLOCK();
	old_nchashtbl = nchashtbl;
	old_nchash = nchash;
	nchashtbl = new_nchashtbl;
	nchash = new_nchash;
	for (i = 0; i <= old_nchash; i++) {
		while ((ncp = LIST_FIRST(&old_nchashtbl[i])) != NULL) {
			hash = cache_get_hash(nc_get_name(ncp), ncp->nc_nlen,
			    ncp->nc_dvp);
			LIST_REMOVE(ncp, nc_hash);
			LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
		}
	}
	CACHE_WUNLOCK();
	free(old_nchashtbl, M_VFSCACHE);
}

/*
 * Invalidate all entries to a particular vnode.
 */
void
cache_purge(struct vnode *vp)
{
	TAILQ_HEAD(, namecache) ncps;
	struct namecache *ncp, *nnp;

	CTR1(KTR_VFS, "cache_purge(%p)", vp);
	SDT_PROBE1(vfs, namecache, purge, done, vp);
	/* Zapped entries are batched and freed after dropping the lock. */
	TAILQ_INIT(&ncps);
	CACHE_WLOCK();
	/* Entries naming children of vp. */
	while (!LIST_EMPTY(&vp->v_cache_src)) {
		ncp = LIST_FIRST(&vp->v_cache_src);
		cache_zap(ncp);
		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
	}
	/* Entries naming vp itself. */
	while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
		ncp = TAILQ_FIRST(&vp->v_cache_dst);
		cache_zap(ncp);
		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
	}
	/* The ".." entry hanging off vp, if any. */
	if (vp->v_cache_dd != NULL) {
		ncp = vp->v_cache_dd;
		KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
		    ("lost dotdot link"));
		cache_zap(ncp);
		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
	}
	KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
	CACHE_WUNLOCK();
	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
		cache_free(ncp);
	}
}

/*
 * Invalidate all negative entries for a particular directory vnode.
 */
void
cache_purge_negative(struct vnode *vp)
{
	TAILQ_HEAD(, namecache) ncps;
	struct namecache *ncp, *nnp;

	CTR1(KTR_VFS, "cache_purge_negative(%p)", vp);
	SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
	TAILQ_INIT(&ncps);
	CACHE_WLOCK();
	/* A negative entry is one with no target vnode (nc_vp == NULL). */
	LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
		if (ncp->nc_vp != NULL)
			continue;
		cache_zap(ncp);
		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
	}
	CACHE_WUNLOCK();
	/* Free the zapped entries only after the cache lock is dropped. */
	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
		cache_free(ncp);
	}
}

/*
 * Flush all entries referencing a particular filesystem.
 */
void
cache_purgevfs(struct mount *mp)
{
	TAILQ_HEAD(, namecache) ncps;
	struct nchashhead *ncpp;
	struct namecache *ncp, *nnp;

	/* Scan hash tables for applicable entries */
	SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
	TAILQ_INIT(&ncps);
	CACHE_WLOCK();
	/*
	 * Walk every hash chain, from the last bucket (&nchashtbl[nchash],
	 * nchash being the hashinit() mask) down to the first, zapping any
	 * entry whose directory vnode belongs to the mount being flushed.
	 */
	for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) {
		LIST_FOREACH_SAFE(ncp, ncpp, nc_hash, nnp) {
			if (ncp->nc_dvp->v_mount != mp)
				continue;
			cache_zap(ncp);
			TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
		}
	}
	CACHE_WUNLOCK();
	/* Free the zapped entries only after the cache lock is dropped. */
	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
		cache_free(ncp);
	}
}

/*
 * Perform canonical checks and cache lookup and pass on to filesystem
 * through the vop_cachedlookup only if needed.
1135 */ 1136 1137 int 1138 vfs_cache_lookup(struct vop_lookup_args *ap) 1139 { 1140 struct vnode *dvp; 1141 int error; 1142 struct vnode **vpp = ap->a_vpp; 1143 struct componentname *cnp = ap->a_cnp; 1144 struct ucred *cred = cnp->cn_cred; 1145 int flags = cnp->cn_flags; 1146 struct thread *td = cnp->cn_thread; 1147 1148 *vpp = NULL; 1149 dvp = ap->a_dvp; 1150 1151 if (dvp->v_type != VDIR) 1152 return (ENOTDIR); 1153 1154 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && 1155 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) 1156 return (EROFS); 1157 1158 error = VOP_ACCESS(dvp, VEXEC, cred, td); 1159 if (error) 1160 return (error); 1161 1162 error = cache_lookup(dvp, vpp, cnp, NULL, NULL); 1163 if (error == 0) 1164 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); 1165 if (error == -1) 1166 return (0); 1167 return (error); 1168 } 1169 1170 /* 1171 * XXX All of these sysctls would probably be more productive dead. 1172 */ 1173 static int disablecwd; 1174 SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0, 1175 "Disable the getcwd syscall"); 1176 1177 /* Implementation of the getcwd syscall. 
 */
int
sys___getcwd(struct thread *td, struct __getcwd_args *uap)
{

	return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen,
	    MAXPATHLEN));
}

/*
 * Common implementation of the getcwd syscall: resolve the calling
 * process's current directory to a path relative to its root directory
 * and copy the result into 'buf' (kernel or user space according to
 * 'bufseg').  'buflen' is clamped to 'path_max'.  Returns 0 or an errno.
 */
int
kern___getcwd(struct thread *td, char *buf, enum uio_seg bufseg, u_int buflen,
    u_int path_max)
{
	char *bp, *tmpbuf;
	struct filedesc *fdp;
	struct vnode *cdir, *rdir;
	int error;

	if (disablecwd)
		return (ENODEV);
	if (buflen < 2)
		return (EINVAL);
	if (buflen > path_max)
		buflen = path_max;

	tmpbuf = malloc(buflen, M_TEMP, M_WAITOK);
	fdp = td->td_proc->p_fd;
	/* Take referenced snapshots of cdir/rdir under the filedesc lock. */
	FILEDESC_SLOCK(fdp);
	cdir = fdp->fd_cdir;
	VREF(cdir);
	rdir = fdp->fd_rdir;
	VREF(rdir);
	FILEDESC_SUNLOCK(fdp);
	/*
	 * vn_fullpath1() builds the path backwards from the end of tmpbuf;
	 * on success 'bp' points at the first byte of the result.
	 */
	error = vn_fullpath1(td, cdir, rdir, tmpbuf, &bp, buflen);
	vrele(rdir);
	vrele(cdir);

	if (!error) {
		if (bufseg == UIO_SYSSPACE)
			bcopy(bp, buf, strlen(bp) + 1);
		else
			error = copyout(bp, buf, strlen(bp) + 1);
#ifdef KTRACE
	if (KTRPOINT(curthread, KTR_NAMEI))
		ktrnamei(bp);
#endif
	}
	free(tmpbuf, M_TEMP);
	return (error);
}

/*
 * Thus begins the fullpath magic.
1230 */ 1231 1232 static int disablefullpath; 1233 SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0, 1234 "Disable the vn_fullpath function"); 1235 1236 /* 1237 * Retrieve the full filesystem path that correspond to a vnode from the name 1238 * cache (if available) 1239 */ 1240 int 1241 vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf) 1242 { 1243 char *buf; 1244 struct filedesc *fdp; 1245 struct vnode *rdir; 1246 int error; 1247 1248 if (disablefullpath) 1249 return (ENODEV); 1250 if (vn == NULL) 1251 return (EINVAL); 1252 1253 buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); 1254 fdp = td->td_proc->p_fd; 1255 FILEDESC_SLOCK(fdp); 1256 rdir = fdp->fd_rdir; 1257 VREF(rdir); 1258 FILEDESC_SUNLOCK(fdp); 1259 error = vn_fullpath1(td, vn, rdir, buf, retbuf, MAXPATHLEN); 1260 vrele(rdir); 1261 1262 if (!error) 1263 *freebuf = buf; 1264 else 1265 free(buf, M_TEMP); 1266 return (error); 1267 } 1268 1269 /* 1270 * This function is similar to vn_fullpath, but it attempts to lookup the 1271 * pathname relative to the global root mount point. This is required for the 1272 * auditing sub-system, as audited pathnames must be absolute, relative to the 1273 * global root mount point. 
1274 */ 1275 int 1276 vn_fullpath_global(struct thread *td, struct vnode *vn, 1277 char **retbuf, char **freebuf) 1278 { 1279 char *buf; 1280 int error; 1281 1282 if (disablefullpath) 1283 return (ENODEV); 1284 if (vn == NULL) 1285 return (EINVAL); 1286 buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); 1287 error = vn_fullpath1(td, vn, rootvnode, buf, retbuf, MAXPATHLEN); 1288 if (!error) 1289 *freebuf = buf; 1290 else 1291 free(buf, M_TEMP); 1292 return (error); 1293 } 1294 1295 int 1296 vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, u_int *buflen) 1297 { 1298 int error; 1299 1300 CACHE_RLOCK(); 1301 error = vn_vptocnp_locked(vp, cred, buf, buflen); 1302 if (error == 0) 1303 CACHE_RUNLOCK(); 1304 return (error); 1305 } 1306 1307 static int 1308 vn_vptocnp_locked(struct vnode **vp, struct ucred *cred, char *buf, 1309 u_int *buflen) 1310 { 1311 struct vnode *dvp; 1312 struct namecache *ncp; 1313 int error; 1314 1315 TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) { 1316 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 1317 break; 1318 } 1319 if (ncp != NULL) { 1320 if (*buflen < ncp->nc_nlen) { 1321 CACHE_RUNLOCK(); 1322 vrele(*vp); 1323 counter_u64_add(numfullpathfail4, 1); 1324 error = ENOMEM; 1325 SDT_PROBE3(vfs, namecache, fullpath, return, error, 1326 vp, NULL); 1327 return (error); 1328 } 1329 *buflen -= ncp->nc_nlen; 1330 memcpy(buf + *buflen, nc_get_name(ncp), ncp->nc_nlen); 1331 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, 1332 nc_get_name(ncp), vp); 1333 dvp = *vp; 1334 *vp = ncp->nc_dvp; 1335 vref(*vp); 1336 CACHE_RUNLOCK(); 1337 vrele(dvp); 1338 CACHE_RLOCK(); 1339 return (0); 1340 } 1341 SDT_PROBE1(vfs, namecache, fullpath, miss, vp); 1342 1343 CACHE_RUNLOCK(); 1344 vn_lock(*vp, LK_SHARED | LK_RETRY); 1345 error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen); 1346 vput(*vp); 1347 if (error) { 1348 counter_u64_add(numfullpathfail2, 1); 1349 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 1350 return (error); 1351 } 1352 1353 *vp = 
dvp; 1354 CACHE_RLOCK(); 1355 if (dvp->v_iflag & VI_DOOMED) { 1356 /* forced unmount */ 1357 CACHE_RUNLOCK(); 1358 vrele(dvp); 1359 error = ENOENT; 1360 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 1361 return (error); 1362 } 1363 /* 1364 * *vp has its use count incremented still. 1365 */ 1366 1367 return (0); 1368 } 1369 1370 /* 1371 * The magic behind kern___getcwd() and vn_fullpath(). 1372 */ 1373 static int 1374 vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir, 1375 char *buf, char **retbuf, u_int buflen) 1376 { 1377 int error, slash_prefixed; 1378 #ifdef KDTRACE_HOOKS 1379 struct vnode *startvp = vp; 1380 #endif 1381 struct vnode *vp1; 1382 1383 buflen--; 1384 buf[buflen] = '\0'; 1385 error = 0; 1386 slash_prefixed = 0; 1387 1388 SDT_PROBE1(vfs, namecache, fullpath, entry, vp); 1389 counter_u64_add(numfullpathcalls, 1); 1390 vref(vp); 1391 CACHE_RLOCK(); 1392 if (vp->v_type != VDIR) { 1393 error = vn_vptocnp_locked(&vp, td->td_ucred, buf, &buflen); 1394 if (error) 1395 return (error); 1396 if (buflen == 0) { 1397 CACHE_RUNLOCK(); 1398 vrele(vp); 1399 return (ENOMEM); 1400 } 1401 buf[--buflen] = '/'; 1402 slash_prefixed = 1; 1403 } 1404 while (vp != rdir && vp != rootvnode) { 1405 if (vp->v_vflag & VV_ROOT) { 1406 if (vp->v_iflag & VI_DOOMED) { /* forced unmount */ 1407 CACHE_RUNLOCK(); 1408 vrele(vp); 1409 error = ENOENT; 1410 SDT_PROBE3(vfs, namecache, fullpath, return, 1411 error, vp, NULL); 1412 break; 1413 } 1414 vp1 = vp->v_mount->mnt_vnodecovered; 1415 vref(vp1); 1416 CACHE_RUNLOCK(); 1417 vrele(vp); 1418 vp = vp1; 1419 CACHE_RLOCK(); 1420 continue; 1421 } 1422 if (vp->v_type != VDIR) { 1423 CACHE_RUNLOCK(); 1424 vrele(vp); 1425 counter_u64_add(numfullpathfail1, 1); 1426 error = ENOTDIR; 1427 SDT_PROBE3(vfs, namecache, fullpath, return, 1428 error, vp, NULL); 1429 break; 1430 } 1431 error = vn_vptocnp_locked(&vp, td->td_ucred, buf, &buflen); 1432 if (error) 1433 break; 1434 if (buflen == 0) { 1435 CACHE_RUNLOCK(); 
1436 vrele(vp); 1437 error = ENOMEM; 1438 SDT_PROBE3(vfs, namecache, fullpath, return, error, 1439 startvp, NULL); 1440 break; 1441 } 1442 buf[--buflen] = '/'; 1443 slash_prefixed = 1; 1444 } 1445 if (error) 1446 return (error); 1447 if (!slash_prefixed) { 1448 if (buflen == 0) { 1449 CACHE_RUNLOCK(); 1450 vrele(vp); 1451 counter_u64_add(numfullpathfail4, 1); 1452 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, 1453 startvp, NULL); 1454 return (ENOMEM); 1455 } 1456 buf[--buflen] = '/'; 1457 } 1458 counter_u64_add(numfullpathfound, 1); 1459 CACHE_RUNLOCK(); 1460 vrele(vp); 1461 1462 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, buf + buflen); 1463 *retbuf = buf + buflen; 1464 return (0); 1465 } 1466 1467 struct vnode * 1468 vn_dir_dd_ino(struct vnode *vp) 1469 { 1470 struct namecache *ncp; 1471 struct vnode *ddvp; 1472 1473 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); 1474 CACHE_RLOCK(); 1475 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { 1476 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) 1477 continue; 1478 ddvp = ncp->nc_dvp; 1479 vhold(ddvp); 1480 CACHE_RUNLOCK(); 1481 if (vget(ddvp, LK_SHARED | LK_NOWAIT | LK_VNHELD, curthread)) 1482 return (NULL); 1483 return (ddvp); 1484 } 1485 CACHE_RUNLOCK(); 1486 return (NULL); 1487 } 1488 1489 int 1490 vn_commname(struct vnode *vp, char *buf, u_int buflen) 1491 { 1492 struct namecache *ncp; 1493 int l; 1494 1495 CACHE_RLOCK(); 1496 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) 1497 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 1498 break; 1499 if (ncp == NULL) { 1500 CACHE_RUNLOCK(); 1501 return (ENOENT); 1502 } 1503 l = min(ncp->nc_nlen, buflen - 1); 1504 memcpy(buf, nc_get_name(ncp), l); 1505 CACHE_RUNLOCK(); 1506 buf[l] = '\0'; 1507 return (0); 1508 } 1509 1510 /* ABI compat shims for old kernel modules. 
 */
#undef cache_enter

void	cache_enter(struct vnode *dvp, struct vnode *vp,
	    struct componentname *cnp);

/*
 * Old-ABI entry point: forwards to cache_enter_time() with no
 * timestamps, preserving the pre-timestamp cache_enter() symbol for
 * modules compiled against the older interface.
 */
void
cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
{

	cache_enter_time(dvp, vp, cnp, NULL, NULL);
}

/*
 * This function updates path string to vnode's full global path
 * and checks the size of the new path string against the pathlen argument.
 *
 * Requires a locked, referenced vnode.
 * Vnode is re-locked on success or ENODEV, otherwise unlocked.
 *
 * If sysctl debug.disablefullpath is set, ENODEV is returned,
 * vnode is left locked and path remains untouched.
 *
 * If vp is a directory, the call to vn_fullpath_global() always succeeds
 * because it falls back to the ".." lookup if the namecache lookup fails.
 */
int
vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
    u_int pathlen)
{
	struct nameidata nd;
	struct vnode *vp1;
	char *rpath, *fbuf;
	int error;

	ASSERT_VOP_ELOCKED(vp, __func__);

	/* Return ENODEV if sysctl debug.disablefullpath==1 */
	if (disablefullpath)
		return (ENODEV);

	/* Construct global filesystem path from vp. */
	VOP_UNLOCK(vp, 0);
	error = vn_fullpath_global(td, vp, &rpath, &fbuf);

	/*
	 * NOTE(review): every failure path below vrele()s the caller's
	 * reference on vp; only the vp1 == vp success path hands a
	 * reference (the one namei() took) back to the caller.
	 */
	if (error != 0) {
		vrele(vp);
		return (error);
	}

	if (strlen(rpath) >= pathlen) {
		vrele(vp);
		error = ENAMETOOLONG;
		goto out;
	}

	/*
	 * Re-lookup the vnode by path to detect a possible rename.
	 * As a side effect, the vnode is relocked.
	 * If vnode was renamed, return ENOENT.
	 */
	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
	    UIO_SYSSPACE, path, td);
	error = namei(&nd);
	if (error != 0) {
		vrele(vp);
		goto out;
	}
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vp1 = nd.ni_vp;
	vrele(vp);
	if (vp1 == vp)
		/* Same vnode: vp1 is vp, locked and referenced by namei(). */
		strcpy(path, rpath);
	else {
		/* Renamed in the meantime: drop the new vnode, fail. */
		vput(vp1);
		error = ENOENT;
	}

out:
	free(fbuf, M_TEMP);
	return (error);
}