xref: /freebsd/sys/kern/vfs_cache.c (revision d2b2128a286a00ee53d79cb88b4e59bf42525cf9)
/*-
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Poul-Henning Kamp of the FreeBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <vm/uma.h>

/*
 * This structure describes the elements in the cache of recent
 * names looked up by namei.
 */

struct	namecache {
	LIST_ENTRY(namecache) nc_hash;	/* hash chain */
	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
	struct	vnode *nc_dvp;		/* vnode of parent of name */
	struct	vnode *nc_vp;		/* vnode the name refers to */
	u_char	nc_flag;		/* flag bits */
	u_char	nc_nlen;		/* length of name */
	char	nc_name[0];		/* segment name */
};
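
/*
 * nc_name is a variable-length array: an entry for a name of length
 * "len" occupies sizeof(struct namecache) + len bytes (see cache_alloc()
 * below), and the stored name is not NUL-terminated.  A hypothetical
 * helper showing how a name is read back out (a sketch, not compiled):
 */
#if 0
static void
example_print_entry(struct namecache *ncp)
{
	char name[NAME_MAX + 1];

	/* Copy nc_nlen bytes; the cache never stores a terminating NUL. */
	bcopy(ncp->nc_name, name, ncp->nc_nlen);
	name[ncp->nc_nlen] = '\0';
	printf("%s -> vnode %p\n", name, ncp->nc_vp);
}
#endif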

/*
 * Name caching works as follows:
 *
 * Names found by directory scans are retained in a cache
 * for future reference.  It is managed LRU, so frequently
 * used names will hang around.  The cache is indexed by hash value
 * obtained from (vp, name) where vp refers to the directory
 * containing name.
 *
 * If it is a "negative" entry, (i.e. for a name that is known NOT to
 * exist) the vnode pointer will be NULL.
 *
 * Upon reaching the last segment of a path, if the reference
 * is for DELETE, or NOCACHE is set (rewrite), and the
 * name is located in the cache, it will be dropped.
 */

/*
 * Structures associated with name caching.
 */
#define NCHHASH(hash) \
	(&nchashtbl[(hash) & nchash])
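
/*
 * A bucket is selected from the FNV-1 hash of the component name chained
 * with the address of the directory vnode, as computed in cache_lookup()
 * and cache_enter() below:
 *
 *	hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT);
 *	hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
 *	ncpp = NCHHASH(hash);
 */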
static LIST_HEAD(nchashhead, namecache) *nchashtbl;	/* Hash Table */
static TAILQ_HEAD(, namecache) ncneg;	/* LRU list of negative entries */
static u_long	nchash;			/* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, "");
static u_long	ncnegfactor = 16;	/* ratio of negative entries */
SYSCTL_ULONG(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, "");
static u_long	numneg;			/* number of negative entries allocated */
SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0, "");
static u_long	numcache;		/* number of cache entries allocated */
SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0, "");
static u_long	numcachehv;		/* number of cache entries with vnodes held */
SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0, "");
#if 0
static u_long	numcachepl;		/* number of cache purges for leaf entries */
SYSCTL_ULONG(_debug, OID_AUTO, numcachepl, CTLFLAG_RD, &numcachepl, 0, "");
#endif
struct	nchstats nchstats;		/* cache effectiveness statistics */

static struct rwlock cache_lock;
RW_SYSINIT(vfscache, &cache_lock, "Name Cache");

#define	CACHE_UPGRADE_LOCK()	rw_try_upgrade(&cache_lock)
#define	CACHE_RLOCK()		rw_rlock(&cache_lock)
#define	CACHE_RUNLOCK()		rw_runlock(&cache_lock)
#define	CACHE_WLOCK()		rw_wlock(&cache_lock)
#define	CACHE_WUNLOCK()		rw_wunlock(&cache_lock)

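/*
 * Lookups run under the read lock and upgrade to the write lock only
 * when an entry must be modified or removed.  If the opportunistic
 * try-upgrade fails, the read lock is dropped, the write lock is taken
 * and the lookup is redone from scratch (see the wlock: label in
 * cache_lookup()).  The pattern, in sketch form:
 *
 *	CACHE_RLOCK();
 *	... find ncp ...
 *	if (!CACHE_UPGRADE_LOCK()) {
 *		CACHE_RUNLOCK();
 *		CACHE_WLOCK();
 *		... redo the lookup under the write lock ...
 *	}
 *	cache_zap(ncp);
 *	CACHE_WUNLOCK();
 */
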
/*
 * UMA zones for the VFS cache.
 *
 * The small cache is used for entries with short names, which are the
 * most common.  The large cache is used for entries which are too big to
 * fit in the small cache.
 */
static uma_zone_t cache_zone_small;
static uma_zone_t cache_zone_large;

#define	CACHE_PATH_CUTOFF	32
#define	CACHE_ZONE_SMALL	(sizeof(struct namecache) + CACHE_PATH_CUTOFF)
#define	CACHE_ZONE_LARGE	(sizeof(struct namecache) + NAME_MAX)

#define cache_alloc(len)	uma_zalloc(((len) <= CACHE_PATH_CUTOFF) ? \
	cache_zone_small : cache_zone_large, M_WAITOK)
#define cache_free(ncp)		do { \
	if (ncp != NULL) \
		uma_zfree(((ncp)->nc_nlen <= CACHE_PATH_CUTOFF) ? \
		    cache_zone_small : cache_zone_large, (ncp)); \
} while (0)
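
/*
 * Typical use of the two macros above, mirroring cache_enter() (a
 * sketch): the zone is picked from the name length at allocation time
 * and from nc_nlen at free time, so nc_nlen must be filled in before
 * the entry can be freed.
 *
 *	ncp = cache_alloc(cnp->cn_namelen);
 *	ncp->nc_nlen = cnp->cn_namelen;
 *	...
 *	cache_free(ncp);	(NULL-safe)
 */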

static int	doingcache = 1;		/* 1 => enable the cache */
SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, "");

/* Export size information to userland */
SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, 0,
	sizeof(struct namecache), "");

/*
 * The new name cache statistics
 */
static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
#define STATNODE(mode, name, var) \
	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, mode, var, 0, "");
STATNODE(CTLFLAG_RD, numneg, &numneg);
STATNODE(CTLFLAG_RD, numcache, &numcache);
static u_long numcalls; STATNODE(CTLFLAG_RD, numcalls, &numcalls);
static u_long dothits; STATNODE(CTLFLAG_RD, dothits, &dothits);
static u_long dotdothits; STATNODE(CTLFLAG_RD, dotdothits, &dotdothits);
static u_long numchecks; STATNODE(CTLFLAG_RD, numchecks, &numchecks);
static u_long nummiss; STATNODE(CTLFLAG_RD, nummiss, &nummiss);
static u_long nummisszap; STATNODE(CTLFLAG_RD, nummisszap, &nummisszap);
static u_long numposzaps; STATNODE(CTLFLAG_RD, numposzaps, &numposzaps);
static u_long numposhits; STATNODE(CTLFLAG_RD, numposhits, &numposhits);
static u_long numnegzaps; STATNODE(CTLFLAG_RD, numnegzaps, &numnegzaps);
static u_long numneghits; STATNODE(CTLFLAG_RD, numneghits, &numneghits);
static u_long numupgrades; STATNODE(CTLFLAG_RD, numupgrades, &numupgrades);

SYSCTL_OPAQUE(_vfs_cache, OID_AUTO, nchstats, CTLFLAG_RD | CTLFLAG_MPSAFE,
	&nchstats, sizeof(nchstats), "LU", "VFS cache effectiveness statistics");

static void cache_zap(struct namecache *ncp);
static int vn_vptocnp(struct vnode **vp, char **bp, char *buf, u_int *buflen);
static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
    char *buf, char **retbuf, u_int buflen);

static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");

/*
 * Flags in namecache.nc_flag
 */
#define NCF_WHITE	1

#ifdef DIAGNOSTIC
/*
 * Grab an atomic snapshot of the name cache hash chain lengths
 */
SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL, "hash table stats");

static int
sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int n_nchash;
	int count;

	n_nchash = nchash + 1;	/* nchash is max index, not count */
	if (!req->oldptr)
		return (SYSCTL_OUT(req, 0, n_nchash * sizeof(int)));

	/* Scan hash tables for applicable entries */
	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
		CACHE_RLOCK();
		count = 0;
		LIST_FOREACH(ncp, ncpp, nc_hash) {
			count++;
		}
		CACHE_RUNLOCK();
		error = SYSCTL_OUT(req, &count, sizeof(count));
		if (error)
			return (error);
	}
	return (0);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
	CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
	"nchash chain lengths");

static int
sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int n_nchash;
	int count, maxlength, used, pct;

	if (!req->oldptr)
		return (SYSCTL_OUT(req, 0, 4 * sizeof(int)));

	n_nchash = nchash + 1;	/* nchash is max index, not count */
	used = 0;
	maxlength = 0;

	/* Scan hash tables for applicable entries */
	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
		count = 0;
		CACHE_RLOCK();
		LIST_FOREACH(ncp, ncpp, nc_hash) {
			count++;
		}
		CACHE_RUNLOCK();
		if (count)
			used++;
		if (maxlength < count)
			maxlength = count;
	}
	n_nchash = nchash + 1;
	pct = (used * 100 * 100) / n_nchash;
	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &used, sizeof(used));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &pct, sizeof(pct));
	if (error)
		return (error);
	return (0);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
	CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
	"nchash chain lengths");
#endif

/*
 * cache_zap():
 *
 *   Removes a namecache entry from the cache, whether it holds an actual
 *   vnode pointer or is just a negative cache entry.
 */
static void
cache_zap(struct namecache *ncp)
{
	struct vnode *vp;

	rw_assert(&cache_lock, RA_WLOCKED);
	CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, ncp->nc_vp);
	vp = NULL;
	LIST_REMOVE(ncp, nc_hash);
	LIST_REMOVE(ncp, nc_src);
	if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
		vp = ncp->nc_dvp;
		numcachehv--;
	}
	if (ncp->nc_vp) {
		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
		ncp->nc_vp->v_dd = NULL;
	} else {
		TAILQ_REMOVE(&ncneg, ncp, nc_dst);
		numneg--;
	}
	numcache--;
	cache_free(ncp);
	if (vp)
		vdrop(vp);
}

/*
 * Lookup an entry in the cache
 *
 * Lookup is called with dvp pointing to the directory to search,
 * cnp pointing to the name of the entry being sought. If the lookup
 * succeeds, the vnode is returned in *vpp, and a status of -1 is
 * returned. If the lookup determines that the name does not exist
 * (negative caching), a status of ENOENT is returned. If the lookup
 * fails, a status of zero is returned.  If the directory vnode is
 * recycled out from under us due to a forced unmount, a status of
 * ENOENT is returned.
 *
 * vpp is locked and ref'd on return.  If we're looking up DOTDOT, dvp is
 * unlocked.  If we're looking up "." an extra ref is taken, but the lock
 * is not recursively acquired.
 */
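
/*
 * A hypothetical caller, illustrating the return protocol described
 * above (a sketch only; vfs_cache_lookup() below is the real consumer):
 *
 *	error = cache_lookup(dvp, &vp, cnp);
 *	if (error == -1)
 *		return (0);		(hit: *vpp locked and referenced)
 *	if (error == ENOENT)
 *		return (ENOENT);	(negative hit: name known absent)
 *	(error == 0: cache miss, fall through to the filesystem lookup)
 */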

int
cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)
{
	struct namecache *ncp;
	u_int32_t hash;
	int error, ltype, wlocked;

	if (!doingcache) {
		cnp->cn_flags &= ~MAKEENTRY;
		return (0);
	}
retry:
	CACHE_RLOCK();
	wlocked = 0;
	numcalls++;
	error = 0;

retry_wlocked:
	if (cnp->cn_nameptr[0] == '.') {
		if (cnp->cn_namelen == 1) {
			*vpp = dvp;
			CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .",
			    dvp, cnp->cn_nameptr);
			dothits++;
			goto success;
		}
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
			dotdothits++;
			if (dvp->v_dd == NULL ||
			    (cnp->cn_flags & MAKEENTRY) == 0) {
				goto unlock;
			}
			*vpp = dvp->v_dd;
			CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..",
			    dvp, cnp->cn_nameptr, *vpp);
			goto success;
		}
	}

	hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT);
	hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		numchecks++;
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	/* We failed to find an entry */
	if (ncp == NULL) {
		if ((cnp->cn_flags & MAKEENTRY) == 0) {
			nummisszap++;
		} else {
			nummiss++;
		}
		nchstats.ncs_miss++;
		goto unlock;
	}

	/* We don't want to have an entry, so dump it */
	if ((cnp->cn_flags & MAKEENTRY) == 0) {
		numposzaps++;
		nchstats.ncs_badhits++;
		if (!wlocked && !CACHE_UPGRADE_LOCK())
			goto wlock;
		cache_zap(ncp);
		CACHE_WUNLOCK();
		return (0);
	}

	/* We found a "positive" match, return the vnode */
	if (ncp->nc_vp) {
		numposhits++;
		nchstats.ncs_goodhits++;
		*vpp = ncp->nc_vp;
		CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
		    dvp, cnp->cn_nameptr, *vpp, ncp);
		goto success;
	}

	/* We found a negative match, and want to create it, so purge */
	if (cnp->cn_nameiop == CREATE) {
		numnegzaps++;
		nchstats.ncs_badhits++;
		if (!wlocked && !CACHE_UPGRADE_LOCK())
			goto wlock;
		cache_zap(ncp);
		CACHE_WUNLOCK();
		return (0);
	}

	if (!wlocked && !CACHE_UPGRADE_LOCK())
		goto wlock;
	numneghits++;
	/*
	 * We found a "negative" match, so we shift it to the end of
	 * the "negative" cache entries queue to satisfy LRU.  Also,
	 * check to see if the entry is a whiteout; indicate this to
	 * the componentname, if so.
	 */
	TAILQ_REMOVE(&ncneg, ncp, nc_dst);
	TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
	nchstats.ncs_neghits++;
	if (ncp->nc_flag & NCF_WHITE)
		cnp->cn_flags |= ISWHITEOUT;
	CACHE_WUNLOCK();
	return (ENOENT);

wlock:
	/*
	 * We need to update the cache after our lookup, so upgrade to
	 * a write lock and retry the operation.
	 */
	CACHE_RUNLOCK();
	CACHE_WLOCK();
	numupgrades++;
	wlocked = 1;
	goto retry_wlocked;

success:
	/*
	 * On success we return a locked and ref'd vnode as per the lookup
	 * protocol.
	 */
	if (dvp == *vpp) {   /* lookup on "." */
		VREF(*vpp);
		if (wlocked)
			CACHE_WUNLOCK();
		else
			CACHE_RUNLOCK();
		/*
		 * When we lookup "." we still can be asked to lock it
		 * differently...
		 */
		ltype = cnp->cn_lkflags & LK_TYPE_MASK;
		if (ltype != VOP_ISLOCKED(*vpp)) {
			if (ltype == LK_EXCLUSIVE) {
				vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
				if ((*vpp)->v_iflag & VI_DOOMED) {
					/* forced unmount */
					vrele(*vpp);
					*vpp = NULL;
					return (ENOENT);
				}
			} else
				vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
		}
		return (-1);
	}
	ltype = 0;	/* silence gcc warning */
	if (cnp->cn_flags & ISDOTDOT) {
		ltype = VOP_ISLOCKED(dvp);
		VOP_UNLOCK(dvp, 0);
	}
	VI_LOCK(*vpp);
	if (wlocked)
		CACHE_WUNLOCK();
	else
		CACHE_RUNLOCK();
	error = vget(*vpp, cnp->cn_lkflags | LK_INTERLOCK, cnp->cn_thread);
	if (cnp->cn_flags & ISDOTDOT)
		vn_lock(dvp, ltype | LK_RETRY);
	if (error) {
		*vpp = NULL;
		goto retry;
	}
	if ((cnp->cn_flags & ISLASTCN) &&
	    (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) {
		ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
	}
	return (-1);

unlock:
	if (wlocked)
		CACHE_WUNLOCK();
	else
		CACHE_RUNLOCK();
	return (0);
}

/*
 * Add an entry to the cache.
 */
void
cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
{
	struct namecache *ncp, *n2;
	struct nchashhead *ncpp;
	u_int32_t hash;
	int hold;
	int zap;
	int len;

	CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr);
	VNASSERT(vp == NULL || (vp->v_iflag & VI_DOOMED) == 0, vp,
	    ("cache_enter: Adding a doomed vnode"));

	if (!doingcache)
		return;

	/*
	 * Avoid blowout in namecache entries.
	 */
	if (numcache >= desiredvnodes * 2)
		return;

	if (cnp->cn_nameptr[0] == '.') {
		if (cnp->cn_namelen == 1) {
			return;
		}
		/*
		 * For dotdot lookups only cache the v_dd pointer if the
		 * directory has a link back to its parent via v_cache_dst.
		 * Without this an unlinked directory would keep a soft
		 * reference to its parent which could not be cleared at
		 * cache_purge() time.
		 */
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
			CACHE_WLOCK();
			if (!TAILQ_EMPTY(&dvp->v_cache_dst))
				dvp->v_dd = vp;
			CACHE_WUNLOCK();
			return;
		}
	}

	hold = 0;
	zap = 0;

	/*
	 * Calculate the hash key and setup as much of the new
	 * namecache entry as possible before acquiring the lock.
	 */
	ncp = cache_alloc(cnp->cn_namelen);
	ncp->nc_vp = vp;
	ncp->nc_dvp = dvp;
	len = ncp->nc_nlen = cnp->cn_namelen;
	hash = fnv_32_buf(cnp->cn_nameptr, len, FNV1_32_INIT);
	bcopy(cnp->cn_nameptr, ncp->nc_name, len);
	hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
	CACHE_WLOCK();

	/*
	 * See if this vnode or negative entry is already in the cache
	 * with this name.  This can happen with concurrent lookups of
	 * the same path name.
	 */
	ncpp = NCHHASH(hash);
	LIST_FOREACH(n2, ncpp, nc_hash) {
		if (n2->nc_dvp == dvp &&
		    n2->nc_nlen == cnp->cn_namelen &&
		    !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
			CACHE_WUNLOCK();
			cache_free(ncp);
			return;
		}
	}

	numcache++;
	if (!vp) {
		numneg++;
		ncp->nc_flag = cnp->cn_flags & ISWHITEOUT ? NCF_WHITE : 0;
	} else if (vp->v_type == VDIR) {
		vp->v_dd = dvp;
	} else {
		vp->v_dd = NULL;
	}

	/*
	 * Insert the new namecache entry into the appropriate chain
	 * within the cache entries table.
	 */
	LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
	if (LIST_EMPTY(&dvp->v_cache_src)) {
		hold = 1;
		numcachehv++;
	}
	LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
	/*
	 * If the entry is "negative", we place it into the
	 * "negative" cache queue, otherwise, we place it into the
	 * destination vnode's cache entries queue.
	 */
	if (vp) {
		TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
	} else {
		TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
	}
	if (numneg * ncnegfactor > numcache) {
		ncp = TAILQ_FIRST(&ncneg);
		zap = 1;
	}
	if (hold)
		vhold(dvp);
	if (zap)
		cache_zap(ncp);
	CACHE_WUNLOCK();
}
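
/*
 * Filesystems normally call cache_enter() from their lookup routine once
 * the on-disk lookup has resolved, caching misses as well as hits.  A
 * sketch, with xxx_dirlookup() standing in for a hypothetical
 * filesystem-specific directory scan:
 *
 *	error = xxx_dirlookup(dvp, cnp, &vp);
 *	if (cnp->cn_flags & MAKEENTRY)
 *		cache_enter(dvp, error == ENOENT ? NULL : vp, cnp);
 */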

/*
 * Name cache initialization, from vfs_init() when we are booting
 */
static void
nchinit(void *dummy __unused)
{

	TAILQ_INIT(&ncneg);

	cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL, NULL,
	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
	cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE, NULL,
	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);

	nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
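
/*
 * hashinit() rounds the requested size up to a power of two and stores
 * the corresponding mask (size - 1) in nchash, which is why NCHHASH()
 * selects a bucket with "& nchash" and the DIAGNOSTIC code above treats
 * nchash + 1 as the bucket count.
 */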

/*
 * Invalidate all entries to a particular vnode.
 */
void
cache_purge(struct vnode *vp)
{

	CTR1(KTR_VFS, "cache_purge(%p)", vp);
	CACHE_WLOCK();
	while (!LIST_EMPTY(&vp->v_cache_src))
		cache_zap(LIST_FIRST(&vp->v_cache_src));
	while (!TAILQ_EMPTY(&vp->v_cache_dst))
		cache_zap(TAILQ_FIRST(&vp->v_cache_dst));
	vp->v_dd = NULL;
	CACHE_WUNLOCK();
}

/*
 * Invalidate all negative entries for a particular directory vnode.
 */
void
cache_purge_negative(struct vnode *vp)
{
	struct namecache *cp, *ncp;

	CTR1(KTR_VFS, "cache_purge_negative(%p)", vp);
	CACHE_WLOCK();
	LIST_FOREACH_SAFE(cp, &vp->v_cache_src, nc_src, ncp) {
		if (cp->nc_vp == NULL)
			cache_zap(cp);
	}
	CACHE_WUNLOCK();
}

/*
 * Flush all entries referencing a particular filesystem.
 */
void
cache_purgevfs(struct mount *mp)
{
	struct nchashhead *ncpp;
	struct namecache *ncp, *nnp;

	/* Scan hash tables for applicable entries */
	CACHE_WLOCK();
	for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) {
		LIST_FOREACH_SAFE(ncp, ncpp, nc_hash, nnp) {
			if (ncp->nc_dvp->v_mount == mp)
				cache_zap(ncp);
		}
	}
	CACHE_WUNLOCK();
}

/*
 * Perform canonical checks and a cache lookup, and pass the operation on
 * to the filesystem through VOP_CACHEDLOOKUP() only if needed.
 */
int
vfs_cache_lookup(struct vop_lookup_args /* {
	struct vnode *a_dvp;
	struct vnode **a_vpp;
	struct componentname *a_cnp;
} */ *ap)
{
	struct vnode *dvp;
	int error;
	struct vnode **vpp = ap->a_vpp;
	struct componentname *cnp = ap->a_cnp;
	struct ucred *cred = cnp->cn_cred;
	int flags = cnp->cn_flags;
	struct thread *td = cnp->cn_thread;

	*vpp = NULL;
	dvp = ap->a_dvp;

	if (dvp->v_type != VDIR)
		return (ENOTDIR);

	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
		return (EROFS);

	error = VOP_ACCESS(dvp, VEXEC, cred, td);
	if (error)
		return (error);

	error = cache_lookup(dvp, vpp, cnp);
	if (error == 0)
		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
	if (error == -1)
		return (0);
	return (error);
}
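
/*
 * A filesystem opts in by pointing its vop_lookup entry at
 * vfs_cache_lookup and supplying the uncached lookup via
 * vop_cachedlookup, along these lines (a sketch for a hypothetical
 * filesystem "xxx"):
 *
 *	struct vop_vector xxx_vnodeops = {
 *		...
 *		.vop_lookup =		vfs_cache_lookup,
 *		.vop_cachedlookup =	xxx_lookup,
 *		...
 *	};
 */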

#ifndef _SYS_SYSPROTO_H_
struct  __getcwd_args {
	u_char	*buf;
	u_int	buflen;
};
#endif

/*
 * XXX All of these sysctls would probably be more productive dead.
 */
static int disablecwd;
SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
   "Disable the getcwd syscall");

/* Implementation of the getcwd syscall. */
int
__getcwd(struct thread *td, struct __getcwd_args *uap)
{

	return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen));
}

int
kern___getcwd(struct thread *td, u_char *buf, enum uio_seg bufseg, u_int buflen)
{
	char *bp, *tmpbuf;
	struct filedesc *fdp;
	struct vnode *cdir, *rdir;
	int error, vfslocked;

	if (disablecwd)
		return (ENODEV);
	if (buflen < 2)
		return (EINVAL);
	if (buflen > MAXPATHLEN)
		buflen = MAXPATHLEN;

	tmpbuf = malloc(buflen, M_TEMP, M_WAITOK);
	fdp = td->td_proc->p_fd;
	FILEDESC_SLOCK(fdp);
	cdir = fdp->fd_cdir;
	VREF(cdir);
	rdir = fdp->fd_rdir;
	VREF(rdir);
	FILEDESC_SUNLOCK(fdp);
	error = vn_fullpath1(td, cdir, rdir, tmpbuf, &bp, buflen);
	vfslocked = VFS_LOCK_GIANT(rdir->v_mount);
	vrele(rdir);
	VFS_UNLOCK_GIANT(vfslocked);
	vfslocked = VFS_LOCK_GIANT(cdir->v_mount);
	vrele(cdir);
	VFS_UNLOCK_GIANT(vfslocked);

	if (!error) {
		if (bufseg == UIO_SYSSPACE)
			bcopy(bp, buf, strlen(bp) + 1);
		else
			error = copyout(bp, buf, strlen(bp) + 1);
#ifdef KTRACE
		if (KTRPOINT(curthread, KTR_NAMEI))
			ktrnamei(bp);
#endif
	}
	free(tmpbuf, M_TEMP);
	return (error);
}

/*
 * Thus begins the fullpath magic.
 */

#undef STATNODE
#define STATNODE(name)							\
	static u_int name;						\
	SYSCTL_UINT(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, "")

static int disablefullpath;
SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0,
	"Disable the vn_fullpath function");

/* These count for kern___getcwd(), too. */
STATNODE(numfullpathcalls);
STATNODE(numfullpathfail1);
STATNODE(numfullpathfail2);
STATNODE(numfullpathfail4);
STATNODE(numfullpathfound);

/*
 * Retrieve the full filesystem path that corresponds to a vnode from the
 * name cache (if available).
 */
int
vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
{
	char *buf;
	struct filedesc *fdp;
	struct vnode *rdir;
	int error, vfslocked;

	if (disablefullpath)
		return (ENODEV);
	if (vn == NULL)
		return (EINVAL);

	buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
	fdp = td->td_proc->p_fd;
	FILEDESC_SLOCK(fdp);
	rdir = fdp->fd_rdir;
	VREF(rdir);
	FILEDESC_SUNLOCK(fdp);
	error = vn_fullpath1(td, vn, rdir, buf, retbuf, MAXPATHLEN);
	vfslocked = VFS_LOCK_GIANT(rdir->v_mount);
	vrele(rdir);
	VFS_UNLOCK_GIANT(vfslocked);

	if (!error)
		*freebuf = buf;
	else
		free(buf, M_TEMP);
	return (error);
}
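
/*
 * On success the caller gets two pointers: *retbuf points at the start
 * of the path inside the allocated buffer, while *freebuf is the
 * allocation itself and is what must be released.  Sketch of a caller:
 *
 *	char *fullpath, *freepath;
 *
 *	if (vn_fullpath(td, vp, &fullpath, &freepath) == 0) {
 *		printf("path: %s\n", fullpath);
 *		free(freepath, M_TEMP);
 *	}
 */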

/*
 * This function is similar to vn_fullpath, but it attempts to look up the
 * pathname relative to the global root mount point.  This is required for
 * the auditing sub-system, as audited pathnames must be absolute, relative
 * to the global root mount point.
 */
int
vn_fullpath_global(struct thread *td, struct vnode *vn,
    char **retbuf, char **freebuf)
{
	char *buf;
	int error;

	if (disablefullpath)
		return (ENODEV);
	if (vn == NULL)
		return (EINVAL);
	buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
	error = vn_fullpath1(td, vn, rootvnode, buf, retbuf, MAXPATHLEN);
	if (!error)
		*freebuf = buf;
	else
		free(buf, M_TEMP);
	return (error);
}

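/*
 * Resolve one pathname component: ask the filesystem, via VOP_VPTOCNP(),
 * for the name of *vp within its parent, leave *bp pointing at the name
 * placed at the end of buf, and walk *vp up to the parent.  Entered with
 * the cache read lock held; the lock is dropped around the VOP call and
 * reacquired on success, but is left released when an error is returned.
 */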
static int
vn_vptocnp(struct vnode **vp, char **bp, char *buf, u_int *buflen)
{
	struct vnode *dvp;
	int error, vfslocked;

	vhold(*vp);
	CACHE_RUNLOCK();
	vfslocked = VFS_LOCK_GIANT((*vp)->v_mount);
	vn_lock(*vp, LK_SHARED | LK_RETRY);
	error = VOP_VPTOCNP(*vp, &dvp, buf, buflen);
	VOP_UNLOCK(*vp, 0);
	vdrop(*vp);
	VFS_UNLOCK_GIANT(vfslocked);
	if (error) {
		numfullpathfail2++;
		return (error);
	}
	*bp = buf + *buflen;
	*vp = dvp;
	CACHE_RLOCK();
	if ((*vp)->v_iflag & VI_DOOMED) {
		/* forced unmount */
		CACHE_RUNLOCK();
		vdrop(*vp);
		return (ENOENT);
	}
	vdrop(*vp);

	return (0);
}

/*
 * The magic behind kern___getcwd() and vn_fullpath().
 */
static int
vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
    char *buf, char **retbuf, u_int buflen)
{
	char *bp;
	int error, i, slash_prefixed;
	struct namecache *ncp;

	buflen--;
	bp = buf + buflen;
	*bp = '\0';
	error = 0;
	slash_prefixed = 0;

	CACHE_RLOCK();
	numfullpathcalls++;
	if (vp->v_type != VDIR) {
		ncp = TAILQ_FIRST(&vp->v_cache_dst);
		if (ncp != NULL) {
			buflen -= ncp->nc_nlen;
			for (i = ncp->nc_nlen - 1; i >= 0 && bp != buf; i--)
				*--bp = ncp->nc_name[i];
			if (bp == buf) {
				numfullpathfail4++;
				CACHE_RUNLOCK();
				return (ENOMEM);
			}
			vp = ncp->nc_dvp;
		} else {
			error = vn_vptocnp(&vp, &bp, buf, &buflen);
			if (error)
				return (error);
		}
		if (buflen <= 0) {
			numfullpathfail4++;
			CACHE_RUNLOCK();
			return (ENOMEM);
		}
		*--bp = '/';
		buflen--;
		slash_prefixed = 1;
	}
	while (vp != rdir && vp != rootvnode) {
		if (vp->v_vflag & VV_ROOT) {
			if (vp->v_iflag & VI_DOOMED) {	/* forced unmount */
				CACHE_RUNLOCK();
				error = ENOENT;
				break;
			}
			vp = vp->v_mount->mnt_vnodecovered;
			continue;
		}
		if (vp->v_type != VDIR) {
			numfullpathfail1++;
			CACHE_RUNLOCK();
			error = ENOTDIR;
			break;
		}
		ncp = TAILQ_FIRST(&vp->v_cache_dst);
		if (ncp != NULL) {
			MPASS(vp->v_dd == NULL || ncp->nc_dvp == vp->v_dd);
			buflen -= ncp->nc_nlen;
			for (i = ncp->nc_nlen - 1; i >= 0 && bp != buf; i--)
				*--bp = ncp->nc_name[i];
			if (bp == buf) {
				numfullpathfail4++;
				CACHE_RUNLOCK();
				error = ENOMEM;
				break;
			}
			vp = ncp->nc_dvp;
		} else {
			error = vn_vptocnp(&vp, &bp, buf, &buflen);
			if (error)
				break;
		}
		if (buflen <= 0) {
			numfullpathfail4++;
			CACHE_RUNLOCK();
			error = ENOMEM;
			break;
		}
		*--bp = '/';
		buflen--;
		slash_prefixed = 1;
	}
	if (error)
		return (error);
	if (!slash_prefixed) {
		if (bp == buf) {
			numfullpathfail4++;
			CACHE_RUNLOCK();
			return (ENOMEM);
		} else
			*--bp = '/';
	}
	numfullpathfound++;
	CACHE_RUNLOCK();

	*retbuf = bp;
	return (0);
}

int
vn_commname(struct vnode *vp, char *buf, u_int buflen)
{
	struct namecache *ncp;
	int l;

	CACHE_RLOCK();
	ncp = TAILQ_FIRST(&vp->v_cache_dst);
	if (!ncp) {
		CACHE_RUNLOCK();
		return (ENOENT);
	}
	l = min(ncp->nc_nlen, buflen - 1);
	memcpy(buf, ncp->nc_name, l);
	CACHE_RUNLOCK();
	buf[l] = '\0';
	return (0);
}