/*-
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Poul-Henning Kamp of the FreeBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysctl.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/namei.h>
#include <sys/malloc.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>
#include <sys/proc.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>

#include <vm/uma.h>

/*
 * This structure describes the elements in the cache of recent
 * names looked up by namei.
 */

struct	namecache {
	LIST_ENTRY(namecache) nc_hash;	/* hash chain */
	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
	struct	vnode *nc_dvp;		/* vnode of parent of name */
	struct	vnode *nc_vp;		/* vnode the name refers to */
	u_char	nc_flag;		/* flag bits */
	u_char	nc_nlen;		/* length of name */
	char	nc_name[0];		/* segment name */
};

/*
 * Name caching works as follows:
 *
 * Names found by directory scans are retained in a cache
 * for future reference.  It is managed LRU, so frequently
 * used names will hang around.  Cache is indexed by hash value
 * obtained from (vp, name) where vp refers to the directory
 * containing name.
 *
 * If it is a "negative" entry, (i.e. for a name that is known NOT to
 * exist) the vnode pointer will be NULL.
 *
 * Upon reaching the last segment of a path, if the reference
 * is for DELETE, or NOCACHE is set (rewrite), and the
 * name is located in the cache, it will be dropped.
 */
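
/*
 * Illustrative sketch (not part of the original source): the hash key
 * described above is built exactly the way cache_lookup() and
 * cache_enter() below build it, by folding first the name and then the
 * directory's v_id through 32-bit FNV-1 hashing:
 *
 *	hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT);
 *	hash = fnv_32_buf(&dvp->v_id, sizeof(dvp->v_id), hash);
 *	ncpp = NCHHASH(hash);		(bucket for this (dvp, name) pair)
 */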

/*
 * Structures associated with name caching.
 */
#define NCHHASH(hash) \
	(&nchashtbl[(hash) & nchash])
static LIST_HEAD(nchashhead, namecache) *nchashtbl;	/* Hash Table */
static TAILQ_HEAD(, namecache) ncneg;	/* Negative entry LRU list */
static u_long	nchash;			/* size of hash table - 1 (mask) */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, "");
static u_long	ncnegfactor = 16;	/* ratio of negative entries */
SYSCTL_ULONG(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, "");
static u_long	numneg;			/* number of negative entries allocated */
SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0, "");
static u_long	numcache;		/* number of cache entries allocated */
SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0, "");
static u_long	numcachehv;		/* number of cache entries with vnodes held */
SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0, "");
#if 0
static u_long	numcachepl;		/* number of cache purges for leaf entries */
SYSCTL_ULONG(_debug, OID_AUTO, numcachepl, CTLFLAG_RD, &numcachepl, 0, "");
#endif
struct	nchstats nchstats;		/* cache effectiveness statistics */

static struct mtx cache_lock;
MTX_SYSINIT(vfscache, &cache_lock, "Name Cache", MTX_DEF);

#define	CACHE_LOCK()	mtx_lock(&cache_lock)
#define	CACHE_UNLOCK()	mtx_unlock(&cache_lock)

/*
 * UMA zones for the VFS cache.
 *
 * The small cache is used for entries with short names, which are the
 * most common.  The large cache is used for entries which are too big to
 * fit in the small cache.
 */
static uma_zone_t cache_zone_small;
static uma_zone_t cache_zone_large;

#define	CACHE_PATH_CUTOFF	32
#define	CACHE_ZONE_SMALL	(sizeof(struct namecache) + CACHE_PATH_CUTOFF)
#define	CACHE_ZONE_LARGE	(sizeof(struct namecache) + NAME_MAX)

#define cache_alloc(len)	uma_zalloc(((len) <= CACHE_PATH_CUTOFF) ? \
	cache_zone_small : cache_zone_large, M_WAITOK)
#define cache_free(ncp)		do { \
	if ((ncp) != NULL) \
		uma_zfree(((ncp)->nc_nlen <= CACHE_PATH_CUTOFF) ? \
		    cache_zone_small : cache_zone_large, (ncp)); \
} while (0)
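
/*
 * Sizing sketch (illustrative): cache_alloc() picks the zone by name
 * length, so the common short-name case never touches the large zone:
 *
 *	ncp = cache_alloc(cnp->cn_namelen);	(<= 32 -> small zone,
 *						 otherwise large zone)
 *	...
 *	cache_free(ncp);	(frees back to whichever zone nc_nlen picks)
 */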

static int	doingcache = 1;		/* 1 => enable the cache */
SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, "");

/* Export size information to userland */
SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode), "");
SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache), "");

/*
 * The new name cache statistics
 */
static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
#define STATNODE(mode, name, var) \
	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, mode, var, 0, "");
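/*
 * For illustration, STATNODE(CTLFLAG_RD, numneg, &numneg) expands to
 *
 *	SYSCTL_ULONG(_vfs_cache, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0, "");
 *
 * so each statistic below becomes a read-only vfs.cache.<name> sysctl.
 */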
STATNODE(CTLFLAG_RD, numneg, &numneg);
STATNODE(CTLFLAG_RD, numcache, &numcache);
static u_long numcalls; STATNODE(CTLFLAG_RD, numcalls, &numcalls);
static u_long dothits; STATNODE(CTLFLAG_RD, dothits, &dothits);
static u_long dotdothits; STATNODE(CTLFLAG_RD, dotdothits, &dotdothits);
static u_long numchecks; STATNODE(CTLFLAG_RD, numchecks, &numchecks);
static u_long nummiss; STATNODE(CTLFLAG_RD, nummiss, &nummiss);
static u_long nummisszap; STATNODE(CTLFLAG_RD, nummisszap, &nummisszap);
static u_long numposzaps; STATNODE(CTLFLAG_RD, numposzaps, &numposzaps);
static u_long numposhits; STATNODE(CTLFLAG_RD, numposhits, &numposhits);
static u_long numnegzaps; STATNODE(CTLFLAG_RD, numnegzaps, &numnegzaps);
static u_long numneghits; STATNODE(CTLFLAG_RD, numneghits, &numneghits);

SYSCTL_OPAQUE(_vfs_cache, OID_AUTO, nchstats, CTLFLAG_RD, &nchstats,
	sizeof(nchstats), "LU", "VFS cache effectiveness statistics");
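
/*
 * Userland sketch (assumption, not part of this file): the opaque
 * statistics can be pulled with sysctlbyname(3), e.g.
 *
 *	struct nchstats ncs;
 *	size_t len = sizeof(ncs);
 *
 *	if (sysctlbyname("vfs.cache.nchstats", &ncs, &len, NULL, 0) == 0)
 *		printf("good hits: %ld\n", ncs.ncs_goodhits);
 */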

static void cache_zap(struct namecache *ncp);

static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");

/*
 * Flags in namecache.nc_flag
 */
#define NCF_WHITE	1

/*
 * Grab a snapshot of the name cache hash chain lengths
 */
SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL, "hash table stats");

static int
sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int n_nchash;
	int count;

	n_nchash = nchash + 1;	/* nchash is max index, not count */
	if (!req->oldptr)
		return (SYSCTL_OUT(req, 0, n_nchash * sizeof(int)));

	/* Scan hash tables for applicable entries */
	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
		count = 0;
		LIST_FOREACH(ncp, ncpp, nc_hash) {
			count++;
		}
		error = SYSCTL_OUT(req, &count, sizeof(count));
		if (error)
			return (error);
	}
	return (0);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD,
	0, 0, sysctl_debug_hashstat_rawnchash, "S,int", "nchash chain lengths");

static int
sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int n_nchash;
	int count, maxlength, used, pct;

	if (!req->oldptr)
		return (SYSCTL_OUT(req, 0, 4 * sizeof(int)));

	n_nchash = nchash + 1;	/* nchash is max index, not count */
	used = 0;
	maxlength = 0;

	/* Scan hash tables for applicable entries */
	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
		count = 0;
		LIST_FOREACH(ncp, ncpp, nc_hash) {
			count++;
		}
		if (count)
			used++;
		if (maxlength < count)
			maxlength = count;
	}
	n_nchash = nchash + 1;
	pct = (used * 100 * 100) / n_nchash;	/* utilization in hundredths of a percent */
	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &used, sizeof(used));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &pct, sizeof(pct));
	if (error)
		return (error);
	return (0);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD,
	0, 0, sysctl_debug_hashstat_nchash, "I", "nchash chain lengths");
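
/*
 * Usage sketch (informational): from userland,
 *
 *	sysctl debug.hashstat.nchash
 *
 * reports four integers: table size, buckets in use, longest chain, and
 * utilization in hundredths of a percent, matching the four SYSCTL_OUT()
 * calls above.
 */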

/*
 * cache_zap():
 *
 *   Removes a namecache entry from the cache, whether it holds an actual
 *   vnode pointer or is just a negative cache entry.
 */
static void
cache_zap(struct namecache *ncp)
{
	struct vnode *vp;

	mtx_assert(&cache_lock, MA_OWNED);
	vp = NULL;
	LIST_REMOVE(ncp, nc_hash);
	LIST_REMOVE(ncp, nc_src);
	if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
		vp = ncp->nc_dvp;
		numcachehv--;
	}
	if (ncp->nc_vp) {
		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
	} else {
		TAILQ_REMOVE(&ncneg, ncp, nc_dst);
		numneg--;
	}
	numcache--;
	cache_free(ncp);
	if (vp)
		vdrop(vp);
}

/*
 * cache_leaf_test()
 *
 *      Test whether the namei cache entries for this (directory) vnode
 *      include any subdirectories.  Used to determine whether the
 *      directory is a leaf in the namei cache or not.  Note: the
 *      directory may still contain files in the namei cache.
 *
 *      Returns 0 if the directory is a leaf, -1 if it isn't.
 */
int
cache_leaf_test(struct vnode *vp)
{
	struct namecache *ncpc;
	int leaf;

	leaf = 0;
	CACHE_LOCK();
	LIST_FOREACH(ncpc, &vp->v_cache_src, nc_src) {
		if (ncpc->nc_vp != NULL && ncpc->nc_vp->v_type == VDIR) {
			leaf = -1;
			break;
		}
	}
	CACHE_UNLOCK();
	return (leaf);
}

/*
 * Lookup an entry in the cache
 *
 * Lookup is called with dvp pointing to the directory to search,
 * cnp pointing to the name of the entry being sought. If the lookup
 * succeeds, the vnode is returned in *vpp, and a status of -1 is
 * returned. If the lookup determines that the name does not exist
 * (negative caching), a status of ENOENT is returned. If the lookup
 * fails, a status of zero is returned.
 */

int
cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)
{
	struct namecache *ncp;
	u_int32_t hash;

	if (!doingcache) {
		cnp->cn_flags &= ~MAKEENTRY;
		return (0);
	}

	CACHE_LOCK();
	numcalls++;

	if (cnp->cn_nameptr[0] == '.') {
		if (cnp->cn_namelen == 1) {
			*vpp = dvp;
			dothits++;
			CACHE_UNLOCK();
			return (-1);
		}
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
			dotdothits++;
			if (dvp->v_dd->v_id != dvp->v_ddid ||
			    (cnp->cn_flags & MAKEENTRY) == 0) {
				dvp->v_ddid = 0;
				CACHE_UNLOCK();
				return (0);
			}
			*vpp = dvp->v_dd;
			CACHE_UNLOCK();
			return (-1);
		}
	}

	hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT);
	hash = fnv_32_buf(&dvp->v_id, sizeof(dvp->v_id), hash);
	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		numchecks++;
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	/* We failed to find an entry */
	if (ncp == NULL) {
		if ((cnp->cn_flags & MAKEENTRY) == 0) {
			nummisszap++;
		} else {
			nummiss++;
		}
		nchstats.ncs_miss++;
		CACHE_UNLOCK();
		return (0);
	}

	/* We don't want to have an entry, so dump it */
	if ((cnp->cn_flags & MAKEENTRY) == 0) {
		numposzaps++;
		nchstats.ncs_badhits++;
		cache_zap(ncp);
		CACHE_UNLOCK();
		return (0);
	}

	/* We found a "positive" match, return the vnode */
	if (ncp->nc_vp) {
		numposhits++;
		nchstats.ncs_goodhits++;
		*vpp = ncp->nc_vp;
		CACHE_UNLOCK();
		return (-1);
	}

	/* We found a negative match, and want to create it, so purge */
	if (cnp->cn_nameiop == CREATE) {
		numnegzaps++;
		nchstats.ncs_badhits++;
		cache_zap(ncp);
		CACHE_UNLOCK();
		return (0);
	}

	numneghits++;
	/*
	 * We found a "negative" match, so we shift it to the end of
	 * the "negative" cache entries queue to satisfy LRU.  Also,
	 * check to see if the entry is a whiteout; indicate this to
	 * the componentname, if so.
	 */
	TAILQ_REMOVE(&ncneg, ncp, nc_dst);
	TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
	nchstats.ncs_neghits++;
	if (ncp->nc_flag & NCF_WHITE)
		cnp->cn_flags |= ISWHITEOUT;
	CACHE_UNLOCK();
	return (ENOENT);
}
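
/*
 * Caller sketch (illustrative; vfs_cache_lookup() below is the canonical
 * consumer): a filesystem lookup routine handles the three-way return as
 *
 *	error = cache_lookup(dvp, vpp, cnp);
 *	if (error == -1)
 *		return (0);		(hit: *vpp is the resolved vnode)
 *	else if (error == ENOENT)
 *		return (ENOENT);	(negative hit: name known absent)
 *	else
 *		... scan the directory and cache_enter() the result ...
 */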

/*
 * Add an entry to the cache.
 */
void
cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
{
	struct namecache *ncp;
	struct nchashhead *ncpp;
	u_int32_t hash;
	int hold;
	int zap;
	int len;

	if (!doingcache)
		return;

	if (cnp->cn_nameptr[0] == '.') {
		if (cnp->cn_namelen == 1) {
			return;
		}
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
			if (vp) {
				dvp->v_dd = vp;
				dvp->v_ddid = vp->v_id;
			} else {
				dvp->v_dd = dvp;
				dvp->v_ddid = 0;
			}
			return;
		}
	}

	hold = 0;
	zap = 0;
	ncp = cache_alloc(cnp->cn_namelen);
	CACHE_LOCK();
	numcache++;
	if (!vp) {
		numneg++;
		ncp->nc_flag = (cnp->cn_flags & ISWHITEOUT) ? NCF_WHITE : 0;
	} else if (vp->v_type == VDIR) {
		vp->v_dd = dvp;
		vp->v_ddid = dvp->v_id;
	}

	/*
	 * Set the rest of the namecache entry elements, calculate its
	 * hash key and insert it into the appropriate chain within
	 * the cache entries table.
	 */
	ncp->nc_vp = vp;
	ncp->nc_dvp = dvp;
	len = ncp->nc_nlen = cnp->cn_namelen;
	hash = fnv_32_buf(cnp->cn_nameptr, len, FNV1_32_INIT);
	bcopy(cnp->cn_nameptr, ncp->nc_name, len);
	hash = fnv_32_buf(&dvp->v_id, sizeof(dvp->v_id), hash);
	ncpp = NCHHASH(hash);
	LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
	if (LIST_EMPTY(&dvp->v_cache_src)) {
		hold = 1;
		numcachehv++;
	}
	LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
	/*
	 * If the entry is "negative", we place it into the
	 * "negative" cache queue, otherwise, we place it into the
	 * destination vnode's cache entries queue.
	 */
	if (vp) {
		TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
	} else {
		TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
	}
	if (numneg * ncnegfactor > numcache) {
		ncp = TAILQ_FIRST(&ncneg);
		zap = 1;
	}
	if (hold)
		vhold(dvp);
	if (zap)
		cache_zap(ncp);
	CACHE_UNLOCK();
}
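
/*
 * Usage sketch (illustrative): after a directory scan, a filesystem's
 * lookup routine typically records the outcome with
 *
 *	if (cnp->cn_flags & MAKEENTRY)
 *		cache_enter(dvp, *vpp, cnp);	(positive entry)
 *
 * or, when the name was not found and negative caching is desired,
 *
 *	cache_enter(dvp, NULL, cnp);		(negative entry)
 */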

/*
 * Name cache initialization, from vfs_init() when we are booting
 */
static void
nchinit(void *dummy __unused)
{

	TAILQ_INIT(&ncneg);

	cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL, NULL,
	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
	cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE, NULL,
	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);

	nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
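
/*
 * Note (informational): hashinit(9) sizes the table to a power of two no
 * larger than the request and returns the bucket mask through its last
 * argument, so nchash is a mask rather than a count, which is what lets
 * NCHHASH() select a bucket with a simple AND:
 *
 *	ncpp = &nchashtbl[hash & nchash];
 */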

/*
 * Invalidate all entries to a particular vnode.
 *
 * Remove all entries in the namecache relating to this vnode and
 * change the v_id.  We take the v_id from a global counter, since
 * it becomes a handy sequence number in crash-dumps that way.
 * No valid vnode will ever have (v_id == 0).
 *
 * XXX: Only time and the size of v_id prevent this from failing:
 * XXX: In theory we should hunt down all (struct vnode*, v_id)
 * XXX: soft references and nuke them, at least on the global
 * XXX: v_id wraparound.  The period of resistance can be extended
 * XXX: by incrementing each vnode's v_id individually instead of
 * XXX: using the global v_id.
 */

/*
 * XXX This is sometimes called when a vnode may still be re-used, in which
 * case v_dd may be invalid.  Need to look this up.
 */
void
cache_purge(struct vnode *vp)
{
	static u_long nextid;

	CACHE_LOCK();
	while (!LIST_EMPTY(&vp->v_cache_src))
		cache_zap(LIST_FIRST(&vp->v_cache_src));
	while (!TAILQ_EMPTY(&vp->v_cache_dst))
		cache_zap(TAILQ_FIRST(&vp->v_cache_dst));

	do {
		nextid++;
	} while (nextid == vp->v_id || !nextid);
	vp->v_id = nextid;
	vp->v_dd = vp;
	vp->v_ddid = 0;
	CACHE_UNLOCK();
}

/*
 * Flush all entries referencing a particular filesystem.
 *
 * Since we need to check it anyway, we will flush all the invalid
 * entries at the same time.
 */
void
cache_purgevfs(struct mount *mp)
{
	struct nchashhead *ncpp;
	struct namecache *ncp, *nnp;
	struct nchashhead mplist;

	LIST_INIT(&mplist);
	ncp = NULL;

	/* Scan hash tables for applicable entries */
	CACHE_LOCK();
	for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) {
		for (ncp = LIST_FIRST(ncpp); ncp != NULL; ncp = nnp) {
			nnp = LIST_NEXT(ncp, nc_hash);
			if (ncp->nc_dvp->v_mount == mp) {
				LIST_REMOVE(ncp, nc_hash);
				LIST_INSERT_HEAD(&mplist, ncp, nc_hash);
			}
		}
	}
	while (!LIST_EMPTY(&mplist))
		cache_zap(LIST_FIRST(&mplist));
	CACHE_UNLOCK();
}

/*
 * Perform canonical checks and cache lookup and pass on to the filesystem
 * through VOP_CACHEDLOOKUP() only if needed.
 */

int
vfs_cache_lookup(struct vop_lookup_args /* {
	struct vnode *a_dvp;
	struct vnode **a_vpp;
	struct componentname *a_cnp;
} */ *ap)
{
	struct vnode *dvp, *vp;
	int lockparent;
	int error;
	struct vnode **vpp = ap->a_vpp;
	struct componentname *cnp = ap->a_cnp;
	struct ucred *cred = cnp->cn_cred;
	int flags = cnp->cn_flags;
	struct thread *td = cnp->cn_thread;
	u_long vpid;	/* capability number of vnode */

	*vpp = NULL;
	dvp = ap->a_dvp;
	lockparent = flags & LOCKPARENT;

	if (dvp->v_type != VDIR)
		return (ENOTDIR);

	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
		return (EROFS);

	error = VOP_ACCESS(dvp, VEXEC, cred, td);

	if (error)
		return (error);

	error = cache_lookup(dvp, vpp, cnp);

#ifdef LOOKUP_SHARED
	if (!error) {
		/*
		 * We do this because the rest of the system now expects to get
		 * a shared lock, which is later upgraded if LOCKSHARED is not
		 * set.  We have so many cases here because of bugs that yield
		 * inconsistent lock states.  This all badly needs to be fixed.
		 */
		error = VOP_CACHEDLOOKUP(dvp, vpp, cnp);
		if (!error) {
			int flock;

			flock = VOP_ISLOCKED(*vpp, td);
			if (flock != LK_EXCLUSIVE) {
				if (flock == 0) {
					if ((flags & ISLASTCN) &&
					    (flags & LOCKSHARED))
						VOP_LOCK(*vpp, LK_SHARED, td);
					else
						VOP_LOCK(*vpp, LK_EXCLUSIVE, td);
				}
			} else if ((flags & ISLASTCN) && (flags & LOCKSHARED))
				VOP_LOCK(*vpp, LK_DOWNGRADE, td);
		}
		return (error);
	}
#else
	if (!error)
		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
#endif

	if (error == ENOENT)
		return (error);

	vp = *vpp;
	vpid = vp->v_id;
	cnp->cn_flags &= ~PDIRUNLOCK;
	if (dvp == vp) {   /* lookup on "." */
		VREF(vp);
		error = 0;
	} else if (flags & ISDOTDOT) {
		VOP_UNLOCK(dvp, 0, td);
		cnp->cn_flags |= PDIRUNLOCK;
#ifdef LOOKUP_SHARED
		if ((flags & ISLASTCN) && (flags & LOCKSHARED))
			error = vget(vp, LK_SHARED, td);
		else
			error = vget(vp, LK_EXCLUSIVE, td);
#else
		error = vget(vp, LK_EXCLUSIVE, td);
#endif

		if (!error && lockparent && (flags & ISLASTCN)) {
			if ((error = vn_lock(dvp, LK_EXCLUSIVE, td)) == 0)
				cnp->cn_flags &= ~PDIRUNLOCK;
		}
	} else {
#ifdef LOOKUP_SHARED
		if ((flags & ISLASTCN) && (flags & LOCKSHARED))
			error = vget(vp, LK_SHARED, td);
		else
			error = vget(vp, LK_EXCLUSIVE, td);
#else
		error = vget(vp, LK_EXCLUSIVE, td);
#endif
		if (!lockparent || error || !(flags & ISLASTCN)) {
			VOP_UNLOCK(dvp, 0, td);
			cnp->cn_flags |= PDIRUNLOCK;
		}
	}
	/*
	 * Check that the capability number did not change
	 * while we were waiting for the lock.
	 */
	if (!error) {
		if (vpid == vp->v_id)
			return (0);
		vput(vp);
		if (lockparent && dvp != vp && (flags & ISLASTCN)) {
			VOP_UNLOCK(dvp, 0, td);
			cnp->cn_flags |= PDIRUNLOCK;
		}
	}
	if (cnp->cn_flags & PDIRUNLOCK) {
		error = vn_lock(dvp, LK_EXCLUSIVE, td);
		if (error)
			return (error);
		cnp->cn_flags &= ~PDIRUNLOCK;
	}
#ifdef LOOKUP_SHARED
	error = VOP_CACHEDLOOKUP(dvp, vpp, cnp);

	if (!error) {
		int flock = 0;

		flock = VOP_ISLOCKED(*vpp, td);
		if (flock != LK_EXCLUSIVE) {
			if (flock == 0) {
				if ((flags & ISLASTCN) && (flags & LOCKSHARED))
					VOP_LOCK(*vpp, LK_SHARED, td);
				else
					VOP_LOCK(*vpp, LK_EXCLUSIVE, td);
			}
		} else if ((flags & ISLASTCN) && (flags & LOCKSHARED))
			VOP_LOCK(*vpp, LK_DOWNGRADE, td);
	}

	return (error);
#else
	return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
#endif
}

#ifndef _SYS_SYSPROTO_H_
struct  __getcwd_args {
	u_char	*buf;
	u_int	buflen;
};
#endif

/*
 * XXX All of these sysctls would probably be more productive dead.
 */
static int disablecwd;
SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
   "Disable the getcwd syscall");

/* Various statistics for the getcwd syscall */
static u_long numcwdcalls; STATNODE(CTLFLAG_RD, numcwdcalls, &numcwdcalls);
static u_long numcwdfail1; STATNODE(CTLFLAG_RD, numcwdfail1, &numcwdfail1);
static u_long numcwdfail2; STATNODE(CTLFLAG_RD, numcwdfail2, &numcwdfail2);
static u_long numcwdfail3; STATNODE(CTLFLAG_RD, numcwdfail3, &numcwdfail3);
static u_long numcwdfail4; STATNODE(CTLFLAG_RD, numcwdfail4, &numcwdfail4);
static u_long numcwdfound; STATNODE(CTLFLAG_RD, numcwdfound, &numcwdfound);

/* Implementation of the getcwd syscall */
int
__getcwd(struct thread *td, struct __getcwd_args *uap)
{

	return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen));
}

int
kern___getcwd(struct thread *td, u_char *buf, enum uio_seg bufseg, u_int buflen)
{
	char *bp, *tmpbuf;
	int error, i, slash_prefixed;
	struct filedesc *fdp;
	struct namecache *ncp;
	struct vnode *vp;

	numcwdcalls++;
	if (disablecwd)
		return (ENODEV);
	if (buflen < 2)
		return (EINVAL);
	if (buflen > MAXPATHLEN)
		buflen = MAXPATHLEN;
	mtx_lock(&Giant);
	error = 0;
	tmpbuf = bp = malloc(buflen, M_TEMP, M_WAITOK);
	bp += buflen - 1;
	*bp = '\0';
	fdp = td->td_proc->p_fd;
	slash_prefixed = 0;
	FILEDESC_LOCK(fdp);
	for (vp = fdp->fd_cdir; vp != fdp->fd_rdir && vp != rootvnode;) {
		if (vp->v_vflag & VV_ROOT) {
			if (vp->v_mount == NULL) {	/* forced unmount */
				error = EBADF;
				goto out;
			}
			vp = vp->v_mount->mnt_vnodecovered;
			continue;
		}
		if (vp->v_dd->v_id != vp->v_ddid) {
			numcwdfail1++;
			error = ENOTDIR;
			goto out;
		}
		CACHE_LOCK();
		ncp = TAILQ_FIRST(&vp->v_cache_dst);
		if (!ncp) {
			numcwdfail2++;
			CACHE_UNLOCK();
			error = ENOENT;
			goto out;
		}
		if (ncp->nc_dvp != vp->v_dd) {
			numcwdfail3++;
			CACHE_UNLOCK();
			error = EBADF;
			goto out;
		}
		for (i = ncp->nc_nlen - 1; i >= 0; i--) {
			if (bp == tmpbuf) {
				numcwdfail4++;
				CACHE_UNLOCK();
				error = ENOMEM;
				goto out;
			}
			*--bp = ncp->nc_name[i];
		}
		if (bp == tmpbuf) {
			numcwdfail4++;
			CACHE_UNLOCK();
			error = ENOMEM;
			goto out;
		}
		*--bp = '/';
		slash_prefixed = 1;
		vp = vp->v_dd;
		CACHE_UNLOCK();
	}
	if (!slash_prefixed) {
		if (bp == tmpbuf) {
			numcwdfail4++;
			error = ENOMEM;
			goto out;
		}
		*--bp = '/';
	}
	FILEDESC_UNLOCK(fdp);
	mtx_unlock(&Giant);
	numcwdfound++;
	if (bufseg == UIO_SYSSPACE)
		bcopy(bp, buf, strlen(bp) + 1);
	else
		error = copyout(bp, buf, strlen(bp) + 1);
	free(tmpbuf, M_TEMP);
	return (error);
out:
	FILEDESC_UNLOCK(fdp);
	mtx_unlock(&Giant);
	free(tmpbuf, M_TEMP);
	return (error);
}
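
/*
 * Userland sketch (informational): getcwd(3) in libc tries this syscall
 * first and only falls back to walking ".." directories if it fails:
 *
 *	char buf[MAXPATHLEN];
 *
 *	if (getcwd(buf, sizeof(buf)) != NULL)
 *		printf("%s\n", buf);
 *
 * The kernel side above assembles the path backwards, one cached name
 * component per v_dd hop.
 */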

/*
 * Thus begins the fullpath magic.
 */

#undef STATNODE
#define STATNODE(name)							\
	static u_int name;						\
	SYSCTL_UINT(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, "")

static int disablefullpath;
SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0,
	"Disable the vn_fullpath function");

STATNODE(numfullpathcalls);
STATNODE(numfullpathfail1);
STATNODE(numfullpathfail2);
STATNODE(numfullpathfail3);
STATNODE(numfullpathfail4);
STATNODE(numfullpathfound);

/*
 * Retrieve the full filesystem path that corresponds to a vnode from the
 * name cache (if available).
 */
int
vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
{
	char *bp, *buf;
	int i, slash_prefixed;
	struct filedesc *fdp;
	struct namecache *ncp;
	struct vnode *vp;

	numfullpathcalls++;
	if (disablefullpath)
		return (ENODEV);
	if (vn == NULL)
		return (EINVAL);
	buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
	bp = buf + MAXPATHLEN - 1;
	*bp = '\0';
	fdp = td->td_proc->p_fd;
	slash_prefixed = 0;
	ASSERT_VOP_LOCKED(vn, "vn_fullpath");
	FILEDESC_LOCK(fdp);
	for (vp = vn; vp != fdp->fd_rdir && vp != rootvnode;) {
		if (vp->v_vflag & VV_ROOT) {
			if (vp->v_mount == NULL) {	/* forced unmount */
				FILEDESC_UNLOCK(fdp);
				free(buf, M_TEMP);
				return (EBADF);
			}
			vp = vp->v_mount->mnt_vnodecovered;
			continue;
		}
		if (vp != vn && vp->v_dd->v_id != vp->v_ddid) {
			FILEDESC_UNLOCK(fdp);
			free(buf, M_TEMP);
			numfullpathfail1++;
			return (ENOTDIR);
		}
		CACHE_LOCK();
		ncp = TAILQ_FIRST(&vp->v_cache_dst);
		if (!ncp) {
			numfullpathfail2++;
			CACHE_UNLOCK();
			FILEDESC_UNLOCK(fdp);
			free(buf, M_TEMP);
			return (ENOENT);
		}
		if (vp != vn && ncp->nc_dvp != vp->v_dd) {
			numfullpathfail3++;
			CACHE_UNLOCK();
			FILEDESC_UNLOCK(fdp);
			free(buf, M_TEMP);
			return (EBADF);
		}
		for (i = ncp->nc_nlen - 1; i >= 0; i--) {
			if (bp == buf) {
				numfullpathfail4++;
				CACHE_UNLOCK();
				FILEDESC_UNLOCK(fdp);
				free(buf, M_TEMP);
				return (ENOMEM);
			}
			*--bp = ncp->nc_name[i];
		}
		if (bp == buf) {
			numfullpathfail4++;
			CACHE_UNLOCK();
			FILEDESC_UNLOCK(fdp);
			free(buf, M_TEMP);
			return (ENOMEM);
		}
		*--bp = '/';
		slash_prefixed = 1;
		vp = ncp->nc_dvp;
		CACHE_UNLOCK();
	}
	if (!slash_prefixed) {
		if (bp == buf) {
			numfullpathfail4++;
			FILEDESC_UNLOCK(fdp);
			free(buf, M_TEMP);
			return (ENOMEM);
		}
		*--bp = '/';
	}
	FILEDESC_UNLOCK(fdp);
	numfullpathfound++;
	*retbuf = bp;
	*freebuf = buf;
	return (0);
}
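
/*
 * Usage sketch (illustrative): callers receive two pointers and must free
 * the buffer, not the path pointer, since *retbuf points into *freebuf:
 *
 *	char *fullpath, *freebuf;
 *
 *	if (vn_fullpath(td, vp, &fullpath, &freebuf) == 0) {
 *		printf("path: %s\n", fullpath);
 *		free(freebuf, M_TEMP);
 *	}
 */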