xref: /freebsd/sys/kern/vfs_cache.c (revision 17d6c636720d00f77e5d098daf4c278f89d84f7b)
1 /*
2  * Copyright (c) 1989, 1993, 1995
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * Poul-Henning Kamp of the FreeBSD Project.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. All advertising materials mentioning features or use of this software
17  *    must display the following acknowledgement:
18  *	This product includes software developed by the University of
19  *	California, Berkeley and its contributors.
20  * 4. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
37  * $FreeBSD$
38  */
39 
40 #include <sys/param.h>
41 #include <sys/systm.h>
42 #include <sys/kernel.h>
43 #include <sys/lock.h>
44 #include <sys/sysctl.h>
45 #include <sys/mount.h>
46 #include <sys/vnode.h>
47 #include <sys/namei.h>
48 #include <sys/malloc.h>
49 #include <sys/sysproto.h>
50 #include <sys/proc.h>
51 #include <sys/filedesc.h>
52 #include <sys/fnv_hash.h>
53 
54 /*
55  * This structure describes the elements in the cache of recent
56  * names looked up by namei.
57  */
58 
59 struct	namecache {
60 	LIST_ENTRY(namecache) nc_hash;	/* hash chain */
61 	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
62 	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
63 	struct	vnode *nc_dvp;		/* vnode of parent of name */
64 	struct	vnode *nc_vp;		/* vnode the name refers to */
65 	u_char	nc_flag;		/* flag bits */
66 	u_char	nc_nlen;		/* length of name */
67 	char	nc_name[0];		/* segment name */
68 };
69 
70 /*
71  * Name caching works as follows:
72  *
73  * Names found by directory scans are retained in a cache
74  * for future reference.  It is managed LRU, so frequently
75  * used names will hang around.  Cache is indexed by hash value
76  * obtained from (vp, name) where vp refers to the directory
77  * containing name.
78  *
79  * If it is a "negative" entry, (i.e. for a name that is known NOT to
80  * exist) the vnode pointer will be NULL.
81  *
82  * Upon reaching the last segment of a path, if the reference
83  * is for DELETE, or NOCACHE is set (rewrite), and the
84  * name is located in the cache, it will be dropped.
85  */
86 
87 /*
88  * Structures associated with name cacheing.
89  */
90 #define NCHHASH(hash) \
91 	(&nchashtbl[(hash) & nchash])
92 static LIST_HEAD(nchashhead, namecache) *nchashtbl;	/* Hash Table */
93 static TAILQ_HEAD(, namecache) ncneg;	/* Hash Table */
94 static u_long	nchash;			/* size of hash table */
95 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, "");
96 static u_long	ncnegfactor = 16;	/* ratio of negative entries */
97 SYSCTL_ULONG(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, "");
98 static u_long	numneg;		/* number of cache entries allocated */
99 SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0, "");
100 static u_long	numcache;		/* number of cache entries allocated */
101 SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0, "");
102 static u_long	numcachehv;		/* number of cache entries with vnodes held */
103 SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0, "");
104 #if 0
105 static u_long	numcachepl;		/* number of cache purge for leaf entries */
106 SYSCTL_ULONG(_debug, OID_AUTO, numcachepl, CTLFLAG_RD, &numcachepl, 0, "");
107 #endif
108 struct	nchstats nchstats;		/* cache effectiveness statistics */
109 
110 static int	doingcache = 1;		/* 1 => enable the cache */
111 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, "");
112 SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode), "");
113 SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache), "");
114 
115 /*
116  * The new name cache statistics
117  */
118 SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
119 #define STATNODE(mode, name, var) \
120 	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, mode, var, 0, "");
121 STATNODE(CTLFLAG_RD, numneg, &numneg);
122 STATNODE(CTLFLAG_RD, numcache, &numcache);
123 static u_long numcalls; STATNODE(CTLFLAG_RD, numcalls, &numcalls);
124 static u_long dothits; STATNODE(CTLFLAG_RD, dothits, &dothits);
125 static u_long dotdothits; STATNODE(CTLFLAG_RD, dotdothits, &dotdothits);
126 static u_long numchecks; STATNODE(CTLFLAG_RD, numchecks, &numchecks);
127 static u_long nummiss; STATNODE(CTLFLAG_RD, nummiss, &nummiss);
128 static u_long nummisszap; STATNODE(CTLFLAG_RD, nummisszap, &nummisszap);
129 static u_long numposzaps; STATNODE(CTLFLAG_RD, numposzaps, &numposzaps);
130 static u_long numposhits; STATNODE(CTLFLAG_RD, numposhits, &numposhits);
131 static u_long numnegzaps; STATNODE(CTLFLAG_RD, numnegzaps, &numnegzaps);
132 static u_long numneghits; STATNODE(CTLFLAG_RD, numneghits, &numneghits);
133 
134 SYSCTL_OPAQUE(_vfs_cache, OID_AUTO, nchstats, CTLFLAG_RD, &nchstats,
135         sizeof(nchstats), "LU", "VFS cache effectiveness statistics");
136 
137 
138 
139 static void cache_zap __P((struct namecache *ncp));
140 
141 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
142 
143 /*
144  * Flags in namecache.nc_flag
145  */
146 #define NCF_WHITE	1
147 
148 /*
149  * Grab an atomic snapshot of the name cache hash chain lengths
150  */
151 SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL, "hash table stats");
152 
153 static int
154 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
155 {
156 	int error;
157 	struct nchashhead *ncpp;
158 	struct namecache *ncp;
159 	int n_nchash;
160 	int count;
161 
162 	n_nchash = nchash + 1;	/* nchash is max index, not count */
163 	if (!req->oldptr)
164 		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
165 
166 	/* Scan hash tables for applicable entries */
167 	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
168 		count = 0;
169 		LIST_FOREACH(ncp, ncpp, nc_hash) {
170 			count++;
171 		}
172 		error = SYSCTL_OUT(req, (caddr_t)&count, sizeof(count));
173 		if (error)
174 			return (error);
175 	}
176 	return (0);
177 }
178 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD,
179 	0, 0, sysctl_debug_hashstat_rawnchash, "S,int", "nchash chain lengths");
180 
181 static int
182 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
183 {
184 	int error;
185 	struct nchashhead *ncpp;
186 	struct namecache *ncp;
187 	int n_nchash;
188 	int count, maxlength, used, pct;
189 
190 	if (!req->oldptr)
191 		return SYSCTL_OUT(req, 0, 4 * sizeof(int));
192 
193 	n_nchash = nchash + 1;	/* nchash is max index, not count */
194 	used = 0;
195 	maxlength = 0;
196 
197 	/* Scan hash tables for applicable entries */
198 	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
199 		count = 0;
200 		LIST_FOREACH(ncp, ncpp, nc_hash) {
201 			count++;
202 		}
203 		if (count)
204 			used++;
205 		if (maxlength < count)
206 			maxlength = count;
207 	}
208 	n_nchash = nchash + 1;
209 	pct = (used * 100 * 100) / n_nchash;
210 	error = SYSCTL_OUT(req, (caddr_t)&n_nchash, sizeof(n_nchash));
211 	if (error)
212 		return (error);
213 	error = SYSCTL_OUT(req, (caddr_t)&used, sizeof(used));
214 	if (error)
215 		return (error);
216 	error = SYSCTL_OUT(req, (caddr_t)&maxlength, sizeof(maxlength));
217 	if (error)
218 		return (error);
219 	error = SYSCTL_OUT(req, (caddr_t)&pct, sizeof(pct));
220 	if (error)
221 		return (error);
222 	return (0);
223 }
224 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD,
225 	0, 0, sysctl_debug_hashstat_nchash, "I", "nchash chain lengths");
226 
227 /*
228  * Delete an entry from its hash list and move it to the front
229  * of the LRU list for immediate reuse.
230  */
231 static void
232 cache_zap(ncp)
233 	struct namecache *ncp;
234 {
235 	LIST_REMOVE(ncp, nc_hash);
236 	LIST_REMOVE(ncp, nc_src);
237 	if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
238 		vdrop(ncp->nc_dvp);
239 		numcachehv--;
240 	}
241 	if (ncp->nc_vp) {
242 		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
243 	} else {
244 		TAILQ_REMOVE(&ncneg, ncp, nc_dst);
245 		numneg--;
246 	}
247 	numcache--;
248 	free(ncp, M_VFSCACHE);
249 }
250 
251 /*
252  * cache_leaf_test()
253  *
254  *      Test whether this (directory) vnode's namei cache entry contains
255  *      subdirectories or not.  Used to determine whether the directory is
256  *      a leaf in the namei cache or not.  Note: the directory may still
257  *      contain files in the namei cache.
258  *
259  *      Returns 0 if the directory is a leaf, -1 if it isn't.
260  */
261 int
262 cache_leaf_test(struct vnode *vp)
263 {
264 	struct namecache *ncpc;
265 
266 	for (ncpc = LIST_FIRST(&vp->v_cache_src);
267 	     ncpc != NULL;
268 	     ncpc = LIST_NEXT(ncpc, nc_src)
269 	) {
270 		if (ncpc->nc_vp != NULL && ncpc->nc_vp->v_type == VDIR)
271 			return(-1);
272 	}
273 	return(0);
274 }
275 
276 /*
277  * Lookup an entry in the cache
278  *
279  * We don't do this if the segment name is long, simply so the cache
280  * can avoid holding long names (which would either waste space, or
281  * add greatly to the complexity).
282  *
283  * Lookup is called with dvp pointing to the directory to search,
284  * cnp pointing to the name of the entry being sought. If the lookup
285  * succeeds, the vnode is returned in *vpp, and a status of -1 is
286  * returned. If the lookup determines that the name does not exist
287  * (negative cacheing), a status of ENOENT is returned. If the lookup
288  * fails, a status of zero is returned.
289  */
290 
291 int
292 cache_lookup(dvp, vpp, cnp)
293 	struct vnode *dvp;
294 	struct vnode **vpp;
295 	struct componentname *cnp;
296 {
297 	struct namecache *ncp;
298 	u_int32_t hash;
299 
300 	if (!doingcache) {
301 		cnp->cn_flags &= ~MAKEENTRY;
302 		return (0);
303 	}
304 
305 	numcalls++;
306 
307 	if (cnp->cn_nameptr[0] == '.') {
308 		if (cnp->cn_namelen == 1) {
309 			*vpp = dvp;
310 			dothits++;
311 			return (-1);
312 		}
313 		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
314 			dotdothits++;
315 			if (dvp->v_dd->v_id != dvp->v_ddid ||
316 			    (cnp->cn_flags & MAKEENTRY) == 0) {
317 				dvp->v_ddid = 0;
318 				return (0);
319 			}
320 			*vpp = dvp->v_dd;
321 			return (-1);
322 		}
323 	}
324 
325 	hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT);
326 	hash = fnv_32_buf(&dvp->v_id, sizeof(dvp->v_id), hash);
327 	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
328 		numchecks++;
329 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
330 		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
331 			break;
332 	}
333 
334 	/* We failed to find an entry */
335 	if (ncp == 0) {
336 		if ((cnp->cn_flags & MAKEENTRY) == 0) {
337 			nummisszap++;
338 		} else {
339 			nummiss++;
340 		}
341 		nchstats.ncs_miss++;
342 		return (0);
343 	}
344 
345 	/* We don't want to have an entry, so dump it */
346 	if ((cnp->cn_flags & MAKEENTRY) == 0) {
347 		numposzaps++;
348 		nchstats.ncs_badhits++;
349 		cache_zap(ncp);
350 		return (0);
351 	}
352 
353 	/* We found a "positive" match, return the vnode */
354         if (ncp->nc_vp) {
355 		numposhits++;
356 		nchstats.ncs_goodhits++;
357 		*vpp = ncp->nc_vp;
358 		return (-1);
359 	}
360 
361 	/* We found a negative match, and want to create it, so purge */
362 	if (cnp->cn_nameiop == CREATE) {
363 		numnegzaps++;
364 		nchstats.ncs_badhits++;
365 		cache_zap(ncp);
366 		return (0);
367 	}
368 
369 	numneghits++;
370 	/*
371 	 * We found a "negative" match, ENOENT notifies client of this match.
372 	 * The nc_vpid field records whether this is a whiteout.
373 	 */
374 	TAILQ_REMOVE(&ncneg, ncp, nc_dst);
375 	TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
376 	nchstats.ncs_neghits++;
377 	if (ncp->nc_flag & NCF_WHITE)
378 		cnp->cn_flags |= ISWHITEOUT;
379 	return (ENOENT);
380 }
381 
382 /*
383  * Add an entry to the cache.
384  */
385 void
386 cache_enter(dvp, vp, cnp)
387 	struct vnode *dvp;
388 	struct vnode *vp;
389 	struct componentname *cnp;
390 {
391 	struct namecache *ncp;
392 	struct nchashhead *ncpp;
393 	u_int32_t hash;
394 	int len;
395 
396 	if (!doingcache)
397 		return;
398 
399 	if (cnp->cn_nameptr[0] == '.') {
400 		if (cnp->cn_namelen == 1) {
401 			return;
402 		}
403 		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
404 			if (vp) {
405 				dvp->v_dd = vp;
406 				dvp->v_ddid = vp->v_id;
407 			} else {
408 				dvp->v_dd = dvp;
409 				dvp->v_ddid = 0;
410 			}
411 			return;
412 		}
413 	}
414 
415 	ncp = (struct namecache *)
416 		malloc(sizeof *ncp + cnp->cn_namelen, M_VFSCACHE, M_WAITOK);
417 	bzero((char *)ncp, sizeof *ncp);
418 	numcache++;
419 	if (!vp) {
420 		numneg++;
421 		ncp->nc_flag = cnp->cn_flags & ISWHITEOUT ? NCF_WHITE : 0;
422 	} else if (vp->v_type == VDIR) {
423 		vp->v_dd = dvp;
424 		vp->v_ddid = dvp->v_id;
425 	}
426 
427 	/*
428 	 * Fill in cache info, if vp is NULL this is a "negative" cache entry.
429 	 * For negative entries, we have to record whether it is a whiteout.
430 	 * the whiteout flag is stored in the nc_vpid field which is
431 	 * otherwise unused.
432 	 */
433 	ncp->nc_vp = vp;
434 	ncp->nc_dvp = dvp;
435 	len = ncp->nc_nlen = cnp->cn_namelen;
436 	hash = fnv_32_buf(cnp->cn_nameptr, len, FNV1_32_INIT);
437 	bcopy(cnp->cn_nameptr, ncp->nc_name, len);
438 	hash = fnv_32_buf(&dvp->v_id, sizeof(dvp->v_id), hash);
439 	ncpp = NCHHASH(hash);
440 	LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
441 	if (LIST_EMPTY(&dvp->v_cache_src)) {
442 		vhold(dvp);
443 		numcachehv++;
444 	}
445 	LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
446 	if (vp) {
447 		TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
448 	} else {
449 		TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
450 	}
451 	if (numneg * ncnegfactor > numcache) {
452 		ncp = TAILQ_FIRST(&ncneg);
453 		cache_zap(ncp);
454 	}
455 }
456 
457 /*
458  * Name cache initialization, from vfs_init() when we are booting
459  */
460 static void
461 nchinit(void *dummy __unused)
462 {
463 
464 	TAILQ_INIT(&ncneg);
465 	nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
466 }
467 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL)
468 
469 
470 /*
471  * Invalidate all entries to a particular vnode.
472  *
473  * Remove all entries in the namecache relating to this vnode and
474  * change the v_id.  We take the v_id from a global counter, since
475  * it becomes a handy sequence number in crash-dumps that way.
476  * No valid vnode will ever have (v_id == 0).
477  *
478  * XXX: Only time and the size of v_id prevents this from failing:
479  * XXX: In theory we should hunt down all (struct vnode*, v_id)
480  * XXX: soft references and nuke them, at least on the global
481  * XXX: v_id wraparound.  The period of resistance can be extended
482  * XXX: by incrementing each vnodes v_id individually instead of
483  * XXX: using the global v_id.
484  */
485 
486 void
487 cache_purge(vp)
488 	struct vnode *vp;
489 {
490 	static u_long nextid;
491 
492 	while (!LIST_EMPTY(&vp->v_cache_src))
493 		cache_zap(LIST_FIRST(&vp->v_cache_src));
494 	while (!TAILQ_EMPTY(&vp->v_cache_dst))
495 		cache_zap(TAILQ_FIRST(&vp->v_cache_dst));
496 
497 	do
498 		nextid++;
499 	while (nextid == vp->v_id || !nextid);
500 	vp->v_id = nextid;
501 	vp->v_dd = vp;
502 	vp->v_ddid = 0;
503 }
504 
505 /*
506  * Flush all entries referencing a particular filesystem.
507  *
508  * Since we need to check it anyway, we will flush all the invalid
509  * entries at the same time.
510  */
511 void
512 cache_purgevfs(mp)
513 	struct mount *mp;
514 {
515 	struct nchashhead *ncpp;
516 	struct namecache *ncp, *nnp;
517 
518 	/* Scan hash tables for applicable entries */
519 	for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) {
520 		for (ncp = LIST_FIRST(ncpp); ncp != 0; ncp = nnp) {
521 			nnp = LIST_NEXT(ncp, nc_hash);
522 			if (ncp->nc_dvp->v_mount == mp) {
523 				cache_zap(ncp);
524 			}
525 		}
526 	}
527 }
528 
#if 0

/*
 * Flush the cached names of up to ndir directories that have no child
 * directories held in the cache.
 *
 * This code is compiled out.
 */
void
cache_purgeleafdirs(int ndir)
{
	struct nchashhead *ncpp;
	struct namecache *ncp, *nnp, *child;
	struct vnode *dvp;

	/* Scan hash tables for applicable entries */
	for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl && ndir > 0; ncpp--) {
		for (ncp = LIST_FIRST(ncpp); ncp != NULL && ndir > 0; ncp = nnp) {
			nnp = LIST_NEXT(ncp, nc_hash);
			dvp = ncp->nc_dvp;
			if (dvp == NULL)
				continue;
			/*
			 * Skip the directory if its hold count is greater
			 * than 1 (in which case it is likely to be the
			 * working directory of a process) ...
			 */
			if (dvp->v_holdcnt > 1)
				continue;
			/* ... or if it has a child directory in the cache. */
			LIST_FOREACH(child, &dvp->v_cache_src, nc_src) {
				if (child->nc_vp != NULL &&
				    child->nc_vp->v_type == VDIR)
					break;
			}
			if (child != NULL)
				continue;

			/* Zap all of this directory's cached children. */
			while (!LIST_EMPTY(&dvp->v_cache_src))
				cache_zap(LIST_FIRST(&dvp->v_cache_src));

			ndir--;

			/* Restart in case where nnp is reclaimed. */
			nnp = LIST_FIRST(ncpp);
		}
	}
	numcachepl++;
}

#endif
588 
589 /*
590  * Perform canonical checks and cache lookup and pass on to filesystem
591  * through the vop_cachedlookup only if needed.
592  */
593 
594 int
595 vfs_cache_lookup(ap)
596 	struct vop_lookup_args /* {
597 		struct vnode *a_dvp;
598 		struct vnode **a_vpp;
599 		struct componentname *a_cnp;
600 	} */ *ap;
601 {
602 	struct vnode *dvp, *vp;
603 	int lockparent;
604 	int error;
605 	struct vnode **vpp = ap->a_vpp;
606 	struct componentname *cnp = ap->a_cnp;
607 	struct ucred *cred = cnp->cn_cred;
608 	int flags = cnp->cn_flags;
609 	struct thread *td = cnp->cn_thread;
610 	u_long vpid;	/* capability number of vnode */
611 
612 	*vpp = NULL;
613 	dvp = ap->a_dvp;
614 	lockparent = flags & LOCKPARENT;
615 
616 	if (dvp->v_type != VDIR)
617                 return (ENOTDIR);
618 
619 	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
620 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
621 		return (EROFS);
622 
623 	error = VOP_ACCESS(dvp, VEXEC, cred, td);
624 
625 	if (error)
626 		return (error);
627 
628 	error = cache_lookup(dvp, vpp, cnp);
629 
630 	if (!error)
631 		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
632 
633 	if (error == ENOENT)
634 		return (error);
635 
636 	vp = *vpp;
637 	vpid = vp->v_id;
638 	cnp->cn_flags &= ~PDIRUNLOCK;
639 	if (dvp == vp) {   /* lookup on "." */
640 		VREF(vp);
641 		error = 0;
642 	} else if (flags & ISDOTDOT) {
643 		VOP_UNLOCK(dvp, 0, td);
644 		cnp->cn_flags |= PDIRUNLOCK;
645 		error = vget(vp, LK_EXCLUSIVE, td);
646 		if (!error && lockparent && (flags & ISLASTCN)) {
647 			if ((error = vn_lock(dvp, LK_EXCLUSIVE, td)) == 0)
648 				cnp->cn_flags &= ~PDIRUNLOCK;
649 		}
650 	} else {
651 		error = vget(vp, LK_EXCLUSIVE, td);
652 		if (!lockparent || error || !(flags & ISLASTCN)) {
653 			VOP_UNLOCK(dvp, 0, td);
654 			cnp->cn_flags |= PDIRUNLOCK;
655 		}
656 	}
657 	/*
658 	 * Check that the capability number did not change
659 	 * while we were waiting for the lock.
660 	 */
661 	if (!error) {
662 		if (vpid == vp->v_id)
663 			return (0);
664 		vput(vp);
665 		if (lockparent && dvp != vp && (flags & ISLASTCN)) {
666 			VOP_UNLOCK(dvp, 0, td);
667 			cnp->cn_flags |= PDIRUNLOCK;
668 		}
669 	}
670 	if (cnp->cn_flags & PDIRUNLOCK) {
671 		error = vn_lock(dvp, LK_EXCLUSIVE, td);
672 		if (error)
673 			return (error);
674 		cnp->cn_flags &= ~PDIRUNLOCK;
675 	}
676 	return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
677 }
678 
679 
680 #ifndef _SYS_SYSPROTO_H_
681 struct  __getcwd_args {
682 	u_char	*buf;
683 	u_int	buflen;
684 };
685 #endif
686 
687 static int disablecwd;
688 SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0, "");
689 
690 static u_long numcwdcalls; STATNODE(CTLFLAG_RD, numcwdcalls, &numcwdcalls);
691 static u_long numcwdfail1; STATNODE(CTLFLAG_RD, numcwdfail1, &numcwdfail1);
692 static u_long numcwdfail2; STATNODE(CTLFLAG_RD, numcwdfail2, &numcwdfail2);
693 static u_long numcwdfail3; STATNODE(CTLFLAG_RD, numcwdfail3, &numcwdfail3);
694 static u_long numcwdfail4; STATNODE(CTLFLAG_RD, numcwdfail4, &numcwdfail4);
695 static u_long numcwdfound; STATNODE(CTLFLAG_RD, numcwdfound, &numcwdfound);
696 int
697 __getcwd(td, uap)
698 	struct thread *td;
699 	struct __getcwd_args *uap;
700 {
701 	char *bp, *buf;
702 	int error, i, slash_prefixed;
703 	struct filedesc *fdp;
704 	struct namecache *ncp;
705 	struct vnode *vp;
706 
707 	numcwdcalls++;
708 	if (disablecwd)
709 		return (ENODEV);
710 	if (uap->buflen < 2)
711 		return (EINVAL);
712 	if (uap->buflen > MAXPATHLEN)
713 		uap->buflen = MAXPATHLEN;
714 	buf = bp = malloc(uap->buflen, M_TEMP, M_WAITOK);
715 	bp += uap->buflen - 1;
716 	*bp = '\0';
717 	fdp = td->td_proc->p_fd;
718 	slash_prefixed = 0;
719 	for (vp = fdp->fd_cdir; vp != fdp->fd_rdir && vp != rootvnode;) {
720 		if (vp->v_flag & VROOT) {
721 			if (vp->v_mount == NULL) {	/* forced unmount */
722 				free(buf, M_TEMP);
723 				return (EBADF);
724 			}
725 			vp = vp->v_mount->mnt_vnodecovered;
726 			continue;
727 		}
728 		if (vp->v_dd->v_id != vp->v_ddid) {
729 			numcwdfail1++;
730 			free(buf, M_TEMP);
731 			return (ENOTDIR);
732 		}
733 		ncp = TAILQ_FIRST(&vp->v_cache_dst);
734 		if (!ncp) {
735 			numcwdfail2++;
736 			free(buf, M_TEMP);
737 			return (ENOENT);
738 		}
739 		if (ncp->nc_dvp != vp->v_dd) {
740 			numcwdfail3++;
741 			free(buf, M_TEMP);
742 			return (EBADF);
743 		}
744 		for (i = ncp->nc_nlen - 1; i >= 0; i--) {
745 			if (bp == buf) {
746 				numcwdfail4++;
747 				free(buf, M_TEMP);
748 				return (ENOMEM);
749 			}
750 			*--bp = ncp->nc_name[i];
751 		}
752 		if (bp == buf) {
753 			numcwdfail4++;
754 			free(buf, M_TEMP);
755 			return (ENOMEM);
756 		}
757 		*--bp = '/';
758 		slash_prefixed = 1;
759 		vp = vp->v_dd;
760 	}
761 	if (!slash_prefixed) {
762 		if (bp == buf) {
763 			numcwdfail4++;
764 			free(buf, M_TEMP);
765 			return (ENOMEM);
766 		}
767 		*--bp = '/';
768 	}
769 	numcwdfound++;
770 	error = copyout(bp, uap->buf, strlen(bp) + 1);
771 	free(buf, M_TEMP);
772 	return (error);
773 }
774 
775 /*
776  * Thus begins the fullpath magic.
777  */
778 
779 #undef STATNODE
780 #define STATNODE(name)							\
781 	static u_int name;						\
782 	SYSCTL_UINT(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, "")
783 
784 static int disablefullpath;
785 SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW,
786     &disablefullpath, 0, "");
787 
788 STATNODE(numfullpathcalls);
789 STATNODE(numfullpathfail1);
790 STATNODE(numfullpathfail2);
791 STATNODE(numfullpathfail3);
792 STATNODE(numfullpathfail4);
793 STATNODE(numfullpathfound);
794 
795 int
796 vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
797 {
798 	char *bp, *buf;
799 	int i, slash_prefixed;
800 	struct filedesc *fdp;
801 	struct namecache *ncp;
802 	struct vnode *vp;
803 
804 	numfullpathcalls++;
805 	if (disablefullpath)
806 		return (ENODEV);
807 	if (vn == NULL)
808 		return (EINVAL);
809 	buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
810 	bp = buf + MAXPATHLEN - 1;
811 	*bp = '\0';
812 	fdp = td->td_proc->p_fd;
813 	slash_prefixed = 0;
814 	for (vp = vn; vp != fdp->fd_rdir && vp != rootvnode;) {
815 		if (vp->v_flag & VROOT) {
816 			if (vp->v_mount == NULL) {	/* forced unmount */
817 				free(buf, M_TEMP);
818 				return (EBADF);
819 			}
820 			vp = vp->v_mount->mnt_vnodecovered;
821 			continue;
822 		}
823 		if (vp != vn && vp->v_dd->v_id != vp->v_ddid) {
824 			numfullpathfail1++;
825 			free(buf, M_TEMP);
826 			return (ENOTDIR);
827 		}
828 		ncp = TAILQ_FIRST(&vp->v_cache_dst);
829 		if (!ncp) {
830 			numfullpathfail2++;
831 			free(buf, M_TEMP);
832 			return (ENOENT);
833 		}
834 		if (vp != vn && ncp->nc_dvp != vp->v_dd) {
835 			numfullpathfail3++;
836 			free(buf, M_TEMP);
837 			return (EBADF);
838 		}
839 		for (i = ncp->nc_nlen - 1; i >= 0; i--) {
840 			if (bp == buf) {
841 				numfullpathfail4++;
842 				free(buf, M_TEMP);
843 				return (ENOMEM);
844 			}
845 			*--bp = ncp->nc_name[i];
846 		}
847 		if (bp == buf) {
848 			numfullpathfail4++;
849 			free(buf, M_TEMP);
850 			return (ENOMEM);
851 		}
852 		*--bp = '/';
853 		slash_prefixed = 1;
854 		vp = ncp->nc_dvp;
855 	}
856 	if (!slash_prefixed) {
857 		if (bp == buf) {
858 			numfullpathfail4++;
859 			free(buf, M_TEMP);
860 			return (ENOMEM);
861 		}
862 		*--bp = '/';
863 	}
864 	numfullpathfound++;
865 	*retbuf = bp;
866 	*freebuf = buf;
867 	return (0);
868 }
869