xref: /freebsd/sys/kern/vfs_cache.c (revision 70fe064ad7cab6c0444b91622f60ec6a462f308a)
1 /*
2  * Copyright (c) 1989, 1993, 1995
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * Poul-Henning Kamp of the FreeBSD Project.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. All advertising materials mentioning features or use of this software
17  *    must display the following acknowledgement:
18  *	This product includes software developed by the University of
19  *	California, Berkeley and its contributors.
20  * 4. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
37  * $FreeBSD$
38  */
39 
40 #include <sys/param.h>
41 #include <sys/systm.h>
42 #include <sys/kernel.h>
43 #include <sys/lock.h>
44 #include <sys/mutex.h>
45 #include <sys/sysctl.h>
46 #include <sys/mount.h>
47 #include <sys/vnode.h>
48 #include <sys/namei.h>
49 #include <sys/malloc.h>
50 #include <sys/sysproto.h>
51 #include <sys/proc.h>
52 #include <sys/filedesc.h>
53 #include <sys/fnv_hash.h>
54 
55 /*
56  * This structure describes the elements in the cache of recent
57  * names looked up by namei.
58  */
59 
struct	namecache {
	LIST_ENTRY(namecache) nc_hash;	/* hash chain */
	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
	struct	vnode *nc_dvp;		/* vnode of parent of name */
	struct	vnode *nc_vp;		/* vnode the name refers to; NULL for a negative entry */
	u_char	nc_flag;		/* flag bits (NCF_*) */
	u_char	nc_nlen;		/* length of name */
	char	nc_name[0];		/* segment name, allocated past the end of the struct */
};
70 
71 /*
72  * Name caching works as follows:
73  *
74  * Names found by directory scans are retained in a cache
75  * for future reference.  It is managed LRU, so frequently
76  * used names will hang around.  Cache is indexed by hash value
77  * obtained from (vp, name) where vp refers to the directory
78  * containing name.
79  *
80  * If it is a "negative" entry, (i.e. for a name that is known NOT to
81  * exist) the vnode pointer will be NULL.
82  *
83  * Upon reaching the last segment of a path, if the reference
84  * is for DELETE, or NOCACHE is set (rewrite), and the
85  * name is located in the cache, it will be dropped.
86  */
87 
/*
 * Structures associated with name cacheing.
 */
#define NCHHASH(hash) \
	(&nchashtbl[(hash) & nchash])
static LIST_HEAD(nchashhead, namecache) *nchashtbl;	/* Hash Table */
static TAILQ_HEAD(, namecache) ncneg;	/* LRU list of negative entries */
static u_long	nchash;			/* hash table mask (size - 1) */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, "");
static u_long	ncnegfactor = 16;	/* ratio of negative entries */
SYSCTL_ULONG(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, "");
static u_long	numneg;		/* number of negative entries */
SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0, "");
static u_long	numcache;		/* number of cache entries allocated */
SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0, "");
static u_long	numcachehv;		/* number of cache entries with vnodes held */
SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0, "");
#if 0
static u_long	numcachepl;		/* number of cache purge for leaf entries */
SYSCTL_ULONG(_debug, OID_AUTO, numcachepl, CTLFLAG_RD, &numcachepl, 0, "");
#endif
struct	nchstats nchstats;		/* cache effectiveness statistics */

static int	doingcache = 1;		/* 1 => enable the cache */
SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, "");
SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode), "");
SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache), "");

/*
 * The new name cache statistics
 */
SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
/* Declare a u_long counter exported read-only under vfs.cache.<name>. */
#define STATNODE(mode, name, var) \
	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, mode, var, 0, "");
STATNODE(CTLFLAG_RD, numneg, &numneg);
STATNODE(CTLFLAG_RD, numcache, &numcache);
static u_long numcalls; STATNODE(CTLFLAG_RD, numcalls, &numcalls);
static u_long dothits; STATNODE(CTLFLAG_RD, dothits, &dothits);
static u_long dotdothits; STATNODE(CTLFLAG_RD, dotdothits, &dotdothits);
static u_long numchecks; STATNODE(CTLFLAG_RD, numchecks, &numchecks);
static u_long nummiss; STATNODE(CTLFLAG_RD, nummiss, &nummiss);
static u_long nummisszap; STATNODE(CTLFLAG_RD, nummisszap, &nummisszap);
static u_long numposzaps; STATNODE(CTLFLAG_RD, numposzaps, &numposzaps);
static u_long numposhits; STATNODE(CTLFLAG_RD, numposhits, &numposhits);
static u_long numnegzaps; STATNODE(CTLFLAG_RD, numnegzaps, &numnegzaps);
static u_long numneghits; STATNODE(CTLFLAG_RD, numneghits, &numneghits);

SYSCTL_OPAQUE(_vfs_cache, OID_AUTO, nchstats, CTLFLAG_RD, &nchstats,
        sizeof(nchstats), "LU", "VFS cache effectiveness statistics");



static void cache_zap __P((struct namecache *ncp));

static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");

/*
 * Flags in namecache.nc_flag
 */
#define NCF_WHITE	1	/* name represents a whiteout */

/*
 * Grab an atomic snapshot of the name cache hash chain lengths
 */
SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL, "hash table stats");
153 
154 static int
155 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
156 {
157 	int error;
158 	struct nchashhead *ncpp;
159 	struct namecache *ncp;
160 	int n_nchash;
161 	int count;
162 
163 	n_nchash = nchash + 1;	/* nchash is max index, not count */
164 	if (!req->oldptr)
165 		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
166 
167 	/* Scan hash tables for applicable entries */
168 	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
169 		count = 0;
170 		LIST_FOREACH(ncp, ncpp, nc_hash) {
171 			count++;
172 		}
173 		error = SYSCTL_OUT(req, (caddr_t)&count, sizeof(count));
174 		if (error)
175 			return (error);
176 	}
177 	return (0);
178 }
179 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD,
180 	0, 0, sysctl_debug_hashstat_rawnchash, "S,int", "nchash chain lengths");
181 
182 static int
183 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
184 {
185 	int error;
186 	struct nchashhead *ncpp;
187 	struct namecache *ncp;
188 	int n_nchash;
189 	int count, maxlength, used, pct;
190 
191 	if (!req->oldptr)
192 		return SYSCTL_OUT(req, 0, 4 * sizeof(int));
193 
194 	n_nchash = nchash + 1;	/* nchash is max index, not count */
195 	used = 0;
196 	maxlength = 0;
197 
198 	/* Scan hash tables for applicable entries */
199 	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
200 		count = 0;
201 		LIST_FOREACH(ncp, ncpp, nc_hash) {
202 			count++;
203 		}
204 		if (count)
205 			used++;
206 		if (maxlength < count)
207 			maxlength = count;
208 	}
209 	n_nchash = nchash + 1;
210 	pct = (used * 100 * 100) / n_nchash;
211 	error = SYSCTL_OUT(req, (caddr_t)&n_nchash, sizeof(n_nchash));
212 	if (error)
213 		return (error);
214 	error = SYSCTL_OUT(req, (caddr_t)&used, sizeof(used));
215 	if (error)
216 		return (error);
217 	error = SYSCTL_OUT(req, (caddr_t)&maxlength, sizeof(maxlength));
218 	if (error)
219 		return (error);
220 	error = SYSCTL_OUT(req, (caddr_t)&pct, sizeof(pct));
221 	if (error)
222 		return (error);
223 	return (0);
224 }
225 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD,
226 	0, 0, sysctl_debug_hashstat_nchash, "I", "nchash chain lengths");
227 
228 /*
229  * Delete an entry from its hash list and move it to the front
230  * of the LRU list for immediate reuse.
231  */
232 static void
233 cache_zap(ncp)
234 	struct namecache *ncp;
235 {
236 	LIST_REMOVE(ncp, nc_hash);
237 	LIST_REMOVE(ncp, nc_src);
238 	if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
239 		vdrop(ncp->nc_dvp);
240 		numcachehv--;
241 	}
242 	if (ncp->nc_vp) {
243 		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
244 	} else {
245 		TAILQ_REMOVE(&ncneg, ncp, nc_dst);
246 		numneg--;
247 	}
248 	numcache--;
249 	free(ncp, M_VFSCACHE);
250 }
251 
252 /*
253  * cache_leaf_test()
254  *
255  *      Test whether this (directory) vnode's namei cache entry contains
256  *      subdirectories or not.  Used to determine whether the directory is
257  *      a leaf in the namei cache or not.  Note: the directory may still
258  *      contain files in the namei cache.
259  *
260  *      Returns 0 if the directory is a leaf, -1 if it isn't.
261  */
262 int
263 cache_leaf_test(struct vnode *vp)
264 {
265 	struct namecache *ncpc;
266 
267 	for (ncpc = LIST_FIRST(&vp->v_cache_src);
268 	     ncpc != NULL;
269 	     ncpc = LIST_NEXT(ncpc, nc_src)
270 	) {
271 		if (ncpc->nc_vp != NULL && ncpc->nc_vp->v_type == VDIR)
272 			return(-1);
273 	}
274 	return(0);
275 }
276 
/*
 * Lookup an entry in the cache
 *
 * We don't do this if the segment name is long, simply so the cache
 * can avoid holding long names (which would either waste space, or
 * add greatly to the complexity).
 *
 * Lookup is called with dvp pointing to the directory to search,
 * cnp pointing to the name of the entry being sought. If the lookup
 * succeeds, the vnode is returned in *vpp, and a status of -1 is
 * returned. If the lookup determines that the name does not exist
 * (negative cacheing), a status of ENOENT is returned. If the lookup
 * fails, a status of zero is returned.
 */

int
cache_lookup(dvp, vpp, cnp)
	struct vnode *dvp;
	struct vnode **vpp;
	struct componentname *cnp;
{
	struct namecache *ncp;
	u_int32_t hash;

	/* Cache disabled: clear MAKEENTRY so the caller does a real lookup. */
	if (!doingcache) {
		cnp->cn_flags &= ~MAKEENTRY;
		return (0);
	}

	numcalls++;

	/* "." and ".." are resolved without touching the hash table. */
	if (cnp->cn_nameptr[0] == '.') {
		if (cnp->cn_namelen == 1) {
			*vpp = dvp;
			dothits++;
			return (-1);
		}
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
			dotdothits++;
			/*
			 * ".." comes from the v_dd/v_ddid soft reference;
			 * a v_ddid that no longer matches v_dd's v_id means
			 * the reference is stale and must not be used.
			 */
			if (dvp->v_dd->v_id != dvp->v_ddid ||
			    (cnp->cn_flags & MAKEENTRY) == 0) {
				dvp->v_ddid = 0;
				return (0);
			}
			*vpp = dvp->v_dd;
			return (-1);
		}
	}

	/* Hash covers the component name and the parent's v_id. */
	hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT);
	hash = fnv_32_buf(&dvp->v_id, sizeof(dvp->v_id), hash);
	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		numchecks++;
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	/* We failed to find an entry */
	if (ncp == 0) {
		if ((cnp->cn_flags & MAKEENTRY) == 0) {
			nummisszap++;
		} else {
			nummiss++;
		}
		nchstats.ncs_miss++;
		return (0);
	}

	/* We don't want to have an entry, so dump it */
	if ((cnp->cn_flags & MAKEENTRY) == 0) {
		numposzaps++;
		nchstats.ncs_badhits++;
		cache_zap(ncp);
		return (0);
	}

	/* We found a "positive" match, return the vnode */
        if (ncp->nc_vp) {
		numposhits++;
		nchstats.ncs_goodhits++;
		*vpp = ncp->nc_vp;
		return (-1);
	}

	/* We found a negative match, and want to create it, so purge */
	if (cnp->cn_nameiop == CREATE) {
		numnegzaps++;
		nchstats.ncs_badhits++;
		cache_zap(ncp);
		return (0);
	}

	numneghits++;
	/*
	 * We found a "negative" match, ENOENT notifies client of this match.
	 * The nc_flag field records whether this is a whiteout.  Requeue the
	 * entry at the tail of the LRU so recently hit negative entries are
	 * the last to be reclaimed.
	 */
	TAILQ_REMOVE(&ncneg, ncp, nc_dst);
	TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
	nchstats.ncs_neghits++;
	if (ncp->nc_flag & NCF_WHITE)
		cnp->cn_flags |= ISWHITEOUT;
	return (ENOENT);
}
382 
/*
 * Add an entry to the cache.
 */
void
cache_enter(dvp, vp, cnp)
	struct vnode *dvp;
	struct vnode *vp;
	struct componentname *cnp;
{
	struct namecache *ncp;
	struct nchashhead *ncpp;
	u_int32_t hash;
	int len;

	if (!doingcache)
		return;

	/* "." is never cached; ".." is kept in dvp's v_dd/v_ddid fields. */
	if (cnp->cn_nameptr[0] == '.') {
		if (cnp->cn_namelen == 1) {
			return;
		}
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
			if (vp) {
				dvp->v_dd = vp;
				dvp->v_ddid = vp->v_id;
			} else {
				dvp->v_dd = dvp;
				dvp->v_ddid = 0;
			}
			return;
		}
	}

	/* The name is stored inline after the struct (nc_name[0]). */
	ncp = (struct namecache *)
		malloc(sizeof *ncp + cnp->cn_namelen, M_VFSCACHE, M_WAITOK);
	bzero((char *)ncp, sizeof *ncp);
	numcache++;
	if (!vp) {
		numneg++;
		ncp->nc_flag = cnp->cn_flags & ISWHITEOUT ? NCF_WHITE : 0;
	} else if (vp->v_type == VDIR) {
		/* Record the ".." soft reference in the child directory. */
		vp->v_dd = dvp;
		vp->v_ddid = dvp->v_id;
	}

	/*
	 * Fill in cache info, if vp is NULL this is a "negative" cache entry.
	 * For negative entries, we have to record whether it is a whiteout.
	 * The whiteout flag is stored in the nc_flag field which is
	 * otherwise unused.
	 */
	ncp->nc_vp = vp;
	ncp->nc_dvp = dvp;
	len = ncp->nc_nlen = cnp->cn_namelen;
	hash = fnv_32_buf(cnp->cn_nameptr, len, FNV1_32_INIT);
	bcopy(cnp->cn_nameptr, ncp->nc_name, len);
	hash = fnv_32_buf(&dvp->v_id, sizeof(dvp->v_id), hash);
	ncpp = NCHHASH(hash);
	LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
	/* The first entry sourced from dvp takes a hold on it. */
	if (LIST_EMPTY(&dvp->v_cache_src)) {
		vhold(dvp);
		numcachehv++;
	}
	LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
	if (vp) {
		TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
	} else {
		TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
	}
	/* Keep negative entries to at most 1/ncnegfactor of the cache. */
	if (numneg * ncnegfactor > numcache) {
		ncp = TAILQ_FIRST(&ncneg);
		cache_zap(ncp);
	}
}
457 
458 /*
459  * Name cache initialization, from vfs_init() when we are booting
460  */
461 static void
462 nchinit(void *dummy __unused)
463 {
464 
465 	TAILQ_INIT(&ncneg);
466 	nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
467 }
468 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL)
469 
470 
471 /*
472  * Invalidate all entries to a particular vnode.
473  *
474  * Remove all entries in the namecache relating to this vnode and
475  * change the v_id.  We take the v_id from a global counter, since
476  * it becomes a handy sequence number in crash-dumps that way.
477  * No valid vnode will ever have (v_id == 0).
478  *
479  * XXX: Only time and the size of v_id prevents this from failing:
480  * XXX: In theory we should hunt down all (struct vnode*, v_id)
481  * XXX: soft references and nuke them, at least on the global
482  * XXX: v_id wraparound.  The period of resistance can be extended
483  * XXX: by incrementing each vnodes v_id individually instead of
484  * XXX: using the global v_id.
485  */
486 
487 void
488 cache_purge(vp)
489 	struct vnode *vp;
490 {
491 	static u_long nextid;
492 
493 	while (!LIST_EMPTY(&vp->v_cache_src))
494 		cache_zap(LIST_FIRST(&vp->v_cache_src));
495 	while (!TAILQ_EMPTY(&vp->v_cache_dst))
496 		cache_zap(TAILQ_FIRST(&vp->v_cache_dst));
497 
498 	do
499 		nextid++;
500 	while (nextid == vp->v_id || !nextid);
501 	vp->v_id = nextid;
502 	vp->v_dd = vp;
503 	vp->v_ddid = 0;
504 }
505 
506 /*
507  * Flush all entries referencing a particular filesystem.
508  *
509  * Since we need to check it anyway, we will flush all the invalid
510  * entries at the same time.
511  */
512 void
513 cache_purgevfs(mp)
514 	struct mount *mp;
515 {
516 	struct nchashhead *ncpp;
517 	struct namecache *ncp, *nnp;
518 
519 	/* Scan hash tables for applicable entries */
520 	for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) {
521 		for (ncp = LIST_FIRST(ncpp); ncp != 0; ncp = nnp) {
522 			nnp = LIST_NEXT(ncp, nc_hash);
523 			if (ncp->nc_dvp->v_mount == mp) {
524 				cache_zap(ncp);
525 			}
526 		}
527 	}
528 }
529 
#if 0

/*
 * Flush all directory entries with no child directories held in
 * the cache.
 *
 * Since we need to check it anyway, we will flush all the invalid
 * entries at the same time.
 *
 * NOTE: this function is compiled out (#if 0) and has no callers in
 * this file.
 */
void
cache_purgeleafdirs(ndir)
	int ndir;
{
	struct nchashhead *ncpp;
	struct namecache *ncp, *nnp, *ncpc, *nnpc;
	struct vnode *dvp;

	/* Scan hash tables for applicable entries */
	for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl && ndir > 0; ncpp--) {
		for (ncp = LIST_FIRST(ncpp); ncp != 0 && ndir > 0; ncp = nnp) {
			nnp = LIST_NEXT(ncp, nc_hash);
			if (ncp->nc_dvp != 0) {
				/*
				 * Skip over if nc_dvp of this cache holds
				 * a child directory, or the hold count of
				 * nc_dvp is greater than 1 (in which case
				 * nc_dvp is likely to be the working
				 * directory of a process).
				 */
				if (ncp->nc_dvp->v_holdcnt > 1)
					continue;
				for (ncpc = LIST_FIRST(&ncp->nc_dvp->v_cache_src);
				     ncpc != 0; ncpc = nnpc) {
					nnpc = LIST_NEXT(ncpc, nc_src);
					if (ncpc->nc_vp != 0 && ncpc->nc_vp->v_type == VDIR)
						break;
				}
				if (ncpc == 0) {
					/*
					 * Zap all of this directory's children,
					 * held in ncp->nc_dvp->v_cache_src.
					 */
					dvp = ncp->nc_dvp;
					while (!LIST_EMPTY(&dvp->v_cache_src))
						cache_zap(LIST_FIRST(&dvp->v_cache_src));

					ndir--;

					/* Restart in case where nnp is reclaimed. */
					nnp = LIST_FIRST(ncpp);
					continue;
				}
			}
		}
	}
	numcachepl++;
}

#endif
589 
/*
 * Perform canonical checks and cache lookup and pass on to filesystem
 * through the vop_cachedlookup only if needed.
 */

int
vfs_cache_lookup(ap)
	struct vop_lookup_args /* {
		struct vnode *a_dvp;
		struct vnode **a_vpp;
		struct componentname *a_cnp;
	} */ *ap;
{
	struct vnode *dvp, *vp;
	int lockparent;
	int error;
	struct vnode **vpp = ap->a_vpp;
	struct componentname *cnp = ap->a_cnp;
	struct ucred *cred = cnp->cn_cred;
	int flags = cnp->cn_flags;
	struct thread *td = cnp->cn_thread;
	u_long vpid;	/* capability number of vnode */

	*vpp = NULL;
	dvp = ap->a_dvp;
	lockparent = flags & LOCKPARENT;

	if (dvp->v_type != VDIR)
                return (ENOTDIR);

	/* Modifying lookups are refused outright on read-only mounts. */
	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
		return (EROFS);

	error = VOP_ACCESS(dvp, VEXEC, cred, td);

	if (error)
		return (error);

	error = cache_lookup(dvp, vpp, cnp);

	/* Cache miss (0): let the filesystem do the real lookup. */
	if (!error)
		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));

	/* Negative cache hit. */
	if (error == ENOENT)
		return (error);

	/* Positive hit (-1): lock the result per the namei protocol. */
	vp = *vpp;
	vpid = vp->v_id;
	cnp->cn_flags &= ~PDIRUNLOCK;
	if (dvp == vp) {   /* lookup on "." */
		VREF(vp);
		error = 0;
	} else if (flags & ISDOTDOT) {
		/* For "..", unlock the child-side parent first. */
		VOP_UNLOCK(dvp, 0, td);
		cnp->cn_flags |= PDIRUNLOCK;
		error = vget(vp, LK_EXCLUSIVE, td);
		if (!error && lockparent && (flags & ISLASTCN)) {
			if ((error = vn_lock(dvp, LK_EXCLUSIVE, td)) == 0)
				cnp->cn_flags &= ~PDIRUNLOCK;
		}
	} else {
		error = vget(vp, LK_EXCLUSIVE, td);
		if (!lockparent || error || !(flags & ISLASTCN)) {
			VOP_UNLOCK(dvp, 0, td);
			cnp->cn_flags |= PDIRUNLOCK;
		}
	}
	/*
	 * Check that the capability number did not change
	 * while we were waiting for the lock.
	 */
	if (!error) {
		if (vpid == vp->v_id)
			return (0);
		/* Stale entry: drop it and retry through the filesystem. */
		vput(vp);
		if (lockparent && dvp != vp && (flags & ISLASTCN)) {
			VOP_UNLOCK(dvp, 0, td);
			cnp->cn_flags |= PDIRUNLOCK;
		}
	}
	/* Relock the parent before falling back to VOP_CACHEDLOOKUP. */
	if (cnp->cn_flags & PDIRUNLOCK) {
		error = vn_lock(dvp, LK_EXCLUSIVE, td);
		if (error)
			return (error);
		cnp->cn_flags &= ~PDIRUNLOCK;
	}
	return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
}
679 
680 
#ifndef _SYS_SYSPROTO_H_
/* Argument layout for the __getcwd() syscall when sysproto.h is absent. */
struct  __getcwd_args {
	u_char	*buf;
	u_int	buflen;
};
#endif

static int disablecwd;		/* debug knob: make __getcwd() fail with ENODEV */
SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0, "");

static u_long numcwdcalls; STATNODE(CTLFLAG_RD, numcwdcalls, &numcwdcalls);
static u_long numcwdfail1; STATNODE(CTLFLAG_RD, numcwdfail1, &numcwdfail1);
static u_long numcwdfail2; STATNODE(CTLFLAG_RD, numcwdfail2, &numcwdfail2);
static u_long numcwdfail3; STATNODE(CTLFLAG_RD, numcwdfail3, &numcwdfail3);
static u_long numcwdfail4; STATNODE(CTLFLAG_RD, numcwdfail4, &numcwdfail4);
static u_long numcwdfound; STATNODE(CTLFLAG_RD, numcwdfound, &numcwdfound);
/*
 * __getcwd() system call: reconstruct the path of the current working
 * directory from the name cache and copy it out to uap->buf.  Fails
 * (ENOENT/ENOTDIR/EBADF) if any component along the way cannot be
 * resolved from cached entries, and ENOMEM if the path does not fit.
 */
int
__getcwd(td, uap)
	struct thread *td;
	struct __getcwd_args *uap;
{
	char *bp, *buf;
	int error, i, slash_prefixed;
	struct filedesc *fdp;
	struct namecache *ncp;
	struct vnode *vp;

	numcwdcalls++;
	if (disablecwd)
		return (ENODEV);
	if (uap->buflen < 2)
		return (EINVAL);
	if (uap->buflen > MAXPATHLEN)
		uap->buflen = MAXPATHLEN;
	/* The path is built backwards from the end of the buffer. */
	buf = bp = malloc(uap->buflen, M_TEMP, M_WAITOK);
	bp += uap->buflen - 1;
	*bp = '\0';
	fdp = td->td_proc->p_fd;
	slash_prefixed = 0;
	FILEDESC_LOCK(fdp);
	for (vp = fdp->fd_cdir; vp != fdp->fd_rdir && vp != rootvnode;) {
		if (vp->v_flag & VROOT) {
			if (vp->v_mount == NULL) {	/* forced unmount */
				FILEDESC_UNLOCK(fdp);
				free(buf, M_TEMP);
				return (EBADF);
			}
			/* At a mount point, step down to the covered vnode. */
			vp = vp->v_mount->mnt_vnodecovered;
			continue;
		}
		/* The ".." soft reference (v_dd/v_ddid) must still be valid. */
		if (vp->v_dd->v_id != vp->v_ddid) {
			FILEDESC_UNLOCK(fdp);
			numcwdfail1++;
			free(buf, M_TEMP);
			return (ENOTDIR);
		}
		ncp = TAILQ_FIRST(&vp->v_cache_dst);
		if (!ncp) {
			FILEDESC_UNLOCK(fdp);
			numcwdfail2++;
			free(buf, M_TEMP);
			return (ENOENT);
		}
		/* The cached name must agree with the ".." reference. */
		if (ncp->nc_dvp != vp->v_dd) {
			FILEDESC_UNLOCK(fdp);
			numcwdfail3++;
			free(buf, M_TEMP);
			return (EBADF);
		}
		/* Prepend this component's name, then a '/'. */
		for (i = ncp->nc_nlen - 1; i >= 0; i--) {
			if (bp == buf) {
				FILEDESC_UNLOCK(fdp);
				numcwdfail4++;
				free(buf, M_TEMP);
				return (ENOMEM);
			}
			*--bp = ncp->nc_name[i];
		}
		if (bp == buf) {
			FILEDESC_UNLOCK(fdp);
			numcwdfail4++;
			free(buf, M_TEMP);
			return (ENOMEM);
		}
		*--bp = '/';
		slash_prefixed = 1;
		vp = vp->v_dd;
	}
	FILEDESC_UNLOCK(fdp);
	/* cwd was the root itself: the result is just "/". */
	if (!slash_prefixed) {
		if (bp == buf) {
			numcwdfail4++;
			free(buf, M_TEMP);
			return (ENOMEM);
		}
		*--bp = '/';
	}
	numcwdfound++;
	error = copyout(bp, uap->buf, strlen(bp) + 1);
	free(buf, M_TEMP);
	return (error);
}
783 
/*
 * Thus begins the fullpath magic.
 */

#undef STATNODE
/* Redefined: declare a u_int counter exported read-only under vfs.cache. */
#define STATNODE(name)							\
	static u_int name;						\
	SYSCTL_UINT(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, "")

static int disablefullpath;	/* debug knob: make vn_fullpath() fail with ENODEV */
SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW,
    &disablefullpath, 0, "");

STATNODE(numfullpathcalls);
STATNODE(numfullpathfail1);
STATNODE(numfullpathfail2);
STATNODE(numfullpathfail3);
STATNODE(numfullpathfail4);
STATNODE(numfullpathfound);
803 
/*
 * Reconstruct the full path of the vnode 'vn' from the name cache,
 * walking parent links up to the process root.  On success, *retbuf
 * points at the path (inside an allocated buffer) and *freebuf at the
 * buffer itself, which the caller must free with free(*freebuf, M_TEMP).
 */
int
vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
{
	char *bp, *buf;
	int i, slash_prefixed;
	struct filedesc *fdp;
	struct namecache *ncp;
	struct vnode *vp;

	numfullpathcalls++;
	if (disablefullpath)
		return (ENODEV);
	if (vn == NULL)
		return (EINVAL);
	/* The path is built backwards from the end of the buffer. */
	buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
	bp = buf + MAXPATHLEN - 1;
	*bp = '\0';
	fdp = td->td_proc->p_fd;
	slash_prefixed = 0;
	FILEDESC_LOCK(fdp);
	for (vp = vn; vp != fdp->fd_rdir && vp != rootvnode;) {
		if (vp->v_flag & VROOT) {
			if (vp->v_mount == NULL) {	/* forced unmount */
				FILEDESC_UNLOCK(fdp);
				free(buf, M_TEMP);
				return (EBADF);
			}
			/* At a mount point, step down to the covered vnode. */
			vp = vp->v_mount->mnt_vnodecovered;
			continue;
		}
		/* Past the starting vnode, the ".." reference must be valid. */
		if (vp != vn && vp->v_dd->v_id != vp->v_ddid) {
			FILEDESC_UNLOCK(fdp);
			numfullpathfail1++;
			free(buf, M_TEMP);
			return (ENOTDIR);
		}
		ncp = TAILQ_FIRST(&vp->v_cache_dst);
		if (!ncp) {
			FILEDESC_UNLOCK(fdp);
			numfullpathfail2++;
			free(buf, M_TEMP);
			return (ENOENT);
		}
		/* The cached name must agree with the ".." reference. */
		if (vp != vn && ncp->nc_dvp != vp->v_dd) {
			FILEDESC_UNLOCK(fdp);
			numfullpathfail3++;
			free(buf, M_TEMP);
			return (EBADF);
		}
		/* Prepend this component's name, then a '/'. */
		for (i = ncp->nc_nlen - 1; i >= 0; i--) {
			if (bp == buf) {
				FILEDESC_UNLOCK(fdp);
				numfullpathfail4++;
				free(buf, M_TEMP);
				return (ENOMEM);
			}
			*--bp = ncp->nc_name[i];
		}
		if (bp == buf) {
			FILEDESC_UNLOCK(fdp);
			numfullpathfail4++;
			free(buf, M_TEMP);
			return (ENOMEM);
		}
		*--bp = '/';
		slash_prefixed = 1;
		vp = ncp->nc_dvp;
	}
	/* vn was the root itself: the result is just "/". */
	if (!slash_prefixed) {
		if (bp == buf) {
			FILEDESC_UNLOCK(fdp);
			numfullpathfail4++;
			free(buf, M_TEMP);
			return (ENOMEM);
		}
		*--bp = '/';
	}
	FILEDESC_UNLOCK(fdp);
	numfullpathfound++;
	*retbuf = bp;
	*freebuf = buf;
	return (0);
}
887