xref: /freebsd/sys/kern/vfs_cache.c (revision 42c159fe388a3765f69860c84183700af37aca8a)
1 /*
2  * Copyright (c) 1989, 1993, 1995
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * Poul-Henning Kamp of the FreeBSD Project.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. All advertising materials mentioning features or use of this software
17  *    must display the following acknowledgement:
18  *	This product includes software developed by the University of
19  *	California, Berkeley and its contributors.
20  * 4. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
37  * $FreeBSD$
38  */
39 
40 #include <sys/param.h>
41 #include <sys/systm.h>
42 #include <sys/kernel.h>
43 #include <sys/lock.h>
44 #include <sys/mutex.h>
45 #include <sys/sysctl.h>
46 #include <sys/mount.h>
47 #include <sys/vnode.h>
48 #include <sys/namei.h>
49 #include <sys/malloc.h>
50 #include <sys/sysproto.h>
51 #include <sys/proc.h>
52 #include <sys/filedesc.h>
53 #include <sys/fnv_hash.h>
54 
/*
 * This structure describes the elements in the cache of recent
 * names looked up by namei.
 */

struct	namecache {
	LIST_ENTRY(namecache) nc_hash;	/* hash chain */
	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
	struct	vnode *nc_dvp;		/* vnode of parent of name */
	struct	vnode *nc_vp;		/* vnode the name refers to; NULL for a
					   negative (known-nonexistent) entry */
	u_char	nc_flag;		/* flag bits (NCF_*) */
	u_char	nc_nlen;		/* length of name */
	char	nc_name[0];		/* segment name; storage is allocated
					   past the end of the struct */
};
70 
71 /*
72  * Name caching works as follows:
73  *
74  * Names found by directory scans are retained in a cache
75  * for future reference.  It is managed LRU, so frequently
76  * used names will hang around.  Cache is indexed by hash value
77  * obtained from (vp, name) where vp refers to the directory
78  * containing name.
79  *
80  * If it is a "negative" entry, (i.e. for a name that is known NOT to
81  * exist) the vnode pointer will be NULL.
82  *
83  * Upon reaching the last segment of a path, if the reference
84  * is for DELETE, or NOCACHE is set (rewrite), and the
85  * name is located in the cache, it will be dropped.
86  */
87 
/*
 * Structures associated with name caching.
 */
#define NCHHASH(hash) \
	(&nchashtbl[(hash) & nchash])
static LIST_HEAD(nchashhead, namecache) *nchashtbl;	/* Hash Table */
static TAILQ_HEAD(, namecache) ncneg;	/* LRU list of negative entries */
static u_long	nchash;			/* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, "");
static u_long	ncnegfactor = 16;	/* ratio of negative entries */
SYSCTL_ULONG(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, "");
static u_long	numneg;			/* number of negative entries */
SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0, "");
static u_long	numcache;		/* number of cache entries allocated */
SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0, "");
static u_long	numcachehv;		/* number of cache entries with vnodes held */
SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0, "");
#if 0
static u_long	numcachepl;		/* number of cache purge for leaf entries */
SYSCTL_ULONG(_debug, OID_AUTO, numcachepl, CTLFLAG_RD, &numcachepl, 0, "");
#endif
struct	nchstats nchstats;		/* cache effectiveness statistics */

static int	doingcache = 1;		/* 1 => enable the cache */
SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, "");

/* Export size information to userland */
SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode), "");
SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache), "");

/*
 * The new name cache statistics
 */
SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
#define STATNODE(mode, name, var) \
	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, mode, var, 0, "");
STATNODE(CTLFLAG_RD, numneg, &numneg);
STATNODE(CTLFLAG_RD, numcache, &numcache);
static u_long numcalls; STATNODE(CTLFLAG_RD, numcalls, &numcalls);
static u_long dothits; STATNODE(CTLFLAG_RD, dothits, &dothits);
static u_long dotdothits; STATNODE(CTLFLAG_RD, dotdothits, &dotdothits);
static u_long numchecks; STATNODE(CTLFLAG_RD, numchecks, &numchecks);
static u_long nummiss; STATNODE(CTLFLAG_RD, nummiss, &nummiss);
static u_long nummisszap; STATNODE(CTLFLAG_RD, nummisszap, &nummisszap);
static u_long numposzaps; STATNODE(CTLFLAG_RD, numposzaps, &numposzaps);
static u_long numposhits; STATNODE(CTLFLAG_RD, numposhits, &numposhits);
static u_long numnegzaps; STATNODE(CTLFLAG_RD, numnegzaps, &numnegzaps);
static u_long numneghits; STATNODE(CTLFLAG_RD, numneghits, &numneghits);

SYSCTL_OPAQUE(_vfs_cache, OID_AUTO, nchstats, CTLFLAG_RD, &nchstats,
        sizeof(nchstats), "LU", "VFS cache effectiveness statistics");



static void cache_zap(struct namecache *ncp);

static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");

/*
 * Flags in namecache.nc_flag
 */
#define NCF_WHITE	1		/* entry represents a whiteout */

/*
 * Grab an atomic snapshot of the name cache hash chain lengths
 */
SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL, "hash table stats");
155 
156 static int
157 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
158 {
159 	int error;
160 	struct nchashhead *ncpp;
161 	struct namecache *ncp;
162 	int n_nchash;
163 	int count;
164 
165 	n_nchash = nchash + 1;	/* nchash is max index, not count */
166 	if (!req->oldptr)
167 		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
168 
169 	/* Scan hash tables for applicable entries */
170 	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
171 		count = 0;
172 		LIST_FOREACH(ncp, ncpp, nc_hash) {
173 			count++;
174 		}
175 		error = SYSCTL_OUT(req, (caddr_t)&count, sizeof(count));
176 		if (error)
177 			return (error);
178 	}
179 	return (0);
180 }
181 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD,
182 	0, 0, sysctl_debug_hashstat_rawnchash, "S,int", "nchash chain lengths");
183 
184 static int
185 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
186 {
187 	int error;
188 	struct nchashhead *ncpp;
189 	struct namecache *ncp;
190 	int n_nchash;
191 	int count, maxlength, used, pct;
192 
193 	if (!req->oldptr)
194 		return SYSCTL_OUT(req, 0, 4 * sizeof(int));
195 
196 	n_nchash = nchash + 1;	/* nchash is max index, not count */
197 	used = 0;
198 	maxlength = 0;
199 
200 	/* Scan hash tables for applicable entries */
201 	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
202 		count = 0;
203 		LIST_FOREACH(ncp, ncpp, nc_hash) {
204 			count++;
205 		}
206 		if (count)
207 			used++;
208 		if (maxlength < count)
209 			maxlength = count;
210 	}
211 	n_nchash = nchash + 1;
212 	pct = (used * 100 * 100) / n_nchash;
213 	error = SYSCTL_OUT(req, (caddr_t)&n_nchash, sizeof(n_nchash));
214 	if (error)
215 		return (error);
216 	error = SYSCTL_OUT(req, (caddr_t)&used, sizeof(used));
217 	if (error)
218 		return (error);
219 	error = SYSCTL_OUT(req, (caddr_t)&maxlength, sizeof(maxlength));
220 	if (error)
221 		return (error);
222 	error = SYSCTL_OUT(req, (caddr_t)&pct, sizeof(pct));
223 	if (error)
224 		return (error);
225 	return (0);
226 }
227 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD,
228 	0, 0, sysctl_debug_hashstat_nchash, "I", "nchash chain lengths");
229 
230 /*
231  * Delete an entry from its hash list and move it to the front
232  * of the LRU list for immediate reuse.
233  */
234 static void
235 cache_zap(ncp)
236 	struct namecache *ncp;
237 {
238 	LIST_REMOVE(ncp, nc_hash);
239 	LIST_REMOVE(ncp, nc_src);
240 	if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
241 		vdrop(ncp->nc_dvp);
242 		numcachehv--;
243 	}
244 	if (ncp->nc_vp) {
245 		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
246 	} else {
247 		TAILQ_REMOVE(&ncneg, ncp, nc_dst);
248 		numneg--;
249 	}
250 	numcache--;
251 	free(ncp, M_VFSCACHE);
252 }
253 
254 /*
255  * cache_leaf_test()
256  *
257  *      Test whether this (directory) vnode's namei cache entry contains
258  *      subdirectories or not.  Used to determine whether the directory is
259  *      a leaf in the namei cache or not.  Note: the directory may still
260  *      contain files in the namei cache.
261  *
262  *      Returns 0 if the directory is a leaf, -1 if it isn't.
263  */
264 int
265 cache_leaf_test(struct vnode *vp)
266 {
267 	struct namecache *ncpc;
268 
269 	for (ncpc = LIST_FIRST(&vp->v_cache_src);
270 	     ncpc != NULL;
271 	     ncpc = LIST_NEXT(ncpc, nc_src)
272 	) {
273 		if (ncpc->nc_vp != NULL && ncpc->nc_vp->v_type == VDIR)
274 			return(-1);
275 	}
276 	return(0);
277 }
278 
/*
 * Lookup an entry in the cache
 *
 * We don't do this if the segment name is long, simply so the cache
 * can avoid holding long names (which would either waste space, or
 * add greatly to the complexity).
 *
 * Lookup is called with dvp pointing to the directory to search,
 * cnp pointing to the name of the entry being sought. If the lookup
 * succeeds, the vnode is returned in *vpp, and a status of -1 is
 * returned. If the lookup determines that the name does not exist
 * (negative caching), a status of ENOENT is returned. If the lookup
 * fails, a status of zero is returned.
 */

int
cache_lookup(dvp, vpp, cnp)
	struct vnode *dvp;
	struct vnode **vpp;
	struct componentname *cnp;
{
	struct namecache *ncp;
	u_int32_t hash;

	/* Cache disabled: force callers to do a real lookup every time. */
	if (!doingcache) {
		cnp->cn_flags &= ~MAKEENTRY;
		return (0);
	}

	numcalls++;

	/* "." and ".." are resolved from the vnode itself, not the table. */
	if (cnp->cn_nameptr[0] == '.') {
		if (cnp->cn_namelen == 1) {
			*vpp = dvp;
			dothits++;
			return (-1);
		}
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
			dotdothits++;
			/*
			 * v_dd is only trusted while v_ddid still matches
			 * the parent's capability (v_id) number.
			 */
			if (dvp->v_dd->v_id != dvp->v_ddid ||
			    (cnp->cn_flags & MAKEENTRY) == 0) {
				dvp->v_ddid = 0;
				return (0);
			}
			*vpp = dvp->v_dd;
			return (-1);
		}
	}

	/* Hash over the name and the parent's capability number. */
	hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT);
	hash = fnv_32_buf(&dvp->v_id, sizeof(dvp->v_id), hash);
	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		numchecks++;
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	/* We failed to find an entry */
	if (ncp == 0) {
		if ((cnp->cn_flags & MAKEENTRY) == 0) {
			nummisszap++;
		} else {
			nummiss++;
		}
		nchstats.ncs_miss++;
		return (0);
	}

	/* We don't want to have an entry, so dump it */
	if ((cnp->cn_flags & MAKEENTRY) == 0) {
		numposzaps++;
		nchstats.ncs_badhits++;
		cache_zap(ncp);
		return (0);
	}

	/* We found a "positive" match, return the vnode */
        if (ncp->nc_vp) {
		numposhits++;
		nchstats.ncs_goodhits++;
		*vpp = ncp->nc_vp;
		return (-1);
	}

	/* We found a negative match, and want to create it, so purge */
	if (cnp->cn_nameiop == CREATE) {
		numnegzaps++;
		nchstats.ncs_badhits++;
		cache_zap(ncp);
		return (0);
	}

	numneghits++;
	/*
	 * We found a "negative" match, ENOENT notifies client of this match.
	 * The nc_flag field records whether this is a whiteout.  Move the
	 * entry to the tail of the negative LRU so it survives longest.
	 */
	TAILQ_REMOVE(&ncneg, ncp, nc_dst);
	TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
	nchstats.ncs_neghits++;
	if (ncp->nc_flag & NCF_WHITE)
		cnp->cn_flags |= ISWHITEOUT;
	return (ENOENT);
}
384 
/*
 * Add an entry to the cache.
 *
 * dvp is the directory the name was found in (or known absent from),
 * vp the resolved vnode, or NULL to record a negative entry.
 */
void
cache_enter(dvp, vp, cnp)
	struct vnode *dvp;
	struct vnode *vp;
	struct componentname *cnp;
{
	struct namecache *ncp;
	struct nchashhead *ncpp;
	u_int32_t hash;
	int len;

	if (!doingcache)
		return;

	/* "." is never cached; ".." updates the parent hint instead. */
	if (cnp->cn_nameptr[0] == '.') {
		if (cnp->cn_namelen == 1) {
			return;
		}
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
			if (vp) {
				dvp->v_dd = vp;
				dvp->v_ddid = vp->v_id;
			} else {
				dvp->v_dd = dvp;
				dvp->v_ddid = 0;
			}
			return;
		}
	}

	/*
	 * The name is stored inline past the struct (nc_name[0]).
	 * NOTE(review): nc_nlen is a u_char but cn_namelen is not range
	 * checked here — presumably callers guarantee namelen < 256;
	 * confirm against namei().
	 */
	ncp = (struct namecache *)
		malloc(sizeof *ncp + cnp->cn_namelen, M_VFSCACHE, M_WAITOK);
	bzero((char *)ncp, sizeof *ncp);
	numcache++;
	if (!vp) {
		numneg++;
		ncp->nc_flag = cnp->cn_flags & ISWHITEOUT ? NCF_WHITE : 0;
	} else if (vp->v_type == VDIR) {
		/* Record the child directory's ".." hint. */
		vp->v_dd = dvp;
		vp->v_ddid = dvp->v_id;
	}

	/*
	 * Fill in cache info, if vp is NULL this is a "negative" cache entry.
	 * For negative entries, we have to record whether it is a whiteout.
	 * The whiteout flag is stored in the nc_flag field which is
	 * otherwise unused.
	 */
	ncp->nc_vp = vp;
	ncp->nc_dvp = dvp;
	len = ncp->nc_nlen = cnp->cn_namelen;
	hash = fnv_32_buf(cnp->cn_nameptr, len, FNV1_32_INIT);
	bcopy(cnp->cn_nameptr, ncp->nc_name, len);
	hash = fnv_32_buf(&dvp->v_id, sizeof(dvp->v_id), hash);
	ncpp = NCHHASH(hash);
	LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
	/* First entry sourced from dvp: hold it so it can't be recycled. */
	if (LIST_EMPTY(&dvp->v_cache_src)) {
		vhold(dvp);
		numcachehv++;
	}
	LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
	if (vp) {
		TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
	} else {
		TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
	}
	/* Keep negative entries to at most 1/ncnegfactor of the cache. */
	if (numneg * ncnegfactor > numcache) {
		ncp = TAILQ_FIRST(&ncneg);
		cache_zap(ncp);
	}
}
459 
/*
 * Name cache initialization, from vfs_init() when we are booting.
 * Sets up the negative-entry LRU and sizes the hash table from
 * desiredvnodes; hashinit() returns the table and stores the index
 * mask in nchash (used by NCHHASH).
 */
static void
nchinit(void *dummy __unused)
{

	TAILQ_INIT(&ncneg);
	nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL)
471 
472 
473 /*
474  * Invalidate all entries to a particular vnode.
475  *
476  * Remove all entries in the namecache relating to this vnode and
477  * change the v_id.  We take the v_id from a global counter, since
478  * it becomes a handy sequence number in crash-dumps that way.
479  * No valid vnode will ever have (v_id == 0).
480  *
481  * XXX: Only time and the size of v_id prevents this from failing:
482  * XXX: In theory we should hunt down all (struct vnode*, v_id)
483  * XXX: soft references and nuke them, at least on the global
484  * XXX: v_id wraparound.  The period of resistance can be extended
485  * XXX: by incrementing each vnodes v_id individually instead of
486  * XXX: using the global v_id.
487  */
488 
489 void
490 cache_purge(vp)
491 	struct vnode *vp;
492 {
493 	static u_long nextid;
494 
495 	while (!LIST_EMPTY(&vp->v_cache_src))
496 		cache_zap(LIST_FIRST(&vp->v_cache_src));
497 	while (!TAILQ_EMPTY(&vp->v_cache_dst))
498 		cache_zap(TAILQ_FIRST(&vp->v_cache_dst));
499 
500 	do
501 		nextid++;
502 	while (nextid == vp->v_id || !nextid);
503 	vp->v_id = nextid;
504 	vp->v_dd = vp;
505 	vp->v_ddid = 0;
506 }
507 
508 /*
509  * Flush all entries referencing a particular filesystem.
510  *
511  * Since we need to check it anyway, we will flush all the invalid
512  * entries at the same time.
513  */
514 void
515 cache_purgevfs(mp)
516 	struct mount *mp;
517 {
518 	struct nchashhead *ncpp;
519 	struct namecache *ncp, *nnp;
520 
521 	/* Scan hash tables for applicable entries */
522 	for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) {
523 		for (ncp = LIST_FIRST(ncpp); ncp != 0; ncp = nnp) {
524 			nnp = LIST_NEXT(ncp, nc_hash);
525 			if (ncp->nc_dvp->v_mount == mp) {
526 				cache_zap(ncp);
527 			}
528 		}
529 	}
530 }
531 
/*
 * Perform canonical checks and cache lookup and pass on to filesystem
 * through the vop_cachedlookup only if needed.
 *
 * On a cache hit the cached vnode is re-acquired and its capability
 * number (v_id) is re-verified; if it changed while we slept on the
 * lock, we fall back to VOP_CACHEDLOOKUP.  Lock state of dvp/*vpp on
 * return follows the LOCKPARENT/ISLASTCN/PDIRUNLOCK conventions of
 * namei().
 */

int
vfs_cache_lookup(ap)
	struct vop_lookup_args /* {
		struct vnode *a_dvp;
		struct vnode **a_vpp;
		struct componentname *a_cnp;
	} */ *ap;
{
	struct vnode *dvp, *vp;
	int lockparent;
	int error;
	struct vnode **vpp = ap->a_vpp;
	struct componentname *cnp = ap->a_cnp;
	struct ucred *cred = cnp->cn_cred;
	int flags = cnp->cn_flags;
	struct thread *td = cnp->cn_thread;
	u_long vpid;	/* capability number of vnode */

	*vpp = NULL;
	dvp = ap->a_dvp;
	lockparent = flags & LOCKPARENT;

	if (dvp->v_type != VDIR)
                return (ENOTDIR);

	/* Refuse destructive ops on a read-only filesystem up front. */
	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
		return (EROFS);

	error = VOP_ACCESS(dvp, VEXEC, cred, td);

	if (error)
		return (error);

	/* 0 = miss, -1 = positive hit (*vpp set), ENOENT = negative hit. */
	error = cache_lookup(dvp, vpp, cnp);

#ifdef LOOKUP_SHARED
	if (!error) {
		/* We do this because the rest of the system now expects to get
		 * a shared lock, which is later upgraded if LOCKSHARED is not
		 * set.  We have so many cases here because of bugs that yield
		 * inconsistant lock states.  This all badly needs to be fixed
		 */
		error = VOP_CACHEDLOOKUP(dvp, vpp, cnp);
		if (!error) {
			int flock;

			flock = VOP_ISLOCKED(*vpp, td);
			if (flock != LK_EXCLUSIVE) {
				if (flock == 0) {
					if ((flags & ISLASTCN) &&
					    (flags & LOCKSHARED))
						VOP_LOCK(*vpp, LK_SHARED, td);
					else
						VOP_LOCK(*vpp, LK_EXCLUSIVE, td);
				}
			} else if ((flags & ISLASTCN) && (flags & LOCKSHARED))
				VOP_LOCK(*vpp, LK_DOWNGRADE, td);
		}
		return (error);
	}
#else
	if (!error)
		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
#endif

	/* Negative cache hit: the name is known not to exist. */
	if (error == ENOENT)
		return (error);

	/* Positive hit: lock the cached vnode, minding lock ordering. */
	vp = *vpp;
	vpid = vp->v_id;
	cnp->cn_flags &= ~PDIRUNLOCK;
	if (dvp == vp) {   /* lookup on "." */
		VREF(vp);
		error = 0;
	} else if (flags & ISDOTDOT) {
		/* Unlock child-before-parent to avoid deadlock on "..". */
		VOP_UNLOCK(dvp, 0, td);
		cnp->cn_flags |= PDIRUNLOCK;
#ifdef LOOKUP_SHARED
		if ((flags & ISLASTCN) && (flags & LOCKSHARED))
			error = vget(vp, LK_SHARED, td);
		else
			error = vget(vp, LK_EXCLUSIVE, td);
#else
		error = vget(vp, LK_EXCLUSIVE, td);
#endif

		if (!error && lockparent && (flags & ISLASTCN)) {
			if ((error = vn_lock(dvp, LK_EXCLUSIVE, td)) == 0)
				cnp->cn_flags &= ~PDIRUNLOCK;
		}
	} else {
#ifdef LOOKUP_SHARED
		if ((flags & ISLASTCN) && (flags & LOCKSHARED))
			error = vget(vp, LK_SHARED, td);
		else
			error = vget(vp, LK_EXCLUSIVE, td);
#else
		error = vget(vp, LK_EXCLUSIVE, td);
#endif
		if (!lockparent || error || !(flags & ISLASTCN)) {
			VOP_UNLOCK(dvp, 0, td);
			cnp->cn_flags |= PDIRUNLOCK;
		}
	}
	/*
	 * Check that the capability number did not change
	 * while we were waiting for the lock.
	 */
	if (!error) {
		if (vpid == vp->v_id)
			return (0);
		/* Stale entry: release it and fall through to a real lookup. */
		vput(vp);
		if (lockparent && dvp != vp && (flags & ISLASTCN)) {
			VOP_UNLOCK(dvp, 0, td);
			cnp->cn_flags |= PDIRUNLOCK;
		}
	}
	/* Re-lock the parent before retrying the lookup. */
	if (cnp->cn_flags & PDIRUNLOCK) {
		error = vn_lock(dvp, LK_EXCLUSIVE, td);
		if (error)
			return (error);
		cnp->cn_flags &= ~PDIRUNLOCK;
	}
#ifdef LOOKUP_SHARED
	error = VOP_CACHEDLOOKUP(dvp, vpp, cnp);

	if (!error) {
		int flock = 0;

		flock = VOP_ISLOCKED(*vpp, td);
		if (flock != LK_EXCLUSIVE) {
			if (flock == 0) {
				if ((flags & ISLASTCN) && (flags & LOCKSHARED))
					VOP_LOCK(*vpp, LK_SHARED, td);
				else
					VOP_LOCK(*vpp, LK_EXCLUSIVE, td);
			}
		} else if ((flags & ISLASTCN) && (flags & LOCKSHARED))
			VOP_LOCK(*vpp, LK_DOWNGRADE, td);
	}

	return (error);
#else
	return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
#endif
}
684 
685 
#ifndef _SYS_SYSPROTO_H_
struct  __getcwd_args {
	u_char	*buf;		/* user buffer for the path */
	u_int	buflen;		/* size of that buffer */
};
#endif

/*
 * XXX All of these sysctls would probably be more productive dead.
 */
static int disablecwd;
SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
   "Disable the getcwd syscall");

/* Various statistics for the getcwd syscall */
static u_long numcwdcalls; STATNODE(CTLFLAG_RD, numcwdcalls, &numcwdcalls);
static u_long numcwdfail1; STATNODE(CTLFLAG_RD, numcwdfail1, &numcwdfail1);	/* stale ".." hint */
static u_long numcwdfail2; STATNODE(CTLFLAG_RD, numcwdfail2, &numcwdfail2);	/* no cache entry */
static u_long numcwdfail3; STATNODE(CTLFLAG_RD, numcwdfail3, &numcwdfail3);	/* entry/parent mismatch */
static u_long numcwdfail4; STATNODE(CTLFLAG_RD, numcwdfail4, &numcwdfail4);	/* buffer exhausted */
static u_long numcwdfound; STATNODE(CTLFLAG_RD, numcwdfound, &numcwdfound);
707 
/*
 * Implementation of the getcwd syscall: build the current directory's
 * path purely from the name cache by walking ".." hints (v_dd/v_ddid)
 * from fd_cdir up to the root, filling the path buffer backwards.
 * Fails with ENOENT/ENOTDIR/EBADF when the cache can't supply a name.
 */
int
__getcwd(td, uap)
	struct thread *td;
	struct __getcwd_args *uap;
{
	char *bp, *buf;
	int error, i, slash_prefixed;
	struct filedesc *fdp;
	struct namecache *ncp;
	struct vnode *vp;

	numcwdcalls++;
	if (disablecwd)
		return (ENODEV);
	if (uap->buflen < 2)
		return (EINVAL);
	if (uap->buflen > MAXPATHLEN)
		uap->buflen = MAXPATHLEN;
	/* Build the path right-to-left, starting at the terminator. */
	buf = bp = malloc(uap->buflen, M_TEMP, M_WAITOK);
	bp += uap->buflen - 1;
	*bp = '\0';
	fdp = td->td_proc->p_fd;
	slash_prefixed = 0;
	FILEDESC_LOCK(fdp);
	for (vp = fdp->fd_cdir; vp != fdp->fd_rdir && vp != rootvnode;) {
		/* Crossing a mount point: continue from the covered vnode. */
		if (vp->v_flag & VROOT) {
			if (vp->v_mount == NULL) {	/* forced unmount */
				FILEDESC_UNLOCK(fdp);
				free(buf, M_TEMP);
				return (EBADF);
			}
			vp = vp->v_mount->mnt_vnodecovered;
			continue;
		}
		/* The ".." hint is only valid while v_ddid matches. */
		if (vp->v_dd->v_id != vp->v_ddid) {
			FILEDESC_UNLOCK(fdp);
			numcwdfail1++;
			free(buf, M_TEMP);
			return (ENOTDIR);
		}
		ncp = TAILQ_FIRST(&vp->v_cache_dst);
		if (!ncp) {
			FILEDESC_UNLOCK(fdp);
			numcwdfail2++;
			free(buf, M_TEMP);
			return (ENOENT);
		}
		/* The cached name must come from the same parent as v_dd. */
		if (ncp->nc_dvp != vp->v_dd) {
			FILEDESC_UNLOCK(fdp);
			numcwdfail3++;
			free(buf, M_TEMP);
			return (EBADF);
		}
		/* Copy the component in, backwards. */
		for (i = ncp->nc_nlen - 1; i >= 0; i--) {
			if (bp == buf) {
				FILEDESC_UNLOCK(fdp);
				numcwdfail4++;
				free(buf, M_TEMP);
				return (ENOMEM);
			}
			*--bp = ncp->nc_name[i];
		}
		if (bp == buf) {
			FILEDESC_UNLOCK(fdp);
			numcwdfail4++;
			free(buf, M_TEMP);
			return (ENOMEM);
		}
		*--bp = '/';
		slash_prefixed = 1;
		vp = vp->v_dd;
	}
	FILEDESC_UNLOCK(fdp);
	/* cwd was the root itself: the path is just "/". */
	if (!slash_prefixed) {
		if (bp == buf) {
			numcwdfail4++;
			free(buf, M_TEMP);
			return (ENOMEM);
		}
		*--bp = '/';
	}
	numcwdfound++;
	error = copyout(bp, uap->buf, strlen(bp) + 1);
	free(buf, M_TEMP);
	return (error);
}
795 
/*
 * Thus begins the fullpath magic.
 */

/* Redefine STATNODE for the simpler one-argument form used below. */
#undef STATNODE
#define STATNODE(name)							\
	static u_int name;						\
	SYSCTL_UINT(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, "")

static int disablefullpath;
SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0,
	"Disable the vn_fullpath function");

STATNODE(numfullpathcalls);
STATNODE(numfullpathfail1);	/* stale ".." hint */
STATNODE(numfullpathfail2);	/* no cache entry for vnode */
STATNODE(numfullpathfail3);	/* entry/parent mismatch */
STATNODE(numfullpathfail4);	/* buffer exhausted */
STATNODE(numfullpathfound);
815 
/*
 * Retrieve the full filesystem path that corresponds to a vnode from
 * the name cache (if available).  Works like __getcwd() but starts at
 * an arbitrary vnode and relaxes the ".." consistency checks for the
 * starting vnode itself (which need not be a directory).
 *
 * On success *retbuf points at the path (inside the allocation
 * returned via *freebuf, which the caller must free with M_TEMP).
 */
int
vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
{
	char *bp, *buf;
	int i, slash_prefixed;
	struct filedesc *fdp;
	struct namecache *ncp;
	struct vnode *vp;

	numfullpathcalls++;
	if (disablefullpath)
		return (ENODEV);
	if (vn == NULL)
		return (EINVAL);
	/* Build the path right-to-left, starting at the terminator. */
	buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
	bp = buf + MAXPATHLEN - 1;
	*bp = '\0';
	fdp = td->td_proc->p_fd;
	slash_prefixed = 0;
	FILEDESC_LOCK(fdp);
	for (vp = vn; vp != fdp->fd_rdir && vp != rootvnode;) {
		/* Crossing a mount point: continue from the covered vnode. */
		if (vp->v_flag & VROOT) {
			if (vp->v_mount == NULL) {	/* forced unmount */
				FILEDESC_UNLOCK(fdp);
				free(buf, M_TEMP);
				return (EBADF);
			}
			vp = vp->v_mount->mnt_vnodecovered;
			continue;
		}
		/* Skip the ".." hint check for the starting vnode itself. */
		if (vp != vn && vp->v_dd->v_id != vp->v_ddid) {
			FILEDESC_UNLOCK(fdp);
			numfullpathfail1++;
			free(buf, M_TEMP);
			return (ENOTDIR);
		}
		ncp = TAILQ_FIRST(&vp->v_cache_dst);
		if (!ncp) {
			FILEDESC_UNLOCK(fdp);
			numfullpathfail2++;
			free(buf, M_TEMP);
			return (ENOENT);
		}
		if (vp != vn && ncp->nc_dvp != vp->v_dd) {
			FILEDESC_UNLOCK(fdp);
			numfullpathfail3++;
			free(buf, M_TEMP);
			return (EBADF);
		}
		/* Copy the component in, backwards. */
		for (i = ncp->nc_nlen - 1; i >= 0; i--) {
			if (bp == buf) {
				FILEDESC_UNLOCK(fdp);
				numfullpathfail4++;
				free(buf, M_TEMP);
				return (ENOMEM);
			}
			*--bp = ncp->nc_name[i];
		}
		if (bp == buf) {
			FILEDESC_UNLOCK(fdp);
			numfullpathfail4++;
			free(buf, M_TEMP);
			return (ENOMEM);
		}
		*--bp = '/';
		slash_prefixed = 1;
		/* Ascend via the cache entry's parent, not v_dd. */
		vp = ncp->nc_dvp;
	}
	/* vn was the root itself: the path is just "/". */
	if (!slash_prefixed) {
		if (bp == buf) {
			FILEDESC_UNLOCK(fdp);
			numfullpathfail4++;
			free(buf, M_TEMP);
			return (ENOMEM);
		}
		*--bp = '/';
	}
	FILEDESC_UNLOCK(fdp);
	numfullpathfound++;
	*retbuf = bp;
	*freebuf = buf;
	return (0);
}
903