/*
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Poul-Henning Kamp of the FreeBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
 * $FreeBSD$
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/sysctl.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/namei.h>
#include <sys/malloc.h>
#include <sys/sysproto.h>
#include <sys/proc.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>

/*
 * This structure describes the elements in the cache of recent
 * names looked up by namei.
 */

struct	namecache {
	LIST_ENTRY(namecache) nc_hash;	/* hash chain */
	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
	struct	vnode *nc_dvp;		/* vnode of parent of name */
	struct	vnode *nc_vp;		/* vnode the name refers to */
	u_char	nc_flag;		/* flag bits */
	u_char	nc_nlen;		/* length of name */
	char	nc_name[0];		/* segment name */
};

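/*
 * Entries are variable-sized: cache_enter() below allocates
 * sizeof(struct namecache) + namelen bytes, so the component name is
 * stored in-line in nc_name.  A sketch of the allocation, mirroring
 * the code in cache_enter():
 *
 *	ncp = malloc(sizeof *ncp + cnp->cn_namelen, M_VFSCACHE, M_WAITOK);
 *	bcopy(cnp->cn_nameptr, ncp->nc_name, cnp->cn_namelen);
 */
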
/*
 * Name caching works as follows:
 *
 * Names found by directory scans are retained in a cache
 * for future reference.  It is managed LRU, so frequently
 * used names will hang around.  The cache is indexed by a hash value
 * obtained from (vp, name), where vp refers to the directory
 * containing name.
 *
 * If it is a "negative" entry (i.e. for a name that is known NOT to
 * exist), the vnode pointer will be NULL.
 *
 * Upon reaching the last segment of a path, if the reference
 * is for DELETE, or NOCACHE is set (rewrite), and the
 * name is located in the cache, it will be dropped.
 */
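
/*
 * A sketch of the index computation (this mirrors the code in
 * cache_lookup() and cache_enter() below):
 *
 *	hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT);
 *	hash = fnv_32_buf(&dvp->v_id, sizeof(dvp->v_id), hash);
 *	ncpp = NCHHASH(hash);		(i.e. &nchashtbl[hash & nchash])
 */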

/*
 * Structures associated with name caching.
 */
#define NCHHASH(hash) \
	(&nchashtbl[(hash) & nchash])
static LIST_HEAD(nchashhead, namecache) *nchashtbl;	/* Hash Table */
static TAILQ_HEAD(, namecache) ncneg;	/* LRU list of negative entries */
static u_long	nchash;			/* size of hash table - 1 (mask) */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, "");
static u_long	ncnegfactor = 16;	/* negative entries limited to 1/n of cache */
SYSCTL_ULONG(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, "");
static u_long	numneg;			/* number of negative entries allocated */
SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0, "");
static u_long	numcache;		/* number of cache entries allocated */
SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0, "");
static u_long	numcachehv;		/* number of cache entries with vnodes held */
SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0, "");
static u_long	numcachepl;		/* number of cache purges for leaf entries */
SYSCTL_ULONG(_debug, OID_AUTO, numcachepl, CTLFLAG_RD, &numcachepl, 0, "");
struct	nchstats nchstats;		/* cache effectiveness statistics */

static int	doingcache = 1;		/* 1 => enable the cache */
SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, "");
SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode), "");
SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache), "");

/*
 * The new name cache statistics
 */
SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
#define STATNODE(mode, name, var) \
	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, mode, var, 0, "");
STATNODE(CTLFLAG_RD, numneg, &numneg);
STATNODE(CTLFLAG_RD, numcache, &numcache);
static u_long numcalls; STATNODE(CTLFLAG_RD, numcalls, &numcalls);
static u_long dothits; STATNODE(CTLFLAG_RD, dothits, &dothits);
static u_long dotdothits; STATNODE(CTLFLAG_RD, dotdothits, &dotdothits);
static u_long numchecks; STATNODE(CTLFLAG_RD, numchecks, &numchecks);
static u_long nummiss; STATNODE(CTLFLAG_RD, nummiss, &nummiss);
static u_long nummisszap; STATNODE(CTLFLAG_RD, nummisszap, &nummisszap);
static u_long numposzaps; STATNODE(CTLFLAG_RD, numposzaps, &numposzaps);
static u_long numposhits; STATNODE(CTLFLAG_RD, numposhits, &numposhits);
static u_long numnegzaps; STATNODE(CTLFLAG_RD, numnegzaps, &numnegzaps);
static u_long numneghits; STATNODE(CTLFLAG_RD, numneghits, &numneghits);

SYSCTL_OPAQUE(_vfs_cache, OID_AUTO, nchstats, CTLFLAG_RD, &nchstats,
	sizeof(nchstats), "LU", "VFS cache effectiveness statistics");

static void cache_zap __P((struct namecache *ncp));

static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");

/*
 * Flags in namecache.nc_flag
 */
#define NCF_WHITE	1

/*
 * Grab an atomic snapshot of the name cache hash chain lengths
 */
SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL, "hash table stats");

static int
sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int n_nchash;
	int count;

	n_nchash = nchash + 1;	/* nchash is max index, not count */
	if (!req->oldptr)
		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));

	/* Scan hash tables for applicable entries */
	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
		count = 0;
		LIST_FOREACH(ncp, ncpp, nc_hash) {
			count++;
		}
		error = SYSCTL_OUT(req, (caddr_t)&count, sizeof(count));
		if (error)
			return (error);
	}
	return (0);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD,
	0, 0, sysctl_debug_hashstat_rawnchash, "S,int", "nchash chain lengths");

static int
sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int n_nchash;
	int count, maxlength, used, pct;

	if (!req->oldptr)
		return SYSCTL_OUT(req, 0, 4 * sizeof(int));

	n_nchash = nchash + 1;	/* nchash is max index, not count */
	used = 0;
	maxlength = 0;

	/* Scan hash tables for applicable entries */
	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
		count = 0;
		LIST_FOREACH(ncp, ncpp, nc_hash) {
			count++;
		}
		if (count)
			used++;
		if (maxlength < count)
			maxlength = count;
	}
	n_nchash = nchash + 1;
	pct = (used * 100 * 100) / n_nchash;	/* utilization in 100ths of a percent */
	error = SYSCTL_OUT(req, (caddr_t)&n_nchash, sizeof(n_nchash));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, (caddr_t)&used, sizeof(used));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, (caddr_t)&maxlength, sizeof(maxlength));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, (caddr_t)&pct, sizeof(pct));
	if (error)
		return (error);
	return (0);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD,
	0, 0, sysctl_debug_hashstat_nchash, "I", "nchash chain lengths");
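
/*
 * Example (assumed userland usage; the OID names follow from the
 * declarations above):
 *
 *	$ sysctl debug.hashstat.nchash
 *
 * reports four ints: the table size, the number of buckets in use,
 * the longest chain, and bucket utilization in 100ths of a percent.
 */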

/*
 * Delete an entry from its hash list, unlink it from its source and
 * destination vnodes, and free it.  Negative entries are also removed
 * from the negative-entry LRU.
 */
static void
cache_zap(ncp)
	struct namecache *ncp;
{
	LIST_REMOVE(ncp, nc_hash);
	LIST_REMOVE(ncp, nc_src);
	if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
		vdrop(ncp->nc_dvp);
		numcachehv--;
	}
	if (ncp->nc_vp) {
		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
	} else {
		TAILQ_REMOVE(&ncneg, ncp, nc_dst);
		numneg--;
	}
	numcache--;
	free(ncp, M_VFSCACHE);
}

/*
 * Lookup an entry in the cache
 *
 * We don't do this if the segment name is long, simply so the cache
 * can avoid holding long names (which would either waste space, or
 * add greatly to the complexity).
 *
 * Lookup is called with dvp pointing to the directory to search,
 * cnp pointing to the name of the entry being sought. If the lookup
 * succeeds, the vnode is returned in *vpp, and a status of -1 is
 * returned. If the lookup determines that the name does not exist
 * (negative caching), a status of ENOENT is returned. If the lookup
 * fails, a status of zero is returned.
 */
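
/*
 * A sketch of the caller's side of this protocol (illustrative only;
 * vfs_cache_lookup() below is the in-tree consumer):
 *
 *	error = cache_lookup(dvp, vpp, cnp);
 *	if (error == -1)
 *		positive hit, *vpp holds the vnode
 *	else if (error == ENOENT)
 *		negative hit, the name is known not to exist
 *	else
 *		miss (error == 0), fall back to VOP_CACHEDLOOKUP()
 */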

int
cache_lookup(dvp, vpp, cnp)
	struct vnode *dvp;
	struct vnode **vpp;
	struct componentname *cnp;
{
	struct namecache *ncp;
	u_int32_t hash;

	if (!doingcache) {
		cnp->cn_flags &= ~MAKEENTRY;
		return (0);
	}

	numcalls++;

	if (cnp->cn_nameptr[0] == '.') {
		if (cnp->cn_namelen == 1) {
			*vpp = dvp;
			dothits++;
			return (-1);
		}
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
			dotdothits++;
			if (dvp->v_dd->v_id != dvp->v_ddid ||
			    (cnp->cn_flags & MAKEENTRY) == 0) {
				dvp->v_ddid = 0;
				return (0);
			}
			*vpp = dvp->v_dd;
			return (-1);
		}
	}

	hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT);
	hash = fnv_32_buf(&dvp->v_id, sizeof(dvp->v_id), hash);
	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		numchecks++;
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	/* We failed to find an entry */
	if (ncp == 0) {
		if ((cnp->cn_flags & MAKEENTRY) == 0) {
			nummisszap++;
		} else {
			nummiss++;
		}
		nchstats.ncs_miss++;
		return (0);
	}

	/* We don't want to have an entry, so dump it */
	if ((cnp->cn_flags & MAKEENTRY) == 0) {
		numposzaps++;
		nchstats.ncs_badhits++;
		cache_zap(ncp);
		return (0);
	}

	/* We found a "positive" match, return the vnode */
	if (ncp->nc_vp) {
		numposhits++;
		nchstats.ncs_goodhits++;
		*vpp = ncp->nc_vp;
		return (-1);
	}

	/* We found a negative match, and want to create it, so purge */
	if (cnp->cn_nameiop == CREATE) {
		numnegzaps++;
		nchstats.ncs_badhits++;
		cache_zap(ncp);
		return (0);
	}

	numneghits++;
	/*
	 * We found a "negative" match; ENOENT notifies the client of
	 * this match.  The nc_flag field records whether this is a
	 * whiteout.  Move the entry to the tail of the negative-entry
	 * LRU so it is reclaimed last.
	 */
	TAILQ_REMOVE(&ncneg, ncp, nc_dst);
	TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
	nchstats.ncs_neghits++;
	if (ncp->nc_flag & NCF_WHITE)
		cnp->cn_flags |= ISWHITEOUT;
	return (ENOENT);
}

/*
 * Add an entry to the cache.  A NULL vp makes a "negative" entry for
 * a name known not to exist.
 */
void
cache_enter(dvp, vp, cnp)
	struct vnode *dvp;
	struct vnode *vp;
	struct componentname *cnp;
{
	struct namecache *ncp;
	struct nchashhead *ncpp;
	u_int32_t hash;
	int len;

	if (!doingcache)
		return;

	if (cnp->cn_nameptr[0] == '.') {
		if (cnp->cn_namelen == 1) {
			return;
		}
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
			if (vp) {
				dvp->v_dd = vp;
				dvp->v_ddid = vp->v_id;
			} else {
				dvp->v_dd = dvp;
				dvp->v_ddid = 0;
			}
			return;
		}
	}

	ncp = (struct namecache *)
		malloc(sizeof *ncp + cnp->cn_namelen, M_VFSCACHE, M_WAITOK);
	bzero((char *)ncp, sizeof *ncp);
	numcache++;
	if (!vp) {
		numneg++;
		ncp->nc_flag = cnp->cn_flags & ISWHITEOUT ? NCF_WHITE : 0;
	} else if (vp->v_type == VDIR) {
		vp->v_dd = dvp;
		vp->v_ddid = dvp->v_id;
	}

	/*
	 * Fill in cache info.  If vp is NULL this is a "negative" cache
	 * entry; for negative entries we record in nc_flag whether the
	 * name is a whiteout.
	 */
	ncp->nc_vp = vp;
	ncp->nc_dvp = dvp;
	len = ncp->nc_nlen = cnp->cn_namelen;
	hash = fnv_32_buf(cnp->cn_nameptr, len, FNV1_32_INIT);
	bcopy(cnp->cn_nameptr, ncp->nc_name, len);
	hash = fnv_32_buf(&dvp->v_id, sizeof(dvp->v_id), hash);
	ncpp = NCHHASH(hash);
	LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
	if (LIST_EMPTY(&dvp->v_cache_src)) {
		vhold(dvp);
		numcachehv++;
	}
	LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
	if (vp) {
		TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
	} else {
		TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
	}
	/* Keep negative entries to at most 1/ncnegfactor of the cache. */
	if (numneg * ncnegfactor > numcache) {
		ncp = TAILQ_FIRST(&ncneg);
		cache_zap(ncp);
	}
}
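
/*
 * A sketch of how a filesystem lookup routine typically feeds the
 * cache (illustrative only; the MAKEENTRY test reflects the usual
 * convention, not a requirement imposed here):
 *
 *	... directory scan found "name" as vp, or found nothing ...
 *	if (cnp->cn_flags & MAKEENTRY)
 *		cache_enter(dvp, vp, cnp);	(vp == NULL caches ENOENT)
 */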

/*
 * Name cache initialization, run from SYSINIT when we are booting
 */
static void
nchinit(void *dummy __unused)
{

	TAILQ_INIT(&ncneg);
	nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL)

/*
 * Invalidate all entries to a particular vnode.
 *
 * Remove all entries in the namecache relating to this vnode and
 * change the v_id.  We take the v_id from a global counter, since
 * it becomes a handy sequence number in crash-dumps that way.
 * No valid vnode will ever have (v_id == 0).
 *
 * XXX: Only time and the size of v_id prevents this from failing:
 * XXX: In theory we should hunt down all (struct vnode*, v_id)
 * XXX: soft references and nuke them, at least on the global
 * XXX: v_id wraparound.  The period of resistance can be extended
 * XXX: by incrementing each vnode's v_id individually instead of
 * XXX: using the global v_id.
 */

void
cache_purge(vp)
	struct vnode *vp;
{
	static u_long nextid;

	while (!LIST_EMPTY(&vp->v_cache_src))
		cache_zap(LIST_FIRST(&vp->v_cache_src));
	while (!TAILQ_EMPTY(&vp->v_cache_dst))
		cache_zap(TAILQ_FIRST(&vp->v_cache_dst));

	do
		nextid++;
	while (nextid == vp->v_id || !nextid);
	vp->v_id = nextid;
	vp->v_dd = vp;
	vp->v_ddid = 0;
}
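
/*
 * Holders of a (vnode pointer, v_id) soft reference notice the purge
 * by re-checking the recorded id, as cache_lookup() and __getcwd() do
 * for the per-vnode v_dd/v_ddid pair:
 *
 *	if (vp->v_dd->v_id != vp->v_ddid)
 *		the reference is stale; treat it as a cache miss
 */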

/*
 * Flush all entries referencing a particular filesystem.
 *
 * Since we need to check it anyway, we will flush all the invalid
 * entries at the same time.
 */
void
cache_purgevfs(mp)
	struct mount *mp;
{
	struct nchashhead *ncpp;
	struct namecache *ncp, *nnp;

	/* Scan hash tables for applicable entries */
	for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) {
		for (ncp = LIST_FIRST(ncpp); ncp != 0; ncp = nnp) {
			nnp = LIST_NEXT(ncp, nc_hash);
			if (ncp->nc_dvp->v_mount == mp) {
				cache_zap(ncp);
			}
		}
	}
}

/*
 * Flush all directory entries with no child directories held in
 * the cache.
 *
 * Since we need to check it anyway, we will flush all the invalid
 * entries at the same time.
 */
void
cache_purgeleafdirs(ndir)
	int ndir;
{
	struct nchashhead *ncpp;
	struct namecache *ncp, *nnp, *ncpc, *nnpc;
	struct vnode *dvp;

	/* Scan hash tables for applicable entries */
	for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl && ndir > 0; ncpp--) {
		for (ncp = LIST_FIRST(ncpp); ncp != 0 && ndir > 0; ncp = nnp) {
			nnp = LIST_NEXT(ncp, nc_hash);
			if (ncp->nc_dvp != 0) {
				/*
				 * Skip over if nc_dvp of this cache holds
				 * a child directory, or the hold count of
				 * nc_dvp is greater than 1 (in which case
				 * nc_dvp is likely to be the working
				 * directory of a process).
				 */
				if (ncp->nc_dvp->v_holdcnt > 1)
					continue;
				for (ncpc = LIST_FIRST(&ncp->nc_dvp->v_cache_src);
				     ncpc != 0; ncpc = nnpc) {
					nnpc = LIST_NEXT(ncpc, nc_src);
					if (ncpc->nc_vp != 0 && ncpc->nc_vp->v_type == VDIR)
						break;
				}
				if (ncpc == 0) {
					/*
					 * Zap all of this directory's children,
					 * held in ncp->nc_dvp->v_cache_src.
					 */
					dvp = ncp->nc_dvp;
					while (!LIST_EMPTY(&dvp->v_cache_src))
						cache_zap(LIST_FIRST(&dvp->v_cache_src));

					ndir--;

					/* Restart in case where nnp is reclaimed. */
					nnp = LIST_FIRST(ncpp);
					continue;
				}
			}
		}
	}
	numcachepl++;
}

/*
 * Perform canonical checks and cache lookup, and pass on to the
 * filesystem (through VOP_CACHEDLOOKUP) only if needed.
 */

int
vfs_cache_lookup(ap)
	struct vop_lookup_args /* {
		struct vnode *a_dvp;
		struct vnode **a_vpp;
		struct componentname *a_cnp;
	} */ *ap;
{
	struct vnode *dvp, *vp;
	int lockparent;
	int error;
	struct vnode **vpp = ap->a_vpp;
	struct componentname *cnp = ap->a_cnp;
	struct ucred *cred = cnp->cn_cred;
	int flags = cnp->cn_flags;
	struct proc *p = cnp->cn_proc;
	u_long vpid;	/* capability number of vnode */

	*vpp = NULL;
	dvp = ap->a_dvp;
	lockparent = flags & LOCKPARENT;

	if (dvp->v_type != VDIR)
		return (ENOTDIR);

	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
		return (EROFS);

	error = VOP_ACCESS(dvp, VEXEC, cred, p);

	if (error)
		return (error);

	error = cache_lookup(dvp, vpp, cnp);

	if (!error)
		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));

	if (error == ENOENT)
		return (error);

	vp = *vpp;
	vpid = vp->v_id;
	cnp->cn_flags &= ~PDIRUNLOCK;
	if (dvp == vp) {   /* lookup on "." */
		VREF(vp);
		error = 0;
	} else if (flags & ISDOTDOT) {
		VOP_UNLOCK(dvp, 0, p);
		cnp->cn_flags |= PDIRUNLOCK;
		error = vget(vp, LK_EXCLUSIVE, p);
		if (!error && lockparent && (flags & ISLASTCN)) {
			if ((error = vn_lock(dvp, LK_EXCLUSIVE, p)) == 0)
				cnp->cn_flags &= ~PDIRUNLOCK;
		}
	} else {
		error = vget(vp, LK_EXCLUSIVE, p);
		if (!lockparent || error || !(flags & ISLASTCN)) {
			VOP_UNLOCK(dvp, 0, p);
			cnp->cn_flags |= PDIRUNLOCK;
		}
	}
	/*
	 * Check that the capability number did not change
	 * while we were waiting for the lock.
	 */
	if (!error) {
		if (vpid == vp->v_id)
			return (0);
		vput(vp);
		if (lockparent && dvp != vp && (flags & ISLASTCN)) {
			VOP_UNLOCK(dvp, 0, p);
			cnp->cn_flags |= PDIRUNLOCK;
		}
	}
	if (cnp->cn_flags & PDIRUNLOCK) {
		error = vn_lock(dvp, LK_EXCLUSIVE, p);
		if (error)
			return (error);
		cnp->cn_flags &= ~PDIRUNLOCK;
	}
	return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
}

#ifndef _SYS_SYSPROTO_H_
struct  __getcwd_args {
	u_char	*buf;
	u_int	buflen;
};
#endif
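
/*
 * Userland reaches this through getcwd(3); if this fast path fails,
 * the libc wrapper is expected to fall back to the traditional
 * readdir-based walk up ".." (assumed libc behavior, noted here for
 * context only).
 */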

static int disablecwd;
SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0, "");

static u_long numcwdcalls; STATNODE(CTLFLAG_RD, numcwdcalls, &numcwdcalls);
static u_long numcwdfail1; STATNODE(CTLFLAG_RD, numcwdfail1, &numcwdfail1);
static u_long numcwdfail2; STATNODE(CTLFLAG_RD, numcwdfail2, &numcwdfail2);
static u_long numcwdfail3; STATNODE(CTLFLAG_RD, numcwdfail3, &numcwdfail3);
static u_long numcwdfail4; STATNODE(CTLFLAG_RD, numcwdfail4, &numcwdfail4);
static u_long numcwdfound; STATNODE(CTLFLAG_RD, numcwdfound, &numcwdfound);
int
__getcwd(p, uap)
	struct proc *p;
	struct __getcwd_args *uap;
{
	char *bp, *buf;
	int error, i, slash_prefixed;
	struct filedesc *fdp;
	struct namecache *ncp;
	struct vnode *vp;

	numcwdcalls++;
	if (disablecwd)
		return (ENODEV);
	if (uap->buflen < 2)
		return (EINVAL);
	if (uap->buflen > MAXPATHLEN)
		uap->buflen = MAXPATHLEN;
	buf = bp = malloc(uap->buflen, M_TEMP, M_WAITOK);
	bp += uap->buflen - 1;
	*bp = '\0';
	fdp = p->p_fd;
	slash_prefixed = 0;
	for (vp = fdp->fd_cdir; vp != fdp->fd_rdir && vp != rootvnode;) {
		if (vp->v_flag & VROOT) {
			if (vp->v_mount == NULL) {	/* forced unmount */
				free(buf, M_TEMP);
				return (EBADF);
			}
			vp = vp->v_mount->mnt_vnodecovered;
			continue;
		}
		if (vp->v_dd->v_id != vp->v_ddid) {
			numcwdfail1++;
			free(buf, M_TEMP);
			return (ENOTDIR);
		}
		ncp = TAILQ_FIRST(&vp->v_cache_dst);
		if (!ncp) {
			numcwdfail2++;
			free(buf, M_TEMP);
			return (ENOENT);
		}
		if (ncp->nc_dvp != vp->v_dd) {
			numcwdfail3++;
			free(buf, M_TEMP);
			return (EBADF);
		}
		for (i = ncp->nc_nlen - 1; i >= 0; i--) {
			if (bp == buf) {
				numcwdfail4++;
				free(buf, M_TEMP);
				return (ENOMEM);
			}
			*--bp = ncp->nc_name[i];
		}
		if (bp == buf) {
			numcwdfail4++;
			free(buf, M_TEMP);
			return (ENOMEM);
		}
		*--bp = '/';
		slash_prefixed = 1;
		vp = vp->v_dd;
	}
	if (!slash_prefixed) {
		if (bp == buf) {
			numcwdfail4++;
			free(buf, M_TEMP);
			return (ENOMEM);
		}
		*--bp = '/';
	}
	numcwdfound++;
	error = copyout(bp, uap->buf, strlen(bp) + 1);
	free(buf, M_TEMP);
	return (error);
}

/*
 * Thus begins the fullpath magic.
 */

#undef STATNODE
#define STATNODE(name)							\
	static u_int name;						\
	SYSCTL_UINT(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, "")

static int disablefullpath;
SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW,
    &disablefullpath, 0, "");

STATNODE(numfullpathcalls);
STATNODE(numfullpathfail1);
STATNODE(numfullpathfail2);
STATNODE(numfullpathfail3);
STATNODE(numfullpathfail4);
STATNODE(numfullpathfound);

int
textvp_fullpath(struct proc *p, char **retbuf, char **retfreebuf)
{
	char *bp, *buf;
	int i, slash_prefixed;
	struct filedesc *fdp;
	struct namecache *ncp;
	struct vnode *vp, *textvp;

	numfullpathcalls++;
	if (disablefullpath)
		return (ENODEV);
	textvp = p->p_textvp;
	if (textvp == NULL)
		return (EINVAL);
	buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
	bp = buf + MAXPATHLEN - 1;
	*bp = '\0';
	fdp = p->p_fd;
	slash_prefixed = 0;
	for (vp = textvp; vp != fdp->fd_rdir && vp != rootvnode;) {
		if (vp->v_flag & VROOT) {
			if (vp->v_mount == NULL) {	/* forced unmount */
				free(buf, M_TEMP);
				return (EBADF);
			}
			vp = vp->v_mount->mnt_vnodecovered;
			continue;
		}
		if (vp != textvp && vp->v_dd->v_id != vp->v_ddid) {
			numfullpathfail1++;
			free(buf, M_TEMP);
			return (ENOTDIR);
		}
		ncp = TAILQ_FIRST(&vp->v_cache_dst);
		if (!ncp) {
			numfullpathfail2++;
			free(buf, M_TEMP);
			return (ENOENT);
		}
		if (vp != textvp && ncp->nc_dvp != vp->v_dd) {
			numfullpathfail3++;
			free(buf, M_TEMP);
			return (EBADF);
		}
		for (i = ncp->nc_nlen - 1; i >= 0; i--) {
			if (bp == buf) {
				numfullpathfail4++;
				free(buf, M_TEMP);
				return (ENOMEM);
			}
			*--bp = ncp->nc_name[i];
		}
		if (bp == buf) {
			numfullpathfail4++;
			free(buf, M_TEMP);
			return (ENOMEM);
		}
		*--bp = '/';
		slash_prefixed = 1;
		vp = ncp->nc_dvp;
	}
	if (!slash_prefixed) {
		if (bp == buf) {
			numfullpathfail4++;
			free(buf, M_TEMP);
			return (ENOMEM);
		}
		*--bp = '/';
	}
	numfullpathfound++;
	*retbuf = bp;
	*retfreebuf = buf;
	return (0);
}