xref: /freebsd/sys/kern/vfs_cache.c (revision 9af6c78cd43b18e169f10802142c61638bd62bed)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1989, 1993, 1995
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * Poul-Henning Kamp of the FreeBSD Project.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include "opt_ddb.h"
41 #include "opt_ktrace.h"
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/counter.h>
46 #include <sys/filedesc.h>
47 #include <sys/fnv_hash.h>
48 #include <sys/kernel.h>
49 #include <sys/ktr.h>
50 #include <sys/lock.h>
51 #include <sys/malloc.h>
52 #include <sys/fcntl.h>
53 #include <sys/mount.h>
54 #include <sys/namei.h>
55 #include <sys/proc.h>
56 #include <sys/rwlock.h>
57 #include <sys/sdt.h>
58 #include <sys/smp.h>
59 #include <sys/syscallsubr.h>
60 #include <sys/sysctl.h>
61 #include <sys/sysproto.h>
62 #include <sys/vnode.h>
63 #ifdef KTRACE
64 #include <sys/ktrace.h>
65 #endif
66 
67 #ifdef DDB
68 #include <ddb/ddb.h>
69 #endif
70 
71 #include <vm/uma.h>
72 
73 SDT_PROVIDER_DECLARE(vfs);
74 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
75     "struct vnode *");
76 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
77     "char *");
78 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
79 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
80     "char *", "struct vnode *");
81 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
82 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
83     "struct vnode *", "char *");
84 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
85     "struct vnode *");
86 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
87     "struct vnode *", "char *");
88 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
89     "char *");
90 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
91 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
92 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
93 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
94     "struct vnode *");
95 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
96     "char *");
97 SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
98     "char *");
99 
100 /*
101  * This structure describes the elements in the cache of recent
102  * names looked up by namei.
103  */
104 
105 struct	namecache {
106 	LIST_ENTRY(namecache) nc_hash;	/* hash chain */
107 	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
108 	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
109 	struct	vnode *nc_dvp;		/* vnode of parent of name */
110 	union {
111 		struct	vnode *nu_vp;	/* vnode the name refers to */
112 	} n_un;
113 	u_char	nc_flag;		/* flag bits */
114 	u_char	nc_nlen;		/* length of name */
115 	char	nc_name[0];		/* segment name + nul */
116 };
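
/*
 * The component name is stored inline in the nc_name array at the end of the
 * structure, so an entry is allocated with room for the name and its
 * terminating nul.  The UMA zones created in nchinit() and selected in
 * cache_alloc() below size entries accordingly (CACHE_PATH_CUTOFF splits
 * "small" from "large" names).
 */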
117 
118 /*
119  * struct namecache_ts embeds struct namecache as its last member, so the
120  * embedded entry can be converted back to it with __containerof().
121  * struct namecache_ts is used in place of struct namecache when time(s) need
122  * to be stored.  The nc_dotdottime field is used when a cache entry maps
123  * both a non-dotdot directory name and dotdot for the directory's
124  * parent.
125  */
126 struct	namecache_ts {
127 	struct	timespec nc_time;	/* timespec provided by fs */
128 	struct	timespec nc_dotdottime;	/* dotdot timespec provided by fs */
129 	int	nc_ticks;		/* ticks value when entry was added */
130 	struct namecache nc_nc;
131 };
132 
133 #define	nc_vp		n_un.nu_vp
134 
135 /*
136  * Flags in namecache.nc_flag
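 *
 * NCF_WHITE	- negative entry which represents a whiteout
 * NCF_ISDOTDOT	- the entry is a ".." entry hanging off nc_dvp->v_cache_dd
 * NCF_TS	- the entry is embedded in a struct namecache_ts and carries
 *		  timestamps
 * NCF_DTS	- the nc_dotdottime timestamp is valid as well
 * NCF_DVDROP	- cache_free() must vdrop() nc_dvp
 * NCF_NEGATIVE	- negative entry, no vnode is associated with the name
 * NCF_HOTNEGATIVE	- the negative entry currently resides on the hot list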
137  */
138 #define NCF_WHITE	0x01
139 #define NCF_ISDOTDOT	0x02
140 #define	NCF_TS		0x04
141 #define	NCF_DTS		0x08
142 #define	NCF_DVDROP	0x10
143 #define	NCF_NEGATIVE	0x20
144 #define	NCF_HOTNEGATIVE	0x40
145 
146 /*
147  * Name caching works as follows:
148  *
149  * Names found by directory scans are retained in a cache
150  * for future reference.  It is managed LRU, so frequently
151  * used names will hang around.  Cache is indexed by hash value
152  * obtained from (dvp, name) where dvp refers to the directory
153  * containing name.
154  *
155  * If it is a "negative" entry (i.e. for a name that is known NOT to
156  * exist), the vnode pointer will be NULL.
157  *
158  * Upon reaching the last segment of a path, if the reference
159  * is for DELETE, or NOCACHE is set (rewrite), and the
160  * name is located in the cache, it will be dropped.
161  *
162  * These locks are used (in the order in which they can be taken):
163  * NAME		TYPE	ROLE
164  * vnodelock	mtx	vnode lists and v_cache_dd field protection
165  * bucketlock	rwlock	for access to given set of hash buckets
166  * neglist	mtx	negative entry LRU management
167  *
168  * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
169  * shrinking the LRU list.
170  *
171  * It is legal to take multiple vnodelock and bucketlock locks. The locking
172  * order is lower address first. Both are recursive.
173  *
174  * "." lookups are lockless.
175  *
176  * ".." and vnode -> name lookups require vnodelock.
177  *
178  * name -> vnode lookup requires the relevant bucketlock to be held for reading.
179  *
180  * Insertions and removals of entries require involved vnodes and bucketlocks
181  * to be write-locked to prevent other threads from seeing the entry.
182  *
183  * Some lookups result in removal of the found entry (e.g. getting rid of a
184  * negative entry with the intent to create a positive one), which poses a
185  * problem when multiple threads reach that state at the same time.  Similarly,
186  * two threads can purge two different vnodes and try to remove the same name.
187  *
188  * If the already held vnode lock is lower than the second required lock, we
189  * can just take the other lock.  However, in the opposite case this could
190  * deadlock.  This is resolved by trylocking; if that fails, the first lock is
191  * dropped, everything is locked in order and the state is revalidated.
192  */
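
/*
 * Illustrative sketch (not compiled into the kernel): the deadlock avoidance
 * described above, reduced to two mutexes.  Locks are taken in increasing
 * address order; a thread which already holds the "wrong" one trylocks the
 * other and, on failure, drops what it holds, relocks in order and must then
 * revalidate whatever it had looked up.  This mirrors what cache_sort_vnodes(),
 * cache_trylock_vnodes() and cache_zap_unlocked_bucket() below do.
 */
#if 0
static bool
example_lock_second(struct mtx *held, struct mtx *wanted)
{

	if (held < wanted) {
		/* Already in address order, safe to block. */
		mtx_lock(wanted);
		return (true);
	}
	if (mtx_trylock(wanted))
		return (true);
	/* Blocking here could deadlock; back off and relock in order. */
	mtx_unlock(held);
	mtx_lock(wanted);
	mtx_lock(held);
	return (false);		/* caller must revalidate its state */
}
#endif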
193 
194 /*
195  * Structures associated with name caching.
196  */
197 #define NCHHASH(hash) \
198 	(&nchashtbl[(hash) & nchash])
199 static __read_mostly LIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
200 static u_long __read_mostly	nchash;			/* size of hash table */
201 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
202     "Size of namecache hash table");
203 static u_long __read_mostly	ncnegfactor = 5; /* ratio of negative entries */
204 SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
205     "Ratio of negative namecache entries");
206 static u_long __exclusive_cache_line	numneg;	/* number of negative entries allocated */
207 static u_long __exclusive_cache_line	numcache;/* number of cache entries allocated */
208 static u_long __exclusive_cache_line	numcachehv;/* number of cache entries with vnodes held */
209 u_int ncsizefactor = 2;
210 SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
211     "Size factor for namecache");
212 static u_int __read_mostly	ncpurgeminvnodes;
213 SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0,
214     "Number of vnodes below which purgevfs ignores the request");
215 static u_int __read_mostly	ncsize; /* the size as computed on creation or resizing */
216 
217 struct nchstats	nchstats;		/* cache effectiveness statistics */
218 
219 static struct mtx __exclusive_cache_line	ncneg_shrink_lock;
220 static int	shrink_list_turn;
221 
222 struct neglist {
223 	struct mtx		nl_lock;
224 	TAILQ_HEAD(, namecache) nl_list;
225 } __aligned(CACHE_LINE_SIZE);
226 
227 static struct neglist __read_mostly	*neglists;
228 static struct neglist ncneg_hot;
229 static u_long numhotneg;
230 
231 #define	numneglists (ncneghash + 1)
232 static u_int __read_mostly	ncneghash;
233 static inline struct neglist *
234 NCP2NEGLIST(struct namecache *ncp)
235 {
236 
237 	return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
238 }
239 
240 #define	numbucketlocks (ncbuckethash + 1)
241 static u_int __read_mostly  ncbuckethash;
242 static struct rwlock_padalign __read_mostly  *bucketlocks;
243 #define	HASH2BUCKETLOCK(hash) \
244 	((struct rwlock *)(&bucketlocks[((hash) & ncbuckethash)]))
245 
246 #define	numvnodelocks (ncvnodehash + 1)
247 static u_int __read_mostly  ncvnodehash;
248 static struct mtx __read_mostly *vnodelocks;
249 static inline struct mtx *
250 VP2VNODELOCK(struct vnode *vp)
251 {
252 
253 	return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
254 }
255 
256 /*
257  * UMA zones for the VFS cache.
258  *
259  * The small cache is used for entries with short names, which are the
260  * most common.  The large cache is used for entries which are too big to
261  * fit in the small cache.
262  */
263 static uma_zone_t __read_mostly cache_zone_small;
264 static uma_zone_t __read_mostly cache_zone_small_ts;
265 static uma_zone_t __read_mostly cache_zone_large;
266 static uma_zone_t __read_mostly cache_zone_large_ts;
267 
268 #define	CACHE_PATH_CUTOFF	35
269 
270 static struct namecache *
271 cache_alloc(int len, int ts)
272 {
273 	struct namecache_ts *ncp_ts;
274 	struct namecache *ncp;
275 
276 	if (__predict_false(ts)) {
277 		if (len <= CACHE_PATH_CUTOFF)
278 			ncp_ts = uma_zalloc(cache_zone_small_ts, M_WAITOK);
279 		else
280 			ncp_ts = uma_zalloc(cache_zone_large_ts, M_WAITOK);
281 		ncp = &ncp_ts->nc_nc;
282 	} else {
283 		if (len <= CACHE_PATH_CUTOFF)
284 			ncp = uma_zalloc(cache_zone_small, M_WAITOK);
285 		else
286 			ncp = uma_zalloc(cache_zone_large, M_WAITOK);
287 	}
288 	return (ncp);
289 }
290 
291 static void
292 cache_free(struct namecache *ncp)
293 {
294 	struct namecache_ts *ncp_ts;
295 
296 	if (ncp == NULL)
297 		return;
298 	if ((ncp->nc_flag & NCF_DVDROP) != 0)
299 		vdrop(ncp->nc_dvp);
300 	if (__predict_false(ncp->nc_flag & NCF_TS)) {
301 		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
302 		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
303 			uma_zfree(cache_zone_small_ts, ncp_ts);
304 		else
305 			uma_zfree(cache_zone_large_ts, ncp_ts);
306 	} else {
307 		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
308 			uma_zfree(cache_zone_small, ncp);
309 		else
310 			uma_zfree(cache_zone_large, ncp);
311 	}
312 }
313 
314 static void
315 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
316 {
317 	struct namecache_ts *ncp_ts;
318 
319 	KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
320 	    (tsp == NULL && ticksp == NULL),
321 	    ("No NCF_TS"));
322 
323 	if (tsp == NULL && ticksp == NULL)
324 		return;
325 
326 	ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
327 	if (tsp != NULL)
328 		*tsp = ncp_ts->nc_time;
329 	if (ticksp != NULL)
330 		*ticksp = ncp_ts->nc_ticks;
331 }
332 
333 static int __read_mostly	doingcache = 1;	/* 1 => enable the cache */
334 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
335     "VFS namecache enabled");
336 
337 /* Export size information to userland */
338 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
339     sizeof(struct namecache), "sizeof(struct namecache)");
340 
341 /*
342  * The new name cache statistics
343  */
344 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0,
345     "Name cache statistics");
346 #define STATNODE_ULONG(name, descr)	\
347 	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr);
348 #define STATNODE_COUNTER(name, descr)	\
349 	static counter_u64_t __read_mostly name; \
350 	SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, descr);
351 STATNODE_ULONG(numneg, "Number of negative cache entries");
352 STATNODE_ULONG(numcache, "Number of cache entries");
353 STATNODE_ULONG(numcachehv, "Number of namecache entries with vnodes held");
354 STATNODE_COUNTER(numcalls, "Number of cache lookups");
355 STATNODE_COUNTER(dothits, "Number of '.' hits");
356 STATNODE_COUNTER(dotdothits, "Number of '..' hits");
357 STATNODE_COUNTER(numchecks, "Number of checks in lookup");
358 STATNODE_COUNTER(nummiss, "Number of cache misses");
359 STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache");
360 STATNODE_COUNTER(numposzaps,
361     "Number of cache hits (positive) we do not want to cache");
362 STATNODE_COUNTER(numposhits, "Number of cache hits (positive)");
363 STATNODE_COUNTER(numnegzaps,
364     "Number of cache hits (negative) we do not want to cache");
365 STATNODE_COUNTER(numneghits, "Number of cache hits (negative)");
366 /* These count for kern___getcwd(), too. */
367 STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls");
368 STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
369 STATNODE_COUNTER(numfullpathfail2,
370     "Number of fullpath search errors (VOP_VPTOCNP failures)");
371 STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
372 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls");
373 STATNODE_COUNTER(zap_and_exit_bucket_relock_success,
374     "Number of successful removals after relocking");
375 static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail,
376     "Number of times zap_and_exit failed to lock");
377 static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2,
378     "Number of times zap_and_exit failed to lock");
379 static long cache_lock_vnodes_cel_3_failures;
380 STATNODE_ULONG(cache_lock_vnodes_cel_3_failures,
381     "Number of times 3-way vnode locking failed");
382 STATNODE_ULONG(numhotneg, "Number of hot negative entries");
383 STATNODE_COUNTER(numneg_evicted,
384     "Number of negative entries evicted when adding a new entry");
385 STATNODE_COUNTER(shrinking_skipped,
386     "Number of times shrinking was already in progress");
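
/*
 * The counters and variables above are exported under the vfs.cache sysctl
 * tree (e.g. "sysctl vfs.cache" from the shell).  A minimal userland sketch
 * (assuming a FreeBSD host) for reading one of them programmatically:
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>

#include <stdio.h>

int
main(void)
{
	unsigned long numneg;
	size_t len = sizeof(numneg);

	if (sysctlbyname("vfs.cache.numneg", &numneg, &len, NULL, 0) == -1) {
		perror("sysctlbyname");
		return (1);
	}
	printf("negative namecache entries: %lu\n", numneg);
	return (0);
}
#endif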
387 
388 static void cache_zap_locked(struct namecache *ncp, bool neg_locked);
389 static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
390     char *buf, char **retbuf, u_int buflen);
391 
392 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
393 
394 static int cache_yield;
395 SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0,
396     "Number of times cache called yield");
397 
398 static void __noinline
399 cache_maybe_yield(void)
400 {
401 
402 	if (should_yield()) {
403 		cache_yield++;
404 		kern_yield(PRI_USER);
405 	}
406 }
407 
408 static inline void
409 cache_assert_vlp_locked(struct mtx *vlp)
410 {
411 
412 	if (vlp != NULL)
413 		mtx_assert(vlp, MA_OWNED);
414 }
415 
416 static inline void
417 cache_assert_vnode_locked(struct vnode *vp)
418 {
419 	struct mtx *vlp;
420 
421 	vlp = VP2VNODELOCK(vp);
422 	cache_assert_vlp_locked(vlp);
423 }
424 
425 static uint32_t
426 cache_get_hash(char *name, u_char len, struct vnode *dvp)
427 {
428 	uint32_t hash;
429 
430 	hash = fnv_32_buf(name, len, FNV1_32_INIT);
431 	hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
432 	return (hash);
433 }
434 
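/*
 * Illustrative sketch (not compiled): how a (directory vnode, name) pair maps
 * to its hash chain and to the rwlock guarding that chain.  FNV-1 is run over
 * the name bytes and then over the dvp pointer value, and the result is
 * masked with nchash and ncbuckethash by the NCHHASH() and HASH2BUCKETLOCK()
 * macros.
 */
#if 0
static void
example_hash_to_bucket(struct vnode *dvp, char *name, u_char len)
{
	uint32_t hash;
	struct nchashhead *chain;
	struct rwlock *blp;

	hash = cache_get_hash(name, len, dvp);
	chain = NCHHASH(hash);		/* hash chain to walk */
	blp = HASH2BUCKETLOCK(hash);	/* rwlock protecting that chain */
	(void)chain;
	(void)blp;
}
#endif
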
435 static inline struct rwlock *
436 NCP2BUCKETLOCK(struct namecache *ncp)
437 {
438 	uint32_t hash;
439 
440 	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
441 	return (HASH2BUCKETLOCK(hash));
442 }
443 
444 #ifdef INVARIANTS
445 static void
446 cache_assert_bucket_locked(struct namecache *ncp, int mode)
447 {
448 	struct rwlock *blp;
449 
450 	blp = NCP2BUCKETLOCK(ncp);
451 	rw_assert(blp, mode);
452 }
453 #else
454 #define cache_assert_bucket_locked(x, y) do { } while (0)
455 #endif
456 
457 #define cache_sort_vnodes(x, y)	_cache_sort_vnodes((void **)(x), (void **)(y))
458 static void
459 _cache_sort_vnodes(void **p1, void **p2)
460 {
461 	void *tmp;
462 
463 	MPASS(*p1 != NULL || *p2 != NULL);
464 
465 	if (*p1 > *p2) {
466 		tmp = *p2;
467 		*p2 = *p1;
468 		*p1 = tmp;
469 	}
470 }
471 
472 static void
473 cache_lock_all_buckets(void)
474 {
475 	u_int i;
476 
477 	for (i = 0; i < numbucketlocks; i++)
478 		rw_wlock(&bucketlocks[i]);
479 }
480 
481 static void
482 cache_unlock_all_buckets(void)
483 {
484 	u_int i;
485 
486 	for (i = 0; i < numbucketlocks; i++)
487 		rw_wunlock(&bucketlocks[i]);
488 }
489 
490 static void
491 cache_lock_all_vnodes(void)
492 {
493 	u_int i;
494 
495 	for (i = 0; i < numvnodelocks; i++)
496 		mtx_lock(&vnodelocks[i]);
497 }
498 
499 static void
500 cache_unlock_all_vnodes(void)
501 {
502 	u_int i;
503 
504 	for (i = 0; i < numvnodelocks; i++)
505 		mtx_unlock(&vnodelocks[i]);
506 }
507 
508 static int
509 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
510 {
511 
512 	cache_sort_vnodes(&vlp1, &vlp2);
513 
514 	if (vlp1 != NULL) {
515 		if (!mtx_trylock(vlp1))
516 			return (EAGAIN);
517 	}
518 	if (!mtx_trylock(vlp2)) {
519 		if (vlp1 != NULL)
520 			mtx_unlock(vlp1);
521 		return (EAGAIN);
522 	}
523 
524 	return (0);
525 }
526 
527 static void
528 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
529 {
530 
531 	MPASS(vlp1 != NULL || vlp2 != NULL);
532 	MPASS(vlp1 <= vlp2);
533 
534 	if (vlp1 != NULL)
535 		mtx_lock(vlp1);
536 	if (vlp2 != NULL)
537 		mtx_lock(vlp2);
538 }
539 
540 static void
541 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
542 {
543 
544 	MPASS(vlp1 != NULL || vlp2 != NULL);
545 
546 	if (vlp1 != NULL)
547 		mtx_unlock(vlp1);
548 	if (vlp2 != NULL)
549 		mtx_unlock(vlp2);
550 }
551 
552 static int
553 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
554 {
555 	struct nchstats snap;
556 
557 	if (req->oldptr == NULL)
558 		return (SYSCTL_OUT(req, 0, sizeof(snap)));
559 
560 	snap = nchstats;
561 	snap.ncs_goodhits = counter_u64_fetch(numposhits);
562 	snap.ncs_neghits = counter_u64_fetch(numneghits);
563 	snap.ncs_badhits = counter_u64_fetch(numposzaps) +
564 	    counter_u64_fetch(numnegzaps);
565 	snap.ncs_miss = counter_u64_fetch(nummisszap) +
566 	    counter_u64_fetch(nummiss);
567 
568 	return (SYSCTL_OUT(req, &snap, sizeof(snap)));
569 }
570 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
571     CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
572     "VFS cache effectiveness statistics");
573 
574 #ifdef DIAGNOSTIC
575 /*
576  * Grab an atomic snapshot of the name cache hash chain lengths
577  */
578 static SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL,
579     "hash table stats");
580 
581 static int
582 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
583 {
584 	struct nchashhead *ncpp;
585 	struct namecache *ncp;
586 	int i, error, n_nchash, *cntbuf;
587 
588 retry:
589 	n_nchash = nchash + 1;	/* nchash is max index, not count */
590 	if (req->oldptr == NULL)
591 		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
592 	cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
593 	cache_lock_all_buckets();
594 	if (n_nchash != nchash + 1) {
595 		cache_unlock_all_buckets();
596 		free(cntbuf, M_TEMP);
597 		goto retry;
598 	}
599 	/* Scan hash tables counting entries */
600 	for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
601 		LIST_FOREACH(ncp, ncpp, nc_hash)
602 			cntbuf[i]++;
603 	cache_unlock_all_buckets();
604 	for (error = 0, i = 0; i < n_nchash; i++)
605 		if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
606 			break;
607 	free(cntbuf, M_TEMP);
608 	return (error);
609 }
610 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
611     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
612     "nchash chain lengths");
613 
614 static int
615 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
616 {
617 	int error;
618 	struct nchashhead *ncpp;
619 	struct namecache *ncp;
620 	int n_nchash;
621 	int count, maxlength, used, pct;
622 
623 	if (!req->oldptr)
624 		return SYSCTL_OUT(req, 0, 4 * sizeof(int));
625 
626 	cache_lock_all_buckets();
627 	n_nchash = nchash + 1;	/* nchash is max index, not count */
628 	used = 0;
629 	maxlength = 0;
630 
631 	/* Scan hash tables for applicable entries */
632 	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
633 		count = 0;
634 		LIST_FOREACH(ncp, ncpp, nc_hash) {
635 			count++;
636 		}
637 		if (count)
638 			used++;
639 		if (maxlength < count)
640 			maxlength = count;
641 	}
642 	n_nchash = nchash + 1;
643 	cache_unlock_all_buckets();
644 	pct = (used * 100) / (n_nchash / 100);
645 	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
646 	if (error)
647 		return (error);
648 	error = SYSCTL_OUT(req, &used, sizeof(used));
649 	if (error)
650 		return (error);
651 	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
652 	if (error)
653 		return (error);
654 	error = SYSCTL_OUT(req, &pct, sizeof(pct));
655 	if (error)
656 		return (error);
657 	return (0);
658 }
659 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
660     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
661     "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
662 #endif
663 
664 /*
665  * Negative entries management
666  *
667  * A variation of the LRU scheme is used. New entries are hashed into one of
668  * numneglists cold lists. Entries get promoted to the hot list on first hit.
669  *
670  * The shrinker will demote the head of the hot list and evict from the cold
671  * lists in a round-robin manner.
672  */
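/*
 * Illustrative sketch (not compiled): the hot/cold policy described above,
 * reduced to a single cold list.  A hit promotes an entry to the hot list;
 * the shrinker demotes the oldest hot entry back to the cold side and then
 * evicts the oldest cold entry.  The real code below additionally spreads
 * cold entries over numneglists lists and walks them round-robin.
 */
#if 0
struct example_entry {
	TAILQ_ENTRY(example_entry) link;
	bool hot;
};
TAILQ_HEAD(example_list, example_entry);

static void
example_hit(struct example_list *cold, struct example_list *hot,
    struct example_entry *e)
{

	if (e->hot)
		return;
	TAILQ_REMOVE(cold, e, link);
	TAILQ_INSERT_TAIL(hot, e, link);
	e->hot = true;
}

static struct example_entry *
example_shrink(struct example_list *cold, struct example_list *hot)
{
	struct example_entry *e;

	e = TAILQ_FIRST(hot);
	if (e != NULL) {
		/* Demote the oldest hot entry. */
		TAILQ_REMOVE(hot, e, link);
		TAILQ_INSERT_TAIL(cold, e, link);
		e->hot = false;
	}
	/* Evict the oldest cold entry, if any. */
	e = TAILQ_FIRST(cold);
	if (e != NULL)
		TAILQ_REMOVE(cold, e, link);
	return (e);
}
#endif
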
673 static void
674 cache_negative_hit(struct namecache *ncp)
675 {
676 	struct neglist *neglist;
677 
678 	MPASS(ncp->nc_flag & NCF_NEGATIVE);
679 	if (ncp->nc_flag & NCF_HOTNEGATIVE)
680 		return;
681 	neglist = NCP2NEGLIST(ncp);
682 	mtx_lock(&ncneg_hot.nl_lock);
683 	mtx_lock(&neglist->nl_lock);
684 	if (!(ncp->nc_flag & NCF_HOTNEGATIVE)) {
685 		numhotneg++;
686 		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
687 		TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
688 		ncp->nc_flag |= NCF_HOTNEGATIVE;
689 	}
690 	mtx_unlock(&neglist->nl_lock);
691 	mtx_unlock(&ncneg_hot.nl_lock);
692 }
693 
694 static void
695 cache_negative_insert(struct namecache *ncp, bool neg_locked)
696 {
697 	struct neglist *neglist;
698 
699 	MPASS(ncp->nc_flag & NCF_NEGATIVE);
700 	cache_assert_bucket_locked(ncp, RA_WLOCKED);
701 	neglist = NCP2NEGLIST(ncp);
702 	if (!neg_locked) {
703 		mtx_lock(&neglist->nl_lock);
704 	} else {
705 		mtx_assert(&neglist->nl_lock, MA_OWNED);
706 	}
707 	TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
708 	if (!neg_locked)
709 		mtx_unlock(&neglist->nl_lock);
710 	atomic_add_rel_long(&numneg, 1);
711 }
712 
713 static void
714 cache_negative_remove(struct namecache *ncp, bool neg_locked)
715 {
716 	struct neglist *neglist;
717 	bool hot_locked = false;
718 	bool list_locked = false;
719 
720 	MPASS(ncp->nc_flag & NCF_NEGATIVE);
721 	cache_assert_bucket_locked(ncp, RA_WLOCKED);
722 	neglist = NCP2NEGLIST(ncp);
723 	if (!neg_locked) {
724 		if (ncp->nc_flag & NCF_HOTNEGATIVE) {
725 			hot_locked = true;
726 			mtx_lock(&ncneg_hot.nl_lock);
727 			if (!(ncp->nc_flag & NCF_HOTNEGATIVE)) {
728 				list_locked = true;
729 				mtx_lock(&neglist->nl_lock);
730 			}
731 		} else {
732 			list_locked = true;
733 			mtx_lock(&neglist->nl_lock);
734 		}
735 	}
736 	if (ncp->nc_flag & NCF_HOTNEGATIVE) {
737 		mtx_assert(&ncneg_hot.nl_lock, MA_OWNED);
738 		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
739 		numhotneg--;
740 	} else {
741 		mtx_assert(&neglist->nl_lock, MA_OWNED);
742 		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
743 	}
744 	if (list_locked)
745 		mtx_unlock(&neglist->nl_lock);
746 	if (hot_locked)
747 		mtx_unlock(&ncneg_hot.nl_lock);
748 	atomic_subtract_rel_long(&numneg, 1);
749 }
750 
751 static void
752 cache_negative_shrink_select(int start, struct namecache **ncpp,
753     struct neglist **neglistpp)
754 {
755 	struct neglist *neglist;
756 	struct namecache *ncp;
757 	int i;
758 
759 	*ncpp = ncp = NULL;
760 	neglist = NULL;
761 
762 	for (i = start; i < numneglists; i++) {
763 		neglist = &neglists[i];
764 		if (TAILQ_FIRST(&neglist->nl_list) == NULL)
765 			continue;
766 		mtx_lock(&neglist->nl_lock);
767 		ncp = TAILQ_FIRST(&neglist->nl_list);
768 		if (ncp != NULL)
769 			break;
770 		mtx_unlock(&neglist->nl_lock);
771 	}
772 
773 	*neglistpp = neglist;
774 	*ncpp = ncp;
775 }
776 
777 static void
778 cache_negative_zap_one(void)
779 {
780 	struct namecache *ncp, *ncp2;
781 	struct neglist *neglist;
782 	struct mtx *dvlp;
783 	struct rwlock *blp;
784 
785 	if (mtx_owner(&ncneg_shrink_lock) != NULL ||
786 	    !mtx_trylock(&ncneg_shrink_lock)) {
787 		counter_u64_add(shrinking_skipped, 1);
788 		return;
789 	}
790 
791 	mtx_lock(&ncneg_hot.nl_lock);
792 	ncp = TAILQ_FIRST(&ncneg_hot.nl_list);
793 	if (ncp != NULL) {
794 		neglist = NCP2NEGLIST(ncp);
795 		mtx_lock(&neglist->nl_lock);
796 		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
797 		TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
798 		ncp->nc_flag &= ~NCF_HOTNEGATIVE;
799 		numhotneg--;
800 		mtx_unlock(&neglist->nl_lock);
801 	}
802 	mtx_unlock(&ncneg_hot.nl_lock);
803 
804 	cache_negative_shrink_select(shrink_list_turn, &ncp, &neglist);
805 	shrink_list_turn++;
806 	if (shrink_list_turn == numneglists)
807 		shrink_list_turn = 0;
808 	if (ncp == NULL && shrink_list_turn == 0)
809 		cache_negative_shrink_select(shrink_list_turn, &ncp, &neglist);
810 	mtx_unlock(&ncneg_shrink_lock);
811 	if (ncp == NULL)
812 		return;
813 
814 	MPASS(ncp->nc_flag & NCF_NEGATIVE);
815 	dvlp = VP2VNODELOCK(ncp->nc_dvp);
816 	blp = NCP2BUCKETLOCK(ncp);
817 	mtx_unlock(&neglist->nl_lock);
818 	mtx_lock(dvlp);
819 	rw_wlock(blp);
820 	mtx_lock(&neglist->nl_lock);
821 	ncp2 = TAILQ_FIRST(&neglist->nl_list);
822 	if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) ||
823 	    blp != NCP2BUCKETLOCK(ncp2) || !(ncp2->nc_flag & NCF_NEGATIVE)) {
824 		ncp = NULL;
825 	} else {
826 		SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp,
827 		    ncp->nc_name);
828 
829 		cache_zap_locked(ncp, true);
830 		counter_u64_add(numneg_evicted, 1);
831 	}
832 	mtx_unlock(&neglist->nl_lock);
833 	rw_wunlock(blp);
834 	mtx_unlock(dvlp);
835 	cache_free(ncp);
836 }
837 
838 /*
839  * cache_zap_locked():
840  *
841  *   Removes a namecache entry from cache, whether it contains an actual
842  *   Removes a namecache entry from the cache, whether it contains an actual
843  *   pointer to a vnode or is just a negative cache entry.
844 static void
845 cache_zap_locked(struct namecache *ncp, bool neg_locked)
846 {
847 
848 	if (!(ncp->nc_flag & NCF_NEGATIVE))
849 		cache_assert_vnode_locked(ncp->nc_vp);
850 	cache_assert_vnode_locked(ncp->nc_dvp);
851 	cache_assert_bucket_locked(ncp, RA_WLOCKED);
852 
853 	CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp,
854 	    (ncp->nc_flag & NCF_NEGATIVE) ? NULL : ncp->nc_vp);
855 	LIST_REMOVE(ncp, nc_hash);
856 	if (!(ncp->nc_flag & NCF_NEGATIVE)) {
857 		SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
858 		    ncp->nc_name, ncp->nc_vp);
859 		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
860 		if (ncp == ncp->nc_vp->v_cache_dd)
861 			ncp->nc_vp->v_cache_dd = NULL;
862 	} else {
863 		SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
864 		    ncp->nc_name);
865 		cache_negative_remove(ncp, neg_locked);
866 	}
867 	if (ncp->nc_flag & NCF_ISDOTDOT) {
868 		if (ncp == ncp->nc_dvp->v_cache_dd)
869 			ncp->nc_dvp->v_cache_dd = NULL;
870 	} else {
871 		LIST_REMOVE(ncp, nc_src);
872 		if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
873 			ncp->nc_flag |= NCF_DVDROP;
874 			atomic_subtract_rel_long(&numcachehv, 1);
875 		}
876 	}
877 	atomic_subtract_rel_long(&numcache, 1);
878 }
879 
880 static void
881 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
882 {
883 	struct rwlock *blp;
884 
885 	MPASS(ncp->nc_dvp == vp);
886 	MPASS(ncp->nc_flag & NCF_NEGATIVE);
887 	cache_assert_vnode_locked(vp);
888 
889 	blp = NCP2BUCKETLOCK(ncp);
890 	rw_wlock(blp);
891 	cache_zap_locked(ncp, false);
892 	rw_wunlock(blp);
893 }
894 
895 static bool
896 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
897     struct mtx **vlpp)
898 {
899 	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
900 	struct rwlock *blp;
901 
902 	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
903 	cache_assert_vnode_locked(vp);
904 
905 	if (ncp->nc_flag & NCF_NEGATIVE) {
906 		if (*vlpp != NULL) {
907 			mtx_unlock(*vlpp);
908 			*vlpp = NULL;
909 		}
910 		cache_zap_negative_locked_vnode_kl(ncp, vp);
911 		return (true);
912 	}
913 
914 	pvlp = VP2VNODELOCK(vp);
915 	blp = NCP2BUCKETLOCK(ncp);
916 	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
917 	vlp2 = VP2VNODELOCK(ncp->nc_vp);
918 
919 	if (*vlpp == vlp1 || *vlpp == vlp2) {
920 		to_unlock = *vlpp;
921 		*vlpp = NULL;
922 	} else {
923 		if (*vlpp != NULL) {
924 			mtx_unlock(*vlpp);
925 			*vlpp = NULL;
926 		}
927 		cache_sort_vnodes(&vlp1, &vlp2);
928 		if (vlp1 == pvlp) {
929 			mtx_lock(vlp2);
930 			to_unlock = vlp2;
931 		} else {
932 			if (!mtx_trylock(vlp1))
933 				goto out_relock;
934 			to_unlock = vlp1;
935 		}
936 	}
937 	rw_wlock(blp);
938 	cache_zap_locked(ncp, false);
939 	rw_wunlock(blp);
940 	if (to_unlock != NULL)
941 		mtx_unlock(to_unlock);
942 	return (true);
943 
944 out_relock:
945 	mtx_unlock(vlp2);
946 	mtx_lock(vlp1);
947 	mtx_lock(vlp2);
948 	MPASS(*vlpp == NULL);
949 	*vlpp = vlp1;
950 	return (false);
951 }
952 
953 static int __noinline
954 cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp)
955 {
956 	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
957 	struct rwlock *blp;
958 	int error = 0;
959 
960 	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
961 	cache_assert_vnode_locked(vp);
962 
963 	pvlp = VP2VNODELOCK(vp);
964 	if (ncp->nc_flag & NCF_NEGATIVE) {
965 		cache_zap_negative_locked_vnode_kl(ncp, vp);
966 		goto out;
967 	}
968 
969 	blp = NCP2BUCKETLOCK(ncp);
970 	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
971 	vlp2 = VP2VNODELOCK(ncp->nc_vp);
972 	cache_sort_vnodes(&vlp1, &vlp2);
973 	if (vlp1 == pvlp) {
974 		mtx_lock(vlp2);
975 		to_unlock = vlp2;
976 	} else {
977 		if (!mtx_trylock(vlp1)) {
978 			error = EAGAIN;
979 			goto out;
980 		}
981 		to_unlock = vlp1;
982 	}
983 	rw_wlock(blp);
984 	cache_zap_locked(ncp, false);
985 	rw_wunlock(blp);
986 	mtx_unlock(to_unlock);
987 out:
988 	mtx_unlock(pvlp);
989 	return (error);
990 }
991 
992 /*
993  * We get here if trylocking failed.  We know enough to take all needed
994  * locks in the right order and re-lookup the entry.
995  */
996 static int
997 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
998     struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
999     struct rwlock *blp)
1000 {
1001 	struct namecache *rncp;
1002 
1003 	cache_assert_bucket_locked(ncp, RA_UNLOCKED);
1004 
1005 	cache_sort_vnodes(&dvlp, &vlp);
1006 	cache_lock_vnodes(dvlp, vlp);
1007 	rw_wlock(blp);
1008 	LIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
1009 		if (rncp == ncp && rncp->nc_dvp == dvp &&
1010 		    rncp->nc_nlen == cnp->cn_namelen &&
1011 		    !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
1012 			break;
1013 	}
1014 	if (rncp != NULL) {
1015 		cache_zap_locked(rncp, false);
1016 		rw_wunlock(blp);
1017 		cache_unlock_vnodes(dvlp, vlp);
1018 		counter_u64_add(zap_and_exit_bucket_relock_success, 1);
1019 		return (0);
1020 	}
1021 
1022 	rw_wunlock(blp);
1023 	cache_unlock_vnodes(dvlp, vlp);
1024 	return (EAGAIN);
1025 }
1026 
1027 static int __noinline
1028 cache_zap_wlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1029     uint32_t hash, struct rwlock *blp)
1030 {
1031 	struct mtx *dvlp, *vlp;
1032 	struct vnode *dvp;
1033 
1034 	cache_assert_bucket_locked(ncp, RA_WLOCKED);
1035 
1036 	dvlp = VP2VNODELOCK(ncp->nc_dvp);
1037 	vlp = NULL;
1038 	if (!(ncp->nc_flag & NCF_NEGATIVE))
1039 		vlp = VP2VNODELOCK(ncp->nc_vp);
1040 	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1041 		cache_zap_locked(ncp, false);
1042 		rw_wunlock(blp);
1043 		cache_unlock_vnodes(dvlp, vlp);
1044 		return (0);
1045 	}
1046 
1047 	dvp = ncp->nc_dvp;
1048 	rw_wunlock(blp);
1049 	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1050 }
1051 
1052 static int __noinline
1053 cache_zap_rlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1054     uint32_t hash, struct rwlock *blp)
1055 {
1056 	struct mtx *dvlp, *vlp;
1057 	struct vnode *dvp;
1058 
1059 	cache_assert_bucket_locked(ncp, RA_RLOCKED);
1060 
1061 	dvlp = VP2VNODELOCK(ncp->nc_dvp);
1062 	vlp = NULL;
1063 	if (!(ncp->nc_flag & NCF_NEGATIVE))
1064 		vlp = VP2VNODELOCK(ncp->nc_vp);
1065 	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1066 		rw_runlock(blp);
1067 		rw_wlock(blp);
1068 		cache_zap_locked(ncp, false);
1069 		rw_wunlock(blp);
1070 		cache_unlock_vnodes(dvlp, vlp);
1071 		return (0);
1072 	}
1073 
1074 	dvp = ncp->nc_dvp;
1075 	rw_runlock(blp);
1076 	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1077 }
1078 
1079 static int
1080 cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp,
1081     struct mtx **vlpp1, struct mtx **vlpp2)
1082 {
1083 	struct mtx *dvlp, *vlp;
1084 
1085 	cache_assert_bucket_locked(ncp, RA_WLOCKED);
1086 
1087 	dvlp = VP2VNODELOCK(ncp->nc_dvp);
1088 	vlp = NULL;
1089 	if (!(ncp->nc_flag & NCF_NEGATIVE))
1090 		vlp = VP2VNODELOCK(ncp->nc_vp);
1091 	cache_sort_vnodes(&dvlp, &vlp);
1092 
1093 	if (*vlpp1 == dvlp && *vlpp2 == vlp) {
1094 		cache_zap_locked(ncp, false);
1095 		cache_unlock_vnodes(dvlp, vlp);
1096 		*vlpp1 = NULL;
1097 		*vlpp2 = NULL;
1098 		return (0);
1099 	}
1100 
1101 	if (*vlpp1 != NULL)
1102 		mtx_unlock(*vlpp1);
1103 	if (*vlpp2 != NULL)
1104 		mtx_unlock(*vlpp2);
1105 	*vlpp1 = NULL;
1106 	*vlpp2 = NULL;
1107 
1108 	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1109 		cache_zap_locked(ncp, false);
1110 		cache_unlock_vnodes(dvlp, vlp);
1111 		return (0);
1112 	}
1113 
1114 	rw_wunlock(blp);
1115 	*vlpp1 = dvlp;
1116 	*vlpp2 = vlp;
1117 	if (*vlpp1 != NULL)
1118 		mtx_lock(*vlpp1);
1119 	mtx_lock(*vlpp2);
1120 	rw_wlock(blp);
1121 	return (EAGAIN);
1122 }
1123 
1124 static void
1125 cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp)
1126 {
1127 
1128 	if (blp != NULL) {
1129 		rw_runlock(blp);
1130 	} else {
1131 		mtx_unlock(vlp);
1132 	}
1133 }
1134 
1135 static int __noinline
1136 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1137     struct timespec *tsp, int *ticksp)
1138 {
1139 	int ltype;
1140 
1141 	*vpp = dvp;
1142 	CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .",
1143 			dvp, cnp->cn_nameptr);
1144 	counter_u64_add(dothits, 1);
1145 	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
1146 	if (tsp != NULL)
1147 		timespecclear(tsp);
1148 	if (ticksp != NULL)
1149 		*ticksp = ticks;
1150 	vrefact(*vpp);
1151 	/*
1152 	 * When we lookup "." we still can be asked to lock it
1153 	 * differently...
1154 	 */
1155 	ltype = cnp->cn_lkflags & LK_TYPE_MASK;
1156 	if (ltype != VOP_ISLOCKED(*vpp)) {
1157 		if (ltype == LK_EXCLUSIVE) {
1158 			vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
1159 			if ((*vpp)->v_iflag & VI_DOOMED) {
1160 				/* forced unmount */
1161 				vrele(*vpp);
1162 				*vpp = NULL;
1163 				return (ENOENT);
1164 			}
1165 		} else
1166 			vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
1167 	}
1168 	return (-1);
1169 }
1170 
1171 static __noinline int
1172 cache_lookup_nomakeentry(struct vnode *dvp, struct vnode **vpp,
1173     struct componentname *cnp, struct timespec *tsp, int *ticksp)
1174 {
1175 	struct namecache *ncp;
1176 	struct rwlock *blp;
1177 	struct mtx *dvlp, *dvlp2;
1178 	uint32_t hash;
1179 	int error;
1180 
1181 	if (cnp->cn_namelen == 2 &&
1182 	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
1183 		counter_u64_add(dotdothits, 1);
1184 		dvlp = VP2VNODELOCK(dvp);
1185 		dvlp2 = NULL;
1186 		mtx_lock(dvlp);
1187 retry_dotdot:
1188 		ncp = dvp->v_cache_dd;
1189 		if (ncp == NULL) {
1190 			SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
1191 			    "..", NULL);
1192 			mtx_unlock(dvlp);
1193 			if (dvlp2 != NULL)
1194 				mtx_unlock(dvlp2);
1195 			return (0);
1196 		}
1197 		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1198 			if (ncp->nc_dvp != dvp)
1199 				panic("dvp %p v_cache_dd %p\n", dvp, ncp);
1200 			if (!cache_zap_locked_vnode_kl2(ncp,
1201 			    dvp, &dvlp2))
1202 				goto retry_dotdot;
1203 			MPASS(dvp->v_cache_dd == NULL);
1204 			mtx_unlock(dvlp);
1205 			if (dvlp2 != NULL)
1206 				mtx_unlock(dvlp2);
1207 			cache_free(ncp);
1208 		} else {
1209 			dvp->v_cache_dd = NULL;
1210 			mtx_unlock(dvlp);
1211 			if (dvlp2 != NULL)
1212 				mtx_unlock(dvlp2);
1213 		}
1214 		return (0);
1215 	}
1216 
1217 	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1218 	blp = HASH2BUCKETLOCK(hash);
1219 retry:
1220 	if (LIST_EMPTY(NCHHASH(hash)))
1221 		goto out_no_entry;
1222 
1223 	rw_wlock(blp);
1224 
1225 	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1226 		counter_u64_add(numchecks, 1);
1227 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1228 		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1229 			break;
1230 	}
1231 
1232 	/* We failed to find an entry */
1233 	if (ncp == NULL) {
1234 		rw_wunlock(blp);
1235 		goto out_no_entry;
1236 	}
1237 
1238 	error = cache_zap_wlocked_bucket(ncp, cnp, hash, blp);
1239 	if (__predict_false(error != 0)) {
1240 		zap_and_exit_bucket_fail++;
1241 		cache_maybe_yield();
1242 		goto retry;
1243 	}
1244 	counter_u64_add(numposzaps, 1);
1245 	cache_free(ncp);
1246 	return (0);
1247 out_no_entry:
1248 	SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, NULL);
1249 	counter_u64_add(nummisszap, 1);
1250 	return (0);
1251 }
1252 
1253 /**
1254  * Lookup a name in the name cache
1255  *
1256  * # Arguments
1257  *
1258  * - dvp:	Parent directory in which to search.
1259  * - vpp:	Return argument.  Will contain desired vnode on cache hit.
1260  * - cnp:	Parameters of the name search.  The most interesting bits of
1261  *   		the cn_flags field have the following meanings:
1262  *   	- MAKEENTRY:	If clear, free an entry from the cache rather than look
1263  *   			it up.
1264  *   	- ISDOTDOT:	Must be set if and only if cn_nameptr == ".."
1265  * - tsp:	Return storage for cache timestamp.  On a successful (positive
1266  *   		or negative) lookup, tsp will be filled with any timespec that
1267  *   		was stored when this cache entry was created.  However, it will
1268  *   		be clear for "." entries.
1269  * - ticks:	Return storage for alternate cache timestamp.  On a successful
1270  *   		(positive or negative) lookup, it will contain the ticks value
1271  *   		that was current when the cache entry was created, unless cnp
1272  *   		was ".".
1273  *
1274  * # Returns
1275  *
1276  * - -1:	A positive cache hit.  vpp will contain the desired vnode.
1277  * - ENOENT:	A negative cache hit, or dvp was recycled out from under us due
1278  *		to a forced unmount.  vpp will not be modified.  If the entry
1279  *		is a whiteout, then the ISWHITEOUT flag will be set in
1280  *		cnp->cn_flags.
1281  * - 0:		A cache miss.  vpp will not be modified.
1282  *
1283  * # Locking
1284  *
1285  * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
1286  * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
1287  * lock is not recursively acquired.
1288  */
1289 int
1290 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1291     struct timespec *tsp, int *ticksp)
1292 {
1293 	struct namecache_ts *ncp_ts;
1294 	struct namecache *ncp;
1295 	struct rwlock *blp;
1296 	struct mtx *dvlp;
1297 	uint32_t hash;
1298 	enum vgetstate vs;
1299 	int error, ltype;
1300 
1301 	if (__predict_false(!doingcache)) {
1302 		cnp->cn_flags &= ~MAKEENTRY;
1303 		return (0);
1304 	}
1305 
1306 	counter_u64_add(numcalls, 1);
1307 
1308 	if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.'))
1309 		return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
1310 
1311 	if ((cnp->cn_flags & MAKEENTRY) == 0)
1312 		return (cache_lookup_nomakeentry(dvp, vpp, cnp, tsp, ticksp));
1313 
1314 retry:
1315 	blp = NULL;
1316 	dvlp = NULL;
1317 	error = 0;
1318 	if (cnp->cn_namelen == 2 &&
1319 	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
1320 		counter_u64_add(dotdothits, 1);
1321 		dvlp = VP2VNODELOCK(dvp);
1322 		mtx_lock(dvlp);
1323 		ncp = dvp->v_cache_dd;
1324 		if (ncp == NULL) {
1325 			SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
1326 			    "..", NULL);
1327 			mtx_unlock(dvlp);
1328 			return (0);
1329 		}
1330 		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1331 			if (ncp->nc_flag & NCF_NEGATIVE)
1332 				*vpp = NULL;
1333 			else
1334 				*vpp = ncp->nc_vp;
1335 		} else
1336 			*vpp = ncp->nc_dvp;
1337 		/* Return failure if negative entry was found. */
1338 		if (*vpp == NULL)
1339 			goto negative_success;
1340 		CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..",
1341 		    dvp, cnp->cn_nameptr, *vpp);
1342 		SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..",
1343 		    *vpp);
1344 		cache_out_ts(ncp, tsp, ticksp);
1345 		if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
1346 		    NCF_DTS && tsp != NULL) {
1347 			ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1348 			*tsp = ncp_ts->nc_dotdottime;
1349 		}
1350 		goto success;
1351 	}
1352 
1353 	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1354 	blp = HASH2BUCKETLOCK(hash);
1355 	rw_rlock(blp);
1356 
1357 	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1358 		counter_u64_add(numchecks, 1);
1359 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1360 		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1361 			break;
1362 	}
1363 
1364 	/* We failed to find an entry */
1365 	if (__predict_false(ncp == NULL)) {
1366 		rw_runlock(blp);
1367 		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
1368 		    NULL);
1369 		counter_u64_add(nummiss, 1);
1370 		return (0);
1371 	}
1372 
1373 	if (ncp->nc_flag & NCF_NEGATIVE)
1374 		goto negative_success;
1375 
1376 	/* We found a "positive" match, return the vnode */
1377 	counter_u64_add(numposhits, 1);
1378 	*vpp = ncp->nc_vp;
1379 	CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
1380 	    dvp, cnp->cn_nameptr, *vpp, ncp);
1381 	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name,
1382 	    *vpp);
1383 	cache_out_ts(ncp, tsp, ticksp);
1384 success:
1385 	/*
1386 	 * On success we return a locked and ref'd vnode as per the lookup
1387 	 * protocol.
1388 	 */
1389 	MPASS(dvp != *vpp);
1390 	ltype = 0;	/* silence gcc warning */
1391 	if (cnp->cn_flags & ISDOTDOT) {
1392 		ltype = VOP_ISLOCKED(dvp);
1393 		VOP_UNLOCK(dvp, 0);
1394 	}
1395 	vs = vget_prep(*vpp);
1396 	cache_lookup_unlock(blp, dvlp);
1397 	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1398 	if (cnp->cn_flags & ISDOTDOT) {
1399 		vn_lock(dvp, ltype | LK_RETRY);
1400 		if (dvp->v_iflag & VI_DOOMED) {
1401 			if (error == 0)
1402 				vput(*vpp);
1403 			*vpp = NULL;
1404 			return (ENOENT);
1405 		}
1406 	}
1407 	if (error) {
1408 		*vpp = NULL;
1409 		goto retry;
1410 	}
1411 	if ((cnp->cn_flags & ISLASTCN) &&
1412 	    (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) {
1413 		ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
1414 	}
1415 	return (-1);
1416 
1417 negative_success:
1418 	/* We found a negative match, and want to create it, so purge */
1419 	if (cnp->cn_nameiop == CREATE) {
1420 		counter_u64_add(numnegzaps, 1);
1421 		goto zap_and_exit;
1422 	}
1423 
1424 	counter_u64_add(numneghits, 1);
1425 	cache_negative_hit(ncp);
1426 	if (ncp->nc_flag & NCF_WHITE)
1427 		cnp->cn_flags |= ISWHITEOUT;
1428 	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
1429 	    ncp->nc_name);
1430 	cache_out_ts(ncp, tsp, ticksp);
1431 	cache_lookup_unlock(blp, dvlp);
1432 	return (ENOENT);
1433 
1434 zap_and_exit:
1435 	if (blp != NULL)
1436 		error = cache_zap_rlocked_bucket(ncp, cnp, hash, blp);
1437 	else
1438 		error = cache_zap_locked_vnode(ncp, dvp);
1439 	if (__predict_false(error != 0)) {
1440 		zap_and_exit_bucket_fail2++;
1441 		cache_maybe_yield();
1442 		goto retry;
1443 	}
1444 	cache_free(ncp);
1445 	return (0);
1446 }
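
/*
 * Illustrative sketch (not compiled): how a filesystem lookup routine might
 * consume the return protocol documented above.  fs_real_lookup() is a
 * hypothetical placeholder for the filesystem's own directory scan.
 */
#if 0
static int
example_vop_lookup(struct vnode *dvp, struct vnode **vpp,
    struct componentname *cnp)
{
	int error;

	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
	if (error == -1)
		return (0);		/* positive hit, *vpp locked and ref'd */
	if (error == ENOENT)
		return (ENOENT);	/* negative hit (or doomed dvp on "..") */
	/* error == 0: cache miss, fall back to scanning the directory. */
	return (fs_real_lookup(dvp, vpp, cnp));
}
#endif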
1447 
1448 struct celockstate {
1449 	struct mtx *vlp[3];
1450 	struct rwlock *blp[2];
1451 };
1452 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
1453 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
1454 
1455 static inline void
1456 cache_celockstate_init(struct celockstate *cel)
1457 {
1458 
1459 	bzero(cel, sizeof(*cel));
1460 }
1461 
1462 static void
1463 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
1464     struct vnode *dvp)
1465 {
1466 	struct mtx *vlp1, *vlp2;
1467 
1468 	MPASS(cel->vlp[0] == NULL);
1469 	MPASS(cel->vlp[1] == NULL);
1470 	MPASS(cel->vlp[2] == NULL);
1471 
1472 	MPASS(vp != NULL || dvp != NULL);
1473 
1474 	vlp1 = VP2VNODELOCK(vp);
1475 	vlp2 = VP2VNODELOCK(dvp);
1476 	cache_sort_vnodes(&vlp1, &vlp2);
1477 
1478 	if (vlp1 != NULL) {
1479 		mtx_lock(vlp1);
1480 		cel->vlp[0] = vlp1;
1481 	}
1482 	mtx_lock(vlp2);
1483 	cel->vlp[1] = vlp2;
1484 }
1485 
1486 static void
1487 cache_unlock_vnodes_cel(struct celockstate *cel)
1488 {
1489 
1490 	MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
1491 
1492 	if (cel->vlp[0] != NULL)
1493 		mtx_unlock(cel->vlp[0]);
1494 	if (cel->vlp[1] != NULL)
1495 		mtx_unlock(cel->vlp[1]);
1496 	if (cel->vlp[2] != NULL)
1497 		mtx_unlock(cel->vlp[2]);
1498 }
1499 
1500 static bool
1501 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
1502 {
1503 	struct mtx *vlp;
1504 	bool ret;
1505 
1506 	cache_assert_vlp_locked(cel->vlp[0]);
1507 	cache_assert_vlp_locked(cel->vlp[1]);
1508 	MPASS(cel->vlp[2] == NULL);
1509 
1510 	MPASS(vp != NULL);
1511 	vlp = VP2VNODELOCK(vp);
1512 
1513 	ret = true;
1514 	if (vlp >= cel->vlp[1]) {
1515 		mtx_lock(vlp);
1516 	} else {
1517 		if (mtx_trylock(vlp))
1518 			goto out;
1519 		cache_lock_vnodes_cel_3_failures++;
1520 		cache_unlock_vnodes_cel(cel);
1521 		if (vlp < cel->vlp[0]) {
1522 			mtx_lock(vlp);
1523 			mtx_lock(cel->vlp[0]);
1524 			mtx_lock(cel->vlp[1]);
1525 		} else {
1526 			if (cel->vlp[0] != NULL)
1527 				mtx_lock(cel->vlp[0]);
1528 			mtx_lock(vlp);
1529 			mtx_lock(cel->vlp[1]);
1530 		}
1531 		ret = false;
1532 	}
1533 out:
1534 	cel->vlp[2] = vlp;
1535 	return (ret);
1536 }
1537 
1538 static void
1539 cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1,
1540     struct rwlock *blp2)
1541 {
1542 
1543 	MPASS(cel->blp[0] == NULL);
1544 	MPASS(cel->blp[1] == NULL);
1545 
1546 	cache_sort_vnodes(&blp1, &blp2);
1547 
1548 	if (blp1 != NULL) {
1549 		rw_wlock(blp1);
1550 		cel->blp[0] = blp1;
1551 	}
1552 	rw_wlock(blp2);
1553 	cel->blp[1] = blp2;
1554 }
1555 
1556 static void
1557 cache_unlock_buckets_cel(struct celockstate *cel)
1558 {
1559 
1560 	if (cel->blp[0] != NULL)
1561 		rw_wunlock(cel->blp[0]);
1562 	rw_wunlock(cel->blp[1]);
1563 }
1564 
1565 /*
1566  * Lock part of the cache affected by the insertion.
1567  *
1568  * This means vnodelocks for dvp, vp and the relevant bucketlock.
1569  * However, insertion can result in removal of an old entry. In this
1570  * case we have an additional vnode and bucketlock pair to lock. If the
1571  * old entry is negative, there is no extra vnode to lock for it.
1572  *
1573  * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
1574  * preserving the locking order (smaller address first).
1575  */
1576 static void
1577 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
1578     uint32_t hash)
1579 {
1580 	struct namecache *ncp;
1581 	struct rwlock *blps[2];
1582 
1583 	blps[0] = HASH2BUCKETLOCK(hash);
1584 	for (;;) {
1585 		blps[1] = NULL;
1586 		cache_lock_vnodes_cel(cel, dvp, vp);
1587 		if (vp == NULL || vp->v_type != VDIR)
1588 			break;
1589 		ncp = vp->v_cache_dd;
1590 		if (ncp == NULL)
1591 			break;
1592 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
1593 			break;
1594 		MPASS(ncp->nc_dvp == vp);
1595 		blps[1] = NCP2BUCKETLOCK(ncp);
1596 		if (ncp->nc_flag & NCF_NEGATIVE)
1597 			break;
1598 		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
1599 			break;
1600 		/*
1601 		 * All vnodes got re-locked. Re-validate the state and if
1602 		 * nothing changed we are done. Otherwise restart.
1603 		 */
1604 		if (ncp == vp->v_cache_dd &&
1605 		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
1606 		    blps[1] == NCP2BUCKETLOCK(ncp) &&
1607 		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
1608 			break;
1609 		cache_unlock_vnodes_cel(cel);
1610 		cel->vlp[0] = NULL;
1611 		cel->vlp[1] = NULL;
1612 		cel->vlp[2] = NULL;
1613 	}
1614 	cache_lock_buckets_cel(cel, blps[0], blps[1]);
1615 }
1616 
1617 static void
1618 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
1619     uint32_t hash)
1620 {
1621 	struct namecache *ncp;
1622 	struct rwlock *blps[2];
1623 
1624 	blps[0] = HASH2BUCKETLOCK(hash);
1625 	for (;;) {
1626 		blps[1] = NULL;
1627 		cache_lock_vnodes_cel(cel, dvp, vp);
1628 		ncp = dvp->v_cache_dd;
1629 		if (ncp == NULL)
1630 			break;
1631 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
1632 			break;
1633 		MPASS(ncp->nc_dvp == dvp);
1634 		blps[1] = NCP2BUCKETLOCK(ncp);
1635 		if (ncp->nc_flag & NCF_NEGATIVE)
1636 			break;
1637 		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
1638 			break;
1639 		if (ncp == dvp->v_cache_dd &&
1640 		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
1641 		    blps[1] == NCP2BUCKETLOCK(ncp) &&
1642 		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
1643 			break;
1644 		cache_unlock_vnodes_cel(cel);
1645 		cel->vlp[0] = NULL;
1646 		cel->vlp[1] = NULL;
1647 		cel->vlp[2] = NULL;
1648 	}
1649 	cache_lock_buckets_cel(cel, blps[0], blps[1]);
1650 }
1651 
1652 static void
1653 cache_enter_unlock(struct celockstate *cel)
1654 {
1655 
1656 	cache_unlock_buckets_cel(cel);
1657 	cache_unlock_vnodes_cel(cel);
1658 }
1659 
1660 /*
1661  * Add an entry to the cache.
1662  */
1663 void
1664 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
1665     struct timespec *tsp, struct timespec *dtsp)
1666 {
1667 	struct celockstate cel;
1668 	struct namecache *ncp, *n2, *ndd;
1669 	struct namecache_ts *ncp_ts, *n2_ts;
1670 	struct nchashhead *ncpp;
1671 	struct neglist *neglist;
1672 	uint32_t hash;
1673 	int flag;
1674 	int len;
1675 	bool neg_locked, held_dvp;
1676 	u_long lnumcache;
1677 
1678 	CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr);
1679 	VNASSERT(vp == NULL || (vp->v_iflag & VI_DOOMED) == 0, vp,
1680 	    ("cache_enter: Adding a doomed vnode"));
1681 	VNASSERT(dvp == NULL || (dvp->v_iflag & VI_DOOMED) == 0, dvp,
1682 	    ("cache_enter: Doomed vnode used as src"));
1683 
1684 	if (__predict_false(!doingcache))
1685 		return;
1686 
1687 	/*
1688 	 * Avoid blowout in namecache entries.
1689 	 */
1690 	lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
1691 	if (__predict_false(lnumcache >= ncsize)) {
1692 		atomic_add_long(&numcache, -1);
1693 		return;
1694 	}
1695 
1696 	cache_celockstate_init(&cel);
1697 	ndd = NULL;
1698 	ncp_ts = NULL;
1699 	flag = 0;
1700 	if (cnp->cn_nameptr[0] == '.') {
1701 		if (cnp->cn_namelen == 1)
1702 			return;
1703 		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
1704 			len = cnp->cn_namelen;
1705 			hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
1706 			cache_enter_lock_dd(&cel, dvp, vp, hash);
1707 			/*
1708 			 * If dotdot entry already exists, just retarget it
1709 			 * to new parent vnode, otherwise continue with new
1710 			 * namecache entry allocation.
1711 			 */
1712 			if ((ncp = dvp->v_cache_dd) != NULL &&
1713 			    ncp->nc_flag & NCF_ISDOTDOT) {
1714 				KASSERT(ncp->nc_dvp == dvp,
1715 				    ("wrong isdotdot parent"));
1716 				neg_locked = false;
1717 				if (ncp->nc_flag & NCF_NEGATIVE || vp == NULL) {
1718 					neglist = NCP2NEGLIST(ncp);
1719 					mtx_lock(&ncneg_hot.nl_lock);
1720 					mtx_lock(&neglist->nl_lock);
1721 					neg_locked = true;
1722 				}
1723 				if (!(ncp->nc_flag & NCF_NEGATIVE)) {
1724 					TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst,
1725 					    ncp, nc_dst);
1726 				} else {
1727 					cache_negative_remove(ncp, true);
1728 				}
1729 				if (vp != NULL) {
1730 					TAILQ_INSERT_HEAD(&vp->v_cache_dst,
1731 					    ncp, nc_dst);
1732 					if (ncp->nc_flag & NCF_HOTNEGATIVE)
1733 						numhotneg--;
1734 					ncp->nc_flag &= ~(NCF_NEGATIVE|NCF_HOTNEGATIVE);
1735 				} else {
1736 					if (ncp->nc_flag & NCF_HOTNEGATIVE) {
1737 						numhotneg--;
1738 						ncp->nc_flag &= ~(NCF_HOTNEGATIVE);
1739 					}
1740 					ncp->nc_flag |= NCF_NEGATIVE;
1741 					cache_negative_insert(ncp, true);
1742 				}
1743 				if (neg_locked) {
1744 					mtx_unlock(&neglist->nl_lock);
1745 					mtx_unlock(&ncneg_hot.nl_lock);
1746 				}
1747 				ncp->nc_vp = vp;
1748 				cache_enter_unlock(&cel);
1749 				return;
1750 			}
1751 			dvp->v_cache_dd = NULL;
1752 			cache_enter_unlock(&cel);
1753 			cache_celockstate_init(&cel);
1754 			SDT_PROBE3(vfs, namecache, enter, done, dvp, "..", vp);
1755 			flag = NCF_ISDOTDOT;
1756 		}
1757 	}
1758 
1759 	held_dvp = false;
1760 	if (LIST_EMPTY(&dvp->v_cache_src) && flag != NCF_ISDOTDOT) {
1761 		vhold(dvp);
1762 		atomic_add_long(&numcachehv, 1);
1763 		held_dvp = true;
1764 	}
1765 
1766 	/*
1767 	 * Calculate the hash key and setup as much of the new
1768 	 * namecache entry as possible before acquiring the lock.
1769 	 */
1770 	ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
1771 	ncp->nc_flag = flag;
1772 	ncp->nc_vp = vp;
1773 	if (vp == NULL)
1774 		ncp->nc_flag |= NCF_NEGATIVE;
1775 	ncp->nc_dvp = dvp;
1776 	if (tsp != NULL) {
1777 		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1778 		ncp_ts->nc_time = *tsp;
1779 		ncp_ts->nc_ticks = ticks;
1780 		ncp_ts->nc_nc.nc_flag |= NCF_TS;
1781 		if (dtsp != NULL) {
1782 			ncp_ts->nc_dotdottime = *dtsp;
1783 			ncp_ts->nc_nc.nc_flag |= NCF_DTS;
1784 		}
1785 	}
1786 	len = ncp->nc_nlen = cnp->cn_namelen;
1787 	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
1788 	strlcpy(ncp->nc_name, cnp->cn_nameptr, len + 1);
1789 	cache_enter_lock(&cel, dvp, vp, hash);
1790 
1791 	/*
1792 	 * See if this vnode or negative entry is already in the cache
1793 	 * with this name.  This can happen with concurrent lookups of
1794 	 * the same path name.
1795 	 */
1796 	ncpp = NCHHASH(hash);
1797 	LIST_FOREACH(n2, ncpp, nc_hash) {
1798 		if (n2->nc_dvp == dvp &&
1799 		    n2->nc_nlen == cnp->cn_namelen &&
1800 		    !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
1801 			if (tsp != NULL) {
1802 				KASSERT((n2->nc_flag & NCF_TS) != 0,
1803 				    ("no NCF_TS"));
1804 				n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
1805 				n2_ts->nc_time = ncp_ts->nc_time;
1806 				n2_ts->nc_ticks = ncp_ts->nc_ticks;
1807 				if (dtsp != NULL) {
1808 					n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
1809 					if (ncp->nc_flag & NCF_NEGATIVE)
1810 						mtx_lock(&ncneg_hot.nl_lock);
1811 					n2_ts->nc_nc.nc_flag |= NCF_DTS;
1812 					if (ncp->nc_flag & NCF_NEGATIVE)
1813 						mtx_unlock(&ncneg_hot.nl_lock);
1814 				}
1815 			}
1816 			goto out_unlock_free;
1817 		}
1818 	}
1819 
1820 	if (flag == NCF_ISDOTDOT) {
1821 		/*
1822 		 * See if we are trying to add .. entry, but some other lookup
1823 		 * has populated v_cache_dd pointer already.
1824 		 */
1825 		if (dvp->v_cache_dd != NULL)
1826 			goto out_unlock_free;
1827 		KASSERT(vp == NULL || vp->v_type == VDIR,
1828 		    ("wrong vnode type %p", vp));
1829 		dvp->v_cache_dd = ncp;
1830 	}
1831 
1832 	if (vp != NULL) {
1833 		if (vp->v_type == VDIR) {
1834 			if (flag != NCF_ISDOTDOT) {
1835 				/*
1836 				 * For this case, the cache entry maps both the
1837 				 * name of the directory and the name ".." for the
1838 				 * directory's parent.
1839 				 */
1840 				if ((ndd = vp->v_cache_dd) != NULL) {
1841 					if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
1842 						cache_zap_locked(ndd, false);
1843 					else
1844 						ndd = NULL;
1845 				}
1846 				vp->v_cache_dd = ncp;
1847 			}
1848 		} else {
1849 			vp->v_cache_dd = NULL;
1850 		}
1851 	}
1852 
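	/*
	 * Link non-".." entries into the parent directory's source list.
	 * The directory needs a hold only for its first entry: take one
	 * here if the list was empty, or drop the speculative hold from
	 * above if another entry already accounts for it.
	 */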
1853 	if (flag != NCF_ISDOTDOT) {
1854 		if (LIST_EMPTY(&dvp->v_cache_src)) {
1855 			if (!held_dvp) {
1856 				vhold(dvp);
1857 				atomic_add_long(&numcachehv, 1);
1858 			}
1859 		} else {
1860 			if (held_dvp) {
1861 				/*
1862 				 * This will not take the interlock: someone
1863 				 * else already holds the vnode on account of
1864 				 * the namecache, and the locks we hold prevent
1865 				 * that from changing.
1866 				 */
1867 				vdrop(dvp);
1868 				atomic_subtract_long(&numcachehv, 1);
1869 			}
1870 		}
1871 		LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
1872 	}
1873 
1874 	/*
1875 	 * Insert the new namecache entry into the appropriate chain
1876 	 * within the cache entries table.
1877 	 */
1878 	LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
1879 
1880 	/*
1881 	 * If the entry is "negative", we place it into the
1882 	 * "negative" cache queue; otherwise, we place it into the
1883 	 * destination vnode's cache entry queue.
1884 	 */
1885 	if (vp != NULL) {
1886 		TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
1887 		SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
1888 		    vp);
1889 	} else {
1890 		if (cnp->cn_flags & ISWHITEOUT)
1891 			ncp->nc_flag |= NCF_WHITE;
1892 		cache_negative_insert(ncp, false);
1893 		SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
1894 		    ncp->nc_name);
1895 	}
1896 	cache_enter_unlock(&cel);
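	/*
	 * Keep the negative cache bounded: if negative entries exceed
	 * 1 / ncnegfactor of the total entry count, evict one of them.
	 */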
1897 	if (numneg * ncnegfactor > lnumcache)
1898 		cache_negative_zap_one();
1899 	cache_free(ndd);
1900 	return;
1901 out_unlock_free:
1902 	cache_enter_unlock(&cel);
1903 	cache_free(ncp);
1904 	if (held_dvp) {
1905 		vdrop(dvp);
1906 		atomic_subtract_long(&numcachehv, 1);
1907 	}
1908 	return;
1909 }
1910 
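/*
 * Return the smallest power of two strictly greater than val.
 */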
1911 static u_int
1912 cache_roundup_2(u_int val)
1913 {
1914 	u_int res;
1915 
1916 	for (res = 1; res <= val; res <<= 1)
1917 		continue;
1918 
1919 	return (res);
1920 }
1921 
1922 /*
1923  * Name cache initialization, from vfs_init() when we are booting
1924  */
1925 static void
1926 nchinit(void *dummy __unused)
1927 {
1928 	u_int i;
1929 
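	/*
	 * Four UMA zones back the namecache entries: short and long names,
	 * each in a plain and a timestamp-carrying (namecache_ts) variant.
	 */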
1930 	cache_zone_small = uma_zcreate("S VFS Cache",
1931 	    sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1,
1932 	    NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache),
1933 	    UMA_ZONE_ZINIT);
1934 	cache_zone_small_ts = uma_zcreate("STS VFS Cache",
1935 	    sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1,
1936 	    NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts),
1937 	    UMA_ZONE_ZINIT);
1938 	cache_zone_large = uma_zcreate("L VFS Cache",
1939 	    sizeof(struct namecache) + NAME_MAX + 1,
1940 	    NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache),
1941 	    UMA_ZONE_ZINIT);
1942 	cache_zone_large_ts = uma_zcreate("LTS VFS Cache",
1943 	    sizeof(struct namecache_ts) + NAME_MAX + 1,
1944 	    NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts),
1945 	    UMA_ZONE_ZINIT);
1946 
1947 	ncsize = desiredvnodes * ncsizefactor;
1948 	nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
1949 	ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
1950 	if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
1951 		ncbuckethash = 7;
1952 	if (ncbuckethash > nchash)
1953 		ncbuckethash = nchash;
1954 	bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
1955 	    M_WAITOK | M_ZERO);
1956 	for (i = 0; i < numbucketlocks; i++)
1957 		rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE);
1958 	ncvnodehash = ncbuckethash;
1959 	vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
1960 	    M_WAITOK | M_ZERO);
1961 	for (i = 0; i < numvnodelocks; i++)
1962 		mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
1963 	ncpurgeminvnodes = numbucketlocks * 2;
1964 
1965 	ncneghash = 3;
1966 	neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE,
1967 	    M_WAITOK | M_ZERO);
1968 	for (i = 0; i < numneglists; i++) {
1969 		mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
1970 		TAILQ_INIT(&neglists[i].nl_list);
1971 	}
1972 	mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF);
1973 	TAILQ_INIT(&ncneg_hot.nl_list);
1974 
1975 	mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF);
1976 
1977 	numcalls = counter_u64_alloc(M_WAITOK);
1978 	dothits = counter_u64_alloc(M_WAITOK);
1979 	dotdothits = counter_u64_alloc(M_WAITOK);
1980 	numchecks = counter_u64_alloc(M_WAITOK);
1981 	nummiss = counter_u64_alloc(M_WAITOK);
1982 	nummisszap = counter_u64_alloc(M_WAITOK);
1983 	numposzaps = counter_u64_alloc(M_WAITOK);
1984 	numposhits = counter_u64_alloc(M_WAITOK);
1985 	numnegzaps = counter_u64_alloc(M_WAITOK);
1986 	numneghits = counter_u64_alloc(M_WAITOK);
1987 	numfullpathcalls = counter_u64_alloc(M_WAITOK);
1988 	numfullpathfail1 = counter_u64_alloc(M_WAITOK);
1989 	numfullpathfail2 = counter_u64_alloc(M_WAITOK);
1990 	numfullpathfail4 = counter_u64_alloc(M_WAITOK);
1991 	numfullpathfound = counter_u64_alloc(M_WAITOK);
1992 	zap_and_exit_bucket_relock_success = counter_u64_alloc(M_WAITOK);
1993 	numneg_evicted = counter_u64_alloc(M_WAITOK);
1994 	shrinking_skipped = counter_u64_alloc(M_WAITOK);
1995 }
1996 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
1997 
1998 void
1999 cache_changesize(int newmaxvnodes)
2000 {
2001 	struct nchashhead *new_nchashtbl, *old_nchashtbl;
2002 	u_long new_nchash, old_nchash;
2003 	struct namecache *ncp;
2004 	uint32_t hash;
2005 	int newncsize;
2006 	int i;
2007 
2008 	newncsize = newmaxvnodes * ncsizefactor;
2009 	newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
2010 	if (newmaxvnodes < numbucketlocks)
2011 		newmaxvnodes = numbucketlocks;
2012 
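	/*
	 * Allocate the replacement table up front so that no sleeping
	 * allocation is needed while all the cache locks are held below.
	 */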
2013 	new_nchashtbl = hashinit(newmaxvnodes, M_VFSCACHE, &new_nchash);
2014 	/* If same hash table size, nothing to do */
2015 	if (nchash == new_nchash) {
2016 		free(new_nchashtbl, M_VFSCACHE);
2017 		return;
2018 	}
2019 	/*
2020 	 * Move everything from the old hash table to the new table.
2021 	 * No namecache entry can disappear while we do this, because
2022 	 * removing one requires the bucket locks, all of which we hold.
2023 	 */
2024 	cache_lock_all_vnodes();
2025 	cache_lock_all_buckets();
2026 	old_nchashtbl = nchashtbl;
2027 	old_nchash = nchash;
2028 	nchashtbl = new_nchashtbl;
2029 	nchash = new_nchash;
2030 	for (i = 0; i <= old_nchash; i++) {
2031 		while ((ncp = LIST_FIRST(&old_nchashtbl[i])) != NULL) {
2032 			hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
2033 			    ncp->nc_dvp);
2034 			LIST_REMOVE(ncp, nc_hash);
2035 			LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
2036 		}
2037 	}
2038 	ncsize = newncsize;
2039 	cache_unlock_all_buckets();
2040 	cache_unlock_all_vnodes();
2041 	free(old_nchashtbl, M_VFSCACHE);
2042 }
2043 
2044 /*
2045  * Invalidate all entries from and to a particular vnode.
2046  */
2047 void
2048 cache_purge(struct vnode *vp)
2049 {
2050 	TAILQ_HEAD(, namecache) ncps;
2051 	struct namecache *ncp, *nnp;
2052 	struct mtx *vlp, *vlp2;
2053 
2054 	CTR1(KTR_VFS, "cache_purge(%p)", vp);
2055 	SDT_PROBE1(vfs, namecache, purge, done, vp);
2056 	if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2057 	    vp->v_cache_dd == NULL)
2058 		return;
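	/*
	 * Unlink matching entries while holding the vnode lock(s) and
	 * collect them on a local list; they are freed only after the
	 * locks have been dropped.
	 */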
2059 	TAILQ_INIT(&ncps);
2060 	vlp = VP2VNODELOCK(vp);
2061 	vlp2 = NULL;
2062 	mtx_lock(vlp);
2063 retry:
2064 	while (!LIST_EMPTY(&vp->v_cache_src)) {
2065 		ncp = LIST_FIRST(&vp->v_cache_src);
2066 		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2067 			goto retry;
2068 		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2069 	}
2070 	while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
2071 		ncp = TAILQ_FIRST(&vp->v_cache_dst);
2072 		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2073 			goto retry;
2074 		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2075 	}
2076 	ncp = vp->v_cache_dd;
2077 	if (ncp != NULL) {
2078 		KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
2079 		   ("lost dotdot link"));
2080 		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2081 			goto retry;
2082 		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2083 	}
2084 	KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
2085 	mtx_unlock(vlp);
2086 	if (vlp2 != NULL)
2087 		mtx_unlock(vlp2);
2088 	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2089 		cache_free(ncp);
2090 	}
2091 }
2092 
2093 /*
2094  * Invalidate all negative entries for a particular directory vnode.
2095  */
2096 void
2097 cache_purge_negative(struct vnode *vp)
2098 {
2099 	TAILQ_HEAD(, namecache) ncps;
2100 	struct namecache *ncp, *nnp;
2101 	struct mtx *vlp;
2102 
2103 	CTR1(KTR_VFS, "cache_purge_negative(%p)", vp);
2104 	SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
2105 	if (LIST_EMPTY(&vp->v_cache_src))
2106 		return;
2107 	TAILQ_INIT(&ncps);
2108 	vlp = VP2VNODELOCK(vp);
2109 	mtx_lock(vlp);
2110 	LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
2111 		if (!(ncp->nc_flag & NCF_NEGATIVE))
2112 			continue;
2113 		cache_zap_negative_locked_vnode_kl(ncp, vp);
2114 		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2115 	}
2116 	mtx_unlock(vlp);
2117 	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2118 		cache_free(ncp);
2119 	}
2120 }
2121 
2122 /*
2123  * Flush all entries referencing a particular filesystem.
2124  */
2125 void
2126 cache_purgevfs(struct mount *mp, bool force)
2127 {
2128 	TAILQ_HEAD(, namecache) ncps;
2129 	struct mtx *vlp1, *vlp2;
2130 	struct rwlock *blp;
2131 	struct nchashhead *bucket;
2132 	struct namecache *ncp, *nnp;
2133 	u_long i, j, n_nchash;
2134 	int error;
2135 
2136 	/* Scan hash tables for applicable entries */
2137 	SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
2138 	if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes)
2139 		return;
2140 	TAILQ_INIT(&ncps);
2141 	n_nchash = nchash + 1;
2142 	vlp1 = vlp2 = NULL;
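	/*
	 * Visit every bucket lock and, while write-holding it, every hash
	 * chain striped onto that lock (chains i, i + numbucketlocks, ...),
	 * zapping entries whose directory vnode belongs to the mount.
	 */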
2143 	for (i = 0; i < numbucketlocks; i++) {
2144 		blp = (struct rwlock *)&bucketlocks[i];
2145 		rw_wlock(blp);
2146 		for (j = i; j < n_nchash; j += numbucketlocks) {
2147 retry:
2148 			bucket = &nchashtbl[j];
2149 			LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) {
2150 				cache_assert_bucket_locked(ncp, RA_WLOCKED);
2151 				if (ncp->nc_dvp->v_mount != mp)
2152 					continue;
2153 				error = cache_zap_wlocked_bucket_kl(ncp, blp,
2154 				    &vlp1, &vlp2);
2155 				if (error != 0)
2156 					goto retry;
2157 				TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst);
2158 			}
2159 		}
2160 		rw_wunlock(blp);
2161 		if (vlp1 == NULL && vlp2 == NULL)
2162 			cache_maybe_yield();
2163 	}
2164 	if (vlp1 != NULL)
2165 		mtx_unlock(vlp1);
2166 	if (vlp2 != NULL)
2167 		mtx_unlock(vlp2);
2168 
2169 	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2170 		cache_free(ncp);
2171 	}
2172 }
2173 
2174 /*
2175  * Perform canonical checks and a cache lookup, and pass on to the
2176  * filesystem through VOP_CACHEDLOOKUP only if needed.
2177  */
2178 
2179 int
2180 vfs_cache_lookup(struct vop_lookup_args *ap)
2181 {
2182 	struct vnode *dvp;
2183 	int error;
2184 	struct vnode **vpp = ap->a_vpp;
2185 	struct componentname *cnp = ap->a_cnp;
2186 	struct ucred *cred = cnp->cn_cred;
2187 	int flags = cnp->cn_flags;
2188 	struct thread *td = cnp->cn_thread;
2189 
2190 	*vpp = NULL;
2191 	dvp = ap->a_dvp;
2192 
2193 	if (dvp->v_type != VDIR)
2194 		return (ENOTDIR);
2195 
2196 	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
2197 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
2198 		return (EROFS);
2199 
2200 	error = VOP_ACCESS(dvp, VEXEC, cred, td);
2201 	if (error)
2202 		return (error);
2203 
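	/*
	 * cache_lookup() returns 0 on a miss (fall through to the
	 * filesystem lookup), -1 on a positive hit with *vpp set, or an
	 * error number for a cached negative entry.
	 */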
2204 	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
2205 	if (error == 0)
2206 		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
2207 	if (error == -1)
2208 		return (0);
2209 	return (error);
2210 }
2211 
2212 /*
2213  * XXX All of these sysctls would probably be more productive dead.
2214  */
2215 static int __read_mostly disablecwd;
2216 SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
2217    "Disable the getcwd syscall");
2218 
2219 /* Implementation of the getcwd syscall. */
2220 int
2221 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
2222 {
2223 
2224 	return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen,
2225 	    MAXPATHLEN));
2226 }
2227 
2228 int
2229 kern___getcwd(struct thread *td, char *buf, enum uio_seg bufseg, size_t buflen,
2230     size_t path_max)
2231 {
2232 	char *bp, *tmpbuf;
2233 	struct filedesc *fdp;
2234 	struct vnode *cdir, *rdir;
2235 	int error;
2236 
2237 	if (__predict_false(disablecwd))
2238 		return (ENODEV);
2239 	if (__predict_false(buflen < 2))
2240 		return (EINVAL);
2241 	if (buflen > path_max)
2242 		buflen = path_max;
2243 
2244 	tmpbuf = malloc(buflen, M_TEMP, M_WAITOK);
2245 	fdp = td->td_proc->p_fd;
2246 	FILEDESC_SLOCK(fdp);
2247 	cdir = fdp->fd_cdir;
2248 	vrefact(cdir);
2249 	rdir = fdp->fd_rdir;
2250 	vrefact(rdir);
2251 	FILEDESC_SUNLOCK(fdp);
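	/*
	 * vn_fullpath1() composes the path backwards from the end of
	 * tmpbuf; on success bp points at the NUL-terminated result
	 * inside that buffer.
	 */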
2252 	error = vn_fullpath1(td, cdir, rdir, tmpbuf, &bp, buflen);
2253 	vrele(rdir);
2254 	vrele(cdir);
2255 
2256 	if (!error) {
2257 		if (bufseg == UIO_SYSSPACE)
2258 			bcopy(bp, buf, strlen(bp) + 1);
2259 		else
2260 			error = copyout(bp, buf, strlen(bp) + 1);
2261 #ifdef KTRACE
2262 		if (KTRPOINT(curthread, KTR_NAMEI))
2263 			ktrnamei(bp);
2264 #endif
2265 	}
2266 	free(tmpbuf, M_TEMP);
2267 	return (error);
2268 }
2269 
2270 /*
2271  * Thus begins the fullpath magic.
2272  */
2273 
2274 static int __read_mostly disablefullpath;
2275 SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0,
2276     "Disable the vn_fullpath function");
2277 
2278 /*
2279  * Retrieve the full filesystem path that corresponds to a vnode from the
2280  * name cache (if available).
2281  */
2282 int
2283 vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
2284 {
2285 	char *buf;
2286 	struct filedesc *fdp;
2287 	struct vnode *rdir;
2288 	int error;
2289 
2290 	if (__predict_false(disablefullpath))
2291 		return (ENODEV);
2292 	if (__predict_false(vn == NULL))
2293 		return (EINVAL);
2294 
2295 	buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
2296 	fdp = td->td_proc->p_fd;
2297 	FILEDESC_SLOCK(fdp);
2298 	rdir = fdp->fd_rdir;
2299 	vrefact(rdir);
2300 	FILEDESC_SUNLOCK(fdp);
2301 	error = vn_fullpath1(td, vn, rdir, buf, retbuf, MAXPATHLEN);
2302 	vrele(rdir);
2303 
2304 	if (!error)
2305 		*freebuf = buf;
2306 	else
2307 		free(buf, M_TEMP);
2308 	return (error);
2309 }
2310 
2311 /*
2312  * This function is similar to vn_fullpath, but it attempts to look up the
2313  * pathname relative to the global root mount point.  This is required for the
2314  * auditing sub-system, as audited pathnames must be absolute, relative to the
2315  * global root mount point.
2316  */
2317 int
2318 vn_fullpath_global(struct thread *td, struct vnode *vn,
2319     char **retbuf, char **freebuf)
2320 {
2321 	char *buf;
2322 	int error;
2323 
2324 	if (__predict_false(disablefullpath))
2325 		return (ENODEV);
2326 	if (__predict_false(vn == NULL))
2327 		return (EINVAL);
2328 	buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
2329 	error = vn_fullpath1(td, vn, rootvnode, buf, retbuf, MAXPATHLEN);
2330 	if (!error)
2331 		*freebuf = buf;
2332 	else
2333 		free(buf, M_TEMP);
2334 	return (error);
2335 }
2336 
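/*
 * Resolve one pathname component: prepend the name of *vp to buf at
 * *buflen and replace *vp with a referenced parent directory vnode,
 * consulting the namecache first and falling back to VOP_VPTOCNP().
 * The reference on the original *vp is dropped in all cases.
 */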
2337 int
2338 vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, u_int *buflen)
2339 {
2340 	struct vnode *dvp;
2341 	struct namecache *ncp;
2342 	struct mtx *vlp;
2343 	int error;
2344 
2345 	vlp = VP2VNODELOCK(*vp);
2346 	mtx_lock(vlp);
2347 	TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) {
2348 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2349 			break;
2350 	}
2351 	if (ncp != NULL) {
2352 		if (*buflen < ncp->nc_nlen) {
2353 			mtx_unlock(vlp);
2354 			vrele(*vp);
2355 			counter_u64_add(numfullpathfail4, 1);
2356 			error = ENOMEM;
2357 			SDT_PROBE3(vfs, namecache, fullpath, return, error,
2358 			    vp, NULL);
2359 			return (error);
2360 		}
2361 		*buflen -= ncp->nc_nlen;
2362 		memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
2363 		SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
2364 		    ncp->nc_name, vp);
2365 		dvp = *vp;
2366 		*vp = ncp->nc_dvp;
2367 		vref(*vp);
2368 		mtx_unlock(vlp);
2369 		vrele(dvp);
2370 		return (0);
2371 	}
2372 	SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
2373 
2374 	mtx_unlock(vlp);
2375 	vn_lock(*vp, LK_SHARED | LK_RETRY);
2376 	error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen);
2377 	vput(*vp);
2378 	if (error) {
2379 		counter_u64_add(numfullpathfail2, 1);
2380 		SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
2381 		return (error);
2382 	}
2383 
2384 	*vp = dvp;
2385 	if (dvp->v_iflag & VI_DOOMED) {
2386 		/* forced unmount */
2387 		vrele(dvp);
2388 		error = ENOENT;
2389 		SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
2390 		return (error);
2391 	}
2392 	/*
2393 	 * *vp has its use count incremented still.
2394 	 */
2395 
2396 	return (0);
2397 }
2398 
2399 /*
2400  * The magic behind kern___getcwd() and vn_fullpath().
2401  */
2402 static int
2403 vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
2404     char *buf, char **retbuf, u_int buflen)
2405 {
2406 	int error, slash_prefixed;
2407 #ifdef KDTRACE_HOOKS
2408 	struct vnode *startvp = vp;
2409 #endif
2410 	struct vnode *vp1;
2411 
2412 	buflen--;
2413 	buf[buflen] = '\0';
2414 	error = 0;
2415 	slash_prefixed = 0;
2416 
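	/*
	 * The path is assembled backwards, one component at a time, from
	 * the end of buf towards its beginning; buflen tracks the start
	 * of the partial result.
	 */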
2417 	SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
2418 	counter_u64_add(numfullpathcalls, 1);
2419 	vref(vp);
2420 	if (vp->v_type != VDIR) {
2421 		error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen);
2422 		if (error)
2423 			return (error);
2424 		if (buflen == 0) {
2425 			vrele(vp);
2426 			return (ENOMEM);
2427 		}
2428 		buf[--buflen] = '/';
2429 		slash_prefixed = 1;
2430 	}
2431 	while (vp != rdir && vp != rootvnode) {
2432 		/*
2433 		 * The vp vnode must already be fully constructed,
2434 		 * since it is either found in the namecache or obtained
2435 		 * from VOP_VPTOCNP().  We may test for VV_ROOT safely
2436 		 * without obtaining the vnode lock.
2437 		 */
2438 		if ((vp->v_vflag & VV_ROOT) != 0) {
2439 			vn_lock(vp, LK_RETRY | LK_SHARED);
2440 
2441 			/*
2442 			 * With the vnode locked, check for races with
2443 			 * unmount, forced or not.  Note that we
2444 			 * already verified that vp is not equal to
2445 			 * the root vnode, which means that
2446 			 * mnt_vnodecovered can be NULL only for the
2447 			 * case of unmount.
2448 			 */
2449 			if ((vp->v_iflag & VI_DOOMED) != 0 ||
2450 			    (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
2451 			    vp1->v_mountedhere != vp->v_mount) {
2452 				vput(vp);
2453 				error = ENOENT;
2454 				SDT_PROBE3(vfs, namecache, fullpath, return,
2455 				    error, vp, NULL);
2456 				break;
2457 			}
2458 
2459 			vref(vp1);
2460 			vput(vp);
2461 			vp = vp1;
2462 			continue;
2463 		}
2464 		if (vp->v_type != VDIR) {
2465 			vrele(vp);
2466 			counter_u64_add(numfullpathfail1, 1);
2467 			error = ENOTDIR;
2468 			SDT_PROBE3(vfs, namecache, fullpath, return,
2469 			    error, vp, NULL);
2470 			break;
2471 		}
2472 		error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen);
2473 		if (error)
2474 			break;
2475 		if (buflen == 0) {
2476 			vrele(vp);
2477 			error = ENOMEM;
2478 			SDT_PROBE3(vfs, namecache, fullpath, return, error,
2479 			    startvp, NULL);
2480 			break;
2481 		}
2482 		buf[--buflen] = '/';
2483 		slash_prefixed = 1;
2484 	}
2485 	if (error)
2486 		return (error);
2487 	if (!slash_prefixed) {
2488 		if (buflen == 0) {
2489 			vrele(vp);
2490 			counter_u64_add(numfullpathfail4, 1);
2491 			SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
2492 			    startvp, NULL);
2493 			return (ENOMEM);
2494 		}
2495 		buf[--buflen] = '/';
2496 	}
2497 	counter_u64_add(numfullpathfound, 1);
2498 	vrele(vp);
2499 
2500 	SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, buf + buflen);
2501 	*retbuf = buf + buflen;
2502 	return (0);
2503 }
2504 
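/*
 * Find the parent directory of vp via a regular (non-"..") namecache
 * entry.  On success the parent is returned referenced and shared-locked;
 * NULL is returned if no entry is found or the lock cannot be obtained
 * without sleeping.
 */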
2505 struct vnode *
2506 vn_dir_dd_ino(struct vnode *vp)
2507 {
2508 	struct namecache *ncp;
2509 	struct vnode *ddvp;
2510 	struct mtx *vlp;
2511 	enum vgetstate vs;
2512 
2513 	ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
2514 	vlp = VP2VNODELOCK(vp);
2515 	mtx_lock(vlp);
2516 	TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
2517 		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
2518 			continue;
2519 		ddvp = ncp->nc_dvp;
2520 		vs = vget_prep(ddvp);
2521 		mtx_unlock(vlp);
2522 		if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
2523 			return (NULL);
2524 		return (ddvp);
2525 	}
2526 	mtx_unlock(vlp);
2527 	return (NULL);
2528 }
2529 
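/*
 * Copy a name under which vp is known in the namecache into buf,
 * truncating it to buflen - 1 characters and NUL-terminating the result.
 */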
2530 int
2531 vn_commname(struct vnode *vp, char *buf, u_int buflen)
2532 {
2533 	struct namecache *ncp;
2534 	struct mtx *vlp;
2535 	int l;
2536 
2537 	vlp = VP2VNODELOCK(vp);
2538 	mtx_lock(vlp);
2539 	TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
2540 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2541 			break;
2542 	if (ncp == NULL) {
2543 		mtx_unlock(vlp);
2544 		return (ENOENT);
2545 	}
2546 	l = min(ncp->nc_nlen, buflen - 1);
2547 	memcpy(buf, ncp->nc_name, l);
2548 	mtx_unlock(vlp);
2549 	buf[l] = '\0';
2550 	return (0);
2551 }
2552 
2553 /*
2554  * This function updates the path string to the vnode's full global path
2555  * and checks the size of the new path string against the pathlen argument.
2556  *
2557  * Requires a locked, referenced vnode.
2558  * Vnode is re-locked on success or ENODEV, otherwise unlocked.
2559  *
2560  * If the sysctl debug.disablefullpath is set, ENODEV is returned, the
2561  * vnode is left locked and the path remains untouched.
2562  *
2563  * If vp is a directory, the call to vn_fullpath_global() always succeeds
2564  * because it falls back to the ".." lookup if the namecache lookup fails.
2565  */
2566 int
2567 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
2568     u_int pathlen)
2569 {
2570 	struct nameidata nd;
2571 	struct vnode *vp1;
2572 	char *rpath, *fbuf;
2573 	int error;
2574 
2575 	ASSERT_VOP_ELOCKED(vp, __func__);
2576 
2577 	/* Return ENODEV if sysctl debug.disablefullpath==1 */
2578 	if (__predict_false(disablefullpath))
2579 		return (ENODEV);
2580 
2581 	/* Construct global filesystem path from vp. */
2582 	VOP_UNLOCK(vp, 0);
2583 	error = vn_fullpath_global(td, vp, &rpath, &fbuf);
2584 
2585 	if (error != 0) {
2586 		vrele(vp);
2587 		return (error);
2588 	}
2589 
2590 	if (strlen(rpath) >= pathlen) {
2591 		vrele(vp);
2592 		error = ENAMETOOLONG;
2593 		goto out;
2594 	}
2595 
2596 	/*
2597 	 * Re-lookup the vnode by path to detect a possible rename.
2598 	 * As a side effect, the vnode is relocked.
2599 	 * If the vnode was renamed, return ENOENT.
2600 	 */
2601 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
2602 	    UIO_SYSSPACE, path, td);
2603 	error = namei(&nd);
2604 	if (error != 0) {
2605 		vrele(vp);
2606 		goto out;
2607 	}
2608 	NDFREE(&nd, NDF_ONLY_PNBUF);
2609 	vp1 = nd.ni_vp;
2610 	vrele(vp);
2611 	if (vp1 == vp)
2612 		strcpy(path, rpath);
2613 	else {
2614 		vput(vp1);
2615 		error = ENOENT;
2616 	}
2617 
2618 out:
2619 	free(fbuf, M_TEMP);
2620 	return (error);
2621 }
2622 
2623 #ifdef DDB
2624 static void
2625 db_print_vpath(struct vnode *vp)
2626 {
2627 
2628 	while (vp != NULL) {
2629 		db_printf("%p: ", vp);
2630 		if (vp == rootvnode) {
2631 			db_printf("/");
2632 			vp = NULL;
2633 		} else {
2634 			if (vp->v_vflag & VV_ROOT) {
2635 				db_printf("<mount point>");
2636 				vp = vp->v_mount->mnt_vnodecovered;
2637 			} else {
2638 				struct namecache *ncp;
2639 				char *ncn;
2640 				int i;
2641 
2642 				ncp = TAILQ_FIRST(&vp->v_cache_dst);
2643 				if (ncp != NULL) {
2644 					ncn = ncp->nc_name;
2645 					for (i = 0; i < ncp->nc_nlen; i++)
2646 						db_printf("%c", *ncn++);
2647 					vp = ncp->nc_dvp;
2648 				} else {
2649 					vp = NULL;
2650 				}
2651 			}
2652 		}
2653 		db_printf("\n");
2654 	}
2655 
2656 	return;
2657 }
2658 
2659 DB_SHOW_COMMAND(vpath, db_show_vpath)
2660 {
2661 	struct vnode *vp;
2662 
2663 	if (!have_addr) {
2664 		db_printf("usage: show vpath <struct vnode *>\n");
2665 		return;
2666 	}
2667 
2668 	vp = (struct vnode *)addr;
2669 	db_print_vpath(vp);
2670 }
2671 
2672 #endif
2673