xref: /freebsd/sys/kern/vfs_cache.c (revision a134ebd6e63f658f2d3d04ac0c60d23bcaa86dd7)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1989, 1993, 1995
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * Poul-Henning Kamp of the FreeBSD Project.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include "opt_ddb.h"
41 #include "opt_ktrace.h"
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/capsicum.h>
46 #include <sys/counter.h>
47 #include <sys/filedesc.h>
48 #include <sys/fnv_hash.h>
49 #include <sys/kernel.h>
50 #include <sys/ktr.h>
51 #include <sys/lock.h>
52 #include <sys/malloc.h>
53 #include <sys/fcntl.h>
54 #include <sys/jail.h>
55 #include <sys/mount.h>
56 #include <sys/namei.h>
57 #include <sys/proc.h>
58 #include <sys/rwlock.h>
59 #include <sys/seqc.h>
60 #include <sys/sdt.h>
61 #include <sys/smr.h>
62 #include <sys/smp.h>
63 #include <sys/syscallsubr.h>
64 #include <sys/sysctl.h>
65 #include <sys/sysproto.h>
66 #include <sys/vnode.h>
67 #include <ck_queue.h>
68 #ifdef KTRACE
69 #include <sys/ktrace.h>
70 #endif
71 
72 #include <sys/capsicum.h>
73 
74 #include <security/audit/audit.h>
75 #include <security/mac/mac_framework.h>
76 
77 #ifdef DDB
78 #include <ddb/ddb.h>
79 #endif
80 
81 #include <vm/uma.h>
82 
83 SDT_PROVIDER_DECLARE(vfs);
84 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
85     "struct vnode *");
86 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
87     "char *");
88 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
89 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
90     "char *", "struct vnode *");
91 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
92 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
93     "struct vnode *", "char *");
94 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
95     "struct vnode *");
96 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
97     "struct vnode *", "char *");
98 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
99     "char *");
100 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
101 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
102 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
103 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
104     "struct vnode *");
105 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
106     "char *");
107 SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
108     "char *");
109 
110 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
111 SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
112 SDT_PROBE_DECLARE(vfs, namei, lookup, return);
113 
114 /*
115  * This structure describes the elements in the cache of recent
116  * names looked up by namei.
117  */
118 struct negstate {
119 	u_char neg_flag;
120 };
121 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
122     "the state must fit in a union with a pointer without growing it");
123 
124 struct	namecache {
125 	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
126 	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
127 	CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
128 	struct	vnode *nc_dvp;		/* vnode of parent of name */
129 	union {
130 		struct	vnode *nu_vp;	/* vnode the name refers to */
131 		struct	negstate nu_neg;/* negative entry state */
132 	} n_un;
133 	u_char	nc_flag;		/* flag bits */
134 	u_char	nc_nlen;		/* length of name */
135 	char	nc_name[0];		/* segment name + nul */
136 };
137 
138 /*
139  * struct namecache_ts repeats struct namecache layout up to the
140  * nc_nlen member.
141  * struct namecache_ts is used in place of struct namecache when time(s) need
142  * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
143  * both a non-dotdot directory name and dotdot for the directory's
144  * parent.
145  *
146  * See below for alignment requirement.
147  */
148 struct	namecache_ts {
149 	struct	timespec nc_time;	/* timespec provided by fs */
150 	struct	timespec nc_dotdottime;	/* dotdot timespec provided by fs */
151 	int	nc_ticks;		/* ticks value when entry was added */
152 	struct namecache nc_nc;
153 };
154 
155 /*
156  * At least mips n32 performs 64-bit accesses to timespec as found
157  * in namecache_ts and requires them to be aligned. Since others
158  * may be in the same spot, suffer a little bit and enforce the
159  * alignment for everyone. Note this is a nop for 64-bit platforms.
160  */
161 #define CACHE_ZONE_ALIGNMENT	UMA_ALIGNOF(time_t)
162 
163 #define	nc_vp		n_un.nu_vp
164 #define	nc_neg		n_un.nu_neg
165 
166 /*
167  * Flags in namecache.nc_flag
168  */
169 #define NCF_WHITE	0x01
170 #define NCF_ISDOTDOT	0x02
171 #define	NCF_TS		0x04
172 #define	NCF_DTS		0x08
173 #define	NCF_DVDROP	0x10
174 #define	NCF_NEGATIVE	0x20
175 #define	NCF_INVALID	0x40
176 #define	NCF_WIP		0x80
177 
178 /*
179  * Flags in negstate.neg_flag
180  */
181 #define NEG_HOT		0x01
182 
183 /*
184  * Mark an entry as invalid.
185  *
186  * This is called before it starts getting deconstructed.
187  */
188 static void
189 cache_ncp_invalidate(struct namecache *ncp)
190 {
191 
192 	KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
193 	    ("%s: entry %p already invalid", __func__, ncp));
194 	atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
195 	atomic_thread_fence_rel();
196 }
197 
198 /*
199  * Check whether the entry can be safely used.
200  *
201  * All places which elide locks are supposed to call this after they are
202  * done with reading from an entry.
203  */
204 static bool
205 cache_ncp_canuse(struct namecache *ncp)
206 {
207 
208 	atomic_thread_fence_acq();
209 	return ((atomic_load_char(&ncp->nc_flag) & (NCF_INVALID | NCF_WIP)) == 0);
210 }
211 
212 /*
213  * Name caching works as follows:
214  *
215  * Names found by directory scans are retained in a cache
216  * for future reference.  It is managed LRU, so frequently
217  * used names will hang around.  The cache is indexed by a hash value
218  * obtained from (dvp, name) where dvp refers to the directory
219  * containing name.
220  *
221  * If it is a "negative" entry (i.e. for a name that is known NOT to
222  * exist), the vnode pointer will be NULL.
223  *
224  * Upon reaching the last segment of a path, if the reference
225  * is for DELETE, or NOCACHE is set (rewrite), and the
226  * name is located in the cache, it will be dropped.
227  *
228  * These locks are used (in the order in which they can be taken):
229  * NAME		TYPE	ROLE
230  * vnodelock	mtx	vnode lists and v_cache_dd field protection
231  * bucketlock	rwlock	for access to given set of hash buckets
232  * neglist	mtx	negative entry LRU management
233  *
234  * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
235  * shrinking the LRU list.
236  *
237  * It is legal to take multiple vnodelock and bucketlock locks. The locking
238  * order is lower address first. Both are recursive.
239  *
240  * "." lookups are lockless.
241  *
242  * ".." and vnode -> name lookups require vnodelock.
243  *
244  * name -> vnode lookup requires the relevant bucketlock to be held for reading.
245  *
246  * Insertions and removals of entries require involved vnodes and bucketlocks
247  * to be write-locked to prevent other threads from seeing the entry.
248  *
249  * Some lookups result in removal of the found entry (e.g. getting rid of a
250  * negative entry with the intent to create a positive one), which poses a
251  * problem when multiple threads reach the same state. Similarly, two different
252  * threads can purge two different vnodes and try to remove the same name.
253  *
254  * If the already held vnode lock is lower than the second required lock, we
255  * can just take the other lock. However, in the opposite case, this could
256  * deadlock.  As such, this is resolved by trylocking; if that fails, the first
257  * lock is dropped, everything is locked in order and the state is revalidated.
258  */
259 
260 VFS_SMR_DECLARE;
261 
262 /*
263  * Structures associated with name caching.
264  */
265 #define NCHHASH(hash) \
266 	(&nchashtbl[(hash) & nchash])
267 static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
268 static u_long __read_mostly	nchash;			/* size of hash table */
269 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
270     "Size of namecache hash table");
271 static u_long __read_mostly	ncnegfactor = 5; /* ratio of negative entries */
272 SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
273     "Ratio of negative namecache entries");
274 static u_long __exclusive_cache_line	numneg;	/* number of negative entries allocated */
275 static u_long __exclusive_cache_line	numcache;/* number of cache entries allocated */
276 u_int ncsizefactor = 2;
277 SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
278     "Size factor for namecache");
279 static u_int __read_mostly	ncpurgeminvnodes;
280 SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0,
281     "Number of vnodes below which purgevfs ignores the request");
282 static u_int __read_mostly	ncsize; /* the size as computed on creation or resizing */
283 
284 struct nchstats	nchstats;		/* cache effectiveness statistics */
285 
286 static struct mtx __exclusive_cache_line	ncneg_shrink_lock;
287 
288 struct neglist {
289 	struct mtx		nl_lock;
290 	TAILQ_HEAD(, namecache) nl_list;
291 } __aligned(CACHE_LINE_SIZE);
292 
293 static struct neglist __read_mostly	*neglists;
294 static struct neglist ncneg_hot;
295 static u_long numhotneg;
296 
297 #define ncneghash	3
298 #define	numneglists	(ncneghash + 1)
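/*
 * Map a negative entry to one of the cold lists.  The entry address is used
 * as a cheap hash to spread entries across numneglists lists.
 */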
299 static inline struct neglist *
300 NCP2NEGLIST(struct namecache *ncp)
301 {
302 
303 	return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
304 }
305 
306 static inline struct negstate *
307 NCP2NEGSTATE(struct namecache *ncp)
308 {
309 
310 	MPASS(ncp->nc_flag & NCF_NEGATIVE);
311 	return (&ncp->nc_neg);
312 }
313 
314 #define	numbucketlocks (ncbuckethash + 1)
315 static u_int __read_mostly  ncbuckethash;
316 static struct rwlock_padalign __read_mostly  *bucketlocks;
317 #define	HASH2BUCKETLOCK(hash) \
318 	((struct rwlock *)(&bucketlocks[((hash) & ncbuckethash)]))
319 
320 #define	numvnodelocks (ncvnodehash + 1)
321 static u_int __read_mostly  ncvnodehash;
322 static struct mtx __read_mostly *vnodelocks;
323 static inline struct mtx *
324 VP2VNODELOCK(struct vnode *vp)
325 {
326 
327 	return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
328 }
329 
330 /*
331  * UMA zones for the VFS cache.
332  *
333  * The small cache is used for entries with short names, which are the
334  * most common.  The large cache is used for entries which are too big to
335  * fit in the small cache.
336  */
337 static uma_zone_t __read_mostly cache_zone_small;
338 static uma_zone_t __read_mostly cache_zone_small_ts;
339 static uma_zone_t __read_mostly cache_zone_large;
340 static uma_zone_t __read_mostly cache_zone_large_ts;
341 
342 #define	CACHE_PATH_CUTOFF	35
343 
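/*
 * Allocate an entry from the UMA zone matching the name length, optionally
 * with room for timestamps (see struct namecache_ts).
 */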
344 static struct namecache *
345 cache_alloc(int len, int ts)
346 {
347 	struct namecache_ts *ncp_ts;
348 	struct namecache *ncp;
349 
350 	if (__predict_false(ts)) {
351 		if (len <= CACHE_PATH_CUTOFF)
352 			ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
353 		else
354 			ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
355 		ncp = &ncp_ts->nc_nc;
356 	} else {
357 		if (len <= CACHE_PATH_CUTOFF)
358 			ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
359 		else
360 			ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
361 	}
362 	return (ncp);
363 }
364 
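/*
 * Release an entry back to the zone it came from, dropping the hold on the
 * directory vnode first if NCF_DVDROP was set.
 */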
365 static void
366 cache_free(struct namecache *ncp)
367 {
368 	struct namecache_ts *ncp_ts;
369 
370 	if (ncp == NULL)
371 		return;
372 	if ((ncp->nc_flag & NCF_DVDROP) != 0)
373 		vdrop(ncp->nc_dvp);
374 	if (__predict_false(ncp->nc_flag & NCF_TS)) {
375 		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
376 		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
377 			uma_zfree_smr(cache_zone_small_ts, ncp_ts);
378 		else
379 			uma_zfree_smr(cache_zone_large_ts, ncp_ts);
380 	} else {
381 		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
382 			uma_zfree_smr(cache_zone_small, ncp);
383 		else
384 			uma_zfree_smr(cache_zone_large, ncp);
385 	}
386 }
387 
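/*
 * Copy the stored timestamp and ticks value out to the caller, provided it
 * asked for them; only entries with NCF_TS carry them.
 */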
388 static void
389 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
390 {
391 	struct namecache_ts *ncp_ts;
392 
393 	KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
394 	    (tsp == NULL && ticksp == NULL),
395 	    ("No NCF_TS"));
396 
397 	if (tsp == NULL && ticksp == NULL)
398 		return;
399 
400 	ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
401 	if (tsp != NULL)
402 		*tsp = ncp_ts->nc_time;
403 	if (ticksp != NULL)
404 		*ticksp = ncp_ts->nc_ticks;
405 }
406 
407 #ifdef DEBUG_CACHE
408 static int __read_mostly	doingcache = 1;	/* 1 => enable the cache */
409 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
410     "VFS namecache enabled");
411 #endif
412 
413 /* Export size information to userland */
414 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
415     sizeof(struct namecache), "sizeof(struct namecache)");
416 
417 /*
418  * The new name cache statistics
419  */
420 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
421     "Name cache statistics");
422 #define STATNODE_ULONG(name, descr)					\
423 	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr);
424 #define STATNODE_COUNTER(name, descr)					\
425 	static COUNTER_U64_DEFINE_EARLY(name);				\
426 	SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, \
427 	    descr);
428 STATNODE_ULONG(numneg, "Number of negative cache entries");
429 STATNODE_ULONG(numcache, "Number of cache entries");
430 STATNODE_COUNTER(numcachehv, "Number of namecache entries with vnodes held");
431 STATNODE_COUNTER(numdrops, "Number of dropped entries due to reaching the limit");
432 STATNODE_COUNTER(dothits, "Number of '.' hits");
433 STATNODE_COUNTER(dotdothits, "Number of '..' hits");
434 STATNODE_COUNTER(nummiss, "Number of cache misses");
435 STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache");
436 STATNODE_COUNTER(numposzaps,
437     "Number of cache hits (positive) we do not want to cache");
438 STATNODE_COUNTER(numposhits, "Number of cache hits (positive)");
439 STATNODE_COUNTER(numnegzaps,
440     "Number of cache hits (negative) we do not want to cache");
441 STATNODE_COUNTER(numneghits, "Number of cache hits (negative)");
442 /* These count for vn_getcwd(), too. */
443 STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls");
444 STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
445 STATNODE_COUNTER(numfullpathfail2,
446     "Number of fullpath search errors (VOP_VPTOCNP failures)");
447 STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
448 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls");
449 STATNODE_COUNTER(zap_and_exit_bucket_relock_success,
450     "Number of successful removals after relocking");
451 static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail,
452     "Number of times zap_and_exit failed to lock");
453 static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2,
454     "Number of times zap_and_exit failed to lock");
455 static long cache_lock_vnodes_cel_3_failures;
456 STATNODE_ULONG(cache_lock_vnodes_cel_3_failures,
457     "Number of times 3-way vnode locking failed");
458 STATNODE_ULONG(numhotneg, "Number of hot negative entries");
459 STATNODE_COUNTER(numneg_evicted,
460     "Number of negative entries evicted when adding a new entry");
461 STATNODE_COUNTER(shrinking_skipped,
462     "Number of times shrinking was already in progress");
463 
464 static void cache_zap_locked(struct namecache *ncp);
465 static int vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf,
466     char **freebuf, size_t *buflen);
467 static int vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir,
468     char *buf, char **retbuf, size_t *buflen);
469 static int vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir,
470     char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend);
471 
472 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
473 
474 static int cache_yield;
475 SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0,
476     "Number of times cache called yield");
477 
478 static void __noinline
479 cache_maybe_yield(void)
480 {
481 
482 	if (should_yield()) {
483 		cache_yield++;
484 		kern_yield(PRI_USER);
485 	}
486 }
487 
488 static inline void
489 cache_assert_vlp_locked(struct mtx *vlp)
490 {
491 
492 	if (vlp != NULL)
493 		mtx_assert(vlp, MA_OWNED);
494 }
495 
496 static inline void
497 cache_assert_vnode_locked(struct vnode *vp)
498 {
499 	struct mtx *vlp;
500 
501 	vlp = VP2VNODELOCK(vp);
502 	cache_assert_vlp_locked(vlp);
503 }
504 
505 /*
506  * TODO: With the value stored we can do better than computing the hash based
507  * on the address, and the choice of FNV should also be revisited.
508  */
509 static void
510 cache_prehash(struct vnode *vp)
511 {
512 
513 	vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
514 }
515 
516 static uint32_t
517 cache_get_hash(char *name, u_char len, struct vnode *dvp)
518 {
519 
520 	return (fnv_32_buf(name, len, dvp->v_nchash));
521 }
522 
523 static inline struct nchashhead *
524 NCP2BUCKET(struct namecache *ncp)
525 {
526 	uint32_t hash;
527 
528 	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
529 	return (NCHHASH(hash));
530 }
531 
532 static inline struct rwlock *
533 NCP2BUCKETLOCK(struct namecache *ncp)
534 {
535 	uint32_t hash;
536 
537 	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
538 	return (HASH2BUCKETLOCK(hash));
539 }
540 
541 #ifdef INVARIANTS
542 static void
543 cache_assert_bucket_locked(struct namecache *ncp, int mode)
544 {
545 	struct rwlock *blp;
546 
547 	blp = NCP2BUCKETLOCK(ncp);
548 	rw_assert(blp, mode);
549 }
550 #else
551 #define cache_assert_bucket_locked(x, y) do { } while (0)
552 #endif
553 
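/*
 * Order two lock pointers by address, establishing the lock order used
 * throughout this file (lower address first).
 */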
554 #define cache_sort_vnodes(x, y)	_cache_sort_vnodes((void **)(x), (void **)(y))
555 static void
556 _cache_sort_vnodes(void **p1, void **p2)
557 {
558 	void *tmp;
559 
560 	MPASS(*p1 != NULL || *p2 != NULL);
561 
562 	if (*p1 > *p2) {
563 		tmp = *p2;
564 		*p2 = *p1;
565 		*p1 = tmp;
566 	}
567 }
568 
569 static void
570 cache_lock_all_buckets(void)
571 {
572 	u_int i;
573 
574 	for (i = 0; i < numbucketlocks; i++)
575 		rw_wlock(&bucketlocks[i]);
576 }
577 
578 static void
579 cache_unlock_all_buckets(void)
580 {
581 	u_int i;
582 
583 	for (i = 0; i < numbucketlocks; i++)
584 		rw_wunlock(&bucketlocks[i]);
585 }
586 
587 static void
588 cache_lock_all_vnodes(void)
589 {
590 	u_int i;
591 
592 	for (i = 0; i < numvnodelocks; i++)
593 		mtx_lock(&vnodelocks[i]);
594 }
595 
596 static void
597 cache_unlock_all_vnodes(void)
598 {
599 	u_int i;
600 
601 	for (i = 0; i < numvnodelocks; i++)
602 		mtx_unlock(&vnodelocks[i]);
603 }
604 
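/*
 * Try to acquire both vnode locks (in address order) without sleeping.
 * Returns EAGAIN with neither lock held on failure.
 */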
605 static int
606 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
607 {
608 
609 	cache_sort_vnodes(&vlp1, &vlp2);
610 
611 	if (vlp1 != NULL) {
612 		if (!mtx_trylock(vlp1))
613 			return (EAGAIN);
614 	}
615 	if (!mtx_trylock(vlp2)) {
616 		if (vlp1 != NULL)
617 			mtx_unlock(vlp1);
618 		return (EAGAIN);
619 	}
620 
621 	return (0);
622 }
623 
624 static void
625 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
626 {
627 
628 	MPASS(vlp1 != NULL || vlp2 != NULL);
629 	MPASS(vlp1 <= vlp2);
630 
631 	if (vlp1 != NULL)
632 		mtx_lock(vlp1);
633 	if (vlp2 != NULL)
634 		mtx_lock(vlp2);
635 }
636 
637 static void
638 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
639 {
640 
641 	MPASS(vlp1 != NULL || vlp2 != NULL);
642 
643 	if (vlp1 != NULL)
644 		mtx_unlock(vlp1);
645 	if (vlp2 != NULL)
646 		mtx_unlock(vlp2);
647 }
648 
649 static int
650 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
651 {
652 	struct nchstats snap;
653 
654 	if (req->oldptr == NULL)
655 		return (SYSCTL_OUT(req, 0, sizeof(snap)));
656 
657 	snap = nchstats;
658 	snap.ncs_goodhits = counter_u64_fetch(numposhits);
659 	snap.ncs_neghits = counter_u64_fetch(numneghits);
660 	snap.ncs_badhits = counter_u64_fetch(numposzaps) +
661 	    counter_u64_fetch(numnegzaps);
662 	snap.ncs_miss = counter_u64_fetch(nummisszap) +
663 	    counter_u64_fetch(nummiss);
664 
665 	return (SYSCTL_OUT(req, &snap, sizeof(snap)));
666 }
667 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
668     CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
669     "VFS cache effectiveness statistics");
670 
671 #ifdef DIAGNOSTIC
672 /*
673  * Grab an atomic snapshot of the name cache hash chain lengths
674  */
675 static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
676     CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
677     "hash table stats");
678 
679 static int
680 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
681 {
682 	struct nchashhead *ncpp;
683 	struct namecache *ncp;
684 	int i, error, n_nchash, *cntbuf;
685 
686 retry:
687 	n_nchash = nchash + 1;	/* nchash is max index, not count */
688 	if (req->oldptr == NULL)
689 		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
690 	cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
691 	cache_lock_all_buckets();
692 	if (n_nchash != nchash + 1) {
693 		cache_unlock_all_buckets();
694 		free(cntbuf, M_TEMP);
695 		goto retry;
696 	}
697 	/* Scan hash tables counting entries */
698 	for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
699 		CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
700 			cntbuf[i]++;
701 	cache_unlock_all_buckets();
702 	for (error = 0, i = 0; i < n_nchash; i++)
703 		if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
704 			break;
705 	free(cntbuf, M_TEMP);
706 	return (error);
707 }
708 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
709     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
710     "nchash chain lengths");
711 
712 static int
713 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
714 {
715 	int error;
716 	struct nchashhead *ncpp;
717 	struct namecache *ncp;
718 	int n_nchash;
719 	int count, maxlength, used, pct;
720 
721 	if (!req->oldptr)
722 		return SYSCTL_OUT(req, 0, 4 * sizeof(int));
723 
724 	cache_lock_all_buckets();
725 	n_nchash = nchash + 1;	/* nchash is max index, not count */
726 	used = 0;
727 	maxlength = 0;
728 
729 	/* Scan hash tables for applicable entries */
730 	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
731 		count = 0;
732 		CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
733 			count++;
734 		}
735 		if (count)
736 			used++;
737 		if (maxlength < count)
738 			maxlength = count;
739 	}
740 	n_nchash = nchash + 1;
741 	cache_unlock_all_buckets();
742 	pct = (used * 100) / (n_nchash / 100);
743 	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
744 	if (error)
745 		return (error);
746 	error = SYSCTL_OUT(req, &used, sizeof(used));
747 	if (error)
748 		return (error);
749 	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
750 	if (error)
751 		return (error);
752 	error = SYSCTL_OUT(req, &pct, sizeof(pct));
753 	if (error)
754 		return (error);
755 	return (0);
756 }
757 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
758     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
759     "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
760 #endif
761 
762 /*
763  * Negative entries management
764  *
765  * A variation of the LRU scheme is used. New entries are hashed into one of
766  * numneglists cold lists. Entries get promoted to the hot list on first hit.
767  *
768  * The shrinker will demote the hot list head and evict from a cold list in a
769  * round-robin manner.
770  */
771 static void
772 cache_negative_init(struct namecache *ncp)
773 {
774 	struct negstate *negstate;
775 
776 	ncp->nc_flag |= NCF_NEGATIVE;
777 	negstate = NCP2NEGSTATE(ncp);
778 	negstate->neg_flag = 0;
779 }
780 
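/*
 * Promote a negative entry from its cold list to the hot list on a hit.
 * The NEG_HOT flag is re-checked with the locks held since another thread
 * may have promoted the entry in the meantime.
 */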
781 static void
782 cache_negative_hit(struct namecache *ncp)
783 {
784 	struct neglist *neglist;
785 	struct negstate *negstate;
786 
787 	negstate = NCP2NEGSTATE(ncp);
788 	if ((negstate->neg_flag & NEG_HOT) != 0)
789 		return;
790 	neglist = NCP2NEGLIST(ncp);
791 	mtx_lock(&ncneg_hot.nl_lock);
792 	mtx_lock(&neglist->nl_lock);
793 	if ((negstate->neg_flag & NEG_HOT) == 0) {
794 		numhotneg++;
795 		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
796 		TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
797 		negstate->neg_flag |= NEG_HOT;
798 	}
799 	mtx_unlock(&neglist->nl_lock);
800 	mtx_unlock(&ncneg_hot.nl_lock);
801 }
802 
803 static void
804 cache_negative_insert(struct namecache *ncp)
805 {
806 	struct neglist *neglist;
807 
808 	MPASS(ncp->nc_flag & NCF_NEGATIVE);
809 	cache_assert_bucket_locked(ncp, RA_WLOCKED);
810 	neglist = NCP2NEGLIST(ncp);
811 	mtx_lock(&neglist->nl_lock);
812 	TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
813 	mtx_unlock(&neglist->nl_lock);
814 	atomic_add_rel_long(&numneg, 1);
815 }
816 
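/*
 * Remove a negative entry from whichever LRU list it currently resides on.
 * The hot/cold state has to be re-evaluated after locking since the entry
 * can be promoted concurrently.
 */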
817 static void
818 cache_negative_remove(struct namecache *ncp)
819 {
820 	struct neglist *neglist;
821 	struct negstate *negstate;
822 	bool hot_locked = false;
823 	bool list_locked = false;
824 
825 	cache_assert_bucket_locked(ncp, RA_WLOCKED);
826 	neglist = NCP2NEGLIST(ncp);
827 	negstate = NCP2NEGSTATE(ncp);
828 	if ((negstate->neg_flag & NEG_HOT) != 0) {
829 		hot_locked = true;
830 		mtx_lock(&ncneg_hot.nl_lock);
831 		if ((negstate->neg_flag & NEG_HOT) == 0) {
832 			list_locked = true;
833 			mtx_lock(&neglist->nl_lock);
834 		}
835 	} else {
836 		list_locked = true;
837 		mtx_lock(&neglist->nl_lock);
838 		/*
839 		 * We may be racing against promotion in lockless lookup.
840 		 */
841 		if ((negstate->neg_flag & NEG_HOT) != 0) {
842 			mtx_unlock(&neglist->nl_lock);
843 			hot_locked = true;
844 			mtx_lock(&ncneg_hot.nl_lock);
845 			mtx_lock(&neglist->nl_lock);
846 		}
847 	}
848 	if ((negstate->neg_flag & NEG_HOT) != 0) {
849 		mtx_assert(&ncneg_hot.nl_lock, MA_OWNED);
850 		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
851 		numhotneg--;
852 	} else {
853 		mtx_assert(&neglist->nl_lock, MA_OWNED);
854 		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
855 	}
856 	if (list_locked)
857 		mtx_unlock(&neglist->nl_lock);
858 	if (hot_locked)
859 		mtx_unlock(&ncneg_hot.nl_lock);
860 	atomic_subtract_rel_long(&numneg, 1);
861 }
862 
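/*
 * Pick the next cold list (round-robin) with any entries and return its
 * first entry in *ncpp with the list lock held, or NULL if all lists
 * turned out to be empty.
 */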
863 static void
864 cache_negative_shrink_select(struct namecache **ncpp,
865     struct neglist **neglistpp)
866 {
867 	struct neglist *neglist;
868 	struct namecache *ncp;
869 	static u_int cycle;
870 	u_int i;
871 
872 	*ncpp = ncp = NULL;
873 
874 	for (i = 0; i < numneglists; i++) {
875 		neglist = &neglists[(cycle + i) % numneglists];
876 		if (TAILQ_FIRST(&neglist->nl_list) == NULL)
877 			continue;
878 		mtx_lock(&neglist->nl_lock);
879 		ncp = TAILQ_FIRST(&neglist->nl_list);
880 		if (ncp != NULL)
881 			break;
882 		mtx_unlock(&neglist->nl_lock);
883 	}
884 
885 	*neglistpp = neglist;
886 	*ncpp = ncp;
887 	cycle++;
888 }
889 
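/*
 * Evict a single negative entry: demote the head of the hot list (if any)
 * back to its cold list, then pick a victim from the cold lists and zap it
 * once the relevant vnode and bucket locks are held and the entry has been
 * re-validated.  Only one thread shrinks at a time; contenders bail out and
 * account the attempt in shrinking_skipped.
 */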
890 static void
891 cache_negative_zap_one(void)
892 {
893 	struct namecache *ncp, *ncp2;
894 	struct neglist *neglist;
895 	struct negstate *negstate;
896 	struct mtx *dvlp;
897 	struct rwlock *blp;
898 
899 	if (mtx_owner(&ncneg_shrink_lock) != NULL ||
900 	    !mtx_trylock(&ncneg_shrink_lock)) {
901 		counter_u64_add(shrinking_skipped, 1);
902 		return;
903 	}
904 
905 	mtx_lock(&ncneg_hot.nl_lock);
906 	ncp = TAILQ_FIRST(&ncneg_hot.nl_list);
907 	if (ncp != NULL) {
908 		neglist = NCP2NEGLIST(ncp);
909 		negstate = NCP2NEGSTATE(ncp);
910 		mtx_lock(&neglist->nl_lock);
911 		MPASS((negstate->neg_flag & NEG_HOT) != 0);
912 		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
913 		TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
914 		negstate->neg_flag &= ~NEG_HOT;
915 		numhotneg--;
916 		mtx_unlock(&neglist->nl_lock);
917 	}
918 	mtx_unlock(&ncneg_hot.nl_lock);
919 
920 	cache_negative_shrink_select(&ncp, &neglist);
921 
922 	mtx_unlock(&ncneg_shrink_lock);
923 	if (ncp == NULL)
924 		return;
925 
926 	MPASS(ncp->nc_flag & NCF_NEGATIVE);
927 	dvlp = VP2VNODELOCK(ncp->nc_dvp);
928 	blp = NCP2BUCKETLOCK(ncp);
929 	mtx_unlock(&neglist->nl_lock);
930 	mtx_lock(dvlp);
931 	rw_wlock(blp);
932 	/*
933 	 * Enter SMR to safely check the negative list.
934 	 * Even if the found pointer matches, the entry may now be reallocated
935 	 * and used by a different vnode.
936 	 */
937 	vfs_smr_enter();
938 	ncp2 = TAILQ_FIRST(&neglist->nl_list);
939 	if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) ||
940 	    blp != NCP2BUCKETLOCK(ncp2)) {
941 		vfs_smr_exit();
942 		ncp = NULL;
943 	} else {
944 		vfs_smr_exit();
945 		SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp,
946 		    ncp->nc_name);
947 		cache_zap_locked(ncp);
948 		counter_u64_add(numneg_evicted, 1);
949 	}
950 	rw_wunlock(blp);
951 	mtx_unlock(dvlp);
952 	cache_free(ncp);
953 }
954 
955 /*
956  * cache_zap_locked():
957  *
958  *   Removes a namecache entry from cache, whether it contains an actual
959  *   pointer to a vnode or if it is just a negative cache entry.
960  */
961 static void
962 cache_zap_locked(struct namecache *ncp)
963 {
964 	struct nchashhead *ncpp;
965 
966 	if (!(ncp->nc_flag & NCF_NEGATIVE))
967 		cache_assert_vnode_locked(ncp->nc_vp);
968 	cache_assert_vnode_locked(ncp->nc_dvp);
969 	cache_assert_bucket_locked(ncp, RA_WLOCKED);
970 
971 	CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp,
972 	    (ncp->nc_flag & NCF_NEGATIVE) ? NULL : ncp->nc_vp);
973 
974 	cache_ncp_invalidate(ncp);
975 
976 	ncpp = NCP2BUCKET(ncp);
977 	CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
978 	if (!(ncp->nc_flag & NCF_NEGATIVE)) {
979 		SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
980 		    ncp->nc_name, ncp->nc_vp);
981 		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
982 		if (ncp == ncp->nc_vp->v_cache_dd) {
983 			vn_seqc_write_begin_unheld(ncp->nc_vp);
984 			ncp->nc_vp->v_cache_dd = NULL;
985 			vn_seqc_write_end(ncp->nc_vp);
986 		}
987 	} else {
988 		SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
989 		    ncp->nc_name);
990 		cache_negative_remove(ncp);
991 	}
992 	if (ncp->nc_flag & NCF_ISDOTDOT) {
993 		if (ncp == ncp->nc_dvp->v_cache_dd) {
994 			vn_seqc_write_begin_unheld(ncp->nc_dvp);
995 			ncp->nc_dvp->v_cache_dd = NULL;
996 			vn_seqc_write_end(ncp->nc_dvp);
997 		}
998 	} else {
999 		LIST_REMOVE(ncp, nc_src);
1000 		if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
1001 			ncp->nc_flag |= NCF_DVDROP;
1002 			counter_u64_add(numcachehv, -1);
1003 		}
1004 	}
1005 	atomic_subtract_rel_long(&numcache, 1);
1006 }
1007 
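/*
 * Zap a negative entry whose directory vnode lock is already held by the
 * caller; only the bucket lock needs to be taken.
 */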
1008 static void
1009 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
1010 {
1011 	struct rwlock *blp;
1012 
1013 	MPASS(ncp->nc_dvp == vp);
1014 	MPASS(ncp->nc_flag & NCF_NEGATIVE);
1015 	cache_assert_vnode_locked(vp);
1016 
1017 	blp = NCP2BUCKETLOCK(ncp);
1018 	rw_wlock(blp);
1019 	cache_zap_locked(ncp);
1020 	rw_wunlock(blp);
1021 }
1022 
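/*
 * Zap an entry while the vnode lock for vp remains held by the caller.
 * This implements the trylock-or-relock scheme described in the locking
 * comment above: if the remaining locks cannot be taken in order,
 * everything is relocked and false is returned so the caller can
 * re-validate and retry.
 */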
1023 static bool
1024 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
1025     struct mtx **vlpp)
1026 {
1027 	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
1028 	struct rwlock *blp;
1029 
1030 	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
1031 	cache_assert_vnode_locked(vp);
1032 
1033 	if (ncp->nc_flag & NCF_NEGATIVE) {
1034 		if (*vlpp != NULL) {
1035 			mtx_unlock(*vlpp);
1036 			*vlpp = NULL;
1037 		}
1038 		cache_zap_negative_locked_vnode_kl(ncp, vp);
1039 		return (true);
1040 	}
1041 
1042 	pvlp = VP2VNODELOCK(vp);
1043 	blp = NCP2BUCKETLOCK(ncp);
1044 	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
1045 	vlp2 = VP2VNODELOCK(ncp->nc_vp);
1046 
1047 	if (*vlpp == vlp1 || *vlpp == vlp2) {
1048 		to_unlock = *vlpp;
1049 		*vlpp = NULL;
1050 	} else {
1051 		if (*vlpp != NULL) {
1052 			mtx_unlock(*vlpp);
1053 			*vlpp = NULL;
1054 		}
1055 		cache_sort_vnodes(&vlp1, &vlp2);
1056 		if (vlp1 == pvlp) {
1057 			mtx_lock(vlp2);
1058 			to_unlock = vlp2;
1059 		} else {
1060 			if (!mtx_trylock(vlp1))
1061 				goto out_relock;
1062 			to_unlock = vlp1;
1063 		}
1064 	}
1065 	rw_wlock(blp);
1066 	cache_zap_locked(ncp);
1067 	rw_wunlock(blp);
1068 	if (to_unlock != NULL)
1069 		mtx_unlock(to_unlock);
1070 	return (true);
1071 
1072 out_relock:
1073 	mtx_unlock(vlp2);
1074 	mtx_lock(vlp1);
1075 	mtx_lock(vlp2);
1076 	MPASS(*vlpp == NULL);
1077 	*vlpp = vlp1;
1078 	return (false);
1079 }
1080 
1081 static int __noinline
1082 cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp)
1083 {
1084 	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
1085 	struct rwlock *blp;
1086 	int error = 0;
1087 
1088 	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
1089 	cache_assert_vnode_locked(vp);
1090 
1091 	pvlp = VP2VNODELOCK(vp);
1092 	if (ncp->nc_flag & NCF_NEGATIVE) {
1093 		cache_zap_negative_locked_vnode_kl(ncp, vp);
1094 		goto out;
1095 	}
1096 
1097 	blp = NCP2BUCKETLOCK(ncp);
1098 	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
1099 	vlp2 = VP2VNODELOCK(ncp->nc_vp);
1100 	cache_sort_vnodes(&vlp1, &vlp2);
1101 	if (vlp1 == pvlp) {
1102 		mtx_lock(vlp2);
1103 		to_unlock = vlp2;
1104 	} else {
1105 		if (!mtx_trylock(vlp1)) {
1106 			error = EAGAIN;
1107 			goto out;
1108 		}
1109 		to_unlock = vlp1;
1110 	}
1111 	rw_wlock(blp);
1112 	cache_zap_locked(ncp);
1113 	rw_wunlock(blp);
1114 	mtx_unlock(to_unlock);
1115 out:
1116 	mtx_unlock(pvlp);
1117 	return (error);
1118 }
1119 
1120 /*
1121  * If trylocking failed, we can get here. We know enough to take all needed locks
1122  * in the right order and re-lookup the entry.
1123  */
1124 static int
1125 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1126     struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
1127     struct rwlock *blp)
1128 {
1129 	struct namecache *rncp;
1130 
1131 	cache_assert_bucket_locked(ncp, RA_UNLOCKED);
1132 
1133 	cache_sort_vnodes(&dvlp, &vlp);
1134 	cache_lock_vnodes(dvlp, vlp);
1135 	rw_wlock(blp);
1136 	CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
1137 		if (rncp == ncp && rncp->nc_dvp == dvp &&
1138 		    rncp->nc_nlen == cnp->cn_namelen &&
1139 		    !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
1140 			break;
1141 	}
1142 	if (rncp != NULL) {
1143 		cache_zap_locked(rncp);
1144 		rw_wunlock(blp);
1145 		cache_unlock_vnodes(dvlp, vlp);
1146 		counter_u64_add(zap_and_exit_bucket_relock_success, 1);
1147 		return (0);
1148 	}
1149 
1150 	rw_wunlock(blp);
1151 	cache_unlock_vnodes(dvlp, vlp);
1152 	return (EAGAIN);
1153 }
1154 
1155 static int __noinline
1156 cache_zap_wlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1157     uint32_t hash, struct rwlock *blp)
1158 {
1159 	struct mtx *dvlp, *vlp;
1160 	struct vnode *dvp;
1161 
1162 	cache_assert_bucket_locked(ncp, RA_WLOCKED);
1163 
1164 	dvlp = VP2VNODELOCK(ncp->nc_dvp);
1165 	vlp = NULL;
1166 	if (!(ncp->nc_flag & NCF_NEGATIVE))
1167 		vlp = VP2VNODELOCK(ncp->nc_vp);
1168 	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1169 		cache_zap_locked(ncp);
1170 		rw_wunlock(blp);
1171 		cache_unlock_vnodes(dvlp, vlp);
1172 		return (0);
1173 	}
1174 
1175 	dvp = ncp->nc_dvp;
1176 	rw_wunlock(blp);
1177 	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1178 }
1179 
1180 static int __noinline
1181 cache_zap_rlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1182     uint32_t hash, struct rwlock *blp)
1183 {
1184 	struct mtx *dvlp, *vlp;
1185 	struct vnode *dvp;
1186 
1187 	cache_assert_bucket_locked(ncp, RA_RLOCKED);
1188 
1189 	dvlp = VP2VNODELOCK(ncp->nc_dvp);
1190 	vlp = NULL;
1191 	if (!(ncp->nc_flag & NCF_NEGATIVE))
1192 		vlp = VP2VNODELOCK(ncp->nc_vp);
1193 	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1194 		rw_runlock(blp);
1195 		rw_wlock(blp);
1196 		cache_zap_locked(ncp);
1197 		rw_wunlock(blp);
1198 		cache_unlock_vnodes(dvlp, vlp);
1199 		return (0);
1200 	}
1201 
1202 	dvp = ncp->nc_dvp;
1203 	rw_runlock(blp);
1204 	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1205 }
1206 
1207 static int
1208 cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp,
1209     struct mtx **vlpp1, struct mtx **vlpp2)
1210 {
1211 	struct mtx *dvlp, *vlp;
1212 
1213 	cache_assert_bucket_locked(ncp, RA_WLOCKED);
1214 
1215 	dvlp = VP2VNODELOCK(ncp->nc_dvp);
1216 	vlp = NULL;
1217 	if (!(ncp->nc_flag & NCF_NEGATIVE))
1218 		vlp = VP2VNODELOCK(ncp->nc_vp);
1219 	cache_sort_vnodes(&dvlp, &vlp);
1220 
1221 	if (*vlpp1 == dvlp && *vlpp2 == vlp) {
1222 		cache_zap_locked(ncp);
1223 		cache_unlock_vnodes(dvlp, vlp);
1224 		*vlpp1 = NULL;
1225 		*vlpp2 = NULL;
1226 		return (0);
1227 	}
1228 
1229 	if (*vlpp1 != NULL)
1230 		mtx_unlock(*vlpp1);
1231 	if (*vlpp2 != NULL)
1232 		mtx_unlock(*vlpp2);
1233 	*vlpp1 = NULL;
1234 	*vlpp2 = NULL;
1235 
1236 	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1237 		cache_zap_locked(ncp);
1238 		cache_unlock_vnodes(dvlp, vlp);
1239 		return (0);
1240 	}
1241 
1242 	rw_wunlock(blp);
1243 	*vlpp1 = dvlp;
1244 	*vlpp2 = vlp;
1245 	if (*vlpp1 != NULL)
1246 		mtx_lock(*vlpp1);
1247 	mtx_lock(*vlpp2);
1248 	rw_wlock(blp);
1249 	return (EAGAIN);
1250 }
1251 
1252 static void
1253 cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp)
1254 {
1255 
1256 	if (blp != NULL) {
1257 		rw_runlock(blp);
1258 	} else {
1259 		mtx_unlock(vlp);
1260 	}
1261 }
1262 
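/*
 * Handle a lookup of ".": return dvp itself with an extra reference and
 * the lock type requested by the caller.
 */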
1263 static int __noinline
1264 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1265     struct timespec *tsp, int *ticksp)
1266 {
1267 	int ltype;
1268 
1269 	*vpp = dvp;
1270 	CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .",
1271 			dvp, cnp->cn_nameptr);
1272 	counter_u64_add(dothits, 1);
1273 	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
1274 	if (tsp != NULL)
1275 		timespecclear(tsp);
1276 	if (ticksp != NULL)
1277 		*ticksp = ticks;
1278 	vrefact(*vpp);
1279 	/*
1280	 * When we look up "." we can still be asked to lock it
1281 	 * differently...
1282 	 */
1283 	ltype = cnp->cn_lkflags & LK_TYPE_MASK;
1284 	if (ltype != VOP_ISLOCKED(*vpp)) {
1285 		if (ltype == LK_EXCLUSIVE) {
1286 			vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
1287 			if (VN_IS_DOOMED((*vpp))) {
1288 				/* forced unmount */
1289 				vrele(*vpp);
1290 				*vpp = NULL;
1291 				return (ENOENT);
1292 			}
1293 		} else
1294 			vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
1295 	}
1296 	return (-1);
1297 }
1298 
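/*
 * Lookup with MAKEENTRY cleared: rather than returning a hit, remove any
 * matching entry (see the description of cache_lookup() below).
 */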
1299 static __noinline int
1300 cache_lookup_nomakeentry(struct vnode *dvp, struct vnode **vpp,
1301     struct componentname *cnp, struct timespec *tsp, int *ticksp)
1302 {
1303 	struct namecache *ncp;
1304 	struct rwlock *blp;
1305 	struct mtx *dvlp, *dvlp2;
1306 	uint32_t hash;
1307 	int error;
1308 
1309 	if (cnp->cn_namelen == 2 &&
1310 	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
1311 		counter_u64_add(dotdothits, 1);
1312 		dvlp = VP2VNODELOCK(dvp);
1313 		dvlp2 = NULL;
1314 		mtx_lock(dvlp);
1315 retry_dotdot:
1316 		ncp = dvp->v_cache_dd;
1317 		if (ncp == NULL) {
1318 			SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
1319 			    "..", NULL);
1320 			mtx_unlock(dvlp);
1321 			if (dvlp2 != NULL)
1322 				mtx_unlock(dvlp2);
1323 			return (0);
1324 		}
1325 		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1326 			if (ncp->nc_dvp != dvp)
1327 				panic("dvp %p v_cache_dd %p\n", dvp, ncp);
1328 			if (!cache_zap_locked_vnode_kl2(ncp,
1329 			    dvp, &dvlp2))
1330 				goto retry_dotdot;
1331 			MPASS(dvp->v_cache_dd == NULL);
1332 			mtx_unlock(dvlp);
1333 			if (dvlp2 != NULL)
1334 				mtx_unlock(dvlp2);
1335 			cache_free(ncp);
1336 		} else {
1337 			vn_seqc_write_begin(dvp);
1338 			dvp->v_cache_dd = NULL;
1339 			vn_seqc_write_end(dvp);
1340 			mtx_unlock(dvlp);
1341 			if (dvlp2 != NULL)
1342 				mtx_unlock(dvlp2);
1343 		}
1344 		return (0);
1345 	}
1346 
1347 	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1348 	blp = HASH2BUCKETLOCK(hash);
1349 retry:
1350 	if (CK_SLIST_EMPTY(NCHHASH(hash)))
1351 		goto out_no_entry;
1352 
1353 	rw_wlock(blp);
1354 
1355 	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1356 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1357 		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1358 			break;
1359 	}
1360 
1361 	/* We failed to find an entry */
1362 	if (ncp == NULL) {
1363 		rw_wunlock(blp);
1364 		goto out_no_entry;
1365 	}
1366 
1367 	error = cache_zap_wlocked_bucket(ncp, cnp, hash, blp);
1368 	if (__predict_false(error != 0)) {
1369 		zap_and_exit_bucket_fail++;
1370 		cache_maybe_yield();
1371 		goto retry;
1372 	}
1373 	counter_u64_add(numposzaps, 1);
1374 	cache_free(ncp);
1375 	return (0);
1376 out_no_entry:
1377 	SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, NULL);
1378 	counter_u64_add(nummisszap, 1);
1379 	return (0);
1380 }
1381 
1382 /**
1383  * Lookup a name in the name cache
1384  *
1385  * # Arguments
1386  *
1387  * - dvp:	Parent directory in which to search.
1388  * - vpp:	Return argument.  Will contain desired vnode on cache hit.
1389  * - cnp:	Parameters of the name search.  The most interesting bits of
1390  *   		the cn_flags field have the following meanings:
1391  *   	- MAKEENTRY:	If clear, free an entry from the cache rather than look
1392  *   			it up.
1393  *   	- ISDOTDOT:	Must be set if and only if cn_nameptr == ".."
1394  * - tsp:	Return storage for cache timestamp.  On a successful (positive
1395  *   		or negative) lookup, tsp will be filled with any timespec that
1396  *   		was stored when this cache entry was created.  However, it will
1397  *   		be clear for "." entries.
1398  * - ticks:	Return storage for alternate cache timestamp.  On a successful
1399  *   		(positive or negative) lookup, it will contain the ticks value
1400  *   		that was current when the cache entry was created, unless cnp
1401  *   		was ".".
1402  *
1403  * # Returns
1404  *
1405  * - -1:	A positive cache hit.  vpp will contain the desired vnode.
1406  * - ENOENT:	A negative cache hit, or dvp was recycled out from under us due
1407  *		to a forced unmount.  vpp will not be modified.  If the entry
1408  *		is a whiteout, then the ISWHITEOUT flag will be set in
1409  *		cnp->cn_flags.
1410  * - 0:		A cache miss.  vpp will not be modified.
1411  *
1412  * # Locking
1413  *
1414  * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
1415  * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
1416  * lock is not recursively acquired.
1417  */
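/*
 * Illustrative use (a sketch, not compiled here): a VOP_LOOKUP implementation
 * typically consults the cache first and falls back to scanning the directory
 * only on a miss, e.g.:
 *
 *	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
 *	if (error == -1)		(positive hit; *vpp referenced and locked)
 *		return (0);
 *	if (error == ENOENT)		(negative hit; ISWHITEOUT may be set)
 *		return (ENOENT);
 *	(error == 0: miss; read the directory and cache_enter() the result)
 *
 * vfs_cache_lookup() later in this file is the canonical in-tree consumer.
 */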
1418 int
1419 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1420     struct timespec *tsp, int *ticksp)
1421 {
1422 	struct namecache_ts *ncp_ts;
1423 	struct namecache *ncp;
1424 	struct negstate *negstate;
1425 	struct rwlock *blp;
1426 	struct mtx *dvlp;
1427 	uint32_t hash;
1428 	enum vgetstate vs;
1429 	int error, ltype;
1430 	bool try_smr, doing_smr, whiteout;
1431 
1432 #ifdef DEBUG_CACHE
1433 	if (__predict_false(!doingcache)) {
1434 		cnp->cn_flags &= ~MAKEENTRY;
1435 		return (0);
1436 	}
1437 #endif
1438 
1439 	if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.'))
1440 		return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
1441 
1442 	if ((cnp->cn_flags & MAKEENTRY) == 0)
1443 		return (cache_lookup_nomakeentry(dvp, vpp, cnp, tsp, ticksp));
1444 
1445 	try_smr = true;
1446 	if (cnp->cn_nameiop == CREATE)
1447 		try_smr = false;
1448 retry:
1449 	doing_smr = false;
1450 	blp = NULL;
1451 	dvlp = NULL;
1452 	error = 0;
1453 	if (cnp->cn_namelen == 2 &&
1454 	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
1455 		counter_u64_add(dotdothits, 1);
1456 		dvlp = VP2VNODELOCK(dvp);
1457 		mtx_lock(dvlp);
1458 		ncp = dvp->v_cache_dd;
1459 		if (ncp == NULL) {
1460 			SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
1461 			    "..", NULL);
1462 			mtx_unlock(dvlp);
1463 			return (0);
1464 		}
1465 		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1466 			if (ncp->nc_flag & NCF_NEGATIVE)
1467 				*vpp = NULL;
1468 			else
1469 				*vpp = ncp->nc_vp;
1470 		} else
1471 			*vpp = ncp->nc_dvp;
1472 		/* Return failure if negative entry was found. */
1473 		if (*vpp == NULL)
1474 			goto negative_success;
1475 		CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..",
1476 		    dvp, cnp->cn_nameptr, *vpp);
1477 		SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..",
1478 		    *vpp);
1479 		cache_out_ts(ncp, tsp, ticksp);
1480 		if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
1481 		    NCF_DTS && tsp != NULL) {
1482 			ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1483 			*tsp = ncp_ts->nc_dotdottime;
1484 		}
1485 		goto success;
1486 	}
1487 
1488 	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1489 retry_hashed:
1490 	if (try_smr) {
1491 		vfs_smr_enter();
1492 		doing_smr = true;
1493 		try_smr = false;
1494 	} else {
1495 		blp = HASH2BUCKETLOCK(hash);
1496 		rw_rlock(blp);
1497 	}
1498 
1499 	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1500 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1501 		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1502 			break;
1503 	}
1504 
1505 	/* We failed to find an entry */
1506 	if (__predict_false(ncp == NULL)) {
1507 		if (doing_smr)
1508 			vfs_smr_exit();
1509 		else
1510 			rw_runlock(blp);
1511 		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
1512 		    NULL);
1513 		counter_u64_add(nummiss, 1);
1514 		return (0);
1515 	}
1516 
1517 	if (ncp->nc_flag & NCF_NEGATIVE)
1518 		goto negative_success;
1519 
1520 	/* We found a "positive" match, return the vnode */
1521 	counter_u64_add(numposhits, 1);
1522 	*vpp = ncp->nc_vp;
1523 	CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
1524 	    dvp, cnp->cn_nameptr, *vpp, ncp);
1525 	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name,
1526 	    *vpp);
1527 	cache_out_ts(ncp, tsp, ticksp);
1528 success:
1529 	/*
1530 	 * On success we return a locked and ref'd vnode as per the lookup
1531 	 * protocol.
1532 	 */
1533 	MPASS(dvp != *vpp);
1534 	ltype = 0;	/* silence gcc warning */
1535 	if (cnp->cn_flags & ISDOTDOT) {
1536 		ltype = VOP_ISLOCKED(dvp);
1537 		VOP_UNLOCK(dvp);
1538 	}
1539 	if (doing_smr) {
1540 		if (!cache_ncp_canuse(ncp)) {
1541 			vfs_smr_exit();
1542 			*vpp = NULL;
1543 			goto retry;
1544 		}
1545 		vs = vget_prep_smr(*vpp);
1546 		vfs_smr_exit();
1547 		if (__predict_false(vs == VGET_NONE)) {
1548 			*vpp = NULL;
1549 			goto retry;
1550 		}
1551 	} else {
1552 		vs = vget_prep(*vpp);
1553 		cache_lookup_unlock(blp, dvlp);
1554 	}
1555 	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1556 	if (cnp->cn_flags & ISDOTDOT) {
1557 		vn_lock(dvp, ltype | LK_RETRY);
1558 		if (VN_IS_DOOMED(dvp)) {
1559 			if (error == 0)
1560 				vput(*vpp);
1561 			*vpp = NULL;
1562 			return (ENOENT);
1563 		}
1564 	}
1565 	if (error) {
1566 		*vpp = NULL;
1567 		goto retry;
1568 	}
1569 	if ((cnp->cn_flags & ISLASTCN) &&
1570 	    (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) {
1571 		ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
1572 	}
1573 	return (-1);
1574 
1575 negative_success:
1576 	/* We found a negative match, and want to create it, so purge */
1577 	if (cnp->cn_nameiop == CREATE) {
1578 		MPASS(!doing_smr);
1579 		counter_u64_add(numnegzaps, 1);
1580 		goto zap_and_exit;
1581 	}
1582 
1583 	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
1584 	cache_out_ts(ncp, tsp, ticksp);
1585 	counter_u64_add(numneghits, 1);
1586 	whiteout = (ncp->nc_flag & NCF_WHITE);
1587 
1588 	if (doing_smr) {
1589 		/*
1590 		 * We need to take locks to promote an entry.
1591 		 */
1592 		negstate = NCP2NEGSTATE(ncp);
1593 		if ((negstate->neg_flag & NEG_HOT) == 0 ||
1594 		    !cache_ncp_canuse(ncp)) {
1595 			vfs_smr_exit();
1596 			doing_smr = false;
1597 			goto retry_hashed;
1598 		}
1599 		vfs_smr_exit();
1600 	} else {
1601 		cache_negative_hit(ncp);
1602 		cache_lookup_unlock(blp, dvlp);
1603 	}
1604 	if (whiteout)
1605 		cnp->cn_flags |= ISWHITEOUT;
1606 	return (ENOENT);
1607 
1608 zap_and_exit:
1609 	MPASS(!doing_smr);
1610 	if (blp != NULL)
1611 		error = cache_zap_rlocked_bucket(ncp, cnp, hash, blp);
1612 	else
1613 		error = cache_zap_locked_vnode(ncp, dvp);
1614 	if (__predict_false(error != 0)) {
1615 		zap_and_exit_bucket_fail2++;
1616 		cache_maybe_yield();
1617 		goto retry;
1618 	}
1619 	cache_free(ncp);
1620 	return (0);
1621 }
1622 
1623 struct celockstate {
1624 	struct mtx *vlp[3];
1625 	struct rwlock *blp[2];
1626 };
1627 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
1628 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
1629 
1630 static inline void
1631 cache_celockstate_init(struct celockstate *cel)
1632 {
1633 
1634 	bzero(cel, sizeof(*cel));
1635 }
1636 
1637 static void
1638 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
1639     struct vnode *dvp)
1640 {
1641 	struct mtx *vlp1, *vlp2;
1642 
1643 	MPASS(cel->vlp[0] == NULL);
1644 	MPASS(cel->vlp[1] == NULL);
1645 	MPASS(cel->vlp[2] == NULL);
1646 
1647 	MPASS(vp != NULL || dvp != NULL);
1648 
1649 	vlp1 = VP2VNODELOCK(vp);
1650 	vlp2 = VP2VNODELOCK(dvp);
1651 	cache_sort_vnodes(&vlp1, &vlp2);
1652 
1653 	if (vlp1 != NULL) {
1654 		mtx_lock(vlp1);
1655 		cel->vlp[0] = vlp1;
1656 	}
1657 	mtx_lock(vlp2);
1658 	cel->vlp[1] = vlp2;
1659 }
1660 
1661 static void
1662 cache_unlock_vnodes_cel(struct celockstate *cel)
1663 {
1664 
1665 	MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
1666 
1667 	if (cel->vlp[0] != NULL)
1668 		mtx_unlock(cel->vlp[0]);
1669 	if (cel->vlp[1] != NULL)
1670 		mtx_unlock(cel->vlp[1]);
1671 	if (cel->vlp[2] != NULL)
1672 		mtx_unlock(cel->vlp[2]);
1673 }
1674 
1675 static bool
1676 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
1677 {
1678 	struct mtx *vlp;
1679 	bool ret;
1680 
1681 	cache_assert_vlp_locked(cel->vlp[0]);
1682 	cache_assert_vlp_locked(cel->vlp[1]);
1683 	MPASS(cel->vlp[2] == NULL);
1684 
1685 	MPASS(vp != NULL);
1686 	vlp = VP2VNODELOCK(vp);
1687 
1688 	ret = true;
1689 	if (vlp >= cel->vlp[1]) {
1690 		mtx_lock(vlp);
1691 	} else {
1692 		if (mtx_trylock(vlp))
1693 			goto out;
1694 		cache_lock_vnodes_cel_3_failures++;
1695 		cache_unlock_vnodes_cel(cel);
1696 		if (vlp < cel->vlp[0]) {
1697 			mtx_lock(vlp);
1698 			mtx_lock(cel->vlp[0]);
1699 			mtx_lock(cel->vlp[1]);
1700 		} else {
1701 			if (cel->vlp[0] != NULL)
1702 				mtx_lock(cel->vlp[0]);
1703 			mtx_lock(vlp);
1704 			mtx_lock(cel->vlp[1]);
1705 		}
1706 		ret = false;
1707 	}
1708 out:
1709 	cel->vlp[2] = vlp;
1710 	return (ret);
1711 }
1712 
1713 static void
1714 cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1,
1715     struct rwlock *blp2)
1716 {
1717 
1718 	MPASS(cel->blp[0] == NULL);
1719 	MPASS(cel->blp[1] == NULL);
1720 
1721 	cache_sort_vnodes(&blp1, &blp2);
1722 
1723 	if (blp1 != NULL) {
1724 		rw_wlock(blp1);
1725 		cel->blp[0] = blp1;
1726 	}
1727 	rw_wlock(blp2);
1728 	cel->blp[1] = blp2;
1729 }
1730 
1731 static void
1732 cache_unlock_buckets_cel(struct celockstate *cel)
1733 {
1734 
1735 	if (cel->blp[0] != NULL)
1736 		rw_wunlock(cel->blp[0]);
1737 	rw_wunlock(cel->blp[1]);
1738 }
1739 
1740 /*
1741  * Lock part of the cache affected by the insertion.
1742  *
1743  * This means vnodelocks for dvp, vp and the relevant bucketlock.
1744  * However, insertion can result in removal of an old entry. In this
1745  * case we have an additional vnode and bucketlock pair to lock. If the
1746  * entry is negative, ncelock is locked instead of the vnode.
1747  *
1748  * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
1749  * preserving the locking order (smaller address first).
1750  */
1751 static void
1752 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
1753     uint32_t hash)
1754 {
1755 	struct namecache *ncp;
1756 	struct rwlock *blps[2];
1757 
1758 	blps[0] = HASH2BUCKETLOCK(hash);
1759 	for (;;) {
1760 		blps[1] = NULL;
1761 		cache_lock_vnodes_cel(cel, dvp, vp);
1762 		if (vp == NULL || vp->v_type != VDIR)
1763 			break;
1764 		ncp = vp->v_cache_dd;
1765 		if (ncp == NULL)
1766 			break;
1767 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
1768 			break;
1769 		MPASS(ncp->nc_dvp == vp);
1770 		blps[1] = NCP2BUCKETLOCK(ncp);
1771 		if (ncp->nc_flag & NCF_NEGATIVE)
1772 			break;
1773 		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
1774 			break;
1775 		/*
1776 		 * All vnodes got re-locked. Re-validate the state and if
1777 		 * nothing changed we are done. Otherwise restart.
1778 		 */
1779 		if (ncp == vp->v_cache_dd &&
1780 		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
1781 		    blps[1] == NCP2BUCKETLOCK(ncp) &&
1782 		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
1783 			break;
1784 		cache_unlock_vnodes_cel(cel);
1785 		cel->vlp[0] = NULL;
1786 		cel->vlp[1] = NULL;
1787 		cel->vlp[2] = NULL;
1788 	}
1789 	cache_lock_buckets_cel(cel, blps[0], blps[1]);
1790 }
1791 
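/*
 * Same as cache_enter_lock(), but for inserting the ".." entry, where
 * dvp->v_cache_dd (rather than vp->v_cache_dd) may point to an entry which
 * also has to be locked.
 */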
1792 static void
1793 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
1794     uint32_t hash)
1795 {
1796 	struct namecache *ncp;
1797 	struct rwlock *blps[2];
1798 
1799 	blps[0] = HASH2BUCKETLOCK(hash);
1800 	for (;;) {
1801 		blps[1] = NULL;
1802 		cache_lock_vnodes_cel(cel, dvp, vp);
1803 		ncp = dvp->v_cache_dd;
1804 		if (ncp == NULL)
1805 			break;
1806 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
1807 			break;
1808 		MPASS(ncp->nc_dvp == dvp);
1809 		blps[1] = NCP2BUCKETLOCK(ncp);
1810 		if (ncp->nc_flag & NCF_NEGATIVE)
1811 			break;
1812 		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
1813 			break;
1814 		if (ncp == dvp->v_cache_dd &&
1815 		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
1816 		    blps[1] == NCP2BUCKETLOCK(ncp) &&
1817 		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
1818 			break;
1819 		cache_unlock_vnodes_cel(cel);
1820 		cel->vlp[0] = NULL;
1821 		cel->vlp[1] = NULL;
1822 		cel->vlp[2] = NULL;
1823 	}
1824 	cache_lock_buckets_cel(cel, blps[0], blps[1]);
1825 }
1826 
1827 static void
1828 cache_enter_unlock(struct celockstate *cel)
1829 {
1830 
1831 	cache_unlock_buckets_cel(cel);
1832 	cache_unlock_vnodes_cel(cel);
1833 }
1834 
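/*
 * Purge the stale ".." entry (if any) hanging off dvp before a new one is
 * installed by cache_enter_time().
 */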
1835 static void __noinline
1836 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
1837     struct componentname *cnp)
1838 {
1839 	struct celockstate cel;
1840 	struct namecache *ncp;
1841 	uint32_t hash;
1842 	int len;
1843 
1844 	if (dvp->v_cache_dd == NULL)
1845 		return;
1846 	len = cnp->cn_namelen;
1847 	cache_celockstate_init(&cel);
1848 	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
1849 	cache_enter_lock_dd(&cel, dvp, vp, hash);
1850 	vn_seqc_write_begin(dvp);
1851 	ncp = dvp->v_cache_dd;
1852 	if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
1853 		KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
1854 		cache_zap_locked(ncp);
1855 	} else {
1856 		ncp = NULL;
1857 	}
1858 	dvp->v_cache_dd = NULL;
1859 	vn_seqc_write_end(dvp);
1860 	cache_enter_unlock(&cel);
1861 	cache_free(ncp);
1862 }
1863 
1864 /*
1865  * Add an entry to the cache.
1866  */
1867 void
1868 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
1869     struct timespec *tsp, struct timespec *dtsp)
1870 {
1871 	struct celockstate cel;
1872 	struct namecache *ncp, *n2, *ndd;
1873 	struct namecache_ts *ncp_ts, *n2_ts;
1874 	struct nchashhead *ncpp;
1875 	uint32_t hash;
1876 	int flag;
1877 	int len;
1878 	u_long lnumcache;
1879 
1880 	CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr);
1881 	VNASSERT(vp == NULL || !VN_IS_DOOMED(vp), vp,
1882 	    ("cache_enter: Adding a doomed vnode"));
1883 	VNASSERT(dvp == NULL || !VN_IS_DOOMED(dvp), dvp,
1884 	    ("cache_enter: Doomed vnode used as src"));
1885 
1886 #ifdef DEBUG_CACHE
1887 	if (__predict_false(!doingcache))
1888 		return;
1889 #endif
1890 
1891 	flag = 0;
1892 	if (__predict_false(cnp->cn_nameptr[0] == '.')) {
1893 		if (cnp->cn_namelen == 1)
1894 			return;
1895 		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
1896 			cache_enter_dotdot_prep(dvp, vp, cnp);
1897 			flag = NCF_ISDOTDOT;
1898 		}
1899 	}
1900 
1901 	/*
1902 	 * Avoid blowout in namecache entries.
1903 	 */
1904 	lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
1905 	if (__predict_false(lnumcache >= ncsize)) {
1906 		atomic_add_long(&numcache, -1);
1907 		counter_u64_add(numdrops, 1);
1908 		return;
1909 	}
1910 
1911 	cache_celockstate_init(&cel);
1912 	ndd = NULL;
1913 	ncp_ts = NULL;
1914 
1915 	/*
1916 	 * Calculate the hash key and setup as much of the new
1917 	 * namecache entry as possible before acquiring the lock.
1918 	 */
1919 	ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
1920 	ncp->nc_flag = flag | NCF_WIP;
1921 	ncp->nc_vp = vp;
1922 	if (vp == NULL)
1923 		cache_negative_init(ncp);
1924 	ncp->nc_dvp = dvp;
1925 	if (tsp != NULL) {
1926 		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1927 		ncp_ts->nc_time = *tsp;
1928 		ncp_ts->nc_ticks = ticks;
1929 		ncp_ts->nc_nc.nc_flag |= NCF_TS;
1930 		if (dtsp != NULL) {
1931 			ncp_ts->nc_dotdottime = *dtsp;
1932 			ncp_ts->nc_nc.nc_flag |= NCF_DTS;
1933 		}
1934 	}
1935 	len = ncp->nc_nlen = cnp->cn_namelen;
1936 	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
1937 	strlcpy(ncp->nc_name, cnp->cn_nameptr, len + 1);
1938 	cache_enter_lock(&cel, dvp, vp, hash);
1939 
1940 	/*
1941 	 * See if this vnode or negative entry is already in the cache
1942 	 * with this name.  This can happen with concurrent lookups of
1943 	 * the same path name.
1944 	 */
1945 	ncpp = NCHHASH(hash);
1946 	CK_SLIST_FOREACH(n2, ncpp, nc_hash) {
1947 		if (n2->nc_dvp == dvp &&
1948 		    n2->nc_nlen == cnp->cn_namelen &&
1949 		    !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
1950 			if (tsp != NULL) {
1951 				KASSERT((n2->nc_flag & NCF_TS) != 0,
1952 				    ("no NCF_TS"));
1953 				n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
1954 				n2_ts->nc_time = ncp_ts->nc_time;
1955 				n2_ts->nc_ticks = ncp_ts->nc_ticks;
1956 				if (dtsp != NULL) {
1957 					n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
1958 					n2_ts->nc_nc.nc_flag |= NCF_DTS;
1959 				}
1960 			}
1961 			goto out_unlock_free;
1962 		}
1963 	}
1964 
1965 	if (flag == NCF_ISDOTDOT) {
1966 		/*
1967 		 * See if we are trying to add a ".." entry, but some other
1968 		 * lookup has already populated the v_cache_dd pointer.
1969 		 */
1970 		if (dvp->v_cache_dd != NULL)
1971 			goto out_unlock_free;
1972 		KASSERT(vp == NULL || vp->v_type == VDIR,
1973 		    ("wrong vnode type %p", vp));
1974 		vn_seqc_write_begin(dvp);
1975 		dvp->v_cache_dd = ncp;
1976 		vn_seqc_write_end(dvp);
1977 	}
1978 
1979 	if (vp != NULL) {
1980 		if (vp->v_type == VDIR) {
1981 			if (flag != NCF_ISDOTDOT) {
1982 				/*
1983 				 * For this case, the cache entry maps both the
1984 				 * directory name in it and the name ".." for the
1985 				 * directory's parent.
1986 				 */
1987 				vn_seqc_write_begin(vp);
1988 				if ((ndd = vp->v_cache_dd) != NULL) {
1989 					if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
1990 						cache_zap_locked(ndd);
1991 					else
1992 						ndd = NULL;
1993 				}
1994 				vp->v_cache_dd = ncp;
1995 				vn_seqc_write_end(vp);
1996 			}
1997 		} else {
1998 			if (vp->v_cache_dd != NULL) {
1999 				vn_seqc_write_begin(vp);
2000 				vp->v_cache_dd = NULL;
2001 				vn_seqc_write_end(vp);
2002 			}
2003 		}
2004 	}
2005 
2006 	if (flag != NCF_ISDOTDOT) {
2007 		if (LIST_EMPTY(&dvp->v_cache_src)) {
2008 			vhold(dvp);
2009 			counter_u64_add(numcachehv, 1);
2010 		}
2011 		LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
2012 	}
2013 
2014 	/*
2015 	 * If the entry is "negative", we place it into the
2016 	 * "negative" cache queue, otherwise, we place it into the
2017 	 * destination vnode's cache entries queue.
2018 	 */
2019 	if (vp != NULL) {
2020 		TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
2021 		SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
2022 		    vp);
2023 	} else {
2024 		if (cnp->cn_flags & ISWHITEOUT)
2025 			ncp->nc_flag |= NCF_WHITE;
2026 		cache_negative_insert(ncp);
2027 		SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
2028 		    ncp->nc_name);
2029 	}
2030 
2031 	/*
2032 	 * Insert the new namecache entry into the appropriate chain
2033 	 * within the cache entries table.
2034 	 */
2035 	CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash);
2036 
2037 	atomic_thread_fence_rel();
2038 	/*
2039 	 * Mark the entry as fully constructed.
2040 	 * It is immutable past this point until its removal.
2041 	 */
2042 	atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP);
2043 
2044 	cache_enter_unlock(&cel);
2045 	if (numneg * ncnegfactor > lnumcache)
2046 		cache_negative_zap_one();
2047 	cache_free(ndd);
2048 	return;
2049 out_unlock_free:
2050 	cache_enter_unlock(&cel);
2051 	atomic_add_long(&numcache, -1);
2052 	cache_free(ncp);
2053 	return;
2054 }
2055 
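/*
 * A minimal sketch of how a filesystem lookup routine typically feeds the
 * cache (hypothetical "foofs" and foofs_lookup_ondisk(); not code from this
 * file): on a successful lookup it enters the resolved vnode, and on ENOENT
 * it enters a negative entry, in both cases only when the caller asked for
 * caching with MAKEENTRY.
 *
 *	static int
 *	foofs_lookup(struct vop_cachedlookup_args *ap)
 *	{
 *		struct vnode *dvp = ap->a_dvp;
 *		struct vnode **vpp = ap->a_vpp;
 *		struct componentname *cnp = ap->a_cnp;
 *		int error;
 *
 *		error = foofs_lookup_ondisk(dvp, vpp, cnp);
 *		if ((cnp->cn_flags & MAKEENTRY) != 0) {
 *			if (error == 0)
 *				cache_enter(dvp, *vpp, cnp);
 *			else if (error == ENOENT)
 *				cache_enter(dvp, NULL, cnp);
 *		}
 *		return (error);
 *	}
 *
 * cache_enter() is a thin wrapper which ends up here with NULL timestamps;
 * filesystems that track timestamps call cache_enter_time() directly.
 */
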
2056 static u_int
2057 cache_roundup_2(u_int val)
2058 {
2059 	u_int res;
2060 
2061 	for (res = 1; res <= val; res <<= 1)
2062 		continue;
2063 
2064 	return (res);
2065 }
2066 
2067 static struct nchashhead *
2068 nchinittbl(u_long elements, u_long *hashmask)
2069 {
2070 	struct nchashhead *hashtbl;
2071 	u_long hashsize, i;
2072 
2073 	hashsize = cache_roundup_2(elements) / 2;
2074 
2075 	hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK);
2076 	for (i = 0; i < hashsize; i++)
2077 		CK_SLIST_INIT(&hashtbl[i]);
2078 	*hashmask = hashsize - 1;
2079 	return (hashtbl);
2080 }
2081 
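/*
 * Worked example (numbers are illustrative only): with elements == 2000,
 * cache_roundup_2() walks 1, 2, 4, ... up to 2048, the first power of two
 * strictly greater than its argument.  nchinittbl() then halves that, giving
 * a 1024-bucket table and *hashmask == 1023, so a 32-bit hash can be reduced
 * to a bucket index with a simple "hash & mask".
 */
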
2082 static void
2083 ncfreetbl(struct nchashhead *hashtbl)
2084 {
2085 
2086 	free(hashtbl, M_VFSCACHE);
2087 }
2088 
2089 /*
2090  * Name cache initialization, from vfs_init() when we are booting
2091  */
2092 static void
2093 nchinit(void *dummy __unused)
2094 {
2095 	u_int i;
2096 
2097 	cache_zone_small = uma_zcreate("S VFS Cache",
2098 	    sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1,
2099 	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT,
2100 	    UMA_ZONE_ZINIT);
2101 	cache_zone_small_ts = uma_zcreate("STS VFS Cache",
2102 	    sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1,
2103 	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT,
2104 	    UMA_ZONE_ZINIT);
2105 	cache_zone_large = uma_zcreate("L VFS Cache",
2106 	    sizeof(struct namecache) + NAME_MAX + 1,
2107 	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT,
2108 	    UMA_ZONE_ZINIT);
2109 	cache_zone_large_ts = uma_zcreate("LTS VFS Cache",
2110 	    sizeof(struct namecache_ts) + NAME_MAX + 1,
2111 	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT,
2112 	    UMA_ZONE_ZINIT);
2113 
2114 	VFS_SMR_ZONE_SET(cache_zone_small);
2115 	VFS_SMR_ZONE_SET(cache_zone_small_ts);
2116 	VFS_SMR_ZONE_SET(cache_zone_large);
2117 	VFS_SMR_ZONE_SET(cache_zone_large_ts);
2118 
2119 	ncsize = desiredvnodes * ncsizefactor;
2120 	nchashtbl = nchinittbl(desiredvnodes * 2, &nchash);
2121 	ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
2122 	if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
2123 		ncbuckethash = 7;
2124 	if (ncbuckethash > nchash)
2125 		ncbuckethash = nchash;
2126 	bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
2127 	    M_WAITOK | M_ZERO);
2128 	for (i = 0; i < numbucketlocks; i++)
2129 		rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE);
2130 	ncvnodehash = ncbuckethash;
2131 	vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
2132 	    M_WAITOK | M_ZERO);
2133 	for (i = 0; i < numvnodelocks; i++)
2134 		mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
2135 	ncpurgeminvnodes = numbucketlocks * 2;
2136 
2137 	neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE,
2138 	    M_WAITOK | M_ZERO);
2139 	for (i = 0; i < numneglists; i++) {
2140 		mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
2141 		TAILQ_INIT(&neglists[i].nl_list);
2142 	}
2143 	mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF);
2144 	TAILQ_INIT(&ncneg_hot.nl_list);
2145 
2146 	mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF);
2147 }
2148 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
2149 
2150 void
2151 cache_vnode_init(struct vnode *vp)
2152 {
2153 
2154 	LIST_INIT(&vp->v_cache_src);
2155 	TAILQ_INIT(&vp->v_cache_dst);
2156 	vp->v_cache_dd = NULL;
2157 	cache_prehash(vp);
2158 }
2159 
2160 void
2161 cache_changesize(u_long newmaxvnodes)
2162 {
2163 	struct nchashhead *new_nchashtbl, *old_nchashtbl;
2164 	u_long new_nchash, old_nchash;
2165 	struct namecache *ncp;
2166 	uint32_t hash;
2167 	u_long newncsize;
2168 	int i;
2169 
2170 	newncsize = newmaxvnodes * ncsizefactor;
2171 	newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
2172 	if (newmaxvnodes < numbucketlocks)
2173 		newmaxvnodes = numbucketlocks;
2174 
2175 	new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash);
2176 	/* If same hash table size, nothing to do */
2177 	if (nchash == new_nchash) {
2178 		ncfreetbl(new_nchashtbl);
2179 		return;
2180 	}
2181 	/*
2182 	 * Move everything from the old hash table to the new table.
2183 	 * None of the namecache entries in the table can be removed from
2184 	 * under us, since removal requires the hash table locks held here.
2185 	 */
2186 	cache_lock_all_vnodes();
2187 	cache_lock_all_buckets();
2188 	old_nchashtbl = nchashtbl;
2189 	old_nchash = nchash;
2190 	nchashtbl = new_nchashtbl;
2191 	nchash = new_nchash;
2192 	for (i = 0; i <= old_nchash; i++) {
2193 		while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) {
2194 			hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
2195 			    ncp->nc_dvp);
2196 			CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash);
2197 			CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
2198 		}
2199 	}
2200 	ncsize = newncsize;
2201 	cache_unlock_all_buckets();
2202 	cache_unlock_all_vnodes();
2203 	ncfreetbl(old_nchashtbl);
2204 }
2205 
2206 /*
2207  * Invalidate all entries from and to a particular vnode.
2208  */
2209 static void
2210 cache_purge_impl(struct vnode *vp)
2211 {
2212 	TAILQ_HEAD(, namecache) ncps;
2213 	struct namecache *ncp, *nnp;
2214 	struct mtx *vlp, *vlp2;
2215 
2216 	TAILQ_INIT(&ncps);
2217 	vlp = VP2VNODELOCK(vp);
2218 	vlp2 = NULL;
2219 	mtx_assert(vlp, MA_OWNED);
2220 retry:
2221 	while (!LIST_EMPTY(&vp->v_cache_src)) {
2222 		ncp = LIST_FIRST(&vp->v_cache_src);
2223 		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2224 			goto retry;
2225 		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2226 	}
2227 	while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
2228 		ncp = TAILQ_FIRST(&vp->v_cache_dst);
2229 		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2230 			goto retry;
2231 		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2232 	}
2233 	ncp = vp->v_cache_dd;
2234 	if (ncp != NULL) {
2235 		KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
2236 		   ("lost dotdot link"));
2237 		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2238 			goto retry;
2239 		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2240 	}
2241 	KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
2242 	mtx_unlock(vlp);
2243 	if (vlp2 != NULL)
2244 		mtx_unlock(vlp2);
2245 	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2246 		cache_free(ncp);
2247 	}
2248 }
2249 
2250 void
2251 cache_purge(struct vnode *vp)
2252 {
2253 	struct mtx *vlp;
2254 
2255 	SDT_PROBE1(vfs, namecache, purge, done, vp);
2256 	if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2257 	    vp->v_cache_dd == NULL)
2258 		return;
2259 	vlp = VP2VNODELOCK(vp);
2260 	mtx_lock(vlp);
2261 	cache_purge_impl(vp);
2262 }
2263 
2264 /*
2265  * Only to be used by vgone.
2266  */
2267 void
2268 cache_purge_vgone(struct vnode *vp)
2269 {
2270 	struct mtx *vlp;
2271 
2272 	VNPASS(VN_IS_DOOMED(vp), vp);
2273 	vlp = VP2VNODELOCK(vp);
2274 	if (!(LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2275 	    vp->v_cache_dd == NULL)) {
2276 		mtx_lock(vlp);
2277 		cache_purge_impl(vp);
2278 		mtx_assert(vlp, MA_NOTOWNED);
2279 		return;
2280 	}
2281 
2282 	/*
2283 	 * All the NULL pointer state we found above may be transient.
2284 	 * Serialize against a possible thread doing cache_purge.
2285 	 */
2286 	mtx_wait_unlocked(vlp);
2287 	if (!(LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2288 	    vp->v_cache_dd == NULL)) {
2289 		mtx_lock(vlp);
2290 		cache_purge_impl(vp);
2291 		mtx_assert(vlp, MA_NOTOWNED);
2292 		return;
2293 	}
2294 	return;
2295 }
2296 
2297 /*
2298  * Invalidate all negative entries for a particular directory vnode.
2299  */
2300 void
2301 cache_purge_negative(struct vnode *vp)
2302 {
2303 	TAILQ_HEAD(, namecache) ncps;
2304 	struct namecache *ncp, *nnp;
2305 	struct mtx *vlp;
2306 
2307 	CTR1(KTR_VFS, "cache_purge_negative(%p)", vp);
2308 	SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
2309 	if (LIST_EMPTY(&vp->v_cache_src))
2310 		return;
2311 	TAILQ_INIT(&ncps);
2312 	vlp = VP2VNODELOCK(vp);
2313 	mtx_lock(vlp);
2314 	LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
2315 		if (!(ncp->nc_flag & NCF_NEGATIVE))
2316 			continue;
2317 		cache_zap_negative_locked_vnode_kl(ncp, vp);
2318 		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2319 	}
2320 	mtx_unlock(vlp);
2321 	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2322 		cache_free(ncp);
2323 	}
2324 }
2325 
2326 /*
2327  * Flush all entries referencing a particular filesystem.
2328  */
2329 void
2330 cache_purgevfs(struct mount *mp, bool force)
2331 {
2332 	TAILQ_HEAD(, namecache) ncps;
2333 	struct mtx *vlp1, *vlp2;
2334 	struct rwlock *blp;
2335 	struct nchashhead *bucket;
2336 	struct namecache *ncp, *nnp;
2337 	u_long i, j, n_nchash;
2338 	int error;
2339 
2340 	/* Scan hash tables for applicable entries */
2341 	SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
2342 	if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes)
2343 		return;
2344 	TAILQ_INIT(&ncps);
2345 	n_nchash = nchash + 1;
2346 	vlp1 = vlp2 = NULL;
2347 	for (i = 0; i < numbucketlocks; i++) {
2348 		blp = (struct rwlock *)&bucketlocks[i];
2349 		rw_wlock(blp);
2350 		for (j = i; j < n_nchash; j += numbucketlocks) {
2351 retry:
2352 			bucket = &nchashtbl[j];
2353 			CK_SLIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) {
2354 				cache_assert_bucket_locked(ncp, RA_WLOCKED);
2355 				if (ncp->nc_dvp->v_mount != mp)
2356 					continue;
2357 				error = cache_zap_wlocked_bucket_kl(ncp, blp,
2358 				    &vlp1, &vlp2);
2359 				if (error != 0)
2360 					goto retry;
2361 				TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst);
2362 			}
2363 		}
2364 		rw_wunlock(blp);
2365 		if (vlp1 == NULL && vlp2 == NULL)
2366 			cache_maybe_yield();
2367 	}
2368 	if (vlp1 != NULL)
2369 		mtx_unlock(vlp1);
2370 	if (vlp2 != NULL)
2371 		mtx_unlock(vlp2);
2372 
2373 	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2374 		cache_free(ncp);
2375 	}
2376 }
2377 
2378 /*
2379  * Perform canonical checks and cache lookup and pass on to filesystem
2380  * through the vop_cachedlookup only if needed.
2381  */
2382 
2383 int
2384 vfs_cache_lookup(struct vop_lookup_args *ap)
2385 {
2386 	struct vnode *dvp;
2387 	int error;
2388 	struct vnode **vpp = ap->a_vpp;
2389 	struct componentname *cnp = ap->a_cnp;
2390 	int flags = cnp->cn_flags;
2391 
2392 	*vpp = NULL;
2393 	dvp = ap->a_dvp;
2394 
2395 	if (dvp->v_type != VDIR)
2396 		return (ENOTDIR);
2397 
2398 	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
2399 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
2400 		return (EROFS);
2401 
2402 	error = vn_dir_check_exec(dvp, cnp);
2403 	if (error != 0)
2404 		return (error);
2405 
2406 	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
2407 	if (error == 0)
2408 		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
2409 	if (error == -1)
2410 		return (0);
2411 	return (error);
2412 }
2413 
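/*
 * A filesystem opts into this by pointing its vop_lookup slot at
 * vfs_cache_lookup() and doing the real work in vop_cachedlookup, which only
 * runs on a cache miss.  Hypothetical sketch (the vop vector below is not
 * from this file):
 *
 *	static struct vop_vector foofs_vnodeops = {
 *		.vop_default =		&default_vnodeops,
 *		.vop_lookup =		vfs_cache_lookup,
 *		.vop_cachedlookup =	foofs_lookup,
 *	};
 */
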
2414 /* Implementation of the getcwd syscall. */
2415 int
2416 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
2417 {
2418 	char *buf, *retbuf;
2419 	size_t buflen;
2420 	int error;
2421 
2422 	buflen = uap->buflen;
2423 	if (__predict_false(buflen < 2))
2424 		return (EINVAL);
2425 	if (buflen > MAXPATHLEN)
2426 		buflen = MAXPATHLEN;
2427 
2428 	buf = malloc(buflen, M_TEMP, M_WAITOK);
2429 	error = vn_getcwd(td, buf, &retbuf, &buflen);
2430 	if (error == 0)
2431 		error = copyout(retbuf, uap->buf, buflen);
2432 	free(buf, M_TEMP);
2433 	return (error);
2434 }
2435 
2436 int
2437 vn_getcwd(struct thread *td, char *buf, char **retbuf, size_t *buflen)
2438 {
2439 	struct pwd *pwd;
2440 	int error;
2441 
2442 	pwd = pwd_hold(td);
2443 	error = vn_fullpath_any(td, pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, buflen);
2444 	pwd_drop(pwd);
2445 
2446 #ifdef KTRACE
2447 	if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
2448 		ktrnamei(*retbuf);
2449 #endif
2450 	return (error);
2451 }
2452 
2453 static int
2454 kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
2455     size_t size, int flags, enum uio_seg pathseg)
2456 {
2457 	struct nameidata nd;
2458 	char *retbuf, *freebuf;
2459 	int error;
2460 
2461 	if (flags != 0)
2462 		return (EINVAL);
2463 	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1,
2464 	    pathseg, path, fd, &cap_fstat_rights, td);
2465 	if ((error = namei(&nd)) != 0)
2466 		return (error);
2467 	error = vn_fullpath_hardlink(td, &nd, &retbuf, &freebuf, &size);
2468 	if (error == 0) {
2469 		error = copyout(retbuf, buf, size);
2470 		free(freebuf, M_TEMP);
2471 	}
2472 	NDFREE(&nd, 0);
2473 	return (error);
2474 }
2475 
2476 int
2477 sys___realpathat(struct thread *td, struct __realpathat_args *uap)
2478 {
2479 
2480 	return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
2481 	    uap->flags, UIO_USERSPACE));
2482 }
2483 
2484 /*
2485  * Retrieve the full filesystem path that corresponds to a vnode from the
2486  * name cache (if available).
2487  */
2488 int
2489 vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
2490 {
2491 	struct pwd *pwd;
2492 	char *buf;
2493 	size_t buflen;
2494 	int error;
2495 
2496 	if (__predict_false(vn == NULL))
2497 		return (EINVAL);
2498 
2499 	buflen = MAXPATHLEN;
2500 	buf = malloc(buflen, M_TEMP, M_WAITOK);
2501 	pwd = pwd_hold(td);
2502 	error = vn_fullpath_any(td, vn, pwd->pwd_rdir, buf, retbuf, &buflen);
2503 	pwd_drop(pwd);
2504 
2505 	if (!error)
2506 		*freebuf = buf;
2507 	else
2508 		free(buf, M_TEMP);
2509 	return (error);
2510 }
2511 
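/*
 * Typical caller pattern (a sketch, not code from this file): the returned
 * path is borrowed from the M_TEMP buffer handed back via freebuf, which the
 * caller must free once it is done with retbuf.
 *
 *	char *fullpath, *freepath;
 *
 *	freepath = NULL;
 *	if (vn_fullpath(curthread, vp, &fullpath, &freepath) != 0)
 *		fullpath = "unknown";
 *	printf("vnode resolves to %s\n", fullpath);
 *	if (freepath != NULL)
 *		free(freepath, M_TEMP);
 */
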
2512 /*
2513  * This function is similar to vn_fullpath, but it attempts to lookup the
2514  * pathname relative to the global root mount point.  This is required for the
2515  * auditing sub-system, as audited pathnames must be absolute, relative to the
2516  * global root mount point.
2517  */
2518 int
2519 vn_fullpath_global(struct thread *td, struct vnode *vn,
2520     char **retbuf, char **freebuf)
2521 {
2522 	char *buf;
2523 	size_t buflen;
2524 	int error;
2525 
2526 	if (__predict_false(vn == NULL))
2527 		return (EINVAL);
2528 	buflen = MAXPATHLEN;
2529 	buf = malloc(buflen, M_TEMP, M_WAITOK);
2530 	error = vn_fullpath_any(td, vn, rootvnode, buf, retbuf, &buflen);
2531 	if (!error)
2532 		*freebuf = buf;
2533 	else
2534 		free(buf, M_TEMP);
2535 	return (error);
2536 }
2537 
2538 int
2539 vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, size_t *buflen)
2540 {
2541 	struct vnode *dvp;
2542 	struct namecache *ncp;
2543 	struct mtx *vlp;
2544 	int error;
2545 
2546 	vlp = VP2VNODELOCK(*vp);
2547 	mtx_lock(vlp);
2548 	TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) {
2549 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2550 			break;
2551 	}
2552 	if (ncp != NULL) {
2553 		if (*buflen < ncp->nc_nlen) {
2554 			mtx_unlock(vlp);
2555 			vrele(*vp);
2556 			counter_u64_add(numfullpathfail4, 1);
2557 			error = ENOMEM;
2558 			SDT_PROBE3(vfs, namecache, fullpath, return, error,
2559 			    vp, NULL);
2560 			return (error);
2561 		}
2562 		*buflen -= ncp->nc_nlen;
2563 		memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
2564 		SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
2565 		    ncp->nc_name, vp);
2566 		dvp = *vp;
2567 		*vp = ncp->nc_dvp;
2568 		vref(*vp);
2569 		mtx_unlock(vlp);
2570 		vrele(dvp);
2571 		return (0);
2572 	}
2573 	SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
2574 
2575 	mtx_unlock(vlp);
2576 	vn_lock(*vp, LK_SHARED | LK_RETRY);
2577 	error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen);
2578 	vput(*vp);
2579 	if (error) {
2580 		counter_u64_add(numfullpathfail2, 1);
2581 		SDT_PROBE3(vfs, namecache, fullpath, return,  error, vp, NULL);
2582 		return (error);
2583 	}
2584 
2585 	*vp = dvp;
2586 	if (VN_IS_DOOMED(dvp)) {
2587 		/* forced unmount */
2588 		vrele(dvp);
2589 		error = ENOENT;
2590 		SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
2591 		return (error);
2592 	}
2593 	/*
2594 	 * *vp has its use count incremented still.
2595 	 */
2596 
2597 	return (0);
2598 }
2599 
2600 /*
2601  * Resolve a directory to a pathname.
2602  *
2603  * The name of the directory can always be found in the namecache or fetched
2604  * from the filesystem. There is also guaranteed to be only one parent, meaning
2605  * we can just follow vnodes up until we find the root.
2606  *
2607  * The vnode must be referenced.
2608  */
2609 static int
2610 vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir,
2611     char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend)
2612 {
2613 #ifdef KDTRACE_HOOKS
2614 	struct vnode *startvp = vp;
2615 #endif
2616 	struct vnode *vp1;
2617 	size_t buflen;
2618 	int error;
2619 
2620 	VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
2621 	VNPASS(vp->v_usecount > 0, vp);
2622 
2623 	buflen = *len;
2624 
2625 	if (!slash_prefixed) {
2626 		MPASS(*len >= 2);
2627 		buflen--;
2628 		buf[buflen] = '\0';
2629 	}
2630 
2631 	error = 0;
2632 
2633 	SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
2634 	counter_u64_add(numfullpathcalls, 1);
2635 	while (vp != rdir && vp != rootvnode) {
2636 		/*
2637 		 * The vp vnode must be already fully constructed,
2638 		 * since it is either found in namecache or obtained
2639 		 * from VOP_VPTOCNP().  We may test for VV_ROOT safely
2640 		 * without obtaining the vnode lock.
2641 		 */
2642 		if ((vp->v_vflag & VV_ROOT) != 0) {
2643 			vn_lock(vp, LK_RETRY | LK_SHARED);
2644 
2645 			/*
2646 			 * With the vnode locked, check for races with
2647 			 * unmount, forced or not.  Note that we
2648 			 * already verified that vp is not equal to
2649 			 * the root vnode, which means that
2650 			 * mnt_vnodecovered can be NULL only for the
2651 			 * case of unmount.
2652 			 */
2653 			if (VN_IS_DOOMED(vp) ||
2654 			    (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
2655 			    vp1->v_mountedhere != vp->v_mount) {
2656 				vput(vp);
2657 				error = ENOENT;
2658 				SDT_PROBE3(vfs, namecache, fullpath, return,
2659 				    error, vp, NULL);
2660 				break;
2661 			}
2662 
2663 			vref(vp1);
2664 			vput(vp);
2665 			vp = vp1;
2666 			continue;
2667 		}
2668 		if (vp->v_type != VDIR) {
2669 			vrele(vp);
2670 			counter_u64_add(numfullpathfail1, 1);
2671 			error = ENOTDIR;
2672 			SDT_PROBE3(vfs, namecache, fullpath, return,
2673 			    error, vp, NULL);
2674 			break;
2675 		}
2676 		error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen);
2677 		if (error)
2678 			break;
2679 		if (buflen == 0) {
2680 			vrele(vp);
2681 			error = ENOMEM;
2682 			SDT_PROBE3(vfs, namecache, fullpath, return, error,
2683 			    startvp, NULL);
2684 			break;
2685 		}
2686 		buf[--buflen] = '/';
2687 		slash_prefixed = true;
2688 	}
2689 	if (error)
2690 		return (error);
2691 	if (!slash_prefixed) {
2692 		if (buflen == 0) {
2693 			vrele(vp);
2694 			counter_u64_add(numfullpathfail4, 1);
2695 			SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
2696 			    startvp, NULL);
2697 			return (ENOMEM);
2698 		}
2699 		buf[--buflen] = '/';
2700 	}
2701 	counter_u64_add(numfullpathfound, 1);
2702 	vrele(vp);
2703 
2704 	*retbuf = buf + buflen;
2705 	SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
2706 	*len -= buflen;
2707 	*len += addend;
2708 	return (0);
2709 }
2710 
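/*
 * The loop above builds the path backwards: each name is copied in front of
 * what has already been assembled at the end of the buffer, with buflen
 * acting as the cursor.  A stripped-down sketch of the same idiom using a
 * hypothetical helper (not code from this file):
 *
 *	static int
 *	prepend_name(char *buf, size_t *off, const char *name, size_t len)
 *	{
 *
 *		if (*off < len + 1)
 *			return (ENOMEM);
 *		*off -= len;
 *		memcpy(buf + *off, name, len);
 *		buf[--*off] = '/';
 *		return (0);
 *	}
 *
 * After walking all the way up, buf + *off points at the finished string,
 * which is what gets handed back via retbuf here.
 */
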
2711 /*
2712  * Resolve an arbitrary vnode to a pathname.
2713  *
2714  * Note 2 caveats:
2715  * - hardlinks are not tracked, thus if the vnode is not a directory this can
2716  *   resolve to a different path than the one used to find it
2717  * - namecache is not mandatory, meaning names are not guaranteed to be added
2718  *   (in which case resolving fails)
2719  */
2720 static int
2721 vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir,
2722     char *buf, char **retbuf, size_t *buflen)
2723 {
2724 	size_t orig_buflen;
2725 	bool slash_prefixed;
2726 	int error;
2727 
2728 	if (*buflen < 2)
2729 		return (EINVAL);
2730 
2731 	orig_buflen = *buflen;
2732 
2733 	vref(vp);
2734 	slash_prefixed = false;
2735 	if (vp->v_type != VDIR) {
2736 		*buflen -= 1;
2737 		buf[*buflen] = '\0';
2738 		error = vn_vptocnp(&vp, td->td_ucred, buf, buflen);
2739 		if (error)
2740 			return (error);
2741 		if (*buflen == 0) {
2742 			vrele(vp);
2743 			return (ENOMEM);
2744 		}
2745 		*buflen -= 1;
2746 		buf[*buflen] = '/';
2747 		slash_prefixed = true;
2748 	}
2749 
2750 	return (vn_fullpath_dir(td, vp, rdir, buf, retbuf, buflen, slash_prefixed,
2751 	    orig_buflen - *buflen));
2752 }
2753 
2754 /*
2755  * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
2756  *
2757  * Since the namecache does not track hardlinks, the caller is expected to first
2758  * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei.
2759  *
2760  * Then we have 2 cases:
2761  * - if the found vnode is a directory, the path can be constructed just by
2762  *   following names up the chain
2763  * - otherwise we populate the buffer with the saved name and start resolving
2764  *   from the parent
2765  */
2766 static int
2767 vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf,
2768     char **freebuf, size_t *buflen)
2769 {
2770 	char *buf, *tmpbuf;
2771 	struct pwd *pwd;
2772 	struct componentname *cnp;
2773 	struct vnode *vp;
2774 	size_t addend;
2775 	int error;
2776 	bool slash_prefixed;
2777 
2778 	if (*buflen < 2)
2779 		return (EINVAL);
2780 	if (*buflen > MAXPATHLEN)
2781 		*buflen = MAXPATHLEN;
2782 
2783 	slash_prefixed = false;
2784 
2785 	buf = malloc(*buflen, M_TEMP, M_WAITOK);
2786 	pwd = pwd_hold(td);
2787 
2788 	addend = 0;
2789 	vp = ndp->ni_vp;
2790 	if (vp->v_type != VDIR) {
2791 		cnp = &ndp->ni_cnd;
2792 		addend = cnp->cn_namelen + 2;
2793 		if (*buflen < addend) {
2794 			error = ENOMEM;
2795 			goto out_bad;
2796 		}
2797 		*buflen -= addend;
2798 		tmpbuf = buf + *buflen;
2799 		tmpbuf[0] = '/';
2800 		memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen);
2801 		tmpbuf[addend - 1] = '\0';
2802 		slash_prefixed = true;
2803 		vp = ndp->ni_dvp;
2804 	}
2805 
2806 	vref(vp);
2807 	error = vn_fullpath_dir(td, vp, pwd->pwd_rdir, buf, retbuf, buflen,
2808 	    slash_prefixed, addend);
2809 	if (error != 0)
2810 		goto out_bad;
2811 
2812 	pwd_drop(pwd);
2813 	*freebuf = buf;
2814 
2815 	return (0);
2816 out_bad:
2817 	pwd_drop(pwd);
2818 	free(buf, M_TEMP);
2819 	return (error);
2820 }
2821 
2822 struct vnode *
2823 vn_dir_dd_ino(struct vnode *vp)
2824 {
2825 	struct namecache *ncp;
2826 	struct vnode *ddvp;
2827 	struct mtx *vlp;
2828 	enum vgetstate vs;
2829 
2830 	ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
2831 	vlp = VP2VNODELOCK(vp);
2832 	mtx_lock(vlp);
2833 	TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
2834 		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
2835 			continue;
2836 		ddvp = ncp->nc_dvp;
2837 		vs = vget_prep(ddvp);
2838 		mtx_unlock(vlp);
2839 		if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
2840 			return (NULL);
2841 		return (ddvp);
2842 	}
2843 	mtx_unlock(vlp);
2844 	return (NULL);
2845 }
2846 
2847 int
2848 vn_commname(struct vnode *vp, char *buf, u_int buflen)
2849 {
2850 	struct namecache *ncp;
2851 	struct mtx *vlp;
2852 	int l;
2853 
2854 	vlp = VP2VNODELOCK(vp);
2855 	mtx_lock(vlp);
2856 	TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
2857 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2858 			break;
2859 	if (ncp == NULL) {
2860 		mtx_unlock(vlp);
2861 		return (ENOENT);
2862 	}
2863 	l = min(ncp->nc_nlen, buflen - 1);
2864 	memcpy(buf, ncp->nc_name, l);
2865 	mtx_unlock(vlp);
2866 	buf[l] = '\0';
2867 	return (0);
2868 }
2869 
2870 /*
2871  * This function updates the path string to the vnode's full global path
2872  * and checks the size of the new path string against the pathlen argument.
2873  *
2874  * Requires a locked, referenced vnode.
2875  * Vnode is re-locked on success or ENODEV, otherwise unlocked.
2876  *
2877  * If vp is a directory, the call to vn_fullpath_global() always succeeds
2878  * because it falls back to the ".." lookup if the namecache lookup fails.
2879  */
2880 int
2881 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
2882     u_int pathlen)
2883 {
2884 	struct nameidata nd;
2885 	struct vnode *vp1;
2886 	char *rpath, *fbuf;
2887 	int error;
2888 
2889 	ASSERT_VOP_ELOCKED(vp, __func__);
2890 
2891 	/* Construct global filesystem path from vp. */
2892 	VOP_UNLOCK(vp);
2893 	error = vn_fullpath_global(td, vp, &rpath, &fbuf);
2894 
2895 	if (error != 0) {
2896 		vrele(vp);
2897 		return (error);
2898 	}
2899 
2900 	if (strlen(rpath) >= pathlen) {
2901 		vrele(vp);
2902 		error = ENAMETOOLONG;
2903 		goto out;
2904 	}
2905 
2906 	/*
2907 	 * Re-lookup the vnode by path to detect a possible rename.
2908 	 * As a side effect, the vnode is relocked.
2909 	 * If vnode was renamed, return ENOENT.
2910 	 */
2911 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
2912 	    UIO_SYSSPACE, path, td);
2913 	error = namei(&nd);
2914 	if (error != 0) {
2915 		vrele(vp);
2916 		goto out;
2917 	}
2918 	NDFREE(&nd, NDF_ONLY_PNBUF);
2919 	vp1 = nd.ni_vp;
2920 	vrele(vp);
2921 	if (vp1 == vp)
2922 		strcpy(path, rpath);
2923 	else {
2924 		vput(vp1);
2925 		error = ENOENT;
2926 	}
2927 
2928 out:
2929 	free(fbuf, M_TEMP);
2930 	return (error);
2931 }
2932 
2933 #ifdef DDB
2934 static void
2935 db_print_vpath(struct vnode *vp)
2936 {
2937 
2938 	while (vp != NULL) {
2939 		db_printf("%p: ", vp);
2940 		if (vp == rootvnode) {
2941 			db_printf("/");
2942 			vp = NULL;
2943 		} else {
2944 			if (vp->v_vflag & VV_ROOT) {
2945 				db_printf("<mount point>");
2946 				vp = vp->v_mount->mnt_vnodecovered;
2947 			} else {
2948 				struct namecache *ncp;
2949 				char *ncn;
2950 				int i;
2951 
2952 				ncp = TAILQ_FIRST(&vp->v_cache_dst);
2953 				if (ncp != NULL) {
2954 					ncn = ncp->nc_name;
2955 					for (i = 0; i < ncp->nc_nlen; i++)
2956 						db_printf("%c", *ncn++);
2957 					vp = ncp->nc_dvp;
2958 				} else {
2959 					vp = NULL;
2960 				}
2961 			}
2962 		}
2963 		db_printf("\n");
2964 	}
2965 
2966 	return;
2967 }
2968 
2969 DB_SHOW_COMMAND(vpath, db_show_vpath)
2970 {
2971 	struct vnode *vp;
2972 
2973 	if (!have_addr) {
2974 		db_printf("usage: show vpath <struct vnode *>\n");
2975 		return;
2976 	}
2977 
2978 	vp = (struct vnode *)addr;
2979 	db_print_vpath(vp);
2980 }
2981 
2982 #endif
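/*
 * Example ddb session (vnode addresses are made up):
 *
 *	db> show vpath 0xfffff80003becce8
 *	0xfffff80003becce8: passwd
 *	0xfffff80003b16760: etc
 *	0xfffff800038f69c0: /
 *
 * Each line is one step of the walk: the name comes from the first entry on
 * the vnode's v_cache_dst list, and the walk stops at the root vnode or
 * whenever the cache has no name for an intermediate directory.
 */
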
2983 
2984 extern uma_zone_t namei_zone;
2985 
2986 static bool __read_frequently cache_fast_lookup = true;
2987 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW,
2988     &cache_fast_lookup, 0, "");
2989 
2990 #define CACHE_FPL_FAILED	-2020
2991 
2992 static void
2993 cache_fpl_cleanup_cnp(struct componentname *cnp)
2994 {
2995 
2996 	uma_zfree(namei_zone, cnp->cn_pnbuf);
2997 #ifdef DIAGNOSTIC
2998 	cnp->cn_pnbuf = NULL;
2999 	cnp->cn_nameptr = NULL;
3000 #endif
3001 }
3002 
3003 static void
3004 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp)
3005 {
3006 	struct componentname *cnp;
3007 
3008 	cnp = &ndp->ni_cnd;
3009 	while (*(cnp->cn_nameptr) == '/') {
3010 		cnp->cn_nameptr++;
3011 		ndp->ni_pathlen--;
3012 	}
3013 
3014 	*dpp = ndp->ni_rootdir;
3015 }
3016 
3017 /*
3018  * Components of nameidata (or objects it can point to) which may
3019  * need restoring in case fast path lookup fails.
3020  */
3021 struct nameidata_saved {
3022 	long cn_namelen;
3023 	char *cn_nameptr;
3024 	size_t ni_pathlen;
3025 	int cn_flags;
3026 };
3027 
3028 struct cache_fpl {
3029 	struct nameidata *ndp;
3030 	struct componentname *cnp;
3031 	struct pwd *pwd;
3032 	struct vnode *dvp;
3033 	struct vnode *tvp;
3034 	seqc_t dvp_seqc;
3035 	seqc_t tvp_seqc;
3036 	struct nameidata_saved snd;
3037 	int line;
3038 	enum cache_fpl_status status:8;
3039 	bool in_smr;
3040 };
3041 
3042 static void
3043 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd)
3044 {
3045 
3046 	snd->cn_flags = fpl->ndp->ni_cnd.cn_flags;
3047 	snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen;
3048 	snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
3049 	snd->ni_pathlen = fpl->ndp->ni_pathlen;
3050 }
3051 
3052 static void
3053 cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd)
3054 {
3055 
3056 	fpl->ndp->ni_cnd.cn_flags = snd->cn_flags;
3057 	fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen;
3058 	fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr;
3059 	fpl->ndp->ni_pathlen = snd->ni_pathlen;
3060 }
3061 
3062 #ifdef INVARIANTS
3063 #define cache_fpl_smr_assert_entered(fpl) ({			\
3064 	struct cache_fpl *_fpl = (fpl);				\
3065 	MPASS(_fpl->in_smr == true);				\
3066 	VFS_SMR_ASSERT_ENTERED();				\
3067 })
3068 #define cache_fpl_smr_assert_not_entered(fpl) ({		\
3069 	struct cache_fpl *_fpl = (fpl);				\
3070 	MPASS(_fpl->in_smr == false);				\
3071 	VFS_SMR_ASSERT_NOT_ENTERED();				\
3072 })
3073 #else
3074 #define cache_fpl_smr_assert_entered(fpl) do { } while (0)
3075 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
3076 #endif
3077 
3078 #define cache_fpl_smr_enter_initial(fpl) ({			\
3079 	struct cache_fpl *_fpl = (fpl);				\
3080 	vfs_smr_enter();					\
3081 	_fpl->in_smr = true;					\
3082 })
3083 
3084 #define cache_fpl_smr_enter(fpl) ({				\
3085 	struct cache_fpl *_fpl = (fpl);				\
3086 	MPASS(_fpl->in_smr == false);				\
3087 	vfs_smr_enter();					\
3088 	_fpl->in_smr = true;					\
3089 })
3090 
3091 #define cache_fpl_smr_exit(fpl) ({				\
3092 	struct cache_fpl *_fpl = (fpl);				\
3093 	MPASS(_fpl->in_smr == true);				\
3094 	vfs_smr_exit();						\
3095 	_fpl->in_smr = false;					\
3096 })
3097 
3098 static int
3099 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
3100 {
3101 
3102 	if (fpl->status != CACHE_FPL_STATUS_UNSET) {
3103 		KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
3104 		    ("%s: converting to abort from %d at %d, set at %d\n",
3105 		    __func__, fpl->status, line, fpl->line));
3106 	}
3107 	fpl->status = CACHE_FPL_STATUS_ABORTED;
3108 	fpl->line = line;
3109 	return (CACHE_FPL_FAILED);
3110 }
3111 
3112 #define cache_fpl_aborted(x)	cache_fpl_aborted_impl((x), __LINE__)
3113 
3114 static int
3115 cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
3116 {
3117 
3118 	KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3119 	    ("%s: setting to partial at %d, but already set to %d at %d\n",
3120 	    __func__, line, fpl->status, fpl->line));
3121 	cache_fpl_smr_assert_entered(fpl);
3122 	fpl->status = CACHE_FPL_STATUS_PARTIAL;
3123 	fpl->line = line;
3124 	return (CACHE_FPL_FAILED);
3125 }
3126 
3127 #define cache_fpl_partial(x)	cache_fpl_partial_impl((x), __LINE__)
3128 
3129 static int
3130 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line)
3131 {
3132 
3133 	KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3134 	    ("%s: setting to handled at %d, but already set to %d at %d\n",
3135 	    __func__, line, fpl->status, fpl->line));
3136 	cache_fpl_smr_assert_not_entered(fpl);
3137 	MPASS(error != CACHE_FPL_FAILED);
3138 	fpl->status = CACHE_FPL_STATUS_HANDLED;
3139 	fpl->line = line;
3140 	return (error);
3141 }
3142 
3143 #define cache_fpl_handled(x, e)	cache_fpl_handled_impl((x), (e), __LINE__)
3144 
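/*
 * Taken together, the helpers above implement a small outcome protocol for
 * the fast path.  A minimal sketch of how a lookup step is expected to use
 * them (hypothetical caller, not code from this file):
 *
 *	if (ncp == NULL)
 *		return (cache_fpl_partial(fpl));
 *	if (!vn_seqc_consistent(dvp, dvp_seqc))
 *		return (cache_fpl_aborted(fpl));
 *	cache_fpl_smr_exit(fpl);
 *	return (cache_fpl_handled(fpl, 0));
 *
 * "partial" hands the remaining work to the regular lookup and may only be
 * declared while still inside the SMR section; "handled" means the lookup was
 * fully resolved and requires the section to have been left; "aborted" makes
 * the regular lookup restart from scratch.
 */
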
3145 #define CACHE_FPL_SUPPORTED_CN_FLAGS \
3146 	(LOCKLEAF | LOCKPARENT | WANTPARENT | FOLLOW | LOCKSHARED | SAVENAME | \
3147 	 ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2)
3148 
3149 #define CACHE_FPL_INTERNAL_CN_FLAGS \
3150 	(ISDOTDOT | MAKEENTRY | ISLASTCN)
3151 
3152 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
3153     "supported and internal flags overlap");
3154 
3155 static bool
3156 cache_fpl_islastcn(struct nameidata *ndp)
3157 {
3158 
3159 	return (*ndp->ni_next == 0);
3160 }
3161 
3162 static bool
3163 cache_fpl_isdotdot(struct componentname *cnp)
3164 {
3165 
3166 	if (cnp->cn_namelen == 2 &&
3167 	    cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
3168 		return (true);
3169 	return (false);
3170 }
3171 
3172 static bool
3173 cache_can_fplookup(struct cache_fpl *fpl)
3174 {
3175 	struct nameidata *ndp;
3176 	struct componentname *cnp;
3177 	struct thread *td;
3178 
3179 	ndp = fpl->ndp;
3180 	cnp = fpl->cnp;
3181 	td = cnp->cn_thread;
3182 
3183 	if (!cache_fast_lookup) {
3184 		cache_fpl_aborted(fpl);
3185 		return (false);
3186 	}
3187 #ifdef MAC
3188 	if (mac_vnode_check_lookup_enabled()) {
3189 		cache_fpl_aborted(fpl);
3190 		return (false);
3191 	}
3192 #endif
3193 	if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
3194 		cache_fpl_aborted(fpl);
3195 		return (false);
3196 	}
3197 	if (cnp->cn_nameiop != LOOKUP) {
3198 		cache_fpl_aborted(fpl);
3199 		return (false);
3200 	}
3201 	if (ndp->ni_dirfd != AT_FDCWD) {
3202 		cache_fpl_aborted(fpl);
3203 		return (false);
3204 	}
3205 	if (IN_CAPABILITY_MODE(td)) {
3206 		cache_fpl_aborted(fpl);
3207 		return (false);
3208 	}
3209 	if (AUDITING_TD(td)) {
3210 		cache_fpl_aborted(fpl);
3211 		return (false);
3212 	}
3213 	if (ndp->ni_startdir != NULL) {
3214 		cache_fpl_aborted(fpl);
3215 		return (false);
3216 	}
3217 	return (true);
3218 }
3219 
3220 static bool
3221 cache_fplookup_vnode_supported(struct vnode *vp)
3222 {
3223 
3224 	return (vp->v_type != VLNK);
3225 }
3226 
3227 /*
3228  * Move a negative entry to the hot list.
3229  *
3230  * We have to take locks, but they may be contended and in the worst
3231  * case we may need to go off CPU. We don't want to spin within the
3232  * smr section and we can't block with it. Instead we are going to
3233  * look up the entry again.
3234  */
3235 static int __noinline
3236 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
3237     uint32_t hash)
3238 {
3239 	struct componentname *cnp;
3240 	struct namecache *ncp;
3241 	struct neglist *neglist;
3242 	struct negstate *negstate;
3243 	struct vnode *dvp;
3244 	u_char nc_flag;
3245 
3246 	cnp = fpl->cnp;
3247 	dvp = fpl->dvp;
3248 
3249 	if (!vhold_smr(dvp))
3250 		return (cache_fpl_aborted(fpl));
3251 
3252 	neglist = NCP2NEGLIST(oncp);
3253 	cache_fpl_smr_exit(fpl);
3254 
3255 	mtx_lock(&ncneg_hot.nl_lock);
3256 	mtx_lock(&neglist->nl_lock);
3257 	/*
3258 	 * For hash iteration.
3259 	 */
3260 	cache_fpl_smr_enter(fpl);
3261 
3262 	/*
3263 	 * Avoid all surprises by only succeeding if we got the same entry and
3264 	 * bailing completely otherwise.
3265 	 *
3266 	 * In particular at this point there can be a new ncp which matches the
3267 	 * search but hashes to a different neglist.
3268 	 */
3269 	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
3270 		if (ncp == oncp)
3271 			break;
3272 	}
3273 
3274 	/*
3275 	 * No match to begin with.
3276 	 */
3277 	if (__predict_false(ncp == NULL)) {
3278 		goto out_abort;
3279 	}
3280 
3281 	/*
3282 	 * The newly found entry may be something different...
3283 	 */
3284 	if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
3285 	    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
3286 		goto out_abort;
3287 	}
3288 
3289 	/*
3290 	 * ... and not even negative.
3291 	 */
3292 	nc_flag = atomic_load_char(&ncp->nc_flag);
3293 	if ((nc_flag & NCF_NEGATIVE) == 0) {
3294 		goto out_abort;
3295 	}
3296 
3297 	if (__predict_false(!cache_ncp_canuse(ncp))) {
3298 		goto out_abort;
3299 	}
3300 
3301 	negstate = NCP2NEGSTATE(ncp);
3302 	if ((negstate->neg_flag & NEG_HOT) == 0) {
3303 		numhotneg++;
3304 		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
3305 		TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
3306 		negstate->neg_flag |= NEG_HOT;
3307 	}
3308 
3309 	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
3310 	counter_u64_add(numneghits, 1);
3311 	cache_fpl_smr_exit(fpl);
3312 	mtx_unlock(&neglist->nl_lock);
3313 	mtx_unlock(&ncneg_hot.nl_lock);
3314 	vdrop(dvp);
3315 	return (cache_fpl_handled(fpl, ENOENT));
3316 out_abort:
3317 	cache_fpl_smr_exit(fpl);
3318 	mtx_unlock(&neglist->nl_lock);
3319 	mtx_unlock(&ncneg_hot.nl_lock);
3320 	vdrop(dvp);
3321 	return (cache_fpl_aborted(fpl));
3322 }
3323 
3324 /*
3325  * The target vnode is not supported, prepare for the slow path to take over.
3326  */
3327 static int __noinline
3328 cache_fplookup_partial_setup(struct cache_fpl *fpl)
3329 {
3330 	struct nameidata *ndp;
3331 	struct componentname *cnp;
3332 	enum vgetstate dvs;
3333 	struct vnode *dvp;
3334 	struct pwd *pwd;
3335 	seqc_t dvp_seqc;
3336 
3337 	ndp = fpl->ndp;
3338 	cnp = fpl->cnp;
3339 	dvp = fpl->dvp;
3340 	dvp_seqc = fpl->dvp_seqc;
3341 
3342 	dvs = vget_prep_smr(dvp);
3343 	if (__predict_false(dvs == VGET_NONE)) {
3344 		cache_fpl_smr_exit(fpl);
3345 		return (cache_fpl_aborted(fpl));
3346 	}
3347 
3348 	cache_fpl_smr_exit(fpl);
3349 
3350 	vget_finish_ref(dvp, dvs);
3351 	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3352 		vrele(dvp);
3353 		return (cache_fpl_aborted(fpl));
3354 	}
3355 
3356 	pwd = pwd_hold(curthread);
3357 	if (fpl->pwd != pwd) {
3358 		vrele(dvp);
3359 		pwd_drop(pwd);
3360 		return (cache_fpl_aborted(fpl));
3361 	}
3362 
3363 	cache_fpl_restore(fpl, &fpl->snd);
3364 
3365 	ndp->ni_startdir = dvp;
3366 	cnp->cn_flags |= MAKEENTRY;
3367 	if (cache_fpl_islastcn(ndp))
3368 		cnp->cn_flags |= ISLASTCN;
3369 	if (cache_fpl_isdotdot(cnp))
3370 		cnp->cn_flags |= ISDOTDOT;
3371 
3372 	return (0);
3373 }
3374 
3375 static int
3376 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs)
3377 {
3378 	struct componentname *cnp;
3379 	struct vnode *tvp;
3380 	seqc_t tvp_seqc;
3381 	int error, lkflags;
3382 
3383 	cnp = fpl->cnp;
3384 	tvp = fpl->tvp;
3385 	tvp_seqc = fpl->tvp_seqc;
3386 
3387 	if ((cnp->cn_flags & LOCKLEAF) != 0) {
3388 		lkflags = LK_SHARED;
3389 		if ((cnp->cn_flags & LOCKSHARED) == 0)
3390 			lkflags = LK_EXCLUSIVE;
3391 		error = vget_finish(tvp, lkflags, tvs);
3392 		if (error != 0) {
3393 			return (cache_fpl_aborted(fpl));
3394 		}
3395 	} else {
3396 		vget_finish_ref(tvp, tvs);
3397 	}
3398 
3399 	if (!vn_seqc_consistent(tvp, tvp_seqc)) {
3400 		if ((cnp->cn_flags & LOCKLEAF) != 0)
3401 			vput(tvp);
3402 		else
3403 			vrele(tvp);
3404 		return (cache_fpl_aborted(fpl));
3405 	}
3406 
3407 	return (cache_fpl_handled(fpl, 0));
3408 }
3409 
3410 static int __noinline
3411 cache_fplookup_final_withparent(struct cache_fpl *fpl)
3412 {
3413 	struct componentname *cnp;
3414 	enum vgetstate dvs, tvs;
3415 	struct vnode *dvp, *tvp;
3416 	seqc_t dvp_seqc, tvp_seqc;
3417 	int error;
3418 
3419 	cnp = fpl->cnp;
3420 	dvp = fpl->dvp;
3421 	dvp_seqc = fpl->dvp_seqc;
3422 	tvp = fpl->tvp;
3423 	tvp_seqc = fpl->tvp_seqc;
3424 
3425 	MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0);
3426 
3427 	/*
3428 	 * For simplicity, this is less efficient than it could be.
3429 	 */
3430 	dvs = vget_prep_smr(dvp);
3431 	if (__predict_false(dvs == VGET_NONE)) {
3432 		return (cache_fpl_aborted(fpl));
3433 	}
3434 	tvs = vget_prep_smr(tvp);
3435 	if (__predict_false(tvs == VGET_NONE)) {
3436 		cache_fpl_smr_exit(fpl);
3437 		vget_abort(dvp, dvs);
3438 		return (cache_fpl_aborted(fpl));
3439 	}
3440 
3441 	cache_fpl_smr_exit(fpl);
3442 
3443 	if ((cnp->cn_flags & LOCKPARENT) != 0) {
3444 		error = vget_finish(dvp, LK_EXCLUSIVE, dvs);
3445 		if (error != 0) {
3446 			vget_abort(tvp, tvs);
3447 			return (cache_fpl_aborted(fpl));
3448 		}
3449 	} else {
3450 		vget_finish_ref(dvp, dvs);
3451 	}
3452 
3453 	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3454 		vget_abort(tvp, tvs);
3455 		if ((cnp->cn_flags & LOCKPARENT) != 0)
3456 			vput(dvp);
3457 		else
3458 			vrele(dvp);
3459 		cache_fpl_aborted(fpl);
3460 		return (error);
3461 	}
3462 
3463 	error = cache_fplookup_final_child(fpl, tvs);
3464 	if (error != 0) {
3465 		MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED);
3466 		if ((cnp->cn_flags & LOCKPARENT) != 0)
3467 			vput(dvp);
3468 		else
3469 			vrele(dvp);
3470 		return (error);
3471 	}
3472 
3473 	MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED);
3474 	return (0);
3475 }
3476 
3477 static int
3478 cache_fplookup_final(struct cache_fpl *fpl)
3479 {
3480 	struct componentname *cnp;
3481 	enum vgetstate tvs;
3482 	struct vnode *dvp, *tvp;
3483 	seqc_t dvp_seqc, tvp_seqc;
3484 
3485 	cnp = fpl->cnp;
3486 	dvp = fpl->dvp;
3487 	dvp_seqc = fpl->dvp_seqc;
3488 	tvp = fpl->tvp;
3489 	tvp_seqc = fpl->tvp_seqc;
3490 
3491 	VNPASS(cache_fplookup_vnode_supported(dvp), dvp);
3492 
3493 	if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0)
3494 		return (cache_fplookup_final_withparent(fpl));
3495 
3496 	tvs = vget_prep_smr(tvp);
3497 	if (__predict_false(tvs == VGET_NONE)) {
3498 		return (cache_fpl_partial(fpl));
3499 	}
3500 
3501 	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3502 		cache_fpl_smr_exit(fpl);
3503 		vget_abort(tvp, tvs);
3504 		return (cache_fpl_aborted(fpl));
3505 	}
3506 
3507 	cache_fpl_smr_exit(fpl);
3508 	return (cache_fplookup_final_child(fpl, tvs));
3509 }
3510 
3511 static int __noinline
3512 cache_fplookup_dot(struct cache_fpl *fpl)
3513 {
3514 	struct vnode *dvp;
3515 
3516 	dvp = fpl->dvp;
3517 
3518 	fpl->tvp = dvp;
3519 	fpl->tvp_seqc = vn_seqc_read_any(dvp);
3520 	if (seqc_in_modify(fpl->tvp_seqc)) {
3521 		return (cache_fpl_aborted(fpl));
3522 	}
3523 
3524 	counter_u64_add(dothits, 1);
3525 	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", dvp);
3526 
3527 	return (0);
3528 }
3529 
3530 static int __noinline
3531 cache_fplookup_dotdot(struct cache_fpl *fpl)
3532 {
3533 	struct nameidata *ndp;
3534 	struct componentname *cnp;
3535 	struct namecache *ncp;
3536 	struct vnode *dvp;
3537 	struct prison *pr;
3538 	u_char nc_flag;
3539 
3540 	ndp = fpl->ndp;
3541 	cnp = fpl->cnp;
3542 	dvp = fpl->dvp;
3543 
3544 	/*
3545 	 * XXX this is racy the same way regular lookup is
3546 	 */
3547 	for (pr = cnp->cn_cred->cr_prison; pr != NULL;
3548 	    pr = pr->pr_parent)
3549 		if (dvp == pr->pr_root)
3550 			break;
3551 
3552 	if (dvp == ndp->ni_rootdir ||
3553 	    dvp == ndp->ni_topdir ||
3554 	    dvp == rootvnode ||
3555 	    pr != NULL) {
3556 		fpl->tvp = dvp;
3557 		fpl->tvp_seqc = vn_seqc_read_any(dvp);
3558 		if (seqc_in_modify(fpl->tvp_seqc)) {
3559 			return (cache_fpl_aborted(fpl));
3560 		}
3561 		return (0);
3562 	}
3563 
3564 	if ((dvp->v_vflag & VV_ROOT) != 0) {
3565 		/*
3566 		 * TODO
3567 		 * The opposite of climb mount is needed here.
3568 		 */
3569 		return (cache_fpl_aborted(fpl));
3570 	}
3571 
3572 	ncp = atomic_load_ptr(&dvp->v_cache_dd);
3573 	if (ncp == NULL) {
3574 		return (cache_fpl_aborted(fpl));
3575 	}
3576 
3577 	nc_flag = atomic_load_char(&ncp->nc_flag);
3578 	if ((nc_flag & NCF_ISDOTDOT) != 0) {
3579 		if ((nc_flag & NCF_NEGATIVE) != 0)
3580 			return (cache_fpl_aborted(fpl));
3581 		fpl->tvp = ncp->nc_vp;
3582 	} else {
3583 		fpl->tvp = ncp->nc_dvp;
3584 	}
3585 
3586 	if (__predict_false(!cache_ncp_canuse(ncp))) {
3587 		return (cache_fpl_aborted(fpl));
3588 	}
3589 
3590 	fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp);
3591 	if (seqc_in_modify(fpl->tvp_seqc)) {
3592 		return (cache_fpl_partial(fpl));
3593 	}
3594 
3595 	counter_u64_add(dotdothits, 1);
3596 	return (0);
3597 }
3598 
3599 static int
3600 cache_fplookup_next(struct cache_fpl *fpl)
3601 {
3602 	struct componentname *cnp;
3603 	struct namecache *ncp;
3604 	struct negstate *negstate;
3605 	struct vnode *dvp, *tvp;
3606 	u_char nc_flag;
3607 	uint32_t hash;
3608 	bool neg_hot;
3609 
3610 	cnp = fpl->cnp;
3611 	dvp = fpl->dvp;
3612 
3613 	if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) {
3614 		return (cache_fplookup_dot(fpl));
3615 	}
3616 
3617 	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
3618 
3619 	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
3620 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
3621 		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
3622 			break;
3623 	}
3624 
3625 	/*
3626 	 * If there is no entry we have to punt to the slow path to perform
3627 	 * the actual lookup. Should there be nothing with this name, a negative
3628 	 * entry will be created.
3629 	 */
3630 	if (__predict_false(ncp == NULL)) {
3631 		return (cache_fpl_partial(fpl));
3632 	}
3633 
3634 	tvp = atomic_load_ptr(&ncp->nc_vp);
3635 	nc_flag = atomic_load_char(&ncp->nc_flag);
3636 	if ((nc_flag & NCF_NEGATIVE) != 0) {
3637 		negstate = NCP2NEGSTATE(ncp);
3638 		neg_hot = ((negstate->neg_flag & NEG_HOT) != 0);
3639 		if (__predict_false(!cache_ncp_canuse(ncp))) {
3640 			return (cache_fpl_partial(fpl));
3641 		}
3642 		if (__predict_false((nc_flag & NCF_WHITE) != 0)) {
3643 			return (cache_fpl_partial(fpl));
3644 		}
3645 		if (!neg_hot) {
3646 			return (cache_fplookup_negative_promote(fpl, ncp, hash));
3647 		}
3648 		SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
3649 		    ncp->nc_name);
3650 		counter_u64_add(numneghits, 1);
3651 		cache_fpl_smr_exit(fpl);
3652 		return (cache_fpl_handled(fpl, ENOENT));
3653 	}
3654 
3655 	if (__predict_false(!cache_ncp_canuse(ncp))) {
3656 		return (cache_fpl_partial(fpl));
3657 	}
3658 
3659 	fpl->tvp = tvp;
3660 	fpl->tvp_seqc = vn_seqc_read_any(tvp);
3661 	if (seqc_in_modify(fpl->tvp_seqc)) {
3662 		return (cache_fpl_partial(fpl));
3663 	}
3664 
3665 	if (!cache_fplookup_vnode_supported(tvp)) {
3666 		return (cache_fpl_partial(fpl));
3667 	}
3668 
3669 	counter_u64_add(numposhits, 1);
3670 	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
3671 	return (0);
3672 }
3673 
3674 static bool
3675 cache_fplookup_mp_supported(struct mount *mp)
3676 {
3677 
3678 	if (mp == NULL)
3679 		return (false);
3680 	if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
3681 		return (false);
3682 	if ((mp->mnt_flag & MNT_UNION) != 0)
3683 		return (false);
3684 	return (true);
3685 }
3686 
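/*
 * Filesystems opt into the fast path per mount.  A hedged sketch of what
 * that typically involves (hypothetical "foofs"; the exact spelling in a
 * real filesystem may differ):
 *
 *	In the vop vector:
 *		.vop_fplookup_vexec =	foofs_fplookup_vexec,
 *
 *	In the mount routine, once the mount is ready:
 *		MNT_ILOCK(mp);
 *		mp->mnt_kern_flag |= MNTK_FPLOOKUP;
 *		MNT_IUNLOCK(mp);
 *
 * cache_fplookup_mp_supported() below only checks the flag; the VOP is what
 * allows permission checks to run without taking vnode locks.
 */
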
3687 /*
3688  * Walk up the mount stack (if any).
3689  *
3690  * Correctness is provided in the following ways:
3691  * - all vnodes are protected from freeing with SMR
3692  * - struct mount objects are type stable making them always safe to access
3693  * - stability of the particular mount is provided by busying it
3694  * - relationship between the vnode which is mounted on and the mount is
3695  *   verified with the vnode sequence counter after busying
3696  * - association between root vnode of the mount and the mount is protected
3697  *   by busy
3698  *
3699  * From that point on we can read the sequence counter of the root vnode
3700  * and get the next mount on the stack (if any) using the same protection.
3701  *
3702  * By the end of a successful walk we are guaranteed the reached state was
3703  * indeed present at some point, matching the guarantee of the regular lookup.
3704  */
3705 static int __noinline
3706 cache_fplookup_climb_mount(struct cache_fpl *fpl)
3707 {
3708 	struct mount *mp, *prev_mp;
3709 	struct vnode *vp;
3710 	seqc_t vp_seqc;
3711 
3712 	vp = fpl->tvp;
3713 	vp_seqc = fpl->tvp_seqc;
3714 
3715 	VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp);
3716 	mp = atomic_load_ptr(&vp->v_mountedhere);
3717 	if (mp == NULL)
3718 		return (0);
3719 
3720 	prev_mp = NULL;
3721 	for (;;) {
3722 		if (!vfs_op_thread_enter_crit(mp)) {
3723 			if (prev_mp != NULL)
3724 				vfs_op_thread_exit_crit(prev_mp);
3725 			return (cache_fpl_partial(fpl));
3726 		}
3727 		if (prev_mp != NULL)
3728 			vfs_op_thread_exit_crit(prev_mp);
3729 		if (!vn_seqc_consistent(vp, vp_seqc)) {
3730 			vfs_op_thread_exit_crit(mp);
3731 			return (cache_fpl_partial(fpl));
3732 		}
3733 		if (!cache_fplookup_mp_supported(mp)) {
3734 			vfs_op_thread_exit_crit(mp);
3735 			return (cache_fpl_partial(fpl));
3736 		}
3737 		vp = atomic_load_ptr(&mp->mnt_rootvnode);
3738 		if (vp == NULL || VN_IS_DOOMED(vp)) {
3739 			vfs_op_thread_exit_crit(mp);
3740 			return (cache_fpl_partial(fpl));
3741 		}
3742 		vp_seqc = vn_seqc_read_any(vp);
3743 		if (seqc_in_modify(vp_seqc)) {
3744 			vfs_op_thread_exit_crit(mp);
3745 			return (cache_fpl_partial(fpl));
3746 		}
3747 		prev_mp = mp;
3748 		mp = atomic_load_ptr(&vp->v_mountedhere);
3749 		if (mp == NULL)
3750 			break;
3751 	}
3752 
3753 	vfs_op_thread_exit_crit(prev_mp);
3754 	fpl->tvp = vp;
3755 	fpl->tvp_seqc = vp_seqc;
3756 	return (0);
3757 }
3758 
3759 static bool
3760 cache_fplookup_need_climb_mount(struct cache_fpl *fpl)
3761 {
3762 	struct mount *mp;
3763 	struct vnode *vp;
3764 
3765 	vp = fpl->tvp;
3766 
3767 	/*
3768 	 * Hack: while v_mountedhere is in a union, the pointer tends to be
3769 	 * NULL, so checking it first saves a branch in the common case.
3770 	 */
3771 	mp = atomic_load_ptr(&vp->v_mountedhere);
3772 	if (mp == NULL)
3773 		return (false);
3774 	if (vp->v_type == VDIR)
3775 		return (true);
3776 	return (false);
3777 }
3778 
3779 /*
3780  * Parse the path.
3781  *
3782  * The code is mostly copy-pasted from regular lookup, see lookup().
3783  * The structure is maintained along with comments for easier maintenance.
3784  * Deduplicating the code will become feasible after fast path lookup
3785  * becomes more feature-complete.
3786  */
3787 static int
3788 cache_fplookup_parse(struct cache_fpl *fpl)
3789 {
3790 	struct nameidata *ndp;
3791 	struct componentname *cnp;
3792 	char *cp;
3793 	char *prev_ni_next;             /* saved ndp->ni_next */
3794 	size_t prev_ni_pathlen;         /* saved ndp->ni_pathlen */
3795 
3796 	ndp = fpl->ndp;
3797 	cnp = fpl->cnp;
3798 
3799 	/*
3800 	 * Search a new directory.
3801 	 *
3802 	 * The last component of the filename is left accessible via
3803 	 * cnp->cn_nameptr for callers that need the name. Callers needing
3804 	 * the name set the SAVENAME flag. When done, they assume
3805 	 * responsibility for freeing the pathname buffer.
3806 	 */
3807 	for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
3808 		continue;
3809 	cnp->cn_namelen = cp - cnp->cn_nameptr;
3810 	if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
3811 		cache_fpl_smr_exit(fpl);
3812 		return (cache_fpl_handled(fpl, ENAMETOOLONG));
3813 	}
3814 	prev_ni_pathlen = ndp->ni_pathlen;
3815 	ndp->ni_pathlen -= cnp->cn_namelen;
3816 	KASSERT(ndp->ni_pathlen <= PATH_MAX,
3817 	    ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
3818 	prev_ni_next = ndp->ni_next;
3819 	ndp->ni_next = cp;
3820 
3821 	/*
3822 	 * Replace multiple slashes by a single slash and trailing slashes
3823 	 * by a null.  This must be done before VOP_LOOKUP() because some
3824 	 * fs's don't know about trailing slashes.  Remember if there were
3825 	 * trailing slashes to handle symlinks, existing non-directories
3826 	 * and non-existing files that won't be directories specially later.
3827 	 */
3828 	while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
3829 		cp++;
3830 		ndp->ni_pathlen--;
3831 		if (*cp == '\0') {
3832 			/*
3833 			 * TODO
3834 			 * Regular lookup performs the following:
3835 			 * *ndp->ni_next = '\0';
3836 			 * cnp->cn_flags |= TRAILINGSLASH;
3837 			 *
3838 			 * Which is problematic since it modifies data read
3839 			 * from userspace. Then if fast path lookup was to
3840 			 * abort we would have to either restore it or convey
3841 			 * the flag. Since this is a corner case just ignore
3842 			 * it for simplicity.
3843 			 */
3844 			return (cache_fpl_partial(fpl));
3845 		}
3846 	}
3847 	ndp->ni_next = cp;
3848 
3849 	/*
3850 	 * Check for a degenerate name (e.g. "/" or ""),
3851 	 * which is a way of referring to the directory itself,
3852 	 * like "/." or ".".
3853 	 *
3854 	 * TODO
3855 	 * This is another corner case handled by the regular lookup.
3856 	 */
3857 	if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
3858 		return (cache_fpl_partial(fpl));
3859 	}
3860 	return (0);
3861 }
3862 
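/*
 * Move cn_nameptr past the slash(es) separating the current component from
 * the next one, keeping ni_pathlen in sync.
 */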
3863 static void
3864 cache_fplookup_parse_advance(struct cache_fpl *fpl)
3865 {
3866 	struct nameidata *ndp;
3867 	struct componentname *cnp;
3868 
3869 	ndp = fpl->ndp;
3870 	cnp = fpl->cnp;
3871 
3872 	cnp->cn_nameptr = ndp->ni_next;
3873 	while (*cnp->cn_nameptr == '/') {
3874 		cnp->cn_nameptr++;
3875 		ndp->ni_pathlen--;
3876 	}
3877 }
3878 
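/*
 * Handle a failed VOP_FPLOOKUP_VEXEC check on the current directory.
 *
 * EAGAIN and EOPNOTSUPP mean the check could not be performed and the lookup
 * falls back to the regular code.  Any other error is definitive, but only if
 * the directory's sequence counter is still consistent; otherwise the fast
 * path lookup is aborted.
 */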
3879 static int __noinline
3880 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
3881 {
3882 
3883 	switch (error) {
3884 	case EAGAIN:
3885 		/*
3886 		 * Can happen when racing against vgone.
3887 		 */
3888 	case EOPNOTSUPP:
3889 		cache_fpl_partial(fpl);
3890 		break;
3891 	default:
3892 		/*
3893 		 * See the API contract for VOP_FPLOOKUP_VEXEC.
3894 		 */
3895 		if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
3896 			error = cache_fpl_aborted(fpl);
3897 		} else {
3898 			cache_fpl_smr_exit(fpl);
3899 			cache_fpl_handled(fpl, error);
3900 		}
3901 		break;
3902 	}
3903 	return (error);
3904 }
3905 
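/*
 * The main lookup loop.  Each iteration parses one component, verifies that
 * the current directory grants execute permission, resolves the component
 * (".." or a regular name, crossing mount points as needed) and, unless this
 * was the last component, re-checks the directory's sequence counter before
 * descending into the result.
 */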
3906 static int
3907 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
3908 {
3909 	struct nameidata *ndp;
3910 	struct componentname *cnp;
3911 	struct mount *mp;
3912 	int error;
3913 
3914 	error = CACHE_FPL_FAILED;
3915 	ndp = fpl->ndp;
3916 	cnp = fpl->cnp;
3917 
3918 	cache_fpl_checkpoint(fpl, &fpl->snd);
3919 
3920 	fpl->dvp = dvp;
3921 	fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
3922 	if (seqc_in_modify(fpl->dvp_seqc)) {
3923 		cache_fpl_aborted(fpl);
3924 		goto out;
3925 	}
3926 	mp = atomic_load_ptr(&fpl->dvp->v_mount);
3927 	if (!cache_fplookup_mp_supported(mp)) {
3928 		cache_fpl_aborted(fpl);
3929 		goto out;
3930 	}
3931 
3932 	VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
3933 
3934 	for (;;) {
3935 		error = cache_fplookup_parse(fpl);
3936 		if (__predict_false(error != 0)) {
3937 			break;
3938 		}
3939 
3940 		VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
3941 
3942 		error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred, cnp->cn_thread);
3943 		if (__predict_false(error != 0)) {
3944 			error = cache_fplookup_failed_vexec(fpl, error);
3945 			break;
3946 		}
3947 
3948 		if (__predict_false(cache_fpl_isdotdot(cnp))) {
3949 			error = cache_fplookup_dotdot(fpl);
3950 			if (__predict_false(error != 0)) {
3951 				break;
3952 			}
3953 		} else {
3954 			error = cache_fplookup_next(fpl);
3955 			if (__predict_false(error != 0)) {
3956 				break;
3957 			}
3958 
3959 			VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
3960 
3961 			if (cache_fplookup_need_climb_mount(fpl)) {
3962 				error = cache_fplookup_climb_mount(fpl);
3963 				if (__predict_false(error != 0)) {
3964 					break;
3965 				}
3966 			}
3967 		}
3968 
3969 		VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
3970 
3971 		if (cache_fpl_islastcn(ndp)) {
3972 			error = cache_fplookup_final(fpl);
3973 			break;
3974 		}
3975 
3976 		if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
3977 			error = cache_fpl_aborted(fpl);
3978 			break;
3979 		}
3980 
3981 		fpl->dvp = fpl->tvp;
3982 		fpl->dvp_seqc = fpl->tvp_seqc;
3983 
3984 		cache_fplookup_parse_advance(fpl);
3985 		cache_fpl_checkpoint(fpl, &fpl->snd);
3986 	}
3987 out:
3988 	switch (fpl->status) {
3989 	case CACHE_FPL_STATUS_UNSET:
3990 		__assert_unreachable();
3991 		break;
3992 	case CACHE_FPL_STATUS_PARTIAL:
3993 		cache_fpl_smr_assert_entered(fpl);
3994 		return (cache_fplookup_partial_setup(fpl));
3995 	case CACHE_FPL_STATUS_ABORTED:
3996 		if (fpl->in_smr)
3997 			cache_fpl_smr_exit(fpl);
3998 		return (CACHE_FPL_FAILED);
3999 	case CACHE_FPL_STATUS_HANDLED:
4000 		MPASS(error != CACHE_FPL_FAILED);
4001 		cache_fpl_smr_assert_not_entered(fpl);
4002 		if (__predict_false(error != 0)) {
4003 			ndp->ni_dvp = NULL;
4004 			ndp->ni_vp = NULL;
4005 			cache_fpl_cleanup_cnp(cnp);
4006 			return (error);
4007 		}
4008 		ndp->ni_dvp = fpl->dvp;
4009 		ndp->ni_vp = fpl->tvp;
4010 		if (cnp->cn_flags & SAVENAME)
4011 			cnp->cn_flags |= HASBUF;
4012 		else
4013 			cache_fpl_cleanup_cnp(cnp);
4014 		return (error);
4015 	}
4016 }
4017 
4018 /*
4019  * Fast path lookup protected with SMR and sequence counters.
4020  *
4021  * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
4022  *
4023  * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting the
4024  * criteria outlined below.
4025  *
4026  * Traditional vnode lookup conceptually looks like this:
4027  *
4028  * vn_lock(current);
4029  * for (;;) {
4030  *	next = find();
4031  *	vn_lock(next);
4032  *	vn_unlock(current);
4033  *	current = next;
4034  *	if (last)
4035  *	    break;
4036  * }
4037  * return (current);
4038  *
4039  * Each jump to the next vnode is memory-safe and atomic with respect to any
4040  * modifications, thanks to holding the respective locks.
4041  *
4042  * The same guarantee can be provided with a combination of safe memory
4043  * reclamation and sequence counters instead. If all operations which affect
4044  * the relationship between the current vnode and the one we are looking for
4045  * also modify the counter, we can verify whether all the conditions held as
4046  * we made the jump. This includes things like permissions, mount points etc.
4047  * Counter modification is provided by enclosing relevant places in
4048  * vn_seqc_write_begin()/end() calls.
4049  *
4050  * Thus this translates to:
4051  *
4052  * vfs_smr_enter();
4053  * dvp_seqc = seqc_read_any(dvp);
4054  * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
4055  *     abort();
4056  * for (;;) {
4057  * 	tvp = find();
4058  * 	tvp_seqc = seqc_read_any(tvp);
4059  * 	if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
4060  * 	    abort();
4061  * 	if (!seqc_consistent(dvp, dvp_seqc)) // someone is altering the vnode
4062  * 	    abort();
4063  * 	dvp = tvp; // we know nothing of importance has changed
4064  * 	dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
4065  * 	if (last)
4066  * 	    break;
4067  * }
4068  * vget(); // secure the vnode
4069  * if (!seqc_consistent(tvp, tvp_seqc)) // final check
4070  * 	    abort();
4071  * // at this point we know nothing has changed for any parent<->child pair
4072  * // as they were crossed during the lookup, meaning we matched the guarantee
4073  * // of the locked variant
4074  * return (tvp);
4075  *
4076  * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
4077  * - they are called while within vfs_smr protection which they must never exit
4078  * - EAGAIN can be returned to denote that the check could not be performed;
4079  *   it is always valid to return it
4080  * - if the sequence counter has not changed the result must be valid
4081  * - if the sequence counter has changed, both false positives and false
4082  *   negatives are permitted (since the result will be rejected later)
4083  * - for simple unix permission checks vaccess_vexec_smr can be used (see below)
4084  *
4085  * Caveats to watch out for:
4086  * - vnodes are passed unlocked and unreferenced with nothing stopping
4087  *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
4088  *   to use atomic_load_ptr to fetch it.
4089  * - the aforementioned object can also get freed, meaning that unless some
4090  *   other mechanism protects it, it should be protected with vfs_smr
4091  * - either safely checking permissions as they are modified or guaranteeing
4092  *   their stability is left to the routine
4093  */
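/*
 * As an illustration of the above contract, a minimal VOP_FPLOOKUP_VEXEC
 * routine for a hypothetical filesystem ("foofs", with made-up names) whose
 * per-vnode data is vfs_smr protected and stores plain POSIX ownership and
 * mode might look roughly like this:
 *
 *	static int
 *	foofs_fplookup_vexec(struct vop_fplookup_vexec_args *v)
 *	{
 *		struct vnode *vp = v->a_vp;
 *		struct foonode *fn;
 *
 *		fn = atomic_load_ptr(&vp->v_data);
 *		if (__predict_false(fn == NULL))
 *			return (EAGAIN); // racing against VOP_RECLAIM
 *		return (vaccess_vexec_smr(fn->fn_mode, fn->fn_uid, fn->fn_gid,
 *		    v->a_cred));
 *	}
 */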
4094 int
4095 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
4096     struct pwd **pwdp)
4097 {
4098 	struct cache_fpl fpl;
4099 	struct pwd *pwd;
4100 	struct vnode *dvp;
4101 	struct componentname *cnp;
4102 	struct nameidata_saved orig;
4103 	int error;
4104 
4105 	MPASS(ndp->ni_lcf == 0);
4106 
4107 	fpl.status = CACHE_FPL_STATUS_UNSET;
4108 	fpl.ndp = ndp;
4109 	fpl.cnp = &ndp->ni_cnd;
4110 	MPASS(curthread == fpl.cnp->cn_thread);
4111 
4112 	if (!cache_can_fplookup(&fpl)) {
4113 		SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
4114 		*status = fpl.status;
4115 		return (EOPNOTSUPP);
4116 	}
4117 
4118 	cache_fpl_checkpoint(&fpl, &orig);
4119 
4120 	cache_fpl_smr_enter_initial(&fpl);
4121 	pwd = pwd_get_smr();
4122 	fpl.pwd = pwd;
4123 	ndp->ni_rootdir = pwd->pwd_rdir;
4124 	ndp->ni_topdir = pwd->pwd_jdir;
4125 
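	/*
	 * Select the starting directory: the root directory for absolute
	 * paths, otherwise the current working directory (only AT_FDCWD
	 * relative lookups make it this far, as asserted below).
	 */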
4126 	cnp = fpl.cnp;
4127 	cnp->cn_nameptr = cnp->cn_pnbuf;
4128 	if (cnp->cn_pnbuf[0] == '/') {
4129 		cache_fpl_handle_root(ndp, &dvp);
4130 	} else {
4131 		MPASS(ndp->ni_dirfd == AT_FDCWD);
4132 		dvp = pwd->pwd_cdir;
4133 	}
4134 
4135 	SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
4136 
4137 	error = cache_fplookup_impl(dvp, &fpl);
4138 	cache_fpl_smr_assert_not_entered(&fpl);
4139 	SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
4140 
4141 	*status = fpl.status;
4142 	switch (fpl.status) {
4143 	case CACHE_FPL_STATUS_UNSET:
4144 		__assert_unreachable();
4145 		break;
4146 	case CACHE_FPL_STATUS_HANDLED:
4147 		SDT_PROBE3(vfs, namei, lookup, return, error,
4148 		    (error == 0 ? ndp->ni_vp : NULL), true);
4149 		break;
4150 	case CACHE_FPL_STATUS_PARTIAL:
4151 		*pwdp = fpl.pwd;
4152 		/*
4153 		 * Status restored by cache_fplookup_partial_setup.
4154 		 */
4155 		break;
4156 	case CACHE_FPL_STATUS_ABORTED:
4157 		cache_fpl_restore(&fpl, &orig);
4158 		break;
4159 	}
4160 	return (error);
4161 }
4162