xref: /freebsd/sys/kern/vfs_cache.c (revision 9e5787d2284e187abb5b654d924394a65772e004)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1989, 1993, 1995
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * Poul-Henning Kamp of the FreeBSD Project.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include "opt_ddb.h"
41 #include "opt_ktrace.h"
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/capsicum.h>
46 #include <sys/counter.h>
47 #include <sys/filedesc.h>
48 #include <sys/fnv_hash.h>
49 #include <sys/kernel.h>
50 #include <sys/ktr.h>
51 #include <sys/lock.h>
52 #include <sys/malloc.h>
53 #include <sys/fcntl.h>
54 #include <sys/jail.h>
55 #include <sys/mount.h>
56 #include <sys/namei.h>
57 #include <sys/proc.h>
58 #include <sys/rwlock.h>
59 #include <sys/seqc.h>
60 #include <sys/sdt.h>
61 #include <sys/smr.h>
62 #include <sys/smp.h>
63 #include <sys/syscallsubr.h>
64 #include <sys/sysctl.h>
65 #include <sys/sysproto.h>
66 #include <sys/vnode.h>
67 #include <ck_queue.h>
68 #ifdef KTRACE
69 #include <sys/ktrace.h>
70 #endif
71 
72 #include <sys/capsicum.h>
73 
74 #include <security/audit/audit.h>
75 #include <security/mac/mac_framework.h>
76 
77 #ifdef DDB
78 #include <ddb/ddb.h>
79 #endif
80 
81 #include <vm/uma.h>
82 
83 SDT_PROVIDER_DECLARE(vfs);
84 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
85     "struct vnode *");
86 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
87     "char *");
88 SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
89     "const char *");
90 SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
91     "struct namecache *", "int", "int");
92 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
93 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
94     "char *", "struct vnode *");
95 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
96 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
97     "struct vnode *", "char *");
98 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
99     "struct vnode *");
100 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
101     "struct vnode *", "char *");
102 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
103     "char *");
104 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
105     "struct componentname *");
106 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
107     "struct componentname *");
108 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
109 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
110 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
111 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
112     "struct vnode *");
113 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
114     "char *");
115 SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
116     "char *");
117 
118 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
119 SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
120 SDT_PROBE_DECLARE(vfs, namei, lookup, return);
121 
122 /*
123  * This structure describes the elements in the cache of recent
124  * names looked up by namei.
125  */
/*
 * State of a negative entry.  It is stored in the same union slot which
 * holds the target vnode pointer for positive entries (see n_un in
 * struct namecache).
 */
126 struct negstate {
127 	u_char neg_flag;	/* NEG_* flag bits */
128 };
129 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
130     "the state must fit in a union with a pointer without growing it");
131 
/*
 * A single name cache entry for the name nc_name within directory nc_dvp.
 *
 * The n_un union holds the target vnode (nc_vp) or, when NCF_NEGATIVE is
 * set in nc_flag, the negative entry state (nc_neg).
 */
132 struct	namecache {
133 	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
134 	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list, or negative list linkage */
135 	CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
136 	struct	vnode *nc_dvp;		/* vnode of parent of name */
137 	union {
138 		struct	vnode *nu_vp;	/* vnode the name refers to */
139 		struct	negstate nu_neg;/* negative entry state */
140 	} n_un;
141 	u_char	nc_flag;		/* flag bits (NCF_*) */
142 	u_char	nc_nlen;		/* length of name */
143 	char	nc_name[0];		/* segment name + nul; storage trails the struct */
144 };
145 
146 /*
147  * struct namecache_ts embeds struct namecache as its last member (nc_nc),
148  * preceded by the timestamp fields.
149  * struct namecache_ts is used in place of struct namecache when time(s) need
150  * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
151  * both a non-dotdot directory name plus dotdot for the directory's
152  * parent.
153  *
154  * See below for alignment requirement.
155  */
156 struct	namecache_ts {
157 	struct	timespec nc_time;	/* timespec provided by fs */
158 	struct	timespec nc_dotdottime;	/* dotdot timespec provided by fs */
159 	int	nc_ticks;		/* ticks value when entry was added */
160 	struct namecache nc_nc;		/* the entry proper; last so nc_name trails it */
161 };
162 
163 /*
164  * At least mips n32 performs 64-bit accesses to timespec as found
165  * in namecache_ts and requires them to be aligned. Since other
166  * platforms may have the same requirement, suffer a little bit and
167  * enforce the alignment for everyone. Note this is a nop for 64-bit platforms.
168  */
169 #define CACHE_ZONE_ALIGNMENT	UMA_ALIGNOF(time_t)
170 #define	CACHE_PATH_CUTOFF	39
171 
172 #define CACHE_ZONE_SMALL_SIZE		(sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1)
173 #define CACHE_ZONE_SMALL_TS_SIZE	(sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1)
174 #define CACHE_ZONE_LARGE_SIZE		(sizeof(struct namecache) + NAME_MAX + 1)
175 #define CACHE_ZONE_LARGE_TS_SIZE	(sizeof(struct namecache_ts) + NAME_MAX + 1)
176 
177 _Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
178 _Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
179 _Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
180 _Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
181 
182 #define	nc_vp		n_un.nu_vp
183 #define	nc_neg		n_un.nu_neg
184 
185 /*
186  * Flags in namecache.nc_flag
187  */
188 #define NCF_WHITE	0x01
189 #define NCF_ISDOTDOT	0x02
190 #define	NCF_TS		0x04
191 #define	NCF_DTS		0x08
192 #define	NCF_DVDROP	0x10
193 #define	NCF_NEGATIVE	0x20
194 #define	NCF_INVALID	0x40
195 #define	NCF_WIP		0x80
196 
197 /*
198  * Flags in negstate.neg_flag
199  */
200 #define NEG_HOT		0x01
201 
202 /*
203  * Mark an entry as invalid.
204  *
205  * This is called before it starts getting deconstructed.
206  */
207 static void
208 cache_ncp_invalidate(struct namecache *ncp)
209 {
210 
211 	KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
212 	    ("%s: entry %p already invalid", __func__, ncp));
213 	atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
214 	atomic_thread_fence_rel();
215 }
216 
217 /*
218  * Check whether the entry can be safely used.
219  *
220  * All places which elide locks are supposed to call this after they are
221  * done with reading from an entry.
222  */
223 static bool
224 cache_ncp_canuse(struct namecache *ncp)
225 {
226 
227 	atomic_thread_fence_acq();
228 	return ((atomic_load_char(&ncp->nc_flag) & (NCF_INVALID | NCF_WIP)) == 0);
229 }
230 
231 /*
232  * Name caching works as follows:
233  *
234  * Names found by directory scans are retained in a cache
235  * for future reference.  It is managed LRU, so frequently
236  * used names will hang around.  Cache is indexed by hash value
237  * obtained from (dvp, name) where dvp refers to the directory
238  * containing name.
239  *
240  * If it is a "negative" entry, (i.e. for a name that is known NOT to
241  * exist) the vnode pointer will be NULL.
242  *
243  * Upon reaching the last segment of a path, if the reference
244  * is for DELETE, or NOCACHE is set (rewrite), and the
245  * name is located in the cache, it will be dropped.
246  *
247  * These locks are used (in the order in which they can be taken):
248  * NAME		TYPE	ROLE
249  * vnodelock	mtx	vnode lists and v_cache_dd field protection
250  * bucketlock	rwlock	for access to given set of hash buckets
251  * neglist	mtx	negative entry LRU management
252  *
253  * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
254  * shrinking the LRU list.
255  *
256  * It is legal to take multiple vnodelock and bucketlock locks. The locking
257  * order is lower address first. Both are recursive.
258  *
259  * "." lookups are lockless.
260  *
261  * ".." and vnode -> name lookups require vnodelock.
262  *
263  * name -> vnode lookup requires the relevant bucketlock to be held for reading.
264  *
265  * Insertions and removals of entries require involved vnodes and bucketlocks
266  * to be write-locked to prevent other threads from seeing the entry.
267  *
268  * Some lookups result in removal of the found entry (e.g. getting rid of a
269  * negative entry with the intent to create a positive one), which poses a
270  * problem when multiple threads reach the state. Similarly, two different
271  * threads can purge two different vnodes and try to remove the same name.
272  *
273  * If the already held vnode lock is lower than the second required lock, we
274  * can just take the other lock. However, in the opposite case, this could
275  * deadlock. As such, this is resolved by trylocking and if that fails unlocking
276  * the first node, locking everything in order and revalidating the state.
277  */
278 
279 VFS_SMR_DECLARE;
280 
281 /*
282  * Structures associated with name caching.
283  */
284 #define NCHHASH(hash) \
285 	(&nchashtbl[(hash) & nchash])
286 static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
287 static u_long __read_mostly	nchash;			/* size of hash table */
288 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
289     "Size of namecache hash table");
290 static u_long __read_mostly	ncnegfactor = 5; /* ratio of negative entries */
291 SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
292     "Ratio of negative namecache entries");
293 static u_long __exclusive_cache_line	numneg;	/* number of negative entries allocated */
294 static u_long __exclusive_cache_line	numcache;/* number of cache entries allocated */
295 u_int ncsizefactor = 2;
296 SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
297     "Size factor for namecache");
298 static u_int __read_mostly	ncpurgeminvnodes;
299 SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0,
300     "Number of vnodes below which purgevfs ignores the request");
301 static u_int __read_mostly	ncsize; /* the size as computed on creation or resizing */
302 
303 struct nchstats	nchstats;		/* cache effectiveness statistics */
304 
305 static bool __read_frequently cache_fast_revlookup = true;
306 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_revlookup, CTLFLAG_RW,
307     &cache_fast_revlookup, 0, "");
308 
309 static struct mtx __exclusive_cache_line	ncneg_shrink_lock;
310 
/*
 * A list of negative entries along with the mutex taken around updates
 * to it.
 */
311 struct neglist {
312 	struct mtx		nl_lock;	/* held while nl_list is modified */
313 	TAILQ_HEAD(, namecache) nl_list;	/* entries, linked through nc_dst */
314 } __aligned(CACHE_LINE_SIZE);
315 
316 static struct neglist __read_mostly	*neglists;
317 static struct neglist ncneg_hot;
318 static u_long numhotneg;
319 
320 #define ncneghash	3
321 #define	numneglists	(ncneghash + 1)
322 static inline struct neglist *
323 NCP2NEGLIST(struct namecache *ncp)
324 {
325 
326 	return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
327 }
328 
329 static inline struct negstate *
330 NCP2NEGSTATE(struct namecache *ncp)
331 {
332 
333 	MPASS(ncp->nc_flag & NCF_NEGATIVE);
334 	return (&ncp->nc_neg);
335 }
336 
337 #define	numbucketlocks (ncbuckethash + 1)
338 static u_int __read_mostly  ncbuckethash;
339 static struct rwlock_padalign __read_mostly  *bucketlocks;
340 #define	HASH2BUCKETLOCK(hash) \
341 	((struct rwlock *)(&bucketlocks[((hash) & ncbuckethash)]))
342 
343 #define	numvnodelocks (ncvnodehash + 1)
344 static u_int __read_mostly  ncvnodehash;
345 static struct mtx __read_mostly *vnodelocks;
346 static inline struct mtx *
347 VP2VNODELOCK(struct vnode *vp)
348 {
349 
350 	return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
351 }
352 
353 /*
354  * UMA zones for the VFS cache.
355  *
356  * The small cache is used for entries with short names, which are the
357  * most common.  The large cache is used for entries which are too big to
358  * fit in the small cache.
359  */
360 static uma_zone_t __read_mostly cache_zone_small;
361 static uma_zone_t __read_mostly cache_zone_small_ts;
362 static uma_zone_t __read_mostly cache_zone_large;
363 static uma_zone_t __read_mostly cache_zone_large_ts;
364 
365 static struct namecache *
366 cache_alloc(int len, int ts)
367 {
368 	struct namecache_ts *ncp_ts;
369 	struct namecache *ncp;
370 
371 	if (__predict_false(ts)) {
372 		if (len <= CACHE_PATH_CUTOFF)
373 			ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
374 		else
375 			ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
376 		ncp = &ncp_ts->nc_nc;
377 	} else {
378 		if (len <= CACHE_PATH_CUTOFF)
379 			ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
380 		else
381 			ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
382 	}
383 	return (ncp);
384 }
385 
386 static void
387 cache_free(struct namecache *ncp)
388 {
389 	struct namecache_ts *ncp_ts;
390 
391 	if (ncp == NULL)
392 		return;
393 	if ((ncp->nc_flag & NCF_DVDROP) != 0)
394 		vdrop(ncp->nc_dvp);
395 	if (__predict_false(ncp->nc_flag & NCF_TS)) {
396 		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
397 		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
398 			uma_zfree_smr(cache_zone_small_ts, ncp_ts);
399 		else
400 			uma_zfree_smr(cache_zone_large_ts, ncp_ts);
401 	} else {
402 		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
403 			uma_zfree_smr(cache_zone_small, ncp);
404 		else
405 			uma_zfree_smr(cache_zone_large, ncp);
406 	}
407 }
408 
409 static void
410 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
411 {
412 	struct namecache_ts *ncp_ts;
413 
414 	KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
415 	    (tsp == NULL && ticksp == NULL),
416 	    ("No NCF_TS"));
417 
418 	if (tsp == NULL && ticksp == NULL)
419 		return;
420 
421 	ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
422 	if (tsp != NULL)
423 		*tsp = ncp_ts->nc_time;
424 	if (ticksp != NULL)
425 		*ticksp = ncp_ts->nc_ticks;
426 }
427 
428 #ifdef DEBUG_CACHE
429 static int __read_mostly	doingcache = 1;	/* 1 => enable the cache */
430 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
431     "VFS namecache enabled");
432 #endif
433 
434 /* Export size information to userland */
435 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
436     sizeof(struct namecache), "sizeof(struct namecache)");
437 
438 /*
439  * The new name cache statistics
440  */
441 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
442     "Name cache statistics");
443 #define STATNODE_ULONG(name, descr)					\
444 	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr);
445 #define STATNODE_COUNTER(name, descr)					\
446 	static COUNTER_U64_DEFINE_EARLY(name);				\
447 	SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, \
448 	    descr);
449 STATNODE_ULONG(numneg, "Number of negative cache entries");
450 STATNODE_ULONG(numcache, "Number of cache entries");
451 STATNODE_COUNTER(numcachehv, "Number of namecache entries with vnodes held");
452 STATNODE_COUNTER(numdrops, "Number of dropped entries due to reaching the limit");
453 STATNODE_COUNTER(dothits, "Number of '.' hits");
454 STATNODE_COUNTER(dotdothits, "Number of '..' hits");
455 STATNODE_COUNTER(nummiss, "Number of cache misses");
456 STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache");
457 STATNODE_COUNTER(numposzaps,
458     "Number of cache hits (positive) we do not want to cache");
459 STATNODE_COUNTER(numposhits, "Number of cache hits (positive)");
460 STATNODE_COUNTER(numnegzaps,
461     "Number of cache hits (negative) we do not want to cache");
462 STATNODE_COUNTER(numneghits, "Number of cache hits (negative)");
463 /* These count for vn_getcwd(), too. */
464 STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls");
465 STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
466 STATNODE_COUNTER(numfullpathfail2,
467     "Number of fullpath search errors (VOP_VPTOCNP failures)");
468 STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
469 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls");
470 STATNODE_COUNTER(zap_and_exit_bucket_relock_success,
471     "Number of successful removals after relocking");
472 static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail,
473     "Number of times zap_and_exit failed to lock");
474 static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2,
475     "Number of times zap_and_exit failed to lock");
476 static long cache_lock_vnodes_cel_3_failures;
477 STATNODE_ULONG(cache_lock_vnodes_cel_3_failures,
478     "Number of times 3-way vnode locking failed");
479 STATNODE_ULONG(numhotneg, "Number of hot negative entries");
480 STATNODE_COUNTER(numneg_evicted,
481     "Number of negative entries evicted when adding a new entry");
482 STATNODE_COUNTER(shrinking_skipped,
483     "Number of times shrinking was already in progress");
484 
485 static void cache_zap_locked(struct namecache *ncp);
486 static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf,
487     char **freebuf, size_t *buflen);
488 static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
489     char **retbuf, size_t *buflen, bool slash_prefixed, size_t addend);
490 static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf,
491     char **retbuf, size_t *buflen);
492 static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf,
493     char **retbuf, size_t *len, bool slash_prefixed, size_t addend);
494 
495 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
496 
497 static int cache_yield;
498 SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0,
499     "Number of times cache called yield");
500 
501 static void __noinline
502 cache_maybe_yield(void)
503 {
504 
505 	if (should_yield()) {
506 		cache_yield++;
507 		kern_yield(PRI_USER);
508 	}
509 }
510 
511 static inline void
512 cache_assert_vlp_locked(struct mtx *vlp)
513 {
514 
515 	if (vlp != NULL)
516 		mtx_assert(vlp, MA_OWNED);
517 }
518 
/*
 * Assert that the vnode lock covering vp is owned.
 */
static inline void
cache_assert_vnode_locked(struct vnode *vp)
{

	cache_assert_vlp_locked(VP2VNODELOCK(vp));
}
527 
528 /*
529  * TODO: With the value stored we can do better than computing the hash based
530  * on the address. The choice of FNV should also be revisited.
531  */
532 static void
533 cache_prehash(struct vnode *vp)
534 {
535 
536 	vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
537 }
538 
539 static uint32_t
540 cache_get_hash(char *name, u_char len, struct vnode *dvp)
541 {
542 
543 	return (fnv_32_buf(name, len, dvp->v_nchash));
544 }
545 
546 static inline struct nchashhead *
547 NCP2BUCKET(struct namecache *ncp)
548 {
549 	uint32_t hash;
550 
551 	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
552 	return (NCHHASH(hash));
553 }
554 
555 static inline struct rwlock *
556 NCP2BUCKETLOCK(struct namecache *ncp)
557 {
558 	uint32_t hash;
559 
560 	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
561 	return (HASH2BUCKETLOCK(hash));
562 }
563 
/*
 * Assert that the bucket lock of the entry is held in the given mode.
 * Compiles to nothing without INVARIANTS.
 */
#ifdef INVARIANTS
static void
cache_assert_bucket_locked(struct namecache *ncp, int mode)
{

	rw_assert(NCP2BUCKETLOCK(ncp), mode);
}
#else
#define cache_assert_bucket_locked(x, y) do { } while (0)
#endif
576 
577 #define cache_sort_vnodes(x, y)	_cache_sort_vnodes((void **)(x), (void **)(y))
578 static void
579 _cache_sort_vnodes(void **p1, void **p2)
580 {
581 	void *tmp;
582 
583 	MPASS(*p1 != NULL || *p2 != NULL);
584 
585 	if (*p1 > *p2) {
586 		tmp = *p2;
587 		*p2 = *p1;
588 		*p1 = tmp;
589 	}
590 }
591 
592 static void
593 cache_lock_all_buckets(void)
594 {
595 	u_int i;
596 
597 	for (i = 0; i < numbucketlocks; i++)
598 		rw_wlock(&bucketlocks[i]);
599 }
600 
601 static void
602 cache_unlock_all_buckets(void)
603 {
604 	u_int i;
605 
606 	for (i = 0; i < numbucketlocks; i++)
607 		rw_wunlock(&bucketlocks[i]);
608 }
609 
610 static void
611 cache_lock_all_vnodes(void)
612 {
613 	u_int i;
614 
615 	for (i = 0; i < numvnodelocks; i++)
616 		mtx_lock(&vnodelocks[i]);
617 }
618 
619 static void
620 cache_unlock_all_vnodes(void)
621 {
622 	u_int i;
623 
624 	for (i = 0; i < numvnodelocks; i++)
625 		mtx_unlock(&vnodelocks[i]);
626 }
627 
628 static int
629 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
630 {
631 
632 	cache_sort_vnodes(&vlp1, &vlp2);
633 
634 	if (vlp1 != NULL) {
635 		if (!mtx_trylock(vlp1))
636 			return (EAGAIN);
637 	}
638 	if (!mtx_trylock(vlp2)) {
639 		if (vlp1 != NULL)
640 			mtx_unlock(vlp1);
641 		return (EAGAIN);
642 	}
643 
644 	return (0);
645 }
646 
647 static void
648 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
649 {
650 
651 	MPASS(vlp1 != NULL || vlp2 != NULL);
652 	MPASS(vlp1 <= vlp2);
653 
654 	if (vlp1 != NULL)
655 		mtx_lock(vlp1);
656 	if (vlp2 != NULL)
657 		mtx_lock(vlp2);
658 }
659 
660 static void
661 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
662 {
663 
664 	MPASS(vlp1 != NULL || vlp2 != NULL);
665 
666 	if (vlp1 != NULL)
667 		mtx_unlock(vlp1);
668 	if (vlp2 != NULL)
669 		mtx_unlock(vlp2);
670 }
671 
672 static int
673 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
674 {
675 	struct nchstats snap;
676 
677 	if (req->oldptr == NULL)
678 		return (SYSCTL_OUT(req, 0, sizeof(snap)));
679 
680 	snap = nchstats;
681 	snap.ncs_goodhits = counter_u64_fetch(numposhits);
682 	snap.ncs_neghits = counter_u64_fetch(numneghits);
683 	snap.ncs_badhits = counter_u64_fetch(numposzaps) +
684 	    counter_u64_fetch(numnegzaps);
685 	snap.ncs_miss = counter_u64_fetch(nummisszap) +
686 	    counter_u64_fetch(nummiss);
687 
688 	return (SYSCTL_OUT(req, &snap, sizeof(snap)));
689 }
690 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
691     CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
692     "VFS cache effectiveness statistics");
693 
694 #ifdef DIAGNOSTIC
695 /*
696  * Grab an atomic snapshot of the name cache hash chain lengths
697  */
698 static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
699     CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
700     "hash table stats");
701 
702 static int
703 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
704 {
705 	struct nchashhead *ncpp;
706 	struct namecache *ncp;
707 	int i, error, n_nchash, *cntbuf;
708 
709 retry:
710 	n_nchash = nchash + 1;	/* nchash is max index, not count */
711 	if (req->oldptr == NULL)
712 		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
713 	cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
714 	cache_lock_all_buckets();
715 	if (n_nchash != nchash + 1) {
716 		cache_unlock_all_buckets();
717 		free(cntbuf, M_TEMP);
718 		goto retry;
719 	}
720 	/* Scan hash tables counting entries */
721 	for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
722 		CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
723 			cntbuf[i]++;
724 	cache_unlock_all_buckets();
725 	for (error = 0, i = 0; i < n_nchash; i++)
726 		if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
727 			break;
728 	free(cntbuf, M_TEMP);
729 	return (error);
730 }
731 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
732     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
733     "nchash chain lengths");
734 
735 static int
736 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
737 {
738 	int error;
739 	struct nchashhead *ncpp;
740 	struct namecache *ncp;
741 	int n_nchash;
742 	int count, maxlength, used, pct;
743 
744 	if (!req->oldptr)
745 		return SYSCTL_OUT(req, 0, 4 * sizeof(int));
746 
747 	cache_lock_all_buckets();
748 	n_nchash = nchash + 1;	/* nchash is max index, not count */
749 	used = 0;
750 	maxlength = 0;
751 
752 	/* Scan hash tables for applicable entries */
753 	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
754 		count = 0;
755 		CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
756 			count++;
757 		}
758 		if (count)
759 			used++;
760 		if (maxlength < count)
761 			maxlength = count;
762 	}
763 	n_nchash = nchash + 1;
764 	cache_unlock_all_buckets();
765 	pct = (used * 100) / (n_nchash / 100);
766 	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
767 	if (error)
768 		return (error);
769 	error = SYSCTL_OUT(req, &used, sizeof(used));
770 	if (error)
771 		return (error);
772 	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
773 	if (error)
774 		return (error);
775 	error = SYSCTL_OUT(req, &pct, sizeof(pct));
776 	if (error)
777 		return (error);
778 	return (0);
779 }
780 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
781     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
782     "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
783 #endif
784 
785 /*
786  * Negative entries management
787  *
788  * A variation of LRU scheme is used. New entries are hashed into one of
789  * numneglists cold lists. Entries get promoted to the hot list on first hit.
790  *
791  * The shrinker will demote hot list head and evict from the cold list in a
792  * round-robin manner.
793  */
794 static void
795 cache_negative_init(struct namecache *ncp)
796 {
797 	struct negstate *negstate;
798 
799 	ncp->nc_flag |= NCF_NEGATIVE;
800 	negstate = NCP2NEGSTATE(ncp);
801 	negstate->neg_flag = 0;
802 }
803 
804 static void
805 cache_negative_hit(struct namecache *ncp)
806 {
807 	struct neglist *neglist;
808 	struct negstate *negstate;
809 
810 	negstate = NCP2NEGSTATE(ncp);
811 	if ((negstate->neg_flag & NEG_HOT) != 0)
812 		return;
813 	neglist = NCP2NEGLIST(ncp);
814 	mtx_lock(&ncneg_hot.nl_lock);
815 	mtx_lock(&neglist->nl_lock);
816 	if ((negstate->neg_flag & NEG_HOT) == 0) {
817 		numhotneg++;
818 		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
819 		TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
820 		negstate->neg_flag |= NEG_HOT;
821 	}
822 	mtx_unlock(&neglist->nl_lock);
823 	mtx_unlock(&ncneg_hot.nl_lock);
824 }
825 
826 static void
827 cache_negative_insert(struct namecache *ncp)
828 {
829 	struct neglist *neglist;
830 
831 	MPASS(ncp->nc_flag & NCF_NEGATIVE);
832 	cache_assert_bucket_locked(ncp, RA_WLOCKED);
833 	neglist = NCP2NEGLIST(ncp);
834 	mtx_lock(&neglist->nl_lock);
835 	TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
836 	mtx_unlock(&neglist->nl_lock);
837 	atomic_add_rel_long(&numneg, 1);
838 }
839 
/*
 * Unlink a negative entry from the list it currently sits on (the hot list
 * or its cold list) and decrement numneg.
 *
 * The bucket lock of the entry is asserted to be write-locked.  NEG_HOT is
 * first sampled without any list lock held, so it is re-checked after each
 * lock acquisition.  Whenever both list locks end up being needed, the hot
 * list lock is taken before the cold list lock, the same order as used by
 * cache_negative_hit().
 */
840 static void
841 cache_negative_remove(struct namecache *ncp)
842 {
843 	struct neglist *neglist;
844 	struct negstate *negstate;
845 	bool hot_locked = false;
846 	bool list_locked = false;
847
848 	cache_assert_bucket_locked(ncp, RA_WLOCKED);
849 	neglist = NCP2NEGLIST(ncp);
850 	negstate = NCP2NEGSTATE(ncp);
851 	if ((negstate->neg_flag & NEG_HOT) != 0) {
852 		hot_locked = true;
853 		mtx_lock(&ncneg_hot.nl_lock);
		/*
		 * The entry stopped being hot before the hot list lock was
		 * obtained, so the cold list lock is needed as well.
		 */
854 		if ((negstate->neg_flag & NEG_HOT) == 0) {
855 			list_locked = true;
856 			mtx_lock(&neglist->nl_lock);
857 		}
858 	} else {
859 		list_locked = true;
860 		mtx_lock(&neglist->nl_lock);
861 		/*
862 		 * We may be racing against promotion in lockless lookup.
863 		 */
864 		if ((negstate->neg_flag & NEG_HOT) != 0) {
			/* Drop and retake to get hot -> cold lock order. */
865 			mtx_unlock(&neglist->nl_lock);
866 			hot_locked = true;
867 			mtx_lock(&ncneg_hot.nl_lock);
868 			mtx_lock(&neglist->nl_lock);
869 		}
870 	}
	/*
	 * Final check of the flag; the assertions verify that the lock of
	 * the list being modified is owned.
	 */
871 	if ((negstate->neg_flag & NEG_HOT) != 0) {
872 		mtx_assert(&ncneg_hot.nl_lock, MA_OWNED);
873 		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
874 		numhotneg--;
875 	} else {
876 		mtx_assert(&neglist->nl_lock, MA_OWNED);
877 		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
878 	}
	/* Release only what was actually taken above. */
879 	if (list_locked)
880 		mtx_unlock(&neglist->nl_lock);
881 	if (hot_locked)
882 		mtx_unlock(&ncneg_hot.nl_lock);
883 	atomic_subtract_rel_long(&numneg, 1);
884 }
885 
886 static void
887 cache_negative_shrink_select(struct namecache **ncpp,
888     struct neglist **neglistpp)
889 {
890 	struct neglist *neglist;
891 	struct namecache *ncp;
892 	static u_int cycle;
893 	u_int i;
894 
895 	*ncpp = ncp = NULL;
896 
897 	for (i = 0; i < numneglists; i++) {
898 		neglist = &neglists[(cycle + i) % numneglists];
899 		if (TAILQ_FIRST(&neglist->nl_list) == NULL)
900 			continue;
901 		mtx_lock(&neglist->nl_lock);
902 		ncp = TAILQ_FIRST(&neglist->nl_list);
903 		if (ncp != NULL)
904 			break;
905 		mtx_unlock(&neglist->nl_lock);
906 	}
907 
908 	*neglistpp = neglist;
909 	*ncpp = ncp;
910 	cycle++;
911 }
912 
/*
 * Evict one negative entry.
 *
 * With ncneg_shrink_lock held, the head of the hot list (if any) is demoted
 * to the tail of its cold list and a victim is picked with
 * cache_negative_shrink_select().  If the shrink lock is already taken the
 * call only bumps shrinking_skipped and returns.  The victim is then
 * revalidated under its vnode and bucket locks and, if still the same entry
 * at the head of the same list, zapped and freed.
 */
913 static void
914 cache_negative_zap_one(void)
915 {
916 	struct namecache *ncp, *ncp2;
917 	struct neglist *neglist;
918 	struct negstate *negstate;
919 	struct mtx *dvlp;
920 	struct rwlock *blp;
921
	/*
	 * NOTE(review): the unlocked mtx_owner() test presumably avoids the
	 * trylock attempt when the lock is visibly held -- confirm.
	 */
922 	if (mtx_owner(&ncneg_shrink_lock) != NULL ||
923 	    !mtx_trylock(&ncneg_shrink_lock)) {
924 		counter_u64_add(shrinking_skipped, 1);
925 		return;
926 	}
927
	/* Demote the head of the hot list to the tail of its cold list. */
928 	mtx_lock(&ncneg_hot.nl_lock);
929 	ncp = TAILQ_FIRST(&ncneg_hot.nl_list);
930 	if (ncp != NULL) {
931 		neglist = NCP2NEGLIST(ncp);
932 		negstate = NCP2NEGSTATE(ncp);
933 		mtx_lock(&neglist->nl_lock);
934 		MPASS((negstate->neg_flag & NEG_HOT) != 0);
935 		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
936 		TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
937 		negstate->neg_flag &= ~NEG_HOT;
938 		numhotneg--;
939 		mtx_unlock(&neglist->nl_lock);
940 	}
941 	mtx_unlock(&ncneg_hot.nl_lock);
942
	/* If a victim is returned, the lock of its list is held. */
943 	cache_negative_shrink_select(&ncp, &neglist);
944
945 	mtx_unlock(&ncneg_shrink_lock);
946 	if (ncp == NULL)
947 		return;
948
	/*
	 * Work out which locks cover the victim while the list lock is
	 * still held, then drop the list lock before taking them: the
	 * documented order is vnodelock, bucketlock, neglist.
	 */
949 	MPASS(ncp->nc_flag & NCF_NEGATIVE);
950 	dvlp = VP2VNODELOCK(ncp->nc_dvp);
951 	blp = NCP2BUCKETLOCK(ncp);
952 	mtx_unlock(&neglist->nl_lock);
953 	mtx_lock(dvlp);
954 	rw_wlock(blp);
955 	/*
956 	 * Enter SMR to safely check the negative list.
957 	 * Even if the found pointer matches, the entry may now be reallocated
958 	 * and used by a different vnode.
959 	 */
960 	vfs_smr_enter();
961 	ncp2 = TAILQ_FIRST(&neglist->nl_list);
	/*
	 * ncp is non-NULL here, so an empty list (ncp2 == NULL) fails the
	 * first comparison and ncp2 is not dereferenced.
	 */
962 	if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) ||
963 	    blp != NCP2BUCKETLOCK(ncp2)) {
964 		vfs_smr_exit();
965 		ncp = NULL;
966 	} else {
967 		vfs_smr_exit();
968 		SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp,
969 		    ncp->nc_name);
970 		cache_zap_locked(ncp);
971 		counter_u64_add(numneg_evicted, 1);
972 	}
973 	rw_wunlock(blp);
974 	mtx_unlock(dvlp);
	/* ncp is NULL if revalidation failed; cache_free() ignores NULL. */
975 	cache_free(ncp);
976 }
977 
/*
 * cache_zap_locked():
 *
 *   Removes a namecache entry from cache, whether it contains an actual
 *   pointer to a vnode or if it is just a negative cache entry.
 *
 *   The caller must hold the vnode lock of the parent directory, the vnode
 *   lock of the target vnode for positive entries, and the bucket lock in
 *   write mode; all of this is asserted below.  The entry is only unlinked
 *   here, it is not freed.
 */
static void
cache_zap_locked(struct namecache *ncp)
{
	struct nchashhead *ncpp;

	if (!(ncp->nc_flag & NCF_NEGATIVE))
		cache_assert_vnode_locked(ncp->nc_vp);
	cache_assert_vnode_locked(ncp->nc_dvp);
	cache_assert_bucket_locked(ncp, RA_WLOCKED);

	CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp,
	    (ncp->nc_flag & NCF_NEGATIVE) ? NULL : ncp->nc_vp);

	/* Invalidate the entry before it is taken off the hash chain. */
	cache_ncp_invalidate(ncp);

	ncpp = NCP2BUCKET(ncp);
	CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
	if (!(ncp->nc_flag & NCF_NEGATIVE)) {
		SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
		    ncp->nc_name, ncp->nc_vp);
		/* Positive entry: unlink from the target vnode. */
		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
		if (ncp == ncp->nc_vp->v_cache_dd) {
			vn_seqc_write_begin_unheld(ncp->nc_vp);
			ncp->nc_vp->v_cache_dd = NULL;
			vn_seqc_write_end(ncp->nc_vp);
		}
	} else {
		SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
		    ncp->nc_name);
		cache_negative_remove(ncp);
	}
	if (ncp->nc_flag & NCF_ISDOTDOT) {
		/* A ".." entry is only reachable through v_cache_dd. */
		if (ncp == ncp->nc_dvp->v_cache_dd) {
			vn_seqc_write_begin_unheld(ncp->nc_dvp);
			ncp->nc_dvp->v_cache_dd = NULL;
			vn_seqc_write_end(ncp->nc_dvp);
		}
	} else {
		LIST_REMOVE(ncp, nc_src);
		/*
		 * This was the last entry naming something in the directory.
		 * Flag the entry with NCF_DVDROP; insertion takes a hold on
		 * the directory when its list becomes non-empty, and the flag
		 * presumably tells whoever frees the entry to drop it --
		 * TODO confirm against cache_free().
		 */
		if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
			ncp->nc_flag |= NCF_DVDROP;
			counter_u64_add(numcachehv, -1);
		}
	}
	atomic_subtract_rel_long(&numcache, 1);
}
1030 
1031 static void
1032 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
1033 {
1034 	struct rwlock *blp;
1035 
1036 	MPASS(ncp->nc_dvp == vp);
1037 	MPASS(ncp->nc_flag & NCF_NEGATIVE);
1038 	cache_assert_vnode_locked(vp);
1039 
1040 	blp = NCP2BUCKETLOCK(ncp);
1041 	rw_wlock(blp);
1042 	cache_zap_locked(ncp);
1043 	rw_wunlock(blp);
1044 }
1045 
/*
 * Zap an entry while the vnode lock of vp is held and kept ("kl").
 *
 * vp is either the parent directory or the target of the entry.  *vlpp
 * optionally names one more vnode lock which the caller already holds from
 * a previous attempt.
 *
 * Returns true if the entry got zapped.  In that case *vlpp is NULL and no
 * lock other than the one for vp is held on behalf of the caller.
 *
 * Returns false if the other vnode lock could not be acquired without
 * blocking.  Both vnode locks of the entry are then reacquired in sorted
 * order, *vlpp is set to the first of them and the caller is expected to
 * re-validate its state and try again.
 */
static bool
cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
    struct mtx **vlpp)
{
	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
	struct rwlock *blp;

	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
	cache_assert_vnode_locked(vp);

	/* Negative entries only need the lock which is already held. */
	if (ncp->nc_flag & NCF_NEGATIVE) {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_zap_negative_locked_vnode_kl(ncp, vp);
		return (true);
	}

	pvlp = VP2VNODELOCK(vp);
	blp = NCP2BUCKETLOCK(ncp);
	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
	vlp2 = VP2VNODELOCK(ncp->nc_vp);

	if (*vlpp == vlp1 || *vlpp == vlp2) {
		/* The extra lock held by the caller is one we need. */
		to_unlock = *vlpp;
		*vlpp = NULL;
	} else {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_sort_vnodes(&vlp1, &vlp2);
		if (vlp1 == pvlp) {
			/* The held lock sorts first, blocking is fine. */
			mtx_lock(vlp2);
			to_unlock = vlp2;
		} else {
			/*
			 * The missing lock sorts before the held one, so it
			 * can only be tried.
			 */
			if (!mtx_trylock(vlp1))
				goto out_relock;
			to_unlock = vlp1;
		}
	}
	rw_wlock(blp);
	cache_zap_locked(ncp);
	rw_wunlock(blp);
	if (to_unlock != NULL)
		mtx_unlock(to_unlock);
	return (true);

out_relock:
	/*
	 * Drop vlp2 and take both locks in order.  Presumably vlp2 is the
	 * lock for vp here, given the MPASS at the top and vlp1 != pvlp.
	 */
	mtx_unlock(vlp2);
	mtx_lock(vlp1);
	mtx_lock(vlp2);
	MPASS(*vlpp == NULL);
	*vlpp = vlp1;
	return (false);
}
1103 
1104 static int __noinline
1105 cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp)
1106 {
1107 	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
1108 	struct rwlock *blp;
1109 	int error = 0;
1110 
1111 	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
1112 	cache_assert_vnode_locked(vp);
1113 
1114 	pvlp = VP2VNODELOCK(vp);
1115 	if (ncp->nc_flag & NCF_NEGATIVE) {
1116 		cache_zap_negative_locked_vnode_kl(ncp, vp);
1117 		goto out;
1118 	}
1119 
1120 	blp = NCP2BUCKETLOCK(ncp);
1121 	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
1122 	vlp2 = VP2VNODELOCK(ncp->nc_vp);
1123 	cache_sort_vnodes(&vlp1, &vlp2);
1124 	if (vlp1 == pvlp) {
1125 		mtx_lock(vlp2);
1126 		to_unlock = vlp2;
1127 	} else {
1128 		if (!mtx_trylock(vlp1)) {
1129 			error = EAGAIN;
1130 			goto out;
1131 		}
1132 		to_unlock = vlp1;
1133 	}
1134 	rw_wlock(blp);
1135 	cache_zap_locked(ncp);
1136 	rw_wunlock(blp);
1137 	mtx_unlock(to_unlock);
1138 out:
1139 	mtx_unlock(pvlp);
1140 	return (error);
1141 }
1142 
1143 /*
1144  * If trylocking failed we can get here. We know enough to take all needed locks
1145  * in the right order and re-lookup the entry.
1146  */
1147 static int
1148 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1149     struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
1150     struct rwlock *blp)
1151 {
1152 	struct namecache *rncp;
1153 
1154 	cache_assert_bucket_locked(ncp, RA_UNLOCKED);
1155 
1156 	cache_sort_vnodes(&dvlp, &vlp);
1157 	cache_lock_vnodes(dvlp, vlp);
1158 	rw_wlock(blp);
1159 	CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
1160 		if (rncp == ncp && rncp->nc_dvp == dvp &&
1161 		    rncp->nc_nlen == cnp->cn_namelen &&
1162 		    !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
1163 			break;
1164 	}
1165 	if (rncp != NULL) {
1166 		cache_zap_locked(rncp);
1167 		rw_wunlock(blp);
1168 		cache_unlock_vnodes(dvlp, vlp);
1169 		counter_u64_add(zap_and_exit_bucket_relock_success, 1);
1170 		return (0);
1171 	}
1172 
1173 	rw_wunlock(blp);
1174 	cache_unlock_vnodes(dvlp, vlp);
1175 	return (EAGAIN);
1176 }
1177 
1178 static int __noinline
1179 cache_zap_wlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1180     uint32_t hash, struct rwlock *blp)
1181 {
1182 	struct mtx *dvlp, *vlp;
1183 	struct vnode *dvp;
1184 
1185 	cache_assert_bucket_locked(ncp, RA_WLOCKED);
1186 
1187 	dvlp = VP2VNODELOCK(ncp->nc_dvp);
1188 	vlp = NULL;
1189 	if (!(ncp->nc_flag & NCF_NEGATIVE))
1190 		vlp = VP2VNODELOCK(ncp->nc_vp);
1191 	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1192 		cache_zap_locked(ncp);
1193 		rw_wunlock(blp);
1194 		cache_unlock_vnodes(dvlp, vlp);
1195 		return (0);
1196 	}
1197 
1198 	dvp = ncp->nc_dvp;
1199 	rw_wunlock(blp);
1200 	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1201 }
1202 
1203 static int __noinline
1204 cache_zap_rlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1205     uint32_t hash, struct rwlock *blp)
1206 {
1207 	struct mtx *dvlp, *vlp;
1208 	struct vnode *dvp;
1209 
1210 	cache_assert_bucket_locked(ncp, RA_RLOCKED);
1211 
1212 	dvlp = VP2VNODELOCK(ncp->nc_dvp);
1213 	vlp = NULL;
1214 	if (!(ncp->nc_flag & NCF_NEGATIVE))
1215 		vlp = VP2VNODELOCK(ncp->nc_vp);
1216 	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1217 		rw_runlock(blp);
1218 		rw_wlock(blp);
1219 		cache_zap_locked(ncp);
1220 		rw_wunlock(blp);
1221 		cache_unlock_vnodes(dvlp, vlp);
1222 		return (0);
1223 	}
1224 
1225 	dvp = ncp->nc_dvp;
1226 	rw_runlock(blp);
1227 	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1228 }
1229 
/*
 * Zap an entry with the bucket write-locked, keeping the bucket lock ("kl").
 *
 * *vlpp1 and *vlpp2 name the vnode locks currently held by the caller, if
 * any.
 *
 * Returns 0 if the entry was zapped.  All vnode locks are released and both
 * *vlpp1 and *vlpp2 are NULL; the bucket lock is still held.
 *
 * Returns EAGAIN if the vnode locks needed for this entry could not be
 * acquired without blocking.  In that case the bucket lock was dropped, the
 * needed vnode locks were taken in sorted order and recorded in *vlpp1 and
 * *vlpp2, and the bucket lock was taken again.  The entry was not touched,
 * and since the bucket was unlocked in the meantime the caller has to look
 * it up again.
 */
static int
cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp,
    struct mtx **vlpp1, struct mtx **vlpp2)
{
	struct mtx *dvlp, *vlp;

	cache_assert_bucket_locked(ncp, RA_WLOCKED);

	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	vlp = NULL;
	if (!(ncp->nc_flag & NCF_NEGATIVE))
		vlp = VP2VNODELOCK(ncp->nc_vp);
	cache_sort_vnodes(&dvlp, &vlp);

	/* The caller already holds exactly the locks we need. */
	if (*vlpp1 == dvlp && *vlpp2 == vlp) {
		cache_zap_locked(ncp);
		cache_unlock_vnodes(dvlp, vlp);
		*vlpp1 = NULL;
		*vlpp2 = NULL;
		return (0);
	}

	/* Wrong set of locks; get rid of them and start over. */
	if (*vlpp1 != NULL)
		mtx_unlock(*vlpp1);
	if (*vlpp2 != NULL)
		mtx_unlock(*vlpp2);
	*vlpp1 = NULL;
	*vlpp2 = NULL;

	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
		cache_zap_locked(ncp);
		cache_unlock_vnodes(dvlp, vlp);
		return (0);
	}

	/*
	 * Blocking on vnode locks is only done with the bucket unlocked.
	 * After sorting only the first lock may be NULL.
	 */
	rw_wunlock(blp);
	*vlpp1 = dvlp;
	*vlpp2 = vlp;
	if (*vlpp1 != NULL)
		mtx_lock(*vlpp1);
	mtx_lock(*vlpp2);
	rw_wlock(blp);
	return (EAGAIN);
}
1274 
1275 static void
1276 cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp)
1277 {
1278 
1279 	if (blp != NULL) {
1280 		rw_runlock(blp);
1281 	} else {
1282 		mtx_unlock(vlp);
1283 	}
1284 }
1285 
1286 static int __noinline
1287 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1288     struct timespec *tsp, int *ticksp)
1289 {
1290 	int ltype;
1291 
1292 	*vpp = dvp;
1293 	CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .",
1294 			dvp, cnp->cn_nameptr);
1295 	counter_u64_add(dothits, 1);
1296 	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
1297 	if (tsp != NULL)
1298 		timespecclear(tsp);
1299 	if (ticksp != NULL)
1300 		*ticksp = ticks;
1301 	vrefact(*vpp);
1302 	/*
1303 	 * When we lookup "." we still can be asked to lock it
1304 	 * differently...
1305 	 */
1306 	ltype = cnp->cn_lkflags & LK_TYPE_MASK;
1307 	if (ltype != VOP_ISLOCKED(*vpp)) {
1308 		if (ltype == LK_EXCLUSIVE) {
1309 			vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
1310 			if (VN_IS_DOOMED((*vpp))) {
1311 				/* forced unmount */
1312 				vrele(*vpp);
1313 				*vpp = NULL;
1314 				return (ENOENT);
1315 			}
1316 		} else
1317 			vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
1318 	}
1319 	return (-1);
1320 }
1321 
/*
 * Remove the entry for the name in cnp under directory dvp, if there is one.
 *
 * ".." is handled through dvp->v_cache_dd; everything else is looked up in
 * the hash table.
 *
 * Returns 1 if an entry was removed (or the v_cache_dd shortcut was
 * cleared), 0 if nothing was found.
 */
static __noinline int
cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
{
	struct namecache *ncp;
	struct rwlock *blp;
	struct mtx *dvlp, *dvlp2;
	uint32_t hash;
	int error;

	if (cnp->cn_namelen == 2 &&
	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
		dvlp = VP2VNODELOCK(dvp);
		dvlp2 = NULL;
		mtx_lock(dvlp);
retry_dotdot:
		/*
		 * dvlp is held here.  dvlp2, when set, is an additional lock
		 * left held by a failed cache_zap_locked_vnode_kl2() call.
		 */
		ncp = dvp->v_cache_dd;
		if (ncp == NULL) {
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
			return (0);
		}
		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
			/* A real ".." entry hanging off dvp, zap it. */
			if (ncp->nc_dvp != dvp)
				panic("dvp %p v_cache_dd %p\n", dvp, ncp);
			if (!cache_zap_locked_vnode_kl2(ncp,
			    dvp, &dvlp2))
				goto retry_dotdot;
			MPASS(dvp->v_cache_dd == NULL);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			cache_free(ncp);
		} else {
			/*
			 * v_cache_dd points at a regular entry doubling as
			 * "..".  Only the shortcut is cleared, the entry
			 * itself stays.
			 */
			vn_seqc_write_begin(dvp);
			dvp->v_cache_dd = NULL;
			vn_seqc_write_end(dvp);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
		}
		SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
		return (1);
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	blp = HASH2BUCKETLOCK(hash);
retry:
	/* Unlocked check to avoid taking the lock for an empty chain. */
	if (CK_SLIST_EMPTY(NCHHASH(hash)))
		goto out_no_entry;

	rw_wlock(blp);

	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	/* We failed to find an entry */
	if (ncp == NULL) {
		rw_wunlock(blp);
		goto out_no_entry;
	}

	/* The bucket lock is released by the callee in all cases. */
	error = cache_zap_wlocked_bucket(ncp, cnp, hash, blp);
	if (__predict_false(error != 0)) {
		zap_and_exit_bucket_fail++;
		cache_maybe_yield();
		goto retry;
	}
	counter_u64_add(numposzaps, 1);
	cache_free(ncp);
	SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
	return (1);
out_no_entry:
	SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
	counter_u64_add(nummisszap, 1);
	return (0);
}
1403 
/**
 * Lookup a name in the name cache
 *
 * # Arguments
 *
 * - dvp:	Parent directory in which to search.
 * - vpp:	Return argument.  Will contain desired vnode on cache hit.
 * - cnp:	Parameters of the name search.  The most interesting bits of
 *   		the cn_flags field have the following meanings:
 *   	- MAKEENTRY:	If clear, free an entry from the cache rather than look
 *   			it up.
 *   	- ISDOTDOT:	Must be set if and only if cn_nameptr == ".."
 * - tsp:	Return storage for cache timestamp.  On a successful (positive
 *   		or negative) lookup, tsp will be filled with any timespec that
 *   		was stored when this cache entry was created.  However, it will
 *   		be clear for "." entries.
 * - ticksp:	Return storage for alternate cache timestamp.  On a successful
 *   		(positive or negative) lookup, it will contain the ticks value
 *   		that was current when the cache entry was created, unless cnp
 *   		was ".".
 *
 * # Returns
 *
 * - -1:	A positive cache hit.  vpp will contain the desired vnode.
 * - ENOENT:	A negative cache hit, or dvp was recycled out from under us due
 *		to a forced unmount.  No vnode is returned in vpp.  If the
 *		entry is a whiteout, then the ISWHITEOUT flag will be set in
 *		cnp->cn_flags.
 * - 0:		A cache miss.  No vnode is returned in vpp.
 *
 * Note that vpp is not guaranteed to be left untouched when no vnode is
 * returned: a negative hit on "..", a lookup which had to be retried and a
 * dvp found doomed after relocking all store NULL into it.
 *
 * # Locking
 *
 * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
 * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
 * lock is not recursively acquired.
 */
int
cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct namecache_ts *ncp_ts;
	struct namecache *ncp;
	struct negstate *negstate;
	struct rwlock *blp;
	struct mtx *dvlp;
	uint32_t hash;
	enum vgetstate vs;
	int error, ltype;
	bool try_smr, doing_smr, whiteout;

#ifdef DEBUG_CACHE
	if (__predict_false(!doingcache)) {
		cnp->cn_flags &= ~MAKEENTRY;
		return (0);
	}
#endif

	if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.'))
		return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));

	if ((cnp->cn_flags & MAKEENTRY) == 0) {
		cache_remove_cnp(dvp, cnp);
		return (0);
	}

	/*
	 * The first attempt at a hashed lookup is lockless, unless this is a
	 * CREATE.  Purging a negative entry for CREATE requires locks, see
	 * the MPASS under negative_success.
	 */
	try_smr = true;
	if (cnp->cn_nameiop == CREATE)
		try_smr = false;
retry:
	doing_smr = false;
	blp = NULL;
	dvlp = NULL;
	error = 0;
	if (cnp->cn_namelen == 2 &&
	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
		/* ".." is resolved through v_cache_dd under the vnode lock. */
		counter_u64_add(dotdothits, 1);
		dvlp = VP2VNODELOCK(dvp);
		mtx_lock(dvlp);
		ncp = dvp->v_cache_dd;
		if (ncp == NULL) {
			SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
			    "..", NULL);
			mtx_unlock(dvlp);
			return (0);
		}
		/*
		 * Either a dedicated ".." entry, or the regular entry naming
		 * dvp, in which case the parent is its nc_dvp.
		 */
		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
			if (ncp->nc_flag & NCF_NEGATIVE)
				*vpp = NULL;
			else
				*vpp = ncp->nc_vp;
		} else
			*vpp = ncp->nc_dvp;
		/* Return failure if negative entry was found. */
		if (*vpp == NULL)
			goto negative_success;
		CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..",
		    dvp, cnp->cn_nameptr, *vpp);
		SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..",
		    *vpp);
		cache_out_ts(ncp, tsp, ticksp);
		/*
		 * For a regular entry carrying a dotdot timestamp, that is the
		 * one to report.
		 */
		if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
		    NCF_DTS && tsp != NULL) {
			ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
			*tsp = ncp_ts->nc_dotdottime;
		}
		goto success;
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
retry_hashed:
	/* SMR is tried at most once, later attempts use the bucket lock. */
	if (try_smr) {
		vfs_smr_enter();
		doing_smr = true;
		try_smr = false;
	} else {
		blp = HASH2BUCKETLOCK(hash);
		rw_rlock(blp);
	}

	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	/* We failed to find an entry */
	if (__predict_false(ncp == NULL)) {
		if (doing_smr)
			vfs_smr_exit();
		else
			rw_runlock(blp);
		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
		    NULL);
		counter_u64_add(nummiss, 1);
		return (0);
	}

	if (ncp->nc_flag & NCF_NEGATIVE)
		goto negative_success;

	/* We found a "positive" match, return the vnode */
	counter_u64_add(numposhits, 1);
	*vpp = ncp->nc_vp;
	CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
	    dvp, cnp->cn_nameptr, *vpp, ncp);
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name,
	    *vpp);
	cache_out_ts(ncp, tsp, ticksp);
success:
	/*
	 * On success we return a locked and ref'd vnode as per the lookup
	 * protocol.
	 */
	MPASS(dvp != *vpp);
	ltype = 0;	/* silence gcc warning */
	if (cnp->cn_flags & ISDOTDOT) {
		/* dvp is unlocked while the parent gets locked. */
		ltype = VOP_ISLOCKED(dvp);
		VOP_UNLOCK(dvp);
	}
	if (doing_smr) {
		/*
		 * The entry was found without any locks held; make sure it is
		 * still usable before relying on what was read from it.
		 */
		if (!cache_ncp_canuse(ncp)) {
			vfs_smr_exit();
			*vpp = NULL;
			goto retry;
		}
		vs = vget_prep_smr(*vpp);
		vfs_smr_exit();
		if (__predict_false(vs == VGET_NONE)) {
			*vpp = NULL;
			goto retry;
		}
	} else {
		vs = vget_prep(*vpp);
		cache_lookup_unlock(blp, dvlp);
	}
	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
	if (cnp->cn_flags & ISDOTDOT) {
		/* Restore the lock on dvp with the type it had before. */
		vn_lock(dvp, ltype | LK_RETRY);
		if (VN_IS_DOOMED(dvp)) {
			if (error == 0)
				vput(*vpp);
			*vpp = NULL;
			return (ENOENT);
		}
	}
	if (error) {
		*vpp = NULL;
		goto retry;
	}
	if ((cnp->cn_flags & ISLASTCN) &&
	    (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) {
		ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
	}
	return (-1);

negative_success:
	/* We found a negative match, and want to create it, so purge */
	if (cnp->cn_nameiop == CREATE) {
		MPASS(!doing_smr);
		counter_u64_add(numnegzaps, 1);
		goto zap_and_exit;
	}

	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
	cache_out_ts(ncp, tsp, ticksp);
	counter_u64_add(numneghits, 1);
	whiteout = (ncp->nc_flag & NCF_WHITE);

	if (doing_smr) {
		/*
		 * We need to take locks to promote an entry.
		 */
		negstate = NCP2NEGSTATE(ncp);
		/*
		 * Entries not marked hot, or no longer usable, are looked up
		 * again with the bucket lock held.
		 */
		if ((negstate->neg_flag & NEG_HOT) == 0 ||
		    !cache_ncp_canuse(ncp)) {
			vfs_smr_exit();
			doing_smr = false;
			goto retry_hashed;
		}
		vfs_smr_exit();
	} else {
		cache_negative_hit(ncp);
		cache_lookup_unlock(blp, dvlp);
	}
	if (whiteout)
		cnp->cn_flags |= ISWHITEOUT;
	return (ENOENT);

zap_and_exit:
	MPASS(!doing_smr);
	/*
	 * blp is set if the entry came from the hash table and NULL if it
	 * was found through v_cache_dd with only the vnode lock held.  Both
	 * callees release the lock we hold.
	 */
	if (blp != NULL)
		error = cache_zap_rlocked_bucket(ncp, cnp, hash, blp);
	else
		error = cache_zap_locked_vnode(ncp, dvp);
	if (__predict_false(error != 0)) {
		zap_and_exit_bucket_fail2++;
		cache_maybe_yield();
		goto retry;
	}
	cache_free(ncp);
	return (0);
}
1646 
/*
 * Locks held for the duration of an insertion into the cache: up to three
 * vnode locks and up to two bucket locks.  Unused slots are NULL.
 *
 * The assertions pin the array sizes the locking helpers below rely on.
 */
struct celockstate {
	struct mtx *vlp[3];
	struct rwlock *blp[2];
};
CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
1653 
1654 static inline void
1655 cache_celockstate_init(struct celockstate *cel)
1656 {
1657 
1658 	bzero(cel, sizeof(*cel));
1659 }
1660 
1661 static void
1662 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
1663     struct vnode *dvp)
1664 {
1665 	struct mtx *vlp1, *vlp2;
1666 
1667 	MPASS(cel->vlp[0] == NULL);
1668 	MPASS(cel->vlp[1] == NULL);
1669 	MPASS(cel->vlp[2] == NULL);
1670 
1671 	MPASS(vp != NULL || dvp != NULL);
1672 
1673 	vlp1 = VP2VNODELOCK(vp);
1674 	vlp2 = VP2VNODELOCK(dvp);
1675 	cache_sort_vnodes(&vlp1, &vlp2);
1676 
1677 	if (vlp1 != NULL) {
1678 		mtx_lock(vlp1);
1679 		cel->vlp[0] = vlp1;
1680 	}
1681 	mtx_lock(vlp2);
1682 	cel->vlp[1] = vlp2;
1683 }
1684 
1685 static void
1686 cache_unlock_vnodes_cel(struct celockstate *cel)
1687 {
1688 
1689 	MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
1690 
1691 	if (cel->vlp[0] != NULL)
1692 		mtx_unlock(cel->vlp[0]);
1693 	if (cel->vlp[1] != NULL)
1694 		mtx_unlock(cel->vlp[1]);
1695 	if (cel->vlp[2] != NULL)
1696 		mtx_unlock(cel->vlp[2]);
1697 }
1698 
1699 static bool
1700 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
1701 {
1702 	struct mtx *vlp;
1703 	bool ret;
1704 
1705 	cache_assert_vlp_locked(cel->vlp[0]);
1706 	cache_assert_vlp_locked(cel->vlp[1]);
1707 	MPASS(cel->vlp[2] == NULL);
1708 
1709 	MPASS(vp != NULL);
1710 	vlp = VP2VNODELOCK(vp);
1711 
1712 	ret = true;
1713 	if (vlp >= cel->vlp[1]) {
1714 		mtx_lock(vlp);
1715 	} else {
1716 		if (mtx_trylock(vlp))
1717 			goto out;
1718 		cache_lock_vnodes_cel_3_failures++;
1719 		cache_unlock_vnodes_cel(cel);
1720 		if (vlp < cel->vlp[0]) {
1721 			mtx_lock(vlp);
1722 			mtx_lock(cel->vlp[0]);
1723 			mtx_lock(cel->vlp[1]);
1724 		} else {
1725 			if (cel->vlp[0] != NULL)
1726 				mtx_lock(cel->vlp[0]);
1727 			mtx_lock(vlp);
1728 			mtx_lock(cel->vlp[1]);
1729 		}
1730 		ret = false;
1731 	}
1732 out:
1733 	cel->vlp[2] = vlp;
1734 	return (ret);
1735 }
1736 
1737 static void
1738 cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1,
1739     struct rwlock *blp2)
1740 {
1741 
1742 	MPASS(cel->blp[0] == NULL);
1743 	MPASS(cel->blp[1] == NULL);
1744 
1745 	cache_sort_vnodes(&blp1, &blp2);
1746 
1747 	if (blp1 != NULL) {
1748 		rw_wlock(blp1);
1749 		cel->blp[0] = blp1;
1750 	}
1751 	rw_wlock(blp2);
1752 	cel->blp[1] = blp2;
1753 }
1754 
1755 static void
1756 cache_unlock_buckets_cel(struct celockstate *cel)
1757 {
1758 
1759 	if (cel->blp[0] != NULL)
1760 		rw_wunlock(cel->blp[0]);
1761 	rw_wunlock(cel->blp[1]);
1762 }
1763 
/*
 * Lock part of the cache affected by the insertion.
 *
 * This means vnodelocks for dvp, vp and the relevant bucketlock.
 * However, insertion can result in removal of an old entry. In this
 * case we have an additional vnode and bucketlock pair to lock. If the
 * entry is negative, ncelock is locked instead of the vnode.
 *
 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
 * preserving the locking order (smaller address first).
 *
 * The old entry in question is the ".." entry hanging off vp->v_cache_dd
 * when vp is a directory.  All locks taken are recorded in cel, which must
 * have been initialized by the caller.
 */
static void
cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
    uint32_t hash)
{
	struct namecache *ncp;
	struct rwlock *blps[2];

	blps[0] = HASH2BUCKETLOCK(hash);
	for (;;) {
		blps[1] = NULL;
		cache_lock_vnodes_cel(cel, dvp, vp);
		/* Only directories can have a ".." entry to evict. */
		if (vp == NULL || vp->v_type != VDIR)
			break;
		ncp = vp->v_cache_dd;
		if (ncp == NULL)
			break;
		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
			break;
		MPASS(ncp->nc_dvp == vp);
		blps[1] = NCP2BUCKETLOCK(ncp);
		/* A negative ".." entry has no third vnode to lock. */
		if (ncp->nc_flag & NCF_NEGATIVE)
			break;
		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
			break;
		/*
		 * All vnodes got re-locked. Re-validate the state and if
		 * nothing changed we are done. Otherwise restart.
		 */
		if (ncp == vp->v_cache_dd &&
		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
		    blps[1] == NCP2BUCKETLOCK(ncp) &&
		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
			break;
		/* Clear the slots, the next iteration asserts they are NULL. */
		cache_unlock_vnodes_cel(cel);
		cel->vlp[0] = NULL;
		cel->vlp[1] = NULL;
		cel->vlp[2] = NULL;
	}
	/* Bucket locks are taken after all the vnode locks. */
	cache_lock_buckets_cel(cel, blps[0], blps[1]);
}
1815 
/*
 * Lock part of the cache affected by replacing the ".." entry of dvp.
 *
 * Same protocol as cache_enter_lock(), except that the entry which may have
 * to be evicted is the one hanging off dvp->v_cache_dd rather than
 * vp->v_cache_dd, and there is no vnode type check.  All locks taken are
 * recorded in cel, which must have been initialized by the caller.
 */
static void
cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
    uint32_t hash)
{
	struct namecache *ncp;
	struct rwlock *blps[2];

	blps[0] = HASH2BUCKETLOCK(hash);
	for (;;) {
		blps[1] = NULL;
		cache_lock_vnodes_cel(cel, dvp, vp);
		ncp = dvp->v_cache_dd;
		if (ncp == NULL)
			break;
		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
			break;
		MPASS(ncp->nc_dvp == dvp);
		blps[1] = NCP2BUCKETLOCK(ncp);
		/* A negative ".." entry has no third vnode to lock. */
		if (ncp->nc_flag & NCF_NEGATIVE)
			break;
		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
			break;
		/*
		 * The vnode locks were dropped and reacquired.  We are done
		 * if nothing changed in the meantime, otherwise start over.
		 */
		if (ncp == dvp->v_cache_dd &&
		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
		    blps[1] == NCP2BUCKETLOCK(ncp) &&
		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
			break;
		/* Clear the slots, the next iteration asserts they are NULL. */
		cache_unlock_vnodes_cel(cel);
		cel->vlp[0] = NULL;
		cel->vlp[1] = NULL;
		cel->vlp[2] = NULL;
	}
	/* Bucket locks are taken after all the vnode locks. */
	cache_lock_buckets_cel(cel, blps[0], blps[1]);
}
1850 
/*
 * Release everything recorded in cel: bucket locks first, then vnode locks,
 * i.e. the reverse of the order in which the two groups were acquired.
 */
static void
cache_enter_unlock(struct celockstate *cel)
{

	cache_unlock_buckets_cel(cel);
	cache_unlock_vnodes_cel(cel);
}
1858 
1859 static void __noinline
1860 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
1861     struct componentname *cnp)
1862 {
1863 	struct celockstate cel;
1864 	struct namecache *ncp;
1865 	uint32_t hash;
1866 	int len;
1867 
1868 	if (dvp->v_cache_dd == NULL)
1869 		return;
1870 	len = cnp->cn_namelen;
1871 	cache_celockstate_init(&cel);
1872 	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
1873 	cache_enter_lock_dd(&cel, dvp, vp, hash);
1874 	vn_seqc_write_begin(dvp);
1875 	ncp = dvp->v_cache_dd;
1876 	if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
1877 		KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
1878 		cache_zap_locked(ncp);
1879 	} else {
1880 		ncp = NULL;
1881 	}
1882 	dvp->v_cache_dd = NULL;
1883 	vn_seqc_write_end(dvp);
1884 	cache_enter_unlock(&cel);
1885 	cache_free(ncp);
1886 }
1887 
/*
 * Add an entry to the cache.
 *
 * - dvp:	Directory the name lives in.
 * - vp:	Vnode the name resolves to, or NULL to add a negative entry.
 * - cnp:	The name.  "." is never entered.  For a negative entry the
 *		ISWHITEOUT flag in cn_flags is recorded in the entry.
 * - tsp:	Optional timestamp to store with the entry.
 * - dtsp:	Optional timestamp for the ".." use of the entry.  It is only
 *		looked at when tsp is provided as well.
 *
 * Nothing is added if the cache is at its size limit, if an entry for the
 * name already exists (its timestamps are refreshed instead), or if a ".."
 * entry is being added and dvp->v_cache_dd got populated in the meantime.
 */
void
cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
    struct timespec *tsp, struct timespec *dtsp)
{
	struct celockstate cel;
	struct namecache *ncp, *n2, *ndd;
	struct namecache_ts *ncp_ts, *n2_ts;
	struct nchashhead *ncpp;
	uint32_t hash;
	int flag;
	int len;
	u_long lnumcache;

	CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr);
	VNPASS(!VN_IS_DOOMED(dvp), dvp);
	VNPASS(dvp->v_type != VNON, dvp);
	if (vp != NULL) {
		VNPASS(!VN_IS_DOOMED(vp), vp);
		VNPASS(vp->v_type != VNON, vp);
	}

#ifdef DEBUG_CACHE
	if (__predict_false(!doingcache))
		return;
#endif

	flag = 0;
	if (__predict_false(cnp->cn_nameptr[0] == '.')) {
		if (cnp->cn_namelen == 1)
			return;
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
			/* Evict whatever currently serves as ".." for dvp. */
			cache_enter_dotdot_prep(dvp, vp, cnp);
			flag = NCF_ISDOTDOT;
		}
	}

	/*
	 * Avoid blowout in namecache entries.
	 *
	 * The slot is reserved up front and given back on every path which
	 * ends up not inserting the entry.
	 */
	lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
	if (__predict_false(lnumcache >= ncsize)) {
		atomic_add_long(&numcache, -1);
		counter_u64_add(numdrops, 1);
		return;
	}

	cache_celockstate_init(&cel);
	ndd = NULL;
	ncp_ts = NULL;

	/*
	 * Calculate the hash key and setup as much of the new
	 * namecache entry as possible before acquiring the lock.
	 */
	ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
	/* NCF_WIP stays set until the entry is fully linked in, see below. */
	ncp->nc_flag = flag | NCF_WIP;
	ncp->nc_vp = vp;
	if (vp == NULL)
		cache_negative_init(ncp);
	ncp->nc_dvp = dvp;
	if (tsp != NULL) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		ncp_ts->nc_time = *tsp;
		ncp_ts->nc_ticks = ticks;
		ncp_ts->nc_nc.nc_flag |= NCF_TS;
		if (dtsp != NULL) {
			ncp_ts->nc_dotdottime = *dtsp;
			ncp_ts->nc_nc.nc_flag |= NCF_DTS;
		}
	}
	len = ncp->nc_nlen = cnp->cn_namelen;
	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
	memcpy(ncp->nc_name, cnp->cn_nameptr, len);
	ncp->nc_name[len] = '\0';
	cache_enter_lock(&cel, dvp, vp, hash);

	/*
	 * See if this vnode or negative entry is already in the cache
	 * with this name.  This can happen with concurrent lookups of
	 * the same path name.
	 */
	ncpp = NCHHASH(hash);
	CK_SLIST_FOREACH(n2, ncpp, nc_hash) {
		if (n2->nc_dvp == dvp &&
		    n2->nc_nlen == cnp->cn_namelen &&
		    !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
			MPASS(cache_ncp_canuse(n2));
			if ((n2->nc_flag & NCF_NEGATIVE) != 0)
				KASSERT(vp == NULL,
				    ("%s: found entry pointing to a different vnode (%p != %p)",
				    __func__, NULL, vp));
			else
				KASSERT(n2->nc_vp == vp,
				    ("%s: found entry pointing to a different vnode (%p != %p)",
				    __func__, n2->nc_vp, vp));
			/* Refresh the timestamps of the existing entry. */
			if (tsp != NULL) {
				KASSERT((n2->nc_flag & NCF_TS) != 0,
				    ("no NCF_TS"));
				n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
				n2_ts->nc_time = ncp_ts->nc_time;
				n2_ts->nc_ticks = ncp_ts->nc_ticks;
				if (dtsp != NULL) {
					n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
					n2_ts->nc_nc.nc_flag |= NCF_DTS;
				}
			}
			goto out_unlock_free;
		}
	}

	if (flag == NCF_ISDOTDOT) {
		/*
		 * See if we are trying to add .. entry, but some other lookup
		 * has populated v_cache_dd pointer already.
		 */
		if (dvp->v_cache_dd != NULL)
			goto out_unlock_free;
		KASSERT(vp == NULL || vp->v_type == VDIR,
		    ("wrong vnode type %p", vp));
		vn_seqc_write_begin(dvp);
		dvp->v_cache_dd = ncp;
		vn_seqc_write_end(dvp);
	}

	if (vp != NULL) {
		if (flag != NCF_ISDOTDOT) {
			/*
			 * For this case, the cache entry maps both the
			 * directory name in it and the name ".." for the
			 * directory's parent.
			 */
			vn_seqc_write_begin(vp);
			/*
			 * A dedicated ".." entry of vp is superseded by the
			 * new entry.  It is zapped here and freed once the
			 * locks are dropped.
			 */
			if ((ndd = vp->v_cache_dd) != NULL) {
				if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
					cache_zap_locked(ndd);
				else
					ndd = NULL;
			}
			vp->v_cache_dd = ncp;
			vn_seqc_write_end(vp);
		} else if (vp->v_type != VDIR) {
			if (vp->v_cache_dd != NULL) {
				vn_seqc_write_begin(vp);
				vp->v_cache_dd = NULL;
				vn_seqc_write_end(vp);
			}
		}
	}

	if (flag != NCF_ISDOTDOT) {
		/* The first entry under a directory puts a hold on it. */
		if (LIST_EMPTY(&dvp->v_cache_src)) {
			vhold(dvp);
			counter_u64_add(numcachehv, 1);
		}
		LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
	}

	/*
	 * If the entry is "negative", we place it into the
	 * "negative" cache queue, otherwise, we place it into the
	 * destination vnode's cache entries queue.
	 */
	if (vp != NULL) {
		TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
		SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
		    vp);
	} else {
		if (cnp->cn_flags & ISWHITEOUT)
			ncp->nc_flag |= NCF_WHITE;
		cache_negative_insert(ncp);
		SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
		    ncp->nc_name);
	}

	/*
	 * Insert the new namecache entry into the appropriate chain
	 * within the cache entries table.
	 */
	CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash);

	atomic_thread_fence_rel();
	/*
	 * Mark the entry as fully constructed.
	 * It is immutable past this point until its removal.
	 */
	atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP);

	cache_enter_unlock(&cel);
	/* Too many negative entries relative to the total, trim one. */
	if (numneg * ncnegfactor > lnumcache)
		cache_negative_zap_one();
	cache_free(ndd);
	return;
out_unlock_free:
	/* Nothing was inserted; give back the slot and the new entry. */
	cache_enter_unlock(&cel);
	atomic_add_long(&numcache, -1);
	cache_free(ncp);
	return;
}
2089 
/*
 * Return the smallest power of two strictly greater than val.
 *
 * If no such power of two is representable in u_int (val has its top bit
 * set), the largest representable power of two is returned instead.  Without
 * this guard the final shift would wrap res around to 0, which never exceeds
 * val, and the loop would spin forever.
 */
static u_int
cache_roundup_2(u_int val)
{
	u_int res;

	for (res = 1; res <= val; res <<= 1) {
		/* Stop before the next shift drops the only set bit. */
		if ((u_int)(res << 1) == 0)
			break;
	}

	return (res);
}
2100 
2101 static struct nchashhead *
2102 nchinittbl(u_long elements, u_long *hashmask)
2103 {
2104 	struct nchashhead *hashtbl;
2105 	u_long hashsize, i;
2106 
2107 	hashsize = cache_roundup_2(elements) / 2;
2108 
2109 	hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK);
2110 	for (i = 0; i < hashsize; i++)
2111 		CK_SLIST_INIT(&hashtbl[i]);
2112 	*hashmask = hashsize - 1;
2113 	return (hashtbl);
2114 }
2115 
/*
 * Release a hash table allocated from M_VFSCACHE, i.e. one created by
 * nchinittbl().
 */
static void
ncfreetbl(struct nchashhead *hashtbl)
{

	free(hashtbl, M_VFSCACHE);
}
2122 
/*
 * Name cache initialization, from vfs_init() when we are booting
 *
 * Creates the UMA zones used for namecache entries, sizes the hash table
 * from desiredvnodes and sets up the bucket locks, the vnode locks and the
 * negative entry lists.  Hooked up through the SYSINIT() below.
 */
static void
nchinit(void *dummy __unused)
{
	u_int i;

	/* Four zones: small and large entries, each with a "TS" variant. */
	cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE,
	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
	cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE,
	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
	cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE,
	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
	cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE,
	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);

	/* NOTE(review): presumably ties the zones to the VFS SMR domain. */
	VFS_SMR_ZONE_SET(cache_zone_small);
	VFS_SMR_ZONE_SET(cache_zone_small_ts);
	VFS_SMR_ZONE_SET(cache_zone_large);
	VFS_SMR_ZONE_SET(cache_zone_large_ts);

	ncsize = desiredvnodes * ncsizefactor;
	nchashtbl = nchinittbl(desiredvnodes * 2, &nchash);
	/*
	 * The bucket lock mask scales with the square of the CPU count, but
	 * is kept within [7, nchash].
	 */
	ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
	if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
		ncbuckethash = 7;
	if (ncbuckethash > nchash)
		ncbuckethash = nchash;
	/*
	 * NOTE(review): numbucketlocks, numvnodelocks and numneglists are
	 * defined outside this function; they are assumed to be derived from
	 * the masks computed here, so the statement order matters.
	 */
	bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
	    M_WAITOK | M_ZERO);
	for (i = 0; i < numbucketlocks; i++)
		rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE);
	/* The vnode lock array uses the same mask as the bucket locks. */
	ncvnodehash = ncbuckethash;
	vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
	    M_WAITOK | M_ZERO);
	for (i = 0; i < numvnodelocks; i++)
		mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
	/* Threshold consulted by cache_purgevfs() for non-forced purges. */
	ncpurgeminvnodes = numbucketlocks * 2;

	/* Negative entry lists, each with its own lock, plus the hot list. */
	neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE,
	    M_WAITOK | M_ZERO);
	for (i = 0; i < numneglists; i++) {
		mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
		TAILQ_INIT(&neglists[i].nl_list);
	}
	mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF);
	TAILQ_INIT(&ncneg_hot.nl_list);

	mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF);
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
2175 
2176 void
2177 cache_vnode_init(struct vnode *vp)
2178 {
2179 
2180 	LIST_INIT(&vp->v_cache_src);
2181 	TAILQ_INIT(&vp->v_cache_dst);
2182 	vp->v_cache_dd = NULL;
2183 	cache_prehash(vp);
2184 }
2185 
2186 void
2187 cache_changesize(u_long newmaxvnodes)
2188 {
2189 	struct nchashhead *new_nchashtbl, *old_nchashtbl;
2190 	u_long new_nchash, old_nchash;
2191 	struct namecache *ncp;
2192 	uint32_t hash;
2193 	u_long newncsize;
2194 	int i;
2195 
2196 	newncsize = newmaxvnodes * ncsizefactor;
2197 	newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
2198 	if (newmaxvnodes < numbucketlocks)
2199 		newmaxvnodes = numbucketlocks;
2200 
2201 	new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash);
2202 	/* If same hash table size, nothing to do */
2203 	if (nchash == new_nchash) {
2204 		ncfreetbl(new_nchashtbl);
2205 		return;
2206 	}
2207 	/*
2208 	 * Move everything from the old hash table to the new table.
2209 	 * None of the namecache entries in the table can be removed
2210 	 * because to do so, they have to be removed from the hash table.
2211 	 */
2212 	cache_lock_all_vnodes();
2213 	cache_lock_all_buckets();
2214 	old_nchashtbl = nchashtbl;
2215 	old_nchash = nchash;
2216 	nchashtbl = new_nchashtbl;
2217 	nchash = new_nchash;
2218 	for (i = 0; i <= old_nchash; i++) {
2219 		while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) {
2220 			hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
2221 			    ncp->nc_dvp);
2222 			CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash);
2223 			CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
2224 		}
2225 	}
2226 	ncsize = newncsize;
2227 	cache_unlock_all_buckets();
2228 	cache_unlock_all_vnodes();
2229 	ncfreetbl(old_nchashtbl);
2230 }
2231 
/*
 * Invalidate all entries from and to a particular vnode.
 *
 * Must be called with the vnode's name cache lock (VP2VNODELOCK(vp)) held;
 * the lock is released before returning.  Zapped entries are collected on a
 * local list and only freed after all locks have been dropped.
 */
static void
cache_purge_impl(struct vnode *vp)
{
	TAILQ_HEAD(, namecache) ncps;
	struct namecache *ncp, *nnp;
	struct mtx *vlp, *vlp2;

	TAILQ_INIT(&ncps);
	vlp = VP2VNODELOCK(vp);
	vlp2 = NULL;
	mtx_assert(vlp, MA_OWNED);
	/*
	 * Whenever zapping an entry fails, the whole scan is restarted from
	 * the top.  NOTE(review): presumably the helper had to drop and
	 * reacquire locks in that case, making the lists stale.
	 */
retry:
	/* Entries for which vp is the directory. */
	while (!LIST_EMPTY(&vp->v_cache_src)) {
		ncp = LIST_FIRST(&vp->v_cache_src);
		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
			goto retry;
		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
	}
	/* Entries which resolve to vp. */
	while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
		ncp = TAILQ_FIRST(&vp->v_cache_dst);
		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
			goto retry;
		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
	}
	/* Whatever is still hanging off v_cache_dd must be a ".." entry. */
	ncp = vp->v_cache_dd;
	if (ncp != NULL) {
		KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
		   ("lost dotdot link"));
		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
			goto retry;
		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
	}
	KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
	mtx_unlock(vlp);
	/* vlp2 is a second vnode lock possibly left held by the zap helper. */
	if (vlp2 != NULL)
		mtx_unlock(vlp2);
	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
		cache_free(ncp);
	}
}
2275 
2276 void
2277 cache_purge(struct vnode *vp)
2278 {
2279 	struct mtx *vlp;
2280 
2281 	SDT_PROBE1(vfs, namecache, purge, done, vp);
2282 	if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2283 	    vp->v_cache_dd == NULL)
2284 		return;
2285 	vlp = VP2VNODELOCK(vp);
2286 	mtx_lock(vlp);
2287 	cache_purge_impl(vp);
2288 }
2289 
2290 /*
2291  * Only to be used by vgone.
2292  */
2293 void
2294 cache_purge_vgone(struct vnode *vp)
2295 {
2296 	struct mtx *vlp;
2297 
2298 	VNPASS(VN_IS_DOOMED(vp), vp);
2299 	vlp = VP2VNODELOCK(vp);
2300 	if (!(LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2301 	    vp->v_cache_dd == NULL)) {
2302 		mtx_lock(vlp);
2303 		cache_purge_impl(vp);
2304 		mtx_assert(vlp, MA_NOTOWNED);
2305 		return;
2306 	}
2307 
2308 	/*
2309 	 * All the NULL pointer state we found above may be transient.
2310 	 * Serialize against a possible thread doing cache_purge.
2311 	 */
2312 	mtx_wait_unlocked(vlp);
2313 	if (!(LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2314 	    vp->v_cache_dd == NULL)) {
2315 		mtx_lock(vlp);
2316 		cache_purge_impl(vp);
2317 		mtx_assert(vlp, MA_NOTOWNED);
2318 		return;
2319 	}
2320 	return;
2321 }
2322 
2323 /*
2324  * Invalidate all negative entries for a particular directory vnode.
2325  */
2326 void
2327 cache_purge_negative(struct vnode *vp)
2328 {
2329 	TAILQ_HEAD(, namecache) ncps;
2330 	struct namecache *ncp, *nnp;
2331 	struct mtx *vlp;
2332 
2333 	CTR1(KTR_VFS, "cache_purge_negative(%p)", vp);
2334 	SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
2335 	if (LIST_EMPTY(&vp->v_cache_src))
2336 		return;
2337 	TAILQ_INIT(&ncps);
2338 	vlp = VP2VNODELOCK(vp);
2339 	mtx_lock(vlp);
2340 	LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
2341 		if (!(ncp->nc_flag & NCF_NEGATIVE))
2342 			continue;
2343 		cache_zap_negative_locked_vnode_kl(ncp, vp);
2344 		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2345 	}
2346 	mtx_unlock(vlp);
2347 	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2348 		cache_free(ncp);
2349 	}
2350 }
2351 
/*
 * Update the name cache for a rename of fvp (in fdvp) over tcnp (in tdvp).
 *
 * All vnodes passed in are asserted to be within a seqc write section
 * (ASSERT_VOP_IN_SEQC).  Every entry for the source vnode is purged, and so
 * is every entry for the target vnode if there is one.
 */
void
cache_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
    struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp)
{

	ASSERT_VOP_IN_SEQC(fdvp);
	ASSERT_VOP_IN_SEQC(fvp);
	ASSERT_VOP_IN_SEQC(tdvp);
	if (tvp != NULL)
		ASSERT_VOP_IN_SEQC(tvp);

	cache_purge(fvp);
	if (tvp != NULL) {
		cache_purge(tvp);
		/*
		 * Note the call lives inside KASSERT: it is only evaluated
		 * when assertions are compiled in.  With an existing target
		 * no entry for the name is expected to remain after the
		 * purge above, so the removal doubles as a sanity check.
		 */
		KASSERT(!cache_remove_cnp(tdvp, tcnp),
		    ("%s: lingering negative entry", __func__));
	} else {
		/* No target vnode: drop whatever entry the name may have. */
		cache_remove_cnp(tdvp, tcnp);
	}
}
2372 
/*
 * Flush all entries referencing a particular filesystem.
 *
 * Unless "force" is set, mounts with no more than ncpurgeminvnodes vnodes
 * are left alone.  The hash table is walked one bucket lock at a time;
 * entries whose directory vnode belongs to mp are zapped and collected on a
 * local list, which is freed after all locks have been dropped.
 */
void
cache_purgevfs(struct mount *mp, bool force)
{
	TAILQ_HEAD(, namecache) ncps;
	struct mtx *vlp1, *vlp2;
	struct rwlock *blp;
	struct nchashhead *bucket;
	struct namecache *ncp, *nnp;
	u_long i, j, n_nchash;
	int error;

	/* Scan hash tables for applicable entries */
	SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
	if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes)
		return;
	TAILQ_INIT(&ncps);
	n_nchash = nchash + 1;
	vlp1 = vlp2 = NULL;
	for (i = 0; i < numbucketlocks; i++) {
		blp = (struct rwlock *)&bucketlocks[i];
		rw_wlock(blp);
		/* Visit every hash chain covered by bucket lock i. */
		for (j = i; j < n_nchash; j += numbucketlocks) {
			/*
			 * A failed zap restarts the scan of the current
			 * chain from its head.
			 */
retry:
			bucket = &nchashtbl[j];
			CK_SLIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) {
				cache_assert_bucket_locked(ncp, RA_WLOCKED);
				if (ncp->nc_dvp->v_mount != mp)
					continue;
				error = cache_zap_wlocked_bucket_kl(ncp, blp,
				    &vlp1, &vlp2);
				if (error != 0)
					goto retry;
				TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst);
			}
		}
		rw_wunlock(blp);
		/* Only offer to yield while no vnode lock is carried over. */
		if (vlp1 == NULL && vlp2 == NULL)
			cache_maybe_yield();
	}
	/* Vnode locks possibly left held by the last zap. */
	if (vlp1 != NULL)
		mtx_unlock(vlp1);
	if (vlp2 != NULL)
		mtx_unlock(vlp2);

	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
		cache_free(ncp);
	}
}
2424 
2425 /*
2426  * Perform canonical checks and cache lookup and pass on to filesystem
2427  * through the vop_cachedlookup only if needed.
2428  */
2429 
2430 int
2431 vfs_cache_lookup(struct vop_lookup_args *ap)
2432 {
2433 	struct vnode *dvp;
2434 	int error;
2435 	struct vnode **vpp = ap->a_vpp;
2436 	struct componentname *cnp = ap->a_cnp;
2437 	int flags = cnp->cn_flags;
2438 
2439 	*vpp = NULL;
2440 	dvp = ap->a_dvp;
2441 
2442 	if (dvp->v_type != VDIR)
2443 		return (ENOTDIR);
2444 
2445 	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
2446 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
2447 		return (EROFS);
2448 
2449 	error = vn_dir_check_exec(dvp, cnp);
2450 	if (error != 0)
2451 		return (error);
2452 
2453 	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
2454 	if (error == 0)
2455 		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
2456 	if (error == -1)
2457 		return (0);
2458 	return (error);
2459 }
2460 
2461 /* Implementation of the getcwd syscall. */
2462 int
2463 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
2464 {
2465 	char *buf, *retbuf;
2466 	size_t buflen;
2467 	int error;
2468 
2469 	buflen = uap->buflen;
2470 	if (__predict_false(buflen < 2))
2471 		return (EINVAL);
2472 	if (buflen > MAXPATHLEN)
2473 		buflen = MAXPATHLEN;
2474 
2475 	buf = uma_zalloc(namei_zone, M_WAITOK);
2476 	error = vn_getcwd(buf, &retbuf, &buflen);
2477 	if (error == 0)
2478 		error = copyout(retbuf, uap->buf, buflen);
2479 	uma_zfree(namei_zone, buf);
2480 	return (error);
2481 }
2482 
/*
 * Resolve the current working directory into a path.
 *
 * buf is caller-provided scratch space of *buflen bytes.  On success
 * *retbuf points at the resulting string and *buflen has been updated by
 * the resolver.
 *
 * The lockless resolver is tried first; a negative return value from it
 * requests a retry with the locked variant, using a held pwd reference.
 */
int
vn_getcwd(char *buf, char **retbuf, size_t *buflen)
{
	struct pwd *pwd;
	int error;

	vfs_smr_enter();
	pwd = pwd_get_smr();
	error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf,
	    buflen, false, 0);
	/* The callee always leaves the SMR section before returning. */
	VFS_SMR_ASSERT_NOT_ENTERED();
	if (error < 0) {
		pwd = pwd_hold(curthread);
		error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf,
		    retbuf, buflen);
		pwd_drop(pwd);
	}

#ifdef KTRACE
	if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
		ktrnamei(*retbuf);
#endif
	return (error);
}
2507 
2508 static int
2509 kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
2510     size_t size, int flags, enum uio_seg pathseg)
2511 {
2512 	struct nameidata nd;
2513 	char *retbuf, *freebuf;
2514 	int error;
2515 
2516 	if (flags != 0)
2517 		return (EINVAL);
2518 	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1,
2519 	    pathseg, path, fd, &cap_fstat_rights, td);
2520 	if ((error = namei(&nd)) != 0)
2521 		return (error);
2522 	error = vn_fullpath_hardlink(&nd, &retbuf, &freebuf, &size);
2523 	if (error == 0) {
2524 		error = copyout(retbuf, buf, size);
2525 		free(freebuf, M_TEMP);
2526 	}
2527 	NDFREE(&nd, 0);
2528 	return (error);
2529 }
2530 
/*
 * __realpathat syscall entry point: forwards the user-supplied arguments to
 * kern___realpathat() with the path located in user space.
 */
int
sys___realpathat(struct thread *td, struct __realpathat_args *uap)
{

	return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
	    uap->flags, UIO_USERSPACE));
}
2538 
2539 /*
2540  * Retrieve the full filesystem path that correspond to a vnode from the name
2541  * cache (if available)
2542  */
2543 int
2544 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf)
2545 {
2546 	struct pwd *pwd;
2547 	char *buf;
2548 	size_t buflen;
2549 	int error;
2550 
2551 	if (__predict_false(vp == NULL))
2552 		return (EINVAL);
2553 
2554 	buflen = MAXPATHLEN;
2555 	buf = malloc(buflen, M_TEMP, M_WAITOK);
2556 	vfs_smr_enter();
2557 	pwd = pwd_get_smr();
2558 	error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, false, 0);
2559 	VFS_SMR_ASSERT_NOT_ENTERED();
2560 	if (error < 0) {
2561 		pwd = pwd_hold(curthread);
2562 		error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen);
2563 		pwd_drop(pwd);
2564 	}
2565 	if (error == 0)
2566 		*freebuf = buf;
2567 	else
2568 		free(buf, M_TEMP);
2569 	return (error);
2570 }
2571 
2572 /*
2573  * This function is similar to vn_fullpath, but it attempts to lookup the
2574  * pathname relative to the global root mount point.  This is required for the
2575  * auditing sub-system, as audited pathnames must be absolute, relative to the
2576  * global root mount point.
2577  */
2578 int
2579 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf)
2580 {
2581 	char *buf;
2582 	size_t buflen;
2583 	int error;
2584 
2585 	if (__predict_false(vp == NULL))
2586 		return (EINVAL);
2587 	buflen = MAXPATHLEN;
2588 	buf = malloc(buflen, M_TEMP, M_WAITOK);
2589 	vfs_smr_enter();
2590 	error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, false, 0);
2591 	VFS_SMR_ASSERT_NOT_ENTERED();
2592 	if (error < 0) {
2593 		error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen);
2594 	}
2595 	if (error == 0)
2596 		*freebuf = buf;
2597 	else
2598 		free(buf, M_TEMP);
2599 	return (error);
2600 }
2601 
2602 static struct namecache *
2603 vn_dd_from_dst(struct vnode *vp)
2604 {
2605 	struct namecache *ncp;
2606 
2607 	cache_assert_vnode_locked(vp);
2608 	TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) {
2609 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2610 			return (ncp);
2611 	}
2612 	return (NULL);
2613 }
2614 
/*
 * Translate a vnode into its name and its parent directory.
 *
 * On entry *vp is a referenced vnode.  On success the name is stored in buf
 * so that it ends where *buflen pointed (the buffer is filled from the
 * back), *buflen is decreased accordingly, and *vp is replaced with the
 * referenced parent directory while the reference on the original vnode is
 * dropped.  On failure the reference on the vnode is dropped as well and an
 * errno value is returned.
 *
 * The name cache is consulted first; if it has no usable entry the
 * filesystem is asked through VOP_VPTOCNP().
 */
int
vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, size_t *buflen)
{
	struct vnode *dvp;
	struct namecache *ncp;
	struct mtx *vlp;
	int error;

	vlp = VP2VNODELOCK(*vp);
	mtx_lock(vlp);
	/*
	 * Prefer the entry cached in v_cache_dd as long as it is not a ".."
	 * entry, otherwise search the list of entries resolving to *vp.
	 */
	ncp = (*vp)->v_cache_dd;
	if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) {
		KASSERT(ncp == vn_dd_from_dst(*vp),
		    ("%s: mismatch for dd entry (%p != %p)", __func__,
		    ncp, vn_dd_from_dst(*vp)));
	} else {
		ncp = vn_dd_from_dst(*vp);
	}
	if (ncp != NULL) {
		if (*buflen < ncp->nc_nlen) {
			mtx_unlock(vlp);
			vrele(*vp);
			counter_u64_add(numfullpathfail4, 1);
			error = ENOMEM;
			SDT_PROBE3(vfs, namecache, fullpath, return, error,
			    vp, NULL);
			return (error);
		}
		*buflen -= ncp->nc_nlen;
		memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
		SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
		    ncp->nc_name, vp);
		/*
		 * Swap to the parent: reference it while still holding the
		 * lock, release the child only after unlocking.
		 */
		dvp = *vp;
		*vp = ncp->nc_dvp;
		vref(*vp);
		mtx_unlock(vlp);
		vrele(dvp);
		return (0);
	}
	SDT_PROBE1(vfs, namecache, fullpath, miss, vp);

	/* Cache miss: ask the filesystem with the vnode locked shared. */
	mtx_unlock(vlp);
	vn_lock(*vp, LK_SHARED | LK_RETRY);
	error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen);
	vput(*vp);
	if (error) {
		counter_u64_add(numfullpathfail2, 1);
		SDT_PROBE3(vfs, namecache, fullpath, return,  error, vp, NULL);
		return (error);
	}

	*vp = dvp;
	if (VN_IS_DOOMED(dvp)) {
		/* forced unmount */
		vrele(dvp);
		error = ENOENT;
		SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
		return (error);
	}
	/*
	 * *vp has its use count incremented still.
	 */

	return (0);
}
2680 
/*
 * Resolve a directory to a pathname.
 *
 * The name of the directory can always be found in the namecache or fetched
 * from the filesystem. There is also guaranteed to be only one parent, meaning
 * we can just follow vnodes up until we find the root.
 *
 * The vnode must be referenced.
 *
 * The reference is consumed on every return path, success or failure.
 *
 * The path is assembled backwards from the end of buf.  If slash_prefixed is
 * false the routine first stores the terminating NUL itself; otherwise the
 * tail of the buffer is expected to already hold "addend" bytes starting
 * with a '/', placed there by the caller.  On success *retbuf points at the
 * start of the path and *len is set to the number of bytes it occupies,
 * addend included.
 */
static int
vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
    size_t *len, bool slash_prefixed, size_t addend)
{
#ifdef KDTRACE_HOOKS
	struct vnode *startvp = vp;
#endif
	struct vnode *vp1;
	size_t buflen;
	int error;

	VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
	VNPASS(vp->v_usecount > 0, vp);

	buflen = *len;

	if (!slash_prefixed) {
		MPASS(*len >= 2);
		buflen--;
		buf[buflen] = '\0';
	}

	error = 0;

	SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
	counter_u64_add(numfullpathcalls, 1);
	/* Walk towards the root, prepending one component per iteration. */
	while (vp != rdir && vp != rootvnode) {
		/*
		 * The vp vnode must be already fully constructed,
		 * since it is either found in namecache or obtained
		 * from VOP_VPTOCNP().  We may test for VV_ROOT safely
		 * without obtaining the vnode lock.
		 */
		if ((vp->v_vflag & VV_ROOT) != 0) {
			vn_lock(vp, LK_RETRY | LK_SHARED);

			/*
			 * With the vnode locked, check for races with
			 * unmount, forced or not.  Note that we
			 * already verified that vp is not equal to
			 * the root vnode, which means that
			 * mnt_vnodecovered can be NULL only for the
			 * case of unmount.
			 */
			if (VN_IS_DOOMED(vp) ||
			    (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
			    vp1->v_mountedhere != vp->v_mount) {
				vput(vp);
				error = ENOENT;
				SDT_PROBE3(vfs, namecache, fullpath, return,
				    error, vp, NULL);
				break;
			}

			/* Cross the mount point: continue from the covered vnode. */
			vref(vp1);
			vput(vp);
			vp = vp1;
			continue;
		}
		if (vp->v_type != VDIR) {
			vrele(vp);
			counter_u64_add(numfullpathfail1, 1);
			error = ENOTDIR;
			SDT_PROBE3(vfs, namecache, fullpath, return,
			    error, vp, NULL);
			break;
		}
		/* On failure vn_vptocnp() has already dropped the reference. */
		error = vn_vptocnp(&vp, curthread->td_ucred, buf, &buflen);
		if (error)
			break;
		/* Room is still needed for the separating slash. */
		if (buflen == 0) {
			vrele(vp);
			error = ENOMEM;
			SDT_PROBE3(vfs, namecache, fullpath, return, error,
			    startvp, NULL);
			break;
		}
		buf[--buflen] = '/';
		slash_prefixed = true;
	}
	if (error)
		return (error);
	/* Nothing was prepended: the result is just "/". */
	if (!slash_prefixed) {
		if (buflen == 0) {
			vrele(vp);
			counter_u64_add(numfullpathfail4, 1);
			SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
			    startvp, NULL);
			return (ENOMEM);
		}
		buf[--buflen] = '/';
	}
	counter_u64_add(numfullpathfound, 1);
	vrele(vp);

	*retbuf = buf + buflen;
	SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
	*len -= buflen;
	*len += addend;
	return (0);
}
2791 
2792 /*
2793  * Resolve an arbitrary vnode to a pathname.
2794  *
2795  * Note 2 caveats:
2796  * - hardlinks are not tracked, thus if the vnode is not a directory this can
2797  *   resolve to a different path than the one used to find it
2798  * - namecache is not mandatory, meaning names are not guaranteed to be added
2799  *   (in which case resolving fails)
2800  */
/*
 * Diagnostic helper for the lockless reverse lookup: record the source line
 * at which the lookup gave up.  Use through the cache_rev_failed() macro,
 * which supplies __LINE__ of the call site.
 */
static void __inline
cache_rev_failed_impl(int *reason, int line)
{

	*reason = line;
}
#define cache_rev_failed(var)	cache_rev_failed_impl((var), __LINE__)
2808 
2809 static int
2810 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
2811     char **retbuf, size_t *buflen, bool slash_prefixed, size_t addend)
2812 {
2813 #ifdef KDTRACE_HOOKS
2814 	struct vnode *startvp = vp;
2815 #endif
2816 	struct vnode *tvp;
2817 	struct mount *mp;
2818 	struct namecache *ncp;
2819 	size_t orig_buflen;
2820 	int reason;
2821 	int error;
2822 #ifdef KDTRACE_HOOKS
2823 	int i;
2824 #endif
2825 	seqc_t vp_seqc, tvp_seqc;
2826 	u_char nc_flag;
2827 
2828 	VFS_SMR_ASSERT_ENTERED();
2829 
2830 	if (!cache_fast_revlookup) {
2831 		vfs_smr_exit();
2832 		return (-1);
2833 	}
2834 
2835 	orig_buflen = *buflen;
2836 
2837 	if (!slash_prefixed) {
2838 		MPASS(*buflen >= 2);
2839 		*buflen -= 1;
2840 		buf[*buflen] = '\0';
2841 	}
2842 
2843 	if (vp == rdir || vp == rootvnode) {
2844 		if (!slash_prefixed) {
2845 			*buflen -= 1;
2846 			buf[*buflen] = '/';
2847 		}
2848 		goto out_ok;
2849 	}
2850 
2851 #ifdef KDTRACE_HOOKS
2852 	i = 0;
2853 #endif
2854 	error = -1;
2855 	vp_seqc = vn_seqc_read_any(vp);
2856 	if (seqc_in_modify(vp_seqc)) {
2857 		cache_rev_failed(&reason);
2858 		goto out_abort;
2859 	}
2860 
2861 	for (;;) {
2862 #ifdef KDTRACE_HOOKS
2863 		i++;
2864 #endif
2865 		if ((vp->v_vflag & VV_ROOT) != 0) {
2866 			mp = atomic_load_ptr(&vp->v_mount);
2867 			if (mp == NULL) {
2868 				cache_rev_failed(&reason);
2869 				goto out_abort;
2870 			}
2871 			tvp = atomic_load_ptr(&mp->mnt_vnodecovered);
2872 			tvp_seqc = vn_seqc_read_any(tvp);
2873 			if (seqc_in_modify(tvp_seqc)) {
2874 				cache_rev_failed(&reason);
2875 				goto out_abort;
2876 			}
2877 			if (!vn_seqc_consistent(vp, vp_seqc)) {
2878 				cache_rev_failed(&reason);
2879 				goto out_abort;
2880 			}
2881 			vp = tvp;
2882 			vp_seqc = tvp_seqc;
2883 			continue;
2884 		}
2885 		ncp = atomic_load_ptr(&vp->v_cache_dd);
2886 		if (ncp == NULL) {
2887 			cache_rev_failed(&reason);
2888 			goto out_abort;
2889 		}
2890 		nc_flag = atomic_load_char(&ncp->nc_flag);
2891 		if ((nc_flag & NCF_ISDOTDOT) != 0) {
2892 			cache_rev_failed(&reason);
2893 			goto out_abort;
2894 		}
2895 		if (!cache_ncp_canuse(ncp)) {
2896 			cache_rev_failed(&reason);
2897 			goto out_abort;
2898 		}
2899 		if (ncp->nc_nlen >= *buflen) {
2900 			cache_rev_failed(&reason);
2901 			error = ENOMEM;
2902 			goto out_abort;
2903 		}
2904 		*buflen -= ncp->nc_nlen;
2905 		memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
2906 		*buflen -= 1;
2907 		buf[*buflen] = '/';
2908 		tvp = ncp->nc_dvp;
2909 		tvp_seqc = vn_seqc_read_any(tvp);
2910 		if (seqc_in_modify(tvp_seqc)) {
2911 			cache_rev_failed(&reason);
2912 			goto out_abort;
2913 		}
2914 		if (!vn_seqc_consistent(vp, vp_seqc)) {
2915 			cache_rev_failed(&reason);
2916 			goto out_abort;
2917 		}
2918 		vp = tvp;
2919 		vp_seqc = tvp_seqc;
2920 		if (vp == rdir || vp == rootvnode)
2921 			break;
2922 	}
2923 out_ok:
2924 	vfs_smr_exit();
2925 	*retbuf = buf + *buflen;
2926 	*buflen = orig_buflen - *buflen + addend;
2927 	SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf);
2928 	return (0);
2929 
2930 out_abort:
2931 	*buflen = orig_buflen;
2932 	SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i);
2933 	vfs_smr_exit();
2934 	return (error);
2935 }
2936 
/*
 * Locked variant of resolving an arbitrary vnode to a pathname.
 *
 * Fails with EINVAL if fewer than 2 bytes are available.  The routine takes
 * its own reference on vp, which the callees consume.  For a non-directory
 * the name of the vnode itself is resolved first, then the remaining work
 * is delegated to vn_fullpath_dir() starting from the parent directory.
 */
static int
vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
    size_t *buflen)
{
	size_t orig_buflen;
	bool slash_prefixed;
	int error;

	if (*buflen < 2)
		return (EINVAL);

	orig_buflen = *buflen;

	vref(vp);
	slash_prefixed = false;
	if (vp->v_type != VDIR) {
		*buflen -= 1;
		buf[*buflen] = '\0';
		/* On failure vn_vptocnp() has already dropped the reference. */
		error = vn_vptocnp(&vp, curthread->td_ucred, buf, buflen);
		if (error)
			return (error);
		if (*buflen == 0) {
			vrele(vp);
			return (ENOMEM);
		}
		*buflen -= 1;
		buf[*buflen] = '/';
		slash_prefixed = true;
	}

	/* The addend is the number of bytes already stored at the tail. */
	return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, slash_prefixed,
	    orig_buflen - *buflen));
}
2970 
2971 /*
2972  * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
2973  *
2974  * Since the namecache does not track handlings, the caller is expected to first
2975  * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei.
2976  *
2977  * Then we have 2 cases:
2978  * - if the found vnode is a directory, the path can be constructed just by
2979  *   fullowing names up the chain
2980  * - otherwise we populate the buffer with the saved name and start resolving
2981  *   from the parent
2982  */
2983 static int
2984 vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, char **freebuf,
2985     size_t *buflen)
2986 {
2987 	char *buf, *tmpbuf;
2988 	struct pwd *pwd;
2989 	struct componentname *cnp;
2990 	struct vnode *vp;
2991 	size_t addend;
2992 	int error;
2993 	bool slash_prefixed;
2994 	enum vtype type;
2995 
2996 	if (*buflen < 2)
2997 		return (EINVAL);
2998 	if (*buflen > MAXPATHLEN)
2999 		*buflen = MAXPATHLEN;
3000 
3001 	slash_prefixed = false;
3002 
3003 	buf = malloc(*buflen, M_TEMP, M_WAITOK);
3004 
3005 	addend = 0;
3006 	vp = ndp->ni_vp;
3007 	/*
3008 	 * Check for VBAD to work around the vp_crossmp bug in lookup().
3009 	 *
3010 	 * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be
3011 	 * set to mount point's root vnode while ni_dvp will be vp_crossmp.
3012 	 * If the type is VDIR (like in this very case) we can skip looking
3013 	 * at ni_dvp in the first place. However, since vnodes get passed here
3014 	 * unlocked the target may transition to doomed state (type == VBAD)
3015 	 * before we get to evaluate the condition. If this happens, we will
3016 	 * populate part of the buffer and descend to vn_fullpath_dir with
3017 	 * vp == vp_crossmp. Prevent the problem by checking for VBAD.
3018 	 *
3019 	 * This should be atomic_load(&vp->v_type) but it is ilegal to take
3020 	 * an address of a bit field, even if said field is sized to char.
3021 	 * Work around the problem by reading the value into a full-sized enum
3022 	 * and then re-reading it with atomic_load which will still prevent
3023 	 * the compiler from re-reading down the road.
3024 	 */
3025 	type = vp->v_type;
3026 	type = atomic_load_int(&type);
3027 	if (type == VBAD) {
3028 		error = ENOENT;
3029 		goto out_bad;
3030 	}
3031 	if (type != VDIR) {
3032 		cnp = &ndp->ni_cnd;
3033 		addend = cnp->cn_namelen + 2;
3034 		if (*buflen < addend) {
3035 			error = ENOMEM;
3036 			goto out_bad;
3037 		}
3038 		*buflen -= addend;
3039 		tmpbuf = buf + *buflen;
3040 		tmpbuf[0] = '/';
3041 		memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen);
3042 		tmpbuf[addend - 1] = '\0';
3043 		slash_prefixed = true;
3044 		vp = ndp->ni_dvp;
3045 	}
3046 
3047 	vfs_smr_enter();
3048 	pwd = pwd_get_smr();
3049 	error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3050 	    slash_prefixed, addend);
3051 	VFS_SMR_ASSERT_NOT_ENTERED();
3052 	if (error < 0) {
3053 		pwd = pwd_hold(curthread);
3054 		vref(vp);
3055 		error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3056 		    slash_prefixed, addend);
3057 		pwd_drop(pwd);
3058 		if (error != 0)
3059 			goto out_bad;
3060 	}
3061 
3062 	*freebuf = buf;
3063 
3064 	return (0);
3065 out_bad:
3066 	free(buf, M_TEMP);
3067 	return (error);
3068 }
3069 
/*
 * Return the parent directory of vp as recorded in the name cache.
 *
 * The first non-".." entry resolving to vp is used.  Its directory vnode is
 * acquired with a shared lock without sleeping (LK_SHARED | LK_NOWAIT).
 * NULL is returned when there is no such entry or the vnode could not be
 * acquired.  vp itself must be locked by the caller.
 */
struct vnode *
vn_dir_dd_ino(struct vnode *vp)
{
	struct namecache *ncp;
	struct vnode *ddvp;
	struct mtx *vlp;
	enum vgetstate vs;

	ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
	vlp = VP2VNODELOCK(vp);
	mtx_lock(vlp);
	TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
			continue;
		ddvp = ncp->nc_dvp;
		/* Prepare the acquire under the lock, finish it after. */
		vs = vget_prep(ddvp);
		mtx_unlock(vlp);
		if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
			return (NULL);
		return (ddvp);
	}
	mtx_unlock(vlp);
	return (NULL);
}
3094 
3095 int
3096 vn_commname(struct vnode *vp, char *buf, u_int buflen)
3097 {
3098 	struct namecache *ncp;
3099 	struct mtx *vlp;
3100 	int l;
3101 
3102 	vlp = VP2VNODELOCK(vp);
3103 	mtx_lock(vlp);
3104 	TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
3105 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
3106 			break;
3107 	if (ncp == NULL) {
3108 		mtx_unlock(vlp);
3109 		return (ENOENT);
3110 	}
3111 	l = min(ncp->nc_nlen, buflen - 1);
3112 	memcpy(buf, ncp->nc_name, l);
3113 	mtx_unlock(vlp);
3114 	buf[l] = '\0';
3115 	return (0);
3116 }
3117 
/*
 * This function updates path string to vnode's full global path
 * and checks the size of the new path string against the pathlen argument.
 *
 * Requires a locked, referenced vnode.
 * Vnode is re-locked on success or ENODEV, otherwise unlocked.
 *
 * If vp is a directory, the call to vn_fullpath_global() always succeeds
 * because it falls back to the ".." lookup if the namecache lookup fails.
 *
 * NOTE(review): this function itself never sets ENODEV.  As written, every
 * failure path drops the caller's reference with vrele(); on success that
 * reference is replaced by the locked reference obtained from namei().
 */
int
vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
    u_int pathlen)
{
	struct nameidata nd;
	struct vnode *vp1;
	char *rpath, *fbuf;
	int error;

	ASSERT_VOP_ELOCKED(vp, __func__);

	/* Construct global filesystem path from vp. */
	VOP_UNLOCK(vp);
	error = vn_fullpath_global(vp, &rpath, &fbuf);

	if (error != 0) {
		vrele(vp);
		return (error);
	}

	/* The global path, NUL included, has to fit into the caller's buffer. */
	if (strlen(rpath) >= pathlen) {
		vrele(vp);
		error = ENAMETOOLONG;
		goto out;
	}

	/*
	 * Re-lookup the vnode by path to detect a possible rename.
	 * As a side effect, the vnode is relocked.
	 * If vnode was renamed, return ENOENT.
	 */
	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
	    UIO_SYSSPACE, path, td);
	error = namei(&nd);
	if (error != 0) {
		vrele(vp);
		goto out;
	}
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vp1 = nd.ni_vp;
	vrele(vp);
	if (vp1 == vp)
		/* Safe: the length was checked against pathlen above. */
		strcpy(path, rpath);
	else {
		vput(vp1);
		error = ENOENT;
	}

out:
	free(fbuf, M_TEMP);
	return (error);
}
3180 
3181 #ifdef DDB
3182 static void
3183 db_print_vpath(struct vnode *vp)
3184 {
3185 
3186 	while (vp != NULL) {
3187 		db_printf("%p: ", vp);
3188 		if (vp == rootvnode) {
3189 			db_printf("/");
3190 			vp = NULL;
3191 		} else {
3192 			if (vp->v_vflag & VV_ROOT) {
3193 				db_printf("<mount point>");
3194 				vp = vp->v_mount->mnt_vnodecovered;
3195 			} else {
3196 				struct namecache *ncp;
3197 				char *ncn;
3198 				int i;
3199 
3200 				ncp = TAILQ_FIRST(&vp->v_cache_dst);
3201 				if (ncp != NULL) {
3202 					ncn = ncp->nc_name;
3203 					for (i = 0; i < ncp->nc_nlen; i++)
3204 						db_printf("%c", *ncn++);
3205 					vp = ncp->nc_dvp;
3206 				} else {
3207 					vp = NULL;
3208 				}
3209 			}
3210 		}
3211 		db_printf("\n");
3212 	}
3213 
3214 	return;
3215 }
3216 
3217 DB_SHOW_COMMAND(vpath, db_show_vpath)
3218 {
3219 	struct vnode *vp;
3220 
3221 	if (!have_addr) {
3222 		db_printf("usage: show vpath <struct vnode *>\n");
3223 		return;
3224 	}
3225 
3226 	vp = (struct vnode *)addr;
3227 	db_print_vpath(vp);
3228 }
3229 
3230 #endif
3231 
3232 static bool __read_frequently cache_fast_lookup = true;
3233 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW,
3234     &cache_fast_lookup, 0, "");
3235 
/*
 * Value returned by the fast path helpers when the lookup was aborted or only
 * partially completed.  Parenthesized so that the negative constant expands
 * safely inside any expression.
 */
#define CACHE_FPL_FAILED	(-2020)
3237 
3238 static void
3239 cache_fpl_cleanup_cnp(struct componentname *cnp)
3240 {
3241 
3242 	uma_zfree(namei_zone, cnp->cn_pnbuf);
3243 #ifdef DIAGNOSTIC
3244 	cnp->cn_pnbuf = NULL;
3245 	cnp->cn_nameptr = NULL;
3246 #endif
3247 }
3248 
3249 static void
3250 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp)
3251 {
3252 	struct componentname *cnp;
3253 
3254 	cnp = &ndp->ni_cnd;
3255 	while (*(cnp->cn_nameptr) == '/') {
3256 		cnp->cn_nameptr++;
3257 		ndp->ni_pathlen--;
3258 	}
3259 
3260 	*dpp = ndp->ni_rootdir;
3261 }
3262 
/*
 * Snapshot of the path parsing state held in nameidata and its embedded
 * componentname.  cache_fpl_checkpoint() records it and cache_fpl_restore()
 * writes it back when the fast path has to hand the lookup over.
 *
 * NOTE(review): cn_flags is stored as an int; confirm that is wide enough for
 * every flag bit the componentname field can carry.
 */
struct nameidata_saved {
	long cn_namelen;	/* ni_cnd.cn_namelen */
	char *cn_nameptr;	/* ni_cnd.cn_nameptr */
	size_t ni_pathlen;	/* ni_pathlen */
	int cn_flags;		/* ni_cnd.cn_flags */
};
3273 
3274 struct cache_fpl {
3275 	struct nameidata *ndp;
3276 	struct componentname *cnp;
3277 	struct pwd *pwd;
3278 	struct vnode *dvp;
3279 	struct vnode *tvp;
3280 	seqc_t dvp_seqc;
3281 	seqc_t tvp_seqc;
3282 	struct nameidata_saved snd;
3283 	int line;
3284 	enum cache_fpl_status status:8;
3285 	bool in_smr;
3286 };
3287 
3288 static void
3289 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd)
3290 {
3291 
3292 	snd->cn_flags = fpl->ndp->ni_cnd.cn_flags;
3293 	snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen;
3294 	snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
3295 	snd->ni_pathlen = fpl->ndp->ni_pathlen;
3296 }
3297 
3298 static void
3299 cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd)
3300 {
3301 
3302 	fpl->ndp->ni_cnd.cn_flags = snd->cn_flags;
3303 	fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen;
3304 	fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr;
3305 	fpl->ndp->ni_pathlen = snd->ni_pathlen;
3306 }
3307 
/*
 * Assertions on the SMR state of a lookup.  With INVARIANTS both the in_smr
 * flag kept in the lookup state and the VFS_SMR_ASSERT_* check are verified.
 * Without INVARIANTS the macros expand to an empty statement and do not
 * evaluate their argument.
 */
#ifdef INVARIANTS
#define cache_fpl_smr_assert_entered(fpl) ({				\
	struct cache_fpl *_fpl = (fpl);					\
	MPASS(_fpl->in_smr == true);					\
	VFS_SMR_ASSERT_ENTERED();					\
})
#define cache_fpl_smr_assert_not_entered(fpl) ({			\
	struct cache_fpl *_fpl = (fpl);					\
	MPASS(_fpl->in_smr == false);					\
	VFS_SMR_ASSERT_NOT_ENTERED();					\
})
#else
#define cache_fpl_smr_assert_entered(fpl) do { } while (0)
#define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
#endif
3323 
/*
 * Enter the vfs SMR section and record that in the lookup state.  Unlike
 * cache_fpl_smr_enter() the previous value of in_smr is not checked.
 */
#define cache_fpl_smr_enter_initial(fpl) ({				\
	struct cache_fpl *_fpl = (fpl);					\
	vfs_smr_enter();						\
	_fpl->in_smr = true;						\
})
3329 
/*
 * Enter the vfs SMR section again; the lookup state must say it is currently
 * not entered.
 */
#define cache_fpl_smr_enter(fpl) ({					\
	struct cache_fpl *_fpl = (fpl);					\
	MPASS(_fpl->in_smr == false);					\
	vfs_smr_enter();						\
	_fpl->in_smr = true;						\
})
3336 
/*
 * Leave the vfs SMR section; the lookup state must say it is currently
 * entered.
 */
#define cache_fpl_smr_exit(fpl) ({					\
	struct cache_fpl *_fpl = (fpl);					\
	MPASS(_fpl->in_smr == true);					\
	vfs_smr_exit();							\
	_fpl->in_smr = false;						\
})
3343 
3344 static int
3345 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
3346 {
3347 
3348 	if (fpl->status != CACHE_FPL_STATUS_UNSET) {
3349 		KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
3350 		    ("%s: converting to abort from %d at %d, set at %d\n",
3351 		    __func__, fpl->status, line, fpl->line));
3352 	}
3353 	fpl->status = CACHE_FPL_STATUS_ABORTED;
3354 	fpl->line = line;
3355 	return (CACHE_FPL_FAILED);
3356 }
3357 
3358 #define cache_fpl_aborted(x)	cache_fpl_aborted_impl((x), __LINE__)
3359 
3360 static int
3361 cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
3362 {
3363 
3364 	KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3365 	    ("%s: setting to partial at %d, but already set to %d at %d\n",
3366 	    __func__, line, fpl->status, fpl->line));
3367 	cache_fpl_smr_assert_entered(fpl);
3368 	fpl->status = CACHE_FPL_STATUS_PARTIAL;
3369 	fpl->line = line;
3370 	return (CACHE_FPL_FAILED);
3371 }
3372 
3373 #define cache_fpl_partial(x)	cache_fpl_partial_impl((x), __LINE__)
3374 
3375 static int
3376 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line)
3377 {
3378 
3379 	KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3380 	    ("%s: setting to handled at %d, but already set to %d at %d\n",
3381 	    __func__, line, fpl->status, fpl->line));
3382 	cache_fpl_smr_assert_not_entered(fpl);
3383 	MPASS(error != CACHE_FPL_FAILED);
3384 	fpl->status = CACHE_FPL_STATUS_HANDLED;
3385 	fpl->line = line;
3386 	return (error);
3387 }
3388 
3389 #define cache_fpl_handled(x, e)	cache_fpl_handled_impl((x), (e), __LINE__)
3390 
3391 #define CACHE_FPL_SUPPORTED_CN_FLAGS \
3392 	(LOCKLEAF | LOCKPARENT | WANTPARENT | NOCACHE | FOLLOW | LOCKSHARED | SAVENAME | \
3393 	 SAVESTART | WILLBEDIR | ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK)
3394 
3395 #define CACHE_FPL_INTERNAL_CN_FLAGS \
3396 	(ISDOTDOT | MAKEENTRY | ISLASTCN)
3397 
3398 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
3399     "supported and internal flags overlap");
3400 
3401 static bool
3402 cache_fpl_islastcn(struct nameidata *ndp)
3403 {
3404 
3405 	return (*ndp->ni_next == 0);
3406 }
3407 
3408 static bool
3409 cache_fpl_isdotdot(struct componentname *cnp)
3410 {
3411 
3412 	if (cnp->cn_namelen == 2 &&
3413 	    cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
3414 		return (true);
3415 	return (false);
3416 }
3417 
3418 static bool
3419 cache_can_fplookup(struct cache_fpl *fpl)
3420 {
3421 	struct nameidata *ndp;
3422 	struct componentname *cnp;
3423 	struct thread *td;
3424 
3425 	ndp = fpl->ndp;
3426 	cnp = fpl->cnp;
3427 	td = cnp->cn_thread;
3428 
3429 	if (!cache_fast_lookup) {
3430 		cache_fpl_aborted(fpl);
3431 		return (false);
3432 	}
3433 #ifdef MAC
3434 	if (mac_vnode_check_lookup_enabled()) {
3435 		cache_fpl_aborted(fpl);
3436 		return (false);
3437 	}
3438 #endif
3439 	if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
3440 		cache_fpl_aborted(fpl);
3441 		return (false);
3442 	}
3443 	if (ndp->ni_dirfd != AT_FDCWD) {
3444 		cache_fpl_aborted(fpl);
3445 		return (false);
3446 	}
3447 	if (IN_CAPABILITY_MODE(td)) {
3448 		cache_fpl_aborted(fpl);
3449 		return (false);
3450 	}
3451 	if (AUDITING_TD(td)) {
3452 		cache_fpl_aborted(fpl);
3453 		return (false);
3454 	}
3455 	if (ndp->ni_startdir != NULL) {
3456 		cache_fpl_aborted(fpl);
3457 		return (false);
3458 	}
3459 	return (true);
3460 }
3461 
3462 static bool
3463 cache_fplookup_vnode_supported(struct vnode *vp)
3464 {
3465 
3466 	return (vp->v_type != VLNK);
3467 }
3468 
3469 /*
3470  * Move a negative entry to the hot list.
3471  *
3472  * We have to take locks, but they may be contended and in the worst
3473  * case we may need to go off CPU. We don't want to spin within the
3474  * smr section and we can't block with it. Instead we are going to
3475  * look up the entry again.
3476  */
3477 static int __noinline
3478 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
3479     uint32_t hash)
3480 {
3481 	struct componentname *cnp;
3482 	struct namecache *ncp;
3483 	struct neglist *neglist;
3484 	struct negstate *negstate;
3485 	struct vnode *dvp;
3486 	u_char nc_flag;
3487 
3488 	cnp = fpl->cnp;
3489 	dvp = fpl->dvp;
3490 
3491 	if (!vhold_smr(dvp))
3492 		return (cache_fpl_aborted(fpl));
3493 
3494 	neglist = NCP2NEGLIST(oncp);
3495 	cache_fpl_smr_exit(fpl);
3496 
3497 	mtx_lock(&ncneg_hot.nl_lock);
3498 	mtx_lock(&neglist->nl_lock);
3499 	/*
3500 	 * For hash iteration.
3501 	 */
3502 	cache_fpl_smr_enter(fpl);
3503 
3504 	/*
3505 	 * Avoid all surprises by only succeeding if we got the same entry and
3506 	 * bailing completely otherwise.
3507 	 *
3508 	 * In particular at this point there can be a new ncp which matches the
3509 	 * search but hashes to a different neglist.
3510 	 */
3511 	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
3512 		if (ncp == oncp)
3513 			break;
3514 	}
3515 
3516 	/*
3517 	 * No match to begin with.
3518 	 */
3519 	if (__predict_false(ncp == NULL)) {
3520 		goto out_abort;
3521 	}
3522 
3523 	/*
3524 	 * The newly found entry may be something different...
3525 	 */
3526 	if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
3527 	    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
3528 		goto out_abort;
3529 	}
3530 
3531 	/*
3532 	 * ... and not even negative.
3533 	 */
3534 	nc_flag = atomic_load_char(&ncp->nc_flag);
3535 	if ((nc_flag & NCF_NEGATIVE) == 0) {
3536 		goto out_abort;
3537 	}
3538 
3539 	if (__predict_false(!cache_ncp_canuse(ncp))) {
3540 		goto out_abort;
3541 	}
3542 
3543 	negstate = NCP2NEGSTATE(ncp);
3544 	if ((negstate->neg_flag & NEG_HOT) == 0) {
3545 		numhotneg++;
3546 		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
3547 		TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
3548 		negstate->neg_flag |= NEG_HOT;
3549 	}
3550 
3551 	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
3552 	counter_u64_add(numneghits, 1);
3553 	cache_fpl_smr_exit(fpl);
3554 	mtx_unlock(&neglist->nl_lock);
3555 	mtx_unlock(&ncneg_hot.nl_lock);
3556 	vdrop(dvp);
3557 	return (cache_fpl_handled(fpl, ENOENT));
3558 out_abort:
3559 	cache_fpl_smr_exit(fpl);
3560 	mtx_unlock(&neglist->nl_lock);
3561 	mtx_unlock(&ncneg_hot.nl_lock);
3562 	vdrop(dvp);
3563 	return (cache_fpl_aborted(fpl));
3564 }
3565 
3566 /*
3567  * The target vnode is not supported, prepare for the slow path to take over.
3568  */
3569 static int __noinline
3570 cache_fplookup_partial_setup(struct cache_fpl *fpl)
3571 {
3572 	struct nameidata *ndp;
3573 	struct componentname *cnp;
3574 	enum vgetstate dvs;
3575 	struct vnode *dvp;
3576 	struct pwd *pwd;
3577 	seqc_t dvp_seqc;
3578 
3579 	ndp = fpl->ndp;
3580 	cnp = fpl->cnp;
3581 	dvp = fpl->dvp;
3582 	dvp_seqc = fpl->dvp_seqc;
3583 
3584 	dvs = vget_prep_smr(dvp);
3585 	if (__predict_false(dvs == VGET_NONE)) {
3586 		cache_fpl_smr_exit(fpl);
3587 		return (cache_fpl_aborted(fpl));
3588 	}
3589 
3590 	cache_fpl_smr_exit(fpl);
3591 
3592 	vget_finish_ref(dvp, dvs);
3593 	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3594 		vrele(dvp);
3595 		return (cache_fpl_aborted(fpl));
3596 	}
3597 
3598 	pwd = pwd_hold(curthread);
3599 	if (fpl->pwd != pwd) {
3600 		vrele(dvp);
3601 		pwd_drop(pwd);
3602 		return (cache_fpl_aborted(fpl));
3603 	}
3604 
3605 	cache_fpl_restore(fpl, &fpl->snd);
3606 
3607 	ndp->ni_startdir = dvp;
3608 	cnp->cn_flags |= MAKEENTRY;
3609 	if (cache_fpl_islastcn(ndp))
3610 		cnp->cn_flags |= ISLASTCN;
3611 	if (cache_fpl_isdotdot(cnp))
3612 		cnp->cn_flags |= ISDOTDOT;
3613 
3614 	return (0);
3615 }
3616 
3617 static int
3618 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs)
3619 {
3620 	struct componentname *cnp;
3621 	struct vnode *tvp;
3622 	seqc_t tvp_seqc;
3623 	int error, lkflags;
3624 
3625 	cnp = fpl->cnp;
3626 	tvp = fpl->tvp;
3627 	tvp_seqc = fpl->tvp_seqc;
3628 
3629 	if ((cnp->cn_flags & LOCKLEAF) != 0) {
3630 		lkflags = LK_SHARED;
3631 		if ((cnp->cn_flags & LOCKSHARED) == 0)
3632 			lkflags = LK_EXCLUSIVE;
3633 		error = vget_finish(tvp, lkflags, tvs);
3634 		if (__predict_false(error != 0)) {
3635 			return (cache_fpl_aborted(fpl));
3636 		}
3637 	} else {
3638 		vget_finish_ref(tvp, tvs);
3639 	}
3640 
3641 	if (!vn_seqc_consistent(tvp, tvp_seqc)) {
3642 		if ((cnp->cn_flags & LOCKLEAF) != 0)
3643 			vput(tvp);
3644 		else
3645 			vrele(tvp);
3646 		return (cache_fpl_aborted(fpl));
3647 	}
3648 
3649 	return (cache_fpl_handled(fpl, 0));
3650 }
3651 
3652 /*
3653  * They want to possibly modify the state of the namecache.
3654  *
3655  * Don't try to match the API contract, just leave.
3656  * TODO: this leaves scalability on the table
3657  */
3658 static int
3659 cache_fplookup_final_modifying(struct cache_fpl *fpl)
3660 {
3661 	struct componentname *cnp;
3662 
3663 	cnp = fpl->cnp;
3664 	MPASS(cnp->cn_nameiop != LOOKUP);
3665 	return (cache_fpl_partial(fpl));
3666 }
3667 
3668 static int __noinline
3669 cache_fplookup_final_withparent(struct cache_fpl *fpl)
3670 {
3671 	struct componentname *cnp;
3672 	enum vgetstate dvs, tvs;
3673 	struct vnode *dvp, *tvp;
3674 	seqc_t dvp_seqc, tvp_seqc;
3675 	int error;
3676 
3677 	cnp = fpl->cnp;
3678 	dvp = fpl->dvp;
3679 	dvp_seqc = fpl->dvp_seqc;
3680 	tvp = fpl->tvp;
3681 	tvp_seqc = fpl->tvp_seqc;
3682 
3683 	MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0);
3684 
3685 	/*
3686 	 * This is less efficient than it can be for simplicity.
3687 	 */
3688 	dvs = vget_prep_smr(dvp);
3689 	if (__predict_false(dvs == VGET_NONE)) {
3690 		return (cache_fpl_aborted(fpl));
3691 	}
3692 	tvs = vget_prep_smr(tvp);
3693 	if (__predict_false(tvs == VGET_NONE)) {
3694 		cache_fpl_smr_exit(fpl);
3695 		vget_abort(dvp, dvs);
3696 		return (cache_fpl_aborted(fpl));
3697 	}
3698 
3699 	cache_fpl_smr_exit(fpl);
3700 
3701 	if ((cnp->cn_flags & LOCKPARENT) != 0) {
3702 		error = vget_finish(dvp, LK_EXCLUSIVE, dvs);
3703 		if (__predict_false(error != 0)) {
3704 			vget_abort(tvp, tvs);
3705 			return (cache_fpl_aborted(fpl));
3706 		}
3707 	} else {
3708 		vget_finish_ref(dvp, dvs);
3709 	}
3710 
3711 	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3712 		vget_abort(tvp, tvs);
3713 		if ((cnp->cn_flags & LOCKPARENT) != 0)
3714 			vput(dvp);
3715 		else
3716 			vrele(dvp);
3717 		return (cache_fpl_aborted(fpl));
3718 	}
3719 
3720 	error = cache_fplookup_final_child(fpl, tvs);
3721 	if (__predict_false(error != 0)) {
3722 		MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED);
3723 		if ((cnp->cn_flags & LOCKPARENT) != 0)
3724 			vput(dvp);
3725 		else
3726 			vrele(dvp);
3727 		return (error);
3728 	}
3729 
3730 	MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED);
3731 	return (0);
3732 }
3733 
3734 static int
3735 cache_fplookup_final(struct cache_fpl *fpl)
3736 {
3737 	struct componentname *cnp;
3738 	enum vgetstate tvs;
3739 	struct vnode *dvp, *tvp;
3740 	seqc_t dvp_seqc, tvp_seqc;
3741 
3742 	cnp = fpl->cnp;
3743 	dvp = fpl->dvp;
3744 	dvp_seqc = fpl->dvp_seqc;
3745 	tvp = fpl->tvp;
3746 	tvp_seqc = fpl->tvp_seqc;
3747 
3748 	VNPASS(cache_fplookup_vnode_supported(dvp), dvp);
3749 
3750 	if (cnp->cn_nameiop != LOOKUP) {
3751 		return (cache_fplookup_final_modifying(fpl));
3752 	}
3753 
3754 	if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0)
3755 		return (cache_fplookup_final_withparent(fpl));
3756 
3757 	tvs = vget_prep_smr(tvp);
3758 	if (__predict_false(tvs == VGET_NONE)) {
3759 		return (cache_fpl_partial(fpl));
3760 	}
3761 
3762 	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3763 		cache_fpl_smr_exit(fpl);
3764 		vget_abort(tvp, tvs);
3765 		return (cache_fpl_aborted(fpl));
3766 	}
3767 
3768 	cache_fpl_smr_exit(fpl);
3769 	return (cache_fplookup_final_child(fpl, tvs));
3770 }
3771 
3772 static int __noinline
3773 cache_fplookup_dot(struct cache_fpl *fpl)
3774 {
3775 	struct vnode *dvp;
3776 
3777 	dvp = fpl->dvp;
3778 
3779 	fpl->tvp = dvp;
3780 	fpl->tvp_seqc = vn_seqc_read_any(dvp);
3781 	if (seqc_in_modify(fpl->tvp_seqc)) {
3782 		return (cache_fpl_aborted(fpl));
3783 	}
3784 
3785 	counter_u64_add(dothits, 1);
3786 	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", dvp);
3787 
3788 	return (0);
3789 }
3790 
3791 static int __noinline
3792 cache_fplookup_dotdot(struct cache_fpl *fpl)
3793 {
3794 	struct nameidata *ndp;
3795 	struct componentname *cnp;
3796 	struct namecache *ncp;
3797 	struct vnode *dvp;
3798 	struct prison *pr;
3799 	u_char nc_flag;
3800 
3801 	ndp = fpl->ndp;
3802 	cnp = fpl->cnp;
3803 	dvp = fpl->dvp;
3804 
3805 	/*
3806 	 * XXX this is racy the same way regular lookup is
3807 	 */
3808 	for (pr = cnp->cn_cred->cr_prison; pr != NULL;
3809 	    pr = pr->pr_parent)
3810 		if (dvp == pr->pr_root)
3811 			break;
3812 
3813 	if (dvp == ndp->ni_rootdir ||
3814 	    dvp == ndp->ni_topdir ||
3815 	    dvp == rootvnode ||
3816 	    pr != NULL) {
3817 		fpl->tvp = dvp;
3818 		fpl->tvp_seqc = vn_seqc_read_any(dvp);
3819 		if (seqc_in_modify(fpl->tvp_seqc)) {
3820 			return (cache_fpl_aborted(fpl));
3821 		}
3822 		return (0);
3823 	}
3824 
3825 	if ((dvp->v_vflag & VV_ROOT) != 0) {
3826 		/*
3827 		 * TODO
3828 		 * The opposite of climb mount is needed here.
3829 		 */
3830 		return (cache_fpl_aborted(fpl));
3831 	}
3832 
3833 	ncp = atomic_load_ptr(&dvp->v_cache_dd);
3834 	if (ncp == NULL) {
3835 		return (cache_fpl_aborted(fpl));
3836 	}
3837 
3838 	nc_flag = atomic_load_char(&ncp->nc_flag);
3839 	if ((nc_flag & NCF_ISDOTDOT) != 0) {
3840 		if ((nc_flag & NCF_NEGATIVE) != 0)
3841 			return (cache_fpl_aborted(fpl));
3842 		fpl->tvp = ncp->nc_vp;
3843 	} else {
3844 		fpl->tvp = ncp->nc_dvp;
3845 	}
3846 
3847 	if (__predict_false(!cache_ncp_canuse(ncp))) {
3848 		return (cache_fpl_aborted(fpl));
3849 	}
3850 
3851 	fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp);
3852 	if (seqc_in_modify(fpl->tvp_seqc)) {
3853 		return (cache_fpl_partial(fpl));
3854 	}
3855 
3856 	counter_u64_add(dotdothits, 1);
3857 	return (0);
3858 }
3859 
3860 static int
3861 cache_fplookup_next(struct cache_fpl *fpl)
3862 {
3863 	struct componentname *cnp;
3864 	struct namecache *ncp;
3865 	struct negstate *negstate;
3866 	struct vnode *dvp, *tvp;
3867 	u_char nc_flag;
3868 	uint32_t hash;
3869 	bool neg_hot;
3870 
3871 	cnp = fpl->cnp;
3872 	dvp = fpl->dvp;
3873 
3874 	if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) {
3875 		return (cache_fplookup_dot(fpl));
3876 	}
3877 
3878 	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
3879 
3880 	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
3881 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
3882 		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
3883 			break;
3884 	}
3885 
3886 	/*
3887 	 * If there is no entry we have to punt to the slow path to perform
3888 	 * actual lookup. Should there be nothing with this name a negative
3889 	 * entry will be created.
3890 	 */
3891 	if (__predict_false(ncp == NULL)) {
3892 		return (cache_fpl_partial(fpl));
3893 	}
3894 
3895 	tvp = atomic_load_ptr(&ncp->nc_vp);
3896 	nc_flag = atomic_load_char(&ncp->nc_flag);
3897 	if ((nc_flag & NCF_NEGATIVE) != 0) {
3898 		/*
3899 		 * If they want to create an entry we need to replace this one.
3900 		 */
3901 		if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) {
3902 			return (cache_fpl_partial(fpl));
3903 		}
3904 		negstate = NCP2NEGSTATE(ncp);
3905 		neg_hot = ((negstate->neg_flag & NEG_HOT) != 0);
3906 		if (__predict_false(!cache_ncp_canuse(ncp))) {
3907 			return (cache_fpl_partial(fpl));
3908 		}
3909 		if (__predict_false((nc_flag & NCF_WHITE) != 0)) {
3910 			return (cache_fpl_partial(fpl));
3911 		}
3912 		if (!neg_hot) {
3913 			return (cache_fplookup_negative_promote(fpl, ncp, hash));
3914 		}
3915 		SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
3916 		    ncp->nc_name);
3917 		counter_u64_add(numneghits, 1);
3918 		cache_fpl_smr_exit(fpl);
3919 		return (cache_fpl_handled(fpl, ENOENT));
3920 	}
3921 
3922 	if (__predict_false(!cache_ncp_canuse(ncp))) {
3923 		return (cache_fpl_partial(fpl));
3924 	}
3925 
3926 	fpl->tvp = tvp;
3927 	fpl->tvp_seqc = vn_seqc_read_any(tvp);
3928 	if (seqc_in_modify(fpl->tvp_seqc)) {
3929 		return (cache_fpl_partial(fpl));
3930 	}
3931 
3932 	if (!cache_fplookup_vnode_supported(tvp)) {
3933 		return (cache_fpl_partial(fpl));
3934 	}
3935 
3936 	counter_u64_add(numposhits, 1);
3937 	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
3938 	return (0);
3939 }
3940 
3941 static bool
3942 cache_fplookup_mp_supported(struct mount *mp)
3943 {
3944 
3945 	if (mp == NULL)
3946 		return (false);
3947 	if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
3948 		return (false);
3949 	return (true);
3950 }
3951 
3952 /*
3953  * Walk up the mount stack (if any).
3954  *
3955  * Correctness is provided in the following ways:
3956  * - all vnodes are protected from freeing with SMR
3957  * - struct mount objects are type stable making them always safe to access
3958  * - stability of the particular mount is provided by busying it
3959  * - relationship between the vnode which is mounted on and the mount is
3960  *   verified with the vnode sequence counter after busying
3961  * - association between root vnode of the mount and the mount is protected
3962  *   by busy
3963  *
3964  * From that point on we can read the sequence counter of the root vnode
3965  * and get the next mount on the stack (if any) using the same protection.
3966  *
3967  * By the end of successful walk we are guaranteed the reached state was
3968  * indeed present at least at some point which matches the regular lookup.
3969  */
3970 static int __noinline
3971 cache_fplookup_climb_mount(struct cache_fpl *fpl)
3972 {
3973 	struct mount *mp, *prev_mp;
3974 	struct vnode *vp;
3975 	seqc_t vp_seqc;
3976 
3977 	vp = fpl->tvp;
3978 	vp_seqc = fpl->tvp_seqc;
3979 
3980 	VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp);
3981 	mp = atomic_load_ptr(&vp->v_mountedhere);
3982 	if (mp == NULL)
3983 		return (0);
3984 
3985 	prev_mp = NULL;
3986 	for (;;) {
3987 		if (!vfs_op_thread_enter_crit(mp)) {
3988 			if (prev_mp != NULL)
3989 				vfs_op_thread_exit_crit(prev_mp);
3990 			return (cache_fpl_partial(fpl));
3991 		}
3992 		if (prev_mp != NULL)
3993 			vfs_op_thread_exit_crit(prev_mp);
3994 		if (!vn_seqc_consistent(vp, vp_seqc)) {
3995 			vfs_op_thread_exit_crit(mp);
3996 			return (cache_fpl_partial(fpl));
3997 		}
3998 		if (!cache_fplookup_mp_supported(mp)) {
3999 			vfs_op_thread_exit_crit(mp);
4000 			return (cache_fpl_partial(fpl));
4001 		}
4002 		vp = atomic_load_ptr(&mp->mnt_rootvnode);
4003 		if (vp == NULL || VN_IS_DOOMED(vp)) {
4004 			vfs_op_thread_exit_crit(mp);
4005 			return (cache_fpl_partial(fpl));
4006 		}
4007 		vp_seqc = vn_seqc_read_any(vp);
4008 		if (seqc_in_modify(vp_seqc)) {
4009 			vfs_op_thread_exit_crit(mp);
4010 			return (cache_fpl_partial(fpl));
4011 		}
4012 		prev_mp = mp;
4013 		mp = atomic_load_ptr(&vp->v_mountedhere);
4014 		if (mp == NULL)
4015 			break;
4016 	}
4017 
4018 	vfs_op_thread_exit_crit(prev_mp);
4019 	fpl->tvp = vp;
4020 	fpl->tvp_seqc = vp_seqc;
4021 	return (0);
4022 }
4023 
4024 static bool
4025 cache_fplookup_need_climb_mount(struct cache_fpl *fpl)
4026 {
4027 	struct mount *mp;
4028 	struct vnode *vp;
4029 
4030 	vp = fpl->tvp;
4031 
4032 	/*
4033 	 * Hack: while this is a union, the pointer tends to be NULL so save on
4034 	 * a branch.
4035 	 */
4036 	mp = atomic_load_ptr(&vp->v_mountedhere);
4037 	if (mp == NULL)
4038 		return (false);
4039 	if (vp->v_type == VDIR)
4040 		return (true);
4041 	return (false);
4042 }
4043 
4044 /*
4045  * Parse the path.
4046  *
4047  * The code is mostly copy-pasted from regular lookup, see lookup().
4048  * The structure is maintained along with comments for easier maintenance.
4049  * Deduplicating the code will become feasible after fast path lookup
4050  * becomes more feature-complete.
4051  */
4052 static int
4053 cache_fplookup_parse(struct cache_fpl *fpl)
4054 {
4055 	struct nameidata *ndp;
4056 	struct componentname *cnp;
4057 	char *cp;
4058 
4059 	ndp = fpl->ndp;
4060 	cnp = fpl->cnp;
4061 
4062 	/*
4063 	 * Search a new directory.
4064 	 *
4065 	 * The last component of the filename is left accessible via
4066 	 * cnp->cn_nameptr for callers that need the name. Callers needing
4067 	 * the name set the SAVENAME flag. When done, they assume
4068 	 * responsibility for freeing the pathname buffer.
4069 	 */
4070 	for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
4071 		continue;
4072 	cnp->cn_namelen = cp - cnp->cn_nameptr;
4073 	if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
4074 		cache_fpl_smr_exit(fpl);
4075 		return (cache_fpl_handled(fpl, ENAMETOOLONG));
4076 	}
4077 	ndp->ni_pathlen -= cnp->cn_namelen;
4078 	KASSERT(ndp->ni_pathlen <= PATH_MAX,
4079 	    ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
4080 	ndp->ni_next = cp;
4081 
4082 	/*
4083 	 * Replace multiple slashes by a single slash and trailing slashes
4084 	 * by a null.  This must be done before VOP_LOOKUP() because some
4085 	 * fs's don't know about trailing slashes.  Remember if there were
4086 	 * trailing slashes to handle symlinks, existing non-directories
4087 	 * and non-existing files that won't be directories specially later.
4088 	 */
4089 	while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
4090 		cp++;
4091 		ndp->ni_pathlen--;
4092 		if (*cp == '\0') {
4093 			/*
4094 			 * TODO
4095 			 * Regular lookup performs the following:
4096 			 * *ndp->ni_next = '\0';
4097 			 * cnp->cn_flags |= TRAILINGSLASH;
4098 			 *
4099 			 * Which is problematic since it modifies data read
4100 			 * from userspace. Then if fast path lookup was to
4101 			 * abort we would have to either restore it or convey
4102 			 * the flag. Since this is a corner case just ignore
4103 			 * it for simplicity.
4104 			 */
4105 			return (cache_fpl_partial(fpl));
4106 		}
4107 	}
4108 	ndp->ni_next = cp;
4109 
4110 	/*
4111 	 * Check for degenerate name (e.g. / or "")
4112 	 * which is a way of talking about a directory,
4113 	 * e.g. like "/." or ".".
4114 	 *
4115 	 * TODO
4116 	 * Another corner case handled by the regular lookup
4117 	 */
4118 	if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
4119 		return (cache_fpl_partial(fpl));
4120 	}
4121 	return (0);
4122 }
4123 
4124 static void
4125 cache_fplookup_parse_advance(struct cache_fpl *fpl)
4126 {
4127 	struct nameidata *ndp;
4128 	struct componentname *cnp;
4129 
4130 	ndp = fpl->ndp;
4131 	cnp = fpl->cnp;
4132 
4133 	cnp->cn_nameptr = ndp->ni_next;
4134 	while (*cnp->cn_nameptr == '/') {
4135 		cnp->cn_nameptr++;
4136 		ndp->ni_pathlen--;
4137 	}
4138 }
4139 
4140 static int __noinline
4141 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
4142 {
4143 
4144 	switch (error) {
4145 	case EAGAIN:
4146 		/*
4147 		 * Can happen when racing against vgone.
4148 		 * */
4149 	case EOPNOTSUPP:
4150 		cache_fpl_partial(fpl);
4151 		break;
4152 	default:
4153 		/*
4154 		 * See the API contract for VOP_FPLOOKUP_VEXEC.
4155 		 */
4156 		if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
4157 			error = cache_fpl_aborted(fpl);
4158 		} else {
4159 			cache_fpl_smr_exit(fpl);
4160 			cache_fpl_handled(fpl, error);
4161 		}
4162 		break;
4163 	}
4164 	return (error);
4165 }
4166 
4167 static int
4168 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
4169 {
4170 	struct nameidata *ndp;
4171 	struct componentname *cnp;
4172 	struct mount *mp;
4173 	int error;
4174 
4175 	error = CACHE_FPL_FAILED;
4176 	ndp = fpl->ndp;
4177 	cnp = fpl->cnp;
4178 
4179 	cache_fpl_checkpoint(fpl, &fpl->snd);
4180 
4181 	fpl->dvp = dvp;
4182 	fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
4183 	if (seqc_in_modify(fpl->dvp_seqc)) {
4184 		cache_fpl_aborted(fpl);
4185 		goto out;
4186 	}
4187 	mp = atomic_load_ptr(&fpl->dvp->v_mount);
4188 	if (!cache_fplookup_mp_supported(mp)) {
4189 		cache_fpl_aborted(fpl);
4190 		goto out;
4191 	}
4192 
4193 	VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
4194 
4195 	for (;;) {
4196 		error = cache_fplookup_parse(fpl);
4197 		if (__predict_false(error != 0)) {
4198 			break;
4199 		}
4200 
4201 		VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
4202 
4203 		error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
4204 		if (__predict_false(error != 0)) {
4205 			error = cache_fplookup_failed_vexec(fpl, error);
4206 			break;
4207 		}
4208 
4209 		if (__predict_false(cache_fpl_isdotdot(cnp))) {
4210 			error = cache_fplookup_dotdot(fpl);
4211 			if (__predict_false(error != 0)) {
4212 				break;
4213 			}
4214 		} else {
4215 			error = cache_fplookup_next(fpl);
4216 			if (__predict_false(error != 0)) {
4217 				break;
4218 			}
4219 
4220 			VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
4221 
4222 			if (cache_fplookup_need_climb_mount(fpl)) {
4223 				error = cache_fplookup_climb_mount(fpl);
4224 				if (__predict_false(error != 0)) {
4225 					break;
4226 				}
4227 			}
4228 		}
4229 
4230 		VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
4231 
4232 		if (cache_fpl_islastcn(ndp)) {
4233 			error = cache_fplookup_final(fpl);
4234 			break;
4235 		}
4236 
4237 		if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
4238 			error = cache_fpl_aborted(fpl);
4239 			break;
4240 		}
4241 
4242 		fpl->dvp = fpl->tvp;
4243 		fpl->dvp_seqc = fpl->tvp_seqc;
4244 
4245 		cache_fplookup_parse_advance(fpl);
4246 		cache_fpl_checkpoint(fpl, &fpl->snd);
4247 	}
4248 out:
4249 	switch (fpl->status) {
4250 	case CACHE_FPL_STATUS_UNSET:
4251 		__assert_unreachable();
4252 		break;
4253 	case CACHE_FPL_STATUS_PARTIAL:
4254 		cache_fpl_smr_assert_entered(fpl);
4255 		return (cache_fplookup_partial_setup(fpl));
4256 	case CACHE_FPL_STATUS_ABORTED:
4257 		if (fpl->in_smr)
4258 			cache_fpl_smr_exit(fpl);
4259 		return (CACHE_FPL_FAILED);
4260 	case CACHE_FPL_STATUS_HANDLED:
4261 		MPASS(error != CACHE_FPL_FAILED);
4262 		cache_fpl_smr_assert_not_entered(fpl);
4263 		if (__predict_false(error != 0)) {
4264 			ndp->ni_dvp = NULL;
4265 			ndp->ni_vp = NULL;
4266 			cache_fpl_cleanup_cnp(cnp);
4267 			return (error);
4268 		}
4269 		ndp->ni_dvp = fpl->dvp;
4270 		ndp->ni_vp = fpl->tvp;
4271 		if (cnp->cn_flags & SAVENAME)
4272 			cnp->cn_flags |= HASBUF;
4273 		else
4274 			cache_fpl_cleanup_cnp(cnp);
4275 		return (error);
4276 	}
4277 }
4278 
4279 /*
4280  * Fast path lookup protected with SMR and sequence counters.
4281  *
4282  * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
4283  *
4284  * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
4285  * outlined below.
4286  *
4287  * Traditional vnode lookup conceptually looks like this:
4288  *
4289  * vn_lock(current);
4290  * for (;;) {
4291  *	next = find();
4292  *	vn_lock(next);
4293  *	vn_unlock(current);
4294  *	current = next;
4295  *	if (last)
4296  *	    break;
4297  * }
4298  * return (current);
4299  *
4300  * Each jump to the next vnode is safe memory-wise and atomic with respect to
4301  * any modifications thanks to holding respective locks.
4302  *
4303  * The same guarantee can be provided with a combination of safe memory
4304  * reclamation and sequence counters instead. If all operations which affect
4305  * the relationship between the current vnode and the one we are looking for
4306  * also modify the counter, we can verify whether all the conditions held as
4307  * we made the jump. This includes things like permissions, mount points etc.
4308  * Counter modification is provided by enclosing relevant places in
4309  * vn_seqc_write_begin()/end() calls.
4310  *
4311  * Thus this translates to:
4312  *
4313  * vfs_smr_enter();
4314  * dvp_seqc = seqc_read_any(dvp);
4315  * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
4316  *     abort();
4317  * for (;;) {
4318  * 	tvp = find();
4319  * 	tvp_seqc = seqc_read_any(tvp);
4320  * 	if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
4321  * 	    abort();
4322  * 	if (!seqc_consistent(dvp, dvp_seqc)) // someone is altering the vnode
4323  * 	    abort();
4324  * 	dvp = tvp; // we know nothing of importance has changed
4325  * 	dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
4326  * 	if (last)
4327  * 	    break;
4328  * }
4329  * vget(); // secure the vnode
4330  * if (!seqc_consistent(tvp, tvp_seqc)) // final check
4331  *     abort();
4332  * // at this point we know nothing has changed for any parent<->child pair
4333  * // as they were crossed during the lookup, meaning we matched the guarantee
4334  * // of the locked variant
4335  * return (tvp);
4336  *
4337  * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
4338  * - they are called while within vfs_smr protection which they must never exit
4339  * - EAGAIN can be returned to denote checking could not be performed, it is
4340  *   always valid to return it
4341  * - if the sequence counter has not changed the result must be valid
4342  * - if the sequence counter has changed both false positives and false negatives
4343  *   are permitted (since the result will be rejected later)
4344  * - for simple cases of unix permission checks vaccess_vexec_smr can be used
4345  *
4346  * Caveats to watch out for:
4347  * - vnodes are passed unlocked and unreferenced with nothing stopping
4348  *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
4349  *   to use atomic_load_ptr to fetch it.
4350  * - the aforementioned object can also get freed, meaning absent other means it
4351  *   should be protected with vfs_smr
4352  * - either safely checking permissions as they are modified or guaranteeing
4353  *   their stability is left to the routine
 *
 * Arguments and results of cache_fplookup() itself:
 * - ndp: the lookup request; ni_lcf is asserted to be 0 on entry and the
 *   pathname is read from ndp->ni_cnd.cn_pnbuf
 * - status: always written with the final cache_fpl status before returning
 * - pwdp: written only when the final status is CACHE_FPL_STATUS_PARTIAL
 * - EOPNOTSUPP is returned when cache_can_fplookup() rejects the request,
 *   otherwise the value produced by cache_fplookup_impl() is passed through
4354  */
4355 int
4356 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
4357     struct pwd **pwdp)
4358 {
4359 	struct cache_fpl fpl;
4360 	struct pwd *pwd;
4361 	struct vnode *dvp;
4362 	struct componentname *cnp;
4363 	struct nameidata_saved orig;
4364 	int error;
4365 
4366 	MPASS(ndp->ni_lcf == 0);
4367 
	/* Seed the per-lookup state which is handed to every helper below. */
4368 	fpl.status = CACHE_FPL_STATUS_UNSET;
4369 	fpl.ndp = ndp;
4370 	fpl.cnp = &ndp->ni_cnd;
4371 	MPASS(curthread == fpl.cnp->cn_thread);
4372 
	/* SAVESTART is only expected together with a non-LOOKUP operation. */
4373 	if ((fpl.cnp->cn_flags & SAVESTART) != 0)
4374 		MPASS(fpl.cnp->cn_nameiop != LOOKUP);
4375 
	/*
	 * Requests rejected here return before cache_fpl_checkpoint() and
	 * cache_fpl_smr_enter_initial() are called; the status stored in fpl
	 * at that point is still reported to the caller.
	 */
4376 	if (!cache_can_fplookup(&fpl)) {
4377 		SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
4378 		*status = fpl.status;
4379 		return (EOPNOTSUPP);
4380 	}
4381 
	/* 'orig' is handed back to cache_fpl_restore() if the lookup aborts. */
4382 	cache_fpl_checkpoint(&fpl, &orig);
4383 
	/*
	 * The root and top directories recorded in ndp come from the pwd
	 * obtained here, after cache_fpl_smr_enter_initial().
	 */
4384 	cache_fpl_smr_enter_initial(&fpl);
4385 	pwd = pwd_get_smr();
4386 	fpl.pwd = pwd;
4387 	ndp->ni_rootdir = pwd->pwd_rdir;
4388 	ndp->ni_topdir = pwd->pwd_jdir;
4389 
	/*
	 * Pick the starting vnode: a path with a leading '/' is delegated to
	 * cache_fpl_handle_root(), anything else starts at pwd->pwd_cdir and
	 * only AT_FDCWD is expected as the directory descriptor.
	 */
4390 	cnp = fpl.cnp;
4391 	cnp->cn_nameptr = cnp->cn_pnbuf;
4392 	if (cnp->cn_pnbuf[0] == '/') {
4393 		cache_fpl_handle_root(ndp, &dvp);
4394 	} else {
4395 		MPASS(ndp->ni_dirfd == AT_FDCWD);
4396 		dvp = pwd->pwd_cdir;
4397 	}
4398 
4399 	SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
4400 
	/*
	 * Run the lookup proper; cache_fpl_smr_assert_not_entered() is applied
	 * to its result no matter which status it finished with.
	 */
4401 	error = cache_fplookup_impl(dvp, &fpl);
4402 	cache_fpl_smr_assert_not_entered(&fpl);
4403 	SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
4404 
4405 	*status = fpl.status;
4406 	switch (fpl.status) {
4407 	case CACHE_FPL_STATUS_UNSET:
		/* cache_fplookup_impl() must always pick a final status. */
4408 		__assert_unreachable();
4409 		break;
4410 	case CACHE_FPL_STATUS_HANDLED:
		/* The resulting vnode is passed to the probe only on success. */
4411 		SDT_PROBE3(vfs, namei, lookup, return, error,
4412 		    (error == 0 ? ndp->ni_vp : NULL), true);
4413 		break;
4414 	case CACHE_FPL_STATUS_PARTIAL:
		/* This is the only status for which *pwdp gets written. */
4415 		*pwdp = fpl.pwd;
4416 		/*
4417 		 * Status restored by cache_fplookup_partial_setup.
4418 		 */
4419 		break;
4420 	case CACHE_FPL_STATUS_ABORTED:
		/* Hand the checkpoint taken before the lookup back for restore. */
4421 		cache_fpl_restore(&fpl, &orig);
4422 		break;
4423 	}
4424 	return (error);
4425 }
4426 
4426