xref: /freebsd/sys/kern/vfs_cache.c (revision 0a5eb308d3006e2f386620f7266b6593b300b987)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1989, 1993, 1995
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * Poul-Henning Kamp of the FreeBSD Project.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include "opt_ddb.h"
41 #include "opt_ktrace.h"
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/capsicum.h>
46 #include <sys/counter.h>
47 #include <sys/filedesc.h>
48 #include <sys/fnv_hash.h>
49 #include <sys/kernel.h>
50 #include <sys/ktr.h>
51 #include <sys/lock.h>
52 #include <sys/malloc.h>
53 #include <sys/fcntl.h>
54 #include <sys/jail.h>
55 #include <sys/mount.h>
56 #include <sys/namei.h>
57 #include <sys/proc.h>
58 #include <sys/rwlock.h>
59 #include <sys/seqc.h>
60 #include <sys/sdt.h>
61 #include <sys/smr.h>
62 #include <sys/smp.h>
63 #include <sys/syscallsubr.h>
64 #include <sys/sysctl.h>
65 #include <sys/sysproto.h>
66 #include <sys/vnode.h>
67 #include <ck_queue.h>
68 #ifdef KTRACE
69 #include <sys/ktrace.h>
70 #endif
71 
72 #include <sys/capsicum.h>
73 
74 #include <security/audit/audit.h>
75 #include <security/mac/mac_framework.h>
76 
77 #ifdef DDB
78 #include <ddb/ddb.h>
79 #endif
80 
81 #include <vm/uma.h>
82 
/*
 * DTrace SDT probes used by the name cache.
 *
 * All probes belong to the "vfs" provider.  The digit in SDT_PROBE_DEFINEn is
 * the number of probe arguments and the trailing string literals name the
 * argument types, in order.
 */
SDT_PROVIDER_DECLARE(vfs);
SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
    "const char *");
SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
    "struct namecache *", "int", "int");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
    "char *", "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
    "struct vnode *", "char *");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
    "struct vnode *", "char *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
    "struct componentname *");
SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
    "struct componentname *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
    "char *");

/*
 * The vfs:fplookup:lookup:done probe is defined here, while the
 * vfs:namei:lookup probes are only declared (their definition is not in this
 * part of the file).
 *
 * NOTE(review): the first fplookup argument type is spelled
 * "struct nameidata" without a '*' -- confirm against the probe call sites
 * whether a pointer is what actually gets passed.
 */
SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
SDT_PROBE_DECLARE(vfs, namei, lookup, return);
121 
/*
 * State kept for a negative entry.
 *
 * It lives in a union with the target vnode pointer (n_un in struct
 * namecache), which is why the assertion below insists that it is no bigger
 * than a pointer.
 */
struct negstate {
	u_char neg_flag;	/* NEG_* flags */
};
_Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
    "the state must fit in a union with a pointer without growing it");

/*
 * This structure describes the elements in the cache of recent
 * names looked up by namei.
 *
 * The name itself is stored inline, right behind the fixed-size part.
 */
struct	namecache {
	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list, or negative list linkage */
	CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
	struct	vnode *nc_dvp;		/* vnode of parent of name */
	union {
		struct	vnode *nu_vp;	/* vnode the name refers to */
		struct	negstate nu_neg;/* negative entry state */
	} n_un;
	u_char	nc_flag;		/* flag bits (NCF_*) */
	u_char	nc_nlen;		/* length of name */
	char	nc_name[0];		/* segment name + nul */
};
145 
/*
 * struct namecache_ts is used in place of struct namecache when time(s) need
 * to be stored.  It embeds a complete struct namecache as its last member
 * (nc_nc), with the timestamp fields placed in front of it; code holding a
 * pointer to the embedded entry gets back to the container with
 * __containerof().
 *
 * The nc_dotdottime field is used when a cache entry is mapping
 * both a non-dotdot directory name plus dotdot for the directory's
 * parent.
 *
 * See below for alignment requirement.
 */
struct	namecache_ts {
	struct	timespec nc_time;	/* timespec provided by fs */
	struct	timespec nc_dotdottime;	/* dotdot timespec provided by fs */
	int	nc_ticks;		/* ticks value when entry was added */
	struct namecache nc_nc;		/* the entry proper; must stay last (name follows it) */
};
162 
/*
 * At least mips n32 performs 64-bit accesses to timespec as found
 * in namecache_ts and requires them to be aligned. Since other platforms
 * may have the same requirement, accept the small cost and enforce the
 * alignment for everyone. Note this is a nop for 64-bit platforms.
 */
#define CACHE_ZONE_ALIGNMENT	UMA_ALIGNOF(time_t)
/* Longest name (not counting the nul) which is still served by the small zones. */
#define	CACHE_PATH_CUTOFF	39

/* Item sizes: the fixed-size part plus room for the name and its nul. */
#define CACHE_ZONE_SMALL_SIZE		(sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1)
#define CACHE_ZONE_SMALL_TS_SIZE	(sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1)
#define CACHE_ZONE_LARGE_SIZE		(sizeof(struct namecache) + NAME_MAX + 1)
#define CACHE_ZONE_LARGE_TS_SIZE	(sizeof(struct namecache_ts) + NAME_MAX + 1)

/*
 * Every item size has to be a multiple of the enforced alignment.
 *
 * NOTE(review): presumably UMA_ALIGNOF() yields an alignment mask
 * (alignment - 1), hence the "+ 1" -- confirm against the UMA headers.
 */
_Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
181 
/* Shorthands for the members of the n_un union in struct namecache. */
#define	nc_vp		n_un.nu_vp
#define	nc_neg		n_un.nu_neg

/*
 * Flags in namecache.nc_flag
 *
 * NCF_TS	the entry is embedded in a struct namecache_ts
 * NCF_DVDROP	cache_free() calls vdrop() on nc_dvp
 * NCF_NEGATIVE	negative entry; n_un holds the negstate instead of a vnode
 * NCF_INVALID	set by cache_ncp_invalidate() before the entry is torn down
 * NCF_WIP	entries with this flag are rejected by cache_ncp_canuse()
 *
 * NOTE(review): NCF_ISDOTDOT presumably marks ".." entries, NCF_WHITE
 * whiteouts and NCF_DTS a valid nc_dotdottime; their setters are not in this
 * part of the file -- confirm.
 */
#define NCF_WHITE	0x01
#define NCF_ISDOTDOT	0x02
#define	NCF_TS		0x04
#define	NCF_DTS		0x08
#define	NCF_DVDROP	0x10
#define	NCF_NEGATIVE	0x20
#define	NCF_INVALID	0x40
#define	NCF_WIP		0x80

/*
 * Flags in negstate.neg_flag
 *
 * NEG_HOT	the entry is linked on the hot list (ncneg_hot) rather than on
 *		one of the cold lists
 */
#define NEG_HOT		0x01
201 
202 /*
203  * Mark an entry as invalid.
204  *
205  * This is called before it starts getting deconstructed.
206  */
207 static void
208 cache_ncp_invalidate(struct namecache *ncp)
209 {
210 
211 	KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
212 	    ("%s: entry %p already invalid", __func__, ncp));
213 	atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
214 	atomic_thread_fence_rel();
215 }
216 
217 /*
218  * Check whether the entry can be safely used.
219  *
220  * All places which elide locks are supposed to call this after they are
221  * done with reading from an entry.
222  */
223 static bool
224 cache_ncp_canuse(struct namecache *ncp)
225 {
226 
227 	atomic_thread_fence_acq();
228 	return ((atomic_load_char(&ncp->nc_flag) & (NCF_INVALID | NCF_WIP)) == 0);
229 }
230 
231 /*
232  * Name caching works as follows:
233  *
234  * Names found by directory scans are retained in a cache
235  * for future reference.  It is managed LRU, so frequently
236  * used names will hang around.  Cache is indexed by hash value
237  * obtained from (dvp, name) where dvp refers to the directory
238  * containing name.
239  *
 * If it is a "negative" entry (i.e. for a name that is known NOT to
 * exist), it is flagged with NCF_NEGATIVE and refers to no vnode; the
 * storage of the vnode pointer is reused for the negative entry state.
242  *
243  * Upon reaching the last segment of a path, if the reference
244  * is for DELETE, or NOCACHE is set (rewrite), and the
245  * name is located in the cache, it will be dropped.
246  *
247  * These locks are used (in the order in which they can be taken):
248  * NAME		TYPE	ROLE
249  * vnodelock	mtx	vnode lists and v_cache_dd field protection
250  * bucketlock	rwlock	for access to given set of hash buckets
251  * neglist	mtx	negative entry LRU management
252  *
253  * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
254  * shrinking the LRU list.
255  *
256  * It is legal to take multiple vnodelock and bucketlock locks. The locking
257  * order is lower address first. Both are recursive.
258  *
259  * "." lookups are lockless.
260  *
261  * ".." and vnode -> name lookups require vnodelock.
262  *
263  * name -> vnode lookup requires the relevant bucketlock to be held for reading.
264  *
265  * Insertions and removals of entries require involved vnodes and bucketlocks
266  * to be write-locked to prevent other threads from seeing the entry.
267  *
268  * Some lookups result in removal of the found entry (e.g. getting rid of a
269  * negative entry with the intent to create a positive one), which poses a
270  * problem when multiple threads reach the state. Similarly, two different
271  * threads can purge two different vnodes and try to remove the same name.
272  *
273  * If the already held vnode lock is lower than the second required lock, we
274  * can just take the other lock. However, in the opposite case, this could
275  * deadlock. As such, this is resolved by trylocking and if that fails unlocking
276  * the first node, locking everything in order and revalidating the state.
277  */
278 
VFS_SMR_DECLARE;

/*
 * Structures associated with name caching.
 */
/* Map a hash value to its chain head; nchash is used as a bit mask. */
#define NCHHASH(hash) \
	(&nchashtbl[(hash) & nchash])
static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
static u_long __read_mostly	nchash;			/* hash table mask (highest valid index) */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
    "Size of namecache hash table");
static u_long __read_mostly	ncnegfactor = 5; /* ratio of negative entries */
SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of negative namecache entries");
static u_long __exclusive_cache_line	numneg;	/* number of negative entries allocated */
static u_long __exclusive_cache_line	numcache;/* number of cache entries allocated */
/* NOTE(review): not static, unlike its neighbours -- presumably referenced from other files. */
u_int ncsizefactor = 2;
SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
    "Size factor for namecache");
static u_int __read_mostly	ncpurgeminvnodes;
SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0,
    "Number of vnodes below which purgevfs ignores the request");
static u_int __read_mostly	ncsize; /* the size as computed on creation or resizing */

struct nchstats	nchstats;		/* cache effectiveness statistics */
304 
/*
 * Run-time tunable exported as vfs.cache_fast_revlookup.
 * NOTE(review): the sysctl has an empty description; judging by the name it
 * toggles a fast reverse (vnode -> name) lookup path -- confirm and document.
 */
static bool __read_frequently cache_fast_revlookup = true;
SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_revlookup, CTLFLAG_RW,
    &cache_fast_revlookup, 0, "");

/* Held by the one thread which is shrinking the negative lists. */
static struct mtx __exclusive_cache_line	ncneg_shrink_lock;

/*
 * A list of negative entries along with the mutex protecting it.  Entries are
 * linked through their nc_dst field.
 */
struct neglist {
	struct mtx		nl_lock;
	TAILQ_HEAD(, namecache) nl_list;
} __aligned(CACHE_LINE_SIZE);

static struct neglist __read_mostly	*neglists;	/* cold lists, numneglists of them */
static struct neglist ncneg_hot;			/* the hot list */
static u_long numhotneg;				/* number of entries on the hot list */
319 
320 #define ncneghash	3
321 #define	numneglists	(ncneghash + 1)
322 static inline struct neglist *
323 NCP2NEGLIST(struct namecache *ncp)
324 {
325 
326 	return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
327 }
328 
329 static inline struct negstate *
330 NCP2NEGSTATE(struct namecache *ncp)
331 {
332 
333 	MPASS(ncp->nc_flag & NCF_NEGATIVE);
334 	return (&ncp->nc_neg);
335 }
336 
/*
 * Bucket locks.  ncbuckethash is the index mask, so there are
 * ncbuckethash + 1 locks; HASH2BUCKETLOCK() picks the lock covering a given
 * hash value.
 */
#define	numbucketlocks (ncbuckethash + 1)
static u_int __read_mostly  ncbuckethash;
static struct rwlock_padalign __read_mostly  *bucketlocks;
#define	HASH2BUCKETLOCK(hash) \
	((struct rwlock *)(&bucketlocks[((hash) & ncbuckethash)]))

/*
 * Vnode locks, organized the same way: ncvnodehash is the index mask into
 * the vnodelocks array.
 */
#define	numvnodelocks (ncvnodehash + 1)
static u_int __read_mostly  ncvnodehash;
static struct mtx __read_mostly *vnodelocks;
346 static inline struct mtx *
347 VP2VNODELOCK(struct vnode *vp)
348 {
349 
350 	return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
351 }
352 
/*
 * UMA zones for the VFS cache.
 *
 * The small cache is used for entries with short names, which are the
 * most common.  The large cache is used for entries which are too big to
 * fit in the small cache.
 *
 * Each of the two comes in a second flavor (_ts) handing out
 * struct namecache_ts items for entries which carry timestamps.
 */
static uma_zone_t __read_mostly cache_zone_small;
static uma_zone_t __read_mostly cache_zone_small_ts;
static uma_zone_t __read_mostly cache_zone_large;
static uma_zone_t __read_mostly cache_zone_large_ts;
364 
365 static struct namecache *
366 cache_alloc(int len, int ts)
367 {
368 	struct namecache_ts *ncp_ts;
369 	struct namecache *ncp;
370 
371 	if (__predict_false(ts)) {
372 		if (len <= CACHE_PATH_CUTOFF)
373 			ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
374 		else
375 			ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
376 		ncp = &ncp_ts->nc_nc;
377 	} else {
378 		if (len <= CACHE_PATH_CUTOFF)
379 			ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
380 		else
381 			ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
382 	}
383 	return (ncp);
384 }
385 
386 static void
387 cache_free(struct namecache *ncp)
388 {
389 	struct namecache_ts *ncp_ts;
390 
391 	if (ncp == NULL)
392 		return;
393 	if ((ncp->nc_flag & NCF_DVDROP) != 0)
394 		vdrop(ncp->nc_dvp);
395 	if (__predict_false(ncp->nc_flag & NCF_TS)) {
396 		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
397 		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
398 			uma_zfree_smr(cache_zone_small_ts, ncp_ts);
399 		else
400 			uma_zfree_smr(cache_zone_large_ts, ncp_ts);
401 	} else {
402 		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
403 			uma_zfree_smr(cache_zone_small, ncp);
404 		else
405 			uma_zfree_smr(cache_zone_large, ncp);
406 	}
407 }
408 
409 static void
410 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
411 {
412 	struct namecache_ts *ncp_ts;
413 
414 	KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
415 	    (tsp == NULL && ticksp == NULL),
416 	    ("No NCF_TS"));
417 
418 	if (tsp == NULL && ticksp == NULL)
419 		return;
420 
421 	ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
422 	if (tsp != NULL)
423 		*tsp = ncp_ts->nc_time;
424 	if (ticksp != NULL)
425 		*ticksp = ncp_ts->nc_ticks;
426 }
427 
/*
 * With DEBUG_CACHE compiled in, the debug.vfscache sysctl exposes a knob
 * (doingcache) for enabling and disabling the cache at run time.
 */
#ifdef DEBUG_CACHE
static int __read_mostly	doingcache = 1;	/* 1 => enable the cache */
SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
    "VFS namecache enabled");
#endif

/* Export size information to userland */
SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
    sizeof(struct namecache), "sizeof(struct namecache)");
437 
/*
 * The new name cache statistics
 *
 * Everything below is exported read-only under the vfs.cache sysctl node.
 * STATNODE_ULONG() exports a variable which already exists, while
 * STATNODE_COUNTER() also defines the counter(9) counter it exports.
 */
static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache statistics");
#define STATNODE_ULONG(name, descr)					\
	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr);
#define STATNODE_COUNTER(name, descr)					\
	static COUNTER_U64_DEFINE_EARLY(name);				\
	SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, \
	    descr);
STATNODE_ULONG(numneg, "Number of negative cache entries");
STATNODE_ULONG(numcache, "Number of cache entries");
STATNODE_COUNTER(numcachehv, "Number of namecache entries with vnodes held");
STATNODE_COUNTER(numdrops, "Number of dropped entries due to reaching the limit");
STATNODE_COUNTER(dothits, "Number of '.' hits");
STATNODE_COUNTER(dotdothits, "Number of '..' hits");
STATNODE_COUNTER(nummiss, "Number of cache misses");
STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache");
STATNODE_COUNTER(numposzaps,
    "Number of cache hits (positive) we do not want to cache");
STATNODE_COUNTER(numposhits, "Number of cache hits (positive)");
STATNODE_COUNTER(numnegzaps,
    "Number of cache hits (negative) we do not want to cache");
STATNODE_COUNTER(numneghits, "Number of cache hits (negative)");
/* These count for vn_getcwd(), too. */
STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls");
STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
STATNODE_COUNTER(numfullpathfail2,
    "Number of fullpath search errors (VOP_VPTOCNP failures)");
/* There is no numfullpathfail3; the numbering goes from 2 straight to 4. */
STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls");
STATNODE_COUNTER(zap_and_exit_bucket_relock_success,
    "Number of successful removals after relocking");
/*
 * NOTE(review): the three counters below are declared as (signed) long yet
 * exported with STATNODE_ULONG, and the two zap_and_exit_bucket_fail*
 * nodes share one and the same description -- worth tidying up once the
 * difference between them is confirmed at their update sites.
 */
static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail,
    "Number of times zap_and_exit failed to lock");
static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2,
    "Number of times zap_and_exit failed to lock");
static long cache_lock_vnodes_cel_3_failures;
STATNODE_ULONG(cache_lock_vnodes_cel_3_failures,
    "Number of times 3-way vnode locking failed");
STATNODE_ULONG(numhotneg, "Number of hot negative entries");
STATNODE_COUNTER(numneg_evicted,
    "Number of negative entries evicted when adding a new entry");
STATNODE_COUNTER(shrinking_skipped,
    "Number of times shrinking was already in progress");
484 
/* Forward declarations of routines which are used before they are defined. */
static void cache_zap_locked(struct namecache *ncp);
static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf,
    char **freebuf, size_t *buflen);
static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *buflen, bool slash_prefixed, size_t addend);
static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *buflen);
static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *len, bool slash_prefixed, size_t addend);

/* malloc(9) type "vfscache". */
static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
496 
497 static int cache_yield;
498 SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0,
499     "Number of times cache called yield");
500 
501 static void __noinline
502 cache_maybe_yield(void)
503 {
504 
505 	if (should_yield()) {
506 		cache_yield++;
507 		kern_yield(PRI_USER);
508 	}
509 }
510 
511 static inline void
512 cache_assert_vlp_locked(struct mtx *vlp)
513 {
514 
515 	if (vlp != NULL)
516 		mtx_assert(vlp, MA_OWNED);
517 }
518 
/*
 * Assert that the vnode lock covering vp is owned by the current thread.
 */
static inline void
cache_assert_vnode_locked(struct vnode *vp)
{

	cache_assert_vlp_locked(VP2VNODELOCK(vp));
}
527 
528 /*
529  * TODO: With the value stored we can do better than computing the hash based
530  * on the address. The choice of FNV should also be revisited.
531  */
532 static void
533 cache_prehash(struct vnode *vp)
534 {
535 
536 	vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
537 }
538 
539 static uint32_t
540 cache_get_hash(char *name, u_char len, struct vnode *dvp)
541 {
542 
543 	return (fnv_32_buf(name, len, dvp->v_nchash));
544 }
545 
546 static inline struct nchashhead *
547 NCP2BUCKET(struct namecache *ncp)
548 {
549 	uint32_t hash;
550 
551 	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
552 	return (NCHHASH(hash));
553 }
554 
555 static inline struct rwlock *
556 NCP2BUCKETLOCK(struct namecache *ncp)
557 {
558 	uint32_t hash;
559 
560 	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
561 	return (HASH2BUCKETLOCK(hash));
562 }
563 
#ifdef INVARIANTS
/*
 * Assert that the bucket lock covering the entry is held in the given mode
 * (an rw_assert() mode such as RA_WLOCKED).  Compiles to nothing without
 * INVARIANTS.
 */
static void
cache_assert_bucket_locked(struct namecache *ncp, int mode)
{

	rw_assert(NCP2BUCKETLOCK(ncp), mode);
}
#else
#define cache_assert_bucket_locked(x, y) do { } while (0)
#endif
576 
577 #define cache_sort_vnodes(x, y)	_cache_sort_vnodes((void **)(x), (void **)(y))
578 static void
579 _cache_sort_vnodes(void **p1, void **p2)
580 {
581 	void *tmp;
582 
583 	MPASS(*p1 != NULL || *p2 != NULL);
584 
585 	if (*p1 > *p2) {
586 		tmp = *p2;
587 		*p2 = *p1;
588 		*p1 = tmp;
589 	}
590 }
591 
592 static void
593 cache_lock_all_buckets(void)
594 {
595 	u_int i;
596 
597 	for (i = 0; i < numbucketlocks; i++)
598 		rw_wlock(&bucketlocks[i]);
599 }
600 
601 static void
602 cache_unlock_all_buckets(void)
603 {
604 	u_int i;
605 
606 	for (i = 0; i < numbucketlocks; i++)
607 		rw_wunlock(&bucketlocks[i]);
608 }
609 
610 static void
611 cache_lock_all_vnodes(void)
612 {
613 	u_int i;
614 
615 	for (i = 0; i < numvnodelocks; i++)
616 		mtx_lock(&vnodelocks[i]);
617 }
618 
619 static void
620 cache_unlock_all_vnodes(void)
621 {
622 	u_int i;
623 
624 	for (i = 0; i < numvnodelocks; i++)
625 		mtx_unlock(&vnodelocks[i]);
626 }
627 
628 static int
629 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
630 {
631 
632 	cache_sort_vnodes(&vlp1, &vlp2);
633 
634 	if (vlp1 != NULL) {
635 		if (!mtx_trylock(vlp1))
636 			return (EAGAIN);
637 	}
638 	if (!mtx_trylock(vlp2)) {
639 		if (vlp1 != NULL)
640 			mtx_unlock(vlp1);
641 		return (EAGAIN);
642 	}
643 
644 	return (0);
645 }
646 
647 static void
648 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
649 {
650 
651 	MPASS(vlp1 != NULL || vlp2 != NULL);
652 	MPASS(vlp1 <= vlp2);
653 
654 	if (vlp1 != NULL)
655 		mtx_lock(vlp1);
656 	if (vlp2 != NULL)
657 		mtx_lock(vlp2);
658 }
659 
660 static void
661 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
662 {
663 
664 	MPASS(vlp1 != NULL || vlp2 != NULL);
665 
666 	if (vlp1 != NULL)
667 		mtx_unlock(vlp1);
668 	if (vlp2 != NULL)
669 		mtx_unlock(vlp2);
670 }
671 
672 static int
673 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
674 {
675 	struct nchstats snap;
676 
677 	if (req->oldptr == NULL)
678 		return (SYSCTL_OUT(req, 0, sizeof(snap)));
679 
680 	snap = nchstats;
681 	snap.ncs_goodhits = counter_u64_fetch(numposhits);
682 	snap.ncs_neghits = counter_u64_fetch(numneghits);
683 	snap.ncs_badhits = counter_u64_fetch(numposzaps) +
684 	    counter_u64_fetch(numnegzaps);
685 	snap.ncs_miss = counter_u64_fetch(nummisszap) +
686 	    counter_u64_fetch(nummiss);
687 
688 	return (SYSCTL_OUT(req, &snap, sizeof(snap)));
689 }
690 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
691     CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
692     "VFS cache effectiveness statistics");
693 
694 #ifdef DIAGNOSTIC
/*
 * The debug.hashstat sysctl node.  The handlers hanging off of it report
 * on the name cache hash chain lengths, which they sample with all bucket
 * locks held.
 */
static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
    CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    "hash table stats");
701 
702 static int
703 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
704 {
705 	struct nchashhead *ncpp;
706 	struct namecache *ncp;
707 	int i, error, n_nchash, *cntbuf;
708 
709 retry:
710 	n_nchash = nchash + 1;	/* nchash is max index, not count */
711 	if (req->oldptr == NULL)
712 		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
713 	cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
714 	cache_lock_all_buckets();
715 	if (n_nchash != nchash + 1) {
716 		cache_unlock_all_buckets();
717 		free(cntbuf, M_TEMP);
718 		goto retry;
719 	}
720 	/* Scan hash tables counting entries */
721 	for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
722 		CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
723 			cntbuf[i]++;
724 	cache_unlock_all_buckets();
725 	for (error = 0, i = 0; i < n_nchash; i++)
726 		if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
727 			break;
728 	free(cntbuf, M_TEMP);
729 	return (error);
730 }
731 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
732     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
733     "nchash chain lengths");
734 
735 static int
736 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
737 {
738 	int error;
739 	struct nchashhead *ncpp;
740 	struct namecache *ncp;
741 	int n_nchash;
742 	int count, maxlength, used, pct;
743 
744 	if (!req->oldptr)
745 		return SYSCTL_OUT(req, 0, 4 * sizeof(int));
746 
747 	cache_lock_all_buckets();
748 	n_nchash = nchash + 1;	/* nchash is max index, not count */
749 	used = 0;
750 	maxlength = 0;
751 
752 	/* Scan hash tables for applicable entries */
753 	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
754 		count = 0;
755 		CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
756 			count++;
757 		}
758 		if (count)
759 			used++;
760 		if (maxlength < count)
761 			maxlength = count;
762 	}
763 	n_nchash = nchash + 1;
764 	cache_unlock_all_buckets();
765 	pct = (used * 100) / (n_nchash / 100);
766 	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
767 	if (error)
768 		return (error);
769 	error = SYSCTL_OUT(req, &used, sizeof(used));
770 	if (error)
771 		return (error);
772 	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
773 	if (error)
774 		return (error);
775 	error = SYSCTL_OUT(req, &pct, sizeof(pct));
776 	if (error)
777 		return (error);
778 	return (0);
779 }
780 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
781     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
782     "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
783 #endif
784 
785 /*
786  * Negative entries management
787  *
788  * A variation of LRU scheme is used. New entries are hashed into one of
789  * numneglists cold lists. Entries get promoted to the hot list on first hit.
790  *
791  * The shrinker will demote hot list head and evict from the cold list in a
792  * round-robin manner.
793  */
794 static void
795 cache_negative_init(struct namecache *ncp)
796 {
797 	struct negstate *negstate;
798 
799 	ncp->nc_flag |= NCF_NEGATIVE;
800 	negstate = NCP2NEGSTATE(ncp);
801 	negstate->neg_flag = 0;
802 }
803 
804 static void
805 cache_negative_hit(struct namecache *ncp)
806 {
807 	struct neglist *neglist;
808 	struct negstate *negstate;
809 
810 	negstate = NCP2NEGSTATE(ncp);
811 	if ((negstate->neg_flag & NEG_HOT) != 0)
812 		return;
813 	neglist = NCP2NEGLIST(ncp);
814 	mtx_lock(&ncneg_hot.nl_lock);
815 	mtx_lock(&neglist->nl_lock);
816 	if ((negstate->neg_flag & NEG_HOT) == 0) {
817 		numhotneg++;
818 		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
819 		TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
820 		negstate->neg_flag |= NEG_HOT;
821 	}
822 	mtx_unlock(&neglist->nl_lock);
823 	mtx_unlock(&ncneg_hot.nl_lock);
824 }
825 
826 static void
827 cache_negative_insert(struct namecache *ncp)
828 {
829 	struct neglist *neglist;
830 
831 	MPASS(ncp->nc_flag & NCF_NEGATIVE);
832 	cache_assert_bucket_locked(ncp, RA_WLOCKED);
833 	neglist = NCP2NEGLIST(ncp);
834 	mtx_lock(&neglist->nl_lock);
835 	TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
836 	mtx_unlock(&neglist->nl_lock);
837 	atomic_add_rel_long(&numneg, 1);
838 }
839 
/*
 * Unlink a negative entry from whichever list it is on, hot or cold, and
 * drop it from the numneg count.  The entry's bucket has to be write-locked.
 *
 * Which list the entry is on is told by NEG_HOT, which can change while no
 * list lock is held.  The flag is therefore sampled, the matching lock is
 * taken and the flag is checked again, with the other lock picked up if the
 * entry has moved.  Whenever both locks are needed, the hot list lock is
 * taken before the cold list lock.
 */
static void
cache_negative_remove(struct namecache *ncp)
{
	struct neglist *neglist;
	struct negstate *negstate;
	bool hot_locked = false;
	bool list_locked = false;

	cache_assert_bucket_locked(ncp, RA_WLOCKED);
	neglist = NCP2NEGLIST(ncp);
	negstate = NCP2NEGSTATE(ncp);
	if ((negstate->neg_flag & NEG_HOT) != 0) {
		hot_locked = true;
		mtx_lock(&ncneg_hot.nl_lock);
		/*
		 * The entry may have been demoted before we got the lock, in
		 * which case it sits on its cold list now.
		 */
		if ((negstate->neg_flag & NEG_HOT) == 0) {
			list_locked = true;
			mtx_lock(&neglist->nl_lock);
		}
	} else {
		list_locked = true;
		mtx_lock(&neglist->nl_lock);
		/*
		 * We may be racing against promotion in lockless lookup.
		 */
		if ((negstate->neg_flag & NEG_HOT) != 0) {
			/* Drop and retake the cold lock to respect the lock order. */
			mtx_unlock(&neglist->nl_lock);
			hot_locked = true;
			mtx_lock(&ncneg_hot.nl_lock);
			mtx_lock(&neglist->nl_lock);
		}
	}
	/* The lock of the list which the flag points at is held by now. */
	if ((negstate->neg_flag & NEG_HOT) != 0) {
		mtx_assert(&ncneg_hot.nl_lock, MA_OWNED);
		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
		numhotneg--;
	} else {
		mtx_assert(&neglist->nl_lock, MA_OWNED);
		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
	}
	if (list_locked)
		mtx_unlock(&neglist->nl_lock);
	if (hot_locked)
		mtx_unlock(&ncneg_hot.nl_lock);
	atomic_subtract_rel_long(&numneg, 1);
}
885 
886 static void
887 cache_negative_shrink_select(struct namecache **ncpp,
888     struct neglist **neglistpp)
889 {
890 	struct neglist *neglist;
891 	struct namecache *ncp;
892 	static u_int cycle;
893 	u_int i;
894 
895 	*ncpp = ncp = NULL;
896 
897 	for (i = 0; i < numneglists; i++) {
898 		neglist = &neglists[(cycle + i) % numneglists];
899 		if (TAILQ_FIRST(&neglist->nl_list) == NULL)
900 			continue;
901 		mtx_lock(&neglist->nl_lock);
902 		ncp = TAILQ_FIRST(&neglist->nl_list);
903 		if (ncp != NULL)
904 			break;
905 		mtx_unlock(&neglist->nl_lock);
906 	}
907 
908 	*neglistpp = neglist;
909 	*ncpp = ncp;
910 	cycle++;
911 }
912 
/*
 * Evict one negative entry.
 *
 * This is done in two steps.  First the head of the hot list, if there is
 * one, is demoted to the tail of its cold list.  Then a cold list head is
 * picked by cache_negative_shrink_select() and zapped.
 *
 * Only one thread at a time does this; others finding the shrink lock taken
 * bump shrinking_skipped and return.
 */
static void
cache_negative_zap_one(void)
{
	struct namecache *ncp, *ncp2;
	struct neglist *neglist;
	struct negstate *negstate;
	struct mtx *dvlp;
	struct rwlock *blp;

	/* Peek at the owner first so the trylock is not even attempted if held. */
	if (mtx_owner(&ncneg_shrink_lock) != NULL ||
	    !mtx_trylock(&ncneg_shrink_lock)) {
		counter_u64_add(shrinking_skipped, 1);
		return;
	}

	/* Step one: demote the head of the hot list. */
	mtx_lock(&ncneg_hot.nl_lock);
	ncp = TAILQ_FIRST(&ncneg_hot.nl_list);
	if (ncp != NULL) {
		neglist = NCP2NEGLIST(ncp);
		negstate = NCP2NEGSTATE(ncp);
		mtx_lock(&neglist->nl_lock);
		MPASS((negstate->neg_flag & NEG_HOT) != 0);
		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
		TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
		negstate->neg_flag &= ~NEG_HOT;
		numhotneg--;
		mtx_unlock(&neglist->nl_lock);
	}
	mtx_unlock(&ncneg_hot.nl_lock);

	/*
	 * Step two: pick the victim.  If one is found, its list comes back
	 * locked.
	 */
	cache_negative_shrink_select(&ncp, &neglist);

	mtx_unlock(&ncneg_shrink_lock);
	if (ncp == NULL)
		return;

	/*
	 * Work out which vnode lock and bucket lock the victim needs while
	 * the list lock still keeps it in place, then let go of the list lock
	 * and take the other two.
	 */
	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	blp = NCP2BUCKETLOCK(ncp);
	mtx_unlock(&neglist->nl_lock);
	mtx_lock(dvlp);
	rw_wlock(blp);
	/*
	 * Enter SMR to safely check the negative list.
	 * Even if the found pointer matches, the entry may now be reallocated
	 * and used by a different vnode.
	 */
	vfs_smr_enter();
	ncp2 = TAILQ_FIRST(&neglist->nl_list);
	/*
	 * The victim is only zapped if it is still the head of the list and
	 * still maps to the very locks taken above.
	 */
	if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) ||
	    blp != NCP2BUCKETLOCK(ncp2)) {
		vfs_smr_exit();
		ncp = NULL;
	} else {
		vfs_smr_exit();
		SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp,
		    ncp->nc_name);
		cache_zap_locked(ncp);
		counter_u64_add(numneg_evicted, 1);
	}
	rw_wunlock(blp);
	mtx_unlock(dvlp);
	/* ncp is NULL if validation failed; cache_free() ignores NULL. */
	cache_free(ncp);
}
977 
978 /*
979  * cache_zap_locked():
980  *
981  *   Removes a namecache entry from cache, whether it contains an actual
982  *   pointer to a vnode or if it is just a negative cache entry.
 *
 *   The caller must hold the vnode lock of ncp->nc_dvp, the vnode lock of
 *   ncp->nc_vp for a positive entry, and the entry's bucket lock for
 *   writing; all three are asserted below.  None of them is released here.
 *
 *   The entry is only unlinked, not freed: callers in this file follow up
 *   with cache_free() after dropping their locks.
983  */
984 static void
985 cache_zap_locked(struct namecache *ncp)
986 {
987 	struct nchashhead *ncpp;
988 
989 	if (!(ncp->nc_flag & NCF_NEGATIVE))
990 		cache_assert_vnode_locked(ncp->nc_vp);
991 	cache_assert_vnode_locked(ncp->nc_dvp);
992 	cache_assert_bucket_locked(ncp, RA_WLOCKED);
993 
994 	CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp,
995 	    (ncp->nc_flag & NCF_NEGATIVE) ? NULL : ncp->nc_vp);
996 
	/* Invalidate first, then take the entry off its hash chain. */
997 	cache_ncp_invalidate(ncp);
998 
999 	ncpp = NCP2BUCKET(ncp);
1000 	CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
1001 	if (!(ncp->nc_flag & NCF_NEGATIVE)) {
1002 		SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
1003 		    ncp->nc_name, ncp->nc_vp);
1004 		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
		/* The target vnode may be using this entry as its v_cache_dd. */
1005 		if (ncp == ncp->nc_vp->v_cache_dd) {
1006 			vn_seqc_write_begin_unheld(ncp->nc_vp);
1007 			ncp->nc_vp->v_cache_dd = NULL;
1008 			vn_seqc_write_end(ncp->nc_vp);
1009 		}
1010 	} else {
1011 		SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
1012 		    ncp->nc_name);
1013 		cache_negative_remove(ncp);
1014 	}
1015 	if (ncp->nc_flag & NCF_ISDOTDOT) {
		/* ".." entries are not on v_cache_src; only v_cache_dd refers to them. */
1016 		if (ncp == ncp->nc_dvp->v_cache_dd) {
1017 			vn_seqc_write_begin_unheld(ncp->nc_dvp);
1018 			ncp->nc_dvp->v_cache_dd = NULL;
1019 			vn_seqc_write_end(ncp->nc_dvp);
1020 		}
1021 	} else {
1022 		LIST_REMOVE(ncp, nc_src);
		/*
		 * Last entry sourced from this directory is gone.
		 * NOTE(review): NCF_DVDROP presumably tells cache_free() to
		 * drop the hold taken in cache_enter_time() -- confirm.
		 */
1023 		if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
1024 			ncp->nc_flag |= NCF_DVDROP;
1025 			counter_u64_add(numcachehv, -1);
1026 		}
1027 	}
1028 	atomic_subtract_rel_long(&numcache, 1);
1029 }
1030 
1031 static void
1032 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
1033 {
1034 	struct rwlock *blp;
1035 
1036 	MPASS(ncp->nc_dvp == vp);
1037 	MPASS(ncp->nc_flag & NCF_NEGATIVE);
1038 	cache_assert_vnode_locked(vp);
1039 
1040 	blp = NCP2BUCKETLOCK(ncp);
1041 	rw_wlock(blp);
1042 	cache_zap_locked(ncp);
1043 	rw_wunlock(blp);
1044 }
1045 
/*
 * Zap ncp while the vnode lock of vp is held and stays held.
 *
 * vp must be either end of the entry (asserted).  *vlpp is an optional
 * second vnode lock the caller already holds from an earlier attempt.
 *
 * Returns true once the entry was zapped; *vlpp is NULL afterwards and only
 * vp's lock is still held.
 *
 * Returns false when the other vnode lock sorts before vp's lock and could
 * not be trylocked.  In that case both locks were re-taken in sorted order,
 * *vlpp names the extra one, and the caller has to re-check the entry and
 * call again.
 */
1046 static bool
1047 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
1048     struct mtx **vlpp)
1049 {
1050 	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
1051 	struct rwlock *blp;
1052 
1053 	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
1054 	cache_assert_vnode_locked(vp);
1055 
1056 	if (ncp->nc_flag & NCF_NEGATIVE) {
		/* Negative entries need no second vnode lock. */
1057 		if (*vlpp != NULL) {
1058 			mtx_unlock(*vlpp);
1059 			*vlpp = NULL;
1060 		}
1061 		cache_zap_negative_locked_vnode_kl(ncp, vp);
1062 		return (true);
1063 	}
1064 
1065 	pvlp = VP2VNODELOCK(vp);
1066 	blp = NCP2BUCKETLOCK(ncp);
1067 	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
1068 	vlp2 = VP2VNODELOCK(ncp->nc_vp);
1069 
1070 	if (*vlpp == vlp1 || *vlpp == vlp2) {
		/* The lock carried over by the caller is one we need. */
1071 		to_unlock = *vlpp;
1072 		*vlpp = NULL;
1073 	} else {
1074 		if (*vlpp != NULL) {
1075 			mtx_unlock(*vlpp);
1076 			*vlpp = NULL;
1077 		}
1078 		cache_sort_vnodes(&vlp1, &vlp2);
1079 		if (vlp1 == pvlp) {
			/* Held lock sorts first: blocking on the other is in order. */
1080 			mtx_lock(vlp2);
1081 			to_unlock = vlp2;
1082 		} else {
			/* Held lock sorts second: only a trylock is allowed. */
1083 			if (!mtx_trylock(vlp1))
1084 				goto out_relock;
1085 			to_unlock = vlp1;
1086 		}
1087 	}
1088 	rw_wlock(blp);
1089 	cache_zap_locked(ncp);
1090 	rw_wunlock(blp);
1091 	if (to_unlock != NULL)
1092 		mtx_unlock(to_unlock);
1093 	return (true);
1094 
1095 out_relock:
	/*
	 * Reached only with vlp1 != pvlp after sorting, i.e. vlp2 is the
	 * lock held on entry.  Drop it and take both in sorted order.
	 */
1096 	mtx_unlock(vlp2);
1097 	mtx_lock(vlp1);
1098 	mtx_lock(vlp2);
1099 	MPASS(*vlpp == NULL);
1100 	*vlpp = vlp1;
1101 	return (false);
1102 }
1103 
1104 static int __noinline
1105 cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp)
1106 {
1107 	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
1108 	struct rwlock *blp;
1109 	int error = 0;
1110 
1111 	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
1112 	cache_assert_vnode_locked(vp);
1113 
1114 	pvlp = VP2VNODELOCK(vp);
1115 	if (ncp->nc_flag & NCF_NEGATIVE) {
1116 		cache_zap_negative_locked_vnode_kl(ncp, vp);
1117 		goto out;
1118 	}
1119 
1120 	blp = NCP2BUCKETLOCK(ncp);
1121 	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
1122 	vlp2 = VP2VNODELOCK(ncp->nc_vp);
1123 	cache_sort_vnodes(&vlp1, &vlp2);
1124 	if (vlp1 == pvlp) {
1125 		mtx_lock(vlp2);
1126 		to_unlock = vlp2;
1127 	} else {
1128 		if (!mtx_trylock(vlp1)) {
1129 			error = EAGAIN;
1130 			goto out;
1131 		}
1132 		to_unlock = vlp1;
1133 	}
1134 	rw_wlock(blp);
1135 	cache_zap_locked(ncp);
1136 	rw_wunlock(blp);
1137 	mtx_unlock(to_unlock);
1138 out:
1139 	mtx_unlock(pvlp);
1140 	return (error);
1141 }
1142 
1143 /*
1144  * If trylocking failed we can get here. We know enough to take all needed locks
1145  * in the right order and re-lookup the entry.
1146  */
1147 static int
1148 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1149     struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
1150     struct rwlock *blp)
1151 {
1152 	struct namecache *rncp;
1153 
1154 	cache_assert_bucket_locked(ncp, RA_UNLOCKED);
1155 
1156 	cache_sort_vnodes(&dvlp, &vlp);
1157 	cache_lock_vnodes(dvlp, vlp);
1158 	rw_wlock(blp);
1159 	CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
1160 		if (rncp == ncp && rncp->nc_dvp == dvp &&
1161 		    rncp->nc_nlen == cnp->cn_namelen &&
1162 		    !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
1163 			break;
1164 	}
1165 	if (rncp != NULL) {
1166 		cache_zap_locked(rncp);
1167 		rw_wunlock(blp);
1168 		cache_unlock_vnodes(dvlp, vlp);
1169 		counter_u64_add(zap_and_exit_bucket_relock_success, 1);
1170 		return (0);
1171 	}
1172 
1173 	rw_wunlock(blp);
1174 	cache_unlock_vnodes(dvlp, vlp);
1175 	return (EAGAIN);
1176 }
1177 
1178 static int __noinline
1179 cache_zap_wlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1180     uint32_t hash, struct rwlock *blp)
1181 {
1182 	struct mtx *dvlp, *vlp;
1183 	struct vnode *dvp;
1184 
1185 	cache_assert_bucket_locked(ncp, RA_WLOCKED);
1186 
1187 	dvlp = VP2VNODELOCK(ncp->nc_dvp);
1188 	vlp = NULL;
1189 	if (!(ncp->nc_flag & NCF_NEGATIVE))
1190 		vlp = VP2VNODELOCK(ncp->nc_vp);
1191 	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1192 		cache_zap_locked(ncp);
1193 		rw_wunlock(blp);
1194 		cache_unlock_vnodes(dvlp, vlp);
1195 		return (0);
1196 	}
1197 
1198 	dvp = ncp->nc_dvp;
1199 	rw_wunlock(blp);
1200 	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1201 }
1202 
1203 static int __noinline
1204 cache_zap_rlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1205     uint32_t hash, struct rwlock *blp)
1206 {
1207 	struct mtx *dvlp, *vlp;
1208 	struct vnode *dvp;
1209 
1210 	cache_assert_bucket_locked(ncp, RA_RLOCKED);
1211 
1212 	dvlp = VP2VNODELOCK(ncp->nc_dvp);
1213 	vlp = NULL;
1214 	if (!(ncp->nc_flag & NCF_NEGATIVE))
1215 		vlp = VP2VNODELOCK(ncp->nc_vp);
1216 	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1217 		rw_runlock(blp);
1218 		rw_wlock(blp);
1219 		cache_zap_locked(ncp);
1220 		rw_wunlock(blp);
1221 		cache_unlock_vnodes(dvlp, vlp);
1222 		return (0);
1223 	}
1224 
1225 	dvp = ncp->nc_dvp;
1226 	rw_runlock(blp);
1227 	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1228 }
1229 
1230 static int
1231 cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp,
1232     struct mtx **vlpp1, struct mtx **vlpp2)
1233 {
1234 	struct mtx *dvlp, *vlp;
1235 
1236 	cache_assert_bucket_locked(ncp, RA_WLOCKED);
1237 
1238 	dvlp = VP2VNODELOCK(ncp->nc_dvp);
1239 	vlp = NULL;
1240 	if (!(ncp->nc_flag & NCF_NEGATIVE))
1241 		vlp = VP2VNODELOCK(ncp->nc_vp);
1242 	cache_sort_vnodes(&dvlp, &vlp);
1243 
1244 	if (*vlpp1 == dvlp && *vlpp2 == vlp) {
1245 		cache_zap_locked(ncp);
1246 		cache_unlock_vnodes(dvlp, vlp);
1247 		*vlpp1 = NULL;
1248 		*vlpp2 = NULL;
1249 		return (0);
1250 	}
1251 
1252 	if (*vlpp1 != NULL)
1253 		mtx_unlock(*vlpp1);
1254 	if (*vlpp2 != NULL)
1255 		mtx_unlock(*vlpp2);
1256 	*vlpp1 = NULL;
1257 	*vlpp2 = NULL;
1258 
1259 	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1260 		cache_zap_locked(ncp);
1261 		cache_unlock_vnodes(dvlp, vlp);
1262 		return (0);
1263 	}
1264 
1265 	rw_wunlock(blp);
1266 	*vlpp1 = dvlp;
1267 	*vlpp2 = vlp;
1268 	if (*vlpp1 != NULL)
1269 		mtx_lock(*vlpp1);
1270 	mtx_lock(*vlpp2);
1271 	rw_wlock(blp);
1272 	return (EAGAIN);
1273 }
1274 
1275 static void
1276 cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp)
1277 {
1278 
1279 	if (blp != NULL) {
1280 		rw_runlock(blp);
1281 	} else {
1282 		mtx_unlock(vlp);
1283 	}
1284 }
1285 
1286 static int __noinline
1287 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1288     struct timespec *tsp, int *ticksp)
1289 {
1290 	int ltype;
1291 
1292 	*vpp = dvp;
1293 	CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .",
1294 			dvp, cnp->cn_nameptr);
1295 	counter_u64_add(dothits, 1);
1296 	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
1297 	if (tsp != NULL)
1298 		timespecclear(tsp);
1299 	if (ticksp != NULL)
1300 		*ticksp = ticks;
1301 	vrefact(*vpp);
1302 	/*
1303 	 * When we lookup "." we still can be asked to lock it
1304 	 * differently...
1305 	 */
1306 	ltype = cnp->cn_lkflags & LK_TYPE_MASK;
1307 	if (ltype != VOP_ISLOCKED(*vpp)) {
1308 		if (ltype == LK_EXCLUSIVE) {
1309 			vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
1310 			if (VN_IS_DOOMED((*vpp))) {
1311 				/* forced unmount */
1312 				vrele(*vpp);
1313 				*vpp = NULL;
1314 				return (ENOENT);
1315 			}
1316 		} else
1317 			vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
1318 	}
1319 	return (-1);
1320 }
1321 
/*
 * Remove the cache entry for the name cnp in directory dvp, if there is one.
 *
 * ".." is resolved through dvp->v_cache_dd rather than the hash table.
 *
 * Returns 1 if something was removed (or, for "..", v_cache_dd was cleared)
 * and 0 if no matching entry existed.
 */
1322 static __noinline int
1323 cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
1324 {
1325 	struct namecache *ncp;
1326 	struct rwlock *blp;
1327 	struct mtx *dvlp, *dvlp2;
1328 	uint32_t hash;
1329 	int error;
1330 
1331 	if (cnp->cn_namelen == 2 &&
1332 	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
1333 		dvlp = VP2VNODELOCK(dvp);
		/* dvlp2: extra vnode lock carried between retries, if any. */
1334 		dvlp2 = NULL;
1335 		mtx_lock(dvlp);
1336 retry_dotdot:
1337 		ncp = dvp->v_cache_dd;
1338 		if (ncp == NULL) {
1339 			mtx_unlock(dvlp);
1340 			if (dvlp2 != NULL)
1341 				mtx_unlock(dvlp2);
1342 			SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1343 			return (0);
1344 		}
1345 		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
			/* A real ".." entry hanging off dvp: zap and free it. */
1346 			if (ncp->nc_dvp != dvp)
1347 				panic("dvp %p v_cache_dd %p\n", dvp, ncp);
			/* false: locks were re-taken, look at v_cache_dd again. */
1348 			if (!cache_zap_locked_vnode_kl2(ncp,
1349 			    dvp, &dvlp2))
1350 				goto retry_dotdot;
1351 			MPASS(dvp->v_cache_dd == NULL);
1352 			mtx_unlock(dvlp);
1353 			if (dvlp2 != NULL)
1354 				mtx_unlock(dvlp2);
1355 			cache_free(ncp);
1356 		} else {
			/*
			 * v_cache_dd points at a regular entry; only the
			 * pointer is cleared, the entry itself stays.
			 */
1357 			vn_seqc_write_begin(dvp);
1358 			dvp->v_cache_dd = NULL;
1359 			vn_seqc_write_end(dvp);
1360 			mtx_unlock(dvlp);
1361 			if (dvlp2 != NULL)
1362 				mtx_unlock(dvlp2);
1363 		}
1364 		SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1365 		return (1);
1366 	}
1367 
1368 	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1369 	blp = HASH2BUCKETLOCK(hash);
1370 retry:
	/* Unlocked peek: skip taking the lock for an empty chain. */
1371 	if (CK_SLIST_EMPTY(NCHHASH(hash)))
1372 		goto out_no_entry;
1373 
1374 	rw_wlock(blp);
1375 
1376 	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1377 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1378 		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1379 			break;
1380 	}
1381 
1382 	/* We failed to find an entry */
1383 	if (ncp == NULL) {
1384 		rw_wunlock(blp);
1385 		goto out_no_entry;
1386 	}
1387 
	/* Releases blp whether or not it succeeds. */
1388 	error = cache_zap_wlocked_bucket(ncp, cnp, hash, blp);
1389 	if (__predict_false(error != 0)) {
1390 		zap_and_exit_bucket_fail++;
1391 		cache_maybe_yield();
1392 		goto retry;
1393 	}
1394 	counter_u64_add(numposzaps, 1);
1395 	cache_free(ncp);
1396 	SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1397 	return (1);
1398 out_no_entry:
1399 	SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1400 	counter_u64_add(nummisszap, 1);
1401 	return (0);
1402 }
1403 
1404 /**
1405  * Lookup a name in the name cache
1406  *
1407  * # Arguments
1408  *
1409  * - dvp:	Parent directory in which to search.
1410  * - vpp:	Return argument.  Will contain desired vnode on cache hit.
1411  * - cnp:	Parameters of the name search.  The most interesting bits of
1412  *   		the cn_flags field have the following meanings:
1413  *   	- MAKEENTRY:	If clear, free an entry from the cache rather than look
1414  *   			it up.
1415  *   	- ISDOTDOT:	Must be set if and only if cn_nameptr == ".."
1416  * - tsp:	Return storage for cache timestamp.  On a successful (positive
1417  *   		or negative) lookup, tsp will be filled with any timespec that
1418  *   		was stored when this cache entry was created.  However, it will
1419  *   		be clear for "." entries.
1420  * - ticksp:	Return storage for alternate cache timestamp.  On a successful
1421  *   		(positive or negative) lookup, it will contain the ticks value
1422  *   		that was current when the cache entry was created, unless cnp
1423  *   		was ".", in which case it receives the current ticks value.
1424  *
1425  * # Returns
1426  *
1427  * - -1:	A positive cache hit.  vpp will contain the desired vnode.
1428  * - ENOENT:	A negative cache hit, or dvp was recycled out from under us due
1429  *		to a forced unmount.  vpp does not hold a usable vnode (some
 *		paths set it to NULL).  If the entry
1430  *		is a whiteout, then the ISWHITEOUT flag will be set in
1431  *		cnp->cn_flags.
1432  * - 0:		A cache miss, or the entry was removed rather than looked up
 *		(MAKEENTRY clear, or a negative entry found while cn_nameiop
 *		is CREATE).  vpp does not hold a usable vnode.
1433  *
1434  * # Locking
1435  *
1436  * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
1437  * .., dvp is unlocked while the target is being locked and is locked again
 * before returning.  If we're looking up . an extra ref is taken, but the
1438  * lock is not recursively acquired.
1439  */
1440 int
1441 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1442     struct timespec *tsp, int *ticksp)
1443 {
1444 	struct namecache_ts *ncp_ts;
1445 	struct namecache *ncp;
1446 	struct negstate *negstate;
1447 	struct rwlock *blp;
1448 	struct mtx *dvlp;
1449 	uint32_t hash;
1450 	enum vgetstate vs;
1451 	int error, ltype;
1452 	bool try_smr, doing_smr, whiteout;
1453 
1454 #ifdef DEBUG_CACHE
1455 	if (__predict_false(!doingcache)) {
1456 		cnp->cn_flags &= ~MAKEENTRY;
1457 		return (0);
1458 	}
1459 #endif
1460 
1461 	if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.'))
1462 		return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
1463 
1464 	if ((cnp->cn_flags & MAKEENTRY) == 0) {
1465 		cache_remove_cnp(dvp, cnp);
1466 		return (0);
1467 	}
1468 
	/*
	 * The first pass over the hash chain is done under SMR, except for
	 * CREATE: a negative hit is zapped in that case, which is only done
	 * with locks held (see the MPASS at negative_success).
	 */
1469 	try_smr = true;
1470 	if (cnp->cn_nameiop == CREATE)
1471 		try_smr = false;
1472 retry:
1473 	doing_smr = false;
1474 	blp = NULL;
1475 	dvlp = NULL;
1476 	error = 0;
1477 	if (cnp->cn_namelen == 2 &&
1478 	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
		/* ".." goes through v_cache_dd under the vnode lock of dvp. */
1479 		counter_u64_add(dotdothits, 1);
1480 		dvlp = VP2VNODELOCK(dvp);
1481 		mtx_lock(dvlp);
1482 		ncp = dvp->v_cache_dd;
1483 		if (ncp == NULL) {
1484 			SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
1485 			    "..", NULL);
1486 			mtx_unlock(dvlp);
1487 			return (0);
1488 		}
		/*
		 * v_cache_dd is either a ".." entry, whose target is the
		 * parent, or a regular entry naming dvp, whose directory is
		 * the parent.
		 */
1489 		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1490 			if (ncp->nc_flag & NCF_NEGATIVE)
1491 				*vpp = NULL;
1492 			else
1493 				*vpp = ncp->nc_vp;
1494 		} else
1495 			*vpp = ncp->nc_dvp;
1496 		/* Return failure if negative entry was found. */
1497 		if (*vpp == NULL)
1498 			goto negative_success;
1499 		CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..",
1500 		    dvp, cnp->cn_nameptr, *vpp);
1501 		SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..",
1502 		    *vpp);
1503 		cache_out_ts(ncp, tsp, ticksp);
		/* A regular entry carrying NCF_DTS supplies the ".." time. */
1504 		if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
1505 		    NCF_DTS && tsp != NULL) {
1506 			ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1507 			*tsp = ncp_ts->nc_dotdottime;
1508 		}
1509 		goto success;
1510 	}
1511 
1512 	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1513 retry_hashed:
	/* SMR is tried once; every later pass takes the bucket read lock. */
1514 	if (try_smr) {
1515 		vfs_smr_enter();
1516 		doing_smr = true;
1517 		try_smr = false;
1518 	} else {
1519 		blp = HASH2BUCKETLOCK(hash);
1520 		rw_rlock(blp);
1521 	}
1522 
1523 	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1524 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1525 		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1526 			break;
1527 	}
1528 
1529 	/* We failed to find an entry */
1530 	if (__predict_false(ncp == NULL)) {
1531 		if (doing_smr)
1532 			vfs_smr_exit();
1533 		else
1534 			rw_runlock(blp);
1535 		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
1536 		    NULL);
1537 		counter_u64_add(nummiss, 1);
1538 		return (0);
1539 	}
1540 
1541 	if (ncp->nc_flag & NCF_NEGATIVE)
1542 		goto negative_success;
1543 
1544 	/* We found a "positive" match, return the vnode */
1545 	counter_u64_add(numposhits, 1);
1546 	*vpp = ncp->nc_vp;
1547 	CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
1548 	    dvp, cnp->cn_nameptr, *vpp, ncp);
1549 	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name,
1550 	    *vpp);
1551 	cache_out_ts(ncp, tsp, ticksp);
1552 success:
1553 	/*
1554 	 * On success we return a locked and ref'd vnode as per the lookup
1555 	 * protocol.
1556 	 */
1557 	MPASS(dvp != *vpp);
1558 	ltype = 0;	/* silence gcc warning */
	/* For ".." dvp is unlocked here and relocked after vget_finish(). */
1559 	if (cnp->cn_flags & ISDOTDOT) {
1560 		ltype = VOP_ISLOCKED(dvp);
1561 		VOP_UNLOCK(dvp);
1562 	}
1563 	if (doing_smr) {
		/* The entry may have been invalidated while we looked at it. */
1564 		if (!cache_ncp_canuse(ncp)) {
1565 			vfs_smr_exit();
1566 			*vpp = NULL;
1567 			goto retry;
1568 		}
1569 		vs = vget_prep_smr(*vpp);
1570 		vfs_smr_exit();
1571 		if (__predict_false(vs == VGET_NONE)) {
1572 			*vpp = NULL;
1573 			goto retry;
1574 		}
1575 	} else {
1576 		vs = vget_prep(*vpp);
1577 		cache_lookup_unlock(blp, dvlp);
1578 	}
1579 	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1580 	if (cnp->cn_flags & ISDOTDOT) {
1581 		vn_lock(dvp, ltype | LK_RETRY);
1582 		if (VN_IS_DOOMED(dvp)) {
1583 			if (error == 0)
1584 				vput(*vpp);
1585 			*vpp = NULL;
1586 			return (ENOENT);
1587 		}
1588 	}
	/* vget_finish() failed: start the lookup over. */
1589 	if (error) {
1590 		*vpp = NULL;
1591 		goto retry;
1592 	}
1593 	if ((cnp->cn_flags & ISLASTCN) &&
1594 	    (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) {
1595 		ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
1596 	}
1597 	return (-1);
1598 
1599 negative_success:
1600 	/* We found a negative match, and want to create it, so purge */
1601 	if (cnp->cn_nameiop == CREATE) {
1602 		MPASS(!doing_smr);
1603 		counter_u64_add(numnegzaps, 1);
1604 		goto zap_and_exit;
1605 	}
1606 
1607 	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
1608 	cache_out_ts(ncp, tsp, ticksp);
1609 	counter_u64_add(numneghits, 1);
1610 	whiteout = (ncp->nc_flag & NCF_WHITE);
1611 
1612 	if (doing_smr) {
1613 		/*
1614 		 * We need to take locks to promote an entry.
		 * If the entry is not hot yet, or can no longer be used,
		 * redo the lookup under the bucket lock.
1615 		 */
1616 		negstate = NCP2NEGSTATE(ncp);
1617 		if ((negstate->neg_flag & NEG_HOT) == 0 ||
1618 		    !cache_ncp_canuse(ncp)) {
1619 			vfs_smr_exit();
1620 			doing_smr = false;
1621 			goto retry_hashed;
1622 		}
1623 		vfs_smr_exit();
1624 	} else {
1625 		cache_negative_hit(ncp);
1626 		cache_lookup_unlock(blp, dvlp);
1627 	}
1628 	if (whiteout)
1629 		cnp->cn_flags |= ISWHITEOUT;
1630 	return (ENOENT);
1631 
1632 zap_and_exit:
1633 	MPASS(!doing_smr);
	/*
	 * blp is set when the entry came from the hash chain and NULL when
	 * it came from v_cache_dd with the vnode lock of dvp held.  Both
	 * helpers release the lock we hold.
	 */
1634 	if (blp != NULL)
1635 		error = cache_zap_rlocked_bucket(ncp, cnp, hash, blp);
1636 	else
1637 		error = cache_zap_locked_vnode(ncp, dvp);
1638 	if (__predict_false(error != 0)) {
1639 		zap_and_exit_bucket_fail2++;
1640 		cache_maybe_yield();
1641 		goto retry;
1642 	}
1643 	cache_free(ncp);
1644 	return (0);
1645 }
1646 
/*
 * Set of locks held while an entry is being added: up to three vnode locks
 * and up to two bucket locks.  Unused slots are NULL.
 */
1647 struct celockstate {
1648 	struct mtx *vlp[3];
1649 	struct rwlock *blp[2];
1650 };
/* The lock/unlock helpers index these arrays with fixed slot numbers. */
1651 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
1652 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
1653 
1654 static inline void
1655 cache_celockstate_init(struct celockstate *cel)
1656 {
1657 
1658 	bzero(cel, sizeof(*cel));
1659 }
1660 
1661 static void
1662 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
1663     struct vnode *dvp)
1664 {
1665 	struct mtx *vlp1, *vlp2;
1666 
1667 	MPASS(cel->vlp[0] == NULL);
1668 	MPASS(cel->vlp[1] == NULL);
1669 	MPASS(cel->vlp[2] == NULL);
1670 
1671 	MPASS(vp != NULL || dvp != NULL);
1672 
1673 	vlp1 = VP2VNODELOCK(vp);
1674 	vlp2 = VP2VNODELOCK(dvp);
1675 	cache_sort_vnodes(&vlp1, &vlp2);
1676 
1677 	if (vlp1 != NULL) {
1678 		mtx_lock(vlp1);
1679 		cel->vlp[0] = vlp1;
1680 	}
1681 	mtx_lock(vlp2);
1682 	cel->vlp[1] = vlp2;
1683 }
1684 
1685 static void
1686 cache_unlock_vnodes_cel(struct celockstate *cel)
1687 {
1688 
1689 	MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
1690 
1691 	if (cel->vlp[0] != NULL)
1692 		mtx_unlock(cel->vlp[0]);
1693 	if (cel->vlp[1] != NULL)
1694 		mtx_unlock(cel->vlp[1]);
1695 	if (cel->vlp[2] != NULL)
1696 		mtx_unlock(cel->vlp[2]);
1697 }
1698 
/*
 * Lock the vnode lock of vp as the third vnode lock of cel.
 *
 * Slots 0 and 1 are expected to be locked already (asserted) and slot 2 to
 * be free.  On return slot 2 records the lock of vp.
 *
 * Returns true if the lock was obtained without letting go of the other
 * two.  Returns false if all vnode locks had to be dropped and re-taken in
 * address order; the caller then has to re-validate whatever it looked at
 * under the old locks.
 */
1699 static bool
1700 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
1701 {
1702 	struct mtx *vlp;
1703 	bool ret;
1704 
1705 	cache_assert_vlp_locked(cel->vlp[0]);
1706 	cache_assert_vlp_locked(cel->vlp[1]);
1707 	MPASS(cel->vlp[2] == NULL);
1708 
1709 	MPASS(vp != NULL);
1710 	vlp = VP2VNODELOCK(vp);
1711 
1712 	ret = true;
1713 	if (vlp >= cel->vlp[1]) {
		/* Not below the highest lock held: blocking keeps the order. */
1714 		mtx_lock(vlp);
1715 	} else {
		/* Out of order: try first, fall back to a full relock. */
1716 		if (mtx_trylock(vlp))
1717 			goto out;
1718 		cache_lock_vnodes_cel_3_failures++;
1719 		cache_unlock_vnodes_cel(cel);
1720 		if (vlp < cel->vlp[0]) {
1721 			mtx_lock(vlp);
1722 			mtx_lock(cel->vlp[0]);
1723 			mtx_lock(cel->vlp[1]);
1724 		} else {
			/* vlp sorts between slot 0 (possibly unused) and slot 1. */
1725 			if (cel->vlp[0] != NULL)
1726 				mtx_lock(cel->vlp[0]);
1727 			mtx_lock(vlp);
1728 			mtx_lock(cel->vlp[1]);
1729 		}
1730 		ret = false;
1731 	}
1732 out:
1733 	cel->vlp[2] = vlp;
1734 	return (ret);
1735 }
1736 
1737 static void
1738 cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1,
1739     struct rwlock *blp2)
1740 {
1741 
1742 	MPASS(cel->blp[0] == NULL);
1743 	MPASS(cel->blp[1] == NULL);
1744 
1745 	cache_sort_vnodes(&blp1, &blp2);
1746 
1747 	if (blp1 != NULL) {
1748 		rw_wlock(blp1);
1749 		cel->blp[0] = blp1;
1750 	}
1751 	rw_wlock(blp2);
1752 	cel->blp[1] = blp2;
1753 }
1754 
1755 static void
1756 cache_unlock_buckets_cel(struct celockstate *cel)
1757 {
1758 
1759 	if (cel->blp[0] != NULL)
1760 		rw_wunlock(cel->blp[0]);
1761 	rw_wunlock(cel->blp[1]);
1762 }
1763 
1764 /*
1765  * Lock part of the cache affected by the insertion.
1766  *
1767  * This means vnodelocks for dvp, vp and the relevant bucketlock.
1768  * However, insertion can result in removal of an old entry. In this
1769  * case we have an additional vnode and bucketlock pair to lock. If the
1770  * old entry is negative there is no third vnode; only its bucketlock is
 * added.
1771  *
1772  * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
1773  * preserving the locking order (smaller address first).
 *
 * The old entry in question is the ".." entry that vp->v_cache_dd may point
 * at when vp is a directory.
1774  */
1775 static void
1776 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
1777     uint32_t hash)
1778 {
1779 	struct namecache *ncp;
1780 	struct rwlock *blps[2];
1781 
1782 	blps[0] = HASH2BUCKETLOCK(hash);
1783 	for (;;) {
1784 		blps[1] = NULL;
1785 		cache_lock_vnodes_cel(cel, dvp, vp);
		/* Only a directory target can have a ".." entry to replace. */
1786 		if (vp == NULL || vp->v_type != VDIR)
1787 			break;
1788 		ncp = vp->v_cache_dd;
1789 		if (ncp == NULL)
1790 			break;
1791 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
1792 			break;
1793 		MPASS(ncp->nc_dvp == vp);
1794 		blps[1] = NCP2BUCKETLOCK(ncp);
1795 		if (ncp->nc_flag & NCF_NEGATIVE)
1796 			break;
		/* true: third lock obtained without dropping the other two. */
1797 		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
1798 			break;
1799 		/*
1800 		 * All vnodes got re-locked. Re-validate the state and if
1801 		 * nothing changed we are done. Otherwise restart.
1802 		 */
1803 		if (ncp == vp->v_cache_dd &&
1804 		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
1805 		    blps[1] == NCP2BUCKETLOCK(ncp) &&
1806 		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
1807 			break;
1808 		cache_unlock_vnodes_cel(cel);
1809 		cel->vlp[0] = NULL;
1810 		cel->vlp[1] = NULL;
1811 		cel->vlp[2] = NULL;
1812 	}
	/* Bucket locks are taken last, after all vnode locks. */
1813 	cache_lock_buckets_cel(cel, blps[0], blps[1]);
1814 }
1815 
/*
 * Variant of cache_enter_lock() for the case where the entry that may have
 * to be removed is the ".." entry hanging off dvp->v_cache_dd.
 *
 * Locks the vnode locks of dvp and vp, the bucket lock for hash and, if dvp
 * has a ".." entry, that entry's bucket lock plus, for a positive entry, the
 * vnode lock of its target.
 */
1816 static void
1817 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
1818     uint32_t hash)
1819 {
1820 	struct namecache *ncp;
1821 	struct rwlock *blps[2];
1822 
1823 	blps[0] = HASH2BUCKETLOCK(hash);
1824 	for (;;) {
1825 		blps[1] = NULL;
1826 		cache_lock_vnodes_cel(cel, dvp, vp);
1827 		ncp = dvp->v_cache_dd;
1828 		if (ncp == NULL)
1829 			break;
1830 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
1831 			break;
1832 		MPASS(ncp->nc_dvp == dvp);
1833 		blps[1] = NCP2BUCKETLOCK(ncp);
		/* A negative ".." entry has no target vnode to lock. */
1834 		if (ncp->nc_flag & NCF_NEGATIVE)
1835 			break;
1836 		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
1837 			break;
		/*
		 * The vnode locks were dropped and re-taken.  Done if the
		 * entry still looks the same, otherwise start over.
		 */
1838 		if (ncp == dvp->v_cache_dd &&
1839 		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
1840 		    blps[1] == NCP2BUCKETLOCK(ncp) &&
1841 		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
1842 			break;
1843 		cache_unlock_vnodes_cel(cel);
1844 		cel->vlp[0] = NULL;
1845 		cel->vlp[1] = NULL;
1846 		cel->vlp[2] = NULL;
1847 	}
1848 	cache_lock_buckets_cel(cel, blps[0], blps[1]);
1849 }
1850 
/*
 * Undo cache_enter_lock() / cache_enter_lock_dd(): the bucket locks go
 * first, then the vnode locks, i.e. the reverse of the order in which they
 * were acquired.
 */
static void
cache_enter_unlock(struct celockstate *cel)
{

	cache_unlock_buckets_cel(cel);
	cache_unlock_vnodes_cel(cel);
}
1858 
1859 static void __noinline
1860 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
1861     struct componentname *cnp)
1862 {
1863 	struct celockstate cel;
1864 	struct namecache *ncp;
1865 	uint32_t hash;
1866 	int len;
1867 
1868 	if (dvp->v_cache_dd == NULL)
1869 		return;
1870 	len = cnp->cn_namelen;
1871 	cache_celockstate_init(&cel);
1872 	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
1873 	cache_enter_lock_dd(&cel, dvp, vp, hash);
1874 	vn_seqc_write_begin(dvp);
1875 	ncp = dvp->v_cache_dd;
1876 	if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
1877 		KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
1878 		cache_zap_locked(ncp);
1879 	} else {
1880 		ncp = NULL;
1881 	}
1882 	dvp->v_cache_dd = NULL;
1883 	vn_seqc_write_end(dvp);
1884 	cache_enter_unlock(&cel);
1885 	cache_free(ncp);
1886 }
1887 
1888 /*
1889  * Add an entry to the cache.
 *
 * - dvp:	Directory the name lives in.
 * - vp:	Vnode the name resolves to, or NULL to add a negative entry.
 * - cnp:	The name.  "." is never entered.  For ".." any previous
 *		v_cache_dd state of dvp is discarded first.
 * - tsp:	Optional timestamp to store with the entry.
 * - dtsp:	Optional ".." timestamp; only looked at when tsp is given.
 *
 * Nothing is added (and numdrops is bumped) once the number of entries has
 * reached ncsize.  If an entry with the same name already exists in dvp,
 * only its timestamps are refreshed.
1890  */
1891 void
1892 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
1893     struct timespec *tsp, struct timespec *dtsp)
1894 {
1895 	struct celockstate cel;
1896 	struct namecache *ncp, *n2, *ndd;
1897 	struct namecache_ts *ncp_ts, *n2_ts;
1898 	struct nchashhead *ncpp;
1899 	uint32_t hash;
1900 	int flag;
1901 	int len;
1902 	u_long lnumcache;
1903 
1904 	CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr);
1905 	VNPASS(!VN_IS_DOOMED(dvp), dvp);
1906 	VNPASS(dvp->v_type != VNON, dvp);
1907 	if (vp != NULL) {
1908 		VNPASS(!VN_IS_DOOMED(vp), vp);
1909 		VNPASS(vp->v_type != VNON, vp);
1910 	}
1911 
1912 #ifdef DEBUG_CACHE
1913 	if (__predict_false(!doingcache))
1914 		return;
1915 #endif
1916 
1917 	flag = 0;
1918 	if (__predict_false(cnp->cn_nameptr[0] == '.')) {
1919 		if (cnp->cn_namelen == 1)
1920 			return;
1921 		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
1922 			cache_enter_dotdot_prep(dvp, vp, cnp);
1923 			flag = NCF_ISDOTDOT;
1924 		}
1925 	}
1926 
1927 	/*
1928 	 * Avoid blowout in namecache entries.
	 * The slot is reserved up front and given back if it turns out to
	 * be over the limit.
1929 	 */
1930 	lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
1931 	if (__predict_false(lnumcache >= ncsize)) {
1932 		atomic_add_long(&numcache, -1);
1933 		counter_u64_add(numdrops, 1);
1934 		return;
1935 	}
1936 
1937 	cache_celockstate_init(&cel);
1938 	ndd = NULL;
1939 	ncp_ts = NULL;
1940 
1941 	/*
1942 	 * Calculate the hash key and setup as much of the new
1943 	 * namecache entry as possible before acquiring the lock.
1944 	 */
1945 	ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
	/* NCF_WIP stays set until the entry is fully linked in below. */
1946 	ncp->nc_flag = flag | NCF_WIP;
1947 	ncp->nc_vp = vp;
1948 	if (vp == NULL)
1949 		cache_negative_init(ncp);
1950 	ncp->nc_dvp = dvp;
1951 	if (tsp != NULL) {
1952 		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1953 		ncp_ts->nc_time = *tsp;
1954 		ncp_ts->nc_ticks = ticks;
1955 		ncp_ts->nc_nc.nc_flag |= NCF_TS;
1956 		if (dtsp != NULL) {
1957 			ncp_ts->nc_dotdottime = *dtsp;
1958 			ncp_ts->nc_nc.nc_flag |= NCF_DTS;
1959 		}
1960 	}
1961 	len = ncp->nc_nlen = cnp->cn_namelen;
1962 	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
1963 	memcpy(ncp->nc_name, cnp->cn_nameptr, len);
1964 	ncp->nc_name[len] = '\0';
1965 	cache_enter_lock(&cel, dvp, vp, hash);
1966 
1967 	/*
1968 	 * See if this vnode or negative entry is already in the cache
1969 	 * with this name.  This can happen with concurrent lookups of
1970 	 * the same path name.
1971 	 */
1972 	ncpp = NCHHASH(hash);
1973 	CK_SLIST_FOREACH(n2, ncpp, nc_hash) {
1974 		if (n2->nc_dvp == dvp &&
1975 		    n2->nc_nlen == cnp->cn_namelen &&
1976 		    !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
1977 			MPASS(cache_ncp_canuse(n2));
1978 			if ((n2->nc_flag & NCF_NEGATIVE) != 0)
1979 				KASSERT(vp == NULL,
1980 				    ("%s: found entry pointing to a different vnode (%p != %p)",
1981 				    __func__, NULL, vp));
1982 			else
1983 				KASSERT(n2->nc_vp == vp,
1984 				    ("%s: found entry pointing to a different vnode (%p != %p)",
1985 				    __func__, n2->nc_vp, vp));
			/* Refresh the timestamps of the existing entry. */
1986 			if (tsp != NULL) {
1987 				KASSERT((n2->nc_flag & NCF_TS) != 0,
1988 				    ("no NCF_TS"));
1989 				n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
1990 				n2_ts->nc_time = ncp_ts->nc_time;
1991 				n2_ts->nc_ticks = ncp_ts->nc_ticks;
1992 				if (dtsp != NULL) {
1993 					n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
1994 					n2_ts->nc_nc.nc_flag |= NCF_DTS;
1995 				}
1996 			}
1997 			goto out_unlock_free;
1998 		}
1999 	}
2000 
2001 	if (flag == NCF_ISDOTDOT) {
2002 		/*
2003 		 * See if we are trying to add .. entry, but some other lookup
2004 		 * has populated v_cache_dd pointer already.
2005 		 */
2006 		if (dvp->v_cache_dd != NULL)
2007 			goto out_unlock_free;
2008 		KASSERT(vp == NULL || vp->v_type == VDIR,
2009 		    ("wrong vnode type %p", vp));
2010 		vn_seqc_write_begin(dvp);
2011 		dvp->v_cache_dd = ncp;
2012 		vn_seqc_write_end(dvp);
2013 	}
2014 
2015 	if (vp != NULL) {
2016 		if (flag != NCF_ISDOTDOT) {
2017 			/*
2018 			 * For this case, the cache entry maps both the
2019 			 * directory name in it and the name ".." for the
2020 			 * directory's parent.
			 * A ".." entry previously hanging off vp is zapped
			 * here and freed after the locks are dropped.
2021 			 */
2022 			vn_seqc_write_begin(vp);
2023 			if ((ndd = vp->v_cache_dd) != NULL) {
2024 				if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
2025 					cache_zap_locked(ndd);
2026 				else
2027 					ndd = NULL;
2028 			}
2029 			vp->v_cache_dd = ncp;
2030 			vn_seqc_write_end(vp);
2031 		} else if (vp->v_type != VDIR) {
2032 			if (vp->v_cache_dd != NULL) {
2033 				vn_seqc_write_begin(vp);
2034 				vp->v_cache_dd = NULL;
2035 				vn_seqc_write_end(vp);
2036 			}
2037 		}
2038 	}
2039 
2040 	if (flag != NCF_ISDOTDOT) {
		/* The first entry sourced from dvp takes a hold on it. */
2041 		if (LIST_EMPTY(&dvp->v_cache_src)) {
2042 			vhold(dvp);
2043 			counter_u64_add(numcachehv, 1);
2044 		}
2045 		LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
2046 	}
2047 
2048 	/*
2049 	 * If the entry is "negative", we place it into the
2050 	 * "negative" cache queue, otherwise, we place it into the
2051 	 * destination vnode's cache entries queue.
2052 	 */
2053 	if (vp != NULL) {
2054 		TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
2055 		SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
2056 		    vp);
2057 	} else {
2058 		if (cnp->cn_flags & ISWHITEOUT)
2059 			ncp->nc_flag |= NCF_WHITE;
2060 		cache_negative_insert(ncp);
2061 		SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
2062 		    ncp->nc_name);
2063 	}
2064 
2065 	/*
2066 	 * Insert the new namecache entry into the appropriate chain
2067 	 * within the cache entries table.
2068 	 */
2069 	CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash);
2070 
2071 	atomic_thread_fence_rel();
2072 	/*
2073 	 * Mark the entry as fully constructed.
2074 	 * It is immutable past this point until its removal.
2075 	 */
2076 	atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP);
2077 
2078 	cache_enter_unlock(&cel);
	/* Too many negative entries relative to the total: evict one. */
2079 	if (numneg * ncnegfactor > lnumcache)
2080 		cache_negative_zap_one();
2081 	cache_free(ndd);
2082 	return;
2083 out_unlock_free:
	/* Nothing was inserted: give back the reserved slot and the entry. */
2084 	cache_enter_unlock(&cel);
2085 	atomic_add_long(&numcache, -1);
2086 	cache_free(ncp);
2087 	return;
2088 }
2089 
/*
 * Return the smallest power of two that is strictly greater than val.
 *
 * The result saturates at the largest power of two a u_int can hold.
 * Without that cap the shift wraps res to 0 once val reaches 2^31, the
 * loop condition stays true forever and the function never returns.
 */
static u_int
cache_roundup_2(u_int val)
{
	/* Only the top bit set: the largest representable power of two. */
	const u_int maxpow2 = ~(~0u >> 1);
	u_int res;

	for (res = 1; res <= val && res < maxpow2; res <<= 1)
		continue;

	return (res);
}
2100 
2101 static struct nchashhead *
2102 nchinittbl(u_long elements, u_long *hashmask)
2103 {
2104 	struct nchashhead *hashtbl;
2105 	u_long hashsize, i;
2106 
2107 	hashsize = cache_roundup_2(elements) / 2;
2108 
2109 	hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK);
2110 	for (i = 0; i < hashsize; i++)
2111 		CK_SLIST_INIT(&hashtbl[i]);
2112 	*hashmask = hashsize - 1;
2113 	return (hashtbl);
2114 }
2115 
2116 static void
2117 ncfreetbl(struct nchashhead *hashtbl)
2118 {
2119 
2120 	free(hashtbl, M_VFSCACHE);
2121 }
2122 
2123 /*
2124  * Name cache initialization, from vfs_init() when we are booting
2125  */
2126 static void
2127 nchinit(void *dummy __unused)
2128 {
2129 	u_int i;
2130 
2131 	cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE,
2132 	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2133 	cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE,
2134 	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2135 	cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE,
2136 	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2137 	cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE,
2138 	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2139 
2140 	VFS_SMR_ZONE_SET(cache_zone_small);
2141 	VFS_SMR_ZONE_SET(cache_zone_small_ts);
2142 	VFS_SMR_ZONE_SET(cache_zone_large);
2143 	VFS_SMR_ZONE_SET(cache_zone_large_ts);
2144 
2145 	ncsize = desiredvnodes * ncsizefactor;
2146 	nchashtbl = nchinittbl(desiredvnodes * 2, &nchash);
2147 	ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
2148 	if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
2149 		ncbuckethash = 7;
2150 	if (ncbuckethash > nchash)
2151 		ncbuckethash = nchash;
2152 	bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
2153 	    M_WAITOK | M_ZERO);
2154 	for (i = 0; i < numbucketlocks; i++)
2155 		rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE);
2156 	ncvnodehash = ncbuckethash;
2157 	vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
2158 	    M_WAITOK | M_ZERO);
2159 	for (i = 0; i < numvnodelocks; i++)
2160 		mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
2161 	ncpurgeminvnodes = numbucketlocks * 2;
2162 
2163 	neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE,
2164 	    M_WAITOK | M_ZERO);
2165 	for (i = 0; i < numneglists; i++) {
2166 		mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
2167 		TAILQ_INIT(&neglists[i].nl_list);
2168 	}
2169 	mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF);
2170 	TAILQ_INIT(&ncneg_hot.nl_list);
2171 
2172 	mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF);
2173 }
2174 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
2175 
2176 void
2177 cache_vnode_init(struct vnode *vp)
2178 {
2179 
2180 	LIST_INIT(&vp->v_cache_src);
2181 	TAILQ_INIT(&vp->v_cache_dst);
2182 	vp->v_cache_dd = NULL;
2183 	cache_prehash(vp);
2184 }
2185 
2186 void
2187 cache_changesize(u_long newmaxvnodes)
2188 {
2189 	struct nchashhead *new_nchashtbl, *old_nchashtbl;
2190 	u_long new_nchash, old_nchash;
2191 	struct namecache *ncp;
2192 	uint32_t hash;
2193 	u_long newncsize;
2194 	int i;
2195 
2196 	newncsize = newmaxvnodes * ncsizefactor;
2197 	newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
2198 	if (newmaxvnodes < numbucketlocks)
2199 		newmaxvnodes = numbucketlocks;
2200 
2201 	new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash);
2202 	/* If same hash table size, nothing to do */
2203 	if (nchash == new_nchash) {
2204 		ncfreetbl(new_nchashtbl);
2205 		return;
2206 	}
2207 	/*
2208 	 * Move everything from the old hash table to the new table.
2209 	 * None of the namecache entries in the table can be removed
2210 	 * because to do so, they have to be removed from the hash table.
2211 	 */
2212 	cache_lock_all_vnodes();
2213 	cache_lock_all_buckets();
2214 	old_nchashtbl = nchashtbl;
2215 	old_nchash = nchash;
2216 	nchashtbl = new_nchashtbl;
2217 	nchash = new_nchash;
2218 	for (i = 0; i <= old_nchash; i++) {
2219 		while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) {
2220 			hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
2221 			    ncp->nc_dvp);
2222 			CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash);
2223 			CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
2224 		}
2225 	}
2226 	ncsize = newncsize;
2227 	cache_unlock_all_buckets();
2228 	cache_unlock_all_vnodes();
2229 	ncfreetbl(old_nchashtbl);
2230 }
2231 
2232 /*
2233  * Invalidate all entries from and to a particular vnode.
2234  */
2235 static void
2236 cache_purge_impl(struct vnode *vp)
2237 {
2238 	TAILQ_HEAD(, namecache) ncps;
2239 	struct namecache *ncp, *nnp;
2240 	struct mtx *vlp, *vlp2;
2241 
2242 	TAILQ_INIT(&ncps);
2243 	vlp = VP2VNODELOCK(vp);
2244 	vlp2 = NULL;
2245 	mtx_assert(vlp, MA_OWNED);
2246 retry:
2247 	while (!LIST_EMPTY(&vp->v_cache_src)) {
2248 		ncp = LIST_FIRST(&vp->v_cache_src);
2249 		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2250 			goto retry;
2251 		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2252 	}
2253 	while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
2254 		ncp = TAILQ_FIRST(&vp->v_cache_dst);
2255 		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2256 			goto retry;
2257 		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2258 	}
2259 	ncp = vp->v_cache_dd;
2260 	if (ncp != NULL) {
2261 		KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
2262 		   ("lost dotdot link"));
2263 		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2264 			goto retry;
2265 		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2266 	}
2267 	KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
2268 	mtx_unlock(vlp);
2269 	if (vlp2 != NULL)
2270 		mtx_unlock(vlp2);
2271 	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2272 		cache_free(ncp);
2273 	}
2274 }
2275 
2276 void
2277 cache_purge(struct vnode *vp)
2278 {
2279 	struct mtx *vlp;
2280 
2281 	SDT_PROBE1(vfs, namecache, purge, done, vp);
2282 	if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2283 	    vp->v_cache_dd == NULL)
2284 		return;
2285 	vlp = VP2VNODELOCK(vp);
2286 	mtx_lock(vlp);
2287 	cache_purge_impl(vp);
2288 }
2289 
2290 /*
2291  * Only to be used by vgone.
2292  */
2293 void
2294 cache_purge_vgone(struct vnode *vp)
2295 {
2296 	struct mtx *vlp;
2297 
2298 	VNPASS(VN_IS_DOOMED(vp), vp);
2299 	vlp = VP2VNODELOCK(vp);
2300 	if (!(LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2301 	    vp->v_cache_dd == NULL)) {
2302 		mtx_lock(vlp);
2303 		cache_purge_impl(vp);
2304 		mtx_assert(vlp, MA_NOTOWNED);
2305 		return;
2306 	}
2307 
2308 	/*
2309 	 * All the NULL pointer state we found above may be transient.
2310 	 * Serialize against a possible thread doing cache_purge.
2311 	 */
2312 	mtx_wait_unlocked(vlp);
2313 	if (!(LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2314 	    vp->v_cache_dd == NULL)) {
2315 		mtx_lock(vlp);
2316 		cache_purge_impl(vp);
2317 		mtx_assert(vlp, MA_NOTOWNED);
2318 		return;
2319 	}
2320 	return;
2321 }
2322 
2323 /*
2324  * Invalidate all negative entries for a particular directory vnode.
2325  */
2326 void
2327 cache_purge_negative(struct vnode *vp)
2328 {
2329 	TAILQ_HEAD(, namecache) ncps;
2330 	struct namecache *ncp, *nnp;
2331 	struct mtx *vlp;
2332 
2333 	CTR1(KTR_VFS, "cache_purge_negative(%p)", vp);
2334 	SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
2335 	if (LIST_EMPTY(&vp->v_cache_src))
2336 		return;
2337 	TAILQ_INIT(&ncps);
2338 	vlp = VP2VNODELOCK(vp);
2339 	mtx_lock(vlp);
2340 	LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
2341 		if (!(ncp->nc_flag & NCF_NEGATIVE))
2342 			continue;
2343 		cache_zap_negative_locked_vnode_kl(ncp, vp);
2344 		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2345 	}
2346 	mtx_unlock(vlp);
2347 	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2348 		cache_free(ncp);
2349 	}
2350 }
2351 
2352 void
2353 cache_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
2354     struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp)
2355 {
2356 
2357 	ASSERT_VOP_IN_SEQC(fdvp);
2358 	ASSERT_VOP_IN_SEQC(fvp);
2359 	ASSERT_VOP_IN_SEQC(tdvp);
2360 	if (tvp != NULL)
2361 		ASSERT_VOP_IN_SEQC(tvp);
2362 
2363 	cache_purge(fvp);
2364 	if (tvp != NULL) {
2365 		cache_purge(tvp);
2366 		KASSERT(!cache_remove_cnp(tdvp, tcnp),
2367 		    ("%s: lingering negative entry", __func__));
2368 	} else {
2369 		cache_remove_cnp(tdvp, tcnp);
2370 	}
2371 }
2372 
/*
 * Flush all entries referencing a particular filesystem.
 *
 * Unless force is set, nothing is done for mounts whose vnode list size
 * does not exceed ncpurgeminvnodes.  Otherwise the hash table is walked
 * one bucket lock at a time: every chain guarded by that lock is scanned
 * and each entry whose directory vnode belongs to mp is zapped.  Zapped
 * entries are collected on a local list and freed at the very end, after
 * all locks have been released.
 */
void
cache_purgevfs(struct mount *mp, bool force)
{
	TAILQ_HEAD(, namecache) ncps;
	struct mtx *vlp1, *vlp2;
	struct rwlock *blp;
	struct nchashhead *bucket;
	struct namecache *ncp, *nnp;
	u_long i, j, n_nchash;
	int error;

	/* Scan hash tables for applicable entries */
	SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
	if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes)
		return;
	TAILQ_INIT(&ncps);
	n_nchash = nchash + 1;
	vlp1 = vlp2 = NULL;
	for (i = 0; i < numbucketlocks; i++) {
		blp = (struct rwlock *)&bucketlocks[i];
		rw_wlock(blp);
		/* Chains i, i + numbucketlocks, ... are the ones visited under lock i. */
		for (j = i; j < n_nchash; j += numbucketlocks) {
retry:
			bucket = &nchashtbl[j];
			CK_SLIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) {
				cache_assert_bucket_locked(ncp, RA_WLOCKED);
				if (ncp->nc_dvp->v_mount != mp)
					continue;
				error = cache_zap_wlocked_bucket_kl(ncp, blp,
				    &vlp1, &vlp2);
				/*
				 * A non-zero result restarts the scan of this
				 * chain from its head.
				 */
				if (error != 0)
					goto retry;
				TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst);
			}
		}
		rw_wunlock(blp);
		/* Yield only while no vnode lock is being carried over. */
		if (vlp1 == NULL && vlp2 == NULL)
			cache_maybe_yield();
	}
	/* Drop whatever vnode locks the zap helper left held. */
	if (vlp1 != NULL)
		mtx_unlock(vlp1);
	if (vlp2 != NULL)
		mtx_unlock(vlp2);

	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
		cache_free(ncp);
	}
}
2424 
2425 /*
2426  * Perform canonical checks and cache lookup and pass on to filesystem
2427  * through the vop_cachedlookup only if needed.
2428  */
2429 
2430 int
2431 vfs_cache_lookup(struct vop_lookup_args *ap)
2432 {
2433 	struct vnode *dvp;
2434 	int error;
2435 	struct vnode **vpp = ap->a_vpp;
2436 	struct componentname *cnp = ap->a_cnp;
2437 	int flags = cnp->cn_flags;
2438 
2439 	*vpp = NULL;
2440 	dvp = ap->a_dvp;
2441 
2442 	if (dvp->v_type != VDIR)
2443 		return (ENOTDIR);
2444 
2445 	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
2446 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
2447 		return (EROFS);
2448 
2449 	error = vn_dir_check_exec(dvp, cnp);
2450 	if (error != 0)
2451 		return (error);
2452 
2453 	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
2454 	if (error == 0)
2455 		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
2456 	if (error == -1)
2457 		return (0);
2458 	return (error);
2459 }
2460 
2461 /* Implementation of the getcwd syscall. */
2462 int
2463 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
2464 {
2465 	char *buf, *retbuf;
2466 	size_t buflen;
2467 	int error;
2468 
2469 	buflen = uap->buflen;
2470 	if (__predict_false(buflen < 2))
2471 		return (EINVAL);
2472 	if (buflen > MAXPATHLEN)
2473 		buflen = MAXPATHLEN;
2474 
2475 	buf = uma_zalloc(namei_zone, M_WAITOK);
2476 	error = vn_getcwd(buf, &retbuf, &buflen);
2477 	if (error == 0)
2478 		error = copyout(retbuf, uap->buf, buflen);
2479 	uma_zfree(namei_zone, buf);
2480 	return (error);
2481 }
2482 
2483 int
2484 vn_getcwd(char *buf, char **retbuf, size_t *buflen)
2485 {
2486 	struct pwd *pwd;
2487 	int error;
2488 
2489 	vfs_smr_enter();
2490 	pwd = pwd_get_smr();
2491 	error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf,
2492 	    buflen, false, 0);
2493 	VFS_SMR_ASSERT_NOT_ENTERED();
2494 	if (error < 0) {
2495 		pwd = pwd_hold(curthread);
2496 		error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf,
2497 		    retbuf, buflen);
2498 		pwd_drop(pwd);
2499 	}
2500 
2501 #ifdef KTRACE
2502 	if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
2503 		ktrnamei(*retbuf);
2504 #endif
2505 	return (error);
2506 }
2507 
2508 static int
2509 kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
2510     size_t size, int flags, enum uio_seg pathseg)
2511 {
2512 	struct nameidata nd;
2513 	char *retbuf, *freebuf;
2514 	int error;
2515 
2516 	if (flags != 0)
2517 		return (EINVAL);
2518 	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1,
2519 	    pathseg, path, fd, &cap_fstat_rights, td);
2520 	if ((error = namei(&nd)) != 0)
2521 		return (error);
2522 	error = vn_fullpath_hardlink(&nd, &retbuf, &freebuf, &size);
2523 	if (error == 0) {
2524 		error = copyout(retbuf, buf, size);
2525 		free(freebuf, M_TEMP);
2526 	}
2527 	NDFREE(&nd, 0);
2528 	return (error);
2529 }
2530 
2531 int
2532 sys___realpathat(struct thread *td, struct __realpathat_args *uap)
2533 {
2534 
2535 	return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
2536 	    uap->flags, UIO_USERSPACE));
2537 }
2538 
2539 /*
2540  * Retrieve the full filesystem path that correspond to a vnode from the name
2541  * cache (if available)
2542  */
2543 int
2544 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf)
2545 {
2546 	struct pwd *pwd;
2547 	char *buf;
2548 	size_t buflen;
2549 	int error;
2550 
2551 	if (__predict_false(vp == NULL))
2552 		return (EINVAL);
2553 
2554 	buflen = MAXPATHLEN;
2555 	buf = malloc(buflen, M_TEMP, M_WAITOK);
2556 	vfs_smr_enter();
2557 	pwd = pwd_get_smr();
2558 	error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, false, 0);
2559 	VFS_SMR_ASSERT_NOT_ENTERED();
2560 	if (error < 0) {
2561 		pwd = pwd_hold(curthread);
2562 		error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen);
2563 		pwd_drop(pwd);
2564 	}
2565 	if (error == 0)
2566 		*freebuf = buf;
2567 	else
2568 		free(buf, M_TEMP);
2569 	return (error);
2570 }
2571 
2572 /*
2573  * This function is similar to vn_fullpath, but it attempts to lookup the
2574  * pathname relative to the global root mount point.  This is required for the
2575  * auditing sub-system, as audited pathnames must be absolute, relative to the
2576  * global root mount point.
2577  */
2578 int
2579 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf)
2580 {
2581 	char *buf;
2582 	size_t buflen;
2583 	int error;
2584 
2585 	if (__predict_false(vp == NULL))
2586 		return (EINVAL);
2587 	buflen = MAXPATHLEN;
2588 	buf = malloc(buflen, M_TEMP, M_WAITOK);
2589 	vfs_smr_enter();
2590 	error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, false, 0);
2591 	VFS_SMR_ASSERT_NOT_ENTERED();
2592 	if (error < 0) {
2593 		error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen);
2594 	}
2595 	if (error == 0)
2596 		*freebuf = buf;
2597 	else
2598 		free(buf, M_TEMP);
2599 	return (error);
2600 }
2601 
2602 static struct namecache *
2603 vn_dd_from_dst(struct vnode *vp)
2604 {
2605 	struct namecache *ncp;
2606 
2607 	cache_assert_vnode_locked(vp);
2608 	TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) {
2609 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2610 			return (ncp);
2611 	}
2612 	return (NULL);
2613 }
2614 
2615 int
2616 vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, size_t *buflen)
2617 {
2618 	struct vnode *dvp;
2619 	struct namecache *ncp;
2620 	struct mtx *vlp;
2621 	int error;
2622 
2623 	vlp = VP2VNODELOCK(*vp);
2624 	mtx_lock(vlp);
2625 	ncp = (*vp)->v_cache_dd;
2626 	if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) {
2627 		KASSERT(ncp == vn_dd_from_dst(*vp),
2628 		    ("%s: mismatch for dd entry (%p != %p)", __func__,
2629 		    ncp, vn_dd_from_dst(*vp)));
2630 	} else {
2631 		ncp = vn_dd_from_dst(*vp);
2632 	}
2633 	if (ncp != NULL) {
2634 		if (*buflen < ncp->nc_nlen) {
2635 			mtx_unlock(vlp);
2636 			vrele(*vp);
2637 			counter_u64_add(numfullpathfail4, 1);
2638 			error = ENOMEM;
2639 			SDT_PROBE3(vfs, namecache, fullpath, return, error,
2640 			    vp, NULL);
2641 			return (error);
2642 		}
2643 		*buflen -= ncp->nc_nlen;
2644 		memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
2645 		SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
2646 		    ncp->nc_name, vp);
2647 		dvp = *vp;
2648 		*vp = ncp->nc_dvp;
2649 		vref(*vp);
2650 		mtx_unlock(vlp);
2651 		vrele(dvp);
2652 		return (0);
2653 	}
2654 	SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
2655 
2656 	mtx_unlock(vlp);
2657 	vn_lock(*vp, LK_SHARED | LK_RETRY);
2658 	error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen);
2659 	vput(*vp);
2660 	if (error) {
2661 		counter_u64_add(numfullpathfail2, 1);
2662 		SDT_PROBE3(vfs, namecache, fullpath, return,  error, vp, NULL);
2663 		return (error);
2664 	}
2665 
2666 	*vp = dvp;
2667 	if (VN_IS_DOOMED(dvp)) {
2668 		/* forced unmount */
2669 		vrele(dvp);
2670 		error = ENOENT;
2671 		SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
2672 		return (error);
2673 	}
2674 	/*
2675 	 * *vp has its use count incremented still.
2676 	 */
2677 
2678 	return (0);
2679 }
2680 
2681 /*
2682  * Resolve a directory to a pathname.
2683  *
2684  * The name of the directory can always be found in the namecache or fetched
2685  * from the filesystem. There is also guaranteed to be only one parent, meaning
2686  * we can just follow vnodes up until we find the root.
2687  *
2688  * The vnode must be referenced.
2689  */
2690 static int
2691 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
2692     size_t *len, bool slash_prefixed, size_t addend)
2693 {
2694 #ifdef KDTRACE_HOOKS
2695 	struct vnode *startvp = vp;
2696 #endif
2697 	struct vnode *vp1;
2698 	size_t buflen;
2699 	int error;
2700 
2701 	VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
2702 	VNPASS(vp->v_usecount > 0, vp);
2703 
2704 	buflen = *len;
2705 
2706 	if (!slash_prefixed) {
2707 		MPASS(*len >= 2);
2708 		buflen--;
2709 		buf[buflen] = '\0';
2710 	}
2711 
2712 	error = 0;
2713 
2714 	SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
2715 	counter_u64_add(numfullpathcalls, 1);
2716 	while (vp != rdir && vp != rootvnode) {
2717 		/*
2718 		 * The vp vnode must be already fully constructed,
2719 		 * since it is either found in namecache or obtained
2720 		 * from VOP_VPTOCNP().  We may test for VV_ROOT safely
2721 		 * without obtaining the vnode lock.
2722 		 */
2723 		if ((vp->v_vflag & VV_ROOT) != 0) {
2724 			vn_lock(vp, LK_RETRY | LK_SHARED);
2725 
2726 			/*
2727 			 * With the vnode locked, check for races with
2728 			 * unmount, forced or not.  Note that we
2729 			 * already verified that vp is not equal to
2730 			 * the root vnode, which means that
2731 			 * mnt_vnodecovered can be NULL only for the
2732 			 * case of unmount.
2733 			 */
2734 			if (VN_IS_DOOMED(vp) ||
2735 			    (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
2736 			    vp1->v_mountedhere != vp->v_mount) {
2737 				vput(vp);
2738 				error = ENOENT;
2739 				SDT_PROBE3(vfs, namecache, fullpath, return,
2740 				    error, vp, NULL);
2741 				break;
2742 			}
2743 
2744 			vref(vp1);
2745 			vput(vp);
2746 			vp = vp1;
2747 			continue;
2748 		}
2749 		if (vp->v_type != VDIR) {
2750 			vrele(vp);
2751 			counter_u64_add(numfullpathfail1, 1);
2752 			error = ENOTDIR;
2753 			SDT_PROBE3(vfs, namecache, fullpath, return,
2754 			    error, vp, NULL);
2755 			break;
2756 		}
2757 		error = vn_vptocnp(&vp, curthread->td_ucred, buf, &buflen);
2758 		if (error)
2759 			break;
2760 		if (buflen == 0) {
2761 			vrele(vp);
2762 			error = ENOMEM;
2763 			SDT_PROBE3(vfs, namecache, fullpath, return, error,
2764 			    startvp, NULL);
2765 			break;
2766 		}
2767 		buf[--buflen] = '/';
2768 		slash_prefixed = true;
2769 	}
2770 	if (error)
2771 		return (error);
2772 	if (!slash_prefixed) {
2773 		if (buflen == 0) {
2774 			vrele(vp);
2775 			counter_u64_add(numfullpathfail4, 1);
2776 			SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
2777 			    startvp, NULL);
2778 			return (ENOMEM);
2779 		}
2780 		buf[--buflen] = '/';
2781 	}
2782 	counter_u64_add(numfullpathfound, 1);
2783 	vrele(vp);
2784 
2785 	*retbuf = buf + buflen;
2786 	SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
2787 	*len -= buflen;
2788 	*len += addend;
2789 	return (0);
2790 }
2791 
2792 /*
2793  * Resolve an arbitrary vnode to a pathname.
2794  *
2795  * Note 2 caveats:
2796  * - hardlinks are not tracked, thus if the vnode is not a directory this can
2797  *   resolve to a different path than the one used to find it
2798  * - namecache is not mandatory, meaning names are not guaranteed to be added
2799  *   (in which case resolving fails)
2800  */
/*
 * Store the source line at which a reverse lookup was given up into
 * *reason.  Meant to be used through the cache_rev_failed() wrapper,
 * which supplies __LINE__.
 */
static __inline void
cache_rev_failed_impl(int *reason, int line)
{

	reason[0] = line;
}
#define cache_rev_failed(var)	cache_rev_failed_impl((var), __LINE__)
2808 
2809 static int
2810 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
2811     char **retbuf, size_t *buflen, bool slash_prefixed, size_t addend)
2812 {
2813 #ifdef KDTRACE_HOOKS
2814 	struct vnode *startvp = vp;
2815 #endif
2816 	struct vnode *tvp;
2817 	struct mount *mp;
2818 	struct namecache *ncp;
2819 	size_t orig_buflen;
2820 	int reason;
2821 	int error;
2822 #ifdef KDTRACE_HOOKS
2823 	int i;
2824 #endif
2825 	seqc_t vp_seqc, tvp_seqc;
2826 	u_char nc_flag;
2827 
2828 	VFS_SMR_ASSERT_ENTERED();
2829 
2830 	if (!cache_fast_revlookup) {
2831 		vfs_smr_exit();
2832 		return (-1);
2833 	}
2834 
2835 	orig_buflen = *buflen;
2836 
2837 	MPASS(*buflen >= 2);
2838 
2839 	if (!slash_prefixed) {
2840 		MPASS(*buflen >= 2);
2841 		*buflen -= 1;
2842 		buf[*buflen] = '\0';
2843 	}
2844 
2845 	if (vp == rdir || vp == rootvnode) {
2846 		if (!slash_prefixed) {
2847 			*buflen -= 1;
2848 			buf[*buflen] = '/';
2849 		}
2850 		goto out_ok;
2851 	}
2852 
2853 #ifdef KDTRACE_HOOKS
2854 	i = 0;
2855 #endif
2856 	error = -1;
2857 	vp_seqc = vn_seqc_read_any(vp);
2858 	if (seqc_in_modify(vp_seqc)) {
2859 		cache_rev_failed(&reason);
2860 		goto out_abort;
2861 	}
2862 
2863 	for (;;) {
2864 #ifdef KDTRACE_HOOKS
2865 		i++;
2866 #endif
2867 		if ((vp->v_vflag & VV_ROOT) != 0) {
2868 			mp = atomic_load_ptr(&vp->v_mount);
2869 			if (mp == NULL) {
2870 				cache_rev_failed(&reason);
2871 				goto out_abort;
2872 			}
2873 			tvp = atomic_load_ptr(&mp->mnt_vnodecovered);
2874 			tvp_seqc = vn_seqc_read_any(tvp);
2875 			if (seqc_in_modify(tvp_seqc)) {
2876 				cache_rev_failed(&reason);
2877 				goto out_abort;
2878 			}
2879 			if (!vn_seqc_consistent(vp, vp_seqc)) {
2880 				cache_rev_failed(&reason);
2881 				goto out_abort;
2882 			}
2883 			vp = tvp;
2884 			vp_seqc = tvp_seqc;
2885 			continue;
2886 		}
2887 		ncp = atomic_load_ptr(&vp->v_cache_dd);
2888 		if (ncp == NULL) {
2889 			cache_rev_failed(&reason);
2890 			goto out_abort;
2891 		}
2892 		nc_flag = atomic_load_char(&ncp->nc_flag);
2893 		if ((nc_flag & NCF_ISDOTDOT) != 0) {
2894 			cache_rev_failed(&reason);
2895 			goto out_abort;
2896 		}
2897 		if (!cache_ncp_canuse(ncp)) {
2898 			cache_rev_failed(&reason);
2899 			goto out_abort;
2900 		}
2901 		if (ncp->nc_nlen >= *buflen) {
2902 			cache_rev_failed(&reason);
2903 			error = ENOMEM;
2904 			goto out_abort;
2905 		}
2906 		*buflen -= ncp->nc_nlen;
2907 		memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
2908 		*buflen -= 1;
2909 		buf[*buflen] = '/';
2910 		tvp = ncp->nc_dvp;
2911 		tvp_seqc = vn_seqc_read_any(tvp);
2912 		if (seqc_in_modify(tvp_seqc)) {
2913 			cache_rev_failed(&reason);
2914 			goto out_abort;
2915 		}
2916 		if (!vn_seqc_consistent(vp, vp_seqc)) {
2917 			cache_rev_failed(&reason);
2918 			goto out_abort;
2919 		}
2920 		vp = tvp;
2921 		vp_seqc = tvp_seqc;
2922 		if (vp == rdir || vp == rootvnode)
2923 			break;
2924 	}
2925 out_ok:
2926 	vfs_smr_exit();
2927 	*retbuf = buf + *buflen;
2928 	*buflen = orig_buflen - *buflen + addend;
2929 	SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf);
2930 	return (0);
2931 
2932 out_abort:
2933 	*buflen = orig_buflen;
2934 	SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i);
2935 	vfs_smr_exit();
2936 	return (error);
2937 }
2938 
2939 static int
2940 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
2941     size_t *buflen)
2942 {
2943 	size_t orig_buflen;
2944 	bool slash_prefixed;
2945 	int error;
2946 
2947 	if (*buflen < 2)
2948 		return (EINVAL);
2949 
2950 	orig_buflen = *buflen;
2951 
2952 	vref(vp);
2953 	slash_prefixed = false;
2954 	if (vp->v_type != VDIR) {
2955 		*buflen -= 1;
2956 		buf[*buflen] = '\0';
2957 		error = vn_vptocnp(&vp, curthread->td_ucred, buf, buflen);
2958 		if (error)
2959 			return (error);
2960 		if (*buflen == 0) {
2961 			vrele(vp);
2962 			return (ENOMEM);
2963 		}
2964 		*buflen -= 1;
2965 		buf[*buflen] = '/';
2966 		slash_prefixed = true;
2967 	}
2968 
2969 	return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, slash_prefixed,
2970 	    orig_buflen - *buflen));
2971 }
2972 
2973 /*
2974  * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
2975  *
2976  * Since the namecache does not track handlings, the caller is expected to first
2977  * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei.
2978  *
2979  * Then we have 2 cases:
2980  * - if the found vnode is a directory, the path can be constructed just by
2981  *   fullowing names up the chain
2982  * - otherwise we populate the buffer with the saved name and start resolving
2983  *   from the parent
2984  */
2985 static int
2986 vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, char **freebuf,
2987     size_t *buflen)
2988 {
2989 	char *buf, *tmpbuf;
2990 	struct pwd *pwd;
2991 	struct componentname *cnp;
2992 	struct vnode *vp;
2993 	size_t addend;
2994 	int error;
2995 	bool slash_prefixed;
2996 	enum vtype type;
2997 
2998 	if (*buflen < 2)
2999 		return (EINVAL);
3000 	if (*buflen > MAXPATHLEN)
3001 		*buflen = MAXPATHLEN;
3002 
3003 	slash_prefixed = false;
3004 
3005 	buf = malloc(*buflen, M_TEMP, M_WAITOK);
3006 
3007 	addend = 0;
3008 	vp = ndp->ni_vp;
3009 	/*
3010 	 * Check for VBAD to work around the vp_crossmp bug in lookup().
3011 	 *
3012 	 * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be
3013 	 * set to mount point's root vnode while ni_dvp will be vp_crossmp.
3014 	 * If the type is VDIR (like in this very case) we can skip looking
3015 	 * at ni_dvp in the first place. However, since vnodes get passed here
3016 	 * unlocked the target may transition to doomed state (type == VBAD)
3017 	 * before we get to evaluate the condition. If this happens, we will
3018 	 * populate part of the buffer and descend to vn_fullpath_dir with
3019 	 * vp == vp_crossmp. Prevent the problem by checking for VBAD.
3020 	 *
3021 	 * This should be atomic_load(&vp->v_type) but it is ilegal to take
3022 	 * an address of a bit field, even if said field is sized to char.
3023 	 * Work around the problem by reading the value into a full-sized enum
3024 	 * and then re-reading it with atomic_load which will still prevent
3025 	 * the compiler from re-reading down the road.
3026 	 */
3027 	type = vp->v_type;
3028 	type = atomic_load_int(&type);
3029 	if (type == VBAD) {
3030 		error = ENOENT;
3031 		goto out_bad;
3032 	}
3033 	if (type != VDIR) {
3034 		cnp = &ndp->ni_cnd;
3035 		addend = cnp->cn_namelen + 2;
3036 		if (*buflen < addend) {
3037 			error = ENOMEM;
3038 			goto out_bad;
3039 		}
3040 		*buflen -= addend;
3041 		tmpbuf = buf + *buflen;
3042 		tmpbuf[0] = '/';
3043 		memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen);
3044 		tmpbuf[addend - 1] = '\0';
3045 		slash_prefixed = true;
3046 		vp = ndp->ni_dvp;
3047 	}
3048 
3049 	vfs_smr_enter();
3050 	pwd = pwd_get_smr();
3051 	error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3052 	    slash_prefixed, addend);
3053 	VFS_SMR_ASSERT_NOT_ENTERED();
3054 	if (error < 0) {
3055 		pwd = pwd_hold(curthread);
3056 		vref(vp);
3057 		error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3058 		    slash_prefixed, addend);
3059 		pwd_drop(pwd);
3060 		if (error != 0)
3061 			goto out_bad;
3062 	}
3063 
3064 	*freebuf = buf;
3065 
3066 	return (0);
3067 out_bad:
3068 	free(buf, M_TEMP);
3069 	return (error);
3070 }
3071 
3072 struct vnode *
3073 vn_dir_dd_ino(struct vnode *vp)
3074 {
3075 	struct namecache *ncp;
3076 	struct vnode *ddvp;
3077 	struct mtx *vlp;
3078 	enum vgetstate vs;
3079 
3080 	ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
3081 	vlp = VP2VNODELOCK(vp);
3082 	mtx_lock(vlp);
3083 	TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
3084 		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
3085 			continue;
3086 		ddvp = ncp->nc_dvp;
3087 		vs = vget_prep(ddvp);
3088 		mtx_unlock(vlp);
3089 		if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
3090 			return (NULL);
3091 		return (ddvp);
3092 	}
3093 	mtx_unlock(vlp);
3094 	return (NULL);
3095 }
3096 
3097 int
3098 vn_commname(struct vnode *vp, char *buf, u_int buflen)
3099 {
3100 	struct namecache *ncp;
3101 	struct mtx *vlp;
3102 	int l;
3103 
3104 	vlp = VP2VNODELOCK(vp);
3105 	mtx_lock(vlp);
3106 	TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
3107 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
3108 			break;
3109 	if (ncp == NULL) {
3110 		mtx_unlock(vlp);
3111 		return (ENOENT);
3112 	}
3113 	l = min(ncp->nc_nlen, buflen - 1);
3114 	memcpy(buf, ncp->nc_name, l);
3115 	mtx_unlock(vlp);
3116 	buf[l] = '\0';
3117 	return (0);
3118 }
3119 
3120 /*
3121  * This function updates path string to vnode's full global path
3122  * and checks the size of the new path string against the pathlen argument.
3123  *
3124  * Requires a locked, referenced vnode.
3125  * Vnode is re-locked on success or ENODEV, otherwise unlocked.
3126  *
3127  * If vp is a directory, the call to vn_fullpath_global() always succeeds
3128  * because it falls back to the ".." lookup if the namecache lookup fails.
3129  */
3130 int
3131 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
3132     u_int pathlen)
3133 {
3134 	struct nameidata nd;
3135 	struct vnode *vp1;
3136 	char *rpath, *fbuf;
3137 	int error;
3138 
3139 	ASSERT_VOP_ELOCKED(vp, __func__);
3140 
3141 	/* Construct global filesystem path from vp. */
3142 	VOP_UNLOCK(vp);
3143 	error = vn_fullpath_global(vp, &rpath, &fbuf);
3144 
3145 	if (error != 0) {
3146 		vrele(vp);
3147 		return (error);
3148 	}
3149 
3150 	if (strlen(rpath) >= pathlen) {
3151 		vrele(vp);
3152 		error = ENAMETOOLONG;
3153 		goto out;
3154 	}
3155 
3156 	/*
3157 	 * Re-lookup the vnode by path to detect a possible rename.
3158 	 * As a side effect, the vnode is relocked.
3159 	 * If vnode was renamed, return ENOENT.
3160 	 */
3161 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
3162 	    UIO_SYSSPACE, path, td);
3163 	error = namei(&nd);
3164 	if (error != 0) {
3165 		vrele(vp);
3166 		goto out;
3167 	}
3168 	NDFREE(&nd, NDF_ONLY_PNBUF);
3169 	vp1 = nd.ni_vp;
3170 	vrele(vp);
3171 	if (vp1 == vp)
3172 		strcpy(path, rpath);
3173 	else {
3174 		vput(vp1);
3175 		error = ENOENT;
3176 	}
3177 
3178 out:
3179 	free(fbuf, M_TEMP);
3180 	return (error);
3181 }
3182 
3183 #ifdef DDB
3184 static void
3185 db_print_vpath(struct vnode *vp)
3186 {
3187 
3188 	while (vp != NULL) {
3189 		db_printf("%p: ", vp);
3190 		if (vp == rootvnode) {
3191 			db_printf("/");
3192 			vp = NULL;
3193 		} else {
3194 			if (vp->v_vflag & VV_ROOT) {
3195 				db_printf("<mount point>");
3196 				vp = vp->v_mount->mnt_vnodecovered;
3197 			} else {
3198 				struct namecache *ncp;
3199 				char *ncn;
3200 				int i;
3201 
3202 				ncp = TAILQ_FIRST(&vp->v_cache_dst);
3203 				if (ncp != NULL) {
3204 					ncn = ncp->nc_name;
3205 					for (i = 0; i < ncp->nc_nlen; i++)
3206 						db_printf("%c", *ncn++);
3207 					vp = ncp->nc_dvp;
3208 				} else {
3209 					vp = NULL;
3210 				}
3211 			}
3212 		}
3213 		db_printf("\n");
3214 	}
3215 
3216 	return;
3217 }
3218 
3219 DB_SHOW_COMMAND(vpath, db_show_vpath)
3220 {
3221 	struct vnode *vp;
3222 
3223 	if (!have_addr) {
3224 		db_printf("usage: show vpath <struct vnode *>\n");
3225 		return;
3226 	}
3227 
3228 	vp = (struct vnode *)addr;
3229 	db_print_vpath(vp);
3230 }
3231 
3232 #endif
3233 
/*
 * Run-time switch for the fast path lookup, exported read-write as the
 * vfs.cache_fast_lookup sysctl.  cache_can_fplookup() aborts the fast path
 * whenever it is false.
 */
static bool __read_frequently cache_fast_lookup = true;
SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW,
    &cache_fast_lookup, 0, "");

/*
 * Value returned by the cache_fpl_aborted() and cache_fpl_partial() helpers
 * to tell the caller that the fast path did not produce a final result.
 */
#define CACHE_FPL_FAILED	-2020
3239 
3240 static void
3241 cache_fpl_cleanup_cnp(struct componentname *cnp)
3242 {
3243 
3244 	uma_zfree(namei_zone, cnp->cn_pnbuf);
3245 #ifdef DIAGNOSTIC
3246 	cnp->cn_pnbuf = NULL;
3247 	cnp->cn_nameptr = NULL;
3248 #endif
3249 }
3250 
3251 static void
3252 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp)
3253 {
3254 	struct componentname *cnp;
3255 
3256 	cnp = &ndp->ni_cnd;
3257 	while (*(cnp->cn_nameptr) == '/') {
3258 		cnp->cn_nameptr++;
3259 		ndp->ni_pathlen--;
3260 	}
3261 
3262 	*dpp = ndp->ni_rootdir;
3263 }
3264 
/*
 * Components of nameidata (or objects it can point to) which may
 * need restoring in case fast path lookup fails.
 *
 * Filled in by cache_fpl_checkpoint() and written back by
 * cache_fpl_restore().
 */
struct nameidata_saved {
	long cn_namelen;	/* copy of ni_cnd.cn_namelen */
	char *cn_nameptr;	/* copy of ni_cnd.cn_nameptr */
	size_t ni_pathlen;	/* copy of ni_pathlen */
	int cn_flags;		/* copy of ni_cnd.cn_flags */
};
3275 
/*
 * State of one fast path lookup attempt, passed to all of the cache_fpl*
 * and cache_fplookup* routines.
 */
struct cache_fpl {
	/* The lookup being serviced and the component name being processed. */
	struct nameidata *ndp;
	struct componentname *cnp;
	/*
	 * Compared against pwd_hold(curthread) in
	 * cache_fplookup_partial_setup(); a mismatch aborts the lookup.
	 */
	struct pwd *pwd;
	/* Directory currently being searched and the vnode found in it. */
	struct vnode *dvp;
	struct vnode *tvp;
	/* Sequence counter snapshots taken for dvp and tvp respectively. */
	seqc_t dvp_seqc;
	seqc_t tvp_seqc;
	/* Most recent checkpoint, see cache_fpl_checkpoint(). */
	struct nameidata_saved snd;
	/* Source line which last set the status; used in assertion messages. */
	int line;
	/* Outcome so far, set by cache_fpl_{aborted,partial,handled}(). */
	enum cache_fpl_status status:8;
	/* Maintained by the cache_fpl_smr_* macros: true while in vfs SMR. */
	bool in_smr;
};
3289 
3290 static void
3291 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd)
3292 {
3293 
3294 	snd->cn_flags = fpl->ndp->ni_cnd.cn_flags;
3295 	snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen;
3296 	snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
3297 	snd->ni_pathlen = fpl->ndp->ni_pathlen;
3298 }
3299 
3300 static void
3301 cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd)
3302 {
3303 
3304 	fpl->ndp->ni_cnd.cn_flags = snd->cn_flags;
3305 	fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen;
3306 	fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr;
3307 	fpl->ndp->ni_pathlen = snd->ni_pathlen;
3308 }
3309 
/*
 * Assertions checking that fpl->in_smr agrees with the actual vfs SMR
 * state.  They expand to nothing unless the kernel is built with
 * INVARIANTS.
 */
#ifdef INVARIANTS
#define cache_fpl_smr_assert_entered(fpl) ({			\
	struct cache_fpl *_fpl = (fpl);				\
	MPASS(_fpl->in_smr == true);				\
	VFS_SMR_ASSERT_ENTERED();				\
})
#define cache_fpl_smr_assert_not_entered(fpl) ({		\
	struct cache_fpl *_fpl = (fpl);				\
	MPASS(_fpl->in_smr == false);				\
	VFS_SMR_ASSERT_NOT_ENTERED();				\
})
#else
#define cache_fpl_smr_assert_entered(fpl) do { } while (0)
#define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
#endif
3325 
/*
 * Enter the vfs SMR section and record it in fpl->in_smr.  Unlike
 * cache_fpl_smr_enter() this variant does not check the previous value of
 * in_smr.
 */
#define cache_fpl_smr_enter_initial(fpl) ({			\
	struct cache_fpl *_fpl = (fpl);				\
	vfs_smr_enter();					\
	_fpl->in_smr = true;					\
})

/*
 * Enter the vfs SMR section; asserts that the lookup is not recorded as
 * already being inside of it.
 */
#define cache_fpl_smr_enter(fpl) ({				\
	struct cache_fpl *_fpl = (fpl);				\
	MPASS(_fpl->in_smr == false);				\
	vfs_smr_enter();					\
	_fpl->in_smr = true;					\
})

/*
 * Leave the vfs SMR section; asserts that the lookup is recorded as being
 * inside of it.
 */
#define cache_fpl_smr_exit(fpl) ({				\
	struct cache_fpl *_fpl = (fpl);				\
	MPASS(_fpl->in_smr == true);				\
	vfs_smr_exit();						\
	_fpl->in_smr = false;					\
})
3345 
3346 static int
3347 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
3348 {
3349 
3350 	if (fpl->status != CACHE_FPL_STATUS_UNSET) {
3351 		KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
3352 		    ("%s: converting to abort from %d at %d, set at %d\n",
3353 		    __func__, fpl->status, line, fpl->line));
3354 	}
3355 	fpl->status = CACHE_FPL_STATUS_ABORTED;
3356 	fpl->line = line;
3357 	return (CACHE_FPL_FAILED);
3358 }
3359 
3360 #define cache_fpl_aborted(x)	cache_fpl_aborted_impl((x), __LINE__)
3361 
3362 static int
3363 cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
3364 {
3365 
3366 	KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3367 	    ("%s: setting to partial at %d, but already set to %d at %d\n",
3368 	    __func__, line, fpl->status, fpl->line));
3369 	cache_fpl_smr_assert_entered(fpl);
3370 	fpl->status = CACHE_FPL_STATUS_PARTIAL;
3371 	fpl->line = line;
3372 	return (CACHE_FPL_FAILED);
3373 }
3374 
3375 #define cache_fpl_partial(x)	cache_fpl_partial_impl((x), __LINE__)
3376 
3377 static int
3378 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line)
3379 {
3380 
3381 	KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3382 	    ("%s: setting to handled at %d, but already set to %d at %d\n",
3383 	    __func__, line, fpl->status, fpl->line));
3384 	cache_fpl_smr_assert_not_entered(fpl);
3385 	MPASS(error != CACHE_FPL_FAILED);
3386 	fpl->status = CACHE_FPL_STATUS_HANDLED;
3387 	fpl->line = line;
3388 	return (error);
3389 }
3390 
3391 #define cache_fpl_handled(x, e)	cache_fpl_handled_impl((x), (e), __LINE__)
3392 
/*
 * Lookup flags the fast path can cope with.  cache_can_fplookup() aborts
 * the fast path if any other bit is set in cn_flags.
 */
#define CACHE_FPL_SUPPORTED_CN_FLAGS \
	(LOCKLEAF | LOCKPARENT | WANTPARENT | NOCACHE | FOLLOW | LOCKSHARED | SAVENAME | \
	 SAVESTART | WILLBEDIR | ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK)

/*
 * Flags which cache_fplookup_partial_setup() sets on its own when handing
 * the lookup over; they must never be part of the supported set above.
 */
#define CACHE_FPL_INTERNAL_CN_FLAGS \
	(ISDOTDOT | MAKEENTRY | ISLASTCN)

_Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
    "supported and internal flags overlap");
3402 
3403 static bool
3404 cache_fpl_islastcn(struct nameidata *ndp)
3405 {
3406 
3407 	return (*ndp->ni_next == 0);
3408 }
3409 
3410 static bool
3411 cache_fpl_isdotdot(struct componentname *cnp)
3412 {
3413 
3414 	if (cnp->cn_namelen == 2 &&
3415 	    cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
3416 		return (true);
3417 	return (false);
3418 }
3419 
/*
 * Decide whether the fast path may be attempted for this lookup at all.
 *
 * Returns true if so.  Otherwise the lookup is marked as aborted and false
 * is returned.  Each reason has its own cache_fpl_aborted() call so that
 * the line recorded in the fpl identifies which check failed.
 */
static bool
cache_can_fplookup(struct cache_fpl *fpl)
{
	struct nameidata *ndp;
	struct componentname *cnp;
	struct thread *td;

	ndp = fpl->ndp;
	cnp = fpl->cnp;
	td = cnp->cn_thread;

	/* Disabled with the vfs.cache_fast_lookup sysctl. */
	if (!cache_fast_lookup) {
		cache_fpl_aborted(fpl);
		return (false);
	}
#ifdef MAC
	/* MAC lookup checks are enabled. */
	if (mac_vnode_check_lookup_enabled()) {
		cache_fpl_aborted(fpl);
		return (false);
	}
#endif
	/* The caller passed a flag outside of the supported set. */
	if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
		cache_fpl_aborted(fpl);
		return (false);
	}
	/* Only lookups with AT_FDCWD as the directory fd are handled. */
	if (ndp->ni_dirfd != AT_FDCWD) {
		cache_fpl_aborted(fpl);
		return (false);
	}
	/* The thread is in capability mode. */
	if (IN_CAPABILITY_MODE(td)) {
		cache_fpl_aborted(fpl);
		return (false);
	}
	/* The thread is being audited. */
	if (AUDITING_TD(td)) {
		cache_fpl_aborted(fpl);
		return (false);
	}
	/* An explicit start directory was supplied. */
	if (ndp->ni_startdir != NULL) {
		cache_fpl_aborted(fpl);
		return (false);
	}
	return (true);
}
3463 
3464 static bool
3465 cache_fplookup_vnode_supported(struct vnode *vp)
3466 {
3467 
3468 	return (vp->v_type != VLNK);
3469 }
3470 
/*
 * Move a negative entry to the hot list.
 *
 * We have to take locks, but they may be contended and in the worst
 * case we may need to go off CPU. We don't want to spin within the
 * smr section and we can't block with it. Instead we are going to
 * look up the entry again.
 *
 * 'oncp' is the negative entry found by the caller and 'hash' the hash it
 * was found under.  Returns ENOENT with the lookup marked as handled when
 * the very same entry is still present and usable, otherwise the lookup is
 * marked as aborted.
 */
static int __noinline
cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
    uint32_t hash)
{
	struct componentname *cnp;
	struct namecache *ncp;
	struct neglist *neglist;
	struct negstate *negstate;
	struct vnode *dvp;
	u_char nc_flag;

	cnp = fpl->cnp;
	dvp = fpl->dvp;

	/*
	 * Take a hold on dvp before leaving the smr section; it is dropped
	 * with vdrop() on both exit paths below.
	 */
	if (!vhold_smr(dvp))
		return (cache_fpl_aborted(fpl));

	/* The neglist is derived from oncp while still in the smr section. */
	neglist = NCP2NEGLIST(oncp);
	cache_fpl_smr_exit(fpl);

	/* Hot list lock first, then the lock of the entry's own list. */
	mtx_lock(&ncneg_hot.nl_lock);
	mtx_lock(&neglist->nl_lock);
	/*
	 * For hash iteration.
	 */
	cache_fpl_smr_enter(fpl);

	/*
	 * Avoid all surprises by only succeeding if we got the same entry and
	 * bailing completely otherwise.
	 *
	 * In particular at this point there can be a new ncp which matches the
	 * search but hashes to a different neglist.
	 */
	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp == oncp)
			break;
	}

	/*
	 * No match to begin with.
	 */
	if (__predict_false(ncp == NULL)) {
		goto out_abort;
	}

	/*
	 * The newly found entry may be something different...
	 */
	if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
	    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
		goto out_abort;
	}

	/*
	 * ... and not even negative.
	 */
	nc_flag = atomic_load_char(&ncp->nc_flag);
	if ((nc_flag & NCF_NEGATIVE) == 0) {
		goto out_abort;
	}

	if (__predict_false(!cache_ncp_canuse(ncp))) {
		goto out_abort;
	}

	/*
	 * Unless the entry is already marked hot, move it from its neglist
	 * to the tail of the hot list and account for it.
	 */
	negstate = NCP2NEGSTATE(ncp);
	if ((negstate->neg_flag & NEG_HOT) == 0) {
		numhotneg++;
		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
		TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
		negstate->neg_flag |= NEG_HOT;
	}

	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
	counter_u64_add(numneghits, 1);
	/* Leave the smr section before marking the lookup as handled. */
	cache_fpl_smr_exit(fpl);
	mtx_unlock(&neglist->nl_lock);
	mtx_unlock(&ncneg_hot.nl_lock);
	vdrop(dvp);
	return (cache_fpl_handled(fpl, ENOENT));
out_abort:
	cache_fpl_smr_exit(fpl);
	mtx_unlock(&neglist->nl_lock);
	mtx_unlock(&ncneg_hot.nl_lock);
	vdrop(dvp);
	return (cache_fpl_aborted(fpl));
}
3567 
/*
 * The target vnode is not supported, prepare for the slow path to take over.
 *
 * Called while still inside the smr section, which is left here on every
 * path.  On success (return value 0) the nameidata is rolled back to the
 * last checkpoint, ni_startdir is set to the referenced dvp and the
 * internal cn_flags are filled in.  Otherwise the lookup is marked as
 * aborted and any reference obtained here is released.
 */
static int __noinline
cache_fplookup_partial_setup(struct cache_fpl *fpl)
{
	struct nameidata *ndp;
	struct componentname *cnp;
	enum vgetstate dvs;
	struct vnode *dvp;
	struct pwd *pwd;
	seqc_t dvp_seqc;

	ndp = fpl->ndp;
	cnp = fpl->cnp;
	dvp = fpl->dvp;
	dvp_seqc = fpl->dvp_seqc;

	/* Secure dvp before leaving the smr section. */
	dvs = vget_prep_smr(dvp);
	if (__predict_false(dvs == VGET_NONE)) {
		cache_fpl_smr_exit(fpl);
		return (cache_fpl_aborted(fpl));
	}

	cache_fpl_smr_exit(fpl);

	/* Only trust the now referenced dvp if its counter did not move. */
	vget_finish_ref(dvp, dvs);
	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
		vrele(dvp);
		return (cache_fpl_aborted(fpl));
	}

	/*
	 * The pwd of the thread must still be the one recorded in the fpl.
	 * Note the reference obtained here is only dropped on mismatch.
	 */
	pwd = pwd_hold(curthread);
	if (fpl->pwd != pwd) {
		vrele(dvp);
		pwd_drop(pwd);
		return (cache_fpl_aborted(fpl));
	}

	/* Rewind path parsing to the start of the current component. */
	cache_fpl_restore(fpl, &fpl->snd);

	ndp->ni_startdir = dvp;
	cnp->cn_flags |= MAKEENTRY;
	if (cache_fpl_islastcn(ndp))
		cnp->cn_flags |= ISLASTCN;
	if (cache_fpl_isdotdot(cnp))
		cnp->cn_flags |= ISDOTDOT;

	return (0);
}
3618 
3619 static int
3620 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs)
3621 {
3622 	struct componentname *cnp;
3623 	struct vnode *tvp;
3624 	seqc_t tvp_seqc;
3625 	int error, lkflags;
3626 
3627 	cnp = fpl->cnp;
3628 	tvp = fpl->tvp;
3629 	tvp_seqc = fpl->tvp_seqc;
3630 
3631 	if ((cnp->cn_flags & LOCKLEAF) != 0) {
3632 		lkflags = LK_SHARED;
3633 		if ((cnp->cn_flags & LOCKSHARED) == 0)
3634 			lkflags = LK_EXCLUSIVE;
3635 		error = vget_finish(tvp, lkflags, tvs);
3636 		if (__predict_false(error != 0)) {
3637 			return (cache_fpl_aborted(fpl));
3638 		}
3639 	} else {
3640 		vget_finish_ref(tvp, tvs);
3641 	}
3642 
3643 	if (!vn_seqc_consistent(tvp, tvp_seqc)) {
3644 		if ((cnp->cn_flags & LOCKLEAF) != 0)
3645 			vput(tvp);
3646 		else
3647 			vrele(tvp);
3648 		return (cache_fpl_aborted(fpl));
3649 	}
3650 
3651 	return (cache_fpl_handled(fpl, 0));
3652 }
3653 
3654 /*
3655  * They want to possibly modify the state of the namecache.
3656  *
3657  * Don't try to match the API contract, just leave.
3658  * TODO: this leaves scalability on the table
3659  */
3660 static int
3661 cache_fplookup_final_modifying(struct cache_fpl *fpl)
3662 {
3663 	struct componentname *cnp;
3664 
3665 	cnp = fpl->cnp;
3666 	MPASS(cnp->cn_nameiop != LOOKUP);
3667 	return (cache_fpl_partial(fpl));
3668 }
3669 
3670 static int __noinline
3671 cache_fplookup_final_withparent(struct cache_fpl *fpl)
3672 {
3673 	struct componentname *cnp;
3674 	enum vgetstate dvs, tvs;
3675 	struct vnode *dvp, *tvp;
3676 	seqc_t dvp_seqc, tvp_seqc;
3677 	int error;
3678 
3679 	cnp = fpl->cnp;
3680 	dvp = fpl->dvp;
3681 	dvp_seqc = fpl->dvp_seqc;
3682 	tvp = fpl->tvp;
3683 	tvp_seqc = fpl->tvp_seqc;
3684 
3685 	MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0);
3686 
3687 	/*
3688 	 * This is less efficient than it can be for simplicity.
3689 	 */
3690 	dvs = vget_prep_smr(dvp);
3691 	if (__predict_false(dvs == VGET_NONE)) {
3692 		return (cache_fpl_aborted(fpl));
3693 	}
3694 	tvs = vget_prep_smr(tvp);
3695 	if (__predict_false(tvs == VGET_NONE)) {
3696 		cache_fpl_smr_exit(fpl);
3697 		vget_abort(dvp, dvs);
3698 		return (cache_fpl_aborted(fpl));
3699 	}
3700 
3701 	cache_fpl_smr_exit(fpl);
3702 
3703 	if ((cnp->cn_flags & LOCKPARENT) != 0) {
3704 		error = vget_finish(dvp, LK_EXCLUSIVE, dvs);
3705 		if (__predict_false(error != 0)) {
3706 			vget_abort(tvp, tvs);
3707 			return (cache_fpl_aborted(fpl));
3708 		}
3709 	} else {
3710 		vget_finish_ref(dvp, dvs);
3711 	}
3712 
3713 	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3714 		vget_abort(tvp, tvs);
3715 		if ((cnp->cn_flags & LOCKPARENT) != 0)
3716 			vput(dvp);
3717 		else
3718 			vrele(dvp);
3719 		return (cache_fpl_aborted(fpl));
3720 	}
3721 
3722 	error = cache_fplookup_final_child(fpl, tvs);
3723 	if (__predict_false(error != 0)) {
3724 		MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED);
3725 		if ((cnp->cn_flags & LOCKPARENT) != 0)
3726 			vput(dvp);
3727 		else
3728 			vrele(dvp);
3729 		return (error);
3730 	}
3731 
3732 	MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED);
3733 	return (0);
3734 }
3735 
3736 static int
3737 cache_fplookup_final(struct cache_fpl *fpl)
3738 {
3739 	struct componentname *cnp;
3740 	enum vgetstate tvs;
3741 	struct vnode *dvp, *tvp;
3742 	seqc_t dvp_seqc, tvp_seqc;
3743 
3744 	cnp = fpl->cnp;
3745 	dvp = fpl->dvp;
3746 	dvp_seqc = fpl->dvp_seqc;
3747 	tvp = fpl->tvp;
3748 	tvp_seqc = fpl->tvp_seqc;
3749 
3750 	VNPASS(cache_fplookup_vnode_supported(dvp), dvp);
3751 
3752 	if (cnp->cn_nameiop != LOOKUP) {
3753 		return (cache_fplookup_final_modifying(fpl));
3754 	}
3755 
3756 	if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0)
3757 		return (cache_fplookup_final_withparent(fpl));
3758 
3759 	tvs = vget_prep_smr(tvp);
3760 	if (__predict_false(tvs == VGET_NONE)) {
3761 		return (cache_fpl_partial(fpl));
3762 	}
3763 
3764 	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3765 		cache_fpl_smr_exit(fpl);
3766 		vget_abort(tvp, tvs);
3767 		return (cache_fpl_aborted(fpl));
3768 	}
3769 
3770 	cache_fpl_smr_exit(fpl);
3771 	return (cache_fplookup_final_child(fpl, tvs));
3772 }
3773 
3774 static int __noinline
3775 cache_fplookup_dot(struct cache_fpl *fpl)
3776 {
3777 	struct vnode *dvp;
3778 
3779 	dvp = fpl->dvp;
3780 
3781 	fpl->tvp = dvp;
3782 	fpl->tvp_seqc = vn_seqc_read_any(dvp);
3783 	if (seqc_in_modify(fpl->tvp_seqc)) {
3784 		return (cache_fpl_aborted(fpl));
3785 	}
3786 
3787 	counter_u64_add(dothits, 1);
3788 	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", dvp);
3789 
3790 	return (0);
3791 }
3792 
/*
 * Resolve a ".." component.
 *
 * If dvp is the root directory of the lookup, its top directory, rootvnode
 * or the root of a prison on the credential's prison chain, ".." resolves
 * to dvp itself.  Going up from the root vnode of a mount is not handled
 * and aborts.  Otherwise the target is derived from dvp->v_cache_dd.
 *
 * Returns 0 with tvp and tvp_seqc set up; otherwise the lookup is marked
 * as aborted or partial.
 */
static int __noinline
cache_fplookup_dotdot(struct cache_fpl *fpl)
{
	struct nameidata *ndp;
	struct componentname *cnp;
	struct namecache *ncp;
	struct vnode *dvp;
	struct prison *pr;
	u_char nc_flag;

	ndp = fpl->ndp;
	cnp = fpl->cnp;
	dvp = fpl->dvp;

	/*
	 * XXX this is racy the same way regular lookup is
	 */
	/* pr ends up non-NULL iff dvp is the root of a prison on the chain. */
	for (pr = cnp->cn_cred->cr_prison; pr != NULL;
	    pr = pr->pr_parent)
		if (dvp == pr->pr_root)
			break;

	if (dvp == ndp->ni_rootdir ||
	    dvp == ndp->ni_topdir ||
	    dvp == rootvnode ||
	    pr != NULL) {
		/* Can't go any higher, ".." is the directory itself. */
		fpl->tvp = dvp;
		fpl->tvp_seqc = vn_seqc_read_any(dvp);
		if (seqc_in_modify(fpl->tvp_seqc)) {
			return (cache_fpl_aborted(fpl));
		}
		return (0);
	}

	if ((dvp->v_vflag & VV_ROOT) != 0) {
		/*
		 * TODO
		 * The opposite of climb mount is needed here.
		 */
		return (cache_fpl_aborted(fpl));
	}

	ncp = atomic_load_ptr(&dvp->v_cache_dd);
	if (ncp == NULL) {
		return (cache_fpl_aborted(fpl));
	}

	/*
	 * An entry flagged NCF_ISDOTDOT carries the parent in nc_vp (and is
	 * of no use if negative), any other entry carries it in nc_dvp.
	 */
	nc_flag = atomic_load_char(&ncp->nc_flag);
	if ((nc_flag & NCF_ISDOTDOT) != 0) {
		if ((nc_flag & NCF_NEGATIVE) != 0)
			return (cache_fpl_aborted(fpl));
		fpl->tvp = ncp->nc_vp;
	} else {
		fpl->tvp = ncp->nc_dvp;
	}

	if (__predict_false(!cache_ncp_canuse(ncp))) {
		return (cache_fpl_aborted(fpl));
	}

	/*
	 * NOTE(review): this is the only failure in this function reported
	 * as partial rather than aborted -- confirm this is intended.
	 */
	fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp);
	if (seqc_in_modify(fpl->tvp_seqc)) {
		return (cache_fpl_partial(fpl));
	}

	counter_u64_add(dotdothits, 1);
	return (0);
}
3861 
/*
 * Look the current component up in the namecache hash.
 *
 * "." is delegated to cache_fplookup_dot().  On a usable positive hit 0 is
 * returned with tvp and tvp_seqc set up.  A hit on a hot negative entry
 * ends the lookup with ENOENT (marked as handled), while one which is not
 * hot yet is passed to cache_fplookup_negative_promote().  In all remaining
 * cases the lookup is marked as partial.
 */
static int
cache_fplookup_next(struct cache_fpl *fpl)
{
	struct componentname *cnp;
	struct namecache *ncp;
	struct negstate *negstate;
	struct vnode *dvp, *tvp;
	u_char nc_flag;
	uint32_t hash;
	bool neg_hot;

	cnp = fpl->cnp;
	dvp = fpl->dvp;

	if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) {
		return (cache_fplookup_dot(fpl));
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);

	/* An entry matches if directory, name length and name all agree. */
	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	/*
	 * If there is no entry we have to punt to the slow path to perform
	 * actual lookup. Should there be nothing with this name a negative
	 * entry will be created.
	 */
	if (__predict_false(ncp == NULL)) {
		return (cache_fpl_partial(fpl));
	}

	/*
	 * Everything needed from the entry is loaded before
	 * cache_ncp_canuse() is consulted.
	 */
	tvp = atomic_load_ptr(&ncp->nc_vp);
	nc_flag = atomic_load_char(&ncp->nc_flag);
	if ((nc_flag & NCF_NEGATIVE) != 0) {
		/*
		 * If they want to create an entry we need to replace this one.
		 */
		if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) {
			return (cache_fpl_partial(fpl));
		}
		negstate = NCP2NEGSTATE(ncp);
		neg_hot = ((negstate->neg_flag & NEG_HOT) != 0);
		if (__predict_false(!cache_ncp_canuse(ncp))) {
			return (cache_fpl_partial(fpl));
		}
		/* Whiteout entries are left to the slow path. */
		if (__predict_false((nc_flag & NCF_WHITE) != 0)) {
			return (cache_fpl_partial(fpl));
		}
		if (!neg_hot) {
			return (cache_fplookup_negative_promote(fpl, ncp, hash));
		}
		SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
		    ncp->nc_name);
		counter_u64_add(numneghits, 1);
		cache_fpl_smr_exit(fpl);
		return (cache_fpl_handled(fpl, ENOENT));
	}

	if (__predict_false(!cache_ncp_canuse(ncp))) {
		return (cache_fpl_partial(fpl));
	}

	fpl->tvp = tvp;
	fpl->tvp_seqc = vn_seqc_read_any(tvp);
	if (seqc_in_modify(fpl->tvp_seqc)) {
		return (cache_fpl_partial(fpl));
	}

	/* Symlinks are not handled by the fast path. */
	if (!cache_fplookup_vnode_supported(tvp)) {
		return (cache_fpl_partial(fpl));
	}

	counter_u64_add(numposhits, 1);
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
	return (0);
}
3942 
3943 static bool
3944 cache_fplookup_mp_supported(struct mount *mp)
3945 {
3946 
3947 	if (mp == NULL)
3948 		return (false);
3949 	if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
3950 		return (false);
3951 	return (true);
3952 }
3953 
/*
 * Walk up the mount stack (if any).
 *
 * Correctness is provided in the following ways:
 * - all vnodes are protected from freeing with SMR
 * - struct mount objects are type stable making them always safe to access
 * - stability of the particular mount is provided by busying it
 * - relationship between the vnode which is mounted on and the mount is
 *   verified with the vnode sequence counter after busying
 * - association between root vnode of the mount and the mount is protected
 *   by busy
 *
 * From that point on we can read the sequence counter of the root vnode
 * and get the next mount on the stack (if any) using the same protection.
 *
 * By the end of successful walk we are guaranteed the reached state was
 * indeed present at least at some point which matches the regular lookup.
 *
 * Returns 0 with tvp and tvp_seqc replaced by the topmost root vnode and
 * its counter (or left alone if nothing is mounted on tvp).  Every failure
 * marks the lookup as partial.
 */
static int __noinline
cache_fplookup_climb_mount(struct cache_fpl *fpl)
{
	struct mount *mp, *prev_mp;
	struct vnode *vp;
	seqc_t vp_seqc;

	vp = fpl->tvp;
	vp_seqc = fpl->tvp_seqc;

	VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp);
	mp = atomic_load_ptr(&vp->v_mountedhere);
	if (mp == NULL)
		return (0);

	prev_mp = NULL;
	for (;;) {
		/*
		 * The critical section of the previous mount is only left
		 * after an attempt was made to enter the one of the next.
		 */
		if (!vfs_op_thread_enter_crit(mp)) {
			if (prev_mp != NULL)
				vfs_op_thread_exit_crit(prev_mp);
			return (cache_fpl_partial(fpl));
		}
		if (prev_mp != NULL)
			vfs_op_thread_exit_crit(prev_mp);
		/* Verify vp was not modified since its counter was read. */
		if (!vn_seqc_consistent(vp, vp_seqc)) {
			vfs_op_thread_exit_crit(mp);
			return (cache_fpl_partial(fpl));
		}
		if (!cache_fplookup_mp_supported(mp)) {
			vfs_op_thread_exit_crit(mp);
			return (cache_fpl_partial(fpl));
		}
		/* Continue with the root vnode of the mount. */
		vp = atomic_load_ptr(&mp->mnt_rootvnode);
		if (vp == NULL || VN_IS_DOOMED(vp)) {
			vfs_op_thread_exit_crit(mp);
			return (cache_fpl_partial(fpl));
		}
		vp_seqc = vn_seqc_read_any(vp);
		if (seqc_in_modify(vp_seqc)) {
			vfs_op_thread_exit_crit(mp);
			return (cache_fpl_partial(fpl));
		}
		prev_mp = mp;
		mp = atomic_load_ptr(&vp->v_mountedhere);
		if (mp == NULL)
			break;
	}

	/* prev_mp is the last mount entered and is still held here. */
	vfs_op_thread_exit_crit(prev_mp);
	fpl->tvp = vp;
	fpl->tvp_seqc = vp_seqc;
	return (0);
}
4025 
4026 static bool
4027 cache_fplookup_need_climb_mount(struct cache_fpl *fpl)
4028 {
4029 	struct mount *mp;
4030 	struct vnode *vp;
4031 
4032 	vp = fpl->tvp;
4033 
4034 	/*
4035 	 * Hack: while this is a union, the pointer tends to be NULL so save on
4036 	 * a branch.
4037 	 */
4038 	mp = atomic_load_ptr(&vp->v_mountedhere);
4039 	if (mp == NULL)
4040 		return (false);
4041 	if (vp->v_type == VDIR)
4042 		return (true);
4043 	return (false);
4044 }
4045 
/*
 * Parse the path.
 *
 * The code is mostly copy-pasted from regular lookup, see lookup().
 * The structure is maintained along with comments for easier maintenance.
 * Deduplicating the code will become feasible after fast path lookup
 * becomes more feature-complete.
 *
 * On return value 0 cn_namelen holds the length of the component starting
 * at cn_nameptr, ni_pathlen was reduced accordingly and ni_next points
 * right past the component.  A component longer than NAME_MAX ends the
 * lookup with ENAMETOOLONG (marked as handled); trailing slashes and an
 * empty component mark the lookup as partial.
 */
static int
cache_fplookup_parse(struct cache_fpl *fpl)
{
	struct nameidata *ndp;
	struct componentname *cnp;
	char *cp;

	ndp = fpl->ndp;
	cnp = fpl->cnp;

	/*
	 * Search a new directory.
	 *
	 * The last component of the filename is left accessible via
	 * cnp->cn_nameptr for callers that need the name. Callers needing
	 * the name set the SAVENAME flag. When done, they assume
	 * responsibility for freeing the pathname buffer.
	 */
	for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
		continue;
	cnp->cn_namelen = cp - cnp->cn_nameptr;
	if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
		/* The lookup may only be marked handled outside of smr. */
		cache_fpl_smr_exit(fpl);
		return (cache_fpl_handled(fpl, ENAMETOOLONG));
	}
	ndp->ni_pathlen -= cnp->cn_namelen;
	KASSERT(ndp->ni_pathlen <= PATH_MAX,
	    ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
	ndp->ni_next = cp;

	/*
	 * Replace multiple slashes by a single slash and trailing slashes
	 * by a null.  This must be done before VOP_LOOKUP() because some
	 * fs's don't know about trailing slashes.  Remember if there were
	 * trailing slashes to handle symlinks, existing non-directories
	 * and non-existing files that won't be directories specially later.
	 */
	while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
		cp++;
		ndp->ni_pathlen--;
		if (*cp == '\0') {
			/*
			 * TODO
			 * Regular lookup performs the following:
			 * *ndp->ni_next = '\0';
			 * cnp->cn_flags |= TRAILINGSLASH;
			 *
			 * Which is problematic since it modifies data read
			 * from userspace. Then if fast path lookup was to
			 * abort we would have to either restore it or convey
			 * the flag. Since this is a corner case just ignore
			 * it for simplicity.
			 */
			return (cache_fpl_partial(fpl));
		}
	}
	ndp->ni_next = cp;

	/*
	 * Check for degenerate name (e.g. / or "")
	 * which is a way of talking about a directory,
	 * e.g. like "/." or ".".
	 *
	 * TODO
	 * Another corner case handled by the regular lookup
	 */
	if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
		return (cache_fpl_partial(fpl));
	}
	return (0);
}
4125 
4126 static void
4127 cache_fplookup_parse_advance(struct cache_fpl *fpl)
4128 {
4129 	struct nameidata *ndp;
4130 	struct componentname *cnp;
4131 
4132 	ndp = fpl->ndp;
4133 	cnp = fpl->cnp;
4134 
4135 	cnp->cn_nameptr = ndp->ni_next;
4136 	while (*cnp->cn_nameptr == '/') {
4137 		cnp->cn_nameptr++;
4138 		ndp->ni_pathlen--;
4139 	}
4140 }
4141 
4142 static int __noinline
4143 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
4144 {
4145 
4146 	switch (error) {
4147 	case EAGAIN:
4148 		/*
4149 		 * Can happen when racing against vgone.
4150 		 * */
4151 	case EOPNOTSUPP:
4152 		cache_fpl_partial(fpl);
4153 		break;
4154 	default:
4155 		/*
4156 		 * See the API contract for VOP_FPLOOKUP_VEXEC.
4157 		 */
4158 		if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
4159 			error = cache_fpl_aborted(fpl);
4160 		} else {
4161 			cache_fpl_smr_exit(fpl);
4162 			cache_fpl_handled(fpl, error);
4163 		}
4164 		break;
4165 	}
4166 	return (error);
4167 }
4168 
/*
 * Main loop of the fast path lookup, starting at directory 'dvp'.
 *
 * Each iteration parses one path component, checks that the directory may
 * be searched with VOP_FPLOOKUP_VEXEC, finds the target vnode (climbing
 * mount points if needed) and then either finishes the lookup or makes the
 * target the new directory.  Every routine which breaks out of the loop has
 * set fpl->status, which decides what happens next:
 * - partial: cache_fplookup_partial_setup() prepares the hand-off and its
 *   result is returned
 * - aborted: the smr section is left if still entered and CACHE_FPL_FAILED
 *   is returned
 * - handled: the result is final; ni_dvp and ni_vp are filled in (NULL on
 *   error) and the pathname buffer is freed unless the lookup succeeded
 *   and SAVENAME was requested
 */
static int
cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
{
	struct nameidata *ndp;
	struct componentname *cnp;
	struct mount *mp;
	int error;

	error = CACHE_FPL_FAILED;
	ndp = fpl->ndp;
	cnp = fpl->cnp;

	/* Initial checkpoint, refreshed after every component crossed. */
	cache_fpl_checkpoint(fpl, &fpl->snd);

	fpl->dvp = dvp;
	fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
	if (seqc_in_modify(fpl->dvp_seqc)) {
		cache_fpl_aborted(fpl);
		goto out;
	}
	mp = atomic_load_ptr(&fpl->dvp->v_mount);
	if (!cache_fplookup_mp_supported(mp)) {
		cache_fpl_aborted(fpl);
		goto out;
	}

	VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);

	for (;;) {
		error = cache_fplookup_parse(fpl);
		if (__predict_false(error != 0)) {
			break;
		}

		VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);

		error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
		if (__predict_false(error != 0)) {
			error = cache_fplookup_failed_vexec(fpl, error);
			break;
		}

		if (__predict_false(cache_fpl_isdotdot(cnp))) {
			error = cache_fplookup_dotdot(fpl);
			if (__predict_false(error != 0)) {
				break;
			}
		} else {
			error = cache_fplookup_next(fpl);
			if (__predict_false(error != 0)) {
				break;
			}

			VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);

			if (cache_fplookup_need_climb_mount(fpl)) {
				error = cache_fplookup_climb_mount(fpl);
				if (__predict_false(error != 0)) {
					break;
				}
			}
		}

		VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);

		if (cache_fpl_islastcn(ndp)) {
			error = cache_fplookup_final(fpl);
			break;
		}

		/*
		 * Before stepping into tvp make sure dvp was not modified
		 * while the component was being resolved.
		 */
		if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
			error = cache_fpl_aborted(fpl);
			break;
		}

		fpl->dvp = fpl->tvp;
		fpl->dvp_seqc = fpl->tvp_seqc;

		cache_fplookup_parse_advance(fpl);
		cache_fpl_checkpoint(fpl, &fpl->snd);
	}
out:
	switch (fpl->status) {
	case CACHE_FPL_STATUS_UNSET:
		/* Every path leading here has set the status. */
		__assert_unreachable();
		break;
	case CACHE_FPL_STATUS_PARTIAL:
		cache_fpl_smr_assert_entered(fpl);
		return (cache_fplookup_partial_setup(fpl));
	case CACHE_FPL_STATUS_ABORTED:
		/* Aborts can happen both inside and outside of smr. */
		if (fpl->in_smr)
			cache_fpl_smr_exit(fpl);
		return (CACHE_FPL_FAILED);
	case CACHE_FPL_STATUS_HANDLED:
		MPASS(error != CACHE_FPL_FAILED);
		cache_fpl_smr_assert_not_entered(fpl);
		if (__predict_false(error != 0)) {
			ndp->ni_dvp = NULL;
			ndp->ni_vp = NULL;
			cache_fpl_cleanup_cnp(cnp);
			return (error);
		}
		ndp->ni_dvp = fpl->dvp;
		ndp->ni_vp = fpl->tvp;
		/* With SAVENAME the buffer is kept and flagged with HASBUF. */
		if (cnp->cn_flags & SAVENAME)
			cnp->cn_flags |= HASBUF;
		else
			cache_fpl_cleanup_cnp(cnp);
		return (error);
	}
}
4280 
4281 /*
4282  * Fast path lookup protected with SMR and sequence counters.
4283  *
4284  * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
4285  *
4286  * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
4287  * outlined below.
4288  *
4289  * Traditional vnode lookup conceptually looks like this:
4290  *
4291  * vn_lock(current);
4292  * for (;;) {
4293  *	next = find();
4294  *	vn_lock(next);
4295  *	vn_unlock(current);
4296  *	current = next;
4297  *	if (last)
4298  *	    break;
4299  * }
4300  * return (current);
4301  *
4302  * Each jump to the next vnode is safe memory-wise and atomic with respect to
4303  * any modifications thanks to holding respective locks.
4304  *
4305  * The same guarantee can be provided with a combination of safe memory
4306  * reclamation and sequence counters instead. If all operations which affect
4307  * the relationship between the current vnode and the one we are looking for
4308  * also modify the counter, we can verify whether all the conditions held as
4309  * we made the jump. This includes things like permissions, mount points etc.
4310  * Counter modification is provided by enclosing relevant places in
4311  * vn_seqc_write_begin()/end() calls.
4312  *
4313  * Thus this translates to:
4314  *
4315  * vfs_smr_enter();
4316  * dvp_seqc = seqc_read_any(dvp);
4317  * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
4318  *     abort();
4319  * for (;;) {
4320  * 	tvp = find();
4321  * 	tvp_seqc = seqc_read_any(tvp);
4322  * 	if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
4323  * 	    abort();
4324  * 	if (!seqc_consistent(dvp, dvp_seqc)) // someone is altering the vnode
4325  * 	    abort();
4326  * 	dvp = tvp; // we know nothing of importance has changed
4327  * 	dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
4328  * 	if (last)
4329  * 	    break;
4330  * }
4331  * vget(); // secure the vnode
4332  * if (!seqc_consistent(tvp, tvp_seqc)) // final check
4333  *     abort();
4334  * // at this point we know nothing has changed for any parent<->child pair
4335  * // as they were crossed during the lookup, meaning we matched the guarantee
4336  * // of the locked variant
4337  * return (tvp);
4338  *
4339  * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
4340  * - they are called while within vfs_smr protection which they must never exit
4341  * - EAGAIN can be returned to denote checking could not be performed, it is
4342  *   always valid to return it
4343  * - if the sequence counter has not changed the result must be valid
4344  * - if the sequence counter has changed both false positives and false negatives
4345  *   are permitted (since the result will be rejected later)
4346  * - for simple cases of unix permission checks vaccess_vexec_smr can be used
4347  *
4348  * Caveats to watch out for:
4349  * - vnodes are passed unlocked and unreferenced with nothing stopping
4350  *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
4351  *   to use atomic_load_ptr to fetch it.
4352  * - the aforementioned object can also get freed, meaning absent other means it
4353  *   should be protected with vfs_smr
4354  * - either safely checking permissions as they are modified or guaranteeing
4355  *   their stability is left to the routine
4356  */
4357 int
4358 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
4359     struct pwd **pwdp)
4360 {
4361 	struct cache_fpl fpl;
4362 	struct pwd *pwd;
4363 	struct vnode *dvp;
4364 	struct componentname *cnp;
4365 	struct nameidata_saved orig;
4366 	int error;
4367 
4368 	MPASS(ndp->ni_lcf == 0);
4369 
4370 	fpl.status = CACHE_FPL_STATUS_UNSET;
4371 	fpl.ndp = ndp;
4372 	fpl.cnp = &ndp->ni_cnd;
4373 	MPASS(curthread == fpl.cnp->cn_thread);
4374 
4375 	if ((fpl.cnp->cn_flags & SAVESTART) != 0)
4376 		MPASS(fpl.cnp->cn_nameiop != LOOKUP);
4377 
4378 	if (!cache_can_fplookup(&fpl)) {
4379 		SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
4380 		*status = fpl.status;
4381 		return (EOPNOTSUPP);
4382 	}
4383 
4384 	cache_fpl_checkpoint(&fpl, &orig);
4385 
4386 	cache_fpl_smr_enter_initial(&fpl);
4387 	pwd = pwd_get_smr();
4388 	fpl.pwd = pwd;
4389 	ndp->ni_rootdir = pwd->pwd_rdir;
4390 	ndp->ni_topdir = pwd->pwd_jdir;
4391 
4392 	cnp = fpl.cnp;
4393 	cnp->cn_nameptr = cnp->cn_pnbuf;
4394 	if (cnp->cn_pnbuf[0] == '/') {
4395 		cache_fpl_handle_root(ndp, &dvp);
4396 	} else {
4397 		MPASS(ndp->ni_dirfd == AT_FDCWD);
4398 		dvp = pwd->pwd_cdir;
4399 	}
4400 
4401 	SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
4402 
4403 	error = cache_fplookup_impl(dvp, &fpl);
4404 	cache_fpl_smr_assert_not_entered(&fpl);
4405 	SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
4406 
4407 	*status = fpl.status;
4408 	switch (fpl.status) {
4409 	case CACHE_FPL_STATUS_UNSET:
4410 		__assert_unreachable();
4411 		break;
4412 	case CACHE_FPL_STATUS_HANDLED:
4413 		SDT_PROBE3(vfs, namei, lookup, return, error,
4414 		    (error == 0 ? ndp->ni_vp : NULL), true);
4415 		break;
4416 	case CACHE_FPL_STATUS_PARTIAL:
4417 		*pwdp = fpl.pwd;
4418 		/*
4419 		 * Status restored by cache_fplookup_partial_setup.
4420 		 */
4421 		break;
4422 	case CACHE_FPL_STATUS_ABORTED:
4423 		cache_fpl_restore(&fpl, &orig);
4424 		break;
4425 	}
4426 	return (error);
4427 }
4428