xref: /freebsd/sys/kern/vfs_cache.c (revision 96474d2a3fa895fb9636183403fc8ca7ccf60216)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1989, 1993, 1995
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * Poul-Henning Kamp of the FreeBSD Project.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include "opt_ddb.h"
41 #include "opt_ktrace.h"
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/capsicum.h>
46 #include <sys/counter.h>
47 #include <sys/filedesc.h>
48 #include <sys/fnv_hash.h>
49 #include <sys/kernel.h>
50 #include <sys/ktr.h>
51 #include <sys/lock.h>
52 #include <sys/malloc.h>
53 #include <sys/fcntl.h>
54 #include <sys/jail.h>
55 #include <sys/mount.h>
56 #include <sys/namei.h>
57 #include <sys/proc.h>
58 #include <sys/seqc.h>
59 #include <sys/sdt.h>
60 #include <sys/smr.h>
61 #include <sys/smp.h>
62 #include <sys/syscallsubr.h>
63 #include <sys/sysctl.h>
64 #include <sys/sysproto.h>
65 #include <sys/vnode.h>
66 #include <ck_queue.h>
67 #ifdef KTRACE
68 #include <sys/ktrace.h>
69 #endif
70 
71 #include <sys/capsicum.h>
72 
73 #include <security/audit/audit.h>
74 #include <security/mac/mac_framework.h>
75 
76 #ifdef DDB
77 #include <ddb/ddb.h>
78 #endif
79 
80 #include <vm/uma.h>
81 
82 SDT_PROVIDER_DECLARE(vfs);
83 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
84     "struct vnode *");
85 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
86     "char *");
87 SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
88     "const char *");
89 SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
90     "struct namecache *", "int", "int");
91 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
92 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
93     "char *", "struct vnode *");
94 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
95 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
96     "struct vnode *", "char *");
97 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
98     "struct vnode *");
99 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
100     "struct vnode *", "char *");
101 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
102     "char *");
103 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
104     "struct componentname *");
105 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
106     "struct componentname *");
107 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
108 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
109 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
110 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
111     "struct vnode *");
112 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
113     "char *");
114 SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
115     "char *");
116 
117 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
118 SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
119 SDT_PROBE_DECLARE(vfs, namei, lookup, return);
120 
/*
 * The structures below describe the elements in the cache of recent
 * names looked up by namei.
 *
 * struct negstate holds the state kept for negative entries.  It lives in a
 * union with the target vnode pointer of struct namecache, so it must not be
 * larger than a pointer (enforced by the assertion below).
 */
struct negstate {
	u_char neg_flag;	/* NEG_* flags */
};
_Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
    "the state must fit in a union with a pointer without growing it");
130 
/*
 * A single name cache entry keyed by (nc_dvp, nc_name).
 *
 * Entries flagged with NCF_NEGATIVE use the nu_neg member of the union,
 * all other entries use nu_vp.  The name itself is stored inline, directly
 * past the end of the structure.
 */
struct	namecache {
	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list; also links
					   negative entries on a neglist */
	CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
	struct	vnode *nc_dvp;		/* vnode of parent of name */
	union {
		struct	vnode *nu_vp;	/* vnode the name refers to */
		struct	negstate nu_neg;/* negative entry state */
	} n_un;
	u_char	nc_flag;		/* flag bits (NCF_*) */
	u_char	nc_nlen;		/* length of name */
	char	nc_name[0];		/* segment name + nul */
};
144 
145 /*
 * struct namecache_ts embeds struct namecache as its last member (nc_nc),
 * preceded by the timestamp fields.
148  * struct namecache_ts is used in place of struct namecache when time(s) need
149  * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
150  * both a non-dotdot directory name plus dotdot for the directory's
151  * parent.
152  *
153  * See below for alignment requirement.
154  */
struct	namecache_ts {
	struct	timespec nc_time;	/* timespec provided by fs */
	struct	timespec nc_dotdottime;	/* dotdot timespec provided by fs */
	int	nc_ticks;		/* ticks value when entry was added */
	/*
	 * The embedded entry.  It ends with the inline name, and code holding
	 * only a struct namecache pointer recovers the enclosing object with
	 * __containerof(), so it has to remain the last member.
	 */
	struct namecache nc_nc;
};
161 
162 /*
 * At least mips n32 performs 64-bit accesses to timespec as found
 * in namecache_ts and requires them to be aligned. Since other platforms
 * may have the same requirement, accept a little waste and enforce the
 * alignment for everyone. Note this is a nop for 64-bit platforms.
167  */
#define CACHE_ZONE_ALIGNMENT	UMA_ALIGNOF(time_t)
/*
 * Longest name (not counting the terminating nul) which is still served
 * from the "small" zones; anything longer comes from the "large" zones.
 */
#define	CACHE_PATH_CUTOFF	39

/* Object sizes of the four zones: header + longest name + nul. */
#define CACHE_ZONE_SMALL_SIZE		(sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1)
#define CACHE_ZONE_SMALL_TS_SIZE	(sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1)
#define CACHE_ZONE_LARGE_SIZE		(sizeof(struct namecache) + NAME_MAX + 1)
#define CACHE_ZONE_LARGE_TS_SIZE	(sizeof(struct namecache_ts) + NAME_MAX + 1)

/*
 * Every zone object size must be a multiple of the alignment.
 * NOTE(review): the "+ 1" presumes CACHE_ZONE_ALIGNMENT is an alignment mask
 * (alignment - 1) -- confirm against UMA_ALIGNOF().
 */
_Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");

/* Shorthands for the members of the n_un union in struct namecache. */
#define	nc_vp		n_un.nu_vp
#define	nc_neg		n_un.nu_neg
183 
/*
 * Flags in namecache.nc_flag
 */
#define NCF_WHITE	0x01	/* presumably a whiteout entry -- not used in
				   the code visible here, confirm */
#define NCF_ISDOTDOT	0x02	/* entry is a ".." entry */
#define	NCF_TS		0x04	/* entry is embedded in struct namecache_ts */
#define	NCF_DTS		0x08	/* presumably nc_dotdottime is valid --
				   confirm against users */
#define	NCF_DVDROP	0x10	/* vdrop() nc_dvp when the entry is freed */
#define	NCF_NEGATIVE	0x20	/* negative entry, nc_neg is in use */
#define	NCF_INVALID	0x40	/* entry is being torn down */
#define	NCF_WIP		0x80	/* entry not usable yet by lockless readers */

/*
 * Flags in negstate.neg_flag
 */
#define NEG_HOT		0x01	/* entry is on the hot negative list */
200 
201 /*
202  * Mark an entry as invalid.
203  *
204  * This is called before it starts getting deconstructed.
205  */
206 static void
207 cache_ncp_invalidate(struct namecache *ncp)
208 {
209 
210 	KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
211 	    ("%s: entry %p already invalid", __func__, ncp));
212 	atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
213 	atomic_thread_fence_rel();
214 }
215 
216 /*
217  * Check whether the entry can be safely used.
218  *
219  * All places which elide locks are supposed to call this after they are
220  * done with reading from an entry.
221  */
222 static bool
223 cache_ncp_canuse(struct namecache *ncp)
224 {
225 
226 	atomic_thread_fence_acq();
227 	return ((atomic_load_char(&ncp->nc_flag) & (NCF_INVALID | NCF_WIP)) == 0);
228 }
229 
230 /*
231  * Name caching works as follows:
232  *
233  * Names found by directory scans are retained in a cache
234  * for future reference.  It is managed LRU, so frequently
235  * used names will hang around.  Cache is indexed by hash value
236  * obtained from (dvp, name) where dvp refers to the directory
237  * containing name.
238  *
239  * If it is a "negative" entry, (i.e. for a name that is known NOT to
240  * exist) the vnode pointer will be NULL.
241  *
242  * Upon reaching the last segment of a path, if the reference
243  * is for DELETE, or NOCACHE is set (rewrite), and the
244  * name is located in the cache, it will be dropped.
245  *
246  * These locks are used (in the order in which they can be taken):
247  * NAME		TYPE	ROLE
248  * vnodelock	mtx	vnode lists and v_cache_dd field protection
249  * bucketlock	mtx	for access to given set of hash buckets
250  * neglist	mtx	negative entry LRU management
251  *
252  * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
253  * shrinking the LRU list.
254  *
255  * It is legal to take multiple vnodelock and bucketlock locks. The locking
256  * order is lower address first. Both are recursive.
257  *
258  * "." lookups are lockless.
259  *
260  * ".." and vnode -> name lookups require vnodelock.
261  *
262  * name -> vnode lookup requires the relevant bucketlock to be held for reading.
263  *
264  * Insertions and removals of entries require involved vnodes and bucketlocks
265  * to be locked to provide safe operation against other threads modifying the
266  * cache.
267  *
268  * Some lookups result in removal of the found entry (e.g. getting rid of a
269  * negative entry with the intent to create a positive one), which poses a
270  * problem when multiple threads reach the state. Similarly, two different
271  * threads can purge two different vnodes and try to remove the same name.
272  *
273  * If the already held vnode lock is lower than the second required lock, we
274  * can just take the other lock. However, in the opposite case, this could
275  * deadlock. As such, this is resolved by trylocking and if that fails unlocking
276  * the first node, locking everything in order and revalidating the state.
277  */
278 
279 VFS_SMR_DECLARE;
280 
281 /*
282  * Structures associated with name caching.
283  */
/* Map a hash value to its chain head; nchash is used as a mask here. */
#define NCHHASH(hash) \
	(&nchashtbl[(hash) & nchash])
static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
static u_long __read_mostly	nchash;			/* hash mask, i.e. highest valid table index */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
    "Size of namecache hash table");
static u_long __read_mostly	ncnegfactor = 5; /* ratio of negative entries */
SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of negative namecache entries");
static u_long __exclusive_cache_line	numneg;	/* number of negative entries allocated */
static u_long __exclusive_cache_line	numcache;/* number of cache entries allocated */
u_int ncsizefactor = 2;		/* not static: visible outside this file */
SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
    "Size factor for namecache");
static u_int __read_mostly	ncpurgeminvnodes;
SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0,
    "Number of vnodes below which purgevfs ignores the request");
static u_int __read_mostly	ncsize; /* the size as computed on creation or resizing */

struct nchstats	nchstats;		/* cache effectiveness statistics */

/* NOTE(review): exported with an empty description string. */
static bool __read_frequently cache_fast_revlookup = true;
SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_revlookup, CTLFLAG_RW,
    &cache_fast_revlookup, 0, "");
308 
/* Held by the single thread allowed to shrink the negative lists. */
static struct mtx __exclusive_cache_line	ncneg_shrink_lock;

/* A list of negative entries (linked through nc_dst) and its lock. */
struct neglist {
	struct mtx		nl_lock;
	TAILQ_HEAD(, namecache) nl_list;
} __aligned(CACHE_LINE_SIZE);

static struct neglist __read_mostly	*neglists;	/* cold lists, numneglists of them */
static struct neglist ncneg_hot;			/* the hot list */
static u_long numhotneg;				/* number of entries on the hot list */

/* ncneghash is the mask used to pick a cold list, see NCP2NEGLIST(). */
#define ncneghash	3
#define	numneglists	(ncneghash + 1)
322 static inline struct neglist *
323 NCP2NEGLIST(struct namecache *ncp)
324 {
325 
326 	return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
327 }
328 
329 static inline struct negstate *
330 NCP2NEGSTATE(struct namecache *ncp)
331 {
332 
333 	MPASS(ncp->nc_flag & NCF_NEGATIVE);
334 	return (&ncp->nc_neg);
335 }
336 
/*
 * Bucket locks.  ncbuckethash is a mask: a hash value is reduced with it to
 * select one of the numbucketlocks locks.
 */
#define	numbucketlocks (ncbuckethash + 1)
static u_int __read_mostly  ncbuckethash;
static struct mtx_padalign __read_mostly  *bucketlocks;
#define	HASH2BUCKETLOCK(hash) \
	((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)]))

/*
 * Vnode locks.  ncvnodehash is a mask applied to the vnode address to select
 * one of the numvnodelocks locks, see VP2VNODELOCK().
 */
#define	numvnodelocks (ncvnodehash + 1)
static u_int __read_mostly  ncvnodehash;
static struct mtx __read_mostly *vnodelocks;
346 static inline struct mtx *
347 VP2VNODELOCK(struct vnode *vp)
348 {
349 
350 	return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
351 }
352 
/*
 * UMA zones for the VFS cache.
 *
 * The small cache is used for entries with short names, which are the
 * most common.  The large cache is used for entries which are too big to
 * fit in the small cache.
 *
 * The _ts variants hold struct namecache_ts objects and are used for entries
 * which carry timestamps.
 */
static uma_zone_t __read_mostly cache_zone_small;
static uma_zone_t __read_mostly cache_zone_small_ts;
static uma_zone_t __read_mostly cache_zone_large;
static uma_zone_t __read_mostly cache_zone_large_ts;
364 
365 static struct namecache *
366 cache_alloc(int len, int ts)
367 {
368 	struct namecache_ts *ncp_ts;
369 	struct namecache *ncp;
370 
371 	if (__predict_false(ts)) {
372 		if (len <= CACHE_PATH_CUTOFF)
373 			ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
374 		else
375 			ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
376 		ncp = &ncp_ts->nc_nc;
377 	} else {
378 		if (len <= CACHE_PATH_CUTOFF)
379 			ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
380 		else
381 			ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
382 	}
383 	return (ncp);
384 }
385 
386 static void
387 cache_free(struct namecache *ncp)
388 {
389 	struct namecache_ts *ncp_ts;
390 
391 	MPASS(ncp != NULL);
392 	if ((ncp->nc_flag & NCF_DVDROP) != 0)
393 		vdrop(ncp->nc_dvp);
394 	if (__predict_false(ncp->nc_flag & NCF_TS)) {
395 		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
396 		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
397 			uma_zfree_smr(cache_zone_small_ts, ncp_ts);
398 		else
399 			uma_zfree_smr(cache_zone_large_ts, ncp_ts);
400 	} else {
401 		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
402 			uma_zfree_smr(cache_zone_small, ncp);
403 		else
404 			uma_zfree_smr(cache_zone_large, ncp);
405 	}
406 }
407 
408 static void
409 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
410 {
411 	struct namecache_ts *ncp_ts;
412 
413 	KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
414 	    (tsp == NULL && ticksp == NULL),
415 	    ("No NCF_TS"));
416 
417 	if (tsp == NULL)
418 		return;
419 
420 	ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
421 	*tsp = ncp_ts->nc_time;
422 	*ticksp = ncp_ts->nc_ticks;
423 }
424 
#ifdef DEBUG_CACHE
static int __read_mostly	doingcache = 1;	/* 1 => enable the cache */
SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
    "VFS namecache enabled");
#endif

/* Export size information to userland */
SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
    sizeof(struct namecache), "sizeof(struct namecache)");

/*
 * The new name cache statistics
 *
 * STATNODE_ULONG exports an already defined variable read-only under
 * vfs.cache, STATNODE_COUNTER additionally defines the counter it exports.
 * Both macros end with a semicolon of their own.
 */
static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache statistics");
#define STATNODE_ULONG(name, descr)					\
	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr);
#define STATNODE_COUNTER(name, descr)					\
	static COUNTER_U64_DEFINE_EARLY(name);				\
	SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, \
	    descr);
STATNODE_ULONG(numneg, "Number of negative cache entries");
STATNODE_ULONG(numcache, "Number of cache entries");
STATNODE_COUNTER(numcachehv, "Number of namecache entries with vnodes held");
STATNODE_COUNTER(numdrops, "Number of dropped entries due to reaching the limit");
STATNODE_COUNTER(dothits, "Number of '.' hits");
STATNODE_COUNTER(dotdothits, "Number of '..' hits");
STATNODE_COUNTER(nummiss, "Number of cache misses");
STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache");
STATNODE_COUNTER(numposzaps,
    "Number of cache hits (positive) we do not want to cache");
STATNODE_COUNTER(numposhits, "Number of cache hits (positive)");
STATNODE_COUNTER(numnegzaps,
    "Number of cache hits (negative) we do not want to cache");
STATNODE_COUNTER(numneghits, "Number of cache hits (negative)");
/* These count for vn_getcwd(), too. */
STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls");
STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
STATNODE_COUNTER(numfullpathfail2,
    "Number of fullpath search errors (VOP_VPTOCNP failures)");
STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls");
STATNODE_COUNTER(zap_and_exit_bucket_relock_success,
    "Number of successful removals after relocking");
/*
 * NOTE(review): the three variables below are declared as (signed) long but
 * exported with STATNODE_ULONG; the first two also share one description
 * string.  Left as is since their users are not visible here.
 */
static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail,
    "Number of times zap_and_exit failed to lock");
static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2,
    "Number of times zap_and_exit failed to lock");
static long cache_lock_vnodes_cel_3_failures;
STATNODE_ULONG(cache_lock_vnodes_cel_3_failures,
    "Number of times 3-way vnode locking failed");
STATNODE_ULONG(numhotneg, "Number of hot negative entries");
STATNODE_COUNTER(numneg_evicted,
    "Number of negative entries evicted when adding a new entry");
STATNODE_COUNTER(shrinking_skipped,
    "Number of times shrinking was already in progress");
481 
/* Prototypes of static functions which are used ahead of their definitions. */
static void cache_zap_locked(struct namecache *ncp);
static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf,
    char **freebuf, size_t *buflen);
static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *buflen, bool slash_prefixed, size_t addend);
static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *buflen);
static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *len, bool slash_prefixed, size_t addend);

/* malloc(9) type for allocations made by the name cache. */
static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
493 
494 static int cache_yield;
495 SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0,
496     "Number of times cache called yield");
497 
498 static void __noinline
499 cache_maybe_yield(void)
500 {
501 
502 	if (should_yield()) {
503 		cache_yield++;
504 		kern_yield(PRI_USER);
505 	}
506 }
507 
508 static inline void
509 cache_assert_vlp_locked(struct mtx *vlp)
510 {
511 
512 	if (vlp != NULL)
513 		mtx_assert(vlp, MA_OWNED);
514 }
515 
/*
 * Assert that the vnode lock covering the given vnode is owned.
 */
static inline void
cache_assert_vnode_locked(struct vnode *vp)
{

	cache_assert_vlp_locked(VP2VNODELOCK(vp));
}
524 
525 /*
526  * TODO: With the value stored we can do better than computing the hash based
527  * on the address. The choice of FNV should also be revisited.
528  */
529 static void
530 cache_prehash(struct vnode *vp)
531 {
532 
533 	vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
534 }
535 
536 static uint32_t
537 cache_get_hash(char *name, u_char len, struct vnode *dvp)
538 {
539 
540 	return (fnv_32_buf(name, len, dvp->v_nchash));
541 }
542 
543 static inline struct nchashhead *
544 NCP2BUCKET(struct namecache *ncp)
545 {
546 	uint32_t hash;
547 
548 	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
549 	return (NCHHASH(hash));
550 }
551 
552 static inline struct mtx *
553 NCP2BUCKETLOCK(struct namecache *ncp)
554 {
555 	uint32_t hash;
556 
557 	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
558 	return (HASH2BUCKETLOCK(hash));
559 }
560 
#ifdef INVARIANTS
/* Assert that the bucket lock covering the entry is owned. */
static void
cache_assert_bucket_locked(struct namecache *ncp)
{

	mtx_assert(NCP2BUCKETLOCK(ncp), MA_OWNED);
}

/* Assert that the bucket lock covering the entry is not owned. */
static void
cache_assert_bucket_unlocked(struct namecache *ncp)
{

	mtx_assert(NCP2BUCKETLOCK(ncp), MA_NOTOWNED);
}
#else
/* Without INVARIANTS the assertions compile to nothing. */
#define cache_assert_bucket_locked(x) do { } while (0)
#define cache_assert_bucket_unlocked(x) do { } while (0)
#endif
583 
584 #define cache_sort_vnodes(x, y)	_cache_sort_vnodes((void **)(x), (void **)(y))
585 static void
586 _cache_sort_vnodes(void **p1, void **p2)
587 {
588 	void *tmp;
589 
590 	MPASS(*p1 != NULL || *p2 != NULL);
591 
592 	if (*p1 > *p2) {
593 		tmp = *p2;
594 		*p2 = *p1;
595 		*p1 = tmp;
596 	}
597 }
598 
599 static void
600 cache_lock_all_buckets(void)
601 {
602 	u_int i;
603 
604 	for (i = 0; i < numbucketlocks; i++)
605 		mtx_lock(&bucketlocks[i]);
606 }
607 
608 static void
609 cache_unlock_all_buckets(void)
610 {
611 	u_int i;
612 
613 	for (i = 0; i < numbucketlocks; i++)
614 		mtx_unlock(&bucketlocks[i]);
615 }
616 
617 static void
618 cache_lock_all_vnodes(void)
619 {
620 	u_int i;
621 
622 	for (i = 0; i < numvnodelocks; i++)
623 		mtx_lock(&vnodelocks[i]);
624 }
625 
626 static void
627 cache_unlock_all_vnodes(void)
628 {
629 	u_int i;
630 
631 	for (i = 0; i < numvnodelocks; i++)
632 		mtx_unlock(&vnodelocks[i]);
633 }
634 
635 static int
636 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
637 {
638 
639 	cache_sort_vnodes(&vlp1, &vlp2);
640 
641 	if (vlp1 != NULL) {
642 		if (!mtx_trylock(vlp1))
643 			return (EAGAIN);
644 	}
645 	if (!mtx_trylock(vlp2)) {
646 		if (vlp1 != NULL)
647 			mtx_unlock(vlp1);
648 		return (EAGAIN);
649 	}
650 
651 	return (0);
652 }
653 
654 static void
655 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
656 {
657 
658 	MPASS(vlp1 != NULL || vlp2 != NULL);
659 	MPASS(vlp1 <= vlp2);
660 
661 	if (vlp1 != NULL)
662 		mtx_lock(vlp1);
663 	if (vlp2 != NULL)
664 		mtx_lock(vlp2);
665 }
666 
667 static void
668 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
669 {
670 
671 	MPASS(vlp1 != NULL || vlp2 != NULL);
672 
673 	if (vlp1 != NULL)
674 		mtx_unlock(vlp1);
675 	if (vlp2 != NULL)
676 		mtx_unlock(vlp2);
677 }
678 
679 static int
680 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
681 {
682 	struct nchstats snap;
683 
684 	if (req->oldptr == NULL)
685 		return (SYSCTL_OUT(req, 0, sizeof(snap)));
686 
687 	snap = nchstats;
688 	snap.ncs_goodhits = counter_u64_fetch(numposhits);
689 	snap.ncs_neghits = counter_u64_fetch(numneghits);
690 	snap.ncs_badhits = counter_u64_fetch(numposzaps) +
691 	    counter_u64_fetch(numnegzaps);
692 	snap.ncs_miss = counter_u64_fetch(nummisszap) +
693 	    counter_u64_fetch(nummiss);
694 
695 	return (SYSCTL_OUT(req, &snap, sizeof(snap)));
696 }
697 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
698     CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
699     "VFS cache effectiveness statistics");
700 
701 #ifdef DIAGNOSTIC
/*
 * Diagnostic sysctls reporting on the name cache hash chain lengths.
 * The handlers hold all bucket locks while walking the table, so the
 * numbers they report form a consistent snapshot.
 */
static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
    CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    "hash table stats");
708 
709 static int
710 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
711 {
712 	struct nchashhead *ncpp;
713 	struct namecache *ncp;
714 	int i, error, n_nchash, *cntbuf;
715 
716 retry:
717 	n_nchash = nchash + 1;	/* nchash is max index, not count */
718 	if (req->oldptr == NULL)
719 		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
720 	cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
721 	cache_lock_all_buckets();
722 	if (n_nchash != nchash + 1) {
723 		cache_unlock_all_buckets();
724 		free(cntbuf, M_TEMP);
725 		goto retry;
726 	}
727 	/* Scan hash tables counting entries */
728 	for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
729 		CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
730 			cntbuf[i]++;
731 	cache_unlock_all_buckets();
732 	for (error = 0, i = 0; i < n_nchash; i++)
733 		if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
734 			break;
735 	free(cntbuf, M_TEMP);
736 	return (error);
737 }
738 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
739     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
740     "nchash chain lengths");
741 
742 static int
743 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
744 {
745 	int error;
746 	struct nchashhead *ncpp;
747 	struct namecache *ncp;
748 	int n_nchash;
749 	int count, maxlength, used, pct;
750 
751 	if (!req->oldptr)
752 		return SYSCTL_OUT(req, 0, 4 * sizeof(int));
753 
754 	cache_lock_all_buckets();
755 	n_nchash = nchash + 1;	/* nchash is max index, not count */
756 	used = 0;
757 	maxlength = 0;
758 
759 	/* Scan hash tables for applicable entries */
760 	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
761 		count = 0;
762 		CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
763 			count++;
764 		}
765 		if (count)
766 			used++;
767 		if (maxlength < count)
768 			maxlength = count;
769 	}
770 	n_nchash = nchash + 1;
771 	cache_unlock_all_buckets();
772 	pct = (used * 100) / (n_nchash / 100);
773 	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
774 	if (error)
775 		return (error);
776 	error = SYSCTL_OUT(req, &used, sizeof(used));
777 	if (error)
778 		return (error);
779 	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
780 	if (error)
781 		return (error);
782 	error = SYSCTL_OUT(req, &pct, sizeof(pct));
783 	if (error)
784 		return (error);
785 	return (0);
786 }
787 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
788     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
789     "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
790 #endif
791 
792 /*
793  * Negative entries management
794  *
795  * A variation of LRU scheme is used. New entries are hashed into one of
796  * numneglists cold lists. Entries get promoted to the hot list on first hit.
797  *
798  * The shrinker will demote hot list head and evict from the cold list in a
799  * round-robin manner.
800  */
801 static void
802 cache_negative_init(struct namecache *ncp)
803 {
804 	struct negstate *negstate;
805 
806 	ncp->nc_flag |= NCF_NEGATIVE;
807 	negstate = NCP2NEGSTATE(ncp);
808 	negstate->neg_flag = 0;
809 }
810 
811 static void
812 cache_negative_hit(struct namecache *ncp)
813 {
814 	struct neglist *neglist;
815 	struct negstate *negstate;
816 
817 	negstate = NCP2NEGSTATE(ncp);
818 	if ((negstate->neg_flag & NEG_HOT) != 0)
819 		return;
820 	neglist = NCP2NEGLIST(ncp);
821 	mtx_lock(&ncneg_hot.nl_lock);
822 	mtx_lock(&neglist->nl_lock);
823 	if ((negstate->neg_flag & NEG_HOT) == 0) {
824 		numhotneg++;
825 		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
826 		TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
827 		negstate->neg_flag |= NEG_HOT;
828 	}
829 	mtx_unlock(&neglist->nl_lock);
830 	mtx_unlock(&ncneg_hot.nl_lock);
831 }
832 
833 static void
834 cache_negative_insert(struct namecache *ncp)
835 {
836 	struct neglist *neglist;
837 
838 	MPASS(ncp->nc_flag & NCF_NEGATIVE);
839 	cache_assert_bucket_locked(ncp);
840 	neglist = NCP2NEGLIST(ncp);
841 	mtx_lock(&neglist->nl_lock);
842 	TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
843 	mtx_unlock(&neglist->nl_lock);
844 	atomic_add_rel_long(&numneg, 1);
845 }
846 
/*
 * Unlink a negative entry from whichever list it is on (the hot list or its
 * cold list) and account for the removal in numneg.
 *
 * The bucket lock covering the entry has to be held.  Which list the entry is
 * on is only stable with the respective list lock held, hence the flag is
 * checked again after every lock acquisition.  hot_locked and list_locked
 * record which locks were taken so that exactly those get released.
 */
static void
cache_negative_remove(struct namecache *ncp)
{
	struct neglist *neglist;
	struct negstate *negstate;
	bool hot_locked = false;
	bool list_locked = false;

	cache_assert_bucket_locked(ncp);
	neglist = NCP2NEGLIST(ncp);
	negstate = NCP2NEGSTATE(ncp);
	if ((negstate->neg_flag & NEG_HOT) != 0) {
		hot_locked = true;
		mtx_lock(&ncneg_hot.nl_lock);
		/*
		 * The entry may have stopped being hot before we got the lock
		 * (the shrinker moves entries back to their cold list), in
		 * which case the cold list lock is needed as well.
		 */
		if ((negstate->neg_flag & NEG_HOT) == 0) {
			list_locked = true;
			mtx_lock(&neglist->nl_lock);
		}
	} else {
		list_locked = true;
		mtx_lock(&neglist->nl_lock);
		/*
		 * We may be racing against promotion in lockless lookup.
		 */
		if ((negstate->neg_flag & NEG_HOT) != 0) {
			/*
			 * Drop the cold list lock and retake it after the hot
			 * list lock: the hot list lock is always taken first.
			 */
			mtx_unlock(&neglist->nl_lock);
			hot_locked = true;
			mtx_lock(&ncneg_hot.nl_lock);
			mtx_lock(&neglist->nl_lock);
		}
	}
	/* The locks needed for the current state are held now; unlink. */
	if ((negstate->neg_flag & NEG_HOT) != 0) {
		mtx_assert(&ncneg_hot.nl_lock, MA_OWNED);
		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
		numhotneg--;
	} else {
		mtx_assert(&neglist->nl_lock, MA_OWNED);
		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
	}
	if (list_locked)
		mtx_unlock(&neglist->nl_lock);
	if (hot_locked)
		mtx_unlock(&ncneg_hot.nl_lock);
	atomic_subtract_rel_long(&numneg, 1);
}
892 
893 static void
894 cache_negative_shrink_select(struct namecache **ncpp,
895     struct neglist **neglistpp)
896 {
897 	struct neglist *neglist;
898 	struct namecache *ncp;
899 	static u_int cycle;
900 	u_int i;
901 
902 	*ncpp = ncp = NULL;
903 
904 	for (i = 0; i < numneglists; i++) {
905 		neglist = &neglists[(cycle + i) % numneglists];
906 		if (TAILQ_FIRST(&neglist->nl_list) == NULL)
907 			continue;
908 		mtx_lock(&neglist->nl_lock);
909 		ncp = TAILQ_FIRST(&neglist->nl_list);
910 		if (ncp != NULL)
911 			break;
912 		mtx_unlock(&neglist->nl_lock);
913 	}
914 
915 	*neglistpp = neglist;
916 	*ncpp = ncp;
917 	cycle++;
918 }
919 
/*
 * Evict one negative entry.
 *
 * First the head of the hot list (if any) is moved back to the tail of its
 * cold list, then a candidate is picked from the cold lists and removed from
 * the cache.  Only one thread at a time does this; others finding the
 * shrink lock taken just bump shrinking_skipped and return.
 */
static void
cache_negative_zap_one(void)
{
	struct namecache *ncp, *ncp2;
	struct neglist *neglist;
	struct negstate *negstate;
	struct mtx *dvlp;
	struct mtx *blp;

	/* Peek at the owner first to avoid the trylock when it would fail. */
	if (mtx_owner(&ncneg_shrink_lock) != NULL ||
	    !mtx_trylock(&ncneg_shrink_lock)) {
		counter_u64_add(shrinking_skipped, 1);
		return;
	}

	/* Demote the head of the hot list. */
	mtx_lock(&ncneg_hot.nl_lock);
	ncp = TAILQ_FIRST(&ncneg_hot.nl_list);
	if (ncp != NULL) {
		neglist = NCP2NEGLIST(ncp);
		negstate = NCP2NEGSTATE(ncp);
		mtx_lock(&neglist->nl_lock);
		MPASS((negstate->neg_flag & NEG_HOT) != 0);
		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
		TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
		negstate->neg_flag &= ~NEG_HOT;
		numhotneg--;
		mtx_unlock(&neglist->nl_lock);
	}
	mtx_unlock(&ncneg_hot.nl_lock);

	/* If a candidate is returned, its list lock is held. */
	cache_negative_shrink_select(&ncp, &neglist);

	mtx_unlock(&ncneg_shrink_lock);
	if (ncp == NULL)
		return;

	/*
	 * Work out the vnode and bucket locks needed for the removal while
	 * the list lock still pins the entry, then drop the list lock before
	 * taking them.
	 */
	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	blp = NCP2BUCKETLOCK(ncp);
	mtx_unlock(&neglist->nl_lock);
	mtx_lock(dvlp);
	mtx_lock(blp);
	/*
	 * Enter SMR to safely check the negative list.
	 * Even if the found pointer matches, the entry may now be reallocated
	 * and used by a different vnode.
	 */
	vfs_smr_enter();
	ncp2 = TAILQ_FIRST(&neglist->nl_list);
	/*
	 * ncp is not NULL here, so an empty list (ncp2 == NULL) already fails
	 * the first comparison and ncp2 is not dereferenced.
	 */
	if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) ||
	    blp != NCP2BUCKETLOCK(ncp2)) {
		vfs_smr_exit();
		/* The candidate changed under us, give up on this round. */
		ncp = NULL;
	} else {
		vfs_smr_exit();
		SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp,
		    ncp->nc_name);
		cache_zap_locked(ncp);
		counter_u64_add(numneg_evicted, 1);
	}
	mtx_unlock(blp);
	mtx_unlock(dvlp);
	/* Free the entry only after all locks have been dropped. */
	if (ncp != NULL)
		cache_free(ncp);
}
985 
986 /*
987  * cache_zap_locked():
988  *
989  *   Removes a namecache entry from cache, whether it contains an actual
990  *   pointer to a vnode or if it is just a negative cache entry.
991  */
992 static void
993 cache_zap_locked(struct namecache *ncp)
994 {
995 	struct nchashhead *ncpp;
996 
997 	if (!(ncp->nc_flag & NCF_NEGATIVE))
998 		cache_assert_vnode_locked(ncp->nc_vp);
999 	cache_assert_vnode_locked(ncp->nc_dvp);
1000 	cache_assert_bucket_locked(ncp);
1001 
1002 	cache_ncp_invalidate(ncp);
1003 
1004 	ncpp = NCP2BUCKET(ncp);
1005 	CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
1006 	if (!(ncp->nc_flag & NCF_NEGATIVE)) {
1007 		SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
1008 		    ncp->nc_name, ncp->nc_vp);
1009 		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
1010 		if (ncp == ncp->nc_vp->v_cache_dd) {
1011 			vn_seqc_write_begin_unheld(ncp->nc_vp);
1012 			ncp->nc_vp->v_cache_dd = NULL;
1013 			vn_seqc_write_end(ncp->nc_vp);
1014 		}
1015 	} else {
1016 		SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
1017 		    ncp->nc_name);
1018 		cache_negative_remove(ncp);
1019 	}
1020 	if (ncp->nc_flag & NCF_ISDOTDOT) {
1021 		if (ncp == ncp->nc_dvp->v_cache_dd) {
1022 			vn_seqc_write_begin_unheld(ncp->nc_dvp);
1023 			ncp->nc_dvp->v_cache_dd = NULL;
1024 			vn_seqc_write_end(ncp->nc_dvp);
1025 		}
1026 	} else {
1027 		LIST_REMOVE(ncp, nc_src);
1028 		if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
1029 			ncp->nc_flag |= NCF_DVDROP;
1030 			counter_u64_add(numcachehv, -1);
1031 		}
1032 	}
1033 	atomic_subtract_rel_long(&numcache, 1);
1034 }
1035 
1036 static void
1037 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
1038 {
1039 	struct mtx *blp;
1040 
1041 	MPASS(ncp->nc_dvp == vp);
1042 	MPASS(ncp->nc_flag & NCF_NEGATIVE);
1043 	cache_assert_vnode_locked(vp);
1044 
1045 	blp = NCP2BUCKETLOCK(ncp);
1046 	mtx_lock(blp);
1047 	cache_zap_locked(ncp);
1048 	mtx_unlock(blp);
1049 }
1050 
1051 static bool
1052 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
1053     struct mtx **vlpp)
1054 {
1055 	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
1056 	struct mtx *blp;
1057 
1058 	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
1059 	cache_assert_vnode_locked(vp);
1060 
1061 	if (ncp->nc_flag & NCF_NEGATIVE) {
1062 		if (*vlpp != NULL) {
1063 			mtx_unlock(*vlpp);
1064 			*vlpp = NULL;
1065 		}
1066 		cache_zap_negative_locked_vnode_kl(ncp, vp);
1067 		return (true);
1068 	}
1069 
1070 	pvlp = VP2VNODELOCK(vp);
1071 	blp = NCP2BUCKETLOCK(ncp);
1072 	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
1073 	vlp2 = VP2VNODELOCK(ncp->nc_vp);
1074 
1075 	if (*vlpp == vlp1 || *vlpp == vlp2) {
1076 		to_unlock = *vlpp;
1077 		*vlpp = NULL;
1078 	} else {
1079 		if (*vlpp != NULL) {
1080 			mtx_unlock(*vlpp);
1081 			*vlpp = NULL;
1082 		}
1083 		cache_sort_vnodes(&vlp1, &vlp2);
1084 		if (vlp1 == pvlp) {
1085 			mtx_lock(vlp2);
1086 			to_unlock = vlp2;
1087 		} else {
1088 			if (!mtx_trylock(vlp1))
1089 				goto out_relock;
1090 			to_unlock = vlp1;
1091 		}
1092 	}
1093 	mtx_lock(blp);
1094 	cache_zap_locked(ncp);
1095 	mtx_unlock(blp);
1096 	if (to_unlock != NULL)
1097 		mtx_unlock(to_unlock);
1098 	return (true);
1099 
1100 out_relock:
1101 	mtx_unlock(vlp2);
1102 	mtx_lock(vlp1);
1103 	mtx_lock(vlp2);
1104 	MPASS(*vlpp == NULL);
1105 	*vlpp = vlp1;
1106 	return (false);
1107 }
1108 
1109 static int __noinline
1110 cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp)
1111 {
1112 	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
1113 	struct mtx *blp;
1114 	int error = 0;
1115 
1116 	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
1117 	cache_assert_vnode_locked(vp);
1118 
1119 	pvlp = VP2VNODELOCK(vp);
1120 	if (ncp->nc_flag & NCF_NEGATIVE) {
1121 		cache_zap_negative_locked_vnode_kl(ncp, vp);
1122 		goto out;
1123 	}
1124 
1125 	blp = NCP2BUCKETLOCK(ncp);
1126 	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
1127 	vlp2 = VP2VNODELOCK(ncp->nc_vp);
1128 	cache_sort_vnodes(&vlp1, &vlp2);
1129 	if (vlp1 == pvlp) {
1130 		mtx_lock(vlp2);
1131 		to_unlock = vlp2;
1132 	} else {
1133 		if (!mtx_trylock(vlp1)) {
1134 			/*
1135 			 * TODO: Very wasteful but rare.
1136 			 */
1137 			mtx_unlock(pvlp);
1138 			mtx_lock(vlp1);
1139 			mtx_lock(vlp2);
1140 			mtx_unlock(vlp2);
1141 			mtx_unlock(vlp1);
1142 			return (EAGAIN);
1143 		}
1144 		to_unlock = vlp1;
1145 	}
1146 	mtx_lock(blp);
1147 	cache_zap_locked(ncp);
1148 	mtx_unlock(blp);
1149 	mtx_unlock(to_unlock);
1150 out:
1151 	mtx_unlock(pvlp);
1152 	return (error);
1153 }
1154 
1155 /*
1156  * If trylocking failed we can get here. We know enough to take all needed locks
1157  * in the right order and re-lookup the entry.
1158  */
1159 static int
1160 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1161     struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
1162     struct mtx *blp)
1163 {
1164 	struct namecache *rncp;
1165 
1166 	cache_assert_bucket_unlocked(ncp);
1167 
1168 	cache_sort_vnodes(&dvlp, &vlp);
1169 	cache_lock_vnodes(dvlp, vlp);
1170 	mtx_lock(blp);
1171 	CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
1172 		if (rncp == ncp && rncp->nc_dvp == dvp &&
1173 		    rncp->nc_nlen == cnp->cn_namelen &&
1174 		    !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
1175 			break;
1176 	}
1177 	if (rncp != NULL) {
1178 		cache_zap_locked(rncp);
1179 		mtx_unlock(blp);
1180 		cache_unlock_vnodes(dvlp, vlp);
1181 		counter_u64_add(zap_and_exit_bucket_relock_success, 1);
1182 		return (0);
1183 	}
1184 
1185 	mtx_unlock(blp);
1186 	cache_unlock_vnodes(dvlp, vlp);
1187 	return (EAGAIN);
1188 }
1189 
1190 static int __noinline
1191 cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp,
1192     uint32_t hash, struct mtx *blp)
1193 {
1194 	struct mtx *dvlp, *vlp;
1195 	struct vnode *dvp;
1196 
1197 	cache_assert_bucket_locked(ncp);
1198 
1199 	dvlp = VP2VNODELOCK(ncp->nc_dvp);
1200 	vlp = NULL;
1201 	if (!(ncp->nc_flag & NCF_NEGATIVE))
1202 		vlp = VP2VNODELOCK(ncp->nc_vp);
1203 	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1204 		cache_zap_locked(ncp);
1205 		mtx_unlock(blp);
1206 		cache_unlock_vnodes(dvlp, vlp);
1207 		return (0);
1208 	}
1209 
1210 	dvp = ncp->nc_dvp;
1211 	mtx_unlock(blp);
1212 	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1213 }
1214 
1215 static int
1216 cache_zap_locked_bucket_kl(struct namecache *ncp, struct mtx *blp,
1217     struct mtx **vlpp1, struct mtx **vlpp2)
1218 {
1219 	struct mtx *dvlp, *vlp;
1220 
1221 	cache_assert_bucket_locked(ncp);
1222 
1223 	dvlp = VP2VNODELOCK(ncp->nc_dvp);
1224 	vlp = NULL;
1225 	if (!(ncp->nc_flag & NCF_NEGATIVE))
1226 		vlp = VP2VNODELOCK(ncp->nc_vp);
1227 	cache_sort_vnodes(&dvlp, &vlp);
1228 
1229 	if (*vlpp1 == dvlp && *vlpp2 == vlp) {
1230 		cache_zap_locked(ncp);
1231 		cache_unlock_vnodes(dvlp, vlp);
1232 		*vlpp1 = NULL;
1233 		*vlpp2 = NULL;
1234 		return (0);
1235 	}
1236 
1237 	if (*vlpp1 != NULL)
1238 		mtx_unlock(*vlpp1);
1239 	if (*vlpp2 != NULL)
1240 		mtx_unlock(*vlpp2);
1241 	*vlpp1 = NULL;
1242 	*vlpp2 = NULL;
1243 
1244 	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1245 		cache_zap_locked(ncp);
1246 		cache_unlock_vnodes(dvlp, vlp);
1247 		return (0);
1248 	}
1249 
1250 	mtx_unlock(blp);
1251 	*vlpp1 = dvlp;
1252 	*vlpp2 = vlp;
1253 	if (*vlpp1 != NULL)
1254 		mtx_lock(*vlpp1);
1255 	mtx_lock(*vlpp2);
1256 	mtx_lock(blp);
1257 	return (EAGAIN);
1258 }
1259 
1260 static __noinline int
1261 cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
1262 {
1263 	struct namecache *ncp;
1264 	struct mtx *blp;
1265 	struct mtx *dvlp, *dvlp2;
1266 	uint32_t hash;
1267 	int error;
1268 
1269 	if (cnp->cn_namelen == 2 &&
1270 	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
1271 		dvlp = VP2VNODELOCK(dvp);
1272 		dvlp2 = NULL;
1273 		mtx_lock(dvlp);
1274 retry_dotdot:
1275 		ncp = dvp->v_cache_dd;
1276 		if (ncp == NULL) {
1277 			mtx_unlock(dvlp);
1278 			if (dvlp2 != NULL)
1279 				mtx_unlock(dvlp2);
1280 			SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1281 			return (0);
1282 		}
1283 		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1284 			if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2))
1285 				goto retry_dotdot;
1286 			MPASS(dvp->v_cache_dd == NULL);
1287 			mtx_unlock(dvlp);
1288 			if (dvlp2 != NULL)
1289 				mtx_unlock(dvlp2);
1290 			cache_free(ncp);
1291 		} else {
1292 			vn_seqc_write_begin(dvp);
1293 			dvp->v_cache_dd = NULL;
1294 			vn_seqc_write_end(dvp);
1295 			mtx_unlock(dvlp);
1296 			if (dvlp2 != NULL)
1297 				mtx_unlock(dvlp2);
1298 		}
1299 		SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1300 		return (1);
1301 	}
1302 
1303 	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1304 	blp = HASH2BUCKETLOCK(hash);
1305 retry:
1306 	if (CK_SLIST_EMPTY(NCHHASH(hash)))
1307 		goto out_no_entry;
1308 
1309 	mtx_lock(blp);
1310 
1311 	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1312 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1313 		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1314 			break;
1315 	}
1316 
1317 	if (ncp == NULL) {
1318 		mtx_unlock(blp);
1319 		goto out_no_entry;
1320 	}
1321 
1322 	error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
1323 	if (__predict_false(error != 0)) {
1324 		zap_and_exit_bucket_fail++;
1325 		goto retry;
1326 	}
1327 	counter_u64_add(numposzaps, 1);
1328 	SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1329 	cache_free(ncp);
1330 	return (1);
1331 out_no_entry:
1332 	counter_u64_add(nummisszap, 1);
1333 	SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1334 	return (0);
1335 }
1336 
1337 static int __noinline
1338 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1339     struct timespec *tsp, int *ticksp)
1340 {
1341 	int ltype;
1342 
1343 	*vpp = dvp;
1344 	counter_u64_add(dothits, 1);
1345 	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
1346 	if (tsp != NULL)
1347 		timespecclear(tsp);
1348 	if (ticksp != NULL)
1349 		*ticksp = ticks;
1350 	vrefact(*vpp);
1351 	/*
1352 	 * When we lookup "." we still can be asked to lock it
1353 	 * differently...
1354 	 */
1355 	ltype = cnp->cn_lkflags & LK_TYPE_MASK;
1356 	if (ltype != VOP_ISLOCKED(*vpp)) {
1357 		if (ltype == LK_EXCLUSIVE) {
1358 			vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
1359 			if (VN_IS_DOOMED((*vpp))) {
1360 				/* forced unmount */
1361 				vrele(*vpp);
1362 				*vpp = NULL;
1363 				return (ENOENT);
1364 			}
1365 		} else
1366 			vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
1367 	}
1368 	return (-1);
1369 }
1370 
1371 static int __noinline
1372 cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1373     struct timespec *tsp, int *ticksp)
1374 {
1375 	struct namecache_ts *ncp_ts;
1376 	struct namecache *ncp;
1377 	struct mtx *dvlp;
1378 	enum vgetstate vs;
1379 	int error, ltype;
1380 	bool whiteout;
1381 
1382 	MPASS((cnp->cn_flags & ISDOTDOT) != 0);
1383 
1384 	if ((cnp->cn_flags & MAKEENTRY) == 0) {
1385 		cache_remove_cnp(dvp, cnp);
1386 		return (0);
1387 	}
1388 
1389 	counter_u64_add(dotdothits, 1);
1390 retry:
1391 	dvlp = VP2VNODELOCK(dvp);
1392 	mtx_lock(dvlp);
1393 	ncp = dvp->v_cache_dd;
1394 	if (ncp == NULL) {
1395 		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, "..", NULL);
1396 		mtx_unlock(dvlp);
1397 		return (0);
1398 	}
1399 	if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1400 		if (ncp->nc_flag & NCF_NEGATIVE)
1401 			*vpp = NULL;
1402 		else
1403 			*vpp = ncp->nc_vp;
1404 	} else
1405 		*vpp = ncp->nc_dvp;
1406 	if (*vpp == NULL)
1407 		goto negative_success;
1408 	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp);
1409 	cache_out_ts(ncp, tsp, ticksp);
1410 	if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
1411 	    NCF_DTS && tsp != NULL) {
1412 		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1413 		*tsp = ncp_ts->nc_dotdottime;
1414 	}
1415 
1416 	MPASS(dvp != *vpp);
1417 	ltype = VOP_ISLOCKED(dvp);
1418 	VOP_UNLOCK(dvp);
1419 	vs = vget_prep(*vpp);
1420 	mtx_unlock(dvlp);
1421 	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1422 	vn_lock(dvp, ltype | LK_RETRY);
1423 	if (VN_IS_DOOMED(dvp)) {
1424 		if (error == 0)
1425 			vput(*vpp);
1426 		*vpp = NULL;
1427 		return (ENOENT);
1428 	}
1429 	if (error) {
1430 		*vpp = NULL;
1431 		goto retry;
1432 	}
1433 	return (-1);
1434 negative_success:
1435 	if (__predict_false(cnp->cn_nameiop == CREATE)) {
1436 		if (cnp->cn_flags & ISLASTCN) {
1437 			counter_u64_add(numnegzaps, 1);
1438 			error = cache_zap_locked_vnode(ncp, dvp);
1439 			if (__predict_false(error != 0)) {
1440 				zap_and_exit_bucket_fail2++;
1441 				goto retry;
1442 			}
1443 			cache_free(ncp);
1444 			return (0);
1445 		}
1446 	}
1447 
1448 	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
1449 	cache_out_ts(ncp, tsp, ticksp);
1450 	counter_u64_add(numneghits, 1);
1451 	whiteout = (ncp->nc_flag & NCF_WHITE);
1452 	cache_negative_hit(ncp);
1453 	mtx_unlock(dvlp);
1454 	if (whiteout)
1455 		cnp->cn_flags |= ISWHITEOUT;
1456 	return (ENOENT);
1457 }
1458 
1459 /**
1460  * Lookup a name in the name cache
1461  *
1462  * # Arguments
1463  *
1464  * - dvp:	Parent directory in which to search.
1465  * - vpp:	Return argument.  Will contain desired vnode on cache hit.
1466  * - cnp:	Parameters of the name search.  The most interesting bits of
1467  *   		the cn_flags field have the following meanings:
1468  *   	- MAKEENTRY:	If clear, free an entry from the cache rather than look
1469  *   			it up.
1470  *   	- ISDOTDOT:	Must be set if and only if cn_nameptr == ".."
1471  * - tsp:	Return storage for cache timestamp.  On a successful (positive
1472  *   		or negative) lookup, tsp will be filled with any timespec that
1473  *   		was stored when this cache entry was created.  However, it will
1474  *   		be clear for "." entries.
1475  * - ticks:	Return storage for alternate cache timestamp.  On a successful
1476  *   		(positive or negative) lookup, it will contain the ticks value
1477  *   		that was current when the cache entry was created, unless cnp
1478  *   		was ".".
1479  *
1480  * Either both tsp and ticks have to be provided or neither of them.
1481  *
1482  * # Returns
1483  *
1484  * - -1:	A positive cache hit.  vpp will contain the desired vnode.
1485  * - ENOENT:	A negative cache hit, or dvp was recycled out from under us due
1486  *		to a forced unmount.  vpp will not be modified.  If the entry
1487  *		is a whiteout, then the ISWHITEOUT flag will be set in
1488  *		cnp->cn_flags.
1489  * - 0:		A cache miss.  vpp will not be modified.
1490  *
1491  * # Locking
1492  *
1493  * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
1494  * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
1495  * lock is not recursively acquired.
1496  */
1497 static int __noinline
1498 cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1499     struct timespec *tsp, int *ticksp)
1500 {
1501 	struct namecache *ncp;
1502 	struct mtx *blp;
1503 	uint32_t hash;
1504 	enum vgetstate vs;
1505 	int error;
1506 	bool whiteout;
1507 
1508 	MPASS((cnp->cn_flags & (MAKEENTRY | ISDOTDOT)) == MAKEENTRY);
1509 
1510 retry:
1511 	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1512 	blp = HASH2BUCKETLOCK(hash);
1513 	mtx_lock(blp);
1514 
1515 	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1516 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1517 		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1518 			break;
1519 	}
1520 
1521 	if (__predict_false(ncp == NULL)) {
1522 		mtx_unlock(blp);
1523 		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
1524 		    NULL);
1525 		counter_u64_add(nummiss, 1);
1526 		return (0);
1527 	}
1528 
1529 	if (ncp->nc_flag & NCF_NEGATIVE)
1530 		goto negative_success;
1531 
1532 	counter_u64_add(numposhits, 1);
1533 	*vpp = ncp->nc_vp;
1534 	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
1535 	cache_out_ts(ncp, tsp, ticksp);
1536 	MPASS(dvp != *vpp);
1537 	vs = vget_prep(*vpp);
1538 	mtx_unlock(blp);
1539 	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1540 	if (error) {
1541 		*vpp = NULL;
1542 		goto retry;
1543 	}
1544 	return (-1);
1545 negative_success:
1546 	if (__predict_false(cnp->cn_nameiop == CREATE)) {
1547 		if (cnp->cn_flags & ISLASTCN) {
1548 			counter_u64_add(numnegzaps, 1);
1549 			error = cache_zap_locked_vnode(ncp, dvp);
1550 			if (__predict_false(error != 0)) {
1551 				zap_and_exit_bucket_fail2++;
1552 				goto retry;
1553 			}
1554 			cache_free(ncp);
1555 			return (0);
1556 		}
1557 	}
1558 
1559 	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
1560 	cache_out_ts(ncp, tsp, ticksp);
1561 	counter_u64_add(numneghits, 1);
1562 	whiteout = (ncp->nc_flag & NCF_WHITE);
1563 	cache_negative_hit(ncp);
1564 	mtx_unlock(blp);
1565 	if (whiteout)
1566 		cnp->cn_flags |= ISWHITEOUT;
1567 	return (ENOENT);
1568 }
1569 
1570 int
1571 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1572     struct timespec *tsp, int *ticksp)
1573 {
1574 	struct namecache *ncp;
1575 	struct negstate *negstate;
1576 	uint32_t hash;
1577 	enum vgetstate vs;
1578 	int error;
1579 	bool whiteout;
1580 	u_short nc_flag;
1581 
1582 	MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL));
1583 
1584 #ifdef DEBUG_CACHE
1585 	if (__predict_false(!doingcache)) {
1586 		cnp->cn_flags &= ~MAKEENTRY;
1587 		return (0);
1588 	}
1589 #endif
1590 
1591 	if (__predict_false(cnp->cn_nameptr[0] == '.')) {
1592 		if (cnp->cn_namelen == 1)
1593 			return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
1594 		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.')
1595 			return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp));
1596 	}
1597 
1598 	MPASS((cnp->cn_flags & ISDOTDOT) == 0);
1599 
1600 	if ((cnp->cn_flags & MAKEENTRY) == 0) {
1601 		cache_remove_cnp(dvp, cnp);
1602 		return (0);
1603 	}
1604 
1605 	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1606 	vfs_smr_enter();
1607 
1608 	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1609 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1610 		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1611 			break;
1612 	}
1613 
1614 	if (__predict_false(ncp == NULL)) {
1615 		vfs_smr_exit();
1616 		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
1617 		    NULL);
1618 		counter_u64_add(nummiss, 1);
1619 		return (0);
1620 	}
1621 
1622 	nc_flag = atomic_load_char(&ncp->nc_flag);
1623 	if (nc_flag & NCF_NEGATIVE)
1624 		goto negative_success;
1625 
1626 	counter_u64_add(numposhits, 1);
1627 	*vpp = ncp->nc_vp;
1628 	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
1629 	cache_out_ts(ncp, tsp, ticksp);
1630 	MPASS(dvp != *vpp);
1631 	if (!cache_ncp_canuse(ncp)) {
1632 		vfs_smr_exit();
1633 		*vpp = NULL;
1634 		goto out_fallback;
1635 	}
1636 	vs = vget_prep_smr(*vpp);
1637 	vfs_smr_exit();
1638 	if (__predict_false(vs == VGET_NONE)) {
1639 		*vpp = NULL;
1640 		goto out_fallback;
1641 	}
1642 	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1643 	if (error) {
1644 		*vpp = NULL;
1645 		goto out_fallback;
1646 	}
1647 	return (-1);
1648 negative_success:
1649 	if (__predict_false(cnp->cn_nameiop == CREATE)) {
1650 		if (cnp->cn_flags & ISLASTCN) {
1651 			vfs_smr_exit();
1652 			goto out_fallback;
1653 		}
1654 	}
1655 
1656 	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
1657 	cache_out_ts(ncp, tsp, ticksp);
1658 	counter_u64_add(numneghits, 1);
1659 	whiteout = (ncp->nc_flag & NCF_WHITE);
1660 	/*
1661 	 * TODO: We need to take locks to promote an entry. Code doing it
1662 	 * in SMR lookup can be modified to be shared.
1663 	 */
1664 	negstate = NCP2NEGSTATE(ncp);
1665 	if ((negstate->neg_flag & NEG_HOT) == 0 ||
1666 	    !cache_ncp_canuse(ncp)) {
1667 		vfs_smr_exit();
1668 		goto out_fallback;
1669 	}
1670 	vfs_smr_exit();
1671 	if (whiteout)
1672 		cnp->cn_flags |= ISWHITEOUT;
1673 	return (ENOENT);
1674 out_fallback:
1675 	return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp));
1676 }
1677 
1678 struct celockstate {
1679 	struct mtx *vlp[3];
1680 	struct mtx *blp[2];
1681 };
1682 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
1683 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
1684 
1685 static inline void
1686 cache_celockstate_init(struct celockstate *cel)
1687 {
1688 
1689 	bzero(cel, sizeof(*cel));
1690 }
1691 
1692 static void
1693 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
1694     struct vnode *dvp)
1695 {
1696 	struct mtx *vlp1, *vlp2;
1697 
1698 	MPASS(cel->vlp[0] == NULL);
1699 	MPASS(cel->vlp[1] == NULL);
1700 	MPASS(cel->vlp[2] == NULL);
1701 
1702 	MPASS(vp != NULL || dvp != NULL);
1703 
1704 	vlp1 = VP2VNODELOCK(vp);
1705 	vlp2 = VP2VNODELOCK(dvp);
1706 	cache_sort_vnodes(&vlp1, &vlp2);
1707 
1708 	if (vlp1 != NULL) {
1709 		mtx_lock(vlp1);
1710 		cel->vlp[0] = vlp1;
1711 	}
1712 	mtx_lock(vlp2);
1713 	cel->vlp[1] = vlp2;
1714 }
1715 
1716 static void
1717 cache_unlock_vnodes_cel(struct celockstate *cel)
1718 {
1719 
1720 	MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
1721 
1722 	if (cel->vlp[0] != NULL)
1723 		mtx_unlock(cel->vlp[0]);
1724 	if (cel->vlp[1] != NULL)
1725 		mtx_unlock(cel->vlp[1]);
1726 	if (cel->vlp[2] != NULL)
1727 		mtx_unlock(cel->vlp[2]);
1728 }
1729 
1730 static bool
1731 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
1732 {
1733 	struct mtx *vlp;
1734 	bool ret;
1735 
1736 	cache_assert_vlp_locked(cel->vlp[0]);
1737 	cache_assert_vlp_locked(cel->vlp[1]);
1738 	MPASS(cel->vlp[2] == NULL);
1739 
1740 	MPASS(vp != NULL);
1741 	vlp = VP2VNODELOCK(vp);
1742 
1743 	ret = true;
1744 	if (vlp >= cel->vlp[1]) {
1745 		mtx_lock(vlp);
1746 	} else {
1747 		if (mtx_trylock(vlp))
1748 			goto out;
1749 		cache_lock_vnodes_cel_3_failures++;
1750 		cache_unlock_vnodes_cel(cel);
1751 		if (vlp < cel->vlp[0]) {
1752 			mtx_lock(vlp);
1753 			mtx_lock(cel->vlp[0]);
1754 			mtx_lock(cel->vlp[1]);
1755 		} else {
1756 			if (cel->vlp[0] != NULL)
1757 				mtx_lock(cel->vlp[0]);
1758 			mtx_lock(vlp);
1759 			mtx_lock(cel->vlp[1]);
1760 		}
1761 		ret = false;
1762 	}
1763 out:
1764 	cel->vlp[2] = vlp;
1765 	return (ret);
1766 }
1767 
1768 static void
1769 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1,
1770     struct mtx *blp2)
1771 {
1772 
1773 	MPASS(cel->blp[0] == NULL);
1774 	MPASS(cel->blp[1] == NULL);
1775 
1776 	cache_sort_vnodes(&blp1, &blp2);
1777 
1778 	if (blp1 != NULL) {
1779 		mtx_lock(blp1);
1780 		cel->blp[0] = blp1;
1781 	}
1782 	mtx_lock(blp2);
1783 	cel->blp[1] = blp2;
1784 }
1785 
1786 static void
1787 cache_unlock_buckets_cel(struct celockstate *cel)
1788 {
1789 
1790 	if (cel->blp[0] != NULL)
1791 		mtx_unlock(cel->blp[0]);
1792 	mtx_unlock(cel->blp[1]);
1793 }
1794 
1795 /*
1796  * Lock part of the cache affected by the insertion.
1797  *
1798  * This means vnodelocks for dvp, vp and the relevant bucketlock.
1799  * However, insertion can result in removal of an old entry. In this
1800  * case we have an additional vnode and bucketlock pair to lock.
1801  *
1802  * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
1803  * preserving the locking order (smaller address first).
1804  */
1805 static void
1806 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
1807     uint32_t hash)
1808 {
1809 	struct namecache *ncp;
1810 	struct mtx *blps[2];
1811 
1812 	blps[0] = HASH2BUCKETLOCK(hash);
1813 	for (;;) {
1814 		blps[1] = NULL;
1815 		cache_lock_vnodes_cel(cel, dvp, vp);
1816 		if (vp == NULL || vp->v_type != VDIR)
1817 			break;
1818 		ncp = vp->v_cache_dd;
1819 		if (ncp == NULL)
1820 			break;
1821 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
1822 			break;
1823 		MPASS(ncp->nc_dvp == vp);
1824 		blps[1] = NCP2BUCKETLOCK(ncp);
1825 		if (ncp->nc_flag & NCF_NEGATIVE)
1826 			break;
1827 		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
1828 			break;
1829 		/*
1830 		 * All vnodes got re-locked. Re-validate the state and if
1831 		 * nothing changed we are done. Otherwise restart.
1832 		 */
1833 		if (ncp == vp->v_cache_dd &&
1834 		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
1835 		    blps[1] == NCP2BUCKETLOCK(ncp) &&
1836 		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
1837 			break;
1838 		cache_unlock_vnodes_cel(cel);
1839 		cel->vlp[0] = NULL;
1840 		cel->vlp[1] = NULL;
1841 		cel->vlp[2] = NULL;
1842 	}
1843 	cache_lock_buckets_cel(cel, blps[0], blps[1]);
1844 }
1845 
1846 static void
1847 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
1848     uint32_t hash)
1849 {
1850 	struct namecache *ncp;
1851 	struct mtx *blps[2];
1852 
1853 	blps[0] = HASH2BUCKETLOCK(hash);
1854 	for (;;) {
1855 		blps[1] = NULL;
1856 		cache_lock_vnodes_cel(cel, dvp, vp);
1857 		ncp = dvp->v_cache_dd;
1858 		if (ncp == NULL)
1859 			break;
1860 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
1861 			break;
1862 		MPASS(ncp->nc_dvp == dvp);
1863 		blps[1] = NCP2BUCKETLOCK(ncp);
1864 		if (ncp->nc_flag & NCF_NEGATIVE)
1865 			break;
1866 		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
1867 			break;
1868 		if (ncp == dvp->v_cache_dd &&
1869 		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
1870 		    blps[1] == NCP2BUCKETLOCK(ncp) &&
1871 		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
1872 			break;
1873 		cache_unlock_vnodes_cel(cel);
1874 		cel->vlp[0] = NULL;
1875 		cel->vlp[1] = NULL;
1876 		cel->vlp[2] = NULL;
1877 	}
1878 	cache_lock_buckets_cel(cel, blps[0], blps[1]);
1879 }
1880 
/*
 * Undo cache_enter_lock() or cache_enter_lock_dd(): release the bucket
 * locks first, then the vnode locks.
 */
static void
cache_enter_unlock(struct celockstate *cel)
{

	/* Bucket locks go first, vnode locks last. */
	cache_unlock_buckets_cel(cel);
	cache_unlock_vnodes_cel(cel);
}
1888 
1889 static void __noinline
1890 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
1891     struct componentname *cnp)
1892 {
1893 	struct celockstate cel;
1894 	struct namecache *ncp;
1895 	uint32_t hash;
1896 	int len;
1897 
1898 	if (dvp->v_cache_dd == NULL)
1899 		return;
1900 	len = cnp->cn_namelen;
1901 	cache_celockstate_init(&cel);
1902 	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
1903 	cache_enter_lock_dd(&cel, dvp, vp, hash);
1904 	vn_seqc_write_begin(dvp);
1905 	ncp = dvp->v_cache_dd;
1906 	if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
1907 		KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
1908 		cache_zap_locked(ncp);
1909 	} else {
1910 		ncp = NULL;
1911 	}
1912 	dvp->v_cache_dd = NULL;
1913 	vn_seqc_write_end(dvp);
1914 	cache_enter_unlock(&cel);
1915 	if (ncp != NULL)
1916 		cache_free(ncp);
1917 }
1918 
1919 /*
1920  * Add an entry to the cache.
1921  */
1922 void
1923 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
1924     struct timespec *tsp, struct timespec *dtsp)
1925 {
1926 	struct celockstate cel;
1927 	struct namecache *ncp, *n2, *ndd;
1928 	struct namecache_ts *ncp_ts;
1929 	struct nchashhead *ncpp;
1930 	uint32_t hash;
1931 	int flag;
1932 	int len;
1933 	u_long lnumcache;
1934 
1935 	VNPASS(!VN_IS_DOOMED(dvp), dvp);
1936 	VNPASS(dvp->v_type != VNON, dvp);
1937 	if (vp != NULL) {
1938 		VNPASS(!VN_IS_DOOMED(vp), vp);
1939 		VNPASS(vp->v_type != VNON, vp);
1940 	}
1941 
1942 #ifdef DEBUG_CACHE
1943 	if (__predict_false(!doingcache))
1944 		return;
1945 #endif
1946 
1947 	flag = 0;
1948 	if (__predict_false(cnp->cn_nameptr[0] == '.')) {
1949 		if (cnp->cn_namelen == 1)
1950 			return;
1951 		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
1952 			cache_enter_dotdot_prep(dvp, vp, cnp);
1953 			flag = NCF_ISDOTDOT;
1954 		}
1955 	}
1956 
1957 	/*
1958 	 * Avoid blowout in namecache entries.
1959 	 */
1960 	lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
1961 	if (__predict_false(lnumcache >= ncsize)) {
1962 		atomic_add_long(&numcache, -1);
1963 		counter_u64_add(numdrops, 1);
1964 		return;
1965 	}
1966 
1967 	cache_celockstate_init(&cel);
1968 	ndd = NULL;
1969 	ncp_ts = NULL;
1970 
1971 	/*
1972 	 * Calculate the hash key and setup as much of the new
1973 	 * namecache entry as possible before acquiring the lock.
1974 	 */
1975 	ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
1976 	ncp->nc_flag = flag | NCF_WIP;
1977 	ncp->nc_vp = vp;
1978 	if (vp == NULL)
1979 		cache_negative_init(ncp);
1980 	ncp->nc_dvp = dvp;
1981 	if (tsp != NULL) {
1982 		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1983 		ncp_ts->nc_time = *tsp;
1984 		ncp_ts->nc_ticks = ticks;
1985 		ncp_ts->nc_nc.nc_flag |= NCF_TS;
1986 		if (dtsp != NULL) {
1987 			ncp_ts->nc_dotdottime = *dtsp;
1988 			ncp_ts->nc_nc.nc_flag |= NCF_DTS;
1989 		}
1990 	}
1991 	len = ncp->nc_nlen = cnp->cn_namelen;
1992 	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
1993 	memcpy(ncp->nc_name, cnp->cn_nameptr, len);
1994 	ncp->nc_name[len] = '\0';
1995 	cache_enter_lock(&cel, dvp, vp, hash);
1996 
1997 	/*
1998 	 * See if this vnode or negative entry is already in the cache
1999 	 * with this name.  This can happen with concurrent lookups of
2000 	 * the same path name.
2001 	 */
2002 	ncpp = NCHHASH(hash);
2003 	CK_SLIST_FOREACH(n2, ncpp, nc_hash) {
2004 		if (n2->nc_dvp == dvp &&
2005 		    n2->nc_nlen == cnp->cn_namelen &&
2006 		    !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
2007 			MPASS(cache_ncp_canuse(n2));
2008 			if ((n2->nc_flag & NCF_NEGATIVE) != 0)
2009 				KASSERT(vp == NULL,
2010 				    ("%s: found entry pointing to a different vnode (%p != %p)",
2011 				    __func__, NULL, vp));
2012 			else
2013 				KASSERT(n2->nc_vp == vp,
2014 				    ("%s: found entry pointing to a different vnode (%p != %p)",
2015 				    __func__, n2->nc_vp, vp));
2016 			/*
2017 			 * Entries are supposed to be immutable unless in the
2018 			 * process of getting destroyed. Accommodating for
2019 			 * changing timestamps is possible but not worth it.
2020 			 * This should be harmless in terms of correctness, in
2021 			 * the worst case resulting in an earlier expiration.
2022 			 * Alternatively, the found entry can be replaced
2023 			 * altogether.
2024 			 */
2025 			MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS)));
2026 #if 0
2027 			if (tsp != NULL) {
2028 				KASSERT((n2->nc_flag & NCF_TS) != 0,
2029 				    ("no NCF_TS"));
2030 				n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
2031 				n2_ts->nc_time = ncp_ts->nc_time;
2032 				n2_ts->nc_ticks = ncp_ts->nc_ticks;
2033 				if (dtsp != NULL) {
2034 					n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
2035 					n2_ts->nc_nc.nc_flag |= NCF_DTS;
2036 				}
2037 			}
2038 #endif
2039 			goto out_unlock_free;
2040 		}
2041 	}
2042 
2043 	if (flag == NCF_ISDOTDOT) {
2044 		/*
2045 		 * See if we are trying to add .. entry, but some other lookup
2046 		 * has populated v_cache_dd pointer already.
2047 		 */
2048 		if (dvp->v_cache_dd != NULL)
2049 			goto out_unlock_free;
2050 		KASSERT(vp == NULL || vp->v_type == VDIR,
2051 		    ("wrong vnode type %p", vp));
2052 		vn_seqc_write_begin(dvp);
2053 		dvp->v_cache_dd = ncp;
2054 		vn_seqc_write_end(dvp);
2055 	}
2056 
2057 	if (vp != NULL) {
2058 		if (flag != NCF_ISDOTDOT) {
2059 			/*
2060 			 * For this case, the cache entry maps both the
2061 			 * directory name in it and the name ".." for the
2062 			 * directory's parent.
2063 			 */
2064 			vn_seqc_write_begin(vp);
2065 			if ((ndd = vp->v_cache_dd) != NULL) {
2066 				if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
2067 					cache_zap_locked(ndd);
2068 				else
2069 					ndd = NULL;
2070 			}
2071 			vp->v_cache_dd = ncp;
2072 			vn_seqc_write_end(vp);
2073 		} else if (vp->v_type != VDIR) {
2074 			if (vp->v_cache_dd != NULL) {
2075 				vn_seqc_write_begin(vp);
2076 				vp->v_cache_dd = NULL;
2077 				vn_seqc_write_end(vp);
2078 			}
2079 		}
2080 	}
2081 
2082 	if (flag != NCF_ISDOTDOT) {
2083 		if (LIST_EMPTY(&dvp->v_cache_src)) {
2084 			vhold(dvp);
2085 			counter_u64_add(numcachehv, 1);
2086 		}
2087 		LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
2088 	}
2089 
2090 	/*
2091 	 * If the entry is "negative", we place it into the
2092 	 * "negative" cache queue, otherwise, we place it into the
2093 	 * destination vnode's cache entries queue.
2094 	 */
2095 	if (vp != NULL) {
2096 		TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
2097 		SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
2098 		    vp);
2099 	} else {
2100 		if (cnp->cn_flags & ISWHITEOUT)
2101 			ncp->nc_flag |= NCF_WHITE;
2102 		cache_negative_insert(ncp);
2103 		SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
2104 		    ncp->nc_name);
2105 	}
2106 
2107 	/*
2108 	 * Insert the new namecache entry into the appropriate chain
2109 	 * within the cache entries table.
2110 	 */
2111 	CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash);
2112 
2113 	atomic_thread_fence_rel();
2114 	/*
2115 	 * Mark the entry as fully constructed.
2116 	 * It is immutable past this point until its removal.
2117 	 */
2118 	atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP);
2119 
2120 	cache_enter_unlock(&cel);
2121 	if (numneg * ncnegfactor > lnumcache)
2122 		cache_negative_zap_one();
2123 	if (ndd != NULL)
2124 		cache_free(ndd);
2125 	return;
2126 out_unlock_free:
2127 	cache_enter_unlock(&cel);
2128 	atomic_add_long(&numcache, -1);
2129 	cache_free(ncp);
2130 	return;
2131 }
2132 
/*
 * Return the smallest power of two strictly greater than val.
 *
 * The result is clamped to the largest power of two representable in a
 * u_int.  Without the clamp, shifting res past the top bit wraps it to 0,
 * "0 <= val" stays true forever and the loop never terminates for any val
 * with the highest bit set.
 */
static u_int
cache_roundup_2(u_int val)
{
	const u_int top = ~(~(u_int)0 >> 1);	/* only the highest bit set */
	u_int res;

	for (res = 1; res <= val; res <<= 1) {
		if (res == top)
			break;
	}

	return (res);
}
2143 
2144 static struct nchashhead *
2145 nchinittbl(u_long elements, u_long *hashmask)
2146 {
2147 	struct nchashhead *hashtbl;
2148 	u_long hashsize, i;
2149 
2150 	hashsize = cache_roundup_2(elements) / 2;
2151 
2152 	hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK);
2153 	for (i = 0; i < hashsize; i++)
2154 		CK_SLIST_INIT(&hashtbl[i]);
2155 	*hashmask = hashsize - 1;
2156 	return (hashtbl);
2157 }
2158 
2159 static void
2160 ncfreetbl(struct nchashhead *hashtbl)
2161 {
2162 
2163 	free(hashtbl, M_VFSCACHE);
2164 }
2165 
2166 /*
2167  * Name cache initialization, from vfs_init() when we are booting
2168  */
2169 static void
2170 nchinit(void *dummy __unused)
2171 {
2172 	u_int i;
2173 
2174 	cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE,
2175 	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2176 	cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE,
2177 	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2178 	cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE,
2179 	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2180 	cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE,
2181 	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2182 
2183 	VFS_SMR_ZONE_SET(cache_zone_small);
2184 	VFS_SMR_ZONE_SET(cache_zone_small_ts);
2185 	VFS_SMR_ZONE_SET(cache_zone_large);
2186 	VFS_SMR_ZONE_SET(cache_zone_large_ts);
2187 
2188 	ncsize = desiredvnodes * ncsizefactor;
2189 	nchashtbl = nchinittbl(desiredvnodes * 2, &nchash);
2190 	ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
2191 	if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
2192 		ncbuckethash = 7;
2193 	if (ncbuckethash > nchash)
2194 		ncbuckethash = nchash;
2195 	bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
2196 	    M_WAITOK | M_ZERO);
2197 	for (i = 0; i < numbucketlocks; i++)
2198 		mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE);
2199 	ncvnodehash = ncbuckethash;
2200 	vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
2201 	    M_WAITOK | M_ZERO);
2202 	for (i = 0; i < numvnodelocks; i++)
2203 		mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
2204 	ncpurgeminvnodes = numbucketlocks * 2;
2205 
2206 	neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE,
2207 	    M_WAITOK | M_ZERO);
2208 	for (i = 0; i < numneglists; i++) {
2209 		mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
2210 		TAILQ_INIT(&neglists[i].nl_list);
2211 	}
2212 	mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF);
2213 	TAILQ_INIT(&ncneg_hot.nl_list);
2214 
2215 	mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF);
2216 }
2217 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
2218 
2219 void
2220 cache_vnode_init(struct vnode *vp)
2221 {
2222 
2223 	LIST_INIT(&vp->v_cache_src);
2224 	TAILQ_INIT(&vp->v_cache_dst);
2225 	vp->v_cache_dd = NULL;
2226 	cache_prehash(vp);
2227 }
2228 
2229 void
2230 cache_changesize(u_long newmaxvnodes)
2231 {
2232 	struct nchashhead *new_nchashtbl, *old_nchashtbl;
2233 	u_long new_nchash, old_nchash;
2234 	struct namecache *ncp;
2235 	uint32_t hash;
2236 	u_long newncsize;
2237 	int i;
2238 
2239 	newncsize = newmaxvnodes * ncsizefactor;
2240 	newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
2241 	if (newmaxvnodes < numbucketlocks)
2242 		newmaxvnodes = numbucketlocks;
2243 
2244 	new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash);
2245 	/* If same hash table size, nothing to do */
2246 	if (nchash == new_nchash) {
2247 		ncfreetbl(new_nchashtbl);
2248 		return;
2249 	}
2250 	/*
2251 	 * Move everything from the old hash table to the new table.
2252 	 * None of the namecache entries in the table can be removed
2253 	 * because to do so, they have to be removed from the hash table.
2254 	 */
2255 	cache_lock_all_vnodes();
2256 	cache_lock_all_buckets();
2257 	old_nchashtbl = nchashtbl;
2258 	old_nchash = nchash;
2259 	nchashtbl = new_nchashtbl;
2260 	nchash = new_nchash;
2261 	for (i = 0; i <= old_nchash; i++) {
2262 		while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) {
2263 			hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
2264 			    ncp->nc_dvp);
2265 			CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash);
2266 			CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
2267 		}
2268 	}
2269 	ncsize = newncsize;
2270 	cache_unlock_all_buckets();
2271 	cache_unlock_all_vnodes();
2272 	ncfreetbl(old_nchashtbl);
2273 }
2274 
2275 /*
2276  * Invalidate all entries from and to a particular vnode.
2277  */
2278 static void
2279 cache_purge_impl(struct vnode *vp)
2280 {
2281 	TAILQ_HEAD(, namecache) ncps;
2282 	struct namecache *ncp, *nnp;
2283 	struct mtx *vlp, *vlp2;
2284 
2285 	TAILQ_INIT(&ncps);
2286 	vlp = VP2VNODELOCK(vp);
2287 	vlp2 = NULL;
2288 	mtx_assert(vlp, MA_OWNED);
2289 retry:
2290 	while (!LIST_EMPTY(&vp->v_cache_src)) {
2291 		ncp = LIST_FIRST(&vp->v_cache_src);
2292 		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2293 			goto retry;
2294 		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2295 	}
2296 	while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
2297 		ncp = TAILQ_FIRST(&vp->v_cache_dst);
2298 		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2299 			goto retry;
2300 		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2301 	}
2302 	ncp = vp->v_cache_dd;
2303 	if (ncp != NULL) {
2304 		KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
2305 		   ("lost dotdot link"));
2306 		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2307 			goto retry;
2308 		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2309 	}
2310 	KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
2311 	mtx_unlock(vlp);
2312 	if (vlp2 != NULL)
2313 		mtx_unlock(vlp2);
2314 	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2315 		cache_free(ncp);
2316 	}
2317 }
2318 
2319 void
2320 cache_purge(struct vnode *vp)
2321 {
2322 	struct mtx *vlp;
2323 
2324 	SDT_PROBE1(vfs, namecache, purge, done, vp);
2325 	if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2326 	    vp->v_cache_dd == NULL)
2327 		return;
2328 	vlp = VP2VNODELOCK(vp);
2329 	mtx_lock(vlp);
2330 	cache_purge_impl(vp);
2331 }
2332 
2333 /*
2334  * Only to be used by vgone.
2335  */
2336 void
2337 cache_purge_vgone(struct vnode *vp)
2338 {
2339 	struct mtx *vlp;
2340 
2341 	VNPASS(VN_IS_DOOMED(vp), vp);
2342 	vlp = VP2VNODELOCK(vp);
2343 	if (!(LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2344 	    vp->v_cache_dd == NULL)) {
2345 		mtx_lock(vlp);
2346 		cache_purge_impl(vp);
2347 		mtx_assert(vlp, MA_NOTOWNED);
2348 		return;
2349 	}
2350 
2351 	/*
2352 	 * All the NULL pointer state we found above may be transient.
2353 	 * Serialize against a possible thread doing cache_purge.
2354 	 */
2355 	mtx_wait_unlocked(vlp);
2356 	if (!(LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2357 	    vp->v_cache_dd == NULL)) {
2358 		mtx_lock(vlp);
2359 		cache_purge_impl(vp);
2360 		mtx_assert(vlp, MA_NOTOWNED);
2361 		return;
2362 	}
2363 	return;
2364 }
2365 
2366 /*
2367  * Invalidate all negative entries for a particular directory vnode.
2368  */
2369 void
2370 cache_purge_negative(struct vnode *vp)
2371 {
2372 	TAILQ_HEAD(, namecache) ncps;
2373 	struct namecache *ncp, *nnp;
2374 	struct mtx *vlp;
2375 
2376 	SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
2377 	if (LIST_EMPTY(&vp->v_cache_src))
2378 		return;
2379 	TAILQ_INIT(&ncps);
2380 	vlp = VP2VNODELOCK(vp);
2381 	mtx_lock(vlp);
2382 	LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
2383 		if (!(ncp->nc_flag & NCF_NEGATIVE))
2384 			continue;
2385 		cache_zap_negative_locked_vnode_kl(ncp, vp);
2386 		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2387 	}
2388 	mtx_unlock(vlp);
2389 	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2390 		cache_free(ncp);
2391 	}
2392 }
2393 
2394 void
2395 cache_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
2396     struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp)
2397 {
2398 
2399 	ASSERT_VOP_IN_SEQC(fdvp);
2400 	ASSERT_VOP_IN_SEQC(fvp);
2401 	ASSERT_VOP_IN_SEQC(tdvp);
2402 	if (tvp != NULL)
2403 		ASSERT_VOP_IN_SEQC(tvp);
2404 
2405 	cache_purge(fvp);
2406 	if (tvp != NULL) {
2407 		cache_purge(tvp);
2408 		KASSERT(!cache_remove_cnp(tdvp, tcnp),
2409 		    ("%s: lingering negative entry", __func__));
2410 	} else {
2411 		cache_remove_cnp(tdvp, tcnp);
2412 	}
2413 }
2414 
2415 /*
2416  * Flush all entries referencing a particular filesystem.
2417  */
2418 void
2419 cache_purgevfs(struct mount *mp, bool force)
2420 {
2421 	TAILQ_HEAD(, namecache) ncps;
2422 	struct mtx *vlp1, *vlp2;
2423 	struct mtx *blp;
2424 	struct nchashhead *bucket;
2425 	struct namecache *ncp, *nnp;
2426 	u_long i, j, n_nchash;
2427 	int error;
2428 
2429 	/* Scan hash tables for applicable entries */
2430 	SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
2431 	if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes)
2432 		return;
2433 	TAILQ_INIT(&ncps);
2434 	n_nchash = nchash + 1;
2435 	vlp1 = vlp2 = NULL;
2436 	for (i = 0; i < numbucketlocks; i++) {
2437 		blp = (struct mtx *)&bucketlocks[i];
2438 		mtx_lock(blp);
2439 		for (j = i; j < n_nchash; j += numbucketlocks) {
2440 retry:
2441 			bucket = &nchashtbl[j];
2442 			CK_SLIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) {
2443 				cache_assert_bucket_locked(ncp);
2444 				if (ncp->nc_dvp->v_mount != mp)
2445 					continue;
2446 				error = cache_zap_locked_bucket_kl(ncp, blp,
2447 				    &vlp1, &vlp2);
2448 				if (error != 0)
2449 					goto retry;
2450 				TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst);
2451 			}
2452 		}
2453 		mtx_unlock(blp);
2454 		if (vlp1 == NULL && vlp2 == NULL)
2455 			cache_maybe_yield();
2456 	}
2457 	if (vlp1 != NULL)
2458 		mtx_unlock(vlp1);
2459 	if (vlp2 != NULL)
2460 		mtx_unlock(vlp2);
2461 
2462 	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2463 		cache_free(ncp);
2464 	}
2465 }
2466 
2467 /*
2468  * Perform canonical checks and cache lookup and pass on to filesystem
2469  * through the vop_cachedlookup only if needed.
2470  */
2471 
2472 int
2473 vfs_cache_lookup(struct vop_lookup_args *ap)
2474 {
2475 	struct vnode *dvp;
2476 	int error;
2477 	struct vnode **vpp = ap->a_vpp;
2478 	struct componentname *cnp = ap->a_cnp;
2479 	int flags = cnp->cn_flags;
2480 
2481 	*vpp = NULL;
2482 	dvp = ap->a_dvp;
2483 
2484 	if (dvp->v_type != VDIR)
2485 		return (ENOTDIR);
2486 
2487 	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
2488 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
2489 		return (EROFS);
2490 
2491 	error = vn_dir_check_exec(dvp, cnp);
2492 	if (error != 0)
2493 		return (error);
2494 
2495 	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
2496 	if (error == 0)
2497 		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
2498 	if (error == -1)
2499 		return (0);
2500 	return (error);
2501 }
2502 
2503 /* Implementation of the getcwd syscall. */
2504 int
2505 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
2506 {
2507 	char *buf, *retbuf;
2508 	size_t buflen;
2509 	int error;
2510 
2511 	buflen = uap->buflen;
2512 	if (__predict_false(buflen < 2))
2513 		return (EINVAL);
2514 	if (buflen > MAXPATHLEN)
2515 		buflen = MAXPATHLEN;
2516 
2517 	buf = uma_zalloc(namei_zone, M_WAITOK);
2518 	error = vn_getcwd(buf, &retbuf, &buflen);
2519 	if (error == 0)
2520 		error = copyout(retbuf, uap->buf, buflen);
2521 	uma_zfree(namei_zone, buf);
2522 	return (error);
2523 }
2524 
2525 int
2526 vn_getcwd(char *buf, char **retbuf, size_t *buflen)
2527 {
2528 	struct pwd *pwd;
2529 	int error;
2530 
2531 	vfs_smr_enter();
2532 	pwd = pwd_get_smr();
2533 	error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf,
2534 	    buflen, false, 0);
2535 	VFS_SMR_ASSERT_NOT_ENTERED();
2536 	if (error < 0) {
2537 		pwd = pwd_hold(curthread);
2538 		error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf,
2539 		    retbuf, buflen);
2540 		pwd_drop(pwd);
2541 	}
2542 
2543 #ifdef KTRACE
2544 	if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
2545 		ktrnamei(*retbuf);
2546 #endif
2547 	return (error);
2548 }
2549 
2550 static int
2551 kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
2552     size_t size, int flags, enum uio_seg pathseg)
2553 {
2554 	struct nameidata nd;
2555 	char *retbuf, *freebuf;
2556 	int error;
2557 
2558 	if (flags != 0)
2559 		return (EINVAL);
2560 	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1,
2561 	    pathseg, path, fd, &cap_fstat_rights, td);
2562 	if ((error = namei(&nd)) != 0)
2563 		return (error);
2564 	error = vn_fullpath_hardlink(&nd, &retbuf, &freebuf, &size);
2565 	if (error == 0) {
2566 		error = copyout(retbuf, buf, size);
2567 		free(freebuf, M_TEMP);
2568 	}
2569 	NDFREE(&nd, 0);
2570 	return (error);
2571 }
2572 
2573 int
2574 sys___realpathat(struct thread *td, struct __realpathat_args *uap)
2575 {
2576 
2577 	return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
2578 	    uap->flags, UIO_USERSPACE));
2579 }
2580 
2581 /*
2582  * Retrieve the full filesystem path that correspond to a vnode from the name
2583  * cache (if available)
2584  */
2585 int
2586 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf)
2587 {
2588 	struct pwd *pwd;
2589 	char *buf;
2590 	size_t buflen;
2591 	int error;
2592 
2593 	if (__predict_false(vp == NULL))
2594 		return (EINVAL);
2595 
2596 	buflen = MAXPATHLEN;
2597 	buf = malloc(buflen, M_TEMP, M_WAITOK);
2598 	vfs_smr_enter();
2599 	pwd = pwd_get_smr();
2600 	error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, false, 0);
2601 	VFS_SMR_ASSERT_NOT_ENTERED();
2602 	if (error < 0) {
2603 		pwd = pwd_hold(curthread);
2604 		error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen);
2605 		pwd_drop(pwd);
2606 	}
2607 	if (error == 0)
2608 		*freebuf = buf;
2609 	else
2610 		free(buf, M_TEMP);
2611 	return (error);
2612 }
2613 
2614 /*
2615  * This function is similar to vn_fullpath, but it attempts to lookup the
2616  * pathname relative to the global root mount point.  This is required for the
2617  * auditing sub-system, as audited pathnames must be absolute, relative to the
2618  * global root mount point.
2619  */
2620 int
2621 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf)
2622 {
2623 	char *buf;
2624 	size_t buflen;
2625 	int error;
2626 
2627 	if (__predict_false(vp == NULL))
2628 		return (EINVAL);
2629 	buflen = MAXPATHLEN;
2630 	buf = malloc(buflen, M_TEMP, M_WAITOK);
2631 	vfs_smr_enter();
2632 	error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, false, 0);
2633 	VFS_SMR_ASSERT_NOT_ENTERED();
2634 	if (error < 0) {
2635 		error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen);
2636 	}
2637 	if (error == 0)
2638 		*freebuf = buf;
2639 	else
2640 		free(buf, M_TEMP);
2641 	return (error);
2642 }
2643 
2644 static struct namecache *
2645 vn_dd_from_dst(struct vnode *vp)
2646 {
2647 	struct namecache *ncp;
2648 
2649 	cache_assert_vnode_locked(vp);
2650 	TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) {
2651 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2652 			return (ncp);
2653 	}
2654 	return (NULL);
2655 }
2656 
2657 int
2658 vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, size_t *buflen)
2659 {
2660 	struct vnode *dvp;
2661 	struct namecache *ncp;
2662 	struct mtx *vlp;
2663 	int error;
2664 
2665 	vlp = VP2VNODELOCK(*vp);
2666 	mtx_lock(vlp);
2667 	ncp = (*vp)->v_cache_dd;
2668 	if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) {
2669 		KASSERT(ncp == vn_dd_from_dst(*vp),
2670 		    ("%s: mismatch for dd entry (%p != %p)", __func__,
2671 		    ncp, vn_dd_from_dst(*vp)));
2672 	} else {
2673 		ncp = vn_dd_from_dst(*vp);
2674 	}
2675 	if (ncp != NULL) {
2676 		if (*buflen < ncp->nc_nlen) {
2677 			mtx_unlock(vlp);
2678 			vrele(*vp);
2679 			counter_u64_add(numfullpathfail4, 1);
2680 			error = ENOMEM;
2681 			SDT_PROBE3(vfs, namecache, fullpath, return, error,
2682 			    vp, NULL);
2683 			return (error);
2684 		}
2685 		*buflen -= ncp->nc_nlen;
2686 		memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
2687 		SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
2688 		    ncp->nc_name, vp);
2689 		dvp = *vp;
2690 		*vp = ncp->nc_dvp;
2691 		vref(*vp);
2692 		mtx_unlock(vlp);
2693 		vrele(dvp);
2694 		return (0);
2695 	}
2696 	SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
2697 
2698 	mtx_unlock(vlp);
2699 	vn_lock(*vp, LK_SHARED | LK_RETRY);
2700 	error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen);
2701 	vput(*vp);
2702 	if (error) {
2703 		counter_u64_add(numfullpathfail2, 1);
2704 		SDT_PROBE3(vfs, namecache, fullpath, return,  error, vp, NULL);
2705 		return (error);
2706 	}
2707 
2708 	*vp = dvp;
2709 	if (VN_IS_DOOMED(dvp)) {
2710 		/* forced unmount */
2711 		vrele(dvp);
2712 		error = ENOENT;
2713 		SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
2714 		return (error);
2715 	}
2716 	/*
2717 	 * *vp has its use count incremented still.
2718 	 */
2719 
2720 	return (0);
2721 }
2722 
2723 /*
2724  * Resolve a directory to a pathname.
2725  *
2726  * The name of the directory can always be found in the namecache or fetched
2727  * from the filesystem. There is also guaranteed to be only one parent, meaning
2728  * we can just follow vnodes up until we find the root.
2729  *
2730  * The vnode must be referenced.
2731  */
2732 static int
2733 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
2734     size_t *len, bool slash_prefixed, size_t addend)
2735 {
2736 #ifdef KDTRACE_HOOKS
2737 	struct vnode *startvp = vp;
2738 #endif
2739 	struct vnode *vp1;
2740 	size_t buflen;
2741 	int error;
2742 
2743 	VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
2744 	VNPASS(vp->v_usecount > 0, vp);
2745 
2746 	buflen = *len;
2747 
2748 	if (!slash_prefixed) {
2749 		MPASS(*len >= 2);
2750 		buflen--;
2751 		buf[buflen] = '\0';
2752 	}
2753 
2754 	error = 0;
2755 
2756 	SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
2757 	counter_u64_add(numfullpathcalls, 1);
2758 	while (vp != rdir && vp != rootvnode) {
2759 		/*
2760 		 * The vp vnode must be already fully constructed,
2761 		 * since it is either found in namecache or obtained
2762 		 * from VOP_VPTOCNP().  We may test for VV_ROOT safely
2763 		 * without obtaining the vnode lock.
2764 		 */
2765 		if ((vp->v_vflag & VV_ROOT) != 0) {
2766 			vn_lock(vp, LK_RETRY | LK_SHARED);
2767 
2768 			/*
2769 			 * With the vnode locked, check for races with
2770 			 * unmount, forced or not.  Note that we
2771 			 * already verified that vp is not equal to
2772 			 * the root vnode, which means that
2773 			 * mnt_vnodecovered can be NULL only for the
2774 			 * case of unmount.
2775 			 */
2776 			if (VN_IS_DOOMED(vp) ||
2777 			    (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
2778 			    vp1->v_mountedhere != vp->v_mount) {
2779 				vput(vp);
2780 				error = ENOENT;
2781 				SDT_PROBE3(vfs, namecache, fullpath, return,
2782 				    error, vp, NULL);
2783 				break;
2784 			}
2785 
2786 			vref(vp1);
2787 			vput(vp);
2788 			vp = vp1;
2789 			continue;
2790 		}
2791 		if (vp->v_type != VDIR) {
2792 			vrele(vp);
2793 			counter_u64_add(numfullpathfail1, 1);
2794 			error = ENOTDIR;
2795 			SDT_PROBE3(vfs, namecache, fullpath, return,
2796 			    error, vp, NULL);
2797 			break;
2798 		}
2799 		error = vn_vptocnp(&vp, curthread->td_ucred, buf, &buflen);
2800 		if (error)
2801 			break;
2802 		if (buflen == 0) {
2803 			vrele(vp);
2804 			error = ENOMEM;
2805 			SDT_PROBE3(vfs, namecache, fullpath, return, error,
2806 			    startvp, NULL);
2807 			break;
2808 		}
2809 		buf[--buflen] = '/';
2810 		slash_prefixed = true;
2811 	}
2812 	if (error)
2813 		return (error);
2814 	if (!slash_prefixed) {
2815 		if (buflen == 0) {
2816 			vrele(vp);
2817 			counter_u64_add(numfullpathfail4, 1);
2818 			SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
2819 			    startvp, NULL);
2820 			return (ENOMEM);
2821 		}
2822 		buf[--buflen] = '/';
2823 	}
2824 	counter_u64_add(numfullpathfound, 1);
2825 	vrele(vp);
2826 
2827 	*retbuf = buf + buflen;
2828 	SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
2829 	*len -= buflen;
2830 	*len += addend;
2831 	return (0);
2832 }
2833 
2834 /*
2835  * Resolve an arbitrary vnode to a pathname.
2836  *
2837  * Note 2 caveats:
2838  * - hardlinks are not tracked, thus if the vnode is not a directory this can
2839  *   resolve to a different path than the one used to find it
2840  * - namecache is not mandatory, meaning names are not guaranteed to be added
2841  *   (in which case resolving fails)
2842  */
/*
 * Store the given line number in *reason.  Used through the
 * cache_rev_failed() wrapper below, which supplies the source line of the
 * call site.
 */
static __inline void
cache_rev_failed_impl(int *reason, int line)
{

	reason[0] = line;
}
#define cache_rev_failed(var)	cache_rev_failed_impl((var), __LINE__)
2850 
2851 static int
2852 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
2853     char **retbuf, size_t *buflen, bool slash_prefixed, size_t addend)
2854 {
2855 #ifdef KDTRACE_HOOKS
2856 	struct vnode *startvp = vp;
2857 #endif
2858 	struct vnode *tvp;
2859 	struct mount *mp;
2860 	struct namecache *ncp;
2861 	size_t orig_buflen;
2862 	int reason;
2863 	int error;
2864 #ifdef KDTRACE_HOOKS
2865 	int i;
2866 #endif
2867 	seqc_t vp_seqc, tvp_seqc;
2868 	u_char nc_flag;
2869 
2870 	VFS_SMR_ASSERT_ENTERED();
2871 
2872 	if (!cache_fast_revlookup) {
2873 		vfs_smr_exit();
2874 		return (-1);
2875 	}
2876 
2877 	orig_buflen = *buflen;
2878 
2879 	if (!slash_prefixed) {
2880 		MPASS(*buflen >= 2);
2881 		*buflen -= 1;
2882 		buf[*buflen] = '\0';
2883 	}
2884 
2885 	if (vp == rdir || vp == rootvnode) {
2886 		if (!slash_prefixed) {
2887 			*buflen -= 1;
2888 			buf[*buflen] = '/';
2889 		}
2890 		goto out_ok;
2891 	}
2892 
2893 #ifdef KDTRACE_HOOKS
2894 	i = 0;
2895 #endif
2896 	error = -1;
2897 	ncp = NULL; /* for sdt probe down below */
2898 	vp_seqc = vn_seqc_read_any(vp);
2899 	if (seqc_in_modify(vp_seqc)) {
2900 		cache_rev_failed(&reason);
2901 		goto out_abort;
2902 	}
2903 
2904 	for (;;) {
2905 #ifdef KDTRACE_HOOKS
2906 		i++;
2907 #endif
2908 		if ((vp->v_vflag & VV_ROOT) != 0) {
2909 			mp = atomic_load_ptr(&vp->v_mount);
2910 			if (mp == NULL) {
2911 				cache_rev_failed(&reason);
2912 				goto out_abort;
2913 			}
2914 			tvp = atomic_load_ptr(&mp->mnt_vnodecovered);
2915 			tvp_seqc = vn_seqc_read_any(tvp);
2916 			if (seqc_in_modify(tvp_seqc)) {
2917 				cache_rev_failed(&reason);
2918 				goto out_abort;
2919 			}
2920 			if (!vn_seqc_consistent(vp, vp_seqc)) {
2921 				cache_rev_failed(&reason);
2922 				goto out_abort;
2923 			}
2924 			vp = tvp;
2925 			vp_seqc = tvp_seqc;
2926 			continue;
2927 		}
2928 		ncp = atomic_load_ptr(&vp->v_cache_dd);
2929 		if (ncp == NULL) {
2930 			cache_rev_failed(&reason);
2931 			goto out_abort;
2932 		}
2933 		nc_flag = atomic_load_char(&ncp->nc_flag);
2934 		if ((nc_flag & NCF_ISDOTDOT) != 0) {
2935 			cache_rev_failed(&reason);
2936 			goto out_abort;
2937 		}
2938 		if (!cache_ncp_canuse(ncp)) {
2939 			cache_rev_failed(&reason);
2940 			goto out_abort;
2941 		}
2942 		if (ncp->nc_nlen >= *buflen) {
2943 			cache_rev_failed(&reason);
2944 			error = ENOMEM;
2945 			goto out_abort;
2946 		}
2947 		*buflen -= ncp->nc_nlen;
2948 		memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
2949 		*buflen -= 1;
2950 		buf[*buflen] = '/';
2951 		tvp = ncp->nc_dvp;
2952 		tvp_seqc = vn_seqc_read_any(tvp);
2953 		if (seqc_in_modify(tvp_seqc)) {
2954 			cache_rev_failed(&reason);
2955 			goto out_abort;
2956 		}
2957 		if (!vn_seqc_consistent(vp, vp_seqc)) {
2958 			cache_rev_failed(&reason);
2959 			goto out_abort;
2960 		}
2961 		vp = tvp;
2962 		vp_seqc = tvp_seqc;
2963 		if (vp == rdir || vp == rootvnode)
2964 			break;
2965 	}
2966 out_ok:
2967 	vfs_smr_exit();
2968 	*retbuf = buf + *buflen;
2969 	*buflen = orig_buflen - *buflen + addend;
2970 	SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf);
2971 	return (0);
2972 
2973 out_abort:
2974 	*buflen = orig_buflen;
2975 	SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i);
2976 	vfs_smr_exit();
2977 	return (error);
2978 }
2979 
2980 static int
2981 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
2982     size_t *buflen)
2983 {
2984 	size_t orig_buflen;
2985 	bool slash_prefixed;
2986 	int error;
2987 
2988 	if (*buflen < 2)
2989 		return (EINVAL);
2990 
2991 	orig_buflen = *buflen;
2992 
2993 	vref(vp);
2994 	slash_prefixed = false;
2995 	if (vp->v_type != VDIR) {
2996 		*buflen -= 1;
2997 		buf[*buflen] = '\0';
2998 		error = vn_vptocnp(&vp, curthread->td_ucred, buf, buflen);
2999 		if (error)
3000 			return (error);
3001 		if (*buflen == 0) {
3002 			vrele(vp);
3003 			return (ENOMEM);
3004 		}
3005 		*buflen -= 1;
3006 		buf[*buflen] = '/';
3007 		slash_prefixed = true;
3008 	}
3009 
3010 	return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, slash_prefixed,
3011 	    orig_buflen - *buflen));
3012 }
3013 
3014 /*
3015  * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
3016  *
3017  * Since the namecache does not track handlings, the caller is expected to first
3018  * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei.
3019  *
3020  * Then we have 2 cases:
3021  * - if the found vnode is a directory, the path can be constructed just by
3022  *   fullowing names up the chain
3023  * - otherwise we populate the buffer with the saved name and start resolving
3024  *   from the parent
3025  */
3026 static int
3027 vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, char **freebuf,
3028     size_t *buflen)
3029 {
3030 	char *buf, *tmpbuf;
3031 	struct pwd *pwd;
3032 	struct componentname *cnp;
3033 	struct vnode *vp;
3034 	size_t addend;
3035 	int error;
3036 	bool slash_prefixed;
3037 	enum vtype type;
3038 
3039 	if (*buflen < 2)
3040 		return (EINVAL);
3041 	if (*buflen > MAXPATHLEN)
3042 		*buflen = MAXPATHLEN;
3043 
3044 	slash_prefixed = false;
3045 
3046 	buf = malloc(*buflen, M_TEMP, M_WAITOK);
3047 
3048 	addend = 0;
3049 	vp = ndp->ni_vp;
3050 	/*
3051 	 * Check for VBAD to work around the vp_crossmp bug in lookup().
3052 	 *
3053 	 * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be
3054 	 * set to mount point's root vnode while ni_dvp will be vp_crossmp.
3055 	 * If the type is VDIR (like in this very case) we can skip looking
3056 	 * at ni_dvp in the first place. However, since vnodes get passed here
3057 	 * unlocked the target may transition to doomed state (type == VBAD)
3058 	 * before we get to evaluate the condition. If this happens, we will
3059 	 * populate part of the buffer and descend to vn_fullpath_dir with
3060 	 * vp == vp_crossmp. Prevent the problem by checking for VBAD.
3061 	 *
3062 	 * This should be atomic_load(&vp->v_type) but it is ilegal to take
3063 	 * an address of a bit field, even if said field is sized to char.
3064 	 * Work around the problem by reading the value into a full-sized enum
3065 	 * and then re-reading it with atomic_load which will still prevent
3066 	 * the compiler from re-reading down the road.
3067 	 */
3068 	type = vp->v_type;
3069 	type = atomic_load_int(&type);
3070 	if (type == VBAD) {
3071 		error = ENOENT;
3072 		goto out_bad;
3073 	}
3074 	if (type != VDIR) {
3075 		cnp = &ndp->ni_cnd;
3076 		addend = cnp->cn_namelen + 2;
3077 		if (*buflen < addend) {
3078 			error = ENOMEM;
3079 			goto out_bad;
3080 		}
3081 		*buflen -= addend;
3082 		tmpbuf = buf + *buflen;
3083 		tmpbuf[0] = '/';
3084 		memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen);
3085 		tmpbuf[addend - 1] = '\0';
3086 		slash_prefixed = true;
3087 		vp = ndp->ni_dvp;
3088 	}
3089 
3090 	vfs_smr_enter();
3091 	pwd = pwd_get_smr();
3092 	error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3093 	    slash_prefixed, addend);
3094 	VFS_SMR_ASSERT_NOT_ENTERED();
3095 	if (error < 0) {
3096 		pwd = pwd_hold(curthread);
3097 		vref(vp);
3098 		error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3099 		    slash_prefixed, addend);
3100 		pwd_drop(pwd);
3101 		if (error != 0)
3102 			goto out_bad;
3103 	}
3104 
3105 	*freebuf = buf;
3106 
3107 	return (0);
3108 out_bad:
3109 	free(buf, M_TEMP);
3110 	return (error);
3111 }
3112 
3113 struct vnode *
3114 vn_dir_dd_ino(struct vnode *vp)
3115 {
3116 	struct namecache *ncp;
3117 	struct vnode *ddvp;
3118 	struct mtx *vlp;
3119 	enum vgetstate vs;
3120 
3121 	ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
3122 	vlp = VP2VNODELOCK(vp);
3123 	mtx_lock(vlp);
3124 	TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
3125 		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
3126 			continue;
3127 		ddvp = ncp->nc_dvp;
3128 		vs = vget_prep(ddvp);
3129 		mtx_unlock(vlp);
3130 		if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
3131 			return (NULL);
3132 		return (ddvp);
3133 	}
3134 	mtx_unlock(vlp);
3135 	return (NULL);
3136 }
3137 
3138 int
3139 vn_commname(struct vnode *vp, char *buf, u_int buflen)
3140 {
3141 	struct namecache *ncp;
3142 	struct mtx *vlp;
3143 	int l;
3144 
3145 	vlp = VP2VNODELOCK(vp);
3146 	mtx_lock(vlp);
3147 	TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
3148 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
3149 			break;
3150 	if (ncp == NULL) {
3151 		mtx_unlock(vlp);
3152 		return (ENOENT);
3153 	}
3154 	l = min(ncp->nc_nlen, buflen - 1);
3155 	memcpy(buf, ncp->nc_name, l);
3156 	mtx_unlock(vlp);
3157 	buf[l] = '\0';
3158 	return (0);
3159 }
3160 
3161 /*
3162  * This function updates path string to vnode's full global path
3163  * and checks the size of the new path string against the pathlen argument.
3164  *
3165  * Requires a locked, referenced vnode.
3166  * Vnode is re-locked on success or ENODEV, otherwise unlocked.
3167  *
3168  * If vp is a directory, the call to vn_fullpath_global() always succeeds
3169  * because it falls back to the ".." lookup if the namecache lookup fails.
3170  */
3171 int
3172 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
3173     u_int pathlen)
3174 {
3175 	struct nameidata nd;
3176 	struct vnode *vp1;
3177 	char *rpath, *fbuf;
3178 	int error;
3179 
3180 	ASSERT_VOP_ELOCKED(vp, __func__);
3181 
3182 	/* Construct global filesystem path from vp. */
3183 	VOP_UNLOCK(vp);
3184 	error = vn_fullpath_global(vp, &rpath, &fbuf);
3185 
3186 	if (error != 0) {
3187 		vrele(vp);
3188 		return (error);
3189 	}
3190 
3191 	if (strlen(rpath) >= pathlen) {
3192 		vrele(vp);
3193 		error = ENAMETOOLONG;
3194 		goto out;
3195 	}
3196 
3197 	/*
3198 	 * Re-lookup the vnode by path to detect a possible rename.
3199 	 * As a side effect, the vnode is relocked.
3200 	 * If vnode was renamed, return ENOENT.
3201 	 */
3202 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
3203 	    UIO_SYSSPACE, path, td);
3204 	error = namei(&nd);
3205 	if (error != 0) {
3206 		vrele(vp);
3207 		goto out;
3208 	}
3209 	NDFREE(&nd, NDF_ONLY_PNBUF);
3210 	vp1 = nd.ni_vp;
3211 	vrele(vp);
3212 	if (vp1 == vp)
3213 		strcpy(path, rpath);
3214 	else {
3215 		vput(vp1);
3216 		error = ENOENT;
3217 	}
3218 
3219 out:
3220 	free(fbuf, M_TEMP);
3221 	return (error);
3222 }
3223 
3224 #ifdef DDB
3225 static void
3226 db_print_vpath(struct vnode *vp)
3227 {
3228 
3229 	while (vp != NULL) {
3230 		db_printf("%p: ", vp);
3231 		if (vp == rootvnode) {
3232 			db_printf("/");
3233 			vp = NULL;
3234 		} else {
3235 			if (vp->v_vflag & VV_ROOT) {
3236 				db_printf("<mount point>");
3237 				vp = vp->v_mount->mnt_vnodecovered;
3238 			} else {
3239 				struct namecache *ncp;
3240 				char *ncn;
3241 				int i;
3242 
3243 				ncp = TAILQ_FIRST(&vp->v_cache_dst);
3244 				if (ncp != NULL) {
3245 					ncn = ncp->nc_name;
3246 					for (i = 0; i < ncp->nc_nlen; i++)
3247 						db_printf("%c", *ncn++);
3248 					vp = ncp->nc_dvp;
3249 				} else {
3250 					vp = NULL;
3251 				}
3252 			}
3253 		}
3254 		db_printf("\n");
3255 	}
3256 
3257 	return;
3258 }
3259 
3260 DB_SHOW_COMMAND(vpath, db_show_vpath)
3261 {
3262 	struct vnode *vp;
3263 
3264 	if (!have_addr) {
3265 		db_printf("usage: show vpath <struct vnode *>\n");
3266 		return;
3267 	}
3268 
3269 	vp = (struct vnode *)addr;
3270 	db_print_vpath(vp);
3271 }
3272 
3273 #endif
3274 
/*
 * Run-time switch for the fast path lookup, exported read-write as the
 * vfs.cache_fast_lookup sysctl.  When false, cache_can_fplookup() refuses
 * the fast path.
 */
static bool __read_frequently cache_fast_lookup = true;
SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW,
    &cache_fast_lookup, 0, "");

/*
 * Returned by the fast path helpers when they did not produce a final result
 * (the lookup was aborted or only partially performed).  Never a valid
 * "handled" result, see cache_fpl_handled_impl().
 */
#define CACHE_FPL_FAILED	-2020
3280 
3281 static void
3282 cache_fpl_cleanup_cnp(struct componentname *cnp)
3283 {
3284 
3285 	uma_zfree(namei_zone, cnp->cn_pnbuf);
3286 #ifdef DIAGNOSTIC
3287 	cnp->cn_pnbuf = NULL;
3288 	cnp->cn_nameptr = NULL;
3289 #endif
3290 }
3291 
3292 static void
3293 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp)
3294 {
3295 	struct componentname *cnp;
3296 
3297 	cnp = &ndp->ni_cnd;
3298 	while (*(cnp->cn_nameptr) == '/') {
3299 		cnp->cn_nameptr++;
3300 		ndp->ni_pathlen--;
3301 	}
3302 
3303 	*dpp = ndp->ni_rootdir;
3304 }
3305 
/*
 * Components of nameidata (or objects it can point to) which may
 * need restoring in case fast path lookup fails.
 *
 * Filled by cache_fpl_checkpoint() and written back by cache_fpl_restore().
 */
struct nameidata_saved {
	long cn_namelen;	/* copy of ni_cnd.cn_namelen */
	char *cn_nameptr;	/* copy of ni_cnd.cn_nameptr */
	size_t ni_pathlen;	/* copy of ni_pathlen */
	int cn_flags;		/* copy of ni_cnd.cn_flags */
};
3316 
/*
 * State of one fast path lookup, passed to all cache_fpl and cache_fplookup
 * routines.
 */
struct cache_fpl {
	struct nameidata *ndp;		/* lookup being performed */
	struct componentname *cnp;	/* component state; presumably
					   &ndp->ni_cnd, set up by the caller */
	struct pwd *pwd;		/* pwd the lookup started with, compared
					   against the current one in
					   cache_fplookup_partial_setup() */
	struct vnode *dvp;		/* directory currently searched */
	struct vnode *tvp;		/* vnode found for the component */
	seqc_t dvp_seqc;		/* sequence counter read for dvp */
	seqc_t tvp_seqc;		/* sequence counter read for tvp */
	struct nameidata_saved snd;	/* last checkpoint of the parse state */
	int line;			/* __LINE__ where status was last set */
	enum cache_fpl_status status:8;	/* outcome, see cache_fpl_*_impl() */
	bool in_smr;			/* maintained by cache_fpl_smr_enter*()
					   and cache_fpl_smr_exit() */
};
3330 
3331 static void
3332 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd)
3333 {
3334 
3335 	snd->cn_flags = fpl->ndp->ni_cnd.cn_flags;
3336 	snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen;
3337 	snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
3338 	snd->ni_pathlen = fpl->ndp->ni_pathlen;
3339 }
3340 
3341 static void
3342 cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd)
3343 {
3344 
3345 	fpl->ndp->ni_cnd.cn_flags = snd->cn_flags;
3346 	fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen;
3347 	fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr;
3348 	fpl->ndp->ni_pathlen = snd->ni_pathlen;
3349 }
3350 
/*
 * Assertions on the SMR state of a lookup.  Both the bookkeeping in
 * fpl->in_smr and the VFS_SMR_ASSERT_* check are performed.  Without
 * INVARIANTS they expand to nothing.
 */
#ifdef INVARIANTS
#define cache_fpl_smr_assert_entered(fpl) ({			\
	struct cache_fpl *_fpl = (fpl);				\
	MPASS(_fpl->in_smr == true);				\
	VFS_SMR_ASSERT_ENTERED();				\
})
#define cache_fpl_smr_assert_not_entered(fpl) ({		\
	struct cache_fpl *_fpl = (fpl);				\
	MPASS(_fpl->in_smr == false);				\
	VFS_SMR_ASSERT_NOT_ENTERED();				\
})
#else
#define cache_fpl_smr_assert_entered(fpl) do { } while (0)
#define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
#endif
3366 
/*
 * Enter the SMR section for the first time.  Unlike cache_fpl_smr_enter()
 * the previous value of in_smr is not checked.
 */
#define cache_fpl_smr_enter_initial(fpl) ({			\
	struct cache_fpl *_fpl = (fpl);				\
	vfs_smr_enter();					\
	_fpl->in_smr = true;					\
})

/*
 * Enter the SMR section; the lookup must not be in one already.
 */
#define cache_fpl_smr_enter(fpl) ({				\
	struct cache_fpl *_fpl = (fpl);				\
	MPASS(_fpl->in_smr == false);				\
	vfs_smr_enter();					\
	_fpl->in_smr = true;					\
})

/*
 * Leave the SMR section; the lookup must currently be in one.
 */
#define cache_fpl_smr_exit(fpl) ({				\
	struct cache_fpl *_fpl = (fpl);				\
	MPASS(_fpl->in_smr == true);				\
	vfs_smr_exit();						\
	_fpl->in_smr = false;					\
})
3386 
3387 static int
3388 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
3389 {
3390 
3391 	if (fpl->status != CACHE_FPL_STATUS_UNSET) {
3392 		KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
3393 		    ("%s: converting to abort from %d at %d, set at %d\n",
3394 		    __func__, fpl->status, line, fpl->line));
3395 	}
3396 	fpl->status = CACHE_FPL_STATUS_ABORTED;
3397 	fpl->line = line;
3398 	return (CACHE_FPL_FAILED);
3399 }
3400 
3401 #define cache_fpl_aborted(x)	cache_fpl_aborted_impl((x), __LINE__)
3402 
3403 static int
3404 cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
3405 {
3406 
3407 	KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3408 	    ("%s: setting to partial at %d, but already set to %d at %d\n",
3409 	    __func__, line, fpl->status, fpl->line));
3410 	cache_fpl_smr_assert_entered(fpl);
3411 	fpl->status = CACHE_FPL_STATUS_PARTIAL;
3412 	fpl->line = line;
3413 	return (CACHE_FPL_FAILED);
3414 }
3415 
3416 #define cache_fpl_partial(x)	cache_fpl_partial_impl((x), __LINE__)
3417 
3418 static int
3419 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line)
3420 {
3421 
3422 	KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3423 	    ("%s: setting to handled at %d, but already set to %d at %d\n",
3424 	    __func__, line, fpl->status, fpl->line));
3425 	cache_fpl_smr_assert_not_entered(fpl);
3426 	MPASS(error != CACHE_FPL_FAILED);
3427 	fpl->status = CACHE_FPL_STATUS_HANDLED;
3428 	fpl->line = line;
3429 	return (error);
3430 }
3431 
3432 #define cache_fpl_handled(x, e)	cache_fpl_handled_impl((x), (e), __LINE__)
3433 
3434 #define CACHE_FPL_SUPPORTED_CN_FLAGS \
3435 	(LOCKLEAF | LOCKPARENT | WANTPARENT | NOCACHE | FOLLOW | LOCKSHARED | SAVENAME | \
3436 	 SAVESTART | WILLBEDIR | ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK)
3437 
3438 #define CACHE_FPL_INTERNAL_CN_FLAGS \
3439 	(ISDOTDOT | MAKEENTRY | ISLASTCN)
3440 
3441 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
3442     "supported and internal flags overlap");
3443 
3444 static bool
3445 cache_fpl_islastcn(struct nameidata *ndp)
3446 {
3447 
3448 	return (*ndp->ni_next == 0);
3449 }
3450 
3451 static bool
3452 cache_fpl_isdotdot(struct componentname *cnp)
3453 {
3454 
3455 	if (cnp->cn_namelen == 2 &&
3456 	    cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
3457 		return (true);
3458 	return (false);
3459 }
3460 
3461 static bool
3462 cache_can_fplookup(struct cache_fpl *fpl)
3463 {
3464 	struct nameidata *ndp;
3465 	struct componentname *cnp;
3466 	struct thread *td;
3467 
3468 	ndp = fpl->ndp;
3469 	cnp = fpl->cnp;
3470 	td = cnp->cn_thread;
3471 
3472 	if (!cache_fast_lookup) {
3473 		cache_fpl_aborted(fpl);
3474 		return (false);
3475 	}
3476 #ifdef MAC
3477 	if (mac_vnode_check_lookup_enabled()) {
3478 		cache_fpl_aborted(fpl);
3479 		return (false);
3480 	}
3481 #endif
3482 	if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
3483 		cache_fpl_aborted(fpl);
3484 		return (false);
3485 	}
3486 	if (ndp->ni_dirfd != AT_FDCWD) {
3487 		cache_fpl_aborted(fpl);
3488 		return (false);
3489 	}
3490 	if (IN_CAPABILITY_MODE(td)) {
3491 		cache_fpl_aborted(fpl);
3492 		return (false);
3493 	}
3494 	if (AUDITING_TD(td)) {
3495 		cache_fpl_aborted(fpl);
3496 		return (false);
3497 	}
3498 	if (ndp->ni_startdir != NULL) {
3499 		cache_fpl_aborted(fpl);
3500 		return (false);
3501 	}
3502 	return (true);
3503 }
3504 
3505 static bool
3506 cache_fplookup_vnode_supported(struct vnode *vp)
3507 {
3508 
3509 	return (vp->v_type != VLNK);
3510 }
3511 
3512 /*
3513  * Move a negative entry to the hot list.
3514  *
3515  * We have to take locks, but they may be contended and in the worst
3516  * case we may need to go off CPU. We don't want to spin within the
3517  * smr section and we can't block with it. Instead we are going to
3518  * look up the entry again.
3519  */
3520 static int __noinline
3521 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
3522     uint32_t hash)
3523 {
3524 	struct componentname *cnp;
3525 	struct namecache *ncp;
3526 	struct neglist *neglist;
3527 	struct negstate *negstate;
3528 	struct vnode *dvp;
3529 	u_char nc_flag;
3530 
3531 	cnp = fpl->cnp;
3532 	dvp = fpl->dvp;
3533 
3534 	if (!vhold_smr(dvp))
3535 		return (cache_fpl_aborted(fpl));
3536 
3537 	neglist = NCP2NEGLIST(oncp);
3538 	cache_fpl_smr_exit(fpl);
3539 
3540 	mtx_lock(&ncneg_hot.nl_lock);
3541 	mtx_lock(&neglist->nl_lock);
3542 	/*
3543 	 * For hash iteration.
3544 	 */
3545 	cache_fpl_smr_enter(fpl);
3546 
3547 	/*
3548 	 * Avoid all surprises by only succeeding if we got the same entry and
3549 	 * bailing completely otherwise.
3550 	 *
3551 	 * In particular at this point there can be a new ncp which matches the
3552 	 * search but hashes to a different neglist.
3553 	 */
3554 	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
3555 		if (ncp == oncp)
3556 			break;
3557 	}
3558 
3559 	/*
3560 	 * No match to begin with.
3561 	 */
3562 	if (__predict_false(ncp == NULL)) {
3563 		goto out_abort;
3564 	}
3565 
3566 	/*
3567 	 * The newly found entry may be something different...
3568 	 */
3569 	if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
3570 	    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
3571 		goto out_abort;
3572 	}
3573 
3574 	/*
3575 	 * ... and not even negative.
3576 	 */
3577 	nc_flag = atomic_load_char(&ncp->nc_flag);
3578 	if ((nc_flag & NCF_NEGATIVE) == 0) {
3579 		goto out_abort;
3580 	}
3581 
3582 	if (__predict_false(!cache_ncp_canuse(ncp))) {
3583 		goto out_abort;
3584 	}
3585 
3586 	negstate = NCP2NEGSTATE(ncp);
3587 	if ((negstate->neg_flag & NEG_HOT) == 0) {
3588 		numhotneg++;
3589 		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
3590 		TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
3591 		negstate->neg_flag |= NEG_HOT;
3592 	}
3593 
3594 	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
3595 	counter_u64_add(numneghits, 1);
3596 	cache_fpl_smr_exit(fpl);
3597 	mtx_unlock(&neglist->nl_lock);
3598 	mtx_unlock(&ncneg_hot.nl_lock);
3599 	vdrop(dvp);
3600 	return (cache_fpl_handled(fpl, ENOENT));
3601 out_abort:
3602 	cache_fpl_smr_exit(fpl);
3603 	mtx_unlock(&neglist->nl_lock);
3604 	mtx_unlock(&ncneg_hot.nl_lock);
3605 	vdrop(dvp);
3606 	return (cache_fpl_aborted(fpl));
3607 }
3608 
3609 /*
3610  * The target vnode is not supported, prepare for the slow path to take over.
3611  */
3612 static int __noinline
3613 cache_fplookup_partial_setup(struct cache_fpl *fpl)
3614 {
3615 	struct nameidata *ndp;
3616 	struct componentname *cnp;
3617 	enum vgetstate dvs;
3618 	struct vnode *dvp;
3619 	struct pwd *pwd;
3620 	seqc_t dvp_seqc;
3621 
3622 	ndp = fpl->ndp;
3623 	cnp = fpl->cnp;
3624 	dvp = fpl->dvp;
3625 	dvp_seqc = fpl->dvp_seqc;
3626 
3627 	dvs = vget_prep_smr(dvp);
3628 	if (__predict_false(dvs == VGET_NONE)) {
3629 		cache_fpl_smr_exit(fpl);
3630 		return (cache_fpl_aborted(fpl));
3631 	}
3632 
3633 	cache_fpl_smr_exit(fpl);
3634 
3635 	vget_finish_ref(dvp, dvs);
3636 	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3637 		vrele(dvp);
3638 		return (cache_fpl_aborted(fpl));
3639 	}
3640 
3641 	pwd = pwd_hold(curthread);
3642 	if (fpl->pwd != pwd) {
3643 		vrele(dvp);
3644 		pwd_drop(pwd);
3645 		return (cache_fpl_aborted(fpl));
3646 	}
3647 
3648 	cache_fpl_restore(fpl, &fpl->snd);
3649 
3650 	ndp->ni_startdir = dvp;
3651 	cnp->cn_flags |= MAKEENTRY;
3652 	if (cache_fpl_islastcn(ndp))
3653 		cnp->cn_flags |= ISLASTCN;
3654 	if (cache_fpl_isdotdot(cnp))
3655 		cnp->cn_flags |= ISDOTDOT;
3656 
3657 	return (0);
3658 }
3659 
3660 static int
3661 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs)
3662 {
3663 	struct componentname *cnp;
3664 	struct vnode *tvp;
3665 	seqc_t tvp_seqc;
3666 	int error, lkflags;
3667 
3668 	cnp = fpl->cnp;
3669 	tvp = fpl->tvp;
3670 	tvp_seqc = fpl->tvp_seqc;
3671 
3672 	if ((cnp->cn_flags & LOCKLEAF) != 0) {
3673 		lkflags = LK_SHARED;
3674 		if ((cnp->cn_flags & LOCKSHARED) == 0)
3675 			lkflags = LK_EXCLUSIVE;
3676 		error = vget_finish(tvp, lkflags, tvs);
3677 		if (__predict_false(error != 0)) {
3678 			return (cache_fpl_aborted(fpl));
3679 		}
3680 	} else {
3681 		vget_finish_ref(tvp, tvs);
3682 	}
3683 
3684 	if (!vn_seqc_consistent(tvp, tvp_seqc)) {
3685 		if ((cnp->cn_flags & LOCKLEAF) != 0)
3686 			vput(tvp);
3687 		else
3688 			vrele(tvp);
3689 		return (cache_fpl_aborted(fpl));
3690 	}
3691 
3692 	return (cache_fpl_handled(fpl, 0));
3693 }
3694 
3695 /*
3696  * They want to possibly modify the state of the namecache.
3697  *
3698  * Don't try to match the API contract, just leave.
3699  * TODO: this leaves scalability on the table
3700  */
3701 static int
3702 cache_fplookup_final_modifying(struct cache_fpl *fpl)
3703 {
3704 	struct componentname *cnp;
3705 
3706 	cnp = fpl->cnp;
3707 	MPASS(cnp->cn_nameiop != LOOKUP);
3708 	return (cache_fpl_partial(fpl));
3709 }
3710 
3711 static int __noinline
3712 cache_fplookup_final_withparent(struct cache_fpl *fpl)
3713 {
3714 	struct componentname *cnp;
3715 	enum vgetstate dvs, tvs;
3716 	struct vnode *dvp, *tvp;
3717 	seqc_t dvp_seqc;
3718 	int error;
3719 
3720 	cnp = fpl->cnp;
3721 	dvp = fpl->dvp;
3722 	dvp_seqc = fpl->dvp_seqc;
3723 	tvp = fpl->tvp;
3724 
3725 	MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0);
3726 
3727 	/*
3728 	 * This is less efficient than it can be for simplicity.
3729 	 */
3730 	dvs = vget_prep_smr(dvp);
3731 	if (__predict_false(dvs == VGET_NONE)) {
3732 		return (cache_fpl_aborted(fpl));
3733 	}
3734 	tvs = vget_prep_smr(tvp);
3735 	if (__predict_false(tvs == VGET_NONE)) {
3736 		cache_fpl_smr_exit(fpl);
3737 		vget_abort(dvp, dvs);
3738 		return (cache_fpl_aborted(fpl));
3739 	}
3740 
3741 	cache_fpl_smr_exit(fpl);
3742 
3743 	if ((cnp->cn_flags & LOCKPARENT) != 0) {
3744 		error = vget_finish(dvp, LK_EXCLUSIVE, dvs);
3745 		if (__predict_false(error != 0)) {
3746 			vget_abort(tvp, tvs);
3747 			return (cache_fpl_aborted(fpl));
3748 		}
3749 	} else {
3750 		vget_finish_ref(dvp, dvs);
3751 	}
3752 
3753 	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3754 		vget_abort(tvp, tvs);
3755 		if ((cnp->cn_flags & LOCKPARENT) != 0)
3756 			vput(dvp);
3757 		else
3758 			vrele(dvp);
3759 		return (cache_fpl_aborted(fpl));
3760 	}
3761 
3762 	error = cache_fplookup_final_child(fpl, tvs);
3763 	if (__predict_false(error != 0)) {
3764 		MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED);
3765 		if ((cnp->cn_flags & LOCKPARENT) != 0)
3766 			vput(dvp);
3767 		else
3768 			vrele(dvp);
3769 		return (error);
3770 	}
3771 
3772 	MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED);
3773 	return (0);
3774 }
3775 
3776 static int
3777 cache_fplookup_final(struct cache_fpl *fpl)
3778 {
3779 	struct componentname *cnp;
3780 	enum vgetstate tvs;
3781 	struct vnode *dvp, *tvp;
3782 	seqc_t dvp_seqc;
3783 
3784 	cnp = fpl->cnp;
3785 	dvp = fpl->dvp;
3786 	dvp_seqc = fpl->dvp_seqc;
3787 	tvp = fpl->tvp;
3788 
3789 	VNPASS(cache_fplookup_vnode_supported(dvp), dvp);
3790 
3791 	if (cnp->cn_nameiop != LOOKUP) {
3792 		return (cache_fplookup_final_modifying(fpl));
3793 	}
3794 
3795 	if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0)
3796 		return (cache_fplookup_final_withparent(fpl));
3797 
3798 	tvs = vget_prep_smr(tvp);
3799 	if (__predict_false(tvs == VGET_NONE)) {
3800 		return (cache_fpl_partial(fpl));
3801 	}
3802 
3803 	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3804 		cache_fpl_smr_exit(fpl);
3805 		vget_abort(tvp, tvs);
3806 		return (cache_fpl_aborted(fpl));
3807 	}
3808 
3809 	cache_fpl_smr_exit(fpl);
3810 	return (cache_fplookup_final_child(fpl, tvs));
3811 }
3812 
3813 static int __noinline
3814 cache_fplookup_dot(struct cache_fpl *fpl)
3815 {
3816 	struct vnode *dvp;
3817 
3818 	dvp = fpl->dvp;
3819 
3820 	fpl->tvp = dvp;
3821 	fpl->tvp_seqc = vn_seqc_read_any(dvp);
3822 	if (seqc_in_modify(fpl->tvp_seqc)) {
3823 		return (cache_fpl_aborted(fpl));
3824 	}
3825 
3826 	counter_u64_add(dothits, 1);
3827 	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", dvp);
3828 
3829 	return (0);
3830 }
3831 
3832 static int __noinline
3833 cache_fplookup_dotdot(struct cache_fpl *fpl)
3834 {
3835 	struct nameidata *ndp;
3836 	struct componentname *cnp;
3837 	struct namecache *ncp;
3838 	struct vnode *dvp;
3839 	struct prison *pr;
3840 	u_char nc_flag;
3841 
3842 	ndp = fpl->ndp;
3843 	cnp = fpl->cnp;
3844 	dvp = fpl->dvp;
3845 
3846 	/*
3847 	 * XXX this is racy the same way regular lookup is
3848 	 */
3849 	for (pr = cnp->cn_cred->cr_prison; pr != NULL;
3850 	    pr = pr->pr_parent)
3851 		if (dvp == pr->pr_root)
3852 			break;
3853 
3854 	if (dvp == ndp->ni_rootdir ||
3855 	    dvp == ndp->ni_topdir ||
3856 	    dvp == rootvnode ||
3857 	    pr != NULL) {
3858 		fpl->tvp = dvp;
3859 		fpl->tvp_seqc = vn_seqc_read_any(dvp);
3860 		if (seqc_in_modify(fpl->tvp_seqc)) {
3861 			return (cache_fpl_aborted(fpl));
3862 		}
3863 		return (0);
3864 	}
3865 
3866 	if ((dvp->v_vflag & VV_ROOT) != 0) {
3867 		/*
3868 		 * TODO
3869 		 * The opposite of climb mount is needed here.
3870 		 */
3871 		return (cache_fpl_aborted(fpl));
3872 	}
3873 
3874 	ncp = atomic_load_ptr(&dvp->v_cache_dd);
3875 	if (ncp == NULL) {
3876 		return (cache_fpl_aborted(fpl));
3877 	}
3878 
3879 	nc_flag = atomic_load_char(&ncp->nc_flag);
3880 	if ((nc_flag & NCF_ISDOTDOT) != 0) {
3881 		if ((nc_flag & NCF_NEGATIVE) != 0)
3882 			return (cache_fpl_aborted(fpl));
3883 		fpl->tvp = ncp->nc_vp;
3884 	} else {
3885 		fpl->tvp = ncp->nc_dvp;
3886 	}
3887 
3888 	if (__predict_false(!cache_ncp_canuse(ncp))) {
3889 		return (cache_fpl_aborted(fpl));
3890 	}
3891 
3892 	fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp);
3893 	if (seqc_in_modify(fpl->tvp_seqc)) {
3894 		return (cache_fpl_partial(fpl));
3895 	}
3896 
3897 	counter_u64_add(dotdothits, 1);
3898 	return (0);
3899 }
3900 
3901 static int
3902 cache_fplookup_next(struct cache_fpl *fpl)
3903 {
3904 	struct componentname *cnp;
3905 	struct namecache *ncp;
3906 	struct negstate *negstate;
3907 	struct vnode *dvp, *tvp;
3908 	u_char nc_flag;
3909 	uint32_t hash;
3910 	bool neg_hot;
3911 
3912 	cnp = fpl->cnp;
3913 	dvp = fpl->dvp;
3914 
3915 	if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) {
3916 		return (cache_fplookup_dot(fpl));
3917 	}
3918 
3919 	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
3920 
3921 	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
3922 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
3923 		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
3924 			break;
3925 	}
3926 
3927 	/*
3928 	 * If there is no entry we have to punt to the slow path to perform
3929 	 * actual lookup. Should there be nothing with this name a negative
3930 	 * entry will be created.
3931 	 */
3932 	if (__predict_false(ncp == NULL)) {
3933 		return (cache_fpl_partial(fpl));
3934 	}
3935 
3936 	tvp = atomic_load_ptr(&ncp->nc_vp);
3937 	nc_flag = atomic_load_char(&ncp->nc_flag);
3938 	if ((nc_flag & NCF_NEGATIVE) != 0) {
3939 		/*
3940 		 * If they want to create an entry we need to replace this one.
3941 		 */
3942 		if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) {
3943 			return (cache_fpl_partial(fpl));
3944 		}
3945 		negstate = NCP2NEGSTATE(ncp);
3946 		neg_hot = ((negstate->neg_flag & NEG_HOT) != 0);
3947 		if (__predict_false(!cache_ncp_canuse(ncp))) {
3948 			return (cache_fpl_partial(fpl));
3949 		}
3950 		if (__predict_false((nc_flag & NCF_WHITE) != 0)) {
3951 			return (cache_fpl_partial(fpl));
3952 		}
3953 		if (!neg_hot) {
3954 			return (cache_fplookup_negative_promote(fpl, ncp, hash));
3955 		}
3956 		SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
3957 		    ncp->nc_name);
3958 		counter_u64_add(numneghits, 1);
3959 		cache_fpl_smr_exit(fpl);
3960 		return (cache_fpl_handled(fpl, ENOENT));
3961 	}
3962 
3963 	if (__predict_false(!cache_ncp_canuse(ncp))) {
3964 		return (cache_fpl_partial(fpl));
3965 	}
3966 
3967 	fpl->tvp = tvp;
3968 	fpl->tvp_seqc = vn_seqc_read_any(tvp);
3969 	if (seqc_in_modify(fpl->tvp_seqc)) {
3970 		return (cache_fpl_partial(fpl));
3971 	}
3972 
3973 	if (!cache_fplookup_vnode_supported(tvp)) {
3974 		return (cache_fpl_partial(fpl));
3975 	}
3976 
3977 	counter_u64_add(numposhits, 1);
3978 	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
3979 	return (0);
3980 }
3981 
3982 static bool
3983 cache_fplookup_mp_supported(struct mount *mp)
3984 {
3985 
3986 	if (mp == NULL)
3987 		return (false);
3988 	if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
3989 		return (false);
3990 	return (true);
3991 }
3992 
3993 /*
3994  * Walk up the mount stack (if any).
3995  *
3996  * Correctness is provided in the following ways:
3997  * - all vnodes are protected from freeing with SMR
3998  * - struct mount objects are type stable making them always safe to access
3999  * - stability of the particular mount is provided by busying it
4000  * - relationship between the vnode which is mounted on and the mount is
4001  *   verified with the vnode sequence counter after busying
4002  * - association between root vnode of the mount and the mount is protected
4003  *   by busy
4004  *
4005  * From that point on we can read the sequence counter of the root vnode
4006  * and get the next mount on the stack (if any) using the same protection.
4007  *
4008  * By the end of successful walk we are guaranteed the reached state was
4009  * indeed present at least at some point which matches the regular lookup.
4010  */
4011 static int __noinline
4012 cache_fplookup_climb_mount(struct cache_fpl *fpl)
4013 {
4014 	struct mount *mp, *prev_mp;
4015 	struct vnode *vp;
4016 	seqc_t vp_seqc;
4017 
4018 	vp = fpl->tvp;
4019 	vp_seqc = fpl->tvp_seqc;
4020 
4021 	VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp);
4022 	mp = atomic_load_ptr(&vp->v_mountedhere);
4023 	if (mp == NULL)
4024 		return (0);
4025 
4026 	prev_mp = NULL;
4027 	for (;;) {
4028 		if (!vfs_op_thread_enter_crit(mp)) {
4029 			if (prev_mp != NULL)
4030 				vfs_op_thread_exit_crit(prev_mp);
4031 			return (cache_fpl_partial(fpl));
4032 		}
4033 		if (prev_mp != NULL)
4034 			vfs_op_thread_exit_crit(prev_mp);
4035 		if (!vn_seqc_consistent(vp, vp_seqc)) {
4036 			vfs_op_thread_exit_crit(mp);
4037 			return (cache_fpl_partial(fpl));
4038 		}
4039 		if (!cache_fplookup_mp_supported(mp)) {
4040 			vfs_op_thread_exit_crit(mp);
4041 			return (cache_fpl_partial(fpl));
4042 		}
4043 		vp = atomic_load_ptr(&mp->mnt_rootvnode);
4044 		if (vp == NULL || VN_IS_DOOMED(vp)) {
4045 			vfs_op_thread_exit_crit(mp);
4046 			return (cache_fpl_partial(fpl));
4047 		}
4048 		vp_seqc = vn_seqc_read_any(vp);
4049 		if (seqc_in_modify(vp_seqc)) {
4050 			vfs_op_thread_exit_crit(mp);
4051 			return (cache_fpl_partial(fpl));
4052 		}
4053 		prev_mp = mp;
4054 		mp = atomic_load_ptr(&vp->v_mountedhere);
4055 		if (mp == NULL)
4056 			break;
4057 	}
4058 
4059 	vfs_op_thread_exit_crit(prev_mp);
4060 	fpl->tvp = vp;
4061 	fpl->tvp_seqc = vp_seqc;
4062 	return (0);
4063 }
4064 
4065 static bool
4066 cache_fplookup_need_climb_mount(struct cache_fpl *fpl)
4067 {
4068 	struct mount *mp;
4069 	struct vnode *vp;
4070 
4071 	vp = fpl->tvp;
4072 
4073 	/*
4074 	 * Hack: while this is a union, the pointer tends to be NULL so save on
4075 	 * a branch.
4076 	 */
4077 	mp = atomic_load_ptr(&vp->v_mountedhere);
4078 	if (mp == NULL)
4079 		return (false);
4080 	if (vp->v_type == VDIR)
4081 		return (true);
4082 	return (false);
4083 }
4084 
4085 /*
4086  * Parse the path.
4087  *
4088  * The code is mostly copy-pasted from regular lookup, see lookup().
4089  * The structure is maintained along with comments for easier maintenance.
4090  * Deduplicating the code will become feasible after fast path lookup
4091  * becomes more feature-complete.
4092  */
4093 static int
4094 cache_fplookup_parse(struct cache_fpl *fpl)
4095 {
4096 	struct nameidata *ndp;
4097 	struct componentname *cnp;
4098 	char *cp;
4099 
4100 	ndp = fpl->ndp;
4101 	cnp = fpl->cnp;
4102 
4103 	/*
4104 	 * Search a new directory.
4105 	 *
4106 	 * The last component of the filename is left accessible via
4107 	 * cnp->cn_nameptr for callers that need the name. Callers needing
4108 	 * the name set the SAVENAME flag. When done, they assume
4109 	 * responsibility for freeing the pathname buffer.
4110 	 */
4111 	for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
4112 		continue;
4113 	cnp->cn_namelen = cp - cnp->cn_nameptr;
4114 	if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
4115 		cache_fpl_smr_exit(fpl);
4116 		return (cache_fpl_handled(fpl, ENAMETOOLONG));
4117 	}
4118 	ndp->ni_pathlen -= cnp->cn_namelen;
4119 	KASSERT(ndp->ni_pathlen <= PATH_MAX,
4120 	    ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
4121 	ndp->ni_next = cp;
4122 
4123 	/*
4124 	 * Replace multiple slashes by a single slash and trailing slashes
4125 	 * by a null.  This must be done before VOP_LOOKUP() because some
4126 	 * fs's don't know about trailing slashes.  Remember if there were
4127 	 * trailing slashes to handle symlinks, existing non-directories
4128 	 * and non-existing files that won't be directories specially later.
4129 	 */
4130 	while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
4131 		cp++;
4132 		ndp->ni_pathlen--;
4133 		if (*cp == '\0') {
4134 			/*
4135 			 * TODO
4136 			 * Regular lookup performs the following:
4137 			 * *ndp->ni_next = '\0';
4138 			 * cnp->cn_flags |= TRAILINGSLASH;
4139 			 *
4140 			 * Which is problematic since it modifies data read
4141 			 * from userspace. Then if fast path lookup was to
4142 			 * abort we would have to either restore it or convey
4143 			 * the flag. Since this is a corner case just ignore
4144 			 * it for simplicity.
4145 			 */
4146 			return (cache_fpl_partial(fpl));
4147 		}
4148 	}
4149 	ndp->ni_next = cp;
4150 
4151 	/*
4152 	 * Check for degenerate name (e.g. / or "")
4153 	 * which is a way of talking about a directory,
4154 	 * e.g. like "/." or ".".
4155 	 *
4156 	 * TODO
4157 	 * Another corner case handled by the regular lookup
4158 	 */
4159 	if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
4160 		return (cache_fpl_partial(fpl));
4161 	}
4162 	return (0);
4163 }
4164 
4165 static void
4166 cache_fplookup_parse_advance(struct cache_fpl *fpl)
4167 {
4168 	struct nameidata *ndp;
4169 	struct componentname *cnp;
4170 
4171 	ndp = fpl->ndp;
4172 	cnp = fpl->cnp;
4173 
4174 	cnp->cn_nameptr = ndp->ni_next;
4175 	while (*cnp->cn_nameptr == '/') {
4176 		cnp->cn_nameptr++;
4177 		ndp->ni_pathlen--;
4178 	}
4179 }
4180 
4181 static int __noinline
4182 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
4183 {
4184 
4185 	switch (error) {
4186 	case EAGAIN:
4187 		/*
4188 		 * Can happen when racing against vgone.
4189 		 * */
4190 	case EOPNOTSUPP:
4191 		cache_fpl_partial(fpl);
4192 		break;
4193 	default:
4194 		/*
4195 		 * See the API contract for VOP_FPLOOKUP_VEXEC.
4196 		 */
4197 		if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
4198 			error = cache_fpl_aborted(fpl);
4199 		} else {
4200 			cache_fpl_smr_exit(fpl);
4201 			cache_fpl_handled(fpl, error);
4202 		}
4203 		break;
4204 	}
4205 	return (error);
4206 }
4207 
4208 static int
4209 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
4210 {
4211 	struct nameidata *ndp;
4212 	struct componentname *cnp;
4213 	struct mount *mp;
4214 	int error;
4215 
4216 	error = CACHE_FPL_FAILED;
4217 	ndp = fpl->ndp;
4218 	cnp = fpl->cnp;
4219 
4220 	cache_fpl_checkpoint(fpl, &fpl->snd);
4221 
4222 	fpl->dvp = dvp;
4223 	fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
4224 	if (seqc_in_modify(fpl->dvp_seqc)) {
4225 		cache_fpl_aborted(fpl);
4226 		goto out;
4227 	}
4228 	mp = atomic_load_ptr(&fpl->dvp->v_mount);
4229 	if (!cache_fplookup_mp_supported(mp)) {
4230 		cache_fpl_aborted(fpl);
4231 		goto out;
4232 	}
4233 
4234 	VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
4235 
4236 	for (;;) {
4237 		error = cache_fplookup_parse(fpl);
4238 		if (__predict_false(error != 0)) {
4239 			break;
4240 		}
4241 
4242 		VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
4243 
4244 		error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
4245 		if (__predict_false(error != 0)) {
4246 			error = cache_fplookup_failed_vexec(fpl, error);
4247 			break;
4248 		}
4249 
4250 		if (__predict_false(cache_fpl_isdotdot(cnp))) {
4251 			error = cache_fplookup_dotdot(fpl);
4252 			if (__predict_false(error != 0)) {
4253 				break;
4254 			}
4255 		} else {
4256 			error = cache_fplookup_next(fpl);
4257 			if (__predict_false(error != 0)) {
4258 				break;
4259 			}
4260 
4261 			VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
4262 
4263 			if (cache_fplookup_need_climb_mount(fpl)) {
4264 				error = cache_fplookup_climb_mount(fpl);
4265 				if (__predict_false(error != 0)) {
4266 					break;
4267 				}
4268 			}
4269 		}
4270 
4271 		VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
4272 
4273 		if (cache_fpl_islastcn(ndp)) {
4274 			error = cache_fplookup_final(fpl);
4275 			break;
4276 		}
4277 
4278 		if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
4279 			error = cache_fpl_aborted(fpl);
4280 			break;
4281 		}
4282 
4283 		fpl->dvp = fpl->tvp;
4284 		fpl->dvp_seqc = fpl->tvp_seqc;
4285 
4286 		cache_fplookup_parse_advance(fpl);
4287 		cache_fpl_checkpoint(fpl, &fpl->snd);
4288 	}
4289 out:
4290 	switch (fpl->status) {
4291 	case CACHE_FPL_STATUS_UNSET:
4292 		__assert_unreachable();
4293 		break;
4294 	case CACHE_FPL_STATUS_PARTIAL:
4295 		cache_fpl_smr_assert_entered(fpl);
4296 		return (cache_fplookup_partial_setup(fpl));
4297 	case CACHE_FPL_STATUS_ABORTED:
4298 		if (fpl->in_smr)
4299 			cache_fpl_smr_exit(fpl);
4300 		return (CACHE_FPL_FAILED);
4301 	case CACHE_FPL_STATUS_HANDLED:
4302 		MPASS(error != CACHE_FPL_FAILED);
4303 		cache_fpl_smr_assert_not_entered(fpl);
4304 		if (__predict_false(error != 0)) {
4305 			ndp->ni_dvp = NULL;
4306 			ndp->ni_vp = NULL;
4307 			cache_fpl_cleanup_cnp(cnp);
4308 			return (error);
4309 		}
4310 		ndp->ni_dvp = fpl->dvp;
4311 		ndp->ni_vp = fpl->tvp;
4312 		if (cnp->cn_flags & SAVENAME)
4313 			cnp->cn_flags |= HASBUF;
4314 		else
4315 			cache_fpl_cleanup_cnp(cnp);
4316 		return (error);
4317 	}
4318 }
4319 
4320 /*
4321  * Fast path lookup protected with SMR and sequence counters.
4322  *
4323  * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
4324  *
4325  * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
4326  * outlined below.
4327  *
4328  * Traditional vnode lookup conceptually looks like this:
4329  *
4330  * vn_lock(current);
4331  * for (;;) {
4332  *	next = find();
4333  *	vn_lock(next);
4334  *	vn_unlock(current);
4335  *	current = next;
4336  *	if (last)
4337  *	    break;
4338  * }
4339  * return (current);
4340  *
4341  * Each jump to the next vnode is safe memory-wise and atomic with respect to
4342  * any modifications thanks to holding respective locks.
4343  *
4344  * The same guarantee can be provided with a combination of safe memory
4345  * reclamation and sequence counters instead. If all operations which affect
4346  * the relationship between the current vnode and the one we are looking for
4347  * also modify the counter, we can verify whether all the conditions held as
4348  * we made the jump. This includes things like permissions, mount points etc.
4349  * Counter modification is provided by enclosing relevant places in
4350  * vn_seqc_write_begin()/end() calls.
4351  *
4352  * Thus this translates to:
4353  *
4354  * vfs_smr_enter();
4355  * dvp_seqc = seqc_read_any(dvp);
4356  * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
4357  *     abort();
4358  * for (;;) {
4359  * 	tvp = find();
4360  * 	tvp_seqc = seqc_read_any(tvp);
4361  * 	if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
4362  * 	    abort();
4363  * 	if (!seqc_consistent(dvp, dvp_seqc)) // someone is altering the vnode
4364  * 	    abort();
4365  * 	dvp = tvp; // we know nothing of importance has changed
4366  * 	dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
4367  * 	if (last)
4368  * 	    break;
4369  * }
4370  * vget(); // secure the vnode
4371  * if (!seqc_consistent(tvp, tvp_seqc)) // final check
4372  * 	    abort();
4373  * // at this point we know nothing has changed for any parent<->child pair
4374  * // as they were crossed during the lookup, meaning we matched the guarantee
4375  * // of the locked variant
4376  * return (tvp);
4377  *
4378  * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
4379  * - they are called while within vfs_smr protection which they must never exit
4380  * - EAGAIN can be returned to denote that checking could not be performed; it
4381  *   is always valid to return it
4382  * - if the sequence counter has not changed the result must be valid
4383  * - if the sequence counter has changed both false positives and false negatives
4384  *   are permitted (since the result will be rejected later)
4385  * - for simple cases of unix permission checks vaccess_vexec_smr can be used
4386  *
4387  * Caveats to watch out for:
4388  * - vnodes are passed unlocked and unreferenced with nothing stopping
4389  *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
4390  *   to use atomic_load_ptr to fetch it.
4391  * - the aforementioned object can also get freed, meaning absent other means it
4392  *   should be protected with vfs_smr
4393  * - either safely checking permissions as they are modified or guaranteeing
4394  *   their stability is left to the routine
4395  */
4396 int
4397 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
4398     struct pwd **pwdp)
4399 {
4400 	struct cache_fpl fpl;
4401 	struct pwd *pwd;
4402 	struct vnode *dvp;
4403 	struct componentname *cnp;
4404 	struct nameidata_saved orig;
4405 	int error;
4406 
4407 	MPASS(ndp->ni_lcf == 0);
4408 
4409 	fpl.status = CACHE_FPL_STATUS_UNSET;
4410 	fpl.ndp = ndp;
4411 	fpl.cnp = &ndp->ni_cnd;
4412 	MPASS(curthread == fpl.cnp->cn_thread);
4413 
4414 	if ((fpl.cnp->cn_flags & SAVESTART) != 0)
4415 		MPASS(fpl.cnp->cn_nameiop != LOOKUP);
4416 
4417 	if (!cache_can_fplookup(&fpl)) {
4418 		SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
4419 		*status = fpl.status;
4420 		return (EOPNOTSUPP);
4421 	}
4422 
4423 	cache_fpl_checkpoint(&fpl, &orig);
4424 
4425 	cache_fpl_smr_enter_initial(&fpl);
4426 	pwd = pwd_get_smr();
4427 	fpl.pwd = pwd;
4428 	ndp->ni_rootdir = pwd->pwd_rdir;
4429 	ndp->ni_topdir = pwd->pwd_jdir;
4430 
4431 	cnp = fpl.cnp;
4432 	cnp->cn_nameptr = cnp->cn_pnbuf;
4433 	if (cnp->cn_pnbuf[0] == '/') {
4434 		cache_fpl_handle_root(ndp, &dvp);
4435 	} else {
4436 		MPASS(ndp->ni_dirfd == AT_FDCWD);
4437 		dvp = pwd->pwd_cdir;
4438 	}
4439 
4440 	SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
4441 
4442 	error = cache_fplookup_impl(dvp, &fpl);
4443 	cache_fpl_smr_assert_not_entered(&fpl);
4444 	SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
4445 
4446 	*status = fpl.status;
4447 	switch (fpl.status) {
4448 	case CACHE_FPL_STATUS_UNSET:
4449 		__assert_unreachable();
4450 		break;
4451 	case CACHE_FPL_STATUS_HANDLED:
4452 		SDT_PROBE3(vfs, namei, lookup, return, error,
4453 		    (error == 0 ? ndp->ni_vp : NULL), true);
4454 		break;
4455 	case CACHE_FPL_STATUS_PARTIAL:
4456 		*pwdp = fpl.pwd;
4457 		/*
4458 		 * Status restored by cache_fplookup_partial_setup.
4459 		 */
4460 		break;
4461 	case CACHE_FPL_STATUS_ABORTED:
4462 		cache_fpl_restore(&fpl, &orig);
4463 		break;
4464 	}
4465 	return (error);
4466 }
4467