xref: /freebsd/sys/kern/vfs_cache.c (revision f2d48b5e2c3b45850585e4d7aee324fe148afbf2)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1989, 1993, 1995
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * Poul-Henning Kamp of the FreeBSD Project.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include "opt_ddb.h"
41 #include "opt_ktrace.h"
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/capsicum.h>
46 #include <sys/counter.h>
47 #include <sys/filedesc.h>
48 #include <sys/fnv_hash.h>
49 #include <sys/kernel.h>
50 #include <sys/ktr.h>
51 #include <sys/lock.h>
52 #include <sys/malloc.h>
53 #include <sys/fcntl.h>
54 #include <sys/jail.h>
55 #include <sys/mount.h>
56 #include <sys/namei.h>
57 #include <sys/proc.h>
58 #include <sys/seqc.h>
59 #include <sys/sdt.h>
60 #include <sys/smr.h>
61 #include <sys/smp.h>
62 #include <sys/syscallsubr.h>
63 #include <sys/sysctl.h>
64 #include <sys/sysproto.h>
65 #include <sys/vnode.h>
66 #include <ck_queue.h>
67 #ifdef KTRACE
68 #include <sys/ktrace.h>
69 #endif
70 #ifdef INVARIANTS
71 #include <machine/_inttypes.h>
72 #endif
73 
76 #include <security/audit/audit.h>
77 #include <security/mac/mac_framework.h>
78 
79 #ifdef DDB
80 #include <ddb/ddb.h>
81 #endif
82 
83 #include <vm/uma.h>
84 
85 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
86     "Name cache");
87 
88 SDT_PROVIDER_DECLARE(vfs);
89 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
90     "struct vnode *");
91 SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *",
92     "struct vnode *");
93 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
94     "char *");
95 SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
96     "const char *");
97 SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
98     "struct namecache *", "int", "int");
99 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
100 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
101     "char *", "struct vnode *");
102 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
103 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
104     "struct vnode *", "char *");
105 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
106     "struct vnode *");
107 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
108     "struct vnode *", "char *");
109 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
110     "char *");
111 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
112     "struct componentname *");
113 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
114     "struct componentname *");
115 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
116 SDT_PROBE_DEFINE1(vfs, namecache, purge, batch, "int");
117 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
118 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
119 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
120     "struct vnode *");
121 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
122     "char *");
123 SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *",
124     "char *");
125 
126 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
127 SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
128 SDT_PROBE_DECLARE(vfs, namei, lookup, return);
129 
130 /*
131  * This structure describes the elements in the cache of recent
132  * names looked up by namei.
133  */
134 struct negstate {
135 	u_char neg_flag;
136 	u_char neg_hit;
137 };
138 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
139     "the state must fit in a union with a pointer without growing it");
140 
141 struct	namecache {
142 	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
143 	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
144 	CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
145 	struct	vnode *nc_dvp;		/* vnode of parent of name */
146 	union {
147 		struct	vnode *nu_vp;	/* vnode the name refers to */
148 		struct	negstate nu_neg;/* negative entry state */
149 	} n_un;
150 	u_char	nc_flag;		/* flag bits */
151 	u_char	nc_nlen;		/* length of name */
152 	char	nc_name[0];		/* segment name + nul */
153 };
154 
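/*
 * For illustration, a positive entry for the name "kernel" under a directory
 * dvp ends up roughly as below (schematic only; the real initialization is
 * performed by cache_enter_time() later in this file):
 *
 *	ncp->nc_dvp  = dvp;			parent directory
 *	ncp->nc_vp   = vp;			resolved vnode (alias for n_un.nu_vp)
 *	ncp->nc_nlen = 6;
 *	memcpy(ncp->nc_name, "kernel", 7);	name plus the terminating nul
 *
 * A negative entry instead sets NCF_NEGATIVE in nc_flag and overlays
 * struct negstate on the same union slot, so no vnode pointer is stored.
 */
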
155 /*
156  * struct namecache_ts repeats struct namecache layout up to the
157  * nc_nlen member.
158  * struct namecache_ts is used in place of struct namecache when time(s) need
159  * to be stored.  The nc_dotdottime field is used when a cache entry maps
160  * both a non-dotdot directory name and dotdot for the directory's
161  * parent.
162  *
163  * See below for alignment requirement.
164  */
165 struct	namecache_ts {
166 	struct	timespec nc_time;	/* timespec provided by fs */
167 	struct	timespec nc_dotdottime;	/* dotdot timespec provided by fs */
168 	int	nc_ticks;		/* ticks value when entry was added */
169 	int	nc_pad;
170 	struct namecache nc_nc;
171 };
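
/*
 * Sketch of how the two layouts interoperate: timestamped entries are
 * allocated as struct namecache_ts, but only the embedded nc_nc is linked
 * into the lists and the hash.  Code holding a struct namecache * with
 * NCF_TS set recovers the outer structure with __containerof(), e.g.:
 *
 *	if (ncp->nc_flag & NCF_TS) {
 *		struct namecache_ts *ncp_ts;
 *
 *		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 *		*tsp = ncp_ts->nc_time;
 *	}
 *
 * which is what cache_out_ts() below does.
 */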
172 
173 TAILQ_HEAD(cache_freebatch, namecache);
174 
175 /*
176  * At least mips n32 performs 64-bit accesses to timespec as found
177  * in namecache_ts and requires them to be aligned. Since others
178  * may be in the same spot suffer a little bit and enforce the
179  * alignment for everyone. Note this is a nop for 64-bit platforms.
180  */
181 #define CACHE_ZONE_ALIGNMENT	UMA_ALIGNOF(time_t)
182 
183 /*
184  * TODO: the initial value of CACHE_PATH_CUTOFF was inherited from the
185  * 4.4 BSD codebase. Later on struct namecache was tweaked to become
186  * smaller and the value was bumped to retain the total size, but it
187  * was never re-evaluated for suitability. A simple test counting
188  * lengths during package building shows that the value of 45 covers
189  * about 86% of all added entries, reaching 99% at 65.
190  *
191  * Regardless of the above, use of dedicated zones instead of malloc may be
192  * inducing additional waste. This may be hard to address as said zones are
193  * tied to VFS SMR. Even if retaining them, the current split should be
194  * re-evaluated.
195  */
196 #ifdef __LP64__
197 #define	CACHE_PATH_CUTOFF	45
198 #define	CACHE_LARGE_PAD		6
199 #else
200 #define	CACHE_PATH_CUTOFF	41
201 #define	CACHE_LARGE_PAD		2
202 #endif
203 
204 #define CACHE_ZONE_SMALL_SIZE		(offsetof(struct namecache, nc_name) + CACHE_PATH_CUTOFF + 1)
205 #define CACHE_ZONE_SMALL_TS_SIZE	(offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_SMALL_SIZE)
206 #define CACHE_ZONE_LARGE_SIZE		(offsetof(struct namecache, nc_name) + NAME_MAX + 1 + CACHE_LARGE_PAD)
207 #define CACHE_ZONE_LARGE_TS_SIZE	(offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_LARGE_SIZE)
208 
209 _Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
210 _Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
211 _Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
212 _Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
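
/*
 * Worked example of the sizing above (LP64, assuming the usual 58-byte
 * offset of nc_name; the static asserts above are what actually enforce
 * correctness):
 *
 *	CACHE_ZONE_SMALL_SIZE = 58 + 45 + 1            = 104, a multiple of 8
 *	CACHE_ZONE_LARGE_SIZE = 58 + NAME_MAX + 1 + 6  = 320, a multiple of 8
 *
 * Names of up to CACHE_PATH_CUTOFF characters are served from the small
 * zones, anything longer from the large ones; see cache_alloc_uma() below.
 */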
213 
214 #define	nc_vp		n_un.nu_vp
215 #define	nc_neg		n_un.nu_neg
216 
217 /*
218  * Flags in namecache.nc_flag
219  */
220 #define NCF_WHITE	0x01
221 #define NCF_ISDOTDOT	0x02
222 #define	NCF_TS		0x04
223 #define	NCF_DTS		0x08
224 #define	NCF_DVDROP	0x10
225 #define	NCF_NEGATIVE	0x20
226 #define	NCF_INVALID	0x40
227 #define	NCF_WIP		0x80
228 
229 /*
230  * Flags in negstate.neg_flag
231  */
232 #define NEG_HOT		0x01
233 
234 static bool	cache_neg_evict_cond(u_long lnumcache);
235 
236 /*
237  * Mark an entry as invalid.
238  *
239  * This is called before it starts getting deconstructed.
240  */
241 static void
242 cache_ncp_invalidate(struct namecache *ncp)
243 {
244 
245 	KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
246 	    ("%s: entry %p already invalid", __func__, ncp));
247 	atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
248 	atomic_thread_fence_rel();
249 }
250 
251 /*
252  * Check whether the entry can be safely used.
253  *
254  * All places which elide locks are supposed to call this after they are
255  * done with reading from an entry.
256  */
257 #define cache_ncp_canuse(ncp)	({					\
258 	struct namecache *_ncp = (ncp);					\
259 	u_char _nc_flag;						\
260 									\
261 	atomic_thread_fence_acq();					\
262 	_nc_flag = atomic_load_char(&_ncp->nc_flag);			\
263 	__predict_true((_nc_flag & (NCF_INVALID | NCF_WIP)) == 0);	\
264 })
265 
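/*
 * The release fence in cache_ncp_invalidate() pairs with the acquire fence
 * here.  For illustration, a lockless consumer (a simplified sketch of what
 * cache_lookup() does near the end of this file) reads everything it needs
 * first and only then validates the entry:
 *
 *	vfs_smr_enter();
 *	CK_SLIST_FOREACH(ncp, NCHHASH(hash), nc_hash) {
 *		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
 *		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
 *			break;
 *	}
 *	if (ncp != NULL) {
 *		vp = ncp->nc_vp;
 *		if (!cache_ncp_canuse(ncp)) {
 *			vfs_smr_exit();
 *			goto out_fallback;	retry with locks held
 *		}
 *	}
 *	vfs_smr_exit();
 */
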
266 /*
267  * Name caching works as follows:
268  *
269  * Names found by directory scans are retained in a cache
270  * for future reference.  It is managed LRU, so frequently
271  * used names will hang around.  Cache is indexed by hash value
272  * obtained from (dvp, name) where dvp refers to the directory
273  * containing name.
274  *
275  * If it is a "negative" entry (i.e. for a name that is known NOT to
276  * exist), the vnode pointer will be NULL.
277  *
278  * Upon reaching the last segment of a path, if the reference
279  * is for DELETE, or NOCACHE is set (rewrite), and the
280  * name is located in the cache, it will be dropped.
281  *
282  * These locks are used (in the order in which they can be taken):
283  * NAME		TYPE	ROLE
284  * vnodelock	mtx	vnode lists and v_cache_dd field protection
285  * bucketlock	mtx	for access to given set of hash buckets
286  * neglist	mtx	negative entry LRU management
287  *
288  * It is legal to take multiple vnodelock and bucketlock locks. The locking
289  * order is lower address first. Both are recursive.
290  *
291  * "." lookups are lockless.
292  *
293  * ".." and vnode -> name lookups require vnodelock.
294  *
295  * name -> vnode lookup requires the relevant bucketlock to be held for reading.
296  *
297  * Insertions and removals of entries require involved vnodes and bucketlocks
298  * to be locked to provide safe operation against other threads modifying the
299  * cache.
300  *
301  * Some lookups result in removal of the found entry (e.g. getting rid of a
302  * negative entry with the intent to create a positive one), which poses a
303  * problem when multiple threads reach the same state. Similarly, two different
304  * threads can purge two different vnodes and try to remove the same name.
305  *
306  * If the already held vnode lock is lower than the second required lock, we
307  * can just take the other lock. However, in the opposite case, this could
308  * deadlock. As such, this is resolved by trylocking and if that fails unlocking
309  * the first node, locking everything in order and revalidating the state.
310  */
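
/*
 * Sketch of the trylock-and-relock dance described above (simplified; this
 * is what cache_zap_locked_bucket() and cache_zap_unlocked_bucket() below
 * implement, with the bucket lock blp already held on entry):
 *
 *	dvlp = VP2VNODELOCK(ncp->nc_dvp);
 *	vlp = VP2VNODELOCK(ncp->nc_vp);
 *	if (cache_trylock_vnodes(dvlp, vlp) != 0) {
 *		mtx_unlock(blp);		drop the bucket lock,
 *		cache_sort_vnodes(&dvlp, &vlp);	take lower address locks first,
 *		cache_lock_vnodes(dvlp, vlp);
 *		mtx_lock(blp);
 *		...			re-lookup the entry and revalidate
 *					before zapping it
 *	}
 */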
311 
312 VFS_SMR_DECLARE;
313 
314 static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
315     "Name cache parameters");
316 
317 static u_int __read_mostly	ncsize; /* the size as computed on creation or resizing */
318 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RW, &ncsize, 0,
319     "Total namecache capacity");
320 
321 u_int ncsizefactor = 2;
322 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0,
323     "Size factor for namecache");
324 
325 static u_long __read_mostly	ncnegfactor = 5; /* ratio of negative entries */
326 SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0,
327     "Ratio of negative namecache entries");
328 
329 /*
330  * Negative entry % of namecache capacity above which automatic eviction is allowed.
331  *
332  * Check cache_neg_evict_cond for details.
333  */
334 static u_int ncnegminpct = 3;
335 
336 static u_int __read_mostly     neg_min; /* the above recomputed against ncsize */
337 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0,
338     "Negative entry count above which automatic eviction is allowed");
339 
340 /*
341  * Structures associated with name caching.
342  */
343 #define NCHHASH(hash) \
344 	(&nchashtbl[(hash) & nchash])
345 static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
346 static u_long __read_mostly	nchash;			/* size of hash table */
347 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
348     "Size of namecache hash table");
349 static u_long __exclusive_cache_line	numneg;	/* number of negative entries allocated */
350 static u_long __exclusive_cache_line	numcache;/* number of cache entries allocated */
351 
352 struct nchstats	nchstats;		/* cache effectiveness statistics */
353 
354 static bool __read_frequently cache_fast_revlookup = true;
355 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_revlookup, CTLFLAG_RW,
356     &cache_fast_revlookup, 0, "");
357 
358 static u_int __exclusive_cache_line neg_cycle;
359 
360 #define ncneghash	3
361 #define	numneglists	(ncneghash + 1)
362 
363 struct neglist {
364 	struct mtx		nl_evict_lock;
365 	struct mtx		nl_lock __aligned(CACHE_LINE_SIZE);
366 	TAILQ_HEAD(, namecache) nl_list;
367 	TAILQ_HEAD(, namecache) nl_hotlist;
368 	u_long			nl_hotnum;
369 } __aligned(CACHE_LINE_SIZE);
370 
371 static struct neglist neglists[numneglists];
372 
373 static inline struct neglist *
374 NCP2NEGLIST(struct namecache *ncp)
375 {
376 
377 	return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
378 }
379 
380 static inline struct negstate *
381 NCP2NEGSTATE(struct namecache *ncp)
382 {
383 
384 	MPASS(ncp->nc_flag & NCF_NEGATIVE);
385 	return (&ncp->nc_neg);
386 }
387 
388 #define	numbucketlocks (ncbuckethash + 1)
389 static u_int __read_mostly  ncbuckethash;
390 static struct mtx_padalign __read_mostly  *bucketlocks;
391 #define	HASH2BUCKETLOCK(hash) \
392 	((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)]))
393 
394 #define	numvnodelocks (ncvnodehash + 1)
395 static u_int __read_mostly  ncvnodehash;
396 static struct mtx __read_mostly *vnodelocks;
397 static inline struct mtx *
398 VP2VNODELOCK(struct vnode *vp)
399 {
400 
401 	return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
402 }
403 
404 static void
405 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
406 {
407 	struct namecache_ts *ncp_ts;
408 
409 	KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
410 	    (tsp == NULL && ticksp == NULL),
411 	    ("No NCF_TS"));
412 
413 	if (tsp == NULL)
414 		return;
415 
416 	ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
417 	*tsp = ncp_ts->nc_time;
418 	*ticksp = ncp_ts->nc_ticks;
419 }
420 
421 #ifdef DEBUG_CACHE
422 static int __read_mostly	doingcache = 1;	/* 1 => enable the cache */
423 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
424     "VFS namecache enabled");
425 #endif
426 
427 /* Export size information to userland */
428 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
429     sizeof(struct namecache), "sizeof(struct namecache)");
430 
431 /*
432  * The new name cache statistics
433  */
434 static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
435     "Name cache statistics");
436 
437 #define STATNODE_ULONG(name, varname, descr)					\
438 	SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
439 #define STATNODE_COUNTER(name, varname, descr)					\
440 	static COUNTER_U64_DEFINE_EARLY(varname);				\
441 	SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \
442 	    descr);
443 STATNODE_ULONG(neg, numneg, "Number of negative cache entries");
444 STATNODE_ULONG(count, numcache, "Number of cache entries");
445 STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes held");
446 STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit");
447 STATNODE_COUNTER(dothits, dothits, "Number of '.' hits");
448 STATNODE_COUNTER(dotdothits, dotdothits, "Number of '..' hits");
449 STATNODE_COUNTER(miss, nummiss, "Number of cache misses");
450 STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache");
451 STATNODE_COUNTER(poszaps, numposzaps,
452     "Number of cache hits (positive) we do not want to cache");
453 STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)");
454 STATNODE_COUNTER(negzaps, numnegzaps,
455     "Number of cache hits (negative) we do not want to cache");
456 STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)");
457 /* These count for vn_getcwd(), too. */
458 STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls");
459 STATNODE_COUNTER(fullpathfail1, numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
460 STATNODE_COUNTER(fullpathfail2, numfullpathfail2,
461     "Number of fullpath search errors (VOP_VPTOCNP failures)");
462 STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
463 STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls");
464 
465 /*
466  * Debug or developer statistics.
467  */
468 static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
469     "Name cache debugging");
470 #define DEBUGNODE_ULONG(name, varname, descr)					\
471 	SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
472 #define DEBUGNODE_COUNTER(name, varname, descr)					\
473 	static COUNTER_U64_DEFINE_EARLY(varname);				\
474 	SYSCTL_COUNTER_U64(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, \
475 	    descr);
476 DEBUGNODE_COUNTER(zap_bucket_relock_success, zap_bucket_relock_success,
477     "Number of successful removals after relocking");
478 static long zap_bucket_fail;
479 DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, "");
480 static long zap_bucket_fail2;
481 DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, "");
482 static long cache_lock_vnodes_cel_3_failures;
483 DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures,
484     "Number of times 3-way vnode locking failed");
485 
486 static void cache_zap_locked(struct namecache *ncp);
487 static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf,
488     char **freebuf, size_t *buflen);
489 static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
490     char **retbuf, size_t *buflen, size_t addend);
491 static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf,
492     char **retbuf, size_t *buflen);
493 static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf,
494     char **retbuf, size_t *len, size_t addend);
495 
496 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
497 
498 static inline void
499 cache_assert_vlp_locked(struct mtx *vlp)
500 {
501 
502 	if (vlp != NULL)
503 		mtx_assert(vlp, MA_OWNED);
504 }
505 
506 static inline void
507 cache_assert_vnode_locked(struct vnode *vp)
508 {
509 	struct mtx *vlp;
510 
511 	vlp = VP2VNODELOCK(vp);
512 	cache_assert_vlp_locked(vlp);
513 }
514 
515 /*
516  * Directory vnodes with entries are held for two reasons:
517  * 1. make them less of a target for reclamation in vnlru
518  * 2. suffer a smaller performance penalty in locked lookup as requeueing is avoided
519  *
520  * It will be feasible to stop doing it altogether if all filesystems start
521  * supporting lockless lookup.
522  */
523 static void
524 cache_hold_vnode(struct vnode *vp)
525 {
526 
527 	cache_assert_vnode_locked(vp);
528 	VNPASS(LIST_EMPTY(&vp->v_cache_src), vp);
529 	vhold(vp);
530 	counter_u64_add(numcachehv, 1);
531 }
532 
533 static void
534 cache_drop_vnode(struct vnode *vp)
535 {
536 
537 	/*
538 	 * Called after all locks are dropped, meaning we can't assert
539 	 * on the state of v_cache_src.
540 	 */
541 	vdrop(vp);
542 	counter_u64_add(numcachehv, -1);
543 }
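
/*
 * Sketch of when the hold is taken and released: adding the first entry for
 * a directory triggers the hold (done by cache_enter_time() further down),
 * while removing the last one (see cache_zap_locked() later in this file)
 * marks the entry with NCF_DVDROP so that cache_free() performs the
 * matching drop, roughly:
 *
 *	if (LIST_EMPTY(&dvp->v_cache_src))
 *		cache_hold_vnode(dvp);			first entry for dvp
 *	...
 *	LIST_REMOVE(ncp, nc_src);
 *	if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src))
 *		ncp->nc_flag |= NCF_DVDROP;		cache_drop_vnode() later
 */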
544 
545 /*
546  * UMA zones.
547  */
548 static uma_zone_t __read_mostly cache_zone_small;
549 static uma_zone_t __read_mostly cache_zone_small_ts;
550 static uma_zone_t __read_mostly cache_zone_large;
551 static uma_zone_t __read_mostly cache_zone_large_ts;
552 
553 static struct namecache *
554 cache_alloc_uma(int len, bool ts)
555 {
556 	struct namecache_ts *ncp_ts;
557 	struct namecache *ncp;
558 
559 	if (__predict_false(ts)) {
560 		if (len <= CACHE_PATH_CUTOFF)
561 			ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
562 		else
563 			ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
564 		ncp = &ncp_ts->nc_nc;
565 	} else {
566 		if (len <= CACHE_PATH_CUTOFF)
567 			ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
568 		else
569 			ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
570 	}
571 	return (ncp);
572 }
573 
574 static void
575 cache_free_uma(struct namecache *ncp)
576 {
577 	struct namecache_ts *ncp_ts;
578 
579 	if (__predict_false(ncp->nc_flag & NCF_TS)) {
580 		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
581 		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
582 			uma_zfree_smr(cache_zone_small_ts, ncp_ts);
583 		else
584 			uma_zfree_smr(cache_zone_large_ts, ncp_ts);
585 	} else {
586 		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
587 			uma_zfree_smr(cache_zone_small, ncp);
588 		else
589 			uma_zfree_smr(cache_zone_large, ncp);
590 	}
591 }
592 
593 static struct namecache *
594 cache_alloc(int len, bool ts)
595 {
596 	u_long lnumcache;
597 
598 	/*
599 	 * Avoid blowout in namecache entries.
600 	 *
601 	 * Bugs:
602 	 * 1. filesystems may end up trying to add an already existing entry
603 	 * (for example this can happen after a cache miss during concurrent
604 	 * lookup), in which case we will call cache_neg_evict despite not
605 	 * adding anything.
606 	 * 2. the routine may fail to free anything and no provisions are made
607 	 * to make it try harder (see cache_neg_evict() for the failure modes)
608 	 * 3. it only ever looks at negative entries.
609 	 */
610 	lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
611 	if (cache_neg_evict_cond(lnumcache)) {
612 		lnumcache = atomic_load_long(&numcache);
613 	}
614 	if (__predict_false(lnumcache >= ncsize)) {
615 		atomic_subtract_long(&numcache, 1);
616 		counter_u64_add(numdrops, 1);
617 		return (NULL);
618 	}
619 	return (cache_alloc_uma(len, ts));
620 }
621 
622 static void
623 cache_free(struct namecache *ncp)
624 {
625 
626 	MPASS(ncp != NULL);
627 	if ((ncp->nc_flag & NCF_DVDROP) != 0) {
628 		cache_drop_vnode(ncp->nc_dvp);
629 	}
630 	cache_free_uma(ncp);
631 	atomic_subtract_long(&numcache, 1);
632 }
633 
634 static void
635 cache_free_batch(struct cache_freebatch *batch)
636 {
637 	struct namecache *ncp, *nnp;
638 	int i;
639 
640 	i = 0;
641 	if (TAILQ_EMPTY(batch))
642 		goto out;
643 	TAILQ_FOREACH_SAFE(ncp, batch, nc_dst, nnp) {
644 		if ((ncp->nc_flag & NCF_DVDROP) != 0) {
645 			cache_drop_vnode(ncp->nc_dvp);
646 		}
647 		cache_free_uma(ncp);
648 		i++;
649 	}
650 	atomic_subtract_long(&numcache, i);
651 out:
652 	SDT_PROBE1(vfs, namecache, purge, batch, i);
653 }
654 
655 /*
656  * TODO: With the value stored we can do better than computing the hash based
657  * on the address. The choice of FNV should also be revisited.
658  */
659 static void
660 cache_prehash(struct vnode *vp)
661 {
662 
663 	vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
664 }
665 
666 static uint32_t
667 cache_get_hash(char *name, u_char len, struct vnode *dvp)
668 {
669 
670 	return (fnv_32_buf(name, len, dvp->v_nchash));
671 }
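
/*
 * The two routines above compose into the placement scheme used throughout
 * this file, e.g.:
 *
 *	cache_prehash(vp);			once, when the vnode is set up
 *	...
 *	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
 *	ncpp = NCHHASH(hash);			hash chain for the entry
 *	blp = HASH2BUCKETLOCK(hash);		lock covering that chain
 *
 * The per-vnode seed spreads identical names in different directories over
 * different buckets, while the name itself separates entries within one
 * directory.
 */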
672 
673 static inline struct nchashhead *
674 NCP2BUCKET(struct namecache *ncp)
675 {
676 	uint32_t hash;
677 
678 	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
679 	return (NCHHASH(hash));
680 }
681 
682 static inline struct mtx *
683 NCP2BUCKETLOCK(struct namecache *ncp)
684 {
685 	uint32_t hash;
686 
687 	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
688 	return (HASH2BUCKETLOCK(hash));
689 }
690 
691 #ifdef INVARIANTS
692 static void
693 cache_assert_bucket_locked(struct namecache *ncp)
694 {
695 	struct mtx *blp;
696 
697 	blp = NCP2BUCKETLOCK(ncp);
698 	mtx_assert(blp, MA_OWNED);
699 }
700 
701 static void
702 cache_assert_bucket_unlocked(struct namecache *ncp)
703 {
704 	struct mtx *blp;
705 
706 	blp = NCP2BUCKETLOCK(ncp);
707 	mtx_assert(blp, MA_NOTOWNED);
708 }
709 #else
710 #define cache_assert_bucket_locked(x) do { } while (0)
711 #define cache_assert_bucket_unlocked(x) do { } while (0)
712 #endif
713 
714 #define cache_sort_vnodes(x, y)	_cache_sort_vnodes((void **)(x), (void **)(y))
715 static void
716 _cache_sort_vnodes(void **p1, void **p2)
717 {
718 	void *tmp;
719 
720 	MPASS(*p1 != NULL || *p2 != NULL);
721 
722 	if (*p1 > *p2) {
723 		tmp = *p2;
724 		*p2 = *p1;
725 		*p1 = tmp;
726 	}
727 }
728 
729 static void
730 cache_lock_all_buckets(void)
731 {
732 	u_int i;
733 
734 	for (i = 0; i < numbucketlocks; i++)
735 		mtx_lock(&bucketlocks[i]);
736 }
737 
738 static void
739 cache_unlock_all_buckets(void)
740 {
741 	u_int i;
742 
743 	for (i = 0; i < numbucketlocks; i++)
744 		mtx_unlock(&bucketlocks[i]);
745 }
746 
747 static void
748 cache_lock_all_vnodes(void)
749 {
750 	u_int i;
751 
752 	for (i = 0; i < numvnodelocks; i++)
753 		mtx_lock(&vnodelocks[i]);
754 }
755 
756 static void
757 cache_unlock_all_vnodes(void)
758 {
759 	u_int i;
760 
761 	for (i = 0; i < numvnodelocks; i++)
762 		mtx_unlock(&vnodelocks[i]);
763 }
764 
765 static int
766 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
767 {
768 
769 	cache_sort_vnodes(&vlp1, &vlp2);
770 
771 	if (vlp1 != NULL) {
772 		if (!mtx_trylock(vlp1))
773 			return (EAGAIN);
774 	}
775 	if (!mtx_trylock(vlp2)) {
776 		if (vlp1 != NULL)
777 			mtx_unlock(vlp1);
778 		return (EAGAIN);
779 	}
780 
781 	return (0);
782 }
783 
784 static void
785 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
786 {
787 
788 	MPASS(vlp1 != NULL || vlp2 != NULL);
789 	MPASS(vlp1 <= vlp2);
790 
791 	if (vlp1 != NULL)
792 		mtx_lock(vlp1);
793 	if (vlp2 != NULL)
794 		mtx_lock(vlp2);
795 }
796 
797 static void
798 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
799 {
800 
801 	MPASS(vlp1 != NULL || vlp2 != NULL);
802 
803 	if (vlp1 != NULL)
804 		mtx_unlock(vlp1);
805 	if (vlp2 != NULL)
806 		mtx_unlock(vlp2);
807 }
808 
809 static int
810 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
811 {
812 	struct nchstats snap;
813 
814 	if (req->oldptr == NULL)
815 		return (SYSCTL_OUT(req, 0, sizeof(snap)));
816 
817 	snap = nchstats;
818 	snap.ncs_goodhits = counter_u64_fetch(numposhits);
819 	snap.ncs_neghits = counter_u64_fetch(numneghits);
820 	snap.ncs_badhits = counter_u64_fetch(numposzaps) +
821 	    counter_u64_fetch(numnegzaps);
822 	snap.ncs_miss = counter_u64_fetch(nummisszap) +
823 	    counter_u64_fetch(nummiss);
824 
825 	return (SYSCTL_OUT(req, &snap, sizeof(snap)));
826 }
827 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
828     CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
829     "VFS cache effectiveness statistics");
830 
831 static void
832 cache_recalc_neg_min(u_int val)
833 {
834 
835 	neg_min = (ncsize * val) / 100;
836 }
837 
838 static int
839 sysctl_negminpct(SYSCTL_HANDLER_ARGS)
840 {
841 	u_int val;
842 	int error;
843 
844 	val = ncnegminpct;
845 	error = sysctl_handle_int(oidp, &val, 0, req);
846 	if (error != 0 || req->newptr == NULL)
847 		return (error);
848 
849 	if (val == ncnegminpct)
850 		return (0);
851 	if (val < 0 || val > 99)
852 		return (EINVAL);
853 	ncnegminpct = val;
854 	cache_recalc_neg_min(val);
855 	return (0);
856 }
857 
858 SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct,
859     CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct,
860     "I", "Negative entry \% of namecache capacity above which automatic eviction is allowed");
861 
862 #ifdef DIAGNOSTIC
863 /*
864  * Grab an atomic snapshot of the name cache hash chain lengths
865  */
866 static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
867     CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
868     "hash table stats");
869 
870 static int
871 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
872 {
873 	struct nchashhead *ncpp;
874 	struct namecache *ncp;
875 	int i, error, n_nchash, *cntbuf;
876 
877 retry:
878 	n_nchash = nchash + 1;	/* nchash is max index, not count */
879 	if (req->oldptr == NULL)
880 		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
881 	cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
882 	cache_lock_all_buckets();
883 	if (n_nchash != nchash + 1) {
884 		cache_unlock_all_buckets();
885 		free(cntbuf, M_TEMP);
886 		goto retry;
887 	}
888 	/* Scan hash tables counting entries */
889 	for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
890 		CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
891 			cntbuf[i]++;
892 	cache_unlock_all_buckets();
893 	for (error = 0, i = 0; i < n_nchash; i++)
894 		if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
895 			break;
896 	free(cntbuf, M_TEMP);
897 	return (error);
898 }
899 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
900     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
901     "nchash chain lengths");
902 
903 static int
904 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
905 {
906 	int error;
907 	struct nchashhead *ncpp;
908 	struct namecache *ncp;
909 	int n_nchash;
910 	int count, maxlength, used, pct;
911 
912 	if (!req->oldptr)
913 		return SYSCTL_OUT(req, 0, 4 * sizeof(int));
914 
915 	cache_lock_all_buckets();
916 	n_nchash = nchash + 1;	/* nchash is max index, not count */
917 	used = 0;
918 	maxlength = 0;
919 
920 	/* Scan hash tables for applicable entries */
921 	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
922 		count = 0;
923 		CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
924 			count++;
925 		}
926 		if (count)
927 			used++;
928 		if (maxlength < count)
929 			maxlength = count;
930 	}
931 	n_nchash = nchash + 1;
932 	cache_unlock_all_buckets();
933 	pct = (used * 100) / (n_nchash / 100);
934 	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
935 	if (error)
936 		return (error);
937 	error = SYSCTL_OUT(req, &used, sizeof(used));
938 	if (error)
939 		return (error);
940 	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
941 	if (error)
942 		return (error);
943 	error = SYSCTL_OUT(req, &pct, sizeof(pct));
944 	if (error)
945 		return (error);
946 	return (0);
947 }
948 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
949     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
950     "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
951 #endif
952 
953 /*
954  * Negative entries management
955  *
956  * Various workloads create plenty of negative entries and barely use them
957  * afterwards. Moreover malicious users can keep performing bogus lookups
958  * adding even more entries. For example, as of writing this comment,
959  * "make tinderbox" ends up with 2.6M namecache entries in total, 1.2M of which are
960  * negative.
961  *
962  * As such, a rather aggressive eviction method is needed. The currently
963  * employed method is a placeholder.
964  *
965  * Entries are split over numneglists separate lists, each of which is further
966  * split into hot and cold entries. Entries get promoted after getting a hit.
967  * Eviction happens on addition of new entry.
968  */
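
/*
 * Life cycle sketch tying the routines below together:
 *
 *	cache_neg_init(ncp);		new entry starts cold, neg_hit == 0
 *	cache_neg_insert(ncp);		tail of nl_list on its neglist
 *	...
 *	if (cache_neg_hit_prep(ncp))	promotion threshold reached
 *		cache_neg_promote(ncp);		move to nl_hotlist
 *	else
 *		cache_neg_hit_finish(ncp);
 *	...
 *	cache_neg_evict();		demote one hot entry, then zap the
 *					least-hit of the first few cold ones
 */
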
969 static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
970     "Name cache negative entry statistics");
971 
972 SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0,
973     "Number of negative cache entries");
974 
975 static COUNTER_U64_DEFINE_EARLY(neg_created);
976 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created,
977     "Number of created negative entries");
978 
979 static COUNTER_U64_DEFINE_EARLY(neg_evicted);
980 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted,
981     "Number of evicted negative entries");
982 
983 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty);
984 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD,
985     &neg_evict_skipped_empty,
986     "Number of times evicting failed due to lack of entries");
987 
988 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed);
989 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD,
990     &neg_evict_skipped_missed,
991     "Number of times evicting failed due to target entry disappearing");
992 
993 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended);
994 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD,
995     &neg_evict_skipped_contended,
996     "Number of times evicting failed due to contention");
997 
998 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits,
999     "Number of cache hits (negative)");
1000 
1001 static int
1002 sysctl_neg_hot(SYSCTL_HANDLER_ARGS)
1003 {
1004 	int i, out;
1005 
1006 	out = 0;
1007 	for (i = 0; i < numneglists; i++)
1008 		out += neglists[i].nl_hotnum;
1009 
1010 	return (SYSCTL_OUT(req, &out, sizeof(out)));
1011 }
1012 SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD |
1013     CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I",
1014     "Number of hot negative entries");
1015 
1016 static void
1017 cache_neg_init(struct namecache *ncp)
1018 {
1019 	struct negstate *ns;
1020 
1021 	ncp->nc_flag |= NCF_NEGATIVE;
1022 	ns = NCP2NEGSTATE(ncp);
1023 	ns->neg_flag = 0;
1024 	ns->neg_hit = 0;
1025 	counter_u64_add(neg_created, 1);
1026 }
1027 
1028 #define CACHE_NEG_PROMOTION_THRESH 2
1029 
1030 static bool
1031 cache_neg_hit_prep(struct namecache *ncp)
1032 {
1033 	struct negstate *ns;
1034 	u_char n;
1035 
1036 	ns = NCP2NEGSTATE(ncp);
1037 	n = atomic_load_char(&ns->neg_hit);
1038 	for (;;) {
1039 		if (n >= CACHE_NEG_PROMOTION_THRESH)
1040 			return (false);
1041 		if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1))
1042 			break;
1043 	}
1044 	return (n + 1 == CACHE_NEG_PROMOTION_THRESH);
1045 }
1046 
1047 /*
1048  * Nothing to do here but it is provided for completeness as some
1049  * cache_neg_hit_prep callers may end up returning without even
1050  * trying to promote.
1051  */
1052 #define cache_neg_hit_abort(ncp)	do { } while (0)
1053 
1054 static void
1055 cache_neg_hit_finish(struct namecache *ncp)
1056 {
1057 
1058 	SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name);
1059 	counter_u64_add(numneghits, 1);
1060 }
1061 
1062 /*
1063  * Move a negative entry to the hot list.
1064  */
1065 static void
1066 cache_neg_promote_locked(struct namecache *ncp)
1067 {
1068 	struct neglist *nl;
1069 	struct negstate *ns;
1070 
1071 	ns = NCP2NEGSTATE(ncp);
1072 	nl = NCP2NEGLIST(ncp);
1073 	mtx_assert(&nl->nl_lock, MA_OWNED);
1074 	if ((ns->neg_flag & NEG_HOT) == 0) {
1075 		TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
1076 		TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst);
1077 		nl->nl_hotnum++;
1078 		ns->neg_flag |= NEG_HOT;
1079 	}
1080 }
1081 
1082 /*
1083  * Move a hot negative entry to the cold list.
1084  */
1085 static void
1086 cache_neg_demote_locked(struct namecache *ncp)
1087 {
1088 	struct neglist *nl;
1089 	struct negstate *ns;
1090 
1091 	ns = NCP2NEGSTATE(ncp);
1092 	nl = NCP2NEGLIST(ncp);
1093 	mtx_assert(&nl->nl_lock, MA_OWNED);
1094 	MPASS(ns->neg_flag & NEG_HOT);
1095 	TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
1096 	TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
1097 	nl->nl_hotnum--;
1098 	ns->neg_flag &= ~NEG_HOT;
1099 	atomic_store_char(&ns->neg_hit, 0);
1100 }
1101 
1102 /*
1103  * Move a negative entry to the hot list if it matches the lookup.
1104  *
1105  * We have to take locks, but they may be contended and in the worst
1106  * case we may need to go off CPU. We don't want to spin within the
1107  * smr section and we can't block with it. Exiting the section means
1108  * the found entry could have been evicted. We are going to look it
1109  * up again.
1110  */
1111 static bool
1112 cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp,
1113     struct namecache *oncp, uint32_t hash)
1114 {
1115 	struct namecache *ncp;
1116 	struct neglist *nl;
1117 	u_char nc_flag;
1118 
1119 	nl = NCP2NEGLIST(oncp);
1120 
1121 	mtx_lock(&nl->nl_lock);
1122 	/*
1123 	 * For hash iteration.
1124 	 */
1125 	vfs_smr_enter();
1126 
1127 	/*
1128 	 * Avoid all surprises by only succeeding if we got the same entry and
1129 	 * bailing completely otherwise.
1130 	 * XXX There are no provisions to keep the vnode around, meaning we may
1131 	 * end up promoting a negative entry for a *new* vnode and returning
1132 	 * ENOENT on its account. This is the error we want to return anyway
1133 	 * and promotion is harmless.
1134 	 *
1135 	 * In particular at this point there can be a new ncp which matches the
1136 	 * search but hashes to a different neglist.
1137 	 */
1138 	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1139 		if (ncp == oncp)
1140 			break;
1141 	}
1142 
1143 	/*
1144 	 * No match to begin with.
1145 	 */
1146 	if (__predict_false(ncp == NULL)) {
1147 		goto out_abort;
1148 	}
1149 
1150 	/*
1151 	 * The newly found entry may be something different...
1152 	 */
1153 	if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1154 	    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
1155 		goto out_abort;
1156 	}
1157 
1158 	/*
1159 	 * ... and not even negative.
1160 	 */
1161 	nc_flag = atomic_load_char(&ncp->nc_flag);
1162 	if ((nc_flag & NCF_NEGATIVE) == 0) {
1163 		goto out_abort;
1164 	}
1165 
1166 	if (!cache_ncp_canuse(ncp)) {
1167 		goto out_abort;
1168 	}
1169 
1170 	cache_neg_promote_locked(ncp);
1171 	cache_neg_hit_finish(ncp);
1172 	vfs_smr_exit();
1173 	mtx_unlock(&nl->nl_lock);
1174 	return (true);
1175 out_abort:
1176 	vfs_smr_exit();
1177 	mtx_unlock(&nl->nl_lock);
1178 	return (false);
1179 }
1180 
1181 static void
1182 cache_neg_promote(struct namecache *ncp)
1183 {
1184 	struct neglist *nl;
1185 
1186 	nl = NCP2NEGLIST(ncp);
1187 	mtx_lock(&nl->nl_lock);
1188 	cache_neg_promote_locked(ncp);
1189 	mtx_unlock(&nl->nl_lock);
1190 }
1191 
1192 static void
1193 cache_neg_insert(struct namecache *ncp)
1194 {
1195 	struct neglist *nl;
1196 
1197 	MPASS(ncp->nc_flag & NCF_NEGATIVE);
1198 	cache_assert_bucket_locked(ncp);
1199 	nl = NCP2NEGLIST(ncp);
1200 	mtx_lock(&nl->nl_lock);
1201 	TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
1202 	mtx_unlock(&nl->nl_lock);
1203 	atomic_add_long(&numneg, 1);
1204 }
1205 
1206 static void
1207 cache_neg_remove(struct namecache *ncp)
1208 {
1209 	struct neglist *nl;
1210 	struct negstate *ns;
1211 
1212 	cache_assert_bucket_locked(ncp);
1213 	nl = NCP2NEGLIST(ncp);
1214 	ns = NCP2NEGSTATE(ncp);
1215 	mtx_lock(&nl->nl_lock);
1216 	if ((ns->neg_flag & NEG_HOT) != 0) {
1217 		TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
1218 		nl->nl_hotnum--;
1219 	} else {
1220 		TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
1221 	}
1222 	mtx_unlock(&nl->nl_lock);
1223 	atomic_subtract_long(&numneg, 1);
1224 }
1225 
1226 static struct neglist *
1227 cache_neg_evict_select_list(void)
1228 {
1229 	struct neglist *nl;
1230 	u_int c;
1231 
1232 	c = atomic_fetchadd_int(&neg_cycle, 1) + 1;
1233 	nl = &neglists[c % numneglists];
1234 	if (!mtx_trylock(&nl->nl_evict_lock)) {
1235 		counter_u64_add(neg_evict_skipped_contended, 1);
1236 		return (NULL);
1237 	}
1238 	return (nl);
1239 }
1240 
1241 static struct namecache *
1242 cache_neg_evict_select_entry(struct neglist *nl)
1243 {
1244 	struct namecache *ncp, *lncp;
1245 	struct negstate *ns, *lns;
1246 	int i;
1247 
1248 	mtx_assert(&nl->nl_evict_lock, MA_OWNED);
1249 	mtx_assert(&nl->nl_lock, MA_OWNED);
1250 	ncp = TAILQ_FIRST(&nl->nl_list);
1251 	if (ncp == NULL)
1252 		return (NULL);
1253 	lncp = ncp;
1254 	lns = NCP2NEGSTATE(lncp);
1255 	for (i = 1; i < 4; i++) {
1256 		ncp = TAILQ_NEXT(ncp, nc_dst);
1257 		if (ncp == NULL)
1258 			break;
1259 		ns = NCP2NEGSTATE(ncp);
1260 		if (ns->neg_hit < lns->neg_hit) {
1261 			lncp = ncp;
1262 			lns = ns;
1263 		}
1264 	}
1265 	return (lncp);
1266 }
1267 
1268 static bool
1269 cache_neg_evict(void)
1270 {
1271 	struct namecache *ncp, *ncp2;
1272 	struct neglist *nl;
1273 	struct vnode *dvp;
1274 	struct mtx *dvlp;
1275 	struct mtx *blp;
1276 	uint32_t hash;
1277 	u_char nlen;
1278 	bool evicted;
1279 
1280 	nl = cache_neg_evict_select_list();
1281 	if (nl == NULL) {
1282 		return (false);
1283 	}
1284 
1285 	mtx_lock(&nl->nl_lock);
1286 	ncp = TAILQ_FIRST(&nl->nl_hotlist);
1287 	if (ncp != NULL) {
1288 		cache_neg_demote_locked(ncp);
1289 	}
1290 	ncp = cache_neg_evict_select_entry(nl);
1291 	if (ncp == NULL) {
1292 		counter_u64_add(neg_evict_skipped_empty, 1);
1293 		mtx_unlock(&nl->nl_lock);
1294 		mtx_unlock(&nl->nl_evict_lock);
1295 		return (false);
1296 	}
1297 	nlen = ncp->nc_nlen;
1298 	dvp = ncp->nc_dvp;
1299 	hash = cache_get_hash(ncp->nc_name, nlen, dvp);
1300 	dvlp = VP2VNODELOCK(dvp);
1301 	blp = HASH2BUCKETLOCK(hash);
1302 	mtx_unlock(&nl->nl_lock);
1303 	mtx_unlock(&nl->nl_evict_lock);
1304 	mtx_lock(dvlp);
1305 	mtx_lock(blp);
1306 	/*
1307 	 * Note that since all locks were dropped above, the entry may be
1308 	 * gone or reallocated to be something else.
1309 	 */
1310 	CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) {
1311 		if (ncp2 == ncp && ncp2->nc_dvp == dvp &&
1312 		    ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0)
1313 			break;
1314 	}
1315 	if (ncp2 == NULL) {
1316 		counter_u64_add(neg_evict_skipped_missed, 1);
1317 		ncp = NULL;
1318 		evicted = false;
1319 	} else {
1320 		MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp));
1321 		MPASS(blp == NCP2BUCKETLOCK(ncp));
1322 		SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp,
1323 		    ncp->nc_name);
1324 		cache_zap_locked(ncp);
1325 		counter_u64_add(neg_evicted, 1);
1326 		evicted = true;
1327 	}
1328 	mtx_unlock(blp);
1329 	mtx_unlock(dvlp);
1330 	if (ncp != NULL)
1331 		cache_free(ncp);
1332 	return (evicted);
1333 }
1334 
1335 /*
1336  * Maybe evict a negative entry to create more room.
1337  *
1338  * The ncnegfactor parameter limits what fraction of the total count
1339  * can comprise of negative entries. However, if the cache is just
1340  * warming up this leads to excessive evictions.  As such, ncnegminpct
1341  * (recomputed to neg_min) dictates whether the above should be
1342  * applied.
1343  *
1344  * Try evicting if the cache is close to full capacity regardless of
1345  * other considerations.
1346  */
1347 static bool
1348 cache_neg_evict_cond(u_long lnumcache)
1349 {
1350 	u_long lnumneg;
1351 
1352 	if (ncsize - 1000 < lnumcache)
1353 		goto out_evict;
1354 	lnumneg = atomic_load_long(&numneg);
1355 	if (lnumneg < neg_min)
1356 		return (false);
1357 	if (lnumneg * ncnegfactor < lnumcache)
1358 		return (false);
1359 out_evict:
1360 	return (cache_neg_evict());
1361 }
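
/*
 * Worked example with hypothetical values: ncsize = 200000, the default
 * ncnegminpct = 3 (giving neg_min = 6000) and ncnegfactor = 5.  Eviction is
 * skipped while numneg < 6000 or while negative entries make up less than a
 * fifth of the cache (numneg * 5 < numcache); once the total count comes
 * within 1000 entries of ncsize, eviction is attempted unconditionally.
 */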
1362 
1363 /*
1364  * cache_zap_locked():
1365  *
1366  *   Removes a namecache entry from cache, whether it contains an actual
1367  *   pointer to a vnode or if it is just a negative cache entry.
1368  */
1369 static void
1370 cache_zap_locked(struct namecache *ncp)
1371 {
1372 	struct nchashhead *ncpp;
1373 
1374 	if (!(ncp->nc_flag & NCF_NEGATIVE))
1375 		cache_assert_vnode_locked(ncp->nc_vp);
1376 	cache_assert_vnode_locked(ncp->nc_dvp);
1377 	cache_assert_bucket_locked(ncp);
1378 
1379 	cache_ncp_invalidate(ncp);
1380 
1381 	ncpp = NCP2BUCKET(ncp);
1382 	CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
1383 	if (!(ncp->nc_flag & NCF_NEGATIVE)) {
1384 		SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
1385 		    ncp->nc_name, ncp->nc_vp);
1386 		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
1387 		if (ncp == ncp->nc_vp->v_cache_dd) {
1388 			vn_seqc_write_begin_unheld(ncp->nc_vp);
1389 			ncp->nc_vp->v_cache_dd = NULL;
1390 			vn_seqc_write_end(ncp->nc_vp);
1391 		}
1392 	} else {
1393 		SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
1394 		    ncp->nc_name);
1395 		cache_neg_remove(ncp);
1396 	}
1397 	if (ncp->nc_flag & NCF_ISDOTDOT) {
1398 		if (ncp == ncp->nc_dvp->v_cache_dd) {
1399 			vn_seqc_write_begin_unheld(ncp->nc_dvp);
1400 			ncp->nc_dvp->v_cache_dd = NULL;
1401 			vn_seqc_write_end(ncp->nc_dvp);
1402 		}
1403 	} else {
1404 		LIST_REMOVE(ncp, nc_src);
1405 		if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
1406 			ncp->nc_flag |= NCF_DVDROP;
1407 		}
1408 	}
1409 }
1410 
1411 static void
1412 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
1413 {
1414 	struct mtx *blp;
1415 
1416 	MPASS(ncp->nc_dvp == vp);
1417 	MPASS(ncp->nc_flag & NCF_NEGATIVE);
1418 	cache_assert_vnode_locked(vp);
1419 
1420 	blp = NCP2BUCKETLOCK(ncp);
1421 	mtx_lock(blp);
1422 	cache_zap_locked(ncp);
1423 	mtx_unlock(blp);
1424 }
1425 
1426 static bool
1427 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
1428     struct mtx **vlpp)
1429 {
1430 	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
1431 	struct mtx *blp;
1432 
1433 	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
1434 	cache_assert_vnode_locked(vp);
1435 
1436 	if (ncp->nc_flag & NCF_NEGATIVE) {
1437 		if (*vlpp != NULL) {
1438 			mtx_unlock(*vlpp);
1439 			*vlpp = NULL;
1440 		}
1441 		cache_zap_negative_locked_vnode_kl(ncp, vp);
1442 		return (true);
1443 	}
1444 
1445 	pvlp = VP2VNODELOCK(vp);
1446 	blp = NCP2BUCKETLOCK(ncp);
1447 	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
1448 	vlp2 = VP2VNODELOCK(ncp->nc_vp);
1449 
1450 	if (*vlpp == vlp1 || *vlpp == vlp2) {
1451 		to_unlock = *vlpp;
1452 		*vlpp = NULL;
1453 	} else {
1454 		if (*vlpp != NULL) {
1455 			mtx_unlock(*vlpp);
1456 			*vlpp = NULL;
1457 		}
1458 		cache_sort_vnodes(&vlp1, &vlp2);
1459 		if (vlp1 == pvlp) {
1460 			mtx_lock(vlp2);
1461 			to_unlock = vlp2;
1462 		} else {
1463 			if (!mtx_trylock(vlp1))
1464 				goto out_relock;
1465 			to_unlock = vlp1;
1466 		}
1467 	}
1468 	mtx_lock(blp);
1469 	cache_zap_locked(ncp);
1470 	mtx_unlock(blp);
1471 	if (to_unlock != NULL)
1472 		mtx_unlock(to_unlock);
1473 	return (true);
1474 
1475 out_relock:
1476 	mtx_unlock(vlp2);
1477 	mtx_lock(vlp1);
1478 	mtx_lock(vlp2);
1479 	MPASS(*vlpp == NULL);
1480 	*vlpp = vlp1;
1481 	return (false);
1482 }
1483 
1484 /*
1485  * If trylocking failed we can get here. We know enough to take all needed locks
1486  * in the right order and re-lookup the entry.
1487  */
1488 static int
1489 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1490     struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
1491     struct mtx *blp)
1492 {
1493 	struct namecache *rncp;
1494 
1495 	cache_assert_bucket_unlocked(ncp);
1496 
1497 	cache_sort_vnodes(&dvlp, &vlp);
1498 	cache_lock_vnodes(dvlp, vlp);
1499 	mtx_lock(blp);
1500 	CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
1501 		if (rncp == ncp && rncp->nc_dvp == dvp &&
1502 		    rncp->nc_nlen == cnp->cn_namelen &&
1503 		    !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
1504 			break;
1505 	}
1506 	if (rncp != NULL) {
1507 		cache_zap_locked(rncp);
1508 		mtx_unlock(blp);
1509 		cache_unlock_vnodes(dvlp, vlp);
1510 		counter_u64_add(zap_bucket_relock_success, 1);
1511 		return (0);
1512 	}
1513 
1514 	mtx_unlock(blp);
1515 	cache_unlock_vnodes(dvlp, vlp);
1516 	return (EAGAIN);
1517 }
1518 
1519 static int __noinline
1520 cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp,
1521     uint32_t hash, struct mtx *blp)
1522 {
1523 	struct mtx *dvlp, *vlp;
1524 	struct vnode *dvp;
1525 
1526 	cache_assert_bucket_locked(ncp);
1527 
1528 	dvlp = VP2VNODELOCK(ncp->nc_dvp);
1529 	vlp = NULL;
1530 	if (!(ncp->nc_flag & NCF_NEGATIVE))
1531 		vlp = VP2VNODELOCK(ncp->nc_vp);
1532 	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1533 		cache_zap_locked(ncp);
1534 		mtx_unlock(blp);
1535 		cache_unlock_vnodes(dvlp, vlp);
1536 		return (0);
1537 	}
1538 
1539 	dvp = ncp->nc_dvp;
1540 	mtx_unlock(blp);
1541 	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1542 }
1543 
1544 static __noinline int
1545 cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
1546 {
1547 	struct namecache *ncp;
1548 	struct mtx *blp;
1549 	struct mtx *dvlp, *dvlp2;
1550 	uint32_t hash;
1551 	int error;
1552 
1553 	if (cnp->cn_namelen == 2 &&
1554 	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
1555 		dvlp = VP2VNODELOCK(dvp);
1556 		dvlp2 = NULL;
1557 		mtx_lock(dvlp);
1558 retry_dotdot:
1559 		ncp = dvp->v_cache_dd;
1560 		if (ncp == NULL) {
1561 			mtx_unlock(dvlp);
1562 			if (dvlp2 != NULL)
1563 				mtx_unlock(dvlp2);
1564 			SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1565 			return (0);
1566 		}
1567 		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1568 			if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2))
1569 				goto retry_dotdot;
1570 			MPASS(dvp->v_cache_dd == NULL);
1571 			mtx_unlock(dvlp);
1572 			if (dvlp2 != NULL)
1573 				mtx_unlock(dvlp2);
1574 			cache_free(ncp);
1575 		} else {
1576 			vn_seqc_write_begin(dvp);
1577 			dvp->v_cache_dd = NULL;
1578 			vn_seqc_write_end(dvp);
1579 			mtx_unlock(dvlp);
1580 			if (dvlp2 != NULL)
1581 				mtx_unlock(dvlp2);
1582 		}
1583 		SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1584 		return (1);
1585 	}
1586 
1587 	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1588 	blp = HASH2BUCKETLOCK(hash);
1589 retry:
1590 	if (CK_SLIST_EMPTY(NCHHASH(hash)))
1591 		goto out_no_entry;
1592 
1593 	mtx_lock(blp);
1594 
1595 	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1596 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1597 		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1598 			break;
1599 	}
1600 
1601 	if (ncp == NULL) {
1602 		mtx_unlock(blp);
1603 		goto out_no_entry;
1604 	}
1605 
1606 	error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
1607 	if (__predict_false(error != 0)) {
1608 		zap_bucket_fail++;
1609 		goto retry;
1610 	}
1611 	counter_u64_add(numposzaps, 1);
1612 	SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1613 	cache_free(ncp);
1614 	return (1);
1615 out_no_entry:
1616 	counter_u64_add(nummisszap, 1);
1617 	SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1618 	return (0);
1619 }
1620 
1621 static int __noinline
1622 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1623     struct timespec *tsp, int *ticksp)
1624 {
1625 	int ltype;
1626 
1627 	*vpp = dvp;
1628 	counter_u64_add(dothits, 1);
1629 	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
1630 	if (tsp != NULL)
1631 		timespecclear(tsp);
1632 	if (ticksp != NULL)
1633 		*ticksp = ticks;
1634 	vrefact(*vpp);
1635 	/*
1636 	 * When we lookup "." we still can be asked to lock it
1637 	 * differently...
1638 	 */
1639 	ltype = cnp->cn_lkflags & LK_TYPE_MASK;
1640 	if (ltype != VOP_ISLOCKED(*vpp)) {
1641 		if (ltype == LK_EXCLUSIVE) {
1642 			vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
1643 			if (VN_IS_DOOMED((*vpp))) {
1644 				/* forced unmount */
1645 				vrele(*vpp);
1646 				*vpp = NULL;
1647 				return (ENOENT);
1648 			}
1649 		} else
1650 			vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
1651 	}
1652 	return (-1);
1653 }
1654 
1655 static int __noinline
1656 cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1657     struct timespec *tsp, int *ticksp)
1658 {
1659 	struct namecache_ts *ncp_ts;
1660 	struct namecache *ncp;
1661 	struct mtx *dvlp;
1662 	enum vgetstate vs;
1663 	int error, ltype;
1664 	bool whiteout;
1665 
1666 	MPASS((cnp->cn_flags & ISDOTDOT) != 0);
1667 
1668 	if ((cnp->cn_flags & MAKEENTRY) == 0) {
1669 		cache_remove_cnp(dvp, cnp);
1670 		return (0);
1671 	}
1672 
1673 	counter_u64_add(dotdothits, 1);
1674 retry:
1675 	dvlp = VP2VNODELOCK(dvp);
1676 	mtx_lock(dvlp);
1677 	ncp = dvp->v_cache_dd;
1678 	if (ncp == NULL) {
1679 		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, "..", NULL);
1680 		mtx_unlock(dvlp);
1681 		return (0);
1682 	}
1683 	if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1684 		if (ncp->nc_flag & NCF_NEGATIVE)
1685 			*vpp = NULL;
1686 		else
1687 			*vpp = ncp->nc_vp;
1688 	} else
1689 		*vpp = ncp->nc_dvp;
1690 	if (*vpp == NULL)
1691 		goto negative_success;
1692 	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp);
1693 	cache_out_ts(ncp, tsp, ticksp);
1694 	if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
1695 	    NCF_DTS && tsp != NULL) {
1696 		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1697 		*tsp = ncp_ts->nc_dotdottime;
1698 	}
1699 
1700 	MPASS(dvp != *vpp);
1701 	ltype = VOP_ISLOCKED(dvp);
1702 	VOP_UNLOCK(dvp);
1703 	vs = vget_prep(*vpp);
1704 	mtx_unlock(dvlp);
1705 	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1706 	vn_lock(dvp, ltype | LK_RETRY);
1707 	if (VN_IS_DOOMED(dvp)) {
1708 		if (error == 0)
1709 			vput(*vpp);
1710 		*vpp = NULL;
1711 		return (ENOENT);
1712 	}
1713 	if (error) {
1714 		*vpp = NULL;
1715 		goto retry;
1716 	}
1717 	return (-1);
1718 negative_success:
1719 	if (__predict_false(cnp->cn_nameiop == CREATE)) {
1720 		if (cnp->cn_flags & ISLASTCN) {
1721 			counter_u64_add(numnegzaps, 1);
1722 			cache_zap_negative_locked_vnode_kl(ncp, dvp);
1723 			mtx_unlock(dvlp);
1724 			cache_free(ncp);
1725 			return (0);
1726 		}
1727 	}
1728 
1729 	whiteout = (ncp->nc_flag & NCF_WHITE);
1730 	cache_out_ts(ncp, tsp, ticksp);
1731 	if (cache_neg_hit_prep(ncp))
1732 		cache_neg_promote(ncp);
1733 	else
1734 		cache_neg_hit_finish(ncp);
1735 	mtx_unlock(dvlp);
1736 	if (whiteout)
1737 		cnp->cn_flags |= ISWHITEOUT;
1738 	return (ENOENT);
1739 }
1740 
1741 /**
1742  * Lookup a name in the name cache
1743  *
1744  * # Arguments
1745  *
1746  * - dvp:	Parent directory in which to search.
1747  * - vpp:	Return argument.  Will contain desired vnode on cache hit.
1748  * - cnp:	Parameters of the name search.  The most interesting bits of
1749  *   		the cn_flags field have the following meanings:
1750  *   	- MAKEENTRY:	If clear, free an entry from the cache rather than look
1751  *   			it up.
1752  *   	- ISDOTDOT:	Must be set if and only if cn_nameptr == ".."
1753  * - tsp:	Return storage for cache timestamp.  On a successful (positive
1754  *   		or negative) lookup, tsp will be filled with any timespec that
1755  *   		was stored when this cache entry was created.  However, it will
1756  *   		be clear for "." entries.
1757  * - ticksp:	Return storage for alternate cache timestamp.  On a successful
1758  *   		(positive or negative) lookup, it will contain the ticks value
1759  *   		that was current when the cache entry was created, unless cnp
1760  *   		was ".".
1761  *
1762  * Either both tsp and ticksp have to be provided or neither of them.
1763  *
1764  * # Returns
1765  *
1766  * - -1:	A positive cache hit.  vpp will contain the desired vnode.
1767  * - ENOENT:	A negative cache hit, or dvp was recycled out from under us due
1768  *		to a forced unmount.  vpp will not be modified.  If the entry
1769  *		is a whiteout, then the ISWHITEOUT flag will be set in
1770  *		cnp->cn_flags.
1771  * - 0:		A cache miss.  vpp will not be modified.
1772  *
1773  * # Locking
1774  *
1775  * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
1776  * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
1777  * lock is not recursively acquired.
1778  */
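
/*
 * Illustrative sketch (hypothetical caller, not compiled): how a consumer of
 * cache_lookup() is expected to dispatch on the return values documented
 * above.  vfs_cache_lookup() further down in this file is the canonical
 * in-tree user of this pattern.
 */
#if 0
static int
example_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)
{
	int error;

	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
	if (error == 0)			/* cache miss, ask the filesystem */
		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
	if (error == -1)		/* positive hit: *vpp is locked and ref'd */
		return (0);
	return (error);			/* ENOENT: negative hit (or doomed dvp) */
}
#endif
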
1779 static int __noinline
1780 cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1781     struct timespec *tsp, int *ticksp)
1782 {
1783 	struct namecache *ncp;
1784 	struct mtx *blp;
1785 	uint32_t hash;
1786 	enum vgetstate vs;
1787 	int error;
1788 	bool whiteout;
1789 
1790 	MPASS((cnp->cn_flags & ISDOTDOT) == 0);
1791 	MPASS((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) != 0);
1792 
1793 retry:
1794 	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1795 	blp = HASH2BUCKETLOCK(hash);
1796 	mtx_lock(blp);
1797 
1798 	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1799 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1800 		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1801 			break;
1802 	}
1803 
1804 	if (__predict_false(ncp == NULL)) {
1805 		mtx_unlock(blp);
1806 		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
1807 		    NULL);
1808 		counter_u64_add(nummiss, 1);
1809 		return (0);
1810 	}
1811 
1812 	if (ncp->nc_flag & NCF_NEGATIVE)
1813 		goto negative_success;
1814 
1815 	counter_u64_add(numposhits, 1);
1816 	*vpp = ncp->nc_vp;
1817 	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
1818 	cache_out_ts(ncp, tsp, ticksp);
1819 	MPASS(dvp != *vpp);
1820 	vs = vget_prep(*vpp);
1821 	mtx_unlock(blp);
1822 	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1823 	if (error) {
1824 		*vpp = NULL;
1825 		goto retry;
1826 	}
1827 	return (-1);
1828 negative_success:
1829 	/*
1830 	 * We don't get here with regular lookup apart from corner cases.
1831 	 */
1832 	if (__predict_true(cnp->cn_nameiop == CREATE)) {
1833 		if (cnp->cn_flags & ISLASTCN) {
1834 			counter_u64_add(numnegzaps, 1);
1835 			error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
1836 			if (__predict_false(error != 0)) {
1837 				zap_bucket_fail2++;
1838 				goto retry;
1839 			}
1840 			cache_free(ncp);
1841 			return (0);
1842 		}
1843 	}
1844 
1845 	whiteout = (ncp->nc_flag & NCF_WHITE);
1846 	cache_out_ts(ncp, tsp, ticksp);
1847 	if (cache_neg_hit_prep(ncp))
1848 		cache_neg_promote(ncp);
1849 	else
1850 		cache_neg_hit_finish(ncp);
1851 	mtx_unlock(blp);
1852 	if (whiteout)
1853 		cnp->cn_flags |= ISWHITEOUT;
1854 	return (ENOENT);
1855 }
1856 
1857 int
1858 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1859     struct timespec *tsp, int *ticksp)
1860 {
1861 	struct namecache *ncp;
1862 	uint32_t hash;
1863 	enum vgetstate vs;
1864 	int error;
1865 	bool whiteout, neg_promote;
1866 	u_short nc_flag;
1867 
1868 	MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL));
1869 
1870 #ifdef DEBUG_CACHE
1871 	if (__predict_false(!doingcache)) {
1872 		cnp->cn_flags &= ~MAKEENTRY;
1873 		return (0);
1874 	}
1875 #endif
1876 
1877 	if (__predict_false(cnp->cn_nameptr[0] == '.')) {
1878 		if (cnp->cn_namelen == 1)
1879 			return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
1880 		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.')
1881 			return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp));
1882 	}
1883 
1884 	MPASS((cnp->cn_flags & ISDOTDOT) == 0);
1885 
1886 	if ((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) == 0) {
1887 		cache_remove_cnp(dvp, cnp);
1888 		return (0);
1889 	}
1890 
1891 	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1892 	vfs_smr_enter();
1893 
1894 	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1895 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1896 		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1897 			break;
1898 	}
1899 
1900 	if (__predict_false(ncp == NULL)) {
1901 		vfs_smr_exit();
1902 		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
1903 		    NULL);
1904 		counter_u64_add(nummiss, 1);
1905 		return (0);
1906 	}
1907 
1908 	nc_flag = atomic_load_char(&ncp->nc_flag);
1909 	if (nc_flag & NCF_NEGATIVE)
1910 		goto negative_success;
1911 
1912 	counter_u64_add(numposhits, 1);
1913 	*vpp = ncp->nc_vp;
1914 	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
1915 	cache_out_ts(ncp, tsp, ticksp);
1916 	MPASS(dvp != *vpp);
1917 	if (!cache_ncp_canuse(ncp)) {
1918 		vfs_smr_exit();
1919 		*vpp = NULL;
1920 		goto out_fallback;
1921 	}
1922 	vs = vget_prep_smr(*vpp);
1923 	vfs_smr_exit();
1924 	if (__predict_false(vs == VGET_NONE)) {
1925 		*vpp = NULL;
1926 		goto out_fallback;
1927 	}
1928 	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1929 	if (error) {
1930 		*vpp = NULL;
1931 		goto out_fallback;
1932 	}
1933 	return (-1);
1934 negative_success:
1935 	if (cnp->cn_nameiop == CREATE) {
1936 		if (cnp->cn_flags & ISLASTCN) {
1937 			vfs_smr_exit();
1938 			goto out_fallback;
1939 		}
1940 	}
1941 
1942 	cache_out_ts(ncp, tsp, ticksp);
1943 	whiteout = (ncp->nc_flag & NCF_WHITE);
1944 	neg_promote = cache_neg_hit_prep(ncp);
1945 	if (!cache_ncp_canuse(ncp)) {
1946 		cache_neg_hit_abort(ncp);
1947 		vfs_smr_exit();
1948 		goto out_fallback;
1949 	}
1950 	if (neg_promote) {
1951 		vfs_smr_exit();
1952 		if (!cache_neg_promote_cond(dvp, cnp, ncp, hash))
1953 			goto out_fallback;
1954 	} else {
1955 		cache_neg_hit_finish(ncp);
1956 		vfs_smr_exit();
1957 	}
1958 	if (whiteout)
1959 		cnp->cn_flags |= ISWHITEOUT;
1960 	return (ENOENT);
1961 out_fallback:
1962 	return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp));
1963 }
1964 
1965 struct celockstate {
1966 	struct mtx *vlp[3];
1967 	struct mtx *blp[2];
1968 };
1969 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
1970 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
1971 
1972 static inline void
1973 cache_celockstate_init(struct celockstate *cel)
1974 {
1975 
1976 	bzero(cel, sizeof(*cel));
1977 }
1978 
1979 static void
1980 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
1981     struct vnode *dvp)
1982 {
1983 	struct mtx *vlp1, *vlp2;
1984 
1985 	MPASS(cel->vlp[0] == NULL);
1986 	MPASS(cel->vlp[1] == NULL);
1987 	MPASS(cel->vlp[2] == NULL);
1988 
1989 	MPASS(vp != NULL || dvp != NULL);
1990 
1991 	vlp1 = VP2VNODELOCK(vp);
1992 	vlp2 = VP2VNODELOCK(dvp);
1993 	cache_sort_vnodes(&vlp1, &vlp2);
1994 
1995 	if (vlp1 != NULL) {
1996 		mtx_lock(vlp1);
1997 		cel->vlp[0] = vlp1;
1998 	}
1999 	mtx_lock(vlp2);
2000 	cel->vlp[1] = vlp2;
2001 }
2002 
2003 static void
2004 cache_unlock_vnodes_cel(struct celockstate *cel)
2005 {
2006 
2007 	MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
2008 
2009 	if (cel->vlp[0] != NULL)
2010 		mtx_unlock(cel->vlp[0]);
2011 	if (cel->vlp[1] != NULL)
2012 		mtx_unlock(cel->vlp[1]);
2013 	if (cel->vlp[2] != NULL)
2014 		mtx_unlock(cel->vlp[2]);
2015 }
2016 
2017 static bool
2018 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
2019 {
2020 	struct mtx *vlp;
2021 	bool ret;
2022 
2023 	cache_assert_vlp_locked(cel->vlp[0]);
2024 	cache_assert_vlp_locked(cel->vlp[1]);
2025 	MPASS(cel->vlp[2] == NULL);
2026 
2027 	MPASS(vp != NULL);
2028 	vlp = VP2VNODELOCK(vp);
2029 
2030 	ret = true;
2031 	if (vlp >= cel->vlp[1]) {
2032 		mtx_lock(vlp);
2033 	} else {
2034 		if (mtx_trylock(vlp))
2035 			goto out;
2036 		cache_lock_vnodes_cel_3_failures++;
2037 		cache_unlock_vnodes_cel(cel);
2038 		if (vlp < cel->vlp[0]) {
2039 			mtx_lock(vlp);
2040 			mtx_lock(cel->vlp[0]);
2041 			mtx_lock(cel->vlp[1]);
2042 		} else {
2043 			if (cel->vlp[0] != NULL)
2044 				mtx_lock(cel->vlp[0]);
2045 			mtx_lock(vlp);
2046 			mtx_lock(cel->vlp[1]);
2047 		}
2048 		ret = false;
2049 	}
2050 out:
2051 	cel->vlp[2] = vlp;
2052 	return (ret);
2053 }
2054 
2055 static void
2056 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1,
2057     struct mtx *blp2)
2058 {
2059 
2060 	MPASS(cel->blp[0] == NULL);
2061 	MPASS(cel->blp[1] == NULL);
2062 
2063 	cache_sort_vnodes(&blp1, &blp2);
2064 
2065 	if (blp1 != NULL) {
2066 		mtx_lock(blp1);
2067 		cel->blp[0] = blp1;
2068 	}
2069 	mtx_lock(blp2);
2070 	cel->blp[1] = blp2;
2071 }
2072 
2073 static void
2074 cache_unlock_buckets_cel(struct celockstate *cel)
2075 {
2076 
2077 	if (cel->blp[0] != NULL)
2078 		mtx_unlock(cel->blp[0]);
2079 	mtx_unlock(cel->blp[1]);
2080 }
2081 
2082 /*
2083  * Lock part of the cache affected by the insertion.
2084  *
2085  * This means vnodelocks for dvp, vp and the relevant bucketlock.
2086  * However, insertion can result in removal of an old entry. In this
2087  * case we have an additional vnode and bucketlock pair to lock.
2088  *
2089  * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
2090  * preserving the locking order (smaller address first).
2091  */
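
/*
 * A note on why the order matters: since every path in this file takes its
 * mutexes in ascending address order (established by cache_sort_vnodes() in
 * the helpers above), two threads needing overlapping sets of vnode and
 * bucket locks can never each hold a lock the other is waiting for, which
 * rules out deadlock.
 */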
2092 static void
2093 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
2094     uint32_t hash)
2095 {
2096 	struct namecache *ncp;
2097 	struct mtx *blps[2];
2098 
2099 	blps[0] = HASH2BUCKETLOCK(hash);
2100 	for (;;) {
2101 		blps[1] = NULL;
2102 		cache_lock_vnodes_cel(cel, dvp, vp);
2103 		if (vp == NULL || vp->v_type != VDIR)
2104 			break;
2105 		ncp = vp->v_cache_dd;
2106 		if (ncp == NULL)
2107 			break;
2108 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2109 			break;
2110 		MPASS(ncp->nc_dvp == vp);
2111 		blps[1] = NCP2BUCKETLOCK(ncp);
2112 		if (ncp->nc_flag & NCF_NEGATIVE)
2113 			break;
2114 		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
2115 			break;
2116 		/*
2117 		 * All vnodes got re-locked. Re-validate the state and if
2118 		 * nothing changed we are done. Otherwise restart.
2119 		 */
2120 		if (ncp == vp->v_cache_dd &&
2121 		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
2122 		    blps[1] == NCP2BUCKETLOCK(ncp) &&
2123 		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
2124 			break;
2125 		cache_unlock_vnodes_cel(cel);
2126 		cel->vlp[0] = NULL;
2127 		cel->vlp[1] = NULL;
2128 		cel->vlp[2] = NULL;
2129 	}
2130 	cache_lock_buckets_cel(cel, blps[0], blps[1]);
2131 }
2132 
2133 static void
2134 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
2135     uint32_t hash)
2136 {
2137 	struct namecache *ncp;
2138 	struct mtx *blps[2];
2139 
2140 	blps[0] = HASH2BUCKETLOCK(hash);
2141 	for (;;) {
2142 		blps[1] = NULL;
2143 		cache_lock_vnodes_cel(cel, dvp, vp);
2144 		ncp = dvp->v_cache_dd;
2145 		if (ncp == NULL)
2146 			break;
2147 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2148 			break;
2149 		MPASS(ncp->nc_dvp == dvp);
2150 		blps[1] = NCP2BUCKETLOCK(ncp);
2151 		if (ncp->nc_flag & NCF_NEGATIVE)
2152 			break;
2153 		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
2154 			break;
2155 		if (ncp == dvp->v_cache_dd &&
2156 		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
2157 		    blps[1] == NCP2BUCKETLOCK(ncp) &&
2158 		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
2159 			break;
2160 		cache_unlock_vnodes_cel(cel);
2161 		cel->vlp[0] = NULL;
2162 		cel->vlp[1] = NULL;
2163 		cel->vlp[2] = NULL;
2164 	}
2165 	cache_lock_buckets_cel(cel, blps[0], blps[1]);
2166 }
2167 
2168 static void
2169 cache_enter_unlock(struct celockstate *cel)
2170 {
2171 
2172 	cache_unlock_buckets_cel(cel);
2173 	cache_unlock_vnodes_cel(cel);
2174 }
2175 
2176 static void __noinline
2177 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
2178     struct componentname *cnp)
2179 {
2180 	struct celockstate cel;
2181 	struct namecache *ncp;
2182 	uint32_t hash;
2183 	int len;
2184 
2185 	if (dvp->v_cache_dd == NULL)
2186 		return;
2187 	len = cnp->cn_namelen;
2188 	cache_celockstate_init(&cel);
2189 	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
2190 	cache_enter_lock_dd(&cel, dvp, vp, hash);
2191 	vn_seqc_write_begin(dvp);
2192 	ncp = dvp->v_cache_dd;
2193 	if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
2194 		KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
2195 		cache_zap_locked(ncp);
2196 	} else {
2197 		ncp = NULL;
2198 	}
2199 	dvp->v_cache_dd = NULL;
2200 	vn_seqc_write_end(dvp);
2201 	cache_enter_unlock(&cel);
2202 	if (ncp != NULL)
2203 		cache_free(ncp);
2204 }
2205 
2206 /*
2207  * Add an entry to the cache.
2208  */
2209 void
2210 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
2211     struct timespec *tsp, struct timespec *dtsp)
2212 {
2213 	struct celockstate cel;
2214 	struct namecache *ncp, *n2, *ndd;
2215 	struct namecache_ts *ncp_ts;
2216 	struct nchashhead *ncpp;
2217 	uint32_t hash;
2218 	int flag;
2219 	int len;
2220 
2221 	VNPASS(dvp != vp, dvp);
2222 	VNPASS(!VN_IS_DOOMED(dvp), dvp);
2223 	VNPASS(dvp->v_type != VNON, dvp);
2224 	if (vp != NULL) {
2225 		VNPASS(!VN_IS_DOOMED(vp), vp);
2226 		VNPASS(vp->v_type != VNON, vp);
2227 	}
2228 
2229 #ifdef DEBUG_CACHE
2230 	if (__predict_false(!doingcache))
2231 		return;
2232 #endif
2233 
2234 	flag = 0;
2235 	if (__predict_false(cnp->cn_nameptr[0] == '.')) {
2236 		if (cnp->cn_namelen == 1)
2237 			return;
2238 		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
2239 			cache_enter_dotdot_prep(dvp, vp, cnp);
2240 			flag = NCF_ISDOTDOT;
2241 		}
2242 	}
2243 
2244 	ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
2245 	if (ncp == NULL)
2246 		return;
2247 
2248 	cache_celockstate_init(&cel);
2249 	ndd = NULL;
2250 	ncp_ts = NULL;
2251 
2252 	/*
2253 	 * Calculate the hash key and setup as much of the new
2254 	 * namecache entry as possible before acquiring the lock.
2255 	 */
2256 	ncp->nc_flag = flag | NCF_WIP;
2257 	ncp->nc_vp = vp;
2258 	if (vp == NULL)
2259 		cache_neg_init(ncp);
2260 	ncp->nc_dvp = dvp;
2261 	if (tsp != NULL) {
2262 		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
2263 		ncp_ts->nc_time = *tsp;
2264 		ncp_ts->nc_ticks = ticks;
2265 		ncp_ts->nc_nc.nc_flag |= NCF_TS;
2266 		if (dtsp != NULL) {
2267 			ncp_ts->nc_dotdottime = *dtsp;
2268 			ncp_ts->nc_nc.nc_flag |= NCF_DTS;
2269 		}
2270 	}
2271 	len = ncp->nc_nlen = cnp->cn_namelen;
2272 	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
2273 	memcpy(ncp->nc_name, cnp->cn_nameptr, len);
2274 	ncp->nc_name[len] = '\0';
2275 	cache_enter_lock(&cel, dvp, vp, hash);
2276 
2277 	/*
2278 	 * See if this vnode or negative entry is already in the cache
2279 	 * with this name.  This can happen with concurrent lookups of
2280 	 * the same path name.
2281 	 */
2282 	ncpp = NCHHASH(hash);
2283 	CK_SLIST_FOREACH(n2, ncpp, nc_hash) {
2284 		if (n2->nc_dvp == dvp &&
2285 		    n2->nc_nlen == cnp->cn_namelen &&
2286 		    !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
2287 			MPASS(cache_ncp_canuse(n2));
2288 			if ((n2->nc_flag & NCF_NEGATIVE) != 0)
2289 				KASSERT(vp == NULL,
2290 				    ("%s: found entry pointing to a different vnode (%p != %p)",
2291 				    __func__, NULL, vp));
2292 			else
2293 				KASSERT(n2->nc_vp == vp,
2294 				    ("%s: found entry pointing to a different vnode (%p != %p)",
2295 				    __func__, n2->nc_vp, vp));
2296 			/*
2297 			 * Entries are supposed to be immutable unless in the
2298 			 * process of getting destroyed.  Accommodating
2299 			 * changing timestamps is possible but not worth it.
2300 			 * This should be harmless in terms of correctness, in
2301 			 * the worst case resulting in an earlier expiration.
2302 			 * Alternatively, the found entry can be replaced
2303 			 * altogether.
2304 			 */
2305 			MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS)));
2306 #if 0
2307 			if (tsp != NULL) {
2308 				KASSERT((n2->nc_flag & NCF_TS) != 0,
2309 				    ("no NCF_TS"));
2310 				n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
2311 				n2_ts->nc_time = ncp_ts->nc_time;
2312 				n2_ts->nc_ticks = ncp_ts->nc_ticks;
2313 				if (dtsp != NULL) {
2314 					n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
2315 					n2_ts->nc_nc.nc_flag |= NCF_DTS;
2316 				}
2317 			}
2318 #endif
2319 			SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name,
2320 			    vp);
2321 			goto out_unlock_free;
2322 		}
2323 	}
2324 
2325 	if (flag == NCF_ISDOTDOT) {
2326 		/*
2327 		 * See if we are trying to add a ".." entry, but some other lookup
2328 		 * has already populated the v_cache_dd pointer.
2329 		 */
2330 		if (dvp->v_cache_dd != NULL)
2331 			goto out_unlock_free;
2332 		KASSERT(vp == NULL || vp->v_type == VDIR,
2333 		    ("wrong vnode type %p", vp));
2334 		vn_seqc_write_begin(dvp);
2335 		dvp->v_cache_dd = ncp;
2336 		vn_seqc_write_end(dvp);
2337 	}
2338 
2339 	if (vp != NULL) {
2340 		if (flag != NCF_ISDOTDOT) {
2341 			/*
2342 			 * For this case, the cache entry maps both the
2343 			 * directory name in it and the name ".." for the
2344 			 * directory's parent.
2345 			 */
2346 			vn_seqc_write_begin(vp);
2347 			if ((ndd = vp->v_cache_dd) != NULL) {
2348 				if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
2349 					cache_zap_locked(ndd);
2350 				else
2351 					ndd = NULL;
2352 			}
2353 			vp->v_cache_dd = ncp;
2354 			vn_seqc_write_end(vp);
2355 		} else if (vp->v_type != VDIR) {
2356 			if (vp->v_cache_dd != NULL) {
2357 				vn_seqc_write_begin(vp);
2358 				vp->v_cache_dd = NULL;
2359 				vn_seqc_write_end(vp);
2360 			}
2361 		}
2362 	}
2363 
2364 	if (flag != NCF_ISDOTDOT) {
2365 		if (LIST_EMPTY(&dvp->v_cache_src)) {
2366 			cache_hold_vnode(dvp);
2367 		}
2368 		LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
2369 	}
2370 
2371 	/*
2372 	 * If the entry is "negative", we place it into the
2373 	 * "negative" cache queue, otherwise, we place it into the
2374 	 * destination vnode's cache entries queue.
2375 	 */
2376 	if (vp != NULL) {
2377 		TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
2378 		SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
2379 		    vp);
2380 	} else {
2381 		if (cnp->cn_flags & ISWHITEOUT)
2382 			ncp->nc_flag |= NCF_WHITE;
2383 		cache_neg_insert(ncp);
2384 		SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
2385 		    ncp->nc_name);
2386 	}
2387 
2388 	/*
2389 	 * Insert the new namecache entry into the appropriate chain
2390 	 * within the cache entries table.
2391 	 */
2392 	CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash);
2393 
2394 	atomic_thread_fence_rel();
2395 	/*
2396 	 * Mark the entry as fully constructed.
2397 	 * It is immutable past this point until its removal.
2398 	 */
2399 	atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP);
2400 
2401 	cache_enter_unlock(&cel);
2402 	if (ndd != NULL)
2403 		cache_free(ndd);
2404 	return;
2405 out_unlock_free:
2406 	cache_enter_unlock(&cel);
2407 	cache_free(ncp);
2408 	return;
2409 }
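
/*
 * Sketch of typical use from a filesystem's lookup routine (control flow and
 * surrounding names are illustrative only).  cache_enter() is the
 * NULL-timestamp convenience wrapper for cache_enter_time() from sys/vnode.h;
 * passing vp == NULL records a negative entry.
 */
#if 0
	/* a successful scan left the child vnode in *vpp: */
	if (cnp->cn_flags & MAKEENTRY)
		cache_enter(dvp, *vpp, cnp);

	/* a definitive miss (per-filesystem policy may add conditions): */
	if (cnp->cn_flags & MAKEENTRY)
		cache_enter(dvp, NULL, cnp);
#endif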
2410 
2411 static u_int
2412 cache_roundup_2(u_int val)
2413 {
2414 	u_int res;
2415 
2416 	for (res = 1; res <= val; res <<= 1)
2417 		continue;
2418 
2419 	return (res);
2420 }
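
/*
 * Note: cache_roundup_2() returns the smallest power of two strictly greater
 * than its argument (0 -> 1, 7 -> 8, 8 -> 16).  Callers wanting roughly that
 * many slots as a power of two compensate for the overshoot, e.g.
 * nchinittbl() below divides the result by two.
 */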
2421 
2422 static struct nchashhead *
2423 nchinittbl(u_long elements, u_long *hashmask)
2424 {
2425 	struct nchashhead *hashtbl;
2426 	u_long hashsize, i;
2427 
2428 	hashsize = cache_roundup_2(elements) / 2;
2429 
2430 	hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK);
2431 	for (i = 0; i < hashsize; i++)
2432 		CK_SLIST_INIT(&hashtbl[i]);
2433 	*hashmask = hashsize - 1;
2434 	return (hashtbl);
2435 }
2436 
2437 static void
2438 ncfreetbl(struct nchashhead *hashtbl)
2439 {
2440 
2441 	free(hashtbl, M_VFSCACHE);
2442 }
2443 
2444 /*
2445  * Name cache initialization, from vfs_init() when we are booting
2446  */
2447 static void
2448 nchinit(void *dummy __unused)
2449 {
2450 	u_int i;
2451 
2452 	cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE,
2453 	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2454 	cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE,
2455 	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2456 	cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE,
2457 	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2458 	cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE,
2459 	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2460 
2461 	VFS_SMR_ZONE_SET(cache_zone_small);
2462 	VFS_SMR_ZONE_SET(cache_zone_small_ts);
2463 	VFS_SMR_ZONE_SET(cache_zone_large);
2464 	VFS_SMR_ZONE_SET(cache_zone_large_ts);
2465 
2466 	ncsize = desiredvnodes * ncsizefactor;
2467 	cache_recalc_neg_min(ncnegminpct);
2468 	nchashtbl = nchinittbl(desiredvnodes * 2, &nchash);
2469 	ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
2470 	if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
2471 		ncbuckethash = 7;
2472 	if (ncbuckethash > nchash)
2473 		ncbuckethash = nchash;
2474 	bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
2475 	    M_WAITOK | M_ZERO);
2476 	for (i = 0; i < numbucketlocks; i++)
2477 		mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE);
2478 	ncvnodehash = ncbuckethash;
2479 	vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
2480 	    M_WAITOK | M_ZERO);
2481 	for (i = 0; i < numvnodelocks; i++)
2482 		mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
2483 
2484 	for (i = 0; i < numneglists; i++) {
2485 		mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF);
2486 		mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
2487 		TAILQ_INIT(&neglists[i].nl_list);
2488 		TAILQ_INIT(&neglists[i].nl_hotlist);
2489 	}
2490 }
2491 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
2492 
2493 void
2494 cache_vnode_init(struct vnode *vp)
2495 {
2496 
2497 	LIST_INIT(&vp->v_cache_src);
2498 	TAILQ_INIT(&vp->v_cache_dst);
2499 	vp->v_cache_dd = NULL;
2500 	cache_prehash(vp);
2501 }
2502 
2503 void
2504 cache_changesize(u_long newmaxvnodes)
2505 {
2506 	struct nchashhead *new_nchashtbl, *old_nchashtbl;
2507 	u_long new_nchash, old_nchash;
2508 	struct namecache *ncp;
2509 	uint32_t hash;
2510 	u_long newncsize;
2511 	int i;
2512 
2513 	newncsize = newmaxvnodes * ncsizefactor;
2514 	newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
2515 	if (newmaxvnodes < numbucketlocks)
2516 		newmaxvnodes = numbucketlocks;
2517 
2518 	new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash);
2519 	/* If same hash table size, nothing to do */
2520 	if (nchash == new_nchash) {
2521 		ncfreetbl(new_nchashtbl);
2522 		return;
2523 	}
2524 	/*
2525 	 * Move everything from the old hash table to the new table.
2526 	 * None of the namecache entries can disappear while we do this, since
2527 	 * removal requires the entry's bucket lock and we hold all of them.
2528 	 */
2529 	cache_lock_all_vnodes();
2530 	cache_lock_all_buckets();
2531 	old_nchashtbl = nchashtbl;
2532 	old_nchash = nchash;
2533 	nchashtbl = new_nchashtbl;
2534 	nchash = new_nchash;
2535 	for (i = 0; i <= old_nchash; i++) {
2536 		while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) {
2537 			hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
2538 			    ncp->nc_dvp);
2539 			CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash);
2540 			CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
2541 		}
2542 	}
2543 	ncsize = newncsize;
2544 	cache_recalc_neg_min(ncnegminpct);
2545 	cache_unlock_all_buckets();
2546 	cache_unlock_all_vnodes();
2547 	ncfreetbl(old_nchashtbl);
2548 }
2549 
2550 /*
2551  * Invalidate all entries from and to a particular vnode.
2552  */
2553 static void
2554 cache_purge_impl(struct vnode *vp)
2555 {
2556 	struct cache_freebatch batch;
2557 	struct namecache *ncp;
2558 	struct mtx *vlp, *vlp2;
2559 
2560 	TAILQ_INIT(&batch);
2561 	vlp = VP2VNODELOCK(vp);
2562 	vlp2 = NULL;
2563 	mtx_lock(vlp);
2564 retry:
2565 	while (!LIST_EMPTY(&vp->v_cache_src)) {
2566 		ncp = LIST_FIRST(&vp->v_cache_src);
2567 		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2568 			goto retry;
2569 		TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2570 	}
2571 	while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
2572 		ncp = TAILQ_FIRST(&vp->v_cache_dst);
2573 		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2574 			goto retry;
2575 		TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2576 	}
2577 	ncp = vp->v_cache_dd;
2578 	if (ncp != NULL) {
2579 		KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
2580 		   ("lost dotdot link"));
2581 		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2582 			goto retry;
2583 		TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2584 	}
2585 	KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
2586 	mtx_unlock(vlp);
2587 	if (vlp2 != NULL)
2588 		mtx_unlock(vlp2);
2589 	cache_free_batch(&batch);
2590 }
2591 
2592 /*
2593  * Opportunistic check to see if there is anything to do.
2594  */
2595 static bool
2596 cache_has_entries(struct vnode *vp)
2597 {
2598 
2599 	if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2600 	    vp->v_cache_dd == NULL)
2601 		return (false);
2602 	return (true);
2603 }
2604 
2605 void
2606 cache_purge(struct vnode *vp)
2607 {
2608 
2609 	SDT_PROBE1(vfs, namecache, purge, done, vp);
2610 	if (!cache_has_entries(vp))
2611 		return;
2612 	cache_purge_impl(vp);
2613 }
2614 
2615 /*
2616  * Only to be used by vgone.
2617  */
2618 void
2619 cache_purge_vgone(struct vnode *vp)
2620 {
2621 	struct mtx *vlp;
2622 
2623 	VNPASS(VN_IS_DOOMED(vp), vp);
2624 	if (cache_has_entries(vp)) {
2625 		cache_purge_impl(vp);
2626 		return;
2627 	}
2628 
2629 	/*
2630 	 * Serialize against a potential thread doing cache_purge.
2631 	 */
2632 	vlp = VP2VNODELOCK(vp);
2633 	mtx_wait_unlocked(vlp);
2634 	if (cache_has_entries(vp)) {
2635 		cache_purge_impl(vp);
2636 		return;
2637 	}
2638 	return;
2639 }
2640 
2641 /*
2642  * Invalidate all negative entries for a particular directory vnode.
2643  */
2644 void
2645 cache_purge_negative(struct vnode *vp)
2646 {
2647 	struct cache_freebatch batch;
2648 	struct namecache *ncp, *nnp;
2649 	struct mtx *vlp;
2650 
2651 	SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
2652 	if (LIST_EMPTY(&vp->v_cache_src))
2653 		return;
2654 	TAILQ_INIT(&batch);
2655 	vlp = VP2VNODELOCK(vp);
2656 	mtx_lock(vlp);
2657 	LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
2658 		if (!(ncp->nc_flag & NCF_NEGATIVE))
2659 			continue;
2660 		cache_zap_negative_locked_vnode_kl(ncp, vp);
2661 		TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2662 	}
2663 	mtx_unlock(vlp);
2664 	cache_free_batch(&batch);
2665 }
2666 
2667 /*
2668  * Entry points for modifying VOP operations.
2669  */
2670 void
2671 cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
2672     struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp)
2673 {
2674 
2675 	ASSERT_VOP_IN_SEQC(fdvp);
2676 	ASSERT_VOP_IN_SEQC(fvp);
2677 	ASSERT_VOP_IN_SEQC(tdvp);
2678 	if (tvp != NULL)
2679 		ASSERT_VOP_IN_SEQC(tvp);
2680 
2681 	cache_purge(fvp);
2682 	if (tvp != NULL) {
2683 		cache_purge(tvp);
2684 		KASSERT(!cache_remove_cnp(tdvp, tcnp),
2685 		    ("%s: lingering negative entry", __func__));
2686 	} else {
2687 		cache_remove_cnp(tdvp, tcnp);
2688 	}
2689 }
2690 
2691 void
2692 cache_vop_rmdir(struct vnode *dvp, struct vnode *vp)
2693 {
2694 
2695 	ASSERT_VOP_IN_SEQC(dvp);
2696 	ASSERT_VOP_IN_SEQC(vp);
2697 	cache_purge(vp);
2698 }
2699 
2700 #ifdef INVARIANTS
2701 /*
2702  * Validate that if an entry exists it matches.
2703  */
2704 void
2705 cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
2706 {
2707 	struct namecache *ncp;
2708 	struct mtx *blp;
2709 	uint32_t hash;
2710 
2711 	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
2712 	if (CK_SLIST_EMPTY(NCHHASH(hash)))
2713 		return;
2714 	blp = HASH2BUCKETLOCK(hash);
2715 	mtx_lock(blp);
2716 	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
2717 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
2718 		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) {
2719 			if (ncp->nc_vp != vp)
2720 				panic("%s: mismatch (%p != %p); ncp %p [%s] dvp %p vp %p\n",
2721 				    __func__, vp, ncp->nc_vp, ncp, ncp->nc_name, ncp->nc_dvp,
2722 				    ncp->nc_vp);
2723 		}
2724 	}
2725 	mtx_unlock(blp);
2726 }
2727 #endif
2728 
2729 /*
2730  * Flush all entries referencing a particular filesystem.
2731  */
2732 void
2733 cache_purgevfs(struct mount *mp)
2734 {
2735 	struct vnode *vp, *mvp;
2736 
2737 	SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
2738 	/*
2739 	 * Somewhat wasteful iteration over all vnodes. Would be better to
2740 	 * support filtering and avoid the interlock to begin with.
2741 	 */
2742 	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
2743 		if (!cache_has_entries(vp)) {
2744 			VI_UNLOCK(vp);
2745 			continue;
2746 		}
2747 		vholdl(vp);
2748 		VI_UNLOCK(vp);
2749 		cache_purge(vp);
2750 		vdrop(vp);
2751 	}
2752 }
2753 
2754 /*
2755  * Perform canonical checks and a cache lookup; pass the request on to the
2756  * filesystem through VOP_CACHEDLOOKUP() only on a cache miss.
2757  */
2758 
2759 int
2760 vfs_cache_lookup(struct vop_lookup_args *ap)
2761 {
2762 	struct vnode *dvp;
2763 	int error;
2764 	struct vnode **vpp = ap->a_vpp;
2765 	struct componentname *cnp = ap->a_cnp;
2766 	int flags = cnp->cn_flags;
2767 
2768 	*vpp = NULL;
2769 	dvp = ap->a_dvp;
2770 
2771 	if (dvp->v_type != VDIR)
2772 		return (ENOTDIR);
2773 
2774 	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
2775 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
2776 		return (EROFS);
2777 
2778 	error = vn_dir_check_exec(dvp, cnp);
2779 	if (error != 0)
2780 		return (error);
2781 
2782 	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
2783 	if (error == 0)
2784 		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
2785 	if (error == -1)
2786 		return (0);
2787 	return (error);
2788 }
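
/*
 * Sketch of how a filesystem plugs into the wrapper above (the "examplefs"
 * names are hypothetical): vop_lookup is routed through vfs_cache_lookup()
 * and the real, uncached lookup is exposed as vop_cachedlookup, so it only
 * runs on cache misses.
 */
#if 0
static struct vop_vector examplefs_vnodeops = {
	.vop_default =		&default_vnodeops,
	.vop_lookup =		vfs_cache_lookup,
	.vop_cachedlookup =	examplefs_cachedlookup,
	/* ... other entry points ... */
};
#endif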
2789 
2790 /* Implementation of the getcwd syscall. */
2791 int
2792 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
2793 {
2794 	char *buf, *retbuf;
2795 	size_t buflen;
2796 	int error;
2797 
2798 	buflen = uap->buflen;
2799 	if (__predict_false(buflen < 2))
2800 		return (EINVAL);
2801 	if (buflen > MAXPATHLEN)
2802 		buflen = MAXPATHLEN;
2803 
2804 	buf = uma_zalloc(namei_zone, M_WAITOK);
2805 	error = vn_getcwd(buf, &retbuf, &buflen);
2806 	if (error == 0)
2807 		error = copyout(retbuf, uap->buf, buflen);
2808 	uma_zfree(namei_zone, buf);
2809 	return (error);
2810 }
2811 
2812 int
2813 vn_getcwd(char *buf, char **retbuf, size_t *buflen)
2814 {
2815 	struct pwd *pwd;
2816 	int error;
2817 
2818 	vfs_smr_enter();
2819 	pwd = pwd_get_smr();
2820 	error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf,
2821 	    buflen, 0);
2822 	VFS_SMR_ASSERT_NOT_ENTERED();
2823 	if (error < 0) {
2824 		pwd = pwd_hold(curthread);
2825 		error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf,
2826 		    retbuf, buflen);
2827 		pwd_drop(pwd);
2828 	}
2829 
2830 #ifdef KTRACE
2831 	if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
2832 		ktrnamei(*retbuf);
2833 #endif
2834 	return (error);
2835 }
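
/*
 * The pattern above recurs in the vn_fullpath* wrappers below: the lockless
 * resolver vn_fullpath_any_smr() returns 0 on success, a positive errno on a
 * hard failure, and a negative value to mean "could not complete under SMR,
 * retry with the locked variant", hence the fallback only when error < 0.
 */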
2836 
2837 static int
2838 kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
2839     size_t size, int flags, enum uio_seg pathseg)
2840 {
2841 	struct nameidata nd;
2842 	char *retbuf, *freebuf;
2843 	int error;
2844 
2845 	if (flags != 0)
2846 		return (EINVAL);
2847 	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1,
2848 	    pathseg, path, fd, &cap_fstat_rights, td);
2849 	if ((error = namei(&nd)) != 0)
2850 		return (error);
2851 	error = vn_fullpath_hardlink(&nd, &retbuf, &freebuf, &size);
2852 	if (error == 0) {
2853 		error = copyout(retbuf, buf, size);
2854 		free(freebuf, M_TEMP);
2855 	}
2856 	NDFREE(&nd, 0);
2857 	return (error);
2858 }
2859 
2860 int
2861 sys___realpathat(struct thread *td, struct __realpathat_args *uap)
2862 {
2863 
2864 	return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
2865 	    uap->flags, UIO_USERSPACE));
2866 }
2867 
2868 /*
2869  * Retrieve the full filesystem path that corresponds to a vnode from the
2870  * name cache (if available).
2871  */
2872 int
2873 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf)
2874 {
2875 	struct pwd *pwd;
2876 	char *buf;
2877 	size_t buflen;
2878 	int error;
2879 
2880 	if (__predict_false(vp == NULL))
2881 		return (EINVAL);
2882 
2883 	buflen = MAXPATHLEN;
2884 	buf = malloc(buflen, M_TEMP, M_WAITOK);
2885 	vfs_smr_enter();
2886 	pwd = pwd_get_smr();
2887 	error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, 0);
2888 	VFS_SMR_ASSERT_NOT_ENTERED();
2889 	if (error < 0) {
2890 		pwd = pwd_hold(curthread);
2891 		error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen);
2892 		pwd_drop(pwd);
2893 	}
2894 	if (error == 0)
2895 		*freebuf = buf;
2896 	else
2897 		free(buf, M_TEMP);
2898 	return (error);
2899 }
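
/*
 * Illustrative caller of vn_fullpath() (variable names are made up): on
 * success *retbuf points into an allocation which must be released by
 * freeing *freebuf with M_TEMP.
 */
#if 0
	char *fullpath, *freepath;

	freepath = NULL;
	if (vn_fullpath(vp, &fullpath, &freepath) == 0)
		printf("vnode %p resolves to %s\n", vp, fullpath);
	if (freepath != NULL)
		free(freepath, M_TEMP);
#endif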
2900 
2901 /*
2902  * This function is similar to vn_fullpath, but it attempts to lookup the
2903  * pathname relative to the global root mount point.  This is required for the
2904  * auditing sub-system, as audited pathnames must be absolute, relative to the
2905  * global root mount point.
2906  */
2907 int
2908 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf)
2909 {
2910 	char *buf;
2911 	size_t buflen;
2912 	int error;
2913 
2914 	if (__predict_false(vp == NULL))
2915 		return (EINVAL);
2916 	buflen = MAXPATHLEN;
2917 	buf = malloc(buflen, M_TEMP, M_WAITOK);
2918 	vfs_smr_enter();
2919 	error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, 0);
2920 	VFS_SMR_ASSERT_NOT_ENTERED();
2921 	if (error < 0) {
2922 		error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen);
2923 	}
2924 	if (error == 0)
2925 		*freebuf = buf;
2926 	else
2927 		free(buf, M_TEMP);
2928 	return (error);
2929 }
2930 
2931 static struct namecache *
2932 vn_dd_from_dst(struct vnode *vp)
2933 {
2934 	struct namecache *ncp;
2935 
2936 	cache_assert_vnode_locked(vp);
2937 	TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) {
2938 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2939 			return (ncp);
2940 	}
2941 	return (NULL);
2942 }
2943 
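/*
 * Resolve one pathname component: obtain the name under which *vp is linked
 * in its parent directory, preferably from the namecache, otherwise via
 * VOP_VPTOCNP().  On success the name is prepended at the end of the buffer
 * (at buf + *buflen after the call, with *buflen decremented accordingly)
 * and *vp is replaced with a referenced parent vnode; the reference on the
 * original vnode is released.  On failure the passed-in reference is dropped
 * as well.
 */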
2944 int
2945 vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen)
2946 {
2947 	struct vnode *dvp;
2948 	struct namecache *ncp;
2949 	struct mtx *vlp;
2950 	int error;
2951 
2952 	vlp = VP2VNODELOCK(*vp);
2953 	mtx_lock(vlp);
2954 	ncp = (*vp)->v_cache_dd;
2955 	if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) {
2956 		KASSERT(ncp == vn_dd_from_dst(*vp),
2957 		    ("%s: mismatch for dd entry (%p != %p)", __func__,
2958 		    ncp, vn_dd_from_dst(*vp)));
2959 	} else {
2960 		ncp = vn_dd_from_dst(*vp);
2961 	}
2962 	if (ncp != NULL) {
2963 		if (*buflen < ncp->nc_nlen) {
2964 			mtx_unlock(vlp);
2965 			vrele(*vp);
2966 			counter_u64_add(numfullpathfail4, 1);
2967 			error = ENOMEM;
2968 			SDT_PROBE3(vfs, namecache, fullpath, return, error,
2969 			    vp, NULL);
2970 			return (error);
2971 		}
2972 		*buflen -= ncp->nc_nlen;
2973 		memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
2974 		SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
2975 		    ncp->nc_name, vp);
2976 		dvp = *vp;
2977 		*vp = ncp->nc_dvp;
2978 		vref(*vp);
2979 		mtx_unlock(vlp);
2980 		vrele(dvp);
2981 		return (0);
2982 	}
2983 	SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
2984 
2985 	mtx_unlock(vlp);
2986 	vn_lock(*vp, LK_SHARED | LK_RETRY);
2987 	error = VOP_VPTOCNP(*vp, &dvp, buf, buflen);
2988 	vput(*vp);
2989 	if (error) {
2990 		counter_u64_add(numfullpathfail2, 1);
2991 		SDT_PROBE3(vfs, namecache, fullpath, return,  error, vp, NULL);
2992 		return (error);
2993 	}
2994 
2995 	*vp = dvp;
2996 	if (VN_IS_DOOMED(dvp)) {
2997 		/* forced unmount */
2998 		vrele(dvp);
2999 		error = ENOENT;
3000 		SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
3001 		return (error);
3002 	}
3003 	/*
3004 	 * *vp has its use count incremented still.
3005 	 */
3006 
3007 	return (0);
3008 }
3009 
3010 /*
3011  * Resolve a directory to a pathname.
3012  *
3013  * The name of the directory can always be found in the namecache or fetched
3014  * from the filesystem. There is also guaranteed to be only one parent, meaning
3015  * we can just follow vnodes up until we find the root.
3016  *
3017  * The vnode must be referenced.
3018  */
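
/*
 * Note on buffer handling shared by the vn_fullpath* helpers: the path is
 * assembled backwards, component by component, starting at the end of the
 * caller-supplied buffer.  *retbuf therefore ends up pointing into the
 * middle of buf rather than at its start, and *len is updated to the length
 * actually produced.
 */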
3019 static int
3020 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
3021     size_t *len, size_t addend)
3022 {
3023 #ifdef KDTRACE_HOOKS
3024 	struct vnode *startvp = vp;
3025 #endif
3026 	struct vnode *vp1;
3027 	size_t buflen;
3028 	int error;
3029 	bool slash_prefixed;
3030 
3031 	VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
3032 	VNPASS(vp->v_usecount > 0, vp);
3033 
3034 	buflen = *len;
3035 
3036 	slash_prefixed = true;
3037 	if (addend == 0) {
3038 		MPASS(*len >= 2);
3039 		buflen--;
3040 		buf[buflen] = '\0';
3041 		slash_prefixed = false;
3042 	}
3043 
3044 	error = 0;
3045 
3046 	SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
3047 	counter_u64_add(numfullpathcalls, 1);
3048 	while (vp != rdir && vp != rootvnode) {
3049 		/*
3050 		 * The vp vnode must be already fully constructed,
3051 		 * since it is either found in namecache or obtained
3052 		 * from VOP_VPTOCNP().  We may test for VV_ROOT safely
3053 		 * without obtaining the vnode lock.
3054 		 */
3055 		if ((vp->v_vflag & VV_ROOT) != 0) {
3056 			vn_lock(vp, LK_RETRY | LK_SHARED);
3057 
3058 			/*
3059 			 * With the vnode locked, check for races with
3060 			 * unmount, forced or not.  Note that we
3061 			 * already verified that vp is not equal to
3062 			 * the root vnode, which means that
3063 			 * mnt_vnodecovered can be NULL only for the
3064 			 * case of unmount.
3065 			 */
3066 			if (VN_IS_DOOMED(vp) ||
3067 			    (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
3068 			    vp1->v_mountedhere != vp->v_mount) {
3069 				vput(vp);
3070 				error = ENOENT;
3071 				SDT_PROBE3(vfs, namecache, fullpath, return,
3072 				    error, vp, NULL);
3073 				break;
3074 			}
3075 
3076 			vref(vp1);
3077 			vput(vp);
3078 			vp = vp1;
3079 			continue;
3080 		}
3081 		if (vp->v_type != VDIR) {
3082 			vrele(vp);
3083 			counter_u64_add(numfullpathfail1, 1);
3084 			error = ENOTDIR;
3085 			SDT_PROBE3(vfs, namecache, fullpath, return,
3086 			    error, vp, NULL);
3087 			break;
3088 		}
3089 		error = vn_vptocnp(&vp, buf, &buflen);
3090 		if (error)
3091 			break;
3092 		if (buflen == 0) {
3093 			vrele(vp);
3094 			error = ENOMEM;
3095 			SDT_PROBE3(vfs, namecache, fullpath, return, error,
3096 			    startvp, NULL);
3097 			break;
3098 		}
3099 		buf[--buflen] = '/';
3100 		slash_prefixed = true;
3101 	}
3102 	if (error)
3103 		return (error);
3104 	if (!slash_prefixed) {
3105 		if (buflen == 0) {
3106 			vrele(vp);
3107 			counter_u64_add(numfullpathfail4, 1);
3108 			SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
3109 			    startvp, NULL);
3110 			return (ENOMEM);
3111 		}
3112 		buf[--buflen] = '/';
3113 	}
3114 	counter_u64_add(numfullpathfound, 1);
3115 	vrele(vp);
3116 
3117 	*retbuf = buf + buflen;
3118 	SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
3119 	*len -= buflen;
3120 	*len += addend;
3121 	return (0);
3122 }
3123 
3124 /*
3125  * Resolve an arbitrary vnode to a pathname.
3126  *
3127  * Note 2 caveats:
3128  * - hardlinks are not tracked, thus if the vnode is not a directory this can
3129  *   resolve to a different path than the one used to find it
3130  * - namecache is not mandatory, meaning names are not guaranteed to be added
3131  *   (in which case resolving fails)
3132  */
3133 static void __inline
3134 cache_rev_failed_impl(int *reason, int line)
3135 {
3136 
3137 	*reason = line;
3138 }
3139 #define cache_rev_failed(var)	cache_rev_failed_impl((var), __LINE__)
3140 
3141 static int
3142 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
3143     char **retbuf, size_t *buflen, size_t addend)
3144 {
3145 #ifdef KDTRACE_HOOKS
3146 	struct vnode *startvp = vp;
3147 #endif
3148 	struct vnode *tvp;
3149 	struct mount *mp;
3150 	struct namecache *ncp;
3151 	size_t orig_buflen;
3152 	int reason;
3153 	int error;
3154 #ifdef KDTRACE_HOOKS
3155 	int i;
3156 #endif
3157 	seqc_t vp_seqc, tvp_seqc;
3158 	u_char nc_flag;
3159 
3160 	VFS_SMR_ASSERT_ENTERED();
3161 
3162 	if (!cache_fast_revlookup) {
3163 		vfs_smr_exit();
3164 		return (-1);
3165 	}
3166 
3167 	orig_buflen = *buflen;
3168 
3169 	if (addend == 0) {
3170 		MPASS(*buflen >= 2);
3171 		*buflen -= 1;
3172 		buf[*buflen] = '\0';
3173 	}
3174 
3175 	if (vp == rdir || vp == rootvnode) {
3176 		if (addend == 0) {
3177 			*buflen -= 1;
3178 			buf[*buflen] = '/';
3179 		}
3180 		goto out_ok;
3181 	}
3182 
3183 #ifdef KDTRACE_HOOKS
3184 	i = 0;
3185 #endif
3186 	error = -1;
3187 	ncp = NULL; /* for sdt probe down below */
3188 	vp_seqc = vn_seqc_read_any(vp);
3189 	if (seqc_in_modify(vp_seqc)) {
3190 		cache_rev_failed(&reason);
3191 		goto out_abort;
3192 	}
3193 
3194 	for (;;) {
3195 #ifdef KDTRACE_HOOKS
3196 		i++;
3197 #endif
3198 		if ((vp->v_vflag & VV_ROOT) != 0) {
3199 			mp = atomic_load_ptr(&vp->v_mount);
3200 			if (mp == NULL) {
3201 				cache_rev_failed(&reason);
3202 				goto out_abort;
3203 			}
3204 			tvp = atomic_load_ptr(&mp->mnt_vnodecovered);
3205 			tvp_seqc = vn_seqc_read_any(tvp);
3206 			if (seqc_in_modify(tvp_seqc)) {
3207 				cache_rev_failed(&reason);
3208 				goto out_abort;
3209 			}
3210 			if (!vn_seqc_consistent(vp, vp_seqc)) {
3211 				cache_rev_failed(&reason);
3212 				goto out_abort;
3213 			}
3214 			vp = tvp;
3215 			vp_seqc = tvp_seqc;
3216 			continue;
3217 		}
3218 		ncp = atomic_load_ptr(&vp->v_cache_dd);
3219 		if (ncp == NULL) {
3220 			cache_rev_failed(&reason);
3221 			goto out_abort;
3222 		}
3223 		nc_flag = atomic_load_char(&ncp->nc_flag);
3224 		if ((nc_flag & NCF_ISDOTDOT) != 0) {
3225 			cache_rev_failed(&reason);
3226 			goto out_abort;
3227 		}
3228 		if (!cache_ncp_canuse(ncp)) {
3229 			cache_rev_failed(&reason);
3230 			goto out_abort;
3231 		}
3232 		if (ncp->nc_nlen >= *buflen) {
3233 			cache_rev_failed(&reason);
3234 			error = ENOMEM;
3235 			goto out_abort;
3236 		}
3237 		*buflen -= ncp->nc_nlen;
3238 		memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
3239 		*buflen -= 1;
3240 		buf[*buflen] = '/';
3241 		tvp = ncp->nc_dvp;
3242 		tvp_seqc = vn_seqc_read_any(tvp);
3243 		if (seqc_in_modify(tvp_seqc)) {
3244 			cache_rev_failed(&reason);
3245 			goto out_abort;
3246 		}
3247 		if (!vn_seqc_consistent(vp, vp_seqc)) {
3248 			cache_rev_failed(&reason);
3249 			goto out_abort;
3250 		}
3251 		vp = tvp;
3252 		vp_seqc = tvp_seqc;
3253 		if (vp == rdir || vp == rootvnode)
3254 			break;
3255 	}
3256 out_ok:
3257 	vfs_smr_exit();
3258 	*retbuf = buf + *buflen;
3259 	*buflen = orig_buflen - *buflen + addend;
3260 	SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf);
3261 	return (0);
3262 
3263 out_abort:
3264 	*buflen = orig_buflen;
3265 	SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i);
3266 	vfs_smr_exit();
3267 	return (error);
3268 }
3269 
3270 static int
3271 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
3272     size_t *buflen)
3273 {
3274 	size_t orig_buflen, addend;
3275 	int error;
3276 
3277 	if (*buflen < 2)
3278 		return (EINVAL);
3279 
3280 	orig_buflen = *buflen;
3281 
3282 	vref(vp);
3283 	addend = 0;
3284 	if (vp->v_type != VDIR) {
3285 		*buflen -= 1;
3286 		buf[*buflen] = '\0';
3287 		error = vn_vptocnp(&vp, buf, buflen);
3288 		if (error)
3289 			return (error);
3290 		if (*buflen == 0) {
3291 			vrele(vp);
3292 			return (ENOMEM);
3293 		}
3294 		*buflen -= 1;
3295 		buf[*buflen] = '/';
3296 		addend = orig_buflen - *buflen;
3297 	}
3298 
3299 	return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, addend));
3300 }
3301 
3302 /*
3303  * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
3304  *
3305  * Since the namecache does not track hardlinks, the caller is expected to first
3306  * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei.
3307  *
3308  * Then we have 2 cases:
3309  * - if the found vnode is a directory, the path can be constructed just by
3310  *   following names up the chain
3311  * - otherwise we populate the buffer with the saved name and start resolving
3312  *   from the parent
3313  */
3314 static int
3315 vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, char **freebuf,
3316     size_t *buflen)
3317 {
3318 	char *buf, *tmpbuf;
3319 	struct pwd *pwd;
3320 	struct componentname *cnp;
3321 	struct vnode *vp;
3322 	size_t addend;
3323 	int error;
3324 	enum vtype type;
3325 
3326 	if (*buflen < 2)
3327 		return (EINVAL);
3328 	if (*buflen > MAXPATHLEN)
3329 		*buflen = MAXPATHLEN;
3330 
3331 	buf = malloc(*buflen, M_TEMP, M_WAITOK);
3332 
3333 	addend = 0;
3334 	vp = ndp->ni_vp;
3335 	/*
3336 	 * Check for VBAD to work around the vp_crossmp bug in lookup().
3337 	 *
3338 	 * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be
3339 	 * set to mount point's root vnode while ni_dvp will be vp_crossmp.
3340 	 * If the type is VDIR (like in this very case) we can skip looking
3341 	 * at ni_dvp in the first place. However, since vnodes get passed here
3342 	 * unlocked the target may transition to doomed state (type == VBAD)
3343 	 * before we get to evaluate the condition. If this happens, we will
3344 	 * populate part of the buffer and descend to vn_fullpath_dir with
3345 	 * vp == vp_crossmp. Prevent the problem by checking for VBAD.
3346 	 *
3347 	 * This should be atomic_load(&vp->v_type) but it is illegal to take
3348 	 * an address of a bit field, even if said field is sized to char.
3349 	 * Work around the problem by reading the value into a full-sized enum
3350 	 * and then re-reading it with atomic_load which will still prevent
3351 	 * the compiler from re-reading down the road.
3352 	 */
3353 	type = vp->v_type;
3354 	type = atomic_load_int(&type);
3355 	if (type == VBAD) {
3356 		error = ENOENT;
3357 		goto out_bad;
3358 	}
3359 	if (type != VDIR) {
3360 		cnp = &ndp->ni_cnd;
3361 		addend = cnp->cn_namelen + 2;
3362 		if (*buflen < addend) {
3363 			error = ENOMEM;
3364 			goto out_bad;
3365 		}
3366 		*buflen -= addend;
3367 		tmpbuf = buf + *buflen;
3368 		tmpbuf[0] = '/';
3369 		memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen);
3370 		tmpbuf[addend - 1] = '\0';
3371 		vp = ndp->ni_dvp;
3372 	}
3373 
3374 	vfs_smr_enter();
3375 	pwd = pwd_get_smr();
3376 	error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3377 	    addend);
3378 	VFS_SMR_ASSERT_NOT_ENTERED();
3379 	if (error < 0) {
3380 		pwd = pwd_hold(curthread);
3381 		vref(vp);
3382 		error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3383 		    addend);
3384 		pwd_drop(pwd);
3385 		if (error != 0)
3386 			goto out_bad;
3387 	}
3388 
3389 	*freebuf = buf;
3390 
3391 	return (0);
3392 out_bad:
3393 	free(buf, M_TEMP);
3394 	return (error);
3395 }
3396 
3397 struct vnode *
3398 vn_dir_dd_ino(struct vnode *vp)
3399 {
3400 	struct namecache *ncp;
3401 	struct vnode *ddvp;
3402 	struct mtx *vlp;
3403 	enum vgetstate vs;
3404 
3405 	ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
3406 	vlp = VP2VNODELOCK(vp);
3407 	mtx_lock(vlp);
3408 	TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
3409 		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
3410 			continue;
3411 		ddvp = ncp->nc_dvp;
3412 		vs = vget_prep(ddvp);
3413 		mtx_unlock(vlp);
3414 		if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
3415 			return (NULL);
3416 		return (ddvp);
3417 	}
3418 	mtx_unlock(vlp);
3419 	return (NULL);
3420 }
3421 
3422 int
3423 vn_commname(struct vnode *vp, char *buf, u_int buflen)
3424 {
3425 	struct namecache *ncp;
3426 	struct mtx *vlp;
3427 	int l;
3428 
3429 	vlp = VP2VNODELOCK(vp);
3430 	mtx_lock(vlp);
3431 	TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
3432 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
3433 			break;
3434 	if (ncp == NULL) {
3435 		mtx_unlock(vlp);
3436 		return (ENOENT);
3437 	}
3438 	l = min(ncp->nc_nlen, buflen - 1);
3439 	memcpy(buf, ncp->nc_name, l);
3440 	mtx_unlock(vlp);
3441 	buf[l] = '\0';
3442 	return (0);
3443 }
3444 
3445 /*
3446  * This function updates the path string to the vnode's full global path
3447  * and checks the size of the new path string against the pathlen argument.
3448  *
3449  * Requires a locked, referenced vnode.
3450  * Vnode is re-locked on success or ENODEV, otherwise unlocked.
3451  *
3452  * If vp is a directory, the call to vn_fullpath_global() always succeeds
3453  * because it falls back to the ".." lookup if the namecache lookup fails.
3454  */
3455 int
3456 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
3457     u_int pathlen)
3458 {
3459 	struct nameidata nd;
3460 	struct vnode *vp1;
3461 	char *rpath, *fbuf;
3462 	int error;
3463 
3464 	ASSERT_VOP_ELOCKED(vp, __func__);
3465 
3466 	/* Construct global filesystem path from vp. */
3467 	VOP_UNLOCK(vp);
3468 	error = vn_fullpath_global(vp, &rpath, &fbuf);
3469 
3470 	if (error != 0) {
3471 		vrele(vp);
3472 		return (error);
3473 	}
3474 
3475 	if (strlen(rpath) >= pathlen) {
3476 		vrele(vp);
3477 		error = ENAMETOOLONG;
3478 		goto out;
3479 	}
3480 
3481 	/*
3482 	 * Re-lookup the vnode by path to detect a possible rename.
3483 	 * As a side effect, the vnode is relocked.
3484 	 * If vnode was renamed, return ENOENT.
3485 	 */
3486 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
3487 	    UIO_SYSSPACE, path, td);
3488 	error = namei(&nd);
3489 	if (error != 0) {
3490 		vrele(vp);
3491 		goto out;
3492 	}
3493 	NDFREE(&nd, NDF_ONLY_PNBUF);
3494 	vp1 = nd.ni_vp;
3495 	vrele(vp);
3496 	if (vp1 == vp)
3497 		strcpy(path, rpath);
3498 	else {
3499 		vput(vp1);
3500 		error = ENOENT;
3501 	}
3502 
3503 out:
3504 	free(fbuf, M_TEMP);
3505 	return (error);
3506 }
3507 
3508 #ifdef DDB
3509 static void
3510 db_print_vpath(struct vnode *vp)
3511 {
3512 
3513 	while (vp != NULL) {
3514 		db_printf("%p: ", vp);
3515 		if (vp == rootvnode) {
3516 			db_printf("/");
3517 			vp = NULL;
3518 		} else {
3519 			if (vp->v_vflag & VV_ROOT) {
3520 				db_printf("<mount point>");
3521 				vp = vp->v_mount->mnt_vnodecovered;
3522 			} else {
3523 				struct namecache *ncp;
3524 				char *ncn;
3525 				int i;
3526 
3527 				ncp = TAILQ_FIRST(&vp->v_cache_dst);
3528 				if (ncp != NULL) {
3529 					ncn = ncp->nc_name;
3530 					for (i = 0; i < ncp->nc_nlen; i++)
3531 						db_printf("%c", *ncn++);
3532 					vp = ncp->nc_dvp;
3533 				} else {
3534 					vp = NULL;
3535 				}
3536 			}
3537 		}
3538 		db_printf("\n");
3539 	}
3540 
3541 	return;
3542 }
3543 
3544 DB_SHOW_COMMAND(vpath, db_show_vpath)
3545 {
3546 	struct vnode *vp;
3547 
3548 	if (!have_addr) {
3549 		db_printf("usage: show vpath <struct vnode *>\n");
3550 		return;
3551 	}
3552 
3553 	vp = (struct vnode *)addr;
3554 	db_print_vpath(vp);
3555 }
3556 
3557 #endif
3558 
3559 static bool __read_frequently cache_fast_lookup = true;
3560 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW,
3561     &cache_fast_lookup, 0, "");
3562 
3563 #define CACHE_FPL_FAILED	-2020
3564 
3565 static void
3566 cache_fpl_cleanup_cnp(struct componentname *cnp)
3567 {
3568 
3569 	uma_zfree(namei_zone, cnp->cn_pnbuf);
3570 #ifdef DIAGNOSTIC
3571 	cnp->cn_pnbuf = NULL;
3572 	cnp->cn_nameptr = NULL;
3573 #endif
3574 }
3575 
3576 static void
3577 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp)
3578 {
3579 	struct componentname *cnp;
3580 
3581 	cnp = &ndp->ni_cnd;
3582 	while (*(cnp->cn_nameptr) == '/') {
3583 		cnp->cn_nameptr++;
3584 		ndp->ni_pathlen--;
3585 	}
3586 
3587 	*dpp = ndp->ni_rootdir;
3588 }
3589 
3590 /*
3591  * Components of nameidata (or objects it can point to) which may
3592  * need restoring in case fast path lookup fails.
3593  */
3594 struct nameidata_saved {
3595 	long cn_namelen;
3596 	char *cn_nameptr;
3597 	size_t ni_pathlen;
3598 	int cn_flags;
3599 };
3600 
3601 struct cache_fpl {
3602 	struct nameidata *ndp;
3603 	struct componentname *cnp;
3604 	struct pwd *pwd;
3605 	struct vnode *dvp;
3606 	struct vnode *tvp;
3607 	seqc_t dvp_seqc;
3608 	seqc_t tvp_seqc;
3609 	struct nameidata_saved snd;
3610 	int line;
3611 	enum cache_fpl_status status:8;
3612 	bool in_smr;
3613 	bool fsearch;
3614 };
3615 
3616 static void
3617 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd)
3618 {
3619 
3620 	snd->cn_flags = fpl->ndp->ni_cnd.cn_flags;
3621 	snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen;
3622 	snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
3623 	snd->ni_pathlen = fpl->ndp->ni_pathlen;
3624 }
3625 
3626 static void
3627 cache_fpl_restore_partial(struct cache_fpl *fpl, struct nameidata_saved *snd)
3628 {
3629 
3630 	fpl->ndp->ni_cnd.cn_flags = snd->cn_flags;
3631 	fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen;
3632 	fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr;
3633 	fpl->ndp->ni_pathlen = snd->ni_pathlen;
3634 }
3635 
3636 static void
3637 cache_fpl_restore_abort(struct cache_fpl *fpl, struct nameidata_saved *snd)
3638 {
3639 
3640 	cache_fpl_restore_partial(fpl, snd);
3641 	/*
3642 	 * It is 0 on entry by API contract.
3643 	 */
3644 	fpl->ndp->ni_resflags = 0;
3645 }
3646 
3647 #ifdef INVARIANTS
3648 #define cache_fpl_smr_assert_entered(fpl) ({			\
3649 	struct cache_fpl *_fpl = (fpl);				\
3650 	MPASS(_fpl->in_smr == true);				\
3651 	VFS_SMR_ASSERT_ENTERED();				\
3652 })
3653 #define cache_fpl_smr_assert_not_entered(fpl) ({		\
3654 	struct cache_fpl *_fpl = (fpl);				\
3655 	MPASS(_fpl->in_smr == false);				\
3656 	VFS_SMR_ASSERT_NOT_ENTERED();				\
3657 })
3658 #else
3659 #define cache_fpl_smr_assert_entered(fpl) do { } while (0)
3660 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
3661 #endif
3662 
3663 #define cache_fpl_smr_enter_initial(fpl) ({			\
3664 	struct cache_fpl *_fpl = (fpl);				\
3665 	vfs_smr_enter();					\
3666 	_fpl->in_smr = true;					\
3667 })
3668 
3669 #define cache_fpl_smr_enter(fpl) ({				\
3670 	struct cache_fpl *_fpl = (fpl);				\
3671 	MPASS(_fpl->in_smr == false);				\
3672 	vfs_smr_enter();					\
3673 	_fpl->in_smr = true;					\
3674 })
3675 
3676 #define cache_fpl_smr_exit(fpl) ({				\
3677 	struct cache_fpl *_fpl = (fpl);				\
3678 	MPASS(_fpl->in_smr == true);				\
3679 	vfs_smr_exit();						\
3680 	_fpl->in_smr = false;					\
3681 })
3682 
3683 static int
3684 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
3685 {
3686 
3687 	if (fpl->status != CACHE_FPL_STATUS_UNSET) {
3688 		KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
3689 		    ("%s: converting to abort from %d at %d, set at %d\n",
3690 		    __func__, fpl->status, line, fpl->line));
3691 	}
3692 	fpl->status = CACHE_FPL_STATUS_ABORTED;
3693 	fpl->line = line;
3694 	return (CACHE_FPL_FAILED);
3695 }
3696 
3697 #define cache_fpl_aborted(x)	cache_fpl_aborted_impl((x), __LINE__)
3698 
3699 static int
3700 cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
3701 {
3702 
3703 	KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3704 	    ("%s: setting to partial at %d, but already set to %d at %d\n",
3705 	    __func__, line, fpl->status, fpl->line));
3706 	cache_fpl_smr_assert_entered(fpl);
3707 	fpl->status = CACHE_FPL_STATUS_PARTIAL;
3708 	fpl->line = line;
3709 	return (CACHE_FPL_FAILED);
3710 }
3711 
3712 #define cache_fpl_partial(x)	cache_fpl_partial_impl((x), __LINE__)
3713 
3714 static int
3715 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line)
3716 {
3717 
3718 	KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3719 	    ("%s: setting to handled at %d, but already set to %d at %d\n",
3720 	    __func__, line, fpl->status, fpl->line));
3721 	cache_fpl_smr_assert_not_entered(fpl);
3722 	MPASS(error != CACHE_FPL_FAILED);
3723 	fpl->status = CACHE_FPL_STATUS_HANDLED;
3724 	fpl->line = line;
3725 	return (error);
3726 }
3727 
3728 #define cache_fpl_handled(x, e)	cache_fpl_handled_impl((x), (e), __LINE__)
3729 
3730 static bool
3731 cache_fpl_terminated(struct cache_fpl *fpl)
3732 {
3733 
3734 	return (fpl->status != CACHE_FPL_STATUS_UNSET);
3735 }
3736 
3737 #define CACHE_FPL_SUPPORTED_CN_FLAGS \
3738 	(NC_NOMAKEENTRY | NC_KEEPPOSENTRY | LOCKLEAF | LOCKPARENT | WANTPARENT | \
3739 	 FOLLOW | LOCKSHARED | SAVENAME | SAVESTART | WILLBEDIR | ISOPEN | \
3740 	 NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK)
3741 
3742 #define CACHE_FPL_INTERNAL_CN_FLAGS \
3743 	(ISDOTDOT | MAKEENTRY | ISLASTCN)
3744 
3745 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
3746     "supported and internal flags overlap");
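
/*
 * The assertion above guards against the internal flags sneaking into the
 * supported set: cache_fplookup_partial_setup() below re-derives MAKEENTRY,
 * ISLASTCN and ISDOTDOT itself when handing off to the regular lookup, while
 * cache_can_fplookup() relies on the supported mask to reject requests the
 * fast path cannot honor.
 */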
3747 
3748 static bool cache_fplookup_need_climb_mount(struct cache_fpl *fpl);
3749 
3750 static bool
3751 cache_fpl_islastcn(struct nameidata *ndp)
3752 {
3753 
3754 	return (*ndp->ni_next == 0);
3755 }
3756 
3757 static bool
3758 cache_fpl_isdotdot(struct componentname *cnp)
3759 {
3760 
3761 	if (cnp->cn_namelen == 2 &&
3762 	    cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
3763 		return (true);
3764 	return (false);
3765 }
3766 
3767 static bool
3768 cache_can_fplookup(struct cache_fpl *fpl)
3769 {
3770 	struct nameidata *ndp;
3771 	struct componentname *cnp;
3772 	struct thread *td;
3773 
3774 	ndp = fpl->ndp;
3775 	cnp = fpl->cnp;
3776 	td = cnp->cn_thread;
3777 
3778 	if (!cache_fast_lookup) {
3779 		cache_fpl_aborted(fpl);
3780 		return (false);
3781 	}
3782 #ifdef MAC
3783 	if (mac_vnode_check_lookup_enabled()) {
3784 		cache_fpl_aborted(fpl);
3785 		return (false);
3786 	}
3787 #endif
3788 	if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
3789 		cache_fpl_aborted(fpl);
3790 		return (false);
3791 	}
3792 	if (IN_CAPABILITY_MODE(td)) {
3793 		cache_fpl_aborted(fpl);
3794 		return (false);
3795 	}
3796 	if (AUDITING_TD(td)) {
3797 		cache_fpl_aborted(fpl);
3798 		return (false);
3799 	}
3800 	if (ndp->ni_startdir != NULL) {
3801 		cache_fpl_aborted(fpl);
3802 		return (false);
3803 	}
3804 	return (true);
3805 }
3806 
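/*
 * Resolve ndp->ni_dirfd to a vnode while staying within SMR protection.
 * Whether the descriptor was opened with O_SEARCH is recorded in
 * fpl->fsearch for later permission handling.
 */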
3807 static int
3808 cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp)
3809 {
3810 	struct nameidata *ndp;
3811 	int error;
3812 	bool fsearch;
3813 
3814 	ndp = fpl->ndp;
3815 	error = fgetvp_lookup_smr(ndp->ni_dirfd, ndp, vpp, &fsearch);
3816 	if (__predict_false(error != 0)) {
3817 		cache_fpl_smr_exit(fpl);
3818 		return (cache_fpl_aborted(fpl));
3819 	}
3820 	fpl->fsearch = fsearch;
3821 	return (0);
3822 }
3823 
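/*
 * Symlinks are not resolved in the fast path; encountering one forces a
 * fallback to the regular lookup.
 */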
3824 static bool
3825 cache_fplookup_vnode_supported(struct vnode *vp)
3826 {
3827 
3828 	return (vp->v_type != VLNK);
3829 }
3830 
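/*
 * Promote a negative entry observed to be hot. Promotion needs locks, so
 * SMR protection is dropped first; if the entry still matches, the lookup
 * is concluded with ENOENT, otherwise it is aborted.
 */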
3831 static int __noinline
3832 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
3833     uint32_t hash)
3834 {
3835 	struct componentname *cnp;
3836 	struct vnode *dvp;
3837 
3838 	cnp = fpl->cnp;
3839 	dvp = fpl->dvp;
3840 
3841 	cache_fpl_smr_exit(fpl);
3842 	if (cache_neg_promote_cond(dvp, cnp, oncp, hash))
3843 		return (cache_fpl_handled(fpl, ENOENT));
3844 	else
3845 		return (cache_fpl_aborted(fpl));
3846 }
3847 
3848 /*
3849  * The target vnode is not supported; prepare for the slow path to take over.
3850  */
3851 static int __noinline
3852 cache_fplookup_partial_setup(struct cache_fpl *fpl)
3853 {
3854 	struct nameidata *ndp;
3855 	struct componentname *cnp;
3856 	enum vgetstate dvs;
3857 	struct vnode *dvp;
3858 	struct pwd *pwd;
3859 	seqc_t dvp_seqc;
3860 
3861 	ndp = fpl->ndp;
3862 	cnp = fpl->cnp;
3863 	pwd = fpl->pwd;
3864 	dvp = fpl->dvp;
3865 	dvp_seqc = fpl->dvp_seqc;
3866 
3867 	if (!pwd_hold_smr(pwd)) {
3868 		cache_fpl_smr_exit(fpl);
3869 		return (cache_fpl_aborted(fpl));
3870 	}
3871 
3872 	/*
3873 	 * Note that seqc is checked before the vnode is locked, so by
3874 	 * the time regular lookup gets to it, it may have moved.
3875 	 *
3876 	 * Ultimately this does not affect correctness; any lookup errors
3877 	 * are userspace racing with itself. It is guaranteed that any
3878 	 * path which ultimately gets found could also have been found
3879 	 * by regular lookup going all the way in the absence of concurrent
3880 	 * modifications.
3881 	 */
3882 	dvs = vget_prep_smr(dvp);
3883 	cache_fpl_smr_exit(fpl);
3884 	if (__predict_false(dvs == VGET_NONE)) {
3885 		pwd_drop(pwd);
3886 		return (cache_fpl_aborted(fpl));
3887 	}
3888 
3889 	vget_finish_ref(dvp, dvs);
3890 	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3891 		vrele(dvp);
3892 		pwd_drop(pwd);
3893 		return (cache_fpl_aborted(fpl));
3894 	}
3895 
3896 	cache_fpl_restore_partial(fpl, &fpl->snd);
3897 
3898 	ndp->ni_startdir = dvp;
3899 	cnp->cn_flags |= MAKEENTRY;
3900 	if (cache_fpl_islastcn(ndp))
3901 		cnp->cn_flags |= ISLASTCN;
3902 	if (cache_fpl_isdotdot(cnp))
3903 		cnp->cn_flags |= ISDOTDOT;
3904 
3905 	return (0);
3906 }
3907 
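/*
 * Finish the lookup by securing the terminal vnode: take a reference and,
 * if LOCKLEAF was requested, the vnode lock (shared or exclusive depending
 * on LOCKSHARED), then re-validate the sequence counter.
 */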
3908 static int
3909 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs)
3910 {
3911 	struct componentname *cnp;
3912 	struct vnode *tvp;
3913 	seqc_t tvp_seqc;
3914 	int error, lkflags;
3915 
3916 	cnp = fpl->cnp;
3917 	tvp = fpl->tvp;
3918 	tvp_seqc = fpl->tvp_seqc;
3919 
3920 	if ((cnp->cn_flags & LOCKLEAF) != 0) {
3921 		lkflags = LK_SHARED;
3922 		if ((cnp->cn_flags & LOCKSHARED) == 0)
3923 			lkflags = LK_EXCLUSIVE;
3924 		error = vget_finish(tvp, lkflags, tvs);
3925 		if (__predict_false(error != 0)) {
3926 			return (cache_fpl_aborted(fpl));
3927 		}
3928 	} else {
3929 		vget_finish_ref(tvp, tvs);
3930 	}
3931 
3932 	if (!vn_seqc_consistent(tvp, tvp_seqc)) {
3933 		if ((cnp->cn_flags & LOCKLEAF) != 0)
3934 			vput(tvp);
3935 		else
3936 			vrele(tvp);
3937 		return (cache_fpl_aborted(fpl));
3938 	}
3939 
3940 	return (cache_fpl_handled(fpl, 0));
3941 }
3942 
3943 /*
3944  * The caller may want to modify the state of the namecache.
3945  */
3946 static int __noinline
3947 cache_fplookup_final_modifying(struct cache_fpl *fpl)
3948 {
3949 	struct nameidata *ndp;
3950 	struct componentname *cnp;
3951 	enum vgetstate dvs;
3952 	struct vnode *dvp, *tvp;
3953 	struct mount *mp;
3954 	seqc_t dvp_seqc;
3955 	int error;
3956 	bool docache;
3957 
3958 	ndp = fpl->ndp;
3959 	cnp = fpl->cnp;
3960 	dvp = fpl->dvp;
3961 	dvp_seqc = fpl->dvp_seqc;
3962 
3963 	MPASS(cache_fpl_islastcn(ndp));
3964 	if ((cnp->cn_flags & LOCKPARENT) == 0)
3965 		MPASS((cnp->cn_flags & WANTPARENT) != 0);
3966 	MPASS((cnp->cn_flags & TRAILINGSLASH) == 0);
3967 	MPASS(cnp->cn_nameiop == CREATE || cnp->cn_nameiop == DELETE ||
3968 	    cnp->cn_nameiop == RENAME);
3969 	MPASS((cnp->cn_flags & MAKEENTRY) == 0);
3970 	MPASS((cnp->cn_flags & ISDOTDOT) == 0);
3971 
3972 	docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
3973 	if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)
3974 		docache = false;
3975 
3976 	mp = atomic_load_ptr(&dvp->v_mount);
3977 	if (__predict_false(mp == NULL)) {
3978 		return (cache_fpl_aborted(fpl));
3979 	}
3980 
3981 	if (__predict_false(mp->mnt_flag & MNT_RDONLY)) {
3982 		cache_fpl_smr_exit(fpl);
3983 		/*
3984 		 * The original code does not check for CREATE, which
3985 		 * might be a bug. For now let the old lookup decide.
3986 		 */
3987 		if (cnp->cn_nameiop == CREATE) {
3988 			return (cache_fpl_aborted(fpl));
3989 		}
3990 		return (cache_fpl_handled(fpl, EROFS));
3991 	}
3992 
3993 	/*
3994 	 * Secure access to dvp; check cache_fplookup_partial_setup for
3995 	 * reasoning.
3996 	 *
3997 	 * XXX At least UFS requires its lookup routine to be called for
3998 	 * the last path component, which leads to some level of complication
3999 	 * and inefficiency:
4000 	 * - the target routine always locks the target vnode, but our caller
4001 	 *   may not need it locked
4002 	 * - some of the VOP machinery asserts that the parent is locked, which
4003 	 *   once more may be not required
4004 	 *
4005 	 * TODO: add a flag for filesystems which don't need this.
4006 	 */
4007 	dvs = vget_prep_smr(dvp);
4008 	cache_fpl_smr_exit(fpl);
4009 	if (__predict_false(dvs == VGET_NONE)) {
4010 		return (cache_fpl_aborted(fpl));
4011 	}
4012 
4013 	vget_finish_ref(dvp, dvs);
4014 	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4015 		vrele(dvp);
4016 		return (cache_fpl_aborted(fpl));
4017 	}
4018 
4019 	error = vn_lock(dvp, LK_EXCLUSIVE);
4020 	if (__predict_false(error != 0)) {
4021 		vrele(dvp);
4022 		return (cache_fpl_aborted(fpl));
4023 	}
4024 
4025 	tvp = NULL;
4026 	cnp->cn_flags |= ISLASTCN;
4027 	if (docache)
4028 		cnp->cn_flags |= MAKEENTRY;
4029 	if (cache_fpl_isdotdot(cnp))
4030 		cnp->cn_flags |= ISDOTDOT;
4031 	cnp->cn_lkflags = LK_EXCLUSIVE;
4032 	error = VOP_LOOKUP(dvp, &tvp, cnp);
4033 	switch (error) {
4034 	case EJUSTRETURN:
4035 	case 0:
4036 		break;
4037 	case ENOTDIR:
4038 	case ENOENT:
4039 		vput(dvp);
4040 		return (cache_fpl_handled(fpl, error));
4041 	default:
4042 		vput(dvp);
4043 		return (cache_fpl_aborted(fpl));
4044 	}
4045 
4046 	fpl->tvp = tvp;
4047 
4048 	if (tvp == NULL) {
4049 		if ((cnp->cn_flags & SAVESTART) != 0) {
4050 			ndp->ni_startdir = dvp;
4051 			vrefact(ndp->ni_startdir);
4052 			cnp->cn_flags |= SAVENAME;
4053 		}
4054 		MPASS(error == EJUSTRETURN);
4055 		if ((cnp->cn_flags & LOCKPARENT) == 0) {
4056 			VOP_UNLOCK(dvp);
4057 		}
4058 		return (cache_fpl_handled(fpl, 0));
4059 	}
4060 
4061 	/*
4062 	 * Check if the target is either a symlink or a mount point.
4063 	 * Since we expect this to be the terminal vnode, it should
4064 	 * almost never be true.
4065 	 */
4066 	if (__predict_false(!cache_fplookup_vnode_supported(tvp) ||
4067 	    cache_fplookup_need_climb_mount(fpl))) {
4068 		vput(dvp);
4069 		vput(tvp);
4070 		return (cache_fpl_aborted(fpl));
4071 	}
4072 
4073 	if ((cnp->cn_flags & LOCKLEAF) == 0) {
4074 		VOP_UNLOCK(tvp);
4075 	}
4076 
4077 	if ((cnp->cn_flags & LOCKPARENT) == 0) {
4078 		VOP_UNLOCK(dvp);
4079 	}
4080 
4081 	if ((cnp->cn_flags & SAVESTART) != 0) {
4082 		ndp->ni_startdir = dvp;
4083 		vrefact(ndp->ni_startdir);
4084 		cnp->cn_flags |= SAVENAME;
4085 	}
4086 
4087 	return (cache_fpl_handled(fpl, 0));
4088 }
4089 
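/*
 * Creation, deletion and rename are only handled in place for the last
 * path component; intermediate components punt to the regular lookup.
 */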
4090 static int __noinline
4091 cache_fplookup_modifying(struct cache_fpl *fpl)
4092 {
4093 	struct nameidata *ndp;
4094 
4095 	ndp = fpl->ndp;
4096 
4097 	if (!cache_fpl_islastcn(ndp)) {
4098 		return (cache_fpl_partial(fpl));
4099 	}
4100 	return  (cache_fplookup_final_modifying(fpl));
4101 }
4102 
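/*
 * Finish a lookup which also wants the parent vnode (LOCKPARENT or
 * WANTPARENT): secure the directory first, re-validate its sequence
 * counter and only then hand the child off to cache_fplookup_final_child.
 */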
4103 static int __noinline
4104 cache_fplookup_final_withparent(struct cache_fpl *fpl)
4105 {
4106 	struct componentname *cnp;
4107 	enum vgetstate dvs, tvs;
4108 	struct vnode *dvp, *tvp;
4109 	seqc_t dvp_seqc;
4110 	int error;
4111 
4112 	cnp = fpl->cnp;
4113 	dvp = fpl->dvp;
4114 	dvp_seqc = fpl->dvp_seqc;
4115 	tvp = fpl->tvp;
4116 
4117 	MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0);
4118 
4119 	/*
4120 	 * For simplicity this is less efficient than it could be.
4121 	 */
4122 	dvs = vget_prep_smr(dvp);
4123 	if (__predict_false(dvs == VGET_NONE)) {
4124 		return (cache_fpl_aborted(fpl));
4125 	}
4126 	tvs = vget_prep_smr(tvp);
4127 	if (__predict_false(tvs == VGET_NONE)) {
4128 		cache_fpl_smr_exit(fpl);
4129 		vget_abort(dvp, dvs);
4130 		return (cache_fpl_aborted(fpl));
4131 	}
4132 
4133 	cache_fpl_smr_exit(fpl);
4134 
4135 	if ((cnp->cn_flags & LOCKPARENT) != 0) {
4136 		error = vget_finish(dvp, LK_EXCLUSIVE, dvs);
4137 		if (__predict_false(error != 0)) {
4138 			vget_abort(tvp, tvs);
4139 			return (cache_fpl_aborted(fpl));
4140 		}
4141 	} else {
4142 		vget_finish_ref(dvp, dvs);
4143 	}
4144 
4145 	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4146 		vget_abort(tvp, tvs);
4147 		if ((cnp->cn_flags & LOCKPARENT) != 0)
4148 			vput(dvp);
4149 		else
4150 			vrele(dvp);
4151 		return (cache_fpl_aborted(fpl));
4152 	}
4153 
4154 	error = cache_fplookup_final_child(fpl, tvs);
4155 	if (__predict_false(error != 0)) {
4156 		MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED);
4157 		if ((cnp->cn_flags & LOCKPARENT) != 0)
4158 			vput(dvp);
4159 		else
4160 			vrele(dvp);
4161 		return (error);
4162 	}
4163 
4164 	MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED);
4165 	return (0);
4166 }
4167 
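/*
 * The last component has been resolved. Plain LOOKUP requests with no
 * interest in the parent are finished here; modifying operations and
 * parent-wanting callers are diverted to dedicated handlers.
 */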
4168 static int
4169 cache_fplookup_final(struct cache_fpl *fpl)
4170 {
4171 	struct componentname *cnp;
4172 	enum vgetstate tvs;
4173 	struct vnode *dvp, *tvp;
4174 	seqc_t dvp_seqc;
4175 
4176 	cnp = fpl->cnp;
4177 	dvp = fpl->dvp;
4178 	dvp_seqc = fpl->dvp_seqc;
4179 	tvp = fpl->tvp;
4180 
4181 	if (cnp->cn_nameiop != LOOKUP) {
4182 		return (cache_fplookup_final_modifying(fpl));
4183 	}
4184 
4185 	if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0)
4186 		return (cache_fplookup_final_withparent(fpl));
4187 
4188 	tvs = vget_prep_smr(tvp);
4189 	if (__predict_false(tvs == VGET_NONE)) {
4190 		return (cache_fpl_partial(fpl));
4191 	}
4192 
4193 	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4194 		cache_fpl_smr_exit(fpl);
4195 		vget_abort(tvp, tvs);
4196 		return (cache_fpl_aborted(fpl));
4197 	}
4198 
4199 	cache_fpl_smr_exit(fpl);
4200 	return (cache_fplookup_final_child(fpl, tvs));
4201 }
4202 
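/*
 * There is no namecache entry for the component. If this is the last
 * component, the miss is serviced in place: the directory is secured and
 * locked and VOP_LOOKUP is issued directly (see also
 * cache_fplookup_final_modifying for the non-LOOKUP case). Intermediate
 * components fall back to the regular lookup.
 */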
4203 static int __noinline
4204 cache_fplookup_noentry(struct cache_fpl *fpl)
4205 {
4206 	struct nameidata *ndp;
4207 	struct componentname *cnp;
4208 	enum vgetstate dvs;
4209 	struct vnode *dvp, *tvp;
4210 	seqc_t dvp_seqc;
4211 	int error;
4212 	bool docache;
4213 
4214 	ndp = fpl->ndp;
4215 	cnp = fpl->cnp;
4216 	dvp = fpl->dvp;
4217 	dvp_seqc = fpl->dvp_seqc;
4218 
4219 	MPASS((cnp->cn_flags & MAKEENTRY) == 0);
4220 	MPASS((cnp->cn_flags & ISDOTDOT) == 0);
4221 	MPASS(!cache_fpl_isdotdot(cnp));
4222 
4223 	if (cnp->cn_nameiop != LOOKUP) {
4224 		return (cache_fplookup_modifying(fpl));
4225 	}
4226 
4227 	MPASS((cnp->cn_flags & SAVESTART) == 0);
4228 
4229 	/*
4230 	 * Only try to fill in the component if it is the last one;
4231 	 * otherwise not only may there be several to handle, but the
4232 	 * walk may be complicated.
4233 	 */
4234 	if (!cache_fpl_islastcn(ndp)) {
4235 		return (cache_fpl_partial(fpl));
4236 	}
4237 
4238 	/*
4239 	 * Secure access to dvp; check cache_fplookup_partial_setup for
4240 	 * reasoning.
4241 	 */
4242 	dvs = vget_prep_smr(dvp);
4243 	cache_fpl_smr_exit(fpl);
4244 	if (__predict_false(dvs == VGET_NONE)) {
4245 		return (cache_fpl_aborted(fpl));
4246 	}
4247 
4248 	vget_finish_ref(dvp, dvs);
4249 	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4250 		vrele(dvp);
4251 		return (cache_fpl_aborted(fpl));
4252 	}
4253 
4254 	error = vn_lock(dvp, LK_SHARED);
4255 	if (__predict_false(error != 0)) {
4256 		vrele(dvp);
4257 		return (cache_fpl_aborted(fpl));
4258 	}
4259 
4260 	tvp = NULL;
4261 	/*
4262 	 * TODO: provide variants which don't require locking either vnode.
4263 	 */
4264 	cnp->cn_flags |= ISLASTCN;
4265 	docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
4266 	if (docache)
4267 		cnp->cn_flags |= MAKEENTRY;
4268 	cnp->cn_lkflags = LK_SHARED;
4269 	if ((cnp->cn_flags & LOCKSHARED) == 0) {
4270 		cnp->cn_lkflags = LK_EXCLUSIVE;
4271 	}
4272 	error = VOP_LOOKUP(dvp, &tvp, cnp);
4273 	switch (error) {
4274 	case EJUSTRETURN:
4275 	case 0:
4276 		break;
4277 	case ENOTDIR:
4278 	case ENOENT:
4279 		vput(dvp);
4280 		return (cache_fpl_handled(fpl, error));
4281 	default:
4282 		vput(dvp);
4283 		return (cache_fpl_aborted(fpl));
4284 	}
4285 
4286 	fpl->tvp = tvp;
4287 
4288 	if (tvp == NULL) {
4289 		MPASS(error == EJUSTRETURN);
4290 		if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) {
4291 			vput(dvp);
4292 		} else if ((cnp->cn_flags & LOCKPARENT) == 0) {
4293 			VOP_UNLOCK(dvp);
4294 		}
4295 		return (cache_fpl_handled(fpl, 0));
4296 	}
4297 
4298 	if (__predict_false(!cache_fplookup_vnode_supported(tvp) ||
4299 	    cache_fplookup_need_climb_mount(fpl))) {
4300 		vput(dvp);
4301 		vput(tvp);
4302 		return (cache_fpl_aborted(fpl));
4303 	}
4304 
4305 	if ((cnp->cn_flags & LOCKLEAF) == 0) {
4306 		VOP_UNLOCK(tvp);
4307 	}
4308 
4309 	if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) {
4310 		vput(dvp);
4311 	} else if ((cnp->cn_flags & LOCKPARENT) == 0) {
4312 		VOP_UNLOCK(dvp);
4313 	}
4314 	return (cache_fpl_handled(fpl, 0));
4315 }
4316 
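/*
 * "." resolves to the directory itself; only its sequence counter needs to
 * be re-read to make sure it is not being modified.
 */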
4317 static int __noinline
4318 cache_fplookup_dot(struct cache_fpl *fpl)
4319 {
4320 	struct vnode *dvp;
4321 
4322 	dvp = fpl->dvp;
4323 
4324 	fpl->tvp = dvp;
4325 	fpl->tvp_seqc = vn_seqc_read_any(dvp);
4326 	if (seqc_in_modify(fpl->tvp_seqc)) {
4327 		return (cache_fpl_aborted(fpl));
4328 	}
4329 
4330 	counter_u64_add(dothits, 1);
4331 	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", dvp);
4332 
4333 	return (0);
4334 }
4335 
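/*
 * ".." is resolved from the directory's v_cache_dd entry. When the current
 * directory is the process root, the jail root or the top directory, the
 * lookup stays put; going below a mount root is not handled here and
 * aborts instead.
 */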
4336 static int __noinline
4337 cache_fplookup_dotdot(struct cache_fpl *fpl)
4338 {
4339 	struct nameidata *ndp;
4340 	struct componentname *cnp;
4341 	struct namecache *ncp;
4342 	struct vnode *dvp;
4343 	struct prison *pr;
4344 	u_char nc_flag;
4345 
4346 	ndp = fpl->ndp;
4347 	cnp = fpl->cnp;
4348 	dvp = fpl->dvp;
4349 
4350 	/*
4351 	 * XXX this is racy in the same way regular lookup is
4352 	 */
4353 	for (pr = cnp->cn_cred->cr_prison; pr != NULL;
4354 	    pr = pr->pr_parent)
4355 		if (dvp == pr->pr_root)
4356 			break;
4357 
4358 	if (dvp == ndp->ni_rootdir ||
4359 	    dvp == ndp->ni_topdir ||
4360 	    dvp == rootvnode ||
4361 	    pr != NULL) {
4362 		fpl->tvp = dvp;
4363 		fpl->tvp_seqc = vn_seqc_read_any(dvp);
4364 		if (seqc_in_modify(fpl->tvp_seqc)) {
4365 			return (cache_fpl_aborted(fpl));
4366 		}
4367 		return (0);
4368 	}
4369 
4370 	if ((dvp->v_vflag & VV_ROOT) != 0) {
4371 		/*
4372 		 * TODO
4373 		 * The opposite of climb mount is needed here.
4374 		 */
4375 		return (cache_fpl_aborted(fpl));
4376 	}
4377 
4378 	ncp = atomic_load_ptr(&dvp->v_cache_dd);
4379 	if (ncp == NULL) {
4380 		return (cache_fpl_aborted(fpl));
4381 	}
4382 
4383 	nc_flag = atomic_load_char(&ncp->nc_flag);
4384 	if ((nc_flag & NCF_ISDOTDOT) != 0) {
4385 		if ((nc_flag & NCF_NEGATIVE) != 0)
4386 			return (cache_fpl_aborted(fpl));
4387 		fpl->tvp = ncp->nc_vp;
4388 	} else {
4389 		fpl->tvp = ncp->nc_dvp;
4390 	}
4391 
4392 	if (!cache_ncp_canuse(ncp)) {
4393 		return (cache_fpl_aborted(fpl));
4394 	}
4395 
4396 	fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp);
4397 	if (seqc_in_modify(fpl->tvp_seqc)) {
4398 		return (cache_fpl_partial(fpl));
4399 	}
4400 
4401 	counter_u64_add(dotdothits, 1);
4402 	return (0);
4403 }
4404 
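/*
 * The component resolved to a negative entry. If the entry can be used the
 * lookup is concluded with ENOENT, promoting the entry to the hot list if
 * it saw enough hits. Whiteouts, entries which became invalid and
 * creation-type operations fall back to the regular lookup.
 */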
4405 static int __noinline
4406 cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash)
4407 {
4408 	u_char nc_flag;
4409 	bool neg_promote;
4410 
4411 	nc_flag = atomic_load_char(&ncp->nc_flag);
4412 	MPASS((nc_flag & NCF_NEGATIVE) != 0);
4413 	/*
4414 	 * If the caller wants to create an entry, we need to replace this one.
4415 	 */
4416 	if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) {
4417 		/*
4418 		 * TODO
4419 		 * This should call something similar to
4420 		 * cache_fplookup_final_modifying.
4421 		 */
4422 		return (cache_fpl_partial(fpl));
4423 	}
4424 	neg_promote = cache_neg_hit_prep(ncp);
4425 	if (!cache_ncp_canuse(ncp)) {
4426 		cache_neg_hit_abort(ncp);
4427 		return (cache_fpl_partial(fpl));
4428 	}
4429 	if (__predict_false((nc_flag & NCF_WHITE) != 0)) {
4430 		cache_neg_hit_abort(ncp);
4431 		return (cache_fpl_partial(fpl));
4432 	}
4433 	if (neg_promote) {
4434 		return (cache_fplookup_negative_promote(fpl, ncp, hash));
4435 	}
4436 	cache_neg_hit_finish(ncp);
4437 	cache_fpl_smr_exit(fpl);
4438 	return (cache_fpl_handled(fpl, ENOENT));
4439 }
4440 
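/*
 * Resolve the current component: hash it together with the directory vnode
 * and scan the relevant chain under SMR. "." is special-cased, negative
 * entries and misses are dispatched to their handlers, and a positive hit
 * only stands if both the entry and the target vnode's sequence counter
 * check out.
 */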
4441 static int
4442 cache_fplookup_next(struct cache_fpl *fpl)
4443 {
4444 	struct componentname *cnp;
4445 	struct namecache *ncp;
4446 	struct vnode *dvp, *tvp;
4447 	u_char nc_flag;
4448 	uint32_t hash;
4449 
4450 	cnp = fpl->cnp;
4451 	dvp = fpl->dvp;
4452 
4453 	if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) {
4454 		return (cache_fplookup_dot(fpl));
4455 	}
4456 
4457 	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
4458 
4459 	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
4460 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
4461 		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
4462 			break;
4463 	}
4464 
4465 	if (__predict_false(ncp == NULL)) {
4466 		return (cache_fplookup_noentry(fpl));
4467 	}
4468 
4469 	tvp = atomic_load_ptr(&ncp->nc_vp);
4470 	nc_flag = atomic_load_char(&ncp->nc_flag);
4471 	if ((nc_flag & NCF_NEGATIVE) != 0) {
4472 		return (cache_fplookup_neg(fpl, ncp, hash));
4473 	}
4474 
4475 	if (!cache_ncp_canuse(ncp)) {
4476 		return (cache_fpl_partial(fpl));
4477 	}
4478 
4479 	fpl->tvp = tvp;
4480 	fpl->tvp_seqc = vn_seqc_read_any(tvp);
4481 	if (seqc_in_modify(fpl->tvp_seqc)) {
4482 		return (cache_fpl_partial(fpl));
4483 	}
4484 
4485 	if (!cache_fplookup_vnode_supported(tvp)) {
4486 		return (cache_fpl_partial(fpl));
4487 	}
4488 
4489 	counter_u64_add(numposhits, 1);
4490 	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
4491 	return (0);
4492 }
4493 
4494 static bool
4495 cache_fplookup_mp_supported(struct mount *mp)
4496 {
4497 
4498 	if (mp == NULL)
4499 		return (false);
4500 	if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
4501 		return (false);
4502 	return (true);
4503 }
4504 
4505 /*
4506  * Walk up the mount stack (if any).
4507  *
4508  * Correctness is provided in the following ways:
4509  * - all vnodes are protected from freeing with SMR
4510  * - struct mount objects are type stable, making them always safe to access
4511  * - stability of the particular mount is provided by busying it
4512  * - relationship between the vnode which is mounted on and the mount is
4513  *   verified with the vnode sequence counter after busying
4514  * - association between root vnode of the mount and the mount is protected
4515  *   by busy
4516  *
4517  * From that point on we can read the sequence counter of the root vnode
4518  * and get the next mount on the stack (if any) using the same protection.
4519  *
4520  * By the end of a successful walk we are guaranteed the reached state was
4521  * indeed present at least at some point, matching the regular lookup.
4522  */
4523 static int __noinline
4524 cache_fplookup_climb_mount(struct cache_fpl *fpl)
4525 {
4526 	struct mount *mp, *prev_mp;
4527 	struct mount_pcpu *mpcpu, *prev_mpcpu;
4528 	struct vnode *vp;
4529 	seqc_t vp_seqc;
4530 
4531 	vp = fpl->tvp;
4532 	vp_seqc = fpl->tvp_seqc;
4533 
4534 	VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp);
4535 	mp = atomic_load_ptr(&vp->v_mountedhere);
4536 	if (mp == NULL)
4537 		return (0);
4538 
4539 	prev_mp = NULL;
4540 	for (;;) {
4541 		if (!vfs_op_thread_enter_crit(mp, mpcpu)) {
4542 			if (prev_mp != NULL)
4543 				vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
4544 			return (cache_fpl_partial(fpl));
4545 		}
4546 		if (prev_mp != NULL)
4547 			vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
4548 		if (!vn_seqc_consistent(vp, vp_seqc)) {
4549 			vfs_op_thread_exit_crit(mp, mpcpu);
4550 			return (cache_fpl_partial(fpl));
4551 		}
4552 		if (!cache_fplookup_mp_supported(mp)) {
4553 			vfs_op_thread_exit_crit(mp, mpcpu);
4554 			return (cache_fpl_partial(fpl));
4555 		}
4556 		vp = atomic_load_ptr(&mp->mnt_rootvnode);
4557 		if (vp == NULL || VN_IS_DOOMED(vp)) {
4558 			vfs_op_thread_exit_crit(mp, mpcpu);
4559 			return (cache_fpl_partial(fpl));
4560 		}
4561 		vp_seqc = vn_seqc_read_any(vp);
4562 		if (seqc_in_modify(vp_seqc)) {
4563 			vfs_op_thread_exit_crit(mp, mpcpu);
4564 			return (cache_fpl_partial(fpl));
4565 		}
4566 		prev_mp = mp;
4567 		prev_mpcpu = mpcpu;
4568 		mp = atomic_load_ptr(&vp->v_mountedhere);
4569 		if (mp == NULL)
4570 			break;
4571 	}
4572 
4573 	vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
4574 	fpl->tvp = vp;
4575 	fpl->tvp_seqc = vp_seqc;
4576 	return (0);
4577 }
4578 
4579 static bool
4580 cache_fplookup_need_climb_mount(struct cache_fpl *fpl)
4581 {
4582 	struct mount *mp;
4583 	struct vnode *vp;
4584 
4585 	vp = fpl->tvp;
4586 
4587 	/*
4588 	 * Hack: while this is a union, the pointer tends to be NULL, so
4589 	 * checking it first saves a branch.
4590 	 */
4591 	mp = atomic_load_ptr(&vp->v_mountedhere);
4592 	if (mp == NULL)
4593 		return (false);
4594 	if (vp->v_type == VDIR)
4595 		return (true);
4596 	return (false);
4597 }
4598 
4599 /*
4600  * Parse the path.
4601  *
4602  * The code was originally copy-pasted from regular lookup and despite
4603  * clean ups leaves performance on the table. Any modifications here
4604  * cleanups still leaves performance on the table. Any modifications here
4605  * must take into account that in case of fallback the resulting
4606  */
4607 static int
4608 cache_fplookup_parse(struct cache_fpl *fpl)
4609 {
4610 	struct nameidata *ndp;
4611 	struct componentname *cnp;
4612 	char *cp;
4613 
4614 	ndp = fpl->ndp;
4615 	cnp = fpl->cnp;
4616 
4617 	/*
4618 	 * Search a new directory.
4619 	 *
4620 	 * The last component of the filename is left accessible via
4621 	 * cnp->cn_nameptr for callers that need the name. Callers needing
4622 	 * the name set the SAVENAME flag. When done, they assume
4623 	 * responsibility for freeing the pathname buffer.
4624 	 */
4625 	for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
4626 		continue;
4627 	cnp->cn_namelen = cp - cnp->cn_nameptr;
4628 	if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
4629 		cache_fpl_smr_exit(fpl);
4630 		return (cache_fpl_handled(fpl, ENAMETOOLONG));
4631 	}
4632 	ndp->ni_pathlen -= cnp->cn_namelen;
4633 	KASSERT(ndp->ni_pathlen <= PATH_MAX,
4634 	    ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
4635 	ndp->ni_next = cp;
4636 
4637 	/*
4638 	 * Replace multiple slashes by a single slash and trailing slashes
4639 	 * by a null.  This must be done before VOP_LOOKUP() because some
4640 	 * fs's don't know about trailing slashes.  Remember if there were
4641 	 * trailing slashes to handle symlinks, existing non-directories
4642 	 * and non-existing files that won't be directories specially later.
4643 	 */
4644 	while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
4645 		cp++;
4646 		ndp->ni_pathlen--;
4647 		if (*cp == '\0') {
4648 			/*
4649 			 * TODO
4650 			 * Regular lookup performs the following:
4651 			 * *ndp->ni_next = '\0';
4652 			 * cnp->cn_flags |= TRAILINGSLASH;
4653 			 *
4654 			 * Which is problematic since it modifies data read
4655 			 * from userspace. Then if fast path lookup was to
4656 			 * abort we would have to either restore it or convey
4657 			 * the flag. Since this is a corner case just ignore
4658 			 * it for simplicity.
4659 			 */
4660 			return (cache_fpl_partial(fpl));
4661 		}
4662 	}
4663 	ndp->ni_next = cp;
4664 
4665 	/*
4666 	 * Check for degenerate name (e.g. / or "")
4667 	 * which is a way of talking about a directory,
4668 	 * e.g. "/." or ".".
4669 	 *
4670 	 * TODO
4671 	 * Another corner case handled by the regular lookup
4672 	 */
4673 	if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
4674 		return (cache_fpl_partial(fpl));
4675 	}
4676 	return (0);
4677 }
4678 
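/*
 * Advance to the next component by skipping the slashes which terminated
 * the previous one, adjusting ni_pathlen as we go.
 */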
4679 static void
4680 cache_fplookup_parse_advance(struct cache_fpl *fpl)
4681 {
4682 	struct nameidata *ndp;
4683 	struct componentname *cnp;
4684 
4685 	ndp = fpl->ndp;
4686 	cnp = fpl->cnp;
4687 
4688 	cnp->cn_nameptr = ndp->ni_next;
4689 	while (*cnp->cn_nameptr == '/') {
4690 		cnp->cn_nameptr++;
4691 		ndp->ni_pathlen--;
4692 	}
4693 }
4694 
4695 /*
4696  * See the API contract for VOP_FPLOOKUP_VEXEC.
4697  */
4698 static int __noinline
4699 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
4700 {
4701 	struct vnode *dvp;
4702 	seqc_t dvp_seqc;
4703 
4704 	dvp = fpl->dvp;
4705 	dvp_seqc = fpl->dvp_seqc;
4706 
4707 	/*
4708 	 * Hack: they may be looking up foo/bar, where foo is a
4709 	 * regular file. In such a case we need to return ENOTDIR,
4710 	 * but we may happen to get here with a different error.
4711 	 */
4712 	if (dvp->v_type != VDIR) {
4713 		/*
4714 		 * The check here is predominantly to catch
4715 		 * EOPNOTSUPP from dead_vnodeops. If the vnode
4716 		 * gets doomed past this point it is going to
4717 		 * fail seqc verification.
4718 		 */
4719 		if (VN_IS_DOOMED(dvp)) {
4720 			return (cache_fpl_aborted(fpl));
4721 		}
4722 		error = ENOTDIR;
4723 	}
4724 
4725 	/*
4726 	 * Hack: handle O_SEARCH.
4727 	 *
4728 	 * Open Group Base Specifications Issue 7, 2018 edition states:
4729 	 * If the access mode of the open file description associated with the
4730 	 * file descriptor is not O_SEARCH, the function shall check whether
4731 	 * directory searches are permitted using the current permissions of
4732 	 * the directory underlying the file descriptor. If the access mode is
4733 	 * O_SEARCH, the function shall not perform the check.
4734 	 *
4735 	 * Regular lookup tests for the NOEXECCHECK flag for every path
4736 	 * component to decide whether to do the permission check. However,
4737 	 * since most lookups never have the flag (and when they do it is only
4738 	 * present for the first path component), lockless lookup only acts on
4739 	 * it if there is a permission problem. Here the flag is represented
4740 	 * with a boolean so that we don't have to clear it on the way out.
4741 	 *
4742 	 * For simplicity this always aborts.
4743 	 * TODO: check if this is the first lookup and ignore the permission
4744 	 * problem. Note the flag has to survive fallback (if it happens to be
4745 	 * performed).
4746 	 */
4747 	if (fpl->fsearch) {
4748 		return (cache_fpl_aborted(fpl));
4749 	}
4750 
4751 	switch (error) {
4752 	case EAGAIN:
4753 		if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4754 			error = cache_fpl_aborted(fpl);
4755 		} else {
4756 			cache_fpl_partial(fpl);
4757 		}
4758 		break;
4759 	default:
4760 		if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4761 			error = cache_fpl_aborted(fpl);
4762 		} else {
4763 			cache_fpl_smr_exit(fpl);
4764 			cache_fpl_handled(fpl, error);
4765 		}
4766 		break;
4767 	}
4768 	return (error);
4769 }
4770 
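/*
 * The main lookup loop. It runs under SMR protection until a terminal
 * state is reached. Roughly:
 *
 *	for (;;) {
 *		parse the next component;
 *		VOP_FPLOOKUP_VEXEC(dvp); // may we search this directory?
 *		resolve the component ("..", "." or a hash lookup);
 *		climb the mount stack if the result is covered;
 *		if (last component)
 *			return (cache_fplookup_final(...));
 *		dvp = tvp; dvp_seqc = tvp_seqc;
 *	}
 *
 * Any of the helpers may terminate the attempt by marking it aborted,
 * partial or handled, in which case the loop breaks and the status is
 * acted upon in the switch below.
 */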
4771 static int
4772 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
4773 {
4774 	struct nameidata *ndp;
4775 	struct componentname *cnp;
4776 	struct mount *mp;
4777 	int error;
4778 
4779 	error = CACHE_FPL_FAILED;
4780 	ndp = fpl->ndp;
4781 	cnp = fpl->cnp;
4782 
4783 	cache_fpl_checkpoint(fpl, &fpl->snd);
4784 
4785 	fpl->dvp = dvp;
4786 	fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
4787 	if (seqc_in_modify(fpl->dvp_seqc)) {
4788 		cache_fpl_aborted(fpl);
4789 		goto out;
4790 	}
4791 	mp = atomic_load_ptr(&fpl->dvp->v_mount);
4792 	if (!cache_fplookup_mp_supported(mp)) {
4793 		cache_fpl_aborted(fpl);
4794 		goto out;
4795 	}
4796 
4797 	VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
4798 
4799 	for (;;) {
4800 		error = cache_fplookup_parse(fpl);
4801 		if (__predict_false(error != 0)) {
4802 			break;
4803 		}
4804 
4805 		VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
4806 
4807 		error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
4808 		if (__predict_false(error != 0)) {
4809 			error = cache_fplookup_failed_vexec(fpl, error);
4810 			break;
4811 		}
4812 
4813 		if (__predict_false(cache_fpl_isdotdot(cnp))) {
4814 			error = cache_fplookup_dotdot(fpl);
4815 			if (__predict_false(cache_fpl_terminated(fpl))) {
4816 				break;
4817 			}
4818 		} else {
4819 			error = cache_fplookup_next(fpl);
4820 			if (__predict_false(cache_fpl_terminated(fpl))) {
4821 				break;
4822 			}
4823 
4824 			VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
4825 
4826 			if (cache_fplookup_need_climb_mount(fpl)) {
4827 				error = cache_fplookup_climb_mount(fpl);
4828 				if (__predict_false(error != 0)) {
4829 					break;
4830 				}
4831 			}
4832 		}
4833 
4834 		VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
4835 
4836 		if (cache_fpl_islastcn(ndp)) {
4837 			error = cache_fplookup_final(fpl);
4838 			break;
4839 		}
4840 
4841 		if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
4842 			error = cache_fpl_aborted(fpl);
4843 			break;
4844 		}
4845 
4846 		fpl->dvp = fpl->tvp;
4847 		fpl->dvp_seqc = fpl->tvp_seqc;
4848 
4849 		cache_fplookup_parse_advance(fpl);
4850 		cache_fpl_checkpoint(fpl, &fpl->snd);
4851 	}
4852 out:
4853 	switch (fpl->status) {
4854 	case CACHE_FPL_STATUS_UNSET:
4855 		__assert_unreachable();
4856 		break;
4857 	case CACHE_FPL_STATUS_PARTIAL:
4858 		cache_fpl_smr_assert_entered(fpl);
4859 		return (cache_fplookup_partial_setup(fpl));
4860 	case CACHE_FPL_STATUS_ABORTED:
4861 		if (fpl->in_smr)
4862 			cache_fpl_smr_exit(fpl);
4863 		return (CACHE_FPL_FAILED);
4864 	case CACHE_FPL_STATUS_HANDLED:
4865 		MPASS(error != CACHE_FPL_FAILED);
4866 		cache_fpl_smr_assert_not_entered(fpl);
4867 		/*
4868 		 * A common error is ENOENT.
4869 		 */
4870 		if (error != 0) {
4871 			ndp->ni_dvp = NULL;
4872 			ndp->ni_vp = NULL;
4873 			cache_fpl_cleanup_cnp(cnp);
4874 			return (error);
4875 		}
4876 		ndp->ni_dvp = fpl->dvp;
4877 		ndp->ni_vp = fpl->tvp;
4878 		if (cnp->cn_flags & SAVENAME)
4879 			cnp->cn_flags |= HASBUF;
4880 		else
4881 			cache_fpl_cleanup_cnp(cnp);
4882 		return (error);
4883 	}
4884 	__assert_unreachable();
4885 }
4886 
4887 /*
4888  * Fast path lookup protected with SMR and sequence counters.
4889  *
4890  * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
4891  *
4892  * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
4893  * outlined below.
4894  *
4895  * Traditional vnode lookup conceptually looks like this:
4896  *
4897  * vn_lock(current);
4898  * for (;;) {
4899  *	next = find();
4900  *	vn_lock(next);
4901  *	vn_unlock(current);
4902  *	current = next;
4903  *	if (last)
4904  *	    break;
4905  * }
4906  * return (current);
4907  *
4908  * Each jump to the next vnode is safe memory-wise and atomic with respect to
4909  * any modifications thanks to holding respective locks.
4910  *
4911  * The same guarantee can be provided with a combination of safe memory
4912  * reclamation and sequence counters instead. If all operations which affect
4913  * the relationship between the current vnode and the one we are looking for
4914  * also modify the counter, we can verify whether all the conditions held as
4915  * we made the jump. This includes things like permissions, mount points etc.
4916  * Counter modification is provided by enclosing relevant places in
4917  * vn_seqc_write_begin()/end() calls.
4918  *
4919  * Thus this translates to:
4920  *
4921  * vfs_smr_enter();
4922  * dvp_seqc = seqc_read_any(dvp);
4923  * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
4924  *     abort();
4925  * for (;;) {
4926  * 	tvp = find();
4927  * 	tvp_seqc = seqc_read_any(tvp);
4928  * 	if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
4929  * 	    abort();
4930  * 	if (!seqc_consistent(dvp, dvp_seqc)) // someone is altering the vnode
4931  * 	    abort();
4932  * 	dvp = tvp; // we know nothing of importance has changed
4933  * 	dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
4934  * 	if (last)
4935  * 	    break;
4936  * }
4937  * vget(); // secure the vnode
4938  * if (!seqc_consistent(tvp, tvp_seqc)) // final check
4939  * 	    abort();
4940  * // at this point we know nothing has changed for any parent<->child pair
4941  * // as they were crossed during the lookup, meaning we matched the guarantee
4942  * // of the locked variant
4943  * return (tvp);
4944  *
4945  * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
4946  * - they are called while within vfs_smr protection which they must never exit
4947  * - EAGAIN can be returned to denote that checking could not be performed; it is
4948  *   always valid to return it
4949  * - if the sequence counter has not changed, the result must be valid
4950  * - if the sequence counter has changed, both false positives and false negatives
4951  *   are permitted (since the result will be rejected later)
4952  * - for simple cases of unix permission checks vaccess_vexec_smr can be used
4953  *
4954  * Caveats to watch out for:
4955  * - vnodes are passed unlocked and unreferenced with nothing stopping
4956  *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
4957  *   to use atomic_load_ptr to fetch it.
4958  * - the aforementioned object can also get freed, meaning absent other means it
4959  *   should be protected with vfs_smr
4960  * - either safely checking permissions as they are modified or guaranteeing
4961  *   their stability is left to the routine
4962  */
4963 int
4964 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
4965     struct pwd **pwdp)
4966 {
4967 	struct cache_fpl fpl;
4968 	struct pwd *pwd;
4969 	struct vnode *dvp;
4970 	struct componentname *cnp;
4971 	struct nameidata_saved orig;
4972 	int error;
4973 
4974 	MPASS(ndp->ni_lcf == 0);
4975 
4976 	fpl.status = CACHE_FPL_STATUS_UNSET;
4977 	fpl.ndp = ndp;
4978 	fpl.cnp = &ndp->ni_cnd;
4979 	MPASS(curthread == fpl.cnp->cn_thread);
4980 	KASSERT ((fpl.cnp->cn_flags & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
4981 	    ("%s: internal flags found in cn_flags %" PRIx64, __func__,
4982 	    fpl.cnp->cn_flags));
4983 
4984 	if ((fpl.cnp->cn_flags & SAVESTART) != 0)
4985 		MPASS(fpl.cnp->cn_nameiop != LOOKUP);
4986 
4987 	if (!cache_can_fplookup(&fpl)) {
4988 		SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
4989 		*status = fpl.status;
4990 		return (EOPNOTSUPP);
4991 	}
4992 
4993 	cache_fpl_checkpoint(&fpl, &orig);
4994 
4995 	cache_fpl_smr_enter_initial(&fpl);
4996 	fpl.fsearch = false;
4997 	pwd = pwd_get_smr();
4998 	fpl.pwd = pwd;
4999 	ndp->ni_rootdir = pwd->pwd_rdir;
5000 	ndp->ni_topdir = pwd->pwd_jdir;
5001 
5002 	cnp = fpl.cnp;
5003 	cnp->cn_nameptr = cnp->cn_pnbuf;
5004 	if (cnp->cn_pnbuf[0] == '/') {
5005 		cache_fpl_handle_root(ndp, &dvp);
5006 		ndp->ni_resflags |= NIRES_ABS;
5007 	} else {
5008 		if (ndp->ni_dirfd == AT_FDCWD) {
5009 			dvp = pwd->pwd_cdir;
5010 		} else {
5011 			error = cache_fplookup_dirfd(&fpl, &dvp);
5012 			if (__predict_false(error != 0)) {
5013 				goto out;
5014 			}
5015 		}
5016 	}
5017 
5018 	SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
5019 
5020 	error = cache_fplookup_impl(dvp, &fpl);
5021 out:
5022 	cache_fpl_smr_assert_not_entered(&fpl);
5023 	SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
5024 
5025 	*status = fpl.status;
5026 	switch (fpl.status) {
5027 	case CACHE_FPL_STATUS_UNSET:
5028 		__assert_unreachable();
5029 		break;
5030 	case CACHE_FPL_STATUS_HANDLED:
5031 		SDT_PROBE3(vfs, namei, lookup, return, error,
5032 		    (error == 0 ? ndp->ni_vp : NULL), true);
5033 		break;
5034 	case CACHE_FPL_STATUS_PARTIAL:
5035 		*pwdp = fpl.pwd;
5036 		/*
5037 		 * Status restored by cache_fplookup_partial_setup.
5038 		 */
5039 		break;
5040 	case CACHE_FPL_STATUS_ABORTED:
5041 		cache_fpl_restore_abort(&fpl, &orig);
5042 		break;
5043 	}
5044 	return (error);
5045 }
5046