xref: /freebsd/sys/kern/vfs_cache.c (revision 9119bafbaf52e5d86e5879e54b38c5aba3efa2e9)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1989, 1993, 1995
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * Poul-Henning Kamp of the FreeBSD Project.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include "opt_ddb.h"
41 #include "opt_ktrace.h"
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/capsicum.h>
46 #include <sys/counter.h>
47 #include <sys/filedesc.h>
48 #include <sys/fnv_hash.h>
49 #include <sys/kernel.h>
50 #include <sys/ktr.h>
51 #include <sys/lock.h>
52 #include <sys/malloc.h>
53 #include <sys/fcntl.h>
54 #include <sys/mount.h>
55 #include <sys/namei.h>
56 #include <sys/proc.h>
57 #include <sys/rwlock.h>
58 #include <sys/seqc.h>
59 #include <sys/sdt.h>
60 #include <sys/smr.h>
61 #include <sys/smp.h>
62 #include <sys/syscallsubr.h>
63 #include <sys/sysctl.h>
64 #include <sys/sysproto.h>
65 #include <sys/vnode.h>
66 #include <ck_queue.h>
67 #ifdef KTRACE
68 #include <sys/ktrace.h>
69 #endif
70 
73 #include <security/audit/audit.h>
74 #include <security/mac/mac_framework.h>
75 
76 #ifdef DDB
77 #include <ddb/ddb.h>
78 #endif
79 
80 #include <vm/uma.h>
81 
82 SDT_PROVIDER_DECLARE(vfs);
83 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
84     "struct vnode *");
85 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
86     "char *");
87 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
88 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
89     "char *", "struct vnode *");
90 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
91 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
92     "struct vnode *", "char *");
93 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
94     "struct vnode *");
95 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
96     "struct vnode *", "char *");
97 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
98     "char *");
99 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
100 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
101 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
102 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
103     "struct vnode *");
104 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
105     "char *");
106 SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
107     "char *");
108 
109 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
110 SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
111 SDT_PROBE_DECLARE(vfs, namei, lookup, return);
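/*
 * These SDT probes can be observed with dtrace(1).  For example, positive
 * lookup hits can be counted per name with an (illustrative) one-liner:
 *
 *	dtrace -n 'vfs:namecache:lookup:hit { @hits[stringof(arg1)] = count(); }'
 */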
112 
113 /*
114  * This structure describes the elements in the cache of recent
115  * names looked up by namei.
116  */
117 struct negstate {
118 	u_char neg_flag;
119 };
120 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
121     "the state must fit in a union with a pointer without growing it");
122 
123 struct	namecache {
124 	CK_LIST_ENTRY(namecache) nc_hash;/* hash chain */
125 	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
126 	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
127 	struct	vnode *nc_dvp;		/* vnode of parent of name */
128 	union {
129 		struct	vnode *nu_vp;	/* vnode the name refers to */
130 		struct	negstate nu_neg;/* negative entry state */
131 	} n_un;
132 	u_char	nc_flag;		/* flag bits */
133 	u_char	nc_nlen;		/* length of name */
134 	char	nc_name[0];		/* segment name + nul */
135 };
136 
137 /*
138  * struct namecache_ts is used in place of struct namecache when time(s)
139  * need to be stored.  It embeds struct namecache as its last member,
140  * with the timestamp fields placed in front of it.
141  * The nc_dotdottime field is used when a cache entry maps both a
142  * non-dotdot directory name and dotdot for the directory's
143  * parent.
144  */
145 struct	namecache_ts {
146 	struct	timespec nc_time;	/* timespec provided by fs */
147 	struct	timespec nc_dotdottime;	/* dotdot timespec provided by fs */
148 	int	nc_ticks;		/* ticks value when entry was added */
149 	struct namecache nc_nc;
150 };
151 
152 #define	nc_vp		n_un.nu_vp
153 #define	nc_neg		n_un.nu_neg
154 
155 /*
156  * Flags in namecache.nc_flag
157  */
158 #define NCF_WHITE	0x01
159 #define NCF_ISDOTDOT	0x02
160 #define	NCF_TS		0x04
161 #define	NCF_DTS		0x08
162 #define	NCF_DVDROP	0x10
163 #define	NCF_NEGATIVE	0x20
164 #define	NCF_INVALID	0x40
165 
166 /*
167  * Flags in negstate.neg_flag
168  */
169 #define NEG_HOT		0x01
170 
171 /*
172  * Mark an entry as invalid.
173  *
174  * This is called before the entry starts getting torn down.
175  */
176 static void
177 cache_ncp_invalidate(struct namecache *ncp)
178 {
179 
180 	KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
181 	    ("%s: entry %p already invalid", __func__, ncp));
182 	ncp->nc_flag |= NCF_INVALID;
183 	atomic_thread_fence_rel();
184 }
185 
186 /*
187  * Verify validity of an entry.
188  *
189  * All places that elide locks are supposed to call this after they are
190  * done reading from an entry.
191  */
192 static bool
193 cache_ncp_invalid(struct namecache *ncp)
194 {
195 
196 	atomic_thread_fence_acq();
197 	return ((ncp->nc_flag & NCF_INVALID) != 0);
198 }
199 
200 /*
201  * Name caching works as follows:
202  *
203  * Names found by directory scans are retained in a cache
204  * for future reference.  It is managed LRU, so frequently
205  * used names will hang around.  Cache is indexed by hash value
206  * obtained from (dvp, name) where dvp refers to the directory
207  * containing name.
208  *
209  * If it is a "negative" entry, (i.e. for a name that is known NOT to
210  * exist) the vnode pointer will be NULL.
211  *
212  * Upon reaching the last segment of a path, if the reference
213  * is for DELETE, or NOCACHE is set (rewrite), and the
214  * name is located in the cache, it will be dropped.
215  *
216  * These locks are used (in the order in which they can be taken):
217  * NAME		TYPE	ROLE
218  * vnodelock	mtx	vnode lists and v_cache_dd field protection
219  * bucketlock	rwlock	for access to given set of hash buckets
220  * neglist	mtx	negative entry LRU management
221  *
222  * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
223  * shrinking the LRU list.
224  *
225  * It is legal to take multiple vnodelock and bucketlock locks. The locking
226  * order is lower address first. Both are recursive.
227  *
228  * "." lookups are lockless.
229  *
230  * ".." and vnode -> name lookups require vnodelock.
231  *
232  * name -> vnode lookup requires the relevant bucketlock to be held for reading.
233  *
234  * Insertions and removals of entries require involved vnodes and bucketlocks
235  * to be write-locked to prevent other threads from seeing the entry.
236  *
237  * Some lookups result in removal of the found entry (e.g. getting rid of a
238  * negative entry with the intent to create a positive one), which poses a
239  * problem when multiple threads reach the same state. Similarly, two different
240  * threads can purge two different vnodes and try to remove the same name.
241  *
242  * If the already held vnode lock is lower than the second required lock, we
243  * can just take the other lock. However, in the opposite case, this could
244  * deadlock. As such, this is resolved by trylocking and, if that fails,
245  * unlocking the first lock, locking everything in order and revalidating the state.
246  */
247 
248 VFS_SMR_DECLARE;
249 
250 /*
251  * Structures associated with name caching.
252  */
253 #define NCHHASH(hash) \
254 	(&nchashtbl[(hash) & nchash])
255 static __read_mostly CK_LIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
256 static u_long __read_mostly	nchash;			/* size of hash table */
257 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
258     "Size of namecache hash table");
259 static u_long __read_mostly	ncnegfactor = 5; /* ratio of negative entries */
260 SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
261     "Ratio of negative namecache entries");
262 static u_long __exclusive_cache_line	numneg;	/* number of negative entries allocated */
263 static u_long __exclusive_cache_line	numcache;/* number of cache entries allocated */
264 u_int ncsizefactor = 2;
265 SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
266     "Size factor for namecache");
267 static u_int __read_mostly	ncpurgeminvnodes;
268 SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0,
269     "Number of vnodes below which purgevfs ignores the request");
270 static u_int __read_mostly	ncsize; /* the size as computed on creation or resizing */
271 
272 struct nchstats	nchstats;		/* cache effectiveness statistics */
273 
274 static struct mtx __exclusive_cache_line	ncneg_shrink_lock;
275 
276 struct neglist {
277 	struct mtx		nl_lock;
278 	TAILQ_HEAD(, namecache) nl_list;
279 } __aligned(CACHE_LINE_SIZE);
280 
281 static struct neglist __read_mostly	*neglists;
282 static struct neglist ncneg_hot;
283 static u_long numhotneg;
284 
285 #define	numneglists (ncneghash + 1)
286 static u_int __read_mostly	ncneghash;
287 static inline struct neglist *
288 NCP2NEGLIST(struct namecache *ncp)
289 {
290 
291 	return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
292 }
293 
294 static inline struct negstate *
295 NCP2NEGSTATE(struct namecache *ncp)
296 {
297 
298 	MPASS(ncp->nc_flag & NCF_NEGATIVE);
299 	return (&ncp->nc_neg);
300 }
301 
302 #define	numbucketlocks (ncbuckethash + 1)
303 static u_int __read_mostly  ncbuckethash;
304 static struct rwlock_padalign __read_mostly  *bucketlocks;
305 #define	HASH2BUCKETLOCK(hash) \
306 	((struct rwlock *)(&bucketlocks[((hash) & ncbuckethash)]))
307 
308 #define	numvnodelocks (ncvnodehash + 1)
309 static u_int __read_mostly  ncvnodehash;
310 static struct mtx __read_mostly *vnodelocks;
311 static inline struct mtx *
312 VP2VNODELOCK(struct vnode *vp)
313 {
314 
315 	return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
316 }
317 
318 /*
319  * UMA zones for the VFS cache.
320  *
321  * The small cache is used for entries with short names, which are the
322  * most common.  The large cache is used for entries which are too big to
323  * fit in the small cache.
324  */
325 static uma_zone_t __read_mostly cache_zone_small;
326 static uma_zone_t __read_mostly cache_zone_small_ts;
327 static uma_zone_t __read_mostly cache_zone_large;
328 static uma_zone_t __read_mostly cache_zone_large_ts;
329 
330 #define	CACHE_PATH_CUTOFF	35
331 
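/*
 * Allocate a namecache entry big enough for a name of the given length,
 * from the timestamped ("_ts") zones when timestamps are to be stored.
 */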
332 static struct namecache *
333 cache_alloc(int len, int ts)
334 {
335 	struct namecache_ts *ncp_ts;
336 	struct namecache *ncp;
337 
338 	if (__predict_false(ts)) {
339 		if (len <= CACHE_PATH_CUTOFF)
340 			ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
341 		else
342 			ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
343 		ncp = &ncp_ts->nc_nc;
344 	} else {
345 		if (len <= CACHE_PATH_CUTOFF)
346 			ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
347 		else
348 			ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
349 	}
350 	return (ncp);
351 }
352 
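/*
 * Free a namecache entry back to the zone it came from, dropping the hold
 * on the parent vnode if NCF_DVDROP was set by cache_zap_locked().
 */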
353 static void
354 cache_free(struct namecache *ncp)
355 {
356 	struct namecache_ts *ncp_ts;
357 
358 	if (ncp == NULL)
359 		return;
360 	if ((ncp->nc_flag & NCF_DVDROP) != 0)
361 		vdrop(ncp->nc_dvp);
362 	if (__predict_false(ncp->nc_flag & NCF_TS)) {
363 		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
364 		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
365 			uma_zfree_smr(cache_zone_small_ts, ncp_ts);
366 		else
367 			uma_zfree_smr(cache_zone_large_ts, ncp_ts);
368 	} else {
369 		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
370 			uma_zfree_smr(cache_zone_small, ncp);
371 		else
372 			uma_zfree_smr(cache_zone_large, ncp);
373 	}
374 }
375 
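/*
 * Copy the timestamp and ticks values stored in a NCF_TS entry to the
 * caller-supplied tsp and ticksp, when requested.
 */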
376 static void
377 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
378 {
379 	struct namecache_ts *ncp_ts;
380 
381 	KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
382 	    (tsp == NULL && ticksp == NULL),
383 	    ("No NCF_TS"));
384 
385 	if (tsp == NULL && ticksp == NULL)
386 		return;
387 
388 	ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
389 	if (tsp != NULL)
390 		*tsp = ncp_ts->nc_time;
391 	if (ticksp != NULL)
392 		*ticksp = ncp_ts->nc_ticks;
393 }
394 
395 #ifdef DEBUG_CACHE
396 static int __read_mostly	doingcache = 1;	/* 1 => enable the cache */
397 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
398     "VFS namecache enabled");
399 #endif
400 
401 /* Export size information to userland */
402 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
403     sizeof(struct namecache), "sizeof(struct namecache)");
404 
405 /*
406  * The new name cache statistics
407  */
408 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
409     "Name cache statistics");
410 #define STATNODE_ULONG(name, descr)					\
411 	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr);
412 #define STATNODE_COUNTER(name, descr)					\
413 	static COUNTER_U64_DEFINE_EARLY(name);				\
414 	SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, \
415 	    descr);
416 STATNODE_ULONG(numneg, "Number of negative cache entries");
417 STATNODE_ULONG(numcache, "Number of cache entries");
418 STATNODE_COUNTER(numcachehv, "Number of namecache entries with vnodes held");
419 STATNODE_COUNTER(numdrops, "Number of dropped entries due to reaching the limit");
420 STATNODE_COUNTER(dothits, "Number of '.' hits");
421 STATNODE_COUNTER(dotdothits, "Number of '..' hits");
422 STATNODE_COUNTER(nummiss, "Number of cache misses");
423 STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache");
424 STATNODE_COUNTER(numposzaps,
425     "Number of cache hits (positive) we do not want to cache");
426 STATNODE_COUNTER(numposhits, "Number of cache hits (positive)");
427 STATNODE_COUNTER(numnegzaps,
428     "Number of cache hits (negative) we do not want to cache");
429 STATNODE_COUNTER(numneghits, "Number of cache hits (negative)");
430 /* These count for vn_getcwd(), too. */
431 STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls");
432 STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
433 STATNODE_COUNTER(numfullpathfail2,
434     "Number of fullpath search errors (VOP_VPTOCNP failures)");
435 STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
436 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls");
437 STATNODE_COUNTER(zap_and_exit_bucket_relock_success,
438     "Number of successful removals after relocking");
439 static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail,
440     "Number of times zap_and_exit failed to lock");
441 static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2,
442     "Number of times zap_and_exit failed to lock");
443 static long cache_lock_vnodes_cel_3_failures;
444 STATNODE_ULONG(cache_lock_vnodes_cel_3_failures,
445     "Number of times 3-way vnode locking failed");
446 STATNODE_ULONG(numhotneg, "Number of hot negative entries");
447 STATNODE_COUNTER(numneg_evicted,
448     "Number of negative entries evicted when adding a new entry");
449 STATNODE_COUNTER(shrinking_skipped,
450     "Number of times shrinking was already in progress");
451 
452 static void cache_zap_locked(struct namecache *ncp);
453 static int vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf,
454     char **freebuf, size_t *buflen);
455 static int vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir,
456     char *buf, char **retbuf, size_t *buflen);
457 static int vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir,
458     char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend);
459 
460 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
461 
462 static int cache_yield;
463 SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0,
464     "Number of times cache called yield");
465 
466 static void __noinline
467 cache_maybe_yield(void)
468 {
469 
470 	if (should_yield()) {
471 		cache_yield++;
472 		kern_yield(PRI_USER);
473 	}
474 }
475 
476 static inline void
477 cache_assert_vlp_locked(struct mtx *vlp)
478 {
479 
480 	if (vlp != NULL)
481 		mtx_assert(vlp, MA_OWNED);
482 }
483 
484 static inline void
485 cache_assert_vnode_locked(struct vnode *vp)
486 {
487 	struct mtx *vlp;
488 
489 	vlp = VP2VNODELOCK(vp);
490 	cache_assert_vlp_locked(vlp);
491 }
492 
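/*
 * Compute the hash over the name and the parent vnode pointer; this selects
 * both the hash chain and the bucket lock for the entry.
 */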
493 static uint32_t
494 cache_get_hash(char *name, u_char len, struct vnode *dvp)
495 {
496 	uint32_t hash;
497 
498 	hash = fnv_32_buf(name, len, FNV1_32_INIT);
499 	hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
500 	return (hash);
501 }
502 
503 static inline struct rwlock *
504 NCP2BUCKETLOCK(struct namecache *ncp)
505 {
506 	uint32_t hash;
507 
508 	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
509 	return (HASH2BUCKETLOCK(hash));
510 }
511 
512 #ifdef INVARIANTS
513 static void
514 cache_assert_bucket_locked(struct namecache *ncp, int mode)
515 {
516 	struct rwlock *blp;
517 
518 	blp = NCP2BUCKETLOCK(ncp);
519 	rw_assert(blp, mode);
520 }
521 #else
522 #define cache_assert_bucket_locked(x, y) do { } while (0)
523 #endif
524 
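/*
 * Order a pair of lock pointers by address so that locks are always taken
 * in the same (lower address first) order, preventing deadlocks.
 */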
525 #define cache_sort_vnodes(x, y)	_cache_sort_vnodes((void **)(x), (void **)(y))
526 static void
527 _cache_sort_vnodes(void **p1, void **p2)
528 {
529 	void *tmp;
530 
531 	MPASS(*p1 != NULL || *p2 != NULL);
532 
533 	if (*p1 > *p2) {
534 		tmp = *p2;
535 		*p2 = *p1;
536 		*p1 = tmp;
537 	}
538 }
539 
540 static void
541 cache_lock_all_buckets(void)
542 {
543 	u_int i;
544 
545 	for (i = 0; i < numbucketlocks; i++)
546 		rw_wlock(&bucketlocks[i]);
547 }
548 
549 static void
550 cache_unlock_all_buckets(void)
551 {
552 	u_int i;
553 
554 	for (i = 0; i < numbucketlocks; i++)
555 		rw_wunlock(&bucketlocks[i]);
556 }
557 
558 static void
559 cache_lock_all_vnodes(void)
560 {
561 	u_int i;
562 
563 	for (i = 0; i < numvnodelocks; i++)
564 		mtx_lock(&vnodelocks[i]);
565 }
566 
567 static void
568 cache_unlock_all_vnodes(void)
569 {
570 	u_int i;
571 
572 	for (i = 0; i < numvnodelocks; i++)
573 		mtx_unlock(&vnodelocks[i]);
574 }
575 
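/*
 * Try to take both vnode locks in address order without sleeping; undo and
 * return EAGAIN if either trylock fails.
 */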
576 static int
577 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
578 {
579 
580 	cache_sort_vnodes(&vlp1, &vlp2);
581 
582 	if (vlp1 != NULL) {
583 		if (!mtx_trylock(vlp1))
584 			return (EAGAIN);
585 	}
586 	if (!mtx_trylock(vlp2)) {
587 		if (vlp1 != NULL)
588 			mtx_unlock(vlp1);
589 		return (EAGAIN);
590 	}
591 
592 	return (0);
593 }
594 
595 static void
596 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
597 {
598 
599 	MPASS(vlp1 != NULL || vlp2 != NULL);
600 	MPASS(vlp1 <= vlp2);
601 
602 	if (vlp1 != NULL)
603 		mtx_lock(vlp1);
604 	if (vlp2 != NULL)
605 		mtx_lock(vlp2);
606 }
607 
608 static void
609 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
610 {
611 
612 	MPASS(vlp1 != NULL || vlp2 != NULL);
613 
614 	if (vlp1 != NULL)
615 		mtx_unlock(vlp1);
616 	if (vlp2 != NULL)
617 		mtx_unlock(vlp2);
618 }
619 
620 static int
621 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
622 {
623 	struct nchstats snap;
624 
625 	if (req->oldptr == NULL)
626 		return (SYSCTL_OUT(req, 0, sizeof(snap)));
627 
628 	snap = nchstats;
629 	snap.ncs_goodhits = counter_u64_fetch(numposhits);
630 	snap.ncs_neghits = counter_u64_fetch(numneghits);
631 	snap.ncs_badhits = counter_u64_fetch(numposzaps) +
632 	    counter_u64_fetch(numnegzaps);
633 	snap.ncs_miss = counter_u64_fetch(nummisszap) +
634 	    counter_u64_fetch(nummiss);
635 
636 	return (SYSCTL_OUT(req, &snap, sizeof(snap)));
637 }
638 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
639     CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
640     "VFS cache effectiveness statistics");
641 
642 #ifdef DIAGNOSTIC
643 /*
644  * Grab an atomic snapshot of the name cache hash chain lengths
645  */
646 static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
647     CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
648     "hash table stats");
649 
650 static int
651 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
652 {
653 	struct nchashhead *ncpp;
654 	struct namecache *ncp;
655 	int i, error, n_nchash, *cntbuf;
656 
657 retry:
658 	n_nchash = nchash + 1;	/* nchash is max index, not count */
659 	if (req->oldptr == NULL)
660 		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
661 	cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
662 	cache_lock_all_buckets();
663 	if (n_nchash != nchash + 1) {
664 		cache_unlock_all_buckets();
665 		free(cntbuf, M_TEMP);
666 		goto retry;
667 	}
668 	/* Scan hash tables counting entries */
669 	for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
670 		CK_LIST_FOREACH(ncp, ncpp, nc_hash)
671 			cntbuf[i]++;
672 	cache_unlock_all_buckets();
673 	for (error = 0, i = 0; i < n_nchash; i++)
674 		if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
675 			break;
676 	free(cntbuf, M_TEMP);
677 	return (error);
678 }
679 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
680     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
681     "nchash chain lengths");
682 
683 static int
684 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
685 {
686 	int error;
687 	struct nchashhead *ncpp;
688 	struct namecache *ncp;
689 	int n_nchash;
690 	int count, maxlength, used, pct;
691 
692 	if (!req->oldptr)
693 		return SYSCTL_OUT(req, 0, 4 * sizeof(int));
694 
695 	cache_lock_all_buckets();
696 	n_nchash = nchash + 1;	/* nchash is max index, not count */
697 	used = 0;
698 	maxlength = 0;
699 
700 	/* Scan hash tables for applicable entries */
701 	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
702 		count = 0;
703 		CK_LIST_FOREACH(ncp, ncpp, nc_hash) {
704 			count++;
705 		}
706 		if (count)
707 			used++;
708 		if (maxlength < count)
709 			maxlength = count;
710 	}
711 	n_nchash = nchash + 1;
712 	cache_unlock_all_buckets();
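	/* pct is the bucket usage ratio in hundredths of a percent. */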
713 	pct = (used * 100) / (n_nchash / 100);
714 	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
715 	if (error)
716 		return (error);
717 	error = SYSCTL_OUT(req, &used, sizeof(used));
718 	if (error)
719 		return (error);
720 	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
721 	if (error)
722 		return (error);
723 	error = SYSCTL_OUT(req, &pct, sizeof(pct));
724 	if (error)
725 		return (error);
726 	return (0);
727 }
728 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
729     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
730     "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
731 #endif
732 
733 /*
734  * Negative entries management
735  *
736  * A variation of the LRU scheme is used. New entries are hashed into one of
737  * numneglists cold lists. Entries get promoted to the hot list on first hit.
738  *
739  * The shrinker demotes the head of the hot list and evicts from the cold
740  * lists in a round-robin manner.
741  */
742 static void
743 cache_negative_init(struct namecache *ncp)
744 {
745 	struct negstate *negstate;
746 
747 	ncp->nc_flag |= NCF_NEGATIVE;
748 	negstate = NCP2NEGSTATE(ncp);
749 	negstate->neg_flag = 0;
750 }
751 
752 static void
753 cache_negative_hit(struct namecache *ncp)
754 {
755 	struct neglist *neglist;
756 	struct negstate *negstate;
757 
758 	negstate = NCP2NEGSTATE(ncp);
759 	if ((negstate->neg_flag & NEG_HOT) != 0)
760 		return;
761 	neglist = NCP2NEGLIST(ncp);
762 	mtx_lock(&ncneg_hot.nl_lock);
763 	mtx_lock(&neglist->nl_lock);
764 	if ((negstate->neg_flag & NEG_HOT) == 0) {
765 		numhotneg++;
766 		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
767 		TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
768 		negstate->neg_flag |= NEG_HOT;
769 	}
770 	mtx_unlock(&neglist->nl_lock);
771 	mtx_unlock(&ncneg_hot.nl_lock);
772 }
773 
774 static void
775 cache_negative_insert(struct namecache *ncp)
776 {
777 	struct neglist *neglist;
778 
779 	MPASS(ncp->nc_flag & NCF_NEGATIVE);
780 	cache_assert_bucket_locked(ncp, RA_WLOCKED);
781 	neglist = NCP2NEGLIST(ncp);
782 	mtx_lock(&neglist->nl_lock);
783 	TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
784 	mtx_unlock(&neglist->nl_lock);
785 	atomic_add_rel_long(&numneg, 1);
786 }
787 
788 static void
789 cache_negative_remove(struct namecache *ncp)
790 {
791 	struct neglist *neglist;
792 	struct negstate *negstate;
793 	bool hot_locked = false;
794 	bool list_locked = false;
795 
796 	cache_assert_bucket_locked(ncp, RA_WLOCKED);
797 	neglist = NCP2NEGLIST(ncp);
798 	negstate = NCP2NEGSTATE(ncp);
799 	if ((negstate->neg_flag & NEG_HOT) != 0) {
800 		hot_locked = true;
801 		mtx_lock(&ncneg_hot.nl_lock);
802 		if ((negstate->neg_flag & NEG_HOT) == 0) {
803 			list_locked = true;
804 			mtx_lock(&neglist->nl_lock);
805 		}
806 	} else {
807 		list_locked = true;
808 		mtx_lock(&neglist->nl_lock);
809 		/*
810 		 * We may be racing against promotion in lockless lookup.
811 		 */
812 		if ((negstate->neg_flag & NEG_HOT) != 0) {
813 			mtx_unlock(&neglist->nl_lock);
814 			hot_locked = true;
815 			mtx_lock(&ncneg_hot.nl_lock);
816 			mtx_lock(&neglist->nl_lock);
817 		}
818 	}
819 	if ((negstate->neg_flag & NEG_HOT) != 0) {
820 		mtx_assert(&ncneg_hot.nl_lock, MA_OWNED);
821 		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
822 		numhotneg--;
823 	} else {
824 		mtx_assert(&neglist->nl_lock, MA_OWNED);
825 		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
826 	}
827 	if (list_locked)
828 		mtx_unlock(&neglist->nl_lock);
829 	if (hot_locked)
830 		mtx_unlock(&ncneg_hot.nl_lock);
831 	atomic_subtract_rel_long(&numneg, 1);
832 }
833 
834 static void
835 cache_negative_shrink_select(struct namecache **ncpp,
836     struct neglist **neglistpp)
837 {
838 	struct neglist *neglist;
839 	struct namecache *ncp;
840 	static u_int cycle;
841 	u_int i;
842 
843 	*ncpp = ncp = NULL;
844 
845 	for (i = 0; i < numneglists; i++) {
846 		neglist = &neglists[(cycle + i) % numneglists];
847 		if (TAILQ_FIRST(&neglist->nl_list) == NULL)
848 			continue;
849 		mtx_lock(&neglist->nl_lock);
850 		ncp = TAILQ_FIRST(&neglist->nl_list);
851 		if (ncp != NULL)
852 			break;
853 		mtx_unlock(&neglist->nl_lock);
854 	}
855 
856 	*neglistpp = neglist;
857 	*ncpp = ncp;
858 	cycle++;
859 }
860 
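/*
 * Evict one negative entry: demote the current head of the hot list back to
 * its cold list, then pick a cold list victim and zap it, provided it is
 * still the same entry once all the required locks are held.
 */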
861 static void
862 cache_negative_zap_one(void)
863 {
864 	struct namecache *ncp, *ncp2;
865 	struct neglist *neglist;
866 	struct negstate *negstate;
867 	struct mtx *dvlp;
868 	struct rwlock *blp;
869 
870 	if (mtx_owner(&ncneg_shrink_lock) != NULL ||
871 	    !mtx_trylock(&ncneg_shrink_lock)) {
872 		counter_u64_add(shrinking_skipped, 1);
873 		return;
874 	}
875 
876 	mtx_lock(&ncneg_hot.nl_lock);
877 	ncp = TAILQ_FIRST(&ncneg_hot.nl_list);
878 	if (ncp != NULL) {
879 		neglist = NCP2NEGLIST(ncp);
880 		negstate = NCP2NEGSTATE(ncp);
881 		mtx_lock(&neglist->nl_lock);
882 		MPASS((negstate->neg_flag & NEG_HOT) != 0);
883 		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
884 		TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
885 		negstate->neg_flag &= ~NEG_HOT;
886 		numhotneg--;
887 		mtx_unlock(&neglist->nl_lock);
888 	}
889 	mtx_unlock(&ncneg_hot.nl_lock);
890 
891 	cache_negative_shrink_select(&ncp, &neglist);
892 
893 	mtx_unlock(&ncneg_shrink_lock);
894 	if (ncp == NULL)
895 		return;
896 
897 	MPASS(ncp->nc_flag & NCF_NEGATIVE);
898 	dvlp = VP2VNODELOCK(ncp->nc_dvp);
899 	blp = NCP2BUCKETLOCK(ncp);
900 	mtx_unlock(&neglist->nl_lock);
901 	mtx_lock(dvlp);
902 	rw_wlock(blp);
903 	/*
904 	 * Enter SMR to safely check the negative list.
905 	 * Even if the found pointer matches, the entry may now be reallocated
906 	 * and used by a different vnode.
907 	 */
908 	vfs_smr_enter();
909 	ncp2 = TAILQ_FIRST(&neglist->nl_list);
910 	if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) ||
911 	    blp != NCP2BUCKETLOCK(ncp2)) {
912 		vfs_smr_exit();
913 		ncp = NULL;
914 	} else {
915 		vfs_smr_exit();
916 		SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp,
917 		    ncp->nc_name);
918 		cache_zap_locked(ncp);
919 		counter_u64_add(numneg_evicted, 1);
920 	}
921 	rw_wunlock(blp);
922 	mtx_unlock(dvlp);
923 	cache_free(ncp);
924 }
925 
926 /*
927  * cache_zap_locked():
928  *
929  *   Removes a namecache entry from the cache, whether it contains an actual
930  *   pointer to a vnode or is just a negative cache entry.
931  */
932 static void
933 cache_zap_locked(struct namecache *ncp)
934 {
935 
936 	if (!(ncp->nc_flag & NCF_NEGATIVE))
937 		cache_assert_vnode_locked(ncp->nc_vp);
938 	cache_assert_vnode_locked(ncp->nc_dvp);
939 	cache_assert_bucket_locked(ncp, RA_WLOCKED);
940 
941 	CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp,
942 	    (ncp->nc_flag & NCF_NEGATIVE) ? NULL : ncp->nc_vp);
943 
944 	cache_ncp_invalidate(ncp);
945 
946 	CK_LIST_REMOVE(ncp, nc_hash);
947 	if (!(ncp->nc_flag & NCF_NEGATIVE)) {
948 		SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
949 		    ncp->nc_name, ncp->nc_vp);
950 		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
951 		if (ncp == ncp->nc_vp->v_cache_dd)
952 			ncp->nc_vp->v_cache_dd = NULL;
953 	} else {
954 		SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
955 		    ncp->nc_name);
956 		cache_negative_remove(ncp);
957 	}
958 	if (ncp->nc_flag & NCF_ISDOTDOT) {
959 		if (ncp == ncp->nc_dvp->v_cache_dd)
960 			ncp->nc_dvp->v_cache_dd = NULL;
961 	} else {
962 		LIST_REMOVE(ncp, nc_src);
963 		if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
964 			ncp->nc_flag |= NCF_DVDROP;
965 			counter_u64_add(numcachehv, -1);
966 		}
967 	}
968 	atomic_subtract_rel_long(&numcache, 1);
969 }
970 
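/*
 * Zap a negative entry whose parent vnode lock is already held; only the
 * bucket lock needs to be taken on top of it.
 */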
971 static void
972 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
973 {
974 	struct rwlock *blp;
975 
976 	MPASS(ncp->nc_dvp == vp);
977 	MPASS(ncp->nc_flag & NCF_NEGATIVE);
978 	cache_assert_vnode_locked(vp);
979 
980 	blp = NCP2BUCKETLOCK(ncp);
981 	rw_wlock(blp);
982 	cache_zap_locked(ncp);
983 	rw_wunlock(blp);
984 }
985 
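/*
 * Zap an entry while holding the vnode lock for one of its vnodes.  Returns
 * true if the entry was removed.  Returns false if the locks had to be taken
 * out of order; in that case *vlpp is left pointing at a held vnode lock and
 * the caller must revalidate and retry.
 */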
986 static bool
987 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
988     struct mtx **vlpp)
989 {
990 	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
991 	struct rwlock *blp;
992 
993 	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
994 	cache_assert_vnode_locked(vp);
995 
996 	if (ncp->nc_flag & NCF_NEGATIVE) {
997 		if (*vlpp != NULL) {
998 			mtx_unlock(*vlpp);
999 			*vlpp = NULL;
1000 		}
1001 		cache_zap_negative_locked_vnode_kl(ncp, vp);
1002 		return (true);
1003 	}
1004 
1005 	pvlp = VP2VNODELOCK(vp);
1006 	blp = NCP2BUCKETLOCK(ncp);
1007 	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
1008 	vlp2 = VP2VNODELOCK(ncp->nc_vp);
1009 
1010 	if (*vlpp == vlp1 || *vlpp == vlp2) {
1011 		to_unlock = *vlpp;
1012 		*vlpp = NULL;
1013 	} else {
1014 		if (*vlpp != NULL) {
1015 			mtx_unlock(*vlpp);
1016 			*vlpp = NULL;
1017 		}
1018 		cache_sort_vnodes(&vlp1, &vlp2);
1019 		if (vlp1 == pvlp) {
1020 			mtx_lock(vlp2);
1021 			to_unlock = vlp2;
1022 		} else {
1023 			if (!mtx_trylock(vlp1))
1024 				goto out_relock;
1025 			to_unlock = vlp1;
1026 		}
1027 	}
1028 	rw_wlock(blp);
1029 	cache_zap_locked(ncp);
1030 	rw_wunlock(blp);
1031 	if (to_unlock != NULL)
1032 		mtx_unlock(to_unlock);
1033 	return (true);
1034 
1035 out_relock:
1036 	mtx_unlock(vlp2);
1037 	mtx_lock(vlp1);
1038 	mtx_lock(vlp2);
1039 	MPASS(*vlpp == NULL);
1040 	*vlpp = vlp1;
1041 	return (false);
1042 }
1043 
1044 static int __noinline
1045 cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp)
1046 {
1047 	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
1048 	struct rwlock *blp;
1049 	int error = 0;
1050 
1051 	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
1052 	cache_assert_vnode_locked(vp);
1053 
1054 	pvlp = VP2VNODELOCK(vp);
1055 	if (ncp->nc_flag & NCF_NEGATIVE) {
1056 		cache_zap_negative_locked_vnode_kl(ncp, vp);
1057 		goto out;
1058 	}
1059 
1060 	blp = NCP2BUCKETLOCK(ncp);
1061 	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
1062 	vlp2 = VP2VNODELOCK(ncp->nc_vp);
1063 	cache_sort_vnodes(&vlp1, &vlp2);
1064 	if (vlp1 == pvlp) {
1065 		mtx_lock(vlp2);
1066 		to_unlock = vlp2;
1067 	} else {
1068 		if (!mtx_trylock(vlp1)) {
1069 			error = EAGAIN;
1070 			goto out;
1071 		}
1072 		to_unlock = vlp1;
1073 	}
1074 	rw_wlock(blp);
1075 	cache_zap_locked(ncp);
1076 	rw_wunlock(blp);
1077 	mtx_unlock(to_unlock);
1078 out:
1079 	mtx_unlock(pvlp);
1080 	return (error);
1081 }
1082 
1083 /*
1084  * If trylocking failed we can get here. We know enough to take all needed locks
1085  * in the right order and re-lookup the entry.
1086  */
1087 static int
1088 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1089     struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
1090     struct rwlock *blp)
1091 {
1092 	struct namecache *rncp;
1093 
1094 	cache_assert_bucket_locked(ncp, RA_UNLOCKED);
1095 
1096 	cache_sort_vnodes(&dvlp, &vlp);
1097 	cache_lock_vnodes(dvlp, vlp);
1098 	rw_wlock(blp);
1099 	CK_LIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
1100 		if (rncp == ncp && rncp->nc_dvp == dvp &&
1101 		    rncp->nc_nlen == cnp->cn_namelen &&
1102 		    !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
1103 			break;
1104 	}
1105 	if (rncp != NULL) {
1106 		cache_zap_locked(rncp);
1107 		rw_wunlock(blp);
1108 		cache_unlock_vnodes(dvlp, vlp);
1109 		counter_u64_add(zap_and_exit_bucket_relock_success, 1);
1110 		return (0);
1111 	}
1112 
1113 	rw_wunlock(blp);
1114 	cache_unlock_vnodes(dvlp, vlp);
1115 	return (EAGAIN);
1116 }
1117 
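/*
 * Zap an entry found with its bucket write-locked.  The vnode locks are
 * trylocked; on contention the bucket lock is dropped and the slow path
 * (cache_zap_unlocked_bucket) relocks everything in order.
 */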
1118 static int __noinline
1119 cache_zap_wlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1120     uint32_t hash, struct rwlock *blp)
1121 {
1122 	struct mtx *dvlp, *vlp;
1123 	struct vnode *dvp;
1124 
1125 	cache_assert_bucket_locked(ncp, RA_WLOCKED);
1126 
1127 	dvlp = VP2VNODELOCK(ncp->nc_dvp);
1128 	vlp = NULL;
1129 	if (!(ncp->nc_flag & NCF_NEGATIVE))
1130 		vlp = VP2VNODELOCK(ncp->nc_vp);
1131 	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1132 		cache_zap_locked(ncp);
1133 		rw_wunlock(blp);
1134 		cache_unlock_vnodes(dvlp, vlp);
1135 		return (0);
1136 	}
1137 
1138 	dvp = ncp->nc_dvp;
1139 	rw_wunlock(blp);
1140 	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1141 }
1142 
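/*
 * As above, but entered with the bucket only read-locked.  Once the vnode
 * locks are held the entry cannot go away, so the read lock can simply be
 * traded for a write lock before zapping.
 */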
1143 static int __noinline
1144 cache_zap_rlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1145     uint32_t hash, struct rwlock *blp)
1146 {
1147 	struct mtx *dvlp, *vlp;
1148 	struct vnode *dvp;
1149 
1150 	cache_assert_bucket_locked(ncp, RA_RLOCKED);
1151 
1152 	dvlp = VP2VNODELOCK(ncp->nc_dvp);
1153 	vlp = NULL;
1154 	if (!(ncp->nc_flag & NCF_NEGATIVE))
1155 		vlp = VP2VNODELOCK(ncp->nc_vp);
1156 	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1157 		rw_runlock(blp);
1158 		rw_wlock(blp);
1159 		cache_zap_locked(ncp);
1160 		rw_wunlock(blp);
1161 		cache_unlock_vnodes(dvlp, vlp);
1162 		return (0);
1163 	}
1164 
1165 	dvp = ncp->nc_dvp;
1166 	rw_runlock(blp);
1167 	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1168 }
1169 
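/*
 * Zap an entry with its bucket write-locked, reusing vnode locks already
 * held by the caller (*vlpp1, *vlpp2) when possible.  Returns EAGAIN with
 * the needed vnode locks acquired if the caller has to revalidate and retry.
 */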
1170 static int
1171 cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp,
1172     struct mtx **vlpp1, struct mtx **vlpp2)
1173 {
1174 	struct mtx *dvlp, *vlp;
1175 
1176 	cache_assert_bucket_locked(ncp, RA_WLOCKED);
1177 
1178 	dvlp = VP2VNODELOCK(ncp->nc_dvp);
1179 	vlp = NULL;
1180 	if (!(ncp->nc_flag & NCF_NEGATIVE))
1181 		vlp = VP2VNODELOCK(ncp->nc_vp);
1182 	cache_sort_vnodes(&dvlp, &vlp);
1183 
1184 	if (*vlpp1 == dvlp && *vlpp2 == vlp) {
1185 		cache_zap_locked(ncp);
1186 		cache_unlock_vnodes(dvlp, vlp);
1187 		*vlpp1 = NULL;
1188 		*vlpp2 = NULL;
1189 		return (0);
1190 	}
1191 
1192 	if (*vlpp1 != NULL)
1193 		mtx_unlock(*vlpp1);
1194 	if (*vlpp2 != NULL)
1195 		mtx_unlock(*vlpp2);
1196 	*vlpp1 = NULL;
1197 	*vlpp2 = NULL;
1198 
1199 	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1200 		cache_zap_locked(ncp);
1201 		cache_unlock_vnodes(dvlp, vlp);
1202 		return (0);
1203 	}
1204 
1205 	rw_wunlock(blp);
1206 	*vlpp1 = dvlp;
1207 	*vlpp2 = vlp;
1208 	if (*vlpp1 != NULL)
1209 		mtx_lock(*vlpp1);
1210 	mtx_lock(*vlpp2);
1211 	rw_wlock(blp);
1212 	return (EAGAIN);
1213 }
1214 
1215 static void
1216 cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp)
1217 {
1218 
1219 	if (blp != NULL) {
1220 		rw_runlock(blp);
1221 	} else {
1222 		mtx_unlock(vlp);
1223 	}
1224 }
1225 
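/*
 * Handle a lookup of ".": return the directory itself with an extra
 * reference, adjusting the lock type if the caller asked for a different one.
 */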
1226 static int __noinline
1227 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1228     struct timespec *tsp, int *ticksp)
1229 {
1230 	int ltype;
1231 
1232 	*vpp = dvp;
1233 	CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .",
1234 			dvp, cnp->cn_nameptr);
1235 	counter_u64_add(dothits, 1);
1236 	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
1237 	if (tsp != NULL)
1238 		timespecclear(tsp);
1239 	if (ticksp != NULL)
1240 		*ticksp = ticks;
1241 	vrefact(*vpp);
1242 	/*
1243 	 * When we look up "." we can still be asked to lock it
1244 	 * differently...
1245 	 */
1246 	ltype = cnp->cn_lkflags & LK_TYPE_MASK;
1247 	if (ltype != VOP_ISLOCKED(*vpp)) {
1248 		if (ltype == LK_EXCLUSIVE) {
1249 			vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
1250 			if (VN_IS_DOOMED((*vpp))) {
1251 				/* forced unmount */
1252 				vrele(*vpp);
1253 				*vpp = NULL;
1254 				return (ENOENT);
1255 			}
1256 		} else
1257 			vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
1258 	}
1259 	return (-1);
1260 }
1261 
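/*
 * Handle a lookup with MAKEENTRY cleared: rather than returning a hit,
 * remove any matching entry instead.
 */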
1262 static __noinline int
1263 cache_lookup_nomakeentry(struct vnode *dvp, struct vnode **vpp,
1264     struct componentname *cnp, struct timespec *tsp, int *ticksp)
1265 {
1266 	struct namecache *ncp;
1267 	struct rwlock *blp;
1268 	struct mtx *dvlp, *dvlp2;
1269 	uint32_t hash;
1270 	int error;
1271 
1272 	if (cnp->cn_namelen == 2 &&
1273 	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
1274 		counter_u64_add(dotdothits, 1);
1275 		dvlp = VP2VNODELOCK(dvp);
1276 		dvlp2 = NULL;
1277 		mtx_lock(dvlp);
1278 retry_dotdot:
1279 		ncp = dvp->v_cache_dd;
1280 		if (ncp == NULL) {
1281 			SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
1282 			    "..", NULL);
1283 			mtx_unlock(dvlp);
1284 			if (dvlp2 != NULL)
1285 				mtx_unlock(dvlp2);
1286 			return (0);
1287 		}
1288 		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1289 			if (ncp->nc_dvp != dvp)
1290 				panic("dvp %p v_cache_dd %p\n", dvp, ncp);
1291 			if (!cache_zap_locked_vnode_kl2(ncp,
1292 			    dvp, &dvlp2))
1293 				goto retry_dotdot;
1294 			MPASS(dvp->v_cache_dd == NULL);
1295 			mtx_unlock(dvlp);
1296 			if (dvlp2 != NULL)
1297 				mtx_unlock(dvlp2);
1298 			cache_free(ncp);
1299 		} else {
1300 			dvp->v_cache_dd = NULL;
1301 			mtx_unlock(dvlp);
1302 			if (dvlp2 != NULL)
1303 				mtx_unlock(dvlp2);
1304 		}
1305 		return (0);
1306 	}
1307 
1308 	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1309 	blp = HASH2BUCKETLOCK(hash);
1310 retry:
1311 	if (CK_LIST_EMPTY(NCHHASH(hash)))
1312 		goto out_no_entry;
1313 
1314 	rw_wlock(blp);
1315 
1316 	CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1317 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1318 		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1319 			break;
1320 	}
1321 
1322 	/* We failed to find an entry */
1323 	if (ncp == NULL) {
1324 		rw_wunlock(blp);
1325 		goto out_no_entry;
1326 	}
1327 
1328 	error = cache_zap_wlocked_bucket(ncp, cnp, hash, blp);
1329 	if (__predict_false(error != 0)) {
1330 		zap_and_exit_bucket_fail++;
1331 		cache_maybe_yield();
1332 		goto retry;
1333 	}
1334 	counter_u64_add(numposzaps, 1);
1335 	cache_free(ncp);
1336 	return (0);
1337 out_no_entry:
1338 	SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, NULL);
1339 	counter_u64_add(nummisszap, 1);
1340 	return (0);
1341 }
1342 
1343 /**
1344  * Lookup a name in the name cache
1345  *
1346  * # Arguments
1347  *
1348  * - dvp:	Parent directory in which to search.
1349  * - vpp:	Return argument.  Will contain desired vnode on cache hit.
1350  * - cnp:	Parameters of the name search.  The most interesting bits of
1351  *   		the cn_flags field have the following meanings:
1352  *   	- MAKEENTRY:	If clear, free an entry from the cache rather than look
1353  *   			it up.
1354  *   	- ISDOTDOT:	Must be set if and only if cn_nameptr == ".."
1355  * - tsp:	Return storage for cache timestamp.  On a successful (positive
1356  *   		or negative) lookup, tsp will be filled with any timespec that
1357  *   		was stored when this cache entry was created.  However, it will
1358  *   		be clear for "." entries.
1359  * - ticks:	Return storage for alternate cache timestamp.  On a successful
1360  *   		(positive or negative) lookup, it will contain the ticks value
1361  *   		that was current when the cache entry was created, unless cnp
1362  *   		was ".".
1363  *
1364  * # Returns
1365  *
1366  * - -1:	A positive cache hit.  vpp will contain the desired vnode.
1367  * - ENOENT:	A negative cache hit, or dvp was recycled out from under us due
1368  *		to a forced unmount.  vpp will not be modified.  If the entry
1369  *		is a whiteout, then the ISWHITEOUT flag will be set in
1370  *		cnp->cn_flags.
1371  * - 0:		A cache miss.  vpp will not be modified.
1372  *
1373  * # Locking
1374  *
1375  * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
1376  * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
1377  * lock is not recursively acquired.
1378  */
1379 int
1380 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1381     struct timespec *tsp, int *ticksp)
1382 {
1383 	struct namecache_ts *ncp_ts;
1384 	struct namecache *ncp;
1385 	struct negstate *negstate;
1386 	struct rwlock *blp;
1387 	struct mtx *dvlp;
1388 	uint32_t hash;
1389 	enum vgetstate vs;
1390 	int error, ltype;
1391 	bool try_smr, doing_smr, whiteout;
1392 
1393 #ifdef DEBUG_CACHE
1394 	if (__predict_false(!doingcache)) {
1395 		cnp->cn_flags &= ~MAKEENTRY;
1396 		return (0);
1397 	}
1398 #endif
1399 
1400 	if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.'))
1401 		return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
1402 
1403 	if ((cnp->cn_flags & MAKEENTRY) == 0)
1404 		return (cache_lookup_nomakeentry(dvp, vpp, cnp, tsp, ticksp));
1405 
1406 	try_smr = true;
1407 	if (cnp->cn_nameiop == CREATE)
1408 		try_smr = false;
1409 retry:
1410 	doing_smr = false;
1411 	blp = NULL;
1412 	dvlp = NULL;
1413 	error = 0;
1414 	if (cnp->cn_namelen == 2 &&
1415 	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
1416 		counter_u64_add(dotdothits, 1);
1417 		dvlp = VP2VNODELOCK(dvp);
1418 		mtx_lock(dvlp);
1419 		ncp = dvp->v_cache_dd;
1420 		if (ncp == NULL) {
1421 			SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
1422 			    "..", NULL);
1423 			mtx_unlock(dvlp);
1424 			return (0);
1425 		}
1426 		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1427 			if (ncp->nc_flag & NCF_NEGATIVE)
1428 				*vpp = NULL;
1429 			else
1430 				*vpp = ncp->nc_vp;
1431 		} else
1432 			*vpp = ncp->nc_dvp;
1433 		/* Return failure if negative entry was found. */
1434 		if (*vpp == NULL)
1435 			goto negative_success;
1436 		CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..",
1437 		    dvp, cnp->cn_nameptr, *vpp);
1438 		SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..",
1439 		    *vpp);
1440 		cache_out_ts(ncp, tsp, ticksp);
1441 		if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
1442 		    NCF_DTS && tsp != NULL) {
1443 			ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1444 			*tsp = ncp_ts->nc_dotdottime;
1445 		}
1446 		goto success;
1447 	}
1448 
1449 	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1450 retry_hashed:
1451 	if (try_smr) {
1452 		vfs_smr_enter();
1453 		doing_smr = true;
1454 		try_smr = false;
1455 	} else {
1456 		blp = HASH2BUCKETLOCK(hash);
1457 		rw_rlock(blp);
1458 	}
1459 
1460 	CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1461 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1462 		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1463 			break;
1464 	}
1465 
1466 	/* We failed to find an entry */
1467 	if (__predict_false(ncp == NULL)) {
1468 		if (doing_smr)
1469 			vfs_smr_exit();
1470 		else
1471 			rw_runlock(blp);
1472 		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
1473 		    NULL);
1474 		counter_u64_add(nummiss, 1);
1475 		return (0);
1476 	}
1477 
1478 	if (ncp->nc_flag & NCF_NEGATIVE)
1479 		goto negative_success;
1480 
1481 	/* We found a "positive" match, return the vnode */
1482 	counter_u64_add(numposhits, 1);
1483 	*vpp = ncp->nc_vp;
1484 	CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
1485 	    dvp, cnp->cn_nameptr, *vpp, ncp);
1486 	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name,
1487 	    *vpp);
1488 	cache_out_ts(ncp, tsp, ticksp);
1489 success:
1490 	/*
1491 	 * On success we return a locked and ref'd vnode as per the lookup
1492 	 * protocol.
1493 	 */
1494 	MPASS(dvp != *vpp);
1495 	ltype = 0;	/* silence gcc warning */
1496 	if (cnp->cn_flags & ISDOTDOT) {
1497 		ltype = VOP_ISLOCKED(dvp);
1498 		VOP_UNLOCK(dvp);
1499 	}
1500 	if (doing_smr) {
1501 		if (cache_ncp_invalid(ncp)) {
1502 			vfs_smr_exit();
1503 			*vpp = NULL;
1504 			goto retry;
1505 		}
1506 		vs = vget_prep_smr(*vpp);
1507 		vfs_smr_exit();
1508 		if (vs == VGET_NONE) {
1509 			*vpp = NULL;
1510 			goto retry;
1511 		}
1512 	} else {
1513 		vs = vget_prep(*vpp);
1514 		cache_lookup_unlock(blp, dvlp);
1515 	}
1516 	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1517 	if (cnp->cn_flags & ISDOTDOT) {
1518 		vn_lock(dvp, ltype | LK_RETRY);
1519 		if (VN_IS_DOOMED(dvp)) {
1520 			if (error == 0)
1521 				vput(*vpp);
1522 			*vpp = NULL;
1523 			return (ENOENT);
1524 		}
1525 	}
1526 	if (error) {
1527 		*vpp = NULL;
1528 		goto retry;
1529 	}
1530 	if ((cnp->cn_flags & ISLASTCN) &&
1531 	    (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) {
1532 		ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
1533 	}
1534 	return (-1);
1535 
1536 negative_success:
1537 	/* We found a negative match, and want to create it, so purge */
1538 	if (cnp->cn_nameiop == CREATE) {
1539 		MPASS(!doing_smr);
1540 		counter_u64_add(numnegzaps, 1);
1541 		goto zap_and_exit;
1542 	}
1543 
1544 	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
1545 	cache_out_ts(ncp, tsp, ticksp);
1546 	counter_u64_add(numneghits, 1);
1547 	whiteout = (ncp->nc_flag & NCF_WHITE);
1548 
1549 	if (doing_smr) {
1550 		/*
1551 		 * We need to take locks to promote an entry.
1552 		 */
1553 		negstate = NCP2NEGSTATE(ncp);
1554 		if ((negstate->neg_flag & NEG_HOT) == 0 ||
1555 		    cache_ncp_invalid(ncp)) {
1556 			vfs_smr_exit();
1557 			doing_smr = false;
1558 			goto retry_hashed;
1559 		}
1560 		vfs_smr_exit();
1561 	} else {
1562 		cache_negative_hit(ncp);
1563 		cache_lookup_unlock(blp, dvlp);
1564 	}
1565 	if (whiteout)
1566 		cnp->cn_flags |= ISWHITEOUT;
1567 	return (ENOENT);
1568 
1569 zap_and_exit:
1570 	MPASS(!doing_smr);
1571 	if (blp != NULL)
1572 		error = cache_zap_rlocked_bucket(ncp, cnp, hash, blp);
1573 	else
1574 		error = cache_zap_locked_vnode(ncp, dvp);
1575 	if (__predict_false(error != 0)) {
1576 		zap_and_exit_bucket_fail2++;
1577 		cache_maybe_yield();
1578 		goto retry;
1579 	}
1580 	cache_free(ncp);
1581 	return (0);
1582 }
1583 
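/*
 * Lock state used when inserting an entry: up to three vnode locks and two
 * bucket locks may need to be held at the same time.
 */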
1584 struct celockstate {
1585 	struct mtx *vlp[3];
1586 	struct rwlock *blp[2];
1587 };
1588 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
1589 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
1590 
1591 static inline void
1592 cache_celockstate_init(struct celockstate *cel)
1593 {
1594 
1595 	bzero(cel, sizeof(*cel));
1596 }
1597 
1598 static void
1599 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
1600     struct vnode *dvp)
1601 {
1602 	struct mtx *vlp1, *vlp2;
1603 
1604 	MPASS(cel->vlp[0] == NULL);
1605 	MPASS(cel->vlp[1] == NULL);
1606 	MPASS(cel->vlp[2] == NULL);
1607 
1608 	MPASS(vp != NULL || dvp != NULL);
1609 
1610 	vlp1 = VP2VNODELOCK(vp);
1611 	vlp2 = VP2VNODELOCK(dvp);
1612 	cache_sort_vnodes(&vlp1, &vlp2);
1613 
1614 	if (vlp1 != NULL) {
1615 		mtx_lock(vlp1);
1616 		cel->vlp[0] = vlp1;
1617 	}
1618 	mtx_lock(vlp2);
1619 	cel->vlp[1] = vlp2;
1620 }
1621 
1622 static void
1623 cache_unlock_vnodes_cel(struct celockstate *cel)
1624 {
1625 
1626 	MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
1627 
1628 	if (cel->vlp[0] != NULL)
1629 		mtx_unlock(cel->vlp[0]);
1630 	if (cel->vlp[1] != NULL)
1631 		mtx_unlock(cel->vlp[1]);
1632 	if (cel->vlp[2] != NULL)
1633 		mtx_unlock(cel->vlp[2]);
1634 }
1635 
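/*
 * Take a third vnode lock on top of the two already held in cel.  If the
 * lock cannot be taken without violating the ordering, everything is
 * unlocked and reacquired in address order; false is returned in that case
 * so the caller can revalidate.
 */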
1636 static bool
1637 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
1638 {
1639 	struct mtx *vlp;
1640 	bool ret;
1641 
1642 	cache_assert_vlp_locked(cel->vlp[0]);
1643 	cache_assert_vlp_locked(cel->vlp[1]);
1644 	MPASS(cel->vlp[2] == NULL);
1645 
1646 	MPASS(vp != NULL);
1647 	vlp = VP2VNODELOCK(vp);
1648 
1649 	ret = true;
1650 	if (vlp >= cel->vlp[1]) {
1651 		mtx_lock(vlp);
1652 	} else {
1653 		if (mtx_trylock(vlp))
1654 			goto out;
1655 		cache_lock_vnodes_cel_3_failures++;
1656 		cache_unlock_vnodes_cel(cel);
1657 		if (vlp < cel->vlp[0]) {
1658 			mtx_lock(vlp);
1659 			mtx_lock(cel->vlp[0]);
1660 			mtx_lock(cel->vlp[1]);
1661 		} else {
1662 			if (cel->vlp[0] != NULL)
1663 				mtx_lock(cel->vlp[0]);
1664 			mtx_lock(vlp);
1665 			mtx_lock(cel->vlp[1]);
1666 		}
1667 		ret = false;
1668 	}
1669 out:
1670 	cel->vlp[2] = vlp;
1671 	return (ret);
1672 }
1673 
1674 static void
1675 cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1,
1676     struct rwlock *blp2)
1677 {
1678 
1679 	MPASS(cel->blp[0] == NULL);
1680 	MPASS(cel->blp[1] == NULL);
1681 
1682 	cache_sort_vnodes(&blp1, &blp2);
1683 
1684 	if (blp1 != NULL) {
1685 		rw_wlock(blp1);
1686 		cel->blp[0] = blp1;
1687 	}
1688 	rw_wlock(blp2);
1689 	cel->blp[1] = blp2;
1690 }
1691 
1692 static void
1693 cache_unlock_buckets_cel(struct celockstate *cel)
1694 {
1695 
1696 	if (cel->blp[0] != NULL)
1697 		rw_wunlock(cel->blp[0]);
1698 	rw_wunlock(cel->blp[1]);
1699 }
1700 
1701 /*
1702  * Lock part of the cache affected by the insertion.
1703  *
1704  * This means vnodelocks for dvp, vp and the relevant bucketlock.
1705  * However, insertion can result in removal of an old entry. In this
1706  * case we have an additional vnode and bucketlock pair to lock. If the
1707  * entry is negative, it has no vnode and no third vnode lock is needed.
1708  *
1709  * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
1710  * preserving the locking order (smaller address first).
1711  */
1712 static void
1713 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
1714     uint32_t hash)
1715 {
1716 	struct namecache *ncp;
1717 	struct rwlock *blps[2];
1718 
1719 	blps[0] = HASH2BUCKETLOCK(hash);
1720 	for (;;) {
1721 		blps[1] = NULL;
1722 		cache_lock_vnodes_cel(cel, dvp, vp);
1723 		if (vp == NULL || vp->v_type != VDIR)
1724 			break;
1725 		ncp = vp->v_cache_dd;
1726 		if (ncp == NULL)
1727 			break;
1728 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
1729 			break;
1730 		MPASS(ncp->nc_dvp == vp);
1731 		blps[1] = NCP2BUCKETLOCK(ncp);
1732 		if (ncp->nc_flag & NCF_NEGATIVE)
1733 			break;
1734 		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
1735 			break;
1736 		/*
1737 		 * All vnodes got re-locked. Re-validate the state and if
1738 		 * nothing changed we are done. Otherwise restart.
1739 		 */
1740 		if (ncp == vp->v_cache_dd &&
1741 		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
1742 		    blps[1] == NCP2BUCKETLOCK(ncp) &&
1743 		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
1744 			break;
1745 		cache_unlock_vnodes_cel(cel);
1746 		cel->vlp[0] = NULL;
1747 		cel->vlp[1] = NULL;
1748 		cel->vlp[2] = NULL;
1749 	}
1750 	cache_lock_buckets_cel(cel, blps[0], blps[1]);
1751 }
1752 
1753 static void
1754 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
1755     uint32_t hash)
1756 {
1757 	struct namecache *ncp;
1758 	struct rwlock *blps[2];
1759 
1760 	blps[0] = HASH2BUCKETLOCK(hash);
1761 	for (;;) {
1762 		blps[1] = NULL;
1763 		cache_lock_vnodes_cel(cel, dvp, vp);
1764 		ncp = dvp->v_cache_dd;
1765 		if (ncp == NULL)
1766 			break;
1767 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
1768 			break;
1769 		MPASS(ncp->nc_dvp == dvp);
1770 		blps[1] = NCP2BUCKETLOCK(ncp);
1771 		if (ncp->nc_flag & NCF_NEGATIVE)
1772 			break;
1773 		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
1774 			break;
1775 		if (ncp == dvp->v_cache_dd &&
1776 		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
1777 		    blps[1] == NCP2BUCKETLOCK(ncp) &&
1778 		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
1779 			break;
1780 		cache_unlock_vnodes_cel(cel);
1781 		cel->vlp[0] = NULL;
1782 		cel->vlp[1] = NULL;
1783 		cel->vlp[2] = NULL;
1784 	}
1785 	cache_lock_buckets_cel(cel, blps[0], blps[1]);
1786 }
1787 
1788 static void
1789 cache_enter_unlock(struct celockstate *cel)
1790 {
1791 
1792 	cache_unlock_buckets_cel(cel);
1793 	cache_unlock_vnodes_cel(cel);
1794 }
1795 
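/*
 * Prepare for the insertion of a ".." entry by clearing dvp->v_cache_dd and
 * zapping the old ".." entry hanging off it, if any.
 */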
1796 static void __noinline
1797 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
1798     struct componentname *cnp)
1799 {
1800 	struct celockstate cel;
1801 	struct namecache *ncp;
1802 	uint32_t hash;
1803 	int len;
1804 
1805 	if (dvp->v_cache_dd == NULL)
1806 		return;
1807 	len = cnp->cn_namelen;
1808 	cache_celockstate_init(&cel);
1809 	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
1810 	cache_enter_lock_dd(&cel, dvp, vp, hash);
1811 	ncp = dvp->v_cache_dd;
1812 	if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
1813 		KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
1814 		cache_zap_locked(ncp);
1815 	} else {
1816 		ncp = NULL;
1817 	}
1818 	dvp->v_cache_dd = NULL;
1819 	cache_enter_unlock(&cel);
1820 	cache_free(ncp);
1821 }
1822 
1823 /*
1824  * Add an entry to the cache.
1825  */
1826 void
1827 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
1828     struct timespec *tsp, struct timespec *dtsp)
1829 {
1830 	struct celockstate cel;
1831 	struct namecache *ncp, *n2, *ndd;
1832 	struct namecache_ts *ncp_ts, *n2_ts;
1833 	struct nchashhead *ncpp;
1834 	uint32_t hash;
1835 	int flag;
1836 	int len;
1837 	u_long lnumcache;
1838 
1839 	CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr);
1840 	VNASSERT(vp == NULL || !VN_IS_DOOMED(vp), vp,
1841 	    ("cache_enter: Adding a doomed vnode"));
1842 	VNASSERT(dvp == NULL || !VN_IS_DOOMED(dvp), dvp,
1843 	    ("cache_enter: Doomed vnode used as src"));
1844 
1845 #ifdef DEBUG_CACHE
1846 	if (__predict_false(!doingcache))
1847 		return;
1848 #endif
1849 
1850 	flag = 0;
1851 	if (__predict_false(cnp->cn_nameptr[0] == '.')) {
1852 		if (cnp->cn_namelen == 1)
1853 			return;
1854 		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
1855 			cache_enter_dotdot_prep(dvp, vp, cnp);
1856 			flag = NCF_ISDOTDOT;
1857 		}
1858 	}
1859 
1860 	/*
1861 	 * Avoid blowout in namecache entries.
1862 	 */
1863 	lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
1864 	if (__predict_false(lnumcache >= ncsize)) {
1865 		atomic_add_long(&numcache, -1);
1866 		counter_u64_add(numdrops, 1);
1867 		return;
1868 	}
1869 
1870 	cache_celockstate_init(&cel);
1871 	ndd = NULL;
1872 	ncp_ts = NULL;
1873 
1874 	/*
1875 	 * Calculate the hash key and set up as much of the new
1876 	 * namecache entry as possible before acquiring the lock.
1877 	 */
1878 	ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
1879 	ncp->nc_flag = flag;
1880 	ncp->nc_vp = vp;
1881 	if (vp == NULL)
1882 		cache_negative_init(ncp);
1883 	ncp->nc_dvp = dvp;
1884 	if (tsp != NULL) {
1885 		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1886 		ncp_ts->nc_time = *tsp;
1887 		ncp_ts->nc_ticks = ticks;
1888 		ncp_ts->nc_nc.nc_flag |= NCF_TS;
1889 		if (dtsp != NULL) {
1890 			ncp_ts->nc_dotdottime = *dtsp;
1891 			ncp_ts->nc_nc.nc_flag |= NCF_DTS;
1892 		}
1893 	}
1894 	len = ncp->nc_nlen = cnp->cn_namelen;
1895 	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
1896 	strlcpy(ncp->nc_name, cnp->cn_nameptr, len + 1);
1897 	cache_enter_lock(&cel, dvp, vp, hash);
1898 
1899 	/*
1900 	 * See if this vnode or negative entry is already in the cache
1901 	 * with this name.  This can happen with concurrent lookups of
1902 	 * the same path name.
1903 	 */
1904 	ncpp = NCHHASH(hash);
1905 	CK_LIST_FOREACH(n2, ncpp, nc_hash) {
1906 		if (n2->nc_dvp == dvp &&
1907 		    n2->nc_nlen == cnp->cn_namelen &&
1908 		    !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
1909 			if (tsp != NULL) {
1910 				KASSERT((n2->nc_flag & NCF_TS) != 0,
1911 				    ("no NCF_TS"));
1912 				n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
1913 				n2_ts->nc_time = ncp_ts->nc_time;
1914 				n2_ts->nc_ticks = ncp_ts->nc_ticks;
1915 				if (dtsp != NULL) {
1916 					n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
1917 					n2_ts->nc_nc.nc_flag |= NCF_DTS;
1918 				}
1919 			}
1920 			goto out_unlock_free;
1921 		}
1922 	}
1923 
1924 	if (flag == NCF_ISDOTDOT) {
1925 		/*
1926 		 * See if we are trying to add a .. entry, but some other
1927 		 * lookup has already populated the v_cache_dd pointer.
1928 		 */
1929 		if (dvp->v_cache_dd != NULL)
1930 			goto out_unlock_free;
1931 		KASSERT(vp == NULL || vp->v_type == VDIR,
1932 		    ("wrong vnode type %p", vp));
1933 		dvp->v_cache_dd = ncp;
1934 	}
1935 
1936 	if (vp != NULL) {
1937 		if (vp->v_type == VDIR) {
1938 			if (flag != NCF_ISDOTDOT) {
1939 				/*
1940 				 * In this case, the cache entry maps both the
1941 				 * name of the directory and the name ".." for
1942 				 * the directory's parent.
1943 				 */
1944 				if ((ndd = vp->v_cache_dd) != NULL) {
1945 					if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
1946 						cache_zap_locked(ndd);
1947 					else
1948 						ndd = NULL;
1949 				}
1950 				vp->v_cache_dd = ncp;
1951 			}
1952 		} else {
1953 			vp->v_cache_dd = NULL;
1954 		}
1955 	}
1956 
1957 	if (flag != NCF_ISDOTDOT) {
1958 		if (LIST_EMPTY(&dvp->v_cache_src)) {
1959 			vhold(dvp);
1960 			counter_u64_add(numcachehv, 1);
1961 		}
1962 		LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
1963 	}
1964 
1965 	/*
1966 	 * If the entry is "negative", we place it into the
1967 	 * "negative" cache queue; otherwise, we place it into the
1968 	 * destination vnode's cache entries queue.
1969 	 */
1970 	if (vp != NULL) {
1971 		TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
1972 		SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
1973 		    vp);
1974 	} else {
1975 		if (cnp->cn_flags & ISWHITEOUT)
1976 			ncp->nc_flag |= NCF_WHITE;
1977 		cache_negative_insert(ncp);
1978 		SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
1979 		    ncp->nc_name);
1980 	}
1981 
1982 	atomic_thread_fence_rel();
1983 	/*
1984 	 * Insert the new namecache entry into the appropriate chain
1985 	 * within the cache entries table.
1986 	 */
1987 	CK_LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
1988 
1989 	cache_enter_unlock(&cel);
1990 	if (numneg * ncnegfactor > lnumcache)
1991 		cache_negative_zap_one();
1992 	cache_free(ndd);
1993 	return;
1994 out_unlock_free:
1995 	cache_enter_unlock(&cel);
1996 	cache_free(ncp);
1997 	return;
1998 }
1999 
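/*
 * Round up to the smallest power of 2 strictly greater than the argument,
 * e.g. 7 -> 8 and 8 -> 16 (note the "<=" in the loop below: an exact power
 * of 2 is still bumped to the next one).
 */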
2000 static u_int
2001 cache_roundup_2(u_int val)
2002 {
2003 	u_int res;
2004 
2005 	for (res = 1; res <= val; res <<= 1)
2006 		continue;
2007 
2008 	return (res);
2009 }
2010 
2011 /*
2012  * Name cache initialization, from vfs_init() when we are booting
2013  */
2014 static void
2015 nchinit(void *dummy __unused)
2016 {
2017 	u_int i;
2018 
2019 	cache_zone_small = uma_zcreate("S VFS Cache",
2020 	    sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1,
2021 	    NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache),
2022 	    UMA_ZONE_ZINIT);
2023 	cache_zone_small_ts = uma_zcreate("STS VFS Cache",
2024 	    sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1,
2025 	    NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts),
2026 	    UMA_ZONE_ZINIT);
2027 	cache_zone_large = uma_zcreate("L VFS Cache",
2028 	    sizeof(struct namecache) + NAME_MAX + 1,
2029 	    NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache),
2030 	    UMA_ZONE_ZINIT);
2031 	cache_zone_large_ts = uma_zcreate("LTS VFS Cache",
2032 	    sizeof(struct namecache_ts) + NAME_MAX + 1,
2033 	    NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts),
2034 	    UMA_ZONE_ZINIT);
2035 
2036 	VFS_SMR_ZONE_SET(cache_zone_small);
2037 	VFS_SMR_ZONE_SET(cache_zone_small_ts);
2038 	VFS_SMR_ZONE_SET(cache_zone_large);
2039 	VFS_SMR_ZONE_SET(cache_zone_large_ts);
2040 
2041 	ncsize = desiredvnodes * ncsizefactor;
2042 	nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
2043 	ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
2044 	if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
2045 		ncbuckethash = 7;
2046 	if (ncbuckethash > nchash)
2047 		ncbuckethash = nchash;
2048 	bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
2049 	    M_WAITOK | M_ZERO);
2050 	for (i = 0; i < numbucketlocks; i++)
2051 		rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE);
2052 	ncvnodehash = ncbuckethash;
2053 	vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
2054 	    M_WAITOK | M_ZERO);
2055 	for (i = 0; i < numvnodelocks; i++)
2056 		mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
2057 	ncpurgeminvnodes = numbucketlocks * 2;
2058 
2059 	ncneghash = 3;
2060 	neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE,
2061 	    M_WAITOK | M_ZERO);
2062 	for (i = 0; i < numneglists; i++) {
2063 		mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
2064 		TAILQ_INIT(&neglists[i].nl_list);
2065 	}
2066 	mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF);
2067 	TAILQ_INIT(&ncneg_hot.nl_list);
2068 
2069 	mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF);
2070 }
2071 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
2072 
2073 void
2074 cache_changesize(u_long newmaxvnodes)
2075 {
2076 	struct nchashhead *new_nchashtbl, *old_nchashtbl;
2077 	u_long new_nchash, old_nchash;
2078 	struct namecache *ncp;
2079 	uint32_t hash;
2080 	u_long newncsize;
2081 	int i;
2082 
2083 	newncsize = newmaxvnodes * ncsizefactor;
2084 	newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
2085 	if (newmaxvnodes < numbucketlocks)
2086 		newmaxvnodes = numbucketlocks;
2087 
2088 	new_nchashtbl = hashinit(newmaxvnodes, M_VFSCACHE, &new_nchash);
2089 	/* If same hash table size, nothing to do */
2090 	if (nchash == new_nchash) {
2091 		free(new_nchashtbl, M_VFSCACHE);
2092 		return;
2093 	}
2094 	/*
2095 	 * Move everything from the old hash table to the new table.
2096 	 * None of the namecache entries in the table can be removed from
2097 	 * under us, since removal requires the bucket locks, all taken below.
2098 	 */
2099 	cache_lock_all_vnodes();
2100 	cache_lock_all_buckets();
2101 	old_nchashtbl = nchashtbl;
2102 	old_nchash = nchash;
2103 	nchashtbl = new_nchashtbl;
2104 	nchash = new_nchash;
2105 	for (i = 0; i <= old_nchash; i++) {
2106 		while ((ncp = CK_LIST_FIRST(&old_nchashtbl[i])) != NULL) {
2107 			hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
2108 			    ncp->nc_dvp);
2109 			CK_LIST_REMOVE(ncp, nc_hash);
2110 			CK_LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
2111 		}
2112 	}
2113 	ncsize = newncsize;
2114 	cache_unlock_all_buckets();
2115 	cache_unlock_all_vnodes();
2116 	free(old_nchashtbl, M_VFSCACHE);
2117 }
2118 
2119 /*
2120  * Invalidate all entries from and to a particular vnode.
2121  */
2122 void
2123 cache_purge(struct vnode *vp)
2124 {
2125 	TAILQ_HEAD(, namecache) ncps;
2126 	struct namecache *ncp, *nnp;
2127 	struct mtx *vlp, *vlp2;
2128 
2129 	CTR1(KTR_VFS, "cache_purge(%p)", vp);
2130 	SDT_PROBE1(vfs, namecache, purge, done, vp);
2131 	if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2132 	    vp->v_cache_dd == NULL)
2133 		return;
2134 	TAILQ_INIT(&ncps);
2135 	vlp = VP2VNODELOCK(vp);
2136 	vlp2 = NULL;
2137 	mtx_lock(vlp);
2138 retry:
2139 	while (!LIST_EMPTY(&vp->v_cache_src)) {
2140 		ncp = LIST_FIRST(&vp->v_cache_src);
2141 		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2142 			goto retry;
2143 		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2144 	}
2145 	while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
2146 		ncp = TAILQ_FIRST(&vp->v_cache_dst);
2147 		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2148 			goto retry;
2149 		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2150 	}
2151 	ncp = vp->v_cache_dd;
2152 	if (ncp != NULL) {
2153 		KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
2154 		   ("lost dotdot link"));
2155 		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2156 			goto retry;
2157 		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2158 	}
2159 	KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
2160 	mtx_unlock(vlp);
2161 	if (vlp2 != NULL)
2162 		mtx_unlock(vlp2);
2163 	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2164 		cache_free(ncp);
2165 	}
2166 }
2167 
2168 /*
2169  * Invalidate all negative entries for a particular directory vnode.
2170  */
2171 void
2172 cache_purge_negative(struct vnode *vp)
2173 {
2174 	TAILQ_HEAD(, namecache) ncps;
2175 	struct namecache *ncp, *nnp;
2176 	struct mtx *vlp;
2177 
2178 	CTR1(KTR_VFS, "cache_purge_negative(%p)", vp);
2179 	SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
2180 	if (LIST_EMPTY(&vp->v_cache_src))
2181 		return;
2182 	TAILQ_INIT(&ncps);
2183 	vlp = VP2VNODELOCK(vp);
2184 	mtx_lock(vlp);
2185 	LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
2186 		if (!(ncp->nc_flag & NCF_NEGATIVE))
2187 			continue;
2188 		cache_zap_negative_locked_vnode_kl(ncp, vp);
2189 		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2190 	}
2191 	mtx_unlock(vlp);
2192 	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2193 		cache_free(ncp);
2194 	}
2195 }
2196 
2197 /*
2198  * Flush all entries referencing a particular filesystem.
2199  */
2200 void
2201 cache_purgevfs(struct mount *mp, bool force)
2202 {
2203 	TAILQ_HEAD(, namecache) ncps;
2204 	struct mtx *vlp1, *vlp2;
2205 	struct rwlock *blp;
2206 	struct nchashhead *bucket;
2207 	struct namecache *ncp, *nnp;
2208 	u_long i, j, n_nchash;
2209 	int error;
2210 
2211 	/* Scan hash tables for applicable entries */
2212 	SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
2213 	if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes)
2214 		return;
2215 	TAILQ_INIT(&ncps);
2216 	n_nchash = nchash + 1;
2217 	vlp1 = vlp2 = NULL;
2218 	for (i = 0; i < numbucketlocks; i++) {
2219 		blp = (struct rwlock *)&bucketlocks[i];
2220 		rw_wlock(blp);
2221 		for (j = i; j < n_nchash; j += numbucketlocks) {
2222 retry:
2223 			bucket = &nchashtbl[j];
2224 			CK_LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) {
2225 				cache_assert_bucket_locked(ncp, RA_WLOCKED);
2226 				if (ncp->nc_dvp->v_mount != mp)
2227 					continue;
2228 				error = cache_zap_wlocked_bucket_kl(ncp, blp,
2229 				    &vlp1, &vlp2);
2230 				if (error != 0)
2231 					goto retry;
2232 				TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst);
2233 			}
2234 		}
2235 		rw_wunlock(blp);
2236 		if (vlp1 == NULL && vlp2 == NULL)
2237 			cache_maybe_yield();
2238 	}
2239 	if (vlp1 != NULL)
2240 		mtx_unlock(vlp1);
2241 	if (vlp2 != NULL)
2242 		mtx_unlock(vlp2);
2243 
2244 	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2245 		cache_free(ncp);
2246 	}
2247 }
2248 
2249 /*
2250  * Perform canonical checks and cache lookup and pass on to filesystem
2251  * through the vop_cachedlookup only if needed.
2252  */
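/*
 * The return convention of cache_lookup() as consumed below: 0 denotes a
 * cache miss and the filesystem's VOP_CACHEDLOOKUP() gets consulted, -1
 * denotes a positive hit with *vpp filled in, and any other value
 * (typically ENOENT stemming from a negative entry) is handed back to the
 * caller as the lookup error.
 */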
2253 
2254 int
2255 vfs_cache_lookup(struct vop_lookup_args *ap)
2256 {
2257 	struct vnode *dvp;
2258 	int error;
2259 	struct vnode **vpp = ap->a_vpp;
2260 	struct componentname *cnp = ap->a_cnp;
2261 	int flags = cnp->cn_flags;
2262 
2263 	*vpp = NULL;
2264 	dvp = ap->a_dvp;
2265 
2266 	if (dvp->v_type != VDIR)
2267 		return (ENOTDIR);
2268 
2269 	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
2270 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
2271 		return (EROFS);
2272 
2273 	error = vn_dir_check_exec(dvp, cnp);
2274 	if (error != 0)
2275 		return (error);
2276 
2277 	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
2278 	if (error == 0)
2279 		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
2280 	if (error == -1)
2281 		return (0);
2282 	return (error);
2283 }
2284 
2285 /* Implementation of the getcwd syscall. */
2286 int
2287 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
2288 {
2289 	char *buf, *retbuf;
2290 	size_t buflen;
2291 	int error;
2292 
2293 	buflen = uap->buflen;
2294 	if (__predict_false(buflen < 2))
2295 		return (EINVAL);
2296 	if (buflen > MAXPATHLEN)
2297 		buflen = MAXPATHLEN;
2298 
2299 	buf = malloc(buflen, M_TEMP, M_WAITOK);
2300 	error = vn_getcwd(td, buf, &retbuf, &buflen);
2301 	if (error == 0)
2302 		error = copyout(retbuf, uap->buf, buflen);
2303 	free(buf, M_TEMP);
2304 	return (error);
2305 }
2306 
2307 int
2308 vn_getcwd(struct thread *td, char *buf, char **retbuf, size_t *buflen)
2309 {
2310 	struct pwd *pwd;
2311 	int error;
2312 
2313 	pwd = pwd_hold(td);
2314 	error = vn_fullpath_any(td, pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, buflen);
2315 	pwd_drop(pwd);
2316 
2317 #ifdef KTRACE
2318 	if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
2319 		ktrnamei(*retbuf);
2320 #endif
2321 	return (error);
2322 }
2323 
2324 static int
2325 kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
2326     size_t size, int flags, enum uio_seg pathseg)
2327 {
2328 	struct nameidata nd;
2329 	char *retbuf, *freebuf;
2330 	int error;
2331 
2332 	if (flags != 0)
2333 		return (EINVAL);
2334 	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1,
2335 	    pathseg, path, fd, &cap_fstat_rights, td);
2336 	if ((error = namei(&nd)) != 0)
2337 		return (error);
2338 	error = vn_fullpath_hardlink(td, &nd, &retbuf, &freebuf, &size);
2339 	if (error == 0) {
2340 		error = copyout(retbuf, buf, size);
2341 		free(freebuf, M_TEMP);
2342 	}
2343 	NDFREE(&nd, 0);
2344 	return (error);
2345 }
2346 
2347 int
2348 sys___realpathat(struct thread *td, struct __realpathat_args *uap)
2349 {
2350 
2351 	return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
2352 	    uap->flags, UIO_USERSPACE));
2353 }
2354 
2355 /*
2356  * Retrieve the full filesystem path that corresponds to a vnode from the
2357  * name cache (if available).
2358  */
2359 int
2360 vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
2361 {
2362 	struct pwd *pwd;
2363 	char *buf;
2364 	size_t buflen;
2365 	int error;
2366 
2367 	if (__predict_false(vn == NULL))
2368 		return (EINVAL);
2369 
2370 	buflen = MAXPATHLEN;
2371 	buf = malloc(buflen, M_TEMP, M_WAITOK);
2372 	pwd = pwd_hold(td);
2373 	error = vn_fullpath_any(td, vn, pwd->pwd_rdir, buf, retbuf, &buflen);
2374 	pwd_drop(pwd);
2375 
2376 	if (!error)
2377 		*freebuf = buf;
2378 	else
2379 		free(buf, M_TEMP);
2380 	return (error);
2381 }
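/*
 * A minimal usage sketch for vn_fullpath() (hypothetical caller, error
 * handling trimmed): on success the path is read via retbuf and the
 * backing storage is released through freebuf.
 *
 *	char *retbuf, *freebuf;
 *
 *	if (vn_fullpath(curthread, vp, &retbuf, &freebuf) == 0) {
 *		printf("path: %s\n", retbuf);
 *		free(freebuf, M_TEMP);
 *	}
 */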
2382 
2383 /*
2384  * This function is similar to vn_fullpath, but it attempts to lookup the
2385  * pathname relative to the global root mount point.  This is required for the
2386  * auditing sub-system, as audited pathnames must be absolute, relative to the
2387  * global root mount point.
2388  */
2389 int
2390 vn_fullpath_global(struct thread *td, struct vnode *vn,
2391     char **retbuf, char **freebuf)
2392 {
2393 	char *buf;
2394 	size_t buflen;
2395 	int error;
2396 
2397 	if (__predict_false(vn == NULL))
2398 		return (EINVAL);
2399 	buflen = MAXPATHLEN;
2400 	buf = malloc(buflen, M_TEMP, M_WAITOK);
2401 	error = vn_fullpath_any(td, vn, rootvnode, buf, retbuf, &buflen);
2402 	if (!error)
2403 		*freebuf = buf;
2404 	else
2405 		free(buf, M_TEMP);
2406 	return (error);
2407 }
2408 
2409 int
2410 vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, size_t *buflen)
2411 {
2412 	struct vnode *dvp;
2413 	struct namecache *ncp;
2414 	struct mtx *vlp;
2415 	int error;
2416 
2417 	vlp = VP2VNODELOCK(*vp);
2418 	mtx_lock(vlp);
2419 	TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) {
2420 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2421 			break;
2422 	}
2423 	if (ncp != NULL) {
2424 		if (*buflen < ncp->nc_nlen) {
2425 			mtx_unlock(vlp);
2426 			vrele(*vp);
2427 			counter_u64_add(numfullpathfail4, 1);
2428 			error = ENOMEM;
2429 			SDT_PROBE3(vfs, namecache, fullpath, return, error,
2430 			    vp, NULL);
2431 			return (error);
2432 		}
2433 		*buflen -= ncp->nc_nlen;
2434 		memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
2435 		SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
2436 		    ncp->nc_name, vp);
2437 		dvp = *vp;
2438 		*vp = ncp->nc_dvp;
2439 		vref(*vp);
2440 		mtx_unlock(vlp);
2441 		vrele(dvp);
2442 		return (0);
2443 	}
2444 	SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
2445 
2446 	mtx_unlock(vlp);
2447 	vn_lock(*vp, LK_SHARED | LK_RETRY);
2448 	error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen);
2449 	vput(*vp);
2450 	if (error) {
2451 		counter_u64_add(numfullpathfail2, 1);
2452 		SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
2453 		return (error);
2454 	}
2455 
2456 	*vp = dvp;
2457 	if (VN_IS_DOOMED(dvp)) {
2458 		/* forced unmount */
2459 		vrele(dvp);
2460 		error = ENOENT;
2461 		SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
2462 		return (error);
2463 	}
2464 	/*
2465 	 * *vp still has its use count incremented.
2466 	 */
2467 
2468 	return (0);
2469 }
2470 
2471 /*
2472  * Resolve a directory to a pathname.
2473  *
2474  * The name of the directory can always be found in the namecache or fetched
2475  * from the filesystem. There is also guaranteed to be only one parent, meaning
2476  * we can just follow vnodes up until we find the root.
2477  *
2478  * The vnode must be referenced.
2479  */
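/*
 * A rough illustration of the buffer handling below (hypothetical layout):
 * names are copied right to left, each prefixed with '/', so for a vnode
 * reachable as "b" under "a" under the root the buffer ends up as
 *
 *	[ unused space        ][/][a][/][b][\0]
 *	  ^buf                  ^retbuf
 *
 * and *retbuf points into the middle of the allocation rather than at buf.
 */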
2480 static int
2481 vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir,
2482     char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend)
2483 {
2484 #ifdef KDTRACE_HOOKS
2485 	struct vnode *startvp = vp;
2486 #endif
2487 	struct vnode *vp1;
2488 	size_t buflen;
2489 	int error;
2490 
2491 	VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
2492 	VNPASS(vp->v_usecount > 0, vp);
2493 
2494 	buflen = *len;
2495 
2496 	if (!slash_prefixed) {
2497 		MPASS(*len >= 2);
2498 		buflen--;
2499 		buf[buflen] = '\0';
2500 	}
2501 
2502 	error = 0;
2503 
2504 	SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
2505 	counter_u64_add(numfullpathcalls, 1);
2506 	while (vp != rdir && vp != rootvnode) {
2507 		/*
2508 		 * The vp vnode must already be fully constructed,
2509 		 * since it is either found in the namecache or obtained
2510 		 * from VOP_VPTOCNP().  We may test for VV_ROOT safely
2511 		 * without obtaining the vnode lock.
2512 		 */
2513 		if ((vp->v_vflag & VV_ROOT) != 0) {
2514 			vn_lock(vp, LK_RETRY | LK_SHARED);
2515 
2516 			/*
2517 			 * With the vnode locked, check for races with
2518 			 * unmount, forced or not.  Note that we
2519 			 * already verified that vp is not equal to
2520 			 * the root vnode, which means that
2521 			 * mnt_vnodecovered can be NULL only for the
2522 			 * case of unmount.
2523 			 */
2524 			if (VN_IS_DOOMED(vp) ||
2525 			    (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
2526 			    vp1->v_mountedhere != vp->v_mount) {
2527 				vput(vp);
2528 				error = ENOENT;
2529 				SDT_PROBE3(vfs, namecache, fullpath, return,
2530 				    error, vp, NULL);
2531 				break;
2532 			}
2533 
2534 			vref(vp1);
2535 			vput(vp);
2536 			vp = vp1;
2537 			continue;
2538 		}
2539 		if (vp->v_type != VDIR) {
2540 			vrele(vp);
2541 			counter_u64_add(numfullpathfail1, 1);
2542 			error = ENOTDIR;
2543 			SDT_PROBE3(vfs, namecache, fullpath, return,
2544 			    error, vp, NULL);
2545 			break;
2546 		}
2547 		error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen);
2548 		if (error)
2549 			break;
2550 		if (buflen == 0) {
2551 			vrele(vp);
2552 			error = ENOMEM;
2553 			SDT_PROBE3(vfs, namecache, fullpath, return, error,
2554 			    startvp, NULL);
2555 			break;
2556 		}
2557 		buf[--buflen] = '/';
2558 		slash_prefixed = true;
2559 	}
2560 	if (error)
2561 		return (error);
2562 	if (!slash_prefixed) {
2563 		if (buflen == 0) {
2564 			vrele(vp);
2565 			counter_u64_add(numfullpathfail4, 1);
2566 			SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
2567 			    startvp, NULL);
2568 			return (ENOMEM);
2569 		}
2570 		buf[--buflen] = '/';
2571 	}
2572 	counter_u64_add(numfullpathfound, 1);
2573 	vrele(vp);
2574 
2575 	*retbuf = buf + buflen;
2576 	SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
2577 	*len -= buflen;
2578 	*len += addend;
2579 	return (0);
2580 }
2581 
2582 /*
2583  * Resolve an arbitrary vnode to a pathname.
2584  *
2585  * Note 2 caveats:
2586  * - hardlinks are not tracked, thus if the vnode is not a directory this can
2587  *   resolve to a different path than the one used to find it
2588  * - namecache is not mandatory, meaning names are not guaranteed to be added
2589  *   (in which case resolving fails)
2590  */
2591 static int
2592 vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir,
2593     char *buf, char **retbuf, size_t *buflen)
2594 {
2595 	size_t orig_buflen;
2596 	bool slash_prefixed;
2597 	int error;
2598 
2599 	if (*buflen < 2)
2600 		return (EINVAL);
2601 
2602 	orig_buflen = *buflen;
2603 
2604 	vref(vp);
2605 	slash_prefixed = false;
2606 	if (vp->v_type != VDIR) {
2607 		*buflen -= 1;
2608 		buf[*buflen] = '\0';
2609 		error = vn_vptocnp(&vp, td->td_ucred, buf, buflen);
2610 		if (error)
2611 			return (error);
2612 		if (*buflen == 0) {
2613 			vrele(vp);
2614 			return (ENOMEM);
2615 		}
2616 		*buflen -= 1;
2617 		buf[*buflen] = '/';
2618 		slash_prefixed = true;
2619 	}
2620 
2621 	return (vn_fullpath_dir(td, vp, rdir, buf, retbuf, buflen, slash_prefixed,
2622 	    orig_buflen - *buflen));
2623 }
2624 
2625 /*
2626  * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
2627  *
2628  * Since the namecache does not track hardlinks, the caller is expected to first
2629  * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei.
2630  *
2631  * Then we have 2 cases:
2632  * - if the found vnode is a directory, the path can be constructed just by
2633  *   following names up the chain
2634  * - otherwise we populate the buffer with the saved name and start resolving
2635  *   from the parent
2636  */
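/*
 * For instance (hypothetical lookup), resolving a regular file found as
 * "file" under its parent directory first deposits "/file" at the end of
 * the buffer and then continues as if resolving the parent directory
 * itself, so the rest of the path gets prepended in front of it.
 */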
2637 static int
2638 vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf,
2639     char **freebuf, size_t *buflen)
2640 {
2641 	char *buf, *tmpbuf;
2642 	struct pwd *pwd;
2643 	struct componentname *cnp;
2644 	struct vnode *vp;
2645 	size_t addend;
2646 	int error;
2647 	bool slash_prefixed;
2648 
2649 	if (*buflen < 2)
2650 		return (EINVAL);
2651 	if (*buflen > MAXPATHLEN)
2652 		*buflen = MAXPATHLEN;
2653 
2654 	slash_prefixed = false;
2655 
2656 	buf = malloc(*buflen, M_TEMP, M_WAITOK);
2657 	pwd = pwd_hold(td);
2658 
2659 	addend = 0;
2660 	vp = ndp->ni_vp;
2661 	if (vp->v_type != VDIR) {
2662 		cnp = &ndp->ni_cnd;
2663 		addend = cnp->cn_namelen + 2;
2664 		if (*buflen < addend) {
2665 			error = ENOMEM;
2666 			goto out_bad;
2667 		}
2668 		*buflen -= addend;
2669 		tmpbuf = buf + *buflen;
2670 		tmpbuf[0] = '/';
2671 		memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen);
2672 		tmpbuf[addend - 1] = '\0';
2673 		slash_prefixed = true;
2674 		vp = ndp->ni_dvp;
2675 	}
2676 
2677 	vref(vp);
2678 	error = vn_fullpath_dir(td, vp, pwd->pwd_rdir, buf, retbuf, buflen,
2679 	    slash_prefixed, addend);
2680 	if (error != 0)
2681 		goto out_bad;
2682 
2683 	pwd_drop(pwd);
2684 	*freebuf = buf;
2685 
2686 	return (0);
2687 out_bad:
2688 	pwd_drop(pwd);
2689 	free(buf, M_TEMP);
2690 	return (error);
2691 }
2692 
2693 struct vnode *
2694 vn_dir_dd_ino(struct vnode *vp)
2695 {
2696 	struct namecache *ncp;
2697 	struct vnode *ddvp;
2698 	struct mtx *vlp;
2699 	enum vgetstate vs;
2700 
2701 	ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
2702 	vlp = VP2VNODELOCK(vp);
2703 	mtx_lock(vlp);
2704 	TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
2705 		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
2706 			continue;
2707 		ddvp = ncp->nc_dvp;
2708 		vs = vget_prep(ddvp);
2709 		mtx_unlock(vlp);
2710 		if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
2711 			return (NULL);
2712 		return (ddvp);
2713 	}
2714 	mtx_unlock(vlp);
2715 	return (NULL);
2716 }
2717 
2718 int
2719 vn_commname(struct vnode *vp, char *buf, u_int buflen)
2720 {
2721 	struct namecache *ncp;
2722 	struct mtx *vlp;
2723 	int l;
2724 
2725 	vlp = VP2VNODELOCK(vp);
2726 	mtx_lock(vlp);
2727 	TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
2728 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2729 			break;
2730 	if (ncp == NULL) {
2731 		mtx_unlock(vlp);
2732 		return (ENOENT);
2733 	}
2734 	l = min(ncp->nc_nlen, buflen - 1);
2735 	memcpy(buf, ncp->nc_name, l);
2736 	mtx_unlock(vlp);
2737 	buf[l] = '\0';
2738 	return (0);
2739 }
2740 
2741 /*
2742  * This function updates the path string to the vnode's full global path
2743  * and checks the size of the new path string against the pathlen argument.
2744  *
2745  * Requires a locked, referenced vnode.
2746  * Vnode is re-locked on success or ENODEV, otherwise unlocked.
2747  *
2748  * If vp is a directory, the call to vn_fullpath_global() always succeeds
2749  * because it falls back to the ".." lookup if the namecache lookup fails.
2750  */
2751 int
2752 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
2753     u_int pathlen)
2754 {
2755 	struct nameidata nd;
2756 	struct vnode *vp1;
2757 	char *rpath, *fbuf;
2758 	int error;
2759 
2760 	ASSERT_VOP_ELOCKED(vp, __func__);
2761 
2762 	/* Construct global filesystem path from vp. */
2763 	VOP_UNLOCK(vp);
2764 	error = vn_fullpath_global(td, vp, &rpath, &fbuf);
2765 
2766 	if (error != 0) {
2767 		vrele(vp);
2768 		return (error);
2769 	}
2770 
2771 	if (strlen(rpath) >= pathlen) {
2772 		vrele(vp);
2773 		error = ENAMETOOLONG;
2774 		goto out;
2775 	}
2776 
2777 	/*
2778 	 * Re-lookup the vnode by path to detect a possible rename.
2779 	 * As a side effect, the vnode is relocked.
2780 	 * If vnode was renamed, return ENOENT.
2781 	 */
2782 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
2783 	    UIO_SYSSPACE, path, td);
2784 	error = namei(&nd);
2785 	if (error != 0) {
2786 		vrele(vp);
2787 		goto out;
2788 	}
2789 	NDFREE(&nd, NDF_ONLY_PNBUF);
2790 	vp1 = nd.ni_vp;
2791 	vrele(vp);
2792 	if (vp1 == vp)
2793 		strcpy(path, rpath);
2794 	else {
2795 		vput(vp1);
2796 		error = ENOENT;
2797 	}
2798 
2799 out:
2800 	free(fbuf, M_TEMP);
2801 	return (error);
2802 }
2803 
2804 #ifdef DDB
2805 static void
2806 db_print_vpath(struct vnode *vp)
2807 {
2808 
2809 	while (vp != NULL) {
2810 		db_printf("%p: ", vp);
2811 		if (vp == rootvnode) {
2812 			db_printf("/");
2813 			vp = NULL;
2814 		} else {
2815 			if (vp->v_vflag & VV_ROOT) {
2816 				db_printf("<mount point>");
2817 				vp = vp->v_mount->mnt_vnodecovered;
2818 			} else {
2819 				struct namecache *ncp;
2820 				char *ncn;
2821 				int i;
2822 
2823 				ncp = TAILQ_FIRST(&vp->v_cache_dst);
2824 				if (ncp != NULL) {
2825 					ncn = ncp->nc_name;
2826 					for (i = 0; i < ncp->nc_nlen; i++)
2827 						db_printf("%c", *ncn++);
2828 					vp = ncp->nc_dvp;
2829 				} else {
2830 					vp = NULL;
2831 				}
2832 			}
2833 		}
2834 		db_printf("\n");
2835 	}
2836 
2837 	return;
2838 }
2839 
2840 DB_SHOW_COMMAND(vpath, db_show_vpath)
2841 {
2842 	struct vnode *vp;
2843 
2844 	if (!have_addr) {
2845 		db_printf("usage: show vpath <struct vnode *>\n");
2846 		return;
2847 	}
2848 
2849 	vp = (struct vnode *)addr;
2850 	db_print_vpath(vp);
2851 }
2852 
2853 #endif
2854 
2855 extern uma_zone_t namei_zone;
2856 
2857 static bool __read_frequently cache_fast_lookup = true;
2858 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW,
2859     &cache_fast_lookup, 0, "");
2860 
2861 #define CACHE_FPL_FAILED	-2020
2862 
2863 static void
2864 cache_fpl_cleanup_cnp(struct componentname *cnp)
2865 {
2866 
2867 	uma_zfree(namei_zone, cnp->cn_pnbuf);
2868 #ifdef DIAGNOSTIC
2869 	cnp->cn_pnbuf = NULL;
2870 	cnp->cn_nameptr = NULL;
2871 #endif
2872 }
2873 
2874 static void
2875 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp)
2876 {
2877 	struct componentname *cnp;
2878 
2879 	cnp = &ndp->ni_cnd;
2880 	while (*(cnp->cn_nameptr) == '/') {
2881 		cnp->cn_nameptr++;
2882 		ndp->ni_pathlen--;
2883 	}
2884 
2885 	*dpp = ndp->ni_rootdir;
2886 }
2887 
2888 /*
2889  * Components of nameidata (or objects it can point to) which may
2890  * need restoring in case fast path lookup fails.
2891  */
2892 struct nameidata_saved {
2893 	int cn_flags;
2894 	long cn_namelen;
2895 	char *cn_nameptr;
2896 	size_t ni_pathlen;
2897 };
2898 
2899 struct cache_fpl {
2900 	int line;
2901 	enum cache_fpl_status status;
2902 	bool in_smr;
2903 	struct nameidata *ndp;
2904 	struct nameidata_saved snd;
2905 	struct componentname *cnp;
2906 	struct vnode *dvp;
2907 	seqc_t dvp_seqc;
2908 	struct vnode *tvp;
2909 	seqc_t tvp_seqc;
2910 	struct pwd *pwd;
2911 };
2912 
2913 static void
2914 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd)
2915 {
2916 
2917 	snd->cn_flags = fpl->ndp->ni_cnd.cn_flags;
2918 	snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen;
2919 	snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
2920 	snd->ni_pathlen = fpl->ndp->ni_pathlen;
2921 }
2922 
2923 static void
2924 cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd)
2925 {
2926 
2927 	fpl->ndp->ni_cnd.cn_flags = snd->cn_flags;
2928 	fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen;
2929 	fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr;
2930 	fpl->ndp->ni_pathlen = snd->ni_pathlen;
2931 }
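/*
 * The checkpoint/restore pair above brackets the fast path: the state is
 * checkpointed before each component is parsed and restored if the lookup
 * aborts (back to the original state) or goes partial (back to the last
 * consistent state), so that the regular lookup can take over from there.
 */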
2932 
2933 #ifdef INVARIANTS
2934 #define cache_fpl_smr_assert_entered(fpl) ({			\
2935 	struct cache_fpl *_fpl = (fpl);				\
2936 	MPASS(_fpl->in_smr == true);				\
2937 	VFS_SMR_ASSERT_ENTERED();				\
2938 })
2939 #define cache_fpl_smr_assert_not_entered(fpl) ({		\
2940 	struct cache_fpl *_fpl = (fpl);				\
2941 	MPASS(_fpl->in_smr == false);				\
2942 	VFS_SMR_ASSERT_NOT_ENTERED();				\
2943 })
2944 #else
2945 #define cache_fpl_smr_assert_entered(fpl) do { } while (0)
2946 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
2947 #endif
2948 
2949 #define cache_fpl_smr_enter(fpl) ({				\
2950 	struct cache_fpl *_fpl = (fpl);				\
2951 	MPASS(_fpl->in_smr == false);				\
2952 	vfs_smr_enter();					\
2953 	_fpl->in_smr = true;					\
2954 })
2955 
2956 #define cache_fpl_smr_exit(fpl) ({				\
2957 	struct cache_fpl *_fpl = (fpl);				\
2958 	MPASS(_fpl->in_smr == true);				\
2959 	vfs_smr_exit();						\
2960 	_fpl->in_smr = false;					\
2961 })
2962 
2963 static int
2964 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
2965 {
2966 
2967 	if (fpl->status != CACHE_FPL_STATUS_UNSET) {
2968 		KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
2969 		    ("%s: converting to abort from %d at %d, set at %d\n",
2970 		    __func__, fpl->status, line, fpl->line));
2971 	}
2972 	fpl->status = CACHE_FPL_STATUS_ABORTED;
2973 	fpl->line = line;
2974 	return (CACHE_FPL_FAILED);
2975 }
2976 
2977 #define cache_fpl_aborted(x)	cache_fpl_aborted_impl((x), __LINE__)
2978 
2979 static int
2980 cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
2981 {
2982 
2983 	KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
2984 	    ("%s: setting to partial at %d, but already set to %d at %d\n",
2985 	    __func__, line, fpl->status, fpl->line));
2986 	cache_fpl_smr_assert_entered(fpl);
2987 	fpl->status = CACHE_FPL_STATUS_PARTIAL;
2988 	fpl->line = line;
2989 	return (CACHE_FPL_FAILED);
2990 }
2991 
2992 #define cache_fpl_partial(x)	cache_fpl_partial_impl((x), __LINE__)
2993 
2994 static int
2995 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line)
2996 {
2997 
2998 	KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
2999 	    ("%s: setting to handled at %d, but already set to %d at %d\n",
3000 	    __func__, line, fpl->status, fpl->line));
3001 	cache_fpl_smr_assert_not_entered(fpl);
3002 	MPASS(error != CACHE_FPL_FAILED);
3003 	fpl->status = CACHE_FPL_STATUS_HANDLED;
3004 	fpl->line = line;
3005 	return (error);
3006 }
3007 
3008 #define cache_fpl_handled(x, e)	cache_fpl_handled_impl((x), (e), __LINE__)
3009 
3010 #define CACHE_FPL_SUPPORTED_CN_FLAGS \
3011 	(LOCKLEAF | LOCKPARENT | WANTPARENT | FOLLOW | LOCKSHARED | SAVENAME | \
3012 	 ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2)
3013 
3014 static bool
3015 cache_can_fplookup(struct cache_fpl *fpl)
3016 {
3017 	struct nameidata *ndp;
3018 	struct componentname *cnp;
3019 	struct thread *td;
3020 
3021 	ndp = fpl->ndp;
3022 	cnp = fpl->cnp;
3023 	td = cnp->cn_thread;
3024 
3025 	if (!cache_fast_lookup) {
3026 		cache_fpl_aborted(fpl);
3027 		return (false);
3028 	}
3029 #ifdef MAC
3030 	if (mac_vnode_check_lookup_enabled()) {
3031 		cache_fpl_aborted(fpl);
3032 		return (false);
3033 	}
3034 #endif
3035 	if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
3036 		cache_fpl_aborted(fpl);
3037 		return (false);
3038 	}
3039 	if (cnp->cn_nameiop != LOOKUP) {
3040 		cache_fpl_aborted(fpl);
3041 		return (false);
3042 	}
3043 	if (ndp->ni_dirfd != AT_FDCWD) {
3044 		cache_fpl_aborted(fpl);
3045 		return (false);
3046 	}
3047 	if (IN_CAPABILITY_MODE(td)) {
3048 		cache_fpl_aborted(fpl);
3049 		return (false);
3050 	}
3051 	if (AUDITING_TD(td)) {
3052 		cache_fpl_aborted(fpl);
3053 		return (false);
3054 	}
3055 	if (ndp->ni_startdir != NULL) {
3056 		cache_fpl_aborted(fpl);
3057 		return (false);
3058 	}
3059 	return (true);
3060 }
3061 
3062 static bool
3063 cache_fplookup_vnode_supported(struct vnode *vp)
3064 {
3065 
3066 	return (vp->v_type != VLNK);
3067 }
3068 
3069 /*
3070  * Move a negative entry to the hot list.
3071  *
3072  * We have to take locks, but they may be contended and in the worst
3073  * case we may need to go off CPU. We don't want to spin within the
3074  * smr section and we can't block while in it. Instead we are going to
3075  * look up the entry again.
3076  */
3077 static int __noinline
3078 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
3079     uint32_t hash)
3080 {
3081 	struct componentname *cnp;
3082 	struct namecache *ncp;
3083 	struct neglist *neglist;
3084 	struct negstate *negstate;
3085 	struct vnode *dvp;
3086 	u_char nc_flag;
3087 
3088 	cnp = fpl->cnp;
3089 	dvp = fpl->dvp;
3090 
3091 	if (!vhold_smr(dvp))
3092 		return (cache_fpl_aborted(fpl));
3093 
3094 	neglist = NCP2NEGLIST(oncp);
3095 	cache_fpl_smr_exit(fpl);
3096 
3097 	mtx_lock(&ncneg_hot.nl_lock);
3098 	mtx_lock(&neglist->nl_lock);
3099 	/*
3100 	 * Re-enter the SMR section for the hash iteration below.
3101 	 */
3102 	cache_fpl_smr_enter(fpl);
3103 
3104 	/*
3105 	 * Avoid all surprises by only succeeding if we got the same entry and
3106 	 * bailing completely otherwise.
3107 	 *
3108 	 * In particular at this point there can be a new ncp which matches the
3109 	 * search but hashes to a different neglist.
3110 	 */
3111 	CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
3112 		if (ncp == oncp)
3113 			break;
3114 	}
3115 
3116 	/*
3117 	 * No match to begin with.
3118 	 */
3119 	if (__predict_false(ncp == NULL)) {
3120 		goto out_abort;
3121 	}
3122 
3123 	/*
3124 	 * The newly found entry may be something different...
3125 	 */
3126 	if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
3127 	    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
3128 		goto out_abort;
3129 	}
3130 
3131 	/*
3132 	 * ... and not even negative.
3133 	 */
3134 	nc_flag = atomic_load_char(&ncp->nc_flag);
3135 	if ((nc_flag & NCF_NEGATIVE) == 0) {
3136 		goto out_abort;
3137 	}
3138 
3139 	if (__predict_false(cache_ncp_invalid(ncp))) {
3140 		goto out_abort;
3141 	}
3142 
3143 	negstate = NCP2NEGSTATE(ncp);
3144 	if ((negstate->neg_flag & NEG_HOT) == 0) {
3145 		numhotneg++;
3146 		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
3147 		TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
3148 		negstate->neg_flag |= NEG_HOT;
3149 	}
3150 
3151 	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
3152 	counter_u64_add(numneghits, 1);
3153 	cache_fpl_smr_exit(fpl);
3154 	mtx_unlock(&neglist->nl_lock);
3155 	mtx_unlock(&ncneg_hot.nl_lock);
3156 	vdrop(dvp);
3157 	return (cache_fpl_handled(fpl, ENOENT));
3158 out_abort:
3159 	cache_fpl_smr_exit(fpl);
3160 	mtx_unlock(&neglist->nl_lock);
3161 	mtx_unlock(&ncneg_hot.nl_lock);
3162 	vdrop(dvp);
3163 	return (cache_fpl_aborted(fpl));
3164 }
3165 
3166 /*
3167  * The target vnode is not supported; prepare for the slow path to take over.
3168  */
3169 static int
3170 cache_fplookup_partial_setup(struct cache_fpl *fpl)
3171 {
3172 	struct componentname *cnp;
3173 	enum vgetstate dvs;
3174 	struct vnode *dvp;
3175 	struct pwd *pwd;
3176 	seqc_t dvp_seqc;
3177 
3178 	cnp = fpl->cnp;
3179 	dvp = fpl->dvp;
3180 	dvp_seqc = fpl->dvp_seqc;
3181 
3182 	dvs = vget_prep_smr(dvp);
3183 	if (dvs == VGET_NONE) {
3184 		cache_fpl_smr_exit(fpl);
3185 		return (cache_fpl_aborted(fpl));
3186 	}
3187 
3188 	cache_fpl_smr_exit(fpl);
3189 
3190 	vget_finish_ref(dvp, dvs);
3191 	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3192 		vrele(dvp);
3193 		return (cache_fpl_aborted(fpl));
3194 	}
3195 
3196 	pwd = pwd_hold(curthread);
3197 	if (fpl->pwd != pwd) {
3198 		vrele(dvp);
3199 		pwd_drop(pwd);
3200 		return (cache_fpl_aborted(fpl));
3201 	}
3202 
3203 	fpl->ndp->ni_startdir = dvp;
3204 	return (0);
3205 }
3206 
3207 static int
3208 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs)
3209 {
3210 	struct componentname *cnp;
3211 	struct vnode *tvp;
3212 	seqc_t tvp_seqc;
3213 	int error;
3214 
3215 	cnp = fpl->cnp;
3216 	tvp = fpl->tvp;
3217 	tvp_seqc = fpl->tvp_seqc;
3218 
3219 	if ((cnp->cn_flags & LOCKLEAF) != 0) {
3220 		error = vget_finish(tvp, cnp->cn_lkflags, tvs);
3221 		if (error != 0) {
3222 			return (cache_fpl_aborted(fpl));
3223 		}
3224 	} else {
3225 		vget_finish_ref(tvp, tvs);
3226 	}
3227 
3228 	if (!vn_seqc_consistent(tvp, tvp_seqc)) {
3229 		if ((cnp->cn_flags & LOCKLEAF) != 0)
3230 			vput(tvp);
3231 		else
3232 			vrele(tvp);
3233 		return (cache_fpl_aborted(fpl));
3234 	}
3235 
3236 	return (cache_fpl_handled(fpl, 0));
3237 }
3238 
3239 static int __noinline
3240 cache_fplookup_final_withparent(struct cache_fpl *fpl)
3241 {
3242 	enum vgetstate dvs, tvs;
3243 	struct componentname *cnp;
3244 	struct vnode *dvp, *tvp;
3245 	seqc_t dvp_seqc, tvp_seqc;
3246 	int error;
3247 
3248 	cnp = fpl->cnp;
3249 	dvp = fpl->dvp;
3250 	dvp_seqc = fpl->dvp_seqc;
3251 	tvp = fpl->tvp;
3252 	tvp_seqc = fpl->tvp_seqc;
3253 
3254 	MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0);
3255 
3256 	/*
3257 	 * This is less efficient than it could be, in the name of simplicity.
3258 	 */
3259 	dvs = vget_prep_smr(dvp);
3260 	if (dvs == VGET_NONE) {
3261 		return (cache_fpl_aborted(fpl));
3262 	}
3263 	tvs = vget_prep_smr(tvp);
3264 	if (tvs == VGET_NONE) {
3265 		cache_fpl_smr_exit(fpl);
3266 		vget_abort(dvp, dvs);
3267 		return (cache_fpl_aborted(fpl));
3268 	}
3269 
3270 	cache_fpl_smr_exit(fpl);
3271 
3272 	if ((cnp->cn_flags & LOCKPARENT) != 0) {
3273 		error = vget_finish(dvp, LK_EXCLUSIVE, dvs);
3274 		if (error != 0) {
3275 			vget_abort(tvp, tvs);
3276 			return (cache_fpl_aborted(fpl));
3277 		}
3278 	} else {
3279 		vget_finish_ref(dvp, dvs);
3280 	}
3281 
3282 	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3283 		vget_abort(tvp, tvs);
3284 		if ((cnp->cn_flags & LOCKPARENT) != 0)
3285 			vput(dvp);
3286 		else
3287 			vrele(dvp);
3288 		cache_fpl_aborted(fpl);
3289 		return (error);
3290 	}
3291 
3292 	error = cache_fplookup_final_child(fpl, tvs);
3293 	if (error != 0) {
3294 		MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED);
3295 		if ((cnp->cn_flags & LOCKPARENT) != 0)
3296 			vput(dvp);
3297 		else
3298 			vrele(dvp);
3299 		return (error);
3300 	}
3301 
3302 	MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED);
3303 	return (0);
3304 }
3305 
3306 static int
3307 cache_fplookup_final(struct cache_fpl *fpl)
3308 {
3309 	struct componentname *cnp;
3310 	enum vgetstate tvs;
3311 	struct vnode *dvp, *tvp;
3312 	seqc_t dvp_seqc, tvp_seqc;
3313 
3314 	cnp = fpl->cnp;
3315 	dvp = fpl->dvp;
3316 	dvp_seqc = fpl->dvp_seqc;
3317 	tvp = fpl->tvp;
3318 	tvp_seqc = fpl->tvp_seqc;
3319 
3320 	VNPASS(cache_fplookup_vnode_supported(dvp), dvp);
3321 
3322 	if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0)
3323 		return (cache_fplookup_final_withparent(fpl));
3324 
3325 	tvs = vget_prep_smr(tvp);
3326 	if (tvs == VGET_NONE) {
3327 		return (cache_fpl_partial(fpl));
3328 	}
3329 
3330 	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3331 		cache_fpl_smr_exit(fpl);
3332 		vget_abort(tvp, tvs);
3333 		return (cache_fpl_aborted(fpl));
3334 	}
3335 
3336 	cache_fpl_smr_exit(fpl);
3337 	return (cache_fplookup_final_child(fpl, tvs));
3338 }
3339 
3340 static int
3341 cache_fplookup_next(struct cache_fpl *fpl)
3342 {
3343 	struct componentname *cnp;
3344 	struct namecache *ncp;
3345 	struct negstate *negstate;
3346 	struct vnode *dvp, *tvp;
3347 	u_char nc_flag;
3348 	uint32_t hash;
3349 	bool neg_hot;
3350 
3351 	cnp = fpl->cnp;
3352 	dvp = fpl->dvp;
3353 
3354 	if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) {
3355 		fpl->tvp = dvp;
3356 		fpl->tvp_seqc = vn_seqc_read_any(dvp);
3357 		if (seqc_in_modify(fpl->tvp_seqc)) {
3358 			return (cache_fpl_aborted(fpl));
3359 		}
3360 		return (0);
3361 	}
3362 
3363 	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
3364 
3365 	CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
3366 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
3367 		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
3368 			break;
3369 	}
3370 
3371 	/*
3372 	 * If there is no entry, we have to punt to the slow path to perform
3373 	 * the actual lookup. Should there be nothing with this name, a
3374 	 * negative entry will be created.
3375 	 */
3376 	if (__predict_false(ncp == NULL)) {
3377 		return (cache_fpl_partial(fpl));
3378 	}
3379 
3380 	tvp = atomic_load_ptr(&ncp->nc_vp);
3381 	nc_flag = atomic_load_char(&ncp->nc_flag);
3382 	if ((nc_flag & NCF_NEGATIVE) != 0) {
3383 		negstate = NCP2NEGSTATE(ncp);
3384 		neg_hot = ((negstate->neg_flag & NEG_HOT) != 0);
3385 		if (__predict_false(cache_ncp_invalid(ncp))) {
3386 			return (cache_fpl_partial(fpl));
3387 		}
3388 		if (__predict_false((nc_flag & NCF_WHITE) != 0)) {
3389 			return (cache_fpl_partial(fpl));
3390 		}
3391 		if (!neg_hot) {
3392 			return (cache_fplookup_negative_promote(fpl, ncp, hash));
3393 		}
3394 		SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
3395 		    ncp->nc_name);
3396 		counter_u64_add(numneghits, 1);
3397 		cache_fpl_smr_exit(fpl);
3398 		return (cache_fpl_handled(fpl, ENOENT));
3399 	}
3400 
3401 	if (__predict_false(cache_ncp_invalid(ncp))) {
3402 		return (cache_fpl_partial(fpl));
3403 	}
3404 
3405 	fpl->tvp = tvp;
3406 	fpl->tvp_seqc = vn_seqc_read_any(tvp);
3407 	if (seqc_in_modify(fpl->tvp_seqc)) {
3408 		return (cache_fpl_partial(fpl));
3409 	}
3410 
3411 	if (!cache_fplookup_vnode_supported(tvp)) {
3412 		return (cache_fpl_partial(fpl));
3413 	}
3414 
3415 	counter_u64_add(numposhits, 1);
3416 	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
3417 	return (0);
3418 }
3419 
3420 static bool
3421 cache_fplookup_mp_supported(struct mount *mp)
3422 {
3423 
3424 	if (mp == NULL)
3425 		return (false);
3426 	if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
3427 		return (false);
3428 	if ((mp->mnt_flag & MNT_UNION) != 0)
3429 		return (false);
3430 	return (true);
3431 }
3432 
3433 /*
3434  * Walk up the mount stack (if any).
3435  *
3436  * Correctness is provided in the following ways:
3437  * - all vnodes are protected from freeing with SMR
3438  * - struct mount objects are type stable, making them always safe to access
3439  * - stability of the particular mount is provided by busying it
3440  * - relationship between the vnode which is mounted on and the mount is
3441  *   verified with the vnode sequence counter after busying
3442  * - association between root vnode of the mount and the mount is protected
3443  *   by busy
3444  *
3445  * From that point on we can read the sequence counter of the root vnode
3446  * and get the next mount on the stack (if any) using the same protection.
3447  *
3448  * By the end of a successful walk we are guaranteed that the reached state
3449  * was indeed present at some point, matching the regular lookup.
3450  */
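/*
 * As a concrete example (hypothetical mount layout), crossing "/mnt" with a
 * filesystem mounted on it replaces the covered vnode with the root vnode
 * of that mount; the loop below repeats the dance for nested mounts stacked
 * on top of each other until no v_mountedhere is left.
 */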
3451 static int
3452 cache_fplookup_climb_mount(struct cache_fpl *fpl)
3453 {
3454 	struct mount *mp, *prev_mp;
3455 	struct vnode *vp;
3456 	seqc_t vp_seqc;
3457 
3458 	vp = fpl->tvp;
3459 	vp_seqc = fpl->tvp_seqc;
3460 	if (vp->v_type != VDIR)
3461 		return (0);
3462 
3463 	mp = atomic_load_ptr(&vp->v_mountedhere);
3464 	if (mp == NULL)
3465 		return (0);
3466 
3467 	prev_mp = NULL;
3468 	for (;;) {
3469 		if (!vfs_op_thread_enter(mp)) {
3470 			if (prev_mp != NULL)
3471 				vfs_op_thread_exit(prev_mp);
3472 			return (cache_fpl_partial(fpl));
3473 		}
3474 		if (prev_mp != NULL)
3475 			vfs_op_thread_exit(prev_mp);
3476 		if (!vn_seqc_consistent(vp, vp_seqc)) {
3477 			vfs_op_thread_exit(mp);
3478 			return (cache_fpl_partial(fpl));
3479 		}
3480 		if (!cache_fplookup_mp_supported(mp)) {
3481 			vfs_op_thread_exit(mp);
3482 			return (cache_fpl_partial(fpl));
3483 		}
3484 		vp = atomic_load_ptr(&mp->mnt_rootvnode);
3485 		if (vp == NULL || VN_IS_DOOMED(vp)) {
3486 			vfs_op_thread_exit(mp);
3487 			return (cache_fpl_partial(fpl));
3488 		}
3489 		vp_seqc = vn_seqc_read_any(vp);
3490 		if (seqc_in_modify(vp_seqc)) {
3491 			vfs_op_thread_exit(mp);
3492 			return (cache_fpl_partial(fpl));
3493 		}
3494 		prev_mp = mp;
3495 		mp = atomic_load_ptr(&vp->v_mountedhere);
3496 		if (mp == NULL)
3497 			break;
3498 	}
3499 
3500 	vfs_op_thread_exit(prev_mp);
3501 	fpl->tvp = vp;
3502 	fpl->tvp_seqc = vp_seqc;
3503 	return (0);
3504 }
3505 
3506 /*
3507  * Parse the path.
3508  *
3509  * The code is mostly copy-pasted from regular lookup, see lookup().
3510  * The structure is maintained along with comments for easier maintenance.
3511  * Deduplicating the code will become feasible after fast path lookup
3512  * becomes more feature-complete.
3513  */
3514 static int
3515 cache_fplookup_parse(struct cache_fpl *fpl)
3516 {
3517 	struct nameidata *ndp;
3518 	struct componentname *cnp;
3519 	char *cp;
3520 	char *prev_ni_next;             /* saved ndp->ni_next */
3521 	size_t prev_ni_pathlen;         /* saved ndp->ni_pathlen */
3522 
3523 	ndp = fpl->ndp;
3524 	cnp = fpl->cnp;
3525 
3526 	/*
3527 	 * Search a new directory.
3528 	 *
3529 	 * The last component of the filename is left accessible via
3530 	 * cnp->cn_nameptr for callers that need the name. Callers needing
3531 	 * the name set the SAVENAME flag. When done, they assume
3532 	 * responsibility for freeing the pathname buffer.
3533 	 */
3534 	for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
3535 		continue;
3536 	cnp->cn_namelen = cp - cnp->cn_nameptr;
3537 	if (cnp->cn_namelen > NAME_MAX) {
3538 		cache_fpl_smr_exit(fpl);
3539 		return (cache_fpl_handled(fpl, ENAMETOOLONG));
3540 	}
3541 	prev_ni_pathlen = ndp->ni_pathlen;
3542 	ndp->ni_pathlen -= cnp->cn_namelen;
3543 	KASSERT(ndp->ni_pathlen <= PATH_MAX,
3544 	    ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
3545 	prev_ni_next = ndp->ni_next;
3546 	ndp->ni_next = cp;
3547 
3548 	/*
3549 	 * Replace multiple slashes by a single slash and trailing slashes
3550 	 * by a null.  This must be done before VOP_LOOKUP() because some
3551 	 * fs's don't know about trailing slashes.  Remember if there were
3552 	 * trailing slashes to handle symlinks, existing non-directories
3553 	 * and non-existing files that won't be directories specially later.
3554 	 */
3555 	while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
3556 		cp++;
3557 		ndp->ni_pathlen--;
3558 		if (*cp == '\0') {
3559 			/*
3560 			 * TODO
3561 			 * Regular lookup performs the following:
3562 			 * *ndp->ni_next = '\0';
3563 			 * cnp->cn_flags |= TRAILINGSLASH;
3564 			 *
3565 			 * Which is problematic since it modifies data read
3566 			 * from userspace. Then if fast path lookup was to
3567 			 * abort we would have to either restore it or convey
3568 			 * the flag. Since this is a corner case just ignore
3569 			 * it for simplicity.
3570 			 */
3571 			return (cache_fpl_partial(fpl));
3572 		}
3573 	}
3574 	ndp->ni_next = cp;
3575 
3576 	cnp->cn_flags |= MAKEENTRY;
3577 
3578 	if (cnp->cn_namelen == 2 &&
3579 	    cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
3580 		cnp->cn_flags |= ISDOTDOT;
3581 	else
3582 		cnp->cn_flags &= ~ISDOTDOT;
3583 	if (*ndp->ni_next == 0)
3584 		cnp->cn_flags |= ISLASTCN;
3585 	else
3586 		cnp->cn_flags &= ~ISLASTCN;
3587 
3588 	/*
3589 	 * Check for degenerate name (e.g. / or "")
3590 	 * which is a way of talking about a directory,
3591 	 * e.g. like "/." or ".".
3592 	 *
3593 	 * TODO
3594 	 * Another corner case handled by the regular lookup
3595 	 */
3596 	if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
3597 		return (cache_fpl_partial(fpl));
3598 	}
3599 	return (0);
3600 }
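/*
 * For example (hypothetical path), parsing "usr/local/bin" above isolates
 * "usr" as the current component (cn_namelen == 3), leaves ni_next pointing
 * at "/local/bin" and keeps ISLASTCN clear; only the final component "bin"
 * gets ISLASTCN set.
 */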
3601 
3602 static void
3603 cache_fplookup_parse_advance(struct cache_fpl *fpl)
3604 {
3605 	struct nameidata *ndp;
3606 	struct componentname *cnp;
3607 
3608 	ndp = fpl->ndp;
3609 	cnp = fpl->cnp;
3610 
3611 	cnp->cn_nameptr = ndp->ni_next;
3612 	while (*cnp->cn_nameptr == '/') {
3613 		cnp->cn_nameptr++;
3614 		ndp->ni_pathlen--;
3615 	}
3616 }
3617 
3618 static int
3619 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
3620 {
3621 	struct nameidata *ndp;
3622 	struct componentname *cnp;
3623 	struct mount *mp;
3624 	int error;
3625 
3626 	error = CACHE_FPL_FAILED;
3627 	ndp = fpl->ndp;
3628 	ndp->ni_lcf = 0;
3629 	cnp = fpl->cnp;
3630 	cnp->cn_lkflags = LK_SHARED;
3631 	if ((cnp->cn_flags & LOCKSHARED) == 0)
3632 		cnp->cn_lkflags = LK_EXCLUSIVE;
3633 
3634 	cache_fpl_checkpoint(fpl, &fpl->snd);
3635 
3636 	fpl->dvp = dvp;
3637 	fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
3638 	if (seqc_in_modify(fpl->dvp_seqc)) {
3639 		cache_fpl_aborted(fpl);
3640 		goto out;
3641 	}
3642 	mp = atomic_load_ptr(&fpl->dvp->v_mount);
3643 	if (!cache_fplookup_mp_supported(mp)) {
3644 		cache_fpl_aborted(fpl);
3645 		goto out;
3646 	}
3647 
3648 	VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
3649 
3650 	for (;;) {
3651 		error = cache_fplookup_parse(fpl);
3652 		if (__predict_false(error != 0)) {
3653 			break;
3654 		}
3655 
3656 		if (cnp->cn_flags & ISDOTDOT) {
3657 			error = cache_fpl_partial(fpl);
3658 			break;
3659 		}
3660 
3661 		VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
3662 
3663 		error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred, cnp->cn_thread);
3664 		if (__predict_false(error != 0)) {
3665 			switch (error) {
3666 			case EAGAIN:
3667 			case EOPNOTSUPP: /* can happen when racing against vgone */
3668 				cache_fpl_partial(fpl);
3669 				break;
3670 			default:
3671 				/*
3672 				 * See the API contract for VOP_FPLOOKUP_VEXEC.
3673 				 */
3674 				if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
3675 					error = cache_fpl_aborted(fpl);
3676 				} else {
3677 					cache_fpl_smr_exit(fpl);
3678 					cache_fpl_handled(fpl, error);
3679 				}
3680 				break;
3681 			}
3682 			break;
3683 		}
3684 
3685 		error = cache_fplookup_next(fpl);
3686 		if (__predict_false(error != 0)) {
3687 			break;
3688 		}
3689 
3690 		VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
3691 
3692 		error = cache_fplookup_climb_mount(fpl);
3693 		if (__predict_false(error != 0)) {
3694 			break;
3695 		}
3696 
3697 		VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
3698 
3699 		if (cnp->cn_flags & ISLASTCN) {
3700 			error = cache_fplookup_final(fpl);
3701 			break;
3702 		}
3703 
3704 		if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
3705 			error = cache_fpl_aborted(fpl);
3706 			break;
3707 		}
3708 
3709 		fpl->dvp = fpl->tvp;
3710 		fpl->dvp_seqc = fpl->tvp_seqc;
3711 
3712 		cache_fplookup_parse_advance(fpl);
3713 		cache_fpl_checkpoint(fpl, &fpl->snd);
3714 	}
3715 out:
3716 	switch (fpl->status) {
3717 	case CACHE_FPL_STATUS_UNSET:
3718 		__assert_unreachable();
3719 		break;
3720 	case CACHE_FPL_STATUS_PARTIAL:
3721 		cache_fpl_smr_assert_entered(fpl);
3722 		return (cache_fplookup_partial_setup(fpl));
3723 	case CACHE_FPL_STATUS_ABORTED:
3724 		if (fpl->in_smr)
3725 			cache_fpl_smr_exit(fpl);
3726 		return (CACHE_FPL_FAILED);
3727 	case CACHE_FPL_STATUS_HANDLED:
3728 		cache_fpl_smr_assert_not_entered(fpl);
3729 		if (__predict_false(error != 0)) {
3730 			ndp->ni_dvp = NULL;
3731 			ndp->ni_vp = NULL;
3732 			cache_fpl_cleanup_cnp(cnp);
3733 			return (error);
3734 		}
3735 		ndp->ni_dvp = fpl->dvp;
3736 		ndp->ni_vp = fpl->tvp;
3737 		if (cnp->cn_flags & SAVENAME)
3738 			cnp->cn_flags |= HASBUF;
3739 		else
3740 			cache_fpl_cleanup_cnp(cnp);
3741 		return (error);
3742 	}
3743 }
3744 
3745 /*
3746  * Fast path lookup protected with SMR and sequence counters.
3747  *
3748  * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
3749  *
3750  * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
3751  * outlined below.
3752  *
3753  * Traditional vnode lookup conceptually looks like this:
3754  *
3755  * vn_lock(current);
3756  * for (;;) {
3757  *	next = find();
3758  *	vn_lock(next);
3759  *	vn_unlock(current);
3760  *	current = next;
3761  *	if (last)
3762  *	    break;
3763  * }
3764  * return (current);
3765  *
3766  * Each jump to the next vnode is safe memory-wise and atomic with respect to
3767  * any modifications thanks to holding respective locks.
3768  *
3769  * The same guarantee can be provided with a combination of safe memory
3770  * reclamation and sequence counters instead. If all operations which affect
3771  * the relationship between the current vnode and the one we are looking for
3772  * also modify the counter, we can verify whether all the conditions held as
3773  * we made the jump. This includes things like permissions, mount points etc.
3774  * Counter modification is provided by enclosing relevant places in
3775  * vn_seqc_write_begin()/end() calls.
3776  *
3777  * Thus this translates to:
3778  *
3779  * vfs_smr_enter();
3780  * dvp_seqc = seqc_read_any(dvp);
3781  * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
3782  *     abort();
3783  * for (;;) {
3784  * 	tvp = find();
3785  * 	tvp_seqc = seqc_read_any(tvp);
3786  * 	if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
3787  * 	    abort();
3788  * 	if (!seqc_consistent(dvp, dvp_seqc)) // someone is altering the vnode
3789  * 	    abort();
3790  * 	dvp = tvp; // we know nothing of importance has changed
3791  * 	dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
3792  * 	if (last)
3793  * 	    break;
3794  * }
3795  * vget(); // secure the vnode
3796  * if (!seqc_consistent(tvp, tvp_seqc)) // final check
3797  * 	    abort();
3798  * // at this point we know nothing has changed for any parent<->child pair
3799  * // as they were crossed during the lookup, meaning we matched the guarantee
3800  * // of the locked variant
3801  * return (tvp);
3802  *
3803  * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
3804  * - they are called while within vfs_smr protection which they must never exit
3805  * - EAGAIN can be returned to denote checking could not be performed, it is
3806  *   always valid to return it
3807  * - if the sequence counter has not changed the result must be valid
3808  * - if the sequence counter has changed both false positives and false negatives
3809  *   are permitted (since the result will be rejected later)
3810  * - for simple cases of unix permission checks vaccess_vexec_smr can be used
3811  *
3812  * Caveats to watch out for:
3813  * - vnodes are passed unlocked and unreferenced with nothing stopping
3814  *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
3815  *   to use atomic_load_ptr to fetch it.
3816  * - the aforementioned object can also get freed, meaning absent other means it
3817  *   should be protected with vfs_smr
3818  * - either safely checking permissions as they are modified or guaranteeing
3819  *   their stability is left to the routine
3820  */
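/*
 * A minimal sketch of what an opting-in filesystem might provide (the
 * struct vop_fplookup_vexec_args field names, "foofs"/"struct foonode" and
 * the vaccess_vexec_smr() argument order are assumptions here, not taken
 * from this file):
 *
 *	static int
 *	foofs_fplookup_vexec(struct vop_fplookup_vexec_args *v)
 *	{
 *		struct vnode *vp = v->a_vp;
 *		struct foonode *node = atomic_load_ptr(&vp->v_data);
 *
 *		if (__predict_false(node == NULL))
 *			return (EAGAIN);
 *		return (vaccess_vexec_smr(vp->v_type, node->foo_mode,
 *		    node->foo_uid, node->foo_gid, v->a_cred));
 *	}
 *
 * together with setting MNTK_FPLOOKUP in mnt_kern_flag once the mount is
 * known to satisfy the rules above.
 */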
3821 int
3822 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
3823     struct pwd **pwdp)
3824 {
3825 	struct cache_fpl fpl;
3826 	struct pwd *pwd;
3827 	struct vnode *dvp;
3828 	struct componentname *cnp;
3829 	struct nameidata_saved orig;
3830 	int error;
3831 
3832 	*status = CACHE_FPL_STATUS_UNSET;
3833 	bzero(&fpl, sizeof(fpl));
3834 	fpl.status = CACHE_FPL_STATUS_UNSET;
3835 	fpl.ndp = ndp;
3836 	fpl.cnp = &ndp->ni_cnd;
3837 	MPASS(curthread == fpl.cnp->cn_thread);
3838 
3839 	if (!cache_can_fplookup(&fpl)) {
3840 		SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
3841 		*status = fpl.status;
3842 		return (EOPNOTSUPP);
3843 	}
3844 
3845 	cache_fpl_checkpoint(&fpl, &orig);
3846 
3847 	cache_fpl_smr_enter(&fpl);
3848 	pwd = pwd_get_smr();
3849 	fpl.pwd = pwd;
3850 	ndp->ni_rootdir = pwd->pwd_rdir;
3851 	ndp->ni_topdir = pwd->pwd_jdir;
3852 
3853 	cnp = fpl.cnp;
3854 	cnp->cn_nameptr = cnp->cn_pnbuf;
3855 	if (cnp->cn_pnbuf[0] == '/') {
3856 		cache_fpl_handle_root(ndp, &dvp);
3857 	} else {
3858 		MPASS(ndp->ni_dirfd == AT_FDCWD);
3859 		dvp = pwd->pwd_cdir;
3860 	}
3861 
3862 	SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
3863 
3864 	error = cache_fplookup_impl(dvp, &fpl);
3865 	cache_fpl_smr_assert_not_entered(&fpl);
3866 	SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
3867 
3868 	*status = fpl.status;
3869 	switch (fpl.status) {
3870 	case CACHE_FPL_STATUS_UNSET:
3871 		__assert_unreachable();
3872 		break;
3873 	case CACHE_FPL_STATUS_HANDLED:
3874 		SDT_PROBE3(vfs, namei, lookup, return, error,
3875 		    (error == 0 ? ndp->ni_vp : NULL), true);
3876 		break;
3877 	case CACHE_FPL_STATUS_PARTIAL:
3878 		*pwdp = fpl.pwd;
3879 		cache_fpl_restore(&fpl, &fpl.snd);
3880 		break;
3881 	case CACHE_FPL_STATUS_ABORTED:
3882 		cache_fpl_restore(&fpl, &orig);
3883 		break;
3884 	}
3885 	return (error);
3886 }
3887