xref: /freebsd/sys/kern/vfs_cache.c (revision 3422ca83ba48e5c9174542a2d3ba8225275779a6)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1989, 1993, 1995
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * Poul-Henning Kamp of the FreeBSD Project.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include "opt_ddb.h"
41 #include "opt_ktrace.h"
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/capsicum.h>
46 #include <sys/counter.h>
47 #include <sys/filedesc.h>
48 #include <sys/fnv_hash.h>
49 #include <sys/kernel.h>
50 #include <sys/ktr.h>
51 #include <sys/lock.h>
52 #include <sys/malloc.h>
53 #include <sys/fcntl.h>
54 #include <sys/mount.h>
55 #include <sys/namei.h>
56 #include <sys/proc.h>
57 #include <sys/rwlock.h>
58 #include <sys/seqc.h>
59 #include <sys/sdt.h>
60 #include <sys/smr.h>
61 #include <sys/smp.h>
62 #include <sys/syscallsubr.h>
63 #include <sys/sysctl.h>
64 #include <sys/sysproto.h>
65 #include <sys/vnode.h>
66 #include <ck_queue.h>
67 #ifdef KTRACE
68 #include <sys/ktrace.h>
69 #endif
70 
71 #include <sys/capsicum.h>
72 
73 #include <security/audit/audit.h>
74 #include <security/mac/mac_framework.h>
75 
76 #ifdef DDB
77 #include <ddb/ddb.h>
78 #endif
79 
80 #include <vm/uma.h>
81 
82 SDT_PROVIDER_DECLARE(vfs);
83 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
84     "struct vnode *");
85 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
86     "char *");
87 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
88 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
89     "char *", "struct vnode *");
90 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
91 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
92     "struct vnode *", "char *");
93 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
94     "struct vnode *");
95 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
96     "struct vnode *", "char *");
97 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
98     "char *");
99 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
100 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
101 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
102 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
103     "struct vnode *");
104 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
105     "char *");
106 SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
107     "char *");
108 
109 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
110 SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
111 SDT_PROBE_DECLARE(vfs, namei, lookup, return);
112 
113 /*
114  * This structure describes the elements in the cache of recent
115  * names looked up by namei.
116  */
117 struct negstate {
118 	u_char neg_flag;
119 };
120 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
121     "the state must fit in a union with a pointer without growing it");
122 
123 struct	namecache {
124 	CK_LIST_ENTRY(namecache) nc_hash;/* hash chain */
125 	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
126 	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
127 	struct	vnode *nc_dvp;		/* vnode of parent of name */
128 	union {
129 		struct	vnode *nu_vp;	/* vnode the name refers to */
130 		struct	negstate nu_neg;/* negative entry state */
131 	} n_un;
132 	u_char	nc_flag;		/* flag bits */
133 	u_char	nc_nlen;		/* length of name */
134 	char	nc_name[0];		/* segment name + nul */
135 };
136 
137 /*
138  * struct namecache_ts embeds struct namecache as its last member and is
139  * identified by the NCF_TS flag.
140  * struct namecache_ts is used in place of struct namecache when time(s) need
141  * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
142  * both a non-dotdot directory name and dotdot for the directory's
143  * parent.
144  */
145 struct	namecache_ts {
146 	struct	timespec nc_time;	/* timespec provided by fs */
147 	struct	timespec nc_dotdottime;	/* dotdot timespec provided by fs */
148 	int	nc_ticks;		/* ticks value when entry was added */
149 	struct namecache nc_nc;
150 };
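/*
 * When NCF_TS is set, code holding a plain struct namecache pointer can
 * recover the timestamped wrapper, for example:
 *
 *	ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 */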
151 
152 #define	nc_vp		n_un.nu_vp
153 #define	nc_neg		n_un.nu_neg
154 
155 /*
156  * Flags in namecache.nc_flag
157  */
158 #define NCF_WHITE	0x01
159 #define NCF_ISDOTDOT	0x02
160 #define	NCF_TS		0x04
161 #define	NCF_DTS		0x08
162 #define	NCF_DVDROP	0x10
163 #define	NCF_NEGATIVE	0x20
164 #define	NCF_INVALID	0x40
165 
166 /*
167  * Flags in negstate.neg_flag
168  */
169 #define NEG_HOT		0x01
170 
171 /*
172  * Mark an entry as invalid.
173  *
174  * This is called before it starts getting deconstructed.
175  */
176 static void
177 cache_ncp_invalidate(struct namecache *ncp)
178 {
179 
180 	KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
181 	    ("%s: entry %p already invalid", __func__, ncp));
182 	ncp->nc_flag |= NCF_INVALID;
183 	atomic_thread_fence_rel();
184 }
185 
186 /*
187  * Verify validity of an entry.
188  *
189  * All places which elide locks are supposed to call this after they are
190  * done with reading from an entry.
191  */
192 static bool
193 cache_ncp_invalid(struct namecache *ncp)
194 {
195 
196 	atomic_thread_fence_acq();
197 	return ((ncp->nc_flag & NCF_INVALID) != 0);
198 }
199 
200 /*
201  * Name caching works as follows:
202  *
203  * Names found by directory scans are retained in a cache
204  * for future reference.  It is managed LRU, so frequently
205  * used names will hang around.  Cache is indexed by hash value
206  * obtained from (dvp, name) where dvp refers to the directory
207  * containing name.
208  *
209  * If it is a "negative" entry, (i.e. for a name that is known NOT to
210  * exist) the vnode pointer will be NULL.
211  *
212  * Upon reaching the last segment of a path, if the reference
213  * is for DELETE, or NOCACHE is set (rewrite), and the
214  * name is located in the cache, it will be dropped.
215  *
216  * These locks are used (in the order in which they can be taken):
217  * NAME		TYPE	ROLE
218  * vnodelock	mtx	vnode lists and v_cache_dd field protection
219  * bucketlock	rwlock	for access to given set of hash buckets
220  * neglist	mtx	negative entry LRU management
221  *
222  * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
223  * shrinking the LRU list.
224  *
225  * It is legal to take multiple vnodelock and bucketlock locks. The locking
226  * order is lower address first. Both are recursive.
227  *
228  * "." lookups are lockless.
229  *
230  * ".." and vnode -> name lookups require vnodelock.
231  *
232  * name -> vnode lookup requires the relevant bucketlock to be held for reading.
233  *
234  * Insertions and removals of entries require involved vnodes and bucketlocks
235  * to be write-locked to prevent other threads from seeing the entry.
236  *
237  * Some lookups result in removal of the found entry (e.g. getting rid of a
238  * negative entry with the intent to create a positive one), which poses a
239  * problem when multiple threads reach the same state. Similarly, two different
240  * threads can purge two different vnodes and try to remove the same name.
241  *
242  * If the already held vnode lock is lower than the second required lock, we
243  * can just take the other lock. However, in the opposite case, this could
244  * deadlock. As such, this is resolved by trylocking and if that fails unlocking
245  * the first node, locking everything in order and revalidating the state.
246  */
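/*
 * A minimal sketch of the trylock-and-relock scheme described above, with
 * vlp1 already held and vlp2 the second lock that is needed (the helpers
 * further down, e.g. cache_zap_unlocked_bucket(), additionally re-lookup
 * and revalidate the entry after relocking):
 *
 *	if (!mtx_trylock(vlp2)) {
 *		mtx_unlock(vlp1);
 *		cache_sort_vnodes(&vlp1, &vlp2);
 *		cache_lock_vnodes(vlp1, vlp2);
 *		(re-lookup the entry and verify it is still the same)
 *	}
 */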
247 
248 VFS_SMR_DECLARE;
249 
250 /*
251  * Structures associated with name caching.
252  */
253 #define NCHHASH(hash) \
254 	(&nchashtbl[(hash) & nchash])
255 static __read_mostly CK_LIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
256 static u_long __read_mostly	nchash;			/* size of hash table */
257 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
258     "Size of namecache hash table");
259 static u_long __read_mostly	ncnegfactor = 5; /* ratio of negative entries */
260 SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
261     "Ratio of negative namecache entries");
262 static u_long __exclusive_cache_line	numneg;	/* number of negative entries allocated */
263 static u_long __exclusive_cache_line	numcache;/* number of cache entries allocated */
264 u_int ncsizefactor = 2;
265 SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
266     "Size factor for namecache");
267 static u_int __read_mostly	ncpurgeminvnodes;
268 SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0,
269     "Number of vnodes below which purgevfs ignores the request");
270 static u_int __read_mostly	ncsize; /* the size as computed on creation or resizing */
271 
272 struct nchstats	nchstats;		/* cache effectiveness statistics */
273 
274 static struct mtx __exclusive_cache_line	ncneg_shrink_lock;
275 
276 struct neglist {
277 	struct mtx		nl_lock;
278 	TAILQ_HEAD(, namecache) nl_list;
279 } __aligned(CACHE_LINE_SIZE);
280 
281 static struct neglist __read_mostly	*neglists;
282 static struct neglist ncneg_hot;
283 static u_long numhotneg;
284 
285 #define	numneglists (ncneghash + 1)
286 static u_int __read_mostly	ncneghash;
287 static inline struct neglist *
288 NCP2NEGLIST(struct namecache *ncp)
289 {
290 
291 	return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
292 }
293 
294 static inline struct negstate *
295 NCP2NEGSTATE(struct namecache *ncp)
296 {
297 
298 	MPASS(ncp->nc_flag & NCF_NEGATIVE);
299 	return (&ncp->nc_neg);
300 }
301 
302 #define	numbucketlocks (ncbuckethash + 1)
303 static u_int __read_mostly  ncbuckethash;
304 static struct rwlock_padalign __read_mostly  *bucketlocks;
305 #define	HASH2BUCKETLOCK(hash) \
306 	((struct rwlock *)(&bucketlocks[((hash) & ncbuckethash)]))
307 
308 #define	numvnodelocks (ncvnodehash + 1)
309 static u_int __read_mostly  ncvnodehash;
310 static struct mtx __read_mostly *vnodelocks;
311 static inline struct mtx *
312 VP2VNODELOCK(struct vnode *vp)
313 {
314 
315 	return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
316 }
317 
318 /*
319  * UMA zones for the VFS cache.
320  *
321  * The small cache is used for entries with short names, which are the
322  * most common.  The large cache is used for entries which are too big to
323  * fit in the small cache.
324  */
325 static uma_zone_t __read_mostly cache_zone_small;
326 static uma_zone_t __read_mostly cache_zone_small_ts;
327 static uma_zone_t __read_mostly cache_zone_large;
328 static uma_zone_t __read_mostly cache_zone_large_ts;
329 
330 #define	CACHE_PATH_CUTOFF	35
331 
332 static struct namecache *
333 cache_alloc(int len, int ts)
334 {
335 	struct namecache_ts *ncp_ts;
336 	struct namecache *ncp;
337 
338 	if (__predict_false(ts)) {
339 		if (len <= CACHE_PATH_CUTOFF)
340 			ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
341 		else
342 			ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
343 		ncp = &ncp_ts->nc_nc;
344 	} else {
345 		if (len <= CACHE_PATH_CUTOFF)
346 			ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
347 		else
348 			ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
349 	}
350 	return (ncp);
351 }
352 
353 static void
354 cache_free(struct namecache *ncp)
355 {
356 	struct namecache_ts *ncp_ts;
357 
358 	if (ncp == NULL)
359 		return;
360 	if ((ncp->nc_flag & NCF_DVDROP) != 0)
361 		vdrop(ncp->nc_dvp);
362 	if (__predict_false(ncp->nc_flag & NCF_TS)) {
363 		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
364 		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
365 			uma_zfree_smr(cache_zone_small_ts, ncp_ts);
366 		else
367 			uma_zfree_smr(cache_zone_large_ts, ncp_ts);
368 	} else {
369 		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
370 			uma_zfree_smr(cache_zone_small, ncp);
371 		else
372 			uma_zfree_smr(cache_zone_large, ncp);
373 	}
374 }
375 
376 static void
377 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
378 {
379 	struct namecache_ts *ncp_ts;
380 
381 	KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
382 	    (tsp == NULL && ticksp == NULL),
383 	    ("No NCF_TS"));
384 
385 	if (tsp == NULL && ticksp == NULL)
386 		return;
387 
388 	ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
389 	if (tsp != NULL)
390 		*tsp = ncp_ts->nc_time;
391 	if (ticksp != NULL)
392 		*ticksp = ncp_ts->nc_ticks;
393 }
394 
395 #ifdef DEBUG_CACHE
396 static int __read_mostly	doingcache = 1;	/* 1 => enable the cache */
397 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
398     "VFS namecache enabled");
399 #endif
400 
401 /* Export size information to userland */
402 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
403     sizeof(struct namecache), "sizeof(struct namecache)");
404 
405 /*
406  * The new name cache statistics
407  */
408 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
409     "Name cache statistics");
410 #define STATNODE_ULONG(name, descr)					\
411 	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr);
412 #define STATNODE_COUNTER(name, descr)					\
413 	static COUNTER_U64_DEFINE_EARLY(name);				\
414 	SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, \
415 	    descr);
416 STATNODE_ULONG(numneg, "Number of negative cache entries");
417 STATNODE_ULONG(numcache, "Number of cache entries");
418 STATNODE_COUNTER(numcachehv, "Number of namecache entries with vnodes held");
419 STATNODE_COUNTER(numdrops, "Number of dropped entries due to reaching the limit");
420 STATNODE_COUNTER(dothits, "Number of '.' hits");
421 STATNODE_COUNTER(dotdothits, "Number of '..' hits");
422 STATNODE_COUNTER(nummiss, "Number of cache misses");
423 STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache");
424 STATNODE_COUNTER(numposzaps,
425     "Number of cache hits (positive) we do not want to cache");
426 STATNODE_COUNTER(numposhits, "Number of cache hits (positive)");
427 STATNODE_COUNTER(numnegzaps,
428     "Number of cache hits (negative) we do not want to cache");
429 STATNODE_COUNTER(numneghits, "Number of cache hits (negative)");
430 /* These count for vn_getcwd(), too. */
431 STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls");
432 STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
433 STATNODE_COUNTER(numfullpathfail2,
434     "Number of fullpath search errors (VOP_VPTOCNP failures)");
435 STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
436 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls");
437 STATNODE_COUNTER(zap_and_exit_bucket_relock_success,
438     "Number of successful removals after relocking");
439 static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail,
440     "Number of times zap_and_exit failed to lock");
441 static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2,
442     "Number of times zap_and_exit failed to lock");
443 static long cache_lock_vnodes_cel_3_failures;
444 STATNODE_ULONG(cache_lock_vnodes_cel_3_failures,
445     "Number of times 3-way vnode locking failed");
446 STATNODE_ULONG(numhotneg, "Number of hot negative entries");
447 STATNODE_COUNTER(numneg_evicted,
448     "Number of negative entries evicted when adding a new entry");
449 STATNODE_COUNTER(shrinking_skipped,
450     "Number of times shrinking was already in progress");
451 
452 static void cache_zap_locked(struct namecache *ncp);
453 static int vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf,
454     char **freebuf, size_t *buflen);
455 static int vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir,
456     char *buf, char **retbuf, size_t *buflen);
457 static int vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir,
458     char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend);
459 
460 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
461 
462 static int cache_yield;
463 SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0,
464     "Number of times cache called yield");
465 
466 static void __noinline
467 cache_maybe_yield(void)
468 {
469 
470 	if (should_yield()) {
471 		cache_yield++;
472 		kern_yield(PRI_USER);
473 	}
474 }
475 
476 static inline void
477 cache_assert_vlp_locked(struct mtx *vlp)
478 {
479 
480 	if (vlp != NULL)
481 		mtx_assert(vlp, MA_OWNED);
482 }
483 
484 static inline void
485 cache_assert_vnode_locked(struct vnode *vp)
486 {
487 	struct mtx *vlp;
488 
489 	vlp = VP2VNODELOCK(vp);
490 	cache_assert_vlp_locked(vlp);
491 }
492 
493 /*
494  * TODO: With the value stored we can do better than computing the hash based
495  * on the address and the choice of FNV should also be revisisted.
496  */
497 static void
498 cache_prehash(struct vnode *vp)
499 {
500 
501 	vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
502 }
503 
504 static uint32_t
505 cache_get_hash(char *name, u_char len, struct vnode *dvp)
506 {
507 
508 	return (fnv_32_buf(name, len, dvp->v_nchash));
509 }
510 
511 static inline struct rwlock *
512 NCP2BUCKETLOCK(struct namecache *ncp)
513 {
514 	uint32_t hash;
515 
516 	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
517 	return (HASH2BUCKETLOCK(hash));
518 }
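/*
 * Putting the pieces together, a lookup of (dvp, name) finds its hash chain
 * and the rwlock covering it roughly as follows ('name' and 'len' stand for
 * the component name and its length):
 *
 *	hash = cache_get_hash(name, len, dvp);
 *	ncpp = NCHHASH(hash);		chain of candidate entries
 *	blp = HASH2BUCKETLOCK(hash);	rwlock protecting that chain
 */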
519 
520 #ifdef INVARIANTS
521 static void
522 cache_assert_bucket_locked(struct namecache *ncp, int mode)
523 {
524 	struct rwlock *blp;
525 
526 	blp = NCP2BUCKETLOCK(ncp);
527 	rw_assert(blp, mode);
528 }
529 #else
530 #define cache_assert_bucket_locked(x, y) do { } while (0)
531 #endif
532 
533 #define cache_sort_vnodes(x, y)	_cache_sort_vnodes((void **)(x), (void **)(y))
534 static void
535 _cache_sort_vnodes(void **p1, void **p2)
536 {
537 	void *tmp;
538 
539 	MPASS(*p1 != NULL || *p2 != NULL);
540 
541 	if (*p1 > *p2) {
542 		tmp = *p2;
543 		*p2 = *p1;
544 		*p1 = tmp;
545 	}
546 }
547 
548 static void
549 cache_lock_all_buckets(void)
550 {
551 	u_int i;
552 
553 	for (i = 0; i < numbucketlocks; i++)
554 		rw_wlock(&bucketlocks[i]);
555 }
556 
557 static void
558 cache_unlock_all_buckets(void)
559 {
560 	u_int i;
561 
562 	for (i = 0; i < numbucketlocks; i++)
563 		rw_wunlock(&bucketlocks[i]);
564 }
565 
566 static void
567 cache_lock_all_vnodes(void)
568 {
569 	u_int i;
570 
571 	for (i = 0; i < numvnodelocks; i++)
572 		mtx_lock(&vnodelocks[i]);
573 }
574 
575 static void
576 cache_unlock_all_vnodes(void)
577 {
578 	u_int i;
579 
580 	for (i = 0; i < numvnodelocks; i++)
581 		mtx_unlock(&vnodelocks[i]);
582 }
583 
584 static int
585 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
586 {
587 
588 	cache_sort_vnodes(&vlp1, &vlp2);
589 
590 	if (vlp1 != NULL) {
591 		if (!mtx_trylock(vlp1))
592 			return (EAGAIN);
593 	}
594 	if (!mtx_trylock(vlp2)) {
595 		if (vlp1 != NULL)
596 			mtx_unlock(vlp1);
597 		return (EAGAIN);
598 	}
599 
600 	return (0);
601 }
602 
603 static void
604 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
605 {
606 
607 	MPASS(vlp1 != NULL || vlp2 != NULL);
608 	MPASS(vlp1 <= vlp2);
609 
610 	if (vlp1 != NULL)
611 		mtx_lock(vlp1);
612 	if (vlp2 != NULL)
613 		mtx_lock(vlp2);
614 }
615 
616 static void
617 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
618 {
619 
620 	MPASS(vlp1 != NULL || vlp2 != NULL);
621 
622 	if (vlp1 != NULL)
623 		mtx_unlock(vlp1);
624 	if (vlp2 != NULL)
625 		mtx_unlock(vlp2);
626 }
627 
628 static int
629 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
630 {
631 	struct nchstats snap;
632 
633 	if (req->oldptr == NULL)
634 		return (SYSCTL_OUT(req, 0, sizeof(snap)));
635 
636 	snap = nchstats;
637 	snap.ncs_goodhits = counter_u64_fetch(numposhits);
638 	snap.ncs_neghits = counter_u64_fetch(numneghits);
639 	snap.ncs_badhits = counter_u64_fetch(numposzaps) +
640 	    counter_u64_fetch(numnegzaps);
641 	snap.ncs_miss = counter_u64_fetch(nummisszap) +
642 	    counter_u64_fetch(nummiss);
643 
644 	return (SYSCTL_OUT(req, &snap, sizeof(snap)));
645 }
646 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
647     CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
648     "VFS cache effectiveness statistics");
649 
650 #ifdef DIAGNOSTIC
651 /*
652  * Grab an atomic snapshot of the name cache hash chain lengths
653  */
654 static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
655     CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
656     "hash table stats");
657 
658 static int
659 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
660 {
661 	struct nchashhead *ncpp;
662 	struct namecache *ncp;
663 	int i, error, n_nchash, *cntbuf;
664 
665 retry:
666 	n_nchash = nchash + 1;	/* nchash is max index, not count */
667 	if (req->oldptr == NULL)
668 		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
669 	cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
670 	cache_lock_all_buckets();
671 	if (n_nchash != nchash + 1) {
672 		cache_unlock_all_buckets();
673 		free(cntbuf, M_TEMP);
674 		goto retry;
675 	}
676 	/* Scan hash tables counting entries */
677 	for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
678 		CK_LIST_FOREACH(ncp, ncpp, nc_hash)
679 			cntbuf[i]++;
680 	cache_unlock_all_buckets();
681 	for (error = 0, i = 0; i < n_nchash; i++)
682 		if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
683 			break;
684 	free(cntbuf, M_TEMP);
685 	return (error);
686 }
687 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
688     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
689     "nchash chain lengths");
690 
691 static int
692 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
693 {
694 	int error;
695 	struct nchashhead *ncpp;
696 	struct namecache *ncp;
697 	int n_nchash;
698 	int count, maxlength, used, pct;
699 
700 	if (!req->oldptr)
701 		return SYSCTL_OUT(req, 0, 4 * sizeof(int));
702 
703 	cache_lock_all_buckets();
704 	n_nchash = nchash + 1;	/* nchash is max index, not count */
705 	used = 0;
706 	maxlength = 0;
707 
708 	/* Scan hash tables for applicable entries */
709 	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
710 		count = 0;
711 		CK_LIST_FOREACH(ncp, ncpp, nc_hash) {
712 			count++;
713 		}
714 		if (count)
715 			used++;
716 		if (maxlength < count)
717 			maxlength = count;
718 	}
719 	n_nchash = nchash + 1;
720 	cache_unlock_all_buckets();
721 	pct = (used * 100) / (n_nchash / 100);
722 	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
723 	if (error)
724 		return (error);
725 	error = SYSCTL_OUT(req, &used, sizeof(used));
726 	if (error)
727 		return (error);
728 	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
729 	if (error)
730 		return (error);
731 	error = SYSCTL_OUT(req, &pct, sizeof(pct));
732 	if (error)
733 		return (error);
734 	return (0);
735 }
736 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
737     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
738     "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
739 #endif
740 
741 /*
742  * Negative entries management
743  *
744  * A variation of the LRU scheme is used. New entries are hashed into one of
745  * numneglists cold lists. Entries get promoted to the hot list on first hit.
746  *
747  * The shrinker will demote the hot list head and evict from the cold list in a
748  * round-robin manner.
749  */
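/*
 * Life cycle sketch (see the routines below):
 *
 *	cache_negative_insert()  - a new entry goes to the tail of a cold list
 *	cache_negative_hit()     - the first hit moves it to the hot list
 *	cache_negative_zap_one() - demotes the hot list head back to its cold
 *				   list and evicts the head of a cold list,
 *				   with the cold lists picked round-robin
 */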
750 static void
751 cache_negative_init(struct namecache *ncp)
752 {
753 	struct negstate *negstate;
754 
755 	ncp->nc_flag |= NCF_NEGATIVE;
756 	negstate = NCP2NEGSTATE(ncp);
757 	negstate->neg_flag = 0;
758 }
759 
760 static void
761 cache_negative_hit(struct namecache *ncp)
762 {
763 	struct neglist *neglist;
764 	struct negstate *negstate;
765 
766 	negstate = NCP2NEGSTATE(ncp);
767 	if ((negstate->neg_flag & NEG_HOT) != 0)
768 		return;
769 	neglist = NCP2NEGLIST(ncp);
770 	mtx_lock(&ncneg_hot.nl_lock);
771 	mtx_lock(&neglist->nl_lock);
772 	if ((negstate->neg_flag & NEG_HOT) == 0) {
773 		numhotneg++;
774 		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
775 		TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
776 		negstate->neg_flag |= NEG_HOT;
777 	}
778 	mtx_unlock(&neglist->nl_lock);
779 	mtx_unlock(&ncneg_hot.nl_lock);
780 }
781 
782 static void
783 cache_negative_insert(struct namecache *ncp)
784 {
785 	struct neglist *neglist;
786 
787 	MPASS(ncp->nc_flag & NCF_NEGATIVE);
788 	cache_assert_bucket_locked(ncp, RA_WLOCKED);
789 	neglist = NCP2NEGLIST(ncp);
790 	mtx_lock(&neglist->nl_lock);
791 	TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
792 	mtx_unlock(&neglist->nl_lock);
793 	atomic_add_rel_long(&numneg, 1);
794 }
795 
796 static void
797 cache_negative_remove(struct namecache *ncp)
798 {
799 	struct neglist *neglist;
800 	struct negstate *negstate;
801 	bool hot_locked = false;
802 	bool list_locked = false;
803 
804 	cache_assert_bucket_locked(ncp, RA_WLOCKED);
805 	neglist = NCP2NEGLIST(ncp);
806 	negstate = NCP2NEGSTATE(ncp);
807 	if ((negstate->neg_flag & NEG_HOT) != 0) {
808 		hot_locked = true;
809 		mtx_lock(&ncneg_hot.nl_lock);
810 		if ((negstate->neg_flag & NEG_HOT) == 0) {
811 			list_locked = true;
812 			mtx_lock(&neglist->nl_lock);
813 		}
814 	} else {
815 		list_locked = true;
816 		mtx_lock(&neglist->nl_lock);
817 		/*
818 		 * We may be racing against promotion in lockless lookup.
819 		 */
820 		if ((negstate->neg_flag & NEG_HOT) != 0) {
821 			mtx_unlock(&neglist->nl_lock);
822 			hot_locked = true;
823 			mtx_lock(&ncneg_hot.nl_lock);
824 			mtx_lock(&neglist->nl_lock);
825 		}
826 	}
827 	if ((negstate->neg_flag & NEG_HOT) != 0) {
828 		mtx_assert(&ncneg_hot.nl_lock, MA_OWNED);
829 		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
830 		numhotneg--;
831 	} else {
832 		mtx_assert(&neglist->nl_lock, MA_OWNED);
833 		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
834 	}
835 	if (list_locked)
836 		mtx_unlock(&neglist->nl_lock);
837 	if (hot_locked)
838 		mtx_unlock(&ncneg_hot.nl_lock);
839 	atomic_subtract_rel_long(&numneg, 1);
840 }
841 
842 static void
843 cache_negative_shrink_select(struct namecache **ncpp,
844     struct neglist **neglistpp)
845 {
846 	struct neglist *neglist;
847 	struct namecache *ncp;
848 	static u_int cycle;
849 	u_int i;
850 
851 	*ncpp = ncp = NULL;
852 
853 	for (i = 0; i < numneglists; i++) {
854 		neglist = &neglists[(cycle + i) % numneglists];
855 		if (TAILQ_FIRST(&neglist->nl_list) == NULL)
856 			continue;
857 		mtx_lock(&neglist->nl_lock);
858 		ncp = TAILQ_FIRST(&neglist->nl_list);
859 		if (ncp != NULL)
860 			break;
861 		mtx_unlock(&neglist->nl_lock);
862 	}
863 
864 	*neglistpp = neglist;
865 	*ncpp = ncp;
866 	cycle++;
867 }
868 
869 static void
870 cache_negative_zap_one(void)
871 {
872 	struct namecache *ncp, *ncp2;
873 	struct neglist *neglist;
874 	struct negstate *negstate;
875 	struct mtx *dvlp;
876 	struct rwlock *blp;
877 
878 	if (mtx_owner(&ncneg_shrink_lock) != NULL ||
879 	    !mtx_trylock(&ncneg_shrink_lock)) {
880 		counter_u64_add(shrinking_skipped, 1);
881 		return;
882 	}
883 
884 	mtx_lock(&ncneg_hot.nl_lock);
885 	ncp = TAILQ_FIRST(&ncneg_hot.nl_list);
886 	if (ncp != NULL) {
887 		neglist = NCP2NEGLIST(ncp);
888 		negstate = NCP2NEGSTATE(ncp);
889 		mtx_lock(&neglist->nl_lock);
890 		MPASS((negstate->neg_flag & NEG_HOT) != 0);
891 		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
892 		TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
893 		negstate->neg_flag &= ~NEG_HOT;
894 		numhotneg--;
895 		mtx_unlock(&neglist->nl_lock);
896 	}
897 	mtx_unlock(&ncneg_hot.nl_lock);
898 
899 	cache_negative_shrink_select(&ncp, &neglist);
900 
901 	mtx_unlock(&ncneg_shrink_lock);
902 	if (ncp == NULL)
903 		return;
904 
905 	MPASS(ncp->nc_flag & NCF_NEGATIVE);
906 	dvlp = VP2VNODELOCK(ncp->nc_dvp);
907 	blp = NCP2BUCKETLOCK(ncp);
908 	mtx_unlock(&neglist->nl_lock);
909 	mtx_lock(dvlp);
910 	rw_wlock(blp);
911 	/*
912 	 * Enter SMR to safely check the negative list.
913 	 * Even if the found pointer matches, the entry may now be reallocated
914 	 * and used by a different vnode.
915 	 */
916 	vfs_smr_enter();
917 	ncp2 = TAILQ_FIRST(&neglist->nl_list);
918 	if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) ||
919 	    blp != NCP2BUCKETLOCK(ncp2)) {
920 		vfs_smr_exit();
921 		ncp = NULL;
922 	} else {
923 		vfs_smr_exit();
924 		SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp,
925 		    ncp->nc_name);
926 		cache_zap_locked(ncp);
927 		counter_u64_add(numneg_evicted, 1);
928 	}
929 	rw_wunlock(blp);
930 	mtx_unlock(dvlp);
931 	cache_free(ncp);
932 }
933 
934 /*
935  * cache_zap_locked():
936  *
937  *   Removes a namecache entry from the cache, whether it contains an actual
938  *   pointer to a vnode or is just a negative cache entry.
939  */
940 static void
941 cache_zap_locked(struct namecache *ncp)
942 {
943 
944 	if (!(ncp->nc_flag & NCF_NEGATIVE))
945 		cache_assert_vnode_locked(ncp->nc_vp);
946 	cache_assert_vnode_locked(ncp->nc_dvp);
947 	cache_assert_bucket_locked(ncp, RA_WLOCKED);
948 
949 	CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp,
950 	    (ncp->nc_flag & NCF_NEGATIVE) ? NULL : ncp->nc_vp);
951 
952 	cache_ncp_invalidate(ncp);
953 
954 	CK_LIST_REMOVE(ncp, nc_hash);
955 	if (!(ncp->nc_flag & NCF_NEGATIVE)) {
956 		SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
957 		    ncp->nc_name, ncp->nc_vp);
958 		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
959 		if (ncp == ncp->nc_vp->v_cache_dd)
960 			ncp->nc_vp->v_cache_dd = NULL;
961 	} else {
962 		SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
963 		    ncp->nc_name);
964 		cache_negative_remove(ncp);
965 	}
966 	if (ncp->nc_flag & NCF_ISDOTDOT) {
967 		if (ncp == ncp->nc_dvp->v_cache_dd)
968 			ncp->nc_dvp->v_cache_dd = NULL;
969 	} else {
970 		LIST_REMOVE(ncp, nc_src);
971 		if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
972 			ncp->nc_flag |= NCF_DVDROP;
973 			counter_u64_add(numcachehv, -1);
974 		}
975 	}
976 	atomic_subtract_rel_long(&numcache, 1);
977 }
978 
979 static void
980 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
981 {
982 	struct rwlock *blp;
983 
984 	MPASS(ncp->nc_dvp == vp);
985 	MPASS(ncp->nc_flag & NCF_NEGATIVE);
986 	cache_assert_vnode_locked(vp);
987 
988 	blp = NCP2BUCKETLOCK(ncp);
989 	rw_wlock(blp);
990 	cache_zap_locked(ncp);
991 	rw_wunlock(blp);
992 }
993 
994 static bool
995 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
996     struct mtx **vlpp)
997 {
998 	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
999 	struct rwlock *blp;
1000 
1001 	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
1002 	cache_assert_vnode_locked(vp);
1003 
1004 	if (ncp->nc_flag & NCF_NEGATIVE) {
1005 		if (*vlpp != NULL) {
1006 			mtx_unlock(*vlpp);
1007 			*vlpp = NULL;
1008 		}
1009 		cache_zap_negative_locked_vnode_kl(ncp, vp);
1010 		return (true);
1011 	}
1012 
1013 	pvlp = VP2VNODELOCK(vp);
1014 	blp = NCP2BUCKETLOCK(ncp);
1015 	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
1016 	vlp2 = VP2VNODELOCK(ncp->nc_vp);
1017 
1018 	if (*vlpp == vlp1 || *vlpp == vlp2) {
1019 		to_unlock = *vlpp;
1020 		*vlpp = NULL;
1021 	} else {
1022 		if (*vlpp != NULL) {
1023 			mtx_unlock(*vlpp);
1024 			*vlpp = NULL;
1025 		}
1026 		cache_sort_vnodes(&vlp1, &vlp2);
1027 		if (vlp1 == pvlp) {
1028 			mtx_lock(vlp2);
1029 			to_unlock = vlp2;
1030 		} else {
1031 			if (!mtx_trylock(vlp1))
1032 				goto out_relock;
1033 			to_unlock = vlp1;
1034 		}
1035 	}
1036 	rw_wlock(blp);
1037 	cache_zap_locked(ncp);
1038 	rw_wunlock(blp);
1039 	if (to_unlock != NULL)
1040 		mtx_unlock(to_unlock);
1041 	return (true);
1042 
1043 out_relock:
1044 	mtx_unlock(vlp2);
1045 	mtx_lock(vlp1);
1046 	mtx_lock(vlp2);
1047 	MPASS(*vlpp == NULL);
1048 	*vlpp = vlp1;
1049 	return (false);
1050 }
1051 
1052 static int __noinline
1053 cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp)
1054 {
1055 	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
1056 	struct rwlock *blp;
1057 	int error = 0;
1058 
1059 	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
1060 	cache_assert_vnode_locked(vp);
1061 
1062 	pvlp = VP2VNODELOCK(vp);
1063 	if (ncp->nc_flag & NCF_NEGATIVE) {
1064 		cache_zap_negative_locked_vnode_kl(ncp, vp);
1065 		goto out;
1066 	}
1067 
1068 	blp = NCP2BUCKETLOCK(ncp);
1069 	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
1070 	vlp2 = VP2VNODELOCK(ncp->nc_vp);
1071 	cache_sort_vnodes(&vlp1, &vlp2);
1072 	if (vlp1 == pvlp) {
1073 		mtx_lock(vlp2);
1074 		to_unlock = vlp2;
1075 	} else {
1076 		if (!mtx_trylock(vlp1)) {
1077 			error = EAGAIN;
1078 			goto out;
1079 		}
1080 		to_unlock = vlp1;
1081 	}
1082 	rw_wlock(blp);
1083 	cache_zap_locked(ncp);
1084 	rw_wunlock(blp);
1085 	mtx_unlock(to_unlock);
1086 out:
1087 	mtx_unlock(pvlp);
1088 	return (error);
1089 }
1090 
1091 /*
1092  * If trylocking failed we can get here. We know enough to take all needed locks
1093  * in the right order and re-lookup the entry.
1094  */
1095 static int
1096 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1097     struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
1098     struct rwlock *blp)
1099 {
1100 	struct namecache *rncp;
1101 
1102 	cache_assert_bucket_locked(ncp, RA_UNLOCKED);
1103 
1104 	cache_sort_vnodes(&dvlp, &vlp);
1105 	cache_lock_vnodes(dvlp, vlp);
1106 	rw_wlock(blp);
1107 	CK_LIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
1108 		if (rncp == ncp && rncp->nc_dvp == dvp &&
1109 		    rncp->nc_nlen == cnp->cn_namelen &&
1110 		    !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
1111 			break;
1112 	}
1113 	if (rncp != NULL) {
1114 		cache_zap_locked(rncp);
1115 		rw_wunlock(blp);
1116 		cache_unlock_vnodes(dvlp, vlp);
1117 		counter_u64_add(zap_and_exit_bucket_relock_success, 1);
1118 		return (0);
1119 	}
1120 
1121 	rw_wunlock(blp);
1122 	cache_unlock_vnodes(dvlp, vlp);
1123 	return (EAGAIN);
1124 }
1125 
1126 static int __noinline
1127 cache_zap_wlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1128     uint32_t hash, struct rwlock *blp)
1129 {
1130 	struct mtx *dvlp, *vlp;
1131 	struct vnode *dvp;
1132 
1133 	cache_assert_bucket_locked(ncp, RA_WLOCKED);
1134 
1135 	dvlp = VP2VNODELOCK(ncp->nc_dvp);
1136 	vlp = NULL;
1137 	if (!(ncp->nc_flag & NCF_NEGATIVE))
1138 		vlp = VP2VNODELOCK(ncp->nc_vp);
1139 	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1140 		cache_zap_locked(ncp);
1141 		rw_wunlock(blp);
1142 		cache_unlock_vnodes(dvlp, vlp);
1143 		return (0);
1144 	}
1145 
1146 	dvp = ncp->nc_dvp;
1147 	rw_wunlock(blp);
1148 	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1149 }
1150 
1151 static int __noinline
1152 cache_zap_rlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1153     uint32_t hash, struct rwlock *blp)
1154 {
1155 	struct mtx *dvlp, *vlp;
1156 	struct vnode *dvp;
1157 
1158 	cache_assert_bucket_locked(ncp, RA_RLOCKED);
1159 
1160 	dvlp = VP2VNODELOCK(ncp->nc_dvp);
1161 	vlp = NULL;
1162 	if (!(ncp->nc_flag & NCF_NEGATIVE))
1163 		vlp = VP2VNODELOCK(ncp->nc_vp);
1164 	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1165 		rw_runlock(blp);
1166 		rw_wlock(blp);
1167 		cache_zap_locked(ncp);
1168 		rw_wunlock(blp);
1169 		cache_unlock_vnodes(dvlp, vlp);
1170 		return (0);
1171 	}
1172 
1173 	dvp = ncp->nc_dvp;
1174 	rw_runlock(blp);
1175 	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1176 }
1177 
1178 static int
1179 cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp,
1180     struct mtx **vlpp1, struct mtx **vlpp2)
1181 {
1182 	struct mtx *dvlp, *vlp;
1183 
1184 	cache_assert_bucket_locked(ncp, RA_WLOCKED);
1185 
1186 	dvlp = VP2VNODELOCK(ncp->nc_dvp);
1187 	vlp = NULL;
1188 	if (!(ncp->nc_flag & NCF_NEGATIVE))
1189 		vlp = VP2VNODELOCK(ncp->nc_vp);
1190 	cache_sort_vnodes(&dvlp, &vlp);
1191 
1192 	if (*vlpp1 == dvlp && *vlpp2 == vlp) {
1193 		cache_zap_locked(ncp);
1194 		cache_unlock_vnodes(dvlp, vlp);
1195 		*vlpp1 = NULL;
1196 		*vlpp2 = NULL;
1197 		return (0);
1198 	}
1199 
1200 	if (*vlpp1 != NULL)
1201 		mtx_unlock(*vlpp1);
1202 	if (*vlpp2 != NULL)
1203 		mtx_unlock(*vlpp2);
1204 	*vlpp1 = NULL;
1205 	*vlpp2 = NULL;
1206 
1207 	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1208 		cache_zap_locked(ncp);
1209 		cache_unlock_vnodes(dvlp, vlp);
1210 		return (0);
1211 	}
1212 
1213 	rw_wunlock(blp);
1214 	*vlpp1 = dvlp;
1215 	*vlpp2 = vlp;
1216 	if (*vlpp1 != NULL)
1217 		mtx_lock(*vlpp1);
1218 	mtx_lock(*vlpp2);
1219 	rw_wlock(blp);
1220 	return (EAGAIN);
1221 }
1222 
1223 static void
1224 cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp)
1225 {
1226 
1227 	if (blp != NULL) {
1228 		rw_runlock(blp);
1229 	} else {
1230 		mtx_unlock(vlp);
1231 	}
1232 }
1233 
1234 static int __noinline
1235 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1236     struct timespec *tsp, int *ticksp)
1237 {
1238 	int ltype;
1239 
1240 	*vpp = dvp;
1241 	CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .",
1242 			dvp, cnp->cn_nameptr);
1243 	counter_u64_add(dothits, 1);
1244 	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
1245 	if (tsp != NULL)
1246 		timespecclear(tsp);
1247 	if (ticksp != NULL)
1248 		*ticksp = ticks;
1249 	vrefact(*vpp);
1250 	/*
1251 	 * When we look up "." we can still be asked to lock it
1252 	 * differently...
1253 	 */
1254 	ltype = cnp->cn_lkflags & LK_TYPE_MASK;
1255 	if (ltype != VOP_ISLOCKED(*vpp)) {
1256 		if (ltype == LK_EXCLUSIVE) {
1257 			vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
1258 			if (VN_IS_DOOMED((*vpp))) {
1259 				/* forced unmount */
1260 				vrele(*vpp);
1261 				*vpp = NULL;
1262 				return (ENOENT);
1263 			}
1264 		} else
1265 			vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
1266 	}
1267 	return (-1);
1268 }
1269 
1270 static __noinline int
1271 cache_lookup_nomakeentry(struct vnode *dvp, struct vnode **vpp,
1272     struct componentname *cnp, struct timespec *tsp, int *ticksp)
1273 {
1274 	struct namecache *ncp;
1275 	struct rwlock *blp;
1276 	struct mtx *dvlp, *dvlp2;
1277 	uint32_t hash;
1278 	int error;
1279 
1280 	if (cnp->cn_namelen == 2 &&
1281 	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
1282 		counter_u64_add(dotdothits, 1);
1283 		dvlp = VP2VNODELOCK(dvp);
1284 		dvlp2 = NULL;
1285 		mtx_lock(dvlp);
1286 retry_dotdot:
1287 		ncp = dvp->v_cache_dd;
1288 		if (ncp == NULL) {
1289 			SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
1290 			    "..", NULL);
1291 			mtx_unlock(dvlp);
1292 			if (dvlp2 != NULL)
1293 				mtx_unlock(dvlp2);
1294 			return (0);
1295 		}
1296 		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1297 			if (ncp->nc_dvp != dvp)
1298 				panic("dvp %p v_cache_dd %p\n", dvp, ncp);
1299 			if (!cache_zap_locked_vnode_kl2(ncp,
1300 			    dvp, &dvlp2))
1301 				goto retry_dotdot;
1302 			MPASS(dvp->v_cache_dd == NULL);
1303 			mtx_unlock(dvlp);
1304 			if (dvlp2 != NULL)
1305 				mtx_unlock(dvlp2);
1306 			cache_free(ncp);
1307 		} else {
1308 			dvp->v_cache_dd = NULL;
1309 			mtx_unlock(dvlp);
1310 			if (dvlp2 != NULL)
1311 				mtx_unlock(dvlp2);
1312 		}
1313 		return (0);
1314 	}
1315 
1316 	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1317 	blp = HASH2BUCKETLOCK(hash);
1318 retry:
1319 	if (CK_LIST_EMPTY(NCHHASH(hash)))
1320 		goto out_no_entry;
1321 
1322 	rw_wlock(blp);
1323 
1324 	CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1325 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1326 		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1327 			break;
1328 	}
1329 
1330 	/* We failed to find an entry */
1331 	if (ncp == NULL) {
1332 		rw_wunlock(blp);
1333 		goto out_no_entry;
1334 	}
1335 
1336 	error = cache_zap_wlocked_bucket(ncp, cnp, hash, blp);
1337 	if (__predict_false(error != 0)) {
1338 		zap_and_exit_bucket_fail++;
1339 		cache_maybe_yield();
1340 		goto retry;
1341 	}
1342 	counter_u64_add(numposzaps, 1);
1343 	cache_free(ncp);
1344 	return (0);
1345 out_no_entry:
1346 	SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, NULL);
1347 	counter_u64_add(nummisszap, 1);
1348 	return (0);
1349 }
1350 
1351 /**
1352  * Lookup a name in the name cache
1353  *
1354  * # Arguments
1355  *
1356  * - dvp:	Parent directory in which to search.
1357  * - vpp:	Return argument.  Will contain desired vnode on cache hit.
1358  * - cnp:	Parameters of the name search.  The most interesting bits of
1359  *   		the cn_flags field have the following meanings:
1360  *   	- MAKEENTRY:	If clear, free an entry from the cache rather than look
1361  *   			it up.
1362  *   	- ISDOTDOT:	Must be set if and only if cn_nameptr == ".."
1363  * - tsp:	Return storage for cache timestamp.  On a successful (positive
1364  *   		or negative) lookup, tsp will be filled with any timespec that
1365  *   		was stored when this cache entry was created.  However, it will
1366  *   		be clear for "." entries.
1367  * - ticks:	Return storage for alternate cache timestamp.  On a successful
1368  *   		(positive or negative) lookup, it will contain the ticks value
1369  *   		that was current when the cache entry was created, unless cnp
1370  *   		was ".".
1371  *
1372  * # Returns
1373  *
1374  * - -1:	A positive cache hit.  vpp will contain the desired vnode.
1375  * - ENOENT:	A negative cache hit, or dvp was recycled out from under us due
1376  *		to a forced unmount.  vpp will not be modified.  If the entry
1377  *		is a whiteout, then the ISWHITEOUT flag will be set in
1378  *		cnp->cn_flags.
1379  * - 0:		A cache miss.  vpp will not be modified.
1380  *
1381  * # Locking
1382  *
1383  * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
1384  * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
1385  * lock is not recursively acquired.
1386  */
1387 int
1388 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1389     struct timespec *tsp, int *ticksp)
1390 {
1391 	struct namecache_ts *ncp_ts;
1392 	struct namecache *ncp;
1393 	struct negstate *negstate;
1394 	struct rwlock *blp;
1395 	struct mtx *dvlp;
1396 	uint32_t hash;
1397 	enum vgetstate vs;
1398 	int error, ltype;
1399 	bool try_smr, doing_smr, whiteout;
1400 
1401 #ifdef DEBUG_CACHE
1402 	if (__predict_false(!doingcache)) {
1403 		cnp->cn_flags &= ~MAKEENTRY;
1404 		return (0);
1405 	}
1406 #endif
1407 
1408 	if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.'))
1409 		return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
1410 
1411 	if ((cnp->cn_flags & MAKEENTRY) == 0)
1412 		return (cache_lookup_nomakeentry(dvp, vpp, cnp, tsp, ticksp));
1413 
1414 	try_smr = true;
1415 	if (cnp->cn_nameiop == CREATE)
1416 		try_smr = false;
1417 retry:
1418 	doing_smr = false;
1419 	blp = NULL;
1420 	dvlp = NULL;
1421 	error = 0;
1422 	if (cnp->cn_namelen == 2 &&
1423 	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
1424 		counter_u64_add(dotdothits, 1);
1425 		dvlp = VP2VNODELOCK(dvp);
1426 		mtx_lock(dvlp);
1427 		ncp = dvp->v_cache_dd;
1428 		if (ncp == NULL) {
1429 			SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
1430 			    "..", NULL);
1431 			mtx_unlock(dvlp);
1432 			return (0);
1433 		}
1434 		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1435 			if (ncp->nc_flag & NCF_NEGATIVE)
1436 				*vpp = NULL;
1437 			else
1438 				*vpp = ncp->nc_vp;
1439 		} else
1440 			*vpp = ncp->nc_dvp;
1441 		/* Return failure if negative entry was found. */
1442 		if (*vpp == NULL)
1443 			goto negative_success;
1444 		CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..",
1445 		    dvp, cnp->cn_nameptr, *vpp);
1446 		SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..",
1447 		    *vpp);
1448 		cache_out_ts(ncp, tsp, ticksp);
1449 		if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
1450 		    NCF_DTS && tsp != NULL) {
1451 			ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1452 			*tsp = ncp_ts->nc_dotdottime;
1453 		}
1454 		goto success;
1455 	}
1456 
1457 	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1458 retry_hashed:
1459 	if (try_smr) {
1460 		vfs_smr_enter();
1461 		doing_smr = true;
1462 		try_smr = false;
1463 	} else {
1464 		blp = HASH2BUCKETLOCK(hash);
1465 		rw_rlock(blp);
1466 	}
1467 
1468 	CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1469 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1470 		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1471 			break;
1472 	}
1473 
1474 	/* We failed to find an entry */
1475 	if (__predict_false(ncp == NULL)) {
1476 		if (doing_smr)
1477 			vfs_smr_exit();
1478 		else
1479 			rw_runlock(blp);
1480 		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
1481 		    NULL);
1482 		counter_u64_add(nummiss, 1);
1483 		return (0);
1484 	}
1485 
1486 	if (ncp->nc_flag & NCF_NEGATIVE)
1487 		goto negative_success;
1488 
1489 	/* We found a "positive" match, return the vnode */
1490 	counter_u64_add(numposhits, 1);
1491 	*vpp = ncp->nc_vp;
1492 	CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
1493 	    dvp, cnp->cn_nameptr, *vpp, ncp);
1494 	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name,
1495 	    *vpp);
1496 	cache_out_ts(ncp, tsp, ticksp);
1497 success:
1498 	/*
1499 	 * On success we return a locked and ref'd vnode as per the lookup
1500 	 * protocol.
1501 	 */
1502 	MPASS(dvp != *vpp);
1503 	ltype = 0;	/* silence gcc warning */
1504 	if (cnp->cn_flags & ISDOTDOT) {
1505 		ltype = VOP_ISLOCKED(dvp);
1506 		VOP_UNLOCK(dvp);
1507 	}
1508 	if (doing_smr) {
1509 		if (cache_ncp_invalid(ncp)) {
1510 			vfs_smr_exit();
1511 			*vpp = NULL;
1512 			goto retry;
1513 		}
1514 		vs = vget_prep_smr(*vpp);
1515 		vfs_smr_exit();
1516 		if (vs == VGET_NONE) {
1517 			*vpp = NULL;
1518 			goto retry;
1519 		}
1520 	} else {
1521 		vs = vget_prep(*vpp);
1522 		cache_lookup_unlock(blp, dvlp);
1523 	}
1524 	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1525 	if (cnp->cn_flags & ISDOTDOT) {
1526 		vn_lock(dvp, ltype | LK_RETRY);
1527 		if (VN_IS_DOOMED(dvp)) {
1528 			if (error == 0)
1529 				vput(*vpp);
1530 			*vpp = NULL;
1531 			return (ENOENT);
1532 		}
1533 	}
1534 	if (error) {
1535 		*vpp = NULL;
1536 		goto retry;
1537 	}
1538 	if ((cnp->cn_flags & ISLASTCN) &&
1539 	    (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) {
1540 		ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
1541 	}
1542 	return (-1);
1543 
1544 negative_success:
1545 	/* We found a negative match and the caller wants to create the name, so purge the entry. */
1546 	if (cnp->cn_nameiop == CREATE) {
1547 		MPASS(!doing_smr);
1548 		counter_u64_add(numnegzaps, 1);
1549 		goto zap_and_exit;
1550 	}
1551 
1552 	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
1553 	cache_out_ts(ncp, tsp, ticksp);
1554 	counter_u64_add(numneghits, 1);
1555 	whiteout = (ncp->nc_flag & NCF_WHITE);
1556 
1557 	if (doing_smr) {
1558 		/*
1559 		 * We need to take locks to promote an entry.
1560 		 */
1561 		negstate = NCP2NEGSTATE(ncp);
1562 		if ((negstate->neg_flag & NEG_HOT) == 0 ||
1563 		    cache_ncp_invalid(ncp)) {
1564 			vfs_smr_exit();
1565 			doing_smr = false;
1566 			goto retry_hashed;
1567 		}
1568 		vfs_smr_exit();
1569 	} else {
1570 		cache_negative_hit(ncp);
1571 		cache_lookup_unlock(blp, dvlp);
1572 	}
1573 	if (whiteout)
1574 		cnp->cn_flags |= ISWHITEOUT;
1575 	return (ENOENT);
1576 
1577 zap_and_exit:
1578 	MPASS(!doing_smr);
1579 	if (blp != NULL)
1580 		error = cache_zap_rlocked_bucket(ncp, cnp, hash, blp);
1581 	else
1582 		error = cache_zap_locked_vnode(ncp, dvp);
1583 	if (__predict_false(error != 0)) {
1584 		zap_and_exit_bucket_fail2++;
1585 		cache_maybe_yield();
1586 		goto retry;
1587 	}
1588 	cache_free(ncp);
1589 	return (0);
1590 }
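/*
 * A sketch of how a filesystem lookup routine might consume cache_lookup()
 * above, with dvp, vpp and cnp as in its prototype; fs_dirlookup() is a
 * hypothetical stand-in for an actual directory scan:
 *
 *	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
 *	if (error == -1)
 *		return (0);		positive hit; *vpp locked and ref'd
 *	if (error == ENOENT)
 *		return (ENOENT);	negative hit (or dvp was doomed)
 *	return (fs_dirlookup(dvp, vpp, cnp));	a miss; scan the directory
 */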
1591 
1592 struct celockstate {
1593 	struct mtx *vlp[3];
1594 	struct rwlock *blp[2];
1595 };
1596 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
1597 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
1598 
1599 static inline void
1600 cache_celockstate_init(struct celockstate *cel)
1601 {
1602 
1603 	bzero(cel, sizeof(*cel));
1604 }
1605 
1606 static void
1607 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
1608     struct vnode *dvp)
1609 {
1610 	struct mtx *vlp1, *vlp2;
1611 
1612 	MPASS(cel->vlp[0] == NULL);
1613 	MPASS(cel->vlp[1] == NULL);
1614 	MPASS(cel->vlp[2] == NULL);
1615 
1616 	MPASS(vp != NULL || dvp != NULL);
1617 
1618 	vlp1 = VP2VNODELOCK(vp);
1619 	vlp2 = VP2VNODELOCK(dvp);
1620 	cache_sort_vnodes(&vlp1, &vlp2);
1621 
1622 	if (vlp1 != NULL) {
1623 		mtx_lock(vlp1);
1624 		cel->vlp[0] = vlp1;
1625 	}
1626 	mtx_lock(vlp2);
1627 	cel->vlp[1] = vlp2;
1628 }
1629 
1630 static void
1631 cache_unlock_vnodes_cel(struct celockstate *cel)
1632 {
1633 
1634 	MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
1635 
1636 	if (cel->vlp[0] != NULL)
1637 		mtx_unlock(cel->vlp[0]);
1638 	if (cel->vlp[1] != NULL)
1639 		mtx_unlock(cel->vlp[1]);
1640 	if (cel->vlp[2] != NULL)
1641 		mtx_unlock(cel->vlp[2]);
1642 }
1643 
1644 static bool
1645 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
1646 {
1647 	struct mtx *vlp;
1648 	bool ret;
1649 
1650 	cache_assert_vlp_locked(cel->vlp[0]);
1651 	cache_assert_vlp_locked(cel->vlp[1]);
1652 	MPASS(cel->vlp[2] == NULL);
1653 
1654 	MPASS(vp != NULL);
1655 	vlp = VP2VNODELOCK(vp);
1656 
1657 	ret = true;
1658 	if (vlp >= cel->vlp[1]) {
1659 		mtx_lock(vlp);
1660 	} else {
1661 		if (mtx_trylock(vlp))
1662 			goto out;
1663 		cache_lock_vnodes_cel_3_failures++;
1664 		cache_unlock_vnodes_cel(cel);
1665 		if (vlp < cel->vlp[0]) {
1666 			mtx_lock(vlp);
1667 			mtx_lock(cel->vlp[0]);
1668 			mtx_lock(cel->vlp[1]);
1669 		} else {
1670 			if (cel->vlp[0] != NULL)
1671 				mtx_lock(cel->vlp[0]);
1672 			mtx_lock(vlp);
1673 			mtx_lock(cel->vlp[1]);
1674 		}
1675 		ret = false;
1676 	}
1677 out:
1678 	cel->vlp[2] = vlp;
1679 	return (ret);
1680 }
1681 
1682 static void
1683 cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1,
1684     struct rwlock *blp2)
1685 {
1686 
1687 	MPASS(cel->blp[0] == NULL);
1688 	MPASS(cel->blp[1] == NULL);
1689 
1690 	cache_sort_vnodes(&blp1, &blp2);
1691 
1692 	if (blp1 != NULL) {
1693 		rw_wlock(blp1);
1694 		cel->blp[0] = blp1;
1695 	}
1696 	rw_wlock(blp2);
1697 	cel->blp[1] = blp2;
1698 }
1699 
1700 static void
1701 cache_unlock_buckets_cel(struct celockstate *cel)
1702 {
1703 
1704 	if (cel->blp[0] != NULL)
1705 		rw_wunlock(cel->blp[0]);
1706 	rw_wunlock(cel->blp[1]);
1707 }
1708 
1709 /*
1710  * Lock part of the cache affected by the insertion.
1711  *
1712  * This means vnodelocks for dvp, vp and the relevant bucketlock.
1713  * However, insertion can result in removal of an old entry. In this
1714  * case we have an additional vnode and bucketlock pair to lock. If the
1715  * entry is negative, the relevant neglist lock is taken instead of the vnode lock.
1716  *
1717  * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
1718  * preserving the locking order (smaller address first).
1719  */
1720 static void
1721 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
1722     uint32_t hash)
1723 {
1724 	struct namecache *ncp;
1725 	struct rwlock *blps[2];
1726 
1727 	blps[0] = HASH2BUCKETLOCK(hash);
1728 	for (;;) {
1729 		blps[1] = NULL;
1730 		cache_lock_vnodes_cel(cel, dvp, vp);
1731 		if (vp == NULL || vp->v_type != VDIR)
1732 			break;
1733 		ncp = vp->v_cache_dd;
1734 		if (ncp == NULL)
1735 			break;
1736 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
1737 			break;
1738 		MPASS(ncp->nc_dvp == vp);
1739 		blps[1] = NCP2BUCKETLOCK(ncp);
1740 		if (ncp->nc_flag & NCF_NEGATIVE)
1741 			break;
1742 		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
1743 			break;
1744 		/*
1745 		 * All vnodes got re-locked. Re-validate the state and if
1746 		 * nothing changed we are done. Otherwise restart.
1747 		 */
1748 		if (ncp == vp->v_cache_dd &&
1749 		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
1750 		    blps[1] == NCP2BUCKETLOCK(ncp) &&
1751 		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
1752 			break;
1753 		cache_unlock_vnodes_cel(cel);
1754 		cel->vlp[0] = NULL;
1755 		cel->vlp[1] = NULL;
1756 		cel->vlp[2] = NULL;
1757 	}
1758 	cache_lock_buckets_cel(cel, blps[0], blps[1]);
1759 }
1760 
1761 static void
1762 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
1763     uint32_t hash)
1764 {
1765 	struct namecache *ncp;
1766 	struct rwlock *blps[2];
1767 
1768 	blps[0] = HASH2BUCKETLOCK(hash);
1769 	for (;;) {
1770 		blps[1] = NULL;
1771 		cache_lock_vnodes_cel(cel, dvp, vp);
1772 		ncp = dvp->v_cache_dd;
1773 		if (ncp == NULL)
1774 			break;
1775 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
1776 			break;
1777 		MPASS(ncp->nc_dvp == dvp);
1778 		blps[1] = NCP2BUCKETLOCK(ncp);
1779 		if (ncp->nc_flag & NCF_NEGATIVE)
1780 			break;
1781 		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
1782 			break;
1783 		if (ncp == dvp->v_cache_dd &&
1784 		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
1785 		    blps[1] == NCP2BUCKETLOCK(ncp) &&
1786 		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
1787 			break;
1788 		cache_unlock_vnodes_cel(cel);
1789 		cel->vlp[0] = NULL;
1790 		cel->vlp[1] = NULL;
1791 		cel->vlp[2] = NULL;
1792 	}
1793 	cache_lock_buckets_cel(cel, blps[0], blps[1]);
1794 }
1795 
1796 static void
1797 cache_enter_unlock(struct celockstate *cel)
1798 {
1799 
1800 	cache_unlock_buckets_cel(cel);
1801 	cache_unlock_vnodes_cel(cel);
1802 }
1803 
1804 static void __noinline
1805 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
1806     struct componentname *cnp)
1807 {
1808 	struct celockstate cel;
1809 	struct namecache *ncp;
1810 	uint32_t hash;
1811 	int len;
1812 
1813 	if (dvp->v_cache_dd == NULL)
1814 		return;
1815 	len = cnp->cn_namelen;
1816 	cache_celockstate_init(&cel);
1817 	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
1818 	cache_enter_lock_dd(&cel, dvp, vp, hash);
1819 	ncp = dvp->v_cache_dd;
1820 	if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
1821 		KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
1822 		cache_zap_locked(ncp);
1823 	} else {
1824 		ncp = NULL;
1825 	}
1826 	dvp->v_cache_dd = NULL;
1827 	cache_enter_unlock(&cel);
1828 	cache_free(ncp);
1829 }
1830 
1831 /*
1832  * Add an entry to the cache.
1833  */
1834 void
1835 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
1836     struct timespec *tsp, struct timespec *dtsp)
1837 {
1838 	struct celockstate cel;
1839 	struct namecache *ncp, *n2, *ndd;
1840 	struct namecache_ts *ncp_ts, *n2_ts;
1841 	struct nchashhead *ncpp;
1842 	uint32_t hash;
1843 	int flag;
1844 	int len;
1845 	u_long lnumcache;
1846 
1847 	CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr);
1848 	VNASSERT(vp == NULL || !VN_IS_DOOMED(vp), vp,
1849 	    ("cache_enter: Adding a doomed vnode"));
1850 	VNASSERT(dvp == NULL || !VN_IS_DOOMED(dvp), dvp,
1851 	    ("cache_enter: Doomed vnode used as src"));
1852 
1853 #ifdef DEBUG_CACHE
1854 	if (__predict_false(!doingcache))
1855 		return;
1856 #endif
1857 
1858 	flag = 0;
1859 	if (__predict_false(cnp->cn_nameptr[0] == '.')) {
1860 		if (cnp->cn_namelen == 1)
1861 			return;
1862 		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
1863 			cache_enter_dotdot_prep(dvp, vp, cnp);
1864 			flag = NCF_ISDOTDOT;
1865 		}
1866 	}
1867 
1868 	/*
1869 	 * Avoid blowout in namecache entries.
1870 	 */
1871 	lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
1872 	if (__predict_false(lnumcache >= ncsize)) {
1873 		atomic_add_long(&numcache, -1);
1874 		counter_u64_add(numdrops, 1);
1875 		return;
1876 	}
1877 
1878 	cache_celockstate_init(&cel);
1879 	ndd = NULL;
1880 	ncp_ts = NULL;
1881 
1882 	/*
	 * Calculate the hash key and set up as much of the new
1884 	 * namecache entry as possible before acquiring the lock.
1885 	 */
1886 	ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
1887 	ncp->nc_flag = flag;
1888 	ncp->nc_vp = vp;
1889 	if (vp == NULL)
1890 		cache_negative_init(ncp);
1891 	ncp->nc_dvp = dvp;
1892 	if (tsp != NULL) {
1893 		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1894 		ncp_ts->nc_time = *tsp;
1895 		ncp_ts->nc_ticks = ticks;
1896 		ncp_ts->nc_nc.nc_flag |= NCF_TS;
1897 		if (dtsp != NULL) {
1898 			ncp_ts->nc_dotdottime = *dtsp;
1899 			ncp_ts->nc_nc.nc_flag |= NCF_DTS;
1900 		}
1901 	}
1902 	len = ncp->nc_nlen = cnp->cn_namelen;
1903 	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
1904 	strlcpy(ncp->nc_name, cnp->cn_nameptr, len + 1);
1905 	cache_enter_lock(&cel, dvp, vp, hash);
1906 
1907 	/*
1908 	 * See if this vnode or negative entry is already in the cache
1909 	 * with this name.  This can happen with concurrent lookups of
1910 	 * the same path name.
1911 	 */
1912 	ncpp = NCHHASH(hash);
1913 	CK_LIST_FOREACH(n2, ncpp, nc_hash) {
1914 		if (n2->nc_dvp == dvp &&
1915 		    n2->nc_nlen == cnp->cn_namelen &&
1916 		    !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
1917 			if (tsp != NULL) {
1918 				KASSERT((n2->nc_flag & NCF_TS) != 0,
1919 				    ("no NCF_TS"));
1920 				n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
1921 				n2_ts->nc_time = ncp_ts->nc_time;
1922 				n2_ts->nc_ticks = ncp_ts->nc_ticks;
1923 				if (dtsp != NULL) {
1924 					n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
1925 					n2_ts->nc_nc.nc_flag |= NCF_DTS;
1926 				}
1927 			}
1928 			goto out_unlock_free;
1929 		}
1930 	}
1931 
1932 	if (flag == NCF_ISDOTDOT) {
1933 		/*
		 * See if we are trying to add a ".." entry, but some other
		 * lookup has already populated the v_cache_dd pointer.
1936 		 */
1937 		if (dvp->v_cache_dd != NULL)
1938 			goto out_unlock_free;
1939 		KASSERT(vp == NULL || vp->v_type == VDIR,
1940 		    ("wrong vnode type %p", vp));
1941 		dvp->v_cache_dd = ncp;
1942 	}
1943 
1944 	if (vp != NULL) {
1945 		if (vp->v_type == VDIR) {
1946 			if (flag != NCF_ISDOTDOT) {
1947 				/*
				 * In this case, the same cache entry maps both
				 * the name of the directory within its parent
				 * and the name ".." leading back to that parent.
1951 				 */
1952 				if ((ndd = vp->v_cache_dd) != NULL) {
1953 					if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
1954 						cache_zap_locked(ndd);
1955 					else
1956 						ndd = NULL;
1957 				}
1958 				vp->v_cache_dd = ncp;
1959 			}
1960 		} else {
1961 			vp->v_cache_dd = NULL;
1962 		}
1963 	}
1964 
1965 	if (flag != NCF_ISDOTDOT) {
1966 		if (LIST_EMPTY(&dvp->v_cache_src)) {
1967 			vhold(dvp);
1968 			counter_u64_add(numcachehv, 1);
1969 		}
1970 		LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
1971 	}
1972 
1973 	/*
1974 	 * If the entry is "negative", we place it into the
1975 	 * "negative" cache queue, otherwise, we place it into the
1976 	 * destination vnode's cache entries queue.
1977 	 */
1978 	if (vp != NULL) {
1979 		TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
1980 		SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
1981 		    vp);
1982 	} else {
1983 		if (cnp->cn_flags & ISWHITEOUT)
1984 			ncp->nc_flag |= NCF_WHITE;
1985 		cache_negative_insert(ncp);
1986 		SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
1987 		    ncp->nc_name);
1988 	}
1989 
1990 	atomic_thread_fence_rel();
1991 	/*
1992 	 * Insert the new namecache entry into the appropriate chain
1993 	 * within the cache entries table.
1994 	 */
1995 	CK_LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
1996 
1997 	cache_enter_unlock(&cel);
1998 	if (numneg * ncnegfactor > lnumcache)
1999 		cache_negative_zap_one();
2000 	cache_free(ndd);
2001 	return;
2002 out_unlock_free:
2003 	cache_enter_unlock(&cel);
2004 	cache_free(ncp);
2005 	return;
2006 }
2007 
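/*
 * Return the smallest power of 2 strictly greater than val, e.g.
 * cache_roundup_2(255) == 256 and cache_roundup_2(256) == 512.
 */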
2008 static u_int
2009 cache_roundup_2(u_int val)
2010 {
2011 	u_int res;
2012 
2013 	for (res = 1; res <= val; res <<= 1)
2014 		continue;
2015 
2016 	return (res);
2017 }
2018 
2019 /*
2020  * Name cache initialization, from vfs_init() when we are booting
2021  */
2022 static void
2023 nchinit(void *dummy __unused)
2024 {
2025 	u_int i;
2026 
2027 	cache_zone_small = uma_zcreate("S VFS Cache",
2028 	    sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1,
2029 	    NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache),
2030 	    UMA_ZONE_ZINIT);
2031 	cache_zone_small_ts = uma_zcreate("STS VFS Cache",
2032 	    sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1,
2033 	    NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts),
2034 	    UMA_ZONE_ZINIT);
2035 	cache_zone_large = uma_zcreate("L VFS Cache",
2036 	    sizeof(struct namecache) + NAME_MAX + 1,
2037 	    NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache),
2038 	    UMA_ZONE_ZINIT);
2039 	cache_zone_large_ts = uma_zcreate("LTS VFS Cache",
2040 	    sizeof(struct namecache_ts) + NAME_MAX + 1,
2041 	    NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts),
2042 	    UMA_ZONE_ZINIT);
2043 
2044 	VFS_SMR_ZONE_SET(cache_zone_small);
2045 	VFS_SMR_ZONE_SET(cache_zone_small_ts);
2046 	VFS_SMR_ZONE_SET(cache_zone_large);
2047 	VFS_SMR_ZONE_SET(cache_zone_large_ts);
2048 
2049 	ncsize = desiredvnodes * ncsizefactor;
2050 	nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
2051 	ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
2052 	if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
2053 		ncbuckethash = 7;
2054 	if (ncbuckethash > nchash)
2055 		ncbuckethash = nchash;
2056 	bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
2057 	    M_WAITOK | M_ZERO);
2058 	for (i = 0; i < numbucketlocks; i++)
2059 		rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE);
2060 	ncvnodehash = ncbuckethash;
2061 	vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
2062 	    M_WAITOK | M_ZERO);
2063 	for (i = 0; i < numvnodelocks; i++)
2064 		mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
2065 	ncpurgeminvnodes = numbucketlocks * 2;
2066 
2067 	ncneghash = 3;
2068 	neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE,
2069 	    M_WAITOK | M_ZERO);
2070 	for (i = 0; i < numneglists; i++) {
2071 		mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
2072 		TAILQ_INIT(&neglists[i].nl_list);
2073 	}
2074 	mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF);
2075 	TAILQ_INIT(&ncneg_hot.nl_list);
2076 
2077 	mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF);
2078 }
2079 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
2080 
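/*
 * Initialize the namecache-related fields of a freshly allocated vnode.
 */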
2081 void
2082 cache_vnode_init(struct vnode *vp)
2083 {
2084 
2085 	LIST_INIT(&vp->v_cache_src);
2086 	TAILQ_INIT(&vp->v_cache_dst);
2087 	vp->v_cache_dd = NULL;
2088 	cache_prehash(vp);
2089 }
2090 
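/*
 * Resize the namecache hash table in response to a new vnode limit,
 * rehashing all existing entries into the replacement table.
 */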
2091 void
2092 cache_changesize(u_long newmaxvnodes)
2093 {
2094 	struct nchashhead *new_nchashtbl, *old_nchashtbl;
2095 	u_long new_nchash, old_nchash;
2096 	struct namecache *ncp;
2097 	uint32_t hash;
2098 	u_long newncsize;
2099 	int i;
2100 
2101 	newncsize = newmaxvnodes * ncsizefactor;
2102 	newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
2103 	if (newmaxvnodes < numbucketlocks)
2104 		newmaxvnodes = numbucketlocks;
2105 
2106 	new_nchashtbl = hashinit(newmaxvnodes, M_VFSCACHE, &new_nchash);
2107 	/* If same hash table size, nothing to do */
2108 	if (nchash == new_nchash) {
2109 		free(new_nchashtbl, M_VFSCACHE);
2110 		return;
2111 	}
2112 	/*
2113 	 * Move everything from the old hash table to the new table.
	 * None of the namecache entries can be removed out from under us,
	 * because doing so requires taking them off the hash table and we
	 * hold all of the relevant locks.
2116 	 */
2117 	cache_lock_all_vnodes();
2118 	cache_lock_all_buckets();
2119 	old_nchashtbl = nchashtbl;
2120 	old_nchash = nchash;
2121 	nchashtbl = new_nchashtbl;
2122 	nchash = new_nchash;
2123 	for (i = 0; i <= old_nchash; i++) {
2124 		while ((ncp = CK_LIST_FIRST(&old_nchashtbl[i])) != NULL) {
2125 			hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
2126 			    ncp->nc_dvp);
2127 			CK_LIST_REMOVE(ncp, nc_hash);
2128 			CK_LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
2129 		}
2130 	}
2131 	ncsize = newncsize;
2132 	cache_unlock_all_buckets();
2133 	cache_unlock_all_vnodes();
2134 	free(old_nchashtbl, M_VFSCACHE);
2135 }
2136 
2137 /*
2138  * Invalidate all entries from and to a particular vnode.
2139  */
2140 void
2141 cache_purge(struct vnode *vp)
2142 {
2143 	TAILQ_HEAD(, namecache) ncps;
2144 	struct namecache *ncp, *nnp;
2145 	struct mtx *vlp, *vlp2;
2146 
2147 	CTR1(KTR_VFS, "cache_purge(%p)", vp);
2148 	SDT_PROBE1(vfs, namecache, purge, done, vp);
2149 	if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2150 	    vp->v_cache_dd == NULL)
2151 		return;
2152 	TAILQ_INIT(&ncps);
2153 	vlp = VP2VNODELOCK(vp);
2154 	vlp2 = NULL;
2155 	mtx_lock(vlp);
2156 retry:
2157 	while (!LIST_EMPTY(&vp->v_cache_src)) {
2158 		ncp = LIST_FIRST(&vp->v_cache_src);
2159 		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2160 			goto retry;
2161 		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2162 	}
2163 	while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
2164 		ncp = TAILQ_FIRST(&vp->v_cache_dst);
2165 		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2166 			goto retry;
2167 		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2168 	}
2169 	ncp = vp->v_cache_dd;
2170 	if (ncp != NULL) {
2171 		KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
2172 		   ("lost dotdot link"));
2173 		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2174 			goto retry;
2175 		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2176 	}
2177 	KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
2178 	mtx_unlock(vlp);
2179 	if (vlp2 != NULL)
2180 		mtx_unlock(vlp2);
2181 	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2182 		cache_free(ncp);
2183 	}
2184 }
2185 
2186 /*
2187  * Invalidate all negative entries for a particular directory vnode.
2188  */
2189 void
2190 cache_purge_negative(struct vnode *vp)
2191 {
2192 	TAILQ_HEAD(, namecache) ncps;
2193 	struct namecache *ncp, *nnp;
2194 	struct mtx *vlp;
2195 
2196 	CTR1(KTR_VFS, "cache_purge_negative(%p)", vp);
2197 	SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
2198 	if (LIST_EMPTY(&vp->v_cache_src))
2199 		return;
2200 	TAILQ_INIT(&ncps);
2201 	vlp = VP2VNODELOCK(vp);
2202 	mtx_lock(vlp);
2203 	LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
2204 		if (!(ncp->nc_flag & NCF_NEGATIVE))
2205 			continue;
2206 		cache_zap_negative_locked_vnode_kl(ncp, vp);
2207 		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2208 	}
2209 	mtx_unlock(vlp);
2210 	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2211 		cache_free(ncp);
2212 	}
2213 }
2214 
2215 /*
2216  * Flush all entries referencing a particular filesystem.
2217  */
2218 void
2219 cache_purgevfs(struct mount *mp, bool force)
2220 {
2221 	TAILQ_HEAD(, namecache) ncps;
2222 	struct mtx *vlp1, *vlp2;
2223 	struct rwlock *blp;
2224 	struct nchashhead *bucket;
2225 	struct namecache *ncp, *nnp;
2226 	u_long i, j, n_nchash;
2227 	int error;
2228 
2229 	/* Scan hash tables for applicable entries */
2230 	SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
2231 	if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes)
2232 		return;
2233 	TAILQ_INIT(&ncps);
2234 	n_nchash = nchash + 1;
2235 	vlp1 = vlp2 = NULL;
2236 	for (i = 0; i < numbucketlocks; i++) {
2237 		blp = (struct rwlock *)&bucketlocks[i];
2238 		rw_wlock(blp);
2239 		for (j = i; j < n_nchash; j += numbucketlocks) {
2240 retry:
2241 			bucket = &nchashtbl[j];
2242 			CK_LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) {
2243 				cache_assert_bucket_locked(ncp, RA_WLOCKED);
2244 				if (ncp->nc_dvp->v_mount != mp)
2245 					continue;
2246 				error = cache_zap_wlocked_bucket_kl(ncp, blp,
2247 				    &vlp1, &vlp2);
2248 				if (error != 0)
2249 					goto retry;
2250 				TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst);
2251 			}
2252 		}
2253 		rw_wunlock(blp);
2254 		if (vlp1 == NULL && vlp2 == NULL)
2255 			cache_maybe_yield();
2256 	}
2257 	if (vlp1 != NULL)
2258 		mtx_unlock(vlp1);
2259 	if (vlp2 != NULL)
2260 		mtx_unlock(vlp2);
2261 
2262 	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2263 		cache_free(ncp);
2264 	}
2265 }
2266 
2267 /*
 * Perform canonical checks and a cache lookup, passing the request on to the
 * filesystem through VOP_CACHEDLOOKUP() only if needed.
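 *
 * cache_lookup() returns 0 on a miss (in which case the filesystem lookup
 * is performed), -1 on a positive hit with *vpp filled in, and an errno
 * (typically ENOENT) for a cached negative entry, which is returned to the
 * caller as-is.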
2270  */
2271 
2272 int
2273 vfs_cache_lookup(struct vop_lookup_args *ap)
2274 {
2275 	struct vnode *dvp;
2276 	int error;
2277 	struct vnode **vpp = ap->a_vpp;
2278 	struct componentname *cnp = ap->a_cnp;
2279 	int flags = cnp->cn_flags;
2280 
2281 	*vpp = NULL;
2282 	dvp = ap->a_dvp;
2283 
2284 	if (dvp->v_type != VDIR)
2285 		return (ENOTDIR);
2286 
2287 	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
2288 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
2289 		return (EROFS);
2290 
2291 	error = vn_dir_check_exec(dvp, cnp);
2292 	if (error != 0)
2293 		return (error);
2294 
2295 	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
2296 	if (error == 0)
2297 		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
2298 	if (error == -1)
2299 		return (0);
2300 	return (error);
2301 }
2302 
2303 /* Implementation of the getcwd syscall. */
2304 int
2305 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
2306 {
2307 	char *buf, *retbuf;
2308 	size_t buflen;
2309 	int error;
2310 
2311 	buflen = uap->buflen;
2312 	if (__predict_false(buflen < 2))
2313 		return (EINVAL);
2314 	if (buflen > MAXPATHLEN)
2315 		buflen = MAXPATHLEN;
2316 
2317 	buf = malloc(buflen, M_TEMP, M_WAITOK);
2318 	error = vn_getcwd(td, buf, &retbuf, &buflen);
2319 	if (error == 0)
2320 		error = copyout(retbuf, uap->buf, buflen);
2321 	free(buf, M_TEMP);
2322 	return (error);
2323 }
2324 
2325 int
2326 vn_getcwd(struct thread *td, char *buf, char **retbuf, size_t *buflen)
2327 {
2328 	struct pwd *pwd;
2329 	int error;
2330 
2331 	pwd = pwd_hold(td);
2332 	error = vn_fullpath_any(td, pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, buflen);
2333 	pwd_drop(pwd);
2334 
2335 #ifdef KTRACE
2336 	if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
2337 		ktrnamei(*retbuf);
2338 #endif
2339 	return (error);
2340 }
2341 
2342 static int
2343 kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
2344     size_t size, int flags, enum uio_seg pathseg)
2345 {
2346 	struct nameidata nd;
2347 	char *retbuf, *freebuf;
2348 	int error;
2349 
2350 	if (flags != 0)
2351 		return (EINVAL);
2352 	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1,
2353 	    pathseg, path, fd, &cap_fstat_rights, td);
2354 	if ((error = namei(&nd)) != 0)
2355 		return (error);
2356 	error = vn_fullpath_hardlink(td, &nd, &retbuf, &freebuf, &size);
2357 	if (error == 0) {
2358 		error = copyout(retbuf, buf, size);
2359 		free(freebuf, M_TEMP);
2360 	}
2361 	NDFREE(&nd, 0);
2362 	return (error);
2363 }
2364 
2365 int
2366 sys___realpathat(struct thread *td, struct __realpathat_args *uap)
2367 {
2368 
2369 	return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
2370 	    uap->flags, UIO_USERSPACE));
2371 }
2372 
2373 /*
 * Retrieve the full filesystem path that corresponds to a vnode from the name
 * cache (if available).
2376  */
2377 int
2378 vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
2379 {
2380 	struct pwd *pwd;
2381 	char *buf;
2382 	size_t buflen;
2383 	int error;
2384 
2385 	if (__predict_false(vn == NULL))
2386 		return (EINVAL);
2387 
2388 	buflen = MAXPATHLEN;
2389 	buf = malloc(buflen, M_TEMP, M_WAITOK);
2390 	pwd = pwd_hold(td);
2391 	error = vn_fullpath_any(td, vn, pwd->pwd_rdir, buf, retbuf, &buflen);
2392 	pwd_drop(pwd);
2393 
2394 	if (!error)
2395 		*freebuf = buf;
2396 	else
2397 		free(buf, M_TEMP);
2398 	return (error);
2399 }
2400 
2401 /*
2402  * This function is similar to vn_fullpath, but it attempts to lookup the
2403  * pathname relative to the global root mount point.  This is required for the
2404  * auditing sub-system, as audited pathnames must be absolute, relative to the
2405  * global root mount point.
2406  */
2407 int
2408 vn_fullpath_global(struct thread *td, struct vnode *vn,
2409     char **retbuf, char **freebuf)
2410 {
2411 	char *buf;
2412 	size_t buflen;
2413 	int error;
2414 
2415 	if (__predict_false(vn == NULL))
2416 		return (EINVAL);
2417 	buflen = MAXPATHLEN;
2418 	buf = malloc(buflen, M_TEMP, M_WAITOK);
2419 	error = vn_fullpath_any(td, vn, rootvnode, buf, retbuf, &buflen);
2420 	if (!error)
2421 		*freebuf = buf;
2422 	else
2423 		free(buf, M_TEMP);
2424 	return (error);
2425 }
2426 
2427 int
2428 vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, size_t *buflen)
2429 {
2430 	struct vnode *dvp;
2431 	struct namecache *ncp;
2432 	struct mtx *vlp;
2433 	int error;
2434 
2435 	vlp = VP2VNODELOCK(*vp);
2436 	mtx_lock(vlp);
2437 	TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) {
2438 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2439 			break;
2440 	}
2441 	if (ncp != NULL) {
2442 		if (*buflen < ncp->nc_nlen) {
2443 			mtx_unlock(vlp);
2444 			vrele(*vp);
2445 			counter_u64_add(numfullpathfail4, 1);
2446 			error = ENOMEM;
2447 			SDT_PROBE3(vfs, namecache, fullpath, return, error,
2448 			    vp, NULL);
2449 			return (error);
2450 		}
2451 		*buflen -= ncp->nc_nlen;
2452 		memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
2453 		SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
2454 		    ncp->nc_name, vp);
2455 		dvp = *vp;
2456 		*vp = ncp->nc_dvp;
2457 		vref(*vp);
2458 		mtx_unlock(vlp);
2459 		vrele(dvp);
2460 		return (0);
2461 	}
2462 	SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
2463 
2464 	mtx_unlock(vlp);
2465 	vn_lock(*vp, LK_SHARED | LK_RETRY);
2466 	error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen);
2467 	vput(*vp);
2468 	if (error) {
2469 		counter_u64_add(numfullpathfail2, 1);
2470 		SDT_PROBE3(vfs, namecache, fullpath, return,  error, vp, NULL);
2471 		return (error);
2472 	}
2473 
2474 	*vp = dvp;
2475 	if (VN_IS_DOOMED(dvp)) {
2476 		/* forced unmount */
2477 		vrele(dvp);
2478 		error = ENOENT;
2479 		SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
2480 		return (error);
2481 	}
2482 	/*
	 * *vp still has its use count incremented.
2484 	 */
2485 
2486 	return (0);
2487 }
2488 
2489 /*
2490  * Resolve a directory to a pathname.
2491  *
2492  * The name of the directory can always be found in the namecache or fetched
2493  * from the filesystem. There is also guaranteed to be only one parent, meaning
2494  * we can just follow vnodes up until we find the root.
2495  *
2496  * The vnode must be referenced.
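 *
 * The buffer is filled from the end towards the beginning: each resolved
 * name is prepended in front of the previously copied ones, so on success
 * *retbuf points into the middle of buf rather than at its start.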
2497  */
2498 static int
2499 vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir,
2500     char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend)
2501 {
2502 #ifdef KDTRACE_HOOKS
2503 	struct vnode *startvp = vp;
2504 #endif
2505 	struct vnode *vp1;
2506 	size_t buflen;
2507 	int error;
2508 
2509 	VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
2510 	VNPASS(vp->v_usecount > 0, vp);
2511 
2512 	buflen = *len;
2513 
2514 	if (!slash_prefixed) {
2515 		MPASS(*len >= 2);
2516 		buflen--;
2517 		buf[buflen] = '\0';
2518 	}
2519 
2520 	error = 0;
2521 
2522 	SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
2523 	counter_u64_add(numfullpathcalls, 1);
2524 	while (vp != rdir && vp != rootvnode) {
2525 		/*
		 * The vp vnode must already be fully constructed,
		 * since it is either found in the namecache or obtained
2528 		 * from VOP_VPTOCNP().  We may test for VV_ROOT safely
2529 		 * without obtaining the vnode lock.
2530 		 */
2531 		if ((vp->v_vflag & VV_ROOT) != 0) {
2532 			vn_lock(vp, LK_RETRY | LK_SHARED);
2533 
2534 			/*
2535 			 * With the vnode locked, check for races with
2536 			 * unmount, forced or not.  Note that we
2537 			 * already verified that vp is not equal to
2538 			 * the root vnode, which means that
2539 			 * mnt_vnodecovered can be NULL only for the
2540 			 * case of unmount.
2541 			 */
2542 			if (VN_IS_DOOMED(vp) ||
2543 			    (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
2544 			    vp1->v_mountedhere != vp->v_mount) {
2545 				vput(vp);
2546 				error = ENOENT;
2547 				SDT_PROBE3(vfs, namecache, fullpath, return,
2548 				    error, vp, NULL);
2549 				break;
2550 			}
2551 
2552 			vref(vp1);
2553 			vput(vp);
2554 			vp = vp1;
2555 			continue;
2556 		}
2557 		if (vp->v_type != VDIR) {
2558 			vrele(vp);
2559 			counter_u64_add(numfullpathfail1, 1);
2560 			error = ENOTDIR;
2561 			SDT_PROBE3(vfs, namecache, fullpath, return,
2562 			    error, vp, NULL);
2563 			break;
2564 		}
2565 		error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen);
2566 		if (error)
2567 			break;
2568 		if (buflen == 0) {
2569 			vrele(vp);
2570 			error = ENOMEM;
2571 			SDT_PROBE3(vfs, namecache, fullpath, return, error,
2572 			    startvp, NULL);
2573 			break;
2574 		}
2575 		buf[--buflen] = '/';
2576 		slash_prefixed = true;
2577 	}
2578 	if (error)
2579 		return (error);
2580 	if (!slash_prefixed) {
2581 		if (buflen == 0) {
2582 			vrele(vp);
2583 			counter_u64_add(numfullpathfail4, 1);
2584 			SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
2585 			    startvp, NULL);
2586 			return (ENOMEM);
2587 		}
2588 		buf[--buflen] = '/';
2589 	}
2590 	counter_u64_add(numfullpathfound, 1);
2591 	vrele(vp);
2592 
2593 	*retbuf = buf + buflen;
2594 	SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
2595 	*len -= buflen;
2596 	*len += addend;
2597 	return (0);
2598 }
2599 
2600 /*
2601  * Resolve an arbitrary vnode to a pathname.
2602  *
2603  * Note 2 caveats:
2604  * - hardlinks are not tracked, thus if the vnode is not a directory this can
2605  *   resolve to a different path than the one used to find it
2606  * - namecache is not mandatory, meaning names are not guaranteed to be added
2607  *   (in which case resolving fails)
2608  */
2609 static int
2610 vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir,
2611     char *buf, char **retbuf, size_t *buflen)
2612 {
2613 	size_t orig_buflen;
2614 	bool slash_prefixed;
2615 	int error;
2616 
2617 	if (*buflen < 2)
2618 		return (EINVAL);
2619 
2620 	orig_buflen = *buflen;
2621 
2622 	vref(vp);
2623 	slash_prefixed = false;
2624 	if (vp->v_type != VDIR) {
2625 		*buflen -= 1;
2626 		buf[*buflen] = '\0';
2627 		error = vn_vptocnp(&vp, td->td_ucred, buf, buflen);
2628 		if (error)
2629 			return (error);
2630 		if (*buflen == 0) {
2631 			vrele(vp);
2632 			return (ENOMEM);
2633 		}
2634 		*buflen -= 1;
2635 		buf[*buflen] = '/';
2636 		slash_prefixed = true;
2637 	}
2638 
2639 	return (vn_fullpath_dir(td, vp, rdir, buf, retbuf, buflen, slash_prefixed,
2640 	    orig_buflen - *buflen));
2641 }
2642 
2643 /*
2644  * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
2645  *
 * Since the namecache does not track hardlinks, the caller is expected to first
2647  * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei.
2648  *
2649  * Then we have 2 cases:
2650  * - if the found vnode is a directory, the path can be constructed just by
 *   following names up the chain
2652  * - otherwise we populate the buffer with the saved name and start resolving
2653  *   from the parent
2654  */
2655 static int
2656 vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf,
2657     char **freebuf, size_t *buflen)
2658 {
2659 	char *buf, *tmpbuf;
2660 	struct pwd *pwd;
2661 	struct componentname *cnp;
2662 	struct vnode *vp;
2663 	size_t addend;
2664 	int error;
2665 	bool slash_prefixed;
2666 
2667 	if (*buflen < 2)
2668 		return (EINVAL);
2669 	if (*buflen > MAXPATHLEN)
2670 		*buflen = MAXPATHLEN;
2671 
2672 	slash_prefixed = false;
2673 
2674 	buf = malloc(*buflen, M_TEMP, M_WAITOK);
2675 	pwd = pwd_hold(td);
2676 
2677 	addend = 0;
2678 	vp = ndp->ni_vp;
2679 	if (vp->v_type != VDIR) {
2680 		cnp = &ndp->ni_cnd;
2681 		addend = cnp->cn_namelen + 2;
2682 		if (*buflen < addend) {
2683 			error = ENOMEM;
2684 			goto out_bad;
2685 		}
2686 		*buflen -= addend;
2687 		tmpbuf = buf + *buflen;
2688 		tmpbuf[0] = '/';
2689 		memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen);
2690 		tmpbuf[addend - 1] = '\0';
2691 		slash_prefixed = true;
2692 		vp = ndp->ni_dvp;
2693 	}
2694 
2695 	vref(vp);
2696 	error = vn_fullpath_dir(td, vp, pwd->pwd_rdir, buf, retbuf, buflen,
2697 	    slash_prefixed, addend);
2698 	if (error != 0)
2699 		goto out_bad;
2700 
2701 	pwd_drop(pwd);
2702 	*freebuf = buf;
2703 
2704 	return (0);
2705 out_bad:
2706 	pwd_drop(pwd);
2707 	free(buf, M_TEMP);
2708 	return (error);
2709 }
2710 
2711 struct vnode *
2712 vn_dir_dd_ino(struct vnode *vp)
2713 {
2714 	struct namecache *ncp;
2715 	struct vnode *ddvp;
2716 	struct mtx *vlp;
2717 	enum vgetstate vs;
2718 
2719 	ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
2720 	vlp = VP2VNODELOCK(vp);
2721 	mtx_lock(vlp);
2722 	TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
2723 		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
2724 			continue;
2725 		ddvp = ncp->nc_dvp;
2726 		vs = vget_prep(ddvp);
2727 		mtx_unlock(vlp);
2728 		if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
2729 			return (NULL);
2730 		return (ddvp);
2731 	}
2732 	mtx_unlock(vlp);
2733 	return (NULL);
2734 }
2735 
2736 int
2737 vn_commname(struct vnode *vp, char *buf, u_int buflen)
2738 {
2739 	struct namecache *ncp;
2740 	struct mtx *vlp;
2741 	int l;
2742 
2743 	vlp = VP2VNODELOCK(vp);
2744 	mtx_lock(vlp);
2745 	TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
2746 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2747 			break;
2748 	if (ncp == NULL) {
2749 		mtx_unlock(vlp);
2750 		return (ENOENT);
2751 	}
2752 	l = min(ncp->nc_nlen, buflen - 1);
2753 	memcpy(buf, ncp->nc_name, l);
2754 	mtx_unlock(vlp);
2755 	buf[l] = '\0';
2756 	return (0);
2757 }
2758 
2759 /*
 * This function updates the path string to the vnode's full global path
2761  * and checks the size of the new path string against the pathlen argument.
2762  *
2763  * Requires a locked, referenced vnode.
2764  * Vnode is re-locked on success or ENODEV, otherwise unlocked.
2765  *
2766  * If vp is a directory, the call to vn_fullpath_global() always succeeds
2767  * because it falls back to the ".." lookup if the namecache lookup fails.
2768  */
2769 int
2770 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
2771     u_int pathlen)
2772 {
2773 	struct nameidata nd;
2774 	struct vnode *vp1;
2775 	char *rpath, *fbuf;
2776 	int error;
2777 
2778 	ASSERT_VOP_ELOCKED(vp, __func__);
2779 
2780 	/* Construct global filesystem path from vp. */
2781 	VOP_UNLOCK(vp);
2782 	error = vn_fullpath_global(td, vp, &rpath, &fbuf);
2783 
2784 	if (error != 0) {
2785 		vrele(vp);
2786 		return (error);
2787 	}
2788 
2789 	if (strlen(rpath) >= pathlen) {
2790 		vrele(vp);
2791 		error = ENAMETOOLONG;
2792 		goto out;
2793 	}
2794 
2795 	/*
2796 	 * Re-lookup the vnode by path to detect a possible rename.
2797 	 * As a side effect, the vnode is relocked.
	 * If the vnode was renamed, return ENOENT.
2799 	 */
2800 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
2801 	    UIO_SYSSPACE, path, td);
2802 	error = namei(&nd);
2803 	if (error != 0) {
2804 		vrele(vp);
2805 		goto out;
2806 	}
2807 	NDFREE(&nd, NDF_ONLY_PNBUF);
2808 	vp1 = nd.ni_vp;
2809 	vrele(vp);
2810 	if (vp1 == vp)
2811 		strcpy(path, rpath);
2812 	else {
2813 		vput(vp1);
2814 		error = ENOENT;
2815 	}
2816 
2817 out:
2818 	free(fbuf, M_TEMP);
2819 	return (error);
2820 }
2821 
2822 #ifdef DDB
2823 static void
2824 db_print_vpath(struct vnode *vp)
2825 {
2826 
2827 	while (vp != NULL) {
2828 		db_printf("%p: ", vp);
2829 		if (vp == rootvnode) {
2830 			db_printf("/");
2831 			vp = NULL;
2832 		} else {
2833 			if (vp->v_vflag & VV_ROOT) {
2834 				db_printf("<mount point>");
2835 				vp = vp->v_mount->mnt_vnodecovered;
2836 			} else {
2837 				struct namecache *ncp;
2838 				char *ncn;
2839 				int i;
2840 
2841 				ncp = TAILQ_FIRST(&vp->v_cache_dst);
2842 				if (ncp != NULL) {
2843 					ncn = ncp->nc_name;
2844 					for (i = 0; i < ncp->nc_nlen; i++)
2845 						db_printf("%c", *ncn++);
2846 					vp = ncp->nc_dvp;
2847 				} else {
2848 					vp = NULL;
2849 				}
2850 			}
2851 		}
2852 		db_printf("\n");
2853 	}
2854 
2855 	return;
2856 }
2857 
2858 DB_SHOW_COMMAND(vpath, db_show_vpath)
2859 {
2860 	struct vnode *vp;
2861 
2862 	if (!have_addr) {
2863 		db_printf("usage: show vpath <struct vnode *>\n");
2864 		return;
2865 	}
2866 
2867 	vp = (struct vnode *)addr;
2868 	db_print_vpath(vp);
2869 }
2870 
2871 #endif
2872 
2873 extern uma_zone_t namei_zone;
2874 
2875 static bool __read_frequently cache_fast_lookup = true;
2876 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW,
2877     &cache_fast_lookup, 0, "");
2878 
2879 #define CACHE_FPL_FAILED	-2020
2880 
2881 static void
2882 cache_fpl_cleanup_cnp(struct componentname *cnp)
2883 {
2884 
2885 	uma_zfree(namei_zone, cnp->cn_pnbuf);
2886 #ifdef DIAGNOSTIC
2887 	cnp->cn_pnbuf = NULL;
2888 	cnp->cn_nameptr = NULL;
2889 #endif
2890 }
2891 
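/*
 * An absolute path starts the walk at the root directory: skip over any
 * leading slashes, adjusting the remaining path length, and hand back
 * ni_rootdir as the starting directory.
 */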
2892 static void
2893 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp)
2894 {
2895 	struct componentname *cnp;
2896 
2897 	cnp = &ndp->ni_cnd;
2898 	while (*(cnp->cn_nameptr) == '/') {
2899 		cnp->cn_nameptr++;
2900 		ndp->ni_pathlen--;
2901 	}
2902 
2903 	*dpp = ndp->ni_rootdir;
2904 }
2905 
2906 /*
2907  * Components of nameidata (or objects it can point to) which may
 * need restoring in case the fast path lookup fails.
2909  */
2910 struct nameidata_saved {
2911 	long cn_namelen;
2912 	char *cn_nameptr;
2913 	size_t ni_pathlen;
2914 	int cn_flags;
2915 };
2916 
2917 struct cache_fpl {
2918 	struct nameidata *ndp;
2919 	struct componentname *cnp;
2920 	struct pwd *pwd;
2921 	struct vnode *dvp;
2922 	struct vnode *tvp;
2923 	seqc_t dvp_seqc;
2924 	seqc_t tvp_seqc;
2925 	struct nameidata_saved snd;
2926 	int line;
2927 	enum cache_fpl_status status:8;
2928 	bool in_smr;
2929 };
2930 
2931 static void
2932 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd)
2933 {
2934 
2935 	snd->cn_flags = fpl->ndp->ni_cnd.cn_flags;
2936 	snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen;
2937 	snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
2938 	snd->ni_pathlen = fpl->ndp->ni_pathlen;
2939 }
2940 
2941 static void
2942 cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd)
2943 {
2944 
2945 	fpl->ndp->ni_cnd.cn_flags = snd->cn_flags;
2946 	fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen;
2947 	fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr;
2948 	fpl->ndp->ni_pathlen = snd->ni_pathlen;
2949 }
2950 
2951 #ifdef INVARIANTS
2952 #define cache_fpl_smr_assert_entered(fpl) ({			\
2953 	struct cache_fpl *_fpl = (fpl);				\
2954 	MPASS(_fpl->in_smr == true);				\
2955 	VFS_SMR_ASSERT_ENTERED();				\
2956 })
2957 #define cache_fpl_smr_assert_not_entered(fpl) ({		\
2958 	struct cache_fpl *_fpl = (fpl);				\
2959 	MPASS(_fpl->in_smr == false);				\
2960 	VFS_SMR_ASSERT_NOT_ENTERED();				\
2961 })
2962 #else
2963 #define cache_fpl_smr_assert_entered(fpl) do { } while (0)
2964 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
2965 #endif
2966 
2967 #define cache_fpl_smr_enter(fpl) ({				\
2968 	struct cache_fpl *_fpl = (fpl);				\
2969 	MPASS(_fpl->in_smr == false);				\
2970 	vfs_smr_enter();					\
2971 	_fpl->in_smr = true;					\
2972 })
2973 
2974 #define cache_fpl_smr_exit(fpl) ({				\
2975 	struct cache_fpl *_fpl = (fpl);				\
2976 	MPASS(_fpl->in_smr == true);				\
2977 	vfs_smr_exit();						\
2978 	_fpl->in_smr = false;					\
2979 })
2980 
2981 static int
2982 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
2983 {
2984 
2985 	if (fpl->status != CACHE_FPL_STATUS_UNSET) {
2986 		KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
2987 		    ("%s: converting to abort from %d at %d, set at %d\n",
2988 		    __func__, fpl->status, line, fpl->line));
2989 	}
2990 	fpl->status = CACHE_FPL_STATUS_ABORTED;
2991 	fpl->line = line;
2992 	return (CACHE_FPL_FAILED);
2993 }
2994 
2995 #define cache_fpl_aborted(x)	cache_fpl_aborted_impl((x), __LINE__)
2996 
2997 static int
2998 cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
2999 {
3000 
3001 	KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3002 	    ("%s: setting to partial at %d, but already set to %d at %d\n",
3003 	    __func__, line, fpl->status, fpl->line));
3004 	cache_fpl_smr_assert_entered(fpl);
3005 	fpl->status = CACHE_FPL_STATUS_PARTIAL;
3006 	fpl->line = line;
3007 	return (CACHE_FPL_FAILED);
3008 }
3009 
3010 #define cache_fpl_partial(x)	cache_fpl_partial_impl((x), __LINE__)
3011 
3012 static int
3013 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line)
3014 {
3015 
3016 	KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3017 	    ("%s: setting to handled at %d, but already set to %d at %d\n",
3018 	    __func__, line, fpl->status, fpl->line));
3019 	cache_fpl_smr_assert_not_entered(fpl);
3020 	MPASS(error != CACHE_FPL_FAILED);
3021 	fpl->status = CACHE_FPL_STATUS_HANDLED;
3022 	fpl->line = line;
3023 	return (error);
3024 }
3025 
3026 #define cache_fpl_handled(x, e)	cache_fpl_handled_impl((x), (e), __LINE__)
3027 
3028 #define CACHE_FPL_SUPPORTED_CN_FLAGS \
3029 	(LOCKLEAF | LOCKPARENT | WANTPARENT | FOLLOW | LOCKSHARED | SAVENAME | \
3030 	 ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2)
3031 
3032 static bool
3033 cache_can_fplookup(struct cache_fpl *fpl)
3034 {
3035 	struct nameidata *ndp;
3036 	struct componentname *cnp;
3037 	struct thread *td;
3038 
3039 	ndp = fpl->ndp;
3040 	cnp = fpl->cnp;
3041 	td = cnp->cn_thread;
3042 
3043 	if (!cache_fast_lookup) {
3044 		cache_fpl_aborted(fpl);
3045 		return (false);
3046 	}
3047 #ifdef MAC
3048 	if (mac_vnode_check_lookup_enabled()) {
3049 		cache_fpl_aborted(fpl);
3050 		return (false);
3051 	}
3052 #endif
3053 	if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
3054 		cache_fpl_aborted(fpl);
3055 		return (false);
3056 	}
3057 	if (cnp->cn_nameiop != LOOKUP) {
3058 		cache_fpl_aborted(fpl);
3059 		return (false);
3060 	}
3061 	if (ndp->ni_dirfd != AT_FDCWD) {
3062 		cache_fpl_aborted(fpl);
3063 		return (false);
3064 	}
3065 	if (IN_CAPABILITY_MODE(td)) {
3066 		cache_fpl_aborted(fpl);
3067 		return (false);
3068 	}
3069 	if (AUDITING_TD(td)) {
3070 		cache_fpl_aborted(fpl);
3071 		return (false);
3072 	}
3073 	if (ndp->ni_startdir != NULL) {
3074 		cache_fpl_aborted(fpl);
3075 		return (false);
3076 	}
3077 	return (true);
3078 }
3079 
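/*
 * The fast path does not handle symlinks; any other vnode type is fine.
 */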
3080 static bool
3081 cache_fplookup_vnode_supported(struct vnode *vp)
3082 {
3083 
3084 	return (vp->v_type != VLNK);
3085 }
3086 
3087 /*
3088  * Move a negative entry to the hot list.
3089  *
3090  * We have to take locks, but they may be contended and in the worst
3091  * case we may need to go off CPU. We don't want to spin within the
 * smr section and we can't block while holding it. Instead we exit the
 * section, take the locks and look up the entry again.
3094  */
3095 static int __noinline
3096 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
3097     uint32_t hash)
3098 {
3099 	struct componentname *cnp;
3100 	struct namecache *ncp;
3101 	struct neglist *neglist;
3102 	struct negstate *negstate;
3103 	struct vnode *dvp;
3104 	u_char nc_flag;
3105 
3106 	cnp = fpl->cnp;
3107 	dvp = fpl->dvp;
3108 
3109 	if (!vhold_smr(dvp))
3110 		return (cache_fpl_aborted(fpl));
3111 
3112 	neglist = NCP2NEGLIST(oncp);
3113 	cache_fpl_smr_exit(fpl);
3114 
3115 	mtx_lock(&ncneg_hot.nl_lock);
3116 	mtx_lock(&neglist->nl_lock);
3117 	/*
	 * Re-enter the SMR section so that the hash chain below can be
	 * iterated safely.
3119 	 */
3120 	cache_fpl_smr_enter(fpl);
3121 
3122 	/*
3123 	 * Avoid all surprises by only succeeding if we got the same entry and
3124 	 * bailing completely otherwise.
3125 	 *
3126 	 * In particular at this point there can be a new ncp which matches the
3127 	 * search but hashes to a different neglist.
3128 	 */
3129 	CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
3130 		if (ncp == oncp)
3131 			break;
3132 	}
3133 
3134 	/*
3135 	 * No match to begin with.
3136 	 */
3137 	if (__predict_false(ncp == NULL)) {
3138 		goto out_abort;
3139 	}
3140 
3141 	/*
3142 	 * The newly found entry may be something different...
3143 	 */
3144 	if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
3145 	    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
3146 		goto out_abort;
3147 	}
3148 
3149 	/*
3150 	 * ... and not even negative.
3151 	 */
3152 	nc_flag = atomic_load_char(&ncp->nc_flag);
3153 	if ((nc_flag & NCF_NEGATIVE) == 0) {
3154 		goto out_abort;
3155 	}
3156 
3157 	if (__predict_false(cache_ncp_invalid(ncp))) {
3158 		goto out_abort;
3159 	}
3160 
3161 	negstate = NCP2NEGSTATE(ncp);
3162 	if ((negstate->neg_flag & NEG_HOT) == 0) {
3163 		numhotneg++;
3164 		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
3165 		TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
3166 		negstate->neg_flag |= NEG_HOT;
3167 	}
3168 
3169 	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
3170 	counter_u64_add(numneghits, 1);
3171 	cache_fpl_smr_exit(fpl);
3172 	mtx_unlock(&neglist->nl_lock);
3173 	mtx_unlock(&ncneg_hot.nl_lock);
3174 	vdrop(dvp);
3175 	return (cache_fpl_handled(fpl, ENOENT));
3176 out_abort:
3177 	cache_fpl_smr_exit(fpl);
3178 	mtx_unlock(&neglist->nl_lock);
3179 	mtx_unlock(&ncneg_hot.nl_lock);
3180 	vdrop(dvp);
3181 	return (cache_fpl_aborted(fpl));
3182 }
3183 
3184 /*
 * The target vnode is not supported; prepare for the slow path to take over.
3186  */
3187 static int
3188 cache_fplookup_partial_setup(struct cache_fpl *fpl)
3189 {
3190 	struct componentname *cnp;
3191 	enum vgetstate dvs;
3192 	struct vnode *dvp;
3193 	struct pwd *pwd;
3194 	seqc_t dvp_seqc;
3195 
3196 	cnp = fpl->cnp;
3197 	dvp = fpl->dvp;
3198 	dvp_seqc = fpl->dvp_seqc;
3199 
3200 	dvs = vget_prep_smr(dvp);
3201 	if (dvs == VGET_NONE) {
3202 		cache_fpl_smr_exit(fpl);
3203 		return (cache_fpl_aborted(fpl));
3204 	}
3205 
3206 	cache_fpl_smr_exit(fpl);
3207 
3208 	vget_finish_ref(dvp, dvs);
3209 	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3210 		vrele(dvp);
3211 		return (cache_fpl_aborted(fpl));
3212 	}
3213 
3214 	pwd = pwd_hold(curthread);
3215 	if (fpl->pwd != pwd) {
3216 		vrele(dvp);
3217 		pwd_drop(pwd);
3218 		return (cache_fpl_aborted(fpl));
3219 	}
3220 
3221 	fpl->ndp->ni_startdir = dvp;
3222 	return (0);
3223 }
3224 
3225 static int
3226 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs)
3227 {
3228 	struct componentname *cnp;
3229 	struct vnode *tvp;
3230 	seqc_t tvp_seqc;
3231 	int error;
3232 
3233 	cnp = fpl->cnp;
3234 	tvp = fpl->tvp;
3235 	tvp_seqc = fpl->tvp_seqc;
3236 
3237 	if ((cnp->cn_flags & LOCKLEAF) != 0) {
3238 		error = vget_finish(tvp, cnp->cn_lkflags, tvs);
3239 		if (error != 0) {
3240 			return (cache_fpl_aborted(fpl));
3241 		}
3242 	} else {
3243 		vget_finish_ref(tvp, tvs);
3244 	}
3245 
3246 	if (!vn_seqc_consistent(tvp, tvp_seqc)) {
3247 		if ((cnp->cn_flags & LOCKLEAF) != 0)
3248 			vput(tvp);
3249 		else
3250 			vrele(tvp);
3251 		return (cache_fpl_aborted(fpl));
3252 	}
3253 
3254 	return (cache_fpl_handled(fpl, 0));
3255 }
3256 
3257 static int __noinline
3258 cache_fplookup_final_withparent(struct cache_fpl *fpl)
3259 {
3260 	enum vgetstate dvs, tvs;
3261 	struct componentname *cnp;
3262 	struct vnode *dvp, *tvp;
3263 	seqc_t dvp_seqc, tvp_seqc;
3264 	int error;
3265 
3266 	cnp = fpl->cnp;
3267 	dvp = fpl->dvp;
3268 	dvp_seqc = fpl->dvp_seqc;
3269 	tvp = fpl->tvp;
3270 	tvp_seqc = fpl->tvp_seqc;
3271 
3272 	MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0);
3273 
3274 	/*
	 * For simplicity this is less efficient than it could be.
3276 	 */
3277 	dvs = vget_prep_smr(dvp);
3278 	if (dvs == VGET_NONE) {
3279 		return (cache_fpl_aborted(fpl));
3280 	}
3281 	tvs = vget_prep_smr(tvp);
3282 	if (tvs == VGET_NONE) {
3283 		cache_fpl_smr_exit(fpl);
3284 		vget_abort(dvp, dvs);
3285 		return (cache_fpl_aborted(fpl));
3286 	}
3287 
3288 	cache_fpl_smr_exit(fpl);
3289 
3290 	if ((cnp->cn_flags & LOCKPARENT) != 0) {
3291 		error = vget_finish(dvp, LK_EXCLUSIVE, dvs);
3292 		if (error != 0) {
3293 			vget_abort(tvp, tvs);
3294 			return (cache_fpl_aborted(fpl));
3295 		}
3296 	} else {
3297 		vget_finish_ref(dvp, dvs);
3298 	}
3299 
3300 	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3301 		vget_abort(tvp, tvs);
3302 		if ((cnp->cn_flags & LOCKPARENT) != 0)
3303 			vput(dvp);
3304 		else
3305 			vrele(dvp);
		return (cache_fpl_aborted(fpl));
3308 	}
3309 
3310 	error = cache_fplookup_final_child(fpl, tvs);
3311 	if (error != 0) {
3312 		MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED);
3313 		if ((cnp->cn_flags & LOCKPARENT) != 0)
3314 			vput(dvp);
3315 		else
3316 			vrele(dvp);
3317 		return (error);
3318 	}
3319 
3320 	MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED);
3321 	return (0);
3322 }
3323 
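/*
 * The terminal vnode was found in the cache. Acquire the requested
 * references (and locks) and re-validate the sequence counters to make
 * sure nothing changed while no locks were held.
 */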
3324 static int
3325 cache_fplookup_final(struct cache_fpl *fpl)
3326 {
3327 	struct componentname *cnp;
3328 	enum vgetstate tvs;
3329 	struct vnode *dvp, *tvp;
3330 	seqc_t dvp_seqc, tvp_seqc;
3331 
3332 	cnp = fpl->cnp;
3333 	dvp = fpl->dvp;
3334 	dvp_seqc = fpl->dvp_seqc;
3335 	tvp = fpl->tvp;
3336 	tvp_seqc = fpl->tvp_seqc;
3337 
3338 	VNPASS(cache_fplookup_vnode_supported(dvp), dvp);
3339 
3340 	if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0)
3341 		return (cache_fplookup_final_withparent(fpl));
3342 
3343 	tvs = vget_prep_smr(tvp);
3344 	if (tvs == VGET_NONE) {
3345 		return (cache_fpl_partial(fpl));
3346 	}
3347 
3348 	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3349 		cache_fpl_smr_exit(fpl);
3350 		vget_abort(tvp, tvs);
3351 		return (cache_fpl_aborted(fpl));
3352 	}
3353 
3354 	cache_fpl_smr_exit(fpl);
3355 	return (cache_fplookup_final_child(fpl, tvs));
3356 }
3357 
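/*
 * Resolve the next path component against the namecache while within the
 * SMR section. "." is handled in place, a usable negative entry terminates
 * the lookup with ENOENT and anything missing or otherwise unusable punts
 * to the slow path.
 */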
3358 static int
3359 cache_fplookup_next(struct cache_fpl *fpl)
3360 {
3361 	struct componentname *cnp;
3362 	struct namecache *ncp;
3363 	struct negstate *negstate;
3364 	struct vnode *dvp, *tvp;
3365 	u_char nc_flag;
3366 	uint32_t hash;
3367 	bool neg_hot;
3368 
3369 	cnp = fpl->cnp;
3370 	dvp = fpl->dvp;
3371 
3372 	if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) {
3373 		fpl->tvp = dvp;
3374 		fpl->tvp_seqc = vn_seqc_read_any(dvp);
3375 		if (seqc_in_modify(fpl->tvp_seqc)) {
3376 			return (cache_fpl_aborted(fpl));
3377 		}
3378 		return (0);
3379 	}
3380 
3381 	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
3382 
3383 	CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
3384 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
3385 		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
3386 			break;
3387 	}
3388 
3389 	/*
	 * If there is no entry we have to punt to the slow path to perform
	 * the actual lookup. Should there be nothing with this name, a
	 * negative entry will be created.
3393 	 */
3394 	if (__predict_false(ncp == NULL)) {
3395 		return (cache_fpl_partial(fpl));
3396 	}
3397 
3398 	tvp = atomic_load_ptr(&ncp->nc_vp);
3399 	nc_flag = atomic_load_char(&ncp->nc_flag);
3400 	if ((nc_flag & NCF_NEGATIVE) != 0) {
3401 		negstate = NCP2NEGSTATE(ncp);
3402 		neg_hot = ((negstate->neg_flag & NEG_HOT) != 0);
3403 		if (__predict_false(cache_ncp_invalid(ncp))) {
3404 			return (cache_fpl_partial(fpl));
3405 		}
3406 		if (__predict_false((nc_flag & NCF_WHITE) != 0)) {
3407 			return (cache_fpl_partial(fpl));
3408 		}
3409 		if (!neg_hot) {
3410 			return (cache_fplookup_negative_promote(fpl, ncp, hash));
3411 		}
3412 		SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
3413 		    ncp->nc_name);
3414 		counter_u64_add(numneghits, 1);
3415 		cache_fpl_smr_exit(fpl);
3416 		return (cache_fpl_handled(fpl, ENOENT));
3417 	}
3418 
3419 	if (__predict_false(cache_ncp_invalid(ncp))) {
3420 		return (cache_fpl_partial(fpl));
3421 	}
3422 
3423 	fpl->tvp = tvp;
3424 	fpl->tvp_seqc = vn_seqc_read_any(tvp);
3425 	if (seqc_in_modify(fpl->tvp_seqc)) {
3426 		return (cache_fpl_partial(fpl));
3427 	}
3428 
3429 	if (!cache_fplookup_vnode_supported(tvp)) {
3430 		return (cache_fpl_partial(fpl));
3431 	}
3432 
3433 	counter_u64_add(numposhits, 1);
3434 	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
3435 	return (0);
3436 }
3437 
3438 static bool
3439 cache_fplookup_mp_supported(struct mount *mp)
3440 {
3441 
3442 	if (mp == NULL)
3443 		return (false);
3444 	if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
3445 		return (false);
3446 	if ((mp->mnt_flag & MNT_UNION) != 0)
3447 		return (false);
3448 	return (true);
3449 }
3450 
3451 /*
3452  * Walk up the mount stack (if any).
3453  *
3454  * Correctness is provided in the following ways:
3455  * - all vnodes are protected from freeing with SMR
3456  * - struct mount objects are type stable making them always safe to access
3457  * - stability of the particular mount is provided by busying it
3458  * - relationship between the vnode which is mounted on and the mount is
3459  *   verified with the vnode sequence counter after busying
3460  * - association between root vnode of the mount and the mount is protected
3461  *   by busy
3462  *
3463  * From that point on we can read the sequence counter of the root vnode
3464  * and get the next mount on the stack (if any) using the same protection.
3465  *
 * By the end of a successful walk we are guaranteed the reached state was
 * indeed present at some point, matching the guarantee of the regular lookup.
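 *
 * For example, when the lookup crosses a covered vnode such as "/mnt", the
 * mount is busied, the covered vnode's sequence counter is re-checked and
 * traversal continues from the root vnode of the mounted filesystem,
 * repeating the process should that root be covered by yet another mount.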
3468  */
3469 static int __noinline
3470 cache_fplookup_climb_mount(struct cache_fpl *fpl)
3471 {
3472 	struct mount *mp, *prev_mp;
3473 	struct vnode *vp;
3474 	seqc_t vp_seqc;
3475 
3476 	vp = fpl->tvp;
3477 	vp_seqc = fpl->tvp_seqc;
3478 
3479 	VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp);
3480 	mp = atomic_load_ptr(&vp->v_mountedhere);
3481 	if (mp == NULL)
3482 		return (0);
3483 
3484 	prev_mp = NULL;
3485 	for (;;) {
3486 		if (!vfs_op_thread_enter(mp)) {
3487 			if (prev_mp != NULL)
3488 				vfs_op_thread_exit(prev_mp);
3489 			return (cache_fpl_partial(fpl));
3490 		}
3491 		if (prev_mp != NULL)
3492 			vfs_op_thread_exit(prev_mp);
3493 		if (!vn_seqc_consistent(vp, vp_seqc)) {
3494 			vfs_op_thread_exit(mp);
3495 			return (cache_fpl_partial(fpl));
3496 		}
3497 		if (!cache_fplookup_mp_supported(mp)) {
3498 			vfs_op_thread_exit(mp);
3499 			return (cache_fpl_partial(fpl));
3500 		}
3501 		vp = atomic_load_ptr(&mp->mnt_rootvnode);
3502 		if (vp == NULL || VN_IS_DOOMED(vp)) {
3503 			vfs_op_thread_exit(mp);
3504 			return (cache_fpl_partial(fpl));
3505 		}
3506 		vp_seqc = vn_seqc_read_any(vp);
3507 		if (seqc_in_modify(vp_seqc)) {
3508 			vfs_op_thread_exit(mp);
3509 			return (cache_fpl_partial(fpl));
3510 		}
3511 		prev_mp = mp;
3512 		mp = atomic_load_ptr(&vp->v_mountedhere);
3513 		if (mp == NULL)
3514 			break;
3515 	}
3516 
3517 	vfs_op_thread_exit(prev_mp);
3518 	fpl->tvp = vp;
3519 	fpl->tvp_seqc = vp_seqc;
3520 	return (0);
3521 }
3522 
3523 static bool
3524 cache_fplookup_need_climb_mount(struct cache_fpl *fpl)
3525 {
3526 	struct mount *mp;
3527 	struct vnode *vp;
3528 
3529 	vp = fpl->tvp;
3530 
3531 	/*
	 * Hack: v_mountedhere is a union member, so strictly speaking the type
	 * check should come first. However, the pointer tends to be NULL and
	 * testing it first saves a branch in the common case.
3534 	 */
3535 	mp = atomic_load_ptr(&vp->v_mountedhere);
3536 	if (mp == NULL)
3537 		return (false);
3538 	if (vp->v_type == VDIR)
3539 		return (true);
3540 	return (false);
3541 }
3542 
3543 /*
3544  * Parse the path.
3545  *
3546  * The code is mostly copy-pasted from regular lookup, see lookup().
3547  * The structure is maintained along with comments for easier maintenance.
3548  * Deduplicating the code will become feasible after fast path lookup
3549  * becomes more feature-complete.
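 *
 * As an example, with "usr/bin/ls" remaining to be resolved, a single call
 * leaves cn_nameptr pointing at "usr", cn_namelen set to 3 and ni_next
 * pointing at the slash which follows; cache_fplookup_parse_advance() later
 * skips past that slash before the next component is parsed.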
3550  */
3551 static int
3552 cache_fplookup_parse(struct cache_fpl *fpl)
3553 {
3554 	struct nameidata *ndp;
3555 	struct componentname *cnp;
3556 	char *cp;
3557 	char *prev_ni_next;             /* saved ndp->ni_next */
3558 	size_t prev_ni_pathlen;         /* saved ndp->ni_pathlen */
3559 
3560 	ndp = fpl->ndp;
3561 	cnp = fpl->cnp;
3562 
3563 	/*
3564 	 * Search a new directory.
3565 	 *
3566 	 * The last component of the filename is left accessible via
3567 	 * cnp->cn_nameptr for callers that need the name. Callers needing
3568 	 * the name set the SAVENAME flag. When done, they assume
3569 	 * responsibility for freeing the pathname buffer.
3570 	 */
3571 	for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
3572 		continue;
3573 	cnp->cn_namelen = cp - cnp->cn_nameptr;
3574 	if (cnp->cn_namelen > NAME_MAX) {
3575 		cache_fpl_smr_exit(fpl);
3576 		return (cache_fpl_handled(fpl, ENAMETOOLONG));
3577 	}
3578 	prev_ni_pathlen = ndp->ni_pathlen;
3579 	ndp->ni_pathlen -= cnp->cn_namelen;
3580 	KASSERT(ndp->ni_pathlen <= PATH_MAX,
3581 	    ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
3582 	prev_ni_next = ndp->ni_next;
3583 	ndp->ni_next = cp;
3584 
3585 	/*
3586 	 * Replace multiple slashes by a single slash and trailing slashes
3587 	 * by a null.  This must be done before VOP_LOOKUP() because some
3588 	 * fs's don't know about trailing slashes.  Remember if there were
3589 	 * trailing slashes to handle symlinks, existing non-directories
3590 	 * and non-existing files that won't be directories specially later.
3591 	 */
3592 	while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
3593 		cp++;
3594 		ndp->ni_pathlen--;
3595 		if (*cp == '\0') {
3596 			/*
3597 			 * TODO
3598 			 * Regular lookup performs the following:
3599 			 * *ndp->ni_next = '\0';
3600 			 * cnp->cn_flags |= TRAILINGSLASH;
3601 			 *
3602 			 * Which is problematic since it modifies data read
3603 			 * from userspace. Then if fast path lookup was to
3604 			 * abort we would have to either restore it or convey
3605 			 * the flag. Since this is a corner case just ignore
3606 			 * it for simplicity.
3607 			 */
3608 			return (cache_fpl_partial(fpl));
3609 		}
3610 	}
3611 	ndp->ni_next = cp;
3612 
3613 	cnp->cn_flags |= MAKEENTRY;
3614 
3615 	if (cnp->cn_namelen == 2 &&
3616 	    cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
3617 		cnp->cn_flags |= ISDOTDOT;
3618 	else
3619 		cnp->cn_flags &= ~ISDOTDOT;
3620 	if (*ndp->ni_next == 0)
3621 		cnp->cn_flags |= ISLASTCN;
3622 	else
3623 		cnp->cn_flags &= ~ISLASTCN;
3624 
3625 	/*
3626 	 * Check for degenerate name (e.g. / or "")
3627 	 * which is a way of talking about a directory,
3628 	 * e.g. like "/." or ".".
3629 	 *
3630 	 * TODO
3631 	 * Another corner case handled by the regular lookup
3632 	 */
3633 	if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
3634 		return (cache_fpl_partial(fpl));
3635 	}
3636 	return (0);
3637 }
3638 
3639 static void
3640 cache_fplookup_parse_advance(struct cache_fpl *fpl)
3641 {
3642 	struct nameidata *ndp;
3643 	struct componentname *cnp;
3644 
3645 	ndp = fpl->ndp;
3646 	cnp = fpl->cnp;
3647 
3648 	cnp->cn_nameptr = ndp->ni_next;
3649 	while (*cnp->cn_nameptr == '/') {
3650 		cnp->cn_nameptr++;
3651 		ndp->ni_pathlen--;
3652 	}
3653 }
3654 
3655 static int
3656 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
3657 {
3658 	struct nameidata *ndp;
3659 	struct componentname *cnp;
3660 	struct mount *mp;
3661 	int error;
3662 
3663 	error = CACHE_FPL_FAILED;
3664 	ndp = fpl->ndp;
3665 	ndp->ni_lcf = 0;
3666 	cnp = fpl->cnp;
3667 	cnp->cn_lkflags = LK_SHARED;
3668 	if ((cnp->cn_flags & LOCKSHARED) == 0)
3669 		cnp->cn_lkflags = LK_EXCLUSIVE;
3670 
3671 	cache_fpl_checkpoint(fpl, &fpl->snd);
3672 
3673 	fpl->dvp = dvp;
3674 	fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
3675 	if (seqc_in_modify(fpl->dvp_seqc)) {
3676 		cache_fpl_aborted(fpl);
3677 		goto out;
3678 	}
3679 	mp = atomic_load_ptr(&fpl->dvp->v_mount);
3680 	if (!cache_fplookup_mp_supported(mp)) {
3681 		cache_fpl_aborted(fpl);
3682 		goto out;
3683 	}
3684 
3685 	VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
3686 
3687 	for (;;) {
3688 		error = cache_fplookup_parse(fpl);
3689 		if (__predict_false(error != 0)) {
3690 			break;
3691 		}
3692 
3693 		if (cnp->cn_flags & ISDOTDOT) {
3694 			error = cache_fpl_partial(fpl);
3695 			break;
3696 		}
3697 
3698 		VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
3699 
3700 		error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred, cnp->cn_thread);
3701 		if (__predict_false(error != 0)) {
3702 			switch (error) {
3703 			case EAGAIN:
3704 			case EOPNOTSUPP: /* can happen when racing against vgone */
3705 				cache_fpl_partial(fpl);
3706 				break;
3707 			default:
3708 				/*
3709 				 * See the API contract for VOP_FPLOOKUP_VEXEC.
3710 				 */
3711 				if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
3712 					error = cache_fpl_aborted(fpl);
3713 				} else {
3714 					cache_fpl_smr_exit(fpl);
3715 					cache_fpl_handled(fpl, error);
3716 				}
3717 				break;
3718 			}
3719 			break;
3720 		}
3721 
3722 		error = cache_fplookup_next(fpl);
3723 		if (__predict_false(error != 0)) {
3724 			break;
3725 		}
3726 
3727 		VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
3728 
3729 		if (cache_fplookup_need_climb_mount(fpl)) {
3730 			error = cache_fplookup_climb_mount(fpl);
3731 			if (__predict_false(error != 0)) {
3732 				break;
3733 			}
3734 		}
3735 
3736 		VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
3737 
3738 		if (cnp->cn_flags & ISLASTCN) {
3739 			error = cache_fplookup_final(fpl);
3740 			break;
3741 		}
3742 
3743 		if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
3744 			error = cache_fpl_aborted(fpl);
3745 			break;
3746 		}
3747 
3748 		fpl->dvp = fpl->tvp;
3749 		fpl->dvp_seqc = fpl->tvp_seqc;
3750 
3751 		cache_fplookup_parse_advance(fpl);
3752 		cache_fpl_checkpoint(fpl, &fpl->snd);
3753 	}
3754 out:
3755 	switch (fpl->status) {
3756 	case CACHE_FPL_STATUS_UNSET:
3757 		__assert_unreachable();
3758 		break;
3759 	case CACHE_FPL_STATUS_PARTIAL:
3760 		cache_fpl_smr_assert_entered(fpl);
3761 		return (cache_fplookup_partial_setup(fpl));
3762 	case CACHE_FPL_STATUS_ABORTED:
3763 		if (fpl->in_smr)
3764 			cache_fpl_smr_exit(fpl);
3765 		return (CACHE_FPL_FAILED);
3766 	case CACHE_FPL_STATUS_HANDLED:
3767 		cache_fpl_smr_assert_not_entered(fpl);
3768 		if (__predict_false(error != 0)) {
3769 			ndp->ni_dvp = NULL;
3770 			ndp->ni_vp = NULL;
3771 			cache_fpl_cleanup_cnp(cnp);
3772 			return (error);
3773 		}
3774 		ndp->ni_dvp = fpl->dvp;
3775 		ndp->ni_vp = fpl->tvp;
3776 		if (cnp->cn_flags & SAVENAME)
3777 			cnp->cn_flags |= HASBUF;
3778 		else
3779 			cache_fpl_cleanup_cnp(cnp);
3780 		return (error);
3781 	}
3782 }
3783 
3784 /*
3785  * Fast path lookup protected with SMR and sequence counters.
3786  *
3787  * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
3788  *
3789  * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
3790  * outlined below.
3791  *
3792  * Traditional vnode lookup conceptually looks like this:
3793  *
3794  * vn_lock(current);
3795  * for (;;) {
3796  *	next = find();
3797  *	vn_lock(next);
3798  *	vn_unlock(current);
3799  *	current = next;
3800  *	if (last)
3801  *	    break;
3802  * }
3803  * return (current);
3804  *
3805  * Each jump to the next vnode is safe memory-wise and atomic with respect to
3806  * any modifications thanks to holding respective locks.
3807  *
3808  * The same guarantee can be provided with a combination of safe memory
3809  * reclamation and sequence counters instead. If all operations which affect
3810  * the relationship between the current vnode and the one we are looking for
3811  * also modify the counter, we can verify whether all the conditions held as
3812  * we made the jump. This includes things like permissions, mount points etc.
3813  * Counter modification is provided by enclosing relevant places in
3814  * vn_seqc_write_begin()/end() calls.
3815  *
3816  * Thus this translates to:
3817  *
3818  * vfs_smr_enter();
3819  * dvp_seqc = seqc_read_any(dvp);
3820  * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
3821  *     abort();
3822  * for (;;) {
3823  * 	tvp = find();
3824  * 	tvp_seqc = seqc_read_any(tvp);
3825  * 	if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
3826  * 	    abort();
3827  * 	if (!seqc_consistent(dvp, dvp_seqc) // someone is altering the vnode
3828  * 	    abort();
3829  * 	dvp = tvp; // we know nothing of importance has changed
3830  * 	dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
3831  * 	if (last)
3832  * 	    break;
3833  * }
3834  * vget(); // secure the vnode
 * if (!seqc_consistent(tvp, tvp_seqc)) // final check
3836  * 	    abort();
3837  * // at this point we know nothing has changed for any parent<->child pair
3838  * // as they were crossed during the lookup, meaning we matched the guarantee
3839  * // of the locked variant
3840  * return (tvp);
3841  *
3842  * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
3843  * - they are called while within vfs_smr protection which they must never exit
3844  * - EAGAIN can be returned to denote checking could not be performed, it is
3845  *   always valid to return it
3846  * - if the sequence counter has not changed the result must be valid
3847  * - if the sequence counter has changed both false positives and false negatives
3848  *   are permitted (since the result will be rejected later)
3849  * - for simple cases of unix permission checks vaccess_vexec_smr can be used
3850  *
3851  * Caveats to watch out for:
3852  * - vnodes are passed unlocked and unreferenced with nothing stopping
3853  *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
3854  *   to use atomic_load_ptr to fetch it.
3855  * - the aforementioned object can also get freed, meaning absent other means it
3856  *   should be protected with vfs_smr
3857  * - either safely checking permissions as they are modified or guaranteeing
3858  *   their stability is left to the routine
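 *
 * As a rough, non-authoritative sketch (assuming a filesystem which keeps
 * its permission bits and ownership reachable from ->v_data; both the node
 * layout and the exact vaccess_vexec_smr() argument list below are
 * illustrative only), a minimal VOP_FPLOOKUP_VEXEC routine may look like:
 *
 * 	node = atomic_load_ptr(&vp->v_data);
 * 	if (__predict_false(node == NULL))
 * 		return (EAGAIN);
 * 	return (vaccess_vexec_smr(node->mode, node->uid, node->gid, cred));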
3859  */
3860 int
3861 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
3862     struct pwd **pwdp)
3863 {
3864 	struct cache_fpl fpl;
3865 	struct pwd *pwd;
3866 	struct vnode *dvp;
3867 	struct componentname *cnp;
3868 	struct nameidata_saved orig;
3869 	int error;
3870 
3871 	*status = CACHE_FPL_STATUS_UNSET;
3872 	bzero(&fpl, sizeof(fpl));
3873 	fpl.status = CACHE_FPL_STATUS_UNSET;
3874 	fpl.ndp = ndp;
3875 	fpl.cnp = &ndp->ni_cnd;
3876 	MPASS(curthread == fpl.cnp->cn_thread);
3877 
3878 	if (!cache_can_fplookup(&fpl)) {
3879 		SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
3880 		*status = fpl.status;
3881 		return (EOPNOTSUPP);
3882 	}
3883 
3884 	cache_fpl_checkpoint(&fpl, &orig);
3885 
3886 	cache_fpl_smr_enter(&fpl);
3887 	pwd = pwd_get_smr();
3888 	fpl.pwd = pwd;
3889 	ndp->ni_rootdir = pwd->pwd_rdir;
3890 	ndp->ni_topdir = pwd->pwd_jdir;
3891 
3892 	cnp = fpl.cnp;
3893 	cnp->cn_nameptr = cnp->cn_pnbuf;
3894 	if (cnp->cn_pnbuf[0] == '/') {
3895 		cache_fpl_handle_root(ndp, &dvp);
3896 	} else {
3897 		MPASS(ndp->ni_dirfd == AT_FDCWD);
3898 		dvp = pwd->pwd_cdir;
3899 	}
3900 
3901 	SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
3902 
3903 	error = cache_fplookup_impl(dvp, &fpl);
3904 	cache_fpl_smr_assert_not_entered(&fpl);
3905 	SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
3906 
3907 	*status = fpl.status;
3908 	switch (fpl.status) {
3909 	case CACHE_FPL_STATUS_UNSET:
3910 		__assert_unreachable();
3911 		break;
3912 	case CACHE_FPL_STATUS_HANDLED:
3913 		SDT_PROBE3(vfs, namei, lookup, return, error,
3914 		    (error == 0 ? ndp->ni_vp : NULL), true);
3915 		break;
3916 	case CACHE_FPL_STATUS_PARTIAL:
3917 		*pwdp = fpl.pwd;
3918 		cache_fpl_restore(&fpl, &fpl.snd);
3919 		break;
3920 	case CACHE_FPL_STATUS_ABORTED:
3921 		cache_fpl_restore(&fpl, &orig);
3922 		break;
3923 	}
3924 	return (error);
3925 }
3926