1 /*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1989, 1993, 1995
5 * The Regents of the University of California. All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Poul-Henning Kamp of the FreeBSD Project.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 */
34
35 #include "opt_ddb.h"
36 #include "opt_ktrace.h"
37
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/capsicum.h>
41 #include <sys/counter.h>
42 #include <sys/filedesc.h>
43 #include <sys/fnv_hash.h>
44 #include <sys/inotify.h>
45 #include <sys/kernel.h>
46 #include <sys/ktr.h>
47 #include <sys/lock.h>
48 #include <sys/malloc.h>
49 #include <sys/fcntl.h>
50 #include <sys/jail.h>
51 #include <sys/mount.h>
52 #include <sys/namei.h>
53 #include <sys/proc.h>
54 #include <sys/seqc.h>
55 #include <sys/sdt.h>
56 #include <sys/smr.h>
57 #include <sys/smp.h>
58 #include <sys/syscallsubr.h>
59 #include <sys/sysctl.h>
60 #include <sys/sysproto.h>
61 #include <sys/vnode.h>
62 #include <ck_queue.h>
63 #ifdef KTRACE
64 #include <sys/ktrace.h>
65 #endif
66 #ifdef INVARIANTS
67 #include <machine/_inttypes.h>
68 #endif
69
70 #include <security/audit/audit.h>
71 #include <security/mac/mac_framework.h>
72
73 #ifdef DDB
74 #include <ddb/ddb.h>
75 #endif
76
77 #include <vm/uma.h>
78
79 /*
80 * High level overview of name caching in the VFS layer.
81 *
82 * Originally caching was implemented as part of UFS, later extracted to allow
83 * use by other filesystems. A decision was made to make it optional and
84 * completely detached from the rest of the kernel, which comes with limitations
85 * outlined near the end of this comment block.
86 *
87 * This fundamental choice needs to be revisited. In the meantime, the current
88 * state is described below. Significance of all notable routines is explained
89 * in comments placed above their implementation. Scattered throughout the
90 * file are TODO comments indicating shortcomings which can be fixed without
91 * reworking everything (most of the fixes will likely be reusable). Various
92 * details are omitted from this explanation to not clutter the overview; they
93 * have to be checked by reading the code and associated commentary.
94 *
95 * Keep in mind that it's individual path components which are cached, not full
96 * paths. That is, for a fully cached path "foo/bar/baz" there are 3 entries,
97 * one for each name.
98 *
99 * I. Data organization
100 *
101 * Entries are described by "struct namecache" objects and stored in a hash
102 * table. See cache_get_hash for more information.
103 *
104 * "struct vnode" contains pointers to source entries (names which can be found
105 * when traversing through said vnode), destination entries (names of that
106 * vnode; see "Limitations" for a breakdown on the subject) and a pointer to
107 * the parent vnode.
108 *
109 * The (directory vnode; name) tuple reliably determines the target entry if
110 * it exists.
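 *
 * For example, with "foo/bar" cached, the entry for "bar" roughly looks like
 * this (the names are made up for illustration):
 *
 *	ncp->nc_dvp  == <foo's vnode>
 *	ncp->nc_vp   == <bar's vnode> (or the NCF_NEGATIVE flag is set instead)
 *	ncp->nc_name == "bar", ncp->nc_nlen == 3
 *
 * and it is linked into foo's v_cache_src list, bar's v_cache_dst list and
 * the hash chain selected by cache_get_hash("bar", 3, <foo's vnode>).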
111 *
112 * Since there are no small locks at this time (all are 32 bytes in size on
113 * LP64), the code works around the problem by introducing lock arrays to
114 * protect hash buckets and vnode lists.
115 *
116 * II. Filesystem integration
117 *
118 * Filesystems participating in name caching do the following:
119 * - set vop_lookup routine to vfs_cache_lookup
120 * - set vop_cachedlookup to whatever can perform the lookup if the above fails
121 * - if they support lockless lookup (see below), vop_fplookup_vexec and
122 * vop_fplookup_symlink are set along with the MNTK_FPLOOKUP flag on the
123 * mount point
124 * - call cache_purge or cache_vop_* routines to eliminate stale entries as
125 * applicable
126 * - call cache_enter to add entries depending on the MAKEENTRY flag
127 *
128 * With the above in mind, there are 2 entry points when doing lookups:
129 * - ... -> namei -> cache_fplookup -- this is the default
130 * - ... -> VOP_LOOKUP -> vfs_cache_lookup -- normally only called by namei
131 * should the above fail
132 *
133 * Example code flow how an entry is added:
134 * ... -> namei -> cache_fplookup -> cache_fplookup_noentry -> VOP_LOOKUP ->
135 * vfs_cache_lookup -> VOP_CACHEDLOOKUP -> ufs_lookup_ino -> cache_enter
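 *
 * As an illustration, the vop vector of a participating filesystem can be
 * expected to look roughly like the sketch below ("foofs" is a made up
 * example, not any particular filesystem):
 *
 *	struct vop_vector foofs_vnodeops = {
 *		.vop_default		= &default_vnodeops,
 *		.vop_lookup		= vfs_cache_lookup,
 *		.vop_cachedlookup	= foofs_lookup,
 *		...
 *	};
 *
 * where foofs_lookup does the real work, calling cache_enter for components
 * it wants cached (MAKEENTRY set in cn_flags) and cache_purge/cache_vop_*
 * when invalidating names.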
136 *
137 * III. Performance considerations
138 *
139 * For lockless case forward lookup avoids any writes to shared areas apart
140 * from the terminal path component. In other words non-modifying lookups of
141 * different files don't suffer any scalability problems in the namecache.
142 * Looking up the same file is limited by VFS and goes beyond the scope of this
143 * file.
144 *
145 * At least on amd64 the single-threaded bottleneck for long paths is hashing
146 * (see cache_get_hash). There are cases where the code issues acquire fence
147 * multiple times, they can be combined on architectures which suffer from it.
148 *
149 * For locked case each encountered vnode has to be referenced and locked in
150 * order to be handed out to the caller (normally that's namei). This
151 * introduces a significant single-threaded hit and serialization when multi-threaded.
152 *
153 * Reverse lookup (e.g., "getcwd") fully scales provided it is fully cached --
154 * it avoids any writes to shared areas for any of the components.
155 *
156 * Unrelated insertions are partially serialized on updating the global entry
157 * counter and possibly serialized on colliding bucket or vnode locks.
158 *
159 * IV. Observability
160 *
161 * Note that not everything has an explicit dtrace probe, nor should it, thus
162 * some of the one-liners below depend on implementation details.
163 *
164 * Examples:
165 *
166 * # Check what lookups failed to be handled in a lockless manner. Column 1 is
167 * # line number, column 2 is status code (see cache_fpl_status)
168 * dtrace -n 'vfs:fplookup:lookup:done { @[arg1, arg2] = count(); }'
169 *
170 * # Lengths of names added by binary name
171 * dtrace -n 'fbt::cache_enter_time:entry { @[execname] = quantize(args[2]->cn_namelen); }'
172 *
173 * # Same as above but only those which exceed 64 characters
174 * dtrace -n 'fbt::cache_enter_time:entry /args[2]->cn_namelen > 64/ { @[execname] = quantize(args[2]->cn_namelen); }'
175 *
176 * # Who is performing lookups with spurious slashes (e.g., "foo//bar") and what
177 * # path is it
178 * dtrace -n 'fbt::cache_fplookup_skip_slashes:entry { @[execname, stringof(args[0]->cnp->cn_pnbuf)] = count(); }'
179 *
180 * V. Limitations and implementation defects
181 *
182 * - since it is possible there is no entry for an open file, tools like
183 * "procstat" may fail to resolve fd -> vnode -> path to anything
184 * - even if a filesystem adds an entry, it may get purged (e.g., due to memory
185 * shortage) in which case the above problem applies
186 * - hardlinks are not tracked, thus if a vnode is reachable in more than one
187 * way, resolving a name may return a different path than the one used to
188 * open it (even if said path is still valid)
189 * - by default entries are not added for newly created files
190 * - adding an entry may need to evict negative entry first, which happens in 2
191 * distinct places (evicting on lookup, adding in a later VOP) making it
192 * impossible to simply reuse it
193 * - there is a simple scheme to evict negative entries as the cache is approaching
194 * its capacity, but it is very unclear if doing so is a good idea to begin with
195 * - vnodes are subject to being recycled even if target inode is left in memory,
196 * which loses the name cache entries when it perhaps should not. In the case of tmpfs,
197 * names get duplicated -- kept by filesystem itself and namecache separately
198 * - struct namecache has a fixed size and comes in 2 variants, often wasting
199 * space. It is now hard to replace with malloc due to dependence on SMR, which
200 * requires UMA zones to opt in
201 * - lack of better integration with the kernel also turns nullfs into a layered
202 * filesystem instead of something which can take advantage of caching
203 *
204 * Appendix A: where is the time lost, expanding on paragraph III
205 *
206 * While some care went into optimizing lookups, there is still plenty of
207 * performance left on the table, most notably from single-threaded standpoint.
208 * Below is a woefully incomplete list of changes which can help. Ideas are
209 * mostly sketched out, no claim is made all kinks or prerequisites are laid
210 * out.
211 *
212 * Note there is performance lost all over VFS.
213 *
214 * === SMR-only lookup
215 *
216 * For commonly used ops like stat(2), when the terminal vnode *is* cached,
217 * lockless lookup could refrain from refing/locking the found vnode and
218 * instead return while within the SMR section. Then a call to, say,
219 * vop_stat_smr could do the work (or fail with EAGAIN), finally the result
220 * would be validated with seqc not changing. This would be faster
221 * single-threaded as it dodges atomics and would provide full scalability for
222 * multicore uses. This would *not* work for open(2) or other calls which need
223 * the vnode to hang around for the long haul, but would work for the
224 * aforementioned stat(2) as well as access(2), readlink(2), realpathat(2) and probably more.
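 *
 * A rough sketch of the proposed flow (vop_stat_smr does not exist; the seqc
 * handling is only indicated, see cache_fplookup for the real validation):
 *
 *	vfs_smr_enter();
 *	<lockless lookup resolving the path to vp, no ref/lock taken>
 *	seqc = vn_seqc_read_any(vp);
 *	error = vop_stat_smr(vp, ...);
 *	if (error == 0 && !vn_seqc_consistent(vp, seqc))
 *		error = EAGAIN;
 *	vfs_smr_exit();
 *	if (error != 0)
 *		<fall back to the regular lookup + VOP_STAT>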
225 *
226 * === hotpatching for sdt probes
227 *
228 * They result in *tons* of branches all over with rather regrettable codegen
229 * at times. Removing sdt probes altogether gives over 2% boost in lookup rate.
230 * Reworking the code to patch itself at runtime with asm goto would solve it.
231 * asm goto is fully supported by gcc and clang.
232 *
233 * === copyinstr
234 *
235 * On all architectures it operates one byte at a time, while it could be
236 * word-sized instead thanks to the Mycroft trick.
237 *
238 * API itself is rather pessimal for path lookup, accepting arbitrary sizes and
239 * *optionally* filling in the length parameter.
240 *
241 * Instead a new routine (copyinpath?) could be introduced, demanding a buffer
242 * size which is a multiple of the word size (and never zero), with the length
243 * always returned. On top of it the routine could be allowed to transform the
244 * buffer in arbitrary ways, most notably writing past the found length (not to
245 * be confused with writing past buffer size) -- this would allow word-sized
246 * movs while checking for '\0' later.
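 *
 * For reference, the usual formulation of the zero-byte test for 64-bit words
 * (a sketch of the technique, not something present in the tree):
 *
 *	#define haszero(x)	(((x) - 0x0101010101010101ULL) & ~(x) & \
 *				    0x8080808080808080ULL)
 *
 * It evaluates to non-zero iff at least one byte of x is 0, so the copy loop
 * can move word-sized chunks and only examine individual bytes once the word
 * containing the nul has been found.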
247 *
248 * === detour through namei
249 *
250 * Currently one suffers being called from namei, which then has to check if
251 * things worked out locklessly. Instead the lockless lookup could be the
252 * actual entry point which calls what is currently namei as a fallback.
253 *
254 * === avoidable branches in cache_can_fplookup
255 *
256 * The cache_fast_lookup_enabled flag check could be hotpatchable (in fact if
257 * this is off, none of fplookup code should execute).
258 *
259 * Both audit and capsicum branches can be combined into one, but it requires
260 * paying off a lot of tech debt first.
261 *
262 * ni_startdir could be indicated with a flag in cn_flags, eliminating the
263 * branch.
264 *
265 * === mount stacks
266 *
267 * Crossing a mount requires checking if perhaps something is mounted on top.
268 * Instead, an additional entry could be added to struct mount with a pointer
269 * to the final mount on the stack. This would be recalculated on each
270 * mount/unmount.
271 *
272 * === root vnodes
273 *
274 * It could become part of the API contract to *always* have a rootvnode set in
275 * mnt_rootvnode. Such vnodes are annotated with VV_ROOT and vnlru would have
276 * to be modified to always skip them.
277 *
278 * === inactive on v_usecount reaching 0
279 *
280 * VOP_NEED_INACTIVE should not exist. Filesystems would indicate need for such
281 * processing with a bit in usecount.
282 *
283 * === v_holdcnt
284 *
285 * Hold count should probably get eliminated, but one can argue it is a useful
286 * feature. Even if so, handling of v_usecount could be decoupled from it --
287 * vnlru et al would consider the vnode not-freeable if it has either hold or
288 * usecount on it.
289 *
290 * This would eliminate 2 atomics.
291 */
292
293 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
294 "Name cache");
295
296 SDT_PROVIDER_DECLARE(vfs);
297 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
298 "struct vnode *");
299 SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *",
300 "struct vnode *");
301 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
302 "char *");
303 SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
304 "const char *");
305 SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
306 "struct namecache *", "int", "int");
307 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
308 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
309 "char *", "struct vnode *");
310 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
311 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
312 "struct vnode *", "char *");
313 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
314 "struct vnode *");
315 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
316 "struct vnode *", "char *");
317 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
318 "char *");
319 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
320 "struct componentname *");
321 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
322 "struct componentname *");
323 SDT_PROBE_DEFINE3(vfs, namecache, purge, done, "struct vnode *", "size_t", "size_t");
324 SDT_PROBE_DEFINE1(vfs, namecache, purge, batch, "int");
325 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
326 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
327 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
328 "struct vnode *");
329 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
330 "char *");
331 SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *",
332 "char *");
333 SDT_PROBE_DEFINE1(vfs, namecache, symlink, alloc__fail, "size_t");
334
335 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
336 SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
337 SDT_PROBE_DECLARE(vfs, namei, lookup, return);
338
339 static char __read_frequently cache_fast_lookup_enabled = true;
340
341 /*
342 * This structure describes the elements in the cache of recent
343 * names looked up by namei.
344 */
345 struct negstate {
346 u_char neg_flag;
347 u_char neg_hit;
348 };
349 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
350 "the state must fit in a union with a pointer without growing it");
351
352 struct namecache {
353 LIST_ENTRY(namecache) nc_src; /* source vnode list */
354 TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */
355 CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
356 struct vnode *nc_dvp; /* vnode of parent of name */
357 union {
358 struct vnode *nu_vp; /* vnode the name refers to */
359 struct negstate nu_neg;/* negative entry state */
360 } n_un;
361 u_char nc_flag; /* flag bits */
362 u_char nc_nlen; /* length of name */
363 char nc_name[]; /* segment name + nul */
364 };
365
366 /*
367 * struct namecache_ts repeats struct namecache layout up to the
368 * nc_nlen member.
369 * struct namecache_ts is used in place of struct namecache when time(s) need
370 * to be stored. The nc_dotdottime field is used when a cache entry is mapping
371 * both a non-dotdot directory name plus dotdot for the directory's
372 * parent.
373 *
374 * See below for alignment requirement.
375 */
376 struct namecache_ts {
377 struct timespec nc_time; /* timespec provided by fs */
378 struct timespec nc_dotdottime; /* dotdot timespec provided by fs */
379 int nc_ticks; /* ticks value when entry was added */
380 int nc_pad;
381 struct namecache nc_nc;
382 };
383
384 TAILQ_HEAD(cache_freebatch, namecache);
385
386 /*
387 * At least mips n32 performs 64-bit accesses to timespec as found
388 * in namecache_ts and requires them to be aligned. Since others
389 * may be in the same spot, suffer a little bit and enforce the
390 * alignment for everyone. Note this is a nop for 64-bit platforms.
391 */
392 #define CACHE_ZONE_ALIGNMENT UMA_ALIGNOF(time_t)
393
394 /*
395 * TODO: the initial value of CACHE_PATH_CUTOFF was inherited from the
396 * 4.4 BSD codebase. Later on struct namecache was tweaked to become
397 * smaller and the value was bumped to retain the total size, but it
398 * was never re-evaluated for suitability. A simple test counting
399 * lengths during package building shows that the value of 45 covers
400 * about 86% of all added entries, reaching 99% at 65.
401 *
402 * Regardless of the above, use of dedicated zones instead of malloc may be
403 * inducing additional waste. This may be hard to address as said zones are
404 * tied to VFS SMR. Even if retaining them, the current split should be
405 * re-evaluated.
406 */
407 #ifdef __LP64__
408 #define CACHE_PATH_CUTOFF 45
409 #define CACHE_LARGE_PAD 6
410 #else
411 #define CACHE_PATH_CUTOFF 41
412 #define CACHE_LARGE_PAD 2
413 #endif
414
415 #define CACHE_ZONE_SMALL_SIZE (offsetof(struct namecache, nc_name) + CACHE_PATH_CUTOFF + 1)
416 #define CACHE_ZONE_SMALL_TS_SIZE (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_SMALL_SIZE)
417 #define CACHE_ZONE_LARGE_SIZE (offsetof(struct namecache, nc_name) + NAME_MAX + 1 + CACHE_LARGE_PAD)
418 #define CACHE_ZONE_LARGE_TS_SIZE (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_LARGE_SIZE)
419
420 _Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
421 _Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
422 _Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
423 _Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
424
425 #define nc_vp n_un.nu_vp
426 #define nc_neg n_un.nu_neg
427
428 /*
429 * Flags in namecache.nc_flag
430 */
431 #define NCF_WHITE 0x01
432 #define NCF_ISDOTDOT 0x02
433 #define NCF_TS 0x04
434 #define NCF_DTS 0x08
435 #define NCF_DVDROP 0x10
436 #define NCF_NEGATIVE 0x20
437 #define NCF_INVALID 0x40
438 #define NCF_WIP 0x80
439
440 /*
441 * Flags in negstate.neg_flag
442 */
443 #define NEG_HOT 0x01
444
445 static bool cache_neg_evict_cond(u_long lnumcache);
446
447 /*
448 * Mark an entry as invalid.
449 *
450 * This is called before it starts getting deconstructed.
451 */
452 static void
453 cache_ncp_invalidate(struct namecache *ncp)
454 {
455
456 KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
457 ("%s: entry %p already invalid", __func__, ncp));
458 atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
459 atomic_thread_fence_rel();
460 }
461
462 /*
463 * Does this entry match the given directory and name?
464 */
465 static bool
466 cache_ncp_match(struct namecache *ncp, struct vnode *dvp,
467 struct componentname *cnp)
468 {
469 return (ncp->nc_dvp == dvp &&
470 ncp->nc_nlen == cnp->cn_namelen &&
471 bcmp(ncp->nc_name, cnp->cn_nameptr, cnp->cn_namelen) == 0);
472 }
473
474 /*
475 * Check whether the entry can be safely used.
476 *
477 * All places which elide locks are supposed to call this after they are
478 * done with reading from an entry.
479 */
480 #define cache_ncp_canuse(ncp) ({ \
481 struct namecache *_ncp = (ncp); \
482 u_char _nc_flag; \
483 \
484 atomic_thread_fence_acq(); \
485 _nc_flag = atomic_load_char(&_ncp->nc_flag); \
486 __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP)) == 0); \
487 })
488
489 /*
490 * Like the above but also checks NCF_WHITE.
491 */
492 #define cache_fpl_neg_ncp_canuse(ncp) ({ \
493 struct namecache *_ncp = (ncp); \
494 u_char _nc_flag; \
495 \
496 atomic_thread_fence_acq(); \
497 _nc_flag = atomic_load_char(&_ncp->nc_flag); \
498 __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP | NCF_WHITE)) == 0); \
499 })
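
/*
 * Example of the expected lockless usage pattern (a sketch; see the lookup
 * code below for the real thing):
 *
 *	vfs_smr_enter();
 *	ncp = cache_ncp_find(dvp, cnp, hash);
 *	<copy out the fields of interest from ncp>
 *	if (!cache_ncp_canuse(ncp))
 *		<discard what was read and fall back to the locked path>
 *	vfs_smr_exit();
 */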
500
501 VFS_SMR_DECLARE;
502
503 static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
504 "Name cache parameters");
505
506 static u_int __read_mostly ncsize; /* the size as computed on creation or resizing */
507 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RD, &ncsize, 0,
508 "Total namecache capacity");
509
510 u_int ncsizefactor = 2;
511 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0,
512 "Size factor for namecache");
513
514 static u_long __read_mostly ncnegfactor = 5; /* ratio of negative entries */
515 SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0,
516 "Ratio of negative namecache entries");
517
518 /*
519 * Negative entry % of namecache capacity above which automatic eviction is allowed.
520 *
521 * Check cache_neg_evict_cond for details.
522 */
523 static u_int ncnegminpct = 3;
524
525 static u_int __read_mostly neg_min; /* the above recomputed against ncsize */
526 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0,
527 "Negative entry count above which automatic eviction is allowed");
528
529 /*
530 * Structures associated with name caching.
531 */
532 #define NCHHASH(hash) \
533 (&nchashtbl[(hash) & nchash])
534 static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
535 static u_long __read_mostly nchash; /* size of hash table */
536 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
537 "Size of namecache hash table");
538 static u_long __exclusive_cache_line numneg; /* number of negative entries allocated */
539 static u_long __exclusive_cache_line numcache;/* number of cache entries allocated */
540
541 struct nchstats nchstats; /* cache effectiveness statistics */
542
543 static u_int __exclusive_cache_line neg_cycle;
544
545 #define ncneghash 3
546 #define numneglists (ncneghash + 1)
547
548 struct neglist {
549 struct mtx nl_evict_lock;
550 struct mtx nl_lock __aligned(CACHE_LINE_SIZE);
551 TAILQ_HEAD(, namecache) nl_list;
552 TAILQ_HEAD(, namecache) nl_hotlist;
553 u_long nl_hotnum;
554 } __aligned(CACHE_LINE_SIZE);
555
556 static struct neglist neglists[numneglists];
557
558 static inline struct neglist *
559 NCP2NEGLIST(struct namecache *ncp)
560 {
561
562 return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
563 }
564
565 static inline struct negstate *
566 NCP2NEGSTATE(struct namecache *ncp)
567 {
568
569 MPASS(atomic_load_char(&ncp->nc_flag) & NCF_NEGATIVE);
570 return (&ncp->nc_neg);
571 }
572
573 #define numbucketlocks (ncbuckethash + 1)
574 static u_int __read_mostly ncbuckethash;
575 static struct mtx_padalign __read_mostly *bucketlocks;
576 #define HASH2BUCKETLOCK(hash) \
577 ((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)]))
578
579 #define numvnodelocks (ncvnodehash + 1)
580 static u_int __read_mostly ncvnodehash;
581 static struct mtx __read_mostly *vnodelocks;
582 static inline struct mtx *
583 VP2VNODELOCK(struct vnode *vp)
584 {
585
586 return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
587 }
588
589 /*
590 * Search the hash table for a namecache entry. Either the corresponding bucket
591 * must be locked, or the caller must be in an SMR read section.
592 */
593 static struct namecache *
594 cache_ncp_find(struct vnode *dvp, struct componentname *cnp, uint32_t hash)
595 {
596 struct namecache *ncp;
597
598 KASSERT(mtx_owned(HASH2BUCKETLOCK(hash)) || VFS_SMR_ENTERED(),
599 ("%s: hash %u not locked", __func__, hash));
600 CK_SLIST_FOREACH(ncp, NCHHASH(hash), nc_hash) {
601 if (cache_ncp_match(ncp, dvp, cnp))
602 break;
603 }
604 return (ncp);
605 }
606
607 static void
608 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
609 {
610 struct namecache_ts *ncp_ts;
611
612 KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
613 (tsp == NULL && ticksp == NULL),
614 ("No NCF_TS"));
615
616 if (tsp == NULL)
617 return;
618
619 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
620 *tsp = ncp_ts->nc_time;
621 *ticksp = ncp_ts->nc_ticks;
622 }
623
624 #ifdef DEBUG_CACHE
625 static int __read_mostly doingcache = 1; /* 1 => enable the cache */
626 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
627 "VFS namecache enabled");
628 #endif
629
630 /* Export size information to userland */
631 SYSCTL_SIZEOF_STRUCT(namecache);
632
633 /*
634 * The new name cache statistics
635 */
636 static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
637 "Name cache statistics");
638
639 #define STATNODE_ULONG(name, varname, descr) \
640 SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
641 #define STATNODE_COUNTER(name, varname, descr) \
642 static COUNTER_U64_DEFINE_EARLY(varname); \
643 SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \
644 descr);
645 STATNODE_ULONG(neg, numneg, "Number of negative cache entries");
646 STATNODE_ULONG(count, numcache, "Number of cache entries");
647 STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes held");
648 STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit");
649 STATNODE_COUNTER(miss, nummiss, "Number of cache misses");
650 STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache");
651 STATNODE_COUNTER(poszaps, numposzaps,
652 "Number of cache hits (positive) we do not want to cache");
653 STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)");
654 STATNODE_COUNTER(negzaps, numnegzaps,
655 "Number of cache hits (negative) we do not want to cache");
656 STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)");
657 /* These count for vn_getcwd(), too. */
658 STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls");
659 STATNODE_COUNTER(fullpathfail2, numfullpathfail2,
660 "Number of fullpath search errors (VOP_VPTOCNP failures)");
661 STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
662 STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls");
663 STATNODE_COUNTER(symlinktoobig, symlinktoobig, "Number of times symlink did not fit the cache");
664
665 /*
666 * Debug or developer statistics.
667 */
668 static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
669 "Name cache debugging");
670 #define DEBUGNODE_ULONG(name, varname, descr) \
671 SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
672 static u_long zap_bucket_relock_success;
673 DEBUGNODE_ULONG(zap_bucket_relock_success, zap_bucket_relock_success,
674 "Number of successful removals after relocking");
675 static u_long zap_bucket_fail;
676 DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, "");
677 static u_long zap_bucket_fail2;
678 DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, "");
679 static u_long cache_lock_vnodes_cel_3_failures;
680 DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures,
681 "Number of times 3-way vnode locking failed");
682
683 static void cache_zap_locked(struct namecache *ncp);
684 static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
685 char **retbuf, size_t *buflen, size_t addend);
686 static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf,
687 char **retbuf, size_t *buflen);
688 static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf,
689 char **retbuf, size_t *len, size_t addend);
690
691 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
692
693 static inline void
694 cache_assert_vlp_locked(struct mtx *vlp)
695 {
696
697 if (vlp != NULL)
698 mtx_assert(vlp, MA_OWNED);
699 }
700
701 static inline void
702 cache_assert_vnode_locked(struct vnode *vp)
703 {
704 struct mtx *vlp;
705
706 vlp = VP2VNODELOCK(vp);
707 cache_assert_vlp_locked(vlp);
708 }
709
710 /*
711 * Directory vnodes with entries are held for two reasons:
712 * 1. make them less of a target for reclamation in vnlru
713 * 2. suffer a smaller performance penalty in locked lookup as requeueing is avoided
714 *
715 * It will be feasible to stop doing it altogether if all filesystems start
716 * supporting lockless lookup.
717 */
718 static void
719 cache_hold_vnode(struct vnode *vp)
720 {
721
722 cache_assert_vnode_locked(vp);
723 VNPASS(LIST_EMPTY(&vp->v_cache_src), vp);
724 vhold(vp);
725 counter_u64_add(numcachehv, 1);
726 }
727
728 static void
729 cache_drop_vnode(struct vnode *vp)
730 {
731
732 /*
733 * Called after all locks are dropped, meaning we can't assert
734 * on the state of v_cache_src.
735 */
736 vdrop(vp);
737 counter_u64_add(numcachehv, -1);
738 }
739
740 /*
741 * UMA zones.
742 */
743 static uma_zone_t __read_mostly cache_zone_small;
744 static uma_zone_t __read_mostly cache_zone_small_ts;
745 static uma_zone_t __read_mostly cache_zone_large;
746 static uma_zone_t __read_mostly cache_zone_large_ts;
747
748 char *
749 cache_symlink_alloc(size_t size, int flags)
750 {
751
752 if (size < CACHE_ZONE_SMALL_SIZE) {
753 return (uma_zalloc_smr(cache_zone_small, flags));
754 }
755 if (size < CACHE_ZONE_LARGE_SIZE) {
756 return (uma_zalloc_smr(cache_zone_large, flags));
757 }
758 counter_u64_add(symlinktoobig, 1);
759 SDT_PROBE1(vfs, namecache, symlink, alloc__fail, size);
760 return (NULL);
761 }
762
763 void
764 cache_symlink_free(char *string, size_t size)
765 {
766
767 MPASS(string != NULL);
768 KASSERT(size < CACHE_ZONE_LARGE_SIZE,
769 ("%s: size %zu too big", __func__, size));
770
771 if (size < CACHE_ZONE_SMALL_SIZE) {
772 uma_zfree_smr(cache_zone_small, string);
773 return;
774 }
775 if (size < CACHE_ZONE_LARGE_SIZE) {
776 uma_zfree_smr(cache_zone_large, string);
777 return;
778 }
779 __assert_unreachable();
780 }
781
782 static struct namecache *
783 cache_alloc_uma(int len, bool ts)
784 {
785 struct namecache_ts *ncp_ts;
786 struct namecache *ncp;
787
788 if (__predict_false(ts)) {
789 if (len <= CACHE_PATH_CUTOFF)
790 ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
791 else
792 ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
793 ncp = &ncp_ts->nc_nc;
794 } else {
795 if (len <= CACHE_PATH_CUTOFF)
796 ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
797 else
798 ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
799 }
800 return (ncp);
801 }
802
803 static void
804 cache_free_uma(struct namecache *ncp)
805 {
806 struct namecache_ts *ncp_ts;
807
808 if (__predict_false(ncp->nc_flag & NCF_TS)) {
809 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
810 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
811 uma_zfree_smr(cache_zone_small_ts, ncp_ts);
812 else
813 uma_zfree_smr(cache_zone_large_ts, ncp_ts);
814 } else {
815 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
816 uma_zfree_smr(cache_zone_small, ncp);
817 else
818 uma_zfree_smr(cache_zone_large, ncp);
819 }
820 }
821
822 static struct namecache *
823 cache_alloc(int len, bool ts)
824 {
825 u_long lnumcache;
826
827 /*
828 * Avoid blowout in namecache entries.
829 *
830 * Bugs:
831 * 1. filesystems may end up trying to add an already existing entry
832 * (for example this can happen after a cache miss during concurrent
833 * lookup), in which case we will call cache_neg_evict despite not
834 * adding anything.
835 * 2. the routine may fail to free anything and no provisions are made
836 * to make it try harder (see the inside for failure modes)
837 * 3. it only ever looks at negative entries.
838 */
839 lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
840 if (cache_neg_evict_cond(lnumcache)) {
841 lnumcache = atomic_load_long(&numcache);
842 }
843 if (__predict_false(lnumcache >= ncsize)) {
844 atomic_subtract_long(&numcache, 1);
845 counter_u64_add(numdrops, 1);
846 return (NULL);
847 }
848 return (cache_alloc_uma(len, ts));
849 }
850
851 static void
852 cache_free(struct namecache *ncp)
853 {
854
855 MPASS(ncp != NULL);
856 if ((ncp->nc_flag & NCF_DVDROP) != 0) {
857 cache_drop_vnode(ncp->nc_dvp);
858 }
859 cache_free_uma(ncp);
860 atomic_subtract_long(&numcache, 1);
861 }
862
863 static void
864 cache_free_batch(struct cache_freebatch *batch)
865 {
866 struct namecache *ncp, *nnp;
867 int i;
868
869 i = 0;
870 if (TAILQ_EMPTY(batch))
871 goto out;
872 TAILQ_FOREACH_SAFE(ncp, batch, nc_dst, nnp) {
873 if ((ncp->nc_flag & NCF_DVDROP) != 0) {
874 cache_drop_vnode(ncp->nc_dvp);
875 }
876 cache_free_uma(ncp);
877 i++;
878 }
879 atomic_subtract_long(&numcache, i);
880 out:
881 SDT_PROBE1(vfs, namecache, purge, batch, i);
882 }
883
884 /*
885 * Hashing.
886 *
887 * The code was made to use FNV in 2001 and this choice needs to be revisited.
888 *
889 * Short summary of the difficulty:
890 * The longest name which can be inserted is NAME_MAX characters in length (or
891 * 255 at the time of writing this comment), while majority of names used in
892 * practice are significantly shorter (mostly below 10). More importantly
893 * the majority of lookups performed find names even shorter than that.
894 *
895 * This poses a problem where hashes which do better than FNV past word size
896 * (or so) tend to come with additional overhead when finalizing the result,
897 * making them noticeably slower for the most commonly used range.
898 *
899 * Consider a path like: /usr/obj/usr/src/sys/amd64/GENERIC/vnode_if.c
900 *
901 * When looking it up the most time consuming part by a large margin (at least
902 * on amd64) is hashing. Replacing FNV with something which pessimizes short
903 * input would make the slowest part stand out even more.
904 */
905
906 /*
907 * TODO: With the value stored we can do better than computing the hash based
908 * on the address.
909 */
910 static void
911 cache_prehash(struct vnode *vp)
912 {
913
914 vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
915 }
916
917 static uint32_t
918 cache_get_hash(char *name, u_char len, struct vnode *dvp)
919 {
920
921 return (fnv_32_buf(name, len, dvp->v_nchash));
922 }
923
924 static uint32_t
925 cache_get_hash_iter_start(struct vnode *dvp)
926 {
927
928 return (dvp->v_nchash);
929 }
930
931 static uint32_t
932 cache_get_hash_iter(char c, uint32_t hash)
933 {
934
935 return (fnv_32_buf(&c, 1, hash));
936 }
937
938 static uint32_t
939 cache_get_hash_iter_finish(uint32_t hash)
940 {
941
942 return (hash);
943 }
944
945 static inline struct nchashhead *
946 NCP2BUCKET(struct namecache *ncp)
947 {
948 uint32_t hash;
949
950 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
951 return (NCHHASH(hash));
952 }
953
954 static inline struct mtx *
955 NCP2BUCKETLOCK(struct namecache *ncp)
956 {
957 uint32_t hash;
958
959 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
960 return (HASH2BUCKETLOCK(hash));
961 }
962
963 #ifdef INVARIANTS
964 static void
965 cache_assert_bucket_locked(struct namecache *ncp)
966 {
967 struct mtx *blp;
968
969 blp = NCP2BUCKETLOCK(ncp);
970 mtx_assert(blp, MA_OWNED);
971 }
972
973 static void
974 cache_assert_bucket_unlocked(struct namecache *ncp)
975 {
976 struct mtx *blp;
977
978 blp = NCP2BUCKETLOCK(ncp);
979 mtx_assert(blp, MA_NOTOWNED);
980 }
981 #else
982 #define cache_assert_bucket_locked(x) do { } while (0)
983 #define cache_assert_bucket_unlocked(x) do { } while (0)
984 #endif
985
986 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y))
987 static void
988 _cache_sort_vnodes(void **p1, void **p2)
989 {
990 void *tmp;
991
992 MPASS(*p1 != NULL || *p2 != NULL);
993
994 if (*p1 > *p2) {
995 tmp = *p2;
996 *p2 = *p1;
997 *p1 = tmp;
998 }
999 }
1000
1001 static void
1002 cache_lock_all_buckets(void)
1003 {
1004 u_int i;
1005
1006 for (i = 0; i < numbucketlocks; i++)
1007 mtx_lock(&bucketlocks[i]);
1008 }
1009
1010 static void
1011 cache_unlock_all_buckets(void)
1012 {
1013 u_int i;
1014
1015 for (i = 0; i < numbucketlocks; i++)
1016 mtx_unlock(&bucketlocks[i]);
1017 }
1018
1019 static void
1020 cache_lock_all_vnodes(void)
1021 {
1022 u_int i;
1023
1024 for (i = 0; i < numvnodelocks; i++)
1025 mtx_lock(&vnodelocks[i]);
1026 }
1027
1028 static void
1029 cache_unlock_all_vnodes(void)
1030 {
1031 u_int i;
1032
1033 for (i = 0; i < numvnodelocks; i++)
1034 mtx_unlock(&vnodelocks[i]);
1035 }
1036
1037 static int
1038 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
1039 {
1040
1041 cache_sort_vnodes(&vlp1, &vlp2);
1042
1043 if (vlp1 != NULL) {
1044 if (!mtx_trylock(vlp1))
1045 return (EAGAIN);
1046 }
1047 if (!mtx_trylock(vlp2)) {
1048 if (vlp1 != NULL)
1049 mtx_unlock(vlp1);
1050 return (EAGAIN);
1051 }
1052
1053 return (0);
1054 }
1055
1056 static void
1057 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
1058 {
1059
1060 MPASS(vlp1 != NULL || vlp2 != NULL);
1061 MPASS(vlp1 <= vlp2);
1062
1063 if (vlp1 != NULL)
1064 mtx_lock(vlp1);
1065 if (vlp2 != NULL)
1066 mtx_lock(vlp2);
1067 }
1068
1069 static void
1070 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
1071 {
1072
1073 MPASS(vlp1 != NULL || vlp2 != NULL);
1074
1075 if (vlp1 != NULL)
1076 mtx_unlock(vlp1);
1077 if (vlp2 != NULL)
1078 mtx_unlock(vlp2);
1079 }
1080
1081 static int
1082 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
1083 {
1084 struct nchstats snap;
1085
1086 if (req->oldptr == NULL)
1087 return (SYSCTL_OUT(req, 0, sizeof(snap)));
1088
1089 snap = nchstats;
1090 snap.ncs_goodhits = counter_u64_fetch(numposhits);
1091 snap.ncs_neghits = counter_u64_fetch(numneghits);
1092 snap.ncs_badhits = counter_u64_fetch(numposzaps) +
1093 counter_u64_fetch(numnegzaps);
1094 snap.ncs_miss = counter_u64_fetch(nummisszap) +
1095 counter_u64_fetch(nummiss);
1096
1097 return (SYSCTL_OUT(req, &snap, sizeof(snap)));
1098 }
1099 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
1100 CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
1101 "VFS cache effectiveness statistics");
1102
1103 static int
1104 sysctl_hitpct(SYSCTL_HANDLER_ARGS)
1105 {
1106 long poshits, neghits, miss, total;
1107 long pct;
1108
1109 poshits = counter_u64_fetch(numposhits);
1110 neghits = counter_u64_fetch(numneghits);
1111 miss = counter_u64_fetch(nummiss);
1112 total = poshits + neghits + miss;
1113
1114 pct = 0;
1115 if (total != 0)
1116 pct = ((poshits + neghits) * 100) / total;
1117 return (sysctl_handle_int(oidp, 0, pct, req));
1118 }
1119 SYSCTL_PROC(_vfs_cache_stats, OID_AUTO, hitpct,
1120 CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_hitpct,
1121 "I", "Percentage of hits");
1122
1123 static void
1124 cache_recalc_neg_min(void)
1125 {
1126
1127 neg_min = (ncsize * ncnegminpct) / 100;
1128 }
1129
1130 static int
1131 sysctl_negminpct(SYSCTL_HANDLER_ARGS)
1132 {
1133 u_int val;
1134 int error;
1135
1136 val = ncnegminpct;
1137 error = sysctl_handle_int(oidp, &val, 0, req);
1138 if (error != 0 || req->newptr == NULL)
1139 return (error);
1140
1141 if (val == ncnegminpct)
1142 return (0);
1143 if (val < 0 || val > 99)
1144 return (EINVAL);
1145 ncnegminpct = val;
1146 cache_recalc_neg_min();
1147 return (0);
1148 }
1149
1150 SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct,
1151 CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct,
1152 "I", "Negative entry \% of namecache capacity above which automatic eviction is allowed");
1153
1154 #ifdef DEBUG_CACHE
1155 /*
1156 * Grab an atomic snapshot of the name cache hash chain lengths
1157 */
1158 static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
1159 CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
1160 "hash table stats");
1161
1162 static int
1163 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
1164 {
1165 struct nchashhead *ncpp;
1166 struct namecache *ncp;
1167 int i, error, n_nchash, *cntbuf;
1168
1169 retry:
1170 n_nchash = nchash + 1; /* nchash is max index, not count */
1171 if (req->oldptr == NULL)
1172 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
1173 cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
1174 cache_lock_all_buckets();
1175 if (n_nchash != nchash + 1) {
1176 cache_unlock_all_buckets();
1177 free(cntbuf, M_TEMP);
1178 goto retry;
1179 }
1180 /* Scan hash tables counting entries */
1181 for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
1182 CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
1183 cntbuf[i]++;
1184 cache_unlock_all_buckets();
1185 for (error = 0, i = 0; i < n_nchash; i++)
1186 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
1187 break;
1188 free(cntbuf, M_TEMP);
1189 return (error);
1190 }
1191 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
1192 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
1193 "nchash chain lengths");
1194
1195 static int
1196 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
1197 {
1198 int error;
1199 struct nchashhead *ncpp;
1200 struct namecache *ncp;
1201 int n_nchash;
1202 int count, maxlength, used, pct;
1203
1204 if (!req->oldptr)
1205 return SYSCTL_OUT(req, 0, 4 * sizeof(int));
1206
1207 cache_lock_all_buckets();
1208 n_nchash = nchash + 1; /* nchash is max index, not count */
1209 used = 0;
1210 maxlength = 0;
1211
1212 /* Scan hash tables for applicable entries */
1213 for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
1214 count = 0;
1215 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
1216 count++;
1217 }
1218 if (count)
1219 used++;
1220 if (maxlength < count)
1221 maxlength = count;
1222 }
1223 n_nchash = nchash + 1;
1224 cache_unlock_all_buckets();
1225 pct = (used * 100) / (n_nchash / 100);
1226 error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
1227 if (error)
1228 return (error);
1229 error = SYSCTL_OUT(req, &used, sizeof(used));
1230 if (error)
1231 return (error);
1232 error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
1233 if (error)
1234 return (error);
1235 error = SYSCTL_OUT(req, &pct, sizeof(pct));
1236 if (error)
1237 return (error);
1238 return (0);
1239 }
1240 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
1241 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
1242 "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
1243 #endif
1244
1245 /*
1246 * Negative entries management
1247 *
1248 * Various workloads create plenty of negative entries and barely use them
1249 * afterwards. Moreover malicious users can keep performing bogus lookups
1250 * adding even more entries. For example "make tinderbox" as of writing this
1251 * comment ends up with 2.6M namecache entries in total, 1.2M of which are
1252 * negative.
1253 *
1254 * As such, a rather aggressive eviction method is needed. The currently
1255 * employed method is a placeholder.
1256 *
1257 * Entries are split over numneglists separate lists, each of which is further
1258 * split into hot and cold entries. Entries get promoted after getting a hit.
1259 * Eviction happens on addition of new entry.
1260 */
1261 static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1262 "Name cache negative entry statistics");
1263
1264 SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0,
1265 "Number of negative cache entries");
1266
1267 static COUNTER_U64_DEFINE_EARLY(neg_created);
1268 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created,
1269 "Number of created negative entries");
1270
1271 static COUNTER_U64_DEFINE_EARLY(neg_evicted);
1272 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted,
1273 "Number of evicted negative entries");
1274
1275 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty);
1276 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD,
1277 &neg_evict_skipped_empty,
1278 "Number of times evicting failed due to lack of entries");
1279
1280 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed);
1281 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD,
1282 &neg_evict_skipped_missed,
1283 "Number of times evicting failed due to target entry disappearing");
1284
1285 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended);
1286 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD,
1287 &neg_evict_skipped_contended,
1288 "Number of times evicting failed due to contention");
1289
1290 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits,
1291 "Number of cache hits (negative)");
1292
1293 static int
1294 sysctl_neg_hot(SYSCTL_HANDLER_ARGS)
1295 {
1296 int i, out;
1297
1298 out = 0;
1299 for (i = 0; i < numneglists; i++)
1300 out += neglists[i].nl_hotnum;
1301
1302 return (SYSCTL_OUT(req, &out, sizeof(out)));
1303 }
1304 SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD |
1305 CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I",
1306 "Number of hot negative entries");
1307
1308 static void
1309 cache_neg_init(struct namecache *ncp)
1310 {
1311 struct negstate *ns;
1312
1313 ncp->nc_flag |= NCF_NEGATIVE;
1314 ns = NCP2NEGSTATE(ncp);
1315 ns->neg_flag = 0;
1316 ns->neg_hit = 0;
1317 counter_u64_add(neg_created, 1);
1318 }
1319
1320 #define CACHE_NEG_PROMOTION_THRESH 2
1321
1322 static bool
1323 cache_neg_hit_prep(struct namecache *ncp)
1324 {
1325 struct negstate *ns;
1326 u_char n;
1327
1328 ns = NCP2NEGSTATE(ncp);
1329 n = atomic_load_char(&ns->neg_hit);
1330 for (;;) {
1331 if (n >= CACHE_NEG_PROMOTION_THRESH)
1332 return (false);
1333 if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1))
1334 break;
1335 }
1336 return (n + 1 == CACHE_NEG_PROMOTION_THRESH);
1337 }
1338
1339 /*
1340 * Nothing to do here but it is provided for completeness as some
1341 * cache_neg_hit_prep callers may end up returning without even
1342 * trying to promote.
1343 */
1344 #define cache_neg_hit_abort(ncp) do { } while (0)
1345
1346 static void
1347 cache_neg_hit_finish(struct namecache *ncp)
1348 {
1349
1350 SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name);
1351 counter_u64_add(numneghits, 1);
1352 }
1353
1354 /*
1355 * Move a negative entry to the hot list.
1356 */
1357 static void
1358 cache_neg_promote_locked(struct namecache *ncp)
1359 {
1360 struct neglist *nl;
1361 struct negstate *ns;
1362
1363 ns = NCP2NEGSTATE(ncp);
1364 nl = NCP2NEGLIST(ncp);
1365 mtx_assert(&nl->nl_lock, MA_OWNED);
1366 if ((ns->neg_flag & NEG_HOT) == 0) {
1367 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
1368 TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst);
1369 nl->nl_hotnum++;
1370 ns->neg_flag |= NEG_HOT;
1371 }
1372 }
1373
1374 /*
1375 * Move a hot negative entry to the cold list.
1376 */
1377 static void
1378 cache_neg_demote_locked(struct namecache *ncp)
1379 {
1380 struct neglist *nl;
1381 struct negstate *ns;
1382
1383 ns = NCP2NEGSTATE(ncp);
1384 nl = NCP2NEGLIST(ncp);
1385 mtx_assert(&nl->nl_lock, MA_OWNED);
1386 MPASS(ns->neg_flag & NEG_HOT);
1387 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
1388 TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
1389 nl->nl_hotnum--;
1390 ns->neg_flag &= ~NEG_HOT;
1391 atomic_store_char(&ns->neg_hit, 0);
1392 }
1393
1394 /*
1395 * Move a negative entry to the hot list if it matches the lookup.
1396 *
1397 * We have to take locks, but they may be contended and in the worst
1398 * case we may need to go off CPU. We don't want to spin within the
1399 * smr section and we can't block with it. Exiting the section means
1400 * the found entry could have been evicted. We are going to look it
1401 * up again.
1402 */
1403 static bool
1404 cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp,
1405 struct namecache *oncp, uint32_t hash)
1406 {
1407 struct namecache *ncp;
1408 struct neglist *nl;
1409 u_char nc_flag;
1410
1411 nl = NCP2NEGLIST(oncp);
1412
1413 mtx_lock(&nl->nl_lock);
1414 /*
1415 * For hash iteration.
1416 */
1417 vfs_smr_enter();
1418
1419 /*
1420 * Avoid all surprises by only succeeding if we got the same entry and
1421 * bailing completely otherwise.
1422 * XXX There are no provisions to keep the vnode around, meaning we may
1423 * end up promoting a negative entry for a *new* vnode and returning
1424 * ENOENT on its account. This is the error we want to return anyway
1425 * and promotion is harmless.
1426 *
1427 * In particular at this point there can be a new ncp which matches the
1428 * search but hashes to a different neglist.
1429 */
1430 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1431 if (ncp == oncp)
1432 break;
1433 }
1434
1435 /*
1436 * No match to begin with.
1437 */
1438 if (__predict_false(ncp == NULL)) {
1439 goto out_abort;
1440 }
1441
1442 /*
1443 * The newly found entry may be something different...
1444 */
1445 if (!cache_ncp_match(ncp, dvp, cnp)) {
1446 goto out_abort;
1447 }
1448
1449 /*
1450 * ... and not even negative.
1451 */
1452 nc_flag = atomic_load_char(&ncp->nc_flag);
1453 if ((nc_flag & NCF_NEGATIVE) == 0) {
1454 goto out_abort;
1455 }
1456
1457 if (!cache_ncp_canuse(ncp)) {
1458 goto out_abort;
1459 }
1460
1461 cache_neg_promote_locked(ncp);
1462 cache_neg_hit_finish(ncp);
1463 vfs_smr_exit();
1464 mtx_unlock(&nl->nl_lock);
1465 return (true);
1466 out_abort:
1467 vfs_smr_exit();
1468 mtx_unlock(&nl->nl_lock);
1469 return (false);
1470 }
1471
1472 static void
1473 cache_neg_promote(struct namecache *ncp)
1474 {
1475 struct neglist *nl;
1476
1477 nl = NCP2NEGLIST(ncp);
1478 mtx_lock(&nl->nl_lock);
1479 cache_neg_promote_locked(ncp);
1480 mtx_unlock(&nl->nl_lock);
1481 }
1482
1483 static void
1484 cache_neg_insert(struct namecache *ncp)
1485 {
1486 struct neglist *nl;
1487
1488 MPASS(ncp->nc_flag & NCF_NEGATIVE);
1489 cache_assert_bucket_locked(ncp);
1490 nl = NCP2NEGLIST(ncp);
1491 mtx_lock(&nl->nl_lock);
1492 TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
1493 mtx_unlock(&nl->nl_lock);
1494 atomic_add_long(&numneg, 1);
1495 }
1496
1497 static void
1498 cache_neg_remove(struct namecache *ncp)
1499 {
1500 struct neglist *nl;
1501 struct negstate *ns;
1502
1503 cache_assert_bucket_locked(ncp);
1504 nl = NCP2NEGLIST(ncp);
1505 ns = NCP2NEGSTATE(ncp);
1506 mtx_lock(&nl->nl_lock);
1507 if ((ns->neg_flag & NEG_HOT) != 0) {
1508 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
1509 nl->nl_hotnum--;
1510 } else {
1511 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
1512 }
1513 mtx_unlock(&nl->nl_lock);
1514 atomic_subtract_long(&numneg, 1);
1515 }
1516
1517 static struct neglist *
1518 cache_neg_evict_select_list(void)
1519 {
1520 struct neglist *nl;
1521 u_int c;
1522
1523 c = atomic_fetchadd_int(&neg_cycle, 1) + 1;
1524 nl = &neglists[c % numneglists];
1525 if (!mtx_trylock(&nl->nl_evict_lock)) {
1526 counter_u64_add(neg_evict_skipped_contended, 1);
1527 return (NULL);
1528 }
1529 return (nl);
1530 }
1531
1532 static struct namecache *
1533 cache_neg_evict_select_entry(struct neglist *nl)
1534 {
1535 struct namecache *ncp, *lncp;
1536 struct negstate *ns, *lns;
1537 int i;
1538
1539 mtx_assert(&nl->nl_evict_lock, MA_OWNED);
1540 mtx_assert(&nl->nl_lock, MA_OWNED);
1541 ncp = TAILQ_FIRST(&nl->nl_list);
1542 if (ncp == NULL)
1543 return (NULL);
1544 lncp = ncp;
1545 lns = NCP2NEGSTATE(lncp);
1546 for (i = 1; i < 4; i++) {
1547 ncp = TAILQ_NEXT(ncp, nc_dst);
1548 if (ncp == NULL)
1549 break;
1550 ns = NCP2NEGSTATE(ncp);
1551 if (ns->neg_hit < lns->neg_hit) {
1552 lncp = ncp;
1553 lns = ns;
1554 }
1555 }
1556 return (lncp);
1557 }
1558
1559 static bool
1560 cache_neg_evict(void)
1561 {
1562 struct namecache *ncp, *ncp2;
1563 struct neglist *nl;
1564 struct vnode *dvp;
1565 struct mtx *dvlp;
1566 struct mtx *blp;
1567 uint32_t hash;
1568 u_char nlen;
1569 bool evicted;
1570
1571 nl = cache_neg_evict_select_list();
1572 if (nl == NULL) {
1573 return (false);
1574 }
1575
1576 mtx_lock(&nl->nl_lock);
1577 ncp = TAILQ_FIRST(&nl->nl_hotlist);
1578 if (ncp != NULL) {
1579 cache_neg_demote_locked(ncp);
1580 }
1581 ncp = cache_neg_evict_select_entry(nl);
1582 if (ncp == NULL) {
1583 counter_u64_add(neg_evict_skipped_empty, 1);
1584 mtx_unlock(&nl->nl_lock);
1585 mtx_unlock(&nl->nl_evict_lock);
1586 return (false);
1587 }
1588 nlen = ncp->nc_nlen;
1589 dvp = ncp->nc_dvp;
1590 hash = cache_get_hash(ncp->nc_name, nlen, dvp);
1591 dvlp = VP2VNODELOCK(dvp);
1592 blp = HASH2BUCKETLOCK(hash);
1593 mtx_unlock(&nl->nl_lock);
1594 mtx_unlock(&nl->nl_evict_lock);
1595 mtx_lock(dvlp);
1596 mtx_lock(blp);
1597 /*
1598 * Note that since all locks were dropped above, the entry may be
1599 * gone or reallocated to be something else.
1600 */
1601 CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) {
1602 if (ncp2 == ncp && ncp2->nc_dvp == dvp &&
1603 ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0)
1604 break;
1605 }
1606 if (ncp2 == NULL) {
1607 counter_u64_add(neg_evict_skipped_missed, 1);
1608 ncp = NULL;
1609 evicted = false;
1610 } else {
1611 MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp));
1612 MPASS(blp == NCP2BUCKETLOCK(ncp));
1613 SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp,
1614 ncp->nc_name);
1615 cache_zap_locked(ncp);
1616 counter_u64_add(neg_evicted, 1);
1617 evicted = true;
1618 }
1619 mtx_unlock(blp);
1620 mtx_unlock(dvlp);
1621 if (ncp != NULL)
1622 cache_free(ncp);
1623 return (evicted);
1624 }
1625
1626 /*
1627 * Maybe evict a negative entry to create more room.
1628 *
1629 * The ncnegfactor parameter limits what fraction of the total count
1630 * can comprise of negative entries. However, if the cache is just
1631 * warming up this leads to excessive evictions. As such, ncnegminpct
1632 * (recomputed to neg_min) dictates whether the above should be
1633 * applied.
1634 *
1635 * Try evicting if the cache is close to full capacity regardless of
1636 * other considerations.
1637 */
1638 static bool
1639 cache_neg_evict_cond(u_long lnumcache)
1640 {
1641 u_long lnumneg;
1642
1643 if (ncsize - 1000 < lnumcache)
1644 goto out_evict;
1645 lnumneg = atomic_load_long(&numneg);
1646 if (lnumneg < neg_min)
1647 return (false);
1648 if (lnumneg * ncnegfactor < lnumcache)
1649 return (false);
1650 out_evict:
1651 return (cache_neg_evict());
1652 }
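/*
 * Worked example with illustrative numbers only (not defaults taken from
 * this file): with a current total of 100000 entries and ncnegfactor = 5,
 * eviction is attempted once negative entries reach 100000 / 5 = 20000,
 * provided the neg_min floor derived from ncnegminpct has also been
 * exceeded; independently of those ratios, eviction is attempted whenever
 * the total count comes within 1000 entries of ncsize.
 */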
1653
1654 /*
1655 * cache_zap_locked():
1656 *
1657 * Removes a namecache entry from cache, whether it contains an actual
1658 * pointer to a vnode or if it is just a negative cache entry.
1659 */
1660 static void
1661 cache_zap_locked(struct namecache *ncp)
1662 {
1663 struct nchashhead *ncpp;
1664 struct vnode *dvp, *vp;
1665
1666 dvp = ncp->nc_dvp;
1667 vp = ncp->nc_vp;
1668
1669 if (!(ncp->nc_flag & NCF_NEGATIVE))
1670 cache_assert_vnode_locked(vp);
1671 cache_assert_vnode_locked(dvp);
1672 cache_assert_bucket_locked(ncp);
1673
1674 cache_ncp_invalidate(ncp);
1675
1676 ncpp = NCP2BUCKET(ncp);
1677 CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
1678 if (!(ncp->nc_flag & NCF_NEGATIVE)) {
1679 SDT_PROBE3(vfs, namecache, zap, done, dvp, ncp->nc_name, vp);
1680 TAILQ_REMOVE(&vp->v_cache_dst, ncp, nc_dst);
1681 if (ncp == vp->v_cache_dd) {
1682 atomic_store_ptr(&vp->v_cache_dd, NULL);
1683 }
1684 } else {
1685 SDT_PROBE2(vfs, namecache, zap_negative, done, dvp, ncp->nc_name);
1686 cache_neg_remove(ncp);
1687 }
1688 if (ncp->nc_flag & NCF_ISDOTDOT) {
1689 if (ncp == dvp->v_cache_dd) {
1690 atomic_store_ptr(&dvp->v_cache_dd, NULL);
1691 }
1692 } else {
1693 LIST_REMOVE(ncp, nc_src);
1694 if (LIST_EMPTY(&dvp->v_cache_src)) {
1695 ncp->nc_flag |= NCF_DVDROP;
1696 }
1697 }
1698 }
1699
1700 static void
1701 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
1702 {
1703 struct mtx *blp;
1704
1705 MPASS(ncp->nc_dvp == vp);
1706 MPASS(ncp->nc_flag & NCF_NEGATIVE);
1707 cache_assert_vnode_locked(vp);
1708
1709 blp = NCP2BUCKETLOCK(ncp);
1710 mtx_lock(blp);
1711 cache_zap_locked(ncp);
1712 mtx_unlock(blp);
1713 }
1714
1715 static bool
1716 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
1717 struct mtx **vlpp)
1718 {
1719 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
1720 struct mtx *blp;
1721
1722 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
1723 cache_assert_vnode_locked(vp);
1724
1725 if (ncp->nc_flag & NCF_NEGATIVE) {
1726 if (*vlpp != NULL) {
1727 mtx_unlock(*vlpp);
1728 *vlpp = NULL;
1729 }
1730 cache_zap_negative_locked_vnode_kl(ncp, vp);
1731 return (true);
1732 }
1733
1734 pvlp = VP2VNODELOCK(vp);
1735 blp = NCP2BUCKETLOCK(ncp);
1736 vlp1 = VP2VNODELOCK(ncp->nc_dvp);
1737 vlp2 = VP2VNODELOCK(ncp->nc_vp);
1738
1739 if (*vlpp == vlp1 || *vlpp == vlp2) {
1740 to_unlock = *vlpp;
1741 *vlpp = NULL;
1742 } else {
1743 if (*vlpp != NULL) {
1744 mtx_unlock(*vlpp);
1745 *vlpp = NULL;
1746 }
1747 cache_sort_vnodes(&vlp1, &vlp2);
1748 if (vlp1 == pvlp) {
1749 mtx_lock(vlp2);
1750 to_unlock = vlp2;
1751 } else {
1752 if (!mtx_trylock(vlp1))
1753 goto out_relock;
1754 to_unlock = vlp1;
1755 }
1756 }
1757 mtx_lock(blp);
1758 cache_zap_locked(ncp);
1759 mtx_unlock(blp);
1760 if (to_unlock != NULL)
1761 mtx_unlock(to_unlock);
1762 return (true);
1763
1764 out_relock:
1765 mtx_unlock(vlp2);
1766 mtx_lock(vlp1);
1767 mtx_lock(vlp2);
1768 MPASS(*vlpp == NULL);
1769 *vlpp = vlp1;
1770 return (false);
1771 }
1772
1773 /*
1774 * If trylocking failed, we can get here. We know enough to take all needed locks
1775 * in the right order and re-lookup the entry.
1776 */
1777 static int
1778 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1779 struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
1780 struct mtx *blp)
1781 {
1782 struct namecache *rncp;
1783 struct mtx *rvlp;
1784
1785 cache_assert_bucket_unlocked(ncp);
1786
1787 cache_sort_vnodes(&dvlp, &vlp);
1788 cache_lock_vnodes(dvlp, vlp);
1789 mtx_lock(blp);
1790 CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
1791 if (rncp == ncp && cache_ncp_match(rncp, dvp, cnp))
1792 break;
1793 }
1794 if (rncp == NULL)
1795 goto out_mismatch;
1796
1797 if (!(ncp->nc_flag & NCF_NEGATIVE))
1798 rvlp = VP2VNODELOCK(rncp->nc_vp);
1799 else
1800 rvlp = NULL;
1801 if (rvlp != vlp)
1802 goto out_mismatch;
1803
1804 cache_zap_locked(rncp);
1805 mtx_unlock(blp);
1806 cache_unlock_vnodes(dvlp, vlp);
1807 atomic_add_long(&zap_bucket_relock_success, 1);
1808 return (0);
1809
1810 out_mismatch:
1811 mtx_unlock(blp);
1812 cache_unlock_vnodes(dvlp, vlp);
1813 return (EAGAIN);
1814 }
1815
1816 static int __noinline
1817 cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp,
1818 uint32_t hash, struct mtx *blp)
1819 {
1820 struct mtx *dvlp, *vlp;
1821 struct vnode *dvp;
1822
1823 cache_assert_bucket_locked(ncp);
1824
1825 dvlp = VP2VNODELOCK(ncp->nc_dvp);
1826 vlp = NULL;
1827 if (!(ncp->nc_flag & NCF_NEGATIVE))
1828 vlp = VP2VNODELOCK(ncp->nc_vp);
1829 if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1830 cache_zap_locked(ncp);
1831 mtx_unlock(blp);
1832 cache_unlock_vnodes(dvlp, vlp);
1833 return (0);
1834 }
1835
1836 dvp = ncp->nc_dvp;
1837 mtx_unlock(blp);
1838 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1839 }
1840
1841 static __noinline int
1842 cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
1843 {
1844 struct namecache *ncp;
1845 struct mtx *blp;
1846 struct mtx *dvlp, *dvlp2;
1847 uint32_t hash;
1848 int error;
1849
1850 if (cnp->cn_namelen == 2 &&
1851 cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
1852 dvlp = VP2VNODELOCK(dvp);
1853 dvlp2 = NULL;
1854 mtx_lock(dvlp);
1855 retry_dotdot:
1856 ncp = dvp->v_cache_dd;
1857 if (ncp == NULL) {
1858 mtx_unlock(dvlp);
1859 if (dvlp2 != NULL)
1860 mtx_unlock(dvlp2);
1861 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1862 return (0);
1863 }
1864 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1865 if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2))
1866 goto retry_dotdot;
1867 MPASS(dvp->v_cache_dd == NULL);
1868 mtx_unlock(dvlp);
1869 if (dvlp2 != NULL)
1870 mtx_unlock(dvlp2);
1871 cache_free(ncp);
1872 } else {
1873 atomic_store_ptr(&dvp->v_cache_dd, NULL);
1874 mtx_unlock(dvlp);
1875 if (dvlp2 != NULL)
1876 mtx_unlock(dvlp2);
1877 }
1878 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1879 return (1);
1880 }
1881
1882 /*
1883 * XXX note that access here is completely unlocked with no provisions
1884 * to keep the hash allocated. If one is sufficiently unlucky a
1885 * parallel cache resize can reallocate the hash, unmap backing pages
1886 * and cause the empty check below to fault.
1887 *
1888 * Fixing this has epsilon priority, but can be done with no overhead
1889 * for this codepath with sufficient effort.
1890 */
1891 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1892 blp = HASH2BUCKETLOCK(hash);
1893 retry:
1894 if (CK_SLIST_EMPTY(NCHHASH(hash)))
1895 goto out_no_entry;
1896
1897 mtx_lock(blp);
1898 ncp = cache_ncp_find(dvp, cnp, hash);
1899 if (ncp == NULL) {
1900 mtx_unlock(blp);
1901 goto out_no_entry;
1902 }
1903
1904 error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
1905 if (__predict_false(error != 0)) {
1906 atomic_add_long(&zap_bucket_fail, 1);
1907 goto retry;
1908 }
1909 counter_u64_add(numposzaps, 1);
1910 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1911 cache_free(ncp);
1912 return (1);
1913 out_no_entry:
1914 counter_u64_add(nummisszap, 1);
1915 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1916 return (0);
1917 }
1918
1919 static int __noinline
1920 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1921 struct timespec *tsp, int *ticksp)
1922 {
1923 int ltype;
1924
1925 *vpp = dvp;
1926 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
1927 if (tsp != NULL)
1928 timespecclear(tsp);
1929 if (ticksp != NULL)
1930 *ticksp = ticks;
1931 vrefact(*vpp);
1932 /*
1933 * When we look up "." we can still be asked to lock it
1934 * differently...
1935 */
1936 ltype = cnp->cn_lkflags & LK_TYPE_MASK;
1937 if (ltype != VOP_ISLOCKED(*vpp)) {
1938 if (ltype == LK_EXCLUSIVE) {
1939 vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
1940 if (VN_IS_DOOMED((*vpp))) {
1941 /* forced unmount */
1942 vrele(*vpp);
1943 *vpp = NULL;
1944 return (ENOENT);
1945 }
1946 } else
1947 vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
1948 }
1949 return (-1);
1950 }
1951
1952 static int __noinline
1953 cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1954 struct timespec *tsp, int *ticksp)
1955 {
1956 struct namecache_ts *ncp_ts;
1957 struct namecache *ncp;
1958 struct mtx *dvlp;
1959 enum vgetstate vs;
1960 int error, ltype;
1961 bool whiteout;
1962
1963 MPASS((cnp->cn_flags & ISDOTDOT) != 0);
1964
1965 if ((cnp->cn_flags & MAKEENTRY) == 0) {
1966 cache_remove_cnp(dvp, cnp);
1967 return (0);
1968 }
1969
1970 retry:
1971 dvlp = VP2VNODELOCK(dvp);
1972 mtx_lock(dvlp);
1973 ncp = dvp->v_cache_dd;
1974 if (ncp == NULL) {
1975 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, "..");
1976 mtx_unlock(dvlp);
1977 return (0);
1978 }
1979 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1980 if (ncp->nc_flag & NCF_NEGATIVE)
1981 *vpp = NULL;
1982 else
1983 *vpp = ncp->nc_vp;
1984 } else
1985 *vpp = ncp->nc_dvp;
1986 if (*vpp == NULL)
1987 goto negative_success;
1988 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp);
1989 cache_out_ts(ncp, tsp, ticksp);
1990 if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
1991 NCF_DTS && tsp != NULL) {
1992 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1993 *tsp = ncp_ts->nc_dotdottime;
1994 }
1995
1996 MPASS(dvp != *vpp);
1997 ltype = VOP_ISLOCKED(dvp);
1998 VOP_UNLOCK(dvp);
1999 vs = vget_prep(*vpp);
2000 mtx_unlock(dvlp);
2001 error = vget_finish(*vpp, cnp->cn_lkflags, vs);
2002 vn_lock(dvp, ltype | LK_RETRY);
2003 if (VN_IS_DOOMED(dvp)) {
2004 if (error == 0)
2005 vput(*vpp);
2006 *vpp = NULL;
2007 return (ENOENT);
2008 }
2009 if (error) {
2010 *vpp = NULL;
2011 goto retry;
2012 }
2013 return (-1);
2014 negative_success:
2015 if (__predict_false(cnp->cn_nameiop == CREATE)) {
2016 if (cnp->cn_flags & ISLASTCN) {
2017 counter_u64_add(numnegzaps, 1);
2018 cache_zap_negative_locked_vnode_kl(ncp, dvp);
2019 mtx_unlock(dvlp);
2020 cache_free(ncp);
2021 return (0);
2022 }
2023 }
2024
2025 whiteout = (ncp->nc_flag & NCF_WHITE);
2026 cache_out_ts(ncp, tsp, ticksp);
2027 if (cache_neg_hit_prep(ncp))
2028 cache_neg_promote(ncp);
2029 else
2030 cache_neg_hit_finish(ncp);
2031 mtx_unlock(dvlp);
2032 if (whiteout)
2033 cnp->cn_flags |= ISWHITEOUT;
2034 return (ENOENT);
2035 }
2036
2037 /**
2038 * Look up a name in the name cache
2039 *
2040 * # Arguments
2041 *
2042 * - dvp: Parent directory in which to search.
2043 * - vpp: Return argument. Will contain desired vnode on cache hit.
2044 * - cnp: Parameters of the name search. The most interesting bits of
2045 * the cn_flags field have the following meanings:
2046 * - MAKEENTRY: If clear, free an entry from the cache rather than look
2047 * it up.
2048 * - ISDOTDOT: Must be set if and only if cn_nameptr == ".."
2049 * - tsp: Return storage for cache timestamp. On a successful (positive
2050 * or negative) lookup, tsp will be filled with any timespec that
2051 * was stored when this cache entry was created. However, it will
2052 * be clear for "." entries.
2053 * - ticksp: Return storage for alternate cache timestamp. On a successful
2054 * (positive or negative) lookup, it will contain the ticks value
2055 * that was current when the cache entry was created, unless cnp
2056 * was ".".
2057 *
2058 * Either both tsp and ticks have to be provided or neither of them.
2059 *
2060 * # Returns
2061 *
2062 * - -1: A positive cache hit. vpp will contain the desired vnode.
2063 * - ENOENT: A negative cache hit, or dvp was recycled out from under us due
2064 * to a forced unmount. vpp will not be modified. If the entry
2065 * is a whiteout, then the ISWHITEOUT flag will be set in
2066 * cnp->cn_flags.
2067 * - 0: A cache miss. vpp will not be modified.
2068 *
2069 * # Locking
2070 *
2071 * On a cache hit, vpp will be returned locked and ref'd. If we're looking up
2072 * .., dvp is unlocked. If we're looking up . an extra ref is taken, but the
2073 * lock is not recursively acquired.
2074 */
2075 static int __noinline
2076 cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
2077 struct timespec *tsp, int *ticksp)
2078 {
2079 struct namecache *ncp;
2080 struct mtx *blp;
2081 uint32_t hash;
2082 enum vgetstate vs;
2083 int error;
2084 bool whiteout;
2085
2086 MPASS((cnp->cn_flags & ISDOTDOT) == 0);
2087 MPASS((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) != 0);
2088
2089 retry:
2090 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
2091 blp = HASH2BUCKETLOCK(hash);
2092 mtx_lock(blp);
2093
2094 ncp = cache_ncp_find(dvp, cnp, hash);
2095 if (__predict_false(ncp == NULL)) {
2096 mtx_unlock(blp);
2097 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr);
2098 counter_u64_add(nummiss, 1);
2099 return (0);
2100 }
2101
2102 if (ncp->nc_flag & NCF_NEGATIVE)
2103 goto negative_success;
2104
2105 counter_u64_add(numposhits, 1);
2106 *vpp = ncp->nc_vp;
2107 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
2108 cache_out_ts(ncp, tsp, ticksp);
2109 MPASS(dvp != *vpp);
2110 vs = vget_prep(*vpp);
2111 mtx_unlock(blp);
2112 error = vget_finish(*vpp, cnp->cn_lkflags, vs);
2113 if (error) {
2114 *vpp = NULL;
2115 goto retry;
2116 }
2117 return (-1);
2118 negative_success:
2119 /*
2120 * We don't get here with regular lookup apart from corner cases.
2121 */
2122 if (__predict_true(cnp->cn_nameiop == CREATE)) {
2123 if (cnp->cn_flags & ISLASTCN) {
2124 counter_u64_add(numnegzaps, 1);
2125 error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
2126 if (__predict_false(error != 0)) {
2127 atomic_add_long(&zap_bucket_fail2, 1);
2128 goto retry;
2129 }
2130 cache_free(ncp);
2131 return (0);
2132 }
2133 }
2134
2135 whiteout = (ncp->nc_flag & NCF_WHITE);
2136 cache_out_ts(ncp, tsp, ticksp);
2137 if (cache_neg_hit_prep(ncp))
2138 cache_neg_promote(ncp);
2139 else
2140 cache_neg_hit_finish(ncp);
2141 mtx_unlock(blp);
2142 if (whiteout)
2143 cnp->cn_flags |= ISWHITEOUT;
2144 return (ENOENT);
2145 }
2146
2147 int
2148 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
2149 struct timespec *tsp, int *ticksp)
2150 {
2151 struct namecache *ncp;
2152 uint32_t hash;
2153 enum vgetstate vs;
2154 int error;
2155 bool whiteout, neg_promote;
2156 u_short nc_flag;
2157
2158 MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL));
2159
2160 #ifdef DEBUG_CACHE
2161 if (__predict_false(!doingcache)) {
2162 cnp->cn_flags &= ~MAKEENTRY;
2163 return (0);
2164 }
2165 #endif
2166
2167 if (__predict_false(cnp->cn_nameptr[0] == '.')) {
2168 if (cnp->cn_namelen == 1)
2169 return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
2170 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.')
2171 return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp));
2172 }
2173
2174 MPASS((cnp->cn_flags & ISDOTDOT) == 0);
2175
2176 if ((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) == 0) {
2177 cache_remove_cnp(dvp, cnp);
2178 return (0);
2179 }
2180
2181 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
2182 vfs_smr_enter();
2183
2184 ncp = cache_ncp_find(dvp, cnp, hash);
2185 if (__predict_false(ncp == NULL)) {
2186 vfs_smr_exit();
2187 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr);
2188 counter_u64_add(nummiss, 1);
2189 return (0);
2190 }
2191
2192 nc_flag = atomic_load_char(&ncp->nc_flag);
2193 if (nc_flag & NCF_NEGATIVE)
2194 goto negative_success;
2195
2196 counter_u64_add(numposhits, 1);
2197 *vpp = ncp->nc_vp;
2198 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
2199 cache_out_ts(ncp, tsp, ticksp);
2200 MPASS(dvp != *vpp);
2201 if (!cache_ncp_canuse(ncp)) {
2202 vfs_smr_exit();
2203 *vpp = NULL;
2204 goto out_fallback;
2205 }
2206 vs = vget_prep_smr(*vpp);
2207 vfs_smr_exit();
2208 if (__predict_false(vs == VGET_NONE)) {
2209 *vpp = NULL;
2210 goto out_fallback;
2211 }
2212 error = vget_finish(*vpp, cnp->cn_lkflags, vs);
2213 if (error) {
2214 *vpp = NULL;
2215 goto out_fallback;
2216 }
2217 return (-1);
2218 negative_success:
2219 if (cnp->cn_nameiop == CREATE) {
2220 if (cnp->cn_flags & ISLASTCN) {
2221 vfs_smr_exit();
2222 goto out_fallback;
2223 }
2224 }
2225
2226 cache_out_ts(ncp, tsp, ticksp);
2227 whiteout = (atomic_load_char(&ncp->nc_flag) & NCF_WHITE);
2228 neg_promote = cache_neg_hit_prep(ncp);
2229 if (!cache_ncp_canuse(ncp)) {
2230 cache_neg_hit_abort(ncp);
2231 vfs_smr_exit();
2232 goto out_fallback;
2233 }
2234 if (neg_promote) {
2235 vfs_smr_exit();
2236 if (!cache_neg_promote_cond(dvp, cnp, ncp, hash))
2237 goto out_fallback;
2238 } else {
2239 cache_neg_hit_finish(ncp);
2240 vfs_smr_exit();
2241 }
2242 if (whiteout)
2243 cnp->cn_flags |= ISWHITEOUT;
2244 return (ENOENT);
2245 out_fallback:
2246 return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp));
2247 }
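/*
 * Hedged usage sketch (not compiled): how a caller might interpret the
 * return values documented above. example_lookup() is a hypothetical name;
 * vfs_cache_lookup() later in this file is the canonical in-tree user.
 */
#if 0
static int
example_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)
{
	int error;

	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
	switch (error) {
	case -1:	/* positive hit; *vpp is returned locked and referenced */
		return (0);
	case ENOENT:	/* negative hit; the name is known not to exist */
		return (ENOENT);
	case 0:		/* miss; fall back to the filesystem's own lookup */
		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
	default:
		return (error);
	}
}
#endif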
2248
2249 struct celockstate {
2250 struct mtx *vlp[3];
2251 struct mtx *blp[2];
2252 };
2253 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
2254 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
2255
2256 static inline void
2257 cache_celockstate_init(struct celockstate *cel)
2258 {
2259
2260 bzero(cel, sizeof(*cel));
2261 }
2262
2263 static void
2264 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
2265 struct vnode *dvp)
2266 {
2267 struct mtx *vlp1, *vlp2;
2268
2269 MPASS(cel->vlp[0] == NULL);
2270 MPASS(cel->vlp[1] == NULL);
2271 MPASS(cel->vlp[2] == NULL);
2272
2273 MPASS(vp != NULL || dvp != NULL);
2274
2275 vlp1 = VP2VNODELOCK(vp);
2276 vlp2 = VP2VNODELOCK(dvp);
2277 cache_sort_vnodes(&vlp1, &vlp2);
2278
2279 if (vlp1 != NULL) {
2280 mtx_lock(vlp1);
2281 cel->vlp[0] = vlp1;
2282 }
2283 mtx_lock(vlp2);
2284 cel->vlp[1] = vlp2;
2285 }
2286
2287 static void
2288 cache_unlock_vnodes_cel(struct celockstate *cel)
2289 {
2290
2291 MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
2292
2293 if (cel->vlp[0] != NULL)
2294 mtx_unlock(cel->vlp[0]);
2295 if (cel->vlp[1] != NULL)
2296 mtx_unlock(cel->vlp[1]);
2297 if (cel->vlp[2] != NULL)
2298 mtx_unlock(cel->vlp[2]);
2299 }
2300
2301 static bool
2302 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
2303 {
2304 struct mtx *vlp;
2305 bool ret;
2306
2307 cache_assert_vlp_locked(cel->vlp[0]);
2308 cache_assert_vlp_locked(cel->vlp[1]);
2309 MPASS(cel->vlp[2] == NULL);
2310
2311 MPASS(vp != NULL);
2312 vlp = VP2VNODELOCK(vp);
2313
2314 ret = true;
2315 if (vlp >= cel->vlp[1]) {
2316 mtx_lock(vlp);
2317 } else {
2318 if (mtx_trylock(vlp))
2319 goto out;
2320 cache_unlock_vnodes_cel(cel);
2321 atomic_add_long(&cache_lock_vnodes_cel_3_failures, 1);
2322 if (vlp < cel->vlp[0]) {
2323 mtx_lock(vlp);
2324 mtx_lock(cel->vlp[0]);
2325 mtx_lock(cel->vlp[1]);
2326 } else {
2327 if (cel->vlp[0] != NULL)
2328 mtx_lock(cel->vlp[0]);
2329 mtx_lock(vlp);
2330 mtx_lock(cel->vlp[1]);
2331 }
2332 ret = false;
2333 }
2334 out:
2335 cel->vlp[2] = vlp;
2336 return (ret);
2337 }
2338
2339 static void
2340 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1,
2341 struct mtx *blp2)
2342 {
2343
2344 MPASS(cel->blp[0] == NULL);
2345 MPASS(cel->blp[1] == NULL);
2346
2347 cache_sort_vnodes(&blp1, &blp2);
2348
2349 if (blp1 != NULL) {
2350 mtx_lock(blp1);
2351 cel->blp[0] = blp1;
2352 }
2353 mtx_lock(blp2);
2354 cel->blp[1] = blp2;
2355 }
2356
2357 static void
2358 cache_unlock_buckets_cel(struct celockstate *cel)
2359 {
2360
2361 if (cel->blp[0] != NULL)
2362 mtx_unlock(cel->blp[0]);
2363 mtx_unlock(cel->blp[1]);
2364 }
2365
2366 /*
2367 * Lock part of the cache affected by the insertion.
2368 *
2369 * This means vnodelocks for dvp, vp and the relevant bucketlock.
2370 * However, insertion can result in removal of an old entry. In this
2371 * case we have an additional vnode and bucketlock pair to lock.
2372 *
2373 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
2374 * preserving the locking order (smaller address first).
2375 */
2376 static void
2377 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
2378 uint32_t hash)
2379 {
2380 struct namecache *ncp;
2381 struct mtx *blps[2];
2382 u_char nc_flag;
2383
2384 blps[0] = HASH2BUCKETLOCK(hash);
2385 for (;;) {
2386 blps[1] = NULL;
2387 cache_lock_vnodes_cel(cel, dvp, vp);
2388 if (vp == NULL || vp->v_type != VDIR)
2389 break;
2390 ncp = atomic_load_consume_ptr(&vp->v_cache_dd);
2391 if (ncp == NULL)
2392 break;
2393 nc_flag = atomic_load_char(&ncp->nc_flag);
2394 if ((nc_flag & NCF_ISDOTDOT) == 0)
2395 break;
2396 MPASS(ncp->nc_dvp == vp);
2397 blps[1] = NCP2BUCKETLOCK(ncp);
2398 if ((nc_flag & NCF_NEGATIVE) != 0)
2399 break;
2400 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
2401 break;
2402 /*
2403 * All vnodes got re-locked. Re-validate the state and if
2404 * nothing changed we are done. Otherwise restart.
2405 */
2406 if (ncp == vp->v_cache_dd &&
2407 (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
2408 blps[1] == NCP2BUCKETLOCK(ncp) &&
2409 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
2410 break;
2411 cache_unlock_vnodes_cel(cel);
2412 cel->vlp[0] = NULL;
2413 cel->vlp[1] = NULL;
2414 cel->vlp[2] = NULL;
2415 }
2416 cache_lock_buckets_cel(cel, blps[0], blps[1]);
2417 }
2418
2419 static void
2420 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
2421 uint32_t hash)
2422 {
2423 struct namecache *ncp;
2424 struct mtx *blps[2];
2425 u_char nc_flag;
2426
2427 blps[0] = HASH2BUCKETLOCK(hash);
2428 for (;;) {
2429 blps[1] = NULL;
2430 cache_lock_vnodes_cel(cel, dvp, vp);
2431 ncp = atomic_load_consume_ptr(&dvp->v_cache_dd);
2432 if (ncp == NULL)
2433 break;
2434 nc_flag = atomic_load_char(&ncp->nc_flag);
2435 if ((nc_flag & NCF_ISDOTDOT) == 0)
2436 break;
2437 MPASS(ncp->nc_dvp == dvp);
2438 blps[1] = NCP2BUCKETLOCK(ncp);
2439 if ((nc_flag & NCF_NEGATIVE) != 0)
2440 break;
2441 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
2442 break;
2443 if (ncp == dvp->v_cache_dd &&
2444 (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
2445 blps[1] == NCP2BUCKETLOCK(ncp) &&
2446 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
2447 break;
2448 cache_unlock_vnodes_cel(cel);
2449 cel->vlp[0] = NULL;
2450 cel->vlp[1] = NULL;
2451 cel->vlp[2] = NULL;
2452 }
2453 cache_lock_buckets_cel(cel, blps[0], blps[1]);
2454 }
2455
2456 static void
2457 cache_enter_unlock(struct celockstate *cel)
2458 {
2459
2460 cache_unlock_buckets_cel(cel);
2461 cache_unlock_vnodes_cel(cel);
2462 }
2463
2464 static void __noinline
2465 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
2466 struct componentname *cnp)
2467 {
2468 struct celockstate cel;
2469 struct namecache *ncp;
2470 uint32_t hash;
2471 int len;
2472
2473 if (atomic_load_ptr(&dvp->v_cache_dd) == NULL)
2474 return;
2475 len = cnp->cn_namelen;
2476 cache_celockstate_init(&cel);
2477 hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
2478 cache_enter_lock_dd(&cel, dvp, vp, hash);
2479 ncp = dvp->v_cache_dd;
2480 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
2481 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
2482 cache_zap_locked(ncp);
2483 } else {
2484 ncp = NULL;
2485 }
2486 atomic_store_ptr(&dvp->v_cache_dd, NULL);
2487 cache_enter_unlock(&cel);
2488 if (ncp != NULL)
2489 cache_free(ncp);
2490 }
2491
2492 /*
2493 * Add an entry to the cache.
2494 */
2495 void
2496 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
2497 struct timespec *tsp, struct timespec *dtsp)
2498 {
2499 struct celockstate cel;
2500 struct namecache *ncp, *n2, *ndd;
2501 struct namecache_ts *ncp_ts;
2502 uint32_t hash;
2503 int flag;
2504 int len;
2505
2506 KASSERT(cnp->cn_namelen <= NAME_MAX,
2507 ("%s: passed len %ld exceeds NAME_MAX (%d)", __func__, cnp->cn_namelen,
2508 NAME_MAX));
2509 VNPASS(!VN_IS_DOOMED(dvp), dvp);
2510 VNPASS(dvp->v_type != VNON, dvp);
2511 if (vp != NULL) {
2512 VNPASS(!VN_IS_DOOMED(vp), vp);
2513 VNPASS(vp->v_type != VNON, vp);
2514 }
2515 if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') {
2516 KASSERT(dvp == vp,
2517 ("%s: different vnodes for dot entry (%p; %p)\n", __func__,
2518 dvp, vp));
2519 } else {
2520 KASSERT(dvp != vp,
2521 ("%s: same vnode for non-dot entry [%s] (%p)\n", __func__,
2522 cnp->cn_nameptr, dvp));
2523 }
2524
2525 #ifdef DEBUG_CACHE
2526 if (__predict_false(!doingcache))
2527 return;
2528 #endif
2529
2530 flag = 0;
2531 if (__predict_false(cnp->cn_nameptr[0] == '.')) {
2532 if (cnp->cn_namelen == 1)
2533 return;
2534 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
2535 cache_enter_dotdot_prep(dvp, vp, cnp);
2536 flag = NCF_ISDOTDOT;
2537 }
2538 }
2539
2540 ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
2541 if (ncp == NULL)
2542 return;
2543
2544 cache_celockstate_init(&cel);
2545 ndd = NULL;
2546 ncp_ts = NULL;
2547
2548 /*
2549 * Calculate the hash key and setup as much of the new
2550 * namecache entry as possible before acquiring the lock.
2551 */
2552 ncp->nc_flag = flag | NCF_WIP;
2553 ncp->nc_vp = vp;
2554 if (vp == NULL)
2555 cache_neg_init(ncp);
2556 ncp->nc_dvp = dvp;
2557 if (tsp != NULL) {
2558 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
2559 ncp_ts->nc_time = *tsp;
2560 ncp_ts->nc_ticks = ticks;
2561 ncp_ts->nc_nc.nc_flag |= NCF_TS;
2562 if (dtsp != NULL) {
2563 ncp_ts->nc_dotdottime = *dtsp;
2564 ncp_ts->nc_nc.nc_flag |= NCF_DTS;
2565 }
2566 }
2567 len = ncp->nc_nlen = cnp->cn_namelen;
2568 hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
2569 memcpy(ncp->nc_name, cnp->cn_nameptr, len);
2570 ncp->nc_name[len] = '\0';
2571 cache_enter_lock(&cel, dvp, vp, hash);
2572
2573 /*
2574 * See if this vnode or negative entry is already in the cache
2575 * with this name. This can happen with concurrent lookups of
2576 * the same path name.
2577 */
2578 n2 = cache_ncp_find(dvp, cnp, hash);
2579 if (n2 != NULL) {
2580 MPASS(cache_ncp_canuse(n2));
2581 if ((n2->nc_flag & NCF_NEGATIVE) != 0)
2582 KASSERT(vp == NULL,
2583 ("%s: found entry pointing to a different vnode "
2584 "(%p != %p); name [%s]",
2585 __func__, NULL, vp, cnp->cn_nameptr));
2586 else
2587 KASSERT(n2->nc_vp == vp,
2588 ("%s: found entry pointing to a different vnode "
2589 "(%p != %p); name [%s]",
2590 __func__, n2->nc_vp, vp, cnp->cn_nameptr));
2591 /*
2592 * Entries are supposed to be immutable unless in the
2593 * process of getting destroyed. Accommodating
2594 * changing timestamps is possible but not worth it.
2595 * This should be harmless in terms of correctness, in
2596 * the worst case resulting in an earlier expiration.
2597 * Alternatively, the found entry can be replaced
2598 * altogether.
2599 */
2600 MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) ==
2601 (ncp->nc_flag & (NCF_TS | NCF_DTS)));
2602 #if 0
2603 if (tsp != NULL) {
2604 KASSERT((n2->nc_flag & NCF_TS) != 0,
2605 ("no NCF_TS"));
2606 n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
2607 n2_ts->nc_time = ncp_ts->nc_time;
2608 n2_ts->nc_ticks = ncp_ts->nc_ticks;
2609 if (dtsp != NULL) {
2610 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
2611 n2_ts->nc_nc.nc_flag |= NCF_DTS;
2612 }
2613 }
2614 #endif
2615 SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name,
2616 vp);
2617 goto out_unlock_free;
2618 }
2619
2620 if (flag == NCF_ISDOTDOT) {
2621 /*
2622 * See if we are trying to add a ".." entry, but some other lookup
2623 * has already populated the v_cache_dd pointer.
2624 */
2625 if (dvp->v_cache_dd != NULL)
2626 goto out_unlock_free;
2627 KASSERT(vp == NULL || vp->v_type == VDIR,
2628 ("wrong vnode type %p", vp));
2629 atomic_thread_fence_rel();
2630 atomic_store_ptr(&dvp->v_cache_dd, ncp);
2631 } else if (vp != NULL) {
2632 /*
2633 * Take the slow path in INOTIFY(). This flag will be lazily
2634 * cleared by cache_vop_inotify() once all directories referring
2635 * to vp are unwatched.
2636 */
2637 if (__predict_false((vn_irflag_read(dvp) & VIRF_INOTIFY) != 0))
2638 vn_irflag_set_cond(vp, VIRF_INOTIFY_PARENT);
2639
2640 /*
2641 * For this case, the cache entry maps both the
2642 * directory name in it and the name ".." for the
2643 * directory's parent.
2644 */
2645 if ((ndd = vp->v_cache_dd) != NULL) {
2646 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
2647 cache_zap_locked(ndd);
2648 else
2649 ndd = NULL;
2650 }
2651 atomic_thread_fence_rel();
2652 atomic_store_ptr(&vp->v_cache_dd, ncp);
2653 }
2654
2655 if (flag != NCF_ISDOTDOT) {
2656 if (LIST_EMPTY(&dvp->v_cache_src)) {
2657 cache_hold_vnode(dvp);
2658 }
2659 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
2660 }
2661
2662 /*
2663 * If the entry is "negative", we place it into the
2664 * "negative" cache queue, otherwise, we place it into the
2665 * destination vnode's cache entries queue.
2666 */
2667 if (vp != NULL) {
2668 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
2669 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
2670 vp);
2671 } else {
2672 if (cnp->cn_flags & ISWHITEOUT)
2673 atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_WHITE);
2674 cache_neg_insert(ncp);
2675 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
2676 ncp->nc_name);
2677 }
2678
2679 /*
2680 * Insert the new namecache entry into the appropriate chain
2681 * within the cache entries table.
2682 */
2683 CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
2684
2685 atomic_thread_fence_rel();
2686 /*
2687 * Mark the entry as fully constructed.
2688 * It is immutable past this point until its removal.
2689 */
2690 atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP);
2691
2692 cache_enter_unlock(&cel);
2693 if (ndd != NULL)
2694 cache_free(ndd);
2695 return;
2696 out_unlock_free:
2697 cache_enter_unlock(&cel);
2698 cache_free(ncp);
2699 return;
2700 }
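/*
 * Hedged sketch (not compiled): the typical pattern on the filesystem side
 * is to add an entry only when the lookup asked for one via MAKEENTRY, for
 * example at the end of a successful VOP_CACHEDLOOKUP implementation.
 * cache_enter() is the timestamp-less wrapper used elsewhere in this file.
 */
#if 0
	if ((cnp->cn_flags & MAKEENTRY) != 0)
		cache_enter(dvp, *vpp, cnp);
#endif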
2701
2702 /*
2703 * A variant of the above accepting flags.
2704 *
2705 * - VFS_CACHE_DROPOLD -- if a conflicting entry is found, drop it.
2706 *
2707 * TODO: this routine is a hack. It blindly removes the old entry, even if it
2708 * happens to match, and it does so in an inefficient manner. It was added
2709 * to accommodate NFS which runs into a case where the target for a given name
2710 * may change from under it. Note this does nothing to solve the following
2711 * race: 2 callers of cache_enter_time_flags pass a different target vnode for
2712 * the same [dvp, cnp]. It may be argued that code doing this is broken.
2713 */
2714 void
2715 cache_enter_time_flags(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
2716 struct timespec *tsp, struct timespec *dtsp, int flags)
2717 {
2718
2719 MPASS((flags & ~(VFS_CACHE_DROPOLD)) == 0);
2720
2721 if (flags & VFS_CACHE_DROPOLD)
2722 cache_remove_cnp(dvp, cnp);
2723 cache_enter_time(dvp, vp, cnp, tsp, dtsp);
2724 }
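/*
 * Hedged sketch (not compiled): a caller in the NFS-like situation described
 * above would pass VFS_CACHE_DROPOLD so that a stale entry for the same name
 * is dropped before the new target is entered. "newvp" is a hypothetical
 * placeholder for the replacement vnode.
 */
#if 0
	cache_enter_time_flags(dvp, newvp, cnp, NULL, NULL, VFS_CACHE_DROPOLD);
#endif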
2725
2726 static u_long
2727 cache_roundup_2(u_long val)
2728 {
2729 u_long res;
2730
2731 for (res = 1; res <= val; res <<= 1)
2732 continue;
2733
2734 return (res);
2735 }
2736
2737 static struct nchashhead *
2738 nchinittbl(u_long elements, u_long *hashmask)
2739 {
2740 struct nchashhead *hashtbl;
2741 u_long hashsize, i;
2742
2743 hashsize = cache_roundup_2(elements) / 2;
2744
2745 hashtbl = malloc(hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK);
2746 for (i = 0; i < hashsize; i++)
2747 CK_SLIST_INIT(&hashtbl[i]);
2748 *hashmask = hashsize - 1;
2749 return (hashtbl);
2750 }
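/*
 * Worked example with made-up numbers: for elements = 1000,
 * cache_roundup_2() returns 1024 (the smallest power of two greater than
 * its argument), so the table is sized to 512 buckets and *hashmask becomes
 * 511, suitable for the usual "hash & mask" indexing.
 */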
2751
2752 static void
2753 ncfreetbl(struct nchashhead *hashtbl)
2754 {
2755
2756 free(hashtbl, M_VFSCACHE);
2757 }
2758
2759 /*
2760 * Name cache initialization, from vfs_init() when we are booting
2761 */
2762 static void
2763 nchinit(void *dummy __unused)
2764 {
2765 u_int i;
2766
2767 cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE,
2768 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2769 cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE,
2770 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2771 cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE,
2772 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2773 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE,
2774 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2775
2776 VFS_SMR_ZONE_SET(cache_zone_small);
2777 VFS_SMR_ZONE_SET(cache_zone_small_ts);
2778 VFS_SMR_ZONE_SET(cache_zone_large);
2779 VFS_SMR_ZONE_SET(cache_zone_large_ts);
2780
2781 ncsize = desiredvnodes * ncsizefactor;
2782 cache_recalc_neg_min();
2783 nchashtbl = nchinittbl(ncsize, &nchash);
2784 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
2785 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
2786 ncbuckethash = 7;
2787 if (ncbuckethash > nchash)
2788 ncbuckethash = nchash;
2789 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
2790 M_WAITOK | M_ZERO);
2791 for (i = 0; i < numbucketlocks; i++)
2792 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE);
2793 ncvnodehash = ncbuckethash;
2794 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
2795 M_WAITOK | M_ZERO);
2796 for (i = 0; i < numvnodelocks; i++)
2797 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
2798
2799 for (i = 0; i < numneglists; i++) {
2800 mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF);
2801 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
2802 TAILQ_INIT(&neglists[i].nl_list);
2803 TAILQ_INIT(&neglists[i].nl_hotlist);
2804 }
2805 }
2806 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
2807
2808 void
2809 cache_vnode_init(struct vnode *vp)
2810 {
2811
2812 LIST_INIT(&vp->v_cache_src);
2813 TAILQ_INIT(&vp->v_cache_dst);
2814 vp->v_cache_dd = NULL;
2815 cache_prehash(vp);
2816 }
2817
2818 /*
2819 * Induce transient cache misses for lockless operation in cache_lookup() by
2820 * using a temporary hash table.
2821 *
2822 * This will force a fs lookup.
2823 *
2824 * Synchronisation is done in 2 steps, calling vfs_smr_synchronize each time
2825 * to wait until all CPUs are observed not performing the lookup.
2826 */
2827 static void
2828 cache_changesize_set_temp(struct nchashhead *temptbl, u_long temphash)
2829 {
2830
2831 MPASS(temphash < nchash);
2832 /*
2833 * Change the size. The new size is smaller and can safely be used
2834 * against the existing table. All lookups which now hash wrong will
2835 * result in a cache miss, which all callers are supposed to know how
2836 * to handle.
2837 */
2838 atomic_store_long(&nchash, temphash);
2839 atomic_thread_fence_rel();
2840 vfs_smr_synchronize();
2841 /*
2842 * At this point everyone sees the updated hash value, but they still
2843 * see the old table.
2844 */
2845 atomic_store_ptr(&nchashtbl, temptbl);
2846 atomic_thread_fence_rel();
2847 vfs_smr_synchronize();
2848 /*
2849 * At this point everyone sees the updated table pointer and size pair.
2850 */
2851 }
2852
2853 /*
2854 * Set the new hash table.
2855 *
2856 * Similarly to cache_changesize_set_temp(), this has to synchronize against
2857 * lockless operation in cache_lookup().
2858 */
2859 static void
2860 cache_changesize_set_new(struct nchashhead *new_tbl, u_long new_hash)
2861 {
2862
2863 MPASS(nchash < new_hash);
2864 /*
2865 * Change the pointer first. This won't result in out-of-bounds access
2866 * since the temporary table is guaranteed to be smaller.
2867 */
2868 atomic_store_ptr(&nchashtbl, new_tbl);
2869 atomic_thread_fence_rel();
2870 vfs_smr_synchronize();
2871 /*
2872 * At this point everyone sees the updated pointer value, but they
2873 * still see the old size.
2874 */
2875 atomic_store_long(&nchash, new_hash);
2876 atomic_thread_fence_rel();
2877 vfs_smr_synchronize();
2878 /*
2879 * At this point everyone sees the updated table pointer and size pair.
2880 */
2881 }
2882
2883 void
2884 cache_changesize(u_long newmaxvnodes)
2885 {
2886 struct nchashhead *new_nchashtbl, *old_nchashtbl, *temptbl;
2887 u_long new_nchash, old_nchash, temphash;
2888 struct namecache *ncp;
2889 uint32_t hash;
2890 u_long newncsize;
2891 u_long i;
2892
2893 newncsize = newmaxvnodes * ncsizefactor;
2894 newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
2895 if (newmaxvnodes < numbucketlocks)
2896 newmaxvnodes = numbucketlocks;
2897
2898 new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash);
2899 /* If same hash table size, nothing to do */
2900 if (nchash == new_nchash) {
2901 ncfreetbl(new_nchashtbl);
2902 return;
2903 }
2904
2905 temptbl = nchinittbl(1, &temphash);
2906
2907 /*
2908 * Move everything from the old hash table to the new table.
2909 * None of the namecache entries in the table can be removed
2910 * because to do so, they have to be removed from the hash table.
2911 */
2912 cache_lock_all_vnodes();
2913 cache_lock_all_buckets();
2914 old_nchashtbl = nchashtbl;
2915 old_nchash = nchash;
2916 cache_changesize_set_temp(temptbl, temphash);
2917 for (i = 0; i <= old_nchash; i++) {
2918 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) {
2919 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
2920 ncp->nc_dvp);
2921 CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash);
2922 CK_SLIST_INSERT_HEAD(&new_nchashtbl[hash & new_nchash], ncp, nc_hash);
2923 }
2924 }
2925 ncsize = newncsize;
2926 cache_recalc_neg_min();
2927 cache_changesize_set_new(new_nchashtbl, new_nchash);
2928 cache_unlock_all_buckets();
2929 cache_unlock_all_vnodes();
2930 ncfreetbl(old_nchashtbl);
2931 ncfreetbl(temptbl);
2932 }
2933
2934 /*
2935 * Remove all entries from and to a particular vnode.
2936 */
2937 static void
2938 cache_purge_impl(struct vnode *vp)
2939 {
2940 struct cache_freebatch batch;
2941 struct namecache *ncp;
2942 struct mtx *vlp, *vlp2;
2943
2944 TAILQ_INIT(&batch);
2945 vlp = VP2VNODELOCK(vp);
2946 vlp2 = NULL;
2947 mtx_lock(vlp);
2948 retry:
2949 while (!LIST_EMPTY(&vp->v_cache_src)) {
2950 ncp = LIST_FIRST(&vp->v_cache_src);
2951 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2952 goto retry;
2953 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2954 }
2955 while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
2956 ncp = TAILQ_FIRST(&vp->v_cache_dst);
2957 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2958 goto retry;
2959 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2960 }
2961 ncp = vp->v_cache_dd;
2962 if (ncp != NULL) {
2963 KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
2964 ("lost dotdot link"));
2965 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2966 goto retry;
2967 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2968 }
2969 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
2970 mtx_unlock(vlp);
2971 if (vlp2 != NULL)
2972 mtx_unlock(vlp2);
2973 cache_free_batch(&batch);
2974 }
2975
2976 /*
2977 * Opportunistic check to see if there is anything to do.
2978 */
2979 static bool
2980 cache_has_entries(struct vnode *vp)
2981 {
2982
2983 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2984 atomic_load_ptr(&vp->v_cache_dd) == NULL)
2985 return (false);
2986 return (true);
2987 }
2988
2989 void
2990 cache_purge(struct vnode *vp)
2991 {
2992
2993 SDT_PROBE1(vfs, namecache, purge, done, vp);
2994 if (!cache_has_entries(vp))
2995 return;
2996 cache_purge_impl(vp);
2997 }
2998
2999 /*
3000 * Only to be used by vgone.
3001 */
3002 void
3003 cache_purge_vgone(struct vnode *vp)
3004 {
3005 struct mtx *vlp;
3006
3007 VNPASS(VN_IS_DOOMED(vp), vp);
3008 if (cache_has_entries(vp)) {
3009 cache_purge_impl(vp);
3010 return;
3011 }
3012
3013 /*
3014 * Serialize against a potential thread doing cache_purge.
3015 */
3016 vlp = VP2VNODELOCK(vp);
3017 mtx_wait_unlocked(vlp);
3018 if (cache_has_entries(vp)) {
3019 cache_purge_impl(vp);
3020 return;
3021 }
3022 return;
3023 }
3024
3025 /*
3026 * Remove all negative entries for a particular directory vnode.
3027 */
3028 void
3029 cache_purge_negative(struct vnode *vp)
3030 {
3031 struct cache_freebatch batch;
3032 struct namecache *ncp, *nnp;
3033 struct mtx *vlp;
3034
3035 SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
3036 if (LIST_EMPTY(&vp->v_cache_src))
3037 return;
3038 TAILQ_INIT(&batch);
3039 vlp = VP2VNODELOCK(vp);
3040 mtx_lock(vlp);
3041 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
3042 if (!(ncp->nc_flag & NCF_NEGATIVE))
3043 continue;
3044 cache_zap_negative_locked_vnode_kl(ncp, vp);
3045 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
3046 }
3047 mtx_unlock(vlp);
3048 cache_free_batch(&batch);
3049 }
3050
3051 /*
3052 * Entry points for modifying VOP operations.
3053 */
3054 void
3055 cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
3056 struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp)
3057 {
3058
3059 ASSERT_VOP_IN_SEQC(fdvp);
3060 ASSERT_VOP_IN_SEQC(fvp);
3061 ASSERT_VOP_IN_SEQC(tdvp);
3062 if (tvp != NULL)
3063 ASSERT_VOP_IN_SEQC(tvp);
3064
3065 cache_purge(fvp);
3066 if (tvp != NULL) {
3067 cache_purge(tvp);
3068 KASSERT(!cache_remove_cnp(tdvp, tcnp),
3069 ("%s: lingering negative entry", __func__));
3070 } else {
3071 cache_remove_cnp(tdvp, tcnp);
3072 }
3073
3074 /*
3075 * TODO
3076 *
3077 * Historically renaming always purged all relevant entries,
3078 * but that's quite wasteful. In particular it turns out that in many cases
3079 * the target file is immediately accessed after rename, inducing a cache
3080 * miss.
3081 *
3082 * Recode this to reduce relocking and reuse the existing entry (if any)
3083 * instead of just removing it above and allocating a new one here.
3084 */
3085 cache_enter(tdvp, fvp, tcnp);
3086 }
3087
3088 void
3089 cache_vop_rmdir(struct vnode *dvp, struct vnode *vp)
3090 {
3091
3092 ASSERT_VOP_IN_SEQC(dvp);
3093 ASSERT_VOP_IN_SEQC(vp);
3094 cache_purge(vp);
3095 }
3096
3097 #ifdef INVARIANTS
3098 /*
3099 * Validate that if an entry exists it matches.
3100 */
3101 void
3102 cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
3103 {
3104 struct namecache *ncp;
3105 struct mtx *blp;
3106 uint32_t hash;
3107
3108 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
3109 if (CK_SLIST_EMPTY(NCHHASH(hash)))
3110 return;
3111 blp = HASH2BUCKETLOCK(hash);
3112 mtx_lock(blp);
3113 ncp = cache_ncp_find(dvp, cnp, hash);
3114 if (ncp != NULL && ncp->nc_vp != vp) {
3115 panic("%s: mismatch (%p != %p); ncp %p [%s] dvp %p\n",
3116 __func__, vp, ncp->nc_vp, ncp, ncp->nc_name, ncp->nc_dvp);
3117 }
3118 mtx_unlock(blp);
3119 }
3120
3121 void
3122 cache_assert_no_entries(struct vnode *vp)
3123 {
3124
3125 VNPASS(TAILQ_EMPTY(&vp->v_cache_dst), vp);
3126 VNPASS(LIST_EMPTY(&vp->v_cache_src), vp);
3127 VNPASS(vp->v_cache_dd == NULL, vp);
3128 }
3129 #endif
3130
3131 /*
3132 * Flush all entries referencing a particular filesystem.
3133 */
3134 void
3135 cache_purgevfs(struct mount *mp)
3136 {
3137 struct vnode *vp, *mvp;
3138 size_t visited __sdt_used, purged __sdt_used;
3139
3140 visited = purged = 0;
3141 /*
3142 * Somewhat wasteful iteration over all vnodes. Would be better to
3143 * support filtering and avoid the interlock to begin with.
3144 */
3145 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
3146 visited++;
3147 if (!cache_has_entries(vp)) {
3148 VI_UNLOCK(vp);
3149 continue;
3150 }
3151 vholdl(vp);
3152 VI_UNLOCK(vp);
3153 cache_purge(vp);
3154 purged++;
3155 vdrop(vp);
3156 }
3157
3158 SDT_PROBE3(vfs, namecache, purgevfs, done, mp, visited, purged);
3159 }
3160
3161 /*
3162 * Perform canonical checks and cache lookup and pass on to filesystem
3163 * through the vop_cachedlookup only if needed.
3164 */
3165
3166 int
3167 vfs_cache_lookup(struct vop_lookup_args *ap)
3168 {
3169 struct vnode *dvp;
3170 int error;
3171 struct vnode **vpp = ap->a_vpp;
3172 struct componentname *cnp = ap->a_cnp;
3173 int flags = cnp->cn_flags;
3174
3175 *vpp = NULL;
3176 dvp = ap->a_dvp;
3177
3178 if (dvp->v_type != VDIR)
3179 return (ENOTDIR);
3180
3181 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
3182 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
3183 return (EROFS);
3184
3185 error = vn_dir_check_exec(dvp, cnp);
3186 if (error != 0)
3187 return (error);
3188
3189 error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
3190 if (error == 0)
3191 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
3192 if (error == -1)
3193 return (0);
3194 return (error);
3195 }
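/*
 * Hedged sketch (not compiled): a filesystem typically plugs this routine in
 * as its vop_lookup entry and supplies its real directory scan as
 * vop_cachedlookup, so that cache hits never reach the filesystem. The names
 * below are illustrative, not taken from any particular filesystem.
 */
#if 0
struct vop_vector example_vnodeops = {
	.vop_default =		&default_vnodeops,
	.vop_lookup =		vfs_cache_lookup,
	.vop_cachedlookup =	example_dirlookup,
	/* ... */
};
#endif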
3196
3197 /* Implementation of the getcwd syscall. */
3198 int
3199 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
3200 {
3201 char *buf, *retbuf;
3202 size_t buflen;
3203 int error;
3204
3205 buflen = uap->buflen;
3206 if (__predict_false(buflen < 2))
3207 return (EINVAL);
3208 if (buflen > MAXPATHLEN)
3209 buflen = MAXPATHLEN;
3210
3211 buf = uma_zalloc(namei_zone, M_WAITOK);
3212 error = vn_getcwd(buf, &retbuf, &buflen);
3213 if (error == 0)
3214 error = copyout(retbuf, uap->buf, buflen);
3215 uma_zfree(namei_zone, buf);
3216 return (error);
3217 }
3218
3219 int
3220 vn_getcwd(char *buf, char **retbuf, size_t *buflen)
3221 {
3222 struct pwd *pwd;
3223 int error;
3224
3225 vfs_smr_enter();
3226 pwd = pwd_get_smr();
3227 error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf,
3228 buflen, 0);
3229 VFS_SMR_ASSERT_NOT_ENTERED();
3230 if (error < 0) {
3231 pwd = pwd_hold(curthread);
3232 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf,
3233 retbuf, buflen);
3234 pwd_drop(pwd);
3235 }
3236
3237 #ifdef KTRACE
3238 if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
3239 ktrnamei(*retbuf);
3240 #endif
3241 return (error);
3242 }
3243
3244 /*
3245 * Canonicalize a path by walking it forward and back.
3246 *
3247 * BUGS:
3248 * - Nothing guarantees the integrity of the entire chain. Consider the case
3249 * where the path "foo/bar/baz/qux" is passed, but "bar" is moved out of
3250 * "foo" into "quux" during the backwards walk. The result will be
3251 * "quux/bar/baz/qux", which could not have been obtained by an incremental
3252 * walk in userspace. Moreover, the path we return is inaccessible if the
3253 * calling thread lacks permission to traverse "quux".
3254 */
3255 static int
3256 kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
3257 size_t size, int flags, enum uio_seg pathseg)
3258 {
3259 struct nameidata nd;
3260 char *retbuf, *freebuf;
3261 int error;
3262
3263 if (flags != 0)
3264 return (EINVAL);
3265 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | WANTPARENT | AUDITVNODE1,
3266 pathseg, path, fd, &cap_fstat_rights);
3267 if ((error = namei(&nd)) != 0)
3268 return (error);
3269
3270 if (nd.ni_vp->v_type == VREG && nd.ni_dvp->v_type != VDIR &&
3271 (nd.ni_vp->v_vflag & VV_ROOT) != 0) {
3272 struct vnode *covered_vp;
3273
3274 /*
3275 * This happens if vp is a file mount. The call to
3276 * vn_fullpath_hardlink can panic if path resolution can't be
3277 * handled without the directory.
3278 *
3279 * To resolve this, we find the vnode which was mounted on -
3280 * this should have a unique global path since we disallow
3281 * mounting on linked files.
3282 */
3283 error = vn_lock(nd.ni_vp, LK_SHARED);
3284 if (error != 0)
3285 goto out;
3286 covered_vp = nd.ni_vp->v_mount->mnt_vnodecovered;
3287 vref(covered_vp);
3288 VOP_UNLOCK(nd.ni_vp);
3289 error = vn_fullpath(covered_vp, &retbuf, &freebuf);
3290 vrele(covered_vp);
3291 } else {
3292 error = vn_fullpath_hardlink(nd.ni_vp, nd.ni_dvp,
3293 nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, &retbuf,
3294 &freebuf, &size);
3295 }
3296 if (error == 0) {
3297 size_t len;
3298
3299 len = strlen(retbuf) + 1;
3300 if (size < len)
3301 error = ENAMETOOLONG;
3302 else if (pathseg == UIO_USERSPACE)
3303 error = copyout(retbuf, buf, len);
3304 else
3305 memcpy(buf, retbuf, len);
3306 free(freebuf, M_TEMP);
3307 }
3308 out:
3309 vrele(nd.ni_vp);
3310 vrele(nd.ni_dvp);
3311 NDFREE_PNBUF(&nd);
3312 return (error);
3313 }
3314
3315 int
3316 sys___realpathat(struct thread *td, struct __realpathat_args *uap)
3317 {
3318
3319 return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
3320 uap->flags, UIO_USERSPACE));
3321 }
3322
3323 /*
3324 * Retrieve the full filesystem path that corresponds to a vnode from the name
3325 * cache (if available)
3326 */
3327 int
3328 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf)
3329 {
3330 struct pwd *pwd;
3331 char *buf;
3332 size_t buflen;
3333 int error;
3334
3335 if (__predict_false(vp == NULL))
3336 return (EINVAL);
3337
3338 buflen = MAXPATHLEN;
3339 buf = malloc(buflen, M_TEMP, M_WAITOK);
3340 vfs_smr_enter();
3341 pwd = pwd_get_smr();
3342 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, 0);
3343 VFS_SMR_ASSERT_NOT_ENTERED();
3344 if (error < 0) {
3345 pwd = pwd_hold(curthread);
3346 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen);
3347 pwd_drop(pwd);
3348 }
3349 if (error == 0)
3350 *freebuf = buf;
3351 else
3352 free(buf, M_TEMP);
3353 return (error);
3354 }
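/*
 * Hedged usage sketch (not compiled), mirroring the pattern used by
 * kern___realpathat() above: the resolved path is read through retbuf while
 * freebuf is what actually has to be handed back to the allocator.
 */
#if 0
	char *retbuf, *freebuf;
	int error;

	error = vn_fullpath(vp, &retbuf, &freebuf);
	if (error == 0) {
		printf("path: %s\n", retbuf);
		free(freebuf, M_TEMP);
	}
#endif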
3355
3356 /*
3357 * This function is similar to vn_fullpath, but it attempts to lookup the
3358 * pathname relative to the global root mount point. This is required for the
3359 * auditing sub-system, as audited pathnames must be absolute, relative to the
3360 * global root mount point.
3361 */
3362 int
3363 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf)
3364 {
3365 char *buf;
3366 size_t buflen;
3367 int error;
3368
3369 if (__predict_false(vp == NULL))
3370 return (EINVAL);
3371 buflen = MAXPATHLEN;
3372 buf = malloc(buflen, M_TEMP, M_WAITOK);
3373 vfs_smr_enter();
3374 error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, 0);
3375 VFS_SMR_ASSERT_NOT_ENTERED();
3376 if (error < 0) {
3377 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen);
3378 }
3379 if (error == 0)
3380 *freebuf = buf;
3381 else
3382 free(buf, M_TEMP);
3383 return (error);
3384 }
3385
3386 static struct namecache *
3387 vn_dd_from_dst(struct vnode *vp)
3388 {
3389 struct namecache *ncp;
3390
3391 cache_assert_vnode_locked(vp);
3392 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) {
3393 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
3394 return (ncp);
3395 }
3396 return (NULL);
3397 }
3398
3399 int
3400 vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen)
3401 {
3402 struct vnode *dvp;
3403 struct namecache *ncp;
3404 struct mtx *vlp;
3405 int error;
3406
3407 vlp = VP2VNODELOCK(*vp);
3408 mtx_lock(vlp);
3409 ncp = (*vp)->v_cache_dd;
3410 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) {
3411 KASSERT(ncp == vn_dd_from_dst(*vp),
3412 ("%s: mismatch for dd entry (%p != %p)", __func__,
3413 ncp, vn_dd_from_dst(*vp)));
3414 } else {
3415 ncp = vn_dd_from_dst(*vp);
3416 }
3417 if (ncp != NULL) {
3418 if (*buflen < ncp->nc_nlen) {
3419 mtx_unlock(vlp);
3420 vrele(*vp);
3421 counter_u64_add(numfullpathfail4, 1);
3422 error = ENOMEM;
3423 SDT_PROBE3(vfs, namecache, fullpath, return, error,
3424 vp, NULL);
3425 return (error);
3426 }
3427 *buflen -= ncp->nc_nlen;
3428 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
3429 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
3430 ncp->nc_name, vp);
3431 dvp = *vp;
3432 *vp = ncp->nc_dvp;
3433 vref(*vp);
3434 mtx_unlock(vlp);
3435 vrele(dvp);
3436 return (0);
3437 }
3438 SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
3439
3440 mtx_unlock(vlp);
3441 vn_lock(*vp, LK_SHARED | LK_RETRY);
3442 error = VOP_VPTOCNP(*vp, &dvp, buf, buflen);
3443 vput(*vp);
3444 if (error) {
3445 counter_u64_add(numfullpathfail2, 1);
3446 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
3447 return (error);
3448 }
3449
3450 *vp = dvp;
3451 if (VN_IS_DOOMED(dvp)) {
3452 /* forced unmount */
3453 vrele(dvp);
3454 error = ENOENT;
3455 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
3456 return (error);
3457 }
3458 /*
3459 * *vp has its use count incremented still.
3460 */
3461
3462 return (0);
3463 }
3464
3465 /*
3466 * Resolve a directory to a pathname.
3467 *
3468 * The name of the directory can always be found in the namecache or fetched
3469 * from the filesystem. There is also guaranteed to be only one parent, meaning
3470 * we can just follow vnodes up until we find the root.
3471 *
3472 * The vnode must be referenced.
3473 */
3474 static int
3475 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
3476 size_t *len, size_t addend)
3477 {
3478 #ifdef KDTRACE_HOOKS
3479 struct vnode *startvp = vp;
3480 #endif
3481 struct vnode *vp1;
3482 size_t buflen;
3483 int error;
3484 bool slash_prefixed;
3485
3486 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
3487 VNPASS(vp->v_usecount > 0, vp);
3488
3489 buflen = *len;
3490
3491 slash_prefixed = true;
3492 if (addend == 0) {
3493 MPASS(*len >= 2);
3494 buflen--;
3495 buf[buflen] = '\0';
3496 slash_prefixed = false;
3497 }
3498
3499 error = 0;
3500
3501 SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
3502 counter_u64_add(numfullpathcalls, 1);
3503 while (vp != rdir && vp != rootvnode) {
3504 /*
3505 * The vp vnode must be already fully constructed,
3506 * since it is either found in namecache or obtained
3507 * from VOP_VPTOCNP(). We may test for VV_ROOT safely
3508 * without obtaining the vnode lock.
3509 */
3510 if ((vp->v_vflag & VV_ROOT) != 0) {
3511 vn_lock(vp, LK_RETRY | LK_SHARED);
3512
3513 /*
3514 * With the vnode locked, check for races with
3515 * unmount, forced or not. Note that we
3516 * already verified that vp is not equal to
3517 * the root vnode, which means that
3518 * mnt_vnodecovered can be NULL only for the
3519 * case of unmount.
3520 */
3521 if (VN_IS_DOOMED(vp) ||
3522 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
3523 vp1->v_mountedhere != vp->v_mount) {
3524 vput(vp);
3525 error = ENOENT;
3526 SDT_PROBE3(vfs, namecache, fullpath, return,
3527 error, vp, NULL);
3528 break;
3529 }
3530
3531 vref(vp1);
3532 vput(vp);
3533 vp = vp1;
3534 continue;
3535 }
3536 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
3537 error = vn_vptocnp(&vp, buf, &buflen);
3538 if (error)
3539 break;
3540 if (buflen == 0) {
3541 vrele(vp);
3542 error = ENOMEM;
3543 SDT_PROBE3(vfs, namecache, fullpath, return, error,
3544 startvp, NULL);
3545 break;
3546 }
3547 buf[--buflen] = '/';
3548 slash_prefixed = true;
3549 }
3550 if (error)
3551 return (error);
3552 if (!slash_prefixed) {
3553 if (buflen == 0) {
3554 vrele(vp);
3555 counter_u64_add(numfullpathfail4, 1);
3556 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
3557 startvp, NULL);
3558 return (ENOMEM);
3559 }
3560 buf[--buflen] = '/';
3561 }
3562 counter_u64_add(numfullpathfound, 1);
3563 vrele(vp);
3564
3565 *retbuf = buf + buflen;
3566 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
3567 *len -= buflen;
3568 *len += addend;
3569 return (0);
3570 }
3571
3572 /*
3573 * Resolve an arbitrary vnode to a pathname.
3574 *
3575 * Note 2 caveats:
3576 * - hardlinks are not tracked, thus if the vnode is not a directory this can
3577 * resolve to a different path than the one used to find it
3578 * - namecache is not mandatory, meaning names are not guaranteed to be added
3579 * (in which case resolving fails)
3580 */
3581 static void __inline
3582 cache_rev_failed_impl(int *reason, int line)
3583 {
3584
3585 *reason = line;
3586 }
3587 #define cache_rev_failed(var) cache_rev_failed_impl((var), __LINE__)
3588
3589 static int
3590 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
3591 char **retbuf, size_t *buflen, size_t addend)
3592 {
3593 #ifdef KDTRACE_HOOKS
3594 struct vnode *startvp = vp;
3595 #endif
3596 struct vnode *tvp;
3597 struct mount *mp;
3598 struct namecache *ncp;
3599 size_t orig_buflen;
3600 int reason;
3601 int error;
3602 #ifdef KDTRACE_HOOKS
3603 int i;
3604 #endif
3605 seqc_t vp_seqc, tvp_seqc;
3606 u_char nc_flag;
3607
3608 VFS_SMR_ASSERT_ENTERED();
3609
3610 if (!atomic_load_char(&cache_fast_lookup_enabled)) {
3611 vfs_smr_exit();
3612 return (-1);
3613 }
3614
3615 orig_buflen = *buflen;
3616
3617 if (addend == 0) {
3618 MPASS(*buflen >= 2);
3619 *buflen -= 1;
3620 buf[*buflen] = '\0';
3621 }
3622
3623 if (vp == rdir || vp == rootvnode) {
3624 if (addend == 0) {
3625 *buflen -= 1;
3626 buf[*buflen] = '/';
3627 }
3628 goto out_ok;
3629 }
3630
3631 #ifdef KDTRACE_HOOKS
3632 i = 0;
3633 #endif
3634 error = -1;
3635 ncp = NULL; /* for sdt probe down below */
3636 vp_seqc = vn_seqc_read_any(vp);
3637 if (seqc_in_modify(vp_seqc)) {
3638 cache_rev_failed(&reason);
3639 goto out_abort;
3640 }
3641
3642 for (;;) {
3643 #ifdef KDTRACE_HOOKS
3644 i++;
3645 #endif
3646 if ((vp->v_vflag & VV_ROOT) != 0) {
3647 mp = atomic_load_ptr(&vp->v_mount);
3648 if (mp == NULL) {
3649 cache_rev_failed(&reason);
3650 goto out_abort;
3651 }
3652 tvp = atomic_load_ptr(&mp->mnt_vnodecovered);
3653 tvp_seqc = vn_seqc_read_any(tvp);
3654 if (seqc_in_modify(tvp_seqc)) {
3655 cache_rev_failed(&reason);
3656 goto out_abort;
3657 }
3658 if (!vn_seqc_consistent(vp, vp_seqc)) {
3659 cache_rev_failed(&reason);
3660 goto out_abort;
3661 }
3662 vp = tvp;
3663 vp_seqc = tvp_seqc;
3664 continue;
3665 }
3666 ncp = atomic_load_consume_ptr(&vp->v_cache_dd);
3667 if (ncp == NULL) {
3668 cache_rev_failed(&reason);
3669 goto out_abort;
3670 }
3671 nc_flag = atomic_load_char(&ncp->nc_flag);
3672 if ((nc_flag & NCF_ISDOTDOT) != 0) {
3673 cache_rev_failed(&reason);
3674 goto out_abort;
3675 }
3676 if (ncp->nc_nlen >= *buflen) {
3677 cache_rev_failed(&reason);
3678 error = ENOMEM;
3679 goto out_abort;
3680 }
3681 *buflen -= ncp->nc_nlen;
3682 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
3683 *buflen -= 1;
3684 buf[*buflen] = '/';
3685 tvp = ncp->nc_dvp;
3686 tvp_seqc = vn_seqc_read_any(tvp);
3687 if (seqc_in_modify(tvp_seqc)) {
3688 cache_rev_failed(&reason);
3689 goto out_abort;
3690 }
3691 if (!vn_seqc_consistent(vp, vp_seqc)) {
3692 cache_rev_failed(&reason);
3693 goto out_abort;
3694 }
3695 /*
3696 * Acquire fence provided by vn_seqc_read_any above.
3697 */
3698 if (__predict_false(atomic_load_ptr(&vp->v_cache_dd) != ncp)) {
3699 cache_rev_failed(&reason);
3700 goto out_abort;
3701 }
3702 if (!cache_ncp_canuse(ncp)) {
3703 cache_rev_failed(&reason);
3704 goto out_abort;
3705 }
3706 vp = tvp;
3707 vp_seqc = tvp_seqc;
3708 if (vp == rdir || vp == rootvnode)
3709 break;
3710 }
3711 out_ok:
3712 vfs_smr_exit();
3713 *retbuf = buf + *buflen;
3714 *buflen = orig_buflen - *buflen + addend;
3715 SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf);
3716 return (0);
3717
3718 out_abort:
3719 *buflen = orig_buflen;
3720 SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i);
3721 vfs_smr_exit();
3722 return (error);
3723 }
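/*
 * The lockless reverse walk above follows the general vn_seqc pattern used
 * throughout this file.  A condensed sketch of that pattern, assuming a vnode
 * "vp" reachable while inside vfs_smr_enter()/vfs_smr_exit():
 *
 *	vp_seqc = vn_seqc_read_any(vp);
 *	if (seqc_in_modify(vp_seqc))
 *		bail out, the vnode is being modified;
 *	... read data hanging off vp (e.g. v_cache_dd, v_mount) ...
 *	if (!vn_seqc_consistent(vp, vp_seqc))
 *		bail out, the data read above may be stale;
 *	the snapshot is consistent and can be acted upon
 */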
3724
3725 static int
3726 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
3727 size_t *buflen)
3728 {
3729 size_t orig_buflen, addend;
3730 int error;
3731
3732 if (*buflen < 2)
3733 return (EINVAL);
3734
3735 orig_buflen = *buflen;
3736
3737 vref(vp);
3738 addend = 0;
3739 if (vp->v_type != VDIR) {
3740 *buflen -= 1;
3741 buf[*buflen] = '\0';
3742 error = vn_vptocnp(&vp, buf, buflen);
3743 if (error)
3744 return (error);
3745 if (*buflen == 0) {
3746 vrele(vp);
3747 return (ENOMEM);
3748 }
3749 *buflen -= 1;
3750 buf[*buflen] = '/';
3751 addend = orig_buflen - *buflen;
3752 }
3753
3754 return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, addend));
3755 }
3756
3757 /*
3758 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
3759 *
3760 * Since the namecache does not track hardlinks, the caller is expected to
3761 * first look up the target vnode with WANTPARENT flag passed to namei to get
3762 * dvp and vp.
3763 *
3764 * Then we have 2 cases:
3765 * - if the found vnode is a directory, the path can be constructed just by
3766 * following names up the chain
3767 * - otherwise we populate the buffer with the saved name and start resolving
3768 * from the parent
3769 */
3770 int
3771 vn_fullpath_hardlink(struct vnode *vp, struct vnode *dvp,
3772 const char *hrdl_name, size_t hrdl_name_length,
3773 char **retbuf, char **freebuf, size_t *buflen)
3774 {
3775 char *buf, *tmpbuf;
3776 struct pwd *pwd;
3777 size_t addend;
3778 int error;
3779 __enum_uint8(vtype) type;
3780
3781 if (*buflen < 2)
3782 return (EINVAL);
3783 if (*buflen > MAXPATHLEN)
3784 *buflen = MAXPATHLEN;
3785
3786 buf = malloc(*buflen, M_TEMP, M_WAITOK);
3787
3788 addend = 0;
3789
3790 /*
3791 * Check for VBAD to work around the vp_crossmp bug in lookup().
3792 *
3793 * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be
3794 * set to mount point's root vnode while ni_dvp will be vp_crossmp.
3795 * If the type is VDIR (like in this very case) we can skip looking
3796 * at ni_dvp in the first place. However, since vnodes get passed here
3797 * unlocked the target may transition to doomed state (type == VBAD)
3798 * before we get to evaluate the condition. If this happens, we will
3799 * populate part of the buffer and descend to vn_fullpath_dir with
3800 * vp == vp_crossmp. Prevent the problem by checking for VBAD.
3801 */
3802 type = atomic_load_8(&vp->v_type);
3803 if (type == VBAD) {
3804 error = ENOENT;
3805 goto out_bad;
3806 }
3807 if (type != VDIR) {
3808 addend = hrdl_name_length + 2;
3809 if (*buflen < addend) {
3810 error = ENOMEM;
3811 goto out_bad;
3812 }
3813 *buflen -= addend;
3814 tmpbuf = buf + *buflen;
3815 tmpbuf[0] = '/';
3816 memcpy(&tmpbuf[1], hrdl_name, hrdl_name_length);
3817 tmpbuf[addend - 1] = '\0';
3818 vp = dvp;
3819 }
3820
3821 vfs_smr_enter();
3822 pwd = pwd_get_smr();
3823 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3824 addend);
3825 VFS_SMR_ASSERT_NOT_ENTERED();
3826 if (error < 0) {
3827 pwd = pwd_hold(curthread);
3828 vref(vp);
3829 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3830 addend);
3831 pwd_drop(pwd);
3832 }
3833 if (error != 0)
3834 goto out_bad;
3835
3836 *freebuf = buf;
3837
3838 return (0);
3839 out_bad:
3840 free(buf, M_TEMP);
3841 return (error);
3842 }
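/*
 * Illustrative sketch (not compiled): obtaining dvp/vp as described above and
 * resolving the full path, loosely modeled on the kernel side of realpath(3).
 * "upath" is a hypothetical user-supplied path; locking and error handling
 * are abbreviated.
 *
 *	struct nameidata nd;
 *	char *retbuf, *freebuf;
 *	size_t buflen;
 *	int error;
 *
 *	NDINIT(&nd, LOOKUP, FOLLOW | WANTPARENT, UIO_USERSPACE, upath);
 *	error = namei(&nd);
 *	if (error != 0)
 *		return (error);
 *	buflen = MAXPATHLEN;
 *	error = vn_fullpath_hardlink(nd.ni_vp, nd.ni_dvp, nd.ni_cnd.cn_nameptr,
 *	    nd.ni_cnd.cn_namelen, &retbuf, &freebuf, &buflen);
 *	if (error == 0) {
 *		... use retbuf ...
 *		free(freebuf, M_TEMP);
 *	}
 *	vrele(nd.ni_vp);
 *	vrele(nd.ni_dvp);
 *	NDFREE_PNBUF(&nd);
 */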
3843
3844 struct vnode *
3845 vn_dir_dd_ino(struct vnode *vp)
3846 {
3847 struct namecache *ncp;
3848 struct vnode *ddvp;
3849 struct mtx *vlp;
3850 enum vgetstate vs;
3851
3852 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
3853 vlp = VP2VNODELOCK(vp);
3854 mtx_lock(vlp);
3855 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
3856 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
3857 continue;
3858 ddvp = ncp->nc_dvp;
3859 vs = vget_prep(ddvp);
3860 mtx_unlock(vlp);
3861 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
3862 return (NULL);
3863 return (ddvp);
3864 }
3865 mtx_unlock(vlp);
3866 return (NULL);
3867 }
3868
3869 int
3870 vn_commname(struct vnode *vp, char *buf, u_int buflen)
3871 {
3872 struct namecache *ncp;
3873 struct mtx *vlp;
3874 int l;
3875
3876 vlp = VP2VNODELOCK(vp);
3877 mtx_lock(vlp);
3878 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
3879 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
3880 break;
3881 if (ncp == NULL) {
3882 mtx_unlock(vlp);
3883 return (ENOENT);
3884 }
3885 l = min(ncp->nc_nlen, buflen - 1);
3886 memcpy(buf, ncp->nc_name, l);
3887 mtx_unlock(vlp);
3888 buf[l] = '\0';
3889 return (0);
3890 }
3891
3892 /*
3893 * This function updates the path string to the vnode's full global path
3894 * and checks the size of the new path string against the pathlen argument.
3895 *
3896 * Requires a locked, referenced vnode.
3897 * Vnode is re-locked on success or ENODEV, otherwise unlocked.
3898 *
3899 * If vp is a directory, the call to vn_fullpath_global() always succeeds
3900 * because it falls back to the ".." lookup if the namecache lookup fails.
3901 */
3902 int
3903 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
3904 u_int pathlen)
3905 {
3906 struct nameidata nd;
3907 struct vnode *vp1;
3908 char *rpath, *fbuf;
3909 int error;
3910
3911 ASSERT_VOP_ELOCKED(vp, __func__);
3912
3913 /* Construct global filesystem path from vp. */
3914 VOP_UNLOCK(vp);
3915 error = vn_fullpath_global(vp, &rpath, &fbuf);
3916
3917 if (error != 0) {
3918 vrele(vp);
3919 return (error);
3920 }
3921
3922 if (strlen(rpath) >= pathlen) {
3923 vrele(vp);
3924 error = ENAMETOOLONG;
3925 goto out;
3926 }
3927
3928 /*
3929 * Re-lookup the vnode by path to detect a possible rename.
3930 * As a side effect, the vnode is relocked.
3931 * If vnode was renamed, return ENOENT.
3932 */
3933 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, path);
3934 error = namei(&nd);
3935 if (error != 0) {
3936 vrele(vp);
3937 goto out;
3938 }
3939 NDFREE_PNBUF(&nd);
3940 vp1 = nd.ni_vp;
3941 vrele(vp);
3942 if (vp1 == vp)
3943 strcpy(path, rpath);
3944 else {
3945 vput(vp1);
3946 error = ENOENT;
3947 }
3948
3949 out:
3950 free(fbuf, M_TEMP);
3951 return (error);
3952 }
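/*
 * Illustrative sketch (not compiled): a caller holding a locked, referenced
 * vnode together with the path it used to reach it can canonicalize that path
 * in place with the routine above.  "path" and "pathlen" are hypothetical
 * caller-owned storage.
 *
 *	ASSERT_VOP_ELOCKED(vp, "caller");
 *	error = vn_path_to_global_path(curthread, vp, path, pathlen);
 *	if (error == 0)
 *		path now holds the global path and vp is locked again;
 *	else
 *		vp is no longer locked, see the locking comment above
 */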
3953
3954 /*
3955 * This is similar to vn_path_to_global_path but allows for regular
3956 * files which may not be present in the cache.
3957 *
3958 * Requires a locked, referenced vnode.
3959 * Vnode is re-locked on success or ENODEV, otherwise unlocked.
3960 */
3961 int
3962 vn_path_to_global_path_hardlink(struct thread *td, struct vnode *vp,
3963 struct vnode *dvp, char *path, u_int pathlen, const char *leaf_name,
3964 size_t leaf_length)
3965 {
3966 struct nameidata nd;
3967 struct vnode *vp1;
3968 char *rpath, *fbuf;
3969 size_t len;
3970 int error;
3971
3972 ASSERT_VOP_ELOCKED(vp, __func__);
3973
3974 /*
3975 * Construct global filesystem path from dvp, vp and leaf
3976 * name.
3977 */
3978 VOP_UNLOCK(vp);
3979 len = pathlen;
3980 error = vn_fullpath_hardlink(vp, dvp, leaf_name, leaf_length,
3981 &rpath, &fbuf, &len);
3982
3983 if (error != 0) {
3984 vrele(vp);
3985 return (error);
3986 }
3987
3988 if (strlen(rpath) >= pathlen) {
3989 vrele(vp);
3990 error = ENAMETOOLONG;
3991 goto out;
3992 }
3993
3994 /*
3995 * Re-lookup the vnode by path to detect a possible rename.
3996 * As a side effect, the vnode is relocked.
3997 * If vnode was renamed, return ENOENT.
3998 */
3999 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, path);
4000 error = namei(&nd);
4001 if (error != 0) {
4002 vrele(vp);
4003 goto out;
4004 }
4005 NDFREE_PNBUF(&nd);
4006 vp1 = nd.ni_vp;
4007 vrele(vp);
4008 if (vp1 == vp)
4009 strcpy(path, rpath);
4010 else {
4011 vput(vp1);
4012 error = ENOENT;
4013 }
4014
4015 out:
4016 free(fbuf, M_TEMP);
4017 return (error);
4018 }
4019
4020 void
4021 cache_vop_inotify(struct vnode *vp, int event, uint32_t cookie)
4022 {
4023 struct mtx *vlp;
4024 struct namecache *ncp;
4025 int isdir;
4026 bool logged, self;
4027
4028 isdir = vp->v_type == VDIR ? IN_ISDIR : 0;
4029 self = (vn_irflag_read(vp) & VIRF_INOTIFY) != 0 &&
4030 (vp->v_type != VDIR || (event & ~_IN_DIR_EVENTS) != 0);
4031
4032 if (self) {
4033 int selfevent;
4034
4035 if (event == _IN_ATTRIB_LINKCOUNT)
4036 selfevent = IN_ATTRIB;
4037 else
4038 selfevent = event;
4039 inotify_log(vp, NULL, 0, selfevent | isdir, cookie);
4040 }
4041 if ((event & IN_ALL_EVENTS) == 0)
4042 return;
4043
4044 logged = false;
4045 vlp = VP2VNODELOCK(vp);
4046 mtx_lock(vlp);
4047 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) {
4048 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
4049 continue;
4050 if ((vn_irflag_read(ncp->nc_dvp) & VIRF_INOTIFY) != 0) {
4051 /*
4052 * XXX-MJ if the vnode has two links in the same
4053 * dir, we'll log the same event twice.
4054 */
4055 inotify_log(ncp->nc_dvp, ncp->nc_name, ncp->nc_nlen,
4056 event | isdir, cookie);
4057 logged = true;
4058 }
4059 }
4060 if (!logged && (vn_irflag_read(vp) & VIRF_INOTIFY_PARENT) != 0) {
4061 /*
4062 * We didn't find a watched directory that contains this vnode,
4063 * so stop calling VOP_INOTIFY for operations on the vnode.
4064 */
4065 vn_irflag_unset(vp, VIRF_INOTIFY_PARENT);
4066 }
4067 mtx_unlock(vlp);
4068 }
4069
4070 #ifdef DDB
4071 static void
4072 db_print_vpath(struct vnode *vp)
4073 {
4074
4075 while (vp != NULL) {
4076 db_printf("%p: ", vp);
4077 if (vp == rootvnode) {
4078 db_printf("/");
4079 vp = NULL;
4080 } else {
4081 if (vp->v_vflag & VV_ROOT) {
4082 db_printf("<mount point>");
4083 vp = vp->v_mount->mnt_vnodecovered;
4084 } else {
4085 struct namecache *ncp;
4086 char *ncn;
4087 int i;
4088
4089 ncp = TAILQ_FIRST(&vp->v_cache_dst);
4090 if (ncp != NULL) {
4091 ncn = ncp->nc_name;
4092 for (i = 0; i < ncp->nc_nlen; i++)
4093 db_printf("%c", *ncn++);
4094 vp = ncp->nc_dvp;
4095 } else {
4096 vp = NULL;
4097 }
4098 }
4099 }
4100 db_printf("\n");
4101 }
4102
4103 return;
4104 }
4105
4106 DB_SHOW_COMMAND(vpath, db_show_vpath)
4107 {
4108 struct vnode *vp;
4109
4110 if (!have_addr) {
4111 db_printf("usage: show vpath <struct vnode *>\n");
4112 return;
4113 }
4114
4115 vp = (struct vnode *)addr;
4116 db_print_vpath(vp);
4117 }
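/*
 * Usage note (illustrative): from the in-kernel debugger the command above
 * prints the cached reverse path of a vnode, e.g.:
 *
 *	db> show vpath 0xfffff80012345678
 *
 * The argument is a struct vnode pointer; the address shown here is made up.
 */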
4118
4119 #endif
4120
4121 static int cache_fast_lookup = 1;
4122
4123 #define CACHE_FPL_FAILED -2020
4124
4125 static int
4126 cache_vop_bad_vexec(struct vop_fplookup_vexec_args *v)
4127 {
4128 vn_printf(v->a_vp, "no proper vop_fplookup_vexec\n");
4129 panic("no proper vop_fplookup_vexec");
4130 }
4131
4132 static int
4133 cache_vop_bad_symlink(struct vop_fplookup_symlink_args *v)
4134 {
4135 vn_printf(v->a_vp, "no proper vop_fplookup_symlink\n");
4136 panic("no proper vop_fplookup_symlink");
4137 }
4138
4139 void
4140 cache_vop_vector_register(struct vop_vector *v)
4141 {
4142 size_t ops;
4143
4144 ops = 0;
4145 if (v->vop_fplookup_vexec != NULL) {
4146 ops++;
4147 }
4148 if (v->vop_fplookup_symlink != NULL) {
4149 ops++;
4150 }
4151
4152 if (ops == 2) {
4153 return;
4154 }
4155
4156 if (ops == 0) {
4157 v->vop_fplookup_vexec = cache_vop_bad_vexec;
4158 v->vop_fplookup_symlink = cache_vop_bad_symlink;
4159 return;
4160 }
4161
4162 printf("%s: invalid vop vector %p -- either all or none fplookup vops "
4163 "need to be provided", __func__, v);
4164 if (v->vop_fplookup_vexec == NULL) {
4165 printf("%s: missing vop_fplookup_vexec\n", __func__);
4166 }
4167 if (v->vop_fplookup_symlink == NULL) {
4168 printf("%s: missing vop_fplookup_symlink\n", __func__);
4169 }
4170 panic("bad vop vector %p", v);
4171 }
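/*
 * Illustrative sketch (not compiled): a filesystem opting into lockless
 * lookup must provide both fplookup vops in its vector, matching the
 * all-or-none rule enforced above.  The "examplefs_*" names are hypothetical.
 *
 *	struct vop_vector examplefs_vnodeops = {
 *		.vop_default		= &default_vnodeops,
 *		.vop_lookup		= examplefs_lookup,
 *		.vop_fplookup_vexec	= examplefs_fplookup_vexec,
 *		.vop_fplookup_symlink	= examplefs_fplookup_symlink,
 *		...
 *	};
 *	VFS_VOP_VECTOR_REGISTER(examplefs_vnodeops);
 */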
4172
4173 #ifdef INVARIANTS
4174 void
4175 cache_validate_vop_vector(struct mount *mp, struct vop_vector *vops)
4176 {
4177 if (mp == NULL)
4178 return;
4179
4180 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
4181 return;
4182
4183 if (vops->vop_fplookup_vexec == NULL ||
4184 vops->vop_fplookup_vexec == cache_vop_bad_vexec)
4185 panic("bad vop_fplookup_vexec on vector %p for filesystem %s",
4186 vops, mp->mnt_vfc->vfc_name);
4187
4188 if (vops->vop_fplookup_symlink == NULL ||
4189 vops->vop_fplookup_symlink == cache_vop_bad_symlink)
4190 panic("bad vop_fplookup_symlink on vector %p for filesystem %s",
4191 vops, mp->mnt_vfc->vfc_name);
4192 }
4193 #endif
4194
4195 void
4196 cache_fast_lookup_enabled_recalc(void)
4197 {
4198 int lookup_flag;
4199 int mac_on;
4200
4201 #ifdef MAC
4202 mac_on = mac_vnode_check_lookup_enabled();
4203 mac_on |= mac_vnode_check_readlink_enabled();
4204 #else
4205 mac_on = 0;
4206 #endif
4207
4208 lookup_flag = atomic_load_int(&cache_fast_lookup);
4209 if (lookup_flag && !mac_on) {
4210 atomic_store_char(&cache_fast_lookup_enabled, true);
4211 } else {
4212 atomic_store_char(&cache_fast_lookup_enabled, false);
4213 }
4214 }
4215
4216 static int
4217 syscal_vfs_cache_fast_lookup(SYSCTL_HANDLER_ARGS)
4218 {
4219 int error, old;
4220
4221 old = atomic_load_int(&cache_fast_lookup);
4222 error = sysctl_handle_int(oidp, arg1, arg2, req);
4223 if (error == 0 && req->newptr && old != atomic_load_int(&cache_fast_lookup))
4224 cache_fast_lookup_enabled_recalc();
4225 return (error);
4226 }
4227 SYSCTL_PROC(_vfs_cache_param, OID_AUTO, fast_lookup, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_MPSAFE,
4228 &cache_fast_lookup, 0, syscal_vfs_cache_fast_lookup, "IU", "");
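/*
 * Usage note (illustrative): the knob above can be flipped from userspace,
 * e.g.:
 *
 *	sysctl vfs.cache.param.fast_lookup=0
 *
 * assuming the parent sysctl node declared earlier in this file resolves to
 * "vfs.cache.param".  cache_fast_lookup_enabled_recalc() then recomputes
 * whether the lockless path is actually used (MAC hooks may keep it off).
 */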
4229
4230 /*
4231 * Components of nameidata (or objects it can point to) which may
4232 * need restoring in case fast path lookup fails.
4233 */
4234 struct nameidata_outer {
4235 size_t ni_pathlen;
4236 uint64_t cn_flags;
4237 };
4238
4239 struct nameidata_saved {
4240 #ifdef INVARIANTS
4241 char *cn_nameptr;
4242 size_t ni_pathlen;
4243 #endif
4244 };
4245
4246 #ifdef INVARIANTS
4247 struct cache_fpl_debug {
4248 size_t ni_pathlen;
4249 };
4250 #endif
4251
4252 struct cache_fpl {
4253 struct nameidata *ndp;
4254 struct componentname *cnp;
4255 char *nulchar;
4256 struct vnode *dvp;
4257 struct vnode *tvp;
4258 seqc_t dvp_seqc;
4259 seqc_t tvp_seqc;
4260 uint32_t hash;
4261 struct nameidata_saved snd;
4262 struct nameidata_outer snd_outer;
4263 int line;
4264 enum cache_fpl_status status:8;
4265 bool in_smr;
4266 bool fsearch;
4267 struct pwd **pwd;
4268 #ifdef INVARIANTS
4269 struct cache_fpl_debug debug;
4270 #endif
4271 };
4272
4273 static bool cache_fplookup_mp_supported(struct mount *mp);
4274 static bool cache_fplookup_is_mp(struct cache_fpl *fpl);
4275 static int cache_fplookup_cross_mount(struct cache_fpl *fpl);
4276 static int cache_fplookup_partial_setup(struct cache_fpl *fpl);
4277 static int cache_fplookup_skip_slashes(struct cache_fpl *fpl);
4278 static int cache_fplookup_trailingslash(struct cache_fpl *fpl);
4279 static void cache_fpl_pathlen_dec(struct cache_fpl *fpl);
4280 static void cache_fpl_pathlen_inc(struct cache_fpl *fpl);
4281 static void cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n);
4282 static void cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n);
4283
4284 static void
4285 cache_fpl_cleanup_cnp(struct componentname *cnp)
4286 {
4287
4288 uma_zfree(namei_zone, cnp->cn_pnbuf);
4289 cnp->cn_pnbuf = NULL;
4290 cnp->cn_nameptr = NULL;
4291 }
4292
4293 static struct vnode *
4294 cache_fpl_handle_root(struct cache_fpl *fpl)
4295 {
4296 struct nameidata *ndp;
4297 struct componentname *cnp;
4298
4299 ndp = fpl->ndp;
4300 cnp = fpl->cnp;
4301
4302 MPASS(*(cnp->cn_nameptr) == '/');
4303 cnp->cn_nameptr++;
4304 cache_fpl_pathlen_dec(fpl);
4305
4306 if (__predict_false(*(cnp->cn_nameptr) == '/')) {
4307 do {
4308 cnp->cn_nameptr++;
4309 cache_fpl_pathlen_dec(fpl);
4310 } while (*(cnp->cn_nameptr) == '/');
4311 }
4312
4313 return (ndp->ni_rootdir);
4314 }
4315
4316 static void
4317 cache_fpl_checkpoint_outer(struct cache_fpl *fpl)
4318 {
4319
4320 fpl->snd_outer.ni_pathlen = fpl->ndp->ni_pathlen;
4321 fpl->snd_outer.cn_flags = fpl->ndp->ni_cnd.cn_flags;
4322 }
4323
4324 static void
4325 cache_fpl_checkpoint(struct cache_fpl *fpl)
4326 {
4327
4328 #ifdef INVARIANTS
4329 fpl->snd.cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
4330 fpl->snd.ni_pathlen = fpl->debug.ni_pathlen;
4331 #endif
4332 }
4333
4334 static void
4335 cache_fpl_restore_partial(struct cache_fpl *fpl)
4336 {
4337
4338 fpl->ndp->ni_cnd.cn_flags = fpl->snd_outer.cn_flags;
4339 #ifdef INVARIANTS
4340 fpl->debug.ni_pathlen = fpl->snd.ni_pathlen;
4341 #endif
4342 }
4343
4344 static void
4345 cache_fpl_restore_abort(struct cache_fpl *fpl)
4346 {
4347
4348 cache_fpl_restore_partial(fpl);
4349 /*
4350 * It is 0 on entry by API contract.
4351 */
4352 fpl->ndp->ni_resflags = 0;
4353 fpl->ndp->ni_cnd.cn_nameptr = fpl->ndp->ni_cnd.cn_pnbuf;
4354 fpl->ndp->ni_pathlen = fpl->snd_outer.ni_pathlen;
4355 }
4356
4357 #ifdef INVARIANTS
4358 #define cache_fpl_smr_assert_entered(fpl) ({ \
4359 struct cache_fpl *_fpl = (fpl); \
4360 MPASS(_fpl->in_smr == true); \
4361 VFS_SMR_ASSERT_ENTERED(); \
4362 })
4363 #define cache_fpl_smr_assert_not_entered(fpl) ({ \
4364 struct cache_fpl *_fpl = (fpl); \
4365 MPASS(_fpl->in_smr == false); \
4366 VFS_SMR_ASSERT_NOT_ENTERED(); \
4367 })
4368 static void
4369 cache_fpl_assert_status(struct cache_fpl *fpl)
4370 {
4371
4372 switch (fpl->status) {
4373 case CACHE_FPL_STATUS_UNSET:
4374 __assert_unreachable();
4375 break;
4376 case CACHE_FPL_STATUS_DESTROYED:
4377 case CACHE_FPL_STATUS_ABORTED:
4378 case CACHE_FPL_STATUS_PARTIAL:
4379 case CACHE_FPL_STATUS_HANDLED:
4380 break;
4381 }
4382 }
4383 #else
4384 #define cache_fpl_smr_assert_entered(fpl) do { } while (0)
4385 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
4386 #define cache_fpl_assert_status(fpl) do { } while (0)
4387 #endif
4388
4389 #define cache_fpl_smr_enter_initial(fpl) ({ \
4390 struct cache_fpl *_fpl = (fpl); \
4391 vfs_smr_enter(); \
4392 _fpl->in_smr = true; \
4393 })
4394
4395 #define cache_fpl_smr_enter(fpl) ({ \
4396 struct cache_fpl *_fpl = (fpl); \
4397 MPASS(_fpl->in_smr == false); \
4398 vfs_smr_enter(); \
4399 _fpl->in_smr = true; \
4400 })
4401
4402 #define cache_fpl_smr_exit(fpl) ({ \
4403 struct cache_fpl *_fpl = (fpl); \
4404 MPASS(_fpl->in_smr == true); \
4405 vfs_smr_exit(); \
4406 _fpl->in_smr = false; \
4407 })
4408
4409 static int
4410 cache_fpl_aborted_early_impl(struct cache_fpl *fpl, int line)
4411 {
4412
4413 if (fpl->status != CACHE_FPL_STATUS_UNSET) {
4414 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
4415 ("%s: converting to abort from %d at %d, set at %d\n",
4416 __func__, fpl->status, line, fpl->line));
4417 }
4418 cache_fpl_smr_assert_not_entered(fpl);
4419 fpl->status = CACHE_FPL_STATUS_ABORTED;
4420 fpl->line = line;
4421 return (CACHE_FPL_FAILED);
4422 }
4423
4424 #define cache_fpl_aborted_early(x) cache_fpl_aborted_early_impl((x), __LINE__)
4425
4426 static int __noinline
4427 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
4428 {
4429 struct nameidata *ndp;
4430 struct componentname *cnp;
4431
4432 ndp = fpl->ndp;
4433 cnp = fpl->cnp;
4434
4435 if (fpl->status != CACHE_FPL_STATUS_UNSET) {
4436 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
4437 ("%s: converting to abort from %d at %d, set at %d\n",
4438 __func__, fpl->status, line, fpl->line));
4439 }
4440 fpl->status = CACHE_FPL_STATUS_ABORTED;
4441 fpl->line = line;
4442 if (fpl->in_smr)
4443 cache_fpl_smr_exit(fpl);
4444 cache_fpl_restore_abort(fpl);
4445 /*
4446 * Resolving symlinks overwrites data passed by the caller.
4447 * Let namei know.
4448 */
4449 if (ndp->ni_loopcnt > 0) {
4450 fpl->status = CACHE_FPL_STATUS_DESTROYED;
4451 cache_fpl_cleanup_cnp(cnp);
4452 }
4453 return (CACHE_FPL_FAILED);
4454 }
4455
4456 #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__)
4457
4458 static int __noinline
4459 cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
4460 {
4461
4462 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
4463 ("%s: setting to partial at %d, but already set to %d at %d\n",
4464 __func__, line, fpl->status, fpl->line));
4465 cache_fpl_smr_assert_entered(fpl);
4466 fpl->status = CACHE_FPL_STATUS_PARTIAL;
4467 fpl->line = line;
4468 return (cache_fplookup_partial_setup(fpl));
4469 }
4470
4471 #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__)
4472
4473 static int
4474 cache_fpl_handled_impl(struct cache_fpl *fpl, int line)
4475 {
4476
4477 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
4478 ("%s: setting to handled at %d, but already set to %d at %d\n",
4479 __func__, line, fpl->status, fpl->line));
4480 cache_fpl_smr_assert_not_entered(fpl);
4481 fpl->status = CACHE_FPL_STATUS_HANDLED;
4482 fpl->line = line;
4483 return (0);
4484 }
4485
4486 #define cache_fpl_handled(x) cache_fpl_handled_impl((x), __LINE__)
4487
4488 static int
4489 cache_fpl_handled_error_impl(struct cache_fpl *fpl, int error, int line)
4490 {
4491
4492 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
4493 ("%s: setting to handled at %d, but already set to %d at %d\n",
4494 __func__, line, fpl->status, fpl->line));
4495 MPASS(error != 0);
4496 MPASS(error != CACHE_FPL_FAILED);
4497 cache_fpl_smr_assert_not_entered(fpl);
4498 fpl->status = CACHE_FPL_STATUS_HANDLED;
4499 fpl->line = line;
4500 fpl->dvp = NULL;
4501 fpl->tvp = NULL;
4502 return (error);
4503 }
4504
4505 #define cache_fpl_handled_error(x, e) cache_fpl_handled_error_impl((x), (e), __LINE__)
4506
4507 static bool
4508 cache_fpl_terminated(struct cache_fpl *fpl)
4509 {
4510
4511 return (fpl->status != CACHE_FPL_STATUS_UNSET);
4512 }
4513
4514 #define CACHE_FPL_SUPPORTED_CN_FLAGS \
4515 (NC_NOMAKEENTRY | NC_KEEPPOSENTRY | LOCKLEAF | LOCKPARENT | WANTPARENT | \
4516 FAILIFEXISTS | FOLLOW | EMPTYPATH | LOCKSHARED | ISRESTARTED | WILLBEDIR | \
4517 ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK | OPENREAD | \
4518 OPENWRITE | WANTIOCTLCAPS | NAMEILOOKUP)
4519
4520 #define CACHE_FPL_INTERNAL_CN_FLAGS \
4521 (ISDOTDOT | MAKEENTRY | ISLASTCN)
4522
4523 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
4524 "supported and internal flags overlap");
4525
4526 static bool
4527 cache_fpl_islastcn(struct nameidata *ndp)
4528 {
4529
4530 return (*ndp->ni_next == 0);
4531 }
4532
4533 static bool
4534 cache_fpl_istrailingslash(struct cache_fpl *fpl)
4535 {
4536
4537 MPASS(fpl->nulchar > fpl->cnp->cn_pnbuf);
4538 return (*(fpl->nulchar - 1) == '/');
4539 }
4540
4541 static bool
4542 cache_fpl_isdotdot(struct componentname *cnp)
4543 {
4544
4545 if (cnp->cn_namelen == 2 &&
4546 cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
4547 return (true);
4548 return (false);
4549 }
4550
4551 static bool
4552 cache_can_fplookup(struct cache_fpl *fpl)
4553 {
4554 struct nameidata *ndp;
4555 struct componentname *cnp;
4556 struct thread *td;
4557
4558 ndp = fpl->ndp;
4559 cnp = fpl->cnp;
4560 td = curthread;
4561
4562 if (!atomic_load_char(&cache_fast_lookup_enabled)) {
4563 cache_fpl_aborted_early(fpl);
4564 return (false);
4565 }
4566 if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
4567 cache_fpl_aborted_early(fpl);
4568 return (false);
4569 }
4570 if (IN_CAPABILITY_MODE(td) || CAP_TRACING(td)) {
4571 cache_fpl_aborted_early(fpl);
4572 return (false);
4573 }
4574 if (AUDITING_TD(td)) {
4575 cache_fpl_aborted_early(fpl);
4576 return (false);
4577 }
4578 if (ndp->ni_startdir != NULL) {
4579 cache_fpl_aborted_early(fpl);
4580 return (false);
4581 }
4582 return (true);
4583 }
4584
4585 static int __noinline
4586 cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp)
4587 {
4588 struct nameidata *ndp;
4589 struct componentname *cnp;
4590 int error, flags;
4591
4592 ndp = fpl->ndp;
4593 cnp = fpl->cnp;
4594
4595 error = fgetvp_lookup_smr(ndp, vpp, &flags);
4596 if (__predict_false(error != 0)) {
4597 return (cache_fpl_aborted(fpl));
4598 }
4599 if (__predict_false((flags & O_RESOLVE_BENEATH) != 0)) {
4600 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & RBENEATH) == 0,
4601 "RBENEATH supported by fplookup");
4602 cache_fpl_smr_exit(fpl);
4603 cache_fpl_aborted(fpl);
4604 return (EOPNOTSUPP);
4605 }
4606 fpl->fsearch = (flags & FSEARCH) != 0;
4607 if ((*vpp)->v_type != VDIR) {
4608 if (!((cnp->cn_flags & EMPTYPATH) != 0 && cnp->cn_pnbuf[0] == '\0')) {
4609 cache_fpl_smr_exit(fpl);
4610 return (cache_fpl_handled_error(fpl, ENOTDIR));
4611 }
4612 }
4613 return (0);
4614 }
4615
4616 static int __noinline
4617 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
4618 uint32_t hash)
4619 {
4620 struct componentname *cnp;
4621 struct vnode *dvp;
4622
4623 cnp = fpl->cnp;
4624 dvp = fpl->dvp;
4625
4626 cache_fpl_smr_exit(fpl);
4627 if (cache_neg_promote_cond(dvp, cnp, oncp, hash))
4628 return (cache_fpl_handled_error(fpl, ENOENT));
4629 else
4630 return (cache_fpl_aborted(fpl));
4631 }
4632
4633 /*
4634 * The target vnode is not supported; prepare for the slow path to take over.
4635 */
4636 static int __noinline
4637 cache_fplookup_partial_setup(struct cache_fpl *fpl)
4638 {
4639 struct nameidata *ndp;
4640 struct componentname *cnp;
4641 enum vgetstate dvs;
4642 struct vnode *dvp;
4643 struct pwd *pwd;
4644 seqc_t dvp_seqc;
4645
4646 ndp = fpl->ndp;
4647 cnp = fpl->cnp;
4648 pwd = *(fpl->pwd);
4649 dvp = fpl->dvp;
4650 dvp_seqc = fpl->dvp_seqc;
4651
4652 if (!pwd_hold_smr(pwd)) {
4653 return (cache_fpl_aborted(fpl));
4654 }
4655
4656 /*
4657 * Note that seqc is checked before the vnode is locked, so by
4658 * the time regular lookup gets to it it may have moved.
4659 *
4660 * Ultimately this does not affect correctness, any lookup errors
4661 * are userspace racing with itself. It is guaranteed that any
4662 * path which ultimately gets found could also have been found
4663 * by regular lookup going all the way in absence of concurrent
4664 * modifications.
4665 */
4666 dvs = vget_prep_smr(dvp);
4667 cache_fpl_smr_exit(fpl);
4668 if (__predict_false(dvs == VGET_NONE)) {
4669 pwd_drop(pwd);
4670 return (cache_fpl_aborted(fpl));
4671 }
4672
4673 vget_finish_ref(dvp, dvs);
4674 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4675 vrele(dvp);
4676 pwd_drop(pwd);
4677 return (cache_fpl_aborted(fpl));
4678 }
4679
4680 cache_fpl_restore_partial(fpl);
4681 #ifdef INVARIANTS
4682 if (cnp->cn_nameptr != fpl->snd.cn_nameptr) {
4683 panic("%s: cn_nameptr mismatch (%p != %p) full [%s]\n", __func__,
4684 cnp->cn_nameptr, fpl->snd.cn_nameptr, cnp->cn_pnbuf);
4685 }
4686 #endif
4687
4688 ndp->ni_startdir = dvp;
4689 cnp->cn_flags |= MAKEENTRY;
4690 if (cache_fpl_islastcn(ndp))
4691 cnp->cn_flags |= ISLASTCN;
4692 if (cache_fpl_isdotdot(cnp))
4693 cnp->cn_flags |= ISDOTDOT;
4694
4695 /*
4696 * Skip potential extra slashes parsing did not take care of.
4697 * cache_fplookup_skip_slashes explains the mechanism.
4698 */
4699 if (__predict_false(*(cnp->cn_nameptr) == '/')) {
4700 do {
4701 cnp->cn_nameptr++;
4702 cache_fpl_pathlen_dec(fpl);
4703 } while (*(cnp->cn_nameptr) == '/');
4704 }
4705
4706 ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1;
4707 #ifdef INVARIANTS
4708 if (ndp->ni_pathlen != fpl->debug.ni_pathlen) {
4709 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
4710 __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
4711 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
4712 }
4713 #endif
4714 return (0);
4715 }
4716
4717 static int
4718 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs)
4719 {
4720 struct componentname *cnp;
4721 struct vnode *tvp;
4722 seqc_t tvp_seqc;
4723 int error, lkflags;
4724
4725 cnp = fpl->cnp;
4726 tvp = fpl->tvp;
4727 tvp_seqc = fpl->tvp_seqc;
4728
4729 if ((cnp->cn_flags & LOCKLEAF) != 0) {
4730 lkflags = LK_SHARED;
4731 if ((cnp->cn_flags & LOCKSHARED) == 0)
4732 lkflags = LK_EXCLUSIVE;
4733 error = vget_finish(tvp, lkflags, tvs);
4734 if (__predict_false(error != 0)) {
4735 return (cache_fpl_aborted(fpl));
4736 }
4737 } else {
4738 vget_finish_ref(tvp, tvs);
4739 }
4740
4741 if (!vn_seqc_consistent(tvp, tvp_seqc)) {
4742 if ((cnp->cn_flags & LOCKLEAF) != 0)
4743 vput(tvp);
4744 else
4745 vrele(tvp);
4746 return (cache_fpl_aborted(fpl));
4747 }
4748
4749 return (cache_fpl_handled(fpl));
4750 }
4751
4752 /*
4753 * The caller may want to modify the state of the namecache.
4754 */
4755 static int __noinline
4756 cache_fplookup_final_modifying(struct cache_fpl *fpl)
4757 {
4758 struct nameidata *ndp __diagused;
4759 struct componentname *cnp;
4760 enum vgetstate dvs;
4761 struct vnode *dvp, *tvp;
4762 struct mount *mp;
4763 seqc_t dvp_seqc;
4764 int error;
4765 bool docache;
4766
4767 ndp = fpl->ndp;
4768 cnp = fpl->cnp;
4769 dvp = fpl->dvp;
4770 dvp_seqc = fpl->dvp_seqc;
4771
4772 MPASS(*(cnp->cn_nameptr) != '/');
4773 MPASS(cache_fpl_islastcn(ndp));
4774 if ((cnp->cn_flags & LOCKPARENT) == 0)
4775 MPASS((cnp->cn_flags & WANTPARENT) != 0);
4776 MPASS((cnp->cn_flags & TRAILINGSLASH) == 0);
4777 MPASS(cnp->cn_nameiop == CREATE || cnp->cn_nameiop == DELETE ||
4778 cnp->cn_nameiop == RENAME);
4779 MPASS((cnp->cn_flags & MAKEENTRY) == 0);
4780 MPASS((cnp->cn_flags & ISDOTDOT) == 0);
4781
4782 docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
4783 if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)
4784 docache = false;
4785
4786 /*
4787 * Regular lookup nullifies the slash, which we don't do here.
4788 * Don't take chances with filesystem routines seeing it for
4789 * the last entry.
4790 */
4791 if (cache_fpl_istrailingslash(fpl)) {
4792 return (cache_fpl_partial(fpl));
4793 }
4794
4795 mp = atomic_load_ptr(&dvp->v_mount);
4796 if (__predict_false(mp == NULL)) {
4797 return (cache_fpl_aborted(fpl));
4798 }
4799
4800 if (__predict_false(mp->mnt_flag & MNT_RDONLY)) {
4801 cache_fpl_smr_exit(fpl);
4802 /*
4803 * The original code does not check for CREATE, which
4804 * might be a bug. For now let the old lookup decide.
4805 */
4806 if (cnp->cn_nameiop == CREATE) {
4807 return (cache_fpl_aborted(fpl));
4808 }
4809 return (cache_fpl_handled_error(fpl, EROFS));
4810 }
4811
4812 if (fpl->tvp != NULL && (cnp->cn_flags & FAILIFEXISTS) != 0) {
4813 cache_fpl_smr_exit(fpl);
4814 return (cache_fpl_handled_error(fpl, EEXIST));
4815 }
4816
4817 /*
4818 * Secure access to dvp; check cache_fplookup_partial_setup for
4819 * reasoning.
4820 *
4821 * XXX At least UFS requires its lookup routine to be called for
4822 * the last path component, which leads to some level of complication
4823 * and inefficiency:
4824 * - the target routine always locks the target vnode, but our caller
4825 * may not need it locked
4826 * - some of the VOP machinery asserts that the parent is locked, which
4827 * once more may be not required
4828 *
4829 * TODO: add a flag for filesystems which don't need this.
4830 */
4831 dvs = vget_prep_smr(dvp);
4832 cache_fpl_smr_exit(fpl);
4833 if (__predict_false(dvs == VGET_NONE)) {
4834 return (cache_fpl_aborted(fpl));
4835 }
4836
4837 vget_finish_ref(dvp, dvs);
4838 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4839 vrele(dvp);
4840 return (cache_fpl_aborted(fpl));
4841 }
4842
4843 error = vn_lock(dvp, LK_EXCLUSIVE);
4844 if (__predict_false(error != 0)) {
4845 vrele(dvp);
4846 return (cache_fpl_aborted(fpl));
4847 }
4848
4849 tvp = NULL;
4850 cnp->cn_flags |= ISLASTCN;
4851 if (docache)
4852 cnp->cn_flags |= MAKEENTRY;
4853 if (cache_fpl_isdotdot(cnp))
4854 cnp->cn_flags |= ISDOTDOT;
4855 cnp->cn_lkflags = LK_EXCLUSIVE;
4856 error = VOP_LOOKUP(dvp, &tvp, cnp);
4857 switch (error) {
4858 case EJUSTRETURN:
4859 case 0:
4860 break;
4861 case ENOTDIR:
4862 case ENOENT:
4863 vput(dvp);
4864 return (cache_fpl_handled_error(fpl, error));
4865 default:
4866 vput(dvp);
4867 return (cache_fpl_aborted(fpl));
4868 }
4869
4870 fpl->tvp = tvp;
4871
4872 if (tvp == NULL) {
4873 MPASS(error == EJUSTRETURN);
4874 if ((cnp->cn_flags & LOCKPARENT) == 0) {
4875 VOP_UNLOCK(dvp);
4876 }
4877 return (cache_fpl_handled(fpl));
4878 }
4879
4880 /*
4881 * There are very hairy corner cases concerning various flag combinations
4882 * and locking state. In particular here we only hold one lock instead of
4883 * two.
4884 *
4885 * Skip the complexity as it is of no significance for normal workloads.
4886 */
4887 if (__predict_false(tvp == dvp)) {
4888 vput(dvp);
4889 vrele(tvp);
4890 return (cache_fpl_aborted(fpl));
4891 }
4892
4893 /*
4894 * If the caller wants the symlink itself we are fine, but if they want
4895 * to follow it, regular lookup has to be engaged.
4896 */
4897 if (tvp->v_type == VLNK) {
4898 if ((cnp->cn_flags & FOLLOW) != 0) {
4899 vput(dvp);
4900 vput(tvp);
4901 return (cache_fpl_aborted(fpl));
4902 }
4903 }
4904
4905 /*
4906 * Since we expect this to be the terminal vnode it should almost never
4907 * be a mount point.
4908 */
4909 if (__predict_false(cache_fplookup_is_mp(fpl))) {
4910 vput(dvp);
4911 vput(tvp);
4912 return (cache_fpl_aborted(fpl));
4913 }
4914
4915 if ((cnp->cn_flags & FAILIFEXISTS) != 0) {
4916 vput(dvp);
4917 vput(tvp);
4918 return (cache_fpl_handled_error(fpl, EEXIST));
4919 }
4920
4921 if ((cnp->cn_flags & LOCKLEAF) == 0) {
4922 VOP_UNLOCK(tvp);
4923 }
4924
4925 if ((cnp->cn_flags & LOCKPARENT) == 0) {
4926 VOP_UNLOCK(dvp);
4927 }
4928
4929 return (cache_fpl_handled(fpl));
4930 }
4931
4932 static int __noinline
4933 cache_fplookup_modifying(struct cache_fpl *fpl)
4934 {
4935 struct nameidata *ndp;
4936
4937 ndp = fpl->ndp;
4938
4939 if (!cache_fpl_islastcn(ndp)) {
4940 return (cache_fpl_partial(fpl));
4941 }
4942 return (cache_fplookup_final_modifying(fpl));
4943 }
4944
4945 static int __noinline
4946 cache_fplookup_final_withparent(struct cache_fpl *fpl)
4947 {
4948 struct componentname *cnp;
4949 enum vgetstate dvs, tvs;
4950 struct vnode *dvp, *tvp;
4951 seqc_t dvp_seqc;
4952 int error;
4953
4954 cnp = fpl->cnp;
4955 dvp = fpl->dvp;
4956 dvp_seqc = fpl->dvp_seqc;
4957 tvp = fpl->tvp;
4958
4959 MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0);
4960
4961 /*
4962 * This is less efficient than it can be for simplicity.
4963 */
4964 dvs = vget_prep_smr(dvp);
4965 if (__predict_false(dvs == VGET_NONE)) {
4966 return (cache_fpl_aborted(fpl));
4967 }
4968 tvs = vget_prep_smr(tvp);
4969 if (__predict_false(tvs == VGET_NONE)) {
4970 cache_fpl_smr_exit(fpl);
4971 vget_abort(dvp, dvs);
4972 return (cache_fpl_aborted(fpl));
4973 }
4974
4975 cache_fpl_smr_exit(fpl);
4976
4977 if ((cnp->cn_flags & LOCKPARENT) != 0) {
4978 error = vget_finish(dvp, LK_EXCLUSIVE, dvs);
4979 if (__predict_false(error != 0)) {
4980 vget_abort(tvp, tvs);
4981 return (cache_fpl_aborted(fpl));
4982 }
4983 } else {
4984 vget_finish_ref(dvp, dvs);
4985 }
4986
4987 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4988 vget_abort(tvp, tvs);
4989 if ((cnp->cn_flags & LOCKPARENT) != 0)
4990 vput(dvp);
4991 else
4992 vrele(dvp);
4993 return (cache_fpl_aborted(fpl));
4994 }
4995
4996 error = cache_fplookup_final_child(fpl, tvs);
4997 if (__predict_false(error != 0)) {
4998 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED ||
4999 fpl->status == CACHE_FPL_STATUS_DESTROYED);
5000 if ((cnp->cn_flags & LOCKPARENT) != 0)
5001 vput(dvp);
5002 else
5003 vrele(dvp);
5004 return (error);
5005 }
5006
5007 MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED);
5008 return (0);
5009 }
5010
5011 static int
5012 cache_fplookup_final(struct cache_fpl *fpl)
5013 {
5014 struct componentname *cnp;
5015 enum vgetstate tvs;
5016 struct vnode *dvp, *tvp;
5017 seqc_t dvp_seqc;
5018
5019 cnp = fpl->cnp;
5020 dvp = fpl->dvp;
5021 dvp_seqc = fpl->dvp_seqc;
5022 tvp = fpl->tvp;
5023
5024 MPASS(*(cnp->cn_nameptr) != '/');
5025
5026 if (cnp->cn_nameiop != LOOKUP) {
5027 return (cache_fplookup_final_modifying(fpl));
5028 }
5029
5030 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0)
5031 return (cache_fplookup_final_withparent(fpl));
5032
5033 tvs = vget_prep_smr(tvp);
5034 if (__predict_false(tvs == VGET_NONE)) {
5035 return (cache_fpl_partial(fpl));
5036 }
5037
5038 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
5039 cache_fpl_smr_exit(fpl);
5040 vget_abort(tvp, tvs);
5041 return (cache_fpl_aborted(fpl));
5042 }
5043
5044 cache_fpl_smr_exit(fpl);
5045 return (cache_fplookup_final_child(fpl, tvs));
5046 }
5047
5048 /*
5049 * Comment from locked lookup:
5050 * Check for degenerate name (e.g. / or "") which is a way of talking about a
5051 * directory, e.g. like "/." or ".".
5052 */
5053 static int __noinline
5054 cache_fplookup_degenerate(struct cache_fpl *fpl)
5055 {
5056 struct componentname *cnp;
5057 struct vnode *dvp;
5058 enum vgetstate dvs;
5059 int error, lkflags;
5060 #ifdef INVARIANTS
5061 char *cp;
5062 #endif
5063
5064 fpl->tvp = fpl->dvp;
5065 fpl->tvp_seqc = fpl->dvp_seqc;
5066
5067 cnp = fpl->cnp;
5068 dvp = fpl->dvp;
5069
5070 #ifdef INVARIANTS
5071 for (cp = cnp->cn_pnbuf; *cp != '\0'; cp++) {
5072 KASSERT(*cp == '/',
5073 ("%s: encountered non-slash; string [%s]\n", __func__,
5074 cnp->cn_pnbuf));
5075 }
5076 #endif
5077
5078 if (__predict_false(cnp->cn_nameiop != LOOKUP)) {
5079 cache_fpl_smr_exit(fpl);
5080 return (cache_fpl_handled_error(fpl, EISDIR));
5081 }
5082
5083 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) {
5084 return (cache_fplookup_final_withparent(fpl));
5085 }
5086
5087 dvs = vget_prep_smr(dvp);
5088 cache_fpl_smr_exit(fpl);
5089 if (__predict_false(dvs == VGET_NONE)) {
5090 return (cache_fpl_aborted(fpl));
5091 }
5092
5093 if ((cnp->cn_flags & LOCKLEAF) != 0) {
5094 lkflags = LK_SHARED;
5095 if ((cnp->cn_flags & LOCKSHARED) == 0)
5096 lkflags = LK_EXCLUSIVE;
5097 error = vget_finish(dvp, lkflags, dvs);
5098 if (__predict_false(error != 0)) {
5099 return (cache_fpl_aborted(fpl));
5100 }
5101 } else {
5102 vget_finish_ref(dvp, dvs);
5103 }
5104 return (cache_fpl_handled(fpl));
5105 }
5106
5107 static int __noinline
5108 cache_fplookup_emptypath(struct cache_fpl *fpl)
5109 {
5110 struct nameidata *ndp;
5111 struct componentname *cnp;
5112 enum vgetstate tvs;
5113 struct vnode *tvp;
5114 int error, lkflags;
5115
5116 fpl->tvp = fpl->dvp;
5117 fpl->tvp_seqc = fpl->dvp_seqc;
5118
5119 ndp = fpl->ndp;
5120 cnp = fpl->cnp;
5121 tvp = fpl->tvp;
5122
5123 MPASS(*cnp->cn_pnbuf == '\0');
5124
5125 if (__predict_false((cnp->cn_flags & EMPTYPATH) == 0)) {
5126 cache_fpl_smr_exit(fpl);
5127 return (cache_fpl_handled_error(fpl, ENOENT));
5128 }
5129
5130 MPASS((cnp->cn_flags & (LOCKPARENT | WANTPARENT)) == 0);
5131
5132 tvs = vget_prep_smr(tvp);
5133 cache_fpl_smr_exit(fpl);
5134 if (__predict_false(tvs == VGET_NONE)) {
5135 return (cache_fpl_aborted(fpl));
5136 }
5137
5138 if ((cnp->cn_flags & LOCKLEAF) != 0) {
5139 lkflags = LK_SHARED;
5140 if ((cnp->cn_flags & LOCKSHARED) == 0)
5141 lkflags = LK_EXCLUSIVE;
5142 error = vget_finish(tvp, lkflags, tvs);
5143 if (__predict_false(error != 0)) {
5144 return (cache_fpl_aborted(fpl));
5145 }
5146 } else {
5147 vget_finish_ref(tvp, tvs);
5148 }
5149
5150 ndp->ni_resflags |= NIRES_EMPTYPATH;
5151 return (cache_fpl_handled(fpl));
5152 }
5153
5154 static int __noinline
5155 cache_fplookup_noentry(struct cache_fpl *fpl)
5156 {
5157 struct nameidata *ndp;
5158 struct componentname *cnp;
5159 enum vgetstate dvs;
5160 struct vnode *dvp, *tvp;
5161 seqc_t dvp_seqc;
5162 int error;
5163
5164 ndp = fpl->ndp;
5165 cnp = fpl->cnp;
5166 dvp = fpl->dvp;
5167 dvp_seqc = fpl->dvp_seqc;
5168
5169 MPASS((cnp->cn_flags & MAKEENTRY) == 0);
5170 MPASS((cnp->cn_flags & ISDOTDOT) == 0);
5171 if (cnp->cn_nameiop == LOOKUP)
5172 MPASS((cnp->cn_flags & NOCACHE) == 0);
5173 MPASS(!cache_fpl_isdotdot(cnp));
5174
5175 /*
5176 * Hack: delayed name len checking.
5177 */
5178 if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
5179 cache_fpl_smr_exit(fpl);
5180 return (cache_fpl_handled_error(fpl, ENAMETOOLONG));
5181 }
5182
5183 if (cnp->cn_nameptr[0] == '/') {
5184 return (cache_fplookup_skip_slashes(fpl));
5185 }
5186
5187 if (cnp->cn_pnbuf[0] == '\0') {
5188 return (cache_fplookup_emptypath(fpl));
5189 }
5190
5191 if (cnp->cn_nameptr[0] == '\0') {
5192 if (fpl->tvp == NULL) {
5193 return (cache_fplookup_degenerate(fpl));
5194 }
5195 return (cache_fplookup_trailingslash(fpl));
5196 }
5197
5198 if (cnp->cn_nameiop != LOOKUP) {
5199 fpl->tvp = NULL;
5200 return (cache_fplookup_modifying(fpl));
5201 }
5202
5203 /*
5204 * Only try to fill in the component if it is the last one,
5205 * otherwise not only may there be several to handle, but the
5206 * walk may be complicated.
5207 */
5208 if (!cache_fpl_islastcn(ndp)) {
5209 return (cache_fpl_partial(fpl));
5210 }
5211
5212 /*
5213 * Regular lookup nullifies the slash, which we don't do here.
5214 * Don't take chances with filesystem routines seeing it for
5215 * the last entry.
5216 */
5217 if (cache_fpl_istrailingslash(fpl)) {
5218 return (cache_fpl_partial(fpl));
5219 }
5220
5221 /*
5222 * Secure access to dvp; check cache_fplookup_partial_setup for
5223 * reasoning.
5224 */
5225 dvs = vget_prep_smr(dvp);
5226 cache_fpl_smr_exit(fpl);
5227 if (__predict_false(dvs == VGET_NONE)) {
5228 return (cache_fpl_aborted(fpl));
5229 }
5230
5231 vget_finish_ref(dvp, dvs);
5232 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
5233 vrele(dvp);
5234 return (cache_fpl_aborted(fpl));
5235 }
5236
5237 error = vn_lock(dvp, LK_SHARED);
5238 if (__predict_false(error != 0)) {
5239 vrele(dvp);
5240 return (cache_fpl_aborted(fpl));
5241 }
5242
5243 tvp = NULL;
5244 /*
5245 * TODO: provide variants which don't require locking either vnode.
5246 */
5247 cnp->cn_flags |= ISLASTCN | MAKEENTRY;
5248 cnp->cn_lkflags = LK_SHARED;
5249 if ((cnp->cn_flags & LOCKSHARED) == 0) {
5250 cnp->cn_lkflags = LK_EXCLUSIVE;
5251 }
5252 error = VOP_LOOKUP(dvp, &tvp, cnp);
5253 switch (error) {
5254 case EJUSTRETURN:
5255 case 0:
5256 break;
5257 case ENOTDIR:
5258 case ENOENT:
5259 vput(dvp);
5260 return (cache_fpl_handled_error(fpl, error));
5261 default:
5262 vput(dvp);
5263 return (cache_fpl_aborted(fpl));
5264 }
5265
5266 fpl->tvp = tvp;
5267
5268 if (tvp == NULL) {
5269 MPASS(error == EJUSTRETURN);
5270 if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) {
5271 vput(dvp);
5272 } else if ((cnp->cn_flags & LOCKPARENT) == 0) {
5273 VOP_UNLOCK(dvp);
5274 }
5275 return (cache_fpl_handled(fpl));
5276 }
5277
5278 if (tvp->v_type == VLNK) {
5279 if ((cnp->cn_flags & FOLLOW) != 0) {
5280 vput(dvp);
5281 vput(tvp);
5282 return (cache_fpl_aborted(fpl));
5283 }
5284 }
5285
5286 if (__predict_false(cache_fplookup_is_mp(fpl))) {
5287 vput(dvp);
5288 vput(tvp);
5289 return (cache_fpl_aborted(fpl));
5290 }
5291
5292 if ((cnp->cn_flags & LOCKLEAF) == 0) {
5293 VOP_UNLOCK(tvp);
5294 }
5295
5296 if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) {
5297 vput(dvp);
5298 } else if ((cnp->cn_flags & LOCKPARENT) == 0) {
5299 VOP_UNLOCK(dvp);
5300 }
5301 return (cache_fpl_handled(fpl));
5302 }
5303
5304 static int __noinline
5305 cache_fplookup_dot(struct cache_fpl *fpl)
5306 {
5307 int error;
5308
5309 MPASS(!seqc_in_modify(fpl->dvp_seqc));
5310
5311 if (__predict_false(fpl->dvp->v_type != VDIR)) {
5312 cache_fpl_smr_exit(fpl);
5313 return (cache_fpl_handled_error(fpl, ENOTDIR));
5314 }
5315
5316 /*
5317 * Just re-assign the value. seqc will be checked later for the first
5318 * non-dot path component in line and/or before deciding to return the
5319 * vnode.
5320 */
5321 fpl->tvp = fpl->dvp;
5322 fpl->tvp_seqc = fpl->dvp_seqc;
5323
5324 SDT_PROBE3(vfs, namecache, lookup, hit, fpl->dvp, ".", fpl->dvp);
5325
5326 error = 0;
5327 if (cache_fplookup_is_mp(fpl)) {
5328 error = cache_fplookup_cross_mount(fpl);
5329 }
5330 return (error);
5331 }
5332
5333 static int __noinline
5334 cache_fplookup_dotdot(struct cache_fpl *fpl)
5335 {
5336 struct nameidata *ndp;
5337 struct namecache *ncp;
5338 struct vnode *dvp;
5339 u_char nc_flag;
5340
5341 ndp = fpl->ndp;
5342 dvp = fpl->dvp;
5343
5344 MPASS(cache_fpl_isdotdot(fpl->cnp));
5345
5346 /*
5347 * XXX this is racy the same way regular lookup is
5348 */
5349 if (vfs_lookup_isroot(ndp, dvp)) {
5350 fpl->tvp = dvp;
5351 fpl->tvp_seqc = vn_seqc_read_any(dvp);
5352 if (seqc_in_modify(fpl->tvp_seqc)) {
5353 return (cache_fpl_aborted(fpl));
5354 }
5355 return (0);
5356 }
5357
5358 if ((dvp->v_vflag & VV_ROOT) != 0) {
5359 /*
5360 * TODO
5361 * The opposite of climb mount is needed here.
5362 */
5363 return (cache_fpl_partial(fpl));
5364 }
5365
5366 if (__predict_false(dvp->v_type != VDIR)) {
5367 cache_fpl_smr_exit(fpl);
5368 return (cache_fpl_handled_error(fpl, ENOTDIR));
5369 }
5370
5371 ncp = atomic_load_consume_ptr(&dvp->v_cache_dd);
5372 if (ncp == NULL) {
5373 return (cache_fpl_aborted(fpl));
5374 }
5375
5376 nc_flag = atomic_load_char(&ncp->nc_flag);
5377 if ((nc_flag & NCF_ISDOTDOT) != 0) {
5378 if ((nc_flag & NCF_NEGATIVE) != 0)
5379 return (cache_fpl_aborted(fpl));
5380 fpl->tvp = ncp->nc_vp;
5381 } else {
5382 fpl->tvp = ncp->nc_dvp;
5383 }
5384
5385 fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp);
5386 if (seqc_in_modify(fpl->tvp_seqc)) {
5387 return (cache_fpl_partial(fpl));
5388 }
5389
5390 /*
5391 * Acquire fence provided by vn_seqc_read_any above.
5392 */
5393 if (__predict_false(atomic_load_ptr(&dvp->v_cache_dd) != ncp)) {
5394 return (cache_fpl_aborted(fpl));
5395 }
5396
5397 if (!cache_ncp_canuse(ncp)) {
5398 return (cache_fpl_aborted(fpl));
5399 }
5400
5401 return (0);
5402 }
5403
5404 static int __noinline
5405 cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash)
5406 {
5407 u_char nc_flag __diagused;
5408 bool neg_promote;
5409
5410 #ifdef INVARIANTS
5411 nc_flag = atomic_load_char(&ncp->nc_flag);
5412 MPASS((nc_flag & NCF_NEGATIVE) != 0);
5413 #endif
5414 /*
5415 * If the caller wants to create an entry we need to replace this one.
5416 */
5417 if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) {
5418 fpl->tvp = NULL;
5419 return (cache_fplookup_modifying(fpl));
5420 }
5421 neg_promote = cache_neg_hit_prep(ncp);
5422 if (!cache_fpl_neg_ncp_canuse(ncp)) {
5423 cache_neg_hit_abort(ncp);
5424 return (cache_fpl_partial(fpl));
5425 }
5426 if (neg_promote) {
5427 return (cache_fplookup_negative_promote(fpl, ncp, hash));
5428 }
5429 cache_neg_hit_finish(ncp);
5430 cache_fpl_smr_exit(fpl);
5431 return (cache_fpl_handled_error(fpl, ENOENT));
5432 }
5433
5434 /*
5435 * Resolve a symlink. Called by filesystem-specific routines.
5436 *
5437 * Code flow is:
5438 * ... -> cache_fplookup_symlink -> VOP_FPLOOKUP_SYMLINK -> cache_symlink_resolve
5439 */
5440 int
5441 cache_symlink_resolve(struct cache_fpl *fpl, const char *string, size_t len)
5442 {
5443 struct nameidata *ndp;
5444 struct componentname *cnp;
5445 size_t adjust;
5446
5447 ndp = fpl->ndp;
5448 cnp = fpl->cnp;
5449
5450 if (__predict_false(len == 0)) {
5451 return (ENOENT);
5452 }
5453
5454 if (__predict_false(len > MAXPATHLEN - 2)) {
5455 if (cache_fpl_istrailingslash(fpl)) {
5456 return (EAGAIN);
5457 }
5458 }
5459
5460 ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr - cnp->cn_namelen + 1;
5461 #ifdef INVARIANTS
5462 if (ndp->ni_pathlen != fpl->debug.ni_pathlen) {
5463 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
5464 __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
5465 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
5466 }
5467 #endif
5468
5469 if (__predict_false(len + ndp->ni_pathlen > MAXPATHLEN)) {
5470 return (ENAMETOOLONG);
5471 }
5472
5473 if (__predict_false(ndp->ni_loopcnt++ >= MAXSYMLINKS)) {
5474 return (ELOOP);
5475 }
5476
5477 adjust = len;
5478 if (ndp->ni_pathlen > 1) {
5479 bcopy(ndp->ni_next, cnp->cn_pnbuf + len, ndp->ni_pathlen);
5480 } else {
5481 if (cache_fpl_istrailingslash(fpl)) {
5482 adjust = len + 1;
5483 cnp->cn_pnbuf[len] = '/';
5484 cnp->cn_pnbuf[len + 1] = '\0';
5485 } else {
5486 cnp->cn_pnbuf[len] = '\0';
5487 }
5488 }
5489 bcopy(string, cnp->cn_pnbuf, len);
5490
5491 ndp->ni_pathlen += adjust;
5492 cache_fpl_pathlen_add(fpl, adjust);
5493 cnp->cn_nameptr = cnp->cn_pnbuf;
5494 fpl->nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1];
5495 fpl->tvp = NULL;
5496 return (0);
5497 }
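/*
 * Illustrative sketch (not compiled): the general shape of a filesystem's
 * VOP_FPLOOKUP_SYMLINK handler calling into cache_symlink_resolve() above.
 * "examplefs_*" and the node/target fields are hypothetical; a real handler
 * must be able to read the target safely under vfs_smr protection.
 *
 *	static int
 *	examplefs_fplookup_symlink(struct vop_fplookup_symlink_args *v)
 *	{
 *		struct examplefs_node *node;
 *		const char *target;
 *
 *		node = EXAMPLEFS_VTON_SMR(v->a_vp);
 *		if (node == NULL)
 *			return (EAGAIN);
 *		target = atomic_load_ptr(&node->n_symlink);
 *		if (target == NULL)
 *			return (EAGAIN);
 *		return (cache_symlink_resolve(v->a_fpl, target, strlen(target)));
 *	}
 */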
5498
5499 static int __noinline
5500 cache_fplookup_symlink(struct cache_fpl *fpl)
5501 {
5502 struct mount *mp;
5503 struct nameidata *ndp;
5504 struct componentname *cnp;
5505 struct vnode *dvp, *tvp;
5506 struct pwd *pwd;
5507 int error;
5508
5509 ndp = fpl->ndp;
5510 cnp = fpl->cnp;
5511 dvp = fpl->dvp;
5512 tvp = fpl->tvp;
5513 pwd = *(fpl->pwd);
5514
5515 if (cache_fpl_islastcn(ndp)) {
5516 if ((cnp->cn_flags & FOLLOW) == 0) {
5517 return (cache_fplookup_final(fpl));
5518 }
5519 }
5520
5521 mp = atomic_load_ptr(&dvp->v_mount);
5522 if (__predict_false(mp == NULL)) {
5523 return (cache_fpl_aborted(fpl));
5524 }
5525
5526 /*
5527 * Note that this check races against setting the flag, just like regular
5528 * lookup.
5529 */
5530 if (__predict_false((mp->mnt_flag & MNT_NOSYMFOLLOW) != 0)) {
5531 cache_fpl_smr_exit(fpl);
5532 return (cache_fpl_handled_error(fpl, EACCES));
5533 }
5534
5535 error = VOP_FPLOOKUP_SYMLINK(tvp, fpl);
5536 if (__predict_false(error != 0)) {
5537 switch (error) {
5538 case EAGAIN:
5539 return (cache_fpl_partial(fpl));
5540 case ENOENT:
5541 case ENAMETOOLONG:
5542 case ELOOP:
5543 cache_fpl_smr_exit(fpl);
5544 return (cache_fpl_handled_error(fpl, error));
5545 default:
5546 return (cache_fpl_aborted(fpl));
5547 }
5548 }
5549
5550 if (*(cnp->cn_nameptr) == '/') {
5551 fpl->dvp = cache_fpl_handle_root(fpl);
5552 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
5553 if (seqc_in_modify(fpl->dvp_seqc)) {
5554 return (cache_fpl_aborted(fpl));
5555 }
5556 /*
5557 * The main loop assumes that ->dvp points to a vnode belonging
5558 * to a filesystem which can do lockless lookup, but the absolute
5559 * symlink may wander off to one which does not.
5560 */
5561 mp = atomic_load_ptr(&fpl->dvp->v_mount);
5562 if (__predict_false(mp == NULL)) {
5563 return (cache_fpl_aborted(fpl));
5564 }
5565 if (!cache_fplookup_mp_supported(mp)) {
5566 cache_fpl_checkpoint(fpl);
5567 return (cache_fpl_partial(fpl));
5568 }
5569 if (__predict_false(pwd->pwd_adir != pwd->pwd_rdir)) {
5570 return (cache_fpl_aborted(fpl));
5571 }
5572 }
5573 return (0);
5574 }
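
/*
 * For illustration, a minimal sketch of the filesystem side of the call
 * chain above, for a hypothetical examplefs which keeps short symlink
 * targets inline in its in-memory node (all examplefs names are invented;
 * this is not a real VOP implementation):
 *
 *	static int
 *	examplefs_fplookup_symlink(struct vop_fplookup_symlink_args *ap)
 *	{
 *		struct examplefs_node *node;
 *
 *		// ->v_data may be concurrently zeroed, see the caveats
 *		// before cache_fplookup(); the node itself is assumed to
 *		// be freed via vfs_smr, making the access below safe.
 *		node = atomic_load_ptr(&ap->a_vp->v_data);
 *		if (__predict_false(node == NULL || !node->target_inline))
 *			return (EAGAIN);
 *		return (cache_symlink_resolve(ap->a_fpl, node->target,
 *		    node->targetlen));
 *	}
 *
 * Returning EAGAIN makes the routine above punt with cache_fpl_partial(),
 * i.e. the regular locked lookup takes over.
 */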
5575
5576 static int
5577 cache_fplookup_next(struct cache_fpl *fpl)
5578 {
5579 struct componentname *cnp;
5580 struct namecache *ncp;
5581 struct vnode *dvp, *tvp;
5582 u_char nc_flag;
5583 uint32_t hash;
5584 int error;
5585
5586 cnp = fpl->cnp;
5587 dvp = fpl->dvp;
5588 hash = fpl->hash;
5589
5590 if (__predict_false(cnp->cn_nameptr[0] == '.')) {
5591 if (cnp->cn_namelen == 1) {
5592 return (cache_fplookup_dot(fpl));
5593 }
5594 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
5595 return (cache_fplookup_dotdot(fpl));
5596 }
5597 }
5598
5599 MPASS(!cache_fpl_isdotdot(cnp));
5600
5601 ncp = cache_ncp_find(dvp, cnp, hash);
5602 if (__predict_false(ncp == NULL)) {
5603 return (cache_fplookup_noentry(fpl));
5604 }
5605
5606 tvp = atomic_load_ptr(&ncp->nc_vp);
5607 nc_flag = atomic_load_char(&ncp->nc_flag);
5608 if ((nc_flag & NCF_NEGATIVE) != 0) {
5609 return (cache_fplookup_neg(fpl, ncp, hash));
5610 }
5611
5612 if (!cache_ncp_canuse(ncp)) {
5613 return (cache_fpl_partial(fpl));
5614 }
5615
5616 fpl->tvp = tvp;
5617 fpl->tvp_seqc = vn_seqc_read_any(tvp);
5618 if (seqc_in_modify(fpl->tvp_seqc)) {
5619 return (cache_fpl_partial(fpl));
5620 }
5621
5622 counter_u64_add(numposhits, 1);
5623 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
5624
5625 error = 0;
5626 if (cache_fplookup_is_mp(fpl)) {
5627 error = cache_fplookup_cross_mount(fpl);
5628 }
5629 return (error);
5630 }
5631
5632 static bool
5633 cache_fplookup_mp_supported(struct mount *mp)
5634 {
5635
5636 MPASS(mp != NULL);
5637 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
5638 return (false);
5639 return (true);
5640 }
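
/*
 * Lockless lookup is strictly opt-in and the check above only tests for
 * the MNTK_FPLOOKUP flag. A sketch of how a filesystem might set it at
 * mount time, assuming it also satisfies the VOP_FPLOOKUP_* contract
 * described before cache_fplookup() (mnt_kern_flag is modified under the
 * mount interlock):
 *
 *	MNT_ILOCK(mp);
 *	mp->mnt_kern_flag |= MNTK_FPLOOKUP;
 *	MNT_IUNLOCK(mp);
 */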
5641
5642 /*
5643 * Walk up the mount stack (if any).
5644 *
5645 * Correctness is provided in the following ways:
5646 * - all vnodes are protected from freeing with SMR
5647 * - struct mount objects are type stable making them always safe to access
5648 * - stability of the particular mount is provided by busying it
5649 * - relationship between the vnode which is mounted on and the mount is
5650 * verified with the vnode sequence counter after busying
5651 * - association between root vnode of the mount and the mount is protected
5652 * by busy
5653 *
5654 * From that point on we can read the sequence counter of the root vnode
5655 * and get the next mount on the stack (if any) using the same protection.
5656 *
5657 * By the end of a successful walk we are guaranteed the reached state was
5658 * indeed present at some point, which matches the regular lookup.
5659 */
5660 static int __noinline
5661 cache_fplookup_climb_mount(struct cache_fpl *fpl)
5662 {
5663 struct mount *mp, *prev_mp;
5664 struct mount_pcpu *mpcpu, *prev_mpcpu;
5665 struct vnode *vp;
5666 seqc_t vp_seqc;
5667
5668 vp = fpl->tvp;
5669 vp_seqc = fpl->tvp_seqc;
5670
5671 VNPASS(vp->v_type == VDIR || vp->v_type == VREG || vp->v_type == VBAD, vp);
5672 mp = atomic_load_ptr(&vp->v_mountedhere);
5673 if (__predict_false(mp == NULL)) {
5674 return (0);
5675 }
5676
5677 prev_mp = NULL;
5678 for (;;) {
5679 if (!vfs_op_thread_enter_crit(mp, mpcpu)) {
5680 if (prev_mp != NULL)
5681 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
5682 return (cache_fpl_partial(fpl));
5683 }
5684 if (prev_mp != NULL)
5685 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
5686 if (!vn_seqc_consistent(vp, vp_seqc)) {
5687 vfs_op_thread_exit_crit(mp, mpcpu);
5688 return (cache_fpl_partial(fpl));
5689 }
5690 if (!cache_fplookup_mp_supported(mp)) {
5691 vfs_op_thread_exit_crit(mp, mpcpu);
5692 return (cache_fpl_partial(fpl));
5693 }
5694 vp = atomic_load_ptr(&mp->mnt_rootvnode);
5695 if (vp == NULL) {
5696 vfs_op_thread_exit_crit(mp, mpcpu);
5697 return (cache_fpl_partial(fpl));
5698 }
5699 vp_seqc = vn_seqc_read_any(vp);
5700 if (seqc_in_modify(vp_seqc)) {
5701 vfs_op_thread_exit_crit(mp, mpcpu);
5702 return (cache_fpl_partial(fpl));
5703 }
5704 prev_mp = mp;
5705 prev_mpcpu = mpcpu;
5706 mp = atomic_load_ptr(&vp->v_mountedhere);
5707 if (mp == NULL)
5708 break;
5709 }
5710
5711 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
5712 fpl->tvp = vp;
5713 fpl->tvp_seqc = vp_seqc;
5714 return (0);
5715 }
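
/*
 * An illustrative trace of the loop above for a stack of two mounts
 * (the layout is invented for the example): mount A covers the vnode we
 * arrived at and mount B covers A's root vnode.
 *
 *	mp = A (read from vp->v_mountedhere)
 *	busy A; re-check the covered vnode's seqc -> A was really there
 *	vp = A->mnt_rootvnode, read its seqc
 *	mp = vp->v_mountedhere == B, prev_mp = A
 *	busy B, unbusy A; re-check A's root vnode seqc
 *	vp = B->mnt_rootvnode, read its seqc
 *	vp->v_mountedhere == NULL -> break; unbusy B
 *
 * Any failure along the way (busying refused, inconsistent seqc, missing
 * root vnode, fs without MNTK_FPLOOKUP) downgrades to cache_fpl_partial(),
 * i.e. the locked lookup finishes the job.
 */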
5716
5717 static int __noinline
5718 cache_fplookup_cross_mount(struct cache_fpl *fpl)
5719 {
5720 struct mount *mp;
5721 struct mount_pcpu *mpcpu;
5722 struct vnode *vp;
5723 seqc_t vp_seqc;
5724
5725 vp = fpl->tvp;
5726 vp_seqc = fpl->tvp_seqc;
5727
5728 VNPASS(vp->v_type == VDIR || vp->v_type == VREG || vp->v_type == VBAD, vp);
5729 mp = atomic_load_ptr(&vp->v_mountedhere);
5730 if (__predict_false(mp == NULL)) {
5731 return (0);
5732 }
5733
5734 if (!vfs_op_thread_enter_crit(mp, mpcpu)) {
5735 return (cache_fpl_partial(fpl));
5736 }
5737 if (!vn_seqc_consistent(vp, vp_seqc)) {
5738 vfs_op_thread_exit_crit(mp, mpcpu);
5739 return (cache_fpl_partial(fpl));
5740 }
5741 if (!cache_fplookup_mp_supported(mp)) {
5742 vfs_op_thread_exit_crit(mp, mpcpu);
5743 return (cache_fpl_partial(fpl));
5744 }
5745 vp = atomic_load_ptr(&mp->mnt_rootvnode);
5746 if (__predict_false(vp == NULL)) {
5747 vfs_op_thread_exit_crit(mp, mpcpu);
5748 return (cache_fpl_partial(fpl));
5749 }
5750 vp_seqc = vn_seqc_read_any(vp);
5751 vfs_op_thread_exit_crit(mp, mpcpu);
5752 if (seqc_in_modify(vp_seqc)) {
5753 return (cache_fpl_partial(fpl));
5754 }
5755 mp = atomic_load_ptr(&vp->v_mountedhere);
5756 if (__predict_false(mp != NULL)) {
5757 /*
5758 * There are possibly more mount points on top.
5759 * Normally this does not happen so for simplicity just start
5760 * over.
5761 */
5762 return (cache_fplookup_climb_mount(fpl));
5763 }
5764
5765 fpl->tvp = vp;
5766 fpl->tvp_seqc = vp_seqc;
5767 return (0);
5768 }
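
/*
 * Concretely (the path is invented for the example): with a filesystem
 * supporting lockless lookup mounted on "/mnt", a lookup of "/mnt/foo"
 * finds the covered vnode for "mnt" in the cache. cache_fplookup_next()
 * sees VIRF_MOUNTPOINT on it and calls the routine above, which swaps
 * fpl->tvp for the root vnode of the mounted filesystem (along with its
 * sequence counter) before the walk continues with "foo". Stacked mounts
 * are rare enough that they are simply handed off to
 * cache_fplookup_climb_mount().
 */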
5769
5770 /*
5771 * Check if a vnode is mounted on.
5772 */
5773 static bool
5774 cache_fplookup_is_mp(struct cache_fpl *fpl)
5775 {
5776 struct vnode *vp;
5777
5778 vp = fpl->tvp;
5779 return ((vn_irflag_read(vp) & VIRF_MOUNTPOINT) != 0);
5780 }
5781
5782 /*
5783 * Parse the path.
5784 *
5785 * The code was originally copy-pasted from regular lookup and despite
5786 * clean-ups still leaves performance on the table. Any modifications here
5787 * must take into account that in case of fallback the resulting
5788 * nameidata state has to be compatible with the original.
5789 */
5790
5791 /*
5792 * Debug ni_pathlen tracking.
5793 */
5794 #ifdef INVARIANTS
5795 static void
5796 cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n)
5797 {
5798
5799 fpl->debug.ni_pathlen += n;
5800 KASSERT(fpl->debug.ni_pathlen <= PATH_MAX,
5801 ("%s: pathlen overflow to %zd\n", __func__, fpl->debug.ni_pathlen));
5802 }
5803
5804 static void
5805 cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n)
5806 {
5807
5808 fpl->debug.ni_pathlen -= n;
5809 KASSERT(fpl->debug.ni_pathlen <= PATH_MAX,
5810 ("%s: pathlen underflow to %zd\n", __func__, fpl->debug.ni_pathlen));
5811 }
5812
5813 static void
5814 cache_fpl_pathlen_inc(struct cache_fpl *fpl)
5815 {
5816
5817 cache_fpl_pathlen_add(fpl, 1);
5818 }
5819
5820 static void
5821 cache_fpl_pathlen_dec(struct cache_fpl *fpl)
5822 {
5823
5824 cache_fpl_pathlen_sub(fpl, 1);
5825 }
5826 #else
5827 static void
5828 cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n)
5829 {
5830 }
5831
5832 static void
5833 cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n)
5834 {
5835 }
5836
5837 static void
5838 cache_fpl_pathlen_inc(struct cache_fpl *fpl)
5839 {
5840 }
5841
5842 static void
5843 cache_fpl_pathlen_dec(struct cache_fpl *fpl)
5844 {
5845 }
5846 #endif
5847
5848 static void
5849 cache_fplookup_parse(struct cache_fpl *fpl)
5850 {
5851 struct nameidata *ndp;
5852 struct componentname *cnp;
5853 struct vnode *dvp;
5854 char *cp;
5855 uint32_t hash;
5856
5857 ndp = fpl->ndp;
5858 cnp = fpl->cnp;
5859 dvp = fpl->dvp;
5860
5861 /*
5862 * Find the end of this path component; it is either / or nul.
5863 *
5864 * Store / as a temporary sentinel so that we only have one character
5865 * to test for. Pathnames tend to be short so this should not
5866 * result in cache misses.
5867 *
5868 * TODO: fix this to be word-sized.
5869 */
5870 MPASS(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] >= cnp->cn_pnbuf);
5871 KASSERT(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] == fpl->nulchar,
5872 ("%s: mismatch between pathlen (%zu) and nulchar (%p != %p), string [%s]\n",
5873 __func__, fpl->debug.ni_pathlen, &cnp->cn_nameptr[fpl->debug.ni_pathlen - 1],
5874 fpl->nulchar, cnp->cn_pnbuf));
5875 KASSERT(*fpl->nulchar == '\0',
5876 ("%s: expected nul at %p; string [%s]\n", __func__, fpl->nulchar,
5877 cnp->cn_pnbuf));
5878 hash = cache_get_hash_iter_start(dvp);
5879 *fpl->nulchar = '/';
5880 for (cp = cnp->cn_nameptr; *cp != '/'; cp++) {
5881 KASSERT(*cp != '\0',
5882 ("%s: encountered unexpected nul; string [%s]\n", __func__,
5883 cnp->cn_nameptr));
5884 hash = cache_get_hash_iter(*cp, hash);
5885 continue;
5886 }
5887 *fpl->nulchar = '\0';
5888 fpl->hash = cache_get_hash_iter_finish(hash);
5889
5890 cnp->cn_namelen = cp - cnp->cn_nameptr;
5891 cache_fpl_pathlen_sub(fpl, cnp->cn_namelen);
5892
5893 #ifdef INVARIANTS
5894 /*
5895 * cache_get_hash only accepts lengths up to NAME_MAX. This is fine since
5896 * we are going to fail this lookup with ENAMETOOLONG (see below).
5897 */
5898 if (cnp->cn_namelen <= NAME_MAX) {
5899 if (fpl->hash != cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp)) {
5900 panic("%s: mismatched hash for [%s] len %ld", __func__,
5901 cnp->cn_nameptr, cnp->cn_namelen);
5902 }
5903 }
5904 #endif
5905
5906 /*
5907 * Hack: we have to check if the found path component's length exceeds
5908 * NAME_MAX. However, the condition is very rarely true and the check can
5909 * be elided in the common case -- if an entry was found in the cache,
5910 * then it could not have been too long to begin with.
5911 */
5912 ndp->ni_next = cp;
5913 }
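
/*
 * The sentinel trick used above, shown in isolation as a plain C sketch
 * (not the kernel code itself): instead of testing every byte for both
 * '/' and '\0', the nul terminator is temporarily overwritten with '/'
 * so the scan has only one terminator to look for, and is restored once
 * the component boundary is known.
 *
 *	size_t
 *	component_len(char *name, char *nul)
 *	{
 *		char *cp;
 *
 *		*nul = '/';		// plant the sentinel
 *		for (cp = name; *cp != '/'; cp++)
 *			continue;
 *		*nul = '\0';		// restore the terminator
 *		return (cp - name);
 *	}
 *
 * cache_fplookup_parse() folds hash computation into the same loop so the
 * component is only traversed once.
 */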
5914
5915 static void
5916 cache_fplookup_parse_advance(struct cache_fpl *fpl)
5917 {
5918 struct nameidata *ndp;
5919 struct componentname *cnp;
5920
5921 ndp = fpl->ndp;
5922 cnp = fpl->cnp;
5923
5924 cnp->cn_nameptr = ndp->ni_next;
5925 KASSERT(*(cnp->cn_nameptr) == '/',
5926 ("%s: should have seen slash at %p ; buf %p [%s]\n", __func__,
5927 cnp->cn_nameptr, cnp->cn_pnbuf, cnp->cn_pnbuf));
5928 cnp->cn_nameptr++;
5929 cache_fpl_pathlen_dec(fpl);
5930 }
5931
5932 /*
5933 * Skip spurious slashes in a pathname (e.g., "foo///bar") and retry.
5934 *
5935 * Lockless lookup tries to elide checking for spurious slashes and, should they
5936 * be present, is guaranteed to fail to find an entry. In this case the caller
5937 * must check if the name starts with a slash and call this routine. It is
5938 * going to fast forward across the spurious slashes and set the state up for
5939 * retry.
5940 */
5941 static int __noinline
5942 cache_fplookup_skip_slashes(struct cache_fpl *fpl)
5943 {
5944 struct nameidata *ndp;
5945 struct componentname *cnp;
5946
5947 ndp = fpl->ndp;
5948 cnp = fpl->cnp;
5949
5950 MPASS(*(cnp->cn_nameptr) == '/');
5951 do {
5952 cnp->cn_nameptr++;
5953 cache_fpl_pathlen_dec(fpl);
5954 } while (*(cnp->cn_nameptr) == '/');
5955
5956 /*
5957 * Go back to one slash so that cache_fplookup_parse_advance has
5958 * something to skip.
5959 */
5960 cnp->cn_nameptr--;
5961 cache_fpl_pathlen_inc(fpl);
5962
5963 /*
5964 * cache_fplookup_parse_advance starts from ndp->ni_next
5965 */
5966 ndp->ni_next = cnp->cn_nameptr;
5967
5968 /*
5969 * See cache_fplookup_dot.
5970 */
5971 fpl->tvp = fpl->dvp;
5972 fpl->tvp_seqc = fpl->dvp_seqc;
5973
5974 return (0);
5975 }
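
/*
 * A short trace for "foo///bar" (the name is invented for the example):
 * after "foo" resolves, cn_nameptr points at the second '/'. The loop
 * above advances it past the run of slashes to 'b', then backs up one
 * character so that it rests on the last spurious '/', and ni_next is
 * set to the same spot. The next cache_fplookup_parse_advance() skips
 * that slash as if it were the single separator of a well-formed path
 * and the lookup of "bar" proceeds against the unchanged fpl->dvp.
 */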
5976
5977 /*
5978 * Handle trailing slashes (e.g., "foo/").
5979 *
5980 * If a trailing slash is found the terminal vnode must be a directory.
5981 * Regular lookup shortens the path by nullifying the first trailing slash and
5982 * sets the TRAILINGSLASH flag to denote this took place. There are several
5983 * checks on it performed later.
5984 *
5985 * Similarly to spurious slashes, lockless lookup handles this in a speculative
5986 * manner relying on an invariant that a non-directory vnode will get a miss.
5987 * In this case cn_nameptr[0] == '\0' and cn_namelen == 0.
5988 *
5989 * Thus for a path like "foo/bar/" the code unwinds the state back to "bar/"
5990 * and denotes this is the last path component, which avoids looping back.
5991 *
5992 * Only plain lookups are supported for now to limit the corner cases to handle.
5993 */
5994 static int __noinline
5995 cache_fplookup_trailingslash(struct cache_fpl *fpl)
5996 {
5997 #ifdef INVARIANTS
5998 size_t ni_pathlen;
5999 #endif
6000 struct nameidata *ndp;
6001 struct componentname *cnp;
6002 struct namecache *ncp;
6003 struct vnode *tvp;
6004 char *cn_nameptr_orig, *cn_nameptr_slash;
6005 seqc_t tvp_seqc;
6006 u_char nc_flag;
6007
6008 ndp = fpl->ndp;
6009 cnp = fpl->cnp;
6010 tvp = fpl->tvp;
6011 tvp_seqc = fpl->tvp_seqc;
6012
6013 MPASS(fpl->dvp == fpl->tvp);
6014 KASSERT(cache_fpl_istrailingslash(fpl),
6015 ("%s: expected trailing slash at %p; string [%s]\n", __func__, fpl->nulchar - 1,
6016 cnp->cn_pnbuf));
6017 KASSERT(cnp->cn_nameptr[0] == '\0',
6018 ("%s: expected nul char at %p; string [%s]\n", __func__, &cnp->cn_nameptr[0],
6019 cnp->cn_pnbuf));
6020 KASSERT(cnp->cn_namelen == 0,
6021 ("%s: namelen 0 but got %ld; string [%s]\n", __func__, cnp->cn_namelen,
6022 cnp->cn_pnbuf));
6023 MPASS(cnp->cn_nameptr > cnp->cn_pnbuf);
6024
6025 if (cnp->cn_nameiop != LOOKUP) {
6026 return (cache_fpl_aborted(fpl));
6027 }
6028
6029 if (__predict_false(tvp->v_type != VDIR)) {
6030 if (!vn_seqc_consistent(tvp, tvp_seqc)) {
6031 return (cache_fpl_aborted(fpl));
6032 }
6033 cache_fpl_smr_exit(fpl);
6034 return (cache_fpl_handled_error(fpl, ENOTDIR));
6035 }
6036
6037 /*
6038 * Denote the last component.
6039 */
6040 ndp->ni_next = &cnp->cn_nameptr[0];
6041 MPASS(cache_fpl_islastcn(ndp));
6042
6043 /*
6044 * Unwind trailing slashes.
6045 */
6046 cn_nameptr_orig = cnp->cn_nameptr;
6047 while (cnp->cn_nameptr >= cnp->cn_pnbuf) {
6048 cnp->cn_nameptr--;
6049 if (cnp->cn_nameptr[0] != '/') {
6050 break;
6051 }
6052 }
6053
6054 /*
6055 * Unwind to the beginning of the path component.
6056 *
6057 * Note the path may or may not have started with a slash.
6058 */
6059 cn_nameptr_slash = cnp->cn_nameptr;
6060 while (cnp->cn_nameptr > cnp->cn_pnbuf) {
6061 cnp->cn_nameptr--;
6062 if (cnp->cn_nameptr[0] == '/') {
6063 break;
6064 }
6065 }
6066 if (cnp->cn_nameptr[0] == '/') {
6067 cnp->cn_nameptr++;
6068 }
6069
6070 cnp->cn_namelen = cn_nameptr_slash - cnp->cn_nameptr + 1;
6071 cache_fpl_pathlen_add(fpl, cn_nameptr_orig - cnp->cn_nameptr);
6072 cache_fpl_checkpoint(fpl);
6073
6074 #ifdef INVARIANTS
6075 ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1;
6076 if (ni_pathlen != fpl->debug.ni_pathlen) {
6077 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
6078 __func__, ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
6079 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
6080 }
6081 #endif
6082
6083 /*
6084 * If this was a "./" lookup the parent directory is already correct.
6085 */
6086 if (cnp->cn_nameptr[0] == '.' && cnp->cn_namelen == 1) {
6087 return (0);
6088 }
6089
6090 /*
6091 * Otherwise we need to look it up.
6092 */
6093 tvp = fpl->tvp;
6094 ncp = atomic_load_consume_ptr(&tvp->v_cache_dd);
6095 if (__predict_false(ncp == NULL)) {
6096 return (cache_fpl_aborted(fpl));
6097 }
6098 nc_flag = atomic_load_char(&ncp->nc_flag);
6099 if ((nc_flag & NCF_ISDOTDOT) != 0) {
6100 return (cache_fpl_aborted(fpl));
6101 }
6102 fpl->dvp = ncp->nc_dvp;
6103 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
6104 if (seqc_in_modify(fpl->dvp_seqc)) {
6105 return (cache_fpl_aborted(fpl));
6106 }
6107 return (0);
6108 }
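
/*
 * A pointer-level trace for "foo/bar/" (the name is invented for the
 * example): on entry cn_nameptr points at the nul past the trailing '/',
 * cn_namelen is 0 and the terminal vnode is the directory for "bar".
 * The first loop backs up over the trailing slashes and stops on 'r',
 * the last character of "bar"; the second loop backs up to the '/'
 * after "foo" and steps one forward, leaving cn_nameptr on 'b' with
 * cn_namelen == 3. ni_next keeps pointing at the nul, which marks "bar"
 * as the last component, and fpl->dvp is reset via tvp's ->v_cache_dd
 * entry to the directory containing "bar", so the main loop resolves the
 * final component again with the directory requirement in effect.
 */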
6109
6110 /*
6111 * See the API contract for VOP_FPLOOKUP_VEXEC.
6112 */
6113 static int __noinline
6114 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
6115 {
6116 struct componentname *cnp;
6117 struct vnode *dvp;
6118 seqc_t dvp_seqc;
6119
6120 cnp = fpl->cnp;
6121 dvp = fpl->dvp;
6122 dvp_seqc = fpl->dvp_seqc;
6123
6124 /*
6125 * Hack: delayed empty path checking.
6126 */
6127 if (cnp->cn_pnbuf[0] == '\0') {
6128 return (cache_fplookup_emptypath(fpl));
6129 }
6130
6131 /*
6132 * TODO: Due to ignoring trailing slashes lookup will perform a
6133 * permission check on the last dir when it should not be doing it. It
6134 * may fail, but said failure should be ignored. It is possible to fix
6135 * it up fully without resorting to regular lookup, but for now just
6136 * abort.
6137 */
6138 if (cache_fpl_istrailingslash(fpl)) {
6139 return (cache_fpl_aborted(fpl));
6140 }
6141
6142 /*
6143 * Hack: delayed degenerate path checking.
6144 */
6145 if (cnp->cn_nameptr[0] == '\0' && fpl->tvp == NULL) {
6146 return (cache_fplookup_degenerate(fpl));
6147 }
6148
6149 /*
6150 * Hack: delayed name len checking.
6151 */
6152 if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
6153 cache_fpl_smr_exit(fpl);
6154 return (cache_fpl_handled_error(fpl, ENAMETOOLONG));
6155 }
6156
6157 /*
6158 * Hack: they may be looking up foo/bar, where foo is not a directory.
6159 * In such a case we need to return ENOTDIR, but we may happen to get
6160 * here with a different error.
6161 */
6162 if (dvp->v_type != VDIR) {
6163 error = ENOTDIR;
6164 }
6165
6166 /*
6167 * Hack: handle O_SEARCH.
6168 *
6169 * Open Group Base Specifications Issue 7, 2018 edition states:
6170 * <quote>
6171 * If the access mode of the open file description associated with the
6172 * file descriptor is not O_SEARCH, the function shall check whether
6173 * directory searches are permitted using the current permissions of
6174 * the directory underlying the file descriptor. If the access mode is
6175 * O_SEARCH, the function shall not perform the check.
6176 * </quote>
6177 *
6178 * Regular lookup tests for the NOEXECCHECK flag for every path
6179 * component to decide whether to do the permission check. However,
6180 * since most lookups never have the flag (and when they do it is only
6181 * present for the first path component), lockless lookup only acts on
6182 * it if there is a permission problem. Here the flag is represented
6183 * with a boolean so that we don't have to clear it on the way out.
6184 *
6185 * For simplicity this always aborts.
6186 * TODO: check if this is the first lookup and ignore the permission
6187 * problem. Note the flag has to survive fallback (if it happens to be
6188 * performed).
6189 */
6190 if (fpl->fsearch) {
6191 return (cache_fpl_aborted(fpl));
6192 }
6193
6194 switch (error) {
6195 case EAGAIN:
6196 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
6197 error = cache_fpl_aborted(fpl);
6198 } else {
6199 cache_fpl_partial(fpl);
6200 }
6201 break;
6202 default:
6203 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
6204 error = cache_fpl_aborted(fpl);
6205 } else {
6206 cache_fpl_smr_exit(fpl);
6207 cache_fpl_handled_error(fpl, error);
6208 }
6209 break;
6210 }
6211 return (error);
6212 }
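
/*
 * For illustration, a minimal conforming VOP_FPLOOKUP_VEXEC sketch for a
 * hypothetical examplefs; see the API contract in the comment preceding
 * cache_fplookup() below. All examplefs names are invented.
 *
 *	static int
 *	examplefs_fplookup_vexec(struct vop_fplookup_vexec_args *ap)
 *	{
 *		struct examplefs_node *node;
 *
 *		// ->v_data may be concurrently zeroed; the node is
 *		// assumed to be freed via vfs_smr, making the access
 *		// safe while within the SMR section.
 *		node = atomic_load_ptr(&ap->a_vp->v_data);
 *		if (__predict_false(node == NULL))
 *			return (EAGAIN);
 *		return (vaccess_vexec_smr(node->mode, node->uid, node->gid,
 *		    ap->a_cred));
 *	}
 *
 * EAGAIN only means the check could not be performed; the routine above
 * then decides between falling back to the locked lookup and aborting.
 */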
6213
6214 static int
6215 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
6216 {
6217 struct nameidata *ndp;
6218 struct componentname *cnp;
6219 struct mount *mp;
6220 int error;
6221
6222 ndp = fpl->ndp;
6223 cnp = fpl->cnp;
6224
6225 cache_fpl_checkpoint(fpl);
6226
6227 /*
6228 * The vnode at hand is almost always stable, so skip checking for it.
6229 * Worst case this postpones the check towards the end of the iteration
6230 * of the main loop.
6231 */
6232 fpl->dvp = dvp;
6233 fpl->dvp_seqc = vn_seqc_read_notmodify(fpl->dvp);
6234
6235 mp = atomic_load_ptr(&dvp->v_mount);
6236 if (__predict_false(mp == NULL || !cache_fplookup_mp_supported(mp))) {
6237 return (cache_fpl_aborted(fpl));
6238 }
6239
6240 MPASS(fpl->tvp == NULL);
6241
6242 for (;;) {
6243 cache_fplookup_parse(fpl);
6244
6245 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
6246 if (__predict_false(error != 0)) {
6247 error = cache_fplookup_failed_vexec(fpl, error);
6248 break;
6249 }
6250
6251 error = cache_fplookup_next(fpl);
6252 if (__predict_false(cache_fpl_terminated(fpl))) {
6253 break;
6254 }
6255
6256 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
6257
6258 if (fpl->tvp->v_type == VLNK) {
6259 error = cache_fplookup_symlink(fpl);
6260 if (cache_fpl_terminated(fpl)) {
6261 break;
6262 }
6263 } else {
6264 if (cache_fpl_islastcn(ndp)) {
6265 error = cache_fplookup_final(fpl);
6266 break;
6267 }
6268
6269 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
6270 error = cache_fpl_aborted(fpl);
6271 break;
6272 }
6273
6274 fpl->dvp = fpl->tvp;
6275 fpl->dvp_seqc = fpl->tvp_seqc;
6276 cache_fplookup_parse_advance(fpl);
6277 }
6278
6279 cache_fpl_checkpoint(fpl);
6280 }
6281
6282 return (error);
6283 }
6284
6285 /*
6286 * Fast path lookup protected with SMR and sequence counters.
6287 *
6288 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
6289 *
6290 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting the
6291 * criteria outlined below.
6292 *
6293 * Traditional vnode lookup conceptually looks like this:
6294 *
6295 * vn_lock(current);
6296 * for (;;) {
6297 * next = find();
6298 * vn_lock(next);
6299 * vn_unlock(current);
6300 * current = next;
6301 * if (last)
6302 * break;
6303 * }
6304 * return (current);
6305 *
6306 * Each jump to the next vnode is safe memory-wise and atomic with respect to
6307 * any modifications thanks to holding respective locks.
6308 *
6309 * The same guarantee can be provided with a combination of safe memory
6310 * reclamation and sequence counters instead. If all operations which affect
6311 * the relationship between the current vnode and the one we are looking for
6312 * also modify the counter, we can verify whether all the conditions held as
6313 * we made the jump. This includes things like permissions, mount points etc.
6314 * Counter modification is provided by enclosing relevant places in
6315 * vn_seqc_write_begin()/end() calls.
6316 *
6317 * Thus this translates to:
6318 *
6319 * vfs_smr_enter();
6320 * dvp_seqc = seqc_read_any(dvp);
6321 * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
6322 * abort();
6323 * for (;;) {
6324 * tvp = find();
6325 * tvp_seqc = seqc_read_any(tvp);
6326 * if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
6327 * abort();
6328 * if (!seqc_consistent(dvp, dvp_seqc)) // someone is altering the vnode
6329 * abort();
6330 * dvp = tvp; // we know nothing of importance has changed
6331 * dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
6332 * if (last)
6333 * break;
6334 * }
6335 * vget(); // secure the vnode
6336 * if (!seqc_consistent(tvp, tvp_seqc)) // final check
6337 * abort();
6338 * // at this point we know nothing has changed for any parent<->child pair
6339 * // as they were crossed during the lookup, meaning we matched the guarantee
6340 * // of the locked variant
6341 * return (tvp);
6342 *
6343 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
6344 * - they are called while within vfs_smr protection which they must never exit
6345 * - EAGAIN can be returned to denote checking could not be performed, it is
6346 * always valid to return it
6347 * - if the sequence counter has not changed the result must be valid
6348 * - if the sequence counter has changed both false positives and false negatives
6349 * are permitted (since the result will be rejected later)
6350 * - for simple cases of unix permission checks vaccess_vexec_smr can be used
6351 *
6352 * Caveats to watch out for:
6353 * - vnodes are passed unlocked and unreferenced with nothing stopping
6354 * VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
6355 * to use atomic_load_ptr to fetch it.
6356 * - the aforementioned object can also get freed, meaning absent other means it
6357 * should be protected with vfs_smr
6358 * - either safely checking permissions as they are modified or guaranteeing
6359 * their stability is left to the routine
6360 */
6361 int
6362 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
6363 struct pwd **pwdp)
6364 {
6365 struct cache_fpl fpl;
6366 struct pwd *pwd;
6367 struct vnode *dvp;
6368 struct componentname *cnp;
6369 int error;
6370
6371 fpl.status = CACHE_FPL_STATUS_UNSET;
6372 fpl.in_smr = false;
6373 fpl.ndp = ndp;
6374 fpl.cnp = cnp = &ndp->ni_cnd;
6375 MPASS(ndp->ni_lcf == 0);
6376 KASSERT ((cnp->cn_flags & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
6377 ("%s: internal flags found in cn_flags %" PRIx64, __func__,
6378 cnp->cn_flags));
6379 MPASS(cnp->cn_nameptr == cnp->cn_pnbuf);
6380 MPASS(ndp->ni_resflags == 0);
6381
6382 if (__predict_false(!cache_can_fplookup(&fpl))) {
6383 *status = fpl.status;
6384 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
6385 return (EOPNOTSUPP);
6386 }
6387
6388 cache_fpl_checkpoint_outer(&fpl);
6389
6390 cache_fpl_smr_enter_initial(&fpl);
6391 #ifdef INVARIANTS
6392 fpl.debug.ni_pathlen = ndp->ni_pathlen;
6393 #endif
6394 fpl.nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1];
6395 fpl.fsearch = false;
6396 fpl.tvp = NULL; /* for degenerate path handling */
6397 fpl.pwd = pwdp;
6398 pwd = pwd_get_smr();
6399 *(fpl.pwd) = pwd;
6400 namei_setup_rootdir(ndp, cnp, pwd);
6401 ndp->ni_topdir = pwd->pwd_jdir;
6402
6403 if (cnp->cn_pnbuf[0] == '/') {
6404 dvp = cache_fpl_handle_root(&fpl);
6405 ndp->ni_resflags = NIRES_ABS;
6406 } else {
6407 if (ndp->ni_dirfd == AT_FDCWD) {
6408 dvp = pwd->pwd_cdir;
6409 } else {
6410 error = cache_fplookup_dirfd(&fpl, &dvp);
6411 if (__predict_false(error != 0)) {
6412 goto out;
6413 }
6414 }
6415 }
6416
6417 SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
6418 error = cache_fplookup_impl(dvp, &fpl);
6419 out:
6420 cache_fpl_smr_assert_not_entered(&fpl);
6421 cache_fpl_assert_status(&fpl);
6422 *status = fpl.status;
6423 if (SDT_PROBES_ENABLED()) {
6424 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
6425 if (fpl.status == CACHE_FPL_STATUS_HANDLED)
6426 SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true,
6427 ndp);
6428 }
6429
6430 if (__predict_true(fpl.status == CACHE_FPL_STATUS_HANDLED)) {
6431 MPASS(error != CACHE_FPL_FAILED);
6432 if (error != 0) {
6433 cache_fpl_cleanup_cnp(fpl.cnp);
6434 MPASS(fpl.dvp == NULL);
6435 MPASS(fpl.tvp == NULL);
6436 }
6437 ndp->ni_dvp = fpl.dvp;
6438 ndp->ni_vp = fpl.tvp;
6439 }
6440 return (error);
6441 }
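
/*
 * Sketch of the caller side (simplified and paraphrased, not the literal
 * namei() code): a status of CACHE_FPL_STATUS_HANDLED means the result,
 * error included, is final, while any other outcome (or an EOPNOTSUPP
 * return because lockless lookup was not applicable to begin with)
 * leaves the nameidata in a state the regular locked lookup can resume
 * from.
 *
 *	error = cache_fplookup(ndp, &status, &pwd);
 *	if (status == CACHE_FPL_STATUS_HANDLED)
 *		return (error);
 *	// fall back to the locked lookup
 */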
6442