1 /*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1989, 1993, 1995
5 * The Regents of the University of California. All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Poul-Henning Kamp of the FreeBSD Project.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 */
34
35 #include "opt_ddb.h"
36 #include "opt_ktrace.h"
37
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/capsicum.h>
41 #include <sys/counter.h>
42 #include <sys/filedesc.h>
43 #include <sys/fnv_hash.h>
44 #include <sys/inotify.h>
45 #include <sys/kernel.h>
46 #include <sys/ktr.h>
47 #include <sys/lock.h>
48 #include <sys/malloc.h>
49 #include <sys/fcntl.h>
50 #include <sys/jail.h>
51 #include <sys/mount.h>
52 #include <sys/namei.h>
53 #include <sys/proc.h>
54 #include <sys/seqc.h>
55 #include <sys/sdt.h>
56 #include <sys/smr.h>
57 #include <sys/smp.h>
58 #include <sys/syscallsubr.h>
59 #include <sys/sysctl.h>
60 #include <sys/sysproto.h>
61 #include <sys/vnode.h>
62 #include <ck_queue.h>
63 #ifdef KTRACE
64 #include <sys/ktrace.h>
65 #endif
66 #ifdef INVARIANTS
67 #include <machine/_inttypes.h>
68 #endif
69
70 #include <security/audit/audit.h>
71 #include <security/mac/mac_framework.h>
72
73 #ifdef DDB
74 #include <ddb/ddb.h>
75 #endif
76
77 #include <vm/uma.h>
78
79 /*
80 * High level overview of name caching in the VFS layer.
81 *
82 * Originally caching was implemented as part of UFS, later extracted to allow
83 * use by other filesystems. A decision was made to make it optional and
84 * completely detached from the rest of the kernel, which comes with limitations
85 * outlined near the end of this comment block.
86 *
87 * This fundamental choice needs to be revisited. In the meantime, the current
88 * state is described below. Significance of all notable routines is explained
89 * in comments placed above their implementation. Scattered throughout the
90 * file are TODO comments indicating shortcomings which can be fixed without
91 * reworking everything (most of the fixes will likely be reusable). Various
92 * details are omitted from this explanation to not clutter the overview; they
93 * have to be checked by reading the code and associated commentary.
94 *
95 * Keep in mind that it's individual path components which are cached, not full
96 * paths. That is, for a fully cached path "foo/bar/baz" there are 3 entries,
97 * one for each name.
98 *
99 * I. Data organization
100 *
101 * Entries are described by "struct namecache" objects and stored in a hash
102 * table. See cache_get_hash for more information.
103 *
104 * "struct vnode" contains pointers to source entries (names which can be found
105 * when traversing through said vnode), destination entries (names of that
106 * vnode (see "Limitations" for a breakdown on the subject)) and a pointer to
107 * the parent vnode.
108 *
109 * The (directory vnode; name) tuple reliably determines the target entry if
110 * it exists.
111 *
112 * Since there were no small locks at the time of writing this comment (all are
113 * 32 bytes in size on LP64), the code works around the problem by introducing
114 * lock arrays to protect hash buckets and vnode lists.
115 *
116 * II. Filesystem integration
117 *
118 * Filesystems participating in name caching do the following:
119 * - set vop_lookup routine to vfs_cache_lookup
120 * - set vop_cachedlookup to a routine which can perform the lookup if the
121 * above fails
122 * - if they support lockless lookup (see below), they set vop_fplookup_vexec
123 * and vop_fplookup_symlink along with the MNTK_FPLOOKUP flag on the mount
124 * point
125 * - call cache_purge or cache_vop_* routines to eliminate stale entries as
126 * applicable
127 * - call cache_enter to add entries depending on the MAKEENTRY flag
128 *
129 * With the above in mind, there are 2 entry points when doing lookups:
130 * - ... -> namei -> cache_fplookup -- this is the default
131 * - ... -> VOP_LOOKUP -> vfs_cache_lookup -- normally only called by namei
132 * should the above fail
133 *
134 * Example code flow how an entry is added:
135 * ... -> namei -> cache_fplookup -> cache_fplookup_noentry -> VOP_LOOKUP ->
136 * vfs_cache_lookup -> VOP_CACHEDLOOKUP -> ufs_lookup_ino -> cache_enter
137 *
138 * You may notice a degree of CPU waste in this callchain.
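 *
 * As a hedged illustration (foofs and its routines are made up, not taken
 * from any real filesystem), the vop vector wiring typically looks like:
 *
 *	struct vop_vector foofs_vnodeops = {
 *		...
 *		.vop_lookup		= vfs_cache_lookup,
 *		.vop_cachedlookup	= foofs_lookup,
 *		.vop_fplookup_vexec	= foofs_fplookup_vexec,
 *		.vop_fplookup_symlink	= foofs_fplookup_symlink,
 *		...
 *	};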
139 *
140 * III. Performance considerations
141 *
142 * For the lockless case, forward lookup avoids any writes to shared areas apart
143 * from the terminal path component. In other words, non-modifying lookups of
144 * different files don't suffer any scalability problems in the namecache
145 * itself.
146 *
147 * Looking up the same file is limited by VFS and goes beyond the scope of this
148 * file.
149 *
150 * At least on amd64 the single-threaded bottleneck for long paths is hashing
151 * (see cache_get_hash). There are cases where the code issues an acquire fence
152 * multiple times; they can be combined on architectures which suffer from it.
153 *
154 * For the locked case, each encountered vnode has to be referenced and locked in
155 * order to be handed out to the caller (normally that's namei). This
156 * introduces a significant single-threaded hit and serialization when multi-threaded.
157 *
158 * Reverse lookup (e.g., "getcwd") fully scales provided it is fully cached -- it
159 * avoids any writes to shared areas for any of the components.
160 *
161 * Unrelated insertions are partially serialized on updating the global entry
162 * counter and possibly serialized on colliding bucket or vnode locks.
163 *
164 * IV. Observability
165 *
166 * Several statistics are collected in the vfs.cache sysctl tree.
167 *
168 * Some of the state can be checked for with explicit dtrace probes, but most of
169 * it depends on implementation details.
170 *
171 * Examples:
172 *
173 * # Check what lookups failed to be handled in a lockless manner. Column 1 is
174 * # line number, column 2 is status code (see cache_fpl_status)
175 * dtrace -n 'vfs:fplookup:lookup:done { @[arg1, arg2] = count(); }'
176 *
177 * # Histogram of lengths of names added, aggregated by which programs are doing it
178 * dtrace -n 'fbt::cache_enter_time:entry { @[execname] = quantize(args[2]->cn_namelen); }'
179 *
180 * # Same as above but only those which exceed 64 characters
181 * dtrace -n 'fbt::cache_enter_time:entry /args[2]->cn_namelen > 64/ { @[execname] = quantize(args[2]->cn_namelen); }'
182 *
183 * # Who is performing lookups with spurious slashes (e.g., "foo//bar") and what
184 * # path is it
185 * dtrace -n 'fbt::cache_fplookup_skip_slashes:entry { @[execname, stringof(args[0]->cnp->cn_pnbuf)] = count(); }'
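 *
 * # (illustrative) Which programs are missing in the cache and how often,
 * # using the lookup probes defined later in this file
 * dtrace -n 'vfs:namecache:lookup:miss { @[execname] = count(); }'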
186 *
187 * V. Limitations and implementation defects
188 *
189 * - since it is possible there is no entry for an open file, tools like
190 * "procstat" may fail to resolve fd -> vnode -> path to anything
191 * - even if a filesystem adds an entry, it may get purged (e.g., due to memory
192 * shortage) in which case the above problem applies
193 * - hardlinks are not tracked, thus if a vnode is reachable in more than one
194 * way, resolving a name may return a different path than the one used to
195 * open it (even if said path is still valid)
196 * - by default entries are not added for newly created files
197 * - adding an entry may need to evict a negative entry first, which happens in 2
198 * distinct places (evicting on lookup, adding in a later VOP) making it
199 * impossible to simply reuse it
200 * - there is a simple scheme to evict negative entries as the cache is approaching
201 * its capacity, but it is very unclear if doing so is a good idea to begin with
202 * - vnodes are subject to being recycled even if the target inode is left in memory,
203 * which loses the name cache entries when it perhaps should not. In the case of
204 * tmpfs, names get duplicated -- kept by the filesystem itself and the namecache separately
205 * - vnode reclamation (see vnlru in kern/vfs_subr.c) defaults to skipping
206 * directories for this very reason, which arguably further reduces the quality
207 * of the vnode LRU. Per the above this is done to avoid breaking vnode -> path
208 * resolution (it becomes expensive for directories and impossible for the rest).
209 * This would not be a factor if namecache entries could persist without vnodes.
210 * - struct namecache has a fixed size and comes in 2 variants, often wasting
211 * space. It is now hard to replace with malloc due to dependence on SMR, which
212 * requires UMA zones to opt in
213 * - lack of better integration with the kernel also turns nullfs into a layered
214 * filesystem instead of something which can take advantage of caching
215 *
216 * Appendix A: where is the time lost, expanding on paragraph III
217 *
218 * While some care went into optimizing lookups, there is still plenty of
219 * performance left on the table, most notably from single-threaded standpoint.
220 * Below is a woefully incomplete list of changes which can help. Ideas are
221 * mostly sketched out, no claim is made all kinks or prerequisites are laid
222 * out. The name of the game is eliding branches altogether and hopefully some
223 * of the memory accesses.
224 *
225 * Note there is performance lost all over VFS.
226 *
227 * === SMR-only lookup
228 *
229 * For commonly used ops like stat(2), when the terminal vnode *is* cached,
230 * lockless lookup could refrain from refing/locking the found vnode and
231 * instead return while within the SMR section. Then a call to, say,
232 * vop_stat_smr could do the work (or fail with EAGAIN), finally the result
233 * would be validated with seqc not changing. This would be faster
234 * single-threaded as it dodges atomics and would provide full scalability for
235 * multicore uses. This would *not* work for open(2) or other calls which need
236 * the vnode to hang around for the long haul, but would work for the aforementioned
237 * stat(2) as well as access(2), readlink(2), realpathat(2) and probably more.
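 *
 * A rough sketch of the idea (ACHTUNG: vop_stat_smr is hypothetical; locking,
 * error handling and the seqc plumbing are elided):
 *
 *	vfs_smr_enter();
 *	// ... lockless lookup finds vp, seqc snapshot taken along the way ...
 *	error = vop_stat_smr(vp, sb);		// may fail with EAGAIN
 *	if (error == 0 && vn_seqc_read_any(vp) != seqc)
 *		error = EAGAIN;			// vnode changed, don't trust the result
 *	vfs_smr_exit();
 *	if (error == EAGAIN)
 *		// fall back to the regular refed/locked lookup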
238 *
239 * === copyinstr
240 *
241 * On all architectures it operates one byte at a time, while it could be
242 * word-sized instead thanks to the Mycroft trick.
243 *
244 * The API itself is rather pessimal for path lookup, accepting arbitrary sizes and
245 * *optionally* filling in the length parameter.
246 *
247 * Instead a new routine (copyinpath?) could be introduced, demanding a buffer
248 * size which is a multiple of the word size (and never zero), with the length
249 * always returned. On top of it the routine could be allowed to transform the
250 * buffer in arbitrary ways, most notably writing past the found length (not to
251 * be confused with writing past buffer size) -- this would allow word-sized
252 * movs while checking for '\0' later.
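 *
 * A possible shape of the interface (hypothetical, neither the name nor the
 * exact semantics are settled):
 *
 *	// bufsize must be a non-zero multiple of the word size; *lencopied is
 *	// always filled in; the routine may scribble on buf past the found
 *	// length (but never past bufsize)
 *	int copyinpath(const void *uaddr, char *buf, size_t bufsize, size_t *lencopied);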
253 *
254 * === detour through namei
255 *
256 * Currently the lockless lookup suffers from being called by namei, which then has to check if
257 * things worked out locklessly. Instead the lockless lookup could be the
258 * actual entry point which calls what is currently namei as a fallback.
259 *
260 * It could be hotpatched if lockless lookup is disabled.
261 *
262 * === avoidable branches in cache_can_fplookup
263 *
264 * The cache_fast_lookup_enabled flag check could be hotpatchable (in fact if
265 * this is off, none of fplookup code should execute, see above).
266 *
267 * Both audit and capsicum branches can be combined into one, but it requires
268 * paying off a lot of tech debt first.
269 *
270 * ni_startdir could be indicated with a flag in cn_flags, eliminating the
271 * branch.
272 *
273 * === mount stacks
274 *
275 * Crossing a mount requires checking if perhaps something is mounted on top.
276 * Instead, an additional entry could be added to struct mount with a pointer
277 * to the final mount on the stack. This would be recalculated on each
278 * mount/unmount.
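 *
 * A sketch of the crossing with such a hypothetical field (mnt_stacktop is
 * made up; locking and hold counts ignored):
 *
 *	mp = atomic_load_ptr(&vp->v_mountedhere);
 *	if (mp != NULL) {
 *		mp = mp->mnt_stacktop;	// precomputed final mount in the stack
 *		vp = mp->mnt_rootvnode;	// see the next section
 *	}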
279 *
280 * === root vnodes
281 *
282 * It could become part of the API contract to *always* have a rootvnode set in
283 * mnt_rootvnode. Such vnodes are annotated with VV_ROOT and vnlru would have
284 * to be modified to always skip them.
285 *
286 * === inactive on v_usecount reaching 0
287 *
288 * VOP_NEED_INACTIVE should not exist. Filesystems can indicate need for such
289 * processing with a bit in usecount and by adding a hold count. Then the vput fast path
290 * would become as simple as (ACHTUNG: locking ignored):
291 *
292 * ref = atomic_fetchadd_int(&vp->v_usecount, -1) - 1;
293 * if ((ref & MAGIC_BIT) == 0) // common case
294 * return;
295 * if (ref != 0) // the bit is set but this was not the last user
296 * return;
297 * // do inactive here
298 *
299 * Also see below.
300 *
301 * === v_holdcnt
302 *
303 * Hold count should probably get eliminated, but one can argue it is a useful
304 * feature. Even if so, handling of v_usecount could be decoupled from it --
305 * vnlru et al would consider the vnode not-freeable if it has either a hold or
306 * use count on it.
307 *
308 * This would eliminate 2 atomics in the common case of securing a vnode and
309 * undoing it.
310 */
311
312 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
313 "Name cache");
314
315 SDT_PROVIDER_DECLARE(vfs);
316 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
317 "struct vnode *");
318 SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *",
319 "struct vnode *");
320 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
321 "char *");
322 SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
323 "const char *");
324 SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
325 "struct namecache *", "int", "int");
326 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
327 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
328 "char *", "struct vnode *");
329 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
330 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
331 "struct vnode *", "char *");
332 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
333 "struct vnode *");
334 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
335 "struct vnode *", "char *");
336 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
337 "char *");
338 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
339 "struct componentname *");
340 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
341 "struct componentname *");
342 SDT_PROBE_DEFINE3(vfs, namecache, purge, done, "struct vnode *", "size_t", "size_t");
343 SDT_PROBE_DEFINE1(vfs, namecache, purge, batch, "int");
344 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
345 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
346 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
347 "struct vnode *");
348 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
349 "char *");
350 SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *",
351 "char *");
352 SDT_PROBE_DEFINE1(vfs, namecache, symlink, alloc__fail, "size_t");
353
354 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata *", "int",
355 "enum cache_fpl_status");
356 SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
357 SDT_PROBE_DECLARE(vfs, namei, lookup, return);
358
359 static char __read_frequently cache_fast_lookup_enabled = true;
360
361 /*
362 * This structure describes the elements in the cache of recent
363 * names looked up by namei.
364 */
365 struct negstate {
366 u_char neg_flag;
367 u_char neg_hit;
368 };
369 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
370 "the state must fit in a union with a pointer without growing it");
371
372 struct namecache {
373 LIST_ENTRY(namecache) nc_src; /* source vnode list */
374 TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */
375 CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
376 struct vnode *nc_dvp; /* vnode of parent of name */
377 union {
378 struct vnode *nu_vp; /* vnode the name refers to */
379 struct negstate nu_neg;/* negative entry state */
380 } n_un;
381 u_char nc_flag; /* flag bits */
382 u_char nc_nlen; /* length of name */
383 char nc_name[]; /* segment name + nul */
384 };
385
386 /*
387 * struct namecache_ts is used in place of struct namecache when time(s) need
388 * to be stored. The nc_dotdottime field is used when a cache entry is mapping
389 * both a non-dotdot directory name plus dotdot for the directory's
390 * parent.
391 */
392 struct namecache_ts {
393 struct timespec nc_time; /* timespec provided by fs */
394 struct timespec nc_dotdottime; /* dotdot timespec provided by fs */
395 int nc_ticks; /* ticks value when entry was added */
396 int nc_pad;
397 struct namecache nc_nc;
398 };
399
400 TAILQ_HEAD(cache_freebatch, namecache);
401
402 /*
403 * Ensure all zones are sufficiently aligned to hold both
404 * struct namecache and struct namecache_ts.
405 */
406 #define CACHE_ZONE_ALIGN_MASK UMA_ALIGNOF(struct namecache_ts)
407
408 /*
409 * TODO: CACHE_PATH_CUTOFF was initially introduced with an arbitrary
410 * value of 32 in FreeBSD 5.2.0. It was bumped to 35 and the path was
411 * NUL terminated with the introduction of DTrace probes. Later, it was
412 * expanded to match the alignment allowing an increase to 39, but it
413 * was not re-evaluated for suitability. It was again bumped to 45 on
414 * 64-bit systems and 41 on 32-bit systems (the current values, now
415 * computed at compile time rather than hardcoded). A simple test
416 * counting lengths during package building in 2020 showed that the
417 * value of 45 covers about 86% of all added entries, reaching 99%
418 * at 65.
419 *
420 * Regardless of the above, use of dedicated zones instead of malloc may be
421 * inducing additional waste. This may be hard to address as said zones are
422 * tied to VFS SMR. Even if retaining them, the current split should be
423 * re-evaluated.
424 */
425 #define CACHE_PATH_CUTOFF_MIN 40
426 #define CACHE_STRUCT_LEN(pathlen) \
427 (offsetof(struct namecache, nc_name) + (pathlen) + 1)
428 #define CACHE_PATH_CUTOFF \
429 (roundup2(CACHE_STRUCT_LEN(CACHE_PATH_CUTOFF_MIN), \
430 _Alignof(struct namecache_ts)) - CACHE_STRUCT_LEN(0))
431
432 #define CACHE_ZONE_SMALL_SIZE \
433 CACHE_STRUCT_LEN(CACHE_PATH_CUTOFF)
434 #define CACHE_ZONE_SMALL_TS_SIZE \
435 (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_SMALL_SIZE)
436 #define CACHE_ZONE_LARGE_SIZE \
437 roundup2(CACHE_STRUCT_LEN(NAME_MAX), _Alignof(struct namecache_ts))
438 #define CACHE_ZONE_LARGE_TS_SIZE \
439 (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_LARGE_SIZE)
440
441 _Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGN_MASK + 1)) == 0,
442 "bad zone size");
443 _Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGN_MASK + 1)) == 0,
444 "bad zone size");
445 _Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGN_MASK + 1)) == 0,
446 "bad zone size");
447 _Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGN_MASK + 1)) == 0,
448 "bad zone size");
449
450 #define nc_vp n_un.nu_vp
451 #define nc_neg n_un.nu_neg
452
453 /*
454 * Flags in namecache.nc_flag
455 */
456 #define NCF_WHITE 0x01
457 #define NCF_ISDOTDOT 0x02
458 #define NCF_TS 0x04
459 #define NCF_DTS 0x08
460 #define NCF_DVDROP 0x10
461 #define NCF_NEGATIVE 0x20
462 #define NCF_INVALID 0x40
463 #define NCF_WIP 0x80
464
465 /*
466 * Flags in negstate.neg_flag
467 */
468 #define NEG_HOT 0x01
469
470 static bool cache_neg_evict_cond(u_long lnumcache);
471
472 /*
473 * Mark an entry as invalid.
474 *
475 * This is called before it starts getting deconstructed.
476 */
477 static void
478 cache_ncp_invalidate(struct namecache *ncp)
479 {
480
481 KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
482 ("%s: entry %p already invalid", __func__, ncp));
483 atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
484 atomic_thread_fence_rel();
485 }
486
487 /*
488 * Does this entry match the given directory and name?
489 */
490 static bool
491 cache_ncp_match(struct namecache *ncp, struct vnode *dvp,
492 struct componentname *cnp)
493 {
494 return (ncp->nc_dvp == dvp &&
495 ncp->nc_nlen == cnp->cn_namelen &&
496 bcmp(ncp->nc_name, cnp->cn_nameptr, cnp->cn_namelen) == 0);
497 }
498
499 /*
500 * Check whether the entry can be safely used.
501 *
502 * All places which elide locks are supposed to call this after they are
503 * done with reading from an entry.
504 */
505 #define cache_ncp_canuse(ncp) ({ \
506 struct namecache *_ncp = (ncp); \
507 u_char _nc_flag; \
508 \
509 atomic_thread_fence_acq(); \
510 _nc_flag = atomic_load_char(&_ncp->nc_flag); \
511 __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP)) == 0); \
512 })
513
514 /*
515 * Like the above but also checks NCF_WHITE.
516 */
517 #define cache_fpl_neg_ncp_canuse(ncp) ({ \
518 struct namecache *_ncp = (ncp); \
519 u_char _nc_flag; \
520 \
521 atomic_thread_fence_acq(); \
522 _nc_flag = atomic_load_char(&_ncp->nc_flag); \
523 __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP | NCF_WHITE)) == 0); \
524 })
525
526 VFS_SMR_DECLARE;
527
528 static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
529 "Name cache parameters");
530
531 static u_int __read_mostly ncsize; /* the size as computed on creation or resizing */
532 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RD, &ncsize, 0,
533 "Total namecache capacity");
534
535 u_int ncsizefactor = 2;
536 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0,
537 "Size factor for namecache");
538
539 static u_long __read_mostly ncnegfactor = 5; /* ratio of negative entries */
540 SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0,
541 "Ratio of negative namecache entries");
542
543 /*
544 * Negative entry % of namecache capacity above which automatic eviction is allowed.
545 *
546 * Check cache_neg_evict_cond for details.
547 */
548 static u_int ncnegminpct = 3;
549
550 static u_int __read_mostly neg_min; /* the above recomputed against ncsize */
551 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0,
552 "Negative entry count above which automatic eviction is allowed");
553
554 /*
555 * Structures associated with name caching.
556 */
557 #define NCHHASH(hash) \
558 (&nchashtbl[(hash) & nchash])
559 static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
560 static u_long __read_mostly nchash; /* size of hash table */
561 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
562 "Size of namecache hash table");
563 static u_long __exclusive_cache_line numneg; /* number of negative entries allocated */
564 static u_long __exclusive_cache_line numcache;/* number of cache entries allocated */
565
566 struct nchstats nchstats; /* cache effectiveness statistics */
567
568 static u_int __exclusive_cache_line neg_cycle;
569
570 #define ncneghash 3
571 #define numneglists (ncneghash + 1)
572
573 struct neglist {
574 struct mtx nl_evict_lock;
575 struct mtx nl_lock __aligned(CACHE_LINE_SIZE);
576 TAILQ_HEAD(, namecache) nl_list;
577 TAILQ_HEAD(, namecache) nl_hotlist;
578 u_long nl_hotnum;
579 } __aligned(CACHE_LINE_SIZE);
580
581 static struct neglist neglists[numneglists];
582
583 static inline struct neglist *
584 NCP2NEGLIST(struct namecache *ncp)
585 {
586
587 return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
588 }
589
590 static inline struct negstate *
591 NCP2NEGSTATE(struct namecache *ncp)
592 {
593
594 MPASS(atomic_load_char(&ncp->nc_flag) & NCF_NEGATIVE);
595 return (&ncp->nc_neg);
596 }
597
598 #define numbucketlocks (ncbuckethash + 1)
599 static u_int __read_mostly ncbuckethash;
600 static struct mtx_padalign __read_mostly *bucketlocks;
601 #define HASH2BUCKETLOCK(hash) \
602 ((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)]))
603
604 #define numvnodelocks (ncvnodehash + 1)
605 static u_int __read_mostly ncvnodehash;
606 static struct mtx __read_mostly *vnodelocks;
607 static inline struct mtx *
608 VP2VNODELOCK(struct vnode *vp)
609 {
610
611 return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
612 }
613
614 /*
615 * Search the hash table for a namecache entry. Either the corresponding bucket
616 * must be locked, or the caller must be in an SMR read section.
617 */
618 static struct namecache *
619 cache_ncp_find(struct vnode *dvp, struct componentname *cnp, uint32_t hash)
620 {
621 struct namecache *ncp;
622
623 KASSERT(mtx_owned(HASH2BUCKETLOCK(hash)) || VFS_SMR_ENTERED(),
624 ("%s: hash %u not locked", __func__, hash));
625 CK_SLIST_FOREACH(ncp, NCHHASH(hash), nc_hash) {
626 if (cache_ncp_match(ncp, dvp, cnp))
627 break;
628 }
629 return (ncp);
630 }
631
632 static void
633 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
634 {
635 struct namecache_ts *ncp_ts;
636
637 KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
638 (tsp == NULL && ticksp == NULL),
639 ("No NCF_TS"));
640
641 if (tsp == NULL)
642 return;
643
644 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
645 *tsp = ncp_ts->nc_time;
646 *ticksp = ncp_ts->nc_ticks;
647 }
648
649 #ifdef DEBUG_CACHE
650 static int __read_mostly doingcache = 1; /* 1 => enable the cache */
651 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
652 "VFS namecache enabled");
653 #endif
654
655 /* Export size information to userland */
656 SYSCTL_SIZEOF_STRUCT(namecache);
657
658 /*
659 * The new name cache statistics
660 */
661 static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
662 "Name cache statistics");
663
664 #define STATNODE_ULONG(name, varname, descr) \
665 SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
666 #define STATNODE_COUNTER(name, varname, descr) \
667 static COUNTER_U64_DEFINE_EARLY(varname); \
668 SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \
669 descr);
670 STATNODE_ULONG(neg, numneg, "Number of negative cache entries");
671 STATNODE_ULONG(count, numcache, "Number of cache entries");
672 STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes held");
673 STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit");
674 STATNODE_COUNTER(miss, nummiss, "Number of cache misses");
675 STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache");
676 STATNODE_COUNTER(poszaps, numposzaps,
677 "Number of cache hits (positive) we do not want to cache");
678 STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)");
679 STATNODE_COUNTER(negzaps, numnegzaps,
680 "Number of cache hits (negative) we do not want to cache");
681 STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)");
682 /* These count for vn_getcwd(), too. */
683 STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls");
684 STATNODE_COUNTER(fullpathfail2, numfullpathfail2,
685 "Number of fullpath search errors (VOP_VPTOCNP failures)");
686 STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
687 STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls");
688 STATNODE_COUNTER(symlinktoobig, symlinktoobig, "Number of times symlink did not fit the cache");
689
690 /*
691 * Debug or developer statistics.
692 */
693 static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
694 "Name cache debugging");
695 #define DEBUGNODE_ULONG(name, varname, descr) \
696 SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
697 static u_long zap_bucket_relock_success;
698 DEBUGNODE_ULONG(zap_bucket_relock_success, zap_bucket_relock_success,
699 "Number of successful removals after relocking");
700 static u_long zap_bucket_fail;
701 DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, "");
702 static u_long zap_bucket_fail2;
703 DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, "");
704 static u_long cache_lock_vnodes_cel_3_failures;
705 DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures,
706 "Number of times 3-way vnode locking failed");
707
708 static void cache_zap_locked(struct namecache *ncp);
709 static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
710 char **retbuf, size_t *buflen, size_t addend);
711 static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf,
712 char **retbuf, size_t *buflen);
713 static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf,
714 char **retbuf, size_t *len, size_t addend);
715
716 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
717
718 static inline void
719 cache_assert_vlp_locked(struct mtx *vlp)
720 {
721
722 if (vlp != NULL)
723 mtx_assert(vlp, MA_OWNED);
724 }
725
726 static inline void
727 cache_assert_vnode_locked(struct vnode *vp)
728 {
729 struct mtx *vlp;
730
731 vlp = VP2VNODELOCK(vp);
732 cache_assert_vlp_locked(vlp);
733 }
734
735 /*
736 * Directory vnodes with entries are held for two reasons:
737 * 1. make them less of a target for reclamation in vnlru
738 * 2. suffer a smaller performance penalty in locked lookup as requeueing is avoided
739 *
740 * It will be feasible to stop doing it altogether if all filesystems start
741 * supporting lockless lookup.
742 */
743 static void
744 cache_hold_vnode(struct vnode *vp)
745 {
746
747 cache_assert_vnode_locked(vp);
748 VNPASS(LIST_EMPTY(&vp->v_cache_src), vp);
749 vhold(vp);
750 counter_u64_add(numcachehv, 1);
751 }
752
753 static void
754 cache_drop_vnode(struct vnode *vp)
755 {
756
757 /*
758 * Called after all locks are dropped, meaning we can't assert
759 * on the state of v_cache_src.
760 */
761 vdrop(vp);
762 counter_u64_add(numcachehv, -1);
763 }
764
765 /*
766 * UMA zones.
767 */
768 static uma_zone_t __read_mostly cache_zone_small;
769 static uma_zone_t __read_mostly cache_zone_small_ts;
770 static uma_zone_t __read_mostly cache_zone_large;
771 static uma_zone_t __read_mostly cache_zone_large_ts;
772
773 char *
774 cache_symlink_alloc(size_t size, int flags)
775 {
776
777 if (size < CACHE_ZONE_SMALL_SIZE) {
778 return (uma_zalloc_smr(cache_zone_small, flags));
779 }
780 if (size < CACHE_ZONE_LARGE_SIZE) {
781 return (uma_zalloc_smr(cache_zone_large, flags));
782 }
783 counter_u64_add(symlinktoobig, 1);
784 SDT_PROBE1(vfs, namecache, symlink, alloc__fail, size);
785 return (NULL);
786 }
787
788 void
789 cache_symlink_free(char *string, size_t size)
790 {
791
792 MPASS(string != NULL);
793 KASSERT(size < CACHE_ZONE_LARGE_SIZE,
794 ("%s: size %zu too big", __func__, size));
795
796 if (size < CACHE_ZONE_SMALL_SIZE) {
797 uma_zfree_smr(cache_zone_small, string);
798 return;
799 }
800 if (size < CACHE_ZONE_LARGE_SIZE) {
801 uma_zfree_smr(cache_zone_large, string);
802 return;
803 }
804 __assert_unreachable();
805 }
806
807 static struct namecache *
808 cache_alloc_uma(int len, bool ts)
809 {
810 struct namecache_ts *ncp_ts;
811 struct namecache *ncp;
812
813 if (__predict_false(ts)) {
814 if (len <= CACHE_PATH_CUTOFF)
815 ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
816 else
817 ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
818 ncp = &ncp_ts->nc_nc;
819 } else {
820 if (len <= CACHE_PATH_CUTOFF)
821 ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
822 else
823 ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
824 }
825 return (ncp);
826 }
827
828 static void
829 cache_free_uma(struct namecache *ncp)
830 {
831 struct namecache_ts *ncp_ts;
832
833 if (__predict_false(ncp->nc_flag & NCF_TS)) {
834 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
835 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
836 uma_zfree_smr(cache_zone_small_ts, ncp_ts);
837 else
838 uma_zfree_smr(cache_zone_large_ts, ncp_ts);
839 } else {
840 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
841 uma_zfree_smr(cache_zone_small, ncp);
842 else
843 uma_zfree_smr(cache_zone_large, ncp);
844 }
845 }
846
847 static struct namecache *
848 cache_alloc(int len, bool ts)
849 {
850 u_long lnumcache;
851
852 /*
853 * Avoid blowout in namecache entries.
854 *
855 * Bugs:
856 * 1. filesystems may end up trying to add an already existing entry
857 * (for example this can happen after a cache miss during concurrent
858 * lookup), in which case we will call cache_neg_evict despite not
859 * adding anything.
860 * 2. the routine may fail to free anything and no provisions are made
861 * to make it try harder (see the inside for failure modes)
862 * 3. it only ever looks at negative entries.
863 */
864 lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
865 if (cache_neg_evict_cond(lnumcache)) {
866 lnumcache = atomic_load_long(&numcache);
867 }
868 if (__predict_false(lnumcache >= ncsize)) {
869 atomic_subtract_long(&numcache, 1);
870 counter_u64_add(numdrops, 1);
871 return (NULL);
872 }
873 return (cache_alloc_uma(len, ts));
874 }
875
876 static void
877 cache_free(struct namecache *ncp)
878 {
879
880 MPASS(ncp != NULL);
881 if ((ncp->nc_flag & NCF_DVDROP) != 0) {
882 cache_drop_vnode(ncp->nc_dvp);
883 }
884 cache_free_uma(ncp);
885 atomic_subtract_long(&numcache, 1);
886 }
887
888 static void
889 cache_free_batch(struct cache_freebatch *batch)
890 {
891 struct namecache *ncp, *nnp;
892 int i;
893
894 i = 0;
895 if (TAILQ_EMPTY(batch))
896 goto out;
897 TAILQ_FOREACH_SAFE(ncp, batch, nc_dst, nnp) {
898 if ((ncp->nc_flag & NCF_DVDROP) != 0) {
899 cache_drop_vnode(ncp->nc_dvp);
900 }
901 cache_free_uma(ncp);
902 i++;
903 }
904 atomic_subtract_long(&numcache, i);
905 out:
906 SDT_PROBE1(vfs, namecache, purge, batch, i);
907 }
908
909 /*
910 * Hashing.
911 *
912 * The code was made to use FNV in 2001 and this choice needs to be revisited.
913 *
914 * Short summary of the difficulty:
915 * The longest name which can be inserted is NAME_MAX characters in length (or
916 * 255 at the time of writing this comment), while the majority of names used in
917 * practice are significantly shorter (mostly below 10). More importantly the
918 * majority of lookups performed find names even shorter than that.
919 *
920 * This poses a problem where hashes which do better than FNV past word size
921 * (or so) tend to come with additional overhead when finalizing the result,
922 * making them noticeably slower for the most commonly used range.
923 *
924 * Consider a path like: /usr/obj/usr/src/sys/amd64/GENERIC/vnode_if.c
925 *
926 * When looking it up the most time consuming part by a large margin (at least
927 * on amd64) is hashing. Replacing FNV with something which pessimizes short
928 * input would make the slowest part stand out even more.
929 */
930
931 /*
932 * TODO: With the value stored we can do better than computing the hash based
933 * on the address.
934 */
935 static void
936 cache_prehash(struct vnode *vp)
937 {
938
939 vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
940 }
941
942 static uint32_t
943 cache_get_hash(char *name, u_char len, struct vnode *dvp)
944 {
945
946 return (fnv_32_buf(name, len, dvp->v_nchash));
947 }
948
949 static uint32_t
950 cache_get_hash_iter_start(struct vnode *dvp)
951 {
952
953 return (dvp->v_nchash);
954 }
955
956 static uint32_t
957 cache_get_hash_iter(char c, uint32_t hash)
958 {
959
960 return (fnv_32_buf(&c, 1, hash));
961 }
962
963 static uint32_t
964 cache_get_hash_iter_finish(uint32_t hash)
965 {
966
967 return (hash);
968 }
969
970 static inline struct nchashhead *
971 NCP2BUCKET(struct namecache *ncp)
972 {
973 uint32_t hash;
974
975 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
976 return (NCHHASH(hash));
977 }
978
979 static inline struct mtx *
980 NCP2BUCKETLOCK(struct namecache *ncp)
981 {
982 uint32_t hash;
983
984 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
985 return (HASH2BUCKETLOCK(hash));
986 }
987
988 #ifdef INVARIANTS
989 static void
990 cache_assert_bucket_locked(struct namecache *ncp)
991 {
992 struct mtx *blp;
993
994 blp = NCP2BUCKETLOCK(ncp);
995 mtx_assert(blp, MA_OWNED);
996 }
997
998 static void
999 cache_assert_bucket_unlocked(struct namecache *ncp)
1000 {
1001 struct mtx *blp;
1002
1003 blp = NCP2BUCKETLOCK(ncp);
1004 mtx_assert(blp, MA_NOTOWNED);
1005 }
1006 #else
1007 #define cache_assert_bucket_locked(x) do { } while (0)
1008 #define cache_assert_bucket_unlocked(x) do { } while (0)
1009 #endif
1010
1011 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y))
1012 static void
1013 _cache_sort_vnodes(void **p1, void **p2)
1014 {
1015 void *tmp;
1016
1017 MPASS(*p1 != NULL || *p2 != NULL);
1018
1019 if (*p1 > *p2) {
1020 tmp = *p2;
1021 *p2 = *p1;
1022 *p1 = tmp;
1023 }
1024 }
1025
1026 static void
1027 cache_lock_all_buckets(void)
1028 {
1029 u_int i;
1030
1031 for (i = 0; i < numbucketlocks; i++)
1032 mtx_lock(&bucketlocks[i]);
1033 }
1034
1035 static void
1036 cache_unlock_all_buckets(void)
1037 {
1038 u_int i;
1039
1040 for (i = 0; i < numbucketlocks; i++)
1041 mtx_unlock(&bucketlocks[i]);
1042 }
1043
1044 static void
1045 cache_lock_all_vnodes(void)
1046 {
1047 u_int i;
1048
1049 for (i = 0; i < numvnodelocks; i++)
1050 mtx_lock(&vnodelocks[i]);
1051 }
1052
1053 static void
1054 cache_unlock_all_vnodes(void)
1055 {
1056 u_int i;
1057
1058 for (i = 0; i < numvnodelocks; i++)
1059 mtx_unlock(&vnodelocks[i]);
1060 }
1061
1062 static int
1063 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
1064 {
1065
1066 cache_sort_vnodes(&vlp1, &vlp2);
1067
1068 if (vlp1 != NULL) {
1069 if (!mtx_trylock(vlp1))
1070 return (EAGAIN);
1071 }
1072 if (!mtx_trylock(vlp2)) {
1073 if (vlp1 != NULL)
1074 mtx_unlock(vlp1);
1075 return (EAGAIN);
1076 }
1077
1078 return (0);
1079 }
1080
1081 static void
1082 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
1083 {
1084
1085 MPASS(vlp1 != NULL || vlp2 != NULL);
1086 MPASS(vlp1 <= vlp2);
1087
1088 if (vlp1 != NULL)
1089 mtx_lock(vlp1);
1090 if (vlp2 != NULL)
1091 mtx_lock(vlp2);
1092 }
1093
1094 static void
1095 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
1096 {
1097
1098 MPASS(vlp1 != NULL || vlp2 != NULL);
1099
1100 if (vlp1 != NULL)
1101 mtx_unlock(vlp1);
1102 if (vlp2 != NULL)
1103 mtx_unlock(vlp2);
1104 }
1105
1106 static int
1107 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
1108 {
1109 struct nchstats snap;
1110
1111 if (req->oldptr == NULL)
1112 return (SYSCTL_OUT(req, 0, sizeof(snap)));
1113
1114 snap = nchstats;
1115 snap.ncs_goodhits = counter_u64_fetch(numposhits);
1116 snap.ncs_neghits = counter_u64_fetch(numneghits);
1117 snap.ncs_badhits = counter_u64_fetch(numposzaps) +
1118 counter_u64_fetch(numnegzaps);
1119 snap.ncs_miss = counter_u64_fetch(nummisszap) +
1120 counter_u64_fetch(nummiss);
1121
1122 return (SYSCTL_OUT(req, &snap, sizeof(snap)));
1123 }
1124 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
1125 CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
1126 "VFS cache effectiveness statistics");
1127
1128 static int
1129 sysctl_hitpct(SYSCTL_HANDLER_ARGS)
1130 {
1131 long poshits, neghits, miss, total;
1132 long pct;
1133
1134 poshits = counter_u64_fetch(numposhits);
1135 neghits = counter_u64_fetch(numneghits);
1136 miss = counter_u64_fetch(nummiss);
1137 total = poshits + neghits + miss;
1138
1139 pct = 0;
1140 if (total != 0)
1141 pct = ((poshits + neghits) * 100) / total;
1142 return (sysctl_handle_int(oidp, 0, pct, req));
1143 }
1144 SYSCTL_PROC(_vfs_cache_stats, OID_AUTO, hitpct,
1145 CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_hitpct,
1146 "I", "Percentage of hits");
1147
1148 static void
1149 cache_recalc_neg_min(void)
1150 {
1151
1152 neg_min = (ncsize * ncnegminpct) / 100;
1153 }
1154
1155 static int
1156 sysctl_negminpct(SYSCTL_HANDLER_ARGS)
1157 {
1158 u_int val;
1159 int error;
1160
1161 val = ncnegminpct;
1162 error = sysctl_handle_int(oidp, &val, 0, req);
1163 if (error != 0 || req->newptr == NULL)
1164 return (error);
1165
1166 if (val == ncnegminpct)
1167 return (0);
1168 if (val < 0 || val > 99)
1169 return (EINVAL);
1170 ncnegminpct = val;
1171 cache_recalc_neg_min();
1172 return (0);
1173 }
1174
1175 SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct,
1176 CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct,
1177 "I", "Negative entry \% of namecache capacity above which automatic eviction is allowed");
1178
1179 #ifdef DEBUG_CACHE
1180 /*
1181 * Grab an atomic snapshot of the name cache hash chain lengths
1182 */
1183 static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
1184 CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
1185 "hash table stats");
1186
1187 static int
1188 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
1189 {
1190 struct nchashhead *ncpp;
1191 struct namecache *ncp;
1192 int i, error, n_nchash, *cntbuf;
1193
1194 retry:
1195 n_nchash = nchash + 1; /* nchash is max index, not count */
1196 if (req->oldptr == NULL)
1197 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
1198 cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
1199 cache_lock_all_buckets();
1200 if (n_nchash != nchash + 1) {
1201 cache_unlock_all_buckets();
1202 free(cntbuf, M_TEMP);
1203 goto retry;
1204 }
1205 /* Scan hash tables counting entries */
1206 for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
1207 CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
1208 cntbuf[i]++;
1209 cache_unlock_all_buckets();
1210 for (error = 0, i = 0; i < n_nchash; i++)
1211 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
1212 break;
1213 free(cntbuf, M_TEMP);
1214 return (error);
1215 }
1216 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
1217 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
1218 "nchash chain lengths");
1219
1220 static int
1221 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
1222 {
1223 int error;
1224 struct nchashhead *ncpp;
1225 struct namecache *ncp;
1226 int n_nchash;
1227 int count, maxlength, used, pct;
1228
1229 if (!req->oldptr)
1230 return SYSCTL_OUT(req, 0, 4 * sizeof(int));
1231
1232 cache_lock_all_buckets();
1233 n_nchash = nchash + 1; /* nchash is max index, not count */
1234 used = 0;
1235 maxlength = 0;
1236
1237 /* Scan hash tables for applicable entries */
1238 for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
1239 count = 0;
1240 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
1241 count++;
1242 }
1243 if (count)
1244 used++;
1245 if (maxlength < count)
1246 maxlength = count;
1247 }
1248 n_nchash = nchash + 1;
1249 cache_unlock_all_buckets();
1250 pct = (used * 100) / (n_nchash / 100);
1251 error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
1252 if (error)
1253 return (error);
1254 error = SYSCTL_OUT(req, &used, sizeof(used));
1255 if (error)
1256 return (error);
1257 error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
1258 if (error)
1259 return (error);
1260 error = SYSCTL_OUT(req, &pct, sizeof(pct));
1261 if (error)
1262 return (error);
1263 return (0);
1264 }
1265 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
1266 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
1267 "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
1268 #endif
1269
1270 /*
1271 * Negative entries management
1272 *
1273 * Various workloads create plenty of negative entries and barely use them
1274 * afterwards. Moreover malicious users can keep performing bogus lookups
1275 * adding even more entries. For example "make tinderbox" as of writing this
1276 * comment ends up with 2.6M namecache entries in total, 1.2M of which are
1277 * negative.
1278 *
1279 * As such, a rather aggressive eviction method is needed. The currently
1280 * employed method is a placeholder.
1281 *
1282 * Entries are split over numneglists separate lists, each of which is further
1283 * split into hot and cold entries. Entries get promoted after getting a hit.
1284 * Eviction happens on addition of new entry.
1285 */
1286 static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1287 "Name cache negative entry statistics");
1288
1289 SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0,
1290 "Number of negative cache entries");
1291
1292 static COUNTER_U64_DEFINE_EARLY(neg_created);
1293 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created,
1294 "Number of created negative entries");
1295
1296 static COUNTER_U64_DEFINE_EARLY(neg_evicted);
1297 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted,
1298 "Number of evicted negative entries");
1299
1300 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty);
1301 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD,
1302 &neg_evict_skipped_empty,
1303 "Number of times evicting failed due to lack of entries");
1304
1305 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed);
1306 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD,
1307 &neg_evict_skipped_missed,
1308 "Number of times evicting failed due to target entry disappearing");
1309
1310 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended);
1311 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD,
1312 &neg_evict_skipped_contended,
1313 "Number of times evicting failed due to contention");
1314
1315 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits,
1316 "Number of cache hits (negative)");
1317
1318 static int
1319 sysctl_neg_hot(SYSCTL_HANDLER_ARGS)
1320 {
1321 int i, out;
1322
1323 out = 0;
1324 for (i = 0; i < numneglists; i++)
1325 out += neglists[i].nl_hotnum;
1326
1327 return (SYSCTL_OUT(req, &out, sizeof(out)));
1328 }
1329 SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD |
1330 CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I",
1331 "Number of hot negative entries");
1332
1333 static void
1334 cache_neg_init(struct namecache *ncp)
1335 {
1336 struct negstate *ns;
1337
1338 ncp->nc_flag |= NCF_NEGATIVE;
1339 ns = NCP2NEGSTATE(ncp);
1340 ns->neg_flag = 0;
1341 ns->neg_hit = 0;
1342 counter_u64_add(neg_created, 1);
1343 }
1344
1345 #define CACHE_NEG_PROMOTION_THRESH 2
1346
1347 static bool
1348 cache_neg_hit_prep(struct namecache *ncp)
1349 {
1350 struct negstate *ns;
1351 u_char n;
1352
1353 ns = NCP2NEGSTATE(ncp);
1354 n = atomic_load_char(&ns->neg_hit);
1355 for (;;) {
1356 if (n >= CACHE_NEG_PROMOTION_THRESH)
1357 return (false);
1358 if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1))
1359 break;
1360 }
1361 return (n + 1 == CACHE_NEG_PROMOTION_THRESH);
1362 }
1363
1364 /*
1365 * Nothing to do here but it is provided for completeness as some
1366 * cache_neg_hit_prep callers may end up returning without even
1367 * trying to promote.
1368 */
1369 #define cache_neg_hit_abort(ncp) do { } while (0)
1370
1371 static void
1372 cache_neg_hit_finish(struct namecache *ncp)
1373 {
1374
1375 SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name);
1376 counter_u64_add(numneghits, 1);
1377 }
1378
1379 /*
1380 * Move a negative entry to the hot list.
1381 */
1382 static void
1383 cache_neg_promote_locked(struct namecache *ncp)
1384 {
1385 struct neglist *nl;
1386 struct negstate *ns;
1387
1388 ns = NCP2NEGSTATE(ncp);
1389 nl = NCP2NEGLIST(ncp);
1390 mtx_assert(&nl->nl_lock, MA_OWNED);
1391 if ((ns->neg_flag & NEG_HOT) == 0) {
1392 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
1393 TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst);
1394 nl->nl_hotnum++;
1395 ns->neg_flag |= NEG_HOT;
1396 }
1397 }
1398
1399 /*
1400 * Move a hot negative entry to the cold list.
1401 */
1402 static void
1403 cache_neg_demote_locked(struct namecache *ncp)
1404 {
1405 struct neglist *nl;
1406 struct negstate *ns;
1407
1408 ns = NCP2NEGSTATE(ncp);
1409 nl = NCP2NEGLIST(ncp);
1410 mtx_assert(&nl->nl_lock, MA_OWNED);
1411 MPASS(ns->neg_flag & NEG_HOT);
1412 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
1413 TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
1414 nl->nl_hotnum--;
1415 ns->neg_flag &= ~NEG_HOT;
1416 atomic_store_char(&ns->neg_hit, 0);
1417 }
1418
1419 /*
1420 * Move a negative entry to the hot list if it matches the lookup.
1421 *
1422 * We have to take locks, but they may be contended and in the worst
1423 * case we may need to go off CPU. We don't want to spin within the
1424 * smr section and we can't block with it. Exiting the section means
1425 * the found entry could have been evicted. We are going to look it
1426 * up again.
1427 */
1428 static bool
1429 cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp,
1430 struct namecache *oncp, uint32_t hash)
1431 {
1432 struct namecache *ncp;
1433 struct neglist *nl;
1434 u_char nc_flag;
1435
1436 nl = NCP2NEGLIST(oncp);
1437
1438 mtx_lock(&nl->nl_lock);
1439 /*
1440 * For hash iteration.
1441 */
1442 vfs_smr_enter();
1443
1444 /*
1445 * Avoid all surprises by only succeeding if we got the same entry and
1446 * bailing completely otherwise.
1447 * XXX There are no provisions to keep the vnode around, meaning we may
1448 * end up promoting a negative entry for a *new* vnode and returning
1449 * ENOENT on its account. This is the error we want to return anyway
1450 * and promotion is harmless.
1451 *
1452 * In particular at this point there can be a new ncp which matches the
1453 * search but hashes to a different neglist.
1454 */
1455 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1456 if (ncp == oncp)
1457 break;
1458 }
1459
1460 /*
1461 * No match to begin with.
1462 */
1463 if (__predict_false(ncp == NULL)) {
1464 goto out_abort;
1465 }
1466
1467 /*
1468 * The newly found entry may be something different...
1469 */
1470 if (!cache_ncp_match(ncp, dvp, cnp)) {
1471 goto out_abort;
1472 }
1473
1474 /*
1475 * ... and not even negative.
1476 */
1477 nc_flag = atomic_load_char(&ncp->nc_flag);
1478 if ((nc_flag & NCF_NEGATIVE) == 0) {
1479 goto out_abort;
1480 }
1481
1482 if (!cache_ncp_canuse(ncp)) {
1483 goto out_abort;
1484 }
1485
1486 cache_neg_promote_locked(ncp);
1487 cache_neg_hit_finish(ncp);
1488 vfs_smr_exit();
1489 mtx_unlock(&nl->nl_lock);
1490 return (true);
1491 out_abort:
1492 vfs_smr_exit();
1493 mtx_unlock(&nl->nl_lock);
1494 return (false);
1495 }
1496
1497 static void
1498 cache_neg_promote(struct namecache *ncp)
1499 {
1500 struct neglist *nl;
1501
1502 nl = NCP2NEGLIST(ncp);
1503 mtx_lock(&nl->nl_lock);
1504 cache_neg_promote_locked(ncp);
1505 mtx_unlock(&nl->nl_lock);
1506 }
1507
1508 static void
1509 cache_neg_insert(struct namecache *ncp)
1510 {
1511 struct neglist *nl;
1512
1513 MPASS(ncp->nc_flag & NCF_NEGATIVE);
1514 cache_assert_bucket_locked(ncp);
1515 nl = NCP2NEGLIST(ncp);
1516 mtx_lock(&nl->nl_lock);
1517 TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
1518 mtx_unlock(&nl->nl_lock);
1519 atomic_add_long(&numneg, 1);
1520 }
1521
1522 static void
1523 cache_neg_remove(struct namecache *ncp)
1524 {
1525 struct neglist *nl;
1526 struct negstate *ns;
1527
1528 cache_assert_bucket_locked(ncp);
1529 nl = NCP2NEGLIST(ncp);
1530 ns = NCP2NEGSTATE(ncp);
1531 mtx_lock(&nl->nl_lock);
1532 if ((ns->neg_flag & NEG_HOT) != 0) {
1533 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
1534 nl->nl_hotnum--;
1535 } else {
1536 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
1537 }
1538 mtx_unlock(&nl->nl_lock);
1539 atomic_subtract_long(&numneg, 1);
1540 }
1541
1542 static struct neglist *
1543 cache_neg_evict_select_list(void)
1544 {
1545 struct neglist *nl;
1546 u_int c;
1547
1548 c = atomic_fetchadd_int(&neg_cycle, 1) + 1;
1549 nl = &neglists[c % numneglists];
1550 if (!mtx_trylock(&nl->nl_evict_lock)) {
1551 counter_u64_add(neg_evict_skipped_contended, 1);
1552 return (NULL);
1553 }
1554 return (nl);
1555 }
1556
1557 static struct namecache *
1558 cache_neg_evict_select_entry(struct neglist *nl)
1559 {
1560 struct namecache *ncp, *lncp;
1561 struct negstate *ns, *lns;
1562 int i;
1563
1564 mtx_assert(&nl->nl_evict_lock, MA_OWNED);
1565 mtx_assert(&nl->nl_lock, MA_OWNED);
1566 ncp = TAILQ_FIRST(&nl->nl_list);
1567 if (ncp == NULL)
1568 return (NULL);
1569 lncp = ncp;
1570 lns = NCP2NEGSTATE(lncp);
1571 for (i = 1; i < 4; i++) {
1572 ncp = TAILQ_NEXT(ncp, nc_dst);
1573 if (ncp == NULL)
1574 break;
1575 ns = NCP2NEGSTATE(ncp);
1576 if (ns->neg_hit < lns->neg_hit) {
1577 lncp = ncp;
1578 lns = ns;
1579 }
1580 }
1581 return (lncp);
1582 }
1583
1584 static bool
1585 cache_neg_evict(void)
1586 {
1587 struct namecache *ncp, *ncp2;
1588 struct neglist *nl;
1589 struct vnode *dvp;
1590 struct mtx *dvlp;
1591 struct mtx *blp;
1592 uint32_t hash;
1593 u_char nlen;
1594 bool evicted;
1595
1596 nl = cache_neg_evict_select_list();
1597 if (nl == NULL) {
1598 return (false);
1599 }
1600
1601 mtx_lock(&nl->nl_lock);
1602 ncp = TAILQ_FIRST(&nl->nl_hotlist);
1603 if (ncp != NULL) {
1604 cache_neg_demote_locked(ncp);
1605 }
1606 ncp = cache_neg_evict_select_entry(nl);
1607 if (ncp == NULL) {
1608 counter_u64_add(neg_evict_skipped_empty, 1);
1609 mtx_unlock(&nl->nl_lock);
1610 mtx_unlock(&nl->nl_evict_lock);
1611 return (false);
1612 }
1613 nlen = ncp->nc_nlen;
1614 dvp = ncp->nc_dvp;
1615 hash = cache_get_hash(ncp->nc_name, nlen, dvp);
1616 dvlp = VP2VNODELOCK(dvp);
1617 blp = HASH2BUCKETLOCK(hash);
1618 mtx_unlock(&nl->nl_lock);
1619 mtx_unlock(&nl->nl_evict_lock);
1620 mtx_lock(dvlp);
1621 mtx_lock(blp);
1622 /*
1623 * Note that since all locks were dropped above, the entry may be
1624 * gone or reallocated to be something else.
1625 */
1626 CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) {
1627 if (ncp2 == ncp && ncp2->nc_dvp == dvp &&
1628 ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0)
1629 break;
1630 }
1631 if (ncp2 == NULL) {
1632 counter_u64_add(neg_evict_skipped_missed, 1);
1633 ncp = NULL;
1634 evicted = false;
1635 } else {
1636 MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp));
1637 MPASS(blp == NCP2BUCKETLOCK(ncp));
1638 SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp,
1639 ncp->nc_name);
1640 cache_zap_locked(ncp);
1641 counter_u64_add(neg_evicted, 1);
1642 evicted = true;
1643 }
1644 mtx_unlock(blp);
1645 mtx_unlock(dvlp);
1646 if (ncp != NULL)
1647 cache_free(ncp);
1648 return (evicted);
1649 }
1650
1651 /*
1652 * Maybe evict a negative entry to create more room.
1653 *
1654 * The ncnegfactor parameter limits what fraction of the total entry
1655 * count negative entries may comprise. However, if the cache is just
1656 * warming up this leads to excessive evictions. As such, ncnegminpct
1657 * (recomputed to neg_min) dictates whether the limit should be
1658 * applied at all.
1659 *
1660 * Try evicting if the cache is close to full capacity regardless of
1661 * other considerations.
1662 */
1663 static bool
1664 cache_neg_evict_cond(u_long lnumcache)
1665 {
1666 u_long lnumneg;
1667
1668 if (ncsize - 1000 < lnumcache)
1669 goto out_evict;
1670 lnumneg = atomic_load_long(&numneg);
1671 if (lnumneg < neg_min)
1672 return (false);
1673 if (lnumneg * ncnegfactor < lnumcache)
1674 return (false);
1675 out_evict:
1676 return (cache_neg_evict());
1677 }
1678
1679 /*
1680 * cache_zap_locked():
1681 *
1682 * Removes a namecache entry from cache, whether it contains an actual
1683 * pointer to a vnode or if it is just a negative cache entry.
1684 */
1685 static void
1686 cache_zap_locked(struct namecache *ncp)
1687 {
1688 struct nchashhead *ncpp;
1689 struct vnode *dvp, *vp;
1690
1691 dvp = ncp->nc_dvp;
1692 vp = ncp->nc_vp;
1693
1694 if (!(ncp->nc_flag & NCF_NEGATIVE))
1695 cache_assert_vnode_locked(vp);
1696 cache_assert_vnode_locked(dvp);
1697 cache_assert_bucket_locked(ncp);
1698
1699 cache_ncp_invalidate(ncp);
1700
1701 ncpp = NCP2BUCKET(ncp);
1702 CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
1703 if (!(ncp->nc_flag & NCF_NEGATIVE)) {
1704 SDT_PROBE3(vfs, namecache, zap, done, dvp, ncp->nc_name, vp);
1705 TAILQ_REMOVE(&vp->v_cache_dst, ncp, nc_dst);
1706 if (ncp == vp->v_cache_dd) {
1707 atomic_store_ptr(&vp->v_cache_dd, NULL);
1708 }
1709 } else {
1710 SDT_PROBE2(vfs, namecache, zap_negative, done, dvp, ncp->nc_name);
1711 cache_neg_remove(ncp);
1712 }
1713 if (ncp->nc_flag & NCF_ISDOTDOT) {
1714 if (ncp == dvp->v_cache_dd) {
1715 atomic_store_ptr(&dvp->v_cache_dd, NULL);
1716 }
1717 } else {
1718 LIST_REMOVE(ncp, nc_src);
1719 if (LIST_EMPTY(&dvp->v_cache_src)) {
1720 ncp->nc_flag |= NCF_DVDROP;
1721 }
1722 }
1723 }
1724
1725 static void
1726 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
1727 {
1728 struct mtx *blp;
1729
1730 MPASS(ncp->nc_dvp == vp);
1731 MPASS(ncp->nc_flag & NCF_NEGATIVE);
1732 cache_assert_vnode_locked(vp);
1733
1734 blp = NCP2BUCKETLOCK(ncp);
1735 mtx_lock(blp);
1736 cache_zap_locked(ncp);
1737 mtx_unlock(blp);
1738 }
1739
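/*
 * Zap an entry reachable from the locked vnode vp, acquiring the remaining
 * vnode and bucket locks as dictated by the lock order.
 *
 * Returns true if the entry was zapped. Returns false if the locks had to
 * be dropped and reacquired, in which case *vlpp is updated and the caller
 * must re-examine its state and retry.
 */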
1740 static bool
1741 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
1742 struct mtx **vlpp)
1743 {
1744 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
1745 struct mtx *blp;
1746
1747 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
1748 cache_assert_vnode_locked(vp);
1749
1750 if (ncp->nc_flag & NCF_NEGATIVE) {
1751 if (*vlpp != NULL) {
1752 mtx_unlock(*vlpp);
1753 *vlpp = NULL;
1754 }
1755 cache_zap_negative_locked_vnode_kl(ncp, vp);
1756 return (true);
1757 }
1758
1759 pvlp = VP2VNODELOCK(vp);
1760 blp = NCP2BUCKETLOCK(ncp);
1761 vlp1 = VP2VNODELOCK(ncp->nc_dvp);
1762 vlp2 = VP2VNODELOCK(ncp->nc_vp);
1763
1764 if (*vlpp == vlp1 || *vlpp == vlp2) {
1765 to_unlock = *vlpp;
1766 *vlpp = NULL;
1767 } else {
1768 if (*vlpp != NULL) {
1769 mtx_unlock(*vlpp);
1770 *vlpp = NULL;
1771 }
1772 cache_sort_vnodes(&vlp1, &vlp2);
1773 if (vlp1 == pvlp) {
1774 mtx_lock(vlp2);
1775 to_unlock = vlp2;
1776 } else {
1777 if (!mtx_trylock(vlp1))
1778 goto out_relock;
1779 to_unlock = vlp1;
1780 }
1781 }
1782 mtx_lock(blp);
1783 cache_zap_locked(ncp);
1784 mtx_unlock(blp);
1785 if (to_unlock != NULL)
1786 mtx_unlock(to_unlock);
1787 return (true);
1788
1789 out_relock:
1790 mtx_unlock(vlp2);
1791 mtx_lock(vlp1);
1792 mtx_lock(vlp2);
1793 MPASS(*vlpp == NULL);
1794 *vlpp = vlp1;
1795 return (false);
1796 }
1797
1798 /*
1799 * If trylocking failed we can get here. We know enough to take all needed locks
1800 * in the right order and re-lookup the entry.
1801 */
1802 static int
1803 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1804 struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
1805 struct mtx *blp)
1806 {
1807 struct namecache *rncp;
1808 struct mtx *rvlp;
1809
1810 cache_assert_bucket_unlocked(ncp);
1811
1812 cache_sort_vnodes(&dvlp, &vlp);
1813 cache_lock_vnodes(dvlp, vlp);
1814 mtx_lock(blp);
1815 CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
1816 if (rncp == ncp && cache_ncp_match(rncp, dvp, cnp))
1817 break;
1818 }
1819 if (rncp == NULL)
1820 goto out_mismatch;
1821
1822 if (!(ncp->nc_flag & NCF_NEGATIVE))
1823 rvlp = VP2VNODELOCK(rncp->nc_vp);
1824 else
1825 rvlp = NULL;
1826 if (rvlp != vlp)
1827 goto out_mismatch;
1828
1829 cache_zap_locked(rncp);
1830 mtx_unlock(blp);
1831 cache_unlock_vnodes(dvlp, vlp);
1832 atomic_add_long(&zap_bucket_relock_success, 1);
1833 return (0);
1834
1835 out_mismatch:
1836 mtx_unlock(blp);
1837 cache_unlock_vnodes(dvlp, vlp);
1838 return (EAGAIN);
1839 }
1840
1841 static int __noinline
1842 cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp,
1843 uint32_t hash, struct mtx *blp)
1844 {
1845 struct mtx *dvlp, *vlp;
1846 struct vnode *dvp;
1847
1848 cache_assert_bucket_locked(ncp);
1849
1850 dvlp = VP2VNODELOCK(ncp->nc_dvp);
1851 vlp = NULL;
1852 if (!(ncp->nc_flag & NCF_NEGATIVE))
1853 vlp = VP2VNODELOCK(ncp->nc_vp);
1854 if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1855 cache_zap_locked(ncp);
1856 mtx_unlock(blp);
1857 cache_unlock_vnodes(dvlp, vlp);
1858 return (0);
1859 }
1860
1861 dvp = ncp->nc_dvp;
1862 mtx_unlock(blp);
1863 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1864 }
1865
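/*
 * Remove the entry matching [dvp, cnp], if any.
 *
 * ".." is handled via the directory's v_cache_dd pointer, everything else
 * through the hash table. Returns 1 if an entry was removed, 0 otherwise.
 */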
1866 static __noinline int
1867 cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
1868 {
1869 struct namecache *ncp;
1870 struct mtx *blp;
1871 struct mtx *dvlp, *dvlp2;
1872 uint32_t hash;
1873 int error;
1874
1875 if (cnp->cn_namelen == 2 &&
1876 cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
1877 dvlp = VP2VNODELOCK(dvp);
1878 dvlp2 = NULL;
1879 mtx_lock(dvlp);
1880 retry_dotdot:
1881 ncp = dvp->v_cache_dd;
1882 if (ncp == NULL) {
1883 mtx_unlock(dvlp);
1884 if (dvlp2 != NULL)
1885 mtx_unlock(dvlp2);
1886 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1887 return (0);
1888 }
1889 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1890 if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2))
1891 goto retry_dotdot;
1892 MPASS(dvp->v_cache_dd == NULL);
1893 mtx_unlock(dvlp);
1894 if (dvlp2 != NULL)
1895 mtx_unlock(dvlp2);
1896 cache_free(ncp);
1897 } else {
1898 atomic_store_ptr(&dvp->v_cache_dd, NULL);
1899 mtx_unlock(dvlp);
1900 if (dvlp2 != NULL)
1901 mtx_unlock(dvlp2);
1902 }
1903 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1904 return (1);
1905 }
1906
1907 /*
1908 * XXX note that access here is completely unlocked with no provisions
1909 * to keep the hash allocated. If one is sufficiently unlucky a
1910 * parallel cache resize can reallocate the hash, unmap backing pages
1911 * and cause the empty check below to fault.
1912 *
1913 * Fixing this has epsilon priority, but can be done with no overhead
1914 * for this codepath with sufficient effort.
1915 */
1916 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1917 blp = HASH2BUCKETLOCK(hash);
1918 retry:
1919 if (CK_SLIST_EMPTY(NCHHASH(hash)))
1920 goto out_no_entry;
1921
1922 mtx_lock(blp);
1923 ncp = cache_ncp_find(dvp, cnp, hash);
1924 if (ncp == NULL) {
1925 mtx_unlock(blp);
1926 goto out_no_entry;
1927 }
1928
1929 error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
1930 if (__predict_false(error != 0)) {
1931 atomic_add_long(&zap_bucket_fail, 1);
1932 goto retry;
1933 }
1934 counter_u64_add(numposzaps, 1);
1935 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1936 cache_free(ncp);
1937 return (1);
1938 out_no_entry:
1939 counter_u64_add(nummisszap, 1);
1940 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1941 return (0);
1942 }
1943
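/*
 * Service a lookup of ".": return dvp itself with an extra reference,
 * adjusting the lock type if the caller requested a different one.
 */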
1944 static int __noinline
1945 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1946 struct timespec *tsp, int *ticksp)
1947 {
1948 int ltype;
1949
1950 *vpp = dvp;
1951 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
1952 if (tsp != NULL)
1953 timespecclear(tsp);
1954 if (ticksp != NULL)
1955 *ticksp = ticks;
1956 vrefact(*vpp);
1957 /*
1958 * When we lookup "." we still can be asked to lock it
1959 * differently...
1960 */
1961 ltype = cnp->cn_lkflags & LK_TYPE_MASK;
1962 if (ltype != VOP_ISLOCKED(*vpp)) {
1963 if (ltype == LK_EXCLUSIVE) {
1964 vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
1965 if (VN_IS_DOOMED((*vpp))) {
1966 /* forced unmount */
1967 vrele(*vpp);
1968 *vpp = NULL;
1969 return (ENOENT);
1970 }
1971 } else
1972 vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
1973 }
1974 return (-1);
1975 }
1976
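/*
 * Service a lookup of "..", which is resolved through the directory's
 * v_cache_dd pointer instead of the hash table.
 */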
1977 static int __noinline
1978 cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1979 struct timespec *tsp, int *ticksp)
1980 {
1981 struct namecache_ts *ncp_ts;
1982 struct namecache *ncp;
1983 struct mtx *dvlp;
1984 enum vgetstate vs;
1985 int error, ltype;
1986 bool whiteout;
1987
1988 MPASS((cnp->cn_flags & ISDOTDOT) != 0);
1989
1990 if ((cnp->cn_flags & MAKEENTRY) == 0) {
1991 cache_remove_cnp(dvp, cnp);
1992 return (0);
1993 }
1994
1995 retry:
1996 dvlp = VP2VNODELOCK(dvp);
1997 mtx_lock(dvlp);
1998 ncp = dvp->v_cache_dd;
1999 if (ncp == NULL) {
2000 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, "..");
2001 mtx_unlock(dvlp);
2002 return (0);
2003 }
2004 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
2005 if (ncp->nc_flag & NCF_NEGATIVE)
2006 *vpp = NULL;
2007 else
2008 *vpp = ncp->nc_vp;
2009 } else
2010 *vpp = ncp->nc_dvp;
2011 if (*vpp == NULL)
2012 goto negative_success;
2013 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp);
2014 cache_out_ts(ncp, tsp, ticksp);
2015 if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
2016 NCF_DTS && tsp != NULL) {
2017 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
2018 *tsp = ncp_ts->nc_dotdottime;
2019 }
2020
2021 MPASS(dvp != *vpp);
2022 ltype = VOP_ISLOCKED(dvp);
2023 VOP_UNLOCK(dvp);
2024 vs = vget_prep(*vpp);
2025 mtx_unlock(dvlp);
2026 error = vget_finish(*vpp, cnp->cn_lkflags, vs);
2027 vn_lock(dvp, ltype | LK_RETRY);
2028 if (VN_IS_DOOMED(dvp)) {
2029 if (error == 0)
2030 vput(*vpp);
2031 *vpp = NULL;
2032 return (ENOENT);
2033 }
2034 if (error) {
2035 *vpp = NULL;
2036 goto retry;
2037 }
2038 return (-1);
2039 negative_success:
2040 if (__predict_false(cnp->cn_nameiop == CREATE)) {
2041 if (cnp->cn_flags & ISLASTCN) {
2042 counter_u64_add(numnegzaps, 1);
2043 cache_zap_negative_locked_vnode_kl(ncp, dvp);
2044 mtx_unlock(dvlp);
2045 cache_free(ncp);
2046 return (0);
2047 }
2048 }
2049
2050 whiteout = (ncp->nc_flag & NCF_WHITE);
2051 cache_out_ts(ncp, tsp, ticksp);
2052 if (cache_neg_hit_prep(ncp))
2053 cache_neg_promote(ncp);
2054 else
2055 cache_neg_hit_finish(ncp);
2056 mtx_unlock(dvlp);
2057 if (whiteout)
2058 cnp->cn_flags |= ISWHITEOUT;
2059 return (ENOENT);
2060 }
2061
2062 /**
2063 * Lookup a name in the name cache
2064 *
2065 * # Arguments
2066 *
2067 * - dvp: Parent directory in which to search.
2068 * - vpp: Return argument. Will contain desired vnode on cache hit.
2069 * - cnp: Parameters of the name search. The most interesting bits of
2070 * the cn_flags field have the following meanings:
2071 * - MAKEENTRY: If clear, free an entry from the cache rather than look
2072 * it up.
2073 * - ISDOTDOT: Must be set if and only if cn_nameptr == ".."
2074 * - tsp: Return storage for cache timestamp. On a successful (positive
2075 * or negative) lookup, tsp will be filled with any timespec that
2076 * was stored when this cache entry was created. However, it will
2077 * be clear for "." entries.
2078 * - ticksp: Return storage for alternate cache timestamp. On a successful
2079 * (positive or negative) lookup, it will contain the ticks value
2080 * that was current when the cache entry was created, unless cnp
2081 * was ".".
2082 *
2083 * Either both tsp and ticks have to be provided or neither of them.
2084 *
2085 * # Returns
2086 *
2087 * - -1: A positive cache hit. vpp will contain the desired vnode.
2088 * - ENOENT: A negative cache hit, or dvp was recycled out from under us due
2089 * to a forced unmount. vpp will not be modified. If the entry
2090 * is a whiteout, then the ISWHITEOUT flag will be set in
2091 * cnp->cn_flags.
2092 * - 0: A cache miss. vpp will not be modified.
2093 *
2094 * # Locking
2095 *
2096 * On a cache hit, vpp will be returned locked and ref'd. If we're looking up
2097 * .., dvp is unlocked. If we're looking up . an extra ref is taken, but the
2098 * lock is not recursively acquired.
2099 */
2100 static int __noinline
2101 cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
2102 struct timespec *tsp, int *ticksp)
2103 {
2104 struct namecache *ncp;
2105 struct mtx *blp;
2106 uint32_t hash;
2107 enum vgetstate vs;
2108 int error;
2109 bool whiteout;
2110
2111 MPASS((cnp->cn_flags & ISDOTDOT) == 0);
2112 MPASS((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) != 0);
2113
2114 retry:
2115 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
2116 blp = HASH2BUCKETLOCK(hash);
2117 mtx_lock(blp);
2118
2119 ncp = cache_ncp_find(dvp, cnp, hash);
2120 if (__predict_false(ncp == NULL)) {
2121 mtx_unlock(blp);
2122 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr);
2123 counter_u64_add(nummiss, 1);
2124 return (0);
2125 }
2126
2127 if (ncp->nc_flag & NCF_NEGATIVE)
2128 goto negative_success;
2129
2130 counter_u64_add(numposhits, 1);
2131 *vpp = ncp->nc_vp;
2132 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
2133 cache_out_ts(ncp, tsp, ticksp);
2134 MPASS(dvp != *vpp);
2135 vs = vget_prep(*vpp);
2136 mtx_unlock(blp);
2137 error = vget_finish(*vpp, cnp->cn_lkflags, vs);
2138 if (error) {
2139 *vpp = NULL;
2140 goto retry;
2141 }
2142 return (-1);
2143 negative_success:
2144 /*
2145 * We don't get here with regular lookup apart from corner cases.
2146 */
2147 if (__predict_true(cnp->cn_nameiop == CREATE)) {
2148 if (cnp->cn_flags & ISLASTCN) {
2149 counter_u64_add(numnegzaps, 1);
2150 error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
2151 if (__predict_false(error != 0)) {
2152 atomic_add_long(&zap_bucket_fail2, 1);
2153 goto retry;
2154 }
2155 cache_free(ncp);
2156 return (0);
2157 }
2158 }
2159
2160 whiteout = (ncp->nc_flag & NCF_WHITE);
2161 cache_out_ts(ncp, tsp, ticksp);
2162 if (cache_neg_hit_prep(ncp))
2163 cache_neg_promote(ncp);
2164 else
2165 cache_neg_hit_finish(ncp);
2166 mtx_unlock(blp);
2167 if (whiteout)
2168 cnp->cn_flags |= ISWHITEOUT;
2169 return (ENOENT);
2170 }
2171
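/*
 * A minimal sketch of the expected calling convention (vfs_cache_lookup()
 * below is the canonical in-tree consumer); the -1/0/ENOENT convention is
 * documented in the block comment above:
 *
 *	error = cache_lookup(dvp, &vp, cnp, NULL, NULL);
 *	if (error == 0)
 *		error = VOP_CACHEDLOOKUP(dvp, &vp, cnp);	miss, ask the fs
 *	else if (error == -1)
 *		error = 0;					positive hit
 *	ENOENT and other errors are returned to the caller as-is.
 */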
2172 int
2173 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
2174 struct timespec *tsp, int *ticksp)
2175 {
2176 struct namecache *ncp;
2177 uint32_t hash;
2178 enum vgetstate vs;
2179 int error;
2180 bool whiteout, neg_promote;
2181 u_short nc_flag;
2182
2183 MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL));
2184
2185 #ifdef DEBUG_CACHE
2186 if (__predict_false(!doingcache)) {
2187 cnp->cn_flags &= ~MAKEENTRY;
2188 return (0);
2189 }
2190 #endif
2191
2192 if (__predict_false(cnp->cn_nameptr[0] == '.')) {
2193 if (cnp->cn_namelen == 1)
2194 return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
2195 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.')
2196 return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp));
2197 }
2198
2199 MPASS((cnp->cn_flags & ISDOTDOT) == 0);
2200
2201 if ((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) == 0) {
2202 cache_remove_cnp(dvp, cnp);
2203 return (0);
2204 }
2205
2206 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
2207 vfs_smr_enter();
2208
2209 ncp = cache_ncp_find(dvp, cnp, hash);
2210 if (__predict_false(ncp == NULL)) {
2211 vfs_smr_exit();
2212 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr);
2213 counter_u64_add(nummiss, 1);
2214 return (0);
2215 }
2216
2217 nc_flag = atomic_load_char(&ncp->nc_flag);
2218 if (nc_flag & NCF_NEGATIVE)
2219 goto negative_success;
2220
2221 counter_u64_add(numposhits, 1);
2222 *vpp = ncp->nc_vp;
2223 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
2224 cache_out_ts(ncp, tsp, ticksp);
2225 MPASS(dvp != *vpp);
2226 if (!cache_ncp_canuse(ncp)) {
2227 vfs_smr_exit();
2228 *vpp = NULL;
2229 goto out_fallback;
2230 }
2231 vs = vget_prep_smr(*vpp);
2232 vfs_smr_exit();
2233 if (__predict_false(vs == VGET_NONE)) {
2234 *vpp = NULL;
2235 goto out_fallback;
2236 }
2237 error = vget_finish(*vpp, cnp->cn_lkflags, vs);
2238 if (error) {
2239 *vpp = NULL;
2240 goto out_fallback;
2241 }
2242 return (-1);
2243 negative_success:
2244 if (cnp->cn_nameiop == CREATE) {
2245 if (cnp->cn_flags & ISLASTCN) {
2246 vfs_smr_exit();
2247 goto out_fallback;
2248 }
2249 }
2250
2251 cache_out_ts(ncp, tsp, ticksp);
2252 whiteout = (atomic_load_char(&ncp->nc_flag) & NCF_WHITE);
2253 neg_promote = cache_neg_hit_prep(ncp);
2254 if (!cache_ncp_canuse(ncp)) {
2255 cache_neg_hit_abort(ncp);
2256 vfs_smr_exit();
2257 goto out_fallback;
2258 }
2259 if (neg_promote) {
2260 vfs_smr_exit();
2261 if (!cache_neg_promote_cond(dvp, cnp, ncp, hash))
2262 goto out_fallback;
2263 } else {
2264 cache_neg_hit_finish(ncp);
2265 vfs_smr_exit();
2266 }
2267 if (whiteout)
2268 cnp->cn_flags |= ISWHITEOUT;
2269 return (ENOENT);
2270 out_fallback:
2271 return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp));
2272 }
2273
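/*
 * Lock state tracked while inserting an entry: up to 3 vnode locks and
 * 2 bucket locks, see the comment above cache_enter_lock() below.
 */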
2274 struct celockstate {
2275 struct mtx *vlp[3];
2276 struct mtx *blp[2];
2277 };
2278 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
2279 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
2280
2281 static inline void
2282 cache_celockstate_init(struct celockstate *cel)
2283 {
2284
2285 bzero(cel, sizeof(*cel));
2286 }
2287
2288 static void
2289 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
2290 struct vnode *dvp)
2291 {
2292 struct mtx *vlp1, *vlp2;
2293
2294 MPASS(cel->vlp[0] == NULL);
2295 MPASS(cel->vlp[1] == NULL);
2296 MPASS(cel->vlp[2] == NULL);
2297
2298 MPASS(vp != NULL || dvp != NULL);
2299
2300 vlp1 = VP2VNODELOCK(vp);
2301 vlp2 = VP2VNODELOCK(dvp);
2302 cache_sort_vnodes(&vlp1, &vlp2);
2303
2304 if (vlp1 != NULL) {
2305 mtx_lock(vlp1);
2306 cel->vlp[0] = vlp1;
2307 }
2308 mtx_lock(vlp2);
2309 cel->vlp[1] = vlp2;
2310 }
2311
2312 static void
2313 cache_unlock_vnodes_cel(struct celockstate *cel)
2314 {
2315
2316 MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
2317
2318 if (cel->vlp[0] != NULL)
2319 mtx_unlock(cel->vlp[0]);
2320 if (cel->vlp[1] != NULL)
2321 mtx_unlock(cel->vlp[1]);
2322 if (cel->vlp[2] != NULL)
2323 mtx_unlock(cel->vlp[2]);
2324 }
2325
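/*
 * Acquire the third vnode lock with the first two already held.
 *
 * Returns true if the lock was obtained without dropping anything, false
 * if all locks had to be dropped and reacquired in order, in which case
 * the caller must re-validate its state.
 */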
2326 static bool
2327 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
2328 {
2329 struct mtx *vlp;
2330 bool ret;
2331
2332 cache_assert_vlp_locked(cel->vlp[0]);
2333 cache_assert_vlp_locked(cel->vlp[1]);
2334 MPASS(cel->vlp[2] == NULL);
2335
2336 MPASS(vp != NULL);
2337 vlp = VP2VNODELOCK(vp);
2338
2339 ret = true;
2340 if (vlp >= cel->vlp[1]) {
2341 mtx_lock(vlp);
2342 } else {
2343 if (mtx_trylock(vlp))
2344 goto out;
2345 cache_unlock_vnodes_cel(cel);
2346 atomic_add_long(&cache_lock_vnodes_cel_3_failures, 1);
2347 if (vlp < cel->vlp[0]) {
2348 mtx_lock(vlp);
2349 mtx_lock(cel->vlp[0]);
2350 mtx_lock(cel->vlp[1]);
2351 } else {
2352 if (cel->vlp[0] != NULL)
2353 mtx_lock(cel->vlp[0]);
2354 mtx_lock(vlp);
2355 mtx_lock(cel->vlp[1]);
2356 }
2357 ret = false;
2358 }
2359 out:
2360 cel->vlp[2] = vlp;
2361 return (ret);
2362 }
2363
2364 static void
2365 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1,
2366 struct mtx *blp2)
2367 {
2368
2369 MPASS(cel->blp[0] == NULL);
2370 MPASS(cel->blp[1] == NULL);
2371
2372 cache_sort_vnodes(&blp1, &blp2);
2373
2374 if (blp1 != NULL) {
2375 mtx_lock(blp1);
2376 cel->blp[0] = blp1;
2377 }
2378 mtx_lock(blp2);
2379 cel->blp[1] = blp2;
2380 }
2381
2382 static void
2383 cache_unlock_buckets_cel(struct celockstate *cel)
2384 {
2385
2386 if (cel->blp[0] != NULL)
2387 mtx_unlock(cel->blp[0]);
2388 mtx_unlock(cel->blp[1]);
2389 }
2390
2391 /*
2392 * Lock part of the cache affected by the insertion.
2393 *
2394 * This means vnodelocks for dvp, vp and the relevant bucketlock.
2395 * However, insertion can result in removal of an old entry. In this
2396 * case we have an additional vnode and bucketlock pair to lock.
2397 *
2398 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
2399 * preserving the locking order (smaller address first).
2400 */
2401 static void
2402 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
2403 uint32_t hash)
2404 {
2405 struct namecache *ncp;
2406 struct mtx *blps[2];
2407 u_char nc_flag;
2408
2409 blps[0] = HASH2BUCKETLOCK(hash);
2410 for (;;) {
2411 blps[1] = NULL;
2412 cache_lock_vnodes_cel(cel, dvp, vp);
2413 if (vp == NULL || vp->v_type != VDIR)
2414 break;
2415 ncp = atomic_load_consume_ptr(&vp->v_cache_dd);
2416 if (ncp == NULL)
2417 break;
2418 nc_flag = atomic_load_char(&ncp->nc_flag);
2419 if ((nc_flag & NCF_ISDOTDOT) == 0)
2420 break;
2421 MPASS(ncp->nc_dvp == vp);
2422 blps[1] = NCP2BUCKETLOCK(ncp);
2423 if ((nc_flag & NCF_NEGATIVE) != 0)
2424 break;
2425 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
2426 break;
2427 /*
2428 * All vnodes got re-locked. Re-validate the state and if
2429 * nothing changed we are done. Otherwise restart.
2430 */
2431 if (ncp == vp->v_cache_dd &&
2432 (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
2433 blps[1] == NCP2BUCKETLOCK(ncp) &&
2434 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
2435 break;
2436 cache_unlock_vnodes_cel(cel);
2437 cel->vlp[0] = NULL;
2438 cel->vlp[1] = NULL;
2439 cel->vlp[2] = NULL;
2440 }
2441 cache_lock_buckets_cel(cel, blps[0], blps[1]);
2442 }
2443
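/*
 * Same as cache_enter_lock(), but for inserting a ".." entry: the possibly
 * conflicting entry is found through dvp->v_cache_dd instead of
 * vp->v_cache_dd.
 */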
2444 static void
2445 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
2446 uint32_t hash)
2447 {
2448 struct namecache *ncp;
2449 struct mtx *blps[2];
2450 u_char nc_flag;
2451
2452 blps[0] = HASH2BUCKETLOCK(hash);
2453 for (;;) {
2454 blps[1] = NULL;
2455 cache_lock_vnodes_cel(cel, dvp, vp);
2456 ncp = atomic_load_consume_ptr(&dvp->v_cache_dd);
2457 if (ncp == NULL)
2458 break;
2459 nc_flag = atomic_load_char(&ncp->nc_flag);
2460 if ((nc_flag & NCF_ISDOTDOT) == 0)
2461 break;
2462 MPASS(ncp->nc_dvp == dvp);
2463 blps[1] = NCP2BUCKETLOCK(ncp);
2464 if ((nc_flag & NCF_NEGATIVE) != 0)
2465 break;
2466 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
2467 break;
2468 if (ncp == dvp->v_cache_dd &&
2469 (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
2470 blps[1] == NCP2BUCKETLOCK(ncp) &&
2471 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
2472 break;
2473 cache_unlock_vnodes_cel(cel);
2474 cel->vlp[0] = NULL;
2475 cel->vlp[1] = NULL;
2476 cel->vlp[2] = NULL;
2477 }
2478 cache_lock_buckets_cel(cel, blps[0], blps[1]);
2479 }
2480
2481 static void
2482 cache_enter_unlock(struct celockstate *cel)
2483 {
2484
2485 cache_unlock_buckets_cel(cel);
2486 cache_unlock_vnodes_cel(cel);
2487 }
2488
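/*
 * Purge any existing ".." entry for dvp before a new one gets inserted.
 */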
2489 static void __noinline
2490 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
2491 struct componentname *cnp)
2492 {
2493 struct celockstate cel;
2494 struct namecache *ncp;
2495 uint32_t hash;
2496 int len;
2497
2498 if (atomic_load_ptr(&dvp->v_cache_dd) == NULL)
2499 return;
2500 len = cnp->cn_namelen;
2501 cache_celockstate_init(&cel);
2502 hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
2503 cache_enter_lock_dd(&cel, dvp, vp, hash);
2504 ncp = dvp->v_cache_dd;
2505 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
2506 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
2507 cache_zap_locked(ncp);
2508 } else {
2509 ncp = NULL;
2510 }
2511 atomic_store_ptr(&dvp->v_cache_dd, NULL);
2512 cache_enter_unlock(&cel);
2513 if (ncp != NULL)
2514 cache_free(ncp);
2515 }
2516
2517 /*
2518 * Add an entry to the cache.
2519 */
2520 void
2521 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
2522 struct timespec *tsp, struct timespec *dtsp)
2523 {
2524 struct celockstate cel;
2525 struct namecache *ncp, *n2, *ndd;
2526 struct namecache_ts *ncp_ts;
2527 uint32_t hash;
2528 int flag;
2529 int len;
2530
2531 KASSERT(cnp->cn_namelen <= NAME_MAX,
2532 ("%s: passed len %ld exceeds NAME_MAX (%d)", __func__, cnp->cn_namelen,
2533 NAME_MAX));
2534 VNPASS(!VN_IS_DOOMED(dvp), dvp);
2535 VNPASS(dvp->v_type != VNON, dvp);
2536 if (vp != NULL) {
2537 VNPASS(!VN_IS_DOOMED(vp), vp);
2538 VNPASS(vp->v_type != VNON, vp);
2539 }
2540 if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') {
2541 KASSERT(dvp == vp,
2542 ("%s: different vnodes for dot entry (%p; %p)\n", __func__,
2543 dvp, vp));
2544 } else {
2545 KASSERT(dvp != vp,
2546 ("%s: same vnode for non-dot entry [%s] (%p)\n", __func__,
2547 cnp->cn_nameptr, dvp));
2548 }
2549
2550 #ifdef DEBUG_CACHE
2551 if (__predict_false(!doingcache))
2552 return;
2553 #endif
2554
2555 flag = 0;
2556 if (__predict_false(cnp->cn_nameptr[0] == '.')) {
2557 if (cnp->cn_namelen == 1)
2558 return;
2559 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
2560 cache_enter_dotdot_prep(dvp, vp, cnp);
2561 flag = NCF_ISDOTDOT;
2562 }
2563 }
2564
2565 ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
2566 if (ncp == NULL)
2567 return;
2568
2569 cache_celockstate_init(&cel);
2570 ndd = NULL;
2571 ncp_ts = NULL;
2572
2573 /*
2574 * Calculate the hash key and setup as much of the new
2575 * namecache entry as possible before acquiring the lock.
2576 */
2577 ncp->nc_flag = flag | NCF_WIP;
2578 ncp->nc_vp = vp;
2579 if (vp == NULL)
2580 cache_neg_init(ncp);
2581 ncp->nc_dvp = dvp;
2582 if (tsp != NULL) {
2583 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
2584 ncp_ts->nc_time = *tsp;
2585 ncp_ts->nc_ticks = ticks;
2586 ncp_ts->nc_nc.nc_flag |= NCF_TS;
2587 if (dtsp != NULL) {
2588 ncp_ts->nc_dotdottime = *dtsp;
2589 ncp_ts->nc_nc.nc_flag |= NCF_DTS;
2590 }
2591 }
2592 len = ncp->nc_nlen = cnp->cn_namelen;
2593 hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
2594 memcpy(ncp->nc_name, cnp->cn_nameptr, len);
2595 ncp->nc_name[len] = '\0';
2596 cache_enter_lock(&cel, dvp, vp, hash);
2597
2598 /*
2599 * See if this vnode or negative entry is already in the cache
2600 * with this name. This can happen with concurrent lookups of
2601 * the same path name.
2602 */
2603 n2 = cache_ncp_find(dvp, cnp, hash);
2604 if (n2 != NULL) {
2605 MPASS(cache_ncp_canuse(n2));
2606 if ((n2->nc_flag & NCF_NEGATIVE) != 0)
2607 KASSERT(vp == NULL,
2608 ("%s: found entry pointing to a different vnode "
2609 "(%p != %p); name [%s]",
2610 __func__, NULL, vp, cnp->cn_nameptr));
2611 else
2612 KASSERT(n2->nc_vp == vp,
2613 ("%s: found entry pointing to a different vnode "
2614 "(%p != %p); name [%s]",
2615 __func__, n2->nc_vp, vp, cnp->cn_nameptr));
2616 /*
2617 * Entries are supposed to be immutable unless in the
2618 * process of getting destroyed. Accommodating changing
2619 * timestamps is possible but not worth it.
2620 * This should be harmless in terms of correctness, in
2621 * the worst case resulting in an earlier expiration.
2622 * Alternatively, the found entry can be replaced
2623 * altogether.
2624 */
2625 MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) ==
2626 (ncp->nc_flag & (NCF_TS | NCF_DTS)));
2627 #if 0
2628 if (tsp != NULL) {
2629 KASSERT((n2->nc_flag & NCF_TS) != 0,
2630 ("no NCF_TS"));
2631 n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
2632 n2_ts->nc_time = ncp_ts->nc_time;
2633 n2_ts->nc_ticks = ncp_ts->nc_ticks;
2634 if (dtsp != NULL) {
2635 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
2636 n2_ts->nc_nc.nc_flag |= NCF_DTS;
2637 }
2638 }
2639 #endif
2640 SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name,
2641 vp);
2642 goto out_unlock_free;
2643 }
2644
2645 if (flag == NCF_ISDOTDOT) {
2646 /*
2647 * See if we are trying to add .. entry, but some other lookup
2648 * has populated v_cache_dd pointer already.
2649 */
2650 if (dvp->v_cache_dd != NULL)
2651 goto out_unlock_free;
2652 KASSERT(vp == NULL || vp->v_type == VDIR,
2653 ("wrong vnode type %p", vp));
2654 atomic_thread_fence_rel();
2655 atomic_store_ptr(&dvp->v_cache_dd, ncp);
2656 } else if (vp != NULL) {
2657 /*
2658 * Take the slow path in INOTIFY(). This flag will be lazily
2659 * cleared by cache_vop_inotify() once all directories referring
2660 * to vp are unwatched.
2661 */
2662 if (__predict_false((vn_irflag_read(dvp) & VIRF_INOTIFY) != 0))
2663 vn_irflag_set_cond(vp, VIRF_INOTIFY_PARENT);
2664
2665 /*
2666 * For this case, the cache entry maps both the
2667 * directory name in it and the name ".." for the
2668 * directory's parent.
2669 */
2670 if ((ndd = vp->v_cache_dd) != NULL) {
2671 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
2672 cache_zap_locked(ndd);
2673 else
2674 ndd = NULL;
2675 }
2676 atomic_thread_fence_rel();
2677 atomic_store_ptr(&vp->v_cache_dd, ncp);
2678 }
2679
2680 if (flag != NCF_ISDOTDOT) {
2681 if (LIST_EMPTY(&dvp->v_cache_src)) {
2682 cache_hold_vnode(dvp);
2683 }
2684 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
2685 }
2686
2687 /*
2688 * If the entry is "negative", we place it into the
2689 * "negative" cache queue, otherwise, we place it into the
2690 * destination vnode's cache entries queue.
2691 */
2692 if (vp != NULL) {
2693 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
2694 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
2695 vp);
2696 } else {
2697 if (cnp->cn_flags & ISWHITEOUT)
2698 atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_WHITE);
2699 cache_neg_insert(ncp);
2700 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
2701 ncp->nc_name);
2702 }
2703
2704 /*
2705 * Insert the new namecache entry into the appropriate chain
2706 * within the cache entries table.
2707 */
2708 CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
2709
2710 atomic_thread_fence_rel();
2711 /*
2712 * Mark the entry as fully constructed.
2713 * It is immutable past this point until its removal.
2714 */
2715 atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP);
2716
2717 cache_enter_unlock(&cel);
2718 if (ndd != NULL)
2719 cache_free(ndd);
2720 return;
2721 out_unlock_free:
2722 cache_enter_unlock(&cel);
2723 cache_free(ncp);
2724 return;
2725 }
2726
2727 /*
2728 * A variant of the above accepting flags.
2729 *
2730 * - VFS_CACHE_DROPOLD -- if a conflicting entry is found, drop it.
2731 *
2732 * TODO: this routine is a hack. It blindly removes the old entry, even if it
2733 * happens to match, and it does so in an inefficient manner. It was added
2734 * to accommodate NFS which runs into a case where the target for a given name
2735 * may change from under it. Note this does nothing to solve the following
2736 * race: 2 callers of cache_enter_time_flags pass a different target vnode for
2737 * the same [dvp, cnp]. It may be argued that code doing this is broken.
2738 */
2739 void
2740 cache_enter_time_flags(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
2741 struct timespec *tsp, struct timespec *dtsp, int flags)
2742 {
2743
2744 MPASS((flags & ~(VFS_CACHE_DROPOLD)) == 0);
2745
2746 if (flags & VFS_CACHE_DROPOLD)
2747 cache_remove_cnp(dvp, cnp);
2748 cache_enter_time(dvp, vp, cnp, tsp, dtsp);
2749 }
2750
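/*
 * Return the smallest power of 2 strictly greater than val, e.g.
 * cache_roundup_2(1000) == 1024 and cache_roundup_2(1024) == 2048.
 */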
2751 static u_long
2752 cache_roundup_2(u_long val)
2753 {
2754 u_long res;
2755
2756 for (res = 1; res <= val; res <<= 1)
2757 continue;
2758
2759 return (res);
2760 }
2761
2762 static struct nchashhead *
2763 nchinittbl(u_long elements, u_long *hashmask)
2764 {
2765 struct nchashhead *hashtbl;
2766 u_long hashsize, i;
2767
2768 hashsize = cache_roundup_2(elements) / 2;
2769
2770 hashtbl = malloc(hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK);
2771 for (i = 0; i < hashsize; i++)
2772 CK_SLIST_INIT(&hashtbl[i]);
2773 *hashmask = hashsize - 1;
2774 return (hashtbl);
2775 }
2776
2777 static void
2778 ncfreetbl(struct nchashhead *hashtbl)
2779 {
2780
2781 free(hashtbl, M_VFSCACHE);
2782 }
2783
2784 /*
2785 * Name cache initialization, from vfs_init() when we are booting
2786 */
2787 static void
2788 nchinit(void *dummy __unused)
2789 {
2790 u_int i;
2791
2792 cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE,
2793 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGN_MASK, UMA_ZONE_ZINIT);
2794 cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE,
2795 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGN_MASK, UMA_ZONE_ZINIT);
2796 cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE,
2797 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGN_MASK, UMA_ZONE_ZINIT);
2798 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE,
2799 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGN_MASK, UMA_ZONE_ZINIT);
2800
2801 VFS_SMR_ZONE_SET(cache_zone_small);
2802 VFS_SMR_ZONE_SET(cache_zone_small_ts);
2803 VFS_SMR_ZONE_SET(cache_zone_large);
2804 VFS_SMR_ZONE_SET(cache_zone_large_ts);
2805
2806 ncsize = desiredvnodes * ncsizefactor;
2807 cache_recalc_neg_min();
2808 nchashtbl = nchinittbl(ncsize, &nchash);
2809 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
2810 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
2811 ncbuckethash = 7;
2812 if (ncbuckethash > nchash)
2813 ncbuckethash = nchash;
2814 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
2815 M_WAITOK | M_ZERO);
2816 for (i = 0; i < numbucketlocks; i++)
2817 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE);
2818 ncvnodehash = ncbuckethash;
2819 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
2820 M_WAITOK | M_ZERO);
2821 for (i = 0; i < numvnodelocks; i++)
2822 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
2823
2824 for (i = 0; i < numneglists; i++) {
2825 mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF);
2826 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
2827 TAILQ_INIT(&neglists[i].nl_list);
2828 TAILQ_INIT(&neglists[i].nl_hotlist);
2829 }
2830 }
2831 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
2832
2833 void
2834 cache_vnode_init(struct vnode *vp)
2835 {
2836
2837 LIST_INIT(&vp->v_cache_src);
2838 TAILQ_INIT(&vp->v_cache_dst);
2839 vp->v_cache_dd = NULL;
2840 cache_prehash(vp);
2841 }
2842
2843 /*
2844 * Induce transient cache misses for lockless operation in cache_lookup() by
2845 * using a temporary hash table.
2846 *
2847 * This will force a fs lookup.
2848 *
2849 * Synchronisation is done in 2 steps, calling vfs_smr_synchronize each time
2850 * to observe all CPUs not performing the lookup.
2851 */
2852 static void
2853 cache_changesize_set_temp(struct nchashhead *temptbl, u_long temphash)
2854 {
2855
2856 MPASS(temphash < nchash);
2857 /*
2858 * Change the size. The new size is smaller and can safely be used
2859 * against the existing table. All lookups which now hash wrong will
2860 * result in a cache miss, which all callers are supposed to know how
2861 * to handle.
2862 */
2863 atomic_store_long(&nchash, temphash);
2864 atomic_thread_fence_rel();
2865 vfs_smr_synchronize();
2866 /*
2867 * At this point everyone sees the updated hash value, but they still
2868 * see the old table.
2869 */
2870 atomic_store_ptr(&nchashtbl, temptbl);
2871 atomic_thread_fence_rel();
2872 vfs_smr_synchronize();
2873 /*
2874 * At this point everyone sees the updated table pointer and size pair.
2875 */
2876 }
2877
2878 /*
2879 * Set the new hash table.
2880 *
2881 * Similarly to cache_changesize_set_temp(), this has to synchronize against
2882 * lockless operation in cache_lookup().
2883 */
2884 static void
2885 cache_changesize_set_new(struct nchashhead *new_tbl, u_long new_hash)
2886 {
2887
2888 MPASS(nchash < new_hash);
2889 /*
2890 * Change the pointer first. This won't result in out-of-bounds access
2891 * since the temporary table is guaranteed to be smaller.
2892 */
2893 atomic_store_ptr(&nchashtbl, new_tbl);
2894 atomic_thread_fence_rel();
2895 vfs_smr_synchronize();
2896 /*
2897 * At this point everyone sees the updated pointer value, but they
2898 * still see the old size.
2899 */
2900 atomic_store_long(&nchash, new_hash);
2901 atomic_thread_fence_rel();
2902 vfs_smr_synchronize();
2903 /*
2904 * At this point everyone sees the updated table pointer and size pair.
2905 */
2906 }
2907
2908 void
2909 cache_changesize(u_long newmaxvnodes)
2910 {
2911 struct nchashhead *new_nchashtbl, *old_nchashtbl, *temptbl;
2912 u_long new_nchash, old_nchash, temphash;
2913 struct namecache *ncp;
2914 uint32_t hash;
2915 u_long newncsize;
2916 u_long i;
2917
2918 newncsize = newmaxvnodes * ncsizefactor;
2919 newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
2920 if (newmaxvnodes < numbucketlocks)
2921 newmaxvnodes = numbucketlocks;
2922
2923 new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash);
2924 /* If same hash table size, nothing to do */
2925 if (nchash == new_nchash) {
2926 ncfreetbl(new_nchashtbl);
2927 return;
2928 }
2929
2930 temptbl = nchinittbl(1, &temphash);
2931
2932 /*
2933 * Move everything from the old hash table to the new table.
2934 * None of the namecache entries in the table can be removed
2935 * because to do so, they have to be removed from the hash table.
2936 */
2937 cache_lock_all_vnodes();
2938 cache_lock_all_buckets();
2939 old_nchashtbl = nchashtbl;
2940 old_nchash = nchash;
2941 cache_changesize_set_temp(temptbl, temphash);
2942 for (i = 0; i <= old_nchash; i++) {
2943 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) {
2944 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
2945 ncp->nc_dvp);
2946 CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash);
2947 CK_SLIST_INSERT_HEAD(&new_nchashtbl[hash & new_nchash], ncp, nc_hash);
2948 }
2949 }
2950 ncsize = newncsize;
2951 cache_recalc_neg_min();
2952 cache_changesize_set_new(new_nchashtbl, new_nchash);
2953 cache_unlock_all_buckets();
2954 cache_unlock_all_vnodes();
2955 ncfreetbl(old_nchashtbl);
2956 ncfreetbl(temptbl);
2957 }
2958
2959 /*
2960 * Remove all entries from and to a particular vnode.
2961 */
2962 static void
2963 cache_purge_impl(struct vnode *vp)
2964 {
2965 struct cache_freebatch batch;
2966 struct namecache *ncp;
2967 struct mtx *vlp, *vlp2;
2968
2969 TAILQ_INIT(&batch);
2970 vlp = VP2VNODELOCK(vp);
2971 vlp2 = NULL;
2972 mtx_lock(vlp);
2973 retry:
2974 while (!LIST_EMPTY(&vp->v_cache_src)) {
2975 ncp = LIST_FIRST(&vp->v_cache_src);
2976 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2977 goto retry;
2978 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2979 }
2980 while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
2981 ncp = TAILQ_FIRST(&vp->v_cache_dst);
2982 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2983 goto retry;
2984 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2985 }
2986 ncp = vp->v_cache_dd;
2987 if (ncp != NULL) {
2988 KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
2989 ("lost dotdot link"));
2990 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2991 goto retry;
2992 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2993 }
2994 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
2995 mtx_unlock(vlp);
2996 if (vlp2 != NULL)
2997 mtx_unlock(vlp2);
2998 cache_free_batch(&batch);
2999 }
3000
3001 /*
3002 * Opportunistic check to see if there is anything to do.
3003 */
3004 static bool
3005 cache_has_entries(struct vnode *vp)
3006 {
3007
3008 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
3009 atomic_load_ptr(&vp->v_cache_dd) == NULL)
3010 return (false);
3011 return (true);
3012 }
3013
3014 void
3015 cache_purge(struct vnode *vp)
3016 {
3017
3018 SDT_PROBE1(vfs, namecache, purge, done, vp);
3019 if (!cache_has_entries(vp))
3020 return;
3021 cache_purge_impl(vp);
3022 }
3023
3024 /*
3025 * Only to be used by vgone.
3026 */
3027 void
3028 cache_purge_vgone(struct vnode *vp)
3029 {
3030 struct mtx *vlp;
3031
3032 VNPASS(VN_IS_DOOMED(vp), vp);
3033 if (cache_has_entries(vp)) {
3034 cache_purge_impl(vp);
3035 return;
3036 }
3037
3038 /*
3039 * Serialize against a potential thread doing cache_purge.
3040 */
3041 vlp = VP2VNODELOCK(vp);
3042 mtx_wait_unlocked(vlp);
3043 if (cache_has_entries(vp)) {
3044 cache_purge_impl(vp);
3045 return;
3046 }
3047 return;
3048 }
3049
3050 /*
3051 * Remove all negative entries for a particular directory vnode.
3052 */
3053 void
3054 cache_purge_negative(struct vnode *vp)
3055 {
3056 struct cache_freebatch batch;
3057 struct namecache *ncp, *nnp;
3058 struct mtx *vlp;
3059
3060 SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
3061 if (LIST_EMPTY(&vp->v_cache_src))
3062 return;
3063 TAILQ_INIT(&batch);
3064 vlp = VP2VNODELOCK(vp);
3065 mtx_lock(vlp);
3066 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
3067 if (!(ncp->nc_flag & NCF_NEGATIVE))
3068 continue;
3069 cache_zap_negative_locked_vnode_kl(ncp, vp);
3070 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
3071 }
3072 mtx_unlock(vlp);
3073 cache_free_batch(&batch);
3074 }
3075
3076 /*
3077 * Entry points for modifying VOP operations.
3078 */
3079 void
3080 cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
3081 struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp)
3082 {
3083
3084 ASSERT_VOP_IN_SEQC(fdvp);
3085 ASSERT_VOP_IN_SEQC(fvp);
3086 ASSERT_VOP_IN_SEQC(tdvp);
3087 if (tvp != NULL)
3088 ASSERT_VOP_IN_SEQC(tvp);
3089
3090 cache_purge(fvp);
3091 if (tvp != NULL) {
3092 cache_purge(tvp);
3093 KASSERT(!cache_remove_cnp(tdvp, tcnp),
3094 ("%s: lingering negative entry", __func__));
3095 } else {
3096 cache_remove_cnp(tdvp, tcnp);
3097 }
3098
3099 /*
3100 * TODO
3101 *
3102 * Historically renaming always purged all relevant entries,
3103 * but that's quite wasteful. In particular it turns out that in many cases
3104 * the target file is immediately accessed after rename, inducing a cache
3105 * miss.
3106 *
3107 * Recode this to reduce relocking and reuse the existing entry (if any)
3108 * instead of just removing it above and allocating a new one here.
3109 */
3110 cache_enter(tdvp, fvp, tcnp);
3111 }
3112
3113 void
3114 cache_vop_rmdir(struct vnode *dvp, struct vnode *vp)
3115 {
3116
3117 ASSERT_VOP_IN_SEQC(dvp);
3118 ASSERT_VOP_IN_SEQC(vp);
3119 cache_purge(vp);
3120 }
3121
3122 #ifdef INVARIANTS
3123 /*
3124 * Validate that if an entry exists it matches.
3125 */
3126 void
3127 cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
3128 {
3129 struct namecache *ncp;
3130 struct mtx *blp;
3131 uint32_t hash;
3132
3133 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
3134 if (CK_SLIST_EMPTY(NCHHASH(hash)))
3135 return;
3136 blp = HASH2BUCKETLOCK(hash);
3137 mtx_lock(blp);
3138 ncp = cache_ncp_find(dvp, cnp, hash);
3139 if (ncp != NULL && ncp->nc_vp != vp) {
3140 panic("%s: mismatch (%p != %p); ncp %p [%s] dvp %p\n",
3141 __func__, vp, ncp->nc_vp, ncp, ncp->nc_name, ncp->nc_dvp);
3142 }
3143 mtx_unlock(blp);
3144 }
3145
3146 void
3147 cache_assert_no_entries(struct vnode *vp)
3148 {
3149
3150 VNPASS(TAILQ_EMPTY(&vp->v_cache_dst), vp);
3151 VNPASS(LIST_EMPTY(&vp->v_cache_src), vp);
3152 VNPASS(vp->v_cache_dd == NULL, vp);
3153 }
3154 #endif
3155
3156 /*
3157 * Flush all entries referencing a particular filesystem.
3158 */
3159 void
3160 cache_purgevfs(struct mount *mp)
3161 {
3162 struct vnode *vp, *mvp;
3163 size_t visited __sdt_used, purged __sdt_used;
3164
3165 visited = purged = 0;
3166 /*
3167 * Somewhat wasteful iteration over all vnodes. Would be better to
3168 * support filtering and avoid the interlock to begin with.
3169 */
3170 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
3171 visited++;
3172 if (!cache_has_entries(vp)) {
3173 VI_UNLOCK(vp);
3174 continue;
3175 }
3176 vholdl(vp);
3177 VI_UNLOCK(vp);
3178 cache_purge(vp);
3179 purged++;
3180 vdrop(vp);
3181 }
3182
3183 SDT_PROBE3(vfs, namecache, purgevfs, done, mp, visited, purged);
3184 }
3185
3186 /*
3187 * Perform canonical checks and cache lookup and pass on to filesystem
3188 * through the vop_cachedlookup only if needed.
3189 */
3190
3191 int
3192 vfs_cache_lookup(struct vop_lookup_args *ap)
3193 {
3194 struct vnode *dvp;
3195 int error;
3196 struct vnode **vpp = ap->a_vpp;
3197 struct componentname *cnp = ap->a_cnp;
3198 int flags = cnp->cn_flags;
3199
3200 *vpp = NULL;
3201 dvp = ap->a_dvp;
3202
3203 if (dvp->v_type != VDIR)
3204 return (ENOTDIR);
3205
3206 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
3207 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
3208 return (EROFS);
3209
3210 error = vn_dir_check_exec(dvp, cnp);
3211 if (error != 0)
3212 return (error);
3213
3214 error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
3215 if (error == 0)
3216 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
3217 if (error == -1)
3218 return (0);
3219 return (error);
3220 }
3221
3222 /* Implementation of the getcwd syscall. */
3223 int
3224 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
3225 {
3226 char *buf, *retbuf;
3227 size_t buflen;
3228 int error;
3229
3230 buflen = uap->buflen;
3231 if (__predict_false(buflen < 2))
3232 return (EINVAL);
3233 if (buflen > MAXPATHLEN)
3234 buflen = MAXPATHLEN;
3235
3236 buf = uma_zalloc(namei_zone, M_WAITOK);
3237 error = vn_getcwd(buf, &retbuf, &buflen);
3238 if (error == 0)
3239 error = copyout(retbuf, uap->buf, buflen);
3240 uma_zfree(namei_zone, buf);
3241 return (error);
3242 }
3243
3244 int
3245 vn_getcwd(char *buf, char **retbuf, size_t *buflen)
3246 {
3247 struct pwd *pwd;
3248 int error;
3249
3250 vfs_smr_enter();
3251 pwd = pwd_get_smr();
3252 error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf,
3253 buflen, 0);
3254 VFS_SMR_ASSERT_NOT_ENTERED();
3255 if (error < 0) {
3256 pwd = pwd_hold(curthread);
3257 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf,
3258 retbuf, buflen);
3259 pwd_drop(pwd);
3260 }
3261
3262 #ifdef KTRACE
3263 if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
3264 ktrnamei(*retbuf);
3265 #endif
3266 return (error);
3267 }
3268
3269 /*
3270 * Canonicalize a path by walking it forward and back.
3271 *
3272 * BUGS:
3273 * - Nothing guarantees the integrity of the entire chain. Consider the case
3274 * where the path "foo/bar/baz/qux" is passed, but "bar" is moved out of
3275 * "foo" into "quux" during the backwards walk. The result will be
3276 * "quux/bar/baz/qux", which could not have been obtained by an incremental
3277 * walk in userspace. Moreover, the path we return is inaccessible if the
3278 * calling thread lacks permission to traverse "quux".
3279 */
3280 static int
3281 kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
3282 size_t size, int flags, enum uio_seg pathseg)
3283 {
3284 struct nameidata nd;
3285 char *retbuf, *freebuf;
3286 int error;
3287
3288 if (flags != 0)
3289 return (EINVAL);
3290 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | WANTPARENT | AUDITVNODE1,
3291 pathseg, path, fd, &cap_fstat_rights);
3292 if ((error = namei(&nd)) != 0)
3293 return (error);
3294
3295 if (nd.ni_vp->v_type == VREG && nd.ni_dvp->v_type != VDIR &&
3296 (nd.ni_vp->v_vflag & VV_ROOT) != 0) {
3297 struct vnode *covered_vp;
3298
3299 /*
3300 * This happens if vp is a file mount. The call to
3301 * vn_fullpath_hardlink can panic if path resolution can't be
3302 * handled without the directory.
3303 *
3304 * To resolve this, we find the vnode which was mounted on -
3305 * this should have a unique global path since we disallow
3306 * mounting on linked files.
3307 */
3308 error = vn_lock(nd.ni_vp, LK_SHARED);
3309 if (error != 0)
3310 goto out;
3311 covered_vp = nd.ni_vp->v_mount->mnt_vnodecovered;
3312 vref(covered_vp);
3313 VOP_UNLOCK(nd.ni_vp);
3314 error = vn_fullpath(covered_vp, &retbuf, &freebuf);
3315 vrele(covered_vp);
3316 } else {
3317 error = vn_fullpath_hardlink(nd.ni_vp, nd.ni_dvp,
3318 nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, &retbuf,
3319 &freebuf, &size);
3320 }
3321 if (error == 0) {
3322 size_t len;
3323
3324 len = strlen(retbuf) + 1;
3325 if (size < len)
3326 error = ENAMETOOLONG;
3327 else if (pathseg == UIO_USERSPACE)
3328 error = copyout(retbuf, buf, len);
3329 else
3330 memcpy(buf, retbuf, len);
3331 free(freebuf, M_TEMP);
3332 }
3333 out:
3334 vrele(nd.ni_vp);
3335 vrele(nd.ni_dvp);
3336 NDFREE_PNBUF(&nd);
3337 return (error);
3338 }
3339
3340 int
3341 sys___realpathat(struct thread *td, struct __realpathat_args *uap)
3342 {
3343
3344 return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
3345 uap->flags, UIO_USERSPACE));
3346 }
3347
3348 static int
3349 vn_fullpath_up_to_pwd_vnode(struct vnode *vp,
3350 struct vnode *(*const get_pwd_vnode)(const struct pwd *),
3351 char **retbuf, char **freebuf)
3352 {
3353 struct pwd *pwd;
3354 char *buf;
3355 size_t buflen;
3356 int error;
3357
3358 if (__predict_false(vp == NULL))
3359 return (EINVAL);
3360
3361 buflen = MAXPATHLEN;
3362 buf = malloc(buflen, M_TEMP, M_WAITOK);
3363 vfs_smr_enter();
3364 pwd = pwd_get_smr();
3365 error = vn_fullpath_any_smr(vp, get_pwd_vnode(pwd), buf, retbuf,
3366 &buflen, 0);
3367 VFS_SMR_ASSERT_NOT_ENTERED();
3368 if (error < 0) {
3369 pwd = pwd_hold(curthread);
3370 error = vn_fullpath_any(vp, get_pwd_vnode(pwd), buf, retbuf,
3371 &buflen);
3372 pwd_drop(pwd);
3373 }
3374 if (error == 0)
3375 *freebuf = buf;
3376 else
3377 free(buf, M_TEMP);
3378 return (error);
3379 }
3380
3381 static inline struct vnode *
3382 get_rdir(const struct pwd *pwd)
3383 {
3384 return (pwd->pwd_rdir);
3385 }
3386
3387 /*
3388 * Produce a filesystem path that starts from the current chroot directory and
3389 * corresponds to the passed vnode, using the name cache (if available).
3390 */
3391 int
3392 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf)
3393 {
3394 return (vn_fullpath_up_to_pwd_vnode(vp, get_rdir, retbuf, freebuf));
3395 }
3396
3397 static inline struct vnode *
3398 get_jdir(const struct pwd *pwd)
3399 {
3400 return (pwd->pwd_jdir);
3401 }
3402
3403 /*
3404 * Produce a filesystem path that starts from the current jail's root directory
3405 * and corresponds to the passed vnode, using the name cache (if available).
3406 *
3407 * This function makes it possible to ignore chroots done inside a jail
3408 * (or the host), so that path checks remain unaffected by privileged or
3409 * unprivileged chroot calls.
3410 */
3411 int
3412 vn_fullpath_jail(struct vnode *vp, char **retbuf, char **freebuf)
3413 {
3414 return (vn_fullpath_up_to_pwd_vnode(vp, get_jdir, retbuf, freebuf));
3415 }
3416
3417 /*
3418 * This function is similar to vn_fullpath, but it attempts to lookup the
3419 * pathname relative to the global root mount point. This is required for the
3420 * auditing sub-system, as audited pathnames must be absolute, relative to the
3421 * global root mount point.
3422 */
3423 int
3424 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf)
3425 {
3426 char *buf;
3427 size_t buflen;
3428 int error;
3429
3430 if (__predict_false(vp == NULL))
3431 return (EINVAL);
3432 buflen = MAXPATHLEN;
3433 buf = malloc(buflen, M_TEMP, M_WAITOK);
3434 vfs_smr_enter();
3435 error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, 0);
3436 VFS_SMR_ASSERT_NOT_ENTERED();
3437 if (error < 0) {
3438 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen);
3439 }
3440 if (error == 0)
3441 *freebuf = buf;
3442 else
3443 free(buf, M_TEMP);
3444 return (error);
3445 }
3446
3447 static struct namecache *
3448 vn_dd_from_dst(struct vnode *vp)
3449 {
3450 struct namecache *ncp;
3451
3452 cache_assert_vnode_locked(vp);
3453 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) {
3454 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
3455 return (ncp);
3456 }
3457 return (NULL);
3458 }
3459
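/*
 * Resolve a single pathname component: prepend the name of *vp to the
 * buffer and swap *vp for a referenced vnode of its parent directory,
 * falling back to VOP_VPTOCNP() when the name cache has nothing to offer.
 */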
3460 int
3461 vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen)
3462 {
3463 struct vnode *dvp;
3464 struct namecache *ncp;
3465 struct mtx *vlp;
3466 int error;
3467
3468 vlp = VP2VNODELOCK(*vp);
3469 mtx_lock(vlp);
3470 ncp = (*vp)->v_cache_dd;
3471 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) {
3472 KASSERT(ncp == vn_dd_from_dst(*vp),
3473 ("%s: mismatch for dd entry (%p != %p)", __func__,
3474 ncp, vn_dd_from_dst(*vp)));
3475 } else {
3476 ncp = vn_dd_from_dst(*vp);
3477 }
3478 if (ncp != NULL) {
3479 if (*buflen < ncp->nc_nlen) {
3480 mtx_unlock(vlp);
3481 vrele(*vp);
3482 counter_u64_add(numfullpathfail4, 1);
3483 error = ENOMEM;
3484 SDT_PROBE3(vfs, namecache, fullpath, return, error,
3485 vp, NULL);
3486 return (error);
3487 }
3488 *buflen -= ncp->nc_nlen;
3489 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
3490 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
3491 ncp->nc_name, vp);
3492 dvp = *vp;
3493 *vp = ncp->nc_dvp;
3494 vref(*vp);
3495 mtx_unlock(vlp);
3496 vrele(dvp);
3497 return (0);
3498 }
3499 SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
3500
3501 mtx_unlock(vlp);
3502 vn_lock(*vp, LK_SHARED | LK_RETRY);
3503 error = VOP_VPTOCNP(*vp, &dvp, buf, buflen);
3504 vput(*vp);
3505 if (error) {
3506 counter_u64_add(numfullpathfail2, 1);
3507 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
3508 return (error);
3509 }
3510
3511 *vp = dvp;
3512 if (VN_IS_DOOMED(dvp)) {
3513 /* forced unmount */
3514 vrele(dvp);
3515 error = ENOENT;
3516 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
3517 return (error);
3518 }
3519 /*
3520 * *vp has its use count incremented still.
3521 */
3522
3523 return (0);
3524 }
3525
3526 /*
3527 * Resolve a directory to a pathname.
3528 *
3529 * The name of the directory can always be found in the namecache or fetched
3530 * from the filesystem. There is also guaranteed to be only one parent, meaning
3531 * we can just follow vnodes up until we find the root.
3532 *
3533 * The vnode must be referenced.
3534 */
3535 static int
3536 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
3537 size_t *len, size_t addend)
3538 {
3539 #ifdef KDTRACE_HOOKS
3540 struct vnode *startvp = vp;
3541 #endif
3542 struct vnode *vp1;
3543 size_t buflen;
3544 int error;
3545 bool slash_prefixed;
3546
3547 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
3548 VNPASS(vp->v_usecount > 0, vp);
3549
3550 buflen = *len;
3551
3552 slash_prefixed = true;
3553 if (addend == 0) {
3554 MPASS(*len >= 2);
3555 buflen--;
3556 buf[buflen] = '\0';
3557 slash_prefixed = false;
3558 }
3559
3560 error = 0;
3561
3562 SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
3563 counter_u64_add(numfullpathcalls, 1);
3564 while (vp != rdir && vp != rootvnode) {
3565 /*
3566 * The vp vnode must already be fully constructed,
3567 * since it is either found in the namecache or obtained
3568 * from VOP_VPTOCNP(). We may test for VV_ROOT safely
3569 * without obtaining the vnode lock.
3570 */
3571 if ((vp->v_vflag & VV_ROOT) != 0) {
3572 vn_lock(vp, LK_RETRY | LK_SHARED);
3573
3574 /*
3575 * With the vnode locked, check for races with
3576 * unmount, forced or not. Note that we
3577 * already verified that vp is not equal to
3578 * the root vnode, which means that
3579 * mnt_vnodecovered can be NULL only for the
3580 * case of unmount.
3581 */
3582 if (VN_IS_DOOMED(vp) ||
3583 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
3584 vp1->v_mountedhere != vp->v_mount) {
3585 vput(vp);
3586 error = ENOENT;
3587 SDT_PROBE3(vfs, namecache, fullpath, return,
3588 error, vp, NULL);
3589 break;
3590 }
3591
3592 vref(vp1);
3593 vput(vp);
3594 vp = vp1;
3595 continue;
3596 }
3597 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
3598 error = vn_vptocnp(&vp, buf, &buflen);
3599 if (error)
3600 break;
3601 if (buflen == 0) {
3602 vrele(vp);
3603 error = ENOMEM;
3604 SDT_PROBE3(vfs, namecache, fullpath, return, error,
3605 startvp, NULL);
3606 break;
3607 }
3608 buf[--buflen] = '/';
3609 slash_prefixed = true;
3610 }
3611 if (error)
3612 return (error);
3613 if (!slash_prefixed) {
3614 if (buflen == 0) {
3615 vrele(vp);
3616 counter_u64_add(numfullpathfail4, 1);
3617 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
3618 startvp, NULL);
3619 return (ENOMEM);
3620 }
3621 buf[--buflen] = '/';
3622 }
3623 counter_u64_add(numfullpathfound, 1);
3624 vrele(vp);
3625
3626 *retbuf = buf + buflen;
3627 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
3628 *len -= buflen;
3629 *len += addend;
3630 return (0);
3631 }
3632
3633 /*
3634 * Resolve an arbitrary vnode to a pathname.
3635 *
3636 * Note 2 caveats:
3637 * - hardlinks are not tracked, thus if the vnode is not a directory this can
3638 * resolve to a different path than the one used to find it
3639 * - namecache is not mandatory, meaning names are not guaranteed to be added
3640 * (in which case resolving fails)
3641 */
3642 static void __inline
3643 cache_rev_failed_impl(int *reason, int line)
3644 {
3645
3646 *reason = line;
3647 }
3648 #define cache_rev_failed(var) cache_rev_failed_impl((var), __LINE__)
3649
3650 static int
3651 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
3652 char **retbuf, size_t *buflen, size_t addend)
3653 {
3654 #ifdef KDTRACE_HOOKS
3655 struct vnode *startvp = vp;
3656 #endif
3657 struct vnode *tvp;
3658 struct mount *mp;
3659 struct namecache *ncp;
3660 size_t orig_buflen;
3661 int reason;
3662 int error;
3663 #ifdef KDTRACE_HOOKS
3664 int i;
3665 #endif
3666 seqc_t vp_seqc, tvp_seqc;
3667 u_char nc_flag;
3668
3669 VFS_SMR_ASSERT_ENTERED();
3670
3671 if (!atomic_load_char(&cache_fast_lookup_enabled)) {
3672 vfs_smr_exit();
3673 return (-1);
3674 }
3675
3676 orig_buflen = *buflen;
3677
3678 if (addend == 0) {
3679 MPASS(*buflen >= 2);
3680 *buflen -= 1;
3681 buf[*buflen] = '\0';
3682 }
3683
3684 if (vp == rdir || vp == rootvnode) {
3685 if (addend == 0) {
3686 *buflen -= 1;
3687 buf[*buflen] = '/';
3688 }
3689 goto out_ok;
3690 }
3691
3692 #ifdef KDTRACE_HOOKS
3693 i = 0;
3694 #endif
3695 error = -1;
3696 ncp = NULL; /* for sdt probe down below */
3697 vp_seqc = vn_seqc_read_any(vp);
3698 if (seqc_in_modify(vp_seqc)) {
3699 cache_rev_failed(&reason);
3700 goto out_abort;
3701 }
3702
3703 for (;;) {
3704 #ifdef KDTRACE_HOOKS
3705 i++;
3706 #endif
3707 if ((vp->v_vflag & VV_ROOT) != 0) {
3708 mp = atomic_load_ptr(&vp->v_mount);
3709 if (mp == NULL) {
3710 cache_rev_failed(&reason);
3711 goto out_abort;
3712 }
3713 tvp = atomic_load_ptr(&mp->mnt_vnodecovered);
3714 tvp_seqc = vn_seqc_read_any(tvp);
3715 if (seqc_in_modify(tvp_seqc)) {
3716 cache_rev_failed(&reason);
3717 goto out_abort;
3718 }
3719 if (!vn_seqc_consistent(vp, vp_seqc)) {
3720 cache_rev_failed(&reason);
3721 goto out_abort;
3722 }
3723 vp = tvp;
3724 vp_seqc = tvp_seqc;
3725 continue;
3726 }
3727 ncp = atomic_load_consume_ptr(&vp->v_cache_dd);
3728 if (ncp == NULL) {
3729 cache_rev_failed(&reason);
3730 goto out_abort;
3731 }
3732 nc_flag = atomic_load_char(&ncp->nc_flag);
3733 if ((nc_flag & NCF_ISDOTDOT) != 0) {
3734 cache_rev_failed(&reason);
3735 goto out_abort;
3736 }
3737 if (ncp->nc_nlen >= *buflen) {
3738 cache_rev_failed(&reason);
3739 error = ENOMEM;
3740 goto out_abort;
3741 }
3742 *buflen -= ncp->nc_nlen;
3743 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
3744 *buflen -= 1;
3745 buf[*buflen] = '/';
3746 tvp = ncp->nc_dvp;
3747 tvp_seqc = vn_seqc_read_any(tvp);
3748 if (seqc_in_modify(tvp_seqc)) {
3749 cache_rev_failed(&reason);
3750 goto out_abort;
3751 }
3752 if (!vn_seqc_consistent(vp, vp_seqc)) {
3753 cache_rev_failed(&reason);
3754 goto out_abort;
3755 }
3756 /*
3757 * Acquire fence provided by vn_seqc_read_any above.
3758 */
3759 if (__predict_false(atomic_load_ptr(&vp->v_cache_dd) != ncp)) {
3760 cache_rev_failed(&reason);
3761 goto out_abort;
3762 }
3763 if (!cache_ncp_canuse(ncp)) {
3764 cache_rev_failed(&reason);
3765 goto out_abort;
3766 }
3767 vp = tvp;
3768 vp_seqc = tvp_seqc;
3769 if (vp == rdir || vp == rootvnode)
3770 break;
3771 }
3772 out_ok:
3773 vfs_smr_exit();
3774 *retbuf = buf + *buflen;
3775 *buflen = orig_buflen - *buflen + addend;
3776 SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf);
3777 return (0);
3778
3779 out_abort:
3780 *buflen = orig_buflen;
3781 SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i);
3782 vfs_smr_exit();
3783 return (error);
3784 }
3785
3786 static int
3787 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
3788 size_t *buflen)
3789 {
3790 size_t orig_buflen, addend;
3791 int error;
3792
3793 if (*buflen < 2)
3794 return (EINVAL);
3795
3796 orig_buflen = *buflen;
3797
3798 vref(vp);
3799 addend = 0;
3800 if (vp->v_type != VDIR) {
3801 *buflen -= 1;
3802 buf[*buflen] = '\0';
3803 error = vn_vptocnp(&vp, buf, buflen);
3804 if (error)
3805 return (error);
3806 if (*buflen == 0) {
3807 vrele(vp);
3808 return (ENOMEM);
3809 }
3810 *buflen -= 1;
3811 buf[*buflen] = '/';
3812 addend = orig_buflen - *buflen;
3813 }
3814
3815 return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, addend));
3816 }
3817
3818 /*
3819 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
3820 *
3821 * Since the namecache does not track hardlinks, the caller is expected to
3822 * first look up the target vnode with WANTPARENT flag passed to namei to get
3823 * dvp and vp.
3824 *
3825 * Then we have 2 cases:
3826 * - if the found vnode is a directory, the path can be constructed just by
3827 * following names up the chain
3828 * - otherwise we populate the buffer with the saved name and start resolving
3829 * from the parent
3830 */
3831 int
3832 vn_fullpath_hardlink(struct vnode *vp, struct vnode *dvp,
3833 const char *hrdl_name, size_t hrdl_name_length,
3834 char **retbuf, char **freebuf, size_t *buflen)
3835 {
3836 char *buf, *tmpbuf;
3837 struct pwd *pwd;
3838 size_t addend;
3839 int error;
3840 __enum_uint8(vtype) type;
3841
3842 if (*buflen < 2)
3843 return (EINVAL);
3844 if (*buflen > MAXPATHLEN)
3845 *buflen = MAXPATHLEN;
3846
3847 buf = malloc(*buflen, M_TEMP, M_WAITOK);
3848
3849 addend = 0;
3850
3851 /*
3852 * Check for VBAD to work around the vp_crossmp bug in lookup().
3853 *
3854 * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be
3855 * set to mount point's root vnode while ni_dvp will be vp_crossmp.
3856 * If the type is VDIR (like in this very case) we can skip looking
3857 * at ni_dvp in the first place. However, since vnodes get passed here
3858 * unlocked the target may transition to doomed state (type == VBAD)
3859 * before we get to evaluate the condition. If this happens, we will
3860 * populate part of the buffer and descend to vn_fullpath_dir with
3861 * vp == vp_crossmp. Prevent the problem by checking for VBAD.
3862 */
3863 type = atomic_load_8(&vp->v_type);
3864 if (type == VBAD) {
3865 error = ENOENT;
3866 goto out_bad;
3867 }
3868 if (type != VDIR) {
3869 addend = hrdl_name_length + 2;
3870 if (*buflen < addend) {
3871 error = ENOMEM;
3872 goto out_bad;
3873 }
3874 *buflen -= addend;
3875 tmpbuf = buf + *buflen;
3876 tmpbuf[0] = '/';
3877 memcpy(&tmpbuf[1], hrdl_name, hrdl_name_length);
3878 tmpbuf[addend - 1] = '\0';
3879 vp = dvp;
3880 }
3881
3882 vfs_smr_enter();
3883 pwd = pwd_get_smr();
3884 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3885 addend);
3886 VFS_SMR_ASSERT_NOT_ENTERED();
3887 if (error < 0) {
3888 pwd = pwd_hold(curthread);
3889 vref(vp);
3890 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3891 addend);
3892 pwd_drop(pwd);
3893 }
3894 if (error != 0)
3895 goto out_bad;
3896
3897 *freebuf = buf;
3898
3899 return (0);
3900 out_bad:
3901 free(buf, M_TEMP);
3902 return (error);
3903 }
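/*
 * Illustrative only: a hedged sketch of how a caller might feed
 * vn_fullpath_hardlink(), assuming "example_path" names an existing file.
 * The lookup is performed with WANTPARENT so that both ni_dvp and ni_vp are
 * available, and the final component saved in the componentname is handed
 * over together with them.
 *
 *	struct nameidata nd;
 *	char *retbuf, *freebuf;
 *	size_t buflen;
 *	int error;
 *
 *	NDINIT(&nd, LOOKUP, FOLLOW | WANTPARENT, UIO_SYSSPACE, example_path);
 *	error = namei(&nd);
 *	if (error == 0) {
 *		buflen = MAXPATHLEN;
 *		error = vn_fullpath_hardlink(nd.ni_vp, nd.ni_dvp,
 *		    nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen,
 *		    &retbuf, &freebuf, &buflen);
 *		if (error == 0)
 *			free(freebuf, M_TEMP);
 *		vrele(nd.ni_dvp);
 *		vrele(nd.ni_vp);
 *		NDFREE_PNBUF(&nd);
 *	}
 */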
3904
3905 struct vnode *
3906 vn_dir_dd_ino(struct vnode *vp)
3907 {
3908 struct namecache *ncp;
3909 struct vnode *ddvp;
3910 struct mtx *vlp;
3911 enum vgetstate vs;
3912
3913 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
3914 vlp = VP2VNODELOCK(vp);
3915 mtx_lock(vlp);
3916 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
3917 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
3918 continue;
3919 ddvp = ncp->nc_dvp;
3920 vs = vget_prep(ddvp);
3921 mtx_unlock(vlp);
3922 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
3923 return (NULL);
3924 return (ddvp);
3925 }
3926 mtx_unlock(vlp);
3927 return (NULL);
3928 }
3929
3930 int
3931 vn_commname(struct vnode *vp, char *buf, u_int buflen)
3932 {
3933 struct namecache *ncp;
3934 struct mtx *vlp;
3935 int l;
3936
3937 vlp = VP2VNODELOCK(vp);
3938 mtx_lock(vlp);
3939 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
3940 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
3941 break;
3942 if (ncp == NULL) {
3943 mtx_unlock(vlp);
3944 return (ENOENT);
3945 }
3946 l = min(ncp->nc_nlen, buflen - 1);
3947 memcpy(buf, ncp->nc_name, l);
3948 mtx_unlock(vlp);
3949 buf[l] = '\0';
3950 return (0);
3951 }
3952
3953 /*
3954 * This function updates the path string to the vnode's full global path
3955 * and checks the size of the new path string against the pathlen argument.
3956 *
3957 * Requires a locked, referenced vnode.
3958 * Vnode is re-locked on success or ENODEV, otherwise unlocked.
3959 *
3960 * If vp is a directory, the call to vn_fullpath_global() always succeeds
3961 * because it falls back to the ".." lookup if the namecache lookup fails.
3962 */
3963 int
3964 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
3965 u_int pathlen)
3966 {
3967 struct nameidata nd;
3968 struct vnode *vp1;
3969 char *rpath, *fbuf;
3970 int error;
3971
3972 ASSERT_VOP_ELOCKED(vp, __func__);
3973
3974 /* Construct global filesystem path from vp. */
3975 VOP_UNLOCK(vp);
3976 error = vn_fullpath_global(vp, &rpath, &fbuf);
3977
3978 if (error != 0) {
3979 vrele(vp);
3980 return (error);
3981 }
3982
3983 if (strlen(rpath) >= pathlen) {
3984 vrele(vp);
3985 error = ENAMETOOLONG;
3986 goto out;
3987 }
3988
3989 /*
3990 * Re-lookup the vnode by path to detect a possible rename.
3991 * As a side effect, the vnode is relocked.
3992 * If vnode was renamed, return ENOENT.
3993 */
3994 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, path);
3995 error = namei(&nd);
3996 if (error != 0) {
3997 vrele(vp);
3998 goto out;
3999 }
4000 NDFREE_PNBUF(&nd);
4001 vp1 = nd.ni_vp;
4002 vrele(vp);
4003 if (vp1 == vp)
4004 strcpy(path, rpath);
4005 else {
4006 vput(vp1);
4007 error = ENOENT;
4008 }
4009
4010 out:
4011 free(fbuf, M_TEMP);
4012 return (error);
4013 }
4014
4015 /*
4016 * This is similar to vn_path_to_global_path but also handles regular
4017 * files, which may not be present in the name cache.
4018 *
4019 * Requires a locked, referenced vnode.
4020 * Vnode is re-locked on success or ENODEV, otherwise unlocked.
4021 */
4022 int
4023 vn_path_to_global_path_hardlink(struct thread *td, struct vnode *vp,
4024 struct vnode *dvp, char *path, u_int pathlen, const char *leaf_name,
4025 size_t leaf_length)
4026 {
4027 struct nameidata nd;
4028 struct vnode *vp1;
4029 char *rpath, *fbuf;
4030 size_t len;
4031 int error;
4032
4033 ASSERT_VOP_ELOCKED(vp, __func__);
4034
4035 /*
4036 * Construct global filesystem path from dvp, vp and leaf
4037 * name.
4038 */
4039 VOP_UNLOCK(vp);
4040 len = pathlen;
4041 error = vn_fullpath_hardlink(vp, dvp, leaf_name, leaf_length,
4042 &rpath, &fbuf, &len);
4043
4044 if (error != 0) {
4045 vrele(vp);
4046 return (error);
4047 }
4048
4049 if (strlen(rpath) >= pathlen) {
4050 vrele(vp);
4051 error = ENAMETOOLONG;
4052 goto out;
4053 }
4054
4055 /*
4056 * Re-lookup the vnode by path to detect a possible rename.
4057 * As a side effect, the vnode is relocked.
4058 * If vnode was renamed, return ENOENT.
4059 */
4060 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, path);
4061 error = namei(&nd);
4062 if (error != 0) {
4063 vrele(vp);
4064 goto out;
4065 }
4066 NDFREE_PNBUF(&nd);
4067 vp1 = nd.ni_vp;
4068 vrele(vp);
4069 if (vp1 == vp)
4070 strcpy(path, rpath);
4071 else {
4072 vput(vp1);
4073 error = ENOENT;
4074 }
4075
4076 out:
4077 free(fbuf, M_TEMP);
4078 return (error);
4079 }
4080
4081 void
4082 cache_vop_inotify(struct vnode *vp, int event, uint32_t cookie)
4083 {
4084 struct mtx *vlp;
4085 struct namecache *ncp;
4086 int isdir;
4087 bool logged, self;
4088
4089 isdir = vp->v_type == VDIR ? IN_ISDIR : 0;
4090 self = (vn_irflag_read(vp) & VIRF_INOTIFY) != 0 &&
4091 (vp->v_type != VDIR || (event & ~_IN_DIR_EVENTS) != 0);
4092
4093 if (self) {
4094 int selfevent;
4095
4096 if (event == _IN_ATTRIB_LINKCOUNT)
4097 selfevent = IN_ATTRIB;
4098 else
4099 selfevent = event;
4100 inotify_log(vp, NULL, 0, selfevent | isdir, cookie);
4101 }
4102 if ((event & IN_ALL_EVENTS) == 0)
4103 return;
4104
4105 logged = false;
4106 vlp = VP2VNODELOCK(vp);
4107 mtx_lock(vlp);
4108 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) {
4109 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
4110 continue;
4111 if ((vn_irflag_read(ncp->nc_dvp) & VIRF_INOTIFY) != 0) {
4112 /*
4113 * XXX-MJ if the vnode has two links in the same
4114 * dir, we'll log the same event twice.
4115 */
4116 inotify_log(ncp->nc_dvp, ncp->nc_name, ncp->nc_nlen,
4117 event | isdir, cookie);
4118 logged = true;
4119 }
4120 }
4121 if (!logged && (vn_irflag_read(vp) & VIRF_INOTIFY_PARENT) != 0) {
4122 /*
4123 * We didn't find a watched directory that contains this vnode,
4124 * so stop calling VOP_INOTIFY for operations on the vnode.
4125 */
4126 vn_irflag_unset(vp, VIRF_INOTIFY_PARENT);
4127 }
4128 mtx_unlock(vlp);
4129 }
4130
4131 #ifdef DDB
4132 static void
4133 db_print_vpath(struct vnode *vp)
4134 {
4135
4136 while (vp != NULL) {
4137 db_printf("%p: ", vp);
4138 if (vp == rootvnode) {
4139 db_printf("/");
4140 vp = NULL;
4141 } else {
4142 if (vp->v_vflag & VV_ROOT) {
4143 db_printf("<mount point>");
4144 vp = vp->v_mount->mnt_vnodecovered;
4145 } else {
4146 struct namecache *ncp;
4147 char *ncn;
4148 int i;
4149
4150 ncp = TAILQ_FIRST(&vp->v_cache_dst);
4151 if (ncp != NULL) {
4152 ncn = ncp->nc_name;
4153 for (i = 0; i < ncp->nc_nlen; i++)
4154 db_printf("%c", *ncn++);
4155 vp = ncp->nc_dvp;
4156 } else {
4157 vp = NULL;
4158 }
4159 }
4160 }
4161 db_printf("\n");
4162 }
4163
4164 return;
4165 }
4166
4167 DB_SHOW_COMMAND(vpath, db_show_vpath)
4168 {
4169 struct vnode *vp;
4170
4171 if (!have_addr) {
4172 db_printf("usage: show vpath <struct vnode *>\n");
4173 return;
4174 }
4175
4176 vp = (struct vnode *)addr;
4177 db_print_vpath(vp);
4178 }
4179
4180 #endif
4181
4182 static int cache_fast_lookup = 1;
4183
4184 #define CACHE_FPL_FAILED -2020
4185
4186 static int
4187 cache_vop_bad_vexec(struct vop_fplookup_vexec_args *v)
4188 {
4189 vn_printf(v->a_vp, "no proper vop_fplookup_vexec\n");
4190 panic("no proper vop_fplookup_vexec");
4191 }
4192
4193 static int
4194 cache_vop_bad_symlink(struct vop_fplookup_symlink_args *v)
4195 {
4196 vn_printf(v->a_vp, "no proper vop_fplookup_symlink\n");
4197 panic("no proper vop_fplookup_symlink");
4198 }
4199
4200 void
4201 cache_vop_vector_register(struct vop_vector *v)
4202 {
4203 size_t ops;
4204
4205 ops = 0;
4206 if (v->vop_fplookup_vexec != NULL) {
4207 ops++;
4208 }
4209 if (v->vop_fplookup_symlink != NULL) {
4210 ops++;
4211 }
4212
4213 if (ops == 2) {
4214 return;
4215 }
4216
4217 if (ops == 0) {
4218 v->vop_fplookup_vexec = cache_vop_bad_vexec;
4219 v->vop_fplookup_symlink = cache_vop_bad_symlink;
4220 return;
4221 }
4222
4223 printf("%s: invalid vop vector %p -- either all or none fplookup vops "
4224 "need to be provided", __func__, v);
4225 if (v->vop_fplookup_vexec == NULL) {
4226 printf("%s: missing vop_fplookup_vexec\n", __func__);
4227 }
4228 if (v->vop_fplookup_symlink == NULL) {
4229 printf("%s: missing vop_fplookup_symlink\n", __func__);
4230 }
4231 panic("bad vop vector %p", v);
4232 }
4233
4234 #ifdef INVARIANTS
4235 void
4236 cache_validate_vop_vector(struct mount *mp, struct vop_vector *vops)
4237 {
4238 if (mp == NULL)
4239 return;
4240
4241 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
4242 return;
4243
4244 if (vops->vop_fplookup_vexec == NULL ||
4245 vops->vop_fplookup_vexec == cache_vop_bad_vexec)
4246 panic("bad vop_fplookup_vexec on vector %p for filesystem %s",
4247 vops, mp->mnt_vfc->vfc_name);
4248
4249 if (vops->vop_fplookup_symlink == NULL ||
4250 vops->vop_fplookup_symlink == cache_vop_bad_symlink)
4251 panic("bad vop_fplookup_symlink on vector %p for filesystem %s",
4252 vops, mp->mnt_vfc->vfc_name);
4253 }
4254 #endif
4255
4256 void
4257 cache_fast_lookup_enabled_recalc(void)
4258 {
4259 int lookup_flag;
4260 int mac_on;
4261
4262 #ifdef MAC
4263 mac_on = mac_vnode_check_lookup_enabled();
4264 mac_on |= mac_vnode_check_readlink_enabled();
4265 #else
4266 mac_on = 0;
4267 #endif
4268
4269 lookup_flag = atomic_load_int(&cache_fast_lookup);
4270 if (lookup_flag && !mac_on) {
4271 atomic_store_char(&cache_fast_lookup_enabled, true);
4272 } else {
4273 atomic_store_char(&cache_fast_lookup_enabled, false);
4274 }
4275 }
4276
4277 static int
4278 syscal_vfs_cache_fast_lookup(SYSCTL_HANDLER_ARGS)
4279 {
4280 int error, old;
4281
4282 old = atomic_load_int(&cache_fast_lookup);
4283 error = sysctl_handle_int(oidp, arg1, arg2, req);
4284 if (error == 0 && req->newptr && old != atomic_load_int(&cache_fast_lookup))
4285 cache_fast_lookup_enabled_recalc();
4286 return (error);
4287 }
4288 SYSCTL_PROC(_vfs_cache_param, OID_AUTO, fast_lookup, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_MPSAFE,
4289 &cache_fast_lookup, 0, syscal_vfs_cache_fast_lookup, "IU", "");
4290
4291 /*
4292 * Components of nameidata (or objects it can point to) which may
4293 * need restoring in case fast path lookup fails.
4294 */
4295 struct nameidata_outer {
4296 size_t ni_pathlen;
4297 uint64_t cn_flags;
4298 };
4299
4300 struct nameidata_saved {
4301 #ifdef INVARIANTS
4302 char *cn_nameptr;
4303 size_t ni_pathlen;
4304 #endif
4305 };
4306
4307 #ifdef INVARIANTS
4308 struct cache_fpl_debug {
4309 size_t ni_pathlen;
4310 };
4311 #endif
4312
4313 struct cache_fpl {
4314 struct nameidata *ndp;
4315 struct componentname *cnp;
4316 char *nulchar;
4317 struct vnode *dvp;
4318 struct vnode *tvp;
4319 seqc_t dvp_seqc;
4320 seqc_t tvp_seqc;
4321 uint32_t hash;
4322 struct nameidata_saved snd;
4323 struct nameidata_outer snd_outer;
4324 int line;
4325 enum cache_fpl_status status:8;
4326 bool in_smr;
4327 bool fsearch;
4328 struct pwd **pwd;
4329 #ifdef INVARIANTS
4330 struct cache_fpl_debug debug;
4331 #endif
4332 };
4333
4334 static bool cache_fplookup_mp_supported(struct mount *mp);
4335 static bool cache_fplookup_is_mp(struct cache_fpl *fpl);
4336 static int cache_fplookup_cross_mount(struct cache_fpl *fpl);
4337 static int cache_fplookup_partial_setup(struct cache_fpl *fpl);
4338 static int cache_fplookup_skip_slashes(struct cache_fpl *fpl);
4339 static int cache_fplookup_trailingslash(struct cache_fpl *fpl);
4340 static void cache_fpl_pathlen_dec(struct cache_fpl *fpl);
4341 static void cache_fpl_pathlen_inc(struct cache_fpl *fpl);
4342 static void cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n);
4343 static void cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n);
4344
4345 static void
4346 cache_fpl_cleanup_cnp(struct componentname *cnp)
4347 {
4348
4349 uma_zfree(namei_zone, cnp->cn_pnbuf);
4350 cnp->cn_pnbuf = NULL;
4351 cnp->cn_nameptr = NULL;
4352 }
4353
4354 static struct vnode *
4355 cache_fpl_handle_root(struct cache_fpl *fpl)
4356 {
4357 struct nameidata *ndp;
4358 struct componentname *cnp;
4359
4360 ndp = fpl->ndp;
4361 cnp = fpl->cnp;
4362
4363 MPASS(*(cnp->cn_nameptr) == '/');
4364 cnp->cn_nameptr++;
4365 cache_fpl_pathlen_dec(fpl);
4366
4367 if (__predict_false(*(cnp->cn_nameptr) == '/')) {
4368 do {
4369 cnp->cn_nameptr++;
4370 cache_fpl_pathlen_dec(fpl);
4371 } while (*(cnp->cn_nameptr) == '/');
4372 }
4373
4374 return (ndp->ni_rootdir);
4375 }
4376
4377 static void
4378 cache_fpl_checkpoint_outer(struct cache_fpl *fpl)
4379 {
4380
4381 fpl->snd_outer.ni_pathlen = fpl->ndp->ni_pathlen;
4382 fpl->snd_outer.cn_flags = fpl->ndp->ni_cnd.cn_flags;
4383 }
4384
4385 static void
4386 cache_fpl_checkpoint(struct cache_fpl *fpl)
4387 {
4388
4389 #ifdef INVARIANTS
4390 fpl->snd.cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
4391 fpl->snd.ni_pathlen = fpl->debug.ni_pathlen;
4392 #endif
4393 }
4394
4395 static void
4396 cache_fpl_restore_partial(struct cache_fpl *fpl)
4397 {
4398
4399 fpl->ndp->ni_cnd.cn_flags = fpl->snd_outer.cn_flags;
4400 #ifdef INVARIANTS
4401 fpl->debug.ni_pathlen = fpl->snd.ni_pathlen;
4402 #endif
4403 }
4404
4405 static void
4406 cache_fpl_restore_abort(struct cache_fpl *fpl)
4407 {
4408
4409 cache_fpl_restore_partial(fpl);
4410 /*
4411 * It is 0 on entry by API contract.
4412 */
4413 fpl->ndp->ni_resflags = 0;
4414 fpl->ndp->ni_cnd.cn_nameptr = fpl->ndp->ni_cnd.cn_pnbuf;
4415 fpl->ndp->ni_pathlen = fpl->snd_outer.ni_pathlen;
4416 }
4417
4418 #ifdef INVARIANTS
4419 #define cache_fpl_smr_assert_entered(fpl) ({ \
4420 struct cache_fpl *_fpl = (fpl); \
4421 MPASS(_fpl->in_smr == true); \
4422 VFS_SMR_ASSERT_ENTERED(); \
4423 })
4424 #define cache_fpl_smr_assert_not_entered(fpl) ({ \
4425 struct cache_fpl *_fpl = (fpl); \
4426 MPASS(_fpl->in_smr == false); \
4427 VFS_SMR_ASSERT_NOT_ENTERED(); \
4428 })
4429 static void
4430 cache_fpl_assert_status(struct cache_fpl *fpl)
4431 {
4432
4433 switch (fpl->status) {
4434 case CACHE_FPL_STATUS_UNSET:
4435 __assert_unreachable();
4436 break;
4437 case CACHE_FPL_STATUS_DESTROYED:
4438 case CACHE_FPL_STATUS_ABORTED:
4439 case CACHE_FPL_STATUS_PARTIAL:
4440 case CACHE_FPL_STATUS_HANDLED:
4441 break;
4442 }
4443 }
4444 #else
4445 #define cache_fpl_smr_assert_entered(fpl) do { } while (0)
4446 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
4447 #define cache_fpl_assert_status(fpl) do { } while (0)
4448 #endif
4449
4450 #define cache_fpl_smr_enter_initial(fpl) ({ \
4451 struct cache_fpl *_fpl = (fpl); \
4452 vfs_smr_enter(); \
4453 _fpl->in_smr = true; \
4454 })
4455
4456 #define cache_fpl_smr_enter(fpl) ({ \
4457 struct cache_fpl *_fpl = (fpl); \
4458 MPASS(_fpl->in_smr == false); \
4459 vfs_smr_enter(); \
4460 _fpl->in_smr = true; \
4461 })
4462
4463 #define cache_fpl_smr_exit(fpl) ({ \
4464 struct cache_fpl *_fpl = (fpl); \
4465 MPASS(_fpl->in_smr == true); \
4466 vfs_smr_exit(); \
4467 _fpl->in_smr = false; \
4468 })
4469
4470 static int
4471 cache_fpl_aborted_early_impl(struct cache_fpl *fpl, int line)
4472 {
4473
4474 if (fpl->status != CACHE_FPL_STATUS_UNSET) {
4475 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
4476 ("%s: converting to abort from %d at %d, set at %d\n",
4477 __func__, fpl->status, line, fpl->line));
4478 }
4479 cache_fpl_smr_assert_not_entered(fpl);
4480 fpl->status = CACHE_FPL_STATUS_ABORTED;
4481 fpl->line = line;
4482 return (CACHE_FPL_FAILED);
4483 }
4484
4485 #define cache_fpl_aborted_early(x) cache_fpl_aborted_early_impl((x), __LINE__)
4486
4487 static int __noinline
4488 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
4489 {
4490 struct nameidata *ndp;
4491 struct componentname *cnp;
4492
4493 ndp = fpl->ndp;
4494 cnp = fpl->cnp;
4495
4496 if (fpl->status != CACHE_FPL_STATUS_UNSET) {
4497 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
4498 ("%s: converting to abort from %d at %d, set at %d\n",
4499 __func__, fpl->status, line, fpl->line));
4500 }
4501 fpl->status = CACHE_FPL_STATUS_ABORTED;
4502 fpl->line = line;
4503 if (fpl->in_smr)
4504 cache_fpl_smr_exit(fpl);
4505 cache_fpl_restore_abort(fpl);
4506 /*
4507 * Resolving symlinks overwrites data passed by the caller.
4508 * Let namei know.
4509 */
4510 if (ndp->ni_loopcnt > 0) {
4511 fpl->status = CACHE_FPL_STATUS_DESTROYED;
4512 cache_fpl_cleanup_cnp(cnp);
4513 }
4514 return (CACHE_FPL_FAILED);
4515 }
4516
4517 #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__)
4518
4519 static int __noinline
4520 cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
4521 {
4522
4523 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
4524 ("%s: setting to partial at %d, but already set to %d at %d\n",
4525 __func__, line, fpl->status, fpl->line));
4526 cache_fpl_smr_assert_entered(fpl);
4527 fpl->status = CACHE_FPL_STATUS_PARTIAL;
4528 fpl->line = line;
4529 return (cache_fplookup_partial_setup(fpl));
4530 }
4531
4532 #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__)
4533
4534 static int
4535 cache_fpl_handled_impl(struct cache_fpl *fpl, int line)
4536 {
4537
4538 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
4539 ("%s: setting to handled at %d, but already set to %d at %d\n",
4540 __func__, line, fpl->status, fpl->line));
4541 cache_fpl_smr_assert_not_entered(fpl);
4542 fpl->status = CACHE_FPL_STATUS_HANDLED;
4543 fpl->line = line;
4544 return (0);
4545 }
4546
4547 #define cache_fpl_handled(x) cache_fpl_handled_impl((x), __LINE__)
4548
4549 static int
4550 cache_fpl_handled_error_impl(struct cache_fpl *fpl, int error, int line)
4551 {
4552
4553 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
4554 ("%s: setting to handled at %d, but already set to %d at %d\n",
4555 __func__, line, fpl->status, fpl->line));
4556 MPASS(error != 0);
4557 MPASS(error != CACHE_FPL_FAILED);
4558 cache_fpl_smr_assert_not_entered(fpl);
4559 fpl->status = CACHE_FPL_STATUS_HANDLED;
4560 fpl->line = line;
4561 fpl->dvp = NULL;
4562 fpl->tvp = NULL;
4563 return (error);
4564 }
4565
4566 #define cache_fpl_handled_error(x, e) cache_fpl_handled_error_impl((x), (e), __LINE__)
4567
4568 static bool
4569 cache_fpl_terminated(struct cache_fpl *fpl)
4570 {
4571
4572 return (fpl->status != CACHE_FPL_STATUS_UNSET);
4573 }
4574
4575 #define CACHE_FPL_SUPPORTED_CN_FLAGS \
4576 (NC_NOMAKEENTRY | NC_KEEPPOSENTRY | LOCKLEAF | LOCKPARENT | WANTPARENT | \
4577 FAILIFEXISTS | FOLLOW | EMPTYPATH | LOCKSHARED | ISRESTARTED | WILLBEDIR | \
4578 ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK | OPENREAD | \
4579 OPENWRITE | WANTIOCTLCAPS | NAMEILOOKUP)
4580
4581 #define CACHE_FPL_INTERNAL_CN_FLAGS \
4582 (ISDOTDOT | MAKEENTRY | ISLASTCN)
4583
4584 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
4585 "supported and internal flags overlap");
4586
4587 static bool
4588 cache_fpl_islastcn(struct nameidata *ndp)
4589 {
4590
4591 return (*ndp->ni_next == 0);
4592 }
4593
4594 static bool
4595 cache_fpl_istrailingslash(struct cache_fpl *fpl)
4596 {
4597
4598 MPASS(fpl->nulchar > fpl->cnp->cn_pnbuf);
4599 return (*(fpl->nulchar - 1) == '/');
4600 }
4601
4602 static bool
4603 cache_fpl_isdotdot(struct componentname *cnp)
4604 {
4605
4606 if (cnp->cn_namelen == 2 &&
4607 cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
4608 return (true);
4609 return (false);
4610 }
4611
4612 static bool
4613 cache_can_fplookup(struct cache_fpl *fpl)
4614 {
4615 struct nameidata *ndp;
4616 struct componentname *cnp;
4617 struct thread *td;
4618
4619 ndp = fpl->ndp;
4620 cnp = fpl->cnp;
4621 td = curthread;
4622
4623 if (!atomic_load_char(&cache_fast_lookup_enabled)) {
4624 cache_fpl_aborted_early(fpl);
4625 return (false);
4626 }
4627 if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
4628 cache_fpl_aborted_early(fpl);
4629 return (false);
4630 }
4631 if (IN_CAPABILITY_MODE(td) || CAP_TRACING(td)) {
4632 cache_fpl_aborted_early(fpl);
4633 return (false);
4634 }
4635 if (AUDITING_TD(td)) {
4636 cache_fpl_aborted_early(fpl);
4637 return (false);
4638 }
4639 if (ndp->ni_startdir != NULL) {
4640 cache_fpl_aborted_early(fpl);
4641 return (false);
4642 }
4643 return (true);
4644 }
4645
4646 static int __noinline
4647 cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp)
4648 {
4649 struct nameidata *ndp;
4650 struct componentname *cnp;
4651 int error, flags;
4652
4653 ndp = fpl->ndp;
4654 cnp = fpl->cnp;
4655
4656 error = fgetvp_lookup_smr(ndp, vpp, &flags);
4657 if (__predict_false(error != 0)) {
4658 return (cache_fpl_aborted(fpl));
4659 }
4660 if (__predict_false((flags & O_RESOLVE_BENEATH) != 0)) {
4661 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & RBENEATH) == 0,
4662 "RBENEATH supported by fplookup");
4663 cache_fpl_smr_exit(fpl);
4664 cache_fpl_aborted(fpl);
4665 return (EOPNOTSUPP);
4666 }
4667 fpl->fsearch = (flags & FSEARCH) != 0;
4668 if ((*vpp)->v_type != VDIR) {
4669 if (!((cnp->cn_flags & EMPTYPATH) != 0 && cnp->cn_pnbuf[0] == '\0')) {
4670 cache_fpl_smr_exit(fpl);
4671 return (cache_fpl_handled_error(fpl, ENOTDIR));
4672 }
4673 }
4674 return (0);
4675 }
4676
4677 static int __noinline
4678 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
4679 uint32_t hash)
4680 {
4681 struct componentname *cnp;
4682 struct vnode *dvp;
4683
4684 cnp = fpl->cnp;
4685 dvp = fpl->dvp;
4686
4687 cache_fpl_smr_exit(fpl);
4688 if (cache_neg_promote_cond(dvp, cnp, oncp, hash))
4689 return (cache_fpl_handled_error(fpl, ENOENT));
4690 else
4691 return (cache_fpl_aborted(fpl));
4692 }
4693
4694 /*
4695 * Prepare fallback to the locked lookup while trying to retain the progress.
4696 */
4697 static int __noinline
4698 cache_fplookup_partial_setup(struct cache_fpl *fpl)
4699 {
4700 struct nameidata *ndp;
4701 struct componentname *cnp;
4702 enum vgetstate dvs;
4703 struct vnode *dvp;
4704 struct pwd *pwd;
4705 seqc_t dvp_seqc;
4706
4707 ndp = fpl->ndp;
4708 cnp = fpl->cnp;
4709 pwd = *(fpl->pwd);
4710 dvp = fpl->dvp;
4711 dvp_seqc = fpl->dvp_seqc;
4712
4713 if (!pwd_hold_smr(pwd)) {
4714 return (cache_fpl_aborted(fpl));
4715 }
4716
4717 /*
4718 * Note that seqc is checked before the vnode is locked, so by
4719 * the time regular lookup gets to it, it may have moved.
4720 *
4721 * Ultimately this does not affect correctness, any lookup errors
4722 * are userspace racing with itself. It is guaranteed that any
4723 * path which ultimately gets found could also have been found
4724 * by regular lookup going all the way in absence of concurrent
4725 * modifications.
4726 */
4727 dvs = vget_prep_smr(dvp);
4728 cache_fpl_smr_exit(fpl);
4729 if (__predict_false(dvs == VGET_NONE)) {
4730 pwd_drop(pwd);
4731 return (cache_fpl_aborted(fpl));
4732 }
4733
4734 vget_finish_ref(dvp, dvs);
4735 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4736 vrele(dvp);
4737 pwd_drop(pwd);
4738 return (cache_fpl_aborted(fpl));
4739 }
4740
4741 cache_fpl_restore_partial(fpl);
4742 #ifdef INVARIANTS
4743 if (cnp->cn_nameptr != fpl->snd.cn_nameptr) {
4744 panic("%s: cn_nameptr mismatch (%p != %p) full [%s]\n", __func__,
4745 cnp->cn_nameptr, fpl->snd.cn_nameptr, cnp->cn_pnbuf);
4746 }
4747 #endif
4748
4749 ndp->ni_startdir = dvp;
4750 cnp->cn_flags |= MAKEENTRY;
4751 if (cache_fpl_islastcn(ndp))
4752 cnp->cn_flags |= ISLASTCN;
4753 if (cache_fpl_isdotdot(cnp))
4754 cnp->cn_flags |= ISDOTDOT;
4755
4756 /*
4757 * Skip potential extra slashes parsing did not take care of.
4758 * cache_fplookup_skip_slashes explains the mechanism.
4759 */
4760 if (__predict_false(*(cnp->cn_nameptr) == '/')) {
4761 do {
4762 cnp->cn_nameptr++;
4763 cache_fpl_pathlen_dec(fpl);
4764 } while (*(cnp->cn_nameptr) == '/');
4765 }
4766
4767 ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1;
4768 #ifdef INVARIANTS
4769 if (ndp->ni_pathlen != fpl->debug.ni_pathlen) {
4770 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
4771 __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
4772 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
4773 }
4774 #endif
4775 return (0);
4776 }
4777
4778 static int
4779 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs)
4780 {
4781 struct componentname *cnp;
4782 struct vnode *tvp;
4783 seqc_t tvp_seqc;
4784 int error, lkflags;
4785
4786 cnp = fpl->cnp;
4787 tvp = fpl->tvp;
4788 tvp_seqc = fpl->tvp_seqc;
4789
4790 if ((cnp->cn_flags & LOCKLEAF) != 0) {
4791 lkflags = LK_SHARED;
4792 if ((cnp->cn_flags & LOCKSHARED) == 0)
4793 lkflags = LK_EXCLUSIVE;
4794 error = vget_finish(tvp, lkflags, tvs);
4795 if (__predict_false(error != 0)) {
4796 return (cache_fpl_aborted(fpl));
4797 }
4798 } else {
4799 vget_finish_ref(tvp, tvs);
4800 }
4801
4802 if (!vn_seqc_consistent(tvp, tvp_seqc)) {
4803 if ((cnp->cn_flags & LOCKLEAF) != 0)
4804 vput(tvp);
4805 else
4806 vrele(tvp);
4807 return (cache_fpl_aborted(fpl));
4808 }
4809
4810 return (cache_fpl_handled(fpl));
4811 }
4812
4813 /*
4814 * The caller may want to modify the state of the namecache.
4815 */
4816 static int __noinline
4817 cache_fplookup_final_modifying(struct cache_fpl *fpl)
4818 {
4819 struct nameidata *ndp __diagused;
4820 struct componentname *cnp;
4821 enum vgetstate dvs;
4822 struct vnode *dvp, *tvp;
4823 struct mount *mp;
4824 seqc_t dvp_seqc;
4825 int error;
4826 bool docache;
4827
4828 ndp = fpl->ndp;
4829 cnp = fpl->cnp;
4830 dvp = fpl->dvp;
4831 dvp_seqc = fpl->dvp_seqc;
4832
4833 MPASS(*(cnp->cn_nameptr) != '/');
4834 MPASS(cache_fpl_islastcn(ndp));
4835 if ((cnp->cn_flags & LOCKPARENT) == 0)
4836 MPASS((cnp->cn_flags & WANTPARENT) != 0);
4837 MPASS((cnp->cn_flags & TRAILINGSLASH) == 0);
4838 MPASS(cnp->cn_nameiop == CREATE || cnp->cn_nameiop == DELETE ||
4839 cnp->cn_nameiop == RENAME);
4840 MPASS((cnp->cn_flags & MAKEENTRY) == 0);
4841 MPASS((cnp->cn_flags & ISDOTDOT) == 0);
4842
4843 docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
4844 if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)
4845 docache = false;
4846
4847 /*
4848 * Regular lookup nullifies the slash, which we don't do here.
4849 * Don't take chances with filesystem routines seeing it for
4850 * the last entry.
4851 */
4852 if (cache_fpl_istrailingslash(fpl)) {
4853 return (cache_fpl_partial(fpl));
4854 }
4855
4856 mp = atomic_load_ptr(&dvp->v_mount);
4857 if (__predict_false(mp == NULL)) {
4858 return (cache_fpl_aborted(fpl));
4859 }
4860
4861 if (__predict_false(mp->mnt_flag & MNT_RDONLY)) {
4862 cache_fpl_smr_exit(fpl);
4863 /*
4864 * The original code does not check for CREATE, which
4865 * might be a bug. For now let the old lookup decide.
4866 */
4867 if (cnp->cn_nameiop == CREATE) {
4868 return (cache_fpl_aborted(fpl));
4869 }
4870 return (cache_fpl_handled_error(fpl, EROFS));
4871 }
4872
4873 if (fpl->tvp != NULL && (cnp->cn_flags & FAILIFEXISTS) != 0) {
4874 cache_fpl_smr_exit(fpl);
4875 return (cache_fpl_handled_error(fpl, EEXIST));
4876 }
4877
4878 /*
4879 * Secure access to dvp; check cache_fplookup_partial_setup for
4880 * reasoning.
4881 *
4882 * XXX At least UFS requires its lookup routine to be called for
4883 * the last path component, which leads to some level of complication
4884 * and inefficiency:
4885 * - the target routine always locks the target vnode, but our caller
4886 * may not need it locked
4887 * - some of the VOP machinery asserts that the parent is locked, which
4888 * once more may be not required
4889 *
4890 * TODO: add a flag for filesystems which don't need this.
4891 */
4892 dvs = vget_prep_smr(dvp);
4893 cache_fpl_smr_exit(fpl);
4894 if (__predict_false(dvs == VGET_NONE)) {
4895 return (cache_fpl_aborted(fpl));
4896 }
4897
4898 vget_finish_ref(dvp, dvs);
4899 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4900 vrele(dvp);
4901 return (cache_fpl_aborted(fpl));
4902 }
4903
4904 error = vn_lock(dvp, LK_EXCLUSIVE);
4905 if (__predict_false(error != 0)) {
4906 vrele(dvp);
4907 return (cache_fpl_aborted(fpl));
4908 }
4909
4910 tvp = NULL;
4911 cnp->cn_flags |= ISLASTCN;
4912 if (docache)
4913 cnp->cn_flags |= MAKEENTRY;
4914 if (cache_fpl_isdotdot(cnp))
4915 cnp->cn_flags |= ISDOTDOT;
4916 cnp->cn_lkflags = LK_EXCLUSIVE;
4917 error = VOP_LOOKUP(dvp, &tvp, cnp);
4918 switch (error) {
4919 case EJUSTRETURN:
4920 case 0:
4921 break;
4922 case ENOTDIR:
4923 case ENOENT:
4924 vput(dvp);
4925 return (cache_fpl_handled_error(fpl, error));
4926 default:
4927 vput(dvp);
4928 return (cache_fpl_aborted(fpl));
4929 }
4930
4931 fpl->tvp = tvp;
4932
4933 if (tvp == NULL) {
4934 MPASS(error == EJUSTRETURN);
4935 if ((cnp->cn_flags & LOCKPARENT) == 0) {
4936 VOP_UNLOCK(dvp);
4937 }
4938 return (cache_fpl_handled(fpl));
4939 }
4940
4941 /*
4942 * There are very hairy corner cases concerning various flag combinations
4943 * and locking state. In particular here we only hold one lock instead of
4944 * two.
4945 *
4946 * Skip the complexity as it is of no significance for normal workloads.
4947 */
4948 if (__predict_false(tvp == dvp)) {
4949 vput(dvp);
4950 vrele(tvp);
4951 return (cache_fpl_aborted(fpl));
4952 }
4953
4954 /*
4955 * If they want the symlink itself we are fine, but if they want to
4956 * follow it regular lookup has to be engaged.
4957 */
4958 if (tvp->v_type == VLNK) {
4959 if ((cnp->cn_flags & FOLLOW) != 0) {
4960 vput(dvp);
4961 vput(tvp);
4962 return (cache_fpl_aborted(fpl));
4963 }
4964 }
4965
4966 /*
4967 * Since we expect this to be the terminal vnode it should almost never
4968 * be a mount point.
4969 */
4970 if (__predict_false(cache_fplookup_is_mp(fpl))) {
4971 vput(dvp);
4972 vput(tvp);
4973 return (cache_fpl_aborted(fpl));
4974 }
4975
4976 if ((cnp->cn_flags & FAILIFEXISTS) != 0) {
4977 vput(dvp);
4978 vput(tvp);
4979 return (cache_fpl_handled_error(fpl, EEXIST));
4980 }
4981
4982 if ((cnp->cn_flags & LOCKLEAF) == 0) {
4983 VOP_UNLOCK(tvp);
4984 }
4985
4986 if ((cnp->cn_flags & LOCKPARENT) == 0) {
4987 VOP_UNLOCK(dvp);
4988 }
4989
4990 return (cache_fpl_handled(fpl));
4991 }
4992
4993 static int __noinline
4994 cache_fplookup_modifying(struct cache_fpl *fpl)
4995 {
4996 struct nameidata *ndp;
4997
4998 ndp = fpl->ndp;
4999
5000 if (!cache_fpl_islastcn(ndp)) {
5001 return (cache_fpl_partial(fpl));
5002 }
5003 return (cache_fplookup_final_modifying(fpl));
5004 }
5005
5006 static int __noinline
5007 cache_fplookup_final_withparent(struct cache_fpl *fpl)
5008 {
5009 struct componentname *cnp;
5010 enum vgetstate dvs, tvs;
5011 struct vnode *dvp, *tvp;
5012 seqc_t dvp_seqc;
5013 int error;
5014
5015 cnp = fpl->cnp;
5016 dvp = fpl->dvp;
5017 dvp_seqc = fpl->dvp_seqc;
5018 tvp = fpl->tvp;
5019
5020 MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0);
5021
5022 /*
5023 * This is less efficient than it can be for simplicity.
5024 */
5025 dvs = vget_prep_smr(dvp);
5026 if (__predict_false(dvs == VGET_NONE)) {
5027 return (cache_fpl_aborted(fpl));
5028 }
5029 tvs = vget_prep_smr(tvp);
5030 if (__predict_false(tvs == VGET_NONE)) {
5031 cache_fpl_smr_exit(fpl);
5032 vget_abort(dvp, dvs);
5033 return (cache_fpl_aborted(fpl));
5034 }
5035
5036 cache_fpl_smr_exit(fpl);
5037
5038 if ((cnp->cn_flags & LOCKPARENT) != 0) {
5039 error = vget_finish(dvp, LK_EXCLUSIVE, dvs);
5040 if (__predict_false(error != 0)) {
5041 vget_abort(tvp, tvs);
5042 return (cache_fpl_aborted(fpl));
5043 }
5044 } else {
5045 vget_finish_ref(dvp, dvs);
5046 }
5047
5048 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
5049 vget_abort(tvp, tvs);
5050 if ((cnp->cn_flags & LOCKPARENT) != 0)
5051 vput(dvp);
5052 else
5053 vrele(dvp);
5054 return (cache_fpl_aborted(fpl));
5055 }
5056
5057 error = cache_fplookup_final_child(fpl, tvs);
5058 if (__predict_false(error != 0)) {
5059 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED ||
5060 fpl->status == CACHE_FPL_STATUS_DESTROYED);
5061 if ((cnp->cn_flags & LOCKPARENT) != 0)
5062 vput(dvp);
5063 else
5064 vrele(dvp);
5065 return (error);
5066 }
5067
5068 MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED);
5069 return (0);
5070 }
5071
5072 static int
5073 cache_fplookup_final(struct cache_fpl *fpl)
5074 {
5075 struct componentname *cnp;
5076 enum vgetstate tvs;
5077 struct vnode *dvp, *tvp;
5078 seqc_t dvp_seqc;
5079
5080 cnp = fpl->cnp;
5081 dvp = fpl->dvp;
5082 dvp_seqc = fpl->dvp_seqc;
5083 tvp = fpl->tvp;
5084
5085 MPASS(*(cnp->cn_nameptr) != '/');
5086
5087 if (cnp->cn_nameiop != LOOKUP) {
5088 return (cache_fplookup_final_modifying(fpl));
5089 }
5090
5091 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0)
5092 return (cache_fplookup_final_withparent(fpl));
5093
5094 tvs = vget_prep_smr(tvp);
5095 if (__predict_false(tvs == VGET_NONE)) {
5096 return (cache_fpl_partial(fpl));
5097 }
5098
5099 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
5100 cache_fpl_smr_exit(fpl);
5101 vget_abort(tvp, tvs);
5102 return (cache_fpl_aborted(fpl));
5103 }
5104
5105 cache_fpl_smr_exit(fpl);
5106 return (cache_fplookup_final_child(fpl, tvs));
5107 }
5108
5109 /*
5110 * Comment from locked lookup:
5111 * Check for degenerate name (e.g. / or "") which is a way of talking about a
5112 * directory, e.g. like "/." or ".".
5113 */
5114 static int __noinline
5115 cache_fplookup_degenerate(struct cache_fpl *fpl)
5116 {
5117 struct componentname *cnp;
5118 struct vnode *dvp;
5119 enum vgetstate dvs;
5120 int error, lkflags;
5121 #ifdef INVARIANTS
5122 char *cp;
5123 #endif
5124
5125 fpl->tvp = fpl->dvp;
5126 fpl->tvp_seqc = fpl->dvp_seqc;
5127
5128 cnp = fpl->cnp;
5129 dvp = fpl->dvp;
5130
5131 #ifdef INVARIANTS
5132 for (cp = cnp->cn_pnbuf; *cp != '\0'; cp++) {
5133 KASSERT(*cp == '/',
5134 ("%s: encountered non-slash; string [%s]\n", __func__,
5135 cnp->cn_pnbuf));
5136 }
5137 #endif
5138
5139 if (__predict_false(cnp->cn_nameiop != LOOKUP)) {
5140 cache_fpl_smr_exit(fpl);
5141 return (cache_fpl_handled_error(fpl, EISDIR));
5142 }
5143
5144 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) {
5145 return (cache_fplookup_final_withparent(fpl));
5146 }
5147
5148 dvs = vget_prep_smr(dvp);
5149 cache_fpl_smr_exit(fpl);
5150 if (__predict_false(dvs == VGET_NONE)) {
5151 return (cache_fpl_aborted(fpl));
5152 }
5153
5154 if ((cnp->cn_flags & LOCKLEAF) != 0) {
5155 lkflags = LK_SHARED;
5156 if ((cnp->cn_flags & LOCKSHARED) == 0)
5157 lkflags = LK_EXCLUSIVE;
5158 error = vget_finish(dvp, lkflags, dvs);
5159 if (__predict_false(error != 0)) {
5160 return (cache_fpl_aborted(fpl));
5161 }
5162 } else {
5163 vget_finish_ref(dvp, dvs);
5164 }
5165 return (cache_fpl_handled(fpl));
5166 }
5167
5168 static int __noinline
5169 cache_fplookup_emptypath(struct cache_fpl *fpl)
5170 {
5171 struct nameidata *ndp;
5172 struct componentname *cnp;
5173 enum vgetstate tvs;
5174 struct vnode *tvp;
5175 int error, lkflags;
5176
5177 fpl->tvp = fpl->dvp;
5178 fpl->tvp_seqc = fpl->dvp_seqc;
5179
5180 ndp = fpl->ndp;
5181 cnp = fpl->cnp;
5182 tvp = fpl->tvp;
5183
5184 MPASS(*cnp->cn_pnbuf == '\0');
5185
5186 if (__predict_false((cnp->cn_flags & EMPTYPATH) == 0)) {
5187 cache_fpl_smr_exit(fpl);
5188 return (cache_fpl_handled_error(fpl, ENOENT));
5189 }
5190
5191 MPASS((cnp->cn_flags & (LOCKPARENT | WANTPARENT)) == 0);
5192
5193 tvs = vget_prep_smr(tvp);
5194 cache_fpl_smr_exit(fpl);
5195 if (__predict_false(tvs == VGET_NONE)) {
5196 return (cache_fpl_aborted(fpl));
5197 }
5198
5199 if ((cnp->cn_flags & LOCKLEAF) != 0) {
5200 lkflags = LK_SHARED;
5201 if ((cnp->cn_flags & LOCKSHARED) == 0)
5202 lkflags = LK_EXCLUSIVE;
5203 error = vget_finish(tvp, lkflags, tvs);
5204 if (__predict_false(error != 0)) {
5205 return (cache_fpl_aborted(fpl));
5206 }
5207 } else {
5208 vget_finish_ref(tvp, tvs);
5209 }
5210
5211 ndp->ni_resflags |= NIRES_EMPTYPATH;
5212 return (cache_fpl_handled(fpl));
5213 }
5214
5215 static int __noinline
5216 cache_fplookup_noentry(struct cache_fpl *fpl)
5217 {
5218 struct nameidata *ndp;
5219 struct componentname *cnp;
5220 enum vgetstate dvs;
5221 struct vnode *dvp, *tvp;
5222 seqc_t dvp_seqc;
5223 int error;
5224
5225 ndp = fpl->ndp;
5226 cnp = fpl->cnp;
5227 dvp = fpl->dvp;
5228 dvp_seqc = fpl->dvp_seqc;
5229
5230 MPASS((cnp->cn_flags & MAKEENTRY) == 0);
5231 MPASS((cnp->cn_flags & ISDOTDOT) == 0);
5232 if (cnp->cn_nameiop == LOOKUP)
5233 MPASS((cnp->cn_flags & NOCACHE) == 0);
5234 MPASS(!cache_fpl_isdotdot(cnp));
5235
5236 /*
5237 * Hack: delayed name len checking.
5238 */
5239 if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
5240 cache_fpl_smr_exit(fpl);
5241 return (cache_fpl_handled_error(fpl, ENAMETOOLONG));
5242 }
5243
5244 if (cnp->cn_nameptr[0] == '/') {
5245 return (cache_fplookup_skip_slashes(fpl));
5246 }
5247
5248 if (cnp->cn_pnbuf[0] == '\0') {
5249 return (cache_fplookup_emptypath(fpl));
5250 }
5251
5252 if (cnp->cn_nameptr[0] == '\0') {
5253 if (fpl->tvp == NULL) {
5254 return (cache_fplookup_degenerate(fpl));
5255 }
5256 return (cache_fplookup_trailingslash(fpl));
5257 }
5258
5259 if (cnp->cn_nameiop != LOOKUP) {
5260 fpl->tvp = NULL;
5261 return (cache_fplookup_modifying(fpl));
5262 }
5263
5264 /*
5265 * Only try to fill in the component if it is the last one,
5266 * otherwise not only may there be several to handle, but the
5267 * walk may also be complicated.
5268 */
5269 if (!cache_fpl_islastcn(ndp)) {
5270 return (cache_fpl_partial(fpl));
5271 }
5272
5273 /*
5274 * Regular lookup nullifies the slash, which we don't do here.
5275 * Don't take chances with filesystem routines seeing it for
5276 * the last entry.
5277 */
5278 if (cache_fpl_istrailingslash(fpl)) {
5279 return (cache_fpl_partial(fpl));
5280 }
5281
5282 /*
5283 * Secure access to dvp; check cache_fplookup_partial_setup for
5284 * reasoning.
5285 */
5286 dvs = vget_prep_smr(dvp);
5287 cache_fpl_smr_exit(fpl);
5288 if (__predict_false(dvs == VGET_NONE)) {
5289 return (cache_fpl_aborted(fpl));
5290 }
5291
5292 vget_finish_ref(dvp, dvs);
5293 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
5294 vrele(dvp);
5295 return (cache_fpl_aborted(fpl));
5296 }
5297
5298 error = vn_lock(dvp, LK_SHARED);
5299 if (__predict_false(error != 0)) {
5300 vrele(dvp);
5301 return (cache_fpl_aborted(fpl));
5302 }
5303
5304 tvp = NULL;
5305 /*
5306 * TODO: provide variants which don't require locking either vnode.
5307 */
5308 cnp->cn_flags |= ISLASTCN | MAKEENTRY;
5309 cnp->cn_lkflags = LK_SHARED;
5310 if ((cnp->cn_flags & LOCKSHARED) == 0) {
5311 cnp->cn_lkflags = LK_EXCLUSIVE;
5312 }
5313 error = VOP_LOOKUP(dvp, &tvp, cnp);
5314 switch (error) {
5315 case EJUSTRETURN:
5316 case 0:
5317 break;
5318 case ENOTDIR:
5319 case ENOENT:
5320 vput(dvp);
5321 return (cache_fpl_handled_error(fpl, error));
5322 default:
5323 vput(dvp);
5324 return (cache_fpl_aborted(fpl));
5325 }
5326
5327 fpl->tvp = tvp;
5328
5329 if (tvp == NULL) {
5330 MPASS(error == EJUSTRETURN);
5331 if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) {
5332 vput(dvp);
5333 } else if ((cnp->cn_flags & LOCKPARENT) == 0) {
5334 VOP_UNLOCK(dvp);
5335 }
5336 return (cache_fpl_handled(fpl));
5337 }
5338
5339 if (tvp->v_type == VLNK) {
5340 if ((cnp->cn_flags & FOLLOW) != 0) {
5341 vput(dvp);
5342 vput(tvp);
5343 return (cache_fpl_aborted(fpl));
5344 }
5345 }
5346
5347 if (__predict_false(cache_fplookup_is_mp(fpl))) {
5348 vput(dvp);
5349 vput(tvp);
5350 return (cache_fpl_aborted(fpl));
5351 }
5352
5353 if ((cnp->cn_flags & LOCKLEAF) == 0) {
5354 VOP_UNLOCK(tvp);
5355 }
5356
5357 if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) {
5358 vput(dvp);
5359 } else if ((cnp->cn_flags & LOCKPARENT) == 0) {
5360 VOP_UNLOCK(dvp);
5361 }
5362 return (cache_fpl_handled(fpl));
5363 }
5364
5365 static int __noinline
5366 cache_fplookup_dot(struct cache_fpl *fpl)
5367 {
5368 int error;
5369
5370 MPASS(!seqc_in_modify(fpl->dvp_seqc));
5371
5372 if (__predict_false(fpl->dvp->v_type != VDIR)) {
5373 cache_fpl_smr_exit(fpl);
5374 return (cache_fpl_handled_error(fpl, ENOTDIR));
5375 }
5376
5377 /*
5378 * Just re-assign the value. seqc will be checked later for the first
5379 * non-dot path component in line and/or before deciding to return the
5380 * vnode.
5381 */
5382 fpl->tvp = fpl->dvp;
5383 fpl->tvp_seqc = fpl->dvp_seqc;
5384
5385 SDT_PROBE3(vfs, namecache, lookup, hit, fpl->dvp, ".", fpl->dvp);
5386
5387 error = 0;
5388 if (cache_fplookup_is_mp(fpl)) {
5389 error = cache_fplookup_cross_mount(fpl);
5390 }
5391 return (error);
5392 }
5393
5394 static int __noinline
5395 cache_fplookup_dotdot(struct cache_fpl *fpl)
5396 {
5397 struct nameidata *ndp;
5398 struct namecache *ncp;
5399 struct vnode *dvp;
5400 u_char nc_flag;
5401
5402 ndp = fpl->ndp;
5403 dvp = fpl->dvp;
5404
5405 MPASS(cache_fpl_isdotdot(fpl->cnp));
5406
5407 /*
5408 * XXX this is racy the same way regular lookup is
5409 */
5410 if (vfs_lookup_isroot(ndp, dvp)) {
5411 fpl->tvp = dvp;
5412 fpl->tvp_seqc = vn_seqc_read_any(dvp);
5413 if (seqc_in_modify(fpl->tvp_seqc)) {
5414 return (cache_fpl_aborted(fpl));
5415 }
5416 return (0);
5417 }
5418
5419 if ((dvp->v_vflag & VV_ROOT) != 0) {
5420 /*
5421 * TODO
5422 * The opposite of climb mount is needed here.
5423 */
5424 return (cache_fpl_partial(fpl));
5425 }
5426
5427 if (__predict_false(dvp->v_type != VDIR)) {
5428 cache_fpl_smr_exit(fpl);
5429 return (cache_fpl_handled_error(fpl, ENOTDIR));
5430 }
5431
5432 ncp = atomic_load_consume_ptr(&dvp->v_cache_dd);
5433 if (ncp == NULL) {
5434 return (cache_fpl_aborted(fpl));
5435 }
5436
5437 nc_flag = atomic_load_char(&ncp->nc_flag);
5438 if ((nc_flag & NCF_ISDOTDOT) != 0) {
5439 if ((nc_flag & NCF_NEGATIVE) != 0)
5440 return (cache_fpl_aborted(fpl));
5441 fpl->tvp = ncp->nc_vp;
5442 } else {
5443 fpl->tvp = ncp->nc_dvp;
5444 }
5445
5446 fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp);
5447 if (seqc_in_modify(fpl->tvp_seqc)) {
5448 return (cache_fpl_partial(fpl));
5449 }
5450
5451 /*
5452 * Acquire fence provided by vn_seqc_read_any above.
5453 */
5454 if (__predict_false(atomic_load_ptr(&dvp->v_cache_dd) != ncp)) {
5455 return (cache_fpl_aborted(fpl));
5456 }
5457
5458 if (!cache_ncp_canuse(ncp)) {
5459 return (cache_fpl_aborted(fpl));
5460 }
5461
5462 return (0);
5463 }
5464
5465 static int __noinline
5466 cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash)
5467 {
5468 u_char nc_flag __diagused;
5469 bool neg_promote;
5470
5471 #ifdef INVARIANTS
5472 nc_flag = atomic_load_char(&ncp->nc_flag);
5473 MPASS((nc_flag & NCF_NEGATIVE) != 0);
5474 #endif
5475 /*
5476 * If the caller wants to create an entry we need to replace this one.
5477 */
5478 if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) {
5479 fpl->tvp = NULL;
5480 return (cache_fplookup_modifying(fpl));
5481 }
5482 neg_promote = cache_neg_hit_prep(ncp);
5483 if (!cache_fpl_neg_ncp_canuse(ncp)) {
5484 cache_neg_hit_abort(ncp);
5485 return (cache_fpl_partial(fpl));
5486 }
5487 if (neg_promote) {
5488 return (cache_fplookup_negative_promote(fpl, ncp, hash));
5489 }
5490 cache_neg_hit_finish(ncp);
5491 cache_fpl_smr_exit(fpl);
5492 return (cache_fpl_handled_error(fpl, ENOENT));
5493 }
5494
5495 /*
5496 * Resolve a symlink. Called by filesystem-specific routines.
5497 *
5498 * Code flow is:
5499 * ... -> cache_fplookup_symlink -> VOP_FPLOOKUP_SYMLINK -> cache_symlink_resolve
5500 */
5501 int
5502 cache_symlink_resolve(struct cache_fpl *fpl, const char *string, size_t len)
5503 {
5504 struct nameidata *ndp;
5505 struct componentname *cnp;
5506 size_t adjust;
5507
5508 ndp = fpl->ndp;
5509 cnp = fpl->cnp;
5510
5511 if (__predict_false(len == 0)) {
5512 return (ENOENT);
5513 }
5514
5515 if (__predict_false(len > MAXPATHLEN - 2)) {
5516 if (cache_fpl_istrailingslash(fpl)) {
5517 return (EAGAIN);
5518 }
5519 }
5520
5521 ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr - cnp->cn_namelen + 1;
5522 #ifdef INVARIANTS
5523 if (ndp->ni_pathlen != fpl->debug.ni_pathlen) {
5524 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
5525 __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
5526 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
5527 }
5528 #endif
5529
5530 if (__predict_false(len + ndp->ni_pathlen > MAXPATHLEN)) {
5531 return (ENAMETOOLONG);
5532 }
5533
5534 if (__predict_false(ndp->ni_loopcnt++ >= MAXSYMLINKS)) {
5535 return (ELOOP);
5536 }
5537
5538 adjust = len;
5539 if (ndp->ni_pathlen > 1) {
5540 bcopy(ndp->ni_next, cnp->cn_pnbuf + len, ndp->ni_pathlen);
5541 } else {
5542 if (cache_fpl_istrailingslash(fpl)) {
5543 adjust = len + 1;
5544 cnp->cn_pnbuf[len] = '/';
5545 cnp->cn_pnbuf[len + 1] = '\0';
5546 } else {
5547 cnp->cn_pnbuf[len] = '\0';
5548 }
5549 }
5550 bcopy(string, cnp->cn_pnbuf, len);
5551
5552 ndp->ni_pathlen += adjust;
5553 cache_fpl_pathlen_add(fpl, adjust);
5554 cnp->cn_nameptr = cnp->cn_pnbuf;
5555 fpl->nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1];
5556 fpl->tvp = NULL;
5557 return (0);
5558 }
5559
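/*
 * Traverse a symlink found during lockless lookup.
 *
 * If this is the last component and FOLLOW is not requested, the link
 * itself is the target. Otherwise the filesystem resolves it via
 * VOP_FPLOOKUP_SYMLINK (see cache_symlink_resolve above) and, should the
 * result be an absolute path, the lookup restarts from the root directory.
 */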
5560 static int __noinline
5561 cache_fplookup_symlink(struct cache_fpl *fpl)
5562 {
5563 struct mount *mp;
5564 struct nameidata *ndp;
5565 struct componentname *cnp;
5566 struct vnode *dvp, *tvp;
5567 struct pwd *pwd;
5568 int error;
5569
5570 ndp = fpl->ndp;
5571 cnp = fpl->cnp;
5572 dvp = fpl->dvp;
5573 tvp = fpl->tvp;
5574 pwd = *(fpl->pwd);
5575
5576 if (cache_fpl_islastcn(ndp)) {
5577 if ((cnp->cn_flags & FOLLOW) == 0) {
5578 return (cache_fplookup_final(fpl));
5579 }
5580 }
5581
5582 mp = atomic_load_ptr(&dvp->v_mount);
5583 if (__predict_false(mp == NULL)) {
5584 return (cache_fpl_aborted(fpl));
5585 }
5586
5587 /*
5588 * Note this check races against setting the flag just like regular
5589 * lookup.
5590 */
5591 if (__predict_false((mp->mnt_flag & MNT_NOSYMFOLLOW) != 0)) {
5592 cache_fpl_smr_exit(fpl);
5593 return (cache_fpl_handled_error(fpl, EACCES));
5594 }
5595
5596 error = VOP_FPLOOKUP_SYMLINK(tvp, fpl);
5597 if (__predict_false(error != 0)) {
5598 switch (error) {
5599 case EAGAIN:
5600 return (cache_fpl_partial(fpl));
5601 case ENOENT:
5602 case ENAMETOOLONG:
5603 case ELOOP:
5604 cache_fpl_smr_exit(fpl);
5605 return (cache_fpl_handled_error(fpl, error));
5606 default:
5607 return (cache_fpl_aborted(fpl));
5608 }
5609 }
5610
5611 if (*(cnp->cn_nameptr) == '/') {
5612 fpl->dvp = cache_fpl_handle_root(fpl);
5613 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
5614 if (seqc_in_modify(fpl->dvp_seqc)) {
5615 return (cache_fpl_aborted(fpl));
5616 }
5617 /*
5618 * The main loop assumes that ->dvp points to a vnode belonging
5619  * to a filesystem which can do lockless lookup, but an absolute
5620  * symlink may wander off to one which does not.
5621 */
5622 mp = atomic_load_ptr(&fpl->dvp->v_mount);
5623 if (__predict_false(mp == NULL)) {
5624 return (cache_fpl_aborted(fpl));
5625 }
5626 if (!cache_fplookup_mp_supported(mp)) {
5627 cache_fpl_checkpoint(fpl);
5628 return (cache_fpl_partial(fpl));
5629 }
5630 if (__predict_false(pwd->pwd_adir != pwd->pwd_rdir)) {
5631 return (cache_fpl_aborted(fpl));
5632 }
5633 }
5634 return (0);
5635 }
5636
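/*
 * Resolve the already-parsed path component in the current directory.
 *
 * "." and ".." are dispatched to dedicated handlers, everything else is
 * looked up with the precomputed hash. Misses, negative entries and mount
 * points are handled by dedicated routines; a regular hit populates tvp
 * along with its sequence counter.
 */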
5637 static int
5638 cache_fplookup_next(struct cache_fpl *fpl)
5639 {
5640 struct componentname *cnp;
5641 struct namecache *ncp;
5642 struct vnode *dvp, *tvp;
5643 u_char nc_flag;
5644 uint32_t hash;
5645 int error;
5646
5647 cnp = fpl->cnp;
5648 dvp = fpl->dvp;
5649 hash = fpl->hash;
5650
5651 if (__predict_false(cnp->cn_nameptr[0] == '.')) {
5652 if (cnp->cn_namelen == 1) {
5653 return (cache_fplookup_dot(fpl));
5654 }
5655 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
5656 return (cache_fplookup_dotdot(fpl));
5657 }
5658 }
5659
5660 MPASS(!cache_fpl_isdotdot(cnp));
5661
5662 ncp = cache_ncp_find(dvp, cnp, hash);
5663 if (__predict_false(ncp == NULL)) {
5664 return (cache_fplookup_noentry(fpl));
5665 }
5666
5667 tvp = atomic_load_ptr(&ncp->nc_vp);
5668 nc_flag = atomic_load_char(&ncp->nc_flag);
5669 if ((nc_flag & NCF_NEGATIVE) != 0) {
5670 return (cache_fplookup_neg(fpl, ncp, hash));
5671 }
5672
5673 if (!cache_ncp_canuse(ncp)) {
5674 return (cache_fpl_partial(fpl));
5675 }
5676
5677 fpl->tvp = tvp;
5678 fpl->tvp_seqc = vn_seqc_read_any(tvp);
5679 if (seqc_in_modify(fpl->tvp_seqc)) {
5680 return (cache_fpl_partial(fpl));
5681 }
5682
5683 counter_u64_add(numposhits, 1);
5684 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
5685
5686 error = 0;
5687 if (cache_fplookup_is_mp(fpl)) {
5688 error = cache_fplookup_cross_mount(fpl);
5689 }
5690 return (error);
5691 }
5692
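/*
 * Check whether the filesystem backing the mount opted into lockless
 * lookup by setting MNTK_FPLOOKUP.
 */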
5693 static bool
5694 cache_fplookup_mp_supported(struct mount *mp)
5695 {
5696
5697 MPASS(mp != NULL);
5698 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
5699 return (false);
5700 return (true);
5701 }
5702
5703 /*
5704 * Walk up the mount stack (if any).
5705 *
5706 * Correctness is provided in the following ways:
5707 * - all vnodes are protected from freeing with SMR
5708 * - struct mount objects are type stable making them always safe to access
5709 * - stability of the particular mount is provided by busying it
5710 * - relationship between the vnode which is mounted on and the mount is
5711 * verified with the vnode sequence counter after busying
5712 * - association between root vnode of the mount and the mount is protected
5713 * by busy
5714 *
5715 * From that point on we can read the sequence counter of the root vnode
5716 * and get the next mount on the stack (if any) using the same protection.
5717 *
5718 * By the end of successful walk we are guaranteed the reached state was
5719 * indeed present at least at some point which matches the regular lookup.
5720 */
5721 static int __noinline
5722 cache_fplookup_climb_mount(struct cache_fpl *fpl)
5723 {
5724 struct mount *mp, *prev_mp;
5725 struct mount_pcpu *mpcpu, *prev_mpcpu;
5726 struct vnode *vp;
5727 seqc_t vp_seqc;
5728
5729 vp = fpl->tvp;
5730 vp_seqc = fpl->tvp_seqc;
5731
5732 VNPASS(vp->v_type == VDIR || vp->v_type == VREG || vp->v_type == VBAD, vp);
5733 mp = atomic_load_ptr(&vp->v_mountedhere);
5734 if (__predict_false(mp == NULL)) {
5735 return (0);
5736 }
5737
5738 prev_mp = NULL;
5739 for (;;) {
5740 if (!vfs_op_thread_enter_crit(mp, mpcpu)) {
5741 if (prev_mp != NULL)
5742 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
5743 return (cache_fpl_partial(fpl));
5744 }
5745 if (prev_mp != NULL)
5746 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
5747 if (!vn_seqc_consistent(vp, vp_seqc)) {
5748 vfs_op_thread_exit_crit(mp, mpcpu);
5749 return (cache_fpl_partial(fpl));
5750 }
5751 if (!cache_fplookup_mp_supported(mp)) {
5752 vfs_op_thread_exit_crit(mp, mpcpu);
5753 return (cache_fpl_partial(fpl));
5754 }
5755 vp = atomic_load_ptr(&mp->mnt_rootvnode);
5756 if (vp == NULL) {
5757 vfs_op_thread_exit_crit(mp, mpcpu);
5758 return (cache_fpl_partial(fpl));
5759 }
5760 vp_seqc = vn_seqc_read_any(vp);
5761 if (seqc_in_modify(vp_seqc)) {
5762 vfs_op_thread_exit_crit(mp, mpcpu);
5763 return (cache_fpl_partial(fpl));
5764 }
5765 prev_mp = mp;
5766 prev_mpcpu = mpcpu;
5767 mp = atomic_load_ptr(&vp->v_mountedhere);
5768 if (mp == NULL)
5769 break;
5770 }
5771
5772 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
5773 fpl->tvp = vp;
5774 fpl->tvp_seqc = vp_seqc;
5775 return (0);
5776 }
5777
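/*
 * Cross a single mount point found on the target vnode.
 *
 * The mount is stabilized the same way as in cache_fplookup_climb_mount
 * above. Should more mounts turn out to be stacked on the root vnode,
 * the climb routine is used instead.
 */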
5778 static int __noinline
5779 cache_fplookup_cross_mount(struct cache_fpl *fpl)
5780 {
5781 struct mount *mp;
5782 struct mount_pcpu *mpcpu;
5783 struct vnode *vp;
5784 seqc_t vp_seqc;
5785
5786 vp = fpl->tvp;
5787 vp_seqc = fpl->tvp_seqc;
5788
5789 VNPASS(vp->v_type == VDIR || vp->v_type == VREG || vp->v_type == VBAD, vp);
5790 mp = atomic_load_ptr(&vp->v_mountedhere);
5791 if (__predict_false(mp == NULL)) {
5792 return (0);
5793 }
5794
5795 if (!vfs_op_thread_enter_crit(mp, mpcpu)) {
5796 return (cache_fpl_partial(fpl));
5797 }
5798 if (!vn_seqc_consistent(vp, vp_seqc)) {
5799 vfs_op_thread_exit_crit(mp, mpcpu);
5800 return (cache_fpl_partial(fpl));
5801 }
5802 if (!cache_fplookup_mp_supported(mp)) {
5803 vfs_op_thread_exit_crit(mp, mpcpu);
5804 return (cache_fpl_partial(fpl));
5805 }
5806 vp = atomic_load_ptr(&mp->mnt_rootvnode);
5807 if (__predict_false(vp == NULL)) {
5808 vfs_op_thread_exit_crit(mp, mpcpu);
5809 return (cache_fpl_partial(fpl));
5810 }
5811 vp_seqc = vn_seqc_read_any(vp);
5812 vfs_op_thread_exit_crit(mp, mpcpu);
5813 if (seqc_in_modify(vp_seqc)) {
5814 return (cache_fpl_partial(fpl));
5815 }
5816 mp = atomic_load_ptr(&vp->v_mountedhere);
5817 if (__predict_false(mp != NULL)) {
5818 /*
5819  * There are possibly more mount points stacked on top.
5820  * Normally this does not happen, so for simplicity just start
5821  * over.
5822 */
5823 return (cache_fplookup_climb_mount(fpl));
5824 }
5825
5826 fpl->tvp = vp;
5827 fpl->tvp_seqc = vp_seqc;
5828 return (0);
5829 }
5830
5831 /*
5832 * Check if a vnode is mounted on.
5833 */
5834 static bool
5835 cache_fplookup_is_mp(struct cache_fpl *fpl)
5836 {
5837 struct vnode *vp;
5838
5839 vp = fpl->tvp;
5840 return ((vn_irflag_read(vp) & VIRF_MOUNTPOINT) != 0);
5841 }
5842
5843 /*
5844 * Parse the path.
5845 *
5846  * The code was originally copy-pasted from regular lookup and, despite
5847  * clean-ups, still leaves performance on the table. Any modifications here
5848  * must take into account that in case of a fallback the resulting
5849  * nameidata state has to be compatible with the original.
5850 */
5851
5852 /*
5853 * Debug ni_pathlen tracking.
5854 */
5855 #ifdef INVARIANTS
5856 static void
5857 cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n)
5858 {
5859
5860 fpl->debug.ni_pathlen += n;
5861 KASSERT(fpl->debug.ni_pathlen <= PATH_MAX,
5862 ("%s: pathlen overflow to %zd\n", __func__, fpl->debug.ni_pathlen));
5863 }
5864
5865 static void
5866 cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n)
5867 {
5868
5869 fpl->debug.ni_pathlen -= n;
5870 KASSERT(fpl->debug.ni_pathlen <= PATH_MAX,
5871 ("%s: pathlen underflow to %zd\n", __func__, fpl->debug.ni_pathlen));
5872 }
5873
5874 static void
5875 cache_fpl_pathlen_inc(struct cache_fpl *fpl)
5876 {
5877
5878 cache_fpl_pathlen_add(fpl, 1);
5879 }
5880
5881 static void
5882 cache_fpl_pathlen_dec(struct cache_fpl *fpl)
5883 {
5884
5885 cache_fpl_pathlen_sub(fpl, 1);
5886 }
5887 #else
5888 static void
5889 cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n)
5890 {
5891 }
5892
5893 static void
5894 cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n)
5895 {
5896 }
5897
5898 static void
5899 cache_fpl_pathlen_inc(struct cache_fpl *fpl)
5900 {
5901 }
5902
5903 static void
5904 cache_fpl_pathlen_dec(struct cache_fpl *fpl)
5905 {
5906 }
5907 #endif
5908
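/*
 * Compute the length and hash of the next path component in one pass,
 * temporarily replacing the terminating nul with a '/' sentinel.
 */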
5909 static void
5910 cache_fplookup_parse(struct cache_fpl *fpl)
5911 {
5912 struct nameidata *ndp;
5913 struct componentname *cnp;
5914 struct vnode *dvp;
5915 char *cp;
5916 uint32_t hash;
5917
5918 ndp = fpl->ndp;
5919 cnp = fpl->cnp;
5920 dvp = fpl->dvp;
5921
5922 /*
5923 * Find the end of this path component, it is either / or nul.
5924 *
5925 * Store / as a temporary sentinel so that we only have one character
5926 * to test for. Pathnames tend to be short so this should not be
5927  * to test for. Pathnames tend to be short so this should not result
5928  * in cache misses.
5929 * TODO: fix this to be word-sized.
5930 */
5931 MPASS(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] >= cnp->cn_pnbuf);
5932 KASSERT(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] == fpl->nulchar,
5933 ("%s: mismatch between pathlen (%zu) and nulchar (%p != %p), string [%s]\n",
5934 __func__, fpl->debug.ni_pathlen, &cnp->cn_nameptr[fpl->debug.ni_pathlen - 1],
5935 fpl->nulchar, cnp->cn_pnbuf));
5936 KASSERT(*fpl->nulchar == '\0',
5937 ("%s: expected nul at %p; string [%s]\n", __func__, fpl->nulchar,
5938 cnp->cn_pnbuf));
5939 hash = cache_get_hash_iter_start(dvp);
5940 *fpl->nulchar = '/';
5941 for (cp = cnp->cn_nameptr; *cp != '/'; cp++) {
5942 KASSERT(*cp != '\0',
5943 ("%s: encountered unexpected nul; string [%s]\n", __func__,
5944 cnp->cn_nameptr));
5945 hash = cache_get_hash_iter(*cp, hash);
5946 continue;
5947 }
5948 *fpl->nulchar = '\0';
5949 fpl->hash = cache_get_hash_iter_finish(hash);
5950
5951 cnp->cn_namelen = cp - cnp->cn_nameptr;
5952 cache_fpl_pathlen_sub(fpl, cnp->cn_namelen);
5953
5954 #ifdef INVARIANTS
5955 /*
5956 * cache_get_hash only accepts lengths up to NAME_MAX. This is fine since
5957 * we are going to fail this lookup with ENAMETOOLONG (see below).
5958 */
5959 if (cnp->cn_namelen <= NAME_MAX) {
5960 if (fpl->hash != cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp)) {
5961 panic("%s: mismatched hash for [%s] len %ld", __func__,
5962 cnp->cn_nameptr, cnp->cn_namelen);
5963 }
5964 }
5965 #endif
5966
5967 /*
5968 * Hack: we have to check if the found path component's length exceeds
5969  * NAME_MAX. However, the condition is very rarely true and the check can
5970 * be elided in the common case -- if an entry was found in the cache,
5971 * then it could not have been too long to begin with.
5972 */
5973 ndp->ni_next = cp;
5974 }
5975
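/*
 * Advance to the next path component: continue from ni_next and skip over
 * the slash which terminated the previous component.
 */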
5976 static void
5977 cache_fplookup_parse_advance(struct cache_fpl *fpl)
5978 {
5979 struct nameidata *ndp;
5980 struct componentname *cnp;
5981
5982 ndp = fpl->ndp;
5983 cnp = fpl->cnp;
5984
5985 cnp->cn_nameptr = ndp->ni_next;
5986 KASSERT(*(cnp->cn_nameptr) == '/',
5987 ("%s: should have seen slash at %p ; buf %p [%s]\n", __func__,
5988 cnp->cn_nameptr, cnp->cn_pnbuf, cnp->cn_pnbuf));
5989 cnp->cn_nameptr++;
5990 cache_fpl_pathlen_dec(fpl);
5991 }
5992
5993 /*
5994 * Skip spurious slashes in a pathname (e.g., "foo///bar") and retry.
5995 *
5996  * Lockless lookup tries to elide checking for spurious slashes and, should
5997  * they be present, is guaranteed to fail to find an entry. In this case the
5998  * caller must check whether the name starts with a slash and call this
5999  * routine, which fast-forwards across the spurious slashes and sets the
6000  * state up for a retry.
6001 */
6002 static int __noinline
6003 cache_fplookup_skip_slashes(struct cache_fpl *fpl)
6004 {
6005 struct nameidata *ndp;
6006 struct componentname *cnp;
6007
6008 ndp = fpl->ndp;
6009 cnp = fpl->cnp;
6010
6011 MPASS(*(cnp->cn_nameptr) == '/');
6012 do {
6013 cnp->cn_nameptr++;
6014 cache_fpl_pathlen_dec(fpl);
6015 } while (*(cnp->cn_nameptr) == '/');
6016
6017 /*
6018 * Go back to one slash so that cache_fplookup_parse_advance has
6019 * something to skip.
6020 */
6021 cnp->cn_nameptr--;
6022 cache_fpl_pathlen_inc(fpl);
6023
6024 /*
6025 * cache_fplookup_parse_advance starts from ndp->ni_next
6026 */
6027 ndp->ni_next = cnp->cn_nameptr;
6028
6029 /*
6030 * See cache_fplookup_dot.
6031 */
6032 fpl->tvp = fpl->dvp;
6033 fpl->tvp_seqc = fpl->dvp_seqc;
6034
6035 return (0);
6036 }
6037
6038 /*
6039 * Handle trailing slashes (e.g., "foo/").
6040 *
6041 * If a trailing slash is found the terminal vnode must be a directory.
6042  * Regular lookup shortens the path by nullifying the first trailing slash and
6043 * sets the TRAILINGSLASH flag to denote this took place. There are several
6044 * checks on it performed later.
6045 *
6046 * Similarly to spurious slashes, lockless lookup handles this in a speculative
6047 * manner relying on an invariant that a non-directory vnode will get a miss.
6048 * In this case cn_nameptr[0] == '\0' and cn_namelen == 0.
6049 *
6050 * Thus for a path like "foo/bar/" the code unwinds the state back to "bar/"
6051 * and denotes this is the last path component, which avoids looping back.
6052 *
6053  * Only plain lookups are supported for now, to limit the corner cases to handle.
6054 */
6055 static int __noinline
6056 cache_fplookup_trailingslash(struct cache_fpl *fpl)
6057 {
6058 #ifdef INVARIANTS
6059 size_t ni_pathlen;
6060 #endif
6061 struct nameidata *ndp;
6062 struct componentname *cnp;
6063 struct namecache *ncp;
6064 struct vnode *tvp;
6065 char *cn_nameptr_orig, *cn_nameptr_slash;
6066 seqc_t tvp_seqc;
6067 u_char nc_flag;
6068
6069 ndp = fpl->ndp;
6070 cnp = fpl->cnp;
6071 tvp = fpl->tvp;
6072 tvp_seqc = fpl->tvp_seqc;
6073
6074 MPASS(fpl->dvp == fpl->tvp);
6075 KASSERT(cache_fpl_istrailingslash(fpl),
6076 ("%s: expected trailing slash at %p; string [%s]\n", __func__, fpl->nulchar - 1,
6077 cnp->cn_pnbuf));
6078 KASSERT(cnp->cn_nameptr[0] == '\0',
6079 ("%s: expected nul char at %p; string [%s]\n", __func__, &cnp->cn_nameptr[0],
6080 cnp->cn_pnbuf));
6081 KASSERT(cnp->cn_namelen == 0,
6082 ("%s: namelen 0 but got %ld; string [%s]\n", __func__, cnp->cn_namelen,
6083 cnp->cn_pnbuf));
6084 MPASS(cnp->cn_nameptr > cnp->cn_pnbuf);
6085
6086 if (cnp->cn_nameiop != LOOKUP) {
6087 return (cache_fpl_aborted(fpl));
6088 }
6089
6090 if (__predict_false(tvp->v_type != VDIR)) {
6091 if (!vn_seqc_consistent(tvp, tvp_seqc)) {
6092 return (cache_fpl_aborted(fpl));
6093 }
6094 cache_fpl_smr_exit(fpl);
6095 return (cache_fpl_handled_error(fpl, ENOTDIR));
6096 }
6097
6098 /*
6099 * Denote the last component.
6100 */
6101 ndp->ni_next = &cnp->cn_nameptr[0];
6102 MPASS(cache_fpl_islastcn(ndp));
6103
6104 /*
6105 * Unwind trailing slashes.
6106 */
6107 cn_nameptr_orig = cnp->cn_nameptr;
6108 while (cnp->cn_nameptr >= cnp->cn_pnbuf) {
6109 cnp->cn_nameptr--;
6110 if (cnp->cn_nameptr[0] != '/') {
6111 break;
6112 }
6113 }
6114
6115 /*
6116 * Unwind to the beginning of the path component.
6117 *
6118 * Note the path may or may not have started with a slash.
6119 */
6120 cn_nameptr_slash = cnp->cn_nameptr;
6121 while (cnp->cn_nameptr > cnp->cn_pnbuf) {
6122 cnp->cn_nameptr--;
6123 if (cnp->cn_nameptr[0] == '/') {
6124 break;
6125 }
6126 }
6127 if (cnp->cn_nameptr[0] == '/') {
6128 cnp->cn_nameptr++;
6129 }
6130
6131 cnp->cn_namelen = cn_nameptr_slash - cnp->cn_nameptr + 1;
6132 cache_fpl_pathlen_add(fpl, cn_nameptr_orig - cnp->cn_nameptr);
6133 cache_fpl_checkpoint(fpl);
6134
6135 #ifdef INVARIANTS
6136 ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1;
6137 if (ni_pathlen != fpl->debug.ni_pathlen) {
6138 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
6139 __func__, ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
6140 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
6141 }
6142 #endif
6143
6144 /*
6145 * If this was a "./" lookup the parent directory is already correct.
6146 */
6147 if (cnp->cn_nameptr[0] == '.' && cnp->cn_namelen == 1) {
6148 return (0);
6149 }
6150
6151 /*
6152 * Otherwise we need to look it up.
6153 */
6154 tvp = fpl->tvp;
6155 ncp = atomic_load_consume_ptr(&tvp->v_cache_dd);
6156 if (__predict_false(ncp == NULL)) {
6157 return (cache_fpl_aborted(fpl));
6158 }
6159 nc_flag = atomic_load_char(&ncp->nc_flag);
6160 if ((nc_flag & NCF_ISDOTDOT) != 0) {
6161 return (cache_fpl_aborted(fpl));
6162 }
6163 fpl->dvp = ncp->nc_dvp;
6164 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
6165 if (seqc_in_modify(fpl->dvp_seqc)) {
6166 return (cache_fpl_aborted(fpl));
6167 }
6168 return (0);
6169 }
6170
6171 /*
6172 * See the API contract for VOP_FPLOOKUP_VEXEC.
6173 */
6174 static int __noinline
6175 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
6176 {
6177 struct componentname *cnp;
6178 struct vnode *dvp;
6179 seqc_t dvp_seqc;
6180
6181 cnp = fpl->cnp;
6182 dvp = fpl->dvp;
6183 dvp_seqc = fpl->dvp_seqc;
6184
6185 /*
6186 * Hack: delayed empty path checking.
6187 */
6188 if (cnp->cn_pnbuf[0] == '\0') {
6189 return (cache_fplookup_emptypath(fpl));
6190 }
6191
6192 /*
6193 * TODO: Due to ignoring trailing slashes lookup will perform a
6194 * permission check on the last dir when it should not be doing it. It
6195 * may fail, but said failure should be ignored. It is possible to fix
6196 * it up fully without resorting to regular lookup, but for now just
6197 * abort.
6198 */
6199 if (cache_fpl_istrailingslash(fpl)) {
6200 return (cache_fpl_aborted(fpl));
6201 }
6202
6203 /*
6204 * Hack: delayed degenerate path checking.
6205 */
6206 if (cnp->cn_nameptr[0] == '\0' && fpl->tvp == NULL) {
6207 return (cache_fplookup_degenerate(fpl));
6208 }
6209
6210 /*
6211 * Hack: delayed name len checking.
6212 */
6213 if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
6214 cache_fpl_smr_exit(fpl);
6215 return (cache_fpl_handled_error(fpl, ENAMETOOLONG));
6216 }
6217
6218 /*
6219 * Hack: they may be looking up foo/bar, where foo is not a directory.
6220 * In such a case we need to return ENOTDIR, but we may happen to get
6221 * here with a different error.
6222 */
6223 if (dvp->v_type != VDIR) {
6224 error = ENOTDIR;
6225 }
6226
6227 /*
6228 * Hack: handle O_SEARCH.
6229 *
6230 * Open Group Base Specifications Issue 7, 2018 edition states:
6231 * <quote>
6232 * If the access mode of the open file description associated with the
6233 * file descriptor is not O_SEARCH, the function shall check whether
6234 * directory searches are permitted using the current permissions of
6235 * the directory underlying the file descriptor. If the access mode is
6236 * O_SEARCH, the function shall not perform the check.
6237 * </quote>
6238 *
6239 * Regular lookup tests for the NOEXECCHECK flag for every path
6240 * component to decide whether to do the permission check. However,
6241 * since most lookups never have the flag (and when they do it is only
6242 * present for the first path component), lockless lookup only acts on
6243 * it if there is a permission problem. Here the flag is represented
6244 * with a boolean so that we don't have to clear it on the way out.
6245 *
6246 * For simplicity this always aborts.
6247 * TODO: check if this is the first lookup and ignore the permission
6248 * problem. Note the flag has to survive fallback (if it happens to be
6249 * performed).
6250 */
6251 if (fpl->fsearch) {
6252 return (cache_fpl_aborted(fpl));
6253 }
6254
6255 switch (error) {
6256 case EAGAIN:
6257 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
6258 error = cache_fpl_aborted(fpl);
6259 } else {
6260 cache_fpl_partial(fpl);
6261 }
6262 break;
6263 default:
6264 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
6265 error = cache_fpl_aborted(fpl);
6266 } else {
6267 cache_fpl_smr_exit(fpl);
6268 cache_fpl_handled_error(fpl, error);
6269 }
6270 break;
6271 }
6272 return (error);
6273 }
6274
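/*
 * The main lockless lookup loop.
 *
 * Each iteration parses one component, checks execute permission on the
 * directory with VOP_FPLOOKUP_VEXEC and resolves the component. The result
 * is either a symlink to traverse, the final vnode, or the next directory
 * to descend into.
 */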
6275 static int
6276 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
6277 {
6278 struct nameidata *ndp;
6279 struct componentname *cnp;
6280 struct mount *mp;
6281 int error;
6282
6283 ndp = fpl->ndp;
6284 cnp = fpl->cnp;
6285
6286 cache_fpl_checkpoint(fpl);
6287
6288 /*
6289 * The vnode at hand is almost always stable, skip checking for it.
6290 * Worst case this postpones the check towards the end of the iteration
6291 * of the main loop.
6292 */
6293 fpl->dvp = dvp;
6294 fpl->dvp_seqc = vn_seqc_read_notmodify(fpl->dvp);
6295
6296 mp = atomic_load_ptr(&dvp->v_mount);
6297 if (__predict_false(mp == NULL || !cache_fplookup_mp_supported(mp))) {
6298 return (cache_fpl_aborted(fpl));
6299 }
6300
6301 MPASS(fpl->tvp == NULL);
6302
6303 for (;;) {
6304 cache_fplookup_parse(fpl);
6305
6306 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
6307 if (__predict_false(error != 0)) {
6308 error = cache_fplookup_failed_vexec(fpl, error);
6309 break;
6310 }
6311
6312 error = cache_fplookup_next(fpl);
6313 if (__predict_false(cache_fpl_terminated(fpl))) {
6314 break;
6315 }
6316
6317 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
6318
6319 if (fpl->tvp->v_type == VLNK) {
6320 error = cache_fplookup_symlink(fpl);
6321 if (cache_fpl_terminated(fpl)) {
6322 break;
6323 }
6324 } else {
6325 if (cache_fpl_islastcn(ndp)) {
6326 error = cache_fplookup_final(fpl);
6327 break;
6328 }
6329
6330 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
6331 error = cache_fpl_aborted(fpl);
6332 break;
6333 }
6334
6335 fpl->dvp = fpl->tvp;
6336 fpl->dvp_seqc = fpl->tvp_seqc;
6337 cache_fplookup_parse_advance(fpl);
6338 }
6339
6340 cache_fpl_checkpoint(fpl);
6341 }
6342
6343 return (error);
6344 }
6345
6346 /*
6347 * Fast path lookup protected with SMR and sequence counters.
6348 *
6349 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
6350 *
6351 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
6352 * outlined at the end.
6353 *
6354 * Traversing from one vnode to another requires atomicity with regard to
6355 * permissions, mount points and of course their relative placement (if you are
6356 * looking up "bar" in "foo" and you found it, it better be in that directory
6357 * at the time).
6358 *
6359 * Normally this is accomplished with locking, but it comes with a significant
6360 * performance hit and is untenable as a fast path even in a moderate core
6361 * count environment (at the time of writing this comment this would be a
6362 * little south of 100).
6363 *
6364 * The same guarantee can be provided with a combination of safe memory
6365 * reclamation and sequence counters instead. If all operations which affect
6366 * the relationship between the current vnode and the one we are looking for
6367 * also modify the counter, we can verify whether all the conditions held as
6368 * we made the jump.
6369 *
6370 * See places which issue vn_seqc_write_begin()/vn_seqc_write_end() for
6371 * operations affected.
6372 *
6373 * Suppose the variable "cnp" contains lookup metadata (the path etc.), then
6374 * locked lookup conceptually looks like this:
6375 *
6376 * // lock the current directory
6377 * vn_lock(dvp);
6378 * for (;;) {
6379 * // permission check
6380 * if (!canlookup(dvp, cnp))
6381 * abort();
6382 * // look for the target name inside dvp
6383 * tvp = findnext(dvp, cnp);
6384 * vn_lock(tvp);
6385 * // tvp is still guaranteed to be inside of dvp because of the lock on dvp
6386 * vn_unlock(dvp);
6387 * // dvp is unlocked. its state is now arbitrary, but that's fine as we
6388 * // made the jump while everything relevant was correct, continue with tvp
6389 * // as the directory to look up names in
6390  * dvp = tvp;
6391 * if (last)
6392 * break;
6393 * // if not last loop back and continue until done
6394 * }
6395 * vget(tvp);
6396 * return (tvp);
6397 *
6398 * Lockless lookup replaces locking with sequence counter checks:
6399 *
6400 * vfs_smr_enter();
6401 * dvp_seqc = seqc_read_any(dvp);
6402 * // fail if someone is altering the directory vnode
6403 * if (seqc_in_modify(dvp_seqc))
6404 * abort();
6405 * for (;;) {
6406 * // permission check. note it can race, but we will validate the outcome
6407 * // with a seqc
6408 * if (!canlookup_smr(dvp, cnp)) {
6409 * // has dvp changed from under us? if so, the denial may be invalid
6410  * if (!seqc_consistent(dvp, dvp_seqc))
6411 * fallback_to_locked();
6412 * // nothing changed, lookup denial is valid
6413 * fail();
6414 * }
6415 * // look for the target name inside dvp
6416 * tvp = findnext(dvp, cnp);
6417 * tvp_seqc = seqc_read_any(tvp);
6418 * // bail if someone is altering the target vnode
6419 * if (seqc_in_modify(tvp_seqc))
6420 * fallback_to_locked();
6421 * // bail if someone is altering the directory vnode
6422  * if (!seqc_consistent(dvp, dvp_seqc))
6423 * fallback_to_locked();
6424 * // we confirmed neither dvp nor tvp changed while we were making the
6425 * // jump to the next component, thus the result is the same as if we
6426 * // held the lock on dvp and tvp the entire time, continue with tvp
6427 * // as the directory to look up names in
6428 * dvp = tvp;
6429 * dvp_seqc = tvp_seqc;
6430 * if (last)
6431 * break;
6432 * }
6433 * vget(); // secure the vnode
6434  * if (!seqc_consistent(tvp, tvp_seqc)) // final check
6435 * fallback_to_locked();
6436 * // at this point we know nothing has changed for any parent<->child pair
6437 * // as they were crossed during the lookup, meaning we matched the guarantee
6438 * // of the locked variant
6439 * return (tvp);
6440 *
6441 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
6442 * - they are called while within vfs_smr protection which they must never exit
6443  * - EAGAIN can be returned to denote checking could not be performed; it is
6444 * always valid to return it
6445 * - if the sequence counter has not changed the result must be valid
6446 * - if the sequence counter has changed both false positives and false negatives
6447 * are permitted (since the result will be rejected later)
6448 * - for simple cases of unix permission checks vaccess_vexec_smr can be used
6449 *
6450 * Caveats to watch out for:
6451 * - vnodes are passed unlocked and unreferenced with nothing stopping
6452 * VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
6453 * to use atomic_load_ptr to fetch it.
6454 * - the aforementioned object can also get freed, meaning absent other means it
6455 * should be protected with vfs_smr
6456 * - either safely checking permissions as they are modified or guaranteeing
6457 * their stability is left to the routine
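 *
 * For a filesystem with plain unix permissions the routine can be as simple
 * as the following sketch, modelled after what UFS does (field and type
 * names are illustrative):
 *
 *	static int
 *	xxx_fplookup_vexec(struct vop_fplookup_vexec_args *ap)
 *	{
 *		struct vnode *vp;
 *		struct xxx_inode *ip;
 *
 *		vp = ap->a_vp;
 *		// ->v_data may be concurrently reclaimed, load it atomically
 *		ip = atomic_load_ptr(&vp->v_data);
 *		if (__predict_false(ip == NULL))
 *			return (EAGAIN);
 *		// both the check and the inode itself are protected by vfs_smr
 *		return (vaccess_vexec_smr(ip->i_mode, ip->i_uid, ip->i_gid,
 *		    ap->a_cred));
 *	}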
6458 */
6459 int
6460 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
6461 struct pwd **pwdp)
6462 {
6463 struct cache_fpl fpl;
6464 struct pwd *pwd;
6465 struct vnode *dvp;
6466 struct componentname *cnp;
6467 int error;
6468
6469 fpl.status = CACHE_FPL_STATUS_UNSET;
6470 fpl.in_smr = false;
6471 fpl.ndp = ndp;
6472 fpl.cnp = cnp = &ndp->ni_cnd;
6473 MPASS(ndp->ni_lcf == 0);
6474 KASSERT ((cnp->cn_flags & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
6475 ("%s: internal flags found in cn_flags %" PRIx64, __func__,
6476 cnp->cn_flags));
6477 MPASS(cnp->cn_nameptr == cnp->cn_pnbuf);
6478 MPASS(ndp->ni_resflags == 0);
6479
6480 if (__predict_false(!cache_can_fplookup(&fpl))) {
6481 *status = fpl.status;
6482 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
6483 return (EOPNOTSUPP);
6484 }
6485
6486 cache_fpl_checkpoint_outer(&fpl);
6487
6488 cache_fpl_smr_enter_initial(&fpl);
6489 #ifdef INVARIANTS
6490 fpl.debug.ni_pathlen = ndp->ni_pathlen;
6491 #endif
6492 fpl.nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1];
6493 fpl.fsearch = false;
6494 fpl.tvp = NULL; /* for degenerate path handling */
6495 fpl.pwd = pwdp;
6496 pwd = pwd_get_smr();
6497 *(fpl.pwd) = pwd;
6498 namei_setup_rootdir(ndp, cnp, pwd);
6499 ndp->ni_topdir = pwd->pwd_jdir;
6500
6501 if (cnp->cn_pnbuf[0] == '/') {
6502 dvp = cache_fpl_handle_root(&fpl);
6503 ndp->ni_resflags = NIRES_ABS;
6504 } else {
6505 if (ndp->ni_dirfd == AT_FDCWD) {
6506 dvp = pwd->pwd_cdir;
6507 } else {
6508 error = cache_fplookup_dirfd(&fpl, &dvp);
6509 if (__predict_false(error != 0)) {
6510 goto out;
6511 }
6512 }
6513 }
6514
6515 SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
6516 error = cache_fplookup_impl(dvp, &fpl);
6517 out:
6518 cache_fpl_smr_assert_not_entered(&fpl);
6519 cache_fpl_assert_status(&fpl);
6520 *status = fpl.status;
6521 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
6522 if (__predict_true(fpl.status == CACHE_FPL_STATUS_HANDLED)) {
6523 MPASS(error != CACHE_FPL_FAILED);
6524 SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true,
6525 ndp);
6526 if (error != 0) {
6527 cache_fpl_cleanup_cnp(fpl.cnp);
6528 MPASS(fpl.dvp == NULL);
6529 MPASS(fpl.tvp == NULL);
6530 }
6531 ndp->ni_dvp = fpl.dvp;
6532 ndp->ni_vp = fpl.tvp;
6533 }
6534 return (error);
6535 }
6536