1 /*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1989, 1993, 1995
5 * The Regents of the University of California. All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Poul-Henning Kamp of the FreeBSD Project.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 */
34
35 #include "opt_ddb.h"
36 #include "opt_ktrace.h"
37
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/capsicum.h>
41 #include <sys/counter.h>
42 #include <sys/filedesc.h>
43 #include <sys/fnv_hash.h>
44 #include <sys/kernel.h>
45 #include <sys/ktr.h>
46 #include <sys/lock.h>
47 #include <sys/malloc.h>
48 #include <sys/fcntl.h>
49 #include <sys/jail.h>
50 #include <sys/mount.h>
51 #include <sys/namei.h>
52 #include <sys/proc.h>
53 #include <sys/seqc.h>
54 #include <sys/sdt.h>
55 #include <sys/smr.h>
56 #include <sys/smp.h>
57 #include <sys/syscallsubr.h>
58 #include <sys/sysctl.h>
59 #include <sys/sysproto.h>
60 #include <sys/vnode.h>
61 #include <ck_queue.h>
62 #ifdef KTRACE
63 #include <sys/ktrace.h>
64 #endif
65 #ifdef INVARIANTS
66 #include <machine/_inttypes.h>
67 #endif
68
69 #include <security/audit/audit.h>
70 #include <security/mac/mac_framework.h>
71
72 #ifdef DDB
73 #include <ddb/ddb.h>
74 #endif
75
76 #include <vm/uma.h>
77
78 /*
79 * High level overview of name caching in the VFS layer.
80 *
81 * Originally caching was implemented as part of UFS, later extracted to allow
82 * use by other filesystems. A decision was made to make it optional and
83 * completely detached from the rest of the kernel, which comes with limitations
84 * outlined near the end of this comment block.
85 *
86 * This fundamental choice needs to be revisited. In the meantime, the current
87 * state is described below. Significance of all notable routines is explained
88 * in comments placed above their implementation. Scattered throughout the
89 * file are TODO comments indicating shortcomings which can be fixed without
90 * reworking everything (most of the fixes will likely be reusable). Various
91 * details are omitted from this explanation so as not to clutter the overview; they
92 * have to be checked by reading the code and associated commentary.
93 *
94 * Keep in mind that it's individual path components which are cached, not full
95 * paths. That is, for a fully cached path "foo/bar/baz" there are 3 entries,
96 * one for each name.
97 *
98 * I. Data organization
99 *
100 * Entries are described by "struct namecache" objects and stored in a hash
101 * table. See cache_get_hash for more information.
102 *
103 * "struct vnode" contains pointers to source entries (names which can be found
104 * when traversing through said vnode), destination entries (names of that
105 * vnode; see "Limitations" for a breakdown on the subject) and a pointer to
106 * the parent vnode.
107 *
108 * The (directory vnode; name) tuple reliably determines the target entry if
109 * it exists.
110 *
111 * Since there are no small locks at this time (all are 32 bytes in size on
112 * LP64), the code works around the problem by introducing lock arrays to
113 * protect hash buckets and vnode lists.
114 *
115 * II. Filesystem integration
116 *
117 * Filesystems participating in name caching do the following:
118 * - set vop_lookup routine to vfs_cache_lookup
119 * - set vop_cachedlookup to whatever can perform the lookup if the above fails
120 * - if they support lockless lookup (see below), vop_fplookup_vexec and
121 * vop_fplookup_symlink are set along with the MNTK_FPLOOKUP flag on the
122 * mount point
123 * - call cache_purge or cache_vop_* routines to eliminate stale entries as
124 * applicable
125 * - call cache_enter to add entries depending on the MAKEENTRY flag
126 *
127 * With the above in mind, there are 2 entry points when doing lookups:
128 * - ... -> namei -> cache_fplookup -- this is the default
129 * - ... -> VOP_LOOKUP -> vfs_cache_lookup -- normally only called by namei
130 * should the above fail
131 *
132 * Example code flow of how an entry is added:
133 * ... -> namei -> cache_fplookup -> cache_fplookup_noentry -> VOP_LOOKUP ->
134 * vfs_cache_lookup -> VOP_CACHEDLOOKUP -> ufs_lookup_ino -> cache_enter
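 *
 * Roughly, the filesystem side of the wiring looks like the sketch below
 * (simplified; "xxx" stands in for the filesystem name and xxx_lookup for its
 * own lookup routine -- consult a real filesystem's vop_vector for the
 * authoritative version):
 *
 *	static struct vop_vector xxx_vnodeops = {
 *		...
 *		.vop_lookup = vfs_cache_lookup,
 *		.vop_cachedlookup = xxx_lookup,
 *		...
 *	};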
135 *
136 * III. Performance considerations
137 *
138 * For the lockless case, forward lookup avoids any writes to shared areas apart
139 * from the terminal path component. In other words, non-modifying lookups of
140 * different files don't suffer any scalability problems in the namecache.
141 * Looking up the same file is limited by VFS and goes beyond the scope of this
142 * file.
143 *
144 * At least on amd64 the single-threaded bottleneck for long paths is hashing
145 * (see cache_get_hash). There are cases where the code issues an acquire fence
146 * multiple times; these can be combined on architectures where the fence is costly.
147 *
148 * For the locked case, each encountered vnode has to be referenced and locked in
149 * order to be handed out to the caller (normally that's namei). This
150 * introduces a significant single-threaded hit and multi-threaded serialization.
151 *
152 * Reverse lookup (e.g., "getcwd") fully scales provided it is fully cached --
153 * it avoids any writes to shared areas for any of the components.
154 *
155 * Unrelated insertions are partially serialized on updating the global entry
156 * counter and possibly serialized on colliding bucket or vnode locks.
157 *
158 * IV. Observability
159 *
160 * Note that not everything has an explicit dtrace probe, nor should it; thus
161 * some of the one-liners below depend on implementation details.
162 *
163 * Examples:
164 *
165 * # Check what lookups failed to be handled in a lockless manner. Column 1 is
166 * # line number, column 2 is status code (see cache_fpl_status)
167 * dtrace -n 'vfs:fplookup:lookup:done { @[arg1, arg2] = count(); }'
168 *
169 * # Lengths of names added by binary name
170 * dtrace -n 'fbt::cache_enter_time:entry { @[execname] = quantize(args[2]->cn_namelen); }'
171 *
172 * # Same as above but only those which exceed 64 characters
173 * dtrace -n 'fbt::cache_enter_time:entry /args[2]->cn_namelen > 64/ { @[execname] = quantize(args[2]->cn_namelen); }'
174 *
175 * # Who is performing lookups with spurious slashes (e.g., "foo//bar") and what
176 * # path is it
177 * dtrace -n 'fbt::cache_fplookup_skip_slashes:entry { @[execname, stringof(args[0]->cnp->cn_pnbuf)] = count(); }'
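 *
 * # Negative hits by binary name (assuming the usual SDT translation of "__"
 * # in probe names to "-")
 * dtrace -n 'vfs:namecache:lookup:hit-negative { @[execname] = count(); }'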
178 *
179 * V. Limitations and implementation defects
180 *
181 * - since it is possible there is no entry for an open file, tools like
182 * "procstat" may fail to resolve fd -> vnode -> path to anything
183 * - even if a filesystem adds an entry, it may get purged (e.g., due to memory
184 * shortage) in which case the above problem applies
185 * - hardlinks are not tracked, thus if a vnode is reachable in more than one
186 * way, resolving a name may return a different path than the one used to
187 * open it (even if said path is still valid)
188 * - by default entries are not added for newly created files
189 * - adding an entry may need to evict a negative entry first, which happens in 2
190 * distinct places (evicting on lookup, adding in a later VOP) making it
191 * impossible to simply reuse it
192 * - there is a simple scheme to evict negative entries as the cache is approaching
193 * its capacity, but it is very unclear if doing so is a good idea to begin with
194 * - vnodes are subject to being recycled even if the target inode is left in memory,
195 *   which loses the name cache entries when it perhaps should not. In case of tmpfs,
196 *   names get duplicated -- kept by the filesystem itself and by the namecache separately
197 * - struct namecache has a fixed size and comes in 2 variants, often wasting
198 * space. now hard to replace with malloc due to dependence on SMR, which
199 * requires UMA zones to opt in
200 * - lack of better integration with the kernel also turns nullfs into a layered
201 * filesystem instead of something which can take advantage of caching
202 *
203 * Appendix A: where is the time lost, expanding on paragraph III
204 *
205 * While some care went into optimizing lookups, there is still plenty of
206 * performance left on the table, most notably from single-threaded standpoint.
207 * Below is a woefully incomplete list of changes which can help. Ideas are
208 * mostly sketched out, no claim is made all kinks or prerequisites are laid
209 * out.
210 *
211 * Note there is performance lost all over VFS.
212 *
213 * === SMR-only lookup
214 *
215 * For commonly used ops like stat(2), when the terminal vnode *is* cached,
216 * lockless lookup could refrain from refing/locking the found vnode and
217 * instead return while within the SMR section. Then a call to, say,
218 * vop_stat_smr could do the work (or fail with EAGAIN), finally the result
219 * would be validated with seqc not changing. This would be faster
220 * single-threaded as it dodges atomics and would provide full scalability for
221 * multicore uses. This would *not* work for open(2) or other calls which need
222 * the vnode to hang around for the long haul, but would work for the aforementioned
223 * stat(2) as well as access(2), readlink(2), realpathat(2) and probably more.
224 *
225 * === hotpatching for sdt probes
226 *
227 * They result in *tons* of branches all over with rather regrettable codegen
228 * at times. Removing sdt probes altogether gives over 2% boost in lookup rate.
229 * Reworking the code to patch itself at runtime with asm goto would solve it.
230 * asm goto is fully supported by gcc and clang.
231 *
232 * === copyinstr
233 *
234 * On all architectures it operates one byte at a time, while it could be
235 * word-sized instead thanks to the Mycroft trick.
236 *
237 * The API itself is rather pessimal for path lookup, accepting arbitrary sizes and
238 * *optionally* filling in the length parameter.
239 *
240 * Instead a new routine (copyinpath?) could be introduced, demanding a buffer
241 * size which is a multiple of the word size (and never zero), with the length
242 * always returned. On top of it the routine could be allowed to transform the
243 * buffer in arbitrary ways, most notably writing past the found length (not to
244 * be confused with writing past buffer size) -- this would allow word-sized
245 * movs while checking for '\0' later.
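 *
 * For reference, the "Mycroft trick" boils down to a branchless test whether a
 * word contains a zero byte, along the lines of (a 64-bit sketch, not actual
 * kernel code):
 *
 *	#define HASZERO(x)	(((x) - 0x0101010101010101ULL) & ~(x) & \
 *				    0x8080808080808080ULL)
 *
 * letting the copy loop move a word at a time and only inspect individual
 * bytes once the word containing the terminating '\0' is found.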
246 *
247 * === detour through namei
248 *
249 * Currently the lockless lookup suffers from being called from namei, which then has to check if
250 * things worked out locklessly. Instead the lockless lookup could be the
251 * actual entry point which calls what is currently namei as a fallback.
252 *
253 * === avoidable branches in cache_can_fplookup
254 *
255 * The cache_fast_lookup_enabled flag check could be hotpatchable (in fact if
256 * this is off, none of fplookup code should execute).
257 *
258 * Both audit and capsicum branches can be combined into one, but it requires
259 * paying off a lot of tech debt first.
260 *
261 * ni_startdir could be indicated with a flag in cn_flags, eliminating the
262 * branch.
263 *
264 * === mount stacks
265 *
266 * Crossing a mount requires checking if perhaps something is mounted on top.
267 * Instead, an additional entry could be added to struct mount with a pointer
268 * to the final mount on the stack. This would be recalculated on each
269 * mount/unmount.
270 *
271 * === root vnodes
272 *
273 * It could become part of the API contract to *always* have a rootvnode set in
274 * mnt_rootvnode. Such vnodes are annotated with VV_ROOT and vnlru would have
275 * to be modified to always skip them.
276 *
277 * === inactive on v_usecount reaching 0
278 *
279 * VOP_NEED_INACTIVE should not exist. Filesystems would indicate need for such
280 * processing with a bit in usecount.
281 *
282 * === v_holdcnt
283 *
284 * Hold count should probably get eliminated, but one can argue it is a useful
285 * feature. Even if so, handling of v_usecount could be decoupled from it --
286 * vnlru et al would consider the vnode not-freeable if it has either a hold
287 * count or a use count on it.
288 *
289 * This would eliminate 2 atomics.
290 */
291
292 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
293 "Name cache");
294
295 SDT_PROVIDER_DECLARE(vfs);
296 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
297 "struct vnode *");
298 SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *",
299 "struct vnode *");
300 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
301 "char *");
302 SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
303 "const char *");
304 SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
305 "struct namecache *", "int", "int");
306 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
307 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
308 "char *", "struct vnode *");
309 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
310 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
311 "struct vnode *", "char *");
312 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
313 "struct vnode *");
314 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
315 "struct vnode *", "char *");
316 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
317 "char *");
318 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
319 "struct componentname *");
320 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
321 "struct componentname *");
322 SDT_PROBE_DEFINE3(vfs, namecache, purge, done, "struct vnode *", "size_t", "size_t");
323 SDT_PROBE_DEFINE1(vfs, namecache, purge, batch, "int");
324 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
325 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
326 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
327 "struct vnode *");
328 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
329 "char *");
330 SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *",
331 "char *");
332 SDT_PROBE_DEFINE1(vfs, namecache, symlink, alloc__fail, "size_t");
333
334 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
335 SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
336 SDT_PROBE_DECLARE(vfs, namei, lookup, return);
337
338 static char __read_frequently cache_fast_lookup_enabled = true;
339
340 /*
341 * This structure describes the elements in the cache of recent
342 * names looked up by namei.
343 */
344 struct negstate {
345 u_char neg_flag;
346 u_char neg_hit;
347 };
348 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
349 "the state must fit in a union with a pointer without growing it");
350
351 struct namecache {
352 LIST_ENTRY(namecache) nc_src; /* source vnode list */
353 TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */
354 CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
355 struct vnode *nc_dvp; /* vnode of parent of name */
356 union {
357 struct vnode *nu_vp; /* vnode the name refers to */
358 struct negstate nu_neg;/* negative entry state */
359 } n_un;
360 u_char nc_flag; /* flag bits */
361 u_char nc_nlen; /* length of name */
362 char nc_name[]; /* segment name + nul */
363 };
364
365 /*
366 * struct namecache_ts repeats struct namecache layout up to the
367 * nc_nlen member.
368 * struct namecache_ts is used in place of struct namecache when time(s) need
369 * to be stored. The nc_dotdottime field is used when a cache entry is mapping
370 * both a non-dotdot directory name plus dotdot for the directory's
371 * parent.
372 *
373 * See below for alignment requirement.
374 */
375 struct namecache_ts {
376 struct timespec nc_time; /* timespec provided by fs */
377 struct timespec nc_dotdottime; /* dotdot timespec provided by fs */
378 int nc_ticks; /* ticks value when entry was added */
379 int nc_pad;
380 struct namecache nc_nc;
381 };
382
383 TAILQ_HEAD(cache_freebatch, namecache);
384
385 /*
386 * At least mips n32 performs 64-bit accesses to timespec as found
387 * in namecache_ts and requires them to be aligned. Since others
388 * may be in the same spot, suffer a little bit and enforce the
389 * alignment for everyone. Note this is a nop for 64-bit platforms.
390 */
391 #define CACHE_ZONE_ALIGNMENT UMA_ALIGNOF(time_t)
392
393 /*
394 * TODO: the initial value of CACHE_PATH_CUTOFF was inherited from the
395 * 4.4 BSD codebase. Later on struct namecache was tweaked to become
396 * smaller and the value was bumped to retain the total size, but it
397 * was never re-evaluated for suitability. A simple test counting
398 * lengths during package building shows that the value of 45 covers
399 * about 86% of all added entries, reaching 99% at 65.
400 *
401 * Regardless of the above, use of dedicated zones instead of malloc may be
402 * inducing additional waste. This may be hard to address as said zones are
403 * tied to VFS SMR. Even if retaining them, the current split should be
404 * re-evaluated.
405 */
406 #ifdef __LP64__
407 #define CACHE_PATH_CUTOFF 45
408 #define CACHE_LARGE_PAD 6
409 #else
410 #define CACHE_PATH_CUTOFF 41
411 #define CACHE_LARGE_PAD 2
412 #endif
413
414 #define CACHE_ZONE_SMALL_SIZE (offsetof(struct namecache, nc_name) + CACHE_PATH_CUTOFF + 1)
415 #define CACHE_ZONE_SMALL_TS_SIZE (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_SMALL_SIZE)
416 #define CACHE_ZONE_LARGE_SIZE (offsetof(struct namecache, nc_name) + NAME_MAX + 1 + CACHE_LARGE_PAD)
417 #define CACHE_ZONE_LARGE_TS_SIZE (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_LARGE_SIZE)
418
419 _Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
420 _Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
421 _Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
422 _Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
423
424 #define nc_vp n_un.nu_vp
425 #define nc_neg n_un.nu_neg
426
427 /*
428 * Flags in namecache.nc_flag
429 */
430 #define NCF_WHITE 0x01
431 #define NCF_ISDOTDOT 0x02
432 #define NCF_TS 0x04
433 #define NCF_DTS 0x08
434 #define NCF_DVDROP 0x10
435 #define NCF_NEGATIVE 0x20
436 #define NCF_INVALID 0x40
437 #define NCF_WIP 0x80
438
439 /*
440 * Flags in negstate.neg_flag
441 */
442 #define NEG_HOT 0x01
443
444 static bool cache_neg_evict_cond(u_long lnumcache);
445
446 /*
447 * Mark an entry as invalid.
448 *
449 * This is called before it starts getting deconstructed.
450 */
451 static void
cache_ncp_invalidate(struct namecache *ncp)
453 {
454
455 KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
456 ("%s: entry %p already invalid", __func__, ncp));
457 atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
458 atomic_thread_fence_rel();
459 }
460
461 /*
462 * Does this entry match the given directory and name?
463 */
464 static bool
cache_ncp_match(struct namecache *ncp, struct vnode *dvp,
466 struct componentname *cnp)
467 {
468 return (ncp->nc_dvp == dvp &&
469 ncp->nc_nlen == cnp->cn_namelen &&
470 bcmp(ncp->nc_name, cnp->cn_nameptr, cnp->cn_namelen) == 0);
471 }
472
473 /*
474 * Check whether the entry can be safely used.
475 *
476 * All places which elide locks are supposed to call this after they are
477 * done with reading from an entry.
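 *
 * A minimal usage sketch (simplified; real consumers do additional validation
 * and fall back to the locked lookup on failure):
 *
 *	vfs_smr_enter();
 *	ncp = cache_ncp_find(dvp, cnp, hash);
 *	if (ncp != NULL) {
 *		... read the fields of interest ...
 *		if (!cache_ncp_canuse(ncp))
 *			... bail to the locked lookup ...
 *	}
 *	vfs_smr_exit();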
478 */
479 #define cache_ncp_canuse(ncp) ({ \
480 struct namecache *_ncp = (ncp); \
481 u_char _nc_flag; \
482 \
483 atomic_thread_fence_acq(); \
484 _nc_flag = atomic_load_char(&_ncp->nc_flag); \
485 __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP)) == 0); \
486 })
487
488 /*
489 * Like the above but also checks NCF_WHITE.
490 */
491 #define cache_fpl_neg_ncp_canuse(ncp) ({ \
492 struct namecache *_ncp = (ncp); \
493 u_char _nc_flag; \
494 \
495 atomic_thread_fence_acq(); \
496 _nc_flag = atomic_load_char(&_ncp->nc_flag); \
497 __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP | NCF_WHITE)) == 0); \
498 })
499
500 VFS_SMR_DECLARE;
501
502 static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
503 "Name cache parameters");
504
505 static u_int __read_mostly ncsize; /* the size as computed on creation or resizing */
506 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RD, &ncsize, 0,
507 "Total namecache capacity");
508
509 u_int ncsizefactor = 2;
510 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0,
511 "Size factor for namecache");
512
513 static u_long __read_mostly ncnegfactor = 5; /* ratio of negative entries */
514 SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0,
515 "Ratio of negative namecache entries");
516
517 /*
518 * Negative entry % of namecache capacity above which automatic eviction is allowed.
519 *
520 * Check cache_neg_evict_cond for details.
521 */
522 static u_int ncnegminpct = 3;
523
524 static u_int __read_mostly neg_min; /* the above recomputed against ncsize */
525 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0,
526 "Negative entry count above which automatic eviction is allowed");
527
528 /*
529 * Structures associated with name caching.
530 */
531 #define NCHHASH(hash) \
532 (&nchashtbl[(hash) & nchash])
533 static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
534 static u_long __read_mostly nchash; /* size of hash table */
535 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
536 "Size of namecache hash table");
537 static u_long __exclusive_cache_line numneg; /* number of negative entries allocated */
538 static u_long __exclusive_cache_line numcache;/* number of cache entries allocated */
539
540 struct nchstats nchstats; /* cache effectiveness statistics */
541
542 static u_int __exclusive_cache_line neg_cycle;
543
544 #define ncneghash 3
545 #define numneglists (ncneghash + 1)
546
547 struct neglist {
548 struct mtx nl_evict_lock;
549 struct mtx nl_lock __aligned(CACHE_LINE_SIZE);
550 TAILQ_HEAD(, namecache) nl_list;
551 TAILQ_HEAD(, namecache) nl_hotlist;
552 u_long nl_hotnum;
553 } __aligned(CACHE_LINE_SIZE);
554
555 static struct neglist neglists[numneglists];
556
557 static inline struct neglist *
NCP2NEGLIST(struct namecache *ncp)
559 {
560
561 return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
562 }
563
564 static inline struct negstate *
NCP2NEGSTATE(struct namecache *ncp)
566 {
567
568 MPASS(atomic_load_char(&ncp->nc_flag) & NCF_NEGATIVE);
569 return (&ncp->nc_neg);
570 }
571
572 #define numbucketlocks (ncbuckethash + 1)
573 static u_int __read_mostly ncbuckethash;
574 static struct mtx_padalign __read_mostly *bucketlocks;
575 #define HASH2BUCKETLOCK(hash) \
576 ((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)]))
577
578 #define numvnodelocks (ncvnodehash + 1)
579 static u_int __read_mostly ncvnodehash;
580 static struct mtx __read_mostly *vnodelocks;
581 static inline struct mtx *
VP2VNODELOCK(struct vnode *vp)
583 {
584
585 return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
586 }
587
588 /*
589 * Search the hash table for a namecache entry. Either the corresponding bucket
590 * must be locked, or the caller must be in an SMR read section.
591 */
592 static struct namecache *
cache_ncp_find(struct vnode *dvp, struct componentname *cnp, uint32_t hash)
594 {
595 struct namecache *ncp;
596
597 KASSERT(mtx_owned(HASH2BUCKETLOCK(hash)) || VFS_SMR_ENTERED(),
598 ("%s: hash %u not locked", __func__, hash));
599 CK_SLIST_FOREACH(ncp, NCHHASH(hash), nc_hash) {
600 if (cache_ncp_match(ncp, dvp, cnp))
601 break;
602 }
603 return (ncp);
604 }
605
606 static void
cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
608 {
609 struct namecache_ts *ncp_ts;
610
611 KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
612 (tsp == NULL && ticksp == NULL),
613 ("No NCF_TS"));
614
615 if (tsp == NULL)
616 return;
617
618 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
619 *tsp = ncp_ts->nc_time;
620 *ticksp = ncp_ts->nc_ticks;
621 }
622
623 #ifdef DEBUG_CACHE
624 static int __read_mostly doingcache = 1; /* 1 => enable the cache */
625 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
626 "VFS namecache enabled");
627 #endif
628
629 /* Export size information to userland */
630 SYSCTL_SIZEOF_STRUCT(namecache);
631
632 /*
633 * The new name cache statistics
634 */
635 static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
636 "Name cache statistics");
637
638 #define STATNODE_ULONG(name, varname, descr) \
639 SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
640 #define STATNODE_COUNTER(name, varname, descr) \
641 static COUNTER_U64_DEFINE_EARLY(varname); \
642 SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \
643 descr);
644 STATNODE_ULONG(neg, numneg, "Number of negative cache entries");
645 STATNODE_ULONG(count, numcache, "Number of cache entries");
646 STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes held");
647 STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit");
648 STATNODE_COUNTER(miss, nummiss, "Number of cache misses");
649 STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache");
650 STATNODE_COUNTER(poszaps, numposzaps,
651 "Number of cache hits (positive) we do not want to cache");
652 STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)");
653 STATNODE_COUNTER(negzaps, numnegzaps,
654 "Number of cache hits (negative) we do not want to cache");
655 STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)");
656 /* These count for vn_getcwd(), too. */
657 STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls");
658 STATNODE_COUNTER(fullpathfail2, numfullpathfail2,
659 "Number of fullpath search errors (VOP_VPTOCNP failures)");
660 STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
661 STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls");
662 STATNODE_COUNTER(symlinktoobig, symlinktoobig, "Number of times symlink did not fit the cache");
663
664 /*
665 * Debug or developer statistics.
666 */
667 static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
668 "Name cache debugging");
669 #define DEBUGNODE_ULONG(name, varname, descr) \
670 SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
671 static u_long zap_bucket_relock_success;
672 DEBUGNODE_ULONG(zap_bucket_relock_success, zap_bucket_relock_success,
673 "Number of successful removals after relocking");
674 static u_long zap_bucket_fail;
675 DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, "");
676 static u_long zap_bucket_fail2;
677 DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, "");
678 static u_long cache_lock_vnodes_cel_3_failures;
679 DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures,
680 "Number of times 3-way vnode locking failed");
681
682 static void cache_zap_locked(struct namecache *ncp);
683 static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
684 char **retbuf, size_t *buflen, size_t addend);
685 static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf,
686 char **retbuf, size_t *buflen);
687 static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf,
688 char **retbuf, size_t *len, size_t addend);
689
690 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
691
692 static inline void
cache_assert_vlp_locked(struct mtx *vlp)
694 {
695
696 if (vlp != NULL)
697 mtx_assert(vlp, MA_OWNED);
698 }
699
700 static inline void
cache_assert_vnode_locked(struct vnode *vp)
702 {
703 struct mtx *vlp;
704
705 vlp = VP2VNODELOCK(vp);
706 cache_assert_vlp_locked(vlp);
707 }
708
709 /*
710 * Directory vnodes with entries are held for two reasons:
711 * 1. make them less of a target for reclamation in vnlru
712 * 2. suffer a smaller performance penalty in locked lookup as requeueing is avoided
713 *
714 * It will be feasible to stop doing it altogether if all filesystems start
715 * supporting lockless lookup.
716 */
717 static void
cache_hold_vnode(struct vnode *vp)
719 {
720
721 cache_assert_vnode_locked(vp);
722 VNPASS(LIST_EMPTY(&vp->v_cache_src), vp);
723 vhold(vp);
724 counter_u64_add(numcachehv, 1);
725 }
726
727 static void
cache_drop_vnode(struct vnode *vp)
729 {
730
731 /*
732 * Called after all locks are dropped, meaning we can't assert
733 * on the state of v_cache_src.
734 */
735 vdrop(vp);
736 counter_u64_add(numcachehv, -1);
737 }
738
739 /*
740 * UMA zones.
741 */
742 static uma_zone_t __read_mostly cache_zone_small;
743 static uma_zone_t __read_mostly cache_zone_small_ts;
744 static uma_zone_t __read_mostly cache_zone_large;
745 static uma_zone_t __read_mostly cache_zone_large_ts;
746
747 char *
cache_symlink_alloc(size_t size, int flags)
749 {
750
751 if (size < CACHE_ZONE_SMALL_SIZE) {
752 return (uma_zalloc_smr(cache_zone_small, flags));
753 }
754 if (size < CACHE_ZONE_LARGE_SIZE) {
755 return (uma_zalloc_smr(cache_zone_large, flags));
756 }
757 counter_u64_add(symlinktoobig, 1);
758 SDT_PROBE1(vfs, namecache, symlink, alloc__fail, size);
759 return (NULL);
760 }
761
762 void
cache_symlink_free(char *string, size_t size)
764 {
765
766 MPASS(string != NULL);
767 KASSERT(size < CACHE_ZONE_LARGE_SIZE,
768 ("%s: size %zu too big", __func__, size));
769
770 if (size < CACHE_ZONE_SMALL_SIZE) {
771 uma_zfree_smr(cache_zone_small, string);
772 return;
773 }
774 if (size < CACHE_ZONE_LARGE_SIZE) {
775 uma_zfree_smr(cache_zone_large, string);
776 return;
777 }
778 __assert_unreachable();
779 }
780
781 static struct namecache *
cache_alloc_uma(int len, bool ts)
783 {
784 struct namecache_ts *ncp_ts;
785 struct namecache *ncp;
786
787 if (__predict_false(ts)) {
788 if (len <= CACHE_PATH_CUTOFF)
789 ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
790 else
791 ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
792 ncp = &ncp_ts->nc_nc;
793 } else {
794 if (len <= CACHE_PATH_CUTOFF)
795 ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
796 else
797 ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
798 }
799 return (ncp);
800 }
801
802 static void
cache_free_uma(struct namecache *ncp)
804 {
805 struct namecache_ts *ncp_ts;
806
807 if (__predict_false(ncp->nc_flag & NCF_TS)) {
808 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
809 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
810 uma_zfree_smr(cache_zone_small_ts, ncp_ts);
811 else
812 uma_zfree_smr(cache_zone_large_ts, ncp_ts);
813 } else {
814 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
815 uma_zfree_smr(cache_zone_small, ncp);
816 else
817 uma_zfree_smr(cache_zone_large, ncp);
818 }
819 }
820
821 static struct namecache *
cache_alloc(int len, bool ts)
823 {
824 u_long lnumcache;
825
826 /*
827 * Avoid blowout in namecache entries.
828 *
829 * Bugs:
830 * 1. filesystems may end up trying to add an already existing entry
831 * (for example this can happen after a cache miss during concurrent
832 * lookup), in which case we will call cache_neg_evict despite not
833 * adding anything.
834 * 2. the routine may fail to free anything and no provisions are made
835 * to make it try harder (see the inside for failure modes)
836 * 3. it only ever looks at negative entries.
837 */
838 lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
839 if (cache_neg_evict_cond(lnumcache)) {
840 lnumcache = atomic_load_long(&numcache);
841 }
842 if (__predict_false(lnumcache >= ncsize)) {
843 atomic_subtract_long(&numcache, 1);
844 counter_u64_add(numdrops, 1);
845 return (NULL);
846 }
847 return (cache_alloc_uma(len, ts));
848 }
849
850 static void
cache_free(struct namecache *ncp)
852 {
853
854 MPASS(ncp != NULL);
855 if ((ncp->nc_flag & NCF_DVDROP) != 0) {
856 cache_drop_vnode(ncp->nc_dvp);
857 }
858 cache_free_uma(ncp);
859 atomic_subtract_long(&numcache, 1);
860 }
861
862 static void
cache_free_batch(struct cache_freebatch *batch)
864 {
865 struct namecache *ncp, *nnp;
866 int i;
867
868 i = 0;
869 if (TAILQ_EMPTY(batch))
870 goto out;
871 TAILQ_FOREACH_SAFE(ncp, batch, nc_dst, nnp) {
872 if ((ncp->nc_flag & NCF_DVDROP) != 0) {
873 cache_drop_vnode(ncp->nc_dvp);
874 }
875 cache_free_uma(ncp);
876 i++;
877 }
878 atomic_subtract_long(&numcache, i);
879 out:
880 SDT_PROBE1(vfs, namecache, purge, batch, i);
881 }
882
883 /*
884 * Hashing.
885 *
886 * The code was made to use FNV in 2001 and this choice needs to be revisited.
887 *
888 * Short summary of the difficulty:
889 * The longest name which can be inserted is NAME_MAX characters in length (or
890 * 255 at the time of writing this comment), while the majority of names used in
891 * practice are significantly shorter (mostly below 10). More importantly, the
892 * majority of lookups performed find names even shorter than that.
893 *
894 * This poses a problem where hashes which do better than FNV past word size
895 * (or so) tend to come with additional overhead when finalizing the result,
896 * making them noticeably slower for the most commonly used range.
897 *
898 * Consider a path like: /usr/obj/usr/src/sys/amd64/GENERIC/vnode_if.c
899 *
900 * When looking it up the most time consuming part by a large margin (at least
901 * on amd64) is hashing. Replacing FNV with something which pessimizes short
902 * input would make the slowest part stand out even more.
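 *
 * Note the same hash can be computed incrementally with the
 * cache_get_hash_iter* helpers below. Since FNV folds in its input one byte at
 * a time, feeding the bytes by hand is equivalent to hashing the whole buffer
 * at once, e.g. (a sketch):
 *
 *	hash = cache_get_hash_iter_start(dvp);
 *	for (i = 0; i < len; i++)
 *		hash = cache_get_hash_iter(name[i], hash);
 *	hash = cache_get_hash_iter_finish(hash);
 *
 * yields the same value as cache_get_hash(name, len, dvp).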
903 */
904
905 /*
906 * TODO: With the value stored we can do better than computing the hash based
907 * on the address.
908 */
909 static void
cache_prehash(struct vnode *vp)
911 {
912
913 vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
914 }
915
916 static uint32_t
cache_get_hash(char *name, u_char len, struct vnode *dvp)
918 {
919
920 return (fnv_32_buf(name, len, dvp->v_nchash));
921 }
922
923 static uint32_t
cache_get_hash_iter_start(struct vnode *dvp)
925 {
926
927 return (dvp->v_nchash);
928 }
929
930 static uint32_t
cache_get_hash_iter(char c, uint32_t hash)
932 {
933
934 return (fnv_32_buf(&c, 1, hash));
935 }
936
937 static uint32_t
cache_get_hash_iter_finish(uint32_t hash)
939 {
940
941 return (hash);
942 }
943
944 static inline struct nchashhead *
NCP2BUCKET(struct namecache *ncp)
946 {
947 uint32_t hash;
948
949 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
950 return (NCHHASH(hash));
951 }
952
953 static inline struct mtx *
NCP2BUCKETLOCK(struct namecache *ncp)
955 {
956 uint32_t hash;
957
958 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
959 return (HASH2BUCKETLOCK(hash));
960 }
961
962 #ifdef INVARIANTS
963 static void
cache_assert_bucket_locked(struct namecache *ncp)
965 {
966 struct mtx *blp;
967
968 blp = NCP2BUCKETLOCK(ncp);
969 mtx_assert(blp, MA_OWNED);
970 }
971
972 static void
cache_assert_bucket_unlocked(struct namecache *ncp)
974 {
975 struct mtx *blp;
976
977 blp = NCP2BUCKETLOCK(ncp);
978 mtx_assert(blp, MA_NOTOWNED);
979 }
980 #else
981 #define cache_assert_bucket_locked(x) do { } while (0)
982 #define cache_assert_bucket_unlocked(x) do { } while (0)
983 #endif
984
985 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y))
986 static void
_cache_sort_vnodes(void **p1, void **p2)
988 {
989 void *tmp;
990
991 MPASS(*p1 != NULL || *p2 != NULL);
992
993 if (*p1 > *p2) {
994 tmp = *p2;
995 *p2 = *p1;
996 *p1 = tmp;
997 }
998 }
999
1000 static void
cache_lock_all_buckets(void)
1002 {
1003 u_int i;
1004
1005 for (i = 0; i < numbucketlocks; i++)
1006 mtx_lock(&bucketlocks[i]);
1007 }
1008
1009 static void
cache_unlock_all_buckets(void)
1011 {
1012 u_int i;
1013
1014 for (i = 0; i < numbucketlocks; i++)
1015 mtx_unlock(&bucketlocks[i]);
1016 }
1017
1018 static void
cache_lock_all_vnodes(void)
1020 {
1021 u_int i;
1022
1023 for (i = 0; i < numvnodelocks; i++)
1024 mtx_lock(&vnodelocks[i]);
1025 }
1026
1027 static void
cache_unlock_all_vnodes(void)
1029 {
1030 u_int i;
1031
1032 for (i = 0; i < numvnodelocks; i++)
1033 mtx_unlock(&vnodelocks[i]);
1034 }
1035
1036 static int
cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
1038 {
1039
1040 cache_sort_vnodes(&vlp1, &vlp2);
1041
1042 if (vlp1 != NULL) {
1043 if (!mtx_trylock(vlp1))
1044 return (EAGAIN);
1045 }
1046 if (!mtx_trylock(vlp2)) {
1047 if (vlp1 != NULL)
1048 mtx_unlock(vlp1);
1049 return (EAGAIN);
1050 }
1051
1052 return (0);
1053 }
1054
1055 static void
cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
1057 {
1058
1059 MPASS(vlp1 != NULL || vlp2 != NULL);
1060 MPASS(vlp1 <= vlp2);
1061
1062 if (vlp1 != NULL)
1063 mtx_lock(vlp1);
1064 if (vlp2 != NULL)
1065 mtx_lock(vlp2);
1066 }
1067
1068 static void
cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
1070 {
1071
1072 MPASS(vlp1 != NULL || vlp2 != NULL);
1073
1074 if (vlp1 != NULL)
1075 mtx_unlock(vlp1);
1076 if (vlp2 != NULL)
1077 mtx_unlock(vlp2);
1078 }
1079
1080 static int
sysctl_nchstats(SYSCTL_HANDLER_ARGS)
1082 {
1083 struct nchstats snap;
1084
1085 if (req->oldptr == NULL)
1086 return (SYSCTL_OUT(req, 0, sizeof(snap)));
1087
1088 snap = nchstats;
1089 snap.ncs_goodhits = counter_u64_fetch(numposhits);
1090 snap.ncs_neghits = counter_u64_fetch(numneghits);
1091 snap.ncs_badhits = counter_u64_fetch(numposzaps) +
1092 counter_u64_fetch(numnegzaps);
1093 snap.ncs_miss = counter_u64_fetch(nummisszap) +
1094 counter_u64_fetch(nummiss);
1095
1096 return (SYSCTL_OUT(req, &snap, sizeof(snap)));
1097 }
1098 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
1099 CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
1100 "VFS cache effectiveness statistics");
1101
1102 static int
sysctl_hitpct(SYSCTL_HANDLER_ARGS)
1104 {
1105 long poshits, neghits, miss, total;
1106 long pct;
1107
1108 poshits = counter_u64_fetch(numposhits);
1109 neghits = counter_u64_fetch(numneghits);
1110 miss = counter_u64_fetch(nummiss);
1111 total = poshits + neghits + miss;
1112
1113 pct = 0;
1114 if (total != 0)
1115 pct = ((poshits + neghits) * 100) / total;
1116 return (sysctl_handle_int(oidp, 0, pct, req));
1117 }
1118 SYSCTL_PROC(_vfs_cache_stats, OID_AUTO, hitpct,
1119 CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_hitpct,
1120 "I", "Percentage of hits");
1121
1122 static void
cache_recalc_neg_min(void)
1124 {
1125
1126 neg_min = (ncsize * ncnegminpct) / 100;
1127 }
1128
1129 static int
sysctl_negminpct(SYSCTL_HANDLER_ARGS)
1131 {
1132 u_int val;
1133 int error;
1134
1135 val = ncnegminpct;
1136 error = sysctl_handle_int(oidp, &val, 0, req);
1137 if (error != 0 || req->newptr == NULL)
1138 return (error);
1139
1140 if (val == ncnegminpct)
1141 return (0);
1142 if (val < 0 || val > 99)
1143 return (EINVAL);
1144 ncnegminpct = val;
1145 cache_recalc_neg_min();
1146 return (0);
1147 }
1148
1149 SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct,
1150 CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct,
1151 "I", "Negative entry \% of namecache capacity above which automatic eviction is allowed");
1152
1153 #ifdef DEBUG_CACHE
1154 /*
1155 * Grab an atomic snapshot of the name cache hash chain lengths
1156 */
1157 static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
1158 CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
1159 "hash table stats");
1160
1161 static int
sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
1163 {
1164 struct nchashhead *ncpp;
1165 struct namecache *ncp;
1166 int i, error, n_nchash, *cntbuf;
1167
1168 retry:
1169 n_nchash = nchash + 1; /* nchash is max index, not count */
1170 if (req->oldptr == NULL)
1171 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
1172 cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
1173 cache_lock_all_buckets();
1174 if (n_nchash != nchash + 1) {
1175 cache_unlock_all_buckets();
1176 free(cntbuf, M_TEMP);
1177 goto retry;
1178 }
1179 /* Scan hash tables counting entries */
1180 for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
1181 CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
1182 cntbuf[i]++;
1183 cache_unlock_all_buckets();
1184 for (error = 0, i = 0; i < n_nchash; i++)
1185 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
1186 break;
1187 free(cntbuf, M_TEMP);
1188 return (error);
1189 }
1190 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
1191 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
1192 "nchash chain lengths");
1193
1194 static int
sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
1196 {
1197 int error;
1198 struct nchashhead *ncpp;
1199 struct namecache *ncp;
1200 int n_nchash;
1201 int count, maxlength, used, pct;
1202
1203 if (!req->oldptr)
1204 return SYSCTL_OUT(req, 0, 4 * sizeof(int));
1205
1206 cache_lock_all_buckets();
1207 n_nchash = nchash + 1; /* nchash is max index, not count */
1208 used = 0;
1209 maxlength = 0;
1210
1211 /* Scan hash tables for applicable entries */
1212 for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
1213 count = 0;
1214 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
1215 count++;
1216 }
1217 if (count)
1218 used++;
1219 if (maxlength < count)
1220 maxlength = count;
1221 }
1222 n_nchash = nchash + 1;
1223 cache_unlock_all_buckets();
1224 pct = (used * 100) / (n_nchash / 100);
1225 error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
1226 if (error)
1227 return (error);
1228 error = SYSCTL_OUT(req, &used, sizeof(used));
1229 if (error)
1230 return (error);
1231 error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
1232 if (error)
1233 return (error);
1234 error = SYSCTL_OUT(req, &pct, sizeof(pct));
1235 if (error)
1236 return (error);
1237 return (0);
1238 }
1239 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
1240 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
1241 "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
1242 #endif
1243
1244 /*
1245 * Negative entries management
1246 *
1247 * Various workloads create plenty of negative entries and barely use them
1248 * afterwards. Moreover malicious users can keep performing bogus lookups
1249 * adding even more entries. For example "make tinderbox" as of writing this
1250 * comment ends up with 2.6M namecache entries in total, 1.2M of which are
1251 * negative.
1252 *
1253 * As such, a rather aggressive eviction method is needed. The currently
1254 * employed method is a placeholder.
1255 *
1256 * Entries are split over numneglists separate lists, each of which is further
1257 * split into hot and cold entries. Entries get promoted after getting a hit.
1258 * Eviction happens on addition of new entry.
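 *
 * The state of the machinery can be inspected with the counters defined below,
 * for example:
 *
 *	sysctl vfs.cache.neg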
1259 */
1260 static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1261 "Name cache negative entry statistics");
1262
1263 SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0,
1264 "Number of negative cache entries");
1265
1266 static COUNTER_U64_DEFINE_EARLY(neg_created);
1267 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created,
1268 "Number of created negative entries");
1269
1270 static COUNTER_U64_DEFINE_EARLY(neg_evicted);
1271 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted,
1272 "Number of evicted negative entries");
1273
1274 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty);
1275 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD,
1276 &neg_evict_skipped_empty,
1277 "Number of times evicting failed due to lack of entries");
1278
1279 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed);
1280 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD,
1281 &neg_evict_skipped_missed,
1282 "Number of times evicting failed due to target entry disappearing");
1283
1284 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended);
1285 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD,
1286 &neg_evict_skipped_contended,
1287 "Number of times evicting failed due to contention");
1288
1289 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits,
1290 "Number of cache hits (negative)");
1291
1292 static int
sysctl_neg_hot(SYSCTL_HANDLER_ARGS)
1294 {
1295 int i, out;
1296
1297 out = 0;
1298 for (i = 0; i < numneglists; i++)
1299 out += neglists[i].nl_hotnum;
1300
1301 return (SYSCTL_OUT(req, &out, sizeof(out)));
1302 }
1303 SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD |
1304 CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I",
1305 "Number of hot negative entries");
1306
1307 static void
cache_neg_init(struct namecache *ncp)
1309 {
1310 struct negstate *ns;
1311
1312 ncp->nc_flag |= NCF_NEGATIVE;
1313 ns = NCP2NEGSTATE(ncp);
1314 ns->neg_flag = 0;
1315 ns->neg_hit = 0;
1316 counter_u64_add(neg_created, 1);
1317 }
1318
1319 #define CACHE_NEG_PROMOTION_THRESH 2
1320
1321 static bool
cache_neg_hit_prep(struct namecache *ncp)
1323 {
1324 struct negstate *ns;
1325 u_char n;
1326
1327 ns = NCP2NEGSTATE(ncp);
1328 n = atomic_load_char(&ns->neg_hit);
1329 for (;;) {
1330 if (n >= CACHE_NEG_PROMOTION_THRESH)
1331 return (false);
1332 if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1))
1333 break;
1334 }
1335 return (n + 1 == CACHE_NEG_PROMOTION_THRESH);
1336 }
1337
1338 /*
1339 * Nothing to do here but it is provided for completeness as some
1340 * cache_neg_hit_prep callers may end up returning without even
1341 * trying to promote.
1342 */
1343 #define cache_neg_hit_abort(ncp) do { } while (0)
1344
1345 static void
cache_neg_hit_finish(struct namecache *ncp)
1347 {
1348
1349 SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name);
1350 counter_u64_add(numneghits, 1);
1351 }
1352
1353 /*
1354 * Move a negative entry to the hot list.
1355 */
1356 static void
cache_neg_promote_locked(struct namecache *ncp)
1358 {
1359 struct neglist *nl;
1360 struct negstate *ns;
1361
1362 ns = NCP2NEGSTATE(ncp);
1363 nl = NCP2NEGLIST(ncp);
1364 mtx_assert(&nl->nl_lock, MA_OWNED);
1365 if ((ns->neg_flag & NEG_HOT) == 0) {
1366 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
1367 TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst);
1368 nl->nl_hotnum++;
1369 ns->neg_flag |= NEG_HOT;
1370 }
1371 }
1372
1373 /*
1374 * Move a hot negative entry to the cold list.
1375 */
1376 static void
cache_neg_demote_locked(struct namecache *ncp)
1378 {
1379 struct neglist *nl;
1380 struct negstate *ns;
1381
1382 ns = NCP2NEGSTATE(ncp);
1383 nl = NCP2NEGLIST(ncp);
1384 mtx_assert(&nl->nl_lock, MA_OWNED);
1385 MPASS(ns->neg_flag & NEG_HOT);
1386 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
1387 TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
1388 nl->nl_hotnum--;
1389 ns->neg_flag &= ~NEG_HOT;
1390 atomic_store_char(&ns->neg_hit, 0);
1391 }
1392
1393 /*
1394 * Move a negative entry to the hot list if it matches the lookup.
1395 *
1396 * We have to take locks, but they may be contended and in the worst
1397 * case we may need to go off CPU. We don't want to spin within the
1398 * smr section and we can't block with it. Exiting the section means
1399 * the found entry could have been evicted. We are going to look it
1400 * up again.
1401 */
1402 static bool
cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp,
1404 struct namecache *oncp, uint32_t hash)
1405 {
1406 struct namecache *ncp;
1407 struct neglist *nl;
1408 u_char nc_flag;
1409
1410 nl = NCP2NEGLIST(oncp);
1411
1412 mtx_lock(&nl->nl_lock);
1413 /*
1414 * For hash iteration.
1415 */
1416 vfs_smr_enter();
1417
1418 /*
1419 * Avoid all surprises by only succeeding if we got the same entry and
1420 * bailing completely otherwise.
1421 * XXX There are no provisions to keep the vnode around, meaning we may
1422 * end up promoting a negative entry for a *new* vnode and returning
1423 * ENOENT on its account. This is the error we want to return anyway
1424 * and promotion is harmless.
1425 *
1426 * In particular at this point there can be a new ncp which matches the
1427 * search but hashes to a different neglist.
1428 */
1429 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1430 if (ncp == oncp)
1431 break;
1432 }
1433
1434 /*
1435 * No match to begin with.
1436 */
1437 if (__predict_false(ncp == NULL)) {
1438 goto out_abort;
1439 }
1440
1441 /*
1442 * The newly found entry may be something different...
1443 */
1444 if (!cache_ncp_match(ncp, dvp, cnp)) {
1445 goto out_abort;
1446 }
1447
1448 /*
1449 * ... and not even negative.
1450 */
1451 nc_flag = atomic_load_char(&ncp->nc_flag);
1452 if ((nc_flag & NCF_NEGATIVE) == 0) {
1453 goto out_abort;
1454 }
1455
1456 if (!cache_ncp_canuse(ncp)) {
1457 goto out_abort;
1458 }
1459
1460 cache_neg_promote_locked(ncp);
1461 cache_neg_hit_finish(ncp);
1462 vfs_smr_exit();
1463 mtx_unlock(&nl->nl_lock);
1464 return (true);
1465 out_abort:
1466 vfs_smr_exit();
1467 mtx_unlock(&nl->nl_lock);
1468 return (false);
1469 }
1470
1471 static void
cache_neg_promote(struct namecache *ncp)
1473 {
1474 struct neglist *nl;
1475
1476 nl = NCP2NEGLIST(ncp);
1477 mtx_lock(&nl->nl_lock);
1478 cache_neg_promote_locked(ncp);
1479 mtx_unlock(&nl->nl_lock);
1480 }
1481
1482 static void
cache_neg_insert(struct namecache *ncp)
1484 {
1485 struct neglist *nl;
1486
1487 MPASS(ncp->nc_flag & NCF_NEGATIVE);
1488 cache_assert_bucket_locked(ncp);
1489 nl = NCP2NEGLIST(ncp);
1490 mtx_lock(&nl->nl_lock);
1491 TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
1492 mtx_unlock(&nl->nl_lock);
1493 atomic_add_long(&numneg, 1);
1494 }
1495
1496 static void
cache_neg_remove(struct namecache *ncp)
1498 {
1499 struct neglist *nl;
1500 struct negstate *ns;
1501
1502 cache_assert_bucket_locked(ncp);
1503 nl = NCP2NEGLIST(ncp);
1504 ns = NCP2NEGSTATE(ncp);
1505 mtx_lock(&nl->nl_lock);
1506 if ((ns->neg_flag & NEG_HOT) != 0) {
1507 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
1508 nl->nl_hotnum--;
1509 } else {
1510 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
1511 }
1512 mtx_unlock(&nl->nl_lock);
1513 atomic_subtract_long(&numneg, 1);
1514 }
1515
1516 static struct neglist *
cache_neg_evict_select_list(void)
1518 {
1519 struct neglist *nl;
1520 u_int c;
1521
1522 c = atomic_fetchadd_int(&neg_cycle, 1) + 1;
1523 nl = &neglists[c % numneglists];
1524 if (!mtx_trylock(&nl->nl_evict_lock)) {
1525 counter_u64_add(neg_evict_skipped_contended, 1);
1526 return (NULL);
1527 }
1528 return (nl);
1529 }
1530
1531 static struct namecache *
cache_neg_evict_select_entry(struct neglist *nl)
1533 {
1534 struct namecache *ncp, *lncp;
1535 struct negstate *ns, *lns;
1536 int i;
1537
1538 mtx_assert(&nl->nl_evict_lock, MA_OWNED);
1539 mtx_assert(&nl->nl_lock, MA_OWNED);
1540 ncp = TAILQ_FIRST(&nl->nl_list);
1541 if (ncp == NULL)
1542 return (NULL);
1543 lncp = ncp;
1544 lns = NCP2NEGSTATE(lncp);
1545 for (i = 1; i < 4; i++) {
1546 ncp = TAILQ_NEXT(ncp, nc_dst);
1547 if (ncp == NULL)
1548 break;
1549 ns = NCP2NEGSTATE(ncp);
1550 if (ns->neg_hit < lns->neg_hit) {
1551 lncp = ncp;
1552 lns = ns;
1553 }
1554 }
1555 return (lncp);
1556 }
1557
1558 static bool
cache_neg_evict(void)
1560 {
1561 struct namecache *ncp, *ncp2;
1562 struct neglist *nl;
1563 struct vnode *dvp;
1564 struct mtx *dvlp;
1565 struct mtx *blp;
1566 uint32_t hash;
1567 u_char nlen;
1568 bool evicted;
1569
1570 nl = cache_neg_evict_select_list();
1571 if (nl == NULL) {
1572 return (false);
1573 }
1574
1575 mtx_lock(&nl->nl_lock);
1576 ncp = TAILQ_FIRST(&nl->nl_hotlist);
1577 if (ncp != NULL) {
1578 cache_neg_demote_locked(ncp);
1579 }
1580 ncp = cache_neg_evict_select_entry(nl);
1581 if (ncp == NULL) {
1582 counter_u64_add(neg_evict_skipped_empty, 1);
1583 mtx_unlock(&nl->nl_lock);
1584 mtx_unlock(&nl->nl_evict_lock);
1585 return (false);
1586 }
1587 nlen = ncp->nc_nlen;
1588 dvp = ncp->nc_dvp;
1589 hash = cache_get_hash(ncp->nc_name, nlen, dvp);
1590 dvlp = VP2VNODELOCK(dvp);
1591 blp = HASH2BUCKETLOCK(hash);
1592 mtx_unlock(&nl->nl_lock);
1593 mtx_unlock(&nl->nl_evict_lock);
1594 mtx_lock(dvlp);
1595 mtx_lock(blp);
1596 /*
1597 * Note that since all locks were dropped above, the entry may be
1598 * gone or reallocated to be something else.
1599 */
1600 CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) {
1601 if (ncp2 == ncp && ncp2->nc_dvp == dvp &&
1602 ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0)
1603 break;
1604 }
1605 if (ncp2 == NULL) {
1606 counter_u64_add(neg_evict_skipped_missed, 1);
1607 ncp = NULL;
1608 evicted = false;
1609 } else {
1610 MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp));
1611 MPASS(blp == NCP2BUCKETLOCK(ncp));
1612 SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp,
1613 ncp->nc_name);
1614 cache_zap_locked(ncp);
1615 counter_u64_add(neg_evicted, 1);
1616 evicted = true;
1617 }
1618 mtx_unlock(blp);
1619 mtx_unlock(dvlp);
1620 if (ncp != NULL)
1621 cache_free(ncp);
1622 return (evicted);
1623 }
1624
1625 /*
1626 * Maybe evict a negative entry to create more room.
1627 *
1628 * The ncnegfactor parameter limits what fraction of the total count
1629 * negative entries can comprise. However, if the cache is just
1630 * warming up this leads to excessive evictions. As such, ncnegminpct
1631 * (recomputed to neg_min) dictates whether the above should be
1632 * applied.
1633 *
1634 * Try evicting if the cache is close to full capacity regardless of
1635 * other considerations.
1636 */
1637 static bool
1638 cache_neg_evict_cond(u_long lnumcache)
1639 {
1640 u_long lnumneg;
1641
1642 if (ncsize - 1000 < lnumcache)
1643 goto out_evict;
1644 lnumneg = atomic_load_long(&numneg);
1645 if (lnumneg < neg_min)
1646 return (false);
1647 if (lnumneg * ncnegfactor < lnumcache)
1648 return (false);
1649 out_evict:
1650 return (cache_neg_evict());
1651 }
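
/*
 * Illustrative arithmetic for the check above; a sketch only, not part of
 * the original source, and the tunable value used here is an assumption.
 * With ncnegfactor = 5 and 100000 cached entries in total, eviction is
 * considered once lnumneg * 5 >= 100000, i.e. once more than 20000 entries
 * are negative. Below neg_min (derived from ncnegminpct) the check is
 * skipped entirely, and once the cache is within 1000 entries of ncsize an
 * eviction attempt is made regardless of the ratio.
 */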
1652
1653 /*
1654 * cache_zap_locked():
1655 *
1656 * Removes a namecache entry from cache, whether it contains an actual
1657 * pointer to a vnode or if it is just a negative cache entry.
1658 */
1659 static void
1660 cache_zap_locked(struct namecache *ncp)
1661 {
1662 struct nchashhead *ncpp;
1663 struct vnode *dvp, *vp;
1664
1665 dvp = ncp->nc_dvp;
1666 vp = ncp->nc_vp;
1667
1668 if (!(ncp->nc_flag & NCF_NEGATIVE))
1669 cache_assert_vnode_locked(vp);
1670 cache_assert_vnode_locked(dvp);
1671 cache_assert_bucket_locked(ncp);
1672
1673 cache_ncp_invalidate(ncp);
1674
1675 ncpp = NCP2BUCKET(ncp);
1676 CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
1677 if (!(ncp->nc_flag & NCF_NEGATIVE)) {
1678 SDT_PROBE3(vfs, namecache, zap, done, dvp, ncp->nc_name, vp);
1679 TAILQ_REMOVE(&vp->v_cache_dst, ncp, nc_dst);
1680 if (ncp == vp->v_cache_dd) {
1681 atomic_store_ptr(&vp->v_cache_dd, NULL);
1682 }
1683 } else {
1684 SDT_PROBE2(vfs, namecache, zap_negative, done, dvp, ncp->nc_name);
1685 cache_neg_remove(ncp);
1686 }
1687 if (ncp->nc_flag & NCF_ISDOTDOT) {
1688 if (ncp == dvp->v_cache_dd) {
1689 atomic_store_ptr(&dvp->v_cache_dd, NULL);
1690 }
1691 } else {
1692 LIST_REMOVE(ncp, nc_src);
1693 if (LIST_EMPTY(&dvp->v_cache_src)) {
1694 ncp->nc_flag |= NCF_DVDROP;
1695 }
1696 }
1697 }
1698
1699 static void
1700 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
1701 {
1702 struct mtx *blp;
1703
1704 MPASS(ncp->nc_dvp == vp);
1705 MPASS(ncp->nc_flag & NCF_NEGATIVE);
1706 cache_assert_vnode_locked(vp);
1707
1708 blp = NCP2BUCKETLOCK(ncp);
1709 mtx_lock(blp);
1710 cache_zap_locked(ncp);
1711 mtx_unlock(blp);
1712 }
1713
1714 static bool
1715 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
1716 struct mtx **vlpp)
1717 {
1718 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
1719 struct mtx *blp;
1720
1721 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
1722 cache_assert_vnode_locked(vp);
1723
1724 if (ncp->nc_flag & NCF_NEGATIVE) {
1725 if (*vlpp != NULL) {
1726 mtx_unlock(*vlpp);
1727 *vlpp = NULL;
1728 }
1729 cache_zap_negative_locked_vnode_kl(ncp, vp);
1730 return (true);
1731 }
1732
1733 pvlp = VP2VNODELOCK(vp);
1734 blp = NCP2BUCKETLOCK(ncp);
1735 vlp1 = VP2VNODELOCK(ncp->nc_dvp);
1736 vlp2 = VP2VNODELOCK(ncp->nc_vp);
1737
1738 if (*vlpp == vlp1 || *vlpp == vlp2) {
1739 to_unlock = *vlpp;
1740 *vlpp = NULL;
1741 } else {
1742 if (*vlpp != NULL) {
1743 mtx_unlock(*vlpp);
1744 *vlpp = NULL;
1745 }
1746 cache_sort_vnodes(&vlp1, &vlp2);
1747 if (vlp1 == pvlp) {
1748 mtx_lock(vlp2);
1749 to_unlock = vlp2;
1750 } else {
1751 if (!mtx_trylock(vlp1))
1752 goto out_relock;
1753 to_unlock = vlp1;
1754 }
1755 }
1756 mtx_lock(blp);
1757 cache_zap_locked(ncp);
1758 mtx_unlock(blp);
1759 if (to_unlock != NULL)
1760 mtx_unlock(to_unlock);
1761 return (true);
1762
1763 out_relock:
1764 mtx_unlock(vlp2);
1765 mtx_lock(vlp1);
1766 mtx_lock(vlp2);
1767 MPASS(*vlpp == NULL);
1768 *vlpp = vlp1;
1769 return (false);
1770 }
1771
1772 /*
1773 * If trylocking failed, we can get here. We know enough to take all needed locks
1774 * in the right order and re-lookup the entry.
1775 */
1776 static int
1777 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1778 struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
1779 struct mtx *blp)
1780 {
1781 struct namecache *rncp;
1782 struct mtx *rvlp;
1783
1784 cache_assert_bucket_unlocked(ncp);
1785
1786 cache_sort_vnodes(&dvlp, &vlp);
1787 cache_lock_vnodes(dvlp, vlp);
1788 mtx_lock(blp);
1789 CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
1790 if (rncp == ncp && cache_ncp_match(rncp, dvp, cnp))
1791 break;
1792 }
1793 if (rncp == NULL)
1794 goto out_mismatch;
1795
1796 if (!(ncp->nc_flag & NCF_NEGATIVE))
1797 rvlp = VP2VNODELOCK(rncp->nc_vp);
1798 else
1799 rvlp = NULL;
1800 if (rvlp != vlp)
1801 goto out_mismatch;
1802
1803 cache_zap_locked(rncp);
1804 mtx_unlock(blp);
1805 cache_unlock_vnodes(dvlp, vlp);
1806 atomic_add_long(&zap_bucket_relock_success, 1);
1807 return (0);
1808
1809 out_mismatch:
1810 mtx_unlock(blp);
1811 cache_unlock_vnodes(dvlp, vlp);
1812 return (EAGAIN);
1813 }
1814
1815 static int __noinline
1816 cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp,
1817 uint32_t hash, struct mtx *blp)
1818 {
1819 struct mtx *dvlp, *vlp;
1820 struct vnode *dvp;
1821
1822 cache_assert_bucket_locked(ncp);
1823
1824 dvlp = VP2VNODELOCK(ncp->nc_dvp);
1825 vlp = NULL;
1826 if (!(ncp->nc_flag & NCF_NEGATIVE))
1827 vlp = VP2VNODELOCK(ncp->nc_vp);
1828 if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1829 cache_zap_locked(ncp);
1830 mtx_unlock(blp);
1831 cache_unlock_vnodes(dvlp, vlp);
1832 return (0);
1833 }
1834
1835 dvp = ncp->nc_dvp;
1836 mtx_unlock(blp);
1837 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1838 }
1839
1840 static __noinline int
1841 cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
1842 {
1843 struct namecache *ncp;
1844 struct mtx *blp;
1845 struct mtx *dvlp, *dvlp2;
1846 uint32_t hash;
1847 int error;
1848
1849 if (cnp->cn_namelen == 2 &&
1850 cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
1851 dvlp = VP2VNODELOCK(dvp);
1852 dvlp2 = NULL;
1853 mtx_lock(dvlp);
1854 retry_dotdot:
1855 ncp = dvp->v_cache_dd;
1856 if (ncp == NULL) {
1857 mtx_unlock(dvlp);
1858 if (dvlp2 != NULL)
1859 mtx_unlock(dvlp2);
1860 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1861 return (0);
1862 }
1863 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1864 if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2))
1865 goto retry_dotdot;
1866 MPASS(dvp->v_cache_dd == NULL);
1867 mtx_unlock(dvlp);
1868 if (dvlp2 != NULL)
1869 mtx_unlock(dvlp2);
1870 cache_free(ncp);
1871 } else {
1872 atomic_store_ptr(&dvp->v_cache_dd, NULL);
1873 mtx_unlock(dvlp);
1874 if (dvlp2 != NULL)
1875 mtx_unlock(dvlp2);
1876 }
1877 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1878 return (1);
1879 }
1880
1881 /*
1882 * XXX note that access here is completely unlocked with no provisions
1883 * to keep the hash allocated. If one is sufficiently unlucky a
1884 * parallel cache resize can reallocate the hash, unmap backing pages
1885 * and cause the empty check below to fault.
1886 *
1887 * Fixing this has epsilon priority, but can be done with no overhead
1888 * for this codepath with sufficient effort.
1889 */
1890 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1891 blp = HASH2BUCKETLOCK(hash);
1892 retry:
1893 if (CK_SLIST_EMPTY(NCHHASH(hash)))
1894 goto out_no_entry;
1895
1896 mtx_lock(blp);
1897 ncp = cache_ncp_find(dvp, cnp, hash);
1898 if (ncp == NULL) {
1899 mtx_unlock(blp);
1900 goto out_no_entry;
1901 }
1902
1903 error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
1904 if (__predict_false(error != 0)) {
1905 atomic_add_long(&zap_bucket_fail, 1);
1906 goto retry;
1907 }
1908 counter_u64_add(numposzaps, 1);
1909 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1910 cache_free(ncp);
1911 return (1);
1912 out_no_entry:
1913 counter_u64_add(nummisszap, 1);
1914 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1915 return (0);
1916 }
1917
1918 static int __noinline
1919 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1920 struct timespec *tsp, int *ticksp)
1921 {
1922 int ltype;
1923
1924 *vpp = dvp;
1925 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
1926 if (tsp != NULL)
1927 timespecclear(tsp);
1928 if (ticksp != NULL)
1929 *ticksp = ticks;
1930 vrefact(*vpp);
1931 /*
1932 * When we lookup "." we still can be asked to lock it
1933 * differently...
1934 */
1935 ltype = cnp->cn_lkflags & LK_TYPE_MASK;
1936 if (ltype != VOP_ISLOCKED(*vpp)) {
1937 if (ltype == LK_EXCLUSIVE) {
1938 vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
1939 if (VN_IS_DOOMED((*vpp))) {
1940 /* forced unmount */
1941 vrele(*vpp);
1942 *vpp = NULL;
1943 return (ENOENT);
1944 }
1945 } else
1946 vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
1947 }
1948 return (-1);
1949 }
1950
1951 static int __noinline
1952 cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1953 struct timespec *tsp, int *ticksp)
1954 {
1955 struct namecache_ts *ncp_ts;
1956 struct namecache *ncp;
1957 struct mtx *dvlp;
1958 enum vgetstate vs;
1959 int error, ltype;
1960 bool whiteout;
1961
1962 MPASS((cnp->cn_flags & ISDOTDOT) != 0);
1963
1964 if ((cnp->cn_flags & MAKEENTRY) == 0) {
1965 cache_remove_cnp(dvp, cnp);
1966 return (0);
1967 }
1968
1969 retry:
1970 dvlp = VP2VNODELOCK(dvp);
1971 mtx_lock(dvlp);
1972 ncp = dvp->v_cache_dd;
1973 if (ncp == NULL) {
1974 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, "..");
1975 mtx_unlock(dvlp);
1976 return (0);
1977 }
1978 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1979 if (ncp->nc_flag & NCF_NEGATIVE)
1980 *vpp = NULL;
1981 else
1982 *vpp = ncp->nc_vp;
1983 } else
1984 *vpp = ncp->nc_dvp;
1985 if (*vpp == NULL)
1986 goto negative_success;
1987 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp);
1988 cache_out_ts(ncp, tsp, ticksp);
1989 if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
1990 NCF_DTS && tsp != NULL) {
1991 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1992 *tsp = ncp_ts->nc_dotdottime;
1993 }
1994
1995 MPASS(dvp != *vpp);
1996 ltype = VOP_ISLOCKED(dvp);
1997 VOP_UNLOCK(dvp);
1998 vs = vget_prep(*vpp);
1999 mtx_unlock(dvlp);
2000 error = vget_finish(*vpp, cnp->cn_lkflags, vs);
2001 vn_lock(dvp, ltype | LK_RETRY);
2002 if (VN_IS_DOOMED(dvp)) {
2003 if (error == 0)
2004 vput(*vpp);
2005 *vpp = NULL;
2006 return (ENOENT);
2007 }
2008 if (error) {
2009 *vpp = NULL;
2010 goto retry;
2011 }
2012 return (-1);
2013 negative_success:
2014 if (__predict_false(cnp->cn_nameiop == CREATE)) {
2015 if (cnp->cn_flags & ISLASTCN) {
2016 counter_u64_add(numnegzaps, 1);
2017 cache_zap_negative_locked_vnode_kl(ncp, dvp);
2018 mtx_unlock(dvlp);
2019 cache_free(ncp);
2020 return (0);
2021 }
2022 }
2023
2024 whiteout = (ncp->nc_flag & NCF_WHITE);
2025 cache_out_ts(ncp, tsp, ticksp);
2026 if (cache_neg_hit_prep(ncp))
2027 cache_neg_promote(ncp);
2028 else
2029 cache_neg_hit_finish(ncp);
2030 mtx_unlock(dvlp);
2031 if (whiteout)
2032 cnp->cn_flags |= ISWHITEOUT;
2033 return (ENOENT);
2034 }
2035
2036 /**
2037 * Lookup a name in the name cache
2038 *
2039 * # Arguments
2040 *
2041 * - dvp: Parent directory in which to search.
2042 * - vpp: Return argument. Will contain desired vnode on cache hit.
2043 * - cnp: Parameters of the name search. The most interesting bits of
2044 * the cn_flags field have the following meanings:
2045 * - MAKEENTRY: If clear, free an entry from the cache rather than look
2046 * it up.
2047 * - ISDOTDOT: Must be set if and only if cn_nameptr == ".."
2048 * - tsp: Return storage for cache timestamp. On a successful (positive
2049 * or negative) lookup, tsp will be filled with any timespec that
2050 * was stored when this cache entry was created. However, it will
2051 * be clear for "." entries.
2052 * - ticks: Return storage for alternate cache timestamp. On a successful
2053 * (positive or negative) lookup, it will contain the ticks value
2054 * that was current when the cache entry was created, unless cnp
2055 * was ".".
2056 *
2057 * Either both tsp and ticks have to be provided or neither of them.
2058 *
2059 * # Returns
2060 *
2061 * - -1: A positive cache hit. vpp will contain the desired vnode.
2062 * - ENOENT: A negative cache hit, or dvp was recycled out from under us due
2063 * to a forced unmount. vpp will not be modified. If the entry
2064 * is a whiteout, then the ISWHITEOUT flag will be set in
2065 * cnp->cn_flags.
2066 * - 0: A cache miss. vpp will not be modified.
2067 *
2068 * # Locking
2069 *
2070 * On a cache hit, vpp will be returned locked and ref'd. If we're looking up
2071 * .., dvp is unlocked. If we're looking up . an extra ref is taken, but the
2072 * lock is not recursively acquired.
2073 */
2074 static int __noinline
2075 cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
2076 struct timespec *tsp, int *ticksp)
2077 {
2078 struct namecache *ncp;
2079 struct mtx *blp;
2080 uint32_t hash;
2081 enum vgetstate vs;
2082 int error;
2083 bool whiteout;
2084
2085 MPASS((cnp->cn_flags & ISDOTDOT) == 0);
2086 MPASS((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) != 0);
2087
2088 retry:
2089 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
2090 blp = HASH2BUCKETLOCK(hash);
2091 mtx_lock(blp);
2092
2093 ncp = cache_ncp_find(dvp, cnp, hash);
2094 if (__predict_false(ncp == NULL)) {
2095 mtx_unlock(blp);
2096 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr);
2097 counter_u64_add(nummiss, 1);
2098 return (0);
2099 }
2100
2101 if (ncp->nc_flag & NCF_NEGATIVE)
2102 goto negative_success;
2103
2104 counter_u64_add(numposhits, 1);
2105 *vpp = ncp->nc_vp;
2106 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
2107 cache_out_ts(ncp, tsp, ticksp);
2108 MPASS(dvp != *vpp);
2109 vs = vget_prep(*vpp);
2110 mtx_unlock(blp);
2111 error = vget_finish(*vpp, cnp->cn_lkflags, vs);
2112 if (error) {
2113 *vpp = NULL;
2114 goto retry;
2115 }
2116 return (-1);
2117 negative_success:
2118 /*
2119 * We don't get here with regular lookup apart from corner cases.
2120 */
2121 if (__predict_true(cnp->cn_nameiop == CREATE)) {
2122 if (cnp->cn_flags & ISLASTCN) {
2123 counter_u64_add(numnegzaps, 1);
2124 error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
2125 if (__predict_false(error != 0)) {
2126 atomic_add_long(&zap_bucket_fail2, 1);
2127 goto retry;
2128 }
2129 cache_free(ncp);
2130 return (0);
2131 }
2132 }
2133
2134 whiteout = (ncp->nc_flag & NCF_WHITE);
2135 cache_out_ts(ncp, tsp, ticksp);
2136 if (cache_neg_hit_prep(ncp))
2137 cache_neg_promote(ncp);
2138 else
2139 cache_neg_hit_finish(ncp);
2140 mtx_unlock(blp);
2141 if (whiteout)
2142 cnp->cn_flags |= ISWHITEOUT;
2143 return (ENOENT);
2144 }
2145
2146 int
2147 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
2148 struct timespec *tsp, int *ticksp)
2149 {
2150 struct namecache *ncp;
2151 uint32_t hash;
2152 enum vgetstate vs;
2153 int error;
2154 bool whiteout, neg_promote;
2155 u_short nc_flag;
2156
2157 MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL));
2158
2159 #ifdef DEBUG_CACHE
2160 if (__predict_false(!doingcache)) {
2161 cnp->cn_flags &= ~MAKEENTRY;
2162 return (0);
2163 }
2164 #endif
2165
2166 if (__predict_false(cnp->cn_nameptr[0] == '.')) {
2167 if (cnp->cn_namelen == 1)
2168 return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
2169 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.')
2170 return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp));
2171 }
2172
2173 MPASS((cnp->cn_flags & ISDOTDOT) == 0);
2174
2175 if ((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) == 0) {
2176 cache_remove_cnp(dvp, cnp);
2177 return (0);
2178 }
2179
2180 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
2181 vfs_smr_enter();
2182
2183 ncp = cache_ncp_find(dvp, cnp, hash);
2184 if (__predict_false(ncp == NULL)) {
2185 vfs_smr_exit();
2186 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr);
2187 counter_u64_add(nummiss, 1);
2188 return (0);
2189 }
2190
2191 nc_flag = atomic_load_char(&ncp->nc_flag);
2192 if (nc_flag & NCF_NEGATIVE)
2193 goto negative_success;
2194
2195 counter_u64_add(numposhits, 1);
2196 *vpp = ncp->nc_vp;
2197 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
2198 cache_out_ts(ncp, tsp, ticksp);
2199 MPASS(dvp != *vpp);
2200 if (!cache_ncp_canuse(ncp)) {
2201 vfs_smr_exit();
2202 *vpp = NULL;
2203 goto out_fallback;
2204 }
2205 vs = vget_prep_smr(*vpp);
2206 vfs_smr_exit();
2207 if (__predict_false(vs == VGET_NONE)) {
2208 *vpp = NULL;
2209 goto out_fallback;
2210 }
2211 error = vget_finish(*vpp, cnp->cn_lkflags, vs);
2212 if (error) {
2213 *vpp = NULL;
2214 goto out_fallback;
2215 }
2216 return (-1);
2217 negative_success:
2218 if (cnp->cn_nameiop == CREATE) {
2219 if (cnp->cn_flags & ISLASTCN) {
2220 vfs_smr_exit();
2221 goto out_fallback;
2222 }
2223 }
2224
2225 cache_out_ts(ncp, tsp, ticksp);
2226 whiteout = (atomic_load_char(&ncp->nc_flag) & NCF_WHITE);
2227 neg_promote = cache_neg_hit_prep(ncp);
2228 if (!cache_ncp_canuse(ncp)) {
2229 cache_neg_hit_abort(ncp);
2230 vfs_smr_exit();
2231 goto out_fallback;
2232 }
2233 if (neg_promote) {
2234 vfs_smr_exit();
2235 if (!cache_neg_promote_cond(dvp, cnp, ncp, hash))
2236 goto out_fallback;
2237 } else {
2238 cache_neg_hit_finish(ncp);
2239 vfs_smr_exit();
2240 }
2241 if (whiteout)
2242 cnp->cn_flags |= ISWHITEOUT;
2243 return (ENOENT);
2244 out_fallback:
2245 return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp));
2246 }
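
/*
 * Illustrative caller sketch, not part of the original source: the return
 * protocol documented above is typically consumed along these lines, with
 * vfs_cache_lookup() further down being the canonical in-tree example.
 */
#if 0
	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
	switch (error) {
	case -1:
		/* Positive hit, *vpp is referenced and locked. */
		error = 0;
		break;
	case 0:
		/* Miss, fall back to the filesystem's own lookup. */
		error = VOP_CACHEDLOOKUP(dvp, vpp, cnp);
		break;
	case ENOENT:
	default:
		/* Negative hit (or doomed dvp), pass the error up. */
		break;
	}
#endif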
2247
2248 struct celockstate {
2249 struct mtx *vlp[3];
2250 struct mtx *blp[2];
2251 };
2252 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
2253 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
2254
2255 static inline void
2256 cache_celockstate_init(struct celockstate *cel)
2257 {
2258
2259 bzero(cel, sizeof(*cel));
2260 }
2261
2262 static void
2263 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
2264 struct vnode *dvp)
2265 {
2266 struct mtx *vlp1, *vlp2;
2267
2268 MPASS(cel->vlp[0] == NULL);
2269 MPASS(cel->vlp[1] == NULL);
2270 MPASS(cel->vlp[2] == NULL);
2271
2272 MPASS(vp != NULL || dvp != NULL);
2273
2274 vlp1 = VP2VNODELOCK(vp);
2275 vlp2 = VP2VNODELOCK(dvp);
2276 cache_sort_vnodes(&vlp1, &vlp2);
2277
2278 if (vlp1 != NULL) {
2279 mtx_lock(vlp1);
2280 cel->vlp[0] = vlp1;
2281 }
2282 mtx_lock(vlp2);
2283 cel->vlp[1] = vlp2;
2284 }
2285
2286 static void
2287 cache_unlock_vnodes_cel(struct celockstate *cel)
2288 {
2289
2290 MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
2291
2292 if (cel->vlp[0] != NULL)
2293 mtx_unlock(cel->vlp[0]);
2294 if (cel->vlp[1] != NULL)
2295 mtx_unlock(cel->vlp[1]);
2296 if (cel->vlp[2] != NULL)
2297 mtx_unlock(cel->vlp[2]);
2298 }
2299
2300 static bool
2301 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
2302 {
2303 struct mtx *vlp;
2304 bool ret;
2305
2306 cache_assert_vlp_locked(cel->vlp[0]);
2307 cache_assert_vlp_locked(cel->vlp[1]);
2308 MPASS(cel->vlp[2] == NULL);
2309
2310 MPASS(vp != NULL);
2311 vlp = VP2VNODELOCK(vp);
2312
2313 ret = true;
2314 if (vlp >= cel->vlp[1]) {
2315 mtx_lock(vlp);
2316 } else {
2317 if (mtx_trylock(vlp))
2318 goto out;
2319 cache_unlock_vnodes_cel(cel);
2320 atomic_add_long(&cache_lock_vnodes_cel_3_failures, 1);
2321 if (vlp < cel->vlp[0]) {
2322 mtx_lock(vlp);
2323 mtx_lock(cel->vlp[0]);
2324 mtx_lock(cel->vlp[1]);
2325 } else {
2326 if (cel->vlp[0] != NULL)
2327 mtx_lock(cel->vlp[0]);
2328 mtx_lock(vlp);
2329 mtx_lock(cel->vlp[1]);
2330 }
2331 ret = false;
2332 }
2333 out:
2334 cel->vlp[2] = vlp;
2335 return (ret);
2336 }
2337
2338 static void
2339 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1,
2340 struct mtx *blp2)
2341 {
2342
2343 MPASS(cel->blp[0] == NULL);
2344 MPASS(cel->blp[1] == NULL);
2345
2346 cache_sort_vnodes(&blp1, &blp2);
2347
2348 if (blp1 != NULL) {
2349 mtx_lock(blp1);
2350 cel->blp[0] = blp1;
2351 }
2352 mtx_lock(blp2);
2353 cel->blp[1] = blp2;
2354 }
2355
2356 static void
2357 cache_unlock_buckets_cel(struct celockstate *cel)
2358 {
2359
2360 if (cel->blp[0] != NULL)
2361 mtx_unlock(cel->blp[0]);
2362 mtx_unlock(cel->blp[1]);
2363 }
2364
2365 /*
2366 * Lock part of the cache affected by the insertion.
2367 *
2368 * This means vnodelocks for dvp, vp and the relevant bucketlock.
2369 * However, insertion can result in removal of an old entry. In this
2370 * case we have an additional vnode and bucketlock pair to lock.
2371 *
2372 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
2373 * preserving the locking order (smaller address first).
2374 */
2375 static void
2376 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
2377 uint32_t hash)
2378 {
2379 struct namecache *ncp;
2380 struct mtx *blps[2];
2381 u_char nc_flag;
2382
2383 blps[0] = HASH2BUCKETLOCK(hash);
2384 for (;;) {
2385 blps[1] = NULL;
2386 cache_lock_vnodes_cel(cel, dvp, vp);
2387 if (vp == NULL || vp->v_type != VDIR)
2388 break;
2389 ncp = atomic_load_consume_ptr(&vp->v_cache_dd);
2390 if (ncp == NULL)
2391 break;
2392 nc_flag = atomic_load_char(&ncp->nc_flag);
2393 if ((nc_flag & NCF_ISDOTDOT) == 0)
2394 break;
2395 MPASS(ncp->nc_dvp == vp);
2396 blps[1] = NCP2BUCKETLOCK(ncp);
2397 if ((nc_flag & NCF_NEGATIVE) != 0)
2398 break;
2399 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
2400 break;
2401 /*
2402 * All vnodes got re-locked. Re-validate the state and if
2403 * nothing changed we are done. Otherwise restart.
2404 */
2405 if (ncp == vp->v_cache_dd &&
2406 (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
2407 blps[1] == NCP2BUCKETLOCK(ncp) &&
2408 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
2409 break;
2410 cache_unlock_vnodes_cel(cel);
2411 cel->vlp[0] = NULL;
2412 cel->vlp[1] = NULL;
2413 cel->vlp[2] = NULL;
2414 }
2415 cache_lock_buckets_cel(cel, blps[0], blps[1]);
2416 }
2417
2418 static void
2419 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
2420 uint32_t hash)
2421 {
2422 struct namecache *ncp;
2423 struct mtx *blps[2];
2424 u_char nc_flag;
2425
2426 blps[0] = HASH2BUCKETLOCK(hash);
2427 for (;;) {
2428 blps[1] = NULL;
2429 cache_lock_vnodes_cel(cel, dvp, vp);
2430 ncp = atomic_load_consume_ptr(&dvp->v_cache_dd);
2431 if (ncp == NULL)
2432 break;
2433 nc_flag = atomic_load_char(&ncp->nc_flag);
2434 if ((nc_flag & NCF_ISDOTDOT) == 0)
2435 break;
2436 MPASS(ncp->nc_dvp == dvp);
2437 blps[1] = NCP2BUCKETLOCK(ncp);
2438 if ((nc_flag & NCF_NEGATIVE) != 0)
2439 break;
2440 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
2441 break;
2442 if (ncp == dvp->v_cache_dd &&
2443 (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
2444 blps[1] == NCP2BUCKETLOCK(ncp) &&
2445 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
2446 break;
2447 cache_unlock_vnodes_cel(cel);
2448 cel->vlp[0] = NULL;
2449 cel->vlp[1] = NULL;
2450 cel->vlp[2] = NULL;
2451 }
2452 cache_lock_buckets_cel(cel, blps[0], blps[1]);
2453 }
2454
2455 static void
2456 cache_enter_unlock(struct celockstate *cel)
2457 {
2458
2459 cache_unlock_buckets_cel(cel);
2460 cache_unlock_vnodes_cel(cel);
2461 }
2462
2463 static void __noinline
2464 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
2465 struct componentname *cnp)
2466 {
2467 struct celockstate cel;
2468 struct namecache *ncp;
2469 uint32_t hash;
2470 int len;
2471
2472 if (atomic_load_ptr(&dvp->v_cache_dd) == NULL)
2473 return;
2474 len = cnp->cn_namelen;
2475 cache_celockstate_init(&cel);
2476 hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
2477 cache_enter_lock_dd(&cel, dvp, vp, hash);
2478 ncp = dvp->v_cache_dd;
2479 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
2480 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
2481 cache_zap_locked(ncp);
2482 } else {
2483 ncp = NULL;
2484 }
2485 atomic_store_ptr(&dvp->v_cache_dd, NULL);
2486 cache_enter_unlock(&cel);
2487 if (ncp != NULL)
2488 cache_free(ncp);
2489 }
2490
2491 /*
2492 * Add an entry to the cache.
2493 */
2494 void
2495 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
2496 struct timespec *tsp, struct timespec *dtsp)
2497 {
2498 struct celockstate cel;
2499 struct namecache *ncp, *n2, *ndd;
2500 struct namecache_ts *ncp_ts;
2501 uint32_t hash;
2502 int flag;
2503 int len;
2504
2505 KASSERT(cnp->cn_namelen <= NAME_MAX,
2506 ("%s: passed len %ld exceeds NAME_MAX (%d)", __func__, cnp->cn_namelen,
2507 NAME_MAX));
2508 VNPASS(!VN_IS_DOOMED(dvp), dvp);
2509 VNPASS(dvp->v_type != VNON, dvp);
2510 if (vp != NULL) {
2511 VNPASS(!VN_IS_DOOMED(vp), vp);
2512 VNPASS(vp->v_type != VNON, vp);
2513 }
2514 if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') {
2515 KASSERT(dvp == vp,
2516 ("%s: different vnodes for dot entry (%p; %p)\n", __func__,
2517 dvp, vp));
2518 } else {
2519 KASSERT(dvp != vp,
2520 ("%s: same vnode for non-dot entry [%s] (%p)\n", __func__,
2521 cnp->cn_nameptr, dvp));
2522 }
2523
2524 #ifdef DEBUG_CACHE
2525 if (__predict_false(!doingcache))
2526 return;
2527 #endif
2528
2529 flag = 0;
2530 if (__predict_false(cnp->cn_nameptr[0] == '.')) {
2531 if (cnp->cn_namelen == 1)
2532 return;
2533 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
2534 cache_enter_dotdot_prep(dvp, vp, cnp);
2535 flag = NCF_ISDOTDOT;
2536 }
2537 }
2538
2539 ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
2540 if (ncp == NULL)
2541 return;
2542
2543 cache_celockstate_init(&cel);
2544 ndd = NULL;
2545 ncp_ts = NULL;
2546
2547 /*
2548 * Calculate the hash key and setup as much of the new
2549 * namecache entry as possible before acquiring the lock.
2550 */
2551 ncp->nc_flag = flag | NCF_WIP;
2552 ncp->nc_vp = vp;
2553 if (vp == NULL)
2554 cache_neg_init(ncp);
2555 ncp->nc_dvp = dvp;
2556 if (tsp != NULL) {
2557 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
2558 ncp_ts->nc_time = *tsp;
2559 ncp_ts->nc_ticks = ticks;
2560 ncp_ts->nc_nc.nc_flag |= NCF_TS;
2561 if (dtsp != NULL) {
2562 ncp_ts->nc_dotdottime = *dtsp;
2563 ncp_ts->nc_nc.nc_flag |= NCF_DTS;
2564 }
2565 }
2566 len = ncp->nc_nlen = cnp->cn_namelen;
2567 hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
2568 memcpy(ncp->nc_name, cnp->cn_nameptr, len);
2569 ncp->nc_name[len] = '\0';
2570 cache_enter_lock(&cel, dvp, vp, hash);
2571
2572 /*
2573 * See if this vnode or negative entry is already in the cache
2574 * with this name. This can happen with concurrent lookups of
2575 * the same path name.
2576 */
2577 n2 = cache_ncp_find(dvp, cnp, hash);
2578 if (n2 != NULL) {
2579 MPASS(cache_ncp_canuse(n2));
2580 if ((n2->nc_flag & NCF_NEGATIVE) != 0)
2581 KASSERT(vp == NULL,
2582 ("%s: found entry pointing to a different vnode "
2583 "(%p != %p); name [%s]",
2584 __func__, NULL, vp, cnp->cn_nameptr));
2585 else
2586 KASSERT(n2->nc_vp == vp,
2587 ("%s: found entry pointing to a different vnode "
2588 "(%p != %p); name [%s]",
2589 __func__, n2->nc_vp, vp, cnp->cn_nameptr));
2590 /*
2591 * Entries are supposed to be immutable unless in the
2592 * process of getting destroyed. Accommodating
2593 * changing timestamps is possible but not worth it.
2594 * This should be harmless in terms of correctness, in
2595 * the worst case resulting in an earlier expiration.
2596 * Alternatively, the found entry can be replaced
2597 * altogether.
2598 */
2599 MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) ==
2600 (ncp->nc_flag & (NCF_TS | NCF_DTS)));
2601 #if 0
2602 if (tsp != NULL) {
2603 KASSERT((n2->nc_flag & NCF_TS) != 0,
2604 ("no NCF_TS"));
2605 n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
2606 n2_ts->nc_time = ncp_ts->nc_time;
2607 n2_ts->nc_ticks = ncp_ts->nc_ticks;
2608 if (dtsp != NULL) {
2609 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
2610 n2_ts->nc_nc.nc_flag |= NCF_DTS;
2611 }
2612 }
2613 #endif
2614 SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name,
2615 vp);
2616 goto out_unlock_free;
2617 }
2618
2619 if (flag == NCF_ISDOTDOT) {
2620 /*
2621 * See if we are trying to add a .. entry, but some other lookup
2622 * has populated the v_cache_dd pointer already.
2623 */
2624 if (dvp->v_cache_dd != NULL)
2625 goto out_unlock_free;
2626 KASSERT(vp == NULL || vp->v_type == VDIR,
2627 ("wrong vnode type %p", vp));
2628 atomic_thread_fence_rel();
2629 atomic_store_ptr(&dvp->v_cache_dd, ncp);
2630 } else if (vp != NULL) {
2631 /*
2632 * For this case, the cache entry maps both the
2633 * directory name in it and the name ".." for the
2634 * directory's parent.
2635 */
2636 if ((ndd = vp->v_cache_dd) != NULL) {
2637 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
2638 cache_zap_locked(ndd);
2639 else
2640 ndd = NULL;
2641 }
2642 atomic_thread_fence_rel();
2643 atomic_store_ptr(&vp->v_cache_dd, ncp);
2644 }
2645
2646 if (flag != NCF_ISDOTDOT) {
2647 if (LIST_EMPTY(&dvp->v_cache_src)) {
2648 cache_hold_vnode(dvp);
2649 }
2650 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
2651 }
2652
2653 /*
2654 * If the entry is "negative", we place it into the
2655 * "negative" cache queue, otherwise, we place it into the
2656 * destination vnode's cache entries queue.
2657 */
2658 if (vp != NULL) {
2659 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
2660 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
2661 vp);
2662 } else {
2663 if (cnp->cn_flags & ISWHITEOUT)
2664 atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_WHITE);
2665 cache_neg_insert(ncp);
2666 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
2667 ncp->nc_name);
2668 }
2669
2670 /*
2671 * Insert the new namecache entry into the appropriate chain
2672 * within the cache entries table.
2673 */
2674 CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
2675
2676 atomic_thread_fence_rel();
2677 /*
2678 * Mark the entry as fully constructed.
2679 * It is immutable past this point until its removal.
2680 */
2681 atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP);
2682
2683 cache_enter_unlock(&cel);
2684 if (ndd != NULL)
2685 cache_free(ndd);
2686 return;
2687 out_unlock_free:
2688 cache_enter_unlock(&cel);
2689 cache_free(ncp);
2690 return;
2691 }
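
/*
 * Illustrative usage sketch, not part of the original source: filesystems
 * normally create entries from their lookup routines once the target vnode
 * is known, typically via the cache_enter() wrapper which passes NULL
 * timestamps.
 */
#if 0
	if (cnp->cn_flags & MAKEENTRY)
		cache_enter(dvp, *vpp, cnp);
#endif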
2692
2693 /*
2694 * A variant of the above accepting flags.
2695 *
2696 * - VFS_CACHE_DROPOLD -- if a conflicting entry is found, drop it.
2697 *
2698 * TODO: this routine is a hack. It blindly removes the old entry, even if it
2699 * happens to match, and it does so in an inefficient manner. It was added
2700 * to accommodate NFS which runs into a case where the target for a given name
2701 * may change from under it. Note this does nothing to solve the following
2702 * race: 2 callers of cache_enter_time_flags pass a different target vnode for
2703 * the same [dvp, cnp]. It may be argued that code doing this is broken.
2704 */
2705 void
2706 cache_enter_time_flags(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
2707 struct timespec *tsp, struct timespec *dtsp, int flags)
2708 {
2709
2710 MPASS((flags & ~(VFS_CACHE_DROPOLD)) == 0);
2711
2712 if (flags & VFS_CACHE_DROPOLD)
2713 cache_remove_cnp(dvp, cnp);
2714 cache_enter_time(dvp, vp, cnp, tsp, dtsp);
2715 }
2716
2717 static u_long
2718 cache_roundup_2(u_long val)
2719 {
2720 u_long res;
2721
2722 for (res = 1; res <= val; res <<= 1)
2723 continue;
2724
2725 return (res);
2726 }
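
/*
 * For illustration (sketch, not part of the original source):
 * cache_roundup_2() returns the smallest power of two strictly greater than
 * its argument, e.g. 1000 -> 1024 and 1024 -> 2048, so callers end up with
 * power-of-two sizes suitable for mask-based hashing.
 */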
2727
2728 static struct nchashhead *
2729 nchinittbl(u_long elements, u_long *hashmask)
2730 {
2731 struct nchashhead *hashtbl;
2732 u_long hashsize, i;
2733
2734 hashsize = cache_roundup_2(elements) / 2;
2735
2736 hashtbl = malloc(hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK);
2737 for (i = 0; i < hashsize; i++)
2738 CK_SLIST_INIT(&hashtbl[i]);
2739 *hashmask = hashsize - 1;
2740 return (hashtbl);
2741 }
2742
2743 static void
2744 ncfreetbl(struct nchashhead *hashtbl)
2745 {
2746
2747 free(hashtbl, M_VFSCACHE);
2748 }
2749
2750 /*
2751 * Name cache initialization, from vfs_init() when we are booting
2752 */
2753 static void
2754 nchinit(void *dummy __unused)
2755 {
2756 u_int i;
2757
2758 cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE,
2759 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2760 cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE,
2761 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2762 cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE,
2763 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2764 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE,
2765 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2766
2767 VFS_SMR_ZONE_SET(cache_zone_small);
2768 VFS_SMR_ZONE_SET(cache_zone_small_ts);
2769 VFS_SMR_ZONE_SET(cache_zone_large);
2770 VFS_SMR_ZONE_SET(cache_zone_large_ts);
2771
2772 ncsize = desiredvnodes * ncsizefactor;
2773 cache_recalc_neg_min();
2774 nchashtbl = nchinittbl(ncsize, &nchash);
2775 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
2776 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
2777 ncbuckethash = 7;
2778 if (ncbuckethash > nchash)
2779 ncbuckethash = nchash;
2780 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
2781 M_WAITOK | M_ZERO);
2782 for (i = 0; i < numbucketlocks; i++)
2783 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE);
2784 ncvnodehash = ncbuckethash;
2785 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
2786 M_WAITOK | M_ZERO);
2787 for (i = 0; i < numvnodelocks; i++)
2788 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
2789
2790 for (i = 0; i < numneglists; i++) {
2791 mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF);
2792 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
2793 TAILQ_INIT(&neglists[i].nl_list);
2794 TAILQ_INIT(&neglists[i].nl_hotlist);
2795 }
2796 }
2797 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
2798
2799 void
2800 cache_vnode_init(struct vnode *vp)
2801 {
2802
2803 LIST_INIT(&vp->v_cache_src);
2804 TAILQ_INIT(&vp->v_cache_dst);
2805 vp->v_cache_dd = NULL;
2806 cache_prehash(vp);
2807 }
2808
2809 /*
2810 * Induce transient cache misses for lockless operation in cache_lookup() by
2811 * using a temporary hash table.
2812 *
2813 * This will force a fs lookup.
2814 *
2815 * Synchronisation is done in 2 steps, calling vfs_smr_synchronize each time
2816 * to observe all CPUs not performing the lookup.
2817 */
2818 static void
2819 cache_changesize_set_temp(struct nchashhead *temptbl, u_long temphash)
2820 {
2821
2822 MPASS(temphash < nchash);
2823 /*
2824 * Change the size. The new size is smaller and can safely be used
2825 * against the existing table. All lookups which now hash wrong will
2826 * result in a cache miss, which all callers are supposed to know how
2827 * to handle.
2828 */
2829 atomic_store_long(&nchash, temphash);
2830 atomic_thread_fence_rel();
2831 vfs_smr_synchronize();
2832 /*
2833 * At this point everyone sees the updated hash value, but they still
2834 * see the old table.
2835 */
2836 atomic_store_ptr(&nchashtbl, temptbl);
2837 atomic_thread_fence_rel();
2838 vfs_smr_synchronize();
2839 /*
2840 * At this point everyone sees the updated table pointer and size pair.
2841 */
2842 }
2843
2844 /*
2845 * Set the new hash table.
2846 *
2847 * Similarly to cache_changesize_set_temp(), this has to synchronize against
2848 * lockless operation in cache_lookup().
2849 */
2850 static void
2851 cache_changesize_set_new(struct nchashhead *new_tbl, u_long new_hash)
2852 {
2853
2854 MPASS(nchash < new_hash);
2855 /*
2856 * Change the pointer first. This won't result in out-of-bounds access
2857 * since the temporary table is guaranteed to be smaller.
2858 */
2859 atomic_store_ptr(&nchashtbl, new_tbl);
2860 atomic_thread_fence_rel();
2861 vfs_smr_synchronize();
2862 /*
2863 * At this point everyone sees the updated pointer value, but they
2864 * still see the old size.
2865 */
2866 atomic_store_long(&nchash, new_hash);
2867 atomic_thread_fence_rel();
2868 vfs_smr_synchronize();
2869 /*
2870 * At this point everyone sees the updated table pointer and size pair.
2871 */
2872 }
2873
2874 void
2875 cache_changesize(u_long newmaxvnodes)
2876 {
2877 struct nchashhead *new_nchashtbl, *old_nchashtbl, *temptbl;
2878 u_long new_nchash, old_nchash, temphash;
2879 struct namecache *ncp;
2880 uint32_t hash;
2881 u_long newncsize;
2882 u_long i;
2883
2884 newncsize = newmaxvnodes * ncsizefactor;
2885 newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
2886 if (newmaxvnodes < numbucketlocks)
2887 newmaxvnodes = numbucketlocks;
2888
2889 new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash);
2890 /* If same hash table size, nothing to do */
2891 if (nchash == new_nchash) {
2892 ncfreetbl(new_nchashtbl);
2893 return;
2894 }
2895
2896 temptbl = nchinittbl(1, &temphash);
2897
2898 /*
2899 * Move everything from the old hash table to the new table.
2900 * None of the namecache entries in the table can be removed
2901 * because to do so, they have to be removed from the hash table.
2902 */
2903 cache_lock_all_vnodes();
2904 cache_lock_all_buckets();
2905 old_nchashtbl = nchashtbl;
2906 old_nchash = nchash;
2907 cache_changesize_set_temp(temptbl, temphash);
2908 for (i = 0; i <= old_nchash; i++) {
2909 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) {
2910 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
2911 ncp->nc_dvp);
2912 CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash);
2913 CK_SLIST_INSERT_HEAD(&new_nchashtbl[hash & new_nchash], ncp, nc_hash);
2914 }
2915 }
2916 ncsize = newncsize;
2917 cache_recalc_neg_min();
2918 cache_changesize_set_new(new_nchashtbl, new_nchash);
2919 cache_unlock_all_buckets();
2920 cache_unlock_all_vnodes();
2921 ncfreetbl(old_nchashtbl);
2922 ncfreetbl(temptbl);
2923 }
2924
2925 /*
2926 * Remove all entries from and to a particular vnode.
2927 */
2928 static void
2929 cache_purge_impl(struct vnode *vp)
2930 {
2931 struct cache_freebatch batch;
2932 struct namecache *ncp;
2933 struct mtx *vlp, *vlp2;
2934
2935 TAILQ_INIT(&batch);
2936 vlp = VP2VNODELOCK(vp);
2937 vlp2 = NULL;
2938 mtx_lock(vlp);
2939 retry:
2940 while (!LIST_EMPTY(&vp->v_cache_src)) {
2941 ncp = LIST_FIRST(&vp->v_cache_src);
2942 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2943 goto retry;
2944 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2945 }
2946 while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
2947 ncp = TAILQ_FIRST(&vp->v_cache_dst);
2948 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2949 goto retry;
2950 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2951 }
2952 ncp = vp->v_cache_dd;
2953 if (ncp != NULL) {
2954 KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
2955 ("lost dotdot link"));
2956 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2957 goto retry;
2958 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2959 }
2960 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
2961 mtx_unlock(vlp);
2962 if (vlp2 != NULL)
2963 mtx_unlock(vlp2);
2964 cache_free_batch(&batch);
2965 }
2966
2967 /*
2968 * Opportunistic check to see if there is anything to do.
2969 */
2970 static bool
2971 cache_has_entries(struct vnode *vp)
2972 {
2973
2974 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2975 atomic_load_ptr(&vp->v_cache_dd) == NULL)
2976 return (false);
2977 return (true);
2978 }
2979
2980 void
2981 cache_purge(struct vnode *vp)
2982 {
2983
2984 SDT_PROBE1(vfs, namecache, purge, done, vp);
2985 if (!cache_has_entries(vp))
2986 return;
2987 cache_purge_impl(vp);
2988 }
2989
2990 /*
2991 * Only to be used by vgone.
2992 */
2993 void
2994 cache_purge_vgone(struct vnode *vp)
2995 {
2996 struct mtx *vlp;
2997
2998 VNPASS(VN_IS_DOOMED(vp), vp);
2999 if (cache_has_entries(vp)) {
3000 cache_purge_impl(vp);
3001 return;
3002 }
3003
3004 /*
3005 * Serialize against a potential thread doing cache_purge.
3006 */
3007 vlp = VP2VNODELOCK(vp);
3008 mtx_wait_unlocked(vlp);
3009 if (cache_has_entries(vp)) {
3010 cache_purge_impl(vp);
3011 return;
3012 }
3013 return;
3014 }
3015
3016 /*
3017 * Remove all negative entries for a particular directory vnode.
3018 */
3019 void
3020 cache_purge_negative(struct vnode *vp)
3021 {
3022 struct cache_freebatch batch;
3023 struct namecache *ncp, *nnp;
3024 struct mtx *vlp;
3025
3026 SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
3027 if (LIST_EMPTY(&vp->v_cache_src))
3028 return;
3029 TAILQ_INIT(&batch);
3030 vlp = VP2VNODELOCK(vp);
3031 mtx_lock(vlp);
3032 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
3033 if (!(ncp->nc_flag & NCF_NEGATIVE))
3034 continue;
3035 cache_zap_negative_locked_vnode_kl(ncp, vp);
3036 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
3037 }
3038 mtx_unlock(vlp);
3039 cache_free_batch(&batch);
3040 }
3041
3042 /*
3043 * Entry points for modifying VOP operations.
3044 */
3045 void
3046 cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
3047 struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp)
3048 {
3049
3050 ASSERT_VOP_IN_SEQC(fdvp);
3051 ASSERT_VOP_IN_SEQC(fvp);
3052 ASSERT_VOP_IN_SEQC(tdvp);
3053 if (tvp != NULL)
3054 ASSERT_VOP_IN_SEQC(tvp);
3055
3056 cache_purge(fvp);
3057 if (tvp != NULL) {
3058 cache_purge(tvp);
3059 KASSERT(!cache_remove_cnp(tdvp, tcnp),
3060 ("%s: lingering negative entry", __func__));
3061 } else {
3062 cache_remove_cnp(tdvp, tcnp);
3063 }
3064
3065 /*
3066 * TODO
3067 *
3068 * Historically renaming always purged all relevant entries,
3069 * but that's quite wasteful. In particular, it turns out that in many cases
3070 * the target file is immediately accessed after rename, inducing a cache
3071 * miss.
3072 *
3073 * Recode this to reduce relocking and reuse the existing entry (if any)
3074 * instead of just removing it above and allocating a new one here.
3075 */
3076 cache_enter(tdvp, fvp, tcnp);
3077 }
3078
3079 void
3080 cache_vop_rmdir(struct vnode *dvp, struct vnode *vp)
3081 {
3082
3083 ASSERT_VOP_IN_SEQC(dvp);
3084 ASSERT_VOP_IN_SEQC(vp);
3085 cache_purge(vp);
3086 }
3087
3088 #ifdef INVARIANTS
3089 /*
3090 * Validate that if an entry exists it matches.
3091 */
3092 void
3093 cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
3094 {
3095 struct namecache *ncp;
3096 struct mtx *blp;
3097 uint32_t hash;
3098
3099 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
3100 if (CK_SLIST_EMPTY(NCHHASH(hash)))
3101 return;
3102 blp = HASH2BUCKETLOCK(hash);
3103 mtx_lock(blp);
3104 ncp = cache_ncp_find(dvp, cnp, hash);
3105 if (ncp != NULL && ncp->nc_vp != vp) {
3106 panic("%s: mismatch (%p != %p); ncp %p [%s] dvp %p\n",
3107 __func__, vp, ncp->nc_vp, ncp, ncp->nc_name, ncp->nc_dvp);
3108 }
3109 mtx_unlock(blp);
3110 }
3111
3112 void
3113 cache_assert_no_entries(struct vnode *vp)
3114 {
3115
3116 VNPASS(TAILQ_EMPTY(&vp->v_cache_dst), vp);
3117 VNPASS(LIST_EMPTY(&vp->v_cache_src), vp);
3118 VNPASS(vp->v_cache_dd == NULL, vp);
3119 }
3120 #endif
3121
3122 /*
3123 * Flush all entries referencing a particular filesystem.
3124 */
3125 void
3126 cache_purgevfs(struct mount *mp)
3127 {
3128 struct vnode *vp, *mvp;
3129 size_t visited __sdt_used, purged __sdt_used;
3130
3131 visited = purged = 0;
3132 /*
3133 * Somewhat wasteful iteration over all vnodes. Would be better to
3134 * support filtering and avoid the interlock to begin with.
3135 */
3136 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
3137 visited++;
3138 if (!cache_has_entries(vp)) {
3139 VI_UNLOCK(vp);
3140 continue;
3141 }
3142 vholdl(vp);
3143 VI_UNLOCK(vp);
3144 cache_purge(vp);
3145 purged++;
3146 vdrop(vp);
3147 }
3148
3149 SDT_PROBE3(vfs, namecache, purgevfs, done, mp, visited, purged);
3150 }
3151
3152 /*
3153 * Perform canonical checks and cache lookup and pass on to filesystem
3154 * through the vop_cachedlookup only if needed.
3155 */
3156
3157 int
3158 vfs_cache_lookup(struct vop_lookup_args *ap)
3159 {
3160 struct vnode *dvp;
3161 int error;
3162 struct vnode **vpp = ap->a_vpp;
3163 struct componentname *cnp = ap->a_cnp;
3164 int flags = cnp->cn_flags;
3165
3166 *vpp = NULL;
3167 dvp = ap->a_dvp;
3168
3169 if (dvp->v_type != VDIR)
3170 return (ENOTDIR);
3171
3172 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
3173 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
3174 return (EROFS);
3175
3176 error = vn_dir_check_exec(dvp, cnp);
3177 if (error != 0)
3178 return (error);
3179
3180 error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
3181 if (error == 0)
3182 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
3183 if (error == -1)
3184 return (0);
3185 return (error);
3186 }
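
/*
 * Illustrative wiring sketch, not part of the original source: a filesystem
 * opts into this front-end by pointing vop_lookup at vfs_cache_lookup() and
 * supplying its real lookup routine as vop_cachedlookup, roughly as UFS
 * does. The example_* names are hypothetical.
 */
#if 0
struct vop_vector example_vnodeops = {
	.vop_default =		&default_vnodeops,
	.vop_lookup =		vfs_cache_lookup,
	.vop_cachedlookup =	example_lookup,	/* hypothetical fs lookup */
	/* ... */
};
#endif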
3187
3188 /* Implementation of the getcwd syscall. */
3189 int
3190 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
3191 {
3192 char *buf, *retbuf;
3193 size_t buflen;
3194 int error;
3195
3196 buflen = uap->buflen;
3197 if (__predict_false(buflen < 2))
3198 return (EINVAL);
3199 if (buflen > MAXPATHLEN)
3200 buflen = MAXPATHLEN;
3201
3202 buf = uma_zalloc(namei_zone, M_WAITOK);
3203 error = vn_getcwd(buf, &retbuf, &buflen);
3204 if (error == 0)
3205 error = copyout(retbuf, uap->buf, buflen);
3206 uma_zfree(namei_zone, buf);
3207 return (error);
3208 }
3209
3210 int
3211 vn_getcwd(char *buf, char **retbuf, size_t *buflen)
3212 {
3213 struct pwd *pwd;
3214 int error;
3215
3216 vfs_smr_enter();
3217 pwd = pwd_get_smr();
3218 error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf,
3219 buflen, 0);
3220 VFS_SMR_ASSERT_NOT_ENTERED();
3221 if (error < 0) {
3222 pwd = pwd_hold(curthread);
3223 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf,
3224 retbuf, buflen);
3225 pwd_drop(pwd);
3226 }
3227
3228 #ifdef KTRACE
3229 if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
3230 ktrnamei(*retbuf);
3231 #endif
3232 return (error);
3233 }
3234
3235 /*
3236 * Canonicalize a path by walking it forward and back.
3237 *
3238 * BUGS:
3239 * - Nothing guarantees the integrity of the entire chain. Consider the case
3240 * where the path "foo/bar/baz/qux" is passed, but "bar" is moved out of
3241 * "foo" into "quux" during the backwards walk. The result will be
3242 * "quux/bar/baz/qux", which could not have been obtained by an incremental
3243 * walk in userspace. Moreover, the path we return is inaccessible if the
3244 * calling thread lacks permission to traverse "quux".
3245 */
3246 static int
3247 kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
3248 size_t size, int flags, enum uio_seg pathseg)
3249 {
3250 struct nameidata nd;
3251 char *retbuf, *freebuf;
3252 int error;
3253
3254 if (flags != 0)
3255 return (EINVAL);
3256 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | WANTPARENT | AUDITVNODE1,
3257 pathseg, path, fd, &cap_fstat_rights);
3258 if ((error = namei(&nd)) != 0)
3259 return (error);
3260
3261 if (nd.ni_vp->v_type == VREG && nd.ni_dvp->v_type != VDIR &&
3262 (nd.ni_vp->v_vflag & VV_ROOT) != 0) {
3263 struct vnode *covered_vp;
3264
3265 /*
3266 * This happens if vp is a file mount. The call to
3267 * vn_fullpath_hardlink can panic if path resolution can't be
3268 * handled without the directory.
3269 *
3270 * To resolve this, we find the vnode which was mounted on -
3271 * this should have a unique global path since we disallow
3272 * mounting on linked files.
3273 */
3274 error = vn_lock(nd.ni_vp, LK_SHARED);
3275 if (error != 0)
3276 goto out;
3277 covered_vp = nd.ni_vp->v_mount->mnt_vnodecovered;
3278 vref(covered_vp);
3279 VOP_UNLOCK(nd.ni_vp);
3280 error = vn_fullpath(covered_vp, &retbuf, &freebuf);
3281 vrele(covered_vp);
3282 } else {
3283 error = vn_fullpath_hardlink(nd.ni_vp, nd.ni_dvp,
3284 nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, &retbuf,
3285 &freebuf, &size);
3286 }
3287 if (error == 0) {
3288 size_t len;
3289
3290 len = strlen(retbuf) + 1;
3291 if (size < len)
3292 error = ENAMETOOLONG;
3293 else if (pathseg == UIO_USERSPACE)
3294 error = copyout(retbuf, buf, len);
3295 else
3296 memcpy(buf, retbuf, len);
3297 free(freebuf, M_TEMP);
3298 }
3299 out:
3300 vrele(nd.ni_vp);
3301 vrele(nd.ni_dvp);
3302 NDFREE_PNBUF(&nd);
3303 return (error);
3304 }
3305
3306 int
3307 sys___realpathat(struct thread *td, struct __realpathat_args *uap)
3308 {
3309
3310 return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
3311 uap->flags, UIO_USERSPACE));
3312 }
3313
3314 /*
3315 * Retrieve the full filesystem path that corresponds to a vnode from the name
3316 * cache (if available)
3317 */
3318 int
3319 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf)
3320 {
3321 struct pwd *pwd;
3322 char *buf;
3323 size_t buflen;
3324 int error;
3325
3326 if (__predict_false(vp == NULL))
3327 return (EINVAL);
3328
3329 buflen = MAXPATHLEN;
3330 buf = malloc(buflen, M_TEMP, M_WAITOK);
3331 vfs_smr_enter();
3332 pwd = pwd_get_smr();
3333 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, 0);
3334 VFS_SMR_ASSERT_NOT_ENTERED();
3335 if (error < 0) {
3336 pwd = pwd_hold(curthread);
3337 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen);
3338 pwd_drop(pwd);
3339 }
3340 if (error == 0)
3341 *freebuf = buf;
3342 else
3343 free(buf, M_TEMP);
3344 return (error);
3345 }
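
/*
 * Illustrative usage sketch, not part of the original source: on success the
 * caller receives a pointer into a buffer which it must release through the
 * freebuf argument.
 */
#if 0
	char *fullpath, *freepath;

	if (vn_fullpath(vp, &fullpath, &freepath) == 0) {
		printf("path: %s\n", fullpath);
		free(freepath, M_TEMP);
	}
#endif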
3346
3347 /*
3348 * This function is similar to vn_fullpath, but it attempts to lookup the
3349 * pathname relative to the global root mount point. This is required for the
3350 * auditing sub-system, as audited pathnames must be absolute, relative to the
3351 * global root mount point.
3352 */
3353 int
3354 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf)
3355 {
3356 char *buf;
3357 size_t buflen;
3358 int error;
3359
3360 if (__predict_false(vp == NULL))
3361 return (EINVAL);
3362 buflen = MAXPATHLEN;
3363 buf = malloc(buflen, M_TEMP, M_WAITOK);
3364 vfs_smr_enter();
3365 error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, 0);
3366 VFS_SMR_ASSERT_NOT_ENTERED();
3367 if (error < 0) {
3368 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen);
3369 }
3370 if (error == 0)
3371 *freebuf = buf;
3372 else
3373 free(buf, M_TEMP);
3374 return (error);
3375 }
3376
3377 static struct namecache *
3378 vn_dd_from_dst(struct vnode *vp)
3379 {
3380 struct namecache *ncp;
3381
3382 cache_assert_vnode_locked(vp);
3383 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) {
3384 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
3385 return (ncp);
3386 }
3387 return (NULL);
3388 }
3389
3390 int
3391 vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen)
3392 {
3393 struct vnode *dvp;
3394 struct namecache *ncp;
3395 struct mtx *vlp;
3396 int error;
3397
3398 vlp = VP2VNODELOCK(*vp);
3399 mtx_lock(vlp);
3400 ncp = (*vp)->v_cache_dd;
3401 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) {
3402 KASSERT(ncp == vn_dd_from_dst(*vp),
3403 ("%s: mismatch for dd entry (%p != %p)", __func__,
3404 ncp, vn_dd_from_dst(*vp)));
3405 } else {
3406 ncp = vn_dd_from_dst(*vp);
3407 }
3408 if (ncp != NULL) {
3409 if (*buflen < ncp->nc_nlen) {
3410 mtx_unlock(vlp);
3411 vrele(*vp);
3412 counter_u64_add(numfullpathfail4, 1);
3413 error = ENOMEM;
3414 SDT_PROBE3(vfs, namecache, fullpath, return, error,
3415 vp, NULL);
3416 return (error);
3417 }
3418 *buflen -= ncp->nc_nlen;
3419 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
3420 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
3421 ncp->nc_name, vp);
3422 dvp = *vp;
3423 *vp = ncp->nc_dvp;
3424 vref(*vp);
3425 mtx_unlock(vlp);
3426 vrele(dvp);
3427 return (0);
3428 }
3429 SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
3430
3431 mtx_unlock(vlp);
3432 vn_lock(*vp, LK_SHARED | LK_RETRY);
3433 error = VOP_VPTOCNP(*vp, &dvp, buf, buflen);
3434 vput(*vp);
3435 if (error) {
3436 counter_u64_add(numfullpathfail2, 1);
3437 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
3438 return (error);
3439 }
3440
3441 *vp = dvp;
3442 if (VN_IS_DOOMED(dvp)) {
3443 /* forced unmount */
3444 vrele(dvp);
3445 error = ENOENT;
3446 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
3447 return (error);
3448 }
3449 /*
3450 * *vp has its use count incremented still.
3451 */
3452
3453 return (0);
3454 }
3455
3456 /*
3457 * Resolve a directory to a pathname.
3458 *
3459 * The name of the directory can always be found in the namecache or fetched
3460 * from the filesystem. There is also guaranteed to be only one parent, meaning
3461 * we can just follow vnodes up until we find the root.
3462 *
3463 * The vnode must be referenced.
3464 */
3465 static int
3466 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
3467 size_t *len, size_t addend)
3468 {
3469 #ifdef KDTRACE_HOOKS
3470 struct vnode *startvp = vp;
3471 #endif
3472 struct vnode *vp1;
3473 size_t buflen;
3474 int error;
3475 bool slash_prefixed;
3476
3477 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
3478 VNPASS(vp->v_usecount > 0, vp);
3479
3480 buflen = *len;
3481
3482 slash_prefixed = true;
3483 if (addend == 0) {
3484 MPASS(*len >= 2);
3485 buflen--;
3486 buf[buflen] = '\0';
3487 slash_prefixed = false;
3488 }
3489
3490 error = 0;
3491
3492 SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
3493 counter_u64_add(numfullpathcalls, 1);
3494 while (vp != rdir && vp != rootvnode) {
3495 /*
3496 * The vp vnode must be already fully constructed,
3497 * since it is either found in namecache or obtained
3498 * from VOP_VPTOCNP(). We may test for VV_ROOT safely
3499 * without obtaining the vnode lock.
3500 */
3501 if ((vp->v_vflag & VV_ROOT) != 0) {
3502 vn_lock(vp, LK_RETRY | LK_SHARED);
3503
3504 /*
3505 * With the vnode locked, check for races with
3506 * unmount, forced or not. Note that we
3507 * already verified that vp is not equal to
3508 * the root vnode, which means that
3509 * mnt_vnodecovered can be NULL only for the
3510 * case of unmount.
3511 */
3512 if (VN_IS_DOOMED(vp) ||
3513 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
3514 vp1->v_mountedhere != vp->v_mount) {
3515 vput(vp);
3516 error = ENOENT;
3517 SDT_PROBE3(vfs, namecache, fullpath, return,
3518 error, vp, NULL);
3519 break;
3520 }
3521
3522 vref(vp1);
3523 vput(vp);
3524 vp = vp1;
3525 continue;
3526 }
3527 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
3528 error = vn_vptocnp(&vp, buf, &buflen);
3529 if (error)
3530 break;
3531 if (buflen == 0) {
3532 vrele(vp);
3533 error = ENOMEM;
3534 SDT_PROBE3(vfs, namecache, fullpath, return, error,
3535 startvp, NULL);
3536 break;
3537 }
3538 buf[--buflen] = '/';
3539 slash_prefixed = true;
3540 }
3541 if (error)
3542 return (error);
3543 if (!slash_prefixed) {
3544 if (buflen == 0) {
3545 vrele(vp);
3546 counter_u64_add(numfullpathfail4, 1);
3547 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
3548 startvp, NULL);
3549 return (ENOMEM);
3550 }
3551 buf[--buflen] = '/';
3552 }
3553 counter_u64_add(numfullpathfound, 1);
3554 vrele(vp);
3555
3556 *retbuf = buf + buflen;
3557 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
3558 *len -= buflen;
3559 *len += addend;
3560 return (0);
3561 }
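
/*
 * Illustrative note (editorial, not part of the original source): the path is
 * assembled backwards, starting at the end of the buffer.  For a directory
 * vnode whose path happens to be /usr/local/lib the buffer evolves roughly as:
 *
 *	[..............\0]	terminator placed first (addend == 0 case)
 *	[...........lib\0]	name of the starting vnode
 *	[........../lib\0]	separator prepended
 *	[.....local/lib\0]	parent's name, and so on up to the root
 *	[/usr/local/lib\0]	*retbuf ends up pointing at the leading '/'
 *
 * The concrete path and buffer size are made up for illustration only.
 */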
3562
3563 /*
3564 * Resolve an arbitrary vnode to a pathname.
3565 *
3566 * Note 2 caveats:
3567 * - hardlinks are not tracked, thus if the vnode is not a directory this can
3568 * resolve to a different path than the one used to find it
3569 * - namecache is not mandatory, meaning names are not guaranteed to be added
3570 * (in which case resolving fails)
3571 */
3572 static void __inline
3573 cache_rev_failed_impl(int *reason, int line)
3574 {
3575
3576 *reason = line;
3577 }
3578 #define cache_rev_failed(var) cache_rev_failed_impl((var), __LINE__)
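
/*
 * Editorial note: the line number recorded by cache_rev_failed() is exported
 * as the third argument of the vfs:namecache:fullpath_smr:miss SDT probe (see
 * the SDT_PROBE4 in vn_fullpath_any_smr() below), so the failure site can be
 * inspected from userspace, e.g. with a DTrace one-liner along these lines
 * (illustrative only):
 *
 *	dtrace -n 'vfs:namecache:fullpath_smr:miss { @fail[arg2] = count(); }'
 */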
3579
3580 static int
3581 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
3582 char **retbuf, size_t *buflen, size_t addend)
3583 {
3584 #ifdef KDTRACE_HOOKS
3585 struct vnode *startvp = vp;
3586 #endif
3587 struct vnode *tvp;
3588 struct mount *mp;
3589 struct namecache *ncp;
3590 size_t orig_buflen;
3591 int reason;
3592 int error;
3593 #ifdef KDTRACE_HOOKS
3594 int i;
3595 #endif
3596 seqc_t vp_seqc, tvp_seqc;
3597 u_char nc_flag;
3598
3599 VFS_SMR_ASSERT_ENTERED();
3600
3601 if (!atomic_load_char(&cache_fast_lookup_enabled)) {
3602 vfs_smr_exit();
3603 return (-1);
3604 }
3605
3606 orig_buflen = *buflen;
3607
3608 if (addend == 0) {
3609 MPASS(*buflen >= 2);
3610 *buflen -= 1;
3611 buf[*buflen] = '\0';
3612 }
3613
3614 if (vp == rdir || vp == rootvnode) {
3615 if (addend == 0) {
3616 *buflen -= 1;
3617 buf[*buflen] = '/';
3618 }
3619 goto out_ok;
3620 }
3621
3622 #ifdef KDTRACE_HOOKS
3623 i = 0;
3624 #endif
3625 error = -1;
3626 ncp = NULL; /* for sdt probe down below */
3627 vp_seqc = vn_seqc_read_any(vp);
3628 if (seqc_in_modify(vp_seqc)) {
3629 cache_rev_failed(&reason);
3630 goto out_abort;
3631 }
3632
3633 for (;;) {
3634 #ifdef KDTRACE_HOOKS
3635 i++;
3636 #endif
3637 if ((vp->v_vflag & VV_ROOT) != 0) {
3638 mp = atomic_load_ptr(&vp->v_mount);
3639 if (mp == NULL) {
3640 cache_rev_failed(&reason);
3641 goto out_abort;
3642 }
3643 tvp = atomic_load_ptr(&mp->mnt_vnodecovered);
3644 tvp_seqc = vn_seqc_read_any(tvp);
3645 if (seqc_in_modify(tvp_seqc)) {
3646 cache_rev_failed(&reason);
3647 goto out_abort;
3648 }
3649 if (!vn_seqc_consistent(vp, vp_seqc)) {
3650 cache_rev_failed(&reason);
3651 goto out_abort;
3652 }
3653 vp = tvp;
3654 vp_seqc = tvp_seqc;
3655 continue;
3656 }
3657 ncp = atomic_load_consume_ptr(&vp->v_cache_dd);
3658 if (ncp == NULL) {
3659 cache_rev_failed(&reason);
3660 goto out_abort;
3661 }
3662 nc_flag = atomic_load_char(&ncp->nc_flag);
3663 if ((nc_flag & NCF_ISDOTDOT) != 0) {
3664 cache_rev_failed(&reason);
3665 goto out_abort;
3666 }
3667 if (ncp->nc_nlen >= *buflen) {
3668 cache_rev_failed(&reason);
3669 error = ENOMEM;
3670 goto out_abort;
3671 }
3672 *buflen -= ncp->nc_nlen;
3673 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
3674 *buflen -= 1;
3675 buf[*buflen] = '/';
3676 tvp = ncp->nc_dvp;
3677 tvp_seqc = vn_seqc_read_any(tvp);
3678 if (seqc_in_modify(tvp_seqc)) {
3679 cache_rev_failed(&reason);
3680 goto out_abort;
3681 }
3682 if (!vn_seqc_consistent(vp, vp_seqc)) {
3683 cache_rev_failed(&reason);
3684 goto out_abort;
3685 }
3686 /*
3687 * Acquire fence provided by vn_seqc_read_any above.
3688 */
3689 if (__predict_false(atomic_load_ptr(&vp->v_cache_dd) != ncp)) {
3690 cache_rev_failed(&reason);
3691 goto out_abort;
3692 }
3693 if (!cache_ncp_canuse(ncp)) {
3694 cache_rev_failed(&reason);
3695 goto out_abort;
3696 }
3697 vp = tvp;
3698 vp_seqc = tvp_seqc;
3699 if (vp == rdir || vp == rootvnode)
3700 break;
3701 }
3702 out_ok:
3703 vfs_smr_exit();
3704 *retbuf = buf + *buflen;
3705 *buflen = orig_buflen - *buflen + addend;
3706 SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf);
3707 return (0);
3708
3709 out_abort:
3710 *buflen = orig_buflen;
3711 SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i);
3712 vfs_smr_exit();
3713 return (error);
3714 }
3715
3716 static int
3717 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
3718 size_t *buflen)
3719 {
3720 size_t orig_buflen, addend;
3721 int error;
3722
3723 if (*buflen < 2)
3724 return (EINVAL);
3725
3726 orig_buflen = *buflen;
3727
3728 vref(vp);
3729 addend = 0;
3730 if (vp->v_type != VDIR) {
3731 *buflen -= 1;
3732 buf[*buflen] = '\0';
3733 error = vn_vptocnp(&vp, buf, buflen);
3734 if (error)
3735 return (error);
3736 if (*buflen == 0) {
3737 vrele(vp);
3738 return (ENOMEM);
3739 }
3740 *buflen -= 1;
3741 buf[*buflen] = '/';
3742 addend = orig_buflen - *buflen;
3743 }
3744
3745 return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, addend));
3746 }
3747
3748 /*
3749 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
3750 *
3751 * Since the namecache does not track hardlinks, the caller is expected to
3752 * first look up the target vnode with WANTPARENT flag passed to namei to get
3753 * dvp and vp.
3754 *
3755 * Then we have 2 cases:
3756 * - if the found vnode is a directory, the path can be constructed just by
3757 * following names up the chain
3758 * - otherwise we populate the buffer with the saved name and start resolving
3759 * from the parent
3760 */
3761 int
3762 vn_fullpath_hardlink(struct vnode *vp, struct vnode *dvp,
3763 const char *hrdl_name, size_t hrdl_name_length,
3764 char **retbuf, char **freebuf, size_t *buflen)
3765 {
3766 char *buf, *tmpbuf;
3767 struct pwd *pwd;
3768 size_t addend;
3769 int error;
3770 __enum_uint8(vtype) type;
3771
3772 if (*buflen < 2)
3773 return (EINVAL);
3774 if (*buflen > MAXPATHLEN)
3775 *buflen = MAXPATHLEN;
3776
3777 buf = malloc(*buflen, M_TEMP, M_WAITOK);
3778
3779 addend = 0;
3780
3781 /*
3782 * Check for VBAD to work around the vp_crossmp bug in lookup().
3783 *
3784 * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be
3785 * set to mount point's root vnode while ni_dvp will be vp_crossmp.
3786 * If the type is VDIR (like in this very case) we can skip looking
3787 * at ni_dvp in the first place. However, since vnodes get passed here
3788 * unlocked the target may transition to doomed state (type == VBAD)
3789 * before we get to evaluate the condition. If this happens, we will
3790 * populate part of the buffer and descend to vn_fullpath_dir with
3791 * vp == vp_crossmp. Prevent the problem by checking for VBAD.
3792 */
3793 type = atomic_load_8(&vp->v_type);
3794 if (type == VBAD) {
3795 error = ENOENT;
3796 goto out_bad;
3797 }
3798 if (type != VDIR) {
3799 addend = hrdl_name_length + 2;
3800 if (*buflen < addend) {
3801 error = ENOMEM;
3802 goto out_bad;
3803 }
3804 *buflen -= addend;
3805 tmpbuf = buf + *buflen;
3806 tmpbuf[0] = '/';
3807 memcpy(&tmpbuf[1], hrdl_name, hrdl_name_length);
3808 tmpbuf[addend - 1] = '\0';
3809 vp = dvp;
3810 }
3811
3812 vfs_smr_enter();
3813 pwd = pwd_get_smr();
3814 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3815 addend);
3816 VFS_SMR_ASSERT_NOT_ENTERED();
3817 if (error < 0) {
3818 pwd = pwd_hold(curthread);
3819 vref(vp);
3820 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3821 addend);
3822 pwd_drop(pwd);
3823 }
3824 if (error != 0)
3825 goto out_bad;
3826
3827 *freebuf = buf;
3828
3829 return (0);
3830 out_bad:
3831 free(buf, M_TEMP);
3832 return (error);
3833 }
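
/*
 * Illustrative sketch of a caller (editorial addition, hypothetical code kept
 * under #if 0): the target is looked up with WANTPARENT so that both dvp and
 * vp, as well as the final component name, are available.
 */
#if 0
	struct nameidata nd;
	char *retbuf, *freebuf;
	size_t buflen;
	int error;

	NDINIT(&nd, LOOKUP, FOLLOW | WANTPARENT, UIO_SYSSPACE, path);
	error = namei(&nd);
	if (error != 0)
		return (error);
	buflen = MAXPATHLEN;
	error = vn_fullpath_hardlink(nd.ni_vp, nd.ni_dvp, nd.ni_cnd.cn_nameptr,
	    nd.ni_cnd.cn_namelen, &retbuf, &freebuf, &buflen);
	if (error == 0) {
		/* retbuf points into freebuf; copy it out before freeing. */
		free(freebuf, M_TEMP);
	}
	/* Release the vnodes and path buffer obtained by namei(). */
#endif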
3834
3835 struct vnode *
3836 vn_dir_dd_ino(struct vnode *vp)
3837 {
3838 struct namecache *ncp;
3839 struct vnode *ddvp;
3840 struct mtx *vlp;
3841 enum vgetstate vs;
3842
3843 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
3844 vlp = VP2VNODELOCK(vp);
3845 mtx_lock(vlp);
3846 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
3847 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
3848 continue;
3849 ddvp = ncp->nc_dvp;
3850 vs = vget_prep(ddvp);
3851 mtx_unlock(vlp);
3852 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
3853 return (NULL);
3854 return (ddvp);
3855 }
3856 mtx_unlock(vlp);
3857 return (NULL);
3858 }
3859
3860 int
3861 vn_commname(struct vnode *vp, char *buf, u_int buflen)
3862 {
3863 struct namecache *ncp;
3864 struct mtx *vlp;
3865 int l;
3866
3867 vlp = VP2VNODELOCK(vp);
3868 mtx_lock(vlp);
3869 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
3870 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
3871 break;
3872 if (ncp == NULL) {
3873 mtx_unlock(vlp);
3874 return (ENOENT);
3875 }
3876 l = min(ncp->nc_nlen, buflen - 1);
3877 memcpy(buf, ncp->nc_name, l);
3878 mtx_unlock(vlp);
3879 buf[l] = '\0';
3880 return (0);
3881 }
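
/*
 * Illustrative sketch (editorial addition, hypothetical code under #if 0):
 * fetching the last cached name of a vnode, e.g. for diagnostic output.
 */
#if 0
	char name[NAME_MAX + 1];

	if (vn_commname(vp, name, sizeof(name)) == 0)
		printf("last known name: %s\n", name);
	else
		printf("name not present in the namecache\n");
#endif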
3882
3883 /*
3884  * This function updates the path string to the vnode's full global path
3885 * and checks the size of the new path string against the pathlen argument.
3886 *
3887 * Requires a locked, referenced vnode.
3888 * Vnode is re-locked on success or ENODEV, otherwise unlocked.
3889 *
3890 * If vp is a directory, the call to vn_fullpath_global() always succeeds
3891 * because it falls back to the ".." lookup if the namecache lookup fails.
3892 */
3893 int
3894 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
3895 u_int pathlen)
3896 {
3897 struct nameidata nd;
3898 struct vnode *vp1;
3899 char *rpath, *fbuf;
3900 int error;
3901
3902 ASSERT_VOP_ELOCKED(vp, __func__);
3903
3904 /* Construct global filesystem path from vp. */
3905 VOP_UNLOCK(vp);
3906 error = vn_fullpath_global(vp, &rpath, &fbuf);
3907
3908 if (error != 0) {
3909 vrele(vp);
3910 return (error);
3911 }
3912
3913 if (strlen(rpath) >= pathlen) {
3914 vrele(vp);
3915 error = ENAMETOOLONG;
3916 goto out;
3917 }
3918
3919 /*
3920 * Re-lookup the vnode by path to detect a possible rename.
3921 * As a side effect, the vnode is relocked.
3922 * If vnode was renamed, return ENOENT.
3923 */
3924 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, path);
3925 error = namei(&nd);
3926 if (error != 0) {
3927 vrele(vp);
3928 goto out;
3929 }
3930 NDFREE_PNBUF(&nd);
3931 vp1 = nd.ni_vp;
3932 vrele(vp);
3933 if (vp1 == vp)
3934 strcpy(path, rpath);
3935 else {
3936 vput(vp1);
3937 error = ENOENT;
3938 }
3939
3940 out:
3941 free(fbuf, M_TEMP);
3942 return (error);
3943 }
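
/*
 * Illustrative sketch of the calling contract (editorial addition,
 * hypothetical code under #if 0), per the comment above: the vnode comes in
 * locked and referenced, and path[] already holds some path for it.
 */
#if 0
	char path[MAXPATHLEN];

	/* vp locked (exclusive) and referenced; path[] filled in by the caller. */
	error = vn_path_to_global_path(curthread, vp, path, sizeof(path));
	if (error == 0) {
		/* path[] was rewritten in place and vp is locked again. */
	}
#endif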
3944
3945 /*
3946 * This is similar to vn_path_to_global_path but allows for regular
3947 * files which may not be present in the cache.
3948 *
3949 * Requires a locked, referenced vnode.
3950 * Vnode is re-locked on success or ENODEV, otherwise unlocked.
3951 */
3952 int
3953 vn_path_to_global_path_hardlink(struct thread *td, struct vnode *vp,
3954 struct vnode *dvp, char *path, u_int pathlen, const char *leaf_name,
3955 size_t leaf_length)
3956 {
3957 struct nameidata nd;
3958 struct vnode *vp1;
3959 char *rpath, *fbuf;
3960 size_t len;
3961 int error;
3962
3963 ASSERT_VOP_ELOCKED(vp, __func__);
3964
3965 /*
3966 * Construct global filesystem path from dvp, vp and leaf
3967 * name.
3968 */
3969 VOP_UNLOCK(vp);
3970 len = pathlen;
3971 error = vn_fullpath_hardlink(vp, dvp, leaf_name, leaf_length,
3972 &rpath, &fbuf, &len);
3973
3974 if (error != 0) {
3975 vrele(vp);
3976 return (error);
3977 }
3978
3979 if (strlen(rpath) >= pathlen) {
3980 vrele(vp);
3981 error = ENAMETOOLONG;
3982 goto out;
3983 }
3984
3985 /*
3986 * Re-lookup the vnode by path to detect a possible rename.
3987 * As a side effect, the vnode is relocked.
3988 * If vnode was renamed, return ENOENT.
3989 */
3990 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, path);
3991 error = namei(&nd);
3992 if (error != 0) {
3993 vrele(vp);
3994 goto out;
3995 }
3996 NDFREE_PNBUF(&nd);
3997 vp1 = nd.ni_vp;
3998 vrele(vp);
3999 if (vp1 == vp)
4000 strcpy(path, rpath);
4001 else {
4002 vput(vp1);
4003 error = ENOENT;
4004 }
4005
4006 out:
4007 free(fbuf, M_TEMP);
4008 return (error);
4009 }
4010
4011 #ifdef DDB
4012 static void
4013 db_print_vpath(struct vnode *vp)
4014 {
4015
4016 while (vp != NULL) {
4017 db_printf("%p: ", vp);
4018 if (vp == rootvnode) {
4019 db_printf("/");
4020 vp = NULL;
4021 } else {
4022 if (vp->v_vflag & VV_ROOT) {
4023 db_printf("<mount point>");
4024 vp = vp->v_mount->mnt_vnodecovered;
4025 } else {
4026 struct namecache *ncp;
4027 char *ncn;
4028 int i;
4029
4030 ncp = TAILQ_FIRST(&vp->v_cache_dst);
4031 if (ncp != NULL) {
4032 ncn = ncp->nc_name;
4033 for (i = 0; i < ncp->nc_nlen; i++)
4034 db_printf("%c", *ncn++);
4035 vp = ncp->nc_dvp;
4036 } else {
4037 vp = NULL;
4038 }
4039 }
4040 }
4041 db_printf("\n");
4042 }
4043
4044 return;
4045 }
4046
4047 DB_SHOW_COMMAND(vpath, db_show_vpath)
4048 {
4049 struct vnode *vp;
4050
4051 if (!have_addr) {
4052 db_printf("usage: show vpath <struct vnode *>\n");
4053 return;
4054 }
4055
4056 vp = (struct vnode *)addr;
4057 db_print_vpath(vp);
4058 }
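
/*
 * Editorial note -- example ddb(4) session (addresses and names made up):
 *
 *	db> show vpath 0xfffff80012345678
 *	0xfffff80012345678: lib
 *	0xfffff80012345100: local
 *	0xfffff80012340200: usr
 *	0xfffff80012340000: /
 */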
4059
4060 #endif
4061
4062 static int cache_fast_lookup = 1;
4063
4064 #define CACHE_FPL_FAILED -2020
4065
4066 static int
4067 cache_vop_bad_vexec(struct vop_fplookup_vexec_args *v)
4068 {
4069 vn_printf(v->a_vp, "no proper vop_fplookup_vexec\n");
4070 panic("no proper vop_fplookup_vexec");
4071 }
4072
4073 static int
4074 cache_vop_bad_symlink(struct vop_fplookup_symlink_args *v)
4075 {
4076 vn_printf(v->a_vp, "no proper vop_fplookup_symlink\n");
4077 panic("no proper vop_fplookup_symlink");
4078 }
4079
4080 void
4081 cache_vop_vector_register(struct vop_vector *v)
4082 {
4083 size_t ops;
4084
4085 ops = 0;
4086 if (v->vop_fplookup_vexec != NULL) {
4087 ops++;
4088 }
4089 if (v->vop_fplookup_symlink != NULL) {
4090 ops++;
4091 }
4092
4093 if (ops == 2) {
4094 return;
4095 }
4096
4097 if (ops == 0) {
4098 v->vop_fplookup_vexec = cache_vop_bad_vexec;
4099 v->vop_fplookup_symlink = cache_vop_bad_symlink;
4100 return;
4101 }
4102
4103 printf("%s: invalid vop vector %p -- either all or none fplookup vops "
4104 "need to be provided", __func__, v);
4105 if (v->vop_fplookup_vexec == NULL) {
4106 printf("%s: missing vop_fplookup_vexec\n", __func__);
4107 }
4108 if (v->vop_fplookup_symlink == NULL) {
4109 printf("%s: missing vop_fplookup_symlink\n", __func__);
4110 }
4111 panic("bad vop vector %p", v);
4112 }
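
/*
 * Illustrative sketch (editorial addition, hypothetical filesystem "foofs"
 * kept under #if 0): a vop vector opting into lockless lookup must provide
 * both fplookup vops; vectors providing neither get the panicking
 * placeholders installed by cache_vop_vector_register() above.
 */
#if 0
struct vop_vector foofs_vnodeops = {
	.vop_default =		&default_vnodeops,
	.vop_lookup =		foofs_lookup,
	.vop_fplookup_vexec =	foofs_fplookup_vexec,
	.vop_fplookup_symlink =	foofs_fplookup_symlink,
	/* ... remaining vops ... */
};
VFS_VOP_VECTOR_REGISTER(foofs_vnodeops);
#endif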
4113
4114 #ifdef INVARIANTS
4115 void
4116 cache_validate_vop_vector(struct mount *mp, struct vop_vector *vops)
4117 {
4118 if (mp == NULL)
4119 return;
4120
4121 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
4122 return;
4123
4124 if (vops->vop_fplookup_vexec == NULL ||
4125 vops->vop_fplookup_vexec == cache_vop_bad_vexec)
4126 panic("bad vop_fplookup_vexec on vector %p for filesystem %s",
4127 vops, mp->mnt_vfc->vfc_name);
4128
4129 if (vops->vop_fplookup_symlink == NULL ||
4130 vops->vop_fplookup_symlink == cache_vop_bad_symlink)
4131 panic("bad vop_fplookup_symlink on vector %p for filesystem %s",
4132 vops, mp->mnt_vfc->vfc_name);
4133 }
4134 #endif
4135
4136 void
4137 cache_fast_lookup_enabled_recalc(void)
4138 {
4139 int lookup_flag;
4140 int mac_on;
4141
4142 #ifdef MAC
4143 mac_on = mac_vnode_check_lookup_enabled();
4144 mac_on |= mac_vnode_check_readlink_enabled();
4145 #else
4146 mac_on = 0;
4147 #endif
4148
4149 lookup_flag = atomic_load_int(&cache_fast_lookup);
4150 if (lookup_flag && !mac_on) {
4151 atomic_store_char(&cache_fast_lookup_enabled, true);
4152 } else {
4153 atomic_store_char(&cache_fast_lookup_enabled, false);
4154 }
4155 }
4156
4157 static int
4158 syscal_vfs_cache_fast_lookup(SYSCTL_HANDLER_ARGS)
4159 {
4160 int error, old;
4161
4162 old = atomic_load_int(&cache_fast_lookup);
4163 error = sysctl_handle_int(oidp, arg1, arg2, req);
4164 if (error == 0 && req->newptr && old != atomic_load_int(&cache_fast_lookup))
4165 cache_fast_lookup_enabled_recalc();
4166 return (error);
4167 }
4168 SYSCTL_PROC(_vfs_cache_param, OID_AUTO, fast_lookup, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_MPSAFE,
4169 &cache_fast_lookup, 0, syscal_vfs_cache_fast_lookup, "IU", "");
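
/*
 * Editorial note: the knob can be flipped at run time.  Assuming the
 * _vfs_cache_param node declared earlier in this file corresponds to
 * vfs.cache.param, disabling lockless lookup would look like (illustrative
 * only):
 *
 *	sysctl vfs.cache.param.fast_lookup=0
 */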
4170
4171 /*
4172 * Components of nameidata (or objects it can point to) which may
4173 * need restoring in case fast path lookup fails.
4174 */
4175 struct nameidata_outer {
4176 size_t ni_pathlen;
4177 uint64_t cn_flags;
4178 };
4179
4180 struct nameidata_saved {
4181 #ifdef INVARIANTS
4182 char *cn_nameptr;
4183 size_t ni_pathlen;
4184 #endif
4185 };
4186
4187 #ifdef INVARIANTS
4188 struct cache_fpl_debug {
4189 size_t ni_pathlen;
4190 };
4191 #endif
4192
4193 struct cache_fpl {
4194 struct nameidata *ndp;
4195 struct componentname *cnp;
4196 char *nulchar;
4197 struct vnode *dvp;
4198 struct vnode *tvp;
4199 seqc_t dvp_seqc;
4200 seqc_t tvp_seqc;
4201 uint32_t hash;
4202 struct nameidata_saved snd;
4203 struct nameidata_outer snd_outer;
4204 int line;
4205 enum cache_fpl_status status:8;
4206 bool in_smr;
4207 bool fsearch;
4208 struct pwd **pwd;
4209 #ifdef INVARIANTS
4210 struct cache_fpl_debug debug;
4211 #endif
4212 };
4213
4214 static bool cache_fplookup_mp_supported(struct mount *mp);
4215 static bool cache_fplookup_is_mp(struct cache_fpl *fpl);
4216 static int cache_fplookup_cross_mount(struct cache_fpl *fpl);
4217 static int cache_fplookup_partial_setup(struct cache_fpl *fpl);
4218 static int cache_fplookup_skip_slashes(struct cache_fpl *fpl);
4219 static int cache_fplookup_trailingslash(struct cache_fpl *fpl);
4220 static void cache_fpl_pathlen_dec(struct cache_fpl *fpl);
4221 static void cache_fpl_pathlen_inc(struct cache_fpl *fpl);
4222 static void cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n);
4223 static void cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n);
4224
4225 static void
4226 cache_fpl_cleanup_cnp(struct componentname *cnp)
4227 {
4228
4229 uma_zfree(namei_zone, cnp->cn_pnbuf);
4230 cnp->cn_pnbuf = NULL;
4231 cnp->cn_nameptr = NULL;
4232 }
4233
4234 static struct vnode *
4235 cache_fpl_handle_root(struct cache_fpl *fpl)
4236 {
4237 struct nameidata *ndp;
4238 struct componentname *cnp;
4239
4240 ndp = fpl->ndp;
4241 cnp = fpl->cnp;
4242
4243 MPASS(*(cnp->cn_nameptr) == '/');
4244 cnp->cn_nameptr++;
4245 cache_fpl_pathlen_dec(fpl);
4246
4247 if (__predict_false(*(cnp->cn_nameptr) == '/')) {
4248 do {
4249 cnp->cn_nameptr++;
4250 cache_fpl_pathlen_dec(fpl);
4251 } while (*(cnp->cn_nameptr) == '/');
4252 }
4253
4254 return (ndp->ni_rootdir);
4255 }
4256
4257 static void
4258 cache_fpl_checkpoint_outer(struct cache_fpl *fpl)
4259 {
4260
4261 fpl->snd_outer.ni_pathlen = fpl->ndp->ni_pathlen;
4262 fpl->snd_outer.cn_flags = fpl->ndp->ni_cnd.cn_flags;
4263 }
4264
4265 static void
4266 cache_fpl_checkpoint(struct cache_fpl *fpl)
4267 {
4268
4269 #ifdef INVARIANTS
4270 fpl->snd.cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
4271 fpl->snd.ni_pathlen = fpl->debug.ni_pathlen;
4272 #endif
4273 }
4274
4275 static void
4276 cache_fpl_restore_partial(struct cache_fpl *fpl)
4277 {
4278
4279 fpl->ndp->ni_cnd.cn_flags = fpl->snd_outer.cn_flags;
4280 #ifdef INVARIANTS
4281 fpl->debug.ni_pathlen = fpl->snd.ni_pathlen;
4282 #endif
4283 }
4284
4285 static void
4286 cache_fpl_restore_abort(struct cache_fpl *fpl)
4287 {
4288
4289 cache_fpl_restore_partial(fpl);
4290 /*
4291 * It is 0 on entry by API contract.
4292 */
4293 fpl->ndp->ni_resflags = 0;
4294 fpl->ndp->ni_cnd.cn_nameptr = fpl->ndp->ni_cnd.cn_pnbuf;
4295 fpl->ndp->ni_pathlen = fpl->snd_outer.ni_pathlen;
4296 }
4297
4298 #ifdef INVARIANTS
4299 #define cache_fpl_smr_assert_entered(fpl) ({ \
4300 struct cache_fpl *_fpl = (fpl); \
4301 MPASS(_fpl->in_smr == true); \
4302 VFS_SMR_ASSERT_ENTERED(); \
4303 })
4304 #define cache_fpl_smr_assert_not_entered(fpl) ({ \
4305 struct cache_fpl *_fpl = (fpl); \
4306 MPASS(_fpl->in_smr == false); \
4307 VFS_SMR_ASSERT_NOT_ENTERED(); \
4308 })
4309 static void
4310 cache_fpl_assert_status(struct cache_fpl *fpl)
4311 {
4312
4313 switch (fpl->status) {
4314 case CACHE_FPL_STATUS_UNSET:
4315 __assert_unreachable();
4316 break;
4317 case CACHE_FPL_STATUS_DESTROYED:
4318 case CACHE_FPL_STATUS_ABORTED:
4319 case CACHE_FPL_STATUS_PARTIAL:
4320 case CACHE_FPL_STATUS_HANDLED:
4321 break;
4322 }
4323 }
4324 #else
4325 #define cache_fpl_smr_assert_entered(fpl) do { } while (0)
4326 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
4327 #define cache_fpl_assert_status(fpl) do { } while (0)
4328 #endif
4329
4330 #define cache_fpl_smr_enter_initial(fpl) ({ \
4331 struct cache_fpl *_fpl = (fpl); \
4332 vfs_smr_enter(); \
4333 _fpl->in_smr = true; \
4334 })
4335
4336 #define cache_fpl_smr_enter(fpl) ({ \
4337 struct cache_fpl *_fpl = (fpl); \
4338 MPASS(_fpl->in_smr == false); \
4339 vfs_smr_enter(); \
4340 _fpl->in_smr = true; \
4341 })
4342
4343 #define cache_fpl_smr_exit(fpl) ({ \
4344 struct cache_fpl *_fpl = (fpl); \
4345 MPASS(_fpl->in_smr == true); \
4346 vfs_smr_exit(); \
4347 _fpl->in_smr = false; \
4348 })
4349
4350 static int
4351 cache_fpl_aborted_early_impl(struct cache_fpl *fpl, int line)
4352 {
4353
4354 if (fpl->status != CACHE_FPL_STATUS_UNSET) {
4355 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
4356 ("%s: converting to abort from %d at %d, set at %d\n",
4357 __func__, fpl->status, line, fpl->line));
4358 }
4359 cache_fpl_smr_assert_not_entered(fpl);
4360 fpl->status = CACHE_FPL_STATUS_ABORTED;
4361 fpl->line = line;
4362 return (CACHE_FPL_FAILED);
4363 }
4364
4365 #define cache_fpl_aborted_early(x) cache_fpl_aborted_early_impl((x), __LINE__)
4366
4367 static int __noinline
4368 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
4369 {
4370 struct nameidata *ndp;
4371 struct componentname *cnp;
4372
4373 ndp = fpl->ndp;
4374 cnp = fpl->cnp;
4375
4376 if (fpl->status != CACHE_FPL_STATUS_UNSET) {
4377 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
4378 ("%s: converting to abort from %d at %d, set at %d\n",
4379 __func__, fpl->status, line, fpl->line));
4380 }
4381 fpl->status = CACHE_FPL_STATUS_ABORTED;
4382 fpl->line = line;
4383 if (fpl->in_smr)
4384 cache_fpl_smr_exit(fpl);
4385 cache_fpl_restore_abort(fpl);
4386 /*
4387 * Resolving symlinks overwrites data passed by the caller.
4388 * Let namei know.
4389 */
4390 if (ndp->ni_loopcnt > 0) {
4391 fpl->status = CACHE_FPL_STATUS_DESTROYED;
4392 cache_fpl_cleanup_cnp(cnp);
4393 }
4394 return (CACHE_FPL_FAILED);
4395 }
4396
4397 #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__)
4398
4399 static int __noinline
4400 cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
4401 {
4402
4403 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
4404 ("%s: setting to partial at %d, but already set to %d at %d\n",
4405 __func__, line, fpl->status, fpl->line));
4406 cache_fpl_smr_assert_entered(fpl);
4407 fpl->status = CACHE_FPL_STATUS_PARTIAL;
4408 fpl->line = line;
4409 return (cache_fplookup_partial_setup(fpl));
4410 }
4411
4412 #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__)
4413
4414 static int
4415 cache_fpl_handled_impl(struct cache_fpl *fpl, int line)
4416 {
4417
4418 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
4419 ("%s: setting to handled at %d, but already set to %d at %d\n",
4420 __func__, line, fpl->status, fpl->line));
4421 cache_fpl_smr_assert_not_entered(fpl);
4422 fpl->status = CACHE_FPL_STATUS_HANDLED;
4423 fpl->line = line;
4424 return (0);
4425 }
4426
4427 #define cache_fpl_handled(x) cache_fpl_handled_impl((x), __LINE__)
4428
4429 static int
4430 cache_fpl_handled_error_impl(struct cache_fpl *fpl, int error, int line)
4431 {
4432
4433 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
4434 ("%s: setting to handled at %d, but already set to %d at %d\n",
4435 __func__, line, fpl->status, fpl->line));
4436 MPASS(error != 0);
4437 MPASS(error != CACHE_FPL_FAILED);
4438 cache_fpl_smr_assert_not_entered(fpl);
4439 fpl->status = CACHE_FPL_STATUS_HANDLED;
4440 fpl->line = line;
4441 fpl->dvp = NULL;
4442 fpl->tvp = NULL;
4443 return (error);
4444 }
4445
4446 #define cache_fpl_handled_error(x, e) cache_fpl_handled_error_impl((x), (e), __LINE__)
4447
4448 static bool
4449 cache_fpl_terminated(struct cache_fpl *fpl)
4450 {
4451
4452 return (fpl->status != CACHE_FPL_STATUS_UNSET);
4453 }
4454
4455 #define CACHE_FPL_SUPPORTED_CN_FLAGS \
4456 (NC_NOMAKEENTRY | NC_KEEPPOSENTRY | LOCKLEAF | LOCKPARENT | WANTPARENT | \
4457 FAILIFEXISTS | FOLLOW | EMPTYPATH | LOCKSHARED | ISRESTARTED | WILLBEDIR | \
4458 ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK | OPENREAD | \
4459 OPENWRITE | WANTIOCTLCAPS | NAMEILOOKUP)
4460
4461 #define CACHE_FPL_INTERNAL_CN_FLAGS \
4462 (ISDOTDOT | MAKEENTRY | ISLASTCN)
4463
4464 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
4465 "supported and internal flags overlap");
4466
4467 static bool
4468 cache_fpl_islastcn(struct nameidata *ndp)
4469 {
4470
4471 return (*ndp->ni_next == 0);
4472 }
4473
4474 static bool
4475 cache_fpl_istrailingslash(struct cache_fpl *fpl)
4476 {
4477
4478 MPASS(fpl->nulchar > fpl->cnp->cn_pnbuf);
4479 return (*(fpl->nulchar - 1) == '/');
4480 }
4481
4482 static bool
4483 cache_fpl_isdotdot(struct componentname *cnp)
4484 {
4485
4486 if (cnp->cn_namelen == 2 &&
4487 cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
4488 return (true);
4489 return (false);
4490 }
4491
4492 static bool
4493 cache_can_fplookup(struct cache_fpl *fpl)
4494 {
4495 struct nameidata *ndp;
4496 struct componentname *cnp;
4497 struct thread *td;
4498
4499 ndp = fpl->ndp;
4500 cnp = fpl->cnp;
4501 td = curthread;
4502
4503 if (!atomic_load_char(&cache_fast_lookup_enabled)) {
4504 cache_fpl_aborted_early(fpl);
4505 return (false);
4506 }
4507 if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
4508 cache_fpl_aborted_early(fpl);
4509 return (false);
4510 }
4511 if (IN_CAPABILITY_MODE(td) || CAP_TRACING(td)) {
4512 cache_fpl_aborted_early(fpl);
4513 return (false);
4514 }
4515 if (AUDITING_TD(td)) {
4516 cache_fpl_aborted_early(fpl);
4517 return (false);
4518 }
4519 if (ndp->ni_startdir != NULL) {
4520 cache_fpl_aborted_early(fpl);
4521 return (false);
4522 }
4523 return (true);
4524 }
4525
4526 static int __noinline
4527 cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp)
4528 {
4529 struct nameidata *ndp;
4530 struct componentname *cnp;
4531 int error, flags;
4532
4533 ndp = fpl->ndp;
4534 cnp = fpl->cnp;
4535
4536 error = fgetvp_lookup_smr(ndp, vpp, &flags);
4537 if (__predict_false(error != 0)) {
4538 return (cache_fpl_aborted(fpl));
4539 }
4540 if (__predict_false((flags & O_RESOLVE_BENEATH) != 0)) {
4541 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & RBENEATH) == 0,
4542 "RBENEATH supported by fplookup");
4543 cache_fpl_smr_exit(fpl);
4544 cache_fpl_aborted(fpl);
4545 return (EOPNOTSUPP);
4546 }
4547 fpl->fsearch = (flags & FSEARCH) != 0;
4548 if ((*vpp)->v_type != VDIR) {
4549 if (!((cnp->cn_flags & EMPTYPATH) != 0 && cnp->cn_pnbuf[0] == '\0')) {
4550 cache_fpl_smr_exit(fpl);
4551 return (cache_fpl_handled_error(fpl, ENOTDIR));
4552 }
4553 }
4554 return (0);
4555 }
4556
4557 static int __noinline
4558 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
4559 uint32_t hash)
4560 {
4561 struct componentname *cnp;
4562 struct vnode *dvp;
4563
4564 cnp = fpl->cnp;
4565 dvp = fpl->dvp;
4566
4567 cache_fpl_smr_exit(fpl);
4568 if (cache_neg_promote_cond(dvp, cnp, oncp, hash))
4569 return (cache_fpl_handled_error(fpl, ENOENT));
4570 else
4571 return (cache_fpl_aborted(fpl));
4572 }
4573
4574 /*
4575 * The target vnode is not supported, prepare for the slow path to take over.
4576 */
4577 static int __noinline
4578 cache_fplookup_partial_setup(struct cache_fpl *fpl)
4579 {
4580 struct nameidata *ndp;
4581 struct componentname *cnp;
4582 enum vgetstate dvs;
4583 struct vnode *dvp;
4584 struct pwd *pwd;
4585 seqc_t dvp_seqc;
4586
4587 ndp = fpl->ndp;
4588 cnp = fpl->cnp;
4589 pwd = *(fpl->pwd);
4590 dvp = fpl->dvp;
4591 dvp_seqc = fpl->dvp_seqc;
4592
4593 if (!pwd_hold_smr(pwd)) {
4594 return (cache_fpl_aborted(fpl));
4595 }
4596
4597 /*
4598 * Note that seqc is checked before the vnode is locked, so by
4599 	 * the time regular lookup gets to it, it may have moved.
4600 *
4601 * Ultimately this does not affect correctness, any lookup errors
4602 * are userspace racing with itself. It is guaranteed that any
4603 * path which ultimately gets found could also have been found
4604 * by regular lookup going all the way in absence of concurrent
4605 * modifications.
4606 */
4607 dvs = vget_prep_smr(dvp);
4608 cache_fpl_smr_exit(fpl);
4609 if (__predict_false(dvs == VGET_NONE)) {
4610 pwd_drop(pwd);
4611 return (cache_fpl_aborted(fpl));
4612 }
4613
4614 vget_finish_ref(dvp, dvs);
4615 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4616 vrele(dvp);
4617 pwd_drop(pwd);
4618 return (cache_fpl_aborted(fpl));
4619 }
4620
4621 cache_fpl_restore_partial(fpl);
4622 #ifdef INVARIANTS
4623 if (cnp->cn_nameptr != fpl->snd.cn_nameptr) {
4624 panic("%s: cn_nameptr mismatch (%p != %p) full [%s]\n", __func__,
4625 cnp->cn_nameptr, fpl->snd.cn_nameptr, cnp->cn_pnbuf);
4626 }
4627 #endif
4628
4629 ndp->ni_startdir = dvp;
4630 cnp->cn_flags |= MAKEENTRY;
4631 if (cache_fpl_islastcn(ndp))
4632 cnp->cn_flags |= ISLASTCN;
4633 if (cache_fpl_isdotdot(cnp))
4634 cnp->cn_flags |= ISDOTDOT;
4635
4636 /*
4637 	 * Skip potential extra slashes that parsing did not take care of.
4638 * cache_fplookup_skip_slashes explains the mechanism.
4639 */
4640 if (__predict_false(*(cnp->cn_nameptr) == '/')) {
4641 do {
4642 cnp->cn_nameptr++;
4643 cache_fpl_pathlen_dec(fpl);
4644 } while (*(cnp->cn_nameptr) == '/');
4645 }
4646
4647 ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1;
4648 #ifdef INVARIANTS
4649 if (ndp->ni_pathlen != fpl->debug.ni_pathlen) {
4650 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
4651 __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
4652 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
4653 }
4654 #endif
4655 return (0);
4656 }
4657
4658 static int
4659 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs)
4660 {
4661 struct componentname *cnp;
4662 struct vnode *tvp;
4663 seqc_t tvp_seqc;
4664 int error, lkflags;
4665
4666 cnp = fpl->cnp;
4667 tvp = fpl->tvp;
4668 tvp_seqc = fpl->tvp_seqc;
4669
4670 if ((cnp->cn_flags & LOCKLEAF) != 0) {
4671 lkflags = LK_SHARED;
4672 if ((cnp->cn_flags & LOCKSHARED) == 0)
4673 lkflags = LK_EXCLUSIVE;
4674 error = vget_finish(tvp, lkflags, tvs);
4675 if (__predict_false(error != 0)) {
4676 return (cache_fpl_aborted(fpl));
4677 }
4678 } else {
4679 vget_finish_ref(tvp, tvs);
4680 }
4681
4682 if (!vn_seqc_consistent(tvp, tvp_seqc)) {
4683 if ((cnp->cn_flags & LOCKLEAF) != 0)
4684 vput(tvp);
4685 else
4686 vrele(tvp);
4687 return (cache_fpl_aborted(fpl));
4688 }
4689
4690 return (cache_fpl_handled(fpl));
4691 }
4692
4693 /*
4694  * The caller may want to modify the state of the namecache.
4695 */
4696 static int __noinline
4697 cache_fplookup_final_modifying(struct cache_fpl *fpl)
4698 {
4699 struct nameidata *ndp __diagused;
4700 struct componentname *cnp;
4701 enum vgetstate dvs;
4702 struct vnode *dvp, *tvp;
4703 struct mount *mp;
4704 seqc_t dvp_seqc;
4705 int error;
4706 bool docache;
4707
4708 ndp = fpl->ndp;
4709 cnp = fpl->cnp;
4710 dvp = fpl->dvp;
4711 dvp_seqc = fpl->dvp_seqc;
4712
4713 MPASS(*(cnp->cn_nameptr) != '/');
4714 MPASS(cache_fpl_islastcn(ndp));
4715 if ((cnp->cn_flags & LOCKPARENT) == 0)
4716 MPASS((cnp->cn_flags & WANTPARENT) != 0);
4717 MPASS((cnp->cn_flags & TRAILINGSLASH) == 0);
4718 MPASS(cnp->cn_nameiop == CREATE || cnp->cn_nameiop == DELETE ||
4719 cnp->cn_nameiop == RENAME);
4720 MPASS((cnp->cn_flags & MAKEENTRY) == 0);
4721 MPASS((cnp->cn_flags & ISDOTDOT) == 0);
4722
4723 docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
4724 if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)
4725 docache = false;
4726
4727 /*
4728 	 * Regular lookup nullifies the slash, which we don't do here.
4729 * Don't take chances with filesystem routines seeing it for
4730 * the last entry.
4731 */
4732 if (cache_fpl_istrailingslash(fpl)) {
4733 return (cache_fpl_partial(fpl));
4734 }
4735
4736 mp = atomic_load_ptr(&dvp->v_mount);
4737 if (__predict_false(mp == NULL)) {
4738 return (cache_fpl_aborted(fpl));
4739 }
4740
4741 if (__predict_false(mp->mnt_flag & MNT_RDONLY)) {
4742 cache_fpl_smr_exit(fpl);
4743 /*
4744 		 * The original code does not check for CREATE, which
4745 * might be a bug. For now let the old lookup decide.
4746 */
4747 if (cnp->cn_nameiop == CREATE) {
4748 return (cache_fpl_aborted(fpl));
4749 }
4750 return (cache_fpl_handled_error(fpl, EROFS));
4751 }
4752
4753 if (fpl->tvp != NULL && (cnp->cn_flags & FAILIFEXISTS) != 0) {
4754 cache_fpl_smr_exit(fpl);
4755 return (cache_fpl_handled_error(fpl, EEXIST));
4756 }
4757
4758 /*
4759 * Secure access to dvp; check cache_fplookup_partial_setup for
4760 * reasoning.
4761 *
4762 * XXX At least UFS requires its lookup routine to be called for
4763 * the last path component, which leads to some level of complication
4764 * and inefficiency:
4765 * - the target routine always locks the target vnode, but our caller
4766 * may not need it locked
4767 * - some of the VOP machinery asserts that the parent is locked, which
4768 * once more may be not required
4769 *
4770 * TODO: add a flag for filesystems which don't need this.
4771 */
4772 dvs = vget_prep_smr(dvp);
4773 cache_fpl_smr_exit(fpl);
4774 if (__predict_false(dvs == VGET_NONE)) {
4775 return (cache_fpl_aborted(fpl));
4776 }
4777
4778 vget_finish_ref(dvp, dvs);
4779 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4780 vrele(dvp);
4781 return (cache_fpl_aborted(fpl));
4782 }
4783
4784 error = vn_lock(dvp, LK_EXCLUSIVE);
4785 if (__predict_false(error != 0)) {
4786 vrele(dvp);
4787 return (cache_fpl_aborted(fpl));
4788 }
4789
4790 tvp = NULL;
4791 cnp->cn_flags |= ISLASTCN;
4792 if (docache)
4793 cnp->cn_flags |= MAKEENTRY;
4794 if (cache_fpl_isdotdot(cnp))
4795 cnp->cn_flags |= ISDOTDOT;
4796 cnp->cn_lkflags = LK_EXCLUSIVE;
4797 error = VOP_LOOKUP(dvp, &tvp, cnp);
4798 switch (error) {
4799 case EJUSTRETURN:
4800 case 0:
4801 break;
4802 case ENOTDIR:
4803 case ENOENT:
4804 vput(dvp);
4805 return (cache_fpl_handled_error(fpl, error));
4806 default:
4807 vput(dvp);
4808 return (cache_fpl_aborted(fpl));
4809 }
4810
4811 fpl->tvp = tvp;
4812
4813 if (tvp == NULL) {
4814 MPASS(error == EJUSTRETURN);
4815 if ((cnp->cn_flags & LOCKPARENT) == 0) {
4816 VOP_UNLOCK(dvp);
4817 }
4818 return (cache_fpl_handled(fpl));
4819 }
4820
4821 /*
4822 * There are very hairy corner cases concerning various flag combinations
4823 * and locking state. In particular here we only hold one lock instead of
4824 * two.
4825 *
4826 * Skip the complexity as it is of no significance for normal workloads.
4827 */
4828 if (__predict_false(tvp == dvp)) {
4829 vput(dvp);
4830 vrele(tvp);
4831 return (cache_fpl_aborted(fpl));
4832 }
4833
4834 /*
4835 * If they want the symlink itself we are fine, but if they want to
4836 * follow it regular lookup has to be engaged.
4837 */
4838 if (tvp->v_type == VLNK) {
4839 if ((cnp->cn_flags & FOLLOW) != 0) {
4840 vput(dvp);
4841 vput(tvp);
4842 return (cache_fpl_aborted(fpl));
4843 }
4844 }
4845
4846 /*
4847 * Since we expect this to be the terminal vnode it should almost never
4848 * be a mount point.
4849 */
4850 if (__predict_false(cache_fplookup_is_mp(fpl))) {
4851 vput(dvp);
4852 vput(tvp);
4853 return (cache_fpl_aborted(fpl));
4854 }
4855
4856 if ((cnp->cn_flags & FAILIFEXISTS) != 0) {
4857 vput(dvp);
4858 vput(tvp);
4859 return (cache_fpl_handled_error(fpl, EEXIST));
4860 }
4861
4862 if ((cnp->cn_flags & LOCKLEAF) == 0) {
4863 VOP_UNLOCK(tvp);
4864 }
4865
4866 if ((cnp->cn_flags & LOCKPARENT) == 0) {
4867 VOP_UNLOCK(dvp);
4868 }
4869
4870 return (cache_fpl_handled(fpl));
4871 }
4872
4873 static int __noinline
4874 cache_fplookup_modifying(struct cache_fpl *fpl)
4875 {
4876 struct nameidata *ndp;
4877
4878 ndp = fpl->ndp;
4879
4880 if (!cache_fpl_islastcn(ndp)) {
4881 return (cache_fpl_partial(fpl));
4882 }
4883 return (cache_fplookup_final_modifying(fpl));
4884 }
4885
4886 static int __noinline
4887 cache_fplookup_final_withparent(struct cache_fpl *fpl)
4888 {
4889 struct componentname *cnp;
4890 enum vgetstate dvs, tvs;
4891 struct vnode *dvp, *tvp;
4892 seqc_t dvp_seqc;
4893 int error;
4894
4895 cnp = fpl->cnp;
4896 dvp = fpl->dvp;
4897 dvp_seqc = fpl->dvp_seqc;
4898 tvp = fpl->tvp;
4899
4900 MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0);
4901
4902 /*
4903 	 * This is less efficient than it could be, for the sake of simplicity.
4904 */
4905 dvs = vget_prep_smr(dvp);
4906 if (__predict_false(dvs == VGET_NONE)) {
4907 return (cache_fpl_aborted(fpl));
4908 }
4909 tvs = vget_prep_smr(tvp);
4910 if (__predict_false(tvs == VGET_NONE)) {
4911 cache_fpl_smr_exit(fpl);
4912 vget_abort(dvp, dvs);
4913 return (cache_fpl_aborted(fpl));
4914 }
4915
4916 cache_fpl_smr_exit(fpl);
4917
4918 if ((cnp->cn_flags & LOCKPARENT) != 0) {
4919 error = vget_finish(dvp, LK_EXCLUSIVE, dvs);
4920 if (__predict_false(error != 0)) {
4921 vget_abort(tvp, tvs);
4922 return (cache_fpl_aborted(fpl));
4923 }
4924 } else {
4925 vget_finish_ref(dvp, dvs);
4926 }
4927
4928 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4929 vget_abort(tvp, tvs);
4930 if ((cnp->cn_flags & LOCKPARENT) != 0)
4931 vput(dvp);
4932 else
4933 vrele(dvp);
4934 return (cache_fpl_aborted(fpl));
4935 }
4936
4937 error = cache_fplookup_final_child(fpl, tvs);
4938 if (__predict_false(error != 0)) {
4939 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED ||
4940 fpl->status == CACHE_FPL_STATUS_DESTROYED);
4941 if ((cnp->cn_flags & LOCKPARENT) != 0)
4942 vput(dvp);
4943 else
4944 vrele(dvp);
4945 return (error);
4946 }
4947
4948 MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED);
4949 return (0);
4950 }
4951
4952 static int
4953 cache_fplookup_final(struct cache_fpl *fpl)
4954 {
4955 struct componentname *cnp;
4956 enum vgetstate tvs;
4957 struct vnode *dvp, *tvp;
4958 seqc_t dvp_seqc;
4959
4960 cnp = fpl->cnp;
4961 dvp = fpl->dvp;
4962 dvp_seqc = fpl->dvp_seqc;
4963 tvp = fpl->tvp;
4964
4965 MPASS(*(cnp->cn_nameptr) != '/');
4966
4967 if (cnp->cn_nameiop != LOOKUP) {
4968 return (cache_fplookup_final_modifying(fpl));
4969 }
4970
4971 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0)
4972 return (cache_fplookup_final_withparent(fpl));
4973
4974 tvs = vget_prep_smr(tvp);
4975 if (__predict_false(tvs == VGET_NONE)) {
4976 return (cache_fpl_partial(fpl));
4977 }
4978
4979 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4980 cache_fpl_smr_exit(fpl);
4981 vget_abort(tvp, tvs);
4982 return (cache_fpl_aborted(fpl));
4983 }
4984
4985 cache_fpl_smr_exit(fpl);
4986 return (cache_fplookup_final_child(fpl, tvs));
4987 }
4988
4989 /*
4990 * Comment from locked lookup:
4991 * Check for degenerate name (e.g. / or "") which is a way of talking about a
4992 * directory, e.g. like "/." or ".".
4993 */
4994 static int __noinline
4995 cache_fplookup_degenerate(struct cache_fpl *fpl)
4996 {
4997 struct componentname *cnp;
4998 struct vnode *dvp;
4999 enum vgetstate dvs;
5000 int error, lkflags;
5001 #ifdef INVARIANTS
5002 char *cp;
5003 #endif
5004
5005 fpl->tvp = fpl->dvp;
5006 fpl->tvp_seqc = fpl->dvp_seqc;
5007
5008 cnp = fpl->cnp;
5009 dvp = fpl->dvp;
5010
5011 #ifdef INVARIANTS
5012 for (cp = cnp->cn_pnbuf; *cp != '\0'; cp++) {
5013 KASSERT(*cp == '/',
5014 ("%s: encountered non-slash; string [%s]\n", __func__,
5015 cnp->cn_pnbuf));
5016 }
5017 #endif
5018
5019 if (__predict_false(cnp->cn_nameiop != LOOKUP)) {
5020 cache_fpl_smr_exit(fpl);
5021 return (cache_fpl_handled_error(fpl, EISDIR));
5022 }
5023
5024 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) {
5025 return (cache_fplookup_final_withparent(fpl));
5026 }
5027
5028 dvs = vget_prep_smr(dvp);
5029 cache_fpl_smr_exit(fpl);
5030 if (__predict_false(dvs == VGET_NONE)) {
5031 return (cache_fpl_aborted(fpl));
5032 }
5033
5034 if ((cnp->cn_flags & LOCKLEAF) != 0) {
5035 lkflags = LK_SHARED;
5036 if ((cnp->cn_flags & LOCKSHARED) == 0)
5037 lkflags = LK_EXCLUSIVE;
5038 error = vget_finish(dvp, lkflags, dvs);
5039 if (__predict_false(error != 0)) {
5040 return (cache_fpl_aborted(fpl));
5041 }
5042 } else {
5043 vget_finish_ref(dvp, dvs);
5044 }
5045 return (cache_fpl_handled(fpl));
5046 }
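
/*
 * Editorial note -- example: a plain open("/", O_RDONLY) or chdir("/") ends
 * up here; the path consists solely of slashes and resolves to the starting
 * directory itself.
 */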
5047
5048 static int __noinline
5049 cache_fplookup_emptypath(struct cache_fpl *fpl)
5050 {
5051 struct nameidata *ndp;
5052 struct componentname *cnp;
5053 enum vgetstate tvs;
5054 struct vnode *tvp;
5055 int error, lkflags;
5056
5057 fpl->tvp = fpl->dvp;
5058 fpl->tvp_seqc = fpl->dvp_seqc;
5059
5060 ndp = fpl->ndp;
5061 cnp = fpl->cnp;
5062 tvp = fpl->tvp;
5063
5064 MPASS(*cnp->cn_pnbuf == '\0');
5065
5066 if (__predict_false((cnp->cn_flags & EMPTYPATH) == 0)) {
5067 cache_fpl_smr_exit(fpl);
5068 return (cache_fpl_handled_error(fpl, ENOENT));
5069 }
5070
5071 MPASS((cnp->cn_flags & (LOCKPARENT | WANTPARENT)) == 0);
5072
5073 tvs = vget_prep_smr(tvp);
5074 cache_fpl_smr_exit(fpl);
5075 if (__predict_false(tvs == VGET_NONE)) {
5076 return (cache_fpl_aborted(fpl));
5077 }
5078
5079 if ((cnp->cn_flags & LOCKLEAF) != 0) {
5080 lkflags = LK_SHARED;
5081 if ((cnp->cn_flags & LOCKSHARED) == 0)
5082 lkflags = LK_EXCLUSIVE;
5083 error = vget_finish(tvp, lkflags, tvs);
5084 if (__predict_false(error != 0)) {
5085 return (cache_fpl_aborted(fpl));
5086 }
5087 } else {
5088 vget_finish_ref(tvp, tvs);
5089 }
5090
5091 ndp->ni_resflags |= NIRES_EMPTYPATH;
5092 return (cache_fpl_handled(fpl));
5093 }
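
/*
 * Editorial note -- example: this path services callers such as
 * fstatat(fd, "", &sb, AT_EMPTY_PATH), where the empty string means
 * "operate on the object referenced by fd itself".
 */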
5094
5095 static int __noinline
5096 cache_fplookup_noentry(struct cache_fpl *fpl)
5097 {
5098 struct nameidata *ndp;
5099 struct componentname *cnp;
5100 enum vgetstate dvs;
5101 struct vnode *dvp, *tvp;
5102 seqc_t dvp_seqc;
5103 int error;
5104
5105 ndp = fpl->ndp;
5106 cnp = fpl->cnp;
5107 dvp = fpl->dvp;
5108 dvp_seqc = fpl->dvp_seqc;
5109
5110 MPASS((cnp->cn_flags & MAKEENTRY) == 0);
5111 MPASS((cnp->cn_flags & ISDOTDOT) == 0);
5112 if (cnp->cn_nameiop == LOOKUP)
5113 MPASS((cnp->cn_flags & NOCACHE) == 0);
5114 MPASS(!cache_fpl_isdotdot(cnp));
5115
5116 /*
5117 * Hack: delayed name len checking.
5118 */
5119 if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
5120 cache_fpl_smr_exit(fpl);
5121 return (cache_fpl_handled_error(fpl, ENAMETOOLONG));
5122 }
5123
5124 if (cnp->cn_nameptr[0] == '/') {
5125 return (cache_fplookup_skip_slashes(fpl));
5126 }
5127
5128 if (cnp->cn_pnbuf[0] == '\0') {
5129 return (cache_fplookup_emptypath(fpl));
5130 }
5131
5132 if (cnp->cn_nameptr[0] == '\0') {
5133 if (fpl->tvp == NULL) {
5134 return (cache_fplookup_degenerate(fpl));
5135 }
5136 return (cache_fplookup_trailingslash(fpl));
5137 }
5138
5139 if (cnp->cn_nameiop != LOOKUP) {
5140 fpl->tvp = NULL;
5141 return (cache_fplookup_modifying(fpl));
5142 }
5143
5144 /*
5145 * Only try to fill in the component if it is the last one,
5146 	 * otherwise not only may there be several to handle, but the
5147 * walk may be complicated.
5148 */
5149 if (!cache_fpl_islastcn(ndp)) {
5150 return (cache_fpl_partial(fpl));
5151 }
5152
5153 /*
5154 	 * Regular lookup nullifies the slash, which we don't do here.
5155 * Don't take chances with filesystem routines seeing it for
5156 * the last entry.
5157 */
5158 if (cache_fpl_istrailingslash(fpl)) {
5159 return (cache_fpl_partial(fpl));
5160 }
5161
5162 /*
5163 * Secure access to dvp; check cache_fplookup_partial_setup for
5164 * reasoning.
5165 */
5166 dvs = vget_prep_smr(dvp);
5167 cache_fpl_smr_exit(fpl);
5168 if (__predict_false(dvs == VGET_NONE)) {
5169 return (cache_fpl_aborted(fpl));
5170 }
5171
5172 vget_finish_ref(dvp, dvs);
5173 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
5174 vrele(dvp);
5175 return (cache_fpl_aborted(fpl));
5176 }
5177
5178 error = vn_lock(dvp, LK_SHARED);
5179 if (__predict_false(error != 0)) {
5180 vrele(dvp);
5181 return (cache_fpl_aborted(fpl));
5182 }
5183
5184 tvp = NULL;
5185 /*
5186 * TODO: provide variants which don't require locking either vnode.
5187 */
5188 cnp->cn_flags |= ISLASTCN | MAKEENTRY;
5189 cnp->cn_lkflags = LK_SHARED;
5190 if ((cnp->cn_flags & LOCKSHARED) == 0) {
5191 cnp->cn_lkflags = LK_EXCLUSIVE;
5192 }
5193 error = VOP_LOOKUP(dvp, &tvp, cnp);
5194 switch (error) {
5195 case EJUSTRETURN:
5196 case 0:
5197 break;
5198 case ENOTDIR:
5199 case ENOENT:
5200 vput(dvp);
5201 return (cache_fpl_handled_error(fpl, error));
5202 default:
5203 vput(dvp);
5204 return (cache_fpl_aborted(fpl));
5205 }
5206
5207 fpl->tvp = tvp;
5208
5209 if (tvp == NULL) {
5210 MPASS(error == EJUSTRETURN);
5211 if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) {
5212 vput(dvp);
5213 } else if ((cnp->cn_flags & LOCKPARENT) == 0) {
5214 VOP_UNLOCK(dvp);
5215 }
5216 return (cache_fpl_handled(fpl));
5217 }
5218
5219 if (tvp->v_type == VLNK) {
5220 if ((cnp->cn_flags & FOLLOW) != 0) {
5221 vput(dvp);
5222 vput(tvp);
5223 return (cache_fpl_aborted(fpl));
5224 }
5225 }
5226
5227 if (__predict_false(cache_fplookup_is_mp(fpl))) {
5228 vput(dvp);
5229 vput(tvp);
5230 return (cache_fpl_aborted(fpl));
5231 }
5232
5233 if ((cnp->cn_flags & LOCKLEAF) == 0) {
5234 VOP_UNLOCK(tvp);
5235 }
5236
5237 if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) {
5238 vput(dvp);
5239 } else if ((cnp->cn_flags & LOCKPARENT) == 0) {
5240 VOP_UNLOCK(dvp);
5241 }
5242 return (cache_fpl_handled(fpl));
5243 }
5244
5245 static int __noinline
5246 cache_fplookup_dot(struct cache_fpl *fpl)
5247 {
5248 int error;
5249
5250 MPASS(!seqc_in_modify(fpl->dvp_seqc));
5251
5252 if (__predict_false(fpl->dvp->v_type != VDIR)) {
5253 cache_fpl_smr_exit(fpl);
5254 return (cache_fpl_handled_error(fpl, ENOTDIR));
5255 }
5256
5257 /*
5258 * Just re-assign the value. seqc will be checked later for the first
5259 * non-dot path component in line and/or before deciding to return the
5260 * vnode.
5261 */
5262 fpl->tvp = fpl->dvp;
5263 fpl->tvp_seqc = fpl->dvp_seqc;
5264
5265 SDT_PROBE3(vfs, namecache, lookup, hit, fpl->dvp, ".", fpl->dvp);
5266
5267 error = 0;
5268 if (cache_fplookup_is_mp(fpl)) {
5269 error = cache_fplookup_cross_mount(fpl);
5270 }
5271 return (error);
5272 }
5273
5274 static int __noinline
5275 cache_fplookup_dotdot(struct cache_fpl *fpl)
5276 {
5277 struct nameidata *ndp;
5278 struct namecache *ncp;
5279 struct vnode *dvp;
5280 u_char nc_flag;
5281
5282 ndp = fpl->ndp;
5283 dvp = fpl->dvp;
5284
5285 MPASS(cache_fpl_isdotdot(fpl->cnp));
5286
5287 /*
5288 * XXX this is racy the same way regular lookup is
5289 */
5290 if (vfs_lookup_isroot(ndp, dvp)) {
5291 fpl->tvp = dvp;
5292 fpl->tvp_seqc = vn_seqc_read_any(dvp);
5293 if (seqc_in_modify(fpl->tvp_seqc)) {
5294 return (cache_fpl_aborted(fpl));
5295 }
5296 return (0);
5297 }
5298
5299 if ((dvp->v_vflag & VV_ROOT) != 0) {
5300 /*
5301 * TODO
5302 * The opposite of climb mount is needed here.
5303 */
5304 return (cache_fpl_partial(fpl));
5305 }
5306
5307 if (__predict_false(dvp->v_type != VDIR)) {
5308 cache_fpl_smr_exit(fpl);
5309 return (cache_fpl_handled_error(fpl, ENOTDIR));
5310 }
5311
5312 ncp = atomic_load_consume_ptr(&dvp->v_cache_dd);
5313 if (ncp == NULL) {
5314 return (cache_fpl_aborted(fpl));
5315 }
5316
5317 nc_flag = atomic_load_char(&ncp->nc_flag);
5318 if ((nc_flag & NCF_ISDOTDOT) != 0) {
5319 if ((nc_flag & NCF_NEGATIVE) != 0)
5320 return (cache_fpl_aborted(fpl));
5321 fpl->tvp = ncp->nc_vp;
5322 } else {
5323 fpl->tvp = ncp->nc_dvp;
5324 }
5325
5326 fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp);
5327 if (seqc_in_modify(fpl->tvp_seqc)) {
5328 return (cache_fpl_partial(fpl));
5329 }
5330
5331 /*
5332 * Acquire fence provided by vn_seqc_read_any above.
5333 */
5334 if (__predict_false(atomic_load_ptr(&dvp->v_cache_dd) != ncp)) {
5335 return (cache_fpl_aborted(fpl));
5336 }
5337
5338 if (!cache_ncp_canuse(ncp)) {
5339 return (cache_fpl_aborted(fpl));
5340 }
5341
5342 return (0);
5343 }
5344
5345 static int __noinline
5346 cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash)
5347 {
5348 u_char nc_flag __diagused;
5349 bool neg_promote;
5350
5351 #ifdef INVARIANTS
5352 nc_flag = atomic_load_char(&ncp->nc_flag);
5353 MPASS((nc_flag & NCF_NEGATIVE) != 0);
5354 #endif
5355 /*
5356 * If they want to create an entry we need to replace this one.
5357 */
5358 if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) {
5359 fpl->tvp = NULL;
5360 return (cache_fplookup_modifying(fpl));
5361 }
5362 neg_promote = cache_neg_hit_prep(ncp);
5363 if (!cache_fpl_neg_ncp_canuse(ncp)) {
5364 cache_neg_hit_abort(ncp);
5365 return (cache_fpl_partial(fpl));
5366 }
5367 if (neg_promote) {
5368 return (cache_fplookup_negative_promote(fpl, ncp, hash));
5369 }
5370 cache_neg_hit_finish(ncp);
5371 cache_fpl_smr_exit(fpl);
5372 return (cache_fpl_handled_error(fpl, ENOENT));
5373 }
5374
5375 /*
5376 * Resolve a symlink. Called by filesystem-specific routines.
5377 *
5378 * Code flow is:
5379 * ... -> cache_fplookup_symlink -> VOP_FPLOOKUP_SYMLINK -> cache_symlink_resolve
5380 */
5381 int
5382 cache_symlink_resolve(struct cache_fpl *fpl, const char *string, size_t len)
5383 {
5384 struct nameidata *ndp;
5385 struct componentname *cnp;
5386 size_t adjust;
5387
5388 ndp = fpl->ndp;
5389 cnp = fpl->cnp;
5390
5391 if (__predict_false(len == 0)) {
5392 return (ENOENT);
5393 }
5394
5395 if (__predict_false(len > MAXPATHLEN - 2)) {
5396 if (cache_fpl_istrailingslash(fpl)) {
5397 return (EAGAIN);
5398 }
5399 }
5400
5401 ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr - cnp->cn_namelen + 1;
5402 #ifdef INVARIANTS
5403 if (ndp->ni_pathlen != fpl->debug.ni_pathlen) {
5404 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
5405 __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
5406 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
5407 }
5408 #endif
5409
5410 if (__predict_false(len + ndp->ni_pathlen > MAXPATHLEN)) {
5411 return (ENAMETOOLONG);
5412 }
5413
5414 if (__predict_false(ndp->ni_loopcnt++ >= MAXSYMLINKS)) {
5415 return (ELOOP);
5416 }
5417
5418 adjust = len;
5419 if (ndp->ni_pathlen > 1) {
5420 bcopy(ndp->ni_next, cnp->cn_pnbuf + len, ndp->ni_pathlen);
5421 } else {
5422 if (cache_fpl_istrailingslash(fpl)) {
5423 adjust = len + 1;
5424 cnp->cn_pnbuf[len] = '/';
5425 cnp->cn_pnbuf[len + 1] = '\0';
5426 } else {
5427 cnp->cn_pnbuf[len] = '\0';
5428 }
5429 }
5430 bcopy(string, cnp->cn_pnbuf, len);
5431
5432 ndp->ni_pathlen += adjust;
5433 cache_fpl_pathlen_add(fpl, adjust);
5434 cnp->cn_nameptr = cnp->cn_pnbuf;
5435 fpl->nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1];
5436 fpl->tvp = NULL;
5437 return (0);
5438 }
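
/*
 * Illustrative sketch of the filesystem side (editorial addition; hypothetical
 * filesystem "foofs" with an assumed SMR-safe accessor for the link target,
 * kept under #if 0):
 */
#if 0
static int
foofs_fplookup_symlink(struct vop_fplookup_symlink_args *v)
{
	struct vnode *vp;
	const char *target;
	size_t len;

	vp = v->a_vp;
	VNPASS(vp->v_type == VLNK, vp);
	/* How the target is obtained without locks is filesystem-specific. */
	target = foofs_symlink_target_smr(vp, &len);
	if (target == NULL)
		return (EAGAIN);	/* falls back to the locked lookup */
	return (cache_symlink_resolve(v->a_fpl, target, len));
}
#endif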
5439
5440 static int __noinline
5441 cache_fplookup_symlink(struct cache_fpl *fpl)
5442 {
5443 struct mount *mp;
5444 struct nameidata *ndp;
5445 struct componentname *cnp;
5446 struct vnode *dvp, *tvp;
5447 struct pwd *pwd;
5448 int error;
5449
5450 ndp = fpl->ndp;
5451 cnp = fpl->cnp;
5452 dvp = fpl->dvp;
5453 tvp = fpl->tvp;
5454 pwd = *(fpl->pwd);
5455
5456 if (cache_fpl_islastcn(ndp)) {
5457 if ((cnp->cn_flags & FOLLOW) == 0) {
5458 return (cache_fplookup_final(fpl));
5459 }
5460 }
5461
5462 mp = atomic_load_ptr(&dvp->v_mount);
5463 if (__predict_false(mp == NULL)) {
5464 return (cache_fpl_aborted(fpl));
5465 }
5466
5467 /*
5468 * Note this check races against setting the flag just like regular
5469 * lookup.
5470 */
5471 if (__predict_false((mp->mnt_flag & MNT_NOSYMFOLLOW) != 0)) {
5472 cache_fpl_smr_exit(fpl);
5473 return (cache_fpl_handled_error(fpl, EACCES));
5474 }
5475
5476 error = VOP_FPLOOKUP_SYMLINK(tvp, fpl);
5477 if (__predict_false(error != 0)) {
5478 switch (error) {
5479 case EAGAIN:
5480 return (cache_fpl_partial(fpl));
5481 case ENOENT:
5482 case ENAMETOOLONG:
5483 case ELOOP:
5484 cache_fpl_smr_exit(fpl);
5485 return (cache_fpl_handled_error(fpl, error));
5486 default:
5487 return (cache_fpl_aborted(fpl));
5488 }
5489 }
5490
5491 if (*(cnp->cn_nameptr) == '/') {
5492 fpl->dvp = cache_fpl_handle_root(fpl);
5493 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
5494 if (seqc_in_modify(fpl->dvp_seqc)) {
5495 return (cache_fpl_aborted(fpl));
5496 }
5497 /*
5498 * The main loop assumes that ->dvp points to a vnode belonging
5499 * to a filesystem which can do lockless lookup, but the absolute
5500 		 * symlink may wander off to one which does not.
5501 */
5502 mp = atomic_load_ptr(&fpl->dvp->v_mount);
5503 if (__predict_false(mp == NULL)) {
5504 return (cache_fpl_aborted(fpl));
5505 }
5506 if (!cache_fplookup_mp_supported(mp)) {
5507 cache_fpl_checkpoint(fpl);
5508 return (cache_fpl_partial(fpl));
5509 }
5510 if (__predict_false(pwd->pwd_adir != pwd->pwd_rdir)) {
5511 return (cache_fpl_aborted(fpl));
5512 }
5513 }
5514 return (0);
5515 }
5516
5517 static int
5518 cache_fplookup_next(struct cache_fpl *fpl)
5519 {
5520 struct componentname *cnp;
5521 struct namecache *ncp;
5522 struct vnode *dvp, *tvp;
5523 u_char nc_flag;
5524 uint32_t hash;
5525 int error;
5526
5527 cnp = fpl->cnp;
5528 dvp = fpl->dvp;
5529 hash = fpl->hash;
5530
5531 if (__predict_false(cnp->cn_nameptr[0] == '.')) {
5532 if (cnp->cn_namelen == 1) {
5533 return (cache_fplookup_dot(fpl));
5534 }
5535 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
5536 return (cache_fplookup_dotdot(fpl));
5537 }
5538 }
5539
5540 MPASS(!cache_fpl_isdotdot(cnp));
5541
5542 ncp = cache_ncp_find(dvp, cnp, hash);
5543 if (__predict_false(ncp == NULL)) {
5544 return (cache_fplookup_noentry(fpl));
5545 }
5546
5547 tvp = atomic_load_ptr(&ncp->nc_vp);
5548 nc_flag = atomic_load_char(&ncp->nc_flag);
5549 if ((nc_flag & NCF_NEGATIVE) != 0) {
5550 return (cache_fplookup_neg(fpl, ncp, hash));
5551 }
5552
5553 if (!cache_ncp_canuse(ncp)) {
5554 return (cache_fpl_partial(fpl));
5555 }
5556
5557 fpl->tvp = tvp;
5558 fpl->tvp_seqc = vn_seqc_read_any(tvp);
5559 if (seqc_in_modify(fpl->tvp_seqc)) {
5560 return (cache_fpl_partial(fpl));
5561 }
5562
5563 counter_u64_add(numposhits, 1);
5564 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
5565
5566 error = 0;
5567 if (cache_fplookup_is_mp(fpl)) {
5568 error = cache_fplookup_cross_mount(fpl);
5569 }
5570 return (error);
5571 }
5572
5573 static bool
5574 cache_fplookup_mp_supported(struct mount *mp)
5575 {
5576
5577 MPASS(mp != NULL);
5578 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
5579 return (false);
5580 return (true);
5581 }
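/*
 * Illustration (not part of the sources proper): a filesystem which meets the
 * requirements described in the comment above cache_fplookup opts in by
 * setting the flag tested here, typically from its mount routine. A minimal
 * sketch:
 *
 *	MNT_ILOCK(mp);
 *	mp->mnt_kern_flag |= MNTK_FPLOOKUP;
 *	MNT_IUNLOCK(mp);
 *
 * Which other mnt_kern_flag bits get set alongside it varies per filesystem;
 * the above only shows the required mount interlock.
 */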
5582
5583 /*
5584 * Walk up the mount stack (if any).
5585 *
5586 * Correctness is provided in the following ways:
5587 * - all vnodes are protected from freeing with SMR
5588 * - struct mount objects are type stable making them always safe to access
5589 * - stability of the particular mount is provided by busying it
5590 * - relationship between the vnode which is mounted on and the mount is
5591 * verified with the vnode sequence counter after busying
5592 * - association between root vnode of the mount and the mount is protected
5593 * by busy
5594 *
5595 * From that point on we can read the sequence counter of the root vnode
5596 * and get the next mount on the stack (if any) using the same protection.
5597 *
5598 * By the end of successful walk we are guaranteed the reached state was
5599 * indeed present at least at some point which matches the regular lookup.
5600 */
5601 static int __noinline
5602 cache_fplookup_climb_mount(struct cache_fpl *fpl)
5603 {
5604 struct mount *mp, *prev_mp;
5605 struct mount_pcpu *mpcpu, *prev_mpcpu;
5606 struct vnode *vp;
5607 seqc_t vp_seqc;
5608
5609 vp = fpl->tvp;
5610 vp_seqc = fpl->tvp_seqc;
5611
5612 VNPASS(vp->v_type == VDIR || vp->v_type == VREG || vp->v_type == VBAD, vp);
5613 mp = atomic_load_ptr(&vp->v_mountedhere);
5614 if (__predict_false(mp == NULL)) {
5615 return (0);
5616 }
5617
5618 prev_mp = NULL;
5619 for (;;) {
5620 if (!vfs_op_thread_enter_crit(mp, mpcpu)) {
5621 if (prev_mp != NULL)
5622 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
5623 return (cache_fpl_partial(fpl));
5624 }
5625 if (prev_mp != NULL)
5626 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
5627 if (!vn_seqc_consistent(vp, vp_seqc)) {
5628 vfs_op_thread_exit_crit(mp, mpcpu);
5629 return (cache_fpl_partial(fpl));
5630 }
5631 if (!cache_fplookup_mp_supported(mp)) {
5632 vfs_op_thread_exit_crit(mp, mpcpu);
5633 return (cache_fpl_partial(fpl));
5634 }
5635 vp = atomic_load_ptr(&mp->mnt_rootvnode);
5636 if (vp == NULL) {
5637 vfs_op_thread_exit_crit(mp, mpcpu);
5638 return (cache_fpl_partial(fpl));
5639 }
5640 vp_seqc = vn_seqc_read_any(vp);
5641 if (seqc_in_modify(vp_seqc)) {
5642 vfs_op_thread_exit_crit(mp, mpcpu);
5643 return (cache_fpl_partial(fpl));
5644 }
5645 prev_mp = mp;
5646 prev_mpcpu = mpcpu;
5647 mp = atomic_load_ptr(&vp->v_mountedhere);
5648 if (mp == NULL)
5649 break;
5650 }
5651
5652 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
5653 fpl->tvp = vp;
5654 fpl->tvp_seqc = vp_seqc;
5655 return (0);
5656 }
5657
5658 static int __noinline
5659 cache_fplookup_cross_mount(struct cache_fpl *fpl)
5660 {
5661 struct mount *mp;
5662 struct mount_pcpu *mpcpu;
5663 struct vnode *vp;
5664 seqc_t vp_seqc;
5665
5666 vp = fpl->tvp;
5667 vp_seqc = fpl->tvp_seqc;
5668
5669 VNPASS(vp->v_type == VDIR || vp->v_type == VREG || vp->v_type == VBAD, vp);
5670 mp = atomic_load_ptr(&vp->v_mountedhere);
5671 if (__predict_false(mp == NULL)) {
5672 return (0);
5673 }
5674
5675 if (!vfs_op_thread_enter_crit(mp, mpcpu)) {
5676 return (cache_fpl_partial(fpl));
5677 }
5678 if (!vn_seqc_consistent(vp, vp_seqc)) {
5679 vfs_op_thread_exit_crit(mp, mpcpu);
5680 return (cache_fpl_partial(fpl));
5681 }
5682 if (!cache_fplookup_mp_supported(mp)) {
5683 vfs_op_thread_exit_crit(mp, mpcpu);
5684 return (cache_fpl_partial(fpl));
5685 }
5686 vp = atomic_load_ptr(&mp->mnt_rootvnode);
5687 if (__predict_false(vp == NULL)) {
5688 vfs_op_thread_exit_crit(mp, mpcpu);
5689 return (cache_fpl_partial(fpl));
5690 }
5691 vp_seqc = vn_seqc_read_any(vp);
5692 vfs_op_thread_exit_crit(mp, mpcpu);
5693 if (seqc_in_modify(vp_seqc)) {
5694 return (cache_fpl_partial(fpl));
5695 }
5696 mp = atomic_load_ptr(&vp->v_mountedhere);
5697 if (__predict_false(mp != NULL)) {
5698 /*
5699 * There are possibly more mount points on top.
5700 * Normally this does not happen so for simplicity just start
5701 * over.
5702 */
5703 return (cache_fplookup_climb_mount(fpl));
5704 }
5705
5706 fpl->tvp = vp;
5707 fpl->tvp_seqc = vp_seqc;
5708 return (0);
5709 }
5710
5711 /*
5712 * Check if a vnode is mounted on.
5713 */
5714 static bool
5715 cache_fplookup_is_mp(struct cache_fpl *fpl)
5716 {
5717 struct vnode *vp;
5718
5719 vp = fpl->tvp;
5720 return ((vn_irflag_read(vp) & VIRF_MOUNTPOINT) != 0);
5721 }
5722
5723 /*
5724 * Parse the path.
5725 *
5726  * The code was originally copy-pasted from the regular lookup and, despite
5727  * clean-ups, still leaves performance on the table. Any modifications here
5728  * must take into account that in case of a fallback the resulting
5729  * nameidata state has to be compatible with the original.
5730 */
5731
5732 /*
5733 * Debug ni_pathlen tracking.
5734 */
5735 #ifdef INVARIANTS
5736 static void
5737 cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n)
5738 {
5739
5740 fpl->debug.ni_pathlen += n;
5741 KASSERT(fpl->debug.ni_pathlen <= PATH_MAX,
5742 ("%s: pathlen overflow to %zd\n", __func__, fpl->debug.ni_pathlen));
5743 }
5744
5745 static void
5746 cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n)
5747 {
5748
5749 fpl->debug.ni_pathlen -= n;
5750 KASSERT(fpl->debug.ni_pathlen <= PATH_MAX,
5751 ("%s: pathlen underflow to %zd\n", __func__, fpl->debug.ni_pathlen));
5752 }
5753
5754 static void
5755 cache_fpl_pathlen_inc(struct cache_fpl *fpl)
5756 {
5757
5758 cache_fpl_pathlen_add(fpl, 1);
5759 }
5760
5761 static void
5762 cache_fpl_pathlen_dec(struct cache_fpl *fpl)
5763 {
5764
5765 cache_fpl_pathlen_sub(fpl, 1);
5766 }
5767 #else
5768 static void
5769 cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n)
5770 {
5771 }
5772
5773 static void
5774 cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n)
5775 {
5776 }
5777
5778 static void
5779 cache_fpl_pathlen_inc(struct cache_fpl *fpl)
5780 {
5781 }
5782
5783 static void
5784 cache_fpl_pathlen_dec(struct cache_fpl *fpl)
5785 {
5786 }
5787 #endif
5788
5789 static void
5790 cache_fplookup_parse(struct cache_fpl *fpl)
5791 {
5792 struct nameidata *ndp;
5793 struct componentname *cnp;
5794 struct vnode *dvp;
5795 char *cp;
5796 uint32_t hash;
5797
5798 ndp = fpl->ndp;
5799 cnp = fpl->cnp;
5800 dvp = fpl->dvp;
5801
5802 /*
5803 * Find the end of this path component, it is either / or nul.
5804 *
5805 * Store / as a temporary sentinel so that we only have one character
5806 	 * to test for. Pathnames tend to be short, so this should not result
5807 	 * in cache misses.
5808 *
5809 * TODO: fix this to be word-sized.
5810 */
5811 MPASS(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] >= cnp->cn_pnbuf);
5812 KASSERT(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] == fpl->nulchar,
5813 ("%s: mismatch between pathlen (%zu) and nulchar (%p != %p), string [%s]\n",
5814 __func__, fpl->debug.ni_pathlen, &cnp->cn_nameptr[fpl->debug.ni_pathlen - 1],
5815 fpl->nulchar, cnp->cn_pnbuf));
5816 KASSERT(*fpl->nulchar == '\0',
5817 ("%s: expected nul at %p; string [%s]\n", __func__, fpl->nulchar,
5818 cnp->cn_pnbuf));
5819 hash = cache_get_hash_iter_start(dvp);
5820 *fpl->nulchar = '/';
5821 for (cp = cnp->cn_nameptr; *cp != '/'; cp++) {
5822 KASSERT(*cp != '\0',
5823 ("%s: encountered unexpected nul; string [%s]\n", __func__,
5824 cnp->cn_nameptr));
5825 hash = cache_get_hash_iter(*cp, hash);
5826 continue;
5827 }
5828 *fpl->nulchar = '\0';
5829 fpl->hash = cache_get_hash_iter_finish(hash);
5830
5831 cnp->cn_namelen = cp - cnp->cn_nameptr;
5832 cache_fpl_pathlen_sub(fpl, cnp->cn_namelen);
5833
5834 #ifdef INVARIANTS
5835 /*
5836 * cache_get_hash only accepts lengths up to NAME_MAX. This is fine since
5837 * we are going to fail this lookup with ENAMETOOLONG (see below).
5838 */
5839 if (cnp->cn_namelen <= NAME_MAX) {
5840 if (fpl->hash != cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp)) {
5841 panic("%s: mismatched hash for [%s] len %ld", __func__,
5842 cnp->cn_nameptr, cnp->cn_namelen);
5843 }
5844 }
5845 #endif
5846
5847 /*
5848 * Hack: we have to check if the found path component's length exceeds
5849 	 * NAME_MAX. However, the condition is very rarely true and the check can
5850 * be elided in the common case -- if an entry was found in the cache,
5851 * then it could not have been too long to begin with.
5852 */
5853 ndp->ni_next = cp;
5854 }
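/*
 * Worked example (illustrative only): parsing "bar" out of the remaining path
 * "bar/baz". The nul terminator is temporarily overwritten with the sentinel
 * slash so the scan loop only has to test for one character:
 *
 *	b a r / b a z \0   ->   b a r / b a z /
 *	^ cn_nameptr                           ^ *fpl->nulchar
 *
 * The loop stops at the first real '/', leaving cn_namelen = 3, ni_next
 * pointing at that slash and fpl->hash holding the per-character accumulation
 * produced by cache_get_hash_iter. The sentinel is put back to '\0' before
 * returning and ni_pathlen is shortened by the component length.
 */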
5855
5856 static void
5857 cache_fplookup_parse_advance(struct cache_fpl *fpl)
5858 {
5859 struct nameidata *ndp;
5860 struct componentname *cnp;
5861
5862 ndp = fpl->ndp;
5863 cnp = fpl->cnp;
5864
5865 cnp->cn_nameptr = ndp->ni_next;
5866 KASSERT(*(cnp->cn_nameptr) == '/',
5867 ("%s: should have seen slash at %p ; buf %p [%s]\n", __func__,
5868 cnp->cn_nameptr, cnp->cn_pnbuf, cnp->cn_pnbuf));
5869 cnp->cn_nameptr++;
5870 cache_fpl_pathlen_dec(fpl);
5871 }
5872
5873 /*
5874 * Skip spurious slashes in a pathname (e.g., "foo///bar") and retry.
5875 *
5876  * Lockless lookup tries to elide checking for spurious slashes and, should they
5877  * be present, is guaranteed to fail to find an entry. In this case the caller
5878 * must check if the name starts with a slash and call this routine. It is
5879 * going to fast forward across the spurious slashes and set the state up for
5880 * retry.
5881 */
5882 static int __noinline
5883 cache_fplookup_skip_slashes(struct cache_fpl *fpl)
5884 {
5885 struct nameidata *ndp;
5886 struct componentname *cnp;
5887
5888 ndp = fpl->ndp;
5889 cnp = fpl->cnp;
5890
5891 MPASS(*(cnp->cn_nameptr) == '/');
5892 do {
5893 cnp->cn_nameptr++;
5894 cache_fpl_pathlen_dec(fpl);
5895 } while (*(cnp->cn_nameptr) == '/');
5896
5897 /*
5898 * Go back to one slash so that cache_fplookup_parse_advance has
5899 * something to skip.
5900 */
5901 cnp->cn_nameptr--;
5902 cache_fpl_pathlen_inc(fpl);
5903
5904 /*
5905 * cache_fplookup_parse_advance starts from ndp->ni_next
5906 */
5907 ndp->ni_next = cnp->cn_nameptr;
5908
5909 /*
5910 * See cache_fplookup_dot.
5911 */
5912 fpl->tvp = fpl->dvp;
5913 fpl->tvp_seqc = fpl->dvp_seqc;
5914
5915 return (0);
5916 }
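/*
 * Worked example (illustrative only): looking up "foo///bar". After "foo" is
 * resolved, the empty component between the slashes gets a guaranteed cache
 * miss and the caller ends up here with cn_nameptr at the second slash. The
 * loop walks past the run of slashes and then backs up by one:
 *
 *	"foo///bar"
 *	      ^ cn_nameptr and ni_next on return (the last slash in the run)
 *
 * so that cache_fplookup_parse_advance still finds the '/' it asserts on and
 * the main loop retries with dvp unchanged (tvp is set to dvp, dot-style).
 */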
5917
5918 /*
5919 * Handle trailing slashes (e.g., "foo/").
5920 *
5921 * If a trailing slash is found the terminal vnode must be a directory.
5922  * Regular lookup shortens the path by nullifying the first trailing slash and
5923 * sets the TRAILINGSLASH flag to denote this took place. There are several
5924 * checks on it performed later.
5925 *
5926 * Similarly to spurious slashes, lockless lookup handles this in a speculative
5927 * manner relying on an invariant that a non-directory vnode will get a miss.
5928 * In this case cn_nameptr[0] == '\0' and cn_namelen == 0.
5929 *
5930 * Thus for a path like "foo/bar/" the code unwinds the state back to "bar/"
5931 * and denotes this is the last path component, which avoids looping back.
5932 *
5933 * Only plain lookups are supported for now to restrict corner cases to handle.
5934 */
5935 static int __noinline
5936 cache_fplookup_trailingslash(struct cache_fpl *fpl)
5937 {
5938 #ifdef INVARIANTS
5939 size_t ni_pathlen;
5940 #endif
5941 struct nameidata *ndp;
5942 struct componentname *cnp;
5943 struct namecache *ncp;
5944 struct vnode *tvp;
5945 char *cn_nameptr_orig, *cn_nameptr_slash;
5946 seqc_t tvp_seqc;
5947 u_char nc_flag;
5948
5949 ndp = fpl->ndp;
5950 cnp = fpl->cnp;
5951 tvp = fpl->tvp;
5952 tvp_seqc = fpl->tvp_seqc;
5953
5954 MPASS(fpl->dvp == fpl->tvp);
5955 KASSERT(cache_fpl_istrailingslash(fpl),
5956 ("%s: expected trailing slash at %p; string [%s]\n", __func__, fpl->nulchar - 1,
5957 cnp->cn_pnbuf));
5958 KASSERT(cnp->cn_nameptr[0] == '\0',
5959 ("%s: expected nul char at %p; string [%s]\n", __func__, &cnp->cn_nameptr[0],
5960 cnp->cn_pnbuf));
5961 KASSERT(cnp->cn_namelen == 0,
5962 ("%s: namelen 0 but got %ld; string [%s]\n", __func__, cnp->cn_namelen,
5963 cnp->cn_pnbuf));
5964 MPASS(cnp->cn_nameptr > cnp->cn_pnbuf);
5965
5966 if (cnp->cn_nameiop != LOOKUP) {
5967 return (cache_fpl_aborted(fpl));
5968 }
5969
5970 if (__predict_false(tvp->v_type != VDIR)) {
5971 if (!vn_seqc_consistent(tvp, tvp_seqc)) {
5972 return (cache_fpl_aborted(fpl));
5973 }
5974 cache_fpl_smr_exit(fpl);
5975 return (cache_fpl_handled_error(fpl, ENOTDIR));
5976 }
5977
5978 /*
5979 * Denote the last component.
5980 */
5981 ndp->ni_next = &cnp->cn_nameptr[0];
5982 MPASS(cache_fpl_islastcn(ndp));
5983
5984 /*
5985 * Unwind trailing slashes.
5986 */
5987 cn_nameptr_orig = cnp->cn_nameptr;
5988 while (cnp->cn_nameptr >= cnp->cn_pnbuf) {
5989 cnp->cn_nameptr--;
5990 if (cnp->cn_nameptr[0] != '/') {
5991 break;
5992 }
5993 }
5994
5995 /*
5996 * Unwind to the beginning of the path component.
5997 *
5998 * Note the path may or may not have started with a slash.
5999 */
6000 cn_nameptr_slash = cnp->cn_nameptr;
6001 while (cnp->cn_nameptr > cnp->cn_pnbuf) {
6002 cnp->cn_nameptr--;
6003 if (cnp->cn_nameptr[0] == '/') {
6004 break;
6005 }
6006 }
6007 if (cnp->cn_nameptr[0] == '/') {
6008 cnp->cn_nameptr++;
6009 }
6010
6011 cnp->cn_namelen = cn_nameptr_slash - cnp->cn_nameptr + 1;
6012 cache_fpl_pathlen_add(fpl, cn_nameptr_orig - cnp->cn_nameptr);
6013 cache_fpl_checkpoint(fpl);
6014
6015 #ifdef INVARIANTS
6016 ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1;
6017 if (ni_pathlen != fpl->debug.ni_pathlen) {
6018 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
6019 __func__, ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
6020 cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
6021 }
6022 #endif
6023
6024 /*
6025 * If this was a "./" lookup the parent directory is already correct.
6026 */
6027 if (cnp->cn_nameptr[0] == '.' && cnp->cn_namelen == 1) {
6028 return (0);
6029 }
6030
6031 /*
6032 * Otherwise we need to look it up.
6033 */
6034 tvp = fpl->tvp;
6035 ncp = atomic_load_consume_ptr(&tvp->v_cache_dd);
6036 if (__predict_false(ncp == NULL)) {
6037 return (cache_fpl_aborted(fpl));
6038 }
6039 nc_flag = atomic_load_char(&ncp->nc_flag);
6040 if ((nc_flag & NCF_ISDOTDOT) != 0) {
6041 return (cache_fpl_aborted(fpl));
6042 }
6043 fpl->dvp = ncp->nc_dvp;
6044 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
6045 if (seqc_in_modify(fpl->dvp_seqc)) {
6046 return (cache_fpl_aborted(fpl));
6047 }
6048 return (0);
6049 }
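/*
 * Worked example (illustrative only): looking up "foo/bar/". The speculative
 * parse of the empty component after the trailing slash misses, landing here
 * with cn_nameptr at the terminating nul and both dvp and tvp set to "bar".
 * The routine rewinds over the trailing slash and then to the start of the
 * component, producing cn_namelen = 3 ("bar"), ni_next pointing at the nul
 * (so this counts as the last component) and dvp reset to the parent
 * directory found via bar's v_cache_dd entry -- the state the final lookup
 * of "bar" expects, minus the TRAILINGSLASH bookkeeping of regular lookup.
 */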
6050
6051 /*
6052 * See the API contract for VOP_FPLOOKUP_VEXEC.
6053 */
6054 static int __noinline
6055 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
6056 {
6057 struct componentname *cnp;
6058 struct vnode *dvp;
6059 seqc_t dvp_seqc;
6060
6061 cnp = fpl->cnp;
6062 dvp = fpl->dvp;
6063 dvp_seqc = fpl->dvp_seqc;
6064
6065 /*
6066 * Hack: delayed empty path checking.
6067 */
6068 if (cnp->cn_pnbuf[0] == '\0') {
6069 return (cache_fplookup_emptypath(fpl));
6070 }
6071
6072 /*
6073 * TODO: Due to ignoring trailing slashes lookup will perform a
6074 * permission check on the last dir when it should not be doing it. It
6075 * may fail, but said failure should be ignored. It is possible to fix
6076 * it up fully without resorting to regular lookup, but for now just
6077 * abort.
6078 */
6079 if (cache_fpl_istrailingslash(fpl)) {
6080 return (cache_fpl_aborted(fpl));
6081 }
6082
6083 /*
6084 * Hack: delayed degenerate path checking.
6085 */
6086 if (cnp->cn_nameptr[0] == '\0' && fpl->tvp == NULL) {
6087 return (cache_fplookup_degenerate(fpl));
6088 }
6089
6090 /*
6091 * Hack: delayed name len checking.
6092 */
6093 if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
6094 cache_fpl_smr_exit(fpl);
6095 return (cache_fpl_handled_error(fpl, ENAMETOOLONG));
6096 }
6097
6098 /*
6099 * Hack: they may be looking up foo/bar, where foo is not a directory.
6100 * In such a case we need to return ENOTDIR, but we may happen to get
6101 * here with a different error.
6102 */
6103 if (dvp->v_type != VDIR) {
6104 error = ENOTDIR;
6105 }
6106
6107 /*
6108 * Hack: handle O_SEARCH.
6109 *
6110 * Open Group Base Specifications Issue 7, 2018 edition states:
6111 * <quote>
6112 * If the access mode of the open file description associated with the
6113 * file descriptor is not O_SEARCH, the function shall check whether
6114 * directory searches are permitted using the current permissions of
6115 * the directory underlying the file descriptor. If the access mode is
6116 * O_SEARCH, the function shall not perform the check.
6117 * </quote>
6118 *
6119 * Regular lookup tests for the NOEXECCHECK flag for every path
6120 * component to decide whether to do the permission check. However,
6121 * since most lookups never have the flag (and when they do it is only
6122 * present for the first path component), lockless lookup only acts on
6123 * it if there is a permission problem. Here the flag is represented
6124 * with a boolean so that we don't have to clear it on the way out.
6125 *
6126 * For simplicity this always aborts.
6127 * TODO: check if this is the first lookup and ignore the permission
6128 * problem. Note the flag has to survive fallback (if it happens to be
6129 * performed).
6130 */
6131 if (fpl->fsearch) {
6132 return (cache_fpl_aborted(fpl));
6133 }
6134
6135 switch (error) {
6136 case EAGAIN:
6137 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
6138 error = cache_fpl_aborted(fpl);
6139 } else {
6140 cache_fpl_partial(fpl);
6141 }
6142 break;
6143 default:
6144 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
6145 error = cache_fpl_aborted(fpl);
6146 } else {
6147 cache_fpl_smr_exit(fpl);
6148 cache_fpl_handled_error(fpl, error);
6149 }
6150 break;
6151 }
6152 return (error);
6153 }
6154
6155 static int
6156 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
6157 {
6158 struct nameidata *ndp;
6159 struct componentname *cnp;
6160 struct mount *mp;
6161 int error;
6162
6163 ndp = fpl->ndp;
6164 cnp = fpl->cnp;
6165
6166 cache_fpl_checkpoint(fpl);
6167
6168 /*
6169 	 * The vnode at hand is almost always stable; skip checking for it.
6170 * Worst case this postpones the check towards the end of the iteration
6171 * of the main loop.
6172 */
6173 fpl->dvp = dvp;
6174 fpl->dvp_seqc = vn_seqc_read_notmodify(fpl->dvp);
6175
6176 mp = atomic_load_ptr(&dvp->v_mount);
6177 if (__predict_false(mp == NULL || !cache_fplookup_mp_supported(mp))) {
6178 return (cache_fpl_aborted(fpl));
6179 }
6180
6181 MPASS(fpl->tvp == NULL);
6182
6183 for (;;) {
6184 cache_fplookup_parse(fpl);
6185
6186 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
6187 if (__predict_false(error != 0)) {
6188 error = cache_fplookup_failed_vexec(fpl, error);
6189 break;
6190 }
6191
6192 error = cache_fplookup_next(fpl);
6193 if (__predict_false(cache_fpl_terminated(fpl))) {
6194 break;
6195 }
6196
6197 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
6198
6199 if (fpl->tvp->v_type == VLNK) {
6200 error = cache_fplookup_symlink(fpl);
6201 if (cache_fpl_terminated(fpl)) {
6202 break;
6203 }
6204 } else {
6205 if (cache_fpl_islastcn(ndp)) {
6206 error = cache_fplookup_final(fpl);
6207 break;
6208 }
6209
6210 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
6211 error = cache_fpl_aborted(fpl);
6212 break;
6213 }
6214
6215 fpl->dvp = fpl->tvp;
6216 fpl->dvp_seqc = fpl->tvp_seqc;
6217 cache_fplookup_parse_advance(fpl);
6218 }
6219
6220 cache_fpl_checkpoint(fpl);
6221 }
6222
6223 return (error);
6224 }
6225
6226 /*
6227 * Fast path lookup protected with SMR and sequence counters.
6228 *
6229 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
6230 *
6231 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
6232 * outlined below.
6233 *
6234 * Traditional vnode lookup conceptually looks like this:
6235 *
6236 * vn_lock(current);
6237 * for (;;) {
6238 * next = find();
6239 * vn_lock(next);
6240 * vn_unlock(current);
6241 * current = next;
6242 * if (last)
6243 * break;
6244 * }
6245 * return (current);
6246 *
6247 * Each jump to the next vnode is safe memory-wise and atomic with respect to
6248 * any modifications thanks to holding respective locks.
6249 *
6250 * The same guarantee can be provided with a combination of safe memory
6251 * reclamation and sequence counters instead. If all operations which affect
6252 * the relationship between the current vnode and the one we are looking for
6253 * also modify the counter, we can verify whether all the conditions held as
6254 * we made the jump. This includes things like permissions, mount points etc.
6255 * Counter modification is provided by enclosing relevant places in
6256 * vn_seqc_write_begin()/end() calls.
6257 *
6258 * Thus this translates to:
6259 *
6260 * vfs_smr_enter();
6261 * dvp_seqc = seqc_read_any(dvp);
6262 * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
6263 * abort();
6264 * for (;;) {
6265 * tvp = find();
6266 * tvp_seqc = seqc_read_any(tvp);
6267 * if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
6268 * abort();
6269  *	if (!seqc_consistent(dvp, dvp_seqc)) // someone is altering the vnode
6270 * abort();
6271 * dvp = tvp; // we know nothing of importance has changed
6272 * dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
6273 * if (last)
6274 * break;
6275 * }
6276 * vget(); // secure the vnode
6277  * if (!seqc_consistent(tvp, tvp_seqc)) // final check
6278 * abort();
6279 * // at this point we know nothing has changed for any parent<->child pair
6280 * // as they were crossed during the lookup, meaning we matched the guarantee
6281 * // of the locked variant
6282 * return (tvp);
6283 *
6284 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
6285 * - they are called while within vfs_smr protection which they must never exit
6286 * - EAGAIN can be returned to denote checking could not be performed, it is
6287 * always valid to return it
6288 * - if the sequence counter has not changed the result must be valid
6289 * - if the sequence counter has changed both false positives and false negatives
6290 * are permitted (since the result will be rejected later)
6291 * - for simple cases of unix permission checks vaccess_vexec_smr can be used
6292 *
6293 * Caveats to watch out for:
6294 * - vnodes are passed unlocked and unreferenced with nothing stopping
6295 * VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
6296 * to use atomic_load_ptr to fetch it.
6297 * - the aforementioned object can also get freed, meaning absent other means it
6298 * should be protected with vfs_smr
6299 * - either safely checking permissions as they are modified or guaranteeing
6300 * their stability is left to the routine
6301 */
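/*
 * Illustration (not part of the sources proper): a minimal VOP_FPLOOKUP_VEXEC
 * implementation for a filesystem keeping plain unix permissions in its
 * per-vnode private data, written against the contract above. The node type
 * and field names are made up; vaccess_vexec_smr and atomic_load_ptr are the
 * helpers the contract itself recommends.
 *
 *	static int
 *	xxxfs_fplookup_vexec(struct vop_fplookup_vexec_args *v)
 *	{
 *		struct xxxfs_node *node;
 *
 *		// Called within vfs_smr; ->v_data may be concurrently freed,
 *		// hence the atomic load and the NULL check.
 *		node = atomic_load_ptr(&v->a_vp->v_data);
 *		if (__predict_false(node == NULL))
 *			return (EAGAIN);
 *		return (vaccess_vexec_smr(node->xn_mode, node->xn_uid,
 *		    node->xn_gid, v->a_cred));
 *	}
 */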
6302 int
6303 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
6304 struct pwd **pwdp)
6305 {
6306 struct cache_fpl fpl;
6307 struct pwd *pwd;
6308 struct vnode *dvp;
6309 struct componentname *cnp;
6310 int error;
6311
6312 fpl.status = CACHE_FPL_STATUS_UNSET;
6313 fpl.in_smr = false;
6314 fpl.ndp = ndp;
6315 fpl.cnp = cnp = &ndp->ni_cnd;
6316 MPASS(ndp->ni_lcf == 0);
6317 	KASSERT((cnp->cn_flags & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
6318 ("%s: internal flags found in cn_flags %" PRIx64, __func__,
6319 cnp->cn_flags));
6320 MPASS(cnp->cn_nameptr == cnp->cn_pnbuf);
6321 MPASS(ndp->ni_resflags == 0);
6322
6323 if (__predict_false(!cache_can_fplookup(&fpl))) {
6324 *status = fpl.status;
6325 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
6326 return (EOPNOTSUPP);
6327 }
6328
6329 cache_fpl_checkpoint_outer(&fpl);
6330
6331 cache_fpl_smr_enter_initial(&fpl);
6332 #ifdef INVARIANTS
6333 fpl.debug.ni_pathlen = ndp->ni_pathlen;
6334 #endif
6335 fpl.nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1];
6336 fpl.fsearch = false;
6337 fpl.tvp = NULL; /* for degenerate path handling */
6338 fpl.pwd = pwdp;
6339 pwd = pwd_get_smr();
6340 *(fpl.pwd) = pwd;
6341 namei_setup_rootdir(ndp, cnp, pwd);
6342 ndp->ni_topdir = pwd->pwd_jdir;
6343
6344 if (cnp->cn_pnbuf[0] == '/') {
6345 dvp = cache_fpl_handle_root(&fpl);
6346 ndp->ni_resflags = NIRES_ABS;
6347 } else {
6348 if (ndp->ni_dirfd == AT_FDCWD) {
6349 dvp = pwd->pwd_cdir;
6350 } else {
6351 error = cache_fplookup_dirfd(&fpl, &dvp);
6352 if (__predict_false(error != 0)) {
6353 goto out;
6354 }
6355 }
6356 }
6357
6358 SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
6359 error = cache_fplookup_impl(dvp, &fpl);
6360 out:
6361 cache_fpl_smr_assert_not_entered(&fpl);
6362 cache_fpl_assert_status(&fpl);
6363 *status = fpl.status;
6364 if (SDT_PROBES_ENABLED()) {
6365 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
6366 if (fpl.status == CACHE_FPL_STATUS_HANDLED)
6367 SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true,
6368 ndp);
6369 }
6370
6371 if (__predict_true(fpl.status == CACHE_FPL_STATUS_HANDLED)) {
6372 MPASS(error != CACHE_FPL_FAILED);
6373 if (error != 0) {
6374 cache_fpl_cleanup_cnp(fpl.cnp);
6375 MPASS(fpl.dvp == NULL);
6376 MPASS(fpl.tvp == NULL);
6377 }
6378 ndp->ni_dvp = fpl.dvp;
6379 ndp->ni_vp = fpl.tvp;
6380 }
6381 return (error);
6382 }
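/*
 * Illustration (not part of the sources proper): how the lookup code is
 * expected to consume the result. namei is the actual caller; the snippet is
 * a hedged approximation of its fallback logic, not a copy of it.
 *
 *	error = cache_fplookup(ndp, &status, &pwd);
 *	if (status == CACHE_FPL_STATUS_HANDLED)
 *		return (error);	// lockless lookup fully settled the request
 *	// Any other status (e.g. partial or aborted): fall back to the locked
 *	// lookup, with the nameidata state left (or restored) so the slow path
 *	// can take over.
 */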
6383