/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
/*
 * Here is the basic algorithm:
 * First, some design criteria I used:
 * - I think a false hit is more serious than a false miss
 * - A false hit for an RPC that has Op(s) that order via seqid# must be
 *   avoided at all costs
 * - A valid hit will probably happen a long time after the original reply
 *   and the TCP socket that the original request was received on will no
 *   longer be active
 *   (The long time delay implies to me that LRU is not appropriate.)
 * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
 *   in them as well as minimizing the risk of redoing retried non-idempotent
 *   Ops.
 * Because it is biased towards avoiding false hits, multiple entries with
 * the same xid are to be expected, especially for the case of the entry
 * in the cache being related to a seqid# sequenced Op.
 *
 * The basic algorithm I'm about to code up:
 * - Null RPCs bypass the cache and are just done
 * For TCP
 * - key on <xid, NFS version> (as noted above, there can be several
 *   entries with the same key)
 *   When a request arrives:
 *     For all that match key
 *     - if RPC# != OR request_size !=
 *       - not a match with this one
 *     - if NFSv4 and received on same TCP socket OR
 *       received on a TCP connection created before the
 *       entry was cached
 *       - not a match with this one
 *       (V2,3 clients might retry on same TCP socket)
 *     - calculate checksum on first N bytes of NFS XDR
 *     - if checksum !=
 *       - not a match for this one
 *     If any of the remaining ones that match has a
 *       seqid_refcnt > 0
 *       - not a match (go do RPC, using new cache entry)
 *     If one match left
 *       - a hit (reply from cache)
 *     else
 *       - miss (go do RPC, using new cache entry)
 *
 * During processing of NFSv4 request:
 * - set a flag when a non-idempotent Op is processed
 * - when an Op that uses a seqid# (Open,...) is processed
 *   - if same seqid# as referenced entry in cache
 *     - free new cache entry
 *     - reply from referenced cache entry
 *   else if next seqid# in order
 *     - free referenced cache entry
 *     - increment seqid_refcnt on new cache entry
 *     - set pointer from Openowner/Lockowner to
 *       new cache entry (aka reference it)
 *   else if first seqid# in sequence
 *     - increment seqid_refcnt on new cache entry
 *     - set pointer from Openowner/Lockowner to
 *       new cache entry (aka reference it)
 *
 * At end of RPC processing:
 * - if seqid_refcnt > 0 OR flagged non-idempotent on new
 *   cache entry
 *   - save reply in cache entry
 *   - calculate checksum on first N bytes of NFS XDR
 *     request
 *   - note op and length of XDR request (in bytes)
 *   - timestamp it
 *   else
 *   - free new cache entry
 * - Send reply (noting info for socket activity check, below)
 *
 * For cache entries saved above:
 * - if saved since seqid_refcnt was > 0
 *   - free when seqid_refcnt decrements to 0
 *     (when next one in sequence is processed above, or
 *     when Openowner/Lockowner is discarded)
 *   else { non-idempotent Op(s) }
 *   - free when
 *     - some further activity observed on same
 *       socket
 *       (I'm not yet sure how I'm going to do
 *       this. Maybe look at the TCP connection
 *       to see if the send_tcp_sequence# is well
 *       past sent reply OR K additional RPCs
 *       replied on same socket OR?)
 *       OR
 *     - when very old (hours, days, weeks?)
 *
 * For UDP (v2, 3 only), pretty much the old way:
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 *
 * When a Request arrives:
 * - if a match with entry via key
 *   - if RPC marked In_progress
 *     - discard request (don't send reply)
 *   else
 *     - reply from cache
 *     - timestamp cache entry
 * else
 *   - add entry to cache, marked In_progress
 *   - do RPC
 *   - when RPC done
 *     - if RPC# non-idempotent
 *       - mark entry Done (not In_progress)
 *       - save reply
 *       - timestamp cache entry
 *     else
 *       - free cache entry
 *     - send reply
 *
 * Later, entries with saved replies are freed a short time (a few minutes)
 * after the reply is sent (timestamp).
 * Reference: Chet Juszczak, "Improving the Performance and Correctness
 *	of an NFS Server", in Proc. Winter 1989 USENIX Conference,
 *	pages 53-63. San Diego, February 1989.
 *	for the UDP case.
 * nfsrc_floodlevel is set to the allowable upper limit for saved replies
 *	for TCP.  For V3, a reply won't be saved when the flood level is
 *	hit.  For V4, the non-idempotent Op will return NFSERR_RESOURCE in
 *	that case.  This level should be set high enough that this almost
 *	never happens.
 */
#include <fs/nfs/nfsport.h>

extern struct mtx nfsrc_udpmtx;

NFSD_VNET_DECLARE(struct nfsrvhashhead *, nfsrvudphashtbl);
NFSD_VNET_DECLARE(struct nfsrchash_bucket *, nfsrchash_table);
NFSD_VNET_DECLARE(struct nfsrchash_bucket *, nfsrcahash_table);
NFSD_VNET_DECLARE(struct nfsstatsv1 *, nfsstatsv1_p);

NFSD_VNET_DEFINE(int, nfsrc_floodlevel) = NFSRVCACHE_FLOODLEVEL;
NFSD_VNET_DEFINE(int, nfsrc_tcpsavedreplies) = 0;

SYSCTL_DECL(_vfs_nfsd);

static u_int nfsrc_tcphighwater = 0;
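/*
 * Sysctl handler for vfs.nfsd.tcphighwater.  Negative values are rejected
 * and, when the new high water mark reaches the current flood level,
 * nfsrc_floodlevel is raised to 120% of the new value so that it stays
 * above the high water mark.
 */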
static int
sysctl_tcphighwater(SYSCTL_HANDLER_ARGS)
{
	int error, newhighwater;

	newhighwater = nfsrc_tcphighwater;
	error = sysctl_handle_int(oidp, &newhighwater, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (newhighwater < 0)
		return (EINVAL);
	if (newhighwater >= NFSD_VNET(nfsrc_floodlevel))
		NFSD_VNET(nfsrc_floodlevel) = newhighwater + newhighwater / 5;
	nfsrc_tcphighwater = newhighwater;
	return (0);
}
SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater,
    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(nfsrc_tcphighwater),
    sysctl_tcphighwater, "IU", "High water mark for TCP cache entries");

static u_int nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
    &nfsrc_udphighwater, 0,
    "High water mark for UDP cache entries");
static u_int nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
    &nfsrc_tcptimeout, 0,
    "Timeout for TCP entries in the DRC");
static u_int nfsrc_tcpnonidempotent = 1;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
    &nfsrc_tcpnonidempotent, 0,
    "Enable the DRC for NFS over TCP");

NFSD_VNET_DEFINE_STATIC(int, nfsrc_udpcachesize) = 0;
NFSD_VNET_DEFINE_STATIC(TAILQ_HEAD(, nfsrvcache), nfsrvudplru);

/*
 * The reverse mapping from generic procedure numbers to Version 2
 * procedure numbers.
 */
static int newnfsv2_procid[NFS_V3NPROCS] = {
	NFSV2PROC_NULL,
	NFSV2PROC_GETATTR,
	NFSV2PROC_SETATTR,
	NFSV2PROC_LOOKUP,
	NFSV2PROC_NOOP,
	NFSV2PROC_READLINK,
	NFSV2PROC_READ,
	NFSV2PROC_WRITE,
	NFSV2PROC_CREATE,
	NFSV2PROC_MKDIR,
	NFSV2PROC_SYMLINK,
	NFSV2PROC_CREATE,
	NFSV2PROC_REMOVE,
	NFSV2PROC_RMDIR,
	NFSV2PROC_RENAME,
	NFSV2PROC_LINK,
	NFSV2PROC_READDIR,
	NFSV2PROC_NOOP,
	NFSV2PROC_STATFS,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
};

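/*
 * Hash table indexing: the UDP and TCP tables are hashed on the xid, while
 * the "ahash" table of replies awaiting TCP acknowledgment is indexed by
 * the socket reference (rc_sockref) passed to NFSRCAHASH().
 */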
#define	nfsrc_hash(xid)	(((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
#define	NFSRCUDPHASH(xid) \
	(&NFSD_VNET(nfsrvudphashtbl)[nfsrc_hash(xid)])
#define	NFSRCHASH(xid) \
	(&NFSD_VNET(nfsrchash_table)[nfsrc_hash(xid)].tbl)
#define	NFSRCAHASH(xid)	(&NFSD_VNET(nfsrcahash_table)[nfsrc_hash(xid)])
#define	TRUE	1
#define	FALSE	0
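/*
 * Number of leading bytes of the XDR request that are checksummed when
 * matching TCP cache entries (see nfsrc_getlenandcksum()).
 */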
#define	NFSRVCACHE_CHECKLEN	100

/* True iff the rpc reply is an nfs status ONLY! */
static int nfsv2_repstat[NFS_V3NPROCS] = {
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	TRUE,
	TRUE,
	TRUE,
	TRUE,
	FALSE,
	TRUE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
};

/*
 * Will NFS want to work over IPv6 someday?
 */
#define	NETFAMILY(rp) \
	(((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)

/* local functions */
static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static void nfsrc_lock(struct nfsrvcache *rp);
static void nfsrc_unlock(struct nfsrvcache *rp);
static void nfsrc_wanted(struct nfsrvcache *rp);
static void nfsrc_freecache(struct nfsrvcache *rp);
static int nfsrc_getlenandcksum(struct mbuf *m1, u_int16_t *cksum);
static void nfsrc_marksametcpconn(u_int64_t);

/*
 * Return the correct mutex for this cache entry.
 */
static __inline struct mtx *
nfsrc_cachemutex(struct nfsrvcache *rp)
{

	if ((rp->rc_flag & RC_UDP) != 0)
		return (&nfsrc_udpmtx);
	return (&NFSD_VNET(nfsrchash_table)[nfsrc_hash(rp->rc_xid)].mtx);
}

/*
 * Initialize the server request cache list
 */
void
nfsrvd_initcache(void)
{
	int i;

	NFSD_VNET(nfsrvudphashtbl) = malloc(sizeof(struct nfsrvhashhead) *
	    NFSRVCACHE_HASHSIZE, M_NFSRVCACHE, M_WAITOK | M_ZERO);
	NFSD_VNET(nfsrchash_table) = malloc(sizeof(struct nfsrchash_bucket) *
	    NFSRVCACHE_HASHSIZE, M_NFSRVCACHE, M_WAITOK | M_ZERO);
	NFSD_VNET(nfsrcahash_table) = malloc(sizeof(struct nfsrchash_bucket) *
	    NFSRVCACHE_HASHSIZE, M_NFSRVCACHE, M_WAITOK | M_ZERO);
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		mtx_init(&NFSD_VNET(nfsrchash_table)[i].mtx, "nfsrtc", NULL,
		    MTX_DEF);
		mtx_init(&NFSD_VNET(nfsrcahash_table)[i].mtx, "nfsrtca", NULL,
		    MTX_DEF);
	}
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_INIT(&NFSD_VNET(nfsrvudphashtbl)[i]);
		LIST_INIT(&NFSD_VNET(nfsrchash_table)[i].tbl);
		LIST_INIT(&NFSD_VNET(nfsrcahash_table)[i].tbl);
	}
	TAILQ_INIT(&NFSD_VNET(nfsrvudplru));
	NFSD_VNET(nfsrc_tcpsavedreplies) = 0;
	NFSD_VNET(nfsrc_udpcachesize) = 0;
}

/*
 * Get a cache entry for this request. Basically just malloc a new one
 * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
 */
int
nfsrvd_getcache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *newrp;
	int ret;

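	/* Null RPCs bypass the cache entirely (see the algorithm above). */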
	if (nd->nd_procnum == NFSPROC_NULL)
		panic("nfsd cache null");
	newrp = malloc(sizeof (struct nfsrvcache),
	    M_NFSRVCACHE, M_WAITOK);
	NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
	if (nd->nd_flag & ND_NFSV4)
		newrp->rc_flag = RC_NFSV4;
	else if (nd->nd_flag & ND_NFSV3)
		newrp->rc_flag = RC_NFSV3;
	else
		newrp->rc_flag = RC_NFSV2;
	newrp->rc_xid = nd->nd_retxid;
	newrp->rc_proc = nd->nd_procnum;
	newrp->rc_sockref = nd->nd_sockref;
	newrp->rc_cachetime = nd->nd_tcpconntime;
	if (nd->nd_flag & ND_SAMETCPCONN)
		newrp->rc_flag |= RC_SAMETCPCONN;
	if (nd->nd_nam2 != NULL) {
		newrp->rc_flag |= RC_UDP;
		ret = nfsrc_getudp(nd, newrp);
	} else {
		ret = nfsrc_gettcp(nd, newrp);
	}
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * For UDP (v2, v3):
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 */
static int
nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp;
	struct sockaddr_in *saddr;
	struct sockaddr_in6 *saddr6;
	struct nfsrvhashhead *hp;
	int ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCUDPHASH(newrp->rc_xid);
loop:
	mtx_lock(mutex);
	LIST_FOREACH(rp, hp, rc_hash) {
		if (newrp->rc_xid == rp->rc_xid &&
		    newrp->rc_proc == rp->rc_proc &&
		    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		    nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
			if ((rp->rc_flag & RC_LOCKED) != 0) {
				rp->rc_flag |= RC_WANTED;
				(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
				    "nfsrc", 10 * hz);
				goto loop;
			}
			if (rp->rc_flag == 0)
				panic("nfs udp cache0");
			rp->rc_flag |= RC_LOCKED;
			TAILQ_REMOVE(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
			TAILQ_INSERT_TAIL(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
			if (rp->rc_flag & RC_INPROG) {
				NFSD_VNET(nfsstatsv1_p)->srvcache_inproghits++;
				mtx_unlock(mutex);
				ret = RC_DROPIT;
			} else if (rp->rc_flag & RC_REPSTATUS) {
				/*
				 * V2 only.
				 */
				NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nfsrvd_rephead(nd);
				*(nd->nd_errp) = rp->rc_status;
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
				    NFSRVCACHE_UDPTIMEOUT;
			} else if (rp->rc_flag & RC_REPMBUF) {
				NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nd->nd_mreq = m_copym(rp->rc_reply, 0,
				    M_COPYALL, M_WAITOK);
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
				    NFSRVCACHE_UDPTIMEOUT;
			} else {
				panic("nfs udp cache1");
			}
			nfsrc_unlock(rp);
			free(newrp, M_NFSRVCACHE);
			goto out;
		}
	}
	NFSD_VNET(nfsstatsv1_p)->srvcache_misses++;
	atomic_add_int(&NFSD_VNET(nfsstatsv1_p)->srvcache_size, 1);
	NFSD_VNET(nfsrc_udpcachesize)++;

	newrp->rc_flag |= RC_INPROG;
	saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
	if (saddr->sin_family == AF_INET)
		newrp->rc_inet = saddr->sin_addr.s_addr;
	else if (saddr->sin_family == AF_INET6) {
		saddr6 = (struct sockaddr_in6 *)saddr;
		NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
		    sizeof (struct in6_addr));
		newrp->rc_flag |= RC_INETIPV6;
	}
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	TAILQ_INSERT_TAIL(&NFSD_VNET(nfsrvudplru), newrp, rc_lru);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Update a request cache entry after the rpc has been done
 */
struct nfsrvcache *
nfsrvd_updatecache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *rp;
	struct nfsrvcache *retrp = NULL;
	struct mbuf *m;
	struct mtx *mutex;

	rp = nd->nd_rp;
	if (!rp)
		panic("nfsrvd_updatecache null rp");
	nd->nd_rp = NULL;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	nfsrc_lock(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_updatecache not inprog");
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
		TAILQ_INSERT_TAIL(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
	}

	/*
	 * Reply from cache is a special case returned by nfsrv_checkseqid().
	 */
	if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
		NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
		mtx_unlock(mutex);
		nd->nd_repstat = 0;
		if (nd->nd_mreq)
			m_freem(nd->nd_mreq);
		if (!(rp->rc_flag & RC_REPMBUF))
			panic("reply from cache");
		nd->nd_mreq = m_copym(rp->rc_reply, 0,
		    M_COPYALL, M_WAITOK);
		rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		nfsrc_unlock(rp);
		goto out;
	}

	/*
	 * If rc_refcnt > 0, save it
	 * For UDP, save it if ND_SAVEREPLY is set
	 * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set
	 */
	if (nd->nd_repstat != NFSERR_DONTREPLY &&
	    (rp->rc_refcnt > 0 ||
	     ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
	     ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
	      NFSD_VNET(nfsrc_tcpsavedreplies) <= NFSD_VNET(nfsrc_floodlevel) &&
	      nfsrc_tcpnonidempotent))) {
		if (rp->rc_refcnt > 0) {
			if (!(rp->rc_flag & RC_NFSV4))
				panic("update_cache refcnt");
			rp->rc_flag |= RC_REFCNT;
		}
		if ((nd->nd_flag & ND_NFSV2) &&
		    nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
			rp->rc_status = nd->nd_repstat;
			rp->rc_flag |= RC_REPSTATUS;
			mtx_unlock(mutex);
		} else {
			if (!(rp->rc_flag & RC_UDP)) {
				atomic_add_int(&NFSD_VNET(nfsrc_tcpsavedreplies),
				    1);
				if (NFSD_VNET(nfsrc_tcpsavedreplies) >
				    NFSD_VNET(nfsstatsv1_p)->srvcache_tcppeak)
					NFSD_VNET(nfsstatsv1_p)->srvcache_tcppeak =
					    NFSD_VNET(nfsrc_tcpsavedreplies);
			}
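			/*
			 * Release the mutex while copying the reply, since
			 * m_copym(..., M_WAITOK) may sleep.
			 */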
			mtx_unlock(mutex);
			m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
			mtx_lock(mutex);
			rp->rc_reply = m;
			rp->rc_flag |= RC_REPMBUF;
			mtx_unlock(mutex);
		}
		if (rp->rc_flag & RC_UDP) {
			rp->rc_timestamp = NFSD_MONOSEC +
			    NFSRVCACHE_UDPTIMEOUT;
			nfsrc_unlock(rp);
		} else {
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
			if (rp->rc_refcnt > 0)
				nfsrc_unlock(rp);
			else
				retrp = rp;
		}
	} else {
		nfsrc_freecache(rp);
		mtx_unlock(mutex);
	}

out:
	NFSEXITCODE2(0, nd);
	return (retrp);
}

/*
 * Invalidate and, if possible, free an in prog cache entry.
 * Must not sleep.
 */
void
nfsrvd_delcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_delcache not in prog");
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}

/*
 * Called after nfsrvd_updatecache() once the reply is sent, to update
 * the entry's sequence number and unlock it. The argument is
 * the pointer returned by nfsrvd_updatecache().
 */
void
nfsrvd_sentcache(struct nfsrvcache *rp, int have_seq, uint32_t seq)
{
	struct nfsrchash_bucket *hbp;

	KASSERT(rp->rc_flag & RC_LOCKED, ("nfsrvd_sentcache not locked"));
	if (have_seq) {
		hbp = NFSRCAHASH(rp->rc_sockref);
		mtx_lock(&hbp->mtx);
		rp->rc_tcpseq = seq;
		if (rp->rc_acked != RC_NO_ACK)
			LIST_INSERT_HEAD(&hbp->tbl, rp, rc_ahash);
		rp->rc_acked = RC_NO_ACK;
		mtx_unlock(&hbp->mtx);
	}
	nfsrc_unlock(rp);
}

/*
 * Get a cache entry for TCP
 * - key on <xid, nfs version>
 *   (allow multiple entries for a given key)
 */
static int
nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp, *nextrp;
	int i;
	struct nfsrvcache *hitrp;
	struct nfsrvhashhead *hp, nfsrc_templist;
	int hit, ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCHASH(newrp->rc_xid);
	newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
tryagain:
	mtx_lock(mutex);
	hit = 1;
	LIST_INIT(&nfsrc_templist);
	/*
	 * Get all the matches and put them on the temp list.
	 */
	rp = LIST_FIRST(hp);
	while (rp != LIST_END(hp)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		if (newrp->rc_xid == rp->rc_xid &&
		    (!(rp->rc_flag & RC_INPROG) ||
		     ((newrp->rc_flag & RC_SAMETCPCONN) &&
		      newrp->rc_sockref == rp->rc_sockref)) &&
		    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		    newrp->rc_proc == rp->rc_proc &&
		    ((newrp->rc_flag & RC_NFSV4) &&
		     newrp->rc_sockref != rp->rc_sockref &&
		     newrp->rc_cachetime >= rp->rc_cachetime)
		    && newrp->rc_reqlen == rp->rc_reqlen &&
		    newrp->rc_cksum == rp->rc_cksum) {
			LIST_REMOVE(rp, rc_hash);
			LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
		}
		rp = nextrp;
	}

	/*
	 * Now, use nfsrc_templist to decide if there is a match.
	 */
	i = 0;
	LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
		i++;
		if (rp->rc_refcnt > 0) {
			hit = 0;
			break;
		}
	}
	/*
	 * Can be a hit only if one entry left.
	 * Note possible hit entry and put nfsrc_templist back on hash
	 * list.
	 */
	if (i != 1)
		hit = 0;
	hitrp = rp = LIST_FIRST(&nfsrc_templist);
	while (rp != LIST_END(&nfsrc_templist)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		LIST_REMOVE(rp, rc_hash);
		LIST_INSERT_HEAD(hp, rp, rc_hash);
		rp = nextrp;
	}
	if (LIST_FIRST(&nfsrc_templist) != LIST_END(&nfsrc_templist))
		panic("nfs gettcp cache templist");

	if (hit) {
		rp = hitrp;
		if ((rp->rc_flag & RC_LOCKED) != 0) {
			rp->rc_flag |= RC_WANTED;
			(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
			    "nfsrc", 10 * hz);
			goto tryagain;
		}
		if (rp->rc_flag == 0)
			panic("nfs tcp cache0");
		rp->rc_flag |= RC_LOCKED;
		if (rp->rc_flag & RC_INPROG) {
			NFSD_VNET(nfsstatsv1_p)->srvcache_inproghits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_DROPIT;
		} else if (rp->rc_flag & RC_REPSTATUS) {
			/*
			 * V2 only.
			 */
			NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nfsrvd_rephead(nd);
			*(nd->nd_errp) = rp->rc_status;
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else if (rp->rc_flag & RC_REPMBUF) {
			NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nd->nd_mreq = m_copym(rp->rc_reply, 0,
			    M_COPYALL, M_WAITOK);
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else {
			panic("nfs tcp cache1");
		}
		nfsrc_unlock(rp);
		free(newrp, M_NFSRVCACHE);
		goto out;
	}
	NFSD_VNET(nfsstatsv1_p)->srvcache_misses++;
	atomic_add_int(&NFSD_VNET(nfsstatsv1_p)->srvcache_size, 1);

	/*
	 * For TCP, multiple entries for a key are allowed, so don't
	 * chain it into the hash table until done.
	 */
	newrp->rc_cachetime = NFSD_MONOSEC;
	newrp->rc_flag |= RC_INPROG;
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Lock a cache entry.
 */
static void
nfsrc_lock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_assert(mutex, MA_OWNED);
	while ((rp->rc_flag & RC_LOCKED) != 0) {
		rp->rc_flag |= RC_WANTED;
		(void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0);
	}
	rp->rc_flag |= RC_LOCKED;
}

/*
 * Unlock a cache entry.
 */
static void
nfsrc_unlock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_LOCKED;
	nfsrc_wanted(rp);
	mtx_unlock(mutex);
}

/*
 * Wakeup anyone wanting entry.
 */
static void
nfsrc_wanted(struct nfsrvcache *rp)
{
	if (rp->rc_flag & RC_WANTED) {
		rp->rc_flag &= ~RC_WANTED;
		wakeup((caddr_t)rp);
	}
}

/*
 * Free up the entry.
 * Must not sleep.
 */
static void
nfsrc_freecache(struct nfsrvcache *rp)
{
	struct nfsrchash_bucket *hbp;

	LIST_REMOVE(rp, rc_hash);
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
		NFSD_VNET(nfsrc_udpcachesize)--;
	} else if (rp->rc_acked != RC_NO_SEQ) {
		hbp = NFSRCAHASH(rp->rc_sockref);
		mtx_lock(&hbp->mtx);
		if (rp->rc_acked == RC_NO_ACK)
			LIST_REMOVE(rp, rc_ahash);
		mtx_unlock(&hbp->mtx);
	}
	nfsrc_wanted(rp);
	if (rp->rc_flag & RC_REPMBUF) {
		m_freem(rp->rc_reply);
		if (!(rp->rc_flag & RC_UDP))
			atomic_add_int(&NFSD_VNET(nfsrc_tcpsavedreplies), -1);
	}
	free(rp, M_NFSRVCACHE);
	atomic_add_int(&NFSD_VNET(nfsstatsv1_p)->srvcache_size, -1);
}

/*
 * Clean out the cache. Called when nfsserver module is unloaded.
 */
void
nfsrvd_cleancache(void)
{
	struct nfsrvcache *rp, *nextrp;
	int i;

	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_FOREACH_SAFE(rp, &NFSD_VNET(nfsrchash_table)[i].tbl,
		    rc_hash, nextrp)
			nfsrc_freecache(rp);
	}
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_FOREACH_SAFE(rp, &NFSD_VNET(nfsrvudphashtbl)[i], rc_hash,
		    nextrp) {
			nfsrc_freecache(rp);
		}
	}
	NFSD_VNET(nfsstatsv1_p)->srvcache_size = 0;
	NFSD_VNET(nfsrc_tcpsavedreplies) = 0;
}

#define	HISTSIZE	16
/*
 * The basic rule is to get rid of entries that are expired.
 */
void
nfsrc_trimcache(u_int64_t sockref, uint32_t snd_una, int final)
{
	struct nfsrchash_bucket *hbp;
	struct nfsrvcache *rp, *nextrp;
	int force, lastslot, i, j, k, tto, time_histo[HISTSIZE];
	time_t thisstamp;
	static time_t udp_lasttrim = 0, tcp_lasttrim = 0;
	static int onethread = 0, oneslot = 0;

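	/*
	 * First, use the TCP acknowledgment information for this socket to
	 * mark cached replies as acknowledged (freeable) or, on the final
	 * call for a connection, as never to be acknowledged.
	 */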
	if (sockref != 0) {
		hbp = NFSRCAHASH(sockref);
		mtx_lock(&hbp->mtx);
		LIST_FOREACH_SAFE(rp, &hbp->tbl, rc_ahash, nextrp) {
			if (sockref == rp->rc_sockref) {
				if (SEQ_GEQ(snd_una, rp->rc_tcpseq)) {
					rp->rc_acked = RC_ACK;
					LIST_REMOVE(rp, rc_ahash);
				} else if (final) {
					rp->rc_acked = RC_NACK;
					LIST_REMOVE(rp, rc_ahash);
				}
			}
		}
		mtx_unlock(&hbp->mtx);
	}

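	/* Only one thread trims the cache at a time. */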
	if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0)
		return;
	if (NFSD_MONOSEC != udp_lasttrim ||
	    NFSD_VNET(nfsrc_udpcachesize) >= (nfsrc_udphighwater +
	    nfsrc_udphighwater / 2)) {
		mtx_lock(&nfsrc_udpmtx);
		udp_lasttrim = NFSD_MONOSEC;
		TAILQ_FOREACH_SAFE(rp, &NFSD_VNET(nfsrvudplru), rc_lru,
		    nextrp) {
			if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
			     && rp->rc_refcnt == 0
			     && ((rp->rc_flag & RC_REFCNT) ||
				 udp_lasttrim > rp->rc_timestamp ||
				 NFSD_VNET(nfsrc_udpcachesize) >
				 nfsrc_udphighwater))
				nfsrc_freecache(rp);
		}
		mtx_unlock(&nfsrc_udpmtx);
	}
	if (NFSD_MONOSEC != tcp_lasttrim ||
	    NFSD_VNET(nfsrc_tcpsavedreplies) >= nfsrc_tcphighwater) {
		force = nfsrc_tcphighwater / 4;
		if (force > 0 &&
		    NFSD_VNET(nfsrc_tcpsavedreplies) + force >=
		    nfsrc_tcphighwater) {
			for (i = 0; i < HISTSIZE; i++)
				time_histo[i] = 0;
			i = 0;
			lastslot = NFSRVCACHE_HASHSIZE - 1;
		} else {
			force = 0;
			if (NFSD_MONOSEC != tcp_lasttrim) {
				i = 0;
				lastslot = NFSRVCACHE_HASHSIZE - 1;
			} else {
				lastslot = i = oneslot;
				if (++oneslot >= NFSRVCACHE_HASHSIZE)
					oneslot = 0;
			}
		}
		tto = nfsrc_tcptimeout;
		tcp_lasttrim = NFSD_MONOSEC;
		for (; i <= lastslot; i++) {
			mtx_lock(&NFSD_VNET(nfsrchash_table)[i].mtx);
			LIST_FOREACH_SAFE(rp,
			    &NFSD_VNET(nfsrchash_table)[i].tbl, rc_hash,
			    nextrp) {
				if (!(rp->rc_flag &
				     (RC_INPROG|RC_LOCKED|RC_WANTED))
				     && rp->rc_refcnt == 0) {
					if ((rp->rc_flag & RC_REFCNT) ||
					    tcp_lasttrim > rp->rc_timestamp ||
					    rp->rc_acked == RC_ACK) {
						nfsrc_freecache(rp);
						continue;
					}

					if (force == 0)
						continue;
					/*
					 * The timestamps range from roughly the
					 * present (tcp_lasttrim) to the present
					 * + nfsrc_tcptimeout. Generate a simple
					 * histogram of where the timeouts fall.
					 */
					j = rp->rc_timestamp - tcp_lasttrim;
					if (j >= tto)
						j = HISTSIZE - 1;
					else if (j < 0)
						j = 0;
					else
						j = j * HISTSIZE / tto;
					time_histo[j]++;
				}
			}
			mtx_unlock(&NFSD_VNET(nfsrchash_table)[i].mtx);
		}
		if (force) {
			/*
			 * Trim some more with a smaller timeout of as little
			 * as 20% of nfsrc_tcptimeout to try and get below
			 * 80% of the nfsrc_tcphighwater.
			 */
			k = 0;
			for (i = 0; i < (HISTSIZE - 2); i++) {
				k += time_histo[i];
				if (k > force)
					break;
			}
			k = tto * (i + 1) / HISTSIZE;
			if (k < 1)
				k = 1;
			thisstamp = tcp_lasttrim + k;
			for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
				mtx_lock(&NFSD_VNET(nfsrchash_table)[i].mtx);
				LIST_FOREACH_SAFE(rp,
				    &NFSD_VNET(nfsrchash_table)[i].tbl,
				    rc_hash, nextrp) {
					if (!(rp->rc_flag &
					     (RC_INPROG|RC_LOCKED|RC_WANTED))
					     && rp->rc_refcnt == 0
					     && ((rp->rc_flag & RC_REFCNT) ||
						 thisstamp > rp->rc_timestamp ||
						 rp->rc_acked == RC_ACK))
						nfsrc_freecache(rp);
				}
				mtx_unlock(&NFSD_VNET(nfsrchash_table)[i].mtx);
			}
		}
	}
	atomic_store_rel_int(&onethread, 0);
}

/*
 * Add a seqid# reference to the cache entry.
 */
void
nfsrvd_refcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	if (rp == NULL)
		/* For NFSv4.1, there is no cache entry. */
		return;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt < 0)
		panic("nfs cache refcnt");
	rp->rc_refcnt++;
	mtx_unlock(mutex);
}

/*
 * Dereference a seqid# cache entry.
 */
void
nfsrvd_derefcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt <= 0)
		panic("nfs cache derefcnt");
	rp->rc_refcnt--;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}

/*
 * Calculate the length of the mbuf list and a checksum on the first up to
 * NFSRVCACHE_CHECKLEN bytes.
 */
static int
nfsrc_getlenandcksum(struct mbuf *m1, u_int16_t *cksum)
{
	int len = 0, cklen;
	struct mbuf *m;

	m = m1;
	while (m) {
		len += m->m_len;
		m = m->m_next;
	}
	cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
	*cksum = in_cksum(m1, cklen);
	return (len);
}

/*
 * Mark a TCP connection that is seeing retries. Should never happen for
 * NFSv4.
 */
static void
nfsrc_marksametcpconn(u_int64_t sockref)
{
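	/* Nothing is recorded at present. */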
}