xref: /freebsd/sys/fs/nfsserver/nfs_nfsdcache.c (revision b9128a37faafede823eb456aa65a11ac69997284)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1989, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * Rick Macklem at The University of Guelph.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  */
35 
36 #include <sys/cdefs.h>
37 /*
38  * Here is the basic algorithm:
39  * First, some design criteria I used:
40  * - I think a false hit is more serious than a false miss
41  * - A false hit for an RPC that has Op(s) that order via seqid# must be
42  *   avoided at all cost
43  * - A valid hit will probably happen a long time after the original reply
44  *   and the TCP socket that the original request was received on will no
45  *   longer be active
46  *   (The long time delay implies to me that LRU is not appropriate.)
47  * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
48  *   in them as well as minimizing the risk of redoing retried non-idempotent
49  *   Ops.
50  * Because it is biased towards avoiding false hits, multiple entries with
51  * the same xid are to be expected, especially for the case of the entry
52  * in the cache being related to a seqid# sequenced Op.
53  *
54  * The basic algorithm I'm about to code up:
55  * - Null RPCs bypass the cache and are just done
56  * For TCP
57  * 	- key on <xid, NFS version> (as noted above, there can be several
58  * 				     entries with the same key)
59  * 	When a request arrives:
60  * 		For all that match key
61  * 		- if RPC# != OR request_size !=
62  * 			- not a match with this one
63  * 		- if NFSv4 and received on same TCP socket OR
64  *			received on a TCP connection created before the
65  *			entry was cached
66  * 			- not a match with this one
67  * 			(V2,3 clients might retry on same TCP socket)
68  * 		- calculate checksum on first N bytes of NFS XDR
69  * 		- if checksum !=
70  * 			- not a match for this one
71  * 		If any of the remaining ones that match has a
72  * 			seqid_refcnt > 0
73  * 			- not a match (go do RPC, using new cache entry)
74  * 		If one match left
75  * 			- a hit (reply from cache)
76  * 		else
77  * 			- miss (go do RPC, using new cache entry)
78  *
79  * 	During processing of NFSv4 request:
80  * 		- set a flag when a non-idempotent Op is processed
81  * 		- when an Op that uses a seqid# (Open,...) is processed
82  * 			- if same seqid# as referenced entry in cache
83  * 				- free new cache entry
84  * 				- reply from referenced cache entry
85  * 			  else if next seqid# in order
86  * 				- free referenced cache entry
87  * 				- increment seqid_refcnt on new cache entry
88  * 				- set pointer from Openowner/Lockowner to
89  * 					new cache entry (aka reference it)
90  * 			  else if first seqid# in sequence
91  * 				- increment seqid_refcnt on new cache entry
92  * 				- set pointer from Openowner/Lockowner to
93  * 					new cache entry (aka reference it)
94  *
95  * 	At end of RPC processing:
96  * 		- if seqid_refcnt > 0 OR flagged non-idempotent on new
97  * 			cache entry
98  * 			- save reply in cache entry
99  * 			- calculate checksum on first N bytes of NFS XDR
100  * 				request
101  * 			- note op and length of XDR request (in bytes)
102  * 			- timestamp it
103  * 		  else
104  * 			- free new cache entry
105  * 		- Send reply (noting info for socket activity check, below)
106  *
107  * 	For cache entries saved above:
108  * 		- if saved since seqid_refcnt was > 0
109  * 			- free when seqid_refcnt decrements to 0
110  * 			  (when next one in sequence is processed above, or
111  * 			   when Openowner/Lockowner is discarded)
112  * 		  else { non-idempotent Op(s) }
113  * 			- free when
114  * 				- some further activity observed on same
115  * 					socket
116  * 				  (I'm not yet sure how I'm going to do
117  * 				   this. Maybe look at the TCP connection
118  * 				   to see if the send_tcp_sequence# is well
119  * 				   past sent reply OR K additional RPCs
120  * 				   replied on same socket OR?)
121  * 			  OR
122  * 				- when very old (hours, days, weeks?)
123  *
124  * For UDP (v2, 3 only), pretty much the old way:
125  * - key on <xid, NFS version, RPC#, Client host ip#>
126  *   (at most one entry for each key)
127  *
128  * When a Request arrives:
129  * - if a match with entry via key
130  * 	- if RPC marked In_progress
131  * 		- discard request (don't send reply)
132  * 	  else
133  * 		- reply from cache
134  * 		- timestamp cache entry
135  *   else
136  * 	- add entry to cache, marked In_progress
137  * 	- do RPC
138  * 	- when RPC done
139  * 		- if RPC# non-idempotent
140  * 			- mark entry Done (not In_progress)
141  * 			- save reply
142  * 			- timestamp cache entry
143  * 		  else
144  * 			- free cache entry
145  * 		- send reply
146  *
147  * Later, entries with saved replies are free'd a short time (few minutes)
148  * after reply sent (timestamp).
149  * Reference: Chet Juszczak, "Improving the Performance and Correctness
150  *		of an NFS Server", in Proc. Winter 1989 USENIX Conference,
151  *		pages 53-63. San Diego, February 1989.
152  *	 for the UDP case.
153  * nfsrc_floodlevel is set to the allowable upper limit for saved replies
154  *	for TCP. For V3, a reply won't be saved when the flood level is
155  *	hit. For V4, the non-idempotent Op will return NFSERR_RESOURCE in
156  *	that case. This level should be set high enough that this almost
157  *	never happens.
158  */
159 #include <fs/nfs/nfsport.h>
160 
161 extern struct mtx nfsrc_udpmtx;
162 
163 NFSD_VNET_DECLARE(struct nfsrvhashhead *, nfsrvudphashtbl);
164 NFSD_VNET_DECLARE(struct nfsrchash_bucket *, nfsrchash_table);
165 NFSD_VNET_DECLARE(struct nfsrchash_bucket *, nfsrcahash_table);
166 NFSD_VNET_DECLARE(struct nfsstatsv1 *, nfsstatsv1_p);
167 
168 NFSD_VNET_DEFINE(int, nfsrc_floodlevel) = NFSRVCACHE_FLOODLEVEL;
169 NFSD_VNET_DEFINE(int, nfsrc_tcpsavedreplies) = 0;
170 
171 SYSCTL_DECL(_vfs_nfsd);
172 
173 static u_int	nfsrc_tcphighwater = 0;
174 static int
175 sysctl_tcphighwater(SYSCTL_HANDLER_ARGS)
176 {
177 	int error, newhighwater;
178 
179 	newhighwater = nfsrc_tcphighwater;
180 	error = sysctl_handle_int(oidp, &newhighwater, 0, req);
181 	if (error != 0 || req->newptr == NULL)
182 		return (error);
183 	if (newhighwater < 0)
184 		return (EINVAL);
185 	if (newhighwater >= NFSD_VNET(nfsrc_floodlevel))
186 		NFSD_VNET(nfsrc_floodlevel) = newhighwater + newhighwater / 5;
187 	nfsrc_tcphighwater = newhighwater;
188 	return (0);
189 }
190 SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater,
191     CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(nfsrc_tcphighwater),
192     sysctl_tcphighwater, "IU", "High water mark for TCP cache entries");
193 
194 static u_int	nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
195 SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
196     &nfsrc_udphighwater, 0,
197     "High water mark for UDP cache entries");
198 static u_int	nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
199 SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
200     &nfsrc_tcptimeout, 0,
201     "Timeout for TCP entries in the DRC");
202 static u_int nfsrc_tcpnonidempotent = 1;
203 SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
204     &nfsrc_tcpnonidempotent, 0,
205     "Enable the DRC for NFS over TCP");
206 
207 NFSD_VNET_DEFINE_STATIC(int, nfsrc_udpcachesize) = 0;
208 NFSD_VNET_DEFINE_STATIC(TAILQ_HEAD(, nfsrvcache), nfsrvudplru);
209 
210 /*
211  * and the reverse mapping from generic to Version 2 procedure numbers
212  */
213 static int newnfsv2_procid[NFS_V3NPROCS] = {
214 	NFSV2PROC_NULL,
215 	NFSV2PROC_GETATTR,
216 	NFSV2PROC_SETATTR,
217 	NFSV2PROC_LOOKUP,
218 	NFSV2PROC_NOOP,
219 	NFSV2PROC_READLINK,
220 	NFSV2PROC_READ,
221 	NFSV2PROC_WRITE,
222 	NFSV2PROC_CREATE,
223 	NFSV2PROC_MKDIR,
224 	NFSV2PROC_SYMLINK,
225 	NFSV2PROC_CREATE,
226 	NFSV2PROC_REMOVE,
227 	NFSV2PROC_RMDIR,
228 	NFSV2PROC_RENAME,
229 	NFSV2PROC_LINK,
230 	NFSV2PROC_READDIR,
231 	NFSV2PROC_NOOP,
232 	NFSV2PROC_STATFS,
233 	NFSV2PROC_NOOP,
234 	NFSV2PROC_NOOP,
235 	NFSV2PROC_NOOP,
236 };
237 
/*
 * Hash a key: add the key shifted right by 24 bits to itself and reduce
 * the sum modulo the table size.
 */
#define	nfsrc_hash(xid)							\
	(((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)

/* Chain head in the UDP hash table for a key. */
#define	NFSRCUDPHASH(xid)						\
	(&NFSD_VNET(nfsrvudphashtbl)[nfsrc_hash(xid)])

/* Chain head in the TCP hash table for a key. */
#define	NFSRCHASH(xid)							\
	(&NFSD_VNET(nfsrchash_table)[nfsrc_hash(xid)].tbl)

/* Whole bucket (mutex and chain) in the ACK hash table for a key. */
#define	NFSRCAHASH(xid)							\
	(&NFSD_VNET(nfsrcahash_table)[nfsrc_hash(xid)])

#define	TRUE	1
#define	FALSE	0

/* Upper bound on the number of request bytes that are checksummed. */
#define	NFSRVCACHE_CHECKLEN	100
247 
248 /* True iff the rpc reply is an nfs status ONLY! */
249 static int nfsv2_repstat[NFS_V3NPROCS] = {
250 	FALSE,
251 	FALSE,
252 	FALSE,
253 	FALSE,
254 	FALSE,
255 	FALSE,
256 	FALSE,
257 	FALSE,
258 	FALSE,
259 	FALSE,
260 	TRUE,
261 	TRUE,
262 	TRUE,
263 	TRUE,
264 	FALSE,
265 	TRUE,
266 	FALSE,
267 	FALSE,
268 	FALSE,
269 	FALSE,
270 	FALSE,
271 	FALSE,
272 };
273 
/*
 * Address family of the client address recorded in a cache entry:
 * AF_INET6 when RC_INETIPV6 is set in rc_flag, AF_INET otherwise.
 */
#define	NETFAMILY(rp)							\
	((((rp)->rc_flag & RC_INETIPV6) != 0) ? AF_INET6 : AF_INET)
279 
280 /* local functions */
281 static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
282 static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
283 static void nfsrc_lock(struct nfsrvcache *rp);
284 static void nfsrc_unlock(struct nfsrvcache *rp);
285 static void nfsrc_wanted(struct nfsrvcache *rp);
286 static void nfsrc_freecache(struct nfsrvcache *rp);
287 static int nfsrc_getlenandcksum(struct mbuf *m1, u_int16_t *cksum);
288 static void nfsrc_marksametcpconn(u_int64_t);
289 
290 /*
291  * Return the correct mutex for this cache entry.
292  */
293 static __inline struct mtx *
294 nfsrc_cachemutex(struct nfsrvcache *rp)
295 {
296 
297 	if ((rp->rc_flag & RC_UDP) != 0)
298 		return (&nfsrc_udpmtx);
299 	return (&NFSD_VNET(nfsrchash_table)[nfsrc_hash(rp->rc_xid)].mtx);
300 }
301 
302 /*
303  * Initialize the server request cache list
304  */
305 void
306 nfsrvd_initcache(void)
307 {
308 	int i;
309 
310 	NFSD_VNET(nfsrvudphashtbl) = malloc(sizeof(struct nfsrvhashhead) *
311 	    NFSRVCACHE_HASHSIZE, M_NFSRVCACHE, M_WAITOK | M_ZERO);
312 	NFSD_VNET(nfsrchash_table) = malloc(sizeof(struct nfsrchash_bucket) *
313 	    NFSRVCACHE_HASHSIZE, M_NFSRVCACHE, M_WAITOK | M_ZERO);
314 	NFSD_VNET(nfsrcahash_table) = malloc(sizeof(struct nfsrchash_bucket) *
315 	    NFSRVCACHE_HASHSIZE, M_NFSRVCACHE, M_WAITOK | M_ZERO);
316 	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
317 		mtx_init(&NFSD_VNET(nfsrchash_table)[i].mtx, "nfsrtc", NULL,
318 		    MTX_DEF);
319 		mtx_init(&NFSD_VNET(nfsrcahash_table)[i].mtx, "nfsrtca", NULL,
320 		    MTX_DEF);
321 	}
322 	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
323 		LIST_INIT(&NFSD_VNET(nfsrvudphashtbl)[i]);
324 		LIST_INIT(&NFSD_VNET(nfsrchash_table)[i].tbl);
325 		LIST_INIT(&NFSD_VNET(nfsrcahash_table)[i].tbl);
326 	}
327 	TAILQ_INIT(&NFSD_VNET(nfsrvudplru));
328 	NFSD_VNET(nfsrc_tcpsavedreplies) = 0;
329 	NFSD_VNET(nfsrc_udpcachesize) = 0;
330 }
331 
332 /*
333  * Get a cache entry for this request. Basically just malloc a new one
334  * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
335  */
336 int
337 nfsrvd_getcache(struct nfsrv_descript *nd)
338 {
339 	struct nfsrvcache *newrp;
340 	int ret;
341 
342 	if (nd->nd_procnum == NFSPROC_NULL)
343 		panic("nfsd cache null");
344 	newrp = malloc(sizeof (struct nfsrvcache),
345 	    M_NFSRVCACHE, M_WAITOK);
346 	NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
347 	if (nd->nd_flag & ND_NFSV4)
348 		newrp->rc_flag = RC_NFSV4;
349 	else if (nd->nd_flag & ND_NFSV3)
350 		newrp->rc_flag = RC_NFSV3;
351 	else
352 		newrp->rc_flag = RC_NFSV2;
353 	newrp->rc_xid = nd->nd_retxid;
354 	newrp->rc_proc = nd->nd_procnum;
355 	newrp->rc_sockref = nd->nd_sockref;
356 	newrp->rc_cachetime = nd->nd_tcpconntime;
357 	if (nd->nd_flag & ND_SAMETCPCONN)
358 		newrp->rc_flag |= RC_SAMETCPCONN;
359 	if (nd->nd_nam2 != NULL) {
360 		newrp->rc_flag |= RC_UDP;
361 		ret = nfsrc_getudp(nd, newrp);
362 	} else {
363 		ret = nfsrc_gettcp(nd, newrp);
364 	}
365 	NFSEXITCODE2(0, nd);
366 	return (ret);
367 }
368 
369 /*
370  * For UDP (v2, v3):
371  * - key on <xid, NFS version, RPC#, Client host ip#>
372  *   (at most one entry for each key)
373  */
374 static int
375 nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
376 {
377 	struct nfsrvcache *rp;
378 	struct sockaddr_in *saddr;
379 	struct sockaddr_in6 *saddr6;
380 	struct nfsrvhashhead *hp;
381 	int ret = 0;
382 	struct mtx *mutex;
383 
384 	mutex = nfsrc_cachemutex(newrp);
385 	hp = NFSRCUDPHASH(newrp->rc_xid);
386 loop:
387 	mtx_lock(mutex);
388 	LIST_FOREACH(rp, hp, rc_hash) {
389 	    if (newrp->rc_xid == rp->rc_xid &&
390 		newrp->rc_proc == rp->rc_proc &&
391 		(newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
392 		nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
393 			if ((rp->rc_flag & RC_LOCKED) != 0) {
394 				rp->rc_flag |= RC_WANTED;
395 				(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
396 				    "nfsrc", 10 * hz);
397 				goto loop;
398 			}
399 			if (rp->rc_flag == 0)
400 				panic("nfs udp cache0");
401 			rp->rc_flag |= RC_LOCKED;
402 			TAILQ_REMOVE(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
403 			TAILQ_INSERT_TAIL(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
404 			if (rp->rc_flag & RC_INPROG) {
405 				NFSD_VNET(nfsstatsv1_p)->srvcache_inproghits++;
406 				mtx_unlock(mutex);
407 				ret = RC_DROPIT;
408 			} else if (rp->rc_flag & RC_REPSTATUS) {
409 				/*
410 				 * V2 only.
411 				 */
412 				NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
413 				mtx_unlock(mutex);
414 				nfsrvd_rephead(nd);
415 				*(nd->nd_errp) = rp->rc_status;
416 				ret = RC_REPLY;
417 				rp->rc_timestamp = NFSD_MONOSEC +
418 					NFSRVCACHE_UDPTIMEOUT;
419 			} else if (rp->rc_flag & RC_REPMBUF) {
420 				NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
421 				mtx_unlock(mutex);
422 				nd->nd_mreq = m_copym(rp->rc_reply, 0,
423 					M_COPYALL, M_WAITOK);
424 				ret = RC_REPLY;
425 				rp->rc_timestamp = NFSD_MONOSEC +
426 					NFSRVCACHE_UDPTIMEOUT;
427 			} else {
428 				panic("nfs udp cache1");
429 			}
430 			nfsrc_unlock(rp);
431 			free(newrp, M_NFSRVCACHE);
432 			goto out;
433 		}
434 	}
435 	NFSD_VNET(nfsstatsv1_p)->srvcache_misses++;
436 	atomic_add_int(&NFSD_VNET(nfsstatsv1_p)->srvcache_size, 1);
437 	NFSD_VNET(nfsrc_udpcachesize)++;
438 
439 	newrp->rc_flag |= RC_INPROG;
440 	saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
441 	if (saddr->sin_family == AF_INET)
442 		newrp->rc_inet = saddr->sin_addr.s_addr;
443 	else if (saddr->sin_family == AF_INET6) {
444 		saddr6 = (struct sockaddr_in6 *)saddr;
445 		NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
446 		    sizeof (struct in6_addr));
447 		newrp->rc_flag |= RC_INETIPV6;
448 	}
449 	LIST_INSERT_HEAD(hp, newrp, rc_hash);
450 	TAILQ_INSERT_TAIL(&NFSD_VNET(nfsrvudplru), newrp, rc_lru);
451 	mtx_unlock(mutex);
452 	nd->nd_rp = newrp;
453 	ret = RC_DOIT;
454 
455 out:
456 	NFSEXITCODE2(0, nd);
457 	return (ret);
458 }
459 
460 /*
461  * Update a request cache entry after the rpc has been done
462  */
463 struct nfsrvcache *
464 nfsrvd_updatecache(struct nfsrv_descript *nd)
465 {
466 	struct nfsrvcache *rp;
467 	struct nfsrvcache *retrp = NULL;
468 	struct mbuf *m;
469 	struct mtx *mutex;
470 
471 	rp = nd->nd_rp;
472 	if (!rp)
473 		panic("nfsrvd_updatecache null rp");
474 	nd->nd_rp = NULL;
475 	mutex = nfsrc_cachemutex(rp);
476 	mtx_lock(mutex);
477 	nfsrc_lock(rp);
478 	if (!(rp->rc_flag & RC_INPROG))
479 		panic("nfsrvd_updatecache not inprog");
480 	rp->rc_flag &= ~RC_INPROG;
481 	if (rp->rc_flag & RC_UDP) {
482 		TAILQ_REMOVE(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
483 		TAILQ_INSERT_TAIL(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
484 	}
485 
486 	/*
487 	 * Reply from cache is a special case returned by nfsrv_checkseqid().
488 	 */
489 	if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
490 		NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
491 		mtx_unlock(mutex);
492 		nd->nd_repstat = 0;
493 		if (nd->nd_mreq)
494 			m_freem(nd->nd_mreq);
495 		if (!(rp->rc_flag & RC_REPMBUF))
496 			panic("reply from cache");
497 		nd->nd_mreq = m_copym(rp->rc_reply, 0,
498 		    M_COPYALL, M_WAITOK);
499 		rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
500 		nfsrc_unlock(rp);
501 		goto out;
502 	}
503 
504 	/*
505 	 * If rc_refcnt > 0, save it
506 	 * For UDP, save it if ND_SAVEREPLY is set
507 	 * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set
508 	 */
509 	if (nd->nd_repstat != NFSERR_DONTREPLY &&
510 	    (rp->rc_refcnt > 0 ||
511 	     ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
512 	     ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
513 	      NFSD_VNET(nfsrc_tcpsavedreplies) <= NFSD_VNET(nfsrc_floodlevel) &&
514 	      nfsrc_tcpnonidempotent))) {
515 		if (rp->rc_refcnt > 0) {
516 			if (!(rp->rc_flag & RC_NFSV4))
517 				panic("update_cache refcnt");
518 			rp->rc_flag |= RC_REFCNT;
519 		}
520 		if ((nd->nd_flag & ND_NFSV2) &&
521 		    nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
522 			rp->rc_status = nd->nd_repstat;
523 			rp->rc_flag |= RC_REPSTATUS;
524 			mtx_unlock(mutex);
525 		} else {
526 			if (!(rp->rc_flag & RC_UDP)) {
527 			    atomic_add_int(&NFSD_VNET(nfsrc_tcpsavedreplies),
528 				1);
529 			    if (NFSD_VNET(nfsrc_tcpsavedreplies) >
530 				NFSD_VNET(nfsstatsv1_p)->srvcache_tcppeak)
531 				NFSD_VNET(nfsstatsv1_p)->srvcache_tcppeak =
532 				    NFSD_VNET(nfsrc_tcpsavedreplies);
533 			}
534 			mtx_unlock(mutex);
535 			m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
536 			mtx_lock(mutex);
537 			rp->rc_reply = m;
538 			rp->rc_flag |= RC_REPMBUF;
539 			mtx_unlock(mutex);
540 		}
541 		if (rp->rc_flag & RC_UDP) {
542 			rp->rc_timestamp = NFSD_MONOSEC +
543 			    NFSRVCACHE_UDPTIMEOUT;
544 			nfsrc_unlock(rp);
545 		} else {
546 			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
547 			if (rp->rc_refcnt > 0)
548 				nfsrc_unlock(rp);
549 			else
550 				retrp = rp;
551 		}
552 	} else {
553 		nfsrc_freecache(rp);
554 		mtx_unlock(mutex);
555 	}
556 
557 out:
558 	NFSEXITCODE2(0, nd);
559 	return (retrp);
560 }
561 
562 /*
563  * Invalidate and, if possible, free an in prog cache entry.
564  * Must not sleep.
565  */
566 void
567 nfsrvd_delcache(struct nfsrvcache *rp)
568 {
569 	struct mtx *mutex;
570 
571 	mutex = nfsrc_cachemutex(rp);
572 	if (!(rp->rc_flag & RC_INPROG))
573 		panic("nfsrvd_delcache not in prog");
574 	mtx_lock(mutex);
575 	rp->rc_flag &= ~RC_INPROG;
576 	if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
577 		nfsrc_freecache(rp);
578 	mtx_unlock(mutex);
579 }
580 
581 /*
582  * Called after nfsrvd_updatecache() once the reply is sent, to update
583  * the entry's sequence number and unlock it. The argument is
584  * the pointer returned by nfsrvd_updatecache().
585  */
586 void
587 nfsrvd_sentcache(struct nfsrvcache *rp, int have_seq, uint32_t seq)
588 {
589 	struct nfsrchash_bucket *hbp;
590 
591 	KASSERT(rp->rc_flag & RC_LOCKED, ("nfsrvd_sentcache not locked"));
592 	if (have_seq) {
593 		hbp = NFSRCAHASH(rp->rc_sockref);
594 		mtx_lock(&hbp->mtx);
595 		rp->rc_tcpseq = seq;
596 		if (rp->rc_acked != RC_NO_ACK)
597 			LIST_INSERT_HEAD(&hbp->tbl, rp, rc_ahash);
598 		rp->rc_acked = RC_NO_ACK;
599 		mtx_unlock(&hbp->mtx);
600 	}
601 	nfsrc_unlock(rp);
602 }
603 
604 /*
605  * Get a cache entry for TCP
606  * - key on <xid, nfs version>
607  *   (allow multiple entries for a given key)
608  */
609 static int
610 nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
611 {
612 	struct nfsrvcache *rp, *nextrp;
613 	int i;
614 	struct nfsrvcache *hitrp;
615 	struct nfsrvhashhead *hp, nfsrc_templist;
616 	int hit, ret = 0;
617 	struct mtx *mutex;
618 
619 	mutex = nfsrc_cachemutex(newrp);
620 	hp = NFSRCHASH(newrp->rc_xid);
621 	newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
622 tryagain:
623 	mtx_lock(mutex);
624 	hit = 1;
625 	LIST_INIT(&nfsrc_templist);
626 	/*
627 	 * Get all the matches and put them on the temp list.
628 	 */
629 	rp = LIST_FIRST(hp);
630 	while (rp != LIST_END(hp)) {
631 		nextrp = LIST_NEXT(rp, rc_hash);
632 		if (newrp->rc_xid == rp->rc_xid &&
633 		    (!(rp->rc_flag & RC_INPROG) ||
634 		     ((newrp->rc_flag & RC_SAMETCPCONN) &&
635 		      newrp->rc_sockref == rp->rc_sockref)) &&
636 		    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
637 		    newrp->rc_proc == rp->rc_proc &&
638 		    ((newrp->rc_flag & RC_NFSV4) &&
639 		     newrp->rc_sockref != rp->rc_sockref &&
640 		     newrp->rc_cachetime >= rp->rc_cachetime)
641 		    && newrp->rc_reqlen == rp->rc_reqlen &&
642 		    newrp->rc_cksum == rp->rc_cksum) {
643 			LIST_REMOVE(rp, rc_hash);
644 			LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
645 		}
646 		rp = nextrp;
647 	}
648 
649 	/*
650 	 * Now, use nfsrc_templist to decide if there is a match.
651 	 */
652 	i = 0;
653 	LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
654 		i++;
655 		if (rp->rc_refcnt > 0) {
656 			hit = 0;
657 			break;
658 		}
659 	}
660 	/*
661 	 * Can be a hit only if one entry left.
662 	 * Note possible hit entry and put nfsrc_templist back on hash
663 	 * list.
664 	 */
665 	if (i != 1)
666 		hit = 0;
667 	hitrp = rp = LIST_FIRST(&nfsrc_templist);
668 	while (rp != LIST_END(&nfsrc_templist)) {
669 		nextrp = LIST_NEXT(rp, rc_hash);
670 		LIST_REMOVE(rp, rc_hash);
671 		LIST_INSERT_HEAD(hp, rp, rc_hash);
672 		rp = nextrp;
673 	}
674 	if (LIST_FIRST(&nfsrc_templist) != LIST_END(&nfsrc_templist))
675 		panic("nfs gettcp cache templist");
676 
677 	if (hit) {
678 		rp = hitrp;
679 		if ((rp->rc_flag & RC_LOCKED) != 0) {
680 			rp->rc_flag |= RC_WANTED;
681 			(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
682 			    "nfsrc", 10 * hz);
683 			goto tryagain;
684 		}
685 		if (rp->rc_flag == 0)
686 			panic("nfs tcp cache0");
687 		rp->rc_flag |= RC_LOCKED;
688 		if (rp->rc_flag & RC_INPROG) {
689 			NFSD_VNET(nfsstatsv1_p)->srvcache_inproghits++;
690 			mtx_unlock(mutex);
691 			if (newrp->rc_sockref == rp->rc_sockref)
692 				nfsrc_marksametcpconn(rp->rc_sockref);
693 			ret = RC_DROPIT;
694 		} else if (rp->rc_flag & RC_REPSTATUS) {
695 			/*
696 			 * V2 only.
697 			 */
698 			NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
699 			mtx_unlock(mutex);
700 			if (newrp->rc_sockref == rp->rc_sockref)
701 				nfsrc_marksametcpconn(rp->rc_sockref);
702 			ret = RC_REPLY;
703 			nfsrvd_rephead(nd);
704 			*(nd->nd_errp) = rp->rc_status;
705 			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
706 		} else if (rp->rc_flag & RC_REPMBUF) {
707 			NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
708 			mtx_unlock(mutex);
709 			if (newrp->rc_sockref == rp->rc_sockref)
710 				nfsrc_marksametcpconn(rp->rc_sockref);
711 			ret = RC_REPLY;
712 			nd->nd_mreq = m_copym(rp->rc_reply, 0,
713 				M_COPYALL, M_WAITOK);
714 			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
715 		} else {
716 			panic("nfs tcp cache1");
717 		}
718 		nfsrc_unlock(rp);
719 		free(newrp, M_NFSRVCACHE);
720 		goto out;
721 	}
722 	NFSD_VNET(nfsstatsv1_p)->srvcache_misses++;
723 	atomic_add_int(&NFSD_VNET(nfsstatsv1_p)->srvcache_size, 1);
724 
725 	/*
726 	 * For TCP, multiple entries for a key are allowed, so don't
727 	 * chain it into the hash table until done.
728 	 */
729 	newrp->rc_cachetime = NFSD_MONOSEC;
730 	newrp->rc_flag |= RC_INPROG;
731 	LIST_INSERT_HEAD(hp, newrp, rc_hash);
732 	mtx_unlock(mutex);
733 	nd->nd_rp = newrp;
734 	ret = RC_DOIT;
735 
736 out:
737 	NFSEXITCODE2(0, nd);
738 	return (ret);
739 }
740 
741 /*
742  * Lock a cache entry.
743  */
744 static void
745 nfsrc_lock(struct nfsrvcache *rp)
746 {
747 	struct mtx *mutex;
748 
749 	mutex = nfsrc_cachemutex(rp);
750 	mtx_assert(mutex, MA_OWNED);
751 	while ((rp->rc_flag & RC_LOCKED) != 0) {
752 		rp->rc_flag |= RC_WANTED;
753 		(void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0);
754 	}
755 	rp->rc_flag |= RC_LOCKED;
756 }
757 
758 /*
759  * Unlock a cache entry.
760  */
761 static void
762 nfsrc_unlock(struct nfsrvcache *rp)
763 {
764 	struct mtx *mutex;
765 
766 	mutex = nfsrc_cachemutex(rp);
767 	mtx_lock(mutex);
768 	rp->rc_flag &= ~RC_LOCKED;
769 	nfsrc_wanted(rp);
770 	mtx_unlock(mutex);
771 }
772 
773 /*
774  * Wakeup anyone wanting entry.
775  */
776 static void
777 nfsrc_wanted(struct nfsrvcache *rp)
778 {
779 	if (rp->rc_flag & RC_WANTED) {
780 		rp->rc_flag &= ~RC_WANTED;
781 		wakeup((caddr_t)rp);
782 	}
783 }
784 
785 /*
786  * Free up the entry.
787  * Must not sleep.
788  */
789 static void
790 nfsrc_freecache(struct nfsrvcache *rp)
791 {
792 	struct nfsrchash_bucket *hbp;
793 
794 	LIST_REMOVE(rp, rc_hash);
795 	if (rp->rc_flag & RC_UDP) {
796 		TAILQ_REMOVE(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
797 		NFSD_VNET(nfsrc_udpcachesize)--;
798 	} else if (rp->rc_acked != RC_NO_SEQ) {
799 		hbp = NFSRCAHASH(rp->rc_sockref);
800 		mtx_lock(&hbp->mtx);
801 		if (rp->rc_acked == RC_NO_ACK)
802 			LIST_REMOVE(rp, rc_ahash);
803 		mtx_unlock(&hbp->mtx);
804 	}
805 	nfsrc_wanted(rp);
806 	if (rp->rc_flag & RC_REPMBUF) {
807 		m_freem(rp->rc_reply);
808 		if (!(rp->rc_flag & RC_UDP))
809 			atomic_add_int(&NFSD_VNET(nfsrc_tcpsavedreplies), -1);
810 	}
811 	free(rp, M_NFSRVCACHE);
812 	atomic_add_int(&NFSD_VNET(nfsstatsv1_p)->srvcache_size, -1);
813 }
814 
815 /*
816  * Clean out the cache. Called when nfsserver module is unloaded.
817  */
818 void
819 nfsrvd_cleancache(void)
820 {
821 	struct nfsrvcache *rp, *nextrp;
822 	int i;
823 
824 	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
825 		LIST_FOREACH_SAFE(rp, &NFSD_VNET(nfsrchash_table)[i].tbl,
826 		    rc_hash, nextrp)
827 			nfsrc_freecache(rp);
828 	}
829 	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
830 		LIST_FOREACH_SAFE(rp, &NFSD_VNET(nfsrvudphashtbl)[i], rc_hash,
831 		    nextrp) {
832 			nfsrc_freecache(rp);
833 		}
834 	}
835 	NFSD_VNET(nfsstatsv1_p)->srvcache_size = 0;
836 	NFSD_VNET(nfsrc_tcpsavedreplies) = 0;
837 }
838 
839 #define HISTSIZE	16
840 /*
841  * The basic rule is to get rid of entries that are expired.
842  */
843 void
844 nfsrc_trimcache(u_int64_t sockref, uint32_t snd_una, int final)
845 {
846 	struct nfsrchash_bucket *hbp;
847 	struct nfsrvcache *rp, *nextrp;
848 	int force, lastslot, i, j, k, tto, time_histo[HISTSIZE];
849 	time_t thisstamp;
850 	static time_t udp_lasttrim = 0, tcp_lasttrim = 0;
851 	static int onethread = 0, oneslot = 0;
852 
853 	if (sockref != 0) {
854 		hbp = NFSRCAHASH(sockref);
855 		mtx_lock(&hbp->mtx);
856 		LIST_FOREACH_SAFE(rp, &hbp->tbl, rc_ahash, nextrp) {
857 			if (sockref == rp->rc_sockref) {
858 				if (SEQ_GEQ(snd_una, rp->rc_tcpseq)) {
859 					rp->rc_acked = RC_ACK;
860 					LIST_REMOVE(rp, rc_ahash);
861 				} else if (final) {
862 					rp->rc_acked = RC_NACK;
863 					LIST_REMOVE(rp, rc_ahash);
864 				}
865 			}
866 		}
867 		mtx_unlock(&hbp->mtx);
868 	}
869 
870 	if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0)
871 		return;
872 	if (NFSD_MONOSEC != udp_lasttrim ||
873 	    NFSD_VNET(nfsrc_udpcachesize) >= (nfsrc_udphighwater +
874 	    nfsrc_udphighwater / 2)) {
875 		mtx_lock(&nfsrc_udpmtx);
876 		udp_lasttrim = NFSD_MONOSEC;
877 		TAILQ_FOREACH_SAFE(rp, &NFSD_VNET(nfsrvudplru), rc_lru,
878 		    nextrp) {
879 			if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
880 			     && rp->rc_refcnt == 0
881 			     && ((rp->rc_flag & RC_REFCNT) ||
882 				 udp_lasttrim > rp->rc_timestamp ||
883 				 NFSD_VNET(nfsrc_udpcachesize) >
884 				 nfsrc_udphighwater))
885 				nfsrc_freecache(rp);
886 		}
887 		mtx_unlock(&nfsrc_udpmtx);
888 	}
889 	if (NFSD_MONOSEC != tcp_lasttrim ||
890 	    NFSD_VNET(nfsrc_tcpsavedreplies) >= nfsrc_tcphighwater) {
891 		force = nfsrc_tcphighwater / 4;
892 		if (force > 0 &&
893 		    NFSD_VNET(nfsrc_tcpsavedreplies) + force >=
894 		    nfsrc_tcphighwater) {
895 			for (i = 0; i < HISTSIZE; i++)
896 				time_histo[i] = 0;
897 			i = 0;
898 			lastslot = NFSRVCACHE_HASHSIZE - 1;
899 		} else {
900 			force = 0;
901 			if (NFSD_MONOSEC != tcp_lasttrim) {
902 				i = 0;
903 				lastslot = NFSRVCACHE_HASHSIZE - 1;
904 			} else {
905 				lastslot = i = oneslot;
906 				if (++oneslot >= NFSRVCACHE_HASHSIZE)
907 					oneslot = 0;
908 			}
909 		}
910 		tto = nfsrc_tcptimeout;
911 		tcp_lasttrim = NFSD_MONOSEC;
912 		for (; i <= lastslot; i++) {
913 			mtx_lock(&NFSD_VNET(nfsrchash_table)[i].mtx);
914 			LIST_FOREACH_SAFE(rp,
915 			    &NFSD_VNET(nfsrchash_table)[i].tbl, rc_hash,
916 			    nextrp) {
917 				if (!(rp->rc_flag &
918 				     (RC_INPROG|RC_LOCKED|RC_WANTED))
919 				     && rp->rc_refcnt == 0) {
920 					if ((rp->rc_flag & RC_REFCNT) ||
921 					    tcp_lasttrim > rp->rc_timestamp ||
922 					    rp->rc_acked == RC_ACK) {
923 						nfsrc_freecache(rp);
924 						continue;
925 					}
926 
927 					if (force == 0)
928 						continue;
929 					/*
930 					 * The timestamps range from roughly the
931 					 * present (tcp_lasttrim) to the present
932 					 * + nfsrc_tcptimeout. Generate a simple
933 					 * histogram of where the timeouts fall.
934 					 */
935 					j = rp->rc_timestamp - tcp_lasttrim;
936 					if (j >= tto)
937 						j = HISTSIZE - 1;
938 					else if (j < 0)
939 						j = 0;
940 					else
941 						j = j * HISTSIZE / tto;
942 					time_histo[j]++;
943 				}
944 			}
945 			mtx_unlock(&NFSD_VNET(nfsrchash_table)[i].mtx);
946 		}
947 		if (force) {
948 			/*
949 			 * Trim some more with a smaller timeout of as little
950 			 * as 20% of nfsrc_tcptimeout to try and get below
951 			 * 80% of the nfsrc_tcphighwater.
952 			 */
953 			k = 0;
954 			for (i = 0; i < (HISTSIZE - 2); i++) {
955 				k += time_histo[i];
956 				if (k > force)
957 					break;
958 			}
959 			k = tto * (i + 1) / HISTSIZE;
960 			if (k < 1)
961 				k = 1;
962 			thisstamp = tcp_lasttrim + k;
963 			for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
964 				mtx_lock(&NFSD_VNET(nfsrchash_table)[i].mtx);
965 				LIST_FOREACH_SAFE(rp,
966 				    &NFSD_VNET(nfsrchash_table)[i].tbl,
967 				    rc_hash, nextrp) {
968 					if (!(rp->rc_flag &
969 					     (RC_INPROG|RC_LOCKED|RC_WANTED))
970 					     && rp->rc_refcnt == 0
971 					     && ((rp->rc_flag & RC_REFCNT) ||
972 						 thisstamp > rp->rc_timestamp ||
973 						 rp->rc_acked == RC_ACK))
974 						nfsrc_freecache(rp);
975 				}
976 				mtx_unlock(&NFSD_VNET(nfsrchash_table)[i].mtx);
977 			}
978 		}
979 	}
980 	atomic_store_rel_int(&onethread, 0);
981 }
982 
983 /*
984  * Add a seqid# reference to the cache entry.
985  */
986 void
987 nfsrvd_refcache(struct nfsrvcache *rp)
988 {
989 	struct mtx *mutex;
990 
991 	if (rp == NULL)
992 		/* For NFSv4.1, there is no cache entry. */
993 		return;
994 	mutex = nfsrc_cachemutex(rp);
995 	mtx_lock(mutex);
996 	if (rp->rc_refcnt < 0)
997 		panic("nfs cache refcnt");
998 	rp->rc_refcnt++;
999 	mtx_unlock(mutex);
1000 }
1001 
1002 /*
1003  * Dereference a seqid# cache entry.
1004  */
1005 void
1006 nfsrvd_derefcache(struct nfsrvcache *rp)
1007 {
1008 	struct mtx *mutex;
1009 
1010 	mutex = nfsrc_cachemutex(rp);
1011 	mtx_lock(mutex);
1012 	if (rp->rc_refcnt <= 0)
1013 		panic("nfs cache derefcnt");
1014 	rp->rc_refcnt--;
1015 	if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
1016 		nfsrc_freecache(rp);
1017 	mtx_unlock(mutex);
1018 }
1019 
1020 /*
1021  * Calculate the length of the mbuf list and a checksum on the first up to
1022  * NFSRVCACHE_CHECKLEN bytes.
1023  */
1024 static int
1025 nfsrc_getlenandcksum(struct mbuf *m1, u_int16_t *cksum)
1026 {
1027 	int len = 0, cklen;
1028 	struct mbuf *m;
1029 
1030 	m = m1;
1031 	while (m) {
1032 		len += m->m_len;
1033 		m = m->m_next;
1034 	}
1035 	cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
1036 	*cksum = in_cksum(m1, cklen);
1037 	return (len);
1038 }
1039 
/*
 * Mark a TCP connection that is seeing retries. Should never happen for
 * NFSv4.
 *
 * The body is intentionally empty: the socket reference is accepted and
 * nothing is recorded.
 */
static void
nfsrc_marksametcpconn(u_int64_t sockref)
{

	(void)sockref;
}
1047 }
1048