xref: /freebsd/sys/fs/nfsserver/nfs_nfsdcache.c (revision 3dd5524264095ed8612c28908e13f80668eff2f9)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1989, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * Rick Macklem at The University of Guelph.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  */
35 
36 #include <sys/cdefs.h>
37 __FBSDID("$FreeBSD$");
38 
39 /*
40  * Here is the basic algorithm:
41  * First, some design criteria I used:
42  * - I think a false hit is more serious than a false miss
43  * - A false hit for an RPC that has Op(s) that order via seqid# must be
44  *   avoided at all cost
45  * - A valid hit will probably happen a long time after the original reply
46  *   and the TCP socket that the original request was received on will no
47  *   longer be active
48  *   (The long time delay implies to me that LRU is not appropriate.)
49  * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
50  *   in them as well as minimizing the risk of redoing retried non-idempotent
51  *   Ops.
52  * Because it is biased towards avoiding false hits, multiple entries with
53  * the same xid are to be expected, especially for the case of the entry
54  * in the cache being related to a seqid# sequenced Op.
55  *
56  * The basic algorithm I'm about to code up:
57  * - Null RPCs bypass the cache and are just done
58  * For TCP
59  * 	- key on <xid, NFS version> (as noted above, there can be several
60  * 				     entries with the same key)
61  * 	When a request arrives:
62  * 		For all that match key
63  * 		- if RPC# != OR request_size !=
64  * 			- not a match with this one
65  * 		- if NFSv4 and received on same TCP socket OR
66  *			received on a TCP connection created before the
67  *			entry was cached
68  * 			- not a match with this one
69  * 			(V2,3 clients might retry on same TCP socket)
70  * 		- calculate checksum on first N bytes of NFS XDR
71  * 		- if checksum !=
72  * 			- not a match for this one
73  * 		If any of the remaining ones that match has a
74  * 			seqid_refcnt > 0
75  * 			- not a match (go do RPC, using new cache entry)
76  * 		If one match left
77  * 			- a hit (reply from cache)
78  * 		else
79  * 			- miss (go do RPC, using new cache entry)
80  *
81  * 	During processing of NFSv4 request:
82  * 		- set a flag when a non-idempotent Op is processed
83  * 		- when an Op that uses a seqid# (Open,...) is processed
84  * 			- if same seqid# as referenced entry in cache
85  * 				- free new cache entry
86  * 				- reply from referenced cache entry
87  * 			  else if next seqid# in order
88  * 				- free referenced cache entry
89  * 				- increment seqid_refcnt on new cache entry
90  * 				- set pointer from Openowner/Lockowner to
91  * 					new cache entry (aka reference it)
92  * 			  else if first seqid# in sequence
93  * 				- increment seqid_refcnt on new cache entry
94  * 				- set pointer from Openowner/Lockowner to
95  * 					new cache entry (aka reference it)
96  *
97  * 	At end of RPC processing:
98  * 		- if seqid_refcnt > 0 OR flagged non-idempotent on new
99  * 			cache entry
100  * 			- save reply in cache entry
101  * 			- calculate checksum on first N bytes of NFS XDR
102  * 				request
103  * 			- note op and length of XDR request (in bytes)
104  * 			- timestamp it
105  * 		  else
106  * 			- free new cache entry
107  * 		- Send reply (noting info for socket activity check, below)
108  *
109  * 	For cache entries saved above:
110  * 		- if saved since seqid_refcnt was > 0
111  * 			- free when seqid_refcnt decrements to 0
112  * 			  (when next one in sequence is processed above, or
113  * 			   when Openowner/Lockowner is discarded)
114  * 		  else { non-idempotent Op(s) }
115  * 			- free when
116  * 				- some further activity observed on same
117  * 					socket
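 * 				  (Originally an open question: look at
 * 				   the TCP connection to see if the
 * 				   send_tcp_sequence# is well past sent
 * 				   reply OR K additional RPCs replied on
 * 				   same socket OR?  As implemented,
 * 				   nfsrc_trimcache() marks an entry as
 * 				   acked once the connection's snd_una
 * 				   reaches the TCP sequence number that
 * 				   nfsrvd_sentcache() recorded for it.)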
123  * 			  OR
124  * 				- when very old (hours, days, weeks?)
125  *
126  * For UDP (v2, 3 only), pretty much the old way:
127  * - key on <xid, NFS version, RPC#, Client host ip#>
128  *   (at most one entry for each key)
129  *
130  * When a Request arrives:
131  * - if a match with entry via key
132  * 	- if RPC marked In_progress
133  * 		- discard request (don't send reply)
134  * 	  else
135  * 		- reply from cache
136  * 		- timestamp cache entry
137  *   else
138  * 	- add entry to cache, marked In_progress
139  * 	- do RPC
140  * 	- when RPC done
141  * 		- if RPC# non-idempotent
142  * 			- mark entry Done (not In_progress)
143  * 			- save reply
144  * 			- timestamp cache entry
145  * 		  else
146  * 			- free cache entry
147  * 		- send reply
148  *
149  * Later, entries with saved replies are free'd a short time (few minutes)
150  * after reply sent (timestamp).
151  * Reference: Chet Juszczak, "Improving the Performance and Correctness
152  *		of an NFS Server", in Proc. Winter 1989 USENIX Conference,
153  *		pages 53-63. San Diego, February 1989.
154  *	 for the UDP case.
155  * nfsrc_floodlevel is set to the allowable upper limit for saved replies
156  *	for TCP. For V3, a reply won't be saved when the flood level is
157  *	hit. For V4, the non-idempotent Op will return NFSERR_RESOURCE in
158  *	that case. This level should be set high enough that this almost
159  *	never happens.
160  */
#include <fs/nfs/nfsport.h>
#include <sys/limits.h>
162 
/*
 * Mutex returned by nfsrc_cachemutex() for every entry flagged RC_UDP; it
 * is also held while nfsrc_trimcache() walks the UDP LRU list.
 */
extern struct mtx nfsrc_udpmtx;

/*
 * Per-vnet state declared here and defined elsewhere:
 * - nfsrvudphashtbl:	UDP hash chains, indexed by nfsrc_hash(xid)
 * - nfsrchash_table:	TCP hash buckets (mutex + chain), indexed by
 *			nfsrc_hash(xid)
 * - nfsrcahash_table:	TCP "ack" hash buckets, indexed by
 *			nfsrc_hash(rc_sockref)
 * - nfsstatsv1_p:	statistics block updated by the cache code
 */
NFSD_VNET_DECLARE(struct nfsrvhashhead *, nfsrvudphashtbl);
NFSD_VNET_DECLARE(struct nfsrchash_bucket *, nfsrchash_table);
NFSD_VNET_DECLARE(struct nfsrchash_bucket *, nfsrcahash_table);
NFSD_VNET_DECLARE(struct nfsstatsv1 *, nfsstatsv1_p);

/*
 * nfsrc_floodlevel: nfsrvd_updatecache() stops saving ND_SAVEREPLY TCP
 * replies once nfsrc_tcpsavedreplies exceeds this value.
 * nfsrc_tcpsavedreplies: number of TCP entries currently holding a saved
 * reply mbuf chain (incremented in nfsrvd_updatecache(), decremented in
 * nfsrc_freecache()).
 */
NFSD_VNET_DEFINE(int, nfsrc_floodlevel) = NFSRVCACHE_FLOODLEVEL;
NFSD_VNET_DEFINE(int, nfsrc_tcpsavedreplies) = 0;

/* Parent sysctl node for the tunables defined in this file. */
SYSCTL_DECL(_vfs_nfsd);
174 
175 static u_int	nfsrc_tcphighwater = 0;
176 static int
177 sysctl_tcphighwater(SYSCTL_HANDLER_ARGS)
178 {
179 	int error, newhighwater;
180 
181 	newhighwater = nfsrc_tcphighwater;
182 	error = sysctl_handle_int(oidp, &newhighwater, 0, req);
183 	if (error != 0 || req->newptr == NULL)
184 		return (error);
185 	if (newhighwater < 0)
186 		return (EINVAL);
187 	if (newhighwater >= NFSD_VNET(nfsrc_floodlevel))
188 		NFSD_VNET(nfsrc_floodlevel) = newhighwater + newhighwater / 5;
189 	nfsrc_tcphighwater = newhighwater;
190 	return (0);
191 }
192 SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater,
193     CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(nfsrc_tcphighwater),
194     sysctl_tcphighwater, "IU", "High water mark for TCP cache entries");
195 
/*
 * UDP high water mark: nfsrc_trimcache() frees idle UDP entries regardless
 * of their timestamp while nfsrc_udpcachesize is above this value.
 */
static u_int	nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
    &nfsrc_udphighwater, 0,
    "High water mark for UDP cache entries");
/*
 * Added to NFSD_MONOSEC to form rc_timestamp for TCP entries; entries whose
 * timestamp has passed are freed by nfsrc_trimcache().
 */
static u_int	nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
    &nfsrc_tcptimeout, 0,
    "Timeout for TCP entries in the DRC");
/*
 * When zero, nfsrvd_updatecache() does not save ND_SAVEREPLY replies for
 * TCP entries (entries with rc_refcnt > 0 are still saved).
 */
static u_int nfsrc_tcpnonidempotent = 1;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
    &nfsrc_tcpnonidempotent, 0,
    "Enable the DRC for NFS over TCP");

/*
 * nfsrc_udpcachesize counts the UDP entries currently in the cache.
 * nfsrvudplru links the UDP entries; an entry is moved to the tail each
 * time it is hit or updated, so the head holds the least recently used one.
 */
NFSD_VNET_DEFINE_STATIC(int, nfsrc_udpcachesize) = 0;
NFSD_VNET_DEFINE_STATIC(TAILQ_HEAD(, nfsrvcache), nfsrvudplru);
211 
212 /*
213  * and the reverse mapping from generic to Version 2 procedure numbers
214  */
215 static int newnfsv2_procid[NFS_V3NPROCS] = {
216 	NFSV2PROC_NULL,
217 	NFSV2PROC_GETATTR,
218 	NFSV2PROC_SETATTR,
219 	NFSV2PROC_LOOKUP,
220 	NFSV2PROC_NOOP,
221 	NFSV2PROC_READLINK,
222 	NFSV2PROC_READ,
223 	NFSV2PROC_WRITE,
224 	NFSV2PROC_CREATE,
225 	NFSV2PROC_MKDIR,
226 	NFSV2PROC_SYMLINK,
227 	NFSV2PROC_CREATE,
228 	NFSV2PROC_REMOVE,
229 	NFSV2PROC_RMDIR,
230 	NFSV2PROC_RENAME,
231 	NFSV2PROC_LINK,
232 	NFSV2PROC_READDIR,
233 	NFSV2PROC_NOOP,
234 	NFSV2PROC_STATFS,
235 	NFSV2PROC_NOOP,
236 	NFSV2PROC_NOOP,
237 	NFSV2PROC_NOOP,
238 };
239 
/*
 * Fold an xid (or, for NFSRCAHASH, a socket reference) into a hash slot
 * below NFSRVCACHE_HASHSIZE.  Note that the argument is evaluated twice.
 */
#define	nfsrc_hash(xid)	(((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
/* Head of the UDP hash chain for the given xid. */
#define	NFSRCUDPHASH(xid) \
	(&NFSD_VNET(nfsrvudphashtbl)[nfsrc_hash(xid)])
/* Head of the TCP hash chain for the given xid. */
#define	NFSRCHASH(xid) \
	(&NFSD_VNET(nfsrchash_table)[nfsrc_hash(xid)].tbl)
/*
 * Bucket (mutex plus chain) of the ack hash table.  Despite the parameter
 * name, the callers in this file pass a socket reference (rc_sockref).
 */
#define	NFSRCAHASH(xid) (&NFSD_VNET(nfsrcahash_table)[nfsrc_hash(xid)])
/* Boolean values used by the nfsv2_repstat table. */
#define	TRUE	1
#define	FALSE	0
/* Maximum number of leading request bytes covered by the checksum. */
#define	NFSRVCACHE_CHECKLEN	100
249 
250 /* True iff the rpc reply is an nfs status ONLY! */
251 static int nfsv2_repstat[NFS_V3NPROCS] = {
252 	FALSE,
253 	FALSE,
254 	FALSE,
255 	FALSE,
256 	FALSE,
257 	FALSE,
258 	FALSE,
259 	FALSE,
260 	FALSE,
261 	FALSE,
262 	TRUE,
263 	TRUE,
264 	TRUE,
265 	TRUE,
266 	FALSE,
267 	TRUE,
268 	FALSE,
269 	FALSE,
270 	FALSE,
271 	FALSE,
272 	FALSE,
273 	FALSE,
274 };
275 
/*
 * Address family of the client address stored in a cache entry:
 * AF_INET6 when nfsrc_getudp() flagged the entry RC_INETIPV6, otherwise
 * AF_INET.
 */
#define	NETFAMILY(rp) \
		(((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)
281 
/*
 * Forward declarations of the file-local helpers defined further down:
 * per-transport lookup (getudp/gettcp), the per-entry sleep lock
 * (lock/unlock/wanted), entry teardown (freecache), the request
 * length/checksum helper and the same-connection retry hook.
 */
static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static void nfsrc_lock(struct nfsrvcache *rp);
static void nfsrc_unlock(struct nfsrvcache *rp);
static void nfsrc_wanted(struct nfsrvcache *rp);
static void nfsrc_freecache(struct nfsrvcache *rp);
static int nfsrc_getlenandcksum(struct mbuf *m1, u_int16_t *cksum);
static void nfsrc_marksametcpconn(u_int64_t);
291 
292 /*
293  * Return the correct mutex for this cache entry.
294  */
295 static __inline struct mtx *
296 nfsrc_cachemutex(struct nfsrvcache *rp)
297 {
298 
299 	if ((rp->rc_flag & RC_UDP) != 0)
300 		return (&nfsrc_udpmtx);
301 	return (&NFSD_VNET(nfsrchash_table)[nfsrc_hash(rp->rc_xid)].mtx);
302 }
303 
304 /*
305  * Initialize the server request cache list
306  */
307 void
308 nfsrvd_initcache(void)
309 {
310 	int i;
311 
312 	NFSD_VNET(nfsrvudphashtbl) = malloc(sizeof(struct nfsrvhashhead) *
313 	    NFSRVCACHE_HASHSIZE, M_NFSRVCACHE, M_WAITOK | M_ZERO);
314 	NFSD_VNET(nfsrchash_table) = malloc(sizeof(struct nfsrchash_bucket) *
315 	    NFSRVCACHE_HASHSIZE, M_NFSRVCACHE, M_WAITOK | M_ZERO);
316 	NFSD_VNET(nfsrcahash_table) = malloc(sizeof(struct nfsrchash_bucket) *
317 	    NFSRVCACHE_HASHSIZE, M_NFSRVCACHE, M_WAITOK | M_ZERO);
318 	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
319 		mtx_init(&NFSD_VNET(nfsrchash_table)[i].mtx, "nfsrtc", NULL,
320 		    MTX_DEF);
321 		mtx_init(&NFSD_VNET(nfsrcahash_table)[i].mtx, "nfsrtca", NULL,
322 		    MTX_DEF);
323 	}
324 	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
325 		LIST_INIT(&NFSD_VNET(nfsrvudphashtbl)[i]);
326 		LIST_INIT(&NFSD_VNET(nfsrchash_table)[i].tbl);
327 		LIST_INIT(&NFSD_VNET(nfsrcahash_table)[i].tbl);
328 	}
329 	TAILQ_INIT(&NFSD_VNET(nfsrvudplru));
330 	NFSD_VNET(nfsrc_tcpsavedreplies) = 0;
331 	NFSD_VNET(nfsrc_udpcachesize) = 0;
332 }
333 
334 /*
335  * Get a cache entry for this request. Basically just malloc a new one
336  * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
337  */
338 int
339 nfsrvd_getcache(struct nfsrv_descript *nd)
340 {
341 	struct nfsrvcache *newrp;
342 	int ret;
343 
344 	if (nd->nd_procnum == NFSPROC_NULL)
345 		panic("nfsd cache null");
346 	newrp = malloc(sizeof (struct nfsrvcache),
347 	    M_NFSRVCACHE, M_WAITOK);
348 	NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
349 	if (nd->nd_flag & ND_NFSV4)
350 		newrp->rc_flag = RC_NFSV4;
351 	else if (nd->nd_flag & ND_NFSV3)
352 		newrp->rc_flag = RC_NFSV3;
353 	else
354 		newrp->rc_flag = RC_NFSV2;
355 	newrp->rc_xid = nd->nd_retxid;
356 	newrp->rc_proc = nd->nd_procnum;
357 	newrp->rc_sockref = nd->nd_sockref;
358 	newrp->rc_cachetime = nd->nd_tcpconntime;
359 	if (nd->nd_flag & ND_SAMETCPCONN)
360 		newrp->rc_flag |= RC_SAMETCPCONN;
361 	if (nd->nd_nam2 != NULL) {
362 		newrp->rc_flag |= RC_UDP;
363 		ret = nfsrc_getudp(nd, newrp);
364 	} else {
365 		ret = nfsrc_gettcp(nd, newrp);
366 	}
367 	NFSEXITCODE2(0, nd);
368 	return (ret);
369 }
370 
/*
 * For UDP (v2, v3):
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 *
 * nd is the request being processed and newrp the freshly allocated entry
 * built by nfsrvd_getcache() (already flagged RC_UDP, so the mutex used
 * here is nfsrc_udpmtx).
 *
 * Returns:
 * - RC_DROPIT	a matching entry is still in progress; newrp is freed
 * - RC_REPLY	a matching entry holds a saved reply, which has been set up
 *		in nd (status word or a copy of the mbuf chain); newrp is
 *		freed
 * - RC_DOIT	no match; newrp has been inserted in the hash chain and at
 *		the tail of the LRU list, flagged RC_INPROG, and stored in
 *		nd->nd_rp
 */
static int
nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp;
	struct sockaddr_in *saddr;
	struct sockaddr_in6 *saddr6;
	struct nfsrvhashhead *hp;
	int ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCUDPHASH(newrp->rc_xid);
loop:
	mtx_lock(mutex);
	LIST_FOREACH(rp, hp, rc_hash) {
	    if (newrp->rc_xid == rp->rc_xid &&
		newrp->rc_proc == rp->rc_proc &&
		(newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
			if ((rp->rc_flag & RC_LOCKED) != 0) {
				/*
				 * Someone else holds the entry lock.  Sleep
				 * (at most 10 seconds) and rescan the chain;
				 * with PDROP the mutex is not held on return,
				 * which is why "loop" re-takes it.
				 */
				rp->rc_flag |= RC_WANTED;
				(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
				    "nfsrc", 10 * hz);
				goto loop;
			}
			if (rp->rc_flag == 0)
				panic("nfs udp cache0");
			/* Take the entry lock and mark it most recently used. */
			rp->rc_flag |= RC_LOCKED;
			TAILQ_REMOVE(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
			TAILQ_INSERT_TAIL(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
			if (rp->rc_flag & RC_INPROG) {
				/* Original request still being processed. */
				NFSD_VNET(nfsstatsv1_p)->srvcache_inproghits++;
				mtx_unlock(mutex);
				ret = RC_DROPIT;
			} else if (rp->rc_flag & RC_REPSTATUS) {
				/*
				 * V2 only.
				 */
				NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				/* Rebuild the reply from the saved status. */
				nfsrvd_rephead(nd);
				*(nd->nd_errp) = rp->rc_status;
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
					NFSRVCACHE_UDPTIMEOUT;
			} else if (rp->rc_flag & RC_REPMBUF) {
				NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				/*
				 * Copy the saved reply; done without the mutex
				 * since M_WAITOK may sleep.  The entry is
				 * still flagged RC_LOCKED at this point.
				 */
				nd->nd_mreq = m_copym(rp->rc_reply, 0,
					M_COPYALL, M_WAITOK);
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
					NFSRVCACHE_UDPTIMEOUT;
			} else {
				panic("nfs udp cache1");
			}
			/* Drop the entry lock and discard the unused new entry. */
			nfsrc_unlock(rp);
			free(newrp, M_NFSRVCACHE);
			goto out;
		}
	}
	/* Miss: account for the new entry and record the client address. */
	NFSD_VNET(nfsstatsv1_p)->srvcache_misses++;
	atomic_add_int(&NFSD_VNET(nfsstatsv1_p)->srvcache_size, 1);
	NFSD_VNET(nfsrc_udpcachesize)++;

	newrp->rc_flag |= RC_INPROG;
	saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
	if (saddr->sin_family == AF_INET)
		newrp->rc_inet = saddr->sin_addr.s_addr;
	else if (saddr->sin_family == AF_INET6) {
		saddr6 = (struct sockaddr_in6 *)saddr;
		NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
		    sizeof (struct in6_addr));
		newrp->rc_flag |= RC_INETIPV6;
	}
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	TAILQ_INSERT_TAIL(&NFSD_VNET(nfsrvudplru), newrp, rc_lru);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}
461 
/*
 * Update a request cache entry after the rpc has been done
 *
 * nd->nd_rp must point at the in-progress entry created for this request
 * (panics otherwise); it is cleared here.  Depending on the outcome the
 * reply is saved in the entry, replaced by a previously cached reply
 * (NFSERR_REPLYFROMCACHE), or the entry is freed.
 *
 * Returns the entry, still flagged RC_LOCKED, only for a TCP entry whose
 * reply was saved and that has no seqid# reference (rc_refcnt == 0); the
 * header comment of nfsrvd_sentcache() says that pointer is to be passed
 * to it once the reply is sent.  Returns NULL in every other case.
 */
struct nfsrvcache *
nfsrvd_updatecache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *rp;
	struct nfsrvcache *retrp = NULL;
	struct mbuf *m;
	struct mtx *mutex;

	rp = nd->nd_rp;
	if (!rp)
		panic("nfsrvd_updatecache null rp");
	nd->nd_rp = NULL;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	/* May sleep until any other holder of the entry lock releases it. */
	nfsrc_lock(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_updatecache not inprog");
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_flag & RC_UDP) {
		/* Mark the UDP entry most recently used. */
		TAILQ_REMOVE(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
		TAILQ_INSERT_TAIL(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
	}

	/*
	 * Reply from cache is a special case returned by nfsrv_checkseqid().
	 */
	if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
		NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
		mtx_unlock(mutex);
		nd->nd_repstat = 0;
		/* Discard the reply built so far and substitute the saved one. */
		if (nd->nd_mreq)
			m_freem(nd->nd_mreq);
		if (!(rp->rc_flag & RC_REPMBUF))
			panic("reply from cache");
		nd->nd_mreq = m_copym(rp->rc_reply, 0,
		    M_COPYALL, M_WAITOK);
		rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		nfsrc_unlock(rp);
		goto out;
	}

	/*
	 * If rc_refcnt > 0, save it
	 * For UDP, save it if ND_SAVEREPLY is set
	 * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set
	 * (and the number of saved TCP replies has not passed the flood
	 * level).  Nothing is saved when the reply status is
	 * NFSERR_DONTREPLY.
	 */
	if (nd->nd_repstat != NFSERR_DONTREPLY &&
	    (rp->rc_refcnt > 0 ||
	     ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
	     ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
	      NFSD_VNET(nfsrc_tcpsavedreplies) <= NFSD_VNET(nfsrc_floodlevel) &&
	      nfsrc_tcpnonidempotent))) {
		if (rp->rc_refcnt > 0) {
			/* seqid# references only exist for NFSv4 entries. */
			if (!(rp->rc_flag & RC_NFSV4))
				panic("update_cache refcnt");
			rp->rc_flag |= RC_REFCNT;
		}
		if ((nd->nd_flag & ND_NFSV2) &&
		    nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
			/* V2 status-only reply: remember just the status. */
			rp->rc_status = nd->nd_repstat;
			rp->rc_flag |= RC_REPSTATUS;
			mtx_unlock(mutex);
		} else {
			if (!(rp->rc_flag & RC_UDP)) {
			    /* Count the saved TCP reply and track the peak. */
			    atomic_add_int(&NFSD_VNET(nfsrc_tcpsavedreplies),
				1);
			    if (NFSD_VNET(nfsrc_tcpsavedreplies) >
				NFSD_VNET(nfsstatsv1_p)->srvcache_tcppeak)
				NFSD_VNET(nfsstatsv1_p)->srvcache_tcppeak =
				    NFSD_VNET(nfsrc_tcpsavedreplies);
			}
			/*
			 * The copy is made with the mutex dropped since
			 * M_WAITOK may sleep; the entry stays RC_LOCKED.
			 */
			mtx_unlock(mutex);
			m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
			mtx_lock(mutex);
			rp->rc_reply = m;
			rp->rc_flag |= RC_REPMBUF;
			mtx_unlock(mutex);
		}
		/* The mutex is no longer held here; the entry is RC_LOCKED. */
		if (rp->rc_flag & RC_UDP) {
			rp->rc_timestamp = NFSD_MONOSEC +
			    NFSRVCACHE_UDPTIMEOUT;
			nfsrc_unlock(rp);
		} else {
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
			if (rp->rc_refcnt > 0)
				nfsrc_unlock(rp);
			else
				/* Leave it locked for the caller. */
				retrp = rp;
		}
	} else {
		/* Nothing worth keeping: drop the entry. */
		nfsrc_freecache(rp);
		mtx_unlock(mutex);
	}

out:
	NFSEXITCODE2(0, nd);
	return (retrp);
}
563 
564 /*
565  * Invalidate and, if possible, free an in prog cache entry.
566  * Must not sleep.
567  */
568 void
569 nfsrvd_delcache(struct nfsrvcache *rp)
570 {
571 	struct mtx *mutex;
572 
573 	mutex = nfsrc_cachemutex(rp);
574 	if (!(rp->rc_flag & RC_INPROG))
575 		panic("nfsrvd_delcache not in prog");
576 	mtx_lock(mutex);
577 	rp->rc_flag &= ~RC_INPROG;
578 	if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
579 		nfsrc_freecache(rp);
580 	mtx_unlock(mutex);
581 }
582 
583 /*
584  * Called after nfsrvd_updatecache() once the reply is sent, to update
585  * the entry's sequence number and unlock it. The argument is
586  * the pointer returned by nfsrvd_updatecache().
587  */
588 void
589 nfsrvd_sentcache(struct nfsrvcache *rp, int have_seq, uint32_t seq)
590 {
591 	struct nfsrchash_bucket *hbp;
592 
593 	KASSERT(rp->rc_flag & RC_LOCKED, ("nfsrvd_sentcache not locked"));
594 	if (have_seq) {
595 		hbp = NFSRCAHASH(rp->rc_sockref);
596 		mtx_lock(&hbp->mtx);
597 		rp->rc_tcpseq = seq;
598 		if (rp->rc_acked != RC_NO_ACK)
599 			LIST_INSERT_HEAD(&hbp->tbl, rp, rc_ahash);
600 		rp->rc_acked = RC_NO_ACK;
601 		mtx_unlock(&hbp->mtx);
602 	}
603 	nfsrc_unlock(rp);
604 }
605 
/*
 * Get a cache entry for TCP
 * - key on <xid, nfs version>
 *   (allow multiple entries for a given key)
 *
 * nd is the request being processed and newrp the freshly allocated entry
 * built by nfsrvd_getcache().  The request length and checksum are
 * computed once up front and stored in newrp.
 *
 * Returns:
 * - RC_DROPIT	the single matching entry is still in progress; newrp is
 *		freed
 * - RC_REPLY	the single matching entry holds a saved reply, which has
 *		been set up in nd; newrp is freed
 * - RC_DOIT	no usable match; newrp has been inserted in the hash chain,
 *		flagged RC_INPROG, and stored in nd->nd_rp
 */
static int
nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp, *nextrp;
	int i;
	struct nfsrvcache *hitrp;
	struct nfsrvhashhead *hp, nfsrc_templist;
	int hit, ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCHASH(newrp->rc_xid);
	newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
tryagain:
	mtx_lock(mutex);
	hit = 1;
	LIST_INIT(&nfsrc_templist);
	/*
	 * Get all the matches and put them on the temp list.
	 *
	 * An in-progress entry only qualifies when the new request is flagged
	 * RC_SAMETCPCONN and came in on the same socket.
	 *
	 * NOTE(review): the RC_NFSV4 sub-expression is ANDed with the other
	 * tests, so an entry is collected only when the new request is NFSv4,
	 * arrived on a different socket than the cached one, and has a
	 * connection time that is not older than the entry's rc_cachetime.
	 * As written, NFSv2/v3 requests therefore never match -- confirm this
	 * is the intent of the algorithm described at the top of the file.
	 */
	rp = LIST_FIRST(hp);
	while (rp != LIST_END(hp)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		if (newrp->rc_xid == rp->rc_xid &&
		    (!(rp->rc_flag & RC_INPROG) ||
		     ((newrp->rc_flag & RC_SAMETCPCONN) &&
		      newrp->rc_sockref == rp->rc_sockref)) &&
		    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		    newrp->rc_proc == rp->rc_proc &&
		    ((newrp->rc_flag & RC_NFSV4) &&
		     newrp->rc_sockref != rp->rc_sockref &&
		     newrp->rc_cachetime >= rp->rc_cachetime)
		    && newrp->rc_reqlen == rp->rc_reqlen &&
		    newrp->rc_cksum == rp->rc_cksum) {
			LIST_REMOVE(rp, rc_hash);
			LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
		}
		rp = nextrp;
	}

	/*
	 * Now, use nfsrc_templist to decide if there is a match.
	 * Any candidate with a seqid# reference rules out a hit.
	 */
	i = 0;
	LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
		i++;
		if (rp->rc_refcnt > 0) {
			hit = 0;
			break;
		}
	}
	/*
	 * Can be a hit only if one entry left.
	 * Note possible hit entry and put nfsrc_templist back on hash
	 * list.
	 */
	if (i != 1)
		hit = 0;
	hitrp = rp = LIST_FIRST(&nfsrc_templist);
	while (rp != LIST_END(&nfsrc_templist)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		LIST_REMOVE(rp, rc_hash);
		LIST_INSERT_HEAD(hp, rp, rc_hash);
		rp = nextrp;
	}
	if (LIST_FIRST(&nfsrc_templist) != LIST_END(&nfsrc_templist))
		panic("nfs gettcp cache templist");

	if (hit) {
		rp = hitrp;
		if ((rp->rc_flag & RC_LOCKED) != 0) {
			/*
			 * Someone else holds the entry lock.  Sleep (at most
			 * 10 seconds) and redo the whole lookup; with PDROP
			 * the mutex is not held on return, which is why
			 * "tryagain" re-takes it.
			 */
			rp->rc_flag |= RC_WANTED;
			(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
			    "nfsrc", 10 * hz);
			goto tryagain;
		}
		if (rp->rc_flag == 0)
			panic("nfs tcp cache0");
		rp->rc_flag |= RC_LOCKED;
		if (rp->rc_flag & RC_INPROG) {
			/* Original request still being processed. */
			NFSD_VNET(nfsstatsv1_p)->srvcache_inproghits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_DROPIT;
		} else if (rp->rc_flag & RC_REPSTATUS) {
			/*
			 * V2 only.
			 */
			NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			/* Rebuild the reply from the saved status. */
			nfsrvd_rephead(nd);
			*(nd->nd_errp) = rp->rc_status;
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else if (rp->rc_flag & RC_REPMBUF) {
			NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			/*
			 * Copy the saved reply; done without the mutex since
			 * M_WAITOK may sleep.  The entry is still RC_LOCKED.
			 */
			nd->nd_mreq = m_copym(rp->rc_reply, 0,
				M_COPYALL, M_WAITOK);
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else {
			panic("nfs tcp cache1");
		}
		/* Drop the entry lock and discard the unused new entry. */
		nfsrc_unlock(rp);
		free(newrp, M_NFSRVCACHE);
		goto out;
	}
	NFSD_VNET(nfsstatsv1_p)->srvcache_misses++;
	atomic_add_int(&NFSD_VNET(nfsstatsv1_p)->srvcache_size, 1);

	/*
	 * For TCP, multiple entries for a key are allowed, so don't
	 * chain it into the hash table until done.
	 * rc_cachetime, which held the connection time during the lookup
	 * above, now becomes the time the entry was cached.
	 */
	newrp->rc_cachetime = NFSD_MONOSEC;
	newrp->rc_flag |= RC_INPROG;
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}
742 
743 /*
744  * Lock a cache entry.
745  */
746 static void
747 nfsrc_lock(struct nfsrvcache *rp)
748 {
749 	struct mtx *mutex;
750 
751 	mutex = nfsrc_cachemutex(rp);
752 	mtx_assert(mutex, MA_OWNED);
753 	while ((rp->rc_flag & RC_LOCKED) != 0) {
754 		rp->rc_flag |= RC_WANTED;
755 		(void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0);
756 	}
757 	rp->rc_flag |= RC_LOCKED;
758 }
759 
760 /*
761  * Unlock a cache entry.
762  */
763 static void
764 nfsrc_unlock(struct nfsrvcache *rp)
765 {
766 	struct mtx *mutex;
767 
768 	mutex = nfsrc_cachemutex(rp);
769 	mtx_lock(mutex);
770 	rp->rc_flag &= ~RC_LOCKED;
771 	nfsrc_wanted(rp);
772 	mtx_unlock(mutex);
773 }
774 
775 /*
776  * Wakeup anyone wanting entry.
777  */
778 static void
779 nfsrc_wanted(struct nfsrvcache *rp)
780 {
781 	if (rp->rc_flag & RC_WANTED) {
782 		rp->rc_flag &= ~RC_WANTED;
783 		wakeup((caddr_t)rp);
784 	}
785 }
786 
787 /*
788  * Free up the entry.
789  * Must not sleep.
790  */
791 static void
792 nfsrc_freecache(struct nfsrvcache *rp)
793 {
794 	struct nfsrchash_bucket *hbp;
795 
796 	LIST_REMOVE(rp, rc_hash);
797 	if (rp->rc_flag & RC_UDP) {
798 		TAILQ_REMOVE(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
799 		NFSD_VNET(nfsrc_udpcachesize)--;
800 	} else if (rp->rc_acked != RC_NO_SEQ) {
801 		hbp = NFSRCAHASH(rp->rc_sockref);
802 		mtx_lock(&hbp->mtx);
803 		if (rp->rc_acked == RC_NO_ACK)
804 			LIST_REMOVE(rp, rc_ahash);
805 		mtx_unlock(&hbp->mtx);
806 	}
807 	nfsrc_wanted(rp);
808 	if (rp->rc_flag & RC_REPMBUF) {
809 		m_freem(rp->rc_reply);
810 		if (!(rp->rc_flag & RC_UDP))
811 			atomic_add_int(&NFSD_VNET(nfsrc_tcpsavedreplies), -1);
812 	}
813 	free(rp, M_NFSRVCACHE);
814 	atomic_add_int(&NFSD_VNET(nfsstatsv1_p)->srvcache_size, -1);
815 }
816 
817 /*
818  * Clean out the cache. Called when nfsserver module is unloaded.
819  */
820 void
821 nfsrvd_cleancache(void)
822 {
823 	struct nfsrvcache *rp, *nextrp;
824 	int i;
825 
826 	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
827 		LIST_FOREACH_SAFE(rp, &NFSD_VNET(nfsrchash_table)[i].tbl,
828 		    rc_hash, nextrp)
829 			nfsrc_freecache(rp);
830 	}
831 	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
832 		LIST_FOREACH_SAFE(rp, &NFSD_VNET(nfsrvudphashtbl)[i], rc_hash,
833 		    nextrp) {
834 			nfsrc_freecache(rp);
835 		}
836 	}
837 	NFSD_VNET(nfsstatsv1_p)->srvcache_size = 0;
838 	NFSD_VNET(nfsrc_tcpsavedreplies) = 0;
839 }
840 
/* Number of buckets in the timeout histogram built by nfsrc_trimcache(). */
#define HISTSIZE	16
/*
 * The basic rule is to get rid of entries that are expired.
 *
 * sockref:	when non-zero, the entries on the ack list for this socket
 *		reference are updated first (see snd_una and final)
 * snd_una:	entries whose recorded rc_tcpseq is at or before this
 *		sequence number are marked RC_ACK and taken off the ack list
 * final:	when non-zero, entries for sockref that are not yet acked
 *		are marked RC_NACK and taken off the ack list
 *
 * After that, at most one thread at a time (guarded by the static
 * "onethread" flag) trims the UDP and TCP caches; other callers return
 * immediately.  Entries that are in progress, locked, wanted or still
 * referenced (rc_refcnt != 0) are never freed here.
 */
void
nfsrc_trimcache(u_int64_t sockref, uint32_t snd_una, int final)
{
	struct nfsrchash_bucket *hbp;
	struct nfsrvcache *rp, *nextrp;
	int force, lastslot, i, j, k, tto, time_histo[HISTSIZE];
	time_t thisstamp;
	/* Time of the last UDP/TCP trim and the round-robin TCP slot. */
	static time_t udp_lasttrim = 0, tcp_lasttrim = 0;
	static int onethread = 0, oneslot = 0;

	if (sockref != 0) {
		hbp = NFSRCAHASH(sockref);
		mtx_lock(&hbp->mtx);
		LIST_FOREACH_SAFE(rp, &hbp->tbl, rc_ahash, nextrp) {
			if (sockref == rp->rc_sockref) {
				if (SEQ_GEQ(snd_una, rp->rc_tcpseq)) {
					rp->rc_acked = RC_ACK;
					LIST_REMOVE(rp, rc_ahash);
				} else if (final) {
					rp->rc_acked = RC_NACK;
					LIST_REMOVE(rp, rc_ahash);
				}
			}
		}
		mtx_unlock(&hbp->mtx);
	}

	/* Only one trimmer at a time; everyone else is done. */
	if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0)
		return;
	/*
	 * UDP: trim when the clock has moved on since the last trim or the
	 * cache has grown to 150% of the high water mark.
	 */
	if (NFSD_MONOSEC != udp_lasttrim ||
	    NFSD_VNET(nfsrc_udpcachesize) >= (nfsrc_udphighwater +
	    nfsrc_udphighwater / 2)) {
		mtx_lock(&nfsrc_udpmtx);
		udp_lasttrim = NFSD_MONOSEC;
		TAILQ_FOREACH_SAFE(rp, &NFSD_VNET(nfsrvudplru), rc_lru,
		    nextrp) {
			if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
			     && rp->rc_refcnt == 0
			     && ((rp->rc_flag & RC_REFCNT) ||
				 udp_lasttrim > rp->rc_timestamp ||
				 NFSD_VNET(nfsrc_udpcachesize) >
				 nfsrc_udphighwater))
				nfsrc_freecache(rp);
		}
		mtx_unlock(&nfsrc_udpmtx);
	}
	if (NFSD_MONOSEC != tcp_lasttrim ||
	    NFSD_VNET(nfsrc_tcpsavedreplies) >= nfsrc_tcphighwater) {
		/*
		 * "force" is a quarter of the high water mark.  When the
		 * number of saved replies is within that margin of the high
		 * water mark, all slots are scanned and a histogram of the
		 * remaining timeouts is collected for the second pass below.
		 * Otherwise all slots are scanned when the clock has moved
		 * on, or just one slot (round robin via "oneslot") if not.
		 */
		force = nfsrc_tcphighwater / 4;
		if (force > 0 &&
		    NFSD_VNET(nfsrc_tcpsavedreplies) + force >=
		    nfsrc_tcphighwater) {
			for (i = 0; i < HISTSIZE; i++)
				time_histo[i] = 0;
			i = 0;
			lastslot = NFSRVCACHE_HASHSIZE - 1;
		} else {
			force = 0;
			if (NFSD_MONOSEC != tcp_lasttrim) {
				i = 0;
				lastslot = NFSRVCACHE_HASHSIZE - 1;
			} else {
				lastslot = i = oneslot;
				if (++oneslot >= NFSRVCACHE_HASHSIZE)
					oneslot = 0;
			}
		}
		tto = nfsrc_tcptimeout;
		tcp_lasttrim = NFSD_MONOSEC;
		for (; i <= lastslot; i++) {
			mtx_lock(&NFSD_VNET(nfsrchash_table)[i].mtx);
			LIST_FOREACH_SAFE(rp,
			    &NFSD_VNET(nfsrchash_table)[i].tbl, rc_hash,
			    nextrp) {
				if (!(rp->rc_flag &
				     (RC_INPROG|RC_LOCKED|RC_WANTED))
				     && rp->rc_refcnt == 0) {
					/*
					 * Free entries that once held a seqid#
					 * reference, have timed out, or whose
					 * reply was acked.
					 */
					if ((rp->rc_flag & RC_REFCNT) ||
					    tcp_lasttrim > rp->rc_timestamp ||
					    rp->rc_acked == RC_ACK) {
						nfsrc_freecache(rp);
						continue;
					}

					if (force == 0)
						continue;
					/*
					 * The timestamps range from roughly the
					 * present (tcp_lasttrim) to the present
					 * + nfsrc_tcptimeout. Generate a simple
					 * histogram of where the timeouts fall.
					 */
					j = rp->rc_timestamp - tcp_lasttrim;
					if (j >= tto)
						j = HISTSIZE - 1;
					else if (j < 0)
						j = 0;
					else
						j = j * HISTSIZE / tto;
					time_histo[j]++;
				}
			}
			mtx_unlock(&NFSD_VNET(nfsrchash_table)[i].mtx);
		}
		if (force) {
			/*
			 * Trim some more with a smaller timeout of as little
			 * as 20% of nfsrc_tcptimeout to try and get below
			 * 80% of the nfsrc_tcphighwater.
			 * The histogram is walked until more than "force"
			 * entries are covered; the upper edge of that bucket
			 * (at least one second) becomes the cut-off.
			 */
			k = 0;
			for (i = 0; i < (HISTSIZE - 2); i++) {
				k += time_histo[i];
				if (k > force)
					break;
			}
			k = tto * (i + 1) / HISTSIZE;
			if (k < 1)
				k = 1;
			thisstamp = tcp_lasttrim + k;
			for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
				mtx_lock(&NFSD_VNET(nfsrchash_table)[i].mtx);
				LIST_FOREACH_SAFE(rp,
				    &NFSD_VNET(nfsrchash_table)[i].tbl,
				    rc_hash, nextrp) {
					if (!(rp->rc_flag &
					     (RC_INPROG|RC_LOCKED|RC_WANTED))
					     && rp->rc_refcnt == 0
					     && ((rp->rc_flag & RC_REFCNT) ||
						 thisstamp > rp->rc_timestamp ||
						 rp->rc_acked == RC_ACK))
						nfsrc_freecache(rp);
				}
				mtx_unlock(&NFSD_VNET(nfsrchash_table)[i].mtx);
			}
		}
	}
	/* Let the next caller trim. */
	atomic_store_rel_int(&onethread, 0);
}
984 
985 /*
986  * Add a seqid# reference to the cache entry.
987  */
988 void
989 nfsrvd_refcache(struct nfsrvcache *rp)
990 {
991 	struct mtx *mutex;
992 
993 	if (rp == NULL)
994 		/* For NFSv4.1, there is no cache entry. */
995 		return;
996 	mutex = nfsrc_cachemutex(rp);
997 	mtx_lock(mutex);
998 	if (rp->rc_refcnt < 0)
999 		panic("nfs cache refcnt");
1000 	rp->rc_refcnt++;
1001 	mtx_unlock(mutex);
1002 }
1003 
1004 /*
1005  * Dereference a seqid# cache entry.
1006  */
1007 void
1008 nfsrvd_derefcache(struct nfsrvcache *rp)
1009 {
1010 	struct mtx *mutex;
1011 
1012 	mutex = nfsrc_cachemutex(rp);
1013 	mtx_lock(mutex);
1014 	if (rp->rc_refcnt <= 0)
1015 		panic("nfs cache derefcnt");
1016 	rp->rc_refcnt--;
1017 	if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
1018 		nfsrc_freecache(rp);
1019 	mtx_unlock(mutex);
1020 }
1021 
1022 /*
1023  * Calculate the length of the mbuf list and a checksum on the first up to
1024  * NFSRVCACHE_CHECKLEN bytes.
1025  */
1026 static int
1027 nfsrc_getlenandcksum(struct mbuf *m1, u_int16_t *cksum)
1028 {
1029 	int len = 0, cklen;
1030 	struct mbuf *m;
1031 
1032 	m = m1;
1033 	while (m) {
1034 		len += m->m_len;
1035 		m = m->m_next;
1036 	}
1037 	cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
1038 	*cksum = in_cksum(m1, cklen);
1039 	return (len);
1040 }
1041 
/*
 * Mark a TCP connection that is seeing retries. Should never happen for
 * NFSv4.
 *
 * Currently a no-op: the body is empty and sockref is unused.
 * nfsrc_gettcp() calls it when a hit arrives on the same socket as the
 * cached entry.
 */
static void
nfsrc_marksametcpconn(u_int64_t sockref)
{
}
1050