xref: /freebsd/sys/fs/nfsserver/nfs_nfsdcache.c (revision b2d48be1bc7df45ddd13b143a160d0acb5a383c5)
1 /*-
2  * Copyright (c) 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * Rick Macklem at The University of Guelph.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 4. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  */
33 
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36 
37 /*
38  * Here is the basic algorithm:
39  * First, some design criteria I used:
40  * - I think a false hit is more serious than a false miss
41  * - A false hit for an RPC that has Op(s) that order via seqid# must be
42  *   avoided at all cost
43  * - A valid hit will probably happen a long time after the original reply
44  *   and the TCP socket that the original request was received on will no
45  *   longer be active
46  *   (The long time delay implies to me that LRU is not appropriate.)
47  * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
48  *   in them as well as minimizing the risk of redoing retried non-idempotent
49  *   Ops.
50  * Because it is biased towards avoiding false hits, multiple entries with
51  * the same xid are to be expected, especially for the case of the entry
52  * in the cache being related to a seqid# sequenced Op.
53  *
54  * The basic algorithm I'm about to code up:
55  * - Null RPCs bypass the cache and are just done
56  * For TCP
57  * 	- key on <xid, NFS version> (as noted above, there can be several
58  * 				     entries with the same key)
59  * 	When a request arrives:
60  * 		For all that match key
61  * 		- if RPC# != OR request_size !=
62  * 			- not a match with this one
63  * 		- if NFSv4 and received on same TCP socket OR
64  *			received on a TCP connection created before the
65  *			entry was cached
66  * 			- not a match with this one
67  * 			(V2,3 clients might retry on same TCP socket)
68  * 		- calculate checksum on first N bytes of NFS XDR
69  * 		- if checksum !=
70  * 			- not a match for this one
71  * 		If any of the remaining ones that match has a
72  * 			seqid_refcnt > 0
73  * 			- not a match (go do RPC, using new cache entry)
74  * 		If one match left
75  * 			- a hit (reply from cache)
76  * 		else
77  * 			- miss (go do RPC, using new cache entry)
78  *
79  * 	During processing of NFSv4 request:
80  * 		- set a flag when a non-idempotent Op is processed
81  * 		- when an Op that uses a seqid# (Open,...) is processed
82  * 			- if same seqid# as referenced entry in cache
83  * 				- free new cache entry
84  * 				- reply from referenced cache entry
85  * 			  else if next seqid# in order
86  * 				- free referenced cache entry
87  * 				- increment seqid_refcnt on new cache entry
88  * 				- set pointer from Openowner/Lockowner to
89  * 					new cache entry (aka reference it)
90  * 			  else if first seqid# in sequence
91  * 				- increment seqid_refcnt on new cache entry
92  * 				- set pointer from Openowner/Lockowner to
93  * 					new cache entry (aka reference it)
94  *
95  * 	At end of RPC processing:
96  * 		- if seqid_refcnt > 0 OR flagged non-idempotent on new
97  * 			cache entry
98  * 			- save reply in cache entry
99  * 			- calculate checksum on first N bytes of NFS XDR
100  * 				request
101  * 			- note op and length of XDR request (in bytes)
102  * 			- timestamp it
103  * 		  else
104  * 			- free new cache entry
105  * 		- Send reply (noting info for socket activity check, below)
106  *
107  * 	For cache entries saved above:
108  * 		- if saved since seqid_refcnt was > 0
109  * 			- free when seqid_refcnt decrements to 0
110  * 			  (when next one in sequence is processed above, or
111  * 			   when Openowner/Lockowner is discarded)
112  * 		  else { non-idempotent Op(s) }
113  * 			- free when
114  * 				- some further activity observed on same
115  * 					socket
116  * 				  (I'm not yet sure how I'm going to do
117  * 				   this. Maybe look at the TCP connection
118  * 				   to see if the send_tcp_sequence# is well
119  * 				   past sent reply OR K additional RPCs
120  * 				   replied on same socket OR?)
121  * 			  OR
122  * 				- when very old (hours, days, weeks?)
123  *
124  * For UDP (v2, 3 only), pretty much the old way:
125  * - key on <xid, NFS version, RPC#, Client host ip#>
126  *   (at most one entry for each key)
127  *
128  * When a Request arrives:
129  * - if a match with entry via key
130  * 	- if RPC marked In_progress
131  * 		- discard request (don't send reply)
132  * 	  else
133  * 		- reply from cache
134  * 		- timestamp cache entry
135  *   else
136  * 	- add entry to cache, marked In_progress
137  * 	- do RPC
138  * 	- when RPC done
139  * 		- if RPC# non-idempotent
140  * 			- mark entry Done (not In_progress)
141  * 			- save reply
142  * 			- timestamp cache entry
143  * 		  else
144  * 			- free cache entry
145  * 		- send reply
146  *
147  * Later, entries with saved replies are free'd a short time (few minutes)
148  * after reply sent (timestamp).
149  * Reference: Chet Juszczak, "Improving the Performance and Correctness
150  *		of an NFS Server", in Proc. Winter 1989 USENIX Conference,
151  *		pages 53-63. San Diego, February 1989.
152  *	 for the UDP case.
153  * nfsrc_floodlevel is set to the allowable upper limit for saved replies
154  *	for TCP. For V3, a reply won't be saved when the flood level is
155  *	hit. For V4, the non-idempotent Op will return NFSERR_RESOURCE in
156  *	that case. This level should be set high enough that this almost
157  *	never happens.
158  */
159 #ifndef APPLEKEXT
160 #include <fs/nfs/nfsport.h>
161 
162 extern struct nfsstats newnfsstats;
163 extern struct mtx nfsrc_udpmtx;
164 extern struct nfsrchash_bucket nfsrchash_table[NFSRVCACHE_HASHSIZE];
165 extern struct nfsrchash_bucket nfsrcahash_table[NFSRVCACHE_HASHSIZE];
166 int nfsrc_floodlevel = NFSRVCACHE_FLOODLEVEL, nfsrc_tcpsavedreplies = 0;
167 #endif	/* !APPLEKEXT */
168 
169 SYSCTL_DECL(_vfs_nfsd);
170 
171 static u_int	nfsrc_tcphighwater = 0;
172 static int
173 sysctl_tcphighwater(SYSCTL_HANDLER_ARGS)
174 {
175 	int error, newhighwater;
176 
177 	newhighwater = nfsrc_tcphighwater;
178 	error = sysctl_handle_int(oidp, &newhighwater, 0, req);
179 	if (error != 0 || req->newptr == NULL)
180 		return (error);
181 	if (newhighwater < 0)
182 		return (EINVAL);
183 	if (newhighwater >= nfsrc_floodlevel)
184 		nfsrc_floodlevel = newhighwater + newhighwater / 5;
185 	nfsrc_tcphighwater = newhighwater;
186 	return (0);
187 }
188 SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater, CTLTYPE_UINT | CTLFLAG_RW, 0,
189     sizeof(nfsrc_tcphighwater), sysctl_tcphighwater, "IU",
190     "High water mark for TCP cache entries");
191 
192 static u_int	nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
193 SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
194     &nfsrc_udphighwater, 0,
195     "High water mark for UDP cache entries");
196 static u_int	nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
197 SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
198     &nfsrc_tcptimeout, 0,
199     "Timeout for TCP entries in the DRC");
200 static u_int nfsrc_tcpnonidempotent = 1;
201 SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
202     &nfsrc_tcpnonidempotent, 0,
203     "Enable the DRC for NFS over TCP");
204 
205 static int nfsrc_udpcachesize = 0;
206 static TAILQ_HEAD(, nfsrvcache) nfsrvudplru;
207 static struct nfsrvhashhead nfsrvudphashtbl[NFSRVCACHE_HASHSIZE];
208 
209 /*
210  * and the reverse mapping from generic to Version 2 procedure numbers
211  */
212 static int newnfsv2_procid[NFS_V3NPROCS] = {
213 	NFSV2PROC_NULL,
214 	NFSV2PROC_GETATTR,
215 	NFSV2PROC_SETATTR,
216 	NFSV2PROC_LOOKUP,
217 	NFSV2PROC_NOOP,
218 	NFSV2PROC_READLINK,
219 	NFSV2PROC_READ,
220 	NFSV2PROC_WRITE,
221 	NFSV2PROC_CREATE,
222 	NFSV2PROC_MKDIR,
223 	NFSV2PROC_SYMLINK,
224 	NFSV2PROC_CREATE,
225 	NFSV2PROC_REMOVE,
226 	NFSV2PROC_RMDIR,
227 	NFSV2PROC_RENAME,
228 	NFSV2PROC_LINK,
229 	NFSV2PROC_READDIR,
230 	NFSV2PROC_NOOP,
231 	NFSV2PROC_STATFS,
232 	NFSV2PROC_NOOP,
233 	NFSV2PROC_NOOP,
234 	NFSV2PROC_NOOP,
235 };
236 
/* Fold an xid (or socket reference) into a hash table slot number. */
#define	nfsrc_hash(xid)	(((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)

/* Hash chain heads for the UDP table, the TCP table and the ack table. */
#define	NFSRCUDPHASH(xid)	(&nfsrvudphashtbl[nfsrc_hash(xid)])
#define	NFSRCHASH(xid)		(&nfsrchash_table[nfsrc_hash(xid)].tbl)
#define	NFSRCAHASH(xid)		(&nfsrcahash_table[nfsrc_hash(xid)])

#define	TRUE	1
#define	FALSE	0

/* Maximum number of request bytes covered by the cache checksum. */
#define	NFSRVCACHE_CHECKLEN	100
246 
247 /* True iff the rpc reply is an nfs status ONLY! */
248 static int nfsv2_repstat[NFS_V3NPROCS] = {
249 	FALSE,
250 	FALSE,
251 	FALSE,
252 	FALSE,
253 	FALSE,
254 	FALSE,
255 	FALSE,
256 	FALSE,
257 	FALSE,
258 	FALSE,
259 	TRUE,
260 	TRUE,
261 	TRUE,
262 	TRUE,
263 	FALSE,
264 	TRUE,
265 	FALSE,
266 	FALSE,
267 	FALSE,
268 	FALSE,
269 	FALSE,
270 	FALSE,
271 };
272 
/*
 * Address family of the client recorded in a cache entry: AF_INET6 when
 * RC_INETIPV6 is set in rc_flag, AF_INET otherwise.
 */
#define	NETFAMILY(rp)							\
	(((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)
278 
279 /* local functions */
280 static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
281 static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
282 static void nfsrc_lock(struct nfsrvcache *rp);
283 static void nfsrc_unlock(struct nfsrvcache *rp);
284 static void nfsrc_wanted(struct nfsrvcache *rp);
285 static void nfsrc_freecache(struct nfsrvcache *rp);
286 static int nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum);
287 static void nfsrc_marksametcpconn(u_int64_t);
288 
289 /*
290  * Return the correct mutex for this cache entry.
291  */
292 static __inline struct mtx *
293 nfsrc_cachemutex(struct nfsrvcache *rp)
294 {
295 
296 	if ((rp->rc_flag & RC_UDP) != 0)
297 		return (&nfsrc_udpmtx);
298 	return (&nfsrchash_table[nfsrc_hash(rp->rc_xid)].mtx);
299 }
300 
301 /*
302  * Initialize the server request cache list
303  */
304 APPLESTATIC void
305 nfsrvd_initcache(void)
306 {
307 	int i;
308 	static int inited = 0;
309 
310 	if (inited)
311 		return;
312 	inited = 1;
313 	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
314 		LIST_INIT(&nfsrvudphashtbl[i]);
315 		LIST_INIT(&nfsrchash_table[i].tbl);
316 		LIST_INIT(&nfsrcahash_table[i].tbl);
317 	}
318 	TAILQ_INIT(&nfsrvudplru);
319 	nfsrc_tcpsavedreplies = 0;
320 	nfsrc_udpcachesize = 0;
321 	newnfsstats.srvcache_tcppeak = 0;
322 	newnfsstats.srvcache_size = 0;
323 }
324 
325 /*
326  * Get a cache entry for this request. Basically just malloc a new one
327  * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
328  */
329 APPLESTATIC int
330 nfsrvd_getcache(struct nfsrv_descript *nd)
331 {
332 	struct nfsrvcache *newrp;
333 	int ret;
334 
335 	if (nd->nd_procnum == NFSPROC_NULL)
336 		panic("nfsd cache null");
337 	MALLOC(newrp, struct nfsrvcache *, sizeof (struct nfsrvcache),
338 	    M_NFSRVCACHE, M_WAITOK);
339 	NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
340 	if (nd->nd_flag & ND_NFSV4)
341 		newrp->rc_flag = RC_NFSV4;
342 	else if (nd->nd_flag & ND_NFSV3)
343 		newrp->rc_flag = RC_NFSV3;
344 	else
345 		newrp->rc_flag = RC_NFSV2;
346 	newrp->rc_xid = nd->nd_retxid;
347 	newrp->rc_proc = nd->nd_procnum;
348 	newrp->rc_sockref = nd->nd_sockref;
349 	newrp->rc_cachetime = nd->nd_tcpconntime;
350 	if (nd->nd_flag & ND_SAMETCPCONN)
351 		newrp->rc_flag |= RC_SAMETCPCONN;
352 	if (nd->nd_nam2 != NULL) {
353 		newrp->rc_flag |= RC_UDP;
354 		ret = nfsrc_getudp(nd, newrp);
355 	} else {
356 		ret = nfsrc_gettcp(nd, newrp);
357 	}
358 	NFSEXITCODE2(0, nd);
359 	return (ret);
360 }
361 
362 /*
363  * For UDP (v2, v3):
364  * - key on <xid, NFS version, RPC#, Client host ip#>
365  *   (at most one entry for each key)
366  */
367 static int
368 nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
369 {
370 	struct nfsrvcache *rp;
371 	struct sockaddr_in *saddr;
372 	struct sockaddr_in6 *saddr6;
373 	struct nfsrvhashhead *hp;
374 	int ret = 0;
375 	struct mtx *mutex;
376 
377 	mutex = nfsrc_cachemutex(newrp);
378 	hp = NFSRCUDPHASH(newrp->rc_xid);
379 loop:
380 	mtx_lock(mutex);
381 	LIST_FOREACH(rp, hp, rc_hash) {
382 	    if (newrp->rc_xid == rp->rc_xid &&
383 		newrp->rc_proc == rp->rc_proc &&
384 		(newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
385 		nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
386 			if ((rp->rc_flag & RC_LOCKED) != 0) {
387 				rp->rc_flag |= RC_WANTED;
388 				(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
389 				    "nfsrc", 10 * hz);
390 				goto loop;
391 			}
392 			if (rp->rc_flag == 0)
393 				panic("nfs udp cache0");
394 			rp->rc_flag |= RC_LOCKED;
395 			TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
396 			TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
397 			if (rp->rc_flag & RC_INPROG) {
398 				newnfsstats.srvcache_inproghits++;
399 				mtx_unlock(mutex);
400 				ret = RC_DROPIT;
401 			} else if (rp->rc_flag & RC_REPSTATUS) {
402 				/*
403 				 * V2 only.
404 				 */
405 				newnfsstats.srvcache_nonidemdonehits++;
406 				mtx_unlock(mutex);
407 				nfsrvd_rephead(nd);
408 				*(nd->nd_errp) = rp->rc_status;
409 				ret = RC_REPLY;
410 				rp->rc_timestamp = NFSD_MONOSEC +
411 					NFSRVCACHE_UDPTIMEOUT;
412 			} else if (rp->rc_flag & RC_REPMBUF) {
413 				newnfsstats.srvcache_nonidemdonehits++;
414 				mtx_unlock(mutex);
415 				nd->nd_mreq = m_copym(rp->rc_reply, 0,
416 					M_COPYALL, M_WAITOK);
417 				ret = RC_REPLY;
418 				rp->rc_timestamp = NFSD_MONOSEC +
419 					NFSRVCACHE_UDPTIMEOUT;
420 			} else {
421 				panic("nfs udp cache1");
422 			}
423 			nfsrc_unlock(rp);
424 			free((caddr_t)newrp, M_NFSRVCACHE);
425 			goto out;
426 		}
427 	}
428 	newnfsstats.srvcache_misses++;
429 	atomic_add_int(&newnfsstats.srvcache_size, 1);
430 	nfsrc_udpcachesize++;
431 
432 	newrp->rc_flag |= RC_INPROG;
433 	saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
434 	if (saddr->sin_family == AF_INET)
435 		newrp->rc_inet = saddr->sin_addr.s_addr;
436 	else if (saddr->sin_family == AF_INET6) {
437 		saddr6 = (struct sockaddr_in6 *)saddr;
438 		NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
439 		    sizeof (struct in6_addr));
440 		newrp->rc_flag |= RC_INETIPV6;
441 	}
442 	LIST_INSERT_HEAD(hp, newrp, rc_hash);
443 	TAILQ_INSERT_TAIL(&nfsrvudplru, newrp, rc_lru);
444 	mtx_unlock(mutex);
445 	nd->nd_rp = newrp;
446 	ret = RC_DOIT;
447 
448 out:
449 	NFSEXITCODE2(0, nd);
450 	return (ret);
451 }
452 
453 /*
454  * Update a request cache entry after the rpc has been done
455  */
456 APPLESTATIC struct nfsrvcache *
457 nfsrvd_updatecache(struct nfsrv_descript *nd)
458 {
459 	struct nfsrvcache *rp;
460 	struct nfsrvcache *retrp = NULL;
461 	mbuf_t m;
462 	struct mtx *mutex;
463 
464 	rp = nd->nd_rp;
465 	if (!rp)
466 		panic("nfsrvd_updatecache null rp");
467 	nd->nd_rp = NULL;
468 	mutex = nfsrc_cachemutex(rp);
469 	mtx_lock(mutex);
470 	nfsrc_lock(rp);
471 	if (!(rp->rc_flag & RC_INPROG))
472 		panic("nfsrvd_updatecache not inprog");
473 	rp->rc_flag &= ~RC_INPROG;
474 	if (rp->rc_flag & RC_UDP) {
475 		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
476 		TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
477 	}
478 
479 	/*
480 	 * Reply from cache is a special case returned by nfsrv_checkseqid().
481 	 */
482 	if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
483 		newnfsstats.srvcache_nonidemdonehits++;
484 		mtx_unlock(mutex);
485 		nd->nd_repstat = 0;
486 		if (nd->nd_mreq)
487 			mbuf_freem(nd->nd_mreq);
488 		if (!(rp->rc_flag & RC_REPMBUF))
489 			panic("reply from cache");
490 		nd->nd_mreq = m_copym(rp->rc_reply, 0,
491 		    M_COPYALL, M_WAITOK);
492 		rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
493 		nfsrc_unlock(rp);
494 		goto out;
495 	}
496 
497 	/*
498 	 * If rc_refcnt > 0, save it
499 	 * For UDP, save it if ND_SAVEREPLY is set
500 	 * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set
501 	 */
502 	if (nd->nd_repstat != NFSERR_DONTREPLY &&
503 	    (rp->rc_refcnt > 0 ||
504 	     ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
505 	     ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
506 	      nfsrc_tcpsavedreplies <= nfsrc_floodlevel &&
507 	      nfsrc_tcpnonidempotent))) {
508 		if (rp->rc_refcnt > 0) {
509 			if (!(rp->rc_flag & RC_NFSV4))
510 				panic("update_cache refcnt");
511 			rp->rc_flag |= RC_REFCNT;
512 		}
513 		if ((nd->nd_flag & ND_NFSV2) &&
514 		    nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
515 			rp->rc_status = nd->nd_repstat;
516 			rp->rc_flag |= RC_REPSTATUS;
517 			mtx_unlock(mutex);
518 		} else {
519 			if (!(rp->rc_flag & RC_UDP)) {
520 			    atomic_add_int(&nfsrc_tcpsavedreplies, 1);
521 			    if (nfsrc_tcpsavedreplies >
522 				newnfsstats.srvcache_tcppeak)
523 				newnfsstats.srvcache_tcppeak =
524 				    nfsrc_tcpsavedreplies;
525 			}
526 			mtx_unlock(mutex);
527 			m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
528 			mtx_lock(mutex);
529 			rp->rc_reply = m;
530 			rp->rc_flag |= RC_REPMBUF;
531 			mtx_unlock(mutex);
532 		}
533 		if (rp->rc_flag & RC_UDP) {
534 			rp->rc_timestamp = NFSD_MONOSEC +
535 			    NFSRVCACHE_UDPTIMEOUT;
536 			nfsrc_unlock(rp);
537 		} else {
538 			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
539 			if (rp->rc_refcnt > 0)
540 				nfsrc_unlock(rp);
541 			else
542 				retrp = rp;
543 		}
544 	} else {
545 		nfsrc_freecache(rp);
546 		mtx_unlock(mutex);
547 	}
548 
549 out:
550 	NFSEXITCODE2(0, nd);
551 	return (retrp);
552 }
553 
554 /*
555  * Invalidate and, if possible, free an in prog cache entry.
556  * Must not sleep.
557  */
558 APPLESTATIC void
559 nfsrvd_delcache(struct nfsrvcache *rp)
560 {
561 	struct mtx *mutex;
562 
563 	mutex = nfsrc_cachemutex(rp);
564 	if (!(rp->rc_flag & RC_INPROG))
565 		panic("nfsrvd_delcache not in prog");
566 	mtx_lock(mutex);
567 	rp->rc_flag &= ~RC_INPROG;
568 	if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
569 		nfsrc_freecache(rp);
570 	mtx_unlock(mutex);
571 }
572 
573 /*
574  * Called after nfsrvd_updatecache() once the reply is sent, to update
575  * the entry's sequence number and unlock it. The argument is
576  * the pointer returned by nfsrvd_updatecache().
577  */
578 APPLESTATIC void
579 nfsrvd_sentcache(struct nfsrvcache *rp, int have_seq, uint32_t seq)
580 {
581 	struct nfsrchash_bucket *hbp;
582 
583 	KASSERT(rp->rc_flag & RC_LOCKED, ("nfsrvd_sentcache not locked"));
584 	if (have_seq) {
585 		hbp = NFSRCAHASH(rp->rc_sockref);
586 		mtx_lock(&hbp->mtx);
587 		rp->rc_tcpseq = seq;
588 		if (rp->rc_acked != RC_NO_ACK)
589 			LIST_INSERT_HEAD(&hbp->tbl, rp, rc_ahash);
590 		rp->rc_acked = RC_NO_ACK;
591 		mtx_unlock(&hbp->mtx);
592 	}
593 	nfsrc_unlock(rp);
594 }
595 
596 /*
597  * Get a cache entry for TCP
598  * - key on <xid, nfs version>
599  *   (allow multiple entries for a given key)
600  */
601 static int
602 nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
603 {
604 	struct nfsrvcache *rp, *nextrp;
605 	int i;
606 	struct nfsrvcache *hitrp;
607 	struct nfsrvhashhead *hp, nfsrc_templist;
608 	int hit, ret = 0;
609 	struct mtx *mutex;
610 
611 	mutex = nfsrc_cachemutex(newrp);
612 	hp = NFSRCHASH(newrp->rc_xid);
613 	newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
614 tryagain:
615 	mtx_lock(mutex);
616 	hit = 1;
617 	LIST_INIT(&nfsrc_templist);
618 	/*
619 	 * Get all the matches and put them on the temp list.
620 	 */
621 	rp = LIST_FIRST(hp);
622 	while (rp != LIST_END(hp)) {
623 		nextrp = LIST_NEXT(rp, rc_hash);
624 		if (newrp->rc_xid == rp->rc_xid &&
625 		    (!(rp->rc_flag & RC_INPROG) ||
626 		     ((newrp->rc_flag & RC_SAMETCPCONN) &&
627 		      newrp->rc_sockref == rp->rc_sockref)) &&
628 		    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
629 		    newrp->rc_proc == rp->rc_proc &&
630 		    ((newrp->rc_flag & RC_NFSV4) &&
631 		     newrp->rc_sockref != rp->rc_sockref &&
632 		     newrp->rc_cachetime >= rp->rc_cachetime)
633 		    && newrp->rc_reqlen == rp->rc_reqlen &&
634 		    newrp->rc_cksum == rp->rc_cksum) {
635 			LIST_REMOVE(rp, rc_hash);
636 			LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
637 		}
638 		rp = nextrp;
639 	}
640 
641 	/*
642 	 * Now, use nfsrc_templist to decide if there is a match.
643 	 */
644 	i = 0;
645 	LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
646 		i++;
647 		if (rp->rc_refcnt > 0) {
648 			hit = 0;
649 			break;
650 		}
651 	}
652 	/*
653 	 * Can be a hit only if one entry left.
654 	 * Note possible hit entry and put nfsrc_templist back on hash
655 	 * list.
656 	 */
657 	if (i != 1)
658 		hit = 0;
659 	hitrp = rp = LIST_FIRST(&nfsrc_templist);
660 	while (rp != LIST_END(&nfsrc_templist)) {
661 		nextrp = LIST_NEXT(rp, rc_hash);
662 		LIST_REMOVE(rp, rc_hash);
663 		LIST_INSERT_HEAD(hp, rp, rc_hash);
664 		rp = nextrp;
665 	}
666 	if (LIST_FIRST(&nfsrc_templist) != LIST_END(&nfsrc_templist))
667 		panic("nfs gettcp cache templist");
668 
669 	if (hit) {
670 		rp = hitrp;
671 		if ((rp->rc_flag & RC_LOCKED) != 0) {
672 			rp->rc_flag |= RC_WANTED;
673 			(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
674 			    "nfsrc", 10 * hz);
675 			goto tryagain;
676 		}
677 		if (rp->rc_flag == 0)
678 			panic("nfs tcp cache0");
679 		rp->rc_flag |= RC_LOCKED;
680 		if (rp->rc_flag & RC_INPROG) {
681 			newnfsstats.srvcache_inproghits++;
682 			mtx_unlock(mutex);
683 			if (newrp->rc_sockref == rp->rc_sockref)
684 				nfsrc_marksametcpconn(rp->rc_sockref);
685 			ret = RC_DROPIT;
686 		} else if (rp->rc_flag & RC_REPSTATUS) {
687 			/*
688 			 * V2 only.
689 			 */
690 			newnfsstats.srvcache_nonidemdonehits++;
691 			mtx_unlock(mutex);
692 			if (newrp->rc_sockref == rp->rc_sockref)
693 				nfsrc_marksametcpconn(rp->rc_sockref);
694 			ret = RC_REPLY;
695 			nfsrvd_rephead(nd);
696 			*(nd->nd_errp) = rp->rc_status;
697 			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
698 		} else if (rp->rc_flag & RC_REPMBUF) {
699 			newnfsstats.srvcache_nonidemdonehits++;
700 			mtx_unlock(mutex);
701 			if (newrp->rc_sockref == rp->rc_sockref)
702 				nfsrc_marksametcpconn(rp->rc_sockref);
703 			ret = RC_REPLY;
704 			nd->nd_mreq = m_copym(rp->rc_reply, 0,
705 				M_COPYALL, M_WAITOK);
706 			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
707 		} else {
708 			panic("nfs tcp cache1");
709 		}
710 		nfsrc_unlock(rp);
711 		free((caddr_t)newrp, M_NFSRVCACHE);
712 		goto out;
713 	}
714 	newnfsstats.srvcache_misses++;
715 	atomic_add_int(&newnfsstats.srvcache_size, 1);
716 
717 	/*
718 	 * For TCP, multiple entries for a key are allowed, so don't
719 	 * chain it into the hash table until done.
720 	 */
721 	newrp->rc_cachetime = NFSD_MONOSEC;
722 	newrp->rc_flag |= RC_INPROG;
723 	LIST_INSERT_HEAD(hp, newrp, rc_hash);
724 	mtx_unlock(mutex);
725 	nd->nd_rp = newrp;
726 	ret = RC_DOIT;
727 
728 out:
729 	NFSEXITCODE2(0, nd);
730 	return (ret);
731 }
732 
733 /*
734  * Lock a cache entry.
735  */
736 static void
737 nfsrc_lock(struct nfsrvcache *rp)
738 {
739 	struct mtx *mutex;
740 
741 	mutex = nfsrc_cachemutex(rp);
742 	mtx_assert(mutex, MA_OWNED);
743 	while ((rp->rc_flag & RC_LOCKED) != 0) {
744 		rp->rc_flag |= RC_WANTED;
745 		(void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0);
746 	}
747 	rp->rc_flag |= RC_LOCKED;
748 }
749 
750 /*
751  * Unlock a cache entry.
752  */
753 static void
754 nfsrc_unlock(struct nfsrvcache *rp)
755 {
756 	struct mtx *mutex;
757 
758 	mutex = nfsrc_cachemutex(rp);
759 	mtx_lock(mutex);
760 	rp->rc_flag &= ~RC_LOCKED;
761 	nfsrc_wanted(rp);
762 	mtx_unlock(mutex);
763 }
764 
765 /*
766  * Wakeup anyone wanting entry.
767  */
768 static void
769 nfsrc_wanted(struct nfsrvcache *rp)
770 {
771 	if (rp->rc_flag & RC_WANTED) {
772 		rp->rc_flag &= ~RC_WANTED;
773 		wakeup((caddr_t)rp);
774 	}
775 }
776 
777 /*
778  * Free up the entry.
779  * Must not sleep.
780  */
781 static void
782 nfsrc_freecache(struct nfsrvcache *rp)
783 {
784 	struct nfsrchash_bucket *hbp;
785 
786 	LIST_REMOVE(rp, rc_hash);
787 	if (rp->rc_flag & RC_UDP) {
788 		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
789 		nfsrc_udpcachesize--;
790 	} else if (rp->rc_acked != RC_NO_SEQ) {
791 		hbp = NFSRCAHASH(rp->rc_sockref);
792 		mtx_lock(&hbp->mtx);
793 		if (rp->rc_acked == RC_NO_ACK)
794 			LIST_REMOVE(rp, rc_ahash);
795 		mtx_unlock(&hbp->mtx);
796 	}
797 	nfsrc_wanted(rp);
798 	if (rp->rc_flag & RC_REPMBUF) {
799 		mbuf_freem(rp->rc_reply);
800 		if (!(rp->rc_flag & RC_UDP))
801 			atomic_add_int(&nfsrc_tcpsavedreplies, -1);
802 	}
803 	FREE((caddr_t)rp, M_NFSRVCACHE);
804 	atomic_add_int(&newnfsstats.srvcache_size, -1);
805 }
806 
807 /*
808  * Clean out the cache. Called when nfsserver module is unloaded.
809  */
810 APPLESTATIC void
811 nfsrvd_cleancache(void)
812 {
813 	struct nfsrvcache *rp, *nextrp;
814 	int i;
815 
816 	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
817 		mtx_lock(&nfsrchash_table[i].mtx);
818 		LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash, nextrp)
819 			nfsrc_freecache(rp);
820 		mtx_unlock(&nfsrchash_table[i].mtx);
821 	}
822 	mtx_lock(&nfsrc_udpmtx);
823 	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
824 		LIST_FOREACH_SAFE(rp, &nfsrvudphashtbl[i], rc_hash, nextrp) {
825 			nfsrc_freecache(rp);
826 		}
827 	}
828 	newnfsstats.srvcache_size = 0;
829 	mtx_unlock(&nfsrc_udpmtx);
830 	nfsrc_tcpsavedreplies = 0;
831 }
832 
833 #define HISTSIZE	16
834 /*
835  * The basic rule is to get rid of entries that are expired.
836  */
837 void
838 nfsrc_trimcache(u_int64_t sockref, uint32_t snd_una, int final)
839 {
840 	struct nfsrchash_bucket *hbp;
841 	struct nfsrvcache *rp, *nextrp;
842 	int force, lastslot, i, j, k, tto, time_histo[HISTSIZE];
843 	time_t thisstamp;
844 	static time_t udp_lasttrim = 0, tcp_lasttrim = 0;
845 	static int onethread = 0, oneslot = 0;
846 
847 	if (sockref != 0) {
848 		hbp = NFSRCAHASH(sockref);
849 		mtx_lock(&hbp->mtx);
850 		LIST_FOREACH_SAFE(rp, &hbp->tbl, rc_ahash, nextrp) {
851 			if (sockref == rp->rc_sockref) {
852 				if (SEQ_GEQ(snd_una, rp->rc_tcpseq)) {
853 					rp->rc_acked = RC_ACK;
854 					LIST_REMOVE(rp, rc_ahash);
855 				} else if (final) {
856 					rp->rc_acked = RC_NACK;
857 					LIST_REMOVE(rp, rc_ahash);
858 				}
859 			}
860 		}
861 		mtx_unlock(&hbp->mtx);
862 	}
863 
864 	if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0)
865 		return;
866 	if (NFSD_MONOSEC != udp_lasttrim ||
867 	    nfsrc_udpcachesize >= (nfsrc_udphighwater +
868 	    nfsrc_udphighwater / 2)) {
869 		mtx_lock(&nfsrc_udpmtx);
870 		udp_lasttrim = NFSD_MONOSEC;
871 		TAILQ_FOREACH_SAFE(rp, &nfsrvudplru, rc_lru, nextrp) {
872 			if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
873 			     && rp->rc_refcnt == 0
874 			     && ((rp->rc_flag & RC_REFCNT) ||
875 				 udp_lasttrim > rp->rc_timestamp ||
876 				 nfsrc_udpcachesize > nfsrc_udphighwater))
877 				nfsrc_freecache(rp);
878 		}
879 		mtx_unlock(&nfsrc_udpmtx);
880 	}
881 	if (NFSD_MONOSEC != tcp_lasttrim ||
882 	    nfsrc_tcpsavedreplies >= nfsrc_tcphighwater) {
883 		force = nfsrc_tcphighwater / 4;
884 		if (force > 0 &&
885 		    nfsrc_tcpsavedreplies + force >= nfsrc_tcphighwater) {
886 			for (i = 0; i < HISTSIZE; i++)
887 				time_histo[i] = 0;
888 			i = 0;
889 			lastslot = NFSRVCACHE_HASHSIZE - 1;
890 		} else {
891 			force = 0;
892 			if (NFSD_MONOSEC != tcp_lasttrim) {
893 				i = 0;
894 				lastslot = NFSRVCACHE_HASHSIZE - 1;
895 			} else {
896 				lastslot = i = oneslot;
897 				if (++oneslot >= NFSRVCACHE_HASHSIZE)
898 					oneslot = 0;
899 			}
900 		}
901 		tto = nfsrc_tcptimeout;
902 		tcp_lasttrim = NFSD_MONOSEC;
903 		for (; i <= lastslot; i++) {
904 			mtx_lock(&nfsrchash_table[i].mtx);
905 			LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash,
906 			    nextrp) {
907 				if (!(rp->rc_flag &
908 				     (RC_INPROG|RC_LOCKED|RC_WANTED))
909 				     && rp->rc_refcnt == 0) {
910 					if ((rp->rc_flag & RC_REFCNT) ||
911 					    tcp_lasttrim > rp->rc_timestamp ||
912 					    rp->rc_acked == RC_ACK) {
913 						nfsrc_freecache(rp);
914 						continue;
915 					}
916 
917 					if (force == 0)
918 						continue;
919 					/*
920 					 * The timestamps range from roughly the
921 					 * present (tcp_lasttrim) to the present
922 					 * + nfsrc_tcptimeout. Generate a simple
923 					 * histogram of where the timeouts fall.
924 					 */
925 					j = rp->rc_timestamp - tcp_lasttrim;
926 					if (j >= tto)
927 						j = HISTSIZE - 1;
928 					else if (j < 0)
929 						j = 0;
930 					else
931 						j = j * HISTSIZE / tto;
932 					time_histo[j]++;
933 				}
934 			}
935 			mtx_unlock(&nfsrchash_table[i].mtx);
936 		}
937 		if (force) {
938 			/*
939 			 * Trim some more with a smaller timeout of as little
940 			 * as 20% of nfsrc_tcptimeout to try and get below
941 			 * 80% of the nfsrc_tcphighwater.
942 			 */
943 			k = 0;
944 			for (i = 0; i < (HISTSIZE - 2); i++) {
945 				k += time_histo[i];
946 				if (k > force)
947 					break;
948 			}
949 			k = tto * (i + 1) / HISTSIZE;
950 			if (k < 1)
951 				k = 1;
952 			thisstamp = tcp_lasttrim + k;
953 			for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
954 				mtx_lock(&nfsrchash_table[i].mtx);
955 				LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl,
956 				    rc_hash, nextrp) {
957 					if (!(rp->rc_flag &
958 					     (RC_INPROG|RC_LOCKED|RC_WANTED))
959 					     && rp->rc_refcnt == 0
960 					     && ((rp->rc_flag & RC_REFCNT) ||
961 						 thisstamp > rp->rc_timestamp ||
962 						 rp->rc_acked == RC_ACK))
963 						nfsrc_freecache(rp);
964 				}
965 				mtx_unlock(&nfsrchash_table[i].mtx);
966 			}
967 		}
968 	}
969 	atomic_store_rel_int(&onethread, 0);
970 }
971 
972 /*
973  * Add a seqid# reference to the cache entry.
974  */
975 APPLESTATIC void
976 nfsrvd_refcache(struct nfsrvcache *rp)
977 {
978 	struct mtx *mutex;
979 
980 	if (rp == NULL)
981 		/* For NFSv4.1, there is no cache entry. */
982 		return;
983 	mutex = nfsrc_cachemutex(rp);
984 	mtx_lock(mutex);
985 	if (rp->rc_refcnt < 0)
986 		panic("nfs cache refcnt");
987 	rp->rc_refcnt++;
988 	mtx_unlock(mutex);
989 }
990 
991 /*
992  * Dereference a seqid# cache entry.
993  */
994 APPLESTATIC void
995 nfsrvd_derefcache(struct nfsrvcache *rp)
996 {
997 	struct mtx *mutex;
998 
999 	mutex = nfsrc_cachemutex(rp);
1000 	mtx_lock(mutex);
1001 	if (rp->rc_refcnt <= 0)
1002 		panic("nfs cache derefcnt");
1003 	rp->rc_refcnt--;
1004 	if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
1005 		nfsrc_freecache(rp);
1006 	mtx_unlock(mutex);
1007 }
1008 
1009 /*
1010  * Calculate the length of the mbuf list and a checksum on the first up to
1011  * NFSRVCACHE_CHECKLEN bytes.
1012  */
1013 static int
1014 nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum)
1015 {
1016 	int len = 0, cklen;
1017 	mbuf_t m;
1018 
1019 	m = m1;
1020 	while (m) {
1021 		len += mbuf_len(m);
1022 		m = mbuf_next(m);
1023 	}
1024 	cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
1025 	*cksum = in_cksum(m1, cklen);
1026 	return (len);
1027 }
1028 
1029 /*
1030  * Mark a TCP connection that is seeing retries. Should never happen for
1031  * NFSv4.
1032  */
1033 static void
1034 nfsrc_marksametcpconn(u_int64_t sockref)
1035 {
1036 }
1037 
1038