xref: /freebsd/sys/fs/nfsserver/nfs_nfsdcache.c (revision ce3adf4362fcca6a43e500b2531f0038adbfbd21)
1 /*-
2  * Copyright (c) 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * Rick Macklem at The University of Guelph.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 4. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  */
33 
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36 
37 /*
38  * Here is the basic algorithm:
39  * First, some design criteria I used:
40  * - I think a false hit is more serious than a false miss
41  * - A false hit for an RPC that has Op(s) that order via seqid# must be
42  *   avoided at all costs
43  * - A valid hit will probably happen a long time after the original reply,
44  *   and the TCP socket that the original request was received on will no
45  *   longer be active
46  *   (The long time delay implies to me that LRU is not appropriate.)
47  * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
48  *   in them as well as minimizing the risk of redoing retried non-idempotent
49  *   Ops.
50  * Because it is biased towards avoiding false hits, multiple entries with
51  * the same xid are to be expected, especially when the cached entry is
52  * related to a seqid# sequenced Op.
53  *
54  * The basic algorithm I'm about to code up:
55  * - Null RPCs bypass the cache and are just done
56  * For TCP
57  * 	- key on <xid, NFS version> (as noted above, there can be several
58  * 				     entries with the same key)
59  * 	When a request arrives:
60  * 		For all that match key
61  * 		- if RPC# != OR request_size !=
62  * 			- not a match with this one
63  * 		- if NFSv4 and received on same TCP socket OR
64  *			received on a TCP connection created before the
65  *			entry was cached
66  * 			- not a match with this one
67  * 			(V2,3 clients might retry on same TCP socket)
68  * 		- calculate checksum on first N bytes of NFS XDR
69  * 		- if checksum !=
70  * 			- not a match for this one
71  * 		If any of the remaining ones that match has a
72  * 			seqid_refcnt > 0
73  * 			- not a match (go do RPC, using new cache entry)
74  * 		If one match left
75  * 			- a hit (reply from cache)
76  * 		else
77  * 			- miss (go do RPC, using new cache entry)
78  *
79  * 	During processing of NFSv4 request:
80  * 		- set a flag when a non-idempotent Op is processed
81  * 		- when an Op that uses a seqid# (Open,...) is processed
82  * 			- if same seqid# as referenced entry in cache
83  * 				- free new cache entry
84  * 				- reply from referenced cache entry
85  * 			  else if next seqid# in order
86  * 				- free referenced cache entry
87  * 				- increment seqid_refcnt on new cache entry
88  * 				- set pointer from Openowner/Lockowner to
89  * 					new cache entry (aka reference it)
90  * 			  else if first seqid# in sequence
91  * 				- increment seqid_refcnt on new cache entry
92  * 				- set pointer from Openowner/Lockowner to
93  * 					new cache entry (aka reference it)
94  *
95  * 	At end of RPC processing:
96  * 		- if seqid_refcnt > 0 OR flagged non-idempotent on new
97  * 			cache entry
98  * 			- save reply in cache entry
99  * 			- calculate checksum on first N bytes of NFS XDR
100  * 				request
101  * 			- note op and length of XDR request (in bytes)
102  * 			- timestamp it
103  * 		  else
104  * 			- free new cache entry
105  * 		- Send reply (noting info for socket activity check, below)
106  *
107  * 	For cache entries saved above:
108  * 		- if saved since seqid_refcnt was > 0
109  * 			- free when seqid_refcnt decrements to 0
110  * 			  (when next one in sequence is processed above, or
111  * 			   when Openowner/Lockowner is discarded)
112  * 		  else { non-idempotent Op(s) }
113  * 			- free when
114  * 				- some further activity observed on same
115  * 					socket
116  * 				  (I'm not yet sure how I'm going to do
117  * 				   this. Maybe look at the TCP connection
118  * 				   to see if the send_tcp_sequence# is well
119  * 				   past sent reply OR K additional RPCs
120  * 				   replied on same socket OR?)
121  * 			  OR
122  * 				- when very old (hours, days, weeks?)
123  *
124  * For UDP (v2, 3 only), pretty much the old way:
125  * - key on <xid, NFS version, RPC#, Client host ip#>
126  *   (at most one entry for each key)
127  *
128  * When a Request arrives:
129  * - if a match with entry via key
130  * 	- if RPC marked In_progress
131  * 		- discard request (don't send reply)
132  * 	  else
133  * 		- reply from cache
134  * 		- timestamp cache entry
135  *   else
136  * 	- add entry to cache, marked In_progress
137  * 	- do RPC
138  * 	- when RPC done
139  * 		- if RPC# non-idempotent
140  * 			- mark entry Done (not In_progress)
141  * 			- save reply
142  * 			- timestamp cache entry
143  * 		  else
144  * 			- free cache entry
145  * 		- send reply
146  *
147  * Later, entries with saved replies are free'd a short time (a few minutes)
148  * after the reply is sent, based on the timestamp.
149  * Reference for the UDP case:
150  *	Chet Juszczak, "Improving the Performance and Correctness of an NFS
151  *	Server", in Proc. Winter 1989 USENIX Conference, pages 53-63,
152  *	San Diego, February 1989.
153  * nfsrc_floodlevel is set to the allowable upper limit for saved replies
154  *	for TCP. For V3, a reply won't be saved when the flood level is
155  *	hit. For V4, the non-idempotent Op will return NFSERR_RESOURCE in
156  *	that case. This level should be set high enough that this almost
157  *	never happens.
158  */
159 #ifndef APPLEKEXT
160 #include <fs/nfs/nfsport.h>
161 
162 extern struct nfsstats newnfsstats;
163 extern struct mtx nfsrc_udpmtx;
164 extern struct nfsrchash_bucket nfsrchash_table[NFSRVCACHE_HASHSIZE];
165 int nfsrc_floodlevel = NFSRVCACHE_FLOODLEVEL, nfsrc_tcpsavedreplies = 0;
166 #endif	/* !APPLEKEXT */
167 
168 SYSCTL_DECL(_vfs_nfsd);
169 
170 static u_int	nfsrc_tcphighwater = 0;
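/*
 * Handler for vfs.nfsd.tcphighwater: reject negative values and, when the
 * new high water mark reaches the current flood level, raise
 * nfsrc_floodlevel to 20% above the new mark.
 */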
171 static int
172 sysctl_tcphighwater(SYSCTL_HANDLER_ARGS)
173 {
174 	int error, newhighwater;
175 
176 	newhighwater = nfsrc_tcphighwater;
177 	error = sysctl_handle_int(oidp, &newhighwater, 0, req);
178 	if (error != 0 || req->newptr == NULL)
179 		return (error);
180 	if (newhighwater < 0)
181 		return (EINVAL);
182 	if (newhighwater >= nfsrc_floodlevel)
183 		nfsrc_floodlevel = newhighwater + newhighwater / 5;
184 	nfsrc_tcphighwater = newhighwater;
185 	return (0);
186 }
187 SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater, CTLTYPE_UINT | CTLFLAG_RW, 0,
188     sizeof(nfsrc_tcphighwater), sysctl_tcphighwater, "IU",
189     "High water mark for TCP cache entries");
190 
191 static u_int	nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
192 SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
193     &nfsrc_udphighwater, 0,
194     "High water mark for UDP cache entries");
195 static u_int	nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
196 SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
197     &nfsrc_tcptimeout, 0,
198     "Timeout for TCP entries in the DRC");
199 static u_int nfsrc_tcpnonidempotent = 1;
200 SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
201     &nfsrc_tcpnonidempotent, 0,
202     "Enable the DRC for NFS over TCP");
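/*
 * These knobs all live under vfs.nfsd.  As an illustration only (the values
 * here are hypothetical, not recommendations), a busy server might be tuned
 * with something like:
 *	sysctl vfs.nfsd.tcphighwater=10000
 *	sysctl vfs.nfsd.tcpcachetimeo=300
 */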
203 
204 static int nfsrc_udpcachesize = 0;
205 static TAILQ_HEAD(, nfsrvcache) nfsrvudplru;
206 static struct nfsrvhashhead nfsrvudphashtbl[NFSRVCACHE_HASHSIZE];
207 
208 /*
209  * The mapping from generic to Version 2 procedure numbers.
210  */
211 static int newnfsv2_procid[NFS_V3NPROCS] = {
212 	NFSV2PROC_NULL,
213 	NFSV2PROC_GETATTR,
214 	NFSV2PROC_SETATTR,
215 	NFSV2PROC_LOOKUP,
216 	NFSV2PROC_NOOP,
217 	NFSV2PROC_READLINK,
218 	NFSV2PROC_READ,
219 	NFSV2PROC_WRITE,
220 	NFSV2PROC_CREATE,
221 	NFSV2PROC_MKDIR,
222 	NFSV2PROC_SYMLINK,
223 	NFSV2PROC_CREATE,
224 	NFSV2PROC_REMOVE,
225 	NFSV2PROC_RMDIR,
226 	NFSV2PROC_RENAME,
227 	NFSV2PROC_LINK,
228 	NFSV2PROC_READDIR,
229 	NFSV2PROC_NOOP,
230 	NFSV2PROC_STATFS,
231 	NFSV2PROC_NOOP,
232 	NFSV2PROC_NOOP,
233 	NFSV2PROC_NOOP,
234 };
235 
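/*
 * Hash on the xid: fold the high-order byte of the xid into the low-order
 * bits before taking the modulus, then index the UDP or TCP hash table.
 */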
236 #define	nfsrc_hash(xid)	(((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
237 #define	NFSRCUDPHASH(xid) \
238 	(&nfsrvudphashtbl[nfsrc_hash(xid)])
239 #define	NFSRCHASH(xid) \
240 	(&nfsrchash_table[nfsrc_hash(xid)].tbl)
241 #define	TRUE	1
242 #define	FALSE	0
243 #define	NFSRVCACHE_CHECKLEN	100
244 
245 /* True iff the rpc reply is an nfs status ONLY! */
246 static int nfsv2_repstat[NFS_V3NPROCS] = {
247 	FALSE,
248 	FALSE,
249 	FALSE,
250 	FALSE,
251 	FALSE,
252 	FALSE,
253 	FALSE,
254 	FALSE,
255 	FALSE,
256 	FALSE,
257 	TRUE,
258 	TRUE,
259 	TRUE,
260 	TRUE,
261 	FALSE,
262 	TRUE,
263 	FALSE,
264 	FALSE,
265 	FALSE,
266 	FALSE,
267 	FALSE,
268 	FALSE,
269 };
270 
271 /*
272  * Will NFS want to work over IPv6 someday?
273  */
274 #define	NETFAMILY(rp) \
275 		(((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)
276 
277 /* local functions */
278 static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
279 static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
280 static void nfsrc_lock(struct nfsrvcache *rp);
281 static void nfsrc_unlock(struct nfsrvcache *rp);
282 static void nfsrc_wanted(struct nfsrvcache *rp);
283 static void nfsrc_freecache(struct nfsrvcache *rp);
284 static void nfsrc_trimcache(u_int64_t, struct socket *);
285 static int nfsrc_activesocket(struct nfsrvcache *rp, u_int64_t,
286     struct socket *);
287 static int nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum);
288 static void nfsrc_marksametcpconn(u_int64_t);
289 
290 /*
291  * Return the correct mutex for this cache entry.
292  */
293 static __inline struct mtx *
294 nfsrc_cachemutex(struct nfsrvcache *rp)
295 {
296 
297 	if ((rp->rc_flag & RC_UDP) != 0)
298 		return (&nfsrc_udpmtx);
299 	return (&nfsrchash_table[nfsrc_hash(rp->rc_xid)].mtx);
300 }
301 
302 /*
303  * Initialize the server request cache list
304  */
305 APPLESTATIC void
306 nfsrvd_initcache(void)
307 {
308 	int i;
309 	static int inited = 0;
310 
311 	if (inited)
312 		return;
313 	inited = 1;
314 	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
315 		LIST_INIT(&nfsrvudphashtbl[i]);
316 		LIST_INIT(&nfsrchash_table[i].tbl);
317 	}
318 	TAILQ_INIT(&nfsrvudplru);
319 	nfsrc_tcpsavedreplies = 0;
320 	nfsrc_udpcachesize = 0;
321 	newnfsstats.srvcache_tcppeak = 0;
322 	newnfsstats.srvcache_size = 0;
323 }
324 
325 /*
326  * Get a cache entry for this request. Basically just malloc a new one
327  * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
328  * Call nfsrc_trimcache() to clean up the cache before returning.
329  */
330 APPLESTATIC int
331 nfsrvd_getcache(struct nfsrv_descript *nd, struct socket *so)
332 {
333 	struct nfsrvcache *newrp;
334 	int ret;
335 
336 	if (nd->nd_procnum == NFSPROC_NULL)
337 		panic("nfsd cache null");
338 	MALLOC(newrp, struct nfsrvcache *, sizeof (struct nfsrvcache),
339 	    M_NFSRVCACHE, M_WAITOK);
340 	NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
341 	if (nd->nd_flag & ND_NFSV4)
342 		newrp->rc_flag = RC_NFSV4;
343 	else if (nd->nd_flag & ND_NFSV3)
344 		newrp->rc_flag = RC_NFSV3;
345 	else
346 		newrp->rc_flag = RC_NFSV2;
347 	newrp->rc_xid = nd->nd_retxid;
348 	newrp->rc_proc = nd->nd_procnum;
349 	newrp->rc_sockref = nd->nd_sockref;
350 	newrp->rc_cachetime = nd->nd_tcpconntime;
351 	if (nd->nd_flag & ND_SAMETCPCONN)
352 		newrp->rc_flag |= RC_SAMETCPCONN;
353 	if (nd->nd_nam2 != NULL) {
354 		newrp->rc_flag |= RC_UDP;
355 		ret = nfsrc_getudp(nd, newrp);
356 	} else {
357 		ret = nfsrc_gettcp(nd, newrp);
358 	}
359 	nfsrc_trimcache(nd->nd_sockref, so);
360 	NFSEXITCODE2(0, nd);
361 	return (ret);
362 }
363 
364 /*
365  * For UDP (v2, v3):
366  * - key on <xid, NFS version, RPC#, Client host ip#>
367  *   (at most one entry for each key)
368  */
369 static int
370 nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
371 {
372 	struct nfsrvcache *rp;
373 	struct sockaddr_in *saddr;
374 	struct sockaddr_in6 *saddr6;
375 	struct nfsrvhashhead *hp;
376 	int ret = 0;
377 	struct mtx *mutex;
378 
379 	mutex = nfsrc_cachemutex(newrp);
380 	hp = NFSRCUDPHASH(newrp->rc_xid);
381 loop:
382 	mtx_lock(mutex);
383 	LIST_FOREACH(rp, hp, rc_hash) {
384 	    if (newrp->rc_xid == rp->rc_xid &&
385 		newrp->rc_proc == rp->rc_proc &&
386 		(newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
387 		nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
388 			if ((rp->rc_flag & RC_LOCKED) != 0) {
389 				rp->rc_flag |= RC_WANTED;
390 				(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
391 				    "nfsrc", 10 * hz);
392 				goto loop;
393 			}
394 			if (rp->rc_flag == 0)
395 				panic("nfs udp cache0");
396 			rp->rc_flag |= RC_LOCKED;
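			/* Hit: move the entry to the tail of the UDP LRU list. */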
397 			TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
398 			TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
399 			if (rp->rc_flag & RC_INPROG) {
400 				newnfsstats.srvcache_inproghits++;
401 				mtx_unlock(mutex);
402 				ret = RC_DROPIT;
403 			} else if (rp->rc_flag & RC_REPSTATUS) {
404 				/*
405 				 * V2 only.
406 				 */
407 				newnfsstats.srvcache_nonidemdonehits++;
408 				mtx_unlock(mutex);
409 				nfsrvd_rephead(nd);
410 				*(nd->nd_errp) = rp->rc_status;
411 				ret = RC_REPLY;
412 				rp->rc_timestamp = NFSD_MONOSEC +
413 					NFSRVCACHE_UDPTIMEOUT;
414 			} else if (rp->rc_flag & RC_REPMBUF) {
415 				newnfsstats.srvcache_nonidemdonehits++;
416 				mtx_unlock(mutex);
417 				nd->nd_mreq = m_copym(rp->rc_reply, 0,
418 					M_COPYALL, M_WAITOK);
419 				ret = RC_REPLY;
420 				rp->rc_timestamp = NFSD_MONOSEC +
421 					NFSRVCACHE_UDPTIMEOUT;
422 			} else {
423 				panic("nfs udp cache1");
424 			}
425 			nfsrc_unlock(rp);
426 			free((caddr_t)newrp, M_NFSRVCACHE);
427 			goto out;
428 		}
429 	}
430 	newnfsstats.srvcache_misses++;
431 	atomic_add_int(&newnfsstats.srvcache_size, 1);
432 	nfsrc_udpcachesize++;
433 
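	/*
	 * Cache miss: record the client's IP address in the new entry, then
	 * chain it into the hash table and the LRU list marked in progress.
	 */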
434 	newrp->rc_flag |= RC_INPROG;
435 	saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
436 	if (saddr->sin_family == AF_INET)
437 		newrp->rc_inet = saddr->sin_addr.s_addr;
438 	else if (saddr->sin_family == AF_INET6) {
439 		saddr6 = (struct sockaddr_in6 *)saddr;
440 		NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
441 		    sizeof (struct in6_addr));
442 		newrp->rc_flag |= RC_INETIPV6;
443 	}
444 	LIST_INSERT_HEAD(hp, newrp, rc_hash);
445 	TAILQ_INSERT_TAIL(&nfsrvudplru, newrp, rc_lru);
446 	mtx_unlock(mutex);
447 	nd->nd_rp = newrp;
448 	ret = RC_DOIT;
449 
450 out:
451 	NFSEXITCODE2(0, nd);
452 	return (ret);
453 }
454 
455 /*
456  * Update a request cache entry after the rpc has been done
457  */
458 APPLESTATIC struct nfsrvcache *
459 nfsrvd_updatecache(struct nfsrv_descript *nd, struct socket *so)
460 {
461 	struct nfsrvcache *rp;
462 	struct nfsrvcache *retrp = NULL;
463 	mbuf_t m;
464 	struct mtx *mutex;
465 
466 	rp = nd->nd_rp;
467 	if (!rp)
468 		panic("nfsrvd_updatecache null rp");
469 	nd->nd_rp = NULL;
470 	mutex = nfsrc_cachemutex(rp);
471 	mtx_lock(mutex);
472 	nfsrc_lock(rp);
473 	if (!(rp->rc_flag & RC_INPROG))
474 		panic("nfsrvd_updatecache not inprog");
475 	rp->rc_flag &= ~RC_INPROG;
476 	if (rp->rc_flag & RC_UDP) {
477 		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
478 		TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
479 	}
480 
481 	/*
482 	 * Reply from cache is a special case returned by nfsrv_checkseqid().
483 	 */
484 	if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
485 		newnfsstats.srvcache_nonidemdonehits++;
486 		mtx_unlock(mutex);
487 		nd->nd_repstat = 0;
488 		if (nd->nd_mreq)
489 			mbuf_freem(nd->nd_mreq);
490 		if (!(rp->rc_flag & RC_REPMBUF))
491 			panic("reply from cache");
492 		nd->nd_mreq = m_copym(rp->rc_reply, 0,
493 		    M_COPYALL, M_WAITOK);
494 		rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
495 		nfsrc_unlock(rp);
496 		goto out;
497 	}
498 
499 	/*
500 	 * If rc_refcnt > 0, save it
501 	 * For UDP, save it if ND_SAVEREPLY is set
502 	 * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set
503 	 */
504 	if (nd->nd_repstat != NFSERR_DONTREPLY &&
505 	    (rp->rc_refcnt > 0 ||
506 	     ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
507 	     ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
508 	      nfsrc_tcpsavedreplies <= nfsrc_floodlevel &&
509 	      nfsrc_tcpnonidempotent))) {
510 		if (rp->rc_refcnt > 0) {
511 			if (!(rp->rc_flag & RC_NFSV4))
512 				panic("update_cache refcnt");
513 			rp->rc_flag |= RC_REFCNT;
514 		}
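		/*
		 * For an NFSv2 procedure whose reply is only a status, just
		 * save the status; otherwise save a copy of the reply mbufs.
		 */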
515 		if ((nd->nd_flag & ND_NFSV2) &&
516 		    nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
517 			rp->rc_status = nd->nd_repstat;
518 			rp->rc_flag |= RC_REPSTATUS;
519 			mtx_unlock(mutex);
520 		} else {
521 			if (!(rp->rc_flag & RC_UDP)) {
522 			    atomic_add_int(&nfsrc_tcpsavedreplies, 1);
523 			    if (nfsrc_tcpsavedreplies >
524 				newnfsstats.srvcache_tcppeak)
525 				newnfsstats.srvcache_tcppeak =
526 				    nfsrc_tcpsavedreplies;
527 			}
528 			mtx_unlock(mutex);
529 			m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
530 			mtx_lock(mutex);
531 			rp->rc_reply = m;
532 			rp->rc_flag |= RC_REPMBUF;
533 			mtx_unlock(mutex);
534 		}
535 		if (rp->rc_flag & RC_UDP) {
536 			rp->rc_timestamp = NFSD_MONOSEC +
537 			    NFSRVCACHE_UDPTIMEOUT;
538 			nfsrc_unlock(rp);
539 		} else {
540 			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
541 			if (rp->rc_refcnt > 0)
542 				nfsrc_unlock(rp);
543 			else
544 				retrp = rp;
545 		}
546 	} else {
547 		nfsrc_freecache(rp);
548 		mtx_unlock(mutex);
549 	}
550 
551 out:
552 	nfsrc_trimcache(nd->nd_sockref, so);
553 	NFSEXITCODE2(0, nd);
554 	return (retrp);
555 }
556 
557 /*
558  * Invalidate and, if possible, free an in-progress cache entry.
559  * Must not sleep.
560  */
561 APPLESTATIC void
562 nfsrvd_delcache(struct nfsrvcache *rp)
563 {
564 	struct mtx *mutex;
565 
566 	mutex = nfsrc_cachemutex(rp);
567 	if (!(rp->rc_flag & RC_INPROG))
568 		panic("nfsrvd_delcache not in prog");
569 	mtx_lock(mutex);
570 	rp->rc_flag &= ~RC_INPROG;
571 	if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
572 		nfsrc_freecache(rp);
573 	mtx_unlock(mutex);
574 }
575 
576 /*
577  * Called after nfsrvd_updatecache() once the reply is sent, to update
578  * the entry for nfsrc_activesocket() and unlock it. The argument is
579  * the pointer returned by nfsrvd_updatecache().
580  */
581 APPLESTATIC void
582 nfsrvd_sentcache(struct nfsrvcache *rp, struct socket *so, int err)
583 {
584 	tcp_seq tmp_seq;
585 	struct mtx *mutex;
586 
587 	mutex = nfsrc_cachemutex(rp);
588 	if (!(rp->rc_flag & RC_LOCKED))
589 		panic("nfsrvd_sentcache not locked");
590 	if (!err) {
591 		if ((so->so_proto->pr_domain->dom_family != AF_INET &&
592 		     so->so_proto->pr_domain->dom_family != AF_INET6) ||
593 		     so->so_proto->pr_protocol != IPPROTO_TCP)
594 			panic("nfs sent cache");
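		/*
		 * Record the TCP send sequence number of the reply so that
		 * nfsrc_activesocket() can later tell whether the client has
		 * acknowledged it.
		 */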
595 		if (nfsrv_getsockseqnum(so, &tmp_seq)) {
596 			mtx_lock(mutex);
597 			rp->rc_tcpseq = tmp_seq;
598 			rp->rc_flag |= RC_TCPSEQ;
599 			mtx_unlock(mutex);
600 		}
601 	}
602 	nfsrc_unlock(rp);
603 }
604 
605 /*
606  * Get a cache entry for TCP
607  * - key on <xid, nfs version>
608  *   (allow multiple entries for a given key)
609  */
610 static int
611 nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
612 {
613 	struct nfsrvcache *rp, *nextrp;
614 	int i;
615 	struct nfsrvcache *hitrp;
616 	struct nfsrvhashhead *hp, nfsrc_templist;
617 	int hit, ret = 0;
618 	struct mtx *mutex;
619 
620 	mutex = nfsrc_cachemutex(newrp);
621 	hp = NFSRCHASH(newrp->rc_xid);
622 	newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
623 tryagain:
624 	mtx_lock(mutex);
625 	hit = 1;
626 	LIST_INIT(&nfsrc_templist);
627 	/*
628 	 * Get all the matches and put them on the temp list.
629 	 */
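	/*
	 * A match requires the same xid, NFS version, procedure, request
	 * length and XDR checksum, plus the TCP socket/connection checks
	 * described in the comment at the top of this file.
	 */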
630 	rp = LIST_FIRST(hp);
631 	while (rp != LIST_END(hp)) {
632 		nextrp = LIST_NEXT(rp, rc_hash);
633 		if (newrp->rc_xid == rp->rc_xid &&
634 		    (!(rp->rc_flag & RC_INPROG) ||
635 		     ((newrp->rc_flag & RC_SAMETCPCONN) &&
636 		      newrp->rc_sockref == rp->rc_sockref)) &&
637 		    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
638 		    newrp->rc_proc == rp->rc_proc &&
639 		    ((newrp->rc_flag & RC_NFSV4) &&
640 		     newrp->rc_sockref != rp->rc_sockref &&
641 		     newrp->rc_cachetime >= rp->rc_cachetime)
642 		    && newrp->rc_reqlen == rp->rc_reqlen &&
643 		    newrp->rc_cksum == rp->rc_cksum) {
644 			LIST_REMOVE(rp, rc_hash);
645 			LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
646 		}
647 		rp = nextrp;
648 	}
649 
650 	/*
651 	 * Now, use nfsrc_templist to decide if there is a match.
652 	 */
653 	i = 0;
654 	LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
655 		i++;
656 		if (rp->rc_refcnt > 0) {
657 			hit = 0;
658 			break;
659 		}
660 	}
661 	/*
662 	 * Can be a hit only if one entry left.
663 	 * Note possible hit entry and put nfsrc_templist back on hash
664 	 * list.
665 	 */
666 	if (i != 1)
667 		hit = 0;
668 	hitrp = rp = LIST_FIRST(&nfsrc_templist);
669 	while (rp != LIST_END(&nfsrc_templist)) {
670 		nextrp = LIST_NEXT(rp, rc_hash);
671 		LIST_REMOVE(rp, rc_hash);
672 		LIST_INSERT_HEAD(hp, rp, rc_hash);
673 		rp = nextrp;
674 	}
675 	if (LIST_FIRST(&nfsrc_templist) != LIST_END(&nfsrc_templist))
676 		panic("nfs gettcp cache templist");
677 
678 	if (hit) {
679 		rp = hitrp;
680 		if ((rp->rc_flag & RC_LOCKED) != 0) {
681 			rp->rc_flag |= RC_WANTED;
682 			(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
683 			    "nfsrc", 10 * hz);
684 			goto tryagain;
685 		}
686 		if (rp->rc_flag == 0)
687 			panic("nfs tcp cache0");
688 		rp->rc_flag |= RC_LOCKED;
689 		if (rp->rc_flag & RC_INPROG) {
690 			newnfsstats.srvcache_inproghits++;
691 			mtx_unlock(mutex);
692 			if (newrp->rc_sockref == rp->rc_sockref)
693 				nfsrc_marksametcpconn(rp->rc_sockref);
694 			ret = RC_DROPIT;
695 		} else if (rp->rc_flag & RC_REPSTATUS) {
696 			/*
697 			 * V2 only.
698 			 */
699 			newnfsstats.srvcache_nonidemdonehits++;
700 			mtx_unlock(mutex);
701 			if (newrp->rc_sockref == rp->rc_sockref)
702 				nfsrc_marksametcpconn(rp->rc_sockref);
703 			ret = RC_REPLY;
704 			nfsrvd_rephead(nd);
705 			*(nd->nd_errp) = rp->rc_status;
706 			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
707 		} else if (rp->rc_flag & RC_REPMBUF) {
708 			newnfsstats.srvcache_nonidemdonehits++;
709 			mtx_unlock(mutex);
710 			if (newrp->rc_sockref == rp->rc_sockref)
711 				nfsrc_marksametcpconn(rp->rc_sockref);
712 			ret = RC_REPLY;
713 			nd->nd_mreq = m_copym(rp->rc_reply, 0,
714 				M_COPYALL, M_WAITOK);
715 			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
716 		} else {
717 			panic("nfs tcp cache1");
718 		}
719 		nfsrc_unlock(rp);
720 		free((caddr_t)newrp, M_NFSRVCACHE);
721 		goto out;
722 	}
723 	newnfsstats.srvcache_misses++;
724 	atomic_add_int(&newnfsstats.srvcache_size, 1);
725 
726 	/*
727 	 * For TCP, multiple entries for a key are allowed, so don't
728 	 * chain it into the hash table until done.
729 	 */
730 	newrp->rc_cachetime = NFSD_MONOSEC;
731 	newrp->rc_flag |= RC_INPROG;
732 	LIST_INSERT_HEAD(hp, newrp, rc_hash);
733 	mtx_unlock(mutex);
734 	nd->nd_rp = newrp;
735 	ret = RC_DOIT;
736 
737 out:
738 	NFSEXITCODE2(0, nd);
739 	return (ret);
740 }
741 
742 /*
743  * Lock a cache entry.
744  */
745 static void
746 nfsrc_lock(struct nfsrvcache *rp)
747 {
748 	struct mtx *mutex;
749 
750 	mutex = nfsrc_cachemutex(rp);
751 	mtx_assert(mutex, MA_OWNED);
752 	while ((rp->rc_flag & RC_LOCKED) != 0) {
753 		rp->rc_flag |= RC_WANTED;
754 		(void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0);
755 	}
756 	rp->rc_flag |= RC_LOCKED;
757 }
758 
759 /*
760  * Unlock a cache entry.
761  */
762 static void
763 nfsrc_unlock(struct nfsrvcache *rp)
764 {
765 	struct mtx *mutex;
766 
767 	mutex = nfsrc_cachemutex(rp);
768 	mtx_lock(mutex);
769 	rp->rc_flag &= ~RC_LOCKED;
770 	nfsrc_wanted(rp);
771 	mtx_unlock(mutex);
772 }
773 
774 /*
775  * Wakeup anyone wanting entry.
776  */
777 static void
778 nfsrc_wanted(struct nfsrvcache *rp)
779 {
780 	if (rp->rc_flag & RC_WANTED) {
781 		rp->rc_flag &= ~RC_WANTED;
782 		wakeup((caddr_t)rp);
783 	}
784 }
785 
786 /*
787  * Free up the entry.
788  * Must not sleep.
789  */
790 static void
791 nfsrc_freecache(struct nfsrvcache *rp)
792 {
793 
794 	LIST_REMOVE(rp, rc_hash);
795 	if (rp->rc_flag & RC_UDP) {
796 		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
797 		nfsrc_udpcachesize--;
798 	}
799 	nfsrc_wanted(rp);
800 	if (rp->rc_flag & RC_REPMBUF) {
801 		mbuf_freem(rp->rc_reply);
802 		if (!(rp->rc_flag & RC_UDP))
803 			atomic_add_int(&nfsrc_tcpsavedreplies, -1);
804 	}
805 	FREE((caddr_t)rp, M_NFSRVCACHE);
806 	atomic_add_int(&newnfsstats.srvcache_size, -1);
807 }
808 
809 /*
810  * Clean out the cache. Called when nfsserver module is unloaded.
811  */
812 APPLESTATIC void
813 nfsrvd_cleancache(void)
814 {
815 	struct nfsrvcache *rp, *nextrp;
816 	int i;
817 
818 	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
819 		mtx_lock(&nfsrchash_table[i].mtx);
820 		LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash, nextrp)
821 			nfsrc_freecache(rp);
822 		mtx_unlock(&nfsrchash_table[i].mtx);
823 	}
824 	mtx_lock(&nfsrc_udpmtx);
825 	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
826 		LIST_FOREACH_SAFE(rp, &nfsrvudphashtbl[i], rc_hash, nextrp) {
827 			nfsrc_freecache(rp);
828 		}
829 	}
830 	newnfsstats.srvcache_size = 0;
831 	mtx_unlock(&nfsrc_udpmtx);
832 	nfsrc_tcpsavedreplies = 0;
833 }
834 
835 /*
836  * The basic rule is to get rid of entries that are expired.
837  */
838 static void
839 nfsrc_trimcache(u_int64_t sockref, struct socket *so)
840 {
841 	struct nfsrvcache *rp, *nextrp;
842 	int i, j, k, time_histo[10];
843 	time_t thisstamp;
844 	static time_t udp_lasttrim = 0, tcp_lasttrim = 0;
845 	static int onethread = 0;
846 
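	/* Let only one thread trim the cache at a time; others just return. */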
847 	if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0)
848 		return;
849 	if (NFSD_MONOSEC != udp_lasttrim ||
850 	    nfsrc_udpcachesize >= (nfsrc_udphighwater +
851 	    nfsrc_udphighwater / 2)) {
852 		mtx_lock(&nfsrc_udpmtx);
853 		udp_lasttrim = NFSD_MONOSEC;
854 		TAILQ_FOREACH_SAFE(rp, &nfsrvudplru, rc_lru, nextrp) {
855 			if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
856 			     && rp->rc_refcnt == 0
857 			     && ((rp->rc_flag & RC_REFCNT) ||
858 				 udp_lasttrim > rp->rc_timestamp ||
859 				 nfsrc_udpcachesize > nfsrc_udphighwater))
860 				nfsrc_freecache(rp);
861 		}
862 		mtx_unlock(&nfsrc_udpmtx);
863 	}
864 	if (NFSD_MONOSEC != tcp_lasttrim ||
865 	    nfsrc_tcpsavedreplies >= nfsrc_tcphighwater) {
866 		for (i = 0; i < 10; i++)
867 			time_histo[i] = 0;
868 		for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
869 			mtx_lock(&nfsrchash_table[i].mtx);
870 			if (i == 0)
871 				tcp_lasttrim = NFSD_MONOSEC;
872 			LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash,
873 			    nextrp) {
874 				if (!(rp->rc_flag &
875 				     (RC_INPROG|RC_LOCKED|RC_WANTED))
876 				     && rp->rc_refcnt == 0) {
877 					/*
878 					 * The timestamps range from roughly the
879 					 * present (tcp_lasttrim) to the present
880 					 * + nfsrc_tcptimeout. Generate a simple
881 					 * histogram of where the timeouts fall.
882 					 */
883 					j = rp->rc_timestamp - tcp_lasttrim;
884 					if (j >= nfsrc_tcptimeout)
885 						j = nfsrc_tcptimeout - 1;
886 					if (j < 0)
887 						j = 0;
888 					j = (j * 10 / nfsrc_tcptimeout) % 10;
889 					time_histo[j]++;
890 					if ((rp->rc_flag & RC_REFCNT) ||
891 					    tcp_lasttrim > rp->rc_timestamp ||
892 					    nfsrc_activesocket(rp, sockref, so))
893 						nfsrc_freecache(rp);
894 				}
895 			}
896 			mtx_unlock(&nfsrchash_table[i].mtx);
897 		}
898 		j = nfsrc_tcphighwater / 5;	/* 20% of it */
899 		if (j > 0 && (nfsrc_tcpsavedreplies + j) > nfsrc_tcphighwater) {
900 			/*
901 			 * Trim some more with a smaller timeout of as little
902 			 * as 20% of nfsrc_tcptimeout to try and get below
903 			 * 80% of the nfsrc_tcphighwater.
904 			 */
905 			k = 0;
906 			for (i = 0; i < 8; i++) {
907 				k += time_histo[i];
908 				if (k > j)
909 					break;
910 			}
911 			k = nfsrc_tcptimeout * (i + 1) / 10;
912 			if (k < 1)
913 				k = 1;
914 			thisstamp = tcp_lasttrim + k;
915 			for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
916 				mtx_lock(&nfsrchash_table[i].mtx);
917 				LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl,
918 				    rc_hash, nextrp) {
919 					if (!(rp->rc_flag &
920 					     (RC_INPROG|RC_LOCKED|RC_WANTED))
921 					     && rp->rc_refcnt == 0
922 					     && ((rp->rc_flag & RC_REFCNT) ||
923 						 thisstamp > rp->rc_timestamp ||
924 						 nfsrc_activesocket(rp, sockref,
925 						    so)))
926 						nfsrc_freecache(rp);
927 				}
928 				mtx_unlock(&nfsrchash_table[i].mtx);
929 			}
930 		}
931 	}
932 	atomic_store_rel_int(&onethread, 0);
933 }
934 
935 /*
936  * Add a seqid# reference to the cache entry.
937  */
938 APPLESTATIC void
939 nfsrvd_refcache(struct nfsrvcache *rp)
940 {
941 	struct mtx *mutex;
942 
943 	mutex = nfsrc_cachemutex(rp);
944 	mtx_lock(mutex);
945 	if (rp->rc_refcnt < 0)
946 		panic("nfs cache refcnt");
947 	rp->rc_refcnt++;
948 	mtx_unlock(mutex);
949 }
950 
951 /*
952  * Dereference a seqid# cache entry.
953  */
954 APPLESTATIC void
955 nfsrvd_derefcache(struct nfsrvcache *rp)
956 {
957 	struct mtx *mutex;
958 
959 	mutex = nfsrc_cachemutex(rp);
960 	mtx_lock(mutex);
961 	if (rp->rc_refcnt <= 0)
962 		panic("nfs cache derefcnt");
963 	rp->rc_refcnt--;
964 	if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
965 		nfsrc_freecache(rp);
966 	mtx_unlock(mutex);
967 }
968 
969 /*
970  * Check to see if the socket is active.
971  * Return 1 if the reply has been received/acknowledged by the client,
972  * 0 otherwise.
973  * XXX - Uses tcp internals.
974  */
975 static int
976 nfsrc_activesocket(struct nfsrvcache *rp, u_int64_t cur_sockref,
977     struct socket *cur_so)
978 {
979 	int ret = 0;
980 
981 	if (!(rp->rc_flag & RC_TCPSEQ))
982 		return (ret);
983 	/*
984 	 * If the sockref is the same, it is the same TCP connection.
985 	 */
986 	if (cur_sockref == rp->rc_sockref)
987 		ret = nfsrv_checksockseqnum(cur_so, rp->rc_tcpseq);
988 	return (ret);
989 }
990 
991 /*
992  * Calculate the length of the mbuf list and a checksum over up to the
993  * first NFSRVCACHE_CHECKLEN bytes of it.
994  */
995 static int
996 nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum)
997 {
998 	int len = 0, cklen;
999 	mbuf_t m;
1000 
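	/* Walk the mbuf chain to total up its length. */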
1001 	m = m1;
1002 	while (m) {
1003 		len += mbuf_len(m);
1004 		m = mbuf_next(m);
1005 	}
1006 	cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
1007 	*cksum = in_cksum(m1, cklen);
1008 	return (len);
1009 }
1010 
1011 /*
1012  * Mark a TCP connection that is seeing retries. Should never happen for
1013  * NFSv4.
1014  */
1015 static void
1016 nfsrc_marksametcpconn(u_int64_t sockref)
1017 {
1018 }
1019 
1020