/*-
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 * Here is the basic algorithm:
 * First, some design criteria I used:
 * - I think a false hit is more serious than a false miss
 * - A false hit for an RPC that has Op(s) that order via seqid# must be
 *   avoided at all costs
 * - A valid hit will probably happen a long time after the original reply
 *   and the TCP socket that the original request was received on will no
 *   longer be active
 *   (The long time delay implies to me that LRU is not appropriate.)
 * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
 *   in them as well as minimizing the risk of redoing retried non-idempotent
 *   Ops.
 * Because it is biased towards avoiding false hits, multiple entries with
 * the same xid are to be expected, especially for the case of the entry
 * in the cache being related to a seqid# sequenced Op.
 *
 * The basic algorithm I'm about to code up:
 * - Null RPCs bypass the cache and are just done
 * For TCP
 * 	- key on <xid, NFS version> (as noted above, there can be several
 * 				     entries with the same key)
 * 	When a request arrives:
 * 		For all that match key
 * 		- if RPC# != OR request_size !=
 * 			- not a match with this one
 * 		- if NFSv4 and received on same TCP socket OR
 *			received on a TCP connection created before the
 *			entry was cached
 * 			- not a match with this one
 * 			(V2,3 clients might retry on same TCP socket)
 * 		- calculate checksum on first N bytes of NFS XDR
 * 		- if checksum !=
 * 			- not a match for this one
 * 		If any of the remaining ones that match has a
 * 			seqid_refcnt > 0
 * 			- not a match (go do RPC, using new cache entry)
 * 		If one match left
 * 			- a hit (reply from cache)
 * 		else
 * 			- miss (go do RPC, using new cache entry)
 *
 * 	During processing of NFSv4 request:
 * 		- set a flag when a non-idempotent Op is processed
 * 		- when an Op that uses a seqid# (Open,...) is processed
 * 			- if same seqid# as referenced entry in cache
 * 				- free new cache entry
 * 				- reply from referenced cache entry
 * 			  else if next seqid# in order
 * 				- free referenced cache entry
 * 				- increment seqid_refcnt on new cache entry
 * 				- set pointer from Openowner/Lockowner to
 * 					new cache entry (aka reference it)
 * 			  else if first seqid# in sequence
 * 				- increment seqid_refcnt on new cache entry
 * 				- set pointer from Openowner/Lockowner to
 * 					new cache entry (aka reference it)
 *
 * 	At end of RPC processing:
 * 		- if seqid_refcnt > 0 OR flagged non-idempotent on new
 * 			cache entry
 * 			- save reply in cache entry
 * 			- calculate checksum on first N bytes of NFS XDR
 * 				request
 * 			- note op and length of XDR request (in bytes)
 * 			- timestamp it
 * 		  else
 * 			- free new cache entry
 * 		- Send reply (noting info for socket activity check, below)
 *
 * 	For cache entries saved above:
 * 		- if saved since seqid_refcnt was > 0
 * 			- free when seqid_refcnt decrements to 0
 * 			  (when next one in sequence is processed above, or
 * 			   when Openowner/Lockowner is discarded)
 * 		  else { non-idempotent Op(s) }
 * 			- free when
 * 				- some further activity observed on same
 * 					socket
 * 				  (I'm not yet sure how I'm going to do
 * 				   this. Maybe look at the TCP connection
 * 				   to see if the send_tcp_sequence# is well
 * 				   past sent reply OR K additional RPCs
 * 				   replied on same socket OR?)
 * 			  OR
 * 				- when very old (hours, days, weeks?)
 *
 * For UDP (v2, 3 only), pretty much the old way:
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 *
 * When a Request arrives:
 * - if a match with entry via key
 * 	- if RPC marked In_progress
 * 		- discard request (don't send reply)
 * 	  else
 * 		- reply from cache
 * 		- timestamp cache entry
 *   else
 * 	- add entry to cache, marked In_progress
 * 	- do RPC
 * 	- when RPC done
 * 		- if RPC# non-idempotent
 * 			- mark entry Done (not In_progress)
 * 			- save reply
 * 			- timestamp cache entry
 * 		  else
 * 			- free cache entry
 * 		- send reply
 *
 * Later, entries with saved replies are free'd a short time (few minutes)
 * after reply sent (timestamp).
 * Reference: Chet Juszczak, "Improving the Performance and Correctness
 *		of an NFS Server", in Proc. Winter 1989 USENIX Conference,
 *		pages 53-63. San Diego, February 1989.
 *	 for the UDP case.
 * nfsrc_floodlevel is set to the allowable upper limit for saved replies
 *	for TCP. For V3, a reply won't be saved when the flood level is
 *	hit. For V4, the non-idempotent Op will return NFSERR_RESOURCE in
 *	that case. This level should be set high enough that this almost
 *	never happens.
 */
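/*
 * A worked example of the seqid# handling above, with hypothetical
 * seqid values:
 * - An Open with seqid 7 arrives; its Openowner currently references
 *   the cached entry for seqid 6.
 * - Since 7 is the next seqid# in order, the seqid 6 entry is freed,
 *   seqid_refcnt is incremented on the new entry and the Openowner is
 *   pointed at it.
 * - A client retry of that same Open (same xid, length and checksum)
 *   now hits the referenced seqid 7 entry and is answered from the
 *   cache instead of redoing the non-idempotent Op.
 * - When the Open with seqid 8 arrives, the seqid 7 entry is freed and
 *   the new entry takes its place as the referenced one.
 */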
#ifndef APPLEKEXT
#include <fs/nfs/nfsport.h>

extern struct nfsstats newnfsstats;
NFSCACHEMUTEX;
int nfsrc_floodlevel = NFSRVCACHE_FLOODLEVEL, nfsrc_tcpsavedreplies = 0;
#endif	/* !APPLEKEXT */

static int nfsrc_tcpnonidempotent = 1;
static int nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER, nfsrc_udpcachesize = 0;
static TAILQ_HEAD(, nfsrvcache) nfsrvudplru;
static struct nfsrvhashhead nfsrvhashtbl[NFSRVCACHE_HASHSIZE],
    nfsrvudphashtbl[NFSRVCACHE_HASHSIZE];
/*
 * and the reverse mapping from generic to Version 2 procedure numbers
 */
static int newnfsv2_procid[NFS_V3NPROCS] = {
	NFSV2PROC_NULL,
	NFSV2PROC_GETATTR,
	NFSV2PROC_SETATTR,
	NFSV2PROC_LOOKUP,
	NFSV2PROC_NOOP,
	NFSV2PROC_READLINK,
	NFSV2PROC_READ,
	NFSV2PROC_WRITE,
	NFSV2PROC_CREATE,
	NFSV2PROC_MKDIR,
	NFSV2PROC_SYMLINK,
	NFSV2PROC_CREATE,
	NFSV2PROC_REMOVE,
	NFSV2PROC_RMDIR,
	NFSV2PROC_RENAME,
	NFSV2PROC_LINK,
	NFSV2PROC_READDIR,
	NFSV2PROC_NOOP,
	NFSV2PROC_STATFS,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
};
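/*
 * For example, an incoming V2 request is decoded with nd_procnum in the
 * generic (V3) numbering, so newnfsv2_procid[NFSPROC_MKNOD] maps back to
 * NFSV2PROC_CREATE (V2 has no Mknod), and procedures with no V2
 * equivalent, such as Readdirplus, map to NFSV2PROC_NOOP.
 */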

#define	NFSRCUDPHASH(xid) \
	(&nfsrvudphashtbl[((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE])
#define	NFSRCHASH(xid) \
	(&nfsrvhashtbl[((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE])
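/*
 * Worked example with a hypothetical xid of 0x12345678:
 * (0x12345678 >> 24) == 0x12, so the bucket index is
 * 0x1234568a % NFSRVCACHE_HASHSIZE. Folding the high-order byte into
 * the sum keeps xids that differ mainly in their upper bits from all
 * landing on the same hash chain.
 */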
#define	TRUE	1
#define	FALSE	0
#define	NFSRVCACHE_CHECKLEN	100

/* True iff the rpc reply is an nfs status ONLY! */
static int nfsv2_repstat[NFS_V3NPROCS] = {
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	TRUE,
	TRUE,
	TRUE,
	TRUE,
	FALSE,
	TRUE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
};
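/*
 * Example: a V2 Remove request arrives with nd_procnum == NFSPROC_REMOVE,
 * newnfsv2_procid[NFSPROC_REMOVE] is NFSV2PROC_REMOVE and
 * nfsv2_repstat[NFSV2PROC_REMOVE] is TRUE, so nfsrvd_updatecache() below
 * can cache just the status word (RC_REPSTATUS) instead of copying the
 * whole reply mbuf list.
 */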

/*
 * Will NFS want to work over IPv6 someday?
 */
#define	NETFAMILY(rp) \
		(((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)

/* local functions */
static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static void nfsrc_lock(struct nfsrvcache *rp);
static void nfsrc_unlock(struct nfsrvcache *rp);
static void nfsrc_wanted(struct nfsrvcache *rp);
static void nfsrc_freecache(struct nfsrvcache *rp);
static void nfsrc_trimcache(u_int64_t, struct socket *);
static int nfsrc_activesocket(struct nfsrvcache *rp, u_int64_t,
    struct socket *);
static int nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum);
static void nfsrc_marksametcpconn(u_int64_t);

/*
 * Initialize the server request cache list
 */
APPLESTATIC void
nfsrvd_initcache(void)
{
	int i;
	static int inited = 0;

	if (inited)
		return;
	inited = 1;
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_INIT(&nfsrvudphashtbl[i]);
		LIST_INIT(&nfsrvhashtbl[i]);
	}
	TAILQ_INIT(&nfsrvudplru);
	nfsrc_tcpsavedreplies = 0;
	nfsrc_udpcachesize = 0;
	newnfsstats.srvcache_tcppeak = 0;
	newnfsstats.srvcache_size = 0;
}

/*
 * Get a cache entry for this request. Basically just malloc a new one
 * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
 * Call nfsrc_trimcache() to clean up the cache before returning.
 */
APPLESTATIC int
nfsrvd_getcache(struct nfsrv_descript *nd, struct socket *so)
{
	struct nfsrvcache *newrp;
	int ret;

	if (nd->nd_procnum == NFSPROC_NULL)
		panic("nfsd cache null");
	MALLOC(newrp, struct nfsrvcache *, sizeof (struct nfsrvcache),
	    M_NFSRVCACHE, M_WAITOK);
	NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
	if (nd->nd_flag & ND_NFSV4)
		newrp->rc_flag = RC_NFSV4;
	else if (nd->nd_flag & ND_NFSV3)
		newrp->rc_flag = RC_NFSV3;
	else
		newrp->rc_flag = RC_NFSV2;
	newrp->rc_xid = nd->nd_retxid;
	newrp->rc_proc = nd->nd_procnum;
	newrp->rc_sockref = nd->nd_sockref;
	newrp->rc_cachetime = nd->nd_tcpconntime;
	if (nd->nd_flag & ND_SAMETCPCONN)
		newrp->rc_flag |= RC_SAMETCPCONN;
	if (nd->nd_nam2 != NULL) {
		newrp->rc_flag |= RC_UDP;
		ret = nfsrc_getudp(nd, newrp);
	} else {
		ret = nfsrc_gettcp(nd, newrp);
	}
	nfsrc_trimcache(nd->nd_sockref, so);
	return (ret);
}
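
#ifdef notdef
/*
 * A minimal sketch of the calling protocol for the cache functions in
 * this file; the real caller is the nfsd request path, and the setup of
 * "nd" and "so" is assumed here:
 */
static void
nfsrc_examplecaller(struct nfsrv_descript *nd, struct socket *so)
{
	struct nfsrvcache *rp;

	switch (nfsrvd_getcache(nd, so)) {
	case RC_DOIT:
		/* Perform the RPC, then decide whether to save the reply. */
		rp = nfsrvd_updatecache(nd, so);
		/* ... send nd->nd_mreq here ... */
		if (rp != NULL)
			/* Note the TCP send sequence# and unlock the entry. */
			nfsrvd_sentcache(rp, so, 0);
		break;
	case RC_REPLY:
		/* nd->nd_mreq already holds the cached reply; just send it. */
		break;
	case RC_DROPIT:
		/* Retry of an RPC still in progress; send nothing. */
		break;
	}
}
#endif	/* notdef */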

/*
 * For UDP (v2, v3):
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 */
static int
nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp;
	struct sockaddr_in *saddr;
	struct sockaddr_in6 *saddr6;
	struct nfsrvhashhead *hp;
	int ret = 0;

	hp = NFSRCUDPHASH(newrp->rc_xid);
loop:
	NFSLOCKCACHE();
	LIST_FOREACH(rp, hp, rc_hash) {
	    if (newrp->rc_xid == rp->rc_xid &&
		newrp->rc_proc == rp->rc_proc &&
		(newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
			if ((rp->rc_flag & RC_LOCKED) != 0) {
				rp->rc_flag |= RC_WANTED;
				NFSUNLOCKCACHE();
				(void) tsleep((caddr_t)rp, PZERO - 1,
				    "nfsrc", 10 * hz);
				goto loop;
			}
			if (rp->rc_flag == 0)
				panic("nfs udp cache0");
			rp->rc_flag |= RC_LOCKED;
			TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
			TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
			if (rp->rc_flag & RC_INPROG) {
				newnfsstats.srvcache_inproghits++;
				NFSUNLOCKCACHE();
				ret = RC_DROPIT;
			} else if (rp->rc_flag & RC_REPSTATUS) {
				/*
				 * V2 only.
				 */
				newnfsstats.srvcache_nonidemdonehits++;
				NFSUNLOCKCACHE();
				nfsrvd_rephead(nd);
				*(nd->nd_errp) = rp->rc_status;
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
					NFSRVCACHE_UDPTIMEOUT;
			} else if (rp->rc_flag & RC_REPMBUF) {
				newnfsstats.srvcache_nonidemdonehits++;
				NFSUNLOCKCACHE();
				nd->nd_mreq = m_copym(rp->rc_reply, 0,
					M_COPYALL, M_WAIT);
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
					NFSRVCACHE_UDPTIMEOUT;
			} else {
				panic("nfs udp cache1");
			}
			nfsrc_unlock(rp);
			free((caddr_t)newrp, M_NFSRVCACHE);
			return (ret);
		}
	}
	newnfsstats.srvcache_misses++;
	newnfsstats.srvcache_size++;
	nfsrc_udpcachesize++;

	newrp->rc_flag |= RC_INPROG;
	saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
	if (saddr->sin_family == AF_INET)
		newrp->rc_inet = saddr->sin_addr.s_addr;
	else if (saddr->sin_family == AF_INET6) {
		saddr6 = (struct sockaddr_in6 *)saddr;
		NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
			sizeof (struct in6_addr));
		newrp->rc_flag |= RC_INETIPV6;
	}
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	TAILQ_INSERT_TAIL(&nfsrvudplru, newrp, rc_lru);
	NFSUNLOCKCACHE();
	nd->nd_rp = newrp;
	return (RC_DOIT);
}

/*
 * Update a request cache entry after the rpc has been done
 */
APPLESTATIC struct nfsrvcache *
nfsrvd_updatecache(struct nfsrv_descript *nd, struct socket *so)
{
	struct nfsrvcache *rp;
	struct nfsrvcache *retrp = NULL;

	rp = nd->nd_rp;
	if (!rp)
		panic("nfsrvd_updatecache null rp");
	nd->nd_rp = NULL;
	NFSLOCKCACHE();
	nfsrc_lock(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_updatecache not inprog");
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
	}

	/*
	 * Reply from cache is a special case returned by nfsrv_checkseqid().
	 */
	if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
		newnfsstats.srvcache_nonidemdonehits++;
		NFSUNLOCKCACHE();
		nd->nd_repstat = 0;
		if (nd->nd_mreq)
			mbuf_freem(nd->nd_mreq);
		if (!(rp->rc_flag & RC_REPMBUF))
			panic("reply from cache");
		nd->nd_mreq = m_copym(rp->rc_reply, 0,
		    M_COPYALL, M_WAIT);
		rp->rc_timestamp = NFSD_MONOSEC + NFSRVCACHE_TCPTIMEOUT;
		nfsrc_unlock(rp);
		nfsrc_trimcache(nd->nd_sockref, so);
		return (retrp);
	}

	/*
	 * If rc_refcnt > 0, save it
	 * For UDP, save it if ND_SAVEREPLY is set
	 * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set
	 */
	if (nd->nd_repstat != NFSERR_DONTREPLY &&
	    (rp->rc_refcnt > 0 ||
	     ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
	     ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
	      nfsrc_tcpsavedreplies <= nfsrc_floodlevel &&
	      nfsrc_tcpnonidempotent))) {
		if (rp->rc_refcnt > 0) {
			if (!(rp->rc_flag & RC_NFSV4))
				panic("update_cache refcnt");
			rp->rc_flag |= RC_REFCNT;
		}
		if ((nd->nd_flag & ND_NFSV2) &&
		    nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
			NFSUNLOCKCACHE();
			rp->rc_status = nd->nd_repstat;
			rp->rc_flag |= RC_REPSTATUS;
		} else {
			if (!(rp->rc_flag & RC_UDP)) {
			    nfsrc_tcpsavedreplies++;
			    if (nfsrc_tcpsavedreplies >
				newnfsstats.srvcache_tcppeak)
				newnfsstats.srvcache_tcppeak =
				    nfsrc_tcpsavedreplies;
			}
			NFSUNLOCKCACHE();
			rp->rc_reply = m_copym(nd->nd_mreq, 0, M_COPYALL,
			    M_WAIT);
			rp->rc_flag |= RC_REPMBUF;
		}
		if (rp->rc_flag & RC_UDP) {
			rp->rc_timestamp = NFSD_MONOSEC +
			    NFSRVCACHE_UDPTIMEOUT;
			nfsrc_unlock(rp);
		} else {
			rp->rc_timestamp = NFSD_MONOSEC +
			    NFSRVCACHE_TCPTIMEOUT;
			if (rp->rc_refcnt > 0)
				nfsrc_unlock(rp);
			else
				retrp = rp;
		}
	} else {
		nfsrc_freecache(rp);
		NFSUNLOCKCACHE();
	}
	nfsrc_trimcache(nd->nd_sockref, so);
	return (retrp);
}

/*
 * Invalidate and, if possible, free an in prog cache entry.
 * Must not sleep.
 */
APPLESTATIC void
nfsrvd_delcache(struct nfsrvcache *rp)
{

	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_delcache not in prog");
	NFSLOCKCACHE();
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
		nfsrc_freecache(rp);
	NFSUNLOCKCACHE();
}

/*
 * Called after nfsrvd_updatecache() once the reply is sent, to update
 * the entry for nfsrc_activesocket() and unlock it. The argument is
 * the pointer returned by nfsrvd_updatecache().
 */
APPLESTATIC void
nfsrvd_sentcache(struct nfsrvcache *rp, struct socket *so, int err)
{

	if (!(rp->rc_flag & RC_LOCKED))
		panic("nfsrvd_sentcache not locked");
	if (!err) {
		if (so->so_proto->pr_domain->dom_family != AF_INET ||
		    so->so_proto->pr_protocol != IPPROTO_TCP)
			panic("nfs sent cache");
		if (nfsrv_getsockseqnum(so, &rp->rc_tcpseq))
			rp->rc_flag |= RC_TCPSEQ;
	}
	nfsrc_unlock(rp);
}

/*
 * Get a cache entry for TCP
 * - key on <xid, nfs version>
 *   (allow multiple entries for a given key)
 */
static int
nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp, *nextrp;
	int i;
	struct nfsrvcache *hitrp;
	struct nfsrvhashhead *hp, nfsrc_templist;
	int hit, ret = 0;

	hp = NFSRCHASH(newrp->rc_xid);
	newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
tryagain:
	NFSLOCKCACHE();
	hit = 1;
	LIST_INIT(&nfsrc_templist);
	/*
	 * Get all the matches and put them on the temp list.
	 */
	rp = LIST_FIRST(hp);
	while (rp != LIST_END(hp)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		if (newrp->rc_xid == rp->rc_xid &&
		    (!(rp->rc_flag & RC_INPROG) ||
		     ((newrp->rc_flag & RC_SAMETCPCONN) &&
		      newrp->rc_sockref == rp->rc_sockref)) &&
		    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		    newrp->rc_proc == rp->rc_proc &&
		    (!(newrp->rc_flag & RC_NFSV4) ||
		     (newrp->rc_sockref != rp->rc_sockref &&
		      newrp->rc_cachetime >= rp->rc_cachetime)) &&
		    newrp->rc_reqlen == rp->rc_reqlen &&
		    newrp->rc_cksum == rp->rc_cksum) {
			LIST_REMOVE(rp, rc_hash);
			LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
		}
		rp = nextrp;
	}

	/*
	 * Now, use nfsrc_templist to decide if there is a match.
	 */
	i = 0;
	LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
		i++;
		if (rp->rc_refcnt > 0) {
			hit = 0;
			break;
		}
	}
	/*
	 * Can be a hit only if one entry left.
	 * Note possible hit entry and put nfsrc_templist back on hash
	 * list.
	 */
	if (i != 1)
		hit = 0;
	hitrp = rp = LIST_FIRST(&nfsrc_templist);
	while (rp != LIST_END(&nfsrc_templist)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		LIST_REMOVE(rp, rc_hash);
		LIST_INSERT_HEAD(hp, rp, rc_hash);
		rp = nextrp;
	}
	if (LIST_FIRST(&nfsrc_templist) != LIST_END(&nfsrc_templist))
		panic("nfs gettcp cache templist");

	if (hit) {
		rp = hitrp;
		if ((rp->rc_flag & RC_LOCKED) != 0) {
			rp->rc_flag |= RC_WANTED;
			NFSUNLOCKCACHE();
			(void) tsleep((caddr_t)rp, PZERO-1, "nfsrc", 10 * hz);
			goto tryagain;
		}
		if (rp->rc_flag == 0)
			panic("nfs tcp cache0");
		rp->rc_flag |= RC_LOCKED;
		if (rp->rc_flag & RC_INPROG) {
			newnfsstats.srvcache_inproghits++;
			NFSUNLOCKCACHE();
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_DROPIT;
		} else if (rp->rc_flag & RC_REPSTATUS) {
			/*
			 * V2 only.
			 */
			newnfsstats.srvcache_nonidemdonehits++;
			NFSUNLOCKCACHE();
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nfsrvd_rephead(nd);
			*(nd->nd_errp) = rp->rc_status;
			rp->rc_timestamp = NFSD_MONOSEC +
				NFSRVCACHE_TCPTIMEOUT;
		} else if (rp->rc_flag & RC_REPMBUF) {
			newnfsstats.srvcache_nonidemdonehits++;
			NFSUNLOCKCACHE();
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nd->nd_mreq = m_copym(rp->rc_reply, 0,
				M_COPYALL, M_WAIT);
			rp->rc_timestamp = NFSD_MONOSEC +
				NFSRVCACHE_TCPTIMEOUT;
		} else {
			panic("nfs tcp cache1");
		}
		nfsrc_unlock(rp);
		free((caddr_t)newrp, M_NFSRVCACHE);
		return (ret);
	}
	newnfsstats.srvcache_misses++;
	newnfsstats.srvcache_size++;

	/*
	 * For TCP, multiple entries for a key are allowed, so the new
	 * in-progress entry can simply be initialized and chained into
	 * the hash table right away.
	 */
	newrp->rc_cachetime = NFSD_MONOSEC;
	newrp->rc_flag |= RC_INPROG;
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	NFSUNLOCKCACHE();
	nd->nd_rp = newrp;
	return (RC_DOIT);
}

/*
 * Lock a cache entry.
 * Also puts a mutex lock on the cache list.
 */
static void
nfsrc_lock(struct nfsrvcache *rp)
{
	NFSCACHELOCKREQUIRED();
	while ((rp->rc_flag & RC_LOCKED) != 0) {
		rp->rc_flag |= RC_WANTED;
		(void) nfsmsleep((caddr_t)rp, NFSCACHEMUTEXPTR, PZERO - 1,
		    "nfsrc", 0);
	}
	rp->rc_flag |= RC_LOCKED;
}

/*
 * Unlock a cache entry.
 */
static void
nfsrc_unlock(struct nfsrvcache *rp)
{
	rp->rc_flag &= ~RC_LOCKED;
	nfsrc_wanted(rp);
}

/*
 * Wakeup anyone wanting entry.
 */
static void
nfsrc_wanted(struct nfsrvcache *rp)
{
	if (rp->rc_flag & RC_WANTED) {
		rp->rc_flag &= ~RC_WANTED;
		wakeup((caddr_t)rp);
	}
}

/*
 * Free up the entry.
 * Must not sleep.
 */
static void
nfsrc_freecache(struct nfsrvcache *rp)
{

	NFSCACHELOCKREQUIRED();
	LIST_REMOVE(rp, rc_hash);
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		nfsrc_udpcachesize--;
	}
	nfsrc_wanted(rp);
	if (rp->rc_flag & RC_REPMBUF) {
		mbuf_freem(rp->rc_reply);
		if (!(rp->rc_flag & RC_UDP))
			nfsrc_tcpsavedreplies--;
	}
	FREE((caddr_t)rp, M_NFSRVCACHE);
	newnfsstats.srvcache_size--;
}

#ifdef notdef
/*
 * Clean out the cache. Called when the last nfsd terminates.
 */
APPLESTATIC void
nfsrvd_cleancache(void)
{
	struct nfsrvcache *rp, *nextrp;
	int i;

	NFSLOCKCACHE();
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_FOREACH_SAFE(rp, &nfsrvhashtbl[i], rc_hash, nextrp) {
			nfsrc_freecache(rp);
		}
	}
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_FOREACH_SAFE(rp, &nfsrvudphashtbl[i], rc_hash, nextrp) {
			nfsrc_freecache(rp);
		}
	}
	newnfsstats.srvcache_size = 0;
	nfsrc_tcpsavedreplies = 0;
	NFSUNLOCKCACHE();
}
#endif	/* notdef */

/*
 * The basic rule is to get rid of entries that are expired.
 */
static void
nfsrc_trimcache(u_int64_t sockref, struct socket *so)
{
	struct nfsrvcache *rp, *nextrp;
	int i;

	NFSLOCKCACHE();
	TAILQ_FOREACH_SAFE(rp, &nfsrvudplru, rc_lru, nextrp) {
		if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
		     && rp->rc_refcnt == 0
		     && ((rp->rc_flag & RC_REFCNT) ||
			 NFSD_MONOSEC > rp->rc_timestamp ||
			 nfsrc_udpcachesize > nfsrc_udphighwater))
			nfsrc_freecache(rp);
	}
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_FOREACH_SAFE(rp, &nfsrvhashtbl[i], rc_hash, nextrp) {
			if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
			     && rp->rc_refcnt == 0
			     && ((rp->rc_flag & RC_REFCNT) ||
				 NFSD_MONOSEC > rp->rc_timestamp ||
				 nfsrc_activesocket(rp, sockref, so)))
				nfsrc_freecache(rp);
		}
	}
	NFSUNLOCKCACHE();
}

/*
 * Add a seqid# reference to the cache entry.
 */
APPLESTATIC void
nfsrvd_refcache(struct nfsrvcache *rp)
{

	NFSLOCKCACHE();
	if (rp->rc_refcnt < 0)
		panic("nfs cache refcnt");
	rp->rc_refcnt++;
	NFSUNLOCKCACHE();
}

/*
 * Dereference a seqid# cache entry.
 */
APPLESTATIC void
nfsrvd_derefcache(struct nfsrvcache *rp)
{

	NFSLOCKCACHE();
	if (rp->rc_refcnt <= 0)
		panic("nfs cache derefcnt");
	rp->rc_refcnt--;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
		nfsrc_freecache(rp);
	NFSUNLOCKCACHE();
}

/*
 * Check to see if the socket is active.
 * Return 1 if the reply has been received/acknowledged by the client,
 * 0 otherwise.
 * XXX - Uses tcp internals.
 */
static int
nfsrc_activesocket(struct nfsrvcache *rp, u_int64_t cur_sockref,
    struct socket *cur_so)
{
	int ret = 0;

	if (!(rp->rc_flag & RC_TCPSEQ))
		return (ret);
	/*
	 * If the sockref is the same, it is the same TCP connection.
	 */
	if (cur_sockref == rp->rc_sockref)
		ret = nfsrv_checksockseqnum(cur_so, rp->rc_tcpseq);
	return (ret);
}

/*
 * Calculate the length of the mbuf list and a checksum on the first up to
 * NFSRVCACHE_CHECKLEN bytes.
 */
static int
nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum)
{
	int len = 0, cklen;
	mbuf_t m;

	m = m1;
	while (m) {
		len += mbuf_len(m);
		m = mbuf_next(m);
	}
	cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
	*cksum = in_cksum(m1, cklen);
	return (len);
}

/*
 * Mark a TCP connection that is seeing retries. Should never happen for
 * NFSv4.
 */
static void
nfsrc_marksametcpconn(u_int64_t sockref)
{
}