/*-
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 * Here is the basic algorithm:
 * First, some design criteria I used:
 * - I think a false hit is more serious than a false miss
 * - A false hit for an RPC that has Op(s) that order via seqid# must be
 *   avoided at all cost
 * - A valid hit will probably happen a long time after the original reply
 *   and the TCP socket that the original request was received on will no
 *   longer be active
 *   (The long time delay implies to me that LRU is not appropriate.)
 * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
 *   in them as well as minimizing the risk of redoing retried non-idempotent
 *   Ops.
 * Because it is biased towards avoiding false hits, multiple entries with
 * the same xid are to be expected, especially for the case of the entry
 * in the cache being related to a seqid# sequenced Op.
 *
 * The basic algorithm I'm about to code up:
 * - Null RPCs bypass the cache and are just done
 * For TCP
 * 	- key on <xid, NFS version> (as noted above, there can be several
 * 				     entries with the same key)
 * 	When a request arrives:
 * 		For all that match key
 * 		- if RPC# != OR request_size !=
 * 			- not a match with this one
 * 		- if NFSv4 and received on same TCP socket OR
 *			received on a TCP connection created before the
 *			entry was cached
 * 			- not a match with this one
 * 			(V2,3 clients might retry on same TCP socket)
 * 		- calculate checksum on first N bytes of NFS XDR
 * 		- if checksum !=
 * 			- not a match for this one
 * 		If any of the remaining ones that match has a
 * 			seqid_refcnt > 0
 * 			- not a match (go do RPC, using new cache entry)
 * 		If one match left
 * 			- a hit (reply from cache)
 * 		else
 * 			- miss (go do RPC, using new cache entry)
 *
 * 	During processing of NFSv4 request:
 * 		- set a flag when a non-idempotent Op is processed
 * 		- when an Op that uses a seqid# (Open,...) is processed
 * 			- if same seqid# as referenced entry in cache
 * 				- free new cache entry
 * 				- reply from referenced cache entry
 * 			  else if next seqid# in order
 * 				- free referenced cache entry
 * 				- increment seqid_refcnt on new cache entry
 * 				- set pointer from Openowner/Lockowner to
 * 					new cache entry (aka reference it)
 * 			  else if first seqid# in sequence
 * 				- increment seqid_refcnt on new cache entry
 * 				- set pointer from Openowner/Lockowner to
 * 					new cache entry (aka reference it)
 *
 * 	At end of RPC processing:
 * 		- if seqid_refcnt > 0 OR flagged non-idempotent on new
 * 			cache entry
 * 			- save reply in cache entry
 * 			- calculate checksum on first N bytes of NFS XDR
 * 				request
 * 			- note op and length of XDR request (in bytes)
 * 			- timestamp it
 * 		  else
 * 			- free new cache entry
 * 		- Send reply (noting info for socket activity check, below)
 *
 * 	For cache entries saved above:
 * 		- if saved since seqid_refcnt was > 0
 * 			- free when seqid_refcnt decrements to 0
 * 			  (when next one in sequence is processed above, or
 * 			   when Openowner/Lockowner is discarded)
 * 		  else { non-idempotent Op(s) }
 * 			- free when
 * 				- some further activity observed on same
 * 					socket
 * 				  (I'm not yet sure how I'm going to do
 * 				   this. Maybe look at the TCP connection
 * 				   to see if the send_tcp_sequence# is well
 * 				   past sent reply OR K additional RPCs
 * 				   replied on same socket OR?)
 * 			  OR
 * 				- when very old (hours, days, weeks?)
 *
 * For UDP (v2, 3 only), pretty much the old way:
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 *
 * When a Request arrives:
 * - if a match with entry via key
 * 	- if RPC marked In_progress
 * 		- discard request (don't send reply)
 * 	  else
 * 		- reply from cache
 * 		- timestamp cache entry
 *   else
 * 	- add entry to cache, marked In_progress
 * 	- do RPC
 * 	- when RPC done
 * 		- if RPC# non-idempotent
 * 			- mark entry Done (not In_progress)
 * 			- save reply
 * 			- timestamp cache entry
 * 		  else
 * 			- free cache entry
 * 		- send reply
 *
 * Later, entries with saved replies are free'd a short time (few minutes)
 * after reply sent (timestamp).
 * Reference: Chet Juszczak, "Improving the Performance and Correctness
 *		of an NFS Server", in Proc. Winter 1989 USENIX Conference,
 *		pages 53-63. San Diego, February 1989.
 *	 for the UDP case.
 * nfsrc_floodlevel is set to the allowable upper limit for saved replies
 *	for TCP. For V3, a reply won't be saved when the flood level is
 *	hit. For V4, the non-idempotent Op will return NFSERR_RESOURCE in
 *	that case. This level should be set high enough that this almost
 *	never happens.
 */
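
/*
 * An illustrative (non-normative) example of the TCP rules above:
 * suppose a client retransmits an NFSv3 request with xid 0x1234 on a
 * new TCP connection.  The lookup walks every cached entry keyed
 * <0x1234, v3>, discards those whose RPC#, request length or checksum
 * differ, and replies from the cache only if exactly one candidate
 * remains and no candidate is referenced by a seqid# sequence;
 * otherwise the RPC is simply re-executed with a fresh cache entry.
 */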
#ifndef APPLEKEXT
#include <fs/nfs/nfsport.h>

extern struct nfsstats newnfsstats;
NFSCACHEMUTEX;
int nfsrc_floodlevel = NFSRVCACHE_FLOODLEVEL, nfsrc_tcpsavedreplies = 0;
#endif	/* !APPLEKEXT */

static int nfsrc_tcpnonidempotent = 1;
static int nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER, nfsrc_udpcachesize = 0;
static TAILQ_HEAD(, nfsrvcache) nfsrvudplru;
static struct nfsrvhashhead nfsrvhashtbl[NFSRVCACHE_HASHSIZE],
    nfsrvudphashtbl[NFSRVCACHE_HASHSIZE];
/*
 * The reverse mapping from generic to Version 2 procedure numbers.
 */
static int newnfsv2_procid[NFS_V3NPROCS] = {
	NFSV2PROC_NULL,
	NFSV2PROC_GETATTR,
	NFSV2PROC_SETATTR,
	NFSV2PROC_LOOKUP,
	NFSV2PROC_NOOP,
	NFSV2PROC_READLINK,
	NFSV2PROC_READ,
	NFSV2PROC_WRITE,
	NFSV2PROC_CREATE,
	NFSV2PROC_MKDIR,
	NFSV2PROC_SYMLINK,
	NFSV2PROC_CREATE,
	NFSV2PROC_REMOVE,
	NFSV2PROC_RMDIR,
	NFSV2PROC_RENAME,
	NFSV2PROC_LINK,
	NFSV2PROC_READDIR,
	NFSV2PROC_NOOP,
	NFSV2PROC_STATFS,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
};

#define	NFSRCUDPHASH(xid) \
	(&nfsrvudphashtbl[((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE])
#define	NFSRCHASH(xid) \
	(&nfsrvhashtbl[((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE])
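/*
 * For example, assuming the stock NFSRVCACHE_HASHSIZE of 20, an xid of
 * 0x10000005 hashes to bucket (0x10000005 + 0x10) % 20 == 17.  Folding
 * the high-order byte of the xid into the sum spreads out clients whose
 * xids differ mainly in their upper bits.
 */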
#define	TRUE	1
#define	FALSE	0
#define	NFSRVCACHE_CHECKLEN	100

/* True iff the rpc reply is an nfs status ONLY! */
static int nfsv2_repstat[NFS_V3NPROCS] = {
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	TRUE,
	TRUE,
	TRUE,
	TRUE,
	FALSE,
	TRUE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
};

/*
 * Pick the address family recorded for this cache entry (IPv4 or IPv6).
 */
#define	NETFAMILY(rp) \
		(((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)

/* local functions */
static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static void nfsrc_lock(struct nfsrvcache *rp);
static void nfsrc_unlock(struct nfsrvcache *rp);
static void nfsrc_wanted(struct nfsrvcache *rp);
static void nfsrc_freecache(struct nfsrvcache *rp);
static void nfsrc_trimcache(u_int64_t, struct socket *);
static int nfsrc_activesocket(struct nfsrvcache *rp, u_int64_t,
    struct socket *);
static int nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum);
static void nfsrc_marksametcpconn(u_int64_t);

/*
 * Initialize the server request cache list
 */
APPLESTATIC void
nfsrvd_initcache(void)
{
	int i;
	static int inited = 0;

	if (inited)
		return;
	inited = 1;
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_INIT(&nfsrvudphashtbl[i]);
		LIST_INIT(&nfsrvhashtbl[i]);
	}
	TAILQ_INIT(&nfsrvudplru);
	nfsrc_tcpsavedreplies = 0;
	nfsrc_udpcachesize = 0;
	newnfsstats.srvcache_tcppeak = 0;
	newnfsstats.srvcache_size = 0;
}

/*
 * Get a cache entry for this request. Basically just malloc a new one
 * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
 * Call nfsrc_trimcache() to clean up the cache before returning.
 */
APPLESTATIC int
nfsrvd_getcache(struct nfsrv_descript *nd, struct socket *so)
{
	struct nfsrvcache *newrp;
	int ret;

	if (nd->nd_procnum == NFSPROC_NULL)
		panic("nfsd cache null");
	MALLOC(newrp, struct nfsrvcache *, sizeof (struct nfsrvcache),
	    M_NFSRVCACHE, M_WAITOK);
	NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
	if (nd->nd_flag & ND_NFSV4)
		newrp->rc_flag = RC_NFSV4;
	else if (nd->nd_flag & ND_NFSV3)
		newrp->rc_flag = RC_NFSV3;
	else
		newrp->rc_flag = RC_NFSV2;
	newrp->rc_xid = nd->nd_retxid;
	newrp->rc_proc = nd->nd_procnum;
	newrp->rc_sockref = nd->nd_sockref;
	newrp->rc_cachetime = nd->nd_tcpconntime;
	if (nd->nd_flag & ND_SAMETCPCONN)
		newrp->rc_flag |= RC_SAMETCPCONN;
	if (nd->nd_nam2 != NULL) {
		newrp->rc_flag |= RC_UDP;
		ret = nfsrc_getudp(nd, newrp);
	} else {
		ret = nfsrc_gettcp(nd, newrp);
	}
	nfsrc_trimcache(nd->nd_sockref, so);
	NFSEXITCODE2(0, nd);
	return (ret);
}
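
/*
 * A minimal sketch of the intended calling sequence, assuming a
 * hypothetical caller (the real one lives in the nfsd request path):
 *
 *	ret = nfsrvd_getcache(nd, so);
 *	if (ret == RC_DROPIT)
 *		drop the request (a duplicate is in progress)
 *	else if (ret == RC_REPLY)
 *		send nd->nd_mreq, which was rebuilt from the cache
 *	else if (ret == RC_DOIT) {
 *		perform the RPC;
 *		rp = nfsrvd_updatecache(nd, so);
 *		send the reply;
 *		if (rp != NULL)
 *			nfsrvd_sentcache(rp, so, error);
 *	}
 */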

/*
 * For UDP (v2, v3):
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 */
static int
nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp;
	struct sockaddr_in *saddr;
	struct sockaddr_in6 *saddr6;
	struct nfsrvhashhead *hp;
	int ret = 0;

	hp = NFSRCUDPHASH(newrp->rc_xid);
loop:
	NFSLOCKCACHE();
	LIST_FOREACH(rp, hp, rc_hash) {
	    if (newrp->rc_xid == rp->rc_xid &&
		newrp->rc_proc == rp->rc_proc &&
		(newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
			if ((rp->rc_flag & RC_LOCKED) != 0) {
				rp->rc_flag |= RC_WANTED;
				NFSUNLOCKCACHE();
				(void) tsleep((caddr_t)rp, PZERO - 1,
				    "nfsrc", 10 * hz);
				goto loop;
			}
			if (rp->rc_flag == 0)
				panic("nfs udp cache0");
			rp->rc_flag |= RC_LOCKED;
			TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
			TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
			if (rp->rc_flag & RC_INPROG) {
				newnfsstats.srvcache_inproghits++;
				NFSUNLOCKCACHE();
				ret = RC_DROPIT;
			} else if (rp->rc_flag & RC_REPSTATUS) {
				/*
				 * V2 only.
				 */
				newnfsstats.srvcache_nonidemdonehits++;
				NFSUNLOCKCACHE();
				nfsrvd_rephead(nd);
				*(nd->nd_errp) = rp->rc_status;
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
					NFSRVCACHE_UDPTIMEOUT;
			} else if (rp->rc_flag & RC_REPMBUF) {
				newnfsstats.srvcache_nonidemdonehits++;
				NFSUNLOCKCACHE();
				nd->nd_mreq = m_copym(rp->rc_reply, 0,
					M_COPYALL, M_WAIT);
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
					NFSRVCACHE_UDPTIMEOUT;
			} else {
				panic("nfs udp cache1");
			}
			nfsrc_unlock(rp);
			free((caddr_t)newrp, M_NFSRVCACHE);
			goto out;
		}
	}
	newnfsstats.srvcache_misses++;
	newnfsstats.srvcache_size++;
	nfsrc_udpcachesize++;

	newrp->rc_flag |= RC_INPROG;
	saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
	if (saddr->sin_family == AF_INET)
		newrp->rc_inet = saddr->sin_addr.s_addr;
	else if (saddr->sin_family == AF_INET6) {
		saddr6 = (struct sockaddr_in6 *)saddr;
		NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
		    sizeof (struct in6_addr));
		newrp->rc_flag |= RC_INETIPV6;
	}
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	TAILQ_INSERT_TAIL(&nfsrvudplru, newrp, rc_lru);
	NFSUNLOCKCACHE();
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Update a request cache entry after the rpc has been done
 */
APPLESTATIC struct nfsrvcache *
nfsrvd_updatecache(struct nfsrv_descript *nd, struct socket *so)
{
	struct nfsrvcache *rp;
	struct nfsrvcache *retrp = NULL;
	mbuf_t m;

	rp = nd->nd_rp;
	if (!rp)
		panic("nfsrvd_updatecache null rp");
	nd->nd_rp = NULL;
	NFSLOCKCACHE();
	nfsrc_lock(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_updatecache not inprog");
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
	}

	/*
	 * Reply from cache is a special case returned by nfsrv_checkseqid().
	 */
	if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
		newnfsstats.srvcache_nonidemdonehits++;
		NFSUNLOCKCACHE();
		nd->nd_repstat = 0;
		if (nd->nd_mreq)
			mbuf_freem(nd->nd_mreq);
		if (!(rp->rc_flag & RC_REPMBUF))
			panic("reply from cache");
		nd->nd_mreq = m_copym(rp->rc_reply, 0,
		    M_COPYALL, M_WAIT);
		rp->rc_timestamp = NFSD_MONOSEC + NFSRVCACHE_TCPTIMEOUT;
		nfsrc_unlock(rp);
		goto out;
	}

	/*
	 * If rc_refcnt > 0, save the reply.
	 * For UDP, save it if ND_SAVEREPLY is set.
	 * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent
	 * are set and the flood level has not been exceeded.
	 */
	if (nd->nd_repstat != NFSERR_DONTREPLY &&
	    (rp->rc_refcnt > 0 ||
	     ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
	     ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
	      nfsrc_tcpsavedreplies <= nfsrc_floodlevel &&
	      nfsrc_tcpnonidempotent))) {
		if (rp->rc_refcnt > 0) {
			if (!(rp->rc_flag & RC_NFSV4))
				panic("update_cache refcnt");
			rp->rc_flag |= RC_REFCNT;
		}
		if ((nd->nd_flag & ND_NFSV2) &&
		    nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
			rp->rc_status = nd->nd_repstat;
			rp->rc_flag |= RC_REPSTATUS;
			NFSUNLOCKCACHE();
		} else {
			if (!(rp->rc_flag & RC_UDP)) {
			    nfsrc_tcpsavedreplies++;
			    if (nfsrc_tcpsavedreplies >
				newnfsstats.srvcache_tcppeak)
				newnfsstats.srvcache_tcppeak =
				    nfsrc_tcpsavedreplies;
			}
			NFSUNLOCKCACHE();
			m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAIT);
			NFSLOCKCACHE();
			rp->rc_reply = m;
			rp->rc_flag |= RC_REPMBUF;
			NFSUNLOCKCACHE();
		}
		if (rp->rc_flag & RC_UDP) {
			rp->rc_timestamp = NFSD_MONOSEC +
			    NFSRVCACHE_UDPTIMEOUT;
			nfsrc_unlock(rp);
		} else {
			rp->rc_timestamp = NFSD_MONOSEC +
			    NFSRVCACHE_TCPTIMEOUT;
			if (rp->rc_refcnt > 0)
				nfsrc_unlock(rp);
			else
				retrp = rp;
		}
	} else {
		nfsrc_freecache(rp);
		NFSUNLOCKCACHE();
	}

out:
	nfsrc_trimcache(nd->nd_sockref, so);
	NFSEXITCODE2(0, nd);
	return (retrp);
}

/*
 * Invalidate and, if possible, free an in prog cache entry.
 * Must not sleep.
 */
APPLESTATIC void
nfsrvd_delcache(struct nfsrvcache *rp)
{

	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_delcache not in prog");
	NFSLOCKCACHE();
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
		nfsrc_freecache(rp);
	NFSUNLOCKCACHE();
}

/*
 * Called after nfsrvd_updatecache() once the reply is sent, to update
 * the entry for nfsrc_activesocket() and unlock it. The argument is
 * the pointer returned by nfsrvd_updatecache().
 */
APPLESTATIC void
nfsrvd_sentcache(struct nfsrvcache *rp, struct socket *so, int err)
{
	tcp_seq tmp_seq;

	if (!(rp->rc_flag & RC_LOCKED))
		panic("nfsrvd_sentcache not locked");
	if (!err) {
		if ((so->so_proto->pr_domain->dom_family != AF_INET &&
		     so->so_proto->pr_domain->dom_family != AF_INET6) ||
		     so->so_proto->pr_protocol != IPPROTO_TCP)
			panic("nfs sent cache");
		if (nfsrv_getsockseqnum(so, &tmp_seq)) {
			NFSLOCKCACHE();
			rp->rc_tcpseq = tmp_seq;
			rp->rc_flag |= RC_TCPSEQ;
			NFSUNLOCKCACHE();
		}
	}
	nfsrc_unlock(rp);
}

/*
 * Get a cache entry for TCP
 * - key on <xid, nfs version>
 *   (allow multiple entries for a given key)
 */
static int
nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp, *nextrp;
	int i;
	struct nfsrvcache *hitrp;
	struct nfsrvhashhead *hp, nfsrc_templist;
	int hit, ret = 0;

	hp = NFSRCHASH(newrp->rc_xid);
	newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
tryagain:
	NFSLOCKCACHE();
	hit = 1;
	LIST_INIT(&nfsrc_templist);
	/*
	 * Get all the matches and put them on the temp list.
	 */
	rp = LIST_FIRST(hp);
	while (rp != LIST_END(hp)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		if (newrp->rc_xid == rp->rc_xid &&
		    (!(rp->rc_flag & RC_INPROG) ||
		     ((newrp->rc_flag & RC_SAMETCPCONN) &&
		      newrp->rc_sockref == rp->rc_sockref)) &&
		    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		    newrp->rc_proc == rp->rc_proc &&
		    (!(newrp->rc_flag & RC_NFSV4) ||
		     (newrp->rc_sockref != rp->rc_sockref &&
		      newrp->rc_cachetime >= rp->rc_cachetime)) &&
		    newrp->rc_reqlen == rp->rc_reqlen &&
		    newrp->rc_cksum == rp->rc_cksum) {
			LIST_REMOVE(rp, rc_hash);
			LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
		}
		rp = nextrp;
	}

	/*
	 * Now, use nfsrc_templist to decide if there is a match.
	 */
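	/*
	 * A hit requires that exactly one matching entry remains and
	 * that no match is still referenced by a seqid# sequence; with
	 * two or more candidates the correct reply is ambiguous, so the
	 * request is re-executed with a new entry (see the design notes
	 * at the top of this file).
	 */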
	i = 0;
	LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
		i++;
		if (rp->rc_refcnt > 0) {
			hit = 0;
			break;
		}
	}
	/*
	 * Can be a hit only if one entry left.
	 * Note possible hit entry and put nfsrc_templist back on hash
	 * list.
	 */
	if (i != 1)
		hit = 0;
	hitrp = rp = LIST_FIRST(&nfsrc_templist);
	while (rp != LIST_END(&nfsrc_templist)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		LIST_REMOVE(rp, rc_hash);
		LIST_INSERT_HEAD(hp, rp, rc_hash);
		rp = nextrp;
	}
	if (LIST_FIRST(&nfsrc_templist) != LIST_END(&nfsrc_templist))
		panic("nfs gettcp cache templist");

	if (hit) {
		rp = hitrp;
		if ((rp->rc_flag & RC_LOCKED) != 0) {
			rp->rc_flag |= RC_WANTED;
			NFSUNLOCKCACHE();
			(void) tsleep((caddr_t)rp, PZERO-1, "nfsrc", 10 * hz);
			goto tryagain;
		}
		if (rp->rc_flag == 0)
			panic("nfs tcp cache0");
		rp->rc_flag |= RC_LOCKED;
		if (rp->rc_flag & RC_INPROG) {
			newnfsstats.srvcache_inproghits++;
			NFSUNLOCKCACHE();
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_DROPIT;
		} else if (rp->rc_flag & RC_REPSTATUS) {
			/*
			 * V2 only.
			 */
			newnfsstats.srvcache_nonidemdonehits++;
			NFSUNLOCKCACHE();
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nfsrvd_rephead(nd);
			*(nd->nd_errp) = rp->rc_status;
			rp->rc_timestamp = NFSD_MONOSEC +
				NFSRVCACHE_TCPTIMEOUT;
		} else if (rp->rc_flag & RC_REPMBUF) {
			newnfsstats.srvcache_nonidemdonehits++;
			NFSUNLOCKCACHE();
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nd->nd_mreq = m_copym(rp->rc_reply, 0,
				M_COPYALL, M_WAIT);
			rp->rc_timestamp = NFSD_MONOSEC +
				NFSRVCACHE_TCPTIMEOUT;
		} else {
			panic("nfs tcp cache1");
		}
		nfsrc_unlock(rp);
		free((caddr_t)newrp, M_NFSRVCACHE);
		goto out;
	}
	newnfsstats.srvcache_misses++;
	newnfsstats.srvcache_size++;

	/*
	 * For TCP, multiple entries for a key are allowed, so the new
	 * entry can be chained into the hash table right away; it stays
	 * marked RC_INPROG until the RPC has been done.
	 */
	newrp->rc_cachetime = NFSD_MONOSEC;
	newrp->rc_flag |= RC_INPROG;
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	NFSUNLOCKCACHE();
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Lock a cache entry.
 * The cache list mutex must already be held when this is called.
 */
static void
nfsrc_lock(struct nfsrvcache *rp)
{
	NFSCACHELOCKREQUIRED();
	while ((rp->rc_flag & RC_LOCKED) != 0) {
		rp->rc_flag |= RC_WANTED;
		(void) nfsmsleep((caddr_t)rp, NFSCACHEMUTEXPTR, PZERO - 1,
		    "nfsrc", 0);
	}
	rp->rc_flag |= RC_LOCKED;
}

/*
 * Unlock a cache entry.
 */
static void
nfsrc_unlock(struct nfsrvcache *rp)
{

	NFSLOCKCACHE();
	rp->rc_flag &= ~RC_LOCKED;
	nfsrc_wanted(rp);
	NFSUNLOCKCACHE();
}

/*
 * Wakeup anyone wanting entry.
 */
static void
nfsrc_wanted(struct nfsrvcache *rp)
{
	if (rp->rc_flag & RC_WANTED) {
		rp->rc_flag &= ~RC_WANTED;
		wakeup((caddr_t)rp);
	}
}

/*
 * Free up the entry.
 * Must not sleep.
 */
static void
nfsrc_freecache(struct nfsrvcache *rp)
{

	NFSCACHELOCKREQUIRED();
	LIST_REMOVE(rp, rc_hash);
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		nfsrc_udpcachesize--;
	}
	nfsrc_wanted(rp);
	if (rp->rc_flag & RC_REPMBUF) {
		mbuf_freem(rp->rc_reply);
		if (!(rp->rc_flag & RC_UDP))
			nfsrc_tcpsavedreplies--;
	}
	FREE((caddr_t)rp, M_NFSRVCACHE);
	newnfsstats.srvcache_size--;
}

/*
 * Clean out the cache. Called when nfsserver module is unloaded.
 */
APPLESTATIC void
nfsrvd_cleancache(void)
{
	struct nfsrvcache *rp, *nextrp;
	int i;

	NFSLOCKCACHE();
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_FOREACH_SAFE(rp, &nfsrvhashtbl[i], rc_hash, nextrp) {
			nfsrc_freecache(rp);
		}
	}
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_FOREACH_SAFE(rp, &nfsrvudphashtbl[i], rc_hash, nextrp) {
			nfsrc_freecache(rp);
		}
	}
	newnfsstats.srvcache_size = 0;
	nfsrc_tcpsavedreplies = 0;
	NFSUNLOCKCACHE();
}

/*
 * The basic rule is to get rid of entries that are expired.
 */
static void
nfsrc_trimcache(u_int64_t sockref, struct socket *so)
{
	struct nfsrvcache *rp, *nextrp;
	int i;

	NFSLOCKCACHE();
	TAILQ_FOREACH_SAFE(rp, &nfsrvudplru, rc_lru, nextrp) {
		if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
		     && rp->rc_refcnt == 0
		     && ((rp->rc_flag & RC_REFCNT) ||
			 NFSD_MONOSEC > rp->rc_timestamp ||
			 nfsrc_udpcachesize > nfsrc_udphighwater))
			nfsrc_freecache(rp);
	}
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_FOREACH_SAFE(rp, &nfsrvhashtbl[i], rc_hash, nextrp) {
			if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
			     && rp->rc_refcnt == 0
			     && ((rp->rc_flag & RC_REFCNT) ||
				 NFSD_MONOSEC > rp->rc_timestamp ||
				 nfsrc_activesocket(rp, sockref, so)))
				nfsrc_freecache(rp);
		}
	}
	NFSUNLOCKCACHE();
}

/*
 * Add a seqid# reference to the cache entry.
 */
APPLESTATIC void
nfsrvd_refcache(struct nfsrvcache *rp)
{

	NFSLOCKCACHE();
	if (rp->rc_refcnt < 0)
		panic("nfs cache refcnt");
	rp->rc_refcnt++;
	NFSUNLOCKCACHE();
}

/*
 * Dereference a seqid# cache entry.
 */
APPLESTATIC void
nfsrvd_derefcache(struct nfsrvcache *rp)
{

	NFSLOCKCACHE();
	if (rp->rc_refcnt <= 0)
		panic("nfs cache derefcnt");
	rp->rc_refcnt--;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
		nfsrc_freecache(rp);
	NFSUNLOCKCACHE();
}

/*
 * Check to see if the socket is active.
 * Return 1 if the reply has been received/acknowledged by the client,
 * 0 otherwise.
 * XXX - Uses tcp internals.
 */
static int
nfsrc_activesocket(struct nfsrvcache *rp, u_int64_t cur_sockref,
    struct socket *cur_so)
{
	int ret = 0;

	if (!(rp->rc_flag & RC_TCPSEQ))
		return (ret);
	/*
	 * If the sockref is the same, it is the same TCP connection.
	 */
	if (cur_sockref == rp->rc_sockref)
		ret = nfsrv_checksockseqnum(cur_so, rp->rc_tcpseq);
	return (ret);
}

/*
 * Calculate the length of the mbuf list and a checksum on, at most,
 * the first NFSRVCACHE_CHECKLEN bytes.
 */
static int
nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum)
{
	int len = 0, cklen;
	mbuf_t m;

	m = m1;
	while (m) {
		len += mbuf_len(m);
		m = mbuf_next(m);
	}
	cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
	*cksum = in_cksum(m1, cklen);
	return (len);
}
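
/*
 * Note that the checksum covers at most the first NFSRVCACHE_CHECKLEN
 * (100) bytes of the request, so requests that agree in those bytes are
 * only told apart by the other match keys (xid, procedure and the total
 * length returned here); see nfsrc_gettcp().
 */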

/*
 * Mark a TCP connection that is seeing retries. Should never happen for
 * NFSv4.
 */
static void
nfsrc_marksametcpconn(u_int64_t sockref)
{
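	/*
	 * Currently a no-op.  Called when a retry arrives on the same
	 * TCP connection as the original request (see nfsrc_gettcp());
	 * a natural hook for flagging such connections, cf.
	 * ND_SAMETCPCONN, but intentionally left empty here.
	 */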
}
889