/*-
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 * Here is the basic algorithm:
 * First, some design criteria I used:
 * - I think a false hit is more serious than a false miss
 * - A false hit for an RPC that has Op(s) that order via seqid# must be
 *   avoided at all costs
 * - A valid hit will probably happen a long time after the original reply,
 *   and the TCP socket that the original request was received on will no
 *   longer be active
 *   (The long time delay implies to me that LRU is not appropriate.)
 * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
 *   in them as well as minimizing the risk of redoing retried non-idempotent
 *   Ops.
 * Because it is biased towards avoiding false hits, multiple entries with
 * the same xid are to be expected, especially for the case of the entry
 * in the cache being related to a seqid# sequenced Op.
 *
 * The basic algorithm I'm about to code up:
 * - Null RPCs bypass the cache and are just done
 * For TCP
 * 	- key on <xid, NFS version> (as noted above, there can be several
 * 				     entries with the same key)
 * 	When a request arrives:
 * 		For all that match key
 * 		- if the RPC# or the request size differs
 * 			- not a match with this one
 * 		- if NFSv4 and received on same TCP socket OR
 *			received on a TCP connection created before the
 *			entry was cached
 * 			- not a match with this one
 * 			(V2,3 clients might retry on same TCP socket)
 * 		- calculate checksum on first N bytes of NFS XDR
 * 		- if checksum !=
 * 			- not a match for this one
 * 		If any of the remaining ones that match has a
 * 			seqid_refcnt > 0
 * 			- not a match (go do RPC, using new cache entry)
 * 		If one match left
 * 			- a hit (reply from cache)
 * 		else
 * 			- miss (go do RPC, using new cache entry)
 *
 * 	During processing of NFSv4 request:
 * 		- set a flag when a non-idempotent Op is processed
 * 		- when an Op that uses a seqid# (Open,...) is processed
 * 			- if same seqid# as referenced entry in cache
 * 				- free new cache entry
 * 				- reply from referenced cache entry
 * 			  else if next seqid# in order
 * 				- free referenced cache entry
 * 				- increment seqid_refcnt on new cache entry
 * 				- set pointer from Openowner/Lockowner to
 * 					new cache entry (aka reference it)
 * 			  else if first seqid# in sequence
 * 				- increment seqid_refcnt on new cache entry
 * 				- set pointer from Openowner/Lockowner to
 * 					new cache entry (aka reference it)
 *
 * 	At end of RPC processing:
 * 		- if seqid_refcnt > 0 OR flagged non-idempotent on new
 * 			cache entry
 * 			- save reply in cache entry
 * 			- calculate checksum on first N bytes of NFS XDR
 * 				request
 * 			- note op and length of XDR request (in bytes)
 * 			- timestamp it
 * 		  else
 * 			- free new cache entry
 * 		- Send reply (noting info for socket activity check, below)
 *
 * 	For cache entries saved above:
 * 		- if saved since seqid_refcnt was > 0
 * 			- free when seqid_refcnt decrements to 0
 * 			  (when next one in sequence is processed above, or
 * 			   when Openowner/Lockowner is discarded)
 * 		  else { non-idempotent Op(s) }
 * 			- free when
 * 				- some further activity observed on same
 * 					socket
 * 				  (I'm not yet sure how I'm going to do
 * 				   this. Maybe look at the TCP connection
 * 				   to see if the send_tcp_sequence# is well
 * 				   past sent reply OR K additional RPCs
 * 				   replied on same socket OR?)
 * 			  OR
 * 				- when very old (hours, days, weeks?)
 *
 * For UDP (v2, 3 only), pretty much the old way:
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 *
 * When a Request arrives:
 * - if a match with entry via key
 * 	- if RPC marked In_progress
 * 		- discard request (don't send reply)
 * 	  else
 * 		- reply from cache
 * 		- timestamp cache entry
 *   else
 * 	- add entry to cache, marked In_progress
 * 	- do RPC
 * 	- when RPC done
 * 		- if RPC# non-idempotent
 * 			- mark entry Done (not In_progress)
 * 			- save reply
 * 			- timestamp cache entry
 * 		  else
 * 			- free cache entry
 * 		- send reply
 *
 * Later, entries with saved replies are freed a short time (a few minutes)
 * after the reply is sent (timestamp).
 * Reference (for the UDP case): Chet Juszczak, "Improving the Performance
 *	and Correctness of an NFS Server", in Proc. Winter 1989 USENIX
 *	Conference, pages 53-63, San Diego, February 1989.
 * nfsrc_floodlevel is set to the allowable upper limit for saved replies
 *	for TCP. For V3, a reply won't be saved when the flood level is
 *	hit. For V4, the non-idempotent Op will return NFSERR_RESOURCE in
 *	that case. This level should be set high enough that this almost
 *	never happens.
 */
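
/*
 * Annotation (not part of the original source): a rough sketch of the
 * expected per-RPC calling sequence for the routines below, as a reading
 * aid; the real dispatch lives in the nfsd socket code, so treat this as
 * the contract rather than a verbatim excerpt:
 *
 *	switch (nfsrvd_getcache(nd)) {
 *	case RC_DROPIT:		retry of an RPC still in progress;
 *				discard it and send no reply
 *	case RC_REPLY:		cache hit; nd->nd_mreq already holds a
 *				copy of the saved reply, so just send it
 *	case RC_DOIT:		miss; perform the RPC, then:
 *		rp = nfsrvd_updatecache(nd);
 *		(send the reply)
 *		if (rp != NULL)
 *			nfsrvd_sentcache(rp, tcp_sequence_no);
 *	}
 *
 * where tcp_sequence_no stands for the TCP sequence number covering the
 * sent reply; see nfsrvd_sentcache() and nfsrc_trimcache() below.
 */
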
#ifndef APPLEKEXT
#include <fs/nfs/nfsport.h>

extern struct nfsstats newnfsstats;
extern struct mtx nfsrc_udpmtx;
extern struct nfsrchash_bucket nfsrchash_table[NFSRVCACHE_HASHSIZE];
extern struct nfsrchash_bucket nfsrcahash_table[NFSRVCACHE_HASHSIZE];
int nfsrc_floodlevel = NFSRVCACHE_FLOODLEVEL, nfsrc_tcpsavedreplies = 0;
#endif	/* !APPLEKEXT */

SYSCTL_DECL(_vfs_nfsd);

static u_int	nfsrc_tcphighwater = 0;
static int
sysctl_tcphighwater(SYSCTL_HANDLER_ARGS)
{
	int error, newhighwater;

	newhighwater = nfsrc_tcphighwater;
	error = sysctl_handle_int(oidp, &newhighwater, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (newhighwater < 0)
		return (EINVAL);
	if (newhighwater >= nfsrc_floodlevel)
		nfsrc_floodlevel = newhighwater + newhighwater / 5;
	nfsrc_tcphighwater = newhighwater;
	return (0);
}
SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater, CTLTYPE_UINT | CTLFLAG_RW, 0,
    sizeof(nfsrc_tcphighwater), sysctl_tcphighwater, "IU",
    "High water mark for TCP cache entries");
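
/*
 * Annotation (not part of the original source): with the arithmetic above,
 * raising the high water mark drags the flood level up to 120% of the new
 * value whenever the new mark reaches the old flood level.  For example,
 * if vfs.nfsd.tcphighwater is set to 10000 and the current flood level is
 * at or below that, then
 *
 *	nfsrc_floodlevel = 10000 + 10000 / 5 = 12000
 *
 * so NFSv4 Ops only see NFSERR_RESOURCE well past the point where
 * trimming kicks in.
 */
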

static u_int	nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
    &nfsrc_udphighwater, 0,
    "High water mark for UDP cache entries");
static u_int	nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
    &nfsrc_tcptimeout, 0,
    "Timeout for TCP entries in the DRC");
static u_int nfsrc_tcpnonidempotent = 1;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
    &nfsrc_tcpnonidempotent, 0,
    "Enable the DRC for NFS over TCP");

static int nfsrc_udpcachesize = 0;
static TAILQ_HEAD(, nfsrvcache) nfsrvudplru;
static struct nfsrvhashhead nfsrvudphashtbl[NFSRVCACHE_HASHSIZE];

/*
 * The reverse mapping from generic to NFS Version 2 procedure numbers.
 */
static int newnfsv2_procid[NFS_V3NPROCS] = {
	NFSV2PROC_NULL,
	NFSV2PROC_GETATTR,
	NFSV2PROC_SETATTR,
	NFSV2PROC_LOOKUP,
	NFSV2PROC_NOOP,
	NFSV2PROC_READLINK,
	NFSV2PROC_READ,
	NFSV2PROC_WRITE,
	NFSV2PROC_CREATE,
	NFSV2PROC_MKDIR,
	NFSV2PROC_SYMLINK,
	NFSV2PROC_CREATE,
	NFSV2PROC_REMOVE,
	NFSV2PROC_RMDIR,
	NFSV2PROC_RENAME,
	NFSV2PROC_LINK,
	NFSV2PROC_READDIR,
	NFSV2PROC_NOOP,
	NFSV2PROC_STATFS,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
};

#define	nfsrc_hash(xid)	(((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
#define	NFSRCUDPHASH(xid) \
	(&nfsrvudphashtbl[nfsrc_hash(xid)])
#define	NFSRCHASH(xid) \
	(&nfsrchash_table[nfsrc_hash(xid)].tbl)
#define	NFSRCAHASH(xid) (&nfsrcahash_table[nfsrc_hash(xid)])
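
/*
 * Annotation (not part of the original source): nfsrc_hash() folds the
 * high-order byte of the xid into the low-order bits before taking the
 * modulus, so xids that differ mainly in the high byte still spread
 * across buckets.  A hypothetical userland equivalent, for illustration
 * only:
 *
 *	static unsigned int
 *	example_hash(uint32_t xid, unsigned int hashsize)
 *	{
 *
 *		return ((xid + (xid >> 24)) % hashsize);
 *	}
 */
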
#define	TRUE	1
#define	FALSE	0
#define	NFSRVCACHE_CHECKLEN	100

/* True iff the rpc reply is an nfs status ONLY! */
static int nfsv2_repstat[NFS_V3NPROCS] = {
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	TRUE,
	TRUE,
	TRUE,
	TRUE,
	FALSE,
	TRUE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
};

/*
 * Will NFS want to work over IPv6 someday?
 */
#define	NETFAMILY(rp) \
		(((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)

/* local functions */
static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static void nfsrc_lock(struct nfsrvcache *rp);
static void nfsrc_unlock(struct nfsrvcache *rp);
static void nfsrc_wanted(struct nfsrvcache *rp);
static void nfsrc_freecache(struct nfsrvcache *rp);
static int nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum);
static void nfsrc_marksametcpconn(u_int64_t);

/*
 * Return the correct mutex for this cache entry.
 */
static __inline struct mtx *
nfsrc_cachemutex(struct nfsrvcache *rp)
{

	if ((rp->rc_flag & RC_UDP) != 0)
		return (&nfsrc_udpmtx);
	return (&nfsrchash_table[nfsrc_hash(rp->rc_xid)].mtx);
}
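
/*
 * Annotation (not part of the original source): a sketch of the typical
 * locking pattern around an entry, as used by nfsrvd_refcache() and
 * friends below:
 *
 *	mutex = nfsrc_cachemutex(rp);
 *	mtx_lock(mutex);
 *	(inspect or update rp->rc_flag, rp->rc_refcnt, ...)
 *	mtx_unlock(mutex);
 *
 * UDP entries all share the single nfsrc_udpmtx, while each TCP entry
 * uses the mutex of its xid hash bucket, so a given entry always resolves
 * to the same lock.
 */
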

/*
 * Initialize the server request cache list
 */
APPLESTATIC void
nfsrvd_initcache(void)
{
	int i;
	static int inited = 0;

	if (inited)
		return;
	inited = 1;
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_INIT(&nfsrvudphashtbl[i]);
		LIST_INIT(&nfsrchash_table[i].tbl);
		LIST_INIT(&nfsrcahash_table[i].tbl);
	}
	TAILQ_INIT(&nfsrvudplru);
	nfsrc_tcpsavedreplies = 0;
	nfsrc_udpcachesize = 0;
	newnfsstats.srvcache_tcppeak = 0;
	newnfsstats.srvcache_size = 0;
}

/*
 * Get a cache entry for this request. Basically just malloc a new one
 * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
 */
APPLESTATIC int
nfsrvd_getcache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *newrp;
	int ret;

	if (nd->nd_procnum == NFSPROC_NULL)
		panic("nfsd cache null");
	MALLOC(newrp, struct nfsrvcache *, sizeof (struct nfsrvcache),
	    M_NFSRVCACHE, M_WAITOK);
	NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
	if (nd->nd_flag & ND_NFSV4)
		newrp->rc_flag = RC_NFSV4;
	else if (nd->nd_flag & ND_NFSV3)
		newrp->rc_flag = RC_NFSV3;
	else
		newrp->rc_flag = RC_NFSV2;
	newrp->rc_xid = nd->nd_retxid;
	newrp->rc_proc = nd->nd_procnum;
	newrp->rc_sockref = nd->nd_sockref;
	newrp->rc_cachetime = nd->nd_tcpconntime;
	if (nd->nd_flag & ND_SAMETCPCONN)
		newrp->rc_flag |= RC_SAMETCPCONN;
	if (nd->nd_nam2 != NULL) {
		newrp->rc_flag |= RC_UDP;
		ret = nfsrc_getudp(nd, newrp);
	} else {
		ret = nfsrc_gettcp(nd, newrp);
	}
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * For UDP (v2, v3):
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 */
static int
nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp;
	struct sockaddr_in *saddr;
	struct sockaddr_in6 *saddr6;
	struct nfsrvhashhead *hp;
	int ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCUDPHASH(newrp->rc_xid);
loop:
	mtx_lock(mutex);
	LIST_FOREACH(rp, hp, rc_hash) {
	    if (newrp->rc_xid == rp->rc_xid &&
		newrp->rc_proc == rp->rc_proc &&
		(newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
			if ((rp->rc_flag & RC_LOCKED) != 0) {
				rp->rc_flag |= RC_WANTED;
				(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
				    "nfsrc", 10 * hz);
				goto loop;
			}
			if (rp->rc_flag == 0)
				panic("nfs udp cache0");
			rp->rc_flag |= RC_LOCKED;
			TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
			TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
			if (rp->rc_flag & RC_INPROG) {
				newnfsstats.srvcache_inproghits++;
				mtx_unlock(mutex);
				ret = RC_DROPIT;
			} else if (rp->rc_flag & RC_REPSTATUS) {
				/*
				 * V2 only.
				 */
				newnfsstats.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nfsrvd_rephead(nd);
				*(nd->nd_errp) = rp->rc_status;
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
					NFSRVCACHE_UDPTIMEOUT;
			} else if (rp->rc_flag & RC_REPMBUF) {
				newnfsstats.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nd->nd_mreq = m_copym(rp->rc_reply, 0,
					M_COPYALL, M_WAITOK);
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
					NFSRVCACHE_UDPTIMEOUT;
			} else {
				panic("nfs udp cache1");
			}
			nfsrc_unlock(rp);
			free((caddr_t)newrp, M_NFSRVCACHE);
			goto out;
		}
	}
	newnfsstats.srvcache_misses++;
	atomic_add_int(&newnfsstats.srvcache_size, 1);
	nfsrc_udpcachesize++;

	newrp->rc_flag |= RC_INPROG;
	saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
	if (saddr->sin_family == AF_INET)
		newrp->rc_inet = saddr->sin_addr.s_addr;
	else if (saddr->sin_family == AF_INET6) {
		saddr6 = (struct sockaddr_in6 *)saddr;
		NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
		    sizeof (struct in6_addr));
		newrp->rc_flag |= RC_INETIPV6;
	}
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	TAILQ_INSERT_TAIL(&nfsrvudplru, newrp, rc_lru);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Update a request cache entry after the RPC has been done.
 */
APPLESTATIC struct nfsrvcache *
nfsrvd_updatecache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *rp;
	struct nfsrvcache *retrp = NULL;
	mbuf_t m;
	struct mtx *mutex;

	rp = nd->nd_rp;
	if (!rp)
		panic("nfsrvd_updatecache null rp");
	nd->nd_rp = NULL;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	nfsrc_lock(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_updatecache not inprog");
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
	}

	/*
	 * Reply from cache is a special case returned by nfsrv_checkseqid().
	 */
	if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
		newnfsstats.srvcache_nonidemdonehits++;
		mtx_unlock(mutex);
		nd->nd_repstat = 0;
		if (nd->nd_mreq)
			mbuf_freem(nd->nd_mreq);
		if (!(rp->rc_flag & RC_REPMBUF))
			panic("reply from cache");
		nd->nd_mreq = m_copym(rp->rc_reply, 0,
		    M_COPYALL, M_WAITOK);
		rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		nfsrc_unlock(rp);
		goto out;
	}

	/*
	 * If rc_refcnt > 0, save it
	 * For UDP, save it if ND_SAVEREPLY is set
	 * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set
	 */
	if (nd->nd_repstat != NFSERR_DONTREPLY &&
	    (rp->rc_refcnt > 0 ||
	     ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
	     ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
	      nfsrc_tcpsavedreplies <= nfsrc_floodlevel &&
	      nfsrc_tcpnonidempotent))) {
		if (rp->rc_refcnt > 0) {
			if (!(rp->rc_flag & RC_NFSV4))
				panic("update_cache refcnt");
			rp->rc_flag |= RC_REFCNT;
		}
		if ((nd->nd_flag & ND_NFSV2) &&
		    nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
			rp->rc_status = nd->nd_repstat;
			rp->rc_flag |= RC_REPSTATUS;
			mtx_unlock(mutex);
		} else {
			if (!(rp->rc_flag & RC_UDP)) {
			    atomic_add_int(&nfsrc_tcpsavedreplies, 1);
			    if (nfsrc_tcpsavedreplies >
				newnfsstats.srvcache_tcppeak)
				newnfsstats.srvcache_tcppeak =
				    nfsrc_tcpsavedreplies;
			}
			mtx_unlock(mutex);
			m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
			mtx_lock(mutex);
			rp->rc_reply = m;
			rp->rc_flag |= RC_REPMBUF;
			mtx_unlock(mutex);
		}
		if (rp->rc_flag & RC_UDP) {
			rp->rc_timestamp = NFSD_MONOSEC +
			    NFSRVCACHE_UDPTIMEOUT;
			nfsrc_unlock(rp);
		} else {
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
			if (rp->rc_refcnt > 0)
				nfsrc_unlock(rp);
			else
				retrp = rp;
		}
	} else {
		nfsrc_freecache(rp);
		mtx_unlock(mutex);
	}

out:
	NFSEXITCODE2(0, nd);
	return (retrp);
}
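
/*
 * Annotation (not part of the original source): the save decision above
 * boils down to
 *
 *	save = nd_repstat != NFSERR_DONTREPLY &&
 *	    (rc_refcnt > 0 ||			(seqid-ordered NFSv4 Op)
 *	     (ND_SAVEREPLY && UDP) ||		(non-idempotent, UDP)
 *	     (ND_SAVEREPLY && TCP &&		(non-idempotent, TCP,
 *	      nfsrc_tcpsavedreplies <=		 under the flood level,
 *	      nfsrc_floodlevel &&		 with vfs.nfsd.cachetcp
 *	      nfsrc_tcpnonidempotent));		 enabled)
 *
 * Everything else is freed immediately, since a retried idempotent Op can
 * safely be redone.
 */
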

/*
 * Invalidate and, if possible, free an in-progress cache entry.
 * Must not sleep.
 */
APPLESTATIC void
nfsrvd_delcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_delcache not in prog");
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}

/*
 * Called after nfsrvd_updatecache() once the reply is sent, to update
 * the entry's sequence number and unlock it. The argument is
 * the pointer returned by nfsrvd_updatecache().
 */
APPLESTATIC void
nfsrvd_sentcache(struct nfsrvcache *rp, uint32_t seq)
{
	struct nfsrchash_bucket *hbp;

	KASSERT(rp->rc_flag & RC_LOCKED, ("nfsrvd_sentcache not locked"));
	hbp = NFSRCAHASH(rp->rc_sockref);
	mtx_lock(&hbp->mtx);
	rp->rc_tcpseq = seq;
	if (rp->rc_acked != RC_NO_ACK)
		LIST_INSERT_HEAD(&hbp->tbl, rp, rc_ahash);
	rp->rc_acked = RC_NO_ACK;
	mtx_unlock(&hbp->mtx);
	nfsrc_unlock(rp);
}
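
/*
 * Annotation (not part of the original source): rc_tcpseq records the TCP
 * sequence number that must be acknowledged before the cached reply can
 * be discarded.  The intended interplay with nfsrc_trimcache() below is,
 * roughly:
 *
 *	nfsrvd_sentcache(rp, seq);		after the reply has been
 *						sent on the socket
 *	...
 *	nfsrc_trimcache(sockref, snd_una, 0);	from the TCP upcall; sets
 *						rc_acked = RC_ACK once
 *						SEQ_GEQ(snd_una, rc_tcpseq)
 *
 * An entry whose reply is known to have reached the client (RC_ACK) then
 * becomes an immediate candidate for freeing in the trim loops.
 */
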

/*
 * Get a cache entry for TCP
 * - key on <xid, nfs version>
 *   (allow multiple entries for a given key)
 */
static int
nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp, *nextrp;
	int i;
	struct nfsrvcache *hitrp;
	struct nfsrvhashhead *hp, nfsrc_templist;
	int hit, ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCHASH(newrp->rc_xid);
	newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
tryagain:
	mtx_lock(mutex);
	hit = 1;
	LIST_INIT(&nfsrc_templist);
	/*
	 * Get all the matches and put them on the temp list.
	 */
	rp = LIST_FIRST(hp);
	while (rp != LIST_END(hp)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		if (newrp->rc_xid == rp->rc_xid &&
		    (!(rp->rc_flag & RC_INPROG) ||
		     ((newrp->rc_flag & RC_SAMETCPCONN) &&
		      newrp->rc_sockref == rp->rc_sockref)) &&
		    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		    newrp->rc_proc == rp->rc_proc &&
		    ((newrp->rc_flag & RC_NFSV4) &&
		     newrp->rc_sockref != rp->rc_sockref &&
		     newrp->rc_cachetime >= rp->rc_cachetime)
		    && newrp->rc_reqlen == rp->rc_reqlen &&
		    newrp->rc_cksum == rp->rc_cksum) {
			LIST_REMOVE(rp, rc_hash);
			LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
		}
		rp = nextrp;
	}

	/*
	 * Now, use nfsrc_templist to decide if there is a match.
	 */
	i = 0;
	LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
		i++;
		if (rp->rc_refcnt > 0) {
			hit = 0;
			break;
		}
	}
	/*
	 * Can be a hit only if one entry left.
	 * Note possible hit entry and put nfsrc_templist back on hash
	 * list.
	 */
	if (i != 1)
		hit = 0;
	hitrp = rp = LIST_FIRST(&nfsrc_templist);
	while (rp != LIST_END(&nfsrc_templist)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		LIST_REMOVE(rp, rc_hash);
		LIST_INSERT_HEAD(hp, rp, rc_hash);
		rp = nextrp;
	}
	if (LIST_FIRST(&nfsrc_templist) != LIST_END(&nfsrc_templist))
		panic("nfs gettcp cache templist");

	if (hit) {
		rp = hitrp;
		if ((rp->rc_flag & RC_LOCKED) != 0) {
			rp->rc_flag |= RC_WANTED;
			(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
			    "nfsrc", 10 * hz);
			goto tryagain;
		}
		if (rp->rc_flag == 0)
			panic("nfs tcp cache0");
		rp->rc_flag |= RC_LOCKED;
		if (rp->rc_flag & RC_INPROG) {
			newnfsstats.srvcache_inproghits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_DROPIT;
		} else if (rp->rc_flag & RC_REPSTATUS) {
			/*
			 * V2 only.
			 */
			newnfsstats.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nfsrvd_rephead(nd);
			*(nd->nd_errp) = rp->rc_status;
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else if (rp->rc_flag & RC_REPMBUF) {
			newnfsstats.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nd->nd_mreq = m_copym(rp->rc_reply, 0,
				M_COPYALL, M_WAITOK);
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else {
			panic("nfs tcp cache1");
		}
		nfsrc_unlock(rp);
		free((caddr_t)newrp, M_NFSRVCACHE);
		goto out;
	}
	newnfsstats.srvcache_misses++;
	atomic_add_int(&newnfsstats.srvcache_size, 1);

	/*
	 * For TCP, multiple entries for a key are allowed, so the new
	 * entry is chained in right away, marked RC_INPROG until
	 * nfsrvd_updatecache() fills in the reply.
	 */
	newrp->rc_cachetime = NFSD_MONOSEC;
	newrp->rc_flag |= RC_INPROG;
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Lock a cache entry.
 */
static void
nfsrc_lock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_assert(mutex, MA_OWNED);
	while ((rp->rc_flag & RC_LOCKED) != 0) {
		rp->rc_flag |= RC_WANTED;
		(void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0);
	}
	rp->rc_flag |= RC_LOCKED;
}

/*
 * Unlock a cache entry.
 */
static void
nfsrc_unlock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_LOCKED;
	nfsrc_wanted(rp);
	mtx_unlock(mutex);
}

/*
 * Wakeup anyone wanting entry.
 */
static void
nfsrc_wanted(struct nfsrvcache *rp)
{
	if (rp->rc_flag & RC_WANTED) {
		rp->rc_flag &= ~RC_WANTED;
		wakeup((caddr_t)rp);
	}
}
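
/*
 * Annotation (not part of the original source): together these routines
 * implement a small sleep lock layered over the bucket mutex.  A blocking
 * acquire, as open-coded in nfsrc_getudp() and nfsrc_gettcp(), looks
 * like:
 *
 *	if ((rp->rc_flag & RC_LOCKED) != 0) {
 *		rp->rc_flag |= RC_WANTED;
 *		(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP, "nfsrc",
 *		    10 * hz);
 *		goto tryagain;
 *	}
 *	rp->rc_flag |= RC_LOCKED;
 *
 * and nfsrc_unlock() clears RC_LOCKED, then wakes any sleeper via
 * nfsrc_wanted().
 */
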

/*
 * Free up the entry.
 * Must not sleep.
 */
static void
nfsrc_freecache(struct nfsrvcache *rp)
{
	struct nfsrchash_bucket *hbp;

	LIST_REMOVE(rp, rc_hash);
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		nfsrc_udpcachesize--;
	} else if (rp->rc_acked != RC_NO_SEQ) {
		hbp = NFSRCAHASH(rp->rc_sockref);
		mtx_lock(&hbp->mtx);
		if (rp->rc_acked == RC_NO_ACK)
			LIST_REMOVE(rp, rc_ahash);
		mtx_unlock(&hbp->mtx);
	}
	nfsrc_wanted(rp);
	if (rp->rc_flag & RC_REPMBUF) {
		mbuf_freem(rp->rc_reply);
		if (!(rp->rc_flag & RC_UDP))
			atomic_add_int(&nfsrc_tcpsavedreplies, -1);
	}
	FREE((caddr_t)rp, M_NFSRVCACHE);
	atomic_add_int(&newnfsstats.srvcache_size, -1);
}

/*
 * Clean out the cache. Called when the nfsserver module is unloaded.
 */
APPLESTATIC void
nfsrvd_cleancache(void)
{
	struct nfsrvcache *rp, *nextrp;
	int i;

	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		mtx_lock(&nfsrchash_table[i].mtx);
		LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash, nextrp)
			nfsrc_freecache(rp);
		mtx_unlock(&nfsrchash_table[i].mtx);
	}
	mtx_lock(&nfsrc_udpmtx);
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_FOREACH_SAFE(rp, &nfsrvudphashtbl[i], rc_hash, nextrp) {
			nfsrc_freecache(rp);
		}
	}
	newnfsstats.srvcache_size = 0;
	mtx_unlock(&nfsrc_udpmtx);
	nfsrc_tcpsavedreplies = 0;
}

#define HISTSIZE	16
/*
 * The basic rule is to get rid of entries that are expired.
 */
void
nfsrc_trimcache(u_int64_t sockref, uint32_t snd_una, int final)
{
	struct nfsrchash_bucket *hbp;
	struct nfsrvcache *rp, *nextrp;
	int force, lastslot, i, j, k, tto, time_histo[HISTSIZE];
	time_t thisstamp;
	static time_t udp_lasttrim = 0, tcp_lasttrim = 0;
	static int onethread = 0, oneslot = 0;

	if (sockref != 0) {
		hbp = NFSRCAHASH(sockref);
		mtx_lock(&hbp->mtx);
		LIST_FOREACH_SAFE(rp, &hbp->tbl, rc_ahash, nextrp) {
			if (sockref == rp->rc_sockref) {
				if (SEQ_GEQ(snd_una, rp->rc_tcpseq)) {
					rp->rc_acked = RC_ACK;
					LIST_REMOVE(rp, rc_ahash);
				} else if (final) {
					rp->rc_acked = RC_NACK;
					LIST_REMOVE(rp, rc_ahash);
				}
			}
		}
		mtx_unlock(&hbp->mtx);
	}

	if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0)
		return;
	if (NFSD_MONOSEC != udp_lasttrim ||
	    nfsrc_udpcachesize >= (nfsrc_udphighwater +
	    nfsrc_udphighwater / 2)) {
		mtx_lock(&nfsrc_udpmtx);
		udp_lasttrim = NFSD_MONOSEC;
		TAILQ_FOREACH_SAFE(rp, &nfsrvudplru, rc_lru, nextrp) {
			if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
			     && rp->rc_refcnt == 0
			     && ((rp->rc_flag & RC_REFCNT) ||
				 udp_lasttrim > rp->rc_timestamp ||
				 nfsrc_udpcachesize > nfsrc_udphighwater))
				nfsrc_freecache(rp);
		}
		mtx_unlock(&nfsrc_udpmtx);
	}
	if (NFSD_MONOSEC != tcp_lasttrim ||
	    nfsrc_tcpsavedreplies >= nfsrc_tcphighwater) {
		force = nfsrc_tcphighwater / 4;
		if (force > 0 &&
		    nfsrc_tcpsavedreplies + force >= nfsrc_tcphighwater) {
			for (i = 0; i < HISTSIZE; i++)
				time_histo[i] = 0;
			i = 0;
			lastslot = NFSRVCACHE_HASHSIZE - 1;
		} else {
			force = 0;
			if (NFSD_MONOSEC != tcp_lasttrim) {
				i = 0;
				lastslot = NFSRVCACHE_HASHSIZE - 1;
			} else {
				lastslot = i = oneslot;
				if (++oneslot >= NFSRVCACHE_HASHSIZE)
					oneslot = 0;
			}
		}
		tto = nfsrc_tcptimeout;
		tcp_lasttrim = NFSD_MONOSEC;
		for (; i <= lastslot; i++) {
			mtx_lock(&nfsrchash_table[i].mtx);
			LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash,
			    nextrp) {
				if (!(rp->rc_flag &
				     (RC_INPROG|RC_LOCKED|RC_WANTED))
				     && rp->rc_refcnt == 0) {
					if ((rp->rc_flag & RC_REFCNT) ||
					    tcp_lasttrim > rp->rc_timestamp ||
					    rp->rc_acked == RC_ACK) {
						nfsrc_freecache(rp);
						continue;
					}

					if (force == 0)
						continue;
					/*
					 * The timestamps range from roughly the
					 * present (tcp_lasttrim) to the present
					 * + nfsrc_tcptimeout. Generate a simple
					 * histogram of where the timeouts fall.
					 */
					j = rp->rc_timestamp - tcp_lasttrim;
					if (j >= tto)
						j = HISTSIZE - 1;
					else if (j < 0)
						j = 0;
					else
						j = j * HISTSIZE / tto;
					time_histo[j]++;
				}
			}
			mtx_unlock(&nfsrchash_table[i].mtx);
		}
		if (force) {
			/*
			 * Trim some more with a smaller timeout of as little
			 * as 20% of nfsrc_tcptimeout to try and get below
			 * 80% of the nfsrc_tcphighwater.
			 */
			k = 0;
			for (i = 0; i < (HISTSIZE - 2); i++) {
				k += time_histo[i];
				if (k > force)
					break;
			}
			k = tto * (i + 1) / HISTSIZE;
			if (k < 1)
				k = 1;
			thisstamp = tcp_lasttrim + k;
			for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
				mtx_lock(&nfsrchash_table[i].mtx);
				LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl,
				    rc_hash, nextrp) {
					if (!(rp->rc_flag &
					     (RC_INPROG|RC_LOCKED|RC_WANTED))
					     && rp->rc_refcnt == 0
					     && ((rp->rc_flag & RC_REFCNT) ||
						 thisstamp > rp->rc_timestamp ||
						 rp->rc_acked == RC_ACK))
						nfsrc_freecache(rp);
				}
				mtx_unlock(&nfsrchash_table[i].mtx);
			}
		}
	}
	atomic_store_rel_int(&onethread, 0);
}
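
/*
 * Annotation (not part of the original source): a worked example of the
 * forced trim, using hypothetical values nfsrc_tcphighwater = 10000 and
 * tto = 3600 seconds.  Then force = 10000 / 4 = 2500 and each surviving
 * entry falls into the histogram slot
 *
 *	j = (rc_timestamp - tcp_lasttrim) * HISTSIZE / tto
 *
 * i.e. 3600 / 16 = 225-second-wide buckets.  If the first three slots
 * hold 1200, 900 and 700 entries, the scan stops at i = 2 (since
 * 1200 + 900 + 700 > 2500), giving the cutoff
 *
 *	thisstamp = tcp_lasttrim + tto * 3 / HISTSIZE = tcp_lasttrim + 675
 *
 * so roughly the 2800 soonest-to-expire saved replies are freed, pulling
 * the count back under the high water mark.
 */
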

/*
 * Add a seqid# reference to the cache entry.
 */
APPLESTATIC void
nfsrvd_refcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt < 0)
		panic("nfs cache refcnt");
	rp->rc_refcnt++;
	mtx_unlock(mutex);
}

/*
 * Dereference a seqid# cache entry.
 */
APPLESTATIC void
nfsrvd_derefcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt <= 0)
		panic("nfs cache derefcnt");
	rp->rc_refcnt--;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}

/*
 * Calculate the length of the mbuf list and a checksum on the first up to
 * NFSRVCACHE_CHECKLEN bytes.
 */
static int
nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum)
{
	int len = 0, cklen;
	mbuf_t m;

	m = m1;
	while (m) {
		len += mbuf_len(m);
		m = mbuf_next(m);
	}
	cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
	*cksum = in_cksum(m1, cklen);
	return (len);
}
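
/*
 * Annotation (not part of the original source): since only the first
 * NFSRVCACHE_CHECKLEN (100) bytes are checksummed, two distinct requests
 * longer than that can share a checksum.  The TCP match in nfsrc_gettcp()
 * therefore also compares the full XDR length (rc_reqlen) and the RPC#,
 * keeping the false-hit probability low, as the design notes at the top
 * of this file require.
 */
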

/*
 * Mark a TCP connection that is seeing retries. Should never happen for
 * NFSv4.
 */
static void
nfsrc_marksametcpconn(u_int64_t sockref)
{
}