xref: /freebsd/sys/fs/nfsserver/nfs_nfsdcache.c (revision 162ae9c834f6d9f9cb443bd62cceb23e0b5fef48)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1989, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * Rick Macklem at The University of Guelph.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  */
35 
36 #include <sys/cdefs.h>
37 __FBSDID("$FreeBSD$");
38 
39 /*
40  * Here is the basic algorithm:
41  * First, some design criteria I used:
42  * - I think a false hit is more serious than a false miss
43  * - A false hit for an RPC that has Op(s) that order via seqid# must be
44  *   avoided at all cost
45  * - A valid hit will probably happen a long time after the original reply
46  *   and the TCP socket that the original request was received on will no
47  *   longer be active
48  *   (The long time delay implies to me that LRU is not appropriate.)
49  * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
50  *   in them as well as minimizing the risk of redoing retried non-idempotent
51  *   Ops.
52  * Because it is biased towards avoiding false hits, multiple entries with
53  * the same xid are to be expected, especially for the case of the entry
54  * in the cache being related to a seqid# sequenced Op.
55  *
56  * The basic algorithm I'm about to code up:
57  * - Null RPCs bypass the cache and are just done
58  * For TCP
59  * 	- key on <xid, NFS version> (as noted above, there can be several
60  * 				     entries with the same key)
61  * 	When a request arrives:
62  * 		For all that match key
63  * 		- if RPC# != OR request_size !=
64  * 			- not a match with this one
65  * 		- if NFSv4 and received on same TCP socket OR
66  *			received on a TCP connection created before the
67  *			entry was cached
68  * 			- not a match with this one
69  * 			(V2,3 clients might retry on same TCP socket)
70  * 		- calculate checksum on first N bytes of NFS XDR
71  * 		- if checksum !=
72  * 			- not a match for this one
73  * 		If any of the remaining ones that match has a
74  * 			seqid_refcnt > 0
75  * 			- not a match (go do RPC, using new cache entry)
76  * 		If one match left
77  * 			- a hit (reply from cache)
78  * 		else
79  * 			- miss (go do RPC, using new cache entry)
80  *
81  * 	During processing of NFSv4 request:
82  * 		- set a flag when a non-idempotent Op is processed
83  * 		- when an Op that uses a seqid# (Open,...) is processed
84  * 			- if same seqid# as referenced entry in cache
85  * 				- free new cache entry
86  * 				- reply from referenced cache entry
87  * 			  else if next seqid# in order
88  * 				- free referenced cache entry
89  * 				- increment seqid_refcnt on new cache entry
90  * 				- set pointer from Openowner/Lockowner to
91  * 					new cache entry (aka reference it)
92  * 			  else if first seqid# in sequence
93  * 				- increment seqid_refcnt on new cache entry
94  * 				- set pointer from Openowner/Lockowner to
95  * 					new cache entry (aka reference it)
96  *
97  * 	At end of RPC processing:
98  * 		- if seqid_refcnt > 0 OR flagged non-idempotent on new
99  * 			cache entry
100  * 			- save reply in cache entry
101  * 			- calculate checksum on first N bytes of NFS XDR
102  * 				request
103  * 			- note op and length of XDR request (in bytes)
104  * 			- timestamp it
105  * 		  else
106  * 			- free new cache entry
107  * 		- Send reply (noting info for socket activity check, below)
108  *
109  * 	For cache entries saved above:
110  * 		- if saved since seqid_refcnt was > 0
111  * 			- free when seqid_refcnt decrements to 0
112  * 			  (when next one in sequence is processed above, or
113  * 			   when Openowner/Lockowner is discarded)
114  * 		  else { non-idempotent Op(s) }
115  * 			- free when
116  * 				- some further activity observed on same
117  * 					socket
118  * 				  (I'm not yet sure how I'm going to do
119  * 				   this. Maybe look at the TCP connection
120  * 				   to see if the send_tcp_sequence# is well
121  * 				   past sent reply OR K additional RPCs
122  * 				   replied on same socket OR?)
123  * 			  OR
124  * 				- when very old (hours, days, weeks?)
125  *
126  * For UDP (v2, 3 only), pretty much the old way:
127  * - key on <xid, NFS version, RPC#, Client host ip#>
128  *   (at most one entry for each key)
129  *
130  * When a Request arrives:
131  * - if a match with entry via key
132  * 	- if RPC marked In_progress
133  * 		- discard request (don't send reply)
134  * 	  else
135  * 		- reply from cache
136  * 		- timestamp cache entry
137  *   else
138  * 	- add entry to cache, marked In_progress
139  * 	- do RPC
140  * 	- when RPC done
141  * 		- if RPC# non-idempotent
142  * 			- mark entry Done (not In_progress)
143  * 			- save reply
144  * 			- timestamp cache entry
145  * 		  else
146  * 			- free cache entry
147  * 		- send reply
148  *
 * Later, entries with saved replies are freed a short time (a few minutes)
 * after the reply was sent, based on the entry's timestamp.
 * Reference for the UDP case: Chet Juszczak, "Improving the Performance
 *		and Correctness of an NFS Server", in Proc. Winter 1989
 *		USENIX Conference, pages 53-63. San Diego, February 1989.
155  * nfsrc_floodlevel is set to the allowable upper limit for saved replies
156  *	for TCP. For V3, a reply won't be saved when the flood level is
157  *	hit. For V4, the non-idempotent Op will return NFSERR_RESOURCE in
158  *	that case. This level should be set high enough that this almost
159  *	never happens.
160  */
161 #ifndef APPLEKEXT
#include <fs/nfs/nfsport.h>
#include <sys/limits.h>
163 
/* Server statistics; this file updates the srvcache_* counters. */
extern struct nfsstatsv1 nfsstatsv1;
/* Mutex returned by nfsrc_cachemutex() for every RC_UDP cache entry. */
extern struct mtx nfsrc_udpmtx;
/* TCP cache hash table, indexed by nfsrc_hash(xid); each bucket has a mutex. */
extern struct nfsrchash_bucket nfsrchash_table[NFSRVCACHE_HASHSIZE];
/*
 * Hash table, indexed by nfsrc_hash(sockref), of TCP entries whose reply
 * has been sent with a known TCP sequence number and is not yet acked.
 */
extern struct nfsrchash_bucket nfsrcahash_table[NFSRVCACHE_HASHSIZE];
/*
 * nfsrc_floodlevel: upper limit on saved TCP replies (see the notes at the
 * top of the file).  nfsrc_tcpsavedreplies: current count of saved TCP
 * reply mbuf chains.
 */
int nfsrc_floodlevel = NFSRVCACHE_FLOODLEVEL, nfsrc_tcpsavedreplies = 0;
169 #endif	/* !APPLEKEXT */
170 
171 SYSCTL_DECL(_vfs_nfsd);
172 
173 static u_int	nfsrc_tcphighwater = 0;
174 static int
175 sysctl_tcphighwater(SYSCTL_HANDLER_ARGS)
176 {
177 	int error, newhighwater;
178 
179 	newhighwater = nfsrc_tcphighwater;
180 	error = sysctl_handle_int(oidp, &newhighwater, 0, req);
181 	if (error != 0 || req->newptr == NULL)
182 		return (error);
183 	if (newhighwater < 0)
184 		return (EINVAL);
185 	if (newhighwater >= nfsrc_floodlevel)
186 		nfsrc_floodlevel = newhighwater + newhighwater / 5;
187 	nfsrc_tcphighwater = newhighwater;
188 	return (0);
189 }
190 SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater,
191     CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(nfsrc_tcphighwater),
192     sysctl_tcphighwater, "IU", "High water mark for TCP cache entries");
193 
/*
 * UDP high water mark: nfsrc_trimcache() frees idle UDP entries while the
 * UDP cache size exceeds this value.
 */
static u_int	nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
    &nfsrc_udphighwater, 0,
    "High water mark for UDP cache entries");
/* Added to NFSD_MONOSEC to form the expiry timestamp of TCP entries. */
static u_int	nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
    &nfsrc_tcptimeout, 0,
    "Timeout for TCP entries in the DRC");
/*
 * When non-zero, replies flagged ND_SAVEREPLY are saved for TCP as well
 * (see nfsrvd_updatecache()).
 */
static u_int nfsrc_tcpnonidempotent = 1;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
    &nfsrc_tcpnonidempotent, 0,
    "Enable the DRC for NFS over TCP");

/* Number of entries currently in the UDP cache. */
static int nfsrc_udpcachesize = 0;
/* UDP entries in LRU order; entries are moved to the tail when touched. */
static TAILQ_HEAD(, nfsrvcache) nfsrvudplru;
/* UDP cache hash table, indexed by nfsrc_hash(xid). */
static struct nfsrvhashhead nfsrvudphashtbl[NFSRVCACHE_HASHSIZE];
210 
211 /*
212  * and the reverse mapping from generic to Version 2 procedure numbers
213  */
214 static int newnfsv2_procid[NFS_V3NPROCS] = {
215 	NFSV2PROC_NULL,
216 	NFSV2PROC_GETATTR,
217 	NFSV2PROC_SETATTR,
218 	NFSV2PROC_LOOKUP,
219 	NFSV2PROC_NOOP,
220 	NFSV2PROC_READLINK,
221 	NFSV2PROC_READ,
222 	NFSV2PROC_WRITE,
223 	NFSV2PROC_CREATE,
224 	NFSV2PROC_MKDIR,
225 	NFSV2PROC_SYMLINK,
226 	NFSV2PROC_CREATE,
227 	NFSV2PROC_REMOVE,
228 	NFSV2PROC_RMDIR,
229 	NFSV2PROC_RENAME,
230 	NFSV2PROC_LINK,
231 	NFSV2PROC_READDIR,
232 	NFSV2PROC_NOOP,
233 	NFSV2PROC_STATFS,
234 	NFSV2PROC_NOOP,
235 	NFSV2PROC_NOOP,
236 	NFSV2PROC_NOOP,
237 };
238 
/*
 * Bucket index for a key: the key plus itself shifted right by 24 bits,
 * modulo the table size.  Used with both xids and socket references.
 */
#define	nfsrc_hash(xid)	(((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
/* Hash chain of the UDP cache for the given xid. */
#define	NFSRCUDPHASH(xid) \
	(&nfsrvudphashtbl[nfsrc_hash(xid)])
/* Hash chain of the TCP cache for the given xid. */
#define	NFSRCHASH(xid) \
	(&nfsrchash_table[nfsrc_hash(xid)].tbl)
/* Bucket (mutex + chain) of the ack hash table; callers pass rc_sockref. */
#define	NFSRCAHASH(xid) (&nfsrcahash_table[nfsrc_hash(xid)])
#define	TRUE	1
#define	FALSE	0
/* Maximum number of leading request bytes that are checksummed. */
#define	NFSRVCACHE_CHECKLEN	100
248 
249 /* True iff the rpc reply is an nfs status ONLY! */
250 static int nfsv2_repstat[NFS_V3NPROCS] = {
251 	FALSE,
252 	FALSE,
253 	FALSE,
254 	FALSE,
255 	FALSE,
256 	FALSE,
257 	FALSE,
258 	FALSE,
259 	FALSE,
260 	FALSE,
261 	TRUE,
262 	TRUE,
263 	TRUE,
264 	TRUE,
265 	FALSE,
266 	TRUE,
267 	FALSE,
268 	FALSE,
269 	FALSE,
270 	FALSE,
271 	FALSE,
272 	FALSE,
273 };
274 
/*
 * Address family of the client address stored in a cache entry:
 * AF_INET6 when the entry is flagged RC_INETIPV6, AF_INET otherwise.
 */
#define	NETFAMILY(rp) \
		(((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)
280 
/*
 * Prototypes for the functions local to this file.
 * nfsrc_lock()/nfsrc_unlock() implement a sleepable per-entry lock with the
 * RC_LOCKED and RC_WANTED flag bits.
 */
static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static void nfsrc_lock(struct nfsrvcache *rp);
static void nfsrc_unlock(struct nfsrvcache *rp);
static void nfsrc_wanted(struct nfsrvcache *rp);
static void nfsrc_freecache(struct nfsrvcache *rp);
static int nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum);
static void nfsrc_marksametcpconn(u_int64_t);
290 
291 /*
292  * Return the correct mutex for this cache entry.
293  */
294 static __inline struct mtx *
295 nfsrc_cachemutex(struct nfsrvcache *rp)
296 {
297 
298 	if ((rp->rc_flag & RC_UDP) != 0)
299 		return (&nfsrc_udpmtx);
300 	return (&nfsrchash_table[nfsrc_hash(rp->rc_xid)].mtx);
301 }
302 
303 /*
304  * Initialize the server request cache list
305  */
306 APPLESTATIC void
307 nfsrvd_initcache(void)
308 {
309 	int i;
310 	static int inited = 0;
311 
312 	if (inited)
313 		return;
314 	inited = 1;
315 	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
316 		LIST_INIT(&nfsrvudphashtbl[i]);
317 		LIST_INIT(&nfsrchash_table[i].tbl);
318 		LIST_INIT(&nfsrcahash_table[i].tbl);
319 	}
320 	TAILQ_INIT(&nfsrvudplru);
321 	nfsrc_tcpsavedreplies = 0;
322 	nfsrc_udpcachesize = 0;
323 	nfsstatsv1.srvcache_tcppeak = 0;
324 	nfsstatsv1.srvcache_size = 0;
325 }
326 
327 /*
328  * Get a cache entry for this request. Basically just malloc a new one
329  * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
330  */
331 APPLESTATIC int
332 nfsrvd_getcache(struct nfsrv_descript *nd)
333 {
334 	struct nfsrvcache *newrp;
335 	int ret;
336 
337 	if (nd->nd_procnum == NFSPROC_NULL)
338 		panic("nfsd cache null");
339 	newrp = malloc(sizeof (struct nfsrvcache),
340 	    M_NFSRVCACHE, M_WAITOK);
341 	NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
342 	if (nd->nd_flag & ND_NFSV4)
343 		newrp->rc_flag = RC_NFSV4;
344 	else if (nd->nd_flag & ND_NFSV3)
345 		newrp->rc_flag = RC_NFSV3;
346 	else
347 		newrp->rc_flag = RC_NFSV2;
348 	newrp->rc_xid = nd->nd_retxid;
349 	newrp->rc_proc = nd->nd_procnum;
350 	newrp->rc_sockref = nd->nd_sockref;
351 	newrp->rc_cachetime = nd->nd_tcpconntime;
352 	if (nd->nd_flag & ND_SAMETCPCONN)
353 		newrp->rc_flag |= RC_SAMETCPCONN;
354 	if (nd->nd_nam2 != NULL) {
355 		newrp->rc_flag |= RC_UDP;
356 		ret = nfsrc_getudp(nd, newrp);
357 	} else {
358 		ret = nfsrc_gettcp(nd, newrp);
359 	}
360 	NFSEXITCODE2(0, nd);
361 	return (ret);
362 }
363 
/*
 * For UDP (v2, v3):
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 *
 * Searches the UDP hash chain for an entry matching newrp/nd.
 * On a match, newrp is freed and the return value is
 * - RC_DROPIT if the matching request is still in progress, or
 * - RC_REPLY after the cached status or a copy of the cached reply has
 *   been placed in nd.
 * Without a match, newrp is marked RC_INPROG, entered into the hash chain
 * and the LRU list, stored in nd->nd_rp, and RC_DOIT is returned.
 */
static int
nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp;
	struct sockaddr_in *saddr;
	struct sockaddr_in6 *saddr6;
	struct nfsrvhashhead *hp;
	int ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCUDPHASH(newrp->rc_xid);
loop:
	mtx_lock(mutex);
	LIST_FOREACH(rp, hp, rc_hash) {
	    if (newrp->rc_xid == rp->rc_xid &&
		newrp->rc_proc == rp->rc_proc &&
		(newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
			if ((rp->rc_flag & RC_LOCKED) != 0) {
				/*
				 * Someone else holds the entry.  Sleep until
				 * woken or for 10 * hz ticks; PDROP means
				 * the mutex is not held on return, so the
				 * search restarts by locking it again.
				 */
				rp->rc_flag |= RC_WANTED;
				(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
				    "nfsrc", 10 * hz);
				goto loop;
			}
			if (rp->rc_flag == 0)
				panic("nfs udp cache0");
			/* Take the entry lock and mark it most recently used. */
			rp->rc_flag |= RC_LOCKED;
			TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
			TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
			if (rp->rc_flag & RC_INPROG) {
				/* Original still being processed: drop retry. */
				nfsstatsv1.srvcache_inproghits++;
				mtx_unlock(mutex);
				ret = RC_DROPIT;
			} else if (rp->rc_flag & RC_REPSTATUS) {
				/*
				 * V2 only.
				 * Only the status was cached; rebuild the
				 * reply header and store the status in it.
				 */
				nfsstatsv1.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nfsrvd_rephead(nd);
				*(nd->nd_errp) = rp->rc_status;
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
					NFSRVCACHE_UDPTIMEOUT;
			} else if (rp->rc_flag & RC_REPMBUF) {
				/* Reply with a copy of the cached mbuf chain. */
				nfsstatsv1.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nd->nd_mreq = m_copym(rp->rc_reply, 0,
					M_COPYALL, M_WAITOK);
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
					NFSRVCACHE_UDPTIMEOUT;
			} else {
				panic("nfs udp cache1");
			}
			/* The mutex was dropped above; release the entry lock. */
			nfsrc_unlock(rp);
			free(newrp, M_NFSRVCACHE);
			goto out;
		}
	}
	/* Miss: account for the new entry while still holding the mutex. */
	nfsstatsv1.srvcache_misses++;
	atomic_add_int(&nfsstatsv1.srvcache_size, 1);
	nfsrc_udpcachesize++;

	newrp->rc_flag |= RC_INPROG;
	/* Record the client address; only AF_INET and AF_INET6 are stored. */
	saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
	if (saddr->sin_family == AF_INET)
		newrp->rc_inet = saddr->sin_addr.s_addr;
	else if (saddr->sin_family == AF_INET6) {
		saddr6 = (struct sockaddr_in6 *)saddr;
		NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
		    sizeof (struct in6_addr));
		newrp->rc_flag |= RC_INETIPV6;
	}
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	TAILQ_INSERT_TAIL(&nfsrvudplru, newrp, rc_lru);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}
454 
/*
 * Update a request cache entry after the rpc has been done
 *
 * Takes the in-progress entry from nd->nd_rp (clearing that pointer),
 * locks it and clears RC_INPROG.  Then either
 * - replaces the reply in nd with a copy of the cached reply
 *   (nd_repstat == NFSERR_REPLYFROMCACHE),
 * - saves the reply status or a copy of the reply in the entry, or
 * - frees the entry.
 * Returns a non-NULL entry pointer only for a TCP entry whose reply was
 * saved and whose rc_refcnt is 0; that entry is returned still locked
 * (RC_LOCKED).  In all other cases NULL is returned.
 */
APPLESTATIC struct nfsrvcache *
nfsrvd_updatecache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *rp;
	struct nfsrvcache *retrp = NULL;
	mbuf_t m;
	struct mtx *mutex;

	rp = nd->nd_rp;
	if (!rp)
		panic("nfsrvd_updatecache null rp");
	nd->nd_rp = NULL;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	nfsrc_lock(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_updatecache not inprog");
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_flag & RC_UDP) {
		/* Move the entry to the tail of the UDP LRU list. */
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
	}

	/*
	 * Reply from cache is a special case returned by nfsrv_checkseqid().
	 * Any reply built so far is discarded and replaced by a copy of the
	 * reply saved in this entry.
	 */
	if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
		nfsstatsv1.srvcache_nonidemdonehits++;
		mtx_unlock(mutex);
		nd->nd_repstat = 0;
		if (nd->nd_mreq)
			mbuf_freem(nd->nd_mreq);
		if (!(rp->rc_flag & RC_REPMBUF))
			panic("reply from cache");
		nd->nd_mreq = m_copym(rp->rc_reply, 0,
		    M_COPYALL, M_WAITOK);
		rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		nfsrc_unlock(rp);
		goto out;
	}

	/*
	 * If rc_refcnt > 0, save it
	 * For UDP, save it if ND_SAVEREPLY is set
	 * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set
	 * (and the number of saved TCP replies does not exceed
	 * nfsrc_floodlevel).
	 * Nothing is saved when the status is NFSERR_DONTREPLY.
	 */
	if (nd->nd_repstat != NFSERR_DONTREPLY &&
	    (rp->rc_refcnt > 0 ||
	     ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
	     ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
	      nfsrc_tcpsavedreplies <= nfsrc_floodlevel &&
	      nfsrc_tcpnonidempotent))) {
		if (rp->rc_refcnt > 0) {
			/* Only NFSv4 entries may carry seqid references. */
			if (!(rp->rc_flag & RC_NFSV4))
				panic("update_cache refcnt");
			rp->rc_flag |= RC_REFCNT;
		}
		if ((nd->nd_flag & ND_NFSV2) &&
		    nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
			/* V2 status-only reply: caching the status suffices. */
			rp->rc_status = nd->nd_repstat;
			rp->rc_flag |= RC_REPSTATUS;
			mtx_unlock(mutex);
		} else {
			if (!(rp->rc_flag & RC_UDP)) {
			    /* Count the saved TCP reply and track the peak. */
			    atomic_add_int(&nfsrc_tcpsavedreplies, 1);
			    if (nfsrc_tcpsavedreplies >
				nfsstatsv1.srvcache_tcppeak)
				nfsstatsv1.srvcache_tcppeak =
				    nfsrc_tcpsavedreplies;
			}
			/*
			 * The M_WAITOK copy is made with the mutex released;
			 * the entry itself stays RC_LOCKED meanwhile.
			 */
			mtx_unlock(mutex);
			m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
			mtx_lock(mutex);
			rp->rc_reply = m;
			rp->rc_flag |= RC_REPMBUF;
			mtx_unlock(mutex);
		}
		if (rp->rc_flag & RC_UDP) {
			rp->rc_timestamp = NFSD_MONOSEC +
			    NFSRVCACHE_UDPTIMEOUT;
			nfsrc_unlock(rp);
		} else {
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
			/*
			 * Unreferenced TCP entries are handed back locked to
			 * the caller; referenced ones are unlocked here.
			 */
			if (rp->rc_refcnt > 0)
				nfsrc_unlock(rp);
			else
				retrp = rp;
		}
	} else {
		/* Nothing worth saving: discard the entry. */
		nfsrc_freecache(rp);
		mtx_unlock(mutex);
	}

out:
	NFSEXITCODE2(0, nd);
	return (retrp);
}
555 
556 /*
557  * Invalidate and, if possible, free an in prog cache entry.
558  * Must not sleep.
559  */
560 APPLESTATIC void
561 nfsrvd_delcache(struct nfsrvcache *rp)
562 {
563 	struct mtx *mutex;
564 
565 	mutex = nfsrc_cachemutex(rp);
566 	if (!(rp->rc_flag & RC_INPROG))
567 		panic("nfsrvd_delcache not in prog");
568 	mtx_lock(mutex);
569 	rp->rc_flag &= ~RC_INPROG;
570 	if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
571 		nfsrc_freecache(rp);
572 	mtx_unlock(mutex);
573 }
574 
575 /*
576  * Called after nfsrvd_updatecache() once the reply is sent, to update
577  * the entry's sequence number and unlock it. The argument is
578  * the pointer returned by nfsrvd_updatecache().
579  */
580 APPLESTATIC void
581 nfsrvd_sentcache(struct nfsrvcache *rp, int have_seq, uint32_t seq)
582 {
583 	struct nfsrchash_bucket *hbp;
584 
585 	KASSERT(rp->rc_flag & RC_LOCKED, ("nfsrvd_sentcache not locked"));
586 	if (have_seq) {
587 		hbp = NFSRCAHASH(rp->rc_sockref);
588 		mtx_lock(&hbp->mtx);
589 		rp->rc_tcpseq = seq;
590 		if (rp->rc_acked != RC_NO_ACK)
591 			LIST_INSERT_HEAD(&hbp->tbl, rp, rc_ahash);
592 		rp->rc_acked = RC_NO_ACK;
593 		mtx_unlock(&hbp->mtx);
594 	}
595 	nfsrc_unlock(rp);
596 }
597 
/*
 * Get a cache entry for TCP
 * - key on <xid, nfs version>
 *   (allow multiple entries for a given key)
 *
 * Collects every entry on the xid's hash chain that matches newrp and
 * treats the request as a hit only when exactly one entry matched and
 * that entry has no seqid references (rc_refcnt == 0).
 * On a hit, newrp is freed and RC_DROPIT or RC_REPLY is returned, as for
 * UDP.  Otherwise newrp is marked RC_INPROG, inserted into the hash chain,
 * stored in nd->nd_rp, and RC_DOIT is returned.
 *
 * NOTE(review): as written, the match test requires RC_NFSV4 on the new
 * request and a different rc_sockref, so the "V2 only" RC_REPSTATUS branch
 * and the same-socket nfsrc_marksametcpconn() calls below look unreachable.
 * The design notes at the top of the file suggest the socket/time test was
 * meant to apply to NFSv4 requests only -- confirm intent before changing.
 */
static int
nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp, *nextrp;
	int i;
	struct nfsrvcache *hitrp;
	struct nfsrvhashhead *hp, nfsrc_templist;
	int hit, ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCHASH(newrp->rc_xid);
	/* Request length and checksum are part of the match criteria. */
	newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
tryagain:
	mtx_lock(mutex);
	hit = 1;
	LIST_INIT(&nfsrc_templist);
	/*
	 * Get all the matches and put them on the temp list.
	 */
	rp = LIST_FIRST(hp);
	while (rp != LIST_END(hp)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		if (newrp->rc_xid == rp->rc_xid &&
		    (!(rp->rc_flag & RC_INPROG) ||
		     ((newrp->rc_flag & RC_SAMETCPCONN) &&
		      newrp->rc_sockref == rp->rc_sockref)) &&
		    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		    newrp->rc_proc == rp->rc_proc &&
		    ((newrp->rc_flag & RC_NFSV4) &&
		     newrp->rc_sockref != rp->rc_sockref &&
		     newrp->rc_cachetime >= rp->rc_cachetime)
		    && newrp->rc_reqlen == rp->rc_reqlen &&
		    newrp->rc_cksum == rp->rc_cksum) {
			LIST_REMOVE(rp, rc_hash);
			LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
		}
		rp = nextrp;
	}

	/*
	 * Now, use nfsrc_templist to decide if there is a match.
	 * Any candidate with seqid references rules out a hit.
	 */
	i = 0;
	LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
		i++;
		if (rp->rc_refcnt > 0) {
			hit = 0;
			break;
		}
	}
	/*
	 * Can be a hit only if one entry left.
	 * Note possible hit entry and put nfsrc_templist back on hash
	 * list.
	 */
	if (i != 1)
		hit = 0;
	hitrp = rp = LIST_FIRST(&nfsrc_templist);
	while (rp != LIST_END(&nfsrc_templist)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		LIST_REMOVE(rp, rc_hash);
		LIST_INSERT_HEAD(hp, rp, rc_hash);
		rp = nextrp;
	}
	if (LIST_FIRST(&nfsrc_templist) != LIST_END(&nfsrc_templist))
		panic("nfs gettcp cache templist");

	if (hit) {
		rp = hitrp;
		if ((rp->rc_flag & RC_LOCKED) != 0) {
			/*
			 * Entry is busy.  Sleep until woken or for 10 * hz
			 * ticks; PDROP means the mutex is not held on
			 * return, so start over from the lock.
			 */
			rp->rc_flag |= RC_WANTED;
			(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
			    "nfsrc", 10 * hz);
			goto tryagain;
		}
		if (rp->rc_flag == 0)
			panic("nfs tcp cache0");
		rp->rc_flag |= RC_LOCKED;
		if (rp->rc_flag & RC_INPROG) {
			/* Original still being processed: drop the retry. */
			nfsstatsv1.srvcache_inproghits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_DROPIT;
		} else if (rp->rc_flag & RC_REPSTATUS) {
			/*
			 * V2 only.
			 * Only the status was cached; rebuild the reply
			 * header and store the status in it.
			 */
			nfsstatsv1.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nfsrvd_rephead(nd);
			*(nd->nd_errp) = rp->rc_status;
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else if (rp->rc_flag & RC_REPMBUF) {
			/* Reply with a copy of the cached mbuf chain. */
			nfsstatsv1.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nd->nd_mreq = m_copym(rp->rc_reply, 0,
				M_COPYALL, M_WAITOK);
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else {
			panic("nfs tcp cache1");
		}
		/* The mutex was dropped above; release the entry lock. */
		nfsrc_unlock(rp);
		free(newrp, M_NFSRVCACHE);
		goto out;
	}
	nfsstatsv1.srvcache_misses++;
	atomic_add_int(&nfsstatsv1.srvcache_size, 1);

	/*
	 * For TCP, multiple entries for a key are allowed.  The new entry is
	 * chained into the hash table right away, marked RC_INPROG, and its
	 * rc_cachetime is reset to the current time.
	 * NOTE(review): an earlier version of this comment said the entry is
	 * not chained into the hash table until done, which does not match
	 * the code below.
	 */
	newrp->rc_cachetime = NFSD_MONOSEC;
	newrp->rc_flag |= RC_INPROG;
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}
734 
735 /*
736  * Lock a cache entry.
737  */
738 static void
739 nfsrc_lock(struct nfsrvcache *rp)
740 {
741 	struct mtx *mutex;
742 
743 	mutex = nfsrc_cachemutex(rp);
744 	mtx_assert(mutex, MA_OWNED);
745 	while ((rp->rc_flag & RC_LOCKED) != 0) {
746 		rp->rc_flag |= RC_WANTED;
747 		(void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0);
748 	}
749 	rp->rc_flag |= RC_LOCKED;
750 }
751 
752 /*
753  * Unlock a cache entry.
754  */
755 static void
756 nfsrc_unlock(struct nfsrvcache *rp)
757 {
758 	struct mtx *mutex;
759 
760 	mutex = nfsrc_cachemutex(rp);
761 	mtx_lock(mutex);
762 	rp->rc_flag &= ~RC_LOCKED;
763 	nfsrc_wanted(rp);
764 	mtx_unlock(mutex);
765 }
766 
767 /*
768  * Wakeup anyone wanting entry.
769  */
770 static void
771 nfsrc_wanted(struct nfsrvcache *rp)
772 {
773 	if (rp->rc_flag & RC_WANTED) {
774 		rp->rc_flag &= ~RC_WANTED;
775 		wakeup((caddr_t)rp);
776 	}
777 }
778 
779 /*
780  * Free up the entry.
781  * Must not sleep.
782  */
783 static void
784 nfsrc_freecache(struct nfsrvcache *rp)
785 {
786 	struct nfsrchash_bucket *hbp;
787 
788 	LIST_REMOVE(rp, rc_hash);
789 	if (rp->rc_flag & RC_UDP) {
790 		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
791 		nfsrc_udpcachesize--;
792 	} else if (rp->rc_acked != RC_NO_SEQ) {
793 		hbp = NFSRCAHASH(rp->rc_sockref);
794 		mtx_lock(&hbp->mtx);
795 		if (rp->rc_acked == RC_NO_ACK)
796 			LIST_REMOVE(rp, rc_ahash);
797 		mtx_unlock(&hbp->mtx);
798 	}
799 	nfsrc_wanted(rp);
800 	if (rp->rc_flag & RC_REPMBUF) {
801 		mbuf_freem(rp->rc_reply);
802 		if (!(rp->rc_flag & RC_UDP))
803 			atomic_add_int(&nfsrc_tcpsavedreplies, -1);
804 	}
805 	free(rp, M_NFSRVCACHE);
806 	atomic_add_int(&nfsstatsv1.srvcache_size, -1);
807 }
808 
809 /*
810  * Clean out the cache. Called when nfsserver module is unloaded.
811  */
812 APPLESTATIC void
813 nfsrvd_cleancache(void)
814 {
815 	struct nfsrvcache *rp, *nextrp;
816 	int i;
817 
818 	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
819 		mtx_lock(&nfsrchash_table[i].mtx);
820 		LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash, nextrp)
821 			nfsrc_freecache(rp);
822 		mtx_unlock(&nfsrchash_table[i].mtx);
823 	}
824 	mtx_lock(&nfsrc_udpmtx);
825 	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
826 		LIST_FOREACH_SAFE(rp, &nfsrvudphashtbl[i], rc_hash, nextrp) {
827 			nfsrc_freecache(rp);
828 		}
829 	}
830 	nfsstatsv1.srvcache_size = 0;
831 	mtx_unlock(&nfsrc_udpmtx);
832 	nfsrc_tcpsavedreplies = 0;
833 }
834 
/* Number of buckets in the timeout histogram built by nfsrc_trimcache(). */
#define HISTSIZE	16
/*
 * The basic rule is to get rid of entries that are expired.
 *
 * sockref/snd_una/final: when sockref is non-zero, entries on the ack
 * hash chain for that socket reference are marked RC_ACK once snd_una
 * has reached their recorded TCP sequence number, or RC_NACK when final
 * is set; either way they leave the ack chain.
 * After that, at most one thread at a time trims the UDP and TCP caches;
 * other callers return immediately.
 */
void
nfsrc_trimcache(u_int64_t sockref, uint32_t snd_una, int final)
{
	struct nfsrchash_bucket *hbp;
	struct nfsrvcache *rp, *nextrp;
	int force, lastslot, i, j, k, tto, time_histo[HISTSIZE];
	time_t thisstamp;
	static time_t udp_lasttrim = 0, tcp_lasttrim = 0;
	static int onethread = 0, oneslot = 0;

	if (sockref != 0) {
		hbp = NFSRCAHASH(sockref);
		mtx_lock(&hbp->mtx);
		LIST_FOREACH_SAFE(rp, &hbp->tbl, rc_ahash, nextrp) {
			if (sockref == rp->rc_sockref) {
				if (SEQ_GEQ(snd_una, rp->rc_tcpseq)) {
					rp->rc_acked = RC_ACK;
					LIST_REMOVE(rp, rc_ahash);
				} else if (final) {
					rp->rc_acked = RC_NACK;
					LIST_REMOVE(rp, rc_ahash);
				}
			}
		}
		mtx_unlock(&hbp->mtx);
	}

	/* Only the thread that flips onethread from 0 to 1 does the trim. */
	if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0)
		return;
	/*
	 * UDP: trim when the clock has moved on since the last trim or the
	 * cache has grown to 150% of the high water mark.
	 */
	if (NFSD_MONOSEC != udp_lasttrim ||
	    nfsrc_udpcachesize >= (nfsrc_udphighwater +
	    nfsrc_udphighwater / 2)) {
		mtx_lock(&nfsrc_udpmtx);
		udp_lasttrim = NFSD_MONOSEC;
		TAILQ_FOREACH_SAFE(rp, &nfsrvudplru, rc_lru, nextrp) {
			/*
			 * Only idle, unreferenced entries are candidates;
			 * they go if expired or while over the high water
			 * mark.
			 */
			if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
			     && rp->rc_refcnt == 0
			     && ((rp->rc_flag & RC_REFCNT) ||
				 udp_lasttrim > rp->rc_timestamp ||
				 nfsrc_udpcachesize > nfsrc_udphighwater))
				nfsrc_freecache(rp);
		}
		mtx_unlock(&nfsrc_udpmtx);
	}
	/*
	 * TCP: trim when the clock has moved on since the last trim or the
	 * number of saved replies has reached the high water mark.
	 */
	if (NFSD_MONOSEC != tcp_lasttrim ||
	    nfsrc_tcpsavedreplies >= nfsrc_tcphighwater) {
		/*
		 * "force" is a quarter of the high water mark.  When the
		 * saved replies are within that distance of the high water
		 * mark, scan every bucket and build the histogram used for
		 * the forced trim below.
		 */
		force = nfsrc_tcphighwater / 4;
		if (force > 0 &&
		    nfsrc_tcpsavedreplies + force >= nfsrc_tcphighwater) {
			for (i = 0; i < HISTSIZE; i++)
				time_histo[i] = 0;
			i = 0;
			lastslot = NFSRVCACHE_HASHSIZE - 1;
		} else {
			force = 0;
			if (NFSD_MONOSEC != tcp_lasttrim) {
				/* First trim at this clock value: all slots. */
				i = 0;
				lastslot = NFSRVCACHE_HASHSIZE - 1;
			} else {
				/* Otherwise one slot per call, round robin. */
				lastslot = i = oneslot;
				if (++oneslot >= NFSRVCACHE_HASHSIZE)
					oneslot = 0;
			}
		}
		tto = nfsrc_tcptimeout;
		tcp_lasttrim = NFSD_MONOSEC;
		for (; i <= lastslot; i++) {
			mtx_lock(&nfsrchash_table[i].mtx);
			LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash,
			    nextrp) {
				if (!(rp->rc_flag &
				     (RC_INPROG|RC_LOCKED|RC_WANTED))
				     && rp->rc_refcnt == 0) {
					/* Expired or acked entries go now. */
					if ((rp->rc_flag & RC_REFCNT) ||
					    tcp_lasttrim > rp->rc_timestamp ||
					    rp->rc_acked == RC_ACK) {
						nfsrc_freecache(rp);
						continue;
					}

					if (force == 0)
						continue;
					/*
					 * The timestamps range from roughly the
					 * present (tcp_lasttrim) to the present
					 * + nfsrc_tcptimeout. Generate a simple
					 * histogram of where the timeouts fall.
					 */
					j = rp->rc_timestamp - tcp_lasttrim;
					if (j >= tto)
						j = HISTSIZE - 1;
					else if (j < 0)
						j = 0;
					else
						j = j * HISTSIZE / tto;
					time_histo[j]++;
				}
			}
			mtx_unlock(&nfsrchash_table[i].mtx);
		}
		if (force) {
			/*
			 * Trim some more with a smaller timeout of as little
			 * as 20% of nfsrc_tcptimeout to try and get below
			 * 80% of the nfsrc_tcphighwater.
			 * Walk the histogram until more than "force" entries
			 * have been counted; that bucket determines the
			 * reduced timeout k (at least 1).
			 */
			k = 0;
			for (i = 0; i < (HISTSIZE - 2); i++) {
				k += time_histo[i];
				if (k > force)
					break;
			}
			k = tto * (i + 1) / HISTSIZE;
			if (k < 1)
				k = 1;
			thisstamp = tcp_lasttrim + k;
			/* Second pass: free entries expiring before thisstamp. */
			for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
				mtx_lock(&nfsrchash_table[i].mtx);
				LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl,
				    rc_hash, nextrp) {
					if (!(rp->rc_flag &
					     (RC_INPROG|RC_LOCKED|RC_WANTED))
					     && rp->rc_refcnt == 0
					     && ((rp->rc_flag & RC_REFCNT) ||
						 thisstamp > rp->rc_timestamp ||
						 rp->rc_acked == RC_ACK))
						nfsrc_freecache(rp);
				}
				mtx_unlock(&nfsrchash_table[i].mtx);
			}
		}
	}
	/* Let the next caller trim. */
	atomic_store_rel_int(&onethread, 0);
}
973 
974 /*
975  * Add a seqid# reference to the cache entry.
976  */
977 APPLESTATIC void
978 nfsrvd_refcache(struct nfsrvcache *rp)
979 {
980 	struct mtx *mutex;
981 
982 	if (rp == NULL)
983 		/* For NFSv4.1, there is no cache entry. */
984 		return;
985 	mutex = nfsrc_cachemutex(rp);
986 	mtx_lock(mutex);
987 	if (rp->rc_refcnt < 0)
988 		panic("nfs cache refcnt");
989 	rp->rc_refcnt++;
990 	mtx_unlock(mutex);
991 }
992 
993 /*
994  * Dereference a seqid# cache entry.
995  */
996 APPLESTATIC void
997 nfsrvd_derefcache(struct nfsrvcache *rp)
998 {
999 	struct mtx *mutex;
1000 
1001 	mutex = nfsrc_cachemutex(rp);
1002 	mtx_lock(mutex);
1003 	if (rp->rc_refcnt <= 0)
1004 		panic("nfs cache derefcnt");
1005 	rp->rc_refcnt--;
1006 	if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
1007 		nfsrc_freecache(rp);
1008 	mtx_unlock(mutex);
1009 }
1010 
1011 /*
1012  * Calculate the length of the mbuf list and a checksum on the first up to
1013  * NFSRVCACHE_CHECKLEN bytes.
1014  */
1015 static int
1016 nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum)
1017 {
1018 	int len = 0, cklen;
1019 	mbuf_t m;
1020 
1021 	m = m1;
1022 	while (m) {
1023 		len += mbuf_len(m);
1024 		m = mbuf_next(m);
1025 	}
1026 	cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
1027 	*cksum = in_cksum(m1, cklen);
1028 	return (len);
1029 }
1030 
/*
 * Mark a TCP connection that is seeing retries. Should never happen for
 * NFSv4.
 * Currently an empty stub: the socket reference is ignored and nothing
 * is recorded.
 */
static void
nfsrc_marksametcpconn(u_int64_t sockref)
{
}
1039 
1040