/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 * Here is the basic algorithm:
 * First, some design criteria I used:
 * - I think a false hit is more serious than a false miss
 * - A false hit for an RPC that has Op(s) that order via seqid# must be
 *   avoided at all cost
 * - A valid hit will probably happen a long time after the original reply
 *   and the TCP socket that the original request was received on will no
 *   longer be active
 *   (The long time delay implies to me that LRU is not appropriate.)
 * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
 *   in them as well as minimizing the risk of redoing retried non-idempotent
 *   Ops.
 * Because it is biased towards avoiding false hits, multiple entries with
 * the same xid are to be expected, especially for the case of the entry
 * in the cache being related to a seqid# sequenced Op.
 *
 * The basic algorithm I'm about to code up:
 * - Null RPCs bypass the cache and are just done
 * For TCP
 * - key on <xid, NFS version> (as noted above, there can be several
 *   entries with the same key)
 * When a request arrives:
 * For all that match key
 * - if RPC# != OR request_size !=
 *   - not a match with this one
 * - if NFSv4 and received on same TCP socket OR
 *	received on a TCP connection created before the
 *	entry was cached
 *   - not a match with this one
 *   (V2,3 clients might retry on same TCP socket)
 * - calculate checksum on first N bytes of NFS XDR
 * - if checksum !=
 *   - not a match for this one
 * If any of the remaining ones that match has a
 *	seqid_refcnt > 0
 *   - not a match (go do RPC, using new cache entry)
 * If one match left
 *   - a hit (reply from cache)
 * else
 *   - miss (go do RPC, using new cache entry)
 *
 * During processing of NFSv4 request:
 * - set a flag when a non-idempotent Op is processed
 * - when an Op that uses a seqid# (Open,...) is processed
 *   - if same seqid# as referenced entry in cache
 *	- free new cache entry
 *	- reply from referenced cache entry
 *     else if next seqid# in order
 *	- free referenced cache entry
 *	- increment seqid_refcnt on new cache entry
 *	- set pointer from Openowner/Lockowner to
 *	  new cache entry (aka reference it)
 *     else if first seqid# in sequence
 *	- increment seqid_refcnt on new cache entry
 *	- set pointer from Openowner/Lockowner to
 *	  new cache entry (aka reference it)
 *
 * At end of RPC processing:
 * - if seqid_refcnt > 0 OR flagged non-idempotent on new
 *	cache entry
 *   - save reply in cache entry
 *   - calculate checksum on first N bytes of NFS XDR
 *	request
 *   - note op and length of XDR request (in bytes)
 *   - timestamp it
 *   else
 *   - free new cache entry
 * - Send reply (noting info for socket activity check, below)
 *
 * For cache entries saved above:
 * - if saved since seqid_refcnt was > 0
 *   - free when seqid_refcnt decrements to 0
 *     (when next one in sequence is processed above, or
 *	when Openowner/Lockowner is discarded)
 *   else { non-idempotent Op(s) }
 *   - free when
 *	- some further activity observed on same
 *	  socket
 *	  (I'm not yet sure how I'm going to do
 *	   this. Maybe look at the TCP connection
 *	   to see if the send_tcp_sequence# is well
 *	   past sent reply OR K additional RPCs
 *	   replied on same socket OR?)
 *	OR
 *	- when very old (hours, days, weeks?)
 *
 * For UDP (v2, 3 only), pretty much the old way:
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 *
 * When a Request arrives:
 * - if a match with entry via key
 *   - if RPC marked In_progress
 *	- discard request (don't send reply)
 *     else
 *	- reply from cache
 *	- timestamp cache entry
 *   else
 *   - add entry to cache, marked In_progress
 *   - do RPC
 *   - when RPC done
 *	- if RPC# non-idempotent
 *	  - mark entry Done (not In_progress)
 *	  - save reply
 *	  - timestamp cache entry
 *	  else
 *	  - free cache entry
 *	- send reply
 *
 * Later, entries with saved replies are free'd a short time (few minutes)
 * after reply sent (timestamp).
 * Reference: Chet Juszczak, "Improving the Performance and Correctness
 *		of an NFS Server", in Proc. Winter 1989 USENIX Conference,
 *		pages 53-63. San Diego, February 1989.
 *	for the UDP case.
 * nfsrc_floodlevel is set to the allowable upper limit for saved replies
 *	for TCP. For V3, a reply won't be saved when the flood level is
 *	hit. For V4, the non-idempotent Op will return NFSERR_RESOURCE in
 *	that case. This level should be set high enough that this almost
 *	never happens.
 */
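
/*
 * How the entry points below are intended to fit together for one request
 * (condensed and illustrative only; the real callers live elsewhere in the
 * NFS server code):
 *
 *	switch (nfsrvd_getcache(nd)) {
 *	case RC_DROPIT:		retry of an in-progress RPC; discard it
 *	case RC_REPLY:		reply rebuilt from the cache in nd_mreq
 *	case RC_DOIT:		service the RPC, then
 *				rp = nfsrvd_updatecache(nd);
 *				send the reply;
 *				if (rp != NULL)
 *					nfsrvd_sentcache(rp, have_seq, seq);
 *	}
 *
 * nfsrc_trimcache() runs periodically (and with TCP ack information) to
 * discard expired or acknowledged entries, while nfsrvd_refcache() and
 * nfsrvd_derefcache() maintain the seqid# references described above.
 */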
#ifndef APPLEKEXT
#include <fs/nfs/nfsport.h>

extern struct nfsstatsv1 nfsstatsv1;
extern struct mtx nfsrc_udpmtx;
extern struct nfsrchash_bucket nfsrchash_table[NFSRVCACHE_HASHSIZE];
extern struct nfsrchash_bucket nfsrcahash_table[NFSRVCACHE_HASHSIZE];
int nfsrc_floodlevel = NFSRVCACHE_FLOODLEVEL, nfsrc_tcpsavedreplies = 0;
#endif	/* !APPLEKEXT */

SYSCTL_DECL(_vfs_nfsd);

static u_int	nfsrc_tcphighwater = 0;
static int
sysctl_tcphighwater(SYSCTL_HANDLER_ARGS)
{
	int error, newhighwater;

	newhighwater = nfsrc_tcphighwater;
	error = sysctl_handle_int(oidp, &newhighwater, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (newhighwater < 0)
		return (EINVAL);
	if (newhighwater >= nfsrc_floodlevel)
		nfsrc_floodlevel = newhighwater + newhighwater / 5;
	nfsrc_tcphighwater = newhighwater;
	return (0);
}
SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater, CTLTYPE_UINT | CTLFLAG_RW, 0,
    sizeof(nfsrc_tcphighwater), sysctl_tcphighwater, "IU",
    "High water mark for TCP cache entries");

static u_int	nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
    &nfsrc_udphighwater, 0,
    "High water mark for UDP cache entries");
static u_int	nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
    &nfsrc_tcptimeout, 0,
    "Timeout for TCP entries in the DRC");
static u_int nfsrc_tcpnonidempotent = 1;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
    &nfsrc_tcpnonidempotent, 0,
    "Enable the DRC for NFS over TCP");

static int nfsrc_udpcachesize = 0;
static TAILQ_HEAD(, nfsrvcache) nfsrvudplru;
static struct nfsrvhashhead nfsrvudphashtbl[NFSRVCACHE_HASHSIZE];

/*
 * and the reverse mapping from generic to Version 2 procedure numbers
 */
static int newnfsv2_procid[NFS_V3NPROCS] = {
	NFSV2PROC_NULL,
	NFSV2PROC_GETATTR,
	NFSV2PROC_SETATTR,
	NFSV2PROC_LOOKUP,
	NFSV2PROC_NOOP,
	NFSV2PROC_READLINK,
	NFSV2PROC_READ,
	NFSV2PROC_WRITE,
	NFSV2PROC_CREATE,
	NFSV2PROC_MKDIR,
	NFSV2PROC_SYMLINK,
	NFSV2PROC_CREATE,
	NFSV2PROC_REMOVE,
	NFSV2PROC_RMDIR,
	NFSV2PROC_RENAME,
	NFSV2PROC_LINK,
	NFSV2PROC_READDIR,
	NFSV2PROC_NOOP,
	NFSV2PROC_STATFS,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
};

#define	nfsrc_hash(xid)	(((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
#define	NFSRCUDPHASH(xid) \
	(&nfsrvudphashtbl[nfsrc_hash(xid)])
#define	NFSRCHASH(xid) \
	(&nfsrchash_table[nfsrc_hash(xid)].tbl)
#define	NFSRCAHASH(xid) (&nfsrcahash_table[nfsrc_hash(xid)])
#define	TRUE	1
#define	FALSE	0
#define	NFSRVCACHE_CHECKLEN	100
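
/*
 * Note on nfsrc_hash(): adding (xid >> 24) folds the high-order byte of
 * the xid into the low-order bits before the modulus is taken, so clients
 * that allocate xids sequentially still spread across the hash buckets.
 * For example (illustrative), xid 0x12000005 hashes to bucket
 * (0x12000005 + 0x12) % NFSRVCACHE_HASHSIZE.
 */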

/* True iff the rpc reply is an nfs status ONLY! */
static int nfsv2_repstat[NFS_V3NPROCS] = {
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	TRUE,
	TRUE,
	TRUE,
	TRUE,
	FALSE,
	TRUE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
};

/*
 * Will NFS want to work over IPv6 someday?
 */
#define	NETFAMILY(rp) \
		(((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)

/* local functions */
static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static void nfsrc_lock(struct nfsrvcache *rp);
static void nfsrc_unlock(struct nfsrvcache *rp);
static void nfsrc_wanted(struct nfsrvcache *rp);
static void nfsrc_freecache(struct nfsrvcache *rp);
static int nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum);
static void nfsrc_marksametcpconn(u_int64_t);

/*
 * Return the correct mutex for this cache entry.
 */
static __inline struct mtx *
nfsrc_cachemutex(struct nfsrvcache *rp)
{

	if ((rp->rc_flag & RC_UDP) != 0)
		return (&nfsrc_udpmtx);
	return (&nfsrchash_table[nfsrc_hash(rp->rc_xid)].mtx);
}

/*
 * Initialize the server request cache list
 */
APPLESTATIC void
nfsrvd_initcache(void)
{
	int i;
	static int inited = 0;

	if (inited)
		return;
	inited = 1;
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_INIT(&nfsrvudphashtbl[i]);
		LIST_INIT(&nfsrchash_table[i].tbl);
		LIST_INIT(&nfsrcahash_table[i].tbl);
	}
	TAILQ_INIT(&nfsrvudplru);
	nfsrc_tcpsavedreplies = 0;
	nfsrc_udpcachesize = 0;
	nfsstatsv1.srvcache_tcppeak = 0;
	nfsstatsv1.srvcache_size = 0;
}
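
/*
 * Locking summary (as used by the code below): entries with RC_UDP set are
 * protected by the single nfsrc_udpmtx, while TCP entries are protected by
 * the per-bucket mutex in nfsrchash_table[].  The separate per-bucket
 * mutexes in nfsrcahash_table[] protect the lists of TCP replies that are
 * waiting to be acknowledged (see nfsrvd_sentcache() and nfsrc_trimcache()).
 */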

/*
 * Get a cache entry for this request. Basically just malloc a new one
 * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
 */
APPLESTATIC int
nfsrvd_getcache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *newrp;
	int ret;

	if (nd->nd_procnum == NFSPROC_NULL)
		panic("nfsd cache null");
	newrp = malloc(sizeof (struct nfsrvcache),
	    M_NFSRVCACHE, M_WAITOK);
	NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
	if (nd->nd_flag & ND_NFSV4)
		newrp->rc_flag = RC_NFSV4;
	else if (nd->nd_flag & ND_NFSV3)
		newrp->rc_flag = RC_NFSV3;
	else
		newrp->rc_flag = RC_NFSV2;
	newrp->rc_xid = nd->nd_retxid;
	newrp->rc_proc = nd->nd_procnum;
	newrp->rc_sockref = nd->nd_sockref;
	newrp->rc_cachetime = nd->nd_tcpconntime;
	if (nd->nd_flag & ND_SAMETCPCONN)
		newrp->rc_flag |= RC_SAMETCPCONN;
	if (nd->nd_nam2 != NULL) {
		newrp->rc_flag |= RC_UDP;
		ret = nfsrc_getudp(nd, newrp);
	} else {
		ret = nfsrc_gettcp(nd, newrp);
	}
	NFSEXITCODE2(0, nd);
	return (ret);
}
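
/*
 * Both nfsrc_getudp() and nfsrc_gettcp() below return one of:
 *	RC_DROPIT - a retry of an RPC that is still in progress; drop it
 *	RC_REPLY  - a reply has been generated from the cache into nd_mreq
 *	RC_DOIT   - no usable hit; nd_rp points at the new in-progress entry
 */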

/*
 * For UDP (v2, v3):
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *	(at most one entry for each key)
 */
static int
nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp;
	struct sockaddr_in *saddr;
	struct sockaddr_in6 *saddr6;
	struct nfsrvhashhead *hp;
	int ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCUDPHASH(newrp->rc_xid);
loop:
	mtx_lock(mutex);
	LIST_FOREACH(rp, hp, rc_hash) {
	    if (newrp->rc_xid == rp->rc_xid &&
		newrp->rc_proc == rp->rc_proc &&
		(newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
			if ((rp->rc_flag & RC_LOCKED) != 0) {
				rp->rc_flag |= RC_WANTED;
				(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
				    "nfsrc", 10 * hz);
				goto loop;
			}
			if (rp->rc_flag == 0)
				panic("nfs udp cache0");
			rp->rc_flag |= RC_LOCKED;
			TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
			TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
			if (rp->rc_flag & RC_INPROG) {
				nfsstatsv1.srvcache_inproghits++;
				mtx_unlock(mutex);
				ret = RC_DROPIT;
			} else if (rp->rc_flag & RC_REPSTATUS) {
				/*
				 * V2 only.
				 */
				nfsstatsv1.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nfsrvd_rephead(nd);
				*(nd->nd_errp) = rp->rc_status;
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
					NFSRVCACHE_UDPTIMEOUT;
			} else if (rp->rc_flag & RC_REPMBUF) {
				nfsstatsv1.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nd->nd_mreq = m_copym(rp->rc_reply, 0,
				    M_COPYALL, M_WAITOK);
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
					NFSRVCACHE_UDPTIMEOUT;
			} else {
				panic("nfs udp cache1");
			}
			nfsrc_unlock(rp);
			free(newrp, M_NFSRVCACHE);
			goto out;
		}
	}
	nfsstatsv1.srvcache_misses++;
	atomic_add_int(&nfsstatsv1.srvcache_size, 1);
	nfsrc_udpcachesize++;

	newrp->rc_flag |= RC_INPROG;
	saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
	if (saddr->sin_family == AF_INET)
		newrp->rc_inet = saddr->sin_addr.s_addr;
	else if (saddr->sin_family == AF_INET6) {
		saddr6 = (struct sockaddr_in6 *)saddr;
		NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
		    sizeof (struct in6_addr));
		newrp->rc_flag |= RC_INETIPV6;
	}
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	TAILQ_INSERT_TAIL(&nfsrvudplru, newrp, rc_lru);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Update a request cache entry after the rpc has been done
 */
APPLESTATIC struct nfsrvcache *
nfsrvd_updatecache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *rp;
	struct nfsrvcache *retrp = NULL;
	mbuf_t m;
	struct mtx *mutex;

	rp = nd->nd_rp;
	if (!rp)
		panic("nfsrvd_updatecache null rp");
	nd->nd_rp = NULL;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	nfsrc_lock(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_updatecache not inprog");
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
	}

	/*
	 * Reply from cache is a special case returned by nfsrv_checkseqid().
	 */
	if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
		nfsstatsv1.srvcache_nonidemdonehits++;
		mtx_unlock(mutex);
		nd->nd_repstat = 0;
		if (nd->nd_mreq)
			mbuf_freem(nd->nd_mreq);
		if (!(rp->rc_flag & RC_REPMBUF))
			panic("reply from cache");
		nd->nd_mreq = m_copym(rp->rc_reply, 0,
		    M_COPYALL, M_WAITOK);
		rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		nfsrc_unlock(rp);
		goto out;
	}

	/*
	 * If rc_refcnt > 0, save it
	 * For UDP, save it if ND_SAVEREPLY is set
	 * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set
	 */
	if (nd->nd_repstat != NFSERR_DONTREPLY &&
	    (rp->rc_refcnt > 0 ||
	     ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
	     ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
	      nfsrc_tcpsavedreplies <= nfsrc_floodlevel &&
	      nfsrc_tcpnonidempotent))) {
		if (rp->rc_refcnt > 0) {
			if (!(rp->rc_flag & RC_NFSV4))
				panic("update_cache refcnt");
			rp->rc_flag |= RC_REFCNT;
		}
		if ((nd->nd_flag & ND_NFSV2) &&
		    nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
			rp->rc_status = nd->nd_repstat;
			rp->rc_flag |= RC_REPSTATUS;
			mtx_unlock(mutex);
		} else {
			if (!(rp->rc_flag & RC_UDP)) {
				atomic_add_int(&nfsrc_tcpsavedreplies, 1);
				if (nfsrc_tcpsavedreplies >
				    nfsstatsv1.srvcache_tcppeak)
					nfsstatsv1.srvcache_tcppeak =
					    nfsrc_tcpsavedreplies;
			}
			mtx_unlock(mutex);
			m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
			mtx_lock(mutex);
			rp->rc_reply = m;
			rp->rc_flag |= RC_REPMBUF;
			mtx_unlock(mutex);
		}
		if (rp->rc_flag & RC_UDP) {
			rp->rc_timestamp = NFSD_MONOSEC +
			    NFSRVCACHE_UDPTIMEOUT;
			nfsrc_unlock(rp);
		} else {
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
			if (rp->rc_refcnt > 0)
				nfsrc_unlock(rp);
			else
				retrp = rp;
		}
	} else {
		nfsrc_freecache(rp);
		mtx_unlock(mutex);
	}

out:
	NFSEXITCODE2(0, nd);
	return (retrp);
}

/*
 * Invalidate and, if possible, free an in prog cache entry.
 * Must not sleep.
 */
APPLESTATIC void
nfsrvd_delcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_delcache not in prog");
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}
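
/*
 * Reply-acknowledgement tracking (a summary of the code that follows):
 * when nfsrvd_updatecache() returns a locked TCP entry, the caller sends
 * the reply and then calls nfsrvd_sentcache() with the TCP sequence number
 * at the end of that reply.  The entry goes on the nfsrcahash_table[] list
 * for its socket and nfsrc_trimcache() marks it RC_ACK once snd_una passes
 * rc_tcpseq, so the saved reply can be freed early rather than waiting the
 * full nfsrc_tcptimeout.
 */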

/*
 * Called after nfsrvd_updatecache() once the reply is sent, to update
 * the entry's sequence number and unlock it.  The argument is
 * the pointer returned by nfsrvd_updatecache().
 */
APPLESTATIC void
nfsrvd_sentcache(struct nfsrvcache *rp, int have_seq, uint32_t seq)
{
	struct nfsrchash_bucket *hbp;

	KASSERT(rp->rc_flag & RC_LOCKED, ("nfsrvd_sentcache not locked"));
	if (have_seq) {
		hbp = NFSRCAHASH(rp->rc_sockref);
		mtx_lock(&hbp->mtx);
		rp->rc_tcpseq = seq;
		if (rp->rc_acked != RC_NO_ACK)
			LIST_INSERT_HEAD(&hbp->tbl, rp, rc_ahash);
		rp->rc_acked = RC_NO_ACK;
		mtx_unlock(&hbp->mtx);
	}
	nfsrc_unlock(rp);
}

/*
 * Get a cache entry for TCP
 * - key on <xid, nfs version>
 *	(allow multiple entries for a given key)
 */
static int
nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp, *nextrp;
	int i;
	struct nfsrvcache *hitrp;
	struct nfsrvhashhead *hp, nfsrc_templist;
	int hit, ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCHASH(newrp->rc_xid);
	newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
tryagain:
	mtx_lock(mutex);
	hit = 1;
	LIST_INIT(&nfsrc_templist);
	/*
	 * Get all the matches and put them on the temp list.
	 */
	rp = LIST_FIRST(hp);
	while (rp != LIST_END(hp)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		if (newrp->rc_xid == rp->rc_xid &&
		    (!(rp->rc_flag & RC_INPROG) ||
		     ((newrp->rc_flag & RC_SAMETCPCONN) &&
		      newrp->rc_sockref == rp->rc_sockref)) &&
		    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		    newrp->rc_proc == rp->rc_proc &&
		    ((newrp->rc_flag & RC_NFSV4) &&
		     newrp->rc_sockref != rp->rc_sockref &&
		     newrp->rc_cachetime >= rp->rc_cachetime)
		    && newrp->rc_reqlen == rp->rc_reqlen &&
		    newrp->rc_cksum == rp->rc_cksum) {
			LIST_REMOVE(rp, rc_hash);
			LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
		}
		rp = nextrp;
	}

	/*
	 * Now, use nfsrc_templist to decide if there is a match.
	 */
	i = 0;
	LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
		i++;
		if (rp->rc_refcnt > 0) {
			hit = 0;
			break;
		}
	}
	/*
	 * Can be a hit only if one entry left.
	 * Note possible hit entry and put nfsrc_templist back on hash
	 * list.
	 */
	if (i != 1)
		hit = 0;
	hitrp = rp = LIST_FIRST(&nfsrc_templist);
	while (rp != LIST_END(&nfsrc_templist)) {
		nextrp = LIST_NEXT(rp, rc_hash);
		LIST_REMOVE(rp, rc_hash);
		LIST_INSERT_HEAD(hp, rp, rc_hash);
		rp = nextrp;
	}
	if (LIST_FIRST(&nfsrc_templist) != LIST_END(&nfsrc_templist))
		panic("nfs gettcp cache templist");

	if (hit) {
		rp = hitrp;
		if ((rp->rc_flag & RC_LOCKED) != 0) {
			rp->rc_flag |= RC_WANTED;
			(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
			    "nfsrc", 10 * hz);
			goto tryagain;
		}
		if (rp->rc_flag == 0)
			panic("nfs tcp cache0");
		rp->rc_flag |= RC_LOCKED;
		if (rp->rc_flag & RC_INPROG) {
			nfsstatsv1.srvcache_inproghits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_DROPIT;
		} else if (rp->rc_flag & RC_REPSTATUS) {
			/*
			 * V2 only.
			 */
			nfsstatsv1.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nfsrvd_rephead(nd);
			*(nd->nd_errp) = rp->rc_status;
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else if (rp->rc_flag & RC_REPMBUF) {
			nfsstatsv1.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nd->nd_mreq = m_copym(rp->rc_reply, 0,
			    M_COPYALL, M_WAITOK);
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else {
			panic("nfs tcp cache1");
		}
		nfsrc_unlock(rp);
		free(newrp, M_NFSRVCACHE);
		goto out;
	}
	nfsstatsv1.srvcache_misses++;
	atomic_add_int(&nfsstatsv1.srvcache_size, 1);

	/*
	 * For TCP, multiple entries for a key are allowed, so don't
	 * chain it into the hash table until done.
	 */
	newrp->rc_cachetime = NFSD_MONOSEC;
	newrp->rc_flag |= RC_INPROG;
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Lock a cache entry.
 */
static void
nfsrc_lock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_assert(mutex, MA_OWNED);
	while ((rp->rc_flag & RC_LOCKED) != 0) {
		rp->rc_flag |= RC_WANTED;
		(void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0);
	}
	rp->rc_flag |= RC_LOCKED;
}

/*
 * Unlock a cache entry.
 */
static void
nfsrc_unlock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_LOCKED;
	nfsrc_wanted(rp);
	mtx_unlock(mutex);
}

/*
 * Wakeup anyone wanting entry.
 */
static void
nfsrc_wanted(struct nfsrvcache *rp)
{
	if (rp->rc_flag & RC_WANTED) {
		rp->rc_flag &= ~RC_WANTED;
		wakeup((caddr_t)rp);
	}
}

/*
 * Free up the entry.
 * Must not sleep.
 */
static void
nfsrc_freecache(struct nfsrvcache *rp)
{
	struct nfsrchash_bucket *hbp;

	LIST_REMOVE(rp, rc_hash);
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		nfsrc_udpcachesize--;
	} else if (rp->rc_acked != RC_NO_SEQ) {
		hbp = NFSRCAHASH(rp->rc_sockref);
		mtx_lock(&hbp->mtx);
		if (rp->rc_acked == RC_NO_ACK)
			LIST_REMOVE(rp, rc_ahash);
		mtx_unlock(&hbp->mtx);
	}
	nfsrc_wanted(rp);
	if (rp->rc_flag & RC_REPMBUF) {
		mbuf_freem(rp->rc_reply);
		if (!(rp->rc_flag & RC_UDP))
			atomic_add_int(&nfsrc_tcpsavedreplies, -1);
	}
	free(rp, M_NFSRVCACHE);
	atomic_add_int(&nfsstatsv1.srvcache_size, -1);
}

/*
 * Clean out the cache. Called when nfsserver module is unloaded.
 */
APPLESTATIC void
nfsrvd_cleancache(void)
{
	struct nfsrvcache *rp, *nextrp;
	int i;

	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		mtx_lock(&nfsrchash_table[i].mtx);
		LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash, nextrp)
			nfsrc_freecache(rp);
		mtx_unlock(&nfsrchash_table[i].mtx);
	}
	mtx_lock(&nfsrc_udpmtx);
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_FOREACH_SAFE(rp, &nfsrvudphashtbl[i], rc_hash, nextrp) {
			nfsrc_freecache(rp);
		}
	}
	nfsstatsv1.srvcache_size = 0;
	mtx_unlock(&nfsrc_udpmtx);
	nfsrc_tcpsavedreplies = 0;
}

#define	HISTSIZE	16
/*
 * The basic rule is to get rid of entries that are expired.
 */
void
nfsrc_trimcache(u_int64_t sockref, uint32_t snd_una, int final)
{
	struct nfsrchash_bucket *hbp;
	struct nfsrvcache *rp, *nextrp;
	int force, lastslot, i, j, k, tto, time_histo[HISTSIZE];
	time_t thisstamp;
	static time_t udp_lasttrim = 0, tcp_lasttrim = 0;
	static int onethread = 0, oneslot = 0;

	if (sockref != 0) {
		hbp = NFSRCAHASH(sockref);
		mtx_lock(&hbp->mtx);
		LIST_FOREACH_SAFE(rp, &hbp->tbl, rc_ahash, nextrp) {
			if (sockref == rp->rc_sockref) {
				if (SEQ_GEQ(snd_una, rp->rc_tcpseq)) {
					rp->rc_acked = RC_ACK;
					LIST_REMOVE(rp, rc_ahash);
				} else if (final) {
					rp->rc_acked = RC_NACK;
					LIST_REMOVE(rp, rc_ahash);
				}
			}
		}
		mtx_unlock(&hbp->mtx);
	}

	if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0)
		return;
	if (NFSD_MONOSEC != udp_lasttrim ||
	    nfsrc_udpcachesize >= (nfsrc_udphighwater +
	    nfsrc_udphighwater / 2)) {
		mtx_lock(&nfsrc_udpmtx);
		udp_lasttrim = NFSD_MONOSEC;
		TAILQ_FOREACH_SAFE(rp, &nfsrvudplru, rc_lru, nextrp) {
			if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
			     && rp->rc_refcnt == 0
			     && ((rp->rc_flag & RC_REFCNT) ||
				 udp_lasttrim > rp->rc_timestamp ||
				 nfsrc_udpcachesize > nfsrc_udphighwater))
				nfsrc_freecache(rp);
		}
		mtx_unlock(&nfsrc_udpmtx);
	}
	if (NFSD_MONOSEC != tcp_lasttrim ||
	    nfsrc_tcpsavedreplies >= nfsrc_tcphighwater) {
		force = nfsrc_tcphighwater / 4;
		if (force > 0 &&
		    nfsrc_tcpsavedreplies + force >= nfsrc_tcphighwater) {
			for (i = 0; i < HISTSIZE; i++)
				time_histo[i] = 0;
			i = 0;
			lastslot = NFSRVCACHE_HASHSIZE - 1;
		} else {
			force = 0;
			if (NFSD_MONOSEC != tcp_lasttrim) {
				i = 0;
				lastslot = NFSRVCACHE_HASHSIZE - 1;
			} else {
				lastslot = i = oneslot;
				if (++oneslot >= NFSRVCACHE_HASHSIZE)
					oneslot = 0;
			}
		}
		tto = nfsrc_tcptimeout;
		tcp_lasttrim = NFSD_MONOSEC;
		for (; i <= lastslot; i++) {
			mtx_lock(&nfsrchash_table[i].mtx);
			LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash,
			    nextrp) {
				if (!(rp->rc_flag &
				     (RC_INPROG|RC_LOCKED|RC_WANTED))
				     && rp->rc_refcnt == 0) {
					if ((rp->rc_flag & RC_REFCNT) ||
					    tcp_lasttrim > rp->rc_timestamp ||
					    rp->rc_acked == RC_ACK) {
						nfsrc_freecache(rp);
						continue;
					}

					if (force == 0)
						continue;
					/*
					 * The timestamps range from roughly the
					 * present (tcp_lasttrim) to the present
					 * + nfsrc_tcptimeout. Generate a simple
					 * histogram of where the timeouts fall.
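					 * For example (illustrative), with
					 * HISTSIZE 16, an entry due to expire
					 * a quarter of the way through the
					 * timeout period falls in bucket
					 * j = (tto / 4) * 16 / tto = 4.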
					 */
					j = rp->rc_timestamp - tcp_lasttrim;
					if (j >= tto)
						j = HISTSIZE - 1;
					else if (j < 0)
						j = 0;
					else
						j = j * HISTSIZE / tto;
					time_histo[j]++;
				}
			}
			mtx_unlock(&nfsrchash_table[i].mtx);
		}
		if (force) {
			/*
			 * Trim some more with a smaller timeout of as little
			 * as 20% of nfsrc_tcptimeout to try and get below
			 * 80% of the nfsrc_tcphighwater.
			 */
			k = 0;
			for (i = 0; i < (HISTSIZE - 2); i++) {
				k += time_histo[i];
				if (k > force)
					break;
			}
			k = tto * (i + 1) / HISTSIZE;
			if (k < 1)
				k = 1;
			thisstamp = tcp_lasttrim + k;
			for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
				mtx_lock(&nfsrchash_table[i].mtx);
				LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl,
				    rc_hash, nextrp) {
					if (!(rp->rc_flag &
					     (RC_INPROG|RC_LOCKED|RC_WANTED))
					     && rp->rc_refcnt == 0
					     && ((rp->rc_flag & RC_REFCNT) ||
						 thisstamp > rp->rc_timestamp ||
						 rp->rc_acked == RC_ACK))
						nfsrc_freecache(rp);
				}
				mtx_unlock(&nfsrchash_table[i].mtx);
			}
		}
	}
	atomic_store_rel_int(&onethread, 0);
}

/*
 * Add a seqid# reference to the cache entry.
 */
APPLESTATIC void
nfsrvd_refcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	if (rp == NULL)
		/* For NFSv4.1, there is no cache entry. */
		return;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt < 0)
		panic("nfs cache refcnt");
	rp->rc_refcnt++;
	mtx_unlock(mutex);
}

/*
 * Dereference a seqid# cache entry.
 */
APPLESTATIC void
nfsrvd_derefcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt <= 0)
		panic("nfs cache derefcnt");
	rp->rc_refcnt--;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}

/*
 * Calculate the length of the mbuf list and a checksum on the first up to
 * NFSRVCACHE_CHECKLEN bytes.
 */
static int
nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum)
{
	int len = 0, cklen;
	mbuf_t m;

	m = m1;
	while (m) {
		len += mbuf_len(m);
		m = mbuf_next(m);
	}
	cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
	*cksum = in_cksum(m1, cklen);
	return (len);
}

/*
 * Mark a TCP connection that is seeing retries. Should never happen for
 * NFSv4.
 */
static void
nfsrc_marksametcpconn(u_int64_t sockref)
{
}
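
/*
 * Tuning note (illustrative): the sysctl knobs declared above appear under
 * vfs.nfsd, e.g. from userland:
 *	sysctl vfs.nfsd.tcphighwater=10000
 *	sysctl vfs.nfsd.cachetcp=0	(disable the DRC for NFS over TCP)
 * Setting tcphighwater to a value at or above the current flood level also
 * raises nfsrc_floodlevel to the new value plus 20% (see
 * sysctl_tcphighwater() above).
 */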