1 /*- 2 * Copyright (c) 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * Rick Macklem at The University of Guelph. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 4. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 */ 33 34 #include <sys/cdefs.h> 35 __FBSDID("$FreeBSD$"); 36 37 /* 38 * Here is the basic algorithm: 39 * First, some design criteria I used: 40 * - I think a false hit is more serious than a false miss 41 * - A false hit for an RPC that has Op(s) that order via seqid# must be 42 * avoided at all cost 43 * - A valid hit will probably happen a long time after the original reply 44 * and the TCP socket that the original request was received on will no 45 * longer be active 46 * (The long time delay implies to me that LRU is not appropriate.) 47 * - The mechanism will satisfy the requirements of ordering Ops with seqid#s 48 * in them as well as minimizing the risk of redoing retried non-idempotent 49 * Ops. 50 * Because it is biased towards avoiding false hits, multiple entries with 51 * the same xid are to be expected, especially for the case of the entry 52 * in the cache being related to a seqid# sequenced Op. 53 * 54 * The basic algorithm I'm about to code up: 55 * - Null RPCs bypass the cache and are just done 56 * For TCP 57 * - key on <xid, NFS version> (as noted above, there can be several 58 * entries with the same key) 59 * When a request arrives: 60 * For all that match key 61 * - if RPC# != OR request_size != 62 * - not a match with this one 63 * - if NFSv4 and received on same TCP socket OR 64 * received on a TCP connection created before the 65 * entry was cached 66 * - not a match with this one 67 * (V2,3 clients might retry on same TCP socket) 68 * - calculate checksum on first N bytes of NFS XDR 69 * - if checksum != 70 * - not a match for this one 71 * If any of the remaining ones that match has a 72 * seqid_refcnt > 0 73 * - not a match (go do RPC, using new cache entry) 74 * If one match left 75 * - a hit (reply from cache) 76 * else 77 * - miss (go do RPC, using new cache entry) 78 * 79 * During processing of NFSv4 request: 80 * - set a flag when a non-idempotent Op is processed 81 * - when an Op that uses a seqid# (Open,...) is processed 82 * - if same seqid# as referenced entry in cache 83 * - free new cache entry 84 * - reply from referenced cache entry 85 * else if next seqid# in order 86 * - free referenced cache entry 87 * - increment seqid_refcnt on new cache entry 88 * - set pointer from Openowner/Lockowner to 89 * new cache entry (aka reference it) 90 * else if first seqid# in sequence 91 * - increment seqid_refcnt on new cache entry 92 * - set pointer from Openowner/Lockowner to 93 * new cache entry (aka reference it) 94 * 95 * At end of RPC processing: 96 * - if seqid_refcnt > 0 OR flagged non-idempotent on new 97 * cache entry 98 * - save reply in cache entry 99 * - calculate checksum on first N bytes of NFS XDR 100 * request 101 * - note op and length of XDR request (in bytes) 102 * - timestamp it 103 * else 104 * - free new cache entry 105 * - Send reply (noting info for socket activity check, below) 106 * 107 * For cache entries saved above: 108 * - if saved since seqid_refcnt was > 0 109 * - free when seqid_refcnt decrements to 0 110 * (when next one in sequence is processed above, or 111 * when Openowner/Lockowner is discarded) 112 * else { non-idempotent Op(s) } 113 * - free when 114 * - some further activity observed on same 115 * socket 116 * (I'm not yet sure how I'm going to do 117 * this. Maybe look at the TCP connection 118 * to see if the send_tcp_sequence# is well 119 * past sent reply OR K additional RPCs 120 * replied on same socket OR?) 121 * OR 122 * - when very old (hours, days, weeks?) 123 * 124 * For UDP (v2, 3 only), pretty much the old way: 125 * - key on <xid, NFS version, RPC#, Client host ip#> 126 * (at most one entry for each key) 127 * 128 * When a Request arrives: 129 * - if a match with entry via key 130 * - if RPC marked In_progress 131 * - discard request (don't send reply) 132 * else 133 * - reply from cache 134 * - timestamp cache entry 135 * else 136 * - add entry to cache, marked In_progress 137 * - do RPC 138 * - when RPC done 139 * - if RPC# non-idempotent 140 * - mark entry Done (not In_progress) 141 * - save reply 142 * - timestamp cache entry 143 * else 144 * - free cache entry 145 * - send reply 146 * 147 * Later, entries with saved replies are free'd a short time (few minutes) 148 * after reply sent (timestamp). 149 * Reference: Chet Juszczak, "Improving the Performance and Correctness 150 * of an NFS Server", in Proc. Winter 1989 USENIX Conference, 151 * pages 53-63. San Diego, February 1989. 152 * for the UDP case. 153 * nfsrc_floodlevel is set to the allowable upper limit for saved replies 154 * for TCP. For V3, a reply won't be saved when the flood level is 155 * hit. For V4, the non-idempotent Op will return NFSERR_RESOURCE in 156 * that case. This level should be set high enough that this almost 157 * never happens. 158 */ 159 #ifndef APPLEKEXT 160 #include <fs/nfs/nfsport.h> 161 162 extern struct nfsstats newnfsstats; 163 extern struct mtx nfsrc_udpmtx; 164 extern struct nfsrchash_bucket nfsrchash_table[NFSRVCACHE_HASHSIZE]; 165 extern struct nfsrchash_bucket nfsrcahash_table[NFSRVCACHE_HASHSIZE]; 166 int nfsrc_floodlevel = NFSRVCACHE_FLOODLEVEL, nfsrc_tcpsavedreplies = 0; 167 #endif /* !APPLEKEXT */ 168 169 SYSCTL_DECL(_vfs_nfsd); 170 171 static u_int nfsrc_tcphighwater = 0; 172 static int 173 sysctl_tcphighwater(SYSCTL_HANDLER_ARGS) 174 { 175 int error, newhighwater; 176 177 newhighwater = nfsrc_tcphighwater; 178 error = sysctl_handle_int(oidp, &newhighwater, 0, req); 179 if (error != 0 || req->newptr == NULL) 180 return (error); 181 if (newhighwater < 0) 182 return (EINVAL); 183 if (newhighwater >= nfsrc_floodlevel) 184 nfsrc_floodlevel = newhighwater + newhighwater / 5; 185 nfsrc_tcphighwater = newhighwater; 186 return (0); 187 } 188 SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater, CTLTYPE_UINT | CTLFLAG_RW, 0, 189 sizeof(nfsrc_tcphighwater), sysctl_tcphighwater, "IU", 190 "High water mark for TCP cache entries"); 191 192 static u_int nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER; 193 SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW, 194 &nfsrc_udphighwater, 0, 195 "High water mark for UDP cache entries"); 196 static u_int nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT; 197 SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW, 198 &nfsrc_tcptimeout, 0, 199 "Timeout for TCP entries in the DRC"); 200 static u_int nfsrc_tcpnonidempotent = 1; 201 SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW, 202 &nfsrc_tcpnonidempotent, 0, 203 "Enable the DRC for NFS over TCP"); 204 205 static int nfsrc_udpcachesize = 0; 206 static TAILQ_HEAD(, nfsrvcache) nfsrvudplru; 207 static struct nfsrvhashhead nfsrvudphashtbl[NFSRVCACHE_HASHSIZE]; 208 209 /* 210 * and the reverse mapping from generic to Version 2 procedure numbers 211 */ 212 static int newnfsv2_procid[NFS_V3NPROCS] = { 213 NFSV2PROC_NULL, 214 NFSV2PROC_GETATTR, 215 NFSV2PROC_SETATTR, 216 NFSV2PROC_LOOKUP, 217 NFSV2PROC_NOOP, 218 NFSV2PROC_READLINK, 219 NFSV2PROC_READ, 220 NFSV2PROC_WRITE, 221 NFSV2PROC_CREATE, 222 NFSV2PROC_MKDIR, 223 NFSV2PROC_SYMLINK, 224 NFSV2PROC_CREATE, 225 NFSV2PROC_REMOVE, 226 NFSV2PROC_RMDIR, 227 NFSV2PROC_RENAME, 228 NFSV2PROC_LINK, 229 NFSV2PROC_READDIR, 230 NFSV2PROC_NOOP, 231 NFSV2PROC_STATFS, 232 NFSV2PROC_NOOP, 233 NFSV2PROC_NOOP, 234 NFSV2PROC_NOOP, 235 }; 236 237 #define nfsrc_hash(xid) (((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE) 238 #define NFSRCUDPHASH(xid) \ 239 (&nfsrvudphashtbl[nfsrc_hash(xid)]) 240 #define NFSRCHASH(xid) \ 241 (&nfsrchash_table[nfsrc_hash(xid)].tbl) 242 #define NFSRCAHASH(xid) (&nfsrcahash_table[nfsrc_hash(xid)]) 243 #define TRUE 1 244 #define FALSE 0 245 #define NFSRVCACHE_CHECKLEN 100 246 247 /* True iff the rpc reply is an nfs status ONLY! */ 248 static int nfsv2_repstat[NFS_V3NPROCS] = { 249 FALSE, 250 FALSE, 251 FALSE, 252 FALSE, 253 FALSE, 254 FALSE, 255 FALSE, 256 FALSE, 257 FALSE, 258 FALSE, 259 TRUE, 260 TRUE, 261 TRUE, 262 TRUE, 263 FALSE, 264 TRUE, 265 FALSE, 266 FALSE, 267 FALSE, 268 FALSE, 269 FALSE, 270 FALSE, 271 }; 272 273 /* 274 * Will NFS want to work over IPv6 someday? 275 */ 276 #define NETFAMILY(rp) \ 277 (((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET) 278 279 /* local functions */ 280 static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp); 281 static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp); 282 static void nfsrc_lock(struct nfsrvcache *rp); 283 static void nfsrc_unlock(struct nfsrvcache *rp); 284 static void nfsrc_wanted(struct nfsrvcache *rp); 285 static void nfsrc_freecache(struct nfsrvcache *rp); 286 static int nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum); 287 static void nfsrc_marksametcpconn(u_int64_t); 288 289 /* 290 * Return the correct mutex for this cache entry. 291 */ 292 static __inline struct mtx * 293 nfsrc_cachemutex(struct nfsrvcache *rp) 294 { 295 296 if ((rp->rc_flag & RC_UDP) != 0) 297 return (&nfsrc_udpmtx); 298 return (&nfsrchash_table[nfsrc_hash(rp->rc_xid)].mtx); 299 } 300 301 /* 302 * Initialize the server request cache list 303 */ 304 APPLESTATIC void 305 nfsrvd_initcache(void) 306 { 307 int i; 308 static int inited = 0; 309 310 if (inited) 311 return; 312 inited = 1; 313 for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) { 314 LIST_INIT(&nfsrvudphashtbl[i]); 315 LIST_INIT(&nfsrchash_table[i].tbl); 316 LIST_INIT(&nfsrcahash_table[i].tbl); 317 } 318 TAILQ_INIT(&nfsrvudplru); 319 nfsrc_tcpsavedreplies = 0; 320 nfsrc_udpcachesize = 0; 321 newnfsstats.srvcache_tcppeak = 0; 322 newnfsstats.srvcache_size = 0; 323 } 324 325 /* 326 * Get a cache entry for this request. Basically just malloc a new one 327 * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest. 328 */ 329 APPLESTATIC int 330 nfsrvd_getcache(struct nfsrv_descript *nd) 331 { 332 struct nfsrvcache *newrp; 333 int ret; 334 335 if (nd->nd_procnum == NFSPROC_NULL) 336 panic("nfsd cache null"); 337 MALLOC(newrp, struct nfsrvcache *, sizeof (struct nfsrvcache), 338 M_NFSRVCACHE, M_WAITOK); 339 NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache)); 340 if (nd->nd_flag & ND_NFSV4) 341 newrp->rc_flag = RC_NFSV4; 342 else if (nd->nd_flag & ND_NFSV3) 343 newrp->rc_flag = RC_NFSV3; 344 else 345 newrp->rc_flag = RC_NFSV2; 346 newrp->rc_xid = nd->nd_retxid; 347 newrp->rc_proc = nd->nd_procnum; 348 newrp->rc_sockref = nd->nd_sockref; 349 newrp->rc_cachetime = nd->nd_tcpconntime; 350 if (nd->nd_flag & ND_SAMETCPCONN) 351 newrp->rc_flag |= RC_SAMETCPCONN; 352 if (nd->nd_nam2 != NULL) { 353 newrp->rc_flag |= RC_UDP; 354 ret = nfsrc_getudp(nd, newrp); 355 } else { 356 ret = nfsrc_gettcp(nd, newrp); 357 } 358 NFSEXITCODE2(0, nd); 359 return (ret); 360 } 361 362 /* 363 * For UDP (v2, v3): 364 * - key on <xid, NFS version, RPC#, Client host ip#> 365 * (at most one entry for each key) 366 */ 367 static int 368 nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp) 369 { 370 struct nfsrvcache *rp; 371 struct sockaddr_in *saddr; 372 struct sockaddr_in6 *saddr6; 373 struct nfsrvhashhead *hp; 374 int ret = 0; 375 struct mtx *mutex; 376 377 mutex = nfsrc_cachemutex(newrp); 378 hp = NFSRCUDPHASH(newrp->rc_xid); 379 loop: 380 mtx_lock(mutex); 381 LIST_FOREACH(rp, hp, rc_hash) { 382 if (newrp->rc_xid == rp->rc_xid && 383 newrp->rc_proc == rp->rc_proc && 384 (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) && 385 nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) { 386 if ((rp->rc_flag & RC_LOCKED) != 0) { 387 rp->rc_flag |= RC_WANTED; 388 (void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP, 389 "nfsrc", 10 * hz); 390 goto loop; 391 } 392 if (rp->rc_flag == 0) 393 panic("nfs udp cache0"); 394 rp->rc_flag |= RC_LOCKED; 395 TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru); 396 TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru); 397 if (rp->rc_flag & RC_INPROG) { 398 newnfsstats.srvcache_inproghits++; 399 mtx_unlock(mutex); 400 ret = RC_DROPIT; 401 } else if (rp->rc_flag & RC_REPSTATUS) { 402 /* 403 * V2 only. 404 */ 405 newnfsstats.srvcache_nonidemdonehits++; 406 mtx_unlock(mutex); 407 nfsrvd_rephead(nd); 408 *(nd->nd_errp) = rp->rc_status; 409 ret = RC_REPLY; 410 rp->rc_timestamp = NFSD_MONOSEC + 411 NFSRVCACHE_UDPTIMEOUT; 412 } else if (rp->rc_flag & RC_REPMBUF) { 413 newnfsstats.srvcache_nonidemdonehits++; 414 mtx_unlock(mutex); 415 nd->nd_mreq = m_copym(rp->rc_reply, 0, 416 M_COPYALL, M_WAITOK); 417 ret = RC_REPLY; 418 rp->rc_timestamp = NFSD_MONOSEC + 419 NFSRVCACHE_UDPTIMEOUT; 420 } else { 421 panic("nfs udp cache1"); 422 } 423 nfsrc_unlock(rp); 424 free((caddr_t)newrp, M_NFSRVCACHE); 425 goto out; 426 } 427 } 428 newnfsstats.srvcache_misses++; 429 atomic_add_int(&newnfsstats.srvcache_size, 1); 430 nfsrc_udpcachesize++; 431 432 newrp->rc_flag |= RC_INPROG; 433 saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *); 434 if (saddr->sin_family == AF_INET) 435 newrp->rc_inet = saddr->sin_addr.s_addr; 436 else if (saddr->sin_family == AF_INET6) { 437 saddr6 = (struct sockaddr_in6 *)saddr; 438 NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6, 439 sizeof (struct in6_addr)); 440 newrp->rc_flag |= RC_INETIPV6; 441 } 442 LIST_INSERT_HEAD(hp, newrp, rc_hash); 443 TAILQ_INSERT_TAIL(&nfsrvudplru, newrp, rc_lru); 444 mtx_unlock(mutex); 445 nd->nd_rp = newrp; 446 ret = RC_DOIT; 447 448 out: 449 NFSEXITCODE2(0, nd); 450 return (ret); 451 } 452 453 /* 454 * Update a request cache entry after the rpc has been done 455 */ 456 APPLESTATIC struct nfsrvcache * 457 nfsrvd_updatecache(struct nfsrv_descript *nd) 458 { 459 struct nfsrvcache *rp; 460 struct nfsrvcache *retrp = NULL; 461 mbuf_t m; 462 struct mtx *mutex; 463 464 rp = nd->nd_rp; 465 if (!rp) 466 panic("nfsrvd_updatecache null rp"); 467 nd->nd_rp = NULL; 468 mutex = nfsrc_cachemutex(rp); 469 mtx_lock(mutex); 470 nfsrc_lock(rp); 471 if (!(rp->rc_flag & RC_INPROG)) 472 panic("nfsrvd_updatecache not inprog"); 473 rp->rc_flag &= ~RC_INPROG; 474 if (rp->rc_flag & RC_UDP) { 475 TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru); 476 TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru); 477 } 478 479 /* 480 * Reply from cache is a special case returned by nfsrv_checkseqid(). 481 */ 482 if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) { 483 newnfsstats.srvcache_nonidemdonehits++; 484 mtx_unlock(mutex); 485 nd->nd_repstat = 0; 486 if (nd->nd_mreq) 487 mbuf_freem(nd->nd_mreq); 488 if (!(rp->rc_flag & RC_REPMBUF)) 489 panic("reply from cache"); 490 nd->nd_mreq = m_copym(rp->rc_reply, 0, 491 M_COPYALL, M_WAITOK); 492 rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout; 493 nfsrc_unlock(rp); 494 goto out; 495 } 496 497 /* 498 * If rc_refcnt > 0, save it 499 * For UDP, save it if ND_SAVEREPLY is set 500 * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set 501 */ 502 if (nd->nd_repstat != NFSERR_DONTREPLY && 503 (rp->rc_refcnt > 0 || 504 ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) || 505 ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) && 506 nfsrc_tcpsavedreplies <= nfsrc_floodlevel && 507 nfsrc_tcpnonidempotent))) { 508 if (rp->rc_refcnt > 0) { 509 if (!(rp->rc_flag & RC_NFSV4)) 510 panic("update_cache refcnt"); 511 rp->rc_flag |= RC_REFCNT; 512 } 513 if ((nd->nd_flag & ND_NFSV2) && 514 nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) { 515 rp->rc_status = nd->nd_repstat; 516 rp->rc_flag |= RC_REPSTATUS; 517 mtx_unlock(mutex); 518 } else { 519 if (!(rp->rc_flag & RC_UDP)) { 520 atomic_add_int(&nfsrc_tcpsavedreplies, 1); 521 if (nfsrc_tcpsavedreplies > 522 newnfsstats.srvcache_tcppeak) 523 newnfsstats.srvcache_tcppeak = 524 nfsrc_tcpsavedreplies; 525 } 526 mtx_unlock(mutex); 527 m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK); 528 mtx_lock(mutex); 529 rp->rc_reply = m; 530 rp->rc_flag |= RC_REPMBUF; 531 mtx_unlock(mutex); 532 } 533 if (rp->rc_flag & RC_UDP) { 534 rp->rc_timestamp = NFSD_MONOSEC + 535 NFSRVCACHE_UDPTIMEOUT; 536 nfsrc_unlock(rp); 537 } else { 538 rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout; 539 if (rp->rc_refcnt > 0) 540 nfsrc_unlock(rp); 541 else 542 retrp = rp; 543 } 544 } else { 545 nfsrc_freecache(rp); 546 mtx_unlock(mutex); 547 } 548 549 out: 550 NFSEXITCODE2(0, nd); 551 return (retrp); 552 } 553 554 /* 555 * Invalidate and, if possible, free an in prog cache entry. 556 * Must not sleep. 557 */ 558 APPLESTATIC void 559 nfsrvd_delcache(struct nfsrvcache *rp) 560 { 561 struct mtx *mutex; 562 563 mutex = nfsrc_cachemutex(rp); 564 if (!(rp->rc_flag & RC_INPROG)) 565 panic("nfsrvd_delcache not in prog"); 566 mtx_lock(mutex); 567 rp->rc_flag &= ~RC_INPROG; 568 if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED)) 569 nfsrc_freecache(rp); 570 mtx_unlock(mutex); 571 } 572 573 /* 574 * Called after nfsrvd_updatecache() once the reply is sent, to update 575 * the entry's sequence number and unlock it. The argument is 576 * the pointer returned by nfsrvd_updatecache(). 577 */ 578 APPLESTATIC void 579 nfsrvd_sentcache(struct nfsrvcache *rp, uint32_t seq) 580 { 581 struct nfsrchash_bucket *hbp; 582 583 KASSERT(rp->rc_flag & RC_LOCKED, ("nfsrvd_sentcache not locked")); 584 hbp = NFSRCAHASH(rp->rc_sockref); 585 mtx_lock(&hbp->mtx); 586 rp->rc_tcpseq = seq; 587 if (rp->rc_acked != RC_NO_ACK) 588 LIST_INSERT_HEAD(&hbp->tbl, rp, rc_ahash); 589 rp->rc_acked = RC_NO_ACK; 590 mtx_unlock(&hbp->mtx); 591 nfsrc_unlock(rp); 592 } 593 594 /* 595 * Get a cache entry for TCP 596 * - key on <xid, nfs version> 597 * (allow multiple entries for a given key) 598 */ 599 static int 600 nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp) 601 { 602 struct nfsrvcache *rp, *nextrp; 603 int i; 604 struct nfsrvcache *hitrp; 605 struct nfsrvhashhead *hp, nfsrc_templist; 606 int hit, ret = 0; 607 struct mtx *mutex; 608 609 mutex = nfsrc_cachemutex(newrp); 610 hp = NFSRCHASH(newrp->rc_xid); 611 newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum); 612 tryagain: 613 mtx_lock(mutex); 614 hit = 1; 615 LIST_INIT(&nfsrc_templist); 616 /* 617 * Get all the matches and put them on the temp list. 618 */ 619 rp = LIST_FIRST(hp); 620 while (rp != LIST_END(hp)) { 621 nextrp = LIST_NEXT(rp, rc_hash); 622 if (newrp->rc_xid == rp->rc_xid && 623 (!(rp->rc_flag & RC_INPROG) || 624 ((newrp->rc_flag & RC_SAMETCPCONN) && 625 newrp->rc_sockref == rp->rc_sockref)) && 626 (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) && 627 newrp->rc_proc == rp->rc_proc && 628 ((newrp->rc_flag & RC_NFSV4) && 629 newrp->rc_sockref != rp->rc_sockref && 630 newrp->rc_cachetime >= rp->rc_cachetime) 631 && newrp->rc_reqlen == rp->rc_reqlen && 632 newrp->rc_cksum == rp->rc_cksum) { 633 LIST_REMOVE(rp, rc_hash); 634 LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash); 635 } 636 rp = nextrp; 637 } 638 639 /* 640 * Now, use nfsrc_templist to decide if there is a match. 641 */ 642 i = 0; 643 LIST_FOREACH(rp, &nfsrc_templist, rc_hash) { 644 i++; 645 if (rp->rc_refcnt > 0) { 646 hit = 0; 647 break; 648 } 649 } 650 /* 651 * Can be a hit only if one entry left. 652 * Note possible hit entry and put nfsrc_templist back on hash 653 * list. 654 */ 655 if (i != 1) 656 hit = 0; 657 hitrp = rp = LIST_FIRST(&nfsrc_templist); 658 while (rp != LIST_END(&nfsrc_templist)) { 659 nextrp = LIST_NEXT(rp, rc_hash); 660 LIST_REMOVE(rp, rc_hash); 661 LIST_INSERT_HEAD(hp, rp, rc_hash); 662 rp = nextrp; 663 } 664 if (LIST_FIRST(&nfsrc_templist) != LIST_END(&nfsrc_templist)) 665 panic("nfs gettcp cache templist"); 666 667 if (hit) { 668 rp = hitrp; 669 if ((rp->rc_flag & RC_LOCKED) != 0) { 670 rp->rc_flag |= RC_WANTED; 671 (void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP, 672 "nfsrc", 10 * hz); 673 goto tryagain; 674 } 675 if (rp->rc_flag == 0) 676 panic("nfs tcp cache0"); 677 rp->rc_flag |= RC_LOCKED; 678 if (rp->rc_flag & RC_INPROG) { 679 newnfsstats.srvcache_inproghits++; 680 mtx_unlock(mutex); 681 if (newrp->rc_sockref == rp->rc_sockref) 682 nfsrc_marksametcpconn(rp->rc_sockref); 683 ret = RC_DROPIT; 684 } else if (rp->rc_flag & RC_REPSTATUS) { 685 /* 686 * V2 only. 687 */ 688 newnfsstats.srvcache_nonidemdonehits++; 689 mtx_unlock(mutex); 690 if (newrp->rc_sockref == rp->rc_sockref) 691 nfsrc_marksametcpconn(rp->rc_sockref); 692 ret = RC_REPLY; 693 nfsrvd_rephead(nd); 694 *(nd->nd_errp) = rp->rc_status; 695 rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout; 696 } else if (rp->rc_flag & RC_REPMBUF) { 697 newnfsstats.srvcache_nonidemdonehits++; 698 mtx_unlock(mutex); 699 if (newrp->rc_sockref == rp->rc_sockref) 700 nfsrc_marksametcpconn(rp->rc_sockref); 701 ret = RC_REPLY; 702 nd->nd_mreq = m_copym(rp->rc_reply, 0, 703 M_COPYALL, M_WAITOK); 704 rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout; 705 } else { 706 panic("nfs tcp cache1"); 707 } 708 nfsrc_unlock(rp); 709 free((caddr_t)newrp, M_NFSRVCACHE); 710 goto out; 711 } 712 newnfsstats.srvcache_misses++; 713 atomic_add_int(&newnfsstats.srvcache_size, 1); 714 715 /* 716 * For TCP, multiple entries for a key are allowed, so don't 717 * chain it into the hash table until done. 718 */ 719 newrp->rc_cachetime = NFSD_MONOSEC; 720 newrp->rc_flag |= RC_INPROG; 721 LIST_INSERT_HEAD(hp, newrp, rc_hash); 722 mtx_unlock(mutex); 723 nd->nd_rp = newrp; 724 ret = RC_DOIT; 725 726 out: 727 NFSEXITCODE2(0, nd); 728 return (ret); 729 } 730 731 /* 732 * Lock a cache entry. 733 */ 734 static void 735 nfsrc_lock(struct nfsrvcache *rp) 736 { 737 struct mtx *mutex; 738 739 mutex = nfsrc_cachemutex(rp); 740 mtx_assert(mutex, MA_OWNED); 741 while ((rp->rc_flag & RC_LOCKED) != 0) { 742 rp->rc_flag |= RC_WANTED; 743 (void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0); 744 } 745 rp->rc_flag |= RC_LOCKED; 746 } 747 748 /* 749 * Unlock a cache entry. 750 */ 751 static void 752 nfsrc_unlock(struct nfsrvcache *rp) 753 { 754 struct mtx *mutex; 755 756 mutex = nfsrc_cachemutex(rp); 757 mtx_lock(mutex); 758 rp->rc_flag &= ~RC_LOCKED; 759 nfsrc_wanted(rp); 760 mtx_unlock(mutex); 761 } 762 763 /* 764 * Wakeup anyone wanting entry. 765 */ 766 static void 767 nfsrc_wanted(struct nfsrvcache *rp) 768 { 769 if (rp->rc_flag & RC_WANTED) { 770 rp->rc_flag &= ~RC_WANTED; 771 wakeup((caddr_t)rp); 772 } 773 } 774 775 /* 776 * Free up the entry. 777 * Must not sleep. 778 */ 779 static void 780 nfsrc_freecache(struct nfsrvcache *rp) 781 { 782 struct nfsrchash_bucket *hbp; 783 784 LIST_REMOVE(rp, rc_hash); 785 if (rp->rc_flag & RC_UDP) { 786 TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru); 787 nfsrc_udpcachesize--; 788 } else if (rp->rc_acked != RC_NO_SEQ) { 789 hbp = NFSRCAHASH(rp->rc_sockref); 790 mtx_lock(&hbp->mtx); 791 if (rp->rc_acked == RC_NO_ACK) 792 LIST_REMOVE(rp, rc_ahash); 793 mtx_unlock(&hbp->mtx); 794 } 795 nfsrc_wanted(rp); 796 if (rp->rc_flag & RC_REPMBUF) { 797 mbuf_freem(rp->rc_reply); 798 if (!(rp->rc_flag & RC_UDP)) 799 atomic_add_int(&nfsrc_tcpsavedreplies, -1); 800 } 801 FREE((caddr_t)rp, M_NFSRVCACHE); 802 atomic_add_int(&newnfsstats.srvcache_size, -1); 803 } 804 805 /* 806 * Clean out the cache. Called when nfsserver module is unloaded. 807 */ 808 APPLESTATIC void 809 nfsrvd_cleancache(void) 810 { 811 struct nfsrvcache *rp, *nextrp; 812 int i; 813 814 for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) { 815 mtx_lock(&nfsrchash_table[i].mtx); 816 LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash, nextrp) 817 nfsrc_freecache(rp); 818 mtx_unlock(&nfsrchash_table[i].mtx); 819 } 820 mtx_lock(&nfsrc_udpmtx); 821 for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) { 822 LIST_FOREACH_SAFE(rp, &nfsrvudphashtbl[i], rc_hash, nextrp) { 823 nfsrc_freecache(rp); 824 } 825 } 826 newnfsstats.srvcache_size = 0; 827 mtx_unlock(&nfsrc_udpmtx); 828 nfsrc_tcpsavedreplies = 0; 829 } 830 831 #define HISTSIZE 16 832 /* 833 * The basic rule is to get rid of entries that are expired. 834 */ 835 void 836 nfsrc_trimcache(u_int64_t sockref, uint32_t snd_una, int final) 837 { 838 struct nfsrchash_bucket *hbp; 839 struct nfsrvcache *rp, *nextrp; 840 int force, lastslot, i, j, k, tto, time_histo[HISTSIZE]; 841 time_t thisstamp; 842 static time_t udp_lasttrim = 0, tcp_lasttrim = 0; 843 static int onethread = 0, oneslot = 0; 844 845 if (sockref != 0) { 846 hbp = NFSRCAHASH(sockref); 847 mtx_lock(&hbp->mtx); 848 LIST_FOREACH_SAFE(rp, &hbp->tbl, rc_ahash, nextrp) { 849 if (sockref == rp->rc_sockref) { 850 if (SEQ_GEQ(snd_una, rp->rc_tcpseq)) { 851 rp->rc_acked = RC_ACK; 852 LIST_REMOVE(rp, rc_ahash); 853 } else if (final) { 854 rp->rc_acked = RC_NACK; 855 LIST_REMOVE(rp, rc_ahash); 856 } 857 } 858 } 859 mtx_unlock(&hbp->mtx); 860 } 861 862 if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0) 863 return; 864 if (NFSD_MONOSEC != udp_lasttrim || 865 nfsrc_udpcachesize >= (nfsrc_udphighwater + 866 nfsrc_udphighwater / 2)) { 867 mtx_lock(&nfsrc_udpmtx); 868 udp_lasttrim = NFSD_MONOSEC; 869 TAILQ_FOREACH_SAFE(rp, &nfsrvudplru, rc_lru, nextrp) { 870 if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED)) 871 && rp->rc_refcnt == 0 872 && ((rp->rc_flag & RC_REFCNT) || 873 udp_lasttrim > rp->rc_timestamp || 874 nfsrc_udpcachesize > nfsrc_udphighwater)) 875 nfsrc_freecache(rp); 876 } 877 mtx_unlock(&nfsrc_udpmtx); 878 } 879 if (NFSD_MONOSEC != tcp_lasttrim || 880 nfsrc_tcpsavedreplies >= nfsrc_tcphighwater) { 881 force = nfsrc_tcphighwater / 4; 882 if (force > 0 && 883 nfsrc_tcpsavedreplies + force >= nfsrc_tcphighwater) { 884 for (i = 0; i < HISTSIZE; i++) 885 time_histo[i] = 0; 886 i = 0; 887 lastslot = NFSRVCACHE_HASHSIZE - 1; 888 } else { 889 force = 0; 890 if (NFSD_MONOSEC != tcp_lasttrim) { 891 i = 0; 892 lastslot = NFSRVCACHE_HASHSIZE - 1; 893 } else { 894 lastslot = i = oneslot; 895 if (++oneslot >= NFSRVCACHE_HASHSIZE) 896 oneslot = 0; 897 } 898 } 899 tto = nfsrc_tcptimeout; 900 tcp_lasttrim = NFSD_MONOSEC; 901 for (; i <= lastslot; i++) { 902 mtx_lock(&nfsrchash_table[i].mtx); 903 LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash, 904 nextrp) { 905 if (!(rp->rc_flag & 906 (RC_INPROG|RC_LOCKED|RC_WANTED)) 907 && rp->rc_refcnt == 0) { 908 if ((rp->rc_flag & RC_REFCNT) || 909 tcp_lasttrim > rp->rc_timestamp || 910 rp->rc_acked == RC_ACK) { 911 nfsrc_freecache(rp); 912 continue; 913 } 914 915 if (force == 0) 916 continue; 917 /* 918 * The timestamps range from roughly the 919 * present (tcp_lasttrim) to the present 920 * + nfsrc_tcptimeout. Generate a simple 921 * histogram of where the timeouts fall. 922 */ 923 j = rp->rc_timestamp - tcp_lasttrim; 924 if (j >= tto) 925 j = HISTSIZE - 1; 926 else if (j < 0) 927 j = 0; 928 else 929 j = j * HISTSIZE / tto; 930 time_histo[j]++; 931 } 932 } 933 mtx_unlock(&nfsrchash_table[i].mtx); 934 } 935 if (force) { 936 /* 937 * Trim some more with a smaller timeout of as little 938 * as 20% of nfsrc_tcptimeout to try and get below 939 * 80% of the nfsrc_tcphighwater. 940 */ 941 k = 0; 942 for (i = 0; i < (HISTSIZE - 2); i++) { 943 k += time_histo[i]; 944 if (k > force) 945 break; 946 } 947 k = tto * (i + 1) / HISTSIZE; 948 if (k < 1) 949 k = 1; 950 thisstamp = tcp_lasttrim + k; 951 for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) { 952 mtx_lock(&nfsrchash_table[i].mtx); 953 LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, 954 rc_hash, nextrp) { 955 if (!(rp->rc_flag & 956 (RC_INPROG|RC_LOCKED|RC_WANTED)) 957 && rp->rc_refcnt == 0 958 && ((rp->rc_flag & RC_REFCNT) || 959 thisstamp > rp->rc_timestamp || 960 rp->rc_acked == RC_ACK)) 961 nfsrc_freecache(rp); 962 } 963 mtx_unlock(&nfsrchash_table[i].mtx); 964 } 965 } 966 } 967 atomic_store_rel_int(&onethread, 0); 968 } 969 970 /* 971 * Add a seqid# reference to the cache entry. 972 */ 973 APPLESTATIC void 974 nfsrvd_refcache(struct nfsrvcache *rp) 975 { 976 struct mtx *mutex; 977 978 mutex = nfsrc_cachemutex(rp); 979 mtx_lock(mutex); 980 if (rp->rc_refcnt < 0) 981 panic("nfs cache refcnt"); 982 rp->rc_refcnt++; 983 mtx_unlock(mutex); 984 } 985 986 /* 987 * Dereference a seqid# cache entry. 988 */ 989 APPLESTATIC void 990 nfsrvd_derefcache(struct nfsrvcache *rp) 991 { 992 struct mtx *mutex; 993 994 mutex = nfsrc_cachemutex(rp); 995 mtx_lock(mutex); 996 if (rp->rc_refcnt <= 0) 997 panic("nfs cache derefcnt"); 998 rp->rc_refcnt--; 999 if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG))) 1000 nfsrc_freecache(rp); 1001 mtx_unlock(mutex); 1002 } 1003 1004 /* 1005 * Calculate the length of the mbuf list and a checksum on the first up to 1006 * NFSRVCACHE_CHECKLEN bytes. 1007 */ 1008 static int 1009 nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum) 1010 { 1011 int len = 0, cklen; 1012 mbuf_t m; 1013 1014 m = m1; 1015 while (m) { 1016 len += mbuf_len(m); 1017 m = mbuf_next(m); 1018 } 1019 cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len; 1020 *cksum = in_cksum(m1, cklen); 1021 return (len); 1022 } 1023 1024 /* 1025 * Mark a TCP connection that is seeing retries. Should never happen for 1026 * NFSv4. 1027 */ 1028 static void 1029 nfsrc_marksametcpconn(u_int64_t sockref) 1030 { 1031 } 1032 1033