/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
/*
 * Here is the basic algorithm:
 * First, some design criteria I used:
 * - I think a false hit is more serious than a false miss
 * - A false hit for an RPC that has Op(s) that order via seqid# must be
 *   avoided at all cost
 * - A valid hit will probably happen a long time after the original reply
 *   and the TCP socket that the original request was received on will no
 *   longer be active
 *   (The long time delay implies to me that LRU is not appropriate.)
 * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
 *   in them as well as minimizing the risk of redoing retried non-idempotent
 *   Ops.
 * Because it is biased towards avoiding false hits, multiple entries with
 * the same xid are to be expected, especially for the case of the entry
 * in the cache being related to a seqid# sequenced Op.
 *
 * The basic algorithm I'm about to code up:
 * - Null RPCs bypass the cache and are just done
 * For TCP
 * - key on <xid, NFS version> (as noted above, there can be several
 *   entries with the same key)
 *   When a request arrives:
 *   For all that match key
 *   - if RPC# != OR request_size !=
 *     - not a match with this one
 *   - if NFSv4 and received on same TCP socket OR
 *     received on a TCP connection created before the
 *     entry was cached
 *     - not a match with this one
 *     (V2,3 clients might retry on same TCP socket)
 *   - calculate checksum on first N bytes of NFS XDR
 *   - if checksum !=
 *     - not a match for this one
 *   If any of the remaining ones that match has a
 *     seqid_refcnt > 0
 *     - not a match (go do RPC, using new cache entry)
 *   If one match left
 *     - a hit (reply from cache)
 *   else
 *     - miss (go do RPC, using new cache entry)
 *
 * During processing of NFSv4 request:
 * - set a flag when a non-idempotent Op is processed
 * - when an Op that uses a seqid# (Open,...) is processed
 *   - if same seqid# as referenced entry in cache
 *     - free new cache entry
 *     - reply from referenced cache entry
 *   else if next seqid# in order
 *     - free referenced cache entry
 *     - increment seqid_refcnt on new cache entry
 *     - set pointer from Openowner/Lockowner to
 *       new cache entry (aka reference it)
 *   else if first seqid# in sequence
 *     - increment seqid_refcnt on new cache entry
 *     - set pointer from Openowner/Lockowner to
 *       new cache entry (aka reference it)
 *
 * At end of RPC processing:
 * - if seqid_refcnt > 0 OR flagged non-idempotent on new
 *   cache entry
 *   - save reply in cache entry
 *   - calculate checksum on first N bytes of NFS XDR
 *     request
 *   - note op and length of XDR request (in bytes)
 *   - timestamp it
 *   else
 *   - free new cache entry
 * - Send reply (noting info for socket activity check, below)
 *
 * For cache entries saved above:
 * - if saved since seqid_refcnt was > 0
 *   - free when seqid_refcnt decrements to 0
 *     (when next one in sequence is processed above, or
 *      when Openowner/Lockowner is discarded)
 *   else { non-idempotent Op(s) }
 *   - free when
 *     - some further activity observed on same
 *       socket
 *       (I'm not yet sure how I'm going to do
 *        this. Maybe look at the TCP connection
 *        to see if the send_tcp_sequence# is well
 *        past sent reply OR K additional RPCs
 *        replied on same socket OR?)
 *       OR
 *     - when very old (hours, days, weeks?)
 *
 * For UDP (v2, 3 only), pretty much the old way:
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 *
 * When a Request arrives:
 * - if a match with entry via key
 *   - if RPC marked In_progress
 *     - discard request (don't send reply)
 *   else
 *     - reply from cache
 *     - timestamp cache entry
 * else
 *   - add entry to cache, marked In_progress
 *   - do RPC
 *   - when RPC done
 *     - if RPC# non-idempotent
 *       - mark entry Done (not In_progress)
 *       - save reply
 *       - timestamp cache entry
 *     else
 *       - free cache entry
 *     - send reply
 *
 * Later, entries with saved replies are free'd a short time (few minutes)
 * after reply sent (timestamp).
 * Reference: Chet Juszczak, "Improving the Performance and Correctness
 *	of an NFS Server", in Proc. Winter 1989 USENIX Conference,
 *	pages 53-63. San Diego, February 1989.
 *	for the UDP case.
 * nfsrc_floodlevel is set to the allowable upper limit for saved replies
 * for TCP.  For V3, a reply won't be saved when the flood level is
 * hit.  For V4, the non-idempotent Op will return NFSERR_RESOURCE in
 * that case.  This level should be set high enough that this almost
 * never happens.
 */
#include <fs/nfs/nfsport.h>

extern struct mtx nfsrc_udpmtx;

NFSD_VNET_DECLARE(struct nfsrvhashhead *, nfsrvudphashtbl);
NFSD_VNET_DECLARE(struct nfsrchash_bucket *, nfsrchash_table);
NFSD_VNET_DECLARE(struct nfsrchash_bucket *, nfsrcahash_table);
NFSD_VNET_DECLARE(struct nfsstatsv1 *, nfsstatsv1_p);

NFSD_VNET_DEFINE(int, nfsrc_floodlevel) = NFSRVCACHE_FLOODLEVEL;
NFSD_VNET_DEFINE(int, nfsrc_tcpsavedreplies) = 0;

SYSCTL_DECL(_vfs_nfsd);

static u_int	nfsrc_tcphighwater = 0;
static int
sysctl_tcphighwater(SYSCTL_HANDLER_ARGS)
{
	int error, newhighwater;

	newhighwater = nfsrc_tcphighwater;
	error = sysctl_handle_int(oidp, &newhighwater, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (newhighwater < 0)
		return (EINVAL);
	if (newhighwater >= NFSD_VNET(nfsrc_floodlevel))
		NFSD_VNET(nfsrc_floodlevel) = newhighwater + newhighwater / 5;
	nfsrc_tcphighwater = newhighwater;
	return (0);
}
SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater,
    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(nfsrc_tcphighwater),
    sysctl_tcphighwater, "IU", "High water mark for TCP cache entries");

static u_int	nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
    &nfsrc_udphighwater, 0,
    "High water mark for UDP cache entries");
static u_int	nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
    &nfsrc_tcptimeout, 0,
    "Timeout for TCP entries in the DRC");
static u_int nfsrc_tcpnonidempotent = 1;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
    &nfsrc_tcpnonidempotent, 0,
    "Enable the DRC for NFS over TCP");

NFSD_VNET_DEFINE_STATIC(int, nfsrc_udpcachesize) = 0;
NFSD_VNET_DEFINE_STATIC(TAILQ_HEAD(, nfsrvcache), nfsrvudplru);

/*
 * The reverse mapping from generic to Version 2 procedure numbers.
 */
static int newnfsv2_procid[NFS_V3NPROCS] = {
	NFSV2PROC_NULL,
	NFSV2PROC_GETATTR,
	NFSV2PROC_SETATTR,
	NFSV2PROC_LOOKUP,
	NFSV2PROC_NOOP,
	NFSV2PROC_READLINK,
	NFSV2PROC_READ,
	NFSV2PROC_WRITE,
	NFSV2PROC_CREATE,
	NFSV2PROC_MKDIR,
	NFSV2PROC_SYMLINK,
	NFSV2PROC_CREATE,
	NFSV2PROC_REMOVE,
	NFSV2PROC_RMDIR,
	NFSV2PROC_RENAME,
	NFSV2PROC_LINK,
	NFSV2PROC_READDIR,
	NFSV2PROC_NOOP,
	NFSV2PROC_STATFS,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
};

#define	nfsrc_hash(xid)	(((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
#define	NFSRCUDPHASH(xid) \
	(&NFSD_VNET(nfsrvudphashtbl)[nfsrc_hash(xid)])
#define	NFSRCHASH(xid) \
	(&NFSD_VNET(nfsrchash_table)[nfsrc_hash(xid)].tbl)
#define	NFSRCAHASH(xid) (&NFSD_VNET(nfsrcahash_table)[nfsrc_hash(xid)])
#define	TRUE	1
#define	FALSE	0
#define	NFSRVCACHE_CHECKLEN	100
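
/*
 * Illustrative sketch (excluded from compilation): how a lookup key for
 * the TCP half of the DRC is derived, i.e. a hash bucket taken from the
 * xid plus a checksum over the first NFSRVCACHE_CHECKLEN bytes of the
 * XDR request, mirroring the nfsrc_hash() macro above and
 * nfsrc_getlenandcksum() below.  This is a hedged userspace
 * approximation: the "ex_" and "EX_" names are local to the sketch and
 * the flat-buffer ones-complement sum merely stands in for the kernel's
 * in_cksum() over an mbuf chain.
 */
#if 0
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

#define	EX_HASHSIZE	500	/* stand-in for NFSRVCACHE_HASHSIZE */
#define	EX_CHECKLEN	100	/* mirrors NFSRVCACHE_CHECKLEN */

/* Same arithmetic as the nfsrc_hash() macro. */
static unsigned int
ex_hash(uint32_t xid)
{

	return ((xid + (xid >> 24)) % EX_HASHSIZE);
}

/* 16 bit ones-complement sum over a flat buffer (in_cksum() stand-in). */
static uint16_t
ex_cksum(const unsigned char *buf, size_t len)
{
	uint32_t sum = 0;
	size_t i;

	for (i = 0; i + 1 < len; i += 2)
		sum += ((uint32_t)buf[i] << 8) | buf[i + 1];
	if (len & 1)
		sum += (uint32_t)buf[len - 1] << 8;
	while ((sum >> 16) != 0)
		sum = (sum & 0xffff) + (sum >> 16);
	return ((uint16_t)~sum);
}

int
main(void)
{
	unsigned char req[256] = { 0x01, 0x02, 0x03 };	/* fake XDR request */
	uint32_t xid = 0x12345678;
	size_t cklen = sizeof(req) > EX_CHECKLEN ? EX_CHECKLEN : sizeof(req);

	/* Bucket plus <length, checksum> is what nfsrc_gettcp() matches on. */
	printf("bucket %u len %zu cksum 0x%04x\n", ex_hash(xid),
	    sizeof(req), (unsigned)ex_cksum(req, cklen));
	return (0);
}
#endif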

/* True iff the rpc reply is an nfs status ONLY! */
static int nfsv2_repstat[NFS_V3NPROCS] = {
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	TRUE,
	TRUE,
	TRUE,
	TRUE,
	FALSE,
	TRUE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
};

/*
 * Will NFS want to work over IPv6 someday?
 */
#define	NETFAMILY(rp) \
	(((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)

/* local functions */
static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static void nfsrc_lock(struct nfsrvcache *rp);
static void nfsrc_unlock(struct nfsrvcache *rp);
static void nfsrc_wanted(struct nfsrvcache *rp);
static void nfsrc_freecache(struct nfsrvcache *rp);
static int nfsrc_getlenandcksum(struct mbuf *m1, u_int16_t *cksum);
static void nfsrc_marksametcpconn(u_int64_t);

/*
 * Return the correct mutex for this cache entry.
 */
static __inline struct mtx *
nfsrc_cachemutex(struct nfsrvcache *rp)
{

	if ((rp->rc_flag & RC_UDP) != 0)
		return (&nfsrc_udpmtx);
	return (&NFSD_VNET(nfsrchash_table)[nfsrc_hash(rp->rc_xid)].mtx);
}

/*
 * Initialize the server request cache list
 */
void
nfsrvd_initcache(void)
{
	int i;

	NFSD_VNET(nfsrvudphashtbl) = malloc(sizeof(struct nfsrvhashhead) *
	    NFSRVCACHE_HASHSIZE, M_NFSRVCACHE, M_WAITOK | M_ZERO);
	NFSD_VNET(nfsrchash_table) = malloc(sizeof(struct nfsrchash_bucket) *
	    NFSRVCACHE_HASHSIZE, M_NFSRVCACHE, M_WAITOK | M_ZERO);
	NFSD_VNET(nfsrcahash_table) = malloc(sizeof(struct nfsrchash_bucket) *
	    NFSRVCACHE_HASHSIZE, M_NFSRVCACHE, M_WAITOK | M_ZERO);
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		mtx_init(&NFSD_VNET(nfsrchash_table)[i].mtx, "nfsrtc", NULL,
		    MTX_DEF);
		mtx_init(&NFSD_VNET(nfsrcahash_table)[i].mtx, "nfsrtca", NULL,
		    MTX_DEF);
	}
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_INIT(&NFSD_VNET(nfsrvudphashtbl)[i]);
		LIST_INIT(&NFSD_VNET(nfsrchash_table)[i].tbl);
		LIST_INIT(&NFSD_VNET(nfsrcahash_table)[i].tbl);
	}
	TAILQ_INIT(&NFSD_VNET(nfsrvudplru));
	NFSD_VNET(nfsrc_tcpsavedreplies) = 0;
	NFSD_VNET(nfsrc_udpcachesize) = 0;
}

/*
 * Get a cache entry for this request.  Basically just malloc a new one
 * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
335 */ 336 int 337 nfsrvd_getcache(struct nfsrv_descript *nd) 338 { 339 struct nfsrvcache *newrp; 340 int ret; 341 342 if (nd->nd_procnum == NFSPROC_NULL) 343 panic("nfsd cache null"); 344 newrp = malloc(sizeof (struct nfsrvcache), 345 M_NFSRVCACHE, M_WAITOK); 346 NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache)); 347 if (nd->nd_flag & ND_NFSV4) 348 newrp->rc_flag = RC_NFSV4; 349 else if (nd->nd_flag & ND_NFSV3) 350 newrp->rc_flag = RC_NFSV3; 351 else 352 newrp->rc_flag = RC_NFSV2; 353 newrp->rc_xid = nd->nd_retxid; 354 newrp->rc_proc = nd->nd_procnum; 355 newrp->rc_sockref = nd->nd_sockref; 356 newrp->rc_cachetime = nd->nd_tcpconntime; 357 if (nd->nd_flag & ND_SAMETCPCONN) 358 newrp->rc_flag |= RC_SAMETCPCONN; 359 if (nd->nd_nam2 != NULL) { 360 newrp->rc_flag |= RC_UDP; 361 ret = nfsrc_getudp(nd, newrp); 362 } else { 363 ret = nfsrc_gettcp(nd, newrp); 364 } 365 NFSEXITCODE2(0, nd); 366 return (ret); 367 } 368 369 /* 370 * For UDP (v2, v3): 371 * - key on <xid, NFS version, RPC#, Client host ip#> 372 * (at most one entry for each key) 373 */ 374 static int 375 nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp) 376 { 377 struct nfsrvcache *rp; 378 struct sockaddr_in *saddr; 379 struct sockaddr_in6 *saddr6; 380 struct nfsrvhashhead *hp; 381 int ret = 0; 382 struct mtx *mutex; 383 384 mutex = nfsrc_cachemutex(newrp); 385 hp = NFSRCUDPHASH(newrp->rc_xid); 386 loop: 387 mtx_lock(mutex); 388 LIST_FOREACH(rp, hp, rc_hash) { 389 if (newrp->rc_xid == rp->rc_xid && 390 newrp->rc_proc == rp->rc_proc && 391 (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) && 392 nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) { 393 if ((rp->rc_flag & RC_LOCKED) != 0) { 394 rp->rc_flag |= RC_WANTED; 395 (void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP, 396 "nfsrc", 10 * hz); 397 goto loop; 398 } 399 if (rp->rc_flag == 0) 400 panic("nfs udp cache0"); 401 rp->rc_flag |= RC_LOCKED; 402 TAILQ_REMOVE(&NFSD_VNET(nfsrvudplru), rp, rc_lru); 403 TAILQ_INSERT_TAIL(&NFSD_VNET(nfsrvudplru), rp, rc_lru); 404 if (rp->rc_flag & RC_INPROG) { 405 NFSD_VNET(nfsstatsv1_p)->srvcache_inproghits++; 406 mtx_unlock(mutex); 407 ret = RC_DROPIT; 408 } else if (rp->rc_flag & RC_REPSTATUS) { 409 /* 410 * V2 only. 
411 */ 412 NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++; 413 mtx_unlock(mutex); 414 nfsrvd_rephead(nd); 415 *(nd->nd_errp) = rp->rc_status; 416 ret = RC_REPLY; 417 rp->rc_timestamp = NFSD_MONOSEC + 418 NFSRVCACHE_UDPTIMEOUT; 419 } else if (rp->rc_flag & RC_REPMBUF) { 420 NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++; 421 mtx_unlock(mutex); 422 nd->nd_mreq = m_copym(rp->rc_reply, 0, 423 M_COPYALL, M_WAITOK); 424 ret = RC_REPLY; 425 rp->rc_timestamp = NFSD_MONOSEC + 426 NFSRVCACHE_UDPTIMEOUT; 427 } else { 428 panic("nfs udp cache1"); 429 } 430 nfsrc_unlock(rp); 431 free(newrp, M_NFSRVCACHE); 432 goto out; 433 } 434 } 435 NFSD_VNET(nfsstatsv1_p)->srvcache_misses++; 436 atomic_add_int(&NFSD_VNET(nfsstatsv1_p)->srvcache_size, 1); 437 NFSD_VNET(nfsrc_udpcachesize)++; 438 439 newrp->rc_flag |= RC_INPROG; 440 saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *); 441 if (saddr->sin_family == AF_INET) 442 newrp->rc_inet = saddr->sin_addr.s_addr; 443 else if (saddr->sin_family == AF_INET6) { 444 saddr6 = (struct sockaddr_in6 *)saddr; 445 NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6, 446 sizeof (struct in6_addr)); 447 newrp->rc_flag |= RC_INETIPV6; 448 } 449 LIST_INSERT_HEAD(hp, newrp, rc_hash); 450 TAILQ_INSERT_TAIL(&NFSD_VNET(nfsrvudplru), newrp, rc_lru); 451 mtx_unlock(mutex); 452 nd->nd_rp = newrp; 453 ret = RC_DOIT; 454 455 out: 456 NFSEXITCODE2(0, nd); 457 return (ret); 458 } 459 460 /* 461 * Update a request cache entry after the rpc has been done 462 */ 463 struct nfsrvcache * 464 nfsrvd_updatecache(struct nfsrv_descript *nd) 465 { 466 struct nfsrvcache *rp; 467 struct nfsrvcache *retrp = NULL; 468 struct mbuf *m; 469 struct mtx *mutex; 470 471 rp = nd->nd_rp; 472 if (!rp) 473 panic("nfsrvd_updatecache null rp"); 474 nd->nd_rp = NULL; 475 mutex = nfsrc_cachemutex(rp); 476 mtx_lock(mutex); 477 nfsrc_lock(rp); 478 if (!(rp->rc_flag & RC_INPROG)) 479 panic("nfsrvd_updatecache not inprog"); 480 rp->rc_flag &= ~RC_INPROG; 481 if (rp->rc_flag & RC_UDP) { 482 TAILQ_REMOVE(&NFSD_VNET(nfsrvudplru), rp, rc_lru); 483 TAILQ_INSERT_TAIL(&NFSD_VNET(nfsrvudplru), rp, rc_lru); 484 } 485 486 /* 487 * Reply from cache is a special case returned by nfsrv_checkseqid(). 
488 */ 489 if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) { 490 NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++; 491 mtx_unlock(mutex); 492 nd->nd_repstat = 0; 493 if (nd->nd_mreq) 494 m_freem(nd->nd_mreq); 495 if (!(rp->rc_flag & RC_REPMBUF)) 496 panic("reply from cache"); 497 nd->nd_mreq = m_copym(rp->rc_reply, 0, 498 M_COPYALL, M_WAITOK); 499 rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout; 500 nfsrc_unlock(rp); 501 goto out; 502 } 503 504 /* 505 * If rc_refcnt > 0, save it 506 * For UDP, save it if ND_SAVEREPLY is set 507 * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set 508 */ 509 if (nd->nd_repstat != NFSERR_DONTREPLY && 510 (rp->rc_refcnt > 0 || 511 ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) || 512 ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) && 513 NFSD_VNET(nfsrc_tcpsavedreplies) <= NFSD_VNET(nfsrc_floodlevel) && 514 nfsrc_tcpnonidempotent))) { 515 if (rp->rc_refcnt > 0) { 516 if (!(rp->rc_flag & RC_NFSV4)) 517 panic("update_cache refcnt"); 518 rp->rc_flag |= RC_REFCNT; 519 } 520 if ((nd->nd_flag & ND_NFSV2) && 521 nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) { 522 rp->rc_status = nd->nd_repstat; 523 rp->rc_flag |= RC_REPSTATUS; 524 mtx_unlock(mutex); 525 } else { 526 if (!(rp->rc_flag & RC_UDP)) { 527 atomic_add_int(&NFSD_VNET(nfsrc_tcpsavedreplies), 528 1); 529 if (NFSD_VNET(nfsrc_tcpsavedreplies) > 530 NFSD_VNET(nfsstatsv1_p)->srvcache_tcppeak) 531 NFSD_VNET(nfsstatsv1_p)->srvcache_tcppeak = 532 NFSD_VNET(nfsrc_tcpsavedreplies); 533 } 534 mtx_unlock(mutex); 535 m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK); 536 mtx_lock(mutex); 537 rp->rc_reply = m; 538 rp->rc_flag |= RC_REPMBUF; 539 mtx_unlock(mutex); 540 } 541 if (rp->rc_flag & RC_UDP) { 542 rp->rc_timestamp = NFSD_MONOSEC + 543 NFSRVCACHE_UDPTIMEOUT; 544 nfsrc_unlock(rp); 545 } else { 546 rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout; 547 if (rp->rc_refcnt > 0) 548 nfsrc_unlock(rp); 549 else 550 retrp = rp; 551 } 552 } else { 553 nfsrc_freecache(rp); 554 mtx_unlock(mutex); 555 } 556 557 out: 558 NFSEXITCODE2(0, nd); 559 return (retrp); 560 } 561 562 /* 563 * Invalidate and, if possible, free an in prog cache entry. 564 * Must not sleep. 565 */ 566 void 567 nfsrvd_delcache(struct nfsrvcache *rp) 568 { 569 struct mtx *mutex; 570 571 mutex = nfsrc_cachemutex(rp); 572 if (!(rp->rc_flag & RC_INPROG)) 573 panic("nfsrvd_delcache not in prog"); 574 mtx_lock(mutex); 575 rp->rc_flag &= ~RC_INPROG; 576 if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED)) 577 nfsrc_freecache(rp); 578 mtx_unlock(mutex); 579 } 580 581 /* 582 * Called after nfsrvd_updatecache() once the reply is sent, to update 583 * the entry's sequence number and unlock it. The argument is 584 * the pointer returned by nfsrvd_updatecache(). 
585 */ 586 void 587 nfsrvd_sentcache(struct nfsrvcache *rp, int have_seq, uint32_t seq) 588 { 589 struct nfsrchash_bucket *hbp; 590 591 KASSERT(rp->rc_flag & RC_LOCKED, ("nfsrvd_sentcache not locked")); 592 if (have_seq) { 593 hbp = NFSRCAHASH(rp->rc_sockref); 594 mtx_lock(&hbp->mtx); 595 rp->rc_tcpseq = seq; 596 if (rp->rc_acked != RC_NO_ACK) 597 LIST_INSERT_HEAD(&hbp->tbl, rp, rc_ahash); 598 rp->rc_acked = RC_NO_ACK; 599 mtx_unlock(&hbp->mtx); 600 } 601 nfsrc_unlock(rp); 602 } 603 604 /* 605 * Get a cache entry for TCP 606 * - key on <xid, nfs version> 607 * (allow multiple entries for a given key) 608 */ 609 static int 610 nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp) 611 { 612 struct nfsrvcache *rp, *nextrp; 613 int i; 614 struct nfsrvcache *hitrp; 615 struct nfsrvhashhead *hp, nfsrc_templist; 616 int hit, ret = 0; 617 struct mtx *mutex; 618 619 mutex = nfsrc_cachemutex(newrp); 620 hp = NFSRCHASH(newrp->rc_xid); 621 newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum); 622 tryagain: 623 mtx_lock(mutex); 624 hit = 1; 625 LIST_INIT(&nfsrc_templist); 626 /* 627 * Get all the matches and put them on the temp list. 628 */ 629 rp = LIST_FIRST(hp); 630 while (rp != LIST_END(hp)) { 631 nextrp = LIST_NEXT(rp, rc_hash); 632 if (newrp->rc_xid == rp->rc_xid && 633 (!(rp->rc_flag & RC_INPROG) || 634 ((newrp->rc_flag & RC_SAMETCPCONN) && 635 newrp->rc_sockref == rp->rc_sockref)) && 636 (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) && 637 newrp->rc_proc == rp->rc_proc && 638 ((newrp->rc_flag & RC_NFSV4) && 639 newrp->rc_sockref != rp->rc_sockref && 640 newrp->rc_cachetime >= rp->rc_cachetime) 641 && newrp->rc_reqlen == rp->rc_reqlen && 642 newrp->rc_cksum == rp->rc_cksum) { 643 LIST_REMOVE(rp, rc_hash); 644 LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash); 645 } 646 rp = nextrp; 647 } 648 649 /* 650 * Now, use nfsrc_templist to decide if there is a match. 651 */ 652 i = 0; 653 LIST_FOREACH(rp, &nfsrc_templist, rc_hash) { 654 i++; 655 if (rp->rc_refcnt > 0) { 656 hit = 0; 657 break; 658 } 659 } 660 /* 661 * Can be a hit only if one entry left. 662 * Note possible hit entry and put nfsrc_templist back on hash 663 * list. 664 */ 665 if (i != 1) 666 hit = 0; 667 hitrp = rp = LIST_FIRST(&nfsrc_templist); 668 while (rp != LIST_END(&nfsrc_templist)) { 669 nextrp = LIST_NEXT(rp, rc_hash); 670 LIST_REMOVE(rp, rc_hash); 671 LIST_INSERT_HEAD(hp, rp, rc_hash); 672 rp = nextrp; 673 } 674 if (LIST_FIRST(&nfsrc_templist) != LIST_END(&nfsrc_templist)) 675 panic("nfs gettcp cache templist"); 676 677 if (hit) { 678 rp = hitrp; 679 if ((rp->rc_flag & RC_LOCKED) != 0) { 680 rp->rc_flag |= RC_WANTED; 681 (void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP, 682 "nfsrc", 10 * hz); 683 goto tryagain; 684 } 685 if (rp->rc_flag == 0) 686 panic("nfs tcp cache0"); 687 rp->rc_flag |= RC_LOCKED; 688 if (rp->rc_flag & RC_INPROG) { 689 NFSD_VNET(nfsstatsv1_p)->srvcache_inproghits++; 690 mtx_unlock(mutex); 691 if (newrp->rc_sockref == rp->rc_sockref) 692 nfsrc_marksametcpconn(rp->rc_sockref); 693 ret = RC_DROPIT; 694 } else if (rp->rc_flag & RC_REPSTATUS) { 695 /* 696 * V2 only. 
697 */ 698 NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++; 699 mtx_unlock(mutex); 700 if (newrp->rc_sockref == rp->rc_sockref) 701 nfsrc_marksametcpconn(rp->rc_sockref); 702 ret = RC_REPLY; 703 nfsrvd_rephead(nd); 704 *(nd->nd_errp) = rp->rc_status; 705 rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout; 706 } else if (rp->rc_flag & RC_REPMBUF) { 707 NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++; 708 mtx_unlock(mutex); 709 if (newrp->rc_sockref == rp->rc_sockref) 710 nfsrc_marksametcpconn(rp->rc_sockref); 711 ret = RC_REPLY; 712 nd->nd_mreq = m_copym(rp->rc_reply, 0, 713 M_COPYALL, M_WAITOK); 714 rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout; 715 } else { 716 panic("nfs tcp cache1"); 717 } 718 nfsrc_unlock(rp); 719 free(newrp, M_NFSRVCACHE); 720 goto out; 721 } 722 NFSD_VNET(nfsstatsv1_p)->srvcache_misses++; 723 atomic_add_int(&NFSD_VNET(nfsstatsv1_p)->srvcache_size, 1); 724 725 /* 726 * For TCP, multiple entries for a key are allowed, so don't 727 * chain it into the hash table until done. 728 */ 729 newrp->rc_cachetime = NFSD_MONOSEC; 730 newrp->rc_flag |= RC_INPROG; 731 LIST_INSERT_HEAD(hp, newrp, rc_hash); 732 mtx_unlock(mutex); 733 nd->nd_rp = newrp; 734 ret = RC_DOIT; 735 736 out: 737 NFSEXITCODE2(0, nd); 738 return (ret); 739 } 740 741 /* 742 * Lock a cache entry. 743 */ 744 static void 745 nfsrc_lock(struct nfsrvcache *rp) 746 { 747 struct mtx *mutex; 748 749 mutex = nfsrc_cachemutex(rp); 750 mtx_assert(mutex, MA_OWNED); 751 while ((rp->rc_flag & RC_LOCKED) != 0) { 752 rp->rc_flag |= RC_WANTED; 753 (void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0); 754 } 755 rp->rc_flag |= RC_LOCKED; 756 } 757 758 /* 759 * Unlock a cache entry. 760 */ 761 static void 762 nfsrc_unlock(struct nfsrvcache *rp) 763 { 764 struct mtx *mutex; 765 766 mutex = nfsrc_cachemutex(rp); 767 mtx_lock(mutex); 768 rp->rc_flag &= ~RC_LOCKED; 769 nfsrc_wanted(rp); 770 mtx_unlock(mutex); 771 } 772 773 /* 774 * Wakeup anyone wanting entry. 775 */ 776 static void 777 nfsrc_wanted(struct nfsrvcache *rp) 778 { 779 if (rp->rc_flag & RC_WANTED) { 780 rp->rc_flag &= ~RC_WANTED; 781 wakeup((caddr_t)rp); 782 } 783 } 784 785 /* 786 * Free up the entry. 787 * Must not sleep. 788 */ 789 static void 790 nfsrc_freecache(struct nfsrvcache *rp) 791 { 792 struct nfsrchash_bucket *hbp; 793 794 LIST_REMOVE(rp, rc_hash); 795 if (rp->rc_flag & RC_UDP) { 796 TAILQ_REMOVE(&NFSD_VNET(nfsrvudplru), rp, rc_lru); 797 NFSD_VNET(nfsrc_udpcachesize)--; 798 } else if (rp->rc_acked != RC_NO_SEQ) { 799 hbp = NFSRCAHASH(rp->rc_sockref); 800 mtx_lock(&hbp->mtx); 801 if (rp->rc_acked == RC_NO_ACK) 802 LIST_REMOVE(rp, rc_ahash); 803 mtx_unlock(&hbp->mtx); 804 } 805 nfsrc_wanted(rp); 806 if (rp->rc_flag & RC_REPMBUF) { 807 m_freem(rp->rc_reply); 808 if (!(rp->rc_flag & RC_UDP)) 809 atomic_add_int(&NFSD_VNET(nfsrc_tcpsavedreplies), -1); 810 } 811 free(rp, M_NFSRVCACHE); 812 atomic_add_int(&NFSD_VNET(nfsstatsv1_p)->srvcache_size, -1); 813 } 814 815 /* 816 * Clean out the cache. Called when nfsserver module is unloaded. 
817 */ 818 void 819 nfsrvd_cleancache(void) 820 { 821 struct nfsrvcache *rp, *nextrp; 822 int i; 823 824 for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) { 825 LIST_FOREACH_SAFE(rp, &NFSD_VNET(nfsrchash_table)[i].tbl, 826 rc_hash, nextrp) 827 nfsrc_freecache(rp); 828 } 829 for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) { 830 LIST_FOREACH_SAFE(rp, &NFSD_VNET(nfsrvudphashtbl)[i], rc_hash, 831 nextrp) { 832 nfsrc_freecache(rp); 833 } 834 } 835 NFSD_VNET(nfsstatsv1_p)->srvcache_size = 0; 836 NFSD_VNET(nfsrc_tcpsavedreplies) = 0; 837 } 838 839 #define HISTSIZE 16 840 /* 841 * The basic rule is to get rid of entries that are expired. 842 */ 843 void 844 nfsrc_trimcache(u_int64_t sockref, uint32_t snd_una, int final) 845 { 846 struct nfsrchash_bucket *hbp; 847 struct nfsrvcache *rp, *nextrp; 848 int force, lastslot, i, j, k, tto, time_histo[HISTSIZE]; 849 time_t thisstamp; 850 static time_t udp_lasttrim = 0, tcp_lasttrim = 0; 851 static int onethread = 0, oneslot = 0; 852 853 if (sockref != 0) { 854 hbp = NFSRCAHASH(sockref); 855 mtx_lock(&hbp->mtx); 856 LIST_FOREACH_SAFE(rp, &hbp->tbl, rc_ahash, nextrp) { 857 if (sockref == rp->rc_sockref) { 858 if (SEQ_GEQ(snd_una, rp->rc_tcpseq)) { 859 rp->rc_acked = RC_ACK; 860 LIST_REMOVE(rp, rc_ahash); 861 } else if (final) { 862 rp->rc_acked = RC_NACK; 863 LIST_REMOVE(rp, rc_ahash); 864 } 865 } 866 } 867 mtx_unlock(&hbp->mtx); 868 } 869 870 if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0) 871 return; 872 if (NFSD_MONOSEC != udp_lasttrim || 873 NFSD_VNET(nfsrc_udpcachesize) >= (nfsrc_udphighwater + 874 nfsrc_udphighwater / 2)) { 875 mtx_lock(&nfsrc_udpmtx); 876 udp_lasttrim = NFSD_MONOSEC; 877 TAILQ_FOREACH_SAFE(rp, &NFSD_VNET(nfsrvudplru), rc_lru, 878 nextrp) { 879 if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED)) 880 && rp->rc_refcnt == 0 881 && ((rp->rc_flag & RC_REFCNT) || 882 udp_lasttrim > rp->rc_timestamp || 883 NFSD_VNET(nfsrc_udpcachesize) > 884 nfsrc_udphighwater)) 885 nfsrc_freecache(rp); 886 } 887 mtx_unlock(&nfsrc_udpmtx); 888 } 889 if (NFSD_MONOSEC != tcp_lasttrim || 890 NFSD_VNET(nfsrc_tcpsavedreplies) >= nfsrc_tcphighwater) { 891 force = nfsrc_tcphighwater / 4; 892 if (force > 0 && 893 NFSD_VNET(nfsrc_tcpsavedreplies) + force >= 894 nfsrc_tcphighwater) { 895 for (i = 0; i < HISTSIZE; i++) 896 time_histo[i] = 0; 897 i = 0; 898 lastslot = NFSRVCACHE_HASHSIZE - 1; 899 } else { 900 force = 0; 901 if (NFSD_MONOSEC != tcp_lasttrim) { 902 i = 0; 903 lastslot = NFSRVCACHE_HASHSIZE - 1; 904 } else { 905 lastslot = i = oneslot; 906 if (++oneslot >= NFSRVCACHE_HASHSIZE) 907 oneslot = 0; 908 } 909 } 910 tto = nfsrc_tcptimeout; 911 tcp_lasttrim = NFSD_MONOSEC; 912 for (; i <= lastslot; i++) { 913 mtx_lock(&NFSD_VNET(nfsrchash_table)[i].mtx); 914 LIST_FOREACH_SAFE(rp, 915 &NFSD_VNET(nfsrchash_table)[i].tbl, rc_hash, 916 nextrp) { 917 if (!(rp->rc_flag & 918 (RC_INPROG|RC_LOCKED|RC_WANTED)) 919 && rp->rc_refcnt == 0) { 920 if ((rp->rc_flag & RC_REFCNT) || 921 tcp_lasttrim > rp->rc_timestamp || 922 rp->rc_acked == RC_ACK) { 923 nfsrc_freecache(rp); 924 continue; 925 } 926 927 if (force == 0) 928 continue; 929 /* 930 * The timestamps range from roughly the 931 * present (tcp_lasttrim) to the present 932 * + nfsrc_tcptimeout. Generate a simple 933 * histogram of where the timeouts fall. 
934 */ 935 j = rp->rc_timestamp - tcp_lasttrim; 936 if (j >= tto) 937 j = HISTSIZE - 1; 938 else if (j < 0) 939 j = 0; 940 else 941 j = j * HISTSIZE / tto; 942 time_histo[j]++; 943 } 944 } 945 mtx_unlock(&NFSD_VNET(nfsrchash_table)[i].mtx); 946 } 947 if (force) { 948 /* 949 * Trim some more with a smaller timeout of as little 950 * as 20% of nfsrc_tcptimeout to try and get below 951 * 80% of the nfsrc_tcphighwater. 952 */ 953 k = 0; 954 for (i = 0; i < (HISTSIZE - 2); i++) { 955 k += time_histo[i]; 956 if (k > force) 957 break; 958 } 959 k = tto * (i + 1) / HISTSIZE; 960 if (k < 1) 961 k = 1; 962 thisstamp = tcp_lasttrim + k; 963 for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) { 964 mtx_lock(&NFSD_VNET(nfsrchash_table)[i].mtx); 965 LIST_FOREACH_SAFE(rp, 966 &NFSD_VNET(nfsrchash_table)[i].tbl, 967 rc_hash, nextrp) { 968 if (!(rp->rc_flag & 969 (RC_INPROG|RC_LOCKED|RC_WANTED)) 970 && rp->rc_refcnt == 0 971 && ((rp->rc_flag & RC_REFCNT) || 972 thisstamp > rp->rc_timestamp || 973 rp->rc_acked == RC_ACK)) 974 nfsrc_freecache(rp); 975 } 976 mtx_unlock(&NFSD_VNET(nfsrchash_table)[i].mtx); 977 } 978 } 979 } 980 atomic_store_rel_int(&onethread, 0); 981 } 982 983 /* 984 * Add a seqid# reference to the cache entry. 985 */ 986 void 987 nfsrvd_refcache(struct nfsrvcache *rp) 988 { 989 struct mtx *mutex; 990 991 if (rp == NULL) 992 /* For NFSv4.1, there is no cache entry. */ 993 return; 994 mutex = nfsrc_cachemutex(rp); 995 mtx_lock(mutex); 996 if (rp->rc_refcnt < 0) 997 panic("nfs cache refcnt"); 998 rp->rc_refcnt++; 999 mtx_unlock(mutex); 1000 } 1001 1002 /* 1003 * Dereference a seqid# cache entry. 1004 */ 1005 void 1006 nfsrvd_derefcache(struct nfsrvcache *rp) 1007 { 1008 struct mtx *mutex; 1009 1010 mutex = nfsrc_cachemutex(rp); 1011 mtx_lock(mutex); 1012 if (rp->rc_refcnt <= 0) 1013 panic("nfs cache derefcnt"); 1014 rp->rc_refcnt--; 1015 if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG))) 1016 nfsrc_freecache(rp); 1017 mtx_unlock(mutex); 1018 } 1019 1020 /* 1021 * Calculate the length of the mbuf list and a checksum on the first up to 1022 * NFSRVCACHE_CHECKLEN bytes. 1023 */ 1024 static int 1025 nfsrc_getlenandcksum(struct mbuf *m1, u_int16_t *cksum) 1026 { 1027 int len = 0, cklen; 1028 struct mbuf *m; 1029 1030 m = m1; 1031 while (m) { 1032 len += m->m_len; 1033 m = m->m_next; 1034 } 1035 cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len; 1036 *cksum = in_cksum(m1, cklen); 1037 return (len); 1038 } 1039 1040 /* 1041 * Mark a TCP connection that is seeing retries. Should never happen for 1042 * NFSv4. 1043 */ 1044 static void 1045 nfsrc_marksametcpconn(u_int64_t sockref) 1046 { 1047 } 1048