/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 * Here is the basic algorithm:
 * First, some design criteria I used:
 * - I think a false hit is more serious than a false miss
 * - A false hit for an RPC that has Op(s) that order via seqid# must be
 *   avoided at all cost
 * - A valid hit will probably happen a long time after the original reply
 *   and the TCP socket that the original request was received on will no
 *   longer be active
 *   (The long time delay implies to me that LRU is not appropriate.)
 * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
 *   in them as well as minimizing the risk of redoing retried non-idempotent
 *   Ops.
 * Because it is biased towards avoiding false hits, multiple entries with
 * the same xid are to be expected, especially for the case of the entry
 * in the cache being related to a seqid# sequenced Op.
 *
 * The basic algorithm I'm about to code up:
 * - Null RPCs bypass the cache and are just done
 * For TCP
 * - key on <xid, NFS version> (as noted above, there can be several
 *   entries with the same key)
 *   When a request arrives:
 *   For all that match key
 *   - if RPC# != OR request_size !=
 *     - not a match with this one
 *   - if NFSv4 and received on same TCP socket OR
 *     received on a TCP connection created before the
 *     entry was cached
 *     - not a match with this one
 *     (V2,3 clients might retry on same TCP socket)
 *   - calculate checksum on first N bytes of NFS XDR
 *   - if checksum !=
 *     - not a match for this one
 *   If any of the remaining ones that match has a
 *     seqid_refcnt > 0
 *     - not a match (go do RPC, using new cache entry)
 *   If one match left
 *   - a hit (reply from cache)
 *   else
 *   - miss (go do RPC, using new cache entry)
 *
 * During processing of NFSv4 request:
 * - set a flag when a non-idempotent Op is processed
 * - when an Op that uses a seqid# (Open,...) is processed
 *   - if same seqid# as referenced entry in cache
 *     - free new cache entry
 *     - reply from referenced cache entry
 *     else if next seqid# in order
 *     - free referenced cache entry
 *     - increment seqid_refcnt on new cache entry
 *     - set pointer from Openowner/Lockowner to
 *       new cache entry (aka reference it)
 *     else if first seqid# in sequence
 *     - increment seqid_refcnt on new cache entry
 *     - set pointer from Openowner/Lockowner to
 *       new cache entry (aka reference it)
 *
 * At end of RPC processing:
 * - if seqid_refcnt > 0 OR flagged non-idempotent on new
 *   cache entry
 *   - save reply in cache entry
 *   - calculate checksum on first N bytes of NFS XDR
 *     request
 *   - note op and length of XDR request (in bytes)
 *   - timestamp it
 *   else
 *   - free new cache entry
 * - Send reply (noting info for socket activity check, below)
 *
 * For cache entries saved above:
 * - if saved since seqid_refcnt was > 0
 *   - free when seqid_refcnt decrements to 0
 *     (when next one in sequence is processed above, or
 *      when Openowner/Lockowner is discarded)
 *   else { non-idempotent Op(s) }
 *   - free when
 *     - some further activity observed on same
 *       socket
 *       (I'm not yet sure how I'm going to do
 *        this. Maybe look at the TCP connection
 *        to see if the send_tcp_sequence# is well
 *        past sent reply OR K additional RPCs
 *        replied on same socket OR?)
 *     OR
 *     - when very old (hours, days, weeks?)
 *
 * For UDP (v2, 3 only), pretty much the old way:
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 *
 * When a Request arrives:
 * - if a match with entry via key
 *   - if RPC marked In_progress
 *     - discard request (don't send reply)
 *     else
 *     - reply from cache
 *     - timestamp cache entry
 *   else
 *   - add entry to cache, marked In_progress
 *   - do RPC
 *   - when RPC done
 *     - if RPC# non-idempotent
 *       - mark entry Done (not In_progress)
 *       - save reply
 *       - timestamp cache entry
 *       else
 *       - free cache entry
 *     - send reply
 *
 * Later, entries with saved replies are free'd a short time (few minutes)
 * after reply sent (timestamp).
 * Reference: Chet Juszczak, "Improving the Performance and Correctness
 *	of an NFS Server", in Proc. Winter 1989 USENIX Conference,
 *	pages 53-63. San Diego, February 1989.
 *	for the UDP case.
 * nfsrc_floodlevel is set to the allowable upper limit for saved replies
 *	for TCP. For V3, a reply won't be saved when the flood level is
 *	hit. For V4, the non-idempotent Op will return NFSERR_RESOURCE in
 *	that case. This level should be set high enough that this almost
 *	never happens.
 */
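/*
 * Rough sketch of how the nfsd request path is expected to drive this
 * cache.  This is illustrative only; the real caller lives in
 * nfs_nfsdkrpc.c and differs in detail:
 *
 *	switch (nfsrvd_getcache(nd)) {
 *	case RC_DOIT:
 *		(service the RPC, building the reply in nd->nd_mreq)
 *		rp = nfsrvd_updatecache(nd);
 *		(send the reply; then, if rp != NULL, hand the cache the
 *		 TCP sequence number of the sent reply)
 *		if (rp != NULL)
 *			nfsrvd_sentcache(rp, have_seq, seq);
 *		break;
 *	case RC_REPLY:
 *		(nd->nd_mreq already holds a copy of the cached reply;
 *		 just send it)
 *		break;
 *	case RC_DROPIT:
 *		(discard the request without replying; the original
 *		 request is still in progress)
 *		break;
 *	}
 *
 * nfsrvd_delcache() is used instead of nfsrvd_updatecache() when an
 * in-progress request is abandoned before a reply is built, and
 * nfsrc_trimcache() is expected to be called periodically (and with
 * "final" set when a connection is torn down) so that acknowledged or
 * expired TCP entries get retired.
 */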
#include <fs/nfs/nfsport.h>

extern struct mtx nfsrc_udpmtx;

NFSD_VNET_DECLARE(struct nfsrvhashhead *, nfsrvudphashtbl);
NFSD_VNET_DECLARE(struct nfsrchash_bucket *, nfsrchash_table);
NFSD_VNET_DECLARE(struct nfsrchash_bucket *, nfsrcahash_table);
NFSD_VNET_DECLARE(struct nfsstatsv1 *, nfsstatsv1_p);

NFSD_VNET_DEFINE(int, nfsrc_floodlevel) = NFSRVCACHE_FLOODLEVEL;
NFSD_VNET_DEFINE(int, nfsrc_tcpsavedreplies) = 0;

SYSCTL_DECL(_vfs_nfsd);

static u_int nfsrc_tcphighwater = 0;
static int
sysctl_tcphighwater(SYSCTL_HANDLER_ARGS)
{
    int error, newhighwater;

    newhighwater = nfsrc_tcphighwater;
    error = sysctl_handle_int(oidp, &newhighwater, 0, req);
    if (error != 0 || req->newptr == NULL)
        return (error);
    if (newhighwater < 0)
        return (EINVAL);
    if (newhighwater >= NFSD_VNET(nfsrc_floodlevel))
        NFSD_VNET(nfsrc_floodlevel) = newhighwater + newhighwater / 5;
    nfsrc_tcphighwater = newhighwater;
    return (0);
}
SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater,
    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(nfsrc_tcphighwater),
    sysctl_tcphighwater, "IU", "High water mark for TCP cache entries");

static u_int nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
    &nfsrc_udphighwater, 0,
    "High water mark for UDP cache entries");
static u_int nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
    &nfsrc_tcptimeout, 0,
    "Timeout for TCP entries in the DRC");
static u_int nfsrc_tcpnonidempotent = 1;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
    &nfsrc_tcpnonidempotent, 0,
    "Enable the DRC for NFS over TCP");

NFSD_VNET_DEFINE_STATIC(int, nfsrc_udpcachesize) = 0;
NFSD_VNET_DEFINE_STATIC(TAILQ_HEAD(, nfsrvcache), nfsrvudplru);

/*
 * and the reverse mapping from generic to Version 2 procedure numbers
 */
static int newnfsv2_procid[NFS_V3NPROCS] = {
    NFSV2PROC_NULL,
    NFSV2PROC_GETATTR,
    NFSV2PROC_SETATTR,
    NFSV2PROC_LOOKUP,
    NFSV2PROC_NOOP,
    NFSV2PROC_READLINK,
    NFSV2PROC_READ,
    NFSV2PROC_WRITE,
    NFSV2PROC_CREATE,
    NFSV2PROC_MKDIR,
    NFSV2PROC_SYMLINK,
    NFSV2PROC_CREATE,
    NFSV2PROC_REMOVE,
    NFSV2PROC_RMDIR,
    NFSV2PROC_RENAME,
    NFSV2PROC_LINK,
    NFSV2PROC_READDIR,
    NFSV2PROC_NOOP,
    NFSV2PROC_STATFS,
    NFSV2PROC_NOOP,
    NFSV2PROC_NOOP,
    NFSV2PROC_NOOP,
};

#define nfsrc_hash(xid)     (((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
#define NFSRCUDPHASH(xid) \
    (&NFSD_VNET(nfsrvudphashtbl)[nfsrc_hash(xid)])
#define NFSRCHASH(xid) \
    (&NFSD_VNET(nfsrchash_table)[nfsrc_hash(xid)].tbl)
#define NFSRCAHASH(xid)     (&NFSD_VNET(nfsrcahash_table)[nfsrc_hash(xid)])
#define TRUE    1
#define FALSE   0
#define NFSRVCACHE_CHECKLEN 100
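/*
 * Notes on the macros above: nfsrc_hash() folds the high-order byte of
 * the xid into the low-order bits before taking the modulus, so xids
 * that differ mainly in their upper byte still spread across the
 * NFSRVCACHE_HASHSIZE buckets.  NFSRVCACHE_CHECKLEN bounds the checksum
 * computed by nfsrc_getlenandcksum(); only the first 100 bytes of the
 * XDR request are summed, which is cheap but, together with the request
 * length and procedure number, helps distinguish different requests that
 * happen to reuse an xid (see the TCP matching in nfsrc_gettcp()).
 */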

/* True iff the rpc reply is an nfs status ONLY! */
static int nfsv2_repstat[NFS_V3NPROCS] = {
    FALSE,
    FALSE,
    FALSE,
    FALSE,
    FALSE,
    FALSE,
    FALSE,
    FALSE,
    FALSE,
    FALSE,
    TRUE,
    TRUE,
    TRUE,
    TRUE,
    FALSE,
    TRUE,
    FALSE,
    FALSE,
    FALSE,
    FALSE,
    FALSE,
    FALSE,
};

/*
 * Will NFS want to work over IPv6 someday?
 */
#define NETFAMILY(rp) \
    (((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)

/* local functions */
static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static void nfsrc_lock(struct nfsrvcache *rp);
static void nfsrc_unlock(struct nfsrvcache *rp);
static void nfsrc_wanted(struct nfsrvcache *rp);
static void nfsrc_freecache(struct nfsrvcache *rp);
static int nfsrc_getlenandcksum(struct mbuf *m1, u_int16_t *cksum);
static void nfsrc_marksametcpconn(u_int64_t);

/*
 * Return the correct mutex for this cache entry.
 */
static __inline struct mtx *
nfsrc_cachemutex(struct nfsrvcache *rp)
{

    if ((rp->rc_flag & RC_UDP) != 0)
        return (&nfsrc_udpmtx);
    return (&NFSD_VNET(nfsrchash_table)[nfsrc_hash(rp->rc_xid)].mtx);
}

/*
 * Initialize the server request cache list
 */
void
nfsrvd_initcache(void)
{
    int i;

    NFSD_VNET(nfsrvudphashtbl) = malloc(sizeof(struct nfsrvhashhead) *
        NFSRVCACHE_HASHSIZE, M_NFSRVCACHE, M_WAITOK | M_ZERO);
    NFSD_VNET(nfsrchash_table) = malloc(sizeof(struct nfsrchash_bucket) *
        NFSRVCACHE_HASHSIZE, M_NFSRVCACHE, M_WAITOK | M_ZERO);
    NFSD_VNET(nfsrcahash_table) = malloc(sizeof(struct nfsrchash_bucket) *
        NFSRVCACHE_HASHSIZE, M_NFSRVCACHE, M_WAITOK | M_ZERO);
    for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
        mtx_init(&NFSD_VNET(nfsrchash_table)[i].mtx, "nfsrtc", NULL,
            MTX_DEF);
        mtx_init(&NFSD_VNET(nfsrcahash_table)[i].mtx, "nfsrtca", NULL,
            MTX_DEF);
    }
    for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
        LIST_INIT(&NFSD_VNET(nfsrvudphashtbl)[i]);
        LIST_INIT(&NFSD_VNET(nfsrchash_table)[i].tbl);
        LIST_INIT(&NFSD_VNET(nfsrcahash_table)[i].tbl);
    }
    TAILQ_INIT(&NFSD_VNET(nfsrvudplru));
    NFSD_VNET(nfsrc_tcpsavedreplies) = 0;
    NFSD_VNET(nfsrc_udpcachesize) = 0;
}

/*
 * Get a cache entry for this request. Basically just malloc a new one
 * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
 */
int
nfsrvd_getcache(struct nfsrv_descript *nd)
{
    struct nfsrvcache *newrp;
    int ret;

    if (nd->nd_procnum == NFSPROC_NULL)
        panic("nfsd cache null");
    newrp = malloc(sizeof (struct nfsrvcache),
        M_NFSRVCACHE, M_WAITOK);
    NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
    if (nd->nd_flag & ND_NFSV4)
        newrp->rc_flag = RC_NFSV4;
    else if (nd->nd_flag & ND_NFSV3)
        newrp->rc_flag = RC_NFSV3;
    else
        newrp->rc_flag = RC_NFSV2;
    newrp->rc_xid = nd->nd_retxid;
    newrp->rc_proc = nd->nd_procnum;
    newrp->rc_sockref = nd->nd_sockref;
    newrp->rc_cachetime = nd->nd_tcpconntime;
    if (nd->nd_flag & ND_SAMETCPCONN)
        newrp->rc_flag |= RC_SAMETCPCONN;
    if (nd->nd_nam2 != NULL) {
        newrp->rc_flag |= RC_UDP;
        ret = nfsrc_getudp(nd, newrp);
    } else {
        ret = nfsrc_gettcp(nd, newrp);
    }
    NFSEXITCODE2(0, nd);
    return (ret);
}

/*
 * For UDP (v2, v3):
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 */
static int
nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
    struct nfsrvcache *rp;
    struct sockaddr_in *saddr;
    struct sockaddr_in6 *saddr6;
    struct nfsrvhashhead *hp;
    int ret = 0;
    struct mtx *mutex;

    mutex = nfsrc_cachemutex(newrp);
    hp = NFSRCUDPHASH(newrp->rc_xid);
loop:
    mtx_lock(mutex);
    LIST_FOREACH(rp, hp, rc_hash) {
        if (newrp->rc_xid == rp->rc_xid &&
            newrp->rc_proc == rp->rc_proc &&
            (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
            nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
            if ((rp->rc_flag & RC_LOCKED) != 0) {
                rp->rc_flag |= RC_WANTED;
                (void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
                    "nfsrc", 10 * hz);
                goto loop;
            }
            if (rp->rc_flag == 0)
                panic("nfs udp cache0");
            rp->rc_flag |= RC_LOCKED;
            TAILQ_REMOVE(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
            TAILQ_INSERT_TAIL(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
            if (rp->rc_flag & RC_INPROG) {
                NFSD_VNET(nfsstatsv1_p)->srvcache_inproghits++;
                mtx_unlock(mutex);
                ret = RC_DROPIT;
            } else if (rp->rc_flag & RC_REPSTATUS) {
                /*
                 * V2 only.
                 */
                NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
                mtx_unlock(mutex);
                nfsrvd_rephead(nd);
                *(nd->nd_errp) = rp->rc_status;
                ret = RC_REPLY;
                rp->rc_timestamp = NFSD_MONOSEC +
                    NFSRVCACHE_UDPTIMEOUT;
            } else if (rp->rc_flag & RC_REPMBUF) {
                NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
                mtx_unlock(mutex);
                nd->nd_mreq = m_copym(rp->rc_reply, 0,
                    M_COPYALL, M_WAITOK);
                ret = RC_REPLY;
                rp->rc_timestamp = NFSD_MONOSEC +
                    NFSRVCACHE_UDPTIMEOUT;
            } else {
                panic("nfs udp cache1");
            }
            nfsrc_unlock(rp);
            free(newrp, M_NFSRVCACHE);
            goto out;
        }
    }
    NFSD_VNET(nfsstatsv1_p)->srvcache_misses++;
    atomic_add_int(&NFSD_VNET(nfsstatsv1_p)->srvcache_size, 1);
    NFSD_VNET(nfsrc_udpcachesize)++;

    newrp->rc_flag |= RC_INPROG;
    saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
    if (saddr->sin_family == AF_INET)
        newrp->rc_inet = saddr->sin_addr.s_addr;
    else if (saddr->sin_family == AF_INET6) {
        saddr6 = (struct sockaddr_in6 *)saddr;
        NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
            sizeof (struct in6_addr));
        newrp->rc_flag |= RC_INETIPV6;
    }
    LIST_INSERT_HEAD(hp, newrp, rc_hash);
    TAILQ_INSERT_TAIL(&NFSD_VNET(nfsrvudplru), newrp, rc_lru);
    mtx_unlock(mutex);
    nd->nd_rp = newrp;
    ret = RC_DOIT;

out:
    NFSEXITCODE2(0, nd);
    return (ret);
}

/*
 * Update a request cache entry after the rpc has been done
 */
struct nfsrvcache *
nfsrvd_updatecache(struct nfsrv_descript *nd)
{
    struct nfsrvcache *rp;
    struct nfsrvcache *retrp = NULL;
    struct mbuf *m;
    struct mtx *mutex;

    rp = nd->nd_rp;
    if (!rp)
        panic("nfsrvd_updatecache null rp");
    nd->nd_rp = NULL;
    mutex = nfsrc_cachemutex(rp);
    mtx_lock(mutex);
    nfsrc_lock(rp);
    if (!(rp->rc_flag & RC_INPROG))
        panic("nfsrvd_updatecache not inprog");
    rp->rc_flag &= ~RC_INPROG;
    if (rp->rc_flag & RC_UDP) {
        TAILQ_REMOVE(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
        TAILQ_INSERT_TAIL(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
    }

    /*
     * Reply from cache is a special case returned by nfsrv_checkseqid().
     */
    if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
        NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
        mtx_unlock(mutex);
        nd->nd_repstat = 0;
        if (nd->nd_mreq)
            m_freem(nd->nd_mreq);
        if (!(rp->rc_flag & RC_REPMBUF))
            panic("reply from cache");
        nd->nd_mreq = m_copym(rp->rc_reply, 0,
            M_COPYALL, M_WAITOK);
        rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
        nfsrc_unlock(rp);
        goto out;
    }

    /*
     * If rc_refcnt > 0, save it
     * For UDP, save it if ND_SAVEREPLY is set
     * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set
     */
    if (nd->nd_repstat != NFSERR_DONTREPLY &&
        (rp->rc_refcnt > 0 ||
         ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
         ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
          NFSD_VNET(nfsrc_tcpsavedreplies) <= NFSD_VNET(nfsrc_floodlevel) &&
          nfsrc_tcpnonidempotent))) {
        if (rp->rc_refcnt > 0) {
            if (!(rp->rc_flag & RC_NFSV4))
                panic("update_cache refcnt");
            rp->rc_flag |= RC_REFCNT;
        }
        if ((nd->nd_flag & ND_NFSV2) &&
            nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
            rp->rc_status = nd->nd_repstat;
            rp->rc_flag |= RC_REPSTATUS;
            mtx_unlock(mutex);
        } else {
            if (!(rp->rc_flag & RC_UDP)) {
                atomic_add_int(&NFSD_VNET(nfsrc_tcpsavedreplies),
                    1);
                if (NFSD_VNET(nfsrc_tcpsavedreplies) >
                    NFSD_VNET(nfsstatsv1_p)->srvcache_tcppeak)
                    NFSD_VNET(nfsstatsv1_p)->srvcache_tcppeak =
                        NFSD_VNET(nfsrc_tcpsavedreplies);
            }
            mtx_unlock(mutex);
            m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
            mtx_lock(mutex);
            rp->rc_reply = m;
            rp->rc_flag |= RC_REPMBUF;
            mtx_unlock(mutex);
        }
        if (rp->rc_flag & RC_UDP) {
            rp->rc_timestamp = NFSD_MONOSEC +
                NFSRVCACHE_UDPTIMEOUT;
            nfsrc_unlock(rp);
        } else {
            rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
            if (rp->rc_refcnt > 0)
                nfsrc_unlock(rp);
            else
                retrp = rp;
        }
    } else {
        nfsrc_freecache(rp);
        mtx_unlock(mutex);
    }

out:
    NFSEXITCODE2(0, nd);
    return (retrp);
}

/*
 * Invalidate and, if possible, free an in prog cache entry.
 * Must not sleep.
 */
void
nfsrvd_delcache(struct nfsrvcache *rp)
{
    struct mtx *mutex;

    mutex = nfsrc_cachemutex(rp);
    if (!(rp->rc_flag & RC_INPROG))
        panic("nfsrvd_delcache not in prog");
    mtx_lock(mutex);
    rp->rc_flag &= ~RC_INPROG;
    if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
        nfsrc_freecache(rp);
    mtx_unlock(mutex);
}

/*
 * Called after nfsrvd_updatecache() once the reply is sent, to update
 * the entry's sequence number and unlock it. The argument is
 * the pointer returned by nfsrvd_updatecache().
 */
void
nfsrvd_sentcache(struct nfsrvcache *rp, int have_seq, uint32_t seq)
{
    struct nfsrchash_bucket *hbp;

    KASSERT(rp->rc_flag & RC_LOCKED, ("nfsrvd_sentcache not locked"));
    if (have_seq) {
        hbp = NFSRCAHASH(rp->rc_sockref);
        mtx_lock(&hbp->mtx);
        rp->rc_tcpseq = seq;
        if (rp->rc_acked != RC_NO_ACK)
            LIST_INSERT_HEAD(&hbp->tbl, rp, rc_ahash);
        rp->rc_acked = RC_NO_ACK;
        mtx_unlock(&hbp->mtx);
    }
    nfsrc_unlock(rp);
}

/*
 * Get a cache entry for TCP
 * - key on <xid, nfs version>
 *   (allow multiple entries for a given key)
 */
static int
nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
    struct nfsrvcache *rp, *nextrp;
    int i;
    struct nfsrvcache *hitrp;
    struct nfsrvhashhead *hp, nfsrc_templist;
    int hit, ret = 0;
    struct mtx *mutex;

    mutex = nfsrc_cachemutex(newrp);
    hp = NFSRCHASH(newrp->rc_xid);
    newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
tryagain:
    mtx_lock(mutex);
    hit = 1;
    LIST_INIT(&nfsrc_templist);
    /*
     * Get all the matches and put them on the temp list.
     */
    rp = LIST_FIRST(hp);
    while (rp != LIST_END(hp)) {
        nextrp = LIST_NEXT(rp, rc_hash);
        if (newrp->rc_xid == rp->rc_xid &&
            (!(rp->rc_flag & RC_INPROG) ||
             ((newrp->rc_flag & RC_SAMETCPCONN) &&
              newrp->rc_sockref == rp->rc_sockref)) &&
            (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
            newrp->rc_proc == rp->rc_proc &&
            ((newrp->rc_flag & RC_NFSV4) &&
             newrp->rc_sockref != rp->rc_sockref &&
             newrp->rc_cachetime >= rp->rc_cachetime)
            && newrp->rc_reqlen == rp->rc_reqlen &&
            newrp->rc_cksum == rp->rc_cksum) {
            LIST_REMOVE(rp, rc_hash);
            LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
        }
        rp = nextrp;
    }

    /*
     * Now, use nfsrc_templist to decide if there is a match.
     */
    i = 0;
    LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
        i++;
        if (rp->rc_refcnt > 0) {
            hit = 0;
            break;
        }
    }
    /*
     * Can be a hit only if one entry left.
     * Note possible hit entry and put nfsrc_templist back on hash
     * list.
     */
    if (i != 1)
        hit = 0;
    hitrp = rp = LIST_FIRST(&nfsrc_templist);
    while (rp != LIST_END(&nfsrc_templist)) {
        nextrp = LIST_NEXT(rp, rc_hash);
        LIST_REMOVE(rp, rc_hash);
        LIST_INSERT_HEAD(hp, rp, rc_hash);
        rp = nextrp;
    }
    if (LIST_FIRST(&nfsrc_templist) != LIST_END(&nfsrc_templist))
        panic("nfs gettcp cache templist");

    if (hit) {
        rp = hitrp;
        if ((rp->rc_flag & RC_LOCKED) != 0) {
            rp->rc_flag |= RC_WANTED;
            (void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
                "nfsrc", 10 * hz);
            goto tryagain;
        }
        if (rp->rc_flag == 0)
            panic("nfs tcp cache0");
        rp->rc_flag |= RC_LOCKED;
        if (rp->rc_flag & RC_INPROG) {
            NFSD_VNET(nfsstatsv1_p)->srvcache_inproghits++;
            mtx_unlock(mutex);
            if (newrp->rc_sockref == rp->rc_sockref)
                nfsrc_marksametcpconn(rp->rc_sockref);
            ret = RC_DROPIT;
        } else if (rp->rc_flag & RC_REPSTATUS) {
            /*
             * V2 only.
             */
            NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
            mtx_unlock(mutex);
            if (newrp->rc_sockref == rp->rc_sockref)
                nfsrc_marksametcpconn(rp->rc_sockref);
            ret = RC_REPLY;
            nfsrvd_rephead(nd);
            *(nd->nd_errp) = rp->rc_status;
            rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
        } else if (rp->rc_flag & RC_REPMBUF) {
            NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
            mtx_unlock(mutex);
            if (newrp->rc_sockref == rp->rc_sockref)
                nfsrc_marksametcpconn(rp->rc_sockref);
            ret = RC_REPLY;
            nd->nd_mreq = m_copym(rp->rc_reply, 0,
                M_COPYALL, M_WAITOK);
            rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
        } else {
            panic("nfs tcp cache1");
        }
        nfsrc_unlock(rp);
        free(newrp, M_NFSRVCACHE);
        goto out;
    }
    NFSD_VNET(nfsstatsv1_p)->srvcache_misses++;
    atomic_add_int(&NFSD_VNET(nfsstatsv1_p)->srvcache_size, 1);

    /*
     * For TCP, multiple entries for a key are allowed, so don't
     * chain it into the hash table until done.
     */
    newrp->rc_cachetime = NFSD_MONOSEC;
    newrp->rc_flag |= RC_INPROG;
    LIST_INSERT_HEAD(hp, newrp, rc_hash);
    mtx_unlock(mutex);
    nd->nd_rp = newrp;
    ret = RC_DOIT;

out:
    NFSEXITCODE2(0, nd);
    return (ret);
}

/*
 * Lock a cache entry.
 */
static void
nfsrc_lock(struct nfsrvcache *rp)
{
    struct mtx *mutex;

    mutex = nfsrc_cachemutex(rp);
    mtx_assert(mutex, MA_OWNED);
    while ((rp->rc_flag & RC_LOCKED) != 0) {
        rp->rc_flag |= RC_WANTED;
        (void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0);
    }
    rp->rc_flag |= RC_LOCKED;
}

/*
 * Unlock a cache entry.
 */
static void
nfsrc_unlock(struct nfsrvcache *rp)
{
    struct mtx *mutex;

    mutex = nfsrc_cachemutex(rp);
    mtx_lock(mutex);
    rp->rc_flag &= ~RC_LOCKED;
    nfsrc_wanted(rp);
    mtx_unlock(mutex);
}

/*
 * Wakeup anyone wanting entry.
 */
static void
nfsrc_wanted(struct nfsrvcache *rp)
{
    if (rp->rc_flag & RC_WANTED) {
        rp->rc_flag &= ~RC_WANTED;
        wakeup((caddr_t)rp);
    }
}

/*
 * Free up the entry.
 * Must not sleep.
 */
static void
nfsrc_freecache(struct nfsrvcache *rp)
{
    struct nfsrchash_bucket *hbp;

    LIST_REMOVE(rp, rc_hash);
    if (rp->rc_flag & RC_UDP) {
        TAILQ_REMOVE(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
        NFSD_VNET(nfsrc_udpcachesize)--;
    } else if (rp->rc_acked != RC_NO_SEQ) {
        hbp = NFSRCAHASH(rp->rc_sockref);
        mtx_lock(&hbp->mtx);
        if (rp->rc_acked == RC_NO_ACK)
            LIST_REMOVE(rp, rc_ahash);
        mtx_unlock(&hbp->mtx);
    }
    nfsrc_wanted(rp);
    if (rp->rc_flag & RC_REPMBUF) {
        m_freem(rp->rc_reply);
        if (!(rp->rc_flag & RC_UDP))
            atomic_add_int(&NFSD_VNET(nfsrc_tcpsavedreplies), -1);
    }
    free(rp, M_NFSRVCACHE);
    atomic_add_int(&NFSD_VNET(nfsstatsv1_p)->srvcache_size, -1);
}

/*
 * Clean out the cache. Called when nfsserver module is unloaded.
 */
void
nfsrvd_cleancache(void)
{
    struct nfsrvcache *rp, *nextrp;
    int i;

    for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
        LIST_FOREACH_SAFE(rp, &NFSD_VNET(nfsrchash_table)[i].tbl,
            rc_hash, nextrp)
            nfsrc_freecache(rp);
    }
    for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
        LIST_FOREACH_SAFE(rp, &NFSD_VNET(nfsrvudphashtbl)[i], rc_hash,
            nextrp) {
            nfsrc_freecache(rp);
        }
    }
    NFSD_VNET(nfsstatsv1_p)->srvcache_size = 0;
    NFSD_VNET(nfsrc_tcpsavedreplies) = 0;
}

#define HISTSIZE    16
/*
 * The basic rule is to get rid of entries that are expired.
 */
void
nfsrc_trimcache(u_int64_t sockref, uint32_t snd_una, int final)
{
    struct nfsrchash_bucket *hbp;
    struct nfsrvcache *rp, *nextrp;
    int force, lastslot, i, j, k, tto, time_histo[HISTSIZE];
    time_t thisstamp;
    static time_t udp_lasttrim = 0, tcp_lasttrim = 0;
    static int onethread = 0, oneslot = 0;

    if (sockref != 0) {
        hbp = NFSRCAHASH(sockref);
        mtx_lock(&hbp->mtx);
        LIST_FOREACH_SAFE(rp, &hbp->tbl, rc_ahash, nextrp) {
            if (sockref == rp->rc_sockref) {
                if (SEQ_GEQ(snd_una, rp->rc_tcpseq)) {
                    rp->rc_acked = RC_ACK;
                    LIST_REMOVE(rp, rc_ahash);
                } else if (final) {
                    rp->rc_acked = RC_NACK;
                    LIST_REMOVE(rp, rc_ahash);
                }
            }
        }
        mtx_unlock(&hbp->mtx);
    }

    if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0)
        return;
    if (NFSD_MONOSEC != udp_lasttrim ||
        NFSD_VNET(nfsrc_udpcachesize) >= (nfsrc_udphighwater +
        nfsrc_udphighwater / 2)) {
        mtx_lock(&nfsrc_udpmtx);
        udp_lasttrim = NFSD_MONOSEC;
        TAILQ_FOREACH_SAFE(rp, &NFSD_VNET(nfsrvudplru), rc_lru,
            nextrp) {
            if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
                 && rp->rc_refcnt == 0
                 && ((rp->rc_flag & RC_REFCNT) ||
                     udp_lasttrim > rp->rc_timestamp ||
                     NFSD_VNET(nfsrc_udpcachesize) >
                     nfsrc_udphighwater))
                nfsrc_freecache(rp);
        }
        mtx_unlock(&nfsrc_udpmtx);
    }
    if (NFSD_MONOSEC != tcp_lasttrim ||
        NFSD_VNET(nfsrc_tcpsavedreplies) >= nfsrc_tcphighwater) {
        force = nfsrc_tcphighwater / 4;
        if (force > 0 &&
            NFSD_VNET(nfsrc_tcpsavedreplies) + force >=
            nfsrc_tcphighwater) {
            for (i = 0; i < HISTSIZE; i++)
                time_histo[i] = 0;
            i = 0;
            lastslot = NFSRVCACHE_HASHSIZE - 1;
        } else {
            force = 0;
            if (NFSD_MONOSEC != tcp_lasttrim) {
                i = 0;
                lastslot = NFSRVCACHE_HASHSIZE - 1;
            } else {
                lastslot = i = oneslot;
                if (++oneslot >= NFSRVCACHE_HASHSIZE)
                    oneslot = 0;
            }
        }
        tto = nfsrc_tcptimeout;
        tcp_lasttrim = NFSD_MONOSEC;
        for (; i <= lastslot; i++) {
            mtx_lock(&NFSD_VNET(nfsrchash_table)[i].mtx);
            LIST_FOREACH_SAFE(rp,
                &NFSD_VNET(nfsrchash_table)[i].tbl, rc_hash,
                nextrp) {
                if (!(rp->rc_flag &
                     (RC_INPROG|RC_LOCKED|RC_WANTED))
                     && rp->rc_refcnt == 0) {
                    if ((rp->rc_flag & RC_REFCNT) ||
                        tcp_lasttrim > rp->rc_timestamp ||
                        rp->rc_acked == RC_ACK) {
                        nfsrc_freecache(rp);
                        continue;
                    }

                    if (force == 0)
                        continue;
                    /*
                     * The timestamps range from roughly the
                     * present (tcp_lasttrim) to the present
                     * + nfsrc_tcptimeout. Generate a simple
                     * histogram of where the timeouts fall.
                     */
                    j = rp->rc_timestamp - tcp_lasttrim;
                    if (j >= tto)
                        j = HISTSIZE - 1;
                    else if (j < 0)
                        j = 0;
                    else
                        j = j * HISTSIZE / tto;
                    time_histo[j]++;
                }
            }
            mtx_unlock(&NFSD_VNET(nfsrchash_table)[i].mtx);
        }
        if (force) {
            /*
             * Trim some more with a smaller timeout of as little
             * as 20% of nfsrc_tcptimeout to try and get below
             * 80% of the nfsrc_tcphighwater.
             */
            k = 0;
            for (i = 0; i < (HISTSIZE - 2); i++) {
                k += time_histo[i];
                if (k > force)
                    break;
            }
            k = tto * (i + 1) / HISTSIZE;
            if (k < 1)
                k = 1;
            thisstamp = tcp_lasttrim + k;
            for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
                mtx_lock(&NFSD_VNET(nfsrchash_table)[i].mtx);
                LIST_FOREACH_SAFE(rp,
                    &NFSD_VNET(nfsrchash_table)[i].tbl,
                    rc_hash, nextrp) {
                    if (!(rp->rc_flag &
                         (RC_INPROG|RC_LOCKED|RC_WANTED))
                         && rp->rc_refcnt == 0
                         && ((rp->rc_flag & RC_REFCNT) ||
                             thisstamp > rp->rc_timestamp ||
                             rp->rc_acked == RC_ACK))
                        nfsrc_freecache(rp);
                }
                mtx_unlock(&NFSD_VNET(nfsrchash_table)[i].mtx);
            }
        }
    }
    atomic_store_rel_int(&onethread, 0);
}

/*
 * Add a seqid# reference to the cache entry.
 */
void
nfsrvd_refcache(struct nfsrvcache *rp)
{
    struct mtx *mutex;

    if (rp == NULL)
        /* For NFSv4.1, there is no cache entry. */
        return;
    mutex = nfsrc_cachemutex(rp);
    mtx_lock(mutex);
    if (rp->rc_refcnt < 0)
        panic("nfs cache refcnt");
    rp->rc_refcnt++;
    mtx_unlock(mutex);
}

/*
 * Dereference a seqid# cache entry.
 */
void
nfsrvd_derefcache(struct nfsrvcache *rp)
{
    struct mtx *mutex;

    mutex = nfsrc_cachemutex(rp);
    mtx_lock(mutex);
    if (rp->rc_refcnt <= 0)
        panic("nfs cache derefcnt");
    rp->rc_refcnt--;
    if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
        nfsrc_freecache(rp);
    mtx_unlock(mutex);
}

/*
 * Calculate the length of the mbuf list and a checksum on the first up to
 * NFSRVCACHE_CHECKLEN bytes.
 */
static int
nfsrc_getlenandcksum(struct mbuf *m1, u_int16_t *cksum)
{
    int len = 0, cklen;
    struct mbuf *m;

    m = m1;
    while (m) {
        len += m->m_len;
        m = m->m_next;
    }
    cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
    *cksum = in_cksum(m1, cklen);
    return (len);
}

/*
 * Mark a TCP connection that is seeing retries. Should never happen for
 * NFSv4.
 */
static void
nfsrc_marksametcpconn(u_int64_t sockref)
{
}