/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/systm.h>
#include <sys/sdt.h>
#include <rpc/types.h>
#include <rpc/auth.h>
#include <rpc/auth_unix.h>
#include <rpc/auth_des.h>
#include <rpc/svc.h>
#include <rpc/xdr.h>
#include <nfs/nfs4.h>
#include <nfs/nfs_dispatch.h>
#include <nfs/nfs4_drc.h>

/*
 * This is the duplicate request cache (DRC) for NFSv4.
 */
rfs4_drc_t *nfs4_drc = NULL;

/*
 * The default maximum number of entries in the duplicate request cache.
 */
uint32_t nfs4_drc_max = 8 * 1024;

/*
 * The number of buckets we hash the replies into.
 * Do not change this on the fly.
 */
uint32_t nfs4_drc_hash = 541;

/*
 * Initialize a duplicate request cache.
 */
rfs4_drc_t *
rfs4_init_drc(uint32_t drc_size, uint32_t drc_hash_size)
{
	rfs4_drc_t *drc;
	uint32_t bki;

	ASSERT(drc_size);
	ASSERT(drc_hash_size);

	drc = kmem_alloc(sizeof (rfs4_drc_t), KM_SLEEP);

	drc->max_size = drc_size;
	drc->in_use = 0;

	mutex_init(&drc->lock, NULL, MUTEX_DEFAULT, NULL);

	drc->dr_hash = drc_hash_size;

	drc->dr_buckets = kmem_alloc(sizeof (list_t) * drc_hash_size,
	    KM_SLEEP);

	for (bki = 0; bki < drc_hash_size; bki++) {
		list_create(&drc->dr_buckets[bki], sizeof (rfs4_dupreq_t),
		    offsetof(rfs4_dupreq_t, dr_bkt_next));
	}

	list_create(&(drc->dr_cache), sizeof (rfs4_dupreq_t),
	    offsetof(rfs4_dupreq_t, dr_next));

	return (drc);
}

/*
 * Destroy a duplicate request cache.
 */
void
rfs4_fini_drc(rfs4_drc_t *drc)
{
	rfs4_dupreq_t *drp, *drp_next;

	ASSERT(drc);

	/* iterate over the dr_cache and free the entries */
	for (drp = list_head(&(drc->dr_cache)); drp != NULL; drp = drp_next) {

		if (drp->dr_state == NFS4_DUP_REPLAY)
			rfs4_compound_free(&(drp->dr_res));

		if (drp->dr_addr.buf != NULL)
			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);

		drp_next = list_next(&(drc->dr_cache), drp);

		kmem_free(drp, sizeof (rfs4_dupreq_t));
	}

	mutex_destroy(&drc->lock);
	kmem_free(drc->dr_buckets, sizeof (list_t) * drc->dr_hash);
	kmem_free(drc, sizeof (rfs4_drc_t));
}
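
/*
 * A minimal usage sketch (illustrative only; the real call sites live
 * in the NFS server start-up and shutdown paths outside this file).
 * The server is expected to create the global cache once from the
 * tunables above and tear it down on unload, roughly:
 *
 *	nfs4_drc = rfs4_init_drc(nfs4_drc_max, nfs4_drc_hash);
 *	...
 *	rfs4_fini_drc(nfs4_drc);
 *	nfs4_drc = NULL;
 */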

/*
 * rfs4_dr_chstate:
 *
 * Change the state of a rfs4_dupreq. If the new state is anything
 * other than NFS4_DUP_FREE, stamp the time used and return. If we
 * are moving to NFS4_DUP_FREE then we need to clean up the compound
 * results and unlink the entry; callers typically then reinsert it
 * at the tail of the cache list.
 */
void
rfs4_dr_chstate(rfs4_dupreq_t *drp, int new_state)
{
	rfs4_drc_t *drc;

	ASSERT(drp);
	ASSERT(drp->drc);
	ASSERT(drp->dr_bkt);
	ASSERT(MUTEX_HELD(&drp->drc->lock));

	drp->dr_state = new_state;

	if (new_state != NFS4_DUP_FREE) {
		gethrestime(&drp->dr_time_used);
		return;
	}

	drc = drp->drc;

	/*
	 * Remove entry from the bucket and
	 * dr_cache list, free compound results.
	 */
	list_remove(drp->dr_bkt, drp);
	list_remove(&(drc->dr_cache), drp);
	rfs4_compound_free(&(drp->dr_res));
}

/*
 * rfs4_alloc_dr:
 *
 * Allocate a new entry if we have not yet reached the maximum cache
 * limit; otherwise walk the cache list backwards from the tail and
 * reuse the first entry found in the NFS4_DUP_FREE or NFS4_DUP_REPLAY
 * state. Returns NULL if the cache is full and nothing can be
 * reclaimed.
 */
rfs4_dupreq_t *
rfs4_alloc_dr(rfs4_drc_t *drc)
{
	rfs4_dupreq_t *drp_tail, *drp = NULL;

	ASSERT(drc);
	ASSERT(MUTEX_HELD(&drc->lock));

	/*
	 * Have we hit the cache limit yet?
	 */
	if (drc->in_use < drc->max_size) {
		/*
		 * Nope, so allocate a new one.
		 */
		drp = kmem_zalloc(sizeof (rfs4_dupreq_t), KM_SLEEP);
		drp->drc = drc;
		drc->in_use++;
		gethrestime(&drp->dr_time_created);
		DTRACE_PROBE1(nfss__i__drc_new, rfs4_dupreq_t *, drp);
		return (drp);
	}

	/*
	 * The cache is fully allocated, so traverse the list
	 * backwards to find an entry we can reuse.
	 */
	for (drp_tail = list_tail(&drc->dr_cache); drp_tail != NULL;
	    drp_tail = list_prev(&drc->dr_cache, drp_tail)) {

		switch (drp_tail->dr_state) {

		case NFS4_DUP_FREE:
			list_remove(&(drc->dr_cache), drp_tail);
			DTRACE_PROBE1(nfss__i__drc_freeclaim,
			    rfs4_dupreq_t *, drp_tail);
			return (drp_tail);
			/* NOTREACHED */

		case NFS4_DUP_REPLAY:
			/* grab it */
			rfs4_dr_chstate(drp_tail, NFS4_DUP_FREE);
			DTRACE_PROBE1(nfss__i__drc_replayclaim,
			    rfs4_dupreq_t *, drp_tail);
			return (drp_tail);
			/* NOTREACHED */
		}
	}
	DTRACE_PROBE1(nfss__i__drc_full, rfs4_drc_t *, drc);
	return (NULL);
}
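
/*
 * For reference, the life cycle of a cache entry as implemented by the
 * functions in this file (state values are defined in nfs4_drc.h):
 *
 *	NFS4_DUP_NEW:		inserted by rfs4_find_dr(); reply is
 *				being generated
 *	NFS4_DUP_INUSE:		cached reply is being retransmitted
 *	NFS4_DUP_REPLAY:	reply cached; eligible for replay or
 *				reclamation
 *	NFS4_DUP_FREE:		unlinked; awaiting reuse at the cache
 *				tail
 *
 * NEW and INUSE entries move to REPLAY once the reply has been sent;
 * REPLAY entries may be reclaimed back to FREE; a NEW entry whose
 * reply could not be generated (T_WOULDBLOCK) is freed immediately.
 */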

/*
 * rfs4_find_dr:
 *
 * Search for an entry in the duplicate request cache by
 * calculating the hash index based on the XID, and examining
 * the entries in the hash bucket. If we find a match, stamp the
 * time_used and return. If the entry does not match it could be
 * ready to be freed. Once we have searched the bucket we call
 * rfs4_alloc_dr() to allocate a new entry, or reuse one that is
 * available.
 *
 * Returns one of:
 *
 *	NFS4_DUP_NEW		a new entry was allocated and inserted
 *	NFS4_DUP_REPLAY		a cached reply is ready to be resent
 *	NFS4_DUP_PENDING	a matching request is still in progress
 *	NFS4_DUP_ERROR		no entry could be allocated
 */
int
rfs4_find_dr(struct svc_req *req, rfs4_drc_t *drc, rfs4_dupreq_t **dup)
{
	uint32_t the_xid;
	list_t *dr_bkt;
	rfs4_dupreq_t *drp;
	int bktdex;

	/*
	 * Get the XID, calculate the bucket and search to
	 * see if we need to replay from the cache.
	 */
	the_xid = req->rq_xprt->xp_xid;
	bktdex = the_xid % drc->dr_hash;

	dr_bkt = &(drc->dr_buckets[bktdex]);

	DTRACE_PROBE3(nfss__i__drc_bktdex,
	    int, bktdex,
	    uint32_t, the_xid,
	    list_t *, dr_bkt);

	*dup = NULL;

	mutex_enter(&drc->lock);
	/*
	 * Search the bucket for a matching xid and address.
	 */
	for (drp = list_head(dr_bkt); drp != NULL;
	    drp = list_next(dr_bkt, drp)) {

		if (drp->dr_xid == the_xid &&
		    drp->dr_addr.len == req->rq_xprt->xp_rtaddr.len &&
		    bcmp((caddr_t)drp->dr_addr.buf,
		    (caddr_t)req->rq_xprt->xp_rtaddr.buf,
		    drp->dr_addr.len) == 0) {

			/*
			 * Found a match, so replay the reply.
			 */
			if (drp->dr_state == NFS4_DUP_REPLAY) {
				rfs4_dr_chstate(drp, NFS4_DUP_INUSE);
				mutex_exit(&drc->lock);
				*dup = drp;
				DTRACE_PROBE1(nfss__i__drc_replay,
				    rfs4_dupreq_t *, drp);
				return (NFS4_DUP_REPLAY);
			}

			/*
			 * This entry must be in transition, so return
			 * the 'pending' status.
			 */
			mutex_exit(&drc->lock);
			return (NFS4_DUP_PENDING);
		}

		/*
		 * Not a match, but maybe this entry is okay
		 * to be reused.
		 */
		if (drp->dr_state == NFS4_DUP_REPLAY) {
			rfs4_dr_chstate(drp, NFS4_DUP_FREE);
			list_insert_tail(&(drp->drc->dr_cache), drp);
		}
	}

	drp = rfs4_alloc_dr(drc);
	mutex_exit(&drc->lock);

	/*
	 * The DRC is full and all entries are in use. The caller
	 * should error out this request and force the client to
	 * retransmit -- effectively this is a resource issue: nfsd
	 * threads may be tied up in the underlying file system, or
	 * the cache size may be too small for the server load.
	 */
	if (drp == NULL)
		return (NFS4_DUP_ERROR);

	/*
	 * Init the state to NEW and clear the time used field.
	 */
	drp->dr_state = NFS4_DUP_NEW;
	drp->dr_time_used.tv_sec = drp->dr_time_used.tv_nsec = 0;

	/*
	 * If needed, resize the address buffer.
	 */
	if (drp->dr_addr.maxlen < req->rq_xprt->xp_rtaddr.len) {
		if (drp->dr_addr.buf != NULL)
			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
		drp->dr_addr.maxlen = req->rq_xprt->xp_rtaddr.len;
		drp->dr_addr.buf = kmem_alloc(drp->dr_addr.maxlen, KM_NOSLEEP);
		if (drp->dr_addr.buf == NULL) {
			/*
			 * If the allocation fails, mark the entry
			 * as free and put it on the tail.
			 */
			drp->dr_addr.maxlen = 0;
			drp->dr_state = NFS4_DUP_FREE;
			mutex_enter(&drc->lock);
			list_insert_tail(&(drc->dr_cache), drp);
			mutex_exit(&drc->lock);
			return (NFS4_DUP_ERROR);
		}
	}

	/*
	 * Copy the address.
	 */
	drp->dr_addr.len = req->rq_xprt->xp_rtaddr.len;

	bcopy((caddr_t)req->rq_xprt->xp_rtaddr.buf,
	    (caddr_t)drp->dr_addr.buf,
	    drp->dr_addr.len);

	drp->dr_xid = the_xid;
	drp->dr_bkt = dr_bkt;

	/*
	 * Insert at the head of the bucket and
	 * the drc list.
	 */
	mutex_enter(&drc->lock);
	list_insert_head(&drc->dr_cache, drp);
	list_insert_head(dr_bkt, drp);
	mutex_exit(&drc->lock);

	*dup = drp;

	return (NFS4_DUP_NEW);
}
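
/*
 * Illustrative example of the bucket calculation above (hypothetical
 * XID, with the default nfs4_drc_hash of 541):
 *
 *	the_xid = 0x12345678 (305419896)
 *	bktdex  = 305419896 % 541 = 510
 *
 * so the request hashes to drc->dr_buckets[510].
 */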

/*
 * rfs4_dispatch:
 *
 * This function handles the duplicate request cache,
 * NULL_PROC and COMPOUND procedure calls for NFSv4.
 *
 * Passed into this function are:
 *
 *	disp	A pointer to our dispatch table entry
 *	req	The request to process
 *	xprt	The server transport handle
 *	ap	A pointer to the arguments
 *
 * When appropriate this function is responsible for inserting
 * the reply into the duplicate cache or replaying an existing
 * cached reply.
 *
 * dr_stat	reflects the state of the duplicate request that
 *		has been inserted into or retrieved from the cache
 *
 * drp		is the duplicate request entry
 */
int
rfs4_dispatch(struct rpcdisp *disp, struct svc_req *req,
    SVCXPRT *xprt, char *ap)
{
	COMPOUND4res res_buf, *rbp;
	COMPOUND4args *cap;

	cred_t *cr = NULL;
	int error = 0;
	int dis_flags = 0;
	int dr_stat = NFS4_NOT_DUP;
	rfs4_dupreq_t *drp = NULL;

	ASSERT(disp);

	/*
	 * Short circuit the RPC_NULL proc.
	 */
	if (disp->dis_proc == rpc_null) {
		if (!svc_sendreply(xprt, xdr_void, NULL)) {
			return (1);
		}
		return (0);
	}

	/* Only NFSv4 Compounds from this point onward */

	rbp = &res_buf;
	cap = (COMPOUND4args *)ap;

	/*
	 * Figure out the disposition of the whole COMPOUND
	 * and record its idempotency.
	 */
	rfs4_compound_flagproc(cap, &dis_flags);

	/*
	 * If NON-IDEMPOTENT then we need to figure out if this
	 * request can be replied from the duplicate cache.
	 *
	 * If this is a new request then we need to insert the
	 * reply into the duplicate cache.
	 */
	if (!(dis_flags & RPC_IDEMPOTENT)) {
		/* look for a replay from the cache or allocate */
		dr_stat = rfs4_find_dr(req, nfs4_drc, &drp);

		switch (dr_stat) {

		case NFS4_DUP_ERROR:
			svcerr_systemerr(xprt);
			return (1);
			/* NOTREACHED */

		case NFS4_DUP_PENDING:
			/*
			 * reply has previously been inserted into the
			 * duplicate cache, however the reply has
			 * not yet been sent via svc_sendreply()
			 */
			return (1);
			/* NOTREACHED */

		case NFS4_DUP_NEW:
			curthread->t_flag |= T_DONTPEND;
			/* NON-IDEMPOTENT proc call */
			rfs4_compound(cap, rbp, NULL, req, cr);

			curthread->t_flag &= ~T_DONTPEND;

			/*
			 * dr_res must be initialized before calling
			 * rfs4_dr_chstate (it frees the reply).
			 */
			drp->dr_res = res_buf;
			if (curthread->t_flag & T_WOULDBLOCK) {
				curthread->t_flag &= ~T_WOULDBLOCK;
				/*
				 * mark this entry as FREE and plop
				 * it on the end of the cache list
				 */
				mutex_enter(&drp->drc->lock);
				rfs4_dr_chstate(drp, NFS4_DUP_FREE);
				list_insert_tail(&(drp->drc->dr_cache), drp);
				mutex_exit(&drp->drc->lock);
				return (1);
			}
			break;

		case NFS4_DUP_REPLAY:
			/* replay from the cache */
			rbp = &(drp->dr_res);
			break;
		}
	} else {
		curthread->t_flag |= T_DONTPEND;
		/* IDEMPOTENT proc call */
		rfs4_compound(cap, rbp, NULL, req, cr);

		curthread->t_flag &= ~T_DONTPEND;
		if (curthread->t_flag & T_WOULDBLOCK) {
			curthread->t_flag &= ~T_WOULDBLOCK;
			return (1);
		}
	}
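
	/*
	 * At this point rbp points either at the freshly built reply
	 * in res_buf or, for a replay, at the reply cached in the DRC
	 * entry (which rfs4_find_dr() has marked NFS4_DUP_INUSE).
	 */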

	/*
	 * Send out the replayed reply or the 'real' one.
	 */
	if (!svc_sendreply(xprt, xdr_COMPOUND4res_srv, (char *)rbp)) {
		DTRACE_PROBE2(nfss__e__dispatch_sendfail,
		    SVCXPRT *, xprt,
		    COMPOUND4res *, rbp);
		error++;
	}

	/*
	 * If this reply was just inserted into the duplicate cache
	 * or it was replayed from the dup cache, (re)mark it as
	 * available for replay.
	 *
	 * At first glance, this 'if' statement seems a little strange;
	 * we test dr_stat for NFS4_DUP_REPLAY and then call
	 * rfs4_dr_chstate(drp, NFS4_DUP_REPLAY). Note that we are
	 * checking dr_stat, not the state of the entry itself; the
	 * entry will be NFS4_DUP_INUSE. We do that so that we know
	 * not to prematurely reap it while we resend it to the client.
	 */
	if (dr_stat == NFS4_DUP_NEW || dr_stat == NFS4_DUP_REPLAY) {
		mutex_enter(&drp->drc->lock);
		rfs4_dr_chstate(drp, NFS4_DUP_REPLAY);
		mutex_exit(&drp->drc->lock);
	} else if (dr_stat == NFS4_NOT_DUP) {
		rfs4_compound_free(rbp);
	}

	return (error);
}
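
/*
 * Observability note (illustrative, not part of this file's interface):
 * the DTRACE_PROBE sites above surface as sdt provider probes, with the
 * double underscores in the probe names rendered as dashes; for example,
 * a one-liner such as
 *
 *	dtrace -n 'sdt:::nfss-i-drc_full { trace(timestamp); }'
 *
 * should fire whenever the cache is full and no entry can be reclaimed.
 */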