1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <sys/systm.h> 30 #include <sys/sdt.h> 31 #include <rpc/types.h> 32 #include <rpc/auth.h> 33 #include <rpc/auth_unix.h> 34 #include <rpc/auth_des.h> 35 #include <rpc/svc.h> 36 #include <rpc/xdr.h> 37 #include <nfs/nfs4.h> 38 #include <nfs/nfs_dispatch.h> 39 #include <nfs/nfs4_drc.h> 40 41 #define NFS4_MAX_MINOR_VERSION 0 42 43 /* 44 * This is the duplicate request cache for NFSv4 45 */ 46 rfs4_drc_t *nfs4_drc = NULL; 47 48 /* 49 * The default size of the duplicate request cache 50 */ 51 uint32_t nfs4_drc_max = 8 * 1024; 52 53 /* 54 * The number of buckets we'd like to hash the 55 * replies into.. do not change this on the fly. 56 */ 57 uint32_t nfs4_drc_hash = 541; 58 59 /* 60 * Initialize a duplicate request cache. 61 */ 62 rfs4_drc_t * 63 rfs4_init_drc(uint32_t drc_size, uint32_t drc_hash_size) 64 { 65 rfs4_drc_t *drc; 66 uint32_t bki; 67 68 ASSERT(drc_size); 69 ASSERT(drc_hash_size); 70 71 drc = kmem_alloc(sizeof (rfs4_drc_t), KM_SLEEP); 72 73 drc->max_size = drc_size; 74 drc->in_use = 0; 75 76 mutex_init(&drc->lock, NULL, MUTEX_DEFAULT, NULL); 77 78 drc->dr_hash = drc_hash_size; 79 80 drc->dr_buckets = kmem_alloc(sizeof (list_t)*drc_hash_size, KM_SLEEP); 81 82 for (bki = 0; bki < drc_hash_size; bki++) { 83 list_create(&drc->dr_buckets[bki], sizeof (rfs4_dupreq_t), 84 offsetof(rfs4_dupreq_t, dr_bkt_next)); 85 } 86 87 list_create(&(drc->dr_cache), sizeof (rfs4_dupreq_t), 88 offsetof(rfs4_dupreq_t, dr_next)); 89 90 return (drc); 91 } 92 93 /* 94 * Destroy a duplicate request cache. 95 */ 96 void 97 rfs4_fini_drc(rfs4_drc_t *drc) 98 { 99 rfs4_dupreq_t *drp, *drp_next; 100 101 ASSERT(drc); 102 103 /* iterate over the dr_cache and free the enties */ 104 for (drp = list_head(&(drc->dr_cache)); drp != NULL; drp = drp_next) { 105 106 if (drp->dr_state == NFS4_DUP_REPLAY) 107 rfs4_compound_free(&(drp->dr_res)); 108 109 if (drp->dr_addr.buf != NULL) 110 kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen); 111 112 drp_next = list_next(&(drc->dr_cache), drp); 113 114 kmem_free(drp, sizeof (rfs4_dupreq_t)); 115 } 116 117 mutex_destroy(&drc->lock); 118 kmem_free(drc->dr_buckets, 119 sizeof (list_t)*drc->dr_hash); 120 kmem_free(drc, sizeof (rfs4_drc_t)); 121 } 122 123 /* 124 * rfs4_dr_chstate: 125 * 126 * Change the state of a rfs4_dupreq. If it's not in transition 127 * to the FREE state, return. If we are moving to the FREE state 128 * then we need to clean up the compound results and move the entry 129 * to the end of the list. 130 */ 131 void 132 rfs4_dr_chstate(rfs4_dupreq_t *drp, int new_state) 133 { 134 rfs4_drc_t *drc; 135 136 ASSERT(drp); 137 ASSERT(drp->drc); 138 ASSERT(drp->dr_bkt); 139 ASSERT(MUTEX_HELD(&drp->drc->lock)); 140 141 drp->dr_state = new_state; 142 143 if (new_state != NFS4_DUP_FREE) 144 return; 145 146 drc = drp->drc; 147 148 /* 149 * Remove entry from the bucket and 150 * dr_cache list, free compound results. 151 */ 152 list_remove(drp->dr_bkt, drp); 153 list_remove(&(drc->dr_cache), drp); 154 rfs4_compound_free(&(drp->dr_res)); 155 } 156 157 /* 158 * rfs4_alloc_dr: 159 * 160 * Malloc a new one if we have not reached our maximum cache 161 * limit, otherwise pick an entry off the tail -- Use if it 162 * is marked as NFS4_DUP_FREE, or is an entry in the 163 * NFS4_DUP_REPLAY state. 164 */ 165 rfs4_dupreq_t * 166 rfs4_alloc_dr(rfs4_drc_t *drc) 167 { 168 rfs4_dupreq_t *drp_tail, *drp = NULL; 169 170 ASSERT(drc); 171 ASSERT(MUTEX_HELD(&drc->lock)); 172 173 /* 174 * Have we hit the cache limit yet ? 175 */ 176 if (drc->in_use < drc->max_size) { 177 /* 178 * nope, so let's malloc a new one 179 */ 180 drp = kmem_zalloc(sizeof (rfs4_dupreq_t), KM_SLEEP); 181 drp->drc = drc; 182 drc->in_use++; 183 DTRACE_PROBE1(nfss__i__drc_new, rfs4_dupreq_t *, drp); 184 return (drp); 185 } 186 187 /* 188 * Cache is all allocated now traverse the list 189 * backwards to find one we can reuse. 190 */ 191 for (drp_tail = list_tail(&drc->dr_cache); drp_tail != NULL; 192 drp_tail = list_prev(&drc->dr_cache, drp_tail)) { 193 194 switch (drp_tail->dr_state) { 195 196 case NFS4_DUP_FREE: 197 list_remove(&(drc->dr_cache), drp_tail); 198 DTRACE_PROBE1(nfss__i__drc_freeclaim, 199 rfs4_dupreq_t *, drp_tail); 200 return (drp_tail); 201 /* NOTREACHED */ 202 203 case NFS4_DUP_REPLAY: 204 /* grab it. */ 205 rfs4_dr_chstate(drp_tail, NFS4_DUP_FREE); 206 DTRACE_PROBE1(nfss__i__drc_replayclaim, 207 rfs4_dupreq_t *, drp_tail); 208 return (drp_tail); 209 /* NOTREACHED */ 210 } 211 } 212 DTRACE_PROBE1(nfss__i__drc_full, rfs4_drc_t *, drc); 213 return (NULL); 214 } 215 216 /* 217 * rfs4_find_dr: 218 * 219 * Search for an entry in the duplicate request cache by 220 * calculating the hash index based on the XID, and examining 221 * the entries in the hash bucket. If we find a match, return. 222 * Once we have searched the bucket we call rfs4_alloc_dr() to 223 * allocate a new entry, or reuse one that is available. 224 */ 225 int 226 rfs4_find_dr(struct svc_req *req, rfs4_drc_t *drc, rfs4_dupreq_t **dup) 227 { 228 229 uint32_t the_xid; 230 list_t *dr_bkt; 231 rfs4_dupreq_t *drp; 232 int bktdex; 233 234 /* 235 * Get the XID, calculate the bucket and search to 236 * see if we need to replay from the cache. 237 */ 238 the_xid = req->rq_xprt->xp_xid; 239 bktdex = the_xid % drc->dr_hash; 240 241 dr_bkt = (list_t *) 242 &(drc->dr_buckets[(the_xid % drc->dr_hash)]); 243 244 DTRACE_PROBE3(nfss__i__drc_bktdex, 245 int, bktdex, 246 uint32_t, the_xid, 247 list_t *, dr_bkt); 248 249 *dup = NULL; 250 251 mutex_enter(&drc->lock); 252 /* 253 * Search the bucket for a matching xid and address. 254 */ 255 for (drp = list_head(dr_bkt); drp != NULL; 256 drp = list_next(dr_bkt, drp)) { 257 258 if (drp->dr_xid == the_xid && 259 drp->dr_addr.len == req->rq_xprt->xp_rtaddr.len && 260 bcmp((caddr_t)drp->dr_addr.buf, 261 (caddr_t)req->rq_xprt->xp_rtaddr.buf, 262 drp->dr_addr.len) == 0) { 263 264 /* 265 * Found a match so REPLAY the Reply 266 */ 267 if (drp->dr_state == NFS4_DUP_REPLAY) { 268 rfs4_dr_chstate(drp, NFS4_DUP_INUSE); 269 mutex_exit(&drc->lock); 270 *dup = drp; 271 DTRACE_PROBE1(nfss__i__drc_replay, 272 rfs4_dupreq_t *, drp); 273 return (NFS4_DUP_REPLAY); 274 } 275 276 /* 277 * This entry must be in transition, so return 278 * the 'pending' status. 279 */ 280 mutex_exit(&drc->lock); 281 return (NFS4_DUP_PENDING); 282 } 283 } 284 285 drp = rfs4_alloc_dr(drc); 286 mutex_exit(&drc->lock); 287 288 /* 289 * The DRC is full and all entries are in use. Upper function 290 * should error out this request and force the client to 291 * retransmit -- effectively this is a resource issue. NFSD 292 * threads tied up with native File System, or the cache size 293 * is too small for the server load. 294 */ 295 if (drp == NULL) 296 return (NFS4_DUP_ERROR); 297 298 /* 299 * Init the state to NEW. 300 */ 301 drp->dr_state = NFS4_DUP_NEW; 302 303 /* 304 * If needed, resize the address buffer 305 */ 306 if (drp->dr_addr.maxlen < req->rq_xprt->xp_rtaddr.len) { 307 if (drp->dr_addr.buf != NULL) 308 kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen); 309 drp->dr_addr.maxlen = req->rq_xprt->xp_rtaddr.len; 310 drp->dr_addr.buf = kmem_alloc(drp->dr_addr.maxlen, KM_NOSLEEP); 311 if (drp->dr_addr.buf == NULL) { 312 /* 313 * If the malloc fails, mark the entry 314 * as free and put on the tail. 315 */ 316 drp->dr_addr.maxlen = 0; 317 drp->dr_state = NFS4_DUP_FREE; 318 mutex_enter(&drc->lock); 319 list_insert_tail(&(drc->dr_cache), drp); 320 mutex_exit(&drc->lock); 321 return (NFS4_DUP_ERROR); 322 } 323 } 324 325 326 /* 327 * Copy the address. 328 */ 329 drp->dr_addr.len = req->rq_xprt->xp_rtaddr.len; 330 331 bcopy((caddr_t)req->rq_xprt->xp_rtaddr.buf, 332 (caddr_t)drp->dr_addr.buf, 333 drp->dr_addr.len); 334 335 drp->dr_xid = the_xid; 336 drp->dr_bkt = dr_bkt; 337 338 /* 339 * Insert at the head of the bucket and 340 * the drc lists.. 341 */ 342 mutex_enter(&drc->lock); 343 list_insert_head(&drc->dr_cache, drp); 344 list_insert_head(dr_bkt, drp); 345 mutex_exit(&drc->lock); 346 347 *dup = drp; 348 349 return (NFS4_DUP_NEW); 350 } 351 352 /* 353 * 354 * This function handles the duplicate request cache, 355 * NULL_PROC and COMPOUND procedure calls for NFSv4; 356 * 357 * Passed into this function are:- 358 * 359 * disp A pointer to our dispatch table entry 360 * req The request to process 361 * xprt The server transport handle 362 * ap A pointer to the arguments 363 * 364 * 365 * When appropriate this function is responsible for inserting 366 * the reply into the duplicate cache or replaying an existing 367 * cached reply. 368 * 369 * dr_stat reflects the state of the duplicate request that 370 * has been inserted into or retrieved from the cache 371 * 372 * drp is the duplicate request entry 373 * 374 */ 375 int 376 rfs4_dispatch(struct rpcdisp *disp, struct svc_req *req, 377 SVCXPRT *xprt, char *ap) 378 { 379 380 COMPOUND4res res_buf, *rbp; 381 COMPOUND4args *cap; 382 383 cred_t *cr = NULL; 384 int error = 0; 385 int dis_flags = 0; 386 int dr_stat = NFS4_NOT_DUP; 387 rfs4_dupreq_t *drp = NULL; 388 389 ASSERT(disp); 390 391 /* 392 * Short circuit the RPC_NULL proc. 393 */ 394 if (disp->dis_proc == rpc_null) { 395 DTRACE_NFSV4_1(null__start, struct svc_req *, req); 396 if (!svc_sendreply(xprt, xdr_void, NULL)) { 397 DTRACE_NFSV4_1(null__done, struct svc_req *, req); 398 return (1); 399 } 400 DTRACE_NFSV4_1(null__done, struct svc_req *, req); 401 return (0); 402 } 403 404 /* Only NFSv4 Compounds from this point onward */ 405 406 rbp = &res_buf; 407 cap = (COMPOUND4args *)ap; 408 409 /* 410 * Figure out the disposition of the whole COMPOUND 411 * and record it's IDEMPOTENTCY. 412 */ 413 rfs4_compound_flagproc(cap, &dis_flags); 414 415 /* 416 * If NON-IDEMPOTENT then we need to figure out if this 417 * request can be replied from the duplicate cache. 418 * 419 * If this is a new request then we need to insert the 420 * reply into the duplicate cache. 421 */ 422 if (!(dis_flags & RPC_IDEMPOTENT)) { 423 /* look for a replay from the cache or allocate */ 424 dr_stat = rfs4_find_dr(req, nfs4_drc, &drp); 425 426 switch (dr_stat) { 427 428 case NFS4_DUP_ERROR: 429 svcerr_systemerr(xprt); 430 return (1); 431 /* NOTREACHED */ 432 433 case NFS4_DUP_PENDING: 434 /* 435 * reply has previously been inserted into the 436 * duplicate cache, however the reply has 437 * not yet been sent via svc_sendreply() 438 */ 439 return (1); 440 /* NOTREACHED */ 441 442 case NFS4_DUP_NEW: 443 curthread->t_flag |= T_DONTPEND; 444 /* NON-IDEMPOTENT proc call */ 445 rfs4_compound(cap, rbp, NULL, req, cr); 446 447 curthread->t_flag &= ~T_DONTPEND; 448 449 /* 450 * dr_res must be initialized before calling 451 * rfs4_dr_chstate (it frees the reply). 452 */ 453 drp->dr_res = res_buf; 454 if (curthread->t_flag & T_WOULDBLOCK) { 455 curthread->t_flag &= ~T_WOULDBLOCK; 456 /* 457 * mark this entry as FREE and plop 458 * on the end of the cache list 459 */ 460 mutex_enter(&drp->drc->lock); 461 rfs4_dr_chstate(drp, NFS4_DUP_FREE); 462 list_insert_tail(&(drp->drc->dr_cache), drp); 463 mutex_exit(&drp->drc->lock); 464 return (1); 465 } 466 break; 467 468 case NFS4_DUP_REPLAY: 469 /* replay from the cache */ 470 rbp = &(drp->dr_res); 471 break; 472 } 473 } else { 474 curthread->t_flag |= T_DONTPEND; 475 /* IDEMPOTENT proc call */ 476 rfs4_compound(cap, rbp, NULL, req, cr); 477 478 curthread->t_flag &= ~T_DONTPEND; 479 if (curthread->t_flag & T_WOULDBLOCK) { 480 curthread->t_flag &= ~T_WOULDBLOCK; 481 return (1); 482 } 483 } 484 485 /* 486 * Send out the replayed reply or the 'real' one. 487 */ 488 if (!svc_sendreply(xprt, xdr_COMPOUND4res_srv, (char *)rbp)) { 489 DTRACE_PROBE2(nfss__e__dispatch_sendfail, 490 struct svc_req *, xprt, 491 char *, rbp); 492 error++; 493 } 494 495 /* 496 * If this reply was just inserted into the duplicate cache 497 * or it was replayed from the dup cache; (re)mark it as 498 * available for replay 499 * 500 * At first glance, this 'if' statement seems a little strange; 501 * testing for NFS4_DUP_REPLAY, and then calling... 502 * 503 * rfs4_dr_chatate(NFS4_DUP_REPLAY) 504 * 505 * ... but notice that we are checking dr_stat, and not the 506 * state of the entry itself, the entry will be NFS4_DUP_INUSE, 507 * we do that so that we know not to prematurely reap it whilst 508 * we resent it to the client. 509 * 510 */ 511 if (dr_stat == NFS4_DUP_NEW || dr_stat == NFS4_DUP_REPLAY) { 512 mutex_enter(&drp->drc->lock); 513 rfs4_dr_chstate(drp, NFS4_DUP_REPLAY); 514 mutex_exit(&drp->drc->lock); 515 } else if (dr_stat == NFS4_NOT_DUP) { 516 rfs4_compound_free(rbp); 517 } 518 519 return (error); 520 } 521 522 bool_t 523 rfs4_minorvers_mismatch(struct svc_req *req, SVCXPRT *xprt, void *args) 524 { 525 COMPOUND4args *argsp; 526 COMPOUND4res res_buf, *resp; 527 528 if (req->rq_vers != 4) 529 return (FALSE); 530 531 argsp = (COMPOUND4args *)args; 532 533 if (argsp->minorversion <= NFS4_MAX_MINOR_VERSION) 534 return (FALSE); 535 536 resp = &res_buf; 537 538 /* 539 * Form a reply tag by copying over the reqeuest tag. 540 */ 541 resp->tag.utf8string_val = 542 kmem_alloc(argsp->tag.utf8string_len, KM_SLEEP); 543 resp->tag.utf8string_len = argsp->tag.utf8string_len; 544 bcopy(argsp->tag.utf8string_val, resp->tag.utf8string_val, 545 resp->tag.utf8string_len); 546 resp->array_len = 0; 547 resp->array = NULL; 548 resp->status = NFS4ERR_MINOR_VERS_MISMATCH; 549 if (!svc_sendreply(xprt, xdr_COMPOUND4res_srv, (char *)resp)) { 550 DTRACE_PROBE2(nfss__e__minorvers_mismatch, 551 SVCXPRT *, xprt, char *, resp); 552 } 553 rfs4_compound_free(resp); 554 return (TRUE); 555 } 556