1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <sys/systm.h> 30 #include <sys/sdt.h> 31 #include <rpc/types.h> 32 #include <rpc/auth.h> 33 #include <rpc/auth_unix.h> 34 #include <rpc/auth_des.h> 35 #include <rpc/svc.h> 36 #include <rpc/xdr.h> 37 #include <nfs/nfs4.h> 38 #include <nfs/nfs_dispatch.h> 39 #include <nfs/nfs4_drc.h> 40 41 #define NFS4_MAX_MINOR_VERSION 0 42 43 /* 44 * This is the duplicate request cache for NFSv4 45 */ 46 rfs4_drc_t *nfs4_drc = NULL; 47 48 /* 49 * The default size of the duplicate request cache 50 */ 51 uint32_t nfs4_drc_max = 8 * 1024; 52 53 /* 54 * The number of buckets we'd like to hash the 55 * replies into.. do not change this on the fly. 56 */ 57 uint32_t nfs4_drc_hash = 541; 58 59 /* 60 * Initialize a duplicate request cache. 61 */ 62 rfs4_drc_t * 63 rfs4_init_drc(uint32_t drc_size, uint32_t drc_hash_size) 64 { 65 rfs4_drc_t *drc; 66 uint32_t bki; 67 68 ASSERT(drc_size); 69 ASSERT(drc_hash_size); 70 71 drc = kmem_alloc(sizeof (rfs4_drc_t), KM_SLEEP); 72 73 drc->max_size = drc_size; 74 drc->in_use = 0; 75 76 mutex_init(&drc->lock, NULL, MUTEX_DEFAULT, NULL); 77 78 drc->dr_hash = drc_hash_size; 79 80 drc->dr_buckets = kmem_alloc(sizeof (list_t)*drc_hash_size, KM_SLEEP); 81 82 for (bki = 0; bki < drc_hash_size; bki++) { 83 list_create(&drc->dr_buckets[bki], sizeof (rfs4_dupreq_t), 84 offsetof(rfs4_dupreq_t, dr_bkt_next)); 85 } 86 87 list_create(&(drc->dr_cache), sizeof (rfs4_dupreq_t), 88 offsetof(rfs4_dupreq_t, dr_next)); 89 90 return (drc); 91 } 92 93 /* 94 * Destroy a duplicate request cache. 95 */ 96 void 97 rfs4_fini_drc(rfs4_drc_t *drc) 98 { 99 rfs4_dupreq_t *drp, *drp_next; 100 101 ASSERT(drc); 102 103 /* iterate over the dr_cache and free the enties */ 104 for (drp = list_head(&(drc->dr_cache)); drp != NULL; drp = drp_next) { 105 106 if (drp->dr_state == NFS4_DUP_REPLAY) 107 rfs4_compound_free(&(drp->dr_res)); 108 109 if (drp->dr_addr.buf != NULL) 110 kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen); 111 112 drp_next = list_next(&(drc->dr_cache), drp); 113 114 kmem_free(drp, sizeof (rfs4_dupreq_t)); 115 } 116 117 mutex_destroy(&drc->lock); 118 kmem_free(drc->dr_buckets, 119 sizeof (list_t)*drc->dr_hash); 120 kmem_free(drc, sizeof (rfs4_drc_t)); 121 } 122 123 /* 124 * rfs4_dr_chstate: 125 * 126 * Change the state of a rfs4_dupreq. If it's not in transition 127 * to the FREE state, update the time used and return. If we 128 * are moving to the FREE state then we need to clean up the 129 * compound results and move the entry to the end of the list. 130 */ 131 void 132 rfs4_dr_chstate(rfs4_dupreq_t *drp, int new_state) 133 { 134 rfs4_drc_t *drc; 135 136 ASSERT(drp); 137 ASSERT(drp->drc); 138 ASSERT(drp->dr_bkt); 139 ASSERT(MUTEX_HELD(&drp->drc->lock)); 140 141 drp->dr_state = new_state; 142 143 if (new_state != NFS4_DUP_FREE) { 144 gethrestime(&drp->dr_time_used); 145 return; 146 } 147 148 drc = drp->drc; 149 150 /* 151 * Remove entry from the bucket and 152 * dr_cache list, free compound results. 153 */ 154 list_remove(drp->dr_bkt, drp); 155 list_remove(&(drc->dr_cache), drp); 156 rfs4_compound_free(&(drp->dr_res)); 157 } 158 159 /* 160 * rfs4_alloc_dr: 161 * 162 * Malloc a new one if we have not reached our maximum cache 163 * limit, otherwise pick an entry off the tail -- Use if it 164 * is marked as NFS4_DUP_FREE, or is an entry in the 165 * NFS4_DUP_REPLAY state. 166 */ 167 rfs4_dupreq_t * 168 rfs4_alloc_dr(rfs4_drc_t *drc) 169 { 170 rfs4_dupreq_t *drp_tail, *drp = NULL; 171 172 ASSERT(drc); 173 ASSERT(MUTEX_HELD(&drc->lock)); 174 175 /* 176 * Have we hit the cache limit yet ? 177 */ 178 if (drc->in_use < drc->max_size) { 179 /* 180 * nope, so let's malloc a new one 181 */ 182 drp = kmem_zalloc(sizeof (rfs4_dupreq_t), KM_SLEEP); 183 drp->drc = drc; 184 drc->in_use++; 185 gethrestime(&drp->dr_time_created); 186 DTRACE_PROBE1(nfss__i__drc_new, rfs4_dupreq_t *, drp); 187 return (drp); 188 } 189 190 /* 191 * Cache is all allocated now traverse the list 192 * backwards to find one we can reuse. 193 */ 194 for (drp_tail = list_tail(&drc->dr_cache); drp_tail != NULL; 195 drp_tail = list_prev(&drc->dr_cache, drp_tail)) { 196 197 switch (drp_tail->dr_state) { 198 199 case NFS4_DUP_FREE: 200 list_remove(&(drc->dr_cache), drp_tail); 201 DTRACE_PROBE1(nfss__i__drc_freeclaim, 202 rfs4_dupreq_t *, drp_tail); 203 return (drp_tail); 204 /* NOTREACHED */ 205 206 case NFS4_DUP_REPLAY: 207 /* grab it. */ 208 rfs4_dr_chstate(drp_tail, NFS4_DUP_FREE); 209 DTRACE_PROBE1(nfss__i__drc_replayclaim, 210 rfs4_dupreq_t *, drp_tail); 211 return (drp_tail); 212 /* NOTREACHED */ 213 } 214 } 215 DTRACE_PROBE1(nfss__i__drc_full, rfs4_drc_t *, drc); 216 return (NULL); 217 } 218 219 /* 220 * rfs4_find_dr: 221 * 222 * Search for an entry in the duplicate request cache by 223 * calculating the hash index based on the XID, and examining 224 * the entries in the hash bucket. If we find a match stamp the 225 * time_used and return. If the entry does not match it could be 226 * ready to be freed. Once we have searched the bucket we call 227 * rfs4_alloc_dr() to allocate a new entry, or reuse one that is 228 * available. 229 */ 230 int 231 rfs4_find_dr(struct svc_req *req, rfs4_drc_t *drc, rfs4_dupreq_t **dup) 232 { 233 234 uint32_t the_xid; 235 list_t *dr_bkt; 236 rfs4_dupreq_t *drp; 237 int bktdex; 238 239 /* 240 * Get the XID, calculate the bucket and search to 241 * see if we need to replay from the cache. 242 */ 243 the_xid = req->rq_xprt->xp_xid; 244 bktdex = the_xid % drc->dr_hash; 245 246 dr_bkt = (list_t *) 247 &(drc->dr_buckets[(the_xid % drc->dr_hash)]); 248 249 DTRACE_PROBE3(nfss__i__drc_bktdex, 250 int, bktdex, 251 uint32_t, the_xid, 252 list_t *, dr_bkt); 253 254 *dup = NULL; 255 256 mutex_enter(&drc->lock); 257 /* 258 * Search the bucket for a matching xid and address. 259 */ 260 for (drp = list_head(dr_bkt); drp != NULL; 261 drp = list_next(dr_bkt, drp)) { 262 263 if (drp->dr_xid == the_xid && 264 drp->dr_addr.len == req->rq_xprt->xp_rtaddr.len && 265 bcmp((caddr_t)drp->dr_addr.buf, 266 (caddr_t)req->rq_xprt->xp_rtaddr.buf, 267 drp->dr_addr.len) == 0) { 268 269 /* 270 * Found a match so REPLAY the Reply 271 */ 272 if (drp->dr_state == NFS4_DUP_REPLAY) { 273 rfs4_dr_chstate(drp, NFS4_DUP_INUSE); 274 mutex_exit(&drc->lock); 275 *dup = drp; 276 DTRACE_PROBE1(nfss__i__drc_replay, 277 rfs4_dupreq_t *, drp); 278 return (NFS4_DUP_REPLAY); 279 } 280 281 /* 282 * This entry must be in transition, so return 283 * the 'pending' status. 284 */ 285 mutex_exit(&drc->lock); 286 return (NFS4_DUP_PENDING); 287 } 288 289 /* 290 * Not a match, but maybe this entry is okay 291 * to be reused. 292 */ 293 if (drp->dr_state == NFS4_DUP_REPLAY) { 294 rfs4_dr_chstate(drp, NFS4_DUP_FREE); 295 list_insert_tail(&(drp->drc->dr_cache), drp); 296 } 297 } 298 299 drp = rfs4_alloc_dr(drc); 300 mutex_exit(&drc->lock); 301 302 /* 303 * The DRC is full and all entries are in use. Upper function 304 * should error out this request and force the client to 305 * retransmit -- effectively this is a resource issue. NFSD 306 * threads tied up with native File System, or the cache size 307 * is too small for the server load. 308 */ 309 if (drp == NULL) 310 return (NFS4_DUP_ERROR); 311 312 /* 313 * Init the state to NEW and clear the time used field. 314 */ 315 drp->dr_state = NFS4_DUP_NEW; 316 drp->dr_time_used.tv_sec = drp->dr_time_used.tv_nsec = 0; 317 318 /* 319 * If needed, resize the address buffer 320 */ 321 if (drp->dr_addr.maxlen < req->rq_xprt->xp_rtaddr.len) { 322 if (drp->dr_addr.buf != NULL) 323 kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen); 324 drp->dr_addr.maxlen = req->rq_xprt->xp_rtaddr.len; 325 drp->dr_addr.buf = kmem_alloc(drp->dr_addr.maxlen, KM_NOSLEEP); 326 if (drp->dr_addr.buf == NULL) { 327 /* 328 * If the malloc fails, mark the entry 329 * as free and put on the tail. 330 */ 331 drp->dr_addr.maxlen = 0; 332 drp->dr_state = NFS4_DUP_FREE; 333 mutex_enter(&drc->lock); 334 list_insert_tail(&(drc->dr_cache), drp); 335 mutex_exit(&drc->lock); 336 return (NFS4_DUP_ERROR); 337 } 338 } 339 340 341 /* 342 * Copy the address. 343 */ 344 drp->dr_addr.len = req->rq_xprt->xp_rtaddr.len; 345 346 bcopy((caddr_t)req->rq_xprt->xp_rtaddr.buf, 347 (caddr_t)drp->dr_addr.buf, 348 drp->dr_addr.len); 349 350 drp->dr_xid = the_xid; 351 drp->dr_bkt = dr_bkt; 352 353 /* 354 * Insert at the head of the bucket and 355 * the drc lists.. 356 */ 357 mutex_enter(&drc->lock); 358 list_insert_head(&drc->dr_cache, drp); 359 list_insert_head(dr_bkt, drp); 360 mutex_exit(&drc->lock); 361 362 *dup = drp; 363 364 return (NFS4_DUP_NEW); 365 } 366 367 /* 368 * 369 * This function handles the duplicate request cache, 370 * NULL_PROC and COMPOUND procedure calls for NFSv4; 371 * 372 * Passed into this function are:- 373 * 374 * disp A pointer to our dispatch table entry 375 * req The request to process 376 * xprt The server transport handle 377 * ap A pointer to the arguments 378 * 379 * 380 * When appropriate this function is responsible for inserting 381 * the reply into the duplicate cache or replaying an existing 382 * cached reply. 383 * 384 * dr_stat reflects the state of the duplicate request that 385 * has been inserted into or retrieved from the cache 386 * 387 * drp is the duplicate request entry 388 * 389 */ 390 int 391 rfs4_dispatch(struct rpcdisp *disp, struct svc_req *req, 392 SVCXPRT *xprt, char *ap) 393 { 394 395 COMPOUND4res res_buf, *rbp; 396 COMPOUND4args *cap; 397 398 cred_t *cr = NULL; 399 int error = 0; 400 int dis_flags = 0; 401 int dr_stat = NFS4_NOT_DUP; 402 rfs4_dupreq_t *drp = NULL; 403 404 ASSERT(disp); 405 406 /* 407 * Short circuit the RPC_NULL proc. 408 */ 409 if (disp->dis_proc == rpc_null) { 410 DTRACE_NFSV4_1(null__start, struct svc_req *, req); 411 if (!svc_sendreply(xprt, xdr_void, NULL)) { 412 DTRACE_NFSV4_1(null__done, struct svc_req *, req); 413 return (1); 414 } 415 DTRACE_NFSV4_1(null__done, struct svc_req *, req); 416 return (0); 417 } 418 419 /* Only NFSv4 Compounds from this point onward */ 420 421 rbp = &res_buf; 422 cap = (COMPOUND4args *)ap; 423 424 /* 425 * Figure out the disposition of the whole COMPOUND 426 * and record it's IDEMPOTENTCY. 427 */ 428 rfs4_compound_flagproc(cap, &dis_flags); 429 430 /* 431 * If NON-IDEMPOTENT then we need to figure out if this 432 * request can be replied from the duplicate cache. 433 * 434 * If this is a new request then we need to insert the 435 * reply into the duplicate cache. 436 */ 437 if (!(dis_flags & RPC_IDEMPOTENT)) { 438 /* look for a replay from the cache or allocate */ 439 dr_stat = rfs4_find_dr(req, nfs4_drc, &drp); 440 441 switch (dr_stat) { 442 443 case NFS4_DUP_ERROR: 444 svcerr_systemerr(xprt); 445 return (1); 446 /* NOTREACHED */ 447 448 case NFS4_DUP_PENDING: 449 /* 450 * reply has previously been inserted into the 451 * duplicate cache, however the reply has 452 * not yet been sent via svc_sendreply() 453 */ 454 return (1); 455 /* NOTREACHED */ 456 457 case NFS4_DUP_NEW: 458 curthread->t_flag |= T_DONTPEND; 459 /* NON-IDEMPOTENT proc call */ 460 rfs4_compound(cap, rbp, NULL, req, cr); 461 462 curthread->t_flag &= ~T_DONTPEND; 463 464 /* 465 * dr_res must be initialized before calling 466 * rfs4_dr_chstate (it frees the reply). 467 */ 468 drp->dr_res = res_buf; 469 if (curthread->t_flag & T_WOULDBLOCK) { 470 curthread->t_flag &= ~T_WOULDBLOCK; 471 /* 472 * mark this entry as FREE and plop 473 * on the end of the cache list 474 */ 475 mutex_enter(&drp->drc->lock); 476 rfs4_dr_chstate(drp, NFS4_DUP_FREE); 477 list_insert_tail(&(drp->drc->dr_cache), drp); 478 mutex_exit(&drp->drc->lock); 479 return (1); 480 } 481 break; 482 483 case NFS4_DUP_REPLAY: 484 /* replay from the cache */ 485 rbp = &(drp->dr_res); 486 break; 487 } 488 } else { 489 curthread->t_flag |= T_DONTPEND; 490 /* IDEMPOTENT proc call */ 491 rfs4_compound(cap, rbp, NULL, req, cr); 492 493 curthread->t_flag &= ~T_DONTPEND; 494 if (curthread->t_flag & T_WOULDBLOCK) { 495 curthread->t_flag &= ~T_WOULDBLOCK; 496 return (1); 497 } 498 } 499 500 /* 501 * Send out the replayed reply or the 'real' one. 502 */ 503 if (!svc_sendreply(xprt, xdr_COMPOUND4res_srv, (char *)rbp)) { 504 DTRACE_PROBE2(nfss__e__dispatch_sendfail, 505 struct svc_req *, xprt, 506 char *, rbp); 507 error++; 508 } 509 510 /* 511 * If this reply was just inserted into the duplicate cache 512 * or it was replayed from the dup cache; (re)mark it as 513 * available for replay 514 * 515 * At first glance, this 'if' statement seems a little strange; 516 * testing for NFS4_DUP_REPLAY, and then calling... 517 * 518 * rfs4_dr_chatate(NFS4_DUP_REPLAY) 519 * 520 * ... but notice that we are checking dr_stat, and not the 521 * state of the entry itself, the entry will be NFS4_DUP_INUSE, 522 * we do that so that we know not to prematurely reap it whilst 523 * we resent it to the client. 524 * 525 */ 526 if (dr_stat == NFS4_DUP_NEW || dr_stat == NFS4_DUP_REPLAY) { 527 mutex_enter(&drp->drc->lock); 528 rfs4_dr_chstate(drp, NFS4_DUP_REPLAY); 529 mutex_exit(&drp->drc->lock); 530 } else if (dr_stat == NFS4_NOT_DUP) { 531 rfs4_compound_free(rbp); 532 } 533 534 return (error); 535 } 536 537 bool_t 538 rfs4_minorvers_mismatch(struct svc_req *req, SVCXPRT *xprt, void *args) 539 { 540 COMPOUND4args *argsp; 541 COMPOUND4res res_buf, *resp; 542 543 if (req->rq_vers != 4) 544 return (FALSE); 545 546 argsp = (COMPOUND4args *)args; 547 548 if (argsp->minorversion <= NFS4_MAX_MINOR_VERSION) 549 return (FALSE); 550 551 resp = &res_buf; 552 553 /* 554 * Form a reply tag by copying over the reqeuest tag. 555 */ 556 resp->tag.utf8string_val = 557 kmem_alloc(argsp->tag.utf8string_len, KM_SLEEP); 558 resp->tag.utf8string_len = argsp->tag.utf8string_len; 559 bcopy(argsp->tag.utf8string_val, resp->tag.utf8string_val, 560 resp->tag.utf8string_len); 561 resp->array_len = 0; 562 resp->array = NULL; 563 resp->status = NFS4ERR_MINOR_VERS_MISMATCH; 564 if (!svc_sendreply(xprt, xdr_COMPOUND4res_srv, (char *)resp)) { 565 DTRACE_PROBE2(nfss__e__minorvers_mismatch, 566 SVCXPRT *, xprt, char *, resp); 567 } 568 rfs4_compound_free(resp); 569 return (TRUE); 570 } 571