/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright 2018 Nexenta Systems, Inc.
 * Copyright 2020 RackTop Systems, Inc.
 */

#include <sys/systm.h>
#include <sys/sdt.h>
#include <rpc/types.h>
#include <rpc/auth.h>
#include <rpc/auth_unix.h>
#include <rpc/auth_des.h>
#include <rpc/svc.h>
#include <rpc/xdr.h>
#include <nfs/nfs4.h>
#include <nfs/nfs_dispatch.h>
#include <nfs/nfs4_drc.h>

/*
 * The default size of the duplicate request cache.
 */
uint32_t nfs4_drc_max = 8 * 1024;

/*
 * The number of buckets the cached replies are hashed into.
 * Do not change this on the fly.
 */
uint32_t nfs4_drc_hash = 541;

static void rfs4_resource_err(struct svc_req *req, COMPOUND4args *argsp);

/*
 * Initialize a duplicate request cache.
 */
rfs4_drc_t *
rfs4_init_drc(uint32_t drc_size, uint32_t drc_hash_size)
{
        rfs4_drc_t *drc;
        uint32_t bki;

        ASSERT(drc_size);
        ASSERT(drc_hash_size);

        drc = kmem_alloc(sizeof (rfs4_drc_t), KM_SLEEP);

        drc->max_size = drc_size;
        drc->in_use = 0;

        mutex_init(&drc->lock, NULL, MUTEX_DEFAULT, NULL);

        drc->dr_hash = drc_hash_size;

        drc->dr_buckets = kmem_alloc(sizeof (list_t) * drc_hash_size,
            KM_SLEEP);

        for (bki = 0; bki < drc_hash_size; bki++) {
                list_create(&drc->dr_buckets[bki], sizeof (rfs4_dupreq_t),
                    offsetof(rfs4_dupreq_t, dr_bkt_next));
        }

        list_create(&(drc->dr_cache), sizeof (rfs4_dupreq_t),
            offsetof(rfs4_dupreq_t, dr_next));

        return (drc);
}

/*
 * Destroy a duplicate request cache.
 */
void
rfs4_fini_drc(void)
{
        nfs4_srv_t *nsrv4 = nfs4_get_srv();
        rfs4_drc_t *drc = nsrv4->nfs4_drc;
        rfs4_dupreq_t *drp, *drp_next;

        /* iterate over the dr_cache and free the entries */
        for (drp = list_head(&(drc->dr_cache)); drp != NULL; drp = drp_next) {

                if (drp->dr_state == NFS4_DUP_REPLAY)
                        rfs4_compound_free(&(drp->dr_res));

                if (drp->dr_addr.buf != NULL)
                        kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);

                drp_next = list_next(&(drc->dr_cache), drp);

                kmem_free(drp, sizeof (rfs4_dupreq_t));
        }

        mutex_destroy(&drc->lock);
        kmem_free(drc->dr_buckets, sizeof (list_t) * drc->dr_hash);
        kmem_free(drc, sizeof (rfs4_drc_t));
}
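/*
 * Usage sketch (added commentary, not from the original source): the
 * server is expected to create one global DRC at startup and destroy
 * it at shutdown, roughly along these lines:
 *
 *      nsrv4->nfs4_drc = rfs4_init_drc(nfs4_drc_max, nfs4_drc_hash);
 *      ...
 *      rfs4_fini_drc();
 *
 * The actual call sites live in the server state setup/teardown code
 * and may differ; this only illustrates the intended init/fini pairing
 * and the use of the nfs4_drc_max and nfs4_drc_hash tunables.
 */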
/*
 * rfs4_dr_chstate:
 *
 * Change the state of a rfs4_dupreq. If the new state is anything
 * other than NFS4_DUP_FREE, just record it and return. If we are
 * moving to the FREE state, free the compound results and remove the
 * entry from the bucket and cache lists; callers reinsert it as
 * appropriate (at the tail when parking it, at the head when reusing
 * it).
 */
void
rfs4_dr_chstate(rfs4_dupreq_t *drp, int new_state)
{
        rfs4_drc_t *drc;

        ASSERT(drp);
        ASSERT(drp->drc);
        ASSERT(drp->dr_bkt);
        ASSERT(MUTEX_HELD(&drp->drc->lock));

        drp->dr_state = new_state;

        if (new_state != NFS4_DUP_FREE)
                return;

        drc = drp->drc;

        /*
         * Remove the entry from the bucket and
         * dr_cache list, free the compound results.
         */
        list_remove(drp->dr_bkt, drp);
        list_remove(&(drc->dr_cache), drp);
        rfs4_compound_free(&(drp->dr_res));
}

/*
 * rfs4_alloc_dr:
 *
 * Allocate a new entry if we have not yet reached the maximum cache
 * size; otherwise traverse the cache list from the tail and reuse the
 * first entry found in the NFS4_DUP_FREE or NFS4_DUP_REPLAY state.
 */
rfs4_dupreq_t *
rfs4_alloc_dr(rfs4_drc_t *drc)
{
        rfs4_dupreq_t *drp_tail, *drp = NULL;

        ASSERT(drc);
        ASSERT(MUTEX_HELD(&drc->lock));

        /*
         * Have we hit the cache limit yet?
         */
        if (drc->in_use < drc->max_size) {
                /*
                 * Nope, so allocate a new one.
                 */
                drp = kmem_zalloc(sizeof (rfs4_dupreq_t), KM_SLEEP);
                drp->drc = drc;
                drc->in_use++;
                DTRACE_PROBE1(nfss__i__drc_new, rfs4_dupreq_t *, drp);
                return (drp);
        }

        /*
         * The cache is fully allocated; traverse the list
         * backwards to find an entry we can reuse.
         */
        for (drp_tail = list_tail(&drc->dr_cache); drp_tail != NULL;
            drp_tail = list_prev(&drc->dr_cache, drp_tail)) {

                switch (drp_tail->dr_state) {

                case NFS4_DUP_FREE:
                        list_remove(&(drc->dr_cache), drp_tail);
                        DTRACE_PROBE1(nfss__i__drc_freeclaim,
                            rfs4_dupreq_t *, drp_tail);
                        return (drp_tail);
                        /* NOTREACHED */

                case NFS4_DUP_REPLAY:
                        /* grab it. */
                        rfs4_dr_chstate(drp_tail, NFS4_DUP_FREE);
                        DTRACE_PROBE1(nfss__i__drc_replayclaim,
                            rfs4_dupreq_t *, drp_tail);
                        return (drp_tail);
                        /* NOTREACHED */
                }
        }
        DTRACE_PROBE1(nfss__i__drc_full, rfs4_drc_t *, drc);
        return (NULL);
}
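/*
 * Entry lifecycle summary (added commentary, derived from the code in
 * this file): an rfs4_dupreq_t moves through these states:
 *
 *      NFS4_DUP_NEW    - allocated by rfs4_find_dr(); reply being built
 *      NFS4_DUP_INUSE  - cached reply is being resent to the client
 *      NFS4_DUP_REPLAY - reply cached; may be replayed or reclaimed
 *      NFS4_DUP_FREE   - results freed; parked at the tail for reuse
 *
 * NFS4_DUP_PENDING and NFS4_DUP_ERROR are status values returned by
 * rfs4_find_dr(), and NFS4_NOT_DUP marks requests that bypass the
 * cache entirely; none of these is an entry state.
 */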
/*
 * rfs4_find_dr:
 *
 * Search for an entry in the duplicate request cache by calculating
 * the hash index based on the XID and examining the entries in that
 * hash bucket. If we find a match, return its status. Otherwise call
 * rfs4_alloc_dr() to allocate a new entry or reuse an available one.
 */
int
rfs4_find_dr(struct svc_req *req, rfs4_drc_t *drc, rfs4_dupreq_t **dup)
{
        uint32_t the_xid;
        list_t *dr_bkt;
        rfs4_dupreq_t *drp;
        int bktdex;

        /*
         * Get the XID, calculate the bucket and search to
         * see if we need to replay from the cache.
         */
        the_xid = req->rq_xprt->xp_xid;
        bktdex = the_xid % drc->dr_hash;

        dr_bkt = &(drc->dr_buckets[bktdex]);

        DTRACE_PROBE3(nfss__i__drc_bktdex,
            int, bktdex,
            uint32_t, the_xid,
            list_t *, dr_bkt);

        *dup = NULL;

        mutex_enter(&drc->lock);
        /*
         * Search the bucket for a matching xid and address.
         */
        for (drp = list_head(dr_bkt); drp != NULL;
            drp = list_next(dr_bkt, drp)) {

                if (drp->dr_xid == the_xid &&
                    drp->dr_addr.len == req->rq_xprt->xp_rtaddr.len &&
                    bcmp((caddr_t)drp->dr_addr.buf,
                    (caddr_t)req->rq_xprt->xp_rtaddr.buf,
                    drp->dr_addr.len) == 0) {

                        /*
                         * Found a match, so REPLAY the reply.
                         */
                        if (drp->dr_state == NFS4_DUP_REPLAY) {
                                rfs4_dr_chstate(drp, NFS4_DUP_INUSE);
                                mutex_exit(&drc->lock);
                                *dup = drp;
                                DTRACE_PROBE1(nfss__i__drc_replay,
                                    rfs4_dupreq_t *, drp);
                                return (NFS4_DUP_REPLAY);
                        }

                        /*
                         * This entry must be in transition, so return
                         * the 'pending' status.
                         */
                        mutex_exit(&drc->lock);
                        return (NFS4_DUP_PENDING);
                }
        }

        drp = rfs4_alloc_dr(drc);
        mutex_exit(&drc->lock);

        /*
         * The DRC is full and all entries are in use. The caller should
         * error out this request and force the client to retransmit --
         * effectively this is a resource issue: NFSD threads are tied up
         * in the underlying file system, or the cache size is too small
         * for the server load.
         */
        if (drp == NULL)
                return (NFS4_DUP_ERROR);

        /*
         * Init the state to NEW.
         */
        drp->dr_state = NFS4_DUP_NEW;

        /*
         * If needed, resize the address buffer.
         */
        if (drp->dr_addr.maxlen < req->rq_xprt->xp_rtaddr.len) {
                if (drp->dr_addr.buf != NULL)
                        kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
                drp->dr_addr.maxlen = req->rq_xprt->xp_rtaddr.len;
                drp->dr_addr.buf = kmem_alloc(drp->dr_addr.maxlen, KM_NOSLEEP);
                if (drp->dr_addr.buf == NULL) {
                        /*
                         * If the allocation fails, mark the entry
                         * as free and put it on the tail.
                         */
                        drp->dr_addr.maxlen = 0;
                        drp->dr_state = NFS4_DUP_FREE;
                        mutex_enter(&drc->lock);
                        list_insert_tail(&(drc->dr_cache), drp);
                        mutex_exit(&drc->lock);
                        return (NFS4_DUP_ERROR);
                }
        }

        /*
         * Copy the address.
         */
        drp->dr_addr.len = req->rq_xprt->xp_rtaddr.len;

        bcopy((caddr_t)req->rq_xprt->xp_rtaddr.buf,
            (caddr_t)drp->dr_addr.buf,
            drp->dr_addr.len);

        drp->dr_xid = the_xid;
        drp->dr_bkt = dr_bkt;

        /*
         * Insert at the head of the bucket and
         * the dr_cache lists.
         */
        mutex_enter(&drc->lock);
        list_insert_head(&drc->dr_cache, drp);
        list_insert_head(dr_bkt, drp);
        mutex_exit(&drc->lock);

        *dup = drp;

        return (NFS4_DUP_NEW);
}
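/*
 * Worked example (added commentary): with the default nfs4_drc_hash of
 * 541 buckets, a request with XID 74565 hashes to bucket
 * 74565 % 541 = 448. A retransmission carries the same XID, so it
 * hashes to the same bucket, where it is matched by XID plus the
 * client's transport address.
 */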
/*
 * This function handles the duplicate request cache and COMPOUND
 * procedure calls for NFSv4.0; the NULL procedure is answered in
 * rfs4_dispatch(), and the 4.x (x > 0) case is handled in
 * rfs4x_dispatch().
 *
 * Passed into this function are:
 *
 *      req     The request to process
 *      xprt    The server transport handle
 *      ap      A pointer to the arguments
 *
 * When appropriate this function is responsible for inserting
 * the reply into the duplicate cache or replaying an existing
 * cached reply.
 *
 * dr_stat reflects the state of the duplicate request that
 * has been inserted into or retrieved from the cache.
 *
 * drp is the duplicate request entry.
 */
int
rfs40_dispatch(struct svc_req *req, SVCXPRT *xprt, char *ap)
{
        COMPOUND4res res_buf;
        COMPOUND4res *rbp;
        COMPOUND4args *cap;
        int error = 0;
        int dis_flags = 0;
        int dr_stat = NFS4_NOT_DUP;
        rfs4_dupreq_t *drp = NULL;
        int rv;
        struct compound_state cs;
        nfs4_srv_t *nsrv4 = nfs4_get_srv();
        rfs4_drc_t *nfs4_drc = nsrv4->nfs4_drc;

        /* Only NFSv4 Compounds from this point onward */

        rbp = &res_buf;
        cap = (COMPOUND4args *)ap;

        rfs4_init_compound_state(&cs);

        /*
         * Figure out the disposition of the whole COMPOUND
         * and record its idempotency.
         */
        rfs4_compound_flagproc(cap, &dis_flags);

        /*
         * If NON-IDEMPOTENT then we need to figure out if this
         * request can be replayed from the duplicate cache.
         *
         * If this is a new request then we need to insert the
         * reply into the duplicate cache.
         */
        if (!(dis_flags & RPC_IDEMPOTENT)) {
                /* look for a replay from the cache or allocate */
                dr_stat = rfs4_find_dr(req, nfs4_drc, &drp);

                switch (dr_stat) {

                case NFS4_DUP_ERROR:
                        rfs4_resource_err(req, cap);
                        return (1);
                        /* NOTREACHED */

                case NFS4_DUP_PENDING:
                        /*
                         * The reply has previously been inserted into
                         * the duplicate cache, but it has not yet been
                         * sent via svc_sendreply().
                         */
                        return (1);
                        /* NOTREACHED */

                case NFS4_DUP_NEW:
                        curthread->t_flag |= T_DONTPEND;
                        /* NON-IDEMPOTENT proc call */
                        rfs4_compound(cap, rbp, &cs, req, &rv);
                        curthread->t_flag &= ~T_DONTPEND;

                        rfs4_fini_compound_state(&cs);

                        if (rv) /* short-circuit the sendreply on error */
                                return (rv);

                        /*
                         * dr_res must be initialized before calling
                         * rfs4_dr_chstate (it frees the reply).
                         */
                        drp->dr_res = res_buf;
                        if (curthread->t_flag & T_WOULDBLOCK) {
                                curthread->t_flag &= ~T_WOULDBLOCK;
                                /*
                                 * Mark this entry as FREE and put it
                                 * on the end of the cache list.
                                 */
                                mutex_enter(&drp->drc->lock);
                                rfs4_dr_chstate(drp, NFS4_DUP_FREE);
                                list_insert_tail(&(drp->drc->dr_cache), drp);
                                mutex_exit(&drp->drc->lock);
                                return (1);
                        }
                        break;

                case NFS4_DUP_REPLAY:
                        /* replay from the cache */
                        rbp = &(drp->dr_res);
                        break;
                }
        } else {
                curthread->t_flag |= T_DONTPEND;
                /* IDEMPOTENT proc call */
                rfs4_compound(cap, rbp, &cs, req, &rv);
                curthread->t_flag &= ~T_DONTPEND;

                rfs4_fini_compound_state(&cs);

                if (rv) /* short-circuit the sendreply on error */
                        return (rv);

                if (curthread->t_flag & T_WOULDBLOCK) {
                        curthread->t_flag &= ~T_WOULDBLOCK;
                        return (1);
                }
        }

        /*
         * Send out the replayed reply or the 'real' one.
         */
        if (!svc_sendreply(xprt, xdr_COMPOUND4res_srv, (char *)rbp)) {
                DTRACE_PROBE2(nfss__e__dispatch_sendfail,
                    SVCXPRT *, xprt,
                    char *, rbp);
                svcerr_systemerr(xprt);
                error++;
        }

        /*
         * If this reply was just inserted into the duplicate cache, or
         * it was replayed from the dup cache, (re)mark it as available
         * for replay.
         *
         * At first glance, this 'if' statement seems a little strange;
         * testing for NFS4_DUP_REPLAY, and then calling...
         *
         *      rfs4_dr_chstate(drp, NFS4_DUP_REPLAY)
         *
         * ...but notice that we are checking dr_stat, and not the
         * state of the entry itself; the entry will be NFS4_DUP_INUSE.
         * We do that so that we know not to prematurely reap it while
         * we resend the reply to the client.
         */
        if (dr_stat == NFS4_DUP_NEW || dr_stat == NFS4_DUP_REPLAY) {
                mutex_enter(&drp->drc->lock);
                rfs4_dr_chstate(drp, NFS4_DUP_REPLAY);
                mutex_exit(&drp->drc->lock);
        } else if (dr_stat == NFS4_NOT_DUP) {
                rfs4_compound_free(rbp);
        }

        return (error);
}
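/*
 * Added note on the t_flag handling above: T_DONTPEND is set around the
 * COMPOUND processing so that, instead of sleeping indefinitely on a
 * resource shortage, the thread returns with T_WOULDBLOCK set. In that
 * case no reply is sent (for a cached request the entry is marked FREE
 * and parked at the tail), and the client is expected to retransmit.
 * The same convention appears in the common NFS dispatch code.
 */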
static int
rfs4_send_minor_mismatch(SVCXPRT *xprt, COMPOUND4args *argsp)
{
        COMPOUND4res res_buf, *resp;
        int err = 0;

        resp = &res_buf;

        /*
         * Form a reply tag by copying over the request tag.
         */
        resp->tag.utf8string_len = argsp->tag.utf8string_len;
        if (argsp->tag.utf8string_len != 0) {
                resp->tag.utf8string_val =
                    kmem_alloc(argsp->tag.utf8string_len, KM_SLEEP);
                bcopy(argsp->tag.utf8string_val, resp->tag.utf8string_val,
                    resp->tag.utf8string_len);
        } else {
                resp->tag.utf8string_val = NULL;
        }
        resp->array_len = 0;
        resp->array = NULL;
        resp->status = NFS4ERR_MINOR_VERS_MISMATCH;
        if (!svc_sendreply(xprt, xdr_COMPOUND4res_srv, (char *)resp)) {
                DTRACE_PROBE2(nfss__e__minorvers_mismatch,
                    SVCXPRT *, xprt, char *, resp);
                svcerr_systemerr(xprt);
                err = 1;
        }
        rfs4_compound_free(resp);
        return (err);
}

/*
 * Test minor version against allowed minor versions.
 */
static inline bool_t
rfs4_minorversion_enabled(uint32_t minorversion)
{
        return (minorversion <= nfs4_get_srv()->nfs4_minor_max);
}

bool_t
rfs4_minorvers_mismatch(struct svc_req *req, SVCXPRT *xprt, void *args)
{
        COMPOUND4args *argsp;

        if (req->rq_vers != 4)
                return (FALSE);

        argsp = (COMPOUND4args *)args;

        if (rfs4_minorversion_enabled(argsp->minorversion))
                return (FALSE);

        (void) rfs4_send_minor_mismatch(xprt, argsp);
        return (TRUE);
}
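/*
 * Example (added commentary): with nfs4_minor_max set to 1, COMPOUND
 * requests with minorversion 0 or 1 are dispatched normally, while a
 * request with minorversion 2 gets a reply carrying
 * NFS4ERR_MINOR_VERS_MISMATCH and an empty result array, matching the
 * protocol requirement that no operations be processed in that case.
 */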
void
rfs4_resource_err(struct svc_req *req, COMPOUND4args *argsp)
{
        COMPOUND4res res_buf, *rbp;
        nfs_resop4 *resop;
        PUTFH4res *resp;

        rbp = &res_buf;

        /*
         * Form a reply tag by copying over the request tag.
         */
        rbp->tag.utf8string_len = argsp->tag.utf8string_len;
        if (argsp->tag.utf8string_len != 0) {
                rbp->tag.utf8string_val =
                    kmem_alloc(argsp->tag.utf8string_len, KM_SLEEP);
                bcopy(argsp->tag.utf8string_val, rbp->tag.utf8string_val,
                    rbp->tag.utf8string_len);
        } else {
                rbp->tag.utf8string_val = NULL;
        }

        rbp->array_len = 1;
        rbp->array = kmem_zalloc(rbp->array_len * sizeof (nfs_resop4),
            KM_SLEEP);
        resop = &rbp->array[0];
        resop->resop = argsp->array[0].argop;   /* copy first op over */

        /* Any op will do; we only need access to the status field */
        resp = &resop->nfs_resop4_u.opputfh;

        /*
         * NFS4ERR_RESOURCE is allowed for all ops, except OP_ILLEGAL.
         * Note that all op numbers in the compound array were already
         * validated by the XDR decoder (xdr_COMPOUND4args_srv()).
         */
        resp->status = (resop->resop == OP_ILLEGAL ?
            NFS4ERR_OP_ILLEGAL : NFS4ERR_RESOURCE);

        /* compound status is same as first op status */
        rbp->status = resp->status;

        if (!svc_sendreply(req->rq_xprt, xdr_COMPOUND4res_srv, (char *)rbp)) {
                DTRACE_PROBE2(nfss__rsrc_err__sendfail,
                    SVCXPRT *, req->rq_xprt, char *, rbp);
                svcerr_systemerr(req->rq_xprt);
        }

        UTF8STRING_FREE(rbp->tag);
        kmem_free(rbp->array, rbp->array_len * sizeof (nfs_resop4));
}

int
rfs4_dispatch(struct rpcdisp *disp __unused, struct svc_req *req,
    SVCXPRT *xprt, char *ap)
{
        COMPOUND4args *cmp;

        /*
         * Handle the NULL proc here.
         */
        if (req->rq_proc == RFS_NULL) {
                return (!svc_sendreply(xprt, xdr_void, NULL));
        }

        cmp = (COMPOUND4args *)ap;
        ASSERT(cmp != NULL);

        if (!rfs4_minorversion_enabled(cmp->minorversion))
                return (rfs4_send_minor_mismatch(xprt, cmp));

        if (cmp->minorversion == 0)
                return (rfs40_dispatch(req, xprt, ap));

        return (rfs4x_dispatch(req, xprt, ap));
}
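/*
 * Added summary of the dispatch split implemented above:
 *
 *      RFS_NULL                        -> empty void reply
 *      minorversion > nfs4_minor_max   -> NFS4ERR_MINOR_VERS_MISMATCH
 *      minorversion == 0               -> rfs40_dispatch(), XID-based DRC
 *      minorversion >= 1               -> rfs4x_dispatch(), where NFSv4.1+
 *                                         sessions provide replay semantics
 */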