/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright 2018 Nexenta Systems, Inc.
 * Copyright 2020 RackTop Systems, Inc.
 */

#include <sys/systm.h>
#include <sys/sdt.h>
#include <rpc/types.h>
#include <rpc/auth.h>
#include <rpc/auth_unix.h>
#include <rpc/auth_des.h>
#include <rpc/svc.h>
#include <rpc/xdr.h>
#include <nfs/nfs4.h>
#include <nfs/nfs_dispatch.h>
#include <nfs/nfs4_drc.h>

/*
 * The default size of the duplicate request cache
 */
uint32_t nfs4_drc_max = 8 * 1024;

/*
 * The number of buckets we'd like to hash the replies into.
 * Do not change this on the fly.
 */
uint32_t nfs4_drc_hash = 541;

static void rfs4_resource_err(struct svc_req *req, COMPOUND4args *argsp);

/*
 * Initialize a duplicate request cache.
 */
rfs4_drc_t *
rfs4_init_drc(uint32_t drc_size, uint32_t drc_hash_size)
{
	rfs4_drc_t *drc;
	uint32_t bki;

	ASSERT(drc_size);
	ASSERT(drc_hash_size);

	drc = kmem_alloc(sizeof (rfs4_drc_t), KM_SLEEP);

	drc->max_size = drc_size;
	drc->in_use = 0;

	mutex_init(&drc->lock, NULL, MUTEX_DEFAULT, NULL);

	drc->dr_hash = drc_hash_size;

	drc->dr_buckets = kmem_alloc(sizeof (list_t) * drc_hash_size,
	    KM_SLEEP);

	for (bki = 0; bki < drc_hash_size; bki++) {
		list_create(&drc->dr_buckets[bki], sizeof (rfs4_dupreq_t),
		    offsetof(rfs4_dupreq_t, dr_bkt_next));
	}

	list_create(&(drc->dr_cache), sizeof (rfs4_dupreq_t),
	    offsetof(rfs4_dupreq_t, dr_next));

	return (drc);
}

/*
 * Destroy a duplicate request cache.
 */
void
rfs4_fini_drc(void)
{
	nfs4_srv_t *nsrv4 = nfs4_get_srv();
	rfs4_drc_t *drc = nsrv4->nfs4_drc;
	rfs4_dupreq_t *drp, *drp_next;

	/* iterate over the dr_cache and free the entries */
	for (drp = list_head(&(drc->dr_cache)); drp != NULL; drp = drp_next) {

		if (drp->dr_state == NFS4_DUP_REPLAY)
			rfs4_compound_free(&(drp->dr_res));

		if (drp->dr_addr.buf != NULL)
			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);

		drp_next = list_next(&(drc->dr_cache), drp);

		kmem_free(drp, sizeof (rfs4_dupreq_t));
	}

	mutex_destroy(&drc->lock);
	kmem_free(drc->dr_buckets, sizeof (list_t) * drc->dr_hash);
	kmem_free(drc, sizeof (rfs4_drc_t));
}
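/*
 * A dupreq entry moves through the following states (as implemented
 * by rfs4_find_dr(), rfs4_alloc_dr() and rfs40_dispatch() below):
 *
 *	NFS4_DUP_NEW	reply is being constructed for a new request
 *	NFS4_DUP_INUSE	cached reply is being resent; do not reap
 *	NFS4_DUP_REPLAY	reply is cached and available for replay
 *	NFS4_DUP_FREE	entry is reclaimable by rfs4_alloc_dr()
 *
 * NFS4_DUP_PENDING is a return status, not an entry state: it means
 * a matching entry exists but its reply has not yet been sent.
 */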
/*
 * rfs4_dr_chstate:
 *
 * Change the state of a rfs4_dupreq. If it is not moving to the
 * FREE state, just record the new state and return. If we are
 * moving to the FREE state, clean up the compound results and
 * remove the entry from the bucket and dr_cache lists; the caller
 * may then reinsert it at the tail.
 */
void
rfs4_dr_chstate(rfs4_dupreq_t *drp, int new_state)
{
	rfs4_drc_t *drc;

	ASSERT(drp);
	ASSERT(drp->drc);
	ASSERT(drp->dr_bkt);
	ASSERT(MUTEX_HELD(&drp->drc->lock));

	drp->dr_state = new_state;

	if (new_state != NFS4_DUP_FREE)
		return;

	drc = drp->drc;

	/*
	 * Remove entry from the bucket and
	 * dr_cache list, free compound results.
	 */
	list_remove(drp->dr_bkt, drp);
	list_remove(&(drc->dr_cache), drp);
	rfs4_compound_free(&(drp->dr_res));
}

/*
 * rfs4_alloc_dr:
 *
 * Allocate a new entry if we have not reached the maximum cache
 * limit; otherwise walk the list from the tail and reuse the first
 * entry found in the NFS4_DUP_FREE or NFS4_DUP_REPLAY state.
 */
rfs4_dupreq_t *
rfs4_alloc_dr(rfs4_drc_t *drc)
{
	rfs4_dupreq_t *drp_tail, *drp = NULL;

	ASSERT(drc);
	ASSERT(MUTEX_HELD(&drc->lock));

	/*
	 * Have we hit the cache limit yet?
	 */
	if (drc->in_use < drc->max_size) {
		/*
		 * No, so allocate a new one.
		 */
		drp = kmem_zalloc(sizeof (rfs4_dupreq_t), KM_SLEEP);
		drp->drc = drc;
		drc->in_use++;
		DTRACE_PROBE1(nfss__i__drc_new, rfs4_dupreq_t *, drp);
		return (drp);
	}

	/*
	 * The cache is fully allocated; traverse the list
	 * backwards to find an entry we can reuse.
	 */
	for (drp_tail = list_tail(&drc->dr_cache); drp_tail != NULL;
	    drp_tail = list_prev(&drc->dr_cache, drp_tail)) {

		switch (drp_tail->dr_state) {

		case NFS4_DUP_FREE:
			list_remove(&(drc->dr_cache), drp_tail);
			DTRACE_PROBE1(nfss__i__drc_freeclaim,
			    rfs4_dupreq_t *, drp_tail);
			return (drp_tail);
			/* NOTREACHED */

		case NFS4_DUP_REPLAY:
			/* grab it */
			rfs4_dr_chstate(drp_tail, NFS4_DUP_FREE);
			DTRACE_PROBE1(nfss__i__drc_replayclaim,
			    rfs4_dupreq_t *, drp_tail);
			return (drp_tail);
			/* NOTREACHED */
		}
	}
	DTRACE_PROBE1(nfss__i__drc_full, rfs4_drc_t *, drc);
	return (NULL);
}
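/*
 * Note: rfs4_find_dr() inserts new entries at the head of dr_cache
 * and rfs4_alloc_dr() scans for reusable ones from the tail, so the
 * dr_cache list approximates LRU order. Entries in the NFS4_DUP_NEW
 * or NFS4_DUP_INUSE states are deliberately skipped and thus never
 * reclaimed while a reply is still being built or resent.
 */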
/*
 * rfs4_find_dr:
 *
 * Search for an entry in the duplicate request cache by
 * calculating the hash index based on the XID, and examining
 * the entries in the hash bucket. If we find a match, return.
 * Once we have searched the bucket we call rfs4_alloc_dr() to
 * allocate a new entry, or reuse one that is available.
 */
int
rfs4_find_dr(struct svc_req *req, rfs4_drc_t *drc, rfs4_dupreq_t **dup)
{
	uint32_t the_xid;
	list_t *dr_bkt;
	rfs4_dupreq_t *drp;
	int bktdex;

	/*
	 * Get the XID, calculate the bucket and search to
	 * see if we need to replay from the cache.
	 */
	the_xid = req->rq_xprt->xp_xid;
	bktdex = the_xid % drc->dr_hash;

	dr_bkt = (list_t *)&(drc->dr_buckets[bktdex]);

	DTRACE_PROBE3(nfss__i__drc_bktdex,
	    int, bktdex,
	    uint32_t, the_xid,
	    list_t *, dr_bkt);

	*dup = NULL;

	mutex_enter(&drc->lock);
	/*
	 * Search the bucket for a matching xid and address.
	 */
	for (drp = list_head(dr_bkt); drp != NULL;
	    drp = list_next(dr_bkt, drp)) {

		if (drp->dr_xid == the_xid &&
		    drp->dr_addr.len == req->rq_xprt->xp_rtaddr.len &&
		    bcmp((caddr_t)drp->dr_addr.buf,
		    (caddr_t)req->rq_xprt->xp_rtaddr.buf,
		    drp->dr_addr.len) == 0) {

			/*
			 * Found a match, so replay the reply.
			 */
			if (drp->dr_state == NFS4_DUP_REPLAY) {
				rfs4_dr_chstate(drp, NFS4_DUP_INUSE);
				mutex_exit(&drc->lock);
				*dup = drp;
				DTRACE_PROBE1(nfss__i__drc_replay,
				    rfs4_dupreq_t *, drp);
				return (NFS4_DUP_REPLAY);
			}

			/*
			 * This entry must be in transition, so return
			 * the 'pending' status.
			 */
			mutex_exit(&drc->lock);
			return (NFS4_DUP_PENDING);
		}
	}

	drp = rfs4_alloc_dr(drc);
	mutex_exit(&drc->lock);

	/*
	 * The DRC is full and all entries are in use. The caller
	 * should error out this request and force the client to
	 * retransmit -- effectively this is a resource issue: NFSD
	 * threads are tied up in the native file system, or the
	 * cache size is too small for the server load.
	 */
	if (drp == NULL)
		return (NFS4_DUP_ERROR);

	/*
	 * Init the state to NEW.
	 */
	drp->dr_state = NFS4_DUP_NEW;

	/*
	 * If needed, resize the address buffer.
	 */
	if (drp->dr_addr.maxlen < req->rq_xprt->xp_rtaddr.len) {
		if (drp->dr_addr.buf != NULL)
			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
		drp->dr_addr.maxlen = req->rq_xprt->xp_rtaddr.len;
		drp->dr_addr.buf = kmem_alloc(drp->dr_addr.maxlen, KM_NOSLEEP);
		if (drp->dr_addr.buf == NULL) {
			/*
			 * If the allocation fails, mark the entry
			 * as free and put it on the tail.
			 */
			drp->dr_addr.maxlen = 0;
			drp->dr_state = NFS4_DUP_FREE;
			mutex_enter(&drc->lock);
			list_insert_tail(&(drc->dr_cache), drp);
			mutex_exit(&drc->lock);
			return (NFS4_DUP_ERROR);
		}
	}

	/*
	 * Copy the address.
	 */
	drp->dr_addr.len = req->rq_xprt->xp_rtaddr.len;

	bcopy((caddr_t)req->rq_xprt->xp_rtaddr.buf,
	    (caddr_t)drp->dr_addr.buf,
	    drp->dr_addr.len);

	drp->dr_xid = the_xid;
	drp->dr_bkt = dr_bkt;

	/*
	 * Insert at the head of the bucket and
	 * the dr_cache lists.
	 */
	mutex_enter(&drc->lock);
	list_insert_head(&drc->dr_cache, drp);
	list_insert_head(dr_bkt, drp);
	mutex_exit(&drc->lock);

	*dup = drp;

	return (NFS4_DUP_NEW);
}
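/*
 * A sketch of the expected caller flow (this is what rfs40_dispatch()
 * below does):
 *
 *	switch (rfs4_find_dr(req, drc, &drp)) {
 *	case NFS4_DUP_NEW:	process the compound, cache the reply;
 *	case NFS4_DUP_REPLAY:	resend the cached drp->dr_res;
 *	case NFS4_DUP_PENDING:	drop; the original is still in progress;
 *	case NFS4_DUP_ERROR:	reply NFS4ERR_RESOURCE, client retries.
 *	}
 */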
/*
 * This function handles the duplicate request cache,
 * NULL_PROC and COMPOUND procedure calls for NFSv4.0;
 * the 4.x where x > 0 case is handled in rfs4x_dispatch().
 *
 * Passed into this function are:
 *
 *	req	The request to process
 *	xprt	The server transport handle
 *	ap	A pointer to the arguments
 *
 * When appropriate this function is responsible for inserting
 * the reply into the duplicate cache or replaying an existing
 * cached reply.
 *
 * dr_stat reflects the state of the duplicate request that
 * has been inserted into or retrieved from the cache.
 *
 * drp is the duplicate request entry.
 */
int
rfs40_dispatch(struct svc_req *req, SVCXPRT *xprt, char *ap)
{
	COMPOUND4res res_buf;
	COMPOUND4res *rbp;
	COMPOUND4args *cap;
	int error = 0;
	int dr_stat = NFS4_NOT_DUP;
	rfs4_dupreq_t *drp = NULL;
	int rv;
	struct compound_state cs;
	nfs4_srv_t *nsrv4 = nfs4_get_srv();
	rfs4_drc_t *nfs4_drc = nsrv4->nfs4_drc;

	/* Only NFSv4 Compounds from this point onward */

	rbp = &res_buf;
	cap = (COMPOUND4args *)ap;

	rfs4_init_compound_state(&cs);

	/*
	 * Figure out the disposition of the whole COMPOUND
	 * and record its idempotency.
	 *
	 * If NON-IDEMPOTENT then we need to figure out if this
	 * request can be replied from the duplicate cache.
	 *
	 * If this is a new request then we need to insert the
	 * reply into the duplicate cache.
	 */
	if (!rfs4_idempotent_req(cap)) {
		/* look for a replay from the cache or allocate */
		dr_stat = rfs4_find_dr(req, nfs4_drc, &drp);

		switch (dr_stat) {

		case NFS4_DUP_ERROR:
			rfs4_resource_err(req, cap);
			return (1);
			/* NOTREACHED */

		case NFS4_DUP_PENDING:
			/*
			 * reply has previously been inserted into the
			 * duplicate cache, however the reply has
			 * not yet been sent via svc_sendreply()
			 */
			return (1);
			/* NOTREACHED */

		case NFS4_DUP_NEW:
			curthread->t_flag |= T_DONTPEND;
			/* NON-IDEMPOTENT proc call */
			rfs4_compound(cap, rbp, &cs, req, &rv);
			curthread->t_flag &= ~T_DONTPEND;

			rfs4_fini_compound_state(&cs);

			if (rv)	/* short-circuit sendreply on error */
				return (rv);

			/*
			 * dr_res must be initialized before calling
			 * rfs4_dr_chstate() (it frees the reply).
			 */
			drp->dr_res = res_buf;
			if (curthread->t_flag & T_WOULDBLOCK) {
				curthread->t_flag &= ~T_WOULDBLOCK;
				/*
				 * mark this entry as FREE and put it
				 * on the end of the cache list
				 */
				mutex_enter(&drp->drc->lock);
				rfs4_dr_chstate(drp, NFS4_DUP_FREE);
				list_insert_tail(&(drp->drc->dr_cache), drp);
				mutex_exit(&drp->drc->lock);
				return (1);
			}
			break;

		case NFS4_DUP_REPLAY:
			/* replay from the cache */
			rbp = &(drp->dr_res);
			break;
		}
	} else {
		curthread->t_flag |= T_DONTPEND;
		/* IDEMPOTENT proc call */
		rfs4_compound(cap, rbp, &cs, req, &rv);
		curthread->t_flag &= ~T_DONTPEND;

		rfs4_fini_compound_state(&cs);

		if (rv)	/* short-circuit sendreply on error */
			return (rv);

		if (curthread->t_flag & T_WOULDBLOCK) {
			curthread->t_flag &= ~T_WOULDBLOCK;
			return (1);
		}
	}

	/*
	 * Send out the replayed reply or the 'real' one.
	 */
	if (!svc_sendreply(xprt, xdr_COMPOUND4res_srv, (char *)rbp)) {
		DTRACE_PROBE2(nfss__e__dispatch_sendfail,
		    struct svc_req *, xprt,
		    char *, rbp);
		svcerr_systemerr(xprt);
		error++;
	}

	/*
	 * If this reply was just inserted into the duplicate cache
	 * or it was replayed from the dup cache, (re)mark it as
	 * available for replay.
	 *
	 * At first glance, this 'if' statement seems a little strange;
	 * testing for NFS4_DUP_REPLAY, and then calling...
	 *
	 *	rfs4_dr_chstate(NFS4_DUP_REPLAY)
	 *
	 * ... but notice that we are checking dr_stat, not the state
	 * of the entry itself. The entry will be NFS4_DUP_INUSE; we
	 * do that so that we know not to prematurely reap it while
	 * we resend it to the client.
	 */
	if (dr_stat == NFS4_DUP_NEW || dr_stat == NFS4_DUP_REPLAY) {
		mutex_enter(&drp->drc->lock);
		rfs4_dr_chstate(drp, NFS4_DUP_REPLAY);
		mutex_exit(&drp->drc->lock);
	} else if (dr_stat == NFS4_NOT_DUP) {
		rfs4_compound_free(rbp);
	}

	return (error);
}
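/*
 * rfs4_send_minor_mismatch:
 *
 * Reply to an unsupported minor version with
 * NFS4ERR_MINOR_VERS_MISMATCH. As the protocol requires, the reply
 * echoes the request tag and carries a zero-length result array.
 * Returns non-zero if the reply could not be sent.
 */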
static int
rfs4_send_minor_mismatch(SVCXPRT *xprt, COMPOUND4args *argsp)
{
	COMPOUND4res res_buf, *resp;
	int err = 0;

	resp = &res_buf;

	/*
	 * Form a reply tag by copying over the request tag.
	 */
	resp->tag.utf8string_len = argsp->tag.utf8string_len;
	if (argsp->tag.utf8string_len != 0) {
		resp->tag.utf8string_val =
		    kmem_alloc(argsp->tag.utf8string_len, KM_SLEEP);
		bcopy(argsp->tag.utf8string_val, resp->tag.utf8string_val,
		    resp->tag.utf8string_len);
	} else {
		resp->tag.utf8string_val = NULL;
	}
	resp->array_len = 0;
	resp->array = NULL;
	resp->status = NFS4ERR_MINOR_VERS_MISMATCH;
	if (!svc_sendreply(xprt, xdr_COMPOUND4res_srv, (char *)resp)) {
		DTRACE_PROBE2(nfss__e__minorvers_mismatch,
		    SVCXPRT *, xprt, char *, resp);
		svcerr_systemerr(xprt);
		err = 1;
	}
	rfs4_compound_free(resp);
	return (err);
}

/*
 * Test minor version against allowed minor versions.
 */
static inline bool_t
rfs4_minorversion_enabled(uint32_t minorversion)
{
	return (minorversion <= nfs4_get_srv()->nfs4_minor_max);
}

bool_t
rfs4_minorvers_mismatch(struct svc_req *req, SVCXPRT *xprt, void *args)
{
	COMPOUND4args *argsp;

	if (req->rq_vers != 4)
		return (FALSE);

	argsp = (COMPOUND4args *)args;

	if (rfs4_minorversion_enabled(argsp->minorversion))
		return (FALSE);

	(void) rfs4_send_minor_mismatch(xprt, argsp);
	return (TRUE);
}

/*
 * Reply with NFS4ERR_RESOURCE when the duplicate request cache
 * cannot accommodate a request; the client will retransmit.
 */
void
rfs4_resource_err(struct svc_req *req, COMPOUND4args *argsp)
{
	COMPOUND4res res_buf, *rbp;
	nfs_resop4 *resop;
	PUTFH4res *resp;

	rbp = &res_buf;

	/*
	 * Form a reply tag by copying over the request tag.
	 */
	rbp->tag.utf8string_len = argsp->tag.utf8string_len;
	if (argsp->tag.utf8string_len != 0) {
		rbp->tag.utf8string_val =
		    kmem_alloc(argsp->tag.utf8string_len, KM_SLEEP);
		bcopy(argsp->tag.utf8string_val, rbp->tag.utf8string_val,
		    rbp->tag.utf8string_len);
	} else {
		rbp->tag.utf8string_val = NULL;
	}

	rbp->array_len = 1;
	rbp->array = kmem_zalloc(rbp->array_len * sizeof (nfs_resop4),
	    KM_SLEEP);
	resop = &rbp->array[0];
	resop->resop = argsp->array[0].argop;	/* copy first op over */

	/* Any op will do, just need to access status field */
	resp = &resop->nfs_resop4_u.opputfh;

	/*
	 * NFS4ERR_RESOURCE is allowed for all ops, except OP_ILLEGAL.
	 * Note that all op numbers in the compound array were already
	 * validated by the XDR decoder (xdr_COMPOUND4args_srv()).
	 */
	resp->status = (resop->resop == OP_ILLEGAL ?
	    NFS4ERR_OP_ILLEGAL : NFS4ERR_RESOURCE);

	/* compound status is same as first op status */
	rbp->status = resp->status;

	if (!svc_sendreply(req->rq_xprt, xdr_COMPOUND4res_srv, (char *)rbp)) {
		DTRACE_PROBE2(nfss__rsrc_err__sendfail,
		    struct svc_req *, req->rq_xprt, char *, rbp);
		svcerr_systemerr(req->rq_xprt);
	}

	UTF8STRING_FREE(rbp->tag);
	kmem_free(rbp->array, rbp->array_len * sizeof (nfs_resop4));
}
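/*
 * rfs4_dispatch:
 *
 * Common entry point for NFSv4 procedure calls: answer the NULL
 * procedure right away, reject minor versions we do not support,
 * then route minor version 0 to rfs40_dispatch() and 4.1+ to
 * rfs4x_dispatch().
 */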
int
rfs4_dispatch(struct rpcdisp *disp __unused, struct svc_req *req,
    SVCXPRT *xprt, char *ap)
{
	COMPOUND4args *cmp;

	/*
	 * Handle the NULL Proc here
	 */
	if (req->rq_proc == RFS_NULL) {
		return (!svc_sendreply(xprt, xdr_void, NULL));
	}

	cmp = (COMPOUND4args *)ap;
	ASSERT(cmp != NULL);

	if (!rfs4_minorversion_enabled(cmp->minorversion))
		return (rfs4_send_minor_mismatch(xprt, cmp));

	if (cmp->minorversion == 0)
		return (rfs40_dispatch(req, xprt, ap));

	return (rfs4x_dispatch(req, xprt, ap));
}