/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright 2018 Nexenta Systems, Inc.
 * Copyright 2020 RackTop Systems, Inc.
 */

#include <sys/systm.h>
#include <sys/sdt.h>
#include <rpc/types.h>
#include <rpc/auth.h>
#include <rpc/auth_unix.h>
#include <rpc/auth_des.h>
#include <rpc/svc.h>
#include <rpc/xdr.h>
#include <nfs/nfs4.h>
#include <nfs/nfs_dispatch.h>
#include <nfs/nfs4_drc.h>

/*
 * The default size of the duplicate request cache
 */
uint32_t nfs4_drc_max = 8 * 1024;

/*
 * The number of buckets we'd like to hash the
 * replies into. Do not change this on the fly.
 */
uint32_t nfs4_drc_hash = 541;
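
/*
 * Illustrative sketch only: assuming the usual /etc/system mechanism for
 * overriding kernel module globals (and that these symbols resolve in the
 * nfs module -- verify the module prefix against the module that actually
 * delivers this file), the tunables above could be changed at boot with
 * lines such as
 *
 *	set nfs:nfs4_drc_max = 16384
 *	set nfs:nfs4_drc_hash = 1031
 *
 * nfs4_drc_hash is consulted only when the cache is created, which is why
 * the comment above says not to change it on the fly.
 */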

static void rfs4_resource_err(struct svc_req *req, COMPOUND4args *argsp);

/*
 * Initialize a duplicate request cache.
 */
rfs4_drc_t *
rfs4_init_drc(uint32_t drc_size, uint32_t drc_hash_size)
{
	rfs4_drc_t *drc;
	uint32_t bki;

	ASSERT(drc_size);
	ASSERT(drc_hash_size);

	drc = kmem_alloc(sizeof (rfs4_drc_t), KM_SLEEP);

	drc->max_size = drc_size;
	drc->in_use = 0;

	mutex_init(&drc->lock, NULL, MUTEX_DEFAULT, NULL);

	drc->dr_hash = drc_hash_size;

	drc->dr_buckets = kmem_alloc(sizeof (list_t) * drc_hash_size,
	    KM_SLEEP);

	for (bki = 0; bki < drc_hash_size; bki++) {
		list_create(&drc->dr_buckets[bki], sizeof (rfs4_dupreq_t),
		    offsetof(rfs4_dupreq_t, dr_bkt_next));
	}

	list_create(&(drc->dr_cache), sizeof (rfs4_dupreq_t),
	    offsetof(rfs4_dupreq_t, dr_next));

	return (drc);
}
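
/*
 * Illustrative usage sketch (hypothetical call site; the real one lives in
 * the NFSv4 server start-up path outside this file): the server instance
 * would create its DRC from the tunables above, along the lines of
 *
 *	nsrv4->nfs4_drc = rfs4_init_drc(nfs4_drc_max, nfs4_drc_hash);
 */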

/*
 * Destroy a duplicate request cache.
 */
void
rfs4_fini_drc(void)
{
	nfs4_srv_t *nsrv4 = nfs4_get_srv();
	rfs4_drc_t *drc = nsrv4->nfs4_drc;
	rfs4_dupreq_t *drp, *drp_next;

	/* iterate over the dr_cache and free the entries */
	for (drp = list_head(&(drc->dr_cache)); drp != NULL; drp = drp_next) {

		if (drp->dr_state == NFS4_DUP_REPLAY)
			rfs4_compound_free(&(drp->dr_res));

		if (drp->dr_addr.buf != NULL)
			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);

		drp_next = list_next(&(drc->dr_cache), drp);

		kmem_free(drp, sizeof (rfs4_dupreq_t));
	}

	mutex_destroy(&drc->lock);
	kmem_free(drc->dr_buckets, sizeof (list_t) * drc->dr_hash);
	kmem_free(drc, sizeof (rfs4_drc_t));
}

/*
 * rfs4_dr_chstate:
 *
 * Change the state of an rfs4_dupreq. If the new state is anything
 * other than NFS4_DUP_FREE, just record it and return. If we are
 * moving to the FREE state, also remove the entry from the bucket
 * and dr_cache lists and free the compound results; the caller then
 * either reuses the entry or puts it back on the tail of dr_cache.
 */
void
rfs4_dr_chstate(rfs4_dupreq_t *drp, int new_state)
{
	rfs4_drc_t *drc;

	ASSERT(drp);
	ASSERT(drp->drc);
	ASSERT(drp->dr_bkt);
	ASSERT(MUTEX_HELD(&drp->drc->lock));

	drp->dr_state = new_state;

	if (new_state != NFS4_DUP_FREE)
		return;

	drc = drp->drc;

	/*
	 * Remove entry from the bucket and
	 * dr_cache list, free compound results.
	 */
	list_remove(drp->dr_bkt, drp);
	list_remove(&(drc->dr_cache), drp);
	rfs4_compound_free(&(drp->dr_res));
}
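
/*
 * Informal sketch of the dupreq life cycle, as driven by rfs4_find_dr(),
 * rfs4_alloc_dr() and rfs40_dispatch() below (not an exhaustive diagram):
 *
 *	NEW ---(reply cached)---> REPLAY <---(request matched)---> INUSE
 *	 |                          |
 *	 | (T_WOULDBLOCK)           | (reclaimed by rfs4_alloc_dr())
 *	 v                          v
 *	FREE <----------------------+
 *
 * INUSE entries are never reclaimed; they return to REPLAY once the
 * cached reply has been resent.
 */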

/*
 * rfs4_alloc_dr:
 *
 * Allocate a new entry if we have not yet reached the maximum cache
 * limit; otherwise walk backwards from the tail and reuse the first
 * entry found in either the NFS4_DUP_FREE or NFS4_DUP_REPLAY state.
 */
rfs4_dupreq_t *
rfs4_alloc_dr(rfs4_drc_t *drc)
{
	rfs4_dupreq_t *drp_tail, *drp = NULL;

	ASSERT(drc);
	ASSERT(MUTEX_HELD(&drc->lock));

	/*
	 * Have we hit the cache limit yet?
	 */
	if (drc->in_use < drc->max_size) {
		/*
		 * Nope, so allocate a new one.
		 */
		drp = kmem_zalloc(sizeof (rfs4_dupreq_t), KM_SLEEP);
		drp->drc = drc;
		drc->in_use++;
		DTRACE_PROBE1(nfss__i__drc_new, rfs4_dupreq_t *, drp);
		return (drp);
	}

	/*
	 * The cache is fully allocated; traverse the list
	 * backwards to find an entry we can reuse.
	 */
	for (drp_tail = list_tail(&drc->dr_cache); drp_tail != NULL;
	    drp_tail = list_prev(&drc->dr_cache, drp_tail)) {

		switch (drp_tail->dr_state) {

		case NFS4_DUP_FREE:
			list_remove(&(drc->dr_cache), drp_tail);
			DTRACE_PROBE1(nfss__i__drc_freeclaim,
			    rfs4_dupreq_t *, drp_tail);
			return (drp_tail);
			/* NOTREACHED */

		case NFS4_DUP_REPLAY:
			/* grab it */
			rfs4_dr_chstate(drp_tail, NFS4_DUP_FREE);
			DTRACE_PROBE1(nfss__i__drc_replayclaim,
			    rfs4_dupreq_t *, drp_tail);
			return (drp_tail);
			/* NOTREACHED */
		}
	}
	DTRACE_PROBE1(nfss__i__drc_full, rfs4_drc_t *, drc);
	return (NULL);
}

/*
 * rfs4_find_dr:
 *
 * Search for an entry in the duplicate request cache by
 * calculating the hash index based on the XID, and examining
 * the entries in the hash bucket. If we find a match, return.
 * Once we have searched the bucket we call rfs4_alloc_dr() to
 * allocate a new entry, or reuse one that is available.
 */
int
rfs4_find_dr(struct svc_req *req, rfs4_drc_t *drc, rfs4_dupreq_t **dup)
{
	uint32_t the_xid;
	list_t *dr_bkt;
	rfs4_dupreq_t *drp;
	int bktdex;

	/*
	 * Get the XID, calculate the bucket and search to
	 * see if we need to replay from the cache.
	 */
	the_xid = req->rq_xprt->xp_xid;
	bktdex = the_xid % drc->dr_hash;

	dr_bkt = &(drc->dr_buckets[bktdex]);

	DTRACE_PROBE3(nfss__i__drc_bktdex,
	    int, bktdex,
	    uint32_t, the_xid,
	    list_t *, dr_bkt);

	*dup = NULL;

	mutex_enter(&drc->lock);
	/*
	 * Search the bucket for a matching xid and address.
	 */
	for (drp = list_head(dr_bkt); drp != NULL;
	    drp = list_next(dr_bkt, drp)) {

		if (drp->dr_xid == the_xid &&
		    drp->dr_addr.len == req->rq_xprt->xp_rtaddr.len &&
		    bcmp((caddr_t)drp->dr_addr.buf,
		    (caddr_t)req->rq_xprt->xp_rtaddr.buf,
		    drp->dr_addr.len) == 0) {

			/*
			 * Found a match, so REPLAY the reply.
			 */
			if (drp->dr_state == NFS4_DUP_REPLAY) {
				rfs4_dr_chstate(drp, NFS4_DUP_INUSE);
				mutex_exit(&drc->lock);
				*dup = drp;
				DTRACE_PROBE1(nfss__i__drc_replay,
				    rfs4_dupreq_t *, drp);
				return (NFS4_DUP_REPLAY);
			}

			/*
			 * This entry must be in transition, so return
			 * the 'pending' status.
			 */
			mutex_exit(&drc->lock);
			return (NFS4_DUP_PENDING);
		}
	}

	drp = rfs4_alloc_dr(drc);
	mutex_exit(&drc->lock);

	/*
	 * The DRC is full and all entries are in use. The caller
	 * should error out this request and force the client to
	 * retransmit -- effectively this is a resource issue: either
	 * the NFSD threads are tied up in the underlying file system,
	 * or the cache size is too small for the server load.
	 */
	if (drp == NULL)
		return (NFS4_DUP_ERROR);

	/*
	 * Init the state to NEW.
	 */
	drp->dr_state = NFS4_DUP_NEW;

	/*
	 * If needed, resize the address buffer.
	 */
	if (drp->dr_addr.maxlen < req->rq_xprt->xp_rtaddr.len) {
		if (drp->dr_addr.buf != NULL)
			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
		drp->dr_addr.maxlen = req->rq_xprt->xp_rtaddr.len;
		drp->dr_addr.buf = kmem_alloc(drp->dr_addr.maxlen, KM_NOSLEEP);
		if (drp->dr_addr.buf == NULL) {
			/*
			 * If the allocation fails, mark the entry
			 * as free and put it on the tail.
			 */
			drp->dr_addr.maxlen = 0;
			drp->dr_state = NFS4_DUP_FREE;
			mutex_enter(&drc->lock);
			list_insert_tail(&(drc->dr_cache), drp);
			mutex_exit(&drc->lock);
			return (NFS4_DUP_ERROR);
		}
	}

	/*
	 * Copy the address.
	 */
	drp->dr_addr.len = req->rq_xprt->xp_rtaddr.len;

	bcopy((caddr_t)req->rq_xprt->xp_rtaddr.buf,
	    (caddr_t)drp->dr_addr.buf,
	    drp->dr_addr.len);

	drp->dr_xid = the_xid;
	drp->dr_bkt = dr_bkt;

	/*
	 * Insert at the head of the bucket and
	 * the dr_cache lists.
	 */
	mutex_enter(&drc->lock);
	list_insert_head(&drc->dr_cache, drp);
	list_insert_head(dr_bkt, drp);
	mutex_exit(&drc->lock);

	*dup = drp;

	return (NFS4_DUP_NEW);
}
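
/*
 * Informal summary of the rfs4_find_dr() return values, as consumed by
 * rfs40_dispatch() below:
 *
 *	NFS4_DUP_NEW	 process the request; *dup is the fresh entry in
 *			 which the reply will be cached
 *	NFS4_DUP_REPLAY	 resend the cached reply held in *dup
 *	NFS4_DUP_PENDING a duplicate is still being processed; drop this
 *			 request and let the client retransmit
 *	NFS4_DUP_ERROR	 no entry could be obtained; answer with
 *			 NFS4ERR_RESOURCE via rfs4_resource_err()
 */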

/*
 * This function handles the duplicate request cache,
 * NULL_PROC and COMPOUND procedure calls for NFSv4.0;
 * the 4.x where x > 0 case is handled in rfs4x_dispatch.
 *
 * Passed into this function are:
 *
 *	req	The request to process
 *	xprt	The server transport handle
 *	ap	A pointer to the arguments
 *
 * When appropriate this function is responsible for inserting
 * the reply into the duplicate cache or replaying an existing
 * cached reply.
 *
 * dr_stat reflects the state of the duplicate request that
 * has been inserted into or retrieved from the cache.
 *
 * drp is the duplicate request entry.
 */
int
rfs40_dispatch(struct svc_req *req, SVCXPRT *xprt, char *ap)
{
	COMPOUND4res res_buf;
	COMPOUND4res *rbp;
	COMPOUND4args *cap;
	int error = 0;
	int dis_flags = 0;
	int dr_stat = NFS4_NOT_DUP;
	rfs4_dupreq_t *drp = NULL;
	int rv;
	struct compound_state cs;
	nfs4_srv_t *nsrv4 = nfs4_get_srv();
	rfs4_drc_t *nfs4_drc = nsrv4->nfs4_drc;

	/* Only NFSv4 Compounds from this point onward */

	rbp = &res_buf;
	cap = (COMPOUND4args *)ap;

	rfs4_init_compound_state(&cs);

	/*
	 * Figure out the disposition of the whole COMPOUND
	 * and record its idempotency.
	 */
	rfs4_compound_flagproc(cap, &dis_flags);

	/*
	 * If NON-IDEMPOTENT then we need to figure out if this
	 * request can be replayed from the duplicate cache.
	 *
	 * If this is a new request then we need to insert the
	 * reply into the duplicate cache.
	 */
	if (!(dis_flags & RPC_IDEMPOTENT)) {
		/* look for a replay from the cache or allocate */
		dr_stat = rfs4_find_dr(req, nfs4_drc, &drp);

		switch (dr_stat) {

		case NFS4_DUP_ERROR:
			rfs4_resource_err(req, cap);
			return (1);
			/* NOTREACHED */

		case NFS4_DUP_PENDING:
			/*
			 * reply has previously been inserted into the
			 * duplicate cache, however the reply has
			 * not yet been sent via svc_sendreply()
			 */
			return (1);
			/* NOTREACHED */

		case NFS4_DUP_NEW:
			curthread->t_flag |= T_DONTPEND;
			/* NON-IDEMPOTENT proc call */
			rfs4_compound(cap, rbp, &cs, req, &rv);
			curthread->t_flag &= ~T_DONTPEND;

			rfs4_fini_compound_state(&cs);

			if (rv)	/* short-circuit sendreply on error */
				return (rv);

			/*
			 * dr_res must be initialized before calling
			 * rfs4_dr_chstate (it frees the reply).
			 */
			drp->dr_res = res_buf;
			if (curthread->t_flag & T_WOULDBLOCK) {
				curthread->t_flag &= ~T_WOULDBLOCK;
				/*
				 * mark this entry as FREE and put it
				 * on the end of the cache list
				 */
				mutex_enter(&drp->drc->lock);
				rfs4_dr_chstate(drp, NFS4_DUP_FREE);
				list_insert_tail(&(drp->drc->dr_cache), drp);
				mutex_exit(&drp->drc->lock);
				return (1);
			}
			break;

		case NFS4_DUP_REPLAY:
			/* replay from the cache */
			rbp = &(drp->dr_res);
			break;
		}
	} else {
		curthread->t_flag |= T_DONTPEND;
		/* IDEMPOTENT proc call */
		rfs4_compound(cap, rbp, &cs, req, &rv);
		curthread->t_flag &= ~T_DONTPEND;

		rfs4_fini_compound_state(&cs);

		if (rv)	/* short-circuit sendreply on error */
			return (rv);

		if (curthread->t_flag & T_WOULDBLOCK) {
			curthread->t_flag &= ~T_WOULDBLOCK;
			return (1);
		}
	}

	/*
	 * Send out the replayed reply or the 'real' one.
	 */
	if (!svc_sendreply(xprt, xdr_COMPOUND4res_srv, (char *)rbp)) {
		DTRACE_PROBE2(nfss__e__dispatch_sendfail,
		    SVCXPRT *, xprt,
		    char *, rbp);
		svcerr_systemerr(xprt);
		error++;
	}

	/*
	 * If this reply was just inserted into the duplicate cache,
	 * or it was replayed from the dup cache, (re)mark it as
	 * available for replay.
	 *
	 * At first glance, this 'if' statement seems a little strange:
	 * we test dr_stat for NFS4_DUP_REPLAY, and then call
	 *
	 *	rfs4_dr_chstate(drp, NFS4_DUP_REPLAY)
	 *
	 * ... but notice that we are checking dr_stat, and not the
	 * state of the entry itself. While we were resending the reply
	 * the entry was NFS4_DUP_INUSE, precisely so that it could not
	 * be prematurely reaped out from under us.
	 */
	if (dr_stat == NFS4_DUP_NEW || dr_stat == NFS4_DUP_REPLAY) {
		mutex_enter(&drp->drc->lock);
		rfs4_dr_chstate(drp, NFS4_DUP_REPLAY);
		mutex_exit(&drp->drc->lock);
	} else if (dr_stat == NFS4_NOT_DUP) {
		rfs4_compound_free(rbp);
	}

	return (error);
}

static int
rfs4_send_minor_mismatch(SVCXPRT *xprt, COMPOUND4args *argsp)
{
	COMPOUND4res res_buf, *resp;
	int err = 0;

	resp = &res_buf;

	/*
	 * Form a reply tag by copying over the request tag.
	 */
	resp->tag.utf8string_len = argsp->tag.utf8string_len;
	if (argsp->tag.utf8string_len != 0) {
		resp->tag.utf8string_val =
		    kmem_alloc(argsp->tag.utf8string_len, KM_SLEEP);
		bcopy(argsp->tag.utf8string_val, resp->tag.utf8string_val,
		    resp->tag.utf8string_len);
	} else {
		resp->tag.utf8string_val = NULL;
	}
	resp->array_len = 0;
	resp->array = NULL;
	resp->status = NFS4ERR_MINOR_VERS_MISMATCH;
	if (!svc_sendreply(xprt, xdr_COMPOUND4res_srv, (char *)resp)) {
		DTRACE_PROBE2(nfss__e__minorvers_mismatch,
		    SVCXPRT *, xprt, char *, resp);
		svcerr_systemerr(xprt);
		err = 1;
	}
	rfs4_compound_free(resp);
	return (err);
}

/*
 * Test minor version against allowed minor versions.
 */
static inline bool_t
rfs4_minorversion_enabled(uint32_t minorversion)
{
	return (minorversion <= nfs4_get_srv()->nfs4_minor_max);
}
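
/*
 * For example (hypothetical configuration): with nfs4_minor_max set to 1,
 * COMPOUNDs at minor versions 0 and 1 are accepted, while a minor version
 * 2 request is refused with NFS4ERR_MINOR_VERS_MISMATCH via
 * rfs4_send_minor_mismatch().
 */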

bool_t
rfs4_minorvers_mismatch(struct svc_req *req, SVCXPRT *xprt, void *args)
{
	COMPOUND4args *argsp;

	if (req->rq_vers != 4)
		return (FALSE);

	argsp = (COMPOUND4args *)args;

	if (rfs4_minorversion_enabled(argsp->minorversion))
		return (FALSE);

	(void) rfs4_send_minor_mismatch(xprt, argsp);
	return (TRUE);
}

void
rfs4_resource_err(struct svc_req *req, COMPOUND4args *argsp)
{
	COMPOUND4res res_buf, *rbp;
	nfs_resop4 *resop;
	PUTFH4res *resp;

	rbp = &res_buf;

	/*
	 * Form a reply tag by copying over the request tag.
	 */
	rbp->tag.utf8string_len = argsp->tag.utf8string_len;
	if (argsp->tag.utf8string_len != 0) {
		rbp->tag.utf8string_val =
		    kmem_alloc(argsp->tag.utf8string_len, KM_SLEEP);
		bcopy(argsp->tag.utf8string_val, rbp->tag.utf8string_val,
		    rbp->tag.utf8string_len);
	} else {
		rbp->tag.utf8string_val = NULL;
	}

	rbp->array_len = 1;
	rbp->array = kmem_zalloc(rbp->array_len * sizeof (nfs_resop4),
	    KM_SLEEP);
	resop = &rbp->array[0];
	resop->resop = argsp->array[0].argop;	/* copy first op over */

	/* Any op will do, just need to access status field */
	resp = &resop->nfs_resop4_u.opputfh;

	/*
	 * NFS4ERR_RESOURCE is allowed for all ops, except OP_ILLEGAL.
	 * Note that all op numbers in the compound array were already
	 * validated by the XDR decoder (xdr_COMPOUND4args_srv()).
	 */
	resp->status = (resop->resop == OP_ILLEGAL ?
	    NFS4ERR_OP_ILLEGAL : NFS4ERR_RESOURCE);

	/* compound status is same as first op status */
	rbp->status = resp->status;

	if (!svc_sendreply(req->rq_xprt, xdr_COMPOUND4res_srv, (char *)rbp)) {
		DTRACE_PROBE2(nfss__rsrc_err__sendfail,
		    SVCXPRT *, req->rq_xprt, char *, rbp);
		svcerr_systemerr(req->rq_xprt);
	}

	UTF8STRING_FREE(rbp->tag);
	kmem_free(rbp->array, rbp->array_len * sizeof (nfs_resop4));
}

int
rfs4_dispatch(struct rpcdisp *disp __unused, struct svc_req *req,
    SVCXPRT *xprt, char *ap)
{
	COMPOUND4args *cmp;

	/*
	 * Handle the NULL Proc here.
	 */
	if (req->rq_proc == RFS_NULL) {
		return (!svc_sendreply(xprt, xdr_void, NULL));
	}

	cmp = (COMPOUND4args *)ap;
	ASSERT(cmp != NULL);

	if (!rfs4_minorversion_enabled(cmp->minorversion))
		return (rfs4_send_minor_mismatch(xprt, cmp));

	if (cmp->minorversion == 0)
		return (rfs40_dispatch(req, xprt, ap));

	return (rfs4x_dispatch(req, xprt, ap));
}