xref: /illumos-gate/usr/src/uts/common/fs/nfs/nfs4_dispatch.c (revision 45744051679350ee063cdc366b66bee5223a11ea)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Copyright 2018 Nexenta Systems, Inc.
29  * Copyright 2020 RackTop Systems, Inc.
30  */
31 
32 #include <sys/systm.h>
33 #include <sys/sdt.h>
34 #include <rpc/types.h>
35 #include <rpc/auth.h>
36 #include <rpc/auth_unix.h>
37 #include <rpc/auth_des.h>
38 #include <rpc/svc.h>
39 #include <rpc/xdr.h>
40 #include <nfs/nfs4.h>
41 #include <nfs/nfs_dispatch.h>
42 #include <nfs/nfs4_drc.h>
43 
/*
 * The default size of the duplicate request cache.
 *
 * NOTE(review): presumably handed to rfs4_init_drc() as drc_size, which
 * caps the number of cache entries; the caller is not in this file.
 */
uint32_t nfs4_drc_max = 8 * 1024;

/*
 * The number of buckets we'd like to hash the
 * replies into; do not change this on the fly.
 *
 * rfs4_find_dr() selects a bucket with (xid % dr_hash), and the bucket
 * array is sized once by rfs4_init_drc().
 */
uint32_t nfs4_drc_hash = 541;

/*
 * Forward declaration; defined later in this file and used by
 * rfs40_dispatch() when the duplicate request cache cannot supply
 * an entry.
 */
static void rfs4_resource_err(struct svc_req *req, COMPOUND4args *argsp);
56 
57 /*
58  * Initialize a duplicate request cache.
59  */
60 rfs4_drc_t *
61 rfs4_init_drc(uint32_t drc_size, uint32_t drc_hash_size)
62 {
63 	rfs4_drc_t *drc;
64 	uint32_t   bki;
65 
66 	ASSERT(drc_size);
67 	ASSERT(drc_hash_size);
68 
69 	drc = kmem_alloc(sizeof (rfs4_drc_t), KM_SLEEP);
70 
71 	drc->max_size = drc_size;
72 	drc->in_use = 0;
73 
74 	mutex_init(&drc->lock, NULL, MUTEX_DEFAULT, NULL);
75 
76 	drc->dr_hash = drc_hash_size;
77 
78 	drc->dr_buckets = kmem_alloc(sizeof (list_t)*drc_hash_size, KM_SLEEP);
79 
80 	for (bki = 0; bki < drc_hash_size; bki++) {
81 		list_create(&drc->dr_buckets[bki], sizeof (rfs4_dupreq_t),
82 		    offsetof(rfs4_dupreq_t, dr_bkt_next));
83 	}
84 
85 	list_create(&(drc->dr_cache), sizeof (rfs4_dupreq_t),
86 	    offsetof(rfs4_dupreq_t, dr_next));
87 
88 	return (drc);
89 }
90 
91 /*
92  * Destroy a duplicate request cache.
93  */
94 void
95 rfs4_fini_drc(void)
96 {
97 	nfs4_srv_t *nsrv4 = nfs4_get_srv();
98 	rfs4_drc_t *drc = nsrv4->nfs4_drc;
99 	rfs4_dupreq_t *drp, *drp_next;
100 
101 	/* iterate over the dr_cache and free the enties */
102 	for (drp = list_head(&(drc->dr_cache)); drp != NULL; drp = drp_next) {
103 
104 		if (drp->dr_state == NFS4_DUP_REPLAY)
105 			rfs4_compound_free(&(drp->dr_res));
106 
107 		if (drp->dr_addr.buf != NULL)
108 			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
109 
110 		drp_next = list_next(&(drc->dr_cache), drp);
111 
112 		kmem_free(drp, sizeof (rfs4_dupreq_t));
113 	}
114 
115 	mutex_destroy(&drc->lock);
116 	kmem_free(drc->dr_buckets,
117 	    sizeof (list_t)*drc->dr_hash);
118 	kmem_free(drc, sizeof (rfs4_drc_t));
119 }
120 
121 /*
122  * rfs4_dr_chstate:
123  *
124  * Change the state of a rfs4_dupreq. If it's not in transition
125  * to the FREE state, return. If we are moving to the FREE state
126  * then we need to clean up the compound results and move the entry
127  * to the end of the list.
128  */
129 void
130 rfs4_dr_chstate(rfs4_dupreq_t *drp, int new_state)
131 {
132 	rfs4_drc_t *drc;
133 
134 	ASSERT(drp);
135 	ASSERT(drp->drc);
136 	ASSERT(drp->dr_bkt);
137 	ASSERT(MUTEX_HELD(&drp->drc->lock));
138 
139 	drp->dr_state = new_state;
140 
141 	if (new_state != NFS4_DUP_FREE)
142 		return;
143 
144 	drc = drp->drc;
145 
146 	/*
147 	 * Remove entry from the bucket and
148 	 * dr_cache list, free compound results.
149 	 */
150 	list_remove(drp->dr_bkt, drp);
151 	list_remove(&(drc->dr_cache), drp);
152 	rfs4_compound_free(&(drp->dr_res));
153 }
154 
155 /*
156  * rfs4_alloc_dr:
157  *
158  * Malloc a new one if we have not reached our maximum cache
159  * limit, otherwise pick an entry off the tail -- Use if it
160  * is marked as NFS4_DUP_FREE, or is an entry in the
161  * NFS4_DUP_REPLAY state.
162  */
163 rfs4_dupreq_t *
164 rfs4_alloc_dr(rfs4_drc_t *drc)
165 {
166 	rfs4_dupreq_t *drp_tail, *drp = NULL;
167 
168 	ASSERT(drc);
169 	ASSERT(MUTEX_HELD(&drc->lock));
170 
171 	/*
172 	 * Have we hit the cache limit yet ?
173 	 */
174 	if (drc->in_use < drc->max_size) {
175 		/*
176 		 * nope, so let's malloc a new one
177 		 */
178 		drp = kmem_zalloc(sizeof (rfs4_dupreq_t), KM_SLEEP);
179 		drp->drc = drc;
180 		drc->in_use++;
181 		DTRACE_PROBE1(nfss__i__drc_new, rfs4_dupreq_t *, drp);
182 		return (drp);
183 	}
184 
185 	/*
186 	 * Cache is all allocated now traverse the list
187 	 * backwards to find one we can reuse.
188 	 */
189 	for (drp_tail = list_tail(&drc->dr_cache); drp_tail != NULL;
190 	    drp_tail = list_prev(&drc->dr_cache, drp_tail)) {
191 
192 		switch (drp_tail->dr_state) {
193 
194 		case NFS4_DUP_FREE:
195 			list_remove(&(drc->dr_cache), drp_tail);
196 			DTRACE_PROBE1(nfss__i__drc_freeclaim,
197 			    rfs4_dupreq_t *, drp_tail);
198 			return (drp_tail);
199 			/* NOTREACHED */
200 
201 		case NFS4_DUP_REPLAY:
202 			/* grab it. */
203 			rfs4_dr_chstate(drp_tail, NFS4_DUP_FREE);
204 			DTRACE_PROBE1(nfss__i__drc_replayclaim,
205 			    rfs4_dupreq_t *, drp_tail);
206 			return (drp_tail);
207 			/* NOTREACHED */
208 		}
209 	}
210 	DTRACE_PROBE1(nfss__i__drc_full, rfs4_drc_t *, drc);
211 	return (NULL);
212 }
213 
214 /*
215  * rfs4_find_dr:
216  *
217  * Search for an entry in the duplicate request cache by
218  * calculating the hash index based on the XID, and examining
219  * the entries in the hash bucket. If we find a match, return.
220  * Once we have searched the bucket we call rfs4_alloc_dr() to
221  * allocate a new entry, or reuse one that is available.
222  */
223 int
224 rfs4_find_dr(struct svc_req *req, rfs4_drc_t *drc, rfs4_dupreq_t **dup)
225 {
226 
227 	uint32_t	the_xid;
228 	list_t		*dr_bkt;
229 	rfs4_dupreq_t	*drp;
230 	int		bktdex;
231 
232 	/*
233 	 * Get the XID, calculate the bucket and search to
234 	 * see if we need to replay from the cache.
235 	 */
236 	the_xid = req->rq_xprt->xp_xid;
237 	bktdex = the_xid % drc->dr_hash;
238 
239 	dr_bkt = (list_t *)
240 	    &(drc->dr_buckets[(the_xid % drc->dr_hash)]);
241 
242 	DTRACE_PROBE3(nfss__i__drc_bktdex,
243 	    int, bktdex,
244 	    uint32_t, the_xid,
245 	    list_t *, dr_bkt);
246 
247 	*dup = NULL;
248 
249 	mutex_enter(&drc->lock);
250 	/*
251 	 * Search the bucket for a matching xid and address.
252 	 */
253 	for (drp = list_head(dr_bkt); drp != NULL;
254 	    drp = list_next(dr_bkt, drp)) {
255 
256 		if (drp->dr_xid == the_xid &&
257 		    drp->dr_addr.len == req->rq_xprt->xp_rtaddr.len &&
258 		    bcmp((caddr_t)drp->dr_addr.buf,
259 		    (caddr_t)req->rq_xprt->xp_rtaddr.buf,
260 		    drp->dr_addr.len) == 0) {
261 
262 			/*
263 			 * Found a match so REPLAY the Reply
264 			 */
265 			if (drp->dr_state == NFS4_DUP_REPLAY) {
266 				rfs4_dr_chstate(drp, NFS4_DUP_INUSE);
267 				mutex_exit(&drc->lock);
268 				*dup = drp;
269 				DTRACE_PROBE1(nfss__i__drc_replay,
270 				    rfs4_dupreq_t *, drp);
271 				return (NFS4_DUP_REPLAY);
272 			}
273 
274 			/*
275 			 * This entry must be in transition, so return
276 			 * the 'pending' status.
277 			 */
278 			mutex_exit(&drc->lock);
279 			return (NFS4_DUP_PENDING);
280 		}
281 	}
282 
283 	drp = rfs4_alloc_dr(drc);
284 	mutex_exit(&drc->lock);
285 
286 	/*
287 	 * The DRC is full and all entries are in use. Upper function
288 	 * should error out this request and force the client to
289 	 * retransmit -- effectively this is a resource issue. NFSD
290 	 * threads tied up with native File System, or the cache size
291 	 * is too small for the server load.
292 	 */
293 	if (drp == NULL)
294 		return (NFS4_DUP_ERROR);
295 
296 	/*
297 	 * Init the state to NEW.
298 	 */
299 	drp->dr_state = NFS4_DUP_NEW;
300 
301 	/*
302 	 * If needed, resize the address buffer
303 	 */
304 	if (drp->dr_addr.maxlen < req->rq_xprt->xp_rtaddr.len) {
305 		if (drp->dr_addr.buf != NULL)
306 			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
307 		drp->dr_addr.maxlen = req->rq_xprt->xp_rtaddr.len;
308 		drp->dr_addr.buf = kmem_alloc(drp->dr_addr.maxlen, KM_NOSLEEP);
309 		if (drp->dr_addr.buf == NULL) {
310 			/*
311 			 * If the malloc fails, mark the entry
312 			 * as free and put on the tail.
313 			 */
314 			drp->dr_addr.maxlen = 0;
315 			drp->dr_state = NFS4_DUP_FREE;
316 			mutex_enter(&drc->lock);
317 			list_insert_tail(&(drc->dr_cache), drp);
318 			mutex_exit(&drc->lock);
319 			return (NFS4_DUP_ERROR);
320 		}
321 	}
322 
323 
324 	/*
325 	 * Copy the address.
326 	 */
327 	drp->dr_addr.len = req->rq_xprt->xp_rtaddr.len;
328 
329 	bcopy((caddr_t)req->rq_xprt->xp_rtaddr.buf,
330 	    (caddr_t)drp->dr_addr.buf,
331 	    drp->dr_addr.len);
332 
333 	drp->dr_xid = the_xid;
334 	drp->dr_bkt = dr_bkt;
335 
336 	/*
337 	 * Insert at the head of the bucket and
338 	 * the drc lists..
339 	 */
340 	mutex_enter(&drc->lock);
341 	list_insert_head(&drc->dr_cache, drp);
342 	list_insert_head(dr_bkt, drp);
343 	mutex_exit(&drc->lock);
344 
345 	*dup = drp;
346 
347 	return (NFS4_DUP_NEW);
348 }
349 
350 /*
351  *
352  * This function handles the duplicate request cache,
353  * NULL_PROC and COMPOUND procedure calls for NFSv4.0;
354  * the 4.x where x > 0 case is handled in rfs4x_dispatch.
355  *
356  * Passed into this function are:-
357  *
358  *	disp	A pointer to our dispatch table entry
359  *	req	The request to process
360  *	xprt	The server transport handle
361  *	ap	A pointer to the arguments
362  *
363  *
364  * When appropriate this function is responsible for inserting
365  * the reply into the duplicate cache or replaying an existing
366  * cached reply.
367  *
368  * dr_stat	reflects the state of the duplicate request that
369  *		has been inserted into or retrieved from the cache
370  *
371  * drp		is the duplicate request entry
372  *
373  */
374 int
375 rfs40_dispatch(struct svc_req *req, SVCXPRT *xprt, char *ap)
376 {
377 
378 	COMPOUND4res	 res_buf;
379 	COMPOUND4res	*rbp;
380 	COMPOUND4args	*cap;
381 	int		 error = 0;
382 	int		 dis_flags = 0;
383 	int		 dr_stat = NFS4_NOT_DUP;
384 	rfs4_dupreq_t	*drp = NULL;
385 	int		 rv;
386 	struct compound_state cs;
387 	nfs4_srv_t *nsrv4 = nfs4_get_srv();
388 	rfs4_drc_t *nfs4_drc = nsrv4->nfs4_drc;
389 
390 	/* Only NFSv4 Compounds from this point onward */
391 
392 	rbp = &res_buf;
393 	cap = (COMPOUND4args *)ap;
394 
395 	rfs4_init_compound_state(&cs);
396 
397 	/*
398 	 * Figure out the disposition of the whole COMPOUND
399 	 * and record it's IDEMPOTENTCY.
400 	 */
401 	rfs4_compound_flagproc(cap, &dis_flags);
402 
403 	/*
404 	 * If NON-IDEMPOTENT then we need to figure out if this
405 	 * request can be replied from the duplicate cache.
406 	 *
407 	 * If this is a new request then we need to insert the
408 	 * reply into the duplicate cache.
409 	 */
410 	if (!(dis_flags & RPC_IDEMPOTENT)) {
411 		/* look for a replay from the cache or allocate */
412 		dr_stat = rfs4_find_dr(req, nfs4_drc, &drp);
413 
414 		switch (dr_stat) {
415 
416 		case NFS4_DUP_ERROR:
417 			rfs4_resource_err(req, cap);
418 			return (1);
419 			/* NOTREACHED */
420 
421 		case NFS4_DUP_PENDING:
422 			/*
423 			 * reply has previously been inserted into the
424 			 * duplicate cache, however the reply has
425 			 * not yet been sent via svc_sendreply()
426 			 */
427 			return (1);
428 			/* NOTREACHED */
429 
430 		case NFS4_DUP_NEW:
431 			curthread->t_flag |= T_DONTPEND;
432 			/* NON-IDEMPOTENT proc call */
433 			rfs4_compound(cap, rbp, &cs, req, &rv);
434 			curthread->t_flag &= ~T_DONTPEND;
435 
436 			rfs4_fini_compound_state(&cs);
437 
438 			if (rv)		/* short ckt sendreply on error */
439 				return (rv);
440 
441 			/*
442 			 * dr_res must be initialized before calling
443 			 * rfs4_dr_chstate (it frees the reply).
444 			 */
445 			drp->dr_res = res_buf;
446 			if (curthread->t_flag & T_WOULDBLOCK) {
447 				curthread->t_flag &= ~T_WOULDBLOCK;
448 				/*
449 				 * mark this entry as FREE and plop
450 				 * on the end of the cache list
451 				 */
452 				mutex_enter(&drp->drc->lock);
453 				rfs4_dr_chstate(drp, NFS4_DUP_FREE);
454 				list_insert_tail(&(drp->drc->dr_cache), drp);
455 				mutex_exit(&drp->drc->lock);
456 				return (1);
457 			}
458 			break;
459 
460 		case NFS4_DUP_REPLAY:
461 			/* replay from the cache */
462 			rbp = &(drp->dr_res);
463 			break;
464 		}
465 	} else {
466 		curthread->t_flag |= T_DONTPEND;
467 		/* IDEMPOTENT proc call */
468 		rfs4_compound(cap, rbp, &cs, req, &rv);
469 		curthread->t_flag &= ~T_DONTPEND;
470 
471 		rfs4_fini_compound_state(&cs);
472 
473 		if (rv)		/* short ckt sendreply on error */
474 			return (rv);
475 
476 		if (curthread->t_flag & T_WOULDBLOCK) {
477 			curthread->t_flag &= ~T_WOULDBLOCK;
478 			return (1);
479 		}
480 	}
481 
482 	/*
483 	 * Send out the replayed reply or the 'real' one.
484 	 */
485 	if (!svc_sendreply(xprt,  xdr_COMPOUND4res_srv, (char *)rbp)) {
486 		DTRACE_PROBE2(nfss__e__dispatch_sendfail,
487 		    struct svc_req *, xprt,
488 		    char *, rbp);
489 		svcerr_systemerr(xprt);
490 		error++;
491 	}
492 
493 	/*
494 	 * If this reply was just inserted into the duplicate cache
495 	 * or it was replayed from the dup cache; (re)mark it as
496 	 * available for replay
497 	 *
498 	 * At first glance, this 'if' statement seems a little strange;
499 	 * testing for NFS4_DUP_REPLAY, and then calling...
500 	 *
501 	 *	rfs4_dr_chatate(NFS4_DUP_REPLAY)
502 	 *
503 	 * ... but notice that we are checking dr_stat, and not the
504 	 * state of the entry itself, the entry will be NFS4_DUP_INUSE,
505 	 * we do that so that we know not to prematurely reap it whilst
506 	 * we resent it to the client.
507 	 *
508 	 */
509 	if (dr_stat == NFS4_DUP_NEW || dr_stat == NFS4_DUP_REPLAY) {
510 		mutex_enter(&drp->drc->lock);
511 		rfs4_dr_chstate(drp, NFS4_DUP_REPLAY);
512 		mutex_exit(&drp->drc->lock);
513 	} else if (dr_stat == NFS4_NOT_DUP) {
514 		rfs4_compound_free(rbp);
515 	}
516 
517 	return (error);
518 }
519 
520 static int
521 rfs4_send_minor_mismatch(SVCXPRT *xprt, COMPOUND4args *argsp)
522 {
523 	COMPOUND4res res_buf, *resp;
524 	int err = 0;
525 
526 	resp = &res_buf;
527 
528 	/*
529 	 * Form a reply tag by copying over the request tag.
530 	 */
531 	resp->tag.utf8string_len = argsp->tag.utf8string_len;
532 	if (argsp->tag.utf8string_len != 0) {
533 		resp->tag.utf8string_val =
534 		    kmem_alloc(argsp->tag.utf8string_len, KM_SLEEP);
535 		bcopy(argsp->tag.utf8string_val, resp->tag.utf8string_val,
536 		    resp->tag.utf8string_len);
537 	} else {
538 		resp->tag.utf8string_val = NULL;
539 	}
540 	resp->array_len = 0;
541 	resp->array = NULL;
542 	resp->status = NFS4ERR_MINOR_VERS_MISMATCH;
543 	if (!svc_sendreply(xprt,  xdr_COMPOUND4res_srv, (char *)resp)) {
544 		DTRACE_PROBE2(nfss__e__minorvers_mismatch,
545 		    SVCXPRT *, xprt, char *, resp);
546 		svcerr_systemerr(xprt);
547 		err = 1;
548 	}
549 	rfs4_compound_free(resp);
550 	return (err);
551 }
552 
553 /*
554  * Test minor version against allowed minor versions.
555  */
556 static inline bool_t
557 rfs4_minorversion_enabled(uint32_t minorversion)
558 {
559 	return (minorversion <= nfs4_get_srv()->nfs4_minor_max);
560 }
561 
562 bool_t
563 rfs4_minorvers_mismatch(struct svc_req *req, SVCXPRT *xprt, void *args)
564 {
565 	COMPOUND4args *argsp;
566 
567 	if (req->rq_vers != 4)
568 		return (FALSE);
569 
570 	argsp = (COMPOUND4args *)args;
571 
572 	if (rfs4_minorversion_enabled(argsp->minorversion))
573 		return (FALSE);
574 
575 	(void) rfs4_send_minor_mismatch(xprt, argsp);
576 	return (TRUE);
577 }
578 
579 void
580 rfs4_resource_err(struct svc_req *req, COMPOUND4args *argsp)
581 {
582 	COMPOUND4res res_buf, *rbp;
583 	nfs_resop4 *resop;
584 	PUTFH4res *resp;
585 
586 	rbp = &res_buf;
587 
588 	/*
589 	 * Form a reply tag by copying over the request tag.
590 	 */
591 	rbp->tag.utf8string_len = argsp->tag.utf8string_len;
592 	if (argsp->tag.utf8string_len != 0) {
593 		rbp->tag.utf8string_val =
594 		    kmem_alloc(argsp->tag.utf8string_len, KM_SLEEP);
595 		bcopy(argsp->tag.utf8string_val, rbp->tag.utf8string_val,
596 		    rbp->tag.utf8string_len);
597 	} else {
598 		rbp->tag.utf8string_val = NULL;
599 	}
600 
601 	rbp->array_len = 1;
602 	rbp->array = kmem_zalloc(rbp->array_len * sizeof (nfs_resop4),
603 	    KM_SLEEP);
604 	resop = &rbp->array[0];
605 	resop->resop = argsp->array[0].argop;	/* copy first op over */
606 
607 	/* Any op will do, just need to access status field */
608 	resp = &resop->nfs_resop4_u.opputfh;
609 
610 	/*
611 	 * NFS4ERR_RESOURCE is allowed for all ops, except OP_ILLEGAL.
612 	 * Note that all op numbers in the compound array were already
613 	 * validated by the XDR decoder (xdr_COMPOUND4args_srv()).
614 	 */
615 	resp->status = (resop->resop == OP_ILLEGAL ?
616 	    NFS4ERR_OP_ILLEGAL : NFS4ERR_RESOURCE);
617 
618 	/* compound status is same as first op status */
619 	rbp->status = resp->status;
620 
621 	if (!svc_sendreply(req->rq_xprt, xdr_COMPOUND4res_srv, (char *)rbp)) {
622 		DTRACE_PROBE2(nfss__rsrc_err__sendfail,
623 		    struct svc_req *, req->rq_xprt, char *, rbp);
624 		svcerr_systemerr(req->rq_xprt);
625 	}
626 
627 	UTF8STRING_FREE(rbp->tag);
628 	kmem_free(rbp->array, rbp->array_len * sizeof (nfs_resop4));
629 }
630 
631 int
632 rfs4_dispatch(struct rpcdisp *disp __unused, struct svc_req *req,
633     SVCXPRT *xprt, char *ap)
634 {
635 	COMPOUND4args	*cmp;
636 
637 	/*
638 	 * Handle the NULL Proc here
639 	 */
640 	if (req->rq_proc == RFS_NULL) {
641 		return (!svc_sendreply(xprt, xdr_void, NULL));
642 	}
643 
644 	cmp = (COMPOUND4args *)ap;
645 	ASSERT(cmp != NULL);
646 
647 	if (!rfs4_minorversion_enabled(cmp->minorversion))
648 		return (rfs4_send_minor_mismatch(xprt, cmp));
649 
650 	if (cmp->minorversion == 0)
651 		return (rfs40_dispatch(req, xprt, ap));
652 
653 	return (rfs4x_dispatch(req, xprt, ap));
654 }
655