xref: /titanic_52/usr/src/uts/common/fs/nfs/nfs4_dispatch.c (revision 22e19ac1a2d512ea8d74e4f3662c08787d0716b1)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/systm.h>
30 #include <sys/sdt.h>
31 #include <rpc/types.h>
32 #include <rpc/auth.h>
33 #include <rpc/auth_unix.h>
34 #include <rpc/auth_des.h>
35 #include <rpc/svc.h>
36 #include <rpc/xdr.h>
37 #include <nfs/nfs4.h>
38 #include <nfs/nfs_dispatch.h>
39 #include <nfs/nfs4_drc.h>
40 
/*
 * Highest NFSv4 minor version accepted here.  rfs4_minorvers_mismatch()
 * answers any version 4 COMPOUND whose minorversion is greater than this
 * with NFS4ERR_MINOR_VERS_MISMATCH.
 */
#define	NFS4_MAX_MINOR_VERSION	0

/*
 * This is the duplicate request cache for NFSv4.  rfs4_dispatch() passes
 * it to rfs4_find_dr() for every COMPOUND that is not flagged idempotent.
 * It is initialized to NULL here and is not assigned in this file --
 * presumably it is set from rfs4_init_drc() at server start; verify
 * against the caller.
 */
rfs4_drc_t *nfs4_drc = NULL;

/*
 * The default size of the duplicate request cache (maximum number of
 * entries).  Not referenced in this file; presumably handed to
 * rfs4_init_drc() as drc_size -- TODO confirm.
 */
uint32_t nfs4_drc_max = 8 * 1024;

/*
 * The number of buckets we'd like to hash the
 * replies into.. do not change this on the fly.
 * Not referenced in this file; presumably handed to rfs4_init_drc()
 * as drc_hash_size -- TODO confirm.
 */
uint32_t nfs4_drc_hash = 541;
58 
59 /*
60  * Initialize a duplicate request cache.
61  */
62 rfs4_drc_t *
63 rfs4_init_drc(uint32_t drc_size, uint32_t drc_hash_size)
64 {
65 	rfs4_drc_t *drc;
66 	uint32_t   bki;
67 
68 	ASSERT(drc_size);
69 	ASSERT(drc_hash_size);
70 
71 	drc = kmem_alloc(sizeof (rfs4_drc_t), KM_SLEEP);
72 
73 	drc->max_size = drc_size;
74 	drc->in_use = 0;
75 
76 	mutex_init(&drc->lock, NULL, MUTEX_DEFAULT, NULL);
77 
78 	drc->dr_hash = drc_hash_size;
79 
80 	drc->dr_buckets = kmem_alloc(sizeof (list_t)*drc_hash_size, KM_SLEEP);
81 
82 	for (bki = 0; bki < drc_hash_size; bki++) {
83 		list_create(&drc->dr_buckets[bki], sizeof (rfs4_dupreq_t),
84 		    offsetof(rfs4_dupreq_t, dr_bkt_next));
85 	}
86 
87 	list_create(&(drc->dr_cache), sizeof (rfs4_dupreq_t),
88 		    offsetof(rfs4_dupreq_t, dr_next));
89 
90 	return (drc);
91 }
92 
93 /*
94  * Destroy a duplicate request cache.
95  */
96 void
97 rfs4_fini_drc(rfs4_drc_t *drc)
98 {
99 	rfs4_dupreq_t *drp, *drp_next;
100 
101 	ASSERT(drc);
102 
103 	/* iterate over the dr_cache and free the enties */
104 	for (drp = list_head(&(drc->dr_cache)); drp != NULL; drp = drp_next) {
105 
106 		if (drp->dr_state == NFS4_DUP_REPLAY)
107 			rfs4_compound_free(&(drp->dr_res));
108 
109 		if (drp->dr_addr.buf != NULL)
110 			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
111 
112 		drp_next = list_next(&(drc->dr_cache), drp);
113 
114 		kmem_free(drp, sizeof (rfs4_dupreq_t));
115 	}
116 
117 	mutex_destroy(&drc->lock);
118 	kmem_free(drc->dr_buckets,
119 		sizeof (list_t)*drc->dr_hash);
120 	kmem_free(drc, sizeof (rfs4_drc_t));
121 }
122 
123 /*
124  * rfs4_dr_chstate:
125  *
126  * Change the state of a rfs4_dupreq. If it's not in transition
127  * to the FREE state, update the time used and return. If we
128  * are moving to the FREE state then we need to clean up the
129  * compound results and move the entry to the end of the list.
130  */
131 void
132 rfs4_dr_chstate(rfs4_dupreq_t *drp, int new_state)
133 {
134 	rfs4_drc_t *drc;
135 
136 	ASSERT(drp);
137 	ASSERT(drp->drc);
138 	ASSERT(drp->dr_bkt);
139 	ASSERT(MUTEX_HELD(&drp->drc->lock));
140 
141 	drp->dr_state = new_state;
142 
143 	if (new_state != NFS4_DUP_FREE) {
144 		gethrestime(&drp->dr_time_used);
145 		return;
146 	}
147 
148 	drc = drp->drc;
149 
150 	/*
151 	 * Remove entry from the bucket and
152 	 * dr_cache list, free compound results.
153 	 */
154 	list_remove(drp->dr_bkt, drp);
155 	list_remove(&(drc->dr_cache), drp);
156 	rfs4_compound_free(&(drp->dr_res));
157 }
158 
159 /*
160  * rfs4_alloc_dr:
161  *
162  * Malloc a new one if we have not reached our maximum cache
163  * limit, otherwise pick an entry off the tail -- Use if it
164  * is marked as NFS4_DUP_FREE, or is an entry in the
165  * NFS4_DUP_REPLAY state.
166  */
167 rfs4_dupreq_t *
168 rfs4_alloc_dr(rfs4_drc_t *drc)
169 {
170 	rfs4_dupreq_t *drp_tail, *drp = NULL;
171 
172 	ASSERT(drc);
173 	ASSERT(MUTEX_HELD(&drc->lock));
174 
175 	/*
176 	 * Have we hit the cache limit yet ?
177 	 */
178 	if (drc->in_use < drc->max_size) {
179 		/*
180 		 * nope, so let's malloc a new one
181 		 */
182 		drp = kmem_zalloc(sizeof (rfs4_dupreq_t), KM_SLEEP);
183 		drp->drc = drc;
184 		drc->in_use++;
185 		gethrestime(&drp->dr_time_created);
186 		DTRACE_PROBE1(nfss__i__drc_new, rfs4_dupreq_t *, drp);
187 		return (drp);
188 	}
189 
190 	/*
191 	 * Cache is all allocated now traverse the list
192 	 * backwards to find one we can reuse.
193 	 */
194 	for (drp_tail = list_tail(&drc->dr_cache); drp_tail != NULL;
195 	    drp_tail = list_prev(&drc->dr_cache, drp_tail)) {
196 
197 		switch (drp_tail->dr_state) {
198 
199 		case NFS4_DUP_FREE:
200 			list_remove(&(drc->dr_cache), drp_tail);
201 			DTRACE_PROBE1(nfss__i__drc_freeclaim,
202 					rfs4_dupreq_t *, drp_tail);
203 			return (drp_tail);
204 			/* NOTREACHED */
205 
206 		case NFS4_DUP_REPLAY:
207 			/* grab it. */
208 			rfs4_dr_chstate(drp_tail, NFS4_DUP_FREE);
209 			DTRACE_PROBE1(nfss__i__drc_replayclaim,
210 					rfs4_dupreq_t *, drp_tail);
211 			return (drp_tail);
212 			/* NOTREACHED */
213 		}
214 	}
215 	DTRACE_PROBE1(nfss__i__drc_full, rfs4_drc_t *, drc);
216 	return (NULL);
217 }
218 
219 /*
220  * rfs4_find_dr:
221  *
222  * Search for an entry in the duplicate request cache by
223  * calculating the hash index based on the XID, and examining
224  * the entries in the hash bucket. If we find a match stamp the
225  * time_used and return. If the entry does not match it could be
226  * ready to be freed. Once we have searched the bucket we call
227  * rfs4_alloc_dr() to allocate a new entry, or reuse one that is
228  * available.
229  */
230 int
231 rfs4_find_dr(struct svc_req *req, rfs4_drc_t *drc, rfs4_dupreq_t **dup)
232 {
233 
234 	uint32_t	the_xid;
235 	list_t		*dr_bkt;
236 	rfs4_dupreq_t	*drp;
237 	int		bktdex;
238 
239 	/*
240 	 * Get the XID, calculate the bucket and search to
241 	 * see if we need to replay from the cache.
242 	 */
243 	the_xid = req->rq_xprt->xp_xid;
244 	bktdex = the_xid % drc->dr_hash;
245 
246 	dr_bkt = (list_t *)
247 		&(drc->dr_buckets[(the_xid % drc->dr_hash)]);
248 
249 	DTRACE_PROBE3(nfss__i__drc_bktdex,
250 			int, bktdex,
251 			uint32_t, the_xid,
252 			list_t *, dr_bkt);
253 
254 	*dup = NULL;
255 
256 	mutex_enter(&drc->lock);
257 	/*
258 	 * Search the bucket for a matching xid and address.
259 	 */
260 	for (drp = list_head(dr_bkt); drp != NULL;
261 		drp = list_next(dr_bkt, drp)) {
262 
263 		if (drp->dr_xid == the_xid &&
264 		    drp->dr_addr.len == req->rq_xprt->xp_rtaddr.len &&
265 		    bcmp((caddr_t)drp->dr_addr.buf,
266 		    (caddr_t)req->rq_xprt->xp_rtaddr.buf,
267 		    drp->dr_addr.len) == 0) {
268 
269 			/*
270 			 * Found a match so REPLAY the Reply
271 			 */
272 			if (drp->dr_state == NFS4_DUP_REPLAY) {
273 				rfs4_dr_chstate(drp, NFS4_DUP_INUSE);
274 				mutex_exit(&drc->lock);
275 				*dup = drp;
276 				DTRACE_PROBE1(nfss__i__drc_replay,
277 					rfs4_dupreq_t *, drp);
278 				return (NFS4_DUP_REPLAY);
279 			}
280 
281 			/*
282 			 * This entry must be in transition, so return
283 			 * the 'pending' status.
284 			 */
285 			mutex_exit(&drc->lock);
286 			return (NFS4_DUP_PENDING);
287 		}
288 
289 		/*
290 		 * Not a match, but maybe this entry is okay
291 		 * to be reused.
292 		 */
293 		if (drp->dr_state == NFS4_DUP_REPLAY) {
294 			rfs4_dr_chstate(drp, NFS4_DUP_FREE);
295 			list_insert_tail(&(drp->drc->dr_cache), drp);
296 		}
297 	}
298 
299 	drp = rfs4_alloc_dr(drc);
300 	mutex_exit(&drc->lock);
301 
302 	/*
303 	 * The DRC is full and all entries are in use. Upper function
304 	 * should error out this request and force the client to
305 	 * retransmit -- effectively this is a resource issue. NFSD
306 	 * threads tied up with native File System, or the cache size
307 	 * is too small for the server load.
308 	 */
309 	if (drp == NULL)
310 		return (NFS4_DUP_ERROR);
311 
312 	/*
313 	 * Init the state to NEW and clear the time used field.
314 	 */
315 	drp->dr_state = NFS4_DUP_NEW;
316 	drp->dr_time_used.tv_sec = drp->dr_time_used.tv_nsec = 0;
317 
318 	/*
319 	 * If needed, resize the address buffer
320 	 */
321 	if (drp->dr_addr.maxlen < req->rq_xprt->xp_rtaddr.len) {
322 		if (drp->dr_addr.buf != NULL)
323 			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
324 		drp->dr_addr.maxlen = req->rq_xprt->xp_rtaddr.len;
325 		drp->dr_addr.buf = kmem_alloc(drp->dr_addr.maxlen, KM_NOSLEEP);
326 		if (drp->dr_addr.buf == NULL) {
327 			/*
328 			 * If the malloc fails, mark the entry
329 			 * as free and put on the tail.
330 			 */
331 			drp->dr_addr.maxlen = 0;
332 			drp->dr_state = NFS4_DUP_FREE;
333 			mutex_enter(&drc->lock);
334 			list_insert_tail(&(drc->dr_cache), drp);
335 			mutex_exit(&drc->lock);
336 			return (NFS4_DUP_ERROR);
337 		}
338 	}
339 
340 
341 	/*
342 	 * Copy the address.
343 	 */
344 	drp->dr_addr.len = req->rq_xprt->xp_rtaddr.len;
345 
346 	bcopy((caddr_t)req->rq_xprt->xp_rtaddr.buf,
347 		(caddr_t)drp->dr_addr.buf,
348 		drp->dr_addr.len);
349 
350 	drp->dr_xid = the_xid;
351 	drp->dr_bkt = dr_bkt;
352 
353 	/*
354 	 * Insert at the head of the bucket and
355 	 * the drc lists..
356 	 */
357 	mutex_enter(&drc->lock);
358 	list_insert_head(&drc->dr_cache, drp);
359 	list_insert_head(dr_bkt, drp);
360 	mutex_exit(&drc->lock);
361 
362 	*dup = drp;
363 
364 	return (NFS4_DUP_NEW);
365 }
366 
367 /*
368  *
369  * This function handles the duplicate request cache,
370  * NULL_PROC and COMPOUND procedure calls for NFSv4;
371  *
372  * Passed into this function are:-
373  *
374  * 	disp	A pointer to our dispatch table entry
375  * 	req	The request to process
376  * 	xprt	The server transport handle
377  * 	ap	A pointer to the arguments
378  *
379  *
380  * When appropriate this function is responsible for inserting
381  * the reply into the duplicate cache or replaying an existing
382  * cached reply.
383  *
384  * dr_stat 	reflects the state of the duplicate request that
385  * 		has been inserted into or retrieved from the cache
386  *
387  * drp		is the duplicate request entry
388  *
389  */
390 int
391 rfs4_dispatch(struct rpcdisp *disp, struct svc_req *req,
392 		SVCXPRT *xprt, char *ap)
393 {
394 
395 	COMPOUND4res res_buf, *rbp;
396 	COMPOUND4args *cap;
397 
398 	cred_t 	*cr = NULL;
399 	int	error = 0;
400 	int 	dis_flags = 0;
401 	int 	dr_stat = NFS4_NOT_DUP;
402 	rfs4_dupreq_t *drp = NULL;
403 
404 	ASSERT(disp);
405 
406 	/*
407 	 * Short circuit the RPC_NULL proc.
408 	 */
409 	if (disp->dis_proc == rpc_null) {
410 		if (!svc_sendreply(xprt, xdr_void, NULL)) {
411 			return (1);
412 		}
413 		return (0);
414 	}
415 
416 	/* Only NFSv4 Compounds from this point onward */
417 
418 	rbp = &res_buf;
419 	cap = (COMPOUND4args *)ap;
420 
421 	/*
422 	 * Figure out the disposition of the whole COMPOUND
423 	 * and record it's IDEMPOTENTCY.
424 	 */
425 	rfs4_compound_flagproc(cap, &dis_flags);
426 
427 	/*
428 	 * If NON-IDEMPOTENT then we need to figure out if this
429 	 * request can be replied from the duplicate cache.
430 	 *
431 	 * If this is a new request then we need to insert the
432 	 * reply into the duplicate cache.
433 	 */
434 	if (!(dis_flags & RPC_IDEMPOTENT)) {
435 		/* look for a replay from the cache or allocate */
436 		dr_stat = rfs4_find_dr(req, nfs4_drc, &drp);
437 
438 		switch (dr_stat) {
439 
440 		case NFS4_DUP_ERROR:
441 			svcerr_systemerr(xprt);
442 			return (1);
443 			/* NOTREACHED */
444 
445 		case NFS4_DUP_PENDING:
446 			/*
447 			 * reply has previously been inserted into the
448 			 * duplicate cache, however the reply has
449 			 * not yet been sent via svc_sendreply()
450 			 */
451 			return (1);
452 			/* NOTREACHED */
453 
454 		case NFS4_DUP_NEW:
455 			curthread->t_flag |= T_DONTPEND;
456 			/* NON-IDEMPOTENT proc call */
457 			rfs4_compound(cap, rbp, NULL, req, cr);
458 
459 			curthread->t_flag &= ~T_DONTPEND;
460 
461 			/*
462 			 * dr_res must be initialized before calling
463 			 * rfs4_dr_chstate (it frees the reply).
464 			 */
465 			drp->dr_res = res_buf;
466 			if (curthread->t_flag & T_WOULDBLOCK) {
467 				curthread->t_flag &= ~T_WOULDBLOCK;
468 				/*
469 				 * mark this entry as FREE and plop
470 				 * on the end of the cache list
471 				 */
472 				mutex_enter(&drp->drc->lock);
473 				rfs4_dr_chstate(drp, NFS4_DUP_FREE);
474 				list_insert_tail(&(drp->drc->dr_cache), drp);
475 				mutex_exit(&drp->drc->lock);
476 				return (1);
477 			}
478 			break;
479 
480 		case NFS4_DUP_REPLAY:
481 			/* replay from the cache */
482 			rbp = &(drp->dr_res);
483 			break;
484 		}
485 	} else {
486 		curthread->t_flag |= T_DONTPEND;
487 		/* IDEMPOTENT proc call */
488 		rfs4_compound(cap, rbp, NULL, req, cr);
489 
490 		curthread->t_flag &= ~T_DONTPEND;
491 		if (curthread->t_flag & T_WOULDBLOCK) {
492 			curthread->t_flag &= ~T_WOULDBLOCK;
493 			return (1);
494 		}
495 	}
496 
497 	/*
498 	 * Send out the replayed reply or the 'real' one.
499 	 */
500 	if (!svc_sendreply(xprt,  xdr_COMPOUND4res_srv, (char *)rbp)) {
501 		DTRACE_PROBE2(nfss__e__dispatch_sendfail,
502 			struct svc_req *, xprt,
503 			char *, rbp);
504 		error++;
505 	}
506 
507 	/*
508 	 * If this reply was just inserted into the duplicate cache
509 	 * or it was replayed from the dup cache; (re)mark it as
510 	 * available for replay
511 	 *
512 	 * At first glance, this 'if' statement seems a little strange;
513 	 * testing for NFS4_DUP_REPLAY, and then calling...
514 	 *
515 	 *	rfs4_dr_chatate(NFS4_DUP_REPLAY)
516 	 *
517 	 * ... but notice that we are checking dr_stat, and not the
518 	 * state of the entry itself, the entry will be NFS4_DUP_INUSE,
519 	 * we do that so that we know not to prematurely reap it whilst
520 	 * we resent it to the client.
521 	 *
522 	 */
523 	if (dr_stat == NFS4_DUP_NEW || dr_stat == NFS4_DUP_REPLAY) {
524 		mutex_enter(&drp->drc->lock);
525 		rfs4_dr_chstate(drp, NFS4_DUP_REPLAY);
526 		mutex_exit(&drp->drc->lock);
527 	} else if (dr_stat == NFS4_NOT_DUP) {
528 		rfs4_compound_free(rbp);
529 	}
530 
531 	return (error);
532 }
533 
534 bool_t
535 rfs4_minorvers_mismatch(struct svc_req *req, SVCXPRT *xprt, void *args)
536 {
537 	COMPOUND4args *argsp;
538 	COMPOUND4res res_buf, *resp;
539 
540 	if (req->rq_vers != 4)
541 		return (FALSE);
542 
543 	argsp = (COMPOUND4args *)args;
544 
545 	if (argsp->minorversion <= NFS4_MAX_MINOR_VERSION)
546 		return (FALSE);
547 
548 	resp = &res_buf;
549 
550 	/*
551 	 * Form a reply tag by copying over the reqeuest tag.
552 	 */
553 	resp->tag.utf8string_val =
554 	    kmem_alloc(argsp->tag.utf8string_len, KM_SLEEP);
555 	resp->tag.utf8string_len = argsp->tag.utf8string_len;
556 	bcopy(argsp->tag.utf8string_val, resp->tag.utf8string_val,
557 	    resp->tag.utf8string_len);
558 	resp->array_len = 0;
559 	resp->array = NULL;
560 	resp->status = NFS4ERR_MINOR_VERS_MISMATCH;
561 	if (!svc_sendreply(xprt,  xdr_COMPOUND4res_srv, (char *)resp)) {
562 		DTRACE_PROBE2(nfss__e__minorvers_mismatch,
563 		    SVCXPRT *, xprt, char *, resp);
564 	}
565 	rfs4_compound_free(resp);
566 	return (TRUE);
567 }
568