xref: /illumos-gate/usr/src/uts/common/rpc/xdr_rdma.c (revision 20a7641f9918de8574b8b3b47dbe35c4bfc78df1)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  * Copyright (c) 2007, The Ohio State University. All rights reserved.
27  *
28  * Portions of this source code is developed by the team members of
29  * The Ohio State University's Network-Based Computing Laboratory (NBCL),
30  * headed by Professor Dhabaleswar K. (DK) Panda.
31  *
32  * Acknowledgements to contributions from developors:
33  *   Ranjit Noronha: noronha@cse.ohio-state.edu
34  *   Lei Chai      : chail@cse.ohio-state.edu
35  *   Weikuan Yu    : yuw@cse.ohio-state.edu
36  *
37  */
38 
39 /*
40  * xdr_rdma.c, XDR implementation using RDMA to move large chunks
41  */
42 
43 #include <sys/param.h>
44 #include <sys/types.h>
45 #include <sys/systm.h>
46 #include <sys/kmem.h>
47 #include <sys/sdt.h>
48 #include <sys/debug.h>
49 
50 #include <rpc/types.h>
51 #include <rpc/xdr.h>
52 #include <sys/cmn_err.h>
53 #include <rpc/rpc_sztypes.h>
54 #include <rpc/rpc_rdma.h>
55 #include <sys/sysmacros.h>
56 
/*
 * Allowance, in bytes, for the RPC header and the xdr encoding overhead.
 * The value was found by tracing msglen in svc_rdma_ksend for sec=sys,
 * krb5, krb5i and krb5p.  When XDR_RDMA_BUF_OVERHEAD is too small the
 * server fires the dtrace probe "krpc-e-svcrdma-ksend-noreplycl" from
 * svc_rdma_ksend.
 */
#define	XDR_RDMA_BUF_OVERHEAD	300
65 
66 static bool_t   xdrrdma_getint32(XDR *, int32_t *);
67 static bool_t   xdrrdma_putint32(XDR *, int32_t *);
68 static bool_t   xdrrdma_getbytes(XDR *, caddr_t, int);
69 static bool_t   xdrrdma_putbytes(XDR *, caddr_t, int);
70 uint_t		xdrrdma_getpos(XDR *);
71 bool_t		xdrrdma_setpos(XDR *, uint_t);
72 static rpc_inline_t *xdrrdma_inline(XDR *, int);
73 void		xdrrdma_destroy(XDR *);
74 static bool_t   xdrrdma_control(XDR *, int, void *);
75 static bool_t  xdrrdma_read_a_chunk(XDR *, CONN **);
76 static void xdrrdma_free_xdr_chunks(CONN *, struct clist *);
77 
78 struct xdr_ops  xdrrdmablk_ops = {
79 	xdrrdma_getbytes,
80 	xdrrdma_putbytes,
81 	xdrrdma_getpos,
82 	xdrrdma_setpos,
83 	xdrrdma_inline,
84 	xdrrdma_destroy,
85 	xdrrdma_control,
86 	xdrrdma_getint32,
87 	xdrrdma_putint32
88 };
89 
90 struct xdr_ops  xdrrdma_ops = {
91 	xdrrdma_getbytes,
92 	xdrrdma_putbytes,
93 	xdrrdma_getpos,
94 	xdrrdma_setpos,
95 	xdrrdma_inline,
96 	xdrrdma_destroy,
97 	xdrrdma_control,
98 	xdrrdma_getint32,
99 	xdrrdma_putint32
100 };
101 
102 /*
103  * A chunk list entry identifies a chunk of opaque data to be moved
104  * separately from the rest of the RPC message. xp_min_chunk = 0, is a
105  * special case for ENCODING, which means do not chunk the incoming stream of
106  * data.
107  *
108  * A read chunk can contain part of the RPC message in addition to the
109  * inline message. In such a case, (xp_offp - x_base) will not provide
110  * the correct xdr offset of the entire message. xp_off is used in such
111  * a case to denote the offset or current position in the overall message
112  * covering both the inline and the chunk. This is used only in the case
113  * of decoding and useful to compare read chunk 'c_xdroff' offsets.
114  *
115  * An example for a read chunk containing an XDR message:
116  * An NFSv4 compound as following:
117  *
118  * PUTFH
119  * WRITE [4109 bytes]
120  * GETATTR
121  *
122  * Solaris Encoding is:
123  * -------------------
124  *
125  * <Inline message>: [PUTFH WRITE4args GETATTR]
126  *                                   |
127  *                                   v
128  * [RDMA_READ chunks]:               [write data]
129  *
130  *
131  * Linux encoding is:
132  * -----------------
133  *
134  * <Inline message>: [PUTFH WRITE4args]
135  *                                    |
136  *                                    v
137  * [RDMA_READ chunks]:                [Write data] [Write data2] [Getattr chunk]
138  *                                     chunk1       chunk2         chunk3
139  *
140  * where the READ chunks are as:
141  *
142  *             - chunk1 - 4k
143  * write data |
144  *             - chunk2 - 13 bytes(4109 - 4k)
145  * getattr op  - chunk3 - 19 bytes
146  * (getattr op starts at byte 4 after 3 bytes of roundup)
147  *
148  */
149 
150 typedef struct {
151 	caddr_t		xp_offp;
152 	int		xp_min_chunk;
153 	uint_t		xp_flags;	/* Controls setting for rdma xdr */
154 	int		xp_buf_size;	/* size of xdr buffer */
155 	int		xp_off;		/* overall offset */
156 	struct clist	*xp_rcl;	/* head of chunk list */
157 	struct clist	**xp_rcl_next;	/* location to place/find next chunk */
158 	struct clist	*xp_rcl_xdr;	/* copy of rcl containing RPC message */
159 	struct clist	*xp_wcl;	/* head of write chunk list */
160 	CONN		*xp_conn;	/* connection for chunk data xfer */
161 	uint_t		xp_reply_chunk_len;
162 	/* used to track length for security modes: integrity/privacy */
163 	uint_t		xp_reply_chunk_len_alt;
164 } xrdma_private_t;
165 
166 extern kmem_cache_t *clist_cache;
167 
168 bool_t
169 xdrrdma_getrdmablk(XDR *xdrs, struct clist **rlist, uint_t *sizep,
170     CONN **conn, const uint_t maxsize)
171 {
172 	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);
173 	struct clist	*cle = *(xdrp->xp_rcl_next);
174 	struct clist	*rdclist = NULL, *prev = NULL;
175 	bool_t		retval = TRUE;
176 	uint32_t	cur_offset = 0;
177 	uint32_t	total_segments = 0;
178 	uint32_t	actual_segments = 0;
179 	uint32_t	alen;
180 	uint_t		total_len;
181 
182 	ASSERT(xdrs->x_op != XDR_FREE);
183 
184 	/*
185 	 * first deal with the length since xdr bytes are counted
186 	 */
187 	if (!xdr_u_int(xdrs, sizep)) {
188 		DTRACE_PROBE(xdr__e__getrdmablk_sizep_fail);
189 		return (FALSE);
190 	}
191 	total_len = *sizep;
192 	if (total_len > maxsize) {
193 		DTRACE_PROBE2(xdr__e__getrdmablk_bad_size,
194 		    int, total_len, int, maxsize);
195 		return (FALSE);
196 	}
197 	(*conn) = xdrp->xp_conn;
198 
199 	/*
200 	 * if no data we are done
201 	 */
202 	if (total_len == 0)
203 		return (TRUE);
204 
205 	while (cle) {
206 		total_segments++;
207 		cle = cle->c_next;
208 	}
209 
210 	cle = *(xdrp->xp_rcl_next);
211 
212 	/*
213 	 * If there was a chunk at the current offset, then setup a read
214 	 * chunk list which records the destination address and length
215 	 * and will RDMA READ the data in later.
216 	 */
217 	if (cle == NULL)
218 		return (FALSE);
219 
220 	if (cle->c_xdroff != (xdrp->xp_offp - xdrs->x_base))
221 		return (FALSE);
222 
223 	/*
224 	 * Setup the chunk list with appropriate
225 	 * address (offset) and length
226 	 */
227 	for (actual_segments = 0;
228 	    actual_segments < total_segments; actual_segments++) {
229 
230 		DTRACE_PROBE3(krpc__i__xdrrdma_getrdmablk, uint32_t, cle->c_len,
231 		    uint32_t, total_len, uint32_t, cle->c_xdroff);
232 
233 		if (total_len <= 0)
234 			break;
235 
236 		/*
237 		 * not the first time in the loop
238 		 */
239 		if (actual_segments > 0)
240 			cle = cle->c_next;
241 
242 		cle->u.c_daddr = (uint64) cur_offset;
243 		alen = 0;
244 		if (cle->c_len > total_len) {
245 			alen = cle->c_len;
246 			cle->c_len = total_len;
247 		}
248 		if (!alen)
249 			xdrp->xp_rcl_next = &cle->c_next;
250 
251 		cur_offset += cle->c_len;
252 		total_len -= cle->c_len;
253 
254 		if ((total_segments - actual_segments - 1) == 0 &&
255 		    total_len > 0) {
256 			DTRACE_PROBE(krpc__e__xdrrdma_getblk_chunktooshort);
257 			retval = FALSE;
258 		}
259 
260 		if ((total_segments - actual_segments - 1) > 0 &&
261 		    total_len == 0) {
262 			DTRACE_PROBE2(krpc__e__xdrrdma_getblk_toobig,
263 			    int, total_segments, int, actual_segments);
264 		}
265 
266 		rdclist = clist_alloc();
267 		(*rdclist) = (*cle);
268 		if ((*rlist) == NULL)
269 			(*rlist) = rdclist;
270 		if (prev == NULL)
271 			prev = rdclist;
272 		else {
273 			prev->c_next = rdclist;
274 			prev = rdclist;
275 		}
276 
277 	}
278 
279 out:
280 	if (prev != NULL)
281 		prev->c_next = NULL;
282 
283 	/*
284 	 * Adjust the chunk length, if we read only a part of
285 	 * a chunk.
286 	 */
287 
288 	if (alen) {
289 		cle->w.c_saddr =
290 		    (uint64)(uintptr_t)cle->w.c_saddr + cle->c_len;
291 		cle->c_len = alen - cle->c_len;
292 	}
293 
294 	return (retval);
295 }
296 
297 /*
298  * The procedure xdrrdma_create initializes a stream descriptor for a memory
299  * buffer.
300  */
301 void
302 xdrrdma_create(XDR *xdrs, caddr_t addr, uint_t size,
303     int min_chunk, struct clist *cl, enum xdr_op op, CONN *conn)
304 {
305 	xrdma_private_t *xdrp;
306 	struct clist   *cle;
307 
308 	xdrs->x_op = op;
309 	xdrs->x_ops = &xdrrdma_ops;
310 	xdrs->x_base = addr;
311 	xdrs->x_handy = size;
312 	xdrs->x_public = NULL;
313 
314 	xdrp = (xrdma_private_t *)kmem_zalloc(sizeof (xrdma_private_t),
315 	    KM_SLEEP);
316 	xdrs->x_private = (caddr_t)xdrp;
317 	xdrp->xp_offp = addr;
318 	xdrp->xp_min_chunk = min_chunk;
319 	xdrp->xp_flags = 0;
320 	xdrp->xp_buf_size = size;
321 	xdrp->xp_rcl = cl;
322 	xdrp->xp_reply_chunk_len = 0;
323 	xdrp->xp_reply_chunk_len_alt = 0;
324 
325 	if (op == XDR_ENCODE && cl != NULL) {
326 		/* Find last element in chunk list and set xp_rcl_next */
327 		for (cle = cl; cle->c_next != NULL; cle = cle->c_next)
328 			continue;
329 
330 		xdrp->xp_rcl_next = &(cle->c_next);
331 	} else {
332 		xdrp->xp_rcl_next = &(xdrp->xp_rcl);
333 	}
334 
335 	xdrp->xp_wcl = NULL;
336 
337 	xdrp->xp_conn = conn;
338 	if (xdrp->xp_min_chunk != 0)
339 		xdrp->xp_flags |= XDR_RDMA_CHUNK;
340 }
341 
342 /* ARGSUSED */
343 void
344 xdrrdma_destroy(XDR * xdrs)
345 {
346 	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);
347 
348 	if (xdrp == NULL)
349 		return;
350 
351 	if (xdrp->xp_wcl) {
352 		if (xdrp->xp_flags & XDR_RDMA_WLIST_REG) {
353 			(void) clist_deregister(xdrp->xp_conn, xdrp->xp_wcl);
354 			rdma_buf_free(xdrp->xp_conn,
355 			    &xdrp->xp_wcl->rb_longbuf);
356 		}
357 		clist_free(xdrp->xp_wcl);
358 	}
359 
360 	if (xdrp->xp_rcl) {
361 		if (xdrp->xp_flags & XDR_RDMA_RLIST_REG) {
362 			(void) clist_deregister(xdrp->xp_conn, xdrp->xp_rcl);
363 			rdma_buf_free(xdrp->xp_conn,
364 			    &xdrp->xp_rcl->rb_longbuf);
365 		}
366 		clist_free(xdrp->xp_rcl);
367 	}
368 
369 	if (xdrp->xp_rcl_xdr)
370 		xdrrdma_free_xdr_chunks(xdrp->xp_conn, xdrp->xp_rcl_xdr);
371 
372 	(void) kmem_free(xdrs->x_private, sizeof (xrdma_private_t));
373 	xdrs->x_private = NULL;
374 }
375 
376 static	bool_t
377 xdrrdma_getint32(XDR *xdrs, int32_t *int32p)
378 {
379 	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);
380 	int chunked = 0;
381 
382 	if ((xdrs->x_handy -= (int)sizeof (int32_t)) < 0) {
383 		/*
384 		 * check if rest of the rpc message is in a chunk
385 		 */
386 		if (!xdrrdma_read_a_chunk(xdrs, &xdrp->xp_conn)) {
387 			return (FALSE);
388 		}
389 		chunked = 1;
390 	}
391 
392 	/* LINTED pointer alignment */
393 	*int32p = (int32_t)ntohl((uint32_t)(*((int32_t *)(xdrp->xp_offp))));
394 
395 	DTRACE_PROBE1(krpc__i__xdrrdma_getint32, int32_t, *int32p);
396 
397 	xdrp->xp_offp += sizeof (int32_t);
398 
399 	if (chunked)
400 		xdrs->x_handy -= (int)sizeof (int32_t);
401 
402 	if (xdrp->xp_off != 0) {
403 		xdrp->xp_off += sizeof (int32_t);
404 	}
405 
406 	return (TRUE);
407 }
408 
409 static	bool_t
410 xdrrdma_putint32(XDR *xdrs, int32_t *int32p)
411 {
412 	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);
413 
414 	if ((xdrs->x_handy -= (int)sizeof (int32_t)) < 0)
415 		return (FALSE);
416 
417 	/* LINTED pointer alignment */
418 	*(int32_t *)xdrp->xp_offp = (int32_t)htonl((uint32_t)(*int32p));
419 	xdrp->xp_offp += sizeof (int32_t);
420 
421 	return (TRUE);
422 }
423 
424 /*
425  * DECODE bytes from XDR stream for rdma.
426  * If the XDR stream contains a read chunk list,
427  * it will go through xdrrdma_getrdmablk instead.
428  */
429 static	bool_t
430 xdrrdma_getbytes(XDR *xdrs, caddr_t addr, int len)
431 {
432 	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);
433 	struct clist	*cle = *(xdrp->xp_rcl_next);
434 	struct clist	*cls = *(xdrp->xp_rcl_next);
435 	struct clist	cl;
436 	bool_t		retval = TRUE;
437 	uint32_t	total_len = len;
438 	uint32_t	cur_offset = 0;
439 	uint32_t	total_segments = 0;
440 	uint32_t	actual_segments = 0;
441 	uint32_t	status = RDMA_SUCCESS;
442 	uint32_t	alen = 0;
443 	uint32_t	xpoff;
444 
445 	while (cle) {
446 		total_segments++;
447 		cle = cle->c_next;
448 	}
449 
450 	cle = *(xdrp->xp_rcl_next);
451 
452 	if (xdrp->xp_off) {
453 		xpoff = xdrp->xp_off;
454 	} else {
455 		xpoff = (xdrp->xp_offp - xdrs->x_base);
456 	}
457 
458 	/*
459 	 * If there was a chunk at the current offset, then setup a read
460 	 * chunk list which records the destination address and length
461 	 * and will RDMA READ the data in later.
462 	 */
463 
464 	if (cle != NULL && cle->c_xdroff == xpoff) {
465 		for (actual_segments = 0;
466 		    actual_segments < total_segments; actual_segments++) {
467 
468 			if (total_len <= 0)
469 				break;
470 
471 			if (status != RDMA_SUCCESS)
472 				goto out;
473 
474 			cle->u.c_daddr = (uint64)(uintptr_t)addr + cur_offset;
475 			alen = 0;
476 			if (cle->c_len > total_len) {
477 				alen = cle->c_len;
478 				cle->c_len = total_len;
479 			}
480 			if (!alen)
481 				xdrp->xp_rcl_next = &cle->c_next;
482 
483 			cur_offset += cle->c_len;
484 			total_len -= cle->c_len;
485 
486 			if ((total_segments - actual_segments - 1) == 0 &&
487 			    total_len > 0) {
488 				DTRACE_PROBE(
489 				    krpc__e__xdrrdma_getbytes_chunktooshort);
490 				retval = FALSE;
491 			}
492 
493 			if ((total_segments - actual_segments - 1) > 0 &&
494 			    total_len == 0) {
495 				DTRACE_PROBE2(krpc__e__xdrrdma_getbytes_toobig,
496 				    int, total_segments, int, actual_segments);
497 			}
498 
499 			/*
500 			 * RDMA READ the chunk data from the remote end.
501 			 * First prep the destination buffer by registering
502 			 * it, then RDMA READ the chunk data. Since we are
503 			 * doing streaming memory, sync the destination
504 			 * buffer to CPU and deregister the buffer.
505 			 */
506 			if (xdrp->xp_conn == NULL) {
507 				return (FALSE);
508 			}
509 			cl = *cle;
510 			cl.c_next = NULL;
511 			status = clist_register(xdrp->xp_conn, &cl,
512 			    CLIST_REG_DST);
513 			if (status != RDMA_SUCCESS) {
514 				retval = FALSE;
515 				/*
516 				 * Deregister the previous chunks
517 				 * before return
518 				 */
519 				goto out;
520 			}
521 
522 			cle->c_dmemhandle = cl.c_dmemhandle;
523 			cle->c_dsynchandle = cl.c_dsynchandle;
524 
525 			/*
526 			 * Now read the chunk in
527 			 */
528 			if ((total_segments - actual_segments - 1) == 0 ||
529 			    total_len == 0) {
530 				status = RDMA_READ(xdrp->xp_conn, &cl, WAIT);
531 			} else {
532 				status = RDMA_READ(xdrp->xp_conn, &cl, NOWAIT);
533 			}
534 			if (status != RDMA_SUCCESS) {
535 				DTRACE_PROBE1(
536 				    krpc__i__xdrrdma_getblk_readfailed,
537 				    int, status);
538 				retval = FALSE;
539 			}
540 
541 			cle = cle->c_next;
542 
543 		}
544 
545 		/*
546 		 * sync the memory for cpu
547 		 */
548 		cl = *cls;
549 		cl.c_next = NULL;
550 		cl.c_len = cur_offset;
551 		if (clist_syncmem(
552 		    xdrp->xp_conn, &cl, CLIST_REG_DST) != RDMA_SUCCESS) {
553 			retval = FALSE;
554 		}
555 out:
556 
557 		/*
558 		 * Deregister the chunks
559 		 */
560 		cle = cls;
561 		while (actual_segments != 0) {
562 			cl = *cle;
563 			cl.c_next = NULL;
564 
565 			cl.c_regtype = CLIST_REG_DST;
566 			(void) clist_deregister(xdrp->xp_conn, &cl);
567 
568 			cle = cle->c_next;
569 			actual_segments--;
570 		}
571 
572 		if (alen) {
573 			cle = *(xdrp->xp_rcl_next);
574 			cle->w.c_saddr =
575 			    (uint64)(uintptr_t)cle->w.c_saddr + cle->c_len;
576 			cle->c_len = alen - cle->c_len;
577 		}
578 
579 		return (retval);
580 	}
581 
582 	if ((xdrs->x_handy -= len) < 0)
583 		return (FALSE);
584 
585 	bcopy(xdrp->xp_offp, addr, len);
586 
587 	xdrp->xp_offp += len;
588 
589 	if (xdrp->xp_off != 0)
590 		xdrp->xp_off += len;
591 
592 	return (TRUE);
593 }
594 
595 /*
596  * ENCODE some bytes into an XDR stream xp_min_chunk = 0, means the stream of
597  * bytes contain no chunks to seperate out, and if the bytes do not fit in
598  * the supplied buffer, grow the buffer and free the old buffer.
599  */
600 static	bool_t
601 xdrrdma_putbytes(XDR *xdrs, caddr_t addr, int len)
602 {
603 	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);
604 	/*
605 	 * Is this stream accepting chunks?
606 	 * If so, does the either of the two following conditions exist?
607 	 * - length of bytes to encode is greater than the min chunk size?
608 	 * - remaining space in this stream is shorter than length of
609 	 *   bytes to encode?
610 	 *
611 	 * If the above exists, then create a chunk for this encoding
612 	 * and save the addresses, etc.
613 	 */
614 	if (xdrp->xp_flags & XDR_RDMA_CHUNK &&
615 	    ((xdrp->xp_min_chunk != 0 &&
616 	    len >= xdrp->xp_min_chunk) ||
617 	    (xdrs->x_handy - len  < 0))) {
618 		struct clist	*cle;
619 		int		offset = xdrp->xp_offp - xdrs->x_base;
620 
621 		cle = clist_alloc();
622 		cle->c_xdroff = offset;
623 		cle->c_len = len;
624 		cle->w.c_saddr = (uint64)(uintptr_t)addr;
625 		cle->c_next = NULL;
626 
627 		*(xdrp->xp_rcl_next) = cle;
628 		xdrp->xp_rcl_next = &(cle->c_next);
629 
630 		return (TRUE);
631 	}
632 	/* Is there enough space to encode what is left? */
633 	if ((xdrs->x_handy -= len) < 0) {
634 		return (FALSE);
635 	}
636 	bcopy(addr, xdrp->xp_offp, len);
637 	xdrp->xp_offp += len;
638 
639 	return (TRUE);
640 }
641 
642 uint_t
643 xdrrdma_getpos(XDR *xdrs)
644 {
645 	xrdma_private_t *xdrp = (xrdma_private_t *)(xdrs->x_private);
646 
647 	return ((uint_t)((uintptr_t)xdrp->xp_offp - (uintptr_t)xdrs->x_base));
648 }
649 
650 bool_t
651 xdrrdma_setpos(XDR *xdrs, uint_t pos)
652 {
653 	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);
654 
655 	caddr_t		newaddr = xdrs->x_base + pos;
656 	caddr_t		lastaddr = xdrp->xp_offp + xdrs->x_handy;
657 	ptrdiff_t	diff;
658 
659 	if (newaddr > lastaddr)
660 		return (FALSE);
661 
662 	xdrp->xp_offp = newaddr;
663 	diff = lastaddr - newaddr;
664 	xdrs->x_handy = (int)diff;
665 
666 	return (TRUE);
667 }
668 
669 /* ARGSUSED */
670 static rpc_inline_t *
671 xdrrdma_inline(XDR *xdrs, int len)
672 {
673 	rpc_inline_t	*buf = NULL;
674 	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);
675 	struct clist	*cle = *(xdrp->xp_rcl_next);
676 
677 	if (xdrs->x_op == XDR_DECODE) {
678 		/*
679 		 * Since chunks aren't in-line, check to see whether there is
680 		 * a chunk in the inline range.
681 		 */
682 		if (cle != NULL &&
683 		    cle->c_xdroff <= (xdrp->xp_offp - xdrs->x_base + len))
684 			return (NULL);
685 	}
686 
687 	/* LINTED pointer alignment */
688 	buf = (rpc_inline_t *)xdrp->xp_offp;
689 	if (!IS_P2ALIGNED(buf, sizeof (int32_t)))
690 		return (NULL);
691 
692 	if ((xdrs->x_handy < len) || (xdrp->xp_min_chunk != 0 &&
693 	    len >= xdrp->xp_min_chunk)) {
694 		return (NULL);
695 	} else {
696 		xdrs->x_handy -= len;
697 		xdrp->xp_offp += len;
698 		return (buf);
699 	}
700 }
701 
702 static	bool_t
703 xdrrdma_control(XDR *xdrs, int request, void *info)
704 {
705 	int32_t		*int32p;
706 	int		len, i;
707 	uint_t		in_flags;
708 	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);
709 	rdma_chunkinfo_t *rcip = NULL;
710 	rdma_wlist_conn_info_t *rwcip = NULL;
711 	rdma_chunkinfo_lengths_t *rcilp = NULL;
712 	struct uio *uiop;
713 	struct clist	*rwl = NULL, *first = NULL;
714 	struct clist	*prev = NULL;
715 
716 	switch (request) {
717 	case XDR_PEEK:
718 		/*
719 		 * Return the next 4 byte unit in the XDR stream.
720 		 */
721 		if (xdrs->x_handy < sizeof (int32_t))
722 			return (FALSE);
723 
724 		int32p = (int32_t *)info;
725 		*int32p = (int32_t)ntohl((uint32_t)
726 		    (*((int32_t *)(xdrp->xp_offp))));
727 
728 		return (TRUE);
729 
730 	case XDR_SKIPBYTES:
731 		/*
732 		 * Skip the next N bytes in the XDR stream.
733 		 */
734 		int32p = (int32_t *)info;
735 		len = RNDUP((int)(*int32p));
736 		if ((xdrs->x_handy -= len) < 0)
737 			return (FALSE);
738 		xdrp->xp_offp += len;
739 
740 		return (TRUE);
741 
742 	case XDR_RDMA_SET_FLAGS:
743 		/*
744 		 * Set the flags provided in the *info in xp_flags for rdma
745 		 * xdr stream control.
746 		 */
747 		int32p = (int32_t *)info;
748 		in_flags = (uint_t)(*int32p);
749 
750 		xdrp->xp_flags |= in_flags;
751 		return (TRUE);
752 
753 	case XDR_RDMA_GET_FLAGS:
754 		/*
755 		 * Get the flags provided in xp_flags return through *info
756 		 */
757 		int32p = (int32_t *)info;
758 
759 		*int32p = (int32_t)xdrp->xp_flags;
760 		return (TRUE);
761 
762 	case XDR_RDMA_GET_CHUNK_LEN:
763 		rcilp = (rdma_chunkinfo_lengths_t *)info;
764 		rcilp->rcil_len = xdrp->xp_reply_chunk_len;
765 		rcilp->rcil_len_alt = xdrp->xp_reply_chunk_len_alt;
766 
767 		return (TRUE);
768 
769 	case XDR_RDMA_ADD_CHUNK:
770 		/*
771 		 * Store wlist information
772 		 */
773 
774 		rcip = (rdma_chunkinfo_t *)info;
775 
776 		DTRACE_PROBE2(krpc__i__xdrrdma__control__add__chunk,
777 		    rci_type_t, rcip->rci_type, uint32, rcip->rci_len);
778 		switch (rcip->rci_type) {
779 		case RCI_WRITE_UIO_CHUNK:
780 			xdrp->xp_reply_chunk_len_alt += rcip->rci_len;
781 
782 			if ((rcip->rci_len + XDR_RDMA_BUF_OVERHEAD) <
783 			    xdrp->xp_min_chunk) {
784 				xdrp->xp_wcl = NULL;
785 				*(rcip->rci_clpp) = NULL;
786 				return (TRUE);
787 			}
788 			uiop = rcip->rci_a.rci_uiop;
789 
790 			for (i = 0; i < uiop->uio_iovcnt; i++) {
791 				rwl = clist_alloc();
792 				if (first == NULL)
793 					first = rwl;
794 				rwl->c_len = uiop->uio_iov[i].iov_len;
795 				rwl->u.c_daddr =
796 				    (uint64)(uintptr_t)
797 				    (uiop->uio_iov[i].iov_base);
798 				/*
799 				 * if userspace address, put adspace ptr in
800 				 * clist. If not, then do nothing since it's
801 				 * already set to NULL (from kmem_zalloc)
802 				 */
803 				if (uiop->uio_segflg == UIO_USERSPACE) {
804 					rwl->c_adspc = ttoproc(curthread)->p_as;
805 				}
806 
807 				if (prev == NULL)
808 					prev = rwl;
809 				else {
810 					prev->c_next = rwl;
811 					prev = rwl;
812 				}
813 			}
814 
815 			rwl->c_next = NULL;
816 			xdrp->xp_wcl = first;
817 			*(rcip->rci_clpp) = first;
818 
819 			break;
820 
821 		case RCI_WRITE_ADDR_CHUNK:
822 			rwl = clist_alloc();
823 
824 			rwl->c_len = rcip->rci_len;
825 			rwl->u.c_daddr3 = rcip->rci_a.rci_addr;
826 			rwl->c_next = NULL;
827 			xdrp->xp_reply_chunk_len_alt += rcip->rci_len;
828 
829 			xdrp->xp_wcl = rwl;
830 			*(rcip->rci_clpp) = rwl;
831 
832 			break;
833 
834 		case RCI_REPLY_CHUNK:
835 			xdrp->xp_reply_chunk_len += rcip->rci_len;
836 			break;
837 		}
838 		return (TRUE);
839 
840 	case XDR_RDMA_GET_WLIST:
841 		*((struct clist **)info) = xdrp->xp_wcl;
842 		return (TRUE);
843 
844 	case XDR_RDMA_SET_WLIST:
845 		xdrp->xp_wcl = (struct clist *)info;
846 		return (TRUE);
847 
848 	case XDR_RDMA_GET_RLIST:
849 		*((struct clist **)info) = xdrp->xp_rcl;
850 		return (TRUE);
851 
852 	case XDR_RDMA_GET_WCINFO:
853 		rwcip = (rdma_wlist_conn_info_t *)info;
854 
855 		rwcip->rwci_wlist = xdrp->xp_wcl;
856 		rwcip->rwci_conn = xdrp->xp_conn;
857 
858 		return (TRUE);
859 
860 	default:
861 		return (FALSE);
862 	}
863 }
864 
865 bool_t xdr_do_clist(XDR *, clist **);
866 
867 /*
868  * Not all fields in struct clist are interesting to the RPC over RDMA
869  * protocol. Only XDR the interesting fields.
870  */
871 bool_t
872 xdr_clist(XDR *xdrs, clist *objp)
873 {
874 	if (!xdr_uint32(xdrs, &objp->c_xdroff))
875 		return (FALSE);
876 	if (!xdr_uint32(xdrs, &objp->c_smemhandle.mrc_rmr))
877 		return (FALSE);
878 	if (!xdr_uint32(xdrs, &objp->c_len))
879 		return (FALSE);
880 	if (!xdr_uint64(xdrs, &objp->w.c_saddr))
881 		return (FALSE);
882 	if (!xdr_do_clist(xdrs, &objp->c_next))
883 		return (FALSE);
884 	return (TRUE);
885 }
886 
887 /*
888  * The following two functions are forms of xdr_pointer()
889  * and xdr_reference(). Since the generic versions just
890  * kmem_alloc() a new clist, we actually want to use the
891  * rdma_clist kmem_cache.
892  */
893 
894 /*
895  * Generate or free a clist structure from the
896  * kmem_cache "rdma_clist"
897  */
898 bool_t
899 xdr_ref_clist(XDR *xdrs, caddr_t *pp)
900 {
901 	caddr_t loc = *pp;
902 	bool_t stat;
903 
904 	if (loc == NULL) {
905 		switch (xdrs->x_op) {
906 		case XDR_FREE:
907 			return (TRUE);
908 
909 		case XDR_DECODE:
910 			*pp = loc = (caddr_t)clist_alloc();
911 			break;
912 
913 		case XDR_ENCODE:
914 			ASSERT(loc);
915 			break;
916 		}
917 	}
918 
919 	stat = xdr_clist(xdrs, (struct clist *)loc);
920 
921 	if (xdrs->x_op == XDR_FREE) {
922 		kmem_cache_free(clist_cache, loc);
923 		*pp = NULL;
924 	}
925 	return (stat);
926 }
927 
928 /*
929  * XDR a pointer to a possibly recursive clist. This differs
930  * with xdr_reference in that it can serialize/deserialiaze
931  * trees correctly.
932  *
933  *  What is sent is actually a union:
934  *
935  *  union object_pointer switch (boolean b) {
936  *  case TRUE: object_data data;
937  *  case FALSE: void nothing;
938  *  }
939  *
940  * > objpp: Pointer to the pointer to the object.
941  *
942  */
943 
944 bool_t
945 xdr_do_clist(XDR *xdrs, clist **objpp)
946 {
947 	bool_t more_data;
948 
949 	more_data = (*objpp != NULL);
950 	if (!xdr_bool(xdrs, &more_data))
951 		return (FALSE);
952 	if (!more_data) {
953 		*objpp = NULL;
954 		return (TRUE);
955 	}
956 	return (xdr_ref_clist(xdrs, (caddr_t *)objpp));
957 }
958 
959 uint_t
960 xdr_getbufsize(XDR *xdrs)
961 {
962 	xrdma_private_t *xdrp = (xrdma_private_t *)(xdrs->x_private);
963 
964 	return ((uint_t)xdrp->xp_buf_size);
965 }
966 
967 /* ARGSUSED */
968 bool_t
969 xdr_encode_rlist_svc(XDR *xdrs, clist *rlist)
970 {
971 	bool_t	vfalse = FALSE;
972 
973 	ASSERT(rlist == NULL);
974 	return (xdr_bool(xdrs, &vfalse));
975 }
976 
977 bool_t
978 xdr_encode_wlist(XDR *xdrs, clist *w)
979 {
980 	bool_t		vfalse = FALSE, vtrue = TRUE;
981 	int		i;
982 	uint_t		num_segment = 0;
983 	struct clist	*cl;
984 
985 	/* does a wlist exist? */
986 	if (w == NULL) {
987 		return (xdr_bool(xdrs, &vfalse));
988 	}
989 	/* Encode N consecutive segments, 1, N, HLOO, ..., HLOO, 0 */
990 	if (!xdr_bool(xdrs, &vtrue))
991 		return (FALSE);
992 
993 	for (cl = w; cl != NULL; cl = cl->c_next) {
994 		num_segment++;
995 	}
996 
997 	if (!xdr_uint32(xdrs, &num_segment))
998 		return (FALSE);
999 	for (i = 0; i < num_segment; i++) {
1000 
1001 		DTRACE_PROBE1(krpc__i__xdr_encode_wlist_len, uint_t, w->c_len);
1002 
1003 		if (!xdr_uint32(xdrs, &w->c_dmemhandle.mrc_rmr))
1004 			return (FALSE);
1005 
1006 		if (!xdr_uint32(xdrs, &w->c_len))
1007 			return (FALSE);
1008 
1009 		if (!xdr_uint64(xdrs, &w->u.c_daddr))
1010 			return (FALSE);
1011 
1012 		w = w->c_next;
1013 	}
1014 
1015 	if (!xdr_bool(xdrs, &vfalse))
1016 		return (FALSE);
1017 
1018 	return (TRUE);
1019 }
1020 
1021 
1022 /*
1023  * Conditionally decode a RDMA WRITE chunk list from XDR stream.
1024  *
1025  * If the next boolean in the XDR stream is false there is no
1026  * RDMA WRITE chunk list present. Otherwise iterate over the
1027  * array and for each entry: allocate a struct clist and decode.
1028  * Pass back an indication via wlist_exists if we have seen a
1029  * RDMA WRITE chunk list.
1030  */
1031 bool_t
1032 xdr_decode_wlist(XDR *xdrs, struct clist **w, bool_t *wlist_exists)
1033 {
1034 	struct clist	*tmp;
1035 	bool_t		more = FALSE;
1036 	uint32_t	seg_array_len;
1037 	uint32_t	i;
1038 
1039 	if (!xdr_bool(xdrs, &more))
1040 		return (FALSE);
1041 
1042 	/* is there a wlist? */
1043 	if (more == FALSE) {
1044 		*wlist_exists = FALSE;
1045 		return (TRUE);
1046 	}
1047 	*wlist_exists = TRUE;
1048 
1049 	if (!xdr_uint32(xdrs, &seg_array_len))
1050 		return (FALSE);
1051 
1052 	tmp = *w = clist_alloc();
1053 	for (i = 0; i < seg_array_len; i++) {
1054 
1055 		if (!xdr_uint32(xdrs, &tmp->c_dmemhandle.mrc_rmr))
1056 			return (FALSE);
1057 		if (!xdr_uint32(xdrs, &tmp->c_len))
1058 			return (FALSE);
1059 
1060 		DTRACE_PROBE1(krpc__i__xdr_decode_wlist_len,
1061 		    uint_t, tmp->c_len);
1062 
1063 		if (!xdr_uint64(xdrs, &tmp->u.c_daddr))
1064 			return (FALSE);
1065 		if (i < seg_array_len - 1) {
1066 			tmp->c_next = clist_alloc();
1067 			tmp = tmp->c_next;
1068 		} else {
1069 			tmp->c_next = NULL;
1070 		}
1071 	}
1072 
1073 	more = FALSE;
1074 	if (!xdr_bool(xdrs, &more))
1075 		return (FALSE);
1076 
1077 	return (TRUE);
1078 }
1079 
1080 /*
1081  * Server side RDMA WRITE list decode.
1082  * XDR context is memory ops
1083  */
1084 bool_t
1085 xdr_decode_wlist_svc(XDR *xdrs, struct clist **wclp, bool_t *wwl,
1086     uint32_t *total_length, CONN *conn)
1087 {
1088 	struct clist	*first, *ncl;
1089 	char		*memp;
1090 	uint32_t	num_wclist;
1091 	uint32_t	wcl_length = 0;
1092 	uint32_t	i;
1093 	bool_t		more = FALSE;
1094 
1095 	*wclp = NULL;
1096 	*wwl = FALSE;
1097 	*total_length = 0;
1098 
1099 	if (!xdr_bool(xdrs, &more)) {
1100 		return (FALSE);
1101 	}
1102 
1103 	if (more == FALSE) {
1104 		return (TRUE);
1105 	}
1106 
1107 	*wwl = TRUE;
1108 
1109 	if (!xdr_uint32(xdrs, &num_wclist)) {
1110 		DTRACE_PROBE(krpc__e__xdrrdma__wlistsvc__listlength);
1111 		return (FALSE);
1112 	}
1113 
1114 	first = ncl = clist_alloc();
1115 
1116 	for (i = 0; i < num_wclist; i++) {
1117 
1118 		if (!xdr_uint32(xdrs, &ncl->c_dmemhandle.mrc_rmr))
1119 			goto err_out;
1120 		if (!xdr_uint32(xdrs, &ncl->c_len))
1121 			goto err_out;
1122 		if (!xdr_uint64(xdrs, &ncl->u.c_daddr))
1123 			goto err_out;
1124 
1125 		if (ncl->c_len > MAX_SVC_XFER_SIZE) {
1126 			DTRACE_PROBE(
1127 			    krpc__e__xdrrdma__wlistsvc__chunklist_toobig);
1128 			ncl->c_len = MAX_SVC_XFER_SIZE;
1129 		}
1130 
1131 		DTRACE_PROBE1(krpc__i__xdr_decode_wlist_svc_len,
1132 		    uint_t, ncl->c_len);
1133 
1134 		wcl_length += ncl->c_len;
1135 
1136 		if (i < num_wclist - 1) {
1137 			ncl->c_next = clist_alloc();
1138 			ncl = ncl->c_next;
1139 		}
1140 	}
1141 
1142 	if (!xdr_bool(xdrs, &more))
1143 		goto err_out;
1144 
1145 	first->rb_longbuf.type = RDMA_LONG_BUFFER;
1146 	first->rb_longbuf.len =
1147 	    wcl_length > WCL_BUF_LEN ? wcl_length : WCL_BUF_LEN;
1148 
1149 	if (rdma_buf_alloc(conn, &first->rb_longbuf)) {
1150 		clist_free(first);
1151 		return (FALSE);
1152 	}
1153 
1154 	memp = first->rb_longbuf.addr;
1155 
1156 	ncl = first;
1157 	for (i = 0; i < num_wclist; i++) {
1158 		ncl->w.c_saddr3 = (caddr_t)memp;
1159 		memp += ncl->c_len;
1160 		ncl = ncl->c_next;
1161 	}
1162 
1163 	*wclp = first;
1164 	*total_length = wcl_length;
1165 	return (TRUE);
1166 
1167 err_out:
1168 	clist_free(first);
1169 	return (FALSE);
1170 }
1171 
1172 /*
1173  * XDR decode the long reply write chunk.
1174  */
1175 bool_t
1176 xdr_decode_reply_wchunk(XDR *xdrs, struct clist **clist)
1177 {
1178 	bool_t		have_rchunk = FALSE;
1179 	struct clist	*first = NULL, *ncl = NULL;
1180 	uint32_t	num_wclist;
1181 	uint32_t	i;
1182 
1183 	if (!xdr_bool(xdrs, &have_rchunk))
1184 		return (FALSE);
1185 
1186 	if (have_rchunk == FALSE)
1187 		return (TRUE);
1188 
1189 	if (!xdr_uint32(xdrs, &num_wclist)) {
1190 		DTRACE_PROBE(krpc__e__xdrrdma__replywchunk__listlength);
1191 		return (FALSE);
1192 	}
1193 
1194 	if (num_wclist == 0) {
1195 		return (FALSE);
1196 	}
1197 
1198 	first = ncl = clist_alloc();
1199 
1200 	for (i = 0; i < num_wclist; i++) {
1201 
1202 		if (i > 0) {
1203 			ncl->c_next = clist_alloc();
1204 			ncl = ncl->c_next;
1205 		}
1206 
1207 		if (!xdr_uint32(xdrs, &ncl->c_dmemhandle.mrc_rmr))
1208 			goto err_out;
1209 		if (!xdr_uint32(xdrs, &ncl->c_len))
1210 			goto err_out;
1211 		if (!xdr_uint64(xdrs, &ncl->u.c_daddr))
1212 			goto err_out;
1213 
1214 		if (ncl->c_len > MAX_SVC_XFER_SIZE) {
1215 			DTRACE_PROBE(
1216 			    krpc__e__xdrrdma__replywchunk__chunklist_toobig);
1217 			ncl->c_len = MAX_SVC_XFER_SIZE;
1218 		}
1219 		if (!(ncl->c_dmemhandle.mrc_rmr &&
1220 		    (ncl->c_len > 0) && ncl->u.c_daddr))
1221 			DTRACE_PROBE(
1222 			    krpc__e__xdrrdma__replywchunk__invalid_segaddr);
1223 
1224 		DTRACE_PROBE1(krpc__i__xdr_decode_reply_wchunk_c_len,
1225 		    uint32_t, ncl->c_len);
1226 
1227 	}
1228 	*clist = first;
1229 	return (TRUE);
1230 
1231 err_out:
1232 	clist_free(first);
1233 	return (FALSE);
1234 }
1235 
1236 
1237 bool_t
1238 xdr_encode_reply_wchunk(XDR *xdrs,
1239     struct clist *cl_longreply, uint32_t seg_array_len)
1240 {
1241 	int		i;
1242 	bool_t		long_reply_exists = TRUE;
1243 	uint32_t	length;
1244 	uint64		offset;
1245 
1246 	if (seg_array_len > 0) {
1247 		if (!xdr_bool(xdrs, &long_reply_exists))
1248 			return (FALSE);
1249 		if (!xdr_uint32(xdrs, &seg_array_len))
1250 			return (FALSE);
1251 
1252 		for (i = 0; i < seg_array_len; i++) {
1253 			if (!cl_longreply)
1254 				return (FALSE);
1255 			length = cl_longreply->c_len;
1256 			offset = (uint64) cl_longreply->u.c_daddr;
1257 
1258 			DTRACE_PROBE1(
1259 			    krpc__i__xdr_encode_reply_wchunk_c_len,
1260 			    uint32_t, length);
1261 
1262 			if (!xdr_uint32(xdrs,
1263 			    &cl_longreply->c_dmemhandle.mrc_rmr))
1264 				return (FALSE);
1265 			if (!xdr_uint32(xdrs, &length))
1266 				return (FALSE);
1267 			if (!xdr_uint64(xdrs, &offset))
1268 				return (FALSE);
1269 			cl_longreply = cl_longreply->c_next;
1270 		}
1271 	} else {
1272 		long_reply_exists = FALSE;
1273 		if (!xdr_bool(xdrs, &long_reply_exists))
1274 			return (FALSE);
1275 	}
1276 	return (TRUE);
1277 }
1278 bool_t
1279 xdrrdma_read_from_client(struct clist *rlist, CONN **conn, uint_t count)
1280 {
1281 	struct clist	*rdclist;
1282 	struct clist	cl;
1283 	uint_t		total_len = 0;
1284 	uint32_t	status;
1285 	bool_t		retval = TRUE;
1286 
1287 	rlist->rb_longbuf.type = RDMA_LONG_BUFFER;
1288 	rlist->rb_longbuf.len =
1289 	    count > RCL_BUF_LEN ? count : RCL_BUF_LEN;
1290 
1291 	if (rdma_buf_alloc(*conn, &rlist->rb_longbuf)) {
1292 		return (FALSE);
1293 	}
1294 
1295 	/*
1296 	 * The entire buffer is registered with the first chunk.
1297 	 * Later chunks will use the same registered memory handle.
1298 	 */
1299 
1300 	cl = *rlist;
1301 	cl.c_next = NULL;
1302 	if (clist_register(*conn, &cl, CLIST_REG_DST) != RDMA_SUCCESS) {
1303 		rdma_buf_free(*conn, &rlist->rb_longbuf);
1304 		DTRACE_PROBE(
1305 		    krpc__e__xdrrdma__readfromclient__clist__reg);
1306 		return (FALSE);
1307 	}
1308 
1309 	rlist->c_regtype = CLIST_REG_DST;
1310 	rlist->c_dmemhandle = cl.c_dmemhandle;
1311 	rlist->c_dsynchandle = cl.c_dsynchandle;
1312 
1313 	for (rdclist = rlist;
1314 	    rdclist != NULL; rdclist = rdclist->c_next) {
1315 		total_len += rdclist->c_len;
1316 #if (defined(OBJ32)||defined(DEBUG32))
1317 		rdclist->u.c_daddr3 =
1318 		    (caddr_t)((char *)rlist->rb_longbuf.addr +
1319 		    (uint32) rdclist->u.c_daddr3);
1320 #else
1321 		rdclist->u.c_daddr3 =
1322 		    (caddr_t)((char *)rlist->rb_longbuf.addr +
1323 		    (uint64) rdclist->u.c_daddr);
1324 
1325 #endif
1326 		cl = (*rdclist);
1327 		cl.c_next = NULL;
1328 
1329 		/*
1330 		 * Use the same memory handle for all the chunks
1331 		 */
1332 		cl.c_dmemhandle = rlist->c_dmemhandle;
1333 		cl.c_dsynchandle = rlist->c_dsynchandle;
1334 
1335 
1336 		DTRACE_PROBE1(krpc__i__xdrrdma__readfromclient__buflen,
1337 		    int, rdclist->c_len);
1338 
1339 		/*
1340 		 * Now read the chunk in
1341 		 */
1342 		if (rdclist->c_next == NULL) {
1343 			status = RDMA_READ(*conn, &cl, WAIT);
1344 		} else {
1345 			status = RDMA_READ(*conn, &cl, NOWAIT);
1346 		}
1347 		if (status != RDMA_SUCCESS) {
1348 			DTRACE_PROBE(
1349 			    krpc__e__xdrrdma__readfromclient__readfailed);
1350 			rdma_buf_free(*conn, &rlist->rb_longbuf);
1351 			return (FALSE);
1352 		}
1353 	}
1354 
1355 	cl = (*rlist);
1356 	cl.c_next = NULL;
1357 	cl.c_len = total_len;
1358 	if (clist_syncmem(*conn, &cl, CLIST_REG_DST) != RDMA_SUCCESS) {
1359 		retval = FALSE;
1360 	}
1361 	return (retval);
1362 }
1363 
1364 bool_t
1365 xdrrdma_free_clist(CONN *conn, struct clist *clp)
1366 {
1367 	rdma_buf_free(conn, &clp->rb_longbuf);
1368 	clist_free(clp);
1369 	return (TRUE);
1370 }
1371 
1372 bool_t
1373 xdrrdma_send_read_data(XDR *xdrs, uint_t data_len, struct clist *wcl)
1374 {
1375 	int status;
1376 	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);
1377 	struct xdr_ops *xops = xdrrdma_xops();
1378 	struct clist *tcl, *wrcl, *cl;
1379 	struct clist fcl;
1380 	int rndup_present, rnduplen;
1381 
1382 	rndup_present = 0;
1383 	wrcl = NULL;
1384 
1385 	/* caller is doing a sizeof */
1386 	if (xdrs->x_ops != &xdrrdma_ops || xdrs->x_ops == xops)
1387 		return (TRUE);
1388 
1389 	/* copy of the first chunk */
1390 	fcl = *wcl;
1391 	fcl.c_next = NULL;
1392 
1393 	/*
1394 	 * The entire buffer is registered with the first chunk.
1395 	 * Later chunks will use the same registered memory handle.
1396 	 */
1397 
1398 	status = clist_register(xdrp->xp_conn, &fcl, CLIST_REG_SOURCE);
1399 	if (status != RDMA_SUCCESS) {
1400 		return (FALSE);
1401 	}
1402 
1403 	wcl->c_regtype = CLIST_REG_SOURCE;
1404 	wcl->c_smemhandle = fcl.c_smemhandle;
1405 	wcl->c_ssynchandle = fcl.c_ssynchandle;
1406 
1407 	/*
1408 	 * Only transfer the read data ignoring any trailing
1409 	 * roundup chunks. A bit of work, but it saves an
1410 	 * unnecessary extra RDMA_WRITE containing only
1411 	 * roundup bytes.
1412 	 */
1413 
1414 	rnduplen = clist_len(wcl) - data_len;
1415 
1416 	if (rnduplen) {
1417 
1418 		tcl = wcl->c_next;
1419 
1420 		/*
1421 		 * Check if there is a trailing roundup chunk
1422 		 */
1423 		while (tcl) {
1424 			if ((tcl->c_next == NULL) && (tcl->c_len == rnduplen)) {
1425 				rndup_present = 1;
1426 				break;
1427 			}
1428 			tcl = tcl->c_next;
1429 		}
1430 
1431 		/*
1432 		 * Make a copy chunk list skipping the last chunk
1433 		 */
1434 		if (rndup_present) {
1435 			cl = wcl;
1436 			tcl = NULL;
1437 			while (cl) {
1438 				if (tcl == NULL) {
1439 					tcl = clist_alloc();
1440 					wrcl = tcl;
1441 				} else {
1442 					tcl->c_next = clist_alloc();
1443 					tcl = tcl->c_next;
1444 				}
1445 
1446 				*tcl = *cl;
1447 				cl = cl->c_next;
1448 				/* last chunk */
1449 				if (cl->c_next == NULL)
1450 					break;
1451 			}
1452 			tcl->c_next = NULL;
1453 		}
1454 	}
1455 
1456 	if (wrcl == NULL) {
1457 		/* No roundup chunks */
1458 		wrcl = wcl;
1459 	}
1460 
1461 	/*
1462 	 * Set the registered memory handles for the
1463 	 * rest of the chunks same as the first chunk.
1464 	 */
1465 	tcl = wrcl->c_next;
1466 	while (tcl) {
1467 		tcl->c_smemhandle = fcl.c_smemhandle;
1468 		tcl->c_ssynchandle = fcl.c_ssynchandle;
1469 		tcl = tcl->c_next;
1470 	}
1471 
1472 	/*
1473 	 * Sync the total len beginning from the first chunk.
1474 	 */
1475 	fcl.c_len = clist_len(wrcl);
1476 	status = clist_syncmem(xdrp->xp_conn, &fcl, CLIST_REG_SOURCE);
1477 	if (status != RDMA_SUCCESS) {
1478 		return (FALSE);
1479 	}
1480 
1481 	status = RDMA_WRITE(xdrp->xp_conn, wrcl, WAIT);
1482 
1483 	if (rndup_present)
1484 		clist_free(wrcl);
1485 
1486 	if (status != RDMA_SUCCESS) {
1487 		return (FALSE);
1488 	}
1489 
1490 	return (TRUE);
1491 }
1492 
1493 
1494 /*
1495  * Reads one chunk at a time
1496  */
1497 
1498 static bool_t
1499 xdrrdma_read_a_chunk(XDR *xdrs, CONN **conn)
1500 {
1501 	int status;
1502 	int32_t len = 0;
1503 	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);
1504 	struct clist *cle = *(xdrp->xp_rcl_next);
1505 	struct clist *rclp = xdrp->xp_rcl;
1506 	struct clist *clp;
1507 
1508 	/*
1509 	 * len is used later to decide xdr offset in
1510 	 * the chunk factoring any 4-byte XDR alignment
1511 	 * (See read chunk example top of this file)
1512 	 */
1513 	while (rclp != cle) {
1514 		len += rclp->c_len;
1515 		rclp = rclp->c_next;
1516 	}
1517 
1518 	len = RNDUP(len) - len;
1519 
1520 	ASSERT(xdrs->x_handy <= 0);
1521 
1522 	/*
1523 	 * If this is the first chunk to contain the RPC
1524 	 * message set xp_off to the xdr offset of the
1525 	 * inline message.
1526 	 */
1527 	if (xdrp->xp_off == 0)
1528 		xdrp->xp_off = (xdrp->xp_offp - xdrs->x_base);
1529 
1530 	if (cle == NULL || (cle->c_xdroff != xdrp->xp_off))
1531 		return (FALSE);
1532 
1533 	/*
1534 	 * Make a copy of the chunk to read from client.
1535 	 * Chunks are read on demand, so read only one
1536 	 * for now.
1537 	 */
1538 
1539 	rclp = clist_alloc();
1540 	*rclp = *cle;
1541 	rclp->c_next = NULL;
1542 
1543 	xdrp->xp_rcl_next = &cle->c_next;
1544 
1545 	/*
1546 	 * If there is a roundup present, then skip those
1547 	 * bytes when reading.
1548 	 */
1549 	if (len) {
1550 		rclp->w.c_saddr =
1551 		    (uint64)(uintptr_t)rclp->w.c_saddr + len;
1552 			rclp->c_len = rclp->c_len - len;
1553 	}
1554 
1555 	status = xdrrdma_read_from_client(rclp, conn, rclp->c_len);
1556 
1557 	if (status == FALSE) {
1558 		clist_free(rclp);
1559 		return (status);
1560 	}
1561 
1562 	xdrp->xp_offp = rclp->rb_longbuf.addr;
1563 	xdrs->x_base = xdrp->xp_offp;
1564 	xdrs->x_handy = rclp->c_len;
1565 
1566 	/*
1567 	 * This copy of read chunks containing the XDR
1568 	 * message is freed later in xdrrdma_destroy()
1569 	 */
1570 
1571 	if (xdrp->xp_rcl_xdr) {
1572 		/* Add the chunk to end of the list */
1573 		clp = xdrp->xp_rcl_xdr;
1574 		while (clp->c_next != NULL)
1575 			clp = clp->c_next;
1576 		clp->c_next = rclp;
1577 	} else {
1578 		xdrp->xp_rcl_xdr = rclp;
1579 	}
1580 	return (TRUE);
1581 }
1582 
1583 static void
1584 xdrrdma_free_xdr_chunks(CONN *conn, struct clist *xdr_rcl)
1585 {
1586 	struct clist *cl;
1587 
1588 	(void) clist_deregister(conn, xdr_rcl);
1589 
1590 	/*
1591 	 * Read chunks containing parts XDR message are
1592 	 * special: in case of multiple chunks each has
1593 	 * its own buffer.
1594 	 */
1595 
1596 	cl = xdr_rcl;
1597 	while (cl) {
1598 		rdma_buf_free(conn, &cl->rb_longbuf);
1599 		cl = cl->c_next;
1600 	}
1601 
1602 	clist_free(xdr_rcl);
1603 }
1604