xref: /illumos-gate/usr/src/uts/common/rpc/xdr_rdma.c (revision 968633ad8faee931821fd6b656eb0d96d4b186c0)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Copyright (c) 2007, The Ohio State University. All rights reserved.
28  *
29  * Portions of this source code is developed by the team members of
30  * The Ohio State University's Network-Based Computing Laboratory (NBCL),
31  * headed by Professor Dhabaleswar K. (DK) Panda.
32  *
33  * Acknowledgements to contributions from developors:
34  *   Ranjit Noronha: noronha@cse.ohio-state.edu
35  *   Lei Chai      : chail@cse.ohio-state.edu
36  *   Weikuan Yu    : yuw@cse.ohio-state.edu
37  *
38  */
39 
40 /*
41  * xdr_rdma.c, XDR implementation using RDMA to move large chunks
42  */
43 
44 #include <sys/param.h>
45 #include <sys/types.h>
46 #include <sys/systm.h>
47 #include <sys/kmem.h>
48 #include <sys/sdt.h>
49 #include <sys/debug.h>
50 
51 #include <rpc/types.h>
52 #include <rpc/xdr.h>
53 #include <sys/cmn_err.h>
54 #include <rpc/rpc_sztypes.h>
55 #include <rpc/rpc_rdma.h>
56 #include <sys/sysmacros.h>
57 
/*
 * Forward declarations of the stream operations that are installed in the
 * xdr_ops tables defined below. xdrrdma_getpos, xdrrdma_setpos and
 * xdrrdma_destroy are not static; the others are private to this file.
 */
58 static bool_t   xdrrdma_getint32(XDR *, int32_t *);
59 static bool_t   xdrrdma_putint32(XDR *, int32_t *);
60 static bool_t   xdrrdma_getbytes(XDR *, caddr_t, int);
61 static bool_t   xdrrdma_putbytes(XDR *, caddr_t, int);
62 uint_t		xdrrdma_getpos(XDR *);
63 bool_t		xdrrdma_setpos(XDR *, uint_t);
64 static rpc_inline_t *xdrrdma_inline(XDR *, int);
65 void		xdrrdma_destroy(XDR *);
66 static bool_t   xdrrdma_control(XDR *, int, void *);
67 
/*
 * Operations vector, initialized positionally. It lists the same functions
 * in the same order as xdrrdma_ops; nothing in this file installs it on a
 * stream (xdrrdma_create uses xdrrdma_ops).
 */
68 struct xdr_ops  xdrrdmablk_ops = {
69 	xdrrdma_getbytes,
70 	xdrrdma_putbytes,
71 	xdrrdma_getpos,
72 	xdrrdma_setpos,
73 	xdrrdma_inline,
74 	xdrrdma_destroy,
75 	xdrrdma_control,
76 	xdrrdma_getint32,
77 	xdrrdma_putint32
78 };
79 
/*
 * Operations vector installed by xdrrdma_create(), initialized
 * positionally. xdrrdma_send_read_data() compares a stream's x_ops against
 * the address of this table to recognise an RDMA stream.
 */
80 struct xdr_ops  xdrrdma_ops = {
81 	xdrrdma_getbytes,
82 	xdrrdma_putbytes,
83 	xdrrdma_getpos,
84 	xdrrdma_setpos,
85 	xdrrdma_inline,
86 	xdrrdma_destroy,
87 	xdrrdma_control,
88 	xdrrdma_getint32,
89 	xdrrdma_putint32
90 };
91 
92 /*
93  * A chunk list entry identifies a chunk of opaque data to be moved
94  * separately from the rest of the RPC message. xp_min_chunk = 0, is a
95  * special case for ENCODING, which means do not chunk the incoming stream of
96  * data.
97  */
98 
/* Per-stream private state, hung off XDR.x_private by xdrrdma_create(). */
99 typedef struct {
100 	caddr_t		xp_offp;	/* current position in the XDR buffer */
101 	int		xp_min_chunk;	/* min length putbytes will chunk */
102 	uint_t		xp_flags;	/* Controls setting for rdma xdr */
103 	int		xp_buf_size;	/* size of xdr buffer */
104 	struct clist	*xp_rcl;		/* head of chunk list */
105 	struct clist	**xp_rcl_next;	/* location to place/find next chunk */
106 	struct clist	*xp_wcl;	/* head of write chunk list */
107 	CONN		*xp_conn;	/* connection for chunk data xfer */
	/* sum of the lengths added with RCI_REPLY_CHUNK */
108 	uint_t		xp_reply_chunk_len;
109 	/* used to track length for security modes: integrity/privacy */
110 	uint_t		xp_reply_chunk_len_alt;
111 } xrdma_private_t;
112 
/*
 * kmem cache for struct clist entries, defined outside this file.
 * xdr_ref_clist() returns entries to it directly with kmem_cache_free().
 */
113 extern kmem_cache_t *clist_cache;
114 
115 bool_t
116 xdrrdma_getrdmablk(XDR *xdrs, struct clist **rlist, uint_t *sizep,
117     CONN **conn, const uint_t maxsize)
118 {
119 	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);
120 	struct clist	*cle = *(xdrp->xp_rcl_next);
121 	struct clist	*cls = *(xdrp->xp_rcl_next);
122 	struct clist	*rdclist = NULL, *prev = NULL;
123 	bool_t		retval = TRUE;
124 	uint32_t	cur_offset = 0;
125 	uint32_t	total_segments = 0;
126 	uint32_t	actual_segments = 0;
127 	uint32_t	alen;
128 	uint_t		total_len;
129 
130 	ASSERT(xdrs->x_op != XDR_FREE);
131 
132 	/*
133 	 * first deal with the length since xdr bytes are counted
134 	 */
135 	if (!xdr_u_int(xdrs, sizep)) {
136 		DTRACE_PROBE(xdr__e__getrdmablk_sizep_fail);
137 		return (FALSE);
138 	}
139 	total_len = *sizep;
140 	if (total_len > maxsize) {
141 		DTRACE_PROBE2(xdr__e__getrdmablk_bad_size,
142 		    int, total_len, int, maxsize);
143 		return (FALSE);
144 	}
145 	(*conn) = xdrp->xp_conn;
146 
147 	/*
148 	 * if no data we are done
149 	 */
150 	if (total_len == 0)
151 		return (TRUE);
152 
153 	while (cle) {
154 		total_segments++;
155 		cle = cle->c_next;
156 	}
157 
158 	cle = *(xdrp->xp_rcl_next);
159 
160 	/*
161 	 * If there was a chunk at the current offset, then setup a read
162 	 * chunk list which records the destination address and length
163 	 * and will RDMA READ the data in later.
164 	 */
165 	if (cle == NULL)
166 		return (FALSE);
167 
168 	if (cle->c_xdroff != (xdrp->xp_offp - xdrs->x_base))
169 		return (FALSE);
170 
171 	/*
172 	 * Setup the chunk list with appropriate
173 	 * address (offset) and length
174 	 */
175 	for (actual_segments = 0;
176 	    actual_segments < total_segments; actual_segments++) {
177 		if (total_len <= 0)
178 			break;
179 		cle->u.c_daddr = (uint64) cur_offset;
180 		alen = 0;
181 		if (cle->c_len > total_len) {
182 			alen = cle->c_len;
183 			cle->c_len = total_len;
184 		}
185 		if (!alen)
186 			xdrp->xp_rcl_next = &cle->c_next;
187 
188 		cur_offset += cle->c_len;
189 		total_len -= cle->c_len;
190 
191 		if ((total_segments - actual_segments - 1) == 0 &&
192 		    total_len > 0) {
193 			DTRACE_PROBE(krpc__e__xdrrdma_getblk_chunktooshort);
194 			retval = FALSE;
195 		}
196 
197 		if ((total_segments - actual_segments - 1) > 0 &&
198 		    total_len == 0) {
199 			DTRACE_PROBE2(krpc__e__xdrrdma_getblk_toobig,
200 			    int, total_segments, int, actual_segments);
201 		}
202 
203 		rdclist = clist_alloc();
204 		(*rdclist) = (*cle);
205 		if ((*rlist) == NULL)
206 			(*rlist) = rdclist;
207 		if (prev == NULL)
208 			prev = rdclist;
209 		else {
210 			prev->c_next = rdclist;
211 			prev = rdclist;
212 		}
213 
214 		cle = cle->c_next;
215 	}
216 
217 out:
218 	if (prev != NULL)
219 		prev->c_next = NULL;
220 
221 	cle = cls;
222 	if (alen) {
223 		cle->w.c_saddr =
224 		    (uint64)(uintptr_t)cle->w.c_saddr + cle->c_len;
225 		cle->c_len = alen - cle->c_len;
226 	}
227 
228 	return (retval);
229 }
230 
231 /*
232  * The procedure xdrrdma_create initializes a stream descriptor for a memory
233  * buffer.
234  */
235 void
236 xdrrdma_create(XDR *xdrs, caddr_t addr, uint_t size,
237     int min_chunk, struct clist *cl, enum xdr_op op, CONN *conn)
238 {
239 	xrdma_private_t *xdrp;
240 	struct clist   *cle;
241 
242 	xdrs->x_op = op;
243 	xdrs->x_ops = &xdrrdma_ops;
244 	xdrs->x_base = addr;
245 	xdrs->x_handy = size;
246 	xdrs->x_public = NULL;
247 
248 	xdrp = (xrdma_private_t *)kmem_zalloc(sizeof (xrdma_private_t),
249 	    KM_SLEEP);
250 	xdrs->x_private = (caddr_t)xdrp;
251 	xdrp->xp_offp = addr;
252 	xdrp->xp_min_chunk = min_chunk;
253 	xdrp->xp_flags = 0;
254 	xdrp->xp_buf_size = size;
255 	xdrp->xp_rcl = cl;
256 	xdrp->xp_reply_chunk_len = 0;
257 	xdrp->xp_reply_chunk_len_alt = 0;
258 
259 	if (op == XDR_ENCODE && cl != NULL) {
260 		/* Find last element in chunk list and set xp_rcl_next */
261 		for (cle = cl; cle->c_next != NULL; cle = cle->c_next)
262 			continue;
263 
264 		xdrp->xp_rcl_next = &(cle->c_next);
265 	} else {
266 		xdrp->xp_rcl_next = &(xdrp->xp_rcl);
267 	}
268 
269 	xdrp->xp_wcl = NULL;
270 
271 	xdrp->xp_conn = conn;
272 	if (xdrp->xp_min_chunk != 0)
273 		xdrp->xp_flags |= XDR_RDMA_CHUNK;
274 }
275 
276 /* ARGSUSED */
277 void
278 xdrrdma_destroy(XDR * xdrs)
279 {
280 	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);
281 
282 	if (xdrp == NULL)
283 		return;
284 
285 	if (xdrp->xp_wcl) {
286 		if (xdrp->xp_flags & XDR_RDMA_WLIST_REG) {
287 			(void) clist_deregister(xdrp->xp_conn,
288 			    xdrp->xp_wcl, CLIST_REG_DST);
289 			rdma_buf_free(xdrp->xp_conn,
290 			    &xdrp->xp_wcl->rb_longbuf);
291 		}
292 		clist_free(xdrp->xp_wcl);
293 	}
294 
295 	if (xdrp->xp_rcl) {
296 		if (xdrp->xp_flags & XDR_RDMA_RLIST_REG) {
297 			(void) clist_deregister(xdrp->xp_conn,
298 			    xdrp->xp_rcl, CLIST_REG_SOURCE);
299 			rdma_buf_free(xdrp->xp_conn,
300 			    &xdrp->xp_rcl->rb_longbuf);
301 		}
302 		clist_free(xdrp->xp_rcl);
303 	}
304 
305 	(void) kmem_free(xdrs->x_private, sizeof (xrdma_private_t));
306 	xdrs->x_private = NULL;
307 }
308 
309 static	bool_t
310 xdrrdma_getint32(XDR *xdrs, int32_t *int32p)
311 {
312 	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);
313 
314 	if ((xdrs->x_handy -= (int)sizeof (int32_t)) < 0)
315 		return (FALSE);
316 
317 	/* LINTED pointer alignment */
318 	*int32p = (int32_t)ntohl((uint32_t)(*((int32_t *)(xdrp->xp_offp))));
319 	xdrp->xp_offp += sizeof (int32_t);
320 
321 	return (TRUE);
322 }
323 
324 static	bool_t
325 xdrrdma_putint32(XDR *xdrs, int32_t *int32p)
326 {
327 	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);
328 
329 	if ((xdrs->x_handy -= (int)sizeof (int32_t)) < 0)
330 		return (FALSE);
331 
332 	/* LINTED pointer alignment */
333 	*(int32_t *)xdrp->xp_offp = (int32_t)htonl((uint32_t)(*int32p));
334 	xdrp->xp_offp += sizeof (int32_t);
335 
336 	return (TRUE);
337 }
338 
339 /*
340  * DECODE bytes from XDR stream for rdma.
341  * If the XDR stream contains a read chunk list,
342  * it will go through xdrrdma_getrdmablk instead.
343  */
344 static	bool_t
345 xdrrdma_getbytes(XDR *xdrs, caddr_t addr, int len)
346 {
347 	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);
348 	struct clist	*cle = *(xdrp->xp_rcl_next);
349 	struct clist	*cls = *(xdrp->xp_rcl_next);
350 	struct clist	cl;
351 	bool_t		retval = TRUE;
352 	uint32_t	total_len = len;
353 	uint32_t	cur_offset = 0;
354 	uint32_t	total_segments = 0;
355 	uint32_t	actual_segments = 0;
356 	uint32_t	status;
357 	uint32_t	alen;
358 
359 	while (cle) {
360 		total_segments++;
361 		cle = cle->c_next;
362 	}
363 
364 	cle = *(xdrp->xp_rcl_next);
365 	/*
366 	 * If there was a chunk at the current offset, then setup a read
367 	 * chunk list which records the destination address and length
368 	 * and will RDMA READ the data in later.
369 	 */
370 
371 	if (cle != NULL &&
372 	    cle->c_xdroff == (xdrp->xp_offp - xdrs->x_base)) {
373 		for (actual_segments = 0;
374 		    actual_segments < total_segments; actual_segments++) {
375 			if (total_len <= 0)
376 				break;
377 			cle->u.c_daddr = (uint64)(uintptr_t)addr + cur_offset;
378 			alen = 0;
379 			if (cle->c_len > total_len) {
380 				alen = cle->c_len;
381 				cle->c_len = total_len;
382 			}
383 			if (!alen)
384 				xdrp->xp_rcl_next = &cle->c_next;
385 
386 			cur_offset += cle->c_len;
387 			total_len -= cle->c_len;
388 
389 			if ((total_segments - actual_segments - 1) == 0 &&
390 			    total_len > 0) {
391 				DTRACE_PROBE(
392 				    krpc__e__xdrrdma_getbytes_chunktooshort);
393 				retval = FALSE;
394 			}
395 
396 			if ((total_segments - actual_segments - 1) > 0 &&
397 			    total_len == 0) {
398 				DTRACE_PROBE2(krpc__e__xdrrdma_getbytes_toobig,
399 				    int, total_segments, int, actual_segments);
400 			}
401 
402 			/*
403 			 * RDMA READ the chunk data from the remote end.
404 			 * First prep the destination buffer by registering
405 			 * it, then RDMA READ the chunk data. Since we are
406 			 * doing streaming memory, sync the destination
407 			 * buffer to CPU and deregister the buffer.
408 			 */
409 			if (xdrp->xp_conn == NULL) {
410 				return (FALSE);
411 			}
412 			cl = *cle;
413 			cl.c_next = NULL;
414 			if (clist_register(xdrp->xp_conn, &cl, CLIST_REG_DST)
415 			    != RDMA_SUCCESS) {
416 				return (FALSE);
417 			}
418 			cle->c_dmemhandle = cl.c_dmemhandle;
419 			cle->c_dsynchandle = cl.c_dsynchandle;
420 
421 			/*
422 			 * Now read the chunk in
423 			 */
424 			if ((total_segments - actual_segments - 1) == 0 ||
425 			    total_len == 0) {
426 				status = RDMA_READ(xdrp->xp_conn, &cl, WAIT);
427 			} else {
428 				status = RDMA_READ(xdrp->xp_conn, &cl, NOWAIT);
429 			}
430 			if (status != RDMA_SUCCESS) {
431 				DTRACE_PROBE1(
432 				    krpc__i__xdrrdma_getblk_readfailed,
433 				    int, status);
434 				retval = FALSE;
435 				goto out;
436 			}
437 			cle = cle->c_next;
438 		}
439 
440 		/*
441 		 * sync the memory for cpu
442 		 */
443 		cl = *cls;
444 		cl.c_next = NULL;
445 		cl.c_len = cur_offset;
446 		if (clist_syncmem(xdrp->xp_conn, &cl, 0) != RDMA_SUCCESS) {
447 			retval = FALSE;
448 		}
449 out:
450 		/*
451 		 * Deregister the chunks
452 		 */
453 		cle = cls;
454 		cl = *cle;
455 		cl.c_next = NULL;
456 		cl.c_len = cur_offset;
457 		(void) clist_deregister(xdrp->xp_conn, &cl, CLIST_REG_DST);
458 		if (alen) {
459 			cle->w.c_saddr =
460 			    (uint64)(uintptr_t)cle->w.c_saddr + cle->c_len;
461 			cle->c_len = alen - cle->c_len;
462 		}
463 		return (retval);
464 	}
465 	if ((xdrs->x_handy -= len) < 0)
466 		return (FALSE);
467 
468 	bcopy(xdrp->xp_offp, addr, len);
469 	xdrp->xp_offp += len;
470 
471 	return (TRUE);
472 }
473 
474 /*
475  * ENCODE some bytes into an XDR stream xp_min_chunk = 0, means the stream of
476  * bytes contain no chunks to seperate out, and if the bytes do not fit in
477  * the supplied buffer, grow the buffer and free the old buffer.
478  */
479 static	bool_t
480 xdrrdma_putbytes(XDR *xdrs, caddr_t addr, int len)
481 {
482 	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);
483 	/*
484 	 * Is this stream accepting chunks?
485 	 * If so, does the either of the two following conditions exist?
486 	 * - length of bytes to encode is greater than the min chunk size?
487 	 * - remaining space in this stream is shorter than length of
488 	 *   bytes to encode?
489 	 *
490 	 * If the above exists, then create a chunk for this encoding
491 	 * and save the addresses, etc.
492 	 */
493 	if (xdrp->xp_flags & XDR_RDMA_CHUNK &&
494 	    ((xdrp->xp_min_chunk != 0 &&
495 	    len >= xdrp->xp_min_chunk) ||
496 	    (xdrs->x_handy - len  < 0))) {
497 		struct clist	*cle;
498 		int		offset = xdrp->xp_offp - xdrs->x_base;
499 
500 		cle = clist_alloc();
501 		cle->c_xdroff = offset;
502 		cle->c_len = len;
503 		cle->w.c_saddr = (uint64)(uintptr_t)addr;
504 		cle->c_next = NULL;
505 
506 		*(xdrp->xp_rcl_next) = cle;
507 		xdrp->xp_rcl_next = &(cle->c_next);
508 
509 		return (TRUE);
510 	}
511 	/* Is there enough space to encode what is left? */
512 	if ((xdrs->x_handy -= len) < 0) {
513 		return (FALSE);
514 	}
515 	bcopy(addr, xdrp->xp_offp, len);
516 	xdrp->xp_offp += len;
517 
518 	return (TRUE);
519 }
520 
521 uint_t
522 xdrrdma_getpos(XDR *xdrs)
523 {
524 	xrdma_private_t *xdrp = (xrdma_private_t *)(xdrs->x_private);
525 
526 	return ((uint_t)((uintptr_t)xdrp->xp_offp - (uintptr_t)xdrs->x_base));
527 }
528 
529 bool_t
530 xdrrdma_setpos(XDR *xdrs, uint_t pos)
531 {
532 	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);
533 
534 	caddr_t		newaddr = xdrs->x_base + pos;
535 	caddr_t		lastaddr = xdrp->xp_offp + xdrs->x_handy;
536 	ptrdiff_t	diff;
537 
538 	if (newaddr > lastaddr)
539 		return (FALSE);
540 
541 	xdrp->xp_offp = newaddr;
542 	diff = lastaddr - newaddr;
543 	xdrs->x_handy = (int)diff;
544 
545 	return (TRUE);
546 }
547 
548 /* ARGSUSED */
549 static rpc_inline_t *
550 xdrrdma_inline(XDR *xdrs, int len)
551 {
552 	rpc_inline_t	*buf = NULL;
553 	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);
554 	struct clist	*cle = *(xdrp->xp_rcl_next);
555 
556 	if (xdrs->x_op == XDR_DECODE) {
557 		/*
558 		 * Since chunks aren't in-line, check to see whether there is
559 		 * a chunk in the inline range.
560 		 */
561 		if (cle != NULL &&
562 		    cle->c_xdroff <= (xdrp->xp_offp - xdrs->x_base + len))
563 			return (NULL);
564 	}
565 
566 	/* LINTED pointer alignment */
567 	buf = (rpc_inline_t *)xdrp->xp_offp;
568 	if (!IS_P2ALIGNED(buf, sizeof (int32_t)))
569 		return (NULL);
570 
571 	if ((xdrs->x_handy < len) || (xdrp->xp_min_chunk != 0 &&
572 	    len >= xdrp->xp_min_chunk)) {
573 		return (NULL);
574 	} else {
575 		xdrs->x_handy -= len;
576 		xdrp->xp_offp += len;
577 		return (buf);
578 	}
579 }
580 
581 static	bool_t
582 xdrrdma_control(XDR *xdrs, int request, void *info)
583 {
584 	int32_t		*int32p;
585 	int		len, i;
586 	uint_t		in_flags;
587 	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);
588 	rdma_chunkinfo_t *rcip = NULL;
589 	rdma_wlist_conn_info_t *rwcip = NULL;
590 	rdma_chunkinfo_lengths_t *rcilp = NULL;
591 	struct uio *uiop;
592 	struct clist	*rwl = NULL;
593 	struct clist	*prev = NULL;
594 
595 	switch (request) {
596 	case XDR_PEEK:
597 		/*
598 		 * Return the next 4 byte unit in the XDR stream.
599 		 */
600 		if (xdrs->x_handy < sizeof (int32_t))
601 			return (FALSE);
602 
603 		int32p = (int32_t *)info;
604 		*int32p = (int32_t)ntohl((uint32_t)
605 		    (*((int32_t *)(xdrp->xp_offp))));
606 
607 		return (TRUE);
608 
609 	case XDR_SKIPBYTES:
610 		/*
611 		 * Skip the next N bytes in the XDR stream.
612 		 */
613 		int32p = (int32_t *)info;
614 		len = RNDUP((int)(*int32p));
615 		if ((xdrs->x_handy -= len) < 0)
616 			return (FALSE);
617 		xdrp->xp_offp += len;
618 
619 		return (TRUE);
620 
621 	case XDR_RDMA_SET_FLAGS:
622 		/*
623 		 * Set the flags provided in the *info in xp_flags for rdma
624 		 * xdr stream control.
625 		 */
626 		int32p = (int32_t *)info;
627 		in_flags = (uint_t)(*int32p);
628 
629 		xdrp->xp_flags |= in_flags;
630 		return (TRUE);
631 
632 	case XDR_RDMA_GET_FLAGS:
633 		/*
634 		 * Get the flags provided in xp_flags return through *info
635 		 */
636 		int32p = (int32_t *)info;
637 
638 		*int32p = (int32_t)xdrp->xp_flags;
639 		return (TRUE);
640 
641 	case XDR_RDMA_GET_CHUNK_LEN:
642 		rcilp = (rdma_chunkinfo_lengths_t *)info;
643 		rcilp->rcil_len = xdrp->xp_reply_chunk_len;
644 		rcilp->rcil_len_alt = xdrp->xp_reply_chunk_len_alt;
645 
646 		return (TRUE);
647 
648 	case XDR_RDMA_ADD_CHUNK:
649 		/*
650 		 * Store wlist information
651 		 */
652 
653 		rcip = (rdma_chunkinfo_t *)info;
654 
655 		switch (rcip->rci_type) {
656 		case RCI_WRITE_UIO_CHUNK:
657 			xdrp->xp_reply_chunk_len_alt += rcip->rci_len;
658 
659 			if (rcip->rci_len < xdrp->xp_min_chunk) {
660 				xdrp->xp_wcl = NULL;
661 				*(rcip->rci_clpp) = NULL;
662 				return (TRUE);
663 			}
664 			uiop = rcip->rci_a.rci_uiop;
665 
666 			for (i = 0; i < uiop->uio_iovcnt; i++) {
667 				rwl = clist_alloc();
668 				rwl->c_len = uiop->uio_iov[i].iov_len;
669 				rwl->u.c_daddr =
670 				    (uint64)(uintptr_t)
671 				    (uiop->uio_iov[i].iov_base);
672 				/*
673 				 * if userspace address, put adspace ptr in
674 				 * clist. If not, then do nothing since it's
675 				 * already set to NULL (from kmem_zalloc)
676 				 */
677 				if (uiop->uio_segflg == UIO_USERSPACE) {
678 					rwl->c_adspc = ttoproc(curthread)->p_as;
679 				}
680 
681 				if (prev == NULL)
682 					prev = rwl;
683 				else {
684 					prev->c_next = rwl;
685 					prev = rwl;
686 				}
687 			}
688 
689 			rwl->c_next = NULL;
690 			xdrp->xp_wcl = rwl;
691 			*(rcip->rci_clpp) = rwl;
692 
693 			break;
694 
695 		case RCI_WRITE_ADDR_CHUNK:
696 			rwl = clist_alloc();
697 
698 			rwl->c_len = rcip->rci_len;
699 			rwl->u.c_daddr3 = rcip->rci_a.rci_addr;
700 			rwl->c_next = NULL;
701 			xdrp->xp_reply_chunk_len_alt += rcip->rci_len;
702 
703 			xdrp->xp_wcl = rwl;
704 			*(rcip->rci_clpp) = rwl;
705 
706 			break;
707 
708 		case RCI_REPLY_CHUNK:
709 			xdrp->xp_reply_chunk_len += rcip->rci_len;
710 			break;
711 		}
712 		return (TRUE);
713 
714 	case XDR_RDMA_GET_WLIST:
715 		*((struct clist **)info) = xdrp->xp_wcl;
716 		return (TRUE);
717 
718 	case XDR_RDMA_SET_WLIST:
719 		xdrp->xp_wcl = (struct clist *)info;
720 		return (TRUE);
721 
722 	case XDR_RDMA_GET_RLIST:
723 		*((struct clist **)info) = xdrp->xp_rcl;
724 		return (TRUE);
725 
726 	case XDR_RDMA_GET_WCINFO:
727 		rwcip = (rdma_wlist_conn_info_t *)info;
728 
729 		rwcip->rwci_wlist = xdrp->xp_wcl;
730 		rwcip->rwci_conn = xdrp->xp_conn;
731 
732 		return (TRUE);
733 
734 	default:
735 		return (FALSE);
736 	}
737 }
738 
/* Declared ahead of xdr_clist(), which calls it for the c_next link. */
739 bool_t xdr_do_clist(XDR *, clist **);
740 
741 /*
742  * Not all fields in struct clist are interesting to the RPC over RDMA
743  * protocol. Only XDR the interesting fields.
744  */
745 bool_t
746 xdr_clist(XDR *xdrs, clist *objp)
747 {
748 	if (!xdr_uint32(xdrs, &objp->c_xdroff))
749 		return (FALSE);
750 	if (!xdr_uint32(xdrs, &objp->c_smemhandle.mrc_rmr))
751 		return (FALSE);
752 	if (!xdr_uint32(xdrs, &objp->c_len))
753 		return (FALSE);
754 	if (!xdr_uint64(xdrs, &objp->w.c_saddr))
755 		return (FALSE);
756 	if (!xdr_do_clist(xdrs, &objp->c_next))
757 		return (FALSE);
758 	return (TRUE);
759 }
760 
761 /*
762  * The following two functions are forms of xdr_pointer()
763  * and xdr_reference(). Since the generic versions just
764  * kmem_alloc() a new clist, we actually want to use the
765  * rdma_clist kmem_cache.
766  */
767 
768 /*
769  * Generate or free a clist structure from the
770  * kmem_cache "rdma_clist"
771  */
772 bool_t
773 xdr_ref_clist(XDR *xdrs, caddr_t *pp)
774 {
775 	caddr_t loc = *pp;
776 	bool_t stat;
777 
778 	if (loc == NULL) {
779 		switch (xdrs->x_op) {
780 		case XDR_FREE:
781 			return (TRUE);
782 
783 		case XDR_DECODE:
784 			*pp = loc = (caddr_t)clist_alloc();
785 			break;
786 
787 		case XDR_ENCODE:
788 			ASSERT(loc);
789 			break;
790 		}
791 	}
792 
793 	stat = xdr_clist(xdrs, (struct clist *)loc);
794 
795 	if (xdrs->x_op == XDR_FREE) {
796 		kmem_cache_free(clist_cache, loc);
797 		*pp = NULL;
798 	}
799 	return (stat);
800 }
801 
802 /*
803  * XDR a pointer to a possibly recursive clist. This differs
804  * with xdr_reference in that it can serialize/deserialiaze
805  * trees correctly.
806  *
807  *  What is sent is actually a union:
808  *
809  *  union object_pointer switch (boolean b) {
810  *  case TRUE: object_data data;
811  *  case FALSE: void nothing;
812  *  }
813  *
814  * > objpp: Pointer to the pointer to the object.
815  *
816  */
817 
818 bool_t
819 xdr_do_clist(XDR *xdrs, clist **objpp)
820 {
821 	bool_t more_data;
822 
823 	more_data = (*objpp != NULL);
824 	if (!xdr_bool(xdrs, &more_data))
825 		return (FALSE);
826 	if (!more_data) {
827 		*objpp = NULL;
828 		return (TRUE);
829 	}
830 	return (xdr_ref_clist(xdrs, (caddr_t *)objpp));
831 }
832 
833 uint_t
834 xdr_getbufsize(XDR *xdrs)
835 {
836 	xrdma_private_t *xdrp = (xrdma_private_t *)(xdrs->x_private);
837 
838 	return ((uint_t)xdrp->xp_buf_size);
839 }
840 
841 /* ARGSUSED */
842 bool_t
843 xdr_encode_rlist_svc(XDR *xdrs, clist *rlist)
844 {
845 	bool_t	vfalse = FALSE;
846 
847 	ASSERT(rlist == NULL);
848 	return (xdr_bool(xdrs, &vfalse));
849 }
850 
851 bool_t
852 xdr_encode_wlist(XDR *xdrs, clist *w)
853 {
854 	bool_t		vfalse = FALSE, vtrue = TRUE;
855 	int		i;
856 	uint_t		num_segment = 0;
857 	struct clist	*cl;
858 
859 	/* does a wlist exist? */
860 	if (w == NULL) {
861 		return (xdr_bool(xdrs, &vfalse));
862 	}
863 	/* Encode N consecutive segments, 1, N, HLOO, ..., HLOO, 0 */
864 	if (!xdr_bool(xdrs, &vtrue))
865 		return (FALSE);
866 
867 	for (cl = w; cl != NULL; cl = cl->c_next) {
868 		num_segment++;
869 	}
870 
871 	if (!xdr_uint32(xdrs, &num_segment))
872 		return (FALSE);
873 	for (i = 0; i < num_segment; i++) {
874 		if (!xdr_uint32(xdrs, &w->c_dmemhandle.mrc_rmr))
875 			return (FALSE);
876 
877 		if (!xdr_uint32(xdrs, &w->c_len))
878 			return (FALSE);
879 
880 		if (!xdr_uint64(xdrs, &w->u.c_daddr))
881 			return (FALSE);
882 
883 		w = w->c_next;
884 	}
885 
886 	if (!xdr_bool(xdrs, &vfalse))
887 		return (FALSE);
888 
889 	return (TRUE);
890 }
891 
892 
893 /*
894  * Conditionally decode a RDMA WRITE chunk list from XDR stream.
895  *
896  * If the next boolean in the XDR stream is false there is no
897  * RDMA WRITE chunk list present. Otherwise iterate over the
898  * array and for each entry: allocate a struct clist and decode.
899  * Pass back an indication via wlist_exists if we have seen a
900  * RDMA WRITE chunk list.
901  */
902 bool_t
903 xdr_decode_wlist(XDR *xdrs, struct clist **w, bool_t *wlist_exists)
904 {
905 	struct clist	*tmp;
906 	bool_t		more = FALSE;
907 	uint32_t	seg_array_len;
908 	uint32_t	i;
909 
910 	if (!xdr_bool(xdrs, &more))
911 		return (FALSE);
912 
913 	/* is there a wlist? */
914 	if (more == FALSE) {
915 		*wlist_exists = FALSE;
916 		return (TRUE);
917 	}
918 	*wlist_exists = TRUE;
919 
920 	if (!xdr_uint32(xdrs, &seg_array_len))
921 		return (FALSE);
922 
923 	tmp = *w = clist_alloc();
924 	for (i = 0; i < seg_array_len; i++) {
925 		if (!xdr_uint32(xdrs, &tmp->c_dmemhandle.mrc_rmr))
926 			return (FALSE);
927 		if (!xdr_uint32(xdrs, &tmp->c_len))
928 			return (FALSE);
929 		if (!xdr_uint64(xdrs, &tmp->u.c_daddr))
930 			return (FALSE);
931 		if (i < seg_array_len - 1) {
932 			tmp->c_next = clist_alloc();
933 			tmp = tmp->c_next;
934 		} else {
935 			tmp->c_next = NULL;
936 		}
937 	}
938 
939 	more = FALSE;
940 	if (!xdr_bool(xdrs, &more))
941 		return (FALSE);
942 
943 	return (TRUE);
944 }
945 
946 /*
947  * Server side RDMA WRITE list decode.
948  * XDR context is memory ops
949  */
950 bool_t
951 xdr_decode_wlist_svc(XDR *xdrs, struct clist **wclp, bool_t *wwl,
952     uint32_t *total_length, CONN *conn)
953 {
954 	struct clist	*first, *ncl;
955 	char		*memp;
956 	uint32_t	num_wclist;
957 	uint32_t	wcl_length = 0;
958 	uint32_t	i;
959 	bool_t		more = FALSE;
960 
961 	*wclp = NULL;
962 	*wwl = FALSE;
963 	*total_length = 0;
964 
965 	if (!xdr_bool(xdrs, &more)) {
966 		return (FALSE);
967 	}
968 
969 	if (more == FALSE) {
970 		return (TRUE);
971 	}
972 
973 	*wwl = TRUE;
974 
975 	if (!xdr_uint32(xdrs, &num_wclist)) {
976 		DTRACE_PROBE(krpc__e__xdrrdma__wlistsvc__listlength);
977 		return (FALSE);
978 	}
979 
980 	first = ncl = clist_alloc();
981 
982 	for (i = 0; i < num_wclist; i++) {
983 		if (!xdr_uint32(xdrs, &ncl->c_dmemhandle.mrc_rmr))
984 			goto err_out;
985 		if (!xdr_uint32(xdrs, &ncl->c_len))
986 			goto err_out;
987 		if (!xdr_uint64(xdrs, &ncl->u.c_daddr))
988 			goto err_out;
989 
990 		if (ncl->c_len > MAX_SVC_XFER_SIZE) {
991 			DTRACE_PROBE(
992 			    krpc__e__xdrrdma__wlistsvc__chunklist_toobig);
993 			ncl->c_len = MAX_SVC_XFER_SIZE;
994 		}
995 
996 		wcl_length += ncl->c_len;
997 
998 		if (i < num_wclist - 1) {
999 			ncl->c_next = clist_alloc();
1000 			ncl = ncl->c_next;
1001 		}
1002 	}
1003 
1004 	if (!xdr_bool(xdrs, &more))
1005 		goto err_out;
1006 
1007 	first->rb_longbuf.type = RDMA_LONG_BUFFER;
1008 	first->rb_longbuf.len =
1009 	    wcl_length > WCL_BUF_LEN ? wcl_length : WCL_BUF_LEN;
1010 
1011 	if (rdma_buf_alloc(conn, &first->rb_longbuf)) {
1012 		clist_free(first);
1013 		return (FALSE);
1014 	}
1015 
1016 	memp = first->rb_longbuf.addr;
1017 
1018 	ncl = first;
1019 	for (i = 0; i < num_wclist; i++) {
1020 		ncl->w.c_saddr3 = (caddr_t)memp;
1021 		memp += ncl->c_len;
1022 		ncl = ncl->c_next;
1023 	}
1024 
1025 	*wclp = first;
1026 	*total_length = wcl_length;
1027 	return (TRUE);
1028 
1029 err_out:
1030 	clist_free(first);
1031 	return (FALSE);
1032 }
1033 
1034 /*
1035  * XDR decode the long reply write chunk.
1036  */
1037 bool_t
1038 xdr_decode_reply_wchunk(XDR *xdrs, struct clist **clist)
1039 {
1040 	bool_t		have_rchunk = FALSE;
1041 	struct clist	*first = NULL, *ncl = NULL;
1042 	uint32_t	num_wclist;
1043 	uint32_t	i;
1044 
1045 	if (!xdr_bool(xdrs, &have_rchunk))
1046 		return (FALSE);
1047 
1048 	if (have_rchunk == FALSE)
1049 		return (TRUE);
1050 
1051 	if (!xdr_uint32(xdrs, &num_wclist)) {
1052 		DTRACE_PROBE(krpc__e__xdrrdma__replywchunk__listlength);
1053 		return (FALSE);
1054 	}
1055 
1056 	if (num_wclist == 0) {
1057 		return (FALSE);
1058 	}
1059 
1060 	first = ncl = clist_alloc();
1061 
1062 	for (i = 0; i < num_wclist; i++) {
1063 		if (!xdr_uint32(xdrs, &ncl->c_dmemhandle.mrc_rmr))
1064 			goto err_out;
1065 		if (!xdr_uint32(xdrs, &ncl->c_len))
1066 			goto err_out;
1067 		if (!xdr_uint64(xdrs, &ncl->u.c_daddr))
1068 			goto err_out;
1069 
1070 		if (ncl->c_len > MAX_SVC_XFER_SIZE) {
1071 			DTRACE_PROBE(
1072 			    krpc__e__xdrrdma__replywchunk__chunklist_toobig);
1073 			ncl->c_len = MAX_SVC_XFER_SIZE;
1074 		}
1075 		if (!(ncl->c_dmemhandle.mrc_rmr &&
1076 		    (ncl->c_len > 0) && ncl->u.c_daddr))
1077 			DTRACE_PROBE(
1078 			    krpc__e__xdrrdma__replywchunk__invalid_segaddr);
1079 
1080 		if (i > 0) {
1081 			ncl->c_next = clist_alloc();
1082 			ncl = ncl->c_next;
1083 		}
1084 	}
1085 	*clist = first;
1086 	return (TRUE);
1087 
1088 err_out:
1089 	clist_free(first);
1090 	return (FALSE);
1091 }
1092 
1093 
1094 bool_t
1095 xdr_encode_reply_wchunk(XDR *xdrs,
1096     struct clist *cl_longreply, uint32_t seg_array_len)
1097 {
1098 	int		i;
1099 	bool_t		long_reply_exists = TRUE;
1100 	uint32_t	length;
1101 	uint64		offset;
1102 
1103 	if (seg_array_len > 0) {
1104 		if (!xdr_bool(xdrs, &long_reply_exists))
1105 			return (FALSE);
1106 		if (!xdr_uint32(xdrs, &seg_array_len))
1107 			return (FALSE);
1108 
1109 		for (i = 0; i < seg_array_len; i++) {
1110 			if (!cl_longreply)
1111 				return (FALSE);
1112 			length = cl_longreply->c_len;
1113 			offset = (uint64) cl_longreply->u.c_daddr;
1114 
1115 			if (!xdr_uint32(xdrs,
1116 			    &cl_longreply->c_dmemhandle.mrc_rmr))
1117 				return (FALSE);
1118 			if (!xdr_uint32(xdrs, &length))
1119 				return (FALSE);
1120 			if (!xdr_uint64(xdrs, &offset))
1121 				return (FALSE);
1122 			cl_longreply = cl_longreply->c_next;
1123 		}
1124 	} else {
1125 		long_reply_exists = FALSE;
1126 		if (!xdr_bool(xdrs, &long_reply_exists))
1127 			return (FALSE);
1128 	}
1129 	return (TRUE);
1130 }
1131 bool_t
1132 xdrrdma_read_from_client(struct clist **rlist, CONN **conn, uint_t count)
1133 {
1134 	struct clist	*rdclist;
1135 	struct clist	cl;
1136 	uint_t		total_len = 0;
1137 	uint32_t	status;
1138 	bool_t		retval = TRUE;
1139 
1140 	(*rlist)->rb_longbuf.type = RDMA_LONG_BUFFER;
1141 	(*rlist)->rb_longbuf.len =
1142 	    count > RCL_BUF_LEN ? count : RCL_BUF_LEN;
1143 
1144 	if (rdma_buf_alloc(*conn, &(*rlist)->rb_longbuf)) {
1145 		return (FALSE);
1146 	}
1147 
1148 	for (rdclist = *rlist;
1149 	    rdclist != NULL; rdclist = rdclist->c_next) {
1150 		total_len += rdclist->c_len;
1151 #if (defined(OBJ32)||defined(DEBUG32))
1152 		rdclist->u.c_daddr3 =
1153 		    (caddr_t)((char *)(*rlist)->rb_longbuf.addr +
1154 		    (uint32) rdclist->u.c_daddr3);
1155 #else
1156 		rdclist->u.c_daddr3 =
1157 		    (caddr_t)((char *)(*rlist)->rb_longbuf.addr +
1158 		    (uint64) rdclist->u.c_daddr);
1159 
1160 #endif
1161 		cl = (*rdclist);
1162 		cl.c_next = NULL;
1163 
1164 		if (clist_register(*conn, &cl, CLIST_REG_DST) != RDMA_SUCCESS) {
1165 			rdma_buf_free(*conn, &(*rlist)->rb_longbuf);
1166 			DTRACE_PROBE(
1167 			    krpc__e__xdrrdma__readfromclient__clist__reg);
1168 			return (FALSE);
1169 		}
1170 
1171 		DTRACE_PROBE1(krpc__i__xdrrdma__readfromclient__buflen,
1172 		    int, rdclist->c_len);
1173 
1174 		/*
1175 		 * Now read the chunk in
1176 		 */
1177 		if (rdclist->c_next == NULL) {
1178 			status = RDMA_READ(*conn, &cl, WAIT);
1179 		} else {
1180 			status = RDMA_READ(*conn, &cl, NOWAIT);
1181 		}
1182 		if (status != RDMA_SUCCESS) {
1183 			DTRACE_PROBE(
1184 			    krpc__e__xdrrdma__readfromclient__readfailed);
1185 			rdma_buf_free(*conn, &(*rlist)->rb_longbuf);
1186 			return (FALSE);
1187 		}
1188 	}
1189 
1190 	cl = (*(*rlist));
1191 	cl.c_next = NULL;
1192 	cl.c_len = total_len;
1193 	if (clist_syncmem(*conn, &cl, 0) != RDMA_SUCCESS) {
1194 		retval = FALSE;
1195 	}
1196 	return (retval);
1197 }
1198 
1199 bool_t
1200 xdrrdma_free_clist(CONN *conn, struct clist *clp)
1201 {
1202 	rdma_buf_free(conn, &clp->rb_longbuf);
1203 	clist_free(clp);
1204 	return (TRUE);
1205 }
1206 
1207 bool_t
1208 xdrrdma_send_read_data(XDR *xdrs, struct clist *wcl)
1209 {
1210 	int status;
1211 	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);
1212 	struct xdr_ops *xops = xdrrdma_xops();
1213 
1214 	/* caller is doing a sizeof */
1215 	if (xdrs->x_ops != &xdrrdma_ops || xdrs->x_ops == xops)
1216 		return (TRUE);
1217 
1218 	status = clist_register(xdrp->xp_conn, wcl, CLIST_REG_SOURCE);
1219 	if (status != RDMA_SUCCESS) {
1220 		return (FALSE);
1221 	}
1222 
1223 	status = clist_syncmem(xdrp->xp_conn, wcl, CLIST_REG_SOURCE);
1224 	if (status != RDMA_SUCCESS) {
1225 		return (FALSE);
1226 	}
1227 
1228 	status = RDMA_WRITE(xdrp->xp_conn, wcl, WAIT);
1229 	if (status != RDMA_SUCCESS) {
1230 		return (FALSE);
1231 	}
1232 
1233 	return (TRUE);
1234 }
1235