/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Copyright (c) 2007, The Ohio State University. All rights reserved.
 *
 * Portions of this source code were developed by the team members of
 * The Ohio State University's Network-Based Computing Laboratory (NBCL),
 * headed by Professor Dhabaleswar K. (DK) Panda.
 *
 * Acknowledgements to contributions from developers:
 *    Ranjit Noronha: noronha@cse.ohio-state.edu
 *    Lei Chai      : chail@cse.ohio-state.edu
 *    Weikuan Yu    : yuw@cse.ohio-state.edu
 *
 */

/*
 * xdr_rdma.c, XDR implementation using RDMA to move large chunks
 */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/kmem.h>
#include <sys/sdt.h>
#include <sys/debug.h>

#include <rpc/types.h>
#include <rpc/xdr.h>
#include <sys/cmn_err.h>
#include <rpc/rpc_sztypes.h>
#include <rpc/rpc_rdma.h>
#include <sys/sysmacros.h>
/*
 * RPC header and XDR encoding overhead. The number was determined by
 * tracing the msglen in svc_rdma_ksend for sec=sys,krb5,krb5i and krb5p.
 * If XDR_RDMA_BUF_OVERHEAD is not large enough, the result is that the
 * dtrace probe "krpc-e-svcrdma-ksend-noreplycl" fires on the server in
 * svc_rdma_ksend.
 */
#define	XDR_RDMA_BUF_OVERHEAD	300
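
/*
 * Illustrative sizing sketch (not taken from any particular caller):
 * a sender that wants room for rlen bytes of reply payload reserves
 * headroom for the RPC header and XDR encoding, e.g.
 *
 *	buflen = rlen + XDR_RDMA_BUF_OVERHEAD;
 *
 * so that the encoded reply does not overflow the send buffer.
 */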

static bool_t	xdrrdma_getint32(XDR *, int32_t *);
static bool_t	xdrrdma_putint32(XDR *, int32_t *);
static bool_t	xdrrdma_getbytes(XDR *, caddr_t, int);
static bool_t	xdrrdma_putbytes(XDR *, caddr_t, int);
uint_t		xdrrdma_getpos(XDR *);
bool_t		xdrrdma_setpos(XDR *, uint_t);
static rpc_inline_t *xdrrdma_inline(XDR *, int);
void		xdrrdma_destroy(XDR *);
static bool_t	xdrrdma_control(XDR *, int, void *);
static bool_t	xdrrdma_read_a_chunk(XDR *, CONN **);
static void	xdrrdma_free_xdr_chunks(CONN *, struct clist *);

struct xdr_ops xdrrdmablk_ops = {
	xdrrdma_getbytes,
	xdrrdma_putbytes,
	xdrrdma_getpos,
	xdrrdma_setpos,
	xdrrdma_inline,
	xdrrdma_destroy,
	xdrrdma_control,
	xdrrdma_getint32,
	xdrrdma_putint32
};

struct xdr_ops xdrrdma_ops = {
	xdrrdma_getbytes,
	xdrrdma_putbytes,
	xdrrdma_getpos,
	xdrrdma_setpos,
	xdrrdma_inline,
	xdrrdma_destroy,
	xdrrdma_control,
	xdrrdma_getint32,
	xdrrdma_putint32
};

/*
 * A chunk list entry identifies a chunk of opaque data to be moved
 * separately from the rest of the RPC message. xp_min_chunk = 0 is a
 * special case for ENCODING, which means do not chunk the incoming
 * stream of data.
 *
 * A read chunk can contain part of the RPC message in addition to the
 * inline message. In such a case, (xp_offp - x_base) will not provide
 * the correct xdr offset of the entire message. xp_off is used in such
 * a case to denote the offset or current position in the overall message
 * covering both the inline and the chunk. This is used only in the case
 * of decoding and is useful to compare read chunk 'c_xdroff' offsets.
 *
 * An example of a read chunk containing an XDR message:
 * An NFSv4 compound as follows:
 *
 *	PUTFH
 *	WRITE [4109 bytes]
 *	GETATTR
 *
 * Solaris encoding is:
 * -------------------
 *
 * <Inline message>: [PUTFH WRITE4args GETATTR]
 *                              |
 *                              v
 * [RDMA_READ chunks]:     [write data]
 *
 *
 * Linux encoding is:
 * -----------------
 *
 * <Inline message>: [PUTFH WRITE4args]
 *                              |
 *                              v
 * [RDMA_READ chunks]: [Write data] [Write data2] [Getattr chunk]
 *                        chunk1       chunk2        chunk3
 *
 * where the READ chunks are as follows:
 *
 *                 - chunk1 - 4k
 * write data      |
 *                 - chunk2 - 13 bytes (4109 - 4k)
 * getattr op      - chunk3 - 19 bytes
 * (getattr op starts at byte 4 after 3 bytes of roundup)
 *
 */

typedef struct {
	caddr_t		xp_offp;
	int		xp_min_chunk;
	uint_t		xp_flags;	/* Controls setting for rdma xdr */
	int		xp_buf_size;	/* size of xdr buffer */
	int		xp_off;		/* overall offset */
	struct clist	*xp_rcl;	/* head of chunk list */
	struct clist	**xp_rcl_next;	/* location to place/find next chunk */
	struct clist	*xp_rcl_xdr;	/* copy of rcl containing RPC message */
	struct clist	*xp_wcl;	/* head of write chunk list */
	CONN		*xp_conn;	/* connection for chunk data xfer */
	uint_t		xp_reply_chunk_len;
	/* used to track length for security modes: integrity/privacy */
	uint_t		xp_reply_chunk_len_alt;
} xrdma_private_t;

extern kmem_cache_t *clist_cache;

bool_t
xdrrdma_getrdmablk(XDR *xdrs, struct clist **rlist, uint_t *sizep,
    CONN **conn, const uint_t maxsize)
{
	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);
	struct clist	*cle = *(xdrp->xp_rcl_next);
	struct clist	*rdclist = NULL, *prev = NULL;
	bool_t		retval = TRUE;
	uint32_t	cur_offset = 0;
	uint32_t	total_segments = 0;
	uint32_t	actual_segments = 0;
	uint32_t	alen;
	uint_t		total_len;

	ASSERT(xdrs->x_op != XDR_FREE);

	/*
	 * first deal with the length since xdr bytes are counted
	 */
	if (!xdr_u_int(xdrs, sizep)) {
		DTRACE_PROBE(xdr__e__getrdmablk_sizep_fail);
		return (FALSE);
	}
	total_len = *sizep;
	if (total_len > maxsize) {
		DTRACE_PROBE2(xdr__e__getrdmablk_bad_size,
		    int, total_len, int, maxsize);
		return (FALSE);
	}
	(*conn) = xdrp->xp_conn;

	/*
	 * if no data we are done
	 */
	if (total_len == 0)
		return (TRUE);

	while (cle) {
		total_segments++;
		cle = cle->c_next;
	}

	cle = *(xdrp->xp_rcl_next);

	/*
	 * If there was a chunk at the current offset, then set up a read
	 * chunk list which records the destination address and length
	 * and will RDMA READ the data in later.
	 */
	if (cle == NULL)
		return (FALSE);

	if (cle->c_xdroff != (xdrp->xp_offp - xdrs->x_base))
		return (FALSE);

	/*
	 * Set up the chunk list with the appropriate
	 * address (offset) and length
	 */
	for (actual_segments = 0;
	    actual_segments < total_segments; actual_segments++) {

		DTRACE_PROBE3(krpc__i__xdrrdma_getrdmablk, uint32_t, cle->c_len,
		    uint32_t, total_len, uint32_t, cle->c_xdroff);

		if (total_len <= 0)
			break;

		/*
		 * not the first time in the loop
		 */
		if (actual_segments > 0)
			cle = cle->c_next;

		cle->u.c_daddr = (uint64) cur_offset;
		alen = 0;
		if (cle->c_len > total_len) {
			alen = cle->c_len;
			cle->c_len = total_len;
		}
		if (!alen)
			xdrp->xp_rcl_next = &cle->c_next;

		cur_offset += cle->c_len;
		total_len -= cle->c_len;

		if ((total_segments - actual_segments - 1) == 0 &&
		    total_len > 0) {
			DTRACE_PROBE(krpc__e__xdrrdma_getblk_chunktooshort);
			retval = FALSE;
		}

		if ((total_segments - actual_segments - 1) > 0 &&
		    total_len == 0) {
			DTRACE_PROBE2(krpc__e__xdrrdma_getblk_toobig,
			    int, total_segments, int, actual_segments);
		}

		rdclist = clist_alloc();
		(*rdclist) = (*cle);
		if ((*rlist) == NULL)
			(*rlist) = rdclist;
		if (prev == NULL)
			prev = rdclist;
		else {
			prev->c_next = rdclist;
			prev = rdclist;
		}

	}

	if (prev != NULL)
		prev->c_next = NULL;

	/*
	 * Adjust the chunk length, if we read only a part of
	 * a chunk.
	 */

	if (alen) {
		cle->w.c_saddr =
		    (uint64)(uintptr_t)cle->w.c_saddr + cle->c_len;
		cle->c_len = alen - cle->c_len;
	}

	return (retval);
}

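/*
 * Illustrative decode-path sketch (hedged; uses only names defined in
 * this file): a server-side caller typically pulls the read chunk list
 * out of the decode stream with xdrrdma_getrdmablk() and then fetches
 * the data with xdrrdma_read_from_client(), roughly:
 *
 *	struct clist *rlist = NULL;
 *	CONN *conn;
 *	uint_t len;
 *
 *	if (xdrrdma_getrdmablk(xdrs, &rlist, &len, &conn, maxsize) &&
 *	    rlist != NULL)
 *		(void) xdrrdma_read_from_client(rlist, &conn, len);
 */
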
/*
 * The procedure xdrrdma_create initializes a stream descriptor for a memory
 * buffer.
 */
void
xdrrdma_create(XDR *xdrs, caddr_t addr, uint_t size,
    int min_chunk, struct clist *cl, enum xdr_op op, CONN *conn)
{
	xrdma_private_t	*xdrp;
	struct clist	*cle;

	xdrs->x_op = op;
	xdrs->x_ops = &xdrrdma_ops;
	xdrs->x_base = addr;
	xdrs->x_handy = size;
	xdrs->x_public = NULL;

	xdrp = (xrdma_private_t *)kmem_zalloc(sizeof (xrdma_private_t),
	    KM_SLEEP);
	xdrs->x_private = (caddr_t)xdrp;
	xdrp->xp_offp = addr;
	xdrp->xp_min_chunk = min_chunk;
	xdrp->xp_flags = 0;
	xdrp->xp_buf_size = size;
	xdrp->xp_rcl = cl;
	xdrp->xp_reply_chunk_len = 0;
	xdrp->xp_reply_chunk_len_alt = 0;

	if (op == XDR_ENCODE && cl != NULL) {
		/* Find last element in chunk list and set xp_rcl_next */
		for (cle = cl; cle->c_next != NULL; cle = cle->c_next)
			continue;

		xdrp->xp_rcl_next = &(cle->c_next);
	} else {
		xdrp->xp_rcl_next = &(xdrp->xp_rcl);
	}

	xdrp->xp_wcl = NULL;

	xdrp->xp_conn = conn;
	if (xdrp->xp_min_chunk != 0)
		xdrp->xp_flags |= XDR_RDMA_CHUNK;
}

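/*
 * Typical lifetime of an RDMA XDR stream (an illustrative sketch;
 * min_chunk stands for whatever minimum chunk size the caller uses,
 * with 0 meaning "do not chunk"):
 *
 *	XDR xdrs;
 *
 *	xdrrdma_create(&xdrs, buf, buflen, min_chunk, NULL,
 *	    XDR_ENCODE, conn);
 *	... encode the message; large opaques may become chunks ...
 *	xdrrdma_destroy(&xdrs);
 */
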
/* ARGSUSED */
void
xdrrdma_destroy(XDR *xdrs)
{
	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);

	if (xdrp == NULL)
		return;

	if (xdrp->xp_wcl) {
		if (xdrp->xp_flags & XDR_RDMA_WLIST_REG) {
			(void) clist_deregister(xdrp->xp_conn, xdrp->xp_wcl);
			rdma_buf_free(xdrp->xp_conn,
			    &xdrp->xp_wcl->rb_longbuf);
		}
		clist_free(xdrp->xp_wcl);
	}

	if (xdrp->xp_rcl) {
		if (xdrp->xp_flags & XDR_RDMA_RLIST_REG) {
			(void) clist_deregister(xdrp->xp_conn, xdrp->xp_rcl);
			rdma_buf_free(xdrp->xp_conn,
			    &xdrp->xp_rcl->rb_longbuf);
		}
		clist_free(xdrp->xp_rcl);
	}

	if (xdrp->xp_rcl_xdr)
		xdrrdma_free_xdr_chunks(xdrp->xp_conn, xdrp->xp_rcl_xdr);

	(void) kmem_free(xdrs->x_private, sizeof (xrdma_private_t));
	xdrs->x_private = NULL;
}

static bool_t
xdrrdma_getint32(XDR *xdrs, int32_t *int32p)
{
	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);
	int chunked = 0;

	if ((xdrs->x_handy -= (int)sizeof (int32_t)) < 0) {
		/*
		 * check if the rest of the rpc message is in a chunk
		 */
		if (!xdrrdma_read_a_chunk(xdrs, &xdrp->xp_conn)) {
			return (FALSE);
		}
		chunked = 1;
	}

	/* LINTED pointer alignment */
	*int32p = (int32_t)ntohl((uint32_t)(*((int32_t *)(xdrp->xp_offp))));

	DTRACE_PROBE1(krpc__i__xdrrdma_getint32, int32_t, *int32p);

	xdrp->xp_offp += sizeof (int32_t);

	if (chunked)
		xdrs->x_handy -= (int)sizeof (int32_t);

	if (xdrp->xp_off != 0) {
		xdrp->xp_off += sizeof (int32_t);
	}

	return (TRUE);
}

static bool_t
xdrrdma_putint32(XDR *xdrs, int32_t *int32p)
{
	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);

	if ((xdrs->x_handy -= (int)sizeof (int32_t)) < 0)
		return (FALSE);

	/* LINTED pointer alignment */
	*(int32_t *)xdrp->xp_offp = (int32_t)htonl((uint32_t)(*int32p));
	xdrp->xp_offp += sizeof (int32_t);

	return (TRUE);
}

/*
 * DECODE bytes from XDR stream for rdma.
 * If the XDR stream contains a read chunk list,
 * it will go through xdrrdma_getrdmablk instead.
 */
static bool_t
xdrrdma_getbytes(XDR *xdrs, caddr_t addr, int len)
{
	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);
	struct clist	*cle = *(xdrp->xp_rcl_next);
	struct clist	*cls = *(xdrp->xp_rcl_next);
	struct clist	cl;
	bool_t		retval = TRUE;
	uint32_t	total_len = len;
	uint32_t	cur_offset = 0;
	uint32_t	total_segments = 0;
	uint32_t	actual_segments = 0;
	uint32_t	status = RDMA_SUCCESS;
	uint32_t	alen = 0;
	uint32_t	xpoff;

	while (cle) {
		total_segments++;
		cle = cle->c_next;
	}

	cle = *(xdrp->xp_rcl_next);

	if (xdrp->xp_off) {
		xpoff = xdrp->xp_off;
	} else {
		xpoff = (xdrp->xp_offp - xdrs->x_base);
	}

	/*
	 * If there was a chunk at the current offset, then set up a read
	 * chunk list which records the destination address and length
	 * and will RDMA READ the data in later.
	 */

	if (cle != NULL && cle->c_xdroff == xpoff) {
		for (actual_segments = 0;
		    actual_segments < total_segments; actual_segments++) {

			if (total_len <= 0)
				break;

			if (status != RDMA_SUCCESS)
				goto out;

			cle->u.c_daddr = (uint64)(uintptr_t)addr + cur_offset;
			alen = 0;
			if (cle->c_len > total_len) {
				alen = cle->c_len;
				cle->c_len = total_len;
			}
			if (!alen)
				xdrp->xp_rcl_next = &cle->c_next;

			cur_offset += cle->c_len;
			total_len -= cle->c_len;

			if ((total_segments - actual_segments - 1) == 0 &&
			    total_len > 0) {
				DTRACE_PROBE(
				    krpc__e__xdrrdma_getbytes_chunktooshort);
				retval = FALSE;
			}

			if ((total_segments - actual_segments - 1) > 0 &&
			    total_len == 0) {
				DTRACE_PROBE2(krpc__e__xdrrdma_getbytes_toobig,
				    int, total_segments, int, actual_segments);
			}

			/*
			 * RDMA READ the chunk data from the remote end.
			 * First prep the destination buffer by registering
			 * it, then RDMA READ the chunk data. Since we are
			 * doing streaming memory, sync the destination
			 * buffer to CPU and deregister the buffer.
			 */
			if (xdrp->xp_conn == NULL) {
				return (FALSE);
			}
			cl = *cle;
			cl.c_next = NULL;
			status = clist_register(xdrp->xp_conn, &cl,
			    CLIST_REG_DST);
			if (status != RDMA_SUCCESS) {
				retval = FALSE;
				/*
				 * Deregister the previous chunks
				 * before returning
				 */
				goto out;
			}

			cle->c_dmemhandle = cl.c_dmemhandle;
			cle->c_dsynchandle = cl.c_dsynchandle;

			/*
			 * Now read the chunk in
			 */
			if ((total_segments - actual_segments - 1) == 0 ||
			    total_len == 0) {
				status = RDMA_READ(xdrp->xp_conn, &cl, WAIT);
			} else {
				status = RDMA_READ(xdrp->xp_conn, &cl, NOWAIT);
			}
			if (status != RDMA_SUCCESS) {
				DTRACE_PROBE1(
				    krpc__i__xdrrdma_getblk_readfailed,
				    int, status);
				retval = FALSE;
			}

			cle = cle->c_next;

		}

		/*
		 * sync the memory for cpu
		 */
		cl = *cls;
		cl.c_next = NULL;
		cl.c_len = cur_offset;
		if (clist_syncmem(
		    xdrp->xp_conn, &cl, CLIST_REG_DST) != RDMA_SUCCESS) {
			retval = FALSE;
		}
out:

		/*
		 * Deregister the chunks
		 */
		cle = cls;
		while (actual_segments != 0) {
			cl = *cle;
			cl.c_next = NULL;

			cl.c_regtype = CLIST_REG_DST;
			(void) clist_deregister(xdrp->xp_conn, &cl);

			cle = cle->c_next;
			actual_segments--;
		}

		if (alen) {
			cle = *(xdrp->xp_rcl_next);
			cle->w.c_saddr =
			    (uint64)(uintptr_t)cle->w.c_saddr + cle->c_len;
			cle->c_len = alen - cle->c_len;
		}

		return (retval);
	}

	if ((xdrs->x_handy -= len) < 0)
		return (FALSE);

	bcopy(xdrp->xp_offp, addr, len);

	xdrp->xp_offp += len;

	if (xdrp->xp_off != 0)
		xdrp->xp_off += len;

	return (TRUE);
}

/*
 * ENCODE some bytes into an XDR stream. xp_min_chunk = 0 means the stream
 * of bytes contains no chunks to separate out, and if the bytes do not fit
 * in the supplied buffer, grow the buffer and free the old buffer.
 */
static bool_t
xdrrdma_putbytes(XDR *xdrs, caddr_t addr, int len)
{
	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);
	/*
	 * Is this stream accepting chunks?
	 * If so, does either of the two following conditions exist?
	 * - length of bytes to encode is greater than the min chunk size?
	 * - remaining space in this stream is shorter than length of
	 *   bytes to encode?
	 *
	 * If either of the above holds, then create a chunk for this
	 * encoding and save the addresses, etc.
	 */
	if (xdrp->xp_flags & XDR_RDMA_CHUNK &&
	    ((xdrp->xp_min_chunk != 0 &&
	    len >= xdrp->xp_min_chunk) ||
	    (xdrs->x_handy - len < 0))) {
		struct clist	*cle;
		int		offset = xdrp->xp_offp - xdrs->x_base;

		cle = clist_alloc();
		cle->c_xdroff = offset;
		cle->c_len = len;
		cle->w.c_saddr = (uint64)(uintptr_t)addr;
		cle->c_next = NULL;

		*(xdrp->xp_rcl_next) = cle;
		xdrp->xp_rcl_next = &(cle->c_next);

		return (TRUE);
	}
	/* Is there enough space to encode what is left? */
	if ((xdrs->x_handy -= len) < 0) {
		return (FALSE);
	}
	bcopy(addr, xdrp->xp_offp, len);
	xdrp->xp_offp += len;

	return (TRUE);
}

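/*
 * For example (illustrative numbers only): with xp_min_chunk set to
 * 1024, a 4096-byte WRITE payload passed to xdrrdma_putbytes() becomes
 * a separate chunk list entry, while a 100-byte attribute string is
 * simply bcopy'd inline into the XDR buffer.
 */
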
uint_t
xdrrdma_getpos(XDR *xdrs)
{
	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);

	return ((uint_t)((uintptr_t)xdrp->xp_offp - (uintptr_t)xdrs->x_base));
}

bool_t
xdrrdma_setpos(XDR *xdrs, uint_t pos)
{
	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);

	caddr_t		newaddr = xdrs->x_base + pos;
	caddr_t		lastaddr = xdrp->xp_offp + xdrs->x_handy;
	ptrdiff_t	diff;

	if (newaddr > lastaddr)
		return (FALSE);

	xdrp->xp_offp = newaddr;
	diff = lastaddr - newaddr;
	xdrs->x_handy = (int)diff;

	return (TRUE);
}

/* ARGSUSED */
static rpc_inline_t *
xdrrdma_inline(XDR *xdrs, int len)
{
	rpc_inline_t	*buf = NULL;
	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);
	struct clist	*cle = *(xdrp->xp_rcl_next);

	if (xdrs->x_op == XDR_DECODE) {
		/*
		 * Since chunks aren't in-line, check to see whether there is
		 * a chunk in the inline range.
		 */
		if (cle != NULL &&
		    cle->c_xdroff <= (xdrp->xp_offp - xdrs->x_base + len))
			return (NULL);
	}

	/* LINTED pointer alignment */
	buf = (rpc_inline_t *)xdrp->xp_offp;
	if (!IS_P2ALIGNED(buf, sizeof (int32_t)))
		return (NULL);

	if ((xdrs->x_handy < len) || (xdrp->xp_min_chunk != 0 &&
	    len >= xdrp->xp_min_chunk)) {
		return (NULL);
	} else {
		xdrs->x_handy -= len;
		xdrp->xp_offp += len;
		return (buf);
	}
}

static bool_t
xdrrdma_control(XDR *xdrs, int request, void *info)
{
	int32_t		*int32p;
	int		len, i;
	uint_t		in_flags;
	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);
	rdma_chunkinfo_t *rcip = NULL;
	rdma_wlist_conn_info_t *rwcip = NULL;
	rdma_chunkinfo_lengths_t *rcilp = NULL;
	struct uio *uiop;
	struct clist	*rwl = NULL, *first = NULL;
	struct clist	*prev = NULL;

	switch (request) {
	case XDR_PEEK:
		/*
		 * Return the next 4 byte unit in the XDR stream.
		 */
		if (xdrs->x_handy < sizeof (int32_t))
			return (FALSE);

		int32p = (int32_t *)info;
		*int32p = (int32_t)ntohl((uint32_t)
		    (*((int32_t *)(xdrp->xp_offp))));

		return (TRUE);

	case XDR_SKIPBYTES:
		/*
		 * Skip the next N bytes in the XDR stream.
		 */
		int32p = (int32_t *)info;
		len = RNDUP((int)(*int32p));
		if ((xdrs->x_handy -= len) < 0)
			return (FALSE);
		xdrp->xp_offp += len;

		return (TRUE);

	case XDR_RDMA_SET_FLAGS:
		/*
		 * Set the flags provided via *info in xp_flags for rdma
		 * xdr stream control.
		 */
		int32p = (int32_t *)info;
		in_flags = (uint_t)(*int32p);

		xdrp->xp_flags |= in_flags;
		return (TRUE);

	case XDR_RDMA_GET_FLAGS:
		/*
		 * Get the flags in xp_flags and return them through *info.
		 */
		int32p = (int32_t *)info;

		*int32p = (int32_t)xdrp->xp_flags;
		return (TRUE);

	case XDR_RDMA_GET_CHUNK_LEN:
		rcilp = (rdma_chunkinfo_lengths_t *)info;
		rcilp->rcil_len = xdrp->xp_reply_chunk_len;
		rcilp->rcil_len_alt = xdrp->xp_reply_chunk_len_alt;

		return (TRUE);

	case XDR_RDMA_ADD_CHUNK:
		/*
		 * Store wlist information
		 */

		rcip = (rdma_chunkinfo_t *)info;

		DTRACE_PROBE2(krpc__i__xdrrdma__control__add__chunk,
		    rci_type_t, rcip->rci_type, uint32, rcip->rci_len);
		switch (rcip->rci_type) {
		case RCI_WRITE_UIO_CHUNK:
			xdrp->xp_reply_chunk_len_alt += rcip->rci_len;

			if ((rcip->rci_len + XDR_RDMA_BUF_OVERHEAD) <
			    xdrp->xp_min_chunk) {
				xdrp->xp_wcl = NULL;
				*(rcip->rci_clpp) = NULL;
				return (TRUE);
			}
			uiop = rcip->rci_a.rci_uiop;

			for (i = 0; i < uiop->uio_iovcnt; i++) {
				rwl = clist_alloc();
				if (first == NULL)
					first = rwl;
				rwl->c_len = uiop->uio_iov[i].iov_len;
				rwl->u.c_daddr =
				    (uint64)(uintptr_t)
				    (uiop->uio_iov[i].iov_base);
				/*
				 * if userspace address, put adspace ptr in
				 * clist. If not, then do nothing since it's
				 * already set to NULL (from kmem_zalloc)
				 */
				if (uiop->uio_segflg == UIO_USERSPACE) {
					rwl->c_adspc = ttoproc(curthread)->p_as;
				}

				if (prev == NULL)
					prev = rwl;
				else {
					prev->c_next = rwl;
					prev = rwl;
				}
			}

			rwl->c_next = NULL;
			xdrp->xp_wcl = first;
			*(rcip->rci_clpp) = first;

			break;

		case RCI_WRITE_ADDR_CHUNK:
			rwl = clist_alloc();

			rwl->c_len = rcip->rci_len;
			rwl->u.c_daddr3 = rcip->rci_a.rci_addr;
			rwl->c_next = NULL;
			xdrp->xp_reply_chunk_len_alt += rcip->rci_len;

			xdrp->xp_wcl = rwl;
			*(rcip->rci_clpp) = rwl;

			break;

		case RCI_REPLY_CHUNK:
			xdrp->xp_reply_chunk_len += rcip->rci_len;
			break;
		}
		return (TRUE);

	case XDR_RDMA_GET_WLIST:
		*((struct clist **)info) = xdrp->xp_wcl;
		return (TRUE);

	case XDR_RDMA_SET_WLIST:
		xdrp->xp_wcl = (struct clist *)info;
		return (TRUE);

	case XDR_RDMA_GET_RLIST:
		*((struct clist **)info) = xdrp->xp_rcl;
		return (TRUE);

	case XDR_RDMA_GET_WCINFO:
		rwcip = (rdma_wlist_conn_info_t *)info;

		rwcip->rwci_wlist = xdrp->xp_wcl;
		rwcip->rwci_conn = xdrp->xp_conn;

		return (TRUE);

	default:
		return (FALSE);
	}
}

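/*
 * Illustrative control usage (a sketch): callers normally reach these
 * requests through the XDR_CONTROL() macro from <rpc/xdr.h>, e.g. to
 * peek at the next word without consuming it:
 *
 *	int32_t word;
 *
 *	if (XDR_CONTROL(xdrs, XDR_PEEK, &word)) {
 *		... word now holds the next 4-byte XDR unit ...
 *	}
 */
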
bool_t xdr_do_clist(XDR *, clist **);

/*
 * Not all fields in struct clist are interesting to the RPC over RDMA
 * protocol. Only XDR the interesting fields.
 */
bool_t
xdr_clist(XDR *xdrs, clist *objp)
{
	if (!xdr_uint32(xdrs, &objp->c_xdroff))
		return (FALSE);
	if (!xdr_uint32(xdrs, &objp->c_smemhandle.mrc_rmr))
		return (FALSE);
	if (!xdr_uint32(xdrs, &objp->c_len))
		return (FALSE);
	if (!xdr_uint64(xdrs, &objp->w.c_saddr))
		return (FALSE);
	if (!xdr_do_clist(xdrs, &objp->c_next))
		return (FALSE);
	return (TRUE);
}

/*
 * The following two functions are forms of xdr_pointer()
 * and xdr_reference(). Since the generic versions just
 * kmem_alloc() a new clist, we actually want to use the
 * rdma_clist kmem_cache.
 */

/*
 * Generate or free a clist structure from the
 * kmem_cache "rdma_clist"
 */
bool_t
xdr_ref_clist(XDR *xdrs, caddr_t *pp)
{
	caddr_t loc = *pp;
	bool_t stat;

	if (loc == NULL) {
		switch (xdrs->x_op) {
		case XDR_FREE:
			return (TRUE);

		case XDR_DECODE:
			*pp = loc = (caddr_t)clist_alloc();
			break;

		case XDR_ENCODE:
			ASSERT(loc);
			break;
		}
	}

	stat = xdr_clist(xdrs, (struct clist *)loc);

	if (xdrs->x_op == XDR_FREE) {
		kmem_cache_free(clist_cache, loc);
		*pp = NULL;
	}
	return (stat);
}

/*
 * XDR a pointer to a possibly recursive clist. This differs
 * from xdr_reference in that it can serialize/deserialize
 * trees correctly.
 *
 * What is sent is actually a union:
 *
 * union object_pointer switch (boolean b) {
 * case TRUE: object_data data;
 * case FALSE: void nothing;
 * }
 *
 * > objpp: Pointer to the pointer to the object.
 *
 */

bool_t
xdr_do_clist(XDR *xdrs, clist **objpp)
{
	bool_t more_data;

	more_data = (*objpp != NULL);
	if (!xdr_bool(xdrs, &more_data))
		return (FALSE);
	if (!more_data) {
		*objpp = NULL;
		return (TRUE);
	}
	return (xdr_ref_clist(xdrs, (caddr_t *)objpp));
}

uint_t
xdr_getbufsize(XDR *xdrs)
{
	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);

	return ((uint_t)xdrp->xp_buf_size);
}

/* ARGSUSED */
bool_t
xdr_encode_rlist_svc(XDR *xdrs, clist *rlist)
{
	bool_t	vfalse = FALSE;

	ASSERT(rlist == NULL);
	return (xdr_bool(xdrs, &vfalse));
}

bool_t
xdr_encode_wlist(XDR *xdrs, clist *w)
{
	bool_t		vfalse = FALSE, vtrue = TRUE;
	int		i;
	uint_t		num_segment = 0;
	struct clist	*cl;

	/* does a wlist exist? */
	if (w == NULL) {
		return (xdr_bool(xdrs, &vfalse));
	}
	/* Encode N consecutive segments, 1, N, HLOO, ..., HLOO, 0 */
	if (!xdr_bool(xdrs, &vtrue))
		return (FALSE);

	for (cl = w; cl != NULL; cl = cl->c_next) {
		num_segment++;
	}

	if (!xdr_uint32(xdrs, &num_segment))
		return (FALSE);
	for (i = 0; i < num_segment; i++) {

		DTRACE_PROBE1(krpc__i__xdr_encode_wlist_len, uint_t, w->c_len);

		if (!xdr_uint32(xdrs, &w->c_dmemhandle.mrc_rmr))
			return (FALSE);

		if (!xdr_uint32(xdrs, &w->c_len))
			return (FALSE);

		if (!xdr_uint64(xdrs, &w->u.c_daddr))
			return (FALSE);

		w = w->c_next;
	}

	if (!xdr_bool(xdrs, &vfalse))
		return (FALSE);

	return (TRUE);
}


/*
 * Conditionally decode a RDMA WRITE chunk list from XDR stream.
 *
 * If the next boolean in the XDR stream is false there is no
 * RDMA WRITE chunk list present. Otherwise iterate over the
 * array and for each entry: allocate a struct clist and decode.
 * Pass back an indication via wlist_exists if we have seen a
 * RDMA WRITE chunk list.
 */
bool_t
xdr_decode_wlist(XDR *xdrs, struct clist **w, bool_t *wlist_exists)
{
	struct clist	*tmp;
	bool_t		more = FALSE;
	uint32_t	seg_array_len;
	uint32_t	i;

	if (!xdr_bool(xdrs, &more))
		return (FALSE);

	/* is there a wlist? */
	if (more == FALSE) {
		*wlist_exists = FALSE;
		return (TRUE);
	}
	*wlist_exists = TRUE;

	if (!xdr_uint32(xdrs, &seg_array_len))
		return (FALSE);

	tmp = *w = clist_alloc();
	for (i = 0; i < seg_array_len; i++) {

		if (!xdr_uint32(xdrs, &tmp->c_dmemhandle.mrc_rmr))
			return (FALSE);
		if (!xdr_uint32(xdrs, &tmp->c_len))
			return (FALSE);

		DTRACE_PROBE1(krpc__i__xdr_decode_wlist_len,
		    uint_t, tmp->c_len);

		if (!xdr_uint64(xdrs, &tmp->u.c_daddr))
			return (FALSE);
		if (i < seg_array_len - 1) {
			tmp->c_next = clist_alloc();
			tmp = tmp->c_next;
		} else {
			tmp->c_next = NULL;
		}
	}

	more = FALSE;
	if (!xdr_bool(xdrs, &more))
		return (FALSE);

	return (TRUE);
}

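/*
 * On the wire, the write list produced and consumed above is laid out
 * as the following sequence of XDR items (booleans and uint32s are
 * 4 bytes, the address is 8 bytes):
 *
 *	TRUE | nsegs | { rmr, len, daddr } x nsegs | FALSE
 *
 * which matches the "1, N, HLOO, ..., HLOO, 0" note in
 * xdr_encode_wlist().
 */
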
/*
 * Server side RDMA WRITE list decode.
 * XDR context is memory ops
 */
bool_t
xdr_decode_wlist_svc(XDR *xdrs, struct clist **wclp, bool_t *wwl,
    uint32_t *total_length, CONN *conn)
{
	struct clist	*first, *ncl;
	char		*memp;
	uint32_t	num_wclist;
	uint32_t	wcl_length = 0;
	uint32_t	i;
	bool_t		more = FALSE;

	*wclp = NULL;
	*wwl = FALSE;
	*total_length = 0;

	if (!xdr_bool(xdrs, &more)) {
		return (FALSE);
	}

	if (more == FALSE) {
		return (TRUE);
	}

	*wwl = TRUE;

	if (!xdr_uint32(xdrs, &num_wclist)) {
		DTRACE_PROBE(krpc__e__xdrrdma__wlistsvc__listlength);
		return (FALSE);
	}

	first = ncl = clist_alloc();

	for (i = 0; i < num_wclist; i++) {

		if (!xdr_uint32(xdrs, &ncl->c_dmemhandle.mrc_rmr))
			goto err_out;
		if (!xdr_uint32(xdrs, &ncl->c_len))
			goto err_out;
		if (!xdr_uint64(xdrs, &ncl->u.c_daddr))
			goto err_out;

		if (ncl->c_len > MAX_SVC_XFER_SIZE) {
			DTRACE_PROBE(
			    krpc__e__xdrrdma__wlistsvc__chunklist_toobig);
			ncl->c_len = MAX_SVC_XFER_SIZE;
		}

		DTRACE_PROBE1(krpc__i__xdr_decode_wlist_svc_len,
		    uint_t, ncl->c_len);

		wcl_length += ncl->c_len;

		if (i < num_wclist - 1) {
			ncl->c_next = clist_alloc();
			ncl = ncl->c_next;
		}
	}

	if (!xdr_bool(xdrs, &more))
		goto err_out;

	first->rb_longbuf.type = RDMA_LONG_BUFFER;
	first->rb_longbuf.len =
	    wcl_length > WCL_BUF_LEN ? wcl_length : WCL_BUF_LEN;

	if (rdma_buf_alloc(conn, &first->rb_longbuf)) {
		clist_free(first);
		return (FALSE);
	}

	memp = first->rb_longbuf.addr;

	ncl = first;
	for (i = 0; i < num_wclist; i++) {
		ncl->w.c_saddr3 = (caddr_t)memp;
		memp += ncl->c_len;
		ncl = ncl->c_next;
	}

	*wclp = first;
	*total_length = wcl_length;
	return (TRUE);

err_out:
	clist_free(first);
	return (FALSE);
}

/*
 * XDR decode the long reply write chunk.
 */
bool_t
xdr_decode_reply_wchunk(XDR *xdrs, struct clist **clist)
{
	bool_t		have_rchunk = FALSE;
	struct clist	*first = NULL, *ncl = NULL;
	uint32_t	num_wclist;
	uint32_t	i;

	if (!xdr_bool(xdrs, &have_rchunk))
		return (FALSE);

	if (have_rchunk == FALSE)
		return (TRUE);

	if (!xdr_uint32(xdrs, &num_wclist)) {
		DTRACE_PROBE(krpc__e__xdrrdma__replywchunk__listlength);
		return (FALSE);
	}

	if (num_wclist == 0) {
		return (FALSE);
	}

	first = ncl = clist_alloc();

	for (i = 0; i < num_wclist; i++) {

		if (i > 0) {
			ncl->c_next = clist_alloc();
			ncl = ncl->c_next;
		}

		if (!xdr_uint32(xdrs, &ncl->c_dmemhandle.mrc_rmr))
			goto err_out;
		if (!xdr_uint32(xdrs, &ncl->c_len))
			goto err_out;
		if (!xdr_uint64(xdrs, &ncl->u.c_daddr))
			goto err_out;

		if (ncl->c_len > MAX_SVC_XFER_SIZE) {
			DTRACE_PROBE(
			    krpc__e__xdrrdma__replywchunk__chunklist_toobig);
			ncl->c_len = MAX_SVC_XFER_SIZE;
		}
		if (!(ncl->c_dmemhandle.mrc_rmr &&
		    (ncl->c_len > 0) && ncl->u.c_daddr))
			DTRACE_PROBE(
			    krpc__e__xdrrdma__replywchunk__invalid_segaddr);

		DTRACE_PROBE1(krpc__i__xdr_decode_reply_wchunk_c_len,
		    uint32_t, ncl->c_len);

	}
	*clist = first;
	return (TRUE);

err_out:
	clist_free(first);
	return (FALSE);
}


bool_t
xdr_encode_reply_wchunk(XDR *xdrs,
    struct clist *cl_longreply, uint32_t seg_array_len)
{
	int		i;
	bool_t		long_reply_exists = TRUE;
	uint32_t	length;
	uint64		offset;

	if (seg_array_len > 0) {
		if (!xdr_bool(xdrs, &long_reply_exists))
			return (FALSE);
		if (!xdr_uint32(xdrs, &seg_array_len))
			return (FALSE);

		for (i = 0; i < seg_array_len; i++) {
			if (!cl_longreply)
				return (FALSE);
			length = cl_longreply->c_len;
			offset = (uint64) cl_longreply->u.c_daddr;

			DTRACE_PROBE1(
			    krpc__i__xdr_encode_reply_wchunk_c_len,
			    uint32_t, length);

			if (!xdr_uint32(xdrs,
			    &cl_longreply->c_dmemhandle.mrc_rmr))
				return (FALSE);
			if (!xdr_uint32(xdrs, &length))
				return (FALSE);
			if (!xdr_uint64(xdrs, &offset))
				return (FALSE);
			cl_longreply = cl_longreply->c_next;
		}
	} else {
		long_reply_exists = FALSE;
		if (!xdr_bool(xdrs, &long_reply_exists))
			return (FALSE);
	}
	return (TRUE);
}

bool_t
xdrrdma_read_from_client(struct clist *rlist, CONN **conn, uint_t count)
{
	struct clist	*rdclist;
	struct clist	cl;
	uint_t		total_len = 0;
	uint32_t	status;
	bool_t		retval = TRUE;

	rlist->rb_longbuf.type = RDMA_LONG_BUFFER;
	rlist->rb_longbuf.len =
	    count > RCL_BUF_LEN ? count : RCL_BUF_LEN;

	if (rdma_buf_alloc(*conn, &rlist->rb_longbuf)) {
		return (FALSE);
	}

	/*
	 * The entire buffer is registered with the first chunk.
	 * Later chunks will use the same registered memory handle.
	 */

	cl = *rlist;
	cl.c_next = NULL;
	if (clist_register(*conn, &cl, CLIST_REG_DST) != RDMA_SUCCESS) {
		rdma_buf_free(*conn, &rlist->rb_longbuf);
		DTRACE_PROBE(
		    krpc__e__xdrrdma__readfromclient__clist__reg);
		return (FALSE);
	}

	rlist->c_regtype = CLIST_REG_DST;
	rlist->c_dmemhandle = cl.c_dmemhandle;
	rlist->c_dsynchandle = cl.c_dsynchandle;

	for (rdclist = rlist;
	    rdclist != NULL; rdclist = rdclist->c_next) {
		total_len += rdclist->c_len;
#if (defined(OBJ32)||defined(DEBUG32))
		rdclist->u.c_daddr3 =
		    (caddr_t)((char *)rlist->rb_longbuf.addr +
		    (uint32) rdclist->u.c_daddr3);
#else
		rdclist->u.c_daddr3 =
		    (caddr_t)((char *)rlist->rb_longbuf.addr +
		    (uint64) rdclist->u.c_daddr);

#endif
		cl = (*rdclist);
		cl.c_next = NULL;

		/*
		 * Use the same memory handle for all the chunks
		 */
		cl.c_dmemhandle = rlist->c_dmemhandle;
		cl.c_dsynchandle = rlist->c_dsynchandle;


		DTRACE_PROBE1(krpc__i__xdrrdma__readfromclient__buflen,
		    int, rdclist->c_len);

		/*
		 * Now read the chunk in
		 */
		if (rdclist->c_next == NULL) {
			status = RDMA_READ(*conn, &cl, WAIT);
		} else {
			status = RDMA_READ(*conn, &cl, NOWAIT);
		}
		if (status != RDMA_SUCCESS) {
			DTRACE_PROBE(
			    krpc__e__xdrrdma__readfromclient__readfailed);
			rdma_buf_free(*conn, &rlist->rb_longbuf);
			return (FALSE);
		}
	}

	cl = (*rlist);
	cl.c_next = NULL;
	cl.c_len = total_len;
	if (clist_syncmem(*conn, &cl, CLIST_REG_DST) != RDMA_SUCCESS) {
		retval = FALSE;
	}
	return (retval);
}

bool_t
xdrrdma_free_clist(CONN *conn, struct clist *clp)
{
	rdma_buf_free(conn, &clp->rb_longbuf);
	clist_free(clp);
	return (TRUE);
}

bool_t
xdrrdma_send_read_data(XDR *xdrs, uint_t data_len, struct clist *wcl)
{
	int status;
	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);
	struct xdr_ops *xops = xdrrdma_xops();
	struct clist *tcl, *wrcl, *cl;
	struct clist fcl;
	int rndup_present, rnduplen;

	rndup_present = 0;
	wrcl = NULL;

	/* caller is doing a sizeof */
	if (xdrs->x_ops != &xdrrdma_ops || xdrs->x_ops == xops)
		return (TRUE);

	/* copy of the first chunk */
	fcl = *wcl;
	fcl.c_next = NULL;

	/*
	 * The entire buffer is registered with the first chunk.
	 * Later chunks will use the same registered memory handle.
	 */

	status = clist_register(xdrp->xp_conn, &fcl, CLIST_REG_SOURCE);
	if (status != RDMA_SUCCESS) {
		return (FALSE);
	}

	wcl->c_regtype = CLIST_REG_SOURCE;
	wcl->c_smemhandle = fcl.c_smemhandle;
	wcl->c_ssynchandle = fcl.c_ssynchandle;

	/*
	 * Only transfer the read data, ignoring any trailing
	 * roundup chunks. A bit of work, but it saves an
	 * unnecessary extra RDMA_WRITE containing only
	 * roundup bytes.
	 */

	rnduplen = clist_len(wcl) - data_len;

	if (rnduplen) {

		tcl = wcl->c_next;

		/*
		 * Check if there is a trailing roundup chunk
		 */
		while (tcl) {
			if ((tcl->c_next == NULL) && (tcl->c_len == rnduplen)) {
				rndup_present = 1;
				break;
			}
			tcl = tcl->c_next;
		}

		/*
		 * Make a copy chunk list skipping the last chunk
		 */
		if (rndup_present) {
			cl = wcl;
			tcl = NULL;
			while (cl) {
				if (tcl == NULL) {
					tcl = clist_alloc();
					wrcl = tcl;
				} else {
					tcl->c_next = clist_alloc();
					tcl = tcl->c_next;
				}

				*tcl = *cl;
				cl = cl->c_next;
				/* last chunk */
				if (cl->c_next == NULL)
					break;
			}
			tcl->c_next = NULL;
		}
	}

	if (wrcl == NULL) {
		/* No roundup chunks */
		wrcl = wcl;
	}

	/*
	 * Set the registered memory handles for the
	 * rest of the chunks same as the first chunk.
	 */
	tcl = wrcl->c_next;
	while (tcl) {
		tcl->c_smemhandle = fcl.c_smemhandle;
		tcl->c_ssynchandle = fcl.c_ssynchandle;
		tcl = tcl->c_next;
	}

	/*
	 * Sync the total len beginning from the first chunk.
	 */
	fcl.c_len = clist_len(wrcl);
	status = clist_syncmem(xdrp->xp_conn, &fcl, CLIST_REG_SOURCE);
	if (status != RDMA_SUCCESS) {
		return (FALSE);
	}

	status = RDMA_WRITE(xdrp->xp_conn, wrcl, WAIT);

	if (rndup_present)
		clist_free(wrcl);

	if (status != RDMA_SUCCESS) {
		return (FALSE);
	}

	return (TRUE);
}


/*
 * Reads one chunk at a time
 */

static bool_t
xdrrdma_read_a_chunk(XDR *xdrs, CONN **conn)
{
	int status;
	int32_t len = 0;
	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);
	struct clist *cle = *(xdrp->xp_rcl_next);
	struct clist *rclp = xdrp->xp_rcl;
	struct clist *clp;

	/*
	 * len is used later to compute the XDR offset into the chunk,
	 * accounting for any 4-byte XDR roundup
	 * (see the read chunk example at the top of this file)
	 */
	while (rclp != cle) {
		len += rclp->c_len;
		rclp = rclp->c_next;
	}

	len = RNDUP(len) - len;

	ASSERT(xdrs->x_handy <= 0);

	/*
	 * If this is the first chunk to contain the RPC
	 * message, set xp_off to the xdr offset of the
	 * inline message.
	 */
	if (xdrp->xp_off == 0)
		xdrp->xp_off = (xdrp->xp_offp - xdrs->x_base);

	if (cle == NULL || (cle->c_xdroff != xdrp->xp_off))
		return (FALSE);

	/*
	 * Make a copy of the chunk to read from the client.
	 * Chunks are read on demand, so read only one
	 * for now.
	 */

	rclp = clist_alloc();
	*rclp = *cle;
	rclp->c_next = NULL;

	xdrp->xp_rcl_next = &cle->c_next;

	/*
	 * If there is a roundup present, then skip those
	 * bytes when reading.
	 */
	if (len) {
		rclp->w.c_saddr =
		    (uint64)(uintptr_t)rclp->w.c_saddr + len;
		rclp->c_len = rclp->c_len - len;
	}

	status = xdrrdma_read_from_client(rclp, conn, rclp->c_len);

	if (status == FALSE) {
		clist_free(rclp);
		return (status);
	}

	xdrp->xp_offp = rclp->rb_longbuf.addr;
	xdrs->x_base = xdrp->xp_offp;
	xdrs->x_handy = rclp->c_len;

	/*
	 * This copy of the read chunks containing the XDR
	 * message is freed later in xdrrdma_destroy()
	 */

	if (xdrp->xp_rcl_xdr) {
		/* Add the chunk to the end of the list */
		clp = xdrp->xp_rcl_xdr;
		while (clp->c_next != NULL)
			clp = clp->c_next;
		clp->c_next = rclp;
	} else {
		xdrp->xp_rcl_xdr = rclp;
	}
	return (TRUE);
}

static void
xdrrdma_free_xdr_chunks(CONN *conn, struct clist *xdr_rcl)
{
	struct clist *cl;

	(void) clist_deregister(conn, xdr_rcl);

	/*
	 * Read chunks containing parts of the XDR message are
	 * special: in the case of multiple chunks, each has
	 * its own buffer.
	 */

	cl = xdr_rcl;
	while (cl) {
		rdma_buf_free(conn, &cl->rb_longbuf);
		cl = cl->c_next;
	}

	clist_free(xdr_rcl);
}