1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25 /*
26 * Copyright (c) 2007, The Ohio State University. All rights reserved.
27 *
28 * Portions of this source code were developed by the team members of
29 * The Ohio State University's Network-Based Computing Laboratory (NBCL),
30 * headed by Professor Dhabaleswar K. (DK) Panda.
31 *
32 * Acknowledgements to contributions from developers:
33 * Ranjit Noronha: noronha@cse.ohio-state.edu
34 * Lei Chai : chail@cse.ohio-state.edu
35 * Weikuan Yu : yuw@cse.ohio-state.edu
36 *
37 */
38
39 /*
40 * xdr_rdma.c, XDR implementation using RDMA to move large chunks
41 */
42
43 #include <sys/param.h>
44 #include <sys/types.h>
45 #include <sys/systm.h>
46 #include <sys/kmem.h>
47 #include <sys/sdt.h>
48 #include <sys/debug.h>
49
50 #include <rpc/types.h>
51 #include <rpc/xdr.h>
52 #include <sys/cmn_err.h>
53 #include <rpc/rpc_sztypes.h>
54 #include <rpc/rpc_rdma.h>
55 #include <sys/sysmacros.h>
56
57 /*
58 * RPC header and xdr encoding overhead. The number was determined by
59 * tracing the msglen in svc_rdma_ksend for sec=sys,krb5,krb5i and krb5p.
60 * If XDR_RDMA_BUF_OVERHEAD is not large enough, the result is that the
61 * server fires the dtrace probe "krpc-e-svcrdma-ksend-noreplycl" in
62 * svc_rdma_ksend.
63 */
64 #define XDR_RDMA_BUF_OVERHEAD 300
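/*
 * For example (see the RCI_WRITE_UIO_CHUNK handling in xdrrdma_control()
 * below): a reply gets a separate write chunk only when
 * rci_len + XDR_RDMA_BUF_OVERHEAD reaches xp_min_chunk; anything smaller
 * stays inline in the send buffer.
 */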
65
66 static bool_t xdrrdma_getint32(XDR *, int32_t *);
67 static bool_t xdrrdma_putint32(XDR *, int32_t *);
68 static bool_t xdrrdma_getbytes(XDR *, caddr_t, int);
69 static bool_t xdrrdma_putbytes(XDR *, caddr_t, int);
70 uint_t xdrrdma_getpos(XDR *);
71 bool_t xdrrdma_setpos(XDR *, uint_t);
72 static rpc_inline_t *xdrrdma_inline(XDR *, int);
73 void xdrrdma_destroy(XDR *);
74 static bool_t xdrrdma_control(XDR *, int, void *);
75 static bool_t xdrrdma_read_a_chunk(XDR *, CONN **);
76 static void xdrrdma_free_xdr_chunks(CONN *, struct clist *);
77
78 struct xdr_ops xdrrdmablk_ops = {
79 xdrrdma_getbytes,
80 xdrrdma_putbytes,
81 xdrrdma_getpos,
82 xdrrdma_setpos,
83 xdrrdma_inline,
84 xdrrdma_destroy,
85 xdrrdma_control,
86 xdrrdma_getint32,
87 xdrrdma_putint32
88 };
89
90 struct xdr_ops xdrrdma_ops = {
91 xdrrdma_getbytes,
92 xdrrdma_putbytes,
93 xdrrdma_getpos,
94 xdrrdma_setpos,
95 xdrrdma_inline,
96 xdrrdma_destroy,
97 xdrrdma_control,
98 xdrrdma_getint32,
99 xdrrdma_putint32
100 };
101
102 /*
103 * A chunk list entry identifies a chunk of opaque data to be moved
104 * separately from the rest of the RPC message. xp_min_chunk = 0 is a
105 * special case for ENCODING: it means do not chunk the incoming stream of
106 * data.
107 *
108 * A read chunk can contain part of the RPC message in addition to the
109 * inline message. In such a case, (xp_offp - x_base) will not provide
110 * the correct xdr offset of the entire message. xp_off is used in such
111 * a case to denote the offset or current position in the overall message
112 * covering both the inline part and the chunk. This is used only when
113 * decoding and is useful for comparing read chunk 'c_xdroff' offsets.
114 *
115 * An example of a read chunk containing an XDR message:
116 * An NFSv4 compound as follows:
117 *
118 * PUTFH
119 * WRITE [4109 bytes]
120 * GETATTR
121 *
122 * Solaris Encoding is:
123 * -------------------
124 *
125 * <Inline message>: [PUTFH WRITE4args GETATTR]
126 * |
127 * v
128 * [RDMA_READ chunks]: [write data]
129 *
130 *
131 * Linux encoding is:
132 * -----------------
133 *
134 * <Inline message>: [PUTFH WRITE4args]
135 * |
136 * v
137 * [RDMA_READ chunks]: [Write data] [Write data2] [Getattr chunk]
138 * chunk1 chunk2 chunk3
139 *
140 * where the READ chunks are:
141 *
142 * - chunk1 - 4k
143 * write data |
144 * - chunk2 - 13 bytes(4109 - 4k)
145 * getattr op - chunk3 - 19 bytes
146 * (getattr op starts at byte 4 after 3 bytes of roundup)
147 *
148 */
149
150 typedef struct {
151 caddr_t xp_offp; /* current position in the xdr buffer */
152 int xp_min_chunk; /* min chunk size; 0 means do not chunk */
153 uint_t xp_flags; /* Controls setting for rdma xdr */
154 int xp_buf_size; /* size of xdr buffer */
155 int xp_off; /* overall offset */
156 struct clist *xp_rcl; /* head of chunk list */
157 struct clist **xp_rcl_next; /* location to place/find next chunk */
158 struct clist *xp_rcl_xdr; /* copy of rcl containing RPC message */
159 struct clist *xp_wcl; /* head of write chunk list */
160 CONN *xp_conn; /* connection for chunk data xfer */
161 uint_t xp_reply_chunk_len;
162 /* used to track length for security modes: integrity/privacy */
163 uint_t xp_reply_chunk_len_alt;
164 } xrdma_private_t;
165
166 extern kmem_cache_t *clist_cache;
167
168 bool_t
169 xdrrdma_getrdmablk(XDR *xdrs, struct clist **rlist, uint_t *sizep,
170 CONN **conn, const uint_t maxsize)
171 {
172 xrdma_private_t *xdrp = (xrdma_private_t *)(xdrs->x_private);
173 struct clist *cle = *(xdrp->xp_rcl_next);
174 struct clist *rdclist = NULL, *prev = NULL;
175 bool_t retval = TRUE;
176 uint32_t cur_offset = 0;
177 uint32_t total_segments = 0;
178 uint32_t actual_segments = 0;
179 uint32_t alen;
180 uint_t total_len;
181
182 ASSERT(xdrs->x_op != XDR_FREE);
183
184 /*
185 * first deal with the length since xdr bytes are counted
186 */
187 if (!xdr_u_int(xdrs, sizep)) {
188 DTRACE_PROBE(xdr__e__getrdmablk_sizep_fail);
189 return (FALSE);
190 }
191 total_len = *sizep;
192 if (total_len > maxsize) {
193 DTRACE_PROBE2(xdr__e__getrdmablk_bad_size,
194 int, total_len, int, maxsize);
195 return (FALSE);
196 }
197 (*conn) = xdrp->xp_conn;
198
199 /*
200 * if no data we are done
201 */
202 if (total_len == 0)
203 return (TRUE);
204
205 while (cle) {
206 total_segments++;
207 cle = cle->c_next;
208 }
209
210 cle = *(xdrp->xp_rcl_next);
211
212 /*
213 * If there was a chunk at the current offset, then set up a read
214 * chunk list which records the destination address and length
215 * and will RDMA READ the data in later.
216 */
217 if (cle == NULL)
218 return (FALSE);
219
220 if (cle->c_xdroff != (xdrp->xp_offp - xdrs->x_base))
221 return (FALSE);
222
223 /*
224 * Setup the chunk list with appropriate
225 * address (offset) and length
226 */
227 for (actual_segments = 0;
228 actual_segments < total_segments; actual_segments++) {
229
230 DTRACE_PROBE3(krpc__i__xdrrdma_getrdmablk, uint32_t, cle->c_len,
231 uint32_t, total_len, uint32_t, cle->c_xdroff);
232
233 if (total_len <= 0)
234 break;
235
236 /*
237 * not the first time in the loop
238 */
239 if (actual_segments > 0)
240 cle = cle->c_next;
241
242 cle->u.c_daddr = (uint64) cur_offset;
243 alen = 0;
244 if (cle->c_len > total_len) {
245 alen = cle->c_len;
246 cle->c_len = total_len;
247 }
248 if (!alen)
249 xdrp->xp_rcl_next = &cle->c_next;
250
251 cur_offset += cle->c_len;
252 total_len -= cle->c_len;
253
254 if ((total_segments - actual_segments - 1) == 0 &&
255 total_len > 0) {
256 DTRACE_PROBE(krpc__e__xdrrdma_getblk_chunktooshort);
257 retval = FALSE;
258 }
259
260 if ((total_segments - actual_segments - 1) > 0 &&
261 total_len == 0) {
262 DTRACE_PROBE2(krpc__e__xdrrdma_getblk_toobig,
263 int, total_segments, int, actual_segments);
264 }
265
266 rdclist = clist_alloc();
267 (*rdclist) = (*cle);
268 if ((*rlist) == NULL)
269 (*rlist) = rdclist;
270 if (prev == NULL)
271 prev = rdclist;
272 else {
273 prev->c_next = rdclist;
274 prev = rdclist;
275 }
276
277 }
278
279 out:
280 if (prev != NULL)
281 prev->c_next = NULL;
282
283 /*
284 * Adjust the chunk length, if we read only a part of
285 * a chunk.
286 */
287
288 if (alen) {
289 cle->w.c_saddr =
290 (uint64)(uintptr_t)cle->w.c_saddr + cle->c_len;
291 cle->c_len = alen - cle->c_len;
292 }
293
294 return (retval);
295 }
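/*
 * Note that the rlist built above carries only running offsets in
 * u.c_daddr, not real destination addresses.  A later pass, e.g.
 * xdrrdma_read_from_client() below, turns those offsets into addresses
 * inside a freshly allocated long buffer before issuing the RDMA READs.
 */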
296
297 /*
298 * The procedure xdrrdma_create initializes a stream descriptor for a memory
299 * buffer.
300 */
301 void
302 xdrrdma_create(XDR *xdrs, caddr_t addr, uint_t size,
303 int min_chunk, struct clist *cl, enum xdr_op op, CONN *conn)
304 {
305 xrdma_private_t *xdrp;
306 struct clist *cle;
307
308 xdrs->x_op = op;
309 xdrs->x_ops = &xdrrdma_ops;
310 xdrs->x_base = addr;
311 xdrs->x_handy = size;
312 xdrs->x_public = NULL;
313
314 xdrp = (xrdma_private_t *)kmem_zalloc(sizeof (xrdma_private_t),
315 KM_SLEEP);
316 xdrs->x_private = (caddr_t)xdrp;
317 xdrp->xp_offp = addr;
318 xdrp->xp_min_chunk = min_chunk;
319 xdrp->xp_flags = 0;
320 xdrp->xp_buf_size = size;
321 xdrp->xp_rcl = cl;
322 xdrp->xp_reply_chunk_len = 0;
323 xdrp->xp_reply_chunk_len_alt = 0;
324
325 if (op == XDR_ENCODE && cl != NULL) {
326 /* Find last element in chunk list and set xp_rcl_next */
327 for (cle = cl; cle->c_next != NULL; cle = cle->c_next)
328 continue;
329
330 xdrp->xp_rcl_next = &(cle->c_next);
331 } else {
332 xdrp->xp_rcl_next = &(xdrp->xp_rcl);
333 }
334
335 xdrp->xp_wcl = NULL;
336
337 xdrp->xp_conn = conn;
338 if (xdrp->xp_min_chunk != 0)
339 xdrp->xp_flags |= XDR_RDMA_CHUNK;
340 }
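/*
 * A minimal usage sketch (illustrative only; the buffer, chunk list and
 * connection come from the RPC/RDMA transport, and the variable names
 * below are hypothetical):
 *
 *	XDR xdrs;
 *
 *	xdrrdma_create(&xdrs, recv_buf, recv_len, min_chunk,
 *	    read_clist, XDR_DECODE, conn);
 *	... decode with the generic xdr_*() routines ...
 *	xdrrdma_destroy(&xdrs);
 */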
341
342 /* ARGSUSED */
343 void
344 xdrrdma_destroy(XDR * xdrs)
345 {
346 xrdma_private_t *xdrp = (xrdma_private_t *)(xdrs->x_private);
347
348 if (xdrp == NULL)
349 return;
350
351 if (xdrp->xp_wcl) {
352 if (xdrp->xp_flags & XDR_RDMA_WLIST_REG) {
353 (void) clist_deregister(xdrp->xp_conn, xdrp->xp_wcl);
354 rdma_buf_free(xdrp->xp_conn,
355 &xdrp->xp_wcl->rb_longbuf);
356 }
357 clist_free(xdrp->xp_wcl);
358 }
359
360 if (xdrp->xp_rcl) {
361 if (xdrp->xp_flags & XDR_RDMA_RLIST_REG) {
362 (void) clist_deregister(xdrp->xp_conn, xdrp->xp_rcl);
363 rdma_buf_free(xdrp->xp_conn,
364 &xdrp->xp_rcl->rb_longbuf);
365 }
366 clist_free(xdrp->xp_rcl);
367 }
368
369 if (xdrp->xp_rcl_xdr)
370 xdrrdma_free_xdr_chunks(xdrp->xp_conn, xdrp->xp_rcl_xdr);
371
372 (void) kmem_free(xdrs->x_private, sizeof (xrdma_private_t));
373 xdrs->x_private = NULL;
374 }
375
376 static bool_t
377 xdrrdma_getint32(XDR *xdrs, int32_t *int32p)
378 {
379 xrdma_private_t *xdrp = (xrdma_private_t *)(xdrs->x_private);
380 int chunked = 0;
381
382 if ((xdrs->x_handy -= (int)sizeof (int32_t)) < 0) {
383 /*
384 * check if rest of the rpc message is in a chunk
385 */
386 if (!xdrrdma_read_a_chunk(xdrs, &xdrp->xp_conn)) {
387 return (FALSE);
388 }
389 chunked = 1;
390 }
391
392 /* LINTED pointer alignment */
393 *int32p = (int32_t)ntohl((uint32_t)(*((int32_t *)(xdrp->xp_offp))));
394
395 DTRACE_PROBE1(krpc__i__xdrrdma_getint32, int32_t, *int32p);
396
397 xdrp->xp_offp += sizeof (int32_t);
398
399 if (chunked)
400 xdrs->x_handy -= (int)sizeof (int32_t);
401
402 if (xdrp->xp_off != 0) {
403 xdrp->xp_off += sizeof (int32_t);
404 }
405
406 return (TRUE);
407 }
408
409 static bool_t
410 xdrrdma_putint32(XDR *xdrs, int32_t *int32p)
411 {
412 xrdma_private_t *xdrp = (xrdma_private_t *)(xdrs->x_private);
413
414 if ((xdrs->x_handy -= (int)sizeof (int32_t)) < 0)
415 return (FALSE);
416
417 /* LINTED pointer alignment */
418 *(int32_t *)xdrp->xp_offp = (int32_t)htonl((uint32_t)(*int32p));
419 xdrp->xp_offp += sizeof (int32_t);
420
421 return (TRUE);
422 }
423
424 /*
425 * DECODE bytes from XDR stream for rdma.
426 * If the XDR stream contains a read chunk list,
427 * it will go through xdrrdma_getrdmablk instead.
428 */
429 static bool_t
430 xdrrdma_getbytes(XDR *xdrs, caddr_t addr, int len)
431 {
432 xrdma_private_t *xdrp = (xrdma_private_t *)(xdrs->x_private);
433 struct clist *cle = *(xdrp->xp_rcl_next);
434 struct clist *cls = *(xdrp->xp_rcl_next);
435 struct clist cl;
436 bool_t retval = TRUE;
437 uint32_t total_len = len;
438 uint32_t cur_offset = 0;
439 uint32_t total_segments = 0;
440 uint32_t actual_segments = 0;
441 uint32_t status = RDMA_SUCCESS;
442 uint32_t alen = 0;
443 uint32_t xpoff;
444
445 while (cle) {
446 total_segments++;
447 cle = cle->c_next;
448 }
449
450 cle = *(xdrp->xp_rcl_next);
451
452 if (xdrp->xp_off) {
453 xpoff = xdrp->xp_off;
454 } else {
455 xpoff = (xdrp->xp_offp - xdrs->x_base);
456 }
457
458 /*
459 * If there was a chunk at the current offset, then set up a read
460 * chunk list which records the destination address and length
461 * and will RDMA READ the data in later.
462 */
463
464 if (cle != NULL && cle->c_xdroff == xpoff) {
465 for (actual_segments = 0;
466 actual_segments < total_segments; actual_segments++) {
467
468 if (total_len <= 0)
469 break;
470
471 if (status != RDMA_SUCCESS)
472 goto out;
473
474 cle->u.c_daddr = (uint64)(uintptr_t)addr + cur_offset;
475 alen = 0;
476 if (cle->c_len > total_len) {
477 alen = cle->c_len;
478 cle->c_len = total_len;
479 }
480 if (!alen)
481 xdrp->xp_rcl_next = &cle->c_next;
482
483 cur_offset += cle->c_len;
484 total_len -= cle->c_len;
485
486 if ((total_segments - actual_segments - 1) == 0 &&
487 total_len > 0) {
488 DTRACE_PROBE(
489 krpc__e__xdrrdma_getbytes_chunktooshort);
490 retval = FALSE;
491 }
492
493 if ((total_segments - actual_segments - 1) > 0 &&
494 total_len == 0) {
495 DTRACE_PROBE2(krpc__e__xdrrdma_getbytes_toobig,
496 int, total_segments, int, actual_segments);
497 }
498
499 /*
500 * RDMA READ the chunk data from the remote end.
501 * First prep the destination buffer by registering
502 * it, then RDMA READ the chunk data. Since we are
503 * doing streaming memory, sync the destination
504 * buffer to CPU and deregister the buffer.
505 */
506 if (xdrp->xp_conn == NULL) {
507 return (FALSE);
508 }
509 cl = *cle;
510 cl.c_next = NULL;
511 status = clist_register(xdrp->xp_conn, &cl,
512 CLIST_REG_DST);
513 if (status != RDMA_SUCCESS) {
514 retval = FALSE;
515 /*
516 * Deregister the previous chunks
517 * before return
518 */
519 goto out;
520 }
521
522 cle->c_dmemhandle = cl.c_dmemhandle;
523 cle->c_dsynchandle = cl.c_dsynchandle;
524
525 /*
526 * Now read the chunk in
527 */
528 if ((total_segments - actual_segments - 1) == 0 ||
529 total_len == 0) {
530 status = RDMA_READ(xdrp->xp_conn, &cl, WAIT);
531 } else {
532 status = RDMA_READ(xdrp->xp_conn, &cl, NOWAIT);
533 }
534 if (status != RDMA_SUCCESS) {
535 DTRACE_PROBE1(
536 krpc__i__xdrrdma_getblk_readfailed,
537 int, status);
538 retval = FALSE;
539 }
540
541 cle = cle->c_next;
542
543 }
544
545 /*
546 * sync the memory for cpu
547 */
548 cl = *cls;
549 cl.c_next = NULL;
550 cl.c_len = cur_offset;
551 if (clist_syncmem(
552 xdrp->xp_conn, &cl, CLIST_REG_DST) != RDMA_SUCCESS) {
553 retval = FALSE;
554 }
555 out:
556
557 /*
558 * Deregister the chunks
559 */
560 cle = cls;
561 while (actual_segments != 0) {
562 cl = *cle;
563 cl.c_next = NULL;
564
565 cl.c_regtype = CLIST_REG_DST;
566 (void) clist_deregister(xdrp->xp_conn, &cl);
567
568 cle = cle->c_next;
569 actual_segments--;
570 }
571
572 if (alen) {
573 cle = *(xdrp->xp_rcl_next);
574 cle->w.c_saddr =
575 (uint64)(uintptr_t)cle->w.c_saddr + cle->c_len;
576 cle->c_len = alen - cle->c_len;
577 }
578
579 return (retval);
580 }
581
582 if ((xdrs->x_handy -= len) < 0)
583 return (FALSE);
584
585 bcopy(xdrp->xp_offp, addr, len);
586
587 xdrp->xp_offp += len;
588
589 if (xdrp->xp_off != 0)
590 xdrp->xp_off += len;
591
592 return (TRUE);
593 }
594
595 /*
596 * ENCODE some bytes into an XDR stream. xp_min_chunk = 0 means the stream of
597 * bytes contains no chunks to separate out; if the bytes do not fit in
598 * the supplied buffer, grow the buffer and free the old buffer.
599 */
600 static bool_t
601 xdrrdma_putbytes(XDR *xdrs, caddr_t addr, int len)
602 {
603 xrdma_private_t *xdrp = (xrdma_private_t *)(xdrs->x_private);
604 /*
605 * Is this stream accepting chunks?
606 * If so, does either of the following two conditions hold?
607 * - the length of bytes to encode is at least the min chunk size?
608 * - the remaining space in this stream is shorter than the length of
609 * the bytes to encode?
610 *
611 * If either condition holds, then create a chunk for this encoding
612 * and save the addresses, etc.
613 */
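/*
 * For illustration (hypothetical sizes): with xp_min_chunk set to
 * 1 KB, a 32 KB payload handed to this routine is not copied into
 * the send buffer; instead a clist entry is recorded below with
 * c_xdroff = current offset, c_len = 32768 and c_saddr = addr, and
 * the data is moved separately over RDMA.
 */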
614 if (xdrp->xp_flags & XDR_RDMA_CHUNK &&
615 ((xdrp->xp_min_chunk != 0 &&
616 len >= xdrp->xp_min_chunk) ||
617 (xdrs->x_handy - len < 0))) {
618 struct clist *cle;
619 int offset = xdrp->xp_offp - xdrs->x_base;
620
621 cle = clist_alloc();
622 cle->c_xdroff = offset;
623 cle->c_len = len;
624 cle->w.c_saddr = (uint64)(uintptr_t)addr;
625 cle->c_next = NULL;
626
627 *(xdrp->xp_rcl_next) = cle;
628 xdrp->xp_rcl_next = &(cle->c_next);
629
630 return (TRUE);
631 }
632 /* Is there enough space to encode what is left? */
633 if ((xdrs->x_handy -= len) < 0) {
634 return (FALSE);
635 }
636 bcopy(addr, xdrp->xp_offp, len);
637 xdrp->xp_offp += len;
638
639 return (TRUE);
640 }
641
642 uint_t
643 xdrrdma_getpos(XDR *xdrs)
644 {
645 xrdma_private_t *xdrp = (xrdma_private_t *)(xdrs->x_private);
646
647 return ((uint_t)((uintptr_t)xdrp->xp_offp - (uintptr_t)xdrs->x_base));
648 }
649
650 bool_t
651 xdrrdma_setpos(XDR *xdrs, uint_t pos)
652 {
653 xrdma_private_t *xdrp = (xrdma_private_t *)(xdrs->x_private);
654
655 caddr_t newaddr = xdrs->x_base + pos;
656 caddr_t lastaddr = xdrp->xp_offp + xdrs->x_handy;
657 ptrdiff_t diff;
658
659 if (newaddr > lastaddr)
660 return (FALSE);
661
662 xdrp->xp_offp = newaddr;
663 diff = lastaddr - newaddr;
664 xdrs->x_handy = (int)diff;
665
666 return (TRUE);
667 }
668
669 /* ARGSUSED */
670 static rpc_inline_t *
671 xdrrdma_inline(XDR *xdrs, int len)
672 {
673 rpc_inline_t *buf = NULL;
674 xrdma_private_t *xdrp = (xrdma_private_t *)(xdrs->x_private);
675 struct clist *cle = *(xdrp->xp_rcl_next);
676
677 if (xdrs->x_op == XDR_DECODE) {
678 /*
679 * Since chunks aren't in-line, check to see whether there is
680 * a chunk in the inline range.
681 */
682 if (cle != NULL &&
683 cle->c_xdroff <= (xdrp->xp_offp - xdrs->x_base + len))
684 return (NULL);
685 }
686
687 /* LINTED pointer alignment */
688 buf = (rpc_inline_t *)xdrp->xp_offp;
689 if (!IS_P2ALIGNED(buf, sizeof (int32_t)))
690 return (NULL);
691
692 if ((xdrs->x_handy < len) || (xdrp->xp_min_chunk != 0 &&
693 len >= xdrp->xp_min_chunk)) {
694 return (NULL);
695 } else {
696 xdrs->x_handy -= len;
697 xdrp->xp_offp += len;
698 return (buf);
699 }
700 }
701
702 static bool_t
703 xdrrdma_control(XDR *xdrs, int request, void *info)
704 {
705 int32_t *int32p;
706 int len, i;
707 uint_t in_flags;
708 xrdma_private_t *xdrp = (xrdma_private_t *)(xdrs->x_private);
709 rdma_chunkinfo_t *rcip = NULL;
710 rdma_wlist_conn_info_t *rwcip = NULL;
711 rdma_chunkinfo_lengths_t *rcilp = NULL;
712 struct uio *uiop;
713 struct clist *rwl = NULL, *first = NULL;
714 struct clist *prev = NULL;
715
716 switch (request) {
717 case XDR_PEEK:
718 /*
719 * Return the next 4 byte unit in the XDR stream.
720 */
721 if (xdrs->x_handy < sizeof (int32_t))
722 return (FALSE);
723
724 int32p = (int32_t *)info;
725 *int32p = (int32_t)ntohl((uint32_t)
726 (*((int32_t *)(xdrp->xp_offp))));
727
728 return (TRUE);
729
730 case XDR_SKIPBYTES:
731 /*
732 * Skip the next N bytes in the XDR stream.
733 */
734 int32p = (int32_t *)info;
735 len = RNDUP((int)(*int32p));
736 if ((xdrs->x_handy -= len) < 0)
737 return (FALSE);
738 xdrp->xp_offp += len;
739
740 return (TRUE);
741
742 case XDR_RDMA_SET_FLAGS:
743 /*
744 * Set the flags provided in *info into xp_flags to control the
745 * rdma xdr stream.
746 */
747 int32p = (int32_t *)info;
748 in_flags = (uint_t)(*int32p);
749
750 xdrp->xp_flags |= in_flags;
751 return (TRUE);
752
753 case XDR_RDMA_GET_FLAGS:
754 /*
755 * Get the flags from xp_flags and return them through *info.
756 */
757 int32p = (int32_t *)info;
758
759 *int32p = (int32_t)xdrp->xp_flags;
760 return (TRUE);
761
762 case XDR_RDMA_GET_CHUNK_LEN:
763 rcilp = (rdma_chunkinfo_lengths_t *)info;
764 rcilp->rcil_len = xdrp->xp_reply_chunk_len;
765 rcilp->rcil_len_alt = xdrp->xp_reply_chunk_len_alt;
766
767 return (TRUE);
768
769 case XDR_RDMA_ADD_CHUNK:
770 /*
771 * Store wlist information
772 */
773
774 rcip = (rdma_chunkinfo_t *)info;
775
776 DTRACE_PROBE2(krpc__i__xdrrdma__control__add__chunk,
777 rci_type_t, rcip->rci_type, uint32, rcip->rci_len);
778 switch (rcip->rci_type) {
779 case RCI_WRITE_UIO_CHUNK:
780 xdrp->xp_reply_chunk_len_alt += rcip->rci_len;
781
782 if ((rcip->rci_len + XDR_RDMA_BUF_OVERHEAD) <
783 xdrp->xp_min_chunk) {
784 xdrp->xp_wcl = NULL;
785 *(rcip->rci_clpp) = NULL;
786 return (TRUE);
787 }
788 uiop = rcip->rci_a.rci_uiop;
789
790 for (i = 0; i < uiop->uio_iovcnt; i++) {
791 rwl = clist_alloc();
792 if (first == NULL)
793 first = rwl;
794 rwl->c_len = uiop->uio_iov[i].iov_len;
795 rwl->u.c_daddr =
796 (uint64)(uintptr_t)
797 (uiop->uio_iov[i].iov_base);
798 /*
799 * if userspace address, put adspace ptr in
800 * clist. If not, then do nothing since it's
801 * already set to NULL (from kmem_zalloc)
802 */
803 if (uiop->uio_segflg == UIO_USERSPACE) {
804 rwl->c_adspc = ttoproc(curthread)->p_as;
805 }
806
807 if (prev == NULL)
808 prev = rwl;
809 else {
810 prev->c_next = rwl;
811 prev = rwl;
812 }
813 }
814
815 rwl->c_next = NULL;
816 xdrp->xp_wcl = first;
817 *(rcip->rci_clpp) = first;
818
819 break;
820
821 case RCI_WRITE_ADDR_CHUNK:
822 rwl = clist_alloc();
823
824 rwl->c_len = rcip->rci_len;
825 rwl->u.c_daddr3 = rcip->rci_a.rci_addr;
826 rwl->c_next = NULL;
827 xdrp->xp_reply_chunk_len_alt += rcip->rci_len;
828
829 xdrp->xp_wcl = rwl;
830 *(rcip->rci_clpp) = rwl;
831
832 break;
833
834 case RCI_REPLY_CHUNK:
835 xdrp->xp_reply_chunk_len += rcip->rci_len;
836 break;
837 }
838 return (TRUE);
839
840 case XDR_RDMA_GET_WLIST:
841 *((struct clist **)info) = xdrp->xp_wcl;
842 return (TRUE);
843
844 case XDR_RDMA_SET_WLIST:
845 xdrp->xp_wcl = (struct clist *)info;
846 return (TRUE);
847
848 case XDR_RDMA_GET_RLIST:
849 *((struct clist **)info) = xdrp->xp_rcl;
850 return (TRUE);
851
852 case XDR_RDMA_GET_WCINFO:
853 rwcip = (rdma_wlist_conn_info_t *)info;
854
855 rwcip->rwci_wlist = xdrp->xp_wcl;
856 rwcip->rwci_conn = xdrp->xp_conn;
857
858 return (TRUE);
859
860 default:
861 return (FALSE);
862 }
863 }
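/*
 * Illustrative control call (sketch only, hypothetical caller): the
 * transport can query the reply chunk lengths accumulated above through
 * the generic XDR_CONTROL() macro, which lands in xdrrdma_control():
 *
 *	rdma_chunkinfo_lengths_t rcil;
 *
 *	(void) XDR_CONTROL(xdrs, XDR_RDMA_GET_CHUNK_LEN, &rcil);
 */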
864
865 bool_t xdr_do_clist(XDR *, clist **);
866
867 /*
868 * Not all fields in struct clist are interesting to the RPC over RDMA
869 * protocol. Only XDR the interesting fields.
870 */
871 bool_t
872 xdr_clist(XDR *xdrs, clist *objp)
873 {
874 if (!xdr_uint32(xdrs, &objp->c_xdroff))
875 return (FALSE);
876 if (!xdr_uint32(xdrs, &objp->c_smemhandle.mrc_rmr))
877 return (FALSE);
878 if (!xdr_uint32(xdrs, &objp->c_len))
879 return (FALSE);
880 if (!xdr_uint64(xdrs, &objp->w.c_saddr))
881 return (FALSE);
882 if (!xdr_do_clist(xdrs, &objp->c_next))
883 return (FALSE);
884 return (TRUE);
885 }
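/*
 * On the wire the fields above form one read chunk entry: the XDR
 * position (c_xdroff), the memory handle (mrc_rmr), the segment length
 * (c_len) and the 64-bit remote address (c_saddr), followed recursively
 * by the next entry via xdr_do_clist().
 */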
886
887 /*
888 * The following two functions are forms of xdr_pointer()
889 * and xdr_reference(). Since the generic versions just
890 * kmem_alloc() a new clist, we actually want to use the
891 * rdma_clist kmem_cache.
892 */
893
894 /*
895 * Generate or free a clist structure from the
896 * kmem_cache "rdma_clist"
897 */
898 bool_t
899 xdr_ref_clist(XDR *xdrs, caddr_t *pp)
900 {
901 caddr_t loc = *pp;
902 bool_t stat;
903
904 if (loc == NULL) {
905 switch (xdrs->x_op) {
906 case XDR_FREE:
907 return (TRUE);
908
909 case XDR_DECODE:
910 *pp = loc = (caddr_t)clist_alloc();
911 break;
912
913 case XDR_ENCODE:
914 ASSERT(loc);
915 break;
916 }
917 }
918
919 stat = xdr_clist(xdrs, (struct clist *)loc);
920
921 if (xdrs->x_op == XDR_FREE) {
922 kmem_cache_free(clist_cache, loc);
923 *pp = NULL;
924 }
925 return (stat);
926 }
927
928 /*
929 * XDR a pointer to a possibly recursive clist. This differs
930 * from xdr_reference in that it can serialize/deserialize
931 * trees correctly.
932 *
933 * What is sent is actually a union:
934 *
935 * union object_pointer switch (boolean b) {
936 * case TRUE: object_data data;
937 * case FALSE: void nothing;
938 * }
939 *
940 * > objpp: Pointer to the pointer to the object.
941 *
942 */
943
944 bool_t
945 xdr_do_clist(XDR *xdrs, clist **objpp)
946 {
947 bool_t more_data;
948
949 more_data = (*objpp != NULL);
950 if (!xdr_bool(xdrs, &more_data))
951 return (FALSE);
952 if (!more_data) {
953 *objpp = NULL;
954 return (TRUE);
955 }
956 return (xdr_ref_clist(xdrs, (caddr_t *)objpp));
957 }
958
959 uint_t
960 xdr_getbufsize(XDR *xdrs)
961 {
962 xrdma_private_t *xdrp = (xrdma_private_t *)(xdrs->x_private);
963
964 return ((uint_t)xdrp->xp_buf_size);
965 }
966
967 /* ARGSUSED */
968 bool_t
969 xdr_encode_rlist_svc(XDR *xdrs, clist *rlist)
970 {
971 bool_t vfalse = FALSE;
972
973 ASSERT(rlist == NULL);
974 return (xdr_bool(xdrs, &vfalse));
975 }
976
977 bool_t
978 xdr_encode_wlist(XDR *xdrs, clist *w)
979 {
980 bool_t vfalse = FALSE, vtrue = TRUE;
981 int i;
982 uint_t num_segment = 0;
983 struct clist *cl;
984
985 /* does a wlist exist? */
986 if (w == NULL) {
987 return (xdr_bool(xdrs, &vfalse));
988 }
989 /* Encode N consecutive segments, 1, N, HLOO, ..., HLOO, 0 */
990 if (!xdr_bool(xdrs, &vtrue))
991 return (FALSE);
992
993 for (cl = w; cl != NULL; cl = cl->c_next) {
994 num_segment++;
995 }
996
997 if (!xdr_uint32(xdrs, &num_segment))
998 return (FALSE);
999 for (i = 0; i < num_segment; i++) {
1000
1001 DTRACE_PROBE1(krpc__i__xdr_encode_wlist_len, uint_t, w->c_len);
1002
1003 if (!xdr_uint32(xdrs, &w->c_dmemhandle.mrc_rmr))
1004 return (FALSE);
1005
1006 if (!xdr_uint32(xdrs, &w->c_len))
1007 return (FALSE);
1008
1009 if (!xdr_uint64(xdrs, &w->u.c_daddr))
1010 return (FALSE);
1011
1012 w = w->c_next;
1013 }
1014
1015 if (!xdr_bool(xdrs, &vfalse))
1016 return (FALSE);
1017
1018 return (TRUE);
1019 }
1020
1021
1022 /*
1023 * Conditionally decode an RDMA WRITE chunk list from the XDR stream.
1024 *
1025 * If the next boolean in the XDR stream is false there is no
1026 * RDMA WRITE chunk list present. Otherwise iterate over the
1027 * array and for each entry: allocate a struct clist and decode.
1028 * Pass back an indication via wlist_exists if we have seen a
1029 * RDMA WRITE chunk list.
1030 */
1031 bool_t
1032 xdr_decode_wlist(XDR *xdrs, struct clist **w, bool_t *wlist_exists)
1033 {
1034 struct clist *tmp;
1035 bool_t more = FALSE;
1036 uint32_t seg_array_len;
1037 uint32_t i;
1038
1039 if (!xdr_bool(xdrs, &more))
1040 return (FALSE);
1041
1042 /* is there a wlist? */
1043 if (more == FALSE) {
1044 *wlist_exists = FALSE;
1045 return (TRUE);
1046 }
1047 *wlist_exists = TRUE;
1048
1049 if (!xdr_uint32(xdrs, &seg_array_len))
1050 return (FALSE);
1051
1052 tmp = *w = clist_alloc();
1053 for (i = 0; i < seg_array_len; i++) {
1054
1055 if (!xdr_uint32(xdrs, &tmp->c_dmemhandle.mrc_rmr))
1056 return (FALSE);
1057 if (!xdr_uint32(xdrs, &tmp->c_len))
1058 return (FALSE);
1059
1060 DTRACE_PROBE1(krpc__i__xdr_decode_wlist_len,
1061 uint_t, tmp->c_len);
1062
1063 if (!xdr_uint64(xdrs, &tmp->u.c_daddr))
1064 return (FALSE);
1065 if (i < seg_array_len - 1) {
1066 tmp->c_next = clist_alloc();
1067 tmp = tmp->c_next;
1068 } else {
1069 tmp->c_next = NULL;
1070 }
1071 }
1072
1073 more = FALSE;
1074 if (!xdr_bool(xdrs, &more))
1075 return (FALSE);
1076
1077 return (TRUE);
1078 }
1079
1080 /*
1081 * Server side RDMA WRITE list decode.
1082 * XDR context is memory ops
1083 */
1084 bool_t
1085 xdr_decode_wlist_svc(XDR *xdrs, struct clist **wclp, bool_t *wwl,
1086 uint32_t *total_length, CONN *conn)
1087 {
1088 struct clist *first, *ncl;
1089 char *memp;
1090 uint32_t num_wclist;
1091 uint32_t wcl_length = 0;
1092 uint32_t i;
1093 bool_t more = FALSE;
1094
1095 *wclp = NULL;
1096 *wwl = FALSE;
1097 *total_length = 0;
1098
1099 if (!xdr_bool(xdrs, &more)) {
1100 return (FALSE);
1101 }
1102
1103 if (more == FALSE) {
1104 return (TRUE);
1105 }
1106
1107 *wwl = TRUE;
1108
1109 if (!xdr_uint32(xdrs, &num_wclist)) {
1110 DTRACE_PROBE(krpc__e__xdrrdma__wlistsvc__listlength);
1111 return (FALSE);
1112 }
1113
1114 first = ncl = clist_alloc();
1115
1116 for (i = 0; i < num_wclist; i++) {
1117
1118 if (!xdr_uint32(xdrs, &ncl->c_dmemhandle.mrc_rmr))
1119 goto err_out;
1120 if (!xdr_uint32(xdrs, &ncl->c_len))
1121 goto err_out;
1122 if (!xdr_uint64(xdrs, &ncl->u.c_daddr))
1123 goto err_out;
1124
1125 if (ncl->c_len > MAX_SVC_XFER_SIZE) {
1126 DTRACE_PROBE(
1127 krpc__e__xdrrdma__wlistsvc__chunklist_toobig);
1128 ncl->c_len = MAX_SVC_XFER_SIZE;
1129 }
1130
1131 DTRACE_PROBE1(krpc__i__xdr_decode_wlist_svc_len,
1132 uint_t, ncl->c_len);
1133
1134 wcl_length += ncl->c_len;
1135
1136 if (i < num_wclist - 1) {
1137 ncl->c_next = clist_alloc();
1138 ncl = ncl->c_next;
1139 }
1140 }
1141
1142 if (!xdr_bool(xdrs, &more))
1143 goto err_out;
1144
1145 first->rb_longbuf.type = RDMA_LONG_BUFFER;
1146 first->rb_longbuf.len =
1147 wcl_length > WCL_BUF_LEN ? wcl_length : WCL_BUF_LEN;
1148
1149 if (rdma_buf_alloc(conn, &first->rb_longbuf)) {
1150 clist_free(first);
1151 return (FALSE);
1152 }
1153
1154 memp = first->rb_longbuf.addr;
1155
1156 ncl = first;
1157 for (i = 0; i < num_wclist; i++) {
1158 ncl->w.c_saddr3 = (caddr_t)memp;
1159 memp += ncl->c_len;
1160 ncl = ncl->c_next;
1161 }
1162
1163 *wclp = first;
1164 *total_length = wcl_length;
1165 return (TRUE);
1166
1167 err_out:
1168 clist_free(first);
1169 return (FALSE);
1170 }
1171
1172 /*
1173 * XDR decode the long reply write chunk.
1174 */
1175 bool_t
1176 xdr_decode_reply_wchunk(XDR *xdrs, struct clist **clist)
1177 {
1178 bool_t have_rchunk = FALSE;
1179 struct clist *first = NULL, *ncl = NULL;
1180 uint32_t num_wclist;
1181 uint32_t i;
1182
1183 if (!xdr_bool(xdrs, &have_rchunk))
1184 return (FALSE);
1185
1186 if (have_rchunk == FALSE)
1187 return (TRUE);
1188
1189 if (!xdr_uint32(xdrs, &num_wclist)) {
1190 DTRACE_PROBE(krpc__e__xdrrdma__replywchunk__listlength);
1191 return (FALSE);
1192 }
1193
1194 if (num_wclist == 0) {
1195 return (FALSE);
1196 }
1197
1198 first = ncl = clist_alloc();
1199
1200 for (i = 0; i < num_wclist; i++) {
1201
1202 if (i > 0) {
1203 ncl->c_next = clist_alloc();
1204 ncl = ncl->c_next;
1205 }
1206
1207 if (!xdr_uint32(xdrs, &ncl->c_dmemhandle.mrc_rmr))
1208 goto err_out;
1209 if (!xdr_uint32(xdrs, &ncl->c_len))
1210 goto err_out;
1211 if (!xdr_uint64(xdrs, &ncl->u.c_daddr))
1212 goto err_out;
1213
1214 if (ncl->c_len > MAX_SVC_XFER_SIZE) {
1215 DTRACE_PROBE(
1216 krpc__e__xdrrdma__replywchunk__chunklist_toobig);
1217 ncl->c_len = MAX_SVC_XFER_SIZE;
1218 }
1219 if (!(ncl->c_dmemhandle.mrc_rmr &&
1220 (ncl->c_len > 0) && ncl->u.c_daddr))
1221 DTRACE_PROBE(
1222 krpc__e__xdrrdma__replywchunk__invalid_segaddr);
1223
1224 DTRACE_PROBE1(krpc__i__xdr_decode_reply_wchunk_c_len,
1225 uint32_t, ncl->c_len);
1226
1227 }
1228 *clist = first;
1229 return (TRUE);
1230
1231 err_out:
1232 clist_free(first);
1233 return (FALSE);
1234 }
1235
1236
1237 bool_t
1238 xdr_encode_reply_wchunk(XDR *xdrs,
1239 struct clist *cl_longreply, uint32_t seg_array_len)
1240 {
1241 int i;
1242 bool_t long_reply_exists = TRUE;
1243 uint32_t length;
1244 uint64 offset;
1245
1246 if (seg_array_len > 0) {
1247 if (!xdr_bool(xdrs, &long_reply_exists))
1248 return (FALSE);
1249 if (!xdr_uint32(xdrs, &seg_array_len))
1250 return (FALSE);
1251
1252 for (i = 0; i < seg_array_len; i++) {
1253 if (!cl_longreply)
1254 return (FALSE);
1255 length = cl_longreply->c_len;
1256 offset = (uint64) cl_longreply->u.c_daddr;
1257
1258 DTRACE_PROBE1(
1259 krpc__i__xdr_encode_reply_wchunk_c_len,
1260 uint32_t, length);
1261
1262 if (!xdr_uint32(xdrs,
1263 &cl_longreply->c_dmemhandle.mrc_rmr))
1264 return (FALSE);
1265 if (!xdr_uint32(xdrs, &length))
1266 return (FALSE);
1267 if (!xdr_uint64(xdrs, &offset))
1268 return (FALSE);
1269 cl_longreply = cl_longreply->c_next;
1270 }
1271 } else {
1272 long_reply_exists = FALSE;
1273 if (!xdr_bool(xdrs, &long_reply_exists))
1274 return (FALSE);
1275 }
1276 return (TRUE);
1277 }
1278 bool_t
1279 xdrrdma_read_from_client(struct clist *rlist, CONN **conn, uint_t count)
1280 {
1281 struct clist *rdclist;
1282 struct clist cl;
1283 uint_t total_len = 0;
1284 uint32_t status;
1285 bool_t retval = TRUE;
1286
1287 rlist->rb_longbuf.type = RDMA_LONG_BUFFER;
1288 rlist->rb_longbuf.len =
1289 count > RCL_BUF_LEN ? count : RCL_BUF_LEN;
1290
1291 if (rdma_buf_alloc(*conn, &rlist->rb_longbuf)) {
1292 return (FALSE);
1293 }
1294
1295 /*
1296 * The entire buffer is registered with the first chunk.
1297 * Later chunks will use the same registered memory handle.
1298 */
1299
1300 cl = *rlist;
1301 cl.c_next = NULL;
1302 if (clist_register(*conn, &cl, CLIST_REG_DST) != RDMA_SUCCESS) {
1303 rdma_buf_free(*conn, &rlist->rb_longbuf);
1304 DTRACE_PROBE(
1305 krpc__e__xdrrdma__readfromclient__clist__reg);
1306 return (FALSE);
1307 }
1308
1309 rlist->c_regtype = CLIST_REG_DST;
1310 rlist->c_dmemhandle = cl.c_dmemhandle;
1311 rlist->c_dsynchandle = cl.c_dsynchandle;
1312
1313 for (rdclist = rlist;
1314 rdclist != NULL; rdclist = rdclist->c_next) {
1315 total_len += rdclist->c_len;
1316 #if (defined(OBJ32)||defined(DEBUG32))
1317 rdclist->u.c_daddr3 =
1318 (caddr_t)((char *)rlist->rb_longbuf.addr +
1319 (uint32) rdclist->u.c_daddr3);
1320 #else
1321 rdclist->u.c_daddr3 =
1322 (caddr_t)((char *)rlist->rb_longbuf.addr +
1323 (uint64) rdclist->u.c_daddr);
1324
1325 #endif
1326 cl = (*rdclist);
1327 cl.c_next = NULL;
1328
1329 /*
1330 * Use the same memory handle for all the chunks
1331 */
1332 cl.c_dmemhandle = rlist->c_dmemhandle;
1333 cl.c_dsynchandle = rlist->c_dsynchandle;
1334
1335
1336 DTRACE_PROBE1(krpc__i__xdrrdma__readfromclient__buflen,
1337 int, rdclist->c_len);
1338
1339 /*
1340 * Now read the chunk in
1341 */
1342 if (rdclist->c_next == NULL) {
1343 status = RDMA_READ(*conn, &cl, WAIT);
1344 } else {
1345 status = RDMA_READ(*conn, &cl, NOWAIT);
1346 }
1347 if (status != RDMA_SUCCESS) {
1348 DTRACE_PROBE(
1349 krpc__e__xdrrdma__readfromclient__readfailed);
1350 rdma_buf_free(*conn, &rlist->rb_longbuf);
1351 return (FALSE);
1352 }
1353 }
1354
1355 cl = (*rlist);
1356 cl.c_next = NULL;
1357 cl.c_len = total_len;
1358 if (clist_syncmem(*conn, &cl, CLIST_REG_DST) != RDMA_SUCCESS) {
1359 retval = FALSE;
1360 }
1361 return (retval);
1362 }
1363
1364 bool_t
1365 xdrrdma_free_clist(CONN *conn, struct clist *clp)
1366 {
1367 rdma_buf_free(conn, &clp->rb_longbuf);
1368 clist_free(clp);
1369 return (TRUE);
1370 }
1371
1372 bool_t
1373 xdrrdma_send_read_data(XDR *xdrs, uint_t data_len, struct clist *wcl)
1374 {
1375 int status;
1376 xrdma_private_t *xdrp = (xrdma_private_t *)(xdrs->x_private);
1377 struct xdr_ops *xops = xdrrdma_xops();
1378 struct clist *tcl, *wrcl, *cl;
1379 struct clist fcl;
1380 int rndup_present, rnduplen;
1381
1382 rndup_present = 0;
1383 wrcl = NULL;
1384
1385 /* caller is doing a sizeof */
1386 if (xdrs->x_ops != &xdrrdma_ops || xdrs->x_ops == xops)
1387 return (TRUE);
1388
1389 /* copy of the first chunk */
1390 fcl = *wcl;
1391 fcl.c_next = NULL;
1392
1393 /*
1394 * The entire buffer is registered with the first chunk.
1395 * Later chunks will use the same registered memory handle.
1396 */
1397
1398 status = clist_register(xdrp->xp_conn, &fcl, CLIST_REG_SOURCE);
1399 if (status != RDMA_SUCCESS) {
1400 return (FALSE);
1401 }
1402
1403 wcl->c_regtype = CLIST_REG_SOURCE;
1404 wcl->c_smemhandle = fcl.c_smemhandle;
1405 wcl->c_ssynchandle = fcl.c_ssynchandle;
1406
1407 /*
1408 * Only transfer the read data ignoring any trailing
1409 * roundup chunks. A bit of work, but it saves an
1410 * unnecessary extra RDMA_WRITE containing only
1411 * roundup bytes.
1412 */
1413
1414 rnduplen = clist_len(wcl) - data_len;
1415
1416 if (rnduplen) {
1417
1418 tcl = wcl->c_next;
1419
1420 /*
1421 * Check if there is a trailing roundup chunk
1422 */
1423 while (tcl) {
1424 if ((tcl->c_next == NULL) && (tcl->c_len == rnduplen)) {
1425 rndup_present = 1;
1426 break;
1427 }
1428 tcl = tcl->c_next;
1429 }
1430
1431 /*
1432 * Make a copy chunk list skipping the last chunk
1433 */
1434 if (rndup_present) {
1435 cl = wcl;
1436 tcl = NULL;
1437 while (cl) {
1438 if (tcl == NULL) {
1439 tcl = clist_alloc();
1440 wrcl = tcl;
1441 } else {
1442 tcl->c_next = clist_alloc();
1443 tcl = tcl->c_next;
1444 }
1445
1446 *tcl = *cl;
1447 cl = cl->c_next;
1448 /* last chunk */
1449 if (cl->c_next == NULL)
1450 break;
1451 }
1452 tcl->c_next = NULL;
1453 }
1454 }
1455
1456 if (wrcl == NULL) {
1457 /* No roundup chunks */
1458 wrcl = wcl;
1459 }
1460
1461 /*
1462 * Set the registered memory handles for the
1463 * rest of the chunks same as the first chunk.
1464 */
1465 tcl = wrcl->c_next;
1466 while (tcl) {
1467 tcl->c_smemhandle = fcl.c_smemhandle;
1468 tcl->c_ssynchandle = fcl.c_ssynchandle;
1469 tcl = tcl->c_next;
1470 }
1471
1472 /*
1473 * Sync the total len beginning from the first chunk.
1474 */
1475 fcl.c_len = clist_len(wrcl);
1476 status = clist_syncmem(xdrp->xp_conn, &fcl, CLIST_REG_SOURCE);
1477 if (status != RDMA_SUCCESS) {
1478 return (FALSE);
1479 }
1480
1481 status = RDMA_WRITE(xdrp->xp_conn, wrcl, WAIT);
1482
1483 if (rndup_present)
1484 clist_free(wrcl);
1485
1486 if (status != RDMA_SUCCESS) {
1487 return (FALSE);
1488 }
1489
1490 return (TRUE);
1491 }
1492
1493
1494 /*
1495 * Reads one chunk at a time
1496 */
1497
1498 static bool_t
1499 xdrrdma_read_a_chunk(XDR *xdrs, CONN **conn)
1500 {
1501 int status;
1502 int32_t len = 0;
1503 xrdma_private_t *xdrp = (xrdma_private_t *)(xdrs->x_private);
1504 struct clist *cle = *(xdrp->xp_rcl_next);
1505 struct clist *rclp = xdrp->xp_rcl;
1506 struct clist *clp;
1507
1508 /*
1509 * len is used later to determine the xdr offset into
1510 * the chunk, factoring in any 4-byte XDR roundup
1511 * (see the read chunk example at the top of this file).
1512 */
1513 while (rclp != cle) {
1514 len += rclp->c_len;
1515 rclp = rclp->c_next;
1516 }
1517
1518 len = RNDUP(len) - len;
1519
1520 ASSERT(xdrs->x_handy <= 0);
1521
1522 /*
1523 * If this is the first chunk to contain the RPC
1524 * message set xp_off to the xdr offset of the
1525 * inline message.
1526 */
1527 if (xdrp->xp_off == 0)
1528 xdrp->xp_off = (xdrp->xp_offp - xdrs->x_base);
1529
1530 if (cle == NULL || (cle->c_xdroff != xdrp->xp_off))
1531 return (FALSE);
1532
1533 /*
1534 * Make a copy of the chunk to read from client.
1535 * Chunks are read on demand, so read only one
1536 * for now.
1537 */
1538
1539 rclp = clist_alloc();
1540 *rclp = *cle;
1541 rclp->c_next = NULL;
1542
1543 xdrp->xp_rcl_next = &cle->c_next;
1544
1545 /*
1546 * If there is a roundup present, then skip those
1547 * bytes when reading.
1548 */
1549 if (len) {
1550 rclp->w.c_saddr =
1551 (uint64)(uintptr_t)rclp->w.c_saddr + len;
1552 rclp->c_len = rclp->c_len - len;
1553 }
1554
1555 status = xdrrdma_read_from_client(rclp, conn, rclp->c_len);
1556
1557 if (status == FALSE) {
1558 clist_free(rclp);
1559 return (status);
1560 }
1561
1562 xdrp->xp_offp = rclp->rb_longbuf.addr;
1563 xdrs->x_base = xdrp->xp_offp;
1564 xdrs->x_handy = rclp->c_len;
1565
1566 /*
1567 * This copy of read chunks containing the XDR
1568 * message is freed later in xdrrdma_destroy()
1569 */
1570
1571 if (xdrp->xp_rcl_xdr) {
1572 /* Add the chunk to end of the list */
1573 clp = xdrp->xp_rcl_xdr;
1574 while (clp->c_next != NULL)
1575 clp = clp->c_next;
1576 clp->c_next = rclp;
1577 } else {
1578 xdrp->xp_rcl_xdr = rclp;
1579 }
1580 return (TRUE);
1581 }
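/*
 * Note: this path is what makes the "Linux encoding" example at the top
 * of this file decodable -- when the inline buffer is exhausted in
 * xdrrdma_getint32(), the remainder of the RPC message is pulled over
 * one read chunk at a time and decoding continues out of the chunk's
 * long buffer.
 */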
1582
1583 static void
1584 xdrrdma_free_xdr_chunks(CONN *conn, struct clist *xdr_rcl)
1585 {
1586 struct clist *cl;
1587
1588 (void) clist_deregister(conn, xdr_rcl);
1589
1590 /*
1591 * Read chunks containing parts of the XDR message are
1592 * special: in the case of multiple chunks, each has
1593 * its own buffer.
1594 */
1595
1596 cl = xdr_rcl;
1597 while (cl) {
1598 rdma_buf_free(conn, &cl->rb_longbuf);
1599 cl = cl->c_next;
1600 }
1601
1602 clist_free(xdr_rcl);
1603 }
1604