1*d6b92ffaSHans Petter Selasky /*
2*d6b92ffaSHans Petter Selasky * Copyright (c) 2008-2014 Intel Corporation. All rights reserved.
3*d6b92ffaSHans Petter Selasky *
4*d6b92ffaSHans Petter Selasky * This software is available to you under a choice of one of two
5*d6b92ffaSHans Petter Selasky * licenses. You may choose to be licensed under the terms of the GNU
6*d6b92ffaSHans Petter Selasky * General Public License (GPL) Version 2, available from the file
7*d6b92ffaSHans Petter Selasky * COPYING in the main directory of this source tree, or the
8*d6b92ffaSHans Petter Selasky * OpenIB.org BSD license below:
9*d6b92ffaSHans Petter Selasky *
10*d6b92ffaSHans Petter Selasky * Redistribution and use in source and binary forms, with or
11*d6b92ffaSHans Petter Selasky * without modification, are permitted provided that the following
12*d6b92ffaSHans Petter Selasky * conditions are met:
13*d6b92ffaSHans Petter Selasky *
14*d6b92ffaSHans Petter Selasky * - Redistributions of source code must retain the above
15*d6b92ffaSHans Petter Selasky * copyright notice, this list of conditions and the following
16*d6b92ffaSHans Petter Selasky * disclaimer.
17*d6b92ffaSHans Petter Selasky *
18*d6b92ffaSHans Petter Selasky * - Redistributions in binary form must reproduce the above
19*d6b92ffaSHans Petter Selasky * copyright notice, this list of conditions and the following
20*d6b92ffaSHans Petter Selasky * disclaimer in the documentation and/or other materials
21*d6b92ffaSHans Petter Selasky * provided with the distribution.
22*d6b92ffaSHans Petter Selasky *
23*d6b92ffaSHans Petter Selasky * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24*d6b92ffaSHans Petter Selasky * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25*d6b92ffaSHans Petter Selasky * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26*d6b92ffaSHans Petter Selasky * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27*d6b92ffaSHans Petter Selasky * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28*d6b92ffaSHans Petter Selasky * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29*d6b92ffaSHans Petter Selasky * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30*d6b92ffaSHans Petter Selasky * SOFTWARE.
31*d6b92ffaSHans Petter Selasky *
32*d6b92ffaSHans Petter Selasky */
33*d6b92ffaSHans Petter Selasky #define _GNU_SOURCE
34*d6b92ffaSHans Petter Selasky #include <config.h>
35*d6b92ffaSHans Petter Selasky
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <infiniband/endian.h>
#include <assert.h>
#include <errno.h>
#include <stdarg.h>
#include <netdb.h>
#include <unistd.h>
#include <fcntl.h>
#include <stdio.h>
#include <stddef.h>
#include <string.h>
#include <netinet/tcp.h>
#include <sys/epoll.h>
#include <search.h>
#include <byteswap.h>
51*d6b92ffaSHans Petter Selasky #include <util/compiler.h>
52*d6b92ffaSHans Petter Selasky
53*d6b92ffaSHans Petter Selasky #include <rdma/rdma_cma.h>
54*d6b92ffaSHans Petter Selasky #include <rdma/rdma_verbs.h>
55*d6b92ffaSHans Petter Selasky #include <rdma/rsocket.h>
56*d6b92ffaSHans Petter Selasky #include "cma.h"
57*d6b92ffaSHans Petter Selasky #include "indexer.h"
58*d6b92ffaSHans Petter Selasky
/*
 * Compile-time sizes and limits for rsocket buffers and queue pairs.
 * RS_SNDLOWAT is also the floor that rs_configure() enforces on the
 * wmem_default setting.
 */
#define RS_OLAP_START_SIZE 2048
#define RS_MAX_TRANSFER 65536
#define RS_SNDLOWAT 2048
#define RS_QP_MIN_SIZE 16
#define RS_QP_MAX_SIZE 0xFFFE
#define RS_QP_CTRL_SIZE 4 /* must be power of 2 */
#define RS_CONN_RETRIES 6
#define RS_SGL_SIZE 2
/* Index map in which rs_insert()/rs_remove() register rsockets by index. */
static struct index_map idm;
/* Taken by rs_insert(), rs_remove(), rs_notify_svc() and rs_configure(). */
static pthread_mutex_t mut = PTHREAD_MUTEX_INITIALIZER;
69*d6b92ffaSHans Petter Selasky
struct rsocket;

/* Command codes passed to rs_notify_svc() and carried in rs_svc_msg.cmd. */
enum {
	RS_SVC_NOOP,
	RS_SVC_ADD_DGRAM,
	RS_SVC_REM_DGRAM,
	RS_SVC_ADD_KEEPALIVE,
	RS_SVC_REM_KEEPALIVE,
	RS_SVC_MOD_KEEPALIVE
};

/*
 * Message exchanged with a service thread over its socket pair.
 * rs_notify_svc() sends it with status preset to EINVAL, reads the same
 * structure back, and converts the returned status with rdma_seterrno().
 */
struct rs_svc_msg {
	uint32_t cmd;		/* one of the RS_SVC_* codes */
	uint32_t status;	/* result code filled in by the reply */
	struct rsocket *rs;	/* rsocket the command applies to */
};
86*d6b92ffaSHans Petter Selasky
/*
 * State of one background service thread.  rs_notify_svc() creates the
 * socket pair and thread when cnt is zero, and joins the thread and closes
 * the sockets when cnt is zero again after a command completes.
 */
struct rs_svc {
	pthread_t id;		/* thread created by rs_notify_svc() */
	int sock[2];		/* socket pair; rs_notify_svc() talks on sock[0] */
	int cnt;		/* zero means no rsockets are being serviced */
	int size;		/* NOTE(review): presumably capacity of rss/contexts - confirm */
	int context_size;	/* size in bytes of one element of contexts */
	void *(*run)(void *svc);	/* thread entry point, called with this rs_svc */
	struct rsocket **rss;	/* NOTE(review): presumably the serviced rsockets - confirm */
	void *contexts;		/* per-rsocket data, context_size bytes each (assumed layout) */
};
97*d6b92ffaSHans Petter Selasky
/* Datagram service: its per-entry context is one struct pollfd. */
static struct pollfd *udp_svc_fds;
static void *udp_svc_run(void *arg);
static struct rs_svc udp_svc = {
	.context_size = sizeof(*udp_svc_fds),
	.run = udp_svc_run
};
/* Stream (keepalive) service: its per-entry context is one uint32_t timeout. */
static uint32_t *tcp_svc_timeouts;
static void *tcp_svc_run(void *arg);
static struct rs_svc tcp_svc = {
	.context_size = sizeof(*tcp_svc_timeouts),
	.run = tcp_svc_run
};
110*d6b92ffaSHans Petter Selasky
/*
 * Library-wide defaults.  rs_configure() may overwrite each of them from a
 * file under RS_CONF_DIR; rs_alloc() copies the buffer, inline and queue
 * sizes into every rsocket that does not inherit from a listening socket.
 */
static uint16_t def_iomap_size = 0;	/* file: iomap_size */
static uint16_t def_inline = 64;	/* file: inline_default */
static uint16_t def_sqsize = 384;	/* file: sqsize_default */
static uint16_t def_rqsize = 384;	/* file: rqsize_default */
static uint32_t def_mem = (1 << 17);	/* file: mem_default, initial rbuf_size */
static uint32_t def_wmem = (1 << 17);	/* file: wmem_default, initial sbuf_size */
static uint32_t polling_time = 10;	/* file: polling_time */
118*d6b92ffaSHans Petter Selasky
/*
 * Immediate data format is determined by the upper bits:
 * bit 31: message type, 0 - data, 1 - control
 * bit 30: buffers updated, 0 - target, 1 - direct-receive
 * bit 29: more data, 0 - end of transfer, 1 - more data available
 *
 * For data transfers:
 * bits [28:0]: bytes transferred
 * For control messages:
 * SGL, CTRL
 * bits [28:0]: receive credits granted
 * IOMAP_SGL
 * bits [28:16]: reserved, bits [15:0]: index
 */
133*d6b92ffaSHans Petter Selasky
/*
 * Opcodes stored in the top three bits of a 32-bit rsocket message word.
 * Each enumerator's numeric value is its encoding in bits [31:29].
 */
enum {
	RS_OP_DATA,
	RS_OP_RSVD_DATA_MORE,
	RS_OP_WRITE, /* opcode is not transmitted over the network */
	RS_OP_RSVD_DRA_MORE,
	RS_OP_SGL,
	RS_OP_RSVD,
	RS_OP_IOMAP_SGL,
	RS_OP_CTRL
};

/*
 * Pack and unpack a message word: opcode in bits [31:29], payload in bits
 * [28:0].  Every argument is parenthesized so that an expression can be
 * passed safely, and the opcode is converted to uint32_t before the shift
 * because shifting an int opcode >= 4 by 29 would overflow into the sign bit.
 */
#define rs_msg_set(op, data) ((((uint32_t) (op)) << 29) | (uint32_t) (data))
#define rs_msg_op(imm_data) ((imm_data) >> 29)
#define rs_msg_data(imm_data) ((imm_data) & 0x1FFFFFFF)
#define RS_MSG_SIZE sizeof(uint32_t)
148*d6b92ffaSHans Petter Selasky
/*
 * Work-request id helpers.  A wr_id is a 64-bit value whose low 32 bits
 * carry data (extracted by rs_wr_data) and whose top two bits are flags.
 * Macro arguments are parenthesized so the cast or mask applies to the
 * whole argument expression rather than only its first operand.
 */
#define RS_WR_ID_FLAG_RECV (((uint64_t) 1) << 63)
#define RS_WR_ID_FLAG_MSG_SEND (((uint64_t) 1) << 62) /* See RS_OPT_MSG_SEND */
#define rs_send_wr_id(data) ((uint64_t) (data))
#define rs_recv_wr_id(data) (RS_WR_ID_FLAG_RECV | (uint64_t) (data))
#define rs_wr_is_recv(wr_id) ((wr_id) & RS_WR_ID_FLAG_RECV)
#define rs_wr_is_msg_send(wr_id) ((wr_id) & RS_WR_ID_FLAG_MSG_SEND)
#define rs_wr_data(wr_id) ((uint32_t) (wr_id))
156*d6b92ffaSHans Petter Selasky
/* Control message codes. NOTE(review): presumably the payload of an RS_OP_CTRL message - confirm. */
enum {
	RS_CTRL_DISCONNECT,
	RS_CTRL_KEEPALIVE,
	RS_CTRL_SHUTDOWN
};

/* One message split into opcode and payload; struct rsocket keeps these in its rmsg array. */
struct rs_msg {
	uint32_t op;
	uint32_t data;
};
167*d6b92ffaSHans Petter Selasky
struct ds_qp;

/* Datagram receive record: the owning qp plus an offset/length pair. */
struct ds_rmsg {
	struct ds_qp *qp;
	uint32_t offset;	/* NOTE(review): presumably an offset into the qp's rbuf - confirm */
	uint32_t length;
};

/* Link node for a singly linked list; struct rsocket heads one in smsg_free. */
struct ds_smsg {
	struct ds_smsg *next;
};
179*d6b92ffaSHans Petter Selasky
/* Address/key/length triple describing one memory region. */
struct rs_sge {
	uint64_t addr;
	uint32_t key;
	uint32_t length;
};

/* An rs_sge together with the offset it is mapped at. */
struct rs_iomap {
	uint64_t offset;
	struct rs_sge sge;
};

/* Bookkeeping for one registered iomap region. */
struct rs_iomap_mr {
	uint64_t offset;
	struct ibv_mr *mr;
	dlist_entry entry;
	_Atomic(int) refcnt;
	int index; /* -1 if mapping is local and not in iomap_list */
};
198*d6b92ffaSHans Petter Selasky
/* A control message is bounded by the size of one rs_sge. */
#define RS_MAX_CTRL_MSG (sizeof(struct rs_sge))
/* True when the host byte order is big-endian. */
#define rs_host_is_net() (__BYTE_ORDER == __BIG_ENDIAN)
/* Bits for rs_conn_data.flags. */
#define RS_CONN_FLAG_NET (1 << 0)
#define RS_CONN_FLAG_IOMAP (1 << 1)

/* Connection parameters carried inside rs_conn_private_data. */
struct rs_conn_data {
	uint8_t version;
	uint8_t flags;			/* RS_CONN_FLAG_* bits */
	__be16 credits;			/* big-endian on the wire */
	uint8_t reserved[3];
	uint8_t target_iomap_size;
	struct rs_sge target_sgl;
	struct rs_sge data_buf;
};

/*
 * Private-data layout: either the bare rs_conn_data, or (af_ib) the same
 * data preceded by an ib_connect_hdr.
 */
struct rs_conn_private_data {
	union {
		struct rs_conn_data conn_data;
		struct {
			struct ib_connect_hdr ib_hdr;
			struct rs_conn_data conn_data;
		} af_ib;
	};
};
223*d6b92ffaSHans Petter Selasky
/*
 * rsocket states are ordered as passive, connecting, connected, disconnected.
 *
 * The values are bit flags: every in-progress state (resolving, connecting,
 * accepting) includes the rs_opening bit, and rs_connect_rdwr is the
 * combination of rs_connected, rs_readable and rs_writable.
 */
enum rs_state {
	rs_init,
	rs_bound = 0x0001,
	rs_listening = 0x0002,
	rs_opening = 0x0004,
	rs_resolving_addr = rs_opening | 0x0010,
	rs_resolving_route = rs_opening | 0x0020,
	rs_connecting = rs_opening | 0x0040,
	rs_accepting = rs_opening | 0x0080,
	rs_connected = 0x0100,
	rs_writable = 0x0200,
	rs_readable = 0x0400,
	rs_connect_rdwr = rs_connected | rs_readable | rs_writable,
	rs_connect_error = 0x0800,
	rs_disconnected = 0x1000,
	rs_error = 0x2000,
};
244*d6b92ffaSHans Petter Selasky
/* RS_OPT_* flag bits. NOTE(review): presumably stored in rsocket.opts - confirm. */
#define RS_OPT_SWAP_SGL (1 << 0)
/*
 * iWarp does not support RDMA write with immediate data. For iWarp, we
 * transfer rsocket messages as inline sends.
 */
#define RS_OPT_MSG_SEND (1 << 1)
#define RS_OPT_SVC_ACTIVE (1 << 2)

/* Storage large enough for either an IPv4 or an IPv6 socket address. */
union socket_addr {
	struct sockaddr sa;
	struct sockaddr_in sin;
	struct sockaddr_in6 sin6;
};
258*d6b92ffaSHans Petter Selasky
/*
 * Datagram header holding a port and an IPv4 or IPv6 address.
 * NOTE(review): presumably the source address prepended to each datagram -
 * confirm against the send path.
 */
struct ds_header {
	uint8_t version;
	uint8_t length;
	__be16 port;
	union {
		__be32 ipv4;
		struct {
			__be32 flowinfo;
			uint8_t addr[16];
		} ipv6;
	} addr;
};

/* 8 = version + length + port + ipv4; 24 = version + length + port + flowinfo + addr[16]. */
#define DS_IPV4_HDR_LEN 8
#define DS_IPV6_HDR_LEN 24

/* A datagram destination: its address plus the qp, address handle and QP number recorded for it. */
struct ds_dest {
	union socket_addr addr; /* must be first */
	struct ds_qp *qp;
	struct ibv_ah *ah;
	uint32_t qpn;
};

/* One datagram queue pair, linked into its rsocket's qp_list. */
struct ds_qp {
	dlist_entry list;	/* links the qps of one rsocket, see ds_insert_qp() */
	struct rsocket *rs;
	struct rdma_cm_id *cm_id;
	struct ds_header hdr;
	struct ds_dest dest;

	struct ibv_mr *smr;
	struct ibv_mr *rmr;
	uint8_t *rbuf;

	int cq_armed;
};
295*d6b92ffaSHans Petter Selasky
/*
 * Per-socket state.  The first anonymous union holds either the stream or
 * the datagram members: rs_alloc() initializes udp_sock and epfd only when
 * type is SOCK_DGRAM, and ctrl_max_seqno/target_iomap_size only when type is
 * SOCK_STREAM.
 */
struct rsocket {
	int type;		/* SOCK_STREAM or SOCK_DGRAM, set by rs_alloc() */
	int index;		/* slot in the global index map; -1 until rs_insert() */
	fastlock_t slock;
	fastlock_t rlock;
	fastlock_t cq_lock;
	fastlock_t cq_wait_lock;
	fastlock_t map_lock; /* acquire slock first if needed */

	union {
		/* data stream */
		struct {
			struct rdma_cm_id *cm_id;
			uint64_t tcp_opts;
			unsigned int keepalive_time;

			unsigned int ctrl_seqno;
			unsigned int ctrl_max_seqno;
			uint16_t sseq_no;
			uint16_t sseq_comp;
			uint16_t rseq_no;
			uint16_t rseq_comp;

			int remote_sge;
			struct rs_sge remote_sgl;
			struct rs_sge remote_iomap;

			struct ibv_mr *target_mr;
			int target_sge;
			int target_iomap_size;
			void *target_buffer_list;
			volatile struct rs_sge *target_sgl;
			struct rs_iomap *target_iomap;

			int rbuf_msg_index;
			int rbuf_bytes_avail;
			int rbuf_free_offset;
			int rbuf_offset;
			struct ibv_mr *rmr;
			uint8_t *rbuf;

			int sbuf_bytes_avail;
			struct ibv_mr *smr;
			struct ibv_sge ssgl[2];
		};
		/* datagram */
		struct {
			struct ds_qp *qp_list;	/* current head, see ds_insert_qp() */
			void *dest_map;
			struct ds_dest *conn_dest;

			int udp_sock;	/* -1 until opened */
			int epfd;	/* -1 until opened */
			int rqe_avail;
			struct ds_smsg *smsg_free;
		};
	};

	int opts;
	int fd_flags;
	uint64_t so_opts;
	uint64_t ipv6_opts;
	void *optval;
	size_t optlen;
	int state;
	int cq_armed;
	int retries;
	int err;

	int sqe_avail;
	uint32_t sbuf_size;	/* inherited, or def_wmem */
	uint16_t sq_size;	/* inherited, or def_sqsize */
	uint16_t sq_inline;	/* inherited, or def_inline */

	uint32_t rbuf_size;	/* inherited, or def_mem */
	uint16_t rq_size;	/* inherited, or def_rqsize */
	int rmsg_head;
	int rmsg_tail;
	union {
		struct rs_msg *rmsg;
		struct ds_rmsg *dmsg;
	};

	uint8_t *sbuf;
	struct rs_iomap_mr *remote_iomappings;
	dlist_entry iomap_list;
	dlist_entry iomap_queue;
	int iomap_pending;
	int unack_cqe;
};
386*d6b92ffaSHans Petter Selasky
/* Tag constant. NOTE(review): presumably the value of ds_udp_header.tag - confirm. */
#define DS_UDP_TAG 0x55555555

/* UDP message header holding a QP number and an IPv4 or IPv6 address. */
struct ds_udp_header {
	__be32 tag;
	uint8_t version;
	uint8_t op;
	uint8_t length;
	uint8_t reserved;
	__be32 qpn; /* lower 8-bits reserved */
	union {
		__be32 ipv4;
		uint8_t ipv6[16];
	} addr;
};

/* 12 fixed bytes (tag, version/op/length/reserved, qpn) plus a 4- or 16-byte address. */
#define DS_UDP_IPV4_HDR_LEN 16
#define DS_UDP_IPV6_HDR_LEN 28

/* The ds_qp whose list node follows qp's node in the qp list. */
#define ds_next_qp(qp) container_of((qp)->list.next, struct ds_qp, list)
406*d6b92ffaSHans Petter Selasky
/*
 * Write exactly len bytes from msg to fd.
 *
 * write() may transfer fewer bytes than requested, or fail with EINTR when
 * a signal arrives, so it is called repeatedly until the whole message has
 * been sent.  The function has no way to report failure to its callers; any
 * other error (or a write that makes no progress) trips the assertion, which
 * keeps the previous all-or-abort behaviour.
 */
static void write_all(int fd, const void *msg, size_t len)
{
	const char *pos = msg;
	size_t left = len;

	while (left > 0) {
		ssize_t rc = write(fd, pos, left);

		if (rc < 0 && errno == EINTR)
			continue;	/* interrupted before any data moved */
		if (rc <= 0)
			break;		/* hard error or no progress */
		pos += rc;
		left -= (size_t) rc;
	}
	assert(left == 0);
}
413*d6b92ffaSHans Petter Selasky
/*
 * Read exactly len bytes from fd into msg.
 *
 * read() may return fewer bytes than requested, or fail with EINTR when a
 * signal arrives, so it is called repeatedly until the buffer is full.  The
 * function has no way to report failure to its callers; end-of-file or any
 * other error trips the assertion, which keeps the previous all-or-abort
 * behaviour.
 */
static void read_all(int fd, void *msg, size_t len)
{
	char *pos = msg;
	size_t left = len;

	while (left > 0) {
		ssize_t rc = read(fd, pos, left);

		if (rc < 0 && errno == EINTR)
			continue;	/* interrupted before any data moved */
		if (rc <= 0)
			break;		/* hard error or end-of-file */
		pos += rc;
		left -= (size_t) rc;
	}
	assert(left == 0);
}
420*d6b92ffaSHans Petter Selasky
ds_insert_qp(struct rsocket * rs,struct ds_qp * qp)421*d6b92ffaSHans Petter Selasky static void ds_insert_qp(struct rsocket *rs, struct ds_qp *qp)
422*d6b92ffaSHans Petter Selasky {
423*d6b92ffaSHans Petter Selasky if (!rs->qp_list)
424*d6b92ffaSHans Petter Selasky dlist_init(&qp->list);
425*d6b92ffaSHans Petter Selasky else
426*d6b92ffaSHans Petter Selasky dlist_insert_head(&qp->list, &rs->qp_list->list);
427*d6b92ffaSHans Petter Selasky rs->qp_list = qp;
428*d6b92ffaSHans Petter Selasky }
429*d6b92ffaSHans Petter Selasky
ds_remove_qp(struct rsocket * rs,struct ds_qp * qp)430*d6b92ffaSHans Petter Selasky static void ds_remove_qp(struct rsocket *rs, struct ds_qp *qp)
431*d6b92ffaSHans Petter Selasky {
432*d6b92ffaSHans Petter Selasky if (qp->list.next != &qp->list) {
433*d6b92ffaSHans Petter Selasky rs->qp_list = ds_next_qp(qp);
434*d6b92ffaSHans Petter Selasky dlist_remove(&qp->list);
435*d6b92ffaSHans Petter Selasky } else {
436*d6b92ffaSHans Petter Selasky rs->qp_list = NULL;
437*d6b92ffaSHans Petter Selasky }
438*d6b92ffaSHans Petter Selasky }
439*d6b92ffaSHans Petter Selasky
rs_notify_svc(struct rs_svc * svc,struct rsocket * rs,int cmd)440*d6b92ffaSHans Petter Selasky static int rs_notify_svc(struct rs_svc *svc, struct rsocket *rs, int cmd)
441*d6b92ffaSHans Petter Selasky {
442*d6b92ffaSHans Petter Selasky struct rs_svc_msg msg;
443*d6b92ffaSHans Petter Selasky int ret;
444*d6b92ffaSHans Petter Selasky
445*d6b92ffaSHans Petter Selasky pthread_mutex_lock(&mut);
446*d6b92ffaSHans Petter Selasky if (!svc->cnt) {
447*d6b92ffaSHans Petter Selasky ret = socketpair(AF_UNIX, SOCK_STREAM, 0, svc->sock);
448*d6b92ffaSHans Petter Selasky if (ret)
449*d6b92ffaSHans Petter Selasky goto unlock;
450*d6b92ffaSHans Petter Selasky
451*d6b92ffaSHans Petter Selasky ret = pthread_create(&svc->id, NULL, svc->run, svc);
452*d6b92ffaSHans Petter Selasky if (ret) {
453*d6b92ffaSHans Petter Selasky ret = ERR(ret);
454*d6b92ffaSHans Petter Selasky goto closepair;
455*d6b92ffaSHans Petter Selasky }
456*d6b92ffaSHans Petter Selasky }
457*d6b92ffaSHans Petter Selasky
458*d6b92ffaSHans Petter Selasky msg.cmd = cmd;
459*d6b92ffaSHans Petter Selasky msg.status = EINVAL;
460*d6b92ffaSHans Petter Selasky msg.rs = rs;
461*d6b92ffaSHans Petter Selasky write_all(svc->sock[0], &msg, sizeof msg);
462*d6b92ffaSHans Petter Selasky read_all(svc->sock[0], &msg, sizeof msg);
463*d6b92ffaSHans Petter Selasky ret = rdma_seterrno(msg.status);
464*d6b92ffaSHans Petter Selasky if (svc->cnt)
465*d6b92ffaSHans Petter Selasky goto unlock;
466*d6b92ffaSHans Petter Selasky
467*d6b92ffaSHans Petter Selasky pthread_join(svc->id, NULL);
468*d6b92ffaSHans Petter Selasky closepair:
469*d6b92ffaSHans Petter Selasky close(svc->sock[0]);
470*d6b92ffaSHans Petter Selasky close(svc->sock[1]);
471*d6b92ffaSHans Petter Selasky unlock:
472*d6b92ffaSHans Petter Selasky pthread_mutex_unlock(&mut);
473*d6b92ffaSHans Petter Selasky return ret;
474*d6b92ffaSHans Petter Selasky }
475*d6b92ffaSHans Petter Selasky
/*
 * Ordering callback over socket addresses.  Both arguments are read as
 * struct sockaddr; when both are AF_INET6 the first sizeof(struct
 * sockaddr_in6) bytes are compared, otherwise only the first sizeof(struct
 * sockaddr_in) bytes.  Returns the memcmp() result.
 */
static int ds_compare_addr(const void *dst1, const void *dst2)
{
	const struct sockaddr *lhs = dst1;
	const struct sockaddr *rhs = dst2;
	size_t cmp_len = sizeof(struct sockaddr_in);

	if (lhs->sa_family == AF_INET6 && rhs->sa_family == AF_INET6)
		cmp_len = sizeof(struct sockaddr_in6);

	return memcmp(dst1, dst2, cmp_len);
}
488*d6b92ffaSHans Petter Selasky
/*
 * Compress value into a scaled form.  Values up to 2^(bits-1) are returned
 * unchanged; larger values are returned as value >> bits with bit (bits-1)
 * set as a marker.
 */
static int rs_value_to_scale(int value, int bits)
{
	const int marker = 1 << (bits - 1);

	if (value <= marker)
		return value;

	return marker | (value >> bits);
}
494*d6b92ffaSHans Petter Selasky
/*
 * Expand a scaled number produced by rs_value_to_scale().  Inputs up to
 * 2^(bits-1) are returned unchanged; for larger inputs the marker bit
 * (bits-1) is cleared and the remainder is shifted left by bits.
 */
static int rs_scale_to_value(int value, int bits)
{
	const int marker = 1 << (bits - 1);

	if (value <= marker)
		return value;

	return (value & ~marker) << bits;
}
500*d6b92ffaSHans Petter Selasky
/*
 * fscanf() whose result is deliberately ignored.  gcc > ~5 will not allow
 * (void)fscanf to suppress -Wunused-result, but storing the result in a
 * variable and voiding that does.  Ignoring the result is OK here (but
 * horribly unfriendly to the user) since the library has a sane default.
 *
 * The body is wrapped in do { } while (0) so the macro behaves as a single
 * statement and can be used as the body of an unbraced if/else.
 */
#define failable_fscanf(f, fmt, ...) \
	do { \
		int failable_fscanf_rc_ = fscanf(f, fmt, __VA_ARGS__); \
		(void) failable_fscanf_rc_; \
	} while (0)
509*d6b92ffaSHans Petter Selasky
rs_configure(void)510*d6b92ffaSHans Petter Selasky static void rs_configure(void)
511*d6b92ffaSHans Petter Selasky {
512*d6b92ffaSHans Petter Selasky FILE *f;
513*d6b92ffaSHans Petter Selasky static int init;
514*d6b92ffaSHans Petter Selasky
515*d6b92ffaSHans Petter Selasky if (init)
516*d6b92ffaSHans Petter Selasky return;
517*d6b92ffaSHans Petter Selasky
518*d6b92ffaSHans Petter Selasky pthread_mutex_lock(&mut);
519*d6b92ffaSHans Petter Selasky if (init)
520*d6b92ffaSHans Petter Selasky goto out;
521*d6b92ffaSHans Petter Selasky
522*d6b92ffaSHans Petter Selasky if (ucma_init())
523*d6b92ffaSHans Petter Selasky goto out;
524*d6b92ffaSHans Petter Selasky ucma_ib_init();
525*d6b92ffaSHans Petter Selasky
526*d6b92ffaSHans Petter Selasky if ((f = fopen(RS_CONF_DIR "/polling_time", "r"))) {
527*d6b92ffaSHans Petter Selasky failable_fscanf(f, "%u", &polling_time);
528*d6b92ffaSHans Petter Selasky fclose(f);
529*d6b92ffaSHans Petter Selasky }
530*d6b92ffaSHans Petter Selasky
531*d6b92ffaSHans Petter Selasky if ((f = fopen(RS_CONF_DIR "/inline_default", "r"))) {
532*d6b92ffaSHans Petter Selasky failable_fscanf(f, "%hu", &def_inline);
533*d6b92ffaSHans Petter Selasky fclose(f);
534*d6b92ffaSHans Petter Selasky }
535*d6b92ffaSHans Petter Selasky
536*d6b92ffaSHans Petter Selasky if ((f = fopen(RS_CONF_DIR "/sqsize_default", "r"))) {
537*d6b92ffaSHans Petter Selasky failable_fscanf(f, "%hu", &def_sqsize);
538*d6b92ffaSHans Petter Selasky fclose(f);
539*d6b92ffaSHans Petter Selasky }
540*d6b92ffaSHans Petter Selasky
541*d6b92ffaSHans Petter Selasky if ((f = fopen(RS_CONF_DIR "/rqsize_default", "r"))) {
542*d6b92ffaSHans Petter Selasky failable_fscanf(f, "%hu", &def_rqsize);
543*d6b92ffaSHans Petter Selasky fclose(f);
544*d6b92ffaSHans Petter Selasky }
545*d6b92ffaSHans Petter Selasky
546*d6b92ffaSHans Petter Selasky if ((f = fopen(RS_CONF_DIR "/mem_default", "r"))) {
547*d6b92ffaSHans Petter Selasky failable_fscanf(f, "%u", &def_mem);
548*d6b92ffaSHans Petter Selasky fclose(f);
549*d6b92ffaSHans Petter Selasky
550*d6b92ffaSHans Petter Selasky if (def_mem < 1)
551*d6b92ffaSHans Petter Selasky def_mem = 1;
552*d6b92ffaSHans Petter Selasky }
553*d6b92ffaSHans Petter Selasky
554*d6b92ffaSHans Petter Selasky if ((f = fopen(RS_CONF_DIR "/wmem_default", "r"))) {
555*d6b92ffaSHans Petter Selasky failable_fscanf(f, "%u", &def_wmem);
556*d6b92ffaSHans Petter Selasky fclose(f);
557*d6b92ffaSHans Petter Selasky if (def_wmem < RS_SNDLOWAT)
558*d6b92ffaSHans Petter Selasky def_wmem = RS_SNDLOWAT << 1;
559*d6b92ffaSHans Petter Selasky }
560*d6b92ffaSHans Petter Selasky
561*d6b92ffaSHans Petter Selasky if ((f = fopen(RS_CONF_DIR "/iomap_size", "r"))) {
562*d6b92ffaSHans Petter Selasky failable_fscanf(f, "%hu", &def_iomap_size);
563*d6b92ffaSHans Petter Selasky fclose(f);
564*d6b92ffaSHans Petter Selasky
565*d6b92ffaSHans Petter Selasky /* round to supported values */
566*d6b92ffaSHans Petter Selasky def_iomap_size = (uint8_t) rs_value_to_scale(
567*d6b92ffaSHans Petter Selasky (uint16_t) rs_scale_to_value(def_iomap_size, 8), 8);
568*d6b92ffaSHans Petter Selasky }
569*d6b92ffaSHans Petter Selasky init = 1;
570*d6b92ffaSHans Petter Selasky out:
571*d6b92ffaSHans Petter Selasky pthread_mutex_unlock(&mut);
572*d6b92ffaSHans Petter Selasky }
573*d6b92ffaSHans Petter Selasky
rs_insert(struct rsocket * rs,int index)574*d6b92ffaSHans Petter Selasky static int rs_insert(struct rsocket *rs, int index)
575*d6b92ffaSHans Petter Selasky {
576*d6b92ffaSHans Petter Selasky pthread_mutex_lock(&mut);
577*d6b92ffaSHans Petter Selasky rs->index = idm_set(&idm, index, rs);
578*d6b92ffaSHans Petter Selasky pthread_mutex_unlock(&mut);
579*d6b92ffaSHans Petter Selasky return rs->index;
580*d6b92ffaSHans Petter Selasky }
581*d6b92ffaSHans Petter Selasky
rs_remove(struct rsocket * rs)582*d6b92ffaSHans Petter Selasky static void rs_remove(struct rsocket *rs)
583*d6b92ffaSHans Petter Selasky {
584*d6b92ffaSHans Petter Selasky pthread_mutex_lock(&mut);
585*d6b92ffaSHans Petter Selasky idm_clear(&idm, rs->index);
586*d6b92ffaSHans Petter Selasky pthread_mutex_unlock(&mut);
587*d6b92ffaSHans Petter Selasky }
588*d6b92ffaSHans Petter Selasky
589*d6b92ffaSHans Petter Selasky /* We only inherit from listening sockets */
rs_alloc(struct rsocket * inherited_rs,int type)590*d6b92ffaSHans Petter Selasky static struct rsocket *rs_alloc(struct rsocket *inherited_rs, int type)
591*d6b92ffaSHans Petter Selasky {
592*d6b92ffaSHans Petter Selasky struct rsocket *rs;
593*d6b92ffaSHans Petter Selasky
594*d6b92ffaSHans Petter Selasky rs = calloc(1, sizeof(*rs));
595*d6b92ffaSHans Petter Selasky if (!rs)
596*d6b92ffaSHans Petter Selasky return NULL;
597*d6b92ffaSHans Petter Selasky
598*d6b92ffaSHans Petter Selasky rs->type = type;
599*d6b92ffaSHans Petter Selasky rs->index = -1;
600*d6b92ffaSHans Petter Selasky if (type == SOCK_DGRAM) {
601*d6b92ffaSHans Petter Selasky rs->udp_sock = -1;
602*d6b92ffaSHans Petter Selasky rs->epfd = -1;
603*d6b92ffaSHans Petter Selasky }
604*d6b92ffaSHans Petter Selasky
605*d6b92ffaSHans Petter Selasky if (inherited_rs) {
606*d6b92ffaSHans Petter Selasky rs->sbuf_size = inherited_rs->sbuf_size;
607*d6b92ffaSHans Petter Selasky rs->rbuf_size = inherited_rs->rbuf_size;
608*d6b92ffaSHans Petter Selasky rs->sq_inline = inherited_rs->sq_inline;
609*d6b92ffaSHans Petter Selasky rs->sq_size = inherited_rs->sq_size;
610*d6b92ffaSHans Petter Selasky rs->rq_size = inherited_rs->rq_size;
611*d6b92ffaSHans Petter Selasky if (type == SOCK_STREAM) {
612*d6b92ffaSHans Petter Selasky rs->ctrl_max_seqno = inherited_rs->ctrl_max_seqno;
613*d6b92ffaSHans Petter Selasky rs->target_iomap_size = inherited_rs->target_iomap_size;
614*d6b92ffaSHans Petter Selasky }
615*d6b92ffaSHans Petter Selasky } else {
616*d6b92ffaSHans Petter Selasky rs->sbuf_size = def_wmem;
617*d6b92ffaSHans Petter Selasky rs->rbuf_size = def_mem;
618*d6b92ffaSHans Petter Selasky rs->sq_inline = def_inline;
619*d6b92ffaSHans Petter Selasky rs->sq_size = def_sqsize;
620*d6b92ffaSHans Petter Selasky rs->rq_size = def_rqsize;
621*d6b92ffaSHans Petter Selasky if (type == SOCK_STREAM) {
622*d6b92ffaSHans Petter Selasky rs->ctrl_max_seqno = RS_QP_CTRL_SIZE;
623*d6b92ffaSHans Petter Selasky rs->target_iomap_size = def_iomap_size;
624*d6b92ffaSHans Petter Selasky }
625*d6b92ffaSHans Petter Selasky }
626*d6b92ffaSHans Petter Selasky fastlock_init(&rs->slock);
627*d6b92ffaSHans Petter Selasky fastlock_init(&rs->rlock);
628*d6b92ffaSHans Petter Selasky fastlock_init(&rs->cq_lock);
629*d6b92ffaSHans Petter Selasky fastlock_init(&rs->cq_wait_lock);
630*d6b92ffaSHans Petter Selasky fastlock_init(&rs->map_lock);
631*d6b92ffaSHans Petter Selasky dlist_init(&rs->iomap_list);
632*d6b92ffaSHans Petter Selasky dlist_init(&rs->iomap_queue);
633*d6b92ffaSHans Petter Selasky return rs;
634*d6b92ffaSHans Petter Selasky }
635*d6b92ffaSHans Petter Selasky
rs_set_nonblocking(struct rsocket * rs,int arg)636*d6b92ffaSHans Petter Selasky static int rs_set_nonblocking(struct rsocket *rs, int arg)
637*d6b92ffaSHans Petter Selasky {
638*d6b92ffaSHans Petter Selasky struct ds_qp *qp;
639*d6b92ffaSHans Petter Selasky int ret = 0;
640*d6b92ffaSHans Petter Selasky
641*d6b92ffaSHans Petter Selasky if (rs->type == SOCK_STREAM) {
642*d6b92ffaSHans Petter Selasky if (rs->cm_id->recv_cq_channel)
643*d6b92ffaSHans Petter Selasky ret = fcntl(rs->cm_id->recv_cq_channel->fd, F_SETFL, arg);
644*d6b92ffaSHans Petter Selasky
645*d6b92ffaSHans Petter Selasky if (!ret && rs->state < rs_connected)
646*d6b92ffaSHans Petter Selasky ret = fcntl(rs->cm_id->channel->fd, F_SETFL, arg);
647*d6b92ffaSHans Petter Selasky } else {
648*d6b92ffaSHans Petter Selasky ret = fcntl(rs->epfd, F_SETFL, arg);
649*d6b92ffaSHans Petter Selasky if (!ret && rs->qp_list) {
650*d6b92ffaSHans Petter Selasky qp = rs->qp_list;
651*d6b92ffaSHans Petter Selasky do {
652*d6b92ffaSHans Petter Selasky ret = fcntl(qp->cm_id->recv_cq_channel->fd,
653*d6b92ffaSHans Petter Selasky F_SETFL, arg);
654*d6b92ffaSHans Petter Selasky qp = ds_next_qp(qp);
655*d6b92ffaSHans Petter Selasky } while (qp != rs->qp_list && !ret);
656*d6b92ffaSHans Petter Selasky }
657*d6b92ffaSHans Petter Selasky }
658*d6b92ffaSHans Petter Selasky
659*d6b92ffaSHans Petter Selasky return ret;
660*d6b92ffaSHans Petter Selasky }
661*d6b92ffaSHans Petter Selasky
rs_set_qp_size(struct rsocket * rs)662*d6b92ffaSHans Petter Selasky static void rs_set_qp_size(struct rsocket *rs)
663*d6b92ffaSHans Petter Selasky {
664*d6b92ffaSHans Petter Selasky uint16_t max_size;
665*d6b92ffaSHans Petter Selasky
666*d6b92ffaSHans Petter Selasky max_size = min(ucma_max_qpsize(rs->cm_id), RS_QP_MAX_SIZE);
667*d6b92ffaSHans Petter Selasky
668*d6b92ffaSHans Petter Selasky if (rs->sq_size > max_size)
669*d6b92ffaSHans Petter Selasky rs->sq_size = max_size;
670*d6b92ffaSHans Petter Selasky else if (rs->sq_size < RS_QP_MIN_SIZE)
671*d6b92ffaSHans Petter Selasky rs->sq_size = RS_QP_MIN_SIZE;
672*d6b92ffaSHans Petter Selasky
673*d6b92ffaSHans Petter Selasky if (rs->rq_size > max_size)
674*d6b92ffaSHans Petter Selasky rs->rq_size = max_size;
675*d6b92ffaSHans Petter Selasky else if (rs->rq_size < RS_QP_MIN_SIZE)
676*d6b92ffaSHans Petter Selasky rs->rq_size = RS_QP_MIN_SIZE;
677*d6b92ffaSHans Petter Selasky }
678*d6b92ffaSHans Petter Selasky
ds_set_qp_size(struct rsocket * rs)679*d6b92ffaSHans Petter Selasky static void ds_set_qp_size(struct rsocket *rs)
680*d6b92ffaSHans Petter Selasky {
681*d6b92ffaSHans Petter Selasky uint16_t max_size;
682*d6b92ffaSHans Petter Selasky
683*d6b92ffaSHans Petter Selasky max_size = min(ucma_max_qpsize(NULL), RS_QP_MAX_SIZE);
684*d6b92ffaSHans Petter Selasky
685*d6b92ffaSHans Petter Selasky if (rs->sq_size > max_size)
686*d6b92ffaSHans Petter Selasky rs->sq_size = max_size;
687*d6b92ffaSHans Petter Selasky if (rs->rq_size > max_size)
688*d6b92ffaSHans Petter Selasky rs->rq_size = max_size;
689*d6b92ffaSHans Petter Selasky
690*d6b92ffaSHans Petter Selasky if (rs->rq_size > (rs->rbuf_size / RS_SNDLOWAT))
691*d6b92ffaSHans Petter Selasky rs->rq_size = rs->rbuf_size / RS_SNDLOWAT;
692*d6b92ffaSHans Petter Selasky else
693*d6b92ffaSHans Petter Selasky rs->rbuf_size = rs->rq_size * RS_SNDLOWAT;
694*d6b92ffaSHans Petter Selasky
695*d6b92ffaSHans Petter Selasky if (rs->sq_size > (rs->sbuf_size / RS_SNDLOWAT))
696*d6b92ffaSHans Petter Selasky rs->sq_size = rs->sbuf_size / RS_SNDLOWAT;
697*d6b92ffaSHans Petter Selasky else
698*d6b92ffaSHans Petter Selasky rs->sbuf_size = rs->sq_size * RS_SNDLOWAT;
699*d6b92ffaSHans Petter Selasky }
700*d6b92ffaSHans Petter Selasky
rs_init_bufs(struct rsocket * rs)701*d6b92ffaSHans Petter Selasky static int rs_init_bufs(struct rsocket *rs)
702*d6b92ffaSHans Petter Selasky {
703*d6b92ffaSHans Petter Selasky uint32_t total_rbuf_size, total_sbuf_size;
704*d6b92ffaSHans Petter Selasky size_t len;
705*d6b92ffaSHans Petter Selasky
706*d6b92ffaSHans Petter Selasky rs->rmsg = calloc(rs->rq_size + 1, sizeof(*rs->rmsg));
707*d6b92ffaSHans Petter Selasky if (!rs->rmsg)
708*d6b92ffaSHans Petter Selasky return ERR(ENOMEM);
709*d6b92ffaSHans Petter Selasky
710*d6b92ffaSHans Petter Selasky total_sbuf_size = rs->sbuf_size;
711*d6b92ffaSHans Petter Selasky if (rs->sq_inline < RS_MAX_CTRL_MSG)
712*d6b92ffaSHans Petter Selasky total_sbuf_size += RS_MAX_CTRL_MSG * RS_QP_CTRL_SIZE;
713*d6b92ffaSHans Petter Selasky rs->sbuf = calloc(total_sbuf_size, 1);
714*d6b92ffaSHans Petter Selasky if (!rs->sbuf)
715*d6b92ffaSHans Petter Selasky return ERR(ENOMEM);
716*d6b92ffaSHans Petter Selasky
717*d6b92ffaSHans Petter Selasky rs->smr = rdma_reg_msgs(rs->cm_id, rs->sbuf, total_sbuf_size);
718*d6b92ffaSHans Petter Selasky if (!rs->smr)
719*d6b92ffaSHans Petter Selasky return -1;
720*d6b92ffaSHans Petter Selasky
721*d6b92ffaSHans Petter Selasky len = sizeof(*rs->target_sgl) * RS_SGL_SIZE +
722*d6b92ffaSHans Petter Selasky sizeof(*rs->target_iomap) * rs->target_iomap_size;
723*d6b92ffaSHans Petter Selasky rs->target_buffer_list = malloc(len);
724*d6b92ffaSHans Petter Selasky if (!rs->target_buffer_list)
725*d6b92ffaSHans Petter Selasky return ERR(ENOMEM);
726*d6b92ffaSHans Petter Selasky
727*d6b92ffaSHans Petter Selasky rs->target_mr = rdma_reg_write(rs->cm_id, rs->target_buffer_list, len);
728*d6b92ffaSHans Petter Selasky if (!rs->target_mr)
729*d6b92ffaSHans Petter Selasky return -1;
730*d6b92ffaSHans Petter Selasky
731*d6b92ffaSHans Petter Selasky memset(rs->target_buffer_list, 0, len);
732*d6b92ffaSHans Petter Selasky rs->target_sgl = rs->target_buffer_list;
733*d6b92ffaSHans Petter Selasky if (rs->target_iomap_size)
734*d6b92ffaSHans Petter Selasky rs->target_iomap = (struct rs_iomap *) (rs->target_sgl + RS_SGL_SIZE);
735*d6b92ffaSHans Petter Selasky
736*d6b92ffaSHans Petter Selasky total_rbuf_size = rs->rbuf_size;
737*d6b92ffaSHans Petter Selasky if (rs->opts & RS_OPT_MSG_SEND)
738*d6b92ffaSHans Petter Selasky total_rbuf_size += rs->rq_size * RS_MSG_SIZE;
739*d6b92ffaSHans Petter Selasky rs->rbuf = calloc(total_rbuf_size, 1);
740*d6b92ffaSHans Petter Selasky if (!rs->rbuf)
741*d6b92ffaSHans Petter Selasky return ERR(ENOMEM);
742*d6b92ffaSHans Petter Selasky
743*d6b92ffaSHans Petter Selasky rs->rmr = rdma_reg_write(rs->cm_id, rs->rbuf, total_rbuf_size);
744*d6b92ffaSHans Petter Selasky if (!rs->rmr)
745*d6b92ffaSHans Petter Selasky return -1;
746*d6b92ffaSHans Petter Selasky
747*d6b92ffaSHans Petter Selasky rs->ssgl[0].addr = rs->ssgl[1].addr = (uintptr_t) rs->sbuf;
748*d6b92ffaSHans Petter Selasky rs->sbuf_bytes_avail = rs->sbuf_size;
749*d6b92ffaSHans Petter Selasky rs->ssgl[0].lkey = rs->ssgl[1].lkey = rs->smr->lkey;
750*d6b92ffaSHans Petter Selasky
751*d6b92ffaSHans Petter Selasky rs->rbuf_free_offset = rs->rbuf_size >> 1;
752*d6b92ffaSHans Petter Selasky rs->rbuf_bytes_avail = rs->rbuf_size >> 1;
753*d6b92ffaSHans Petter Selasky rs->sqe_avail = rs->sq_size - rs->ctrl_max_seqno;
754*d6b92ffaSHans Petter Selasky rs->rseq_comp = rs->rq_size >> 1;
755*d6b92ffaSHans Petter Selasky return 0;
756*d6b92ffaSHans Petter Selasky }
757*d6b92ffaSHans Petter Selasky
ds_init_bufs(struct ds_qp * qp)758*d6b92ffaSHans Petter Selasky static int ds_init_bufs(struct ds_qp *qp)
759*d6b92ffaSHans Petter Selasky {
760*d6b92ffaSHans Petter Selasky qp->rbuf = calloc(qp->rs->rbuf_size + sizeof(struct ibv_grh), 1);
761*d6b92ffaSHans Petter Selasky if (!qp->rbuf)
762*d6b92ffaSHans Petter Selasky return ERR(ENOMEM);
763*d6b92ffaSHans Petter Selasky
764*d6b92ffaSHans Petter Selasky qp->smr = rdma_reg_msgs(qp->cm_id, qp->rs->sbuf, qp->rs->sbuf_size);
765*d6b92ffaSHans Petter Selasky if (!qp->smr)
766*d6b92ffaSHans Petter Selasky return -1;
767*d6b92ffaSHans Petter Selasky
768*d6b92ffaSHans Petter Selasky qp->rmr = rdma_reg_msgs(qp->cm_id, qp->rbuf, qp->rs->rbuf_size +
769*d6b92ffaSHans Petter Selasky sizeof(struct ibv_grh));
770*d6b92ffaSHans Petter Selasky if (!qp->rmr)
771*d6b92ffaSHans Petter Selasky return -1;
772*d6b92ffaSHans Petter Selasky
773*d6b92ffaSHans Petter Selasky return 0;
774*d6b92ffaSHans Petter Selasky }
775*d6b92ffaSHans Petter Selasky
776*d6b92ffaSHans Petter Selasky /*
777*d6b92ffaSHans Petter Selasky * If a user is waiting on a datagram rsocket through poll or select, then
778*d6b92ffaSHans Petter Selasky * we need the first completion to generate an event on the related epoll fd
779*d6b92ffaSHans Petter Selasky * in order to signal the user. We arm the CQ on creation for this purpose
780*d6b92ffaSHans Petter Selasky */
rs_create_cq(struct rsocket * rs,struct rdma_cm_id * cm_id)781*d6b92ffaSHans Petter Selasky static int rs_create_cq(struct rsocket *rs, struct rdma_cm_id *cm_id)
782*d6b92ffaSHans Petter Selasky {
783*d6b92ffaSHans Petter Selasky cm_id->recv_cq_channel = ibv_create_comp_channel(cm_id->verbs);
784*d6b92ffaSHans Petter Selasky if (!cm_id->recv_cq_channel)
785*d6b92ffaSHans Petter Selasky return -1;
786*d6b92ffaSHans Petter Selasky
787*d6b92ffaSHans Petter Selasky cm_id->recv_cq = ibv_create_cq(cm_id->verbs, rs->sq_size + rs->rq_size,
788*d6b92ffaSHans Petter Selasky cm_id, cm_id->recv_cq_channel, 0);
789*d6b92ffaSHans Petter Selasky if (!cm_id->recv_cq)
790*d6b92ffaSHans Petter Selasky goto err1;
791*d6b92ffaSHans Petter Selasky
792*d6b92ffaSHans Petter Selasky if (rs->fd_flags & O_NONBLOCK) {
793*d6b92ffaSHans Petter Selasky if (fcntl(cm_id->recv_cq_channel->fd, F_SETFL, O_NONBLOCK))
794*d6b92ffaSHans Petter Selasky goto err2;
795*d6b92ffaSHans Petter Selasky }
796*d6b92ffaSHans Petter Selasky
797*d6b92ffaSHans Petter Selasky ibv_req_notify_cq(cm_id->recv_cq, 0);
798*d6b92ffaSHans Petter Selasky cm_id->send_cq_channel = cm_id->recv_cq_channel;
799*d6b92ffaSHans Petter Selasky cm_id->send_cq = cm_id->recv_cq;
800*d6b92ffaSHans Petter Selasky return 0;
801*d6b92ffaSHans Petter Selasky
802*d6b92ffaSHans Petter Selasky err2:
803*d6b92ffaSHans Petter Selasky ibv_destroy_cq(cm_id->recv_cq);
804*d6b92ffaSHans Petter Selasky cm_id->recv_cq = NULL;
805*d6b92ffaSHans Petter Selasky err1:
806*d6b92ffaSHans Petter Selasky ibv_destroy_comp_channel(cm_id->recv_cq_channel);
807*d6b92ffaSHans Petter Selasky cm_id->recv_cq_channel = NULL;
808*d6b92ffaSHans Petter Selasky return -1;
809*d6b92ffaSHans Petter Selasky }
810*d6b92ffaSHans Petter Selasky
rs_post_recv(struct rsocket * rs)811*d6b92ffaSHans Petter Selasky static inline int rs_post_recv(struct rsocket *rs)
812*d6b92ffaSHans Petter Selasky {
813*d6b92ffaSHans Petter Selasky struct ibv_recv_wr wr, *bad;
814*d6b92ffaSHans Petter Selasky struct ibv_sge sge;
815*d6b92ffaSHans Petter Selasky
816*d6b92ffaSHans Petter Selasky wr.next = NULL;
817*d6b92ffaSHans Petter Selasky if (!(rs->opts & RS_OPT_MSG_SEND)) {
818*d6b92ffaSHans Petter Selasky wr.wr_id = rs_recv_wr_id(0);
819*d6b92ffaSHans Petter Selasky wr.sg_list = NULL;
820*d6b92ffaSHans Petter Selasky wr.num_sge = 0;
821*d6b92ffaSHans Petter Selasky } else {
822*d6b92ffaSHans Petter Selasky wr.wr_id = rs_recv_wr_id(rs->rbuf_msg_index);
823*d6b92ffaSHans Petter Selasky sge.addr = (uintptr_t) rs->rbuf + rs->rbuf_size +
824*d6b92ffaSHans Petter Selasky (rs->rbuf_msg_index * RS_MSG_SIZE);
825*d6b92ffaSHans Petter Selasky sge.length = RS_MSG_SIZE;
826*d6b92ffaSHans Petter Selasky sge.lkey = rs->rmr->lkey;
827*d6b92ffaSHans Petter Selasky
828*d6b92ffaSHans Petter Selasky wr.sg_list = &sge;
829*d6b92ffaSHans Petter Selasky wr.num_sge = 1;
830*d6b92ffaSHans Petter Selasky if(++rs->rbuf_msg_index == rs->rq_size)
831*d6b92ffaSHans Petter Selasky rs->rbuf_msg_index = 0;
832*d6b92ffaSHans Petter Selasky }
833*d6b92ffaSHans Petter Selasky
834*d6b92ffaSHans Petter Selasky return rdma_seterrno(ibv_post_recv(rs->cm_id->qp, &wr, &bad));
835*d6b92ffaSHans Petter Selasky }
836*d6b92ffaSHans Petter Selasky
ds_post_recv(struct rsocket * rs,struct ds_qp * qp,uint32_t offset)837*d6b92ffaSHans Petter Selasky static inline int ds_post_recv(struct rsocket *rs, struct ds_qp *qp, uint32_t offset)
838*d6b92ffaSHans Petter Selasky {
839*d6b92ffaSHans Petter Selasky struct ibv_recv_wr wr, *bad;
840*d6b92ffaSHans Petter Selasky struct ibv_sge sge[2];
841*d6b92ffaSHans Petter Selasky
842*d6b92ffaSHans Petter Selasky sge[0].addr = (uintptr_t) qp->rbuf + rs->rbuf_size;
843*d6b92ffaSHans Petter Selasky sge[0].length = sizeof(struct ibv_grh);
844*d6b92ffaSHans Petter Selasky sge[0].lkey = qp->rmr->lkey;
845*d6b92ffaSHans Petter Selasky sge[1].addr = (uintptr_t) qp->rbuf + offset;
846*d6b92ffaSHans Petter Selasky sge[1].length = RS_SNDLOWAT;
847*d6b92ffaSHans Petter Selasky sge[1].lkey = qp->rmr->lkey;
848*d6b92ffaSHans Petter Selasky
849*d6b92ffaSHans Petter Selasky wr.wr_id = rs_recv_wr_id(offset);
850*d6b92ffaSHans Petter Selasky wr.next = NULL;
851*d6b92ffaSHans Petter Selasky wr.sg_list = sge;
852*d6b92ffaSHans Petter Selasky wr.num_sge = 2;
853*d6b92ffaSHans Petter Selasky
854*d6b92ffaSHans Petter Selasky return rdma_seterrno(ibv_post_recv(qp->cm_id->qp, &wr, &bad));
855*d6b92ffaSHans Petter Selasky }
856*d6b92ffaSHans Petter Selasky
rs_create_ep(struct rsocket * rs)857*d6b92ffaSHans Petter Selasky static int rs_create_ep(struct rsocket *rs)
858*d6b92ffaSHans Petter Selasky {
859*d6b92ffaSHans Petter Selasky struct ibv_qp_init_attr qp_attr;
860*d6b92ffaSHans Petter Selasky int i, ret;
861*d6b92ffaSHans Petter Selasky
862*d6b92ffaSHans Petter Selasky rs_set_qp_size(rs);
863*d6b92ffaSHans Petter Selasky if (rs->cm_id->verbs->device->transport_type == IBV_TRANSPORT_IWARP)
864*d6b92ffaSHans Petter Selasky rs->opts |= RS_OPT_MSG_SEND;
865*d6b92ffaSHans Petter Selasky ret = rs_create_cq(rs, rs->cm_id);
866*d6b92ffaSHans Petter Selasky if (ret)
867*d6b92ffaSHans Petter Selasky return ret;
868*d6b92ffaSHans Petter Selasky
869*d6b92ffaSHans Petter Selasky memset(&qp_attr, 0, sizeof qp_attr);
870*d6b92ffaSHans Petter Selasky qp_attr.qp_context = rs;
871*d6b92ffaSHans Petter Selasky qp_attr.send_cq = rs->cm_id->send_cq;
872*d6b92ffaSHans Petter Selasky qp_attr.recv_cq = rs->cm_id->recv_cq;
873*d6b92ffaSHans Petter Selasky qp_attr.qp_type = IBV_QPT_RC;
874*d6b92ffaSHans Petter Selasky qp_attr.sq_sig_all = 1;
875*d6b92ffaSHans Petter Selasky qp_attr.cap.max_send_wr = rs->sq_size;
876*d6b92ffaSHans Petter Selasky qp_attr.cap.max_recv_wr = rs->rq_size;
877*d6b92ffaSHans Petter Selasky qp_attr.cap.max_send_sge = 2;
878*d6b92ffaSHans Petter Selasky qp_attr.cap.max_recv_sge = 1;
879*d6b92ffaSHans Petter Selasky qp_attr.cap.max_inline_data = rs->sq_inline;
880*d6b92ffaSHans Petter Selasky
881*d6b92ffaSHans Petter Selasky ret = rdma_create_qp(rs->cm_id, NULL, &qp_attr);
882*d6b92ffaSHans Petter Selasky if (ret)
883*d6b92ffaSHans Petter Selasky return ret;
884*d6b92ffaSHans Petter Selasky
885*d6b92ffaSHans Petter Selasky rs->sq_inline = qp_attr.cap.max_inline_data;
886*d6b92ffaSHans Petter Selasky if ((rs->opts & RS_OPT_MSG_SEND) && (rs->sq_inline < RS_MSG_SIZE))
887*d6b92ffaSHans Petter Selasky return ERR(ENOTSUP);
888*d6b92ffaSHans Petter Selasky
889*d6b92ffaSHans Petter Selasky ret = rs_init_bufs(rs);
890*d6b92ffaSHans Petter Selasky if (ret)
891*d6b92ffaSHans Petter Selasky return ret;
892*d6b92ffaSHans Petter Selasky
893*d6b92ffaSHans Petter Selasky for (i = 0; i < rs->rq_size; i++) {
894*d6b92ffaSHans Petter Selasky ret = rs_post_recv(rs);
895*d6b92ffaSHans Petter Selasky if (ret)
896*d6b92ffaSHans Petter Selasky return ret;
897*d6b92ffaSHans Petter Selasky }
898*d6b92ffaSHans Petter Selasky return 0;
899*d6b92ffaSHans Petter Selasky }
900*d6b92ffaSHans Petter Selasky
rs_release_iomap_mr(struct rs_iomap_mr * iomr)901*d6b92ffaSHans Petter Selasky static void rs_release_iomap_mr(struct rs_iomap_mr *iomr)
902*d6b92ffaSHans Petter Selasky {
903*d6b92ffaSHans Petter Selasky if (atomic_fetch_sub(&iomr->refcnt, 1) != 1)
904*d6b92ffaSHans Petter Selasky return;
905*d6b92ffaSHans Petter Selasky
906*d6b92ffaSHans Petter Selasky dlist_remove(&iomr->entry);
907*d6b92ffaSHans Petter Selasky ibv_dereg_mr(iomr->mr);
908*d6b92ffaSHans Petter Selasky if (iomr->index >= 0)
909*d6b92ffaSHans Petter Selasky iomr->mr = NULL;
910*d6b92ffaSHans Petter Selasky else
911*d6b92ffaSHans Petter Selasky free(iomr);
912*d6b92ffaSHans Petter Selasky }
913*d6b92ffaSHans Petter Selasky
rs_free_iomappings(struct rsocket * rs)914*d6b92ffaSHans Petter Selasky static void rs_free_iomappings(struct rsocket *rs)
915*d6b92ffaSHans Petter Selasky {
916*d6b92ffaSHans Petter Selasky struct rs_iomap_mr *iomr;
917*d6b92ffaSHans Petter Selasky
918*d6b92ffaSHans Petter Selasky while (!dlist_empty(&rs->iomap_list)) {
919*d6b92ffaSHans Petter Selasky iomr = container_of(rs->iomap_list.next,
920*d6b92ffaSHans Petter Selasky struct rs_iomap_mr, entry);
921*d6b92ffaSHans Petter Selasky riounmap(rs->index, iomr->mr->addr, iomr->mr->length);
922*d6b92ffaSHans Petter Selasky }
923*d6b92ffaSHans Petter Selasky while (!dlist_empty(&rs->iomap_queue)) {
924*d6b92ffaSHans Petter Selasky iomr = container_of(rs->iomap_queue.next,
925*d6b92ffaSHans Petter Selasky struct rs_iomap_mr, entry);
926*d6b92ffaSHans Petter Selasky riounmap(rs->index, iomr->mr->addr, iomr->mr->length);
927*d6b92ffaSHans Petter Selasky }
928*d6b92ffaSHans Petter Selasky }
929*d6b92ffaSHans Petter Selasky
ds_free_qp(struct ds_qp * qp)930*d6b92ffaSHans Petter Selasky static void ds_free_qp(struct ds_qp *qp)
931*d6b92ffaSHans Petter Selasky {
932*d6b92ffaSHans Petter Selasky if (qp->smr)
933*d6b92ffaSHans Petter Selasky rdma_dereg_mr(qp->smr);
934*d6b92ffaSHans Petter Selasky
935*d6b92ffaSHans Petter Selasky if (qp->rbuf) {
936*d6b92ffaSHans Petter Selasky if (qp->rmr)
937*d6b92ffaSHans Petter Selasky rdma_dereg_mr(qp->rmr);
938*d6b92ffaSHans Petter Selasky free(qp->rbuf);
939*d6b92ffaSHans Petter Selasky }
940*d6b92ffaSHans Petter Selasky
941*d6b92ffaSHans Petter Selasky if (qp->cm_id) {
942*d6b92ffaSHans Petter Selasky if (qp->cm_id->qp) {
943*d6b92ffaSHans Petter Selasky tdelete(&qp->dest.addr, &qp->rs->dest_map, ds_compare_addr);
944*d6b92ffaSHans Petter Selasky epoll_ctl(qp->rs->epfd, EPOLL_CTL_DEL,
945*d6b92ffaSHans Petter Selasky qp->cm_id->recv_cq_channel->fd, NULL);
946*d6b92ffaSHans Petter Selasky rdma_destroy_qp(qp->cm_id);
947*d6b92ffaSHans Petter Selasky }
948*d6b92ffaSHans Petter Selasky rdma_destroy_id(qp->cm_id);
949*d6b92ffaSHans Petter Selasky }
950*d6b92ffaSHans Petter Selasky
951*d6b92ffaSHans Petter Selasky free(qp);
952*d6b92ffaSHans Petter Selasky }
953*d6b92ffaSHans Petter Selasky
ds_free(struct rsocket * rs)954*d6b92ffaSHans Petter Selasky static void ds_free(struct rsocket *rs)
955*d6b92ffaSHans Petter Selasky {
956*d6b92ffaSHans Petter Selasky struct ds_qp *qp;
957*d6b92ffaSHans Petter Selasky
958*d6b92ffaSHans Petter Selasky if (rs->udp_sock >= 0)
959*d6b92ffaSHans Petter Selasky close(rs->udp_sock);
960*d6b92ffaSHans Petter Selasky
961*d6b92ffaSHans Petter Selasky if (rs->index >= 0)
962*d6b92ffaSHans Petter Selasky rs_remove(rs);
963*d6b92ffaSHans Petter Selasky
964*d6b92ffaSHans Petter Selasky if (rs->dmsg)
965*d6b92ffaSHans Petter Selasky free(rs->dmsg);
966*d6b92ffaSHans Petter Selasky
967*d6b92ffaSHans Petter Selasky while ((qp = rs->qp_list)) {
968*d6b92ffaSHans Petter Selasky ds_remove_qp(rs, qp);
969*d6b92ffaSHans Petter Selasky ds_free_qp(qp);
970*d6b92ffaSHans Petter Selasky }
971*d6b92ffaSHans Petter Selasky
972*d6b92ffaSHans Petter Selasky if (rs->epfd >= 0)
973*d6b92ffaSHans Petter Selasky close(rs->epfd);
974*d6b92ffaSHans Petter Selasky
975*d6b92ffaSHans Petter Selasky if (rs->sbuf)
976*d6b92ffaSHans Petter Selasky free(rs->sbuf);
977*d6b92ffaSHans Petter Selasky
978*d6b92ffaSHans Petter Selasky tdestroy(rs->dest_map, free);
979*d6b92ffaSHans Petter Selasky fastlock_destroy(&rs->map_lock);
980*d6b92ffaSHans Petter Selasky fastlock_destroy(&rs->cq_wait_lock);
981*d6b92ffaSHans Petter Selasky fastlock_destroy(&rs->cq_lock);
982*d6b92ffaSHans Petter Selasky fastlock_destroy(&rs->rlock);
983*d6b92ffaSHans Petter Selasky fastlock_destroy(&rs->slock);
984*d6b92ffaSHans Petter Selasky free(rs);
985*d6b92ffaSHans Petter Selasky }
986*d6b92ffaSHans Petter Selasky
rs_free(struct rsocket * rs)987*d6b92ffaSHans Petter Selasky static void rs_free(struct rsocket *rs)
988*d6b92ffaSHans Petter Selasky {
989*d6b92ffaSHans Petter Selasky if (rs->type == SOCK_DGRAM) {
990*d6b92ffaSHans Petter Selasky ds_free(rs);
991*d6b92ffaSHans Petter Selasky return;
992*d6b92ffaSHans Petter Selasky }
993*d6b92ffaSHans Petter Selasky
994*d6b92ffaSHans Petter Selasky if (rs->rmsg)
995*d6b92ffaSHans Petter Selasky free(rs->rmsg);
996*d6b92ffaSHans Petter Selasky
997*d6b92ffaSHans Petter Selasky if (rs->sbuf) {
998*d6b92ffaSHans Petter Selasky if (rs->smr)
999*d6b92ffaSHans Petter Selasky rdma_dereg_mr(rs->smr);
1000*d6b92ffaSHans Petter Selasky free(rs->sbuf);
1001*d6b92ffaSHans Petter Selasky }
1002*d6b92ffaSHans Petter Selasky
1003*d6b92ffaSHans Petter Selasky if (rs->rbuf) {
1004*d6b92ffaSHans Petter Selasky if (rs->rmr)
1005*d6b92ffaSHans Petter Selasky rdma_dereg_mr(rs->rmr);
1006*d6b92ffaSHans Petter Selasky free(rs->rbuf);
1007*d6b92ffaSHans Petter Selasky }
1008*d6b92ffaSHans Petter Selasky
1009*d6b92ffaSHans Petter Selasky if (rs->target_buffer_list) {
1010*d6b92ffaSHans Petter Selasky if (rs->target_mr)
1011*d6b92ffaSHans Petter Selasky rdma_dereg_mr(rs->target_mr);
1012*d6b92ffaSHans Petter Selasky free(rs->target_buffer_list);
1013*d6b92ffaSHans Petter Selasky }
1014*d6b92ffaSHans Petter Selasky
1015*d6b92ffaSHans Petter Selasky if (rs->cm_id) {
1016*d6b92ffaSHans Petter Selasky rs_free_iomappings(rs);
1017*d6b92ffaSHans Petter Selasky if (rs->cm_id->qp) {
1018*d6b92ffaSHans Petter Selasky ibv_ack_cq_events(rs->cm_id->recv_cq, rs->unack_cqe);
1019*d6b92ffaSHans Petter Selasky rdma_destroy_qp(rs->cm_id);
1020*d6b92ffaSHans Petter Selasky }
1021*d6b92ffaSHans Petter Selasky rdma_destroy_id(rs->cm_id);
1022*d6b92ffaSHans Petter Selasky }
1023*d6b92ffaSHans Petter Selasky
1024*d6b92ffaSHans Petter Selasky if (rs->index >= 0)
1025*d6b92ffaSHans Petter Selasky rs_remove(rs);
1026*d6b92ffaSHans Petter Selasky
1027*d6b92ffaSHans Petter Selasky fastlock_destroy(&rs->map_lock);
1028*d6b92ffaSHans Petter Selasky fastlock_destroy(&rs->cq_wait_lock);
1029*d6b92ffaSHans Petter Selasky fastlock_destroy(&rs->cq_lock);
1030*d6b92ffaSHans Petter Selasky fastlock_destroy(&rs->rlock);
1031*d6b92ffaSHans Petter Selasky fastlock_destroy(&rs->slock);
1032*d6b92ffaSHans Petter Selasky free(rs);
1033*d6b92ffaSHans Petter Selasky }
1034*d6b92ffaSHans Petter Selasky
rs_conn_data_offset(struct rsocket * rs)1035*d6b92ffaSHans Petter Selasky static size_t rs_conn_data_offset(struct rsocket *rs)
1036*d6b92ffaSHans Petter Selasky {
1037*d6b92ffaSHans Petter Selasky return (rs->cm_id->route.addr.src_addr.sa_family == AF_IB) ?
1038*d6b92ffaSHans Petter Selasky sizeof(struct ib_connect_hdr) : 0;
1039*d6b92ffaSHans Petter Selasky }
1040*d6b92ffaSHans Petter Selasky
rs_format_conn_data(struct rsocket * rs,struct rs_conn_data * conn)1041*d6b92ffaSHans Petter Selasky static void rs_format_conn_data(struct rsocket *rs, struct rs_conn_data *conn)
1042*d6b92ffaSHans Petter Selasky {
1043*d6b92ffaSHans Petter Selasky conn->version = 1;
1044*d6b92ffaSHans Petter Selasky conn->flags = RS_CONN_FLAG_IOMAP |
1045*d6b92ffaSHans Petter Selasky (rs_host_is_net() ? RS_CONN_FLAG_NET : 0);
1046*d6b92ffaSHans Petter Selasky conn->credits = htobe16(rs->rq_size);
1047*d6b92ffaSHans Petter Selasky memset(conn->reserved, 0, sizeof conn->reserved);
1048*d6b92ffaSHans Petter Selasky conn->target_iomap_size = (uint8_t) rs_value_to_scale(rs->target_iomap_size, 8);
1049*d6b92ffaSHans Petter Selasky
1050*d6b92ffaSHans Petter Selasky conn->target_sgl.addr = (__force uint64_t)htobe64((uintptr_t) rs->target_sgl);
1051*d6b92ffaSHans Petter Selasky conn->target_sgl.length = (__force uint32_t)htobe32(RS_SGL_SIZE);
1052*d6b92ffaSHans Petter Selasky conn->target_sgl.key = (__force uint32_t)htobe32(rs->target_mr->rkey);
1053*d6b92ffaSHans Petter Selasky
1054*d6b92ffaSHans Petter Selasky conn->data_buf.addr = (__force uint64_t)htobe64((uintptr_t) rs->rbuf);
1055*d6b92ffaSHans Petter Selasky conn->data_buf.length = (__force uint32_t)htobe32(rs->rbuf_size >> 1);
1056*d6b92ffaSHans Petter Selasky conn->data_buf.key = (__force uint32_t)htobe32(rs->rmr->rkey);
1057*d6b92ffaSHans Petter Selasky }
1058*d6b92ffaSHans Petter Selasky
rs_save_conn_data(struct rsocket * rs,struct rs_conn_data * conn)1059*d6b92ffaSHans Petter Selasky static void rs_save_conn_data(struct rsocket *rs, struct rs_conn_data *conn)
1060*d6b92ffaSHans Petter Selasky {
1061*d6b92ffaSHans Petter Selasky rs->remote_sgl.addr = be64toh((__force __be64)conn->target_sgl.addr);
1062*d6b92ffaSHans Petter Selasky rs->remote_sgl.length = be32toh((__force __be32)conn->target_sgl.length);
1063*d6b92ffaSHans Petter Selasky rs->remote_sgl.key = be32toh((__force __be32)conn->target_sgl.key);
1064*d6b92ffaSHans Petter Selasky rs->remote_sge = 1;
1065*d6b92ffaSHans Petter Selasky if ((rs_host_is_net() && !(conn->flags & RS_CONN_FLAG_NET)) ||
1066*d6b92ffaSHans Petter Selasky (!rs_host_is_net() && (conn->flags & RS_CONN_FLAG_NET)))
1067*d6b92ffaSHans Petter Selasky rs->opts = RS_OPT_SWAP_SGL;
1068*d6b92ffaSHans Petter Selasky
1069*d6b92ffaSHans Petter Selasky if (conn->flags & RS_CONN_FLAG_IOMAP) {
1070*d6b92ffaSHans Petter Selasky rs->remote_iomap.addr = rs->remote_sgl.addr +
1071*d6b92ffaSHans Petter Selasky sizeof(rs->remote_sgl) * rs->remote_sgl.length;
1072*d6b92ffaSHans Petter Selasky rs->remote_iomap.length = rs_scale_to_value(conn->target_iomap_size, 8);
1073*d6b92ffaSHans Petter Selasky rs->remote_iomap.key = rs->remote_sgl.key;
1074*d6b92ffaSHans Petter Selasky }
1075*d6b92ffaSHans Petter Selasky
1076*d6b92ffaSHans Petter Selasky rs->target_sgl[0].addr = be64toh((__force __be64)conn->data_buf.addr);
1077*d6b92ffaSHans Petter Selasky rs->target_sgl[0].length = be32toh((__force __be32)conn->data_buf.length);
1078*d6b92ffaSHans Petter Selasky rs->target_sgl[0].key = be32toh((__force __be32)conn->data_buf.key);
1079*d6b92ffaSHans Petter Selasky
1080*d6b92ffaSHans Petter Selasky rs->sseq_comp = be16toh(conn->credits);
1081*d6b92ffaSHans Petter Selasky }
1082*d6b92ffaSHans Petter Selasky
ds_init(struct rsocket * rs,int domain)1083*d6b92ffaSHans Petter Selasky static int ds_init(struct rsocket *rs, int domain)
1084*d6b92ffaSHans Petter Selasky {
1085*d6b92ffaSHans Petter Selasky rs->udp_sock = socket(domain, SOCK_DGRAM, 0);
1086*d6b92ffaSHans Petter Selasky if (rs->udp_sock < 0)
1087*d6b92ffaSHans Petter Selasky return rs->udp_sock;
1088*d6b92ffaSHans Petter Selasky
1089*d6b92ffaSHans Petter Selasky rs->epfd = epoll_create(2);
1090*d6b92ffaSHans Petter Selasky if (rs->epfd < 0)
1091*d6b92ffaSHans Petter Selasky return rs->epfd;
1092*d6b92ffaSHans Petter Selasky
1093*d6b92ffaSHans Petter Selasky return 0;
1094*d6b92ffaSHans Petter Selasky }
1095*d6b92ffaSHans Petter Selasky
ds_init_ep(struct rsocket * rs)1096*d6b92ffaSHans Petter Selasky static int ds_init_ep(struct rsocket *rs)
1097*d6b92ffaSHans Petter Selasky {
1098*d6b92ffaSHans Petter Selasky struct ds_smsg *msg;
1099*d6b92ffaSHans Petter Selasky int i, ret;
1100*d6b92ffaSHans Petter Selasky
1101*d6b92ffaSHans Petter Selasky ds_set_qp_size(rs);
1102*d6b92ffaSHans Petter Selasky
1103*d6b92ffaSHans Petter Selasky rs->sbuf = calloc(rs->sq_size, RS_SNDLOWAT);
1104*d6b92ffaSHans Petter Selasky if (!rs->sbuf)
1105*d6b92ffaSHans Petter Selasky return ERR(ENOMEM);
1106*d6b92ffaSHans Petter Selasky
1107*d6b92ffaSHans Petter Selasky rs->dmsg = calloc(rs->rq_size + 1, sizeof(*rs->dmsg));
1108*d6b92ffaSHans Petter Selasky if (!rs->dmsg)
1109*d6b92ffaSHans Petter Selasky return ERR(ENOMEM);
1110*d6b92ffaSHans Petter Selasky
1111*d6b92ffaSHans Petter Selasky rs->sqe_avail = rs->sq_size;
1112*d6b92ffaSHans Petter Selasky rs->rqe_avail = rs->rq_size;
1113*d6b92ffaSHans Petter Selasky
1114*d6b92ffaSHans Petter Selasky rs->smsg_free = (struct ds_smsg *) rs->sbuf;
1115*d6b92ffaSHans Petter Selasky msg = rs->smsg_free;
1116*d6b92ffaSHans Petter Selasky for (i = 0; i < rs->sq_size - 1; i++) {
1117*d6b92ffaSHans Petter Selasky msg->next = (void *) msg + RS_SNDLOWAT;
1118*d6b92ffaSHans Petter Selasky msg = msg->next;
1119*d6b92ffaSHans Petter Selasky }
1120*d6b92ffaSHans Petter Selasky msg->next = NULL;
1121*d6b92ffaSHans Petter Selasky
1122*d6b92ffaSHans Petter Selasky ret = rs_notify_svc(&udp_svc, rs, RS_SVC_ADD_DGRAM);
1123*d6b92ffaSHans Petter Selasky if (ret)
1124*d6b92ffaSHans Petter Selasky return ret;
1125*d6b92ffaSHans Petter Selasky
1126*d6b92ffaSHans Petter Selasky rs->state = rs_readable | rs_writable;
1127*d6b92ffaSHans Petter Selasky return 0;
1128*d6b92ffaSHans Petter Selasky }
1129*d6b92ffaSHans Petter Selasky
rsocket(int domain,int type,int protocol)1130*d6b92ffaSHans Petter Selasky int rsocket(int domain, int type, int protocol)
1131*d6b92ffaSHans Petter Selasky {
1132*d6b92ffaSHans Petter Selasky struct rsocket *rs;
1133*d6b92ffaSHans Petter Selasky int index, ret;
1134*d6b92ffaSHans Petter Selasky
1135*d6b92ffaSHans Petter Selasky if ((domain != AF_INET && domain != AF_INET6 && domain != AF_IB) ||
1136*d6b92ffaSHans Petter Selasky ((type != SOCK_STREAM) && (type != SOCK_DGRAM)) ||
1137*d6b92ffaSHans Petter Selasky (type == SOCK_STREAM && protocol && protocol != IPPROTO_TCP) ||
1138*d6b92ffaSHans Petter Selasky (type == SOCK_DGRAM && protocol && protocol != IPPROTO_UDP))
1139*d6b92ffaSHans Petter Selasky return ERR(ENOTSUP);
1140*d6b92ffaSHans Petter Selasky
1141*d6b92ffaSHans Petter Selasky rs_configure();
1142*d6b92ffaSHans Petter Selasky rs = rs_alloc(NULL, type);
1143*d6b92ffaSHans Petter Selasky if (!rs)
1144*d6b92ffaSHans Petter Selasky return ERR(ENOMEM);
1145*d6b92ffaSHans Petter Selasky
1146*d6b92ffaSHans Petter Selasky if (type == SOCK_STREAM) {
1147*d6b92ffaSHans Petter Selasky ret = rdma_create_id(NULL, &rs->cm_id, rs, RDMA_PS_TCP);
1148*d6b92ffaSHans Petter Selasky if (ret)
1149*d6b92ffaSHans Petter Selasky goto err;
1150*d6b92ffaSHans Petter Selasky
1151*d6b92ffaSHans Petter Selasky rs->cm_id->route.addr.src_addr.sa_family = domain;
1152*d6b92ffaSHans Petter Selasky index = rs->cm_id->channel->fd;
1153*d6b92ffaSHans Petter Selasky } else {
1154*d6b92ffaSHans Petter Selasky ret = ds_init(rs, domain);
1155*d6b92ffaSHans Petter Selasky if (ret)
1156*d6b92ffaSHans Petter Selasky goto err;
1157*d6b92ffaSHans Petter Selasky
1158*d6b92ffaSHans Petter Selasky index = rs->udp_sock;
1159*d6b92ffaSHans Petter Selasky }
1160*d6b92ffaSHans Petter Selasky
1161*d6b92ffaSHans Petter Selasky ret = rs_insert(rs, index);
1162*d6b92ffaSHans Petter Selasky if (ret < 0)
1163*d6b92ffaSHans Petter Selasky goto err;
1164*d6b92ffaSHans Petter Selasky
1165*d6b92ffaSHans Petter Selasky return rs->index;
1166*d6b92ffaSHans Petter Selasky
1167*d6b92ffaSHans Petter Selasky err:
1168*d6b92ffaSHans Petter Selasky rs_free(rs);
1169*d6b92ffaSHans Petter Selasky return ret;
1170*d6b92ffaSHans Petter Selasky }
1171*d6b92ffaSHans Petter Selasky
rbind(int socket,const struct sockaddr * addr,socklen_t addrlen)1172*d6b92ffaSHans Petter Selasky int rbind(int socket, const struct sockaddr *addr, socklen_t addrlen)
1173*d6b92ffaSHans Petter Selasky {
1174*d6b92ffaSHans Petter Selasky struct rsocket *rs;
1175*d6b92ffaSHans Petter Selasky int ret;
1176*d6b92ffaSHans Petter Selasky
1177*d6b92ffaSHans Petter Selasky rs = idm_lookup(&idm, socket);
1178*d6b92ffaSHans Petter Selasky if (!rs)
1179*d6b92ffaSHans Petter Selasky return ERR(EBADF);
1180*d6b92ffaSHans Petter Selasky if (rs->type == SOCK_STREAM) {
1181*d6b92ffaSHans Petter Selasky ret = rdma_bind_addr(rs->cm_id, (struct sockaddr *) addr);
1182*d6b92ffaSHans Petter Selasky if (!ret)
1183*d6b92ffaSHans Petter Selasky rs->state = rs_bound;
1184*d6b92ffaSHans Petter Selasky } else {
1185*d6b92ffaSHans Petter Selasky if (rs->state == rs_init) {
1186*d6b92ffaSHans Petter Selasky ret = ds_init_ep(rs);
1187*d6b92ffaSHans Petter Selasky if (ret)
1188*d6b92ffaSHans Petter Selasky return ret;
1189*d6b92ffaSHans Petter Selasky }
1190*d6b92ffaSHans Petter Selasky ret = bind(rs->udp_sock, addr, addrlen);
1191*d6b92ffaSHans Petter Selasky }
1192*d6b92ffaSHans Petter Selasky return ret;
1193*d6b92ffaSHans Petter Selasky }
1194*d6b92ffaSHans Petter Selasky
rlisten(int socket,int backlog)1195*d6b92ffaSHans Petter Selasky int rlisten(int socket, int backlog)
1196*d6b92ffaSHans Petter Selasky {
1197*d6b92ffaSHans Petter Selasky struct rsocket *rs;
1198*d6b92ffaSHans Petter Selasky int ret;
1199*d6b92ffaSHans Petter Selasky
1200*d6b92ffaSHans Petter Selasky rs = idm_lookup(&idm, socket);
1201*d6b92ffaSHans Petter Selasky if (!rs)
1202*d6b92ffaSHans Petter Selasky return ERR(EBADF);
1203*d6b92ffaSHans Petter Selasky
1204*d6b92ffaSHans Petter Selasky if (rs->state != rs_listening) {
1205*d6b92ffaSHans Petter Selasky ret = rdma_listen(rs->cm_id, backlog);
1206*d6b92ffaSHans Petter Selasky if (!ret)
1207*d6b92ffaSHans Petter Selasky rs->state = rs_listening;
1208*d6b92ffaSHans Petter Selasky } else {
1209*d6b92ffaSHans Petter Selasky ret = 0;
1210*d6b92ffaSHans Petter Selasky }
1211*d6b92ffaSHans Petter Selasky return ret;
1212*d6b92ffaSHans Petter Selasky }
1213*d6b92ffaSHans Petter Selasky
1214*d6b92ffaSHans Petter Selasky /*
1215*d6b92ffaSHans Petter Selasky * Nonblocking is usually not inherited between sockets, but we need to
1216*d6b92ffaSHans Petter Selasky * inherit it here to establish the connection only. This is needed to
1217*d6b92ffaSHans Petter Selasky * prevent rdma_accept from blocking until the remote side finishes
1218*d6b92ffaSHans Petter Selasky * establishing the connection. If we were to allow rdma_accept to block,
1219*d6b92ffaSHans Petter Selasky * then a single thread cannot establish a connection with itself, or
1220*d6b92ffaSHans Petter Selasky * two threads which try to connect to each other can deadlock trying to
1221*d6b92ffaSHans Petter Selasky * form a connection.
1222*d6b92ffaSHans Petter Selasky *
1223*d6b92ffaSHans Petter Selasky * Data transfers on the new socket remain blocking unless the user
1224*d6b92ffaSHans Petter Selasky * specifies otherwise through rfcntl.
1225*d6b92ffaSHans Petter Selasky */
raccept(int socket,struct sockaddr * addr,socklen_t * addrlen)1226*d6b92ffaSHans Petter Selasky int raccept(int socket, struct sockaddr *addr, socklen_t *addrlen)
1227*d6b92ffaSHans Petter Selasky {
1228*d6b92ffaSHans Petter Selasky struct rsocket *rs, *new_rs;
1229*d6b92ffaSHans Petter Selasky struct rdma_conn_param param;
1230*d6b92ffaSHans Petter Selasky struct rs_conn_data *creq, cresp;
1231*d6b92ffaSHans Petter Selasky int ret;
1232*d6b92ffaSHans Petter Selasky
1233*d6b92ffaSHans Petter Selasky rs = idm_lookup(&idm, socket);
1234*d6b92ffaSHans Petter Selasky if (!rs)
1235*d6b92ffaSHans Petter Selasky return ERR(EBADF);
1236*d6b92ffaSHans Petter Selasky new_rs = rs_alloc(rs, rs->type);
1237*d6b92ffaSHans Petter Selasky if (!new_rs)
1238*d6b92ffaSHans Petter Selasky return ERR(ENOMEM);
1239*d6b92ffaSHans Petter Selasky
1240*d6b92ffaSHans Petter Selasky ret = rdma_get_request(rs->cm_id, &new_rs->cm_id);
1241*d6b92ffaSHans Petter Selasky if (ret)
1242*d6b92ffaSHans Petter Selasky goto err;
1243*d6b92ffaSHans Petter Selasky
1244*d6b92ffaSHans Petter Selasky ret = rs_insert(new_rs, new_rs->cm_id->channel->fd);
1245*d6b92ffaSHans Petter Selasky if (ret < 0)
1246*d6b92ffaSHans Petter Selasky goto err;
1247*d6b92ffaSHans Petter Selasky
1248*d6b92ffaSHans Petter Selasky creq = (struct rs_conn_data *)
1249*d6b92ffaSHans Petter Selasky (new_rs->cm_id->event->param.conn.private_data + rs_conn_data_offset(rs));
1250*d6b92ffaSHans Petter Selasky if (creq->version != 1) {
1251*d6b92ffaSHans Petter Selasky ret = ERR(ENOTSUP);
1252*d6b92ffaSHans Petter Selasky goto err;
1253*d6b92ffaSHans Petter Selasky }
1254*d6b92ffaSHans Petter Selasky
1255*d6b92ffaSHans Petter Selasky if (rs->fd_flags & O_NONBLOCK)
1256*d6b92ffaSHans Petter Selasky fcntl(new_rs->cm_id->channel->fd, F_SETFL, O_NONBLOCK);
1257*d6b92ffaSHans Petter Selasky
1258*d6b92ffaSHans Petter Selasky ret = rs_create_ep(new_rs);
1259*d6b92ffaSHans Petter Selasky if (ret)
1260*d6b92ffaSHans Petter Selasky goto err;
1261*d6b92ffaSHans Petter Selasky
1262*d6b92ffaSHans Petter Selasky rs_save_conn_data(new_rs, creq);
1263*d6b92ffaSHans Petter Selasky param = new_rs->cm_id->event->param.conn;
1264*d6b92ffaSHans Petter Selasky rs_format_conn_data(new_rs, &cresp);
1265*d6b92ffaSHans Petter Selasky param.private_data = &cresp;
1266*d6b92ffaSHans Petter Selasky param.private_data_len = sizeof cresp;
1267*d6b92ffaSHans Petter Selasky ret = rdma_accept(new_rs->cm_id, ¶m);
1268*d6b92ffaSHans Petter Selasky if (!ret)
1269*d6b92ffaSHans Petter Selasky new_rs->state = rs_connect_rdwr;
1270*d6b92ffaSHans Petter Selasky else if (errno == EAGAIN || errno == EWOULDBLOCK)
1271*d6b92ffaSHans Petter Selasky new_rs->state = rs_accepting;
1272*d6b92ffaSHans Petter Selasky else
1273*d6b92ffaSHans Petter Selasky goto err;
1274*d6b92ffaSHans Petter Selasky
1275*d6b92ffaSHans Petter Selasky if (addr && addrlen)
1276*d6b92ffaSHans Petter Selasky rgetpeername(new_rs->index, addr, addrlen);
1277*d6b92ffaSHans Petter Selasky return new_rs->index;
1278*d6b92ffaSHans Petter Selasky
1279*d6b92ffaSHans Petter Selasky err:
1280*d6b92ffaSHans Petter Selasky rs_free(new_rs);
1281*d6b92ffaSHans Petter Selasky return ret;
1282*d6b92ffaSHans Petter Selasky }
1283*d6b92ffaSHans Petter Selasky
rs_do_connect(struct rsocket * rs)1284*d6b92ffaSHans Petter Selasky static int rs_do_connect(struct rsocket *rs)
1285*d6b92ffaSHans Petter Selasky {
1286*d6b92ffaSHans Petter Selasky struct rdma_conn_param param;
1287*d6b92ffaSHans Petter Selasky struct rs_conn_private_data cdata;
1288*d6b92ffaSHans Petter Selasky struct rs_conn_data *creq, *cresp;
1289*d6b92ffaSHans Petter Selasky int to, ret;
1290*d6b92ffaSHans Petter Selasky
1291*d6b92ffaSHans Petter Selasky switch (rs->state) {
1292*d6b92ffaSHans Petter Selasky case rs_init:
1293*d6b92ffaSHans Petter Selasky case rs_bound:
1294*d6b92ffaSHans Petter Selasky resolve_addr:
1295*d6b92ffaSHans Petter Selasky to = 1000 << rs->retries++;
1296*d6b92ffaSHans Petter Selasky ret = rdma_resolve_addr(rs->cm_id, NULL,
1297*d6b92ffaSHans Petter Selasky &rs->cm_id->route.addr.dst_addr, to);
1298*d6b92ffaSHans Petter Selasky if (!ret)
1299*d6b92ffaSHans Petter Selasky goto resolve_route;
1300*d6b92ffaSHans Petter Selasky if (errno == EAGAIN || errno == EWOULDBLOCK)
1301*d6b92ffaSHans Petter Selasky rs->state = rs_resolving_addr;
1302*d6b92ffaSHans Petter Selasky break;
1303*d6b92ffaSHans Petter Selasky case rs_resolving_addr:
1304*d6b92ffaSHans Petter Selasky ret = ucma_complete(rs->cm_id);
1305*d6b92ffaSHans Petter Selasky if (ret) {
1306*d6b92ffaSHans Petter Selasky if (errno == ETIMEDOUT && rs->retries <= RS_CONN_RETRIES)
1307*d6b92ffaSHans Petter Selasky goto resolve_addr;
1308*d6b92ffaSHans Petter Selasky break;
1309*d6b92ffaSHans Petter Selasky }
1310*d6b92ffaSHans Petter Selasky
1311*d6b92ffaSHans Petter Selasky rs->retries = 0;
1312*d6b92ffaSHans Petter Selasky resolve_route:
1313*d6b92ffaSHans Petter Selasky to = 1000 << rs->retries++;
1314*d6b92ffaSHans Petter Selasky if (rs->optval) {
1315*d6b92ffaSHans Petter Selasky ret = rdma_set_option(rs->cm_id, RDMA_OPTION_IB,
1316*d6b92ffaSHans Petter Selasky RDMA_OPTION_IB_PATH, rs->optval,
1317*d6b92ffaSHans Petter Selasky rs->optlen);
1318*d6b92ffaSHans Petter Selasky free(rs->optval);
1319*d6b92ffaSHans Petter Selasky rs->optval = NULL;
1320*d6b92ffaSHans Petter Selasky if (!ret) {
1321*d6b92ffaSHans Petter Selasky rs->state = rs_resolving_route;
1322*d6b92ffaSHans Petter Selasky goto resolving_route;
1323*d6b92ffaSHans Petter Selasky }
1324*d6b92ffaSHans Petter Selasky } else {
1325*d6b92ffaSHans Petter Selasky ret = rdma_resolve_route(rs->cm_id, to);
1326*d6b92ffaSHans Petter Selasky if (!ret)
1327*d6b92ffaSHans Petter Selasky goto do_connect;
1328*d6b92ffaSHans Petter Selasky }
1329*d6b92ffaSHans Petter Selasky if (errno == EAGAIN || errno == EWOULDBLOCK)
1330*d6b92ffaSHans Petter Selasky rs->state = rs_resolving_route;
1331*d6b92ffaSHans Petter Selasky break;
1332*d6b92ffaSHans Petter Selasky case rs_resolving_route:
1333*d6b92ffaSHans Petter Selasky resolving_route:
1334*d6b92ffaSHans Petter Selasky ret = ucma_complete(rs->cm_id);
1335*d6b92ffaSHans Petter Selasky if (ret) {
1336*d6b92ffaSHans Petter Selasky if (errno == ETIMEDOUT && rs->retries <= RS_CONN_RETRIES)
1337*d6b92ffaSHans Petter Selasky goto resolve_route;
1338*d6b92ffaSHans Petter Selasky break;
1339*d6b92ffaSHans Petter Selasky }
1340*d6b92ffaSHans Petter Selasky do_connect:
1341*d6b92ffaSHans Petter Selasky ret = rs_create_ep(rs);
1342*d6b92ffaSHans Petter Selasky if (ret)
1343*d6b92ffaSHans Petter Selasky break;
1344*d6b92ffaSHans Petter Selasky
1345*d6b92ffaSHans Petter Selasky memset(¶m, 0, sizeof param);
1346*d6b92ffaSHans Petter Selasky creq = (void *) &cdata + rs_conn_data_offset(rs);
1347*d6b92ffaSHans Petter Selasky rs_format_conn_data(rs, creq);
1348*d6b92ffaSHans Petter Selasky param.private_data = (void *) creq - rs_conn_data_offset(rs);
1349*d6b92ffaSHans Petter Selasky param.private_data_len = sizeof(*creq) + rs_conn_data_offset(rs);
1350*d6b92ffaSHans Petter Selasky param.flow_control = 1;
1351*d6b92ffaSHans Petter Selasky param.retry_count = 7;
1352*d6b92ffaSHans Petter Selasky param.rnr_retry_count = 7;
1353*d6b92ffaSHans Petter Selasky /* work-around: iWarp issues RDMA read during connection */
1354*d6b92ffaSHans Petter Selasky if (rs->opts & RS_OPT_MSG_SEND)
1355*d6b92ffaSHans Petter Selasky param.initiator_depth = 1;
1356*d6b92ffaSHans Petter Selasky rs->retries = 0;
1357*d6b92ffaSHans Petter Selasky
1358*d6b92ffaSHans Petter Selasky ret = rdma_connect(rs->cm_id, ¶m);
1359*d6b92ffaSHans Petter Selasky if (!ret)
1360*d6b92ffaSHans Petter Selasky goto connected;
1361*d6b92ffaSHans Petter Selasky if (errno == EAGAIN || errno == EWOULDBLOCK)
1362*d6b92ffaSHans Petter Selasky rs->state = rs_connecting;
1363*d6b92ffaSHans Petter Selasky break;
1364*d6b92ffaSHans Petter Selasky case rs_connecting:
1365*d6b92ffaSHans Petter Selasky ret = ucma_complete(rs->cm_id);
1366*d6b92ffaSHans Petter Selasky if (ret)
1367*d6b92ffaSHans Petter Selasky break;
1368*d6b92ffaSHans Petter Selasky connected:
1369*d6b92ffaSHans Petter Selasky cresp = (struct rs_conn_data *) rs->cm_id->event->param.conn.private_data;
1370*d6b92ffaSHans Petter Selasky if (cresp->version != 1) {
1371*d6b92ffaSHans Petter Selasky ret = ERR(ENOTSUP);
1372*d6b92ffaSHans Petter Selasky break;
1373*d6b92ffaSHans Petter Selasky }
1374*d6b92ffaSHans Petter Selasky
1375*d6b92ffaSHans Petter Selasky rs_save_conn_data(rs, cresp);
1376*d6b92ffaSHans Petter Selasky rs->state = rs_connect_rdwr;
1377*d6b92ffaSHans Petter Selasky break;
1378*d6b92ffaSHans Petter Selasky case rs_accepting:
1379*d6b92ffaSHans Petter Selasky if (!(rs->fd_flags & O_NONBLOCK))
1380*d6b92ffaSHans Petter Selasky fcntl(rs->cm_id->channel->fd, F_SETFL, 0);
1381*d6b92ffaSHans Petter Selasky
1382*d6b92ffaSHans Petter Selasky ret = ucma_complete(rs->cm_id);
1383*d6b92ffaSHans Petter Selasky if (ret)
1384*d6b92ffaSHans Petter Selasky break;
1385*d6b92ffaSHans Petter Selasky
1386*d6b92ffaSHans Petter Selasky rs->state = rs_connect_rdwr;
1387*d6b92ffaSHans Petter Selasky break;
1388*d6b92ffaSHans Petter Selasky default:
1389*d6b92ffaSHans Petter Selasky ret = ERR(EINVAL);
1390*d6b92ffaSHans Petter Selasky break;
1391*d6b92ffaSHans Petter Selasky }
1392*d6b92ffaSHans Petter Selasky
1393*d6b92ffaSHans Petter Selasky if (ret) {
1394*d6b92ffaSHans Petter Selasky if (errno == EAGAIN || errno == EWOULDBLOCK) {
1395*d6b92ffaSHans Petter Selasky errno = EINPROGRESS;
1396*d6b92ffaSHans Petter Selasky } else {
1397*d6b92ffaSHans Petter Selasky rs->state = rs_connect_error;
1398*d6b92ffaSHans Petter Selasky rs->err = errno;
1399*d6b92ffaSHans Petter Selasky }
1400*d6b92ffaSHans Petter Selasky }
1401*d6b92ffaSHans Petter Selasky return ret;
1402*d6b92ffaSHans Petter Selasky }
1403*d6b92ffaSHans Petter Selasky
rs_any_addr(const union socket_addr * addr)1404*d6b92ffaSHans Petter Selasky static int rs_any_addr(const union socket_addr *addr)
1405*d6b92ffaSHans Petter Selasky {
1406*d6b92ffaSHans Petter Selasky if (addr->sa.sa_family == AF_INET) {
1407*d6b92ffaSHans Petter Selasky return (addr->sin.sin_addr.s_addr == htobe32(INADDR_ANY) ||
1408*d6b92ffaSHans Petter Selasky addr->sin.sin_addr.s_addr == htobe32(INADDR_LOOPBACK));
1409*d6b92ffaSHans Petter Selasky } else {
1410*d6b92ffaSHans Petter Selasky return (!memcmp(&addr->sin6.sin6_addr, &in6addr_any, 16) ||
1411*d6b92ffaSHans Petter Selasky !memcmp(&addr->sin6.sin6_addr, &in6addr_loopback, 16));
1412*d6b92ffaSHans Petter Selasky }
1413*d6b92ffaSHans Petter Selasky }
1414*d6b92ffaSHans Petter Selasky
ds_get_src_addr(struct rsocket * rs,const struct sockaddr * dest_addr,socklen_t dest_len,union socket_addr * src_addr,socklen_t * src_len)1415*d6b92ffaSHans Petter Selasky static int ds_get_src_addr(struct rsocket *rs,
1416*d6b92ffaSHans Petter Selasky const struct sockaddr *dest_addr, socklen_t dest_len,
1417*d6b92ffaSHans Petter Selasky union socket_addr *src_addr, socklen_t *src_len)
1418*d6b92ffaSHans Petter Selasky {
1419*d6b92ffaSHans Petter Selasky int sock, ret;
1420*d6b92ffaSHans Petter Selasky __be16 port;
1421*d6b92ffaSHans Petter Selasky
1422*d6b92ffaSHans Petter Selasky *src_len = sizeof(*src_addr);
1423*d6b92ffaSHans Petter Selasky ret = getsockname(rs->udp_sock, &src_addr->sa, src_len);
1424*d6b92ffaSHans Petter Selasky if (ret || !rs_any_addr(src_addr))
1425*d6b92ffaSHans Petter Selasky return ret;
1426*d6b92ffaSHans Petter Selasky
1427*d6b92ffaSHans Petter Selasky port = src_addr->sin.sin_port;
1428*d6b92ffaSHans Petter Selasky sock = socket(dest_addr->sa_family, SOCK_DGRAM, 0);
1429*d6b92ffaSHans Petter Selasky if (sock < 0)
1430*d6b92ffaSHans Petter Selasky return sock;
1431*d6b92ffaSHans Petter Selasky
1432*d6b92ffaSHans Petter Selasky ret = connect(sock, dest_addr, dest_len);
1433*d6b92ffaSHans Petter Selasky if (ret)
1434*d6b92ffaSHans Petter Selasky goto out;
1435*d6b92ffaSHans Petter Selasky
1436*d6b92ffaSHans Petter Selasky *src_len = sizeof(*src_addr);
1437*d6b92ffaSHans Petter Selasky ret = getsockname(sock, &src_addr->sa, src_len);
1438*d6b92ffaSHans Petter Selasky src_addr->sin.sin_port = port;
1439*d6b92ffaSHans Petter Selasky out:
1440*d6b92ffaSHans Petter Selasky close(sock);
1441*d6b92ffaSHans Petter Selasky return ret;
1442*d6b92ffaSHans Petter Selasky }
1443*d6b92ffaSHans Petter Selasky
ds_format_hdr(struct ds_header * hdr,union socket_addr * addr)1444*d6b92ffaSHans Petter Selasky static void ds_format_hdr(struct ds_header *hdr, union socket_addr *addr)
1445*d6b92ffaSHans Petter Selasky {
1446*d6b92ffaSHans Petter Selasky if (addr->sa.sa_family == AF_INET) {
1447*d6b92ffaSHans Petter Selasky hdr->version = 4;
1448*d6b92ffaSHans Petter Selasky hdr->length = DS_IPV4_HDR_LEN;
1449*d6b92ffaSHans Petter Selasky hdr->port = addr->sin.sin_port;
1450*d6b92ffaSHans Petter Selasky hdr->addr.ipv4 = addr->sin.sin_addr.s_addr;
1451*d6b92ffaSHans Petter Selasky } else {
1452*d6b92ffaSHans Petter Selasky hdr->version = 6;
1453*d6b92ffaSHans Petter Selasky hdr->length = DS_IPV6_HDR_LEN;
1454*d6b92ffaSHans Petter Selasky hdr->port = addr->sin6.sin6_port;
1455*d6b92ffaSHans Petter Selasky hdr->addr.ipv6.flowinfo= addr->sin6.sin6_flowinfo;
1456*d6b92ffaSHans Petter Selasky memcpy(&hdr->addr.ipv6.addr, &addr->sin6.sin6_addr, 16);
1457*d6b92ffaSHans Petter Selasky }
1458*d6b92ffaSHans Petter Selasky }
1459*d6b92ffaSHans Petter Selasky
ds_add_qp_dest(struct ds_qp * qp,union socket_addr * addr,socklen_t addrlen)1460*d6b92ffaSHans Petter Selasky static int ds_add_qp_dest(struct ds_qp *qp, union socket_addr *addr,
1461*d6b92ffaSHans Petter Selasky socklen_t addrlen)
1462*d6b92ffaSHans Petter Selasky {
1463*d6b92ffaSHans Petter Selasky struct ibv_port_attr port_attr;
1464*d6b92ffaSHans Petter Selasky struct ibv_ah_attr attr;
1465*d6b92ffaSHans Petter Selasky int ret;
1466*d6b92ffaSHans Petter Selasky
1467*d6b92ffaSHans Petter Selasky memcpy(&qp->dest.addr, addr, addrlen);
1468*d6b92ffaSHans Petter Selasky qp->dest.qp = qp;
1469*d6b92ffaSHans Petter Selasky qp->dest.qpn = qp->cm_id->qp->qp_num;
1470*d6b92ffaSHans Petter Selasky
1471*d6b92ffaSHans Petter Selasky ret = ibv_query_port(qp->cm_id->verbs, qp->cm_id->port_num, &port_attr);
1472*d6b92ffaSHans Petter Selasky if (ret)
1473*d6b92ffaSHans Petter Selasky return ret;
1474*d6b92ffaSHans Petter Selasky
1475*d6b92ffaSHans Petter Selasky memset(&attr, 0, sizeof attr);
1476*d6b92ffaSHans Petter Selasky attr.dlid = port_attr.lid;
1477*d6b92ffaSHans Petter Selasky attr.port_num = qp->cm_id->port_num;
1478*d6b92ffaSHans Petter Selasky qp->dest.ah = ibv_create_ah(qp->cm_id->pd, &attr);
1479*d6b92ffaSHans Petter Selasky if (!qp->dest.ah)
1480*d6b92ffaSHans Petter Selasky return ERR(ENOMEM);
1481*d6b92ffaSHans Petter Selasky
1482*d6b92ffaSHans Petter Selasky tsearch(&qp->dest.addr, &qp->rs->dest_map, ds_compare_addr);
1483*d6b92ffaSHans Petter Selasky return 0;
1484*d6b92ffaSHans Petter Selasky }
1485*d6b92ffaSHans Petter Selasky
ds_create_qp(struct rsocket * rs,union socket_addr * src_addr,socklen_t addrlen,struct ds_qp ** new_qp)1486*d6b92ffaSHans Petter Selasky static int ds_create_qp(struct rsocket *rs, union socket_addr *src_addr,
1487*d6b92ffaSHans Petter Selasky socklen_t addrlen, struct ds_qp **new_qp)
1488*d6b92ffaSHans Petter Selasky {
1489*d6b92ffaSHans Petter Selasky struct ds_qp *qp;
1490*d6b92ffaSHans Petter Selasky struct ibv_qp_init_attr qp_attr;
1491*d6b92ffaSHans Petter Selasky struct epoll_event event;
1492*d6b92ffaSHans Petter Selasky int i, ret;
1493*d6b92ffaSHans Petter Selasky
1494*d6b92ffaSHans Petter Selasky qp = calloc(1, sizeof(*qp));
1495*d6b92ffaSHans Petter Selasky if (!qp)
1496*d6b92ffaSHans Petter Selasky return ERR(ENOMEM);
1497*d6b92ffaSHans Petter Selasky
1498*d6b92ffaSHans Petter Selasky qp->rs = rs;
1499*d6b92ffaSHans Petter Selasky ret = rdma_create_id(NULL, &qp->cm_id, qp, RDMA_PS_UDP);
1500*d6b92ffaSHans Petter Selasky if (ret)
1501*d6b92ffaSHans Petter Selasky goto err;
1502*d6b92ffaSHans Petter Selasky
1503*d6b92ffaSHans Petter Selasky ds_format_hdr(&qp->hdr, src_addr);
1504*d6b92ffaSHans Petter Selasky ret = rdma_bind_addr(qp->cm_id, &src_addr->sa);
1505*d6b92ffaSHans Petter Selasky if (ret)
1506*d6b92ffaSHans Petter Selasky goto err;
1507*d6b92ffaSHans Petter Selasky
1508*d6b92ffaSHans Petter Selasky ret = ds_init_bufs(qp);
1509*d6b92ffaSHans Petter Selasky if (ret)
1510*d6b92ffaSHans Petter Selasky goto err;
1511*d6b92ffaSHans Petter Selasky
1512*d6b92ffaSHans Petter Selasky ret = rs_create_cq(rs, qp->cm_id);
1513*d6b92ffaSHans Petter Selasky if (ret)
1514*d6b92ffaSHans Petter Selasky goto err;
1515*d6b92ffaSHans Petter Selasky
1516*d6b92ffaSHans Petter Selasky memset(&qp_attr, 0, sizeof qp_attr);
1517*d6b92ffaSHans Petter Selasky qp_attr.qp_context = qp;
1518*d6b92ffaSHans Petter Selasky qp_attr.send_cq = qp->cm_id->send_cq;
1519*d6b92ffaSHans Petter Selasky qp_attr.recv_cq = qp->cm_id->recv_cq;
1520*d6b92ffaSHans Petter Selasky qp_attr.qp_type = IBV_QPT_UD;
1521*d6b92ffaSHans Petter Selasky qp_attr.sq_sig_all = 1;
1522*d6b92ffaSHans Petter Selasky qp_attr.cap.max_send_wr = rs->sq_size;
1523*d6b92ffaSHans Petter Selasky qp_attr.cap.max_recv_wr = rs->rq_size;
1524*d6b92ffaSHans Petter Selasky qp_attr.cap.max_send_sge = 1;
1525*d6b92ffaSHans Petter Selasky qp_attr.cap.max_recv_sge = 2;
1526*d6b92ffaSHans Petter Selasky qp_attr.cap.max_inline_data = rs->sq_inline;
1527*d6b92ffaSHans Petter Selasky ret = rdma_create_qp(qp->cm_id, NULL, &qp_attr);
1528*d6b92ffaSHans Petter Selasky if (ret)
1529*d6b92ffaSHans Petter Selasky goto err;
1530*d6b92ffaSHans Petter Selasky
1531*d6b92ffaSHans Petter Selasky rs->sq_inline = qp_attr.cap.max_inline_data;
1532*d6b92ffaSHans Petter Selasky ret = ds_add_qp_dest(qp, src_addr, addrlen);
1533*d6b92ffaSHans Petter Selasky if (ret)
1534*d6b92ffaSHans Petter Selasky goto err;
1535*d6b92ffaSHans Petter Selasky
1536*d6b92ffaSHans Petter Selasky event.events = EPOLLIN;
1537*d6b92ffaSHans Petter Selasky event.data.ptr = qp;
1538*d6b92ffaSHans Petter Selasky ret = epoll_ctl(rs->epfd, EPOLL_CTL_ADD,
1539*d6b92ffaSHans Petter Selasky qp->cm_id->recv_cq_channel->fd, &event);
1540*d6b92ffaSHans Petter Selasky if (ret)
1541*d6b92ffaSHans Petter Selasky goto err;
1542*d6b92ffaSHans Petter Selasky
1543*d6b92ffaSHans Petter Selasky for (i = 0; i < rs->rq_size; i++) {
1544*d6b92ffaSHans Petter Selasky ret = ds_post_recv(rs, qp, i * RS_SNDLOWAT);
1545*d6b92ffaSHans Petter Selasky if (ret)
1546*d6b92ffaSHans Petter Selasky goto err;
1547*d6b92ffaSHans Petter Selasky }
1548*d6b92ffaSHans Petter Selasky
1549*d6b92ffaSHans Petter Selasky ds_insert_qp(rs, qp);
1550*d6b92ffaSHans Petter Selasky *new_qp = qp;
1551*d6b92ffaSHans Petter Selasky return 0;
1552*d6b92ffaSHans Petter Selasky err:
1553*d6b92ffaSHans Petter Selasky ds_free_qp(qp);
1554*d6b92ffaSHans Petter Selasky return ret;
1555*d6b92ffaSHans Petter Selasky }
1556*d6b92ffaSHans Petter Selasky
ds_get_qp(struct rsocket * rs,union socket_addr * src_addr,socklen_t addrlen,struct ds_qp ** qp)1557*d6b92ffaSHans Petter Selasky static int ds_get_qp(struct rsocket *rs, union socket_addr *src_addr,
1558*d6b92ffaSHans Petter Selasky socklen_t addrlen, struct ds_qp **qp)
1559*d6b92ffaSHans Petter Selasky {
1560*d6b92ffaSHans Petter Selasky if (rs->qp_list) {
1561*d6b92ffaSHans Petter Selasky *qp = rs->qp_list;
1562*d6b92ffaSHans Petter Selasky do {
1563*d6b92ffaSHans Petter Selasky if (!ds_compare_addr(rdma_get_local_addr((*qp)->cm_id),
1564*d6b92ffaSHans Petter Selasky src_addr))
1565*d6b92ffaSHans Petter Selasky return 0;
1566*d6b92ffaSHans Petter Selasky
1567*d6b92ffaSHans Petter Selasky *qp = ds_next_qp(*qp);
1568*d6b92ffaSHans Petter Selasky } while (*qp != rs->qp_list);
1569*d6b92ffaSHans Petter Selasky }
1570*d6b92ffaSHans Petter Selasky
1571*d6b92ffaSHans Petter Selasky return ds_create_qp(rs, src_addr, addrlen, qp);
1572*d6b92ffaSHans Petter Selasky }
1573*d6b92ffaSHans Petter Selasky
ds_get_dest(struct rsocket * rs,const struct sockaddr * addr,socklen_t addrlen,struct ds_dest ** dest)1574*d6b92ffaSHans Petter Selasky static int ds_get_dest(struct rsocket *rs, const struct sockaddr *addr,
1575*d6b92ffaSHans Petter Selasky socklen_t addrlen, struct ds_dest **dest)
1576*d6b92ffaSHans Petter Selasky {
1577*d6b92ffaSHans Petter Selasky union socket_addr src_addr;
1578*d6b92ffaSHans Petter Selasky socklen_t src_len;
1579*d6b92ffaSHans Petter Selasky struct ds_qp *qp;
1580*d6b92ffaSHans Petter Selasky struct ds_dest **tdest, *new_dest;
1581*d6b92ffaSHans Petter Selasky int ret = 0;
1582*d6b92ffaSHans Petter Selasky
1583*d6b92ffaSHans Petter Selasky fastlock_acquire(&rs->map_lock);
1584*d6b92ffaSHans Petter Selasky tdest = tfind(addr, &rs->dest_map, ds_compare_addr);
1585*d6b92ffaSHans Petter Selasky if (tdest)
1586*d6b92ffaSHans Petter Selasky goto found;
1587*d6b92ffaSHans Petter Selasky
1588*d6b92ffaSHans Petter Selasky ret = ds_get_src_addr(rs, addr, addrlen, &src_addr, &src_len);
1589*d6b92ffaSHans Petter Selasky if (ret)
1590*d6b92ffaSHans Petter Selasky goto out;
1591*d6b92ffaSHans Petter Selasky
1592*d6b92ffaSHans Petter Selasky ret = ds_get_qp(rs, &src_addr, src_len, &qp);
1593*d6b92ffaSHans Petter Selasky if (ret)
1594*d6b92ffaSHans Petter Selasky goto out;
1595*d6b92ffaSHans Petter Selasky
1596*d6b92ffaSHans Petter Selasky tdest = tfind(addr, &rs->dest_map, ds_compare_addr);
1597*d6b92ffaSHans Petter Selasky if (!tdest) {
1598*d6b92ffaSHans Petter Selasky new_dest = calloc(1, sizeof(*new_dest));
1599*d6b92ffaSHans Petter Selasky if (!new_dest) {
1600*d6b92ffaSHans Petter Selasky ret = ERR(ENOMEM);
1601*d6b92ffaSHans Petter Selasky goto out;
1602*d6b92ffaSHans Petter Selasky }
1603*d6b92ffaSHans Petter Selasky
1604*d6b92ffaSHans Petter Selasky memcpy(&new_dest->addr, addr, addrlen);
1605*d6b92ffaSHans Petter Selasky new_dest->qp = qp;
1606*d6b92ffaSHans Petter Selasky tdest = tsearch(&new_dest->addr, &rs->dest_map, ds_compare_addr);
1607*d6b92ffaSHans Petter Selasky }
1608*d6b92ffaSHans Petter Selasky
1609*d6b92ffaSHans Petter Selasky found:
1610*d6b92ffaSHans Petter Selasky *dest = *tdest;
1611*d6b92ffaSHans Petter Selasky out:
1612*d6b92ffaSHans Petter Selasky fastlock_release(&rs->map_lock);
1613*d6b92ffaSHans Petter Selasky return ret;
1614*d6b92ffaSHans Petter Selasky }
1615*d6b92ffaSHans Petter Selasky
rconnect(int socket,const struct sockaddr * addr,socklen_t addrlen)1616*d6b92ffaSHans Petter Selasky int rconnect(int socket, const struct sockaddr *addr, socklen_t addrlen)
1617*d6b92ffaSHans Petter Selasky {
1618*d6b92ffaSHans Petter Selasky struct rsocket *rs;
1619*d6b92ffaSHans Petter Selasky int ret;
1620*d6b92ffaSHans Petter Selasky
1621*d6b92ffaSHans Petter Selasky rs = idm_lookup(&idm, socket);
1622*d6b92ffaSHans Petter Selasky if (!rs)
1623*d6b92ffaSHans Petter Selasky return ERR(EBADF);
1624*d6b92ffaSHans Petter Selasky if (rs->type == SOCK_STREAM) {
1625*d6b92ffaSHans Petter Selasky memcpy(&rs->cm_id->route.addr.dst_addr, addr, addrlen);
1626*d6b92ffaSHans Petter Selasky ret = rs_do_connect(rs);
1627*d6b92ffaSHans Petter Selasky } else {
1628*d6b92ffaSHans Petter Selasky if (rs->state == rs_init) {
1629*d6b92ffaSHans Petter Selasky ret = ds_init_ep(rs);
1630*d6b92ffaSHans Petter Selasky if (ret)
1631*d6b92ffaSHans Petter Selasky return ret;
1632*d6b92ffaSHans Petter Selasky }
1633*d6b92ffaSHans Petter Selasky
1634*d6b92ffaSHans Petter Selasky fastlock_acquire(&rs->slock);
1635*d6b92ffaSHans Petter Selasky ret = connect(rs->udp_sock, addr, addrlen);
1636*d6b92ffaSHans Petter Selasky if (!ret)
1637*d6b92ffaSHans Petter Selasky ret = ds_get_dest(rs, addr, addrlen, &rs->conn_dest);
1638*d6b92ffaSHans Petter Selasky fastlock_release(&rs->slock);
1639*d6b92ffaSHans Petter Selasky }
1640*d6b92ffaSHans Petter Selasky return ret;
1641*d6b92ffaSHans Petter Selasky }
1642*d6b92ffaSHans Petter Selasky
rs_get_ctrl_buf(struct rsocket * rs)1643*d6b92ffaSHans Petter Selasky static void *rs_get_ctrl_buf(struct rsocket *rs)
1644*d6b92ffaSHans Petter Selasky {
1645*d6b92ffaSHans Petter Selasky return rs->sbuf + rs->sbuf_size +
1646*d6b92ffaSHans Petter Selasky RS_MAX_CTRL_MSG * (rs->ctrl_seqno & (RS_QP_CTRL_SIZE - 1));
1647*d6b92ffaSHans Petter Selasky }
1648*d6b92ffaSHans Petter Selasky
rs_post_msg(struct rsocket * rs,uint32_t msg)1649*d6b92ffaSHans Petter Selasky static int rs_post_msg(struct rsocket *rs, uint32_t msg)
1650*d6b92ffaSHans Petter Selasky {
1651*d6b92ffaSHans Petter Selasky struct ibv_send_wr wr, *bad;
1652*d6b92ffaSHans Petter Selasky struct ibv_sge sge;
1653*d6b92ffaSHans Petter Selasky
1654*d6b92ffaSHans Petter Selasky wr.wr_id = rs_send_wr_id(msg);
1655*d6b92ffaSHans Petter Selasky wr.next = NULL;
1656*d6b92ffaSHans Petter Selasky if (!(rs->opts & RS_OPT_MSG_SEND)) {
1657*d6b92ffaSHans Petter Selasky wr.sg_list = NULL;
1658*d6b92ffaSHans Petter Selasky wr.num_sge = 0;
1659*d6b92ffaSHans Petter Selasky wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
1660*d6b92ffaSHans Petter Selasky wr.send_flags = 0;
1661*d6b92ffaSHans Petter Selasky wr.imm_data = htobe32(msg);
1662*d6b92ffaSHans Petter Selasky } else {
1663*d6b92ffaSHans Petter Selasky sge.addr = (uintptr_t) &msg;
1664*d6b92ffaSHans Petter Selasky sge.lkey = 0;
1665*d6b92ffaSHans Petter Selasky sge.length = sizeof msg;
1666*d6b92ffaSHans Petter Selasky wr.sg_list = &sge;
1667*d6b92ffaSHans Petter Selasky wr.num_sge = 1;
1668*d6b92ffaSHans Petter Selasky wr.opcode = IBV_WR_SEND;
1669*d6b92ffaSHans Petter Selasky wr.send_flags = IBV_SEND_INLINE;
1670*d6b92ffaSHans Petter Selasky }
1671*d6b92ffaSHans Petter Selasky
1672*d6b92ffaSHans Petter Selasky return rdma_seterrno(ibv_post_send(rs->cm_id->qp, &wr, &bad));
1673*d6b92ffaSHans Petter Selasky }
1674*d6b92ffaSHans Petter Selasky
rs_post_write(struct rsocket * rs,struct ibv_sge * sgl,int nsge,uint32_t wr_data,int flags,uint64_t addr,uint32_t rkey)1675*d6b92ffaSHans Petter Selasky static int rs_post_write(struct rsocket *rs,
1676*d6b92ffaSHans Petter Selasky struct ibv_sge *sgl, int nsge,
1677*d6b92ffaSHans Petter Selasky uint32_t wr_data, int flags,
1678*d6b92ffaSHans Petter Selasky uint64_t addr, uint32_t rkey)
1679*d6b92ffaSHans Petter Selasky {
1680*d6b92ffaSHans Petter Selasky struct ibv_send_wr wr, *bad;
1681*d6b92ffaSHans Petter Selasky
1682*d6b92ffaSHans Petter Selasky wr.wr_id = rs_send_wr_id(wr_data);
1683*d6b92ffaSHans Petter Selasky wr.next = NULL;
1684*d6b92ffaSHans Petter Selasky wr.sg_list = sgl;
1685*d6b92ffaSHans Petter Selasky wr.num_sge = nsge;
1686*d6b92ffaSHans Petter Selasky wr.opcode = IBV_WR_RDMA_WRITE;
1687*d6b92ffaSHans Petter Selasky wr.send_flags = flags;
1688*d6b92ffaSHans Petter Selasky wr.wr.rdma.remote_addr = addr;
1689*d6b92ffaSHans Petter Selasky wr.wr.rdma.rkey = rkey;
1690*d6b92ffaSHans Petter Selasky
1691*d6b92ffaSHans Petter Selasky return rdma_seterrno(ibv_post_send(rs->cm_id->qp, &wr, &bad));
1692*d6b92ffaSHans Petter Selasky }
1693*d6b92ffaSHans Petter Selasky
rs_post_write_msg(struct rsocket * rs,struct ibv_sge * sgl,int nsge,uint32_t msg,int flags,uint64_t addr,uint32_t rkey)1694*d6b92ffaSHans Petter Selasky static int rs_post_write_msg(struct rsocket *rs,
1695*d6b92ffaSHans Petter Selasky struct ibv_sge *sgl, int nsge,
1696*d6b92ffaSHans Petter Selasky uint32_t msg, int flags,
1697*d6b92ffaSHans Petter Selasky uint64_t addr, uint32_t rkey)
1698*d6b92ffaSHans Petter Selasky {
1699*d6b92ffaSHans Petter Selasky struct ibv_send_wr wr, *bad;
1700*d6b92ffaSHans Petter Selasky struct ibv_sge sge;
1701*d6b92ffaSHans Petter Selasky int ret;
1702*d6b92ffaSHans Petter Selasky
1703*d6b92ffaSHans Petter Selasky wr.next = NULL;
1704*d6b92ffaSHans Petter Selasky if (!(rs->opts & RS_OPT_MSG_SEND)) {
1705*d6b92ffaSHans Petter Selasky wr.wr_id = rs_send_wr_id(msg);
1706*d6b92ffaSHans Petter Selasky wr.sg_list = sgl;
1707*d6b92ffaSHans Petter Selasky wr.num_sge = nsge;
1708*d6b92ffaSHans Petter Selasky wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
1709*d6b92ffaSHans Petter Selasky wr.send_flags = flags;
1710*d6b92ffaSHans Petter Selasky wr.imm_data = htobe32(msg);
1711*d6b92ffaSHans Petter Selasky wr.wr.rdma.remote_addr = addr;
1712*d6b92ffaSHans Petter Selasky wr.wr.rdma.rkey = rkey;
1713*d6b92ffaSHans Petter Selasky
1714*d6b92ffaSHans Petter Selasky return rdma_seterrno(ibv_post_send(rs->cm_id->qp, &wr, &bad));
1715*d6b92ffaSHans Petter Selasky } else {
1716*d6b92ffaSHans Petter Selasky ret = rs_post_write(rs, sgl, nsge, msg, flags, addr, rkey);
1717*d6b92ffaSHans Petter Selasky if (!ret) {
1718*d6b92ffaSHans Petter Selasky wr.wr_id = rs_send_wr_id(rs_msg_set(rs_msg_op(msg), 0)) |
1719*d6b92ffaSHans Petter Selasky RS_WR_ID_FLAG_MSG_SEND;
1720*d6b92ffaSHans Petter Selasky sge.addr = (uintptr_t) &msg;
1721*d6b92ffaSHans Petter Selasky sge.lkey = 0;
1722*d6b92ffaSHans Petter Selasky sge.length = sizeof msg;
1723*d6b92ffaSHans Petter Selasky wr.sg_list = &sge;
1724*d6b92ffaSHans Petter Selasky wr.num_sge = 1;
1725*d6b92ffaSHans Petter Selasky wr.opcode = IBV_WR_SEND;
1726*d6b92ffaSHans Petter Selasky wr.send_flags = IBV_SEND_INLINE;
1727*d6b92ffaSHans Petter Selasky
1728*d6b92ffaSHans Petter Selasky ret = rdma_seterrno(ibv_post_send(rs->cm_id->qp, &wr, &bad));
1729*d6b92ffaSHans Petter Selasky }
1730*d6b92ffaSHans Petter Selasky return ret;
1731*d6b92ffaSHans Petter Selasky }
1732*d6b92ffaSHans Petter Selasky }
1733*d6b92ffaSHans Petter Selasky
ds_post_send(struct rsocket * rs,struct ibv_sge * sge,uint32_t wr_data)1734*d6b92ffaSHans Petter Selasky static int ds_post_send(struct rsocket *rs, struct ibv_sge *sge,
1735*d6b92ffaSHans Petter Selasky uint32_t wr_data)
1736*d6b92ffaSHans Petter Selasky {
1737*d6b92ffaSHans Petter Selasky struct ibv_send_wr wr, *bad;
1738*d6b92ffaSHans Petter Selasky
1739*d6b92ffaSHans Petter Selasky wr.wr_id = rs_send_wr_id(wr_data);
1740*d6b92ffaSHans Petter Selasky wr.next = NULL;
1741*d6b92ffaSHans Petter Selasky wr.sg_list = sge;
1742*d6b92ffaSHans Petter Selasky wr.num_sge = 1;
1743*d6b92ffaSHans Petter Selasky wr.opcode = IBV_WR_SEND;
1744*d6b92ffaSHans Petter Selasky wr.send_flags = (sge->length <= rs->sq_inline) ? IBV_SEND_INLINE : 0;
1745*d6b92ffaSHans Petter Selasky wr.wr.ud.ah = rs->conn_dest->ah;
1746*d6b92ffaSHans Petter Selasky wr.wr.ud.remote_qpn = rs->conn_dest->qpn;
1747*d6b92ffaSHans Petter Selasky wr.wr.ud.remote_qkey = RDMA_UDP_QKEY;
1748*d6b92ffaSHans Petter Selasky
1749*d6b92ffaSHans Petter Selasky return rdma_seterrno(ibv_post_send(rs->conn_dest->qp->cm_id->qp, &wr, &bad));
1750*d6b92ffaSHans Petter Selasky }
1751*d6b92ffaSHans Petter Selasky
1752*d6b92ffaSHans Petter Selasky /*
1753*d6b92ffaSHans Petter Selasky * Update target SGE before sending data. Otherwise the remote side may
1754*d6b92ffaSHans Petter Selasky * update the entry before we do.
1755*d6b92ffaSHans Petter Selasky */
rs_write_data(struct rsocket * rs,struct ibv_sge * sgl,int nsge,uint32_t length,int flags)1756*d6b92ffaSHans Petter Selasky static int rs_write_data(struct rsocket *rs,
1757*d6b92ffaSHans Petter Selasky struct ibv_sge *sgl, int nsge,
1758*d6b92ffaSHans Petter Selasky uint32_t length, int flags)
1759*d6b92ffaSHans Petter Selasky {
1760*d6b92ffaSHans Petter Selasky uint64_t addr;
1761*d6b92ffaSHans Petter Selasky uint32_t rkey;
1762*d6b92ffaSHans Petter Selasky
1763*d6b92ffaSHans Petter Selasky rs->sseq_no++;
1764*d6b92ffaSHans Petter Selasky rs->sqe_avail--;
1765*d6b92ffaSHans Petter Selasky if (rs->opts & RS_OPT_MSG_SEND)
1766*d6b92ffaSHans Petter Selasky rs->sqe_avail--;
1767*d6b92ffaSHans Petter Selasky rs->sbuf_bytes_avail -= length;
1768*d6b92ffaSHans Petter Selasky
1769*d6b92ffaSHans Petter Selasky addr = rs->target_sgl[rs->target_sge].addr;
1770*d6b92ffaSHans Petter Selasky rkey = rs->target_sgl[rs->target_sge].key;
1771*d6b92ffaSHans Petter Selasky
1772*d6b92ffaSHans Petter Selasky rs->target_sgl[rs->target_sge].addr += length;
1773*d6b92ffaSHans Petter Selasky rs->target_sgl[rs->target_sge].length -= length;
1774*d6b92ffaSHans Petter Selasky
1775*d6b92ffaSHans Petter Selasky if (!rs->target_sgl[rs->target_sge].length) {
1776*d6b92ffaSHans Petter Selasky if (++rs->target_sge == RS_SGL_SIZE)
1777*d6b92ffaSHans Petter Selasky rs->target_sge = 0;
1778*d6b92ffaSHans Petter Selasky }
1779*d6b92ffaSHans Petter Selasky
1780*d6b92ffaSHans Petter Selasky return rs_post_write_msg(rs, sgl, nsge, rs_msg_set(RS_OP_DATA, length),
1781*d6b92ffaSHans Petter Selasky flags, addr, rkey);
1782*d6b92ffaSHans Petter Selasky }
1783*d6b92ffaSHans Petter Selasky
rs_write_direct(struct rsocket * rs,struct rs_iomap * iom,uint64_t offset,struct ibv_sge * sgl,int nsge,uint32_t length,int flags)1784*d6b92ffaSHans Petter Selasky static int rs_write_direct(struct rsocket *rs, struct rs_iomap *iom, uint64_t offset,
1785*d6b92ffaSHans Petter Selasky struct ibv_sge *sgl, int nsge, uint32_t length, int flags)
1786*d6b92ffaSHans Petter Selasky {
1787*d6b92ffaSHans Petter Selasky uint64_t addr;
1788*d6b92ffaSHans Petter Selasky
1789*d6b92ffaSHans Petter Selasky rs->sqe_avail--;
1790*d6b92ffaSHans Petter Selasky rs->sbuf_bytes_avail -= length;
1791*d6b92ffaSHans Petter Selasky
1792*d6b92ffaSHans Petter Selasky addr = iom->sge.addr + offset - iom->offset;
1793*d6b92ffaSHans Petter Selasky return rs_post_write(rs, sgl, nsge, rs_msg_set(RS_OP_WRITE, length),
1794*d6b92ffaSHans Petter Selasky flags, addr, iom->sge.key);
1795*d6b92ffaSHans Petter Selasky }
1796*d6b92ffaSHans Petter Selasky
rs_write_iomap(struct rsocket * rs,struct rs_iomap_mr * iomr,struct ibv_sge * sgl,int nsge,int flags)1797*d6b92ffaSHans Petter Selasky static int rs_write_iomap(struct rsocket *rs, struct rs_iomap_mr *iomr,
1798*d6b92ffaSHans Petter Selasky struct ibv_sge *sgl, int nsge, int flags)
1799*d6b92ffaSHans Petter Selasky {
1800*d6b92ffaSHans Petter Selasky uint64_t addr;
1801*d6b92ffaSHans Petter Selasky
1802*d6b92ffaSHans Petter Selasky rs->sseq_no++;
1803*d6b92ffaSHans Petter Selasky rs->sqe_avail--;
1804*d6b92ffaSHans Petter Selasky if (rs->opts & RS_OPT_MSG_SEND)
1805*d6b92ffaSHans Petter Selasky rs->sqe_avail--;
1806*d6b92ffaSHans Petter Selasky rs->sbuf_bytes_avail -= sizeof(struct rs_iomap);
1807*d6b92ffaSHans Petter Selasky
1808*d6b92ffaSHans Petter Selasky addr = rs->remote_iomap.addr + iomr->index * sizeof(struct rs_iomap);
1809*d6b92ffaSHans Petter Selasky return rs_post_write_msg(rs, sgl, nsge, rs_msg_set(RS_OP_IOMAP_SGL, iomr->index),
1810*d6b92ffaSHans Petter Selasky flags, addr, rs->remote_iomap.key);
1811*d6b92ffaSHans Petter Selasky }
1812*d6b92ffaSHans Petter Selasky
rs_sbuf_left(struct rsocket * rs)1813*d6b92ffaSHans Petter Selasky static uint32_t rs_sbuf_left(struct rsocket *rs)
1814*d6b92ffaSHans Petter Selasky {
1815*d6b92ffaSHans Petter Selasky return (uint32_t) (((uint64_t) (uintptr_t) &rs->sbuf[rs->sbuf_size]) -
1816*d6b92ffaSHans Petter Selasky rs->ssgl[0].addr);
1817*d6b92ffaSHans Petter Selasky }
1818*d6b92ffaSHans Petter Selasky
rs_send_credits(struct rsocket * rs)1819*d6b92ffaSHans Petter Selasky static void rs_send_credits(struct rsocket *rs)
1820*d6b92ffaSHans Petter Selasky {
1821*d6b92ffaSHans Petter Selasky struct ibv_sge ibsge;
1822*d6b92ffaSHans Petter Selasky struct rs_sge sge, *sge_buf;
1823*d6b92ffaSHans Petter Selasky int flags;
1824*d6b92ffaSHans Petter Selasky
1825*d6b92ffaSHans Petter Selasky rs->ctrl_seqno++;
1826*d6b92ffaSHans Petter Selasky rs->rseq_comp = rs->rseq_no + (rs->rq_size >> 1);
1827*d6b92ffaSHans Petter Selasky if (rs->rbuf_bytes_avail >= (rs->rbuf_size >> 1)) {
1828*d6b92ffaSHans Petter Selasky if (rs->opts & RS_OPT_MSG_SEND)
1829*d6b92ffaSHans Petter Selasky rs->ctrl_seqno++;
1830*d6b92ffaSHans Petter Selasky
1831*d6b92ffaSHans Petter Selasky if (!(rs->opts & RS_OPT_SWAP_SGL)) {
1832*d6b92ffaSHans Petter Selasky sge.addr = (uintptr_t) &rs->rbuf[rs->rbuf_free_offset];
1833*d6b92ffaSHans Petter Selasky sge.key = rs->rmr->rkey;
1834*d6b92ffaSHans Petter Selasky sge.length = rs->rbuf_size >> 1;
1835*d6b92ffaSHans Petter Selasky } else {
1836*d6b92ffaSHans Petter Selasky sge.addr = bswap_64((uintptr_t) &rs->rbuf[rs->rbuf_free_offset]);
1837*d6b92ffaSHans Petter Selasky sge.key = bswap_32(rs->rmr->rkey);
1838*d6b92ffaSHans Petter Selasky sge.length = bswap_32(rs->rbuf_size >> 1);
1839*d6b92ffaSHans Petter Selasky }
1840*d6b92ffaSHans Petter Selasky
1841*d6b92ffaSHans Petter Selasky if (rs->sq_inline < sizeof sge) {
1842*d6b92ffaSHans Petter Selasky sge_buf = rs_get_ctrl_buf(rs);
1843*d6b92ffaSHans Petter Selasky memcpy(sge_buf, &sge, sizeof sge);
1844*d6b92ffaSHans Petter Selasky ibsge.addr = (uintptr_t) sge_buf;
1845*d6b92ffaSHans Petter Selasky ibsge.lkey = rs->smr->lkey;
1846*d6b92ffaSHans Petter Selasky flags = 0;
1847*d6b92ffaSHans Petter Selasky } else {
1848*d6b92ffaSHans Petter Selasky ibsge.addr = (uintptr_t) &sge;
1849*d6b92ffaSHans Petter Selasky ibsge.lkey = 0;
1850*d6b92ffaSHans Petter Selasky flags = IBV_SEND_INLINE;
1851*d6b92ffaSHans Petter Selasky }
1852*d6b92ffaSHans Petter Selasky ibsge.length = sizeof(sge);
1853*d6b92ffaSHans Petter Selasky
1854*d6b92ffaSHans Petter Selasky rs_post_write_msg(rs, &ibsge, 1,
1855*d6b92ffaSHans Petter Selasky rs_msg_set(RS_OP_SGL, rs->rseq_no + rs->rq_size), flags,
1856*d6b92ffaSHans Petter Selasky rs->remote_sgl.addr + rs->remote_sge * sizeof(struct rs_sge),
1857*d6b92ffaSHans Petter Selasky rs->remote_sgl.key);
1858*d6b92ffaSHans Petter Selasky
1859*d6b92ffaSHans Petter Selasky rs->rbuf_bytes_avail -= rs->rbuf_size >> 1;
1860*d6b92ffaSHans Petter Selasky rs->rbuf_free_offset += rs->rbuf_size >> 1;
1861*d6b92ffaSHans Petter Selasky if (rs->rbuf_free_offset >= rs->rbuf_size)
1862*d6b92ffaSHans Petter Selasky rs->rbuf_free_offset = 0;
1863*d6b92ffaSHans Petter Selasky if (++rs->remote_sge == rs->remote_sgl.length)
1864*d6b92ffaSHans Petter Selasky rs->remote_sge = 0;
1865*d6b92ffaSHans Petter Selasky } else {
1866*d6b92ffaSHans Petter Selasky rs_post_msg(rs, rs_msg_set(RS_OP_SGL, rs->rseq_no + rs->rq_size));
1867*d6b92ffaSHans Petter Selasky }
1868*d6b92ffaSHans Petter Selasky }
1869*d6b92ffaSHans Petter Selasky
rs_ctrl_avail(struct rsocket * rs)1870*d6b92ffaSHans Petter Selasky static inline int rs_ctrl_avail(struct rsocket *rs)
1871*d6b92ffaSHans Petter Selasky {
1872*d6b92ffaSHans Petter Selasky return rs->ctrl_seqno != rs->ctrl_max_seqno;
1873*d6b92ffaSHans Petter Selasky }
1874*d6b92ffaSHans Petter Selasky
1875*d6b92ffaSHans Petter Selasky /* Protocols that do not support RDMA write with immediate may require 2 msgs */
rs_2ctrl_avail(struct rsocket * rs)1876*d6b92ffaSHans Petter Selasky static inline int rs_2ctrl_avail(struct rsocket *rs)
1877*d6b92ffaSHans Petter Selasky {
1878*d6b92ffaSHans Petter Selasky return (int)((rs->ctrl_seqno + 1) - rs->ctrl_max_seqno) < 0;
1879*d6b92ffaSHans Petter Selasky }
1880*d6b92ffaSHans Petter Selasky
rs_give_credits(struct rsocket * rs)1881*d6b92ffaSHans Petter Selasky static int rs_give_credits(struct rsocket *rs)
1882*d6b92ffaSHans Petter Selasky {
1883*d6b92ffaSHans Petter Selasky if (!(rs->opts & RS_OPT_MSG_SEND)) {
1884*d6b92ffaSHans Petter Selasky return ((rs->rbuf_bytes_avail >= (rs->rbuf_size >> 1)) ||
1885*d6b92ffaSHans Petter Selasky ((short) ((short) rs->rseq_no - (short) rs->rseq_comp) >= 0)) &&
1886*d6b92ffaSHans Petter Selasky rs_ctrl_avail(rs) && (rs->state & rs_connected);
1887*d6b92ffaSHans Petter Selasky } else {
1888*d6b92ffaSHans Petter Selasky return ((rs->rbuf_bytes_avail >= (rs->rbuf_size >> 1)) ||
1889*d6b92ffaSHans Petter Selasky ((short) ((short) rs->rseq_no - (short) rs->rseq_comp) >= 0)) &&
1890*d6b92ffaSHans Petter Selasky rs_2ctrl_avail(rs) && (rs->state & rs_connected);
1891*d6b92ffaSHans Petter Selasky }
1892*d6b92ffaSHans Petter Selasky }
1893*d6b92ffaSHans Petter Selasky
/* Send credits to the peer if rs_give_credits() says they are due. */
static void rs_update_credits(struct rsocket *rs)
{
	if (!rs_give_credits(rs))
		return;

	rs_send_credits(rs);
}
1899*d6b92ffaSHans Petter Selasky
rs_poll_cq(struct rsocket * rs)1900*d6b92ffaSHans Petter Selasky static int rs_poll_cq(struct rsocket *rs)
1901*d6b92ffaSHans Petter Selasky {
1902*d6b92ffaSHans Petter Selasky struct ibv_wc wc;
1903*d6b92ffaSHans Petter Selasky uint32_t msg;
1904*d6b92ffaSHans Petter Selasky int ret, rcnt = 0;
1905*d6b92ffaSHans Petter Selasky
1906*d6b92ffaSHans Petter Selasky while ((ret = ibv_poll_cq(rs->cm_id->recv_cq, 1, &wc)) > 0) {
1907*d6b92ffaSHans Petter Selasky if (rs_wr_is_recv(wc.wr_id)) {
1908*d6b92ffaSHans Petter Selasky if (wc.status != IBV_WC_SUCCESS)
1909*d6b92ffaSHans Petter Selasky continue;
1910*d6b92ffaSHans Petter Selasky rcnt++;
1911*d6b92ffaSHans Petter Selasky
1912*d6b92ffaSHans Petter Selasky if (wc.wc_flags & IBV_WC_WITH_IMM) {
1913*d6b92ffaSHans Petter Selasky msg = be32toh(wc.imm_data);
1914*d6b92ffaSHans Petter Selasky } else {
1915*d6b92ffaSHans Petter Selasky msg = ((uint32_t *) (rs->rbuf + rs->rbuf_size))
1916*d6b92ffaSHans Petter Selasky [rs_wr_data(wc.wr_id)];
1917*d6b92ffaSHans Petter Selasky
1918*d6b92ffaSHans Petter Selasky }
1919*d6b92ffaSHans Petter Selasky switch (rs_msg_op(msg)) {
1920*d6b92ffaSHans Petter Selasky case RS_OP_SGL:
1921*d6b92ffaSHans Petter Selasky rs->sseq_comp = (uint16_t) rs_msg_data(msg);
1922*d6b92ffaSHans Petter Selasky break;
1923*d6b92ffaSHans Petter Selasky case RS_OP_IOMAP_SGL:
1924*d6b92ffaSHans Petter Selasky /* The iomap was updated, that's nice to know. */
1925*d6b92ffaSHans Petter Selasky break;
1926*d6b92ffaSHans Petter Selasky case RS_OP_CTRL:
1927*d6b92ffaSHans Petter Selasky if (rs_msg_data(msg) == RS_CTRL_DISCONNECT) {
1928*d6b92ffaSHans Petter Selasky rs->state = rs_disconnected;
1929*d6b92ffaSHans Petter Selasky return 0;
1930*d6b92ffaSHans Petter Selasky } else if (rs_msg_data(msg) == RS_CTRL_SHUTDOWN) {
1931*d6b92ffaSHans Petter Selasky if (rs->state & rs_writable) {
1932*d6b92ffaSHans Petter Selasky rs->state &= ~rs_readable;
1933*d6b92ffaSHans Petter Selasky } else {
1934*d6b92ffaSHans Petter Selasky rs->state = rs_disconnected;
1935*d6b92ffaSHans Petter Selasky return 0;
1936*d6b92ffaSHans Petter Selasky }
1937*d6b92ffaSHans Petter Selasky }
1938*d6b92ffaSHans Petter Selasky break;
1939*d6b92ffaSHans Petter Selasky case RS_OP_WRITE:
1940*d6b92ffaSHans Petter Selasky /* We really shouldn't be here. */
1941*d6b92ffaSHans Petter Selasky break;
1942*d6b92ffaSHans Petter Selasky default:
1943*d6b92ffaSHans Petter Selasky rs->rmsg[rs->rmsg_tail].op = rs_msg_op(msg);
1944*d6b92ffaSHans Petter Selasky rs->rmsg[rs->rmsg_tail].data = rs_msg_data(msg);
1945*d6b92ffaSHans Petter Selasky if (++rs->rmsg_tail == rs->rq_size + 1)
1946*d6b92ffaSHans Petter Selasky rs->rmsg_tail = 0;
1947*d6b92ffaSHans Petter Selasky break;
1948*d6b92ffaSHans Petter Selasky }
1949*d6b92ffaSHans Petter Selasky } else {
1950*d6b92ffaSHans Petter Selasky switch (rs_msg_op(rs_wr_data(wc.wr_id))) {
1951*d6b92ffaSHans Petter Selasky case RS_OP_SGL:
1952*d6b92ffaSHans Petter Selasky rs->ctrl_max_seqno++;
1953*d6b92ffaSHans Petter Selasky break;
1954*d6b92ffaSHans Petter Selasky case RS_OP_CTRL:
1955*d6b92ffaSHans Petter Selasky rs->ctrl_max_seqno++;
1956*d6b92ffaSHans Petter Selasky if (rs_msg_data(rs_wr_data(wc.wr_id)) == RS_CTRL_DISCONNECT)
1957*d6b92ffaSHans Petter Selasky rs->state = rs_disconnected;
1958*d6b92ffaSHans Petter Selasky break;
1959*d6b92ffaSHans Petter Selasky case RS_OP_IOMAP_SGL:
1960*d6b92ffaSHans Petter Selasky rs->sqe_avail++;
1961*d6b92ffaSHans Petter Selasky if (!rs_wr_is_msg_send(wc.wr_id))
1962*d6b92ffaSHans Petter Selasky rs->sbuf_bytes_avail += sizeof(struct rs_iomap);
1963*d6b92ffaSHans Petter Selasky break;
1964*d6b92ffaSHans Petter Selasky default:
1965*d6b92ffaSHans Petter Selasky rs->sqe_avail++;
1966*d6b92ffaSHans Petter Selasky rs->sbuf_bytes_avail += rs_msg_data(rs_wr_data(wc.wr_id));
1967*d6b92ffaSHans Petter Selasky break;
1968*d6b92ffaSHans Petter Selasky }
1969*d6b92ffaSHans Petter Selasky if (wc.status != IBV_WC_SUCCESS && (rs->state & rs_connected)) {
1970*d6b92ffaSHans Petter Selasky rs->state = rs_error;
1971*d6b92ffaSHans Petter Selasky rs->err = EIO;
1972*d6b92ffaSHans Petter Selasky }
1973*d6b92ffaSHans Petter Selasky }
1974*d6b92ffaSHans Petter Selasky }
1975*d6b92ffaSHans Petter Selasky
1976*d6b92ffaSHans Petter Selasky if (rs->state & rs_connected) {
1977*d6b92ffaSHans Petter Selasky while (!ret && rcnt--)
1978*d6b92ffaSHans Petter Selasky ret = rs_post_recv(rs);
1979*d6b92ffaSHans Petter Selasky
1980*d6b92ffaSHans Petter Selasky if (ret) {
1981*d6b92ffaSHans Petter Selasky rs->state = rs_error;
1982*d6b92ffaSHans Petter Selasky rs->err = errno;
1983*d6b92ffaSHans Petter Selasky }
1984*d6b92ffaSHans Petter Selasky }
1985*d6b92ffaSHans Petter Selasky return ret;
1986*d6b92ffaSHans Petter Selasky }
1987*d6b92ffaSHans Petter Selasky
rs_get_cq_event(struct rsocket * rs)1988*d6b92ffaSHans Petter Selasky static int rs_get_cq_event(struct rsocket *rs)
1989*d6b92ffaSHans Petter Selasky {
1990*d6b92ffaSHans Petter Selasky struct ibv_cq *cq;
1991*d6b92ffaSHans Petter Selasky void *context;
1992*d6b92ffaSHans Petter Selasky int ret;
1993*d6b92ffaSHans Petter Selasky
1994*d6b92ffaSHans Petter Selasky if (!rs->cq_armed)
1995*d6b92ffaSHans Petter Selasky return 0;
1996*d6b92ffaSHans Petter Selasky
1997*d6b92ffaSHans Petter Selasky ret = ibv_get_cq_event(rs->cm_id->recv_cq_channel, &cq, &context);
1998*d6b92ffaSHans Petter Selasky if (!ret) {
1999*d6b92ffaSHans Petter Selasky if (++rs->unack_cqe >= rs->sq_size + rs->rq_size) {
2000*d6b92ffaSHans Petter Selasky ibv_ack_cq_events(rs->cm_id->recv_cq, rs->unack_cqe);
2001*d6b92ffaSHans Petter Selasky rs->unack_cqe = 0;
2002*d6b92ffaSHans Petter Selasky }
2003*d6b92ffaSHans Petter Selasky rs->cq_armed = 0;
2004*d6b92ffaSHans Petter Selasky } else if (!(errno == EAGAIN || errno == EINTR)) {
2005*d6b92ffaSHans Petter Selasky rs->state = rs_error;
2006*d6b92ffaSHans Petter Selasky }
2007*d6b92ffaSHans Petter Selasky
2008*d6b92ffaSHans Petter Selasky return ret;
2009*d6b92ffaSHans Petter Selasky }
2010*d6b92ffaSHans Petter Selasky
2011*d6b92ffaSHans Petter Selasky /*
2012*d6b92ffaSHans Petter Selasky * Although we serialize rsend and rrecv calls with respect to themselves,
2013*d6b92ffaSHans Petter Selasky * both calls may run simultaneously and need to poll the CQ for completions.
2014*d6b92ffaSHans Petter Selasky * We need to serialize access to the CQ, but rsend and rrecv need to
2015*d6b92ffaSHans Petter Selasky * allow each other to make forward progress.
2016*d6b92ffaSHans Petter Selasky *
2017*d6b92ffaSHans Petter Selasky * For example, rsend may need to wait for credits from the remote side,
2018*d6b92ffaSHans Petter Selasky * which could be stalled until the remote process calls rrecv. This should
2019*d6b92ffaSHans Petter Selasky * not block rrecv from receiving data from the remote side however.
2020*d6b92ffaSHans Petter Selasky *
2021*d6b92ffaSHans Petter Selasky * We handle this by using two locks. The cq_lock protects against polling
2022*d6b92ffaSHans Petter Selasky * the CQ and processing completions. The cq_wait_lock serializes access to
2023*d6b92ffaSHans Petter Selasky * waiting on the CQ.
2024*d6b92ffaSHans Petter Selasky */
rs_process_cq(struct rsocket * rs,int nonblock,int (* test)(struct rsocket * rs))2025*d6b92ffaSHans Petter Selasky static int rs_process_cq(struct rsocket *rs, int nonblock, int (*test)(struct rsocket *rs))
2026*d6b92ffaSHans Petter Selasky {
2027*d6b92ffaSHans Petter Selasky int ret;
2028*d6b92ffaSHans Petter Selasky
2029*d6b92ffaSHans Petter Selasky fastlock_acquire(&rs->cq_lock);
2030*d6b92ffaSHans Petter Selasky do {
2031*d6b92ffaSHans Petter Selasky rs_update_credits(rs);
2032*d6b92ffaSHans Petter Selasky ret = rs_poll_cq(rs);
2033*d6b92ffaSHans Petter Selasky if (test(rs)) {
2034*d6b92ffaSHans Petter Selasky ret = 0;
2035*d6b92ffaSHans Petter Selasky break;
2036*d6b92ffaSHans Petter Selasky } else if (ret) {
2037*d6b92ffaSHans Petter Selasky break;
2038*d6b92ffaSHans Petter Selasky } else if (nonblock) {
2039*d6b92ffaSHans Petter Selasky ret = ERR(EWOULDBLOCK);
2040*d6b92ffaSHans Petter Selasky } else if (!rs->cq_armed) {
2041*d6b92ffaSHans Petter Selasky ibv_req_notify_cq(rs->cm_id->recv_cq, 0);
2042*d6b92ffaSHans Petter Selasky rs->cq_armed = 1;
2043*d6b92ffaSHans Petter Selasky } else {
2044*d6b92ffaSHans Petter Selasky rs_update_credits(rs);
2045*d6b92ffaSHans Petter Selasky fastlock_acquire(&rs->cq_wait_lock);
2046*d6b92ffaSHans Petter Selasky fastlock_release(&rs->cq_lock);
2047*d6b92ffaSHans Petter Selasky
2048*d6b92ffaSHans Petter Selasky ret = rs_get_cq_event(rs);
2049*d6b92ffaSHans Petter Selasky fastlock_release(&rs->cq_wait_lock);
2050*d6b92ffaSHans Petter Selasky fastlock_acquire(&rs->cq_lock);
2051*d6b92ffaSHans Petter Selasky }
2052*d6b92ffaSHans Petter Selasky } while (!ret);
2053*d6b92ffaSHans Petter Selasky
2054*d6b92ffaSHans Petter Selasky rs_update_credits(rs);
2055*d6b92ffaSHans Petter Selasky fastlock_release(&rs->cq_lock);
2056*d6b92ffaSHans Petter Selasky return ret;
2057*d6b92ffaSHans Petter Selasky }
2058*d6b92ffaSHans Petter Selasky
rs_get_comp(struct rsocket * rs,int nonblock,int (* test)(struct rsocket * rs))2059*d6b92ffaSHans Petter Selasky static int rs_get_comp(struct rsocket *rs, int nonblock, int (*test)(struct rsocket *rs))
2060*d6b92ffaSHans Petter Selasky {
2061*d6b92ffaSHans Petter Selasky struct timeval s, e;
2062*d6b92ffaSHans Petter Selasky uint32_t poll_time = 0;
2063*d6b92ffaSHans Petter Selasky int ret;
2064*d6b92ffaSHans Petter Selasky
2065*d6b92ffaSHans Petter Selasky do {
2066*d6b92ffaSHans Petter Selasky ret = rs_process_cq(rs, 1, test);
2067*d6b92ffaSHans Petter Selasky if (!ret || nonblock || errno != EWOULDBLOCK)
2068*d6b92ffaSHans Petter Selasky return ret;
2069*d6b92ffaSHans Petter Selasky
2070*d6b92ffaSHans Petter Selasky if (!poll_time)
2071*d6b92ffaSHans Petter Selasky gettimeofday(&s, NULL);
2072*d6b92ffaSHans Petter Selasky
2073*d6b92ffaSHans Petter Selasky gettimeofday(&e, NULL);
2074*d6b92ffaSHans Petter Selasky poll_time = (e.tv_sec - s.tv_sec) * 1000000 +
2075*d6b92ffaSHans Petter Selasky (e.tv_usec - s.tv_usec) + 1;
2076*d6b92ffaSHans Petter Selasky } while (poll_time <= polling_time);
2077*d6b92ffaSHans Petter Selasky
2078*d6b92ffaSHans Petter Selasky ret = rs_process_cq(rs, 0, test);
2079*d6b92ffaSHans Petter Selasky return ret;
2080*d6b92ffaSHans Petter Selasky }
2081*d6b92ffaSHans Petter Selasky
ds_valid_recv(struct ds_qp * qp,struct ibv_wc * wc)2082*d6b92ffaSHans Petter Selasky static int ds_valid_recv(struct ds_qp *qp, struct ibv_wc *wc)
2083*d6b92ffaSHans Petter Selasky {
2084*d6b92ffaSHans Petter Selasky struct ds_header *hdr;
2085*d6b92ffaSHans Petter Selasky
2086*d6b92ffaSHans Petter Selasky hdr = (struct ds_header *) (qp->rbuf + rs_wr_data(wc->wr_id));
2087*d6b92ffaSHans Petter Selasky return ((wc->byte_len >= sizeof(struct ibv_grh) + DS_IPV4_HDR_LEN) &&
2088*d6b92ffaSHans Petter Selasky ((hdr->version == 4 && hdr->length == DS_IPV4_HDR_LEN) ||
2089*d6b92ffaSHans Petter Selasky (hdr->version == 6 && hdr->length == DS_IPV6_HDR_LEN)));
2090*d6b92ffaSHans Petter Selasky }
2091*d6b92ffaSHans Petter Selasky
2092*d6b92ffaSHans Petter Selasky /*
2093*d6b92ffaSHans Petter Selasky * Poll all CQs associated with a datagram rsocket. We need to drop any
2094*d6b92ffaSHans Petter Selasky * received messages that we do not have room to store. To limit drops,
2095*d6b92ffaSHans Petter Selasky * we only poll if we have room to store the receive or we need a send
2096*d6b92ffaSHans Petter Selasky * buffer. To ensure fairness, we poll the CQs round robin, remembering
2097*d6b92ffaSHans Petter Selasky * where we left off.
2098*d6b92ffaSHans Petter Selasky */
ds_poll_cqs(struct rsocket * rs)2099*d6b92ffaSHans Petter Selasky static void ds_poll_cqs(struct rsocket *rs)
2100*d6b92ffaSHans Petter Selasky {
2101*d6b92ffaSHans Petter Selasky struct ds_qp *qp;
2102*d6b92ffaSHans Petter Selasky struct ds_smsg *smsg;
2103*d6b92ffaSHans Petter Selasky struct ds_rmsg *rmsg;
2104*d6b92ffaSHans Petter Selasky struct ibv_wc wc;
2105*d6b92ffaSHans Petter Selasky int ret, cnt;
2106*d6b92ffaSHans Petter Selasky
2107*d6b92ffaSHans Petter Selasky if (!(qp = rs->qp_list))
2108*d6b92ffaSHans Petter Selasky return;
2109*d6b92ffaSHans Petter Selasky
2110*d6b92ffaSHans Petter Selasky do {
2111*d6b92ffaSHans Petter Selasky cnt = 0;
2112*d6b92ffaSHans Petter Selasky do {
2113*d6b92ffaSHans Petter Selasky ret = ibv_poll_cq(qp->cm_id->recv_cq, 1, &wc);
2114*d6b92ffaSHans Petter Selasky if (ret <= 0) {
2115*d6b92ffaSHans Petter Selasky qp = ds_next_qp(qp);
2116*d6b92ffaSHans Petter Selasky continue;
2117*d6b92ffaSHans Petter Selasky }
2118*d6b92ffaSHans Petter Selasky
2119*d6b92ffaSHans Petter Selasky if (rs_wr_is_recv(wc.wr_id)) {
2120*d6b92ffaSHans Petter Selasky if (rs->rqe_avail && wc.status == IBV_WC_SUCCESS &&
2121*d6b92ffaSHans Petter Selasky ds_valid_recv(qp, &wc)) {
2122*d6b92ffaSHans Petter Selasky rs->rqe_avail--;
2123*d6b92ffaSHans Petter Selasky rmsg = &rs->dmsg[rs->rmsg_tail];
2124*d6b92ffaSHans Petter Selasky rmsg->qp = qp;
2125*d6b92ffaSHans Petter Selasky rmsg->offset = rs_wr_data(wc.wr_id);
2126*d6b92ffaSHans Petter Selasky rmsg->length = wc.byte_len - sizeof(struct ibv_grh);
2127*d6b92ffaSHans Petter Selasky if (++rs->rmsg_tail == rs->rq_size + 1)
2128*d6b92ffaSHans Petter Selasky rs->rmsg_tail = 0;
2129*d6b92ffaSHans Petter Selasky } else {
2130*d6b92ffaSHans Petter Selasky ds_post_recv(rs, qp, rs_wr_data(wc.wr_id));
2131*d6b92ffaSHans Petter Selasky }
2132*d6b92ffaSHans Petter Selasky } else {
2133*d6b92ffaSHans Petter Selasky smsg = (struct ds_smsg *) (rs->sbuf + rs_wr_data(wc.wr_id));
2134*d6b92ffaSHans Petter Selasky smsg->next = rs->smsg_free;
2135*d6b92ffaSHans Petter Selasky rs->smsg_free = smsg;
2136*d6b92ffaSHans Petter Selasky rs->sqe_avail++;
2137*d6b92ffaSHans Petter Selasky }
2138*d6b92ffaSHans Petter Selasky
2139*d6b92ffaSHans Petter Selasky qp = ds_next_qp(qp);
2140*d6b92ffaSHans Petter Selasky if (!rs->rqe_avail && rs->sqe_avail) {
2141*d6b92ffaSHans Petter Selasky rs->qp_list = qp;
2142*d6b92ffaSHans Petter Selasky return;
2143*d6b92ffaSHans Petter Selasky }
2144*d6b92ffaSHans Petter Selasky cnt++;
2145*d6b92ffaSHans Petter Selasky } while (qp != rs->qp_list);
2146*d6b92ffaSHans Petter Selasky } while (cnt);
2147*d6b92ffaSHans Petter Selasky }
2148*d6b92ffaSHans Petter Selasky
ds_req_notify_cqs(struct rsocket * rs)2149*d6b92ffaSHans Petter Selasky static void ds_req_notify_cqs(struct rsocket *rs)
2150*d6b92ffaSHans Petter Selasky {
2151*d6b92ffaSHans Petter Selasky struct ds_qp *qp;
2152*d6b92ffaSHans Petter Selasky
2153*d6b92ffaSHans Petter Selasky if (!(qp = rs->qp_list))
2154*d6b92ffaSHans Petter Selasky return;
2155*d6b92ffaSHans Petter Selasky
2156*d6b92ffaSHans Petter Selasky do {
2157*d6b92ffaSHans Petter Selasky if (!qp->cq_armed) {
2158*d6b92ffaSHans Petter Selasky ibv_req_notify_cq(qp->cm_id->recv_cq, 0);
2159*d6b92ffaSHans Petter Selasky qp->cq_armed = 1;
2160*d6b92ffaSHans Petter Selasky }
2161*d6b92ffaSHans Petter Selasky qp = ds_next_qp(qp);
2162*d6b92ffaSHans Petter Selasky } while (qp != rs->qp_list);
2163*d6b92ffaSHans Petter Selasky }
2164*d6b92ffaSHans Petter Selasky
ds_get_cq_event(struct rsocket * rs)2165*d6b92ffaSHans Petter Selasky static int ds_get_cq_event(struct rsocket *rs)
2166*d6b92ffaSHans Petter Selasky {
2167*d6b92ffaSHans Petter Selasky struct epoll_event event;
2168*d6b92ffaSHans Petter Selasky struct ds_qp *qp;
2169*d6b92ffaSHans Petter Selasky struct ibv_cq *cq;
2170*d6b92ffaSHans Petter Selasky void *context;
2171*d6b92ffaSHans Petter Selasky int ret;
2172*d6b92ffaSHans Petter Selasky
2173*d6b92ffaSHans Petter Selasky if (!rs->cq_armed)
2174*d6b92ffaSHans Petter Selasky return 0;
2175*d6b92ffaSHans Petter Selasky
2176*d6b92ffaSHans Petter Selasky ret = epoll_wait(rs->epfd, &event, 1, -1);
2177*d6b92ffaSHans Petter Selasky if (ret <= 0)
2178*d6b92ffaSHans Petter Selasky return ret;
2179*d6b92ffaSHans Petter Selasky
2180*d6b92ffaSHans Petter Selasky qp = event.data.ptr;
2181*d6b92ffaSHans Petter Selasky ret = ibv_get_cq_event(qp->cm_id->recv_cq_channel, &cq, &context);
2182*d6b92ffaSHans Petter Selasky if (!ret) {
2183*d6b92ffaSHans Petter Selasky ibv_ack_cq_events(qp->cm_id->recv_cq, 1);
2184*d6b92ffaSHans Petter Selasky qp->cq_armed = 0;
2185*d6b92ffaSHans Petter Selasky rs->cq_armed = 0;
2186*d6b92ffaSHans Petter Selasky }
2187*d6b92ffaSHans Petter Selasky
2188*d6b92ffaSHans Petter Selasky return ret;
2189*d6b92ffaSHans Petter Selasky }
2190*d6b92ffaSHans Petter Selasky
ds_process_cqs(struct rsocket * rs,int nonblock,int (* test)(struct rsocket * rs))2191*d6b92ffaSHans Petter Selasky static int ds_process_cqs(struct rsocket *rs, int nonblock, int (*test)(struct rsocket *rs))
2192*d6b92ffaSHans Petter Selasky {
2193*d6b92ffaSHans Petter Selasky int ret = 0;
2194*d6b92ffaSHans Petter Selasky
2195*d6b92ffaSHans Petter Selasky fastlock_acquire(&rs->cq_lock);
2196*d6b92ffaSHans Petter Selasky do {
2197*d6b92ffaSHans Petter Selasky ds_poll_cqs(rs);
2198*d6b92ffaSHans Petter Selasky if (test(rs)) {
2199*d6b92ffaSHans Petter Selasky ret = 0;
2200*d6b92ffaSHans Petter Selasky break;
2201*d6b92ffaSHans Petter Selasky } else if (nonblock) {
2202*d6b92ffaSHans Petter Selasky ret = ERR(EWOULDBLOCK);
2203*d6b92ffaSHans Petter Selasky } else if (!rs->cq_armed) {
2204*d6b92ffaSHans Petter Selasky ds_req_notify_cqs(rs);
2205*d6b92ffaSHans Petter Selasky rs->cq_armed = 1;
2206*d6b92ffaSHans Petter Selasky } else {
2207*d6b92ffaSHans Petter Selasky fastlock_acquire(&rs->cq_wait_lock);
2208*d6b92ffaSHans Petter Selasky fastlock_release(&rs->cq_lock);
2209*d6b92ffaSHans Petter Selasky
2210*d6b92ffaSHans Petter Selasky ret = ds_get_cq_event(rs);
2211*d6b92ffaSHans Petter Selasky fastlock_release(&rs->cq_wait_lock);
2212*d6b92ffaSHans Petter Selasky fastlock_acquire(&rs->cq_lock);
2213*d6b92ffaSHans Petter Selasky }
2214*d6b92ffaSHans Petter Selasky } while (!ret);
2215*d6b92ffaSHans Petter Selasky
2216*d6b92ffaSHans Petter Selasky fastlock_release(&rs->cq_lock);
2217*d6b92ffaSHans Petter Selasky return ret;
2218*d6b92ffaSHans Petter Selasky }
2219*d6b92ffaSHans Petter Selasky
ds_get_comp(struct rsocket * rs,int nonblock,int (* test)(struct rsocket * rs))2220*d6b92ffaSHans Petter Selasky static int ds_get_comp(struct rsocket *rs, int nonblock, int (*test)(struct rsocket *rs))
2221*d6b92ffaSHans Petter Selasky {
2222*d6b92ffaSHans Petter Selasky struct timeval s, e;
2223*d6b92ffaSHans Petter Selasky uint32_t poll_time = 0;
2224*d6b92ffaSHans Petter Selasky int ret;
2225*d6b92ffaSHans Petter Selasky
2226*d6b92ffaSHans Petter Selasky do {
2227*d6b92ffaSHans Petter Selasky ret = ds_process_cqs(rs, 1, test);
2228*d6b92ffaSHans Petter Selasky if (!ret || nonblock || errno != EWOULDBLOCK)
2229*d6b92ffaSHans Petter Selasky return ret;
2230*d6b92ffaSHans Petter Selasky
2231*d6b92ffaSHans Petter Selasky if (!poll_time)
2232*d6b92ffaSHans Petter Selasky gettimeofday(&s, NULL);
2233*d6b92ffaSHans Petter Selasky
2234*d6b92ffaSHans Petter Selasky gettimeofday(&e, NULL);
2235*d6b92ffaSHans Petter Selasky poll_time = (e.tv_sec - s.tv_sec) * 1000000 +
2236*d6b92ffaSHans Petter Selasky (e.tv_usec - s.tv_usec) + 1;
2237*d6b92ffaSHans Petter Selasky } while (poll_time <= polling_time);
2238*d6b92ffaSHans Petter Selasky
2239*d6b92ffaSHans Petter Selasky ret = ds_process_cqs(rs, 0, test);
2240*d6b92ffaSHans Petter Selasky return ret;
2241*d6b92ffaSHans Petter Selasky }
2242*d6b92ffaSHans Petter Selasky
rs_nonblocking(struct rsocket * rs,int flags)2243*d6b92ffaSHans Petter Selasky static int rs_nonblocking(struct rsocket *rs, int flags)
2244*d6b92ffaSHans Petter Selasky {
2245*d6b92ffaSHans Petter Selasky return (rs->fd_flags & O_NONBLOCK) || (flags & MSG_DONTWAIT);
2246*d6b92ffaSHans Petter Selasky }
2247*d6b92ffaSHans Petter Selasky
rs_is_cq_armed(struct rsocket * rs)2248*d6b92ffaSHans Petter Selasky static int rs_is_cq_armed(struct rsocket *rs)
2249*d6b92ffaSHans Petter Selasky {
2250*d6b92ffaSHans Petter Selasky return rs->cq_armed;
2251*d6b92ffaSHans Petter Selasky }
2252*d6b92ffaSHans Petter Selasky
/*
 * Completion test callback that is always satisfied; rs is not examined.
 */
static int rs_poll_all(struct rsocket *rs)
{
	(void) rs;
	return 1;
}
2257*d6b92ffaSHans Petter Selasky
2258*d6b92ffaSHans Petter Selasky /*
2259*d6b92ffaSHans Petter Selasky * We use hardware flow control to prevent over running the remote
2260*d6b92ffaSHans Petter Selasky * receive queue. However, data transfers still require space in
2261*d6b92ffaSHans Petter Selasky * the remote rmsg queue, or we risk losing notification that data
2262*d6b92ffaSHans Petter Selasky * has been transfered.
2263*d6b92ffaSHans Petter Selasky *
2264*d6b92ffaSHans Petter Selasky * Be careful with race conditions in the check below. The target SGL
2265*d6b92ffaSHans Petter Selasky * may be updated by a remote RDMA write.
2266*d6b92ffaSHans Petter Selasky */
rs_can_send(struct rsocket * rs)2267*d6b92ffaSHans Petter Selasky static int rs_can_send(struct rsocket *rs)
2268*d6b92ffaSHans Petter Selasky {
2269*d6b92ffaSHans Petter Selasky if (!(rs->opts & RS_OPT_MSG_SEND)) {
2270*d6b92ffaSHans Petter Selasky return rs->sqe_avail && (rs->sbuf_bytes_avail >= RS_SNDLOWAT) &&
2271*d6b92ffaSHans Petter Selasky (rs->sseq_no != rs->sseq_comp) &&
2272*d6b92ffaSHans Petter Selasky (rs->target_sgl[rs->target_sge].length != 0);
2273*d6b92ffaSHans Petter Selasky } else {
2274*d6b92ffaSHans Petter Selasky return (rs->sqe_avail >= 2) && (rs->sbuf_bytes_avail >= RS_SNDLOWAT) &&
2275*d6b92ffaSHans Petter Selasky (rs->sseq_no != rs->sseq_comp) &&
2276*d6b92ffaSHans Petter Selasky (rs->target_sgl[rs->target_sge].length != 0);
2277*d6b92ffaSHans Petter Selasky }
2278*d6b92ffaSHans Petter Selasky }
2279*d6b92ffaSHans Petter Selasky
ds_can_send(struct rsocket * rs)2280*d6b92ffaSHans Petter Selasky static int ds_can_send(struct rsocket *rs)
2281*d6b92ffaSHans Petter Selasky {
2282*d6b92ffaSHans Petter Selasky return rs->sqe_avail;
2283*d6b92ffaSHans Petter Selasky }
2284*d6b92ffaSHans Petter Selasky
ds_all_sends_done(struct rsocket * rs)2285*d6b92ffaSHans Petter Selasky static int ds_all_sends_done(struct rsocket *rs)
2286*d6b92ffaSHans Petter Selasky {
2287*d6b92ffaSHans Petter Selasky return rs->sqe_avail == rs->sq_size;
2288*d6b92ffaSHans Petter Selasky }
2289*d6b92ffaSHans Petter Selasky
rs_conn_can_send(struct rsocket * rs)2290*d6b92ffaSHans Petter Selasky static int rs_conn_can_send(struct rsocket *rs)
2291*d6b92ffaSHans Petter Selasky {
2292*d6b92ffaSHans Petter Selasky return rs_can_send(rs) || !(rs->state & rs_writable);
2293*d6b92ffaSHans Petter Selasky }
2294*d6b92ffaSHans Petter Selasky
rs_conn_can_send_ctrl(struct rsocket * rs)2295*d6b92ffaSHans Petter Selasky static int rs_conn_can_send_ctrl(struct rsocket *rs)
2296*d6b92ffaSHans Petter Selasky {
2297*d6b92ffaSHans Petter Selasky return rs_ctrl_avail(rs) || !(rs->state & rs_connected);
2298*d6b92ffaSHans Petter Selasky }
2299*d6b92ffaSHans Petter Selasky
rs_have_rdata(struct rsocket * rs)2300*d6b92ffaSHans Petter Selasky static int rs_have_rdata(struct rsocket *rs)
2301*d6b92ffaSHans Petter Selasky {
2302*d6b92ffaSHans Petter Selasky return (rs->rmsg_head != rs->rmsg_tail);
2303*d6b92ffaSHans Petter Selasky }
2304*d6b92ffaSHans Petter Selasky
rs_conn_have_rdata(struct rsocket * rs)2305*d6b92ffaSHans Petter Selasky static int rs_conn_have_rdata(struct rsocket *rs)
2306*d6b92ffaSHans Petter Selasky {
2307*d6b92ffaSHans Petter Selasky return rs_have_rdata(rs) || !(rs->state & rs_readable);
2308*d6b92ffaSHans Petter Selasky }
2309*d6b92ffaSHans Petter Selasky
rs_conn_all_sends_done(struct rsocket * rs)2310*d6b92ffaSHans Petter Selasky static int rs_conn_all_sends_done(struct rsocket *rs)
2311*d6b92ffaSHans Petter Selasky {
2312*d6b92ffaSHans Petter Selasky return ((((int) rs->ctrl_max_seqno) - ((int) rs->ctrl_seqno)) +
2313*d6b92ffaSHans Petter Selasky rs->sqe_avail == rs->sq_size) ||
2314*d6b92ffaSHans Petter Selasky !(rs->state & rs_connected);
2315*d6b92ffaSHans Petter Selasky }
2316*d6b92ffaSHans Petter Selasky
ds_set_src(struct sockaddr * addr,socklen_t * addrlen,struct ds_header * hdr)2317*d6b92ffaSHans Petter Selasky static void ds_set_src(struct sockaddr *addr, socklen_t *addrlen,
2318*d6b92ffaSHans Petter Selasky struct ds_header *hdr)
2319*d6b92ffaSHans Petter Selasky {
2320*d6b92ffaSHans Petter Selasky union socket_addr sa;
2321*d6b92ffaSHans Petter Selasky
2322*d6b92ffaSHans Petter Selasky memset(&sa, 0, sizeof sa);
2323*d6b92ffaSHans Petter Selasky if (hdr->version == 4) {
2324*d6b92ffaSHans Petter Selasky if (*addrlen > sizeof(sa.sin))
2325*d6b92ffaSHans Petter Selasky *addrlen = sizeof(sa.sin);
2326*d6b92ffaSHans Petter Selasky
2327*d6b92ffaSHans Petter Selasky sa.sin.sin_family = AF_INET;
2328*d6b92ffaSHans Petter Selasky sa.sin.sin_port = hdr->port;
2329*d6b92ffaSHans Petter Selasky sa.sin.sin_addr.s_addr = hdr->addr.ipv4;
2330*d6b92ffaSHans Petter Selasky } else {
2331*d6b92ffaSHans Petter Selasky if (*addrlen > sizeof(sa.sin6))
2332*d6b92ffaSHans Petter Selasky *addrlen = sizeof(sa.sin6);
2333*d6b92ffaSHans Petter Selasky
2334*d6b92ffaSHans Petter Selasky sa.sin6.sin6_family = AF_INET6;
2335*d6b92ffaSHans Petter Selasky sa.sin6.sin6_port = hdr->port;
2336*d6b92ffaSHans Petter Selasky sa.sin6.sin6_flowinfo = hdr->addr.ipv6.flowinfo;
2337*d6b92ffaSHans Petter Selasky memcpy(&sa.sin6.sin6_addr, &hdr->addr.ipv6.addr, 16);
2338*d6b92ffaSHans Petter Selasky }
2339*d6b92ffaSHans Petter Selasky memcpy(addr, &sa, *addrlen);
2340*d6b92ffaSHans Petter Selasky }
2341*d6b92ffaSHans Petter Selasky
ds_recvfrom(struct rsocket * rs,void * buf,size_t len,int flags,struct sockaddr * src_addr,socklen_t * addrlen)2342*d6b92ffaSHans Petter Selasky static ssize_t ds_recvfrom(struct rsocket *rs, void *buf, size_t len, int flags,
2343*d6b92ffaSHans Petter Selasky struct sockaddr *src_addr, socklen_t *addrlen)
2344*d6b92ffaSHans Petter Selasky {
2345*d6b92ffaSHans Petter Selasky struct ds_rmsg *rmsg;
2346*d6b92ffaSHans Petter Selasky struct ds_header *hdr;
2347*d6b92ffaSHans Petter Selasky int ret;
2348*d6b92ffaSHans Petter Selasky
2349*d6b92ffaSHans Petter Selasky if (!(rs->state & rs_readable))
2350*d6b92ffaSHans Petter Selasky return ERR(EINVAL);
2351*d6b92ffaSHans Petter Selasky
2352*d6b92ffaSHans Petter Selasky if (!rs_have_rdata(rs)) {
2353*d6b92ffaSHans Petter Selasky ret = ds_get_comp(rs, rs_nonblocking(rs, flags),
2354*d6b92ffaSHans Petter Selasky rs_have_rdata);
2355*d6b92ffaSHans Petter Selasky if (ret)
2356*d6b92ffaSHans Petter Selasky return ret;
2357*d6b92ffaSHans Petter Selasky }
2358*d6b92ffaSHans Petter Selasky
2359*d6b92ffaSHans Petter Selasky rmsg = &rs->dmsg[rs->rmsg_head];
2360*d6b92ffaSHans Petter Selasky hdr = (struct ds_header *) (rmsg->qp->rbuf + rmsg->offset);
2361*d6b92ffaSHans Petter Selasky if (len > rmsg->length - hdr->length)
2362*d6b92ffaSHans Petter Selasky len = rmsg->length - hdr->length;
2363*d6b92ffaSHans Petter Selasky
2364*d6b92ffaSHans Petter Selasky memcpy(buf, (void *) hdr + hdr->length, len);
2365*d6b92ffaSHans Petter Selasky if (addrlen)
2366*d6b92ffaSHans Petter Selasky ds_set_src(src_addr, addrlen, hdr);
2367*d6b92ffaSHans Petter Selasky
2368*d6b92ffaSHans Petter Selasky if (!(flags & MSG_PEEK)) {
2369*d6b92ffaSHans Petter Selasky ds_post_recv(rs, rmsg->qp, rmsg->offset);
2370*d6b92ffaSHans Petter Selasky if (++rs->rmsg_head == rs->rq_size + 1)
2371*d6b92ffaSHans Petter Selasky rs->rmsg_head = 0;
2372*d6b92ffaSHans Petter Selasky rs->rqe_avail++;
2373*d6b92ffaSHans Petter Selasky }
2374*d6b92ffaSHans Petter Selasky
2375*d6b92ffaSHans Petter Selasky return len;
2376*d6b92ffaSHans Petter Selasky }
2377*d6b92ffaSHans Petter Selasky
rs_peek(struct rsocket * rs,void * buf,size_t len)2378*d6b92ffaSHans Petter Selasky static ssize_t rs_peek(struct rsocket *rs, void *buf, size_t len)
2379*d6b92ffaSHans Petter Selasky {
2380*d6b92ffaSHans Petter Selasky size_t left = len;
2381*d6b92ffaSHans Petter Selasky uint32_t end_size, rsize;
2382*d6b92ffaSHans Petter Selasky int rmsg_head, rbuf_offset;
2383*d6b92ffaSHans Petter Selasky
2384*d6b92ffaSHans Petter Selasky rmsg_head = rs->rmsg_head;
2385*d6b92ffaSHans Petter Selasky rbuf_offset = rs->rbuf_offset;
2386*d6b92ffaSHans Petter Selasky
2387*d6b92ffaSHans Petter Selasky for (; left && (rmsg_head != rs->rmsg_tail); left -= rsize) {
2388*d6b92ffaSHans Petter Selasky if (left < rs->rmsg[rmsg_head].data) {
2389*d6b92ffaSHans Petter Selasky rsize = left;
2390*d6b92ffaSHans Petter Selasky } else {
2391*d6b92ffaSHans Petter Selasky rsize = rs->rmsg[rmsg_head].data;
2392*d6b92ffaSHans Petter Selasky if (++rmsg_head == rs->rq_size + 1)
2393*d6b92ffaSHans Petter Selasky rmsg_head = 0;
2394*d6b92ffaSHans Petter Selasky }
2395*d6b92ffaSHans Petter Selasky
2396*d6b92ffaSHans Petter Selasky end_size = rs->rbuf_size - rbuf_offset;
2397*d6b92ffaSHans Petter Selasky if (rsize > end_size) {
2398*d6b92ffaSHans Petter Selasky memcpy(buf, &rs->rbuf[rbuf_offset], end_size);
2399*d6b92ffaSHans Petter Selasky rbuf_offset = 0;
2400*d6b92ffaSHans Petter Selasky buf += end_size;
2401*d6b92ffaSHans Petter Selasky rsize -= end_size;
2402*d6b92ffaSHans Petter Selasky left -= end_size;
2403*d6b92ffaSHans Petter Selasky }
2404*d6b92ffaSHans Petter Selasky memcpy(buf, &rs->rbuf[rbuf_offset], rsize);
2405*d6b92ffaSHans Petter Selasky rbuf_offset += rsize;
2406*d6b92ffaSHans Petter Selasky buf += rsize;
2407*d6b92ffaSHans Petter Selasky }
2408*d6b92ffaSHans Petter Selasky
2409*d6b92ffaSHans Petter Selasky return len - left;
2410*d6b92ffaSHans Petter Selasky }
2411*d6b92ffaSHans Petter Selasky
2412*d6b92ffaSHans Petter Selasky /*
2413*d6b92ffaSHans Petter Selasky * Continue to receive any queued data even if the remote side has disconnected.
2414*d6b92ffaSHans Petter Selasky */
rrecv(int socket,void * buf,size_t len,int flags)2415*d6b92ffaSHans Petter Selasky ssize_t rrecv(int socket, void *buf, size_t len, int flags)
2416*d6b92ffaSHans Petter Selasky {
2417*d6b92ffaSHans Petter Selasky struct rsocket *rs;
2418*d6b92ffaSHans Petter Selasky size_t left = len;
2419*d6b92ffaSHans Petter Selasky uint32_t end_size, rsize;
2420*d6b92ffaSHans Petter Selasky int ret = 0;
2421*d6b92ffaSHans Petter Selasky
2422*d6b92ffaSHans Petter Selasky rs = idm_at(&idm, socket);
2423*d6b92ffaSHans Petter Selasky if (rs->type == SOCK_DGRAM) {
2424*d6b92ffaSHans Petter Selasky fastlock_acquire(&rs->rlock);
2425*d6b92ffaSHans Petter Selasky ret = ds_recvfrom(rs, buf, len, flags, NULL, NULL);
2426*d6b92ffaSHans Petter Selasky fastlock_release(&rs->rlock);
2427*d6b92ffaSHans Petter Selasky return ret;
2428*d6b92ffaSHans Petter Selasky }
2429*d6b92ffaSHans Petter Selasky
2430*d6b92ffaSHans Petter Selasky if (rs->state & rs_opening) {
2431*d6b92ffaSHans Petter Selasky ret = rs_do_connect(rs);
2432*d6b92ffaSHans Petter Selasky if (ret) {
2433*d6b92ffaSHans Petter Selasky if (errno == EINPROGRESS)
2434*d6b92ffaSHans Petter Selasky errno = EAGAIN;
2435*d6b92ffaSHans Petter Selasky return ret;
2436*d6b92ffaSHans Petter Selasky }
2437*d6b92ffaSHans Petter Selasky }
2438*d6b92ffaSHans Petter Selasky fastlock_acquire(&rs->rlock);
2439*d6b92ffaSHans Petter Selasky do {
2440*d6b92ffaSHans Petter Selasky if (!rs_have_rdata(rs)) {
2441*d6b92ffaSHans Petter Selasky ret = rs_get_comp(rs, rs_nonblocking(rs, flags),
2442*d6b92ffaSHans Petter Selasky rs_conn_have_rdata);
2443*d6b92ffaSHans Petter Selasky if (ret)
2444*d6b92ffaSHans Petter Selasky break;
2445*d6b92ffaSHans Petter Selasky }
2446*d6b92ffaSHans Petter Selasky
2447*d6b92ffaSHans Petter Selasky if (flags & MSG_PEEK) {
2448*d6b92ffaSHans Petter Selasky left = len - rs_peek(rs, buf, left);
2449*d6b92ffaSHans Petter Selasky break;
2450*d6b92ffaSHans Petter Selasky }
2451*d6b92ffaSHans Petter Selasky
2452*d6b92ffaSHans Petter Selasky for (; left && rs_have_rdata(rs); left -= rsize) {
2453*d6b92ffaSHans Petter Selasky if (left < rs->rmsg[rs->rmsg_head].data) {
2454*d6b92ffaSHans Petter Selasky rsize = left;
2455*d6b92ffaSHans Petter Selasky rs->rmsg[rs->rmsg_head].data -= left;
2456*d6b92ffaSHans Petter Selasky } else {
2457*d6b92ffaSHans Petter Selasky rs->rseq_no++;
2458*d6b92ffaSHans Petter Selasky rsize = rs->rmsg[rs->rmsg_head].data;
2459*d6b92ffaSHans Petter Selasky if (++rs->rmsg_head == rs->rq_size + 1)
2460*d6b92ffaSHans Petter Selasky rs->rmsg_head = 0;
2461*d6b92ffaSHans Petter Selasky }
2462*d6b92ffaSHans Petter Selasky
2463*d6b92ffaSHans Petter Selasky end_size = rs->rbuf_size - rs->rbuf_offset;
2464*d6b92ffaSHans Petter Selasky if (rsize > end_size) {
2465*d6b92ffaSHans Petter Selasky memcpy(buf, &rs->rbuf[rs->rbuf_offset], end_size);
2466*d6b92ffaSHans Petter Selasky rs->rbuf_offset = 0;
2467*d6b92ffaSHans Petter Selasky buf += end_size;
2468*d6b92ffaSHans Petter Selasky rsize -= end_size;
2469*d6b92ffaSHans Petter Selasky left -= end_size;
2470*d6b92ffaSHans Petter Selasky rs->rbuf_bytes_avail += end_size;
2471*d6b92ffaSHans Petter Selasky }
2472*d6b92ffaSHans Petter Selasky memcpy(buf, &rs->rbuf[rs->rbuf_offset], rsize);
2473*d6b92ffaSHans Petter Selasky rs->rbuf_offset += rsize;
2474*d6b92ffaSHans Petter Selasky buf += rsize;
2475*d6b92ffaSHans Petter Selasky rs->rbuf_bytes_avail += rsize;
2476*d6b92ffaSHans Petter Selasky }
2477*d6b92ffaSHans Petter Selasky
2478*d6b92ffaSHans Petter Selasky } while (left && (flags & MSG_WAITALL) && (rs->state & rs_readable));
2479*d6b92ffaSHans Petter Selasky
2480*d6b92ffaSHans Petter Selasky fastlock_release(&rs->rlock);
2481*d6b92ffaSHans Petter Selasky return (ret && left == len) ? ret : len - left;
2482*d6b92ffaSHans Petter Selasky }
2483*d6b92ffaSHans Petter Selasky
rrecvfrom(int socket,void * buf,size_t len,int flags,struct sockaddr * src_addr,socklen_t * addrlen)2484*d6b92ffaSHans Petter Selasky ssize_t rrecvfrom(int socket, void *buf, size_t len, int flags,
2485*d6b92ffaSHans Petter Selasky struct sockaddr *src_addr, socklen_t *addrlen)
2486*d6b92ffaSHans Petter Selasky {
2487*d6b92ffaSHans Petter Selasky struct rsocket *rs;
2488*d6b92ffaSHans Petter Selasky int ret;
2489*d6b92ffaSHans Petter Selasky
2490*d6b92ffaSHans Petter Selasky rs = idm_at(&idm, socket);
2491*d6b92ffaSHans Petter Selasky if (rs->type == SOCK_DGRAM) {
2492*d6b92ffaSHans Petter Selasky fastlock_acquire(&rs->rlock);
2493*d6b92ffaSHans Petter Selasky ret = ds_recvfrom(rs, buf, len, flags, src_addr, addrlen);
2494*d6b92ffaSHans Petter Selasky fastlock_release(&rs->rlock);
2495*d6b92ffaSHans Petter Selasky return ret;
2496*d6b92ffaSHans Petter Selasky }
2497*d6b92ffaSHans Petter Selasky
2498*d6b92ffaSHans Petter Selasky ret = rrecv(socket, buf, len, flags);
2499*d6b92ffaSHans Petter Selasky if (ret > 0 && src_addr)
2500*d6b92ffaSHans Petter Selasky rgetpeername(socket, src_addr, addrlen);
2501*d6b92ffaSHans Petter Selasky
2502*d6b92ffaSHans Petter Selasky return ret;
2503*d6b92ffaSHans Petter Selasky }
2504*d6b92ffaSHans Petter Selasky
/*
 * Simple, straightforward implementation for now that only tries to fill
 * in the first vector.
 *
 * iovcnt is not consulted and iov[0] is read unconditionally, so the
 * caller must supply at least one vector.
 */
static ssize_t rrecvv(int socket, const struct iovec *iov, int iovcnt, int flags)
{
	const struct iovec *first = &iov[0];

	return rrecv(socket, first->iov_base, first->iov_len, flags);
}
2513*d6b92ffaSHans Petter Selasky
/*
 * recvmsg()-style receive on an rsocket.
 *
 * Ancillary data is not supported: a message with a control buffer of
 * non-zero length fails with ENOTSUP.  Otherwise the request is passed
 * to rrecvv() with the caller's flags.
 *
 * The flags argument carries the receive flags (MSG_PEEK, MSG_DONTWAIT,
 * MSG_WAITALL, ...).  msg->msg_flags is an output field for recvmsg()
 * and must not be interpreted as input, so it is not read here.
 */
ssize_t rrecvmsg(int socket, struct msghdr *msg, int flags)
{
	if (msg->msg_control && msg->msg_controllen)
		return ERR(ENOTSUP);

	return rrecvv(socket, msg->msg_iov, (int) msg->msg_iovlen, flags);
}
2521*d6b92ffaSHans Petter Selasky
/*
 * read()-style wrapper: receives up to count bytes through rrecv() with
 * no flags set.
 */
ssize_t rread(int socket, void *buf, size_t count)
{
	const int no_flags = 0;

	return rrecv(socket, buf, count, no_flags);
}
2526*d6b92ffaSHans Petter Selasky
/*
 * readv()-style wrapper: forwards the vector to rrecvv() with no flags
 * set.
 */
ssize_t rreadv(int socket, const struct iovec *iov, int iovcnt)
{
	const int no_flags = 0;

	return rrecvv(socket, iov, iovcnt, no_flags);
}
2531*d6b92ffaSHans Petter Selasky
rs_send_iomaps(struct rsocket * rs,int flags)2532*d6b92ffaSHans Petter Selasky static int rs_send_iomaps(struct rsocket *rs, int flags)
2533*d6b92ffaSHans Petter Selasky {
2534*d6b92ffaSHans Petter Selasky struct rs_iomap_mr *iomr;
2535*d6b92ffaSHans Petter Selasky struct ibv_sge sge;
2536*d6b92ffaSHans Petter Selasky struct rs_iomap iom;
2537*d6b92ffaSHans Petter Selasky int ret;
2538*d6b92ffaSHans Petter Selasky
2539*d6b92ffaSHans Petter Selasky fastlock_acquire(&rs->map_lock);
2540*d6b92ffaSHans Petter Selasky while (!dlist_empty(&rs->iomap_queue)) {
2541*d6b92ffaSHans Petter Selasky if (!rs_can_send(rs)) {
2542*d6b92ffaSHans Petter Selasky ret = rs_get_comp(rs, rs_nonblocking(rs, flags),
2543*d6b92ffaSHans Petter Selasky rs_conn_can_send);
2544*d6b92ffaSHans Petter Selasky if (ret)
2545*d6b92ffaSHans Petter Selasky break;
2546*d6b92ffaSHans Petter Selasky if (!(rs->state & rs_writable)) {
2547*d6b92ffaSHans Petter Selasky ret = ERR(ECONNRESET);
2548*d6b92ffaSHans Petter Selasky break;
2549*d6b92ffaSHans Petter Selasky }
2550*d6b92ffaSHans Petter Selasky }
2551*d6b92ffaSHans Petter Selasky
2552*d6b92ffaSHans Petter Selasky iomr = container_of(rs->iomap_queue.next, struct rs_iomap_mr, entry);
2553*d6b92ffaSHans Petter Selasky if (!(rs->opts & RS_OPT_SWAP_SGL)) {
2554*d6b92ffaSHans Petter Selasky iom.offset = iomr->offset;
2555*d6b92ffaSHans Petter Selasky iom.sge.addr = (uintptr_t) iomr->mr->addr;
2556*d6b92ffaSHans Petter Selasky iom.sge.length = iomr->mr->length;
2557*d6b92ffaSHans Petter Selasky iom.sge.key = iomr->mr->rkey;
2558*d6b92ffaSHans Petter Selasky } else {
2559*d6b92ffaSHans Petter Selasky iom.offset = bswap_64(iomr->offset);
2560*d6b92ffaSHans Petter Selasky iom.sge.addr = bswap_64((uintptr_t) iomr->mr->addr);
2561*d6b92ffaSHans Petter Selasky iom.sge.length = bswap_32(iomr->mr->length);
2562*d6b92ffaSHans Petter Selasky iom.sge.key = bswap_32(iomr->mr->rkey);
2563*d6b92ffaSHans Petter Selasky }
2564*d6b92ffaSHans Petter Selasky
2565*d6b92ffaSHans Petter Selasky if (rs->sq_inline >= sizeof iom) {
2566*d6b92ffaSHans Petter Selasky sge.addr = (uintptr_t) &iom;
2567*d6b92ffaSHans Petter Selasky sge.length = sizeof iom;
2568*d6b92ffaSHans Petter Selasky sge.lkey = 0;
2569*d6b92ffaSHans Petter Selasky ret = rs_write_iomap(rs, iomr, &sge, 1, IBV_SEND_INLINE);
2570*d6b92ffaSHans Petter Selasky } else if (rs_sbuf_left(rs) >= sizeof iom) {
2571*d6b92ffaSHans Petter Selasky memcpy((void *) (uintptr_t) rs->ssgl[0].addr, &iom, sizeof iom);
2572*d6b92ffaSHans Petter Selasky rs->ssgl[0].length = sizeof iom;
2573*d6b92ffaSHans Petter Selasky ret = rs_write_iomap(rs, iomr, rs->ssgl, 1, 0);
2574*d6b92ffaSHans Petter Selasky if (rs_sbuf_left(rs) > sizeof iom)
2575*d6b92ffaSHans Petter Selasky rs->ssgl[0].addr += sizeof iom;
2576*d6b92ffaSHans Petter Selasky else
2577*d6b92ffaSHans Petter Selasky rs->ssgl[0].addr = (uintptr_t) rs->sbuf;
2578*d6b92ffaSHans Petter Selasky } else {
2579*d6b92ffaSHans Petter Selasky rs->ssgl[0].length = rs_sbuf_left(rs);
2580*d6b92ffaSHans Petter Selasky memcpy((void *) (uintptr_t) rs->ssgl[0].addr, &iom,
2581*d6b92ffaSHans Petter Selasky rs->ssgl[0].length);
2582*d6b92ffaSHans Petter Selasky rs->ssgl[1].length = sizeof iom - rs->ssgl[0].length;
2583*d6b92ffaSHans Petter Selasky memcpy(rs->sbuf, ((void *) &iom) + rs->ssgl[0].length,
2584*d6b92ffaSHans Petter Selasky rs->ssgl[1].length);
2585*d6b92ffaSHans Petter Selasky ret = rs_write_iomap(rs, iomr, rs->ssgl, 2, 0);
2586*d6b92ffaSHans Petter Selasky rs->ssgl[0].addr = (uintptr_t) rs->sbuf + rs->ssgl[1].length;
2587*d6b92ffaSHans Petter Selasky }
2588*d6b92ffaSHans Petter Selasky dlist_remove(&iomr->entry);
2589*d6b92ffaSHans Petter Selasky dlist_insert_tail(&iomr->entry, &rs->iomap_list);
2590*d6b92ffaSHans Petter Selasky if (ret)
2591*d6b92ffaSHans Petter Selasky break;
2592*d6b92ffaSHans Petter Selasky }
2593*d6b92ffaSHans Petter Selasky
2594*d6b92ffaSHans Petter Selasky rs->iomap_pending = !dlist_empty(&rs->iomap_queue);
2595*d6b92ffaSHans Petter Selasky fastlock_release(&rs->map_lock);
2596*d6b92ffaSHans Petter Selasky return ret;
2597*d6b92ffaSHans Petter Selasky }
2598*d6b92ffaSHans Petter Selasky
ds_sendv_udp(struct rsocket * rs,const struct iovec * iov,int iovcnt,int flags,uint8_t op)2599*d6b92ffaSHans Petter Selasky static ssize_t ds_sendv_udp(struct rsocket *rs, const struct iovec *iov,
2600*d6b92ffaSHans Petter Selasky int iovcnt, int flags, uint8_t op)
2601*d6b92ffaSHans Petter Selasky {
2602*d6b92ffaSHans Petter Selasky struct ds_udp_header hdr;
2603*d6b92ffaSHans Petter Selasky struct msghdr msg;
2604*d6b92ffaSHans Petter Selasky struct iovec miov[8];
2605*d6b92ffaSHans Petter Selasky ssize_t ret;
2606*d6b92ffaSHans Petter Selasky
2607*d6b92ffaSHans Petter Selasky if (iovcnt > 8)
2608*d6b92ffaSHans Petter Selasky return ERR(ENOTSUP);
2609*d6b92ffaSHans Petter Selasky
2610*d6b92ffaSHans Petter Selasky hdr.tag = htobe32(DS_UDP_TAG);
2611*d6b92ffaSHans Petter Selasky hdr.version = rs->conn_dest->qp->hdr.version;
2612*d6b92ffaSHans Petter Selasky hdr.op = op;
2613*d6b92ffaSHans Petter Selasky hdr.reserved = 0;
2614*d6b92ffaSHans Petter Selasky hdr.qpn = htobe32(rs->conn_dest->qp->cm_id->qp->qp_num & 0xFFFFFF);
2615*d6b92ffaSHans Petter Selasky if (rs->conn_dest->qp->hdr.version == 4) {
2616*d6b92ffaSHans Petter Selasky hdr.length = DS_UDP_IPV4_HDR_LEN;
2617*d6b92ffaSHans Petter Selasky hdr.addr.ipv4 = rs->conn_dest->qp->hdr.addr.ipv4;
2618*d6b92ffaSHans Petter Selasky } else {
2619*d6b92ffaSHans Petter Selasky hdr.length = DS_UDP_IPV6_HDR_LEN;
2620*d6b92ffaSHans Petter Selasky memcpy(hdr.addr.ipv6, &rs->conn_dest->qp->hdr.addr.ipv6, 16);
2621*d6b92ffaSHans Petter Selasky }
2622*d6b92ffaSHans Petter Selasky
2623*d6b92ffaSHans Petter Selasky miov[0].iov_base = &hdr;
2624*d6b92ffaSHans Petter Selasky miov[0].iov_len = hdr.length;
2625*d6b92ffaSHans Petter Selasky if (iov && iovcnt)
2626*d6b92ffaSHans Petter Selasky memcpy(&miov[1], iov, sizeof(*iov) * iovcnt);
2627*d6b92ffaSHans Petter Selasky
2628*d6b92ffaSHans Petter Selasky memset(&msg, 0, sizeof msg);
2629*d6b92ffaSHans Petter Selasky msg.msg_name = &rs->conn_dest->addr;
2630*d6b92ffaSHans Petter Selasky msg.msg_namelen = ucma_addrlen(&rs->conn_dest->addr.sa);
2631*d6b92ffaSHans Petter Selasky msg.msg_iov = miov;
2632*d6b92ffaSHans Petter Selasky msg.msg_iovlen = iovcnt + 1;
2633*d6b92ffaSHans Petter Selasky ret = sendmsg(rs->udp_sock, &msg, flags);
2634*d6b92ffaSHans Petter Selasky return ret > 0 ? ret - hdr.length : ret;
2635*d6b92ffaSHans Petter Selasky }
2636*d6b92ffaSHans Petter Selasky
/*
 * Single-buffer front end for ds_sendv_udp().
 *
 * A NULL buf or a zero len sends a message with no payload vectors;
 * otherwise buf/len is wrapped in a one-element vector.
 */
static ssize_t ds_send_udp(struct rsocket *rs, const void *buf, size_t len,
			   int flags, uint8_t op)
{
	struct iovec iov;

	if (!(buf && len))
		return ds_sendv_udp(rs, NULL, 0, flags, op);

	iov.iov_base = (void *) buf;
	iov.iov_len = len;
	return ds_sendv_udp(rs, &iov, 1, flags, op);
}
2649*d6b92ffaSHans Petter Selasky
dsend(struct rsocket * rs,const void * buf,size_t len,int flags)2650*d6b92ffaSHans Petter Selasky static ssize_t dsend(struct rsocket *rs, const void *buf, size_t len, int flags)
2651*d6b92ffaSHans Petter Selasky {
2652*d6b92ffaSHans Petter Selasky struct ds_smsg *msg;
2653*d6b92ffaSHans Petter Selasky struct ibv_sge sge;
2654*d6b92ffaSHans Petter Selasky uint64_t offset;
2655*d6b92ffaSHans Petter Selasky int ret = 0;
2656*d6b92ffaSHans Petter Selasky
2657*d6b92ffaSHans Petter Selasky if (!rs->conn_dest->ah)
2658*d6b92ffaSHans Petter Selasky return ds_send_udp(rs, buf, len, flags, RS_OP_DATA);
2659*d6b92ffaSHans Petter Selasky
2660*d6b92ffaSHans Petter Selasky if (!ds_can_send(rs)) {
2661*d6b92ffaSHans Petter Selasky ret = ds_get_comp(rs, rs_nonblocking(rs, flags), ds_can_send);
2662*d6b92ffaSHans Petter Selasky if (ret)
2663*d6b92ffaSHans Petter Selasky return ret;
2664*d6b92ffaSHans Petter Selasky }
2665*d6b92ffaSHans Petter Selasky
2666*d6b92ffaSHans Petter Selasky msg = rs->smsg_free;
2667*d6b92ffaSHans Petter Selasky rs->smsg_free = msg->next;
2668*d6b92ffaSHans Petter Selasky rs->sqe_avail--;
2669*d6b92ffaSHans Petter Selasky
2670*d6b92ffaSHans Petter Selasky memcpy((void *) msg, &rs->conn_dest->qp->hdr, rs->conn_dest->qp->hdr.length);
2671*d6b92ffaSHans Petter Selasky memcpy((void *) msg + rs->conn_dest->qp->hdr.length, buf, len);
2672*d6b92ffaSHans Petter Selasky sge.addr = (uintptr_t) msg;
2673*d6b92ffaSHans Petter Selasky sge.length = rs->conn_dest->qp->hdr.length + len;
2674*d6b92ffaSHans Petter Selasky sge.lkey = rs->conn_dest->qp->smr->lkey;
2675*d6b92ffaSHans Petter Selasky offset = (uint8_t *) msg - rs->sbuf;
2676*d6b92ffaSHans Petter Selasky
2677*d6b92ffaSHans Petter Selasky ret = ds_post_send(rs, &sge, offset);
2678*d6b92ffaSHans Petter Selasky return ret ? ret : len;
2679*d6b92ffaSHans Petter Selasky }
2680*d6b92ffaSHans Petter Selasky
2681*d6b92ffaSHans Petter Selasky /*
2682*d6b92ffaSHans Petter Selasky * We overlap sending the data, by posting a small work request immediately,
2683*d6b92ffaSHans Petter Selasky * then increasing the size of the send on each iteration.
2684*d6b92ffaSHans Petter Selasky */
rsend(int socket,const void * buf,size_t len,int flags)2685*d6b92ffaSHans Petter Selasky ssize_t rsend(int socket, const void *buf, size_t len, int flags)
2686*d6b92ffaSHans Petter Selasky {
2687*d6b92ffaSHans Petter Selasky struct rsocket *rs;
2688*d6b92ffaSHans Petter Selasky struct ibv_sge sge;
2689*d6b92ffaSHans Petter Selasky size_t left = len;
2690*d6b92ffaSHans Petter Selasky uint32_t xfer_size, olen = RS_OLAP_START_SIZE;
2691*d6b92ffaSHans Petter Selasky int ret = 0;
2692*d6b92ffaSHans Petter Selasky
2693*d6b92ffaSHans Petter Selasky rs = idm_at(&idm, socket);
2694*d6b92ffaSHans Petter Selasky if (rs->type == SOCK_DGRAM) {
2695*d6b92ffaSHans Petter Selasky fastlock_acquire(&rs->slock);
2696*d6b92ffaSHans Petter Selasky ret = dsend(rs, buf, len, flags);
2697*d6b92ffaSHans Petter Selasky fastlock_release(&rs->slock);
2698*d6b92ffaSHans Petter Selasky return ret;
2699*d6b92ffaSHans Petter Selasky }
2700*d6b92ffaSHans Petter Selasky
2701*d6b92ffaSHans Petter Selasky if (rs->state & rs_opening) {
2702*d6b92ffaSHans Petter Selasky ret = rs_do_connect(rs);
2703*d6b92ffaSHans Petter Selasky if (ret) {
2704*d6b92ffaSHans Petter Selasky if (errno == EINPROGRESS)
2705*d6b92ffaSHans Petter Selasky errno = EAGAIN;
2706*d6b92ffaSHans Petter Selasky return ret;
2707*d6b92ffaSHans Petter Selasky }
2708*d6b92ffaSHans Petter Selasky }
2709*d6b92ffaSHans Petter Selasky
2710*d6b92ffaSHans Petter Selasky fastlock_acquire(&rs->slock);
2711*d6b92ffaSHans Petter Selasky if (rs->iomap_pending) {
2712*d6b92ffaSHans Petter Selasky ret = rs_send_iomaps(rs, flags);
2713*d6b92ffaSHans Petter Selasky if (ret)
2714*d6b92ffaSHans Petter Selasky goto out;
2715*d6b92ffaSHans Petter Selasky }
2716*d6b92ffaSHans Petter Selasky for (; left; left -= xfer_size, buf += xfer_size) {
2717*d6b92ffaSHans Petter Selasky if (!rs_can_send(rs)) {
2718*d6b92ffaSHans Petter Selasky ret = rs_get_comp(rs, rs_nonblocking(rs, flags),
2719*d6b92ffaSHans Petter Selasky rs_conn_can_send);
2720*d6b92ffaSHans Petter Selasky if (ret)
2721*d6b92ffaSHans Petter Selasky break;
2722*d6b92ffaSHans Petter Selasky if (!(rs->state & rs_writable)) {
2723*d6b92ffaSHans Petter Selasky ret = ERR(ECONNRESET);
2724*d6b92ffaSHans Petter Selasky break;
2725*d6b92ffaSHans Petter Selasky }
2726*d6b92ffaSHans Petter Selasky }
2727*d6b92ffaSHans Petter Selasky
2728*d6b92ffaSHans Petter Selasky if (olen < left) {
2729*d6b92ffaSHans Petter Selasky xfer_size = olen;
2730*d6b92ffaSHans Petter Selasky if (olen < RS_MAX_TRANSFER)
2731*d6b92ffaSHans Petter Selasky olen <<= 1;
2732*d6b92ffaSHans Petter Selasky } else {
2733*d6b92ffaSHans Petter Selasky xfer_size = left;
2734*d6b92ffaSHans Petter Selasky }
2735*d6b92ffaSHans Petter Selasky
2736*d6b92ffaSHans Petter Selasky if (xfer_size > rs->sbuf_bytes_avail)
2737*d6b92ffaSHans Petter Selasky xfer_size = rs->sbuf_bytes_avail;
2738*d6b92ffaSHans Petter Selasky if (xfer_size > rs->target_sgl[rs->target_sge].length)
2739*d6b92ffaSHans Petter Selasky xfer_size = rs->target_sgl[rs->target_sge].length;
2740*d6b92ffaSHans Petter Selasky
2741*d6b92ffaSHans Petter Selasky if (xfer_size <= rs->sq_inline) {
2742*d6b92ffaSHans Petter Selasky sge.addr = (uintptr_t) buf;
2743*d6b92ffaSHans Petter Selasky sge.length = xfer_size;
2744*d6b92ffaSHans Petter Selasky sge.lkey = 0;
2745*d6b92ffaSHans Petter Selasky ret = rs_write_data(rs, &sge, 1, xfer_size, IBV_SEND_INLINE);
2746*d6b92ffaSHans Petter Selasky } else if (xfer_size <= rs_sbuf_left(rs)) {
2747*d6b92ffaSHans Petter Selasky memcpy((void *) (uintptr_t) rs->ssgl[0].addr, buf, xfer_size);
2748*d6b92ffaSHans Petter Selasky rs->ssgl[0].length = xfer_size;
2749*d6b92ffaSHans Petter Selasky ret = rs_write_data(rs, rs->ssgl, 1, xfer_size, 0);
2750*d6b92ffaSHans Petter Selasky if (xfer_size < rs_sbuf_left(rs))
2751*d6b92ffaSHans Petter Selasky rs->ssgl[0].addr += xfer_size;
2752*d6b92ffaSHans Petter Selasky else
2753*d6b92ffaSHans Petter Selasky rs->ssgl[0].addr = (uintptr_t) rs->sbuf;
2754*d6b92ffaSHans Petter Selasky } else {
2755*d6b92ffaSHans Petter Selasky rs->ssgl[0].length = rs_sbuf_left(rs);
2756*d6b92ffaSHans Petter Selasky memcpy((void *) (uintptr_t) rs->ssgl[0].addr, buf,
2757*d6b92ffaSHans Petter Selasky rs->ssgl[0].length);
2758*d6b92ffaSHans Petter Selasky rs->ssgl[1].length = xfer_size - rs->ssgl[0].length;
2759*d6b92ffaSHans Petter Selasky memcpy(rs->sbuf, buf + rs->ssgl[0].length, rs->ssgl[1].length);
2760*d6b92ffaSHans Petter Selasky ret = rs_write_data(rs, rs->ssgl, 2, xfer_size, 0);
2761*d6b92ffaSHans Petter Selasky rs->ssgl[0].addr = (uintptr_t) rs->sbuf + rs->ssgl[1].length;
2762*d6b92ffaSHans Petter Selasky }
2763*d6b92ffaSHans Petter Selasky if (ret)
2764*d6b92ffaSHans Petter Selasky break;
2765*d6b92ffaSHans Petter Selasky }
2766*d6b92ffaSHans Petter Selasky out:
2767*d6b92ffaSHans Petter Selasky fastlock_release(&rs->slock);
2768*d6b92ffaSHans Petter Selasky
2769*d6b92ffaSHans Petter Selasky return (ret && left == len) ? ret : len - left;
2770*d6b92ffaSHans Petter Selasky }
2771*d6b92ffaSHans Petter Selasky
rsendto(int socket,const void * buf,size_t len,int flags,const struct sockaddr * dest_addr,socklen_t addrlen)2772*d6b92ffaSHans Petter Selasky ssize_t rsendto(int socket, const void *buf, size_t len, int flags,
2773*d6b92ffaSHans Petter Selasky const struct sockaddr *dest_addr, socklen_t addrlen)
2774*d6b92ffaSHans Petter Selasky {
2775*d6b92ffaSHans Petter Selasky struct rsocket *rs;
2776*d6b92ffaSHans Petter Selasky int ret;
2777*d6b92ffaSHans Petter Selasky
2778*d6b92ffaSHans Petter Selasky rs = idm_at(&idm, socket);
2779*d6b92ffaSHans Petter Selasky if (rs->type == SOCK_STREAM) {
2780*d6b92ffaSHans Petter Selasky if (dest_addr || addrlen)
2781*d6b92ffaSHans Petter Selasky return ERR(EISCONN);
2782*d6b92ffaSHans Petter Selasky
2783*d6b92ffaSHans Petter Selasky return rsend(socket, buf, len, flags);
2784*d6b92ffaSHans Petter Selasky }
2785*d6b92ffaSHans Petter Selasky
2786*d6b92ffaSHans Petter Selasky if (rs->state == rs_init) {
2787*d6b92ffaSHans Petter Selasky ret = ds_init_ep(rs);
2788*d6b92ffaSHans Petter Selasky if (ret)
2789*d6b92ffaSHans Petter Selasky return ret;
2790*d6b92ffaSHans Petter Selasky }
2791*d6b92ffaSHans Petter Selasky
2792*d6b92ffaSHans Petter Selasky fastlock_acquire(&rs->slock);
2793*d6b92ffaSHans Petter Selasky if (!rs->conn_dest || ds_compare_addr(dest_addr, &rs->conn_dest->addr)) {
2794*d6b92ffaSHans Petter Selasky ret = ds_get_dest(rs, dest_addr, addrlen, &rs->conn_dest);
2795*d6b92ffaSHans Petter Selasky if (ret)
2796*d6b92ffaSHans Petter Selasky goto out;
2797*d6b92ffaSHans Petter Selasky }
2798*d6b92ffaSHans Petter Selasky
2799*d6b92ffaSHans Petter Selasky ret = dsend(rs, buf, len, flags);
2800*d6b92ffaSHans Petter Selasky out:
2801*d6b92ffaSHans Petter Selasky fastlock_release(&rs->slock);
2802*d6b92ffaSHans Petter Selasky return ret;
2803*d6b92ffaSHans Petter Selasky }
2804*d6b92ffaSHans Petter Selasky
/*
 * Copy len bytes from an iovec array into the flat buffer dst.
 *
 * *iov / *offset form a cursor: copying starts *offset bytes into the
 * element *iov points at.  On return the cursor has been advanced past the
 * bytes consumed, so consecutive calls continue where the previous one
 * stopped.  When an element is consumed exactly, the cursor moves to the
 * start of the next element.
 *
 * The caller must guarantee that the iovec array holds at least len more
 * bytes; no bounds checking is done here.
 */
static void rs_copy_iov(void *dst, const struct iovec **iov, size_t *offset, size_t len)
{
	/* Byte-typed cursors: arithmetic on void * is not standard C. */
	char *out = dst;
	const char *src;
	size_t avail;

	while (len) {
		src = (const char *) (*iov)->iov_base + *offset;
		avail = (*iov)->iov_len - *offset;

		if (avail > len) {
			/* Request ends inside this element; stay on it. */
			memcpy(out, src, len);
			*offset += len;
			break;
		}

		/* Drain the rest of this element and step to the next one. */
		memcpy(out, src, avail);
		len -= avail;
		out += avail;
		(*iov)++;
		*offset = 0;
	}
}
2824*d6b92ffaSHans Petter Selasky
rsendv(int socket,const struct iovec * iov,int iovcnt,int flags)2825*d6b92ffaSHans Petter Selasky static ssize_t rsendv(int socket, const struct iovec *iov, int iovcnt, int flags)
2826*d6b92ffaSHans Petter Selasky {
2827*d6b92ffaSHans Petter Selasky struct rsocket *rs;
2828*d6b92ffaSHans Petter Selasky const struct iovec *cur_iov;
2829*d6b92ffaSHans Petter Selasky size_t left, len, offset = 0;
2830*d6b92ffaSHans Petter Selasky uint32_t xfer_size, olen = RS_OLAP_START_SIZE;
2831*d6b92ffaSHans Petter Selasky int i, ret = 0;
2832*d6b92ffaSHans Petter Selasky
2833*d6b92ffaSHans Petter Selasky rs = idm_at(&idm, socket);
2834*d6b92ffaSHans Petter Selasky if (rs->state & rs_opening) {
2835*d6b92ffaSHans Petter Selasky ret = rs_do_connect(rs);
2836*d6b92ffaSHans Petter Selasky if (ret) {
2837*d6b92ffaSHans Petter Selasky if (errno == EINPROGRESS)
2838*d6b92ffaSHans Petter Selasky errno = EAGAIN;
2839*d6b92ffaSHans Petter Selasky return ret;
2840*d6b92ffaSHans Petter Selasky }
2841*d6b92ffaSHans Petter Selasky }
2842*d6b92ffaSHans Petter Selasky
2843*d6b92ffaSHans Petter Selasky cur_iov = iov;
2844*d6b92ffaSHans Petter Selasky len = iov[0].iov_len;
2845*d6b92ffaSHans Petter Selasky for (i = 1; i < iovcnt; i++)
2846*d6b92ffaSHans Petter Selasky len += iov[i].iov_len;
2847*d6b92ffaSHans Petter Selasky left = len;
2848*d6b92ffaSHans Petter Selasky
2849*d6b92ffaSHans Petter Selasky fastlock_acquire(&rs->slock);
2850*d6b92ffaSHans Petter Selasky if (rs->iomap_pending) {
2851*d6b92ffaSHans Petter Selasky ret = rs_send_iomaps(rs, flags);
2852*d6b92ffaSHans Petter Selasky if (ret)
2853*d6b92ffaSHans Petter Selasky goto out;
2854*d6b92ffaSHans Petter Selasky }
2855*d6b92ffaSHans Petter Selasky for (; left; left -= xfer_size) {
2856*d6b92ffaSHans Petter Selasky if (!rs_can_send(rs)) {
2857*d6b92ffaSHans Petter Selasky ret = rs_get_comp(rs, rs_nonblocking(rs, flags),
2858*d6b92ffaSHans Petter Selasky rs_conn_can_send);
2859*d6b92ffaSHans Petter Selasky if (ret)
2860*d6b92ffaSHans Petter Selasky break;
2861*d6b92ffaSHans Petter Selasky if (!(rs->state & rs_writable)) {
2862*d6b92ffaSHans Petter Selasky ret = ERR(ECONNRESET);
2863*d6b92ffaSHans Petter Selasky break;
2864*d6b92ffaSHans Petter Selasky }
2865*d6b92ffaSHans Petter Selasky }
2866*d6b92ffaSHans Petter Selasky
2867*d6b92ffaSHans Petter Selasky if (olen < left) {
2868*d6b92ffaSHans Petter Selasky xfer_size = olen;
2869*d6b92ffaSHans Petter Selasky if (olen < RS_MAX_TRANSFER)
2870*d6b92ffaSHans Petter Selasky olen <<= 1;
2871*d6b92ffaSHans Petter Selasky } else {
2872*d6b92ffaSHans Petter Selasky xfer_size = left;
2873*d6b92ffaSHans Petter Selasky }
2874*d6b92ffaSHans Petter Selasky
2875*d6b92ffaSHans Petter Selasky if (xfer_size > rs->sbuf_bytes_avail)
2876*d6b92ffaSHans Petter Selasky xfer_size = rs->sbuf_bytes_avail;
2877*d6b92ffaSHans Petter Selasky if (xfer_size > rs->target_sgl[rs->target_sge].length)
2878*d6b92ffaSHans Petter Selasky xfer_size = rs->target_sgl[rs->target_sge].length;
2879*d6b92ffaSHans Petter Selasky
2880*d6b92ffaSHans Petter Selasky if (xfer_size <= rs_sbuf_left(rs)) {
2881*d6b92ffaSHans Petter Selasky rs_copy_iov((void *) (uintptr_t) rs->ssgl[0].addr,
2882*d6b92ffaSHans Petter Selasky &cur_iov, &offset, xfer_size);
2883*d6b92ffaSHans Petter Selasky rs->ssgl[0].length = xfer_size;
2884*d6b92ffaSHans Petter Selasky ret = rs_write_data(rs, rs->ssgl, 1, xfer_size,
2885*d6b92ffaSHans Petter Selasky xfer_size <= rs->sq_inline ? IBV_SEND_INLINE : 0);
2886*d6b92ffaSHans Petter Selasky if (xfer_size < rs_sbuf_left(rs))
2887*d6b92ffaSHans Petter Selasky rs->ssgl[0].addr += xfer_size;
2888*d6b92ffaSHans Petter Selasky else
2889*d6b92ffaSHans Petter Selasky rs->ssgl[0].addr = (uintptr_t) rs->sbuf;
2890*d6b92ffaSHans Petter Selasky } else {
2891*d6b92ffaSHans Petter Selasky rs->ssgl[0].length = rs_sbuf_left(rs);
2892*d6b92ffaSHans Petter Selasky rs_copy_iov((void *) (uintptr_t) rs->ssgl[0].addr, &cur_iov,
2893*d6b92ffaSHans Petter Selasky &offset, rs->ssgl[0].length);
2894*d6b92ffaSHans Petter Selasky rs->ssgl[1].length = xfer_size - rs->ssgl[0].length;
2895*d6b92ffaSHans Petter Selasky rs_copy_iov(rs->sbuf, &cur_iov, &offset, rs->ssgl[1].length);
2896*d6b92ffaSHans Petter Selasky ret = rs_write_data(rs, rs->ssgl, 2, xfer_size,
2897*d6b92ffaSHans Petter Selasky xfer_size <= rs->sq_inline ? IBV_SEND_INLINE : 0);
2898*d6b92ffaSHans Petter Selasky rs->ssgl[0].addr = (uintptr_t) rs->sbuf + rs->ssgl[1].length;
2899*d6b92ffaSHans Petter Selasky }
2900*d6b92ffaSHans Petter Selasky if (ret)
2901*d6b92ffaSHans Petter Selasky break;
2902*d6b92ffaSHans Petter Selasky }
2903*d6b92ffaSHans Petter Selasky out:
2904*d6b92ffaSHans Petter Selasky fastlock_release(&rs->slock);
2905*d6b92ffaSHans Petter Selasky
2906*d6b92ffaSHans Petter Selasky return (ret && left == len) ? ret : len - left;
2907*d6b92ffaSHans Petter Selasky }
2908*d6b92ffaSHans Petter Selasky
rsendmsg(int socket,const struct msghdr * msg,int flags)2909*d6b92ffaSHans Petter Selasky ssize_t rsendmsg(int socket, const struct msghdr *msg, int flags)
2910*d6b92ffaSHans Petter Selasky {
2911*d6b92ffaSHans Petter Selasky if (msg->msg_control && msg->msg_controllen)
2912*d6b92ffaSHans Petter Selasky return ERR(ENOTSUP);
2913*d6b92ffaSHans Petter Selasky
2914*d6b92ffaSHans Petter Selasky return rsendv(socket, msg->msg_iov, (int) msg->msg_iovlen, flags);
2915*d6b92ffaSHans Petter Selasky }
2916*d6b92ffaSHans Petter Selasky
/* rwrite - write()-style entry point: rsend() with no flags set. */
ssize_t rwrite(int socket, const void *buf, size_t count)
{
	const int no_flags = 0;

	return rsend(socket, buf, count, no_flags);
}
2921*d6b92ffaSHans Petter Selasky
/* rwritev - writev()-style entry point: rsendv() with no flags set. */
ssize_t rwritev(int socket, const struct iovec *iov, int iovcnt)
{
	const int no_flags = 0;

	return rsendv(socket, iov, iovcnt, no_flags);
}
2926*d6b92ffaSHans Petter Selasky
/*
 * Return a per-thread scratch array with room for at least nfds pollfd
 * entries.
 *
 * The array is cached in thread-local storage and only reallocated when a
 * larger one is needed; its previous contents are not preserved.  At least
 * one entry is always allocated so that a request for zero descriptors is
 * not mistaken for an allocation failure.
 *
 * Returns NULL only when memory cannot be obtained (or the requested size
 * would overflow); in that case the cache is left empty.
 */
static struct pollfd *rs_fds_alloc(nfds_t nfds)
{
	static __thread struct pollfd *rfds;
	static __thread nfds_t rnfds;
	nfds_t want = nfds ? nfds : 1;

	if (want > rnfds) {
		/* free(NULL) is a no-op, so no guard is needed. */
		free(rfds);
		rfds = NULL;
		rnfds = 0;

		/* Refuse sizes whose byte count would wrap around. */
		if (want <= (size_t) -1 / sizeof(*rfds)) {
			rfds = malloc(sizeof(*rfds) * want);
			if (rfds)
				rnfds = want;
		}
	}

	return rfds;
}
2942*d6b92ffaSHans Petter Selasky
rs_poll_rs(struct rsocket * rs,int events,int nonblock,int (* test)(struct rsocket * rs))2943*d6b92ffaSHans Petter Selasky static int rs_poll_rs(struct rsocket *rs, int events,
2944*d6b92ffaSHans Petter Selasky int nonblock, int (*test)(struct rsocket *rs))
2945*d6b92ffaSHans Petter Selasky {
2946*d6b92ffaSHans Petter Selasky struct pollfd fds;
2947*d6b92ffaSHans Petter Selasky short revents;
2948*d6b92ffaSHans Petter Selasky int ret;
2949*d6b92ffaSHans Petter Selasky
2950*d6b92ffaSHans Petter Selasky check_cq:
2951*d6b92ffaSHans Petter Selasky if ((rs->type == SOCK_STREAM) && ((rs->state & rs_connected) ||
2952*d6b92ffaSHans Petter Selasky (rs->state == rs_disconnected) || (rs->state & rs_error))) {
2953*d6b92ffaSHans Petter Selasky rs_process_cq(rs, nonblock, test);
2954*d6b92ffaSHans Petter Selasky
2955*d6b92ffaSHans Petter Selasky revents = 0;
2956*d6b92ffaSHans Petter Selasky if ((events & POLLIN) && rs_conn_have_rdata(rs))
2957*d6b92ffaSHans Petter Selasky revents |= POLLIN;
2958*d6b92ffaSHans Petter Selasky if ((events & POLLOUT) && rs_can_send(rs))
2959*d6b92ffaSHans Petter Selasky revents |= POLLOUT;
2960*d6b92ffaSHans Petter Selasky if (!(rs->state & rs_connected)) {
2961*d6b92ffaSHans Petter Selasky if (rs->state == rs_disconnected)
2962*d6b92ffaSHans Petter Selasky revents |= POLLHUP;
2963*d6b92ffaSHans Petter Selasky else
2964*d6b92ffaSHans Petter Selasky revents |= POLLERR;
2965*d6b92ffaSHans Petter Selasky }
2966*d6b92ffaSHans Petter Selasky
2967*d6b92ffaSHans Petter Selasky return revents;
2968*d6b92ffaSHans Petter Selasky } else if (rs->type == SOCK_DGRAM) {
2969*d6b92ffaSHans Petter Selasky ds_process_cqs(rs, nonblock, test);
2970*d6b92ffaSHans Petter Selasky
2971*d6b92ffaSHans Petter Selasky revents = 0;
2972*d6b92ffaSHans Petter Selasky if ((events & POLLIN) && rs_have_rdata(rs))
2973*d6b92ffaSHans Petter Selasky revents |= POLLIN;
2974*d6b92ffaSHans Petter Selasky if ((events & POLLOUT) && ds_can_send(rs))
2975*d6b92ffaSHans Petter Selasky revents |= POLLOUT;
2976*d6b92ffaSHans Petter Selasky
2977*d6b92ffaSHans Petter Selasky return revents;
2978*d6b92ffaSHans Petter Selasky }
2979*d6b92ffaSHans Petter Selasky
2980*d6b92ffaSHans Petter Selasky if (rs->state == rs_listening) {
2981*d6b92ffaSHans Petter Selasky fds.fd = rs->cm_id->channel->fd;
2982*d6b92ffaSHans Petter Selasky fds.events = events;
2983*d6b92ffaSHans Petter Selasky fds.revents = 0;
2984*d6b92ffaSHans Petter Selasky poll(&fds, 1, 0);
2985*d6b92ffaSHans Petter Selasky return fds.revents;
2986*d6b92ffaSHans Petter Selasky }
2987*d6b92ffaSHans Petter Selasky
2988*d6b92ffaSHans Petter Selasky if (rs->state & rs_opening) {
2989*d6b92ffaSHans Petter Selasky ret = rs_do_connect(rs);
2990*d6b92ffaSHans Petter Selasky if (ret && (errno == EINPROGRESS)) {
2991*d6b92ffaSHans Petter Selasky errno = 0;
2992*d6b92ffaSHans Petter Selasky } else {
2993*d6b92ffaSHans Petter Selasky goto check_cq;
2994*d6b92ffaSHans Petter Selasky }
2995*d6b92ffaSHans Petter Selasky }
2996*d6b92ffaSHans Petter Selasky
2997*d6b92ffaSHans Petter Selasky if (rs->state == rs_connect_error) {
2998*d6b92ffaSHans Petter Selasky revents = 0;
2999*d6b92ffaSHans Petter Selasky if (events & POLLOUT)
3000*d6b92ffaSHans Petter Selasky revents |= POLLOUT;
3001*d6b92ffaSHans Petter Selasky if (events & POLLIN)
3002*d6b92ffaSHans Petter Selasky revents |= POLLIN;
3003*d6b92ffaSHans Petter Selasky revents |= POLLERR;
3004*d6b92ffaSHans Petter Selasky return revents;
3005*d6b92ffaSHans Petter Selasky }
3006*d6b92ffaSHans Petter Selasky
3007*d6b92ffaSHans Petter Selasky return 0;
3008*d6b92ffaSHans Petter Selasky }
3009*d6b92ffaSHans Petter Selasky
rs_poll_check(struct pollfd * fds,nfds_t nfds)3010*d6b92ffaSHans Petter Selasky static int rs_poll_check(struct pollfd *fds, nfds_t nfds)
3011*d6b92ffaSHans Petter Selasky {
3012*d6b92ffaSHans Petter Selasky struct rsocket *rs;
3013*d6b92ffaSHans Petter Selasky int i, cnt = 0;
3014*d6b92ffaSHans Petter Selasky
3015*d6b92ffaSHans Petter Selasky for (i = 0; i < nfds; i++) {
3016*d6b92ffaSHans Petter Selasky rs = idm_lookup(&idm, fds[i].fd);
3017*d6b92ffaSHans Petter Selasky if (rs)
3018*d6b92ffaSHans Petter Selasky fds[i].revents = rs_poll_rs(rs, fds[i].events, 1, rs_poll_all);
3019*d6b92ffaSHans Petter Selasky else
3020*d6b92ffaSHans Petter Selasky poll(&fds[i], 1, 0);
3021*d6b92ffaSHans Petter Selasky
3022*d6b92ffaSHans Petter Selasky if (fds[i].revents)
3023*d6b92ffaSHans Petter Selasky cnt++;
3024*d6b92ffaSHans Petter Selasky }
3025*d6b92ffaSHans Petter Selasky return cnt;
3026*d6b92ffaSHans Petter Selasky }
3027*d6b92ffaSHans Petter Selasky
rs_poll_arm(struct pollfd * rfds,struct pollfd * fds,nfds_t nfds)3028*d6b92ffaSHans Petter Selasky static int rs_poll_arm(struct pollfd *rfds, struct pollfd *fds, nfds_t nfds)
3029*d6b92ffaSHans Petter Selasky {
3030*d6b92ffaSHans Petter Selasky struct rsocket *rs;
3031*d6b92ffaSHans Petter Selasky int i;
3032*d6b92ffaSHans Petter Selasky
3033*d6b92ffaSHans Petter Selasky for (i = 0; i < nfds; i++) {
3034*d6b92ffaSHans Petter Selasky rs = idm_lookup(&idm, fds[i].fd);
3035*d6b92ffaSHans Petter Selasky if (rs) {
3036*d6b92ffaSHans Petter Selasky fds[i].revents = rs_poll_rs(rs, fds[i].events, 0, rs_is_cq_armed);
3037*d6b92ffaSHans Petter Selasky if (fds[i].revents)
3038*d6b92ffaSHans Petter Selasky return 1;
3039*d6b92ffaSHans Petter Selasky
3040*d6b92ffaSHans Petter Selasky if (rs->type == SOCK_STREAM) {
3041*d6b92ffaSHans Petter Selasky if (rs->state >= rs_connected)
3042*d6b92ffaSHans Petter Selasky rfds[i].fd = rs->cm_id->recv_cq_channel->fd;
3043*d6b92ffaSHans Petter Selasky else
3044*d6b92ffaSHans Petter Selasky rfds[i].fd = rs->cm_id->channel->fd;
3045*d6b92ffaSHans Petter Selasky } else {
3046*d6b92ffaSHans Petter Selasky rfds[i].fd = rs->epfd;
3047*d6b92ffaSHans Petter Selasky }
3048*d6b92ffaSHans Petter Selasky rfds[i].events = POLLIN;
3049*d6b92ffaSHans Petter Selasky } else {
3050*d6b92ffaSHans Petter Selasky rfds[i].fd = fds[i].fd;
3051*d6b92ffaSHans Petter Selasky rfds[i].events = fds[i].events;
3052*d6b92ffaSHans Petter Selasky }
3053*d6b92ffaSHans Petter Selasky rfds[i].revents = 0;
3054*d6b92ffaSHans Petter Selasky }
3055*d6b92ffaSHans Petter Selasky return 0;
3056*d6b92ffaSHans Petter Selasky }
3057*d6b92ffaSHans Petter Selasky
rs_poll_events(struct pollfd * rfds,struct pollfd * fds,nfds_t nfds)3058*d6b92ffaSHans Petter Selasky static int rs_poll_events(struct pollfd *rfds, struct pollfd *fds, nfds_t nfds)
3059*d6b92ffaSHans Petter Selasky {
3060*d6b92ffaSHans Petter Selasky struct rsocket *rs;
3061*d6b92ffaSHans Petter Selasky int i, cnt = 0;
3062*d6b92ffaSHans Petter Selasky
3063*d6b92ffaSHans Petter Selasky for (i = 0; i < nfds; i++) {
3064*d6b92ffaSHans Petter Selasky if (!rfds[i].revents)
3065*d6b92ffaSHans Petter Selasky continue;
3066*d6b92ffaSHans Petter Selasky
3067*d6b92ffaSHans Petter Selasky rs = idm_lookup(&idm, fds[i].fd);
3068*d6b92ffaSHans Petter Selasky if (rs) {
3069*d6b92ffaSHans Petter Selasky fastlock_acquire(&rs->cq_wait_lock);
3070*d6b92ffaSHans Petter Selasky if (rs->type == SOCK_STREAM)
3071*d6b92ffaSHans Petter Selasky rs_get_cq_event(rs);
3072*d6b92ffaSHans Petter Selasky else
3073*d6b92ffaSHans Petter Selasky ds_get_cq_event(rs);
3074*d6b92ffaSHans Petter Selasky fastlock_release(&rs->cq_wait_lock);
3075*d6b92ffaSHans Petter Selasky fds[i].revents = rs_poll_rs(rs, fds[i].events, 1, rs_poll_all);
3076*d6b92ffaSHans Petter Selasky } else {
3077*d6b92ffaSHans Petter Selasky fds[i].revents = rfds[i].revents;
3078*d6b92ffaSHans Petter Selasky }
3079*d6b92ffaSHans Petter Selasky if (fds[i].revents)
3080*d6b92ffaSHans Petter Selasky cnt++;
3081*d6b92ffaSHans Petter Selasky }
3082*d6b92ffaSHans Petter Selasky return cnt;
3083*d6b92ffaSHans Petter Selasky }
3084*d6b92ffaSHans Petter Selasky
3085*d6b92ffaSHans Petter Selasky /*
3086*d6b92ffaSHans Petter Selasky * We need to poll *all* fd's that the user specifies at least once.
3087*d6b92ffaSHans Petter Selasky * Note that we may receive events on an rsocket that may not be reported
3088*d6b92ffaSHans Petter Selasky * to the user (e.g. connection events or credit updates). Process those
3089*d6b92ffaSHans Petter Selasky * events, then return to polling until we find ones of interest.
3090*d6b92ffaSHans Petter Selasky */
rpoll(struct pollfd * fds,nfds_t nfds,int timeout)3091*d6b92ffaSHans Petter Selasky int rpoll(struct pollfd *fds, nfds_t nfds, int timeout)
3092*d6b92ffaSHans Petter Selasky {
3093*d6b92ffaSHans Petter Selasky struct timeval s, e;
3094*d6b92ffaSHans Petter Selasky struct pollfd *rfds;
3095*d6b92ffaSHans Petter Selasky uint32_t poll_time = 0;
3096*d6b92ffaSHans Petter Selasky int ret;
3097*d6b92ffaSHans Petter Selasky
3098*d6b92ffaSHans Petter Selasky do {
3099*d6b92ffaSHans Petter Selasky ret = rs_poll_check(fds, nfds);
3100*d6b92ffaSHans Petter Selasky if (ret || !timeout)
3101*d6b92ffaSHans Petter Selasky return ret;
3102*d6b92ffaSHans Petter Selasky
3103*d6b92ffaSHans Petter Selasky if (!poll_time)
3104*d6b92ffaSHans Petter Selasky gettimeofday(&s, NULL);
3105*d6b92ffaSHans Petter Selasky
3106*d6b92ffaSHans Petter Selasky gettimeofday(&e, NULL);
3107*d6b92ffaSHans Petter Selasky poll_time = (e.tv_sec - s.tv_sec) * 1000000 +
3108*d6b92ffaSHans Petter Selasky (e.tv_usec - s.tv_usec) + 1;
3109*d6b92ffaSHans Petter Selasky } while (poll_time <= polling_time);
3110*d6b92ffaSHans Petter Selasky
3111*d6b92ffaSHans Petter Selasky rfds = rs_fds_alloc(nfds);
3112*d6b92ffaSHans Petter Selasky if (!rfds)
3113*d6b92ffaSHans Petter Selasky return ERR(ENOMEM);
3114*d6b92ffaSHans Petter Selasky
3115*d6b92ffaSHans Petter Selasky do {
3116*d6b92ffaSHans Petter Selasky ret = rs_poll_arm(rfds, fds, nfds);
3117*d6b92ffaSHans Petter Selasky if (ret)
3118*d6b92ffaSHans Petter Selasky break;
3119*d6b92ffaSHans Petter Selasky
3120*d6b92ffaSHans Petter Selasky ret = poll(rfds, nfds, timeout);
3121*d6b92ffaSHans Petter Selasky if (ret <= 0)
3122*d6b92ffaSHans Petter Selasky break;
3123*d6b92ffaSHans Petter Selasky
3124*d6b92ffaSHans Petter Selasky ret = rs_poll_events(rfds, fds, nfds);
3125*d6b92ffaSHans Petter Selasky } while (!ret);
3126*d6b92ffaSHans Petter Selasky
3127*d6b92ffaSHans Petter Selasky return ret;
3128*d6b92ffaSHans Petter Selasky }
3129*d6b92ffaSHans Petter Selasky
/*
 * Convert select()-style descriptor sets into a newly allocated pollfd array.
 *
 * Every descriptor in [0, *nfds) that is a member of at least one of the
 * given sets gets one entry: POLLIN if it is in readfds, POLLOUT if it is in
 * writefds, and no requested events if it is only in exceptfds.  Any of the
 * set pointers may be NULL.
 *
 * On success *nfds is replaced by the number of entries produced and the
 * array is returned; the caller owns it and must free() it.  The array is
 * non-NULL even when no descriptor is selected.  Returns NULL if *nfds is
 * negative or memory cannot be allocated.
 */
static struct pollfd *
rs_select_to_poll(int *nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds)
{
	struct pollfd *fds;
	short events;
	int fd, wanted, i = 0;

	if (*nfds < 0)
		return NULL;

	/* calloc(0, ...) may return NULL, which callers treat as failure. */
	fds = calloc(*nfds ? (size_t) *nfds : 1, sizeof(*fds));
	if (!fds)
		return NULL;

	for (fd = 0; fd < *nfds; fd++) {
		/*
		 * Track membership explicitly: testing the stored fd for
		 * non-zero would drop descriptor 0 and let its event bits
		 * leak into the entry of the next descriptor.
		 */
		events = 0;
		wanted = 0;

		if (readfds && FD_ISSET(fd, readfds)) {
			events |= POLLIN;
			wanted = 1;
		}

		if (writefds && FD_ISSET(fd, writefds)) {
			events |= POLLOUT;
			wanted = 1;
		}

		if (exceptfds && FD_ISSET(fd, exceptfds))
			wanted = 1;

		if (!wanted)
			continue;

		fds[i].fd = fd;
		fds[i].events = events;
		i++;
	}

	*nfds = i;
	return fds;
}
3161*d6b92ffaSHans Petter Selasky
/*
 * Fold poll() results back into select()-style descriptor sets.
 *
 * For each of the first nfds entries:
 *   - POLLIN or POLLHUP          sets the descriptor in readfds
 *   - POLLOUT                    sets the descriptor in writefds
 *   - any bit other than POLLIN/POLLOUT sets the descriptor in exceptfds
 * A NULL set is skipped.  The sets are only added to, never cleared here.
 *
 * Returns the total number of bits set across the three sets.
 */
static int
rs_poll_to_select(int nfds, struct pollfd *fds, fd_set *readfds,
		  fd_set *writefds, fd_set *exceptfds)
{
	int idx, total = 0;
	short got;

	for (idx = 0; idx < nfds; idx++) {
		got = fds[idx].revents;

		if (readfds && (got & (POLLIN | POLLHUP))) {
			FD_SET(fds[idx].fd, readfds);
			total += 1;
		}

		if (writefds && (got & POLLOUT)) {
			FD_SET(fds[idx].fd, writefds);
			total += 1;
		}

		if (exceptfds && (got & ~(POLLIN | POLLOUT))) {
			FD_SET(fds[idx].fd, exceptfds);
			total += 1;
		}
	}
	return total;
}
3186*d6b92ffaSHans Petter Selasky
/*
 * Convert a select()-style timeout into a poll()-style millisecond count.
 *
 * A NULL timeout means "wait forever" and maps to -1.  Microseconds are
 * rounded up so a short non-zero timeout does not collapse into a
 * non-blocking call, and the result is clamped to the largest int so a very
 * long timeout cannot wrap into a negative or short one.  A timeout that
 * evaluates negative is reported as -1.
 */
static int rs_convert_timeout(struct timeval *timeout)
{
	const long long max_ms = (long long) (~0U >> 1);	/* largest int */
	long long ms;

	if (!timeout)
		return -1;

	ms = (long long) timeout->tv_sec * 1000 +
	     ((long long) timeout->tv_usec + 999) / 1000;
	if (ms < 0)
		return -1;
	if (ms > max_ms)
		ms = max_ms;

	return (int) ms;
}
3192*d6b92ffaSHans Petter Selasky
/*
 * rselect - select() for a mix of rsockets and ordinary descriptors.
 *
 * The descriptor sets are converted to a pollfd array, rpoll() does the
 * waiting, and the results are converted back.  All supplied sets are
 * cleared before results are written, so they are empty on timeout or
 * error.
 *
 * Returns the value of rs_poll_to_select() when rpoll() reported events,
 * otherwise rpoll()'s own return value; -1/ENOMEM if the pollfd array could
 * not be built.
 */
int rselect(int nfds, fd_set *readfds, fd_set *writefds,
	    fd_set *exceptfds, struct timeval *timeout)
{
	struct pollfd *pfds;
	int ready;

	pfds = rs_select_to_poll(&nfds, readfds, writefds, exceptfds);
	if (pfds == NULL)
		return ERR(ENOMEM);

	ready = rpoll(pfds, nfds, rs_convert_timeout(timeout));

	/* The sets double as output: wipe the request before filling in. */
	if (readfds)
		FD_ZERO(readfds);
	if (writefds)
		FD_ZERO(writefds);
	if (exceptfds)
		FD_ZERO(exceptfds);

	if (ready > 0)
		ready = rs_poll_to_select(nfds, pfds, readfds, writefds, exceptfds);

	free(pfds);
	return ready;
}
3218*d6b92ffaSHans Petter Selasky
3219*d6b92ffaSHans Petter Selasky /*
3220*d6b92ffaSHans Petter Selasky * For graceful disconnect, notify the remote side that we're
3221*d6b92ffaSHans Petter Selasky * disconnecting and wait until all outstanding sends complete, provided
3222*d6b92ffaSHans Petter Selasky * that the remote side has not sent a disconnect message.
3223*d6b92ffaSHans Petter Selasky */
rshutdown(int socket,int how)3224*d6b92ffaSHans Petter Selasky int rshutdown(int socket, int how)
3225*d6b92ffaSHans Petter Selasky {
3226*d6b92ffaSHans Petter Selasky struct rsocket *rs;
3227*d6b92ffaSHans Petter Selasky int ctrl, ret = 0;
3228*d6b92ffaSHans Petter Selasky
3229*d6b92ffaSHans Petter Selasky rs = idm_lookup(&idm, socket);
3230*d6b92ffaSHans Petter Selasky if (!rs)
3231*d6b92ffaSHans Petter Selasky return ERR(EBADF);
3232*d6b92ffaSHans Petter Selasky if (rs->opts & RS_OPT_SVC_ACTIVE)
3233*d6b92ffaSHans Petter Selasky rs_notify_svc(&tcp_svc, rs, RS_SVC_REM_KEEPALIVE);
3234*d6b92ffaSHans Petter Selasky
3235*d6b92ffaSHans Petter Selasky if (rs->fd_flags & O_NONBLOCK)
3236*d6b92ffaSHans Petter Selasky rs_set_nonblocking(rs, 0);
3237*d6b92ffaSHans Petter Selasky
3238*d6b92ffaSHans Petter Selasky if (rs->state & rs_connected) {
3239*d6b92ffaSHans Petter Selasky if (how == SHUT_RDWR) {
3240*d6b92ffaSHans Petter Selasky ctrl = RS_CTRL_DISCONNECT;
3241*d6b92ffaSHans Petter Selasky rs->state &= ~(rs_readable | rs_writable);
3242*d6b92ffaSHans Petter Selasky } else if (how == SHUT_WR) {
3243*d6b92ffaSHans Petter Selasky rs->state &= ~rs_writable;
3244*d6b92ffaSHans Petter Selasky ctrl = (rs->state & rs_readable) ?
3245*d6b92ffaSHans Petter Selasky RS_CTRL_SHUTDOWN : RS_CTRL_DISCONNECT;
3246*d6b92ffaSHans Petter Selasky } else {
3247*d6b92ffaSHans Petter Selasky rs->state &= ~rs_readable;
3248*d6b92ffaSHans Petter Selasky if (rs->state & rs_writable)
3249*d6b92ffaSHans Petter Selasky goto out;
3250*d6b92ffaSHans Petter Selasky ctrl = RS_CTRL_DISCONNECT;
3251*d6b92ffaSHans Petter Selasky }
3252*d6b92ffaSHans Petter Selasky if (!rs_ctrl_avail(rs)) {
3253*d6b92ffaSHans Petter Selasky ret = rs_process_cq(rs, 0, rs_conn_can_send_ctrl);
3254*d6b92ffaSHans Petter Selasky if (ret)
3255*d6b92ffaSHans Petter Selasky goto out;
3256*d6b92ffaSHans Petter Selasky }
3257*d6b92ffaSHans Petter Selasky
3258*d6b92ffaSHans Petter Selasky if ((rs->state & rs_connected) && rs_ctrl_avail(rs)) {
3259*d6b92ffaSHans Petter Selasky rs->ctrl_seqno++;
3260*d6b92ffaSHans Petter Selasky ret = rs_post_msg(rs, rs_msg_set(RS_OP_CTRL, ctrl));
3261*d6b92ffaSHans Petter Selasky }
3262*d6b92ffaSHans Petter Selasky }
3263*d6b92ffaSHans Petter Selasky
3264*d6b92ffaSHans Petter Selasky if (rs->state & rs_connected)
3265*d6b92ffaSHans Petter Selasky rs_process_cq(rs, 0, rs_conn_all_sends_done);
3266*d6b92ffaSHans Petter Selasky
3267*d6b92ffaSHans Petter Selasky out:
3268*d6b92ffaSHans Petter Selasky if ((rs->fd_flags & O_NONBLOCK) && (rs->state & rs_connected))
3269*d6b92ffaSHans Petter Selasky rs_set_nonblocking(rs, rs->fd_flags);
3270*d6b92ffaSHans Petter Selasky
3271*d6b92ffaSHans Petter Selasky if (rs->state & rs_disconnected) {
3272*d6b92ffaSHans Petter Selasky /* Generate event by flushing receives to unblock rpoll */
3273*d6b92ffaSHans Petter Selasky ibv_req_notify_cq(rs->cm_id->recv_cq, 0);
3274*d6b92ffaSHans Petter Selasky ucma_shutdown(rs->cm_id);
3275*d6b92ffaSHans Petter Selasky }
3276*d6b92ffaSHans Petter Selasky
3277*d6b92ffaSHans Petter Selasky return ret;
3278*d6b92ffaSHans Petter Selasky }
3279*d6b92ffaSHans Petter Selasky
ds_shutdown(struct rsocket * rs)3280*d6b92ffaSHans Petter Selasky static void ds_shutdown(struct rsocket *rs)
3281*d6b92ffaSHans Petter Selasky {
3282*d6b92ffaSHans Petter Selasky if (rs->opts & RS_OPT_SVC_ACTIVE)
3283*d6b92ffaSHans Petter Selasky rs_notify_svc(&udp_svc, rs, RS_SVC_REM_DGRAM);
3284*d6b92ffaSHans Petter Selasky
3285*d6b92ffaSHans Petter Selasky if (rs->fd_flags & O_NONBLOCK)
3286*d6b92ffaSHans Petter Selasky rs_set_nonblocking(rs, 0);
3287*d6b92ffaSHans Petter Selasky
3288*d6b92ffaSHans Petter Selasky rs->state &= ~(rs_readable | rs_writable);
3289*d6b92ffaSHans Petter Selasky ds_process_cqs(rs, 0, ds_all_sends_done);
3290*d6b92ffaSHans Petter Selasky
3291*d6b92ffaSHans Petter Selasky if (rs->fd_flags & O_NONBLOCK)
3292*d6b92ffaSHans Petter Selasky rs_set_nonblocking(rs, rs->fd_flags);
3293*d6b92ffaSHans Petter Selasky }
3294*d6b92ffaSHans Petter Selasky
rclose(int socket)3295*d6b92ffaSHans Petter Selasky int rclose(int socket)
3296*d6b92ffaSHans Petter Selasky {
3297*d6b92ffaSHans Petter Selasky struct rsocket *rs;
3298*d6b92ffaSHans Petter Selasky
3299*d6b92ffaSHans Petter Selasky rs = idm_lookup(&idm, socket);
3300*d6b92ffaSHans Petter Selasky if (!rs)
3301*d6b92ffaSHans Petter Selasky return EBADF;
3302*d6b92ffaSHans Petter Selasky if (rs->type == SOCK_STREAM) {
3303*d6b92ffaSHans Petter Selasky if (rs->state & rs_connected)
3304*d6b92ffaSHans Petter Selasky rshutdown(socket, SHUT_RDWR);
3305*d6b92ffaSHans Petter Selasky else if (rs->opts & RS_OPT_SVC_ACTIVE)
3306*d6b92ffaSHans Petter Selasky rs_notify_svc(&tcp_svc, rs, RS_SVC_REM_KEEPALIVE);
3307*d6b92ffaSHans Petter Selasky } else {
3308*d6b92ffaSHans Petter Selasky ds_shutdown(rs);
3309*d6b92ffaSHans Petter Selasky }
3310*d6b92ffaSHans Petter Selasky
3311*d6b92ffaSHans Petter Selasky rs_free(rs);
3312*d6b92ffaSHans Petter Selasky return 0;
3313*d6b92ffaSHans Petter Selasky }
3314*d6b92ffaSHans Petter Selasky
/*
 * Copy a socket address into a caller-supplied buffer, getsockname()-style.
 *
 * The full size of the address is sizeof(struct sockaddr_in) for AF_INET and
 * sizeof(struct sockaddr_in6) for every other family.  At most *len bytes
 * are copied (the address is truncated if the buffer is smaller), and *len
 * is then set to the full size regardless of how much was copied.
 */
static void rs_copy_addr(struct sockaddr *dst, struct sockaddr *src, socklen_t *len)
{
	socklen_t full, copy;

	full = (src->sa_family == AF_INET) ? sizeof(struct sockaddr_in) :
					     sizeof(struct sockaddr_in6);
	copy = (*len < full) ? *len : full;

	*len = full;
	memcpy(dst, src, copy);
}
3328*d6b92ffaSHans Petter Selasky
rgetpeername(int socket,struct sockaddr * addr,socklen_t * addrlen)3329*d6b92ffaSHans Petter Selasky int rgetpeername(int socket, struct sockaddr *addr, socklen_t *addrlen)
3330*d6b92ffaSHans Petter Selasky {
3331*d6b92ffaSHans Petter Selasky struct rsocket *rs;
3332*d6b92ffaSHans Petter Selasky
3333*d6b92ffaSHans Petter Selasky rs = idm_lookup(&idm, socket);
3334*d6b92ffaSHans Petter Selasky if (!rs)
3335*d6b92ffaSHans Petter Selasky return ERR(EBADF);
3336*d6b92ffaSHans Petter Selasky if (rs->type == SOCK_STREAM) {
3337*d6b92ffaSHans Petter Selasky rs_copy_addr(addr, rdma_get_peer_addr(rs->cm_id), addrlen);
3338*d6b92ffaSHans Petter Selasky return 0;
3339*d6b92ffaSHans Petter Selasky } else {
3340*d6b92ffaSHans Petter Selasky return getpeername(rs->udp_sock, addr, addrlen);
3341*d6b92ffaSHans Petter Selasky }
3342*d6b92ffaSHans Petter Selasky }
3343*d6b92ffaSHans Petter Selasky
rgetsockname(int socket,struct sockaddr * addr,socklen_t * addrlen)3344*d6b92ffaSHans Petter Selasky int rgetsockname(int socket, struct sockaddr *addr, socklen_t *addrlen)
3345*d6b92ffaSHans Petter Selasky {
3346*d6b92ffaSHans Petter Selasky struct rsocket *rs;
3347*d6b92ffaSHans Petter Selasky
3348*d6b92ffaSHans Petter Selasky rs = idm_lookup(&idm, socket);
3349*d6b92ffaSHans Petter Selasky if (!rs)
3350*d6b92ffaSHans Petter Selasky return ERR(EBADF);
3351*d6b92ffaSHans Petter Selasky if (rs->type == SOCK_STREAM) {
3352*d6b92ffaSHans Petter Selasky rs_copy_addr(addr, rdma_get_local_addr(rs->cm_id), addrlen);
3353*d6b92ffaSHans Petter Selasky return 0;
3354*d6b92ffaSHans Petter Selasky } else {
3355*d6b92ffaSHans Petter Selasky return getsockname(rs->udp_sock, addr, addrlen);
3356*d6b92ffaSHans Petter Selasky }
3357*d6b92ffaSHans Petter Selasky }
3358*d6b92ffaSHans Petter Selasky
rs_set_keepalive(struct rsocket * rs,int on)3359*d6b92ffaSHans Petter Selasky static int rs_set_keepalive(struct rsocket *rs, int on)
3360*d6b92ffaSHans Petter Selasky {
3361*d6b92ffaSHans Petter Selasky FILE *f;
3362*d6b92ffaSHans Petter Selasky int ret;
3363*d6b92ffaSHans Petter Selasky
3364*d6b92ffaSHans Petter Selasky if ((on && (rs->opts & RS_OPT_SVC_ACTIVE)) ||
3365*d6b92ffaSHans Petter Selasky (!on && !(rs->opts & RS_OPT_SVC_ACTIVE)))
3366*d6b92ffaSHans Petter Selasky return 0;
3367*d6b92ffaSHans Petter Selasky
3368*d6b92ffaSHans Petter Selasky if (on) {
3369*d6b92ffaSHans Petter Selasky if (!rs->keepalive_time) {
3370*d6b92ffaSHans Petter Selasky if ((f = fopen("/proc/sys/net/ipv4/tcp_keepalive_time", "r"))) {
3371*d6b92ffaSHans Petter Selasky if (fscanf(f, "%u", &rs->keepalive_time) != 1)
3372*d6b92ffaSHans Petter Selasky rs->keepalive_time = 7200;
3373*d6b92ffaSHans Petter Selasky fclose(f);
3374*d6b92ffaSHans Petter Selasky } else {
3375*d6b92ffaSHans Petter Selasky rs->keepalive_time = 7200;
3376*d6b92ffaSHans Petter Selasky }
3377*d6b92ffaSHans Petter Selasky }
3378*d6b92ffaSHans Petter Selasky ret = rs_notify_svc(&tcp_svc, rs, RS_SVC_ADD_KEEPALIVE);
3379*d6b92ffaSHans Petter Selasky } else {
3380*d6b92ffaSHans Petter Selasky ret = rs_notify_svc(&tcp_svc, rs, RS_SVC_REM_KEEPALIVE);
3381*d6b92ffaSHans Petter Selasky }
3382*d6b92ffaSHans Petter Selasky
3383*d6b92ffaSHans Petter Selasky return ret;
3384*d6b92ffaSHans Petter Selasky }
3385*d6b92ffaSHans Petter Selasky
rsetsockopt(int socket,int level,int optname,const void * optval,socklen_t optlen)3386*d6b92ffaSHans Petter Selasky int rsetsockopt(int socket, int level, int optname,
3387*d6b92ffaSHans Petter Selasky const void *optval, socklen_t optlen)
3388*d6b92ffaSHans Petter Selasky {
3389*d6b92ffaSHans Petter Selasky struct rsocket *rs;
3390*d6b92ffaSHans Petter Selasky int ret, opt_on = 0;
3391*d6b92ffaSHans Petter Selasky uint64_t *opts = NULL;
3392*d6b92ffaSHans Petter Selasky
3393*d6b92ffaSHans Petter Selasky ret = ERR(ENOTSUP);
3394*d6b92ffaSHans Petter Selasky rs = idm_lookup(&idm, socket);
3395*d6b92ffaSHans Petter Selasky if (!rs)
3396*d6b92ffaSHans Petter Selasky return ERR(EBADF);
3397*d6b92ffaSHans Petter Selasky if (rs->type == SOCK_DGRAM && level != SOL_RDMA) {
3398*d6b92ffaSHans Petter Selasky ret = setsockopt(rs->udp_sock, level, optname, optval, optlen);
3399*d6b92ffaSHans Petter Selasky if (ret)
3400*d6b92ffaSHans Petter Selasky return ret;
3401*d6b92ffaSHans Petter Selasky }
3402*d6b92ffaSHans Petter Selasky
3403*d6b92ffaSHans Petter Selasky switch (level) {
3404*d6b92ffaSHans Petter Selasky case SOL_SOCKET:
3405*d6b92ffaSHans Petter Selasky opts = &rs->so_opts;
3406*d6b92ffaSHans Petter Selasky switch (optname) {
3407*d6b92ffaSHans Petter Selasky case SO_REUSEADDR:
3408*d6b92ffaSHans Petter Selasky if (rs->type == SOCK_STREAM) {
3409*d6b92ffaSHans Petter Selasky ret = rdma_set_option(rs->cm_id, RDMA_OPTION_ID,
3410*d6b92ffaSHans Petter Selasky RDMA_OPTION_ID_REUSEADDR,
3411*d6b92ffaSHans Petter Selasky (void *) optval, optlen);
3412*d6b92ffaSHans Petter Selasky if (ret && ((errno == ENOSYS) || ((rs->state != rs_init) &&
3413*d6b92ffaSHans Petter Selasky rs->cm_id->context &&
3414*d6b92ffaSHans Petter Selasky (rs->cm_id->verbs->device->transport_type == IBV_TRANSPORT_IB))))
3415*d6b92ffaSHans Petter Selasky ret = 0;
3416*d6b92ffaSHans Petter Selasky }
3417*d6b92ffaSHans Petter Selasky opt_on = *(int *) optval;
3418*d6b92ffaSHans Petter Selasky break;
3419*d6b92ffaSHans Petter Selasky case SO_RCVBUF:
3420*d6b92ffaSHans Petter Selasky if ((rs->type == SOCK_STREAM && !rs->rbuf) ||
3421*d6b92ffaSHans Petter Selasky (rs->type == SOCK_DGRAM && !rs->qp_list))
3422*d6b92ffaSHans Petter Selasky rs->rbuf_size = (*(uint32_t *) optval) << 1;
3423*d6b92ffaSHans Petter Selasky ret = 0;
3424*d6b92ffaSHans Petter Selasky break;
3425*d6b92ffaSHans Petter Selasky case SO_SNDBUF:
3426*d6b92ffaSHans Petter Selasky if (!rs->sbuf)
3427*d6b92ffaSHans Petter Selasky rs->sbuf_size = (*(uint32_t *) optval) << 1;
3428*d6b92ffaSHans Petter Selasky if (rs->sbuf_size < RS_SNDLOWAT)
3429*d6b92ffaSHans Petter Selasky rs->sbuf_size = RS_SNDLOWAT << 1;
3430*d6b92ffaSHans Petter Selasky ret = 0;
3431*d6b92ffaSHans Petter Selasky break;
3432*d6b92ffaSHans Petter Selasky case SO_LINGER:
3433*d6b92ffaSHans Petter Selasky /* Invert value so default so_opt = 0 is on */
3434*d6b92ffaSHans Petter Selasky opt_on = !((struct linger *) optval)->l_onoff;
3435*d6b92ffaSHans Petter Selasky ret = 0;
3436*d6b92ffaSHans Petter Selasky break;
3437*d6b92ffaSHans Petter Selasky case SO_KEEPALIVE:
3438*d6b92ffaSHans Petter Selasky ret = rs_set_keepalive(rs, *(int *) optval);
3439*d6b92ffaSHans Petter Selasky opt_on = rs->opts & RS_OPT_SVC_ACTIVE;
3440*d6b92ffaSHans Petter Selasky break;
3441*d6b92ffaSHans Petter Selasky case SO_OOBINLINE:
3442*d6b92ffaSHans Petter Selasky opt_on = *(int *) optval;
3443*d6b92ffaSHans Petter Selasky ret = 0;
3444*d6b92ffaSHans Petter Selasky break;
3445*d6b92ffaSHans Petter Selasky default:
3446*d6b92ffaSHans Petter Selasky break;
3447*d6b92ffaSHans Petter Selasky }
3448*d6b92ffaSHans Petter Selasky break;
3449*d6b92ffaSHans Petter Selasky case IPPROTO_TCP:
3450*d6b92ffaSHans Petter Selasky opts = &rs->tcp_opts;
3451*d6b92ffaSHans Petter Selasky switch (optname) {
3452*d6b92ffaSHans Petter Selasky case TCP_KEEPCNT:
3453*d6b92ffaSHans Petter Selasky case TCP_KEEPINTVL:
3454*d6b92ffaSHans Petter Selasky ret = 0; /* N/A - we're using a reliable connection */
3455*d6b92ffaSHans Petter Selasky break;
3456*d6b92ffaSHans Petter Selasky case TCP_KEEPIDLE:
3457*d6b92ffaSHans Petter Selasky if (*(int *) optval <= 0) {
3458*d6b92ffaSHans Petter Selasky ret = ERR(EINVAL);
3459*d6b92ffaSHans Petter Selasky break;
3460*d6b92ffaSHans Petter Selasky }
3461*d6b92ffaSHans Petter Selasky rs->keepalive_time = *(int *) optval;
3462*d6b92ffaSHans Petter Selasky ret = (rs->opts & RS_OPT_SVC_ACTIVE) ?
3463*d6b92ffaSHans Petter Selasky rs_notify_svc(&tcp_svc, rs, RS_SVC_MOD_KEEPALIVE) : 0;
3464*d6b92ffaSHans Petter Selasky break;
3465*d6b92ffaSHans Petter Selasky case TCP_NODELAY:
3466*d6b92ffaSHans Petter Selasky opt_on = *(int *) optval;
3467*d6b92ffaSHans Petter Selasky ret = 0;
3468*d6b92ffaSHans Petter Selasky break;
3469*d6b92ffaSHans Petter Selasky case TCP_MAXSEG:
3470*d6b92ffaSHans Petter Selasky ret = 0;
3471*d6b92ffaSHans Petter Selasky break;
3472*d6b92ffaSHans Petter Selasky default:
3473*d6b92ffaSHans Petter Selasky break;
3474*d6b92ffaSHans Petter Selasky }
3475*d6b92ffaSHans Petter Selasky break;
3476*d6b92ffaSHans Petter Selasky case IPPROTO_IPV6:
3477*d6b92ffaSHans Petter Selasky opts = &rs->ipv6_opts;
3478*d6b92ffaSHans Petter Selasky switch (optname) {
3479*d6b92ffaSHans Petter Selasky case IPV6_V6ONLY:
3480*d6b92ffaSHans Petter Selasky if (rs->type == SOCK_STREAM) {
3481*d6b92ffaSHans Petter Selasky ret = rdma_set_option(rs->cm_id, RDMA_OPTION_ID,
3482*d6b92ffaSHans Petter Selasky RDMA_OPTION_ID_AFONLY,
3483*d6b92ffaSHans Petter Selasky (void *) optval, optlen);
3484*d6b92ffaSHans Petter Selasky }
3485*d6b92ffaSHans Petter Selasky opt_on = *(int *) optval;
3486*d6b92ffaSHans Petter Selasky break;
3487*d6b92ffaSHans Petter Selasky default:
3488*d6b92ffaSHans Petter Selasky break;
3489*d6b92ffaSHans Petter Selasky }
3490*d6b92ffaSHans Petter Selasky break;
3491*d6b92ffaSHans Petter Selasky case SOL_RDMA:
3492*d6b92ffaSHans Petter Selasky if (rs->state >= rs_opening) {
3493*d6b92ffaSHans Petter Selasky ret = ERR(EINVAL);
3494*d6b92ffaSHans Petter Selasky break;
3495*d6b92ffaSHans Petter Selasky }
3496*d6b92ffaSHans Petter Selasky
3497*d6b92ffaSHans Petter Selasky switch (optname) {
3498*d6b92ffaSHans Petter Selasky case RDMA_SQSIZE:
3499*d6b92ffaSHans Petter Selasky rs->sq_size = min_t(uint32_t, (*(uint32_t *)optval),
3500*d6b92ffaSHans Petter Selasky RS_QP_MAX_SIZE);
3501*d6b92ffaSHans Petter Selasky ret = 0;
3502*d6b92ffaSHans Petter Selasky break;
3503*d6b92ffaSHans Petter Selasky case RDMA_RQSIZE:
3504*d6b92ffaSHans Petter Selasky rs->rq_size = min_t(uint32_t, (*(uint32_t *)optval),
3505*d6b92ffaSHans Petter Selasky RS_QP_MAX_SIZE);
3506*d6b92ffaSHans Petter Selasky ret = 0;
3507*d6b92ffaSHans Petter Selasky break;
3508*d6b92ffaSHans Petter Selasky case RDMA_INLINE:
3509*d6b92ffaSHans Petter Selasky rs->sq_inline = min_t(uint32_t, *(uint32_t *)optval,
3510*d6b92ffaSHans Petter Selasky RS_QP_MAX_SIZE);
3511*d6b92ffaSHans Petter Selasky ret = 0;
3512*d6b92ffaSHans Petter Selasky break;
3513*d6b92ffaSHans Petter Selasky case RDMA_IOMAPSIZE:
3514*d6b92ffaSHans Petter Selasky rs->target_iomap_size = (uint16_t) rs_scale_to_value(
3515*d6b92ffaSHans Petter Selasky (uint8_t) rs_value_to_scale(*(int *) optval, 8), 8);
3516*d6b92ffaSHans Petter Selasky ret = 0;
3517*d6b92ffaSHans Petter Selasky break;
3518*d6b92ffaSHans Petter Selasky case RDMA_ROUTE:
3519*d6b92ffaSHans Petter Selasky if ((rs->optval = malloc(optlen))) {
3520*d6b92ffaSHans Petter Selasky memcpy(rs->optval, optval, optlen);
3521*d6b92ffaSHans Petter Selasky rs->optlen = optlen;
3522*d6b92ffaSHans Petter Selasky ret = 0;
3523*d6b92ffaSHans Petter Selasky } else {
3524*d6b92ffaSHans Petter Selasky ret = ERR(ENOMEM);
3525*d6b92ffaSHans Petter Selasky }
3526*d6b92ffaSHans Petter Selasky break;
3527*d6b92ffaSHans Petter Selasky default:
3528*d6b92ffaSHans Petter Selasky break;
3529*d6b92ffaSHans Petter Selasky }
3530*d6b92ffaSHans Petter Selasky break;
3531*d6b92ffaSHans Petter Selasky default:
3532*d6b92ffaSHans Petter Selasky break;
3533*d6b92ffaSHans Petter Selasky }
3534*d6b92ffaSHans Petter Selasky
3535*d6b92ffaSHans Petter Selasky if (!ret && opts) {
3536*d6b92ffaSHans Petter Selasky if (opt_on)
3537*d6b92ffaSHans Petter Selasky *opts |= (1 << optname);
3538*d6b92ffaSHans Petter Selasky else
3539*d6b92ffaSHans Petter Selasky *opts &= ~(1 << optname);
3540*d6b92ffaSHans Petter Selasky }
3541*d6b92ffaSHans Petter Selasky
3542*d6b92ffaSHans Petter Selasky return ret;
3543*d6b92ffaSHans Petter Selasky }
3544*d6b92ffaSHans Petter Selasky
rs_convert_sa_path(struct ibv_sa_path_rec * sa_path,struct ibv_path_data * path_data)3545*d6b92ffaSHans Petter Selasky static void rs_convert_sa_path(struct ibv_sa_path_rec *sa_path,
3546*d6b92ffaSHans Petter Selasky struct ibv_path_data *path_data)
3547*d6b92ffaSHans Petter Selasky {
3548*d6b92ffaSHans Petter Selasky uint32_t fl_hop;
3549*d6b92ffaSHans Petter Selasky
3550*d6b92ffaSHans Petter Selasky memset(path_data, 0, sizeof(*path_data));
3551*d6b92ffaSHans Petter Selasky path_data->path.dgid = sa_path->dgid;
3552*d6b92ffaSHans Petter Selasky path_data->path.sgid = sa_path->sgid;
3553*d6b92ffaSHans Petter Selasky path_data->path.dlid = sa_path->dlid;
3554*d6b92ffaSHans Petter Selasky path_data->path.slid = sa_path->slid;
3555*d6b92ffaSHans Petter Selasky fl_hop = be32toh(sa_path->flow_label) << 8;
3556*d6b92ffaSHans Petter Selasky path_data->path.flowlabel_hoplimit = htobe32(fl_hop | sa_path->hop_limit);
3557*d6b92ffaSHans Petter Selasky path_data->path.tclass = sa_path->traffic_class;
3558*d6b92ffaSHans Petter Selasky path_data->path.reversible_numpath = sa_path->reversible << 7 | 1;
3559*d6b92ffaSHans Petter Selasky path_data->path.pkey = sa_path->pkey;
3560*d6b92ffaSHans Petter Selasky path_data->path.qosclass_sl = htobe16(sa_path->sl);
3561*d6b92ffaSHans Petter Selasky path_data->path.mtu = sa_path->mtu | 2 << 6; /* exactly */
3562*d6b92ffaSHans Petter Selasky path_data->path.rate = sa_path->rate | 2 << 6;
3563*d6b92ffaSHans Petter Selasky path_data->path.packetlifetime = sa_path->packet_life_time | 2 << 6;
3564*d6b92ffaSHans Petter Selasky path_data->flags= sa_path->preference;
3565*d6b92ffaSHans Petter Selasky }
3566*d6b92ffaSHans Petter Selasky
rgetsockopt(int socket,int level,int optname,void * optval,socklen_t * optlen)3567*d6b92ffaSHans Petter Selasky int rgetsockopt(int socket, int level, int optname,
3568*d6b92ffaSHans Petter Selasky void *optval, socklen_t *optlen)
3569*d6b92ffaSHans Petter Selasky {
3570*d6b92ffaSHans Petter Selasky struct rsocket *rs;
3571*d6b92ffaSHans Petter Selasky void *opt;
3572*d6b92ffaSHans Petter Selasky struct ibv_sa_path_rec *path_rec;
3573*d6b92ffaSHans Petter Selasky struct ibv_path_data path_data;
3574*d6b92ffaSHans Petter Selasky socklen_t len;
3575*d6b92ffaSHans Petter Selasky int ret = 0;
3576*d6b92ffaSHans Petter Selasky int num_paths;
3577*d6b92ffaSHans Petter Selasky
3578*d6b92ffaSHans Petter Selasky rs = idm_lookup(&idm, socket);
3579*d6b92ffaSHans Petter Selasky if (!rs)
3580*d6b92ffaSHans Petter Selasky return ERR(EBADF);
3581*d6b92ffaSHans Petter Selasky switch (level) {
3582*d6b92ffaSHans Petter Selasky case SOL_SOCKET:
3583*d6b92ffaSHans Petter Selasky switch (optname) {
3584*d6b92ffaSHans Petter Selasky case SO_REUSEADDR:
3585*d6b92ffaSHans Petter Selasky case SO_KEEPALIVE:
3586*d6b92ffaSHans Petter Selasky case SO_OOBINLINE:
3587*d6b92ffaSHans Petter Selasky *((int *) optval) = !!(rs->so_opts & (1 << optname));
3588*d6b92ffaSHans Petter Selasky *optlen = sizeof(int);
3589*d6b92ffaSHans Petter Selasky break;
3590*d6b92ffaSHans Petter Selasky case SO_RCVBUF:
3591*d6b92ffaSHans Petter Selasky *((int *) optval) = rs->rbuf_size;
3592*d6b92ffaSHans Petter Selasky *optlen = sizeof(int);
3593*d6b92ffaSHans Petter Selasky break;
3594*d6b92ffaSHans Petter Selasky case SO_SNDBUF:
3595*d6b92ffaSHans Petter Selasky *((int *) optval) = rs->sbuf_size;
3596*d6b92ffaSHans Petter Selasky *optlen = sizeof(int);
3597*d6b92ffaSHans Petter Selasky break;
3598*d6b92ffaSHans Petter Selasky case SO_LINGER:
3599*d6b92ffaSHans Petter Selasky /* Value is inverted so default so_opt = 0 is on */
3600*d6b92ffaSHans Petter Selasky ((struct linger *) optval)->l_onoff =
3601*d6b92ffaSHans Petter Selasky !(rs->so_opts & (1 << optname));
3602*d6b92ffaSHans Petter Selasky ((struct linger *) optval)->l_linger = 0;
3603*d6b92ffaSHans Petter Selasky *optlen = sizeof(struct linger);
3604*d6b92ffaSHans Petter Selasky break;
3605*d6b92ffaSHans Petter Selasky case SO_ERROR:
3606*d6b92ffaSHans Petter Selasky *((int *) optval) = rs->err;
3607*d6b92ffaSHans Petter Selasky *optlen = sizeof(int);
3608*d6b92ffaSHans Petter Selasky rs->err = 0;
3609*d6b92ffaSHans Petter Selasky break;
3610*d6b92ffaSHans Petter Selasky default:
3611*d6b92ffaSHans Petter Selasky ret = ENOTSUP;
3612*d6b92ffaSHans Petter Selasky break;
3613*d6b92ffaSHans Petter Selasky }
3614*d6b92ffaSHans Petter Selasky break;
3615*d6b92ffaSHans Petter Selasky case IPPROTO_TCP:
3616*d6b92ffaSHans Petter Selasky switch (optname) {
3617*d6b92ffaSHans Petter Selasky case TCP_KEEPCNT:
3618*d6b92ffaSHans Petter Selasky case TCP_KEEPINTVL:
3619*d6b92ffaSHans Petter Selasky *((int *) optval) = 1; /* N/A */
3620*d6b92ffaSHans Petter Selasky break;
3621*d6b92ffaSHans Petter Selasky case TCP_KEEPIDLE:
3622*d6b92ffaSHans Petter Selasky *((int *) optval) = (int) rs->keepalive_time;
3623*d6b92ffaSHans Petter Selasky *optlen = sizeof(int);
3624*d6b92ffaSHans Petter Selasky break;
3625*d6b92ffaSHans Petter Selasky case TCP_NODELAY:
3626*d6b92ffaSHans Petter Selasky *((int *) optval) = !!(rs->tcp_opts & (1 << optname));
3627*d6b92ffaSHans Petter Selasky *optlen = sizeof(int);
3628*d6b92ffaSHans Petter Selasky break;
3629*d6b92ffaSHans Petter Selasky case TCP_MAXSEG:
3630*d6b92ffaSHans Petter Selasky *((int *) optval) = (rs->cm_id && rs->cm_id->route.num_paths) ?
3631*d6b92ffaSHans Petter Selasky 1 << (7 + rs->cm_id->route.path_rec->mtu) :
3632*d6b92ffaSHans Petter Selasky 2048;
3633*d6b92ffaSHans Petter Selasky *optlen = sizeof(int);
3634*d6b92ffaSHans Petter Selasky break;
3635*d6b92ffaSHans Petter Selasky default:
3636*d6b92ffaSHans Petter Selasky ret = ENOTSUP;
3637*d6b92ffaSHans Petter Selasky break;
3638*d6b92ffaSHans Petter Selasky }
3639*d6b92ffaSHans Petter Selasky break;
3640*d6b92ffaSHans Petter Selasky case IPPROTO_IPV6:
3641*d6b92ffaSHans Petter Selasky switch (optname) {
3642*d6b92ffaSHans Petter Selasky case IPV6_V6ONLY:
3643*d6b92ffaSHans Petter Selasky *((int *) optval) = !!(rs->ipv6_opts & (1 << optname));
3644*d6b92ffaSHans Petter Selasky *optlen = sizeof(int);
3645*d6b92ffaSHans Petter Selasky break;
3646*d6b92ffaSHans Petter Selasky default:
3647*d6b92ffaSHans Petter Selasky ret = ENOTSUP;
3648*d6b92ffaSHans Petter Selasky break;
3649*d6b92ffaSHans Petter Selasky }
3650*d6b92ffaSHans Petter Selasky break;
3651*d6b92ffaSHans Petter Selasky case SOL_RDMA:
3652*d6b92ffaSHans Petter Selasky switch (optname) {
3653*d6b92ffaSHans Petter Selasky case RDMA_SQSIZE:
3654*d6b92ffaSHans Petter Selasky *((int *) optval) = rs->sq_size;
3655*d6b92ffaSHans Petter Selasky *optlen = sizeof(int);
3656*d6b92ffaSHans Petter Selasky break;
3657*d6b92ffaSHans Petter Selasky case RDMA_RQSIZE:
3658*d6b92ffaSHans Petter Selasky *((int *) optval) = rs->rq_size;
3659*d6b92ffaSHans Petter Selasky *optlen = sizeof(int);
3660*d6b92ffaSHans Petter Selasky break;
3661*d6b92ffaSHans Petter Selasky case RDMA_INLINE:
3662*d6b92ffaSHans Petter Selasky *((int *) optval) = rs->sq_inline;
3663*d6b92ffaSHans Petter Selasky *optlen = sizeof(int);
3664*d6b92ffaSHans Petter Selasky break;
3665*d6b92ffaSHans Petter Selasky case RDMA_IOMAPSIZE:
3666*d6b92ffaSHans Petter Selasky *((int *) optval) = rs->target_iomap_size;
3667*d6b92ffaSHans Petter Selasky *optlen = sizeof(int);
3668*d6b92ffaSHans Petter Selasky break;
3669*d6b92ffaSHans Petter Selasky case RDMA_ROUTE:
3670*d6b92ffaSHans Petter Selasky if (rs->optval) {
3671*d6b92ffaSHans Petter Selasky if (*optlen < rs->optlen) {
3672*d6b92ffaSHans Petter Selasky ret = EINVAL;
3673*d6b92ffaSHans Petter Selasky } else {
3674*d6b92ffaSHans Petter Selasky memcpy(rs->optval, optval, rs->optlen);
3675*d6b92ffaSHans Petter Selasky *optlen = rs->optlen;
3676*d6b92ffaSHans Petter Selasky }
3677*d6b92ffaSHans Petter Selasky } else {
3678*d6b92ffaSHans Petter Selasky if (*optlen < sizeof(path_data)) {
3679*d6b92ffaSHans Petter Selasky ret = EINVAL;
3680*d6b92ffaSHans Petter Selasky } else {
3681*d6b92ffaSHans Petter Selasky len = 0;
3682*d6b92ffaSHans Petter Selasky opt = optval;
3683*d6b92ffaSHans Petter Selasky path_rec = rs->cm_id->route.path_rec;
3684*d6b92ffaSHans Petter Selasky num_paths = 0;
3685*d6b92ffaSHans Petter Selasky while (len + sizeof(path_data) <= *optlen &&
3686*d6b92ffaSHans Petter Selasky num_paths < rs->cm_id->route.num_paths) {
3687*d6b92ffaSHans Petter Selasky rs_convert_sa_path(path_rec, &path_data);
3688*d6b92ffaSHans Petter Selasky memcpy(opt, &path_data, sizeof(path_data));
3689*d6b92ffaSHans Petter Selasky len += sizeof(path_data);
3690*d6b92ffaSHans Petter Selasky opt += sizeof(path_data);
3691*d6b92ffaSHans Petter Selasky path_rec++;
3692*d6b92ffaSHans Petter Selasky num_paths++;
3693*d6b92ffaSHans Petter Selasky }
3694*d6b92ffaSHans Petter Selasky *optlen = len;
3695*d6b92ffaSHans Petter Selasky ret = 0;
3696*d6b92ffaSHans Petter Selasky }
3697*d6b92ffaSHans Petter Selasky }
3698*d6b92ffaSHans Petter Selasky break;
3699*d6b92ffaSHans Petter Selasky default:
3700*d6b92ffaSHans Petter Selasky ret = ENOTSUP;
3701*d6b92ffaSHans Petter Selasky break;
3702*d6b92ffaSHans Petter Selasky }
3703*d6b92ffaSHans Petter Selasky break;
3704*d6b92ffaSHans Petter Selasky default:
3705*d6b92ffaSHans Petter Selasky ret = ENOTSUP;
3706*d6b92ffaSHans Petter Selasky break;
3707*d6b92ffaSHans Petter Selasky }
3708*d6b92ffaSHans Petter Selasky
3709*d6b92ffaSHans Petter Selasky return rdma_seterrno(ret);
3710*d6b92ffaSHans Petter Selasky }
3711*d6b92ffaSHans Petter Selasky
rfcntl(int socket,int cmd,...)3712*d6b92ffaSHans Petter Selasky int rfcntl(int socket, int cmd, ... /* arg */ )
3713*d6b92ffaSHans Petter Selasky {
3714*d6b92ffaSHans Petter Selasky struct rsocket *rs;
3715*d6b92ffaSHans Petter Selasky va_list args;
3716*d6b92ffaSHans Petter Selasky int param;
3717*d6b92ffaSHans Petter Selasky int ret = 0;
3718*d6b92ffaSHans Petter Selasky
3719*d6b92ffaSHans Petter Selasky rs = idm_lookup(&idm, socket);
3720*d6b92ffaSHans Petter Selasky if (!rs)
3721*d6b92ffaSHans Petter Selasky return ERR(EBADF);
3722*d6b92ffaSHans Petter Selasky va_start(args, cmd);
3723*d6b92ffaSHans Petter Selasky switch (cmd) {
3724*d6b92ffaSHans Petter Selasky case F_GETFL:
3725*d6b92ffaSHans Petter Selasky ret = rs->fd_flags;
3726*d6b92ffaSHans Petter Selasky break;
3727*d6b92ffaSHans Petter Selasky case F_SETFL:
3728*d6b92ffaSHans Petter Selasky param = va_arg(args, int);
3729*d6b92ffaSHans Petter Selasky if ((rs->fd_flags & O_NONBLOCK) != (param & O_NONBLOCK))
3730*d6b92ffaSHans Petter Selasky ret = rs_set_nonblocking(rs, param & O_NONBLOCK);
3731*d6b92ffaSHans Petter Selasky
3732*d6b92ffaSHans Petter Selasky if (!ret)
3733*d6b92ffaSHans Petter Selasky rs->fd_flags = param;
3734*d6b92ffaSHans Petter Selasky break;
3735*d6b92ffaSHans Petter Selasky default:
3736*d6b92ffaSHans Petter Selasky ret = ERR(ENOTSUP);
3737*d6b92ffaSHans Petter Selasky break;
3738*d6b92ffaSHans Petter Selasky }
3739*d6b92ffaSHans Petter Selasky va_end(args);
3740*d6b92ffaSHans Petter Selasky return ret;
3741*d6b92ffaSHans Petter Selasky }
3742*d6b92ffaSHans Petter Selasky
rs_get_iomap_mr(struct rsocket * rs)3743*d6b92ffaSHans Petter Selasky static struct rs_iomap_mr *rs_get_iomap_mr(struct rsocket *rs)
3744*d6b92ffaSHans Petter Selasky {
3745*d6b92ffaSHans Petter Selasky int i;
3746*d6b92ffaSHans Petter Selasky
3747*d6b92ffaSHans Petter Selasky if (!rs->remote_iomappings) {
3748*d6b92ffaSHans Petter Selasky rs->remote_iomappings = calloc(rs->remote_iomap.length,
3749*d6b92ffaSHans Petter Selasky sizeof(*rs->remote_iomappings));
3750*d6b92ffaSHans Petter Selasky if (!rs->remote_iomappings)
3751*d6b92ffaSHans Petter Selasky return NULL;
3752*d6b92ffaSHans Petter Selasky
3753*d6b92ffaSHans Petter Selasky for (i = 0; i < rs->remote_iomap.length; i++)
3754*d6b92ffaSHans Petter Selasky rs->remote_iomappings[i].index = i;
3755*d6b92ffaSHans Petter Selasky }
3756*d6b92ffaSHans Petter Selasky
3757*d6b92ffaSHans Petter Selasky for (i = 0; i < rs->remote_iomap.length; i++) {
3758*d6b92ffaSHans Petter Selasky if (!rs->remote_iomappings[i].mr)
3759*d6b92ffaSHans Petter Selasky return &rs->remote_iomappings[i];
3760*d6b92ffaSHans Petter Selasky }
3761*d6b92ffaSHans Petter Selasky return NULL;
3762*d6b92ffaSHans Petter Selasky }
3763*d6b92ffaSHans Petter Selasky
3764*d6b92ffaSHans Petter Selasky /*
3765*d6b92ffaSHans Petter Selasky * If an offset is given, we map to it. If offset is -1, then we map the
3766*d6b92ffaSHans Petter Selasky * offset to the address of buf. We do not check for conflicts, which must
3767*d6b92ffaSHans Petter Selasky * be fixed at some point.
3768*d6b92ffaSHans Petter Selasky */
riomap(int socket,void * buf,size_t len,int prot,int flags,off_t offset)3769*d6b92ffaSHans Petter Selasky off_t riomap(int socket, void *buf, size_t len, int prot, int flags, off_t offset)
3770*d6b92ffaSHans Petter Selasky {
3771*d6b92ffaSHans Petter Selasky struct rsocket *rs;
3772*d6b92ffaSHans Petter Selasky struct rs_iomap_mr *iomr;
3773*d6b92ffaSHans Petter Selasky int access = IBV_ACCESS_LOCAL_WRITE;
3774*d6b92ffaSHans Petter Selasky
3775*d6b92ffaSHans Petter Selasky rs = idm_at(&idm, socket);
3776*d6b92ffaSHans Petter Selasky if (!rs->cm_id->pd || (prot & ~(PROT_WRITE | PROT_NONE)))
3777*d6b92ffaSHans Petter Selasky return ERR(EINVAL);
3778*d6b92ffaSHans Petter Selasky
3779*d6b92ffaSHans Petter Selasky fastlock_acquire(&rs->map_lock);
3780*d6b92ffaSHans Petter Selasky if (prot & PROT_WRITE) {
3781*d6b92ffaSHans Petter Selasky iomr = rs_get_iomap_mr(rs);
3782*d6b92ffaSHans Petter Selasky access |= IBV_ACCESS_REMOTE_WRITE;
3783*d6b92ffaSHans Petter Selasky } else {
3784*d6b92ffaSHans Petter Selasky iomr = calloc(1, sizeof(*iomr));
3785*d6b92ffaSHans Petter Selasky iomr->index = -1;
3786*d6b92ffaSHans Petter Selasky }
3787*d6b92ffaSHans Petter Selasky if (!iomr) {
3788*d6b92ffaSHans Petter Selasky offset = ERR(ENOMEM);
3789*d6b92ffaSHans Petter Selasky goto out;
3790*d6b92ffaSHans Petter Selasky }
3791*d6b92ffaSHans Petter Selasky
3792*d6b92ffaSHans Petter Selasky iomr->mr = ibv_reg_mr(rs->cm_id->pd, buf, len, access);
3793*d6b92ffaSHans Petter Selasky if (!iomr->mr) {
3794*d6b92ffaSHans Petter Selasky if (iomr->index < 0)
3795*d6b92ffaSHans Petter Selasky free(iomr);
3796*d6b92ffaSHans Petter Selasky offset = -1;
3797*d6b92ffaSHans Petter Selasky goto out;
3798*d6b92ffaSHans Petter Selasky }
3799*d6b92ffaSHans Petter Selasky
3800*d6b92ffaSHans Petter Selasky if (offset == -1)
3801*d6b92ffaSHans Petter Selasky offset = (uintptr_t) buf;
3802*d6b92ffaSHans Petter Selasky iomr->offset = offset;
3803*d6b92ffaSHans Petter Selasky atomic_store(&iomr->refcnt, 1);
3804*d6b92ffaSHans Petter Selasky
3805*d6b92ffaSHans Petter Selasky if (iomr->index >= 0) {
3806*d6b92ffaSHans Petter Selasky dlist_insert_tail(&iomr->entry, &rs->iomap_queue);
3807*d6b92ffaSHans Petter Selasky rs->iomap_pending = 1;
3808*d6b92ffaSHans Petter Selasky } else {
3809*d6b92ffaSHans Petter Selasky dlist_insert_tail(&iomr->entry, &rs->iomap_list);
3810*d6b92ffaSHans Petter Selasky }
3811*d6b92ffaSHans Petter Selasky out:
3812*d6b92ffaSHans Petter Selasky fastlock_release(&rs->map_lock);
3813*d6b92ffaSHans Petter Selasky return offset;
3814*d6b92ffaSHans Petter Selasky }
3815*d6b92ffaSHans Petter Selasky
riounmap(int socket,void * buf,size_t len)3816*d6b92ffaSHans Petter Selasky int riounmap(int socket, void *buf, size_t len)
3817*d6b92ffaSHans Petter Selasky {
3818*d6b92ffaSHans Petter Selasky struct rsocket *rs;
3819*d6b92ffaSHans Petter Selasky struct rs_iomap_mr *iomr;
3820*d6b92ffaSHans Petter Selasky dlist_entry *entry;
3821*d6b92ffaSHans Petter Selasky int ret = 0;
3822*d6b92ffaSHans Petter Selasky
3823*d6b92ffaSHans Petter Selasky rs = idm_at(&idm, socket);
3824*d6b92ffaSHans Petter Selasky fastlock_acquire(&rs->map_lock);
3825*d6b92ffaSHans Petter Selasky
3826*d6b92ffaSHans Petter Selasky for (entry = rs->iomap_list.next; entry != &rs->iomap_list;
3827*d6b92ffaSHans Petter Selasky entry = entry->next) {
3828*d6b92ffaSHans Petter Selasky iomr = container_of(entry, struct rs_iomap_mr, entry);
3829*d6b92ffaSHans Petter Selasky if (iomr->mr->addr == buf && iomr->mr->length == len) {
3830*d6b92ffaSHans Petter Selasky rs_release_iomap_mr(iomr);
3831*d6b92ffaSHans Petter Selasky goto out;
3832*d6b92ffaSHans Petter Selasky }
3833*d6b92ffaSHans Petter Selasky }
3834*d6b92ffaSHans Petter Selasky
3835*d6b92ffaSHans Petter Selasky for (entry = rs->iomap_queue.next; entry != &rs->iomap_queue;
3836*d6b92ffaSHans Petter Selasky entry = entry->next) {
3837*d6b92ffaSHans Petter Selasky iomr = container_of(entry, struct rs_iomap_mr, entry);
3838*d6b92ffaSHans Petter Selasky if (iomr->mr->addr == buf && iomr->mr->length == len) {
3839*d6b92ffaSHans Petter Selasky rs_release_iomap_mr(iomr);
3840*d6b92ffaSHans Petter Selasky goto out;
3841*d6b92ffaSHans Petter Selasky }
3842*d6b92ffaSHans Petter Selasky }
3843*d6b92ffaSHans Petter Selasky ret = ERR(EINVAL);
3844*d6b92ffaSHans Petter Selasky out:
3845*d6b92ffaSHans Petter Selasky fastlock_release(&rs->map_lock);
3846*d6b92ffaSHans Petter Selasky return ret;
3847*d6b92ffaSHans Petter Selasky }
3848*d6b92ffaSHans Petter Selasky
rs_find_iomap(struct rsocket * rs,off_t offset)3849*d6b92ffaSHans Petter Selasky static struct rs_iomap *rs_find_iomap(struct rsocket *rs, off_t offset)
3850*d6b92ffaSHans Petter Selasky {
3851*d6b92ffaSHans Petter Selasky int i;
3852*d6b92ffaSHans Petter Selasky
3853*d6b92ffaSHans Petter Selasky for (i = 0; i < rs->target_iomap_size; i++) {
3854*d6b92ffaSHans Petter Selasky if (offset >= rs->target_iomap[i].offset &&
3855*d6b92ffaSHans Petter Selasky offset < rs->target_iomap[i].offset + rs->target_iomap[i].sge.length)
3856*d6b92ffaSHans Petter Selasky return &rs->target_iomap[i];
3857*d6b92ffaSHans Petter Selasky }
3858*d6b92ffaSHans Petter Selasky return NULL;
3859*d6b92ffaSHans Petter Selasky }
3860*d6b92ffaSHans Petter Selasky
riowrite(int socket,const void * buf,size_t count,off_t offset,int flags)3861*d6b92ffaSHans Petter Selasky size_t riowrite(int socket, const void *buf, size_t count, off_t offset, int flags)
3862*d6b92ffaSHans Petter Selasky {
3863*d6b92ffaSHans Petter Selasky struct rsocket *rs;
3864*d6b92ffaSHans Petter Selasky struct rs_iomap *iom = NULL;
3865*d6b92ffaSHans Petter Selasky struct ibv_sge sge;
3866*d6b92ffaSHans Petter Selasky size_t left = count;
3867*d6b92ffaSHans Petter Selasky uint32_t xfer_size, olen = RS_OLAP_START_SIZE;
3868*d6b92ffaSHans Petter Selasky int ret = 0;
3869*d6b92ffaSHans Petter Selasky
3870*d6b92ffaSHans Petter Selasky rs = idm_at(&idm, socket);
3871*d6b92ffaSHans Petter Selasky fastlock_acquire(&rs->slock);
3872*d6b92ffaSHans Petter Selasky if (rs->iomap_pending) {
3873*d6b92ffaSHans Petter Selasky ret = rs_send_iomaps(rs, flags);
3874*d6b92ffaSHans Petter Selasky if (ret)
3875*d6b92ffaSHans Petter Selasky goto out;
3876*d6b92ffaSHans Petter Selasky }
3877*d6b92ffaSHans Petter Selasky for (; left; left -= xfer_size, buf += xfer_size, offset += xfer_size) {
3878*d6b92ffaSHans Petter Selasky if (!iom || offset > iom->offset + iom->sge.length) {
3879*d6b92ffaSHans Petter Selasky iom = rs_find_iomap(rs, offset);
3880*d6b92ffaSHans Petter Selasky if (!iom)
3881*d6b92ffaSHans Petter Selasky break;
3882*d6b92ffaSHans Petter Selasky }
3883*d6b92ffaSHans Petter Selasky
3884*d6b92ffaSHans Petter Selasky if (!rs_can_send(rs)) {
3885*d6b92ffaSHans Petter Selasky ret = rs_get_comp(rs, rs_nonblocking(rs, flags),
3886*d6b92ffaSHans Petter Selasky rs_conn_can_send);
3887*d6b92ffaSHans Petter Selasky if (ret)
3888*d6b92ffaSHans Petter Selasky break;
3889*d6b92ffaSHans Petter Selasky if (!(rs->state & rs_writable)) {
3890*d6b92ffaSHans Petter Selasky ret = ERR(ECONNRESET);
3891*d6b92ffaSHans Petter Selasky break;
3892*d6b92ffaSHans Petter Selasky }
3893*d6b92ffaSHans Petter Selasky }
3894*d6b92ffaSHans Petter Selasky
3895*d6b92ffaSHans Petter Selasky if (olen < left) {
3896*d6b92ffaSHans Petter Selasky xfer_size = olen;
3897*d6b92ffaSHans Petter Selasky if (olen < RS_MAX_TRANSFER)
3898*d6b92ffaSHans Petter Selasky olen <<= 1;
3899*d6b92ffaSHans Petter Selasky } else {
3900*d6b92ffaSHans Petter Selasky xfer_size = left;
3901*d6b92ffaSHans Petter Selasky }
3902*d6b92ffaSHans Petter Selasky
3903*d6b92ffaSHans Petter Selasky if (xfer_size > rs->sbuf_bytes_avail)
3904*d6b92ffaSHans Petter Selasky xfer_size = rs->sbuf_bytes_avail;
3905*d6b92ffaSHans Petter Selasky if (xfer_size > iom->offset + iom->sge.length - offset)
3906*d6b92ffaSHans Petter Selasky xfer_size = iom->offset + iom->sge.length - offset;
3907*d6b92ffaSHans Petter Selasky
3908*d6b92ffaSHans Petter Selasky if (xfer_size <= rs->sq_inline) {
3909*d6b92ffaSHans Petter Selasky sge.addr = (uintptr_t) buf;
3910*d6b92ffaSHans Petter Selasky sge.length = xfer_size;
3911*d6b92ffaSHans Petter Selasky sge.lkey = 0;
3912*d6b92ffaSHans Petter Selasky ret = rs_write_direct(rs, iom, offset, &sge, 1,
3913*d6b92ffaSHans Petter Selasky xfer_size, IBV_SEND_INLINE);
3914*d6b92ffaSHans Petter Selasky } else if (xfer_size <= rs_sbuf_left(rs)) {
3915*d6b92ffaSHans Petter Selasky memcpy((void *) (uintptr_t) rs->ssgl[0].addr, buf, xfer_size);
3916*d6b92ffaSHans Petter Selasky rs->ssgl[0].length = xfer_size;
3917*d6b92ffaSHans Petter Selasky ret = rs_write_direct(rs, iom, offset, rs->ssgl, 1, xfer_size, 0);
3918*d6b92ffaSHans Petter Selasky if (xfer_size < rs_sbuf_left(rs))
3919*d6b92ffaSHans Petter Selasky rs->ssgl[0].addr += xfer_size;
3920*d6b92ffaSHans Petter Selasky else
3921*d6b92ffaSHans Petter Selasky rs->ssgl[0].addr = (uintptr_t) rs->sbuf;
3922*d6b92ffaSHans Petter Selasky } else {
3923*d6b92ffaSHans Petter Selasky rs->ssgl[0].length = rs_sbuf_left(rs);
3924*d6b92ffaSHans Petter Selasky memcpy((void *) (uintptr_t) rs->ssgl[0].addr, buf,
3925*d6b92ffaSHans Petter Selasky rs->ssgl[0].length);
3926*d6b92ffaSHans Petter Selasky rs->ssgl[1].length = xfer_size - rs->ssgl[0].length;
3927*d6b92ffaSHans Petter Selasky memcpy(rs->sbuf, buf + rs->ssgl[0].length, rs->ssgl[1].length);
3928*d6b92ffaSHans Petter Selasky ret = rs_write_direct(rs, iom, offset, rs->ssgl, 2, xfer_size, 0);
3929*d6b92ffaSHans Petter Selasky rs->ssgl[0].addr = (uintptr_t) rs->sbuf + rs->ssgl[1].length;
3930*d6b92ffaSHans Petter Selasky }
3931*d6b92ffaSHans Petter Selasky if (ret)
3932*d6b92ffaSHans Petter Selasky break;
3933*d6b92ffaSHans Petter Selasky }
3934*d6b92ffaSHans Petter Selasky out:
3935*d6b92ffaSHans Petter Selasky fastlock_release(&rs->slock);
3936*d6b92ffaSHans Petter Selasky
3937*d6b92ffaSHans Petter Selasky return (ret && left == count) ? ret : count - left;
3938*d6b92ffaSHans Petter Selasky }
3939*d6b92ffaSHans Petter Selasky
3940*d6b92ffaSHans Petter Selasky /****************************************************************************
3941*d6b92ffaSHans Petter Selasky * Service Processing Threads
3942*d6b92ffaSHans Petter Selasky ****************************************************************************/
3943*d6b92ffaSHans Petter Selasky
rs_svc_grow_sets(struct rs_svc * svc,int grow_size)3944*d6b92ffaSHans Petter Selasky static int rs_svc_grow_sets(struct rs_svc *svc, int grow_size)
3945*d6b92ffaSHans Petter Selasky {
3946*d6b92ffaSHans Petter Selasky struct rsocket **rss;
3947*d6b92ffaSHans Petter Selasky void *set, *contexts;
3948*d6b92ffaSHans Petter Selasky
3949*d6b92ffaSHans Petter Selasky set = calloc(svc->size + grow_size, sizeof(*rss) + svc->context_size);
3950*d6b92ffaSHans Petter Selasky if (!set)
3951*d6b92ffaSHans Petter Selasky return ENOMEM;
3952*d6b92ffaSHans Petter Selasky
3953*d6b92ffaSHans Petter Selasky svc->size += grow_size;
3954*d6b92ffaSHans Petter Selasky rss = set;
3955*d6b92ffaSHans Petter Selasky contexts = set + sizeof(*rss) * svc->size;
3956*d6b92ffaSHans Petter Selasky if (svc->cnt) {
3957*d6b92ffaSHans Petter Selasky memcpy(rss, svc->rss, sizeof(*rss) * (svc->cnt + 1));
3958*d6b92ffaSHans Petter Selasky memcpy(contexts, svc->contexts, svc->context_size * (svc->cnt + 1));
3959*d6b92ffaSHans Petter Selasky }
3960*d6b92ffaSHans Petter Selasky
3961*d6b92ffaSHans Petter Selasky free(svc->rss);
3962*d6b92ffaSHans Petter Selasky svc->rss = rss;
3963*d6b92ffaSHans Petter Selasky svc->contexts = contexts;
3964*d6b92ffaSHans Petter Selasky return 0;
3965*d6b92ffaSHans Petter Selasky }
3966*d6b92ffaSHans Petter Selasky
3967*d6b92ffaSHans Petter Selasky /*
3968*d6b92ffaSHans Petter Selasky * Index 0 is reserved for the service's communication socket.
3969*d6b92ffaSHans Petter Selasky */
rs_svc_add_rs(struct rs_svc * svc,struct rsocket * rs)3970*d6b92ffaSHans Petter Selasky static int rs_svc_add_rs(struct rs_svc *svc, struct rsocket *rs)
3971*d6b92ffaSHans Petter Selasky {
3972*d6b92ffaSHans Petter Selasky int ret;
3973*d6b92ffaSHans Petter Selasky
3974*d6b92ffaSHans Petter Selasky if (svc->cnt >= svc->size - 1) {
3975*d6b92ffaSHans Petter Selasky ret = rs_svc_grow_sets(svc, 4);
3976*d6b92ffaSHans Petter Selasky if (ret)
3977*d6b92ffaSHans Petter Selasky return ret;
3978*d6b92ffaSHans Petter Selasky }
3979*d6b92ffaSHans Petter Selasky
3980*d6b92ffaSHans Petter Selasky svc->rss[++svc->cnt] = rs;
3981*d6b92ffaSHans Petter Selasky return 0;
3982*d6b92ffaSHans Petter Selasky }
3983*d6b92ffaSHans Petter Selasky
rs_svc_index(struct rs_svc * svc,struct rsocket * rs)3984*d6b92ffaSHans Petter Selasky static int rs_svc_index(struct rs_svc *svc, struct rsocket *rs)
3985*d6b92ffaSHans Petter Selasky {
3986*d6b92ffaSHans Petter Selasky int i;
3987*d6b92ffaSHans Petter Selasky
3988*d6b92ffaSHans Petter Selasky for (i = 1; i <= svc->cnt; i++) {
3989*d6b92ffaSHans Petter Selasky if (svc->rss[i] == rs)
3990*d6b92ffaSHans Petter Selasky return i;
3991*d6b92ffaSHans Petter Selasky }
3992*d6b92ffaSHans Petter Selasky return -1;
3993*d6b92ffaSHans Petter Selasky }
3994*d6b92ffaSHans Petter Selasky
rs_svc_rm_rs(struct rs_svc * svc,struct rsocket * rs)3995*d6b92ffaSHans Petter Selasky static int rs_svc_rm_rs(struct rs_svc *svc, struct rsocket *rs)
3996*d6b92ffaSHans Petter Selasky {
3997*d6b92ffaSHans Petter Selasky int i;
3998*d6b92ffaSHans Petter Selasky
3999*d6b92ffaSHans Petter Selasky if ((i = rs_svc_index(svc, rs)) >= 0) {
4000*d6b92ffaSHans Petter Selasky svc->rss[i] = svc->rss[svc->cnt];
4001*d6b92ffaSHans Petter Selasky memcpy(svc->contexts + i * svc->context_size,
4002*d6b92ffaSHans Petter Selasky svc->contexts + svc->cnt * svc->context_size,
4003*d6b92ffaSHans Petter Selasky svc->context_size);
4004*d6b92ffaSHans Petter Selasky svc->cnt--;
4005*d6b92ffaSHans Petter Selasky return 0;
4006*d6b92ffaSHans Petter Selasky }
4007*d6b92ffaSHans Petter Selasky return EBADF;
4008*d6b92ffaSHans Petter Selasky }
4009*d6b92ffaSHans Petter Selasky
udp_svc_process_sock(struct rs_svc * svc)4010*d6b92ffaSHans Petter Selasky static void udp_svc_process_sock(struct rs_svc *svc)
4011*d6b92ffaSHans Petter Selasky {
4012*d6b92ffaSHans Petter Selasky struct rs_svc_msg msg;
4013*d6b92ffaSHans Petter Selasky
4014*d6b92ffaSHans Petter Selasky read_all(svc->sock[1], &msg, sizeof msg);
4015*d6b92ffaSHans Petter Selasky switch (msg.cmd) {
4016*d6b92ffaSHans Petter Selasky case RS_SVC_ADD_DGRAM:
4017*d6b92ffaSHans Petter Selasky msg.status = rs_svc_add_rs(svc, msg.rs);
4018*d6b92ffaSHans Petter Selasky if (!msg.status) {
4019*d6b92ffaSHans Petter Selasky msg.rs->opts |= RS_OPT_SVC_ACTIVE;
4020*d6b92ffaSHans Petter Selasky udp_svc_fds = svc->contexts;
4021*d6b92ffaSHans Petter Selasky udp_svc_fds[svc->cnt].fd = msg.rs->udp_sock;
4022*d6b92ffaSHans Petter Selasky udp_svc_fds[svc->cnt].events = POLLIN;
4023*d6b92ffaSHans Petter Selasky udp_svc_fds[svc->cnt].revents = 0;
4024*d6b92ffaSHans Petter Selasky }
4025*d6b92ffaSHans Petter Selasky break;
4026*d6b92ffaSHans Petter Selasky case RS_SVC_REM_DGRAM:
4027*d6b92ffaSHans Petter Selasky msg.status = rs_svc_rm_rs(svc, msg.rs);
4028*d6b92ffaSHans Petter Selasky if (!msg.status)
4029*d6b92ffaSHans Petter Selasky msg.rs->opts &= ~RS_OPT_SVC_ACTIVE;
4030*d6b92ffaSHans Petter Selasky break;
4031*d6b92ffaSHans Petter Selasky case RS_SVC_NOOP:
4032*d6b92ffaSHans Petter Selasky msg.status = 0;
4033*d6b92ffaSHans Petter Selasky break;
4034*d6b92ffaSHans Petter Selasky default:
4035*d6b92ffaSHans Petter Selasky break;
4036*d6b92ffaSHans Petter Selasky }
4037*d6b92ffaSHans Petter Selasky
4038*d6b92ffaSHans Petter Selasky write_all(svc->sock[1], &msg, sizeof msg);
4039*d6b92ffaSHans Petter Selasky }
4040*d6b92ffaSHans Petter Selasky
udp_svc_sgid_index(struct ds_dest * dest,union ibv_gid * sgid)4041*d6b92ffaSHans Petter Selasky static uint8_t udp_svc_sgid_index(struct ds_dest *dest, union ibv_gid *sgid)
4042*d6b92ffaSHans Petter Selasky {
4043*d6b92ffaSHans Petter Selasky union ibv_gid gid;
4044*d6b92ffaSHans Petter Selasky int i;
4045*d6b92ffaSHans Petter Selasky
4046*d6b92ffaSHans Petter Selasky for (i = 0; i < 16; i++) {
4047*d6b92ffaSHans Petter Selasky ibv_query_gid(dest->qp->cm_id->verbs, dest->qp->cm_id->port_num,
4048*d6b92ffaSHans Petter Selasky i, &gid);
4049*d6b92ffaSHans Petter Selasky if (!memcmp(sgid, &gid, sizeof gid))
4050*d6b92ffaSHans Petter Selasky return i;
4051*d6b92ffaSHans Petter Selasky }
4052*d6b92ffaSHans Petter Selasky return 0;
4053*d6b92ffaSHans Petter Selasky }
4054*d6b92ffaSHans Petter Selasky
udp_svc_path_bits(struct ds_dest * dest)4055*d6b92ffaSHans Petter Selasky static uint8_t udp_svc_path_bits(struct ds_dest *dest)
4056*d6b92ffaSHans Petter Selasky {
4057*d6b92ffaSHans Petter Selasky struct ibv_port_attr attr;
4058*d6b92ffaSHans Petter Selasky
4059*d6b92ffaSHans Petter Selasky if (!ibv_query_port(dest->qp->cm_id->verbs, dest->qp->cm_id->port_num, &attr))
4060*d6b92ffaSHans Petter Selasky return (uint8_t) ((1 << attr.lmc) - 1);
4061*d6b92ffaSHans Petter Selasky return 0x7f;
4062*d6b92ffaSHans Petter Selasky }
4063*d6b92ffaSHans Petter Selasky
udp_svc_create_ah(struct rsocket * rs,struct ds_dest * dest,uint32_t qpn)4064*d6b92ffaSHans Petter Selasky static void udp_svc_create_ah(struct rsocket *rs, struct ds_dest *dest, uint32_t qpn)
4065*d6b92ffaSHans Petter Selasky {
4066*d6b92ffaSHans Petter Selasky union socket_addr saddr;
4067*d6b92ffaSHans Petter Selasky struct rdma_cm_id *id;
4068*d6b92ffaSHans Petter Selasky struct ibv_ah_attr attr;
4069*d6b92ffaSHans Petter Selasky int ret;
4070*d6b92ffaSHans Petter Selasky
4071*d6b92ffaSHans Petter Selasky if (dest->ah) {
4072*d6b92ffaSHans Petter Selasky fastlock_acquire(&rs->slock);
4073*d6b92ffaSHans Petter Selasky ibv_destroy_ah(dest->ah);
4074*d6b92ffaSHans Petter Selasky dest->ah = NULL;
4075*d6b92ffaSHans Petter Selasky fastlock_release(&rs->slock);
4076*d6b92ffaSHans Petter Selasky }
4077*d6b92ffaSHans Petter Selasky
4078*d6b92ffaSHans Petter Selasky ret = rdma_create_id(NULL, &id, NULL, dest->qp->cm_id->ps);
4079*d6b92ffaSHans Petter Selasky if (ret)
4080*d6b92ffaSHans Petter Selasky return;
4081*d6b92ffaSHans Petter Selasky
4082*d6b92ffaSHans Petter Selasky memcpy(&saddr, rdma_get_local_addr(dest->qp->cm_id),
4083*d6b92ffaSHans Petter Selasky ucma_addrlen(rdma_get_local_addr(dest->qp->cm_id)));
4084*d6b92ffaSHans Petter Selasky if (saddr.sa.sa_family == AF_INET)
4085*d6b92ffaSHans Petter Selasky saddr.sin.sin_port = 0;
4086*d6b92ffaSHans Petter Selasky else
4087*d6b92ffaSHans Petter Selasky saddr.sin6.sin6_port = 0;
4088*d6b92ffaSHans Petter Selasky ret = rdma_resolve_addr(id, &saddr.sa, &dest->addr.sa, 2000);
4089*d6b92ffaSHans Petter Selasky if (ret)
4090*d6b92ffaSHans Petter Selasky goto out;
4091*d6b92ffaSHans Petter Selasky
4092*d6b92ffaSHans Petter Selasky ret = rdma_resolve_route(id, 2000);
4093*d6b92ffaSHans Petter Selasky if (ret)
4094*d6b92ffaSHans Petter Selasky goto out;
4095*d6b92ffaSHans Petter Selasky
4096*d6b92ffaSHans Petter Selasky memset(&attr, 0, sizeof attr);
4097*d6b92ffaSHans Petter Selasky if (id->route.path_rec->hop_limit > 1) {
4098*d6b92ffaSHans Petter Selasky attr.is_global = 1;
4099*d6b92ffaSHans Petter Selasky attr.grh.dgid = id->route.path_rec->dgid;
4100*d6b92ffaSHans Petter Selasky attr.grh.flow_label = be32toh(id->route.path_rec->flow_label);
4101*d6b92ffaSHans Petter Selasky attr.grh.sgid_index = udp_svc_sgid_index(dest, &id->route.path_rec->sgid);
4102*d6b92ffaSHans Petter Selasky attr.grh.hop_limit = id->route.path_rec->hop_limit;
4103*d6b92ffaSHans Petter Selasky attr.grh.traffic_class = id->route.path_rec->traffic_class;
4104*d6b92ffaSHans Petter Selasky }
4105*d6b92ffaSHans Petter Selasky attr.dlid = be16toh(id->route.path_rec->dlid);
4106*d6b92ffaSHans Petter Selasky attr.sl = id->route.path_rec->sl;
4107*d6b92ffaSHans Petter Selasky attr.src_path_bits = be16toh(id->route.path_rec->slid) & udp_svc_path_bits(dest);
4108*d6b92ffaSHans Petter Selasky attr.static_rate = id->route.path_rec->rate;
4109*d6b92ffaSHans Petter Selasky attr.port_num = id->port_num;
4110*d6b92ffaSHans Petter Selasky
4111*d6b92ffaSHans Petter Selasky fastlock_acquire(&rs->slock);
4112*d6b92ffaSHans Petter Selasky dest->qpn = qpn;
4113*d6b92ffaSHans Petter Selasky dest->ah = ibv_create_ah(dest->qp->cm_id->pd, &attr);
4114*d6b92ffaSHans Petter Selasky fastlock_release(&rs->slock);
4115*d6b92ffaSHans Petter Selasky out:
4116*d6b92ffaSHans Petter Selasky rdma_destroy_id(id);
4117*d6b92ffaSHans Petter Selasky }
4118*d6b92ffaSHans Petter Selasky
udp_svc_valid_udp_hdr(struct ds_udp_header * udp_hdr,union socket_addr * addr)4119*d6b92ffaSHans Petter Selasky static int udp_svc_valid_udp_hdr(struct ds_udp_header *udp_hdr,
4120*d6b92ffaSHans Petter Selasky union socket_addr *addr)
4121*d6b92ffaSHans Petter Selasky {
4122*d6b92ffaSHans Petter Selasky return (udp_hdr->tag == htobe32(DS_UDP_TAG)) &&
4123*d6b92ffaSHans Petter Selasky ((udp_hdr->version == 4 && addr->sa.sa_family == AF_INET &&
4124*d6b92ffaSHans Petter Selasky udp_hdr->length == DS_UDP_IPV4_HDR_LEN) ||
4125*d6b92ffaSHans Petter Selasky (udp_hdr->version == 6 && addr->sa.sa_family == AF_INET6 &&
4126*d6b92ffaSHans Petter Selasky udp_hdr->length == DS_UDP_IPV6_HDR_LEN));
4127*d6b92ffaSHans Petter Selasky }
4128*d6b92ffaSHans Petter Selasky
udp_svc_forward(struct rsocket * rs,void * buf,size_t len,union socket_addr * src)4129*d6b92ffaSHans Petter Selasky static void udp_svc_forward(struct rsocket *rs, void *buf, size_t len,
4130*d6b92ffaSHans Petter Selasky union socket_addr *src)
4131*d6b92ffaSHans Petter Selasky {
4132*d6b92ffaSHans Petter Selasky struct ds_header hdr;
4133*d6b92ffaSHans Petter Selasky struct ds_smsg *msg;
4134*d6b92ffaSHans Petter Selasky struct ibv_sge sge;
4135*d6b92ffaSHans Petter Selasky uint64_t offset;
4136*d6b92ffaSHans Petter Selasky
4137*d6b92ffaSHans Petter Selasky if (!ds_can_send(rs)) {
4138*d6b92ffaSHans Petter Selasky if (ds_get_comp(rs, 0, ds_can_send))
4139*d6b92ffaSHans Petter Selasky return;
4140*d6b92ffaSHans Petter Selasky }
4141*d6b92ffaSHans Petter Selasky
4142*d6b92ffaSHans Petter Selasky msg = rs->smsg_free;
4143*d6b92ffaSHans Petter Selasky rs->smsg_free = msg->next;
4144*d6b92ffaSHans Petter Selasky rs->sqe_avail--;
4145*d6b92ffaSHans Petter Selasky
4146*d6b92ffaSHans Petter Selasky ds_format_hdr(&hdr, src);
4147*d6b92ffaSHans Petter Selasky memcpy((void *) msg, &hdr, hdr.length);
4148*d6b92ffaSHans Petter Selasky memcpy((void *) msg + hdr.length, buf, len);
4149*d6b92ffaSHans Petter Selasky sge.addr = (uintptr_t) msg;
4150*d6b92ffaSHans Petter Selasky sge.length = hdr.length + len;
4151*d6b92ffaSHans Petter Selasky sge.lkey = rs->conn_dest->qp->smr->lkey;
4152*d6b92ffaSHans Petter Selasky offset = (uint8_t *) msg - rs->sbuf;
4153*d6b92ffaSHans Petter Selasky
4154*d6b92ffaSHans Petter Selasky ds_post_send(rs, &sge, offset);
4155*d6b92ffaSHans Petter Selasky }
4156*d6b92ffaSHans Petter Selasky
udp_svc_process_rs(struct rsocket * rs)4157*d6b92ffaSHans Petter Selasky static void udp_svc_process_rs(struct rsocket *rs)
4158*d6b92ffaSHans Petter Selasky {
4159*d6b92ffaSHans Petter Selasky static uint8_t buf[RS_SNDLOWAT];
4160*d6b92ffaSHans Petter Selasky struct ds_dest *dest, *cur_dest;
4161*d6b92ffaSHans Petter Selasky struct ds_udp_header *udp_hdr;
4162*d6b92ffaSHans Petter Selasky union socket_addr addr;
4163*d6b92ffaSHans Petter Selasky socklen_t addrlen = sizeof addr;
4164*d6b92ffaSHans Petter Selasky int len, ret;
4165*d6b92ffaSHans Petter Selasky uint32_t qpn;
4166*d6b92ffaSHans Petter Selasky
4167*d6b92ffaSHans Petter Selasky ret = recvfrom(rs->udp_sock, buf, sizeof buf, 0, &addr.sa, &addrlen);
4168*d6b92ffaSHans Petter Selasky if (ret < DS_UDP_IPV4_HDR_LEN)
4169*d6b92ffaSHans Petter Selasky return;
4170*d6b92ffaSHans Petter Selasky
4171*d6b92ffaSHans Petter Selasky udp_hdr = (struct ds_udp_header *) buf;
4172*d6b92ffaSHans Petter Selasky if (!udp_svc_valid_udp_hdr(udp_hdr, &addr))
4173*d6b92ffaSHans Petter Selasky return;
4174*d6b92ffaSHans Petter Selasky
4175*d6b92ffaSHans Petter Selasky len = ret - udp_hdr->length;
4176*d6b92ffaSHans Petter Selasky qpn = be32toh(udp_hdr->qpn) & 0xFFFFFF;
4177*d6b92ffaSHans Petter Selasky
4178*d6b92ffaSHans Petter Selasky udp_hdr->tag = (__force __be32)be32toh(udp_hdr->tag);
4179*d6b92ffaSHans Petter Selasky udp_hdr->qpn = (__force __be32)qpn;
4180*d6b92ffaSHans Petter Selasky
4181*d6b92ffaSHans Petter Selasky ret = ds_get_dest(rs, &addr.sa, addrlen, &dest);
4182*d6b92ffaSHans Petter Selasky if (ret)
4183*d6b92ffaSHans Petter Selasky return;
4184*d6b92ffaSHans Petter Selasky
4185*d6b92ffaSHans Petter Selasky if (udp_hdr->op == RS_OP_DATA) {
4186*d6b92ffaSHans Petter Selasky fastlock_acquire(&rs->slock);
4187*d6b92ffaSHans Petter Selasky cur_dest = rs->conn_dest;
4188*d6b92ffaSHans Petter Selasky rs->conn_dest = dest;
4189*d6b92ffaSHans Petter Selasky ds_send_udp(rs, NULL, 0, 0, RS_OP_CTRL);
4190*d6b92ffaSHans Petter Selasky rs->conn_dest = cur_dest;
4191*d6b92ffaSHans Petter Selasky fastlock_release(&rs->slock);
4192*d6b92ffaSHans Petter Selasky }
4193*d6b92ffaSHans Petter Selasky
4194*d6b92ffaSHans Petter Selasky if (!dest->ah || (dest->qpn != qpn))
4195*d6b92ffaSHans Petter Selasky udp_svc_create_ah(rs, dest, qpn);
4196*d6b92ffaSHans Petter Selasky
4197*d6b92ffaSHans Petter Selasky /* to do: handle when dest local ip address doesn't match udp ip */
4198*d6b92ffaSHans Petter Selasky if (udp_hdr->op == RS_OP_DATA) {
4199*d6b92ffaSHans Petter Selasky fastlock_acquire(&rs->slock);
4200*d6b92ffaSHans Petter Selasky cur_dest = rs->conn_dest;
4201*d6b92ffaSHans Petter Selasky rs->conn_dest = &dest->qp->dest;
4202*d6b92ffaSHans Petter Selasky udp_svc_forward(rs, buf + udp_hdr->length, len, &addr);
4203*d6b92ffaSHans Petter Selasky rs->conn_dest = cur_dest;
4204*d6b92ffaSHans Petter Selasky fastlock_release(&rs->slock);
4205*d6b92ffaSHans Petter Selasky }
4206*d6b92ffaSHans Petter Selasky }
4207*d6b92ffaSHans Petter Selasky
udp_svc_run(void * arg)4208*d6b92ffaSHans Petter Selasky static void *udp_svc_run(void *arg)
4209*d6b92ffaSHans Petter Selasky {
4210*d6b92ffaSHans Petter Selasky struct rs_svc *svc = arg;
4211*d6b92ffaSHans Petter Selasky struct rs_svc_msg msg;
4212*d6b92ffaSHans Petter Selasky int i, ret;
4213*d6b92ffaSHans Petter Selasky
4214*d6b92ffaSHans Petter Selasky ret = rs_svc_grow_sets(svc, 4);
4215*d6b92ffaSHans Petter Selasky if (ret) {
4216*d6b92ffaSHans Petter Selasky msg.status = ret;
4217*d6b92ffaSHans Petter Selasky write_all(svc->sock[1], &msg, sizeof msg);
4218*d6b92ffaSHans Petter Selasky return (void *) (uintptr_t) ret;
4219*d6b92ffaSHans Petter Selasky }
4220*d6b92ffaSHans Petter Selasky
4221*d6b92ffaSHans Petter Selasky udp_svc_fds = svc->contexts;
4222*d6b92ffaSHans Petter Selasky udp_svc_fds[0].fd = svc->sock[1];
4223*d6b92ffaSHans Petter Selasky udp_svc_fds[0].events = POLLIN;
4224*d6b92ffaSHans Petter Selasky do {
4225*d6b92ffaSHans Petter Selasky for (i = 0; i <= svc->cnt; i++)
4226*d6b92ffaSHans Petter Selasky udp_svc_fds[i].revents = 0;
4227*d6b92ffaSHans Petter Selasky
4228*d6b92ffaSHans Petter Selasky poll(udp_svc_fds, svc->cnt + 1, -1);
4229*d6b92ffaSHans Petter Selasky if (udp_svc_fds[0].revents)
4230*d6b92ffaSHans Petter Selasky udp_svc_process_sock(svc);
4231*d6b92ffaSHans Petter Selasky
4232*d6b92ffaSHans Petter Selasky for (i = 1; i <= svc->cnt; i++) {
4233*d6b92ffaSHans Petter Selasky if (udp_svc_fds[i].revents)
4234*d6b92ffaSHans Petter Selasky udp_svc_process_rs(svc->rss[i]);
4235*d6b92ffaSHans Petter Selasky }
4236*d6b92ffaSHans Petter Selasky } while (svc->cnt >= 1);
4237*d6b92ffaSHans Petter Selasky
4238*d6b92ffaSHans Petter Selasky return NULL;
4239*d6b92ffaSHans Petter Selasky }
4240*d6b92ffaSHans Petter Selasky
/*
 * Return the seconds part of the current gettimeofday() reading,
 * truncated to 32 bits.  The structure starts out zeroed, so 0 is
 * returned if gettimeofday() leaves it untouched.
 */
static uint32_t rs_get_time(void)
{
	struct timeval tv = { 0 };
	uint32_t secs;

	gettimeofday(&tv, NULL);
	secs = (uint32_t) tv.tv_sec;
	return secs;
}
4249*d6b92ffaSHans Petter Selasky
tcp_svc_process_sock(struct rs_svc * svc)4250*d6b92ffaSHans Petter Selasky static void tcp_svc_process_sock(struct rs_svc *svc)
4251*d6b92ffaSHans Petter Selasky {
4252*d6b92ffaSHans Petter Selasky struct rs_svc_msg msg;
4253*d6b92ffaSHans Petter Selasky int i;
4254*d6b92ffaSHans Petter Selasky
4255*d6b92ffaSHans Petter Selasky read_all(svc->sock[1], &msg, sizeof msg);
4256*d6b92ffaSHans Petter Selasky switch (msg.cmd) {
4257*d6b92ffaSHans Petter Selasky case RS_SVC_ADD_KEEPALIVE:
4258*d6b92ffaSHans Petter Selasky msg.status = rs_svc_add_rs(svc, msg.rs);
4259*d6b92ffaSHans Petter Selasky if (!msg.status) {
4260*d6b92ffaSHans Petter Selasky msg.rs->opts |= RS_OPT_SVC_ACTIVE;
4261*d6b92ffaSHans Petter Selasky tcp_svc_timeouts = svc->contexts;
4262*d6b92ffaSHans Petter Selasky tcp_svc_timeouts[svc->cnt] = rs_get_time() +
4263*d6b92ffaSHans Petter Selasky msg.rs->keepalive_time;
4264*d6b92ffaSHans Petter Selasky }
4265*d6b92ffaSHans Petter Selasky break;
4266*d6b92ffaSHans Petter Selasky case RS_SVC_REM_KEEPALIVE:
4267*d6b92ffaSHans Petter Selasky msg.status = rs_svc_rm_rs(svc, msg.rs);
4268*d6b92ffaSHans Petter Selasky if (!msg.status)
4269*d6b92ffaSHans Petter Selasky msg.rs->opts &= ~RS_OPT_SVC_ACTIVE;
4270*d6b92ffaSHans Petter Selasky break;
4271*d6b92ffaSHans Petter Selasky case RS_SVC_MOD_KEEPALIVE:
4272*d6b92ffaSHans Petter Selasky i = rs_svc_index(svc, msg.rs);
4273*d6b92ffaSHans Petter Selasky if (i >= 0) {
4274*d6b92ffaSHans Petter Selasky tcp_svc_timeouts[i] = rs_get_time() + msg.rs->keepalive_time;
4275*d6b92ffaSHans Petter Selasky msg.status = 0;
4276*d6b92ffaSHans Petter Selasky } else {
4277*d6b92ffaSHans Petter Selasky msg.status = EBADF;
4278*d6b92ffaSHans Petter Selasky }
4279*d6b92ffaSHans Petter Selasky break;
4280*d6b92ffaSHans Petter Selasky case RS_SVC_NOOP:
4281*d6b92ffaSHans Petter Selasky msg.status = 0;
4282*d6b92ffaSHans Petter Selasky break;
4283*d6b92ffaSHans Petter Selasky default:
4284*d6b92ffaSHans Petter Selasky break;
4285*d6b92ffaSHans Petter Selasky }
4286*d6b92ffaSHans Petter Selasky write_all(svc->sock[1], &msg, sizeof msg);
4287*d6b92ffaSHans Petter Selasky }
4288*d6b92ffaSHans Petter Selasky
4289*d6b92ffaSHans Petter Selasky /*
4290*d6b92ffaSHans Petter Selasky * Send a 0 byte RDMA write with immediate as keep-alive message.
4291*d6b92ffaSHans Petter Selasky * This avoids the need for the receive side to do any acknowledgment.
4292*d6b92ffaSHans Petter Selasky */
tcp_svc_send_keepalive(struct rsocket * rs)4293*d6b92ffaSHans Petter Selasky static void tcp_svc_send_keepalive(struct rsocket *rs)
4294*d6b92ffaSHans Petter Selasky {
4295*d6b92ffaSHans Petter Selasky fastlock_acquire(&rs->cq_lock);
4296*d6b92ffaSHans Petter Selasky if (rs_ctrl_avail(rs) && (rs->state & rs_connected)) {
4297*d6b92ffaSHans Petter Selasky rs->ctrl_seqno++;
4298*d6b92ffaSHans Petter Selasky rs_post_write(rs, NULL, 0, rs_msg_set(RS_OP_CTRL, RS_CTRL_KEEPALIVE),
4299*d6b92ffaSHans Petter Selasky 0, (uintptr_t) NULL, (uintptr_t) NULL);
4300*d6b92ffaSHans Petter Selasky }
4301*d6b92ffaSHans Petter Selasky fastlock_release(&rs->cq_lock);
4302*d6b92ffaSHans Petter Selasky }
4303*d6b92ffaSHans Petter Selasky
tcp_svc_run(void * arg)4304*d6b92ffaSHans Petter Selasky static void *tcp_svc_run(void *arg)
4305*d6b92ffaSHans Petter Selasky {
4306*d6b92ffaSHans Petter Selasky struct rs_svc *svc = arg;
4307*d6b92ffaSHans Petter Selasky struct rs_svc_msg msg;
4308*d6b92ffaSHans Petter Selasky struct pollfd fds;
4309*d6b92ffaSHans Petter Selasky uint32_t now, next_timeout;
4310*d6b92ffaSHans Petter Selasky int i, ret, timeout;
4311*d6b92ffaSHans Petter Selasky
4312*d6b92ffaSHans Petter Selasky ret = rs_svc_grow_sets(svc, 16);
4313*d6b92ffaSHans Petter Selasky if (ret) {
4314*d6b92ffaSHans Petter Selasky msg.status = ret;
4315*d6b92ffaSHans Petter Selasky write_all(svc->sock[1], &msg, sizeof msg);
4316*d6b92ffaSHans Petter Selasky return (void *) (uintptr_t) ret;
4317*d6b92ffaSHans Petter Selasky }
4318*d6b92ffaSHans Petter Selasky
4319*d6b92ffaSHans Petter Selasky tcp_svc_timeouts = svc->contexts;
4320*d6b92ffaSHans Petter Selasky fds.fd = svc->sock[1];
4321*d6b92ffaSHans Petter Selasky fds.events = POLLIN;
4322*d6b92ffaSHans Petter Selasky timeout = -1;
4323*d6b92ffaSHans Petter Selasky do {
4324*d6b92ffaSHans Petter Selasky poll(&fds, 1, timeout * 1000);
4325*d6b92ffaSHans Petter Selasky if (fds.revents)
4326*d6b92ffaSHans Petter Selasky tcp_svc_process_sock(svc);
4327*d6b92ffaSHans Petter Selasky
4328*d6b92ffaSHans Petter Selasky now = rs_get_time();
4329*d6b92ffaSHans Petter Selasky next_timeout = ~0;
4330*d6b92ffaSHans Petter Selasky for (i = 1; i <= svc->cnt; i++) {
4331*d6b92ffaSHans Petter Selasky if (tcp_svc_timeouts[i] <= now) {
4332*d6b92ffaSHans Petter Selasky tcp_svc_send_keepalive(svc->rss[i]);
4333*d6b92ffaSHans Petter Selasky tcp_svc_timeouts[i] =
4334*d6b92ffaSHans Petter Selasky now + svc->rss[i]->keepalive_time;
4335*d6b92ffaSHans Petter Selasky }
4336*d6b92ffaSHans Petter Selasky if (tcp_svc_timeouts[i] < next_timeout)
4337*d6b92ffaSHans Petter Selasky next_timeout = tcp_svc_timeouts[i];
4338*d6b92ffaSHans Petter Selasky }
4339*d6b92ffaSHans Petter Selasky timeout = (int) (next_timeout - now);
4340*d6b92ffaSHans Petter Selasky } while (svc->cnt >= 1);
4341*d6b92ffaSHans Petter Selasky
4342*d6b92ffaSHans Petter Selasky return NULL;
4343*d6b92ffaSHans Petter Selasky }
4344