/*
 * Copyright (c) 2005 Topspin Communications. All rights reserved.
 * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#define _GNU_SOURCE
#include <config.h>

#include <infiniband/endian.h>
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <errno.h>
#include <string.h>
#include <dirent.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <sys/socket.h>

#include "ibverbs.h"
#ifndef NRESOLVE_NEIGH
#include <net/if.h>
#include <net/if_arp.h>
#include "neigh.h"
#endif

/* Hack to avoid GCC's -Wmissing-prototypes and the similar error from sparse
   with these prototypes.  Symbol versioning requires the goofy names; the
   prototype must match the version in verbs.h.
 */
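
/*
 * Each prototype below is paired with a definition later in this file and
 * exported under its public name via default_symver(), for example
 * (illustrative only; the actual definition appears further down):
 *
 *	int __ibv_destroy_qp(struct ibv_qp *qp)
 *	{
 *		return qp->context->ops.destroy_qp(qp);
 *	}
 *	default_symver(__ibv_destroy_qp, ibv_destroy_qp);
 */
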
int __ibv_query_device(struct ibv_context *context,
		       struct ibv_device_attr *device_attr);
int __ibv_query_port(struct ibv_context *context, uint8_t port_num,
		     struct ibv_port_attr *port_attr);
int __ibv_query_gid(struct ibv_context *context, uint8_t port_num, int index,
		    union ibv_gid *gid);
int __ibv_query_pkey(struct ibv_context *context, uint8_t port_num, int index,
		     __be16 *pkey);
struct ibv_pd *__ibv_alloc_pd(struct ibv_context *context);
int __ibv_dealloc_pd(struct ibv_pd *pd);
struct ibv_mr *__ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length,
			    int access);
int __ibv_rereg_mr(struct ibv_mr *mr, int flags, struct ibv_pd *pd, void *addr,
		   size_t length, int access);
int __ibv_dereg_mr(struct ibv_mr *mr);
struct ibv_cq *__ibv_create_cq(struct ibv_context *context, int cqe,
			       void *cq_context,
			       struct ibv_comp_channel *channel,
			       int comp_vector);
int __ibv_resize_cq(struct ibv_cq *cq, int cqe);
int __ibv_destroy_cq(struct ibv_cq *cq);
int __ibv_get_cq_event(struct ibv_comp_channel *channel, struct ibv_cq **cq,
		       void **cq_context);
void __ibv_ack_cq_events(struct ibv_cq *cq, unsigned int nevents);
struct ibv_srq *__ibv_create_srq(struct ibv_pd *pd,
				 struct ibv_srq_init_attr *srq_init_attr);
int __ibv_modify_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr,
		     int srq_attr_mask);
int __ibv_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr);
int __ibv_destroy_srq(struct ibv_srq *srq);
struct ibv_qp *__ibv_create_qp(struct ibv_pd *pd,
			       struct ibv_qp_init_attr *qp_init_attr);
int __ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask,
		   struct ibv_qp_init_attr *init_attr);
int __ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask);
int __ibv_destroy_qp(struct ibv_qp *qp);
struct ibv_ah *__ibv_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr);
int __ibv_destroy_ah(struct ibv_ah *ah);
int __ibv_attach_mcast(struct ibv_qp *qp, const union ibv_gid *gid,
		       uint16_t lid);
int __ibv_detach_mcast(struct ibv_qp *qp, const union ibv_gid *gid,
		       uint16_t lid);

int __attribute__((const)) ibv_rate_to_mult(enum ibv_rate rate)
{
	switch (rate) {
	case IBV_RATE_2_5_GBPS: return 1;
	case IBV_RATE_5_GBPS:   return 2;
	case IBV_RATE_10_GBPS:  return 4;
	case IBV_RATE_20_GBPS:  return 8;
	case IBV_RATE_30_GBPS:  return 12;
	case IBV_RATE_40_GBPS:  return 16;
	case IBV_RATE_60_GBPS:  return 24;
	case IBV_RATE_80_GBPS:  return 32;
	case IBV_RATE_120_GBPS: return 48;
	case IBV_RATE_28_GBPS:  return 11;
	case IBV_RATE_50_GBPS:  return 20;
	case IBV_RATE_400_GBPS: return 160;
	case IBV_RATE_600_GBPS: return 240;
	default:                return -1;
	}
}

enum ibv_rate __attribute__((const)) mult_to_ibv_rate(int mult)
{
	switch (mult) {
	case 1:   return IBV_RATE_2_5_GBPS;
	case 2:   return IBV_RATE_5_GBPS;
	case 4:   return IBV_RATE_10_GBPS;
	case 8:   return IBV_RATE_20_GBPS;
	case 12:  return IBV_RATE_30_GBPS;
	case 16:  return IBV_RATE_40_GBPS;
	case 24:  return IBV_RATE_60_GBPS;
	case 32:  return IBV_RATE_80_GBPS;
	case 48:  return IBV_RATE_120_GBPS;
	case 11:  return IBV_RATE_28_GBPS;
	case 20:  return IBV_RATE_50_GBPS;
	case 160: return IBV_RATE_400_GBPS;
	case 240: return IBV_RATE_600_GBPS;
	default:  return IBV_RATE_MAX;
	}
}

int __attribute__((const)) ibv_rate_to_mbps(enum ibv_rate rate)
{
	switch (rate) {
	case IBV_RATE_2_5_GBPS: return 2500;
	case IBV_RATE_5_GBPS:   return 5000;
	case IBV_RATE_10_GBPS:  return 10000;
	case IBV_RATE_20_GBPS:  return 20000;
	case IBV_RATE_30_GBPS:  return 30000;
	case IBV_RATE_40_GBPS:  return 40000;
	case IBV_RATE_60_GBPS:  return 60000;
	case IBV_RATE_80_GBPS:  return 80000;
	case IBV_RATE_120_GBPS: return 120000;
	case IBV_RATE_14_GBPS:  return 14062;
	case IBV_RATE_56_GBPS:  return 56250;
	case IBV_RATE_112_GBPS: return 112500;
	case IBV_RATE_168_GBPS: return 168750;
	case IBV_RATE_25_GBPS:  return 25781;
	case IBV_RATE_100_GBPS: return 103125;
	case IBV_RATE_200_GBPS: return 206250;
	case IBV_RATE_300_GBPS: return 309375;
	case IBV_RATE_28_GBPS:  return 28125;
	case IBV_RATE_50_GBPS:  return 53125;
	case IBV_RATE_400_GBPS: return 425000;
	case IBV_RATE_600_GBPS: return 637500;
	default:                return -1;
	}
}

enum ibv_rate __attribute__((const)) mbps_to_ibv_rate(int mbps)
{
	switch (mbps) {
	case 2500:   return IBV_RATE_2_5_GBPS;
	case 5000:   return IBV_RATE_5_GBPS;
	case 10000:  return IBV_RATE_10_GBPS;
	case 20000:  return IBV_RATE_20_GBPS;
	case 30000:  return IBV_RATE_30_GBPS;
	case 40000:  return IBV_RATE_40_GBPS;
	case 60000:  return IBV_RATE_60_GBPS;
	case 80000:  return IBV_RATE_80_GBPS;
	case 120000: return IBV_RATE_120_GBPS;
	case 14062:  return IBV_RATE_14_GBPS;
	case 56250:  return IBV_RATE_56_GBPS;
	case 112500: return IBV_RATE_112_GBPS;
	case 168750: return IBV_RATE_168_GBPS;
	case 25781:  return IBV_RATE_25_GBPS;
	case 103125: return IBV_RATE_100_GBPS;
	case 206250: return IBV_RATE_200_GBPS;
	case 309375: return IBV_RATE_300_GBPS;
	case 28125:  return IBV_RATE_28_GBPS;
	case 53125:  return IBV_RATE_50_GBPS;
	case 425000: return IBV_RATE_400_GBPS;
	case 637500: return IBV_RATE_600_GBPS;
	default:     return IBV_RATE_MAX;
	}
}
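
/*
 * Example (illustrative): the conversions above are simple table lookups, so
 * a 40 Gbps link maps to a 16x multiple of the 2.5 Gbps base rate and back:
 *
 *	int mult = ibv_rate_to_mult(IBV_RATE_40_GBPS);	// 16
 *	enum ibv_rate r = mult_to_ibv_rate(16);		// IBV_RATE_40_GBPS
 *	int mbps = ibv_rate_to_mbps(IBV_RATE_40_GBPS);	// 40000
 */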

int __ibv_query_device(struct ibv_context *context,
		       struct ibv_device_attr *device_attr)
{
	return context->ops.query_device(context, device_attr);
}
default_symver(__ibv_query_device, ibv_query_device);

int __ibv_query_port(struct ibv_context *context, uint8_t port_num,
		     struct ibv_port_attr *port_attr)
{
	return context->ops.query_port(context, port_num, port_attr);
}
default_symver(__ibv_query_port, ibv_query_port);

int __ibv_query_gid(struct ibv_context *context, uint8_t port_num,
		    int index, union ibv_gid *gid)
{
	char name[24];
	char attr[41];
	uint16_t val;
	int i;

	snprintf(name, sizeof name, "ports/%d/gids/%d", port_num, index);

	if (ibv_read_sysfs_file(context->device->ibdev_path, name,
				attr, sizeof attr) < 0)
		return -1;

	for (i = 0; i < 8; ++i) {
		if (sscanf(attr + i * 5, "%hx", &val) != 1)
			return -1;
		gid->raw[i * 2    ] = val >> 8;
		gid->raw[i * 2 + 1] = val & 0xff;
	}

	return 0;
}
default_symver(__ibv_query_gid, ibv_query_gid);
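
/*
 * The parser above assumes the sysfs GID format: eight groups of four hex
 * digits separated by ':' (hence the stride of 5 characters per group), e.g.
 *
 *	fe80:0000:0000:0000:0002:c903:00ab:cdef
 *
 * which is read into gid->raw[] as the bytes fe 80 00 ... ef.
 */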

int __ibv_query_pkey(struct ibv_context *context, uint8_t port_num,
		     int index, __be16 *pkey)
{
	char name[24];
	char attr[8];
	uint16_t val;

	snprintf(name, sizeof name, "ports/%d/pkeys/%d", port_num, index);

	if (ibv_read_sysfs_file(context->device->ibdev_path, name,
				attr, sizeof attr) < 0)
		return -1;

	if (sscanf(attr, "%hx", &val) != 1)
		return -1;

	*pkey = htobe16(val);
	return 0;
}
default_symver(__ibv_query_pkey, ibv_query_pkey);

struct ibv_pd *__ibv_alloc_pd(struct ibv_context *context)
{
	struct ibv_pd *pd;

	pd = context->ops.alloc_pd(context);
	if (pd)
		pd->context = context;

	return pd;
}
default_symver(__ibv_alloc_pd, ibv_alloc_pd);

int __ibv_dealloc_pd(struct ibv_pd *pd)
{
	return pd->context->ops.dealloc_pd(pd);
}
default_symver(__ibv_dealloc_pd, ibv_dealloc_pd);

struct ibv_mr *__ibv_reg_mr(struct ibv_pd *pd, void *addr,
			    size_t length, int access)
{
	struct ibv_mr *mr;

	if (ibv_dontfork_range(addr, length))
		return NULL;

	mr = pd->context->ops.reg_mr(pd, addr, length, access);
	if (mr) {
		mr->context = pd->context;
		mr->pd      = pd;
		mr->addr    = addr;
		mr->length  = length;
	} else
		ibv_dofork_range(addr, length);

	return mr;
}
default_symver(__ibv_reg_mr, ibv_reg_mr);

int __ibv_rereg_mr(struct ibv_mr *mr, int flags,
		   struct ibv_pd *pd, void *addr,
		   size_t length, int access)
{
	int dofork_onfail = 0;
	int err;
	void *old_addr;
	size_t old_len;

	if (flags & ~IBV_REREG_MR_FLAGS_SUPPORTED) {
		errno = EINVAL;
		return IBV_REREG_MR_ERR_INPUT;
	}

	if ((flags & IBV_REREG_MR_CHANGE_TRANSLATION) &&
	    (!length || !addr)) {
		errno = EINVAL;
		return IBV_REREG_MR_ERR_INPUT;
	}

	if (access && !(flags & IBV_REREG_MR_CHANGE_ACCESS)) {
		errno = EINVAL;
		return IBV_REREG_MR_ERR_INPUT;
	}

	if (!mr->context->ops.rereg_mr) {
		errno = ENOSYS;
		return IBV_REREG_MR_ERR_INPUT;
	}

	if (flags & IBV_REREG_MR_CHANGE_TRANSLATION) {
		err = ibv_dontfork_range(addr, length);
		if (err)
			return IBV_REREG_MR_ERR_DONT_FORK_NEW;
		dofork_onfail = 1;
	}

	old_addr = mr->addr;
	old_len = mr->length;
	err = mr->context->ops.rereg_mr(mr, flags, pd, addr, length, access);
	if (!err) {
		if (flags & IBV_REREG_MR_CHANGE_PD)
			mr->pd = pd;
		if (flags & IBV_REREG_MR_CHANGE_TRANSLATION) {
			mr->addr = addr;
			mr->length = length;
			err = ibv_dofork_range(old_addr, old_len);
			if (err)
				return IBV_REREG_MR_ERR_DO_FORK_OLD;
		}
	} else {
		err = IBV_REREG_MR_ERR_CMD;
		if (dofork_onfail) {
			if (ibv_dofork_range(addr, length))
				err = IBV_REREG_MR_ERR_CMD_AND_DO_FORK_NEW;
		}
	}

	return err;
}
default_symver(__ibv_rereg_mr, ibv_rereg_mr);

int __ibv_dereg_mr(struct ibv_mr *mr)
{
	int ret;
	void *addr = mr->addr;
	size_t length = mr->length;

	ret = mr->context->ops.dereg_mr(mr);
	if (!ret)
		ibv_dofork_range(addr, length);

	return ret;
}
default_symver(__ibv_dereg_mr, ibv_dereg_mr);

static struct ibv_comp_channel *ibv_create_comp_channel_v2(struct ibv_context *context)
{
	struct ibv_abi_compat_v2 *t = context->abi_compat;
	static int warned;

	if (!pthread_mutex_trylock(&t->in_use))
		return &t->channel;

	if (!warned) {
		fprintf(stderr, PFX "Warning: kernel's ABI version %d limits capacity.\n"
			"    Only one completion channel can be created per context.\n",
			abi_ver);
		++warned;
	}

	return NULL;
}

struct ibv_comp_channel *ibv_create_comp_channel(struct ibv_context *context)
{
	struct ibv_comp_channel *channel;
	struct ibv_create_comp_channel cmd;
	struct ibv_create_comp_channel_resp resp;

	if (abi_ver <= 2)
		return ibv_create_comp_channel_v2(context);

	channel = malloc(sizeof *channel);
	if (!channel)
		return NULL;

	IBV_INIT_CMD_RESP(&cmd, sizeof cmd, CREATE_COMP_CHANNEL, &resp, sizeof resp);
	if (write(context->cmd_fd, &cmd, sizeof cmd) != sizeof cmd) {
		free(channel);
		return NULL;
	}

	(void) VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);

	channel->context = context;
	channel->fd      = resp.fd;
	channel->refcnt  = 0;

	return channel;
}
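
/*
 * Typical caller flow (a sketch, not part of this file): a completion channel
 * is created once and then attached to one or more CQs, whose completion
 * events are delivered through the channel's fd:
 *
 *	struct ibv_comp_channel *ch = ibv_create_comp_channel(ctx);
 *	struct ibv_cq *cq = ibv_create_cq(ctx, 256, NULL, ch, 0);
 *	ibv_req_notify_cq(cq, 0);	// arm the CQ for the next completion
 */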

static int ibv_destroy_comp_channel_v2(struct ibv_comp_channel *channel)
{
	struct ibv_abi_compat_v2 *t = (struct ibv_abi_compat_v2 *) channel;
	pthread_mutex_unlock(&t->in_use);
	return 0;
}

int ibv_destroy_comp_channel(struct ibv_comp_channel *channel)
{
	struct ibv_context *context;
	int ret;

	context = channel->context;
	pthread_mutex_lock(&context->mutex);

	if (channel->refcnt) {
		ret = EBUSY;
		goto out;
	}

	if (abi_ver <= 2) {
		ret = ibv_destroy_comp_channel_v2(channel);
		goto out;
	}

	close(channel->fd);
	free(channel);
	ret = 0;

out:
	pthread_mutex_unlock(&context->mutex);

	return ret;
}

struct ibv_cq *__ibv_create_cq(struct ibv_context *context, int cqe, void *cq_context,
			       struct ibv_comp_channel *channel, int comp_vector)
{
	struct ibv_cq *cq;
	int err = 0;

	cq = context->ops.create_cq(context, cqe, channel, comp_vector);

	if (!cq)
		return NULL;

	err = verbs_init_cq(cq, context, channel, cq_context);
	if (err)
		goto err;

	return cq;

err:
	context->ops.destroy_cq(cq);

	return NULL;
}
default_symver(__ibv_create_cq, ibv_create_cq);

int __ibv_resize_cq(struct ibv_cq *cq, int cqe)
{
	if (!cq->context->ops.resize_cq)
		return ENOSYS;

	return cq->context->ops.resize_cq(cq, cqe);
}
default_symver(__ibv_resize_cq, ibv_resize_cq);

int __ibv_destroy_cq(struct ibv_cq *cq)
{
	struct ibv_comp_channel *channel = cq->channel;
	int ret;

	ret = cq->context->ops.destroy_cq(cq);

	if (channel) {
		if (!ret) {
			pthread_mutex_lock(&channel->context->mutex);
			--channel->refcnt;
			pthread_mutex_unlock(&channel->context->mutex);
		}
	}

	return ret;
}
default_symver(__ibv_destroy_cq, ibv_destroy_cq);

int __ibv_get_cq_event(struct ibv_comp_channel *channel,
		       struct ibv_cq **cq, void **cq_context)
{
	struct ibv_comp_event ev;

	if (read(channel->fd, &ev, sizeof ev) != sizeof ev)
		return -1;

	*cq = (struct ibv_cq *) (uintptr_t) ev.cq_handle;
	*cq_context = (*cq)->cq_context;

	if ((*cq)->context->ops.cq_event)
		(*cq)->context->ops.cq_event(*cq);

	return 0;
}
default_symver(__ibv_get_cq_event, ibv_get_cq_event);

void __ibv_ack_cq_events(struct ibv_cq *cq, unsigned int nevents)
{
	pthread_mutex_lock(&cq->mutex);
	cq->comp_events_completed += nevents;
	pthread_cond_signal(&cq->cond);
	pthread_mutex_unlock(&cq->mutex);
}
default_symver(__ibv_ack_cq_events, ibv_ack_cq_events);
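
/*
 * Sketch of the event loop that pairs the two calls above: every event
 * returned by ibv_get_cq_event() must eventually be acknowledged (acks may be
 * batched via nevents), and all events must be acknowledged before the CQ is
 * destroyed; ibv_destroy_comp_channel() returns EBUSY while CQs are still
 * attached to the channel:
 *
 *	struct ibv_cq *ev_cq;
 *	void *ev_ctx;
 *
 *	if (!ibv_get_cq_event(ch, &ev_cq, &ev_ctx)) {
 *		ibv_ack_cq_events(ev_cq, 1);
 *		ibv_req_notify_cq(ev_cq, 0);
 *		// ... drain ev_cq with ibv_poll_cq() ...
 *	}
 */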

struct ibv_srq *__ibv_create_srq(struct ibv_pd *pd,
				 struct ibv_srq_init_attr *srq_init_attr)
{
	struct ibv_srq *srq;

	if (!pd->context->ops.create_srq)
		return NULL;

	srq = pd->context->ops.create_srq(pd, srq_init_attr);
	if (!srq)
		return NULL;

	srq->context          = pd->context;
	srq->srq_context      = srq_init_attr->srq_context;
	srq->pd               = pd;
	srq->events_completed = 0;
	if (pthread_mutex_init(&srq->mutex, NULL))
		goto err;
	if (pthread_cond_init(&srq->cond, NULL))
		goto err_mutex;

	return srq;

err_mutex:
	pthread_mutex_destroy(&srq->mutex);
err:
	pd->context->ops.destroy_srq(srq);

	return NULL;
}
default_symver(__ibv_create_srq, ibv_create_srq);

int __ibv_modify_srq(struct ibv_srq *srq,
		     struct ibv_srq_attr *srq_attr,
		     int srq_attr_mask)
{
	return srq->context->ops.modify_srq(srq, srq_attr, srq_attr_mask);
}
default_symver(__ibv_modify_srq, ibv_modify_srq);

int __ibv_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr)
{
	return srq->context->ops.query_srq(srq, srq_attr);
}
default_symver(__ibv_query_srq, ibv_query_srq);

int __ibv_destroy_srq(struct ibv_srq *srq)
{
	pthread_cond_destroy(&srq->cond);
	pthread_mutex_destroy(&srq->mutex);
	return srq->context->ops.destroy_srq(srq);
}
default_symver(__ibv_destroy_srq, ibv_destroy_srq);

struct ibv_qp *__ibv_create_qp(struct ibv_pd *pd,
			       struct ibv_qp_init_attr *qp_init_attr)
{
	struct ibv_qp *qp = pd->context->ops.create_qp(pd, qp_init_attr);

	if (qp) {
		qp->context          = pd->context;
		qp->qp_context       = qp_init_attr->qp_context;
		qp->pd               = pd;
		qp->send_cq          = qp_init_attr->send_cq;
		qp->recv_cq          = qp_init_attr->recv_cq;
		qp->srq              = qp_init_attr->srq;
		qp->qp_type          = qp_init_attr->qp_type;
		qp->state            = IBV_QPS_RESET;
		qp->events_completed = 0;
		pthread_mutex_init(&qp->mutex, NULL);
		pthread_cond_init(&qp->cond, NULL);
	}

	return qp;
}
default_symver(__ibv_create_qp, ibv_create_qp);

int __ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
		   int attr_mask,
		   struct ibv_qp_init_attr *init_attr)
{
	int ret;

	ret = qp->context->ops.query_qp(qp, attr, attr_mask, init_attr);
	if (ret)
		return ret;

	if (attr_mask & IBV_QP_STATE)
		qp->state = attr->qp_state;

	return 0;
}
default_symver(__ibv_query_qp, ibv_query_qp);

int __ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
		    int attr_mask)
{
	int ret;

	ret = qp->context->ops.modify_qp(qp, attr, attr_mask);
	if (ret)
		return ret;

	if (attr_mask & IBV_QP_STATE)
		qp->state = attr->qp_state;

	return 0;
}
default_symver(__ibv_modify_qp, ibv_modify_qp);
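
/*
 * The cached qp->state above mirrors the usual QP state machine.  A minimal
 * sketch of bringing up a freshly created QP (the full attribute set and mask
 * depend on the QP type and transport; only the state transitions are shown):
 *
 *	struct ibv_qp_attr attr = { .qp_state = IBV_QPS_INIT };
 *	ibv_modify_qp(qp, &attr, IBV_QP_STATE | ...);	// RESET -> INIT
 *	attr.qp_state = IBV_QPS_RTR;
 *	ibv_modify_qp(qp, &attr, IBV_QP_STATE | ...);	// INIT -> RTR
 *	attr.qp_state = IBV_QPS_RTS;
 *	ibv_modify_qp(qp, &attr, IBV_QP_STATE | ...);	// RTR -> RTS
 */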

int __ibv_destroy_qp(struct ibv_qp *qp)
{
	return qp->context->ops.destroy_qp(qp);
}
default_symver(__ibv_destroy_qp, ibv_destroy_qp);

struct ibv_ah *__ibv_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr)
{
	struct ibv_ah *ah = pd->context->ops.create_ah(pd, attr);

	if (ah) {
		ah->context = pd->context;
		ah->pd      = pd;
	}

	return ah;
}
default_symver(__ibv_create_ah, ibv_create_ah);

/* GID types as they appear in sysfs; these strings are not expected to
 * change, for ABI compatibility.
 */
#define V1_TYPE "IB/RoCE v1"
#define V2_TYPE "RoCE v2"
int ibv_query_gid_type(struct ibv_context *context, uint8_t port_num,
		       unsigned int index, enum ibv_gid_type *type)
{
	char name[32];
	char buff[11];

	snprintf(name, sizeof(name), "ports/%d/gid_attrs/types/%d", port_num,
		 index);

	/* Reset errno so that we can rely on its value upon any error flow in
	 * ibv_read_sysfs_file.
	 */
	errno = 0;
	if (ibv_read_sysfs_file(context->device->ibdev_path, name, buff,
				sizeof(buff)) <= 0) {
		char *dir_path;
		DIR *dir;

		if (errno == EINVAL) {
			/* In IB, this file doesn't exist and reading it
			 * fails with EINVAL.
			 */
			*type = IBV_GID_TYPE_IB_ROCE_V1;
			return 0;
		}
		if (asprintf(&dir_path, "%s/%s/%d/%s/",
			     context->device->ibdev_path, "ports", port_num,
			     "gid_attrs") < 0)
			return -1;
		dir = opendir(dir_path);
		free(dir_path);
		if (!dir) {
			if (errno == ENOENT)
				/* Assuming that if gid_attrs doesn't exist,
				 * we have an old kernel and all GIDs are
				 * IB/RoCE v1
				 */
				*type = IBV_GID_TYPE_IB_ROCE_V1;
			else
				return -1;
		} else {
			closedir(dir);
			errno = EFAULT;
			return -1;
		}
	} else {
		if (!strcmp(buff, V1_TYPE)) {
			*type = IBV_GID_TYPE_IB_ROCE_V1;
		} else if (!strcmp(buff, V2_TYPE)) {
			*type = IBV_GID_TYPE_ROCE_V2;
		} else {
			errno = ENOTSUP;
			return -1;
		}
	}

	return 0;
}

static int ibv_find_gid_index(struct ibv_context *context, uint8_t port_num,
			      union ibv_gid *gid, enum ibv_gid_type gid_type)
{
	enum ibv_gid_type sgid_type = 0;
	union ibv_gid sgid;
	int i = 0, ret;

	do {
		ret = ibv_query_gid(context, port_num, i, &sgid);
		if (!ret) {
			ret = ibv_query_gid_type(context, port_num, i,
						 &sgid_type);
		}
		i++;
	} while (!ret && (memcmp(&sgid, gid, sizeof(*gid)) ||
		 (gid_type != sgid_type)));

	return ret ? ret : i - 1;
}

static inline void map_ipv4_addr_to_ipv6(__be32 ipv4, struct in6_addr *ipv6)
{
	ipv6->s6_addr32[0] = 0;
	ipv6->s6_addr32[1] = 0;
	ipv6->s6_addr32[2] = htobe32(0x0000FFFF);
	ipv6->s6_addr32[3] = ipv4;
}
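
/*
 * For example (illustrative), the IPv4 address 192.0.2.1 becomes the
 * IPv4-mapped IPv6 address ::ffff:192.0.2.1, which is how IPv4-based
 * RoCE v2 GIDs are represented in the GID table.
 */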

static inline __sum16 ipv4_calc_hdr_csum(uint16_t *data, unsigned int num_hwords)
{
	unsigned int i = 0;
	uint32_t sum = 0;

	for (i = 0; i < num_hwords; i++)
		sum += *(data++);

	sum = (sum & 0xffff) + (sum >> 16);

	return (__sum16)~sum;
}

static inline int get_grh_header_version(struct ibv_grh *grh)
{
	int ip6h_version = (be32toh(grh->version_tclass_flow) >> 28) & 0xf;
	struct ip *ip4h = (struct ip *)((void *)grh + 20);
	struct ip ip4h_checked;

	if (ip6h_version != 6) {
		if (ip4h->ip_v == 4)
			return 4;
		errno = EPROTONOSUPPORT;
		return -1;
	}
	/* version may be 6 or 4 */
	if (ip4h->ip_hl != 5) /* IPv4 header length must be 5 for RoCE v2. */
		return 6;
	/*
	 * Verify checksum.
	 * We can't write on scattered buffers so we have to copy to temp
	 * buffer.
	 */
	memcpy(&ip4h_checked, ip4h, sizeof(ip4h_checked));
	/* Need to set the checksum field (check) to 0 before re-calculating
	 * the checksum.
	 */
	ip4h_checked.ip_sum = 0;
	ip4h_checked.ip_sum = ipv4_calc_hdr_csum((uint16_t *)&ip4h_checked, 10);
	/* if IPv4 header checksum is OK, believe it */
	if (ip4h->ip_sum == ip4h_checked.ip_sum)
		return 4;
	return 6;
}

static inline void set_ah_attr_generic_fields(struct ibv_ah_attr *ah_attr,
					      struct ibv_wc *wc,
					      struct ibv_grh *grh,
					      uint8_t port_num)
{
	uint32_t flow_class;

	flow_class = be32toh(grh->version_tclass_flow);
	ah_attr->grh.flow_label = flow_class & 0xFFFFF;
	ah_attr->dlid = wc->slid;
	ah_attr->sl = wc->sl;
	ah_attr->src_path_bits = wc->dlid_path_bits;
	ah_attr->port_num = port_num;
}

static inline int set_ah_attr_by_ipv4(struct ibv_context *context,
				      struct ibv_ah_attr *ah_attr,
				      struct ip *ip4h, uint8_t port_num)
{
	union ibv_gid sgid;
	int ret;

	/* No point searching multicast GIDs in GID table */
	if (IN_CLASSD(be32toh(ip4h->ip_dst.s_addr))) {
		errno = EINVAL;
		return -1;
	}

	map_ipv4_addr_to_ipv6(ip4h->ip_dst.s_addr, (struct in6_addr *)&sgid);
	ret = ibv_find_gid_index(context, port_num, &sgid,
				 IBV_GID_TYPE_ROCE_V2);
	if (ret < 0)
		return ret;

	map_ipv4_addr_to_ipv6(ip4h->ip_src.s_addr,
			      (struct in6_addr *)&ah_attr->grh.dgid);
	ah_attr->grh.sgid_index = (uint8_t) ret;
	ah_attr->grh.hop_limit = ip4h->ip_ttl;
	ah_attr->grh.traffic_class = ip4h->ip_tos;

	return 0;
}

#define IB_NEXT_HDR    0x1b
static inline int set_ah_attr_by_ipv6(struct ibv_context *context,
				      struct ibv_ah_attr *ah_attr,
				      struct ibv_grh *grh, uint8_t port_num)
{
	uint32_t flow_class;
	uint32_t sgid_type;
	int ret;

	/* No point searching multicast GIDs in GID table */
	if (grh->dgid.raw[0] == 0xFF) {
		errno = EINVAL;
		return -1;
	}

	ah_attr->grh.dgid = grh->sgid;
	if (grh->next_hdr == IPPROTO_UDP) {
		sgid_type = IBV_GID_TYPE_ROCE_V2;
	} else if (grh->next_hdr == IB_NEXT_HDR) {
		sgid_type = IBV_GID_TYPE_IB_ROCE_V1;
	} else {
		errno = EPROTONOSUPPORT;
		return -1;
	}

	ret = ibv_find_gid_index(context, port_num, &grh->dgid,
				 sgid_type);
	if (ret < 0)
		return ret;

	ah_attr->grh.sgid_index = (uint8_t) ret;
	flow_class = be32toh(grh->version_tclass_flow);
	ah_attr->grh.hop_limit = grh->hop_limit;
	ah_attr->grh.traffic_class = (flow_class >> 20) & 0xFF;

	return 0;
}

int ibv_init_ah_from_wc(struct ibv_context *context, uint8_t port_num,
			struct ibv_wc *wc, struct ibv_grh *grh,
			struct ibv_ah_attr *ah_attr)
{
	int version;
	int ret = 0;

	memset(ah_attr, 0, sizeof *ah_attr);
	set_ah_attr_generic_fields(ah_attr, wc, grh, port_num);

	if (wc->wc_flags & IBV_WC_GRH) {
		ah_attr->is_global = 1;
		version = get_grh_header_version(grh);

		if (version == 4)
			ret = set_ah_attr_by_ipv4(context, ah_attr,
						  (struct ip *)((void *)grh + 20),
						  port_num);
		else if (version == 6)
			ret = set_ah_attr_by_ipv6(context, ah_attr, grh,
						  port_num);
		else
			ret = -1;
	}

	return ret;
}

struct ibv_ah *ibv_create_ah_from_wc(struct ibv_pd *pd, struct ibv_wc *wc,
				     struct ibv_grh *grh, uint8_t port_num)
{
	struct ibv_ah_attr ah_attr;
	int ret;

	ret = ibv_init_ah_from_wc(pd->context, port_num, wc, grh, &ah_attr);
	if (ret)
		return NULL;

	return ibv_create_ah(pd, &ah_attr);
}
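
/*
 * Typical use (a sketch): a UD responder whose receive buffer starts with the
 * 40-byte GRH can build an address handle for the reply directly from the
 * completion:
 *
 *	if (wc.opcode == IBV_WC_RECV && (wc.wc_flags & IBV_WC_GRH)) {
 *		struct ibv_ah *ah =
 *			ibv_create_ah_from_wc(pd, &wc, (struct ibv_grh *)buf,
 *					      port_num);
 *		// send the reply to wc.src_qp via ah; destroy ah when done
 *	}
 */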

int __ibv_destroy_ah(struct ibv_ah *ah)
{
	return ah->context->ops.destroy_ah(ah);
}
default_symver(__ibv_destroy_ah, ibv_destroy_ah);

int __ibv_attach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid)
{
	return qp->context->ops.attach_mcast(qp, gid, lid);
}
default_symver(__ibv_attach_mcast, ibv_attach_mcast);

int __ibv_detach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid)
{
	return qp->context->ops.detach_mcast(qp, gid, lid);
}
default_symver(__ibv_detach_mcast, ibv_detach_mcast);

static inline int ipv6_addr_v4mapped(const struct in6_addr *a)
{
	return IN6_IS_ADDR_V4MAPPED(a) ||
		/* IPv4 encoded multicast addresses */
		(a->s6_addr32[0] == htobe32(0xff0e0000) &&
		((a->s6_addr32[1] |
		 (a->s6_addr32[2] ^ htobe32(0x0000ffff))) == 0UL));
}
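
/*
 * Accepted forms (illustrative): a plain IPv4-mapped address such as
 * ::ffff:192.0.2.1, or the IPv4-encoded multicast form
 * ff0e::ffff:224.1.2.3 (i.e. ff0e:0000:0000:0000:0000:ffff:<IPv4>).
 */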

struct peer_address {
	void *address;
	uint32_t size;
};

static inline int create_peer_from_gid(int family, void *raw_gid,
				       struct peer_address *peer_address)
{
	switch (family) {
	case AF_INET:
		peer_address->address = raw_gid + 12;
		peer_address->size = 4;
		break;
	case AF_INET6:
		peer_address->address = raw_gid;
		peer_address->size = 16;
		break;
	default:
		return -1;
	}

	return 0;
}

#define NEIGH_GET_DEFAULT_TIMEOUT_MS 3000
int ibv_resolve_eth_l2_from_gid(struct ibv_context *context,
				struct ibv_ah_attr *attr,
				uint8_t eth_mac[ETHERNET_LL_SIZE],
				uint16_t *vid)
{
#ifndef NRESOLVE_NEIGH
	int dst_family;
	int src_family;
	int oif;
	struct get_neigh_handler neigh_handler;
	union ibv_gid sgid;
	int ether_len;
	struct peer_address src;
	struct peer_address dst;
	uint16_t ret_vid;
	int ret = -EINVAL;
	int err;

	err = ibv_query_gid(context, attr->port_num,
			    attr->grh.sgid_index, &sgid);

	if (err)
		return err;

	err = neigh_init_resources(&neigh_handler,
				   NEIGH_GET_DEFAULT_TIMEOUT_MS);

	if (err)
		return err;

	dst_family = ipv6_addr_v4mapped((struct in6_addr *)attr->grh.dgid.raw) ?
			AF_INET : AF_INET6;
	src_family = ipv6_addr_v4mapped((struct in6_addr *)sgid.raw) ?
			AF_INET : AF_INET6;

	if (create_peer_from_gid(dst_family, attr->grh.dgid.raw, &dst))
		goto free_resources;

	if (create_peer_from_gid(src_family, &sgid.raw, &src))
		goto free_resources;

	if (neigh_set_dst(&neigh_handler, dst_family, dst.address,
			  dst.size))
		goto free_resources;

	if (neigh_set_src(&neigh_handler, src_family, src.address,
			  src.size))
		goto free_resources;

	oif = neigh_get_oif_from_src(&neigh_handler);

	if (oif > 0)
		neigh_set_oif(&neigh_handler, oif);
	else
		goto free_resources;

	ret = -EHOSTUNREACH;

	/* blocking call */
	if (process_get_neigh(&neigh_handler))
		goto free_resources;

	ret_vid = neigh_get_vlan_id_from_dev(&neigh_handler);

	if (ret_vid <= 0xfff)
		neigh_set_vlan_id(&neigh_handler, ret_vid);

	/* We are using only Ethernet here */
	ether_len = neigh_get_ll(&neigh_handler,
				 eth_mac,
				 sizeof(uint8_t) * ETHERNET_LL_SIZE);

	if (ether_len <= 0)
		goto free_resources;

	*vid = ret_vid;

	ret = 0;

free_resources:
	neigh_free_resources(&neigh_handler);

	return ret;
#else
	return -ENOSYS;
#endif
}