xref: /freebsd/contrib/ofed/libibverbs/verbs.c (revision 5963423232e869b8dbe8e9a65134e92735dfb521)
1 /*
2  * Copyright (c) 2005 Topspin Communications.  All rights reserved.
3  * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
4  *
5  * This software is available to you under a choice of one of two
6  * licenses.  You may choose to be licensed under the terms of the GNU
7  * General Public License (GPL) Version 2, available from the file
8  * COPYING in the main directory of this source tree, or the
9  * OpenIB.org BSD license below:
10  *
11  *     Redistribution and use in source and binary forms, with or
12  *     without modification, are permitted provided that the following
13  *     conditions are met:
14  *
15  *      - Redistributions of source code must retain the above
16  *        copyright notice, this list of conditions and the following
17  *        disclaimer.
18  *
19  *      - Redistributions in binary form must reproduce the above
20  *        copyright notice, this list of conditions and the following
21  *        disclaimer in the documentation and/or other materials
22  *        provided with the distribution.
23  *
24  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31  * SOFTWARE.
32  */
33 
34 #define _GNU_SOURCE
35 #include <config.h>
36 
37 #include <infiniband/endian.h>
38 #include <stdio.h>
39 #include <unistd.h>
40 #include <stdlib.h>
41 #include <errno.h>
42 #include <string.h>
43 #include <dirent.h>
44 #include <netinet/in.h>
45 #include <netinet/ip.h>
46 #include <sys/socket.h>
47 
48 #include "ibverbs.h"
49 #ifndef NRESOLVE_NEIGH
50 #include <net/if.h>
51 #include <net/if_arp.h>
52 #include "neigh.h"
53 #endif
54 
55 /* Hack to avoid GCC's -Wmissing-prototypes and the similar error from sparse
56    with these prototypes. Symbol versionining requires the goofy names, the
57    prototype must match the version in verbs.h.
58  */
59 int __ibv_query_device(struct ibv_context *context,
60 		       struct ibv_device_attr *device_attr);
61 int __ibv_query_port(struct ibv_context *context, uint8_t port_num,
62 		     struct ibv_port_attr *port_attr);
63 int __ibv_query_gid(struct ibv_context *context, uint8_t port_num, int index,
64 		    union ibv_gid *gid);
65 int __ibv_query_pkey(struct ibv_context *context, uint8_t port_num, int index,
66 		     __be16 *pkey);
67 struct ibv_pd *__ibv_alloc_pd(struct ibv_context *context);
68 int __ibv_dealloc_pd(struct ibv_pd *pd);
69 struct ibv_mr *__ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length,
70 			    int access);
71 int __ibv_rereg_mr(struct ibv_mr *mr, int flags, struct ibv_pd *pd, void *addr,
72 		   size_t length, int access);
73 int __ibv_dereg_mr(struct ibv_mr *mr);
74 struct ibv_cq *__ibv_create_cq(struct ibv_context *context, int cqe,
75 			       void *cq_context,
76 			       struct ibv_comp_channel *channel,
77 			       int comp_vector);
78 int __ibv_resize_cq(struct ibv_cq *cq, int cqe);
79 int __ibv_destroy_cq(struct ibv_cq *cq);
80 int __ibv_get_cq_event(struct ibv_comp_channel *channel, struct ibv_cq **cq,
81 		       void **cq_context);
82 void __ibv_ack_cq_events(struct ibv_cq *cq, unsigned int nevents);
83 struct ibv_srq *__ibv_create_srq(struct ibv_pd *pd,
84 				 struct ibv_srq_init_attr *srq_init_attr);
85 int __ibv_modify_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr,
86 		     int srq_attr_mask);
87 int __ibv_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr);
88 int __ibv_destroy_srq(struct ibv_srq *srq);
89 struct ibv_qp *__ibv_create_qp(struct ibv_pd *pd,
90 			       struct ibv_qp_init_attr *qp_init_attr);
91 int __ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask,
92 		   struct ibv_qp_init_attr *init_attr);
93 int __ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask);
94 int __ibv_destroy_qp(struct ibv_qp *qp);
95 struct ibv_ah *__ibv_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr);
96 int __ibv_destroy_ah(struct ibv_ah *ah);
97 int __ibv_attach_mcast(struct ibv_qp *qp, const union ibv_gid *gid,
98 		       uint16_t lid);
99 int __ibv_detach_mcast(struct ibv_qp *qp, const union ibv_gid *gid,
100 		       uint16_t lid);
101 
ibv_rate_to_mult(enum ibv_rate rate)102 int __attribute__((const)) ibv_rate_to_mult(enum ibv_rate rate)
103 {
104 	switch (rate) {
105 	case IBV_RATE_2_5_GBPS: return  1;
106 	case IBV_RATE_5_GBPS:   return  2;
107 	case IBV_RATE_10_GBPS:  return  4;
108 	case IBV_RATE_20_GBPS:  return  8;
109 	case IBV_RATE_30_GBPS:  return 12;
110 	case IBV_RATE_40_GBPS:  return 16;
111 	case IBV_RATE_60_GBPS:  return 24;
112 	case IBV_RATE_80_GBPS:  return 32;
113 	case IBV_RATE_120_GBPS: return 48;
114 	case IBV_RATE_28_GBPS:  return 11;
115 	case IBV_RATE_50_GBPS:  return 20;
116 	case IBV_RATE_400_GBPS: return 160;
117 	case IBV_RATE_600_GBPS: return 240;
118 	case IBV_RATE_800_GBPS: return 320;
119 	case IBV_RATE_1200_GBPS: return 480;
120 	default:           return -1;
121 	}
122 }
123 
mult_to_ibv_rate(int mult)124 enum ibv_rate __attribute__((const)) mult_to_ibv_rate(int mult)
125 {
126 	switch (mult) {
127 	case 1:  return IBV_RATE_2_5_GBPS;
128 	case 2:  return IBV_RATE_5_GBPS;
129 	case 4:  return IBV_RATE_10_GBPS;
130 	case 8:  return IBV_RATE_20_GBPS;
131 	case 12: return IBV_RATE_30_GBPS;
132 	case 16: return IBV_RATE_40_GBPS;
133 	case 24: return IBV_RATE_60_GBPS;
134 	case 32: return IBV_RATE_80_GBPS;
135 	case 48: return IBV_RATE_120_GBPS;
136 	case 11: return IBV_RATE_28_GBPS;
137 	case 20: return IBV_RATE_50_GBPS;
138 	case 160: return IBV_RATE_400_GBPS;
139 	case 240: return IBV_RATE_600_GBPS;
140 	case 320: return IBV_RATE_800_GBPS;
141 	case 480: return IBV_RATE_1200_GBPS;
142 	default: return IBV_RATE_MAX;
143 	}
144 }
145 
ibv_rate_to_mbps(enum ibv_rate rate)146 int  __attribute__((const)) ibv_rate_to_mbps(enum ibv_rate rate)
147 {
148 	switch (rate) {
149 	case IBV_RATE_2_5_GBPS: return 2500;
150 	case IBV_RATE_5_GBPS:   return 5000;
151 	case IBV_RATE_10_GBPS:  return 10000;
152 	case IBV_RATE_20_GBPS:  return 20000;
153 	case IBV_RATE_30_GBPS:  return 30000;
154 	case IBV_RATE_40_GBPS:  return 40000;
155 	case IBV_RATE_60_GBPS:  return 60000;
156 	case IBV_RATE_80_GBPS:  return 80000;
157 	case IBV_RATE_120_GBPS: return 120000;
158 	case IBV_RATE_14_GBPS:  return 14062;
159 	case IBV_RATE_56_GBPS:  return 56250;
160 	case IBV_RATE_112_GBPS: return 112500;
161 	case IBV_RATE_168_GBPS: return 168750;
162 	case IBV_RATE_25_GBPS:  return 25781;
163 	case IBV_RATE_100_GBPS: return 103125;
164 	case IBV_RATE_200_GBPS: return 206250;
165 	case IBV_RATE_300_GBPS: return 309375;
166 	case IBV_RATE_28_GBPS:  return 28125;
167 	case IBV_RATE_50_GBPS:  return 53125;
168 	case IBV_RATE_400_GBPS: return 425000;
169 	case IBV_RATE_600_GBPS: return 637500;
170 	case IBV_RATE_800_GBPS: return 850000;
171 	case IBV_RATE_1200_GBPS: return 1275000;
172 	default:               return -1;
173 	}
174 }
175 
mbps_to_ibv_rate(int mbps)176 enum ibv_rate __attribute__((const)) mbps_to_ibv_rate(int mbps)
177 {
178 	switch (mbps) {
179 	case 2500:   return IBV_RATE_2_5_GBPS;
180 	case 5000:   return IBV_RATE_5_GBPS;
181 	case 10000:  return IBV_RATE_10_GBPS;
182 	case 20000:  return IBV_RATE_20_GBPS;
183 	case 30000:  return IBV_RATE_30_GBPS;
184 	case 40000:  return IBV_RATE_40_GBPS;
185 	case 60000:  return IBV_RATE_60_GBPS;
186 	case 80000:  return IBV_RATE_80_GBPS;
187 	case 120000: return IBV_RATE_120_GBPS;
188 	case 14062:  return IBV_RATE_14_GBPS;
189 	case 56250:  return IBV_RATE_56_GBPS;
190 	case 112500: return IBV_RATE_112_GBPS;
191 	case 168750: return IBV_RATE_168_GBPS;
192 	case 25781:  return IBV_RATE_25_GBPS;
193 	case 103125: return IBV_RATE_100_GBPS;
194 	case 206250: return IBV_RATE_200_GBPS;
195 	case 309375: return IBV_RATE_300_GBPS;
196 	case 28125:  return IBV_RATE_28_GBPS;
197 	case 53125:  return IBV_RATE_50_GBPS;
198 	case 425000: return IBV_RATE_400_GBPS;
199 	case 637500: return IBV_RATE_600_GBPS;
200 	case 850000: return IBV_RATE_800_GBPS;
201 	case 1275000: return IBV_RATE_1200_GBPS;
202 	default:     return IBV_RATE_MAX;
203 	}
204 }
205 
__ibv_query_device(struct ibv_context * context,struct ibv_device_attr * device_attr)206 int __ibv_query_device(struct ibv_context *context,
207 		       struct ibv_device_attr *device_attr)
208 {
209 	return context->ops.query_device(context, device_attr);
210 }
211 default_symver(__ibv_query_device, ibv_query_device);
212 
__ibv_query_port(struct ibv_context * context,uint8_t port_num,struct ibv_port_attr * port_attr)213 int __ibv_query_port(struct ibv_context *context, uint8_t port_num,
214 		     struct ibv_port_attr *port_attr)
215 {
216 	return context->ops.query_port(context, port_num, port_attr);
217 }
218 default_symver(__ibv_query_port, ibv_query_port);
219 
__ibv_query_gid(struct ibv_context * context,uint8_t port_num,int index,union ibv_gid * gid)220 int __ibv_query_gid(struct ibv_context *context, uint8_t port_num,
221 		    int index, union ibv_gid *gid)
222 {
223 	char name[24];
224 	char attr[41];
225 	uint16_t val;
226 	int i;
227 
228 	snprintf(name, sizeof name, "ports/%d/gids/%d", port_num, index);
229 
230 	if (ibv_read_sysfs_file(context->device->ibdev_path, name,
231 				attr, sizeof attr) < 0)
232 		return -1;
233 
234 	for (i = 0; i < 8; ++i) {
235 		if (sscanf(attr + i * 5, "%hx", &val) != 1)
236 			return -1;
237 		gid->raw[i * 2    ] = val >> 8;
238 		gid->raw[i * 2 + 1] = val & 0xff;
239 	}
240 
241 	return 0;
242 }
243 default_symver(__ibv_query_gid, ibv_query_gid);
244 
/*
 * __ibv_query_pkey - read one P_Key table entry of a port from sysfs.
 *
 * @context:  device context; its device's ibdev_path is the sysfs base
 * @port_num: port whose P_Key table is read
 * @index:    index of the entry in that table
 * @pkey:     receives the P_Key, converted to big-endian
 *
 * Returns 0 on success and -1 if the attribute cannot be read or parsed.
 */
int __ibv_query_pkey(struct ibv_context *context, uint8_t port_num,
		     int index, __be16 *pkey)
{
	/* Large enough for "ports/255/pkeys/-2147483648" plus the NUL. */
	char name[32];
	/*
	 * Zero-filled so that an empty read leaves an empty string behind:
	 * sscanf() then fails cleanly instead of parsing indeterminate
	 * stack bytes.
	 */
	char attr[8] = "";
	uint16_t val;

	snprintf(name, sizeof name, "ports/%d/pkeys/%d", port_num, index);

	if (ibv_read_sysfs_file(context->device->ibdev_path, name,
				attr, sizeof attr) < 0)
		return -1;

	if (sscanf(attr, "%hx", &val) != 1)
		return -1;

	*pkey = htobe16(val);
	return 0;
}
default_symver(__ibv_query_pkey, ibv_query_pkey);
265 
__ibv_alloc_pd(struct ibv_context * context)266 struct ibv_pd *__ibv_alloc_pd(struct ibv_context *context)
267 {
268 	struct ibv_pd *pd;
269 
270 	pd = context->ops.alloc_pd(context);
271 	if (pd)
272 		pd->context = context;
273 
274 	return pd;
275 }
276 default_symver(__ibv_alloc_pd, ibv_alloc_pd);
277 
__ibv_dealloc_pd(struct ibv_pd * pd)278 int __ibv_dealloc_pd(struct ibv_pd *pd)
279 {
280 	return pd->context->ops.dealloc_pd(pd);
281 }
282 default_symver(__ibv_dealloc_pd, ibv_dealloc_pd);
283 
__ibv_reg_mr(struct ibv_pd * pd,void * addr,size_t length,int access)284 struct ibv_mr *__ibv_reg_mr(struct ibv_pd *pd, void *addr,
285 			    size_t length, int access)
286 {
287 	struct ibv_mr *mr;
288 
289 	if (ibv_dontfork_range(addr, length))
290 		return NULL;
291 
292 	mr = pd->context->ops.reg_mr(pd, addr, length, access);
293 	if (mr) {
294 		mr->context = pd->context;
295 		mr->pd      = pd;
296 		mr->addr    = addr;
297 		mr->length  = length;
298 	} else
299 		ibv_dofork_range(addr, length);
300 
301 	return mr;
302 }
303 default_symver(__ibv_reg_mr, ibv_reg_mr);
304 
__ibv_rereg_mr(struct ibv_mr * mr,int flags,struct ibv_pd * pd,void * addr,size_t length,int access)305 int __ibv_rereg_mr(struct ibv_mr *mr, int flags,
306 		   struct ibv_pd *pd, void *addr,
307 		   size_t length, int access)
308 {
309 	int dofork_onfail = 0;
310 	int err;
311 	void *old_addr;
312 	size_t old_len;
313 
314 	if (flags & ~IBV_REREG_MR_FLAGS_SUPPORTED) {
315 		errno = EINVAL;
316 		return IBV_REREG_MR_ERR_INPUT;
317 	}
318 
319 	if ((flags & IBV_REREG_MR_CHANGE_TRANSLATION) &&
320 	    (!length || !addr)) {
321 		errno = EINVAL;
322 		return IBV_REREG_MR_ERR_INPUT;
323 	}
324 
325 	if (access && !(flags & IBV_REREG_MR_CHANGE_ACCESS)) {
326 		errno = EINVAL;
327 		return IBV_REREG_MR_ERR_INPUT;
328 	}
329 
330 	if (!mr->context->ops.rereg_mr) {
331 		errno = ENOSYS;
332 		return IBV_REREG_MR_ERR_INPUT;
333 	}
334 
335 	if (flags & IBV_REREG_MR_CHANGE_TRANSLATION) {
336 		err = ibv_dontfork_range(addr, length);
337 		if (err)
338 			return IBV_REREG_MR_ERR_DONT_FORK_NEW;
339 		dofork_onfail = 1;
340 	}
341 
342 	old_addr = mr->addr;
343 	old_len = mr->length;
344 	err = mr->context->ops.rereg_mr(mr, flags, pd, addr, length, access);
345 	if (!err) {
346 		if (flags & IBV_REREG_MR_CHANGE_PD)
347 			mr->pd = pd;
348 		if (flags & IBV_REREG_MR_CHANGE_TRANSLATION) {
349 			mr->addr    = addr;
350 			mr->length  = length;
351 			err = ibv_dofork_range(old_addr, old_len);
352 			if (err)
353 				return IBV_REREG_MR_ERR_DO_FORK_OLD;
354 		}
355 	} else {
356 		err = IBV_REREG_MR_ERR_CMD;
357 		if (dofork_onfail) {
358 			if (ibv_dofork_range(addr, length))
359 				err = IBV_REREG_MR_ERR_CMD_AND_DO_FORK_NEW;
360 		}
361 	}
362 
363 	return err;
364 }
365 default_symver(__ibv_rereg_mr, ibv_rereg_mr);
366 
__ibv_dereg_mr(struct ibv_mr * mr)367 int __ibv_dereg_mr(struct ibv_mr *mr)
368 {
369 	int ret;
370 	void *addr	= mr->addr;
371 	size_t length	= mr->length;
372 
373 	ret = mr->context->ops.dereg_mr(mr);
374 	if (!ret)
375 		ibv_dofork_range(addr, length);
376 
377 	return ret;
378 }
379 default_symver(__ibv_dereg_mr, ibv_dereg_mr);
380 
ibv_create_comp_channel_v2(struct ibv_context * context)381 static struct ibv_comp_channel *ibv_create_comp_channel_v2(struct ibv_context *context)
382 {
383 	struct ibv_abi_compat_v2 *t = context->abi_compat;
384 	static int warned;
385 
386 	if (!pthread_mutex_trylock(&t->in_use))
387 		return &t->channel;
388 
389 	if (!warned) {
390 		fprintf(stderr, PFX "Warning: kernel's ABI version %d limits capacity.\n"
391 			"    Only one completion channel can be created per context.\n",
392 			abi_ver);
393 		++warned;
394 	}
395 
396 	return NULL;
397 }
398 
ibv_create_comp_channel(struct ibv_context * context)399 struct ibv_comp_channel *ibv_create_comp_channel(struct ibv_context *context)
400 {
401 	struct ibv_comp_channel            *channel;
402 	struct ibv_create_comp_channel      cmd;
403 	struct ibv_create_comp_channel_resp resp;
404 
405 	if (abi_ver <= 2)
406 		return ibv_create_comp_channel_v2(context);
407 
408 	channel = malloc(sizeof *channel);
409 	if (!channel)
410 		return NULL;
411 
412 	IBV_INIT_CMD_RESP(&cmd, sizeof cmd, CREATE_COMP_CHANNEL, &resp, sizeof resp);
413 	if (write(context->cmd_fd, &cmd, sizeof cmd) != sizeof cmd) {
414 		free(channel);
415 		return NULL;
416 	}
417 
418 	(void) VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
419 
420 	channel->context = context;
421 	channel->fd      = resp.fd;
422 	channel->refcnt  = 0;
423 
424 	return channel;
425 }
426 
ibv_destroy_comp_channel_v2(struct ibv_comp_channel * channel)427 static int ibv_destroy_comp_channel_v2(struct ibv_comp_channel *channel)
428 {
429 	struct ibv_abi_compat_v2 *t = (struct ibv_abi_compat_v2 *) channel;
430 	pthread_mutex_unlock(&t->in_use);
431 	return 0;
432 }
433 
ibv_destroy_comp_channel(struct ibv_comp_channel * channel)434 int ibv_destroy_comp_channel(struct ibv_comp_channel *channel)
435 {
436 	struct ibv_context *context;
437 	int ret;
438 
439 	context = channel->context;
440 	pthread_mutex_lock(&context->mutex);
441 
442 	if (channel->refcnt) {
443 		ret = EBUSY;
444 		goto out;
445 	}
446 
447 	if (abi_ver <= 2) {
448 		ret = ibv_destroy_comp_channel_v2(channel);
449 		goto out;
450 	}
451 
452 	close(channel->fd);
453 	free(channel);
454 	ret = 0;
455 
456 out:
457 	pthread_mutex_unlock(&context->mutex);
458 
459 	return ret;
460 }
461 
__ibv_create_cq(struct ibv_context * context,int cqe,void * cq_context,struct ibv_comp_channel * channel,int comp_vector)462 struct ibv_cq *__ibv_create_cq(struct ibv_context *context, int cqe, void *cq_context,
463 			       struct ibv_comp_channel *channel, int comp_vector)
464 {
465 	struct ibv_cq *cq;
466 	int err = 0;
467 
468 	cq = context->ops.create_cq(context, cqe, channel, comp_vector);
469 
470 	if (!cq)
471 		return NULL;
472 
473 	err = verbs_init_cq(cq, context, channel, cq_context);
474 	if (err)
475 		goto err;
476 
477 	return cq;
478 
479 err:
480 	context->ops.destroy_cq(cq);
481 
482 	return NULL;
483 }
484 default_symver(__ibv_create_cq, ibv_create_cq);
485 
__ibv_resize_cq(struct ibv_cq * cq,int cqe)486 int __ibv_resize_cq(struct ibv_cq *cq, int cqe)
487 {
488 	if (!cq->context->ops.resize_cq)
489 		return ENOSYS;
490 
491 	return cq->context->ops.resize_cq(cq, cqe);
492 }
493 default_symver(__ibv_resize_cq, ibv_resize_cq);
494 
__ibv_destroy_cq(struct ibv_cq * cq)495 int __ibv_destroy_cq(struct ibv_cq *cq)
496 {
497 	struct ibv_comp_channel *channel = cq->channel;
498 	int ret;
499 
500 	ret = cq->context->ops.destroy_cq(cq);
501 
502 	if (channel) {
503 		if (!ret) {
504 			pthread_mutex_lock(&channel->context->mutex);
505 			--channel->refcnt;
506 			pthread_mutex_unlock(&channel->context->mutex);
507 		}
508 	}
509 
510 	return ret;
511 }
512 default_symver(__ibv_destroy_cq, ibv_destroy_cq);
513 
__ibv_get_cq_event(struct ibv_comp_channel * channel,struct ibv_cq ** cq,void ** cq_context)514 int __ibv_get_cq_event(struct ibv_comp_channel *channel,
515 		       struct ibv_cq **cq, void **cq_context)
516 {
517 	struct ibv_comp_event ev;
518 
519 	if (read(channel->fd, &ev, sizeof ev) != sizeof ev)
520 		return -1;
521 
522 	*cq         = (struct ibv_cq *) (uintptr_t) ev.cq_handle;
523 	*cq_context = (*cq)->cq_context;
524 
525 	if ((*cq)->context->ops.cq_event)
526 		(*cq)->context->ops.cq_event(*cq);
527 
528 	return 0;
529 }
530 default_symver(__ibv_get_cq_event, ibv_get_cq_event);
531 
__ibv_ack_cq_events(struct ibv_cq * cq,unsigned int nevents)532 void __ibv_ack_cq_events(struct ibv_cq *cq, unsigned int nevents)
533 {
534 	pthread_mutex_lock(&cq->mutex);
535 	cq->comp_events_completed += nevents;
536 	pthread_cond_signal(&cq->cond);
537 	pthread_mutex_unlock(&cq->mutex);
538 }
539 default_symver(__ibv_ack_cq_events, ibv_ack_cq_events);
540 
__ibv_create_srq(struct ibv_pd * pd,struct ibv_srq_init_attr * srq_init_attr)541 struct ibv_srq *__ibv_create_srq(struct ibv_pd *pd,
542 				 struct ibv_srq_init_attr *srq_init_attr)
543 {
544 	struct ibv_srq *srq;
545 
546 	if (!pd->context->ops.create_srq)
547 		return NULL;
548 
549 	srq = pd->context->ops.create_srq(pd, srq_init_attr);
550 	if (!srq)
551 		return NULL;
552 
553 	srq->context		  = pd->context;
554 	srq->srq_context	  = srq_init_attr->srq_context;
555 	srq->pd				  = pd;
556 	srq->events_completed = 0;
557 	if (pthread_mutex_init(&srq->mutex, NULL))
558 		goto err;
559 	if (pthread_cond_init(&srq->cond, NULL))
560 		goto err_mutex;
561 
562 	return srq;
563 
564 err_mutex:
565 	pthread_mutex_destroy(&srq->mutex);
566 err:
567 	pd->context->ops.destroy_srq(srq);
568 
569 	return NULL;
570 }
571 default_symver(__ibv_create_srq, ibv_create_srq);
572 
__ibv_modify_srq(struct ibv_srq * srq,struct ibv_srq_attr * srq_attr,int srq_attr_mask)573 int __ibv_modify_srq(struct ibv_srq *srq,
574 		     struct ibv_srq_attr *srq_attr,
575 		     int srq_attr_mask)
576 {
577 	return srq->context->ops.modify_srq(srq, srq_attr, srq_attr_mask);
578 }
579 default_symver(__ibv_modify_srq, ibv_modify_srq);
580 
__ibv_query_srq(struct ibv_srq * srq,struct ibv_srq_attr * srq_attr)581 int __ibv_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr)
582 {
583 	return srq->context->ops.query_srq(srq, srq_attr);
584 }
585 default_symver(__ibv_query_srq, ibv_query_srq);
586 
__ibv_destroy_srq(struct ibv_srq * srq)587 int __ibv_destroy_srq(struct ibv_srq *srq)
588 {
589 	pthread_cond_destroy(&srq->cond);
590 	pthread_mutex_destroy(&srq->mutex);
591 	return srq->context->ops.destroy_srq(srq);
592 }
593 default_symver(__ibv_destroy_srq, ibv_destroy_srq);
594 
__ibv_create_qp(struct ibv_pd * pd,struct ibv_qp_init_attr * qp_init_attr)595 struct ibv_qp *__ibv_create_qp(struct ibv_pd *pd,
596 			       struct ibv_qp_init_attr *qp_init_attr)
597 {
598 	struct ibv_qp *qp = pd->context->ops.create_qp(pd, qp_init_attr);
599 
600 	if (qp) {
601 		qp->context    	     = pd->context;
602 		qp->qp_context 	     = qp_init_attr->qp_context;
603 		qp->pd         	     = pd;
604 		qp->send_cq    	     = qp_init_attr->send_cq;
605 		qp->recv_cq    	     = qp_init_attr->recv_cq;
606 		qp->srq        	     = qp_init_attr->srq;
607 		qp->qp_type          = qp_init_attr->qp_type;
608 		qp->state	     = IBV_QPS_RESET;
609 		qp->events_completed = 0;
610 		pthread_mutex_init(&qp->mutex, NULL);
611 		pthread_cond_init(&qp->cond, NULL);
612 	}
613 
614 	return qp;
615 }
616 default_symver(__ibv_create_qp, ibv_create_qp);
617 
__ibv_query_qp(struct ibv_qp * qp,struct ibv_qp_attr * attr,int attr_mask,struct ibv_qp_init_attr * init_attr)618 int __ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
619 		   int attr_mask,
620 		   struct ibv_qp_init_attr *init_attr)
621 {
622 	int ret;
623 
624 	ret = qp->context->ops.query_qp(qp, attr, attr_mask, init_attr);
625 	if (ret)
626 		return ret;
627 
628 	if (attr_mask & IBV_QP_STATE)
629 		qp->state = attr->qp_state;
630 
631 	return 0;
632 }
633 default_symver(__ibv_query_qp, ibv_query_qp);
634 
__ibv_modify_qp(struct ibv_qp * qp,struct ibv_qp_attr * attr,int attr_mask)635 int __ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
636 		    int attr_mask)
637 {
638 	int ret;
639 
640 	ret = qp->context->ops.modify_qp(qp, attr, attr_mask);
641 	if (ret)
642 		return ret;
643 
644 	if (attr_mask & IBV_QP_STATE)
645 		qp->state = attr->qp_state;
646 
647 	return 0;
648 }
649 default_symver(__ibv_modify_qp, ibv_modify_qp);
650 
__ibv_destroy_qp(struct ibv_qp * qp)651 int __ibv_destroy_qp(struct ibv_qp *qp)
652 {
653 	return qp->context->ops.destroy_qp(qp);
654 }
655 default_symver(__ibv_destroy_qp, ibv_destroy_qp);
656 
__ibv_create_ah(struct ibv_pd * pd,struct ibv_ah_attr * attr)657 struct ibv_ah *__ibv_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr)
658 {
659 	struct ibv_ah *ah = pd->context->ops.create_ah(pd, attr);
660 
661 	if (ah) {
662 		ah->context = pd->context;
663 		ah->pd      = pd;
664 	}
665 
666 	return ah;
667 }
668 default_symver(__ibv_create_ah, ibv_create_ah);
669 
670 /* GID types as appear in sysfs, no change is expected as of ABI
671  * compatibility.
672  */
673 #define V1_TYPE "IB/RoCE v1"
674 #define V2_TYPE "RoCE v2"
ibv_query_gid_type(struct ibv_context * context,uint8_t port_num,unsigned int index,enum ibv_gid_type * type)675 int ibv_query_gid_type(struct ibv_context *context, uint8_t port_num,
676 		       unsigned int index, enum ibv_gid_type *type)
677 {
678 	char name[32];
679 	char buff[11];
680 
681 	snprintf(name, sizeof(name), "ports/%d/gid_attrs/types/%d", port_num,
682 		 index);
683 
684 	/* Reset errno so that we can rely on its value upon any error flow in
685 	 * ibv_read_sysfs_file.
686 	 */
687 	errno = 0;
688 	if (ibv_read_sysfs_file(context->device->ibdev_path, name, buff,
689 				sizeof(buff)) <= 0) {
690 		char *dir_path;
691 		DIR *dir;
692 
693 		if (errno == EINVAL) {
694 			/* In IB, this file doesn't exist and the kernel sets
695 			 * errno to -EINVAL.
696 			 */
697 			*type = IBV_GID_TYPE_IB_ROCE_V1;
698 			return 0;
699 		}
700 		if (asprintf(&dir_path, "%s/%s/%d/%s/",
701 			     context->device->ibdev_path, "ports", port_num,
702 			     "gid_attrs") < 0)
703 			return -1;
704 		dir = opendir(dir_path);
705 		free(dir_path);
706 		if (!dir) {
707 			if (errno == ENOENT)
708 				/* Assuming that if gid_attrs doesn't exist,
709 				 * we have an old kernel and all GIDs are
710 				 * IB/RoCE v1
711 				 */
712 				*type = IBV_GID_TYPE_IB_ROCE_V1;
713 			else
714 				return -1;
715 		} else {
716 			closedir(dir);
717 			errno = EFAULT;
718 			return -1;
719 		}
720 	} else {
721 		if (!strcmp(buff, V1_TYPE)) {
722 			*type = IBV_GID_TYPE_IB_ROCE_V1;
723 		} else if (!strcmp(buff, V2_TYPE)) {
724 			*type = IBV_GID_TYPE_ROCE_V2;
725 		} else {
726 			errno = ENOTSUP;
727 			return -1;
728 		}
729 	}
730 
731 	return 0;
732 }
733 
ibv_find_gid_index(struct ibv_context * context,uint8_t port_num,union ibv_gid * gid,enum ibv_gid_type gid_type)734 static int ibv_find_gid_index(struct ibv_context *context, uint8_t port_num,
735 			      union ibv_gid *gid, enum ibv_gid_type gid_type)
736 {
737 	enum ibv_gid_type sgid_type = 0;
738 	union ibv_gid sgid;
739 	int i = 0, ret;
740 
741 	do {
742 		ret = ibv_query_gid(context, port_num, i, &sgid);
743 		if (!ret) {
744 			ret = ibv_query_gid_type(context, port_num, i,
745 						 &sgid_type);
746 		}
747 		i++;
748 	} while (!ret && (memcmp(&sgid, gid, sizeof(*gid)) ||
749 		 (gid_type != sgid_type)));
750 
751 	return ret ? ret : i - 1;
752 }
753 
/*
 * Build the IPv4-mapped IPv6 address (::ffff:a.b.c.d) for @ipv4, which is
 * stored as given into the last 32-bit word of @ipv6.
 */
static inline void map_ipv4_addr_to_ipv6(__be32 ipv4, struct in6_addr *ipv6)
{
	/* First 64 bits are zero. */
	ipv6->s6_addr32[0] = 0;
	ipv6->s6_addr32[1] = 0;
	/* 0000:ffff prefix followed by the IPv4 address itself. */
	ipv6->s6_addr32[2] = htobe32(0x0000FFFF);
	ipv6->s6_addr32[3] = ipv4;
}
761 
ipv4_calc_hdr_csum(uint16_t * data,unsigned int num_hwords)762 static inline __sum16 ipv4_calc_hdr_csum(uint16_t *data, unsigned int num_hwords)
763 {
764 	unsigned int i = 0;
765 	uint32_t sum = 0;
766 
767 	for (i = 0; i < num_hwords; i++)
768 		sum += *(data++);
769 
770 	sum = (sum & 0xffff) + (sum >> 16);
771 
772 	return (__sum16)~sum;
773 }
774 
get_grh_header_version(struct ibv_grh * grh)775 static inline int get_grh_header_version(struct ibv_grh *grh)
776 {
777 	int ip6h_version = (be32toh(grh->version_tclass_flow) >> 28) & 0xf;
778 	struct ip *ip4h = (struct ip *)((void *)grh + 20);
779 	struct ip ip4h_checked;
780 
781 	if (ip6h_version != 6) {
782 		if (ip4h->ip_v == 4)
783 			return 4;
784 		errno = EPROTONOSUPPORT;
785 		return -1;
786 	}
787 	/* version may be 6 or 4 */
788 	if (ip4h->ip_hl != 5) /* IPv4 header length must be 5 for RoCE v2. */
789 		return 6;
790 	/*
791 	* Verify checksum.
792 	* We can't write on scattered buffers so we have to copy to temp
793 	* buffer.
794 	*/
795 	memcpy(&ip4h_checked, ip4h, sizeof(ip4h_checked));
796 	/* Need to set the checksum field (check) to 0 before re-calculating
797 	 * the checksum.
798 	 */
799 	ip4h_checked.ip_sum = 0;
800 	ip4h_checked.ip_sum = ipv4_calc_hdr_csum((uint16_t *)&ip4h_checked, 10);
801 	/* if IPv4 header checksum is OK, believe it */
802 	if (ip4h->ip_sum == ip4h_checked.ip_sum)
803 		return 4;
804 	return 6;
805 }
806 
set_ah_attr_generic_fields(struct ibv_ah_attr * ah_attr,struct ibv_wc * wc,struct ibv_grh * grh,uint8_t port_num)807 static inline void set_ah_attr_generic_fields(struct ibv_ah_attr *ah_attr,
808 					      struct ibv_wc *wc,
809 					      struct ibv_grh *grh,
810 					      uint8_t port_num)
811 {
812 	uint32_t flow_class;
813 
814 	flow_class = be32toh(grh->version_tclass_flow);
815 	ah_attr->grh.flow_label = flow_class & 0xFFFFF;
816 	ah_attr->dlid = wc->slid;
817 	ah_attr->sl = wc->sl;
818 	ah_attr->src_path_bits = wc->dlid_path_bits;
819 	ah_attr->port_num = port_num;
820 }
821 
set_ah_attr_by_ipv4(struct ibv_context * context,struct ibv_ah_attr * ah_attr,struct ip * ip4h,uint8_t port_num)822 static inline int set_ah_attr_by_ipv4(struct ibv_context *context,
823 				      struct ibv_ah_attr *ah_attr,
824 				      struct ip *ip4h, uint8_t port_num)
825 {
826 	union ibv_gid sgid;
827 	int ret;
828 
829 	/* No point searching multicast GIDs in GID table */
830 	if (IN_CLASSD(be32toh(ip4h->ip_dst.s_addr))) {
831 		errno = EINVAL;
832 		return -1;
833 	}
834 
835 	map_ipv4_addr_to_ipv6(ip4h->ip_dst.s_addr, (struct in6_addr *)&sgid);
836 	ret = ibv_find_gid_index(context, port_num, &sgid,
837 				 IBV_GID_TYPE_ROCE_V2);
838 	if (ret < 0)
839 		return ret;
840 
841 	map_ipv4_addr_to_ipv6(ip4h->ip_src.s_addr,
842 			      (struct in6_addr *)&ah_attr->grh.dgid);
843 	ah_attr->grh.sgid_index = (uint8_t) ret;
844 	ah_attr->grh.hop_limit = ip4h->ip_ttl;
845 	ah_attr->grh.traffic_class = ip4h->ip_tos;
846 
847 	return 0;
848 }
849 
850 #define IB_NEXT_HDR    0x1b
set_ah_attr_by_ipv6(struct ibv_context * context,struct ibv_ah_attr * ah_attr,struct ibv_grh * grh,uint8_t port_num)851 static inline int set_ah_attr_by_ipv6(struct ibv_context *context,
852 				  struct ibv_ah_attr *ah_attr,
853 				  struct ibv_grh *grh, uint8_t port_num)
854 {
855 	uint32_t flow_class;
856 	uint32_t sgid_type;
857 	int ret;
858 
859 	/* No point searching multicast GIDs in GID table */
860 	if (grh->dgid.raw[0] == 0xFF) {
861 		errno = EINVAL;
862 		return -1;
863 	}
864 
865 	ah_attr->grh.dgid = grh->sgid;
866 	if (grh->next_hdr == IPPROTO_UDP) {
867 		sgid_type = IBV_GID_TYPE_ROCE_V2;
868 	} else if (grh->next_hdr == IB_NEXT_HDR) {
869 		sgid_type = IBV_GID_TYPE_IB_ROCE_V1;
870 	} else {
871 		errno = EPROTONOSUPPORT;
872 		return -1;
873 	}
874 
875 	ret = ibv_find_gid_index(context, port_num, &grh->dgid,
876 				 sgid_type);
877 	if (ret < 0)
878 		return ret;
879 
880 	ah_attr->grh.sgid_index = (uint8_t) ret;
881 	flow_class = be32toh(grh->version_tclass_flow);
882 	ah_attr->grh.hop_limit = grh->hop_limit;
883 	ah_attr->grh.traffic_class = (flow_class >> 20) & 0xFF;
884 
885 	return 0;
886 }
887 
ibv_init_ah_from_wc(struct ibv_context * context,uint8_t port_num,struct ibv_wc * wc,struct ibv_grh * grh,struct ibv_ah_attr * ah_attr)888 int ibv_init_ah_from_wc(struct ibv_context *context, uint8_t port_num,
889 			struct ibv_wc *wc, struct ibv_grh *grh,
890 			struct ibv_ah_attr *ah_attr)
891 {
892 	int version;
893 	int ret = 0;
894 
895 	memset(ah_attr, 0, sizeof *ah_attr);
896 	set_ah_attr_generic_fields(ah_attr, wc, grh, port_num);
897 
898 	if (wc->wc_flags & IBV_WC_GRH) {
899 		ah_attr->is_global = 1;
900 		version = get_grh_header_version(grh);
901 
902 		if (version == 4)
903 			ret = set_ah_attr_by_ipv4(context, ah_attr,
904 						  (struct ip *)((void *)grh + 20),
905 						  port_num);
906 		else if (version == 6)
907 			ret = set_ah_attr_by_ipv6(context, ah_attr, grh,
908 						  port_num);
909 		else
910 			ret = -1;
911 	}
912 
913 	return ret;
914 }
915 
ibv_create_ah_from_wc(struct ibv_pd * pd,struct ibv_wc * wc,struct ibv_grh * grh,uint8_t port_num)916 struct ibv_ah *ibv_create_ah_from_wc(struct ibv_pd *pd, struct ibv_wc *wc,
917 				     struct ibv_grh *grh, uint8_t port_num)
918 {
919 	struct ibv_ah_attr ah_attr;
920 	int ret;
921 
922 	ret = ibv_init_ah_from_wc(pd->context, port_num, wc, grh, &ah_attr);
923 	if (ret)
924 		return NULL;
925 
926 	return ibv_create_ah(pd, &ah_attr);
927 }
928 
__ibv_destroy_ah(struct ibv_ah * ah)929 int __ibv_destroy_ah(struct ibv_ah *ah)
930 {
931 	return ah->context->ops.destroy_ah(ah);
932 }
933 default_symver(__ibv_destroy_ah, ibv_destroy_ah);
934 
__ibv_attach_mcast(struct ibv_qp * qp,const union ibv_gid * gid,uint16_t lid)935 int __ibv_attach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid)
936 {
937 	return qp->context->ops.attach_mcast(qp, gid, lid);
938 }
939 default_symver(__ibv_attach_mcast, ibv_attach_mcast);
940 
__ibv_detach_mcast(struct ibv_qp * qp,const union ibv_gid * gid,uint16_t lid)941 int __ibv_detach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid)
942 {
943 	return qp->context->ops.detach_mcast(qp, gid, lid);
944 }
945 default_symver(__ibv_detach_mcast, ibv_detach_mcast);
946 
ipv6_addr_v4mapped(const struct in6_addr * a)947 static inline int ipv6_addr_v4mapped(const struct in6_addr *a)
948 {
949 	return IN6_IS_ADDR_V4MAPPED(a) ||
950 		/* IPv4 encoded multicast addresses */
951 		(a->s6_addr32[0]  == htobe32(0xff0e0000) &&
952 		((a->s6_addr32[1] |
953 		 (a->s6_addr32[2] ^ htobe32(0x0000ffff))) == 0UL));
954 }
955 
/* A view onto the network-address bytes stored inside a raw GID. */
struct peer_address {
	void *address;		/* first byte of the address inside the GID */
	uint32_t size;		/* address length in bytes */
};

/*
 * Point @peer_address at the address bytes of @raw_gid for @family.
 *
 * AF_INET:  the last 4 bytes of the 16-byte GID (offset 12).
 * AF_INET6: all 16 bytes of the GID.
 *
 * Returns 0 on success.  Returns -1 for any other family and leaves
 * @peer_address untouched.  No bytes are copied; the result aliases
 * @raw_gid.
 */
static inline int create_peer_from_gid(int family, void *raw_gid,
				       struct peer_address *peer_address)
{
	/* Arithmetic on void * is a GNU extension; use a byte pointer. */
	uint8_t *gid_bytes = raw_gid;

	switch (family) {
	case AF_INET:
		peer_address->address = gid_bytes + 12;
		peer_address->size = 4;
		break;
	case AF_INET6:
		peer_address->address = gid_bytes;
		peer_address->size = 16;
		break;
	default:
		return -1;
	}

	return 0;
}
979 
980 #define NEIGH_GET_DEFAULT_TIMEOUT_MS 3000
ibv_resolve_eth_l2_from_gid(struct ibv_context * context,struct ibv_ah_attr * attr,uint8_t eth_mac[ETHERNET_LL_SIZE],uint16_t * vid)981 int ibv_resolve_eth_l2_from_gid(struct ibv_context *context,
982 				struct ibv_ah_attr *attr,
983 				uint8_t eth_mac[ETHERNET_LL_SIZE],
984 				uint16_t *vid)
985 {
986 #ifndef NRESOLVE_NEIGH
987 	int dst_family;
988 	int src_family;
989 	int oif;
990 	struct get_neigh_handler neigh_handler;
991 	union ibv_gid sgid;
992 	int ether_len;
993 	struct peer_address src;
994 	struct peer_address dst;
995 	uint16_t ret_vid;
996 	int ret = -EINVAL;
997 	int err;
998 
999 	err = ibv_query_gid(context, attr->port_num,
1000 			    attr->grh.sgid_index, &sgid);
1001 
1002 	if (err)
1003 		return err;
1004 
1005 	err = neigh_init_resources(&neigh_handler,
1006 				   NEIGH_GET_DEFAULT_TIMEOUT_MS);
1007 
1008 	if (err)
1009 		return err;
1010 
1011 	dst_family = ipv6_addr_v4mapped((struct in6_addr *)attr->grh.dgid.raw) ?
1012 			AF_INET : AF_INET6;
1013 	src_family = ipv6_addr_v4mapped((struct in6_addr *)sgid.raw) ?
1014 			AF_INET : AF_INET6;
1015 
1016 	if (create_peer_from_gid(dst_family, attr->grh.dgid.raw, &dst))
1017 		goto free_resources;
1018 
1019 	if (create_peer_from_gid(src_family, &sgid.raw, &src))
1020 		goto free_resources;
1021 
1022 	if (neigh_set_dst(&neigh_handler, dst_family, dst.address,
1023 			  dst.size))
1024 		goto free_resources;
1025 
1026 	if (neigh_set_src(&neigh_handler, src_family, src.address,
1027 			  src.size))
1028 		goto free_resources;
1029 
1030 	oif = neigh_get_oif_from_src(&neigh_handler);
1031 
1032 	if (oif > 0)
1033 		neigh_set_oif(&neigh_handler, oif);
1034 	else
1035 		goto free_resources;
1036 
1037 	ret = -EHOSTUNREACH;
1038 
1039 	/* blocking call */
1040 	if (process_get_neigh(&neigh_handler))
1041 		goto free_resources;
1042 
1043 	ret_vid = neigh_get_vlan_id_from_dev(&neigh_handler);
1044 
1045 	if (ret_vid <= 0xfff)
1046 		neigh_set_vlan_id(&neigh_handler, ret_vid);
1047 
1048 	/* We are using only Ethernet here */
1049 	ether_len = neigh_get_ll(&neigh_handler,
1050 				 eth_mac,
1051 				 sizeof(uint8_t) * ETHERNET_LL_SIZE);
1052 
1053 	if (ether_len <= 0)
1054 		goto free_resources;
1055 
1056 	*vid = ret_vid;
1057 
1058 	ret = 0;
1059 
1060 free_resources:
1061 	neigh_free_resources(&neigh_handler);
1062 
1063 	return ret;
1064 #else
1065 	return -ENOSYS;
1066 #endif
1067 }
1068