xref: /freebsd/contrib/ofed/libibverbs/verbs.c (revision 13ec1e3155c7e9bf037b12af186351b7fa9b9450)
1 /*
2  * Copyright (c) 2005 Topspin Communications.  All rights reserved.
3  * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
4  *
5  * This software is available to you under a choice of one of two
6  * licenses.  You may choose to be licensed under the terms of the GNU
7  * General Public License (GPL) Version 2, available from the file
8  * COPYING in the main directory of this source tree, or the
9  * OpenIB.org BSD license below:
10  *
11  *     Redistribution and use in source and binary forms, with or
12  *     without modification, are permitted provided that the following
13  *     conditions are met:
14  *
15  *      - Redistributions of source code must retain the above
16  *        copyright notice, this list of conditions and the following
17  *        disclaimer.
18  *
19  *      - Redistributions in binary form must reproduce the above
20  *        copyright notice, this list of conditions and the following
21  *        disclaimer in the documentation and/or other materials
22  *        provided with the distribution.
23  *
24  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31  * SOFTWARE.
32  */
33 
34 #define _GNU_SOURCE
35 #include <config.h>
36 
37 #include <infiniband/endian.h>
38 #include <stdio.h>
39 #include <unistd.h>
40 #include <stdlib.h>
41 #include <errno.h>
42 #include <string.h>
43 #include <dirent.h>
44 #include <netinet/in.h>
45 #include <netinet/ip.h>
46 #include <sys/socket.h>
47 
48 #include "ibverbs.h"
49 #ifndef NRESOLVE_NEIGH
50 #include <net/if.h>
51 #include <net/if_arp.h>
52 #include "neigh.h"
53 #endif
54 
/* Hack to avoid GCC's -Wmissing-prototypes warning, and the similar error
   from sparse, for the functions prototyped below. Symbol versioning
   requires the goofy names; each prototype must match the version in
   verbs.h.
 */
59 int __ibv_query_device(struct ibv_context *context,
60 		       struct ibv_device_attr *device_attr);
61 int __ibv_query_port(struct ibv_context *context, uint8_t port_num,
62 		     struct ibv_port_attr *port_attr);
63 int __ibv_query_gid(struct ibv_context *context, uint8_t port_num, int index,
64 		    union ibv_gid *gid);
65 int __ibv_query_pkey(struct ibv_context *context, uint8_t port_num, int index,
66 		     __be16 *pkey);
67 struct ibv_pd *__ibv_alloc_pd(struct ibv_context *context);
68 int __ibv_dealloc_pd(struct ibv_pd *pd);
69 struct ibv_mr *__ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length,
70 			    int access);
71 int __ibv_rereg_mr(struct ibv_mr *mr, int flags, struct ibv_pd *pd, void *addr,
72 		   size_t length, int access);
73 int __ibv_dereg_mr(struct ibv_mr *mr);
74 struct ibv_cq *__ibv_create_cq(struct ibv_context *context, int cqe,
75 			       void *cq_context,
76 			       struct ibv_comp_channel *channel,
77 			       int comp_vector);
78 int __ibv_resize_cq(struct ibv_cq *cq, int cqe);
79 int __ibv_destroy_cq(struct ibv_cq *cq);
80 int __ibv_get_cq_event(struct ibv_comp_channel *channel, struct ibv_cq **cq,
81 		       void **cq_context);
82 void __ibv_ack_cq_events(struct ibv_cq *cq, unsigned int nevents);
83 struct ibv_srq *__ibv_create_srq(struct ibv_pd *pd,
84 				 struct ibv_srq_init_attr *srq_init_attr);
85 int __ibv_modify_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr,
86 		     int srq_attr_mask);
87 int __ibv_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr);
88 int __ibv_destroy_srq(struct ibv_srq *srq);
89 struct ibv_qp *__ibv_create_qp(struct ibv_pd *pd,
90 			       struct ibv_qp_init_attr *qp_init_attr);
91 int __ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask,
92 		   struct ibv_qp_init_attr *init_attr);
93 int __ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask);
94 int __ibv_destroy_qp(struct ibv_qp *qp);
95 struct ibv_ah *__ibv_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr);
96 int __ibv_destroy_ah(struct ibv_ah *ah);
97 int __ibv_attach_mcast(struct ibv_qp *qp, const union ibv_gid *gid,
98 		       uint16_t lid);
99 int __ibv_detach_mcast(struct ibv_qp *qp, const union ibv_gid *gid,
100 		       uint16_t lid);
101 
102 int __attribute__((const)) ibv_rate_to_mult(enum ibv_rate rate)
103 {
104 	switch (rate) {
105 	case IBV_RATE_2_5_GBPS: return  1;
106 	case IBV_RATE_5_GBPS:   return  2;
107 	case IBV_RATE_10_GBPS:  return  4;
108 	case IBV_RATE_20_GBPS:  return  8;
109 	case IBV_RATE_30_GBPS:  return 12;
110 	case IBV_RATE_40_GBPS:  return 16;
111 	case IBV_RATE_60_GBPS:  return 24;
112 	case IBV_RATE_80_GBPS:  return 32;
113 	case IBV_RATE_120_GBPS: return 48;
114 	case IBV_RATE_28_GBPS:  return 11;
115 	case IBV_RATE_50_GBPS:  return 20;
116 	case IBV_RATE_400_GBPS: return 160;
117 	case IBV_RATE_600_GBPS: return 240;
118 	default:           return -1;
119 	}
120 }
121 
122 enum ibv_rate __attribute__((const)) mult_to_ibv_rate(int mult)
123 {
124 	switch (mult) {
125 	case 1:  return IBV_RATE_2_5_GBPS;
126 	case 2:  return IBV_RATE_5_GBPS;
127 	case 4:  return IBV_RATE_10_GBPS;
128 	case 8:  return IBV_RATE_20_GBPS;
129 	case 12: return IBV_RATE_30_GBPS;
130 	case 16: return IBV_RATE_40_GBPS;
131 	case 24: return IBV_RATE_60_GBPS;
132 	case 32: return IBV_RATE_80_GBPS;
133 	case 48: return IBV_RATE_120_GBPS;
134 	case 11: return IBV_RATE_28_GBPS;
135 	case 20: return IBV_RATE_50_GBPS;
136 	case 160: return IBV_RATE_400_GBPS;
137 	case 240: return IBV_RATE_600_GBPS;
138 	default: return IBV_RATE_MAX;
139 	}
140 }
141 
142 int  __attribute__((const)) ibv_rate_to_mbps(enum ibv_rate rate)
143 {
144 	switch (rate) {
145 	case IBV_RATE_2_5_GBPS: return 2500;
146 	case IBV_RATE_5_GBPS:   return 5000;
147 	case IBV_RATE_10_GBPS:  return 10000;
148 	case IBV_RATE_20_GBPS:  return 20000;
149 	case IBV_RATE_30_GBPS:  return 30000;
150 	case IBV_RATE_40_GBPS:  return 40000;
151 	case IBV_RATE_60_GBPS:  return 60000;
152 	case IBV_RATE_80_GBPS:  return 80000;
153 	case IBV_RATE_120_GBPS: return 120000;
154 	case IBV_RATE_14_GBPS:  return 14062;
155 	case IBV_RATE_56_GBPS:  return 56250;
156 	case IBV_RATE_112_GBPS: return 112500;
157 	case IBV_RATE_168_GBPS: return 168750;
158 	case IBV_RATE_25_GBPS:  return 25781;
159 	case IBV_RATE_100_GBPS: return 103125;
160 	case IBV_RATE_200_GBPS: return 206250;
161 	case IBV_RATE_300_GBPS: return 309375;
162 	case IBV_RATE_28_GBPS:  return 28125;
163 	case IBV_RATE_50_GBPS:  return 53125;
164 	case IBV_RATE_400_GBPS: return 425000;
165 	case IBV_RATE_600_GBPS: return 637500;
166 	default:               return -1;
167 	}
168 }
169 
170 enum ibv_rate __attribute__((const)) mbps_to_ibv_rate(int mbps)
171 {
172 	switch (mbps) {
173 	case 2500:   return IBV_RATE_2_5_GBPS;
174 	case 5000:   return IBV_RATE_5_GBPS;
175 	case 10000:  return IBV_RATE_10_GBPS;
176 	case 20000:  return IBV_RATE_20_GBPS;
177 	case 30000:  return IBV_RATE_30_GBPS;
178 	case 40000:  return IBV_RATE_40_GBPS;
179 	case 60000:  return IBV_RATE_60_GBPS;
180 	case 80000:  return IBV_RATE_80_GBPS;
181 	case 120000: return IBV_RATE_120_GBPS;
182 	case 14062:  return IBV_RATE_14_GBPS;
183 	case 56250:  return IBV_RATE_56_GBPS;
184 	case 112500: return IBV_RATE_112_GBPS;
185 	case 168750: return IBV_RATE_168_GBPS;
186 	case 25781:  return IBV_RATE_25_GBPS;
187 	case 103125: return IBV_RATE_100_GBPS;
188 	case 206250: return IBV_RATE_200_GBPS;
189 	case 309375: return IBV_RATE_300_GBPS;
190 	case 28125:  return IBV_RATE_28_GBPS;
191 	case 53125:  return IBV_RATE_50_GBPS;
192 	case 425000: return IBV_RATE_400_GBPS;
193 	case 637500: return IBV_RATE_600_GBPS;
194 	default:     return IBV_RATE_MAX;
195 	}
196 }
197 
198 int __ibv_query_device(struct ibv_context *context,
199 		       struct ibv_device_attr *device_attr)
200 {
201 	return context->ops.query_device(context, device_attr);
202 }
203 default_symver(__ibv_query_device, ibv_query_device);
204 
205 int __ibv_query_port(struct ibv_context *context, uint8_t port_num,
206 		     struct ibv_port_attr *port_attr)
207 {
208 	return context->ops.query_port(context, port_num, port_attr);
209 }
210 default_symver(__ibv_query_port, ibv_query_port);
211 
212 int __ibv_query_gid(struct ibv_context *context, uint8_t port_num,
213 		    int index, union ibv_gid *gid)
214 {
215 	char name[24];
216 	char attr[41];
217 	uint16_t val;
218 	int i;
219 
220 	snprintf(name, sizeof name, "ports/%d/gids/%d", port_num, index);
221 
222 	if (ibv_read_sysfs_file(context->device->ibdev_path, name,
223 				attr, sizeof attr) < 0)
224 		return -1;
225 
226 	for (i = 0; i < 8; ++i) {
227 		if (sscanf(attr + i * 5, "%hx", &val) != 1)
228 			return -1;
229 		gid->raw[i * 2    ] = val >> 8;
230 		gid->raw[i * 2 + 1] = val & 0xff;
231 	}
232 
233 	return 0;
234 }
235 default_symver(__ibv_query_gid, ibv_query_gid);
236 
237 int __ibv_query_pkey(struct ibv_context *context, uint8_t port_num,
238 		     int index, __be16 *pkey)
239 {
240 	char name[24];
241 	char attr[8];
242 	uint16_t val;
243 
244 	snprintf(name, sizeof name, "ports/%d/pkeys/%d", port_num, index);
245 
246 	if (ibv_read_sysfs_file(context->device->ibdev_path, name,
247 				attr, sizeof attr) < 0)
248 		return -1;
249 
250 	if (sscanf(attr, "%hx", &val) != 1)
251 		return -1;
252 
253 	*pkey = htobe16(val);
254 	return 0;
255 }
256 default_symver(__ibv_query_pkey, ibv_query_pkey);
257 
258 struct ibv_pd *__ibv_alloc_pd(struct ibv_context *context)
259 {
260 	struct ibv_pd *pd;
261 
262 	pd = context->ops.alloc_pd(context);
263 	if (pd)
264 		pd->context = context;
265 
266 	return pd;
267 }
268 default_symver(__ibv_alloc_pd, ibv_alloc_pd);
269 
270 int __ibv_dealloc_pd(struct ibv_pd *pd)
271 {
272 	return pd->context->ops.dealloc_pd(pd);
273 }
274 default_symver(__ibv_dealloc_pd, ibv_dealloc_pd);
275 
276 struct ibv_mr *__ibv_reg_mr(struct ibv_pd *pd, void *addr,
277 			    size_t length, int access)
278 {
279 	struct ibv_mr *mr;
280 
281 	if (ibv_dontfork_range(addr, length))
282 		return NULL;
283 
284 	mr = pd->context->ops.reg_mr(pd, addr, length, access);
285 	if (mr) {
286 		mr->context = pd->context;
287 		mr->pd      = pd;
288 		mr->addr    = addr;
289 		mr->length  = length;
290 	} else
291 		ibv_dofork_range(addr, length);
292 
293 	return mr;
294 }
295 default_symver(__ibv_reg_mr, ibv_reg_mr);
296 
297 int __ibv_rereg_mr(struct ibv_mr *mr, int flags,
298 		   struct ibv_pd *pd, void *addr,
299 		   size_t length, int access)
300 {
301 	int dofork_onfail = 0;
302 	int err;
303 	void *old_addr;
304 	size_t old_len;
305 
306 	if (flags & ~IBV_REREG_MR_FLAGS_SUPPORTED) {
307 		errno = EINVAL;
308 		return IBV_REREG_MR_ERR_INPUT;
309 	}
310 
311 	if ((flags & IBV_REREG_MR_CHANGE_TRANSLATION) &&
312 	    (!length || !addr)) {
313 		errno = EINVAL;
314 		return IBV_REREG_MR_ERR_INPUT;
315 	}
316 
317 	if (access && !(flags & IBV_REREG_MR_CHANGE_ACCESS)) {
318 		errno = EINVAL;
319 		return IBV_REREG_MR_ERR_INPUT;
320 	}
321 
322 	if (!mr->context->ops.rereg_mr) {
323 		errno = ENOSYS;
324 		return IBV_REREG_MR_ERR_INPUT;
325 	}
326 
327 	if (flags & IBV_REREG_MR_CHANGE_TRANSLATION) {
328 		err = ibv_dontfork_range(addr, length);
329 		if (err)
330 			return IBV_REREG_MR_ERR_DONT_FORK_NEW;
331 		dofork_onfail = 1;
332 	}
333 
334 	old_addr = mr->addr;
335 	old_len = mr->length;
336 	err = mr->context->ops.rereg_mr(mr, flags, pd, addr, length, access);
337 	if (!err) {
338 		if (flags & IBV_REREG_MR_CHANGE_PD)
339 			mr->pd = pd;
340 		if (flags & IBV_REREG_MR_CHANGE_TRANSLATION) {
341 			mr->addr    = addr;
342 			mr->length  = length;
343 			err = ibv_dofork_range(old_addr, old_len);
344 			if (err)
345 				return IBV_REREG_MR_ERR_DO_FORK_OLD;
346 		}
347 	} else {
348 		err = IBV_REREG_MR_ERR_CMD;
349 		if (dofork_onfail) {
350 			if (ibv_dofork_range(addr, length))
351 				err = IBV_REREG_MR_ERR_CMD_AND_DO_FORK_NEW;
352 		}
353 	}
354 
355 	return err;
356 }
357 default_symver(__ibv_rereg_mr, ibv_rereg_mr);
358 
359 int __ibv_dereg_mr(struct ibv_mr *mr)
360 {
361 	int ret;
362 	void *addr	= mr->addr;
363 	size_t length	= mr->length;
364 
365 	ret = mr->context->ops.dereg_mr(mr);
366 	if (!ret)
367 		ibv_dofork_range(addr, length);
368 
369 	return ret;
370 }
371 default_symver(__ibv_dereg_mr, ibv_dereg_mr);
372 
373 static struct ibv_comp_channel *ibv_create_comp_channel_v2(struct ibv_context *context)
374 {
375 	struct ibv_abi_compat_v2 *t = context->abi_compat;
376 	static int warned;
377 
378 	if (!pthread_mutex_trylock(&t->in_use))
379 		return &t->channel;
380 
381 	if (!warned) {
382 		fprintf(stderr, PFX "Warning: kernel's ABI version %d limits capacity.\n"
383 			"    Only one completion channel can be created per context.\n",
384 			abi_ver);
385 		++warned;
386 	}
387 
388 	return NULL;
389 }
390 
391 struct ibv_comp_channel *ibv_create_comp_channel(struct ibv_context *context)
392 {
393 	struct ibv_comp_channel            *channel;
394 	struct ibv_create_comp_channel      cmd;
395 	struct ibv_create_comp_channel_resp resp;
396 
397 	if (abi_ver <= 2)
398 		return ibv_create_comp_channel_v2(context);
399 
400 	channel = malloc(sizeof *channel);
401 	if (!channel)
402 		return NULL;
403 
404 	IBV_INIT_CMD_RESP(&cmd, sizeof cmd, CREATE_COMP_CHANNEL, &resp, sizeof resp);
405 	if (write(context->cmd_fd, &cmd, sizeof cmd) != sizeof cmd) {
406 		free(channel);
407 		return NULL;
408 	}
409 
410 	(void) VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
411 
412 	channel->context = context;
413 	channel->fd      = resp.fd;
414 	channel->refcnt  = 0;
415 
416 	return channel;
417 }
418 
419 static int ibv_destroy_comp_channel_v2(struct ibv_comp_channel *channel)
420 {
421 	struct ibv_abi_compat_v2 *t = (struct ibv_abi_compat_v2 *) channel;
422 	pthread_mutex_unlock(&t->in_use);
423 	return 0;
424 }
425 
426 int ibv_destroy_comp_channel(struct ibv_comp_channel *channel)
427 {
428 	struct ibv_context *context;
429 	int ret;
430 
431 	context = channel->context;
432 	pthread_mutex_lock(&context->mutex);
433 
434 	if (channel->refcnt) {
435 		ret = EBUSY;
436 		goto out;
437 	}
438 
439 	if (abi_ver <= 2) {
440 		ret = ibv_destroy_comp_channel_v2(channel);
441 		goto out;
442 	}
443 
444 	close(channel->fd);
445 	free(channel);
446 	ret = 0;
447 
448 out:
449 	pthread_mutex_unlock(&context->mutex);
450 
451 	return ret;
452 }
453 
454 struct ibv_cq *__ibv_create_cq(struct ibv_context *context, int cqe, void *cq_context,
455 			       struct ibv_comp_channel *channel, int comp_vector)
456 {
457 	struct ibv_cq *cq;
458 
459 	cq = context->ops.create_cq(context, cqe, channel, comp_vector);
460 
461 	if (cq)
462 		verbs_init_cq(cq, context, channel, cq_context);
463 
464 	return cq;
465 }
466 default_symver(__ibv_create_cq, ibv_create_cq);
467 
468 int __ibv_resize_cq(struct ibv_cq *cq, int cqe)
469 {
470 	if (!cq->context->ops.resize_cq)
471 		return ENOSYS;
472 
473 	return cq->context->ops.resize_cq(cq, cqe);
474 }
475 default_symver(__ibv_resize_cq, ibv_resize_cq);
476 
477 int __ibv_destroy_cq(struct ibv_cq *cq)
478 {
479 	struct ibv_comp_channel *channel = cq->channel;
480 	int ret;
481 
482 	ret = cq->context->ops.destroy_cq(cq);
483 
484 	if (channel) {
485 		if (!ret) {
486 			pthread_mutex_lock(&channel->context->mutex);
487 			--channel->refcnt;
488 			pthread_mutex_unlock(&channel->context->mutex);
489 		}
490 	}
491 
492 	return ret;
493 }
494 default_symver(__ibv_destroy_cq, ibv_destroy_cq);
495 
496 int __ibv_get_cq_event(struct ibv_comp_channel *channel,
497 		       struct ibv_cq **cq, void **cq_context)
498 {
499 	struct ibv_comp_event ev;
500 
501 	if (read(channel->fd, &ev, sizeof ev) != sizeof ev)
502 		return -1;
503 
504 	*cq         = (struct ibv_cq *) (uintptr_t) ev.cq_handle;
505 	*cq_context = (*cq)->cq_context;
506 
507 	if ((*cq)->context->ops.cq_event)
508 		(*cq)->context->ops.cq_event(*cq);
509 
510 	return 0;
511 }
512 default_symver(__ibv_get_cq_event, ibv_get_cq_event);
513 
514 void __ibv_ack_cq_events(struct ibv_cq *cq, unsigned int nevents)
515 {
516 	pthread_mutex_lock(&cq->mutex);
517 	cq->comp_events_completed += nevents;
518 	pthread_cond_signal(&cq->cond);
519 	pthread_mutex_unlock(&cq->mutex);
520 }
521 default_symver(__ibv_ack_cq_events, ibv_ack_cq_events);
522 
523 struct ibv_srq *__ibv_create_srq(struct ibv_pd *pd,
524 				 struct ibv_srq_init_attr *srq_init_attr)
525 {
526 	struct ibv_srq *srq;
527 
528 	if (!pd->context->ops.create_srq)
529 		return NULL;
530 
531 	srq = pd->context->ops.create_srq(pd, srq_init_attr);
532 	if (srq) {
533 		srq->context          = pd->context;
534 		srq->srq_context      = srq_init_attr->srq_context;
535 		srq->pd               = pd;
536 		srq->events_completed = 0;
537 		pthread_mutex_init(&srq->mutex, NULL);
538 		pthread_cond_init(&srq->cond, NULL);
539 	}
540 
541 	return srq;
542 }
543 default_symver(__ibv_create_srq, ibv_create_srq);
544 
545 int __ibv_modify_srq(struct ibv_srq *srq,
546 		     struct ibv_srq_attr *srq_attr,
547 		     int srq_attr_mask)
548 {
549 	return srq->context->ops.modify_srq(srq, srq_attr, srq_attr_mask);
550 }
551 default_symver(__ibv_modify_srq, ibv_modify_srq);
552 
553 int __ibv_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr)
554 {
555 	return srq->context->ops.query_srq(srq, srq_attr);
556 }
557 default_symver(__ibv_query_srq, ibv_query_srq);
558 
559 int __ibv_destroy_srq(struct ibv_srq *srq)
560 {
561 	return srq->context->ops.destroy_srq(srq);
562 }
563 default_symver(__ibv_destroy_srq, ibv_destroy_srq);
564 
565 struct ibv_qp *__ibv_create_qp(struct ibv_pd *pd,
566 			       struct ibv_qp_init_attr *qp_init_attr)
567 {
568 	struct ibv_qp *qp = pd->context->ops.create_qp(pd, qp_init_attr);
569 
570 	if (qp) {
571 		qp->context    	     = pd->context;
572 		qp->qp_context 	     = qp_init_attr->qp_context;
573 		qp->pd         	     = pd;
574 		qp->send_cq    	     = qp_init_attr->send_cq;
575 		qp->recv_cq    	     = qp_init_attr->recv_cq;
576 		qp->srq        	     = qp_init_attr->srq;
577 		qp->qp_type          = qp_init_attr->qp_type;
578 		qp->state	     = IBV_QPS_RESET;
579 		qp->events_completed = 0;
580 		pthread_mutex_init(&qp->mutex, NULL);
581 		pthread_cond_init(&qp->cond, NULL);
582 	}
583 
584 	return qp;
585 }
586 default_symver(__ibv_create_qp, ibv_create_qp);
587 
588 int __ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
589 		   int attr_mask,
590 		   struct ibv_qp_init_attr *init_attr)
591 {
592 	int ret;
593 
594 	ret = qp->context->ops.query_qp(qp, attr, attr_mask, init_attr);
595 	if (ret)
596 		return ret;
597 
598 	if (attr_mask & IBV_QP_STATE)
599 		qp->state = attr->qp_state;
600 
601 	return 0;
602 }
603 default_symver(__ibv_query_qp, ibv_query_qp);
604 
605 int __ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
606 		    int attr_mask)
607 {
608 	int ret;
609 
610 	ret = qp->context->ops.modify_qp(qp, attr, attr_mask);
611 	if (ret)
612 		return ret;
613 
614 	if (attr_mask & IBV_QP_STATE)
615 		qp->state = attr->qp_state;
616 
617 	return 0;
618 }
619 default_symver(__ibv_modify_qp, ibv_modify_qp);
620 
621 int __ibv_destroy_qp(struct ibv_qp *qp)
622 {
623 	return qp->context->ops.destroy_qp(qp);
624 }
625 default_symver(__ibv_destroy_qp, ibv_destroy_qp);
626 
627 struct ibv_ah *__ibv_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr)
628 {
629 	struct ibv_ah *ah = pd->context->ops.create_ah(pd, attr);
630 
631 	if (ah) {
632 		ah->context = pd->context;
633 		ah->pd      = pd;
634 	}
635 
636 	return ah;
637 }
638 default_symver(__ibv_create_ah, ibv_create_ah);
639 
640 /* GID types as appear in sysfs, no change is expected as of ABI
641  * compatibility.
642  */
643 #define V1_TYPE "IB/RoCE v1"
644 #define V2_TYPE "RoCE v2"
645 int ibv_query_gid_type(struct ibv_context *context, uint8_t port_num,
646 		       unsigned int index, enum ibv_gid_type *type)
647 {
648 	char name[32];
649 	char buff[11];
650 
651 	snprintf(name, sizeof(name), "ports/%d/gid_attrs/types/%d", port_num,
652 		 index);
653 
654 	/* Reset errno so that we can rely on its value upon any error flow in
655 	 * ibv_read_sysfs_file.
656 	 */
657 	errno = 0;
658 	if (ibv_read_sysfs_file(context->device->ibdev_path, name, buff,
659 				sizeof(buff)) <= 0) {
660 		char *dir_path;
661 		DIR *dir;
662 
663 		if (errno == EINVAL) {
664 			/* In IB, this file doesn't exist and the kernel sets
665 			 * errno to -EINVAL.
666 			 */
667 			*type = IBV_GID_TYPE_IB_ROCE_V1;
668 			return 0;
669 		}
670 		if (asprintf(&dir_path, "%s/%s/%d/%s/",
671 			     context->device->ibdev_path, "ports", port_num,
672 			     "gid_attrs") < 0)
673 			return -1;
674 		dir = opendir(dir_path);
675 		free(dir_path);
676 		if (!dir) {
677 			if (errno == ENOENT)
678 				/* Assuming that if gid_attrs doesn't exist,
679 				 * we have an old kernel and all GIDs are
680 				 * IB/RoCE v1
681 				 */
682 				*type = IBV_GID_TYPE_IB_ROCE_V1;
683 			else
684 				return -1;
685 		} else {
686 			closedir(dir);
687 			errno = EFAULT;
688 			return -1;
689 		}
690 	} else {
691 		if (!strcmp(buff, V1_TYPE)) {
692 			*type = IBV_GID_TYPE_IB_ROCE_V1;
693 		} else if (!strcmp(buff, V2_TYPE)) {
694 			*type = IBV_GID_TYPE_ROCE_V2;
695 		} else {
696 			errno = ENOTSUP;
697 			return -1;
698 		}
699 	}
700 
701 	return 0;
702 }
703 
704 static int ibv_find_gid_index(struct ibv_context *context, uint8_t port_num,
705 			      union ibv_gid *gid, enum ibv_gid_type gid_type)
706 {
707 	enum ibv_gid_type sgid_type = 0;
708 	union ibv_gid sgid;
709 	int i = 0, ret;
710 
711 	do {
712 		ret = ibv_query_gid(context, port_num, i, &sgid);
713 		if (!ret) {
714 			ret = ibv_query_gid_type(context, port_num, i,
715 						 &sgid_type);
716 		}
717 		i++;
718 	} while (!ret && (memcmp(&sgid, gid, sizeof(*gid)) ||
719 		 (gid_type != sgid_type)));
720 
721 	return ret ? ret : i - 1;
722 }
723 
724 static inline void map_ipv4_addr_to_ipv6(__be32 ipv4, struct in6_addr *ipv6)
725 {
726 	ipv6->s6_addr32[0] = 0;
727 	ipv6->s6_addr32[1] = 0;
728 	ipv6->s6_addr32[2] = htobe32(0x0000FFFF);
729 	ipv6->s6_addr32[3] = ipv4;
730 }
731 
732 static inline __sum16 ipv4_calc_hdr_csum(uint16_t *data, unsigned int num_hwords)
733 {
734 	unsigned int i = 0;
735 	uint32_t sum = 0;
736 
737 	for (i = 0; i < num_hwords; i++)
738 		sum += *(data++);
739 
740 	sum = (sum & 0xffff) + (sum >> 16);
741 
742 	return (__sum16)~sum;
743 }
744 
745 static inline int get_grh_header_version(struct ibv_grh *grh)
746 {
747 	int ip6h_version = (be32toh(grh->version_tclass_flow) >> 28) & 0xf;
748 	struct ip *ip4h = (struct ip *)((void *)grh + 20);
749 	struct ip ip4h_checked;
750 
751 	if (ip6h_version != 6) {
752 		if (ip4h->ip_v == 4)
753 			return 4;
754 		errno = EPROTONOSUPPORT;
755 		return -1;
756 	}
757 	/* version may be 6 or 4 */
758 	if (ip4h->ip_hl != 5) /* IPv4 header length must be 5 for RoCE v2. */
759 		return 6;
760 	/*
761 	* Verify checksum.
762 	* We can't write on scattered buffers so we have to copy to temp
763 	* buffer.
764 	*/
765 	memcpy(&ip4h_checked, ip4h, sizeof(ip4h_checked));
766 	/* Need to set the checksum field (check) to 0 before re-calculating
767 	 * the checksum.
768 	 */
769 	ip4h_checked.ip_sum = 0;
770 	ip4h_checked.ip_sum = ipv4_calc_hdr_csum((uint16_t *)&ip4h_checked, 10);
771 	/* if IPv4 header checksum is OK, believe it */
772 	if (ip4h->ip_sum == ip4h_checked.ip_sum)
773 		return 4;
774 	return 6;
775 }
776 
777 static inline void set_ah_attr_generic_fields(struct ibv_ah_attr *ah_attr,
778 					      struct ibv_wc *wc,
779 					      struct ibv_grh *grh,
780 					      uint8_t port_num)
781 {
782 	uint32_t flow_class;
783 
784 	flow_class = be32toh(grh->version_tclass_flow);
785 	ah_attr->grh.flow_label = flow_class & 0xFFFFF;
786 	ah_attr->dlid = wc->slid;
787 	ah_attr->sl = wc->sl;
788 	ah_attr->src_path_bits = wc->dlid_path_bits;
789 	ah_attr->port_num = port_num;
790 }
791 
792 static inline int set_ah_attr_by_ipv4(struct ibv_context *context,
793 				      struct ibv_ah_attr *ah_attr,
794 				      struct ip *ip4h, uint8_t port_num)
795 {
796 	union ibv_gid sgid;
797 	int ret;
798 
799 	/* No point searching multicast GIDs in GID table */
800 	if (IN_CLASSD(be32toh(ip4h->ip_dst.s_addr))) {
801 		errno = EINVAL;
802 		return -1;
803 	}
804 
805 	map_ipv4_addr_to_ipv6(ip4h->ip_dst.s_addr, (struct in6_addr *)&sgid);
806 	ret = ibv_find_gid_index(context, port_num, &sgid,
807 				 IBV_GID_TYPE_ROCE_V2);
808 	if (ret < 0)
809 		return ret;
810 
811 	map_ipv4_addr_to_ipv6(ip4h->ip_src.s_addr,
812 			      (struct in6_addr *)&ah_attr->grh.dgid);
813 	ah_attr->grh.sgid_index = (uint8_t) ret;
814 	ah_attr->grh.hop_limit = ip4h->ip_ttl;
815 	ah_attr->grh.traffic_class = ip4h->ip_tos;
816 
817 	return 0;
818 }
819 
820 #define IB_NEXT_HDR    0x1b
821 static inline int set_ah_attr_by_ipv6(struct ibv_context *context,
822 				  struct ibv_ah_attr *ah_attr,
823 				  struct ibv_grh *grh, uint8_t port_num)
824 {
825 	uint32_t flow_class;
826 	uint32_t sgid_type;
827 	int ret;
828 
829 	/* No point searching multicast GIDs in GID table */
830 	if (grh->dgid.raw[0] == 0xFF) {
831 		errno = EINVAL;
832 		return -1;
833 	}
834 
835 	ah_attr->grh.dgid = grh->sgid;
836 	if (grh->next_hdr == IPPROTO_UDP) {
837 		sgid_type = IBV_GID_TYPE_ROCE_V2;
838 	} else if (grh->next_hdr == IB_NEXT_HDR) {
839 		sgid_type = IBV_GID_TYPE_IB_ROCE_V1;
840 	} else {
841 		errno = EPROTONOSUPPORT;
842 		return -1;
843 	}
844 
845 	ret = ibv_find_gid_index(context, port_num, &grh->dgid,
846 				 sgid_type);
847 	if (ret < 0)
848 		return ret;
849 
850 	ah_attr->grh.sgid_index = (uint8_t) ret;
851 	flow_class = be32toh(grh->version_tclass_flow);
852 	ah_attr->grh.hop_limit = grh->hop_limit;
853 	ah_attr->grh.traffic_class = (flow_class >> 20) & 0xFF;
854 
855 	return 0;
856 }
857 
858 int ibv_init_ah_from_wc(struct ibv_context *context, uint8_t port_num,
859 			struct ibv_wc *wc, struct ibv_grh *grh,
860 			struct ibv_ah_attr *ah_attr)
861 {
862 	int version;
863 	int ret = 0;
864 
865 	memset(ah_attr, 0, sizeof *ah_attr);
866 	set_ah_attr_generic_fields(ah_attr, wc, grh, port_num);
867 
868 	if (wc->wc_flags & IBV_WC_GRH) {
869 		ah_attr->is_global = 1;
870 		version = get_grh_header_version(grh);
871 
872 		if (version == 4)
873 			ret = set_ah_attr_by_ipv4(context, ah_attr,
874 						  (struct ip *)((void *)grh + 20),
875 						  port_num);
876 		else if (version == 6)
877 			ret = set_ah_attr_by_ipv6(context, ah_attr, grh,
878 						  port_num);
879 		else
880 			ret = -1;
881 	}
882 
883 	return ret;
884 }
885 
886 struct ibv_ah *ibv_create_ah_from_wc(struct ibv_pd *pd, struct ibv_wc *wc,
887 				     struct ibv_grh *grh, uint8_t port_num)
888 {
889 	struct ibv_ah_attr ah_attr;
890 	int ret;
891 
892 	ret = ibv_init_ah_from_wc(pd->context, port_num, wc, grh, &ah_attr);
893 	if (ret)
894 		return NULL;
895 
896 	return ibv_create_ah(pd, &ah_attr);
897 }
898 
899 int __ibv_destroy_ah(struct ibv_ah *ah)
900 {
901 	return ah->context->ops.destroy_ah(ah);
902 }
903 default_symver(__ibv_destroy_ah, ibv_destroy_ah);
904 
905 int __ibv_attach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid)
906 {
907 	return qp->context->ops.attach_mcast(qp, gid, lid);
908 }
909 default_symver(__ibv_attach_mcast, ibv_attach_mcast);
910 
911 int __ibv_detach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid)
912 {
913 	return qp->context->ops.detach_mcast(qp, gid, lid);
914 }
915 default_symver(__ibv_detach_mcast, ibv_detach_mcast);
916 
917 static inline int ipv6_addr_v4mapped(const struct in6_addr *a)
918 {
919 	return IN6_IS_ADDR_V4MAPPED(a) ||
920 		/* IPv4 encoded multicast addresses */
921 		(a->s6_addr32[0]  == htobe32(0xff0e0000) &&
922 		((a->s6_addr32[1] |
923 		 (a->s6_addr32[2] ^ htobe32(0x0000ffff))) == 0UL));
924 }
925 
/* A network address located inside a raw GID buffer. */
struct peer_address {
	void *address;	/* first byte of the address within the GID */
	uint32_t size;	/* address length in bytes */
};

/*
 * Point *peer_address at the address embedded in `raw_gid` for the given
 * address family: the last 4 bytes of the GID (offset 12) for AF_INET, or
 * all 16 bytes for AF_INET6.  No data is copied.
 *
 * Returns 0 on success, -1 for any other family (in which case
 * *peer_address is left untouched).
 */
static inline int create_peer_from_gid(int family, void *raw_gid,
				       struct peer_address *peer_address)
{
	if (family == AF_INET) {
		peer_address->address = (char *)raw_gid + 12;
		peer_address->size = 4;
		return 0;
	}

	if (family == AF_INET6) {
		peer_address->address = raw_gid;
		peer_address->size = 16;
		return 0;
	}

	return -1;
}
949 
950 #define NEIGH_GET_DEFAULT_TIMEOUT_MS 3000
951 int ibv_resolve_eth_l2_from_gid(struct ibv_context *context,
952 				struct ibv_ah_attr *attr,
953 				uint8_t eth_mac[ETHERNET_LL_SIZE],
954 				uint16_t *vid)
955 {
956 #ifndef NRESOLVE_NEIGH
957 	int dst_family;
958 	int src_family;
959 	int oif;
960 	struct get_neigh_handler neigh_handler;
961 	union ibv_gid sgid;
962 	int ether_len;
963 	struct peer_address src;
964 	struct peer_address dst;
965 	uint16_t ret_vid;
966 	int ret = -EINVAL;
967 	int err;
968 
969 	err = ibv_query_gid(context, attr->port_num,
970 			    attr->grh.sgid_index, &sgid);
971 
972 	if (err)
973 		return err;
974 
975 	err = neigh_init_resources(&neigh_handler,
976 				   NEIGH_GET_DEFAULT_TIMEOUT_MS);
977 
978 	if (err)
979 		return err;
980 
981 	dst_family = ipv6_addr_v4mapped((struct in6_addr *)attr->grh.dgid.raw) ?
982 			AF_INET : AF_INET6;
983 	src_family = ipv6_addr_v4mapped((struct in6_addr *)sgid.raw) ?
984 			AF_INET : AF_INET6;
985 
986 	if (create_peer_from_gid(dst_family, attr->grh.dgid.raw, &dst))
987 		goto free_resources;
988 
989 	if (create_peer_from_gid(src_family, &sgid.raw, &src))
990 		goto free_resources;
991 
992 	if (neigh_set_dst(&neigh_handler, dst_family, dst.address,
993 			  dst.size))
994 		goto free_resources;
995 
996 	if (neigh_set_src(&neigh_handler, src_family, src.address,
997 			  src.size))
998 		goto free_resources;
999 
1000 	oif = neigh_get_oif_from_src(&neigh_handler);
1001 
1002 	if (oif > 0)
1003 		neigh_set_oif(&neigh_handler, oif);
1004 	else
1005 		goto free_resources;
1006 
1007 	ret = -EHOSTUNREACH;
1008 
1009 	/* blocking call */
1010 	if (process_get_neigh(&neigh_handler))
1011 		goto free_resources;
1012 
1013 	ret_vid = neigh_get_vlan_id_from_dev(&neigh_handler);
1014 
1015 	if (ret_vid <= 0xfff)
1016 		neigh_set_vlan_id(&neigh_handler, ret_vid);
1017 
1018 	/* We are using only Ethernet here */
1019 	ether_len = neigh_get_ll(&neigh_handler,
1020 				 eth_mac,
1021 				 sizeof(uint8_t) * ETHERNET_LL_SIZE);
1022 
1023 	if (ether_len <= 0)
1024 		goto free_resources;
1025 
1026 	*vid = ret_vid;
1027 
1028 	ret = 0;
1029 
1030 free_resources:
1031 	neigh_free_resources(&neigh_handler);
1032 
1033 	return ret;
1034 #else
1035 	return -ENOSYS;
1036 #endif
1037 }
1038