1 /* 2 * Copyright (c) 2005 Topspin Communications. All rights reserved. 3 * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. 4 * 5 * This software is available to you under a choice of one of two 6 * licenses. You may choose to be licensed under the terms of the GNU 7 * General Public License (GPL) Version 2, available from the file 8 * COPYING in the main directory of this source tree, or the 9 * OpenIB.org BSD license below: 10 * 11 * Redistribution and use in source and binary forms, with or 12 * without modification, are permitted provided that the following 13 * conditions are met: 14 * 15 * - Redistributions of source code must retain the above 16 * copyright notice, this list of conditions and the following 17 * disclaimer. 18 * 19 * - Redistributions in binary form must reproduce the above 20 * copyright notice, this list of conditions and the following 21 * disclaimer in the documentation and/or other materials 22 * provided with the distribution. 23 * 24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 31 * SOFTWARE. 32 */ 33 34 #define _GNU_SOURCE 35 #include <config.h> 36 37 #include <infiniband/endian.h> 38 #include <stdio.h> 39 #include <unistd.h> 40 #include <stdlib.h> 41 #include <errno.h> 42 #include <string.h> 43 #include <dirent.h> 44 #include <netinet/in.h> 45 #include <netinet/ip.h> 46 #include <sys/socket.h> 47 48 #include "ibverbs.h" 49 #ifndef NRESOLVE_NEIGH 50 #include <net/if.h> 51 #include <net/if_arp.h> 52 #include "neigh.h" 53 #endif 54 55 /* Hack to avoid GCC's -Wmissing-prototypes and the similar error from sparse 56 with these prototypes. Symbol versionining requires the goofy names, the 57 prototype must match the version in verbs.h. 58 */ 59 int __ibv_query_device(struct ibv_context *context, 60 struct ibv_device_attr *device_attr); 61 int __ibv_query_port(struct ibv_context *context, uint8_t port_num, 62 struct ibv_port_attr *port_attr); 63 int __ibv_query_gid(struct ibv_context *context, uint8_t port_num, int index, 64 union ibv_gid *gid); 65 int __ibv_query_pkey(struct ibv_context *context, uint8_t port_num, int index, 66 __be16 *pkey); 67 struct ibv_pd *__ibv_alloc_pd(struct ibv_context *context); 68 int __ibv_dealloc_pd(struct ibv_pd *pd); 69 struct ibv_mr *__ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, 70 int access); 71 int __ibv_rereg_mr(struct ibv_mr *mr, int flags, struct ibv_pd *pd, void *addr, 72 size_t length, int access); 73 int __ibv_dereg_mr(struct ibv_mr *mr); 74 struct ibv_cq *__ibv_create_cq(struct ibv_context *context, int cqe, 75 void *cq_context, 76 struct ibv_comp_channel *channel, 77 int comp_vector); 78 int __ibv_resize_cq(struct ibv_cq *cq, int cqe); 79 int __ibv_destroy_cq(struct ibv_cq *cq); 80 int __ibv_get_cq_event(struct ibv_comp_channel *channel, struct ibv_cq **cq, 81 void **cq_context); 82 void __ibv_ack_cq_events(struct ibv_cq *cq, unsigned int nevents); 83 struct ibv_srq *__ibv_create_srq(struct ibv_pd *pd, 84 struct ibv_srq_init_attr *srq_init_attr); 85 int __ibv_modify_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr, 86 int srq_attr_mask); 87 int __ibv_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr); 88 int __ibv_destroy_srq(struct ibv_srq *srq); 89 struct ibv_qp *__ibv_create_qp(struct ibv_pd *pd, 90 struct ibv_qp_init_attr *qp_init_attr); 91 int __ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, 92 struct ibv_qp_init_attr *init_attr); 93 int __ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask); 94 int __ibv_destroy_qp(struct ibv_qp *qp); 95 struct ibv_ah *__ibv_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr); 96 int __ibv_destroy_ah(struct ibv_ah *ah); 97 int __ibv_attach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, 98 uint16_t lid); 99 int __ibv_detach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, 100 uint16_t lid); 101 102 int __attribute__((const)) ibv_rate_to_mult(enum ibv_rate rate) 103 { 104 switch (rate) { 105 case IBV_RATE_2_5_GBPS: return 1; 106 case IBV_RATE_5_GBPS: return 2; 107 case IBV_RATE_10_GBPS: return 4; 108 case IBV_RATE_20_GBPS: return 8; 109 case IBV_RATE_30_GBPS: return 12; 110 case IBV_RATE_40_GBPS: return 16; 111 case IBV_RATE_60_GBPS: return 24; 112 case IBV_RATE_80_GBPS: return 32; 113 case IBV_RATE_120_GBPS: return 48; 114 default: return -1; 115 } 116 } 117 118 enum ibv_rate __attribute__((const)) mult_to_ibv_rate(int mult) 119 { 120 switch (mult) { 121 case 1: return IBV_RATE_2_5_GBPS; 122 case 2: return IBV_RATE_5_GBPS; 123 case 4: return IBV_RATE_10_GBPS; 124 case 8: return IBV_RATE_20_GBPS; 125 case 12: return IBV_RATE_30_GBPS; 126 case 16: return IBV_RATE_40_GBPS; 127 case 24: return IBV_RATE_60_GBPS; 128 case 32: return IBV_RATE_80_GBPS; 129 case 48: return IBV_RATE_120_GBPS; 130 default: return IBV_RATE_MAX; 131 } 132 } 133 134 int __attribute__((const)) ibv_rate_to_mbps(enum ibv_rate rate) 135 { 136 switch (rate) { 137 case IBV_RATE_2_5_GBPS: return 2500; 138 case IBV_RATE_5_GBPS: return 5000; 139 case IBV_RATE_10_GBPS: return 10000; 140 case IBV_RATE_20_GBPS: return 20000; 141 case IBV_RATE_30_GBPS: return 30000; 142 case IBV_RATE_40_GBPS: return 40000; 143 case IBV_RATE_60_GBPS: return 60000; 144 case IBV_RATE_80_GBPS: return 80000; 145 case IBV_RATE_120_GBPS: return 120000; 146 case IBV_RATE_14_GBPS: return 14062; 147 case IBV_RATE_56_GBPS: return 56250; 148 case IBV_RATE_112_GBPS: return 112500; 149 case IBV_RATE_168_GBPS: return 168750; 150 case IBV_RATE_25_GBPS: return 25781; 151 case IBV_RATE_100_GBPS: return 103125; 152 case IBV_RATE_200_GBPS: return 206250; 153 case IBV_RATE_300_GBPS: return 309375; 154 default: return -1; 155 } 156 } 157 158 enum ibv_rate __attribute__((const)) mbps_to_ibv_rate(int mbps) 159 { 160 switch (mbps) { 161 case 2500: return IBV_RATE_2_5_GBPS; 162 case 5000: return IBV_RATE_5_GBPS; 163 case 10000: return IBV_RATE_10_GBPS; 164 case 20000: return IBV_RATE_20_GBPS; 165 case 30000: return IBV_RATE_30_GBPS; 166 case 40000: return IBV_RATE_40_GBPS; 167 case 60000: return IBV_RATE_60_GBPS; 168 case 80000: return IBV_RATE_80_GBPS; 169 case 120000: return IBV_RATE_120_GBPS; 170 case 14062: return IBV_RATE_14_GBPS; 171 case 56250: return IBV_RATE_56_GBPS; 172 case 112500: return IBV_RATE_112_GBPS; 173 case 168750: return IBV_RATE_168_GBPS; 174 case 25781: return IBV_RATE_25_GBPS; 175 case 103125: return IBV_RATE_100_GBPS; 176 case 206250: return IBV_RATE_200_GBPS; 177 case 309375: return IBV_RATE_300_GBPS; 178 default: return IBV_RATE_MAX; 179 } 180 } 181 182 int __ibv_query_device(struct ibv_context *context, 183 struct ibv_device_attr *device_attr) 184 { 185 return context->ops.query_device(context, device_attr); 186 } 187 default_symver(__ibv_query_device, ibv_query_device); 188 189 int __ibv_query_port(struct ibv_context *context, uint8_t port_num, 190 struct ibv_port_attr *port_attr) 191 { 192 return context->ops.query_port(context, port_num, port_attr); 193 } 194 default_symver(__ibv_query_port, ibv_query_port); 195 196 int __ibv_query_gid(struct ibv_context *context, uint8_t port_num, 197 int index, union ibv_gid *gid) 198 { 199 char name[24]; 200 char attr[41]; 201 uint16_t val; 202 int i; 203 204 snprintf(name, sizeof name, "ports/%d/gids/%d", port_num, index); 205 206 if (ibv_read_sysfs_file(context->device->ibdev_path, name, 207 attr, sizeof attr) < 0) 208 return -1; 209 210 for (i = 0; i < 8; ++i) { 211 if (sscanf(attr + i * 5, "%hx", &val) != 1) 212 return -1; 213 gid->raw[i * 2 ] = val >> 8; 214 gid->raw[i * 2 + 1] = val & 0xff; 215 } 216 217 return 0; 218 } 219 default_symver(__ibv_query_gid, ibv_query_gid); 220 221 int __ibv_query_pkey(struct ibv_context *context, uint8_t port_num, 222 int index, __be16 *pkey) 223 { 224 char name[24]; 225 char attr[8]; 226 uint16_t val; 227 228 snprintf(name, sizeof name, "ports/%d/pkeys/%d", port_num, index); 229 230 if (ibv_read_sysfs_file(context->device->ibdev_path, name, 231 attr, sizeof attr) < 0) 232 return -1; 233 234 if (sscanf(attr, "%hx", &val) != 1) 235 return -1; 236 237 *pkey = htobe16(val); 238 return 0; 239 } 240 default_symver(__ibv_query_pkey, ibv_query_pkey); 241 242 struct ibv_pd *__ibv_alloc_pd(struct ibv_context *context) 243 { 244 struct ibv_pd *pd; 245 246 pd = context->ops.alloc_pd(context); 247 if (pd) 248 pd->context = context; 249 250 return pd; 251 } 252 default_symver(__ibv_alloc_pd, ibv_alloc_pd); 253 254 int __ibv_dealloc_pd(struct ibv_pd *pd) 255 { 256 return pd->context->ops.dealloc_pd(pd); 257 } 258 default_symver(__ibv_dealloc_pd, ibv_dealloc_pd); 259 260 struct ibv_mr *__ibv_reg_mr(struct ibv_pd *pd, void *addr, 261 size_t length, int access) 262 { 263 struct ibv_mr *mr; 264 265 if (ibv_dontfork_range(addr, length)) 266 return NULL; 267 268 mr = pd->context->ops.reg_mr(pd, addr, length, access); 269 if (mr) { 270 mr->context = pd->context; 271 mr->pd = pd; 272 mr->addr = addr; 273 mr->length = length; 274 } else 275 ibv_dofork_range(addr, length); 276 277 return mr; 278 } 279 default_symver(__ibv_reg_mr, ibv_reg_mr); 280 281 int __ibv_rereg_mr(struct ibv_mr *mr, int flags, 282 struct ibv_pd *pd, void *addr, 283 size_t length, int access) 284 { 285 int dofork_onfail = 0; 286 int err; 287 void *old_addr; 288 size_t old_len; 289 290 if (flags & ~IBV_REREG_MR_FLAGS_SUPPORTED) { 291 errno = EINVAL; 292 return IBV_REREG_MR_ERR_INPUT; 293 } 294 295 if ((flags & IBV_REREG_MR_CHANGE_TRANSLATION) && 296 (!length || !addr)) { 297 errno = EINVAL; 298 return IBV_REREG_MR_ERR_INPUT; 299 } 300 301 if (access && !(flags & IBV_REREG_MR_CHANGE_ACCESS)) { 302 errno = EINVAL; 303 return IBV_REREG_MR_ERR_INPUT; 304 } 305 306 if (!mr->context->ops.rereg_mr) { 307 errno = ENOSYS; 308 return IBV_REREG_MR_ERR_INPUT; 309 } 310 311 if (flags & IBV_REREG_MR_CHANGE_TRANSLATION) { 312 err = ibv_dontfork_range(addr, length); 313 if (err) 314 return IBV_REREG_MR_ERR_DONT_FORK_NEW; 315 dofork_onfail = 1; 316 } 317 318 old_addr = mr->addr; 319 old_len = mr->length; 320 err = mr->context->ops.rereg_mr(mr, flags, pd, addr, length, access); 321 if (!err) { 322 if (flags & IBV_REREG_MR_CHANGE_PD) 323 mr->pd = pd; 324 if (flags & IBV_REREG_MR_CHANGE_TRANSLATION) { 325 mr->addr = addr; 326 mr->length = length; 327 err = ibv_dofork_range(old_addr, old_len); 328 if (err) 329 return IBV_REREG_MR_ERR_DO_FORK_OLD; 330 } 331 } else { 332 err = IBV_REREG_MR_ERR_CMD; 333 if (dofork_onfail) { 334 if (ibv_dofork_range(addr, length)) 335 err = IBV_REREG_MR_ERR_CMD_AND_DO_FORK_NEW; 336 } 337 } 338 339 return err; 340 } 341 default_symver(__ibv_rereg_mr, ibv_rereg_mr); 342 343 int __ibv_dereg_mr(struct ibv_mr *mr) 344 { 345 int ret; 346 void *addr = mr->addr; 347 size_t length = mr->length; 348 349 ret = mr->context->ops.dereg_mr(mr); 350 if (!ret) 351 ibv_dofork_range(addr, length); 352 353 return ret; 354 } 355 default_symver(__ibv_dereg_mr, ibv_dereg_mr); 356 357 static struct ibv_comp_channel *ibv_create_comp_channel_v2(struct ibv_context *context) 358 { 359 struct ibv_abi_compat_v2 *t = context->abi_compat; 360 static int warned; 361 362 if (!pthread_mutex_trylock(&t->in_use)) 363 return &t->channel; 364 365 if (!warned) { 366 fprintf(stderr, PFX "Warning: kernel's ABI version %d limits capacity.\n" 367 " Only one completion channel can be created per context.\n", 368 abi_ver); 369 ++warned; 370 } 371 372 return NULL; 373 } 374 375 struct ibv_comp_channel *ibv_create_comp_channel(struct ibv_context *context) 376 { 377 struct ibv_comp_channel *channel; 378 struct ibv_create_comp_channel cmd; 379 struct ibv_create_comp_channel_resp resp; 380 381 if (abi_ver <= 2) 382 return ibv_create_comp_channel_v2(context); 383 384 channel = malloc(sizeof *channel); 385 if (!channel) 386 return NULL; 387 388 IBV_INIT_CMD_RESP(&cmd, sizeof cmd, CREATE_COMP_CHANNEL, &resp, sizeof resp); 389 if (write(context->cmd_fd, &cmd, sizeof cmd) != sizeof cmd) { 390 free(channel); 391 return NULL; 392 } 393 394 (void) VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); 395 396 channel->context = context; 397 channel->fd = resp.fd; 398 channel->refcnt = 0; 399 400 return channel; 401 } 402 403 static int ibv_destroy_comp_channel_v2(struct ibv_comp_channel *channel) 404 { 405 struct ibv_abi_compat_v2 *t = (struct ibv_abi_compat_v2 *) channel; 406 pthread_mutex_unlock(&t->in_use); 407 return 0; 408 } 409 410 int ibv_destroy_comp_channel(struct ibv_comp_channel *channel) 411 { 412 struct ibv_context *context; 413 int ret; 414 415 context = channel->context; 416 pthread_mutex_lock(&context->mutex); 417 418 if (channel->refcnt) { 419 ret = EBUSY; 420 goto out; 421 } 422 423 if (abi_ver <= 2) { 424 ret = ibv_destroy_comp_channel_v2(channel); 425 goto out; 426 } 427 428 close(channel->fd); 429 free(channel); 430 ret = 0; 431 432 out: 433 pthread_mutex_unlock(&context->mutex); 434 435 return ret; 436 } 437 438 struct ibv_cq *__ibv_create_cq(struct ibv_context *context, int cqe, void *cq_context, 439 struct ibv_comp_channel *channel, int comp_vector) 440 { 441 struct ibv_cq *cq; 442 443 cq = context->ops.create_cq(context, cqe, channel, comp_vector); 444 445 if (cq) 446 verbs_init_cq(cq, context, channel, cq_context); 447 448 return cq; 449 } 450 default_symver(__ibv_create_cq, ibv_create_cq); 451 452 int __ibv_resize_cq(struct ibv_cq *cq, int cqe) 453 { 454 if (!cq->context->ops.resize_cq) 455 return ENOSYS; 456 457 return cq->context->ops.resize_cq(cq, cqe); 458 } 459 default_symver(__ibv_resize_cq, ibv_resize_cq); 460 461 int __ibv_destroy_cq(struct ibv_cq *cq) 462 { 463 struct ibv_comp_channel *channel = cq->channel; 464 int ret; 465 466 ret = cq->context->ops.destroy_cq(cq); 467 468 if (channel) { 469 if (!ret) { 470 pthread_mutex_lock(&channel->context->mutex); 471 --channel->refcnt; 472 pthread_mutex_unlock(&channel->context->mutex); 473 } 474 } 475 476 return ret; 477 } 478 default_symver(__ibv_destroy_cq, ibv_destroy_cq); 479 480 int __ibv_get_cq_event(struct ibv_comp_channel *channel, 481 struct ibv_cq **cq, void **cq_context) 482 { 483 struct ibv_comp_event ev; 484 485 if (read(channel->fd, &ev, sizeof ev) != sizeof ev) 486 return -1; 487 488 *cq = (struct ibv_cq *) (uintptr_t) ev.cq_handle; 489 *cq_context = (*cq)->cq_context; 490 491 if ((*cq)->context->ops.cq_event) 492 (*cq)->context->ops.cq_event(*cq); 493 494 return 0; 495 } 496 default_symver(__ibv_get_cq_event, ibv_get_cq_event); 497 498 void __ibv_ack_cq_events(struct ibv_cq *cq, unsigned int nevents) 499 { 500 pthread_mutex_lock(&cq->mutex); 501 cq->comp_events_completed += nevents; 502 pthread_cond_signal(&cq->cond); 503 pthread_mutex_unlock(&cq->mutex); 504 } 505 default_symver(__ibv_ack_cq_events, ibv_ack_cq_events); 506 507 struct ibv_srq *__ibv_create_srq(struct ibv_pd *pd, 508 struct ibv_srq_init_attr *srq_init_attr) 509 { 510 struct ibv_srq *srq; 511 512 if (!pd->context->ops.create_srq) 513 return NULL; 514 515 srq = pd->context->ops.create_srq(pd, srq_init_attr); 516 if (srq) { 517 srq->context = pd->context; 518 srq->srq_context = srq_init_attr->srq_context; 519 srq->pd = pd; 520 srq->events_completed = 0; 521 pthread_mutex_init(&srq->mutex, NULL); 522 pthread_cond_init(&srq->cond, NULL); 523 } 524 525 return srq; 526 } 527 default_symver(__ibv_create_srq, ibv_create_srq); 528 529 int __ibv_modify_srq(struct ibv_srq *srq, 530 struct ibv_srq_attr *srq_attr, 531 int srq_attr_mask) 532 { 533 return srq->context->ops.modify_srq(srq, srq_attr, srq_attr_mask); 534 } 535 default_symver(__ibv_modify_srq, ibv_modify_srq); 536 537 int __ibv_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr) 538 { 539 return srq->context->ops.query_srq(srq, srq_attr); 540 } 541 default_symver(__ibv_query_srq, ibv_query_srq); 542 543 int __ibv_destroy_srq(struct ibv_srq *srq) 544 { 545 return srq->context->ops.destroy_srq(srq); 546 } 547 default_symver(__ibv_destroy_srq, ibv_destroy_srq); 548 549 struct ibv_qp *__ibv_create_qp(struct ibv_pd *pd, 550 struct ibv_qp_init_attr *qp_init_attr) 551 { 552 struct ibv_qp *qp = pd->context->ops.create_qp(pd, qp_init_attr); 553 554 if (qp) { 555 qp->context = pd->context; 556 qp->qp_context = qp_init_attr->qp_context; 557 qp->pd = pd; 558 qp->send_cq = qp_init_attr->send_cq; 559 qp->recv_cq = qp_init_attr->recv_cq; 560 qp->srq = qp_init_attr->srq; 561 qp->qp_type = qp_init_attr->qp_type; 562 qp->state = IBV_QPS_RESET; 563 qp->events_completed = 0; 564 pthread_mutex_init(&qp->mutex, NULL); 565 pthread_cond_init(&qp->cond, NULL); 566 } 567 568 return qp; 569 } 570 default_symver(__ibv_create_qp, ibv_create_qp); 571 572 int __ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, 573 int attr_mask, 574 struct ibv_qp_init_attr *init_attr) 575 { 576 int ret; 577 578 ret = qp->context->ops.query_qp(qp, attr, attr_mask, init_attr); 579 if (ret) 580 return ret; 581 582 if (attr_mask & IBV_QP_STATE) 583 qp->state = attr->qp_state; 584 585 return 0; 586 } 587 default_symver(__ibv_query_qp, ibv_query_qp); 588 589 int __ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, 590 int attr_mask) 591 { 592 int ret; 593 594 ret = qp->context->ops.modify_qp(qp, attr, attr_mask); 595 if (ret) 596 return ret; 597 598 if (attr_mask & IBV_QP_STATE) 599 qp->state = attr->qp_state; 600 601 return 0; 602 } 603 default_symver(__ibv_modify_qp, ibv_modify_qp); 604 605 int __ibv_destroy_qp(struct ibv_qp *qp) 606 { 607 return qp->context->ops.destroy_qp(qp); 608 } 609 default_symver(__ibv_destroy_qp, ibv_destroy_qp); 610 611 struct ibv_ah *__ibv_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr) 612 { 613 struct ibv_ah *ah = pd->context->ops.create_ah(pd, attr); 614 615 if (ah) { 616 ah->context = pd->context; 617 ah->pd = pd; 618 } 619 620 return ah; 621 } 622 default_symver(__ibv_create_ah, ibv_create_ah); 623 624 /* GID types as appear in sysfs, no change is expected as of ABI 625 * compatibility. 626 */ 627 #define V1_TYPE "IB/RoCE v1" 628 #define V2_TYPE "RoCE v2" 629 int ibv_query_gid_type(struct ibv_context *context, uint8_t port_num, 630 unsigned int index, enum ibv_gid_type *type) 631 { 632 char name[32]; 633 char buff[11]; 634 635 snprintf(name, sizeof(name), "ports/%d/gid_attrs/types/%d", port_num, 636 index); 637 638 /* Reset errno so that we can rely on its value upon any error flow in 639 * ibv_read_sysfs_file. 640 */ 641 errno = 0; 642 if (ibv_read_sysfs_file(context->device->ibdev_path, name, buff, 643 sizeof(buff)) <= 0) { 644 char *dir_path; 645 DIR *dir; 646 647 if (errno == EINVAL) { 648 /* In IB, this file doesn't exist and the kernel sets 649 * errno to -EINVAL. 650 */ 651 *type = IBV_GID_TYPE_IB_ROCE_V1; 652 return 0; 653 } 654 if (asprintf(&dir_path, "%s/%s/%d/%s/", 655 context->device->ibdev_path, "ports", port_num, 656 "gid_attrs") < 0) 657 return -1; 658 dir = opendir(dir_path); 659 free(dir_path); 660 if (!dir) { 661 if (errno == ENOENT) 662 /* Assuming that if gid_attrs doesn't exist, 663 * we have an old kernel and all GIDs are 664 * IB/RoCE v1 665 */ 666 *type = IBV_GID_TYPE_IB_ROCE_V1; 667 else 668 return -1; 669 } else { 670 closedir(dir); 671 errno = EFAULT; 672 return -1; 673 } 674 } else { 675 if (!strcmp(buff, V1_TYPE)) { 676 *type = IBV_GID_TYPE_IB_ROCE_V1; 677 } else if (!strcmp(buff, V2_TYPE)) { 678 *type = IBV_GID_TYPE_ROCE_V2; 679 } else { 680 errno = ENOTSUP; 681 return -1; 682 } 683 } 684 685 return 0; 686 } 687 688 static int ibv_find_gid_index(struct ibv_context *context, uint8_t port_num, 689 union ibv_gid *gid, enum ibv_gid_type gid_type) 690 { 691 enum ibv_gid_type sgid_type = 0; 692 union ibv_gid sgid; 693 int i = 0, ret; 694 695 do { 696 ret = ibv_query_gid(context, port_num, i, &sgid); 697 if (!ret) { 698 ret = ibv_query_gid_type(context, port_num, i, 699 &sgid_type); 700 } 701 i++; 702 } while (!ret && (memcmp(&sgid, gid, sizeof(*gid)) || 703 (gid_type != sgid_type))); 704 705 return ret ? ret : i - 1; 706 } 707 708 static inline void map_ipv4_addr_to_ipv6(__be32 ipv4, struct in6_addr *ipv6) 709 { 710 ipv6->s6_addr32[0] = 0; 711 ipv6->s6_addr32[1] = 0; 712 ipv6->s6_addr32[2] = htobe32(0x0000FFFF); 713 ipv6->s6_addr32[3] = ipv4; 714 } 715 716 static inline __sum16 ipv4_calc_hdr_csum(uint16_t *data, unsigned int num_hwords) 717 { 718 unsigned int i = 0; 719 uint32_t sum = 0; 720 721 for (i = 0; i < num_hwords; i++) 722 sum += *(data++); 723 724 sum = (sum & 0xffff) + (sum >> 16); 725 726 return (__sum16)~sum; 727 } 728 729 static inline int get_grh_header_version(struct ibv_grh *grh) 730 { 731 int ip6h_version = (be32toh(grh->version_tclass_flow) >> 28) & 0xf; 732 struct ip *ip4h = (struct ip *)((void *)grh + 20); 733 struct ip ip4h_checked; 734 735 if (ip6h_version != 6) { 736 if (ip4h->ip_v == 4) 737 return 4; 738 errno = EPROTONOSUPPORT; 739 return -1; 740 } 741 /* version may be 6 or 4 */ 742 if (ip4h->ip_hl != 5) /* IPv4 header length must be 5 for RoCE v2. */ 743 return 6; 744 /* 745 * Verify checksum. 746 * We can't write on scattered buffers so we have to copy to temp 747 * buffer. 748 */ 749 memcpy(&ip4h_checked, ip4h, sizeof(ip4h_checked)); 750 /* Need to set the checksum field (check) to 0 before re-calculating 751 * the checksum. 752 */ 753 ip4h_checked.ip_sum = 0; 754 ip4h_checked.ip_sum = ipv4_calc_hdr_csum((uint16_t *)&ip4h_checked, 10); 755 /* if IPv4 header checksum is OK, believe it */ 756 if (ip4h->ip_sum == ip4h_checked.ip_sum) 757 return 4; 758 return 6; 759 } 760 761 static inline void set_ah_attr_generic_fields(struct ibv_ah_attr *ah_attr, 762 struct ibv_wc *wc, 763 struct ibv_grh *grh, 764 uint8_t port_num) 765 { 766 uint32_t flow_class; 767 768 flow_class = be32toh(grh->version_tclass_flow); 769 ah_attr->grh.flow_label = flow_class & 0xFFFFF; 770 ah_attr->dlid = wc->slid; 771 ah_attr->sl = wc->sl; 772 ah_attr->src_path_bits = wc->dlid_path_bits; 773 ah_attr->port_num = port_num; 774 } 775 776 static inline int set_ah_attr_by_ipv4(struct ibv_context *context, 777 struct ibv_ah_attr *ah_attr, 778 struct ip *ip4h, uint8_t port_num) 779 { 780 union ibv_gid sgid; 781 int ret; 782 783 /* No point searching multicast GIDs in GID table */ 784 if (IN_CLASSD(be32toh(ip4h->ip_dst.s_addr))) { 785 errno = EINVAL; 786 return -1; 787 } 788 789 map_ipv4_addr_to_ipv6(ip4h->ip_dst.s_addr, (struct in6_addr *)&sgid); 790 ret = ibv_find_gid_index(context, port_num, &sgid, 791 IBV_GID_TYPE_ROCE_V2); 792 if (ret < 0) 793 return ret; 794 795 map_ipv4_addr_to_ipv6(ip4h->ip_src.s_addr, 796 (struct in6_addr *)&ah_attr->grh.dgid); 797 ah_attr->grh.sgid_index = (uint8_t) ret; 798 ah_attr->grh.hop_limit = ip4h->ip_ttl; 799 ah_attr->grh.traffic_class = ip4h->ip_tos; 800 801 return 0; 802 } 803 804 #define IB_NEXT_HDR 0x1b 805 static inline int set_ah_attr_by_ipv6(struct ibv_context *context, 806 struct ibv_ah_attr *ah_attr, 807 struct ibv_grh *grh, uint8_t port_num) 808 { 809 uint32_t flow_class; 810 uint32_t sgid_type; 811 int ret; 812 813 /* No point searching multicast GIDs in GID table */ 814 if (grh->dgid.raw[0] == 0xFF) { 815 errno = EINVAL; 816 return -1; 817 } 818 819 ah_attr->grh.dgid = grh->sgid; 820 if (grh->next_hdr == IPPROTO_UDP) { 821 sgid_type = IBV_GID_TYPE_ROCE_V2; 822 } else if (grh->next_hdr == IB_NEXT_HDR) { 823 sgid_type = IBV_GID_TYPE_IB_ROCE_V1; 824 } else { 825 errno = EPROTONOSUPPORT; 826 return -1; 827 } 828 829 ret = ibv_find_gid_index(context, port_num, &grh->dgid, 830 sgid_type); 831 if (ret < 0) 832 return ret; 833 834 ah_attr->grh.sgid_index = (uint8_t) ret; 835 flow_class = be32toh(grh->version_tclass_flow); 836 ah_attr->grh.hop_limit = grh->hop_limit; 837 ah_attr->grh.traffic_class = (flow_class >> 20) & 0xFF; 838 839 return 0; 840 } 841 842 int ibv_init_ah_from_wc(struct ibv_context *context, uint8_t port_num, 843 struct ibv_wc *wc, struct ibv_grh *grh, 844 struct ibv_ah_attr *ah_attr) 845 { 846 int version; 847 int ret = 0; 848 849 memset(ah_attr, 0, sizeof *ah_attr); 850 set_ah_attr_generic_fields(ah_attr, wc, grh, port_num); 851 852 if (wc->wc_flags & IBV_WC_GRH) { 853 ah_attr->is_global = 1; 854 version = get_grh_header_version(grh); 855 856 if (version == 4) 857 ret = set_ah_attr_by_ipv4(context, ah_attr, 858 (struct ip *)((void *)grh + 20), 859 port_num); 860 else if (version == 6) 861 ret = set_ah_attr_by_ipv6(context, ah_attr, grh, 862 port_num); 863 else 864 ret = -1; 865 } 866 867 return ret; 868 } 869 870 struct ibv_ah *ibv_create_ah_from_wc(struct ibv_pd *pd, struct ibv_wc *wc, 871 struct ibv_grh *grh, uint8_t port_num) 872 { 873 struct ibv_ah_attr ah_attr; 874 int ret; 875 876 ret = ibv_init_ah_from_wc(pd->context, port_num, wc, grh, &ah_attr); 877 if (ret) 878 return NULL; 879 880 return ibv_create_ah(pd, &ah_attr); 881 } 882 883 int __ibv_destroy_ah(struct ibv_ah *ah) 884 { 885 return ah->context->ops.destroy_ah(ah); 886 } 887 default_symver(__ibv_destroy_ah, ibv_destroy_ah); 888 889 int __ibv_attach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid) 890 { 891 return qp->context->ops.attach_mcast(qp, gid, lid); 892 } 893 default_symver(__ibv_attach_mcast, ibv_attach_mcast); 894 895 int __ibv_detach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid) 896 { 897 return qp->context->ops.detach_mcast(qp, gid, lid); 898 } 899 default_symver(__ibv_detach_mcast, ibv_detach_mcast); 900 901 static inline int ipv6_addr_v4mapped(const struct in6_addr *a) 902 { 903 return IN6_IS_ADDR_V4MAPPED(a) || 904 /* IPv4 encoded multicast addresses */ 905 (a->s6_addr32[0] == htobe32(0xff0e0000) && 906 ((a->s6_addr32[1] | 907 (a->s6_addr32[2] ^ htobe32(0x0000ffff))) == 0UL)); 908 } 909 910 struct peer_address { 911 void *address; 912 uint32_t size; 913 }; 914 915 static inline int create_peer_from_gid(int family, void *raw_gid, 916 struct peer_address *peer_address) 917 { 918 switch (family) { 919 case AF_INET: 920 peer_address->address = raw_gid + 12; 921 peer_address->size = 4; 922 break; 923 case AF_INET6: 924 peer_address->address = raw_gid; 925 peer_address->size = 16; 926 break; 927 default: 928 return -1; 929 } 930 931 return 0; 932 } 933 934 #define NEIGH_GET_DEFAULT_TIMEOUT_MS 3000 935 int ibv_resolve_eth_l2_from_gid(struct ibv_context *context, 936 struct ibv_ah_attr *attr, 937 uint8_t eth_mac[ETHERNET_LL_SIZE], 938 uint16_t *vid) 939 { 940 #ifndef NRESOLVE_NEIGH 941 int dst_family; 942 int src_family; 943 int oif; 944 struct get_neigh_handler neigh_handler; 945 union ibv_gid sgid; 946 int ether_len; 947 struct peer_address src; 948 struct peer_address dst; 949 uint16_t ret_vid; 950 int ret = -EINVAL; 951 int err; 952 953 err = ibv_query_gid(context, attr->port_num, 954 attr->grh.sgid_index, &sgid); 955 956 if (err) 957 return err; 958 959 err = neigh_init_resources(&neigh_handler, 960 NEIGH_GET_DEFAULT_TIMEOUT_MS); 961 962 if (err) 963 return err; 964 965 dst_family = ipv6_addr_v4mapped((struct in6_addr *)attr->grh.dgid.raw) ? 966 AF_INET : AF_INET6; 967 src_family = ipv6_addr_v4mapped((struct in6_addr *)sgid.raw) ? 968 AF_INET : AF_INET6; 969 970 if (create_peer_from_gid(dst_family, attr->grh.dgid.raw, &dst)) 971 goto free_resources; 972 973 if (create_peer_from_gid(src_family, &sgid.raw, &src)) 974 goto free_resources; 975 976 if (neigh_set_dst(&neigh_handler, dst_family, dst.address, 977 dst.size)) 978 goto free_resources; 979 980 if (neigh_set_src(&neigh_handler, src_family, src.address, 981 src.size)) 982 goto free_resources; 983 984 oif = neigh_get_oif_from_src(&neigh_handler); 985 986 if (oif > 0) 987 neigh_set_oif(&neigh_handler, oif); 988 else 989 goto free_resources; 990 991 ret = -EHOSTUNREACH; 992 993 /* blocking call */ 994 if (process_get_neigh(&neigh_handler)) 995 goto free_resources; 996 997 ret_vid = neigh_get_vlan_id_from_dev(&neigh_handler); 998 999 if (ret_vid <= 0xfff) 1000 neigh_set_vlan_id(&neigh_handler, ret_vid); 1001 1002 /* We are using only Ethernet here */ 1003 ether_len = neigh_get_ll(&neigh_handler, 1004 eth_mac, 1005 sizeof(uint8_t) * ETHERNET_LL_SIZE); 1006 1007 if (ether_len <= 0) 1008 goto free_resources; 1009 1010 *vid = ret_vid; 1011 1012 ret = 0; 1013 1014 free_resources: 1015 neigh_free_resources(&neigh_handler); 1016 1017 return ret; 1018 #else 1019 return -ENOSYS; 1020 #endif 1021 } 1022