1 /* 2 * Copyright (c) 2005 Topspin Communications. All rights reserved. 3 * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. 4 * 5 * This software is available to you under a choice of one of two 6 * licenses. You may choose to be licensed under the terms of the GNU 7 * General Public License (GPL) Version 2, available from the file 8 * COPYING in the main directory of this source tree, or the 9 * OpenIB.org BSD license below: 10 * 11 * Redistribution and use in source and binary forms, with or 12 * without modification, are permitted provided that the following 13 * conditions are met: 14 * 15 * - Redistributions of source code must retain the above 16 * copyright notice, this list of conditions and the following 17 * disclaimer. 18 * 19 * - Redistributions in binary form must reproduce the above 20 * copyright notice, this list of conditions and the following 21 * disclaimer in the documentation and/or other materials 22 * provided with the distribution. 23 * 24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 31 * SOFTWARE. 32 */ 33 34 #define _GNU_SOURCE 35 #include <config.h> 36 37 #include <infiniband/endian.h> 38 #include <stdio.h> 39 #include <unistd.h> 40 #include <stdlib.h> 41 #include <errno.h> 42 #include <string.h> 43 #include <dirent.h> 44 #include <netinet/in.h> 45 #include <netinet/ip.h> 46 #include <sys/socket.h> 47 48 #include "ibverbs.h" 49 #ifndef NRESOLVE_NEIGH 50 #include <net/if.h> 51 #include <net/if_arp.h> 52 #include "neigh.h" 53 #endif 54 55 /* Hack to avoid GCC's -Wmissing-prototypes and the similar error from sparse 56 with these prototypes. Symbol versionining requires the goofy names, the 57 prototype must match the version in verbs.h. 58 */ 59 int __ibv_query_device(struct ibv_context *context, 60 struct ibv_device_attr *device_attr); 61 int __ibv_query_port(struct ibv_context *context, uint8_t port_num, 62 struct ibv_port_attr *port_attr); 63 int __ibv_query_gid(struct ibv_context *context, uint8_t port_num, int index, 64 union ibv_gid *gid); 65 int __ibv_query_pkey(struct ibv_context *context, uint8_t port_num, int index, 66 __be16 *pkey); 67 struct ibv_pd *__ibv_alloc_pd(struct ibv_context *context); 68 int __ibv_dealloc_pd(struct ibv_pd *pd); 69 struct ibv_mr *__ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, 70 int access); 71 int __ibv_rereg_mr(struct ibv_mr *mr, int flags, struct ibv_pd *pd, void *addr, 72 size_t length, int access); 73 int __ibv_dereg_mr(struct ibv_mr *mr); 74 struct ibv_cq *__ibv_create_cq(struct ibv_context *context, int cqe, 75 void *cq_context, 76 struct ibv_comp_channel *channel, 77 int comp_vector); 78 int __ibv_resize_cq(struct ibv_cq *cq, int cqe); 79 int __ibv_destroy_cq(struct ibv_cq *cq); 80 int __ibv_get_cq_event(struct ibv_comp_channel *channel, struct ibv_cq **cq, 81 void **cq_context); 82 void __ibv_ack_cq_events(struct ibv_cq *cq, unsigned int nevents); 83 struct ibv_srq *__ibv_create_srq(struct ibv_pd *pd, 84 struct ibv_srq_init_attr *srq_init_attr); 85 int __ibv_modify_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr, 86 int srq_attr_mask); 87 int __ibv_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr); 88 int __ibv_destroy_srq(struct ibv_srq *srq); 89 struct ibv_qp *__ibv_create_qp(struct ibv_pd *pd, 90 struct ibv_qp_init_attr *qp_init_attr); 91 int __ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, 92 struct ibv_qp_init_attr *init_attr); 93 int __ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask); 94 int __ibv_destroy_qp(struct ibv_qp *qp); 95 struct ibv_ah *__ibv_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr); 96 int __ibv_destroy_ah(struct ibv_ah *ah); 97 int __ibv_attach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, 98 uint16_t lid); 99 int __ibv_detach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, 100 uint16_t lid); 101 102 int __attribute__((const)) ibv_rate_to_mult(enum ibv_rate rate) 103 { 104 switch (rate) { 105 case IBV_RATE_2_5_GBPS: return 1; 106 case IBV_RATE_5_GBPS: return 2; 107 case IBV_RATE_10_GBPS: return 4; 108 case IBV_RATE_20_GBPS: return 8; 109 case IBV_RATE_30_GBPS: return 12; 110 case IBV_RATE_40_GBPS: return 16; 111 case IBV_RATE_60_GBPS: return 24; 112 case IBV_RATE_80_GBPS: return 32; 113 case IBV_RATE_120_GBPS: return 48; 114 case IBV_RATE_28_GBPS: return 11; 115 case IBV_RATE_50_GBPS: return 20; 116 case IBV_RATE_400_GBPS: return 160; 117 case IBV_RATE_600_GBPS: return 240; 118 default: return -1; 119 } 120 } 121 122 enum ibv_rate __attribute__((const)) mult_to_ibv_rate(int mult) 123 { 124 switch (mult) { 125 case 1: return IBV_RATE_2_5_GBPS; 126 case 2: return IBV_RATE_5_GBPS; 127 case 4: return IBV_RATE_10_GBPS; 128 case 8: return IBV_RATE_20_GBPS; 129 case 12: return IBV_RATE_30_GBPS; 130 case 16: return IBV_RATE_40_GBPS; 131 case 24: return IBV_RATE_60_GBPS; 132 case 32: return IBV_RATE_80_GBPS; 133 case 48: return IBV_RATE_120_GBPS; 134 case 11: return IBV_RATE_28_GBPS; 135 case 20: return IBV_RATE_50_GBPS; 136 case 160: return IBV_RATE_400_GBPS; 137 case 240: return IBV_RATE_600_GBPS; 138 default: return IBV_RATE_MAX; 139 } 140 } 141 142 int __attribute__((const)) ibv_rate_to_mbps(enum ibv_rate rate) 143 { 144 switch (rate) { 145 case IBV_RATE_2_5_GBPS: return 2500; 146 case IBV_RATE_5_GBPS: return 5000; 147 case IBV_RATE_10_GBPS: return 10000; 148 case IBV_RATE_20_GBPS: return 20000; 149 case IBV_RATE_30_GBPS: return 30000; 150 case IBV_RATE_40_GBPS: return 40000; 151 case IBV_RATE_60_GBPS: return 60000; 152 case IBV_RATE_80_GBPS: return 80000; 153 case IBV_RATE_120_GBPS: return 120000; 154 case IBV_RATE_14_GBPS: return 14062; 155 case IBV_RATE_56_GBPS: return 56250; 156 case IBV_RATE_112_GBPS: return 112500; 157 case IBV_RATE_168_GBPS: return 168750; 158 case IBV_RATE_25_GBPS: return 25781; 159 case IBV_RATE_100_GBPS: return 103125; 160 case IBV_RATE_200_GBPS: return 206250; 161 case IBV_RATE_300_GBPS: return 309375; 162 case IBV_RATE_28_GBPS: return 28125; 163 case IBV_RATE_50_GBPS: return 53125; 164 case IBV_RATE_400_GBPS: return 425000; 165 case IBV_RATE_600_GBPS: return 637500; 166 default: return -1; 167 } 168 } 169 170 enum ibv_rate __attribute__((const)) mbps_to_ibv_rate(int mbps) 171 { 172 switch (mbps) { 173 case 2500: return IBV_RATE_2_5_GBPS; 174 case 5000: return IBV_RATE_5_GBPS; 175 case 10000: return IBV_RATE_10_GBPS; 176 case 20000: return IBV_RATE_20_GBPS; 177 case 30000: return IBV_RATE_30_GBPS; 178 case 40000: return IBV_RATE_40_GBPS; 179 case 60000: return IBV_RATE_60_GBPS; 180 case 80000: return IBV_RATE_80_GBPS; 181 case 120000: return IBV_RATE_120_GBPS; 182 case 14062: return IBV_RATE_14_GBPS; 183 case 56250: return IBV_RATE_56_GBPS; 184 case 112500: return IBV_RATE_112_GBPS; 185 case 168750: return IBV_RATE_168_GBPS; 186 case 25781: return IBV_RATE_25_GBPS; 187 case 103125: return IBV_RATE_100_GBPS; 188 case 206250: return IBV_RATE_200_GBPS; 189 case 309375: return IBV_RATE_300_GBPS; 190 case 28125: return IBV_RATE_28_GBPS; 191 case 53125: return IBV_RATE_50_GBPS; 192 case 425000: return IBV_RATE_400_GBPS; 193 case 637500: return IBV_RATE_600_GBPS; 194 default: return IBV_RATE_MAX; 195 } 196 } 197 198 int __ibv_query_device(struct ibv_context *context, 199 struct ibv_device_attr *device_attr) 200 { 201 return context->ops.query_device(context, device_attr); 202 } 203 default_symver(__ibv_query_device, ibv_query_device); 204 205 int __ibv_query_port(struct ibv_context *context, uint8_t port_num, 206 struct ibv_port_attr *port_attr) 207 { 208 return context->ops.query_port(context, port_num, port_attr); 209 } 210 default_symver(__ibv_query_port, ibv_query_port); 211 212 int __ibv_query_gid(struct ibv_context *context, uint8_t port_num, 213 int index, union ibv_gid *gid) 214 { 215 char name[24]; 216 char attr[41]; 217 uint16_t val; 218 int i; 219 220 snprintf(name, sizeof name, "ports/%d/gids/%d", port_num, index); 221 222 if (ibv_read_sysfs_file(context->device->ibdev_path, name, 223 attr, sizeof attr) < 0) 224 return -1; 225 226 for (i = 0; i < 8; ++i) { 227 if (sscanf(attr + i * 5, "%hx", &val) != 1) 228 return -1; 229 gid->raw[i * 2 ] = val >> 8; 230 gid->raw[i * 2 + 1] = val & 0xff; 231 } 232 233 return 0; 234 } 235 default_symver(__ibv_query_gid, ibv_query_gid); 236 237 int __ibv_query_pkey(struct ibv_context *context, uint8_t port_num, 238 int index, __be16 *pkey) 239 { 240 char name[24]; 241 char attr[8]; 242 uint16_t val; 243 244 snprintf(name, sizeof name, "ports/%d/pkeys/%d", port_num, index); 245 246 if (ibv_read_sysfs_file(context->device->ibdev_path, name, 247 attr, sizeof attr) < 0) 248 return -1; 249 250 if (sscanf(attr, "%hx", &val) != 1) 251 return -1; 252 253 *pkey = htobe16(val); 254 return 0; 255 } 256 default_symver(__ibv_query_pkey, ibv_query_pkey); 257 258 struct ibv_pd *__ibv_alloc_pd(struct ibv_context *context) 259 { 260 struct ibv_pd *pd; 261 262 pd = context->ops.alloc_pd(context); 263 if (pd) 264 pd->context = context; 265 266 return pd; 267 } 268 default_symver(__ibv_alloc_pd, ibv_alloc_pd); 269 270 int __ibv_dealloc_pd(struct ibv_pd *pd) 271 { 272 return pd->context->ops.dealloc_pd(pd); 273 } 274 default_symver(__ibv_dealloc_pd, ibv_dealloc_pd); 275 276 struct ibv_mr *__ibv_reg_mr(struct ibv_pd *pd, void *addr, 277 size_t length, int access) 278 { 279 struct ibv_mr *mr; 280 281 if (ibv_dontfork_range(addr, length)) 282 return NULL; 283 284 mr = pd->context->ops.reg_mr(pd, addr, length, access); 285 if (mr) { 286 mr->context = pd->context; 287 mr->pd = pd; 288 mr->addr = addr; 289 mr->length = length; 290 } else 291 ibv_dofork_range(addr, length); 292 293 return mr; 294 } 295 default_symver(__ibv_reg_mr, ibv_reg_mr); 296 297 int __ibv_rereg_mr(struct ibv_mr *mr, int flags, 298 struct ibv_pd *pd, void *addr, 299 size_t length, int access) 300 { 301 int dofork_onfail = 0; 302 int err; 303 void *old_addr; 304 size_t old_len; 305 306 if (flags & ~IBV_REREG_MR_FLAGS_SUPPORTED) { 307 errno = EINVAL; 308 return IBV_REREG_MR_ERR_INPUT; 309 } 310 311 if ((flags & IBV_REREG_MR_CHANGE_TRANSLATION) && 312 (!length || !addr)) { 313 errno = EINVAL; 314 return IBV_REREG_MR_ERR_INPUT; 315 } 316 317 if (access && !(flags & IBV_REREG_MR_CHANGE_ACCESS)) { 318 errno = EINVAL; 319 return IBV_REREG_MR_ERR_INPUT; 320 } 321 322 if (!mr->context->ops.rereg_mr) { 323 errno = ENOSYS; 324 return IBV_REREG_MR_ERR_INPUT; 325 } 326 327 if (flags & IBV_REREG_MR_CHANGE_TRANSLATION) { 328 err = ibv_dontfork_range(addr, length); 329 if (err) 330 return IBV_REREG_MR_ERR_DONT_FORK_NEW; 331 dofork_onfail = 1; 332 } 333 334 old_addr = mr->addr; 335 old_len = mr->length; 336 err = mr->context->ops.rereg_mr(mr, flags, pd, addr, length, access); 337 if (!err) { 338 if (flags & IBV_REREG_MR_CHANGE_PD) 339 mr->pd = pd; 340 if (flags & IBV_REREG_MR_CHANGE_TRANSLATION) { 341 mr->addr = addr; 342 mr->length = length; 343 err = ibv_dofork_range(old_addr, old_len); 344 if (err) 345 return IBV_REREG_MR_ERR_DO_FORK_OLD; 346 } 347 } else { 348 err = IBV_REREG_MR_ERR_CMD; 349 if (dofork_onfail) { 350 if (ibv_dofork_range(addr, length)) 351 err = IBV_REREG_MR_ERR_CMD_AND_DO_FORK_NEW; 352 } 353 } 354 355 return err; 356 } 357 default_symver(__ibv_rereg_mr, ibv_rereg_mr); 358 359 int __ibv_dereg_mr(struct ibv_mr *mr) 360 { 361 int ret; 362 void *addr = mr->addr; 363 size_t length = mr->length; 364 365 ret = mr->context->ops.dereg_mr(mr); 366 if (!ret) 367 ibv_dofork_range(addr, length); 368 369 return ret; 370 } 371 default_symver(__ibv_dereg_mr, ibv_dereg_mr); 372 373 static struct ibv_comp_channel *ibv_create_comp_channel_v2(struct ibv_context *context) 374 { 375 struct ibv_abi_compat_v2 *t = context->abi_compat; 376 static int warned; 377 378 if (!pthread_mutex_trylock(&t->in_use)) 379 return &t->channel; 380 381 if (!warned) { 382 fprintf(stderr, PFX "Warning: kernel's ABI version %d limits capacity.\n" 383 " Only one completion channel can be created per context.\n", 384 abi_ver); 385 ++warned; 386 } 387 388 return NULL; 389 } 390 391 struct ibv_comp_channel *ibv_create_comp_channel(struct ibv_context *context) 392 { 393 struct ibv_comp_channel *channel; 394 struct ibv_create_comp_channel cmd; 395 struct ibv_create_comp_channel_resp resp; 396 397 if (abi_ver <= 2) 398 return ibv_create_comp_channel_v2(context); 399 400 channel = malloc(sizeof *channel); 401 if (!channel) 402 return NULL; 403 404 IBV_INIT_CMD_RESP(&cmd, sizeof cmd, CREATE_COMP_CHANNEL, &resp, sizeof resp); 405 if (write(context->cmd_fd, &cmd, sizeof cmd) != sizeof cmd) { 406 free(channel); 407 return NULL; 408 } 409 410 (void) VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); 411 412 channel->context = context; 413 channel->fd = resp.fd; 414 channel->refcnt = 0; 415 416 return channel; 417 } 418 419 static int ibv_destroy_comp_channel_v2(struct ibv_comp_channel *channel) 420 { 421 struct ibv_abi_compat_v2 *t = (struct ibv_abi_compat_v2 *) channel; 422 pthread_mutex_unlock(&t->in_use); 423 return 0; 424 } 425 426 int ibv_destroy_comp_channel(struct ibv_comp_channel *channel) 427 { 428 struct ibv_context *context; 429 int ret; 430 431 context = channel->context; 432 pthread_mutex_lock(&context->mutex); 433 434 if (channel->refcnt) { 435 ret = EBUSY; 436 goto out; 437 } 438 439 if (abi_ver <= 2) { 440 ret = ibv_destroy_comp_channel_v2(channel); 441 goto out; 442 } 443 444 close(channel->fd); 445 free(channel); 446 ret = 0; 447 448 out: 449 pthread_mutex_unlock(&context->mutex); 450 451 return ret; 452 } 453 454 struct ibv_cq *__ibv_create_cq(struct ibv_context *context, int cqe, void *cq_context, 455 struct ibv_comp_channel *channel, int comp_vector) 456 { 457 struct ibv_cq *cq; 458 int err = 0; 459 460 cq = context->ops.create_cq(context, cqe, channel, comp_vector); 461 462 if (!cq) 463 return NULL; 464 465 err = verbs_init_cq(cq, context, channel, cq_context); 466 if (err) 467 goto err; 468 469 return cq; 470 471 err: 472 context->ops.destroy_cq(cq); 473 474 return NULL; 475 } 476 default_symver(__ibv_create_cq, ibv_create_cq); 477 478 int __ibv_resize_cq(struct ibv_cq *cq, int cqe) 479 { 480 if (!cq->context->ops.resize_cq) 481 return ENOSYS; 482 483 return cq->context->ops.resize_cq(cq, cqe); 484 } 485 default_symver(__ibv_resize_cq, ibv_resize_cq); 486 487 int __ibv_destroy_cq(struct ibv_cq *cq) 488 { 489 struct ibv_comp_channel *channel = cq->channel; 490 int ret; 491 492 ret = cq->context->ops.destroy_cq(cq); 493 494 if (channel) { 495 if (!ret) { 496 pthread_mutex_lock(&channel->context->mutex); 497 --channel->refcnt; 498 pthread_mutex_unlock(&channel->context->mutex); 499 } 500 } 501 502 return ret; 503 } 504 default_symver(__ibv_destroy_cq, ibv_destroy_cq); 505 506 int __ibv_get_cq_event(struct ibv_comp_channel *channel, 507 struct ibv_cq **cq, void **cq_context) 508 { 509 struct ibv_comp_event ev; 510 511 if (read(channel->fd, &ev, sizeof ev) != sizeof ev) 512 return -1; 513 514 *cq = (struct ibv_cq *) (uintptr_t) ev.cq_handle; 515 *cq_context = (*cq)->cq_context; 516 517 if ((*cq)->context->ops.cq_event) 518 (*cq)->context->ops.cq_event(*cq); 519 520 return 0; 521 } 522 default_symver(__ibv_get_cq_event, ibv_get_cq_event); 523 524 void __ibv_ack_cq_events(struct ibv_cq *cq, unsigned int nevents) 525 { 526 pthread_mutex_lock(&cq->mutex); 527 cq->comp_events_completed += nevents; 528 pthread_cond_signal(&cq->cond); 529 pthread_mutex_unlock(&cq->mutex); 530 } 531 default_symver(__ibv_ack_cq_events, ibv_ack_cq_events); 532 533 struct ibv_srq *__ibv_create_srq(struct ibv_pd *pd, 534 struct ibv_srq_init_attr *srq_init_attr) 535 { 536 struct ibv_srq *srq; 537 538 if (!pd->context->ops.create_srq) 539 return NULL; 540 541 srq = pd->context->ops.create_srq(pd, srq_init_attr); 542 if (!srq) 543 return NULL; 544 545 srq->context = pd->context; 546 srq->srq_context = srq_init_attr->srq_context; 547 srq->pd = pd; 548 srq->events_completed = 0; 549 if (pthread_mutex_init(&srq->mutex, NULL)) 550 goto err; 551 if (pthread_cond_init(&srq->cond, NULL)) 552 goto err_mutex; 553 554 return srq; 555 556 err_mutex: 557 pthread_mutex_destroy(&srq->mutex); 558 err: 559 pd->context->ops.destroy_srq(srq); 560 561 return NULL; 562 } 563 default_symver(__ibv_create_srq, ibv_create_srq); 564 565 int __ibv_modify_srq(struct ibv_srq *srq, 566 struct ibv_srq_attr *srq_attr, 567 int srq_attr_mask) 568 { 569 return srq->context->ops.modify_srq(srq, srq_attr, srq_attr_mask); 570 } 571 default_symver(__ibv_modify_srq, ibv_modify_srq); 572 573 int __ibv_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr) 574 { 575 return srq->context->ops.query_srq(srq, srq_attr); 576 } 577 default_symver(__ibv_query_srq, ibv_query_srq); 578 579 int __ibv_destroy_srq(struct ibv_srq *srq) 580 { 581 pthread_cond_destroy(&srq->cond); 582 pthread_mutex_destroy(&srq->mutex); 583 return srq->context->ops.destroy_srq(srq); 584 } 585 default_symver(__ibv_destroy_srq, ibv_destroy_srq); 586 587 struct ibv_qp *__ibv_create_qp(struct ibv_pd *pd, 588 struct ibv_qp_init_attr *qp_init_attr) 589 { 590 struct ibv_qp *qp = pd->context->ops.create_qp(pd, qp_init_attr); 591 592 if (qp) { 593 qp->context = pd->context; 594 qp->qp_context = qp_init_attr->qp_context; 595 qp->pd = pd; 596 qp->send_cq = qp_init_attr->send_cq; 597 qp->recv_cq = qp_init_attr->recv_cq; 598 qp->srq = qp_init_attr->srq; 599 qp->qp_type = qp_init_attr->qp_type; 600 qp->state = IBV_QPS_RESET; 601 qp->events_completed = 0; 602 pthread_mutex_init(&qp->mutex, NULL); 603 pthread_cond_init(&qp->cond, NULL); 604 } 605 606 return qp; 607 } 608 default_symver(__ibv_create_qp, ibv_create_qp); 609 610 int __ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, 611 int attr_mask, 612 struct ibv_qp_init_attr *init_attr) 613 { 614 int ret; 615 616 ret = qp->context->ops.query_qp(qp, attr, attr_mask, init_attr); 617 if (ret) 618 return ret; 619 620 if (attr_mask & IBV_QP_STATE) 621 qp->state = attr->qp_state; 622 623 return 0; 624 } 625 default_symver(__ibv_query_qp, ibv_query_qp); 626 627 int __ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, 628 int attr_mask) 629 { 630 int ret; 631 632 ret = qp->context->ops.modify_qp(qp, attr, attr_mask); 633 if (ret) 634 return ret; 635 636 if (attr_mask & IBV_QP_STATE) 637 qp->state = attr->qp_state; 638 639 return 0; 640 } 641 default_symver(__ibv_modify_qp, ibv_modify_qp); 642 643 int __ibv_destroy_qp(struct ibv_qp *qp) 644 { 645 return qp->context->ops.destroy_qp(qp); 646 } 647 default_symver(__ibv_destroy_qp, ibv_destroy_qp); 648 649 struct ibv_ah *__ibv_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr) 650 { 651 struct ibv_ah *ah = pd->context->ops.create_ah(pd, attr); 652 653 if (ah) { 654 ah->context = pd->context; 655 ah->pd = pd; 656 } 657 658 return ah; 659 } 660 default_symver(__ibv_create_ah, ibv_create_ah); 661 662 /* GID types as appear in sysfs, no change is expected as of ABI 663 * compatibility. 664 */ 665 #define V1_TYPE "IB/RoCE v1" 666 #define V2_TYPE "RoCE v2" 667 int ibv_query_gid_type(struct ibv_context *context, uint8_t port_num, 668 unsigned int index, enum ibv_gid_type *type) 669 { 670 char name[32]; 671 char buff[11]; 672 673 snprintf(name, sizeof(name), "ports/%d/gid_attrs/types/%d", port_num, 674 index); 675 676 /* Reset errno so that we can rely on its value upon any error flow in 677 * ibv_read_sysfs_file. 678 */ 679 errno = 0; 680 if (ibv_read_sysfs_file(context->device->ibdev_path, name, buff, 681 sizeof(buff)) <= 0) { 682 char *dir_path; 683 DIR *dir; 684 685 if (errno == EINVAL) { 686 /* In IB, this file doesn't exist and the kernel sets 687 * errno to -EINVAL. 688 */ 689 *type = IBV_GID_TYPE_IB_ROCE_V1; 690 return 0; 691 } 692 if (asprintf(&dir_path, "%s/%s/%d/%s/", 693 context->device->ibdev_path, "ports", port_num, 694 "gid_attrs") < 0) 695 return -1; 696 dir = opendir(dir_path); 697 free(dir_path); 698 if (!dir) { 699 if (errno == ENOENT) 700 /* Assuming that if gid_attrs doesn't exist, 701 * we have an old kernel and all GIDs are 702 * IB/RoCE v1 703 */ 704 *type = IBV_GID_TYPE_IB_ROCE_V1; 705 else 706 return -1; 707 } else { 708 closedir(dir); 709 errno = EFAULT; 710 return -1; 711 } 712 } else { 713 if (!strcmp(buff, V1_TYPE)) { 714 *type = IBV_GID_TYPE_IB_ROCE_V1; 715 } else if (!strcmp(buff, V2_TYPE)) { 716 *type = IBV_GID_TYPE_ROCE_V2; 717 } else { 718 errno = ENOTSUP; 719 return -1; 720 } 721 } 722 723 return 0; 724 } 725 726 static int ibv_find_gid_index(struct ibv_context *context, uint8_t port_num, 727 union ibv_gid *gid, enum ibv_gid_type gid_type) 728 { 729 enum ibv_gid_type sgid_type = 0; 730 union ibv_gid sgid; 731 int i = 0, ret; 732 733 do { 734 ret = ibv_query_gid(context, port_num, i, &sgid); 735 if (!ret) { 736 ret = ibv_query_gid_type(context, port_num, i, 737 &sgid_type); 738 } 739 i++; 740 } while (!ret && (memcmp(&sgid, gid, sizeof(*gid)) || 741 (gid_type != sgid_type))); 742 743 return ret ? ret : i - 1; 744 } 745 746 static inline void map_ipv4_addr_to_ipv6(__be32 ipv4, struct in6_addr *ipv6) 747 { 748 ipv6->s6_addr32[0] = 0; 749 ipv6->s6_addr32[1] = 0; 750 ipv6->s6_addr32[2] = htobe32(0x0000FFFF); 751 ipv6->s6_addr32[3] = ipv4; 752 } 753 754 static inline __sum16 ipv4_calc_hdr_csum(uint16_t *data, unsigned int num_hwords) 755 { 756 unsigned int i = 0; 757 uint32_t sum = 0; 758 759 for (i = 0; i < num_hwords; i++) 760 sum += *(data++); 761 762 sum = (sum & 0xffff) + (sum >> 16); 763 764 return (__sum16)~sum; 765 } 766 767 static inline int get_grh_header_version(struct ibv_grh *grh) 768 { 769 int ip6h_version = (be32toh(grh->version_tclass_flow) >> 28) & 0xf; 770 struct ip *ip4h = (struct ip *)((void *)grh + 20); 771 struct ip ip4h_checked; 772 773 if (ip6h_version != 6) { 774 if (ip4h->ip_v == 4) 775 return 4; 776 errno = EPROTONOSUPPORT; 777 return -1; 778 } 779 /* version may be 6 or 4 */ 780 if (ip4h->ip_hl != 5) /* IPv4 header length must be 5 for RoCE v2. */ 781 return 6; 782 /* 783 * Verify checksum. 784 * We can't write on scattered buffers so we have to copy to temp 785 * buffer. 786 */ 787 memcpy(&ip4h_checked, ip4h, sizeof(ip4h_checked)); 788 /* Need to set the checksum field (check) to 0 before re-calculating 789 * the checksum. 790 */ 791 ip4h_checked.ip_sum = 0; 792 ip4h_checked.ip_sum = ipv4_calc_hdr_csum((uint16_t *)&ip4h_checked, 10); 793 /* if IPv4 header checksum is OK, believe it */ 794 if (ip4h->ip_sum == ip4h_checked.ip_sum) 795 return 4; 796 return 6; 797 } 798 799 static inline void set_ah_attr_generic_fields(struct ibv_ah_attr *ah_attr, 800 struct ibv_wc *wc, 801 struct ibv_grh *grh, 802 uint8_t port_num) 803 { 804 uint32_t flow_class; 805 806 flow_class = be32toh(grh->version_tclass_flow); 807 ah_attr->grh.flow_label = flow_class & 0xFFFFF; 808 ah_attr->dlid = wc->slid; 809 ah_attr->sl = wc->sl; 810 ah_attr->src_path_bits = wc->dlid_path_bits; 811 ah_attr->port_num = port_num; 812 } 813 814 static inline int set_ah_attr_by_ipv4(struct ibv_context *context, 815 struct ibv_ah_attr *ah_attr, 816 struct ip *ip4h, uint8_t port_num) 817 { 818 union ibv_gid sgid; 819 int ret; 820 821 /* No point searching multicast GIDs in GID table */ 822 if (IN_CLASSD(be32toh(ip4h->ip_dst.s_addr))) { 823 errno = EINVAL; 824 return -1; 825 } 826 827 map_ipv4_addr_to_ipv6(ip4h->ip_dst.s_addr, (struct in6_addr *)&sgid); 828 ret = ibv_find_gid_index(context, port_num, &sgid, 829 IBV_GID_TYPE_ROCE_V2); 830 if (ret < 0) 831 return ret; 832 833 map_ipv4_addr_to_ipv6(ip4h->ip_src.s_addr, 834 (struct in6_addr *)&ah_attr->grh.dgid); 835 ah_attr->grh.sgid_index = (uint8_t) ret; 836 ah_attr->grh.hop_limit = ip4h->ip_ttl; 837 ah_attr->grh.traffic_class = ip4h->ip_tos; 838 839 return 0; 840 } 841 842 #define IB_NEXT_HDR 0x1b 843 static inline int set_ah_attr_by_ipv6(struct ibv_context *context, 844 struct ibv_ah_attr *ah_attr, 845 struct ibv_grh *grh, uint8_t port_num) 846 { 847 uint32_t flow_class; 848 uint32_t sgid_type; 849 int ret; 850 851 /* No point searching multicast GIDs in GID table */ 852 if (grh->dgid.raw[0] == 0xFF) { 853 errno = EINVAL; 854 return -1; 855 } 856 857 ah_attr->grh.dgid = grh->sgid; 858 if (grh->next_hdr == IPPROTO_UDP) { 859 sgid_type = IBV_GID_TYPE_ROCE_V2; 860 } else if (grh->next_hdr == IB_NEXT_HDR) { 861 sgid_type = IBV_GID_TYPE_IB_ROCE_V1; 862 } else { 863 errno = EPROTONOSUPPORT; 864 return -1; 865 } 866 867 ret = ibv_find_gid_index(context, port_num, &grh->dgid, 868 sgid_type); 869 if (ret < 0) 870 return ret; 871 872 ah_attr->grh.sgid_index = (uint8_t) ret; 873 flow_class = be32toh(grh->version_tclass_flow); 874 ah_attr->grh.hop_limit = grh->hop_limit; 875 ah_attr->grh.traffic_class = (flow_class >> 20) & 0xFF; 876 877 return 0; 878 } 879 880 int ibv_init_ah_from_wc(struct ibv_context *context, uint8_t port_num, 881 struct ibv_wc *wc, struct ibv_grh *grh, 882 struct ibv_ah_attr *ah_attr) 883 { 884 int version; 885 int ret = 0; 886 887 memset(ah_attr, 0, sizeof *ah_attr); 888 set_ah_attr_generic_fields(ah_attr, wc, grh, port_num); 889 890 if (wc->wc_flags & IBV_WC_GRH) { 891 ah_attr->is_global = 1; 892 version = get_grh_header_version(grh); 893 894 if (version == 4) 895 ret = set_ah_attr_by_ipv4(context, ah_attr, 896 (struct ip *)((void *)grh + 20), 897 port_num); 898 else if (version == 6) 899 ret = set_ah_attr_by_ipv6(context, ah_attr, grh, 900 port_num); 901 else 902 ret = -1; 903 } 904 905 return ret; 906 } 907 908 struct ibv_ah *ibv_create_ah_from_wc(struct ibv_pd *pd, struct ibv_wc *wc, 909 struct ibv_grh *grh, uint8_t port_num) 910 { 911 struct ibv_ah_attr ah_attr; 912 int ret; 913 914 ret = ibv_init_ah_from_wc(pd->context, port_num, wc, grh, &ah_attr); 915 if (ret) 916 return NULL; 917 918 return ibv_create_ah(pd, &ah_attr); 919 } 920 921 int __ibv_destroy_ah(struct ibv_ah *ah) 922 { 923 return ah->context->ops.destroy_ah(ah); 924 } 925 default_symver(__ibv_destroy_ah, ibv_destroy_ah); 926 927 int __ibv_attach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid) 928 { 929 return qp->context->ops.attach_mcast(qp, gid, lid); 930 } 931 default_symver(__ibv_attach_mcast, ibv_attach_mcast); 932 933 int __ibv_detach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid) 934 { 935 return qp->context->ops.detach_mcast(qp, gid, lid); 936 } 937 default_symver(__ibv_detach_mcast, ibv_detach_mcast); 938 939 static inline int ipv6_addr_v4mapped(const struct in6_addr *a) 940 { 941 return IN6_IS_ADDR_V4MAPPED(a) || 942 /* IPv4 encoded multicast addresses */ 943 (a->s6_addr32[0] == htobe32(0xff0e0000) && 944 ((a->s6_addr32[1] | 945 (a->s6_addr32[2] ^ htobe32(0x0000ffff))) == 0UL)); 946 } 947 948 struct peer_address { 949 void *address; 950 uint32_t size; 951 }; 952 953 static inline int create_peer_from_gid(int family, void *raw_gid, 954 struct peer_address *peer_address) 955 { 956 switch (family) { 957 case AF_INET: 958 peer_address->address = raw_gid + 12; 959 peer_address->size = 4; 960 break; 961 case AF_INET6: 962 peer_address->address = raw_gid; 963 peer_address->size = 16; 964 break; 965 default: 966 return -1; 967 } 968 969 return 0; 970 } 971 972 #define NEIGH_GET_DEFAULT_TIMEOUT_MS 3000 973 int ibv_resolve_eth_l2_from_gid(struct ibv_context *context, 974 struct ibv_ah_attr *attr, 975 uint8_t eth_mac[ETHERNET_LL_SIZE], 976 uint16_t *vid) 977 { 978 #ifndef NRESOLVE_NEIGH 979 int dst_family; 980 int src_family; 981 int oif; 982 struct get_neigh_handler neigh_handler; 983 union ibv_gid sgid; 984 int ether_len; 985 struct peer_address src; 986 struct peer_address dst; 987 uint16_t ret_vid; 988 int ret = -EINVAL; 989 int err; 990 991 err = ibv_query_gid(context, attr->port_num, 992 attr->grh.sgid_index, &sgid); 993 994 if (err) 995 return err; 996 997 err = neigh_init_resources(&neigh_handler, 998 NEIGH_GET_DEFAULT_TIMEOUT_MS); 999 1000 if (err) 1001 return err; 1002 1003 dst_family = ipv6_addr_v4mapped((struct in6_addr *)attr->grh.dgid.raw) ? 1004 AF_INET : AF_INET6; 1005 src_family = ipv6_addr_v4mapped((struct in6_addr *)sgid.raw) ? 1006 AF_INET : AF_INET6; 1007 1008 if (create_peer_from_gid(dst_family, attr->grh.dgid.raw, &dst)) 1009 goto free_resources; 1010 1011 if (create_peer_from_gid(src_family, &sgid.raw, &src)) 1012 goto free_resources; 1013 1014 if (neigh_set_dst(&neigh_handler, dst_family, dst.address, 1015 dst.size)) 1016 goto free_resources; 1017 1018 if (neigh_set_src(&neigh_handler, src_family, src.address, 1019 src.size)) 1020 goto free_resources; 1021 1022 oif = neigh_get_oif_from_src(&neigh_handler); 1023 1024 if (oif > 0) 1025 neigh_set_oif(&neigh_handler, oif); 1026 else 1027 goto free_resources; 1028 1029 ret = -EHOSTUNREACH; 1030 1031 /* blocking call */ 1032 if (process_get_neigh(&neigh_handler)) 1033 goto free_resources; 1034 1035 ret_vid = neigh_get_vlan_id_from_dev(&neigh_handler); 1036 1037 if (ret_vid <= 0xfff) 1038 neigh_set_vlan_id(&neigh_handler, ret_vid); 1039 1040 /* We are using only Ethernet here */ 1041 ether_len = neigh_get_ll(&neigh_handler, 1042 eth_mac, 1043 sizeof(uint8_t) * ETHERNET_LL_SIZE); 1044 1045 if (ether_len <= 0) 1046 goto free_resources; 1047 1048 *vid = ret_vid; 1049 1050 ret = 0; 1051 1052 free_resources: 1053 neigh_free_resources(&neigh_handler); 1054 1055 return ret; 1056 #else 1057 return -ENOSYS; 1058 #endif 1059 } 1060