1 /* 2 * Copyright (c) 2005-2014 Intel Corporation. All rights reserved. 3 * 4 * This software is available to you under a choice of one of two 5 * licenses. You may choose to be licensed under the terms of the GNU 6 * General Public License (GPL) Version 2, available from the file 7 * COPYING in the main directory of this source tree, or the 8 * OpenIB.org BSD license below: 9 * 10 * Redistribution and use in source and binary forms, with or 11 * without modification, are permitted provided that the following 12 * conditions are met: 13 * 14 * - Redistributions of source code must retain the above 15 * copyright notice, this list of conditions and the following 16 * disclaimer. 17 * 18 * - Redistributions in binary form must reproduce the above 19 * copyright notice, this list of conditions and the following 20 * disclaimer in the documentation and/or other materials 21 * provided with the distribution. 22 * 23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30 * SOFTWARE. 31 */ 32 33 #include <config.h> 34 35 #include <stdlib.h> 36 #include <string.h> 37 #include <glob.h> 38 #include <stdio.h> 39 #include <fcntl.h> 40 #include <errno.h> 41 #include <stdint.h> 42 #include <poll.h> 43 #include <unistd.h> 44 #include <pthread.h> 45 #include <infiniband/endian.h> 46 #include <stddef.h> 47 #include <netdb.h> 48 #include <syslog.h> 49 #include <limits.h> 50 51 #include "cma.h" 52 #include "indexer.h" 53 #include <infiniband/driver.h> 54 #include <infiniband/marshall.h> 55 #include <rdma/rdma_cma.h> 56 #include <rdma/rdma_cma_abi.h> 57 #include <rdma/rdma_verbs.h> 58 #include <infiniband/ib.h> 59 60 #define CMA_INIT_CMD(req, req_size, op) \ 61 do { \ 62 memset(req, 0, req_size); \ 63 (req)->cmd = UCMA_CMD_##op; \ 64 (req)->in = req_size - sizeof(struct ucma_abi_cmd_hdr); \ 65 } while (0) 66 67 #define CMA_INIT_CMD_RESP(req, req_size, op, resp, resp_size) \ 68 do { \ 69 CMA_INIT_CMD(req, req_size, op); \ 70 (req)->out = resp_size; \ 71 (req)->response = (uintptr_t) (resp); \ 72 } while (0) 73 74 struct cma_port { 75 uint8_t link_layer; 76 }; 77 78 struct cma_device { 79 struct ibv_context *verbs; 80 struct ibv_pd *pd; 81 struct ibv_xrcd *xrcd; 82 struct cma_port *port; 83 __be64 guid; 84 int port_cnt; 85 int refcnt; 86 int max_qpsize; 87 uint8_t max_initiator_depth; 88 uint8_t max_responder_resources; 89 }; 90 91 struct cma_id_private { 92 struct rdma_cm_id id; 93 struct cma_device *cma_dev; 94 void *connect; 95 size_t connect_len; 96 int events_completed; 97 int connect_error; 98 int sync; 99 pthread_cond_t cond; 100 pthread_mutex_t mut; 101 uint32_t handle; 102 struct cma_multicast *mc_list; 103 struct ibv_qp_init_attr *qp_init_attr; 104 uint8_t initiator_depth; 105 uint8_t responder_resources; 106 }; 107 108 struct cma_multicast { 109 struct cma_multicast *next; 110 struct cma_id_private *id_priv; 111 void *context; 112 int events_completed; 113 pthread_cond_t cond; 114 uint32_t handle; 115 union ibv_gid mgid; 116 uint16_t mlid; 117 struct sockaddr_storage addr; 118 }; 119 120 struct cma_event { 121 struct rdma_cm_event event; 122 uint8_t private_data[RDMA_MAX_PRIVATE_DATA]; 123 struct cma_id_private *id_priv; 124 struct cma_multicast *mc; 125 }; 126 127 static struct cma_device *cma_dev_array; 128 static int cma_dev_cnt; 129 static int cma_init_cnt; 130 static pthread_mutex_t mut = PTHREAD_MUTEX_INITIALIZER; 131 static int abi_ver = RDMA_USER_CM_MAX_ABI_VERSION; 132 int af_ib_support; 133 static struct index_map ucma_idm; 134 static fastlock_t idm_lock; 135 136 static int check_abi_version(void) 137 { 138 char value[8]; 139 140 if ((ibv_read_sysfs_file(ibv_get_sysfs_path(), 141 "class/misc/rdma_cm/abi_version", 142 value, sizeof value) < 0) && 143 (ibv_read_sysfs_file(ibv_get_sysfs_path(), 144 "class/infiniband_ucma/abi_version", 145 value, sizeof value) < 0)) { 146 /* 147 * Older version of Linux do not have class/misc. To support 148 * backports, assume the most recent version of the ABI. If 149 * we're wrong, we'll simply fail later when calling the ABI. 150 */ 151 return 0; 152 } 153 154 abi_ver = strtol(value, NULL, 10); 155 if (abi_ver < RDMA_USER_CM_MIN_ABI_VERSION || 156 abi_ver > RDMA_USER_CM_MAX_ABI_VERSION) { 157 return -1; 158 } 159 return 0; 160 } 161 162 /* 163 * This function is called holding the mutex lock 164 * cma_dev_cnt must be set before calling this function to 165 * ensure that the lock is not acquired recursively. 166 */ 167 static void ucma_set_af_ib_support(void) 168 { 169 struct rdma_cm_id *id; 170 struct sockaddr_ib sib; 171 int ret; 172 173 ret = rdma_create_id(NULL, &id, NULL, RDMA_PS_IB); 174 if (ret) 175 return; 176 177 memset(&sib, 0, sizeof sib); 178 sib.sib_family = AF_IB; 179 sib.sib_sid = htobe64(RDMA_IB_IP_PS_TCP); 180 sib.sib_sid_mask = htobe64(RDMA_IB_IP_PS_MASK); 181 af_ib_support = 1; 182 ret = rdma_bind_addr(id, (struct sockaddr *) &sib); 183 af_ib_support = !ret; 184 185 rdma_destroy_id(id); 186 } 187 188 int ucma_init(void) 189 { 190 struct ibv_device **dev_list = NULL; 191 int i, ret, dev_cnt; 192 193 /* Quick check without lock to see if we're already initialized */ 194 if (cma_dev_cnt) 195 return 0; 196 197 pthread_mutex_lock(&mut); 198 if (cma_dev_cnt) { 199 pthread_mutex_unlock(&mut); 200 return 0; 201 } 202 203 fastlock_init(&idm_lock); 204 ret = check_abi_version(); 205 if (ret) 206 goto err1; 207 208 dev_list = ibv_get_device_list(&dev_cnt); 209 if (!dev_list) { 210 ret = ERR(ENODEV); 211 goto err1; 212 } 213 214 if (!dev_cnt) { 215 ret = ERR(ENODEV); 216 goto err2; 217 } 218 219 cma_dev_array = calloc(dev_cnt, sizeof(*cma_dev_array)); 220 if (!cma_dev_array) { 221 ret = ERR(ENOMEM); 222 goto err2; 223 } 224 225 for (i = 0; dev_list[i]; i++) 226 cma_dev_array[i].guid = ibv_get_device_guid(dev_list[i]); 227 228 cma_dev_cnt = dev_cnt; 229 ucma_set_af_ib_support(); 230 pthread_mutex_unlock(&mut); 231 ibv_free_device_list(dev_list); 232 return 0; 233 234 err2: 235 ibv_free_device_list(dev_list); 236 err1: 237 fastlock_destroy(&idm_lock); 238 pthread_mutex_unlock(&mut); 239 return ret; 240 } 241 242 static struct ibv_context *ucma_open_device(__be64 guid) 243 { 244 struct ibv_device **dev_list; 245 struct ibv_context *verbs = NULL; 246 int i; 247 248 dev_list = ibv_get_device_list(NULL); 249 if (!dev_list) { 250 return NULL; 251 } 252 253 for (i = 0; dev_list[i]; i++) { 254 if (ibv_get_device_guid(dev_list[i]) == guid) { 255 verbs = ibv_open_device(dev_list[i]); 256 break; 257 } 258 } 259 260 ibv_free_device_list(dev_list); 261 return verbs; 262 } 263 264 static int ucma_init_device(struct cma_device *cma_dev) 265 { 266 struct ibv_port_attr port_attr; 267 struct ibv_device_attr attr; 268 int i, ret; 269 270 if (cma_dev->verbs) 271 return 0; 272 273 cma_dev->verbs = ucma_open_device(cma_dev->guid); 274 if (!cma_dev->verbs) 275 return ERR(ENODEV); 276 277 ret = ibv_query_device(cma_dev->verbs, &attr); 278 if (ret) { 279 ret = ERR(ret); 280 goto err; 281 } 282 283 cma_dev->port = malloc(sizeof(*cma_dev->port) * attr.phys_port_cnt); 284 if (!cma_dev->port) { 285 ret = ERR(ENOMEM); 286 goto err; 287 } 288 289 for (i = 1; i <= attr.phys_port_cnt; i++) { 290 if (ibv_query_port(cma_dev->verbs, i, &port_attr)) 291 cma_dev->port[i - 1].link_layer = IBV_LINK_LAYER_UNSPECIFIED; 292 else 293 cma_dev->port[i - 1].link_layer = port_attr.link_layer; 294 } 295 296 cma_dev->port_cnt = attr.phys_port_cnt; 297 cma_dev->max_qpsize = attr.max_qp_wr; 298 cma_dev->max_initiator_depth = (uint8_t) attr.max_qp_init_rd_atom; 299 cma_dev->max_responder_resources = (uint8_t) attr.max_qp_rd_atom; 300 cma_init_cnt++; 301 return 0; 302 303 err: 304 ibv_close_device(cma_dev->verbs); 305 cma_dev->verbs = NULL; 306 return ret; 307 } 308 309 static int ucma_init_all(void) 310 { 311 int i, ret = 0; 312 313 if (!cma_dev_cnt) { 314 ret = ucma_init(); 315 if (ret) 316 return ret; 317 } 318 319 if (cma_init_cnt == cma_dev_cnt) 320 return 0; 321 322 pthread_mutex_lock(&mut); 323 for (i = 0; i < cma_dev_cnt; i++) { 324 ret = ucma_init_device(&cma_dev_array[i]); 325 if (ret) 326 break; 327 } 328 pthread_mutex_unlock(&mut); 329 return ret; 330 } 331 332 struct ibv_context **rdma_get_devices(int *num_devices) 333 { 334 struct ibv_context **devs = NULL; 335 int i; 336 337 if (ucma_init_all()) 338 goto out; 339 340 devs = malloc(sizeof(*devs) * (cma_dev_cnt + 1)); 341 if (!devs) 342 goto out; 343 344 for (i = 0; i < cma_dev_cnt; i++) 345 devs[i] = cma_dev_array[i].verbs; 346 devs[i] = NULL; 347 out: 348 if (num_devices) 349 *num_devices = devs ? cma_dev_cnt : 0; 350 return devs; 351 } 352 353 void rdma_free_devices(struct ibv_context **list) 354 { 355 free(list); 356 } 357 358 struct rdma_event_channel *rdma_create_event_channel(void) 359 { 360 struct rdma_event_channel *channel; 361 362 if (ucma_init()) 363 return NULL; 364 365 channel = malloc(sizeof(*channel)); 366 if (!channel) 367 return NULL; 368 369 channel->fd = open("/dev/rdma_cm", O_RDWR | O_CLOEXEC); 370 if (channel->fd < 0) { 371 goto err; 372 } 373 return channel; 374 err: 375 free(channel); 376 return NULL; 377 } 378 379 void rdma_destroy_event_channel(struct rdma_event_channel *channel) 380 { 381 close(channel->fd); 382 free(channel); 383 } 384 385 static int ucma_get_device(struct cma_id_private *id_priv, __be64 guid) 386 { 387 struct cma_device *cma_dev; 388 int i, ret; 389 390 for (i = 0; i < cma_dev_cnt; i++) { 391 cma_dev = &cma_dev_array[i]; 392 if (cma_dev->guid == guid) 393 goto match; 394 } 395 396 return ERR(ENODEV); 397 match: 398 pthread_mutex_lock(&mut); 399 if ((ret = ucma_init_device(cma_dev))) 400 goto out; 401 402 if (!cma_dev->refcnt++) { 403 cma_dev->pd = ibv_alloc_pd(cma_dev->verbs); 404 if (!cma_dev->pd) { 405 cma_dev->refcnt--; 406 ret = ERR(ENOMEM); 407 goto out; 408 } 409 } 410 id_priv->cma_dev = cma_dev; 411 id_priv->id.verbs = cma_dev->verbs; 412 id_priv->id.pd = cma_dev->pd; 413 out: 414 pthread_mutex_unlock(&mut); 415 return ret; 416 } 417 418 static void ucma_put_device(struct cma_device *cma_dev) 419 { 420 pthread_mutex_lock(&mut); 421 if (!--cma_dev->refcnt) { 422 ibv_dealloc_pd(cma_dev->pd); 423 if (cma_dev->xrcd) 424 ibv_close_xrcd(cma_dev->xrcd); 425 } 426 pthread_mutex_unlock(&mut); 427 } 428 429 static struct ibv_xrcd *ucma_get_xrcd(struct cma_device *cma_dev) 430 { 431 struct ibv_xrcd_init_attr attr; 432 433 pthread_mutex_lock(&mut); 434 if (!cma_dev->xrcd) { 435 memset(&attr, 0, sizeof attr); 436 attr.comp_mask = IBV_XRCD_INIT_ATTR_FD | IBV_XRCD_INIT_ATTR_OFLAGS; 437 attr.fd = -1; 438 attr.oflags = O_CREAT; 439 cma_dev->xrcd = ibv_open_xrcd(cma_dev->verbs, &attr); 440 } 441 pthread_mutex_unlock(&mut); 442 return cma_dev->xrcd; 443 } 444 445 static void ucma_insert_id(struct cma_id_private *id_priv) 446 { 447 fastlock_acquire(&idm_lock); 448 idm_set(&ucma_idm, id_priv->handle, id_priv); 449 fastlock_release(&idm_lock); 450 } 451 452 static void ucma_remove_id(struct cma_id_private *id_priv) 453 { 454 if (id_priv->handle <= IDX_MAX_INDEX) 455 idm_clear(&ucma_idm, id_priv->handle); 456 } 457 458 static struct cma_id_private *ucma_lookup_id(int handle) 459 { 460 return idm_lookup(&ucma_idm, handle); 461 } 462 463 static void ucma_free_id(struct cma_id_private *id_priv) 464 { 465 ucma_remove_id(id_priv); 466 if (id_priv->cma_dev) 467 ucma_put_device(id_priv->cma_dev); 468 pthread_cond_destroy(&id_priv->cond); 469 pthread_mutex_destroy(&id_priv->mut); 470 if (id_priv->id.route.path_rec) 471 free(id_priv->id.route.path_rec); 472 473 if (id_priv->sync) 474 rdma_destroy_event_channel(id_priv->id.channel); 475 if (id_priv->connect_len) 476 free(id_priv->connect); 477 free(id_priv); 478 } 479 480 static struct cma_id_private *ucma_alloc_id(struct rdma_event_channel *channel, 481 void *context, 482 enum rdma_port_space ps, 483 enum ibv_qp_type qp_type) 484 { 485 struct cma_id_private *id_priv; 486 487 id_priv = calloc(1, sizeof(*id_priv)); 488 if (!id_priv) 489 return NULL; 490 491 id_priv->id.context = context; 492 id_priv->id.ps = ps; 493 id_priv->id.qp_type = qp_type; 494 id_priv->handle = 0xFFFFFFFF; 495 496 if (!channel) { 497 id_priv->id.channel = rdma_create_event_channel(); 498 if (!id_priv->id.channel) 499 goto err; 500 id_priv->sync = 1; 501 } else { 502 id_priv->id.channel = channel; 503 } 504 505 pthread_mutex_init(&id_priv->mut, NULL); 506 if (pthread_cond_init(&id_priv->cond, NULL)) 507 goto err; 508 509 return id_priv; 510 511 err: ucma_free_id(id_priv); 512 return NULL; 513 } 514 515 static int rdma_create_id2(struct rdma_event_channel *channel, 516 struct rdma_cm_id **id, void *context, 517 enum rdma_port_space ps, enum ibv_qp_type qp_type) 518 { 519 struct ucma_abi_create_id_resp resp; 520 struct ucma_abi_create_id cmd; 521 struct cma_id_private *id_priv; 522 int ret; 523 524 ret = ucma_init(); 525 if (ret) 526 return ret; 527 528 id_priv = ucma_alloc_id(channel, context, ps, qp_type); 529 if (!id_priv) 530 return ERR(ENOMEM); 531 532 CMA_INIT_CMD_RESP(&cmd, sizeof cmd, CREATE_ID, &resp, sizeof resp); 533 cmd.uid = (uintptr_t) id_priv; 534 cmd.ps = ps; 535 cmd.qp_type = qp_type; 536 537 ret = write(id_priv->id.channel->fd, &cmd, sizeof cmd); 538 if (ret != sizeof cmd) 539 goto err; 540 541 VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); 542 543 id_priv->handle = resp.id; 544 ucma_insert_id(id_priv); 545 *id = &id_priv->id; 546 return 0; 547 548 err: ucma_free_id(id_priv); 549 return ret; 550 } 551 552 int rdma_create_id(struct rdma_event_channel *channel, 553 struct rdma_cm_id **id, void *context, 554 enum rdma_port_space ps) 555 { 556 enum ibv_qp_type qp_type; 557 558 qp_type = (ps == RDMA_PS_IPOIB || ps == RDMA_PS_UDP) ? 559 IBV_QPT_UD : IBV_QPT_RC; 560 return rdma_create_id2(channel, id, context, ps, qp_type); 561 } 562 563 static int ucma_destroy_kern_id(int fd, uint32_t handle) 564 { 565 struct ucma_abi_destroy_id_resp resp; 566 struct ucma_abi_destroy_id cmd; 567 int ret; 568 569 CMA_INIT_CMD_RESP(&cmd, sizeof cmd, DESTROY_ID, &resp, sizeof resp); 570 cmd.id = handle; 571 572 ret = write(fd, &cmd, sizeof cmd); 573 if (ret != sizeof cmd) 574 return (ret >= 0) ? ERR(ENODATA) : -1; 575 576 VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); 577 578 return resp.events_reported; 579 } 580 581 int rdma_destroy_id(struct rdma_cm_id *id) 582 { 583 struct cma_id_private *id_priv; 584 int ret; 585 586 id_priv = container_of(id, struct cma_id_private, id); 587 ret = ucma_destroy_kern_id(id->channel->fd, id_priv->handle); 588 if (ret < 0) 589 return ret; 590 591 if (id_priv->id.event) 592 rdma_ack_cm_event(id_priv->id.event); 593 594 pthread_mutex_lock(&id_priv->mut); 595 while (id_priv->events_completed < ret) 596 pthread_cond_wait(&id_priv->cond, &id_priv->mut); 597 pthread_mutex_unlock(&id_priv->mut); 598 599 ucma_free_id(id_priv); 600 return 0; 601 } 602 603 int ucma_addrlen(struct sockaddr *addr) 604 { 605 if (!addr) 606 return 0; 607 608 switch (addr->sa_family) { 609 case PF_INET: 610 return sizeof(struct sockaddr_in); 611 case PF_INET6: 612 return sizeof(struct sockaddr_in6); 613 case PF_IB: 614 return af_ib_support ? sizeof(struct sockaddr_ib) : 0; 615 default: 616 return 0; 617 } 618 } 619 620 static int ucma_query_addr(struct rdma_cm_id *id) 621 { 622 struct ucma_abi_query_addr_resp resp; 623 struct ucma_abi_query cmd; 624 struct cma_id_private *id_priv; 625 int ret; 626 627 CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY, &resp, sizeof resp); 628 id_priv = container_of(id, struct cma_id_private, id); 629 cmd.id = id_priv->handle; 630 cmd.option = UCMA_QUERY_ADDR; 631 632 ret = write(id->channel->fd, &cmd, sizeof cmd); 633 if (ret != sizeof cmd) 634 return (ret >= 0) ? ERR(ENODATA) : -1; 635 636 VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); 637 638 memcpy(&id->route.addr.src_addr, &resp.src_addr, resp.src_size); 639 memcpy(&id->route.addr.dst_addr, &resp.dst_addr, resp.dst_size); 640 641 if (!id_priv->cma_dev && resp.node_guid) { 642 ret = ucma_get_device(id_priv, resp.node_guid); 643 if (ret) 644 return ret; 645 id->port_num = resp.port_num; 646 id->route.addr.addr.ibaddr.pkey = resp.pkey; 647 } 648 649 return 0; 650 } 651 652 static int ucma_query_gid(struct rdma_cm_id *id) 653 { 654 struct ucma_abi_query_addr_resp resp; 655 struct ucma_abi_query cmd; 656 struct cma_id_private *id_priv; 657 struct sockaddr_ib *sib; 658 int ret; 659 660 CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY, &resp, sizeof resp); 661 id_priv = container_of(id, struct cma_id_private, id); 662 cmd.id = id_priv->handle; 663 cmd.option = UCMA_QUERY_GID; 664 665 ret = write(id->channel->fd, &cmd, sizeof cmd); 666 if (ret != sizeof cmd) 667 return (ret >= 0) ? ERR(ENODATA) : -1; 668 669 VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); 670 671 sib = (struct sockaddr_ib *) &resp.src_addr; 672 memcpy(id->route.addr.addr.ibaddr.sgid.raw, sib->sib_addr.sib_raw, 673 sizeof id->route.addr.addr.ibaddr.sgid); 674 675 sib = (struct sockaddr_ib *) &resp.dst_addr; 676 memcpy(id->route.addr.addr.ibaddr.dgid.raw, sib->sib_addr.sib_raw, 677 sizeof id->route.addr.addr.ibaddr.dgid); 678 679 return 0; 680 } 681 682 static void ucma_convert_path(struct ibv_path_data *path_data, 683 struct ibv_sa_path_rec *sa_path) 684 { 685 uint32_t fl_hop; 686 687 sa_path->dgid = path_data->path.dgid; 688 sa_path->sgid = path_data->path.sgid; 689 sa_path->dlid = path_data->path.dlid; 690 sa_path->slid = path_data->path.slid; 691 sa_path->raw_traffic = 0; 692 693 fl_hop = be32toh(path_data->path.flowlabel_hoplimit); 694 sa_path->flow_label = htobe32(fl_hop >> 8); 695 sa_path->hop_limit = (uint8_t) fl_hop; 696 697 sa_path->traffic_class = path_data->path.tclass; 698 sa_path->reversible = path_data->path.reversible_numpath >> 7; 699 sa_path->numb_path = 1; 700 sa_path->pkey = path_data->path.pkey; 701 sa_path->sl = be16toh(path_data->path.qosclass_sl) & 0xF; 702 sa_path->mtu_selector = 2; /* exactly */ 703 sa_path->mtu = path_data->path.mtu & 0x1F; 704 sa_path->rate_selector = 2; 705 sa_path->rate = path_data->path.rate & 0x1F; 706 sa_path->packet_life_time_selector = 2; 707 sa_path->packet_life_time = path_data->path.packetlifetime & 0x1F; 708 709 sa_path->preference = (uint8_t) path_data->flags; 710 } 711 712 static int ucma_query_path(struct rdma_cm_id *id) 713 { 714 struct ucma_abi_query_path_resp *resp; 715 struct ucma_abi_query cmd; 716 struct cma_id_private *id_priv; 717 int ret, i, size; 718 719 size = sizeof(*resp) + sizeof(struct ibv_path_data) * 6; 720 resp = alloca(size); 721 CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY, resp, size); 722 id_priv = container_of(id, struct cma_id_private, id); 723 cmd.id = id_priv->handle; 724 cmd.option = UCMA_QUERY_PATH; 725 726 ret = write(id->channel->fd, &cmd, sizeof cmd); 727 if (ret != sizeof cmd) 728 return (ret >= 0) ? ERR(ENODATA) : -1; 729 730 VALGRIND_MAKE_MEM_DEFINED(resp, size); 731 732 if (resp->num_paths) { 733 id->route.path_rec = malloc(sizeof(*id->route.path_rec) * 734 resp->num_paths); 735 if (!id->route.path_rec) 736 return ERR(ENOMEM); 737 738 id->route.num_paths = resp->num_paths; 739 for (i = 0; i < resp->num_paths; i++) 740 ucma_convert_path(&resp->path_data[i], &id->route.path_rec[i]); 741 } 742 743 return 0; 744 } 745 746 static int ucma_query_route(struct rdma_cm_id *id) 747 { 748 struct ucma_abi_query_route_resp resp; 749 struct ucma_abi_query cmd; 750 struct cma_id_private *id_priv; 751 int ret, i; 752 753 CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY_ROUTE, &resp, sizeof resp); 754 id_priv = container_of(id, struct cma_id_private, id); 755 cmd.id = id_priv->handle; 756 757 ret = write(id->channel->fd, &cmd, sizeof cmd); 758 if (ret != sizeof cmd) 759 return (ret >= 0) ? ERR(ENODATA) : -1; 760 761 VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); 762 763 if (resp.num_paths) { 764 id->route.path_rec = malloc(sizeof(*id->route.path_rec) * 765 resp.num_paths); 766 if (!id->route.path_rec) 767 return ERR(ENOMEM); 768 769 id->route.num_paths = resp.num_paths; 770 for (i = 0; i < resp.num_paths; i++) 771 ibv_copy_path_rec_from_kern(&id->route.path_rec[i], 772 &resp.ib_route[i]); 773 } 774 775 memcpy(id->route.addr.addr.ibaddr.sgid.raw, resp.ib_route[0].sgid, 776 sizeof id->route.addr.addr.ibaddr.sgid); 777 memcpy(id->route.addr.addr.ibaddr.dgid.raw, resp.ib_route[0].dgid, 778 sizeof id->route.addr.addr.ibaddr.dgid); 779 id->route.addr.addr.ibaddr.pkey = resp.ib_route[0].pkey; 780 memcpy(&id->route.addr.src_addr, &resp.src_addr, 781 sizeof resp.src_addr); 782 memcpy(&id->route.addr.dst_addr, &resp.dst_addr, 783 sizeof resp.dst_addr); 784 785 if (!id_priv->cma_dev && resp.node_guid) { 786 ret = ucma_get_device(id_priv, resp.node_guid); 787 if (ret) 788 return ret; 789 id_priv->id.port_num = resp.port_num; 790 } 791 792 return 0; 793 } 794 795 static int rdma_bind_addr2(struct rdma_cm_id *id, struct sockaddr *addr, 796 socklen_t addrlen) 797 { 798 struct ucma_abi_bind cmd; 799 struct cma_id_private *id_priv; 800 int ret; 801 802 CMA_INIT_CMD(&cmd, sizeof cmd, BIND); 803 id_priv = container_of(id, struct cma_id_private, id); 804 cmd.id = id_priv->handle; 805 cmd.addr_size = addrlen; 806 memcpy(&cmd.addr, addr, addrlen); 807 808 ret = write(id->channel->fd, &cmd, sizeof cmd); 809 if (ret != sizeof cmd) 810 return (ret >= 0) ? ERR(ENODATA) : -1; 811 812 ret = ucma_query_addr(id); 813 if (!ret) 814 ret = ucma_query_gid(id); 815 return ret; 816 } 817 818 int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) 819 { 820 struct ucma_abi_bind_ip cmd; 821 struct cma_id_private *id_priv; 822 int ret, addrlen; 823 824 addrlen = ucma_addrlen(addr); 825 if (!addrlen) 826 return ERR(EINVAL); 827 828 if (af_ib_support) 829 return rdma_bind_addr2(id, addr, addrlen); 830 831 CMA_INIT_CMD(&cmd, sizeof cmd, BIND_IP); 832 id_priv = container_of(id, struct cma_id_private, id); 833 cmd.id = id_priv->handle; 834 memcpy(&cmd.addr, addr, addrlen); 835 836 ret = write(id->channel->fd, &cmd, sizeof cmd); 837 if (ret != sizeof cmd) 838 return (ret >= 0) ? ERR(ENODATA) : -1; 839 840 return ucma_query_route(id); 841 } 842 843 int ucma_complete(struct rdma_cm_id *id) 844 { 845 struct cma_id_private *id_priv; 846 int ret; 847 848 id_priv = container_of(id, struct cma_id_private, id); 849 if (!id_priv->sync) 850 return 0; 851 852 if (id_priv->id.event) { 853 rdma_ack_cm_event(id_priv->id.event); 854 id_priv->id.event = NULL; 855 } 856 857 ret = rdma_get_cm_event(id_priv->id.channel, &id_priv->id.event); 858 if (ret) 859 return ret; 860 861 if (id_priv->id.event->status) { 862 if (id_priv->id.event->event == RDMA_CM_EVENT_REJECTED) 863 ret = ERR(ECONNREFUSED); 864 else if (id_priv->id.event->status < 0) 865 ret = ERR(-id_priv->id.event->status); 866 else 867 ret = ERR(-id_priv->id.event->status); 868 } 869 return ret; 870 } 871 872 static int rdma_resolve_addr2(struct rdma_cm_id *id, struct sockaddr *src_addr, 873 socklen_t src_len, struct sockaddr *dst_addr, 874 socklen_t dst_len, int timeout_ms) 875 { 876 struct ucma_abi_resolve_addr cmd; 877 struct cma_id_private *id_priv; 878 int ret; 879 880 CMA_INIT_CMD(&cmd, sizeof cmd, RESOLVE_ADDR); 881 id_priv = container_of(id, struct cma_id_private, id); 882 cmd.id = id_priv->handle; 883 if ((cmd.src_size = src_len)) 884 memcpy(&cmd.src_addr, src_addr, src_len); 885 memcpy(&cmd.dst_addr, dst_addr, dst_len); 886 cmd.dst_size = dst_len; 887 cmd.timeout_ms = timeout_ms; 888 889 ret = write(id->channel->fd, &cmd, sizeof cmd); 890 if (ret != sizeof cmd) 891 return (ret >= 0) ? ERR(ENODATA) : -1; 892 893 memcpy(&id->route.addr.dst_addr, dst_addr, dst_len); 894 return ucma_complete(id); 895 } 896 897 int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, 898 struct sockaddr *dst_addr, int timeout_ms) 899 { 900 struct ucma_abi_resolve_ip cmd; 901 struct cma_id_private *id_priv; 902 int ret, dst_len, src_len; 903 904 dst_len = ucma_addrlen(dst_addr); 905 if (!dst_len) 906 return ERR(EINVAL); 907 908 src_len = ucma_addrlen(src_addr); 909 if (src_addr && !src_len) 910 return ERR(EINVAL); 911 912 if (af_ib_support) 913 return rdma_resolve_addr2(id, src_addr, src_len, dst_addr, 914 dst_len, timeout_ms); 915 916 CMA_INIT_CMD(&cmd, sizeof cmd, RESOLVE_IP); 917 id_priv = container_of(id, struct cma_id_private, id); 918 cmd.id = id_priv->handle; 919 if (src_addr) 920 memcpy(&cmd.src_addr, src_addr, src_len); 921 memcpy(&cmd.dst_addr, dst_addr, dst_len); 922 cmd.timeout_ms = timeout_ms; 923 924 ret = write(id->channel->fd, &cmd, sizeof cmd); 925 if (ret != sizeof cmd) 926 return (ret >= 0) ? ERR(ENODATA) : -1; 927 928 memcpy(&id->route.addr.dst_addr, dst_addr, dst_len); 929 return ucma_complete(id); 930 } 931 932 static int ucma_set_ib_route(struct rdma_cm_id *id) 933 { 934 struct rdma_addrinfo hint, *rai; 935 int ret; 936 937 memset(&hint, 0, sizeof hint); 938 hint.ai_flags = RAI_ROUTEONLY; 939 hint.ai_family = id->route.addr.src_addr.sa_family; 940 hint.ai_src_len = ucma_addrlen((struct sockaddr *) &id->route.addr.src_addr); 941 hint.ai_src_addr = &id->route.addr.src_addr; 942 hint.ai_dst_len = ucma_addrlen((struct sockaddr *) &id->route.addr.dst_addr); 943 hint.ai_dst_addr = &id->route.addr.dst_addr; 944 945 ret = rdma_getaddrinfo(NULL, NULL, &hint, &rai); 946 if (ret) 947 return ret; 948 949 if (rai->ai_route_len) 950 ret = rdma_set_option(id, RDMA_OPTION_IB, RDMA_OPTION_IB_PATH, 951 rai->ai_route, rai->ai_route_len); 952 else 953 ret = -1; 954 955 rdma_freeaddrinfo(rai); 956 return ret; 957 } 958 959 int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms) 960 { 961 struct ucma_abi_resolve_route cmd; 962 struct cma_id_private *id_priv; 963 int ret; 964 965 id_priv = container_of(id, struct cma_id_private, id); 966 if (id->verbs->device->transport_type == IBV_TRANSPORT_IB) { 967 ret = ucma_set_ib_route(id); 968 if (!ret) 969 goto out; 970 } 971 972 CMA_INIT_CMD(&cmd, sizeof cmd, RESOLVE_ROUTE); 973 cmd.id = id_priv->handle; 974 cmd.timeout_ms = timeout_ms; 975 976 ret = write(id->channel->fd, &cmd, sizeof cmd); 977 if (ret != sizeof cmd) 978 return (ret >= 0) ? ERR(ENODATA) : -1; 979 980 out: 981 return ucma_complete(id); 982 } 983 984 static int ucma_is_ud_qp(enum ibv_qp_type qp_type) 985 { 986 return (qp_type == IBV_QPT_UD); 987 } 988 989 static int rdma_init_qp_attr(struct rdma_cm_id *id, struct ibv_qp_attr *qp_attr, 990 int *qp_attr_mask) 991 { 992 struct ucma_abi_init_qp_attr cmd; 993 struct ibv_kern_qp_attr resp; 994 struct cma_id_private *id_priv; 995 int ret; 996 997 CMA_INIT_CMD_RESP(&cmd, sizeof cmd, INIT_QP_ATTR, &resp, sizeof resp); 998 id_priv = container_of(id, struct cma_id_private, id); 999 cmd.id = id_priv->handle; 1000 cmd.qp_state = qp_attr->qp_state; 1001 1002 ret = write(id->channel->fd, &cmd, sizeof cmd); 1003 if (ret != sizeof cmd) 1004 return (ret >= 0) ? ERR(ENODATA) : -1; 1005 1006 VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); 1007 1008 ibv_copy_qp_attr_from_kern(qp_attr, &resp); 1009 *qp_attr_mask = resp.qp_attr_mask; 1010 return 0; 1011 } 1012 1013 static int ucma_modify_qp_rtr(struct rdma_cm_id *id, uint8_t resp_res) 1014 { 1015 struct cma_id_private *id_priv; 1016 struct ibv_qp_attr qp_attr; 1017 int qp_attr_mask, ret; 1018 uint8_t link_layer; 1019 1020 if (!id->qp) 1021 return ERR(EINVAL); 1022 1023 /* Need to update QP attributes from default values. */ 1024 qp_attr.qp_state = IBV_QPS_INIT; 1025 ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask); 1026 if (ret) 1027 return ret; 1028 1029 ret = ibv_modify_qp(id->qp, &qp_attr, qp_attr_mask); 1030 if (ret) 1031 return ERR(ret); 1032 1033 qp_attr.qp_state = IBV_QPS_RTR; 1034 ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask); 1035 if (ret) 1036 return ret; 1037 1038 /* 1039 * Workaround for rdma_ucm kernel bug: 1040 * mask off qp_attr_mask bits 21-24 which are used for RoCE 1041 */ 1042 id_priv = container_of(id, struct cma_id_private, id); 1043 link_layer = id_priv->cma_dev->port[id->port_num - 1].link_layer; 1044 1045 if (link_layer == IBV_LINK_LAYER_INFINIBAND) 1046 qp_attr_mask &= UINT_MAX ^ 0xe00000; 1047 1048 if (resp_res != RDMA_MAX_RESP_RES) 1049 qp_attr.max_dest_rd_atomic = resp_res; 1050 return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, qp_attr_mask)); 1051 } 1052 1053 static int ucma_modify_qp_rts(struct rdma_cm_id *id, uint8_t init_depth) 1054 { 1055 struct ibv_qp_attr qp_attr; 1056 int qp_attr_mask, ret; 1057 1058 qp_attr.qp_state = IBV_QPS_RTS; 1059 ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask); 1060 if (ret) 1061 return ret; 1062 1063 if (init_depth != RDMA_MAX_INIT_DEPTH) 1064 qp_attr.max_rd_atomic = init_depth; 1065 return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, qp_attr_mask)); 1066 } 1067 1068 static int ucma_modify_qp_sqd(struct rdma_cm_id *id) 1069 { 1070 struct ibv_qp_attr qp_attr; 1071 1072 if (!id->qp) 1073 return 0; 1074 1075 qp_attr.qp_state = IBV_QPS_SQD; 1076 return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, IBV_QP_STATE)); 1077 } 1078 1079 static int ucma_modify_qp_err(struct rdma_cm_id *id) 1080 { 1081 struct ibv_qp_attr qp_attr; 1082 1083 if (!id->qp) 1084 return 0; 1085 1086 qp_attr.qp_state = IBV_QPS_ERR; 1087 return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, IBV_QP_STATE)); 1088 } 1089 1090 static int ucma_find_pkey(struct cma_device *cma_dev, uint8_t port_num, 1091 __be16 pkey, uint16_t *pkey_index) 1092 { 1093 int ret, i; 1094 __be16 chk_pkey; 1095 1096 for (i = 0, ret = 0; !ret; i++) { 1097 ret = ibv_query_pkey(cma_dev->verbs, port_num, i, &chk_pkey); 1098 if (!ret && pkey == chk_pkey) { 1099 *pkey_index = (uint16_t) i; 1100 return 0; 1101 } 1102 } 1103 return ERR(EINVAL); 1104 } 1105 1106 static int ucma_init_conn_qp3(struct cma_id_private *id_priv, struct ibv_qp *qp) 1107 { 1108 struct ibv_qp_attr qp_attr; 1109 int ret; 1110 1111 ret = ucma_find_pkey(id_priv->cma_dev, id_priv->id.port_num, 1112 id_priv->id.route.addr.addr.ibaddr.pkey, 1113 &qp_attr.pkey_index); 1114 if (ret) 1115 return ret; 1116 1117 qp_attr.port_num = id_priv->id.port_num; 1118 qp_attr.qp_state = IBV_QPS_INIT; 1119 qp_attr.qp_access_flags = 0; 1120 1121 ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_ACCESS_FLAGS | 1122 IBV_QP_PKEY_INDEX | IBV_QP_PORT); 1123 return rdma_seterrno(ret); 1124 } 1125 1126 static int ucma_init_conn_qp(struct cma_id_private *id_priv, struct ibv_qp *qp) 1127 { 1128 struct ibv_qp_attr qp_attr; 1129 int qp_attr_mask, ret; 1130 1131 if (abi_ver == 3) 1132 return ucma_init_conn_qp3(id_priv, qp); 1133 1134 qp_attr.qp_state = IBV_QPS_INIT; 1135 ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); 1136 if (ret) 1137 return ret; 1138 1139 return rdma_seterrno(ibv_modify_qp(qp, &qp_attr, qp_attr_mask)); 1140 } 1141 1142 static int ucma_init_ud_qp3(struct cma_id_private *id_priv, struct ibv_qp *qp) 1143 { 1144 struct ibv_qp_attr qp_attr; 1145 int ret; 1146 1147 ret = ucma_find_pkey(id_priv->cma_dev, id_priv->id.port_num, 1148 id_priv->id.route.addr.addr.ibaddr.pkey, 1149 &qp_attr.pkey_index); 1150 if (ret) 1151 return ret; 1152 1153 qp_attr.port_num = id_priv->id.port_num; 1154 qp_attr.qp_state = IBV_QPS_INIT; 1155 qp_attr.qkey = RDMA_UDP_QKEY; 1156 1157 ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_QKEY | 1158 IBV_QP_PKEY_INDEX | IBV_QP_PORT); 1159 if (ret) 1160 return ERR(ret); 1161 1162 qp_attr.qp_state = IBV_QPS_RTR; 1163 ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE); 1164 if (ret) 1165 return ERR(ret); 1166 1167 qp_attr.qp_state = IBV_QPS_RTS; 1168 qp_attr.sq_psn = 0; 1169 ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_SQ_PSN); 1170 return rdma_seterrno(ret); 1171 } 1172 1173 static int ucma_init_ud_qp(struct cma_id_private *id_priv, struct ibv_qp *qp) 1174 { 1175 struct ibv_qp_attr qp_attr; 1176 int qp_attr_mask, ret; 1177 1178 if (abi_ver == 3) 1179 return ucma_init_ud_qp3(id_priv, qp); 1180 1181 qp_attr.qp_state = IBV_QPS_INIT; 1182 ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); 1183 if (ret) 1184 return ret; 1185 1186 ret = ibv_modify_qp(qp, &qp_attr, qp_attr_mask); 1187 if (ret) 1188 return ERR(ret); 1189 1190 qp_attr.qp_state = IBV_QPS_RTR; 1191 ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE); 1192 if (ret) 1193 return ERR(ret); 1194 1195 qp_attr.qp_state = IBV_QPS_RTS; 1196 qp_attr.sq_psn = 0; 1197 ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_SQ_PSN); 1198 return rdma_seterrno(ret); 1199 } 1200 1201 static void ucma_destroy_cqs(struct rdma_cm_id *id) 1202 { 1203 if (id->qp_type == IBV_QPT_XRC_RECV && id->srq) 1204 return; 1205 1206 if (id->recv_cq) { 1207 ibv_destroy_cq(id->recv_cq); 1208 if (id->send_cq && (id->send_cq != id->recv_cq)) { 1209 ibv_destroy_cq(id->send_cq); 1210 id->send_cq = NULL; 1211 } 1212 id->recv_cq = NULL; 1213 } 1214 1215 if (id->recv_cq_channel) { 1216 ibv_destroy_comp_channel(id->recv_cq_channel); 1217 if (id->send_cq_channel && (id->send_cq_channel != id->recv_cq_channel)) { 1218 ibv_destroy_comp_channel(id->send_cq_channel); 1219 id->send_cq_channel = NULL; 1220 } 1221 id->recv_cq_channel = NULL; 1222 } 1223 } 1224 1225 static int ucma_create_cqs(struct rdma_cm_id *id, uint32_t send_size, uint32_t recv_size) 1226 { 1227 if (recv_size) { 1228 id->recv_cq_channel = ibv_create_comp_channel(id->verbs); 1229 if (!id->recv_cq_channel) 1230 goto err; 1231 1232 id->recv_cq = ibv_create_cq(id->verbs, recv_size, 1233 id, id->recv_cq_channel, 0); 1234 if (!id->recv_cq) 1235 goto err; 1236 } 1237 1238 if (send_size) { 1239 id->send_cq_channel = ibv_create_comp_channel(id->verbs); 1240 if (!id->send_cq_channel) 1241 goto err; 1242 1243 id->send_cq = ibv_create_cq(id->verbs, send_size, 1244 id, id->send_cq_channel, 0); 1245 if (!id->send_cq) 1246 goto err; 1247 } 1248 1249 return 0; 1250 err: 1251 ucma_destroy_cqs(id); 1252 return ERR(ENOMEM); 1253 } 1254 1255 int rdma_create_srq_ex(struct rdma_cm_id *id, struct ibv_srq_init_attr_ex *attr) 1256 { 1257 struct cma_id_private *id_priv; 1258 struct ibv_srq *srq; 1259 int ret; 1260 1261 id_priv = container_of(id, struct cma_id_private, id); 1262 if (!(attr->comp_mask & IBV_SRQ_INIT_ATTR_TYPE)) 1263 return ERR(EINVAL); 1264 1265 if (!(attr->comp_mask & IBV_SRQ_INIT_ATTR_PD) || !attr->pd) { 1266 attr->pd = id->pd; 1267 attr->comp_mask |= IBV_SRQ_INIT_ATTR_PD; 1268 } 1269 1270 if (attr->srq_type == IBV_SRQT_XRC) { 1271 if (!(attr->comp_mask & IBV_SRQ_INIT_ATTR_XRCD) || !attr->xrcd) { 1272 attr->xrcd = ucma_get_xrcd(id_priv->cma_dev); 1273 if (!attr->xrcd) 1274 return -1; 1275 } 1276 if (!(attr->comp_mask & IBV_SRQ_INIT_ATTR_CQ) || !attr->cq) { 1277 ret = ucma_create_cqs(id, 0, attr->attr.max_wr); 1278 if (ret) 1279 return ret; 1280 attr->cq = id->recv_cq; 1281 } 1282 attr->comp_mask |= IBV_SRQ_INIT_ATTR_XRCD | IBV_SRQ_INIT_ATTR_CQ; 1283 } 1284 1285 srq = ibv_create_srq_ex(id->verbs, attr); 1286 if (!srq) { 1287 ret = -1; 1288 goto err; 1289 } 1290 1291 if (!id->pd) 1292 id->pd = attr->pd; 1293 id->srq = srq; 1294 return 0; 1295 err: 1296 ucma_destroy_cqs(id); 1297 return ret; 1298 } 1299 1300 int rdma_create_srq(struct rdma_cm_id *id, struct ibv_pd *pd, 1301 struct ibv_srq_init_attr *attr) 1302 { 1303 struct ibv_srq_init_attr_ex attr_ex; 1304 int ret; 1305 1306 memcpy(&attr_ex, attr, sizeof(*attr)); 1307 attr_ex.comp_mask = IBV_SRQ_INIT_ATTR_TYPE | IBV_SRQ_INIT_ATTR_PD; 1308 if (id->qp_type == IBV_QPT_XRC_RECV) { 1309 attr_ex.srq_type = IBV_SRQT_XRC; 1310 } else { 1311 attr_ex.srq_type = IBV_SRQT_BASIC; 1312 } 1313 attr_ex.pd = pd; 1314 ret = rdma_create_srq_ex(id, &attr_ex); 1315 memcpy(attr, &attr_ex, sizeof(*attr)); 1316 return ret; 1317 } 1318 1319 void rdma_destroy_srq(struct rdma_cm_id *id) 1320 { 1321 ibv_destroy_srq(id->srq); 1322 id->srq = NULL; 1323 ucma_destroy_cqs(id); 1324 } 1325 1326 int rdma_create_qp_ex(struct rdma_cm_id *id, 1327 struct ibv_qp_init_attr_ex *attr) 1328 { 1329 struct cma_id_private *id_priv; 1330 struct ibv_qp *qp; 1331 int ret; 1332 1333 if (id->qp) 1334 return ERR(EINVAL); 1335 1336 id_priv = container_of(id, struct cma_id_private, id); 1337 if (!(attr->comp_mask & IBV_QP_INIT_ATTR_PD) || !attr->pd) { 1338 attr->comp_mask |= IBV_QP_INIT_ATTR_PD; 1339 attr->pd = id->pd; 1340 } else if (id->verbs != attr->pd->context) 1341 return ERR(EINVAL); 1342 1343 if ((id->recv_cq && attr->recv_cq && id->recv_cq != attr->recv_cq) || 1344 (id->send_cq && attr->send_cq && id->send_cq != attr->send_cq)) 1345 return ERR(EINVAL); 1346 1347 if (id->qp_type == IBV_QPT_XRC_RECV) { 1348 if (!(attr->comp_mask & IBV_QP_INIT_ATTR_XRCD) || !attr->xrcd) { 1349 attr->xrcd = ucma_get_xrcd(id_priv->cma_dev); 1350 if (!attr->xrcd) 1351 return -1; 1352 attr->comp_mask |= IBV_QP_INIT_ATTR_XRCD; 1353 } 1354 } 1355 1356 ret = ucma_create_cqs(id, attr->send_cq || id->send_cq ? 0 : attr->cap.max_send_wr, 1357 attr->recv_cq || id->recv_cq ? 0 : attr->cap.max_recv_wr); 1358 if (ret) 1359 return ret; 1360 1361 if (!attr->send_cq) 1362 attr->send_cq = id->send_cq; 1363 if (!attr->recv_cq) 1364 attr->recv_cq = id->recv_cq; 1365 if (id->srq && !attr->srq) 1366 attr->srq = id->srq; 1367 qp = ibv_create_qp_ex(id->verbs, attr); 1368 if (!qp) { 1369 ret = ERR(ENOMEM); 1370 goto err1; 1371 } 1372 1373 if (ucma_is_ud_qp(id->qp_type)) 1374 ret = ucma_init_ud_qp(id_priv, qp); 1375 else 1376 ret = ucma_init_conn_qp(id_priv, qp); 1377 if (ret) 1378 goto err2; 1379 1380 id->pd = qp->pd; 1381 id->qp = qp; 1382 return 0; 1383 err2: 1384 ibv_destroy_qp(qp); 1385 err1: 1386 ucma_destroy_cqs(id); 1387 return ret; 1388 } 1389 1390 int rdma_create_qp(struct rdma_cm_id *id, struct ibv_pd *pd, 1391 struct ibv_qp_init_attr *qp_init_attr) 1392 { 1393 struct ibv_qp_init_attr_ex attr_ex; 1394 int ret; 1395 1396 memcpy(&attr_ex, qp_init_attr, sizeof(*qp_init_attr)); 1397 attr_ex.comp_mask = IBV_QP_INIT_ATTR_PD; 1398 attr_ex.pd = pd ? pd : id->pd; 1399 ret = rdma_create_qp_ex(id, &attr_ex); 1400 memcpy(qp_init_attr, &attr_ex, sizeof(*qp_init_attr)); 1401 return ret; 1402 } 1403 1404 void rdma_destroy_qp(struct rdma_cm_id *id) 1405 { 1406 ibv_destroy_qp(id->qp); 1407 id->qp = NULL; 1408 ucma_destroy_cqs(id); 1409 } 1410 1411 static int ucma_valid_param(struct cma_id_private *id_priv, 1412 struct rdma_conn_param *param) 1413 { 1414 if (id_priv->id.ps != RDMA_PS_TCP) 1415 return 0; 1416 1417 if (!id_priv->id.qp && !param) 1418 goto err; 1419 1420 if (!param) 1421 return 0; 1422 1423 if ((param->responder_resources != RDMA_MAX_RESP_RES) && 1424 (param->responder_resources > id_priv->cma_dev->max_responder_resources)) 1425 goto err; 1426 1427 if ((param->initiator_depth != RDMA_MAX_INIT_DEPTH) && 1428 (param->initiator_depth > id_priv->cma_dev->max_initiator_depth)) 1429 goto err; 1430 1431 return 0; 1432 err: 1433 return ERR(EINVAL); 1434 } 1435 1436 static void ucma_copy_conn_param_to_kern(struct cma_id_private *id_priv, 1437 struct ucma_abi_conn_param *dst, 1438 struct rdma_conn_param *src, 1439 uint32_t qp_num, uint8_t srq) 1440 { 1441 dst->qp_num = qp_num; 1442 dst->srq = srq; 1443 dst->responder_resources = id_priv->responder_resources; 1444 dst->initiator_depth = id_priv->initiator_depth; 1445 dst->valid = 1; 1446 1447 if (id_priv->connect_len) { 1448 memcpy(dst->private_data, id_priv->connect, id_priv->connect_len); 1449 dst->private_data_len = id_priv->connect_len; 1450 } 1451 1452 if (src) { 1453 dst->flow_control = src->flow_control; 1454 dst->retry_count = src->retry_count; 1455 dst->rnr_retry_count = src->rnr_retry_count; 1456 1457 if (src->private_data && src->private_data_len) { 1458 memcpy(dst->private_data + dst->private_data_len, 1459 src->private_data, src->private_data_len); 1460 dst->private_data_len += src->private_data_len; 1461 } 1462 } else { 1463 dst->retry_count = 7; 1464 dst->rnr_retry_count = 7; 1465 } 1466 } 1467 1468 int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) 1469 { 1470 struct ucma_abi_connect cmd; 1471 struct cma_id_private *id_priv; 1472 int ret; 1473 1474 id_priv = container_of(id, struct cma_id_private, id); 1475 ret = ucma_valid_param(id_priv, conn_param); 1476 if (ret) 1477 return ret; 1478 1479 if (conn_param && conn_param->initiator_depth != RDMA_MAX_INIT_DEPTH) 1480 id_priv->initiator_depth = conn_param->initiator_depth; 1481 else 1482 id_priv->initiator_depth = id_priv->cma_dev->max_initiator_depth; 1483 if (conn_param && conn_param->responder_resources != RDMA_MAX_RESP_RES) 1484 id_priv->responder_resources = conn_param->responder_resources; 1485 else 1486 id_priv->responder_resources = id_priv->cma_dev->max_responder_resources; 1487 1488 CMA_INIT_CMD(&cmd, sizeof cmd, CONNECT); 1489 cmd.id = id_priv->handle; 1490 if (id->qp) { 1491 ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param, 1492 conn_param, id->qp->qp_num, 1493 (id->qp->srq != NULL)); 1494 } else if (conn_param) { 1495 ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param, 1496 conn_param, conn_param->qp_num, 1497 conn_param->srq); 1498 } else { 1499 ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param, 1500 conn_param, 0, 0); 1501 } 1502 1503 ret = write(id->channel->fd, &cmd, sizeof cmd); 1504 if (ret != sizeof cmd) 1505 return (ret >= 0) ? ERR(ENODATA) : -1; 1506 1507 if (id_priv->connect_len) { 1508 free(id_priv->connect); 1509 id_priv->connect_len = 0; 1510 } 1511 1512 return ucma_complete(id); 1513 } 1514 1515 int rdma_listen(struct rdma_cm_id *id, int backlog) 1516 { 1517 struct ucma_abi_listen cmd; 1518 struct cma_id_private *id_priv; 1519 int ret; 1520 1521 CMA_INIT_CMD(&cmd, sizeof cmd, LISTEN); 1522 id_priv = container_of(id, struct cma_id_private, id); 1523 cmd.id = id_priv->handle; 1524 cmd.backlog = backlog; 1525 1526 ret = write(id->channel->fd, &cmd, sizeof cmd); 1527 if (ret != sizeof cmd) 1528 return (ret >= 0) ? ERR(ENODATA) : -1; 1529 1530 if (af_ib_support) 1531 return ucma_query_addr(id); 1532 else 1533 return ucma_query_route(id); 1534 } 1535 1536 int rdma_get_request(struct rdma_cm_id *listen, struct rdma_cm_id **id) 1537 { 1538 struct cma_id_private *id_priv; 1539 struct rdma_cm_event *event; 1540 int ret; 1541 1542 id_priv = container_of(listen, struct cma_id_private, id); 1543 if (!id_priv->sync) 1544 return ERR(EINVAL); 1545 1546 if (listen->event) { 1547 rdma_ack_cm_event(listen->event); 1548 listen->event = NULL; 1549 } 1550 1551 ret = rdma_get_cm_event(listen->channel, &event); 1552 if (ret) 1553 return ret; 1554 1555 if (event->status) { 1556 ret = ERR(event->status); 1557 goto err; 1558 } 1559 1560 if (event->event != RDMA_CM_EVENT_CONNECT_REQUEST) { 1561 ret = ERR(EINVAL); 1562 goto err; 1563 } 1564 1565 if (id_priv->qp_init_attr) { 1566 struct ibv_qp_init_attr attr; 1567 1568 attr = *id_priv->qp_init_attr; 1569 ret = rdma_create_qp(event->id, listen->pd, &attr); 1570 if (ret) 1571 goto err; 1572 } 1573 1574 *id = event->id; 1575 (*id)->event = event; 1576 return 0; 1577 1578 err: 1579 listen->event = event; 1580 return ret; 1581 } 1582 1583 int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) 1584 { 1585 struct ucma_abi_accept cmd; 1586 struct cma_id_private *id_priv; 1587 int ret; 1588 1589 id_priv = container_of(id, struct cma_id_private, id); 1590 ret = ucma_valid_param(id_priv, conn_param); 1591 if (ret) 1592 return ret; 1593 1594 if (!conn_param || conn_param->initiator_depth == RDMA_MAX_INIT_DEPTH) { 1595 id_priv->initiator_depth = min(id_priv->initiator_depth, 1596 id_priv->cma_dev->max_initiator_depth); 1597 } else { 1598 id_priv->initiator_depth = conn_param->initiator_depth; 1599 } 1600 if (!conn_param || conn_param->responder_resources == RDMA_MAX_RESP_RES) { 1601 id_priv->responder_resources = min(id_priv->responder_resources, 1602 id_priv->cma_dev->max_responder_resources); 1603 } else { 1604 id_priv->responder_resources = conn_param->responder_resources; 1605 } 1606 1607 if (!ucma_is_ud_qp(id->qp_type)) { 1608 ret = ucma_modify_qp_rtr(id, id_priv->responder_resources); 1609 if (ret) 1610 return ret; 1611 1612 ret = ucma_modify_qp_rts(id, id_priv->initiator_depth); 1613 if (ret) 1614 return ret; 1615 } 1616 1617 CMA_INIT_CMD(&cmd, sizeof cmd, ACCEPT); 1618 cmd.id = id_priv->handle; 1619 cmd.uid = (uintptr_t) id_priv; 1620 if (id->qp) 1621 ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param, 1622 conn_param, id->qp->qp_num, 1623 (id->qp->srq != NULL)); 1624 else 1625 ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param, 1626 conn_param, conn_param->qp_num, 1627 conn_param->srq); 1628 1629 ret = write(id->channel->fd, &cmd, sizeof cmd); 1630 if (ret != sizeof cmd) { 1631 ucma_modify_qp_err(id); 1632 return (ret >= 0) ? ERR(ENODATA) : -1; 1633 } 1634 1635 if (ucma_is_ud_qp(id->qp_type)) 1636 return 0; 1637 1638 return ucma_complete(id); 1639 } 1640 1641 int rdma_reject(struct rdma_cm_id *id, const void *private_data, 1642 uint8_t private_data_len) 1643 { 1644 struct ucma_abi_reject cmd; 1645 struct cma_id_private *id_priv; 1646 int ret; 1647 1648 CMA_INIT_CMD(&cmd, sizeof cmd, REJECT); 1649 1650 id_priv = container_of(id, struct cma_id_private, id); 1651 cmd.id = id_priv->handle; 1652 if (private_data && private_data_len) { 1653 memcpy(cmd.private_data, private_data, private_data_len); 1654 cmd.private_data_len = private_data_len; 1655 } 1656 1657 ret = write(id->channel->fd, &cmd, sizeof cmd); 1658 if (ret != sizeof cmd) 1659 return (ret >= 0) ? ERR(ENODATA) : -1; 1660 1661 return 0; 1662 } 1663 1664 int rdma_notify(struct rdma_cm_id *id, enum ibv_event_type event) 1665 { 1666 struct ucma_abi_notify cmd; 1667 struct cma_id_private *id_priv; 1668 int ret; 1669 1670 CMA_INIT_CMD(&cmd, sizeof cmd, NOTIFY); 1671 1672 id_priv = container_of(id, struct cma_id_private, id); 1673 cmd.id = id_priv->handle; 1674 cmd.event = event; 1675 ret = write(id->channel->fd, &cmd, sizeof cmd); 1676 if (ret != sizeof cmd) 1677 return (ret >= 0) ? ERR(ENODATA) : -1; 1678 1679 return 0; 1680 } 1681 1682 int ucma_shutdown(struct rdma_cm_id *id) 1683 { 1684 switch (id->verbs->device->transport_type) { 1685 case IBV_TRANSPORT_IB: 1686 return ucma_modify_qp_err(id); 1687 case IBV_TRANSPORT_IWARP: 1688 return ucma_modify_qp_sqd(id); 1689 default: 1690 return ERR(EINVAL); 1691 } 1692 } 1693 1694 int rdma_disconnect(struct rdma_cm_id *id) 1695 { 1696 struct ucma_abi_disconnect cmd; 1697 struct cma_id_private *id_priv; 1698 int ret; 1699 1700 ret = ucma_shutdown(id); 1701 if (ret) 1702 return ret; 1703 1704 CMA_INIT_CMD(&cmd, sizeof cmd, DISCONNECT); 1705 id_priv = container_of(id, struct cma_id_private, id); 1706 cmd.id = id_priv->handle; 1707 1708 ret = write(id->channel->fd, &cmd, sizeof cmd); 1709 if (ret != sizeof cmd) 1710 return (ret >= 0) ? ERR(ENODATA) : -1; 1711 1712 return ucma_complete(id); 1713 } 1714 1715 static int rdma_join_multicast2(struct rdma_cm_id *id, struct sockaddr *addr, 1716 socklen_t addrlen, void *context) 1717 { 1718 struct ucma_abi_create_id_resp resp; 1719 struct cma_id_private *id_priv; 1720 struct cma_multicast *mc, **pos; 1721 int ret; 1722 1723 id_priv = container_of(id, struct cma_id_private, id); 1724 mc = calloc(1, sizeof(*mc)); 1725 if (!mc) 1726 return ERR(ENOMEM); 1727 1728 mc->context = context; 1729 mc->id_priv = id_priv; 1730 memcpy(&mc->addr, addr, addrlen); 1731 if (pthread_cond_init(&mc->cond, NULL)) { 1732 ret = -1; 1733 goto err1; 1734 } 1735 1736 pthread_mutex_lock(&id_priv->mut); 1737 mc->next = id_priv->mc_list; 1738 id_priv->mc_list = mc; 1739 pthread_mutex_unlock(&id_priv->mut); 1740 1741 if (af_ib_support) { 1742 struct ucma_abi_join_mcast cmd; 1743 1744 CMA_INIT_CMD_RESP(&cmd, sizeof cmd, JOIN_MCAST, &resp, sizeof resp); 1745 cmd.id = id_priv->handle; 1746 memcpy(&cmd.addr, addr, addrlen); 1747 cmd.addr_size = addrlen; 1748 cmd.uid = (uintptr_t) mc; 1749 cmd.reserved = 0; 1750 1751 ret = write(id->channel->fd, &cmd, sizeof cmd); 1752 if (ret != sizeof cmd) { 1753 ret = (ret >= 0) ? ERR(ENODATA) : -1; 1754 goto err2; 1755 } 1756 } else { 1757 struct ucma_abi_join_ip_mcast cmd; 1758 1759 CMA_INIT_CMD_RESP(&cmd, sizeof cmd, JOIN_IP_MCAST, &resp, sizeof resp); 1760 cmd.id = id_priv->handle; 1761 memcpy(&cmd.addr, addr, addrlen); 1762 cmd.uid = (uintptr_t) mc; 1763 1764 ret = write(id->channel->fd, &cmd, sizeof cmd); 1765 if (ret != sizeof cmd) { 1766 ret = (ret >= 0) ? ERR(ENODATA) : -1; 1767 goto err2; 1768 } 1769 } 1770 1771 VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); 1772 1773 mc->handle = resp.id; 1774 return ucma_complete(id); 1775 1776 err2: 1777 pthread_mutex_lock(&id_priv->mut); 1778 for (pos = &id_priv->mc_list; *pos != mc; pos = &(*pos)->next) 1779 ; 1780 *pos = mc->next; 1781 pthread_mutex_unlock(&id_priv->mut); 1782 err1: 1783 free(mc); 1784 return ret; 1785 } 1786 1787 int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, 1788 void *context) 1789 { 1790 int addrlen; 1791 1792 addrlen = ucma_addrlen(addr); 1793 if (!addrlen) 1794 return ERR(EINVAL); 1795 1796 return rdma_join_multicast2(id, addr, addrlen, context); 1797 } 1798 1799 int rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) 1800 { 1801 struct ucma_abi_destroy_id cmd; 1802 struct ucma_abi_destroy_id_resp resp; 1803 struct cma_id_private *id_priv; 1804 struct cma_multicast *mc, **pos; 1805 int ret, addrlen; 1806 1807 addrlen = ucma_addrlen(addr); 1808 if (!addrlen) 1809 return ERR(EINVAL); 1810 1811 id_priv = container_of(id, struct cma_id_private, id); 1812 pthread_mutex_lock(&id_priv->mut); 1813 for (pos = &id_priv->mc_list; *pos; pos = &(*pos)->next) 1814 if (!memcmp(&(*pos)->addr, addr, addrlen)) 1815 break; 1816 1817 mc = *pos; 1818 if (*pos) 1819 *pos = mc->next; 1820 pthread_mutex_unlock(&id_priv->mut); 1821 if (!mc) 1822 return ERR(EADDRNOTAVAIL); 1823 1824 if (id->qp) 1825 ibv_detach_mcast(id->qp, &mc->mgid, mc->mlid); 1826 1827 CMA_INIT_CMD_RESP(&cmd, sizeof cmd, LEAVE_MCAST, &resp, sizeof resp); 1828 cmd.id = mc->handle; 1829 1830 ret = write(id->channel->fd, &cmd, sizeof cmd); 1831 if (ret != sizeof cmd) { 1832 ret = (ret >= 0) ? ERR(ENODATA) : -1; 1833 goto free; 1834 } 1835 1836 VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); 1837 1838 pthread_mutex_lock(&id_priv->mut); 1839 while (mc->events_completed < resp.events_reported) 1840 pthread_cond_wait(&mc->cond, &id_priv->mut); 1841 pthread_mutex_unlock(&id_priv->mut); 1842 1843 ret = 0; 1844 free: 1845 free(mc); 1846 return ret; 1847 } 1848 1849 static void ucma_complete_event(struct cma_id_private *id_priv) 1850 { 1851 pthread_mutex_lock(&id_priv->mut); 1852 id_priv->events_completed++; 1853 pthread_cond_signal(&id_priv->cond); 1854 pthread_mutex_unlock(&id_priv->mut); 1855 } 1856 1857 static void ucma_complete_mc_event(struct cma_multicast *mc) 1858 { 1859 pthread_mutex_lock(&mc->id_priv->mut); 1860 mc->events_completed++; 1861 pthread_cond_signal(&mc->cond); 1862 mc->id_priv->events_completed++; 1863 pthread_cond_signal(&mc->id_priv->cond); 1864 pthread_mutex_unlock(&mc->id_priv->mut); 1865 } 1866 1867 int rdma_ack_cm_event(struct rdma_cm_event *event) 1868 { 1869 struct cma_event *evt; 1870 1871 if (!event) 1872 return ERR(EINVAL); 1873 1874 evt = container_of(event, struct cma_event, event); 1875 1876 if (evt->mc) 1877 ucma_complete_mc_event(evt->mc); 1878 else 1879 ucma_complete_event(evt->id_priv); 1880 free(evt); 1881 return 0; 1882 } 1883 1884 static void ucma_process_addr_resolved(struct cma_event *evt) 1885 { 1886 if (af_ib_support) { 1887 evt->event.status = ucma_query_addr(&evt->id_priv->id); 1888 if (!evt->event.status && 1889 evt->id_priv->id.verbs->device->transport_type == IBV_TRANSPORT_IB) 1890 evt->event.status = ucma_query_gid(&evt->id_priv->id); 1891 } else { 1892 evt->event.status = ucma_query_route(&evt->id_priv->id); 1893 } 1894 1895 if (evt->event.status) 1896 evt->event.event = RDMA_CM_EVENT_ADDR_ERROR; 1897 } 1898 1899 static void ucma_process_route_resolved(struct cma_event *evt) 1900 { 1901 if (evt->id_priv->id.verbs->device->transport_type != IBV_TRANSPORT_IB) 1902 return; 1903 1904 if (af_ib_support) 1905 evt->event.status = ucma_query_path(&evt->id_priv->id); 1906 else 1907 evt->event.status = ucma_query_route(&evt->id_priv->id); 1908 1909 if (evt->event.status) 1910 evt->event.event = RDMA_CM_EVENT_ROUTE_ERROR; 1911 } 1912 1913 static int ucma_query_req_info(struct rdma_cm_id *id) 1914 { 1915 int ret; 1916 1917 if (!af_ib_support) 1918 return ucma_query_route(id); 1919 1920 ret = ucma_query_addr(id); 1921 if (ret) 1922 return ret; 1923 1924 ret = ucma_query_gid(id); 1925 if (ret) 1926 return ret; 1927 1928 ret = ucma_query_path(id); 1929 if (ret) 1930 return ret; 1931 1932 return 0; 1933 } 1934 1935 static int ucma_process_conn_req(struct cma_event *evt, 1936 uint32_t handle) 1937 { 1938 struct cma_id_private *id_priv; 1939 int ret; 1940 1941 id_priv = ucma_alloc_id(evt->id_priv->id.channel, 1942 evt->id_priv->id.context, evt->id_priv->id.ps, 1943 evt->id_priv->id.qp_type); 1944 if (!id_priv) { 1945 ucma_destroy_kern_id(evt->id_priv->id.channel->fd, handle); 1946 ret = ERR(ENOMEM); 1947 goto err1; 1948 } 1949 1950 evt->event.listen_id = &evt->id_priv->id; 1951 evt->event.id = &id_priv->id; 1952 id_priv->handle = handle; 1953 ucma_insert_id(id_priv); 1954 id_priv->initiator_depth = evt->event.param.conn.initiator_depth; 1955 id_priv->responder_resources = evt->event.param.conn.responder_resources; 1956 1957 if (evt->id_priv->sync) { 1958 ret = rdma_migrate_id(&id_priv->id, NULL); 1959 if (ret) 1960 goto err2; 1961 } 1962 1963 ret = ucma_query_req_info(&id_priv->id); 1964 if (ret) 1965 goto err2; 1966 1967 return 0; 1968 1969 err2: 1970 rdma_destroy_id(&id_priv->id); 1971 err1: 1972 ucma_complete_event(evt->id_priv); 1973 return ret; 1974 } 1975 1976 static int ucma_process_conn_resp(struct cma_id_private *id_priv) 1977 { 1978 struct ucma_abi_accept cmd; 1979 int ret; 1980 1981 ret = ucma_modify_qp_rtr(&id_priv->id, RDMA_MAX_RESP_RES); 1982 if (ret) 1983 goto err; 1984 1985 ret = ucma_modify_qp_rts(&id_priv->id, RDMA_MAX_INIT_DEPTH); 1986 if (ret) 1987 goto err; 1988 1989 CMA_INIT_CMD(&cmd, sizeof cmd, ACCEPT); 1990 cmd.id = id_priv->handle; 1991 1992 ret = write(id_priv->id.channel->fd, &cmd, sizeof cmd); 1993 if (ret != sizeof cmd) { 1994 ret = (ret >= 0) ? ERR(ENODATA) : -1; 1995 goto err; 1996 } 1997 1998 return 0; 1999 err: 2000 ucma_modify_qp_err(&id_priv->id); 2001 return ret; 2002 } 2003 2004 static int ucma_process_join(struct cma_event *evt) 2005 { 2006 evt->mc->mgid = evt->event.param.ud.ah_attr.grh.dgid; 2007 evt->mc->mlid = evt->event.param.ud.ah_attr.dlid; 2008 2009 if (!evt->id_priv->id.qp) 2010 return 0; 2011 2012 return rdma_seterrno(ibv_attach_mcast(evt->id_priv->id.qp, 2013 &evt->mc->mgid, evt->mc->mlid)); 2014 } 2015 2016 static void ucma_copy_conn_event(struct cma_event *event, 2017 struct ucma_abi_conn_param *src) 2018 { 2019 struct rdma_conn_param *dst = &event->event.param.conn; 2020 2021 dst->private_data_len = src->private_data_len; 2022 if (src->private_data_len) { 2023 dst->private_data = &event->private_data; 2024 memcpy(&event->private_data, src->private_data, 2025 src->private_data_len); 2026 } 2027 2028 dst->responder_resources = src->responder_resources; 2029 dst->initiator_depth = src->initiator_depth; 2030 dst->flow_control = src->flow_control; 2031 dst->retry_count = src->retry_count; 2032 dst->rnr_retry_count = src->rnr_retry_count; 2033 dst->srq = src->srq; 2034 dst->qp_num = src->qp_num; 2035 } 2036 2037 static void ucma_copy_ud_event(struct cma_event *event, 2038 struct ucma_abi_ud_param *src) 2039 { 2040 struct rdma_ud_param *dst = &event->event.param.ud; 2041 2042 dst->private_data_len = src->private_data_len; 2043 if (src->private_data_len) { 2044 dst->private_data = &event->private_data; 2045 memcpy(&event->private_data, src->private_data, 2046 src->private_data_len); 2047 } 2048 2049 ibv_copy_ah_attr_from_kern(&dst->ah_attr, &src->ah_attr); 2050 dst->qp_num = src->qp_num; 2051 dst->qkey = src->qkey; 2052 } 2053 2054 int rdma_get_cm_event(struct rdma_event_channel *channel, 2055 struct rdma_cm_event **event) 2056 { 2057 struct ucma_abi_event_resp resp; 2058 struct ucma_abi_get_event cmd; 2059 struct cma_event *evt; 2060 int ret; 2061 2062 ret = ucma_init(); 2063 if (ret) 2064 return ret; 2065 2066 if (!event) 2067 return ERR(EINVAL); 2068 2069 evt = malloc(sizeof(*evt)); 2070 if (!evt) 2071 return ERR(ENOMEM); 2072 2073 retry: 2074 memset(evt, 0, sizeof(*evt)); 2075 CMA_INIT_CMD_RESP(&cmd, sizeof cmd, GET_EVENT, &resp, sizeof resp); 2076 ret = write(channel->fd, &cmd, sizeof cmd); 2077 if (ret != sizeof cmd) { 2078 free(evt); 2079 return (ret >= 0) ? ERR(ENODATA) : -1; 2080 } 2081 2082 VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); 2083 2084 evt->event.event = resp.event; 2085 /* 2086 * We should have a non-zero uid, except for connection requests. 2087 * But a bug in older kernels can report a uid 0. Work-around this 2088 * issue by looking up the cma_id based on the kernel's id when the 2089 * uid is 0 and we're processing a connection established event. 2090 * In all other cases, if the uid is 0, we discard the event, like 2091 * the kernel should have done. 2092 */ 2093 if (resp.uid) { 2094 evt->id_priv = (void *) (uintptr_t) resp.uid; 2095 } else { 2096 evt->id_priv = ucma_lookup_id(resp.id); 2097 if (!evt->id_priv) { 2098 syslog(LOG_WARNING, PFX "Warning: discarding unmatched " 2099 "event - rdma_destroy_id may hang.\n"); 2100 goto retry; 2101 } 2102 if (resp.event != RDMA_CM_EVENT_ESTABLISHED) { 2103 ucma_complete_event(evt->id_priv); 2104 goto retry; 2105 } 2106 } 2107 evt->event.id = &evt->id_priv->id; 2108 evt->event.status = resp.status; 2109 2110 switch (resp.event) { 2111 case RDMA_CM_EVENT_ADDR_RESOLVED: 2112 ucma_process_addr_resolved(evt); 2113 break; 2114 case RDMA_CM_EVENT_ROUTE_RESOLVED: 2115 ucma_process_route_resolved(evt); 2116 break; 2117 case RDMA_CM_EVENT_CONNECT_REQUEST: 2118 evt->id_priv = (void *) (uintptr_t) resp.uid; 2119 if (ucma_is_ud_qp(evt->id_priv->id.qp_type)) 2120 ucma_copy_ud_event(evt, &resp.param.ud); 2121 else 2122 ucma_copy_conn_event(evt, &resp.param.conn); 2123 2124 ret = ucma_process_conn_req(evt, resp.id); 2125 if (ret) 2126 goto retry; 2127 break; 2128 case RDMA_CM_EVENT_CONNECT_RESPONSE: 2129 ucma_copy_conn_event(evt, &resp.param.conn); 2130 evt->event.status = ucma_process_conn_resp(evt->id_priv); 2131 if (!evt->event.status) 2132 evt->event.event = RDMA_CM_EVENT_ESTABLISHED; 2133 else { 2134 evt->event.event = RDMA_CM_EVENT_CONNECT_ERROR; 2135 evt->id_priv->connect_error = 1; 2136 } 2137 break; 2138 case RDMA_CM_EVENT_ESTABLISHED: 2139 if (ucma_is_ud_qp(evt->id_priv->id.qp_type)) { 2140 ucma_copy_ud_event(evt, &resp.param.ud); 2141 break; 2142 } 2143 2144 ucma_copy_conn_event(evt, &resp.param.conn); 2145 break; 2146 case RDMA_CM_EVENT_REJECTED: 2147 if (evt->id_priv->connect_error) { 2148 ucma_complete_event(evt->id_priv); 2149 goto retry; 2150 } 2151 ucma_copy_conn_event(evt, &resp.param.conn); 2152 ucma_modify_qp_err(evt->event.id); 2153 break; 2154 case RDMA_CM_EVENT_DISCONNECTED: 2155 if (evt->id_priv->connect_error) { 2156 ucma_complete_event(evt->id_priv); 2157 goto retry; 2158 } 2159 ucma_copy_conn_event(evt, &resp.param.conn); 2160 break; 2161 case RDMA_CM_EVENT_MULTICAST_JOIN: 2162 evt->mc = (void *) (uintptr_t) resp.uid; 2163 evt->id_priv = evt->mc->id_priv; 2164 evt->event.id = &evt->id_priv->id; 2165 ucma_copy_ud_event(evt, &resp.param.ud); 2166 evt->event.param.ud.private_data = evt->mc->context; 2167 evt->event.status = ucma_process_join(evt); 2168 if (evt->event.status) 2169 evt->event.event = RDMA_CM_EVENT_MULTICAST_ERROR; 2170 break; 2171 case RDMA_CM_EVENT_MULTICAST_ERROR: 2172 evt->mc = (void *) (uintptr_t) resp.uid; 2173 evt->id_priv = evt->mc->id_priv; 2174 evt->event.id = &evt->id_priv->id; 2175 evt->event.param.ud.private_data = evt->mc->context; 2176 break; 2177 default: 2178 evt->id_priv = (void *) (uintptr_t) resp.uid; 2179 evt->event.id = &evt->id_priv->id; 2180 evt->event.status = resp.status; 2181 if (ucma_is_ud_qp(evt->id_priv->id.qp_type)) 2182 ucma_copy_ud_event(evt, &resp.param.ud); 2183 else 2184 ucma_copy_conn_event(evt, &resp.param.conn); 2185 break; 2186 } 2187 2188 *event = &evt->event; 2189 return 0; 2190 } 2191 2192 const char *rdma_event_str(enum rdma_cm_event_type event) 2193 { 2194 switch (event) { 2195 case RDMA_CM_EVENT_ADDR_RESOLVED: 2196 return "RDMA_CM_EVENT_ADDR_RESOLVED"; 2197 case RDMA_CM_EVENT_ADDR_ERROR: 2198 return "RDMA_CM_EVENT_ADDR_ERROR"; 2199 case RDMA_CM_EVENT_ROUTE_RESOLVED: 2200 return "RDMA_CM_EVENT_ROUTE_RESOLVED"; 2201 case RDMA_CM_EVENT_ROUTE_ERROR: 2202 return "RDMA_CM_EVENT_ROUTE_ERROR"; 2203 case RDMA_CM_EVENT_CONNECT_REQUEST: 2204 return "RDMA_CM_EVENT_CONNECT_REQUEST"; 2205 case RDMA_CM_EVENT_CONNECT_RESPONSE: 2206 return "RDMA_CM_EVENT_CONNECT_RESPONSE"; 2207 case RDMA_CM_EVENT_CONNECT_ERROR: 2208 return "RDMA_CM_EVENT_CONNECT_ERROR"; 2209 case RDMA_CM_EVENT_UNREACHABLE: 2210 return "RDMA_CM_EVENT_UNREACHABLE"; 2211 case RDMA_CM_EVENT_REJECTED: 2212 return "RDMA_CM_EVENT_REJECTED"; 2213 case RDMA_CM_EVENT_ESTABLISHED: 2214 return "RDMA_CM_EVENT_ESTABLISHED"; 2215 case RDMA_CM_EVENT_DISCONNECTED: 2216 return "RDMA_CM_EVENT_DISCONNECTED"; 2217 case RDMA_CM_EVENT_DEVICE_REMOVAL: 2218 return "RDMA_CM_EVENT_DEVICE_REMOVAL"; 2219 case RDMA_CM_EVENT_MULTICAST_JOIN: 2220 return "RDMA_CM_EVENT_MULTICAST_JOIN"; 2221 case RDMA_CM_EVENT_MULTICAST_ERROR: 2222 return "RDMA_CM_EVENT_MULTICAST_ERROR"; 2223 case RDMA_CM_EVENT_ADDR_CHANGE: 2224 return "RDMA_CM_EVENT_ADDR_CHANGE"; 2225 case RDMA_CM_EVENT_TIMEWAIT_EXIT: 2226 return "RDMA_CM_EVENT_TIMEWAIT_EXIT"; 2227 default: 2228 return "UNKNOWN EVENT"; 2229 } 2230 } 2231 2232 int rdma_set_option(struct rdma_cm_id *id, int level, int optname, 2233 void *optval, size_t optlen) 2234 { 2235 struct ucma_abi_set_option cmd; 2236 struct cma_id_private *id_priv; 2237 int ret; 2238 2239 CMA_INIT_CMD(&cmd, sizeof cmd, SET_OPTION); 2240 id_priv = container_of(id, struct cma_id_private, id); 2241 cmd.id = id_priv->handle; 2242 cmd.optval = (uintptr_t) optval; 2243 cmd.level = level; 2244 cmd.optname = optname; 2245 cmd.optlen = optlen; 2246 2247 ret = write(id->channel->fd, &cmd, sizeof cmd); 2248 if (ret != sizeof cmd) 2249 return (ret >= 0) ? ERR(ENODATA) : -1; 2250 2251 return 0; 2252 } 2253 2254 int rdma_migrate_id(struct rdma_cm_id *id, struct rdma_event_channel *channel) 2255 { 2256 struct ucma_abi_migrate_resp resp; 2257 struct ucma_abi_migrate_id cmd; 2258 struct cma_id_private *id_priv; 2259 int ret, sync; 2260 2261 id_priv = container_of(id, struct cma_id_private, id); 2262 if (id_priv->sync && !channel) 2263 return ERR(EINVAL); 2264 2265 if ((sync = (channel == NULL))) { 2266 channel = rdma_create_event_channel(); 2267 if (!channel) 2268 return -1; 2269 } 2270 2271 CMA_INIT_CMD_RESP(&cmd, sizeof cmd, MIGRATE_ID, &resp, sizeof resp); 2272 cmd.id = id_priv->handle; 2273 cmd.fd = id->channel->fd; 2274 2275 ret = write(channel->fd, &cmd, sizeof cmd); 2276 if (ret != sizeof cmd) { 2277 if (sync) 2278 rdma_destroy_event_channel(channel); 2279 return (ret >= 0) ? ERR(ENODATA) : -1; 2280 } 2281 2282 VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); 2283 2284 if (id_priv->sync) { 2285 if (id->event) { 2286 rdma_ack_cm_event(id->event); 2287 id->event = NULL; 2288 } 2289 rdma_destroy_event_channel(id->channel); 2290 } 2291 2292 /* 2293 * Eventually if we want to support migrating channels while events are 2294 * being processed on the current channel, we need to block here while 2295 * there are any outstanding events on the current channel for this id 2296 * to prevent the user from processing events for this id on the old 2297 * channel after this call returns. 2298 */ 2299 pthread_mutex_lock(&id_priv->mut); 2300 id_priv->sync = sync; 2301 id->channel = channel; 2302 while (id_priv->events_completed < resp.events_reported) 2303 pthread_cond_wait(&id_priv->cond, &id_priv->mut); 2304 pthread_mutex_unlock(&id_priv->mut); 2305 2306 return 0; 2307 } 2308 2309 static int ucma_passive_ep(struct rdma_cm_id *id, struct rdma_addrinfo *res, 2310 struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr) 2311 { 2312 struct cma_id_private *id_priv; 2313 int ret; 2314 2315 if (af_ib_support) 2316 ret = rdma_bind_addr2(id, res->ai_src_addr, res->ai_src_len); 2317 else 2318 ret = rdma_bind_addr(id, res->ai_src_addr); 2319 if (ret) 2320 return ret; 2321 2322 id_priv = container_of(id, struct cma_id_private, id); 2323 if (pd) 2324 id->pd = pd; 2325 2326 if (qp_init_attr) { 2327 id_priv->qp_init_attr = malloc(sizeof(*qp_init_attr)); 2328 if (!id_priv->qp_init_attr) 2329 return ERR(ENOMEM); 2330 2331 *id_priv->qp_init_attr = *qp_init_attr; 2332 id_priv->qp_init_attr->qp_type = res->ai_qp_type; 2333 } 2334 2335 return 0; 2336 } 2337 2338 int rdma_create_ep(struct rdma_cm_id **id, struct rdma_addrinfo *res, 2339 struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr) 2340 { 2341 struct rdma_cm_id *cm_id; 2342 struct cma_id_private *id_priv; 2343 int ret; 2344 2345 ret = rdma_create_id2(NULL, &cm_id, NULL, res->ai_port_space, res->ai_qp_type); 2346 if (ret) 2347 return ret; 2348 2349 if (res->ai_flags & RAI_PASSIVE) { 2350 ret = ucma_passive_ep(cm_id, res, pd, qp_init_attr); 2351 if (ret) 2352 goto err; 2353 goto out; 2354 } 2355 2356 if (af_ib_support) 2357 ret = rdma_resolve_addr2(cm_id, res->ai_src_addr, res->ai_src_len, 2358 res->ai_dst_addr, res->ai_dst_len, 2000); 2359 else 2360 ret = rdma_resolve_addr(cm_id, res->ai_src_addr, res->ai_dst_addr, 2000); 2361 if (ret) 2362 goto err; 2363 2364 if (res->ai_route_len) { 2365 ret = rdma_set_option(cm_id, RDMA_OPTION_IB, RDMA_OPTION_IB_PATH, 2366 res->ai_route, res->ai_route_len); 2367 if (!ret) 2368 ret = ucma_complete(cm_id); 2369 } else { 2370 ret = rdma_resolve_route(cm_id, 2000); 2371 } 2372 if (ret) 2373 goto err; 2374 2375 if (qp_init_attr) { 2376 qp_init_attr->qp_type = res->ai_qp_type; 2377 ret = rdma_create_qp(cm_id, pd, qp_init_attr); 2378 if (ret) 2379 goto err; 2380 } 2381 2382 if (res->ai_connect_len) { 2383 id_priv = container_of(cm_id, struct cma_id_private, id); 2384 id_priv->connect = malloc(res->ai_connect_len); 2385 if (!id_priv->connect) { 2386 ret = ERR(ENOMEM); 2387 goto err; 2388 } 2389 memcpy(id_priv->connect, res->ai_connect, res->ai_connect_len); 2390 id_priv->connect_len = res->ai_connect_len; 2391 } 2392 2393 out: 2394 *id = cm_id; 2395 return 0; 2396 2397 err: 2398 rdma_destroy_ep(cm_id); 2399 return ret; 2400 } 2401 2402 void rdma_destroy_ep(struct rdma_cm_id *id) 2403 { 2404 struct cma_id_private *id_priv; 2405 2406 if (id->qp) 2407 rdma_destroy_qp(id); 2408 2409 if (id->srq) 2410 rdma_destroy_srq(id); 2411 2412 id_priv = container_of(id, struct cma_id_private, id); 2413 if (id_priv->qp_init_attr) 2414 free(id_priv->qp_init_attr); 2415 2416 rdma_destroy_id(id); 2417 } 2418 2419 int ucma_max_qpsize(struct rdma_cm_id *id) 2420 { 2421 struct cma_id_private *id_priv; 2422 int i, max_size = 0; 2423 2424 id_priv = container_of(id, struct cma_id_private, id); 2425 if (id && id_priv->cma_dev) { 2426 max_size = id_priv->cma_dev->max_qpsize; 2427 } else { 2428 ucma_init_all(); 2429 for (i = 0; i < cma_dev_cnt; i++) { 2430 if (!max_size || max_size > cma_dev_array[i].max_qpsize) 2431 max_size = cma_dev_array[i].max_qpsize; 2432 } 2433 } 2434 return max_size; 2435 } 2436 2437 __be16 ucma_get_port(struct sockaddr *addr) 2438 { 2439 switch (addr->sa_family) { 2440 case AF_INET: 2441 return ((struct sockaddr_in *) addr)->sin_port; 2442 case AF_INET6: 2443 return ((struct sockaddr_in6 *) addr)->sin6_port; 2444 case AF_IB: 2445 return htobe16((uint16_t) be64toh(((struct sockaddr_ib *) addr)->sib_sid)); 2446 default: 2447 return 0; 2448 } 2449 } 2450 2451 __be16 rdma_get_src_port(struct rdma_cm_id *id) 2452 { 2453 return ucma_get_port(&id->route.addr.src_addr); 2454 } 2455 2456 __be16 rdma_get_dst_port(struct rdma_cm_id *id) 2457 { 2458 return ucma_get_port(&id->route.addr.dst_addr); 2459 } 2460 2461