1 /* 2 * Copyright (c) 2005-2014 Intel Corporation. All rights reserved. 3 * 4 * This software is available to you under a choice of one of two 5 * licenses. You may choose to be licensed under the terms of the GNU 6 * General Public License (GPL) Version 2, available from the file 7 * COPYING in the main directory of this source tree, or the 8 * OpenIB.org BSD license below: 9 * 10 * Redistribution and use in source and binary forms, with or 11 * without modification, are permitted provided that the following 12 * conditions are met: 13 * 14 * - Redistributions of source code must retain the above 15 * copyright notice, this list of conditions and the following 16 * disclaimer. 17 * 18 * - Redistributions in binary form must reproduce the above 19 * copyright notice, this list of conditions and the following 20 * disclaimer in the documentation and/or other materials 21 * provided with the distribution. 22 * 23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30 * SOFTWARE. 31 */ 32 33 #include <config.h> 34 35 #include <stdlib.h> 36 #include <string.h> 37 #include <glob.h> 38 #include <stdio.h> 39 #include <fcntl.h> 40 #include <errno.h> 41 #include <stdint.h> 42 #include <poll.h> 43 #include <unistd.h> 44 #include <pthread.h> 45 #include <infiniband/endian.h> 46 #include <stddef.h> 47 #include <netdb.h> 48 #include <syslog.h> 49 #include <limits.h> 50 51 #include "cma.h" 52 #include "indexer.h" 53 #include <infiniband/driver.h> 54 #include <infiniband/marshall.h> 55 #include <rdma/rdma_cma.h> 56 #include <rdma/rdma_cma_abi.h> 57 #include <rdma/rdma_verbs.h> 58 #include <infiniband/ib.h> 59 60 #define CMA_INIT_CMD(req, req_size, op) \ 61 do { \ 62 memset(req, 0, req_size); \ 63 (req)->cmd = UCMA_CMD_##op; \ 64 (req)->in = req_size - sizeof(struct ucma_abi_cmd_hdr); \ 65 } while (0) 66 67 #define CMA_INIT_CMD_RESP(req, req_size, op, resp, resp_size) \ 68 do { \ 69 CMA_INIT_CMD(req, req_size, op); \ 70 (req)->out = resp_size; \ 71 (req)->response = (uintptr_t) (resp); \ 72 } while (0) 73 74 struct cma_port { 75 uint8_t link_layer; 76 }; 77 78 struct cma_device { 79 struct ibv_context *verbs; 80 struct ibv_pd *pd; 81 struct ibv_xrcd *xrcd; 82 struct cma_port *port; 83 __be64 guid; 84 int port_cnt; 85 int refcnt; 86 int max_qpsize; 87 uint8_t max_initiator_depth; 88 uint8_t max_responder_resources; 89 }; 90 91 struct cma_id_private { 92 struct rdma_cm_id id; 93 struct cma_device *cma_dev; 94 void *connect; 95 size_t connect_len; 96 int events_completed; 97 int connect_error; 98 int sync; 99 pthread_cond_t cond; 100 pthread_mutex_t mut; 101 uint32_t handle; 102 struct cma_multicast *mc_list; 103 struct ibv_qp_init_attr *qp_init_attr; 104 uint8_t initiator_depth; 105 uint8_t responder_resources; 106 }; 107 108 struct cma_multicast { 109 struct cma_multicast *next; 110 struct cma_id_private *id_priv; 111 void *context; 112 int events_completed; 113 pthread_cond_t cond; 114 uint32_t handle; 115 union ibv_gid mgid; 116 uint16_t mlid; 117 struct sockaddr_storage addr; 118 }; 119 120 struct cma_event { 121 struct rdma_cm_event event; 122 uint8_t 
private_data[RDMA_MAX_PRIVATE_DATA]; 123 struct cma_id_private *id_priv; 124 struct cma_multicast *mc; 125 }; 126 127 static struct cma_device *cma_dev_array; 128 static int cma_dev_cnt; 129 static int cma_init_cnt; 130 static pthread_mutex_t mut = PTHREAD_MUTEX_INITIALIZER; 131 static int abi_ver = RDMA_USER_CM_MAX_ABI_VERSION; 132 int af_ib_support; 133 static struct index_map ucma_idm; 134 static fastlock_t idm_lock; 135 136 static int check_abi_version(void) 137 { 138 char value[8]; 139 140 if ((ibv_read_sysfs_file(ibv_get_sysfs_path(), 141 "class/misc/rdma_cm/abi_version", 142 value, sizeof value) < 0) && 143 (ibv_read_sysfs_file(ibv_get_sysfs_path(), 144 "class/infiniband_ucma/abi_version", 145 value, sizeof value) < 0)) { 146 /* 147 * Older version of Linux do not have class/misc. To support 148 * backports, assume the most recent version of the ABI. If 149 * we're wrong, we'll simply fail later when calling the ABI. 150 */ 151 return 0; 152 } 153 154 abi_ver = strtol(value, NULL, 10); 155 if (abi_ver < RDMA_USER_CM_MIN_ABI_VERSION || 156 abi_ver > RDMA_USER_CM_MAX_ABI_VERSION) { 157 return -1; 158 } 159 return 0; 160 } 161 162 /* 163 * This function is called holding the mutex lock 164 * cma_dev_cnt must be set before calling this function to 165 * ensure that the lock is not acquired recursively. 166 */ 167 static void ucma_set_af_ib_support(void) 168 { 169 struct rdma_cm_id *id; 170 struct sockaddr_ib sib; 171 int ret; 172 173 ret = rdma_create_id(NULL, &id, NULL, RDMA_PS_IB); 174 if (ret) 175 return; 176 177 memset(&sib, 0, sizeof sib); 178 sib.sib_family = AF_IB; 179 sib.sib_sid = htobe64(RDMA_IB_IP_PS_TCP); 180 sib.sib_sid_mask = htobe64(RDMA_IB_IP_PS_MASK); 181 af_ib_support = 1; 182 ret = rdma_bind_addr(id, (struct sockaddr *) &sib); 183 af_ib_support = !ret; 184 185 rdma_destroy_id(id); 186 } 187 188 int ucma_init(void) 189 { 190 struct ibv_device **dev_list = NULL; 191 int i, ret, dev_cnt; 192 193 /* Quick check without lock to see if we're already initialized */ 194 if (cma_dev_cnt) 195 return 0; 196 197 pthread_mutex_lock(&mut); 198 if (cma_dev_cnt) { 199 pthread_mutex_unlock(&mut); 200 return 0; 201 } 202 203 fastlock_init(&idm_lock); 204 ret = check_abi_version(); 205 if (ret) 206 goto err1; 207 208 dev_list = ibv_get_device_list(&dev_cnt); 209 if (!dev_list) { 210 ret = ERR(ENODEV); 211 goto err1; 212 } 213 214 if (!dev_cnt) { 215 ret = ERR(ENODEV); 216 goto err2; 217 } 218 219 cma_dev_array = calloc(dev_cnt, sizeof(*cma_dev_array)); 220 if (!cma_dev_array) { 221 ret = ERR(ENOMEM); 222 goto err2; 223 } 224 225 for (i = 0; dev_list[i]; i++) 226 cma_dev_array[i].guid = ibv_get_device_guid(dev_list[i]); 227 228 cma_dev_cnt = dev_cnt; 229 ucma_set_af_ib_support(); 230 pthread_mutex_unlock(&mut); 231 ibv_free_device_list(dev_list); 232 return 0; 233 234 err2: 235 ibv_free_device_list(dev_list); 236 err1: 237 fastlock_destroy(&idm_lock); 238 pthread_mutex_unlock(&mut); 239 return ret; 240 } 241 242 static struct ibv_context *ucma_open_device(__be64 guid) 243 { 244 struct ibv_device **dev_list; 245 struct ibv_context *verbs = NULL; 246 int i; 247 248 dev_list = ibv_get_device_list(NULL); 249 if (!dev_list) { 250 return NULL; 251 } 252 253 for (i = 0; dev_list[i]; i++) { 254 if (ibv_get_device_guid(dev_list[i]) == guid) { 255 verbs = ibv_open_device(dev_list[i]); 256 break; 257 } 258 } 259 260 ibv_free_device_list(dev_list); 261 return verbs; 262 } 263 264 static int ucma_init_device(struct cma_device *cma_dev) 265 { 266 struct ibv_port_attr port_attr; 267 struct 
ibv_device_attr attr; 268 int i, ret; 269 270 if (cma_dev->verbs) 271 return 0; 272 273 cma_dev->verbs = ucma_open_device(cma_dev->guid); 274 if (!cma_dev->verbs) 275 return ERR(ENODEV); 276 277 ret = ibv_query_device(cma_dev->verbs, &attr); 278 if (ret) { 279 ret = ERR(ret); 280 goto err; 281 } 282 283 cma_dev->port = malloc(sizeof(*cma_dev->port) * attr.phys_port_cnt); 284 if (!cma_dev->port) { 285 ret = ERR(ENOMEM); 286 goto err; 287 } 288 289 for (i = 1; i <= attr.phys_port_cnt; i++) { 290 if (ibv_query_port(cma_dev->verbs, i, &port_attr)) 291 cma_dev->port[i - 1].link_layer = IBV_LINK_LAYER_UNSPECIFIED; 292 else 293 cma_dev->port[i - 1].link_layer = port_attr.link_layer; 294 } 295 296 cma_dev->port_cnt = attr.phys_port_cnt; 297 cma_dev->max_qpsize = attr.max_qp_wr; 298 cma_dev->max_initiator_depth = (uint8_t) attr.max_qp_init_rd_atom; 299 cma_dev->max_responder_resources = (uint8_t) attr.max_qp_rd_atom; 300 cma_init_cnt++; 301 return 0; 302 303 err: 304 ibv_close_device(cma_dev->verbs); 305 cma_dev->verbs = NULL; 306 return ret; 307 } 308 309 static int ucma_init_all(void) 310 { 311 int i, ret = 0; 312 313 if (!cma_dev_cnt) { 314 ret = ucma_init(); 315 if (ret) 316 return ret; 317 } 318 319 if (cma_init_cnt == cma_dev_cnt) 320 return 0; 321 322 pthread_mutex_lock(&mut); 323 for (i = 0; i < cma_dev_cnt; i++) { 324 ret = ucma_init_device(&cma_dev_array[i]); 325 if (ret) 326 break; 327 } 328 pthread_mutex_unlock(&mut); 329 return ret; 330 } 331 332 struct ibv_context **rdma_get_devices(int *num_devices) 333 { 334 struct ibv_context **devs = NULL; 335 int i; 336 337 if (ucma_init_all()) 338 goto out; 339 340 devs = malloc(sizeof(*devs) * (cma_dev_cnt + 1)); 341 if (!devs) 342 goto out; 343 344 for (i = 0; i < cma_dev_cnt; i++) 345 devs[i] = cma_dev_array[i].verbs; 346 devs[i] = NULL; 347 out: 348 if (num_devices) 349 *num_devices = devs ? 
cma_dev_cnt : 0; 350 return devs; 351 } 352 353 void rdma_free_devices(struct ibv_context **list) 354 { 355 free(list); 356 } 357 358 struct rdma_event_channel *rdma_create_event_channel(void) 359 { 360 struct rdma_event_channel *channel; 361 362 if (ucma_init()) 363 return NULL; 364 365 channel = malloc(sizeof(*channel)); 366 if (!channel) 367 return NULL; 368 369 channel->fd = open("/dev/rdma_cm", O_RDWR | O_CLOEXEC); 370 if (channel->fd < 0) { 371 goto err; 372 } 373 return channel; 374 err: 375 free(channel); 376 return NULL; 377 } 378 379 void rdma_destroy_event_channel(struct rdma_event_channel *channel) 380 { 381 close(channel->fd); 382 free(channel); 383 } 384 385 static int ucma_get_device(struct cma_id_private *id_priv, __be64 guid) 386 { 387 struct cma_device *cma_dev; 388 int i, ret; 389 390 for (i = 0; i < cma_dev_cnt; i++) { 391 cma_dev = &cma_dev_array[i]; 392 if (cma_dev->guid == guid) 393 goto match; 394 } 395 396 return ERR(ENODEV); 397 match: 398 pthread_mutex_lock(&mut); 399 if ((ret = ucma_init_device(cma_dev))) 400 goto out; 401 402 if (!cma_dev->refcnt++) { 403 cma_dev->pd = ibv_alloc_pd(cma_dev->verbs); 404 if (!cma_dev->pd) { 405 cma_dev->refcnt--; 406 ret = ERR(ENOMEM); 407 goto out; 408 } 409 } 410 id_priv->cma_dev = cma_dev; 411 id_priv->id.verbs = cma_dev->verbs; 412 id_priv->id.pd = cma_dev->pd; 413 out: 414 pthread_mutex_unlock(&mut); 415 return ret; 416 } 417 418 static void ucma_put_device(struct cma_device *cma_dev) 419 { 420 pthread_mutex_lock(&mut); 421 if (!--cma_dev->refcnt) { 422 ibv_dealloc_pd(cma_dev->pd); 423 if (cma_dev->xrcd) 424 ibv_close_xrcd(cma_dev->xrcd); 425 } 426 pthread_mutex_unlock(&mut); 427 } 428 429 static struct ibv_xrcd *ucma_get_xrcd(struct cma_device *cma_dev) 430 { 431 struct ibv_xrcd_init_attr attr; 432 433 pthread_mutex_lock(&mut); 434 if (!cma_dev->xrcd) { 435 memset(&attr, 0, sizeof attr); 436 attr.comp_mask = IBV_XRCD_INIT_ATTR_FD | IBV_XRCD_INIT_ATTR_OFLAGS; 437 attr.fd = -1; 438 attr.oflags = O_CREAT; 439 cma_dev->xrcd = ibv_open_xrcd(cma_dev->verbs, &attr); 440 } 441 pthread_mutex_unlock(&mut); 442 return cma_dev->xrcd; 443 } 444 445 static void ucma_insert_id(struct cma_id_private *id_priv) 446 { 447 fastlock_acquire(&idm_lock); 448 idm_set(&ucma_idm, id_priv->handle, id_priv); 449 fastlock_release(&idm_lock); 450 } 451 452 static void ucma_remove_id(struct cma_id_private *id_priv) 453 { 454 if (id_priv->handle <= IDX_MAX_INDEX) 455 idm_clear(&ucma_idm, id_priv->handle); 456 } 457 458 static struct cma_id_private *ucma_lookup_id(int handle) 459 { 460 return idm_lookup(&ucma_idm, handle); 461 } 462 463 static void ucma_free_id(struct cma_id_private *id_priv) 464 { 465 ucma_remove_id(id_priv); 466 if (id_priv->cma_dev) 467 ucma_put_device(id_priv->cma_dev); 468 pthread_cond_destroy(&id_priv->cond); 469 pthread_mutex_destroy(&id_priv->mut); 470 if (id_priv->id.route.path_rec) 471 free(id_priv->id.route.path_rec); 472 473 if (id_priv->sync) 474 rdma_destroy_event_channel(id_priv->id.channel); 475 if (id_priv->connect_len) 476 free(id_priv->connect); 477 free(id_priv); 478 } 479 480 static struct cma_id_private *ucma_alloc_id(struct rdma_event_channel *channel, 481 void *context, 482 enum rdma_port_space ps, 483 enum ibv_qp_type qp_type) 484 { 485 struct cma_id_private *id_priv; 486 487 id_priv = calloc(1, sizeof(*id_priv)); 488 if (!id_priv) 489 return NULL; 490 491 id_priv->id.context = context; 492 id_priv->id.ps = ps; 493 id_priv->id.qp_type = qp_type; 494 id_priv->handle = 0xFFFFFFFF; 495 496 if (!channel) { 497 
id_priv->id.channel = rdma_create_event_channel(); 498 if (!id_priv->id.channel) 499 goto err; 500 id_priv->sync = 1; 501 } else { 502 id_priv->id.channel = channel; 503 } 504 505 if (pthread_mutex_init(&id_priv->mut, NULL)) 506 goto err; 507 if (pthread_cond_init(&id_priv->cond, NULL)) 508 goto err; 509 510 return id_priv; 511 512 err: ucma_free_id(id_priv); 513 return NULL; 514 } 515 516 static int rdma_create_id2(struct rdma_event_channel *channel, 517 struct rdma_cm_id **id, void *context, 518 enum rdma_port_space ps, enum ibv_qp_type qp_type) 519 { 520 struct ucma_abi_create_id_resp resp; 521 struct ucma_abi_create_id cmd; 522 struct cma_id_private *id_priv; 523 int ret; 524 525 ret = ucma_init(); 526 if (ret) 527 return ret; 528 529 id_priv = ucma_alloc_id(channel, context, ps, qp_type); 530 if (!id_priv) 531 return ERR(ENOMEM); 532 533 CMA_INIT_CMD_RESP(&cmd, sizeof cmd, CREATE_ID, &resp, sizeof resp); 534 cmd.uid = (uintptr_t) id_priv; 535 cmd.ps = ps; 536 cmd.qp_type = qp_type; 537 538 ret = write(id_priv->id.channel->fd, &cmd, sizeof cmd); 539 if (ret != sizeof cmd) 540 goto err; 541 542 VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); 543 544 id_priv->handle = resp.id; 545 ucma_insert_id(id_priv); 546 *id = &id_priv->id; 547 return 0; 548 549 err: ucma_free_id(id_priv); 550 return ret; 551 } 552 553 int rdma_create_id(struct rdma_event_channel *channel, 554 struct rdma_cm_id **id, void *context, 555 enum rdma_port_space ps) 556 { 557 enum ibv_qp_type qp_type; 558 559 qp_type = (ps == RDMA_PS_IPOIB || ps == RDMA_PS_UDP) ? 560 IBV_QPT_UD : IBV_QPT_RC; 561 return rdma_create_id2(channel, id, context, ps, qp_type); 562 } 563 564 static int ucma_destroy_kern_id(int fd, uint32_t handle) 565 { 566 struct ucma_abi_destroy_id_resp resp; 567 struct ucma_abi_destroy_id cmd; 568 int ret; 569 570 CMA_INIT_CMD_RESP(&cmd, sizeof cmd, DESTROY_ID, &resp, sizeof resp); 571 cmd.id = handle; 572 573 ret = write(fd, &cmd, sizeof cmd); 574 if (ret != sizeof cmd) 575 return (ret >= 0) ? ERR(ENODATA) : -1; 576 577 VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); 578 579 return resp.events_reported; 580 } 581 582 int rdma_destroy_id(struct rdma_cm_id *id) 583 { 584 struct cma_id_private *id_priv; 585 int ret; 586 587 id_priv = container_of(id, struct cma_id_private, id); 588 ret = ucma_destroy_kern_id(id->channel->fd, id_priv->handle); 589 if (ret < 0) 590 return ret; 591 592 if (id_priv->id.event) 593 rdma_ack_cm_event(id_priv->id.event); 594 595 pthread_mutex_lock(&id_priv->mut); 596 while (id_priv->events_completed < ret) 597 pthread_cond_wait(&id_priv->cond, &id_priv->mut); 598 pthread_mutex_unlock(&id_priv->mut); 599 600 ucma_free_id(id_priv); 601 return 0; 602 } 603 604 int ucma_addrlen(struct sockaddr *addr) 605 { 606 if (!addr) 607 return 0; 608 609 switch (addr->sa_family) { 610 case PF_INET: 611 return sizeof(struct sockaddr_in); 612 case PF_INET6: 613 return sizeof(struct sockaddr_in6); 614 case PF_IB: 615 return af_ib_support ? sizeof(struct sockaddr_ib) : 0; 616 default: 617 return 0; 618 } 619 } 620 621 static int ucma_query_addr(struct rdma_cm_id *id) 622 { 623 struct ucma_abi_query_addr_resp resp; 624 struct ucma_abi_query cmd; 625 struct cma_id_private *id_priv; 626 int ret; 627 628 CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY, &resp, sizeof resp); 629 id_priv = container_of(id, struct cma_id_private, id); 630 cmd.id = id_priv->handle; 631 cmd.option = UCMA_QUERY_ADDR; 632 633 ret = write(id->channel->fd, &cmd, sizeof cmd); 634 if (ret != sizeof cmd) 635 return (ret >= 0) ? 
ERR(ENODATA) : -1; 636 637 VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); 638 639 memcpy(&id->route.addr.src_addr, &resp.src_addr, resp.src_size); 640 memcpy(&id->route.addr.dst_addr, &resp.dst_addr, resp.dst_size); 641 642 if (!id_priv->cma_dev && resp.node_guid) { 643 ret = ucma_get_device(id_priv, resp.node_guid); 644 if (ret) 645 return ret; 646 id->port_num = resp.port_num; 647 id->route.addr.addr.ibaddr.pkey = resp.pkey; 648 } 649 650 return 0; 651 } 652 653 static int ucma_query_gid(struct rdma_cm_id *id) 654 { 655 struct ucma_abi_query_addr_resp resp; 656 struct ucma_abi_query cmd; 657 struct cma_id_private *id_priv; 658 struct sockaddr_ib *sib; 659 int ret; 660 661 CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY, &resp, sizeof resp); 662 id_priv = container_of(id, struct cma_id_private, id); 663 cmd.id = id_priv->handle; 664 cmd.option = UCMA_QUERY_GID; 665 666 ret = write(id->channel->fd, &cmd, sizeof cmd); 667 if (ret != sizeof cmd) 668 return (ret >= 0) ? ERR(ENODATA) : -1; 669 670 VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); 671 672 sib = (struct sockaddr_ib *) &resp.src_addr; 673 memcpy(id->route.addr.addr.ibaddr.sgid.raw, sib->sib_addr.sib_raw, 674 sizeof id->route.addr.addr.ibaddr.sgid); 675 676 sib = (struct sockaddr_ib *) &resp.dst_addr; 677 memcpy(id->route.addr.addr.ibaddr.dgid.raw, sib->sib_addr.sib_raw, 678 sizeof id->route.addr.addr.ibaddr.dgid); 679 680 return 0; 681 } 682 683 static void ucma_convert_path(struct ibv_path_data *path_data, 684 struct ibv_sa_path_rec *sa_path) 685 { 686 uint32_t fl_hop; 687 688 sa_path->dgid = path_data->path.dgid; 689 sa_path->sgid = path_data->path.sgid; 690 sa_path->dlid = path_data->path.dlid; 691 sa_path->slid = path_data->path.slid; 692 sa_path->raw_traffic = 0; 693 694 fl_hop = be32toh(path_data->path.flowlabel_hoplimit); 695 sa_path->flow_label = htobe32(fl_hop >> 8); 696 sa_path->hop_limit = (uint8_t) fl_hop; 697 698 sa_path->traffic_class = path_data->path.tclass; 699 sa_path->reversible = path_data->path.reversible_numpath >> 7; 700 sa_path->numb_path = 1; 701 sa_path->pkey = path_data->path.pkey; 702 sa_path->sl = be16toh(path_data->path.qosclass_sl) & 0xF; 703 sa_path->mtu_selector = 2; /* exactly */ 704 sa_path->mtu = path_data->path.mtu & 0x1F; 705 sa_path->rate_selector = 2; 706 sa_path->rate = path_data->path.rate & 0x1F; 707 sa_path->packet_life_time_selector = 2; 708 sa_path->packet_life_time = path_data->path.packetlifetime & 0x1F; 709 710 sa_path->preference = (uint8_t) path_data->flags; 711 } 712 713 static int ucma_query_path(struct rdma_cm_id *id) 714 { 715 struct ucma_abi_query_path_resp *resp; 716 struct ucma_abi_query cmd; 717 struct cma_id_private *id_priv; 718 int ret, i, size; 719 720 size = sizeof(*resp) + sizeof(struct ibv_path_data) * 6; 721 resp = alloca(size); 722 CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY, resp, size); 723 id_priv = container_of(id, struct cma_id_private, id); 724 cmd.id = id_priv->handle; 725 cmd.option = UCMA_QUERY_PATH; 726 727 ret = write(id->channel->fd, &cmd, sizeof cmd); 728 if (ret != sizeof cmd) 729 return (ret >= 0) ? 
ERR(ENODATA) : -1; 730 731 VALGRIND_MAKE_MEM_DEFINED(resp, size); 732 733 if (resp->num_paths) { 734 id->route.path_rec = malloc(sizeof(*id->route.path_rec) * 735 resp->num_paths); 736 if (!id->route.path_rec) 737 return ERR(ENOMEM); 738 739 id->route.num_paths = resp->num_paths; 740 for (i = 0; i < resp->num_paths; i++) 741 ucma_convert_path(&resp->path_data[i], &id->route.path_rec[i]); 742 } 743 744 return 0; 745 } 746 747 static int ucma_query_route(struct rdma_cm_id *id) 748 { 749 struct ucma_abi_query_route_resp resp; 750 struct ucma_abi_query cmd; 751 struct cma_id_private *id_priv; 752 int ret, i; 753 754 CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY_ROUTE, &resp, sizeof resp); 755 id_priv = container_of(id, struct cma_id_private, id); 756 cmd.id = id_priv->handle; 757 758 ret = write(id->channel->fd, &cmd, sizeof cmd); 759 if (ret != sizeof cmd) 760 return (ret >= 0) ? ERR(ENODATA) : -1; 761 762 VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); 763 764 if (resp.num_paths) { 765 id->route.path_rec = malloc(sizeof(*id->route.path_rec) * 766 resp.num_paths); 767 if (!id->route.path_rec) 768 return ERR(ENOMEM); 769 770 id->route.num_paths = resp.num_paths; 771 for (i = 0; i < resp.num_paths; i++) 772 ibv_copy_path_rec_from_kern(&id->route.path_rec[i], 773 &resp.ib_route[i]); 774 } 775 776 memcpy(id->route.addr.addr.ibaddr.sgid.raw, resp.ib_route[0].sgid, 777 sizeof id->route.addr.addr.ibaddr.sgid); 778 memcpy(id->route.addr.addr.ibaddr.dgid.raw, resp.ib_route[0].dgid, 779 sizeof id->route.addr.addr.ibaddr.dgid); 780 id->route.addr.addr.ibaddr.pkey = resp.ib_route[0].pkey; 781 memcpy(&id->route.addr.src_addr, &resp.src_addr, 782 sizeof resp.src_addr); 783 memcpy(&id->route.addr.dst_addr, &resp.dst_addr, 784 sizeof resp.dst_addr); 785 786 if (!id_priv->cma_dev && resp.node_guid) { 787 ret = ucma_get_device(id_priv, resp.node_guid); 788 if (ret) 789 return ret; 790 id_priv->id.port_num = resp.port_num; 791 } 792 793 return 0; 794 } 795 796 static int rdma_bind_addr2(struct rdma_cm_id *id, struct sockaddr *addr, 797 socklen_t addrlen) 798 { 799 struct ucma_abi_bind cmd; 800 struct cma_id_private *id_priv; 801 int ret; 802 803 CMA_INIT_CMD(&cmd, sizeof cmd, BIND); 804 id_priv = container_of(id, struct cma_id_private, id); 805 cmd.id = id_priv->handle; 806 cmd.addr_size = addrlen; 807 memcpy(&cmd.addr, addr, addrlen); 808 809 ret = write(id->channel->fd, &cmd, sizeof cmd); 810 if (ret != sizeof cmd) 811 return (ret >= 0) ? ERR(ENODATA) : -1; 812 813 ret = ucma_query_addr(id); 814 if (!ret) 815 ret = ucma_query_gid(id); 816 return ret; 817 } 818 819 int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) 820 { 821 struct ucma_abi_bind_ip cmd; 822 struct cma_id_private *id_priv; 823 int ret, addrlen; 824 825 addrlen = ucma_addrlen(addr); 826 if (!addrlen) 827 return ERR(EINVAL); 828 829 if (af_ib_support) 830 return rdma_bind_addr2(id, addr, addrlen); 831 832 CMA_INIT_CMD(&cmd, sizeof cmd, BIND_IP); 833 id_priv = container_of(id, struct cma_id_private, id); 834 cmd.id = id_priv->handle; 835 memcpy(&cmd.addr, addr, addrlen); 836 837 ret = write(id->channel->fd, &cmd, sizeof cmd); 838 if (ret != sizeof cmd) 839 return (ret >= 0) ? 
ERR(ENODATA) : -1; 840 841 return ucma_query_route(id); 842 } 843 844 int ucma_complete(struct rdma_cm_id *id) 845 { 846 struct cma_id_private *id_priv; 847 int ret; 848 849 id_priv = container_of(id, struct cma_id_private, id); 850 if (!id_priv->sync) 851 return 0; 852 853 if (id_priv->id.event) { 854 rdma_ack_cm_event(id_priv->id.event); 855 id_priv->id.event = NULL; 856 } 857 858 ret = rdma_get_cm_event(id_priv->id.channel, &id_priv->id.event); 859 if (ret) 860 return ret; 861 862 if (id_priv->id.event->status) { 863 if (id_priv->id.event->event == RDMA_CM_EVENT_REJECTED) 864 ret = ERR(ECONNREFUSED); 865 else if (id_priv->id.event->status < 0) 866 ret = ERR(-id_priv->id.event->status); 867 else 868 ret = ERR(-id_priv->id.event->status); 869 } 870 return ret; 871 } 872 873 static int rdma_resolve_addr2(struct rdma_cm_id *id, struct sockaddr *src_addr, 874 socklen_t src_len, struct sockaddr *dst_addr, 875 socklen_t dst_len, int timeout_ms) 876 { 877 struct ucma_abi_resolve_addr cmd; 878 struct cma_id_private *id_priv; 879 int ret; 880 881 CMA_INIT_CMD(&cmd, sizeof cmd, RESOLVE_ADDR); 882 id_priv = container_of(id, struct cma_id_private, id); 883 cmd.id = id_priv->handle; 884 if ((cmd.src_size = src_len)) 885 memcpy(&cmd.src_addr, src_addr, src_len); 886 memcpy(&cmd.dst_addr, dst_addr, dst_len); 887 cmd.dst_size = dst_len; 888 cmd.timeout_ms = timeout_ms; 889 890 ret = write(id->channel->fd, &cmd, sizeof cmd); 891 if (ret != sizeof cmd) 892 return (ret >= 0) ? ERR(ENODATA) : -1; 893 894 memcpy(&id->route.addr.dst_addr, dst_addr, dst_len); 895 return ucma_complete(id); 896 } 897 898 int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, 899 struct sockaddr *dst_addr, int timeout_ms) 900 { 901 struct ucma_abi_resolve_ip cmd; 902 struct cma_id_private *id_priv; 903 int ret, dst_len, src_len; 904 905 dst_len = ucma_addrlen(dst_addr); 906 if (!dst_len) 907 return ERR(EINVAL); 908 909 src_len = ucma_addrlen(src_addr); 910 if (src_addr && !src_len) 911 return ERR(EINVAL); 912 913 if (af_ib_support) 914 return rdma_resolve_addr2(id, src_addr, src_len, dst_addr, 915 dst_len, timeout_ms); 916 917 CMA_INIT_CMD(&cmd, sizeof cmd, RESOLVE_IP); 918 id_priv = container_of(id, struct cma_id_private, id); 919 cmd.id = id_priv->handle; 920 if (src_addr) 921 memcpy(&cmd.src_addr, src_addr, src_len); 922 memcpy(&cmd.dst_addr, dst_addr, dst_len); 923 cmd.timeout_ms = timeout_ms; 924 925 ret = write(id->channel->fd, &cmd, sizeof cmd); 926 if (ret != sizeof cmd) 927 return (ret >= 0) ? 
ERR(ENODATA) : -1; 928 929 memcpy(&id->route.addr.dst_addr, dst_addr, dst_len); 930 return ucma_complete(id); 931 } 932 933 static int ucma_set_ib_route(struct rdma_cm_id *id) 934 { 935 struct rdma_addrinfo hint, *rai; 936 int ret; 937 938 memset(&hint, 0, sizeof hint); 939 hint.ai_flags = RAI_ROUTEONLY; 940 hint.ai_family = id->route.addr.src_addr.sa_family; 941 hint.ai_src_len = ucma_addrlen((struct sockaddr *) &id->route.addr.src_addr); 942 hint.ai_src_addr = &id->route.addr.src_addr; 943 hint.ai_dst_len = ucma_addrlen((struct sockaddr *) &id->route.addr.dst_addr); 944 hint.ai_dst_addr = &id->route.addr.dst_addr; 945 946 ret = rdma_getaddrinfo(NULL, NULL, &hint, &rai); 947 if (ret) 948 return ret; 949 950 if (rai->ai_route_len) 951 ret = rdma_set_option(id, RDMA_OPTION_IB, RDMA_OPTION_IB_PATH, 952 rai->ai_route, rai->ai_route_len); 953 else 954 ret = -1; 955 956 rdma_freeaddrinfo(rai); 957 return ret; 958 } 959 960 int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms) 961 { 962 struct ucma_abi_resolve_route cmd; 963 struct cma_id_private *id_priv; 964 int ret; 965 966 id_priv = container_of(id, struct cma_id_private, id); 967 if (id->verbs->device->transport_type == IBV_TRANSPORT_IB) { 968 ret = ucma_set_ib_route(id); 969 if (!ret) 970 goto out; 971 } 972 973 CMA_INIT_CMD(&cmd, sizeof cmd, RESOLVE_ROUTE); 974 cmd.id = id_priv->handle; 975 cmd.timeout_ms = timeout_ms; 976 977 ret = write(id->channel->fd, &cmd, sizeof cmd); 978 if (ret != sizeof cmd) 979 return (ret >= 0) ? ERR(ENODATA) : -1; 980 981 out: 982 return ucma_complete(id); 983 } 984 985 static int ucma_is_ud_qp(enum ibv_qp_type qp_type) 986 { 987 return (qp_type == IBV_QPT_UD); 988 } 989 990 static int rdma_init_qp_attr(struct rdma_cm_id *id, struct ibv_qp_attr *qp_attr, 991 int *qp_attr_mask) 992 { 993 struct ucma_abi_init_qp_attr cmd; 994 struct ibv_kern_qp_attr resp; 995 struct cma_id_private *id_priv; 996 int ret; 997 998 CMA_INIT_CMD_RESP(&cmd, sizeof cmd, INIT_QP_ATTR, &resp, sizeof resp); 999 id_priv = container_of(id, struct cma_id_private, id); 1000 cmd.id = id_priv->handle; 1001 cmd.qp_state = qp_attr->qp_state; 1002 1003 ret = write(id->channel->fd, &cmd, sizeof cmd); 1004 if (ret != sizeof cmd) 1005 return (ret >= 0) ? ERR(ENODATA) : -1; 1006 1007 VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); 1008 1009 ibv_copy_qp_attr_from_kern(qp_attr, &resp); 1010 *qp_attr_mask = resp.qp_attr_mask; 1011 return 0; 1012 } 1013 1014 static int ucma_modify_qp_rtr(struct rdma_cm_id *id, uint8_t resp_res) 1015 { 1016 struct cma_id_private *id_priv; 1017 struct ibv_qp_attr qp_attr; 1018 int qp_attr_mask, ret; 1019 uint8_t link_layer; 1020 1021 if (!id->qp) 1022 return ERR(EINVAL); 1023 1024 /* Need to update QP attributes from default values. 
*/ 1025 qp_attr.qp_state = IBV_QPS_INIT; 1026 ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask); 1027 if (ret) 1028 return ret; 1029 1030 ret = ibv_modify_qp(id->qp, &qp_attr, qp_attr_mask); 1031 if (ret) 1032 return ERR(ret); 1033 1034 qp_attr.qp_state = IBV_QPS_RTR; 1035 ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask); 1036 if (ret) 1037 return ret; 1038 1039 /* 1040 * Workaround for rdma_ucm kernel bug: 1041 * mask off qp_attr_mask bits 21-24 which are used for RoCE 1042 */ 1043 id_priv = container_of(id, struct cma_id_private, id); 1044 link_layer = id_priv->cma_dev->port[id->port_num - 1].link_layer; 1045 1046 if (link_layer == IBV_LINK_LAYER_INFINIBAND) 1047 qp_attr_mask &= UINT_MAX ^ 0xe00000; 1048 1049 if (resp_res != RDMA_MAX_RESP_RES) 1050 qp_attr.max_dest_rd_atomic = resp_res; 1051 return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, qp_attr_mask)); 1052 } 1053 1054 static int ucma_modify_qp_rts(struct rdma_cm_id *id, uint8_t init_depth) 1055 { 1056 struct ibv_qp_attr qp_attr; 1057 int qp_attr_mask, ret; 1058 1059 qp_attr.qp_state = IBV_QPS_RTS; 1060 ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask); 1061 if (ret) 1062 return ret; 1063 1064 if (init_depth != RDMA_MAX_INIT_DEPTH) 1065 qp_attr.max_rd_atomic = init_depth; 1066 return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, qp_attr_mask)); 1067 } 1068 1069 static int ucma_modify_qp_sqd(struct rdma_cm_id *id) 1070 { 1071 struct ibv_qp_attr qp_attr; 1072 1073 if (!id->qp) 1074 return 0; 1075 1076 qp_attr.qp_state = IBV_QPS_SQD; 1077 return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, IBV_QP_STATE)); 1078 } 1079 1080 static int ucma_modify_qp_err(struct rdma_cm_id *id) 1081 { 1082 struct ibv_qp_attr qp_attr; 1083 1084 if (!id->qp) 1085 return 0; 1086 1087 qp_attr.qp_state = IBV_QPS_ERR; 1088 return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, IBV_QP_STATE)); 1089 } 1090 1091 static int ucma_find_pkey(struct cma_device *cma_dev, uint8_t port_num, 1092 __be16 pkey, uint16_t *pkey_index) 1093 { 1094 int ret, i; 1095 __be16 chk_pkey; 1096 1097 for (i = 0, ret = 0; !ret; i++) { 1098 ret = ibv_query_pkey(cma_dev->verbs, port_num, i, &chk_pkey); 1099 if (!ret && pkey == chk_pkey) { 1100 *pkey_index = (uint16_t) i; 1101 return 0; 1102 } 1103 } 1104 return ERR(EINVAL); 1105 } 1106 1107 static int ucma_init_conn_qp3(struct cma_id_private *id_priv, struct ibv_qp *qp) 1108 { 1109 struct ibv_qp_attr qp_attr; 1110 int ret; 1111 1112 ret = ucma_find_pkey(id_priv->cma_dev, id_priv->id.port_num, 1113 id_priv->id.route.addr.addr.ibaddr.pkey, 1114 &qp_attr.pkey_index); 1115 if (ret) 1116 return ret; 1117 1118 qp_attr.port_num = id_priv->id.port_num; 1119 qp_attr.qp_state = IBV_QPS_INIT; 1120 qp_attr.qp_access_flags = 0; 1121 1122 ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_ACCESS_FLAGS | 1123 IBV_QP_PKEY_INDEX | IBV_QP_PORT); 1124 return rdma_seterrno(ret); 1125 } 1126 1127 static int ucma_init_conn_qp(struct cma_id_private *id_priv, struct ibv_qp *qp) 1128 { 1129 struct ibv_qp_attr qp_attr; 1130 int qp_attr_mask, ret; 1131 1132 if (abi_ver == 3) 1133 return ucma_init_conn_qp3(id_priv, qp); 1134 1135 qp_attr.qp_state = IBV_QPS_INIT; 1136 ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); 1137 if (ret) 1138 return ret; 1139 1140 return rdma_seterrno(ibv_modify_qp(qp, &qp_attr, qp_attr_mask)); 1141 } 1142 1143 static int ucma_init_ud_qp3(struct cma_id_private *id_priv, struct ibv_qp *qp) 1144 { 1145 struct ibv_qp_attr qp_attr; 1146 int ret; 1147 1148 ret = ucma_find_pkey(id_priv->cma_dev, id_priv->id.port_num, 
1149 id_priv->id.route.addr.addr.ibaddr.pkey, 1150 &qp_attr.pkey_index); 1151 if (ret) 1152 return ret; 1153 1154 qp_attr.port_num = id_priv->id.port_num; 1155 qp_attr.qp_state = IBV_QPS_INIT; 1156 qp_attr.qkey = RDMA_UDP_QKEY; 1157 1158 ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_QKEY | 1159 IBV_QP_PKEY_INDEX | IBV_QP_PORT); 1160 if (ret) 1161 return ERR(ret); 1162 1163 qp_attr.qp_state = IBV_QPS_RTR; 1164 ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE); 1165 if (ret) 1166 return ERR(ret); 1167 1168 qp_attr.qp_state = IBV_QPS_RTS; 1169 qp_attr.sq_psn = 0; 1170 ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_SQ_PSN); 1171 return rdma_seterrno(ret); 1172 } 1173 1174 static int ucma_init_ud_qp(struct cma_id_private *id_priv, struct ibv_qp *qp) 1175 { 1176 struct ibv_qp_attr qp_attr; 1177 int qp_attr_mask, ret; 1178 1179 if (abi_ver == 3) 1180 return ucma_init_ud_qp3(id_priv, qp); 1181 1182 qp_attr.qp_state = IBV_QPS_INIT; 1183 ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); 1184 if (ret) 1185 return ret; 1186 1187 ret = ibv_modify_qp(qp, &qp_attr, qp_attr_mask); 1188 if (ret) 1189 return ERR(ret); 1190 1191 qp_attr.qp_state = IBV_QPS_RTR; 1192 ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE); 1193 if (ret) 1194 return ERR(ret); 1195 1196 qp_attr.qp_state = IBV_QPS_RTS; 1197 qp_attr.sq_psn = 0; 1198 ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_SQ_PSN); 1199 return rdma_seterrno(ret); 1200 } 1201 1202 static void ucma_destroy_cqs(struct rdma_cm_id *id) 1203 { 1204 if (id->qp_type == IBV_QPT_XRC_RECV && id->srq) 1205 return; 1206 1207 if (id->recv_cq) { 1208 ibv_destroy_cq(id->recv_cq); 1209 if (id->send_cq && (id->send_cq != id->recv_cq)) { 1210 ibv_destroy_cq(id->send_cq); 1211 id->send_cq = NULL; 1212 } 1213 id->recv_cq = NULL; 1214 } 1215 1216 if (id->recv_cq_channel) { 1217 ibv_destroy_comp_channel(id->recv_cq_channel); 1218 if (id->send_cq_channel && (id->send_cq_channel != id->recv_cq_channel)) { 1219 ibv_destroy_comp_channel(id->send_cq_channel); 1220 id->send_cq_channel = NULL; 1221 } 1222 id->recv_cq_channel = NULL; 1223 } 1224 } 1225 1226 static int ucma_create_cqs(struct rdma_cm_id *id, uint32_t send_size, uint32_t recv_size) 1227 { 1228 if (recv_size) { 1229 id->recv_cq_channel = ibv_create_comp_channel(id->verbs); 1230 if (!id->recv_cq_channel) 1231 goto err; 1232 1233 id->recv_cq = ibv_create_cq(id->verbs, recv_size, 1234 id, id->recv_cq_channel, 0); 1235 if (!id->recv_cq) 1236 goto err; 1237 } 1238 1239 if (send_size) { 1240 id->send_cq_channel = ibv_create_comp_channel(id->verbs); 1241 if (!id->send_cq_channel) 1242 goto err; 1243 1244 id->send_cq = ibv_create_cq(id->verbs, send_size, 1245 id, id->send_cq_channel, 0); 1246 if (!id->send_cq) 1247 goto err; 1248 } 1249 1250 return 0; 1251 err: 1252 ucma_destroy_cqs(id); 1253 return ERR(ENOMEM); 1254 } 1255 1256 int rdma_create_srq_ex(struct rdma_cm_id *id, struct ibv_srq_init_attr_ex *attr) 1257 { 1258 struct cma_id_private *id_priv; 1259 struct ibv_srq *srq; 1260 int ret; 1261 1262 id_priv = container_of(id, struct cma_id_private, id); 1263 if (!(attr->comp_mask & IBV_SRQ_INIT_ATTR_TYPE)) 1264 return ERR(EINVAL); 1265 1266 if (!(attr->comp_mask & IBV_SRQ_INIT_ATTR_PD) || !attr->pd) { 1267 attr->pd = id->pd; 1268 attr->comp_mask |= IBV_SRQ_INIT_ATTR_PD; 1269 } 1270 1271 if (attr->srq_type == IBV_SRQT_XRC) { 1272 if (!(attr->comp_mask & IBV_SRQ_INIT_ATTR_XRCD) || !attr->xrcd) { 1273 attr->xrcd = ucma_get_xrcd(id_priv->cma_dev); 1274 if (!attr->xrcd) 1275 return -1; 1276 } 1277 if 
(!(attr->comp_mask & IBV_SRQ_INIT_ATTR_CQ) || !attr->cq) { 1278 ret = ucma_create_cqs(id, 0, attr->attr.max_wr); 1279 if (ret) 1280 return ret; 1281 attr->cq = id->recv_cq; 1282 } 1283 attr->comp_mask |= IBV_SRQ_INIT_ATTR_XRCD | IBV_SRQ_INIT_ATTR_CQ; 1284 } 1285 1286 srq = ibv_create_srq_ex(id->verbs, attr); 1287 if (!srq) { 1288 ret = -1; 1289 goto err; 1290 } 1291 1292 if (!id->pd) 1293 id->pd = attr->pd; 1294 id->srq = srq; 1295 return 0; 1296 err: 1297 ucma_destroy_cqs(id); 1298 return ret; 1299 } 1300 1301 int rdma_create_srq(struct rdma_cm_id *id, struct ibv_pd *pd, 1302 struct ibv_srq_init_attr *attr) 1303 { 1304 struct ibv_srq_init_attr_ex attr_ex; 1305 int ret; 1306 1307 memcpy(&attr_ex, attr, sizeof(*attr)); 1308 attr_ex.comp_mask = IBV_SRQ_INIT_ATTR_TYPE | IBV_SRQ_INIT_ATTR_PD; 1309 if (id->qp_type == IBV_QPT_XRC_RECV) { 1310 attr_ex.srq_type = IBV_SRQT_XRC; 1311 } else { 1312 attr_ex.srq_type = IBV_SRQT_BASIC; 1313 } 1314 attr_ex.pd = pd; 1315 ret = rdma_create_srq_ex(id, &attr_ex); 1316 memcpy(attr, &attr_ex, sizeof(*attr)); 1317 return ret; 1318 } 1319 1320 void rdma_destroy_srq(struct rdma_cm_id *id) 1321 { 1322 ibv_destroy_srq(id->srq); 1323 id->srq = NULL; 1324 ucma_destroy_cqs(id); 1325 } 1326 1327 int rdma_create_qp_ex(struct rdma_cm_id *id, 1328 struct ibv_qp_init_attr_ex *attr) 1329 { 1330 struct cma_id_private *id_priv; 1331 struct ibv_qp *qp; 1332 int ret; 1333 1334 if (id->qp) 1335 return ERR(EINVAL); 1336 1337 id_priv = container_of(id, struct cma_id_private, id); 1338 if (!(attr->comp_mask & IBV_QP_INIT_ATTR_PD) || !attr->pd) { 1339 attr->comp_mask |= IBV_QP_INIT_ATTR_PD; 1340 attr->pd = id->pd; 1341 } else if (id->verbs != attr->pd->context) 1342 return ERR(EINVAL); 1343 1344 if ((id->recv_cq && attr->recv_cq && id->recv_cq != attr->recv_cq) || 1345 (id->send_cq && attr->send_cq && id->send_cq != attr->send_cq)) 1346 return ERR(EINVAL); 1347 1348 if (id->qp_type == IBV_QPT_XRC_RECV) { 1349 if (!(attr->comp_mask & IBV_QP_INIT_ATTR_XRCD) || !attr->xrcd) { 1350 attr->xrcd = ucma_get_xrcd(id_priv->cma_dev); 1351 if (!attr->xrcd) 1352 return -1; 1353 attr->comp_mask |= IBV_QP_INIT_ATTR_XRCD; 1354 } 1355 } 1356 1357 ret = ucma_create_cqs(id, attr->send_cq || id->send_cq ? 0 : attr->cap.max_send_wr, 1358 attr->recv_cq || id->recv_cq ? 0 : attr->cap.max_recv_wr); 1359 if (ret) 1360 return ret; 1361 1362 if (!attr->send_cq) 1363 attr->send_cq = id->send_cq; 1364 if (!attr->recv_cq) 1365 attr->recv_cq = id->recv_cq; 1366 if (id->srq && !attr->srq) 1367 attr->srq = id->srq; 1368 qp = ibv_create_qp_ex(id->verbs, attr); 1369 if (!qp) { 1370 ret = ERR(ENOMEM); 1371 goto err1; 1372 } 1373 1374 if (ucma_is_ud_qp(id->qp_type)) 1375 ret = ucma_init_ud_qp(id_priv, qp); 1376 else 1377 ret = ucma_init_conn_qp(id_priv, qp); 1378 if (ret) 1379 goto err2; 1380 1381 id->pd = qp->pd; 1382 id->qp = qp; 1383 return 0; 1384 err2: 1385 ibv_destroy_qp(qp); 1386 err1: 1387 ucma_destroy_cqs(id); 1388 return ret; 1389 } 1390 1391 int rdma_create_qp(struct rdma_cm_id *id, struct ibv_pd *pd, 1392 struct ibv_qp_init_attr *qp_init_attr) 1393 { 1394 struct ibv_qp_init_attr_ex attr_ex; 1395 int ret; 1396 1397 memcpy(&attr_ex, qp_init_attr, sizeof(*qp_init_attr)); 1398 attr_ex.comp_mask = IBV_QP_INIT_ATTR_PD; 1399 attr_ex.pd = pd ? 
pd : id->pd; 1400 ret = rdma_create_qp_ex(id, &attr_ex); 1401 memcpy(qp_init_attr, &attr_ex, sizeof(*qp_init_attr)); 1402 return ret; 1403 } 1404 1405 void rdma_destroy_qp(struct rdma_cm_id *id) 1406 { 1407 ibv_destroy_qp(id->qp); 1408 id->qp = NULL; 1409 ucma_destroy_cqs(id); 1410 } 1411 1412 static int ucma_valid_param(struct cma_id_private *id_priv, 1413 struct rdma_conn_param *param) 1414 { 1415 if (id_priv->id.ps != RDMA_PS_TCP) 1416 return 0; 1417 1418 if (!id_priv->id.qp && !param) 1419 goto err; 1420 1421 if (!param) 1422 return 0; 1423 1424 if ((param->responder_resources != RDMA_MAX_RESP_RES) && 1425 (param->responder_resources > id_priv->cma_dev->max_responder_resources)) 1426 goto err; 1427 1428 if ((param->initiator_depth != RDMA_MAX_INIT_DEPTH) && 1429 (param->initiator_depth > id_priv->cma_dev->max_initiator_depth)) 1430 goto err; 1431 1432 return 0; 1433 err: 1434 return ERR(EINVAL); 1435 } 1436 1437 static void ucma_copy_conn_param_to_kern(struct cma_id_private *id_priv, 1438 struct ucma_abi_conn_param *dst, 1439 struct rdma_conn_param *src, 1440 uint32_t qp_num, uint8_t srq) 1441 { 1442 dst->qp_num = qp_num; 1443 dst->srq = srq; 1444 dst->responder_resources = id_priv->responder_resources; 1445 dst->initiator_depth = id_priv->initiator_depth; 1446 dst->valid = 1; 1447 1448 if (id_priv->connect_len) { 1449 memcpy(dst->private_data, id_priv->connect, id_priv->connect_len); 1450 dst->private_data_len = id_priv->connect_len; 1451 } 1452 1453 if (src) { 1454 dst->flow_control = src->flow_control; 1455 dst->retry_count = src->retry_count; 1456 dst->rnr_retry_count = src->rnr_retry_count; 1457 1458 if (src->private_data && src->private_data_len) { 1459 memcpy(dst->private_data + dst->private_data_len, 1460 src->private_data, src->private_data_len); 1461 dst->private_data_len += src->private_data_len; 1462 } 1463 } else { 1464 dst->retry_count = 7; 1465 dst->rnr_retry_count = 7; 1466 } 1467 } 1468 1469 int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) 1470 { 1471 struct ucma_abi_connect cmd; 1472 struct cma_id_private *id_priv; 1473 int ret; 1474 1475 id_priv = container_of(id, struct cma_id_private, id); 1476 ret = ucma_valid_param(id_priv, conn_param); 1477 if (ret) 1478 return ret; 1479 1480 if (conn_param && conn_param->initiator_depth != RDMA_MAX_INIT_DEPTH) 1481 id_priv->initiator_depth = conn_param->initiator_depth; 1482 else 1483 id_priv->initiator_depth = id_priv->cma_dev->max_initiator_depth; 1484 if (conn_param && conn_param->responder_resources != RDMA_MAX_RESP_RES) 1485 id_priv->responder_resources = conn_param->responder_resources; 1486 else 1487 id_priv->responder_resources = id_priv->cma_dev->max_responder_resources; 1488 1489 CMA_INIT_CMD(&cmd, sizeof cmd, CONNECT); 1490 cmd.id = id_priv->handle; 1491 if (id->qp) { 1492 ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param, 1493 conn_param, id->qp->qp_num, 1494 (id->qp->srq != NULL)); 1495 } else if (conn_param) { 1496 ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param, 1497 conn_param, conn_param->qp_num, 1498 conn_param->srq); 1499 } else { 1500 ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param, 1501 conn_param, 0, 0); 1502 } 1503 1504 ret = write(id->channel->fd, &cmd, sizeof cmd); 1505 if (ret != sizeof cmd) 1506 return (ret >= 0) ? 
ERR(ENODATA) : -1; 1507 1508 if (id_priv->connect_len) { 1509 free(id_priv->connect); 1510 id_priv->connect_len = 0; 1511 } 1512 1513 return ucma_complete(id); 1514 } 1515 1516 int rdma_listen(struct rdma_cm_id *id, int backlog) 1517 { 1518 struct ucma_abi_listen cmd; 1519 struct cma_id_private *id_priv; 1520 int ret; 1521 1522 CMA_INIT_CMD(&cmd, sizeof cmd, LISTEN); 1523 id_priv = container_of(id, struct cma_id_private, id); 1524 cmd.id = id_priv->handle; 1525 cmd.backlog = backlog; 1526 1527 ret = write(id->channel->fd, &cmd, sizeof cmd); 1528 if (ret != sizeof cmd) 1529 return (ret >= 0) ? ERR(ENODATA) : -1; 1530 1531 if (af_ib_support) 1532 return ucma_query_addr(id); 1533 else 1534 return ucma_query_route(id); 1535 } 1536 1537 int rdma_get_request(struct rdma_cm_id *listen, struct rdma_cm_id **id) 1538 { 1539 struct cma_id_private *id_priv; 1540 struct rdma_cm_event *event; 1541 int ret; 1542 1543 id_priv = container_of(listen, struct cma_id_private, id); 1544 if (!id_priv->sync) 1545 return ERR(EINVAL); 1546 1547 if (listen->event) { 1548 rdma_ack_cm_event(listen->event); 1549 listen->event = NULL; 1550 } 1551 1552 ret = rdma_get_cm_event(listen->channel, &event); 1553 if (ret) 1554 return ret; 1555 1556 if (event->status) { 1557 ret = ERR(event->status); 1558 goto err; 1559 } 1560 1561 if (event->event != RDMA_CM_EVENT_CONNECT_REQUEST) { 1562 ret = ERR(EINVAL); 1563 goto err; 1564 } 1565 1566 if (id_priv->qp_init_attr) { 1567 struct ibv_qp_init_attr attr; 1568 1569 attr = *id_priv->qp_init_attr; 1570 ret = rdma_create_qp(event->id, listen->pd, &attr); 1571 if (ret) 1572 goto err; 1573 } 1574 1575 *id = event->id; 1576 (*id)->event = event; 1577 return 0; 1578 1579 err: 1580 listen->event = event; 1581 return ret; 1582 } 1583 1584 int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) 1585 { 1586 struct ucma_abi_accept cmd; 1587 struct cma_id_private *id_priv; 1588 int ret; 1589 1590 id_priv = container_of(id, struct cma_id_private, id); 1591 ret = ucma_valid_param(id_priv, conn_param); 1592 if (ret) 1593 return ret; 1594 1595 if (!conn_param || conn_param->initiator_depth == RDMA_MAX_INIT_DEPTH) { 1596 id_priv->initiator_depth = min(id_priv->initiator_depth, 1597 id_priv->cma_dev->max_initiator_depth); 1598 } else { 1599 id_priv->initiator_depth = conn_param->initiator_depth; 1600 } 1601 if (!conn_param || conn_param->responder_resources == RDMA_MAX_RESP_RES) { 1602 id_priv->responder_resources = min(id_priv->responder_resources, 1603 id_priv->cma_dev->max_responder_resources); 1604 } else { 1605 id_priv->responder_resources = conn_param->responder_resources; 1606 } 1607 1608 if (!ucma_is_ud_qp(id->qp_type)) { 1609 ret = ucma_modify_qp_rtr(id, id_priv->responder_resources); 1610 if (ret) 1611 return ret; 1612 1613 ret = ucma_modify_qp_rts(id, id_priv->initiator_depth); 1614 if (ret) 1615 return ret; 1616 } 1617 1618 CMA_INIT_CMD(&cmd, sizeof cmd, ACCEPT); 1619 cmd.id = id_priv->handle; 1620 cmd.uid = (uintptr_t) id_priv; 1621 if (id->qp) 1622 ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param, 1623 conn_param, id->qp->qp_num, 1624 (id->qp->srq != NULL)); 1625 else 1626 ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param, 1627 conn_param, conn_param->qp_num, 1628 conn_param->srq); 1629 1630 ret = write(id->channel->fd, &cmd, sizeof cmd); 1631 if (ret != sizeof cmd) { 1632 ucma_modify_qp_err(id); 1633 return (ret >= 0) ? 
ERR(ENODATA) : -1; 1634 } 1635 1636 if (ucma_is_ud_qp(id->qp_type)) 1637 return 0; 1638 1639 return ucma_complete(id); 1640 } 1641 1642 int rdma_reject(struct rdma_cm_id *id, const void *private_data, 1643 uint8_t private_data_len) 1644 { 1645 struct ucma_abi_reject cmd; 1646 struct cma_id_private *id_priv; 1647 int ret; 1648 1649 CMA_INIT_CMD(&cmd, sizeof cmd, REJECT); 1650 1651 id_priv = container_of(id, struct cma_id_private, id); 1652 cmd.id = id_priv->handle; 1653 if (private_data && private_data_len) { 1654 memcpy(cmd.private_data, private_data, private_data_len); 1655 cmd.private_data_len = private_data_len; 1656 } 1657 1658 ret = write(id->channel->fd, &cmd, sizeof cmd); 1659 if (ret != sizeof cmd) 1660 return (ret >= 0) ? ERR(ENODATA) : -1; 1661 1662 return 0; 1663 } 1664 1665 int rdma_notify(struct rdma_cm_id *id, enum ibv_event_type event) 1666 { 1667 struct ucma_abi_notify cmd; 1668 struct cma_id_private *id_priv; 1669 int ret; 1670 1671 CMA_INIT_CMD(&cmd, sizeof cmd, NOTIFY); 1672 1673 id_priv = container_of(id, struct cma_id_private, id); 1674 cmd.id = id_priv->handle; 1675 cmd.event = event; 1676 ret = write(id->channel->fd, &cmd, sizeof cmd); 1677 if (ret != sizeof cmd) 1678 return (ret >= 0) ? ERR(ENODATA) : -1; 1679 1680 return 0; 1681 } 1682 1683 int ucma_shutdown(struct rdma_cm_id *id) 1684 { 1685 switch (id->verbs->device->transport_type) { 1686 case IBV_TRANSPORT_IB: 1687 return ucma_modify_qp_err(id); 1688 case IBV_TRANSPORT_IWARP: 1689 return ucma_modify_qp_sqd(id); 1690 default: 1691 return ERR(EINVAL); 1692 } 1693 } 1694 1695 int rdma_disconnect(struct rdma_cm_id *id) 1696 { 1697 struct ucma_abi_disconnect cmd; 1698 struct cma_id_private *id_priv; 1699 int ret; 1700 1701 ret = ucma_shutdown(id); 1702 if (ret) 1703 return ret; 1704 1705 CMA_INIT_CMD(&cmd, sizeof cmd, DISCONNECT); 1706 id_priv = container_of(id, struct cma_id_private, id); 1707 cmd.id = id_priv->handle; 1708 1709 ret = write(id->channel->fd, &cmd, sizeof cmd); 1710 if (ret != sizeof cmd) 1711 return (ret >= 0) ? ERR(ENODATA) : -1; 1712 1713 return ucma_complete(id); 1714 } 1715 1716 static int rdma_join_multicast2(struct rdma_cm_id *id, struct sockaddr *addr, 1717 socklen_t addrlen, void *context) 1718 { 1719 struct ucma_abi_create_id_resp resp; 1720 struct cma_id_private *id_priv; 1721 struct cma_multicast *mc, **pos; 1722 int ret; 1723 1724 id_priv = container_of(id, struct cma_id_private, id); 1725 mc = calloc(1, sizeof(*mc)); 1726 if (!mc) 1727 return ERR(ENOMEM); 1728 1729 mc->context = context; 1730 mc->id_priv = id_priv; 1731 memcpy(&mc->addr, addr, addrlen); 1732 if (pthread_cond_init(&mc->cond, NULL)) { 1733 ret = -1; 1734 goto err1; 1735 } 1736 1737 pthread_mutex_lock(&id_priv->mut); 1738 mc->next = id_priv->mc_list; 1739 id_priv->mc_list = mc; 1740 pthread_mutex_unlock(&id_priv->mut); 1741 1742 if (af_ib_support) { 1743 struct ucma_abi_join_mcast cmd; 1744 1745 CMA_INIT_CMD_RESP(&cmd, sizeof cmd, JOIN_MCAST, &resp, sizeof resp); 1746 cmd.id = id_priv->handle; 1747 memcpy(&cmd.addr, addr, addrlen); 1748 cmd.addr_size = addrlen; 1749 cmd.uid = (uintptr_t) mc; 1750 cmd.reserved = 0; 1751 1752 ret = write(id->channel->fd, &cmd, sizeof cmd); 1753 if (ret != sizeof cmd) { 1754 ret = (ret >= 0) ? 
ERR(ENODATA) : -1; 1755 goto err2; 1756 } 1757 } else { 1758 struct ucma_abi_join_ip_mcast cmd; 1759 1760 CMA_INIT_CMD_RESP(&cmd, sizeof cmd, JOIN_IP_MCAST, &resp, sizeof resp); 1761 cmd.id = id_priv->handle; 1762 memcpy(&cmd.addr, addr, addrlen); 1763 cmd.uid = (uintptr_t) mc; 1764 1765 ret = write(id->channel->fd, &cmd, sizeof cmd); 1766 if (ret != sizeof cmd) { 1767 ret = (ret >= 0) ? ERR(ENODATA) : -1; 1768 goto err2; 1769 } 1770 } 1771 1772 VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); 1773 1774 mc->handle = resp.id; 1775 return ucma_complete(id); 1776 1777 err2: 1778 pthread_mutex_lock(&id_priv->mut); 1779 for (pos = &id_priv->mc_list; *pos != mc; pos = &(*pos)->next) 1780 ; 1781 *pos = mc->next; 1782 pthread_mutex_unlock(&id_priv->mut); 1783 err1: 1784 free(mc); 1785 return ret; 1786 } 1787 1788 int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, 1789 void *context) 1790 { 1791 int addrlen; 1792 1793 addrlen = ucma_addrlen(addr); 1794 if (!addrlen) 1795 return ERR(EINVAL); 1796 1797 return rdma_join_multicast2(id, addr, addrlen, context); 1798 } 1799 1800 int rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) 1801 { 1802 struct ucma_abi_destroy_id cmd; 1803 struct ucma_abi_destroy_id_resp resp; 1804 struct cma_id_private *id_priv; 1805 struct cma_multicast *mc, **pos; 1806 int ret, addrlen; 1807 1808 addrlen = ucma_addrlen(addr); 1809 if (!addrlen) 1810 return ERR(EINVAL); 1811 1812 id_priv = container_of(id, struct cma_id_private, id); 1813 pthread_mutex_lock(&id_priv->mut); 1814 for (pos = &id_priv->mc_list; *pos; pos = &(*pos)->next) 1815 if (!memcmp(&(*pos)->addr, addr, addrlen)) 1816 break; 1817 1818 mc = *pos; 1819 if (*pos) 1820 *pos = mc->next; 1821 pthread_mutex_unlock(&id_priv->mut); 1822 if (!mc) 1823 return ERR(EADDRNOTAVAIL); 1824 1825 if (id->qp) 1826 ibv_detach_mcast(id->qp, &mc->mgid, mc->mlid); 1827 1828 CMA_INIT_CMD_RESP(&cmd, sizeof cmd, LEAVE_MCAST, &resp, sizeof resp); 1829 cmd.id = mc->handle; 1830 1831 ret = write(id->channel->fd, &cmd, sizeof cmd); 1832 if (ret != sizeof cmd) { 1833 ret = (ret >= 0) ? 
ERR(ENODATA) : -1; 1834 goto free; 1835 } 1836 1837 VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); 1838 1839 pthread_mutex_lock(&id_priv->mut); 1840 while (mc->events_completed < resp.events_reported) 1841 pthread_cond_wait(&mc->cond, &id_priv->mut); 1842 pthread_mutex_unlock(&id_priv->mut); 1843 1844 ret = 0; 1845 free: 1846 free(mc); 1847 return ret; 1848 } 1849 1850 static void ucma_complete_event(struct cma_id_private *id_priv) 1851 { 1852 pthread_mutex_lock(&id_priv->mut); 1853 id_priv->events_completed++; 1854 pthread_cond_signal(&id_priv->cond); 1855 pthread_mutex_unlock(&id_priv->mut); 1856 } 1857 1858 static void ucma_complete_mc_event(struct cma_multicast *mc) 1859 { 1860 pthread_mutex_lock(&mc->id_priv->mut); 1861 mc->events_completed++; 1862 pthread_cond_signal(&mc->cond); 1863 mc->id_priv->events_completed++; 1864 pthread_cond_signal(&mc->id_priv->cond); 1865 pthread_mutex_unlock(&mc->id_priv->mut); 1866 } 1867 1868 int rdma_ack_cm_event(struct rdma_cm_event *event) 1869 { 1870 struct cma_event *evt; 1871 1872 if (!event) 1873 return ERR(EINVAL); 1874 1875 evt = container_of(event, struct cma_event, event); 1876 1877 if (evt->mc) 1878 ucma_complete_mc_event(evt->mc); 1879 else 1880 ucma_complete_event(evt->id_priv); 1881 free(evt); 1882 return 0; 1883 } 1884 1885 static void ucma_process_addr_resolved(struct cma_event *evt) 1886 { 1887 if (af_ib_support) { 1888 evt->event.status = ucma_query_addr(&evt->id_priv->id); 1889 if (!evt->event.status && 1890 evt->id_priv->id.verbs->device->transport_type == IBV_TRANSPORT_IB) 1891 evt->event.status = ucma_query_gid(&evt->id_priv->id); 1892 } else { 1893 evt->event.status = ucma_query_route(&evt->id_priv->id); 1894 } 1895 1896 if (evt->event.status) 1897 evt->event.event = RDMA_CM_EVENT_ADDR_ERROR; 1898 } 1899 1900 static void ucma_process_route_resolved(struct cma_event *evt) 1901 { 1902 if (evt->id_priv->id.verbs->device->transport_type != IBV_TRANSPORT_IB) 1903 return; 1904 1905 if (af_ib_support) 1906 evt->event.status = ucma_query_path(&evt->id_priv->id); 1907 else 1908 evt->event.status = ucma_query_route(&evt->id_priv->id); 1909 1910 if (evt->event.status) 1911 evt->event.event = RDMA_CM_EVENT_ROUTE_ERROR; 1912 } 1913 1914 static int ucma_query_req_info(struct rdma_cm_id *id) 1915 { 1916 int ret; 1917 1918 if (!af_ib_support) 1919 return ucma_query_route(id); 1920 1921 ret = ucma_query_addr(id); 1922 if (ret) 1923 return ret; 1924 1925 ret = ucma_query_gid(id); 1926 if (ret) 1927 return ret; 1928 1929 ret = ucma_query_path(id); 1930 if (ret) 1931 return ret; 1932 1933 return 0; 1934 } 1935 1936 static int ucma_process_conn_req(struct cma_event *evt, 1937 uint32_t handle) 1938 { 1939 struct cma_id_private *id_priv; 1940 int ret; 1941 1942 id_priv = ucma_alloc_id(evt->id_priv->id.channel, 1943 evt->id_priv->id.context, evt->id_priv->id.ps, 1944 evt->id_priv->id.qp_type); 1945 if (!id_priv) { 1946 ucma_destroy_kern_id(evt->id_priv->id.channel->fd, handle); 1947 ret = ERR(ENOMEM); 1948 goto err1; 1949 } 1950 1951 evt->event.listen_id = &evt->id_priv->id; 1952 evt->event.id = &id_priv->id; 1953 id_priv->handle = handle; 1954 ucma_insert_id(id_priv); 1955 id_priv->initiator_depth = evt->event.param.conn.initiator_depth; 1956 id_priv->responder_resources = evt->event.param.conn.responder_resources; 1957 1958 if (evt->id_priv->sync) { 1959 ret = rdma_migrate_id(&id_priv->id, NULL); 1960 if (ret) 1961 goto err2; 1962 } 1963 1964 ret = ucma_query_req_info(&id_priv->id); 1965 if (ret) 1966 goto err2; 1967 1968 return 0; 1969 1970 err2: 
1971 rdma_destroy_id(&id_priv->id); 1972 err1: 1973 ucma_complete_event(evt->id_priv); 1974 return ret; 1975 } 1976 1977 static int ucma_process_conn_resp(struct cma_id_private *id_priv) 1978 { 1979 struct ucma_abi_accept cmd; 1980 int ret; 1981 1982 ret = ucma_modify_qp_rtr(&id_priv->id, RDMA_MAX_RESP_RES); 1983 if (ret) 1984 goto err; 1985 1986 ret = ucma_modify_qp_rts(&id_priv->id, RDMA_MAX_INIT_DEPTH); 1987 if (ret) 1988 goto err; 1989 1990 CMA_INIT_CMD(&cmd, sizeof cmd, ACCEPT); 1991 cmd.id = id_priv->handle; 1992 1993 ret = write(id_priv->id.channel->fd, &cmd, sizeof cmd); 1994 if (ret != sizeof cmd) { 1995 ret = (ret >= 0) ? ERR(ENODATA) : -1; 1996 goto err; 1997 } 1998 1999 return 0; 2000 err: 2001 ucma_modify_qp_err(&id_priv->id); 2002 return ret; 2003 } 2004 2005 static int ucma_process_join(struct cma_event *evt) 2006 { 2007 evt->mc->mgid = evt->event.param.ud.ah_attr.grh.dgid; 2008 evt->mc->mlid = evt->event.param.ud.ah_attr.dlid; 2009 2010 if (!evt->id_priv->id.qp) 2011 return 0; 2012 2013 return rdma_seterrno(ibv_attach_mcast(evt->id_priv->id.qp, 2014 &evt->mc->mgid, evt->mc->mlid)); 2015 } 2016 2017 static void ucma_copy_conn_event(struct cma_event *event, 2018 struct ucma_abi_conn_param *src) 2019 { 2020 struct rdma_conn_param *dst = &event->event.param.conn; 2021 2022 dst->private_data_len = src->private_data_len; 2023 if (src->private_data_len) { 2024 dst->private_data = &event->private_data; 2025 memcpy(&event->private_data, src->private_data, 2026 src->private_data_len); 2027 } 2028 2029 dst->responder_resources = src->responder_resources; 2030 dst->initiator_depth = src->initiator_depth; 2031 dst->flow_control = src->flow_control; 2032 dst->retry_count = src->retry_count; 2033 dst->rnr_retry_count = src->rnr_retry_count; 2034 dst->srq = src->srq; 2035 dst->qp_num = src->qp_num; 2036 } 2037 2038 static void ucma_copy_ud_event(struct cma_event *event, 2039 struct ucma_abi_ud_param *src) 2040 { 2041 struct rdma_ud_param *dst = &event->event.param.ud; 2042 2043 dst->private_data_len = src->private_data_len; 2044 if (src->private_data_len) { 2045 dst->private_data = &event->private_data; 2046 memcpy(&event->private_data, src->private_data, 2047 src->private_data_len); 2048 } 2049 2050 ibv_copy_ah_attr_from_kern(&dst->ah_attr, &src->ah_attr); 2051 dst->qp_num = src->qp_num; 2052 dst->qkey = src->qkey; 2053 } 2054 2055 int rdma_get_cm_event(struct rdma_event_channel *channel, 2056 struct rdma_cm_event **event) 2057 { 2058 struct ucma_abi_event_resp resp; 2059 struct ucma_abi_get_event cmd; 2060 struct cma_event *evt; 2061 int ret; 2062 2063 ret = ucma_init(); 2064 if (ret) 2065 return ret; 2066 2067 if (!event) 2068 return ERR(EINVAL); 2069 2070 evt = malloc(sizeof(*evt)); 2071 if (!evt) 2072 return ERR(ENOMEM); 2073 2074 retry: 2075 memset(evt, 0, sizeof(*evt)); 2076 CMA_INIT_CMD_RESP(&cmd, sizeof cmd, GET_EVENT, &resp, sizeof resp); 2077 ret = write(channel->fd, &cmd, sizeof cmd); 2078 if (ret != sizeof cmd) { 2079 free(evt); 2080 return (ret >= 0) ? ERR(ENODATA) : -1; 2081 } 2082 2083 VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); 2084 2085 evt->event.event = resp.event; 2086 /* 2087 * We should have a non-zero uid, except for connection requests. 2088 * But a bug in older kernels can report a uid 0. Work-around this 2089 * issue by looking up the cma_id based on the kernel's id when the 2090 * uid is 0 and we're processing a connection established event. 2091 * In all other cases, if the uid is 0, we discard the event, like 2092 * the kernel should have done. 
int rdma_get_cm_event(struct rdma_event_channel *channel,
		      struct rdma_cm_event **event)
{
	struct ucma_abi_event_resp resp;
	struct ucma_abi_get_event cmd;
	struct cma_event *evt;
	int ret;

	ret = ucma_init();
	if (ret)
		return ret;

	if (!event)
		return ERR(EINVAL);

	evt = malloc(sizeof(*evt));
	if (!evt)
		return ERR(ENOMEM);

retry:
	memset(evt, 0, sizeof(*evt));
	CMA_INIT_CMD_RESP(&cmd, sizeof cmd, GET_EVENT, &resp, sizeof resp);
	ret = write(channel->fd, &cmd, sizeof cmd);
	if (ret != sizeof cmd) {
		free(evt);
		return (ret >= 0) ? ERR(ENODATA) : -1;
	}

	VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);

	evt->event.event = resp.event;
	/*
	 * We should have a non-zero uid, except for connection requests.
	 * But a bug in older kernels can report a uid of 0.  Work around
	 * this issue by looking up the cma_id based on the kernel's id
	 * when the uid is 0 and we're processing a connection established
	 * event.  In all other cases, if the uid is 0, we discard the
	 * event, as the kernel should have done.
	 */
	if (resp.uid) {
		evt->id_priv = (void *) (uintptr_t) resp.uid;
	} else {
		evt->id_priv = ucma_lookup_id(resp.id);
		if (!evt->id_priv) {
			syslog(LOG_WARNING, PFX "Warning: discarding unmatched "
			       "event - rdma_destroy_id may hang.\n");
			goto retry;
		}
		if (resp.event != RDMA_CM_EVENT_ESTABLISHED) {
			ucma_complete_event(evt->id_priv);
			goto retry;
		}
	}
	evt->event.id = &evt->id_priv->id;
	evt->event.status = resp.status;

	switch (resp.event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
		ucma_process_addr_resolved(evt);
		break;
	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		ucma_process_route_resolved(evt);
		break;
	case RDMA_CM_EVENT_CONNECT_REQUEST:
		evt->id_priv = (void *) (uintptr_t) resp.uid;
		if (ucma_is_ud_qp(evt->id_priv->id.qp_type))
			ucma_copy_ud_event(evt, &resp.param.ud);
		else
			ucma_copy_conn_event(evt, &resp.param.conn);

		ret = ucma_process_conn_req(evt, resp.id);
		if (ret)
			goto retry;
		break;
	case RDMA_CM_EVENT_CONNECT_RESPONSE:
		ucma_copy_conn_event(evt, &resp.param.conn);
		evt->event.status = ucma_process_conn_resp(evt->id_priv);
		if (!evt->event.status)
			evt->event.event = RDMA_CM_EVENT_ESTABLISHED;
		else {
			evt->event.event = RDMA_CM_EVENT_CONNECT_ERROR;
			evt->id_priv->connect_error = 1;
		}
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		if (ucma_is_ud_qp(evt->id_priv->id.qp_type)) {
			ucma_copy_ud_event(evt, &resp.param.ud);
			break;
		}

		ucma_copy_conn_event(evt, &resp.param.conn);
		break;
	case RDMA_CM_EVENT_REJECTED:
		if (evt->id_priv->connect_error) {
			ucma_complete_event(evt->id_priv);
			goto retry;
		}
		ucma_copy_conn_event(evt, &resp.param.conn);
		ucma_modify_qp_err(evt->event.id);
		break;
	case RDMA_CM_EVENT_DISCONNECTED:
		if (evt->id_priv->connect_error) {
			ucma_complete_event(evt->id_priv);
			goto retry;
		}
		ucma_copy_conn_event(evt, &resp.param.conn);
		break;
	case RDMA_CM_EVENT_MULTICAST_JOIN:
		evt->mc = (void *) (uintptr_t) resp.uid;
		evt->id_priv = evt->mc->id_priv;
		evt->event.id = &evt->id_priv->id;
		ucma_copy_ud_event(evt, &resp.param.ud);
		evt->event.param.ud.private_data = evt->mc->context;
		evt->event.status = ucma_process_join(evt);
		if (evt->event.status)
			evt->event.event = RDMA_CM_EVENT_MULTICAST_ERROR;
		break;
	case RDMA_CM_EVENT_MULTICAST_ERROR:
		evt->mc = (void *) (uintptr_t) resp.uid;
		evt->id_priv = evt->mc->id_priv;
		evt->event.id = &evt->id_priv->id;
		evt->event.param.ud.private_data = evt->mc->context;
		break;
	default:
		evt->id_priv = (void *) (uintptr_t) resp.uid;
		evt->event.id = &evt->id_priv->id;
		evt->event.status = resp.status;
		if (ucma_is_ud_qp(evt->id_priv->id.qp_type))
			ucma_copy_ud_event(evt, &resp.param.ud);
		else
			ucma_copy_conn_event(evt, &resp.param.conn);
		break;
	}

	*event = &evt->event;
	return 0;
}
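/*
 * Usage sketch (illustrative; error handling omitted): a minimal
 * server-side loop built on rdma_get_cm_event(), using rdma_event_str()
 * for logging.
 *
 *	struct rdma_cm_event *event;
 *
 *	while (!rdma_get_cm_event(channel, &event)) {
 *		printf("event: %s\n", rdma_event_str(event->event));
 *		switch (event->event) {
 *		case RDMA_CM_EVENT_CONNECT_REQUEST:
 *			rdma_accept(event->id, NULL);
 *			break;
 *		case RDMA_CM_EVENT_DISCONNECTED:
 *			rdma_disconnect(event->id);
 *			break;
 *		default:
 *			break;
 *		}
 *		rdma_ack_cm_event(event);
 *	}
 */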
const char *rdma_event_str(enum rdma_cm_event_type event)
{
	switch (event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
		return "RDMA_CM_EVENT_ADDR_RESOLVED";
	case RDMA_CM_EVENT_ADDR_ERROR:
		return "RDMA_CM_EVENT_ADDR_ERROR";
	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		return "RDMA_CM_EVENT_ROUTE_RESOLVED";
	case RDMA_CM_EVENT_ROUTE_ERROR:
		return "RDMA_CM_EVENT_ROUTE_ERROR";
	case RDMA_CM_EVENT_CONNECT_REQUEST:
		return "RDMA_CM_EVENT_CONNECT_REQUEST";
	case RDMA_CM_EVENT_CONNECT_RESPONSE:
		return "RDMA_CM_EVENT_CONNECT_RESPONSE";
	case RDMA_CM_EVENT_CONNECT_ERROR:
		return "RDMA_CM_EVENT_CONNECT_ERROR";
	case RDMA_CM_EVENT_UNREACHABLE:
		return "RDMA_CM_EVENT_UNREACHABLE";
	case RDMA_CM_EVENT_REJECTED:
		return "RDMA_CM_EVENT_REJECTED";
	case RDMA_CM_EVENT_ESTABLISHED:
		return "RDMA_CM_EVENT_ESTABLISHED";
	case RDMA_CM_EVENT_DISCONNECTED:
		return "RDMA_CM_EVENT_DISCONNECTED";
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		return "RDMA_CM_EVENT_DEVICE_REMOVAL";
	case RDMA_CM_EVENT_MULTICAST_JOIN:
		return "RDMA_CM_EVENT_MULTICAST_JOIN";
	case RDMA_CM_EVENT_MULTICAST_ERROR:
		return "RDMA_CM_EVENT_MULTICAST_ERROR";
	case RDMA_CM_EVENT_ADDR_CHANGE:
		return "RDMA_CM_EVENT_ADDR_CHANGE";
	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
		return "RDMA_CM_EVENT_TIMEWAIT_EXIT";
	default:
		return "UNKNOWN EVENT";
	}
}

int rdma_set_option(struct rdma_cm_id *id, int level, int optname,
		    void *optval, size_t optlen)
{
	struct ucma_abi_set_option cmd;
	struct cma_id_private *id_priv;
	int ret;

	CMA_INIT_CMD(&cmd, sizeof cmd, SET_OPTION);
	id_priv = container_of(id, struct cma_id_private, id);
	cmd.id = id_priv->handle;
	cmd.optval = (uintptr_t) optval;
	cmd.level = level;
	cmd.optname = optname;
	cmd.optlen = optlen;

	ret = write(id->channel->fd, &cmd, sizeof cmd);
	if (ret != sizeof cmd)
		return (ret >= 0) ? ERR(ENODATA) : -1;

	return 0;
}
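/*
 * Usage sketch (illustrative): setting the type-of-service byte on an id
 * before connecting, using the RDMA_OPTION_ID level and the
 * RDMA_OPTION_ID_TOS option declared in <rdma/rdma_cma.h>.
 *
 *	uint8_t tos = 32;	// example ToS value
 *
 *	rdma_set_option(id, RDMA_OPTION_ID, RDMA_OPTION_ID_TOS,
 *			&tos, sizeof(tos));
 */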
int rdma_migrate_id(struct rdma_cm_id *id, struct rdma_event_channel *channel)
{
	struct ucma_abi_migrate_resp resp;
	struct ucma_abi_migrate_id cmd;
	struct cma_id_private *id_priv;
	int ret, sync;

	id_priv = container_of(id, struct cma_id_private, id);
	if (id_priv->sync && !channel)
		return ERR(EINVAL);

	if ((sync = (channel == NULL))) {
		channel = rdma_create_event_channel();
		if (!channel)
			return -1;
	}

	CMA_INIT_CMD_RESP(&cmd, sizeof cmd, MIGRATE_ID, &resp, sizeof resp);
	cmd.id = id_priv->handle;
	cmd.fd = id->channel->fd;

	ret = write(channel->fd, &cmd, sizeof cmd);
	if (ret != sizeof cmd) {
		if (sync)
			rdma_destroy_event_channel(channel);
		return (ret >= 0) ? ERR(ENODATA) : -1;
	}

	VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);

	if (id_priv->sync) {
		if (id->event) {
			rdma_ack_cm_event(id->event);
			id->event = NULL;
		}
		rdma_destroy_event_channel(id->channel);
	}

	/*
	 * Eventually, if we want to support migrating channels while events
	 * are being processed on the current channel, we need to block here
	 * while there are any outstanding events on the current channel for
	 * this id.  This prevents the user from processing events for this
	 * id on the old channel after this call returns.
	 */
	pthread_mutex_lock(&id_priv->mut);
	id_priv->sync = sync;
	id->channel = channel;
	while (id_priv->events_completed < resp.events_reported)
		pthread_cond_wait(&id_priv->cond, &id_priv->mut);
	pthread_mutex_unlock(&id_priv->mut);

	return 0;
}

static int ucma_passive_ep(struct rdma_cm_id *id, struct rdma_addrinfo *res,
			   struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr)
{
	struct cma_id_private *id_priv;
	int ret;

	if (af_ib_support)
		ret = rdma_bind_addr2(id, res->ai_src_addr, res->ai_src_len);
	else
		ret = rdma_bind_addr(id, res->ai_src_addr);
	if (ret)
		return ret;

	id_priv = container_of(id, struct cma_id_private, id);
	if (pd)
		id->pd = pd;

	if (qp_init_attr) {
		id_priv->qp_init_attr = malloc(sizeof(*qp_init_attr));
		if (!id_priv->qp_init_attr)
			return ERR(ENOMEM);

		*id_priv->qp_init_attr = *qp_init_attr;
		id_priv->qp_init_attr->qp_type = res->ai_qp_type;
	}

	return 0;
}

int rdma_create_ep(struct rdma_cm_id **id, struct rdma_addrinfo *res,
		   struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr)
{
	struct rdma_cm_id *cm_id;
	struct cma_id_private *id_priv;
	int ret;

	ret = rdma_create_id2(NULL, &cm_id, NULL, res->ai_port_space, res->ai_qp_type);
	if (ret)
		return ret;

	if (res->ai_flags & RAI_PASSIVE) {
		ret = ucma_passive_ep(cm_id, res, pd, qp_init_attr);
		if (ret)
			goto err;
		goto out;
	}

	if (af_ib_support)
		ret = rdma_resolve_addr2(cm_id, res->ai_src_addr, res->ai_src_len,
					 res->ai_dst_addr, res->ai_dst_len, 2000);
	else
		ret = rdma_resolve_addr(cm_id, res->ai_src_addr, res->ai_dst_addr, 2000);
	if (ret)
		goto err;

	if (res->ai_route_len) {
		ret = rdma_set_option(cm_id, RDMA_OPTION_IB, RDMA_OPTION_IB_PATH,
				      res->ai_route, res->ai_route_len);
		if (!ret)
			ret = ucma_complete(cm_id);
	} else {
		ret = rdma_resolve_route(cm_id, 2000);
	}
	if (ret)
		goto err;

	if (qp_init_attr) {
		qp_init_attr->qp_type = res->ai_qp_type;
		ret = rdma_create_qp(cm_id, pd, qp_init_attr);
		if (ret)
			goto err;
	}

	if (res->ai_connect_len) {
		id_priv = container_of(cm_id, struct cma_id_private, id);
		id_priv->connect = malloc(res->ai_connect_len);
		if (!id_priv->connect) {
			ret = ERR(ENOMEM);
			goto err;
		}
		memcpy(id_priv->connect, res->ai_connect, res->ai_connect_len);
		id_priv->connect_len = res->ai_connect_len;
	}

out:
	*id = cm_id;
	return 0;

err:
	rdma_destroy_ep(cm_id);
	return ret;
}

void rdma_destroy_ep(struct rdma_cm_id *id)
{
	struct cma_id_private *id_priv;

	if (id->qp)
		rdma_destroy_qp(id);

	if (id->srq)
		rdma_destroy_srq(id);

	id_priv = container_of(id, struct cma_id_private, id);
	if (id_priv->qp_init_attr)
		free(id_priv->qp_init_attr);

	rdma_destroy_id(id);
}
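/*
 * Usage sketch (illustrative; error handling omitted): a client endpoint
 * created with rdma_getaddrinfo() and rdma_create_ep().  The address and
 * port strings below are placeholders.
 *
 *	struct rdma_addrinfo hints = { 0 }, *res;
 *	struct ibv_qp_init_attr attr = { 0 };
 *	struct rdma_cm_id *ep;
 *
 *	hints.ai_port_space = RDMA_PS_TCP;
 *	rdma_getaddrinfo("192.0.2.1", "7471", &hints, &res);
 *
 *	attr.cap.max_send_wr = attr.cap.max_recv_wr = 1;
 *	attr.cap.max_send_sge = attr.cap.max_recv_sge = 1;
 *	rdma_create_ep(&ep, res, NULL, &attr);
 *	rdma_freeaddrinfo(res);
 *
 *	// ... rdma_connect(ep, NULL), data transfers, rdma_disconnect(ep) ...
 *	rdma_destroy_ep(ep);
 */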
int ucma_max_qpsize(struct rdma_cm_id *id)
{
	struct cma_id_private *id_priv;
	int i, max_size = 0;

	id_priv = container_of(id, struct cma_id_private, id);
	if (id && id_priv->cma_dev) {
		max_size = id_priv->cma_dev->max_qpsize;
	} else {
		ucma_init_all();
		for (i = 0; i < cma_dev_cnt; i++) {
			if (!max_size || max_size > cma_dev_array[i].max_qpsize)
				max_size = cma_dev_array[i].max_qpsize;
		}
	}
	return max_size;
}

__be16 ucma_get_port(struct sockaddr *addr)
{
	switch (addr->sa_family) {
	case AF_INET:
		return ((struct sockaddr_in *) addr)->sin_port;
	case AF_INET6:
		return ((struct sockaddr_in6 *) addr)->sin6_port;
	case AF_IB:
		return htobe16((uint16_t) be64toh(((struct sockaddr_ib *) addr)->sib_sid));
	default:
		return 0;
	}
}

__be16 rdma_get_src_port(struct rdma_cm_id *id)
{
	return ucma_get_port(&id->route.addr.src_addr);
}

__be16 rdma_get_dst_port(struct rdma_cm_id *id)
{
	return ucma_get_port(&id->route.addr.dst_addr);
}
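/*
 * Usage sketch (illustrative): the port helpers above return the port in
 * network byte order, so convert it before printing.
 *
 *	printf("local port %u, remote port %u\n",
 *	       be16toh(rdma_get_src_port(id)),
 *	       be16toh(rdma_get_dst_port(id)));
 */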