1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Shared Memory Communications over RDMA (SMC-R) and RoCE 4 * 5 * IB infrastructure: 6 * Establish SMC-R as an Infiniband Client to be notified about added and 7 * removed IB devices of type RDMA. 8 * Determine device and port characteristics for these IB devices. 9 * 10 * Copyright IBM Corp. 2016 11 * 12 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> 13 */ 14 15 #include <linux/random.h> 16 #include <linux/workqueue.h> 17 #include <linux/scatterlist.h> 18 #include <linux/wait.h> 19 #include <linux/mutex.h> 20 #include <rdma/ib_verbs.h> 21 #include <rdma/ib_cache.h> 22 23 #include "smc_pnet.h" 24 #include "smc_ib.h" 25 #include "smc_core.h" 26 #include "smc_wr.h" 27 #include "smc.h" 28 #include "smc_netlink.h" 29 30 #define SMC_MAX_CQE 32766 /* max. # of completion queue elements */ 31 32 #define SMC_QP_MIN_RNR_TIMER 5 33 #define SMC_QP_TIMEOUT 15 /* 4096 * 2 ** timeout usec */ 34 #define SMC_QP_RETRY_CNT 7 /* 7: infinite */ 35 #define SMC_QP_RNR_RETRY 7 /* 7: infinite */ 36 37 struct smc_ib_devices smc_ib_devices = { /* smc-registered ib devices */ 38 .mutex = __MUTEX_INITIALIZER(smc_ib_devices.mutex), 39 .list = LIST_HEAD_INIT(smc_ib_devices.list), 40 }; 41 42 u8 local_systemid[SMC_SYSTEMID_LEN]; /* unique system identifier */ 43 44 static int smc_ib_modify_qp_init(struct smc_link *lnk) 45 { 46 struct ib_qp_attr qp_attr; 47 48 memset(&qp_attr, 0, sizeof(qp_attr)); 49 qp_attr.qp_state = IB_QPS_INIT; 50 qp_attr.pkey_index = 0; 51 qp_attr.port_num = lnk->ibport; 52 qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE 53 | IB_ACCESS_REMOTE_WRITE; 54 return ib_modify_qp(lnk->roce_qp, &qp_attr, 55 IB_QP_STATE | IB_QP_PKEY_INDEX | 56 IB_QP_ACCESS_FLAGS | IB_QP_PORT); 57 } 58 59 static int smc_ib_modify_qp_rtr(struct smc_link *lnk) 60 { 61 enum ib_qp_attr_mask qp_attr_mask = 62 IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN | 63 IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER; 64 struct ib_qp_attr qp_attr; 65 66 memset(&qp_attr, 0, sizeof(qp_attr)); 67 qp_attr.qp_state = IB_QPS_RTR; 68 qp_attr.path_mtu = min(lnk->path_mtu, lnk->peer_mtu); 69 qp_attr.ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE; 70 rdma_ah_set_port_num(&qp_attr.ah_attr, lnk->ibport); 71 rdma_ah_set_grh(&qp_attr.ah_attr, NULL, 0, lnk->sgid_index, 1, 0); 72 rdma_ah_set_dgid_raw(&qp_attr.ah_attr, lnk->peer_gid); 73 memcpy(&qp_attr.ah_attr.roce.dmac, lnk->peer_mac, 74 sizeof(lnk->peer_mac)); 75 qp_attr.dest_qp_num = lnk->peer_qpn; 76 qp_attr.rq_psn = lnk->peer_psn; /* starting receive packet seq # */ 77 qp_attr.max_dest_rd_atomic = 1; /* max # of resources for incoming 78 * requests 79 */ 80 qp_attr.min_rnr_timer = SMC_QP_MIN_RNR_TIMER; 81 82 return ib_modify_qp(lnk->roce_qp, &qp_attr, qp_attr_mask); 83 } 84 85 int smc_ib_modify_qp_rts(struct smc_link *lnk) 86 { 87 struct ib_qp_attr qp_attr; 88 89 memset(&qp_attr, 0, sizeof(qp_attr)); 90 qp_attr.qp_state = IB_QPS_RTS; 91 qp_attr.timeout = SMC_QP_TIMEOUT; /* local ack timeout */ 92 qp_attr.retry_cnt = SMC_QP_RETRY_CNT; /* retry count */ 93 qp_attr.rnr_retry = SMC_QP_RNR_RETRY; /* RNR retries, 7=infinite */ 94 qp_attr.sq_psn = lnk->psn_initial; /* starting send packet seq # */ 95 qp_attr.max_rd_atomic = 1; /* # of outstanding RDMA reads and 96 * atomic ops allowed 97 */ 98 return ib_modify_qp(lnk->roce_qp, &qp_attr, 99 IB_QP_STATE | IB_QP_TIMEOUT | IB_QP_RETRY_CNT | 100 IB_QP_SQ_PSN | IB_QP_RNR_RETRY | 101 IB_QP_MAX_QP_RD_ATOMIC); 102 } 103 104 int smc_ib_modify_qp_reset(struct smc_link *lnk) 105 { 106 struct ib_qp_attr qp_attr; 107 108 memset(&qp_attr, 0, sizeof(qp_attr)); 109 qp_attr.qp_state = IB_QPS_RESET; 110 return ib_modify_qp(lnk->roce_qp, &qp_attr, IB_QP_STATE); 111 } 112 113 int smc_ib_ready_link(struct smc_link *lnk) 114 { 115 struct smc_link_group *lgr = smc_get_lgr(lnk); 116 int rc = 0; 117 118 rc = smc_ib_modify_qp_init(lnk); 119 if (rc) 120 goto out; 121 122 rc = smc_ib_modify_qp_rtr(lnk); 123 if (rc) 124 goto out; 125 smc_wr_remember_qp_attr(lnk); 126 rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_recv, 127 IB_CQ_SOLICITED_MASK); 128 if (rc) 129 goto out; 130 rc = smc_wr_rx_post_init(lnk); 131 if (rc) 132 goto out; 133 smc_wr_remember_qp_attr(lnk); 134 135 if (lgr->role == SMC_SERV) { 136 rc = smc_ib_modify_qp_rts(lnk); 137 if (rc) 138 goto out; 139 smc_wr_remember_qp_attr(lnk); 140 } 141 out: 142 return rc; 143 } 144 145 static int smc_ib_fill_mac(struct smc_ib_device *smcibdev, u8 ibport) 146 { 147 const struct ib_gid_attr *attr; 148 int rc; 149 150 attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, 0); 151 if (IS_ERR(attr)) 152 return -ENODEV; 153 154 rc = rdma_read_gid_l2_fields(attr, NULL, smcibdev->mac[ibport - 1]); 155 rdma_put_gid_attr(attr); 156 return rc; 157 } 158 159 /* Create an identifier unique for this instance of SMC-R. 160 * The MAC-address of the first active registered IB device 161 * plus a random 2-byte number is used to create this identifier. 162 * This name is delivered to the peer during connection initialization. 163 */ 164 static inline void smc_ib_define_local_systemid(struct smc_ib_device *smcibdev, 165 u8 ibport) 166 { 167 memcpy(&local_systemid[2], &smcibdev->mac[ibport - 1], 168 sizeof(smcibdev->mac[ibport - 1])); 169 } 170 171 bool smc_ib_is_valid_local_systemid(void) 172 { 173 return !is_zero_ether_addr(&local_systemid[2]); 174 } 175 176 static void smc_ib_init_local_systemid(void) 177 { 178 get_random_bytes(&local_systemid[0], 2); 179 } 180 181 bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport) 182 { 183 return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE; 184 } 185 186 /* determine the gid for an ib-device port and vlan id */ 187 int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport, 188 unsigned short vlan_id, u8 gid[], u8 *sgid_index) 189 { 190 const struct ib_gid_attr *attr; 191 const struct net_device *ndev; 192 int i; 193 194 for (i = 0; i < smcibdev->pattr[ibport - 1].gid_tbl_len; i++) { 195 attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, i); 196 if (IS_ERR(attr)) 197 continue; 198 199 rcu_read_lock(); 200 ndev = rdma_read_gid_attr_ndev_rcu(attr); 201 if (!IS_ERR(ndev) && 202 ((!vlan_id && !is_vlan_dev(ndev)) || 203 (vlan_id && is_vlan_dev(ndev) && 204 vlan_dev_vlan_id(ndev) == vlan_id)) && 205 attr->gid_type == IB_GID_TYPE_ROCE) { 206 rcu_read_unlock(); 207 if (gid) 208 memcpy(gid, &attr->gid, SMC_GID_SIZE); 209 if (sgid_index) 210 *sgid_index = attr->index; 211 rdma_put_gid_attr(attr); 212 return 0; 213 } 214 rcu_read_unlock(); 215 rdma_put_gid_attr(attr); 216 } 217 return -ENODEV; 218 } 219 220 static int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport) 221 { 222 int rc; 223 224 memset(&smcibdev->pattr[ibport - 1], 0, 225 sizeof(smcibdev->pattr[ibport - 1])); 226 rc = ib_query_port(smcibdev->ibdev, ibport, 227 &smcibdev->pattr[ibport - 1]); 228 if (rc) 229 goto out; 230 /* the SMC protocol requires specification of the RoCE MAC address */ 231 rc = smc_ib_fill_mac(smcibdev, ibport); 232 if (rc) 233 goto out; 234 if (!smc_ib_is_valid_local_systemid() && 235 smc_ib_port_active(smcibdev, ibport)) 236 /* create unique system identifier */ 237 smc_ib_define_local_systemid(smcibdev, ibport); 238 out: 239 return rc; 240 } 241 242 /* process context wrapper for might_sleep smc_ib_remember_port_attr */ 243 static void smc_ib_port_event_work(struct work_struct *work) 244 { 245 struct smc_ib_device *smcibdev = container_of( 246 work, struct smc_ib_device, port_event_work); 247 u8 port_idx; 248 249 for_each_set_bit(port_idx, &smcibdev->port_event_mask, SMC_MAX_PORTS) { 250 smc_ib_remember_port_attr(smcibdev, port_idx + 1); 251 clear_bit(port_idx, &smcibdev->port_event_mask); 252 if (!smc_ib_port_active(smcibdev, port_idx + 1)) { 253 set_bit(port_idx, smcibdev->ports_going_away); 254 smcr_port_err(smcibdev, port_idx + 1); 255 } else { 256 clear_bit(port_idx, smcibdev->ports_going_away); 257 smcr_port_add(smcibdev, port_idx + 1); 258 } 259 } 260 } 261 262 /* can be called in IRQ context */ 263 static void smc_ib_global_event_handler(struct ib_event_handler *handler, 264 struct ib_event *ibevent) 265 { 266 struct smc_ib_device *smcibdev; 267 bool schedule = false; 268 u8 port_idx; 269 270 smcibdev = container_of(handler, struct smc_ib_device, event_handler); 271 272 switch (ibevent->event) { 273 case IB_EVENT_DEVICE_FATAL: 274 /* terminate all ports on device */ 275 for (port_idx = 0; port_idx < SMC_MAX_PORTS; port_idx++) { 276 set_bit(port_idx, &smcibdev->port_event_mask); 277 if (!test_and_set_bit(port_idx, 278 smcibdev->ports_going_away)) 279 schedule = true; 280 } 281 if (schedule) 282 schedule_work(&smcibdev->port_event_work); 283 break; 284 case IB_EVENT_PORT_ACTIVE: 285 port_idx = ibevent->element.port_num - 1; 286 if (port_idx >= SMC_MAX_PORTS) 287 break; 288 set_bit(port_idx, &smcibdev->port_event_mask); 289 if (test_and_clear_bit(port_idx, smcibdev->ports_going_away)) 290 schedule_work(&smcibdev->port_event_work); 291 break; 292 case IB_EVENT_PORT_ERR: 293 port_idx = ibevent->element.port_num - 1; 294 if (port_idx >= SMC_MAX_PORTS) 295 break; 296 set_bit(port_idx, &smcibdev->port_event_mask); 297 if (!test_and_set_bit(port_idx, smcibdev->ports_going_away)) 298 schedule_work(&smcibdev->port_event_work); 299 break; 300 case IB_EVENT_GID_CHANGE: 301 port_idx = ibevent->element.port_num - 1; 302 if (port_idx >= SMC_MAX_PORTS) 303 break; 304 set_bit(port_idx, &smcibdev->port_event_mask); 305 schedule_work(&smcibdev->port_event_work); 306 break; 307 default: 308 break; 309 } 310 } 311 312 void smc_ib_dealloc_protection_domain(struct smc_link *lnk) 313 { 314 if (lnk->roce_pd) 315 ib_dealloc_pd(lnk->roce_pd); 316 lnk->roce_pd = NULL; 317 } 318 319 int smc_ib_create_protection_domain(struct smc_link *lnk) 320 { 321 int rc; 322 323 lnk->roce_pd = ib_alloc_pd(lnk->smcibdev->ibdev, 0); 324 rc = PTR_ERR_OR_ZERO(lnk->roce_pd); 325 if (IS_ERR(lnk->roce_pd)) 326 lnk->roce_pd = NULL; 327 return rc; 328 } 329 330 static bool smcr_diag_is_dev_critical(struct smc_lgr_list *smc_lgr, 331 struct smc_ib_device *smcibdev) 332 { 333 struct smc_link_group *lgr; 334 bool rc = false; 335 int i; 336 337 spin_lock_bh(&smc_lgr->lock); 338 list_for_each_entry(lgr, &smc_lgr->list, list) { 339 if (lgr->is_smcd) 340 continue; 341 for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { 342 if (lgr->lnk[i].state == SMC_LNK_UNUSED || 343 lgr->lnk[i].smcibdev != smcibdev) 344 continue; 345 if (lgr->type == SMC_LGR_SINGLE || 346 lgr->type == SMC_LGR_ASYMMETRIC_LOCAL) { 347 rc = true; 348 goto out; 349 } 350 } 351 } 352 out: 353 spin_unlock_bh(&smc_lgr->lock); 354 return rc; 355 } 356 357 static int smc_nl_handle_dev_port(struct sk_buff *skb, 358 struct ib_device *ibdev, 359 struct smc_ib_device *smcibdev, 360 int port) 361 { 362 char smc_pnet[SMC_MAX_PNETID_LEN + 1]; 363 struct nlattr *port_attrs; 364 unsigned char port_state; 365 int lnk_count = 0; 366 367 port_attrs = nla_nest_start(skb, SMC_NLA_DEV_PORT + port); 368 if (!port_attrs) 369 goto errout; 370 371 if (nla_put_u8(skb, SMC_NLA_DEV_PORT_PNET_USR, 372 smcibdev->pnetid_by_user[port])) 373 goto errattr; 374 memcpy(smc_pnet, &smcibdev->pnetid[port], SMC_MAX_PNETID_LEN); 375 smc_pnet[SMC_MAX_PNETID_LEN] = 0; 376 if (nla_put_string(skb, SMC_NLA_DEV_PORT_PNETID, smc_pnet)) 377 goto errattr; 378 if (nla_put_u32(skb, SMC_NLA_DEV_PORT_NETDEV, 379 smcibdev->ndev_ifidx[port])) 380 goto errattr; 381 if (nla_put_u8(skb, SMC_NLA_DEV_PORT_VALID, 1)) 382 goto errattr; 383 port_state = smc_ib_port_active(smcibdev, port + 1); 384 if (nla_put_u8(skb, SMC_NLA_DEV_PORT_STATE, port_state)) 385 goto errattr; 386 lnk_count = atomic_read(&smcibdev->lnk_cnt_by_port[port]); 387 if (nla_put_u32(skb, SMC_NLA_DEV_PORT_LNK_CNT, lnk_count)) 388 goto errattr; 389 nla_nest_end(skb, port_attrs); 390 return 0; 391 errattr: 392 nla_nest_cancel(skb, port_attrs); 393 errout: 394 return -EMSGSIZE; 395 } 396 397 static bool smc_nl_handle_pci_values(const struct smc_pci_dev *smc_pci_dev, 398 struct sk_buff *skb) 399 { 400 if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev->pci_fid)) 401 return false; 402 if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev->pci_pchid)) 403 return false; 404 if (nla_put_u16(skb, SMC_NLA_DEV_PCI_VENDOR, smc_pci_dev->pci_vendor)) 405 return false; 406 if (nla_put_u16(skb, SMC_NLA_DEV_PCI_DEVICE, smc_pci_dev->pci_device)) 407 return false; 408 if (nla_put_string(skb, SMC_NLA_DEV_PCI_ID, smc_pci_dev->pci_id)) 409 return false; 410 return true; 411 } 412 413 static int smc_nl_handle_smcr_dev(struct smc_ib_device *smcibdev, 414 struct sk_buff *skb, 415 struct netlink_callback *cb) 416 { 417 char smc_ibname[IB_DEVICE_NAME_MAX]; 418 struct smc_pci_dev smc_pci_dev; 419 struct pci_dev *pci_dev; 420 unsigned char is_crit; 421 struct nlattr *attrs; 422 void *nlh; 423 int i; 424 425 nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 426 &smc_gen_nl_family, NLM_F_MULTI, 427 SMC_NETLINK_GET_DEV_SMCR); 428 if (!nlh) 429 goto errmsg; 430 attrs = nla_nest_start(skb, SMC_GEN_DEV_SMCR); 431 if (!attrs) 432 goto errout; 433 is_crit = smcr_diag_is_dev_critical(&smc_lgr_list, smcibdev); 434 if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, is_crit)) 435 goto errattr; 436 if (smcibdev->ibdev->dev.parent) { 437 memset(&smc_pci_dev, 0, sizeof(smc_pci_dev)); 438 pci_dev = to_pci_dev(smcibdev->ibdev->dev.parent); 439 smc_set_pci_values(pci_dev, &smc_pci_dev); 440 if (!smc_nl_handle_pci_values(&smc_pci_dev, skb)) 441 goto errattr; 442 } 443 snprintf(smc_ibname, sizeof(smc_ibname), "%s", smcibdev->ibdev->name); 444 if (nla_put_string(skb, SMC_NLA_DEV_IB_NAME, smc_ibname)) 445 goto errattr; 446 for (i = 1; i <= SMC_MAX_PORTS; i++) { 447 if (!rdma_is_port_valid(smcibdev->ibdev, i)) 448 continue; 449 if (smc_nl_handle_dev_port(skb, smcibdev->ibdev, 450 smcibdev, i - 1)) 451 goto errattr; 452 } 453 454 nla_nest_end(skb, attrs); 455 genlmsg_end(skb, nlh); 456 return 0; 457 458 errattr: 459 nla_nest_cancel(skb, attrs); 460 errout: 461 genlmsg_cancel(skb, nlh); 462 errmsg: 463 return -EMSGSIZE; 464 } 465 466 static void smc_nl_prep_smcr_dev(struct smc_ib_devices *dev_list, 467 struct sk_buff *skb, 468 struct netlink_callback *cb) 469 { 470 struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); 471 struct smc_ib_device *smcibdev; 472 int snum = cb_ctx->pos[0]; 473 int num = 0; 474 475 mutex_lock(&dev_list->mutex); 476 list_for_each_entry(smcibdev, &dev_list->list, list) { 477 if (num < snum) 478 goto next; 479 if (smc_nl_handle_smcr_dev(smcibdev, skb, cb)) 480 goto errout; 481 next: 482 num++; 483 } 484 errout: 485 mutex_unlock(&dev_list->mutex); 486 cb_ctx->pos[0] = num; 487 } 488 489 int smcr_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb) 490 { 491 smc_nl_prep_smcr_dev(&smc_ib_devices, skb, cb); 492 return skb->len; 493 } 494 495 static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv) 496 { 497 struct smc_link *lnk = (struct smc_link *)priv; 498 struct smc_ib_device *smcibdev = lnk->smcibdev; 499 u8 port_idx; 500 501 switch (ibevent->event) { 502 case IB_EVENT_QP_FATAL: 503 case IB_EVENT_QP_ACCESS_ERR: 504 port_idx = ibevent->element.qp->port - 1; 505 if (port_idx >= SMC_MAX_PORTS) 506 break; 507 set_bit(port_idx, &smcibdev->port_event_mask); 508 if (!test_and_set_bit(port_idx, smcibdev->ports_going_away)) 509 schedule_work(&smcibdev->port_event_work); 510 break; 511 default: 512 break; 513 } 514 } 515 516 void smc_ib_destroy_queue_pair(struct smc_link *lnk) 517 { 518 if (lnk->roce_qp) 519 ib_destroy_qp(lnk->roce_qp); 520 lnk->roce_qp = NULL; 521 } 522 523 /* create a queue pair within the protection domain for a link */ 524 int smc_ib_create_queue_pair(struct smc_link *lnk) 525 { 526 struct ib_qp_init_attr qp_attr = { 527 .event_handler = smc_ib_qp_event_handler, 528 .qp_context = lnk, 529 .send_cq = lnk->smcibdev->roce_cq_send, 530 .recv_cq = lnk->smcibdev->roce_cq_recv, 531 .srq = NULL, 532 .cap = { 533 /* include unsolicited rdma_writes as well, 534 * there are max. 2 RDMA_WRITE per 1 WR_SEND 535 */ 536 .max_send_wr = SMC_WR_BUF_CNT * 3, 537 .max_recv_wr = SMC_WR_BUF_CNT * 3, 538 .max_send_sge = SMC_IB_MAX_SEND_SGE, 539 .max_recv_sge = 1, 540 }, 541 .sq_sig_type = IB_SIGNAL_REQ_WR, 542 .qp_type = IB_QPT_RC, 543 }; 544 int rc; 545 546 lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr); 547 rc = PTR_ERR_OR_ZERO(lnk->roce_qp); 548 if (IS_ERR(lnk->roce_qp)) 549 lnk->roce_qp = NULL; 550 else 551 smc_wr_remember_qp_attr(lnk); 552 return rc; 553 } 554 555 void smc_ib_put_memory_region(struct ib_mr *mr) 556 { 557 ib_dereg_mr(mr); 558 } 559 560 static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot, u8 link_idx) 561 { 562 unsigned int offset = 0; 563 int sg_num; 564 565 /* map the largest prefix of a dma mapped SG list */ 566 sg_num = ib_map_mr_sg(buf_slot->mr_rx[link_idx], 567 buf_slot->sgt[link_idx].sgl, 568 buf_slot->sgt[link_idx].orig_nents, 569 &offset, PAGE_SIZE); 570 571 return sg_num; 572 } 573 574 /* Allocate a memory region and map the dma mapped SG list of buf_slot */ 575 int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, 576 struct smc_buf_desc *buf_slot, u8 link_idx) 577 { 578 if (buf_slot->mr_rx[link_idx]) 579 return 0; /* already done */ 580 581 buf_slot->mr_rx[link_idx] = 582 ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 1 << buf_slot->order); 583 if (IS_ERR(buf_slot->mr_rx[link_idx])) { 584 int rc; 585 586 rc = PTR_ERR(buf_slot->mr_rx[link_idx]); 587 buf_slot->mr_rx[link_idx] = NULL; 588 return rc; 589 } 590 591 if (smc_ib_map_mr_sg(buf_slot, link_idx) != 1) 592 return -EINVAL; 593 594 return 0; 595 } 596 597 /* synchronize buffer usage for cpu access */ 598 void smc_ib_sync_sg_for_cpu(struct smc_link *lnk, 599 struct smc_buf_desc *buf_slot, 600 enum dma_data_direction data_direction) 601 { 602 struct scatterlist *sg; 603 unsigned int i; 604 605 /* for now there is just one DMA address */ 606 for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg, 607 buf_slot->sgt[lnk->link_idx].nents, i) { 608 if (!sg_dma_len(sg)) 609 break; 610 ib_dma_sync_single_for_cpu(lnk->smcibdev->ibdev, 611 sg_dma_address(sg), 612 sg_dma_len(sg), 613 data_direction); 614 } 615 } 616 617 /* synchronize buffer usage for device access */ 618 void smc_ib_sync_sg_for_device(struct smc_link *lnk, 619 struct smc_buf_desc *buf_slot, 620 enum dma_data_direction data_direction) 621 { 622 struct scatterlist *sg; 623 unsigned int i; 624 625 /* for now there is just one DMA address */ 626 for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg, 627 buf_slot->sgt[lnk->link_idx].nents, i) { 628 if (!sg_dma_len(sg)) 629 break; 630 ib_dma_sync_single_for_device(lnk->smcibdev->ibdev, 631 sg_dma_address(sg), 632 sg_dma_len(sg), 633 data_direction); 634 } 635 } 636 637 /* Map a new TX or RX buffer SG-table to DMA */ 638 int smc_ib_buf_map_sg(struct smc_link *lnk, 639 struct smc_buf_desc *buf_slot, 640 enum dma_data_direction data_direction) 641 { 642 int mapped_nents; 643 644 mapped_nents = ib_dma_map_sg(lnk->smcibdev->ibdev, 645 buf_slot->sgt[lnk->link_idx].sgl, 646 buf_slot->sgt[lnk->link_idx].orig_nents, 647 data_direction); 648 if (!mapped_nents) 649 return -ENOMEM; 650 651 return mapped_nents; 652 } 653 654 void smc_ib_buf_unmap_sg(struct smc_link *lnk, 655 struct smc_buf_desc *buf_slot, 656 enum dma_data_direction data_direction) 657 { 658 if (!buf_slot->sgt[lnk->link_idx].sgl->dma_address) 659 return; /* already unmapped */ 660 661 ib_dma_unmap_sg(lnk->smcibdev->ibdev, 662 buf_slot->sgt[lnk->link_idx].sgl, 663 buf_slot->sgt[lnk->link_idx].orig_nents, 664 data_direction); 665 buf_slot->sgt[lnk->link_idx].sgl->dma_address = 0; 666 } 667 668 long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) 669 { 670 struct ib_cq_init_attr cqattr = { 671 .cqe = SMC_MAX_CQE, .comp_vector = 0 }; 672 int cqe_size_order, smc_order; 673 long rc; 674 675 mutex_lock(&smcibdev->mutex); 676 rc = 0; 677 if (smcibdev->initialized) 678 goto out; 679 /* the calculated number of cq entries fits to mlx5 cq allocation */ 680 cqe_size_order = cache_line_size() == 128 ? 7 : 6; 681 smc_order = MAX_ORDER - cqe_size_order - 1; 682 if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE) 683 cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2; 684 smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev, 685 smc_wr_tx_cq_handler, NULL, 686 smcibdev, &cqattr); 687 rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_send); 688 if (IS_ERR(smcibdev->roce_cq_send)) { 689 smcibdev->roce_cq_send = NULL; 690 goto out; 691 } 692 smcibdev->roce_cq_recv = ib_create_cq(smcibdev->ibdev, 693 smc_wr_rx_cq_handler, NULL, 694 smcibdev, &cqattr); 695 rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_recv); 696 if (IS_ERR(smcibdev->roce_cq_recv)) { 697 smcibdev->roce_cq_recv = NULL; 698 goto err; 699 } 700 smc_wr_add_dev(smcibdev); 701 smcibdev->initialized = 1; 702 goto out; 703 704 err: 705 ib_destroy_cq(smcibdev->roce_cq_send); 706 out: 707 mutex_unlock(&smcibdev->mutex); 708 return rc; 709 } 710 711 static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev) 712 { 713 mutex_lock(&smcibdev->mutex); 714 if (!smcibdev->initialized) 715 goto out; 716 smcibdev->initialized = 0; 717 ib_destroy_cq(smcibdev->roce_cq_recv); 718 ib_destroy_cq(smcibdev->roce_cq_send); 719 smc_wr_remove_dev(smcibdev); 720 out: 721 mutex_unlock(&smcibdev->mutex); 722 } 723 724 static struct ib_client smc_ib_client; 725 726 static void smc_copy_netdev_ifindex(struct smc_ib_device *smcibdev, int port) 727 { 728 struct ib_device *ibdev = smcibdev->ibdev; 729 struct net_device *ndev; 730 731 if (!ibdev->ops.get_netdev) 732 return; 733 ndev = ibdev->ops.get_netdev(ibdev, port + 1); 734 if (ndev) { 735 smcibdev->ndev_ifidx[port] = ndev->ifindex; 736 dev_put(ndev); 737 } 738 } 739 740 void smc_ib_ndev_change(struct net_device *ndev, unsigned long event) 741 { 742 struct smc_ib_device *smcibdev; 743 struct ib_device *libdev; 744 struct net_device *lndev; 745 u8 port_cnt; 746 int i; 747 748 mutex_lock(&smc_ib_devices.mutex); 749 list_for_each_entry(smcibdev, &smc_ib_devices.list, list) { 750 port_cnt = smcibdev->ibdev->phys_port_cnt; 751 for (i = 0; i < min_t(size_t, port_cnt, SMC_MAX_PORTS); i++) { 752 libdev = smcibdev->ibdev; 753 if (!libdev->ops.get_netdev) 754 continue; 755 lndev = libdev->ops.get_netdev(libdev, i + 1); 756 if (lndev) 757 dev_put(lndev); 758 if (lndev != ndev) 759 continue; 760 if (event == NETDEV_REGISTER) 761 smcibdev->ndev_ifidx[i] = ndev->ifindex; 762 if (event == NETDEV_UNREGISTER) 763 smcibdev->ndev_ifidx[i] = 0; 764 } 765 } 766 mutex_unlock(&smc_ib_devices.mutex); 767 } 768 769 /* callback function for ib_register_client() */ 770 static int smc_ib_add_dev(struct ib_device *ibdev) 771 { 772 struct smc_ib_device *smcibdev; 773 u8 port_cnt; 774 int i; 775 776 if (ibdev->node_type != RDMA_NODE_IB_CA) 777 return -EOPNOTSUPP; 778 779 smcibdev = kzalloc(sizeof(*smcibdev), GFP_KERNEL); 780 if (!smcibdev) 781 return -ENOMEM; 782 783 smcibdev->ibdev = ibdev; 784 INIT_WORK(&smcibdev->port_event_work, smc_ib_port_event_work); 785 atomic_set(&smcibdev->lnk_cnt, 0); 786 init_waitqueue_head(&smcibdev->lnks_deleted); 787 mutex_init(&smcibdev->mutex); 788 mutex_lock(&smc_ib_devices.mutex); 789 list_add_tail(&smcibdev->list, &smc_ib_devices.list); 790 mutex_unlock(&smc_ib_devices.mutex); 791 ib_set_client_data(ibdev, &smc_ib_client, smcibdev); 792 INIT_IB_EVENT_HANDLER(&smcibdev->event_handler, smcibdev->ibdev, 793 smc_ib_global_event_handler); 794 ib_register_event_handler(&smcibdev->event_handler); 795 796 /* trigger reading of the port attributes */ 797 port_cnt = smcibdev->ibdev->phys_port_cnt; 798 pr_warn_ratelimited("smc: adding ib device %s with port count %d\n", 799 smcibdev->ibdev->name, port_cnt); 800 for (i = 0; 801 i < min_t(size_t, port_cnt, SMC_MAX_PORTS); 802 i++) { 803 set_bit(i, &smcibdev->port_event_mask); 804 /* determine pnetids of the port */ 805 if (smc_pnetid_by_dev_port(ibdev->dev.parent, i, 806 smcibdev->pnetid[i])) 807 smc_pnetid_by_table_ib(smcibdev, i + 1); 808 smc_copy_netdev_ifindex(smcibdev, i); 809 pr_warn_ratelimited("smc: ib device %s port %d has pnetid " 810 "%.16s%s\n", 811 smcibdev->ibdev->name, i + 1, 812 smcibdev->pnetid[i], 813 smcibdev->pnetid_by_user[i] ? 814 " (user defined)" : 815 ""); 816 } 817 schedule_work(&smcibdev->port_event_work); 818 return 0; 819 } 820 821 /* callback function for ib_unregister_client() */ 822 static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data) 823 { 824 struct smc_ib_device *smcibdev = client_data; 825 826 mutex_lock(&smc_ib_devices.mutex); 827 list_del_init(&smcibdev->list); /* remove from smc_ib_devices */ 828 mutex_unlock(&smc_ib_devices.mutex); 829 pr_warn_ratelimited("smc: removing ib device %s\n", 830 smcibdev->ibdev->name); 831 smc_smcr_terminate_all(smcibdev); 832 smc_ib_cleanup_per_ibdev(smcibdev); 833 ib_unregister_event_handler(&smcibdev->event_handler); 834 cancel_work_sync(&smcibdev->port_event_work); 835 kfree(smcibdev); 836 } 837 838 static struct ib_client smc_ib_client = { 839 .name = "smc_ib", 840 .add = smc_ib_add_dev, 841 .remove = smc_ib_remove_dev, 842 }; 843 844 int __init smc_ib_register_client(void) 845 { 846 smc_ib_init_local_systemid(); 847 return ib_register_client(&smc_ib_client); 848 } 849 850 void smc_ib_unregister_client(void) 851 { 852 ib_unregister_client(&smc_ib_client); 853 } 854