1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Shared Memory Communications over RDMA (SMC-R) and RoCE 4 * 5 * IB infrastructure: 6 * Establish SMC-R as an Infiniband Client to be notified about added and 7 * removed IB devices of type RDMA. 8 * Determine device and port characteristics for these IB devices. 9 * 10 * Copyright IBM Corp. 2016 11 * 12 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> 13 */ 14 15 #include <linux/etherdevice.h> 16 #include <linux/if_vlan.h> 17 #include <linux/random.h> 18 #include <linux/workqueue.h> 19 #include <linux/scatterlist.h> 20 #include <linux/wait.h> 21 #include <linux/mutex.h> 22 #include <linux/inetdevice.h> 23 #include <rdma/ib_verbs.h> 24 #include <rdma/ib_cache.h> 25 26 #include "smc_pnet.h" 27 #include "smc_ib.h" 28 #include "smc_core.h" 29 #include "smc_wr.h" 30 #include "smc.h" 31 #include "smc_netlink.h" 32 33 #define SMC_MAX_CQE 32766 /* max. # of completion queue elements */ 34 35 #define SMC_QP_MIN_RNR_TIMER 5 36 #define SMC_QP_TIMEOUT 15 /* 4096 * 2 ** timeout usec */ 37 #define SMC_QP_RETRY_CNT 7 /* 7: infinite */ 38 #define SMC_QP_RNR_RETRY 7 /* 7: infinite */ 39 40 struct smc_ib_devices smc_ib_devices = { /* smc-registered ib devices */ 41 .mutex = __MUTEX_INITIALIZER(smc_ib_devices.mutex), 42 .list = LIST_HEAD_INIT(smc_ib_devices.list), 43 }; 44 45 u8 local_systemid[SMC_SYSTEMID_LEN]; /* unique system identifier */ 46 47 static int smc_ib_modify_qp_init(struct smc_link *lnk) 48 { 49 struct ib_qp_attr qp_attr; 50 51 memset(&qp_attr, 0, sizeof(qp_attr)); 52 qp_attr.qp_state = IB_QPS_INIT; 53 qp_attr.pkey_index = 0; 54 qp_attr.port_num = lnk->ibport; 55 qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE 56 | IB_ACCESS_REMOTE_WRITE; 57 return ib_modify_qp(lnk->roce_qp, &qp_attr, 58 IB_QP_STATE | IB_QP_PKEY_INDEX | 59 IB_QP_ACCESS_FLAGS | IB_QP_PORT); 60 } 61 62 static int smc_ib_modify_qp_rtr(struct smc_link *lnk) 63 { 64 enum ib_qp_attr_mask qp_attr_mask = 65 IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN | 66 IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER; 67 struct ib_qp_attr qp_attr; 68 u8 hop_lim = 1; 69 70 memset(&qp_attr, 0, sizeof(qp_attr)); 71 qp_attr.qp_state = IB_QPS_RTR; 72 qp_attr.path_mtu = min(lnk->path_mtu, lnk->peer_mtu); 73 qp_attr.ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE; 74 rdma_ah_set_port_num(&qp_attr.ah_attr, lnk->ibport); 75 if (lnk->lgr->smc_version == SMC_V2 && lnk->lgr->uses_gateway) 76 hop_lim = IPV6_DEFAULT_HOPLIMIT; 77 rdma_ah_set_grh(&qp_attr.ah_attr, NULL, 0, lnk->sgid_index, hop_lim, 0); 78 rdma_ah_set_dgid_raw(&qp_attr.ah_attr, lnk->peer_gid); 79 if (lnk->lgr->smc_version == SMC_V2 && lnk->lgr->uses_gateway) 80 memcpy(&qp_attr.ah_attr.roce.dmac, lnk->lgr->nexthop_mac, 81 sizeof(lnk->lgr->nexthop_mac)); 82 else 83 memcpy(&qp_attr.ah_attr.roce.dmac, lnk->peer_mac, 84 sizeof(lnk->peer_mac)); 85 qp_attr.dest_qp_num = lnk->peer_qpn; 86 qp_attr.rq_psn = lnk->peer_psn; /* starting receive packet seq # */ 87 qp_attr.max_dest_rd_atomic = 1; /* max # of resources for incoming 88 * requests 89 */ 90 qp_attr.min_rnr_timer = SMC_QP_MIN_RNR_TIMER; 91 92 return ib_modify_qp(lnk->roce_qp, &qp_attr, qp_attr_mask); 93 } 94 95 int smc_ib_modify_qp_rts(struct smc_link *lnk) 96 { 97 struct ib_qp_attr qp_attr; 98 99 memset(&qp_attr, 0, sizeof(qp_attr)); 100 qp_attr.qp_state = IB_QPS_RTS; 101 qp_attr.timeout = SMC_QP_TIMEOUT; /* local ack timeout */ 102 qp_attr.retry_cnt = SMC_QP_RETRY_CNT; /* retry count */ 103 qp_attr.rnr_retry = SMC_QP_RNR_RETRY; /* RNR retries, 7=infinite */ 104 qp_attr.sq_psn = lnk->psn_initial; /* starting send packet seq # */ 105 qp_attr.max_rd_atomic = 1; /* # of outstanding RDMA reads and 106 * atomic ops allowed 107 */ 108 return ib_modify_qp(lnk->roce_qp, &qp_attr, 109 IB_QP_STATE | IB_QP_TIMEOUT | IB_QP_RETRY_CNT | 110 IB_QP_SQ_PSN | IB_QP_RNR_RETRY | 111 IB_QP_MAX_QP_RD_ATOMIC); 112 } 113 114 int smc_ib_modify_qp_error(struct smc_link *lnk) 115 { 116 struct ib_qp_attr qp_attr; 117 118 memset(&qp_attr, 0, sizeof(qp_attr)); 119 qp_attr.qp_state = IB_QPS_ERR; 120 return ib_modify_qp(lnk->roce_qp, &qp_attr, IB_QP_STATE); 121 } 122 123 int smc_ib_ready_link(struct smc_link *lnk) 124 { 125 struct smc_link_group *lgr = smc_get_lgr(lnk); 126 int rc = 0; 127 128 rc = smc_ib_modify_qp_init(lnk); 129 if (rc) 130 goto out; 131 132 rc = smc_ib_modify_qp_rtr(lnk); 133 if (rc) 134 goto out; 135 smc_wr_remember_qp_attr(lnk); 136 rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_recv, 137 IB_CQ_SOLICITED_MASK); 138 if (rc) 139 goto out; 140 rc = smc_wr_rx_post_init(lnk); 141 if (rc) 142 goto out; 143 smc_wr_remember_qp_attr(lnk); 144 145 if (lgr->role == SMC_SERV) { 146 rc = smc_ib_modify_qp_rts(lnk); 147 if (rc) 148 goto out; 149 smc_wr_remember_qp_attr(lnk); 150 } 151 out: 152 return rc; 153 } 154 155 static int smc_ib_fill_mac(struct smc_ib_device *smcibdev, u8 ibport) 156 { 157 const struct ib_gid_attr *attr; 158 int rc; 159 160 attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, 0); 161 if (IS_ERR(attr)) 162 return -ENODEV; 163 164 rc = rdma_read_gid_l2_fields(attr, NULL, smcibdev->mac[ibport - 1]); 165 rdma_put_gid_attr(attr); 166 return rc; 167 } 168 169 /* Create an identifier unique for this instance of SMC-R. 170 * The MAC-address of the first active registered IB device 171 * plus a random 2-byte number is used to create this identifier. 172 * This name is delivered to the peer during connection initialization. 173 */ 174 static inline void smc_ib_define_local_systemid(struct smc_ib_device *smcibdev, 175 u8 ibport) 176 { 177 memcpy(&local_systemid[2], &smcibdev->mac[ibport - 1], 178 sizeof(smcibdev->mac[ibport - 1])); 179 } 180 181 bool smc_ib_is_valid_local_systemid(void) 182 { 183 return !is_zero_ether_addr(&local_systemid[2]); 184 } 185 186 static void smc_ib_init_local_systemid(void) 187 { 188 get_random_bytes(&local_systemid[0], 2); 189 } 190 191 bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport) 192 { 193 return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE; 194 } 195 196 int smc_ib_find_route(struct net *net, __be32 saddr, __be32 daddr, 197 u8 nexthop_mac[], u8 *uses_gateway) 198 { 199 struct neighbour *neigh = NULL; 200 struct rtable *rt = NULL; 201 struct flowi4 fl4 = { 202 .saddr = saddr, 203 .daddr = daddr 204 }; 205 206 if (daddr == cpu_to_be32(INADDR_NONE)) 207 goto out; 208 rt = ip_route_output_flow(net, &fl4, NULL); 209 if (IS_ERR(rt)) 210 goto out; 211 if (rt->rt_uses_gateway && rt->rt_gw_family != AF_INET) 212 goto out_rt; 213 neigh = dst_neigh_lookup(&rt->dst, &fl4.daddr); 214 if (!neigh) 215 goto out_rt; 216 memcpy(nexthop_mac, neigh->ha, ETH_ALEN); 217 *uses_gateway = rt->rt_uses_gateway; 218 neigh_release(neigh); 219 ip_rt_put(rt); 220 return 0; 221 222 out_rt: 223 ip_rt_put(rt); 224 out: 225 return -ENOENT; 226 } 227 228 static int smc_ib_determine_gid_rcu(const struct net_device *ndev, 229 const struct ib_gid_attr *attr, 230 u8 gid[], u8 *sgid_index, 231 struct smc_init_info_smcrv2 *smcrv2) 232 { 233 if (!smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE) { 234 if (gid) 235 memcpy(gid, &attr->gid, SMC_GID_SIZE); 236 if (sgid_index) 237 *sgid_index = attr->index; 238 return 0; 239 } 240 if (smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP && 241 smc_ib_gid_to_ipv4((u8 *)&attr->gid) != cpu_to_be32(INADDR_NONE)) { 242 struct in_device *in_dev = __in_dev_get_rcu(ndev); 243 struct net *net = dev_net(ndev); 244 const struct in_ifaddr *ifa; 245 bool subnet_match = false; 246 247 if (!in_dev) 248 goto out; 249 in_dev_for_each_ifa_rcu(ifa, in_dev) { 250 if (!inet_ifa_match(smcrv2->saddr, ifa)) 251 continue; 252 subnet_match = true; 253 break; 254 } 255 if (!subnet_match) 256 goto out; 257 if (smcrv2->daddr && smc_ib_find_route(net, smcrv2->saddr, 258 smcrv2->daddr, 259 smcrv2->nexthop_mac, 260 &smcrv2->uses_gateway)) 261 goto out; 262 263 if (gid) 264 memcpy(gid, &attr->gid, SMC_GID_SIZE); 265 if (sgid_index) 266 *sgid_index = attr->index; 267 return 0; 268 } 269 out: 270 return -ENODEV; 271 } 272 273 /* determine the gid for an ib-device port and vlan id */ 274 int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport, 275 unsigned short vlan_id, u8 gid[], u8 *sgid_index, 276 struct smc_init_info_smcrv2 *smcrv2) 277 { 278 const struct ib_gid_attr *attr; 279 const struct net_device *ndev; 280 int i; 281 282 for (i = 0; i < smcibdev->pattr[ibport - 1].gid_tbl_len; i++) { 283 attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, i); 284 if (IS_ERR(attr)) 285 continue; 286 287 rcu_read_lock(); 288 ndev = rdma_read_gid_attr_ndev_rcu(attr); 289 if (!IS_ERR(ndev) && 290 ((!vlan_id && !is_vlan_dev(ndev)) || 291 (vlan_id && is_vlan_dev(ndev) && 292 vlan_dev_vlan_id(ndev) == vlan_id))) { 293 if (!smc_ib_determine_gid_rcu(ndev, attr, gid, 294 sgid_index, smcrv2)) { 295 rcu_read_unlock(); 296 rdma_put_gid_attr(attr); 297 return 0; 298 } 299 } 300 rcu_read_unlock(); 301 rdma_put_gid_attr(attr); 302 } 303 return -ENODEV; 304 } 305 306 /* check if gid is still defined on smcibdev */ 307 static bool smc_ib_check_link_gid(u8 gid[SMC_GID_SIZE], bool smcrv2, 308 struct smc_ib_device *smcibdev, u8 ibport) 309 { 310 const struct ib_gid_attr *attr; 311 bool rc = false; 312 int i; 313 314 for (i = 0; !rc && i < smcibdev->pattr[ibport - 1].gid_tbl_len; i++) { 315 attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, i); 316 if (IS_ERR(attr)) 317 continue; 318 319 rcu_read_lock(); 320 if ((!smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE) || 321 (smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP && 322 !(ipv6_addr_type((const struct in6_addr *)&attr->gid) 323 & IPV6_ADDR_LINKLOCAL))) 324 if (!memcmp(gid, &attr->gid, SMC_GID_SIZE)) 325 rc = true; 326 rcu_read_unlock(); 327 rdma_put_gid_attr(attr); 328 } 329 return rc; 330 } 331 332 /* check all links if the gid is still defined on smcibdev */ 333 static void smc_ib_gid_check(struct smc_ib_device *smcibdev, u8 ibport) 334 { 335 struct smc_link_group *lgr; 336 int i; 337 338 spin_lock_bh(&smc_lgr_list.lock); 339 list_for_each_entry(lgr, &smc_lgr_list.list, list) { 340 if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id, 341 SMC_MAX_PNETID_LEN)) 342 continue; /* lgr is not affected */ 343 if (list_empty(&lgr->list)) 344 continue; 345 for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { 346 if (lgr->lnk[i].state == SMC_LNK_UNUSED || 347 lgr->lnk[i].smcibdev != smcibdev) 348 continue; 349 if (!smc_ib_check_link_gid(lgr->lnk[i].gid, 350 lgr->smc_version == SMC_V2, 351 smcibdev, ibport)) 352 smcr_port_err(smcibdev, ibport); 353 } 354 } 355 spin_unlock_bh(&smc_lgr_list.lock); 356 } 357 358 static int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport) 359 { 360 int rc; 361 362 memset(&smcibdev->pattr[ibport - 1], 0, 363 sizeof(smcibdev->pattr[ibport - 1])); 364 rc = ib_query_port(smcibdev->ibdev, ibport, 365 &smcibdev->pattr[ibport - 1]); 366 if (rc) 367 goto out; 368 /* the SMC protocol requires specification of the RoCE MAC address */ 369 rc = smc_ib_fill_mac(smcibdev, ibport); 370 if (rc) 371 goto out; 372 if (!smc_ib_is_valid_local_systemid() && 373 smc_ib_port_active(smcibdev, ibport)) 374 /* create unique system identifier */ 375 smc_ib_define_local_systemid(smcibdev, ibport); 376 out: 377 return rc; 378 } 379 380 /* process context wrapper for might_sleep smc_ib_remember_port_attr */ 381 static void smc_ib_port_event_work(struct work_struct *work) 382 { 383 struct smc_ib_device *smcibdev = container_of( 384 work, struct smc_ib_device, port_event_work); 385 u8 port_idx; 386 387 for_each_set_bit(port_idx, &smcibdev->port_event_mask, SMC_MAX_PORTS) { 388 smc_ib_remember_port_attr(smcibdev, port_idx + 1); 389 clear_bit(port_idx, &smcibdev->port_event_mask); 390 if (!smc_ib_port_active(smcibdev, port_idx + 1)) { 391 set_bit(port_idx, smcibdev->ports_going_away); 392 smcr_port_err(smcibdev, port_idx + 1); 393 } else { 394 clear_bit(port_idx, smcibdev->ports_going_away); 395 smcr_port_add(smcibdev, port_idx + 1); 396 smc_ib_gid_check(smcibdev, port_idx + 1); 397 } 398 } 399 } 400 401 /* can be called in IRQ context */ 402 static void smc_ib_global_event_handler(struct ib_event_handler *handler, 403 struct ib_event *ibevent) 404 { 405 struct smc_ib_device *smcibdev; 406 bool schedule = false; 407 u8 port_idx; 408 409 smcibdev = container_of(handler, struct smc_ib_device, event_handler); 410 411 switch (ibevent->event) { 412 case IB_EVENT_DEVICE_FATAL: 413 /* terminate all ports on device */ 414 for (port_idx = 0; port_idx < SMC_MAX_PORTS; port_idx++) { 415 set_bit(port_idx, &smcibdev->port_event_mask); 416 if (!test_and_set_bit(port_idx, 417 smcibdev->ports_going_away)) 418 schedule = true; 419 } 420 if (schedule) 421 schedule_work(&smcibdev->port_event_work); 422 break; 423 case IB_EVENT_PORT_ACTIVE: 424 port_idx = ibevent->element.port_num - 1; 425 if (port_idx >= SMC_MAX_PORTS) 426 break; 427 set_bit(port_idx, &smcibdev->port_event_mask); 428 if (test_and_clear_bit(port_idx, smcibdev->ports_going_away)) 429 schedule_work(&smcibdev->port_event_work); 430 break; 431 case IB_EVENT_PORT_ERR: 432 port_idx = ibevent->element.port_num - 1; 433 if (port_idx >= SMC_MAX_PORTS) 434 break; 435 set_bit(port_idx, &smcibdev->port_event_mask); 436 if (!test_and_set_bit(port_idx, smcibdev->ports_going_away)) 437 schedule_work(&smcibdev->port_event_work); 438 break; 439 case IB_EVENT_GID_CHANGE: 440 port_idx = ibevent->element.port_num - 1; 441 if (port_idx >= SMC_MAX_PORTS) 442 break; 443 set_bit(port_idx, &smcibdev->port_event_mask); 444 schedule_work(&smcibdev->port_event_work); 445 break; 446 default: 447 break; 448 } 449 } 450 451 void smc_ib_dealloc_protection_domain(struct smc_link *lnk) 452 { 453 if (lnk->roce_pd) 454 ib_dealloc_pd(lnk->roce_pd); 455 lnk->roce_pd = NULL; 456 } 457 458 int smc_ib_create_protection_domain(struct smc_link *lnk) 459 { 460 int rc; 461 462 lnk->roce_pd = ib_alloc_pd(lnk->smcibdev->ibdev, 0); 463 rc = PTR_ERR_OR_ZERO(lnk->roce_pd); 464 if (IS_ERR(lnk->roce_pd)) 465 lnk->roce_pd = NULL; 466 return rc; 467 } 468 469 static bool smcr_diag_is_dev_critical(struct smc_lgr_list *smc_lgr, 470 struct smc_ib_device *smcibdev) 471 { 472 struct smc_link_group *lgr; 473 bool rc = false; 474 int i; 475 476 spin_lock_bh(&smc_lgr->lock); 477 list_for_each_entry(lgr, &smc_lgr->list, list) { 478 if (lgr->is_smcd) 479 continue; 480 for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { 481 if (lgr->lnk[i].state == SMC_LNK_UNUSED || 482 lgr->lnk[i].smcibdev != smcibdev) 483 continue; 484 if (lgr->type == SMC_LGR_SINGLE || 485 lgr->type == SMC_LGR_ASYMMETRIC_LOCAL) { 486 rc = true; 487 goto out; 488 } 489 } 490 } 491 out: 492 spin_unlock_bh(&smc_lgr->lock); 493 return rc; 494 } 495 496 static int smc_nl_handle_dev_port(struct sk_buff *skb, 497 struct ib_device *ibdev, 498 struct smc_ib_device *smcibdev, 499 int port) 500 { 501 char smc_pnet[SMC_MAX_PNETID_LEN + 1]; 502 struct nlattr *port_attrs; 503 unsigned char port_state; 504 int lnk_count = 0; 505 506 port_attrs = nla_nest_start(skb, SMC_NLA_DEV_PORT + port); 507 if (!port_attrs) 508 goto errout; 509 510 if (nla_put_u8(skb, SMC_NLA_DEV_PORT_PNET_USR, 511 smcibdev->pnetid_by_user[port])) 512 goto errattr; 513 memcpy(smc_pnet, &smcibdev->pnetid[port], SMC_MAX_PNETID_LEN); 514 smc_pnet[SMC_MAX_PNETID_LEN] = 0; 515 if (nla_put_string(skb, SMC_NLA_DEV_PORT_PNETID, smc_pnet)) 516 goto errattr; 517 if (nla_put_u32(skb, SMC_NLA_DEV_PORT_NETDEV, 518 smcibdev->ndev_ifidx[port])) 519 goto errattr; 520 if (nla_put_u8(skb, SMC_NLA_DEV_PORT_VALID, 1)) 521 goto errattr; 522 port_state = smc_ib_port_active(smcibdev, port + 1); 523 if (nla_put_u8(skb, SMC_NLA_DEV_PORT_STATE, port_state)) 524 goto errattr; 525 lnk_count = atomic_read(&smcibdev->lnk_cnt_by_port[port]); 526 if (nla_put_u32(skb, SMC_NLA_DEV_PORT_LNK_CNT, lnk_count)) 527 goto errattr; 528 nla_nest_end(skb, port_attrs); 529 return 0; 530 errattr: 531 nla_nest_cancel(skb, port_attrs); 532 errout: 533 return -EMSGSIZE; 534 } 535 536 static bool smc_nl_handle_pci_values(const struct smc_pci_dev *smc_pci_dev, 537 struct sk_buff *skb) 538 { 539 if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev->pci_fid)) 540 return false; 541 if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev->pci_pchid)) 542 return false; 543 if (nla_put_u16(skb, SMC_NLA_DEV_PCI_VENDOR, smc_pci_dev->pci_vendor)) 544 return false; 545 if (nla_put_u16(skb, SMC_NLA_DEV_PCI_DEVICE, smc_pci_dev->pci_device)) 546 return false; 547 if (nla_put_string(skb, SMC_NLA_DEV_PCI_ID, smc_pci_dev->pci_id)) 548 return false; 549 return true; 550 } 551 552 static int smc_nl_handle_smcr_dev(struct smc_ib_device *smcibdev, 553 struct sk_buff *skb, 554 struct netlink_callback *cb) 555 { 556 char smc_ibname[IB_DEVICE_NAME_MAX]; 557 struct smc_pci_dev smc_pci_dev; 558 struct pci_dev *pci_dev; 559 unsigned char is_crit; 560 struct nlattr *attrs; 561 void *nlh; 562 int i; 563 564 nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 565 &smc_gen_nl_family, NLM_F_MULTI, 566 SMC_NETLINK_GET_DEV_SMCR); 567 if (!nlh) 568 goto errmsg; 569 attrs = nla_nest_start(skb, SMC_GEN_DEV_SMCR); 570 if (!attrs) 571 goto errout; 572 is_crit = smcr_diag_is_dev_critical(&smc_lgr_list, smcibdev); 573 if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, is_crit)) 574 goto errattr; 575 if (smcibdev->ibdev->dev.parent) { 576 memset(&smc_pci_dev, 0, sizeof(smc_pci_dev)); 577 pci_dev = to_pci_dev(smcibdev->ibdev->dev.parent); 578 smc_set_pci_values(pci_dev, &smc_pci_dev); 579 if (!smc_nl_handle_pci_values(&smc_pci_dev, skb)) 580 goto errattr; 581 } 582 snprintf(smc_ibname, sizeof(smc_ibname), "%s", smcibdev->ibdev->name); 583 if (nla_put_string(skb, SMC_NLA_DEV_IB_NAME, smc_ibname)) 584 goto errattr; 585 for (i = 1; i <= SMC_MAX_PORTS; i++) { 586 if (!rdma_is_port_valid(smcibdev->ibdev, i)) 587 continue; 588 if (smc_nl_handle_dev_port(skb, smcibdev->ibdev, 589 smcibdev, i - 1)) 590 goto errattr; 591 } 592 593 nla_nest_end(skb, attrs); 594 genlmsg_end(skb, nlh); 595 return 0; 596 597 errattr: 598 nla_nest_cancel(skb, attrs); 599 errout: 600 genlmsg_cancel(skb, nlh); 601 errmsg: 602 return -EMSGSIZE; 603 } 604 605 static void smc_nl_prep_smcr_dev(struct smc_ib_devices *dev_list, 606 struct sk_buff *skb, 607 struct netlink_callback *cb) 608 { 609 struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); 610 struct smc_ib_device *smcibdev; 611 int snum = cb_ctx->pos[0]; 612 int num = 0; 613 614 mutex_lock(&dev_list->mutex); 615 list_for_each_entry(smcibdev, &dev_list->list, list) { 616 if (num < snum) 617 goto next; 618 if (smc_nl_handle_smcr_dev(smcibdev, skb, cb)) 619 goto errout; 620 next: 621 num++; 622 } 623 errout: 624 mutex_unlock(&dev_list->mutex); 625 cb_ctx->pos[0] = num; 626 } 627 628 int smcr_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb) 629 { 630 smc_nl_prep_smcr_dev(&smc_ib_devices, skb, cb); 631 return skb->len; 632 } 633 634 static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv) 635 { 636 struct smc_link *lnk = (struct smc_link *)priv; 637 struct smc_ib_device *smcibdev = lnk->smcibdev; 638 u8 port_idx; 639 640 switch (ibevent->event) { 641 case IB_EVENT_QP_FATAL: 642 case IB_EVENT_QP_ACCESS_ERR: 643 port_idx = ibevent->element.qp->port - 1; 644 if (port_idx >= SMC_MAX_PORTS) 645 break; 646 set_bit(port_idx, &smcibdev->port_event_mask); 647 if (!test_and_set_bit(port_idx, smcibdev->ports_going_away)) 648 schedule_work(&smcibdev->port_event_work); 649 break; 650 default: 651 break; 652 } 653 } 654 655 void smc_ib_destroy_queue_pair(struct smc_link *lnk) 656 { 657 if (lnk->roce_qp) 658 ib_destroy_qp(lnk->roce_qp); 659 lnk->roce_qp = NULL; 660 } 661 662 /* create a queue pair within the protection domain for a link */ 663 int smc_ib_create_queue_pair(struct smc_link *lnk) 664 { 665 struct ib_qp_init_attr qp_attr = { 666 .event_handler = smc_ib_qp_event_handler, 667 .qp_context = lnk, 668 .send_cq = lnk->smcibdev->roce_cq_send, 669 .recv_cq = lnk->smcibdev->roce_cq_recv, 670 .srq = NULL, 671 .cap = { 672 /* include unsolicited rdma_writes as well, 673 * there are max. 2 RDMA_WRITE per 1 WR_SEND 674 */ 675 .max_send_wr = SMC_WR_BUF_CNT * 3, 676 .max_recv_wr = SMC_WR_BUF_CNT * 3, 677 .max_send_sge = SMC_IB_MAX_SEND_SGE, 678 .max_recv_sge = lnk->wr_rx_sge_cnt, 679 .max_inline_data = 0, 680 }, 681 .sq_sig_type = IB_SIGNAL_REQ_WR, 682 .qp_type = IB_QPT_RC, 683 }; 684 int rc; 685 686 lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr); 687 rc = PTR_ERR_OR_ZERO(lnk->roce_qp); 688 if (IS_ERR(lnk->roce_qp)) 689 lnk->roce_qp = NULL; 690 else 691 smc_wr_remember_qp_attr(lnk); 692 return rc; 693 } 694 695 void smc_ib_put_memory_region(struct ib_mr *mr) 696 { 697 ib_dereg_mr(mr); 698 } 699 700 static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot, u8 link_idx) 701 { 702 unsigned int offset = 0; 703 int sg_num; 704 705 /* map the largest prefix of a dma mapped SG list */ 706 sg_num = ib_map_mr_sg(buf_slot->mr[link_idx], 707 buf_slot->sgt[link_idx].sgl, 708 buf_slot->sgt[link_idx].orig_nents, 709 &offset, PAGE_SIZE); 710 711 return sg_num; 712 } 713 714 /* Allocate a memory region and map the dma mapped SG list of buf_slot */ 715 int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, 716 struct smc_buf_desc *buf_slot, u8 link_idx) 717 { 718 if (buf_slot->mr[link_idx]) 719 return 0; /* already done */ 720 721 buf_slot->mr[link_idx] = 722 ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 1 << buf_slot->order); 723 if (IS_ERR(buf_slot->mr[link_idx])) { 724 int rc; 725 726 rc = PTR_ERR(buf_slot->mr[link_idx]); 727 buf_slot->mr[link_idx] = NULL; 728 return rc; 729 } 730 731 if (smc_ib_map_mr_sg(buf_slot, link_idx) != 732 buf_slot->sgt[link_idx].orig_nents) 733 return -EINVAL; 734 735 return 0; 736 } 737 738 bool smc_ib_is_sg_need_sync(struct smc_link *lnk, 739 struct smc_buf_desc *buf_slot) 740 { 741 struct scatterlist *sg; 742 unsigned int i; 743 bool ret = false; 744 745 /* for now there is just one DMA address */ 746 for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg, 747 buf_slot->sgt[lnk->link_idx].nents, i) { 748 if (!sg_dma_len(sg)) 749 break; 750 if (dma_need_sync(lnk->smcibdev->ibdev->dma_device, 751 sg_dma_address(sg))) { 752 ret = true; 753 goto out; 754 } 755 } 756 757 out: 758 return ret; 759 } 760 761 /* synchronize buffer usage for cpu access */ 762 void smc_ib_sync_sg_for_cpu(struct smc_link *lnk, 763 struct smc_buf_desc *buf_slot, 764 enum dma_data_direction data_direction) 765 { 766 struct scatterlist *sg; 767 unsigned int i; 768 769 if (!(buf_slot->is_dma_need_sync & (1U << lnk->link_idx))) 770 return; 771 772 /* for now there is just one DMA address */ 773 for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg, 774 buf_slot->sgt[lnk->link_idx].nents, i) { 775 if (!sg_dma_len(sg)) 776 break; 777 ib_dma_sync_single_for_cpu(lnk->smcibdev->ibdev, 778 sg_dma_address(sg), 779 sg_dma_len(sg), 780 data_direction); 781 } 782 } 783 784 /* synchronize buffer usage for device access */ 785 void smc_ib_sync_sg_for_device(struct smc_link *lnk, 786 struct smc_buf_desc *buf_slot, 787 enum dma_data_direction data_direction) 788 { 789 struct scatterlist *sg; 790 unsigned int i; 791 792 if (!(buf_slot->is_dma_need_sync & (1U << lnk->link_idx))) 793 return; 794 795 /* for now there is just one DMA address */ 796 for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg, 797 buf_slot->sgt[lnk->link_idx].nents, i) { 798 if (!sg_dma_len(sg)) 799 break; 800 ib_dma_sync_single_for_device(lnk->smcibdev->ibdev, 801 sg_dma_address(sg), 802 sg_dma_len(sg), 803 data_direction); 804 } 805 } 806 807 /* Map a new TX or RX buffer SG-table to DMA */ 808 int smc_ib_buf_map_sg(struct smc_link *lnk, 809 struct smc_buf_desc *buf_slot, 810 enum dma_data_direction data_direction) 811 { 812 int mapped_nents; 813 814 mapped_nents = ib_dma_map_sg(lnk->smcibdev->ibdev, 815 buf_slot->sgt[lnk->link_idx].sgl, 816 buf_slot->sgt[lnk->link_idx].orig_nents, 817 data_direction); 818 if (!mapped_nents) 819 return -ENOMEM; 820 821 return mapped_nents; 822 } 823 824 void smc_ib_buf_unmap_sg(struct smc_link *lnk, 825 struct smc_buf_desc *buf_slot, 826 enum dma_data_direction data_direction) 827 { 828 if (!buf_slot->sgt[lnk->link_idx].sgl->dma_address) 829 return; /* already unmapped */ 830 831 ib_dma_unmap_sg(lnk->smcibdev->ibdev, 832 buf_slot->sgt[lnk->link_idx].sgl, 833 buf_slot->sgt[lnk->link_idx].orig_nents, 834 data_direction); 835 buf_slot->sgt[lnk->link_idx].sgl->dma_address = 0; 836 } 837 838 long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) 839 { 840 struct ib_cq_init_attr cqattr = { 841 .cqe = SMC_MAX_CQE, .comp_vector = 0 }; 842 int cqe_size_order, smc_order; 843 long rc; 844 845 mutex_lock(&smcibdev->mutex); 846 rc = 0; 847 if (smcibdev->initialized) 848 goto out; 849 /* the calculated number of cq entries fits to mlx5 cq allocation */ 850 cqe_size_order = cache_line_size() == 128 ? 7 : 6; 851 smc_order = MAX_PAGE_ORDER - cqe_size_order; 852 if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE) 853 cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2; 854 smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev, 855 smc_wr_tx_cq_handler, NULL, 856 smcibdev, &cqattr); 857 rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_send); 858 if (IS_ERR(smcibdev->roce_cq_send)) { 859 smcibdev->roce_cq_send = NULL; 860 goto out; 861 } 862 smcibdev->roce_cq_recv = ib_create_cq(smcibdev->ibdev, 863 smc_wr_rx_cq_handler, NULL, 864 smcibdev, &cqattr); 865 rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_recv); 866 if (IS_ERR(smcibdev->roce_cq_recv)) { 867 smcibdev->roce_cq_recv = NULL; 868 goto err; 869 } 870 smc_wr_add_dev(smcibdev); 871 smcibdev->initialized = 1; 872 goto out; 873 874 err: 875 ib_destroy_cq(smcibdev->roce_cq_send); 876 out: 877 mutex_unlock(&smcibdev->mutex); 878 return rc; 879 } 880 881 static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev) 882 { 883 mutex_lock(&smcibdev->mutex); 884 if (!smcibdev->initialized) 885 goto out; 886 smcibdev->initialized = 0; 887 ib_destroy_cq(smcibdev->roce_cq_recv); 888 ib_destroy_cq(smcibdev->roce_cq_send); 889 smc_wr_remove_dev(smcibdev); 890 out: 891 mutex_unlock(&smcibdev->mutex); 892 } 893 894 static struct ib_client smc_ib_client; 895 896 static void smc_copy_netdev_ifindex(struct smc_ib_device *smcibdev, int port) 897 { 898 struct ib_device *ibdev = smcibdev->ibdev; 899 struct net_device *ndev; 900 901 ndev = ib_device_get_netdev(ibdev, port + 1); 902 if (ndev) { 903 smcibdev->ndev_ifidx[port] = ndev->ifindex; 904 dev_put(ndev); 905 } 906 } 907 908 void smc_ib_ndev_change(struct net_device *ndev, unsigned long event) 909 { 910 struct smc_ib_device *smcibdev; 911 struct ib_device *libdev; 912 struct net_device *lndev; 913 u8 port_cnt; 914 int i; 915 916 mutex_lock(&smc_ib_devices.mutex); 917 list_for_each_entry(smcibdev, &smc_ib_devices.list, list) { 918 port_cnt = smcibdev->ibdev->phys_port_cnt; 919 for (i = 0; i < min_t(size_t, port_cnt, SMC_MAX_PORTS); i++) { 920 libdev = smcibdev->ibdev; 921 lndev = ib_device_get_netdev(libdev, i + 1); 922 dev_put(lndev); 923 if (lndev != ndev) 924 continue; 925 if (event == NETDEV_REGISTER) 926 smcibdev->ndev_ifidx[i] = ndev->ifindex; 927 if (event == NETDEV_UNREGISTER) 928 smcibdev->ndev_ifidx[i] = 0; 929 } 930 } 931 mutex_unlock(&smc_ib_devices.mutex); 932 } 933 934 /* callback function for ib_register_client() */ 935 static int smc_ib_add_dev(struct ib_device *ibdev) 936 { 937 struct smc_ib_device *smcibdev; 938 u8 port_cnt; 939 int i; 940 941 if (ibdev->node_type != RDMA_NODE_IB_CA) 942 return -EOPNOTSUPP; 943 944 smcibdev = kzalloc(sizeof(*smcibdev), GFP_KERNEL); 945 if (!smcibdev) 946 return -ENOMEM; 947 948 smcibdev->ibdev = ibdev; 949 INIT_WORK(&smcibdev->port_event_work, smc_ib_port_event_work); 950 atomic_set(&smcibdev->lnk_cnt, 0); 951 init_waitqueue_head(&smcibdev->lnks_deleted); 952 mutex_init(&smcibdev->mutex); 953 mutex_lock(&smc_ib_devices.mutex); 954 list_add_tail(&smcibdev->list, &smc_ib_devices.list); 955 mutex_unlock(&smc_ib_devices.mutex); 956 ib_set_client_data(ibdev, &smc_ib_client, smcibdev); 957 INIT_IB_EVENT_HANDLER(&smcibdev->event_handler, smcibdev->ibdev, 958 smc_ib_global_event_handler); 959 ib_register_event_handler(&smcibdev->event_handler); 960 961 /* trigger reading of the port attributes */ 962 port_cnt = smcibdev->ibdev->phys_port_cnt; 963 pr_warn_ratelimited("smc: adding ib device %s with port count %d\n", 964 smcibdev->ibdev->name, port_cnt); 965 for (i = 0; 966 i < min_t(size_t, port_cnt, SMC_MAX_PORTS); 967 i++) { 968 set_bit(i, &smcibdev->port_event_mask); 969 /* determine pnetids of the port */ 970 if (smc_pnetid_by_dev_port(ibdev->dev.parent, i, 971 smcibdev->pnetid[i])) 972 smc_pnetid_by_table_ib(smcibdev, i + 1); 973 smc_copy_netdev_ifindex(smcibdev, i); 974 pr_warn_ratelimited("smc: ib device %s port %d has pnetid " 975 "%.16s%s\n", 976 smcibdev->ibdev->name, i + 1, 977 smcibdev->pnetid[i], 978 smcibdev->pnetid_by_user[i] ? 979 " (user defined)" : 980 ""); 981 } 982 schedule_work(&smcibdev->port_event_work); 983 return 0; 984 } 985 986 /* callback function for ib_unregister_client() */ 987 static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data) 988 { 989 struct smc_ib_device *smcibdev = client_data; 990 991 mutex_lock(&smc_ib_devices.mutex); 992 list_del_init(&smcibdev->list); /* remove from smc_ib_devices */ 993 mutex_unlock(&smc_ib_devices.mutex); 994 pr_warn_ratelimited("smc: removing ib device %s\n", 995 smcibdev->ibdev->name); 996 smc_smcr_terminate_all(smcibdev); 997 smc_ib_cleanup_per_ibdev(smcibdev); 998 ib_unregister_event_handler(&smcibdev->event_handler); 999 cancel_work_sync(&smcibdev->port_event_work); 1000 kfree(smcibdev); 1001 } 1002 1003 static struct ib_client smc_ib_client = { 1004 .name = "smc_ib", 1005 .add = smc_ib_add_dev, 1006 .remove = smc_ib_remove_dev, 1007 }; 1008 1009 int __init smc_ib_register_client(void) 1010 { 1011 smc_ib_init_local_systemid(); 1012 return ib_register_client(&smc_ib_client); 1013 } 1014 1015 void smc_ib_unregister_client(void) 1016 { 1017 ib_unregister_client(&smc_ib_client); 1018 } 1019