// SPDX-License-Identifier: GPL-2.0
/*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 * Basic Transport Functions exploiting Infiniband API
 *
 * Copyright IBM Corp. 2016
 *
 * Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
 */

#include <linux/socket.h>
#include <linux/if_vlan.h>
#include <linux/random.h>
#include <linux/workqueue.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_cache.h>

#include "smc.h"
#include "smc_clc.h"
#include "smc_core.h"
#include "smc_ib.h"
#include "smc_wr.h"
#include "smc_llc.h"
#include "smc_cdc.h"
#include "smc_close.h"
#include "smc_ism.h"

#define SMC_LGR_NUM_INCR		256
#define SMC_LGR_FREE_DELAY_SERV		(600 * HZ)
#define SMC_LGR_FREE_DELAY_CLNT		(SMC_LGR_FREE_DELAY_SERV + 10 * HZ)
#define SMC_LGR_FREE_DELAY_FAST		(8 * HZ)

static struct smc_lgr_list smc_lgr_list = {	/* established link groups */
	.lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
	.list = LIST_HEAD_INIT(smc_lgr_list.list),
	.num = 0,
};

static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
			 struct smc_buf_desc *buf_desc);

/* return head of link group list and its lock for a given link group */
static inline struct list_head *smc_lgr_list_head(struct smc_link_group *lgr,
						  spinlock_t **lgr_lock)
{
	if (lgr->is_smcd) {
		*lgr_lock = &lgr->smcd->lgr_lock;
		return &lgr->smcd->lgr_list;
	}

	*lgr_lock = &smc_lgr_list.lock;
	return &smc_lgr_list.list;
}

static void smc_lgr_schedule_free_work(struct smc_link_group *lgr)
{
	/* client link group creation always follows the server link group
	 * creation. For client use a somewhat higher removal delay time,
	 * otherwise there is a risk of out-of-sync link groups.
	 */
	if (!lgr->freeing && !lgr->freefast) {
		mod_delayed_work(system_wq, &lgr->free_work,
				 (!lgr->is_smcd && lgr->role == SMC_CLNT) ?
						SMC_LGR_FREE_DELAY_CLNT :
						SMC_LGR_FREE_DELAY_SERV);
	}
}

void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr)
{
	if (!lgr->freeing && !lgr->freefast) {
		lgr->freefast = 1;
		mod_delayed_work(system_wq, &lgr->free_work,
				 SMC_LGR_FREE_DELAY_FAST);
	}
}

/* Register connection's alert token in our lookup structure.
 * To use rbtrees we have to implement our own insert core.
 * Requires @conns_lock
 * @conn	connection to register
 */
static void smc_lgr_add_alert_token(struct smc_connection *conn)
{
	struct rb_node **link, *parent = NULL;
	u32 token = conn->alert_token_local;

	link = &conn->lgr->conns_all.rb_node;
	while (*link) {
		struct smc_connection *cur = rb_entry(*link,
					struct smc_connection, alert_node);

		parent = *link;
		if (cur->alert_token_local > token)
			link = &parent->rb_left;
		else
			link = &parent->rb_right;
	}
	/* Put the new node there */
	rb_link_node(&conn->alert_node, parent, link);
	rb_insert_color(&conn->alert_node, &conn->lgr->conns_all);
}
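
/* Lookup sketch (illustrative, mirrors the insert above): the alert token is
 * the rbtree key, so a token carried in a peer's message can be resolved to
 * its connection in O(log n), e.g.
 *
 *	conn = smc_lgr_find_conn(token, lgr);	(NULL if token is unknown)
 *
 * smc_lgr_find_conn() is the same lookup helper that smc_lgr_register_conn()
 * below uses to test a candidate token for uniqueness.
 */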

/* Register connection in link group by assigning an alert token
 * registered in a search tree.
 * Requires @conns_lock
 * Note that '0' is a reserved value and not assigned.
 */
static void smc_lgr_register_conn(struct smc_connection *conn)
{
	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
	static atomic_t nexttoken = ATOMIC_INIT(0);

	/* find a new alert_token_local value not yet used by some connection
	 * in this link group
	 */
	sock_hold(&smc->sk); /* sock_put in smc_lgr_unregister_conn() */
	while (!conn->alert_token_local) {
		conn->alert_token_local = atomic_inc_return(&nexttoken);
		if (smc_lgr_find_conn(conn->alert_token_local, conn->lgr))
			conn->alert_token_local = 0;
	}
	smc_lgr_add_alert_token(conn);
	conn->lgr->conns_num++;
}

/* Unregister connection and reset the alert token of the given connection
 */
static void __smc_lgr_unregister_conn(struct smc_connection *conn)
{
	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
	struct smc_link_group *lgr = conn->lgr;

	rb_erase(&conn->alert_node, &lgr->conns_all);
	lgr->conns_num--;
	conn->alert_token_local = 0;
	sock_put(&smc->sk); /* sock_hold in smc_lgr_register_conn() */
}

/* Unregister connection from lgr
 */
static void smc_lgr_unregister_conn(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

	if (!lgr)
		return;
	write_lock_bh(&lgr->conns_lock);
	if (conn->alert_token_local) {
		__smc_lgr_unregister_conn(conn);
	}
	write_unlock_bh(&lgr->conns_lock);
	conn->lgr = NULL;
}

/* Send delete link, either as client to request the initiation
 * of the DELETE LINK sequence from server; or as server to
 * initiate the delete processing. See smc_llc_rx_delete_link().
 */
static int smc_link_send_delete(struct smc_link *lnk)
{
	if (lnk->state == SMC_LNK_ACTIVE &&
	    !smc_llc_send_delete_link(lnk, SMC_LLC_REQ, true)) {
		smc_llc_link_deleting(lnk);
		return 0;
	}
	return -ENOTCONN;
}

static void smc_lgr_free(struct smc_link_group *lgr);

static void smc_lgr_free_work(struct work_struct *work)
{
	struct smc_link_group *lgr = container_of(to_delayed_work(work),
						  struct smc_link_group,
						  free_work);
	spinlock_t *lgr_lock;
	struct smc_link *lnk;
	bool conns;

	smc_lgr_list_head(lgr, &lgr_lock);
	spin_lock_bh(lgr_lock);
	if (lgr->freeing) {
		spin_unlock_bh(lgr_lock);
		return;
	}
	read_lock_bh(&lgr->conns_lock);
	conns = RB_EMPTY_ROOT(&lgr->conns_all);
	read_unlock_bh(&lgr->conns_lock);
	if (!conns) { /* number of lgr connections is no longer zero */
		spin_unlock_bh(lgr_lock);
		return;
	}
	list_del_init(&lgr->list); /* remove from smc_lgr_list */

	lnk = &lgr->lnk[SMC_SINGLE_LINK];
	if (!lgr->is_smcd && !lgr->terminating) {
		/* try to send del link msg, on error free lgr immediately */
		if (lnk->state == SMC_LNK_ACTIVE &&
		    !smc_link_send_delete(lnk)) {
			/* reschedule in case we never receive a response */
			smc_lgr_schedule_free_work(lgr);
			spin_unlock_bh(lgr_lock);
			return;
		}
	}
	lgr->freeing = 1; /* this instance does the freeing, no new schedule */
	spin_unlock_bh(lgr_lock);
	cancel_delayed_work(&lgr->free_work);

	if (!lgr->is_smcd && lnk->state != SMC_LNK_INACTIVE)
		smc_llc_link_inactive(lnk);
	if (lgr->is_smcd)
		smc_ism_signal_shutdown(lgr);
	smc_lgr_free(lgr);
}
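
/* Rough timing of the free work, derived from the delay constants at the top
 * of this file: an idle server link group is freed after
 * SMC_LGR_FREE_DELAY_SERV (600 * HZ, i.e. ten minutes), a client link group
 * ten seconds later so that the server side normally frees first, and the
 * "fast" variant used during termination waits only 8 seconds.  If a DELETE
 * LINK request was sent successfully, smc_lgr_free_work() above reschedules
 * itself instead of freeing, in case the response never arrives.
 */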

static void smc_lgr_terminate_work(struct work_struct *work)
{
	struct smc_link_group *lgr = container_of(work, struct smc_link_group,
						  terminate_work);

	smc_lgr_terminate(lgr);
}

/* create a new SMC link group */
static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
{
	struct smc_link_group *lgr;
	struct list_head *lgr_list;
	struct smc_link *lnk;
	spinlock_t *lgr_lock;
	u8 rndvec[3];
	int rc = 0;
	int i;

	if (ini->is_smcd && ini->vlan_id) {
		if (smc_ism_get_vlan(ini->ism_dev, ini->vlan_id)) {
			rc = SMC_CLC_DECL_ISMVLANERR;
			goto out;
		}
	}

	lgr = kzalloc(sizeof(*lgr), GFP_KERNEL);
	if (!lgr) {
		rc = SMC_CLC_DECL_MEM;
		goto ism_put_vlan;
	}
	lgr->is_smcd = ini->is_smcd;
	lgr->sync_err = 0;
	lgr->terminating = 0;
	lgr->freefast = 0;
	lgr->freeing = 0;
	lgr->vlan_id = ini->vlan_id;
	rwlock_init(&lgr->sndbufs_lock);
	rwlock_init(&lgr->rmbs_lock);
	rwlock_init(&lgr->conns_lock);
	for (i = 0; i < SMC_RMBE_SIZES; i++) {
		INIT_LIST_HEAD(&lgr->sndbufs[i]);
		INIT_LIST_HEAD(&lgr->rmbs[i]);
	}
	smc_lgr_list.num += SMC_LGR_NUM_INCR;
	memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE);
	INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work);
	INIT_WORK(&lgr->terminate_work, smc_lgr_terminate_work);
	lgr->conns_all = RB_ROOT;
	if (ini->is_smcd) {
		/* SMC-D specific settings */
		get_device(&ini->ism_dev->dev);
		lgr->peer_gid = ini->ism_gid;
		lgr->smcd = ini->ism_dev;
		lgr_list = &ini->ism_dev->lgr_list;
		lgr_lock = &lgr->smcd->lgr_lock;
	} else {
		/* SMC-R specific settings */
		get_device(&ini->ib_dev->ibdev->dev);
		lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
		memcpy(lgr->peer_systemid, ini->ib_lcl->id_for_peer,
		       SMC_SYSTEMID_LEN);

		lnk = &lgr->lnk[SMC_SINGLE_LINK];
		/* initialize link */
		lnk->state = SMC_LNK_ACTIVATING;
		lnk->link_id = SMC_SINGLE_LINK;
		lnk->smcibdev = ini->ib_dev;
		lnk->ibport = ini->ib_port;
		lgr_list = &smc_lgr_list.list;
		lgr_lock = &smc_lgr_list.lock;
		lnk->path_mtu =
			ini->ib_dev->pattr[ini->ib_port - 1].active_mtu;
		if (!ini->ib_dev->initialized)
			smc_ib_setup_per_ibdev(ini->ib_dev);
		get_random_bytes(rndvec, sizeof(rndvec));
		lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) +
				   (rndvec[2] << 16);
		rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport,
					  ini->vlan_id, lnk->gid,
					  &lnk->sgid_index);
		if (rc)
			goto free_lgr;
		rc = smc_llc_link_init(lnk);
		if (rc)
			goto free_lgr;
		rc = smc_wr_alloc_link_mem(lnk);
		if (rc)
			goto clear_llc_lnk;
		rc = smc_ib_create_protection_domain(lnk);
		if (rc)
			goto free_link_mem;
		rc = smc_ib_create_queue_pair(lnk);
		if (rc)
			goto dealloc_pd;
		rc = smc_wr_create_link(lnk);
		if (rc)
			goto destroy_qp;
	}
	smc->conn.lgr = lgr;
	spin_lock_bh(lgr_lock);
	list_add(&lgr->list, lgr_list);
	spin_unlock_bh(lgr_lock);
	return 0;

destroy_qp:
	smc_ib_destroy_queue_pair(lnk);
dealloc_pd:
	smc_ib_dealloc_protection_domain(lnk);
free_link_mem:
	smc_wr_free_link_mem(lnk);
clear_llc_lnk:
	smc_llc_link_clear(lnk);
free_lgr:
	kfree(lgr);
ism_put_vlan:
	if (ini->is_smcd && ini->vlan_id)
		smc_ism_put_vlan(ini->ism_dev, ini->vlan_id);
out:
	if (rc < 0) {
		if (rc == -ENOMEM)
			rc = SMC_CLC_DECL_MEM;
		else
			rc = SMC_CLC_DECL_INTERR;
	}
	return rc;
}
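
/* Error handling in smc_lgr_create() above unwinds in the reverse order of
 * construction: a failing smc_wr_create_link() falls through destroy_qp ->
 * dealloc_pd -> free_link_mem -> clear_llc_lnk -> free_lgr, earlier failures
 * enter the ladder further down.  Any negative errno is then mapped to a CLC
 * decline code, so a caller only ever sees (sketch of the contract):
 *
 *	rc = smc_lgr_create(smc, ini);
 *	(rc is 0, SMC_CLC_DECL_MEM for -ENOMEM, SMC_CLC_DECL_INTERR for any
 *	 other internal error, or SMC_CLC_DECL_ISMVLANERR from the vlan check)
 */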

static void smc_buf_unuse(struct smc_connection *conn,
			  struct smc_link_group *lgr)
{
	if (conn->sndbuf_desc)
		conn->sndbuf_desc->used = 0;
	if (conn->rmb_desc) {
		if (!conn->rmb_desc->regerr) {
			if (!lgr->is_smcd && !list_empty(&lgr->list)) {
				/* unregister rmb with peer */
				smc_llc_do_delete_rkey(
						&lgr->lnk[SMC_SINGLE_LINK],
						conn->rmb_desc);
			}
			conn->rmb_desc->used = 0;
		} else {
			/* buf registration failed, reuse not possible */
			write_lock_bh(&lgr->rmbs_lock);
			list_del(&conn->rmb_desc->list);
			write_unlock_bh(&lgr->rmbs_lock);

			smc_buf_free(lgr, true, conn->rmb_desc);
		}
	}
}

/* remove a finished connection from its link group */
void smc_conn_free(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

	if (!lgr)
		return;
	if (lgr->is_smcd) {
		smc_ism_unset_conn(conn);
		tasklet_kill(&conn->rx_tsklet);
	} else {
		smc_cdc_tx_dismiss_slots(conn);
	}
	if (!list_empty(&lgr->list)) {
		smc_lgr_unregister_conn(conn);
		smc_buf_unuse(conn, lgr); /* allow buffer reuse */
	}

	if (!lgr->conns_num)
		smc_lgr_schedule_free_work(lgr);
}

static void smc_link_clear(struct smc_link *lnk)
{
	lnk->peer_qpn = 0;
	smc_llc_link_clear(lnk);
	smc_ib_modify_qp_reset(lnk);
	smc_wr_free_link(lnk);
	smc_ib_destroy_queue_pair(lnk);
	smc_ib_dealloc_protection_domain(lnk);
	smc_wr_free_link_mem(lnk);
}

static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb,
			  struct smc_buf_desc *buf_desc)
{
	struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];

	if (is_rmb) {
		if (buf_desc->mr_rx[SMC_SINGLE_LINK])
			smc_ib_put_memory_region(
					buf_desc->mr_rx[SMC_SINGLE_LINK]);
		smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc,
				    DMA_FROM_DEVICE);
	} else {
		smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc,
				    DMA_TO_DEVICE);
	}
	sg_free_table(&buf_desc->sgt[SMC_SINGLE_LINK]);
	if (buf_desc->pages)
		__free_pages(buf_desc->pages, buf_desc->order);
	kfree(buf_desc);
}

static void smcd_buf_free(struct smc_link_group *lgr, bool is_dmb,
			  struct smc_buf_desc *buf_desc)
{
	if (is_dmb) {
		/* restore original buf len */
		buf_desc->len += sizeof(struct smcd_cdc_msg);
		smc_ism_unregister_dmb(lgr->smcd, buf_desc);
	} else {
		kfree(buf_desc->cpu_addr);
	}
	kfree(buf_desc);
}

static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
			 struct smc_buf_desc *buf_desc)
{
	if (lgr->is_smcd)
		smcd_buf_free(lgr, is_rmb, buf_desc);
	else
		smcr_buf_free(lgr, is_rmb, buf_desc);
}

static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb)
{
	struct smc_buf_desc *buf_desc, *bf_desc;
	struct list_head *buf_list;
	int i;

	for (i = 0; i < SMC_RMBE_SIZES; i++) {
		if (is_rmb)
			buf_list = &lgr->rmbs[i];
		else
			buf_list = &lgr->sndbufs[i];
		list_for_each_entry_safe(buf_desc, bf_desc, buf_list,
					 list) {
			list_del(&buf_desc->list);
			smc_buf_free(lgr, is_rmb, buf_desc);
		}
	}
}

static void smc_lgr_free_bufs(struct smc_link_group *lgr)
{
	/* free send buffers */
	__smc_lgr_free_bufs(lgr, false);
	/* free rmbs */
	__smc_lgr_free_bufs(lgr, true);
}
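
/* Buffer teardown above mirrors buffer creation further below in reverse:
 * for SMC-R the RMB's memory region is released first, then the sg table is
 * DMA-unmapped and freed, and only then are the backing pages and the
 * descriptor itself freed.  For SMC-D, smcd_buf_free() first re-adds the
 * sizeof(struct smcd_cdc_msg) that was hidden from the advertised length
 * before unregistering the DMB.
 */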

/* remove a link group */
static void smc_lgr_free(struct smc_link_group *lgr)
{
	smc_lgr_free_bufs(lgr);
	if (lgr->is_smcd) {
		smc_ism_put_vlan(lgr->smcd, lgr->vlan_id);
		put_device(&lgr->smcd->dev);
	} else {
		smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]);
		put_device(&lgr->lnk[SMC_SINGLE_LINK].smcibdev->ibdev->dev);
	}
	kfree(lgr);
}

void smc_lgr_forget(struct smc_link_group *lgr)
{
	struct list_head *lgr_list;
	spinlock_t *lgr_lock;

	lgr_list = smc_lgr_list_head(lgr, &lgr_lock);
	spin_lock_bh(lgr_lock);
	/* do not use this link group for new connections */
	if (!list_empty(&lgr->list))
		list_del_init(&lgr->list);
	spin_unlock_bh(lgr_lock);
}

static void smc_sk_wake_ups(struct smc_sock *smc)
{
	smc->sk.sk_write_space(&smc->sk);
	smc->sk.sk_data_ready(&smc->sk);
	smc->sk.sk_state_change(&smc->sk);
}

/* kill a connection */
static void smc_conn_kill(struct smc_connection *conn)
{
	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);

	smc_close_abort(conn);
	conn->killed = 1;
	smc_sk_wake_ups(smc);
	smc_lgr_unregister_conn(conn);
	smc->sk.sk_err = ECONNABORTED;
	smc_close_active_abort(smc);
}

/* terminate link group */
static void __smc_lgr_terminate(struct smc_link_group *lgr)
{
	struct smc_connection *conn;
	struct smc_sock *smc;
	struct rb_node *node;

	if (lgr->terminating)
		return;	/* lgr already terminating */
	lgr->terminating = 1;
	if (!lgr->is_smcd)
		smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);

	/* kill remaining link group connections */
	read_lock_bh(&lgr->conns_lock);
	node = rb_first(&lgr->conns_all);
	while (node) {
		read_unlock_bh(&lgr->conns_lock);
		conn = rb_entry(node, struct smc_connection, alert_node);
		smc = container_of(conn, struct smc_sock, conn);
		sock_hold(&smc->sk); /* sock_put below */
		lock_sock(&smc->sk);
		smc_conn_kill(conn);
		release_sock(&smc->sk);
		sock_put(&smc->sk); /* sock_hold above */
		read_lock_bh(&lgr->conns_lock);
		node = rb_first(&lgr->conns_all);
	}
	read_unlock_bh(&lgr->conns_lock);
	if (!lgr->is_smcd)
		wake_up(&lgr->lnk[SMC_SINGLE_LINK].wr_reg_wait);
	smc_lgr_schedule_free_work_fast(lgr);
}

/* unlink and terminate link group */
void smc_lgr_terminate(struct smc_link_group *lgr)
{
	spinlock_t *lgr_lock;

	smc_lgr_list_head(lgr, &lgr_lock);
	spin_lock_bh(lgr_lock);
	if (lgr->terminating) {
		spin_unlock_bh(lgr_lock);
		return;	/* lgr already terminating */
	}
	list_del_init(&lgr->list);
	spin_unlock_bh(lgr_lock);
	__smc_lgr_terminate(lgr);
}

/* Called when IB port is terminated */
void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport)
{
	struct smc_link_group *lgr, *l;
	LIST_HEAD(lgr_free_list);

	spin_lock_bh(&smc_lgr_list.lock);
	list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) {
		if (!lgr->is_smcd &&
		    lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev &&
		    lgr->lnk[SMC_SINGLE_LINK].ibport == ibport)
			list_move(&lgr->list, &lgr_free_list);
	}
	spin_unlock_bh(&smc_lgr_list.lock);

	list_for_each_entry_safe(lgr, l, &lgr_free_list, list) {
		list_del_init(&lgr->list);
		__smc_lgr_terminate(lgr);
	}
}
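
/* Both termination entry points above and below follow the same pattern:
 * affected link groups are moved to a local list while the lgr list lock is
 * held, and the heavyweight __smc_lgr_terminate() runs afterwards without
 * that lock, since killing connections takes socket locks and may sleep.
 * A minimal sketch of the pattern:
 *
 *	spin_lock_bh(lock);
 *	list_move(&lgr->list, &lgr_free_list);	(collect under the lock)
 *	spin_unlock_bh(lock);
 *	__smc_lgr_terminate(lgr);		(heavy work outside the lock)
 */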

/* Called when SMC-D device is terminated or peer is lost */
void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan)
{
	struct smc_link_group *lgr, *l;
	LIST_HEAD(lgr_free_list);

	/* run common cleanup function and build free list */
	spin_lock_bh(&dev->lgr_lock);
	list_for_each_entry_safe(lgr, l, &dev->lgr_list, list) {
		if ((!peer_gid || lgr->peer_gid == peer_gid) &&
		    (vlan == VLAN_VID_MASK || lgr->vlan_id == vlan)) {
			list_move(&lgr->list, &lgr_free_list);
		}
	}
	spin_unlock_bh(&dev->lgr_lock);

	/* cancel the regular free workers and actually free lgrs */
	list_for_each_entry_safe(lgr, l, &lgr_free_list, list) {
		list_del_init(&lgr->list);
		__smc_lgr_terminate(lgr);
		cancel_delayed_work_sync(&lgr->free_work);
		if (!peer_gid && vlan == VLAN_VID_MASK) /* dev terminated? */
			smc_ism_signal_shutdown(lgr);
		smc_lgr_free(lgr);
	}
}

/* Determine vlan of internal TCP socket.
 * The determined vlan id is stored in ini->vlan_id (0 if no vlan applies).
 */
int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini)
{
	struct dst_entry *dst = sk_dst_get(clcsock->sk);
	struct net_device *ndev;
	int i, nest_lvl, rc = 0;

	ini->vlan_id = 0;
	if (!dst) {
		rc = -ENOTCONN;
		goto out;
	}
	if (!dst->dev) {
		rc = -ENODEV;
		goto out_rel;
	}

	ndev = dst->dev;
	if (is_vlan_dev(ndev)) {
		ini->vlan_id = vlan_dev_vlan_id(ndev);
		goto out_rel;
	}

	rtnl_lock();
	nest_lvl = ndev->lower_level;
	for (i = 0; i < nest_lvl; i++) {
		struct list_head *lower = &ndev->adj_list.lower;

		if (list_empty(lower))
			break;
		lower = lower->next;
		ndev = (struct net_device *)netdev_lower_get_next(ndev, &lower);
		if (is_vlan_dev(ndev)) {
			ini->vlan_id = vlan_dev_vlan_id(ndev);
			break;
		}
	}
	rtnl_unlock();

out_rel:
	dst_release(dst);
out:
	return rc;
}

static bool smcr_lgr_match(struct smc_link_group *lgr,
			   struct smc_clc_msg_local *lcl,
			   enum smc_lgr_role role, u32 clcqpn)
{
	return !memcmp(lgr->peer_systemid, lcl->id_for_peer,
		       SMC_SYSTEMID_LEN) &&
		!memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid,
			SMC_GID_SIZE) &&
		!memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac,
			sizeof(lcl->mac)) &&
		lgr->role == role &&
		(lgr->role == SMC_SERV ||
		 lgr->lnk[SMC_SINGLE_LINK].peer_qpn == clcqpn);
}

static bool smcd_lgr_match(struct smc_link_group *lgr,
			   struct smcd_dev *smcismdev, u64 peer_gid)
{
	return lgr->peer_gid == peer_gid && lgr->smcd == smcismdev;
}
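
/* Link group reuse criteria, as evaluated by smc_conn_create() below: an
 * SMC-R candidate must match the peer system id, the GID and MAC of the
 * single link and the local role (clients additionally check the peer QP
 * number); an SMC-D candidate only needs the same peer GID on the same ISM
 * device.  In both cases the VLAN id must match and the group must neither
 * be in sync_err state nor, on the server side, already hold
 * SMC_RMBS_PER_LGR_MAX connections.
 */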

/* create a new SMC connection (and a new link group if necessary) */
int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
{
	struct smc_connection *conn = &smc->conn;
	struct list_head *lgr_list;
	struct smc_link_group *lgr;
	enum smc_lgr_role role;
	spinlock_t *lgr_lock;
	int rc = 0;

	lgr_list = ini->is_smcd ? &ini->ism_dev->lgr_list : &smc_lgr_list.list;
	lgr_lock = ini->is_smcd ? &ini->ism_dev->lgr_lock : &smc_lgr_list.lock;
	ini->cln_first_contact = SMC_FIRST_CONTACT;
	role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
	if (role == SMC_CLNT && ini->srv_first_contact)
		/* create new link group as well */
		goto create;

	/* determine if an existing link group can be reused */
	spin_lock_bh(lgr_lock);
	list_for_each_entry(lgr, lgr_list, list) {
		write_lock_bh(&lgr->conns_lock);
		if ((ini->is_smcd ?
		     smcd_lgr_match(lgr, ini->ism_dev, ini->ism_gid) :
		     smcr_lgr_match(lgr, ini->ib_lcl, role, ini->ib_clcqpn)) &&
		    !lgr->sync_err &&
		    lgr->vlan_id == ini->vlan_id &&
		    (role == SMC_CLNT ||
		     lgr->conns_num < SMC_RMBS_PER_LGR_MAX)) {
			/* link group found */
			ini->cln_first_contact = SMC_REUSE_CONTACT;
			conn->lgr = lgr;
			smc_lgr_register_conn(conn); /* add smc conn to lgr */
			if (delayed_work_pending(&lgr->free_work))
				cancel_delayed_work(&lgr->free_work);
			write_unlock_bh(&lgr->conns_lock);
			break;
		}
		write_unlock_bh(&lgr->conns_lock);
	}
	spin_unlock_bh(lgr_lock);

	if (role == SMC_CLNT && !ini->srv_first_contact &&
	    ini->cln_first_contact == SMC_FIRST_CONTACT) {
		/* Server reuses a link group, but Client wants to start
		 * a new one
		 * send out_of_sync decline, reason synchr. error
		 */
		return SMC_CLC_DECL_SYNCERR;
	}

create:
	if (ini->cln_first_contact == SMC_FIRST_CONTACT) {
		rc = smc_lgr_create(smc, ini);
		if (rc)
			goto out;
		lgr = conn->lgr;
		write_lock_bh(&lgr->conns_lock);
		smc_lgr_register_conn(conn); /* add smc conn to lgr */
		write_unlock_bh(&lgr->conns_lock);
	}
	conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE;
	conn->local_tx_ctrl.len = SMC_WR_TX_SIZE;
	conn->urg_state = SMC_URG_READ;
	if (ini->is_smcd) {
		conn->rx_off = sizeof(struct smcd_cdc_msg);
		smcd_cdc_rx_init(conn); /* init tasklet for this conn */
	}
#ifndef KERNEL_HAS_ATOMIC64
	spin_lock_init(&conn->acurs_lock);
#endif

out:
	return rc;
}
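
/* Caller's view of smc_conn_create(), as a sketch: on return,
 * ini->cln_first_contact is SMC_REUSE_CONTACT when an existing link group
 * was joined and SMC_FIRST_CONTACT when a new one had to be created; a
 * client that must create although the server signalled reuse gets
 * SMC_CLC_DECL_SYNCERR instead.
 *
 *	rc = smc_conn_create(smc, ini);
 *	if (!rc && ini->cln_first_contact == SMC_FIRST_CONTACT)
 *		... caller continues with its first-contact specific setup ...
 */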

/* convert the RMB size into the compressed notation - minimum 16K.
 * In contrast to plain ilog2, this rounds towards the next power of 2,
 * so the socket application gets at least its desired sndbuf / rcvbuf size.
 */
static u8 smc_compress_bufsize(int size)
{
	u8 compressed;

	if (size <= SMC_BUF_MIN_SIZE)
		return 0;

	size = (size - 1) >> 14;
	compressed = ilog2(size) + 1;
	if (compressed >= SMC_RMBE_SIZES)
		compressed = SMC_RMBE_SIZES - 1;
	return compressed;
}

/* convert the RMB size from compressed notation into integer */
int smc_uncompress_bufsize(u8 compressed)
{
	u32 size;

	size = 0x00000001 << (((int)compressed) + 14);
	return (int)size;
}

/* try to reuse a sndbuf or rmb description slot for a certain
 * buffer size; if not available, return NULL
 */
static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize,
					     rwlock_t *lock,
					     struct list_head *buf_list)
{
	struct smc_buf_desc *buf_slot;

	read_lock_bh(lock);
	list_for_each_entry(buf_slot, buf_list, list) {
		if (cmpxchg(&buf_slot->used, 0, 1) == 0) {
			read_unlock_bh(lock);
			return buf_slot;
		}
	}
	read_unlock_bh(lock);
	return NULL;
}

/* one of the conditions for announcing a receiver's current window size is
 * that it "results in a minimum increase in the window size of 10% of the
 * receive buffer space" [RFC7609]
 */
static inline int smc_rmb_wnd_update_limit(int rmbe_size)
{
	return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2);
}

static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr,
						bool is_rmb, int bufsize)
{
	struct smc_buf_desc *buf_desc;
	struct smc_link *lnk;
	int rc;

	/* try to alloc a new buffer */
	buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL);
	if (!buf_desc)
		return ERR_PTR(-ENOMEM);

	buf_desc->order = get_order(bufsize);
	buf_desc->pages = alloc_pages(GFP_KERNEL | __GFP_NOWARN |
				      __GFP_NOMEMALLOC | __GFP_COMP |
				      __GFP_NORETRY | __GFP_ZERO,
				      buf_desc->order);
	if (!buf_desc->pages) {
		kfree(buf_desc);
		return ERR_PTR(-EAGAIN);
	}
	buf_desc->cpu_addr = (void *)page_address(buf_desc->pages);

	/* build the sg table from the pages */
	lnk = &lgr->lnk[SMC_SINGLE_LINK];
	rc = sg_alloc_table(&buf_desc->sgt[SMC_SINGLE_LINK], 1,
			    GFP_KERNEL);
	if (rc) {
		smc_buf_free(lgr, is_rmb, buf_desc);
		return ERR_PTR(rc);
	}
	sg_set_buf(buf_desc->sgt[SMC_SINGLE_LINK].sgl,
		   buf_desc->cpu_addr, bufsize);

	/* map sg table to DMA address */
	rc = smc_ib_buf_map_sg(lnk->smcibdev, buf_desc,
			       is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
	/* SMC protocol depends on mapping to one DMA address only */
	if (rc != 1) {
		smc_buf_free(lgr, is_rmb, buf_desc);
		return ERR_PTR(-EAGAIN);
	}

	/* create a new memory region for the RMB */
	if (is_rmb) {
		rc = smc_ib_get_memory_region(lnk->roce_pd,
					      IB_ACCESS_REMOTE_WRITE |
					      IB_ACCESS_LOCAL_WRITE,
					      buf_desc);
		if (rc) {
			smc_buf_free(lgr, is_rmb, buf_desc);
			return ERR_PTR(rc);
		}
	}

	buf_desc->len = bufsize;
	return buf_desc;
}
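
/* Worked example of the compressed buffer size notation used in this file:
 * index 0 stands for 16KB and every step doubles the size, so
 * smc_uncompress_bufsize(2) is 64KB.  A request for e.g. 48KB also
 * compresses to 2 ((48K - 1) >> 14 == 2, ilog2(2) + 1 == 2) and is thereby
 * rounded up to 64KB, so the application never gets less than it asked for.
 * The SMCD_DMBE_SIZES limit below applies the same notation to SMC-D DMBs.
 */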

#define SMCD_DMBE_SIZES		7 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */

static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr,
						bool is_dmb, int bufsize)
{
	struct smc_buf_desc *buf_desc;
	int rc;

	if (smc_compress_bufsize(bufsize) > SMCD_DMBE_SIZES)
		return ERR_PTR(-EAGAIN);

	/* try to alloc a new DMB */
	buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL);
	if (!buf_desc)
		return ERR_PTR(-ENOMEM);
	if (is_dmb) {
		rc = smc_ism_register_dmb(lgr, bufsize, buf_desc);
		if (rc) {
			kfree(buf_desc);
			return ERR_PTR(-EAGAIN);
		}
		buf_desc->pages = virt_to_page(buf_desc->cpu_addr);
		/* CDC header stored in buf. So, pretend it was smaller */
		buf_desc->len = bufsize - sizeof(struct smcd_cdc_msg);
	} else {
		buf_desc->cpu_addr = kzalloc(bufsize, GFP_KERNEL |
					     __GFP_NOWARN | __GFP_NORETRY |
					     __GFP_NOMEMALLOC);
		if (!buf_desc->cpu_addr) {
			kfree(buf_desc);
			return ERR_PTR(-EAGAIN);
		}
		buf_desc->len = bufsize;
	}
	return buf_desc;
}

static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
{
	struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM);
	struct smc_connection *conn = &smc->conn;
	struct smc_link_group *lgr = conn->lgr;
	struct list_head *buf_list;
	int bufsize, bufsize_short;
	int sk_buf_size;
	rwlock_t *lock;

	if (is_rmb)
		/* use socket recv buffer size (w/o overhead) as start value */
		sk_buf_size = smc->sk.sk_rcvbuf / 2;
	else
		/* use socket send buffer size (w/o overhead) as start value */
		sk_buf_size = smc->sk.sk_sndbuf / 2;

	for (bufsize_short = smc_compress_bufsize(sk_buf_size);
	     bufsize_short >= 0; bufsize_short--) {

		if (is_rmb) {
			lock = &lgr->rmbs_lock;
			buf_list = &lgr->rmbs[bufsize_short];
		} else {
			lock = &lgr->sndbufs_lock;
			buf_list = &lgr->sndbufs[bufsize_short];
		}
		bufsize = smc_uncompress_bufsize(bufsize_short);
		if ((1 << get_order(bufsize)) > SG_MAX_SINGLE_ALLOC)
			continue;

		/* check for reusable slot in the link group */
		buf_desc = smc_buf_get_slot(bufsize_short, lock, buf_list);
		if (buf_desc) {
			memset(buf_desc->cpu_addr, 0, bufsize);
			break; /* found reusable slot */
		}

		if (is_smcd)
			buf_desc = smcd_new_buf_create(lgr, is_rmb, bufsize);
		else
			buf_desc = smcr_new_buf_create(lgr, is_rmb, bufsize);

		if (PTR_ERR(buf_desc) == -ENOMEM)
			break;
		if (IS_ERR(buf_desc))
			continue;

		buf_desc->used = 1;
		write_lock_bh(lock);
		list_add(&buf_desc->list, buf_list);
		write_unlock_bh(lock);
		break; /* found */
	}

	if (IS_ERR(buf_desc))
		return -ENOMEM;

	if (is_rmb) {
		conn->rmb_desc = buf_desc;
		conn->rmbe_size_short = bufsize_short;
		smc->sk.sk_rcvbuf = bufsize * 2;
		atomic_set(&conn->bytes_to_rcv, 0);
		conn->rmbe_update_limit =
			smc_rmb_wnd_update_limit(buf_desc->len);
		if (is_smcd)
			smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */
	} else {
		conn->sndbuf_desc = buf_desc;
		smc->sk.sk_sndbuf = bufsize * 2;
		atomic_set(&conn->sndbuf_space, bufsize);
	}
	return 0;
}

void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

	if (!conn->lgr || conn->lgr->is_smcd)
		return;
	smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
			       conn->sndbuf_desc, DMA_TO_DEVICE);
}
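
/* The sndbuf/RMB sync helpers above and below only do work for SMC-R: send
 * buffers are synced with DMA_TO_DEVICE (data flows towards the RDMA
 * device), RMBs with DMA_FROM_DEVICE (data arrives from the peer).  SMC-D
 * connections return early since their DMBs are not sg/DMA-mapped through
 * the IB layer in the first place.
 */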

void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

	if (!conn->lgr || conn->lgr->is_smcd)
		return;
	smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
				  conn->sndbuf_desc, DMA_TO_DEVICE);
}

void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

	if (!conn->lgr || conn->lgr->is_smcd)
		return;
	smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
			       conn->rmb_desc, DMA_FROM_DEVICE);
}

void smc_rmb_sync_sg_for_device(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

	if (!conn->lgr || conn->lgr->is_smcd)
		return;
	smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
				  conn->rmb_desc, DMA_FROM_DEVICE);
}

/* create the send and receive buffer for an SMC socket;
 * receive buffers are called RMBs;
 * (even though the SMC protocol allows more than one RMB-element per RMB,
 * the Linux implementation uses just one RMB-element per RMB, i.e. uses an
 * extra RMB for every connection in a link group)
 */
int smc_buf_create(struct smc_sock *smc, bool is_smcd)
{
	int rc;

	/* create send buffer */
	rc = __smc_buf_create(smc, is_smcd, false);
	if (rc)
		return rc;
	/* create rmb */
	rc = __smc_buf_create(smc, is_smcd, true);
	if (rc)
		smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc);
	return rc;
}

static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr)
{
	int i;

	for_each_clear_bit(i, lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX) {
		if (!test_and_set_bit(i, lgr->rtokens_used_mask))
			return i;
	}
	return -ENOSPC;
}

/* add a new rtoken from peer */
int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey)
{
	u64 dma_addr = be64_to_cpu(nw_vaddr);
	u32 rkey = ntohl(nw_rkey);
	int i;

	for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
		if ((lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey) &&
		    (lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr == dma_addr) &&
		    test_bit(i, lgr->rtokens_used_mask)) {
			/* already in list */
			return i;
		}
	}
	i = smc_rmb_reserve_rtoken_idx(lgr);
	if (i < 0)
		return i;
	lgr->rtokens[i][SMC_SINGLE_LINK].rkey = rkey;
	lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = dma_addr;
	return i;
}

/* delete an rtoken */
int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey)
{
	u32 rkey = ntohl(nw_rkey);
	int i;

	for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
		if (lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey &&
		    test_bit(i, lgr->rtokens_used_mask)) {
			lgr->rtokens[i][SMC_SINGLE_LINK].rkey = 0;
			lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = 0;

			clear_bit(i, lgr->rtokens_used_mask);
			return 0;
		}
	}
	return -ENOENT;
}

/* save rkey and dma_addr received from peer during clc handshake */
int smc_rmb_rtoken_handling(struct smc_connection *conn,
			    struct smc_clc_msg_accept_confirm *clc)
{
	conn->rtoken_idx = smc_rtoken_add(conn->lgr, clc->rmb_dma_addr,
					  clc->rmb_rkey);
	if (conn->rtoken_idx < 0)
		return conn->rtoken_idx;
	return 0;
}
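
/* Usage sketch for the rtoken table (values are made up for illustration):
 * during the CLC handshake the peer's RMB address and rkey arrive in network
 * byte order and are stored under a free index of rtokens_used_mask:
 *
 *	i = smc_rtoken_add(lgr, cpu_to_be64(0x1000), htonl(0x42));
 *	(i >= 0: index stored as conn->rtoken_idx and used for RDMA writes;
 *	 i == -ENOSPC: all SMC_RMBS_PER_LGR_MAX slots are in use)
 *
 * Adding the same vaddr/rkey pair twice returns the existing index, and
 * smc_rtoken_delete() releases the slot again when the peer deletes the key.
 */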

static void smc_core_going_away(void)
{
	struct smc_ib_device *smcibdev;
	struct smcd_dev *smcd;

	spin_lock(&smc_ib_devices.lock);
	list_for_each_entry(smcibdev, &smc_ib_devices.list, list) {
		int i;

		for (i = 0; i < SMC_MAX_PORTS; i++)
			set_bit(i, smcibdev->ports_going_away);
	}
	spin_unlock(&smc_ib_devices.lock);

	spin_lock(&smcd_dev_list.lock);
	list_for_each_entry(smcd, &smcd_dev_list.list, list) {
		smcd->going_away = 1;
	}
	spin_unlock(&smcd_dev_list.lock);
}

/* Called (from smc_exit) when module is removed */
void smc_core_exit(void)
{
	struct smc_link_group *lgr, *lg;
	LIST_HEAD(lgr_freeing_list);
	struct smcd_dev *smcd;

	smc_core_going_away();

	spin_lock_bh(&smc_lgr_list.lock);
	list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
	spin_unlock_bh(&smc_lgr_list.lock);

	spin_lock(&smcd_dev_list.lock);
	list_for_each_entry(smcd, &smcd_dev_list.list, list)
		list_splice_init(&smcd->lgr_list, &lgr_freeing_list);
	spin_unlock(&smcd_dev_list.lock);

	list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
		list_del_init(&lgr->list);
		if (!lgr->is_smcd) {
			struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];

			if (lnk->state == SMC_LNK_ACTIVE)
				smc_llc_send_delete_link(lnk, SMC_LLC_REQ,
							 false);
			smc_llc_link_inactive(lnk);
		}
		cancel_delayed_work_sync(&lgr->free_work);
		if (lgr->is_smcd)
			smc_ism_signal_shutdown(lgr);
		smc_lgr_free(lgr); /* free link group */
	}
}
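
/* Module unload sequence, summarising the two functions above: mark every
 * IB port and ISM device as going away (ports_going_away / going_away
 * flags), splice the remaining SMC-R and SMC-D link groups onto a private
 * list, and wind each one down in order: send DELETE LINK for still active
 * SMC-R links, cancel the pending free_work, signal shutdown for SMC-D, and
 * finally smc_lgr_free().
 */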