1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Shared Memory Communications over RDMA (SMC-R) and RoCE 4 * 5 * AF_SMC protocol family socket handler keeping the AF_INET sock address type 6 * applies to SOCK_STREAM sockets only 7 * offers an alternative communication option for TCP-protocol sockets 8 * applicable with RoCE-cards only 9 * 10 * Initial restrictions: 11 * - support for alternate links postponed 12 * 13 * Copyright IBM Corp. 2016, 2018 14 * 15 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> 16 * based on prototype from Frank Blaschka 17 */ 18 19 #define KMSG_COMPONENT "smc" 20 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt 21 22 #include <linux/module.h> 23 #include <linux/socket.h> 24 #include <linux/workqueue.h> 25 #include <linux/in.h> 26 #include <linux/sched/signal.h> 27 #include <linux/if_vlan.h> 28 #include <linux/rcupdate_wait.h> 29 #include <linux/ctype.h> 30 31 #include <net/sock.h> 32 #include <net/tcp.h> 33 #include <net/smc.h> 34 #include <asm/ioctls.h> 35 36 #include <net/net_namespace.h> 37 #include <net/netns/generic.h> 38 #include "smc_netns.h" 39 40 #include "smc.h" 41 #include "smc_clc.h" 42 #include "smc_llc.h" 43 #include "smc_cdc.h" 44 #include "smc_core.h" 45 #include "smc_ib.h" 46 #include "smc_ism.h" 47 #include "smc_pnet.h" 48 #include "smc_netlink.h" 49 #include "smc_tx.h" 50 #include "smc_rx.h" 51 #include "smc_close.h" 52 #include "smc_stats.h" 53 #include "smc_tracepoint.h" 54 55 static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group 56 * creation on server 57 */ 58 static DEFINE_MUTEX(smc_client_lgr_pending); /* serialize link group 59 * creation on client 60 */ 61 62 struct workqueue_struct *smc_hs_wq; /* wq for handshake work */ 63 struct workqueue_struct *smc_close_wq; /* wq for close work */ 64 65 static void smc_tcp_listen_work(struct work_struct *); 66 static void smc_connect_work(struct work_struct *); 67 68 static void smc_set_keepalive(struct sock *sk, int val) 69 { 70 struct smc_sock *smc = 
smc_sk(sk); 71 72 smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val); 73 } 74 75 static struct smc_hashinfo smc_v4_hashinfo = { 76 .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock), 77 }; 78 79 static struct smc_hashinfo smc_v6_hashinfo = { 80 .lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock), 81 }; 82 83 int smc_hash_sk(struct sock *sk) 84 { 85 struct smc_hashinfo *h = sk->sk_prot->h.smc_hash; 86 struct hlist_head *head; 87 88 head = &h->ht; 89 90 write_lock_bh(&h->lock); 91 sk_add_node(sk, head); 92 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 93 write_unlock_bh(&h->lock); 94 95 return 0; 96 } 97 EXPORT_SYMBOL_GPL(smc_hash_sk); 98 99 void smc_unhash_sk(struct sock *sk) 100 { 101 struct smc_hashinfo *h = sk->sk_prot->h.smc_hash; 102 103 write_lock_bh(&h->lock); 104 if (sk_del_node_init(sk)) 105 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 106 write_unlock_bh(&h->lock); 107 } 108 EXPORT_SYMBOL_GPL(smc_unhash_sk); 109 110 struct proto smc_proto = { 111 .name = "SMC", 112 .owner = THIS_MODULE, 113 .keepalive = smc_set_keepalive, 114 .hash = smc_hash_sk, 115 .unhash = smc_unhash_sk, 116 .obj_size = sizeof(struct smc_sock), 117 .h.smc_hash = &smc_v4_hashinfo, 118 .slab_flags = SLAB_TYPESAFE_BY_RCU, 119 }; 120 EXPORT_SYMBOL_GPL(smc_proto); 121 122 struct proto smc_proto6 = { 123 .name = "SMC6", 124 .owner = THIS_MODULE, 125 .keepalive = smc_set_keepalive, 126 .hash = smc_hash_sk, 127 .unhash = smc_unhash_sk, 128 .obj_size = sizeof(struct smc_sock), 129 .h.smc_hash = &smc_v6_hashinfo, 130 .slab_flags = SLAB_TYPESAFE_BY_RCU, 131 }; 132 EXPORT_SYMBOL_GPL(smc_proto6); 133 134 static void smc_restore_fallback_changes(struct smc_sock *smc) 135 { 136 if (smc->clcsock->file) { /* non-accepted sockets have no file yet */ 137 smc->clcsock->file->private_data = smc->sk.sk_socket; 138 smc->clcsock->file = NULL; 139 } 140 } 141 142 static int __smc_release(struct smc_sock *smc) 143 { 144 struct sock *sk = &smc->sk; 145 int rc = 0; 146 147 if (!smc->use_fallback) 
{ 148 rc = smc_close_active(smc); 149 sock_set_flag(sk, SOCK_DEAD); 150 sk->sk_shutdown |= SHUTDOWN_MASK; 151 } else { 152 if (sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_INIT) 153 sock_put(sk); /* passive closing */ 154 if (sk->sk_state == SMC_LISTEN) { 155 /* wake up clcsock accept */ 156 rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); 157 } 158 sk->sk_state = SMC_CLOSED; 159 sk->sk_state_change(sk); 160 smc_restore_fallback_changes(smc); 161 } 162 163 sk->sk_prot->unhash(sk); 164 165 if (sk->sk_state == SMC_CLOSED) { 166 if (smc->clcsock) { 167 release_sock(sk); 168 smc_clcsock_release(smc); 169 lock_sock(sk); 170 } 171 if (!smc->use_fallback) 172 smc_conn_free(&smc->conn); 173 } 174 175 return rc; 176 } 177 178 static int smc_release(struct socket *sock) 179 { 180 struct sock *sk = sock->sk; 181 struct smc_sock *smc; 182 int rc = 0; 183 184 if (!sk) 185 goto out; 186 187 sock_hold(sk); /* sock_put below */ 188 smc = smc_sk(sk); 189 190 /* cleanup for a dangling non-blocking connect */ 191 if (smc->connect_nonblock && sk->sk_state == SMC_INIT) 192 tcp_abort(smc->clcsock->sk, ECONNABORTED); 193 flush_work(&smc->connect_work); 194 195 if (sk->sk_state == SMC_LISTEN) 196 /* smc_close_non_accepted() is called and acquires 197 * sock lock for child sockets again 198 */ 199 lock_sock_nested(sk, SINGLE_DEPTH_NESTING); 200 else 201 lock_sock(sk); 202 203 rc = __smc_release(smc); 204 205 /* detach socket */ 206 sock_orphan(sk); 207 sock->sk = NULL; 208 release_sock(sk); 209 210 sock_put(sk); /* sock_hold above */ 211 sock_put(sk); /* final sock_put */ 212 out: 213 return rc; 214 } 215 216 static void smc_destruct(struct sock *sk) 217 { 218 if (sk->sk_state != SMC_CLOSED) 219 return; 220 if (!sock_flag(sk, SOCK_DEAD)) 221 return; 222 223 sk_refcnt_debug_dec(sk); 224 } 225 226 static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, 227 int protocol) 228 { 229 struct smc_sock *smc; 230 struct proto *prot; 231 struct sock *sk; 232 233 prot = 
(protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto; 234 sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0); 235 if (!sk) 236 return NULL; 237 238 sock_init_data(sock, sk); /* sets sk_refcnt to 1 */ 239 sk->sk_state = SMC_INIT; 240 sk->sk_destruct = smc_destruct; 241 sk->sk_protocol = protocol; 242 smc = smc_sk(sk); 243 INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); 244 INIT_WORK(&smc->connect_work, smc_connect_work); 245 INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work); 246 INIT_LIST_HEAD(&smc->accept_q); 247 spin_lock_init(&smc->accept_q_lock); 248 spin_lock_init(&smc->conn.send_lock); 249 sk->sk_prot->hash(sk); 250 sk_refcnt_debug_inc(sk); 251 mutex_init(&smc->clcsock_release_lock); 252 253 return sk; 254 } 255 256 static int smc_bind(struct socket *sock, struct sockaddr *uaddr, 257 int addr_len) 258 { 259 struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; 260 struct sock *sk = sock->sk; 261 struct smc_sock *smc; 262 int rc; 263 264 smc = smc_sk(sk); 265 266 /* replicate tests from inet_bind(), to be safe wrt. 
future changes */ 267 rc = -EINVAL; 268 if (addr_len < sizeof(struct sockaddr_in)) 269 goto out; 270 271 rc = -EAFNOSUPPORT; 272 if (addr->sin_family != AF_INET && 273 addr->sin_family != AF_INET6 && 274 addr->sin_family != AF_UNSPEC) 275 goto out; 276 /* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */ 277 if (addr->sin_family == AF_UNSPEC && 278 addr->sin_addr.s_addr != htonl(INADDR_ANY)) 279 goto out; 280 281 lock_sock(sk); 282 283 /* Check if socket is already active */ 284 rc = -EINVAL; 285 if (sk->sk_state != SMC_INIT || smc->connect_nonblock) 286 goto out_rel; 287 288 smc->clcsock->sk->sk_reuse = sk->sk_reuse; 289 rc = kernel_bind(smc->clcsock, uaddr, addr_len); 290 291 out_rel: 292 release_sock(sk); 293 out: 294 return rc; 295 } 296 297 static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk, 298 unsigned long mask) 299 { 300 /* options we don't get control via setsockopt for */ 301 nsk->sk_type = osk->sk_type; 302 nsk->sk_sndbuf = osk->sk_sndbuf; 303 nsk->sk_rcvbuf = osk->sk_rcvbuf; 304 nsk->sk_sndtimeo = osk->sk_sndtimeo; 305 nsk->sk_rcvtimeo = osk->sk_rcvtimeo; 306 nsk->sk_mark = osk->sk_mark; 307 nsk->sk_priority = osk->sk_priority; 308 nsk->sk_rcvlowat = osk->sk_rcvlowat; 309 nsk->sk_bound_dev_if = osk->sk_bound_dev_if; 310 nsk->sk_err = osk->sk_err; 311 312 nsk->sk_flags &= ~mask; 313 nsk->sk_flags |= osk->sk_flags & mask; 314 } 315 316 #define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \ 317 (1UL << SOCK_KEEPOPEN) | \ 318 (1UL << SOCK_LINGER) | \ 319 (1UL << SOCK_BROADCAST) | \ 320 (1UL << SOCK_TIMESTAMP) | \ 321 (1UL << SOCK_DBG) | \ 322 (1UL << SOCK_RCVTSTAMP) | \ 323 (1UL << SOCK_RCVTSTAMPNS) | \ 324 (1UL << SOCK_LOCALROUTE) | \ 325 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \ 326 (1UL << SOCK_RXQ_OVFL) | \ 327 (1UL << SOCK_WIFI_STATUS) | \ 328 (1UL << SOCK_NOFCS) | \ 329 (1UL << SOCK_FILTER_LOCKED) | \ 330 (1UL << SOCK_TSTAMP_NEW)) 331 /* copy only relevant settings and flags of SOL_SOCKET level from smc to 332 
* clc socket (since smc is not called for these options from net/core) 333 */ 334 static void smc_copy_sock_settings_to_clc(struct smc_sock *smc) 335 { 336 smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC); 337 } 338 339 #define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \ 340 (1UL << SOCK_KEEPOPEN) | \ 341 (1UL << SOCK_LINGER) | \ 342 (1UL << SOCK_DBG)) 343 /* copy only settings and flags relevant for smc from clc to smc socket */ 344 static void smc_copy_sock_settings_to_smc(struct smc_sock *smc) 345 { 346 smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC); 347 } 348 349 /* register the new rmb on all links */ 350 static int smcr_lgr_reg_rmbs(struct smc_link *link, 351 struct smc_buf_desc *rmb_desc) 352 { 353 struct smc_link_group *lgr = link->lgr; 354 int i, rc = 0; 355 356 rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); 357 if (rc) 358 return rc; 359 /* protect against parallel smc_llc_cli_rkey_exchange() and 360 * parallel smcr_link_reg_rmb() 361 */ 362 mutex_lock(&lgr->llc_conf_mutex); 363 for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { 364 if (!smc_link_active(&lgr->lnk[i])) 365 continue; 366 rc = smcr_link_reg_rmb(&lgr->lnk[i], rmb_desc); 367 if (rc) 368 goto out; 369 } 370 371 /* exchange confirm_rkey msg with peer */ 372 rc = smc_llc_do_confirm_rkey(link, rmb_desc); 373 if (rc) { 374 rc = -EFAULT; 375 goto out; 376 } 377 rmb_desc->is_conf_rkey = true; 378 out: 379 mutex_unlock(&lgr->llc_conf_mutex); 380 smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); 381 return rc; 382 } 383 384 static int smcr_clnt_conf_first_link(struct smc_sock *smc) 385 { 386 struct smc_link *link = smc->conn.lnk; 387 struct smc_llc_qentry *qentry; 388 int rc; 389 390 /* receive CONFIRM LINK request from server over RoCE fabric */ 391 qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, 392 SMC_LLC_CONFIRM_LINK); 393 if (!qentry) { 394 struct smc_clc_msg_decline dclc; 395 396 rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), 397 
SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); 398 return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc; 399 } 400 smc_llc_save_peer_uid(qentry); 401 rc = smc_llc_eval_conf_link(qentry, SMC_LLC_REQ); 402 smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl); 403 if (rc) 404 return SMC_CLC_DECL_RMBE_EC; 405 406 rc = smc_ib_modify_qp_rts(link); 407 if (rc) 408 return SMC_CLC_DECL_ERR_RDYLNK; 409 410 smc_wr_remember_qp_attr(link); 411 412 if (smcr_link_reg_rmb(link, smc->conn.rmb_desc)) 413 return SMC_CLC_DECL_ERR_REGRMB; 414 415 /* confirm_rkey is implicit on 1st contact */ 416 smc->conn.rmb_desc->is_conf_rkey = true; 417 418 /* send CONFIRM LINK response over RoCE fabric */ 419 rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP); 420 if (rc < 0) 421 return SMC_CLC_DECL_TIMEOUT_CL; 422 423 smc_llc_link_active(link); 424 smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE); 425 426 /* optional 2nd link, receive ADD LINK request from server */ 427 qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, 428 SMC_LLC_ADD_LINK); 429 if (!qentry) { 430 struct smc_clc_msg_decline dclc; 431 432 rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), 433 SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); 434 if (rc == -EAGAIN) 435 rc = 0; /* no DECLINE received, go with one link */ 436 return rc; 437 } 438 smc_llc_flow_qentry_clr(&link->lgr->llc_flow_lcl); 439 smc_llc_cli_add_link(link, qentry); 440 return 0; 441 } 442 443 static bool smc_isascii(char *hostname) 444 { 445 int i; 446 447 for (i = 0; i < SMC_MAX_HOSTNAME_LEN; i++) 448 if (!isascii(hostname[i])) 449 return false; 450 return true; 451 } 452 453 static void smc_conn_save_peer_info_fce(struct smc_sock *smc, 454 struct smc_clc_msg_accept_confirm *clc) 455 { 456 struct smc_clc_msg_accept_confirm_v2 *clc_v2 = 457 (struct smc_clc_msg_accept_confirm_v2 *)clc; 458 struct smc_clc_first_contact_ext *fce; 459 int clc_v2_len; 460 461 if (clc->hdr.version == SMC_V1 || 462 !(clc->hdr.typev2 & SMC_FIRST_CONTACT_MASK)) 463 return; 464 465 if (smc->conn.lgr->is_smcd) { 
466 memcpy(smc->conn.lgr->negotiated_eid, clc_v2->d1.eid, 467 SMC_MAX_EID_LEN); 468 clc_v2_len = offsetofend(struct smc_clc_msg_accept_confirm_v2, 469 d1); 470 } else { 471 memcpy(smc->conn.lgr->negotiated_eid, clc_v2->r1.eid, 472 SMC_MAX_EID_LEN); 473 clc_v2_len = offsetofend(struct smc_clc_msg_accept_confirm_v2, 474 r1); 475 } 476 fce = (struct smc_clc_first_contact_ext *)(((u8 *)clc_v2) + clc_v2_len); 477 smc->conn.lgr->peer_os = fce->os_type; 478 smc->conn.lgr->peer_smc_release = fce->release; 479 if (smc_isascii(fce->hostname)) 480 memcpy(smc->conn.lgr->peer_hostname, fce->hostname, 481 SMC_MAX_HOSTNAME_LEN); 482 } 483 484 static void smcr_conn_save_peer_info(struct smc_sock *smc, 485 struct smc_clc_msg_accept_confirm *clc) 486 { 487 int bufsize = smc_uncompress_bufsize(clc->r0.rmbe_size); 488 489 smc->conn.peer_rmbe_idx = clc->r0.rmbe_idx; 490 smc->conn.local_tx_ctrl.token = ntohl(clc->r0.rmbe_alert_token); 491 smc->conn.peer_rmbe_size = bufsize; 492 atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size); 493 smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1); 494 } 495 496 static void smcd_conn_save_peer_info(struct smc_sock *smc, 497 struct smc_clc_msg_accept_confirm *clc) 498 { 499 int bufsize = smc_uncompress_bufsize(clc->d0.dmbe_size); 500 501 smc->conn.peer_rmbe_idx = clc->d0.dmbe_idx; 502 smc->conn.peer_token = clc->d0.token; 503 /* msg header takes up space in the buffer */ 504 smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg); 505 atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size); 506 smc->conn.tx_off = bufsize * smc->conn.peer_rmbe_idx; 507 } 508 509 static void smc_conn_save_peer_info(struct smc_sock *smc, 510 struct smc_clc_msg_accept_confirm *clc) 511 { 512 if (smc->conn.lgr->is_smcd) 513 smcd_conn_save_peer_info(smc, clc); 514 else 515 smcr_conn_save_peer_info(smc, clc); 516 smc_conn_save_peer_info_fce(smc, clc); 517 } 518 519 static void smc_link_save_peer_info(struct smc_link *link, 520 struct 
smc_clc_msg_accept_confirm *clc, 521 struct smc_init_info *ini) 522 { 523 link->peer_qpn = ntoh24(clc->r0.qpn); 524 memcpy(link->peer_gid, ini->peer_gid, SMC_GID_SIZE); 525 memcpy(link->peer_mac, ini->peer_mac, sizeof(link->peer_mac)); 526 link->peer_psn = ntoh24(clc->r0.psn); 527 link->peer_mtu = clc->r0.qp_mtu; 528 } 529 530 static void smc_stat_inc_fback_rsn_cnt(struct smc_sock *smc, 531 struct smc_stats_fback *fback_arr) 532 { 533 int cnt; 534 535 for (cnt = 0; cnt < SMC_MAX_FBACK_RSN_CNT; cnt++) { 536 if (fback_arr[cnt].fback_code == smc->fallback_rsn) { 537 fback_arr[cnt].count++; 538 break; 539 } 540 if (!fback_arr[cnt].fback_code) { 541 fback_arr[cnt].fback_code = smc->fallback_rsn; 542 fback_arr[cnt].count++; 543 break; 544 } 545 } 546 } 547 548 static void smc_stat_fallback(struct smc_sock *smc) 549 { 550 struct net *net = sock_net(&smc->sk); 551 552 mutex_lock(&net->smc.mutex_fback_rsn); 553 if (smc->listen_smc) { 554 smc_stat_inc_fback_rsn_cnt(smc, net->smc.fback_rsn->srv); 555 net->smc.fback_rsn->srv_fback_cnt++; 556 } else { 557 smc_stat_inc_fback_rsn_cnt(smc, net->smc.fback_rsn->clnt); 558 net->smc.fback_rsn->clnt_fback_cnt++; 559 } 560 mutex_unlock(&net->smc.mutex_fback_rsn); 561 } 562 563 static void smc_switch_to_fallback(struct smc_sock *smc, int reason_code) 564 { 565 smc->use_fallback = true; 566 smc->fallback_rsn = reason_code; 567 smc_stat_fallback(smc); 568 trace_smc_switch_to_fallback(smc, reason_code); 569 if (smc->sk.sk_socket && smc->sk.sk_socket->file) { 570 smc->clcsock->file = smc->sk.sk_socket->file; 571 smc->clcsock->file->private_data = smc->clcsock; 572 smc->clcsock->wq.fasync_list = 573 smc->sk.sk_socket->wq.fasync_list; 574 } 575 } 576 577 /* fall back during connect */ 578 static int smc_connect_fallback(struct smc_sock *smc, int reason_code) 579 { 580 smc_switch_to_fallback(smc, reason_code); 581 smc_copy_sock_settings_to_clc(smc); 582 smc->connect_nonblock = 0; 583 if (smc->sk.sk_state == SMC_INIT) 584 smc->sk.sk_state = 
SMC_ACTIVE; 585 return 0; 586 } 587 588 /* decline and fall back during connect */ 589 static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code, 590 u8 version) 591 { 592 struct net *net = sock_net(&smc->sk); 593 int rc; 594 595 if (reason_code < 0) { /* error, fallback is not possible */ 596 this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt); 597 if (smc->sk.sk_state == SMC_INIT) 598 sock_put(&smc->sk); /* passive closing */ 599 return reason_code; 600 } 601 if (reason_code != SMC_CLC_DECL_PEERDECL) { 602 rc = smc_clc_send_decline(smc, reason_code, version); 603 if (rc < 0) { 604 this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt); 605 if (smc->sk.sk_state == SMC_INIT) 606 sock_put(&smc->sk); /* passive closing */ 607 return rc; 608 } 609 } 610 return smc_connect_fallback(smc, reason_code); 611 } 612 613 static void smc_conn_abort(struct smc_sock *smc, int local_first) 614 { 615 if (local_first) 616 smc_lgr_cleanup_early(&smc->conn); 617 else 618 smc_conn_free(&smc->conn); 619 } 620 621 /* check if there is a rdma device available for this connection. */ 622 /* called for connect and listen */ 623 static int smc_find_rdma_device(struct smc_sock *smc, struct smc_init_info *ini) 624 { 625 /* PNET table look up: search active ib_device and port 626 * within same PNETID that also contains the ethernet device 627 * used for the internal TCP socket 628 */ 629 smc_pnet_find_roce_resource(smc->clcsock->sk, ini); 630 if (!ini->check_smcrv2 && !ini->ib_dev) 631 return SMC_CLC_DECL_NOSMCRDEV; 632 if (ini->check_smcrv2 && !ini->smcrv2.ib_dev_v2) 633 return SMC_CLC_DECL_NOSMCRDEV; 634 return 0; 635 } 636 637 /* check if there is an ISM device available for this connection. 
*/ 638 /* called for connect and listen */ 639 static int smc_find_ism_device(struct smc_sock *smc, struct smc_init_info *ini) 640 { 641 /* Find ISM device with same PNETID as connecting interface */ 642 smc_pnet_find_ism_resource(smc->clcsock->sk, ini); 643 if (!ini->ism_dev[0]) 644 return SMC_CLC_DECL_NOSMCDDEV; 645 else 646 ini->ism_chid[0] = smc_ism_get_chid(ini->ism_dev[0]); 647 return 0; 648 } 649 650 /* is chid unique for the ism devices that are already determined? */ 651 static bool smc_find_ism_v2_is_unique_chid(u16 chid, struct smc_init_info *ini, 652 int cnt) 653 { 654 int i = (!ini->ism_dev[0]) ? 1 : 0; 655 656 for (; i < cnt; i++) 657 if (ini->ism_chid[i] == chid) 658 return false; 659 return true; 660 } 661 662 /* determine possible V2 ISM devices (either without PNETID or with PNETID plus 663 * PNETID matching net_device) 664 */ 665 static int smc_find_ism_v2_device_clnt(struct smc_sock *smc, 666 struct smc_init_info *ini) 667 { 668 int rc = SMC_CLC_DECL_NOSMCDDEV; 669 struct smcd_dev *smcd; 670 int i = 1; 671 u16 chid; 672 673 if (smcd_indicated(ini->smc_type_v1)) 674 rc = 0; /* already initialized for V1 */ 675 mutex_lock(&smcd_dev_list.mutex); 676 list_for_each_entry(smcd, &smcd_dev_list.list, list) { 677 if (smcd->going_away || smcd == ini->ism_dev[0]) 678 continue; 679 chid = smc_ism_get_chid(smcd); 680 if (!smc_find_ism_v2_is_unique_chid(chid, ini, i)) 681 continue; 682 if (!smc_pnet_is_pnetid_set(smcd->pnetid) || 683 smc_pnet_is_ndev_pnetid(sock_net(&smc->sk), smcd->pnetid)) { 684 ini->ism_dev[i] = smcd; 685 ini->ism_chid[i] = chid; 686 ini->is_smcd = true; 687 rc = 0; 688 i++; 689 if (i > SMC_MAX_ISM_DEVS) 690 break; 691 } 692 } 693 mutex_unlock(&smcd_dev_list.mutex); 694 ini->ism_offered_cnt = i - 1; 695 if (!ini->ism_dev[0] && !ini->ism_dev[1]) 696 ini->smcd_version = 0; 697 698 return rc; 699 } 700 701 /* Check for VLAN ID and register it on ISM device just for CLC handshake */ 702 static int smc_connect_ism_vlan_setup(struct smc_sock 
*smc, 703 struct smc_init_info *ini) 704 { 705 if (ini->vlan_id && smc_ism_get_vlan(ini->ism_dev[0], ini->vlan_id)) 706 return SMC_CLC_DECL_ISMVLANERR; 707 return 0; 708 } 709 710 static int smc_find_proposal_devices(struct smc_sock *smc, 711 struct smc_init_info *ini) 712 { 713 int rc = 0; 714 715 /* check if there is an ism device available */ 716 if (!(ini->smcd_version & SMC_V1) || 717 smc_find_ism_device(smc, ini) || 718 smc_connect_ism_vlan_setup(smc, ini)) 719 ini->smcd_version &= ~SMC_V1; 720 /* else ISM V1 is supported for this connection */ 721 722 /* check if there is an rdma device available */ 723 if (!(ini->smcr_version & SMC_V1) || 724 smc_find_rdma_device(smc, ini)) 725 ini->smcr_version &= ~SMC_V1; 726 /* else RDMA is supported for this connection */ 727 728 ini->smc_type_v1 = smc_indicated_type(ini->smcd_version & SMC_V1, 729 ini->smcr_version & SMC_V1); 730 731 /* check if there is an ism v2 device available */ 732 if (!(ini->smcd_version & SMC_V2) || 733 !smc_ism_is_v2_capable() || 734 smc_find_ism_v2_device_clnt(smc, ini)) 735 ini->smcd_version &= ~SMC_V2; 736 737 /* check if there is an rdma v2 device available */ 738 ini->check_smcrv2 = true; 739 ini->smcrv2.saddr = smc->clcsock->sk->sk_rcv_saddr; 740 if (!(ini->smcr_version & SMC_V2) || 741 smc->clcsock->sk->sk_family != AF_INET || 742 !smc_clc_ueid_count() || 743 smc_find_rdma_device(smc, ini)) 744 ini->smcr_version &= ~SMC_V2; 745 ini->check_smcrv2 = false; 746 747 ini->smc_type_v2 = smc_indicated_type(ini->smcd_version & SMC_V2, 748 ini->smcr_version & SMC_V2); 749 750 /* if neither ISM nor RDMA are supported, fallback */ 751 if (ini->smc_type_v1 == SMC_TYPE_N && ini->smc_type_v2 == SMC_TYPE_N) 752 rc = SMC_CLC_DECL_NOSMCDEV; 753 754 return rc; 755 } 756 757 /* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is 758 * used, the VLAN ID will be registered again during the connection setup. 
759 */ 760 static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, 761 struct smc_init_info *ini) 762 { 763 if (!smcd_indicated(ini->smc_type_v1)) 764 return 0; 765 if (ini->vlan_id && smc_ism_put_vlan(ini->ism_dev[0], ini->vlan_id)) 766 return SMC_CLC_DECL_CNFERR; 767 return 0; 768 } 769 770 #define SMC_CLC_MAX_ACCEPT_LEN \ 771 (sizeof(struct smc_clc_msg_accept_confirm_v2) + \ 772 sizeof(struct smc_clc_first_contact_ext) + \ 773 sizeof(struct smc_clc_msg_trail)) 774 775 /* CLC handshake during connect */ 776 static int smc_connect_clc(struct smc_sock *smc, 777 struct smc_clc_msg_accept_confirm_v2 *aclc2, 778 struct smc_init_info *ini) 779 { 780 int rc = 0; 781 782 /* do inband token exchange */ 783 rc = smc_clc_send_proposal(smc, ini); 784 if (rc) 785 return rc; 786 /* receive SMC Accept CLC message */ 787 return smc_clc_wait_msg(smc, aclc2, SMC_CLC_MAX_ACCEPT_LEN, 788 SMC_CLC_ACCEPT, CLC_WAIT_TIME); 789 } 790 791 void smc_fill_gid_list(struct smc_link_group *lgr, 792 struct smc_gidlist *gidlist, 793 struct smc_ib_device *known_dev, u8 *known_gid) 794 { 795 struct smc_init_info *alt_ini = NULL; 796 797 memset(gidlist, 0, sizeof(*gidlist)); 798 memcpy(gidlist->list[gidlist->len++], known_gid, SMC_GID_SIZE); 799 800 alt_ini = kzalloc(sizeof(*alt_ini), GFP_KERNEL); 801 if (!alt_ini) 802 goto out; 803 804 alt_ini->vlan_id = lgr->vlan_id; 805 alt_ini->check_smcrv2 = true; 806 alt_ini->smcrv2.saddr = lgr->saddr; 807 smc_pnet_find_alt_roce(lgr, alt_ini, known_dev); 808 809 if (!alt_ini->smcrv2.ib_dev_v2) 810 goto out; 811 812 memcpy(gidlist->list[gidlist->len++], alt_ini->smcrv2.ib_gid_v2, 813 SMC_GID_SIZE); 814 815 out: 816 kfree(alt_ini); 817 } 818 819 static int smc_connect_rdma_v2_prepare(struct smc_sock *smc, 820 struct smc_clc_msg_accept_confirm *aclc, 821 struct smc_init_info *ini) 822 { 823 struct smc_clc_msg_accept_confirm_v2 *clc_v2 = 824 (struct smc_clc_msg_accept_confirm_v2 *)aclc; 825 struct smc_clc_first_contact_ext *fce = 826 (struct 
smc_clc_first_contact_ext *) 827 (((u8 *)clc_v2) + sizeof(*clc_v2)); 828 829 if (!ini->first_contact_peer || aclc->hdr.version == SMC_V1) 830 return 0; 831 832 if (fce->v2_direct) { 833 memcpy(ini->smcrv2.nexthop_mac, &aclc->r0.lcl.mac, ETH_ALEN); 834 ini->smcrv2.uses_gateway = false; 835 } else { 836 if (smc_ib_find_route(smc->clcsock->sk->sk_rcv_saddr, 837 smc_ib_gid_to_ipv4(aclc->r0.lcl.gid), 838 ini->smcrv2.nexthop_mac, 839 &ini->smcrv2.uses_gateway)) 840 return SMC_CLC_DECL_NOROUTE; 841 if (!ini->smcrv2.uses_gateway) { 842 /* mismatch: peer claims indirect, but its direct */ 843 return SMC_CLC_DECL_NOINDIRECT; 844 } 845 } 846 return 0; 847 } 848 849 /* setup for RDMA connection of client */ 850 static int smc_connect_rdma(struct smc_sock *smc, 851 struct smc_clc_msg_accept_confirm *aclc, 852 struct smc_init_info *ini) 853 { 854 int i, reason_code = 0; 855 struct smc_link *link; 856 u8 *eid = NULL; 857 858 ini->is_smcd = false; 859 ini->ib_clcqpn = ntoh24(aclc->r0.qpn); 860 ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK; 861 memcpy(ini->peer_systemid, aclc->r0.lcl.id_for_peer, SMC_SYSTEMID_LEN); 862 memcpy(ini->peer_gid, aclc->r0.lcl.gid, SMC_GID_SIZE); 863 memcpy(ini->peer_mac, aclc->r0.lcl.mac, ETH_ALEN); 864 865 reason_code = smc_connect_rdma_v2_prepare(smc, aclc, ini); 866 if (reason_code) 867 return reason_code; 868 869 mutex_lock(&smc_client_lgr_pending); 870 reason_code = smc_conn_create(smc, ini); 871 if (reason_code) { 872 mutex_unlock(&smc_client_lgr_pending); 873 return reason_code; 874 } 875 876 smc_conn_save_peer_info(smc, aclc); 877 878 if (ini->first_contact_local) { 879 link = smc->conn.lnk; 880 } else { 881 /* set link that was assigned by server */ 882 link = NULL; 883 for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { 884 struct smc_link *l = &smc->conn.lgr->lnk[i]; 885 886 if (l->peer_qpn == ntoh24(aclc->r0.qpn) && 887 !memcmp(l->peer_gid, &aclc->r0.lcl.gid, 888 SMC_GID_SIZE) && 889 (aclc->hdr.version > SMC_V1 || 890 
!memcmp(l->peer_mac, &aclc->r0.lcl.mac, 891 sizeof(l->peer_mac)))) { 892 link = l; 893 break; 894 } 895 } 896 if (!link) { 897 reason_code = SMC_CLC_DECL_NOSRVLINK; 898 goto connect_abort; 899 } 900 smc_switch_link_and_count(&smc->conn, link); 901 } 902 903 /* create send buffer and rmb */ 904 if (smc_buf_create(smc, false)) { 905 reason_code = SMC_CLC_DECL_MEM; 906 goto connect_abort; 907 } 908 909 if (ini->first_contact_local) 910 smc_link_save_peer_info(link, aclc, ini); 911 912 if (smc_rmb_rtoken_handling(&smc->conn, link, aclc)) { 913 reason_code = SMC_CLC_DECL_ERR_RTOK; 914 goto connect_abort; 915 } 916 917 smc_close_init(smc); 918 smc_rx_init(smc); 919 920 if (ini->first_contact_local) { 921 if (smc_ib_ready_link(link)) { 922 reason_code = SMC_CLC_DECL_ERR_RDYLNK; 923 goto connect_abort; 924 } 925 } else { 926 if (smcr_lgr_reg_rmbs(link, smc->conn.rmb_desc)) { 927 reason_code = SMC_CLC_DECL_ERR_REGRMB; 928 goto connect_abort; 929 } 930 } 931 smc_rmb_sync_sg_for_device(&smc->conn); 932 933 if (aclc->hdr.version > SMC_V1) { 934 struct smc_clc_msg_accept_confirm_v2 *clc_v2 = 935 (struct smc_clc_msg_accept_confirm_v2 *)aclc; 936 937 eid = clc_v2->r1.eid; 938 if (ini->first_contact_local) 939 smc_fill_gid_list(link->lgr, &ini->smcrv2.gidlist, 940 link->smcibdev, link->gid); 941 } 942 943 reason_code = smc_clc_send_confirm(smc, ini->first_contact_local, 944 aclc->hdr.version, eid, ini); 945 if (reason_code) 946 goto connect_abort; 947 948 smc_tx_init(smc); 949 950 if (ini->first_contact_local) { 951 /* QP confirmation over RoCE fabric */ 952 smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK); 953 reason_code = smcr_clnt_conf_first_link(smc); 954 smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl); 955 if (reason_code) 956 goto connect_abort; 957 } 958 mutex_unlock(&smc_client_lgr_pending); 959 960 smc_copy_sock_settings_to_clc(smc); 961 smc->connect_nonblock = 0; 962 if (smc->sk.sk_state == SMC_INIT) 963 smc->sk.sk_state = SMC_ACTIVE; 964 965 return 0; 966 
connect_abort: 967 smc_conn_abort(smc, ini->first_contact_local); 968 mutex_unlock(&smc_client_lgr_pending); 969 smc->connect_nonblock = 0; 970 971 return reason_code; 972 } 973 974 /* The server has chosen one of the proposed ISM devices for the communication. 975 * Determine from the CHID of the received CLC ACCEPT the ISM device chosen. 976 */ 977 static int 978 smc_v2_determine_accepted_chid(struct smc_clc_msg_accept_confirm_v2 *aclc, 979 struct smc_init_info *ini) 980 { 981 int i; 982 983 for (i = 0; i < ini->ism_offered_cnt + 1; i++) { 984 if (ini->ism_chid[i] == ntohs(aclc->d1.chid)) { 985 ini->ism_selected = i; 986 return 0; 987 } 988 } 989 990 return -EPROTO; 991 } 992 993 /* setup for ISM connection of client */ 994 static int smc_connect_ism(struct smc_sock *smc, 995 struct smc_clc_msg_accept_confirm *aclc, 996 struct smc_init_info *ini) 997 { 998 u8 *eid = NULL; 999 int rc = 0; 1000 1001 ini->is_smcd = true; 1002 ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK; 1003 1004 if (aclc->hdr.version == SMC_V2) { 1005 struct smc_clc_msg_accept_confirm_v2 *aclc_v2 = 1006 (struct smc_clc_msg_accept_confirm_v2 *)aclc; 1007 1008 rc = smc_v2_determine_accepted_chid(aclc_v2, ini); 1009 if (rc) 1010 return rc; 1011 } 1012 ini->ism_peer_gid[ini->ism_selected] = aclc->d0.gid; 1013 1014 /* there is only one lgr role for SMC-D; use server lock */ 1015 mutex_lock(&smc_server_lgr_pending); 1016 rc = smc_conn_create(smc, ini); 1017 if (rc) { 1018 mutex_unlock(&smc_server_lgr_pending); 1019 return rc; 1020 } 1021 1022 /* Create send and receive buffers */ 1023 rc = smc_buf_create(smc, true); 1024 if (rc) { 1025 rc = (rc == -ENOSPC) ? 
SMC_CLC_DECL_MAX_DMB : SMC_CLC_DECL_MEM; 1026 goto connect_abort; 1027 } 1028 1029 smc_conn_save_peer_info(smc, aclc); 1030 smc_close_init(smc); 1031 smc_rx_init(smc); 1032 smc_tx_init(smc); 1033 1034 if (aclc->hdr.version > SMC_V1) { 1035 struct smc_clc_msg_accept_confirm_v2 *clc_v2 = 1036 (struct smc_clc_msg_accept_confirm_v2 *)aclc; 1037 1038 eid = clc_v2->d1.eid; 1039 } 1040 1041 rc = smc_clc_send_confirm(smc, ini->first_contact_local, 1042 aclc->hdr.version, eid, NULL); 1043 if (rc) 1044 goto connect_abort; 1045 mutex_unlock(&smc_server_lgr_pending); 1046 1047 smc_copy_sock_settings_to_clc(smc); 1048 smc->connect_nonblock = 0; 1049 if (smc->sk.sk_state == SMC_INIT) 1050 smc->sk.sk_state = SMC_ACTIVE; 1051 1052 return 0; 1053 connect_abort: 1054 smc_conn_abort(smc, ini->first_contact_local); 1055 mutex_unlock(&smc_server_lgr_pending); 1056 smc->connect_nonblock = 0; 1057 1058 return rc; 1059 } 1060 1061 /* check if received accept type and version matches a proposed one */ 1062 static int smc_connect_check_aclc(struct smc_init_info *ini, 1063 struct smc_clc_msg_accept_confirm *aclc) 1064 { 1065 if (aclc->hdr.typev1 != SMC_TYPE_R && 1066 aclc->hdr.typev1 != SMC_TYPE_D) 1067 return SMC_CLC_DECL_MODEUNSUPP; 1068 1069 if (aclc->hdr.version >= SMC_V2) { 1070 if ((aclc->hdr.typev1 == SMC_TYPE_R && 1071 !smcr_indicated(ini->smc_type_v2)) || 1072 (aclc->hdr.typev1 == SMC_TYPE_D && 1073 !smcd_indicated(ini->smc_type_v2))) 1074 return SMC_CLC_DECL_MODEUNSUPP; 1075 } else { 1076 if ((aclc->hdr.typev1 == SMC_TYPE_R && 1077 !smcr_indicated(ini->smc_type_v1)) || 1078 (aclc->hdr.typev1 == SMC_TYPE_D && 1079 !smcd_indicated(ini->smc_type_v1))) 1080 return SMC_CLC_DECL_MODEUNSUPP; 1081 } 1082 1083 return 0; 1084 } 1085 1086 /* perform steps before actually connecting */ 1087 static int __smc_connect(struct smc_sock *smc) 1088 { 1089 u8 version = smc_ism_is_v2_capable() ? 
SMC_V2 : SMC_V1;
	struct smc_clc_msg_accept_confirm_v2 *aclc2;
	struct smc_clc_msg_accept_confirm *aclc;
	struct smc_init_info *ini = NULL;
	u8 *buf = NULL;
	int rc = 0;

	if (smc->use_fallback)
		return smc_connect_fallback(smc, smc->fallback_rsn);

	/* if peer has not signalled SMC-capability, fall back */
	if (!tcp_sk(smc->clcsock->sk)->syn_smc)
		return smc_connect_fallback(smc, SMC_CLC_DECL_PEERNOSMC);

	/* IPSec connections opt out of SMC optimizations */
	if (using_ipsec(smc))
		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC,
						    version);

	ini = kzalloc(sizeof(*ini), GFP_KERNEL);
	if (!ini)
		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_MEM,
						    version);

	/* start out proposing both versions and both types */
	ini->smcd_version = SMC_V1 | SMC_V2;
	ini->smcr_version = SMC_V1 | SMC_V2;
	ini->smc_type_v1 = SMC_TYPE_B;
	ini->smc_type_v2 = SMC_TYPE_B;

	/* get vlan id from IP device */
	if (smc_vlan_by_tcpsk(smc->clcsock, ini)) {
		/* without a vlan id only SMC-D V2 remains a candidate */
		ini->smcd_version &= ~SMC_V1;
		ini->smcr_version = 0;
		ini->smc_type_v1 = SMC_TYPE_N;
		if (!ini->smcd_version) {
			rc = SMC_CLC_DECL_GETVLANERR;
			goto fallback;
		}
	}

	rc = smc_find_proposal_devices(smc, ini);
	if (rc)
		goto fallback;

	buf = kzalloc(SMC_CLC_MAX_ACCEPT_LEN, GFP_KERNEL);
	if (!buf) {
		rc = SMC_CLC_DECL_MEM;
		goto fallback;
	}
	/* both views alias the same receive buffer */
	aclc2 = (struct smc_clc_msg_accept_confirm_v2 *)buf;
	aclc = (struct smc_clc_msg_accept_confirm *)aclc2;

	/* perform CLC handshake */
	rc = smc_connect_clc(smc, aclc2, ini);
	if (rc)
		goto vlan_cleanup;

	/* check if smc modes and versions of CLC proposal and accept match */
	rc = smc_connect_check_aclc(ini, aclc);
	/* from here on a decline is sent with the version the peer accepted */
	version = aclc->hdr.version == SMC_V1 ? SMC_V1 : SMC_V2;
	if (rc)
		goto vlan_cleanup;

	/* depending on previous steps, connect using rdma or ism */
	if (aclc->hdr.typev1 == SMC_TYPE_R) {
		ini->smcr_version = version;
		rc = smc_connect_rdma(smc, aclc, ini);
	} else if (aclc->hdr.typev1 == SMC_TYPE_D) {
		ini->smcd_version = version;
		rc = smc_connect_ism(smc, aclc, ini);
	}
	if (rc)
		goto vlan_cleanup;

	SMC_STAT_CLNT_SUCC_INC(sock_net(smc->clcsock->sk), aclc);
	smc_connect_ism_vlan_cleanup(smc, ini);
	kfree(buf);
	kfree(ini);
	return 0;

vlan_cleanup:
	smc_connect_ism_vlan_cleanup(smc, ini);
	kfree(buf);
fallback:
	kfree(ini);
	return smc_connect_decline_fallback(smc, rc, version);
}

/* Worker for smc->connect_work (queued by smc_connect() for O_NONBLOCK):
 * waits for the TCP connect of the clcsock to finish, then runs
 * __smc_connect() under the smc socket lock and wakes up waiters.
 */
static void smc_connect_work(struct work_struct *work)
{
	struct smc_sock *smc = container_of(work, struct smc_sock,
					    connect_work);
	long timeo = smc->sk.sk_sndtimeo;
	int rc = 0;

	/* a send timeout of 0 means: wait without limit */
	if (!timeo)
		timeo = MAX_SCHEDULE_TIMEOUT;
	lock_sock(smc->clcsock->sk);
	if (smc->clcsock->sk->sk_err) {
		smc->sk.sk_err = smc->clcsock->sk->sk_err;
	} else if ((1 << smc->clcsock->sk->sk_state) &
					(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
		rc = sk_stream_wait_connect(smc->clcsock->sk, &timeo);
		/* -EPIPE with an established TCP connection is not an error */
		if ((rc == -EPIPE) &&
		    ((1 << smc->clcsock->sk->sk_state) &
					(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)))
			rc = 0;
	}
	release_sock(smc->clcsock->sk);
	lock_sock(&smc->sk);
	if (rc != 0 || smc->sk.sk_err) {
		smc->sk.sk_state = SMC_CLOSED;
		if (rc == -EPIPE || rc == -EAGAIN)
			smc->sk.sk_err = EPIPE;
		else if (signal_pending(current))
			smc->sk.sk_err = -sock_intr_errno(timeo);
		sock_put(&smc->sk); /* passive closing */
		goto out;
	}

	rc = __smc_connect(smc);
	if (rc < 0)
		smc->sk.sk_err = -rc;

out:
	if (!sock_flag(&smc->sk, SOCK_DEAD)) {
		if (smc->sk.sk_err) {
smc->sk.sk_state_change(&smc->sk); 1217 } else { /* allow polling before and after fallback decision */ 1218 smc->clcsock->sk->sk_write_space(smc->clcsock->sk); 1219 smc->sk.sk_write_space(&smc->sk); 1220 } 1221 } 1222 release_sock(&smc->sk); 1223 } 1224 1225 static int smc_connect(struct socket *sock, struct sockaddr *addr, 1226 int alen, int flags) 1227 { 1228 struct sock *sk = sock->sk; 1229 struct smc_sock *smc; 1230 int rc = -EINVAL; 1231 1232 smc = smc_sk(sk); 1233 1234 /* separate smc parameter checking to be safe */ 1235 if (alen < sizeof(addr->sa_family)) 1236 goto out_err; 1237 if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6) 1238 goto out_err; 1239 1240 lock_sock(sk); 1241 switch (sk->sk_state) { 1242 default: 1243 goto out; 1244 case SMC_ACTIVE: 1245 rc = -EISCONN; 1246 goto out; 1247 case SMC_INIT: 1248 break; 1249 } 1250 1251 smc_copy_sock_settings_to_clc(smc); 1252 tcp_sk(smc->clcsock->sk)->syn_smc = 1; 1253 if (smc->connect_nonblock) { 1254 rc = -EALREADY; 1255 goto out; 1256 } 1257 rc = kernel_connect(smc->clcsock, addr, alen, flags); 1258 if (rc && rc != -EINPROGRESS) 1259 goto out; 1260 1261 sock_hold(&smc->sk); /* sock put in passive closing */ 1262 if (smc->use_fallback) 1263 goto out; 1264 if (flags & O_NONBLOCK) { 1265 if (queue_work(smc_hs_wq, &smc->connect_work)) 1266 smc->connect_nonblock = 1; 1267 rc = -EINPROGRESS; 1268 } else { 1269 rc = __smc_connect(smc); 1270 if (rc < 0) 1271 goto out; 1272 else 1273 rc = 0; /* success cases including fallback */ 1274 } 1275 1276 out: 1277 release_sock(sk); 1278 out_err: 1279 return rc; 1280 } 1281 1282 static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) 1283 { 1284 struct socket *new_clcsock = NULL; 1285 struct sock *lsk = &lsmc->sk; 1286 struct sock *new_sk; 1287 int rc = -EINVAL; 1288 1289 release_sock(lsk); 1290 new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol); 1291 if (!new_sk) { 1292 rc = -ENOMEM; 1293 lsk->sk_err = ENOMEM; 1294 *new_smc = 
NULL;
		lock_sock(lsk);
		goto out;
	}
	*new_smc = smc_sk(new_sk);

	/* clcsock may be released concurrently; check it under the mutex */
	mutex_lock(&lsmc->clcsock_release_lock);
	if (lsmc->clcsock)
		rc = kernel_accept(lsmc->clcsock, &new_clcsock, SOCK_NONBLOCK);
	mutex_unlock(&lsmc->clcsock_release_lock);
	lock_sock(lsk);
	/* -EAGAIN only means the clcsock accept queue is empty */
	if (rc < 0 && rc != -EAGAIN)
		lsk->sk_err = -rc;
	if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
		new_sk->sk_prot->unhash(new_sk);
		if (new_clcsock)
			sock_release(new_clcsock);
		new_sk->sk_state = SMC_CLOSED;
		sock_set_flag(new_sk, SOCK_DEAD);
		sock_put(new_sk); /* final */
		*new_smc = NULL;
		goto out;
	}

	/* new clcsock has inherited the smc listen-specific sk_data_ready
	 * function; switch it back to the original sk_data_ready function
	 */
	new_clcsock->sk->sk_data_ready = lsmc->clcsk_data_ready;
	(*new_smc)->clcsock = new_clcsock;
out:
	return rc;
}

/* add a just created sock to the accept queue of the listen sock as
 * candidate for a following socket accept call from user space
 */
static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
{
	struct smc_sock *par = smc_sk(parent);

	sock_hold(sk); /* sock_put in smc_accept_unlink () */
	spin_lock(&par->accept_q_lock);
	list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
	spin_unlock(&par->accept_q_lock);
	sk_acceptq_added(parent);
}

/* remove a socket from the accept queue of its parental listening socket */
static void smc_accept_unlink(struct sock *sk)
{
	struct smc_sock *par = smc_sk(sk)->listen_smc;

	spin_lock(&par->accept_q_lock);
	list_del_init(&smc_sk(sk)->accept_q);
	spin_unlock(&par->accept_q_lock);
	sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
	sock_put(sk); /* sock_hold in smc_accept_enqueue */
}

/* remove a sock from the accept queue to bind it to a new socket created
 * for a socket accept call from user space
 *
 * Queue entries already in state SMC_CLOSED are released and skipped.
 * Returns the first usable sock (grafted onto @new_sock when that is
 * non-NULL) or NULL when the queue holds no usable entry.
 */
struct sock *smc_accept_dequeue(struct sock *parent,
				struct socket *new_sock)
{
	struct smc_sock *isk, *n;
	struct sock *new_sk;

	list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
		new_sk = (struct sock *)isk;

		smc_accept_unlink(new_sk);
		if (new_sk->sk_state == SMC_CLOSED) {
			new_sk->sk_prot->unhash(new_sk);
			if (isk->clcsock) {
				sock_release(isk->clcsock);
				isk->clcsock = NULL;
			}
			sock_put(new_sk); /* final */
			continue;
		}
		if (new_sock) {
			sock_graft(new_sk, new_sock);
			/* in fallback the clcsock shares the file of new_sock */
			if (isk->use_fallback) {
				smc_sk(new_sk)->clcsock->file = new_sock->file;
				isk->clcsock->file->private_data = isk->clcsock;
			}
		}
		return new_sk;
	}
	return NULL;
}

/* clean up for a created but never accepted sock */
void smc_close_non_accepted(struct sock *sk)
{
	struct smc_sock *smc = smc_sk(sk);

	sock_hold(sk); /* sock_put below */
	lock_sock(sk);
	if (!sk->sk_lingertime)
		/* wait for peer closing */
		sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
	__smc_release(smc);
	release_sock(sk);
	sock_put(sk); /* sock_hold above */
	sock_put(sk); /* final sock_put */
}

/* Server side of the CONFIRM LINK exchange for the first link of a link
 * group. Returns 0 on success or a SMC_CLC_DECL_* reason code.
 */
static int smcr_serv_conf_first_link(struct smc_sock *smc)
{
	struct smc_link *link = smc->conn.lnk;
	struct smc_llc_qentry *qentry;
	int rc;

	if (smcr_link_reg_rmb(link, smc->conn.rmb_desc))
		return SMC_CLC_DECL_ERR_REGRMB;

	/* send CONFIRM LINK request to client over the RoCE fabric */
	rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ);
	if (rc < 0)
		return SMC_CLC_DECL_TIMEOUT_CL;

	/* receive CONFIRM LINK response from client over the RoCE fabric */
	qentry = smc_llc_wait(link->lgr, link, SMC_LLC_WAIT_TIME,
			      SMC_LLC_CONFIRM_LINK);
	if (!qentry) {
		struct smc_clc_msg_decline dclc;

		rc =
smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
		return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
	}
	smc_llc_save_peer_uid(qentry);
	rc = smc_llc_eval_conf_link(qentry, SMC_LLC_RESP);
	smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl);
	if (rc)
		return SMC_CLC_DECL_RMBE_EC;

	/* confirm_rkey is implicit on 1st contact */
	smc->conn.rmb_desc->is_conf_rkey = true;

	smc_llc_link_active(link);
	smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE);

	/* initial contact - try to establish second link */
	smc_llc_srv_add_link(link, NULL);
	return 0;
}

/* listen worker: finish
 *
 * Hands the new sock to the listen socket's accept queue, or closes it if
 * the parent is no longer listening, then wakes up accept and drops the
 * reference on the listen socket.
 */
static void smc_listen_out(struct smc_sock *new_smc)
{
	struct smc_sock *lsmc = new_smc->listen_smc;
	struct sock *newsmcsk = &new_smc->sk;

	if (lsmc->sk.sk_state == SMC_LISTEN) {
		lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
		smc_accept_enqueue(&lsmc->sk, newsmcsk);
		release_sock(&lsmc->sk);
	} else { /* no longer listening */
		smc_close_non_accepted(newsmcsk);
	}

	/* Wake up accept */
	lsmc->sk.sk_data_ready(&lsmc->sk);
	sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
}

/* listen worker: finish in state connected */
static void smc_listen_out_connected(struct smc_sock *new_smc)
{
	struct sock *newsmcsk = &new_smc->sk;

	sk_refcnt_debug_inc(newsmcsk);
	if (newsmcsk->sk_state == SMC_INIT)
		newsmcsk->sk_state = SMC_ACTIVE;

	smc_listen_out(new_smc);
}

/* listen worker: finish in error state */
static void smc_listen_out_err(struct smc_sock *new_smc)
{
	struct sock *newsmcsk = &new_smc->sk;
	struct net *net = sock_net(newsmcsk);

	this_cpu_inc(net->smc.smc_stats->srv_hshake_err_cnt);
	if (newsmcsk->sk_state == SMC_INIT)
		sock_put(&new_smc->sk); /* passive closing */
	newsmcsk->sk_state = SMC_CLOSED;

	smc_listen_out(new_smc);
}

/* listen worker: decline and fall back if possible
 *
 * A negative @reason_code is treated as a hard error without fallback.
 * Otherwise the socket is switched to fallback and, unless the reason is 0
 * or SMC_CLC_DECL_PEERDECL, a CLC decline is sent to the peer first.
 */
static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
			       int local_first, u8 version)
{
	/* RDMA setup failed, switch back to TCP */
	smc_conn_abort(new_smc, local_first);
	if (reason_code < 0) { /* error, no fallback possible */
		smc_listen_out_err(new_smc);
		return;
	}
	smc_switch_to_fallback(new_smc, reason_code);
	if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) {
		if (smc_clc_send_decline(new_smc, reason_code, version) < 0) {
			smc_listen_out_err(new_smc);
			return;
		}
	}
	smc_listen_out_connected(new_smc);
}

/* listen worker: version checking */
static int smc_listen_v2_check(struct smc_sock *new_smc,
			       struct smc_clc_msg_proposal *pclc,
			       struct smc_init_info *ini)
{
	struct smc_clc_smcd_v2_extension *pclc_smcd_v2_ext;
	struct smc_clc_v2_extension *pclc_v2_ext;
	int rc = SMC_CLC_DECL_PEERNOSMC;

	ini->smc_type_v1 = pclc->hdr.typev1;
	ini->smc_type_v2 = pclc->hdr.typev2;
	ini->smcd_version = smcd_indicated(ini->smc_type_v1) ? SMC_V1 : 0;
	ini->smcr_version = smcr_indicated(ini->smc_type_v1) ?
SMC_V1 : 0; 1522 if (pclc->hdr.version > SMC_V1) { 1523 if (smcd_indicated(ini->smc_type_v2)) 1524 ini->smcd_version |= SMC_V2; 1525 if (smcr_indicated(ini->smc_type_v2)) 1526 ini->smcr_version |= SMC_V2; 1527 } 1528 if (!(ini->smcd_version & SMC_V2) && !(ini->smcr_version & SMC_V2)) { 1529 rc = SMC_CLC_DECL_PEERNOSMC; 1530 goto out; 1531 } 1532 pclc_v2_ext = smc_get_clc_v2_ext(pclc); 1533 if (!pclc_v2_ext) { 1534 ini->smcd_version &= ~SMC_V2; 1535 ini->smcr_version &= ~SMC_V2; 1536 rc = SMC_CLC_DECL_NOV2EXT; 1537 goto out; 1538 } 1539 pclc_smcd_v2_ext = smc_get_clc_smcd_v2_ext(pclc_v2_ext); 1540 if (ini->smcd_version & SMC_V2) { 1541 if (!smc_ism_is_v2_capable()) { 1542 ini->smcd_version &= ~SMC_V2; 1543 rc = SMC_CLC_DECL_NOISM2SUPP; 1544 } else if (!pclc_smcd_v2_ext) { 1545 ini->smcd_version &= ~SMC_V2; 1546 rc = SMC_CLC_DECL_NOV2DEXT; 1547 } else if (!pclc_v2_ext->hdr.eid_cnt && 1548 !pclc_v2_ext->hdr.flag.seid) { 1549 ini->smcd_version &= ~SMC_V2; 1550 rc = SMC_CLC_DECL_NOUEID; 1551 } 1552 } 1553 if (ini->smcr_version & SMC_V2) { 1554 if (!pclc_v2_ext->hdr.eid_cnt) { 1555 ini->smcr_version &= ~SMC_V2; 1556 rc = SMC_CLC_DECL_NOUEID; 1557 } 1558 } 1559 1560 out: 1561 if (!ini->smcd_version && !ini->smcr_version) 1562 return rc; 1563 1564 return 0; 1565 } 1566 1567 /* listen worker: check prefixes */ 1568 static int smc_listen_prfx_check(struct smc_sock *new_smc, 1569 struct smc_clc_msg_proposal *pclc) 1570 { 1571 struct smc_clc_msg_proposal_prefix *pclc_prfx; 1572 struct socket *newclcsock = new_smc->clcsock; 1573 1574 if (pclc->hdr.typev1 == SMC_TYPE_N) 1575 return 0; 1576 pclc_prfx = smc_clc_proposal_get_prefix(pclc); 1577 if (smc_clc_prfx_match(newclcsock, pclc_prfx)) 1578 return SMC_CLC_DECL_DIFFPREFIX; 1579 1580 return 0; 1581 } 1582 1583 /* listen worker: initialize connection and buffers */ 1584 static int smc_listen_rdma_init(struct smc_sock *new_smc, 1585 struct smc_init_info *ini) 1586 { 1587 int rc; 1588 1589 /* allocate connection / link group */ 1590 
rc = smc_conn_create(new_smc, ini); 1591 if (rc) 1592 return rc; 1593 1594 /* create send buffer and rmb */ 1595 if (smc_buf_create(new_smc, false)) 1596 return SMC_CLC_DECL_MEM; 1597 1598 return 0; 1599 } 1600 1601 /* listen worker: initialize connection and buffers for SMC-D */ 1602 static int smc_listen_ism_init(struct smc_sock *new_smc, 1603 struct smc_init_info *ini) 1604 { 1605 int rc; 1606 1607 rc = smc_conn_create(new_smc, ini); 1608 if (rc) 1609 return rc; 1610 1611 /* Create send and receive buffers */ 1612 rc = smc_buf_create(new_smc, true); 1613 if (rc) { 1614 smc_conn_abort(new_smc, ini->first_contact_local); 1615 return (rc == -ENOSPC) ? SMC_CLC_DECL_MAX_DMB : 1616 SMC_CLC_DECL_MEM; 1617 } 1618 1619 return 0; 1620 } 1621 1622 static bool smc_is_already_selected(struct smcd_dev *smcd, 1623 struct smc_init_info *ini, 1624 int matches) 1625 { 1626 int i; 1627 1628 for (i = 0; i < matches; i++) 1629 if (smcd == ini->ism_dev[i]) 1630 return true; 1631 1632 return false; 1633 } 1634 1635 /* check for ISM devices matching proposed ISM devices */ 1636 static void smc_check_ism_v2_match(struct smc_init_info *ini, 1637 u16 proposed_chid, u64 proposed_gid, 1638 unsigned int *matches) 1639 { 1640 struct smcd_dev *smcd; 1641 1642 list_for_each_entry(smcd, &smcd_dev_list.list, list) { 1643 if (smcd->going_away) 1644 continue; 1645 if (smc_is_already_selected(smcd, ini, *matches)) 1646 continue; 1647 if (smc_ism_get_chid(smcd) == proposed_chid && 1648 !smc_ism_cantalk(proposed_gid, ISM_RESERVED_VLANID, smcd)) { 1649 ini->ism_peer_gid[*matches] = proposed_gid; 1650 ini->ism_dev[*matches] = smcd; 1651 (*matches)++; 1652 break; 1653 } 1654 } 1655 } 1656 1657 static void smc_find_ism_store_rc(u32 rc, struct smc_init_info *ini) 1658 { 1659 if (!ini->rc) 1660 ini->rc = rc; 1661 } 1662 1663 static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc, 1664 struct smc_clc_msg_proposal *pclc, 1665 struct smc_init_info *ini) 1666 { 1667 struct smc_clc_smcd_v2_extension 
*smcd_v2_ext; 1668 struct smc_clc_v2_extension *smc_v2_ext; 1669 struct smc_clc_msg_smcd *pclc_smcd; 1670 unsigned int matches = 0; 1671 u8 smcd_version; 1672 u8 *eid = NULL; 1673 int i, rc; 1674 1675 if (!(ini->smcd_version & SMC_V2) || !smcd_indicated(ini->smc_type_v2)) 1676 goto not_found; 1677 1678 pclc_smcd = smc_get_clc_msg_smcd(pclc); 1679 smc_v2_ext = smc_get_clc_v2_ext(pclc); 1680 smcd_v2_ext = smc_get_clc_smcd_v2_ext(smc_v2_ext); 1681 1682 mutex_lock(&smcd_dev_list.mutex); 1683 if (pclc_smcd->ism.chid) 1684 /* check for ISM device matching proposed native ISM device */ 1685 smc_check_ism_v2_match(ini, ntohs(pclc_smcd->ism.chid), 1686 ntohll(pclc_smcd->ism.gid), &matches); 1687 for (i = 1; i <= smc_v2_ext->hdr.ism_gid_cnt; i++) { 1688 /* check for ISM devices matching proposed non-native ISM 1689 * devices 1690 */ 1691 smc_check_ism_v2_match(ini, 1692 ntohs(smcd_v2_ext->gidchid[i - 1].chid), 1693 ntohll(smcd_v2_ext->gidchid[i - 1].gid), 1694 &matches); 1695 } 1696 mutex_unlock(&smcd_dev_list.mutex); 1697 1698 if (!ini->ism_dev[0]) { 1699 smc_find_ism_store_rc(SMC_CLC_DECL_NOSMCD2DEV, ini); 1700 goto not_found; 1701 } 1702 1703 smc_ism_get_system_eid(&eid); 1704 if (!smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext, 1705 smcd_v2_ext->system_eid, eid)) 1706 goto not_found; 1707 1708 /* separate - outside the smcd_dev_list.lock */ 1709 smcd_version = ini->smcd_version; 1710 for (i = 0; i < matches; i++) { 1711 ini->smcd_version = SMC_V2; 1712 ini->is_smcd = true; 1713 ini->ism_selected = i; 1714 rc = smc_listen_ism_init(new_smc, ini); 1715 if (rc) { 1716 smc_find_ism_store_rc(rc, ini); 1717 /* try next active ISM device */ 1718 continue; 1719 } 1720 return; /* matching and usable V2 ISM device found */ 1721 } 1722 /* no V2 ISM device could be initialized */ 1723 ini->smcd_version = smcd_version; /* restore original value */ 1724 ini->negotiated_eid[0] = 0; 1725 1726 not_found: 1727 ini->smcd_version &= ~SMC_V2; 1728 ini->ism_dev[0] = NULL; 1729 ini->is_smcd 
= false; 1730 } 1731 1732 static void smc_find_ism_v1_device_serv(struct smc_sock *new_smc, 1733 struct smc_clc_msg_proposal *pclc, 1734 struct smc_init_info *ini) 1735 { 1736 struct smc_clc_msg_smcd *pclc_smcd = smc_get_clc_msg_smcd(pclc); 1737 int rc = 0; 1738 1739 /* check if ISM V1 is available */ 1740 if (!(ini->smcd_version & SMC_V1) || !smcd_indicated(ini->smc_type_v1)) 1741 goto not_found; 1742 ini->is_smcd = true; /* prepare ISM check */ 1743 ini->ism_peer_gid[0] = ntohll(pclc_smcd->ism.gid); 1744 rc = smc_find_ism_device(new_smc, ini); 1745 if (rc) 1746 goto not_found; 1747 ini->ism_selected = 0; 1748 rc = smc_listen_ism_init(new_smc, ini); 1749 if (!rc) 1750 return; /* V1 ISM device found */ 1751 1752 not_found: 1753 smc_find_ism_store_rc(rc, ini); 1754 ini->smcd_version &= ~SMC_V1; 1755 ini->ism_dev[0] = NULL; 1756 ini->is_smcd = false; 1757 } 1758 1759 /* listen worker: register buffers */ 1760 static int smc_listen_rdma_reg(struct smc_sock *new_smc, bool local_first) 1761 { 1762 struct smc_connection *conn = &new_smc->conn; 1763 1764 if (!local_first) { 1765 if (smcr_lgr_reg_rmbs(conn->lnk, conn->rmb_desc)) 1766 return SMC_CLC_DECL_ERR_REGRMB; 1767 } 1768 smc_rmb_sync_sg_for_device(&new_smc->conn); 1769 1770 return 0; 1771 } 1772 1773 static void smc_find_rdma_v2_device_serv(struct smc_sock *new_smc, 1774 struct smc_clc_msg_proposal *pclc, 1775 struct smc_init_info *ini) 1776 { 1777 struct smc_clc_v2_extension *smc_v2_ext; 1778 u8 smcr_version; 1779 int rc; 1780 1781 if (!(ini->smcr_version & SMC_V2) || !smcr_indicated(ini->smc_type_v2)) 1782 goto not_found; 1783 1784 smc_v2_ext = smc_get_clc_v2_ext(pclc); 1785 if (!smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext, NULL, NULL)) 1786 goto not_found; 1787 1788 /* prepare RDMA check */ 1789 memcpy(ini->peer_systemid, pclc->lcl.id_for_peer, SMC_SYSTEMID_LEN); 1790 memcpy(ini->peer_gid, smc_v2_ext->roce, SMC_GID_SIZE); 1791 memcpy(ini->peer_mac, pclc->lcl.mac, ETH_ALEN); 1792 ini->check_smcrv2 = true; 
1793 ini->smcrv2.clc_sk = new_smc->clcsock->sk; 1794 ini->smcrv2.saddr = new_smc->clcsock->sk->sk_rcv_saddr; 1795 ini->smcrv2.daddr = smc_ib_gid_to_ipv4(smc_v2_ext->roce); 1796 rc = smc_find_rdma_device(new_smc, ini); 1797 if (rc) { 1798 smc_find_ism_store_rc(rc, ini); 1799 goto not_found; 1800 } 1801 if (!ini->smcrv2.uses_gateway) 1802 memcpy(ini->smcrv2.nexthop_mac, pclc->lcl.mac, ETH_ALEN); 1803 1804 smcr_version = ini->smcr_version; 1805 ini->smcr_version = SMC_V2; 1806 rc = smc_listen_rdma_init(new_smc, ini); 1807 if (!rc) 1808 rc = smc_listen_rdma_reg(new_smc, ini->first_contact_local); 1809 if (!rc) 1810 return; 1811 ini->smcr_version = smcr_version; 1812 smc_find_ism_store_rc(rc, ini); 1813 1814 not_found: 1815 ini->smcr_version &= ~SMC_V2; 1816 ini->check_smcrv2 = false; 1817 } 1818 1819 static int smc_find_rdma_v1_device_serv(struct smc_sock *new_smc, 1820 struct smc_clc_msg_proposal *pclc, 1821 struct smc_init_info *ini) 1822 { 1823 int rc; 1824 1825 if (!(ini->smcr_version & SMC_V1) || !smcr_indicated(ini->smc_type_v1)) 1826 return SMC_CLC_DECL_NOSMCDEV; 1827 1828 /* prepare RDMA check */ 1829 memcpy(ini->peer_systemid, pclc->lcl.id_for_peer, SMC_SYSTEMID_LEN); 1830 memcpy(ini->peer_gid, pclc->lcl.gid, SMC_GID_SIZE); 1831 memcpy(ini->peer_mac, pclc->lcl.mac, ETH_ALEN); 1832 rc = smc_find_rdma_device(new_smc, ini); 1833 if (rc) { 1834 /* no RDMA device found */ 1835 return SMC_CLC_DECL_NOSMCDEV; 1836 } 1837 rc = smc_listen_rdma_init(new_smc, ini); 1838 if (rc) 1839 return rc; 1840 return smc_listen_rdma_reg(new_smc, ini->first_contact_local); 1841 } 1842 1843 /* determine the local device matching to proposal */ 1844 static int smc_listen_find_device(struct smc_sock *new_smc, 1845 struct smc_clc_msg_proposal *pclc, 1846 struct smc_init_info *ini) 1847 { 1848 int prfx_rc; 1849 1850 /* check for ISM device matching V2 proposed device */ 1851 smc_find_ism_v2_device_serv(new_smc, pclc, ini); 1852 if (ini->ism_dev[0]) 1853 return 0; 1854 1855 /* check for 
matching IP prefix and subnet length (V1) */
	prfx_rc = smc_listen_prfx_check(new_smc, pclc);
	if (prfx_rc)
		smc_find_ism_store_rc(prfx_rc, ini);

	/* get vlan id from IP device */
	if (smc_vlan_by_tcpsk(new_smc->clcsock, ini))
		return ini->rc ?: SMC_CLC_DECL_GETVLANERR;

	/* check for ISM device matching V1 proposed device */
	if (!prfx_rc)
		smc_find_ism_v1_device_serv(new_smc, pclc, ini);
	if (ini->ism_dev[0])
		return 0;

	if (!smcr_indicated(pclc->hdr.typev1) &&
	    !smcr_indicated(pclc->hdr.typev2))
		/* skip RDMA and decline */
		return ini->rc ?: SMC_CLC_DECL_NOSMCDDEV;

	/* check if RDMA V2 is available */
	smc_find_rdma_v2_device_serv(new_smc, pclc, ini);
	if (ini->smcrv2.ib_dev_v2)
		return 0;

	/* check if RDMA V1 is available */
	if (!prfx_rc) {
		int rc;

		rc = smc_find_rdma_v1_device_serv(new_smc, pclc, ini);
		smc_find_ism_store_rc(rc, ini);
		/* report the first reason code recorded, not the last one */
		return (!rc) ? 0 : ini->rc;
	}
	return SMC_CLC_DECL_NOSMCDEV;
}

/* listen worker: finish RDMA setup
 *
 * Returns 0 on success or a SMC_CLC_DECL_* reason code. The LLC
 * CONFIRM LINK flow is only run when this is the first contact
 * (@local_first).
 */
static int smc_listen_rdma_finish(struct smc_sock *new_smc,
				  struct smc_clc_msg_accept_confirm *cclc,
				  bool local_first,
				  struct smc_init_info *ini)
{
	struct smc_link *link = new_smc->conn.lnk;
	int reason_code = 0;

	if (local_first)
		smc_link_save_peer_info(link, cclc, ini);

	if (smc_rmb_rtoken_handling(&new_smc->conn, link, cclc))
		return SMC_CLC_DECL_ERR_RTOK;

	if (local_first) {
		if (smc_ib_ready_link(link))
			return SMC_CLC_DECL_ERR_RDYLNK;
		/* QP confirmation over RoCE fabric */
		smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK);
		reason_code = smcr_serv_conf_first_link(new_smc);
		smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl);
	}
	return reason_code;
}

/* setup for connection of server
 *
 * Worker for new_smc->smc_listen_work: receives the CLC proposal, selects
 * a device, sends the CLC accept, waits for the CLC confirm and finally
 * hands the socket over via smc_listen_out_connected(). Any failure ends
 * in smc_listen_decline(). smc_server_lgr_pending is held from device
 * selection until the accept was sent (SMC-D) or the RDMA setup finished
 * (SMC-R).
 */
static void smc_listen_work(struct work_struct *work)
{
	struct smc_sock *new_smc = container_of(work, struct smc_sock,
						smc_listen_work);
	struct socket *newclcsock = new_smc->clcsock;
	struct smc_clc_msg_accept_confirm *cclc;
	struct smc_clc_msg_proposal_area *buf;
	struct smc_clc_msg_proposal *pclc;
	struct smc_init_info *ini = NULL;
	u8 proposal_version = SMC_V1;
	u8 accept_version;
	int rc = 0;

	if (new_smc->listen_smc->sk.sk_state != SMC_LISTEN)
		return smc_listen_out_err(new_smc);

	if (new_smc->use_fallback) {
		smc_listen_out_connected(new_smc);
		return;
	}

	/* check if peer is smc capable */
	if (!tcp_sk(newclcsock->sk)->syn_smc) {
		smc_switch_to_fallback(new_smc, SMC_CLC_DECL_PEERNOSMC);
		smc_listen_out_connected(new_smc);
		return;
	}

	/* do inband token exchange -
	 * wait for and receive SMC Proposal CLC message
	 */
	buf = kzalloc(sizeof(*buf), GFP_KERNEL);
	if (!buf) {
		rc = SMC_CLC_DECL_MEM;
		goto out_decl;
	}
	pclc = (struct smc_clc_msg_proposal *)buf;
	rc = smc_clc_wait_msg(new_smc, pclc, sizeof(*buf),
			      SMC_CLC_PROPOSAL, CLC_WAIT_TIME);
	if (rc)
		goto out_decl;

	/* a later decline is sent with the version of the proposal */
	if (pclc->hdr.version > SMC_V1)
		proposal_version = SMC_V2;

	/* IPSec connections opt out of SMC optimizations */
	if (using_ipsec(new_smc)) {
		rc = SMC_CLC_DECL_IPSEC;
		goto out_decl;
	}

	ini = kzalloc(sizeof(*ini), GFP_KERNEL);
	if (!ini) {
		rc = SMC_CLC_DECL_MEM;
		goto out_decl;
	}

	/* initial version checking */
	rc = smc_listen_v2_check(new_smc, pclc, ini);
	if (rc)
		goto out_decl;

	mutex_lock(&smc_server_lgr_pending);
	smc_close_init(new_smc);
	smc_rx_init(new_smc);
	smc_tx_init(new_smc);

	/* determine ISM or RoCE device used for connection */
	rc = smc_listen_find_device(new_smc, pclc, ini);
	if (rc)
		goto out_unlock;

	/* send SMC Accept CLC message */
	accept_version = ini->is_smcd ? ini->smcd_version : ini->smcr_version;
	rc = smc_clc_send_accept(new_smc, ini->first_contact_local,
				 accept_version, ini->negotiated_eid);
	if (rc)
		goto out_unlock;

	/* SMC-D does not need this lock any more */
	if (ini->is_smcd)
		mutex_unlock(&smc_server_lgr_pending);

	/* receive SMC Confirm CLC message */
	memset(buf, 0, sizeof(*buf));
	cclc = (struct smc_clc_msg_accept_confirm *)buf;
	rc = smc_clc_wait_msg(new_smc, cclc, sizeof(*buf),
			      SMC_CLC_CONFIRM, CLC_WAIT_TIME);
	if (rc) {
		/* the lock is still held for SMC-R only */
		if (!ini->is_smcd)
			goto out_unlock;
		goto out_decl;
	}

	/* finish worker */
	if (!ini->is_smcd) {
		rc = smc_listen_rdma_finish(new_smc, cclc,
					    ini->first_contact_local, ini);
		if (rc)
			goto out_unlock;
		mutex_unlock(&smc_server_lgr_pending);
	}
	smc_conn_save_peer_info(new_smc, cclc);
	smc_listen_out_connected(new_smc);
	SMC_STAT_SERV_SUCC_INC(sock_net(newclcsock->sk), ini);
	goto out_free;

out_unlock:
	mutex_unlock(&smc_server_lgr_pending);
out_decl:
	smc_listen_decline(new_smc, rc, ini ?
ini->first_contact_local : 0, 2029 proposal_version); 2030 out_free: 2031 kfree(ini); 2032 kfree(buf); 2033 } 2034 2035 static void smc_tcp_listen_work(struct work_struct *work) 2036 { 2037 struct smc_sock *lsmc = container_of(work, struct smc_sock, 2038 tcp_listen_work); 2039 struct sock *lsk = &lsmc->sk; 2040 struct smc_sock *new_smc; 2041 int rc = 0; 2042 2043 lock_sock(lsk); 2044 while (lsk->sk_state == SMC_LISTEN) { 2045 rc = smc_clcsock_accept(lsmc, &new_smc); 2046 if (rc) /* clcsock accept queue empty or error */ 2047 goto out; 2048 if (!new_smc) 2049 continue; 2050 2051 new_smc->listen_smc = lsmc; 2052 new_smc->use_fallback = lsmc->use_fallback; 2053 new_smc->fallback_rsn = lsmc->fallback_rsn; 2054 sock_hold(lsk); /* sock_put in smc_listen_work */ 2055 INIT_WORK(&new_smc->smc_listen_work, smc_listen_work); 2056 smc_copy_sock_settings_to_smc(new_smc); 2057 new_smc->sk.sk_sndbuf = lsmc->sk.sk_sndbuf; 2058 new_smc->sk.sk_rcvbuf = lsmc->sk.sk_rcvbuf; 2059 sock_hold(&new_smc->sk); /* sock_put in passive closing */ 2060 if (!queue_work(smc_hs_wq, &new_smc->smc_listen_work)) 2061 sock_put(&new_smc->sk); 2062 } 2063 2064 out: 2065 release_sock(lsk); 2066 sock_put(&lsmc->sk); /* sock_hold in smc_clcsock_data_ready() */ 2067 } 2068 2069 static void smc_clcsock_data_ready(struct sock *listen_clcsock) 2070 { 2071 struct smc_sock *lsmc; 2072 2073 lsmc = (struct smc_sock *) 2074 ((uintptr_t)listen_clcsock->sk_user_data & ~SK_USER_DATA_NOCOPY); 2075 if (!lsmc) 2076 return; 2077 lsmc->clcsk_data_ready(listen_clcsock); 2078 if (lsmc->sk.sk_state == SMC_LISTEN) { 2079 sock_hold(&lsmc->sk); /* sock_put in smc_tcp_listen_work() */ 2080 if (!queue_work(smc_hs_wq, &lsmc->tcp_listen_work)) 2081 sock_put(&lsmc->sk); 2082 } 2083 } 2084 2085 static int smc_listen(struct socket *sock, int backlog) 2086 { 2087 struct sock *sk = sock->sk; 2088 struct smc_sock *smc; 2089 int rc; 2090 2091 smc = smc_sk(sk); 2092 lock_sock(sk); 2093 2094 rc = -EINVAL; 2095 if ((sk->sk_state != SMC_INIT && 
sk->sk_state != SMC_LISTEN) || 2096 smc->connect_nonblock) 2097 goto out; 2098 2099 rc = 0; 2100 if (sk->sk_state == SMC_LISTEN) { 2101 sk->sk_max_ack_backlog = backlog; 2102 goto out; 2103 } 2104 /* some socket options are handled in core, so we could not apply 2105 * them to the clc socket -- copy smc socket options to clc socket 2106 */ 2107 smc_copy_sock_settings_to_clc(smc); 2108 if (!smc->use_fallback) 2109 tcp_sk(smc->clcsock->sk)->syn_smc = 1; 2110 2111 /* save original sk_data_ready function and establish 2112 * smc-specific sk_data_ready function 2113 */ 2114 smc->clcsk_data_ready = smc->clcsock->sk->sk_data_ready; 2115 smc->clcsock->sk->sk_data_ready = smc_clcsock_data_ready; 2116 smc->clcsock->sk->sk_user_data = 2117 (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); 2118 rc = kernel_listen(smc->clcsock, backlog); 2119 if (rc) 2120 goto out; 2121 sk->sk_max_ack_backlog = backlog; 2122 sk->sk_ack_backlog = 0; 2123 sk->sk_state = SMC_LISTEN; 2124 2125 out: 2126 release_sock(sk); 2127 return rc; 2128 } 2129 2130 static int smc_accept(struct socket *sock, struct socket *new_sock, 2131 int flags, bool kern) 2132 { 2133 struct sock *sk = sock->sk, *nsk; 2134 DECLARE_WAITQUEUE(wait, current); 2135 struct smc_sock *lsmc; 2136 long timeo; 2137 int rc = 0; 2138 2139 lsmc = smc_sk(sk); 2140 sock_hold(sk); /* sock_put below */ 2141 lock_sock(sk); 2142 2143 if (lsmc->sk.sk_state != SMC_LISTEN) { 2144 rc = -EINVAL; 2145 release_sock(sk); 2146 goto out; 2147 } 2148 2149 /* Wait for an incoming connection */ 2150 timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); 2151 add_wait_queue_exclusive(sk_sleep(sk), &wait); 2152 while (!(nsk = smc_accept_dequeue(sk, new_sock))) { 2153 set_current_state(TASK_INTERRUPTIBLE); 2154 if (!timeo) { 2155 rc = -EAGAIN; 2156 break; 2157 } 2158 release_sock(sk); 2159 timeo = schedule_timeout(timeo); 2160 /* wakeup by sk_data_ready in smc_listen_work() */ 2161 sched_annotate_sleep(); 2162 lock_sock(sk); 2163 if (signal_pending(current)) { 2164 rc 
= sock_intr_errno(timeo); 2165 break; 2166 } 2167 } 2168 set_current_state(TASK_RUNNING); 2169 remove_wait_queue(sk_sleep(sk), &wait); 2170 2171 if (!rc) 2172 rc = sock_error(nsk); 2173 release_sock(sk); 2174 if (rc) 2175 goto out; 2176 2177 if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) { 2178 /* wait till data arrives on the socket */ 2179 timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept * 2180 MSEC_PER_SEC); 2181 if (smc_sk(nsk)->use_fallback) { 2182 struct sock *clcsk = smc_sk(nsk)->clcsock->sk; 2183 2184 lock_sock(clcsk); 2185 if (skb_queue_empty(&clcsk->sk_receive_queue)) 2186 sk_wait_data(clcsk, &timeo, NULL); 2187 release_sock(clcsk); 2188 } else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) { 2189 lock_sock(nsk); 2190 smc_rx_wait(smc_sk(nsk), &timeo, smc_rx_data_available); 2191 release_sock(nsk); 2192 } 2193 } 2194 2195 out: 2196 sock_put(sk); /* sock_hold above */ 2197 return rc; 2198 } 2199 2200 static int smc_getname(struct socket *sock, struct sockaddr *addr, 2201 int peer) 2202 { 2203 struct smc_sock *smc; 2204 2205 if (peer && (sock->sk->sk_state != SMC_ACTIVE) && 2206 (sock->sk->sk_state != SMC_APPCLOSEWAIT1)) 2207 return -ENOTCONN; 2208 2209 smc = smc_sk(sock->sk); 2210 2211 return smc->clcsock->ops->getname(smc->clcsock, addr, peer); 2212 } 2213 2214 static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) 2215 { 2216 struct sock *sk = sock->sk; 2217 struct smc_sock *smc; 2218 int rc = -EPIPE; 2219 2220 smc = smc_sk(sk); 2221 lock_sock(sk); 2222 if ((sk->sk_state != SMC_ACTIVE) && 2223 (sk->sk_state != SMC_APPCLOSEWAIT1) && 2224 (sk->sk_state != SMC_INIT)) 2225 goto out; 2226 2227 if (msg->msg_flags & MSG_FASTOPEN) { 2228 if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) { 2229 smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP); 2230 } else { 2231 rc = -EINVAL; 2232 goto out; 2233 } 2234 } 2235 2236 if (smc->use_fallback) { 2237 rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len); 2238 } else { 2239 
rc = smc_tx_sendmsg(smc, msg, len); 2240 SMC_STAT_TX_PAYLOAD(smc, len, rc); 2241 } 2242 out: 2243 release_sock(sk); 2244 return rc; 2245 } 2246 2247 static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, 2248 int flags) 2249 { 2250 struct sock *sk = sock->sk; 2251 struct smc_sock *smc; 2252 int rc = -ENOTCONN; 2253 2254 smc = smc_sk(sk); 2255 lock_sock(sk); 2256 if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) { 2257 /* socket was connected before, no more data to read */ 2258 rc = 0; 2259 goto out; 2260 } 2261 if ((sk->sk_state == SMC_INIT) || 2262 (sk->sk_state == SMC_LISTEN) || 2263 (sk->sk_state == SMC_CLOSED)) 2264 goto out; 2265 2266 if (sk->sk_state == SMC_PEERFINCLOSEWAIT) { 2267 rc = 0; 2268 goto out; 2269 } 2270 2271 if (smc->use_fallback) { 2272 rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags); 2273 } else { 2274 msg->msg_namelen = 0; 2275 rc = smc_rx_recvmsg(smc, msg, NULL, len, flags); 2276 SMC_STAT_RX_PAYLOAD(smc, rc, rc); 2277 } 2278 2279 out: 2280 release_sock(sk); 2281 return rc; 2282 } 2283 2284 static __poll_t smc_accept_poll(struct sock *parent) 2285 { 2286 struct smc_sock *isk = smc_sk(parent); 2287 __poll_t mask = 0; 2288 2289 spin_lock(&isk->accept_q_lock); 2290 if (!list_empty(&isk->accept_q)) 2291 mask = EPOLLIN | EPOLLRDNORM; 2292 spin_unlock(&isk->accept_q_lock); 2293 2294 return mask; 2295 } 2296 2297 static __poll_t smc_poll(struct file *file, struct socket *sock, 2298 poll_table *wait) 2299 { 2300 struct sock *sk = sock->sk; 2301 struct smc_sock *smc; 2302 __poll_t mask = 0; 2303 2304 if (!sk) 2305 return EPOLLNVAL; 2306 2307 smc = smc_sk(sock->sk); 2308 if (smc->use_fallback) { 2309 /* delegate to CLC child sock */ 2310 mask = smc->clcsock->ops->poll(file, smc->clcsock, wait); 2311 sk->sk_err = smc->clcsock->sk->sk_err; 2312 } else { 2313 if (sk->sk_state != SMC_CLOSED) 2314 sock_poll_wait(file, sock, wait); 2315 if (sk->sk_err) 2316 mask |= EPOLLERR; 2317 if ((sk->sk_shutdown == 
SHUTDOWN_MASK) || 2318 (sk->sk_state == SMC_CLOSED)) 2319 mask |= EPOLLHUP; 2320 if (sk->sk_state == SMC_LISTEN) { 2321 /* woken up by sk_data_ready in smc_listen_work() */ 2322 mask |= smc_accept_poll(sk); 2323 } else if (smc->use_fallback) { /* as result of connect_work()*/ 2324 mask |= smc->clcsock->ops->poll(file, smc->clcsock, 2325 wait); 2326 sk->sk_err = smc->clcsock->sk->sk_err; 2327 } else { 2328 if ((sk->sk_state != SMC_INIT && 2329 atomic_read(&smc->conn.sndbuf_space)) || 2330 sk->sk_shutdown & SEND_SHUTDOWN) { 2331 mask |= EPOLLOUT | EPOLLWRNORM; 2332 } else { 2333 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2334 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2335 } 2336 if (atomic_read(&smc->conn.bytes_to_rcv)) 2337 mask |= EPOLLIN | EPOLLRDNORM; 2338 if (sk->sk_shutdown & RCV_SHUTDOWN) 2339 mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; 2340 if (sk->sk_state == SMC_APPCLOSEWAIT1) 2341 mask |= EPOLLIN; 2342 if (smc->conn.urg_state == SMC_URG_VALID) 2343 mask |= EPOLLPRI; 2344 } 2345 } 2346 2347 return mask; 2348 } 2349 2350 static int smc_shutdown(struct socket *sock, int how) 2351 { 2352 struct sock *sk = sock->sk; 2353 struct smc_sock *smc; 2354 int rc = -EINVAL; 2355 int rc1 = 0; 2356 2357 smc = smc_sk(sk); 2358 2359 if ((how < SHUT_RD) || (how > SHUT_RDWR)) 2360 return rc; 2361 2362 lock_sock(sk); 2363 2364 rc = -ENOTCONN; 2365 if ((sk->sk_state != SMC_ACTIVE) && 2366 (sk->sk_state != SMC_PEERCLOSEWAIT1) && 2367 (sk->sk_state != SMC_PEERCLOSEWAIT2) && 2368 (sk->sk_state != SMC_APPCLOSEWAIT1) && 2369 (sk->sk_state != SMC_APPCLOSEWAIT2) && 2370 (sk->sk_state != SMC_APPFINCLOSEWAIT)) 2371 goto out; 2372 if (smc->use_fallback) { 2373 rc = kernel_sock_shutdown(smc->clcsock, how); 2374 sk->sk_shutdown = smc->clcsock->sk->sk_shutdown; 2375 if (sk->sk_shutdown == SHUTDOWN_MASK) 2376 sk->sk_state = SMC_CLOSED; 2377 goto out; 2378 } 2379 switch (how) { 2380 case SHUT_RDWR: /* shutdown in both directions */ 2381 rc = smc_close_active(smc); 2382 break; 2383 case 
SHUT_WR: 2384 rc = smc_close_shutdown_write(smc); 2385 break; 2386 case SHUT_RD: 2387 rc = 0; 2388 /* nothing more to do because peer is not involved */ 2389 break; 2390 } 2391 if (smc->clcsock) 2392 rc1 = kernel_sock_shutdown(smc->clcsock, how); 2393 /* map sock_shutdown_cmd constants to sk_shutdown value range */ 2394 sk->sk_shutdown |= how + 1; 2395 2396 out: 2397 release_sock(sk); 2398 return rc ? rc : rc1; 2399 } 2400 2401 static int smc_setsockopt(struct socket *sock, int level, int optname, 2402 sockptr_t optval, unsigned int optlen) 2403 { 2404 struct sock *sk = sock->sk; 2405 struct smc_sock *smc; 2406 int val, rc; 2407 2408 if (level == SOL_TCP && optname == TCP_ULP) 2409 return -EOPNOTSUPP; 2410 2411 smc = smc_sk(sk); 2412 2413 /* generic setsockopts reaching us here always apply to the 2414 * CLC socket 2415 */ 2416 if (unlikely(!smc->clcsock->ops->setsockopt)) 2417 rc = -EOPNOTSUPP; 2418 else 2419 rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname, 2420 optval, optlen); 2421 if (smc->clcsock->sk->sk_err) { 2422 sk->sk_err = smc->clcsock->sk->sk_err; 2423 sk_error_report(sk); 2424 } 2425 2426 if (optlen < sizeof(int)) 2427 return -EINVAL; 2428 if (copy_from_sockptr(&val, optval, sizeof(int))) 2429 return -EFAULT; 2430 2431 lock_sock(sk); 2432 if (rc || smc->use_fallback) 2433 goto out; 2434 switch (optname) { 2435 case TCP_FASTOPEN: 2436 case TCP_FASTOPEN_CONNECT: 2437 case TCP_FASTOPEN_KEY: 2438 case TCP_FASTOPEN_NO_COOKIE: 2439 /* option not supported by SMC */ 2440 if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) { 2441 smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP); 2442 } else { 2443 rc = -EINVAL; 2444 } 2445 break; 2446 case TCP_NODELAY: 2447 if (sk->sk_state != SMC_INIT && 2448 sk->sk_state != SMC_LISTEN && 2449 sk->sk_state != SMC_CLOSED) { 2450 if (val) { 2451 SMC_STAT_INC(smc, ndly_cnt); 2452 mod_delayed_work(smc->conn.lgr->tx_wq, 2453 &smc->conn.tx_work, 0); 2454 } 2455 } 2456 break; 2457 case TCP_CORK: 2458 if 
(sk->sk_state != SMC_INIT && 2459 sk->sk_state != SMC_LISTEN && 2460 sk->sk_state != SMC_CLOSED) { 2461 if (!val) { 2462 SMC_STAT_INC(smc, cork_cnt); 2463 mod_delayed_work(smc->conn.lgr->tx_wq, 2464 &smc->conn.tx_work, 0); 2465 } 2466 } 2467 break; 2468 case TCP_DEFER_ACCEPT: 2469 smc->sockopt_defer_accept = val; 2470 break; 2471 default: 2472 break; 2473 } 2474 out: 2475 release_sock(sk); 2476 2477 return rc; 2478 } 2479 2480 static int smc_getsockopt(struct socket *sock, int level, int optname, 2481 char __user *optval, int __user *optlen) 2482 { 2483 struct smc_sock *smc; 2484 2485 smc = smc_sk(sock->sk); 2486 /* socket options apply to the CLC socket */ 2487 if (unlikely(!smc->clcsock->ops->getsockopt)) 2488 return -EOPNOTSUPP; 2489 return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname, 2490 optval, optlen); 2491 } 2492 2493 static int smc_ioctl(struct socket *sock, unsigned int cmd, 2494 unsigned long arg) 2495 { 2496 union smc_host_cursor cons, urg; 2497 struct smc_connection *conn; 2498 struct smc_sock *smc; 2499 int answ; 2500 2501 smc = smc_sk(sock->sk); 2502 conn = &smc->conn; 2503 lock_sock(&smc->sk); 2504 if (smc->use_fallback) { 2505 if (!smc->clcsock) { 2506 release_sock(&smc->sk); 2507 return -EBADF; 2508 } 2509 answ = smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg); 2510 release_sock(&smc->sk); 2511 return answ; 2512 } 2513 switch (cmd) { 2514 case SIOCINQ: /* same as FIONREAD */ 2515 if (smc->sk.sk_state == SMC_LISTEN) { 2516 release_sock(&smc->sk); 2517 return -EINVAL; 2518 } 2519 if (smc->sk.sk_state == SMC_INIT || 2520 smc->sk.sk_state == SMC_CLOSED) 2521 answ = 0; 2522 else 2523 answ = atomic_read(&smc->conn.bytes_to_rcv); 2524 break; 2525 case SIOCOUTQ: 2526 /* output queue size (not send + not acked) */ 2527 if (smc->sk.sk_state == SMC_LISTEN) { 2528 release_sock(&smc->sk); 2529 return -EINVAL; 2530 } 2531 if (smc->sk.sk_state == SMC_INIT || 2532 smc->sk.sk_state == SMC_CLOSED) 2533 answ = 0; 2534 else 2535 answ = 
smc->conn.sndbuf_desc->len - 2536 atomic_read(&smc->conn.sndbuf_space); 2537 break; 2538 case SIOCOUTQNSD: 2539 /* output queue size (not send only) */ 2540 if (smc->sk.sk_state == SMC_LISTEN) { 2541 release_sock(&smc->sk); 2542 return -EINVAL; 2543 } 2544 if (smc->sk.sk_state == SMC_INIT || 2545 smc->sk.sk_state == SMC_CLOSED) 2546 answ = 0; 2547 else 2548 answ = smc_tx_prepared_sends(&smc->conn); 2549 break; 2550 case SIOCATMARK: 2551 if (smc->sk.sk_state == SMC_LISTEN) { 2552 release_sock(&smc->sk); 2553 return -EINVAL; 2554 } 2555 if (smc->sk.sk_state == SMC_INIT || 2556 smc->sk.sk_state == SMC_CLOSED) { 2557 answ = 0; 2558 } else { 2559 smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn); 2560 smc_curs_copy(&urg, &conn->urg_curs, conn); 2561 answ = smc_curs_diff(conn->rmb_desc->len, 2562 &cons, &urg) == 1; 2563 } 2564 break; 2565 default: 2566 release_sock(&smc->sk); 2567 return -ENOIOCTLCMD; 2568 } 2569 release_sock(&smc->sk); 2570 2571 return put_user(answ, (int __user *)arg); 2572 } 2573 2574 static ssize_t smc_sendpage(struct socket *sock, struct page *page, 2575 int offset, size_t size, int flags) 2576 { 2577 struct sock *sk = sock->sk; 2578 struct smc_sock *smc; 2579 int rc = -EPIPE; 2580 2581 smc = smc_sk(sk); 2582 lock_sock(sk); 2583 if (sk->sk_state != SMC_ACTIVE) { 2584 release_sock(sk); 2585 goto out; 2586 } 2587 release_sock(sk); 2588 if (smc->use_fallback) { 2589 rc = kernel_sendpage(smc->clcsock, page, offset, 2590 size, flags); 2591 } else { 2592 SMC_STAT_INC(smc, sendpage_cnt); 2593 rc = sock_no_sendpage(sock, page, offset, size, flags); 2594 } 2595 2596 out: 2597 return rc; 2598 } 2599 2600 /* Map the affected portions of the rmbe into an spd, note the number of bytes 2601 * to splice in conn->splice_pending, and press 'go'. Delays consumer cursor 2602 * updates till whenever a respective page has been fully processed. 2603 * Note that subsequent recv() calls have to wait till all splice() processing 2604 * completed. 
2605 */ 2606 static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos, 2607 struct pipe_inode_info *pipe, size_t len, 2608 unsigned int flags) 2609 { 2610 struct sock *sk = sock->sk; 2611 struct smc_sock *smc; 2612 int rc = -ENOTCONN; 2613 2614 smc = smc_sk(sk); 2615 lock_sock(sk); 2616 if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) { 2617 /* socket was connected before, no more data to read */ 2618 rc = 0; 2619 goto out; 2620 } 2621 if (sk->sk_state == SMC_INIT || 2622 sk->sk_state == SMC_LISTEN || 2623 sk->sk_state == SMC_CLOSED) 2624 goto out; 2625 2626 if (sk->sk_state == SMC_PEERFINCLOSEWAIT) { 2627 rc = 0; 2628 goto out; 2629 } 2630 2631 if (smc->use_fallback) { 2632 rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos, 2633 pipe, len, flags); 2634 } else { 2635 if (*ppos) { 2636 rc = -ESPIPE; 2637 goto out; 2638 } 2639 if (flags & SPLICE_F_NONBLOCK) 2640 flags = MSG_DONTWAIT; 2641 else 2642 flags = 0; 2643 SMC_STAT_INC(smc, splice_cnt); 2644 rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags); 2645 } 2646 out: 2647 release_sock(sk); 2648 2649 return rc; 2650 } 2651 2652 /* must look like tcp */ 2653 static const struct proto_ops smc_sock_ops = { 2654 .family = PF_SMC, 2655 .owner = THIS_MODULE, 2656 .release = smc_release, 2657 .bind = smc_bind, 2658 .connect = smc_connect, 2659 .socketpair = sock_no_socketpair, 2660 .accept = smc_accept, 2661 .getname = smc_getname, 2662 .poll = smc_poll, 2663 .ioctl = smc_ioctl, 2664 .listen = smc_listen, 2665 .shutdown = smc_shutdown, 2666 .setsockopt = smc_setsockopt, 2667 .getsockopt = smc_getsockopt, 2668 .sendmsg = smc_sendmsg, 2669 .recvmsg = smc_recvmsg, 2670 .mmap = sock_no_mmap, 2671 .sendpage = smc_sendpage, 2672 .splice_read = smc_splice_read, 2673 }; 2674 2675 static int smc_create(struct net *net, struct socket *sock, int protocol, 2676 int kern) 2677 { 2678 int family = (protocol == SMCPROTO_SMC6) ? 
PF_INET6 : PF_INET; 2679 struct smc_sock *smc; 2680 struct sock *sk; 2681 int rc; 2682 2683 rc = -ESOCKTNOSUPPORT; 2684 if (sock->type != SOCK_STREAM) 2685 goto out; 2686 2687 rc = -EPROTONOSUPPORT; 2688 if (protocol != SMCPROTO_SMC && protocol != SMCPROTO_SMC6) 2689 goto out; 2690 2691 rc = -ENOBUFS; 2692 sock->ops = &smc_sock_ops; 2693 sk = smc_sock_alloc(net, sock, protocol); 2694 if (!sk) 2695 goto out; 2696 2697 /* create internal TCP socket for CLC handshake and fallback */ 2698 smc = smc_sk(sk); 2699 smc->use_fallback = false; /* assume rdma capability first */ 2700 smc->fallback_rsn = 0; 2701 rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, 2702 &smc->clcsock); 2703 if (rc) { 2704 sk_common_release(sk); 2705 goto out; 2706 } 2707 smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE); 2708 smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE); 2709 2710 out: 2711 return rc; 2712 } 2713 2714 static const struct net_proto_family smc_sock_family_ops = { 2715 .family = PF_SMC, 2716 .owner = THIS_MODULE, 2717 .create = smc_create, 2718 }; 2719 2720 unsigned int smc_net_id; 2721 2722 static __net_init int smc_net_init(struct net *net) 2723 { 2724 return smc_pnet_net_init(net); 2725 } 2726 2727 static void __net_exit smc_net_exit(struct net *net) 2728 { 2729 smc_pnet_net_exit(net); 2730 } 2731 2732 static __net_init int smc_net_stat_init(struct net *net) 2733 { 2734 return smc_stats_init(net); 2735 } 2736 2737 static void __net_exit smc_net_stat_exit(struct net *net) 2738 { 2739 smc_stats_exit(net); 2740 } 2741 2742 static struct pernet_operations smc_net_ops = { 2743 .init = smc_net_init, 2744 .exit = smc_net_exit, 2745 .id = &smc_net_id, 2746 .size = sizeof(struct smc_net), 2747 }; 2748 2749 static struct pernet_operations smc_net_stat_ops = { 2750 .init = smc_net_stat_init, 2751 .exit = smc_net_stat_exit, 2752 }; 2753 2754 static int __init smc_init(void) 2755 { 2756 int rc; 2757 2758 rc = 
register_pernet_subsys(&smc_net_ops); 2759 if (rc) 2760 return rc; 2761 2762 rc = register_pernet_subsys(&smc_net_stat_ops); 2763 if (rc) 2764 return rc; 2765 2766 smc_ism_init(); 2767 smc_clc_init(); 2768 2769 rc = smc_nl_init(); 2770 if (rc) 2771 goto out_pernet_subsys; 2772 2773 rc = smc_pnet_init(); 2774 if (rc) 2775 goto out_nl; 2776 2777 rc = -ENOMEM; 2778 smc_hs_wq = alloc_workqueue("smc_hs_wq", 0, 0); 2779 if (!smc_hs_wq) 2780 goto out_pnet; 2781 2782 smc_close_wq = alloc_workqueue("smc_close_wq", 0, 0); 2783 if (!smc_close_wq) 2784 goto out_alloc_hs_wq; 2785 2786 rc = smc_core_init(); 2787 if (rc) { 2788 pr_err("%s: smc_core_init fails with %d\n", __func__, rc); 2789 goto out_alloc_wqs; 2790 } 2791 2792 rc = smc_llc_init(); 2793 if (rc) { 2794 pr_err("%s: smc_llc_init fails with %d\n", __func__, rc); 2795 goto out_core; 2796 } 2797 2798 rc = smc_cdc_init(); 2799 if (rc) { 2800 pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc); 2801 goto out_core; 2802 } 2803 2804 rc = proto_register(&smc_proto, 1); 2805 if (rc) { 2806 pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc); 2807 goto out_core; 2808 } 2809 2810 rc = proto_register(&smc_proto6, 1); 2811 if (rc) { 2812 pr_err("%s: proto_register(v6) fails with %d\n", __func__, rc); 2813 goto out_proto; 2814 } 2815 2816 rc = sock_register(&smc_sock_family_ops); 2817 if (rc) { 2818 pr_err("%s: sock_register fails with %d\n", __func__, rc); 2819 goto out_proto6; 2820 } 2821 INIT_HLIST_HEAD(&smc_v4_hashinfo.ht); 2822 INIT_HLIST_HEAD(&smc_v6_hashinfo.ht); 2823 2824 rc = smc_ib_register_client(); 2825 if (rc) { 2826 pr_err("%s: ib_register fails with %d\n", __func__, rc); 2827 goto out_sock; 2828 } 2829 2830 static_branch_enable(&tcp_have_smc); 2831 return 0; 2832 2833 out_sock: 2834 sock_unregister(PF_SMC); 2835 out_proto6: 2836 proto_unregister(&smc_proto6); 2837 out_proto: 2838 proto_unregister(&smc_proto); 2839 out_core: 2840 smc_core_exit(); 2841 out_alloc_wqs: 2842 
destroy_workqueue(smc_close_wq); 2843 out_alloc_hs_wq: 2844 destroy_workqueue(smc_hs_wq); 2845 out_pnet: 2846 smc_pnet_exit(); 2847 out_nl: 2848 smc_nl_exit(); 2849 out_pernet_subsys: 2850 unregister_pernet_subsys(&smc_net_ops); 2851 2852 return rc; 2853 } 2854 2855 static void __exit smc_exit(void) 2856 { 2857 static_branch_disable(&tcp_have_smc); 2858 sock_unregister(PF_SMC); 2859 smc_core_exit(); 2860 smc_ib_unregister_client(); 2861 destroy_workqueue(smc_close_wq); 2862 destroy_workqueue(smc_hs_wq); 2863 proto_unregister(&smc_proto6); 2864 proto_unregister(&smc_proto); 2865 smc_pnet_exit(); 2866 smc_nl_exit(); 2867 smc_clc_exit(); 2868 unregister_pernet_subsys(&smc_net_stat_ops); 2869 unregister_pernet_subsys(&smc_net_ops); 2870 rcu_barrier(); 2871 } 2872 2873 module_init(smc_init); 2874 module_exit(smc_exit); 2875 2876 MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>"); 2877 MODULE_DESCRIPTION("smc socket address family"); 2878 MODULE_LICENSE("GPL"); 2879 MODULE_ALIAS_NETPROTO(PF_SMC); 2880