/*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 * AF_SMC protocol family socket handler keeping the AF_INET sock address type
 * applies to SOCK_STREAM sockets only
 * offers an alternative communication option for TCP-protocol sockets
 * applicable with RoCE-cards only
 *
 * Initial restrictions:
 *   - support for alternate links postponed
 *   - partial support for non-blocking sockets only
 *   - support for urgent data postponed
 *
 * Copyright IBM Corp. 2016, 2018
 *
 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
 *            based on prototype from Frank Blaschka
 */

#define KMSG_COMPONENT "smc"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/module.h>
#include <linux/socket.h>
#include <linux/workqueue.h>
#include <linux/in.h>
#include <linux/sched/signal.h>

#include <net/sock.h>
#include <net/tcp.h>
#include <net/smc.h>
#include <asm/ioctls.h>

#include "smc.h"
#include "smc_clc.h"
#include "smc_llc.h"
#include "smc_cdc.h"
#include "smc_core.h"
#include "smc_ib.h"
#include "smc_pnet.h"
#include "smc_tx.h"
#include "smc_rx.h"
#include "smc_close.h"

static DEFINE_MUTEX(smc_create_lgr_pending);	/* serialize link group
						 * creation
						 */

struct smc_lgr_list smc_lgr_list = {	/* established link groups */
	.lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
	.list = LIST_HEAD_INIT(smc_lgr_list.list),
};

static void smc_tcp_listen_work(struct work_struct *);

static void smc_set_keepalive(struct sock *sk, int val)
{
	struct smc_sock *smc = smc_sk(sk);

	smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
}

static struct smc_hashinfo smc_v4_hashinfo = {
	.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
};

static struct smc_hashinfo smc_v6_hashinfo = {
	.lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock),
};

int smc_hash_sk(struct sock *sk)
{
	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
	struct hlist_head *head;

	head = &h->ht;

	write_lock_bh(&h->lock);
	sk_add_node(sk, head);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	write_unlock_bh(&h->lock);

	return 0;
}
EXPORT_SYMBOL_GPL(smc_hash_sk);

void smc_unhash_sk(struct sock *sk)
{
	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;

	write_lock_bh(&h->lock);
	if (sk_del_node_init(sk))
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	write_unlock_bh(&h->lock);
}
EXPORT_SYMBOL_GPL(smc_unhash_sk);

struct proto smc_proto = {
	.name		= "SMC",
	.owner		= THIS_MODULE,
	.keepalive	= smc_set_keepalive,
	.hash		= smc_hash_sk,
	.unhash		= smc_unhash_sk,
	.obj_size	= sizeof(struct smc_sock),
	.h.smc_hash	= &smc_v4_hashinfo,
	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
};
EXPORT_SYMBOL_GPL(smc_proto);

struct proto smc_proto6 = {
	.name		= "SMC6",
	.owner		= THIS_MODULE,
	.keepalive	= smc_set_keepalive,
	.hash		= smc_hash_sk,
	.unhash		= smc_unhash_sk,
	.obj_size	= sizeof(struct smc_sock),
	.h.smc_hash	= &smc_v6_hashinfo,
	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
};
EXPORT_SYMBOL_GPL(smc_proto6);

static int smc_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = 0;

	if (!sk)
		goto out;

	smc = smc_sk(sk);
	if (sk->sk_state == SMC_LISTEN)
		/* smc_close_non_accepted() is called and acquires
		 * sock lock for child sockets again
		 */
		lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
	else
		lock_sock(sk);

	if (!smc->use_fallback) {
		rc = smc_close_active(smc);
		sock_set_flag(sk, SOCK_DEAD);
		sk->sk_shutdown |= SHUTDOWN_MASK;
	}
	if (smc->clcsock) {
		sock_release(smc->clcsock);
		smc->clcsock = NULL;
	}
	if (smc->use_fallback) {
		sock_put(sk); /* passive closing */
		sk->sk_state = SMC_CLOSED;
		sk->sk_state_change(sk);
	}

	/* detach socket */
	sock_orphan(sk);
	sock->sk = NULL;
	if (!smc->use_fallback && sk->sk_state == SMC_CLOSED)
		smc_conn_free(&smc->conn);
	release_sock(sk);

	sk->sk_prot->unhash(sk);
	sock_put(sk); /* final sock_put */
out:
	return rc;
}

static void smc_destruct(struct sock *sk)
{
	if (sk->sk_state != SMC_CLOSED)
		return;
	if (!sock_flag(sk, SOCK_DEAD))
		return;

	sk_refcnt_debug_dec(sk);
}

static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
				   int protocol)
{
	struct smc_sock *smc;
	struct proto *prot;
	struct sock *sk;

	prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto;
	sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0);
	if (!sk)
		return NULL;

	sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
	sk->sk_state = SMC_INIT;
	sk->sk_destruct = smc_destruct;
	sk->sk_protocol = protocol;
	smc = smc_sk(sk);
	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
	INIT_LIST_HEAD(&smc->accept_q);
	spin_lock_init(&smc->accept_q_lock);
	sk->sk_prot->hash(sk);
	sk_refcnt_debug_inc(sk);

	return sk;
}

static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
		    int addr_len)
{
	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sk);

	/* replicate tests from inet_bind(), to be safe wrt. future changes */
	rc = -EINVAL;
	if (addr_len < sizeof(struct sockaddr_in))
		goto out;

	rc = -EAFNOSUPPORT;
	if (addr->sin_family != AF_INET &&
	    addr->sin_family != AF_INET6 &&
	    addr->sin_family != AF_UNSPEC)
		goto out;
	/* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
	if (addr->sin_family == AF_UNSPEC &&
	    addr->sin_addr.s_addr != htonl(INADDR_ANY))
		goto out;

	lock_sock(sk);

	/* Check if socket is already active */
	rc = -EINVAL;
	if (sk->sk_state != SMC_INIT)
		goto out_rel;

	smc->clcsock->sk->sk_reuse = sk->sk_reuse;
	rc = kernel_bind(smc->clcsock, uaddr, addr_len);

out_rel:
	release_sock(sk);
out:
	return rc;
}

static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
				   unsigned long mask)
{
	/* options we don't get control of via setsockopt */
	nsk->sk_type = osk->sk_type;
	nsk->sk_sndbuf = osk->sk_sndbuf;
	nsk->sk_rcvbuf = osk->sk_rcvbuf;
	nsk->sk_sndtimeo = osk->sk_sndtimeo;
	nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
	nsk->sk_mark = osk->sk_mark;
	nsk->sk_priority = osk->sk_priority;
	nsk->sk_rcvlowat = osk->sk_rcvlowat;
	nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
	nsk->sk_err = osk->sk_err;

	nsk->sk_flags &= ~mask;
	nsk->sk_flags |= osk->sk_flags & mask;
}

#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_BROADCAST) | \
			     (1UL << SOCK_TIMESTAMP) | \
			     (1UL << SOCK_DBG) | \
			     (1UL << SOCK_RCVTSTAMP) | \
			     (1UL << SOCK_RCVTSTAMPNS) | \
			     (1UL << SOCK_LOCALROUTE) | \
			     (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
			     (1UL << SOCK_RXQ_OVFL) | \
			     (1UL << SOCK_WIFI_STATUS) | \
			     (1UL << SOCK_NOFCS) | \
			     (1UL << SOCK_FILTER_LOCKED))
/* copy only relevant settings and flags of SOL_SOCKET level from smc to
 * clc socket (since smc is not called for these options from net/core)
 */
static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
{
	smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
}

#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_DBG))
/* copy only settings and flags relevant for smc from clc to smc socket */
static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
{
	smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
}

/* register a new rmb, optionally send confirm_rkey msg to register with peer */
static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc,
		       bool conf_rkey)
{
	/* register memory region for new rmb */
	if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) {
		rmb_desc->regerr = 1;
		return -EFAULT;
	}
	if (!conf_rkey)
		return 0;
	/* exchange confirm_rkey msg with peer */
	if (smc_llc_do_confirm_rkey(link, rmb_desc)) {
		rmb_desc->regerr = 1;
		return -EFAULT;
	}
	return 0;
}

static int smc_clnt_conf_first_link(struct smc_sock *smc)
{
	struct net *net = sock_net(smc->clcsock->sk);
	struct smc_link_group *lgr = smc->conn.lgr;
	struct smc_link *link;
	int rest;
	int rc;

	link = &lgr->lnk[SMC_SINGLE_LINK];
	/* receive CONFIRM LINK request from server over RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(
		&link->llc_confirm,
		SMC_LLC_WAIT_FIRST_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	if (link->llc_confirm_rc)
		return SMC_CLC_DECL_RMBE_EC;

	rc = smc_ib_modify_qp_rts(link);
	if (rc)
		return SMC_CLC_DECL_INTERR;

	smc_wr_remember_qp_attr(link);

	if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
		return SMC_CLC_DECL_INTERR;

	/* send CONFIRM LINK response over RoCE fabric */
	rc = smc_llc_send_confirm_link(link,
				       link->smcibdev->mac[link->ibport - 1],
				       &link->smcibdev->gid[link->ibport - 1],
				       SMC_LLC_RESP);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	/* receive ADD LINK request from server over RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(&link->llc_add,
							 SMC_LLC_WAIT_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	/* send add link reject message, only one link supported for now */
	rc = smc_llc_send_add_link(link,
				   link->smcibdev->mac[link->ibport - 1],
				   &link->smcibdev->gid[link->ibport - 1],
				   SMC_LLC_RESP);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);

	return 0;
}

static void smc_conn_save_peer_info(struct smc_sock *smc,
				    struct smc_clc_msg_accept_confirm *clc)
{
	smc->conn.peer_conn_idx = clc->conn_idx;
	smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
	smc->conn.peer_rmbe_size = smc_uncompress_bufsize(clc->rmbe_size);
	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
}

static void smc_link_save_peer_info(struct smc_link *link,
				    struct smc_clc_msg_accept_confirm *clc)
{
	link->peer_qpn = ntoh24(clc->qpn);
	memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
	memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
	link->peer_psn = ntoh24(clc->psn);
	link->peer_mtu = clc->qp_mtu;
}

/* setup for RDMA connection of client */
static int smc_connect_rdma(struct smc_sock *smc)
{
	struct smc_clc_msg_accept_confirm aclc;
	int local_contact = SMC_FIRST_CONTACT;
	struct smc_ib_device *smcibdev;
	struct smc_link *link;
	u8 srv_first_contact;
	int reason_code = 0;
	int rc = 0;
	u8 ibport;

	sock_hold(&smc->sk); /* sock_put in passive closing */

	if (smc->use_fallback)
		goto out_connected;

	if (!tcp_sk(smc->clcsock->sk)->syn_smc) {
		/* peer has not signalled SMC-capability */
		smc->use_fallback = true;
		goto out_connected;
	}

	/* IPSec connections opt out of SMC-R optimizations */
	if (using_ipsec(smc)) {
		reason_code = SMC_CLC_DECL_IPSEC;
		goto decline_rdma;
	}

	/* PNET table look up: search active ib_device and port
	 * within same PNETID that also contains the ethernet device
	 * used for the internal TCP socket
	 */
	smc_pnet_find_roce_resource(smc->clcsock->sk, &smcibdev, &ibport);
	if (!smcibdev) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	/* do inband token exchange */
	reason_code = smc_clc_send_proposal(smc, smcibdev, ibport);
	if (reason_code < 0) {
		rc = reason_code;
		goto out_err;
	}
	if (reason_code > 0) /* configuration error */
		goto decline_rdma;
	/* receive SMC Accept CLC message */
	reason_code = smc_clc_wait_msg(smc, &aclc, sizeof(aclc),
				       SMC_CLC_ACCEPT);
	if (reason_code < 0) {
		rc = reason_code;
		goto out_err;
	}
	if (reason_code > 0)
		goto decline_rdma;

	srv_first_contact = aclc.hdr.flag;
	mutex_lock(&smc_create_lgr_pending);
	local_contact = smc_conn_create(smc, smcibdev, ibport, &aclc.lcl,
					srv_first_contact);
	if (local_contact < 0) {
		rc = local_contact;
		if (rc == -ENOMEM)
			reason_code = SMC_CLC_DECL_MEM; /* insufficient memory */
		else if (rc == -ENOLINK)
			reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
		else
			reason_code = SMC_CLC_DECL_INTERR; /* other error */
		goto decline_rdma_unlock;
	}
	link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];

	smc_conn_save_peer_info(smc, &aclc);

	/* create send buffer and rmb */
	rc = smc_buf_create(smc);
	if (rc) {
		reason_code = SMC_CLC_DECL_MEM;
		goto decline_rdma_unlock;
	}

	if (local_contact == SMC_FIRST_CONTACT)
		smc_link_save_peer_info(link, &aclc);

	rc = smc_rmb_rtoken_handling(&smc->conn, &aclc);
	if (rc) {
		reason_code = SMC_CLC_DECL_INTERR;
		goto decline_rdma_unlock;
	}

	smc_close_init(smc);
	smc_rx_init(smc);

	if (local_contact == SMC_FIRST_CONTACT) {
		rc = smc_ib_ready_link(link);
		if (rc) {
			reason_code = SMC_CLC_DECL_INTERR;
			goto decline_rdma_unlock;
		}
	} else {
		if (!smc->conn.rmb_desc->reused) {
			if (smc_reg_rmb(link, smc->conn.rmb_desc, true)) {
				reason_code = SMC_CLC_DECL_INTERR;
				goto decline_rdma_unlock;
			}
		}
	}
	smc_rmb_sync_sg_for_device(&smc->conn);

	rc = smc_clc_send_confirm(smc);
	if (rc)
		goto out_err_unlock;

	if (local_contact == SMC_FIRST_CONTACT) {
		/* QP confirmation over RoCE fabric */
		reason_code = smc_clnt_conf_first_link(smc);
		if (reason_code < 0) {
			rc = reason_code;
			goto out_err_unlock;
		}
		if (reason_code > 0)
			goto decline_rdma_unlock;
	}

	mutex_unlock(&smc_create_lgr_pending);
	smc_tx_init(smc);

out_connected:
	smc_copy_sock_settings_to_clc(smc);
	if (smc->sk.sk_state == SMC_INIT)
		smc->sk.sk_state = SMC_ACTIVE;

	return rc ? rc : local_contact;

decline_rdma_unlock:
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
	smc_conn_free(&smc->conn);
decline_rdma:
	/* RDMA setup failed, switch back to TCP */
	smc->use_fallback = true;
	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
		rc = smc_clc_send_decline(smc, reason_code);
		if (rc < 0)
			goto out_err;
	}
	goto out_connected;

out_err_unlock:
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
	smc_conn_free(&smc->conn);
out_err:
	if (smc->sk.sk_state == SMC_INIT)
		sock_put(&smc->sk); /* passive closing */
	return rc;
}

static int smc_connect(struct socket *sock, struct sockaddr *addr,
		       int alen, int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EINVAL;

	smc = smc_sk(sk);

	/* separate smc parameter checking to be safe */
	if (alen < sizeof(addr->sa_family))
		goto out_err;
	if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
		goto out_err;

	lock_sock(sk);
	switch (sk->sk_state) {
	default:
		goto out;
	case SMC_ACTIVE:
		rc = -EISCONN;
		goto out;
	case SMC_INIT:
		rc = 0;
		break;
	}

	smc_copy_sock_settings_to_clc(smc);
	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
	rc = kernel_connect(smc->clcsock, addr, alen, flags);
	if (rc)
		goto out;

	/* setup RDMA connection */
	rc = smc_connect_rdma(smc);
	if (rc < 0)
		goto out;
	else
		rc = 0; /* success cases including fallback */

out:
	release_sock(sk);
out_err:
	return rc;
}

static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
{
	struct socket *new_clcsock = NULL;
	struct sock *lsk = &lsmc->sk;
	struct sock *new_sk;
	int rc;

	release_sock(lsk);
	new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol);
	if (!new_sk) {
		rc = -ENOMEM;
		lsk->sk_err = ENOMEM;
		*new_smc = NULL;
		lock_sock(lsk);
		goto out;
	}
	*new_smc = smc_sk(new_sk);

	rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
	lock_sock(lsk);
	if (rc < 0)
		lsk->sk_err = -rc;
	if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
		if (new_clcsock)
			sock_release(new_clcsock);
		new_sk->sk_state = SMC_CLOSED;
		sock_set_flag(new_sk, SOCK_DEAD);
		new_sk->sk_prot->unhash(new_sk);
		sock_put(new_sk); /* final */
		*new_smc = NULL;
		goto out;
	}

	(*new_smc)->clcsock = new_clcsock;
out:
	return rc;
}

/* add a just created sock to the accept queue of the listen sock as
 * candidate for a following socket accept call from user space
 */
static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
{
	struct smc_sock *par = smc_sk(parent);

	sock_hold(sk); /* sock_put in smc_accept_unlink() */
	spin_lock(&par->accept_q_lock);
	list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
	spin_unlock(&par->accept_q_lock);
	sk_acceptq_added(parent);
}

/* remove a socket from the accept queue of its parental listening socket */
static void smc_accept_unlink(struct sock *sk)
{
	struct smc_sock *par = smc_sk(sk)->listen_smc;

	spin_lock(&par->accept_q_lock);
	list_del_init(&smc_sk(sk)->accept_q);
	spin_unlock(&par->accept_q_lock);
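	/* bookkeeping only: decrement the listen socket's accept backlog */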
	sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
	sock_put(sk); /* sock_hold in smc_accept_enqueue */
}

/* remove a sock from the accept queue to bind it to a new socket created
 * for a socket accept call from user space
 */
struct sock *smc_accept_dequeue(struct sock *parent,
				struct socket *new_sock)
{
	struct smc_sock *isk, *n;
	struct sock *new_sk;

	list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
		new_sk = (struct sock *)isk;

		smc_accept_unlink(new_sk);
		if (new_sk->sk_state == SMC_CLOSED) {
			if (isk->clcsock) {
				sock_release(isk->clcsock);
				isk->clcsock = NULL;
			}
			new_sk->sk_prot->unhash(new_sk);
			sock_put(new_sk); /* final */
			continue;
		}
		if (new_sock)
			sock_graft(new_sk, new_sock);
		return new_sk;
	}
	return NULL;
}

/* clean up for a created but never accepted sock */
void smc_close_non_accepted(struct sock *sk)
{
	struct smc_sock *smc = smc_sk(sk);

	lock_sock(sk);
	if (!sk->sk_lingertime)
		/* wait for peer closing */
		sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
	if (!smc->use_fallback) {
		smc_close_active(smc);
		sock_set_flag(sk, SOCK_DEAD);
		sk->sk_shutdown |= SHUTDOWN_MASK;
	}
	if (smc->clcsock) {
		struct socket *tcp;

		tcp = smc->clcsock;
		smc->clcsock = NULL;
		sock_release(tcp);
	}
	if (smc->use_fallback) {
		sock_put(sk); /* passive closing */
		sk->sk_state = SMC_CLOSED;
	} else {
		if (sk->sk_state == SMC_CLOSED)
			smc_conn_free(&smc->conn);
	}
	release_sock(sk);
	sk->sk_prot->unhash(sk);
	sock_put(sk); /* final sock_put */
}

static int smc_serv_conf_first_link(struct smc_sock *smc)
{
	struct net *net = sock_net(smc->clcsock->sk);
	struct smc_link_group *lgr = smc->conn.lgr;
	struct smc_link *link;
	int rest;
	int rc;

	link = &lgr->lnk[SMC_SINGLE_LINK];

	if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
		return SMC_CLC_DECL_INTERR;

	/* send CONFIRM LINK request to client over the RoCE fabric */
	rc = smc_llc_send_confirm_link(link,
				       link->smcibdev->mac[link->ibport - 1],
				       &link->smcibdev->gid[link->ibport - 1],
				       SMC_LLC_REQ);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	/* receive CONFIRM LINK response from client over the RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(
		&link->llc_confirm_resp,
		SMC_LLC_WAIT_FIRST_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	if (link->llc_confirm_resp_rc)
		return SMC_CLC_DECL_RMBE_EC;

	/* send ADD LINK request to client over the RoCE fabric */
	rc = smc_llc_send_add_link(link,
				   link->smcibdev->mac[link->ibport - 1],
				   &link->smcibdev->gid[link->ibport - 1],
				   SMC_LLC_REQ);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	/* receive ADD LINK response from client over the RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp,
							 SMC_LLC_WAIT_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);

	return 0;
}

/* setup for RDMA connection of server */
static void smc_listen_work(struct work_struct *work)
{
	struct smc_sock *new_smc = container_of(work, struct smc_sock,
						smc_listen_work);
	struct smc_clc_msg_proposal_prefix *pclc_prfx;
	struct socket *newclcsock = new_smc->clcsock;
	struct smc_sock *lsmc = new_smc->listen_smc;
	struct smc_clc_msg_accept_confirm cclc;
	int local_contact = SMC_REUSE_CONTACT;
	struct sock *newsmcsk = &new_smc->sk;
	struct smc_clc_msg_proposal *pclc;
	struct smc_ib_device *smcibdev;
	u8 buf[SMC_CLC_MAX_LEN];
	struct smc_link *link;
	int reason_code = 0;
	int rc = 0;
	u8 ibport;

	if (new_smc->use_fallback)
		goto out_connected;

	/* check if peer is smc capable */
	if (!tcp_sk(newclcsock->sk)->syn_smc) {
		new_smc->use_fallback = true;
		goto out_connected;
	}

	/* do inband token exchange -
	 * wait for and receive SMC Proposal CLC message
	 */
	reason_code = smc_clc_wait_msg(new_smc, &buf, sizeof(buf),
				       SMC_CLC_PROPOSAL);
	if (reason_code < 0)
		goto out_err;
	if (reason_code > 0)
		goto decline_rdma;

	/* IPSec connections opt out of SMC-R optimizations */
	if (using_ipsec(new_smc)) {
		reason_code = SMC_CLC_DECL_IPSEC;
		goto decline_rdma;
	}

	/* PNET table look up: search active ib_device and port
	 * within same PNETID that also contains the ethernet device
	 * used for the internal TCP socket
	 */
	smc_pnet_find_roce_resource(newclcsock->sk, &smcibdev, &ibport);
	if (!smcibdev) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	pclc = (struct smc_clc_msg_proposal *)&buf;
	pclc_prfx = smc_clc_proposal_get_prefix(pclc);

	rc = smc_clc_prfx_match(newclcsock, pclc_prfx);
	if (rc) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	/* allocate connection / link group */
	mutex_lock(&smc_create_lgr_pending);
	local_contact = smc_conn_create(new_smc, smcibdev, ibport, &pclc->lcl,
					0);
	if (local_contact < 0) {
		rc = local_contact;
		if (rc == -ENOMEM)
			reason_code = SMC_CLC_DECL_MEM; /* insufficient memory */
		goto decline_rdma_unlock;
	}
	link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];

	/* create send buffer and rmb */
	rc = smc_buf_create(new_smc);
	if (rc) {
		reason_code = SMC_CLC_DECL_MEM;
		goto decline_rdma_unlock;
	}

	smc_close_init(new_smc);
	smc_rx_init(new_smc);

	if (local_contact != SMC_FIRST_CONTACT) {
		if (!new_smc->conn.rmb_desc->reused) {
			if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true)) {
				reason_code = SMC_CLC_DECL_INTERR;
				goto decline_rdma_unlock;
			}
		}
	}
	smc_rmb_sync_sg_for_device(&new_smc->conn);

	rc = smc_clc_send_accept(new_smc, local_contact);
	if (rc)
		goto out_err_unlock;

	/* receive SMC Confirm CLC message */
	reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
				       SMC_CLC_CONFIRM);
	if (reason_code < 0)
		goto out_err_unlock;
	if (reason_code > 0)
		goto decline_rdma_unlock;
	smc_conn_save_peer_info(new_smc, &cclc);
	if (local_contact == SMC_FIRST_CONTACT)
		smc_link_save_peer_info(link, &cclc);

	rc = smc_rmb_rtoken_handling(&new_smc->conn, &cclc);
	if (rc) {
		reason_code = SMC_CLC_DECL_INTERR;
		goto decline_rdma_unlock;
	}

	if (local_contact == SMC_FIRST_CONTACT) {
		rc = smc_ib_ready_link(link);
		if (rc) {
			reason_code = SMC_CLC_DECL_INTERR;
			goto decline_rdma_unlock;
		}
		/* QP confirmation over RoCE fabric */
		reason_code = smc_serv_conf_first_link(new_smc);
		if (reason_code < 0)
			/* peer is not aware of a problem */
			goto out_err_unlock;
		if (reason_code > 0)
			goto decline_rdma_unlock;
	}

	smc_tx_init(new_smc);
	mutex_unlock(&smc_create_lgr_pending);

out_connected:
	sk_refcnt_debug_inc(newsmcsk);
	if (newsmcsk->sk_state == SMC_INIT)
		newsmcsk->sk_state = SMC_ACTIVE;
enqueue:
	lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
	if (lsmc->sk.sk_state == SMC_LISTEN) {
		smc_accept_enqueue(&lsmc->sk, newsmcsk);
	} else { /* no longer listening */
		smc_close_non_accepted(newsmcsk);
	}
	release_sock(&lsmc->sk);

	/* Wake up accept */
	lsmc->sk.sk_data_ready(&lsmc->sk);
	sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
	return;

decline_rdma_unlock:
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(new_smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
decline_rdma:
	/* RDMA setup failed, switch back to TCP */
	smc_conn_free(&new_smc->conn);
	new_smc->use_fallback = true;
	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
		if (smc_clc_send_decline(new_smc, reason_code) < 0)
			goto out_err;
	}
	goto out_connected;

out_err_unlock:
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(new_smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
out_err:
	if (newsmcsk->sk_state == SMC_INIT)
		sock_put(&new_smc->sk); /* passive closing */
	newsmcsk->sk_state = SMC_CLOSED;
	smc_conn_free(&new_smc->conn);
	goto enqueue; /* queue new sock with sk_err set */
}

static void smc_tcp_listen_work(struct work_struct *work)
{
	struct smc_sock *lsmc = container_of(work, struct smc_sock,
					     tcp_listen_work);
	struct sock *lsk = &lsmc->sk;
	struct smc_sock *new_smc;
	int rc = 0;

	lock_sock(lsk);
	while (lsk->sk_state == SMC_LISTEN) {
		rc = smc_clcsock_accept(lsmc, &new_smc);
		if (rc)
			goto out;
		if (!new_smc)
			continue;

		new_smc->listen_smc = lsmc;
		new_smc->use_fallback = lsmc->use_fallback;
		sock_hold(lsk); /* sock_put in smc_listen_work */
		INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
		smc_copy_sock_settings_to_smc(new_smc);
		sock_hold(&new_smc->sk); /* sock_put in passive closing */
		if (!schedule_work(&new_smc->smc_listen_work))
			sock_put(&new_smc->sk);
	}

out:
	release_sock(lsk);
	sock_put(&lsmc->sk); /* sock_hold in smc_listen */
}

static int smc_listen(struct socket *sock, int backlog)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sk);
	lock_sock(sk);

	rc = -EINVAL;
	if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
		goto out;

	rc = 0;
	if (sk->sk_state == SMC_LISTEN) {
		sk->sk_max_ack_backlog = backlog;
		goto out;
	}
	/* some socket options are handled in core, so we cannot apply
	 * them to the clc socket -- copy smc socket options to clc socket
	 */
	smc_copy_sock_settings_to_clc(smc);
	if (!smc->use_fallback)
		tcp_sk(smc->clcsock->sk)->syn_smc = 1;

	rc = kernel_listen(smc->clcsock, backlog);
	if (rc)
		goto out;
	sk->sk_max_ack_backlog = backlog;
	sk->sk_ack_backlog = 0;
	sk->sk_state = SMC_LISTEN;
	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
	sock_hold(sk); /* sock_put in smc_tcp_listen_work() */
	if (!schedule_work(&smc->tcp_listen_work))
		sock_put(sk);

out:
	release_sock(sk);
	return rc;
}

static int smc_accept(struct socket *sock, struct socket *new_sock,
		      int flags, bool kern)
{
	struct sock *sk = sock->sk, *nsk;
	DECLARE_WAITQUEUE(wait, current);
	struct smc_sock *lsmc;
	long timeo;
	int rc = 0;

	lsmc = smc_sk(sk);
	sock_hold(sk); /* sock_put below */
	lock_sock(sk);

	if (lsmc->sk.sk_state != SMC_LISTEN) {
		rc = -EINVAL;
		release_sock(sk);
		goto out;
	}

	/* Wait for an incoming connection */
	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
	add_wait_queue_exclusive(sk_sleep(sk), &wait);
	while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (!timeo) {
			rc = -EAGAIN;
			break;
		}
		release_sock(sk);
		timeo = schedule_timeout(timeo);
		/* wakeup by sk_data_ready in smc_listen_work() */
		sched_annotate_sleep();
		lock_sock(sk);
		if (signal_pending(current)) {
			rc = sock_intr_errno(timeo);
			break;
		}
	}
	set_current_state(TASK_RUNNING);
	remove_wait_queue(sk_sleep(sk), &wait);

	if (!rc)
		rc = sock_error(nsk);
	release_sock(sk);
	if (rc)
		goto out;

	if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) {
		/* wait till data arrives on the socket */
		timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept *
					 MSEC_PER_SEC);
		if (smc_sk(nsk)->use_fallback) {
			struct sock *clcsk = smc_sk(nsk)->clcsock->sk;

			lock_sock(clcsk);
			if (skb_queue_empty(&clcsk->sk_receive_queue))
				sk_wait_data(clcsk, &timeo, NULL);
			release_sock(clcsk);
		} else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) {
			lock_sock(nsk);
			smc_rx_wait(smc_sk(nsk), &timeo, smc_rx_data_available);
			release_sock(nsk);
		}
	}

out:
	sock_put(sk); /* sock_hold above */
	return rc;
}

static int smc_getname(struct socket *sock, struct sockaddr *addr,
		       int peer)
{
	struct smc_sock *smc;

	if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
	    (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
		return -ENOTCONN;

	smc = smc_sk(sock->sk);

	return smc->clcsock->ops->getname(smc->clcsock, addr, peer);
}

static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EPIPE;

	smc = smc_sk(sk);
	lock_sock(sk);
	if ((sk->sk_state != SMC_ACTIVE) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
	    (sk->sk_state != SMC_INIT))
		goto out;

	if (msg->msg_flags & MSG_FASTOPEN) {
		if (sk->sk_state == SMC_INIT) {
			smc->use_fallback = true;
		} else {
			rc = -EINVAL;
			goto out;
		}
	}

	if (smc->use_fallback)
		rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
	else
		rc = smc_tx_sendmsg(smc, msg, len);
out:
	release_sock(sk);
	return rc;
}

static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
		       int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -ENOTCONN;

	smc = smc_sk(sk);
	lock_sock(sk);
	if ((sk->sk_state == SMC_INIT) ||
	    (sk->sk_state == SMC_LISTEN) ||
	    (sk->sk_state == SMC_CLOSED))
		goto out;

	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
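		/* socket was connected before, no more data to read */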
		rc = 0;
		goto out;
	}

	if (smc->use_fallback) {
		rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
	} else {
		msg->msg_namelen = 0;
		rc = smc_rx_recvmsg(smc, msg, NULL, len, flags);
	}

out:
	release_sock(sk);
	return rc;
}

static __poll_t smc_accept_poll(struct sock *parent)
{
	struct smc_sock *isk = smc_sk(parent);
	__poll_t mask = 0;

	spin_lock(&isk->accept_q_lock);
	if (!list_empty(&isk->accept_q))
		mask = EPOLLIN | EPOLLRDNORM;
	spin_unlock(&isk->accept_q_lock);

	return mask;
}

static __poll_t smc_poll(struct file *file, struct socket *sock,
			 poll_table *wait)
{
	struct sock *sk = sock->sk;
	__poll_t mask = 0;
	struct smc_sock *smc;
	int rc;

	if (!sk)
		return EPOLLNVAL;

	smc = smc_sk(sock->sk);
	sock_hold(sk);
	lock_sock(sk);
	if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
		/* delegate to CLC child sock */
		release_sock(sk);
		mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
		lock_sock(sk);
		sk->sk_err = smc->clcsock->sk->sk_err;
		if (sk->sk_err) {
			mask |= EPOLLERR;
		} else {
			/* if non-blocking connect finished ... */
			if (sk->sk_state == SMC_INIT &&
			    mask & EPOLLOUT &&
			    smc->clcsock->sk->sk_state != TCP_CLOSE) {
				rc = smc_connect_rdma(smc);
				if (rc < 0)
					mask |= EPOLLERR;
				/* success cases including fallback */
				mask |= EPOLLOUT | EPOLLWRNORM;
			}
		}
	} else {
		if (sk->sk_state != SMC_CLOSED) {
			release_sock(sk);
			sock_poll_wait(file, sk_sleep(sk), wait);
			lock_sock(sk);
		}
		if (sk->sk_err)
			mask |= EPOLLERR;
		if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
		    (sk->sk_state == SMC_CLOSED))
			mask |= EPOLLHUP;
		if (sk->sk_state == SMC_LISTEN) {
			/* woken up by sk_data_ready in smc_listen_work() */
			mask = smc_accept_poll(sk);
		} else {
			if (atomic_read(&smc->conn.sndbuf_space) ||
			    sk->sk_shutdown & SEND_SHUTDOWN) {
				mask |= EPOLLOUT | EPOLLWRNORM;
			} else {
				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
			}
			if (atomic_read(&smc->conn.bytes_to_rcv))
				mask |= EPOLLIN | EPOLLRDNORM;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
			if (sk->sk_state == SMC_APPCLOSEWAIT1)
				mask |= EPOLLIN;
		}
	}
	release_sock(sk);
	sock_put(sk);

	return mask;
}

static int smc_shutdown(struct socket *sock, int how)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EINVAL;
	int rc1 = 0;

	smc = smc_sk(sk);

	if ((how < SHUT_RD) || (how > SHUT_RDWR))
		return rc;

	lock_sock(sk);

	rc = -ENOTCONN;
	if ((sk->sk_state != SMC_LISTEN) &&
	    (sk->sk_state != SMC_ACTIVE) &&
	    (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
	    (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT2) &&
	    (sk->sk_state != SMC_APPFINCLOSEWAIT))
		goto out;
	if (smc->use_fallback) {
		rc = kernel_sock_shutdown(smc->clcsock, how);
		sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
		if (sk->sk_shutdown == SHUTDOWN_MASK)
			sk->sk_state = SMC_CLOSED;
		goto out;
	}
	switch (how) {
	case SHUT_RDWR:		/* shutdown in both directions */
		rc = smc_close_active(smc);
		break;
	case SHUT_WR:
		rc = smc_close_shutdown_write(smc);
		break;
	case SHUT_RD:
		rc = 0;
		/* nothing more to do because peer is not involved */
		break;
	}
	if (smc->clcsock)
		rc1 = kernel_sock_shutdown(smc->clcsock, how);
	/* map sock_shutdown_cmd constants to sk_shutdown value range */
	sk->sk_shutdown |= how + 1;

out:
	release_sock(sk);
	return rc ? rc : rc1;
}

static int smc_setsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int val, rc;

	smc = smc_sk(sk);

	/* generic setsockopts reaching us here always apply to the
	 * CLC socket
	 */
	rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
					   optval, optlen);
	if (smc->clcsock->sk->sk_err) {
		sk->sk_err = smc->clcsock->sk->sk_err;
		sk->sk_error_report(sk);
	}
	if (rc)
		return rc;

	if (optlen < sizeof(int))
		return rc;
	get_user(val, (int __user *)optval);

	lock_sock(sk);
	switch (optname) {
	case TCP_ULP:
	case TCP_FASTOPEN:
	case TCP_FASTOPEN_CONNECT:
	case TCP_FASTOPEN_KEY:
	case TCP_FASTOPEN_NO_COOKIE:
		/* option not supported by SMC */
		if (sk->sk_state == SMC_INIT) {
			smc->use_fallback = true;
		} else {
			if (!smc->use_fallback)
				rc = -EINVAL;
		}
		break;
	case TCP_NODELAY:
		if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
			if (val && !smc->use_fallback)
				mod_delayed_work(system_wq, &smc->conn.tx_work,
						 0);
		}
		break;
	case TCP_CORK:
		if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
			if (!val && !smc->use_fallback)
				mod_delayed_work(system_wq, &smc->conn.tx_work,
						 0);
		}
		break;
	case TCP_DEFER_ACCEPT:
		smc->sockopt_defer_accept = val;
		break;
	default:
		break;
	}
	release_sock(sk);

	return rc;
}

static int smc_getsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	struct smc_sock *smc;

	smc = smc_sk(sock->sk);
	/* socket options apply to the CLC socket */
	return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
					     optval, optlen);
}

static int smc_ioctl(struct socket *sock, unsigned int cmd,
		     unsigned long arg)
{
	struct smc_sock *smc;
	int answ;

	smc = smc_sk(sock->sk);
	if (smc->use_fallback) {
		if (!smc->clcsock)
			return -EBADF;
		return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
	}
	switch (cmd) {
	case SIOCINQ: /* same as FIONREAD */
		if (smc->sk.sk_state == SMC_LISTEN)
			return -EINVAL;
		answ = atomic_read(&smc->conn.bytes_to_rcv);
		break;
	case SIOCOUTQ:
		/* output queue size (not sent + not acked) */
		if (smc->sk.sk_state == SMC_LISTEN)
			return -EINVAL;
		answ = smc->conn.sndbuf_size -
			atomic_read(&smc->conn.sndbuf_space);
		break;
	case SIOCOUTQNSD:
		/* output queue size (not sent only) */
		if (smc->sk.sk_state == SMC_LISTEN)
			return -EINVAL;
		answ = smc_tx_prepared_sends(&smc->conn);
		break;
	default:
		return -ENOIOCTLCMD;
	}

	return put_user(answ, (int __user *)arg);
}

static ssize_t smc_sendpage(struct socket *sock, struct page *page,
			    int offset, size_t size, int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EPIPE;

	smc = smc_sk(sk);
	lock_sock(sk);
	if (sk->sk_state != SMC_ACTIVE) {
		release_sock(sk);
		goto out;
	}
	release_sock(sk);
	if (smc->use_fallback)
		rc = kernel_sendpage(smc->clcsock, page, offset,
				     size, flags);
	else
		rc = sock_no_sendpage(sock, page, offset, size, flags);

out:
	return rc;
}

/* Map the affected portions of the rmbe into an spd, note the number of bytes
 * to splice in conn->splice_pending, and press 'go'. Delays consumer cursor
 * updates till whenever a respective page has been fully processed.
 * Note that subsequent recv() calls have to wait till all splice() processing
 * has completed.
 */
static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
			       struct pipe_inode_info *pipe, size_t len,
			       unsigned int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -ENOTCONN;

	smc = smc_sk(sk);
	lock_sock(sk);

	if (sk->sk_state == SMC_INIT ||
	    sk->sk_state == SMC_LISTEN ||
	    sk->sk_state == SMC_CLOSED)
		goto out;

	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
		rc = 0;
		goto out;
	}

	if (smc->use_fallback) {
		rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
						    pipe, len, flags);
	} else {
		if (*ppos) {
			rc = -ESPIPE;
			goto out;
		}
		if (flags & SPLICE_F_NONBLOCK)
			flags = MSG_DONTWAIT;
		else
			flags = 0;
		rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags);
	}
out:
	release_sock(sk);

	return rc;
}

/* must look like tcp */
static const struct proto_ops smc_sock_ops = {
	.family		= PF_SMC,
	.owner		= THIS_MODULE,
	.release	= smc_release,
	.bind		= smc_bind,
	.connect	= smc_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= smc_accept,
	.getname	= smc_getname,
	.poll		= smc_poll,
	.ioctl		= smc_ioctl,
	.listen		= smc_listen,
	.shutdown	= smc_shutdown,
	.setsockopt	= smc_setsockopt,
	.getsockopt	= smc_getsockopt,
	.sendmsg	= smc_sendmsg,
	.recvmsg	= smc_recvmsg,
	.mmap		= sock_no_mmap,
	.sendpage	= smc_sendpage,
	.splice_read	= smc_splice_read,
};

static int smc_create(struct net *net, struct socket *sock, int protocol,
		      int kern)
{
	int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET;
	struct smc_sock *smc;
	struct sock *sk;
	int rc;

	rc = -ESOCKTNOSUPPORT;
	if (sock->type != SOCK_STREAM)
		goto out;

	rc = -EPROTONOSUPPORT;
	if (protocol != SMCPROTO_SMC && protocol != SMCPROTO_SMC6)
		goto out;

	rc = -ENOBUFS;
	sock->ops = &smc_sock_ops;
	sk = smc_sock_alloc(net, sock, protocol);
	if (!sk)
		goto out;

	/* create internal TCP socket for CLC handshake and fallback */
	smc = smc_sk(sk);
	smc->use_fallback = false; /* assume rdma capability first */
	rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
			      &smc->clcsock);
	if (rc) {
		sk_common_release(sk);
		goto out;
	}
	smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
	smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);

out:
	return rc;
}

static const struct net_proto_family smc_sock_family_ops = {
	.family	= PF_SMC,
	.owner	= THIS_MODULE,
	.create	= smc_create,
};

static int __init smc_init(void)
{
	int rc;

	rc = smc_pnet_init();
	if (rc)
		return rc;

	rc = smc_llc_init();
	if (rc) {
		pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = smc_cdc_init();
	if (rc) {
		pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = proto_register(&smc_proto, 1);
	if (rc) {
		pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = proto_register(&smc_proto6, 1);
	if (rc) {
		pr_err("%s: proto_register(v6) fails with %d\n", __func__, rc);
		goto out_proto;
	}

	rc = sock_register(&smc_sock_family_ops);
	if (rc) {
		pr_err("%s: sock_register fails with %d\n", __func__, rc);
		goto out_proto6;
	}
	INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
	INIT_HLIST_HEAD(&smc_v6_hashinfo.ht);

	rc = smc_ib_register_client();
	if (rc) {
		pr_err("%s: ib_register fails with %d\n", __func__, rc);
		goto out_sock;
	}

	static_branch_enable(&tcp_have_smc);
	return 0;

out_sock:
	sock_unregister(PF_SMC);
out_proto6:
	proto_unregister(&smc_proto6);
out_proto:
	proto_unregister(&smc_proto);
out_pnet:
	smc_pnet_exit();
	return rc;
}

static void __exit smc_exit(void)
{
	struct smc_link_group *lgr, *lg;
	LIST_HEAD(lgr_freeing_list);

	spin_lock_bh(&smc_lgr_list.lock);
	if (!list_empty(&smc_lgr_list.list))
		list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
	spin_unlock_bh(&smc_lgr_list.lock);
	list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
		list_del_init(&lgr->list);
		smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
		cancel_delayed_work_sync(&lgr->free_work);
		smc_lgr_free(lgr); /* free link group */
	}
	static_branch_disable(&tcp_have_smc);
	smc_ib_unregister_client();
	sock_unregister(PF_SMC);
	proto_unregister(&smc_proto6);
	proto_unregister(&smc_proto);
	smc_pnet_exit();
}

module_init(smc_init);
module_exit(smc_exit);

MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
MODULE_DESCRIPTION("smc socket address family");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_SMC);
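
/* Illustrative user-space sketch (documentation only, not part of this
 * module): an AF_SMC socket is used like a TCP socket; only the address
 * family and protocol constant differ, while the sockaddr stays AF_INET
 * or AF_INET6 (see smc_bind()/smc_connect() above). SMCPROTO_SMC and
 * SMCPROTO_SMC6 are assumed to come from <linux/smc.h>; the address and
 * port below are arbitrary examples:
 *
 *	int fd = socket(AF_SMC, SOCK_STREAM, SMCPROTO_SMC);
 *	struct sockaddr_in sin = { .sin_family = AF_INET };
 *
 *	sin.sin_port = htons(12345);
 *	inet_pton(AF_INET, "192.0.2.1", &sin.sin_addr);
 *	connect(fd, (struct sockaddr *)&sin, sizeof(sin));
 *
 * smc_connect() then runs the CLC handshake over the internal clcsock
 * and transparently falls back to plain TCP (use_fallback) when the
 * peer is not SMC-capable.
 */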