/*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 * AF_SMC protocol family socket handler keeping the AF_INET sock address type
 * applies to SOCK_STREAM sockets only
 * offers an alternative communication option for TCP-protocol sockets
 * applicable with RoCE-cards only
 *
 * Initial restrictions:
 *   - non-blocking connect postponed
 *   - IPv6 support postponed
 *   - support for alternate links postponed
 *   - partial support for non-blocking sockets only
 *   - support for urgent data postponed
 *
 * Copyright IBM Corp. 2016
 *
 * Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
 *             based on prototype from Frank Blaschka
 */

#define KMSG_COMPONENT "smc"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/module.h>
#include <linux/socket.h>
#include <linux/inetdevice.h>
#include <linux/workqueue.h>
#include <linux/in.h>
#include <linux/sched/signal.h>

#include <net/sock.h>
#include <net/tcp.h>
#include <net/smc.h>

#include "smc.h"
#include "smc_clc.h"
#include "smc_llc.h"
#include "smc_cdc.h"
#include "smc_core.h"
#include "smc_ib.h"
#include "smc_pnet.h"
#include "smc_tx.h"
#include "smc_rx.h"
#include "smc_close.h"

static DEFINE_MUTEX(smc_create_lgr_pending);	/* serialize link group
						 * creation
						 */

struct smc_lgr_list smc_lgr_list = {	/* established link groups */
	.lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
	.list = LIST_HEAD_INIT(smc_lgr_list.list),
};

static void smc_tcp_listen_work(struct work_struct *);

static void smc_set_keepalive(struct sock *sk, int val)
{
	struct smc_sock *smc = smc_sk(sk);

	smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
}

static struct smc_hashinfo smc_v4_hashinfo = {
	.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
};

int smc_hash_sk(struct sock *sk)
{
	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
	struct hlist_head *head;

	head = &h->ht;

	write_lock_bh(&h->lock);
	sk_add_node(sk, head);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	write_unlock_bh(&h->lock);

	return 0;
}
EXPORT_SYMBOL_GPL(smc_hash_sk);

void smc_unhash_sk(struct sock *sk)
{
	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;

	write_lock_bh(&h->lock);
	if (sk_del_node_init(sk))
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	write_unlock_bh(&h->lock);
}
EXPORT_SYMBOL_GPL(smc_unhash_sk);

struct proto smc_proto = {
	.name		= "SMC",
	.owner		= THIS_MODULE,
	.keepalive	= smc_set_keepalive,
	.hash		= smc_hash_sk,
	.unhash		= smc_unhash_sk,
	.obj_size	= sizeof(struct smc_sock),
	.h.smc_hash	= &smc_v4_hashinfo,
	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
};
EXPORT_SYMBOL_GPL(smc_proto);

static int smc_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = 0;

	if (!sk)
		goto out;

	smc = smc_sk(sk);
	if (sk->sk_state == SMC_LISTEN)
		/* smc_close_non_accepted() is called and acquires
		 * sock lock for child sockets again
		 */
		lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
	else
		lock_sock(sk);

	if (!smc->use_fallback) {
		rc = smc_close_active(smc);
		sock_set_flag(sk, SOCK_DEAD);
		sk->sk_shutdown |= SHUTDOWN_MASK;
	}
	if (smc->clcsock) {
		sock_release(smc->clcsock);
		smc->clcsock = NULL;
	}
	if (smc->use_fallback) {
		sock_put(sk); /* passive closing */
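		/* note: for a fallback socket all traffic went through the
		 * CLC socket released above; only SMC socket state
		 * bookkeeping remains to be done here
		 */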
		sk->sk_state = SMC_CLOSED;
		sk->sk_state_change(sk);
	}

	/* detach socket */
	sock_orphan(sk);
	sock->sk = NULL;
	if (!smc->use_fallback && sk->sk_state == SMC_CLOSED)
		smc_conn_free(&smc->conn);
	release_sock(sk);

	sk->sk_prot->unhash(sk);
	sock_put(sk); /* final sock_put */
out:
	return rc;
}

static void smc_destruct(struct sock *sk)
{
	if (sk->sk_state != SMC_CLOSED)
		return;
	if (!sock_flag(sk, SOCK_DEAD))
		return;

	sk_refcnt_debug_dec(sk);
}

static struct sock *smc_sock_alloc(struct net *net, struct socket *sock)
{
	struct smc_sock *smc;
	struct sock *sk;

	sk = sk_alloc(net, PF_SMC, GFP_KERNEL, &smc_proto, 0);
	if (!sk)
		return NULL;

	sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
	sk->sk_state = SMC_INIT;
	sk->sk_destruct = smc_destruct;
	sk->sk_protocol = SMCPROTO_SMC;
	smc = smc_sk(sk);
	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
	INIT_LIST_HEAD(&smc->accept_q);
	spin_lock_init(&smc->accept_q_lock);
	sk->sk_prot->hash(sk);
	sk_refcnt_debug_inc(sk);

	return sk;
}

static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
		    int addr_len)
{
	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sk);

	/* replicate tests from inet_bind(), to be safe wrt. future changes */
	rc = -EINVAL;
	if (addr_len < sizeof(struct sockaddr_in))
		goto out;

	rc = -EAFNOSUPPORT;
	/* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
	if ((addr->sin_family != AF_INET) &&
	    ((addr->sin_family != AF_UNSPEC) ||
	     (addr->sin_addr.s_addr != htonl(INADDR_ANY))))
		goto out;

	lock_sock(sk);

	/* Check if socket is already active */
	rc = -EINVAL;
	if (sk->sk_state != SMC_INIT)
		goto out_rel;

	smc->clcsock->sk->sk_reuse = sk->sk_reuse;
	rc = kernel_bind(smc->clcsock, uaddr, addr_len);

out_rel:
	release_sock(sk);
out:
	return rc;
}

static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
				   unsigned long mask)
{
	/* options we don't get control via setsockopt for */
	nsk->sk_type = osk->sk_type;
	nsk->sk_sndbuf = osk->sk_sndbuf;
	nsk->sk_rcvbuf = osk->sk_rcvbuf;
	nsk->sk_sndtimeo = osk->sk_sndtimeo;
	nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
	nsk->sk_mark = osk->sk_mark;
	nsk->sk_priority = osk->sk_priority;
	nsk->sk_rcvlowat = osk->sk_rcvlowat;
	nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
	nsk->sk_err = osk->sk_err;

	nsk->sk_flags &= ~mask;
	nsk->sk_flags |= osk->sk_flags & mask;
}

#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_BROADCAST) | \
			     (1UL << SOCK_TIMESTAMP) | \
			     (1UL << SOCK_DBG) | \
			     (1UL << SOCK_RCVTSTAMP) | \
			     (1UL << SOCK_RCVTSTAMPNS) | \
			     (1UL << SOCK_LOCALROUTE) | \
			     (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
			     (1UL << SOCK_RXQ_OVFL) | \
			     (1UL << SOCK_WIFI_STATUS) | \
			     (1UL << SOCK_NOFCS) | \
			     (1UL << SOCK_FILTER_LOCKED))
/* copy only relevant settings and flags of SOL_SOCKET level from smc to
 * clc socket (since smc is not called for these options from net/core)
 */
static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
{
	smc_copy_sock_settings(smc->clcsock->sk, &smc->sk,
			       SK_FLAGS_SMC_TO_CLC);
}
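/* note the asymmetry between the two masks: a broad set of SOL_SOCKET
 * flags flows from the SMC socket down to the CLC socket
 * (SK_FLAGS_SMC_TO_CLC above), while only URGINLINE, KEEPOPEN, LINGER
 * and DBG are mirrored back (SK_FLAGS_CLC_TO_SMC below)
 */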

#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_DBG))
/* copy only settings and flags relevant for smc from clc to smc socket */
static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
{
	smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
}

/* determine subnet and mask of internal TCP socket */
int smc_netinfo_by_tcpsk(struct socket *clcsock,
			 __be32 *subnet, u8 *prefix_len)
{
	struct dst_entry *dst = sk_dst_get(clcsock->sk);
	struct in_device *in_dev;
	struct sockaddr_in addr;
	int rc = -ENOENT;
	int len;

	if (!dst) {
		rc = -ENOTCONN;
		goto out;
	}
	if (!dst->dev) {
		rc = -ENODEV;
		goto out_rel;
	}

	/* get address to which the internal TCP socket is bound */
	kernel_getsockname(clcsock, (struct sockaddr *)&addr, &len);
	/* analyze IPv4 specific data of net_device belonging to TCP socket */
	rcu_read_lock();
	in_dev = __in_dev_get_rcu(dst->dev);
	for_ifa(in_dev) {
		if (!inet_ifa_match(addr.sin_addr.s_addr, ifa))
			continue;
		*prefix_len = inet_mask_len(ifa->ifa_mask);
		*subnet = ifa->ifa_address & ifa->ifa_mask;
		rc = 0;
		break;
	} endfor_ifa(in_dev);
	rcu_read_unlock();

out_rel:
	dst_release(dst);
out:
	return rc;
}

static int smc_clnt_conf_first_link(struct smc_sock *smc, union ib_gid *gid)
{
	struct smc_link_group *lgr = smc->conn.lgr;
	struct smc_link *link;
	int rest;
	int rc;

	link = &lgr->lnk[SMC_SINGLE_LINK];
	/* receive CONFIRM LINK request from server over RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(
		&link->llc_confirm,
		SMC_LLC_WAIT_FIRST_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	rc = smc_ib_modify_qp_rts(link);
	if (rc)
		return SMC_CLC_DECL_INTERR;

	smc_wr_remember_qp_attr(link);

	rc = smc_wr_reg_send(link,
			     smc->conn.rmb_desc->mr_rx[SMC_SINGLE_LINK]);
	if (rc)
		return SMC_CLC_DECL_INTERR;

	/* send CONFIRM LINK response over RoCE fabric */
	rc = smc_llc_send_confirm_link(link,
				       link->smcibdev->mac[link->ibport - 1],
				       gid, SMC_LLC_RESP);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	return rc;
}

static void smc_conn_save_peer_info(struct smc_sock *smc,
				    struct smc_clc_msg_accept_confirm *clc)
{
	smc->conn.peer_conn_idx = clc->conn_idx;
	smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
	smc->conn.peer_rmbe_size = smc_uncompress_bufsize(clc->rmbe_size);
	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
}

static void smc_link_save_peer_info(struct smc_link *link,
				    struct smc_clc_msg_accept_confirm *clc)
{
	link->peer_qpn = ntoh24(clc->qpn);
	memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
	memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
	link->peer_psn = ntoh24(clc->psn);
	link->peer_mtu = clc->qp_mtu;
}

static void smc_lgr_forget(struct smc_link_group *lgr)
{
	spin_lock_bh(&smc_lgr_list.lock);
	/* do not use this link group for new connections */
	if (!list_empty(&lgr->list))
		list_del_init(&lgr->list);
	spin_unlock_bh(&smc_lgr_list.lock);
}

/* setup for RDMA connection of client */
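/* client-side CLC handshake as implemented below: send a CLC Proposal,
 * wait for the server's CLC Accept, set up the link group and buffers,
 * send a CLC Confirm, and for a first contact exchange LLC CONFIRM LINK
 * messages over the new RoCE link; any setup failure falls back to TCP
 * via the decline_rdma paths
 */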
static int smc_connect_rdma(struct smc_sock *smc)
{
	struct sockaddr_in *inaddr = (struct sockaddr_in *)smc->addr;
	struct smc_clc_msg_accept_confirm aclc;
	int local_contact = SMC_FIRST_CONTACT;
	struct smc_ib_device *smcibdev;
	struct smc_link *link;
	u8 srv_first_contact;
	int reason_code = 0;
	int rc = 0;
	u8 ibport;

	sock_hold(&smc->sk); /* sock put in passive closing */

	if (!tcp_sk(smc->clcsock->sk)->syn_smc) {
		/* peer has not signalled SMC-capability */
		smc->use_fallback = true;
		goto out_connected;
	}

	/* IPSec connections opt out of SMC-R optimizations */
	if (using_ipsec(smc)) {
		reason_code = SMC_CLC_DECL_IPSEC;
		goto decline_rdma;
	}

	/* PNET table lookup: search active ib_device and port
	 * within same PNETID that also contains the ethernet device
	 * used for the internal TCP socket
	 */
	smc_pnet_find_roce_resource(smc->clcsock->sk, &smcibdev, &ibport);
	if (!smcibdev) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	/* do inband token exchange */
	reason_code = smc_clc_send_proposal(smc, smcibdev, ibport);
	if (reason_code < 0) {
		rc = reason_code;
		goto out_err;
	}
	if (reason_code > 0) /* configuration error */
		goto decline_rdma;
	/* receive SMC Accept CLC message */
	reason_code = smc_clc_wait_msg(smc, &aclc, sizeof(aclc),
				       SMC_CLC_ACCEPT);
	if (reason_code < 0) {
		rc = reason_code;
		goto out_err;
	}
	if (reason_code > 0)
		goto decline_rdma;

	srv_first_contact = aclc.hdr.flag;
	mutex_lock(&smc_create_lgr_pending);
	local_contact = smc_conn_create(smc, inaddr->sin_addr.s_addr, smcibdev,
					ibport, &aclc.lcl, srv_first_contact);
	if (local_contact < 0) {
		rc = local_contact;
		if (rc == -ENOMEM)
			reason_code = SMC_CLC_DECL_MEM; /* insufficient memory */
		else if (rc == -ENOLINK)
			reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
		goto decline_rdma_unlock;
	}
	link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];

	smc_conn_save_peer_info(smc, &aclc);

	/* create send buffer and rmb */
	rc = smc_buf_create(smc);
	if (rc) {
		reason_code = SMC_CLC_DECL_MEM;
		goto decline_rdma_unlock;
	}

	if (local_contact == SMC_FIRST_CONTACT)
		smc_link_save_peer_info(link, &aclc);

	rc = smc_rmb_rtoken_handling(&smc->conn, &aclc);
	if (rc) {
		reason_code = SMC_CLC_DECL_INTERR;
		goto decline_rdma_unlock;
	}

	smc_close_init(smc);
	smc_rx_init(smc);

	if (local_contact == SMC_FIRST_CONTACT) {
		rc = smc_ib_ready_link(link);
		if (rc) {
			reason_code = SMC_CLC_DECL_INTERR;
			goto decline_rdma_unlock;
		}
	} else {
		struct smc_buf_desc *buf_desc = smc->conn.rmb_desc;

		if (!buf_desc->reused) {
			/* register memory region for new rmb */
			rc = smc_wr_reg_send(link,
					     buf_desc->mr_rx[SMC_SINGLE_LINK]);
			if (rc) {
				reason_code = SMC_CLC_DECL_INTERR;
				goto decline_rdma_unlock;
			}
		}
	}
	smc_rmb_sync_sg_for_device(&smc->conn);

	rc = smc_clc_send_confirm(smc);
	if (rc)
		goto out_err_unlock;

	if (local_contact == SMC_FIRST_CONTACT) {
		/* QP confirmation over RoCE fabric */
		reason_code = smc_clnt_conf_first_link(
			smc, &smcibdev->gid[ibport - 1]);
		if (reason_code < 0) {
			rc = reason_code;
			goto out_err_unlock;
		}
		if (reason_code > 0)
			goto decline_rdma_unlock;
	}

	mutex_unlock(&smc_create_lgr_pending);
	smc_tx_init(smc);

out_connected:
	smc_copy_sock_settings_to_clc(smc);
	if (smc->sk.sk_state == SMC_INIT)
		smc->sk.sk_state = SMC_ACTIVE;

	return rc ? rc : local_contact;

decline_rdma_unlock:
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
	smc_conn_free(&smc->conn);
decline_rdma:
	/* RDMA setup failed, switch back to TCP */
	smc->use_fallback = true;
	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
		rc = smc_clc_send_decline(smc, reason_code);
		if (rc < 0)
			goto out_err;
	}
	goto out_connected;

out_err_unlock:
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
	smc_conn_free(&smc->conn);
out_err:
	if (smc->sk.sk_state == SMC_INIT)
		sock_put(&smc->sk); /* passive closing */
	return rc;
}

static int smc_connect(struct socket *sock, struct sockaddr *addr,
		       int alen, int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EINVAL;

	smc = smc_sk(sk);

	/* separate smc parameter checking to be safe */
	if (alen < sizeof(addr->sa_family))
		goto out_err;
	if (addr->sa_family != AF_INET)
		goto out_err;
	smc->addr = addr; /* needed for nonblocking connect */

	lock_sock(sk);
	switch (sk->sk_state) {
	default:
		goto out;
	case SMC_ACTIVE:
		rc = -EISCONN;
		goto out;
	case SMC_INIT:
		rc = 0;
		break;
	}

	smc_copy_sock_settings_to_clc(smc);
	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
	rc = kernel_connect(smc->clcsock, addr, alen, flags);
	if (rc)
		goto out;

	/* setup RDMA connection */
	rc = smc_connect_rdma(smc);
	if (rc < 0)
		goto out;
	else
		rc = 0; /* success cases including fallback */

out:
	release_sock(sk);
out_err:
	return rc;
}

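/* accept one pending connection on the internal CLC socket and wrap it
 * in a freshly allocated SMC socket; the listen sock lock is dropped
 * while allocating and while blocking in kernel_accept(), and re-taken
 * afterwards
 */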
static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
{
	struct socket *new_clcsock = NULL;
	struct sock *lsk = &lsmc->sk;
	struct sock *new_sk;
	int rc;

	release_sock(lsk);
	new_sk = smc_sock_alloc(sock_net(lsk), NULL);
	if (!new_sk) {
		rc = -ENOMEM;
		lsk->sk_err = ENOMEM;
		*new_smc = NULL;
		lock_sock(lsk);
		goto out;
	}
	*new_smc = smc_sk(new_sk);

	rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
	lock_sock(lsk);
	if (rc < 0)
		lsk->sk_err = -rc;
	if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
		if (new_clcsock)
			sock_release(new_clcsock);
		new_sk->sk_state = SMC_CLOSED;
		sock_set_flag(new_sk, SOCK_DEAD);
		new_sk->sk_prot->unhash(new_sk);
		sock_put(new_sk); /* final */
		*new_smc = NULL;
		goto out;
	}

	(*new_smc)->clcsock = new_clcsock;
out:
	return rc;
}

/* add a just created sock to the accept queue of the listen sock as
 * candidate for a following socket accept call from user space
 */
static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
{
	struct smc_sock *par = smc_sk(parent);

	sock_hold(sk); /* sock_put in smc_accept_unlink() */
	spin_lock(&par->accept_q_lock);
	list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
	spin_unlock(&par->accept_q_lock);
	sk_acceptq_added(parent);
}

/* remove a socket from the accept queue of its parental listening socket */
static void smc_accept_unlink(struct sock *sk)
{
	struct smc_sock *par = smc_sk(sk)->listen_smc;

	spin_lock(&par->accept_q_lock);
	list_del_init(&smc_sk(sk)->accept_q);
	spin_unlock(&par->accept_q_lock);
	sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
	sock_put(sk); /* sock_hold in smc_accept_enqueue */
}

/* remove a sock from the accept queue to bind it to a new socket created
 * for a socket accept call from user space
 */
struct sock *smc_accept_dequeue(struct sock *parent,
				struct socket *new_sock)
{
	struct smc_sock *isk, *n;
	struct sock *new_sk;

	list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
		new_sk = (struct sock *)isk;

		smc_accept_unlink(new_sk);
		if (new_sk->sk_state == SMC_CLOSED) {
			if (isk->clcsock) {
				sock_release(isk->clcsock);
				isk->clcsock = NULL;
			}
			new_sk->sk_prot->unhash(new_sk);
			sock_put(new_sk); /* final */
			continue;
		}
		if (new_sock)
			sock_graft(new_sk, new_sock);
		return new_sk;
	}
	return NULL;
}

/* clean up for a created but never accepted sock */
void smc_close_non_accepted(struct sock *sk)
{
	struct smc_sock *smc = smc_sk(sk);

	lock_sock(sk);
	if (!sk->sk_lingertime)
		/* wait for peer closing */
		sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
	if (!smc->use_fallback) {
		smc_close_active(smc);
		sock_set_flag(sk, SOCK_DEAD);
		sk->sk_shutdown |= SHUTDOWN_MASK;
	}
	if (smc->clcsock) {
		struct socket *tcp;

		tcp = smc->clcsock;
		smc->clcsock = NULL;
		sock_release(tcp);
	}
	if (smc->use_fallback) {
		sock_put(sk); /* passive closing */
		sk->sk_state = SMC_CLOSED;
	} else {
		if (sk->sk_state == SMC_CLOSED)
			smc_conn_free(&smc->conn);
	}
	release_sock(sk);
	sk->sk_prot->unhash(sk);
	sock_put(sk); /* final sock_put */
}

static int smc_serv_conf_first_link(struct smc_sock *smc)
{
	struct smc_link_group *lgr = smc->conn.lgr;
	struct smc_link *link;
	int rest;
	int rc;

	link = &lgr->lnk[SMC_SINGLE_LINK];

	rc = smc_wr_reg_send(link,
			     smc->conn.rmb_desc->mr_rx[SMC_SINGLE_LINK]);
	if (rc)
		return SMC_CLC_DECL_INTERR;

	/* send CONFIRM LINK request to client over the RoCE fabric */
	rc = smc_llc_send_confirm_link(link,
				       link->smcibdev->mac[link->ibport - 1],
				       &link->smcibdev->gid[link->ibport - 1],
				       SMC_LLC_REQ);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	/* receive CONFIRM LINK response from client over the RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(
		&link->llc_confirm_resp,
		SMC_LLC_WAIT_FIRST_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
	}

	return rc;
}

/* setup for RDMA connection of server */
static void smc_listen_work(struct work_struct *work)
{
	struct smc_sock *new_smc = container_of(work, struct smc_sock,
						smc_listen_work);
	struct smc_clc_msg_proposal_prefix *pclc_prfx;
	struct socket *newclcsock = new_smc->clcsock;
	struct smc_sock *lsmc = new_smc->listen_smc;
	struct smc_clc_msg_accept_confirm cclc;
	int local_contact = SMC_REUSE_CONTACT;
	struct sock *newsmcsk = &new_smc->sk;
	struct smc_clc_msg_proposal *pclc;
	struct smc_ib_device *smcibdev;
	struct sockaddr_in peeraddr;
	u8 buf[SMC_CLC_MAX_LEN];
	struct smc_link *link;
	int reason_code = 0;
	int rc = 0, len;
	__be32 subnet;
	u8 prefix_len;
	u8 ibport;

	/* check if peer is smc capable */
	if (!tcp_sk(newclcsock->sk)->syn_smc) {
		new_smc->use_fallback = true;
		goto out_connected;
	}

	/* do inband token exchange -
	 * wait for and receive SMC Proposal CLC message
	 */
	reason_code = smc_clc_wait_msg(new_smc, &buf, sizeof(buf),
				       SMC_CLC_PROPOSAL);
	if (reason_code < 0)
		goto out_err;
	if (reason_code > 0)
		goto decline_rdma;

	/* IPSec connections opt out of SMC-R optimizations */
	if (using_ipsec(new_smc)) {
		reason_code = SMC_CLC_DECL_IPSEC;
		goto decline_rdma;
	}

	/* PNET table lookup: search active ib_device and port
	 * within same PNETID that also contains the ethernet device
	 * used for the internal TCP socket
	 */
	smc_pnet_find_roce_resource(newclcsock->sk, &smcibdev, &ibport);
	if (!smcibdev) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	/* determine subnet and mask from internal TCP socket */
	rc = smc_netinfo_by_tcpsk(newclcsock, &subnet, &prefix_len);
	if (rc) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	pclc = (struct smc_clc_msg_proposal *)&buf;
	pclc_prfx = smc_clc_proposal_get_prefix(pclc);
	if (pclc_prfx->outgoing_subnet != subnet ||
	    pclc_prfx->prefix_len != prefix_len) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	/* get address of the peer connected to the internal TCP socket */
	kernel_getpeername(newclcsock, (struct sockaddr *)&peeraddr, &len);

	/* allocate connection / link group */
	mutex_lock(&smc_create_lgr_pending);
	local_contact = smc_conn_create(new_smc, peeraddr.sin_addr.s_addr,
					smcibdev, ibport, &pclc->lcl, 0);
	if (local_contact < 0) {
		rc = local_contact;
		if (rc == -ENOMEM)
			reason_code = SMC_CLC_DECL_MEM; /* insufficient memory */
		goto decline_rdma_unlock;
	}
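	/* from here on a link group exists: either a new one
	 * (SMC_FIRST_CONTACT) or a reused one (SMC_REUSE_CONTACT); only a
	 * first contact performs link readiness and QP confirmation below
	 */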
	link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];

	/* create send buffer and rmb */
	rc = smc_buf_create(new_smc);
	if (rc) {
		reason_code = SMC_CLC_DECL_MEM;
		goto decline_rdma_unlock;
	}

	smc_close_init(new_smc);
	smc_rx_init(new_smc);

	if (local_contact != SMC_FIRST_CONTACT) {
		struct smc_buf_desc *buf_desc = new_smc->conn.rmb_desc;

		if (!buf_desc->reused) {
			/* register memory region for new rmb */
			rc = smc_wr_reg_send(link,
					     buf_desc->mr_rx[SMC_SINGLE_LINK]);
			if (rc) {
				reason_code = SMC_CLC_DECL_INTERR;
				goto decline_rdma_unlock;
			}
		}
	}
	smc_rmb_sync_sg_for_device(&new_smc->conn);

	rc = smc_clc_send_accept(new_smc, local_contact);
	if (rc)
		goto out_err_unlock;

	/* receive SMC Confirm CLC message */
	reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
				       SMC_CLC_CONFIRM);
	if (reason_code < 0)
		goto out_err_unlock;
	if (reason_code > 0)
		goto decline_rdma_unlock;
	smc_conn_save_peer_info(new_smc, &cclc);
	if (local_contact == SMC_FIRST_CONTACT)
		smc_link_save_peer_info(link, &cclc);

	rc = smc_rmb_rtoken_handling(&new_smc->conn, &cclc);
	if (rc) {
		reason_code = SMC_CLC_DECL_INTERR;
		goto decline_rdma_unlock;
	}

	if (local_contact == SMC_FIRST_CONTACT) {
		rc = smc_ib_ready_link(link);
		if (rc) {
			reason_code = SMC_CLC_DECL_INTERR;
			goto decline_rdma_unlock;
		}
		/* QP confirmation over RoCE fabric */
		reason_code = smc_serv_conf_first_link(new_smc);
		if (reason_code < 0)
			/* peer is not aware of a problem */
			goto out_err_unlock;
		if (reason_code > 0)
			goto decline_rdma_unlock;
	}

	smc_tx_init(new_smc);
	mutex_unlock(&smc_create_lgr_pending);

out_connected:
	sk_refcnt_debug_inc(newsmcsk);
	if (newsmcsk->sk_state == SMC_INIT)
		newsmcsk->sk_state = SMC_ACTIVE;
enqueue:
	lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
	if (lsmc->sk.sk_state == SMC_LISTEN) {
		smc_accept_enqueue(&lsmc->sk, newsmcsk);
	} else { /* no longer listening */
		smc_close_non_accepted(newsmcsk);
	}
	release_sock(&lsmc->sk);

	/* Wake up accept */
	lsmc->sk.sk_data_ready(&lsmc->sk);
	sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
	return;

decline_rdma_unlock:
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(new_smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
decline_rdma:
	/* RDMA setup failed, switch back to TCP */
	smc_conn_free(&new_smc->conn);
	new_smc->use_fallback = true;
	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
		if (smc_clc_send_decline(new_smc, reason_code) < 0)
			goto out_err;
	}
	goto out_connected;

out_err_unlock:
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(new_smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
out_err:
	if (newsmcsk->sk_state == SMC_INIT)
		sock_put(&new_smc->sk); /* passive closing */
	newsmcsk->sk_state = SMC_CLOSED;
	smc_conn_free(&new_smc->conn);
	goto enqueue; /* queue new sock with sk_err set */
}

static void smc_tcp_listen_work(struct work_struct *work)
{
	struct smc_sock *lsmc = container_of(work, struct smc_sock,
					     tcp_listen_work);
	struct sock *lsk = &lsmc->sk;
	struct smc_sock *new_smc;
	int rc = 0;

	lock_sock(lsk);
	while (lsk->sk_state == SMC_LISTEN) {
		rc = smc_clcsock_accept(lsmc, &new_smc);
		if (rc)
			goto out;
		if (!new_smc)
			continue;

		new_smc->listen_smc = lsmc;
		new_smc->use_fallback = false; /* assume rdma capability first */
		sock_hold(lsk); /* sock_put in smc_listen_work */
		INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
		smc_copy_sock_settings_to_smc(new_smc);
		sock_hold(&new_smc->sk); /* sock_put in passive closing */
		if (!schedule_work(&new_smc->smc_listen_work))
			sock_put(&new_smc->sk);
	}

out:
	if (lsmc->clcsock) {
		sock_release(lsmc->clcsock);
		lsmc->clcsock = NULL;
	}
	release_sock(lsk);
	sock_put(&lsmc->sk); /* sock_hold in smc_listen */
}

static int smc_listen(struct socket *sock, int backlog)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sk);
	lock_sock(sk);

	rc = -EINVAL;
	if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
		goto out;

	rc = 0;
	if (sk->sk_state == SMC_LISTEN) {
		sk->sk_max_ack_backlog = backlog;
		goto out;
	}
	/* some socket options are handled in core, so we could not apply
	 * them to the clc socket -- copy smc socket options to clc socket
	 */
	smc_copy_sock_settings_to_clc(smc);
	tcp_sk(smc->clcsock->sk)->syn_smc = 1;

	rc = kernel_listen(smc->clcsock, backlog);
	if (rc)
		goto out;
	sk->sk_max_ack_backlog = backlog;
	sk->sk_ack_backlog = 0;
	sk->sk_state = SMC_LISTEN;
	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
	sock_hold(sk); /* sock_hold in tcp_listen_worker */
	if (!schedule_work(&smc->tcp_listen_work))
		sock_put(sk);

out:
	release_sock(sk);
	return rc;
}

static int smc_accept(struct socket *sock, struct socket *new_sock,
		      int flags, bool kern)
{
	struct sock *sk = sock->sk, *nsk;
	DECLARE_WAITQUEUE(wait, current);
	struct smc_sock *lsmc;
	long timeo;
	int rc = 0;

	lsmc = smc_sk(sk);
	sock_hold(sk); /* sock_put below */
	lock_sock(sk);

	if (lsmc->sk.sk_state != SMC_LISTEN) {
		rc = -EINVAL;
		goto out;
	}

	/* Wait for an incoming connection */
	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
	add_wait_queue_exclusive(sk_sleep(sk), &wait);
	while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (!timeo) {
			rc = -EAGAIN;
			break;
		}
		release_sock(sk);
		timeo = schedule_timeout(timeo);
		/* wakeup by sk_data_ready in smc_listen_work() */
		sched_annotate_sleep();
		lock_sock(sk);
		if (signal_pending(current)) {
			rc = sock_intr_errno(timeo);
			break;
		}
	}
	set_current_state(TASK_RUNNING);
	remove_wait_queue(sk_sleep(sk), &wait);

	if (!rc)
		rc = sock_error(nsk);

out:
	release_sock(sk);
	sock_put(sk); /* sock_hold above */
	return rc;
}

static int smc_getname(struct socket *sock, struct sockaddr *addr,
		       int *len, int peer)
{
	struct smc_sock *smc;

	if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
	    (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
		return -ENOTCONN;

	smc = smc_sk(sock->sk);

	return smc->clcsock->ops->getname(smc->clcsock, addr, len, peer);
}

static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EPIPE;

	smc = smc_sk(sk);
	lock_sock(sk);
	if ((sk->sk_state != SMC_ACTIVE) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
	    (sk->sk_state != SMC_INIT))
		goto out;
	if (smc->use_fallback)
		rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
	else
		rc = smc_tx_sendmsg(smc, msg, len);
out:
	release_sock(sk);
	return rc;
}

static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
		       int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -ENOTCONN;

	smc = smc_sk(sk);
	lock_sock(sk);
	if ((sk->sk_state == SMC_INIT) ||
	    (sk->sk_state == SMC_LISTEN) ||
	    (sk->sk_state == SMC_CLOSED))
		goto out;

	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
		rc = 0;
		goto out;
	}

	if (smc->use_fallback)
		rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
	else
		rc = smc_rx_recvmsg(smc, msg, len, flags);

out:
	release_sock(sk);
	return rc;
}

static __poll_t smc_accept_poll(struct sock *parent)
{
	struct smc_sock *isk = smc_sk(parent);
	__poll_t mask = 0;

	spin_lock(&isk->accept_q_lock);
	if (!list_empty(&isk->accept_q))
		mask = EPOLLIN | EPOLLRDNORM;
	spin_unlock(&isk->accept_q_lock);

	return mask;
}

static __poll_t smc_poll(struct file *file, struct socket *sock,
			 poll_table *wait)
{
	struct sock *sk = sock->sk;
	__poll_t mask = 0;
	struct smc_sock *smc;
	int rc;

	if (!sk)
		return EPOLLNVAL;

	smc = smc_sk(sock->sk);
	sock_hold(sk);
	lock_sock(sk);
	if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
		/* delegate to CLC child sock */
		release_sock(sk);
		mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
		/* if non-blocking connect finished ... */
		lock_sock(sk);
		if ((sk->sk_state == SMC_INIT) && (mask & EPOLLOUT)) {
			sk->sk_err = smc->clcsock->sk->sk_err;
			if (sk->sk_err) {
				mask |= EPOLLERR;
			} else {
				rc = smc_connect_rdma(smc);
				if (rc < 0)
					mask |= EPOLLERR;
				/* success cases including fallback */
				mask |= EPOLLOUT | EPOLLWRNORM;
			}
		}
	} else {
		if (sk->sk_state != SMC_CLOSED) {
			release_sock(sk);
			sock_poll_wait(file, sk_sleep(sk), wait);
			lock_sock(sk);
		}
		if (sk->sk_err)
			mask |= EPOLLERR;
		if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
		    (sk->sk_state == SMC_CLOSED))
			mask |= EPOLLHUP;
		if (sk->sk_state == SMC_LISTEN) {
			/* woken up by sk_data_ready in smc_listen_work() */
			mask = smc_accept_poll(sk);
		} else {
			if (atomic_read(&smc->conn.sndbuf_space) ||
			    sk->sk_shutdown & SEND_SHUTDOWN) {
				mask |= EPOLLOUT | EPOLLWRNORM;
			} else {
				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
			}
			if (atomic_read(&smc->conn.bytes_to_rcv))
				mask |= EPOLLIN | EPOLLRDNORM;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
			if (sk->sk_state == SMC_APPCLOSEWAIT1)
				mask |= EPOLLIN;
		}

	}
	release_sock(sk);
	sock_put(sk);

	return mask;
}

static int smc_shutdown(struct socket *sock, int how)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EINVAL;
	int rc1 = 0;

	smc = smc_sk(sk);

	if ((how < SHUT_RD) || (how > SHUT_RDWR))
		return rc;

	lock_sock(sk);

	rc = -ENOTCONN;
	if ((sk->sk_state != SMC_LISTEN) &&
	    (sk->sk_state != SMC_ACTIVE) &&
	    (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
	    (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT2) &&
	    (sk->sk_state != SMC_APPFINCLOSEWAIT))
		goto out;
	if (smc->use_fallback) {
		rc = kernel_sock_shutdown(smc->clcsock, how);
		sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
		if (sk->sk_shutdown == SHUTDOWN_MASK)
			sk->sk_state = SMC_CLOSED;
		goto out;
	}
	switch (how) {
	case SHUT_RDWR:		/* shutdown in both directions */
		rc = smc_close_active(smc);
		break;
	case SHUT_WR:
		rc = smc_close_shutdown_write(smc);
		break;
	case SHUT_RD:
		if (sk->sk_state == SMC_LISTEN)
			rc = smc_close_active(smc);
		else
			rc = 0;
			/* nothing more to do because peer is not involved */
		break;
	}
	rc1 = kernel_sock_shutdown(smc->clcsock, how);
	/* map sock_shutdown_cmd constants to sk_shutdown value range */
	sk->sk_shutdown |= how + 1;

out:
	release_sock(sk);
	return rc ? rc : rc1;
}
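
/* both setsockopt and getsockopt are forwarded unconditionally to the
 * internal CLC socket, independent of whether the connection uses the
 * fallback or the SMC data path
 */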
static int smc_setsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;

	smc = smc_sk(sk);

	/* generic setsockopts reaching us here always apply to the
	 * CLC socket
	 */
	return smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
					     optval, optlen);
}

static int smc_getsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	struct smc_sock *smc;

	smc = smc_sk(sock->sk);
	/* socket options apply to the CLC socket */
	return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
					     optval, optlen);
}

static int smc_ioctl(struct socket *sock, unsigned int cmd,
		     unsigned long arg)
{
	struct smc_sock *smc;

	smc = smc_sk(sock->sk);
	if (smc->use_fallback)
		return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
	else
		return sock_no_ioctl(sock, cmd, arg);
}

static ssize_t smc_sendpage(struct socket *sock, struct page *page,
			    int offset, size_t size, int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EPIPE;

	smc = smc_sk(sk);
	lock_sock(sk);
	if (sk->sk_state != SMC_ACTIVE)
		goto out;
	if (smc->use_fallback)
		rc = kernel_sendpage(smc->clcsock, page, offset,
				     size, flags);
	else
		rc = sock_no_sendpage(sock, page, offset, size, flags);

out:
	release_sock(sk);
	return rc;
}

static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
			       struct pipe_inode_info *pipe, size_t len,
			       unsigned int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -ENOTCONN;

	smc = smc_sk(sk);
	lock_sock(sk);
	if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED))
		goto out;
	if (smc->use_fallback) {
		rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
						    pipe, len, flags);
	} else {
		rc = -EOPNOTSUPP;
	}
out:
	release_sock(sk);
	return rc;
}

/* must look like tcp */
static const struct proto_ops smc_sock_ops = {
	.family		= PF_SMC,
	.owner		= THIS_MODULE,
	.release	= smc_release,
	.bind		= smc_bind,
	.connect	= smc_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= smc_accept,
	.getname	= smc_getname,
	.poll		= smc_poll,
	.ioctl		= smc_ioctl,
	.listen		= smc_listen,
	.shutdown	= smc_shutdown,
	.setsockopt	= smc_setsockopt,
	.getsockopt	= smc_getsockopt,
	.sendmsg	= smc_sendmsg,
	.recvmsg	= smc_recvmsg,
	.mmap		= sock_no_mmap,
	.sendpage	= smc_sendpage,
	.splice_read	= smc_splice_read,
};

static int smc_create(struct net *net, struct socket *sock, int protocol,
		      int kern)
{
	struct smc_sock *smc;
	struct sock *sk;
	int rc;

	rc = -ESOCKTNOSUPPORT;
	if (sock->type != SOCK_STREAM)
		goto out;

	rc = -EPROTONOSUPPORT;
	if ((protocol != IPPROTO_IP) && (protocol != IPPROTO_TCP))
		goto out;

	rc = -ENOBUFS;
	sock->ops = &smc_sock_ops;
	sk = smc_sock_alloc(net, sock);
	if (!sk)
		goto out;

	/* create internal TCP socket for CLC handshake and fallback */
	smc = smc_sk(sk);
	smc->use_fallback = false; /* assume rdma capability first */
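	/* the CLC socket created here carries the TCP-based CLC handshake
	 * and, for peers without SMC capability, the fallback data path;
	 * it is released again in smc_release()
	 */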
	rc = sock_create_kern(net, PF_INET, SOCK_STREAM,
			      IPPROTO_TCP, &smc->clcsock);
	if (rc) {
		sk_common_release(sk);
		goto out;
	}
	smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
	smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);

out:
	return rc;
}

static const struct net_proto_family smc_sock_family_ops = {
	.family	= PF_SMC,
	.owner	= THIS_MODULE,
	.create	= smc_create,
};

static int __init smc_init(void)
{
	int rc;

	rc = smc_pnet_init();
	if (rc)
		return rc;

	rc = smc_llc_init();
	if (rc) {
		pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = smc_cdc_init();
	if (rc) {
		pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = proto_register(&smc_proto, 1);
	if (rc) {
		pr_err("%s: proto_register fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = sock_register(&smc_sock_family_ops);
	if (rc) {
		pr_err("%s: sock_register fails with %d\n", __func__, rc);
		goto out_proto;
	}
	INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);

	rc = smc_ib_register_client();
	if (rc) {
		pr_err("%s: ib_register fails with %d\n", __func__, rc);
		goto out_sock;
	}

	static_branch_enable(&tcp_have_smc);
	return 0;

out_sock:
	sock_unregister(PF_SMC);
out_proto:
	proto_unregister(&smc_proto);
out_pnet:
	smc_pnet_exit();
	return rc;
}

static void __exit smc_exit(void)
{
	struct smc_link_group *lgr, *lg;
	LIST_HEAD(lgr_freeing_list);

	spin_lock_bh(&smc_lgr_list.lock);
	if (!list_empty(&smc_lgr_list.list))
		list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
	spin_unlock_bh(&smc_lgr_list.lock);
	list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
		list_del_init(&lgr->list);
		smc_lgr_free(lgr); /* free link group */
	}
	static_branch_disable(&tcp_have_smc);
	smc_ib_unregister_client();
	sock_unregister(PF_SMC);
	proto_unregister(&smc_proto);
	smc_pnet_exit();
}

module_init(smc_init);
module_exit(smc_exit);

MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
MODULE_DESCRIPTION("smc socket address family");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_SMC);