/*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 * AF_SMC protocol family socket handler keeping the AF_INET sock address type
 * applies to SOCK_STREAM sockets only
 * offers an alternative communication option for TCP-protocol sockets
 * applicable with RoCE-cards only
 *
 * Initial restrictions:
 *   - non-blocking connect postponed
 *   - IPv6 support postponed
 *   - support for alternate links postponed
 *   - partial support for non-blocking sockets only
 *   - support for urgent data postponed
 *
 * Copyright IBM Corp. 2016
 *
 * Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
 *             based on prototype from Frank Blaschka
 */

#define KMSG_COMPONENT "smc"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/module.h>
#include <linux/socket.h>
#include <linux/inetdevice.h>
#include <linux/workqueue.h>
#include <linux/in.h>
#include <linux/sched/signal.h>

#include <net/sock.h>
#include <net/tcp.h>
#include <net/smc.h>

#include "smc.h"
#include "smc_clc.h"
#include "smc_llc.h"
#include "smc_cdc.h"
#include "smc_core.h"
#include "smc_ib.h"
#include "smc_pnet.h"
#include "smc_tx.h"
#include "smc_rx.h"
#include "smc_close.h"

static DEFINE_MUTEX(smc_create_lgr_pending);	/* serialize link group
						 * creation
						 */

struct smc_lgr_list smc_lgr_list = {	/* established link groups */
	.lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
	.list = LIST_HEAD_INIT(smc_lgr_list.list),
};

static void smc_tcp_listen_work(struct work_struct *);

static void smc_set_keepalive(struct sock *sk, int val)
{
	struct smc_sock *smc = smc_sk(sk);

	smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
}

static struct smc_hashinfo smc_v4_hashinfo = {
	.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
};

int smc_hash_sk(struct sock *sk)
{
	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
	struct hlist_head *head;

	head = &h->ht;

	write_lock_bh(&h->lock);
	sk_add_node(sk, head);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	write_unlock_bh(&h->lock);

	return 0;
}
EXPORT_SYMBOL_GPL(smc_hash_sk);

void smc_unhash_sk(struct sock *sk)
{
	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;

	write_lock_bh(&h->lock);
	if (sk_del_node_init(sk))
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	write_unlock_bh(&h->lock);
}
EXPORT_SYMBOL_GPL(smc_unhash_sk);

struct proto smc_proto = {
	.name		= "SMC",
	.owner		= THIS_MODULE,
	.keepalive	= smc_set_keepalive,
	.hash		= smc_hash_sk,
	.unhash		= smc_unhash_sk,
	.obj_size	= sizeof(struct smc_sock),
	.h.smc_hash	= &smc_v4_hashinfo,
	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
};
EXPORT_SYMBOL_GPL(smc_proto);
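
/* release an smc socket: terminate the SMC connection (unless the socket
 * runs in TCP fallback mode), release the internal CLC/TCP socket, and
 * detach and orphan the smc sock itself
 */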
static int smc_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = 0;

	if (!sk)
		goto out;

	smc = smc_sk(sk);
	if (sk->sk_state == SMC_LISTEN)
		/* smc_close_non_accepted() is called and acquires
		 * sock lock for child sockets again
		 */
		lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
	else
		lock_sock(sk);

	if (!smc->use_fallback) {
		rc = smc_close_active(smc);
		sock_set_flag(sk, SOCK_DEAD);
		sk->sk_shutdown |= SHUTDOWN_MASK;
	}
	if (smc->clcsock) {
		sock_release(smc->clcsock);
		smc->clcsock = NULL;
	}
	if (smc->use_fallback) {
		sock_put(sk); /* passive closing */
		sk->sk_state = SMC_CLOSED;
		sk->sk_state_change(sk);
	}

	/* detach socket */
	sock_orphan(sk);
	sock->sk = NULL;
	if (!smc->use_fallback && sk->sk_state == SMC_CLOSED)
		smc_conn_free(&smc->conn);
	release_sock(sk);

	sk->sk_prot->unhash(sk);
	sock_put(sk); /* final sock_put */
out:
	return rc;
}

static void smc_destruct(struct sock *sk)
{
	if (sk->sk_state != SMC_CLOSED)
		return;
	if (!sock_flag(sk, SOCK_DEAD))
		return;

	sk_refcnt_debug_dec(sk);
}

static struct sock *smc_sock_alloc(struct net *net, struct socket *sock)
{
	struct smc_sock *smc;
	struct sock *sk;

	sk = sk_alloc(net, PF_SMC, GFP_KERNEL, &smc_proto, 0);
	if (!sk)
		return NULL;

	sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
	sk->sk_state = SMC_INIT;
	sk->sk_destruct = smc_destruct;
	sk->sk_protocol = SMCPROTO_SMC;
	smc = smc_sk(sk);
	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
	INIT_LIST_HEAD(&smc->accept_q);
	spin_lock_init(&smc->accept_q_lock);
	sk->sk_prot->hash(sk);
	sk_refcnt_debug_inc(sk);

	return sk;
}

static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
		    int addr_len)
{
	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sk);

	/* replicate tests from inet_bind(), to be safe wrt. future changes */
	rc = -EINVAL;
	if (addr_len < sizeof(struct sockaddr_in))
		goto out;

	rc = -EAFNOSUPPORT;
	/* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
	if ((addr->sin_family != AF_INET) &&
	    ((addr->sin_family != AF_UNSPEC) ||
	     (addr->sin_addr.s_addr != htonl(INADDR_ANY))))
		goto out;

	lock_sock(sk);

	/* Check if socket is already active */
	rc = -EINVAL;
	if (sk->sk_state != SMC_INIT)
		goto out_rel;

	smc->clcsock->sk->sk_reuse = sk->sk_reuse;
	rc = kernel_bind(smc->clcsock, uaddr, addr_len);

out_rel:
	release_sock(sk);
out:
	return rc;
}

static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
				   unsigned long mask)
{
	/* options we don't get control via setsockopt for */
	nsk->sk_type = osk->sk_type;
	nsk->sk_sndbuf = osk->sk_sndbuf;
	nsk->sk_rcvbuf = osk->sk_rcvbuf;
	nsk->sk_sndtimeo = osk->sk_sndtimeo;
	nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
	nsk->sk_mark = osk->sk_mark;
	nsk->sk_priority = osk->sk_priority;
	nsk->sk_rcvlowat = osk->sk_rcvlowat;
	nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
	nsk->sk_err = osk->sk_err;

	nsk->sk_flags &= ~mask;
	nsk->sk_flags |= osk->sk_flags & mask;
}

#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_BROADCAST) | \
			     (1UL << SOCK_TIMESTAMP) | \
			     (1UL << SOCK_DBG) | \
			     (1UL << SOCK_RCVTSTAMP) | \
			     (1UL << SOCK_RCVTSTAMPNS) | \
			     (1UL << SOCK_LOCALROUTE) | \
			     (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
			     (1UL << SOCK_RXQ_OVFL) | \
			     (1UL << SOCK_WIFI_STATUS) | \
			     (1UL << SOCK_NOFCS) | \
			     (1UL << SOCK_FILTER_LOCKED))
/* copy only relevant settings and flags of SOL_SOCKET level from smc to
 * clc socket (since smc is not called for these options from net/core)
 */
static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
{
	smc_copy_sock_settings(smc->clcsock->sk, &smc->sk,
			       SK_FLAGS_SMC_TO_CLC);
}

#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_DBG))
/* copy only settings and flags relevant for smc from clc to smc socket */
static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
{
	smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
}
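
/* smc_netinfo_by_tcpsk() supplies the IPv4 subnet of the internal TCP
 * socket; smc_listen_work() below compares it against the subnet carried
 * in the peer's CLC proposal and declines SMC on a mismatch
 */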
/* determine subnet and mask of internal TCP socket */
int smc_netinfo_by_tcpsk(struct socket *clcsock,
			 __be32 *subnet, u8 *prefix_len)
{
	struct dst_entry *dst = sk_dst_get(clcsock->sk);
	struct in_device *in_dev;
	struct sockaddr_in addr;
	int rc = -ENOENT;

	if (!dst) {
		rc = -ENOTCONN;
		goto out;
	}
	if (!dst->dev) {
		rc = -ENODEV;
		goto out_rel;
	}

	/* get address to which the internal TCP socket is bound */
	kernel_getsockname(clcsock, (struct sockaddr *)&addr);
	/* analyze IPv4 specific data of net_device belonging to TCP socket */
	rcu_read_lock();
	in_dev = __in_dev_get_rcu(dst->dev);
	for_ifa(in_dev) {
		if (!inet_ifa_match(addr.sin_addr.s_addr, ifa))
			continue;
		*prefix_len = inet_mask_len(ifa->ifa_mask);
		*subnet = ifa->ifa_address & ifa->ifa_mask;
		rc = 0;
		break;
	} endfor_ifa(in_dev);
	rcu_read_unlock();

out_rel:
	dst_release(dst);
out:
	return rc;
}
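
/* client side of the LLC handshake for the first link of a new link group:
 * wait for the server's CONFIRM LINK request, bring the QP to RTS state,
 * register the receive buffer, and answer with a CONFIRM LINK response
 */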
static int smc_clnt_conf_first_link(struct smc_sock *smc, union ib_gid *gid)
{
	struct smc_link_group *lgr = smc->conn.lgr;
	struct smc_link *link;
	int rest;
	int rc;

	link = &lgr->lnk[SMC_SINGLE_LINK];
	/* receive CONFIRM LINK request from server over RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(
		&link->llc_confirm,
		SMC_LLC_WAIT_FIRST_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	rc = smc_ib_modify_qp_rts(link);
	if (rc)
		return SMC_CLC_DECL_INTERR;

	smc_wr_remember_qp_attr(link);

	rc = smc_wr_reg_send(link,
			     smc->conn.rmb_desc->mr_rx[SMC_SINGLE_LINK]);
	if (rc)
		return SMC_CLC_DECL_INTERR;

	/* send CONFIRM LINK response over RoCE fabric */
	rc = smc_llc_send_confirm_link(link,
				       link->smcibdev->mac[link->ibport - 1],
				       gid, SMC_LLC_RESP);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	return rc;
}

static void smc_conn_save_peer_info(struct smc_sock *smc,
				    struct smc_clc_msg_accept_confirm *clc)
{
	smc->conn.peer_conn_idx = clc->conn_idx;
	smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
	smc->conn.peer_rmbe_size = smc_uncompress_bufsize(clc->rmbe_size);
	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
}

static void smc_link_save_peer_info(struct smc_link *link,
				    struct smc_clc_msg_accept_confirm *clc)
{
	link->peer_qpn = ntoh24(clc->qpn);
	memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
	memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
	link->peer_psn = ntoh24(clc->psn);
	link->peer_mtu = clc->qp_mtu;
}

static void smc_lgr_forget(struct smc_link_group *lgr)
{
	spin_lock_bh(&smc_lgr_list.lock);
	/* do not use this link group for new connections */
	if (!list_empty(&lgr->list))
		list_del_init(&lgr->list);
	spin_unlock_bh(&smc_lgr_list.lock);
}

/* setup for RDMA connection of client */
static int smc_connect_rdma(struct smc_sock *smc)
{
	struct sockaddr_in *inaddr = (struct sockaddr_in *)smc->addr;
	struct smc_clc_msg_accept_confirm aclc;
	int local_contact = SMC_FIRST_CONTACT;
	struct smc_ib_device *smcibdev;
	struct smc_link *link;
	u8 srv_first_contact;
	int reason_code = 0;
	int rc = 0;
	u8 ibport;

	sock_hold(&smc->sk); /* sock put in passive closing */

	if (!tcp_sk(smc->clcsock->sk)->syn_smc) {
		/* peer has not signalled SMC-capability */
		smc->use_fallback = true;
		goto out_connected;
	}

	/* IPSec connections opt out of SMC-R optimizations */
	if (using_ipsec(smc)) {
		reason_code = SMC_CLC_DECL_IPSEC;
		goto decline_rdma;
	}

	/* PNET table look up: search active ib_device and port
	 * within same PNETID that also contains the ethernet device
	 * used for the internal TCP socket
	 */
	smc_pnet_find_roce_resource(smc->clcsock->sk, &smcibdev, &ibport);
	if (!smcibdev) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	/* do inband token exchange */
	reason_code = smc_clc_send_proposal(smc, smcibdev, ibport);
	if (reason_code < 0) {
		rc = reason_code;
		goto out_err;
	}
	if (reason_code > 0) /* configuration error */
		goto decline_rdma;
	/* receive SMC Accept CLC message */
	reason_code = smc_clc_wait_msg(smc, &aclc, sizeof(aclc),
				       SMC_CLC_ACCEPT);
	if (reason_code < 0) {
		rc = reason_code;
		goto out_err;
	}
	if (reason_code > 0)
		goto decline_rdma;

	srv_first_contact = aclc.hdr.flag;
	mutex_lock(&smc_create_lgr_pending);
	local_contact = smc_conn_create(smc, inaddr->sin_addr.s_addr, smcibdev,
					ibport, &aclc.lcl, srv_first_contact);
	if (local_contact < 0) {
		rc = local_contact;
		if (rc == -ENOMEM)
			reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
		else if (rc == -ENOLINK)
			reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
		goto decline_rdma_unlock;
	}
	link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];

	smc_conn_save_peer_info(smc, &aclc);

	/* create send buffer and rmb */
	rc = smc_buf_create(smc);
	if (rc) {
		reason_code = SMC_CLC_DECL_MEM;
		goto decline_rdma_unlock;
	}

	if (local_contact == SMC_FIRST_CONTACT)
		smc_link_save_peer_info(link, &aclc);

	rc = smc_rmb_rtoken_handling(&smc->conn, &aclc);
	if (rc) {
		reason_code = SMC_CLC_DECL_INTERR;
		goto decline_rdma_unlock;
	}

	smc_close_init(smc);
	smc_rx_init(smc);

	if (local_contact == SMC_FIRST_CONTACT) {
		rc = smc_ib_ready_link(link);
		if (rc) {
			reason_code = SMC_CLC_DECL_INTERR;
			goto decline_rdma_unlock;
		}
	} else {
		struct smc_buf_desc *buf_desc = smc->conn.rmb_desc;

		if (!buf_desc->reused) {
			/* register memory region for new rmb */
			rc = smc_wr_reg_send(link,
					     buf_desc->mr_rx[SMC_SINGLE_LINK]);
			if (rc) {
				reason_code = SMC_CLC_DECL_INTERR;
				goto decline_rdma_unlock;
			}
		}
	}
	smc_rmb_sync_sg_for_device(&smc->conn);

	rc = smc_clc_send_confirm(smc);
	if (rc)
		goto out_err_unlock;

	if (local_contact == SMC_FIRST_CONTACT) {
		/* QP confirmation over RoCE fabric */
		reason_code = smc_clnt_conf_first_link(
			smc, &smcibdev->gid[ibport - 1]);
		if (reason_code < 0) {
			rc = reason_code;
			goto out_err_unlock;
		}
		if (reason_code > 0)
			goto decline_rdma_unlock;
	}

	mutex_unlock(&smc_create_lgr_pending);
	smc_tx_init(smc);

out_connected:
	smc_copy_sock_settings_to_clc(smc);
	if (smc->sk.sk_state == SMC_INIT)
		smc->sk.sk_state = SMC_ACTIVE;

	return rc ? rc : local_contact;

decline_rdma_unlock:
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
	smc_conn_free(&smc->conn);
decline_rdma:
	/* RDMA setup failed, switch back to TCP */
	smc->use_fallback = true;
	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
		rc = smc_clc_send_decline(smc, reason_code);
		if (rc < 0)
			goto out_err;
	}
	goto out_connected;

out_err_unlock:
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
	smc_conn_free(&smc->conn);
out_err:
	if (smc->sk.sk_state == SMC_INIT)
		sock_put(&smc->sk); /* passive closing */
	return rc;
}
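
/* connect the smc socket: establish the internal TCP connection first,
 * setting syn_smc so the TCP handshake signals SMC capability, then try
 * to upgrade it to an SMC-R connection in smc_connect_rdma()
 */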
static int smc_connect(struct socket *sock, struct sockaddr *addr,
		       int alen, int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EINVAL;

	smc = smc_sk(sk);

	/* separate smc parameter checking to be safe */
	if (alen < sizeof(addr->sa_family))
		goto out_err;
	if (addr->sa_family != AF_INET)
		goto out_err;
	smc->addr = addr; /* needed for nonblocking connect */

	lock_sock(sk);
	switch (sk->sk_state) {
	default:
		goto out;
	case SMC_ACTIVE:
		rc = -EISCONN;
		goto out;
	case SMC_INIT:
		rc = 0;
		break;
	}

	smc_copy_sock_settings_to_clc(smc);
	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
	rc = kernel_connect(smc->clcsock, addr, alen, flags);
	if (rc)
		goto out;

	/* setup RDMA connection */
	rc = smc_connect_rdma(smc);
	if (rc < 0)
		goto out;
	else
		rc = 0; /* success cases including fallback */

out:
	release_sock(sk);
out_err:
	return rc;
}
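
/* accept a new connection on the internal CLC/TCP socket and allocate a
 * matching smc sock for it; the listen sock is unlocked while blocking
 * in kernel_accept()
 */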
static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
{
	struct socket *new_clcsock = NULL;
	struct sock *lsk = &lsmc->sk;
	struct sock *new_sk;
	int rc;

	release_sock(lsk);
	new_sk = smc_sock_alloc(sock_net(lsk), NULL);
	if (!new_sk) {
		rc = -ENOMEM;
		lsk->sk_err = ENOMEM;
		*new_smc = NULL;
		lock_sock(lsk);
		goto out;
	}
	*new_smc = smc_sk(new_sk);

	rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
	lock_sock(lsk);
	if (rc < 0)
		lsk->sk_err = -rc;
	if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
		if (new_clcsock)
			sock_release(new_clcsock);
		new_sk->sk_state = SMC_CLOSED;
		sock_set_flag(new_sk, SOCK_DEAD);
		new_sk->sk_prot->unhash(new_sk);
		sock_put(new_sk); /* final */
		*new_smc = NULL;
		goto out;
	}

	(*new_smc)->clcsock = new_clcsock;
out:
	return rc;
}

/* add a just created sock to the accept queue of the listen sock as
 * candidate for a following socket accept call from user space
 */
static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
{
	struct smc_sock *par = smc_sk(parent);

	sock_hold(sk); /* sock_put in smc_accept_unlink() */
	spin_lock(&par->accept_q_lock);
	list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
	spin_unlock(&par->accept_q_lock);
	sk_acceptq_added(parent);
}

/* remove a socket from the accept queue of its parental listening socket */
static void smc_accept_unlink(struct sock *sk)
{
	struct smc_sock *par = smc_sk(sk)->listen_smc;

	spin_lock(&par->accept_q_lock);
	list_del_init(&smc_sk(sk)->accept_q);
	spin_unlock(&par->accept_q_lock);
	sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
	sock_put(sk); /* sock_hold in smc_accept_enqueue */
}

/* remove a sock from the accept queue to bind it to a new socket created
 * for a socket accept call from user space
 */
struct sock *smc_accept_dequeue(struct sock *parent,
				struct socket *new_sock)
{
	struct smc_sock *isk, *n;
	struct sock *new_sk;

	list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
		new_sk = (struct sock *)isk;

		smc_accept_unlink(new_sk);
		if (new_sk->sk_state == SMC_CLOSED) {
			if (isk->clcsock) {
				sock_release(isk->clcsock);
				isk->clcsock = NULL;
			}
			new_sk->sk_prot->unhash(new_sk);
			sock_put(new_sk); /* final */
			continue;
		}
		if (new_sock)
			sock_graft(new_sk, new_sock);
		return new_sk;
	}
	return NULL;
}

/* clean up for a created but never accepted sock */
void smc_close_non_accepted(struct sock *sk)
{
	struct smc_sock *smc = smc_sk(sk);

	lock_sock(sk);
	if (!sk->sk_lingertime)
		/* wait for peer closing */
		sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
	if (!smc->use_fallback) {
		smc_close_active(smc);
		sock_set_flag(sk, SOCK_DEAD);
		sk->sk_shutdown |= SHUTDOWN_MASK;
	}
	if (smc->clcsock) {
		struct socket *tcp;

		tcp = smc->clcsock;
		smc->clcsock = NULL;
		sock_release(tcp);
	}
	if (smc->use_fallback) {
		sock_put(sk); /* passive closing */
		sk->sk_state = SMC_CLOSED;
	} else {
		if (sk->sk_state == SMC_CLOSED)
			smc_conn_free(&smc->conn);
	}
	release_sock(sk);
	sk->sk_prot->unhash(sk);
	sock_put(sk); /* final sock_put */
}
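
/* server side of the LLC handshake for the first link of a new link group:
 * register the receive buffer, send the CONFIRM LINK request over the RoCE
 * fabric, and wait for the client's CONFIRM LINK response
 */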
static int smc_serv_conf_first_link(struct smc_sock *smc)
{
	struct smc_link_group *lgr = smc->conn.lgr;
	struct smc_link *link;
	int rest;
	int rc;

	link = &lgr->lnk[SMC_SINGLE_LINK];

	rc = smc_wr_reg_send(link,
			     smc->conn.rmb_desc->mr_rx[SMC_SINGLE_LINK]);
	if (rc)
		return SMC_CLC_DECL_INTERR;

	/* send CONFIRM LINK request to client over the RoCE fabric */
	rc = smc_llc_send_confirm_link(link,
				       link->smcibdev->mac[link->ibport - 1],
				       &link->smcibdev->gid[link->ibport - 1],
				       SMC_LLC_REQ);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	/* receive CONFIRM LINK response from client over the RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(
		&link->llc_confirm_resp,
		SMC_LLC_WAIT_FIRST_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
	}

	return rc;
}
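
/* server side of the CLC handshake, run once per accepted connection:
 * wait for the SMC Proposal, check for a usable RoCE device and a matching
 * IPv4 subnet, send an SMC Accept, and wait for the SMC Confirm; on failure
 * a CLC Decline is sent where appropriate and the sock falls back to TCP
 */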
/* setup for RDMA connection of server */
static void smc_listen_work(struct work_struct *work)
{
	struct smc_sock *new_smc = container_of(work, struct smc_sock,
						smc_listen_work);
	struct smc_clc_msg_proposal_prefix *pclc_prfx;
	struct socket *newclcsock = new_smc->clcsock;
	struct smc_sock *lsmc = new_smc->listen_smc;
	struct smc_clc_msg_accept_confirm cclc;
	int local_contact = SMC_REUSE_CONTACT;
	struct sock *newsmcsk = &new_smc->sk;
	struct smc_clc_msg_proposal *pclc;
	struct smc_ib_device *smcibdev;
	struct sockaddr_in peeraddr;
	u8 buf[SMC_CLC_MAX_LEN];
	struct smc_link *link;
	int reason_code = 0;
	int rc = 0;
	__be32 subnet;
	u8 prefix_len;
	u8 ibport;

	/* check if peer is smc capable */
	if (!tcp_sk(newclcsock->sk)->syn_smc) {
		new_smc->use_fallback = true;
		goto out_connected;
	}

	/* do inband token exchange -
	 * wait for and receive SMC Proposal CLC message
	 */
	reason_code = smc_clc_wait_msg(new_smc, &buf, sizeof(buf),
				       SMC_CLC_PROPOSAL);
	if (reason_code < 0)
		goto out_err;
	if (reason_code > 0)
		goto decline_rdma;

	/* IPSec connections opt out of SMC-R optimizations */
	if (using_ipsec(new_smc)) {
		reason_code = SMC_CLC_DECL_IPSEC;
		goto decline_rdma;
	}

	/* PNET table look up: search active ib_device and port
	 * within same PNETID that also contains the ethernet device
	 * used for the internal TCP socket
	 */
	smc_pnet_find_roce_resource(newclcsock->sk, &smcibdev, &ibport);
	if (!smcibdev) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	/* determine subnet and mask from internal TCP socket */
	rc = smc_netinfo_by_tcpsk(newclcsock, &subnet, &prefix_len);
	if (rc) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	pclc = (struct smc_clc_msg_proposal *)&buf;
	pclc_prfx = smc_clc_proposal_get_prefix(pclc);
	if (pclc_prfx->outgoing_subnet != subnet ||
	    pclc_prfx->prefix_len != prefix_len) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	/* get address of the peer connected to the internal TCP socket */
	kernel_getpeername(newclcsock, (struct sockaddr *)&peeraddr);

	/* allocate connection / link group */
	mutex_lock(&smc_create_lgr_pending);
	local_contact = smc_conn_create(new_smc, peeraddr.sin_addr.s_addr,
					smcibdev, ibport, &pclc->lcl, 0);
	if (local_contact < 0) {
		rc = local_contact;
		if (rc == -ENOMEM)
			reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
		goto decline_rdma_unlock;
	}
	link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];

	/* create send buffer and rmb */
	rc = smc_buf_create(new_smc);
	if (rc) {
		reason_code = SMC_CLC_DECL_MEM;
		goto decline_rdma_unlock;
	}

	smc_close_init(new_smc);
	smc_rx_init(new_smc);

	if (local_contact != SMC_FIRST_CONTACT) {
		struct smc_buf_desc *buf_desc = new_smc->conn.rmb_desc;

		if (!buf_desc->reused) {
			/* register memory region for new rmb */
			rc = smc_wr_reg_send(link,
					     buf_desc->mr_rx[SMC_SINGLE_LINK]);
			if (rc) {
				reason_code = SMC_CLC_DECL_INTERR;
				goto decline_rdma_unlock;
			}
		}
	}
	smc_rmb_sync_sg_for_device(&new_smc->conn);

	rc = smc_clc_send_accept(new_smc, local_contact);
	if (rc)
		goto out_err_unlock;

	/* receive SMC Confirm CLC message */
	reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
				       SMC_CLC_CONFIRM);
	if (reason_code < 0)
		goto out_err_unlock;
	if (reason_code > 0)
		goto decline_rdma_unlock;
	smc_conn_save_peer_info(new_smc, &cclc);
	if (local_contact == SMC_FIRST_CONTACT)
		smc_link_save_peer_info(link, &cclc);

	rc = smc_rmb_rtoken_handling(&new_smc->conn, &cclc);
	if (rc) {
		reason_code = SMC_CLC_DECL_INTERR;
		goto decline_rdma_unlock;
	}

	if (local_contact == SMC_FIRST_CONTACT) {
		rc = smc_ib_ready_link(link);
		if (rc) {
			reason_code = SMC_CLC_DECL_INTERR;
			goto decline_rdma_unlock;
		}
		/* QP confirmation over RoCE fabric */
		reason_code = smc_serv_conf_first_link(new_smc);
		if (reason_code < 0)
			/* peer is not aware of a problem */
			goto out_err_unlock;
		if (reason_code > 0)
			goto decline_rdma_unlock;
	}

	smc_tx_init(new_smc);
	mutex_unlock(&smc_create_lgr_pending);

out_connected:
	sk_refcnt_debug_inc(newsmcsk);
	if (newsmcsk->sk_state == SMC_INIT)
		newsmcsk->sk_state = SMC_ACTIVE;
enqueue:
	lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
	if (lsmc->sk.sk_state == SMC_LISTEN) {
		smc_accept_enqueue(&lsmc->sk, newsmcsk);
	} else { /* no longer listening */
		smc_close_non_accepted(newsmcsk);
	}
	release_sock(&lsmc->sk);

	/* Wake up accept */
	lsmc->sk.sk_data_ready(&lsmc->sk);
	sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
	return;

decline_rdma_unlock:
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(new_smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
decline_rdma:
	/* RDMA setup failed, switch back to TCP */
	smc_conn_free(&new_smc->conn);
	new_smc->use_fallback = true;
	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
		if (smc_clc_send_decline(new_smc, reason_code) < 0)
			goto out_err;
	}
	goto out_connected;

out_err_unlock:
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(new_smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
out_err:
	if (newsmcsk->sk_state == SMC_INIT)
		sock_put(&new_smc->sk); /* passive closing */
	newsmcsk->sk_state = SMC_CLOSED;
	smc_conn_free(&new_smc->conn);
	goto enqueue; /* queue new sock with sk_err set */
}
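
/* listen worker: accept incoming connections on the internal CLC/TCP
 * socket and schedule a separate smc_listen_work instance for each
 */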
static void smc_tcp_listen_work(struct work_struct *work)
{
	struct smc_sock *lsmc = container_of(work, struct smc_sock,
					     tcp_listen_work);
	struct sock *lsk = &lsmc->sk;
	struct smc_sock *new_smc;
	int rc = 0;

	lock_sock(lsk);
	while (lsk->sk_state == SMC_LISTEN) {
		rc = smc_clcsock_accept(lsmc, &new_smc);
		if (rc)
			goto out;
		if (!new_smc)
			continue;

		new_smc->listen_smc = lsmc;
		new_smc->use_fallback = false; /* assume rdma capability first*/
		sock_hold(lsk); /* sock_put in smc_listen_work */
		INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
		smc_copy_sock_settings_to_smc(new_smc);
		sock_hold(&new_smc->sk); /* sock_put in passive closing */
		if (!schedule_work(&new_smc->smc_listen_work))
			sock_put(&new_smc->sk);
	}

out:
	if (lsmc->clcsock) {
		sock_release(lsmc->clcsock);
		lsmc->clcsock = NULL;
	}
	release_sock(lsk);
	/* no more listening, wake up smc_close_wait_listen_clcsock and
	 * accept
	 */
	lsk->sk_state_change(lsk);
	sock_put(&lsmc->sk); /* sock_hold in smc_listen */
}

static int smc_listen(struct socket *sock, int backlog)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sk);
	lock_sock(sk);

	rc = -EINVAL;
	if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
		goto out;

	rc = 0;
	if (sk->sk_state == SMC_LISTEN) {
		sk->sk_max_ack_backlog = backlog;
		goto out;
	}
	/* some socket options are handled in core, so we could not apply
	 * them to the clc socket -- copy smc socket options to clc socket
	 */
	smc_copy_sock_settings_to_clc(smc);
	tcp_sk(smc->clcsock->sk)->syn_smc = 1;

	rc = kernel_listen(smc->clcsock, backlog);
	if (rc)
		goto out;
	sk->sk_max_ack_backlog = backlog;
	sk->sk_ack_backlog = 0;
	sk->sk_state = SMC_LISTEN;
	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
	sock_hold(sk); /* sock_hold in tcp_listen_worker */
	if (!schedule_work(&smc->tcp_listen_work))
		sock_put(sk);

out:
	release_sock(sk);
	return rc;
}
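
/* dequeue a child sock enqueued by smc_listen_work(); waits on the
 * parent's wait queue until a child is available or the receive timeout
 * expires
 */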
static int smc_accept(struct socket *sock, struct socket *new_sock,
		      int flags, bool kern)
{
	struct sock *sk = sock->sk, *nsk;
	DECLARE_WAITQUEUE(wait, current);
	struct smc_sock *lsmc;
	long timeo;
	int rc = 0;

	lsmc = smc_sk(sk);
	sock_hold(sk); /* sock_put below */
	lock_sock(sk);

	if (lsmc->sk.sk_state != SMC_LISTEN) {
		rc = -EINVAL;
		goto out;
	}

	/* Wait for an incoming connection */
	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
	add_wait_queue_exclusive(sk_sleep(sk), &wait);
	while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (!timeo) {
			rc = -EAGAIN;
			break;
		}
		release_sock(sk);
		timeo = schedule_timeout(timeo);
		/* wakeup by sk_data_ready in smc_listen_work() */
		sched_annotate_sleep();
		lock_sock(sk);
		if (signal_pending(current)) {
			rc = sock_intr_errno(timeo);
			break;
		}
	}
	set_current_state(TASK_RUNNING);
	remove_wait_queue(sk_sleep(sk), &wait);

	if (!rc)
		rc = sock_error(nsk);

out:
	release_sock(sk);
	sock_put(sk); /* sock_hold above */
	return rc;
}

static int smc_getname(struct socket *sock, struct sockaddr *addr,
		       int peer)
{
	struct smc_sock *smc;

	if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
	    (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
		return -ENOTCONN;

	smc = smc_sk(sock->sk);

	return smc->clcsock->ops->getname(smc->clcsock, addr, peer);
}

static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EPIPE;

	smc = smc_sk(sk);
	lock_sock(sk);
	if ((sk->sk_state != SMC_ACTIVE) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
	    (sk->sk_state != SMC_INIT))
		goto out;
	if (smc->use_fallback)
		rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
	else
		rc = smc_tx_sendmsg(smc, msg, len);
out:
	release_sock(sk);
	return rc;
}

static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
		       int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -ENOTCONN;

	smc = smc_sk(sk);
	lock_sock(sk);
	if ((sk->sk_state == SMC_INIT) ||
	    (sk->sk_state == SMC_LISTEN) ||
	    (sk->sk_state == SMC_CLOSED))
		goto out;

	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
		rc = 0;
		goto out;
	}

	if (smc->use_fallback)
		rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
	else
		rc = smc_rx_recvmsg(smc, msg, len, flags);

out:
	release_sock(sk);
	return rc;
}

static __poll_t smc_accept_poll(struct sock *parent)
{
	struct smc_sock *isk = smc_sk(parent);
	__poll_t mask = 0;

	spin_lock(&isk->accept_q_lock);
	if (!list_empty(&isk->accept_q))
		mask = EPOLLIN | EPOLLRDNORM;
	spin_unlock(&isk->accept_q_lock);

	return mask;
}
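
/* poll an smc socket; in SMC_INIT state and in fallback mode polling is
 * delegated to the internal CLC/TCP socket, and a non-blocking connect
 * that became ready is completed here by calling smc_connect_rdma()
 */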
static __poll_t smc_poll(struct file *file, struct socket *sock,
			 poll_table *wait)
{
	struct sock *sk = sock->sk;
	__poll_t mask = 0;
	struct smc_sock *smc;
	int rc;

	if (!sk)
		return EPOLLNVAL;

	smc = smc_sk(sock->sk);
	sock_hold(sk);
	lock_sock(sk);
	if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
		/* delegate to CLC child sock */
		release_sock(sk);
		mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
		/* if non-blocking connect finished ... */
		lock_sock(sk);
		if ((sk->sk_state == SMC_INIT) && (mask & EPOLLOUT)) {
			sk->sk_err = smc->clcsock->sk->sk_err;
			if (sk->sk_err) {
				mask |= EPOLLERR;
			} else {
				rc = smc_connect_rdma(smc);
				if (rc < 0)
					mask |= EPOLLERR;
				/* success cases including fallback */
				mask |= EPOLLOUT | EPOLLWRNORM;
			}
		}
	} else {
		if (sk->sk_state != SMC_CLOSED) {
			release_sock(sk);
			sock_poll_wait(file, sk_sleep(sk), wait);
			lock_sock(sk);
		}
		if (sk->sk_err)
			mask |= EPOLLERR;
		if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
		    (sk->sk_state == SMC_CLOSED))
			mask |= EPOLLHUP;
		if (sk->sk_state == SMC_LISTEN) {
			/* woken up by sk_data_ready in smc_listen_work() */
			mask = smc_accept_poll(sk);
		} else {
			if (atomic_read(&smc->conn.sndbuf_space) ||
			    sk->sk_shutdown & SEND_SHUTDOWN) {
				mask |= EPOLLOUT | EPOLLWRNORM;
			} else {
				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
			}
			if (atomic_read(&smc->conn.bytes_to_rcv))
				mask |= EPOLLIN | EPOLLRDNORM;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
			if (sk->sk_state == SMC_APPCLOSEWAIT1)
				mask |= EPOLLIN;
		}
	}
	release_sock(sk);
	sock_put(sk);

	return mask;
}
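
/* shut down one or both directions; for fallback socks this is passed
 * through to the CLC/TCP socket, otherwise the SMC close protocol runs
 * first and the CLC socket is shut down afterwards
 */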
static int smc_shutdown(struct socket *sock, int how)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EINVAL;
	int rc1 = 0;

	smc = smc_sk(sk);

	if ((how < SHUT_RD) || (how > SHUT_RDWR))
		return rc;

	lock_sock(sk);

	rc = -ENOTCONN;
	if ((sk->sk_state != SMC_LISTEN) &&
	    (sk->sk_state != SMC_ACTIVE) &&
	    (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
	    (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT2) &&
	    (sk->sk_state != SMC_APPFINCLOSEWAIT))
		goto out;
	if (smc->use_fallback) {
		rc = kernel_sock_shutdown(smc->clcsock, how);
		sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
		if (sk->sk_shutdown == SHUTDOWN_MASK)
			sk->sk_state = SMC_CLOSED;
		goto out;
	}
	switch (how) {
	case SHUT_RDWR:		/* shutdown in both directions */
		rc = smc_close_active(smc);
		break;
	case SHUT_WR:
		rc = smc_close_shutdown_write(smc);
		break;
	case SHUT_RD:
		if (sk->sk_state == SMC_LISTEN)
			rc = smc_close_active(smc);
		else
			rc = 0;
			/* nothing more to do because peer is not involved */
		break;
	}
	rc1 = kernel_sock_shutdown(smc->clcsock, how);
	/* map sock_shutdown_cmd constants to sk_shutdown value range */
	sk->sk_shutdown |= how + 1;

out:
	release_sock(sk);
	return rc ? rc : rc1;
}

static int smc_setsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;

	smc = smc_sk(sk);

	/* generic setsockopts reaching us here always apply to the
	 * CLC socket
	 */
	return smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
					     optval, optlen);
}

static int smc_getsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	struct smc_sock *smc;

	smc = smc_sk(sock->sk);
	/* socket options apply to the CLC socket */
	return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
					     optval, optlen);
}

static int smc_ioctl(struct socket *sock, unsigned int cmd,
		     unsigned long arg)
{
	struct smc_sock *smc;

	smc = smc_sk(sock->sk);
	if (smc->use_fallback)
		return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
	else
		return sock_no_ioctl(sock, cmd, arg);
}

static ssize_t smc_sendpage(struct socket *sock, struct page *page,
			    int offset, size_t size, int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EPIPE;

	smc = smc_sk(sk);
	lock_sock(sk);
	if (sk->sk_state != SMC_ACTIVE)
		goto out;
	if (smc->use_fallback)
		rc = kernel_sendpage(smc->clcsock, page, offset,
				     size, flags);
	else
		rc = sock_no_sendpage(sock, page, offset, size, flags);

out:
	release_sock(sk);
	return rc;
}

static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
			       struct pipe_inode_info *pipe, size_t len,
			       unsigned int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -ENOTCONN;

	smc = smc_sk(sk);
	lock_sock(sk);
	if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED))
		goto out;
	if (smc->use_fallback) {
		rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
						    pipe, len, flags);
	} else {
		rc = -EOPNOTSUPP;
	}
out:
	release_sock(sk);
	return rc;
}

/* must look like tcp */
static const struct proto_ops smc_sock_ops = {
	.family		= PF_SMC,
	.owner		= THIS_MODULE,
	.release	= smc_release,
	.bind		= smc_bind,
	.connect	= smc_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= smc_accept,
	.getname	= smc_getname,
	.poll		= smc_poll,
	.ioctl		= smc_ioctl,
	.listen		= smc_listen,
	.shutdown	= smc_shutdown,
	.setsockopt	= smc_setsockopt,
	.getsockopt	= smc_getsockopt,
	.sendmsg	= smc_sendmsg,
	.recvmsg	= smc_recvmsg,
	.mmap		= sock_no_mmap,
	.sendpage	= smc_sendpage,
	.splice_read	= smc_splice_read,
};
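
/* create a new smc socket pair: the exposed AF_SMC sock plus the internal
 * AF_INET/TCP socket (clcsock) used for the CLC handshake and as fallback
 */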
static int smc_create(struct net *net, struct socket *sock, int protocol,
		      int kern)
{
	struct smc_sock *smc;
	struct sock *sk;
	int rc;

	rc = -ESOCKTNOSUPPORT;
	if (sock->type != SOCK_STREAM)
		goto out;

	rc = -EPROTONOSUPPORT;
	if ((protocol != IPPROTO_IP) && (protocol != IPPROTO_TCP))
		goto out;

	rc = -ENOBUFS;
	sock->ops = &smc_sock_ops;
	sk = smc_sock_alloc(net, sock);
	if (!sk)
		goto out;

	/* create internal TCP socket for CLC handshake and fallback */
	smc = smc_sk(sk);
	smc->use_fallback = false; /* assume rdma capability first */
	rc = sock_create_kern(net, PF_INET, SOCK_STREAM,
			      IPPROTO_TCP, &smc->clcsock);
	if (rc) {
		sk_common_release(sk);
		goto out;	/* smc->clcsock is not set; do not touch it */
	}
	smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
	smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);

out:
	return rc;
}

static const struct net_proto_family smc_sock_family_ops = {
	.family	= PF_SMC,
	.owner	= THIS_MODULE,
	.create	= smc_create,
};

static int __init smc_init(void)
{
	int rc;

	rc = smc_pnet_init();
	if (rc)
		return rc;

	rc = smc_llc_init();
	if (rc) {
		pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = smc_cdc_init();
	if (rc) {
		pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = proto_register(&smc_proto, 1);
	if (rc) {
		pr_err("%s: proto_register fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = sock_register(&smc_sock_family_ops);
	if (rc) {
		pr_err("%s: sock_register fails with %d\n", __func__, rc);
		goto out_proto;
	}
	INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);

	rc = smc_ib_register_client();
	if (rc) {
		pr_err("%s: ib_register fails with %d\n", __func__, rc);
		goto out_sock;
	}

	static_branch_enable(&tcp_have_smc);
	return 0;

out_sock:
	sock_unregister(PF_SMC);
out_proto:
	proto_unregister(&smc_proto);
out_pnet:
	smc_pnet_exit();
	return rc;
}

static void __exit smc_exit(void)
{
	struct smc_link_group *lgr, *lg;
	LIST_HEAD(lgr_freeing_list);

	spin_lock_bh(&smc_lgr_list.lock);
	if (!list_empty(&smc_lgr_list.list))
		list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
	spin_unlock_bh(&smc_lgr_list.lock);
	list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
		list_del_init(&lgr->list);
		smc_lgr_free(lgr); /* free link group */
	}
	static_branch_disable(&tcp_have_smc);
	smc_ib_unregister_client();
	sock_unregister(PF_SMC);
	proto_unregister(&smc_proto);
	smc_pnet_exit();
}

module_init(smc_init);
module_exit(smc_exit);

MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
MODULE_DESCRIPTION("smc socket address family");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_SMC);