// SPDX-License-Identifier: GPL-2.0
/*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 * Manage send buffer.
 * Producer:
 * Copy user space data into send buffer, if send buffer space available.
 * Consumer:
 * Trigger RDMA write into RMBE of peer and send CDC, if RMBE space available.
 *
 * Copyright IBM Corp. 2016
 *
 * Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
 */

#include <linux/net.h>
#include <linux/rcupdate.h>
#include <linux/workqueue.h>
#include <linux/sched/signal.h>

#include <net/sock.h>
#include <net/tcp.h>

#include "smc.h"
#include "smc_wr.h"
#include "smc_cdc.h"
#include "smc_close.h"
#include "smc_ism.h"
#include "smc_tx.h"
#include "smc_stats.h"
#include "smc_tracepoint.h"

#define SMC_TX_WORK_DELAY	0

/***************************** sndbuf producer *******************************/

/* callback implementation for sk.sk_write_space()
 * to wakeup sndbuf producers that blocked with smc_tx_wait().
 * called under sk_socket lock.
 */
static void smc_tx_write_space(struct sock *sk)
{
	struct socket *sock = sk->sk_socket;
	struct smc_sock *smc = smc_sk(sk);
	struct socket_wq *wq;

	/* similar to sk_stream_write_space */
	if (atomic_read(&smc->conn.sndbuf_space) && sock) {
		if (test_bit(SOCK_NOSPACE, &sock->flags))
			SMC_STAT_RMB_TX_FULL(smc, !smc->conn.lnk);
		clear_bit(SOCK_NOSPACE, &sock->flags);
		rcu_read_lock();
		wq = rcu_dereference(sk->sk_wq);
		if (skwq_has_sleeper(wq))
			wake_up_interruptible_poll(&wq->wait,
						   EPOLLOUT | EPOLLWRNORM |
						   EPOLLWRBAND);
		if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
			sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT);
		rcu_read_unlock();
	}
}

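/* Illustrative wakeup flow for a blocked sender (a rough summary across this
 * file and the CDC path in smc_cdc.c, not a code path by itself): a producer
 * that found no sndbuf space sleeps in smc_tx_wait().  Once the data has been
 * transferred and the corresponding CDC message completes, the CDC tx
 * completion handling (smc_cdc_tx_handler(), see smc_cdc.c) replenishes
 * sndbuf_space and calls smc_tx_sndbuf_nonfull(), which invokes
 * sk->sk_write_space() == smc_tx_write_space() and wakes the sleeper with
 * EPOLLOUT.
 */
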
/* Wakeup sndbuf producers that blocked with smc_tx_wait().
 * Cf. tcp_data_snd_check()=>tcp_check_space()=>tcp_new_space().
 */
void smc_tx_sndbuf_nonfull(struct smc_sock *smc)
{
	if (smc->sk.sk_socket &&
	    test_bit(SOCK_NOSPACE, &smc->sk.sk_socket->flags))
		smc->sk.sk_write_space(&smc->sk);
}

/* blocks the sndbuf producer until at least one byte of free space is
 * available or the urgent byte was consumed
 */
static int smc_tx_wait(struct smc_sock *smc, int flags)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	struct smc_connection *conn = &smc->conn;
	struct sock *sk = &smc->sk;
	long timeo;
	int rc = 0;

	/* similar to sk_stream_wait_memory */
	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
	add_wait_queue(sk_sleep(sk), &wait);
	while (1) {
		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
		if (sk->sk_err ||
		    (sk->sk_shutdown & SEND_SHUTDOWN) ||
		    conn->killed ||
		    conn->local_tx_ctrl.conn_state_flags.peer_done_writing) {
			rc = -EPIPE;
			break;
		}
		if (smc_cdc_rxed_any_close(conn)) {
			rc = -ECONNRESET;
			break;
		}
		if (!timeo) {
			/* ensure EPOLLOUT is subsequently generated */
			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
			rc = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			rc = sock_intr_errno(timeo);
			break;
		}
		sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
		if (atomic_read(&conn->sndbuf_space) && !conn->urg_tx_pend)
			break; /* at least 1 byte of free & no urgent data */
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		sk_wait_event(sk, &timeo,
			      READ_ONCE(sk->sk_err) ||
			      (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) ||
			      smc_cdc_rxed_any_close(conn) ||
			      (atomic_read(&conn->sndbuf_space) &&
			       !conn->urg_tx_pend),
			      &wait);
	}
	remove_wait_queue(sk_sleep(sk), &wait);
	return rc;
}

static bool smc_tx_is_corked(struct smc_sock *smc)
{
	struct tcp_sock *tp = tcp_sk(smc->clcsock->sk);

	return (tp->nonagle & TCP_NAGLE_CORK) ? true : false;
}

/* If we have pending CDC messages, do not send:
 * Because the CQE of this CDC message will arrive shortly, it gives
 * a chance to coalesce future sendmsg() payload into one RDMA Write,
 * without the need for a timer and with no latency trade-off.
 * Algorithm here:
 *  1. First message should never cork
 *  2. If we have pending Tx CDC messages, wait for the first CDC
 *     message's completion
 *  3. Don't cork too much data in a single RDMA Write to prevent burst
 *     traffic; the total corked data should not exceed sendbuf/2
 */
static bool smc_should_autocork(struct smc_sock *smc)
{
	struct smc_connection *conn = &smc->conn;
	int corking_size;

	corking_size = min_t(unsigned int, conn->sndbuf_desc->len >> 1,
			     sock_net(&smc->sk)->smc.sysctl_autocorking_size);

	if (atomic_read(&conn->cdc_pend_tx_wr) == 0 ||
	    smc_tx_prepared_sends(conn) > corking_size)
		return false;
	return true;
}

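/* Illustrative corking arithmetic (assumed values, not defaults taken from
 * this file): with a 128 KB sndbuf and sysctl_autocorking_size of 64 KB,
 * corking_size = min(131072 >> 1, 65536) = 65536.  While a CDC write request
 * is still pending, further sendmsg() data is only queued in the sndbuf as
 * long as the prepared-but-unsent bytes stay at or below 64 KB; beyond that,
 * smc_should_autocork() returns false and the data is pushed immediately.
 */
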
static bool smc_tx_should_cork(struct smc_sock *smc, struct msghdr *msg)
{
	struct smc_connection *conn = &smc->conn;

	if (smc_should_autocork(smc))
		return true;

	/* for a corked socket defer the RDMA writes as long as
	 * sndbuf_space is still available. The application
	 * should know how/when to uncork it.
	 */
	if ((msg->msg_flags & MSG_MORE ||
	     smc_tx_is_corked(smc)) &&
	    atomic_read(&conn->sndbuf_space))
		return true;

	return false;
}

/* sndbuf producer: main API called by socket layer.
 * called under sock lock.
 */
int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
{
	size_t copylen, send_done = 0, send_remaining = len;
	size_t chunk_len, chunk_off, chunk_len_sum;
	struct smc_connection *conn = &smc->conn;
	union smc_host_cursor prep;
	struct sock *sk = &smc->sk;
	char *sndbuf_base;
	int tx_cnt_prep;
	int writespace;
	int rc, chunk;

	/* This should be in poll */
	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);

	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) {
		rc = -EPIPE;
		goto out_err;
	}

	if (sk->sk_state == SMC_INIT)
		return -ENOTCONN;

	if (len > conn->sndbuf_desc->len)
		SMC_STAT_RMB_TX_SIZE_SMALL(smc, !conn->lnk);

	if (len > conn->peer_rmbe_size)
		SMC_STAT_RMB_TX_PEER_SIZE_SMALL(smc, !conn->lnk);

	if (msg->msg_flags & MSG_OOB)
		SMC_STAT_INC(smc, urg_data_cnt);

	while (msg_data_left(msg)) {
		if (smc->sk.sk_shutdown & SEND_SHUTDOWN ||
		    (smc->sk.sk_err == ECONNABORTED) ||
		    conn->killed)
			return -EPIPE;
		if (smc_cdc_rxed_any_close(conn))
			return send_done ?: -ECONNRESET;

		if (msg->msg_flags & MSG_OOB)
			conn->local_tx_ctrl.prod_flags.urg_data_pending = 1;

		if (!atomic_read(&conn->sndbuf_space) || conn->urg_tx_pend) {
			if (send_done)
				return send_done;
			rc = smc_tx_wait(smc, msg->msg_flags);
			if (rc)
				goto out_err;
			continue;
		}

		/* initialize variables for 1st iteration of subsequent loop */
		/* could be just 1 byte, even after smc_tx_wait above */
		writespace = atomic_read(&conn->sndbuf_space);
		/* not more than what user space asked for */
		copylen = min_t(size_t, send_remaining, writespace);
		/* determine start of sndbuf */
		sndbuf_base = conn->sndbuf_desc->cpu_addr;
		smc_curs_copy(&prep, &conn->tx_curs_prep, conn);
		tx_cnt_prep = prep.count;
		/* determine chunks where to write into sndbuf */
		/* either unwrapped case, or 1st chunk of wrapped case */
		chunk_len = min_t(size_t, copylen, conn->sndbuf_desc->len -
				  tx_cnt_prep);
		chunk_len_sum = chunk_len;
		chunk_off = tx_cnt_prep;
		for (chunk = 0; chunk < 2; chunk++) {
			rc = memcpy_from_msg(sndbuf_base + chunk_off,
					     msg, chunk_len);
			if (rc) {
				smc_sndbuf_sync_sg_for_device(conn);
				if (send_done)
					return send_done;
				goto out_err;
			}
			send_done += chunk_len;
			send_remaining -= chunk_len;

			if (chunk_len_sum == copylen)
				break; /* either on 1st or 2nd iteration */
			/* prepare next (== 2nd) iteration */
			chunk_len = copylen - chunk_len; /* remainder */
			chunk_len_sum += chunk_len;
			chunk_off = 0; /* modulo offset in send ring buffer */
		}
		smc_sndbuf_sync_sg_for_device(conn);
		/* update cursors */
		smc_curs_add(conn->sndbuf_desc->len, &prep, copylen);
		smc_curs_copy(&conn->tx_curs_prep, &prep, conn);
		/* increased in send tasklet smc_cdc_tx_handler() */
		smp_mb__before_atomic();
		atomic_sub(copylen, &conn->sndbuf_space);
		/* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */
		smp_mb__after_atomic();
		/* since we just produced more new data into sndbuf,
		 * trigger sndbuf consumer: RDMA write into peer RMBE and CDC
		 */
		if ((msg->msg_flags & MSG_OOB) && !send_remaining)
			conn->urg_tx_pend = true;
		/* If we need to cork, do nothing and wait for the next
		 * sendmsg() call or push on tx completion
		 */
		if (!smc_tx_should_cork(smc, msg))
			smc_tx_sndbuf_nonempty(conn);

		trace_smc_tx_sendmsg(smc, copylen);
	} /* while (msg_data_left(msg)) */

	return send_done;

out_err:
	rc = sk_stream_error(sk, msg->msg_flags, rc);
	/* make sure we wake any epoll edge trigger waiter */
	if (unlikely(rc == -EAGAIN))
		sk->sk_write_space(sk);
	return rc;
}

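/* Illustrative example of the two-chunk copy above (made-up sizes): with
 * sndbuf_desc->len == 16, tx_curs_prep.count == 12 and copylen == 7, the
 * first chunk copies 4 bytes at offset 12 (up to the end of the ring) and
 * the second chunk copies the remaining 3 bytes at offset 0; the prep
 * cursor then wraps around to count == 3.
 */
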
/***************************** sndbuf consumer *******************************/

/* sndbuf consumer: actual data transfer of one target chunk with ISM write */
int smcd_tx_ism_write(struct smc_connection *conn, void *data, size_t len,
		      u32 offset, int signal)
{
	int rc;

	rc = smc_ism_write(conn->lgr->smcd, conn->peer_token,
			   conn->peer_rmbe_idx, signal, conn->tx_off + offset,
			   data, len);
	if (rc)
		conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
	return rc;
}

/* sndbuf consumer: actual data transfer of one target chunk with RDMA write */
static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset,
			     int num_sges, struct ib_rdma_wr *rdma_wr)
{
	struct smc_link_group *lgr = conn->lgr;
	struct smc_link *link = conn->lnk;
	int rc;

	rdma_wr->wr.wr_id = smc_wr_tx_get_next_wr_id(link);
	rdma_wr->wr.num_sge = num_sges;
	rdma_wr->remote_addr =
		lgr->rtokens[conn->rtoken_idx][link->link_idx].dma_addr +
		/* RMBE within RMB */
		conn->tx_off +
		/* offset within RMBE */
		peer_rmbe_offset;
	rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][link->link_idx].rkey;
	rc = ib_post_send(link->roce_qp, &rdma_wr->wr, NULL);
	if (rc)
		smcr_link_down_cond_sched(link);
	return rc;
}

/* sndbuf consumer */
static inline void smc_tx_advance_cursors(struct smc_connection *conn,
					  union smc_host_cursor *prod,
					  union smc_host_cursor *sent,
					  size_t len)
{
	smc_curs_add(conn->peer_rmbe_size, prod, len);
	/* increased in recv tasklet smc_cdc_msg_rcv() */
	smp_mb__before_atomic();
	/* data in flight reduces usable snd_wnd */
	atomic_sub(len, &conn->peer_rmbe_space);
	/* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */
	smp_mb__after_atomic();
	smc_curs_add(conn->sndbuf_desc->len, sent, len);
}

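/* Illustrative cursor advance (made-up sizes, assuming smc_curs_add() in
 * smc.h wraps the byte count and advances the wrap counter): with
 * peer_rmbe_size == 8192, prod->count == 6144 and len == 3072, the producer
 * cursor ends up at count == 1024 with the wrap counter incremented, and
 * peer_rmbe_space shrinks by 3072 until the peer's next consumer-cursor
 * update arrives via CDC.
 */
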
/* SMC-R helper for smc_tx_rdma_writes() */
static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len,
			       size_t src_off, size_t src_len,
			       size_t dst_off, size_t dst_len,
			       struct smc_rdma_wr *wr_rdma_buf)
{
	struct smc_link *link = conn->lnk;

	dma_addr_t dma_addr =
		sg_dma_address(conn->sndbuf_desc->sgt[link->link_idx].sgl);
	u64 virt_addr = (uintptr_t)conn->sndbuf_desc->cpu_addr;
	int src_len_sum = src_len, dst_len_sum = dst_len;
	int sent_count = src_off;
	int srcchunk, dstchunk;
	int num_sges;
	int rc;

	for (dstchunk = 0; dstchunk < 2; dstchunk++) {
		struct ib_rdma_wr *wr = &wr_rdma_buf->wr_tx_rdma[dstchunk];
		struct ib_sge *sge = wr->wr.sg_list;
		u64 base_addr = dma_addr;

		if (dst_len < link->qp_attr.cap.max_inline_data) {
			base_addr = virt_addr;
			wr->wr.send_flags |= IB_SEND_INLINE;
		} else {
			wr->wr.send_flags &= ~IB_SEND_INLINE;
		}

		num_sges = 0;
		for (srcchunk = 0; srcchunk < 2; srcchunk++) {
			sge[srcchunk].addr = conn->sndbuf_desc->is_vm ?
				(virt_addr + src_off) : (base_addr + src_off);
			sge[srcchunk].length = src_len;
			if (conn->sndbuf_desc->is_vm)
				sge[srcchunk].lkey =
					conn->sndbuf_desc->mr[link->link_idx]->lkey;
			num_sges++;

			src_off += src_len;
			if (src_off >= conn->sndbuf_desc->len)
				src_off -= conn->sndbuf_desc->len;
						/* modulo in send ring */
			if (src_len_sum == dst_len)
				break; /* either on 1st or 2nd iteration */
			/* prepare next (== 2nd) iteration */
			src_len = dst_len - src_len; /* remainder */
			src_len_sum += src_len;
		}
		rc = smc_tx_rdma_write(conn, dst_off, num_sges, wr);
		if (rc)
			return rc;
		if (dst_len_sum == len)
			break; /* either on 1st or 2nd iteration */
		/* prepare next (== 2nd) iteration */
		dst_off = 0; /* modulo offset in RMBE ring buffer */
		dst_len = len - dst_len; /* remainder */
		dst_len_sum += dst_len;
		src_len = min_t(int, dst_len, conn->sndbuf_desc->len -
				sent_count);
		src_len_sum = src_len;
	}
	return 0;
}

/* SMC-D helper for smc_tx_rdma_writes() */
static int smcd_tx_rdma_writes(struct smc_connection *conn, size_t len,
			       size_t src_off, size_t src_len,
			       size_t dst_off, size_t dst_len)
{
	int src_len_sum = src_len, dst_len_sum = dst_len;
	int srcchunk, dstchunk;
	int rc;

	if (conn->sndbuf_desc->is_attached)
		return 0;

	for (dstchunk = 0; dstchunk < 2; dstchunk++) {
		for (srcchunk = 0; srcchunk < 2; srcchunk++) {
			void *data = conn->sndbuf_desc->cpu_addr + src_off;

			rc = smcd_tx_ism_write(conn, data, src_len, dst_off +
					       sizeof(struct smcd_cdc_msg), 0);
			if (rc)
				return rc;
			dst_off += src_len;
			src_off += src_len;
			if (src_off >= conn->sndbuf_desc->len)
				src_off -= conn->sndbuf_desc->len;
						/* modulo in send ring */
			if (src_len_sum == dst_len)
				break; /* either on 1st or 2nd iteration */
			/* prepare next (== 2nd) iteration */
			src_len = dst_len - src_len; /* remainder */
			src_len_sum += src_len;
		}
		if (dst_len_sum == len)
			break; /* either on 1st or 2nd iteration */
		/* prepare next (== 2nd) iteration */
		dst_off = 0; /* modulo offset in RMBE ring buffer */
		dst_len = len - dst_len; /* remainder */
		dst_len_sum += dst_len;
		src_len = min_t(int, dst_len, conn->sndbuf_desc->len - src_off);
		src_len_sum = src_len;
	}
	return 0;
}

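/* Note on the nested loops above (descriptive only): both the local sndbuf
 * and the peer RMBE are ring buffers, so each of them can wrap at most once
 * per transfer.  The outer loop therefore issues at most two destination
 * chunks (one RDMA write or ISM write each), and the inner loop splits each
 * destination chunk into at most two source pieces, i.e. at most two SGEs
 * per RDMA write in the SMC-R case.
 */
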
/* sndbuf consumer: prepare all necessary (src&dst) chunks of data transmit;
 * usable snd_wnd as max transmit
 */
static int smc_tx_rdma_writes(struct smc_connection *conn,
			      struct smc_rdma_wr *wr_rdma_buf)
{
	size_t len, src_len, dst_off, dst_len; /* current chunk values */
	union smc_host_cursor sent, prep, prod, cons;
	struct smc_cdc_producer_flags *pflags;
	int to_send, rmbespace;
	int rc;

	/* source: sndbuf */
	smc_curs_copy(&sent, &conn->tx_curs_sent, conn);
	smc_curs_copy(&prep, &conn->tx_curs_prep, conn);
	/* cf. wmem_alloc - (snd_max - snd_una) */
	to_send = smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep);
	if (to_send <= 0)
		return 0;

	/* destination: RMBE */
	/* cf. snd_wnd */
	rmbespace = atomic_read(&conn->peer_rmbe_space);
	if (rmbespace <= 0) {
		struct smc_sock *smc = container_of(conn, struct smc_sock,
						    conn);
		SMC_STAT_RMB_TX_PEER_FULL(smc, !conn->lnk);
		return 0;
	}
	smc_curs_copy(&prod, &conn->local_tx_ctrl.prod, conn);
	smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn);

	/* if usable snd_wnd closes ask peer to advertise once it opens again */
	pflags = &conn->local_tx_ctrl.prod_flags;
	pflags->write_blocked = (to_send >= rmbespace);
	/* cf. usable snd_wnd */
	len = min(to_send, rmbespace);

	/* initialize variables for first iteration of subsequent nested loop */
	dst_off = prod.count;
	if (prod.wrap == cons.wrap) {
		/* the filled destination area is unwrapped,
		 * hence the available free destination space is wrapped
		 * and we need 2 destination chunks of sum len; start with 1st
		 * which is limited by what's available in sndbuf
		 */
		dst_len = min_t(size_t,
				conn->peer_rmbe_size - prod.count, len);
	} else {
		/* the filled destination area is wrapped,
		 * hence the available free destination space is unwrapped
		 * and we need a single destination chunk of entire len
		 */
		dst_len = len;
	}
	/* dst_len determines the maximum src_len */
	if (sent.count + dst_len <= conn->sndbuf_desc->len) {
		/* unwrapped src case: single chunk of entire dst_len */
		src_len = dst_len;
	} else {
		/* wrapped src case: 2 chunks of sum dst_len; start with 1st: */
		src_len = conn->sndbuf_desc->len - sent.count;
	}

	if (conn->lgr->is_smcd)
		rc = smcd_tx_rdma_writes(conn, len, sent.count, src_len,
					 dst_off, dst_len);
	else
		rc = smcr_tx_rdma_writes(conn, len, sent.count, src_len,
					 dst_off, dst_len, wr_rdma_buf);
	if (rc)
		return rc;

	if (conn->urg_tx_pend && len == to_send)
		pflags->urg_data_present = 1;
	smc_tx_advance_cursors(conn, &prod, &sent, len);
	/* update connection's cursors with advanced local cursors */
	smc_curs_copy(&conn->local_tx_ctrl.prod, &prod, conn);
						/* dst: peer RMBE */
	smc_curs_copy(&conn->tx_curs_sent, &sent, conn);/* src: local sndbuf */

	return 0;
}

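/* Worked example for the chunk setup above (made-up sizes): assume
 * peer_rmbe_size == 8192 with prod.count == 6144, cons.count == 2048 and
 * equal wrap counts, so the free RMBE space wraps; with len == 3072 the
 * first destination chunk is min(8192 - 6144, 3072) == 2048 bytes at offset
 * 6144 and the second chunk is 1024 bytes at offset 0.  If the sndbuf is
 * 16384 bytes with sent.count == 15360, the 2048-byte first chunk itself
 * needs two source pieces (1024 bytes at offset 15360, then 1024 at offset 0).
 */
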
/* Wakeup sndbuf consumers from any context (IRQ or process)
 * since there is more data to transmit; usable snd_wnd as max transmit
 */
static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn)
{
	struct smc_cdc_producer_flags *pflags = &conn->local_tx_ctrl.prod_flags;
	struct smc_link *link = conn->lnk;
	struct smc_rdma_wr *wr_rdma_buf;
	struct smc_cdc_tx_pend *pend;
	struct smc_wr_buf *wr_buf;
	int rc;

	if (!link || !smc_wr_tx_link_hold(link))
		return -ENOLINK;
	rc = smc_cdc_get_free_slot(conn, link, &wr_buf, &wr_rdma_buf, &pend);
	if (rc < 0) {
		smc_wr_tx_link_put(link);
		if (rc == -EBUSY) {
			struct smc_sock *smc =
				container_of(conn, struct smc_sock, conn);

			if (smc->sk.sk_err == ECONNABORTED)
				return sock_error(&smc->sk);
			if (conn->killed)
				return -EPIPE;
			rc = 0;
			mod_delayed_work(conn->lgr->tx_wq, &conn->tx_work,
					 SMC_TX_WORK_DELAY);
		}
		return rc;
	}

	spin_lock_bh(&conn->send_lock);
	if (link != conn->lnk) {
		/* link of connection changed, tx_work will restart */
		smc_wr_tx_put_slot(link,
				   (struct smc_wr_tx_pend_priv *)pend);
		rc = -ENOLINK;
		goto out_unlock;
	}
	if (!pflags->urg_data_present) {
		rc = smc_tx_rdma_writes(conn, wr_rdma_buf);
		if (rc) {
			smc_wr_tx_put_slot(link,
					   (struct smc_wr_tx_pend_priv *)pend);
			goto out_unlock;
		}
	}

	rc = smc_cdc_msg_send(conn, wr_buf, pend);
	if (!rc && pflags->urg_data_present) {
		pflags->urg_data_pending = 0;
		pflags->urg_data_present = 0;
	}

out_unlock:
	spin_unlock_bh(&conn->send_lock);
	smc_wr_tx_link_put(link);
	return rc;
}

static int smcd_tx_sndbuf_nonempty(struct smc_connection *conn)
{
	struct smc_cdc_producer_flags *pflags = &conn->local_tx_ctrl.prod_flags;
	int rc = 0;

	spin_lock_bh(&conn->send_lock);
	if (!pflags->urg_data_present)
		rc = smc_tx_rdma_writes(conn, NULL);
	if (!rc)
		rc = smcd_cdc_msg_send(conn);

	if (!rc && pflags->urg_data_present) {
		pflags->urg_data_pending = 0;
		pflags->urg_data_present = 0;
	}
	spin_unlock_bh(&conn->send_lock);
	return rc;
}

int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
{
	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
	int rc = 0;

	/* No data in the send queue */
	if (unlikely(smc_tx_prepared_sends(conn) <= 0))
		goto out;

	/* Peer doesn't have RMBE space */
	if (unlikely(atomic_read(&conn->peer_rmbe_space) <= 0)) {
		SMC_STAT_RMB_TX_PEER_FULL(smc, !conn->lnk);
		goto out;
	}

	if (conn->killed ||
	    conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) {
		rc = -EPIPE;	/* connection being aborted */
		goto out;
	}
	if (conn->lgr->is_smcd)
		rc = smcd_tx_sndbuf_nonempty(conn);
	else
		rc = smcr_tx_sndbuf_nonempty(conn);

	if (!rc) {
		/* trigger socket release if connection is closing */
		smc_close_wake_tx_prepared(smc);
	}

out:
	return rc;
}

/* Wakeup sndbuf consumers from process context
 * since there is more data to transmit. The caller
 * must hold sock lock.
 */
void smc_tx_pending(struct smc_connection *conn)
{
	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
	int rc;

	if (smc->sk.sk_err)
		return;

	rc = smc_tx_sndbuf_nonempty(conn);
	if (!rc && conn->local_rx_ctrl.prod_flags.write_blocked &&
	    !atomic_read(&conn->bytes_to_rcv))
		conn->local_rx_ctrl.prod_flags.write_blocked = 0;
}

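/* Note on deferred transmission (descriptive only): when no CDC send slot is
 * free (-EBUSY in smcr_tx_sndbuf_nonempty() above), or when
 * smc_tx_consumer_update() below cannot send a consumer-cursor update, the
 * work item conn->tx_work is (re)queued on the link group's tx_wq with
 * SMC_TX_WORK_DELAY == 0.  The retry then runs in smc_tx_work() from process
 * context, with the sock lock held.
 */
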
/* Wakeup sndbuf consumers from process context
 * since there is more data to transmit in locked
 * sock.
 */
void smc_tx_work(struct work_struct *work)
{
	struct smc_connection *conn = container_of(to_delayed_work(work),
						   struct smc_connection,
						   tx_work);
	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);

	lock_sock(&smc->sk);
	smc_tx_pending(conn);
	release_sock(&smc->sk);
}

void smc_tx_consumer_update(struct smc_connection *conn, bool force)
{
	union smc_host_cursor cfed, cons, prod;
	int sender_free = conn->rmb_desc->len;
	int to_confirm;

	smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
	smc_curs_copy(&cfed, &conn->rx_curs_confirmed, conn);
	to_confirm = smc_curs_diff(conn->rmb_desc->len, &cfed, &cons);
	if (to_confirm > conn->rmbe_update_limit) {
		smc_curs_copy(&prod, &conn->local_rx_ctrl.prod, conn);
		sender_free = conn->rmb_desc->len -
			      smc_curs_diff_large(conn->rmb_desc->len,
						  &cfed, &prod);
	}

	if (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req ||
	    force ||
	    ((to_confirm > conn->rmbe_update_limit) &&
	     ((sender_free <= (conn->rmb_desc->len / 2)) ||
	      conn->local_rx_ctrl.prod_flags.write_blocked))) {
		if (conn->killed ||
		    conn->local_rx_ctrl.conn_state_flags.peer_conn_abort)
			return;
		if ((smc_cdc_get_slot_and_msg_send(conn) < 0) &&
		    !conn->killed) {
			queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work,
					   SMC_TX_WORK_DELAY);
			return;
		}
	}
	if (conn->local_rx_ctrl.prod_flags.write_blocked &&
	    !atomic_read(&conn->bytes_to_rcv))
		conn->local_rx_ctrl.prod_flags.write_blocked = 0;
}

/***************************** send initialize *******************************/

/* Initialize send properties on connection establishment. NB: not __init! */
void smc_tx_init(struct smc_sock *smc)
{
	smc->sk.sk_write_space = smc_tx_write_space;
}