1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2012, 2015 Chelsio Communications, Inc. 5 * All rights reserved. 6 * Written by: Navdeep Parhar <np@FreeBSD.org> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 #include <sys/cdefs.h> 31 __FBSDID("$FreeBSD$"); 32 33 #include "opt_inet.h" 34 #include "opt_inet6.h" 35 #include "opt_ratelimit.h" 36 37 #ifdef TCP_OFFLOAD 38 #include <sys/param.h> 39 #include <sys/aio.h> 40 #include <sys/file.h> 41 #include <sys/kernel.h> 42 #include <sys/ktr.h> 43 #include <sys/module.h> 44 #include <sys/proc.h> 45 #include <sys/protosw.h> 46 #include <sys/domain.h> 47 #include <sys/socket.h> 48 #include <sys/socketvar.h> 49 #include <sys/sglist.h> 50 #include <sys/taskqueue.h> 51 #include <netinet/in.h> 52 #include <netinet/in_pcb.h> 53 #include <netinet/ip.h> 54 #include <netinet/ip6.h> 55 #define TCPSTATES 56 #include <netinet/tcp_fsm.h> 57 #include <netinet/tcp_seq.h> 58 #include <netinet/tcp_var.h> 59 #include <netinet/toecore.h> 60 61 #include <security/mac/mac_framework.h> 62 63 #include <vm/vm.h> 64 #include <vm/vm_extern.h> 65 #include <vm/pmap.h> 66 #include <vm/vm_map.h> 67 #include <vm/vm_page.h> 68 69 #include "common/common.h" 70 #include "common/t4_msg.h" 71 #include "common/t4_regs.h" 72 #include "common/t4_tcb.h" 73 #include "tom/t4_tom_l2t.h" 74 #include "tom/t4_tom.h" 75 76 static void t4_aiotx_cancel(struct kaiocb *job); 77 static void t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep); 78 79 static size_t 80 aiotx_mbuf_pgoff(struct mbuf *m) 81 { 82 struct aiotx_buffer *ab; 83 84 MPASS(IS_AIOTX_MBUF(m)); 85 ab = m->m_ext.ext_arg1; 86 return ((ab->ps.offset + (uintptr_t)m->m_ext.ext_arg2) % PAGE_SIZE); 87 } 88 89 static vm_page_t * 90 aiotx_mbuf_pages(struct mbuf *m) 91 { 92 struct aiotx_buffer *ab; 93 int npages; 94 95 MPASS(IS_AIOTX_MBUF(m)); 96 ab = m->m_ext.ext_arg1; 97 npages = (ab->ps.offset + (uintptr_t)m->m_ext.ext_arg2) / PAGE_SIZE; 98 return (ab->ps.pages + npages); 99 } 100 101 void 102 send_flowc_wr(struct toepcb *toep, struct flowc_tx_params *ftxp) 103 { 104 struct wrqe *wr; 105 struct fw_flowc_wr *flowc; 106 unsigned int nparams, flowclen, paramidx; 107 struct vi_info *vi = toep->vi; 108 struct port_info *pi = vi->pi; 109 struct adapter 
*sc = pi->adapter; 110 unsigned int pfvf = sc->pf << S_FW_VIID_PFN; 111 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; 112 113 KASSERT(!(toep->flags & TPF_FLOWC_WR_SENT), 114 ("%s: flowc for tid %u sent already", __func__, toep->tid)); 115 116 if (ftxp != NULL) 117 nparams = 8; 118 else 119 nparams = 6; 120 if (toep->ulp_mode == ULP_MODE_TLS) 121 nparams++; 122 if (toep->tls.fcplenmax != 0) 123 nparams++; 124 if (toep->tc_idx != -1) { 125 MPASS(toep->tc_idx >= 0 && 126 toep->tc_idx < sc->chip_params->nsched_cls); 127 nparams++; 128 } 129 130 flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval); 131 132 wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq); 133 if (wr == NULL) { 134 /* XXX */ 135 panic("%s: allocation failure.", __func__); 136 } 137 flowc = wrtod(wr); 138 memset(flowc, 0, wr->wr_len); 139 140 flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | 141 V_FW_FLOWC_WR_NPARAMS(nparams)); 142 flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) | 143 V_FW_WR_FLOWID(toep->tid)); 144 145 #define FLOWC_PARAM(__m, __v) \ 146 do { \ 147 flowc->mnemval[paramidx].mnemonic = FW_FLOWC_MNEM_##__m; \ 148 flowc->mnemval[paramidx].val = htobe32(__v); \ 149 paramidx++; \ 150 } while (0) 151 152 paramidx = 0; 153 154 FLOWC_PARAM(PFNVFN, pfvf); 155 FLOWC_PARAM(CH, pi->tx_chan); 156 FLOWC_PARAM(PORT, pi->tx_chan); 157 FLOWC_PARAM(IQID, toep->ofld_rxq->iq.abs_id); 158 if (ftxp) { 159 uint32_t sndbuf = min(ftxp->snd_space, sc->tt.sndbuf); 160 161 FLOWC_PARAM(SNDNXT, ftxp->snd_nxt); 162 FLOWC_PARAM(RCVNXT, ftxp->rcv_nxt); 163 FLOWC_PARAM(SNDBUF, sndbuf); 164 FLOWC_PARAM(MSS, ftxp->mss); 165 166 CTR6(KTR_CXGBE, 167 "%s: tid %u, mss %u, sndbuf %u, snd_nxt 0x%x, rcv_nxt 0x%x", 168 __func__, toep->tid, ftxp->mss, sndbuf, ftxp->snd_nxt, 169 ftxp->rcv_nxt); 170 } else { 171 FLOWC_PARAM(SNDBUF, 512); 172 FLOWC_PARAM(MSS, 512); 173 174 CTR2(KTR_CXGBE, "%s: tid %u", __func__, toep->tid); 175 } 176 if (toep->ulp_mode == ULP_MODE_TLS) 177 FLOWC_PARAM(ULP_MODE, toep->ulp_mode); 178 if (toep->tls.fcplenmax != 0) 179 FLOWC_PARAM(TXDATAPLEN_MAX, toep->tls.fcplenmax); 180 if (toep->tc_idx != -1) 181 FLOWC_PARAM(SCHEDCLASS, toep->tc_idx); 182 #undef FLOWC_PARAM 183 184 KASSERT(paramidx == nparams, ("nparams mismatch")); 185 186 txsd->tx_credits = howmany(flowclen, 16); 187 txsd->plen = 0; 188 KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0, 189 ("%s: not enough credits (%d)", __func__, toep->tx_credits)); 190 toep->tx_credits -= txsd->tx_credits; 191 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) 192 toep->txsd_pidx = 0; 193 toep->txsd_avail--; 194 195 toep->flags |= TPF_FLOWC_WR_SENT; 196 t4_wrq_tx(sc, wr); 197 } 198 199 #ifdef RATELIMIT 200 /* 201 * Input is Bytes/second (so_max_pacing_rate), chip counts in Kilobits/second. 
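 *
 * The conversion below is kbps = Bps * 8 / 1000, with the multiply done as a
 * 64-bit operation so that Bps * 8 cannot overflow.  For example, a pacing
 * rate of 1250000 bytes/second maps to 10000 Kbps (10 Mbps).  A rate of 0
 * means "unbind from any scheduling class" and is handled as tc_idx = -1.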
202 */ 203 static int 204 update_tx_rate_limit(struct adapter *sc, struct toepcb *toep, u_int Bps) 205 { 206 int tc_idx, rc; 207 const u_int kbps = (u_int) (uint64_t)Bps * 8ULL / 1000; 208 const int port_id = toep->vi->pi->port_id; 209 210 CTR3(KTR_CXGBE, "%s: tid %u, rate %uKbps", __func__, toep->tid, kbps); 211 212 if (kbps == 0) { 213 /* unbind */ 214 tc_idx = -1; 215 } else { 216 rc = t4_reserve_cl_rl_kbps(sc, port_id, kbps, &tc_idx); 217 if (rc != 0) 218 return (rc); 219 MPASS(tc_idx >= 0 && tc_idx < sc->chip_params->nsched_cls); 220 } 221 222 if (toep->tc_idx != tc_idx) { 223 struct wrqe *wr; 224 struct fw_flowc_wr *flowc; 225 int nparams = 1, flowclen, flowclen16; 226 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; 227 228 flowclen = sizeof(*flowc) + nparams * sizeof(struct 229 fw_flowc_mnemval); 230 flowclen16 = howmany(flowclen, 16); 231 if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0 || 232 (wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq)) == NULL) { 233 if (tc_idx >= 0) 234 t4_release_cl_rl(sc, port_id, tc_idx); 235 return (ENOMEM); 236 } 237 238 flowc = wrtod(wr); 239 memset(flowc, 0, wr->wr_len); 240 241 flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | 242 V_FW_FLOWC_WR_NPARAMS(nparams)); 243 flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) | 244 V_FW_WR_FLOWID(toep->tid)); 245 246 flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS; 247 if (tc_idx == -1) 248 flowc->mnemval[0].val = htobe32(0xff); 249 else 250 flowc->mnemval[0].val = htobe32(tc_idx); 251 252 txsd->tx_credits = flowclen16; 253 txsd->plen = 0; 254 toep->tx_credits -= txsd->tx_credits; 255 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) 256 toep->txsd_pidx = 0; 257 toep->txsd_avail--; 258 t4_wrq_tx(sc, wr); 259 } 260 261 if (toep->tc_idx >= 0) 262 t4_release_cl_rl(sc, port_id, toep->tc_idx); 263 toep->tc_idx = tc_idx; 264 265 return (0); 266 } 267 #endif 268 269 void 270 send_reset(struct adapter *sc, struct toepcb *toep, uint32_t snd_nxt) 271 { 272 struct wrqe *wr; 273 struct cpl_abort_req *req; 274 int tid = toep->tid; 275 struct inpcb *inp = toep->inp; 276 struct tcpcb *tp = intotcpcb(inp); /* don't use if INP_DROPPED */ 277 278 INP_WLOCK_ASSERT(inp); 279 280 CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x%s", 281 __func__, toep->tid, 282 inp->inp_flags & INP_DROPPED ? "inp dropped" : 283 tcpstates[tp->t_state], 284 toep->flags, inp->inp_flags, 285 toep->flags & TPF_ABORT_SHUTDOWN ? 286 " (abort already in progress)" : ""); 287 288 if (toep->flags & TPF_ABORT_SHUTDOWN) 289 return; /* abort already in progress */ 290 291 toep->flags |= TPF_ABORT_SHUTDOWN; 292 293 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 294 ("%s: flowc_wr not sent for tid %d.", __func__, tid)); 295 296 wr = alloc_wrqe(sizeof(*req), toep->ofld_txq); 297 if (wr == NULL) { 298 /* XXX */ 299 panic("%s: allocation failure.", __func__); 300 } 301 req = wrtod(wr); 302 303 INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid); 304 if (inp->inp_flags & INP_DROPPED) 305 req->rsvd0 = htobe32(snd_nxt); 306 else 307 req->rsvd0 = htobe32(tp->snd_nxt); 308 req->rsvd1 = !(toep->flags & TPF_TX_DATA_SENT); 309 req->cmd = CPL_ABORT_SEND_RST; 310 311 /* 312 * XXX: What's the correct way to tell that the inp hasn't been detached 313 * from its socket? Should I even be flushing the snd buffer here? 314 */ 315 if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) { 316 struct socket *so = inp->inp_socket; 317 318 if (so != NULL) /* because I'm not sure. 
See comment above */ 319 sbflush(&so->so_snd); 320 } 321 322 t4_l2t_send(sc, wr, toep->l2te); 323 } 324 325 /* 326 * Called when a connection is established to translate the TCP options 327 * reported by HW to FreeBSD's native format. 328 */ 329 static void 330 assign_rxopt(struct tcpcb *tp, uint16_t opt) 331 { 332 struct toepcb *toep = tp->t_toe; 333 struct inpcb *inp = tp->t_inpcb; 334 struct adapter *sc = td_adapter(toep->td); 335 336 INP_LOCK_ASSERT(inp); 337 338 toep->tcp_opt = opt; 339 toep->mtu_idx = G_TCPOPT_MSS(opt); 340 tp->t_maxseg = sc->params.mtus[toep->mtu_idx]; 341 if (inp->inp_inc.inc_flags & INC_ISIPV6) 342 tp->t_maxseg -= sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 343 else 344 tp->t_maxseg -= sizeof(struct ip) + sizeof(struct tcphdr); 345 346 toep->emss = tp->t_maxseg; 347 if (G_TCPOPT_TSTAMP(opt)) { 348 tp->t_flags |= TF_RCVD_TSTMP; /* timestamps ok */ 349 tp->ts_recent = 0; /* hmmm */ 350 tp->ts_recent_age = tcp_ts_getticks(); 351 toep->emss -= TCPOLEN_TSTAMP_APPA; 352 } 353 354 CTR6(KTR_CXGBE, "%s: tid %d, mtu_idx %u (%u), t_maxseg %u, emss %u", 355 __func__, toep->tid, toep->mtu_idx, 356 sc->params.mtus[G_TCPOPT_MSS(opt)], tp->t_maxseg, toep->emss); 357 358 if (G_TCPOPT_SACK(opt)) 359 tp->t_flags |= TF_SACK_PERMIT; /* should already be set */ 360 else 361 tp->t_flags &= ~TF_SACK_PERMIT; /* sack disallowed by peer */ 362 363 if (G_TCPOPT_WSCALE_OK(opt)) 364 tp->t_flags |= TF_RCVD_SCALE; 365 366 /* Doing window scaling? */ 367 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 368 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 369 tp->rcv_scale = tp->request_r_scale; 370 tp->snd_scale = G_TCPOPT_SND_WSCALE(opt); 371 } 372 } 373 374 /* 375 * Completes some final bits of initialization for just established connections 376 * and changes their state to TCPS_ESTABLISHED. 377 * 378 * The ISNs are from the exchange of SYNs. 
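 *
 * Sequence state is seeded from the handshake completed by the hardware:
 * snd_una, snd_nxt, and snd_max all start at iss + 1 (the SYN consumes one
 * sequence number), the receive window comes from opt0_rcv_bufsize, and the
 * TCP options reported by the chip are decoded by assign_rxopt() before the
 * connection's mandatory FLOWC work request is sent via send_flowc_wr().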
379 */ 380 void 381 make_established(struct toepcb *toep, uint32_t iss, uint32_t irs, uint16_t opt) 382 { 383 struct inpcb *inp = toep->inp; 384 struct socket *so = inp->inp_socket; 385 struct tcpcb *tp = intotcpcb(inp); 386 long bufsize; 387 uint16_t tcpopt = be16toh(opt); 388 struct flowc_tx_params ftxp; 389 390 INP_WLOCK_ASSERT(inp); 391 KASSERT(tp->t_state == TCPS_SYN_SENT || 392 tp->t_state == TCPS_SYN_RECEIVED, 393 ("%s: TCP state %s", __func__, tcpstates[tp->t_state])); 394 395 CTR6(KTR_CXGBE, "%s: tid %d, so %p, inp %p, tp %p, toep %p", 396 __func__, toep->tid, so, inp, tp, toep); 397 398 tcp_state_change(tp, TCPS_ESTABLISHED); 399 tp->t_starttime = ticks; 400 TCPSTAT_INC(tcps_connects); 401 402 tp->irs = irs; 403 tcp_rcvseqinit(tp); 404 tp->rcv_wnd = (u_int)toep->opt0_rcv_bufsize << 10; 405 tp->rcv_adv += tp->rcv_wnd; 406 tp->last_ack_sent = tp->rcv_nxt; 407 408 tp->iss = iss; 409 tcp_sendseqinit(tp); 410 tp->snd_una = iss + 1; 411 tp->snd_nxt = iss + 1; 412 tp->snd_max = iss + 1; 413 414 assign_rxopt(tp, tcpopt); 415 416 SOCKBUF_LOCK(&so->so_snd); 417 if (so->so_snd.sb_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf) 418 bufsize = V_tcp_autosndbuf_max; 419 else 420 bufsize = sbspace(&so->so_snd); 421 SOCKBUF_UNLOCK(&so->so_snd); 422 423 ftxp.snd_nxt = tp->snd_nxt; 424 ftxp.rcv_nxt = tp->rcv_nxt; 425 ftxp.snd_space = bufsize; 426 ftxp.mss = toep->emss; 427 send_flowc_wr(toep, &ftxp); 428 429 soisconnected(so); 430 } 431 432 int 433 send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits) 434 { 435 struct wrqe *wr; 436 struct cpl_rx_data_ack *req; 437 uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); 438 439 KASSERT(credits >= 0, ("%s: %d credits", __func__, credits)); 440 441 wr = alloc_wrqe(sizeof(*req), toep->ctrlq); 442 if (wr == NULL) 443 return (0); 444 req = wrtod(wr); 445 446 INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid); 447 req->credit_dack = htobe32(dack | V_RX_CREDITS(credits)); 448 449 t4_wrq_tx(sc, wr); 450 return (credits); 451 } 452 453 void 454 send_rx_modulate(struct adapter *sc, struct toepcb *toep) 455 { 456 struct wrqe *wr; 457 struct cpl_rx_data_ack *req; 458 459 wr = alloc_wrqe(sizeof(*req), toep->ctrlq); 460 if (wr == NULL) 461 return; 462 req = wrtod(wr); 463 464 INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid); 465 req->credit_dack = htobe32(F_RX_MODULATE_RX); 466 467 t4_wrq_tx(sc, wr); 468 } 469 470 void 471 t4_rcvd_locked(struct toedev *tod, struct tcpcb *tp) 472 { 473 struct adapter *sc = tod->tod_softc; 474 struct inpcb *inp = tp->t_inpcb; 475 struct socket *so = inp->inp_socket; 476 struct sockbuf *sb = &so->so_rcv; 477 struct toepcb *toep = tp->t_toe; 478 int rx_credits; 479 480 INP_WLOCK_ASSERT(inp); 481 SOCKBUF_LOCK_ASSERT(sb); 482 483 rx_credits = sbspace(sb) > tp->rcv_wnd ? 
sbspace(sb) - tp->rcv_wnd : 0; 484 if (toep->ulp_mode == ULP_MODE_TLS) { 485 if (toep->tls.rcv_over >= rx_credits) { 486 toep->tls.rcv_over -= rx_credits; 487 rx_credits = 0; 488 } else { 489 rx_credits -= toep->tls.rcv_over; 490 toep->tls.rcv_over = 0; 491 } 492 } 493 494 if (rx_credits > 0 && 495 (tp->rcv_wnd <= 32 * 1024 || rx_credits >= 64 * 1024 || 496 (rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) || 497 sbused(sb) + tp->rcv_wnd < sb->sb_lowat)) { 498 rx_credits = send_rx_credits(sc, toep, rx_credits); 499 tp->rcv_wnd += rx_credits; 500 tp->rcv_adv += rx_credits; 501 } else if (toep->flags & TPF_FORCE_CREDITS) 502 send_rx_modulate(sc, toep); 503 } 504 505 void 506 t4_rcvd(struct toedev *tod, struct tcpcb *tp) 507 { 508 struct inpcb *inp = tp->t_inpcb; 509 struct socket *so = inp->inp_socket; 510 struct sockbuf *sb = &so->so_rcv; 511 512 SOCKBUF_LOCK(sb); 513 t4_rcvd_locked(tod, tp); 514 SOCKBUF_UNLOCK(sb); 515 } 516 517 /* 518 * Close a connection by sending a CPL_CLOSE_CON_REQ message. 519 */ 520 int 521 t4_close_conn(struct adapter *sc, struct toepcb *toep) 522 { 523 struct wrqe *wr; 524 struct cpl_close_con_req *req; 525 unsigned int tid = toep->tid; 526 527 CTR3(KTR_CXGBE, "%s: tid %u%s", __func__, toep->tid, 528 toep->flags & TPF_FIN_SENT ? ", IGNORED" : ""); 529 530 if (toep->flags & TPF_FIN_SENT) 531 return (0); 532 533 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 534 ("%s: flowc_wr not sent for tid %u.", __func__, tid)); 535 536 wr = alloc_wrqe(sizeof(*req), toep->ofld_txq); 537 if (wr == NULL) { 538 /* XXX */ 539 panic("%s: allocation failure.", __func__); 540 } 541 req = wrtod(wr); 542 543 req->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) | 544 V_FW_WR_IMMDLEN(sizeof(*req) - sizeof(req->wr))); 545 req->wr.wr_mid = htonl(V_FW_WR_LEN16(howmany(sizeof(*req), 16)) | 546 V_FW_WR_FLOWID(tid)); 547 req->wr.wr_lo = cpu_to_be64(0); 548 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid)); 549 req->rsvd = 0; 550 551 toep->flags |= TPF_FIN_SENT; 552 toep->flags &= ~TPF_SEND_FIN; 553 t4_l2t_send(sc, wr, toep->l2te); 554 555 return (0); 556 } 557 558 #define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16) 559 #define MIN_OFLD_TX_CREDITS (howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16)) 560 561 /* Maximum amount of immediate data we could stuff in a WR */ 562 static inline int 563 max_imm_payload(int tx_credits) 564 { 565 const int n = 2; /* Use only up to 2 desc for imm. 
data WR */ 566 567 KASSERT(tx_credits >= 0 && 568 tx_credits <= MAX_OFLD_TX_CREDITS, 569 ("%s: %d credits", __func__, tx_credits)); 570 571 if (tx_credits < MIN_OFLD_TX_CREDITS) 572 return (0); 573 574 if (tx_credits >= (n * EQ_ESIZE) / 16) 575 return ((n * EQ_ESIZE) - sizeof(struct fw_ofld_tx_data_wr)); 576 else 577 return (tx_credits * 16 - sizeof(struct fw_ofld_tx_data_wr)); 578 } 579 580 /* Maximum number of SGL entries we could stuff in a WR */ 581 static inline int 582 max_dsgl_nsegs(int tx_credits) 583 { 584 int nseg = 1; /* ulptx_sgl has room for 1, rest ulp_tx_sge_pair */ 585 int sge_pair_credits = tx_credits - MIN_OFLD_TX_CREDITS; 586 587 KASSERT(tx_credits >= 0 && 588 tx_credits <= MAX_OFLD_TX_CREDITS, 589 ("%s: %d credits", __func__, tx_credits)); 590 591 if (tx_credits < MIN_OFLD_TX_CREDITS) 592 return (0); 593 594 nseg += 2 * (sge_pair_credits * 16 / 24); 595 if ((sge_pair_credits * 16) % 24 == 16) 596 nseg++; 597 598 return (nseg); 599 } 600 601 static inline void 602 write_tx_wr(void *dst, struct toepcb *toep, unsigned int immdlen, 603 unsigned int plen, uint8_t credits, int shove, int ulp_submode, int txalign) 604 { 605 struct fw_ofld_tx_data_wr *txwr = dst; 606 607 txwr->op_to_immdlen = htobe32(V_WR_OP(FW_OFLD_TX_DATA_WR) | 608 V_FW_WR_IMMDLEN(immdlen)); 609 txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) | 610 V_FW_WR_LEN16(credits)); 611 txwr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(toep->ulp_mode) | 612 V_TX_ULP_SUBMODE(ulp_submode) | V_TX_URG(0) | V_TX_SHOVE(shove)); 613 txwr->plen = htobe32(plen); 614 615 if (txalign > 0) { 616 struct tcpcb *tp = intotcpcb(toep->inp); 617 618 if (plen < 2 * toep->emss) 619 txwr->lsodisable_to_flags |= 620 htobe32(F_FW_OFLD_TX_DATA_WR_LSODISABLE); 621 else 622 txwr->lsodisable_to_flags |= 623 htobe32(F_FW_OFLD_TX_DATA_WR_ALIGNPLD | 624 (tp->t_flags & TF_NODELAY ? 0 : 625 F_FW_OFLD_TX_DATA_WR_ALIGNPLDSHOVE)); 626 } 627 } 628 629 /* 630 * Generate a DSGL from a starting mbuf. The total number of segments and the 631 * maximum segments in any one mbuf are provided. 632 */ 633 static void 634 write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n) 635 { 636 struct mbuf *m; 637 struct ulptx_sgl *usgl = dst; 638 int i, j, rc; 639 struct sglist sg; 640 struct sglist_seg segs[n]; 641 642 KASSERT(nsegs > 0, ("%s: nsegs 0", __func__)); 643 644 sglist_init(&sg, n, segs); 645 usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) | 646 V_ULPTX_NSGE(nsegs)); 647 648 i = -1; 649 for (m = start; m != stop; m = m->m_next) { 650 if (IS_AIOTX_MBUF(m)) 651 rc = sglist_append_vmpages(&sg, aiotx_mbuf_pages(m), 652 aiotx_mbuf_pgoff(m), m->m_len); 653 else if (m->m_flags & M_NOMAP) 654 rc = sglist_append_mb_ext_pgs(&sg, m); 655 else 656 rc = sglist_append(&sg, mtod(m, void *), m->m_len); 657 if (__predict_false(rc != 0)) 658 panic("%s: sglist_append %d", __func__, rc); 659 660 for (j = 0; j < sg.sg_nseg; i++, j++) { 661 if (i < 0) { 662 usgl->len0 = htobe32(segs[j].ss_len); 663 usgl->addr0 = htobe64(segs[j].ss_paddr); 664 } else { 665 usgl->sge[i / 2].len[i & 1] = 666 htobe32(segs[j].ss_len); 667 usgl->sge[i / 2].addr[i & 1] = 668 htobe64(segs[j].ss_paddr); 669 } 670 #ifdef INVARIANTS 671 nsegs--; 672 #endif 673 } 674 sglist_reset(&sg); 675 } 676 if (i & 1) 677 usgl->sge[i / 2].len[1] = htobe32(0); 678 KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, stop %p", 679 __func__, nsegs, start, stop)); 680 } 681 682 /* 683 * Max number of SGL entries an offload tx work request can have. 
This is 41 684 * (1 + 40) for a full 512B work request. 685 * fw_ofld_tx_data_wr(16B) + ulptx_sgl(16B, 1) + ulptx_sge_pair(480B, 40) 686 */ 687 #define OFLD_SGL_LEN (41) 688 689 /* 690 * Send data and/or a FIN to the peer. 691 * 692 * The socket's so_snd buffer consists of a stream of data starting with sb_mb 693 * and linked together with m_next. sb_sndptr, if set, is the last mbuf that 694 * was transmitted. 695 * 696 * drop indicates the number of bytes that should be dropped from the head of 697 * the send buffer. It is an optimization that lets do_fw4_ack avoid creating 698 * contention on the send buffer lock (before this change it used to do 699 * sowwakeup and then t4_push_frames right after that when recovering from tx 700 * stalls). When drop is set this function MUST drop the bytes and wake up any 701 * writers. 702 */ 703 void 704 t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop) 705 { 706 struct mbuf *sndptr, *m, *sb_sndptr; 707 struct fw_ofld_tx_data_wr *txwr; 708 struct wrqe *wr; 709 u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf; 710 struct inpcb *inp = toep->inp; 711 struct tcpcb *tp = intotcpcb(inp); 712 struct socket *so = inp->inp_socket; 713 struct sockbuf *sb = &so->so_snd; 714 int tx_credits, shove, compl, sowwakeup; 715 struct ofld_tx_sdesc *txsd; 716 bool aiotx_mbuf_seen; 717 718 INP_WLOCK_ASSERT(inp); 719 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 720 ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid)); 721 722 KASSERT(toep->ulp_mode == ULP_MODE_NONE || 723 toep->ulp_mode == ULP_MODE_TCPDDP || 724 toep->ulp_mode == ULP_MODE_TLS || 725 toep->ulp_mode == ULP_MODE_RDMA, 726 ("%s: ulp_mode %u for toep %p", __func__, toep->ulp_mode, toep)); 727 728 #ifdef VERBOSE_TRACES 729 CTR5(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d", 730 __func__, toep->tid, toep->flags, tp->t_flags, drop); 731 #endif 732 if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) 733 return; 734 735 #ifdef RATELIMIT 736 if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) && 737 (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) { 738 inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; 739 } 740 #endif 741 742 /* 743 * This function doesn't resume by itself. Someone else must clear the 744 * flag and call this function. 745 */ 746 if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) { 747 KASSERT(drop == 0, 748 ("%s: drop (%d) != 0 but tx is suspended", __func__, drop)); 749 return; 750 } 751 752 txsd = &toep->txsd[toep->txsd_pidx]; 753 do { 754 tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS); 755 max_imm = max_imm_payload(tx_credits); 756 max_nsegs = max_dsgl_nsegs(tx_credits); 757 758 SOCKBUF_LOCK(sb); 759 sowwakeup = drop; 760 if (drop) { 761 sbdrop_locked(sb, drop); 762 drop = 0; 763 } 764 sb_sndptr = sb->sb_sndptr; 765 sndptr = sb_sndptr ? 
sb_sndptr->m_next : sb->sb_mb; 766 plen = 0; 767 nsegs = 0; 768 max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */ 769 aiotx_mbuf_seen = false; 770 for (m = sndptr; m != NULL; m = m->m_next) { 771 int n; 772 773 if (IS_AIOTX_MBUF(m)) 774 n = sglist_count_vmpages(aiotx_mbuf_pages(m), 775 aiotx_mbuf_pgoff(m), m->m_len); 776 else if (m->m_flags & M_NOMAP) 777 n = sglist_count_mb_ext_pgs(m); 778 else 779 n = sglist_count(mtod(m, void *), m->m_len); 780 781 nsegs += n; 782 plen += m->m_len; 783 784 /* This mbuf sent us _over_ the nsegs limit, back out */ 785 if (plen > max_imm && nsegs > max_nsegs) { 786 nsegs -= n; 787 plen -= m->m_len; 788 if (plen == 0) { 789 /* Too few credits */ 790 toep->flags |= TPF_TX_SUSPENDED; 791 if (sowwakeup) { 792 if (!TAILQ_EMPTY( 793 &toep->aiotx_jobq)) 794 t4_aiotx_queue_toep(so, 795 toep); 796 sowwakeup_locked(so); 797 } else 798 SOCKBUF_UNLOCK(sb); 799 SOCKBUF_UNLOCK_ASSERT(sb); 800 return; 801 } 802 break; 803 } 804 805 if (IS_AIOTX_MBUF(m)) 806 aiotx_mbuf_seen = true; 807 if (max_nsegs_1mbuf < n) 808 max_nsegs_1mbuf = n; 809 sb_sndptr = m; /* new sb->sb_sndptr if all goes well */ 810 811 /* This mbuf put us right at the max_nsegs limit */ 812 if (plen > max_imm && nsegs == max_nsegs) { 813 m = m->m_next; 814 break; 815 } 816 } 817 818 if (sbused(sb) > sb->sb_hiwat * 5 / 8 && 819 toep->plen_nocompl + plen >= sb->sb_hiwat / 4) 820 compl = 1; 821 else 822 compl = 0; 823 824 if (sb->sb_flags & SB_AUTOSIZE && 825 V_tcp_do_autosndbuf && 826 sb->sb_hiwat < V_tcp_autosndbuf_max && 827 sbused(sb) >= sb->sb_hiwat * 7 / 8) { 828 int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc, 829 V_tcp_autosndbuf_max); 830 831 if (!sbreserve_locked(sb, newsize, so, NULL)) 832 sb->sb_flags &= ~SB_AUTOSIZE; 833 else 834 sowwakeup = 1; /* room available */ 835 } 836 if (sowwakeup) { 837 if (!TAILQ_EMPTY(&toep->aiotx_jobq)) 838 t4_aiotx_queue_toep(so, toep); 839 sowwakeup_locked(so); 840 } else 841 SOCKBUF_UNLOCK(sb); 842 SOCKBUF_UNLOCK_ASSERT(sb); 843 844 /* nothing to send */ 845 if (plen == 0) { 846 KASSERT(m == NULL, 847 ("%s: nothing to send, but m != NULL", __func__)); 848 break; 849 } 850 851 if (__predict_false(toep->flags & TPF_FIN_SENT)) 852 panic("%s: excess tx.", __func__); 853 854 shove = m == NULL && !(tp->t_flags & TF_MORETOCOME); 855 if (plen <= max_imm && !aiotx_mbuf_seen) { 856 857 /* Immediate data tx */ 858 859 wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16), 860 toep->ofld_txq); 861 if (wr == NULL) { 862 /* XXX: how will we recover from this? */ 863 toep->flags |= TPF_TX_SUSPENDED; 864 return; 865 } 866 txwr = wrtod(wr); 867 credits = howmany(wr->wr_len, 16); 868 write_tx_wr(txwr, toep, plen, plen, credits, shove, 0, 869 sc->tt.tx_align); 870 m_copydata(sndptr, 0, plen, (void *)(txwr + 1)); 871 nsegs = 0; 872 } else { 873 int wr_len; 874 875 /* DSGL tx */ 876 877 wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) + 878 ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8; 879 wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq); 880 if (wr == NULL) { 881 /* XXX: how will we recover from this? 
*/ 882 toep->flags |= TPF_TX_SUSPENDED; 883 return; 884 } 885 txwr = wrtod(wr); 886 credits = howmany(wr_len, 16); 887 write_tx_wr(txwr, toep, 0, plen, credits, shove, 0, 888 sc->tt.tx_align); 889 write_tx_sgl(txwr + 1, sndptr, m, nsegs, 890 max_nsegs_1mbuf); 891 if (wr_len & 0xf) { 892 uint64_t *pad = (uint64_t *) 893 ((uintptr_t)txwr + wr_len); 894 *pad = 0; 895 } 896 } 897 898 KASSERT(toep->tx_credits >= credits, 899 ("%s: not enough credits", __func__)); 900 901 toep->tx_credits -= credits; 902 toep->tx_nocompl += credits; 903 toep->plen_nocompl += plen; 904 if (toep->tx_credits <= toep->tx_total * 3 / 8 && 905 toep->tx_nocompl >= toep->tx_total / 4) 906 compl = 1; 907 908 if (compl || toep->ulp_mode == ULP_MODE_RDMA) { 909 txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL); 910 toep->tx_nocompl = 0; 911 toep->plen_nocompl = 0; 912 } 913 914 tp->snd_nxt += plen; 915 tp->snd_max += plen; 916 917 SOCKBUF_LOCK(sb); 918 KASSERT(sb_sndptr, ("%s: sb_sndptr is NULL", __func__)); 919 sb->sb_sndptr = sb_sndptr; 920 SOCKBUF_UNLOCK(sb); 921 922 toep->flags |= TPF_TX_DATA_SENT; 923 if (toep->tx_credits < MIN_OFLD_TX_CREDITS) 924 toep->flags |= TPF_TX_SUSPENDED; 925 926 KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); 927 txsd->plen = plen; 928 txsd->tx_credits = credits; 929 txsd++; 930 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) { 931 toep->txsd_pidx = 0; 932 txsd = &toep->txsd[0]; 933 } 934 toep->txsd_avail--; 935 936 t4_l2t_send(sc, wr, toep->l2te); 937 } while (m != NULL); 938 939 /* Send a FIN if requested, but only if there's no more data to send */ 940 if (m == NULL && toep->flags & TPF_SEND_FIN) 941 t4_close_conn(sc, toep); 942 } 943 944 static inline void 945 rqdrop_locked(struct mbufq *q, int plen) 946 { 947 struct mbuf *m; 948 949 while (plen > 0) { 950 m = mbufq_dequeue(q); 951 952 /* Too many credits. */ 953 MPASS(m != NULL); 954 M_ASSERTPKTHDR(m); 955 956 /* Partial credits. */ 957 MPASS(plen >= m->m_pkthdr.len); 958 959 plen -= m->m_pkthdr.len; 960 m_freem(m); 961 } 962 } 963 964 void 965 t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop) 966 { 967 struct mbuf *sndptr, *m; 968 struct fw_ofld_tx_data_wr *txwr; 969 struct wrqe *wr; 970 u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf; 971 u_int adjusted_plen, ulp_submode; 972 struct inpcb *inp = toep->inp; 973 struct tcpcb *tp = intotcpcb(inp); 974 int tx_credits, shove; 975 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; 976 struct mbufq *pduq = &toep->ulp_pduq; 977 static const u_int ulp_extra_len[] = {0, 4, 4, 8}; 978 979 INP_WLOCK_ASSERT(inp); 980 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 981 ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid)); 982 KASSERT(toep->ulp_mode == ULP_MODE_ISCSI, 983 ("%s: ulp_mode %u for toep %p", __func__, toep->ulp_mode, toep)); 984 985 if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) 986 return; 987 988 /* 989 * This function doesn't resume by itself. Someone else must clear the 990 * flag and call this function. 
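 * In practice that is do_fw4_ack(): once at least a quarter of the
 * connection's tx credits have been returned it clears TPF_TX_SUSPENDED and
 * calls t4_push_pdus()/t4_push_frames() again.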
991 */ 992 if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) { 993 KASSERT(drop == 0, 994 ("%s: drop (%d) != 0 but tx is suspended", __func__, drop)); 995 return; 996 } 997 998 if (drop) 999 rqdrop_locked(&toep->ulp_pdu_reclaimq, drop); 1000 1001 while ((sndptr = mbufq_first(pduq)) != NULL) { 1002 M_ASSERTPKTHDR(sndptr); 1003 1004 tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS); 1005 max_imm = max_imm_payload(tx_credits); 1006 max_nsegs = max_dsgl_nsegs(tx_credits); 1007 1008 plen = 0; 1009 nsegs = 0; 1010 max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */ 1011 for (m = sndptr; m != NULL; m = m->m_next) { 1012 int n = sglist_count(mtod(m, void *), m->m_len); 1013 1014 nsegs += n; 1015 plen += m->m_len; 1016 1017 /* 1018 * This mbuf would send us _over_ the nsegs limit. 1019 * Suspend tx because the PDU can't be sent out. 1020 */ 1021 if (plen > max_imm && nsegs > max_nsegs) { 1022 toep->flags |= TPF_TX_SUSPENDED; 1023 return; 1024 } 1025 1026 if (max_nsegs_1mbuf < n) 1027 max_nsegs_1mbuf = n; 1028 } 1029 1030 if (__predict_false(toep->flags & TPF_FIN_SENT)) 1031 panic("%s: excess tx.", __func__); 1032 1033 /* 1034 * We have a PDU to send. All of it goes out in one WR so 'm' 1035 * is NULL. A PDU's length is always a multiple of 4. 1036 */ 1037 MPASS(m == NULL); 1038 MPASS((plen & 3) == 0); 1039 MPASS(sndptr->m_pkthdr.len == plen); 1040 1041 shove = !(tp->t_flags & TF_MORETOCOME); 1042 ulp_submode = mbuf_ulp_submode(sndptr); 1043 MPASS(ulp_submode < nitems(ulp_extra_len)); 1044 1045 /* 1046 * plen doesn't include header and data digests, which are 1047 * generated and inserted in the right places by the TOE, but 1048 * they do occupy TCP sequence space and need to be accounted 1049 * for. 1050 */ 1051 adjusted_plen = plen + ulp_extra_len[ulp_submode]; 1052 if (plen <= max_imm) { 1053 1054 /* Immediate data tx */ 1055 1056 wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16), 1057 toep->ofld_txq); 1058 if (wr == NULL) { 1059 /* XXX: how will we recover from this? */ 1060 toep->flags |= TPF_TX_SUSPENDED; 1061 return; 1062 } 1063 txwr = wrtod(wr); 1064 credits = howmany(wr->wr_len, 16); 1065 write_tx_wr(txwr, toep, plen, adjusted_plen, credits, 1066 shove, ulp_submode, sc->tt.tx_align); 1067 m_copydata(sndptr, 0, plen, (void *)(txwr + 1)); 1068 nsegs = 0; 1069 } else { 1070 int wr_len; 1071 1072 /* DSGL tx */ 1073 wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) + 1074 ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8; 1075 wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq); 1076 if (wr == NULL) { 1077 /* XXX: how will we recover from this? 
*/ 1078 toep->flags |= TPF_TX_SUSPENDED; 1079 return; 1080 } 1081 txwr = wrtod(wr); 1082 credits = howmany(wr_len, 16); 1083 write_tx_wr(txwr, toep, 0, adjusted_plen, credits, 1084 shove, ulp_submode, sc->tt.tx_align); 1085 write_tx_sgl(txwr + 1, sndptr, m, nsegs, 1086 max_nsegs_1mbuf); 1087 if (wr_len & 0xf) { 1088 uint64_t *pad = (uint64_t *) 1089 ((uintptr_t)txwr + wr_len); 1090 *pad = 0; 1091 } 1092 } 1093 1094 KASSERT(toep->tx_credits >= credits, 1095 ("%s: not enough credits", __func__)); 1096 1097 m = mbufq_dequeue(pduq); 1098 MPASS(m == sndptr); 1099 mbufq_enqueue(&toep->ulp_pdu_reclaimq, m); 1100 1101 toep->tx_credits -= credits; 1102 toep->tx_nocompl += credits; 1103 toep->plen_nocompl += plen; 1104 if (toep->tx_credits <= toep->tx_total * 3 / 8 && 1105 toep->tx_nocompl >= toep->tx_total / 4) { 1106 txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL); 1107 toep->tx_nocompl = 0; 1108 toep->plen_nocompl = 0; 1109 } 1110 1111 tp->snd_nxt += adjusted_plen; 1112 tp->snd_max += adjusted_plen; 1113 1114 toep->flags |= TPF_TX_DATA_SENT; 1115 if (toep->tx_credits < MIN_OFLD_TX_CREDITS) 1116 toep->flags |= TPF_TX_SUSPENDED; 1117 1118 KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); 1119 txsd->plen = plen; 1120 txsd->tx_credits = credits; 1121 txsd++; 1122 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) { 1123 toep->txsd_pidx = 0; 1124 txsd = &toep->txsd[0]; 1125 } 1126 toep->txsd_avail--; 1127 1128 t4_l2t_send(sc, wr, toep->l2te); 1129 } 1130 1131 /* Send a FIN if requested, but only if there are no more PDUs to send */ 1132 if (mbufq_first(pduq) == NULL && toep->flags & TPF_SEND_FIN) 1133 t4_close_conn(sc, toep); 1134 } 1135 1136 int 1137 t4_tod_output(struct toedev *tod, struct tcpcb *tp) 1138 { 1139 struct adapter *sc = tod->tod_softc; 1140 #ifdef INVARIANTS 1141 struct inpcb *inp = tp->t_inpcb; 1142 #endif 1143 struct toepcb *toep = tp->t_toe; 1144 1145 INP_WLOCK_ASSERT(inp); 1146 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1147 ("%s: inp %p dropped.", __func__, inp)); 1148 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 1149 1150 if (toep->ulp_mode == ULP_MODE_ISCSI) 1151 t4_push_pdus(sc, toep, 0); 1152 else if (tls_tx_key(toep)) 1153 t4_push_tls_records(sc, toep, 0); 1154 else 1155 t4_push_frames(sc, toep, 0); 1156 1157 return (0); 1158 } 1159 1160 int 1161 t4_send_fin(struct toedev *tod, struct tcpcb *tp) 1162 { 1163 struct adapter *sc = tod->tod_softc; 1164 #ifdef INVARIANTS 1165 struct inpcb *inp = tp->t_inpcb; 1166 #endif 1167 struct toepcb *toep = tp->t_toe; 1168 1169 INP_WLOCK_ASSERT(inp); 1170 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1171 ("%s: inp %p dropped.", __func__, inp)); 1172 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 1173 1174 toep->flags |= TPF_SEND_FIN; 1175 if (tp->t_state >= TCPS_ESTABLISHED) { 1176 if (toep->ulp_mode == ULP_MODE_ISCSI) 1177 t4_push_pdus(sc, toep, 0); 1178 else if (tls_tx_key(toep)) 1179 t4_push_tls_records(sc, toep, 0); 1180 else 1181 t4_push_frames(sc, toep, 0); 1182 } 1183 1184 return (0); 1185 } 1186 1187 int 1188 t4_send_rst(struct toedev *tod, struct tcpcb *tp) 1189 { 1190 struct adapter *sc = tod->tod_softc; 1191 #if defined(INVARIANTS) 1192 struct inpcb *inp = tp->t_inpcb; 1193 #endif 1194 struct toepcb *toep = tp->t_toe; 1195 1196 INP_WLOCK_ASSERT(inp); 1197 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1198 ("%s: inp %p dropped.", __func__, inp)); 1199 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 1200 1201 /* hmmmm */ 1202 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 1203 ("%s: flowc for tid %u 
[%s] not sent already", 1204 __func__, toep->tid, tcpstates[tp->t_state])); 1205 1206 send_reset(sc, toep, 0); 1207 return (0); 1208 } 1209 1210 /* 1211 * Peer has sent us a FIN. 1212 */ 1213 static int 1214 do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1215 { 1216 struct adapter *sc = iq->adapter; 1217 const struct cpl_peer_close *cpl = (const void *)(rss + 1); 1218 unsigned int tid = GET_TID(cpl); 1219 struct toepcb *toep = lookup_tid(sc, tid); 1220 struct inpcb *inp = toep->inp; 1221 struct tcpcb *tp = NULL; 1222 struct socket *so; 1223 struct epoch_tracker et; 1224 #ifdef INVARIANTS 1225 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1226 #endif 1227 1228 KASSERT(opcode == CPL_PEER_CLOSE, 1229 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1230 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1231 1232 if (__predict_false(toep->flags & TPF_SYNQE)) { 1233 /* 1234 * do_pass_establish must have run before do_peer_close and if 1235 * this is still a synqe instead of a toepcb then the connection 1236 * must be getting aborted. 1237 */ 1238 MPASS(toep->flags & TPF_ABORT_SHUTDOWN); 1239 CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid, 1240 toep, toep->flags); 1241 return (0); 1242 } 1243 1244 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1245 1246 CURVNET_SET(toep->vnet); 1247 INP_INFO_RLOCK_ET(&V_tcbinfo, et); 1248 INP_WLOCK(inp); 1249 tp = intotcpcb(inp); 1250 1251 CTR6(KTR_CXGBE, 1252 "%s: tid %u (%s), toep_flags 0x%x, ddp_flags 0x%x, inp %p", 1253 __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags, 1254 toep->ddp.flags, inp); 1255 1256 if (toep->flags & TPF_ABORT_SHUTDOWN) 1257 goto done; 1258 1259 tp->rcv_nxt++; /* FIN */ 1260 1261 so = inp->inp_socket; 1262 socantrcvmore(so); 1263 if (toep->ulp_mode == ULP_MODE_TCPDDP) { 1264 DDP_LOCK(toep); 1265 if (__predict_false(toep->ddp.flags & 1266 (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE))) 1267 handle_ddp_close(toep, tp, cpl->rcv_nxt); 1268 DDP_UNLOCK(toep); 1269 } 1270 1271 if (toep->ulp_mode != ULP_MODE_RDMA) { 1272 KASSERT(tp->rcv_nxt == be32toh(cpl->rcv_nxt), 1273 ("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt, 1274 be32toh(cpl->rcv_nxt))); 1275 } 1276 1277 switch (tp->t_state) { 1278 case TCPS_SYN_RECEIVED: 1279 tp->t_starttime = ticks; 1280 /* FALLTHROUGH */ 1281 1282 case TCPS_ESTABLISHED: 1283 tcp_state_change(tp, TCPS_CLOSE_WAIT); 1284 break; 1285 1286 case TCPS_FIN_WAIT_1: 1287 tcp_state_change(tp, TCPS_CLOSING); 1288 break; 1289 1290 case TCPS_FIN_WAIT_2: 1291 tcp_twstart(tp); 1292 INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ 1293 INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); 1294 CURVNET_RESTORE(); 1295 1296 INP_WLOCK(inp); 1297 final_cpl_received(toep); 1298 return (0); 1299 1300 default: 1301 log(LOG_ERR, "%s: TID %u received CPL_PEER_CLOSE in state %d\n", 1302 __func__, tid, tp->t_state); 1303 } 1304 done: 1305 INP_WUNLOCK(inp); 1306 INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); 1307 CURVNET_RESTORE(); 1308 return (0); 1309 } 1310 1311 /* 1312 * Peer has ACK'd our FIN. 
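 * The reply carries the chip's final snd_nxt; snd_una is advanced to
 * snd_nxt - 1 so that the FIN itself is not counted as acked payload, and
 * the connection then moves to TIME_WAIT, finishes closing, or enters
 * FIN_WAIT_2 depending on the state it was in when the FIN went out.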
1313 */ 1314 static int 1315 do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss, 1316 struct mbuf *m) 1317 { 1318 struct adapter *sc = iq->adapter; 1319 const struct cpl_close_con_rpl *cpl = (const void *)(rss + 1); 1320 unsigned int tid = GET_TID(cpl); 1321 struct toepcb *toep = lookup_tid(sc, tid); 1322 struct inpcb *inp = toep->inp; 1323 struct tcpcb *tp = NULL; 1324 struct socket *so = NULL; 1325 struct epoch_tracker et; 1326 #ifdef INVARIANTS 1327 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1328 #endif 1329 1330 KASSERT(opcode == CPL_CLOSE_CON_RPL, 1331 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1332 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1333 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1334 1335 CURVNET_SET(toep->vnet); 1336 INP_INFO_RLOCK_ET(&V_tcbinfo, et); 1337 INP_WLOCK(inp); 1338 tp = intotcpcb(inp); 1339 1340 CTR4(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x", 1341 __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags); 1342 1343 if (toep->flags & TPF_ABORT_SHUTDOWN) 1344 goto done; 1345 1346 so = inp->inp_socket; 1347 tp->snd_una = be32toh(cpl->snd_nxt) - 1; /* exclude FIN */ 1348 1349 switch (tp->t_state) { 1350 case TCPS_CLOSING: /* see TCPS_FIN_WAIT_2 in do_peer_close too */ 1351 tcp_twstart(tp); 1352 release: 1353 INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ 1354 INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); 1355 CURVNET_RESTORE(); 1356 1357 INP_WLOCK(inp); 1358 final_cpl_received(toep); /* no more CPLs expected */ 1359 1360 return (0); 1361 case TCPS_LAST_ACK: 1362 if (tcp_close(tp)) 1363 INP_WUNLOCK(inp); 1364 goto release; 1365 1366 case TCPS_FIN_WAIT_1: 1367 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 1368 soisdisconnected(so); 1369 tcp_state_change(tp, TCPS_FIN_WAIT_2); 1370 break; 1371 1372 default: 1373 log(LOG_ERR, 1374 "%s: TID %u received CPL_CLOSE_CON_RPL in state %s\n", 1375 __func__, tid, tcpstates[tp->t_state]); 1376 } 1377 done: 1378 INP_WUNLOCK(inp); 1379 INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); 1380 CURVNET_RESTORE(); 1381 return (0); 1382 } 1383 1384 void 1385 send_abort_rpl(struct adapter *sc, struct sge_wrq *ofld_txq, int tid, 1386 int rst_status) 1387 { 1388 struct wrqe *wr; 1389 struct cpl_abort_rpl *cpl; 1390 1391 wr = alloc_wrqe(sizeof(*cpl), ofld_txq); 1392 if (wr == NULL) { 1393 /* XXX */ 1394 panic("%s: allocation failure.", __func__); 1395 } 1396 cpl = wrtod(wr); 1397 1398 INIT_TP_WR_MIT_CPL(cpl, CPL_ABORT_RPL, tid); 1399 cpl->cmd = rst_status; 1400 1401 t4_wrq_tx(sc, wr); 1402 } 1403 1404 static int 1405 abort_status_to_errno(struct tcpcb *tp, unsigned int abort_reason) 1406 { 1407 switch (abort_reason) { 1408 case CPL_ERR_BAD_SYN: 1409 case CPL_ERR_CONN_RESET: 1410 return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET); 1411 case CPL_ERR_XMIT_TIMEDOUT: 1412 case CPL_ERR_PERSIST_TIMEDOUT: 1413 case CPL_ERR_FINWAIT2_TIMEDOUT: 1414 case CPL_ERR_KEEPALIVE_TIMEDOUT: 1415 return (ETIMEDOUT); 1416 default: 1417 return (EIO); 1418 } 1419 } 1420 1421 /* 1422 * TCP RST from the peer, timeout, or some other such critical error. 
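 * The reason code in the CPL is mapped to an errno by
 * abort_status_to_errno() (ECONNRESET or EPIPE for resets, ETIMEDOUT for
 * the various timeouts, EIO otherwise), "negative advice" notifications are
 * ignored, and the chip is always owed a CPL_ABORT_RPL in response.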
1423 */ 1424 static int 1425 do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1426 { 1427 struct adapter *sc = iq->adapter; 1428 const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1); 1429 unsigned int tid = GET_TID(cpl); 1430 struct toepcb *toep = lookup_tid(sc, tid); 1431 struct sge_wrq *ofld_txq = toep->ofld_txq; 1432 struct inpcb *inp; 1433 struct tcpcb *tp; 1434 struct epoch_tracker et; 1435 #ifdef INVARIANTS 1436 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1437 #endif 1438 1439 KASSERT(opcode == CPL_ABORT_REQ_RSS, 1440 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1441 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1442 1443 if (toep->flags & TPF_SYNQE) 1444 return (do_abort_req_synqe(iq, rss, m)); 1445 1446 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1447 1448 if (negative_advice(cpl->status)) { 1449 CTR4(KTR_CXGBE, "%s: negative advice %d for tid %d (0x%x)", 1450 __func__, cpl->status, tid, toep->flags); 1451 return (0); /* Ignore negative advice */ 1452 } 1453 1454 inp = toep->inp; 1455 CURVNET_SET(toep->vnet); 1456 INP_INFO_RLOCK_ET(&V_tcbinfo, et); /* for tcp_close */ 1457 INP_WLOCK(inp); 1458 1459 tp = intotcpcb(inp); 1460 1461 CTR6(KTR_CXGBE, 1462 "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x, status %d", 1463 __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags, 1464 inp->inp_flags, cpl->status); 1465 1466 /* 1467 * If we'd initiated an abort earlier the reply to it is responsible for 1468 * cleaning up resources. Otherwise we tear everything down right here 1469 * right now. We owe the T4 a CPL_ABORT_RPL no matter what. 1470 */ 1471 if (toep->flags & TPF_ABORT_SHUTDOWN) { 1472 INP_WUNLOCK(inp); 1473 goto done; 1474 } 1475 toep->flags |= TPF_ABORT_SHUTDOWN; 1476 1477 if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) { 1478 struct socket *so = inp->inp_socket; 1479 1480 if (so != NULL) 1481 so_error_set(so, abort_status_to_errno(tp, 1482 cpl->status)); 1483 tp = tcp_close(tp); 1484 if (tp == NULL) 1485 INP_WLOCK(inp); /* re-acquire */ 1486 } 1487 1488 final_cpl_received(toep); 1489 done: 1490 INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); 1491 CURVNET_RESTORE(); 1492 send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST); 1493 return (0); 1494 } 1495 1496 /* 1497 * Reply to the CPL_ABORT_REQ (send_reset) 1498 */ 1499 static int 1500 do_abort_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1501 { 1502 struct adapter *sc = iq->adapter; 1503 const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1); 1504 unsigned int tid = GET_TID(cpl); 1505 struct toepcb *toep = lookup_tid(sc, tid); 1506 struct inpcb *inp = toep->inp; 1507 #ifdef INVARIANTS 1508 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1509 #endif 1510 1511 KASSERT(opcode == CPL_ABORT_RPL_RSS, 1512 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1513 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1514 1515 if (toep->flags & TPF_SYNQE) 1516 return (do_abort_rpl_synqe(iq, rss, m)); 1517 1518 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1519 1520 CTR5(KTR_CXGBE, "%s: tid %u, toep %p, inp %p, status %d", 1521 __func__, tid, toep, inp, cpl->status); 1522 1523 KASSERT(toep->flags & TPF_ABORT_SHUTDOWN, 1524 ("%s: wasn't expecting abort reply", __func__)); 1525 1526 INP_WLOCK(inp); 1527 final_cpl_received(toep); 1528 1529 return (0); 1530 } 1531 1532 static int 1533 do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1534 
{ 1535 struct adapter *sc = iq->adapter; 1536 const struct cpl_rx_data *cpl = mtod(m, const void *); 1537 unsigned int tid = GET_TID(cpl); 1538 struct toepcb *toep = lookup_tid(sc, tid); 1539 struct inpcb *inp = toep->inp; 1540 struct tcpcb *tp; 1541 struct socket *so; 1542 struct sockbuf *sb; 1543 struct epoch_tracker et; 1544 int len, rx_credits; 1545 uint32_t ddp_placed = 0; 1546 1547 if (__predict_false(toep->flags & TPF_SYNQE)) { 1548 /* 1549 * do_pass_establish must have run before do_rx_data and if this 1550 * is still a synqe instead of a toepcb then the connection must 1551 * be getting aborted. 1552 */ 1553 MPASS(toep->flags & TPF_ABORT_SHUTDOWN); 1554 CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid, 1555 toep, toep->flags); 1556 m_freem(m); 1557 return (0); 1558 } 1559 1560 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1561 1562 /* strip off CPL header */ 1563 m_adj(m, sizeof(*cpl)); 1564 len = m->m_pkthdr.len; 1565 1566 INP_WLOCK(inp); 1567 if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) { 1568 CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x", 1569 __func__, tid, len, inp->inp_flags); 1570 INP_WUNLOCK(inp); 1571 m_freem(m); 1572 return (0); 1573 } 1574 1575 tp = intotcpcb(inp); 1576 1577 if (__predict_false(tp->rcv_nxt != be32toh(cpl->seq))) 1578 ddp_placed = be32toh(cpl->seq) - tp->rcv_nxt; 1579 1580 tp->rcv_nxt += len; 1581 if (tp->rcv_wnd < len) { 1582 KASSERT(toep->ulp_mode == ULP_MODE_RDMA, 1583 ("%s: negative window size", __func__)); 1584 } 1585 1586 tp->rcv_wnd -= len; 1587 tp->t_rcvtime = ticks; 1588 1589 if (toep->ulp_mode == ULP_MODE_TCPDDP) 1590 DDP_LOCK(toep); 1591 so = inp_inpcbtosocket(inp); 1592 sb = &so->so_rcv; 1593 SOCKBUF_LOCK(sb); 1594 1595 if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) { 1596 CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)", 1597 __func__, tid, len); 1598 m_freem(m); 1599 SOCKBUF_UNLOCK(sb); 1600 if (toep->ulp_mode == ULP_MODE_TCPDDP) 1601 DDP_UNLOCK(toep); 1602 INP_WUNLOCK(inp); 1603 1604 CURVNET_SET(toep->vnet); 1605 INP_INFO_RLOCK_ET(&V_tcbinfo, et); 1606 INP_WLOCK(inp); 1607 tp = tcp_drop(tp, ECONNRESET); 1608 if (tp) 1609 INP_WUNLOCK(inp); 1610 INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); 1611 CURVNET_RESTORE(); 1612 1613 return (0); 1614 } 1615 1616 /* receive buffer autosize */ 1617 MPASS(toep->vnet == so->so_vnet); 1618 CURVNET_SET(toep->vnet); 1619 if (sb->sb_flags & SB_AUTOSIZE && 1620 V_tcp_do_autorcvbuf && 1621 sb->sb_hiwat < V_tcp_autorcvbuf_max && 1622 len > (sbspace(sb) / 8 * 7)) { 1623 unsigned int hiwat = sb->sb_hiwat; 1624 unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc, 1625 V_tcp_autorcvbuf_max); 1626 1627 if (!sbreserve_locked(sb, newsize, so, NULL)) 1628 sb->sb_flags &= ~SB_AUTOSIZE; 1629 } 1630 1631 if (toep->ulp_mode == ULP_MODE_TCPDDP) { 1632 int changed = !(toep->ddp.flags & DDP_ON) ^ cpl->ddp_off; 1633 1634 if (toep->ddp.waiting_count != 0 || toep->ddp.active_count != 0) 1635 CTR3(KTR_CXGBE, "%s: tid %u, non-ddp rx (%d bytes)", 1636 __func__, tid, len); 1637 1638 if (changed) { 1639 if (toep->ddp.flags & DDP_SC_REQ) 1640 toep->ddp.flags ^= DDP_ON | DDP_SC_REQ; 1641 else { 1642 KASSERT(cpl->ddp_off == 1, 1643 ("%s: DDP switched on by itself.", 1644 __func__)); 1645 1646 /* Fell out of DDP mode */ 1647 toep->ddp.flags &= ~DDP_ON; 1648 CTR1(KTR_CXGBE, "%s: fell out of DDP mode", 1649 __func__); 1650 1651 insert_ddp_data(toep, ddp_placed); 1652 } 1653 } 1654 1655 if (toep->ddp.flags & DDP_ON) { 1656 /* 1657 * CPL_RX_DATA with DDP on can only be an indicate. 
1658 * Start posting queued AIO requests via DDP. The 1659 * payload that arrived in this indicate is appended 1660 * to the socket buffer as usual. 1661 */ 1662 handle_ddp_indicate(toep); 1663 } 1664 } 1665 1666 sbappendstream_locked(sb, m, 0); 1667 rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0; 1668 if (rx_credits > 0 && sbused(sb) + tp->rcv_wnd < sb->sb_lowat) { 1669 rx_credits = send_rx_credits(sc, toep, rx_credits); 1670 tp->rcv_wnd += rx_credits; 1671 tp->rcv_adv += rx_credits; 1672 } 1673 1674 if (toep->ulp_mode == ULP_MODE_TCPDDP && toep->ddp.waiting_count > 0 && 1675 sbavail(sb) != 0) { 1676 CTR2(KTR_CXGBE, "%s: tid %u queueing AIO task", __func__, 1677 tid); 1678 ddp_queue_toep(toep); 1679 } 1680 sorwakeup_locked(so); 1681 SOCKBUF_UNLOCK_ASSERT(sb); 1682 if (toep->ulp_mode == ULP_MODE_TCPDDP) 1683 DDP_UNLOCK(toep); 1684 1685 INP_WUNLOCK(inp); 1686 CURVNET_RESTORE(); 1687 return (0); 1688 } 1689 1690 static int 1691 do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1692 { 1693 struct adapter *sc = iq->adapter; 1694 const struct cpl_fw4_ack *cpl = (const void *)(rss + 1); 1695 unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl))); 1696 struct toepcb *toep = lookup_tid(sc, tid); 1697 struct inpcb *inp; 1698 struct tcpcb *tp; 1699 struct socket *so; 1700 uint8_t credits = cpl->credits; 1701 struct ofld_tx_sdesc *txsd; 1702 int plen; 1703 #ifdef INVARIANTS 1704 unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl))); 1705 #endif 1706 1707 /* 1708 * Very unusual case: we'd sent a flowc + abort_req for a synq entry and 1709 * now this comes back carrying the credits for the flowc. 1710 */ 1711 if (__predict_false(toep->flags & TPF_SYNQE)) { 1712 KASSERT(toep->flags & TPF_ABORT_SHUTDOWN, 1713 ("%s: credits for a synq entry %p", __func__, toep)); 1714 return (0); 1715 } 1716 1717 inp = toep->inp; 1718 1719 KASSERT(opcode == CPL_FW4_ACK, 1720 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1721 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1722 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1723 1724 INP_WLOCK(inp); 1725 1726 if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) { 1727 INP_WUNLOCK(inp); 1728 return (0); 1729 } 1730 1731 KASSERT((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0, 1732 ("%s: inp_flags 0x%x", __func__, inp->inp_flags)); 1733 1734 tp = intotcpcb(inp); 1735 1736 if (cpl->flags & CPL_FW4_ACK_FLAGS_SEQVAL) { 1737 tcp_seq snd_una = be32toh(cpl->snd_una); 1738 1739 #ifdef INVARIANTS 1740 if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) { 1741 log(LOG_ERR, 1742 "%s: unexpected seq# %x for TID %u, snd_una %x\n", 1743 __func__, snd_una, toep->tid, tp->snd_una); 1744 } 1745 #endif 1746 1747 if (tp->snd_una != snd_una) { 1748 tp->snd_una = snd_una; 1749 tp->ts_recent_age = tcp_ts_getticks(); 1750 } 1751 } 1752 1753 #ifdef VERBOSE_TRACES 1754 CTR3(KTR_CXGBE, "%s: tid %d credits %u", __func__, tid, credits); 1755 #endif 1756 so = inp->inp_socket; 1757 txsd = &toep->txsd[toep->txsd_cidx]; 1758 plen = 0; 1759 while (credits) { 1760 KASSERT(credits >= txsd->tx_credits, 1761 ("%s: too many (or partial) credits", __func__)); 1762 credits -= txsd->tx_credits; 1763 toep->tx_credits += txsd->tx_credits; 1764 plen += txsd->plen; 1765 if (txsd->iv_buffer) { 1766 free(txsd->iv_buffer, M_CXGBE); 1767 txsd->iv_buffer = NULL; 1768 } 1769 txsd++; 1770 toep->txsd_avail++; 1771 KASSERT(toep->txsd_avail <= toep->txsd_total, 1772 ("%s: txsd avail > total", __func__)); 1773 
if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) { 1774 txsd = &toep->txsd[0]; 1775 toep->txsd_cidx = 0; 1776 } 1777 } 1778 1779 if (toep->tx_credits == toep->tx_total) { 1780 toep->tx_nocompl = 0; 1781 toep->plen_nocompl = 0; 1782 } 1783 1784 if (toep->flags & TPF_TX_SUSPENDED && 1785 toep->tx_credits >= toep->tx_total / 4) { 1786 #ifdef VERBOSE_TRACES 1787 CTR2(KTR_CXGBE, "%s: tid %d calling t4_push_frames", __func__, 1788 tid); 1789 #endif 1790 toep->flags &= ~TPF_TX_SUSPENDED; 1791 CURVNET_SET(toep->vnet); 1792 if (toep->ulp_mode == ULP_MODE_ISCSI) 1793 t4_push_pdus(sc, toep, plen); 1794 else if (tls_tx_key(toep)) 1795 t4_push_tls_records(sc, toep, plen); 1796 else 1797 t4_push_frames(sc, toep, plen); 1798 CURVNET_RESTORE(); 1799 } else if (plen > 0) { 1800 struct sockbuf *sb = &so->so_snd; 1801 int sbu; 1802 1803 SOCKBUF_LOCK(sb); 1804 sbu = sbused(sb); 1805 if (toep->ulp_mode == ULP_MODE_ISCSI) { 1806 1807 if (__predict_false(sbu > 0)) { 1808 /* 1809 * The data transmitted before the tid's ULP mode 1810 * changed to ISCSI is still in so_snd. 1811 * Incoming credits should account for so_snd 1812 * first. 1813 */ 1814 sbdrop_locked(sb, min(sbu, plen)); 1815 plen -= min(sbu, plen); 1816 } 1817 sowwakeup_locked(so); /* unlocks so_snd */ 1818 rqdrop_locked(&toep->ulp_pdu_reclaimq, plen); 1819 } else { 1820 #ifdef VERBOSE_TRACES 1821 CTR3(KTR_CXGBE, "%s: tid %d dropped %d bytes", __func__, 1822 tid, plen); 1823 #endif 1824 sbdrop_locked(sb, plen); 1825 if (tls_tx_key(toep)) { 1826 struct tls_ofld_info *tls_ofld = &toep->tls; 1827 1828 MPASS(tls_ofld->sb_off >= plen); 1829 tls_ofld->sb_off -= plen; 1830 } 1831 if (!TAILQ_EMPTY(&toep->aiotx_jobq)) 1832 t4_aiotx_queue_toep(so, toep); 1833 sowwakeup_locked(so); /* unlocks so_snd */ 1834 } 1835 SOCKBUF_UNLOCK_ASSERT(sb); 1836 } 1837 1838 INP_WUNLOCK(inp); 1839 1840 return (0); 1841 } 1842 1843 void 1844 t4_set_tcb_field(struct adapter *sc, struct sge_wrq *wrq, struct toepcb *toep, 1845 uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie) 1846 { 1847 struct wrqe *wr; 1848 struct cpl_set_tcb_field *req; 1849 struct ofld_tx_sdesc *txsd; 1850 1851 MPASS((cookie & ~M_COOKIE) == 0); 1852 if (reply) { 1853 MPASS(cookie != CPL_COOKIE_RESERVED); 1854 } 1855 1856 wr = alloc_wrqe(sizeof(*req), wrq); 1857 if (wr == NULL) { 1858 /* XXX */ 1859 panic("%s: allocation failure.", __func__); 1860 } 1861 req = wrtod(wr); 1862 1863 INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, toep->tid); 1864 req->reply_ctrl = htobe16(V_QUEUENO(toep->ofld_rxq->iq.abs_id)); 1865 if (reply == 0) 1866 req->reply_ctrl |= htobe16(F_NO_REPLY); 1867 req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(cookie)); 1868 req->mask = htobe64(mask); 1869 req->val = htobe64(val); 1870 if ((wrq->eq.flags & EQ_TYPEMASK) == EQ_OFLD) { 1871 txsd = &toep->txsd[toep->txsd_pidx]; 1872 txsd->tx_credits = howmany(sizeof(*req), 16); 1873 txsd->plen = 0; 1874 KASSERT(toep->tx_credits >= txsd->tx_credits && 1875 toep->txsd_avail > 0, 1876 ("%s: not enough credits (%d)", __func__, 1877 toep->tx_credits)); 1878 toep->tx_credits -= txsd->tx_credits; 1879 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) 1880 toep->txsd_pidx = 0; 1881 toep->txsd_avail--; 1882 } 1883 1884 t4_wrq_tx(sc, wr); 1885 } 1886 1887 void 1888 t4_init_cpl_io_handlers(void) 1889 { 1890 1891 t4_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close); 1892 t4_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl); 1893 t4_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req); 1894
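	/*
	 * CPL_ABORT_RPL_RSS and CPL_FW4_ACK are opcodes shared with other
	 * consumers in the driver, so TOM attaches to them with its
	 * CPL_COOKIE_TOM cookie rather than claiming the opcode outright.
	 */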
t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl, 1895 CPL_COOKIE_TOM); 1896 t4_register_cpl_handler(CPL_RX_DATA, do_rx_data); 1897 t4_register_shared_cpl_handler(CPL_FW4_ACK, do_fw4_ack, CPL_COOKIE_TOM); 1898 } 1899 1900 void 1901 t4_uninit_cpl_io_handlers(void) 1902 { 1903 1904 t4_register_cpl_handler(CPL_PEER_CLOSE, NULL); 1905 t4_register_cpl_handler(CPL_CLOSE_CON_RPL, NULL); 1906 t4_register_cpl_handler(CPL_ABORT_REQ_RSS, NULL); 1907 t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, NULL, CPL_COOKIE_TOM); 1908 t4_register_cpl_handler(CPL_RX_DATA, NULL); 1909 t4_register_shared_cpl_handler(CPL_FW4_ACK, NULL, CPL_COOKIE_TOM); 1910 } 1911 1912 /* 1913 * Use the 'backend3' field in AIO jobs to store the amount of data 1914 * sent by the AIO job so far and the 'backend4' field to hold an 1915 * error that should be reported when the job is completed. 1916 */ 1917 #define aio_sent backend3 1918 #define aio_error backend4 1919 1920 #define jobtotid(job) \ 1921 (((struct toepcb *)(so_sototcpcb((job)->fd_file->f_data)->t_toe))->tid) 1922 1923 static void 1924 free_aiotx_buffer(struct aiotx_buffer *ab) 1925 { 1926 struct kaiocb *job; 1927 long status; 1928 int error; 1929 1930 if (refcount_release(&ab->refcount) == 0) 1931 return; 1932 1933 job = ab->job; 1934 error = job->aio_error; 1935 status = job->aio_sent; 1936 vm_page_unhold_pages(ab->ps.pages, ab->ps.npages); 1937 free(ab, M_CXGBE); 1938 #ifdef VERBOSE_TRACES 1939 CTR5(KTR_CXGBE, "%s: tid %d completed %p len %ld, error %d", __func__, 1940 jobtotid(job), job, status, error); 1941 #endif 1942 if (error == ECANCELED && status != 0) 1943 error = 0; 1944 if (error == ECANCELED) 1945 aio_cancel(job); 1946 else if (error) 1947 aio_complete(job, -1, error); 1948 else 1949 aio_complete(job, status, 0); 1950 } 1951 1952 static void 1953 t4_aiotx_mbuf_free(struct mbuf *m) 1954 { 1955 struct aiotx_buffer *ab = m->m_ext.ext_arg1; 1956 1957 #ifdef VERBOSE_TRACES 1958 CTR3(KTR_CXGBE, "%s: completed %d bytes for tid %d", __func__, 1959 m->m_len, jobtotid(ab->job)); 1960 #endif 1961 free_aiotx_buffer(ab); 1962 } 1963 1964 /* 1965 * Hold the buffer backing an AIO request and return an AIO transmit 1966 * buffer. 1967 */ 1968 static int 1969 hold_aio(struct kaiocb *job) 1970 { 1971 struct aiotx_buffer *ab; 1972 struct vmspace *vm; 1973 vm_map_t map; 1974 vm_offset_t start, end, pgoff; 1975 int n; 1976 1977 MPASS(job->backend1 == NULL); 1978 1979 /* 1980 * The AIO subsystem will cancel and drain all requests before 1981 * permitting a process to exit or exec, so p_vmspace should 1982 * be stable here. 
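 * The pages backing the user buffer are held by vm_fault_quick_hold_pages()
 * below and are not unheld until free_aiotx_buffer() drops the last
 * reference to the aiotx_buffer, so the buffer stays resident while mbufs
 * that map it are still in flight.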
	 */
	vm = job->userproc->p_vmspace;
	map = &vm->vm_map;
	start = (uintptr_t)job->uaiocb.aio_buf;
	pgoff = start & PAGE_MASK;
	end = round_page(start + job->uaiocb.aio_nbytes);
	start = trunc_page(start);
	n = atop(end - start);

	ab = malloc(sizeof(*ab) + n * sizeof(vm_page_t), M_CXGBE, M_WAITOK |
	    M_ZERO);
	refcount_init(&ab->refcount, 1);
	ab->ps.pages = (vm_page_t *)(ab + 1);
	ab->ps.npages = vm_fault_quick_hold_pages(map, start, end - start,
	    VM_PROT_WRITE, ab->ps.pages, n);
	if (ab->ps.npages < 0) {
		free(ab, M_CXGBE);
		return (EFAULT);
	}

	KASSERT(ab->ps.npages == n,
	    ("hold_aio: page count mismatch: %d vs %d", ab->ps.npages, n));

	ab->ps.offset = pgoff;
	ab->ps.len = job->uaiocb.aio_nbytes;
	ab->job = job;
	job->backend1 = ab;
#ifdef VERBOSE_TRACES
	CTR5(KTR_CXGBE, "%s: tid %d, new pageset %p for job %p, npages %d",
	    __func__, jobtotid(job), &ab->ps, job, ab->ps.npages);
#endif
	return (0);
}

static void
t4_aiotx_process_job(struct toepcb *toep, struct socket *so, struct kaiocb *job)
{
	struct adapter *sc;
	struct sockbuf *sb;
	struct file *fp;
	struct aiotx_buffer *ab;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct mbuf *m;
	int error;
	bool moretocome, sendmore;

	sc = td_adapter(toep->td);
	sb = &so->so_snd;
	SOCKBUF_UNLOCK(sb);
	fp = job->fd_file;
	ab = job->backend1;
	m = NULL;

#ifdef MAC
	error = mac_socket_check_send(fp->f_cred, so);
	if (error != 0)
		goto out;
#endif

	if (ab == NULL) {
		error = hold_aio(job);
		if (error != 0)
			goto out;
		ab = job->backend1;
	}

	/* Inline sosend_generic(). */

	job->msgsnd = 1;

	error = sblock(sb, SBL_WAIT);
	MPASS(error == 0);

sendanother:
	m = m_get(M_WAITOK, MT_DATA);

	SOCKBUF_LOCK(sb);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		if ((so->so_options & SO_NOSIGPIPE) == 0) {
			PROC_LOCK(job->userproc);
			kern_psignal(job->userproc, SIGPIPE);
			PROC_UNLOCK(job->userproc);
		}
		error = EPIPE;
		goto out;
	}
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		goto out;
	}
	if ((so->so_state & SS_ISCONNECTED) == 0) {
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		error = ENOTCONN;
		goto out;
	}
	if (sbspace(sb) < sb->sb_lowat) {
		MPASS(job->aio_sent == 0 || !(so->so_state & SS_NBIO));

		/*
		 * Don't block if there is too little room in the socket
		 * buffer.  Instead, requeue the request.
		 */
		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
			SOCKBUF_UNLOCK(sb);
			sbunlock(sb);
			error = ECANCELED;
			goto out;
		}
		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		goto out;
	}

	/*
	 * Write as much data as the socket permits, but no more than
	 * a single sndbuf at a time.
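	 *
	 * Each pass through this loop queues one external mbuf (set up
	 * below with m_extadd()) that references the held user pages;
	 * when a full sndbuf's worth has been queued and data remains,
	 * sendmore sends us back to the sendanother label for another
	 * pass.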
	 */
	m->m_len = sbspace(sb);
	if (m->m_len > ab->ps.len - job->aio_sent) {
		m->m_len = ab->ps.len - job->aio_sent;
		moretocome = false;
	} else
		moretocome = true;
	if (m->m_len > sc->tt.sndbuf) {
		m->m_len = sc->tt.sndbuf;
		sendmore = true;
	} else
		sendmore = false;

	if (!TAILQ_EMPTY(&toep->aiotx_jobq))
		moretocome = true;
	SOCKBUF_UNLOCK(sb);
	MPASS(m->m_len != 0);

	/* Inlined tcp_usr_send(). */

	inp = toep->inp;
	INP_WLOCK(inp);
	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
		INP_WUNLOCK(inp);
		sbunlock(sb);
		error = ECONNRESET;
		goto out;
	}

	refcount_acquire(&ab->refcount);
	m_extadd(m, NULL, ab->ps.len, t4_aiotx_mbuf_free, ab,
	    (void *)(uintptr_t)job->aio_sent, 0, EXT_NET_DRV);
	m->m_ext.ext_flags |= EXT_FLAG_AIOTX;
	job->aio_sent += m->m_len;

	sbappendstream(sb, m, 0);
	m = NULL;

	if (!(inp->inp_flags & INP_DROPPED)) {
		tp = intotcpcb(inp);
		if (moretocome)
			tp->t_flags |= TF_MORETOCOME;
		error = tp->t_fb->tfb_tcp_output(tp);
		if (moretocome)
			tp->t_flags &= ~TF_MORETOCOME;
	}

	INP_WUNLOCK(inp);
	if (sendmore)
		goto sendanother;
	sbunlock(sb);

	if (error)
		goto out;

	/*
	 * If this is a blocking socket and the request has not been
	 * fully completed, requeue it until the socket is ready
	 * again.
	 */
	if (job->aio_sent < job->uaiocb.aio_nbytes &&
	    !(so->so_state & SS_NBIO)) {
		SOCKBUF_LOCK(sb);
		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
			SOCKBUF_UNLOCK(sb);
			error = ECANCELED;
			goto out;
		}
		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
		return;
	}

	/*
	 * If the request will not be requeued, drop a reference on
	 * the aiotx buffer.  Any mbufs in flight should still
	 * contain a reference, but this drops the reference that the
	 * job owns while it is waiting to queue mbufs to the socket.
	 */
	free_aiotx_buffer(ab);

out:
	if (error) {
		if (ab != NULL) {
			job->aio_error = error;
			free_aiotx_buffer(ab);
		} else {
			MPASS(job->aio_sent == 0);
			aio_complete(job, -1, error);
		}
	}
	if (m != NULL)
		m_free(m);
	SOCKBUF_LOCK(sb);
}

static void
t4_aiotx_task(void *context, int pending)
{
	struct toepcb *toep = context;
	struct socket *so;
	struct kaiocb *job;

	so = toep->aiotx_so;
	CURVNET_SET(toep->vnet);
	SOCKBUF_LOCK(&so->so_snd);
	while (!TAILQ_EMPTY(&toep->aiotx_jobq) && sowriteable(so)) {
		job = TAILQ_FIRST(&toep->aiotx_jobq);
		TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
		if (!aio_clear_cancel_function(job))
			continue;

		t4_aiotx_process_job(toep, so, job);
	}
	toep->aiotx_so = NULL;
	SOCKBUF_UNLOCK(&so->so_snd);
	CURVNET_RESTORE();

	free_toepcb(toep);
	SOCK_LOCK(so);
	sorele(so);
}

static void
t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep)
{

	SOCKBUF_LOCK_ASSERT(&toep->inp->inp_socket->so_snd);
#ifdef VERBOSE_TRACES
	CTR3(KTR_CXGBE, "%s: queueing aiotx task for tid %d, active = %s",
	    __func__, toep->tid, toep->aiotx_so != NULL ?
"true" : "false"); 2237 #endif 2238 if (toep->aiotx_so != NULL) 2239 return; 2240 soref(so); 2241 toep->aiotx_so = so; 2242 hold_toepcb(toep); 2243 soaio_enqueue(&toep->aiotx_task); 2244 } 2245 2246 static void 2247 t4_aiotx_cancel(struct kaiocb *job) 2248 { 2249 struct aiotx_buffer *ab; 2250 struct socket *so; 2251 struct sockbuf *sb; 2252 struct tcpcb *tp; 2253 struct toepcb *toep; 2254 2255 so = job->fd_file->f_data; 2256 tp = so_sototcpcb(so); 2257 toep = tp->t_toe; 2258 MPASS(job->uaiocb.aio_lio_opcode == LIO_WRITE); 2259 sb = &so->so_snd; 2260 2261 SOCKBUF_LOCK(sb); 2262 if (!aio_cancel_cleared(job)) 2263 TAILQ_REMOVE(&toep->aiotx_jobq, job, list); 2264 SOCKBUF_UNLOCK(sb); 2265 2266 ab = job->backend1; 2267 if (ab != NULL) 2268 free_aiotx_buffer(ab); 2269 else 2270 aio_cancel(job); 2271 } 2272 2273 int 2274 t4_aio_queue_aiotx(struct socket *so, struct kaiocb *job) 2275 { 2276 struct tcpcb *tp = so_sototcpcb(so); 2277 struct toepcb *toep = tp->t_toe; 2278 struct adapter *sc = td_adapter(toep->td); 2279 2280 /* This only handles writes. */ 2281 if (job->uaiocb.aio_lio_opcode != LIO_WRITE) 2282 return (EOPNOTSUPP); 2283 2284 if (!sc->tt.tx_zcopy) 2285 return (EOPNOTSUPP); 2286 2287 if (tls_tx_key(toep)) 2288 return (EOPNOTSUPP); 2289 2290 SOCKBUF_LOCK(&so->so_snd); 2291 #ifdef VERBOSE_TRACES 2292 CTR3(KTR_CXGBE, "%s: queueing %p for tid %u", __func__, job, toep->tid); 2293 #endif 2294 if (!aio_set_cancel_function(job, t4_aiotx_cancel)) 2295 panic("new job was cancelled"); 2296 TAILQ_INSERT_TAIL(&toep->aiotx_jobq, job, list); 2297 if (sowriteable(so)) 2298 t4_aiotx_queue_toep(so, toep); 2299 SOCKBUF_UNLOCK(&so->so_snd); 2300 return (0); 2301 } 2302 2303 void 2304 aiotx_init_toep(struct toepcb *toep) 2305 { 2306 2307 TAILQ_INIT(&toep->aiotx_jobq); 2308 TASK_INIT(&toep->aiotx_task, 0, t4_aiotx_task, toep); 2309 } 2310 #endif 2311