1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2012, 2015 Chelsio Communications, Inc. 5 * All rights reserved. 6 * Written by: Navdeep Parhar <np@FreeBSD.org> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 #include <sys/cdefs.h> 31 __FBSDID("$FreeBSD$"); 32 33 #include "opt_inet.h" 34 #include "opt_inet6.h" 35 #include "opt_ratelimit.h" 36 37 #ifdef TCP_OFFLOAD 38 #include <sys/param.h> 39 #include <sys/aio.h> 40 #include <sys/file.h> 41 #include <sys/kernel.h> 42 #include <sys/ktr.h> 43 #include <sys/module.h> 44 #include <sys/proc.h> 45 #include <sys/protosw.h> 46 #include <sys/domain.h> 47 #include <sys/socket.h> 48 #include <sys/socketvar.h> 49 #include <sys/sglist.h> 50 #include <sys/taskqueue.h> 51 #include <netinet/in.h> 52 #include <netinet/in_pcb.h> 53 #include <netinet/ip.h> 54 #include <netinet/ip6.h> 55 #define TCPSTATES 56 #include <netinet/tcp_fsm.h> 57 #include <netinet/tcp_seq.h> 58 #include <netinet/tcp_var.h> 59 #include <netinet/toecore.h> 60 61 #include <security/mac/mac_framework.h> 62 63 #include <vm/vm.h> 64 #include <vm/vm_extern.h> 65 #include <vm/pmap.h> 66 #include <vm/vm_map.h> 67 #include <vm/vm_page.h> 68 69 #include "common/common.h" 70 #include "common/t4_msg.h" 71 #include "common/t4_regs.h" 72 #include "common/t4_tcb.h" 73 #include "tom/t4_tom_l2t.h" 74 #include "tom/t4_tom.h" 75 76 static void t4_aiotx_cancel(struct kaiocb *job); 77 static void t4_aiotx_queue_toep(struct toepcb *toep); 78 79 static size_t 80 aiotx_mbuf_pgoff(struct mbuf *m) 81 { 82 struct aiotx_buffer *ab; 83 84 MPASS(IS_AIOTX_MBUF(m)); 85 ab = m->m_ext.ext_arg1; 86 return ((ab->ps.offset + (uintptr_t)m->m_ext.ext_arg2) % PAGE_SIZE); 87 } 88 89 static vm_page_t * 90 aiotx_mbuf_pages(struct mbuf *m) 91 { 92 struct aiotx_buffer *ab; 93 int npages; 94 95 MPASS(IS_AIOTX_MBUF(m)); 96 ab = m->m_ext.ext_arg1; 97 npages = (ab->ps.offset + (uintptr_t)m->m_ext.ext_arg2) / PAGE_SIZE; 98 return (ab->ps.pages + npages); 99 } 100 101 void 102 send_flowc_wr(struct toepcb *toep, struct flowc_tx_params *ftxp) 103 { 104 struct wrqe *wr; 105 struct fw_flowc_wr *flowc; 106 unsigned int nparams, flowclen, paramidx; 107 struct vi_info *vi = toep->vi; 108 struct port_info *pi = vi->pi; 109 struct adapter *sc = pi->adapter; 
110 unsigned int pfvf = sc->pf << S_FW_VIID_PFN; 111 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; 112 113 KASSERT(!(toep->flags & TPF_FLOWC_WR_SENT), 114 ("%s: flowc for tid %u sent already", __func__, toep->tid)); 115 116 if (ftxp != NULL) 117 nparams = 8; 118 else 119 nparams = 6; 120 if (toep->ulp_mode == ULP_MODE_TLS) 121 nparams++; 122 if (toep->tls.fcplenmax != 0) 123 nparams++; 124 if (toep->tc_idx != -1) { 125 MPASS(toep->tc_idx >= 0 && 126 toep->tc_idx < sc->chip_params->nsched_cls); 127 nparams++; 128 } 129 130 flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval); 131 132 wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq); 133 if (wr == NULL) { 134 /* XXX */ 135 panic("%s: allocation failure.", __func__); 136 } 137 flowc = wrtod(wr); 138 memset(flowc, 0, wr->wr_len); 139 140 flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | 141 V_FW_FLOWC_WR_NPARAMS(nparams)); 142 flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) | 143 V_FW_WR_FLOWID(toep->tid)); 144 145 #define FLOWC_PARAM(__m, __v) \ 146 do { \ 147 flowc->mnemval[paramidx].mnemonic = FW_FLOWC_MNEM_##__m; \ 148 flowc->mnemval[paramidx].val = htobe32(__v); \ 149 paramidx++; \ 150 } while (0) 151 152 paramidx = 0; 153 154 FLOWC_PARAM(PFNVFN, pfvf); 155 FLOWC_PARAM(CH, pi->tx_chan); 156 FLOWC_PARAM(PORT, pi->tx_chan); 157 FLOWC_PARAM(IQID, toep->ofld_rxq->iq.abs_id); 158 if (ftxp) { 159 uint32_t sndbuf = min(ftxp->snd_space, sc->tt.sndbuf); 160 161 FLOWC_PARAM(SNDNXT, ftxp->snd_nxt); 162 FLOWC_PARAM(RCVNXT, ftxp->rcv_nxt); 163 FLOWC_PARAM(SNDBUF, sndbuf); 164 FLOWC_PARAM(MSS, ftxp->mss); 165 166 CTR6(KTR_CXGBE, 167 "%s: tid %u, mss %u, sndbuf %u, snd_nxt 0x%x, rcv_nxt 0x%x", 168 __func__, toep->tid, ftxp->mss, sndbuf, ftxp->snd_nxt, 169 ftxp->rcv_nxt); 170 } else { 171 FLOWC_PARAM(SNDBUF, 512); 172 FLOWC_PARAM(MSS, 512); 173 174 CTR2(KTR_CXGBE, "%s: tid %u", __func__, toep->tid); 175 } 176 if (toep->ulp_mode == ULP_MODE_TLS) 177 FLOWC_PARAM(ULP_MODE, toep->ulp_mode); 178 if (toep->tls.fcplenmax != 0) 179 FLOWC_PARAM(TXDATAPLEN_MAX, toep->tls.fcplenmax); 180 if (toep->tc_idx != -1) 181 FLOWC_PARAM(SCHEDCLASS, toep->tc_idx); 182 #undef FLOWC_PARAM 183 184 KASSERT(paramidx == nparams, ("nparams mismatch")); 185 186 txsd->tx_credits = howmany(flowclen, 16); 187 txsd->plen = 0; 188 KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0, 189 ("%s: not enough credits (%d)", __func__, toep->tx_credits)); 190 toep->tx_credits -= txsd->tx_credits; 191 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) 192 toep->txsd_pidx = 0; 193 toep->txsd_avail--; 194 195 toep->flags |= TPF_FLOWC_WR_SENT; 196 t4_wrq_tx(sc, wr); 197 } 198 199 #ifdef RATELIMIT 200 /* 201 * Input is Bytes/second (so_max_pacing_rate), chip counts in Kilobits/second. 
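 *
 * A worked example (illustrative, not part of the original comment): a
 * so_max_pacing_rate of 1250000 bytes/sec converts to
 * 1250000 * 8 / 1000 = 10000 kbps, which is the value handed to
 * t4_reserve_cl_rl_kbps() below.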
202 */ 203 static int 204 update_tx_rate_limit(struct adapter *sc, struct toepcb *toep, u_int Bps) 205 { 206 int tc_idx, rc; 207 const u_int kbps = (u_int) (uint64_t)Bps * 8ULL / 1000; 208 const int port_id = toep->vi->pi->port_id; 209 210 CTR3(KTR_CXGBE, "%s: tid %u, rate %uKbps", __func__, toep->tid, kbps); 211 212 if (kbps == 0) { 213 /* unbind */ 214 tc_idx = -1; 215 } else { 216 rc = t4_reserve_cl_rl_kbps(sc, port_id, kbps, &tc_idx); 217 if (rc != 0) 218 return (rc); 219 MPASS(tc_idx >= 0 && tc_idx < sc->chip_params->nsched_cls); 220 } 221 222 if (toep->tc_idx != tc_idx) { 223 struct wrqe *wr; 224 struct fw_flowc_wr *flowc; 225 int nparams = 1, flowclen, flowclen16; 226 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; 227 228 flowclen = sizeof(*flowc) + nparams * sizeof(struct 229 fw_flowc_mnemval); 230 flowclen16 = howmany(flowclen, 16); 231 if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0 || 232 (wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq)) == NULL) { 233 if (tc_idx >= 0) 234 t4_release_cl_rl(sc, port_id, tc_idx); 235 return (ENOMEM); 236 } 237 238 flowc = wrtod(wr); 239 memset(flowc, 0, wr->wr_len); 240 241 flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | 242 V_FW_FLOWC_WR_NPARAMS(nparams)); 243 flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) | 244 V_FW_WR_FLOWID(toep->tid)); 245 246 flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS; 247 if (tc_idx == -1) 248 flowc->mnemval[0].val = htobe32(0xff); 249 else 250 flowc->mnemval[0].val = htobe32(tc_idx); 251 252 txsd->tx_credits = flowclen16; 253 txsd->plen = 0; 254 toep->tx_credits -= txsd->tx_credits; 255 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) 256 toep->txsd_pidx = 0; 257 toep->txsd_avail--; 258 t4_wrq_tx(sc, wr); 259 } 260 261 if (toep->tc_idx >= 0) 262 t4_release_cl_rl(sc, port_id, toep->tc_idx); 263 toep->tc_idx = tc_idx; 264 265 return (0); 266 } 267 #endif 268 269 void 270 send_reset(struct adapter *sc, struct toepcb *toep, uint32_t snd_nxt) 271 { 272 struct wrqe *wr; 273 struct cpl_abort_req *req; 274 int tid = toep->tid; 275 struct inpcb *inp = toep->inp; 276 struct tcpcb *tp = intotcpcb(inp); /* don't use if INP_DROPPED */ 277 278 INP_WLOCK_ASSERT(inp); 279 280 CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x%s", 281 __func__, toep->tid, 282 inp->inp_flags & INP_DROPPED ? "inp dropped" : 283 tcpstates[tp->t_state], 284 toep->flags, inp->inp_flags, 285 toep->flags & TPF_ABORT_SHUTDOWN ? 286 " (abort already in progress)" : ""); 287 288 if (toep->flags & TPF_ABORT_SHUTDOWN) 289 return; /* abort already in progress */ 290 291 toep->flags |= TPF_ABORT_SHUTDOWN; 292 293 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 294 ("%s: flowc_wr not sent for tid %d.", __func__, tid)); 295 296 wr = alloc_wrqe(sizeof(*req), toep->ofld_txq); 297 if (wr == NULL) { 298 /* XXX */ 299 panic("%s: allocation failure.", __func__); 300 } 301 req = wrtod(wr); 302 303 INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid); 304 if (inp->inp_flags & INP_DROPPED) 305 req->rsvd0 = htobe32(snd_nxt); 306 else 307 req->rsvd0 = htobe32(tp->snd_nxt); 308 req->rsvd1 = !(toep->flags & TPF_TX_DATA_SENT); 309 req->cmd = CPL_ABORT_SEND_RST; 310 311 /* 312 * XXX: What's the correct way to tell that the inp hasn't been detached 313 * from its socket? Should I even be flushing the snd buffer here? 314 */ 315 if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) { 316 struct socket *so = inp->inp_socket; 317 318 if (so != NULL) /* because I'm not sure. 
See comment above */ 319 sbflush(&so->so_snd); 320 } 321 322 t4_l2t_send(sc, wr, toep->l2te); 323 } 324 325 /* 326 * Called when a connection is established to translate the TCP options 327 * reported by HW to FreeBSD's native format. 328 */ 329 static void 330 assign_rxopt(struct tcpcb *tp, unsigned int opt) 331 { 332 struct toepcb *toep = tp->t_toe; 333 struct inpcb *inp = tp->t_inpcb; 334 struct adapter *sc = td_adapter(toep->td); 335 int n; 336 337 INP_LOCK_ASSERT(inp); 338 339 if (inp->inp_inc.inc_flags & INC_ISIPV6) 340 n = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 341 else 342 n = sizeof(struct ip) + sizeof(struct tcphdr); 343 tp->t_maxseg = sc->params.mtus[G_TCPOPT_MSS(opt)] - n; 344 345 if (G_TCPOPT_TSTAMP(opt)) { 346 tp->t_flags |= TF_RCVD_TSTMP; /* timestamps ok */ 347 tp->ts_recent = 0; /* hmmm */ 348 tp->ts_recent_age = tcp_ts_getticks(); 349 tp->t_maxseg -= TCPOLEN_TSTAMP_APPA; 350 } 351 352 CTR5(KTR_CXGBE, "%s: tid %d, mtu_idx %u (%u), mss %u", __func__, 353 toep->tid, G_TCPOPT_MSS(opt), sc->params.mtus[G_TCPOPT_MSS(opt)], 354 tp->t_maxseg); 355 356 if (G_TCPOPT_SACK(opt)) 357 tp->t_flags |= TF_SACK_PERMIT; /* should already be set */ 358 else 359 tp->t_flags &= ~TF_SACK_PERMIT; /* sack disallowed by peer */ 360 361 if (G_TCPOPT_WSCALE_OK(opt)) 362 tp->t_flags |= TF_RCVD_SCALE; 363 364 /* Doing window scaling? */ 365 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 366 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 367 tp->rcv_scale = tp->request_r_scale; 368 tp->snd_scale = G_TCPOPT_SND_WSCALE(opt); 369 } 370 } 371 372 /* 373 * Completes some final bits of initialization for just established connections 374 * and changes their state to TCPS_ESTABLISHED. 375 * 376 * The ISNs are from the exchange of SYNs. 377 */ 378 void 379 make_established(struct toepcb *toep, uint32_t iss, uint32_t irs, uint16_t opt) 380 { 381 struct inpcb *inp = toep->inp; 382 struct socket *so = inp->inp_socket; 383 struct tcpcb *tp = intotcpcb(inp); 384 long bufsize; 385 uint16_t tcpopt = be16toh(opt); 386 struct flowc_tx_params ftxp; 387 388 INP_WLOCK_ASSERT(inp); 389 KASSERT(tp->t_state == TCPS_SYN_SENT || 390 tp->t_state == TCPS_SYN_RECEIVED, 391 ("%s: TCP state %s", __func__, tcpstates[tp->t_state])); 392 393 CTR6(KTR_CXGBE, "%s: tid %d, so %p, inp %p, tp %p, toep %p", 394 __func__, toep->tid, so, inp, tp, toep); 395 396 tcp_state_change(tp, TCPS_ESTABLISHED); 397 tp->t_starttime = ticks; 398 TCPSTAT_INC(tcps_connects); 399 400 tp->irs = irs; 401 tcp_rcvseqinit(tp); 402 tp->rcv_wnd = toep->rx_credits << 10; 403 tp->rcv_adv += tp->rcv_wnd; 404 tp->last_ack_sent = tp->rcv_nxt; 405 406 /* 407 * If we were unable to send all rx credits via opt0, save the remainder 408 * in rx_credits so that they can be handed over with the next credit 409 * update. 
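	 *
	 * (They are handed back later by send_rx_credits(), typically from
	 * t4_rcvd_locked() once the application has read enough data.)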
410 */ 411 SOCKBUF_LOCK(&so->so_rcv); 412 bufsize = select_rcv_wnd(so); 413 SOCKBUF_UNLOCK(&so->so_rcv); 414 toep->rx_credits = bufsize - tp->rcv_wnd; 415 416 tp->iss = iss; 417 tcp_sendseqinit(tp); 418 tp->snd_una = iss + 1; 419 tp->snd_nxt = iss + 1; 420 tp->snd_max = iss + 1; 421 422 assign_rxopt(tp, tcpopt); 423 424 SOCKBUF_LOCK(&so->so_snd); 425 if (so->so_snd.sb_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf) 426 bufsize = V_tcp_autosndbuf_max; 427 else 428 bufsize = sbspace(&so->so_snd); 429 SOCKBUF_UNLOCK(&so->so_snd); 430 431 ftxp.snd_nxt = tp->snd_nxt; 432 ftxp.rcv_nxt = tp->rcv_nxt; 433 ftxp.snd_space = bufsize; 434 ftxp.mss = tp->t_maxseg; 435 send_flowc_wr(toep, &ftxp); 436 437 soisconnected(so); 438 } 439 440 int 441 send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits) 442 { 443 struct wrqe *wr; 444 struct cpl_rx_data_ack *req; 445 uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); 446 447 KASSERT(credits >= 0, ("%s: %d credits", __func__, credits)); 448 449 wr = alloc_wrqe(sizeof(*req), toep->ctrlq); 450 if (wr == NULL) 451 return (0); 452 req = wrtod(wr); 453 454 INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid); 455 req->credit_dack = htobe32(dack | V_RX_CREDITS(credits)); 456 457 t4_wrq_tx(sc, wr); 458 return (credits); 459 } 460 461 void 462 send_rx_modulate(struct adapter *sc, struct toepcb *toep) 463 { 464 struct wrqe *wr; 465 struct cpl_rx_data_ack *req; 466 467 wr = alloc_wrqe(sizeof(*req), toep->ctrlq); 468 if (wr == NULL) 469 return; 470 req = wrtod(wr); 471 472 INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid); 473 req->credit_dack = htobe32(F_RX_MODULATE_RX); 474 475 t4_wrq_tx(sc, wr); 476 } 477 478 void 479 t4_rcvd_locked(struct toedev *tod, struct tcpcb *tp) 480 { 481 struct adapter *sc = tod->tod_softc; 482 struct inpcb *inp = tp->t_inpcb; 483 struct socket *so = inp->inp_socket; 484 struct sockbuf *sb = &so->so_rcv; 485 struct toepcb *toep = tp->t_toe; 486 int credits; 487 488 INP_WLOCK_ASSERT(inp); 489 490 SOCKBUF_LOCK_ASSERT(sb); 491 KASSERT(toep->sb_cc >= sbused(sb), 492 ("%s: sb %p has more data (%d) than last time (%d).", 493 __func__, sb, sbused(sb), toep->sb_cc)); 494 495 credits = toep->sb_cc - sbused(sb); 496 toep->sb_cc = sbused(sb); 497 if (toep->ulp_mode == ULP_MODE_TLS) { 498 if (toep->tls.rcv_over >= credits) { 499 toep->tls.rcv_over -= credits; 500 credits = 0; 501 } else { 502 credits -= toep->tls.rcv_over; 503 toep->tls.rcv_over = 0; 504 } 505 } 506 toep->rx_credits += credits; 507 508 if (toep->rx_credits > 0 && 509 (tp->rcv_wnd <= 32 * 1024 || toep->rx_credits >= 64 * 1024 || 510 (toep->rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) || 511 toep->sb_cc + tp->rcv_wnd < sb->sb_lowat)) { 512 513 credits = send_rx_credits(sc, toep, toep->rx_credits); 514 toep->rx_credits -= credits; 515 tp->rcv_wnd += credits; 516 tp->rcv_adv += credits; 517 } else if (toep->flags & TPF_FORCE_CREDITS) 518 send_rx_modulate(sc, toep); 519 } 520 521 void 522 t4_rcvd(struct toedev *tod, struct tcpcb *tp) 523 { 524 struct inpcb *inp = tp->t_inpcb; 525 struct socket *so = inp->inp_socket; 526 struct sockbuf *sb = &so->so_rcv; 527 528 SOCKBUF_LOCK(sb); 529 t4_rcvd_locked(tod, tp); 530 SOCKBUF_UNLOCK(sb); 531 } 532 533 /* 534 * Close a connection by sending a CPL_CLOSE_CON_REQ message. 
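 * The FIN consumes one byte of TCP sequence space; the peer's ACK of it
 * arrives as a CPL_CLOSE_CON_RPL and is handled in do_close_con_rpl(),
 * which subtracts that byte when it updates snd_una.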
535 */ 536 int 537 t4_close_conn(struct adapter *sc, struct toepcb *toep) 538 { 539 struct wrqe *wr; 540 struct cpl_close_con_req *req; 541 unsigned int tid = toep->tid; 542 543 CTR3(KTR_CXGBE, "%s: tid %u%s", __func__, toep->tid, 544 toep->flags & TPF_FIN_SENT ? ", IGNORED" : ""); 545 546 if (toep->flags & TPF_FIN_SENT) 547 return (0); 548 549 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 550 ("%s: flowc_wr not sent for tid %u.", __func__, tid)); 551 552 wr = alloc_wrqe(sizeof(*req), toep->ofld_txq); 553 if (wr == NULL) { 554 /* XXX */ 555 panic("%s: allocation failure.", __func__); 556 } 557 req = wrtod(wr); 558 559 req->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) | 560 V_FW_WR_IMMDLEN(sizeof(*req) - sizeof(req->wr))); 561 req->wr.wr_mid = htonl(V_FW_WR_LEN16(howmany(sizeof(*req), 16)) | 562 V_FW_WR_FLOWID(tid)); 563 req->wr.wr_lo = cpu_to_be64(0); 564 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid)); 565 req->rsvd = 0; 566 567 toep->flags |= TPF_FIN_SENT; 568 toep->flags &= ~TPF_SEND_FIN; 569 t4_l2t_send(sc, wr, toep->l2te); 570 571 return (0); 572 } 573 574 #define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16) 575 #define MIN_OFLD_TX_CREDITS (howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16)) 576 577 /* Maximum amount of immediate data we could stuff in a WR */ 578 static inline int 579 max_imm_payload(int tx_credits) 580 { 581 const int n = 2; /* Use only up to 2 desc for imm. data WR */ 582 583 KASSERT(tx_credits >= 0 && 584 tx_credits <= MAX_OFLD_TX_CREDITS, 585 ("%s: %d credits", __func__, tx_credits)); 586 587 if (tx_credits < MIN_OFLD_TX_CREDITS) 588 return (0); 589 590 if (tx_credits >= (n * EQ_ESIZE) / 16) 591 return ((n * EQ_ESIZE) - sizeof(struct fw_ofld_tx_data_wr)); 592 else 593 return (tx_credits * 16 - sizeof(struct fw_ofld_tx_data_wr)); 594 } 595 596 /* Maximum number of SGL entries we could stuff in a WR */ 597 static inline int 598 max_dsgl_nsegs(int tx_credits) 599 { 600 int nseg = 1; /* ulptx_sgl has room for 1, rest ulp_tx_sge_pair */ 601 int sge_pair_credits = tx_credits - MIN_OFLD_TX_CREDITS; 602 603 KASSERT(tx_credits >= 0 && 604 tx_credits <= MAX_OFLD_TX_CREDITS, 605 ("%s: %d credits", __func__, tx_credits)); 606 607 if (tx_credits < MIN_OFLD_TX_CREDITS) 608 return (0); 609 610 nseg += 2 * (sge_pair_credits * 16 / 24); 611 if ((sge_pair_credits * 16) % 24 == 16) 612 nseg++; 613 614 return (nseg); 615 } 616 617 static inline void 618 write_tx_wr(void *dst, struct toepcb *toep, unsigned int immdlen, 619 unsigned int plen, uint8_t credits, int shove, int ulp_submode, int txalign) 620 { 621 struct fw_ofld_tx_data_wr *txwr = dst; 622 623 txwr->op_to_immdlen = htobe32(V_WR_OP(FW_OFLD_TX_DATA_WR) | 624 V_FW_WR_IMMDLEN(immdlen)); 625 txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) | 626 V_FW_WR_LEN16(credits)); 627 txwr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(toep->ulp_mode) | 628 V_TX_ULP_SUBMODE(ulp_submode) | V_TX_URG(0) | V_TX_SHOVE(shove)); 629 txwr->plen = htobe32(plen); 630 631 if (txalign > 0) { 632 struct tcpcb *tp = intotcpcb(toep->inp); 633 634 if (plen < 2 * tp->t_maxseg) 635 txwr->lsodisable_to_flags |= 636 htobe32(F_FW_OFLD_TX_DATA_WR_LSODISABLE); 637 else 638 txwr->lsodisable_to_flags |= 639 htobe32(F_FW_OFLD_TX_DATA_WR_ALIGNPLD | 640 (tp->t_flags & TF_NODELAY ? 0 : 641 F_FW_OFLD_TX_DATA_WR_ALIGNPLDSHOVE)); 642 } 643 } 644 645 /* 646 * Generate a DSGL from a starting mbuf. The total number of segments and the 647 * maximum segments in any one mbuf are provided. 
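 * Layout note: the first segment fills len0/addr0 of the ulptx_sgl header
 * and the rest are packed two at a time into ulptx_sge_pair slots, so a
 * half-used trailing pair gets a zero length written into it at the end.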
648 */ 649 static void 650 write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n) 651 { 652 struct mbuf *m; 653 struct ulptx_sgl *usgl = dst; 654 int i, j, rc; 655 struct sglist sg; 656 struct sglist_seg segs[n]; 657 658 KASSERT(nsegs > 0, ("%s: nsegs 0", __func__)); 659 660 sglist_init(&sg, n, segs); 661 usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) | 662 V_ULPTX_NSGE(nsegs)); 663 664 i = -1; 665 for (m = start; m != stop; m = m->m_next) { 666 if (IS_AIOTX_MBUF(m)) 667 rc = sglist_append_vmpages(&sg, aiotx_mbuf_pages(m), 668 aiotx_mbuf_pgoff(m), m->m_len); 669 else 670 rc = sglist_append(&sg, mtod(m, void *), m->m_len); 671 if (__predict_false(rc != 0)) 672 panic("%s: sglist_append %d", __func__, rc); 673 674 for (j = 0; j < sg.sg_nseg; i++, j++) { 675 if (i < 0) { 676 usgl->len0 = htobe32(segs[j].ss_len); 677 usgl->addr0 = htobe64(segs[j].ss_paddr); 678 } else { 679 usgl->sge[i / 2].len[i & 1] = 680 htobe32(segs[j].ss_len); 681 usgl->sge[i / 2].addr[i & 1] = 682 htobe64(segs[j].ss_paddr); 683 } 684 #ifdef INVARIANTS 685 nsegs--; 686 #endif 687 } 688 sglist_reset(&sg); 689 } 690 if (i & 1) 691 usgl->sge[i / 2].len[1] = htobe32(0); 692 KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, stop %p", 693 __func__, nsegs, start, stop)); 694 } 695 696 /* 697 * Max number of SGL entries an offload tx work request can have. This is 41 698 * (1 + 40) for a full 512B work request. 699 * fw_ofld_tx_data_wr(16B) + ulptx_sgl(16B, 1) + ulptx_sge_pair(480B, 40) 700 */ 701 #define OFLD_SGL_LEN (41) 702 703 /* 704 * Send data and/or a FIN to the peer. 705 * 706 * The socket's so_snd buffer consists of a stream of data starting with sb_mb 707 * and linked together with m_next. sb_sndptr, if set, is the last mbuf that 708 * was transmitted. 709 * 710 * drop indicates the number of bytes that should be dropped from the head of 711 * the send buffer. It is an optimization that lets do_fw4_ack avoid creating 712 * contention on the send buffer lock (before this change it used to do 713 * sowwakeup and then t4_push_frames right after that when recovering from tx 714 * stalls). When drop is set this function MUST drop the bytes and wake up any 715 * writers. 
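 *
 * For example, do_fw4_ack() passes the number of payload bytes whose work
 * requests just completed as 'drop' when it resumes a suspended tid, so the
 * sbdrop_locked() and sowwakeup_locked() happen here instead of in the ACK
 * handler.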
 */
void
t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop)
{
        struct mbuf *sndptr, *m, *sb_sndptr;
        struct fw_ofld_tx_data_wr *txwr;
        struct wrqe *wr;
        u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
        struct inpcb *inp = toep->inp;
        struct tcpcb *tp = intotcpcb(inp);
        struct socket *so = inp->inp_socket;
        struct sockbuf *sb = &so->so_snd;
        int tx_credits, shove, compl, sowwakeup;
        struct ofld_tx_sdesc *txsd;
        bool aiotx_mbuf_seen;

        INP_WLOCK_ASSERT(inp);
        KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
            ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));

        KASSERT(toep->ulp_mode == ULP_MODE_NONE ||
            toep->ulp_mode == ULP_MODE_TCPDDP ||
            toep->ulp_mode == ULP_MODE_TLS ||
            toep->ulp_mode == ULP_MODE_RDMA,
            ("%s: ulp_mode %u for toep %p", __func__, toep->ulp_mode, toep));

#ifdef VERBOSE_TRACES
        CTR5(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d",
            __func__, toep->tid, toep->flags, tp->t_flags, drop);
#endif
        if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
                return;

#ifdef RATELIMIT
        if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) &&
            (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) {
                inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
        }
#endif

        /*
         * This function doesn't resume by itself. Someone else must clear the
         * flag and call this function.
         */
        if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
                KASSERT(drop == 0,
                    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
                return;
        }

        txsd = &toep->txsd[toep->txsd_pidx];
        do {
                tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
                max_imm = max_imm_payload(tx_credits);
                max_nsegs = max_dsgl_nsegs(tx_credits);

                SOCKBUF_LOCK(sb);
                sowwakeup = drop;
                if (drop) {
                        sbdrop_locked(sb, drop);
                        drop = 0;
                }
                sb_sndptr = sb->sb_sndptr;
                sndptr = sb_sndptr ?
sb_sndptr->m_next : sb->sb_mb; 780 plen = 0; 781 nsegs = 0; 782 max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */ 783 aiotx_mbuf_seen = false; 784 for (m = sndptr; m != NULL; m = m->m_next) { 785 int n; 786 787 if (IS_AIOTX_MBUF(m)) 788 n = sglist_count_vmpages(aiotx_mbuf_pages(m), 789 aiotx_mbuf_pgoff(m), m->m_len); 790 else 791 n = sglist_count(mtod(m, void *), m->m_len); 792 793 nsegs += n; 794 plen += m->m_len; 795 796 /* This mbuf sent us _over_ the nsegs limit, back out */ 797 if (plen > max_imm && nsegs > max_nsegs) { 798 nsegs -= n; 799 plen -= m->m_len; 800 if (plen == 0) { 801 /* Too few credits */ 802 toep->flags |= TPF_TX_SUSPENDED; 803 if (sowwakeup) { 804 if (!TAILQ_EMPTY( 805 &toep->aiotx_jobq)) 806 t4_aiotx_queue_toep( 807 toep); 808 sowwakeup_locked(so); 809 } else 810 SOCKBUF_UNLOCK(sb); 811 SOCKBUF_UNLOCK_ASSERT(sb); 812 return; 813 } 814 break; 815 } 816 817 if (IS_AIOTX_MBUF(m)) 818 aiotx_mbuf_seen = true; 819 if (max_nsegs_1mbuf < n) 820 max_nsegs_1mbuf = n; 821 sb_sndptr = m; /* new sb->sb_sndptr if all goes well */ 822 823 /* This mbuf put us right at the max_nsegs limit */ 824 if (plen > max_imm && nsegs == max_nsegs) { 825 m = m->m_next; 826 break; 827 } 828 } 829 830 if (sbused(sb) > sb->sb_hiwat * 5 / 8 && 831 toep->plen_nocompl + plen >= sb->sb_hiwat / 4) 832 compl = 1; 833 else 834 compl = 0; 835 836 if (sb->sb_flags & SB_AUTOSIZE && 837 V_tcp_do_autosndbuf && 838 sb->sb_hiwat < V_tcp_autosndbuf_max && 839 sbused(sb) >= sb->sb_hiwat * 7 / 8) { 840 int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc, 841 V_tcp_autosndbuf_max); 842 843 if (!sbreserve_locked(sb, newsize, so, NULL)) 844 sb->sb_flags &= ~SB_AUTOSIZE; 845 else 846 sowwakeup = 1; /* room available */ 847 } 848 if (sowwakeup) { 849 if (!TAILQ_EMPTY(&toep->aiotx_jobq)) 850 t4_aiotx_queue_toep(toep); 851 sowwakeup_locked(so); 852 } else 853 SOCKBUF_UNLOCK(sb); 854 SOCKBUF_UNLOCK_ASSERT(sb); 855 856 /* nothing to send */ 857 if (plen == 0) { 858 KASSERT(m == NULL, 859 ("%s: nothing to send, but m != NULL", __func__)); 860 break; 861 } 862 863 if (__predict_false(toep->flags & TPF_FIN_SENT)) 864 panic("%s: excess tx.", __func__); 865 866 shove = m == NULL && !(tp->t_flags & TF_MORETOCOME); 867 if (plen <= max_imm && !aiotx_mbuf_seen) { 868 869 /* Immediate data tx */ 870 871 wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16), 872 toep->ofld_txq); 873 if (wr == NULL) { 874 /* XXX: how will we recover from this? */ 875 toep->flags |= TPF_TX_SUSPENDED; 876 return; 877 } 878 txwr = wrtod(wr); 879 credits = howmany(wr->wr_len, 16); 880 write_tx_wr(txwr, toep, plen, plen, credits, shove, 0, 881 sc->tt.tx_align); 882 m_copydata(sndptr, 0, plen, (void *)(txwr + 1)); 883 nsegs = 0; 884 } else { 885 int wr_len; 886 887 /* DSGL tx */ 888 889 wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) + 890 ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8; 891 wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq); 892 if (wr == NULL) { 893 /* XXX: how will we recover from this? 
*/ 894 toep->flags |= TPF_TX_SUSPENDED; 895 return; 896 } 897 txwr = wrtod(wr); 898 credits = howmany(wr_len, 16); 899 write_tx_wr(txwr, toep, 0, plen, credits, shove, 0, 900 sc->tt.tx_align); 901 write_tx_sgl(txwr + 1, sndptr, m, nsegs, 902 max_nsegs_1mbuf); 903 if (wr_len & 0xf) { 904 uint64_t *pad = (uint64_t *) 905 ((uintptr_t)txwr + wr_len); 906 *pad = 0; 907 } 908 } 909 910 KASSERT(toep->tx_credits >= credits, 911 ("%s: not enough credits", __func__)); 912 913 toep->tx_credits -= credits; 914 toep->tx_nocompl += credits; 915 toep->plen_nocompl += plen; 916 if (toep->tx_credits <= toep->tx_total * 3 / 8 && 917 toep->tx_nocompl >= toep->tx_total / 4) 918 compl = 1; 919 920 if (compl || toep->ulp_mode == ULP_MODE_RDMA) { 921 txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL); 922 toep->tx_nocompl = 0; 923 toep->plen_nocompl = 0; 924 } 925 926 tp->snd_nxt += plen; 927 tp->snd_max += plen; 928 929 SOCKBUF_LOCK(sb); 930 KASSERT(sb_sndptr, ("%s: sb_sndptr is NULL", __func__)); 931 sb->sb_sndptr = sb_sndptr; 932 SOCKBUF_UNLOCK(sb); 933 934 toep->flags |= TPF_TX_DATA_SENT; 935 if (toep->tx_credits < MIN_OFLD_TX_CREDITS) 936 toep->flags |= TPF_TX_SUSPENDED; 937 938 KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); 939 txsd->plen = plen; 940 txsd->tx_credits = credits; 941 txsd++; 942 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) { 943 toep->txsd_pidx = 0; 944 txsd = &toep->txsd[0]; 945 } 946 toep->txsd_avail--; 947 948 t4_l2t_send(sc, wr, toep->l2te); 949 } while (m != NULL); 950 951 /* Send a FIN if requested, but only if there's no more data to send */ 952 if (m == NULL && toep->flags & TPF_SEND_FIN) 953 t4_close_conn(sc, toep); 954 } 955 956 static inline void 957 rqdrop_locked(struct mbufq *q, int plen) 958 { 959 struct mbuf *m; 960 961 while (plen > 0) { 962 m = mbufq_dequeue(q); 963 964 /* Too many credits. */ 965 MPASS(m != NULL); 966 M_ASSERTPKTHDR(m); 967 968 /* Partial credits. */ 969 MPASS(plen >= m->m_pkthdr.len); 970 971 plen -= m->m_pkthdr.len; 972 m_freem(m); 973 } 974 } 975 976 void 977 t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop) 978 { 979 struct mbuf *sndptr, *m; 980 struct fw_ofld_tx_data_wr *txwr; 981 struct wrqe *wr; 982 u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf; 983 u_int adjusted_plen, ulp_submode; 984 struct inpcb *inp = toep->inp; 985 struct tcpcb *tp = intotcpcb(inp); 986 int tx_credits, shove; 987 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; 988 struct mbufq *pduq = &toep->ulp_pduq; 989 static const u_int ulp_extra_len[] = {0, 4, 4, 8}; 990 991 INP_WLOCK_ASSERT(inp); 992 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 993 ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid)); 994 KASSERT(toep->ulp_mode == ULP_MODE_ISCSI, 995 ("%s: ulp_mode %u for toep %p", __func__, toep->ulp_mode, toep)); 996 997 if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) 998 return; 999 1000 /* 1001 * This function doesn't resume by itself. Someone else must clear the 1002 * flag and call this function. 
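	 * (In practice that is do_fw4_ack(), which clears TPF_TX_SUSPENDED
	 * once at least a quarter of the connection's tx credits are
	 * available again and then calls this function with the completed
	 * payload length as 'drop'.)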
1003 */ 1004 if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) { 1005 KASSERT(drop == 0, 1006 ("%s: drop (%d) != 0 but tx is suspended", __func__, drop)); 1007 return; 1008 } 1009 1010 if (drop) 1011 rqdrop_locked(&toep->ulp_pdu_reclaimq, drop); 1012 1013 while ((sndptr = mbufq_first(pduq)) != NULL) { 1014 M_ASSERTPKTHDR(sndptr); 1015 1016 tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS); 1017 max_imm = max_imm_payload(tx_credits); 1018 max_nsegs = max_dsgl_nsegs(tx_credits); 1019 1020 plen = 0; 1021 nsegs = 0; 1022 max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */ 1023 for (m = sndptr; m != NULL; m = m->m_next) { 1024 int n = sglist_count(mtod(m, void *), m->m_len); 1025 1026 nsegs += n; 1027 plen += m->m_len; 1028 1029 /* 1030 * This mbuf would send us _over_ the nsegs limit. 1031 * Suspend tx because the PDU can't be sent out. 1032 */ 1033 if (plen > max_imm && nsegs > max_nsegs) { 1034 toep->flags |= TPF_TX_SUSPENDED; 1035 return; 1036 } 1037 1038 if (max_nsegs_1mbuf < n) 1039 max_nsegs_1mbuf = n; 1040 } 1041 1042 if (__predict_false(toep->flags & TPF_FIN_SENT)) 1043 panic("%s: excess tx.", __func__); 1044 1045 /* 1046 * We have a PDU to send. All of it goes out in one WR so 'm' 1047 * is NULL. A PDU's length is always a multiple of 4. 1048 */ 1049 MPASS(m == NULL); 1050 MPASS((plen & 3) == 0); 1051 MPASS(sndptr->m_pkthdr.len == plen); 1052 1053 shove = !(tp->t_flags & TF_MORETOCOME); 1054 ulp_submode = mbuf_ulp_submode(sndptr); 1055 MPASS(ulp_submode < nitems(ulp_extra_len)); 1056 1057 /* 1058 * plen doesn't include header and data digests, which are 1059 * generated and inserted in the right places by the TOE, but 1060 * they do occupy TCP sequence space and need to be accounted 1061 * for. 1062 */ 1063 adjusted_plen = plen + ulp_extra_len[ulp_submode]; 1064 if (plen <= max_imm) { 1065 1066 /* Immediate data tx */ 1067 1068 wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16), 1069 toep->ofld_txq); 1070 if (wr == NULL) { 1071 /* XXX: how will we recover from this? */ 1072 toep->flags |= TPF_TX_SUSPENDED; 1073 return; 1074 } 1075 txwr = wrtod(wr); 1076 credits = howmany(wr->wr_len, 16); 1077 write_tx_wr(txwr, toep, plen, adjusted_plen, credits, 1078 shove, ulp_submode, sc->tt.tx_align); 1079 m_copydata(sndptr, 0, plen, (void *)(txwr + 1)); 1080 nsegs = 0; 1081 } else { 1082 int wr_len; 1083 1084 /* DSGL tx */ 1085 wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) + 1086 ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8; 1087 wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq); 1088 if (wr == NULL) { 1089 /* XXX: how will we recover from this? 
*/ 1090 toep->flags |= TPF_TX_SUSPENDED; 1091 return; 1092 } 1093 txwr = wrtod(wr); 1094 credits = howmany(wr_len, 16); 1095 write_tx_wr(txwr, toep, 0, adjusted_plen, credits, 1096 shove, ulp_submode, sc->tt.tx_align); 1097 write_tx_sgl(txwr + 1, sndptr, m, nsegs, 1098 max_nsegs_1mbuf); 1099 if (wr_len & 0xf) { 1100 uint64_t *pad = (uint64_t *) 1101 ((uintptr_t)txwr + wr_len); 1102 *pad = 0; 1103 } 1104 } 1105 1106 KASSERT(toep->tx_credits >= credits, 1107 ("%s: not enough credits", __func__)); 1108 1109 m = mbufq_dequeue(pduq); 1110 MPASS(m == sndptr); 1111 mbufq_enqueue(&toep->ulp_pdu_reclaimq, m); 1112 1113 toep->tx_credits -= credits; 1114 toep->tx_nocompl += credits; 1115 toep->plen_nocompl += plen; 1116 if (toep->tx_credits <= toep->tx_total * 3 / 8 && 1117 toep->tx_nocompl >= toep->tx_total / 4) { 1118 txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL); 1119 toep->tx_nocompl = 0; 1120 toep->plen_nocompl = 0; 1121 } 1122 1123 tp->snd_nxt += adjusted_plen; 1124 tp->snd_max += adjusted_plen; 1125 1126 toep->flags |= TPF_TX_DATA_SENT; 1127 if (toep->tx_credits < MIN_OFLD_TX_CREDITS) 1128 toep->flags |= TPF_TX_SUSPENDED; 1129 1130 KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); 1131 txsd->plen = plen; 1132 txsd->tx_credits = credits; 1133 txsd++; 1134 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) { 1135 toep->txsd_pidx = 0; 1136 txsd = &toep->txsd[0]; 1137 } 1138 toep->txsd_avail--; 1139 1140 t4_l2t_send(sc, wr, toep->l2te); 1141 } 1142 1143 /* Send a FIN if requested, but only if there are no more PDUs to send */ 1144 if (mbufq_first(pduq) == NULL && toep->flags & TPF_SEND_FIN) 1145 t4_close_conn(sc, toep); 1146 } 1147 1148 int 1149 t4_tod_output(struct toedev *tod, struct tcpcb *tp) 1150 { 1151 struct adapter *sc = tod->tod_softc; 1152 #ifdef INVARIANTS 1153 struct inpcb *inp = tp->t_inpcb; 1154 #endif 1155 struct toepcb *toep = tp->t_toe; 1156 1157 INP_WLOCK_ASSERT(inp); 1158 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1159 ("%s: inp %p dropped.", __func__, inp)); 1160 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 1161 1162 if (toep->ulp_mode == ULP_MODE_ISCSI) 1163 t4_push_pdus(sc, toep, 0); 1164 else if (tls_tx_key(toep)) 1165 t4_push_tls_records(sc, toep, 0); 1166 else 1167 t4_push_frames(sc, toep, 0); 1168 1169 return (0); 1170 } 1171 1172 int 1173 t4_send_fin(struct toedev *tod, struct tcpcb *tp) 1174 { 1175 struct adapter *sc = tod->tod_softc; 1176 #ifdef INVARIANTS 1177 struct inpcb *inp = tp->t_inpcb; 1178 #endif 1179 struct toepcb *toep = tp->t_toe; 1180 1181 INP_WLOCK_ASSERT(inp); 1182 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1183 ("%s: inp %p dropped.", __func__, inp)); 1184 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 1185 1186 toep->flags |= TPF_SEND_FIN; 1187 if (tp->t_state >= TCPS_ESTABLISHED) { 1188 if (toep->ulp_mode == ULP_MODE_ISCSI) 1189 t4_push_pdus(sc, toep, 0); 1190 else if (tls_tx_key(toep)) 1191 t4_push_tls_records(sc, toep, 0); 1192 else 1193 t4_push_frames(sc, toep, 0); 1194 } 1195 1196 return (0); 1197 } 1198 1199 int 1200 t4_send_rst(struct toedev *tod, struct tcpcb *tp) 1201 { 1202 struct adapter *sc = tod->tod_softc; 1203 #if defined(INVARIANTS) 1204 struct inpcb *inp = tp->t_inpcb; 1205 #endif 1206 struct toepcb *toep = tp->t_toe; 1207 1208 INP_WLOCK_ASSERT(inp); 1209 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1210 ("%s: inp %p dropped.", __func__, inp)); 1211 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 1212 1213 /* hmmmm */ 1214 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 1215 ("%s: flowc for tid %u 
[%s] not sent already", 1216 __func__, toep->tid, tcpstates[tp->t_state])); 1217 1218 send_reset(sc, toep, 0); 1219 return (0); 1220 } 1221 1222 /* 1223 * Peer has sent us a FIN. 1224 */ 1225 static int 1226 do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1227 { 1228 struct adapter *sc = iq->adapter; 1229 const struct cpl_peer_close *cpl = (const void *)(rss + 1); 1230 unsigned int tid = GET_TID(cpl); 1231 struct toepcb *toep = lookup_tid(sc, tid); 1232 struct inpcb *inp = toep->inp; 1233 struct tcpcb *tp = NULL; 1234 struct socket *so; 1235 struct epoch_tracker et; 1236 #ifdef INVARIANTS 1237 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1238 #endif 1239 1240 KASSERT(opcode == CPL_PEER_CLOSE, 1241 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1242 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1243 1244 if (__predict_false(toep->flags & TPF_SYNQE)) { 1245 /* 1246 * do_pass_establish must have run before do_peer_close and if 1247 * this is still a synqe instead of a toepcb then the connection 1248 * must be getting aborted. 1249 */ 1250 MPASS(toep->flags & TPF_ABORT_SHUTDOWN); 1251 CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid, 1252 toep, toep->flags); 1253 return (0); 1254 } 1255 1256 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1257 1258 CURVNET_SET(toep->vnet); 1259 INP_INFO_RLOCK_ET(&V_tcbinfo, et); 1260 INP_WLOCK(inp); 1261 tp = intotcpcb(inp); 1262 1263 CTR5(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x, inp %p", __func__, 1264 tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags, inp); 1265 1266 if (toep->flags & TPF_ABORT_SHUTDOWN) 1267 goto done; 1268 1269 tp->rcv_nxt++; /* FIN */ 1270 1271 so = inp->inp_socket; 1272 if (toep->ulp_mode == ULP_MODE_TCPDDP) { 1273 DDP_LOCK(toep); 1274 if (__predict_false(toep->ddp.flags & 1275 (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE))) 1276 handle_ddp_close(toep, tp, cpl->rcv_nxt); 1277 DDP_UNLOCK(toep); 1278 } 1279 socantrcvmore(so); 1280 1281 if (toep->ulp_mode != ULP_MODE_RDMA) { 1282 KASSERT(tp->rcv_nxt == be32toh(cpl->rcv_nxt), 1283 ("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt, 1284 be32toh(cpl->rcv_nxt))); 1285 } 1286 1287 switch (tp->t_state) { 1288 case TCPS_SYN_RECEIVED: 1289 tp->t_starttime = ticks; 1290 /* FALLTHROUGH */ 1291 1292 case TCPS_ESTABLISHED: 1293 tcp_state_change(tp, TCPS_CLOSE_WAIT); 1294 break; 1295 1296 case TCPS_FIN_WAIT_1: 1297 tcp_state_change(tp, TCPS_CLOSING); 1298 break; 1299 1300 case TCPS_FIN_WAIT_2: 1301 tcp_twstart(tp); 1302 INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ 1303 INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); 1304 CURVNET_RESTORE(); 1305 1306 INP_WLOCK(inp); 1307 final_cpl_received(toep); 1308 return (0); 1309 1310 default: 1311 log(LOG_ERR, "%s: TID %u received CPL_PEER_CLOSE in state %d\n", 1312 __func__, tid, tp->t_state); 1313 } 1314 done: 1315 INP_WUNLOCK(inp); 1316 INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); 1317 CURVNET_RESTORE(); 1318 return (0); 1319 } 1320 1321 /* 1322 * Peer has ACK'd our FIN. 
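 * The snd_nxt reported by the chip covers the FIN, which is why the handler
 * below subtracts one when it loads snd_una (see the "exclude FIN" note).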
1323 */ 1324 static int 1325 do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss, 1326 struct mbuf *m) 1327 { 1328 struct adapter *sc = iq->adapter; 1329 const struct cpl_close_con_rpl *cpl = (const void *)(rss + 1); 1330 unsigned int tid = GET_TID(cpl); 1331 struct toepcb *toep = lookup_tid(sc, tid); 1332 struct inpcb *inp = toep->inp; 1333 struct tcpcb *tp = NULL; 1334 struct socket *so = NULL; 1335 struct epoch_tracker et; 1336 #ifdef INVARIANTS 1337 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1338 #endif 1339 1340 KASSERT(opcode == CPL_CLOSE_CON_RPL, 1341 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1342 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1343 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1344 1345 CURVNET_SET(toep->vnet); 1346 INP_INFO_RLOCK_ET(&V_tcbinfo, et); 1347 INP_WLOCK(inp); 1348 tp = intotcpcb(inp); 1349 1350 CTR4(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x", 1351 __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags); 1352 1353 if (toep->flags & TPF_ABORT_SHUTDOWN) 1354 goto done; 1355 1356 so = inp->inp_socket; 1357 tp->snd_una = be32toh(cpl->snd_nxt) - 1; /* exclude FIN */ 1358 1359 switch (tp->t_state) { 1360 case TCPS_CLOSING: /* see TCPS_FIN_WAIT_2 in do_peer_close too */ 1361 tcp_twstart(tp); 1362 release: 1363 INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ 1364 INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); 1365 CURVNET_RESTORE(); 1366 1367 INP_WLOCK(inp); 1368 final_cpl_received(toep); /* no more CPLs expected */ 1369 1370 return (0); 1371 case TCPS_LAST_ACK: 1372 if (tcp_close(tp)) 1373 INP_WUNLOCK(inp); 1374 goto release; 1375 1376 case TCPS_FIN_WAIT_1: 1377 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 1378 soisdisconnected(so); 1379 tcp_state_change(tp, TCPS_FIN_WAIT_2); 1380 break; 1381 1382 default: 1383 log(LOG_ERR, 1384 "%s: TID %u received CPL_CLOSE_CON_RPL in state %s\n", 1385 __func__, tid, tcpstates[tp->t_state]); 1386 } 1387 done: 1388 INP_WUNLOCK(inp); 1389 INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); 1390 CURVNET_RESTORE(); 1391 return (0); 1392 } 1393 1394 void 1395 send_abort_rpl(struct adapter *sc, struct sge_wrq *ofld_txq, int tid, 1396 int rst_status) 1397 { 1398 struct wrqe *wr; 1399 struct cpl_abort_rpl *cpl; 1400 1401 wr = alloc_wrqe(sizeof(*cpl), ofld_txq); 1402 if (wr == NULL) { 1403 /* XXX */ 1404 panic("%s: allocation failure.", __func__); 1405 } 1406 cpl = wrtod(wr); 1407 1408 INIT_TP_WR_MIT_CPL(cpl, CPL_ABORT_RPL, tid); 1409 cpl->cmd = rst_status; 1410 1411 t4_wrq_tx(sc, wr); 1412 } 1413 1414 static int 1415 abort_status_to_errno(struct tcpcb *tp, unsigned int abort_reason) 1416 { 1417 switch (abort_reason) { 1418 case CPL_ERR_BAD_SYN: 1419 case CPL_ERR_CONN_RESET: 1420 return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET); 1421 case CPL_ERR_XMIT_TIMEDOUT: 1422 case CPL_ERR_PERSIST_TIMEDOUT: 1423 case CPL_ERR_FINWAIT2_TIMEDOUT: 1424 case CPL_ERR_KEEPALIVE_TIMEDOUT: 1425 return (ETIMEDOUT); 1426 default: 1427 return (EIO); 1428 } 1429 } 1430 1431 /* 1432 * TCP RST from the peer, timeout, or some other such critical error. 
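 * Negative advice from the chip is ignored here, and the T4 is owed a
 * CPL_ABORT_RPL no matter how the request is disposed of (see below).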
1433 */ 1434 static int 1435 do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1436 { 1437 struct adapter *sc = iq->adapter; 1438 const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1); 1439 unsigned int tid = GET_TID(cpl); 1440 struct toepcb *toep = lookup_tid(sc, tid); 1441 struct sge_wrq *ofld_txq = toep->ofld_txq; 1442 struct inpcb *inp; 1443 struct tcpcb *tp; 1444 struct epoch_tracker et; 1445 #ifdef INVARIANTS 1446 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1447 #endif 1448 1449 KASSERT(opcode == CPL_ABORT_REQ_RSS, 1450 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1451 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1452 1453 if (toep->flags & TPF_SYNQE) 1454 return (do_abort_req_synqe(iq, rss, m)); 1455 1456 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1457 1458 if (negative_advice(cpl->status)) { 1459 CTR4(KTR_CXGBE, "%s: negative advice %d for tid %d (0x%x)", 1460 __func__, cpl->status, tid, toep->flags); 1461 return (0); /* Ignore negative advice */ 1462 } 1463 1464 inp = toep->inp; 1465 CURVNET_SET(toep->vnet); 1466 INP_INFO_RLOCK_ET(&V_tcbinfo, et); /* for tcp_close */ 1467 INP_WLOCK(inp); 1468 1469 tp = intotcpcb(inp); 1470 1471 CTR6(KTR_CXGBE, 1472 "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x, status %d", 1473 __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags, 1474 inp->inp_flags, cpl->status); 1475 1476 /* 1477 * If we'd initiated an abort earlier the reply to it is responsible for 1478 * cleaning up resources. Otherwise we tear everything down right here 1479 * right now. We owe the T4 a CPL_ABORT_RPL no matter what. 1480 */ 1481 if (toep->flags & TPF_ABORT_SHUTDOWN) { 1482 INP_WUNLOCK(inp); 1483 goto done; 1484 } 1485 toep->flags |= TPF_ABORT_SHUTDOWN; 1486 1487 if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) { 1488 struct socket *so = inp->inp_socket; 1489 1490 if (so != NULL) 1491 so_error_set(so, abort_status_to_errno(tp, 1492 cpl->status)); 1493 tp = tcp_close(tp); 1494 if (tp == NULL) 1495 INP_WLOCK(inp); /* re-acquire */ 1496 } 1497 1498 final_cpl_received(toep); 1499 done: 1500 INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); 1501 CURVNET_RESTORE(); 1502 send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST); 1503 return (0); 1504 } 1505 1506 /* 1507 * Reply to the CPL_ABORT_REQ (send_reset) 1508 */ 1509 static int 1510 do_abort_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1511 { 1512 struct adapter *sc = iq->adapter; 1513 const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1); 1514 unsigned int tid = GET_TID(cpl); 1515 struct toepcb *toep = lookup_tid(sc, tid); 1516 struct inpcb *inp = toep->inp; 1517 #ifdef INVARIANTS 1518 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1519 #endif 1520 1521 KASSERT(opcode == CPL_ABORT_RPL_RSS, 1522 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1523 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1524 1525 if (toep->flags & TPF_SYNQE) 1526 return (do_abort_rpl_synqe(iq, rss, m)); 1527 1528 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1529 1530 CTR5(KTR_CXGBE, "%s: tid %u, toep %p, inp %p, status %d", 1531 __func__, tid, toep, inp, cpl->status); 1532 1533 KASSERT(toep->flags & TPF_ABORT_SHUTDOWN, 1534 ("%s: wasn't expecting abort reply", __func__)); 1535 1536 INP_WLOCK(inp); 1537 final_cpl_received(toep); 1538 1539 return (0); 1540 } 1541 1542 static int 1543 do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1544 
{ 1545 struct adapter *sc = iq->adapter; 1546 const struct cpl_rx_data *cpl = mtod(m, const void *); 1547 unsigned int tid = GET_TID(cpl); 1548 struct toepcb *toep = lookup_tid(sc, tid); 1549 struct inpcb *inp = toep->inp; 1550 struct tcpcb *tp; 1551 struct socket *so; 1552 struct sockbuf *sb; 1553 struct epoch_tracker et; 1554 int len; 1555 uint32_t ddp_placed = 0; 1556 1557 if (__predict_false(toep->flags & TPF_SYNQE)) { 1558 /* 1559 * do_pass_establish must have run before do_rx_data and if this 1560 * is still a synqe instead of a toepcb then the connection must 1561 * be getting aborted. 1562 */ 1563 MPASS(toep->flags & TPF_ABORT_SHUTDOWN); 1564 CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid, 1565 toep, toep->flags); 1566 m_freem(m); 1567 return (0); 1568 } 1569 1570 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1571 1572 /* strip off CPL header */ 1573 m_adj(m, sizeof(*cpl)); 1574 len = m->m_pkthdr.len; 1575 1576 INP_WLOCK(inp); 1577 if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) { 1578 CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x", 1579 __func__, tid, len, inp->inp_flags); 1580 INP_WUNLOCK(inp); 1581 m_freem(m); 1582 return (0); 1583 } 1584 1585 tp = intotcpcb(inp); 1586 1587 if (__predict_false(tp->rcv_nxt != be32toh(cpl->seq))) 1588 ddp_placed = be32toh(cpl->seq) - tp->rcv_nxt; 1589 1590 tp->rcv_nxt += len; 1591 if (tp->rcv_wnd < len) { 1592 KASSERT(toep->ulp_mode == ULP_MODE_RDMA, 1593 ("%s: negative window size", __func__)); 1594 } 1595 1596 tp->rcv_wnd -= len; 1597 tp->t_rcvtime = ticks; 1598 1599 if (toep->ulp_mode == ULP_MODE_TCPDDP) 1600 DDP_LOCK(toep); 1601 so = inp_inpcbtosocket(inp); 1602 sb = &so->so_rcv; 1603 SOCKBUF_LOCK(sb); 1604 1605 if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) { 1606 CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)", 1607 __func__, tid, len); 1608 m_freem(m); 1609 SOCKBUF_UNLOCK(sb); 1610 if (toep->ulp_mode == ULP_MODE_TCPDDP) 1611 DDP_UNLOCK(toep); 1612 INP_WUNLOCK(inp); 1613 1614 CURVNET_SET(toep->vnet); 1615 INP_INFO_RLOCK_ET(&V_tcbinfo, et); 1616 INP_WLOCK(inp); 1617 tp = tcp_drop(tp, ECONNRESET); 1618 if (tp) 1619 INP_WUNLOCK(inp); 1620 INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); 1621 CURVNET_RESTORE(); 1622 1623 return (0); 1624 } 1625 1626 /* receive buffer autosize */ 1627 MPASS(toep->vnet == so->so_vnet); 1628 CURVNET_SET(toep->vnet); 1629 if (sb->sb_flags & SB_AUTOSIZE && 1630 V_tcp_do_autorcvbuf && 1631 sb->sb_hiwat < V_tcp_autorcvbuf_max && 1632 len > (sbspace(sb) / 8 * 7)) { 1633 unsigned int hiwat = sb->sb_hiwat; 1634 unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc, 1635 V_tcp_autorcvbuf_max); 1636 1637 if (!sbreserve_locked(sb, newsize, so, NULL)) 1638 sb->sb_flags &= ~SB_AUTOSIZE; 1639 else 1640 toep->rx_credits += newsize - hiwat; 1641 } 1642 1643 if (toep->ulp_mode == ULP_MODE_TCPDDP) { 1644 int changed = !(toep->ddp.flags & DDP_ON) ^ cpl->ddp_off; 1645 1646 if (toep->ddp.waiting_count != 0 || toep->ddp.active_count != 0) 1647 CTR3(KTR_CXGBE, "%s: tid %u, non-ddp rx (%d bytes)", 1648 __func__, tid, len); 1649 1650 if (changed) { 1651 if (toep->ddp.flags & DDP_SC_REQ) 1652 toep->ddp.flags ^= DDP_ON | DDP_SC_REQ; 1653 else { 1654 KASSERT(cpl->ddp_off == 1, 1655 ("%s: DDP switched on by itself.", 1656 __func__)); 1657 1658 /* Fell out of DDP mode */ 1659 toep->ddp.flags &= ~DDP_ON; 1660 CTR1(KTR_CXGBE, "%s: fell out of DDP mode", 1661 __func__); 1662 1663 insert_ddp_data(toep, ddp_placed); 1664 } 1665 } 1666 1667 if (toep->ddp.flags & DDP_ON) { 1668 /* 1669 * CPL_RX_DATA 
with DDP on can only be an indicate. 1670 * Start posting queued AIO requests via DDP. The 1671 * payload that arrived in this indicate is appended 1672 * to the socket buffer as usual. 1673 */ 1674 handle_ddp_indicate(toep); 1675 } 1676 } 1677 1678 KASSERT(toep->sb_cc >= sbused(sb), 1679 ("%s: sb %p has more data (%d) than last time (%d).", 1680 __func__, sb, sbused(sb), toep->sb_cc)); 1681 toep->rx_credits += toep->sb_cc - sbused(sb); 1682 sbappendstream_locked(sb, m, 0); 1683 toep->sb_cc = sbused(sb); 1684 if (toep->rx_credits > 0 && toep->sb_cc + tp->rcv_wnd < sb->sb_lowat) { 1685 int credits; 1686 1687 credits = send_rx_credits(sc, toep, toep->rx_credits); 1688 toep->rx_credits -= credits; 1689 tp->rcv_wnd += credits; 1690 tp->rcv_adv += credits; 1691 } 1692 1693 if (toep->ulp_mode == ULP_MODE_TCPDDP && toep->ddp.waiting_count > 0 && 1694 sbavail(sb) != 0) { 1695 CTR2(KTR_CXGBE, "%s: tid %u queueing AIO task", __func__, 1696 tid); 1697 ddp_queue_toep(toep); 1698 } 1699 sorwakeup_locked(so); 1700 SOCKBUF_UNLOCK_ASSERT(sb); 1701 if (toep->ulp_mode == ULP_MODE_TCPDDP) 1702 DDP_UNLOCK(toep); 1703 1704 INP_WUNLOCK(inp); 1705 CURVNET_RESTORE(); 1706 return (0); 1707 } 1708 1709 static int 1710 do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1711 { 1712 struct adapter *sc = iq->adapter; 1713 const struct cpl_fw4_ack *cpl = (const void *)(rss + 1); 1714 unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl))); 1715 struct toepcb *toep = lookup_tid(sc, tid); 1716 struct inpcb *inp; 1717 struct tcpcb *tp; 1718 struct socket *so; 1719 uint8_t credits = cpl->credits; 1720 struct ofld_tx_sdesc *txsd; 1721 int plen; 1722 #ifdef INVARIANTS 1723 unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl))); 1724 #endif 1725 1726 /* 1727 * Very unusual case: we'd sent a flowc + abort_req for a synq entry and 1728 * now this comes back carrying the credits for the flowc. 
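	 * The entry is already being torn down (TPF_ABORT_SHUTDOWN is
	 * asserted below), so the credits are simply discarded.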
 */
        if (__predict_false(toep->flags & TPF_SYNQE)) {
                KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
                    ("%s: credits for a synq entry %p", __func__, toep));
                return (0);
        }

        inp = toep->inp;

        KASSERT(opcode == CPL_FW4_ACK,
            ("%s: unexpected opcode 0x%x", __func__, opcode));
        KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
        KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

        INP_WLOCK(inp);

        if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) {
                INP_WUNLOCK(inp);
                return (0);
        }

        KASSERT((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0,
            ("%s: inp_flags 0x%x", __func__, inp->inp_flags));

        tp = intotcpcb(inp);

        if (cpl->flags & CPL_FW4_ACK_FLAGS_SEQVAL) {
                tcp_seq snd_una = be32toh(cpl->snd_una);

#ifdef INVARIANTS
                if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
                        log(LOG_ERR,
                            "%s: unexpected seq# %x for TID %u, snd_una %x\n",
                            __func__, snd_una, toep->tid, tp->snd_una);
                }
#endif

                if (tp->snd_una != snd_una) {
                        tp->snd_una = snd_una;
                        tp->ts_recent_age = tcp_ts_getticks();
                }
        }

#ifdef VERBOSE_TRACES
        CTR3(KTR_CXGBE, "%s: tid %d credits %u", __func__, tid, credits);
#endif
        so = inp->inp_socket;
        txsd = &toep->txsd[toep->txsd_cidx];
        plen = 0;
        while (credits) {
                KASSERT(credits >= txsd->tx_credits,
                    ("%s: too many (or partial) credits", __func__));
                credits -= txsd->tx_credits;
                toep->tx_credits += txsd->tx_credits;
                plen += txsd->plen;
                if (txsd->iv_buffer) {
                        free(txsd->iv_buffer, M_CXGBE);
                        txsd->iv_buffer = NULL;
                }
                txsd++;
                toep->txsd_avail++;
                KASSERT(toep->txsd_avail <= toep->txsd_total,
                    ("%s: txsd avail > total", __func__));
                if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) {
                        txsd = &toep->txsd[0];
                        toep->txsd_cidx = 0;
                }
        }

        if (toep->tx_credits == toep->tx_total) {
                toep->tx_nocompl = 0;
                toep->plen_nocompl = 0;
        }

        if (toep->flags & TPF_TX_SUSPENDED &&
            toep->tx_credits >= toep->tx_total / 4) {
#ifdef VERBOSE_TRACES
                CTR2(KTR_CXGBE, "%s: tid %d calling t4_push_frames", __func__,
                    tid);
#endif
                toep->flags &= ~TPF_TX_SUSPENDED;
                CURVNET_SET(toep->vnet);
                if (toep->ulp_mode == ULP_MODE_ISCSI)
                        t4_push_pdus(sc, toep, plen);
                else if (tls_tx_key(toep))
                        t4_push_tls_records(sc, toep, plen);
                else
                        t4_push_frames(sc, toep, plen);
                CURVNET_RESTORE();
        } else if (plen > 0) {
                struct sockbuf *sb = &so->so_snd;
                int sbu;

                SOCKBUF_LOCK(sb);
                sbu = sbused(sb);
                if (toep->ulp_mode == ULP_MODE_ISCSI) {

                        if (__predict_false(sbu > 0)) {
                                /*
                                 * The data transmitted before the tid's ULP mode
                                 * changed to ISCSI is still in so_snd.
                                 * Incoming credits should account for so_snd
                                 * first.
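                                 * Whatever remains after draining so_snd is
                                 * then used to reclaim completed PDUs from
                                 * ulp_pdu_reclaimq via rqdrop_locked().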
1832 */ 1833 sbdrop_locked(sb, min(sbu, plen)); 1834 plen -= min(sbu, plen); 1835 } 1836 sowwakeup_locked(so); /* unlocks so_snd */ 1837 rqdrop_locked(&toep->ulp_pdu_reclaimq, plen); 1838 } else { 1839 #ifdef VERBOSE_TRACES 1840 CTR3(KTR_CXGBE, "%s: tid %d dropped %d bytes", __func__, 1841 tid, plen); 1842 #endif 1843 sbdrop_locked(sb, plen); 1844 if (tls_tx_key(toep)) { 1845 struct tls_ofld_info *tls_ofld = &toep->tls; 1846 1847 MPASS(tls_ofld->sb_off >= plen); 1848 tls_ofld->sb_off -= plen; 1849 } 1850 if (!TAILQ_EMPTY(&toep->aiotx_jobq)) 1851 t4_aiotx_queue_toep(toep); 1852 sowwakeup_locked(so); /* unlocks so_snd */ 1853 } 1854 SOCKBUF_UNLOCK_ASSERT(sb); 1855 } 1856 1857 INP_WUNLOCK(inp); 1858 1859 return (0); 1860 } 1861 1862 void 1863 t4_set_tcb_field(struct adapter *sc, struct sge_wrq *wrq, struct toepcb *toep, 1864 uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie) 1865 { 1866 struct wrqe *wr; 1867 struct cpl_set_tcb_field *req; 1868 struct ofld_tx_sdesc *txsd; 1869 1870 MPASS((cookie & ~M_COOKIE) == 0); 1871 if (reply) { 1872 MPASS(cookie != CPL_COOKIE_RESERVED); 1873 } 1874 1875 wr = alloc_wrqe(sizeof(*req), wrq); 1876 if (wr == NULL) { 1877 /* XXX */ 1878 panic("%s: allocation failure.", __func__); 1879 } 1880 req = wrtod(wr); 1881 1882 INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, toep->tid); 1883 req->reply_ctrl = htobe16(V_QUEUENO(toep->ofld_rxq->iq.abs_id)); 1884 if (reply == 0) 1885 req->reply_ctrl |= htobe16(F_NO_REPLY); 1886 req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(cookie)); 1887 req->mask = htobe64(mask); 1888 req->val = htobe64(val); 1889 if ((wrq->eq.flags & EQ_TYPEMASK) == EQ_OFLD) { 1890 txsd = &toep->txsd[toep->txsd_pidx]; 1891 txsd->tx_credits = howmany(sizeof(*req), 16); 1892 txsd->plen = 0; 1893 KASSERT(toep->tx_credits >= txsd->tx_credits && 1894 toep->txsd_avail > 0, 1895 ("%s: not enough credits (%d)", __func__, 1896 toep->tx_credits)); 1897 toep->tx_credits -= txsd->tx_credits; 1898 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) 1899 toep->txsd_pidx = 0; 1900 toep->txsd_avail--; 1901 } 1902 1903 t4_wrq_tx(sc, wr); 1904 } 1905 1906 void 1907 t4_init_cpl_io_handlers(void) 1908 { 1909 1910 t4_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close); 1911 t4_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl); 1912 t4_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req); 1913 t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl, 1914 CPL_COOKIE_TOM); 1915 t4_register_cpl_handler(CPL_RX_DATA, do_rx_data); 1916 t4_register_shared_cpl_handler(CPL_FW4_ACK, do_fw4_ack, CPL_COOKIE_TOM); 1917 } 1918 1919 void 1920 t4_uninit_cpl_io_handlers(void) 1921 { 1922 1923 t4_register_cpl_handler(CPL_PEER_CLOSE, NULL); 1924 t4_register_cpl_handler(CPL_CLOSE_CON_RPL, NULL); 1925 t4_register_cpl_handler(CPL_ABORT_REQ_RSS, NULL); 1926 t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, NULL, CPL_COOKIE_TOM); 1927 t4_register_cpl_handler(CPL_RX_DATA, NULL); 1928 t4_register_shared_cpl_handler(CPL_FW4_ACK, NULL, CPL_COOKIE_TOM); 1929 } 1930 1931 /* 1932 * Use the 'backend3' field in AIO jobs to store the amount of data 1933 * sent by the AIO job so far and the 'backend4' field to hold an 1934 * error that should be reported when the job is completed. 
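 *
 * Roughly (a simplified sketch of free_aiotx_buffer() below, ignoring the
 * ECANCELED special case handled there):
 *
 *	error = job->aio_error;
 *	if (error)
 *		aio_complete(job, -1, error);
 *	else
 *		aio_complete(job, job->aio_sent, 0);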

/*
 * Use the 'backend3' field in AIO jobs to store the amount of data
 * sent by the AIO job so far and the 'backend4' field to hold an
 * error that should be reported when the job is completed.
 */
#define	aio_sent	backend3
#define	aio_error	backend4

#define	jobtotid(job)							\
	(((struct toepcb *)(so_sototcpcb((job)->fd_file->f_data)->t_toe))->tid)

static void
free_aiotx_buffer(struct aiotx_buffer *ab)
{
	struct kaiocb *job;
	long status;
	int error;

	if (refcount_release(&ab->refcount) == 0)
		return;

	job = ab->job;
	error = job->aio_error;
	status = job->aio_sent;
	vm_page_unhold_pages(ab->ps.pages, ab->ps.npages);
	free(ab, M_CXGBE);
#ifdef VERBOSE_TRACES
	CTR5(KTR_CXGBE, "%s: tid %d completed %p len %ld, error %d", __func__,
	    jobtotid(job), job, status, error);
#endif
	if (error == ECANCELED && status != 0)
		error = 0;
	if (error == ECANCELED)
		aio_cancel(job);
	else if (error)
		aio_complete(job, -1, error);
	else
		aio_complete(job, status, 0);
}

static void
t4_aiotx_mbuf_free(struct mbuf *m)
{
	struct aiotx_buffer *ab = m->m_ext.ext_arg1;

#ifdef VERBOSE_TRACES
	CTR3(KTR_CXGBE, "%s: completed %d bytes for tid %d", __func__,
	    m->m_len, jobtotid(ab->job));
#endif
	free_aiotx_buffer(ab);
}

/*
 * Hold the buffer backing an AIO request and return an AIO transmit
 * buffer.
 */
static int
hold_aio(struct kaiocb *job)
{
	struct aiotx_buffer *ab;
	struct vmspace *vm;
	vm_map_t map;
	vm_offset_t start, end, pgoff;
	int n;

	MPASS(job->backend1 == NULL);

	/*
	 * The AIO subsystem will cancel and drain all requests before
	 * permitting a process to exit or exec, so p_vmspace should
	 * be stable here.
	 */
	vm = job->userproc->p_vmspace;
	map = &vm->vm_map;
	start = (uintptr_t)job->uaiocb.aio_buf;
	pgoff = start & PAGE_MASK;
	end = round_page(start + job->uaiocb.aio_nbytes);
	start = trunc_page(start);
	n = atop(end - start);

	ab = malloc(sizeof(*ab) + n * sizeof(vm_page_t), M_CXGBE, M_WAITOK |
	    M_ZERO);
	refcount_init(&ab->refcount, 1);
	ab->ps.pages = (vm_page_t *)(ab + 1);
	ab->ps.npages = vm_fault_quick_hold_pages(map, start, end - start,
	    VM_PROT_WRITE, ab->ps.pages, n);
	if (ab->ps.npages < 0) {
		free(ab, M_CXGBE);
		return (EFAULT);
	}

	KASSERT(ab->ps.npages == n,
	    ("hold_aio: page count mismatch: %d vs %d", ab->ps.npages, n));

	ab->ps.offset = pgoff;
	ab->ps.len = job->uaiocb.aio_nbytes;
	ab->job = job;
	job->backend1 = ab;
#ifdef VERBOSE_TRACES
	CTR5(KTR_CXGBE, "%s: tid %d, new pageset %p for job %p, npages %d",
	    __func__, jobtotid(job), &ab->ps, job, ab->ps.npages);
#endif
	return (0);
}
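
/*
 * Note (added commentary, not from the original author): hold_aio() wires
 * atop(round_page(buf + nbytes) - trunc_page(buf)) pages.  For example,
 * with 4KB pages, a 10000-byte buffer starting 0x123 bytes into a page
 * ends 0x123 + 0x2710 = 0x2833 bytes past that page boundary, which rounds
 * up to 0x3000, so three pages are held and ab->ps.offset records the
 * 0x123-byte shift into the first held page.
 */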

static void
t4_aiotx_process_job(struct toepcb *toep, struct socket *so, struct kaiocb *job)
{
	struct adapter *sc;
	struct sockbuf *sb;
	struct file *fp;
	struct aiotx_buffer *ab;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct mbuf *m;
	int error;
	bool moretocome, sendmore;

	sc = td_adapter(toep->td);
	sb = &so->so_snd;
	SOCKBUF_UNLOCK(sb);
	fp = job->fd_file;
	ab = job->backend1;
	m = NULL;

#ifdef MAC
	error = mac_socket_check_send(fp->f_cred, so);
	if (error != 0)
		goto out;
#endif

	if (ab == NULL) {
		error = hold_aio(job);
		if (error != 0)
			goto out;
		ab = job->backend1;
	}

	/* Inline sosend_generic(). */

	job->msgsnd = 1;

	error = sblock(sb, SBL_WAIT);
	MPASS(error == 0);

sendanother:
	m = m_get(M_WAITOK, MT_DATA);

	SOCKBUF_LOCK(sb);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		if ((so->so_options & SO_NOSIGPIPE) == 0) {
			PROC_LOCK(job->userproc);
			kern_psignal(job->userproc, SIGPIPE);
			PROC_UNLOCK(job->userproc);
		}
		error = EPIPE;
		goto out;
	}
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		goto out;
	}
	if ((so->so_state & SS_ISCONNECTED) == 0) {
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		error = ENOTCONN;
		goto out;
	}
	if (sbspace(sb) < sb->sb_lowat) {
		MPASS(job->aio_sent == 0 || !(so->so_state & SS_NBIO));

		/*
		 * Don't block if there is too little room in the socket
		 * buffer.  Instead, requeue the request.
		 */
		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
			SOCKBUF_UNLOCK(sb);
			sbunlock(sb);
			error = ECANCELED;
			goto out;
		}
		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		goto out;
	}

	/*
	 * Write as much data as the socket permits, but no more than
	 * a single sndbuf at a time.
	 */
	m->m_len = sbspace(sb);
	if (m->m_len > ab->ps.len - job->aio_sent) {
		m->m_len = ab->ps.len - job->aio_sent;
		moretocome = false;
	} else
		moretocome = true;
	if (m->m_len > sc->tt.sndbuf) {
		m->m_len = sc->tt.sndbuf;
		sendmore = true;
	} else
		sendmore = false;

	if (!TAILQ_EMPTY(&toep->aiotx_jobq))
		moretocome = true;
	SOCKBUF_UNLOCK(sb);
	MPASS(m->m_len != 0);

	/* Inlined tcp_usr_send(). */

	inp = toep->inp;
	INP_WLOCK(inp);
	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
		INP_WUNLOCK(inp);
		sbunlock(sb);
		error = ECONNRESET;
		goto out;
	}

	refcount_acquire(&ab->refcount);
	m_extadd(m, NULL, ab->ps.len, t4_aiotx_mbuf_free, ab,
	    (void *)(uintptr_t)job->aio_sent, 0, EXT_NET_DRV);
	m->m_ext.ext_flags |= EXT_FLAG_AIOTX;
	job->aio_sent += m->m_len;

	sbappendstream(sb, m, 0);
	m = NULL;

	if (!(inp->inp_flags & INP_DROPPED)) {
		tp = intotcpcb(inp);
		if (moretocome)
			tp->t_flags |= TF_MORETOCOME;
		error = tp->t_fb->tfb_tcp_output(tp);
		if (moretocome)
			tp->t_flags &= ~TF_MORETOCOME;
	}

	INP_WUNLOCK(inp);
	if (sendmore)
		goto sendanother;
	sbunlock(sb);

	if (error)
		goto out;

	/*
	 * If this is a blocking socket and the request has not been
	 * fully completed, requeue it until the socket is ready
	 * again.
	 */
	if (job->aio_sent < job->uaiocb.aio_nbytes &&
	    !(so->so_state & SS_NBIO)) {
		SOCKBUF_LOCK(sb);
		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
			SOCKBUF_UNLOCK(sb);
			error = ECANCELED;
			goto out;
		}
		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
		return;
	}

	/*
	 * If the request will not be requeued, drop a reference on
	 * the aiotx buffer.  Any mbufs in flight should still
	 * contain a reference, but this drops the reference that the
	 * job owns while it is waiting to queue mbufs to the socket.
	 */
	free_aiotx_buffer(ab);

out:
	if (error) {
		if (ab != NULL) {
			job->aio_error = error;
			free_aiotx_buffer(ab);
		} else {
			MPASS(job->aio_sent == 0);
			aio_complete(job, -1, error);
		}
	}
	if (m != NULL)
		m_free(m);
	SOCKBUF_LOCK(sb);
}
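
/*
 * Note (added commentary, not from the original author): the aiotx buffer
 * is reference counted.  hold_aio() creates it with a single reference
 * owned by the job, t4_aiotx_process_job() takes an additional reference
 * for every mbuf it attaches the buffer to, and t4_aiotx_mbuf_free() drops
 * that reference when the stack frees the mbuf.  free_aiotx_buffer()
 * completes (or cancels) the AIO job only when the last reference goes
 * away, so the wired user pages remain held until every mbuf referencing
 * them has been freed.
 */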

static void
t4_aiotx_task(void *context, int pending)
{
	struct toepcb *toep = context;
	struct inpcb *inp = toep->inp;
	struct socket *so = inp->inp_socket;
	struct kaiocb *job;

	CURVNET_SET(toep->vnet);
	SOCKBUF_LOCK(&so->so_snd);
	while (!TAILQ_EMPTY(&toep->aiotx_jobq) && sowriteable(so)) {
		job = TAILQ_FIRST(&toep->aiotx_jobq);
		TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
		if (!aio_clear_cancel_function(job))
			continue;

		t4_aiotx_process_job(toep, so, job);
	}
	toep->aiotx_task_active = false;
	SOCKBUF_UNLOCK(&so->so_snd);
	CURVNET_RESTORE();

	free_toepcb(toep);
}

static void
t4_aiotx_queue_toep(struct toepcb *toep)
{

	SOCKBUF_LOCK_ASSERT(&toep->inp->inp_socket->so_snd);
#ifdef VERBOSE_TRACES
	CTR3(KTR_CXGBE, "%s: queueing aiotx task for tid %d, active = %s",
	    __func__, toep->tid, toep->aiotx_task_active ? "true" : "false");
#endif
	if (toep->aiotx_task_active)
		return;
	toep->aiotx_task_active = true;
	hold_toepcb(toep);
	soaio_enqueue(&toep->aiotx_task);
}

static void
t4_aiotx_cancel(struct kaiocb *job)
{
	struct aiotx_buffer *ab;
	struct socket *so;
	struct sockbuf *sb;
	struct tcpcb *tp;
	struct toepcb *toep;

	so = job->fd_file->f_data;
	tp = so_sototcpcb(so);
	toep = tp->t_toe;
	MPASS(job->uaiocb.aio_lio_opcode == LIO_WRITE);
	sb = &so->so_snd;

	SOCKBUF_LOCK(sb);
	if (!aio_cancel_cleared(job))
		TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
	SOCKBUF_UNLOCK(sb);

	ab = job->backend1;
	if (ab != NULL)
		free_aiotx_buffer(ab);
	else
		aio_cancel(job);
}

int
t4_aio_queue_aiotx(struct socket *so, struct kaiocb *job)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct adapter *sc = td_adapter(toep->td);

	/* This only handles writes. */
	if (job->uaiocb.aio_lio_opcode != LIO_WRITE)
		return (EOPNOTSUPP);

	if (!sc->tt.tx_zcopy)
		return (EOPNOTSUPP);

	if (tls_tx_key(toep))
		return (EOPNOTSUPP);

	SOCKBUF_LOCK(&so->so_snd);
#ifdef VERBOSE_TRACES
	CTR2(KTR_CXGBE, "%s: queueing %p", __func__, job);
#endif
	if (!aio_set_cancel_function(job, t4_aiotx_cancel))
		panic("new job was cancelled");
	TAILQ_INSERT_TAIL(&toep->aiotx_jobq, job, list);
	if (sowriteable(so))
		t4_aiotx_queue_toep(toep);
	SOCKBUF_UNLOCK(&so->so_snd);
	return (0);
}

void
aiotx_init_toep(struct toepcb *toep)
{

	TAILQ_INIT(&toep->aiotx_jobq);
	TASK_INIT(&toep->aiotx_task, 0, t4_aiotx_task, toep);
}
#endif