/*-
 * Copyright (c) 2012, 2015 Chelsio Communications, Inc.
 * All rights reserved.
 * Written by: Navdeep Parhar <np@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"

#ifdef TCP_OFFLOAD
#include <sys/param.h>
#include <sys/aio.h>
#include <sys/file.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/module.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sglist.h>
#include <sys/taskqueue.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#define TCPSTATES
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_var.h>
#include <netinet/toecore.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>

#include "common/common.h"
#include "common/t4_msg.h"
#include "common/t4_regs.h"
#include "common/t4_tcb.h"
#include "tom/t4_tom_l2t.h"
#include "tom/t4_tom.h"

VNET_DECLARE(int, tcp_do_autosndbuf);
#define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf)
VNET_DECLARE(int, tcp_autosndbuf_inc);
#define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc)
VNET_DECLARE(int, tcp_autosndbuf_max);
#define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max)
VNET_DECLARE(int, tcp_do_autorcvbuf);
#define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf)
VNET_DECLARE(int, tcp_autorcvbuf_inc);
#define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc)
VNET_DECLARE(int, tcp_autorcvbuf_max);
#define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max)

#define IS_AIOTX_MBUF(m)						\
	((m)->m_flags & M_EXT && (m)->m_ext.ext_flags & EXT_FLAG_AIOTX)

static void t4_aiotx_cancel(struct kaiocb *job);
static void t4_aiotx_queue_toep(struct toepcb *toep);

static size_t
aiotx_mbuf_pgoff(struct mbuf *m)
{
	struct aiotx_buffer *ab;

	MPASS(IS_AIOTX_MBUF(m));
	ab = m->m_ext.ext_arg1;
	return ((ab->ps.offset + (uintptr_t)m->m_ext.ext_arg2) % PAGE_SIZE);
99 } 100 101 static vm_page_t * 102 aiotx_mbuf_pages(struct mbuf *m) 103 { 104 struct aiotx_buffer *ab; 105 int npages; 106 107 MPASS(IS_AIOTX_MBUF(m)); 108 ab = m->m_ext.ext_arg1; 109 npages = (ab->ps.offset + (uintptr_t)m->m_ext.ext_arg2) / PAGE_SIZE; 110 return (ab->ps.pages + npages); 111 } 112 113 void 114 send_flowc_wr(struct toepcb *toep, struct flowc_tx_params *ftxp) 115 { 116 struct wrqe *wr; 117 struct fw_flowc_wr *flowc; 118 unsigned int nparams = ftxp ? 8 : 6, flowclen; 119 struct vi_info *vi = toep->vi; 120 struct port_info *pi = vi->pi; 121 struct adapter *sc = pi->adapter; 122 unsigned int pfvf = G_FW_VIID_PFN(vi->viid) << S_FW_VIID_PFN; 123 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; 124 125 KASSERT(!(toep->flags & TPF_FLOWC_WR_SENT), 126 ("%s: flowc for tid %u sent already", __func__, toep->tid)); 127 128 flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval); 129 130 wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq); 131 if (wr == NULL) { 132 /* XXX */ 133 panic("%s: allocation failure.", __func__); 134 } 135 flowc = wrtod(wr); 136 memset(flowc, 0, wr->wr_len); 137 138 flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | 139 V_FW_FLOWC_WR_NPARAMS(nparams)); 140 flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) | 141 V_FW_WR_FLOWID(toep->tid)); 142 143 flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN; 144 flowc->mnemval[0].val = htobe32(pfvf); 145 flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH; 146 flowc->mnemval[1].val = htobe32(pi->tx_chan); 147 flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT; 148 flowc->mnemval[2].val = htobe32(pi->tx_chan); 149 flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID; 150 flowc->mnemval[3].val = htobe32(toep->ofld_rxq->iq.abs_id); 151 if (ftxp) { 152 uint32_t sndbuf = min(ftxp->snd_space, sc->tt.sndbuf); 153 154 flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDNXT; 155 flowc->mnemval[4].val = htobe32(ftxp->snd_nxt); 156 flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_RCVNXT; 157 flowc->mnemval[5].val = htobe32(ftxp->rcv_nxt); 158 flowc->mnemval[6].mnemonic = FW_FLOWC_MNEM_SNDBUF; 159 flowc->mnemval[6].val = htobe32(sndbuf); 160 flowc->mnemval[7].mnemonic = FW_FLOWC_MNEM_MSS; 161 flowc->mnemval[7].val = htobe32(ftxp->mss); 162 163 CTR6(KTR_CXGBE, 164 "%s: tid %u, mss %u, sndbuf %u, snd_nxt 0x%x, rcv_nxt 0x%x", 165 __func__, toep->tid, ftxp->mss, sndbuf, ftxp->snd_nxt, 166 ftxp->rcv_nxt); 167 } else { 168 flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDBUF; 169 flowc->mnemval[4].val = htobe32(512); 170 flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_MSS; 171 flowc->mnemval[5].val = htobe32(512); 172 173 CTR2(KTR_CXGBE, "%s: tid %u", __func__, toep->tid); 174 } 175 176 txsd->tx_credits = howmany(flowclen, 16); 177 txsd->plen = 0; 178 KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0, 179 ("%s: not enough credits (%d)", __func__, toep->tx_credits)); 180 toep->tx_credits -= txsd->tx_credits; 181 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) 182 toep->txsd_pidx = 0; 183 toep->txsd_avail--; 184 185 toep->flags |= TPF_FLOWC_WR_SENT; 186 t4_wrq_tx(sc, wr); 187 } 188 189 void 190 send_reset(struct adapter *sc, struct toepcb *toep, uint32_t snd_nxt) 191 { 192 struct wrqe *wr; 193 struct cpl_abort_req *req; 194 int tid = toep->tid; 195 struct inpcb *inp = toep->inp; 196 struct tcpcb *tp = intotcpcb(inp); /* don't use if INP_DROPPED */ 197 198 INP_WLOCK_ASSERT(inp); 199 200 CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x%s", 201 __func__, toep->tid, 202 inp->inp_flags 
& INP_DROPPED ? "inp dropped" : 203 tcpstates[tp->t_state], 204 toep->flags, inp->inp_flags, 205 toep->flags & TPF_ABORT_SHUTDOWN ? 206 " (abort already in progress)" : ""); 207 208 if (toep->flags & TPF_ABORT_SHUTDOWN) 209 return; /* abort already in progress */ 210 211 toep->flags |= TPF_ABORT_SHUTDOWN; 212 213 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 214 ("%s: flowc_wr not sent for tid %d.", __func__, tid)); 215 216 wr = alloc_wrqe(sizeof(*req), toep->ofld_txq); 217 if (wr == NULL) { 218 /* XXX */ 219 panic("%s: allocation failure.", __func__); 220 } 221 req = wrtod(wr); 222 223 INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid); 224 if (inp->inp_flags & INP_DROPPED) 225 req->rsvd0 = htobe32(snd_nxt); 226 else 227 req->rsvd0 = htobe32(tp->snd_nxt); 228 req->rsvd1 = !(toep->flags & TPF_TX_DATA_SENT); 229 req->cmd = CPL_ABORT_SEND_RST; 230 231 /* 232 * XXX: What's the correct way to tell that the inp hasn't been detached 233 * from its socket? Should I even be flushing the snd buffer here? 234 */ 235 if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) { 236 struct socket *so = inp->inp_socket; 237 238 if (so != NULL) /* because I'm not sure. See comment above */ 239 sbflush(&so->so_snd); 240 } 241 242 t4_l2t_send(sc, wr, toep->l2te); 243 } 244 245 /* 246 * Called when a connection is established to translate the TCP options 247 * reported by HW to FreeBSD's native format. 248 */ 249 static void 250 assign_rxopt(struct tcpcb *tp, unsigned int opt) 251 { 252 struct toepcb *toep = tp->t_toe; 253 struct inpcb *inp = tp->t_inpcb; 254 struct adapter *sc = td_adapter(toep->td); 255 int n; 256 257 INP_LOCK_ASSERT(inp); 258 259 if (inp->inp_inc.inc_flags & INC_ISIPV6) 260 n = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 261 else 262 n = sizeof(struct ip) + sizeof(struct tcphdr); 263 tp->t_maxseg = sc->params.mtus[G_TCPOPT_MSS(opt)] - n; 264 265 CTR4(KTR_CXGBE, "%s: tid %d, mtu_idx %u (%u)", __func__, toep->tid, 266 G_TCPOPT_MSS(opt), sc->params.mtus[G_TCPOPT_MSS(opt)]); 267 268 if (G_TCPOPT_TSTAMP(opt)) { 269 tp->t_flags |= TF_RCVD_TSTMP; /* timestamps ok */ 270 tp->ts_recent = 0; /* hmmm */ 271 tp->ts_recent_age = tcp_ts_getticks(); 272 } 273 274 if (G_TCPOPT_SACK(opt)) 275 tp->t_flags |= TF_SACK_PERMIT; /* should already be set */ 276 else 277 tp->t_flags &= ~TF_SACK_PERMIT; /* sack disallowed by peer */ 278 279 if (G_TCPOPT_WSCALE_OK(opt)) 280 tp->t_flags |= TF_RCVD_SCALE; 281 282 /* Doing window scaling? */ 283 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 284 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 285 tp->rcv_scale = tp->request_r_scale; 286 tp->snd_scale = G_TCPOPT_SND_WSCALE(opt); 287 } 288 } 289 290 /* 291 * Completes some final bits of initialization for just established connections 292 * and changes their state to TCPS_ESTABLISHED. 293 * 294 * The ISNs are from after the exchange of SYNs. i.e., the true ISN + 1. 
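 *
 * For example (this mirrors the assignments below): with snd_isn and rcv_isn
 * reported by the hardware in network byte order, the true values are
 * recovered as
 *
 *	iss = be32toh(snd_isn) - 1;
 *	irs = be32toh(rcv_isn) - 1;
 *
 * and snd_una/snd_nxt/snd_max and rcv_nxt are then seeded from iss and irs.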
295 */ 296 void 297 make_established(struct toepcb *toep, uint32_t snd_isn, uint32_t rcv_isn, 298 uint16_t opt) 299 { 300 struct inpcb *inp = toep->inp; 301 struct socket *so = inp->inp_socket; 302 struct tcpcb *tp = intotcpcb(inp); 303 long bufsize; 304 uint32_t iss = be32toh(snd_isn) - 1; /* true ISS */ 305 uint32_t irs = be32toh(rcv_isn) - 1; /* true IRS */ 306 uint16_t tcpopt = be16toh(opt); 307 struct flowc_tx_params ftxp; 308 309 CURVNET_SET(so->so_vnet); 310 INP_WLOCK_ASSERT(inp); 311 KASSERT(tp->t_state == TCPS_SYN_SENT || 312 tp->t_state == TCPS_SYN_RECEIVED, 313 ("%s: TCP state %s", __func__, tcpstates[tp->t_state])); 314 315 CTR4(KTR_CXGBE, "%s: tid %d, toep %p, inp %p", 316 __func__, toep->tid, toep, inp); 317 318 tp->t_state = TCPS_ESTABLISHED; 319 tp->t_starttime = ticks; 320 TCPSTAT_INC(tcps_connects); 321 322 tp->irs = irs; 323 tcp_rcvseqinit(tp); 324 tp->rcv_wnd = toep->rx_credits << 10; 325 tp->rcv_adv += tp->rcv_wnd; 326 tp->last_ack_sent = tp->rcv_nxt; 327 328 /* 329 * If we were unable to send all rx credits via opt0, save the remainder 330 * in rx_credits so that they can be handed over with the next credit 331 * update. 332 */ 333 SOCKBUF_LOCK(&so->so_rcv); 334 bufsize = select_rcv_wnd(so); 335 SOCKBUF_UNLOCK(&so->so_rcv); 336 toep->rx_credits = bufsize - tp->rcv_wnd; 337 338 tp->iss = iss; 339 tcp_sendseqinit(tp); 340 tp->snd_una = iss + 1; 341 tp->snd_nxt = iss + 1; 342 tp->snd_max = iss + 1; 343 344 assign_rxopt(tp, tcpopt); 345 346 SOCKBUF_LOCK(&so->so_snd); 347 if (so->so_snd.sb_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf) 348 bufsize = V_tcp_autosndbuf_max; 349 else 350 bufsize = sbspace(&so->so_snd); 351 SOCKBUF_UNLOCK(&so->so_snd); 352 353 ftxp.snd_nxt = tp->snd_nxt; 354 ftxp.rcv_nxt = tp->rcv_nxt; 355 ftxp.snd_space = bufsize; 356 ftxp.mss = tp->t_maxseg; 357 send_flowc_wr(toep, &ftxp); 358 359 soisconnected(so); 360 CURVNET_RESTORE(); 361 } 362 363 static int 364 send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits) 365 { 366 struct wrqe *wr; 367 struct cpl_rx_data_ack *req; 368 uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); 369 370 KASSERT(credits >= 0, ("%s: %d credits", __func__, credits)); 371 372 wr = alloc_wrqe(sizeof(*req), toep->ctrlq); 373 if (wr == NULL) 374 return (0); 375 req = wrtod(wr); 376 377 INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid); 378 req->credit_dack = htobe32(dack | V_RX_CREDITS(credits)); 379 380 t4_wrq_tx(sc, wr); 381 return (credits); 382 } 383 384 void 385 t4_rcvd_locked(struct toedev *tod, struct tcpcb *tp) 386 { 387 struct adapter *sc = tod->tod_softc; 388 struct inpcb *inp = tp->t_inpcb; 389 struct socket *so = inp->inp_socket; 390 struct sockbuf *sb = &so->so_rcv; 391 struct toepcb *toep = tp->t_toe; 392 int credits; 393 394 INP_WLOCK_ASSERT(inp); 395 396 SOCKBUF_LOCK_ASSERT(sb); 397 KASSERT(toep->sb_cc >= sbused(sb), 398 ("%s: sb %p has more data (%d) than last time (%d).", 399 __func__, sb, sbused(sb), toep->sb_cc)); 400 401 toep->rx_credits += toep->sb_cc - sbused(sb); 402 toep->sb_cc = sbused(sb); 403 404 if (toep->rx_credits > 0 && 405 (tp->rcv_wnd <= 32 * 1024 || toep->rx_credits >= 64 * 1024 || 406 (toep->rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) || 407 toep->sb_cc + tp->rcv_wnd < sb->sb_lowat)) { 408 409 credits = send_rx_credits(sc, toep, toep->rx_credits); 410 toep->rx_credits -= credits; 411 tp->rcv_wnd += credits; 412 tp->rcv_adv += credits; 413 } 414 } 415 416 void 417 t4_rcvd(struct toedev *tod, struct tcpcb *tp) 418 { 419 struct inpcb *inp = tp->t_inpcb; 420 struct 
socket *so = inp->inp_socket; 421 struct sockbuf *sb = &so->so_rcv; 422 423 SOCKBUF_LOCK(sb); 424 t4_rcvd_locked(tod, tp); 425 SOCKBUF_UNLOCK(sb); 426 } 427 428 /* 429 * Close a connection by sending a CPL_CLOSE_CON_REQ message. 430 */ 431 static int 432 close_conn(struct adapter *sc, struct toepcb *toep) 433 { 434 struct wrqe *wr; 435 struct cpl_close_con_req *req; 436 unsigned int tid = toep->tid; 437 438 CTR3(KTR_CXGBE, "%s: tid %u%s", __func__, toep->tid, 439 toep->flags & TPF_FIN_SENT ? ", IGNORED" : ""); 440 441 if (toep->flags & TPF_FIN_SENT) 442 return (0); 443 444 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 445 ("%s: flowc_wr not sent for tid %u.", __func__, tid)); 446 447 wr = alloc_wrqe(sizeof(*req), toep->ofld_txq); 448 if (wr == NULL) { 449 /* XXX */ 450 panic("%s: allocation failure.", __func__); 451 } 452 req = wrtod(wr); 453 454 req->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) | 455 V_FW_WR_IMMDLEN(sizeof(*req) - sizeof(req->wr))); 456 req->wr.wr_mid = htonl(V_FW_WR_LEN16(howmany(sizeof(*req), 16)) | 457 V_FW_WR_FLOWID(tid)); 458 req->wr.wr_lo = cpu_to_be64(0); 459 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid)); 460 req->rsvd = 0; 461 462 toep->flags |= TPF_FIN_SENT; 463 toep->flags &= ~TPF_SEND_FIN; 464 t4_l2t_send(sc, wr, toep->l2te); 465 466 return (0); 467 } 468 469 #define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16) 470 #define MIN_OFLD_TX_CREDITS (howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16)) 471 472 /* Maximum amount of immediate data we could stuff in a WR */ 473 static inline int 474 max_imm_payload(int tx_credits) 475 { 476 const int n = 2; /* Use only up to 2 desc for imm. data WR */ 477 478 KASSERT(tx_credits >= 0 && 479 tx_credits <= MAX_OFLD_TX_CREDITS, 480 ("%s: %d credits", __func__, tx_credits)); 481 482 if (tx_credits < MIN_OFLD_TX_CREDITS) 483 return (0); 484 485 if (tx_credits >= (n * EQ_ESIZE) / 16) 486 return ((n * EQ_ESIZE) - sizeof(struct fw_ofld_tx_data_wr)); 487 else 488 return (tx_credits * 16 - sizeof(struct fw_ofld_tx_data_wr)); 489 } 490 491 /* Maximum number of SGL entries we could stuff in a WR */ 492 static inline int 493 max_dsgl_nsegs(int tx_credits) 494 { 495 int nseg = 1; /* ulptx_sgl has room for 1, rest ulp_tx_sge_pair */ 496 int sge_pair_credits = tx_credits - MIN_OFLD_TX_CREDITS; 497 498 KASSERT(tx_credits >= 0 && 499 tx_credits <= MAX_OFLD_TX_CREDITS, 500 ("%s: %d credits", __func__, tx_credits)); 501 502 if (tx_credits < MIN_OFLD_TX_CREDITS) 503 return (0); 504 505 nseg += 2 * (sge_pair_credits * 16 / 24); 506 if ((sge_pair_credits * 16) % 24 == 16) 507 nseg++; 508 509 return (nseg); 510 } 511 512 static inline void 513 write_tx_wr(void *dst, struct toepcb *toep, unsigned int immdlen, 514 unsigned int plen, uint8_t credits, int shove, int ulp_submode, int txalign) 515 { 516 struct fw_ofld_tx_data_wr *txwr = dst; 517 518 txwr->op_to_immdlen = htobe32(V_WR_OP(FW_OFLD_TX_DATA_WR) | 519 V_FW_WR_IMMDLEN(immdlen)); 520 txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) | 521 V_FW_WR_LEN16(credits)); 522 txwr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(toep->ulp_mode) | 523 V_TX_ULP_SUBMODE(ulp_submode) | V_TX_URG(0) | V_TX_SHOVE(shove)); 524 txwr->plen = htobe32(plen); 525 526 if (txalign > 0) { 527 struct tcpcb *tp = intotcpcb(toep->inp); 528 529 if (plen < 2 * tp->t_maxseg || is_10G_port(toep->vi->pi)) 530 txwr->lsodisable_to_flags |= 531 htobe32(F_FW_OFLD_TX_DATA_WR_LSODISABLE); 532 else 533 txwr->lsodisable_to_flags |= 534 htobe32(F_FW_OFLD_TX_DATA_WR_ALIGNPLD | 535 (tp->t_flags & TF_NODELAY ? 
0 : 536 F_FW_OFLD_TX_DATA_WR_ALIGNPLDSHOVE)); 537 } 538 } 539 540 /* 541 * Generate a DSGL from a starting mbuf. The total number of segments and the 542 * maximum segments in any one mbuf are provided. 543 */ 544 static void 545 write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n) 546 { 547 struct mbuf *m; 548 struct ulptx_sgl *usgl = dst; 549 int i, j, rc; 550 struct sglist sg; 551 struct sglist_seg segs[n]; 552 553 KASSERT(nsegs > 0, ("%s: nsegs 0", __func__)); 554 555 sglist_init(&sg, n, segs); 556 usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) | 557 V_ULPTX_NSGE(nsegs)); 558 559 i = -1; 560 for (m = start; m != stop; m = m->m_next) { 561 if (IS_AIOTX_MBUF(m)) 562 rc = sglist_append_vmpages(&sg, aiotx_mbuf_pages(m), 563 aiotx_mbuf_pgoff(m), m->m_len); 564 else 565 rc = sglist_append(&sg, mtod(m, void *), m->m_len); 566 if (__predict_false(rc != 0)) 567 panic("%s: sglist_append %d", __func__, rc); 568 569 for (j = 0; j < sg.sg_nseg; i++, j++) { 570 if (i < 0) { 571 usgl->len0 = htobe32(segs[j].ss_len); 572 usgl->addr0 = htobe64(segs[j].ss_paddr); 573 } else { 574 usgl->sge[i / 2].len[i & 1] = 575 htobe32(segs[j].ss_len); 576 usgl->sge[i / 2].addr[i & 1] = 577 htobe64(segs[j].ss_paddr); 578 } 579 #ifdef INVARIANTS 580 nsegs--; 581 #endif 582 } 583 sglist_reset(&sg); 584 } 585 if (i & 1) 586 usgl->sge[i / 2].len[1] = htobe32(0); 587 KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, stop %p", 588 __func__, nsegs, start, stop)); 589 } 590 591 /* 592 * Max number of SGL entries an offload tx work request can have. This is 41 593 * (1 + 40) for a full 512B work request. 594 * fw_ofld_tx_data_wr(16B) + ulptx_sgl(16B, 1) + ulptx_sge_pair(480B, 40) 595 */ 596 #define OFLD_SGL_LEN (41) 597 598 /* 599 * Send data and/or a FIN to the peer. 600 * 601 * The socket's so_snd buffer consists of a stream of data starting with sb_mb 602 * and linked together with m_next. sb_sndptr, if set, is the last mbuf that 603 * was transmitted. 604 * 605 * drop indicates the number of bytes that should be dropped from the head of 606 * the send buffer. It is an optimization that lets do_fw4_ack avoid creating 607 * contention on the send buffer lock (before this change it used to do 608 * sowwakeup and then t4_push_frames right after that when recovering from tx 609 * stalls). When drop is set this function MUST drop the bytes and wake up any 610 * writers. 
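 *
 * (For orientation: when do_fw4_ack recovers credits for a suspended tid it
 * clears TPF_TX_SUSPENDED and calls t4_push_frames(sc, toep, plen), with
 * plen being the number of bytes just acked, so the drop and the next burst
 * of tx work requests happen in the same pass over the send buffer.)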
 */
void
t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop)
{
	struct mbuf *sndptr, *m, *sb_sndptr;
	struct fw_ofld_tx_data_wr *txwr;
	struct wrqe *wr;
	u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = intotcpcb(inp);
	struct socket *so = inp->inp_socket;
	struct sockbuf *sb = &so->so_snd;
	int tx_credits, shove, compl, sowwakeup;
	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
	bool aiotx_mbuf_seen;

	INP_WLOCK_ASSERT(inp);
	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));

	KASSERT(toep->ulp_mode == ULP_MODE_NONE ||
	    toep->ulp_mode == ULP_MODE_TCPDDP ||
	    toep->ulp_mode == ULP_MODE_RDMA,
	    ("%s: ulp_mode %u for toep %p", __func__, toep->ulp_mode, toep));

#ifdef VERBOSE_TRACES
	CTR5(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d",
	    __func__, toep->tid, toep->flags, tp->t_flags, drop);
#endif
	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
		return;

	/*
	 * This function doesn't resume by itself.  Someone else must clear the
	 * flag and call this function.
	 */
	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
		KASSERT(drop == 0,
		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
		return;
	}

	do {
		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
		max_imm = max_imm_payload(tx_credits);
		max_nsegs = max_dsgl_nsegs(tx_credits);

		SOCKBUF_LOCK(sb);
		sowwakeup = drop;
		if (drop) {
			sbdrop_locked(sb, drop);
			drop = 0;
		}
		sb_sndptr = sb->sb_sndptr;
		sndptr = sb_sndptr ? sb_sndptr->m_next : sb->sb_mb;
		plen = 0;
		nsegs = 0;
		max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
		aiotx_mbuf_seen = false;
		for (m = sndptr; m != NULL; m = m->m_next) {
			int n;

			if (IS_AIOTX_MBUF(m))
				n = sglist_count_vmpages(aiotx_mbuf_pages(m),
				    aiotx_mbuf_pgoff(m), m->m_len);
			else
				n = sglist_count(mtod(m, void *), m->m_len);

			nsegs += n;
			plen += m->m_len;

			/* This mbuf sent us _over_ the nsegs limit, back out */
			if (plen > max_imm && nsegs > max_nsegs) {
				nsegs -= n;
				plen -= m->m_len;
				if (plen == 0) {
					/* Too few credits */
					toep->flags |= TPF_TX_SUSPENDED;
					if (sowwakeup) {
						if (!TAILQ_EMPTY(
						    &toep->aiotx_jobq))
							t4_aiotx_queue_toep(
							    toep);
						sowwakeup_locked(so);
					} else
						SOCKBUF_UNLOCK(sb);
					SOCKBUF_UNLOCK_ASSERT(sb);
					return;
				}
				break;
			}

			if (IS_AIOTX_MBUF(m))
				aiotx_mbuf_seen = true;
			if (max_nsegs_1mbuf < n)
				max_nsegs_1mbuf = n;
			sb_sndptr = m;	/* new sb->sb_sndptr if all goes well */

			/* This mbuf put us right at the max_nsegs limit */
			if (plen > max_imm && nsegs == max_nsegs) {
				m = m->m_next;
				break;
			}
		}

		if (sbused(sb) > sb->sb_hiwat * 5 / 8 &&
		    toep->plen_nocompl + plen >= sb->sb_hiwat / 4)
			compl = 1;
		else
			compl = 0;

		if (sb->sb_flags & SB_AUTOSIZE &&
		    V_tcp_do_autosndbuf &&
		    sb->sb_hiwat < V_tcp_autosndbuf_max &&
		    sbused(sb) >= sb->sb_hiwat * 7 / 8) {
			int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc,
			    V_tcp_autosndbuf_max);

			if (!sbreserve_locked(sb, newsize, so, NULL))
				sb->sb_flags &= ~SB_AUTOSIZE;
			else
				sowwakeup = 1;	/* room available */
		}
		if (sowwakeup) {
			if (!TAILQ_EMPTY(&toep->aiotx_jobq))
t4_aiotx_queue_toep(toep); 737 sowwakeup_locked(so); 738 } else 739 SOCKBUF_UNLOCK(sb); 740 SOCKBUF_UNLOCK_ASSERT(sb); 741 742 /* nothing to send */ 743 if (plen == 0) { 744 KASSERT(m == NULL, 745 ("%s: nothing to send, but m != NULL", __func__)); 746 break; 747 } 748 749 if (__predict_false(toep->flags & TPF_FIN_SENT)) 750 panic("%s: excess tx.", __func__); 751 752 shove = m == NULL && !(tp->t_flags & TF_MORETOCOME); 753 if (plen <= max_imm && !aiotx_mbuf_seen) { 754 755 /* Immediate data tx */ 756 757 wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16), 758 toep->ofld_txq); 759 if (wr == NULL) { 760 /* XXX: how will we recover from this? */ 761 toep->flags |= TPF_TX_SUSPENDED; 762 return; 763 } 764 txwr = wrtod(wr); 765 credits = howmany(wr->wr_len, 16); 766 write_tx_wr(txwr, toep, plen, plen, credits, shove, 0, 767 sc->tt.tx_align); 768 m_copydata(sndptr, 0, plen, (void *)(txwr + 1)); 769 nsegs = 0; 770 } else { 771 int wr_len; 772 773 /* DSGL tx */ 774 775 wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) + 776 ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8; 777 wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq); 778 if (wr == NULL) { 779 /* XXX: how will we recover from this? */ 780 toep->flags |= TPF_TX_SUSPENDED; 781 return; 782 } 783 txwr = wrtod(wr); 784 credits = howmany(wr_len, 16); 785 write_tx_wr(txwr, toep, 0, plen, credits, shove, 0, 786 sc->tt.tx_align); 787 write_tx_sgl(txwr + 1, sndptr, m, nsegs, 788 max_nsegs_1mbuf); 789 if (wr_len & 0xf) { 790 uint64_t *pad = (uint64_t *) 791 ((uintptr_t)txwr + wr_len); 792 *pad = 0; 793 } 794 } 795 796 KASSERT(toep->tx_credits >= credits, 797 ("%s: not enough credits", __func__)); 798 799 toep->tx_credits -= credits; 800 toep->tx_nocompl += credits; 801 toep->plen_nocompl += plen; 802 if (toep->tx_credits <= toep->tx_total * 3 / 8 && 803 toep->tx_nocompl >= toep->tx_total / 4) 804 compl = 1; 805 806 if (compl || toep->ulp_mode == ULP_MODE_RDMA) { 807 txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL); 808 toep->tx_nocompl = 0; 809 toep->plen_nocompl = 0; 810 } 811 812 tp->snd_nxt += plen; 813 tp->snd_max += plen; 814 815 SOCKBUF_LOCK(sb); 816 KASSERT(sb_sndptr, ("%s: sb_sndptr is NULL", __func__)); 817 sb->sb_sndptr = sb_sndptr; 818 SOCKBUF_UNLOCK(sb); 819 820 toep->flags |= TPF_TX_DATA_SENT; 821 if (toep->tx_credits < MIN_OFLD_TX_CREDITS) 822 toep->flags |= TPF_TX_SUSPENDED; 823 824 KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); 825 txsd->plen = plen; 826 txsd->tx_credits = credits; 827 txsd++; 828 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) { 829 toep->txsd_pidx = 0; 830 txsd = &toep->txsd[0]; 831 } 832 toep->txsd_avail--; 833 834 t4_l2t_send(sc, wr, toep->l2te); 835 } while (m != NULL); 836 837 /* Send a FIN if requested, but only if there's no more data to send */ 838 if (m == NULL && toep->flags & TPF_SEND_FIN) 839 close_conn(sc, toep); 840 } 841 842 static inline void 843 rqdrop_locked(struct mbufq *q, int plen) 844 { 845 struct mbuf *m; 846 847 while (plen > 0) { 848 m = mbufq_dequeue(q); 849 850 /* Too many credits. */ 851 MPASS(m != NULL); 852 M_ASSERTPKTHDR(m); 853 854 /* Partial credits. 
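		 * Every mbuf on the reclaim queue is a complete PDU whose
		 * credits are returned in one piece, so the byte count being
		 * dropped is expected to cover whole packets.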
*/ 855 MPASS(plen >= m->m_pkthdr.len); 856 857 plen -= m->m_pkthdr.len; 858 m_freem(m); 859 } 860 } 861 862 void 863 t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop) 864 { 865 struct mbuf *sndptr, *m; 866 struct fw_ofld_tx_data_wr *txwr; 867 struct wrqe *wr; 868 u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf; 869 u_int adjusted_plen, ulp_submode; 870 struct inpcb *inp = toep->inp; 871 struct tcpcb *tp = intotcpcb(inp); 872 int tx_credits, shove; 873 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; 874 struct mbufq *pduq = &toep->ulp_pduq; 875 static const u_int ulp_extra_len[] = {0, 4, 4, 8}; 876 877 INP_WLOCK_ASSERT(inp); 878 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 879 ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid)); 880 KASSERT(toep->ulp_mode == ULP_MODE_ISCSI, 881 ("%s: ulp_mode %u for toep %p", __func__, toep->ulp_mode, toep)); 882 883 if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) 884 return; 885 886 /* 887 * This function doesn't resume by itself. Someone else must clear the 888 * flag and call this function. 889 */ 890 if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) { 891 KASSERT(drop == 0, 892 ("%s: drop (%d) != 0 but tx is suspended", __func__, drop)); 893 return; 894 } 895 896 if (drop) 897 rqdrop_locked(&toep->ulp_pdu_reclaimq, drop); 898 899 while ((sndptr = mbufq_first(pduq)) != NULL) { 900 M_ASSERTPKTHDR(sndptr); 901 902 tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS); 903 max_imm = max_imm_payload(tx_credits); 904 max_nsegs = max_dsgl_nsegs(tx_credits); 905 906 plen = 0; 907 nsegs = 0; 908 max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */ 909 for (m = sndptr; m != NULL; m = m->m_next) { 910 int n = sglist_count(mtod(m, void *), m->m_len); 911 912 nsegs += n; 913 plen += m->m_len; 914 915 /* 916 * This mbuf would send us _over_ the nsegs limit. 917 * Suspend tx because the PDU can't be sent out. 918 */ 919 if (plen > max_imm && nsegs > max_nsegs) { 920 toep->flags |= TPF_TX_SUSPENDED; 921 return; 922 } 923 924 if (max_nsegs_1mbuf < n) 925 max_nsegs_1mbuf = n; 926 } 927 928 if (__predict_false(toep->flags & TPF_FIN_SENT)) 929 panic("%s: excess tx.", __func__); 930 931 /* 932 * We have a PDU to send. All of it goes out in one WR so 'm' 933 * is NULL. A PDU's length is always a multiple of 4. 934 */ 935 MPASS(m == NULL); 936 MPASS((plen & 3) == 0); 937 MPASS(sndptr->m_pkthdr.len == plen); 938 939 shove = !(tp->t_flags & TF_MORETOCOME); 940 ulp_submode = mbuf_ulp_submode(sndptr); 941 MPASS(ulp_submode < nitems(ulp_extra_len)); 942 943 /* 944 * plen doesn't include header and data digests, which are 945 * generated and inserted in the right places by the TOE, but 946 * they do occupy TCP sequence space and need to be accounted 947 * for. 948 */ 949 adjusted_plen = plen + ulp_extra_len[ulp_submode]; 950 if (plen <= max_imm) { 951 952 /* Immediate data tx */ 953 954 wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16), 955 toep->ofld_txq); 956 if (wr == NULL) { 957 /* XXX: how will we recover from this? 
*/ 958 toep->flags |= TPF_TX_SUSPENDED; 959 return; 960 } 961 txwr = wrtod(wr); 962 credits = howmany(wr->wr_len, 16); 963 write_tx_wr(txwr, toep, plen, adjusted_plen, credits, 964 shove, ulp_submode, sc->tt.tx_align); 965 m_copydata(sndptr, 0, plen, (void *)(txwr + 1)); 966 nsegs = 0; 967 } else { 968 int wr_len; 969 970 /* DSGL tx */ 971 wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) + 972 ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8; 973 wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq); 974 if (wr == NULL) { 975 /* XXX: how will we recover from this? */ 976 toep->flags |= TPF_TX_SUSPENDED; 977 return; 978 } 979 txwr = wrtod(wr); 980 credits = howmany(wr_len, 16); 981 write_tx_wr(txwr, toep, 0, adjusted_plen, credits, 982 shove, ulp_submode, sc->tt.tx_align); 983 write_tx_sgl(txwr + 1, sndptr, m, nsegs, 984 max_nsegs_1mbuf); 985 if (wr_len & 0xf) { 986 uint64_t *pad = (uint64_t *) 987 ((uintptr_t)txwr + wr_len); 988 *pad = 0; 989 } 990 } 991 992 KASSERT(toep->tx_credits >= credits, 993 ("%s: not enough credits", __func__)); 994 995 m = mbufq_dequeue(pduq); 996 MPASS(m == sndptr); 997 mbufq_enqueue(&toep->ulp_pdu_reclaimq, m); 998 999 toep->tx_credits -= credits; 1000 toep->tx_nocompl += credits; 1001 toep->plen_nocompl += plen; 1002 if (toep->tx_credits <= toep->tx_total * 3 / 8 && 1003 toep->tx_nocompl >= toep->tx_total / 4) { 1004 txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL); 1005 toep->tx_nocompl = 0; 1006 toep->plen_nocompl = 0; 1007 } 1008 1009 tp->snd_nxt += adjusted_plen; 1010 tp->snd_max += adjusted_plen; 1011 1012 toep->flags |= TPF_TX_DATA_SENT; 1013 if (toep->tx_credits < MIN_OFLD_TX_CREDITS) 1014 toep->flags |= TPF_TX_SUSPENDED; 1015 1016 KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); 1017 txsd->plen = plen; 1018 txsd->tx_credits = credits; 1019 txsd++; 1020 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) { 1021 toep->txsd_pidx = 0; 1022 txsd = &toep->txsd[0]; 1023 } 1024 toep->txsd_avail--; 1025 1026 t4_l2t_send(sc, wr, toep->l2te); 1027 } 1028 1029 /* Send a FIN if requested, but only if there are no more PDUs to send */ 1030 if (mbufq_first(pduq) == NULL && toep->flags & TPF_SEND_FIN) 1031 close_conn(sc, toep); 1032 } 1033 1034 int 1035 t4_tod_output(struct toedev *tod, struct tcpcb *tp) 1036 { 1037 struct adapter *sc = tod->tod_softc; 1038 #ifdef INVARIANTS 1039 struct inpcb *inp = tp->t_inpcb; 1040 #endif 1041 struct toepcb *toep = tp->t_toe; 1042 1043 INP_WLOCK_ASSERT(inp); 1044 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1045 ("%s: inp %p dropped.", __func__, inp)); 1046 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 1047 1048 if (toep->ulp_mode == ULP_MODE_ISCSI) 1049 t4_push_pdus(sc, toep, 0); 1050 else 1051 t4_push_frames(sc, toep, 0); 1052 1053 return (0); 1054 } 1055 1056 int 1057 t4_send_fin(struct toedev *tod, struct tcpcb *tp) 1058 { 1059 struct adapter *sc = tod->tod_softc; 1060 #ifdef INVARIANTS 1061 struct inpcb *inp = tp->t_inpcb; 1062 #endif 1063 struct toepcb *toep = tp->t_toe; 1064 1065 INP_WLOCK_ASSERT(inp); 1066 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1067 ("%s: inp %p dropped.", __func__, inp)); 1068 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 1069 1070 toep->flags |= TPF_SEND_FIN; 1071 if (tp->t_state >= TCPS_ESTABLISHED) { 1072 if (toep->ulp_mode == ULP_MODE_ISCSI) 1073 t4_push_pdus(sc, toep, 0); 1074 else 1075 t4_push_frames(sc, toep, 0); 1076 } 1077 1078 return (0); 1079 } 1080 1081 int 1082 t4_send_rst(struct toedev *tod, struct tcpcb *tp) 1083 { 1084 struct adapter *sc = tod->tod_softc; 
1085 #if defined(INVARIANTS) 1086 struct inpcb *inp = tp->t_inpcb; 1087 #endif 1088 struct toepcb *toep = tp->t_toe; 1089 1090 INP_WLOCK_ASSERT(inp); 1091 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1092 ("%s: inp %p dropped.", __func__, inp)); 1093 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 1094 1095 /* hmmmm */ 1096 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 1097 ("%s: flowc for tid %u [%s] not sent already", 1098 __func__, toep->tid, tcpstates[tp->t_state])); 1099 1100 send_reset(sc, toep, 0); 1101 return (0); 1102 } 1103 1104 /* 1105 * Peer has sent us a FIN. 1106 */ 1107 static int 1108 do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1109 { 1110 struct adapter *sc = iq->adapter; 1111 const struct cpl_peer_close *cpl = (const void *)(rss + 1); 1112 unsigned int tid = GET_TID(cpl); 1113 struct toepcb *toep = lookup_tid(sc, tid); 1114 struct inpcb *inp = toep->inp; 1115 struct tcpcb *tp = NULL; 1116 struct socket *so; 1117 #ifdef INVARIANTS 1118 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1119 #endif 1120 1121 KASSERT(opcode == CPL_PEER_CLOSE, 1122 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1123 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1124 1125 if (__predict_false(toep->flags & TPF_SYNQE)) { 1126 #ifdef INVARIANTS 1127 struct synq_entry *synqe = (void *)toep; 1128 1129 INP_WLOCK(synqe->lctx->inp); 1130 if (synqe->flags & TPF_SYNQE_HAS_L2TE) { 1131 KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN, 1132 ("%s: listen socket closed but tid %u not aborted.", 1133 __func__, tid)); 1134 } else { 1135 /* 1136 * do_pass_accept_req is still running and will 1137 * eventually take care of this tid. 1138 */ 1139 } 1140 INP_WUNLOCK(synqe->lctx->inp); 1141 #endif 1142 CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid, 1143 toep, toep->flags); 1144 return (0); 1145 } 1146 1147 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1148 1149 INP_INFO_RLOCK(&V_tcbinfo); 1150 INP_WLOCK(inp); 1151 tp = intotcpcb(inp); 1152 1153 CTR5(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x, inp %p", __func__, 1154 tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags, inp); 1155 1156 if (toep->flags & TPF_ABORT_SHUTDOWN) 1157 goto done; 1158 1159 tp->rcv_nxt++; /* FIN */ 1160 1161 so = inp->inp_socket; 1162 if (toep->ulp_mode == ULP_MODE_TCPDDP) { 1163 DDP_LOCK(toep); 1164 if (__predict_false(toep->ddp_flags & 1165 (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE))) 1166 handle_ddp_close(toep, tp, cpl->rcv_nxt); 1167 DDP_UNLOCK(toep); 1168 } 1169 socantrcvmore(so); 1170 1171 if (toep->ulp_mode != ULP_MODE_RDMA) { 1172 KASSERT(tp->rcv_nxt == be32toh(cpl->rcv_nxt), 1173 ("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt, 1174 be32toh(cpl->rcv_nxt))); 1175 } 1176 1177 switch (tp->t_state) { 1178 case TCPS_SYN_RECEIVED: 1179 tp->t_starttime = ticks; 1180 /* FALLTHROUGH */ 1181 1182 case TCPS_ESTABLISHED: 1183 tp->t_state = TCPS_CLOSE_WAIT; 1184 break; 1185 1186 case TCPS_FIN_WAIT_1: 1187 tp->t_state = TCPS_CLOSING; 1188 break; 1189 1190 case TCPS_FIN_WAIT_2: 1191 tcp_twstart(tp); 1192 INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ 1193 INP_INFO_RUNLOCK(&V_tcbinfo); 1194 1195 INP_WLOCK(inp); 1196 final_cpl_received(toep); 1197 return (0); 1198 1199 default: 1200 log(LOG_ERR, "%s: TID %u received CPL_PEER_CLOSE in state %d\n", 1201 __func__, tid, tp->t_state); 1202 } 1203 done: 1204 INP_WUNLOCK(inp); 1205 INP_INFO_RUNLOCK(&V_tcbinfo); 1206 return (0); 1207 } 1208 1209 /* 1210 * Peer has ACK'd our FIN. 
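 *
 * The reply carries the hardware's snd_nxt; the handler below sets snd_una
 * to that value minus one so the FIN itself is excluded, then advances the
 * state machine: CLOSING enters time-wait, LAST_ACK closes the pcb, and
 * FIN_WAIT_1 moves to FIN_WAIT_2 (disconnecting the socket if it can no
 * longer receive).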
1211 */ 1212 static int 1213 do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss, 1214 struct mbuf *m) 1215 { 1216 struct adapter *sc = iq->adapter; 1217 const struct cpl_close_con_rpl *cpl = (const void *)(rss + 1); 1218 unsigned int tid = GET_TID(cpl); 1219 struct toepcb *toep = lookup_tid(sc, tid); 1220 struct inpcb *inp = toep->inp; 1221 struct tcpcb *tp = NULL; 1222 struct socket *so = NULL; 1223 #ifdef INVARIANTS 1224 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1225 #endif 1226 1227 KASSERT(opcode == CPL_CLOSE_CON_RPL, 1228 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1229 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1230 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1231 1232 INP_INFO_RLOCK(&V_tcbinfo); 1233 INP_WLOCK(inp); 1234 tp = intotcpcb(inp); 1235 1236 CTR4(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x", 1237 __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags); 1238 1239 if (toep->flags & TPF_ABORT_SHUTDOWN) 1240 goto done; 1241 1242 so = inp->inp_socket; 1243 tp->snd_una = be32toh(cpl->snd_nxt) - 1; /* exclude FIN */ 1244 1245 switch (tp->t_state) { 1246 case TCPS_CLOSING: /* see TCPS_FIN_WAIT_2 in do_peer_close too */ 1247 tcp_twstart(tp); 1248 release: 1249 INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ 1250 INP_INFO_RUNLOCK(&V_tcbinfo); 1251 1252 INP_WLOCK(inp); 1253 final_cpl_received(toep); /* no more CPLs expected */ 1254 1255 return (0); 1256 case TCPS_LAST_ACK: 1257 if (tcp_close(tp)) 1258 INP_WUNLOCK(inp); 1259 goto release; 1260 1261 case TCPS_FIN_WAIT_1: 1262 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 1263 soisdisconnected(so); 1264 tp->t_state = TCPS_FIN_WAIT_2; 1265 break; 1266 1267 default: 1268 log(LOG_ERR, 1269 "%s: TID %u received CPL_CLOSE_CON_RPL in state %s\n", 1270 __func__, tid, tcpstates[tp->t_state]); 1271 } 1272 done: 1273 INP_WUNLOCK(inp); 1274 INP_INFO_RUNLOCK(&V_tcbinfo); 1275 return (0); 1276 } 1277 1278 void 1279 send_abort_rpl(struct adapter *sc, struct sge_wrq *ofld_txq, int tid, 1280 int rst_status) 1281 { 1282 struct wrqe *wr; 1283 struct cpl_abort_rpl *cpl; 1284 1285 wr = alloc_wrqe(sizeof(*cpl), ofld_txq); 1286 if (wr == NULL) { 1287 /* XXX */ 1288 panic("%s: allocation failure.", __func__); 1289 } 1290 cpl = wrtod(wr); 1291 1292 INIT_TP_WR_MIT_CPL(cpl, CPL_ABORT_RPL, tid); 1293 cpl->cmd = rst_status; 1294 1295 t4_wrq_tx(sc, wr); 1296 } 1297 1298 static int 1299 abort_status_to_errno(struct tcpcb *tp, unsigned int abort_reason) 1300 { 1301 switch (abort_reason) { 1302 case CPL_ERR_BAD_SYN: 1303 case CPL_ERR_CONN_RESET: 1304 return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET); 1305 case CPL_ERR_XMIT_TIMEDOUT: 1306 case CPL_ERR_PERSIST_TIMEDOUT: 1307 case CPL_ERR_FINWAIT2_TIMEDOUT: 1308 case CPL_ERR_KEEPALIVE_TIMEDOUT: 1309 return (ETIMEDOUT); 1310 default: 1311 return (EIO); 1312 } 1313 } 1314 1315 /* 1316 * TCP RST from the peer, timeout, or some other such critical error. 
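 *
 * abort_status_to_errno() above maps the CPL status to the errno reported
 * on the socket (for example CPL_ERR_CONN_RESET becomes ECONNRESET, or EPIPE
 * in CLOSE_WAIT, and the various timeout statuses become ETIMEDOUT).
 * Whatever the cause, the handler below always returns a CPL_ABORT_RPL to
 * the hardware, even when the abort crossed one that we had initiated
 * ourselves.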
1317 */ 1318 static int 1319 do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1320 { 1321 struct adapter *sc = iq->adapter; 1322 const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1); 1323 unsigned int tid = GET_TID(cpl); 1324 struct toepcb *toep = lookup_tid(sc, tid); 1325 struct sge_wrq *ofld_txq = toep->ofld_txq; 1326 struct inpcb *inp; 1327 struct tcpcb *tp; 1328 #ifdef INVARIANTS 1329 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1330 #endif 1331 1332 KASSERT(opcode == CPL_ABORT_REQ_RSS, 1333 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1334 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1335 1336 if (toep->flags & TPF_SYNQE) 1337 return (do_abort_req_synqe(iq, rss, m)); 1338 1339 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1340 1341 if (negative_advice(cpl->status)) { 1342 CTR4(KTR_CXGBE, "%s: negative advice %d for tid %d (0x%x)", 1343 __func__, cpl->status, tid, toep->flags); 1344 return (0); /* Ignore negative advice */ 1345 } 1346 1347 inp = toep->inp; 1348 INP_INFO_RLOCK(&V_tcbinfo); /* for tcp_close */ 1349 INP_WLOCK(inp); 1350 1351 tp = intotcpcb(inp); 1352 1353 CTR6(KTR_CXGBE, 1354 "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x, status %d", 1355 __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags, 1356 inp->inp_flags, cpl->status); 1357 1358 /* 1359 * If we'd initiated an abort earlier the reply to it is responsible for 1360 * cleaning up resources. Otherwise we tear everything down right here 1361 * right now. We owe the T4 a CPL_ABORT_RPL no matter what. 1362 */ 1363 if (toep->flags & TPF_ABORT_SHUTDOWN) { 1364 INP_WUNLOCK(inp); 1365 goto done; 1366 } 1367 toep->flags |= TPF_ABORT_SHUTDOWN; 1368 1369 if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) { 1370 struct socket *so = inp->inp_socket; 1371 1372 if (so != NULL) 1373 so_error_set(so, abort_status_to_errno(tp, 1374 cpl->status)); 1375 tp = tcp_close(tp); 1376 if (tp == NULL) 1377 INP_WLOCK(inp); /* re-acquire */ 1378 } 1379 1380 final_cpl_received(toep); 1381 done: 1382 INP_INFO_RUNLOCK(&V_tcbinfo); 1383 send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST); 1384 return (0); 1385 } 1386 1387 /* 1388 * Reply to the CPL_ABORT_REQ (send_reset) 1389 */ 1390 static int 1391 do_abort_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1392 { 1393 struct adapter *sc = iq->adapter; 1394 const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1); 1395 unsigned int tid = GET_TID(cpl); 1396 struct toepcb *toep = lookup_tid(sc, tid); 1397 struct inpcb *inp = toep->inp; 1398 #ifdef INVARIANTS 1399 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1400 #endif 1401 1402 KASSERT(opcode == CPL_ABORT_RPL_RSS, 1403 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1404 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1405 1406 if (toep->flags & TPF_SYNQE) 1407 return (do_abort_rpl_synqe(iq, rss, m)); 1408 1409 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1410 1411 CTR5(KTR_CXGBE, "%s: tid %u, toep %p, inp %p, status %d", 1412 __func__, tid, toep, inp, cpl->status); 1413 1414 KASSERT(toep->flags & TPF_ABORT_SHUTDOWN, 1415 ("%s: wasn't expecting abort reply", __func__)); 1416 1417 INP_WLOCK(inp); 1418 final_cpl_received(toep); 1419 1420 return (0); 1421 } 1422 1423 static int 1424 do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1425 { 1426 struct adapter *sc = iq->adapter; 1427 const struct cpl_rx_data *cpl = mtod(m, const void 
*); 1428 unsigned int tid = GET_TID(cpl); 1429 struct toepcb *toep = lookup_tid(sc, tid); 1430 struct inpcb *inp = toep->inp; 1431 struct tcpcb *tp; 1432 struct socket *so; 1433 struct sockbuf *sb; 1434 int len; 1435 uint32_t ddp_placed = 0; 1436 1437 if (__predict_false(toep->flags & TPF_SYNQE)) { 1438 #ifdef INVARIANTS 1439 struct synq_entry *synqe = (void *)toep; 1440 1441 INP_WLOCK(synqe->lctx->inp); 1442 if (synqe->flags & TPF_SYNQE_HAS_L2TE) { 1443 KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN, 1444 ("%s: listen socket closed but tid %u not aborted.", 1445 __func__, tid)); 1446 } else { 1447 /* 1448 * do_pass_accept_req is still running and will 1449 * eventually take care of this tid. 1450 */ 1451 } 1452 INP_WUNLOCK(synqe->lctx->inp); 1453 #endif 1454 CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid, 1455 toep, toep->flags); 1456 m_freem(m); 1457 return (0); 1458 } 1459 1460 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1461 1462 /* strip off CPL header */ 1463 m_adj(m, sizeof(*cpl)); 1464 len = m->m_pkthdr.len; 1465 1466 INP_WLOCK(inp); 1467 if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) { 1468 CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x", 1469 __func__, tid, len, inp->inp_flags); 1470 INP_WUNLOCK(inp); 1471 m_freem(m); 1472 return (0); 1473 } 1474 1475 tp = intotcpcb(inp); 1476 1477 if (__predict_false(tp->rcv_nxt != be32toh(cpl->seq))) 1478 ddp_placed = be32toh(cpl->seq) - tp->rcv_nxt; 1479 1480 tp->rcv_nxt += len; 1481 if (tp->rcv_wnd < len) { 1482 KASSERT(toep->ulp_mode == ULP_MODE_RDMA, 1483 ("%s: negative window size", __func__)); 1484 } 1485 1486 tp->rcv_wnd -= len; 1487 tp->t_rcvtime = ticks; 1488 1489 if (toep->ulp_mode == ULP_MODE_TCPDDP) 1490 DDP_LOCK(toep); 1491 so = inp_inpcbtosocket(inp); 1492 sb = &so->so_rcv; 1493 SOCKBUF_LOCK(sb); 1494 1495 if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) { 1496 CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)", 1497 __func__, tid, len); 1498 m_freem(m); 1499 SOCKBUF_UNLOCK(sb); 1500 if (toep->ulp_mode == ULP_MODE_TCPDDP) 1501 DDP_UNLOCK(toep); 1502 INP_WUNLOCK(inp); 1503 1504 INP_INFO_RLOCK(&V_tcbinfo); 1505 INP_WLOCK(inp); 1506 tp = tcp_drop(tp, ECONNRESET); 1507 if (tp) 1508 INP_WUNLOCK(inp); 1509 INP_INFO_RUNLOCK(&V_tcbinfo); 1510 1511 return (0); 1512 } 1513 1514 /* receive buffer autosize */ 1515 CURVNET_SET(so->so_vnet); 1516 if (sb->sb_flags & SB_AUTOSIZE && 1517 V_tcp_do_autorcvbuf && 1518 sb->sb_hiwat < V_tcp_autorcvbuf_max && 1519 len > (sbspace(sb) / 8 * 7)) { 1520 unsigned int hiwat = sb->sb_hiwat; 1521 unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc, 1522 V_tcp_autorcvbuf_max); 1523 1524 if (!sbreserve_locked(sb, newsize, so, NULL)) 1525 sb->sb_flags &= ~SB_AUTOSIZE; 1526 else 1527 toep->rx_credits += newsize - hiwat; 1528 } 1529 1530 if (toep->ddp_waiting_count != 0 || toep->ddp_active_count != 0) 1531 CTR3(KTR_CXGBE, "%s: tid %u, non-ddp rx (%d bytes)", __func__, 1532 tid, len); 1533 1534 if (toep->ulp_mode == ULP_MODE_TCPDDP) { 1535 int changed = !(toep->ddp_flags & DDP_ON) ^ cpl->ddp_off; 1536 1537 if (changed) { 1538 if (toep->ddp_flags & DDP_SC_REQ) 1539 toep->ddp_flags ^= DDP_ON | DDP_SC_REQ; 1540 else { 1541 KASSERT(cpl->ddp_off == 1, 1542 ("%s: DDP switched on by itself.", 1543 __func__)); 1544 1545 /* Fell out of DDP mode */ 1546 toep->ddp_flags &= ~DDP_ON; 1547 CTR1(KTR_CXGBE, "%s: fell out of DDP mode", 1548 __func__); 1549 1550 insert_ddp_data(toep, ddp_placed); 1551 } 1552 } 1553 1554 if (toep->ddp_flags & DDP_ON) { 1555 /* 1556 * CPL_RX_DATA with 
DDP on can only be an indicate. 1557 * Start posting queued AIO requests via DDP. The 1558 * payload that arrived in this indicate is appended 1559 * to the socket buffer as usual. 1560 */ 1561 handle_ddp_indicate(toep); 1562 } 1563 } 1564 1565 KASSERT(toep->sb_cc >= sbused(sb), 1566 ("%s: sb %p has more data (%d) than last time (%d).", 1567 __func__, sb, sbused(sb), toep->sb_cc)); 1568 toep->rx_credits += toep->sb_cc - sbused(sb); 1569 sbappendstream_locked(sb, m, 0); 1570 toep->sb_cc = sbused(sb); 1571 if (toep->rx_credits > 0 && toep->sb_cc + tp->rcv_wnd < sb->sb_lowat) { 1572 int credits; 1573 1574 credits = send_rx_credits(sc, toep, toep->rx_credits); 1575 toep->rx_credits -= credits; 1576 tp->rcv_wnd += credits; 1577 tp->rcv_adv += credits; 1578 } 1579 1580 if (toep->ddp_waiting_count > 0 && sbavail(sb) != 0) { 1581 CTR2(KTR_CXGBE, "%s: tid %u queueing AIO task", __func__, 1582 tid); 1583 ddp_queue_toep(toep); 1584 } 1585 sorwakeup_locked(so); 1586 SOCKBUF_UNLOCK_ASSERT(sb); 1587 if (toep->ulp_mode == ULP_MODE_TCPDDP) 1588 DDP_UNLOCK(toep); 1589 1590 INP_WUNLOCK(inp); 1591 CURVNET_RESTORE(); 1592 return (0); 1593 } 1594 1595 #define S_CPL_FW4_ACK_OPCODE 24 1596 #define M_CPL_FW4_ACK_OPCODE 0xff 1597 #define V_CPL_FW4_ACK_OPCODE(x) ((x) << S_CPL_FW4_ACK_OPCODE) 1598 #define G_CPL_FW4_ACK_OPCODE(x) \ 1599 (((x) >> S_CPL_FW4_ACK_OPCODE) & M_CPL_FW4_ACK_OPCODE) 1600 1601 #define S_CPL_FW4_ACK_FLOWID 0 1602 #define M_CPL_FW4_ACK_FLOWID 0xffffff 1603 #define V_CPL_FW4_ACK_FLOWID(x) ((x) << S_CPL_FW4_ACK_FLOWID) 1604 #define G_CPL_FW4_ACK_FLOWID(x) \ 1605 (((x) >> S_CPL_FW4_ACK_FLOWID) & M_CPL_FW4_ACK_FLOWID) 1606 1607 #define S_CPL_FW4_ACK_CR 24 1608 #define M_CPL_FW4_ACK_CR 0xff 1609 #define V_CPL_FW4_ACK_CR(x) ((x) << S_CPL_FW4_ACK_CR) 1610 #define G_CPL_FW4_ACK_CR(x) (((x) >> S_CPL_FW4_ACK_CR) & M_CPL_FW4_ACK_CR) 1611 1612 #define S_CPL_FW4_ACK_SEQVAL 0 1613 #define M_CPL_FW4_ACK_SEQVAL 0x1 1614 #define V_CPL_FW4_ACK_SEQVAL(x) ((x) << S_CPL_FW4_ACK_SEQVAL) 1615 #define G_CPL_FW4_ACK_SEQVAL(x) \ 1616 (((x) >> S_CPL_FW4_ACK_SEQVAL) & M_CPL_FW4_ACK_SEQVAL) 1617 #define F_CPL_FW4_ACK_SEQVAL V_CPL_FW4_ACK_SEQVAL(1U) 1618 1619 static int 1620 do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1621 { 1622 struct adapter *sc = iq->adapter; 1623 const struct cpl_fw4_ack *cpl = (const void *)(rss + 1); 1624 unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl))); 1625 struct toepcb *toep = lookup_tid(sc, tid); 1626 struct inpcb *inp; 1627 struct tcpcb *tp; 1628 struct socket *so; 1629 uint8_t credits = cpl->credits; 1630 struct ofld_tx_sdesc *txsd; 1631 int plen; 1632 #ifdef INVARIANTS 1633 unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl))); 1634 #endif 1635 1636 /* 1637 * Very unusual case: we'd sent a flowc + abort_req for a synq entry and 1638 * now this comes back carrying the credits for the flowc. 
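	 *
	 * In the common case the loop below walks the tx descriptor ring,
	 * returning each txsd's credits to the tid and accumulating the byte
	 * count in plen; afterwards the tid is either unsuspended (via
	 * t4_push_frames or t4_push_pdus) or the acked bytes are dropped
	 * from so_snd / the iSCSI reclaim queue.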
	 */
	if (__predict_false(toep->flags & TPF_SYNQE)) {
		KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
		    ("%s: credits for a synq entry %p", __func__, toep));
		return (0);
	}

	inp = toep->inp;

	KASSERT(opcode == CPL_FW4_ACK,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	INP_WLOCK(inp);

	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) {
		INP_WUNLOCK(inp);
		return (0);
	}

	KASSERT((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0,
	    ("%s: inp_flags 0x%x", __func__, inp->inp_flags));

	tp = intotcpcb(inp);

	if (cpl->flags & CPL_FW4_ACK_FLAGS_SEQVAL) {
		tcp_seq snd_una = be32toh(cpl->snd_una);

#ifdef INVARIANTS
		if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
			log(LOG_ERR,
			    "%s: unexpected seq# %x for TID %u, snd_una %x\n",
			    __func__, snd_una, toep->tid, tp->snd_una);
		}
#endif

		if (tp->snd_una != snd_una) {
			tp->snd_una = snd_una;
			tp->ts_recent_age = tcp_ts_getticks();
		}
	}

#ifdef VERBOSE_TRACES
	CTR3(KTR_CXGBE, "%s: tid %d credits %u", __func__, tid, credits);
#endif
	so = inp->inp_socket;
	txsd = &toep->txsd[toep->txsd_cidx];
	plen = 0;
	while (credits) {
		KASSERT(credits >= txsd->tx_credits,
		    ("%s: too many (or partial) credits", __func__));
		credits -= txsd->tx_credits;
		toep->tx_credits += txsd->tx_credits;
		plen += txsd->plen;
		txsd++;
		toep->txsd_avail++;
		KASSERT(toep->txsd_avail <= toep->txsd_total,
		    ("%s: txsd avail > total", __func__));
		if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) {
			txsd = &toep->txsd[0];
			toep->txsd_cidx = 0;
		}
	}

	if (toep->tx_credits == toep->tx_total) {
		toep->tx_nocompl = 0;
		toep->plen_nocompl = 0;
	}

	if (toep->flags & TPF_TX_SUSPENDED &&
	    toep->tx_credits >= toep->tx_total / 4) {
#ifdef VERBOSE_TRACES
		CTR2(KTR_CXGBE, "%s: tid %d calling t4_push_frames", __func__,
		    tid);
#endif
		toep->flags &= ~TPF_TX_SUSPENDED;
		if (toep->ulp_mode == ULP_MODE_ISCSI)
			t4_push_pdus(sc, toep, plen);
		else
			t4_push_frames(sc, toep, plen);
	} else if (plen > 0) {
		struct sockbuf *sb = &so->so_snd;
		int sbu;

		SOCKBUF_LOCK(sb);
		sbu = sbused(sb);
		if (toep->ulp_mode == ULP_MODE_ISCSI) {

			if (__predict_false(sbu > 0)) {
				/*
				 * The data transmitted before the tid's ULP
				 * mode changed to ISCSI is still in so_snd.
				 * Incoming credits should account for so_snd
				 * first.
1734 */ 1735 sbdrop_locked(sb, min(sbu, plen)); 1736 plen -= min(sbu, plen); 1737 } 1738 sowwakeup_locked(so); /* unlocks so_snd */ 1739 rqdrop_locked(&toep->ulp_pdu_reclaimq, plen); 1740 } else { 1741 #ifdef VERBOSE_TRACES 1742 CTR3(KTR_CXGBE, "%s: tid %d dropped %d bytes", __func__, 1743 tid, plen); 1744 #endif 1745 sbdrop_locked(sb, plen); 1746 if (!TAILQ_EMPTY(&toep->aiotx_jobq)) 1747 t4_aiotx_queue_toep(toep); 1748 sowwakeup_locked(so); /* unlocks so_snd */ 1749 } 1750 SOCKBUF_UNLOCK_ASSERT(sb); 1751 } 1752 1753 INP_WUNLOCK(inp); 1754 1755 return (0); 1756 } 1757 1758 int 1759 do_set_tcb_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1760 { 1761 struct adapter *sc = iq->adapter; 1762 const struct cpl_set_tcb_rpl *cpl = (const void *)(rss + 1); 1763 unsigned int tid = GET_TID(cpl); 1764 struct toepcb *toep; 1765 #ifdef INVARIANTS 1766 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1767 #endif 1768 1769 KASSERT(opcode == CPL_SET_TCB_RPL, 1770 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1771 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1772 MPASS(iq != &sc->sge.fwq); 1773 1774 toep = lookup_tid(sc, tid); 1775 if (toep->ulp_mode == ULP_MODE_TCPDDP) { 1776 handle_ddp_tcb_rpl(toep, cpl); 1777 return (0); 1778 } 1779 1780 /* 1781 * TOM and/or other ULPs don't request replies for CPL_SET_TCB or 1782 * CPL_SET_TCB_FIELD requests. This can easily change and when it does 1783 * the dispatch code will go here. 1784 */ 1785 #ifdef INVARIANTS 1786 panic("%s: Unexpected CPL_SET_TCB_RPL for tid %u on iq %p", __func__, 1787 tid, iq); 1788 #else 1789 log(LOG_ERR, "%s: Unexpected CPL_SET_TCB_RPL for tid %u on iq %p\n", 1790 __func__, tid, iq); 1791 #endif 1792 1793 return (0); 1794 } 1795 1796 void 1797 t4_set_tcb_field(struct adapter *sc, struct sge_wrq *wrq, int tid, 1798 uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie, int iqid) 1799 { 1800 struct wrqe *wr; 1801 struct cpl_set_tcb_field *req; 1802 1803 MPASS((cookie & ~M_COOKIE) == 0); 1804 MPASS((iqid & ~M_QUEUENO) == 0); 1805 1806 wr = alloc_wrqe(sizeof(*req), wrq); 1807 if (wr == NULL) { 1808 /* XXX */ 1809 panic("%s: allocation failure.", __func__); 1810 } 1811 req = wrtod(wr); 1812 1813 INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, tid); 1814 req->reply_ctrl = htobe16(V_QUEUENO(iqid)); 1815 if (reply == 0) 1816 req->reply_ctrl |= htobe16(F_NO_REPLY); 1817 req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(cookie)); 1818 req->mask = htobe64(mask); 1819 req->val = htobe64(val); 1820 1821 t4_wrq_tx(sc, wr); 1822 } 1823 1824 void 1825 t4_init_cpl_io_handlers(void) 1826 { 1827 1828 t4_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close); 1829 t4_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl); 1830 t4_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req); 1831 t4_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl); 1832 t4_register_cpl_handler(CPL_RX_DATA, do_rx_data); 1833 t4_register_cpl_handler(CPL_FW4_ACK, do_fw4_ack); 1834 } 1835 1836 void 1837 t4_uninit_cpl_io_handlers(void) 1838 { 1839 1840 t4_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close); 1841 t4_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl); 1842 t4_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req); 1843 t4_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl); 1844 t4_register_cpl_handler(CPL_RX_DATA, do_rx_data); 1845 t4_register_cpl_handler(CPL_FW4_ACK, do_fw4_ack); 1846 } 1847 1848 /* 1849 * Use the 'backend3' field in AIO jobs to store the amount of data 1850 * sent by the AIO 
job so far and the 'backend4' field to hold an 1851 * error that should be reported when the job is completed. 1852 */ 1853 #define aio_sent backend3 1854 #define aio_error backend4 1855 1856 #define jobtotid(job) \ 1857 (((struct toepcb *)(so_sototcpcb((job)->fd_file->f_data)->t_toe))->tid) 1858 1859 static void 1860 free_aiotx_buffer(struct aiotx_buffer *ab) 1861 { 1862 struct kaiocb *job; 1863 long status; 1864 int error; 1865 1866 if (refcount_release(&ab->refcount) == 0) 1867 return; 1868 1869 job = ab->job; 1870 error = job->aio_error; 1871 status = job->aio_sent; 1872 vm_page_unhold_pages(ab->ps.pages, ab->ps.npages); 1873 free(ab, M_CXGBE); 1874 #ifdef VERBOSE_TRACES 1875 CTR5(KTR_CXGBE, "%s: tid %d completed %p len %ld, error %d", __func__, 1876 jobtotid(job), job, status, error); 1877 #endif 1878 if (error == ECANCELED && status != 0) 1879 error = 0; 1880 if (error == ECANCELED) 1881 aio_cancel(job); 1882 else if (error) 1883 aio_complete(job, -1, error); 1884 else 1885 aio_complete(job, status, 0); 1886 } 1887 1888 static void 1889 t4_aiotx_mbuf_free(struct mbuf *m, void *buffer, void *arg) 1890 { 1891 struct aiotx_buffer *ab = buffer; 1892 1893 #ifdef VERBOSE_TRACES 1894 CTR3(KTR_CXGBE, "%s: completed %d bytes for tid %d", __func__, 1895 m->m_len, jobtotid(ab->job)); 1896 #endif 1897 free_aiotx_buffer(ab); 1898 } 1899 1900 /* 1901 * Hold the buffer backing an AIO request and return an AIO transmit 1902 * buffer. 1903 */ 1904 static int 1905 hold_aio(struct kaiocb *job) 1906 { 1907 struct aiotx_buffer *ab; 1908 struct vmspace *vm; 1909 vm_map_t map; 1910 vm_offset_t start, end, pgoff; 1911 int n; 1912 1913 MPASS(job->backend1 == NULL); 1914 1915 /* 1916 * The AIO subsystem will cancel and drain all requests before 1917 * permitting a process to exit or exec, so p_vmspace should 1918 * be stable here. 
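	 *
	 * The user buffer is then wired in place: the span is rounded out to
	 * whole pages, the pages are held with vm_fault_quick_hold_pages()
	 * for write access, and the first byte's sub-page offset is kept in
	 * ps.offset.  The aiotx_buffer starts with a single reference that
	 * belongs to the job.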
/*
 * Hold the buffer backing an AIO request and return an AIO transmit
 * buffer.
 */
static int
hold_aio(struct kaiocb *job)
{
        struct aiotx_buffer *ab;
        struct vmspace *vm;
        vm_map_t map;
        vm_offset_t start, end, pgoff;
        int n;

        MPASS(job->backend1 == NULL);

        /*
         * The AIO subsystem will cancel and drain all requests before
         * permitting a process to exit or exec, so p_vmspace should
         * be stable here.
         */
        vm = job->userproc->p_vmspace;
        map = &vm->vm_map;
        start = (uintptr_t)job->uaiocb.aio_buf;
        pgoff = start & PAGE_MASK;
        end = round_page(start + job->uaiocb.aio_nbytes);
        start = trunc_page(start);
        n = atop(end - start);

        ab = malloc(sizeof(*ab) + n * sizeof(vm_page_t), M_CXGBE, M_WAITOK |
            M_ZERO);
        refcount_init(&ab->refcount, 1);
        ab->ps.pages = (vm_page_t *)(ab + 1);
        ab->ps.npages = vm_fault_quick_hold_pages(map, start, end - start,
            VM_PROT_WRITE, ab->ps.pages, n);
        if (ab->ps.npages < 0) {
                free(ab, M_CXGBE);
                return (EFAULT);
        }

        KASSERT(ab->ps.npages == n,
            ("hold_aio: page count mismatch: %d vs %d", ab->ps.npages, n));

        ab->ps.offset = pgoff;
        ab->ps.len = job->uaiocb.aio_nbytes;
        ab->job = job;
        job->backend1 = ab;
#ifdef VERBOSE_TRACES
        CTR5(KTR_CXGBE, "%s: tid %d, new pageset %p for job %p, npages %d",
            __func__, jobtotid(job), &ab->ps, job, ab->ps.npages);
#endif
        return (0);
}
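/*
 * For reference, a userland program reaches this zero-copy transmit path
 * with an ordinary POSIX AIO write on a TOE-offloaded TCP socket once
 * sc->tt.tx_zcopy is enabled (see t4_aio_queue_aiotx() below, which only
 * accepts LIO_WRITE jobs).  A minimal sketch, with headers, variable
 * setup and error handling trimmed; 'sock', 'buf' and 'len' are assumed:
 *
 *	#include <aio.h>
 *
 *	struct aiocb cb;
 *	ssize_t done;
 *
 *	memset(&cb, 0, sizeof(cb));
 *	cb.aio_fildes = sock;			// connected TCP socket
 *	cb.aio_buf = buf;
 *	cb.aio_nbytes = len;
 *	if (aio_write(&cb) == 0) {
 *		while (aio_error(&cb) == EINPROGRESS)
 *			usleep(1000);		// or aio_suspend()/kqueue
 *		done = aio_return(&cb);		// bytes sent (job->aio_sent)
 *	}
 *
 * The pages backing 'buf' stay wired (hold_aio() above) until the last
 * mbuf referencing them has been freed.
 */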
static void
t4_aiotx_process_job(struct toepcb *toep, struct socket *so, struct kaiocb *job)
{
        struct adapter *sc;
        struct sockbuf *sb;
        struct file *fp;
        struct aiotx_buffer *ab;
        struct inpcb *inp;
        struct tcpcb *tp;
        struct mbuf *m;
        int error;
        bool moretocome, sendmore;

        sc = td_adapter(toep->td);
        sb = &so->so_snd;
        SOCKBUF_UNLOCK(sb);
        fp = job->fd_file;
        ab = job->backend1;
        m = NULL;

#ifdef MAC
        error = mac_socket_check_send(fp->f_cred, so);
        if (error != 0)
                goto out;
#endif

        if (ab == NULL) {
                error = hold_aio(job);
                if (error != 0)
                        goto out;
                ab = job->backend1;
        }

        /* Inline sosend_generic(). */

        job->msgsnd = 1;

        error = sblock(sb, SBL_WAIT);
        MPASS(error == 0);

sendanother:
        m = m_get(M_WAITOK, MT_DATA);

        SOCKBUF_LOCK(sb);
        if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
                SOCKBUF_UNLOCK(sb);
                sbunlock(sb);
                if ((so->so_options & SO_NOSIGPIPE) == 0) {
                        PROC_LOCK(job->userproc);
                        kern_psignal(job->userproc, SIGPIPE);
                        PROC_UNLOCK(job->userproc);
                }
                error = EPIPE;
                goto out;
        }
        if (so->so_error) {
                error = so->so_error;
                so->so_error = 0;
                SOCKBUF_UNLOCK(sb);
                sbunlock(sb);
                goto out;
        }
        if ((so->so_state & SS_ISCONNECTED) == 0) {
                SOCKBUF_UNLOCK(sb);
                sbunlock(sb);
                error = ENOTCONN;
                goto out;
        }
        if (sbspace(sb) < sb->sb_lowat) {
                MPASS(job->aio_sent == 0 || !(so->so_state & SS_NBIO));

                /*
                 * Don't block if there is too little room in the socket
                 * buffer.  Instead, requeue the request.
                 */
                if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
                        SOCKBUF_UNLOCK(sb);
                        sbunlock(sb);
                        error = ECANCELED;
                        goto out;
                }
                TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
                SOCKBUF_UNLOCK(sb);
                sbunlock(sb);
                goto out;
        }

        /*
         * Write as much data as the socket permits, but no more than a
         * single sndbuf at a time.
         */
        m->m_len = sbspace(sb);
        if (m->m_len > ab->ps.len - job->aio_sent) {
                m->m_len = ab->ps.len - job->aio_sent;
                moretocome = false;
        } else
                moretocome = true;
        if (m->m_len > sc->tt.sndbuf) {
                m->m_len = sc->tt.sndbuf;
                sendmore = true;
        } else
                sendmore = false;

        if (!TAILQ_EMPTY(&toep->aiotx_jobq))
                moretocome = true;
        SOCKBUF_UNLOCK(sb);
        MPASS(m->m_len != 0);

        /* Inlined tcp_usr_send(). */

        inp = toep->inp;
        INP_WLOCK(inp);
        if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
                INP_WUNLOCK(inp);
                sbunlock(sb);
                error = ECONNRESET;
                goto out;
        }

        refcount_acquire(&ab->refcount);
        m_extadd(m, NULL, ab->ps.len, t4_aiotx_mbuf_free, ab,
            (void *)(uintptr_t)job->aio_sent, 0, EXT_NET_DRV);
        m->m_ext.ext_flags |= EXT_FLAG_AIOTX;
        job->aio_sent += m->m_len;

        sbappendstream(sb, m, 0);
        m = NULL;

        if (!(inp->inp_flags & INP_DROPPED)) {
                tp = intotcpcb(inp);
                if (moretocome)
                        tp->t_flags |= TF_MORETOCOME;
                error = tp->t_fb->tfb_tcp_output(tp);
                if (moretocome)
                        tp->t_flags &= ~TF_MORETOCOME;
        }

        INP_WUNLOCK(inp);
        if (sendmore)
                goto sendanother;
        sbunlock(sb);

        if (error)
                goto out;

        /*
         * If this is a blocking socket and the request has not been
         * fully completed, requeue it until the socket is ready
         * again.
         */
        if (job->aio_sent < job->uaiocb.aio_nbytes &&
            !(so->so_state & SS_NBIO)) {
                SOCKBUF_LOCK(sb);
                if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
                        SOCKBUF_UNLOCK(sb);
                        error = ECANCELED;
                        goto out;
                }
                TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
                return;
        }

        /*
         * If the request will not be requeued, drop a reference on
         * the aiotx buffer.  Any mbufs in flight should still
         * contain a reference, but this drops the reference that the
         * job owns while it is waiting to queue mbufs to the socket.
         */
        free_aiotx_buffer(ab);

out:
        if (error) {
                if (ab != NULL) {
                        job->aio_error = error;
                        free_aiotx_buffer(ab);
                } else {
                        MPASS(job->aio_sent == 0);
                        aio_complete(job, -1, error);
                }
        }
        if (m != NULL)
                m_free(m);
        SOCKBUF_LOCK(sb);
}

static void
t4_aiotx_task(void *context, int pending)
{
        struct toepcb *toep = context;
        struct inpcb *inp = toep->inp;
        struct socket *so = inp->inp_socket;
        struct kaiocb *job;

        CURVNET_SET(so->so_vnet);
        SOCKBUF_LOCK(&so->so_snd);
        while (!TAILQ_EMPTY(&toep->aiotx_jobq) && sowriteable(so)) {
                job = TAILQ_FIRST(&toep->aiotx_jobq);
                TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
                if (!aio_clear_cancel_function(job))
                        continue;

                t4_aiotx_process_job(toep, so, job);
        }
        toep->aiotx_task_active = false;
        SOCKBUF_UNLOCK(&so->so_snd);
        CURVNET_RESTORE();

        free_toepcb(toep);
}
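/*
 * The helper below pairs with t4_aiotx_task() above: it runs with the
 * so_snd lock held, uses aiotx_task_active to avoid enqueueing the task
 * more than once, and takes an extra toepcb reference (hold_toepcb())
 * that the task releases via free_toepcb() once it has drained the job
 * queue.
 */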
"true" : "false"); 2171 #endif 2172 if (toep->aiotx_task_active) 2173 return; 2174 toep->aiotx_task_active = true; 2175 hold_toepcb(toep); 2176 soaio_enqueue(&toep->aiotx_task); 2177 } 2178 2179 static void 2180 t4_aiotx_cancel(struct kaiocb *job) 2181 { 2182 struct aiotx_buffer *ab; 2183 struct socket *so; 2184 struct sockbuf *sb; 2185 struct tcpcb *tp; 2186 struct toepcb *toep; 2187 2188 so = job->fd_file->f_data; 2189 tp = so_sototcpcb(so); 2190 toep = tp->t_toe; 2191 MPASS(job->uaiocb.aio_lio_opcode == LIO_WRITE); 2192 sb = &so->so_snd; 2193 2194 SOCKBUF_LOCK(sb); 2195 if (!aio_cancel_cleared(job)) 2196 TAILQ_REMOVE(&toep->aiotx_jobq, job, list); 2197 SOCKBUF_UNLOCK(sb); 2198 2199 ab = job->backend1; 2200 if (ab != NULL) 2201 free_aiotx_buffer(ab); 2202 else 2203 aio_cancel(job); 2204 } 2205 2206 int 2207 t4_aio_queue_aiotx(struct socket *so, struct kaiocb *job) 2208 { 2209 struct tcpcb *tp = so_sototcpcb(so); 2210 struct toepcb *toep = tp->t_toe; 2211 struct adapter *sc = td_adapter(toep->td); 2212 2213 /* This only handles writes. */ 2214 if (job->uaiocb.aio_lio_opcode != LIO_WRITE) 2215 return (EOPNOTSUPP); 2216 2217 if (!sc->tt.tx_zcopy) 2218 return (EOPNOTSUPP); 2219 2220 SOCKBUF_LOCK(&so->so_snd); 2221 #ifdef VERBOSE_TRACES 2222 CTR2(KTR_CXGBE, "%s: queueing %p", __func__, job); 2223 #endif 2224 if (!aio_set_cancel_function(job, t4_aiotx_cancel)) 2225 panic("new job was cancelled"); 2226 TAILQ_INSERT_TAIL(&toep->aiotx_jobq, job, list); 2227 if (sowriteable(so)) 2228 t4_aiotx_queue_toep(toep); 2229 SOCKBUF_UNLOCK(&so->so_snd); 2230 return (0); 2231 } 2232 2233 void 2234 aiotx_init_toep(struct toepcb *toep) 2235 { 2236 2237 TAILQ_INIT(&toep->aiotx_jobq); 2238 TASK_INIT(&toep->aiotx_task, 0, t4_aiotx_task, toep); 2239 } 2240 #endif 2241