1 /*- 2 * Copyright (c) 2012, 2015 Chelsio Communications, Inc. 3 * All rights reserved. 4 * Written by: Navdeep Parhar <np@FreeBSD.org> 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 26 */ 27 28 #include <sys/cdefs.h> 29 __FBSDID("$FreeBSD$"); 30 31 #include "opt_inet.h" 32 #include "opt_inet6.h" 33 #include "opt_ratelimit.h" 34 35 #ifdef TCP_OFFLOAD 36 #include <sys/param.h> 37 #include <sys/aio.h> 38 #include <sys/file.h> 39 #include <sys/kernel.h> 40 #include <sys/ktr.h> 41 #include <sys/module.h> 42 #include <sys/proc.h> 43 #include <sys/protosw.h> 44 #include <sys/domain.h> 45 #include <sys/socket.h> 46 #include <sys/socketvar.h> 47 #include <sys/sglist.h> 48 #include <sys/taskqueue.h> 49 #include <netinet/in.h> 50 #include <netinet/in_pcb.h> 51 #include <netinet/ip.h> 52 #include <netinet/ip6.h> 53 #define TCPSTATES 54 #include <netinet/tcp_fsm.h> 55 #include <netinet/tcp_seq.h> 56 #include <netinet/tcp_var.h> 57 #include <netinet/toecore.h> 58 59 #include <security/mac/mac_framework.h> 60 61 #include <vm/vm.h> 62 #include <vm/vm_extern.h> 63 #include <vm/pmap.h> 64 #include <vm/vm_map.h> 65 #include <vm/vm_page.h> 66 67 #include "common/common.h" 68 #include "common/t4_msg.h" 69 #include "common/t4_regs.h" 70 #include "common/t4_tcb.h" 71 #include "tom/t4_tom_l2t.h" 72 #include "tom/t4_tom.h" 73 74 #define IS_AIOTX_MBUF(m) \ 75 ((m)->m_flags & M_EXT && (m)->m_ext.ext_flags & EXT_FLAG_AIOTX) 76 77 static void t4_aiotx_cancel(struct kaiocb *job); 78 static void t4_aiotx_queue_toep(struct toepcb *toep); 79 80 static size_t 81 aiotx_mbuf_pgoff(struct mbuf *m) 82 { 83 struct aiotx_buffer *ab; 84 85 MPASS(IS_AIOTX_MBUF(m)); 86 ab = m->m_ext.ext_arg1; 87 return ((ab->ps.offset + (uintptr_t)m->m_ext.ext_arg2) % PAGE_SIZE); 88 } 89 90 static vm_page_t * 91 aiotx_mbuf_pages(struct mbuf *m) 92 { 93 struct aiotx_buffer *ab; 94 int npages; 95 96 MPASS(IS_AIOTX_MBUF(m)); 97 ab = m->m_ext.ext_arg1; 98 npages = (ab->ps.offset + (uintptr_t)m->m_ext.ext_arg2) / PAGE_SIZE; 99 return (ab->ps.pages + npages); 100 } 101 102 void 103 send_flowc_wr(struct toepcb *toep, struct flowc_tx_params *ftxp) 104 { 105 struct wrqe *wr; 106 struct fw_flowc_wr *flowc; 107 unsigned int nparams = ftxp ? 
8 : 6, flowclen;
	struct vi_info *vi = toep->vi;
	struct port_info *pi = vi->pi;
	struct adapter *sc = pi->adapter;
	unsigned int pfvf = G_FW_VIID_PFN(vi->viid) << S_FW_VIID_PFN;
	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];

	KASSERT(!(toep->flags & TPF_FLOWC_WR_SENT),
	    ("%s: flowc for tid %u sent already", __func__, toep->tid));

	flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);

	wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	flowc = wrtod(wr);
	memset(flowc, 0, wr->wr_len);

	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
	    V_FW_FLOWC_WR_NPARAMS(nparams));
	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
	    V_FW_WR_FLOWID(toep->tid));

	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
	flowc->mnemval[0].val = htobe32(pfvf);
	flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
	flowc->mnemval[1].val = htobe32(pi->tx_chan);
	flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
	flowc->mnemval[2].val = htobe32(pi->tx_chan);
	flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
	flowc->mnemval[3].val = htobe32(toep->ofld_rxq->iq.abs_id);
	if (ftxp) {
		uint32_t sndbuf = min(ftxp->snd_space, sc->tt.sndbuf);

		flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDNXT;
		flowc->mnemval[4].val = htobe32(ftxp->snd_nxt);
		flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_RCVNXT;
		flowc->mnemval[5].val = htobe32(ftxp->rcv_nxt);
		flowc->mnemval[6].mnemonic = FW_FLOWC_MNEM_SNDBUF;
		flowc->mnemval[6].val = htobe32(sndbuf);
		flowc->mnemval[7].mnemonic = FW_FLOWC_MNEM_MSS;
		flowc->mnemval[7].val = htobe32(ftxp->mss);

		CTR6(KTR_CXGBE,
		    "%s: tid %u, mss %u, sndbuf %u, snd_nxt 0x%x, rcv_nxt 0x%x",
		    __func__, toep->tid, ftxp->mss, sndbuf, ftxp->snd_nxt,
		    ftxp->rcv_nxt);
	} else {
		flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDBUF;
		flowc->mnemval[4].val = htobe32(512);
		flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_MSS;
		flowc->mnemval[5].val = htobe32(512);

		CTR2(KTR_CXGBE, "%s: tid %u", __func__, toep->tid);
	}

	txsd->tx_credits = howmany(flowclen, 16);
	txsd->plen = 0;
	KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0,
	    ("%s: not enough credits (%d)", __func__, toep->tx_credits));
	toep->tx_credits -= txsd->tx_credits;
	if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
		toep->txsd_pidx = 0;
	toep->txsd_avail--;

	toep->flags |= TPF_FLOWC_WR_SENT;
	t4_wrq_tx(sc, wr);
}

#ifdef RATELIMIT
/*
 * Input is Bytes/second (so_max_pacing_rate), chip counts in Kilobits/second.
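 * For example, a so_max_pacing_rate of 125000000 B/s (1 Gb/s) works out to
 * 125000000 * 8 / 1000 = 1000000 Kb/s before the scheduler class is
 * reserved with t4_reserve_cl_rl_kbps() below.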
181 */ 182 static int 183 update_tx_rate_limit(struct adapter *sc, struct toepcb *toep, u_int Bps) 184 { 185 int tc_idx, rc; 186 const u_int kbps = (u_int) (uint64_t)Bps * 8ULL / 1000; 187 const int port_id = toep->vi->pi->port_id; 188 189 CTR3(KTR_CXGBE, "%s: tid %u, rate %uKbps", __func__, toep->tid, kbps); 190 191 if (kbps == 0) { 192 /* unbind */ 193 tc_idx = -1; 194 } else { 195 rc = t4_reserve_cl_rl_kbps(sc, port_id, kbps, &tc_idx); 196 if (rc != 0) 197 return (rc); 198 MPASS(tc_idx >= 0 && tc_idx < sc->chip_params->nsched_cls); 199 } 200 201 if (toep->tc_idx != tc_idx) { 202 struct wrqe *wr; 203 struct fw_flowc_wr *flowc; 204 int nparams = 1, flowclen, flowclen16; 205 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; 206 207 flowclen = sizeof(*flowc) + nparams * sizeof(struct 208 fw_flowc_mnemval); 209 flowclen16 = howmany(flowclen, 16); 210 if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0 || 211 (wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq)) == NULL) { 212 if (tc_idx >= 0) 213 t4_release_cl_rl_kbps(sc, port_id, tc_idx); 214 return (ENOMEM); 215 } 216 217 flowc = wrtod(wr); 218 memset(flowc, 0, wr->wr_len); 219 220 flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | 221 V_FW_FLOWC_WR_NPARAMS(nparams)); 222 flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) | 223 V_FW_WR_FLOWID(toep->tid)); 224 225 flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS; 226 if (tc_idx == -1) 227 flowc->mnemval[0].val = htobe32(0xff); 228 else 229 flowc->mnemval[0].val = htobe32(tc_idx); 230 231 txsd->tx_credits = flowclen16; 232 txsd->plen = 0; 233 toep->tx_credits -= txsd->tx_credits; 234 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) 235 toep->txsd_pidx = 0; 236 toep->txsd_avail--; 237 t4_wrq_tx(sc, wr); 238 } 239 240 if (toep->tc_idx >= 0) 241 t4_release_cl_rl_kbps(sc, port_id, toep->tc_idx); 242 toep->tc_idx = tc_idx; 243 244 return (0); 245 } 246 #endif 247 248 void 249 send_reset(struct adapter *sc, struct toepcb *toep, uint32_t snd_nxt) 250 { 251 struct wrqe *wr; 252 struct cpl_abort_req *req; 253 int tid = toep->tid; 254 struct inpcb *inp = toep->inp; 255 struct tcpcb *tp = intotcpcb(inp); /* don't use if INP_DROPPED */ 256 257 INP_WLOCK_ASSERT(inp); 258 259 CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x%s", 260 __func__, toep->tid, 261 inp->inp_flags & INP_DROPPED ? "inp dropped" : 262 tcpstates[tp->t_state], 263 toep->flags, inp->inp_flags, 264 toep->flags & TPF_ABORT_SHUTDOWN ? 265 " (abort already in progress)" : ""); 266 267 if (toep->flags & TPF_ABORT_SHUTDOWN) 268 return; /* abort already in progress */ 269 270 toep->flags |= TPF_ABORT_SHUTDOWN; 271 272 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 273 ("%s: flowc_wr not sent for tid %d.", __func__, tid)); 274 275 wr = alloc_wrqe(sizeof(*req), toep->ofld_txq); 276 if (wr == NULL) { 277 /* XXX */ 278 panic("%s: allocation failure.", __func__); 279 } 280 req = wrtod(wr); 281 282 INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid); 283 if (inp->inp_flags & INP_DROPPED) 284 req->rsvd0 = htobe32(snd_nxt); 285 else 286 req->rsvd0 = htobe32(tp->snd_nxt); 287 req->rsvd1 = !(toep->flags & TPF_TX_DATA_SENT); 288 req->cmd = CPL_ABORT_SEND_RST; 289 290 /* 291 * XXX: What's the correct way to tell that the inp hasn't been detached 292 * from its socket? Should I even be flushing the snd buffer here? 293 */ 294 if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) { 295 struct socket *so = inp->inp_socket; 296 297 if (so != NULL) /* because I'm not sure. 
See comment above */ 298 sbflush(&so->so_snd); 299 } 300 301 t4_l2t_send(sc, wr, toep->l2te); 302 } 303 304 /* 305 * Called when a connection is established to translate the TCP options 306 * reported by HW to FreeBSD's native format. 307 */ 308 static void 309 assign_rxopt(struct tcpcb *tp, unsigned int opt) 310 { 311 struct toepcb *toep = tp->t_toe; 312 struct inpcb *inp = tp->t_inpcb; 313 struct adapter *sc = td_adapter(toep->td); 314 int n; 315 316 INP_LOCK_ASSERT(inp); 317 318 if (inp->inp_inc.inc_flags & INC_ISIPV6) 319 n = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 320 else 321 n = sizeof(struct ip) + sizeof(struct tcphdr); 322 if (V_tcp_do_rfc1323) 323 n += TCPOLEN_TSTAMP_APPA; 324 tp->t_maxseg = sc->params.mtus[G_TCPOPT_MSS(opt)] - n; 325 326 CTR4(KTR_CXGBE, "%s: tid %d, mtu_idx %u (%u)", __func__, toep->tid, 327 G_TCPOPT_MSS(opt), sc->params.mtus[G_TCPOPT_MSS(opt)]); 328 329 if (G_TCPOPT_TSTAMP(opt)) { 330 tp->t_flags |= TF_RCVD_TSTMP; /* timestamps ok */ 331 tp->ts_recent = 0; /* hmmm */ 332 tp->ts_recent_age = tcp_ts_getticks(); 333 } 334 335 if (G_TCPOPT_SACK(opt)) 336 tp->t_flags |= TF_SACK_PERMIT; /* should already be set */ 337 else 338 tp->t_flags &= ~TF_SACK_PERMIT; /* sack disallowed by peer */ 339 340 if (G_TCPOPT_WSCALE_OK(opt)) 341 tp->t_flags |= TF_RCVD_SCALE; 342 343 /* Doing window scaling? */ 344 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 345 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 346 tp->rcv_scale = tp->request_r_scale; 347 tp->snd_scale = G_TCPOPT_SND_WSCALE(opt); 348 } 349 } 350 351 /* 352 * Completes some final bits of initialization for just established connections 353 * and changes their state to TCPS_ESTABLISHED. 354 * 355 * The ISNs are from after the exchange of SYNs. i.e., the true ISN + 1. 356 */ 357 void 358 make_established(struct toepcb *toep, uint32_t snd_isn, uint32_t rcv_isn, 359 uint16_t opt) 360 { 361 struct inpcb *inp = toep->inp; 362 struct socket *so = inp->inp_socket; 363 struct tcpcb *tp = intotcpcb(inp); 364 long bufsize; 365 uint32_t iss = be32toh(snd_isn) - 1; /* true ISS */ 366 uint32_t irs = be32toh(rcv_isn) - 1; /* true IRS */ 367 uint16_t tcpopt = be16toh(opt); 368 struct flowc_tx_params ftxp; 369 370 INP_WLOCK_ASSERT(inp); 371 KASSERT(tp->t_state == TCPS_SYN_SENT || 372 tp->t_state == TCPS_SYN_RECEIVED, 373 ("%s: TCP state %s", __func__, tcpstates[tp->t_state])); 374 375 CTR6(KTR_CXGBE, "%s: tid %d, so %p, inp %p, tp %p, toep %p", 376 __func__, toep->tid, so, inp, tp, toep); 377 378 tp->t_state = TCPS_ESTABLISHED; 379 tp->t_starttime = ticks; 380 TCPSTAT_INC(tcps_connects); 381 382 tp->irs = irs; 383 tcp_rcvseqinit(tp); 384 tp->rcv_wnd = toep->rx_credits << 10; 385 tp->rcv_adv += tp->rcv_wnd; 386 tp->last_ack_sent = tp->rcv_nxt; 387 388 /* 389 * If we were unable to send all rx credits via opt0, save the remainder 390 * in rx_credits so that they can be handed over with the next credit 391 * update. 
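	 * Roughly: rcv_wnd was set to rx_credits << 10 above, so whatever the
	 * receive buffer can hold beyond that (bufsize - rcv_wnd) stays in
	 * rx_credits and is returned to the chip later via CPL_RX_DATA_ACK in
	 * send_rx_credits().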
392 */ 393 SOCKBUF_LOCK(&so->so_rcv); 394 bufsize = select_rcv_wnd(so); 395 SOCKBUF_UNLOCK(&so->so_rcv); 396 toep->rx_credits = bufsize - tp->rcv_wnd; 397 398 tp->iss = iss; 399 tcp_sendseqinit(tp); 400 tp->snd_una = iss + 1; 401 tp->snd_nxt = iss + 1; 402 tp->snd_max = iss + 1; 403 404 assign_rxopt(tp, tcpopt); 405 406 SOCKBUF_LOCK(&so->so_snd); 407 if (so->so_snd.sb_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf) 408 bufsize = V_tcp_autosndbuf_max; 409 else 410 bufsize = sbspace(&so->so_snd); 411 SOCKBUF_UNLOCK(&so->so_snd); 412 413 ftxp.snd_nxt = tp->snd_nxt; 414 ftxp.rcv_nxt = tp->rcv_nxt; 415 ftxp.snd_space = bufsize; 416 ftxp.mss = tp->t_maxseg; 417 send_flowc_wr(toep, &ftxp); 418 419 soisconnected(so); 420 } 421 422 static int 423 send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits) 424 { 425 struct wrqe *wr; 426 struct cpl_rx_data_ack *req; 427 uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); 428 429 KASSERT(credits >= 0, ("%s: %d credits", __func__, credits)); 430 431 wr = alloc_wrqe(sizeof(*req), toep->ctrlq); 432 if (wr == NULL) 433 return (0); 434 req = wrtod(wr); 435 436 INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid); 437 req->credit_dack = htobe32(dack | V_RX_CREDITS(credits)); 438 439 t4_wrq_tx(sc, wr); 440 return (credits); 441 } 442 443 void 444 t4_rcvd_locked(struct toedev *tod, struct tcpcb *tp) 445 { 446 struct adapter *sc = tod->tod_softc; 447 struct inpcb *inp = tp->t_inpcb; 448 struct socket *so = inp->inp_socket; 449 struct sockbuf *sb = &so->so_rcv; 450 struct toepcb *toep = tp->t_toe; 451 int credits; 452 453 INP_WLOCK_ASSERT(inp); 454 455 SOCKBUF_LOCK_ASSERT(sb); 456 KASSERT(toep->sb_cc >= sbused(sb), 457 ("%s: sb %p has more data (%d) than last time (%d).", 458 __func__, sb, sbused(sb), toep->sb_cc)); 459 460 toep->rx_credits += toep->sb_cc - sbused(sb); 461 toep->sb_cc = sbused(sb); 462 463 if (toep->rx_credits > 0 && 464 (tp->rcv_wnd <= 32 * 1024 || toep->rx_credits >= 64 * 1024 || 465 (toep->rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) || 466 toep->sb_cc + tp->rcv_wnd < sb->sb_lowat)) { 467 468 credits = send_rx_credits(sc, toep, toep->rx_credits); 469 toep->rx_credits -= credits; 470 tp->rcv_wnd += credits; 471 tp->rcv_adv += credits; 472 } 473 } 474 475 void 476 t4_rcvd(struct toedev *tod, struct tcpcb *tp) 477 { 478 struct inpcb *inp = tp->t_inpcb; 479 struct socket *so = inp->inp_socket; 480 struct sockbuf *sb = &so->so_rcv; 481 482 SOCKBUF_LOCK(sb); 483 t4_rcvd_locked(tod, tp); 484 SOCKBUF_UNLOCK(sb); 485 } 486 487 /* 488 * Close a connection by sending a CPL_CLOSE_CON_REQ message. 489 */ 490 static int 491 close_conn(struct adapter *sc, struct toepcb *toep) 492 { 493 struct wrqe *wr; 494 struct cpl_close_con_req *req; 495 unsigned int tid = toep->tid; 496 497 CTR3(KTR_CXGBE, "%s: tid %u%s", __func__, toep->tid, 498 toep->flags & TPF_FIN_SENT ? 
", IGNORED" : ""); 499 500 if (toep->flags & TPF_FIN_SENT) 501 return (0); 502 503 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 504 ("%s: flowc_wr not sent for tid %u.", __func__, tid)); 505 506 wr = alloc_wrqe(sizeof(*req), toep->ofld_txq); 507 if (wr == NULL) { 508 /* XXX */ 509 panic("%s: allocation failure.", __func__); 510 } 511 req = wrtod(wr); 512 513 req->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) | 514 V_FW_WR_IMMDLEN(sizeof(*req) - sizeof(req->wr))); 515 req->wr.wr_mid = htonl(V_FW_WR_LEN16(howmany(sizeof(*req), 16)) | 516 V_FW_WR_FLOWID(tid)); 517 req->wr.wr_lo = cpu_to_be64(0); 518 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid)); 519 req->rsvd = 0; 520 521 toep->flags |= TPF_FIN_SENT; 522 toep->flags &= ~TPF_SEND_FIN; 523 t4_l2t_send(sc, wr, toep->l2te); 524 525 return (0); 526 } 527 528 #define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16) 529 #define MIN_OFLD_TX_CREDITS (howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16)) 530 531 /* Maximum amount of immediate data we could stuff in a WR */ 532 static inline int 533 max_imm_payload(int tx_credits) 534 { 535 const int n = 2; /* Use only up to 2 desc for imm. data WR */ 536 537 KASSERT(tx_credits >= 0 && 538 tx_credits <= MAX_OFLD_TX_CREDITS, 539 ("%s: %d credits", __func__, tx_credits)); 540 541 if (tx_credits < MIN_OFLD_TX_CREDITS) 542 return (0); 543 544 if (tx_credits >= (n * EQ_ESIZE) / 16) 545 return ((n * EQ_ESIZE) - sizeof(struct fw_ofld_tx_data_wr)); 546 else 547 return (tx_credits * 16 - sizeof(struct fw_ofld_tx_data_wr)); 548 } 549 550 /* Maximum number of SGL entries we could stuff in a WR */ 551 static inline int 552 max_dsgl_nsegs(int tx_credits) 553 { 554 int nseg = 1; /* ulptx_sgl has room for 1, rest ulp_tx_sge_pair */ 555 int sge_pair_credits = tx_credits - MIN_OFLD_TX_CREDITS; 556 557 KASSERT(tx_credits >= 0 && 558 tx_credits <= MAX_OFLD_TX_CREDITS, 559 ("%s: %d credits", __func__, tx_credits)); 560 561 if (tx_credits < MIN_OFLD_TX_CREDITS) 562 return (0); 563 564 nseg += 2 * (sge_pair_credits * 16 / 24); 565 if ((sge_pair_credits * 16) % 24 == 16) 566 nseg++; 567 568 return (nseg); 569 } 570 571 static inline void 572 write_tx_wr(void *dst, struct toepcb *toep, unsigned int immdlen, 573 unsigned int plen, uint8_t credits, int shove, int ulp_submode, int txalign) 574 { 575 struct fw_ofld_tx_data_wr *txwr = dst; 576 577 txwr->op_to_immdlen = htobe32(V_WR_OP(FW_OFLD_TX_DATA_WR) | 578 V_FW_WR_IMMDLEN(immdlen)); 579 txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) | 580 V_FW_WR_LEN16(credits)); 581 txwr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(toep->ulp_mode) | 582 V_TX_ULP_SUBMODE(ulp_submode) | V_TX_URG(0) | V_TX_SHOVE(shove)); 583 txwr->plen = htobe32(plen); 584 585 if (txalign > 0) { 586 struct tcpcb *tp = intotcpcb(toep->inp); 587 588 if (plen < 2 * tp->t_maxseg || is_10G_port(toep->vi->pi)) 589 txwr->lsodisable_to_flags |= 590 htobe32(F_FW_OFLD_TX_DATA_WR_LSODISABLE); 591 else 592 txwr->lsodisable_to_flags |= 593 htobe32(F_FW_OFLD_TX_DATA_WR_ALIGNPLD | 594 (tp->t_flags & TF_NODELAY ? 0 : 595 F_FW_OFLD_TX_DATA_WR_ALIGNPLDSHOVE)); 596 } 597 } 598 599 /* 600 * Generate a DSGL from a starting mbuf. The total number of segments and the 601 * maximum segments in any one mbuf are provided. 
602 */ 603 static void 604 write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n) 605 { 606 struct mbuf *m; 607 struct ulptx_sgl *usgl = dst; 608 int i, j, rc; 609 struct sglist sg; 610 struct sglist_seg segs[n]; 611 612 KASSERT(nsegs > 0, ("%s: nsegs 0", __func__)); 613 614 sglist_init(&sg, n, segs); 615 usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) | 616 V_ULPTX_NSGE(nsegs)); 617 618 i = -1; 619 for (m = start; m != stop; m = m->m_next) { 620 if (IS_AIOTX_MBUF(m)) 621 rc = sglist_append_vmpages(&sg, aiotx_mbuf_pages(m), 622 aiotx_mbuf_pgoff(m), m->m_len); 623 else 624 rc = sglist_append(&sg, mtod(m, void *), m->m_len); 625 if (__predict_false(rc != 0)) 626 panic("%s: sglist_append %d", __func__, rc); 627 628 for (j = 0; j < sg.sg_nseg; i++, j++) { 629 if (i < 0) { 630 usgl->len0 = htobe32(segs[j].ss_len); 631 usgl->addr0 = htobe64(segs[j].ss_paddr); 632 } else { 633 usgl->sge[i / 2].len[i & 1] = 634 htobe32(segs[j].ss_len); 635 usgl->sge[i / 2].addr[i & 1] = 636 htobe64(segs[j].ss_paddr); 637 } 638 #ifdef INVARIANTS 639 nsegs--; 640 #endif 641 } 642 sglist_reset(&sg); 643 } 644 if (i & 1) 645 usgl->sge[i / 2].len[1] = htobe32(0); 646 KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, stop %p", 647 __func__, nsegs, start, stop)); 648 } 649 650 /* 651 * Max number of SGL entries an offload tx work request can have. This is 41 652 * (1 + 40) for a full 512B work request. 653 * fw_ofld_tx_data_wr(16B) + ulptx_sgl(16B, 1) + ulptx_sge_pair(480B, 40) 654 */ 655 #define OFLD_SGL_LEN (41) 656 657 /* 658 * Send data and/or a FIN to the peer. 659 * 660 * The socket's so_snd buffer consists of a stream of data starting with sb_mb 661 * and linked together with m_next. sb_sndptr, if set, is the last mbuf that 662 * was transmitted. 663 * 664 * drop indicates the number of bytes that should be dropped from the head of 665 * the send buffer. It is an optimization that lets do_fw4_ack avoid creating 666 * contention on the send buffer lock (before this change it used to do 667 * sowwakeup and then t4_push_frames right after that when recovering from tx 668 * stalls). When drop is set this function MUST drop the bytes and wake up any 669 * writers. 
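 * For example, do_fw4_ack passes the number of bytes just acked by the chip
 * as 'drop' when it resumes a suspended tid, so the sbdrop and the writer
 * wakeup below happen under a single socket buffer lock acquisition.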
 */
void
t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop)
{
	struct mbuf *sndptr, *m, *sb_sndptr;
	struct fw_ofld_tx_data_wr *txwr;
	struct wrqe *wr;
	u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = intotcpcb(inp);
	struct socket *so = inp->inp_socket;
	struct sockbuf *sb = &so->so_snd;
	int tx_credits, shove, compl, sowwakeup;
	struct ofld_tx_sdesc *txsd;
	bool aiotx_mbuf_seen;

	INP_WLOCK_ASSERT(inp);
	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));

	KASSERT(toep->ulp_mode == ULP_MODE_NONE ||
	    toep->ulp_mode == ULP_MODE_TCPDDP ||
	    toep->ulp_mode == ULP_MODE_RDMA,
	    ("%s: ulp_mode %u for toep %p", __func__, toep->ulp_mode, toep));

#ifdef VERBOSE_TRACES
	CTR5(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d",
	    __func__, toep->tid, toep->flags, tp->t_flags, drop);
#endif
	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
		return;

#ifdef RATELIMIT
	if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) &&
	    (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) {
		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
	}
#endif

	/*
	 * This function doesn't resume by itself. Someone else must clear the
	 * flag and call this function.
	 */
	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
		KASSERT(drop == 0,
		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
		return;
	}

	txsd = &toep->txsd[toep->txsd_pidx];
	do {
		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
		max_imm = max_imm_payload(tx_credits);
		max_nsegs = max_dsgl_nsegs(tx_credits);

		SOCKBUF_LOCK(sb);
		sowwakeup = drop;
		if (drop) {
			sbdrop_locked(sb, drop);
			drop = 0;
		}
		sb_sndptr = sb->sb_sndptr;
		sndptr = sb_sndptr ?
sb_sndptr->m_next : sb->sb_mb; 733 plen = 0; 734 nsegs = 0; 735 max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */ 736 aiotx_mbuf_seen = false; 737 for (m = sndptr; m != NULL; m = m->m_next) { 738 int n; 739 740 if (IS_AIOTX_MBUF(m)) 741 n = sglist_count_vmpages(aiotx_mbuf_pages(m), 742 aiotx_mbuf_pgoff(m), m->m_len); 743 else 744 n = sglist_count(mtod(m, void *), m->m_len); 745 746 nsegs += n; 747 plen += m->m_len; 748 749 /* This mbuf sent us _over_ the nsegs limit, back out */ 750 if (plen > max_imm && nsegs > max_nsegs) { 751 nsegs -= n; 752 plen -= m->m_len; 753 if (plen == 0) { 754 /* Too few credits */ 755 toep->flags |= TPF_TX_SUSPENDED; 756 if (sowwakeup) { 757 if (!TAILQ_EMPTY( 758 &toep->aiotx_jobq)) 759 t4_aiotx_queue_toep( 760 toep); 761 sowwakeup_locked(so); 762 } else 763 SOCKBUF_UNLOCK(sb); 764 SOCKBUF_UNLOCK_ASSERT(sb); 765 return; 766 } 767 break; 768 } 769 770 if (IS_AIOTX_MBUF(m)) 771 aiotx_mbuf_seen = true; 772 if (max_nsegs_1mbuf < n) 773 max_nsegs_1mbuf = n; 774 sb_sndptr = m; /* new sb->sb_sndptr if all goes well */ 775 776 /* This mbuf put us right at the max_nsegs limit */ 777 if (plen > max_imm && nsegs == max_nsegs) { 778 m = m->m_next; 779 break; 780 } 781 } 782 783 if (sbused(sb) > sb->sb_hiwat * 5 / 8 && 784 toep->plen_nocompl + plen >= sb->sb_hiwat / 4) 785 compl = 1; 786 else 787 compl = 0; 788 789 if (sb->sb_flags & SB_AUTOSIZE && 790 V_tcp_do_autosndbuf && 791 sb->sb_hiwat < V_tcp_autosndbuf_max && 792 sbused(sb) >= sb->sb_hiwat * 7 / 8) { 793 int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc, 794 V_tcp_autosndbuf_max); 795 796 if (!sbreserve_locked(sb, newsize, so, NULL)) 797 sb->sb_flags &= ~SB_AUTOSIZE; 798 else 799 sowwakeup = 1; /* room available */ 800 } 801 if (sowwakeup) { 802 if (!TAILQ_EMPTY(&toep->aiotx_jobq)) 803 t4_aiotx_queue_toep(toep); 804 sowwakeup_locked(so); 805 } else 806 SOCKBUF_UNLOCK(sb); 807 SOCKBUF_UNLOCK_ASSERT(sb); 808 809 /* nothing to send */ 810 if (plen == 0) { 811 KASSERT(m == NULL, 812 ("%s: nothing to send, but m != NULL", __func__)); 813 break; 814 } 815 816 if (__predict_false(toep->flags & TPF_FIN_SENT)) 817 panic("%s: excess tx.", __func__); 818 819 shove = m == NULL && !(tp->t_flags & TF_MORETOCOME); 820 if (plen <= max_imm && !aiotx_mbuf_seen) { 821 822 /* Immediate data tx */ 823 824 wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16), 825 toep->ofld_txq); 826 if (wr == NULL) { 827 /* XXX: how will we recover from this? */ 828 toep->flags |= TPF_TX_SUSPENDED; 829 return; 830 } 831 txwr = wrtod(wr); 832 credits = howmany(wr->wr_len, 16); 833 write_tx_wr(txwr, toep, plen, plen, credits, shove, 0, 834 sc->tt.tx_align); 835 m_copydata(sndptr, 0, plen, (void *)(txwr + 1)); 836 nsegs = 0; 837 } else { 838 int wr_len; 839 840 /* DSGL tx */ 841 842 wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) + 843 ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8; 844 wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq); 845 if (wr == NULL) { 846 /* XXX: how will we recover from this? 
*/ 847 toep->flags |= TPF_TX_SUSPENDED; 848 return; 849 } 850 txwr = wrtod(wr); 851 credits = howmany(wr_len, 16); 852 write_tx_wr(txwr, toep, 0, plen, credits, shove, 0, 853 sc->tt.tx_align); 854 write_tx_sgl(txwr + 1, sndptr, m, nsegs, 855 max_nsegs_1mbuf); 856 if (wr_len & 0xf) { 857 uint64_t *pad = (uint64_t *) 858 ((uintptr_t)txwr + wr_len); 859 *pad = 0; 860 } 861 } 862 863 KASSERT(toep->tx_credits >= credits, 864 ("%s: not enough credits", __func__)); 865 866 toep->tx_credits -= credits; 867 toep->tx_nocompl += credits; 868 toep->plen_nocompl += plen; 869 if (toep->tx_credits <= toep->tx_total * 3 / 8 && 870 toep->tx_nocompl >= toep->tx_total / 4) 871 compl = 1; 872 873 if (compl || toep->ulp_mode == ULP_MODE_RDMA) { 874 txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL); 875 toep->tx_nocompl = 0; 876 toep->plen_nocompl = 0; 877 } 878 879 tp->snd_nxt += plen; 880 tp->snd_max += plen; 881 882 SOCKBUF_LOCK(sb); 883 KASSERT(sb_sndptr, ("%s: sb_sndptr is NULL", __func__)); 884 sb->sb_sndptr = sb_sndptr; 885 SOCKBUF_UNLOCK(sb); 886 887 toep->flags |= TPF_TX_DATA_SENT; 888 if (toep->tx_credits < MIN_OFLD_TX_CREDITS) 889 toep->flags |= TPF_TX_SUSPENDED; 890 891 KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); 892 txsd->plen = plen; 893 txsd->tx_credits = credits; 894 txsd++; 895 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) { 896 toep->txsd_pidx = 0; 897 txsd = &toep->txsd[0]; 898 } 899 toep->txsd_avail--; 900 901 t4_l2t_send(sc, wr, toep->l2te); 902 } while (m != NULL); 903 904 /* Send a FIN if requested, but only if there's no more data to send */ 905 if (m == NULL && toep->flags & TPF_SEND_FIN) 906 close_conn(sc, toep); 907 } 908 909 static inline void 910 rqdrop_locked(struct mbufq *q, int plen) 911 { 912 struct mbuf *m; 913 914 while (plen > 0) { 915 m = mbufq_dequeue(q); 916 917 /* Too many credits. */ 918 MPASS(m != NULL); 919 M_ASSERTPKTHDR(m); 920 921 /* Partial credits. */ 922 MPASS(plen >= m->m_pkthdr.len); 923 924 plen -= m->m_pkthdr.len; 925 m_freem(m); 926 } 927 } 928 929 void 930 t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop) 931 { 932 struct mbuf *sndptr, *m; 933 struct fw_ofld_tx_data_wr *txwr; 934 struct wrqe *wr; 935 u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf; 936 u_int adjusted_plen, ulp_submode; 937 struct inpcb *inp = toep->inp; 938 struct tcpcb *tp = intotcpcb(inp); 939 int tx_credits, shove; 940 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; 941 struct mbufq *pduq = &toep->ulp_pduq; 942 static const u_int ulp_extra_len[] = {0, 4, 4, 8}; 943 944 INP_WLOCK_ASSERT(inp); 945 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 946 ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid)); 947 KASSERT(toep->ulp_mode == ULP_MODE_ISCSI, 948 ("%s: ulp_mode %u for toep %p", __func__, toep->ulp_mode, toep)); 949 950 if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) 951 return; 952 953 /* 954 * This function doesn't resume by itself. Someone else must clear the 955 * flag and call this function. 
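	 * Note that for iSCSI the 'drop' byte count is reclaimed below from
	 * ulp_pdu_reclaimq (whole PDUs) rather than from so_snd.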
956 */ 957 if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) { 958 KASSERT(drop == 0, 959 ("%s: drop (%d) != 0 but tx is suspended", __func__, drop)); 960 return; 961 } 962 963 if (drop) 964 rqdrop_locked(&toep->ulp_pdu_reclaimq, drop); 965 966 while ((sndptr = mbufq_first(pduq)) != NULL) { 967 M_ASSERTPKTHDR(sndptr); 968 969 tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS); 970 max_imm = max_imm_payload(tx_credits); 971 max_nsegs = max_dsgl_nsegs(tx_credits); 972 973 plen = 0; 974 nsegs = 0; 975 max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */ 976 for (m = sndptr; m != NULL; m = m->m_next) { 977 int n = sglist_count(mtod(m, void *), m->m_len); 978 979 nsegs += n; 980 plen += m->m_len; 981 982 /* 983 * This mbuf would send us _over_ the nsegs limit. 984 * Suspend tx because the PDU can't be sent out. 985 */ 986 if (plen > max_imm && nsegs > max_nsegs) { 987 toep->flags |= TPF_TX_SUSPENDED; 988 return; 989 } 990 991 if (max_nsegs_1mbuf < n) 992 max_nsegs_1mbuf = n; 993 } 994 995 if (__predict_false(toep->flags & TPF_FIN_SENT)) 996 panic("%s: excess tx.", __func__); 997 998 /* 999 * We have a PDU to send. All of it goes out in one WR so 'm' 1000 * is NULL. A PDU's length is always a multiple of 4. 1001 */ 1002 MPASS(m == NULL); 1003 MPASS((plen & 3) == 0); 1004 MPASS(sndptr->m_pkthdr.len == plen); 1005 1006 shove = !(tp->t_flags & TF_MORETOCOME); 1007 ulp_submode = mbuf_ulp_submode(sndptr); 1008 MPASS(ulp_submode < nitems(ulp_extra_len)); 1009 1010 /* 1011 * plen doesn't include header and data digests, which are 1012 * generated and inserted in the right places by the TOE, but 1013 * they do occupy TCP sequence space and need to be accounted 1014 * for. 1015 */ 1016 adjusted_plen = plen + ulp_extra_len[ulp_submode]; 1017 if (plen <= max_imm) { 1018 1019 /* Immediate data tx */ 1020 1021 wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16), 1022 toep->ofld_txq); 1023 if (wr == NULL) { 1024 /* XXX: how will we recover from this? */ 1025 toep->flags |= TPF_TX_SUSPENDED; 1026 return; 1027 } 1028 txwr = wrtod(wr); 1029 credits = howmany(wr->wr_len, 16); 1030 write_tx_wr(txwr, toep, plen, adjusted_plen, credits, 1031 shove, ulp_submode, sc->tt.tx_align); 1032 m_copydata(sndptr, 0, plen, (void *)(txwr + 1)); 1033 nsegs = 0; 1034 } else { 1035 int wr_len; 1036 1037 /* DSGL tx */ 1038 wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) + 1039 ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8; 1040 wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq); 1041 if (wr == NULL) { 1042 /* XXX: how will we recover from this? 
*/ 1043 toep->flags |= TPF_TX_SUSPENDED; 1044 return; 1045 } 1046 txwr = wrtod(wr); 1047 credits = howmany(wr_len, 16); 1048 write_tx_wr(txwr, toep, 0, adjusted_plen, credits, 1049 shove, ulp_submode, sc->tt.tx_align); 1050 write_tx_sgl(txwr + 1, sndptr, m, nsegs, 1051 max_nsegs_1mbuf); 1052 if (wr_len & 0xf) { 1053 uint64_t *pad = (uint64_t *) 1054 ((uintptr_t)txwr + wr_len); 1055 *pad = 0; 1056 } 1057 } 1058 1059 KASSERT(toep->tx_credits >= credits, 1060 ("%s: not enough credits", __func__)); 1061 1062 m = mbufq_dequeue(pduq); 1063 MPASS(m == sndptr); 1064 mbufq_enqueue(&toep->ulp_pdu_reclaimq, m); 1065 1066 toep->tx_credits -= credits; 1067 toep->tx_nocompl += credits; 1068 toep->plen_nocompl += plen; 1069 if (toep->tx_credits <= toep->tx_total * 3 / 8 && 1070 toep->tx_nocompl >= toep->tx_total / 4) { 1071 txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL); 1072 toep->tx_nocompl = 0; 1073 toep->plen_nocompl = 0; 1074 } 1075 1076 tp->snd_nxt += adjusted_plen; 1077 tp->snd_max += adjusted_plen; 1078 1079 toep->flags |= TPF_TX_DATA_SENT; 1080 if (toep->tx_credits < MIN_OFLD_TX_CREDITS) 1081 toep->flags |= TPF_TX_SUSPENDED; 1082 1083 KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); 1084 txsd->plen = plen; 1085 txsd->tx_credits = credits; 1086 txsd++; 1087 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) { 1088 toep->txsd_pidx = 0; 1089 txsd = &toep->txsd[0]; 1090 } 1091 toep->txsd_avail--; 1092 1093 t4_l2t_send(sc, wr, toep->l2te); 1094 } 1095 1096 /* Send a FIN if requested, but only if there are no more PDUs to send */ 1097 if (mbufq_first(pduq) == NULL && toep->flags & TPF_SEND_FIN) 1098 close_conn(sc, toep); 1099 } 1100 1101 int 1102 t4_tod_output(struct toedev *tod, struct tcpcb *tp) 1103 { 1104 struct adapter *sc = tod->tod_softc; 1105 #ifdef INVARIANTS 1106 struct inpcb *inp = tp->t_inpcb; 1107 #endif 1108 struct toepcb *toep = tp->t_toe; 1109 1110 INP_WLOCK_ASSERT(inp); 1111 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1112 ("%s: inp %p dropped.", __func__, inp)); 1113 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 1114 1115 if (toep->ulp_mode == ULP_MODE_ISCSI) 1116 t4_push_pdus(sc, toep, 0); 1117 else 1118 t4_push_frames(sc, toep, 0); 1119 1120 return (0); 1121 } 1122 1123 int 1124 t4_send_fin(struct toedev *tod, struct tcpcb *tp) 1125 { 1126 struct adapter *sc = tod->tod_softc; 1127 #ifdef INVARIANTS 1128 struct inpcb *inp = tp->t_inpcb; 1129 #endif 1130 struct toepcb *toep = tp->t_toe; 1131 1132 INP_WLOCK_ASSERT(inp); 1133 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1134 ("%s: inp %p dropped.", __func__, inp)); 1135 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 1136 1137 toep->flags |= TPF_SEND_FIN; 1138 if (tp->t_state >= TCPS_ESTABLISHED) { 1139 if (toep->ulp_mode == ULP_MODE_ISCSI) 1140 t4_push_pdus(sc, toep, 0); 1141 else 1142 t4_push_frames(sc, toep, 0); 1143 } 1144 1145 return (0); 1146 } 1147 1148 int 1149 t4_send_rst(struct toedev *tod, struct tcpcb *tp) 1150 { 1151 struct adapter *sc = tod->tod_softc; 1152 #if defined(INVARIANTS) 1153 struct inpcb *inp = tp->t_inpcb; 1154 #endif 1155 struct toepcb *toep = tp->t_toe; 1156 1157 INP_WLOCK_ASSERT(inp); 1158 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1159 ("%s: inp %p dropped.", __func__, inp)); 1160 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 1161 1162 /* hmmmm */ 1163 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 1164 ("%s: flowc for tid %u [%s] not sent already", 1165 __func__, toep->tid, tcpstates[tp->t_state])); 1166 1167 send_reset(sc, toep, 0); 1168 return (0); 1169 } 1170 1171 
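/*
 * The handlers below deal with connection teardown: CPL_PEER_CLOSE for the
 * peer's FIN, CPL_CLOSE_CON_RPL for the ack of the FIN sent by close_conn(),
 * and CPL_ABORT_REQ_RSS/CPL_ABORT_RPL_RSS for aborts.
 */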
/* 1172 * Peer has sent us a FIN. 1173 */ 1174 static int 1175 do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1176 { 1177 struct adapter *sc = iq->adapter; 1178 const struct cpl_peer_close *cpl = (const void *)(rss + 1); 1179 unsigned int tid = GET_TID(cpl); 1180 struct toepcb *toep = lookup_tid(sc, tid); 1181 struct inpcb *inp = toep->inp; 1182 struct tcpcb *tp = NULL; 1183 struct socket *so; 1184 #ifdef INVARIANTS 1185 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1186 #endif 1187 1188 KASSERT(opcode == CPL_PEER_CLOSE, 1189 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1190 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1191 1192 if (__predict_false(toep->flags & TPF_SYNQE)) { 1193 #ifdef INVARIANTS 1194 struct synq_entry *synqe = (void *)toep; 1195 1196 INP_WLOCK(synqe->lctx->inp); 1197 if (synqe->flags & TPF_SYNQE_HAS_L2TE) { 1198 KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN, 1199 ("%s: listen socket closed but tid %u not aborted.", 1200 __func__, tid)); 1201 } else { 1202 /* 1203 * do_pass_accept_req is still running and will 1204 * eventually take care of this tid. 1205 */ 1206 } 1207 INP_WUNLOCK(synqe->lctx->inp); 1208 #endif 1209 CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid, 1210 toep, toep->flags); 1211 return (0); 1212 } 1213 1214 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1215 1216 CURVNET_SET(toep->vnet); 1217 INP_INFO_RLOCK(&V_tcbinfo); 1218 INP_WLOCK(inp); 1219 tp = intotcpcb(inp); 1220 1221 CTR5(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x, inp %p", __func__, 1222 tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags, inp); 1223 1224 if (toep->flags & TPF_ABORT_SHUTDOWN) 1225 goto done; 1226 1227 tp->rcv_nxt++; /* FIN */ 1228 1229 so = inp->inp_socket; 1230 if (toep->ulp_mode == ULP_MODE_TCPDDP) { 1231 DDP_LOCK(toep); 1232 if (__predict_false(toep->ddp_flags & 1233 (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE))) 1234 handle_ddp_close(toep, tp, cpl->rcv_nxt); 1235 DDP_UNLOCK(toep); 1236 } 1237 socantrcvmore(so); 1238 1239 if (toep->ulp_mode != ULP_MODE_RDMA) { 1240 KASSERT(tp->rcv_nxt == be32toh(cpl->rcv_nxt), 1241 ("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt, 1242 be32toh(cpl->rcv_nxt))); 1243 } 1244 1245 switch (tp->t_state) { 1246 case TCPS_SYN_RECEIVED: 1247 tp->t_starttime = ticks; 1248 /* FALLTHROUGH */ 1249 1250 case TCPS_ESTABLISHED: 1251 tp->t_state = TCPS_CLOSE_WAIT; 1252 break; 1253 1254 case TCPS_FIN_WAIT_1: 1255 tp->t_state = TCPS_CLOSING; 1256 break; 1257 1258 case TCPS_FIN_WAIT_2: 1259 tcp_twstart(tp); 1260 INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ 1261 INP_INFO_RUNLOCK(&V_tcbinfo); 1262 CURVNET_RESTORE(); 1263 1264 INP_WLOCK(inp); 1265 final_cpl_received(toep); 1266 return (0); 1267 1268 default: 1269 log(LOG_ERR, "%s: TID %u received CPL_PEER_CLOSE in state %d\n", 1270 __func__, tid, tp->t_state); 1271 } 1272 done: 1273 INP_WUNLOCK(inp); 1274 INP_INFO_RUNLOCK(&V_tcbinfo); 1275 CURVNET_RESTORE(); 1276 return (0); 1277 } 1278 1279 /* 1280 * Peer has ACK'd our FIN. 
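 * The snd_nxt reported by the chip covers the FIN, so snd_una is set to
 * snd_nxt - 1 below to keep the FIN out of the acked byte count.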
1281 */ 1282 static int 1283 do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss, 1284 struct mbuf *m) 1285 { 1286 struct adapter *sc = iq->adapter; 1287 const struct cpl_close_con_rpl *cpl = (const void *)(rss + 1); 1288 unsigned int tid = GET_TID(cpl); 1289 struct toepcb *toep = lookup_tid(sc, tid); 1290 struct inpcb *inp = toep->inp; 1291 struct tcpcb *tp = NULL; 1292 struct socket *so = NULL; 1293 #ifdef INVARIANTS 1294 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1295 #endif 1296 1297 KASSERT(opcode == CPL_CLOSE_CON_RPL, 1298 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1299 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1300 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1301 1302 CURVNET_SET(toep->vnet); 1303 INP_INFO_RLOCK(&V_tcbinfo); 1304 INP_WLOCK(inp); 1305 tp = intotcpcb(inp); 1306 1307 CTR4(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x", 1308 __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags); 1309 1310 if (toep->flags & TPF_ABORT_SHUTDOWN) 1311 goto done; 1312 1313 so = inp->inp_socket; 1314 tp->snd_una = be32toh(cpl->snd_nxt) - 1; /* exclude FIN */ 1315 1316 switch (tp->t_state) { 1317 case TCPS_CLOSING: /* see TCPS_FIN_WAIT_2 in do_peer_close too */ 1318 tcp_twstart(tp); 1319 release: 1320 INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ 1321 INP_INFO_RUNLOCK(&V_tcbinfo); 1322 CURVNET_RESTORE(); 1323 1324 INP_WLOCK(inp); 1325 final_cpl_received(toep); /* no more CPLs expected */ 1326 1327 return (0); 1328 case TCPS_LAST_ACK: 1329 if (tcp_close(tp)) 1330 INP_WUNLOCK(inp); 1331 goto release; 1332 1333 case TCPS_FIN_WAIT_1: 1334 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 1335 soisdisconnected(so); 1336 tp->t_state = TCPS_FIN_WAIT_2; 1337 break; 1338 1339 default: 1340 log(LOG_ERR, 1341 "%s: TID %u received CPL_CLOSE_CON_RPL in state %s\n", 1342 __func__, tid, tcpstates[tp->t_state]); 1343 } 1344 done: 1345 INP_WUNLOCK(inp); 1346 INP_INFO_RUNLOCK(&V_tcbinfo); 1347 CURVNET_RESTORE(); 1348 return (0); 1349 } 1350 1351 void 1352 send_abort_rpl(struct adapter *sc, struct sge_wrq *ofld_txq, int tid, 1353 int rst_status) 1354 { 1355 struct wrqe *wr; 1356 struct cpl_abort_rpl *cpl; 1357 1358 wr = alloc_wrqe(sizeof(*cpl), ofld_txq); 1359 if (wr == NULL) { 1360 /* XXX */ 1361 panic("%s: allocation failure.", __func__); 1362 } 1363 cpl = wrtod(wr); 1364 1365 INIT_TP_WR_MIT_CPL(cpl, CPL_ABORT_RPL, tid); 1366 cpl->cmd = rst_status; 1367 1368 t4_wrq_tx(sc, wr); 1369 } 1370 1371 static int 1372 abort_status_to_errno(struct tcpcb *tp, unsigned int abort_reason) 1373 { 1374 switch (abort_reason) { 1375 case CPL_ERR_BAD_SYN: 1376 case CPL_ERR_CONN_RESET: 1377 return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET); 1378 case CPL_ERR_XMIT_TIMEDOUT: 1379 case CPL_ERR_PERSIST_TIMEDOUT: 1380 case CPL_ERR_FINWAIT2_TIMEDOUT: 1381 case CPL_ERR_KEEPALIVE_TIMEDOUT: 1382 return (ETIMEDOUT); 1383 default: 1384 return (EIO); 1385 } 1386 } 1387 1388 /* 1389 * TCP RST from the peer, timeout, or some other such critical error. 
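 * The handler always owes the hardware a CPL_ABORT_RPL; the abort reason is
 * translated into a socket error by abort_status_to_errno().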
1390 */ 1391 static int 1392 do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1393 { 1394 struct adapter *sc = iq->adapter; 1395 const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1); 1396 unsigned int tid = GET_TID(cpl); 1397 struct toepcb *toep = lookup_tid(sc, tid); 1398 struct sge_wrq *ofld_txq = toep->ofld_txq; 1399 struct inpcb *inp; 1400 struct tcpcb *tp; 1401 #ifdef INVARIANTS 1402 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1403 #endif 1404 1405 KASSERT(opcode == CPL_ABORT_REQ_RSS, 1406 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1407 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1408 1409 if (toep->flags & TPF_SYNQE) 1410 return (do_abort_req_synqe(iq, rss, m)); 1411 1412 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1413 1414 if (negative_advice(cpl->status)) { 1415 CTR4(KTR_CXGBE, "%s: negative advice %d for tid %d (0x%x)", 1416 __func__, cpl->status, tid, toep->flags); 1417 return (0); /* Ignore negative advice */ 1418 } 1419 1420 inp = toep->inp; 1421 CURVNET_SET(toep->vnet); 1422 INP_INFO_RLOCK(&V_tcbinfo); /* for tcp_close */ 1423 INP_WLOCK(inp); 1424 1425 tp = intotcpcb(inp); 1426 1427 CTR6(KTR_CXGBE, 1428 "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x, status %d", 1429 __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags, 1430 inp->inp_flags, cpl->status); 1431 1432 /* 1433 * If we'd initiated an abort earlier the reply to it is responsible for 1434 * cleaning up resources. Otherwise we tear everything down right here 1435 * right now. We owe the T4 a CPL_ABORT_RPL no matter what. 1436 */ 1437 if (toep->flags & TPF_ABORT_SHUTDOWN) { 1438 INP_WUNLOCK(inp); 1439 goto done; 1440 } 1441 toep->flags |= TPF_ABORT_SHUTDOWN; 1442 1443 if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) { 1444 struct socket *so = inp->inp_socket; 1445 1446 if (so != NULL) 1447 so_error_set(so, abort_status_to_errno(tp, 1448 cpl->status)); 1449 tp = tcp_close(tp); 1450 if (tp == NULL) 1451 INP_WLOCK(inp); /* re-acquire */ 1452 } 1453 1454 final_cpl_received(toep); 1455 done: 1456 INP_INFO_RUNLOCK(&V_tcbinfo); 1457 CURVNET_RESTORE(); 1458 send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST); 1459 return (0); 1460 } 1461 1462 /* 1463 * Reply to the CPL_ABORT_REQ (send_reset) 1464 */ 1465 static int 1466 do_abort_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1467 { 1468 struct adapter *sc = iq->adapter; 1469 const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1); 1470 unsigned int tid = GET_TID(cpl); 1471 struct toepcb *toep = lookup_tid(sc, tid); 1472 struct inpcb *inp = toep->inp; 1473 #ifdef INVARIANTS 1474 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1475 #endif 1476 1477 KASSERT(opcode == CPL_ABORT_RPL_RSS, 1478 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1479 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1480 1481 if (toep->flags & TPF_SYNQE) 1482 return (do_abort_rpl_synqe(iq, rss, m)); 1483 1484 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1485 1486 CTR5(KTR_CXGBE, "%s: tid %u, toep %p, inp %p, status %d", 1487 __func__, tid, toep, inp, cpl->status); 1488 1489 KASSERT(toep->flags & TPF_ABORT_SHUTDOWN, 1490 ("%s: wasn't expecting abort reply", __func__)); 1491 1492 INP_WLOCK(inp); 1493 final_cpl_received(toep); 1494 1495 return (0); 1496 } 1497 1498 static int 1499 do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1500 { 1501 struct adapter *sc = iq->adapter; 
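	/*
	 * Unlike most handlers in this file, CPL_RX_DATA arrives with its
	 * payload in 'm'; the CPL header is read in place and stripped off
	 * with m_adj() before the data is appended to the receive buffer.
	 */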
1502 const struct cpl_rx_data *cpl = mtod(m, const void *); 1503 unsigned int tid = GET_TID(cpl); 1504 struct toepcb *toep = lookup_tid(sc, tid); 1505 struct inpcb *inp = toep->inp; 1506 struct tcpcb *tp; 1507 struct socket *so; 1508 struct sockbuf *sb; 1509 int len; 1510 uint32_t ddp_placed = 0; 1511 1512 if (__predict_false(toep->flags & TPF_SYNQE)) { 1513 #ifdef INVARIANTS 1514 struct synq_entry *synqe = (void *)toep; 1515 1516 INP_WLOCK(synqe->lctx->inp); 1517 if (synqe->flags & TPF_SYNQE_HAS_L2TE) { 1518 KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN, 1519 ("%s: listen socket closed but tid %u not aborted.", 1520 __func__, tid)); 1521 } else { 1522 /* 1523 * do_pass_accept_req is still running and will 1524 * eventually take care of this tid. 1525 */ 1526 } 1527 INP_WUNLOCK(synqe->lctx->inp); 1528 #endif 1529 CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid, 1530 toep, toep->flags); 1531 m_freem(m); 1532 return (0); 1533 } 1534 1535 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1536 1537 /* strip off CPL header */ 1538 m_adj(m, sizeof(*cpl)); 1539 len = m->m_pkthdr.len; 1540 1541 INP_WLOCK(inp); 1542 if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) { 1543 CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x", 1544 __func__, tid, len, inp->inp_flags); 1545 INP_WUNLOCK(inp); 1546 m_freem(m); 1547 return (0); 1548 } 1549 1550 tp = intotcpcb(inp); 1551 1552 if (__predict_false(tp->rcv_nxt != be32toh(cpl->seq))) 1553 ddp_placed = be32toh(cpl->seq) - tp->rcv_nxt; 1554 1555 tp->rcv_nxt += len; 1556 if (tp->rcv_wnd < len) { 1557 KASSERT(toep->ulp_mode == ULP_MODE_RDMA, 1558 ("%s: negative window size", __func__)); 1559 } 1560 1561 tp->rcv_wnd -= len; 1562 tp->t_rcvtime = ticks; 1563 1564 if (toep->ulp_mode == ULP_MODE_TCPDDP) 1565 DDP_LOCK(toep); 1566 so = inp_inpcbtosocket(inp); 1567 sb = &so->so_rcv; 1568 SOCKBUF_LOCK(sb); 1569 1570 if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) { 1571 CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)", 1572 __func__, tid, len); 1573 m_freem(m); 1574 SOCKBUF_UNLOCK(sb); 1575 if (toep->ulp_mode == ULP_MODE_TCPDDP) 1576 DDP_UNLOCK(toep); 1577 INP_WUNLOCK(inp); 1578 1579 CURVNET_SET(toep->vnet); 1580 INP_INFO_RLOCK(&V_tcbinfo); 1581 INP_WLOCK(inp); 1582 tp = tcp_drop(tp, ECONNRESET); 1583 if (tp) 1584 INP_WUNLOCK(inp); 1585 INP_INFO_RUNLOCK(&V_tcbinfo); 1586 CURVNET_RESTORE(); 1587 1588 return (0); 1589 } 1590 1591 /* receive buffer autosize */ 1592 MPASS(toep->vnet == so->so_vnet); 1593 CURVNET_SET(toep->vnet); 1594 if (sb->sb_flags & SB_AUTOSIZE && 1595 V_tcp_do_autorcvbuf && 1596 sb->sb_hiwat < V_tcp_autorcvbuf_max && 1597 len > (sbspace(sb) / 8 * 7)) { 1598 unsigned int hiwat = sb->sb_hiwat; 1599 unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc, 1600 V_tcp_autorcvbuf_max); 1601 1602 if (!sbreserve_locked(sb, newsize, so, NULL)) 1603 sb->sb_flags &= ~SB_AUTOSIZE; 1604 else 1605 toep->rx_credits += newsize - hiwat; 1606 } 1607 1608 if (toep->ddp_waiting_count != 0 || toep->ddp_active_count != 0) 1609 CTR3(KTR_CXGBE, "%s: tid %u, non-ddp rx (%d bytes)", __func__, 1610 tid, len); 1611 1612 if (toep->ulp_mode == ULP_MODE_TCPDDP) { 1613 int changed = !(toep->ddp_flags & DDP_ON) ^ cpl->ddp_off; 1614 1615 if (changed) { 1616 if (toep->ddp_flags & DDP_SC_REQ) 1617 toep->ddp_flags ^= DDP_ON | DDP_SC_REQ; 1618 else { 1619 KASSERT(cpl->ddp_off == 1, 1620 ("%s: DDP switched on by itself.", 1621 __func__)); 1622 1623 /* Fell out of DDP mode */ 1624 toep->ddp_flags &= ~DDP_ON; 1625 CTR1(KTR_CXGBE, "%s: fell out of DDP mode", 
1626 __func__); 1627 1628 insert_ddp_data(toep, ddp_placed); 1629 } 1630 } 1631 1632 if (toep->ddp_flags & DDP_ON) { 1633 /* 1634 * CPL_RX_DATA with DDP on can only be an indicate. 1635 * Start posting queued AIO requests via DDP. The 1636 * payload that arrived in this indicate is appended 1637 * to the socket buffer as usual. 1638 */ 1639 handle_ddp_indicate(toep); 1640 } 1641 } 1642 1643 KASSERT(toep->sb_cc >= sbused(sb), 1644 ("%s: sb %p has more data (%d) than last time (%d).", 1645 __func__, sb, sbused(sb), toep->sb_cc)); 1646 toep->rx_credits += toep->sb_cc - sbused(sb); 1647 sbappendstream_locked(sb, m, 0); 1648 toep->sb_cc = sbused(sb); 1649 if (toep->rx_credits > 0 && toep->sb_cc + tp->rcv_wnd < sb->sb_lowat) { 1650 int credits; 1651 1652 credits = send_rx_credits(sc, toep, toep->rx_credits); 1653 toep->rx_credits -= credits; 1654 tp->rcv_wnd += credits; 1655 tp->rcv_adv += credits; 1656 } 1657 1658 if (toep->ddp_waiting_count > 0 && sbavail(sb) != 0) { 1659 CTR2(KTR_CXGBE, "%s: tid %u queueing AIO task", __func__, 1660 tid); 1661 ddp_queue_toep(toep); 1662 } 1663 sorwakeup_locked(so); 1664 SOCKBUF_UNLOCK_ASSERT(sb); 1665 if (toep->ulp_mode == ULP_MODE_TCPDDP) 1666 DDP_UNLOCK(toep); 1667 1668 INP_WUNLOCK(inp); 1669 CURVNET_RESTORE(); 1670 return (0); 1671 } 1672 1673 #define S_CPL_FW4_ACK_OPCODE 24 1674 #define M_CPL_FW4_ACK_OPCODE 0xff 1675 #define V_CPL_FW4_ACK_OPCODE(x) ((x) << S_CPL_FW4_ACK_OPCODE) 1676 #define G_CPL_FW4_ACK_OPCODE(x) \ 1677 (((x) >> S_CPL_FW4_ACK_OPCODE) & M_CPL_FW4_ACK_OPCODE) 1678 1679 #define S_CPL_FW4_ACK_FLOWID 0 1680 #define M_CPL_FW4_ACK_FLOWID 0xffffff 1681 #define V_CPL_FW4_ACK_FLOWID(x) ((x) << S_CPL_FW4_ACK_FLOWID) 1682 #define G_CPL_FW4_ACK_FLOWID(x) \ 1683 (((x) >> S_CPL_FW4_ACK_FLOWID) & M_CPL_FW4_ACK_FLOWID) 1684 1685 #define S_CPL_FW4_ACK_CR 24 1686 #define M_CPL_FW4_ACK_CR 0xff 1687 #define V_CPL_FW4_ACK_CR(x) ((x) << S_CPL_FW4_ACK_CR) 1688 #define G_CPL_FW4_ACK_CR(x) (((x) >> S_CPL_FW4_ACK_CR) & M_CPL_FW4_ACK_CR) 1689 1690 #define S_CPL_FW4_ACK_SEQVAL 0 1691 #define M_CPL_FW4_ACK_SEQVAL 0x1 1692 #define V_CPL_FW4_ACK_SEQVAL(x) ((x) << S_CPL_FW4_ACK_SEQVAL) 1693 #define G_CPL_FW4_ACK_SEQVAL(x) \ 1694 (((x) >> S_CPL_FW4_ACK_SEQVAL) & M_CPL_FW4_ACK_SEQVAL) 1695 #define F_CPL_FW4_ACK_SEQVAL V_CPL_FW4_ACK_SEQVAL(1U) 1696 1697 static int 1698 do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1699 { 1700 struct adapter *sc = iq->adapter; 1701 const struct cpl_fw4_ack *cpl = (const void *)(rss + 1); 1702 unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl))); 1703 struct toepcb *toep = lookup_tid(sc, tid); 1704 struct inpcb *inp; 1705 struct tcpcb *tp; 1706 struct socket *so; 1707 uint8_t credits = cpl->credits; 1708 struct ofld_tx_sdesc *txsd; 1709 int plen; 1710 #ifdef INVARIANTS 1711 unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl))); 1712 #endif 1713 1714 /* 1715 * Very unusual case: we'd sent a flowc + abort_req for a synq entry and 1716 * now this comes back carrying the credits for the flowc. 
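	 * There are no txsd entries to credit for a synq entry, so the
	 * handler only asserts that an abort is in progress and returns.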
 */
	if (__predict_false(toep->flags & TPF_SYNQE)) {
		KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
		    ("%s: credits for a synq entry %p", __func__, toep));
		return (0);
	}

	inp = toep->inp;

	KASSERT(opcode == CPL_FW4_ACK,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	INP_WLOCK(inp);

	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) {
		INP_WUNLOCK(inp);
		return (0);
	}

	KASSERT((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0,
	    ("%s: inp_flags 0x%x", __func__, inp->inp_flags));

	tp = intotcpcb(inp);

	if (cpl->flags & CPL_FW4_ACK_FLAGS_SEQVAL) {
		tcp_seq snd_una = be32toh(cpl->snd_una);

#ifdef INVARIANTS
		if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
			log(LOG_ERR,
			    "%s: unexpected seq# %x for TID %u, snd_una %x\n",
			    __func__, snd_una, toep->tid, tp->snd_una);
		}
#endif

		if (tp->snd_una != snd_una) {
			tp->snd_una = snd_una;
			tp->ts_recent_age = tcp_ts_getticks();
		}
	}

#ifdef VERBOSE_TRACES
	CTR3(KTR_CXGBE, "%s: tid %d credits %u", __func__, tid, credits);
#endif
	so = inp->inp_socket;
	txsd = &toep->txsd[toep->txsd_cidx];
	plen = 0;
	while (credits) {
		KASSERT(credits >= txsd->tx_credits,
		    ("%s: too many (or partial) credits", __func__));
		credits -= txsd->tx_credits;
		toep->tx_credits += txsd->tx_credits;
		plen += txsd->plen;
		txsd++;
		toep->txsd_avail++;
		KASSERT(toep->txsd_avail <= toep->txsd_total,
		    ("%s: txsd avail > total", __func__));
		if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) {
			txsd = &toep->txsd[0];
			toep->txsd_cidx = 0;
		}
	}

	if (toep->tx_credits == toep->tx_total) {
		toep->tx_nocompl = 0;
		toep->plen_nocompl = 0;
	}

	if (toep->flags & TPF_TX_SUSPENDED &&
	    toep->tx_credits >= toep->tx_total / 4) {
#ifdef VERBOSE_TRACES
		CTR2(KTR_CXGBE, "%s: tid %d calling t4_push_frames", __func__,
		    tid);
#endif
		toep->flags &= ~TPF_TX_SUSPENDED;
		CURVNET_SET(toep->vnet);
		if (toep->ulp_mode == ULP_MODE_ISCSI)
			t4_push_pdus(sc, toep, plen);
		else
			t4_push_frames(sc, toep, plen);
		CURVNET_RESTORE();
	} else if (plen > 0) {
		struct sockbuf *sb = &so->so_snd;
		int sbu;

		SOCKBUF_LOCK(sb);
		sbu = sbused(sb);
		if (toep->ulp_mode == ULP_MODE_ISCSI) {

			if (__predict_false(sbu > 0)) {
				/*
				 * The data transmitted before the tid's ULP
				 * mode changed to ISCSI is still in so_snd.
				 * Incoming credits should account for so_snd
				 * first.
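				 * E.g. with sbu bytes left in so_snd and plen
				 * bytes acked, min(sbu, plen) is dropped from
				 * so_snd and only the remainder is reclaimed
				 * from ulp_pdu_reclaimq.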
1814 */ 1815 sbdrop_locked(sb, min(sbu, plen)); 1816 plen -= min(sbu, plen); 1817 } 1818 sowwakeup_locked(so); /* unlocks so_snd */ 1819 rqdrop_locked(&toep->ulp_pdu_reclaimq, plen); 1820 } else { 1821 #ifdef VERBOSE_TRACES 1822 CTR3(KTR_CXGBE, "%s: tid %d dropped %d bytes", __func__, 1823 tid, plen); 1824 #endif 1825 sbdrop_locked(sb, plen); 1826 if (!TAILQ_EMPTY(&toep->aiotx_jobq)) 1827 t4_aiotx_queue_toep(toep); 1828 sowwakeup_locked(so); /* unlocks so_snd */ 1829 } 1830 SOCKBUF_UNLOCK_ASSERT(sb); 1831 } 1832 1833 INP_WUNLOCK(inp); 1834 1835 return (0); 1836 } 1837 1838 int 1839 do_set_tcb_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1840 { 1841 struct adapter *sc = iq->adapter; 1842 const struct cpl_set_tcb_rpl *cpl = (const void *)(rss + 1); 1843 unsigned int tid = GET_TID(cpl); 1844 struct toepcb *toep; 1845 #ifdef INVARIANTS 1846 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1847 #endif 1848 1849 KASSERT(opcode == CPL_SET_TCB_RPL, 1850 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1851 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1852 MPASS(iq != &sc->sge.fwq); 1853 1854 toep = lookup_tid(sc, tid); 1855 if (toep->ulp_mode == ULP_MODE_TCPDDP) { 1856 handle_ddp_tcb_rpl(toep, cpl); 1857 return (0); 1858 } 1859 1860 /* 1861 * TOM and/or other ULPs don't request replies for CPL_SET_TCB or 1862 * CPL_SET_TCB_FIELD requests. This can easily change and when it does 1863 * the dispatch code will go here. 1864 */ 1865 #ifdef INVARIANTS 1866 panic("%s: Unexpected CPL_SET_TCB_RPL for tid %u on iq %p", __func__, 1867 tid, iq); 1868 #else 1869 log(LOG_ERR, "%s: Unexpected CPL_SET_TCB_RPL for tid %u on iq %p\n", 1870 __func__, tid, iq); 1871 #endif 1872 1873 return (0); 1874 } 1875 1876 void 1877 t4_set_tcb_field(struct adapter *sc, struct sge_wrq *wrq, int tid, 1878 uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie, int iqid) 1879 { 1880 struct wrqe *wr; 1881 struct cpl_set_tcb_field *req; 1882 1883 MPASS((cookie & ~M_COOKIE) == 0); 1884 MPASS((iqid & ~M_QUEUENO) == 0); 1885 1886 wr = alloc_wrqe(sizeof(*req), wrq); 1887 if (wr == NULL) { 1888 /* XXX */ 1889 panic("%s: allocation failure.", __func__); 1890 } 1891 req = wrtod(wr); 1892 1893 INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, tid); 1894 req->reply_ctrl = htobe16(V_QUEUENO(iqid)); 1895 if (reply == 0) 1896 req->reply_ctrl |= htobe16(F_NO_REPLY); 1897 req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(cookie)); 1898 req->mask = htobe64(mask); 1899 req->val = htobe64(val); 1900 1901 t4_wrq_tx(sc, wr); 1902 } 1903 1904 void 1905 t4_init_cpl_io_handlers(void) 1906 { 1907 1908 t4_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close); 1909 t4_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl); 1910 t4_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req); 1911 t4_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl); 1912 t4_register_cpl_handler(CPL_RX_DATA, do_rx_data); 1913 t4_register_cpl_handler(CPL_FW4_ACK, do_fw4_ack); 1914 } 1915 1916 void 1917 t4_uninit_cpl_io_handlers(void) 1918 { 1919 1920 t4_register_cpl_handler(CPL_PEER_CLOSE, NULL); 1921 t4_register_cpl_handler(CPL_CLOSE_CON_RPL, NULL); 1922 t4_register_cpl_handler(CPL_ABORT_REQ_RSS, NULL); 1923 t4_register_cpl_handler(CPL_ABORT_RPL_RSS, NULL); 1924 t4_register_cpl_handler(CPL_RX_DATA, NULL); 1925 t4_register_cpl_handler(CPL_FW4_ACK, NULL); 1926 } 1927 1928 /* 1929 * Use the 'backend3' field in AIO jobs to store the amount of data 1930 * sent by the AIO job so far and the 'backend4' field to hold an 
 * error that should be reported when the job is completed.
 */
#define aio_sent backend3
#define aio_error backend4

#define jobtotid(job)						\
	(((struct toepcb *)(so_sototcpcb((job)->fd_file->f_data)->t_toe))->tid)

static void
free_aiotx_buffer(struct aiotx_buffer *ab)
{
	struct kaiocb *job;
	long status;
	int error;

	if (refcount_release(&ab->refcount) == 0)
		return;

	job = ab->job;
	error = job->aio_error;
	status = job->aio_sent;
	vm_page_unhold_pages(ab->ps.pages, ab->ps.npages);
	free(ab, M_CXGBE);
#ifdef VERBOSE_TRACES
	CTR5(KTR_CXGBE, "%s: tid %d completed %p len %ld, error %d", __func__,
	    jobtotid(job), job, status, error);
#endif
	if (error == ECANCELED && status != 0)
		error = 0;
	if (error == ECANCELED)
		aio_cancel(job);
	else if (error)
		aio_complete(job, -1, error);
	else
		aio_complete(job, status, 0);
}

static void
t4_aiotx_mbuf_free(struct mbuf *m)
{
	struct aiotx_buffer *ab = m->m_ext.ext_arg1;

#ifdef VERBOSE_TRACES
	CTR3(KTR_CXGBE, "%s: completed %d bytes for tid %d", __func__,
	    m->m_len, jobtotid(ab->job));
#endif
	free_aiotx_buffer(ab);
}

/*
 * Hold the buffer backing an AIO request and return an AIO transmit
 * buffer.
 */
static int
hold_aio(struct kaiocb *job)
{
	struct aiotx_buffer *ab;
	struct vmspace *vm;
	vm_map_t map;
	vm_offset_t start, end, pgoff;
	int n;

	MPASS(job->backend1 == NULL);

	/*
	 * The AIO subsystem will cancel and drain all requests before
	 * permitting a process to exit or exec, so p_vmspace should
	 * be stable here.
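	 * The buffer itself need not be page aligned.  As an
	 * illustration (hypothetical numbers, 4KB pages): a 10000-byte
	 * buffer at address 0x10800 gives pgoff 0x800, start 0x10000
	 * and end 0x13000 below, so three pages are wired and
	 * ps.offset/ps.len locate the data within them.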
1999 */ 2000 vm = job->userproc->p_vmspace; 2001 map = &vm->vm_map; 2002 start = (uintptr_t)job->uaiocb.aio_buf; 2003 pgoff = start & PAGE_MASK; 2004 end = round_page(start + job->uaiocb.aio_nbytes); 2005 start = trunc_page(start); 2006 n = atop(end - start); 2007 2008 ab = malloc(sizeof(*ab) + n * sizeof(vm_page_t), M_CXGBE, M_WAITOK | 2009 M_ZERO); 2010 refcount_init(&ab->refcount, 1); 2011 ab->ps.pages = (vm_page_t *)(ab + 1); 2012 ab->ps.npages = vm_fault_quick_hold_pages(map, start, end - start, 2013 VM_PROT_WRITE, ab->ps.pages, n); 2014 if (ab->ps.npages < 0) { 2015 free(ab, M_CXGBE); 2016 return (EFAULT); 2017 } 2018 2019 KASSERT(ab->ps.npages == n, 2020 ("hold_aio: page count mismatch: %d vs %d", ab->ps.npages, n)); 2021 2022 ab->ps.offset = pgoff; 2023 ab->ps.len = job->uaiocb.aio_nbytes; 2024 ab->job = job; 2025 job->backend1 = ab; 2026 #ifdef VERBOSE_TRACES 2027 CTR5(KTR_CXGBE, "%s: tid %d, new pageset %p for job %p, npages %d", 2028 __func__, jobtotid(job), &ab->ps, job, ab->ps.npages); 2029 #endif 2030 return (0); 2031 } 2032 2033 static void 2034 t4_aiotx_process_job(struct toepcb *toep, struct socket *so, struct kaiocb *job) 2035 { 2036 struct adapter *sc; 2037 struct sockbuf *sb; 2038 struct file *fp; 2039 struct aiotx_buffer *ab; 2040 struct inpcb *inp; 2041 struct tcpcb *tp; 2042 struct mbuf *m; 2043 int error; 2044 bool moretocome, sendmore; 2045 2046 sc = td_adapter(toep->td); 2047 sb = &so->so_snd; 2048 SOCKBUF_UNLOCK(sb); 2049 fp = job->fd_file; 2050 ab = job->backend1; 2051 m = NULL; 2052 2053 #ifdef MAC 2054 error = mac_socket_check_send(fp->f_cred, so); 2055 if (error != 0) 2056 goto out; 2057 #endif 2058 2059 if (ab == NULL) { 2060 error = hold_aio(job); 2061 if (error != 0) 2062 goto out; 2063 ab = job->backend1; 2064 } 2065 2066 /* Inline sosend_generic(). */ 2067 2068 job->msgsnd = 1; 2069 2070 error = sblock(sb, SBL_WAIT); 2071 MPASS(error == 0); 2072 2073 sendanother: 2074 m = m_get(M_WAITOK, MT_DATA); 2075 2076 SOCKBUF_LOCK(sb); 2077 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 2078 SOCKBUF_UNLOCK(sb); 2079 sbunlock(sb); 2080 if ((so->so_options & SO_NOSIGPIPE) == 0) { 2081 PROC_LOCK(job->userproc); 2082 kern_psignal(job->userproc, SIGPIPE); 2083 PROC_UNLOCK(job->userproc); 2084 } 2085 error = EPIPE; 2086 goto out; 2087 } 2088 if (so->so_error) { 2089 error = so->so_error; 2090 so->so_error = 0; 2091 SOCKBUF_UNLOCK(sb); 2092 sbunlock(sb); 2093 goto out; 2094 } 2095 if ((so->so_state & SS_ISCONNECTED) == 0) { 2096 SOCKBUF_UNLOCK(sb); 2097 sbunlock(sb); 2098 error = ENOTCONN; 2099 goto out; 2100 } 2101 if (sbspace(sb) < sb->sb_lowat) { 2102 MPASS(job->aio_sent == 0 || !(so->so_state & SS_NBIO)); 2103 2104 /* 2105 * Don't block if there is too little room in the socket 2106 * buffer. Instead, requeue the request. 2107 */ 2108 if (!aio_set_cancel_function(job, t4_aiotx_cancel)) { 2109 SOCKBUF_UNLOCK(sb); 2110 sbunlock(sb); 2111 error = ECANCELED; 2112 goto out; 2113 } 2114 TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list); 2115 SOCKBUF_UNLOCK(sb); 2116 sbunlock(sb); 2117 goto out; 2118 } 2119 2120 /* 2121 * Write as much data as the socket permits, but no more than a 2122 * a single sndbuf at a time. 
2123 */ 2124 m->m_len = sbspace(sb); 2125 if (m->m_len > ab->ps.len - job->aio_sent) { 2126 m->m_len = ab->ps.len - job->aio_sent; 2127 moretocome = false; 2128 } else 2129 moretocome = true; 2130 if (m->m_len > sc->tt.sndbuf) { 2131 m->m_len = sc->tt.sndbuf; 2132 sendmore = true; 2133 } else 2134 sendmore = false; 2135 2136 if (!TAILQ_EMPTY(&toep->aiotx_jobq)) 2137 moretocome = true; 2138 SOCKBUF_UNLOCK(sb); 2139 MPASS(m->m_len != 0); 2140 2141 /* Inlined tcp_usr_send(). */ 2142 2143 inp = toep->inp; 2144 INP_WLOCK(inp); 2145 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 2146 INP_WUNLOCK(inp); 2147 sbunlock(sb); 2148 error = ECONNRESET; 2149 goto out; 2150 } 2151 2152 refcount_acquire(&ab->refcount); 2153 m_extadd(m, NULL, ab->ps.len, t4_aiotx_mbuf_free, ab, 2154 (void *)(uintptr_t)job->aio_sent, 0, EXT_NET_DRV); 2155 m->m_ext.ext_flags |= EXT_FLAG_AIOTX; 2156 job->aio_sent += m->m_len; 2157 2158 sbappendstream(sb, m, 0); 2159 m = NULL; 2160 2161 if (!(inp->inp_flags & INP_DROPPED)) { 2162 tp = intotcpcb(inp); 2163 if (moretocome) 2164 tp->t_flags |= TF_MORETOCOME; 2165 error = tp->t_fb->tfb_tcp_output(tp); 2166 if (moretocome) 2167 tp->t_flags &= ~TF_MORETOCOME; 2168 } 2169 2170 INP_WUNLOCK(inp); 2171 if (sendmore) 2172 goto sendanother; 2173 sbunlock(sb); 2174 2175 if (error) 2176 goto out; 2177 2178 /* 2179 * If this is a non-blocking socket and the request has not 2180 * been fully completed, requeue it until the socket is ready 2181 * again. 2182 */ 2183 if (job->aio_sent < job->uaiocb.aio_nbytes && 2184 !(so->so_state & SS_NBIO)) { 2185 SOCKBUF_LOCK(sb); 2186 if (!aio_set_cancel_function(job, t4_aiotx_cancel)) { 2187 SOCKBUF_UNLOCK(sb); 2188 error = ECANCELED; 2189 goto out; 2190 } 2191 TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list); 2192 return; 2193 } 2194 2195 /* 2196 * If the request will not be requeued, drop a reference on 2197 * the aiotx buffer. Any mbufs in flight should still 2198 * contain a reference, but this drops the reference that the 2199 * job owns while it is waiting to queue mbufs to the socket. 2200 */ 2201 free_aiotx_buffer(ab); 2202 2203 out: 2204 if (error) { 2205 if (ab != NULL) { 2206 job->aio_error = error; 2207 free_aiotx_buffer(ab); 2208 } else { 2209 MPASS(job->aio_sent == 0); 2210 aio_complete(job, -1, error); 2211 } 2212 } 2213 if (m != NULL) 2214 m_free(m); 2215 SOCKBUF_LOCK(sb); 2216 } 2217 2218 static void 2219 t4_aiotx_task(void *context, int pending) 2220 { 2221 struct toepcb *toep = context; 2222 struct inpcb *inp = toep->inp; 2223 struct socket *so = inp->inp_socket; 2224 struct kaiocb *job; 2225 2226 CURVNET_SET(toep->vnet); 2227 SOCKBUF_LOCK(&so->so_snd); 2228 while (!TAILQ_EMPTY(&toep->aiotx_jobq) && sowriteable(so)) { 2229 job = TAILQ_FIRST(&toep->aiotx_jobq); 2230 TAILQ_REMOVE(&toep->aiotx_jobq, job, list); 2231 if (!aio_clear_cancel_function(job)) 2232 continue; 2233 2234 t4_aiotx_process_job(toep, so, job); 2235 } 2236 toep->aiotx_task_active = false; 2237 SOCKBUF_UNLOCK(&so->so_snd); 2238 CURVNET_RESTORE(); 2239 2240 free_toepcb(toep); 2241 } 2242 2243 static void 2244 t4_aiotx_queue_toep(struct toepcb *toep) 2245 { 2246 2247 SOCKBUF_LOCK_ASSERT(&toep->inp->inp_socket->so_snd); 2248 #ifdef VERBOSE_TRACES 2249 CTR3(KTR_CXGBE, "%s: queueing aiotx task for tid %d, active = %s", 2250 __func__, toep->tid, toep->aiotx_task_active ? 
"true" : "false"); 2251 #endif 2252 if (toep->aiotx_task_active) 2253 return; 2254 toep->aiotx_task_active = true; 2255 hold_toepcb(toep); 2256 soaio_enqueue(&toep->aiotx_task); 2257 } 2258 2259 static void 2260 t4_aiotx_cancel(struct kaiocb *job) 2261 { 2262 struct aiotx_buffer *ab; 2263 struct socket *so; 2264 struct sockbuf *sb; 2265 struct tcpcb *tp; 2266 struct toepcb *toep; 2267 2268 so = job->fd_file->f_data; 2269 tp = so_sototcpcb(so); 2270 toep = tp->t_toe; 2271 MPASS(job->uaiocb.aio_lio_opcode == LIO_WRITE); 2272 sb = &so->so_snd; 2273 2274 SOCKBUF_LOCK(sb); 2275 if (!aio_cancel_cleared(job)) 2276 TAILQ_REMOVE(&toep->aiotx_jobq, job, list); 2277 SOCKBUF_UNLOCK(sb); 2278 2279 ab = job->backend1; 2280 if (ab != NULL) 2281 free_aiotx_buffer(ab); 2282 else 2283 aio_cancel(job); 2284 } 2285 2286 int 2287 t4_aio_queue_aiotx(struct socket *so, struct kaiocb *job) 2288 { 2289 struct tcpcb *tp = so_sototcpcb(so); 2290 struct toepcb *toep = tp->t_toe; 2291 struct adapter *sc = td_adapter(toep->td); 2292 2293 /* This only handles writes. */ 2294 if (job->uaiocb.aio_lio_opcode != LIO_WRITE) 2295 return (EOPNOTSUPP); 2296 2297 if (!sc->tt.tx_zcopy) 2298 return (EOPNOTSUPP); 2299 2300 SOCKBUF_LOCK(&so->so_snd); 2301 #ifdef VERBOSE_TRACES 2302 CTR2(KTR_CXGBE, "%s: queueing %p", __func__, job); 2303 #endif 2304 if (!aio_set_cancel_function(job, t4_aiotx_cancel)) 2305 panic("new job was cancelled"); 2306 TAILQ_INSERT_TAIL(&toep->aiotx_jobq, job, list); 2307 if (sowriteable(so)) 2308 t4_aiotx_queue_toep(toep); 2309 SOCKBUF_UNLOCK(&so->so_snd); 2310 return (0); 2311 } 2312 2313 void 2314 aiotx_init_toep(struct toepcb *toep) 2315 { 2316 2317 TAILQ_INIT(&toep->aiotx_jobq); 2318 TASK_INIT(&toep->aiotx_task, 0, t4_aiotx_task, toep); 2319 } 2320 #endif 2321