1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2012, 2015 Chelsio Communications, Inc. 5 * All rights reserved. 6 * Written by: Navdeep Parhar <np@FreeBSD.org> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 #include <sys/cdefs.h> 31 __FBSDID("$FreeBSD$"); 32 33 #include "opt_inet.h" 34 #include "opt_inet6.h" 35 #include "opt_kern_tls.h" 36 #include "opt_ratelimit.h" 37 38 #ifdef TCP_OFFLOAD 39 #include <sys/param.h> 40 #include <sys/aio.h> 41 #include <sys/file.h> 42 #include <sys/kernel.h> 43 #include <sys/ktr.h> 44 #include <sys/module.h> 45 #include <sys/proc.h> 46 #include <sys/protosw.h> 47 #include <sys/domain.h> 48 #include <sys/socket.h> 49 #include <sys/socketvar.h> 50 #include <sys/sglist.h> 51 #include <sys/taskqueue.h> 52 #include <netinet/in.h> 53 #include <netinet/in_pcb.h> 54 #include <netinet/ip.h> 55 #include <netinet/ip6.h> 56 #define TCPSTATES 57 #include <netinet/tcp_fsm.h> 58 #include <netinet/tcp_seq.h> 59 #include <netinet/tcp_var.h> 60 #include <netinet/toecore.h> 61 62 #include <security/mac/mac_framework.h> 63 64 #include <vm/vm.h> 65 #include <vm/vm_extern.h> 66 #include <vm/pmap.h> 67 #include <vm/vm_map.h> 68 #include <vm/vm_page.h> 69 70 #include "common/common.h" 71 #include "common/t4_msg.h" 72 #include "common/t4_regs.h" 73 #include "common/t4_tcb.h" 74 #include "tom/t4_tom_l2t.h" 75 #include "tom/t4_tom.h" 76 77 static void t4_aiotx_cancel(struct kaiocb *job); 78 static void t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep); 79 80 void 81 send_flowc_wr(struct toepcb *toep, struct tcpcb *tp) 82 { 83 struct wrqe *wr; 84 struct fw_flowc_wr *flowc; 85 unsigned int nparams, flowclen, paramidx; 86 struct vi_info *vi = toep->vi; 87 struct port_info *pi = vi->pi; 88 struct adapter *sc = pi->adapter; 89 unsigned int pfvf = sc->pf << S_FW_VIID_PFN; 90 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; 91 92 KASSERT(!(toep->flags & TPF_FLOWC_WR_SENT), 93 ("%s: flowc for tid %u sent already", __func__, toep->tid)); 94 95 if (tp != NULL) 96 nparams = 8; 97 else 98 nparams = 6; 99 if (ulp_mode(toep) == ULP_MODE_TLS) 100 nparams++; 101 if (toep->tls.fcplenmax != 0) 102 nparams++; 103 if (toep->params.tc_idx != -1) { 104 MPASS(toep->params.tc_idx >= 0 && 105 
toep->params.tc_idx < sc->params.nsched_cls); 106 nparams++; 107 } 108 109 flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval); 110 111 wr = alloc_wrqe(roundup2(flowclen, 16), &toep->ofld_txq->wrq); 112 if (wr == NULL) { 113 /* XXX */ 114 panic("%s: allocation failure.", __func__); 115 } 116 flowc = wrtod(wr); 117 memset(flowc, 0, wr->wr_len); 118 119 flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | 120 V_FW_FLOWC_WR_NPARAMS(nparams)); 121 flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) | 122 V_FW_WR_FLOWID(toep->tid)); 123 124 #define FLOWC_PARAM(__m, __v) \ 125 do { \ 126 flowc->mnemval[paramidx].mnemonic = FW_FLOWC_MNEM_##__m; \ 127 flowc->mnemval[paramidx].val = htobe32(__v); \ 128 paramidx++; \ 129 } while (0) 130 131 paramidx = 0; 132 133 FLOWC_PARAM(PFNVFN, pfvf); 134 FLOWC_PARAM(CH, pi->tx_chan); 135 FLOWC_PARAM(PORT, pi->tx_chan); 136 FLOWC_PARAM(IQID, toep->ofld_rxq->iq.abs_id); 137 FLOWC_PARAM(SNDBUF, toep->params.sndbuf); 138 if (tp) { 139 FLOWC_PARAM(MSS, toep->params.emss); 140 FLOWC_PARAM(SNDNXT, tp->snd_nxt); 141 FLOWC_PARAM(RCVNXT, tp->rcv_nxt); 142 } else 143 FLOWC_PARAM(MSS, 512); 144 CTR6(KTR_CXGBE, 145 "%s: tid %u, mss %u, sndbuf %u, snd_nxt 0x%x, rcv_nxt 0x%x", 146 __func__, toep->tid, toep->params.emss, toep->params.sndbuf, 147 tp ? tp->snd_nxt : 0, tp ? tp->rcv_nxt : 0); 148 149 if (ulp_mode(toep) == ULP_MODE_TLS) 150 FLOWC_PARAM(ULP_MODE, ulp_mode(toep)); 151 if (toep->tls.fcplenmax != 0) 152 FLOWC_PARAM(TXDATAPLEN_MAX, toep->tls.fcplenmax); 153 if (toep->params.tc_idx != -1) 154 FLOWC_PARAM(SCHEDCLASS, toep->params.tc_idx); 155 #undef FLOWC_PARAM 156 157 KASSERT(paramidx == nparams, ("nparams mismatch")); 158 159 txsd->tx_credits = howmany(flowclen, 16); 160 txsd->plen = 0; 161 KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0, 162 ("%s: not enough credits (%d)", __func__, toep->tx_credits)); 163 toep->tx_credits -= txsd->tx_credits; 164 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) 165 toep->txsd_pidx = 0; 166 toep->txsd_avail--; 167 168 toep->flags |= TPF_FLOWC_WR_SENT; 169 t4_wrq_tx(sc, wr); 170 } 171 172 #ifdef RATELIMIT 173 /* 174 * Input is Bytes/second (so_max_pacing_rate), chip counts in Kilobits/second. 
175 */ 176 static int 177 update_tx_rate_limit(struct adapter *sc, struct toepcb *toep, u_int Bps) 178 { 179 int tc_idx, rc; 180 const u_int kbps = (u_int) (uint64_t)Bps * 8ULL / 1000; 181 const int port_id = toep->vi->pi->port_id; 182 183 CTR3(KTR_CXGBE, "%s: tid %u, rate %uKbps", __func__, toep->tid, kbps); 184 185 if (kbps == 0) { 186 /* unbind */ 187 tc_idx = -1; 188 } else { 189 rc = t4_reserve_cl_rl_kbps(sc, port_id, kbps, &tc_idx); 190 if (rc != 0) 191 return (rc); 192 MPASS(tc_idx >= 0 && tc_idx < sc->params.nsched_cls); 193 } 194 195 if (toep->params.tc_idx != tc_idx) { 196 struct wrqe *wr; 197 struct fw_flowc_wr *flowc; 198 int nparams = 1, flowclen, flowclen16; 199 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; 200 201 flowclen = sizeof(*flowc) + nparams * sizeof(struct 202 fw_flowc_mnemval); 203 flowclen16 = howmany(flowclen, 16); 204 if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0 || 205 (wr = alloc_wrqe(roundup2(flowclen, 16), 206 &toep->ofld_txq->wrq)) == NULL) { 207 if (tc_idx >= 0) 208 t4_release_cl_rl(sc, port_id, tc_idx); 209 return (ENOMEM); 210 } 211 212 flowc = wrtod(wr); 213 memset(flowc, 0, wr->wr_len); 214 215 flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | 216 V_FW_FLOWC_WR_NPARAMS(nparams)); 217 flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) | 218 V_FW_WR_FLOWID(toep->tid)); 219 220 flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS; 221 if (tc_idx == -1) 222 flowc->mnemval[0].val = htobe32(0xff); 223 else 224 flowc->mnemval[0].val = htobe32(tc_idx); 225 226 txsd->tx_credits = flowclen16; 227 txsd->plen = 0; 228 toep->tx_credits -= txsd->tx_credits; 229 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) 230 toep->txsd_pidx = 0; 231 toep->txsd_avail--; 232 t4_wrq_tx(sc, wr); 233 } 234 235 if (toep->params.tc_idx >= 0) 236 t4_release_cl_rl(sc, port_id, toep->params.tc_idx); 237 toep->params.tc_idx = tc_idx; 238 239 return (0); 240 } 241 #endif 242 243 void 244 send_reset(struct adapter *sc, struct toepcb *toep, uint32_t snd_nxt) 245 { 246 struct wrqe *wr; 247 struct cpl_abort_req *req; 248 int tid = toep->tid; 249 struct inpcb *inp = toep->inp; 250 struct tcpcb *tp = intotcpcb(inp); /* don't use if INP_DROPPED */ 251 252 INP_WLOCK_ASSERT(inp); 253 254 CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x%s", 255 __func__, toep->tid, 256 inp->inp_flags & INP_DROPPED ? "inp dropped" : 257 tcpstates[tp->t_state], 258 toep->flags, inp->inp_flags, 259 toep->flags & TPF_ABORT_SHUTDOWN ? 260 " (abort already in progress)" : ""); 261 262 if (toep->flags & TPF_ABORT_SHUTDOWN) 263 return; /* abort already in progress */ 264 265 toep->flags |= TPF_ABORT_SHUTDOWN; 266 267 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 268 ("%s: flowc_wr not sent for tid %d.", __func__, tid)); 269 270 wr = alloc_wrqe(sizeof(*req), &toep->ofld_txq->wrq); 271 if (wr == NULL) { 272 /* XXX */ 273 panic("%s: allocation failure.", __func__); 274 } 275 req = wrtod(wr); 276 277 INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid); 278 if (inp->inp_flags & INP_DROPPED) 279 req->rsvd0 = htobe32(snd_nxt); 280 else 281 req->rsvd0 = htobe32(tp->snd_nxt); 282 req->rsvd1 = !(toep->flags & TPF_TX_DATA_SENT); 283 req->cmd = CPL_ABORT_SEND_RST; 284 285 /* 286 * XXX: What's the correct way to tell that the inp hasn't been detached 287 * from its socket? Should I even be flushing the snd buffer here? 288 */ 289 if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) { 290 struct socket *so = inp->inp_socket; 291 292 if (so != NULL) /* because I'm not sure. 
See comment above */ 293 sbflush(&so->so_snd); 294 } 295 296 t4_l2t_send(sc, wr, toep->l2te); 297 } 298 299 /* 300 * Called when a connection is established to translate the TCP options 301 * reported by HW to FreeBSD's native format. 302 */ 303 static void 304 assign_rxopt(struct tcpcb *tp, uint16_t opt) 305 { 306 struct toepcb *toep = tp->t_toe; 307 struct inpcb *inp = tp->t_inpcb; 308 struct adapter *sc = td_adapter(toep->td); 309 310 INP_LOCK_ASSERT(inp); 311 312 toep->params.mtu_idx = G_TCPOPT_MSS(opt); 313 tp->t_maxseg = sc->params.mtus[toep->params.mtu_idx]; 314 if (inp->inp_inc.inc_flags & INC_ISIPV6) 315 tp->t_maxseg -= sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 316 else 317 tp->t_maxseg -= sizeof(struct ip) + sizeof(struct tcphdr); 318 319 toep->params.emss = tp->t_maxseg; 320 if (G_TCPOPT_TSTAMP(opt)) { 321 toep->params.tstamp = 1; 322 toep->params.emss -= TCPOLEN_TSTAMP_APPA; 323 tp->t_flags |= TF_RCVD_TSTMP; /* timestamps ok */ 324 tp->ts_recent = 0; /* hmmm */ 325 tp->ts_recent_age = tcp_ts_getticks(); 326 } else 327 toep->params.tstamp = 0; 328 329 if (G_TCPOPT_SACK(opt)) { 330 toep->params.sack = 1; 331 tp->t_flags |= TF_SACK_PERMIT; /* should already be set */ 332 } else { 333 toep->params.sack = 0; 334 tp->t_flags &= ~TF_SACK_PERMIT; /* sack disallowed by peer */ 335 } 336 337 if (G_TCPOPT_WSCALE_OK(opt)) 338 tp->t_flags |= TF_RCVD_SCALE; 339 340 /* Doing window scaling? */ 341 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 342 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 343 tp->rcv_scale = tp->request_r_scale; 344 tp->snd_scale = G_TCPOPT_SND_WSCALE(opt); 345 } else 346 toep->params.wscale = 0; 347 348 CTR6(KTR_CXGBE, 349 "assign_rxopt: tid %d, mtu_idx %u, emss %u, ts %u, sack %u, wscale %u", 350 toep->tid, toep->params.mtu_idx, toep->params.emss, 351 toep->params.tstamp, toep->params.sack, toep->params.wscale); 352 } 353 354 /* 355 * Completes some final bits of initialization for just established connections 356 * and changes their state to TCPS_ESTABLISHED. 357 * 358 * The ISNs are from the exchange of SYNs. 
359 */ 360 void 361 make_established(struct toepcb *toep, uint32_t iss, uint32_t irs, uint16_t opt) 362 { 363 struct inpcb *inp = toep->inp; 364 struct socket *so = inp->inp_socket; 365 struct tcpcb *tp = intotcpcb(inp); 366 uint16_t tcpopt = be16toh(opt); 367 368 INP_WLOCK_ASSERT(inp); 369 KASSERT(tp->t_state == TCPS_SYN_SENT || 370 tp->t_state == TCPS_SYN_RECEIVED, 371 ("%s: TCP state %s", __func__, tcpstates[tp->t_state])); 372 373 CTR6(KTR_CXGBE, "%s: tid %d, so %p, inp %p, tp %p, toep %p", 374 __func__, toep->tid, so, inp, tp, toep); 375 376 tcp_state_change(tp, TCPS_ESTABLISHED); 377 tp->t_starttime = ticks; 378 TCPSTAT_INC(tcps_connects); 379 380 tp->irs = irs; 381 tcp_rcvseqinit(tp); 382 tp->rcv_wnd = (u_int)toep->params.opt0_bufsize << 10; 383 tp->rcv_adv += tp->rcv_wnd; 384 tp->last_ack_sent = tp->rcv_nxt; 385 386 tp->iss = iss; 387 tcp_sendseqinit(tp); 388 tp->snd_una = iss + 1; 389 tp->snd_nxt = iss + 1; 390 tp->snd_max = iss + 1; 391 392 assign_rxopt(tp, tcpopt); 393 send_flowc_wr(toep, tp); 394 395 soisconnected(so); 396 397 if (ulp_mode(toep) == ULP_MODE_TLS) 398 tls_establish(toep); 399 } 400 401 int 402 send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits) 403 { 404 struct wrqe *wr; 405 struct cpl_rx_data_ack *req; 406 uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); 407 408 KASSERT(credits >= 0, ("%s: %d credits", __func__, credits)); 409 410 wr = alloc_wrqe(sizeof(*req), toep->ctrlq); 411 if (wr == NULL) 412 return (0); 413 req = wrtod(wr); 414 415 INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid); 416 req->credit_dack = htobe32(dack | V_RX_CREDITS(credits)); 417 418 t4_wrq_tx(sc, wr); 419 return (credits); 420 } 421 422 void 423 send_rx_modulate(struct adapter *sc, struct toepcb *toep) 424 { 425 struct wrqe *wr; 426 struct cpl_rx_data_ack *req; 427 428 wr = alloc_wrqe(sizeof(*req), toep->ctrlq); 429 if (wr == NULL) 430 return; 431 req = wrtod(wr); 432 433 INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid); 434 req->credit_dack = htobe32(F_RX_MODULATE_RX); 435 436 t4_wrq_tx(sc, wr); 437 } 438 439 void 440 t4_rcvd_locked(struct toedev *tod, struct tcpcb *tp) 441 { 442 struct adapter *sc = tod->tod_softc; 443 struct inpcb *inp = tp->t_inpcb; 444 struct socket *so = inp->inp_socket; 445 struct sockbuf *sb = &so->so_rcv; 446 struct toepcb *toep = tp->t_toe; 447 int rx_credits; 448 449 INP_WLOCK_ASSERT(inp); 450 SOCKBUF_LOCK_ASSERT(sb); 451 452 rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0; 453 if (rx_credits > 0 && 454 (tp->rcv_wnd <= 32 * 1024 || rx_credits >= 64 * 1024 || 455 (rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) || 456 sbused(sb) + tp->rcv_wnd < sb->sb_lowat)) { 457 rx_credits = send_rx_credits(sc, toep, rx_credits); 458 tp->rcv_wnd += rx_credits; 459 tp->rcv_adv += rx_credits; 460 } else if (toep->flags & TPF_FORCE_CREDITS) 461 send_rx_modulate(sc, toep); 462 } 463 464 void 465 t4_rcvd(struct toedev *tod, struct tcpcb *tp) 466 { 467 struct inpcb *inp = tp->t_inpcb; 468 struct socket *so = inp->inp_socket; 469 struct sockbuf *sb = &so->so_rcv; 470 471 SOCKBUF_LOCK(sb); 472 t4_rcvd_locked(tod, tp); 473 SOCKBUF_UNLOCK(sb); 474 } 475 476 /* 477 * Close a connection by sending a CPL_CLOSE_CON_REQ message. 478 */ 479 int 480 t4_close_conn(struct adapter *sc, struct toepcb *toep) 481 { 482 struct wrqe *wr; 483 struct cpl_close_con_req *req; 484 unsigned int tid = toep->tid; 485 486 CTR3(KTR_CXGBE, "%s: tid %u%s", __func__, toep->tid, 487 toep->flags & TPF_FIN_SENT ? 
", IGNORED" : ""); 488 489 if (toep->flags & TPF_FIN_SENT) 490 return (0); 491 492 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 493 ("%s: flowc_wr not sent for tid %u.", __func__, tid)); 494 495 wr = alloc_wrqe(sizeof(*req), &toep->ofld_txq->wrq); 496 if (wr == NULL) { 497 /* XXX */ 498 panic("%s: allocation failure.", __func__); 499 } 500 req = wrtod(wr); 501 502 req->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) | 503 V_FW_WR_IMMDLEN(sizeof(*req) - sizeof(req->wr))); 504 req->wr.wr_mid = htonl(V_FW_WR_LEN16(howmany(sizeof(*req), 16)) | 505 V_FW_WR_FLOWID(tid)); 506 req->wr.wr_lo = cpu_to_be64(0); 507 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid)); 508 req->rsvd = 0; 509 510 toep->flags |= TPF_FIN_SENT; 511 toep->flags &= ~TPF_SEND_FIN; 512 t4_l2t_send(sc, wr, toep->l2te); 513 514 return (0); 515 } 516 517 #define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16) 518 #define MIN_OFLD_TX_CREDITS (howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16)) 519 520 /* Maximum amount of immediate data we could stuff in a WR */ 521 static inline int 522 max_imm_payload(int tx_credits) 523 { 524 const int n = 1; /* Use no more than one desc for imm. data WR */ 525 526 KASSERT(tx_credits >= 0 && 527 tx_credits <= MAX_OFLD_TX_CREDITS, 528 ("%s: %d credits", __func__, tx_credits)); 529 530 if (tx_credits < MIN_OFLD_TX_CREDITS) 531 return (0); 532 533 if (tx_credits >= (n * EQ_ESIZE) / 16) 534 return ((n * EQ_ESIZE) - sizeof(struct fw_ofld_tx_data_wr)); 535 else 536 return (tx_credits * 16 - sizeof(struct fw_ofld_tx_data_wr)); 537 } 538 539 /* Maximum number of SGL entries we could stuff in a WR */ 540 static inline int 541 max_dsgl_nsegs(int tx_credits) 542 { 543 int nseg = 1; /* ulptx_sgl has room for 1, rest ulp_tx_sge_pair */ 544 int sge_pair_credits = tx_credits - MIN_OFLD_TX_CREDITS; 545 546 KASSERT(tx_credits >= 0 && 547 tx_credits <= MAX_OFLD_TX_CREDITS, 548 ("%s: %d credits", __func__, tx_credits)); 549 550 if (tx_credits < MIN_OFLD_TX_CREDITS) 551 return (0); 552 553 nseg += 2 * (sge_pair_credits * 16 / 24); 554 if ((sge_pair_credits * 16) % 24 == 16) 555 nseg++; 556 557 return (nseg); 558 } 559 560 static inline void 561 write_tx_wr(void *dst, struct toepcb *toep, unsigned int immdlen, 562 unsigned int plen, uint8_t credits, int shove, int ulp_submode) 563 { 564 struct fw_ofld_tx_data_wr *txwr = dst; 565 566 txwr->op_to_immdlen = htobe32(V_WR_OP(FW_OFLD_TX_DATA_WR) | 567 V_FW_WR_IMMDLEN(immdlen)); 568 txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) | 569 V_FW_WR_LEN16(credits)); 570 txwr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(ulp_mode(toep)) | 571 V_TX_ULP_SUBMODE(ulp_submode) | V_TX_URG(0) | V_TX_SHOVE(shove)); 572 txwr->plen = htobe32(plen); 573 574 if (toep->params.tx_align > 0) { 575 if (plen < 2 * toep->params.emss) 576 txwr->lsodisable_to_flags |= 577 htobe32(F_FW_OFLD_TX_DATA_WR_LSODISABLE); 578 else 579 txwr->lsodisable_to_flags |= 580 htobe32(F_FW_OFLD_TX_DATA_WR_ALIGNPLD | 581 (toep->params.nagle == 0 ? 0 : 582 F_FW_OFLD_TX_DATA_WR_ALIGNPLDSHOVE)); 583 } 584 } 585 586 /* 587 * Generate a DSGL from a starting mbuf. The total number of segments and the 588 * maximum segments in any one mbuf are provided. 
589 */ 590 static void 591 write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n) 592 { 593 struct mbuf *m; 594 struct ulptx_sgl *usgl = dst; 595 int i, j, rc; 596 struct sglist sg; 597 struct sglist_seg segs[n]; 598 599 KASSERT(nsegs > 0, ("%s: nsegs 0", __func__)); 600 601 sglist_init(&sg, n, segs); 602 usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) | 603 V_ULPTX_NSGE(nsegs)); 604 605 i = -1; 606 for (m = start; m != stop; m = m->m_next) { 607 if (m->m_flags & M_EXTPG) 608 rc = sglist_append_mbuf_epg(&sg, m, 609 mtod(m, vm_offset_t), m->m_len); 610 else 611 rc = sglist_append(&sg, mtod(m, void *), m->m_len); 612 if (__predict_false(rc != 0)) 613 panic("%s: sglist_append %d", __func__, rc); 614 615 for (j = 0; j < sg.sg_nseg; i++, j++) { 616 if (i < 0) { 617 usgl->len0 = htobe32(segs[j].ss_len); 618 usgl->addr0 = htobe64(segs[j].ss_paddr); 619 } else { 620 usgl->sge[i / 2].len[i & 1] = 621 htobe32(segs[j].ss_len); 622 usgl->sge[i / 2].addr[i & 1] = 623 htobe64(segs[j].ss_paddr); 624 } 625 #ifdef INVARIANTS 626 nsegs--; 627 #endif 628 } 629 sglist_reset(&sg); 630 } 631 if (i & 1) 632 usgl->sge[i / 2].len[1] = htobe32(0); 633 KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, stop %p", 634 __func__, nsegs, start, stop)); 635 } 636 637 /* 638 * Max number of SGL entries an offload tx work request can have. This is 41 639 * (1 + 40) for a full 512B work request. 640 * fw_ofld_tx_data_wr(16B) + ulptx_sgl(16B, 1) + ulptx_sge_pair(480B, 40) 641 */ 642 #define OFLD_SGL_LEN (41) 643 644 /* 645 * Send data and/or a FIN to the peer. 646 * 647 * The socket's so_snd buffer consists of a stream of data starting with sb_mb 648 * and linked together with m_next. sb_sndptr, if set, is the last mbuf that 649 * was transmitted. 650 * 651 * drop indicates the number of bytes that should be dropped from the head of 652 * the send buffer. It is an optimization that lets do_fw4_ack avoid creating 653 * contention on the send buffer lock (before this change it used to do 654 * sowwakeup and then t4_push_frames right after that when recovering from tx 655 * stalls). When drop is set this function MUST drop the bytes and wake up any 656 * writers. 
657 */ 658 void 659 t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop) 660 { 661 struct mbuf *sndptr, *m, *sb_sndptr; 662 struct fw_ofld_tx_data_wr *txwr; 663 struct wrqe *wr; 664 u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf; 665 struct inpcb *inp = toep->inp; 666 struct tcpcb *tp = intotcpcb(inp); 667 struct socket *so = inp->inp_socket; 668 struct sockbuf *sb = &so->so_snd; 669 int tx_credits, shove, compl, sowwakeup; 670 struct ofld_tx_sdesc *txsd; 671 bool nomap_mbuf_seen; 672 673 INP_WLOCK_ASSERT(inp); 674 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 675 ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid)); 676 677 KASSERT(ulp_mode(toep) == ULP_MODE_NONE || 678 ulp_mode(toep) == ULP_MODE_TCPDDP || 679 ulp_mode(toep) == ULP_MODE_TLS || 680 ulp_mode(toep) == ULP_MODE_RDMA, 681 ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep)); 682 683 #ifdef VERBOSE_TRACES 684 CTR5(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d", 685 __func__, toep->tid, toep->flags, tp->t_flags, drop); 686 #endif 687 if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) 688 return; 689 690 #ifdef RATELIMIT 691 if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) && 692 (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) { 693 inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; 694 } 695 #endif 696 697 /* 698 * This function doesn't resume by itself. Someone else must clear the 699 * flag and call this function. 700 */ 701 if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) { 702 KASSERT(drop == 0, 703 ("%s: drop (%d) != 0 but tx is suspended", __func__, drop)); 704 return; 705 } 706 707 txsd = &toep->txsd[toep->txsd_pidx]; 708 do { 709 tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS); 710 max_imm = max_imm_payload(tx_credits); 711 max_nsegs = max_dsgl_nsegs(tx_credits); 712 713 SOCKBUF_LOCK(sb); 714 sowwakeup = drop; 715 if (drop) { 716 sbdrop_locked(sb, drop); 717 drop = 0; 718 } 719 sb_sndptr = sb->sb_sndptr; 720 sndptr = sb_sndptr ? 
sb_sndptr->m_next : sb->sb_mb; 721 plen = 0; 722 nsegs = 0; 723 max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */ 724 nomap_mbuf_seen = false; 725 for (m = sndptr; m != NULL; m = m->m_next) { 726 int n; 727 728 if ((m->m_flags & M_NOTAVAIL) != 0) 729 break; 730 if (m->m_flags & M_EXTPG) { 731 #ifdef KERN_TLS 732 if (m->m_epg_tls != NULL) { 733 toep->flags |= TPF_KTLS; 734 if (plen == 0) { 735 SOCKBUF_UNLOCK(sb); 736 t4_push_ktls(sc, toep, 0); 737 return; 738 } 739 break; 740 } 741 #endif 742 n = sglist_count_mbuf_epg(m, 743 mtod(m, vm_offset_t), m->m_len); 744 } else 745 n = sglist_count(mtod(m, void *), m->m_len); 746 747 nsegs += n; 748 plen += m->m_len; 749 750 /* This mbuf sent us _over_ the nsegs limit, back out */ 751 if (plen > max_imm && nsegs > max_nsegs) { 752 nsegs -= n; 753 plen -= m->m_len; 754 if (plen == 0) { 755 /* Too few credits */ 756 toep->flags |= TPF_TX_SUSPENDED; 757 if (sowwakeup) { 758 if (!TAILQ_EMPTY( 759 &toep->aiotx_jobq)) 760 t4_aiotx_queue_toep(so, 761 toep); 762 sowwakeup_locked(so); 763 } else 764 SOCKBUF_UNLOCK(sb); 765 SOCKBUF_UNLOCK_ASSERT(sb); 766 return; 767 } 768 break; 769 } 770 771 if (m->m_flags & M_EXTPG) 772 nomap_mbuf_seen = true; 773 if (max_nsegs_1mbuf < n) 774 max_nsegs_1mbuf = n; 775 sb_sndptr = m; /* new sb->sb_sndptr if all goes well */ 776 777 /* This mbuf put us right at the max_nsegs limit */ 778 if (plen > max_imm && nsegs == max_nsegs) { 779 m = m->m_next; 780 break; 781 } 782 } 783 784 if (sbused(sb) > sb->sb_hiwat * 5 / 8 && 785 toep->plen_nocompl + plen >= sb->sb_hiwat / 4) 786 compl = 1; 787 else 788 compl = 0; 789 790 if (sb->sb_flags & SB_AUTOSIZE && 791 V_tcp_do_autosndbuf && 792 sb->sb_hiwat < V_tcp_autosndbuf_max && 793 sbused(sb) >= sb->sb_hiwat * 7 / 8) { 794 int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc, 795 V_tcp_autosndbuf_max); 796 797 if (!sbreserve_locked(sb, newsize, so, NULL)) 798 sb->sb_flags &= ~SB_AUTOSIZE; 799 else 800 sowwakeup = 1; /* room available */ 801 } 802 if (sowwakeup) { 803 if (!TAILQ_EMPTY(&toep->aiotx_jobq)) 804 t4_aiotx_queue_toep(so, toep); 805 sowwakeup_locked(so); 806 } else 807 SOCKBUF_UNLOCK(sb); 808 SOCKBUF_UNLOCK_ASSERT(sb); 809 810 /* nothing to send */ 811 if (plen == 0) { 812 KASSERT(m == NULL || (m->m_flags & M_NOTAVAIL) != 0, 813 ("%s: nothing to send, but m != NULL is ready", 814 __func__)); 815 break; 816 } 817 818 if (__predict_false(toep->flags & TPF_FIN_SENT)) 819 panic("%s: excess tx.", __func__); 820 821 shove = m == NULL && !(tp->t_flags & TF_MORETOCOME); 822 if (plen <= max_imm && !nomap_mbuf_seen) { 823 824 /* Immediate data tx */ 825 826 wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16), 827 &toep->ofld_txq->wrq); 828 if (wr == NULL) { 829 /* XXX: how will we recover from this? */ 830 toep->flags |= TPF_TX_SUSPENDED; 831 return; 832 } 833 txwr = wrtod(wr); 834 credits = howmany(wr->wr_len, 16); 835 write_tx_wr(txwr, toep, plen, plen, credits, shove, 0); 836 m_copydata(sndptr, 0, plen, (void *)(txwr + 1)); 837 nsegs = 0; 838 } else { 839 int wr_len; 840 841 /* DSGL tx */ 842 843 wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) + 844 ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8; 845 wr = alloc_wrqe(roundup2(wr_len, 16), 846 &toep->ofld_txq->wrq); 847 if (wr == NULL) { 848 /* XXX: how will we recover from this? 
*/ 849 toep->flags |= TPF_TX_SUSPENDED; 850 return; 851 } 852 txwr = wrtod(wr); 853 credits = howmany(wr_len, 16); 854 write_tx_wr(txwr, toep, 0, plen, credits, shove, 0); 855 write_tx_sgl(txwr + 1, sndptr, m, nsegs, 856 max_nsegs_1mbuf); 857 if (wr_len & 0xf) { 858 uint64_t *pad = (uint64_t *) 859 ((uintptr_t)txwr + wr_len); 860 *pad = 0; 861 } 862 } 863 864 KASSERT(toep->tx_credits >= credits, 865 ("%s: not enough credits", __func__)); 866 867 toep->tx_credits -= credits; 868 toep->tx_nocompl += credits; 869 toep->plen_nocompl += plen; 870 if (toep->tx_credits <= toep->tx_total * 3 / 8 && 871 toep->tx_nocompl >= toep->tx_total / 4) 872 compl = 1; 873 874 if (compl || ulp_mode(toep) == ULP_MODE_RDMA) { 875 txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL); 876 toep->tx_nocompl = 0; 877 toep->plen_nocompl = 0; 878 } 879 880 tp->snd_nxt += plen; 881 tp->snd_max += plen; 882 883 SOCKBUF_LOCK(sb); 884 KASSERT(sb_sndptr, ("%s: sb_sndptr is NULL", __func__)); 885 sb->sb_sndptr = sb_sndptr; 886 SOCKBUF_UNLOCK(sb); 887 888 toep->flags |= TPF_TX_DATA_SENT; 889 if (toep->tx_credits < MIN_OFLD_TX_CREDITS) 890 toep->flags |= TPF_TX_SUSPENDED; 891 892 KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); 893 txsd->plen = plen; 894 txsd->tx_credits = credits; 895 txsd++; 896 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) { 897 toep->txsd_pidx = 0; 898 txsd = &toep->txsd[0]; 899 } 900 toep->txsd_avail--; 901 902 t4_l2t_send(sc, wr, toep->l2te); 903 } while (m != NULL && (m->m_flags & M_NOTAVAIL) == 0); 904 905 /* Send a FIN if requested, but only if there's no more data to send */ 906 if (m == NULL && toep->flags & TPF_SEND_FIN) 907 t4_close_conn(sc, toep); 908 } 909 910 static inline void 911 rqdrop_locked(struct mbufq *q, int plen) 912 { 913 struct mbuf *m; 914 915 while (plen > 0) { 916 m = mbufq_dequeue(q); 917 918 /* Too many credits. */ 919 MPASS(m != NULL); 920 M_ASSERTPKTHDR(m); 921 922 /* Partial credits. */ 923 MPASS(plen >= m->m_pkthdr.len); 924 925 plen -= m->m_pkthdr.len; 926 m_freem(m); 927 } 928 } 929 930 static struct wrqe * 931 write_iscsi_mbuf_wr(struct toepcb *toep, struct mbuf *sndptr) 932 { 933 struct mbuf *m; 934 struct fw_ofld_tx_data_wr *txwr; 935 struct wrqe *wr; 936 u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf; 937 u_int adjusted_plen, ulp_submode; 938 struct inpcb *inp = toep->inp; 939 struct tcpcb *tp = intotcpcb(inp); 940 int tx_credits, shove; 941 static const u_int ulp_extra_len[] = {0, 4, 4, 8}; 942 943 M_ASSERTPKTHDR(sndptr); 944 945 tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS); 946 if (mbuf_raw_wr(sndptr)) { 947 plen = sndptr->m_pkthdr.len; 948 KASSERT(plen <= SGE_MAX_WR_LEN, 949 ("raw WR len %u is greater than max WR len", plen)); 950 if (plen > tx_credits * 16) 951 return (NULL); 952 953 wr = alloc_wrqe(roundup2(plen, 16), &toep->ofld_txq->wrq); 954 if (__predict_false(wr == NULL)) 955 return (NULL); 956 957 m_copydata(sndptr, 0, plen, wrtod(wr)); 958 return (wr); 959 } 960 961 max_imm = max_imm_payload(tx_credits); 962 max_nsegs = max_dsgl_nsegs(tx_credits); 963 964 plen = 0; 965 nsegs = 0; 966 max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */ 967 for (m = sndptr; m != NULL; m = m->m_next) { 968 int n = sglist_count(mtod(m, void *), m->m_len); 969 970 nsegs += n; 971 plen += m->m_len; 972 973 /* 974 * This mbuf would send us _over_ the nsegs limit. 975 * Suspend tx because the PDU can't be sent out. 
976 */ 977 if (plen > max_imm && nsegs > max_nsegs) 978 return (NULL); 979 980 if (max_nsegs_1mbuf < n) 981 max_nsegs_1mbuf = n; 982 } 983 984 if (__predict_false(toep->flags & TPF_FIN_SENT)) 985 panic("%s: excess tx.", __func__); 986 987 /* 988 * We have a PDU to send. All of it goes out in one WR so 'm' 989 * is NULL. A PDU's length is always a multiple of 4. 990 */ 991 MPASS(m == NULL); 992 MPASS((plen & 3) == 0); 993 MPASS(sndptr->m_pkthdr.len == plen); 994 995 shove = !(tp->t_flags & TF_MORETOCOME); 996 ulp_submode = mbuf_ulp_submode(sndptr); 997 MPASS(ulp_submode < nitems(ulp_extra_len)); 998 999 /* 1000 * plen doesn't include header and data digests, which are 1001 * generated and inserted in the right places by the TOE, but 1002 * they do occupy TCP sequence space and need to be accounted 1003 * for. 1004 */ 1005 adjusted_plen = plen + ulp_extra_len[ulp_submode]; 1006 if (plen <= max_imm) { 1007 1008 /* Immediate data tx */ 1009 1010 wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16), 1011 &toep->ofld_txq->wrq); 1012 if (wr == NULL) { 1013 /* XXX: how will we recover from this? */ 1014 return (NULL); 1015 } 1016 txwr = wrtod(wr); 1017 credits = howmany(wr->wr_len, 16); 1018 write_tx_wr(txwr, toep, plen, adjusted_plen, credits, 1019 shove, ulp_submode); 1020 m_copydata(sndptr, 0, plen, (void *)(txwr + 1)); 1021 nsegs = 0; 1022 } else { 1023 int wr_len; 1024 1025 /* DSGL tx */ 1026 wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) + 1027 ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8; 1028 wr = alloc_wrqe(roundup2(wr_len, 16), 1029 &toep->ofld_txq->wrq); 1030 if (wr == NULL) { 1031 /* XXX: how will we recover from this? */ 1032 return (NULL); 1033 } 1034 txwr = wrtod(wr); 1035 credits = howmany(wr_len, 16); 1036 write_tx_wr(txwr, toep, 0, adjusted_plen, credits, 1037 shove, ulp_submode); 1038 write_tx_sgl(txwr + 1, sndptr, m, nsegs, max_nsegs_1mbuf); 1039 if (wr_len & 0xf) { 1040 uint64_t *pad = (uint64_t *)((uintptr_t)txwr + wr_len); 1041 *pad = 0; 1042 } 1043 } 1044 1045 tp->snd_nxt += adjusted_plen; 1046 tp->snd_max += adjusted_plen; 1047 1048 counter_u64_add(toep->ofld_txq->tx_iscsi_pdus, 1); 1049 counter_u64_add(toep->ofld_txq->tx_iscsi_octets, plen); 1050 1051 return (wr); 1052 } 1053 1054 void 1055 t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop) 1056 { 1057 struct mbuf *sndptr, *m; 1058 struct fw_wr_hdr *wrhdr; 1059 struct wrqe *wr; 1060 u_int plen, credits; 1061 struct inpcb *inp = toep->inp; 1062 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; 1063 struct mbufq *pduq = &toep->ulp_pduq; 1064 1065 INP_WLOCK_ASSERT(inp); 1066 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 1067 ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid)); 1068 KASSERT(ulp_mode(toep) == ULP_MODE_ISCSI, 1069 ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep)); 1070 1071 if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) 1072 return; 1073 1074 /* 1075 * This function doesn't resume by itself. Someone else must clear the 1076 * flag and call this function. 1077 */ 1078 if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) { 1079 KASSERT(drop == 0, 1080 ("%s: drop (%d) != 0 but tx is suspended", __func__, drop)); 1081 return; 1082 } 1083 1084 if (drop) { 1085 struct socket *so = inp->inp_socket; 1086 struct sockbuf *sb = &so->so_snd; 1087 int sbu; 1088 1089 /* 1090 * An unlocked read is ok here as the data should only 1091 * transition from a non-zero value to either another 1092 * non-zero value or zero. Once it is zero it should 1093 * stay zero. 
1094 */ 1095 if (__predict_false(sbused(sb)) > 0) { 1096 SOCKBUF_LOCK(sb); 1097 sbu = sbused(sb); 1098 if (sbu > 0) { 1099 /* 1100 * The data transmitted before the 1101 * tid's ULP mode changed to ISCSI is 1102 * still in so_snd. Incoming credits 1103 * should account for so_snd first. 1104 */ 1105 sbdrop_locked(sb, min(sbu, drop)); 1106 drop -= min(sbu, drop); 1107 } 1108 sowwakeup_locked(so); /* unlocks so_snd */ 1109 } 1110 rqdrop_locked(&toep->ulp_pdu_reclaimq, drop); 1111 } 1112 1113 while ((sndptr = mbufq_first(pduq)) != NULL) { 1114 wr = write_iscsi_mbuf_wr(toep, sndptr); 1115 if (wr == NULL) { 1116 toep->flags |= TPF_TX_SUSPENDED; 1117 return; 1118 } 1119 1120 plen = sndptr->m_pkthdr.len; 1121 credits = howmany(wr->wr_len, 16); 1122 KASSERT(toep->tx_credits >= credits, 1123 ("%s: not enough credits", __func__)); 1124 1125 m = mbufq_dequeue(pduq); 1126 MPASS(m == sndptr); 1127 mbufq_enqueue(&toep->ulp_pdu_reclaimq, m); 1128 1129 toep->tx_credits -= credits; 1130 toep->tx_nocompl += credits; 1131 toep->plen_nocompl += plen; 1132 1133 /* 1134 * Ensure there are enough credits for a full-sized WR 1135 * as page pod WRs can be full-sized. 1136 */ 1137 if (toep->tx_credits <= SGE_MAX_WR_LEN * 5 / 4 && 1138 toep->tx_nocompl >= toep->tx_total / 4) { 1139 wrhdr = wrtod(wr); 1140 wrhdr->hi |= htobe32(F_FW_WR_COMPL); 1141 toep->tx_nocompl = 0; 1142 toep->plen_nocompl = 0; 1143 } 1144 1145 toep->flags |= TPF_TX_DATA_SENT; 1146 if (toep->tx_credits < MIN_OFLD_TX_CREDITS) 1147 toep->flags |= TPF_TX_SUSPENDED; 1148 1149 KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); 1150 txsd->plen = plen; 1151 txsd->tx_credits = credits; 1152 txsd++; 1153 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) { 1154 toep->txsd_pidx = 0; 1155 txsd = &toep->txsd[0]; 1156 } 1157 toep->txsd_avail--; 1158 1159 t4_l2t_send(sc, wr, toep->l2te); 1160 } 1161 1162 /* Send a FIN if requested, but only if there are no more PDUs to send */ 1163 if (mbufq_first(pduq) == NULL && toep->flags & TPF_SEND_FIN) 1164 t4_close_conn(sc, toep); 1165 } 1166 1167 static inline void 1168 t4_push_data(struct adapter *sc, struct toepcb *toep, int drop) 1169 { 1170 1171 if (ulp_mode(toep) == ULP_MODE_ISCSI) 1172 t4_push_pdus(sc, toep, drop); 1173 else if (toep->flags & TPF_KTLS) 1174 t4_push_ktls(sc, toep, drop); 1175 else 1176 t4_push_frames(sc, toep, drop); 1177 } 1178 1179 int 1180 t4_tod_output(struct toedev *tod, struct tcpcb *tp) 1181 { 1182 struct adapter *sc = tod->tod_softc; 1183 #ifdef INVARIANTS 1184 struct inpcb *inp = tp->t_inpcb; 1185 #endif 1186 struct toepcb *toep = tp->t_toe; 1187 1188 INP_WLOCK_ASSERT(inp); 1189 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1190 ("%s: inp %p dropped.", __func__, inp)); 1191 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 1192 1193 t4_push_data(sc, toep, 0); 1194 1195 return (0); 1196 } 1197 1198 int 1199 t4_send_fin(struct toedev *tod, struct tcpcb *tp) 1200 { 1201 struct adapter *sc = tod->tod_softc; 1202 #ifdef INVARIANTS 1203 struct inpcb *inp = tp->t_inpcb; 1204 #endif 1205 struct toepcb *toep = tp->t_toe; 1206 1207 INP_WLOCK_ASSERT(inp); 1208 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1209 ("%s: inp %p dropped.", __func__, inp)); 1210 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 1211 1212 toep->flags |= TPF_SEND_FIN; 1213 if (tp->t_state >= TCPS_ESTABLISHED) 1214 t4_push_data(sc, toep, 0); 1215 1216 return (0); 1217 } 1218 1219 int 1220 t4_send_rst(struct toedev *tod, struct tcpcb *tp) 1221 { 1222 struct adapter *sc = tod->tod_softc; 1223 #if 
defined(INVARIANTS) 1224 struct inpcb *inp = tp->t_inpcb; 1225 #endif 1226 struct toepcb *toep = tp->t_toe; 1227 1228 INP_WLOCK_ASSERT(inp); 1229 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1230 ("%s: inp %p dropped.", __func__, inp)); 1231 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 1232 1233 /* hmmmm */ 1234 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 1235 ("%s: flowc for tid %u [%s] not sent already", 1236 __func__, toep->tid, tcpstates[tp->t_state])); 1237 1238 send_reset(sc, toep, 0); 1239 return (0); 1240 } 1241 1242 /* 1243 * Peer has sent us a FIN. 1244 */ 1245 static int 1246 do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1247 { 1248 struct adapter *sc = iq->adapter; 1249 const struct cpl_peer_close *cpl = (const void *)(rss + 1); 1250 unsigned int tid = GET_TID(cpl); 1251 struct toepcb *toep = lookup_tid(sc, tid); 1252 struct inpcb *inp = toep->inp; 1253 struct tcpcb *tp = NULL; 1254 struct socket *so; 1255 struct epoch_tracker et; 1256 #ifdef INVARIANTS 1257 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1258 #endif 1259 1260 KASSERT(opcode == CPL_PEER_CLOSE, 1261 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1262 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1263 1264 if (__predict_false(toep->flags & TPF_SYNQE)) { 1265 /* 1266 * do_pass_establish must have run before do_peer_close and if 1267 * this is still a synqe instead of a toepcb then the connection 1268 * must be getting aborted. 1269 */ 1270 MPASS(toep->flags & TPF_ABORT_SHUTDOWN); 1271 CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid, 1272 toep, toep->flags); 1273 return (0); 1274 } 1275 1276 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1277 1278 CURVNET_SET(toep->vnet); 1279 NET_EPOCH_ENTER(et); 1280 INP_WLOCK(inp); 1281 tp = intotcpcb(inp); 1282 1283 CTR6(KTR_CXGBE, 1284 "%s: tid %u (%s), toep_flags 0x%x, ddp_flags 0x%x, inp %p", 1285 __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags, 1286 toep->ddp.flags, inp); 1287 1288 if (toep->flags & TPF_ABORT_SHUTDOWN) 1289 goto done; 1290 1291 if (ulp_mode(toep) == ULP_MODE_RDMA || 1292 (ulp_mode(toep) == ULP_MODE_ISCSI && chip_id(sc) >= CHELSIO_T6)) { 1293 /* 1294 * There might be data received via DDP before the FIN 1295 * not reported to the driver. Just assume the 1296 * sequence number in the CPL is correct as the 1297 * sequence number of the FIN. 
1298 */ 1299 } else { 1300 KASSERT(tp->rcv_nxt + 1 == be32toh(cpl->rcv_nxt), 1301 ("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt, 1302 be32toh(cpl->rcv_nxt))); 1303 } 1304 1305 tp->rcv_nxt = be32toh(cpl->rcv_nxt); 1306 1307 so = inp->inp_socket; 1308 socantrcvmore(so); 1309 if (ulp_mode(toep) == ULP_MODE_TCPDDP) { 1310 DDP_LOCK(toep); 1311 if (__predict_false(toep->ddp.flags & 1312 (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE))) 1313 handle_ddp_close(toep, tp, cpl->rcv_nxt); 1314 DDP_UNLOCK(toep); 1315 } 1316 1317 switch (tp->t_state) { 1318 case TCPS_SYN_RECEIVED: 1319 tp->t_starttime = ticks; 1320 /* FALLTHROUGH */ 1321 1322 case TCPS_ESTABLISHED: 1323 tcp_state_change(tp, TCPS_CLOSE_WAIT); 1324 break; 1325 1326 case TCPS_FIN_WAIT_1: 1327 tcp_state_change(tp, TCPS_CLOSING); 1328 break; 1329 1330 case TCPS_FIN_WAIT_2: 1331 restore_so_proto(so, inp->inp_vflag & INP_IPV6); 1332 tcp_twstart(tp); 1333 INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ 1334 NET_EPOCH_EXIT(et); 1335 CURVNET_RESTORE(); 1336 1337 INP_WLOCK(inp); 1338 final_cpl_received(toep); 1339 return (0); 1340 1341 default: 1342 log(LOG_ERR, "%s: TID %u received CPL_PEER_CLOSE in state %d\n", 1343 __func__, tid, tp->t_state); 1344 } 1345 done: 1346 INP_WUNLOCK(inp); 1347 NET_EPOCH_EXIT(et); 1348 CURVNET_RESTORE(); 1349 return (0); 1350 } 1351 1352 /* 1353 * Peer has ACK'd our FIN. 1354 */ 1355 static int 1356 do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss, 1357 struct mbuf *m) 1358 { 1359 struct adapter *sc = iq->adapter; 1360 const struct cpl_close_con_rpl *cpl = (const void *)(rss + 1); 1361 unsigned int tid = GET_TID(cpl); 1362 struct toepcb *toep = lookup_tid(sc, tid); 1363 struct inpcb *inp = toep->inp; 1364 struct tcpcb *tp = NULL; 1365 struct socket *so = NULL; 1366 struct epoch_tracker et; 1367 #ifdef INVARIANTS 1368 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1369 #endif 1370 1371 KASSERT(opcode == CPL_CLOSE_CON_RPL, 1372 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1373 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1374 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1375 1376 CURVNET_SET(toep->vnet); 1377 NET_EPOCH_ENTER(et); 1378 INP_WLOCK(inp); 1379 tp = intotcpcb(inp); 1380 1381 CTR4(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x", 1382 __func__, tid, tp ? 
tcpstates[tp->t_state] : "no tp", toep->flags); 1383 1384 if (toep->flags & TPF_ABORT_SHUTDOWN) 1385 goto done; 1386 1387 so = inp->inp_socket; 1388 tp->snd_una = be32toh(cpl->snd_nxt) - 1; /* exclude FIN */ 1389 1390 switch (tp->t_state) { 1391 case TCPS_CLOSING: /* see TCPS_FIN_WAIT_2 in do_peer_close too */ 1392 restore_so_proto(so, inp->inp_vflag & INP_IPV6); 1393 tcp_twstart(tp); 1394 release: 1395 INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ 1396 NET_EPOCH_EXIT(et); 1397 CURVNET_RESTORE(); 1398 1399 INP_WLOCK(inp); 1400 final_cpl_received(toep); /* no more CPLs expected */ 1401 1402 return (0); 1403 case TCPS_LAST_ACK: 1404 if (tcp_close(tp)) 1405 INP_WUNLOCK(inp); 1406 goto release; 1407 1408 case TCPS_FIN_WAIT_1: 1409 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 1410 soisdisconnected(so); 1411 tcp_state_change(tp, TCPS_FIN_WAIT_2); 1412 break; 1413 1414 default: 1415 log(LOG_ERR, 1416 "%s: TID %u received CPL_CLOSE_CON_RPL in state %s\n", 1417 __func__, tid, tcpstates[tp->t_state]); 1418 } 1419 done: 1420 INP_WUNLOCK(inp); 1421 NET_EPOCH_EXIT(et); 1422 CURVNET_RESTORE(); 1423 return (0); 1424 } 1425 1426 void 1427 send_abort_rpl(struct adapter *sc, struct sge_ofld_txq *ofld_txq, int tid, 1428 int rst_status) 1429 { 1430 struct wrqe *wr; 1431 struct cpl_abort_rpl *cpl; 1432 1433 wr = alloc_wrqe(sizeof(*cpl), &ofld_txq->wrq); 1434 if (wr == NULL) { 1435 /* XXX */ 1436 panic("%s: allocation failure.", __func__); 1437 } 1438 cpl = wrtod(wr); 1439 1440 INIT_TP_WR_MIT_CPL(cpl, CPL_ABORT_RPL, tid); 1441 cpl->cmd = rst_status; 1442 1443 t4_wrq_tx(sc, wr); 1444 } 1445 1446 static int 1447 abort_status_to_errno(struct tcpcb *tp, unsigned int abort_reason) 1448 { 1449 switch (abort_reason) { 1450 case CPL_ERR_BAD_SYN: 1451 case CPL_ERR_CONN_RESET: 1452 return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET); 1453 case CPL_ERR_XMIT_TIMEDOUT: 1454 case CPL_ERR_PERSIST_TIMEDOUT: 1455 case CPL_ERR_FINWAIT2_TIMEDOUT: 1456 case CPL_ERR_KEEPALIVE_TIMEDOUT: 1457 return (ETIMEDOUT); 1458 default: 1459 return (EIO); 1460 } 1461 } 1462 1463 /* 1464 * TCP RST from the peer, timeout, or some other such critical error. 1465 */ 1466 static int 1467 do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1468 { 1469 struct adapter *sc = iq->adapter; 1470 const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1); 1471 unsigned int tid = GET_TID(cpl); 1472 struct toepcb *toep = lookup_tid(sc, tid); 1473 struct sge_ofld_txq *ofld_txq = toep->ofld_txq; 1474 struct inpcb *inp; 1475 struct tcpcb *tp; 1476 struct epoch_tracker et; 1477 #ifdef INVARIANTS 1478 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1479 #endif 1480 1481 KASSERT(opcode == CPL_ABORT_REQ_RSS, 1482 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1483 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1484 1485 if (toep->flags & TPF_SYNQE) 1486 return (do_abort_req_synqe(iq, rss, m)); 1487 1488 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1489 1490 if (negative_advice(cpl->status)) { 1491 CTR4(KTR_CXGBE, "%s: negative advice %d for tid %d (0x%x)", 1492 __func__, cpl->status, tid, toep->flags); 1493 return (0); /* Ignore negative advice */ 1494 } 1495 1496 inp = toep->inp; 1497 CURVNET_SET(toep->vnet); 1498 NET_EPOCH_ENTER(et); /* for tcp_close */ 1499 INP_WLOCK(inp); 1500 1501 tp = intotcpcb(inp); 1502 1503 CTR6(KTR_CXGBE, 1504 "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x, status %d", 1505 __func__, tid, tp ? 
tcpstates[tp->t_state] : "no tp", toep->flags, 1506 inp->inp_flags, cpl->status); 1507 1508 /* 1509 * If we'd initiated an abort earlier the reply to it is responsible for 1510 * cleaning up resources. Otherwise we tear everything down right here 1511 * right now. We owe the T4 a CPL_ABORT_RPL no matter what. 1512 */ 1513 if (toep->flags & TPF_ABORT_SHUTDOWN) { 1514 INP_WUNLOCK(inp); 1515 goto done; 1516 } 1517 toep->flags |= TPF_ABORT_SHUTDOWN; 1518 1519 if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) { 1520 struct socket *so = inp->inp_socket; 1521 1522 if (so != NULL) 1523 so_error_set(so, abort_status_to_errno(tp, 1524 cpl->status)); 1525 tp = tcp_close(tp); 1526 if (tp == NULL) 1527 INP_WLOCK(inp); /* re-acquire */ 1528 } 1529 1530 final_cpl_received(toep); 1531 done: 1532 NET_EPOCH_EXIT(et); 1533 CURVNET_RESTORE(); 1534 send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST); 1535 return (0); 1536 } 1537 1538 /* 1539 * Reply to the CPL_ABORT_REQ (send_reset) 1540 */ 1541 static int 1542 do_abort_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1543 { 1544 struct adapter *sc = iq->adapter; 1545 const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1); 1546 unsigned int tid = GET_TID(cpl); 1547 struct toepcb *toep = lookup_tid(sc, tid); 1548 struct inpcb *inp = toep->inp; 1549 #ifdef INVARIANTS 1550 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1551 #endif 1552 1553 KASSERT(opcode == CPL_ABORT_RPL_RSS, 1554 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1555 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1556 1557 if (toep->flags & TPF_SYNQE) 1558 return (do_abort_rpl_synqe(iq, rss, m)); 1559 1560 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1561 1562 CTR5(KTR_CXGBE, "%s: tid %u, toep %p, inp %p, status %d", 1563 __func__, tid, toep, inp, cpl->status); 1564 1565 KASSERT(toep->flags & TPF_ABORT_SHUTDOWN, 1566 ("%s: wasn't expecting abort reply", __func__)); 1567 1568 INP_WLOCK(inp); 1569 final_cpl_received(toep); 1570 1571 return (0); 1572 } 1573 1574 static int 1575 do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1576 { 1577 struct adapter *sc = iq->adapter; 1578 const struct cpl_rx_data *cpl = mtod(m, const void *); 1579 unsigned int tid = GET_TID(cpl); 1580 struct toepcb *toep = lookup_tid(sc, tid); 1581 struct inpcb *inp = toep->inp; 1582 struct tcpcb *tp; 1583 struct socket *so; 1584 struct sockbuf *sb; 1585 struct epoch_tracker et; 1586 int len, rx_credits; 1587 uint32_t ddp_placed = 0; 1588 1589 if (__predict_false(toep->flags & TPF_SYNQE)) { 1590 /* 1591 * do_pass_establish must have run before do_rx_data and if this 1592 * is still a synqe instead of a toepcb then the connection must 1593 * be getting aborted. 
1594 */ 1595 MPASS(toep->flags & TPF_ABORT_SHUTDOWN); 1596 CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid, 1597 toep, toep->flags); 1598 m_freem(m); 1599 return (0); 1600 } 1601 1602 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1603 1604 /* strip off CPL header */ 1605 m_adj(m, sizeof(*cpl)); 1606 len = m->m_pkthdr.len; 1607 1608 INP_WLOCK(inp); 1609 if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) { 1610 CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x", 1611 __func__, tid, len, inp->inp_flags); 1612 INP_WUNLOCK(inp); 1613 m_freem(m); 1614 return (0); 1615 } 1616 1617 tp = intotcpcb(inp); 1618 1619 if (__predict_false(ulp_mode(toep) == ULP_MODE_TLS && 1620 toep->flags & TPF_TLS_RECEIVE)) { 1621 /* Received "raw" data on a TLS socket. */ 1622 CTR3(KTR_CXGBE, "%s: tid %u, raw TLS data (%d bytes)", 1623 __func__, tid, len); 1624 do_rx_data_tls(cpl, toep, m); 1625 return (0); 1626 } 1627 1628 if (__predict_false(tp->rcv_nxt != be32toh(cpl->seq))) 1629 ddp_placed = be32toh(cpl->seq) - tp->rcv_nxt; 1630 1631 tp->rcv_nxt += len; 1632 if (tp->rcv_wnd < len) { 1633 KASSERT(ulp_mode(toep) == ULP_MODE_RDMA, 1634 ("%s: negative window size", __func__)); 1635 } 1636 1637 tp->rcv_wnd -= len; 1638 tp->t_rcvtime = ticks; 1639 1640 if (ulp_mode(toep) == ULP_MODE_TCPDDP) 1641 DDP_LOCK(toep); 1642 so = inp_inpcbtosocket(inp); 1643 sb = &so->so_rcv; 1644 SOCKBUF_LOCK(sb); 1645 1646 if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) { 1647 CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)", 1648 __func__, tid, len); 1649 m_freem(m); 1650 SOCKBUF_UNLOCK(sb); 1651 if (ulp_mode(toep) == ULP_MODE_TCPDDP) 1652 DDP_UNLOCK(toep); 1653 INP_WUNLOCK(inp); 1654 1655 CURVNET_SET(toep->vnet); 1656 NET_EPOCH_ENTER(et); 1657 INP_WLOCK(inp); 1658 tp = tcp_drop(tp, ECONNRESET); 1659 if (tp) 1660 INP_WUNLOCK(inp); 1661 NET_EPOCH_EXIT(et); 1662 CURVNET_RESTORE(); 1663 1664 return (0); 1665 } 1666 1667 /* receive buffer autosize */ 1668 MPASS(toep->vnet == so->so_vnet); 1669 CURVNET_SET(toep->vnet); 1670 if (sb->sb_flags & SB_AUTOSIZE && 1671 V_tcp_do_autorcvbuf && 1672 sb->sb_hiwat < V_tcp_autorcvbuf_max && 1673 len > (sbspace(sb) / 8 * 7)) { 1674 unsigned int hiwat = sb->sb_hiwat; 1675 unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc, 1676 V_tcp_autorcvbuf_max); 1677 1678 if (!sbreserve_locked(sb, newsize, so, NULL)) 1679 sb->sb_flags &= ~SB_AUTOSIZE; 1680 } 1681 1682 if (ulp_mode(toep) == ULP_MODE_TCPDDP) { 1683 int changed = !(toep->ddp.flags & DDP_ON) ^ cpl->ddp_off; 1684 1685 if (toep->ddp.waiting_count != 0 || toep->ddp.active_count != 0) 1686 CTR3(KTR_CXGBE, "%s: tid %u, non-ddp rx (%d bytes)", 1687 __func__, tid, len); 1688 1689 if (changed) { 1690 if (toep->ddp.flags & DDP_SC_REQ) 1691 toep->ddp.flags ^= DDP_ON | DDP_SC_REQ; 1692 else { 1693 KASSERT(cpl->ddp_off == 1, 1694 ("%s: DDP switched on by itself.", 1695 __func__)); 1696 1697 /* Fell out of DDP mode */ 1698 toep->ddp.flags &= ~DDP_ON; 1699 CTR1(KTR_CXGBE, "%s: fell out of DDP mode", 1700 __func__); 1701 1702 insert_ddp_data(toep, ddp_placed); 1703 } 1704 } 1705 1706 if (toep->ddp.flags & DDP_ON) { 1707 /* 1708 * CPL_RX_DATA with DDP on can only be an indicate. 1709 * Start posting queued AIO requests via DDP. The 1710 * payload that arrived in this indicate is appended 1711 * to the socket buffer as usual. 1712 */ 1713 handle_ddp_indicate(toep); 1714 } 1715 } 1716 1717 sbappendstream_locked(sb, m, 0); 1718 rx_credits = sbspace(sb) > tp->rcv_wnd ? 
sbspace(sb) - tp->rcv_wnd : 0; 1719 if (rx_credits > 0 && sbused(sb) + tp->rcv_wnd < sb->sb_lowat) { 1720 rx_credits = send_rx_credits(sc, toep, rx_credits); 1721 tp->rcv_wnd += rx_credits; 1722 tp->rcv_adv += rx_credits; 1723 } 1724 1725 if (ulp_mode(toep) == ULP_MODE_TCPDDP && toep->ddp.waiting_count > 0 && 1726 sbavail(sb) != 0) { 1727 CTR2(KTR_CXGBE, "%s: tid %u queueing AIO task", __func__, 1728 tid); 1729 ddp_queue_toep(toep); 1730 } 1731 sorwakeup_locked(so); 1732 SOCKBUF_UNLOCK_ASSERT(sb); 1733 if (ulp_mode(toep) == ULP_MODE_TCPDDP) 1734 DDP_UNLOCK(toep); 1735 1736 INP_WUNLOCK(inp); 1737 CURVNET_RESTORE(); 1738 return (0); 1739 } 1740 1741 static int 1742 do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1743 { 1744 struct adapter *sc = iq->adapter; 1745 const struct cpl_fw4_ack *cpl = (const void *)(rss + 1); 1746 unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl))); 1747 struct toepcb *toep = lookup_tid(sc, tid); 1748 struct inpcb *inp; 1749 struct tcpcb *tp; 1750 struct socket *so; 1751 uint8_t credits = cpl->credits; 1752 struct ofld_tx_sdesc *txsd; 1753 int plen; 1754 #ifdef INVARIANTS 1755 unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl))); 1756 #endif 1757 1758 /* 1759 * Very unusual case: we'd sent a flowc + abort_req for a synq entry and 1760 * now this comes back carrying the credits for the flowc. 1761 */ 1762 if (__predict_false(toep->flags & TPF_SYNQE)) { 1763 KASSERT(toep->flags & TPF_ABORT_SHUTDOWN, 1764 ("%s: credits for a synq entry %p", __func__, toep)); 1765 return (0); 1766 } 1767 1768 inp = toep->inp; 1769 1770 KASSERT(opcode == CPL_FW4_ACK, 1771 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1772 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1773 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1774 1775 INP_WLOCK(inp); 1776 1777 if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) { 1778 INP_WUNLOCK(inp); 1779 return (0); 1780 } 1781 1782 KASSERT((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0, 1783 ("%s: inp_flags 0x%x", __func__, inp->inp_flags)); 1784 1785 tp = intotcpcb(inp); 1786 1787 if (cpl->flags & CPL_FW4_ACK_FLAGS_SEQVAL) { 1788 tcp_seq snd_una = be32toh(cpl->snd_una); 1789 1790 #ifdef INVARIANTS 1791 if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) { 1792 log(LOG_ERR, 1793 "%s: unexpected seq# %x for TID %u, snd_una %x\n", 1794 __func__, snd_una, toep->tid, tp->snd_una); 1795 } 1796 #endif 1797 1798 if (tp->snd_una != snd_una) { 1799 tp->snd_una = snd_una; 1800 tp->ts_recent_age = tcp_ts_getticks(); 1801 } 1802 } 1803 1804 #ifdef VERBOSE_TRACES 1805 CTR3(KTR_CXGBE, "%s: tid %d credits %u", __func__, tid, credits); 1806 #endif 1807 so = inp->inp_socket; 1808 txsd = &toep->txsd[toep->txsd_cidx]; 1809 plen = 0; 1810 while (credits) { 1811 KASSERT(credits >= txsd->tx_credits, 1812 ("%s: too many (or partial) credits", __func__)); 1813 credits -= txsd->tx_credits; 1814 toep->tx_credits += txsd->tx_credits; 1815 plen += txsd->plen; 1816 txsd++; 1817 toep->txsd_avail++; 1818 KASSERT(toep->txsd_avail <= toep->txsd_total, 1819 ("%s: txsd avail > total", __func__)); 1820 if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) { 1821 txsd = &toep->txsd[0]; 1822 toep->txsd_cidx = 0; 1823 } 1824 } 1825 1826 if (toep->tx_credits == toep->tx_total) { 1827 toep->tx_nocompl = 0; 1828 toep->plen_nocompl = 0; 1829 } 1830 1831 if (toep->flags & TPF_TX_SUSPENDED && 1832 toep->tx_credits >= toep->tx_total / 4) { 1833 #ifdef VERBOSE_TRACES 1834 CTR2(KTR_CXGBE, "%s: 
tid %d calling t4_push_frames", __func__, 1835 tid); 1836 #endif 1837 toep->flags &= ~TPF_TX_SUSPENDED; 1838 CURVNET_SET(toep->vnet); 1839 t4_push_data(sc, toep, plen); 1840 CURVNET_RESTORE(); 1841 } else if (plen > 0) { 1842 struct sockbuf *sb = &so->so_snd; 1843 int sbu; 1844 1845 SOCKBUF_LOCK(sb); 1846 sbu = sbused(sb); 1847 if (ulp_mode(toep) == ULP_MODE_ISCSI) { 1848 if (__predict_false(sbu > 0)) { 1849 /* 1850 * The data transmitted before the 1851 * tid's ULP mode changed to ISCSI is 1852 * still in so_snd. Incoming credits 1853 * should account for so_snd first. 1854 */ 1855 sbdrop_locked(sb, min(sbu, plen)); 1856 plen -= min(sbu, plen); 1857 } 1858 sowwakeup_locked(so); /* unlocks so_snd */ 1859 rqdrop_locked(&toep->ulp_pdu_reclaimq, plen); 1860 } else { 1861 #ifdef VERBOSE_TRACES 1862 CTR3(KTR_CXGBE, "%s: tid %d dropped %d bytes", __func__, 1863 tid, plen); 1864 #endif 1865 sbdrop_locked(sb, plen); 1866 if (!TAILQ_EMPTY(&toep->aiotx_jobq)) 1867 t4_aiotx_queue_toep(so, toep); 1868 sowwakeup_locked(so); /* unlocks so_snd */ 1869 } 1870 SOCKBUF_UNLOCK_ASSERT(sb); 1871 } 1872 1873 INP_WUNLOCK(inp); 1874 1875 return (0); 1876 } 1877 1878 void 1879 t4_set_tcb_field(struct adapter *sc, struct sge_wrq *wrq, struct toepcb *toep, 1880 uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie) 1881 { 1882 struct wrqe *wr; 1883 struct cpl_set_tcb_field *req; 1884 struct ofld_tx_sdesc *txsd; 1885 1886 MPASS((cookie & ~M_COOKIE) == 0); 1887 if (reply) { 1888 MPASS(cookie != CPL_COOKIE_RESERVED); 1889 } 1890 1891 wr = alloc_wrqe(sizeof(*req), wrq); 1892 if (wr == NULL) { 1893 /* XXX */ 1894 panic("%s: allocation failure.", __func__); 1895 } 1896 req = wrtod(wr); 1897 1898 INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, toep->tid); 1899 req->reply_ctrl = htobe16(V_QUEUENO(toep->ofld_rxq->iq.abs_id)); 1900 if (reply == 0) 1901 req->reply_ctrl |= htobe16(F_NO_REPLY); 1902 req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(cookie)); 1903 req->mask = htobe64(mask); 1904 req->val = htobe64(val); 1905 if (wrq->eq.type == EQ_OFLD) { 1906 txsd = &toep->txsd[toep->txsd_pidx]; 1907 txsd->tx_credits = howmany(sizeof(*req), 16); 1908 txsd->plen = 0; 1909 KASSERT(toep->tx_credits >= txsd->tx_credits && 1910 toep->txsd_avail > 0, 1911 ("%s: not enough credits (%d)", __func__, 1912 toep->tx_credits)); 1913 toep->tx_credits -= txsd->tx_credits; 1914 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) 1915 toep->txsd_pidx = 0; 1916 toep->txsd_avail--; 1917 } 1918 1919 t4_wrq_tx(sc, wr); 1920 } 1921 1922 void 1923 t4_init_cpl_io_handlers(void) 1924 { 1925 1926 t4_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close); 1927 t4_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl); 1928 t4_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req); 1929 t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl, 1930 CPL_COOKIE_TOM); 1931 t4_register_cpl_handler(CPL_RX_DATA, do_rx_data); 1932 t4_register_shared_cpl_handler(CPL_FW4_ACK, do_fw4_ack, CPL_COOKIE_TOM); 1933 } 1934 1935 void 1936 t4_uninit_cpl_io_handlers(void) 1937 { 1938 1939 t4_register_cpl_handler(CPL_PEER_CLOSE, NULL); 1940 t4_register_cpl_handler(CPL_CLOSE_CON_RPL, NULL); 1941 t4_register_cpl_handler(CPL_ABORT_REQ_RSS, NULL); 1942 t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, NULL, CPL_COOKIE_TOM); 1943 t4_register_cpl_handler(CPL_RX_DATA, NULL); 1944 t4_register_shared_cpl_handler(CPL_FW4_ACK, NULL, CPL_COOKIE_TOM); 1945 } 1946 1947 /* 1948 * Use the 'backend1' field in AIO jobs to hold an error that should 1949 * be reported when 
the job is completed, the 'backend3' field to 1950 * store the amount of data sent by the AIO job so far, and the 1951 * 'backend4' field to hold a reference count on the job. 1952 * 1953 * Each unmapped mbuf holds a reference on the job as does the queue 1954 * so long as the job is queued. 1955 */ 1956 #define aio_error backend1 1957 #define aio_sent backend3 1958 #define aio_refs backend4 1959 1960 #define jobtotid(job) \ 1961 (((struct toepcb *)(so_sototcpcb((job)->fd_file->f_data)->t_toe))->tid) 1962 1963 static void 1964 aiotx_free_job(struct kaiocb *job) 1965 { 1966 long status; 1967 int error; 1968 1969 if (refcount_release(&job->aio_refs) == 0) 1970 return; 1971 1972 error = (intptr_t)job->aio_error; 1973 status = job->aio_sent; 1974 #ifdef VERBOSE_TRACES 1975 CTR5(KTR_CXGBE, "%s: tid %d completed %p len %ld, error %d", __func__, 1976 jobtotid(job), job, status, error); 1977 #endif 1978 if (error != 0 && status != 0) 1979 error = 0; 1980 if (error == ECANCELED) 1981 aio_cancel(job); 1982 else if (error) 1983 aio_complete(job, -1, error); 1984 else { 1985 job->msgsnd = 1; 1986 aio_complete(job, status, 0); 1987 } 1988 } 1989 1990 static void 1991 aiotx_free_pgs(struct mbuf *m) 1992 { 1993 struct kaiocb *job; 1994 vm_page_t pg; 1995 1996 M_ASSERTEXTPG(m); 1997 job = m->m_ext.ext_arg1; 1998 #ifdef VERBOSE_TRACES 1999 CTR3(KTR_CXGBE, "%s: completed %d bytes for tid %d", __func__, 2000 m->m_len, jobtotid(job)); 2001 #endif 2002 2003 for (int i = 0; i < m->m_epg_npgs; i++) { 2004 pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]); 2005 vm_page_unwire(pg, PQ_ACTIVE); 2006 } 2007 2008 aiotx_free_job(job); 2009 } 2010 2011 /* 2012 * Allocate a chain of unmapped mbufs describing the next 'len' bytes 2013 * of an AIO job. 2014 */ 2015 static struct mbuf * 2016 alloc_aiotx_mbuf(struct kaiocb *job, int len) 2017 { 2018 struct vmspace *vm; 2019 vm_page_t pgs[MBUF_PEXT_MAX_PGS]; 2020 struct mbuf *m, *top, *last; 2021 vm_map_t map; 2022 vm_offset_t start; 2023 int i, mlen, npages, pgoff; 2024 2025 KASSERT(job->aio_sent + len <= job->uaiocb.aio_nbytes, 2026 ("%s(%p, %d): request to send beyond end of buffer", __func__, 2027 job, len)); 2028 2029 /* 2030 * The AIO subsystem will cancel and drain all requests before 2031 * permitting a process to exit or exec, so p_vmspace should 2032 * be stable here. 
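 *
 * As a purely illustrative example of the page accounting done by the
 * loop below (hypothetical numbers, not taken from the code): with
 * 4 KiB pages, a chunk of mlen = 10000 bytes starting at pgoff = 2048
 * is backed by npages = 3 held pages, and m_epg_last_len works out to
 * 10000 - (4096 - 2048) - (3 - 2) * 4096 = 3856 bytes in the last page.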
2033 */ 2034 vm = job->userproc->p_vmspace; 2035 map = &vm->vm_map; 2036 start = (uintptr_t)job->uaiocb.aio_buf + job->aio_sent; 2037 pgoff = start & PAGE_MASK; 2038 2039 top = NULL; 2040 last = NULL; 2041 while (len > 0) { 2042 mlen = imin(len, MBUF_PEXT_MAX_PGS * PAGE_SIZE - pgoff); 2043 KASSERT(mlen == len || ((start + mlen) & PAGE_MASK) == 0, 2044 ("%s: next start (%#jx + %#x) is not page aligned", 2045 __func__, (uintmax_t)start, mlen)); 2046 2047 npages = vm_fault_quick_hold_pages(map, start, mlen, 2048 VM_PROT_WRITE, pgs, nitems(pgs)); 2049 if (npages < 0) 2050 break; 2051 2052 m = mb_alloc_ext_pgs(M_WAITOK, aiotx_free_pgs); 2053 if (m == NULL) { 2054 vm_page_unhold_pages(pgs, npages); 2055 break; 2056 } 2057 2058 m->m_epg_1st_off = pgoff; 2059 m->m_epg_npgs = npages; 2060 if (npages == 1) { 2061 KASSERT(mlen + pgoff <= PAGE_SIZE, 2062 ("%s: single page is too large (off %d len %d)", 2063 __func__, pgoff, mlen)); 2064 m->m_epg_last_len = mlen; 2065 } else { 2066 m->m_epg_last_len = mlen - (PAGE_SIZE - pgoff) - 2067 (npages - 2) * PAGE_SIZE; 2068 } 2069 for (i = 0; i < npages; i++) 2070 m->m_epg_pa[i] = VM_PAGE_TO_PHYS(pgs[i]); 2071 2072 m->m_len = mlen; 2073 m->m_ext.ext_size = npages * PAGE_SIZE; 2074 m->m_ext.ext_arg1 = job; 2075 refcount_acquire(&job->aio_refs); 2076 2077 #ifdef VERBOSE_TRACES 2078 CTR5(KTR_CXGBE, "%s: tid %d, new mbuf %p for job %p, npages %d", 2079 __func__, jobtotid(job), m, job, npages); 2080 #endif 2081 2082 if (top == NULL) 2083 top = m; 2084 else 2085 last->m_next = m; 2086 last = m; 2087 2088 len -= mlen; 2089 start += mlen; 2090 pgoff = 0; 2091 } 2092 2093 return (top); 2094 } 2095 2096 static void 2097 t4_aiotx_process_job(struct toepcb *toep, struct socket *so, struct kaiocb *job) 2098 { 2099 struct sockbuf *sb; 2100 struct file *fp; 2101 struct inpcb *inp; 2102 struct tcpcb *tp; 2103 struct mbuf *m; 2104 int error, len; 2105 bool moretocome, sendmore; 2106 2107 sb = &so->so_snd; 2108 SOCKBUF_UNLOCK(sb); 2109 fp = job->fd_file; 2110 m = NULL; 2111 2112 #ifdef MAC 2113 error = mac_socket_check_send(fp->f_cred, so); 2114 if (error != 0) 2115 goto out; 2116 #endif 2117 2118 /* Inline sosend_generic(). */ 2119 2120 error = sblock(sb, SBL_WAIT); 2121 MPASS(error == 0); 2122 2123 sendanother: 2124 SOCKBUF_LOCK(sb); 2125 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 2126 SOCKBUF_UNLOCK(sb); 2127 sbunlock(sb); 2128 if ((so->so_options & SO_NOSIGPIPE) == 0) { 2129 PROC_LOCK(job->userproc); 2130 kern_psignal(job->userproc, SIGPIPE); 2131 PROC_UNLOCK(job->userproc); 2132 } 2133 error = EPIPE; 2134 goto out; 2135 } 2136 if (so->so_error) { 2137 error = so->so_error; 2138 so->so_error = 0; 2139 SOCKBUF_UNLOCK(sb); 2140 sbunlock(sb); 2141 goto out; 2142 } 2143 if ((so->so_state & SS_ISCONNECTED) == 0) { 2144 SOCKBUF_UNLOCK(sb); 2145 sbunlock(sb); 2146 error = ENOTCONN; 2147 goto out; 2148 } 2149 if (sbspace(sb) < sb->sb_lowat) { 2150 MPASS(job->aio_sent == 0 || !(so->so_state & SS_NBIO)); 2151 2152 /* 2153 * Don't block if there is too little room in the socket 2154 * buffer. Instead, requeue the request. 2155 */ 2156 if (!aio_set_cancel_function(job, t4_aiotx_cancel)) { 2157 SOCKBUF_UNLOCK(sb); 2158 sbunlock(sb); 2159 error = ECANCELED; 2160 goto out; 2161 } 2162 TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list); 2163 SOCKBUF_UNLOCK(sb); 2164 sbunlock(sb); 2165 goto out; 2166 } 2167 2168 /* 2169 * Write as much data as the socket permits, but no more than a 2170 * single sndbuf at a time. 
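 *
 * Worked example with made-up numbers: if sbspace(sb) is 64KB, 1MB of
 * the job is still unsent, and toep->params.sndbuf is 32KB, then the
 * first check leaves len at 64KB (moretocome = true, since data
 * remains) and the second clamps it to 32KB (sendmore = true), so
 * after this chunk is queued the code loops back to 'sendanother'.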
2171 */ 2172 len = sbspace(sb); 2173 if (len > job->uaiocb.aio_nbytes - job->aio_sent) { 2174 len = job->uaiocb.aio_nbytes - job->aio_sent; 2175 moretocome = false; 2176 } else 2177 moretocome = true; 2178 if (len > toep->params.sndbuf) { 2179 len = toep->params.sndbuf; 2180 sendmore = true; 2181 } else 2182 sendmore = false; 2183 2184 if (!TAILQ_EMPTY(&toep->aiotx_jobq)) 2185 moretocome = true; 2186 SOCKBUF_UNLOCK(sb); 2187 MPASS(len != 0); 2188 2189 m = alloc_aiotx_mbuf(job, len); 2190 if (m == NULL) { 2191 sbunlock(sb); 2192 error = EFAULT; 2193 goto out; 2194 } 2195 2196 /* Inlined tcp_usr_send(). */ 2197 2198 inp = toep->inp; 2199 INP_WLOCK(inp); 2200 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 2201 INP_WUNLOCK(inp); 2202 sbunlock(sb); 2203 error = ECONNRESET; 2204 goto out; 2205 } 2206 2207 job->aio_sent += m_length(m, NULL); 2208 2209 sbappendstream(sb, m, 0); 2210 m = NULL; 2211 2212 if (!(inp->inp_flags & INP_DROPPED)) { 2213 tp = intotcpcb(inp); 2214 if (moretocome) 2215 tp->t_flags |= TF_MORETOCOME; 2216 error = tp->t_fb->tfb_tcp_output(tp); 2217 if (moretocome) 2218 tp->t_flags &= ~TF_MORETOCOME; 2219 } 2220 2221 INP_WUNLOCK(inp); 2222 if (sendmore) 2223 goto sendanother; 2224 sbunlock(sb); 2225 2226 if (error) 2227 goto out; 2228 2229 /* 2230 * If this is a blocking socket and the request has not been 2231 * fully completed, requeue it until the socket is ready 2232 * again. 2233 */ 2234 if (job->aio_sent < job->uaiocb.aio_nbytes && 2235 !(so->so_state & SS_NBIO)) { 2236 SOCKBUF_LOCK(sb); 2237 if (!aio_set_cancel_function(job, t4_aiotx_cancel)) { 2238 SOCKBUF_UNLOCK(sb); 2239 error = ECANCELED; 2240 goto out; 2241 } 2242 TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list); 2243 return; 2244 } 2245 2246 /* 2247 * If the request will not be requeued, drop the queue's 2248 * reference to the job. Any mbufs in flight should still 2249 * hold a reference, but this drops the reference that the 2250 * queue owns while it is waiting to queue mbufs to the 2251 * socket. 2252 */ 2253 aiotx_free_job(job); 2254 2255 out: 2256 if (error) { 2257 job->aio_error = (void *)(intptr_t)error; 2258 aiotx_free_job(job); 2259 } 2260 m_freem(m); 2261 SOCKBUF_LOCK(sb); 2262 } 2263 2264 static void 2265 t4_aiotx_task(void *context, int pending) 2266 { 2267 struct toepcb *toep = context; 2268 struct socket *so; 2269 struct kaiocb *job; 2270 2271 so = toep->aiotx_so; 2272 CURVNET_SET(toep->vnet); 2273 SOCKBUF_LOCK(&so->so_snd); 2274 while (!TAILQ_EMPTY(&toep->aiotx_jobq) && sowriteable(so)) { 2275 job = TAILQ_FIRST(&toep->aiotx_jobq); 2276 TAILQ_REMOVE(&toep->aiotx_jobq, job, list); 2277 if (!aio_clear_cancel_function(job)) 2278 continue; 2279 2280 t4_aiotx_process_job(toep, so, job); 2281 } 2282 toep->aiotx_so = NULL; 2283 SOCKBUF_UNLOCK(&so->so_snd); 2284 CURVNET_RESTORE(); 2285 2286 free_toepcb(toep); 2287 SOCK_LOCK(so); 2288 sorele(so); 2289 } 2290 2291 static void 2292 t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep) 2293 { 2294 2295 SOCKBUF_LOCK_ASSERT(&toep->inp->inp_socket->so_snd); 2296 #ifdef VERBOSE_TRACES 2297 CTR3(KTR_CXGBE, "%s: queueing aiotx task for tid %d, active = %s", 2298 __func__, toep->tid, toep->aiotx_so != NULL ? 
"true" : "false"); 2299 #endif 2300 if (toep->aiotx_so != NULL) 2301 return; 2302 soref(so); 2303 toep->aiotx_so = so; 2304 hold_toepcb(toep); 2305 soaio_enqueue(&toep->aiotx_task); 2306 } 2307 2308 static void 2309 t4_aiotx_cancel(struct kaiocb *job) 2310 { 2311 struct socket *so; 2312 struct sockbuf *sb; 2313 struct tcpcb *tp; 2314 struct toepcb *toep; 2315 2316 so = job->fd_file->f_data; 2317 tp = so_sototcpcb(so); 2318 toep = tp->t_toe; 2319 MPASS(job->uaiocb.aio_lio_opcode == LIO_WRITE); 2320 sb = &so->so_snd; 2321 2322 SOCKBUF_LOCK(sb); 2323 if (!aio_cancel_cleared(job)) 2324 TAILQ_REMOVE(&toep->aiotx_jobq, job, list); 2325 SOCKBUF_UNLOCK(sb); 2326 2327 job->aio_error = (void *)(intptr_t)ECANCELED; 2328 aiotx_free_job(job); 2329 } 2330 2331 int 2332 t4_aio_queue_aiotx(struct socket *so, struct kaiocb *job) 2333 { 2334 struct tcpcb *tp = so_sototcpcb(so); 2335 struct toepcb *toep = tp->t_toe; 2336 struct adapter *sc = td_adapter(toep->td); 2337 2338 /* This only handles writes. */ 2339 if (job->uaiocb.aio_lio_opcode != LIO_WRITE) 2340 return (EOPNOTSUPP); 2341 2342 if (!sc->tt.tx_zcopy) 2343 return (EOPNOTSUPP); 2344 2345 if (tls_tx_key(toep)) 2346 return (EOPNOTSUPP); 2347 2348 SOCKBUF_LOCK(&so->so_snd); 2349 #ifdef VERBOSE_TRACES 2350 CTR3(KTR_CXGBE, "%s: queueing %p for tid %u", __func__, job, toep->tid); 2351 #endif 2352 if (!aio_set_cancel_function(job, t4_aiotx_cancel)) 2353 panic("new job was cancelled"); 2354 refcount_init(&job->aio_refs, 1); 2355 TAILQ_INSERT_TAIL(&toep->aiotx_jobq, job, list); 2356 if (sowriteable(so)) 2357 t4_aiotx_queue_toep(so, toep); 2358 SOCKBUF_UNLOCK(&so->so_snd); 2359 return (0); 2360 } 2361 2362 void 2363 aiotx_init_toep(struct toepcb *toep) 2364 { 2365 2366 TAILQ_INIT(&toep->aiotx_jobq); 2367 TASK_INIT(&toep->aiotx_task, 0, t4_aiotx_task, toep); 2368 } 2369 #endif 2370