1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2012, 2015 Chelsio Communications, Inc. 5 * All rights reserved. 6 * Written by: Navdeep Parhar <np@FreeBSD.org> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 #include <sys/cdefs.h> 31 __FBSDID("$FreeBSD$"); 32 33 #include "opt_inet.h" 34 #include "opt_inet6.h" 35 #include "opt_kern_tls.h" 36 #include "opt_ratelimit.h" 37 38 #ifdef TCP_OFFLOAD 39 #include <sys/param.h> 40 #include <sys/aio.h> 41 #include <sys/file.h> 42 #include <sys/kernel.h> 43 #include <sys/ktr.h> 44 #include <sys/module.h> 45 #include <sys/proc.h> 46 #include <sys/protosw.h> 47 #include <sys/domain.h> 48 #include <sys/socket.h> 49 #include <sys/socketvar.h> 50 #include <sys/sglist.h> 51 #include <sys/taskqueue.h> 52 #include <netinet/in.h> 53 #include <netinet/in_pcb.h> 54 #include <netinet/ip.h> 55 #include <netinet/ip6.h> 56 #define TCPSTATES 57 #include <netinet/tcp_fsm.h> 58 #include <netinet/tcp_seq.h> 59 #include <netinet/tcp_var.h> 60 #include <netinet/toecore.h> 61 62 #include <security/mac/mac_framework.h> 63 64 #include <vm/vm.h> 65 #include <vm/vm_extern.h> 66 #include <vm/pmap.h> 67 #include <vm/vm_map.h> 68 #include <vm/vm_page.h> 69 70 #include "common/common.h" 71 #include "common/t4_msg.h" 72 #include "common/t4_regs.h" 73 #include "common/t4_tcb.h" 74 #include "tom/t4_tom_l2t.h" 75 #include "tom/t4_tom.h" 76 77 static void t4_aiotx_cancel(struct kaiocb *job); 78 static void t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep); 79 80 void 81 send_flowc_wr(struct toepcb *toep, struct tcpcb *tp) 82 { 83 struct wrqe *wr; 84 struct fw_flowc_wr *flowc; 85 unsigned int nparams, flowclen, paramidx; 86 struct vi_info *vi = toep->vi; 87 struct port_info *pi = vi->pi; 88 struct adapter *sc = pi->adapter; 89 unsigned int pfvf = sc->pf << S_FW_VIID_PFN; 90 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; 91 92 KASSERT(!(toep->flags & TPF_FLOWC_WR_SENT), 93 ("%s: flowc for tid %u sent already", __func__, toep->tid)); 94 95 if (tp != NULL) 96 nparams = 8; 97 else 98 nparams = 6; 99 if (ulp_mode(toep) == ULP_MODE_TLS) 100 nparams++; 101 if (toep->tls.fcplenmax != 0) 102 nparams++; 103 if (toep->params.tc_idx != -1) { 104 MPASS(toep->params.tc_idx >= 0 && 105 
toep->params.tc_idx < sc->chip_params->nsched_cls); 106 nparams++; 107 } 108 109 flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval); 110 111 wr = alloc_wrqe(roundup2(flowclen, 16), &toep->ofld_txq->wrq); 112 if (wr == NULL) { 113 /* XXX */ 114 panic("%s: allocation failure.", __func__); 115 } 116 flowc = wrtod(wr); 117 memset(flowc, 0, wr->wr_len); 118 119 flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | 120 V_FW_FLOWC_WR_NPARAMS(nparams)); 121 flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) | 122 V_FW_WR_FLOWID(toep->tid)); 123 124 #define FLOWC_PARAM(__m, __v) \ 125 do { \ 126 flowc->mnemval[paramidx].mnemonic = FW_FLOWC_MNEM_##__m; \ 127 flowc->mnemval[paramidx].val = htobe32(__v); \ 128 paramidx++; \ 129 } while (0) 130 131 paramidx = 0; 132 133 FLOWC_PARAM(PFNVFN, pfvf); 134 FLOWC_PARAM(CH, pi->tx_chan); 135 FLOWC_PARAM(PORT, pi->tx_chan); 136 FLOWC_PARAM(IQID, toep->ofld_rxq->iq.abs_id); 137 FLOWC_PARAM(SNDBUF, toep->params.sndbuf); 138 if (tp) { 139 FLOWC_PARAM(MSS, toep->params.emss); 140 FLOWC_PARAM(SNDNXT, tp->snd_nxt); 141 FLOWC_PARAM(RCVNXT, tp->rcv_nxt); 142 } else 143 FLOWC_PARAM(MSS, 512); 144 CTR6(KTR_CXGBE, 145 "%s: tid %u, mss %u, sndbuf %u, snd_nxt 0x%x, rcv_nxt 0x%x", 146 __func__, toep->tid, toep->params.emss, toep->params.sndbuf, 147 tp ? tp->snd_nxt : 0, tp ? tp->rcv_nxt : 0); 148 149 if (ulp_mode(toep) == ULP_MODE_TLS) 150 FLOWC_PARAM(ULP_MODE, ulp_mode(toep)); 151 if (toep->tls.fcplenmax != 0) 152 FLOWC_PARAM(TXDATAPLEN_MAX, toep->tls.fcplenmax); 153 if (toep->params.tc_idx != -1) 154 FLOWC_PARAM(SCHEDCLASS, toep->params.tc_idx); 155 #undef FLOWC_PARAM 156 157 KASSERT(paramidx == nparams, ("nparams mismatch")); 158 159 txsd->tx_credits = howmany(flowclen, 16); 160 txsd->plen = 0; 161 KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0, 162 ("%s: not enough credits (%d)", __func__, toep->tx_credits)); 163 toep->tx_credits -= txsd->tx_credits; 164 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) 165 toep->txsd_pidx = 0; 166 toep->txsd_avail--; 167 168 toep->flags |= TPF_FLOWC_WR_SENT; 169 t4_wrq_tx(sc, wr); 170 } 171 172 #ifdef RATELIMIT 173 /* 174 * Input is Bytes/second (so_max_pacing_rate), chip counts in Kilobits/second. 
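 *
 * For example, a so_max_pacing_rate of 125,000,000 Bytes/second converts to
 * 125000000 * 8 / 1000 = 1,000,000 Kilobits/second before being handed to
 * t4_reserve_cl_rl_kbps() below.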
175 */ 176 static int 177 update_tx_rate_limit(struct adapter *sc, struct toepcb *toep, u_int Bps) 178 { 179 int tc_idx, rc; 180 const u_int kbps = (u_int) (uint64_t)Bps * 8ULL / 1000; 181 const int port_id = toep->vi->pi->port_id; 182 183 CTR3(KTR_CXGBE, "%s: tid %u, rate %uKbps", __func__, toep->tid, kbps); 184 185 if (kbps == 0) { 186 /* unbind */ 187 tc_idx = -1; 188 } else { 189 rc = t4_reserve_cl_rl_kbps(sc, port_id, kbps, &tc_idx); 190 if (rc != 0) 191 return (rc); 192 MPASS(tc_idx >= 0 && tc_idx < sc->chip_params->nsched_cls); 193 } 194 195 if (toep->params.tc_idx != tc_idx) { 196 struct wrqe *wr; 197 struct fw_flowc_wr *flowc; 198 int nparams = 1, flowclen, flowclen16; 199 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; 200 201 flowclen = sizeof(*flowc) + nparams * sizeof(struct 202 fw_flowc_mnemval); 203 flowclen16 = howmany(flowclen, 16); 204 if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0 || 205 (wr = alloc_wrqe(roundup2(flowclen, 16), 206 &toep->ofld_txq->wrq)) == NULL) { 207 if (tc_idx >= 0) 208 t4_release_cl_rl(sc, port_id, tc_idx); 209 return (ENOMEM); 210 } 211 212 flowc = wrtod(wr); 213 memset(flowc, 0, wr->wr_len); 214 215 flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | 216 V_FW_FLOWC_WR_NPARAMS(nparams)); 217 flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) | 218 V_FW_WR_FLOWID(toep->tid)); 219 220 flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS; 221 if (tc_idx == -1) 222 flowc->mnemval[0].val = htobe32(0xff); 223 else 224 flowc->mnemval[0].val = htobe32(tc_idx); 225 226 txsd->tx_credits = flowclen16; 227 txsd->plen = 0; 228 toep->tx_credits -= txsd->tx_credits; 229 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) 230 toep->txsd_pidx = 0; 231 toep->txsd_avail--; 232 t4_wrq_tx(sc, wr); 233 } 234 235 if (toep->params.tc_idx >= 0) 236 t4_release_cl_rl(sc, port_id, toep->params.tc_idx); 237 toep->params.tc_idx = tc_idx; 238 239 return (0); 240 } 241 #endif 242 243 void 244 send_reset(struct adapter *sc, struct toepcb *toep, uint32_t snd_nxt) 245 { 246 struct wrqe *wr; 247 struct cpl_abort_req *req; 248 int tid = toep->tid; 249 struct inpcb *inp = toep->inp; 250 struct tcpcb *tp = intotcpcb(inp); /* don't use if INP_DROPPED */ 251 252 INP_WLOCK_ASSERT(inp); 253 254 CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x%s", 255 __func__, toep->tid, 256 inp->inp_flags & INP_DROPPED ? "inp dropped" : 257 tcpstates[tp->t_state], 258 toep->flags, inp->inp_flags, 259 toep->flags & TPF_ABORT_SHUTDOWN ? 260 " (abort already in progress)" : ""); 261 262 if (toep->flags & TPF_ABORT_SHUTDOWN) 263 return; /* abort already in progress */ 264 265 toep->flags |= TPF_ABORT_SHUTDOWN; 266 267 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 268 ("%s: flowc_wr not sent for tid %d.", __func__, tid)); 269 270 wr = alloc_wrqe(sizeof(*req), &toep->ofld_txq->wrq); 271 if (wr == NULL) { 272 /* XXX */ 273 panic("%s: allocation failure.", __func__); 274 } 275 req = wrtod(wr); 276 277 INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid); 278 if (inp->inp_flags & INP_DROPPED) 279 req->rsvd0 = htobe32(snd_nxt); 280 else 281 req->rsvd0 = htobe32(tp->snd_nxt); 282 req->rsvd1 = !(toep->flags & TPF_TX_DATA_SENT); 283 req->cmd = CPL_ABORT_SEND_RST; 284 285 /* 286 * XXX: What's the correct way to tell that the inp hasn't been detached 287 * from its socket? Should I even be flushing the snd buffer here? 288 */ 289 if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) { 290 struct socket *so = inp->inp_socket; 291 292 if (so != NULL) /* because I'm not sure. 
See comment above */ 293 sbflush(&so->so_snd); 294 } 295 296 t4_l2t_send(sc, wr, toep->l2te); 297 } 298 299 /* 300 * Called when a connection is established to translate the TCP options 301 * reported by HW to FreeBSD's native format. 302 */ 303 static void 304 assign_rxopt(struct tcpcb *tp, uint16_t opt) 305 { 306 struct toepcb *toep = tp->t_toe; 307 struct inpcb *inp = tp->t_inpcb; 308 struct adapter *sc = td_adapter(toep->td); 309 310 INP_LOCK_ASSERT(inp); 311 312 toep->params.mtu_idx = G_TCPOPT_MSS(opt); 313 tp->t_maxseg = sc->params.mtus[toep->params.mtu_idx]; 314 if (inp->inp_inc.inc_flags & INC_ISIPV6) 315 tp->t_maxseg -= sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 316 else 317 tp->t_maxseg -= sizeof(struct ip) + sizeof(struct tcphdr); 318 319 toep->params.emss = tp->t_maxseg; 320 if (G_TCPOPT_TSTAMP(opt)) { 321 toep->params.tstamp = 1; 322 toep->params.emss -= TCPOLEN_TSTAMP_APPA; 323 tp->t_flags |= TF_RCVD_TSTMP; /* timestamps ok */ 324 tp->ts_recent = 0; /* hmmm */ 325 tp->ts_recent_age = tcp_ts_getticks(); 326 } else 327 toep->params.tstamp = 0; 328 329 if (G_TCPOPT_SACK(opt)) { 330 toep->params.sack = 1; 331 tp->t_flags |= TF_SACK_PERMIT; /* should already be set */ 332 } else { 333 toep->params.sack = 0; 334 tp->t_flags &= ~TF_SACK_PERMIT; /* sack disallowed by peer */ 335 } 336 337 if (G_TCPOPT_WSCALE_OK(opt)) 338 tp->t_flags |= TF_RCVD_SCALE; 339 340 /* Doing window scaling? */ 341 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 342 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 343 tp->rcv_scale = tp->request_r_scale; 344 tp->snd_scale = G_TCPOPT_SND_WSCALE(opt); 345 } else 346 toep->params.wscale = 0; 347 348 CTR6(KTR_CXGBE, 349 "assign_rxopt: tid %d, mtu_idx %u, emss %u, ts %u, sack %u, wscale %u", 350 toep->tid, toep->params.mtu_idx, toep->params.emss, 351 toep->params.tstamp, toep->params.sack, toep->params.wscale); 352 } 353 354 /* 355 * Completes some final bits of initialization for just established connections 356 * and changes their state to TCPS_ESTABLISHED. 357 * 358 * The ISNs are from the exchange of SYNs. 
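 *
 * Roughly: the iss/irs from the SYN exchange seed the send and receive
 * sequence numbers, the receive window is derived from opt0_bufsize, the
 * TCP options reported by the hardware are translated by assign_rxopt(),
 * and a flowc work request is sent before the socket is marked connected.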
359 */ 360 void 361 make_established(struct toepcb *toep, uint32_t iss, uint32_t irs, uint16_t opt) 362 { 363 struct inpcb *inp = toep->inp; 364 struct socket *so = inp->inp_socket; 365 struct tcpcb *tp = intotcpcb(inp); 366 uint16_t tcpopt = be16toh(opt); 367 368 INP_WLOCK_ASSERT(inp); 369 KASSERT(tp->t_state == TCPS_SYN_SENT || 370 tp->t_state == TCPS_SYN_RECEIVED, 371 ("%s: TCP state %s", __func__, tcpstates[tp->t_state])); 372 373 CTR6(KTR_CXGBE, "%s: tid %d, so %p, inp %p, tp %p, toep %p", 374 __func__, toep->tid, so, inp, tp, toep); 375 376 tcp_state_change(tp, TCPS_ESTABLISHED); 377 tp->t_starttime = ticks; 378 TCPSTAT_INC(tcps_connects); 379 380 tp->irs = irs; 381 tcp_rcvseqinit(tp); 382 tp->rcv_wnd = (u_int)toep->params.opt0_bufsize << 10; 383 tp->rcv_adv += tp->rcv_wnd; 384 tp->last_ack_sent = tp->rcv_nxt; 385 386 tp->iss = iss; 387 tcp_sendseqinit(tp); 388 tp->snd_una = iss + 1; 389 tp->snd_nxt = iss + 1; 390 tp->snd_max = iss + 1; 391 392 assign_rxopt(tp, tcpopt); 393 send_flowc_wr(toep, tp); 394 395 soisconnected(so); 396 397 if (ulp_mode(toep) == ULP_MODE_TLS) 398 tls_establish(toep); 399 } 400 401 int 402 send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits) 403 { 404 struct wrqe *wr; 405 struct cpl_rx_data_ack *req; 406 uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); 407 408 KASSERT(credits >= 0, ("%s: %d credits", __func__, credits)); 409 410 wr = alloc_wrqe(sizeof(*req), toep->ctrlq); 411 if (wr == NULL) 412 return (0); 413 req = wrtod(wr); 414 415 INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid); 416 req->credit_dack = htobe32(dack | V_RX_CREDITS(credits)); 417 418 t4_wrq_tx(sc, wr); 419 return (credits); 420 } 421 422 void 423 send_rx_modulate(struct adapter *sc, struct toepcb *toep) 424 { 425 struct wrqe *wr; 426 struct cpl_rx_data_ack *req; 427 428 wr = alloc_wrqe(sizeof(*req), toep->ctrlq); 429 if (wr == NULL) 430 return; 431 req = wrtod(wr); 432 433 INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid); 434 req->credit_dack = htobe32(F_RX_MODULATE_RX); 435 436 t4_wrq_tx(sc, wr); 437 } 438 439 void 440 t4_rcvd_locked(struct toedev *tod, struct tcpcb *tp) 441 { 442 struct adapter *sc = tod->tod_softc; 443 struct inpcb *inp = tp->t_inpcb; 444 struct socket *so = inp->inp_socket; 445 struct sockbuf *sb = &so->so_rcv; 446 struct toepcb *toep = tp->t_toe; 447 int rx_credits; 448 449 INP_WLOCK_ASSERT(inp); 450 SOCKBUF_LOCK_ASSERT(sb); 451 452 rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0; 453 if (rx_credits > 0 && 454 (tp->rcv_wnd <= 32 * 1024 || rx_credits >= 64 * 1024 || 455 (rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) || 456 sbused(sb) + tp->rcv_wnd < sb->sb_lowat)) { 457 rx_credits = send_rx_credits(sc, toep, rx_credits); 458 tp->rcv_wnd += rx_credits; 459 tp->rcv_adv += rx_credits; 460 } else if (toep->flags & TPF_FORCE_CREDITS) 461 send_rx_modulate(sc, toep); 462 } 463 464 void 465 t4_rcvd(struct toedev *tod, struct tcpcb *tp) 466 { 467 struct inpcb *inp = tp->t_inpcb; 468 struct socket *so = inp->inp_socket; 469 struct sockbuf *sb = &so->so_rcv; 470 471 SOCKBUF_LOCK(sb); 472 t4_rcvd_locked(tod, tp); 473 SOCKBUF_UNLOCK(sb); 474 } 475 476 /* 477 * Close a connection by sending a CPL_CLOSE_CON_REQ message. 478 */ 479 int 480 t4_close_conn(struct adapter *sc, struct toepcb *toep) 481 { 482 struct wrqe *wr; 483 struct cpl_close_con_req *req; 484 unsigned int tid = toep->tid; 485 486 CTR3(KTR_CXGBE, "%s: tid %u%s", __func__, toep->tid, 487 toep->flags & TPF_FIN_SENT ? 
", IGNORED" : ""); 488 489 if (toep->flags & TPF_FIN_SENT) 490 return (0); 491 492 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 493 ("%s: flowc_wr not sent for tid %u.", __func__, tid)); 494 495 wr = alloc_wrqe(sizeof(*req), &toep->ofld_txq->wrq); 496 if (wr == NULL) { 497 /* XXX */ 498 panic("%s: allocation failure.", __func__); 499 } 500 req = wrtod(wr); 501 502 req->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) | 503 V_FW_WR_IMMDLEN(sizeof(*req) - sizeof(req->wr))); 504 req->wr.wr_mid = htonl(V_FW_WR_LEN16(howmany(sizeof(*req), 16)) | 505 V_FW_WR_FLOWID(tid)); 506 req->wr.wr_lo = cpu_to_be64(0); 507 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid)); 508 req->rsvd = 0; 509 510 toep->flags |= TPF_FIN_SENT; 511 toep->flags &= ~TPF_SEND_FIN; 512 t4_l2t_send(sc, wr, toep->l2te); 513 514 return (0); 515 } 516 517 #define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16) 518 #define MIN_OFLD_TX_CREDITS (howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16)) 519 520 /* Maximum amount of immediate data we could stuff in a WR */ 521 static inline int 522 max_imm_payload(int tx_credits) 523 { 524 const int n = 1; /* Use no more than one desc for imm. data WR */ 525 526 KASSERT(tx_credits >= 0 && 527 tx_credits <= MAX_OFLD_TX_CREDITS, 528 ("%s: %d credits", __func__, tx_credits)); 529 530 if (tx_credits < MIN_OFLD_TX_CREDITS) 531 return (0); 532 533 if (tx_credits >= (n * EQ_ESIZE) / 16) 534 return ((n * EQ_ESIZE) - sizeof(struct fw_ofld_tx_data_wr)); 535 else 536 return (tx_credits * 16 - sizeof(struct fw_ofld_tx_data_wr)); 537 } 538 539 /* Maximum number of SGL entries we could stuff in a WR */ 540 static inline int 541 max_dsgl_nsegs(int tx_credits) 542 { 543 int nseg = 1; /* ulptx_sgl has room for 1, rest ulp_tx_sge_pair */ 544 int sge_pair_credits = tx_credits - MIN_OFLD_TX_CREDITS; 545 546 KASSERT(tx_credits >= 0 && 547 tx_credits <= MAX_OFLD_TX_CREDITS, 548 ("%s: %d credits", __func__, tx_credits)); 549 550 if (tx_credits < MIN_OFLD_TX_CREDITS) 551 return (0); 552 553 nseg += 2 * (sge_pair_credits * 16 / 24); 554 if ((sge_pair_credits * 16) % 24 == 16) 555 nseg++; 556 557 return (nseg); 558 } 559 560 static inline void 561 write_tx_wr(void *dst, struct toepcb *toep, unsigned int immdlen, 562 unsigned int plen, uint8_t credits, int shove, int ulp_submode) 563 { 564 struct fw_ofld_tx_data_wr *txwr = dst; 565 566 txwr->op_to_immdlen = htobe32(V_WR_OP(FW_OFLD_TX_DATA_WR) | 567 V_FW_WR_IMMDLEN(immdlen)); 568 txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) | 569 V_FW_WR_LEN16(credits)); 570 txwr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(ulp_mode(toep)) | 571 V_TX_ULP_SUBMODE(ulp_submode) | V_TX_URG(0) | V_TX_SHOVE(shove)); 572 txwr->plen = htobe32(plen); 573 574 if (toep->params.tx_align > 0) { 575 if (plen < 2 * toep->params.emss) 576 txwr->lsodisable_to_flags |= 577 htobe32(F_FW_OFLD_TX_DATA_WR_LSODISABLE); 578 else 579 txwr->lsodisable_to_flags |= 580 htobe32(F_FW_OFLD_TX_DATA_WR_ALIGNPLD | 581 (toep->params.nagle == 0 ? 0 : 582 F_FW_OFLD_TX_DATA_WR_ALIGNPLDSHOVE)); 583 } 584 } 585 586 /* 587 * Generate a DSGL from a starting mbuf. The total number of segments and the 588 * maximum segments in any one mbuf are provided. 
 */
static void
write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n)
{
	struct mbuf *m;
	struct ulptx_sgl *usgl = dst;
	int i, j, rc;
	struct sglist sg;
	struct sglist_seg segs[n];

	KASSERT(nsegs > 0, ("%s: nsegs 0", __func__));

	sglist_init(&sg, n, segs);
	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
	    V_ULPTX_NSGE(nsegs));

	i = -1;
	for (m = start; m != stop; m = m->m_next) {
		if (m->m_flags & M_EXTPG)
			rc = sglist_append_mbuf_epg(&sg, m,
			    mtod(m, vm_offset_t), m->m_len);
		else
			rc = sglist_append(&sg, mtod(m, void *), m->m_len);
		if (__predict_false(rc != 0))
			panic("%s: sglist_append %d", __func__, rc);

		for (j = 0; j < sg.sg_nseg; i++, j++) {
			if (i < 0) {
				usgl->len0 = htobe32(segs[j].ss_len);
				usgl->addr0 = htobe64(segs[j].ss_paddr);
			} else {
				usgl->sge[i / 2].len[i & 1] =
				    htobe32(segs[j].ss_len);
				usgl->sge[i / 2].addr[i & 1] =
				    htobe64(segs[j].ss_paddr);
			}
#ifdef INVARIANTS
			nsegs--;
#endif
		}
		sglist_reset(&sg);
	}
	if (i & 1)
		usgl->sge[i / 2].len[1] = htobe32(0);
	KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, stop %p",
	    __func__, nsegs, start, stop));
}

/*
 * Max number of SGL entries an offload tx work request can have. This is 41
 * (1 + 40) for a full 512B work request.
 * fw_ofld_tx_data_wr(16B) + ulptx_sgl(16B, 1) + ulptx_sge_pair(480B, 40)
 */
#define OFLD_SGL_LEN (41)

/*
 * Send data and/or a FIN to the peer.
 *
 * The socket's so_snd buffer consists of a stream of data starting with sb_mb
 * and linked together with m_next. sb_sndptr, if set, is the last mbuf that
 * was transmitted.
 *
 * drop indicates the number of bytes that should be dropped from the head of
 * the send buffer. It is an optimization that lets do_fw4_ack avoid creating
 * contention on the send buffer lock (before this change it used to do
 * sowwakeup and then t4_push_frames right after that when recovering from tx
 * stalls). When drop is set this function MUST drop the bytes and wake up any
 * writers.
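 *
 * Note that tx credits here and below are in units of 16 bytes of work
 * request (see MAX_OFLD_TX_CREDITS and the howmany(..., 16) calls), so a
 * txsd's tx_credits is simply the WR length rounded up to 16-byte chunks.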
657 */ 658 void 659 t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop) 660 { 661 struct mbuf *sndptr, *m, *sb_sndptr; 662 struct fw_ofld_tx_data_wr *txwr; 663 struct wrqe *wr; 664 u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf; 665 struct inpcb *inp = toep->inp; 666 struct tcpcb *tp = intotcpcb(inp); 667 struct socket *so = inp->inp_socket; 668 struct sockbuf *sb = &so->so_snd; 669 int tx_credits, shove, compl, sowwakeup; 670 struct ofld_tx_sdesc *txsd; 671 bool nomap_mbuf_seen; 672 673 INP_WLOCK_ASSERT(inp); 674 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 675 ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid)); 676 677 KASSERT(ulp_mode(toep) == ULP_MODE_NONE || 678 ulp_mode(toep) == ULP_MODE_TCPDDP || 679 ulp_mode(toep) == ULP_MODE_TLS || 680 ulp_mode(toep) == ULP_MODE_RDMA, 681 ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep)); 682 683 #ifdef VERBOSE_TRACES 684 CTR5(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d", 685 __func__, toep->tid, toep->flags, tp->t_flags, drop); 686 #endif 687 if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) 688 return; 689 690 #ifdef RATELIMIT 691 if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) && 692 (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) { 693 inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; 694 } 695 #endif 696 697 /* 698 * This function doesn't resume by itself. Someone else must clear the 699 * flag and call this function. 700 */ 701 if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) { 702 KASSERT(drop == 0, 703 ("%s: drop (%d) != 0 but tx is suspended", __func__, drop)); 704 return; 705 } 706 707 txsd = &toep->txsd[toep->txsd_pidx]; 708 do { 709 tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS); 710 max_imm = max_imm_payload(tx_credits); 711 max_nsegs = max_dsgl_nsegs(tx_credits); 712 713 SOCKBUF_LOCK(sb); 714 sowwakeup = drop; 715 if (drop) { 716 sbdrop_locked(sb, drop); 717 drop = 0; 718 } 719 sb_sndptr = sb->sb_sndptr; 720 sndptr = sb_sndptr ? 
sb_sndptr->m_next : sb->sb_mb; 721 plen = 0; 722 nsegs = 0; 723 max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */ 724 nomap_mbuf_seen = false; 725 for (m = sndptr; m != NULL; m = m->m_next) { 726 int n; 727 728 if ((m->m_flags & M_NOTAVAIL) != 0) 729 break; 730 if (m->m_flags & M_EXTPG) { 731 #ifdef KERN_TLS 732 if (m->m_epg_tls != NULL) { 733 toep->flags |= TPF_KTLS; 734 if (plen == 0) { 735 SOCKBUF_UNLOCK(sb); 736 t4_push_ktls(sc, toep, 0); 737 return; 738 } 739 break; 740 } 741 #endif 742 n = sglist_count_mbuf_epg(m, 743 mtod(m, vm_offset_t), m->m_len); 744 } else 745 n = sglist_count(mtod(m, void *), m->m_len); 746 747 nsegs += n; 748 plen += m->m_len; 749 750 /* This mbuf sent us _over_ the nsegs limit, back out */ 751 if (plen > max_imm && nsegs > max_nsegs) { 752 nsegs -= n; 753 plen -= m->m_len; 754 if (plen == 0) { 755 /* Too few credits */ 756 toep->flags |= TPF_TX_SUSPENDED; 757 if (sowwakeup) { 758 if (!TAILQ_EMPTY( 759 &toep->aiotx_jobq)) 760 t4_aiotx_queue_toep(so, 761 toep); 762 sowwakeup_locked(so); 763 } else 764 SOCKBUF_UNLOCK(sb); 765 SOCKBUF_UNLOCK_ASSERT(sb); 766 return; 767 } 768 break; 769 } 770 771 if (m->m_flags & M_EXTPG) 772 nomap_mbuf_seen = true; 773 if (max_nsegs_1mbuf < n) 774 max_nsegs_1mbuf = n; 775 sb_sndptr = m; /* new sb->sb_sndptr if all goes well */ 776 777 /* This mbuf put us right at the max_nsegs limit */ 778 if (plen > max_imm && nsegs == max_nsegs) { 779 m = m->m_next; 780 break; 781 } 782 } 783 784 if (sbused(sb) > sb->sb_hiwat * 5 / 8 && 785 toep->plen_nocompl + plen >= sb->sb_hiwat / 4) 786 compl = 1; 787 else 788 compl = 0; 789 790 if (sb->sb_flags & SB_AUTOSIZE && 791 V_tcp_do_autosndbuf && 792 sb->sb_hiwat < V_tcp_autosndbuf_max && 793 sbused(sb) >= sb->sb_hiwat * 7 / 8) { 794 int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc, 795 V_tcp_autosndbuf_max); 796 797 if (!sbreserve_locked(sb, newsize, so, NULL)) 798 sb->sb_flags &= ~SB_AUTOSIZE; 799 else 800 sowwakeup = 1; /* room available */ 801 } 802 if (sowwakeup) { 803 if (!TAILQ_EMPTY(&toep->aiotx_jobq)) 804 t4_aiotx_queue_toep(so, toep); 805 sowwakeup_locked(so); 806 } else 807 SOCKBUF_UNLOCK(sb); 808 SOCKBUF_UNLOCK_ASSERT(sb); 809 810 /* nothing to send */ 811 if (plen == 0) { 812 KASSERT(m == NULL || (m->m_flags & M_NOTAVAIL) != 0, 813 ("%s: nothing to send, but m != NULL is ready", 814 __func__)); 815 break; 816 } 817 818 if (__predict_false(toep->flags & TPF_FIN_SENT)) 819 panic("%s: excess tx.", __func__); 820 821 shove = m == NULL && !(tp->t_flags & TF_MORETOCOME); 822 if (plen <= max_imm && !nomap_mbuf_seen) { 823 824 /* Immediate data tx */ 825 826 wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16), 827 &toep->ofld_txq->wrq); 828 if (wr == NULL) { 829 /* XXX: how will we recover from this? */ 830 toep->flags |= TPF_TX_SUSPENDED; 831 return; 832 } 833 txwr = wrtod(wr); 834 credits = howmany(wr->wr_len, 16); 835 write_tx_wr(txwr, toep, plen, plen, credits, shove, 0); 836 m_copydata(sndptr, 0, plen, (void *)(txwr + 1)); 837 nsegs = 0; 838 } else { 839 int wr_len; 840 841 /* DSGL tx */ 842 843 wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) + 844 ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8; 845 wr = alloc_wrqe(roundup2(wr_len, 16), 846 &toep->ofld_txq->wrq); 847 if (wr == NULL) { 848 /* XXX: how will we recover from this? 
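				 * For now tx is only marked suspended here;
				 * do_fw4_ack() clears TPF_TX_SUSPENDED and
				 * retries via t4_push_data() once enough
				 * credits are returned.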
*/ 849 toep->flags |= TPF_TX_SUSPENDED; 850 return; 851 } 852 txwr = wrtod(wr); 853 credits = howmany(wr_len, 16); 854 write_tx_wr(txwr, toep, 0, plen, credits, shove, 0); 855 write_tx_sgl(txwr + 1, sndptr, m, nsegs, 856 max_nsegs_1mbuf); 857 if (wr_len & 0xf) { 858 uint64_t *pad = (uint64_t *) 859 ((uintptr_t)txwr + wr_len); 860 *pad = 0; 861 } 862 } 863 864 KASSERT(toep->tx_credits >= credits, 865 ("%s: not enough credits", __func__)); 866 867 toep->tx_credits -= credits; 868 toep->tx_nocompl += credits; 869 toep->plen_nocompl += plen; 870 if (toep->tx_credits <= toep->tx_total * 3 / 8 && 871 toep->tx_nocompl >= toep->tx_total / 4) 872 compl = 1; 873 874 if (compl || ulp_mode(toep) == ULP_MODE_RDMA) { 875 txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL); 876 toep->tx_nocompl = 0; 877 toep->plen_nocompl = 0; 878 } 879 880 tp->snd_nxt += plen; 881 tp->snd_max += plen; 882 883 SOCKBUF_LOCK(sb); 884 KASSERT(sb_sndptr, ("%s: sb_sndptr is NULL", __func__)); 885 sb->sb_sndptr = sb_sndptr; 886 SOCKBUF_UNLOCK(sb); 887 888 toep->flags |= TPF_TX_DATA_SENT; 889 if (toep->tx_credits < MIN_OFLD_TX_CREDITS) 890 toep->flags |= TPF_TX_SUSPENDED; 891 892 KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); 893 txsd->plen = plen; 894 txsd->tx_credits = credits; 895 txsd++; 896 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) { 897 toep->txsd_pidx = 0; 898 txsd = &toep->txsd[0]; 899 } 900 toep->txsd_avail--; 901 902 t4_l2t_send(sc, wr, toep->l2te); 903 } while (m != NULL && (m->m_flags & M_NOTAVAIL) == 0); 904 905 /* Send a FIN if requested, but only if there's no more data to send */ 906 if (m == NULL && toep->flags & TPF_SEND_FIN) 907 t4_close_conn(sc, toep); 908 } 909 910 static inline void 911 rqdrop_locked(struct mbufq *q, int plen) 912 { 913 struct mbuf *m; 914 915 while (plen > 0) { 916 m = mbufq_dequeue(q); 917 918 /* Too many credits. */ 919 MPASS(m != NULL); 920 M_ASSERTPKTHDR(m); 921 922 /* Partial credits. */ 923 MPASS(plen >= m->m_pkthdr.len); 924 925 plen -= m->m_pkthdr.len; 926 m_freem(m); 927 } 928 } 929 930 static struct wrqe * 931 write_iscsi_mbuf_wr(struct toepcb *toep, struct mbuf *sndptr) 932 { 933 struct mbuf *m; 934 struct fw_ofld_tx_data_wr *txwr; 935 struct wrqe *wr; 936 u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf; 937 u_int adjusted_plen, ulp_submode; 938 struct inpcb *inp = toep->inp; 939 struct tcpcb *tp = intotcpcb(inp); 940 int tx_credits, shove; 941 static const u_int ulp_extra_len[] = {0, 4, 4, 8}; 942 943 M_ASSERTPKTHDR(sndptr); 944 945 tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS); 946 if (mbuf_raw_wr(sndptr)) { 947 plen = sndptr->m_pkthdr.len; 948 KASSERT(plen <= SGE_MAX_WR_LEN, 949 ("raw WR len %u is greater than max WR len", plen)); 950 if (plen > tx_credits * 16) 951 return (NULL); 952 953 wr = alloc_wrqe(roundup2(plen, 16), &toep->ofld_txq->wrq); 954 if (__predict_false(wr == NULL)) 955 return (NULL); 956 957 m_copydata(sndptr, 0, plen, wrtod(wr)); 958 return (wr); 959 } 960 961 max_imm = max_imm_payload(tx_credits); 962 max_nsegs = max_dsgl_nsegs(tx_credits); 963 964 plen = 0; 965 nsegs = 0; 966 max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */ 967 for (m = sndptr; m != NULL; m = m->m_next) { 968 int n = sglist_count(mtod(m, void *), m->m_len); 969 970 nsegs += n; 971 plen += m->m_len; 972 973 /* 974 * This mbuf would send us _over_ the nsegs limit. 975 * Suspend tx because the PDU can't be sent out. 
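		 * (Returning NULL makes the caller, t4_push_pdus(), set
		 * TPF_TX_SUSPENDED and stop until credits are returned.)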
976 */ 977 if (plen > max_imm && nsegs > max_nsegs) 978 return (NULL); 979 980 if (max_nsegs_1mbuf < n) 981 max_nsegs_1mbuf = n; 982 } 983 984 if (__predict_false(toep->flags & TPF_FIN_SENT)) 985 panic("%s: excess tx.", __func__); 986 987 /* 988 * We have a PDU to send. All of it goes out in one WR so 'm' 989 * is NULL. A PDU's length is always a multiple of 4. 990 */ 991 MPASS(m == NULL); 992 MPASS((plen & 3) == 0); 993 MPASS(sndptr->m_pkthdr.len == plen); 994 995 shove = !(tp->t_flags & TF_MORETOCOME); 996 ulp_submode = mbuf_ulp_submode(sndptr); 997 MPASS(ulp_submode < nitems(ulp_extra_len)); 998 999 /* 1000 * plen doesn't include header and data digests, which are 1001 * generated and inserted in the right places by the TOE, but 1002 * they do occupy TCP sequence space and need to be accounted 1003 * for. 1004 */ 1005 adjusted_plen = plen + ulp_extra_len[ulp_submode]; 1006 if (plen <= max_imm) { 1007 1008 /* Immediate data tx */ 1009 1010 wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16), 1011 &toep->ofld_txq->wrq); 1012 if (wr == NULL) { 1013 /* XXX: how will we recover from this? */ 1014 return (NULL); 1015 } 1016 txwr = wrtod(wr); 1017 credits = howmany(wr->wr_len, 16); 1018 write_tx_wr(txwr, toep, plen, adjusted_plen, credits, 1019 shove, ulp_submode); 1020 m_copydata(sndptr, 0, plen, (void *)(txwr + 1)); 1021 nsegs = 0; 1022 } else { 1023 int wr_len; 1024 1025 /* DSGL tx */ 1026 wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) + 1027 ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8; 1028 wr = alloc_wrqe(roundup2(wr_len, 16), 1029 &toep->ofld_txq->wrq); 1030 if (wr == NULL) { 1031 /* XXX: how will we recover from this? */ 1032 return (NULL); 1033 } 1034 txwr = wrtod(wr); 1035 credits = howmany(wr_len, 16); 1036 write_tx_wr(txwr, toep, 0, adjusted_plen, credits, 1037 shove, ulp_submode); 1038 write_tx_sgl(txwr + 1, sndptr, m, nsegs, max_nsegs_1mbuf); 1039 if (wr_len & 0xf) { 1040 uint64_t *pad = (uint64_t *)((uintptr_t)txwr + wr_len); 1041 *pad = 0; 1042 } 1043 } 1044 1045 tp->snd_nxt += adjusted_plen; 1046 tp->snd_max += adjusted_plen; 1047 1048 counter_u64_add(toep->ofld_txq->tx_iscsi_pdus, 1); 1049 counter_u64_add(toep->ofld_txq->tx_iscsi_octets, plen); 1050 1051 return (wr); 1052 } 1053 1054 void 1055 t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop) 1056 { 1057 struct mbuf *sndptr, *m; 1058 struct fw_wr_hdr *wrhdr; 1059 struct wrqe *wr; 1060 u_int plen, credits; 1061 struct inpcb *inp = toep->inp; 1062 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; 1063 struct mbufq *pduq = &toep->ulp_pduq; 1064 1065 INP_WLOCK_ASSERT(inp); 1066 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 1067 ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid)); 1068 KASSERT(ulp_mode(toep) == ULP_MODE_ISCSI, 1069 ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep)); 1070 1071 if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) 1072 return; 1073 1074 /* 1075 * This function doesn't resume by itself. Someone else must clear the 1076 * flag and call this function. 1077 */ 1078 if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) { 1079 KASSERT(drop == 0, 1080 ("%s: drop (%d) != 0 but tx is suspended", __func__, drop)); 1081 return; 1082 } 1083 1084 if (drop) { 1085 struct socket *so = inp->inp_socket; 1086 struct sockbuf *sb = &so->so_snd; 1087 int sbu; 1088 1089 /* 1090 * An unlocked read is ok here as the data should only 1091 * transition from a non-zero value to either another 1092 * non-zero value or zero. Once it is zero it should 1093 * stay zero. 
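		 * The locked re-check of sbused() below handles the case
		 * where the buffer drained to zero between the two reads.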
		 */
		if (__predict_false(sbused(sb) > 0)) {
			SOCKBUF_LOCK(sb);
			sbu = sbused(sb);
			if (sbu > 0) {
				/*
				 * The data transmitted before the
				 * tid's ULP mode changed to ISCSI is
				 * still in so_snd. Incoming credits
				 * should account for so_snd first.
				 */
				sbdrop_locked(sb, min(sbu, drop));
				drop -= min(sbu, drop);
			}
			sowwakeup_locked(so);	/* unlocks so_snd */
		}
		rqdrop_locked(&toep->ulp_pdu_reclaimq, drop);
	}

	while ((sndptr = mbufq_first(pduq)) != NULL) {
		wr = write_iscsi_mbuf_wr(toep, sndptr);
		if (wr == NULL) {
			toep->flags |= TPF_TX_SUSPENDED;
			return;
		}

		plen = sndptr->m_pkthdr.len;
		credits = howmany(wr->wr_len, 16);
		KASSERT(toep->tx_credits >= credits,
		    ("%s: not enough credits", __func__));

		m = mbufq_dequeue(pduq);
		MPASS(m == sndptr);
		mbufq_enqueue(&toep->ulp_pdu_reclaimq, m);

		toep->tx_credits -= credits;
		toep->tx_nocompl += credits;
		toep->plen_nocompl += plen;

		/*
		 * Ensure there are enough credits for a full-sized WR
		 * as page pod WRs can be full-sized.
		 */
		if (toep->tx_credits <= SGE_MAX_WR_LEN * 5 / 4 &&
		    toep->tx_nocompl >= toep->tx_total / 4) {
			wrhdr = wrtod(wr);
			wrhdr->hi |= htobe32(F_FW_WR_COMPL);
			toep->tx_nocompl = 0;
			toep->plen_nocompl = 0;
		}

		toep->flags |= TPF_TX_DATA_SENT;
		if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
			toep->flags |= TPF_TX_SUSPENDED;

		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
		txsd->plen = plen;
		txsd->tx_credits = credits;
		txsd++;
		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
			toep->txsd_pidx = 0;
			txsd = &toep->txsd[0];
		}
		toep->txsd_avail--;

		t4_l2t_send(sc, wr, toep->l2te);
	}

	/* Send a FIN if requested, but only if there are no more PDUs to send */
	if (mbufq_first(pduq) == NULL && toep->flags & TPF_SEND_FIN)
		t4_close_conn(sc, toep);
}

static inline void
t4_push_data(struct adapter *sc, struct toepcb *toep, int drop)
{

	if (ulp_mode(toep) == ULP_MODE_ISCSI)
		t4_push_pdus(sc, toep, drop);
	else if (tls_tx_key(toep) && toep->tls.mode == TLS_MODE_TLSOM)
		t4_push_tls_records(sc, toep, drop);
#ifdef KERN_TLS
	else if (toep->flags & TPF_KTLS)
		t4_push_ktls(sc, toep, drop);
#endif
	else
		t4_push_frames(sc, toep, drop);
}

int
t4_tod_output(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
#ifdef INVARIANTS
	struct inpcb *inp = tp->t_inpcb;
#endif
	struct toepcb *toep = tp->t_toe;

	INP_WLOCK_ASSERT(inp);
	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
	    ("%s: inp %p dropped.", __func__, inp));
	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));

	t4_push_data(sc, toep, 0);

	return (0);
}

int
t4_send_fin(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
#ifdef INVARIANTS
	struct inpcb *inp = tp->t_inpcb;
#endif
	struct toepcb *toep = tp->t_toe;

	INP_WLOCK_ASSERT(inp);
	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
	    ("%s: inp %p dropped.", __func__, inp));
	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));

	toep->flags |= TPF_SEND_FIN;
	if (tp->t_state >= TCPS_ESTABLISHED)
		t4_push_data(sc, toep, 0);

	return (0);
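
	/*
	 * The FIN itself is not sent here: TPF_SEND_FIN is consumed later by
	 * the push routines (t4_push_frames()/t4_push_pdus()), which call
	 * t4_close_conn() to emit a CPL_CLOSE_CON_REQ once the send queue
	 * has drained.
	 */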
1221 } 1222 1223 int 1224 t4_send_rst(struct toedev *tod, struct tcpcb *tp) 1225 { 1226 struct adapter *sc = tod->tod_softc; 1227 #if defined(INVARIANTS) 1228 struct inpcb *inp = tp->t_inpcb; 1229 #endif 1230 struct toepcb *toep = tp->t_toe; 1231 1232 INP_WLOCK_ASSERT(inp); 1233 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1234 ("%s: inp %p dropped.", __func__, inp)); 1235 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 1236 1237 /* hmmmm */ 1238 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 1239 ("%s: flowc for tid %u [%s] not sent already", 1240 __func__, toep->tid, tcpstates[tp->t_state])); 1241 1242 send_reset(sc, toep, 0); 1243 return (0); 1244 } 1245 1246 /* 1247 * Peer has sent us a FIN. 1248 */ 1249 static int 1250 do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1251 { 1252 struct adapter *sc = iq->adapter; 1253 const struct cpl_peer_close *cpl = (const void *)(rss + 1); 1254 unsigned int tid = GET_TID(cpl); 1255 struct toepcb *toep = lookup_tid(sc, tid); 1256 struct inpcb *inp = toep->inp; 1257 struct tcpcb *tp = NULL; 1258 struct socket *so; 1259 struct epoch_tracker et; 1260 #ifdef INVARIANTS 1261 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1262 #endif 1263 1264 KASSERT(opcode == CPL_PEER_CLOSE, 1265 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1266 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1267 1268 if (__predict_false(toep->flags & TPF_SYNQE)) { 1269 /* 1270 * do_pass_establish must have run before do_peer_close and if 1271 * this is still a synqe instead of a toepcb then the connection 1272 * must be getting aborted. 1273 */ 1274 MPASS(toep->flags & TPF_ABORT_SHUTDOWN); 1275 CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid, 1276 toep, toep->flags); 1277 return (0); 1278 } 1279 1280 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1281 1282 CURVNET_SET(toep->vnet); 1283 NET_EPOCH_ENTER(et); 1284 INP_WLOCK(inp); 1285 tp = intotcpcb(inp); 1286 1287 CTR6(KTR_CXGBE, 1288 "%s: tid %u (%s), toep_flags 0x%x, ddp_flags 0x%x, inp %p", 1289 __func__, tid, tp ? 
tcpstates[tp->t_state] : "no tp", toep->flags, 1290 toep->ddp.flags, inp); 1291 1292 if (toep->flags & TPF_ABORT_SHUTDOWN) 1293 goto done; 1294 1295 tp->rcv_nxt++; /* FIN */ 1296 1297 so = inp->inp_socket; 1298 socantrcvmore(so); 1299 if (ulp_mode(toep) == ULP_MODE_TCPDDP) { 1300 DDP_LOCK(toep); 1301 if (__predict_false(toep->ddp.flags & 1302 (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE))) 1303 handle_ddp_close(toep, tp, cpl->rcv_nxt); 1304 DDP_UNLOCK(toep); 1305 } 1306 1307 if (ulp_mode(toep) != ULP_MODE_RDMA) { 1308 KASSERT(tp->rcv_nxt == be32toh(cpl->rcv_nxt), 1309 ("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt, 1310 be32toh(cpl->rcv_nxt))); 1311 } 1312 1313 switch (tp->t_state) { 1314 case TCPS_SYN_RECEIVED: 1315 tp->t_starttime = ticks; 1316 /* FALLTHROUGH */ 1317 1318 case TCPS_ESTABLISHED: 1319 tcp_state_change(tp, TCPS_CLOSE_WAIT); 1320 break; 1321 1322 case TCPS_FIN_WAIT_1: 1323 tcp_state_change(tp, TCPS_CLOSING); 1324 break; 1325 1326 case TCPS_FIN_WAIT_2: 1327 restore_so_proto(so, inp->inp_vflag & INP_IPV6); 1328 tcp_twstart(tp); 1329 INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ 1330 NET_EPOCH_EXIT(et); 1331 CURVNET_RESTORE(); 1332 1333 INP_WLOCK(inp); 1334 final_cpl_received(toep); 1335 return (0); 1336 1337 default: 1338 log(LOG_ERR, "%s: TID %u received CPL_PEER_CLOSE in state %d\n", 1339 __func__, tid, tp->t_state); 1340 } 1341 done: 1342 INP_WUNLOCK(inp); 1343 NET_EPOCH_EXIT(et); 1344 CURVNET_RESTORE(); 1345 return (0); 1346 } 1347 1348 /* 1349 * Peer has ACK'd our FIN. 1350 */ 1351 static int 1352 do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss, 1353 struct mbuf *m) 1354 { 1355 struct adapter *sc = iq->adapter; 1356 const struct cpl_close_con_rpl *cpl = (const void *)(rss + 1); 1357 unsigned int tid = GET_TID(cpl); 1358 struct toepcb *toep = lookup_tid(sc, tid); 1359 struct inpcb *inp = toep->inp; 1360 struct tcpcb *tp = NULL; 1361 struct socket *so = NULL; 1362 struct epoch_tracker et; 1363 #ifdef INVARIANTS 1364 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1365 #endif 1366 1367 KASSERT(opcode == CPL_CLOSE_CON_RPL, 1368 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1369 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1370 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1371 1372 CURVNET_SET(toep->vnet); 1373 NET_EPOCH_ENTER(et); 1374 INP_WLOCK(inp); 1375 tp = intotcpcb(inp); 1376 1377 CTR4(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x", 1378 __func__, tid, tp ? 
tcpstates[tp->t_state] : "no tp", toep->flags); 1379 1380 if (toep->flags & TPF_ABORT_SHUTDOWN) 1381 goto done; 1382 1383 so = inp->inp_socket; 1384 tp->snd_una = be32toh(cpl->snd_nxt) - 1; /* exclude FIN */ 1385 1386 switch (tp->t_state) { 1387 case TCPS_CLOSING: /* see TCPS_FIN_WAIT_2 in do_peer_close too */ 1388 restore_so_proto(so, inp->inp_vflag & INP_IPV6); 1389 tcp_twstart(tp); 1390 release: 1391 INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ 1392 NET_EPOCH_EXIT(et); 1393 CURVNET_RESTORE(); 1394 1395 INP_WLOCK(inp); 1396 final_cpl_received(toep); /* no more CPLs expected */ 1397 1398 return (0); 1399 case TCPS_LAST_ACK: 1400 if (tcp_close(tp)) 1401 INP_WUNLOCK(inp); 1402 goto release; 1403 1404 case TCPS_FIN_WAIT_1: 1405 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 1406 soisdisconnected(so); 1407 tcp_state_change(tp, TCPS_FIN_WAIT_2); 1408 break; 1409 1410 default: 1411 log(LOG_ERR, 1412 "%s: TID %u received CPL_CLOSE_CON_RPL in state %s\n", 1413 __func__, tid, tcpstates[tp->t_state]); 1414 } 1415 done: 1416 INP_WUNLOCK(inp); 1417 NET_EPOCH_EXIT(et); 1418 CURVNET_RESTORE(); 1419 return (0); 1420 } 1421 1422 void 1423 send_abort_rpl(struct adapter *sc, struct sge_ofld_txq *ofld_txq, int tid, 1424 int rst_status) 1425 { 1426 struct wrqe *wr; 1427 struct cpl_abort_rpl *cpl; 1428 1429 wr = alloc_wrqe(sizeof(*cpl), &ofld_txq->wrq); 1430 if (wr == NULL) { 1431 /* XXX */ 1432 panic("%s: allocation failure.", __func__); 1433 } 1434 cpl = wrtod(wr); 1435 1436 INIT_TP_WR_MIT_CPL(cpl, CPL_ABORT_RPL, tid); 1437 cpl->cmd = rst_status; 1438 1439 t4_wrq_tx(sc, wr); 1440 } 1441 1442 static int 1443 abort_status_to_errno(struct tcpcb *tp, unsigned int abort_reason) 1444 { 1445 switch (abort_reason) { 1446 case CPL_ERR_BAD_SYN: 1447 case CPL_ERR_CONN_RESET: 1448 return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET); 1449 case CPL_ERR_XMIT_TIMEDOUT: 1450 case CPL_ERR_PERSIST_TIMEDOUT: 1451 case CPL_ERR_FINWAIT2_TIMEDOUT: 1452 case CPL_ERR_KEEPALIVE_TIMEDOUT: 1453 return (ETIMEDOUT); 1454 default: 1455 return (EIO); 1456 } 1457 } 1458 1459 /* 1460 * TCP RST from the peer, timeout, or some other such critical error. 1461 */ 1462 static int 1463 do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1464 { 1465 struct adapter *sc = iq->adapter; 1466 const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1); 1467 unsigned int tid = GET_TID(cpl); 1468 struct toepcb *toep = lookup_tid(sc, tid); 1469 struct sge_ofld_txq *ofld_txq = toep->ofld_txq; 1470 struct inpcb *inp; 1471 struct tcpcb *tp; 1472 struct epoch_tracker et; 1473 #ifdef INVARIANTS 1474 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1475 #endif 1476 1477 KASSERT(opcode == CPL_ABORT_REQ_RSS, 1478 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1479 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1480 1481 if (toep->flags & TPF_SYNQE) 1482 return (do_abort_req_synqe(iq, rss, m)); 1483 1484 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1485 1486 if (negative_advice(cpl->status)) { 1487 CTR4(KTR_CXGBE, "%s: negative advice %d for tid %d (0x%x)", 1488 __func__, cpl->status, tid, toep->flags); 1489 return (0); /* Ignore negative advice */ 1490 } 1491 1492 inp = toep->inp; 1493 CURVNET_SET(toep->vnet); 1494 NET_EPOCH_ENTER(et); /* for tcp_close */ 1495 INP_WLOCK(inp); 1496 1497 tp = intotcpcb(inp); 1498 1499 CTR6(KTR_CXGBE, 1500 "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x, status %d", 1501 __func__, tid, tp ? 
tcpstates[tp->t_state] : "no tp", toep->flags, 1502 inp->inp_flags, cpl->status); 1503 1504 /* 1505 * If we'd initiated an abort earlier the reply to it is responsible for 1506 * cleaning up resources. Otherwise we tear everything down right here 1507 * right now. We owe the T4 a CPL_ABORT_RPL no matter what. 1508 */ 1509 if (toep->flags & TPF_ABORT_SHUTDOWN) { 1510 INP_WUNLOCK(inp); 1511 goto done; 1512 } 1513 toep->flags |= TPF_ABORT_SHUTDOWN; 1514 1515 if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) { 1516 struct socket *so = inp->inp_socket; 1517 1518 if (so != NULL) 1519 so_error_set(so, abort_status_to_errno(tp, 1520 cpl->status)); 1521 tp = tcp_close(tp); 1522 if (tp == NULL) 1523 INP_WLOCK(inp); /* re-acquire */ 1524 } 1525 1526 final_cpl_received(toep); 1527 done: 1528 NET_EPOCH_EXIT(et); 1529 CURVNET_RESTORE(); 1530 send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST); 1531 return (0); 1532 } 1533 1534 /* 1535 * Reply to the CPL_ABORT_REQ (send_reset) 1536 */ 1537 static int 1538 do_abort_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1539 { 1540 struct adapter *sc = iq->adapter; 1541 const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1); 1542 unsigned int tid = GET_TID(cpl); 1543 struct toepcb *toep = lookup_tid(sc, tid); 1544 struct inpcb *inp = toep->inp; 1545 #ifdef INVARIANTS 1546 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1547 #endif 1548 1549 KASSERT(opcode == CPL_ABORT_RPL_RSS, 1550 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1551 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1552 1553 if (toep->flags & TPF_SYNQE) 1554 return (do_abort_rpl_synqe(iq, rss, m)); 1555 1556 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1557 1558 CTR5(KTR_CXGBE, "%s: tid %u, toep %p, inp %p, status %d", 1559 __func__, tid, toep, inp, cpl->status); 1560 1561 KASSERT(toep->flags & TPF_ABORT_SHUTDOWN, 1562 ("%s: wasn't expecting abort reply", __func__)); 1563 1564 INP_WLOCK(inp); 1565 final_cpl_received(toep); 1566 1567 return (0); 1568 } 1569 1570 static int 1571 do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1572 { 1573 struct adapter *sc = iq->adapter; 1574 const struct cpl_rx_data *cpl = mtod(m, const void *); 1575 unsigned int tid = GET_TID(cpl); 1576 struct toepcb *toep = lookup_tid(sc, tid); 1577 struct inpcb *inp = toep->inp; 1578 struct tcpcb *tp; 1579 struct socket *so; 1580 struct sockbuf *sb; 1581 struct epoch_tracker et; 1582 int len, rx_credits; 1583 uint32_t ddp_placed = 0; 1584 1585 if (__predict_false(toep->flags & TPF_SYNQE)) { 1586 /* 1587 * do_pass_establish must have run before do_rx_data and if this 1588 * is still a synqe instead of a toepcb then the connection must 1589 * be getting aborted. 
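		 * This mirrors the synqe check in do_peer_close(); the
		 * payload is freed and the CPL is otherwise ignored.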
1590 */ 1591 MPASS(toep->flags & TPF_ABORT_SHUTDOWN); 1592 CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid, 1593 toep, toep->flags); 1594 m_freem(m); 1595 return (0); 1596 } 1597 1598 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1599 1600 /* strip off CPL header */ 1601 m_adj(m, sizeof(*cpl)); 1602 len = m->m_pkthdr.len; 1603 1604 INP_WLOCK(inp); 1605 if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) { 1606 CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x", 1607 __func__, tid, len, inp->inp_flags); 1608 INP_WUNLOCK(inp); 1609 m_freem(m); 1610 return (0); 1611 } 1612 1613 tp = intotcpcb(inp); 1614 1615 if (__predict_false(ulp_mode(toep) == ULP_MODE_TLS && 1616 toep->flags & TPF_TLS_RECEIVE)) { 1617 /* Received "raw" data on a TLS socket. */ 1618 CTR3(KTR_CXGBE, "%s: tid %u, raw TLS data (%d bytes)", 1619 __func__, tid, len); 1620 do_rx_data_tls(cpl, toep, m); 1621 return (0); 1622 } 1623 1624 if (__predict_false(tp->rcv_nxt != be32toh(cpl->seq))) 1625 ddp_placed = be32toh(cpl->seq) - tp->rcv_nxt; 1626 1627 tp->rcv_nxt += len; 1628 if (tp->rcv_wnd < len) { 1629 KASSERT(ulp_mode(toep) == ULP_MODE_RDMA, 1630 ("%s: negative window size", __func__)); 1631 } 1632 1633 tp->rcv_wnd -= len; 1634 tp->t_rcvtime = ticks; 1635 1636 if (ulp_mode(toep) == ULP_MODE_TCPDDP) 1637 DDP_LOCK(toep); 1638 so = inp_inpcbtosocket(inp); 1639 sb = &so->so_rcv; 1640 SOCKBUF_LOCK(sb); 1641 1642 if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) { 1643 CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)", 1644 __func__, tid, len); 1645 m_freem(m); 1646 SOCKBUF_UNLOCK(sb); 1647 if (ulp_mode(toep) == ULP_MODE_TCPDDP) 1648 DDP_UNLOCK(toep); 1649 INP_WUNLOCK(inp); 1650 1651 CURVNET_SET(toep->vnet); 1652 NET_EPOCH_ENTER(et); 1653 INP_WLOCK(inp); 1654 tp = tcp_drop(tp, ECONNRESET); 1655 if (tp) 1656 INP_WUNLOCK(inp); 1657 NET_EPOCH_EXIT(et); 1658 CURVNET_RESTORE(); 1659 1660 return (0); 1661 } 1662 1663 /* receive buffer autosize */ 1664 MPASS(toep->vnet == so->so_vnet); 1665 CURVNET_SET(toep->vnet); 1666 if (sb->sb_flags & SB_AUTOSIZE && 1667 V_tcp_do_autorcvbuf && 1668 sb->sb_hiwat < V_tcp_autorcvbuf_max && 1669 len > (sbspace(sb) / 8 * 7)) { 1670 unsigned int hiwat = sb->sb_hiwat; 1671 unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc, 1672 V_tcp_autorcvbuf_max); 1673 1674 if (!sbreserve_locked(sb, newsize, so, NULL)) 1675 sb->sb_flags &= ~SB_AUTOSIZE; 1676 } 1677 1678 if (ulp_mode(toep) == ULP_MODE_TCPDDP) { 1679 int changed = !(toep->ddp.flags & DDP_ON) ^ cpl->ddp_off; 1680 1681 if (toep->ddp.waiting_count != 0 || toep->ddp.active_count != 0) 1682 CTR3(KTR_CXGBE, "%s: tid %u, non-ddp rx (%d bytes)", 1683 __func__, tid, len); 1684 1685 if (changed) { 1686 if (toep->ddp.flags & DDP_SC_REQ) 1687 toep->ddp.flags ^= DDP_ON | DDP_SC_REQ; 1688 else { 1689 KASSERT(cpl->ddp_off == 1, 1690 ("%s: DDP switched on by itself.", 1691 __func__)); 1692 1693 /* Fell out of DDP mode */ 1694 toep->ddp.flags &= ~DDP_ON; 1695 CTR1(KTR_CXGBE, "%s: fell out of DDP mode", 1696 __func__); 1697 1698 insert_ddp_data(toep, ddp_placed); 1699 } 1700 } 1701 1702 if (toep->ddp.flags & DDP_ON) { 1703 /* 1704 * CPL_RX_DATA with DDP on can only be an indicate. 1705 * Start posting queued AIO requests via DDP. The 1706 * payload that arrived in this indicate is appended 1707 * to the socket buffer as usual. 1708 */ 1709 handle_ddp_indicate(toep); 1710 } 1711 } 1712 1713 sbappendstream_locked(sb, m, 0); 1714 rx_credits = sbspace(sb) > tp->rcv_wnd ? 
sbspace(sb) - tp->rcv_wnd : 0; 1715 if (rx_credits > 0 && sbused(sb) + tp->rcv_wnd < sb->sb_lowat) { 1716 rx_credits = send_rx_credits(sc, toep, rx_credits); 1717 tp->rcv_wnd += rx_credits; 1718 tp->rcv_adv += rx_credits; 1719 } 1720 1721 if (ulp_mode(toep) == ULP_MODE_TCPDDP && toep->ddp.waiting_count > 0 && 1722 sbavail(sb) != 0) { 1723 CTR2(KTR_CXGBE, "%s: tid %u queueing AIO task", __func__, 1724 tid); 1725 ddp_queue_toep(toep); 1726 } 1727 sorwakeup_locked(so); 1728 SOCKBUF_UNLOCK_ASSERT(sb); 1729 if (ulp_mode(toep) == ULP_MODE_TCPDDP) 1730 DDP_UNLOCK(toep); 1731 1732 INP_WUNLOCK(inp); 1733 CURVNET_RESTORE(); 1734 return (0); 1735 } 1736 1737 static int 1738 do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1739 { 1740 struct adapter *sc = iq->adapter; 1741 const struct cpl_fw4_ack *cpl = (const void *)(rss + 1); 1742 unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl))); 1743 struct toepcb *toep = lookup_tid(sc, tid); 1744 struct inpcb *inp; 1745 struct tcpcb *tp; 1746 struct socket *so; 1747 uint8_t credits = cpl->credits; 1748 struct ofld_tx_sdesc *txsd; 1749 int plen; 1750 #ifdef INVARIANTS 1751 unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl))); 1752 #endif 1753 1754 /* 1755 * Very unusual case: we'd sent a flowc + abort_req for a synq entry and 1756 * now this comes back carrying the credits for the flowc. 1757 */ 1758 if (__predict_false(toep->flags & TPF_SYNQE)) { 1759 KASSERT(toep->flags & TPF_ABORT_SHUTDOWN, 1760 ("%s: credits for a synq entry %p", __func__, toep)); 1761 return (0); 1762 } 1763 1764 inp = toep->inp; 1765 1766 KASSERT(opcode == CPL_FW4_ACK, 1767 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1768 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1769 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1770 1771 INP_WLOCK(inp); 1772 1773 if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) { 1774 INP_WUNLOCK(inp); 1775 return (0); 1776 } 1777 1778 KASSERT((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0, 1779 ("%s: inp_flags 0x%x", __func__, inp->inp_flags)); 1780 1781 tp = intotcpcb(inp); 1782 1783 if (cpl->flags & CPL_FW4_ACK_FLAGS_SEQVAL) { 1784 tcp_seq snd_una = be32toh(cpl->snd_una); 1785 1786 #ifdef INVARIANTS 1787 if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) { 1788 log(LOG_ERR, 1789 "%s: unexpected seq# %x for TID %u, snd_una %x\n", 1790 __func__, snd_una, toep->tid, tp->snd_una); 1791 } 1792 #endif 1793 1794 if (tp->snd_una != snd_una) { 1795 tp->snd_una = snd_una; 1796 tp->ts_recent_age = tcp_ts_getticks(); 1797 } 1798 } 1799 1800 #ifdef VERBOSE_TRACES 1801 CTR3(KTR_CXGBE, "%s: tid %d credits %u", __func__, tid, credits); 1802 #endif 1803 so = inp->inp_socket; 1804 txsd = &toep->txsd[toep->txsd_cidx]; 1805 plen = 0; 1806 while (credits) { 1807 KASSERT(credits >= txsd->tx_credits, 1808 ("%s: too many (or partial) credits", __func__)); 1809 credits -= txsd->tx_credits; 1810 toep->tx_credits += txsd->tx_credits; 1811 plen += txsd->plen; 1812 if (txsd->iv_buffer) { 1813 free(txsd->iv_buffer, M_CXGBE); 1814 txsd->iv_buffer = NULL; 1815 } 1816 txsd++; 1817 toep->txsd_avail++; 1818 KASSERT(toep->txsd_avail <= toep->txsd_total, 1819 ("%s: txsd avail > total", __func__)); 1820 if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) { 1821 txsd = &toep->txsd[0]; 1822 toep->txsd_cidx = 0; 1823 } 1824 } 1825 1826 if (toep->tx_credits == toep->tx_total) { 1827 toep->tx_nocompl = 0; 1828 toep->plen_nocompl = 0; 1829 } 1830 1831 if (toep->flags & TPF_TX_SUSPENDED && 
1832 toep->tx_credits >= toep->tx_total / 4) { 1833 #ifdef VERBOSE_TRACES 1834 CTR2(KTR_CXGBE, "%s: tid %d calling t4_push_frames", __func__, 1835 tid); 1836 #endif 1837 toep->flags &= ~TPF_TX_SUSPENDED; 1838 CURVNET_SET(toep->vnet); 1839 t4_push_data(sc, toep, plen); 1840 CURVNET_RESTORE(); 1841 } else if (plen > 0) { 1842 struct sockbuf *sb = &so->so_snd; 1843 int sbu; 1844 1845 SOCKBUF_LOCK(sb); 1846 sbu = sbused(sb); 1847 if (ulp_mode(toep) == ULP_MODE_ISCSI) { 1848 if (__predict_false(sbu > 0)) { 1849 /* 1850 * The data transmitted before the 1851 * tid's ULP mode changed to ISCSI is 1852 * still in so_snd. Incoming credits 1853 * should account for so_snd first. 1854 */ 1855 sbdrop_locked(sb, min(sbu, plen)); 1856 plen -= min(sbu, plen); 1857 } 1858 sowwakeup_locked(so); /* unlocks so_snd */ 1859 rqdrop_locked(&toep->ulp_pdu_reclaimq, plen); 1860 } else { 1861 #ifdef VERBOSE_TRACES 1862 CTR3(KTR_CXGBE, "%s: tid %d dropped %d bytes", __func__, 1863 tid, plen); 1864 #endif 1865 sbdrop_locked(sb, plen); 1866 if (tls_tx_key(toep) && 1867 toep->tls.mode == TLS_MODE_TLSOM) { 1868 struct tls_ofld_info *tls_ofld = &toep->tls; 1869 1870 MPASS(tls_ofld->sb_off >= plen); 1871 tls_ofld->sb_off -= plen; 1872 } 1873 if (!TAILQ_EMPTY(&toep->aiotx_jobq)) 1874 t4_aiotx_queue_toep(so, toep); 1875 sowwakeup_locked(so); /* unlocks so_snd */ 1876 } 1877 SOCKBUF_UNLOCK_ASSERT(sb); 1878 } 1879 1880 INP_WUNLOCK(inp); 1881 1882 return (0); 1883 } 1884 1885 void 1886 t4_set_tcb_field(struct adapter *sc, struct sge_wrq *wrq, struct toepcb *toep, 1887 uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie) 1888 { 1889 struct wrqe *wr; 1890 struct cpl_set_tcb_field *req; 1891 struct ofld_tx_sdesc *txsd; 1892 1893 MPASS((cookie & ~M_COOKIE) == 0); 1894 if (reply) { 1895 MPASS(cookie != CPL_COOKIE_RESERVED); 1896 } 1897 1898 wr = alloc_wrqe(sizeof(*req), wrq); 1899 if (wr == NULL) { 1900 /* XXX */ 1901 panic("%s: allocation failure.", __func__); 1902 } 1903 req = wrtod(wr); 1904 1905 INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, toep->tid); 1906 req->reply_ctrl = htobe16(V_QUEUENO(toep->ofld_rxq->iq.abs_id)); 1907 if (reply == 0) 1908 req->reply_ctrl |= htobe16(F_NO_REPLY); 1909 req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(cookie)); 1910 req->mask = htobe64(mask); 1911 req->val = htobe64(val); 1912 if (wrq->eq.type == EQ_OFLD) { 1913 txsd = &toep->txsd[toep->txsd_pidx]; 1914 txsd->tx_credits = howmany(sizeof(*req), 16); 1915 txsd->plen = 0; 1916 KASSERT(toep->tx_credits >= txsd->tx_credits && 1917 toep->txsd_avail > 0, 1918 ("%s: not enough credits (%d)", __func__, 1919 toep->tx_credits)); 1920 toep->tx_credits -= txsd->tx_credits; 1921 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) 1922 toep->txsd_pidx = 0; 1923 toep->txsd_avail--; 1924 } 1925 1926 t4_wrq_tx(sc, wr); 1927 } 1928 1929 void 1930 t4_init_cpl_io_handlers(void) 1931 { 1932 1933 t4_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close); 1934 t4_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl); 1935 t4_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req); 1936 t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl, 1937 CPL_COOKIE_TOM); 1938 t4_register_cpl_handler(CPL_RX_DATA, do_rx_data); 1939 t4_register_shared_cpl_handler(CPL_FW4_ACK, do_fw4_ack, CPL_COOKIE_TOM); 1940 } 1941 1942 void 1943 t4_uninit_cpl_io_handlers(void) 1944 { 1945 1946 t4_register_cpl_handler(CPL_PEER_CLOSE, NULL); 1947 t4_register_cpl_handler(CPL_CLOSE_CON_RPL, NULL); 1948 t4_register_cpl_handler(CPL_ABORT_REQ_RSS, NULL); 1949 
void
t4_set_tcb_field(struct adapter *sc, struct sge_wrq *wrq, struct toepcb *toep,
    uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie)
{
	struct wrqe *wr;
	struct cpl_set_tcb_field *req;
	struct ofld_tx_sdesc *txsd;

	MPASS((cookie & ~M_COOKIE) == 0);
	if (reply) {
		MPASS(cookie != CPL_COOKIE_RESERVED);
	}

	wr = alloc_wrqe(sizeof(*req), wrq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	req = wrtod(wr);

	INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, toep->tid);
	req->reply_ctrl = htobe16(V_QUEUENO(toep->ofld_rxq->iq.abs_id));
	if (reply == 0)
		req->reply_ctrl |= htobe16(F_NO_REPLY);
	req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(cookie));
	req->mask = htobe64(mask);
	req->val = htobe64(val);
	if (wrq->eq.type == EQ_OFLD) {
		txsd = &toep->txsd[toep->txsd_pidx];
		txsd->tx_credits = howmany(sizeof(*req), 16);
		txsd->plen = 0;
		KASSERT(toep->tx_credits >= txsd->tx_credits &&
		    toep->txsd_avail > 0,
		    ("%s: not enough credits (%d)", __func__,
		    toep->tx_credits));
		toep->tx_credits -= txsd->tx_credits;
		if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
			toep->txsd_pidx = 0;
		toep->txsd_avail--;
	}

	t4_wrq_tx(sc, wr);
}

void
t4_init_cpl_io_handlers(void)
{

	t4_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
	t4_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
	t4_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
	t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl,
	    CPL_COOKIE_TOM);
	t4_register_cpl_handler(CPL_RX_DATA, do_rx_data);
	t4_register_shared_cpl_handler(CPL_FW4_ACK, do_fw4_ack, CPL_COOKIE_TOM);
}

void
t4_uninit_cpl_io_handlers(void)
{

	t4_register_cpl_handler(CPL_PEER_CLOSE, NULL);
	t4_register_cpl_handler(CPL_CLOSE_CON_RPL, NULL);
	t4_register_cpl_handler(CPL_ABORT_REQ_RSS, NULL);
	t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, NULL, CPL_COOKIE_TOM);
	t4_register_cpl_handler(CPL_RX_DATA, NULL);
	t4_register_shared_cpl_handler(CPL_FW4_ACK, NULL, CPL_COOKIE_TOM);
}

/*
 * Use the 'backend1' field in AIO jobs to hold an error that should
 * be reported when the job is completed, the 'backend3' field to
 * store the amount of data sent by the AIO job so far, and the
 * 'backend4' field to hold a reference count on the job.
 *
 * Each unmapped mbuf holds a reference on the job as does the queue
 * so long as the job is queued.
 */
#define	aio_error	backend1
#define	aio_sent	backend3
#define	aio_refs	backend4

#define	jobtotid(job)							\
	(((struct toepcb *)(so_sototcpcb((job)->fd_file->f_data)->t_toe))->tid)

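/*
 * Drop a reference on an AIO tx job.  When the last reference goes
 * away the job is completed: with an error only if nothing was sent,
 * otherwise with the number of bytes that were queued.
 */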
static void
aiotx_free_job(struct kaiocb *job)
{
	long status;
	int error;

	if (refcount_release(&job->aio_refs) == 0)
		return;

	error = (intptr_t)job->aio_error;
	status = job->aio_sent;
#ifdef VERBOSE_TRACES
	CTR5(KTR_CXGBE, "%s: tid %d completed %p len %ld, error %d", __func__,
	    jobtotid(job), job, status, error);
#endif
	if (error != 0 && status != 0)
		error = 0;
	if (error == ECANCELED)
		aio_cancel(job);
	else if (error)
		aio_complete(job, -1, error);
	else {
		job->msgsnd = 1;
		aio_complete(job, status, 0);
	}
}

static void
aiotx_free_pgs(struct mbuf *m)
{
	struct kaiocb *job;
	vm_page_t pg;

	M_ASSERTEXTPG(m);
	job = m->m_ext.ext_arg1;
#ifdef VERBOSE_TRACES
	CTR3(KTR_CXGBE, "%s: completed %d bytes for tid %d", __func__,
	    m->m_len, jobtotid(job));
#endif

	for (int i = 0; i < m->m_epg_npgs; i++) {
		pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]);
		vm_page_unwire(pg, PQ_ACTIVE);
	}

	aiotx_free_job(job);
}

/*
 * Allocate a chain of unmapped mbufs describing the next 'len' bytes
 * of an AIO job.
 */
static struct mbuf *
alloc_aiotx_mbuf(struct kaiocb *job, int len)
{
	struct vmspace *vm;
	vm_page_t pgs[MBUF_PEXT_MAX_PGS];
	struct mbuf *m, *top, *last;
	vm_map_t map;
	vm_offset_t start;
	int i, mlen, npages, pgoff;

	KASSERT(job->aio_sent + len <= job->uaiocb.aio_nbytes,
	    ("%s(%p, %d): request to send beyond end of buffer", __func__,
	    job, len));

	/*
	 * The AIO subsystem will cancel and drain all requests before
	 * permitting a process to exit or exec, so p_vmspace should
	 * be stable here.
	 */
	vm = job->userproc->p_vmspace;
	map = &vm->vm_map;
	start = (uintptr_t)job->uaiocb.aio_buf + job->aio_sent;
	pgoff = start & PAGE_MASK;

	top = NULL;
	last = NULL;
	while (len > 0) {
		mlen = imin(len, MBUF_PEXT_MAX_PGS * PAGE_SIZE - pgoff);
		KASSERT(mlen == len || ((start + mlen) & PAGE_MASK) == 0,
		    ("%s: next start (%#jx + %#x) is not page aligned",
		    __func__, (uintmax_t)start, mlen));

		npages = vm_fault_quick_hold_pages(map, start, mlen,
		    VM_PROT_WRITE, pgs, nitems(pgs));
		if (npages < 0)
			break;

		m = mb_alloc_ext_pgs(M_WAITOK, aiotx_free_pgs);
		if (m == NULL) {
			vm_page_unhold_pages(pgs, npages);
			break;
		}

		m->m_epg_1st_off = pgoff;
		m->m_epg_npgs = npages;
		if (npages == 1) {
			KASSERT(mlen + pgoff <= PAGE_SIZE,
			    ("%s: single page is too large (off %d len %d)",
			    __func__, pgoff, mlen));
			m->m_epg_last_len = mlen;
		} else {
			m->m_epg_last_len = mlen - (PAGE_SIZE - pgoff) -
			    (npages - 2) * PAGE_SIZE;
		}
		for (i = 0; i < npages; i++)
			m->m_epg_pa[i] = VM_PAGE_TO_PHYS(pgs[i]);

		m->m_len = mlen;
		m->m_ext.ext_size = npages * PAGE_SIZE;
		m->m_ext.ext_arg1 = job;
		refcount_acquire(&job->aio_refs);

#ifdef VERBOSE_TRACES
		CTR5(KTR_CXGBE, "%s: tid %d, new mbuf %p for job %p, npages %d",
		    __func__, jobtotid(job), m, job, npages);
#endif

		if (top == NULL)
			top = m;
		else
			last->m_next = m;
		last = m;

		len -= mlen;
		start += mlen;
		pgoff = 0;
	}

	return (top);
}

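/*
 * Queue as much of an AIO write job as the socket will take right now.
 * Called from the aiotx task with so_snd locked; the lock is dropped
 * while the job is processed and reacquired before returning.
 */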
static void
t4_aiotx_process_job(struct toepcb *toep, struct socket *so, struct kaiocb *job)
{
	struct sockbuf *sb;
	struct file *fp;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct mbuf *m;
	int error, len;
	bool moretocome, sendmore;

	sb = &so->so_snd;
	SOCKBUF_UNLOCK(sb);
	fp = job->fd_file;
	m = NULL;

#ifdef MAC
	error = mac_socket_check_send(fp->f_cred, so);
	if (error != 0)
		goto out;
#endif

	/* Inline sosend_generic(). */

	error = sblock(sb, SBL_WAIT);
	MPASS(error == 0);

sendanother:
	SOCKBUF_LOCK(sb);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		if ((so->so_options & SO_NOSIGPIPE) == 0) {
			PROC_LOCK(job->userproc);
			kern_psignal(job->userproc, SIGPIPE);
			PROC_UNLOCK(job->userproc);
		}
		error = EPIPE;
		goto out;
	}
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		goto out;
	}
	if ((so->so_state & SS_ISCONNECTED) == 0) {
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		error = ENOTCONN;
		goto out;
	}
	if (sbspace(sb) < sb->sb_lowat) {
		MPASS(job->aio_sent == 0 || !(so->so_state & SS_NBIO));

		/*
		 * Don't block if there is too little room in the socket
		 * buffer.  Instead, requeue the request.
		 */
		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
			SOCKBUF_UNLOCK(sb);
			sbunlock(sb);
			error = ECANCELED;
			goto out;
		}
		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		goto out;
	}

	/*
	 * Write as much data as the socket permits, but no more than a
	 * single sndbuf at a time.
	 */
	len = sbspace(sb);
	if (len > job->uaiocb.aio_nbytes - job->aio_sent) {
		len = job->uaiocb.aio_nbytes - job->aio_sent;
		moretocome = false;
	} else
		moretocome = true;
	if (len > toep->params.sndbuf) {
		len = toep->params.sndbuf;
		sendmore = true;
	} else
		sendmore = false;

	if (!TAILQ_EMPTY(&toep->aiotx_jobq))
		moretocome = true;
	SOCKBUF_UNLOCK(sb);
	MPASS(len != 0);

	m = alloc_aiotx_mbuf(job, len);
	if (m == NULL) {
		sbunlock(sb);
		error = EFAULT;
		goto out;
	}

	/* Inlined tcp_usr_send(). */

	inp = toep->inp;
	INP_WLOCK(inp);
	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
		INP_WUNLOCK(inp);
		sbunlock(sb);
		error = ECONNRESET;
		goto out;
	}

	job->aio_sent += m_length(m, NULL);

	sbappendstream(sb, m, 0);
	m = NULL;

	if (!(inp->inp_flags & INP_DROPPED)) {
		tp = intotcpcb(inp);
		if (moretocome)
			tp->t_flags |= TF_MORETOCOME;
		error = tp->t_fb->tfb_tcp_output(tp);
		if (moretocome)
			tp->t_flags &= ~TF_MORETOCOME;
	}

	INP_WUNLOCK(inp);
	if (sendmore)
		goto sendanother;
	sbunlock(sb);

	if (error)
		goto out;

	/*
	 * If this is a blocking socket and the request has not been
	 * fully completed, requeue it until the socket is ready
	 * again.
	 */
	if (job->aio_sent < job->uaiocb.aio_nbytes &&
	    !(so->so_state & SS_NBIO)) {
		SOCKBUF_LOCK(sb);
		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
			SOCKBUF_UNLOCK(sb);
			error = ECANCELED;
			goto out;
		}
		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
		return;
	}

	/*
	 * If the request will not be requeued, drop the queue's
	 * reference to the job.  Any mbufs in flight should still
	 * hold a reference, but this drops the reference that the
	 * queue owns while it is waiting to queue mbufs to the
	 * socket.
	 */
	aiotx_free_job(job);

out:
	if (error) {
		job->aio_error = (void *)(intptr_t)error;
		aiotx_free_job(job);
	}
	m_freem(m);
	SOCKBUF_LOCK(sb);
}

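/*
 * Task handler that drains a connection's queue of pending AIO tx
 * jobs while its socket remains writable.  It drops the socket and
 * toepcb references taken when the task was scheduled.
 */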
"true" : "false"); 2306 #endif 2307 if (toep->aiotx_so != NULL) 2308 return; 2309 soref(so); 2310 toep->aiotx_so = so; 2311 hold_toepcb(toep); 2312 soaio_enqueue(&toep->aiotx_task); 2313 } 2314 2315 static void 2316 t4_aiotx_cancel(struct kaiocb *job) 2317 { 2318 struct socket *so; 2319 struct sockbuf *sb; 2320 struct tcpcb *tp; 2321 struct toepcb *toep; 2322 2323 so = job->fd_file->f_data; 2324 tp = so_sototcpcb(so); 2325 toep = tp->t_toe; 2326 MPASS(job->uaiocb.aio_lio_opcode == LIO_WRITE); 2327 sb = &so->so_snd; 2328 2329 SOCKBUF_LOCK(sb); 2330 if (!aio_cancel_cleared(job)) 2331 TAILQ_REMOVE(&toep->aiotx_jobq, job, list); 2332 SOCKBUF_UNLOCK(sb); 2333 2334 job->aio_error = (void *)(intptr_t)ECANCELED; 2335 aiotx_free_job(job); 2336 } 2337 2338 int 2339 t4_aio_queue_aiotx(struct socket *so, struct kaiocb *job) 2340 { 2341 struct tcpcb *tp = so_sototcpcb(so); 2342 struct toepcb *toep = tp->t_toe; 2343 struct adapter *sc = td_adapter(toep->td); 2344 2345 /* This only handles writes. */ 2346 if (job->uaiocb.aio_lio_opcode != LIO_WRITE) 2347 return (EOPNOTSUPP); 2348 2349 if (!sc->tt.tx_zcopy) 2350 return (EOPNOTSUPP); 2351 2352 if (tls_tx_key(toep)) 2353 return (EOPNOTSUPP); 2354 2355 SOCKBUF_LOCK(&so->so_snd); 2356 #ifdef VERBOSE_TRACES 2357 CTR3(KTR_CXGBE, "%s: queueing %p for tid %u", __func__, job, toep->tid); 2358 #endif 2359 if (!aio_set_cancel_function(job, t4_aiotx_cancel)) 2360 panic("new job was cancelled"); 2361 refcount_init(&job->aio_refs, 1); 2362 TAILQ_INSERT_TAIL(&toep->aiotx_jobq, job, list); 2363 if (sowriteable(so)) 2364 t4_aiotx_queue_toep(so, toep); 2365 SOCKBUF_UNLOCK(&so->so_snd); 2366 return (0); 2367 } 2368 2369 void 2370 aiotx_init_toep(struct toepcb *toep) 2371 { 2372 2373 TAILQ_INIT(&toep->aiotx_jobq); 2374 TASK_INIT(&toep->aiotx_task, 0, t4_aiotx_task, toep); 2375 } 2376 #endif 2377