1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2012, 2015 Chelsio Communications, Inc. 5 * All rights reserved. 6 * Written by: Navdeep Parhar <np@FreeBSD.org> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 #include <sys/cdefs.h> 31 __FBSDID("$FreeBSD$"); 32 33 #include "opt_inet.h" 34 #include "opt_inet6.h" 35 #include "opt_ratelimit.h" 36 37 #ifdef TCP_OFFLOAD 38 #include <sys/param.h> 39 #include <sys/aio.h> 40 #include <sys/file.h> 41 #include <sys/kernel.h> 42 #include <sys/ktr.h> 43 #include <sys/module.h> 44 #include <sys/proc.h> 45 #include <sys/protosw.h> 46 #include <sys/domain.h> 47 #include <sys/socket.h> 48 #include <sys/socketvar.h> 49 #include <sys/sglist.h> 50 #include <sys/taskqueue.h> 51 #include <netinet/in.h> 52 #include <netinet/in_pcb.h> 53 #include <netinet/ip.h> 54 #include <netinet/ip6.h> 55 #define TCPSTATES 56 #include <netinet/tcp_fsm.h> 57 #include <netinet/tcp_seq.h> 58 #include <netinet/tcp_var.h> 59 #include <netinet/toecore.h> 60 61 #include <security/mac/mac_framework.h> 62 63 #include <vm/vm.h> 64 #include <vm/vm_extern.h> 65 #include <vm/pmap.h> 66 #include <vm/vm_map.h> 67 #include <vm/vm_page.h> 68 69 #include "common/common.h" 70 #include "common/t4_msg.h" 71 #include "common/t4_regs.h" 72 #include "common/t4_tcb.h" 73 #include "tom/t4_tom_l2t.h" 74 #include "tom/t4_tom.h" 75 76 static void t4_aiotx_cancel(struct kaiocb *job); 77 static void t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep); 78 79 void 80 send_flowc_wr(struct toepcb *toep, struct flowc_tx_params *ftxp) 81 { 82 struct wrqe *wr; 83 struct fw_flowc_wr *flowc; 84 unsigned int nparams, flowclen, paramidx; 85 struct vi_info *vi = toep->vi; 86 struct port_info *pi = vi->pi; 87 struct adapter *sc = pi->adapter; 88 unsigned int pfvf = sc->pf << S_FW_VIID_PFN; 89 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; 90 91 KASSERT(!(toep->flags & TPF_FLOWC_WR_SENT), 92 ("%s: flowc for tid %u sent already", __func__, toep->tid)); 93 94 if (ftxp != NULL) 95 nparams = 8; 96 else 97 nparams = 6; 98 if (toep->ulp_mode == ULP_MODE_TLS) 99 nparams++; 100 if (toep->tls.fcplenmax != 0) 101 nparams++; 102 if (toep->tc_idx != -1) { 103 MPASS(toep->tc_idx >= 0 && 104 toep->tc_idx < 
sc->chip_params->nsched_cls); 105 nparams++; 106 } 107 108 flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval); 109 110 wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq); 111 if (wr == NULL) { 112 /* XXX */ 113 panic("%s: allocation failure.", __func__); 114 } 115 flowc = wrtod(wr); 116 memset(flowc, 0, wr->wr_len); 117 118 flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | 119 V_FW_FLOWC_WR_NPARAMS(nparams)); 120 flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) | 121 V_FW_WR_FLOWID(toep->tid)); 122 123 #define FLOWC_PARAM(__m, __v) \ 124 do { \ 125 flowc->mnemval[paramidx].mnemonic = FW_FLOWC_MNEM_##__m; \ 126 flowc->mnemval[paramidx].val = htobe32(__v); \ 127 paramidx++; \ 128 } while (0) 129 130 paramidx = 0; 131 132 FLOWC_PARAM(PFNVFN, pfvf); 133 FLOWC_PARAM(CH, pi->tx_chan); 134 FLOWC_PARAM(PORT, pi->tx_chan); 135 FLOWC_PARAM(IQID, toep->ofld_rxq->iq.abs_id); 136 if (ftxp) { 137 uint32_t sndbuf = min(ftxp->snd_space, sc->tt.sndbuf); 138 139 FLOWC_PARAM(SNDNXT, ftxp->snd_nxt); 140 FLOWC_PARAM(RCVNXT, ftxp->rcv_nxt); 141 FLOWC_PARAM(SNDBUF, sndbuf); 142 FLOWC_PARAM(MSS, ftxp->mss); 143 144 CTR6(KTR_CXGBE, 145 "%s: tid %u, mss %u, sndbuf %u, snd_nxt 0x%x, rcv_nxt 0x%x", 146 __func__, toep->tid, ftxp->mss, sndbuf, ftxp->snd_nxt, 147 ftxp->rcv_nxt); 148 } else { 149 FLOWC_PARAM(SNDBUF, 512); 150 FLOWC_PARAM(MSS, 512); 151 152 CTR2(KTR_CXGBE, "%s: tid %u", __func__, toep->tid); 153 } 154 if (toep->ulp_mode == ULP_MODE_TLS) 155 FLOWC_PARAM(ULP_MODE, toep->ulp_mode); 156 if (toep->tls.fcplenmax != 0) 157 FLOWC_PARAM(TXDATAPLEN_MAX, toep->tls.fcplenmax); 158 if (toep->tc_idx != -1) 159 FLOWC_PARAM(SCHEDCLASS, toep->tc_idx); 160 #undef FLOWC_PARAM 161 162 KASSERT(paramidx == nparams, ("nparams mismatch")); 163 164 txsd->tx_credits = howmany(flowclen, 16); 165 txsd->plen = 0; 166 KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0, 167 ("%s: not enough credits (%d)", __func__, toep->tx_credits)); 168 toep->tx_credits -= txsd->tx_credits; 169 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) 170 toep->txsd_pidx = 0; 171 toep->txsd_avail--; 172 173 toep->flags |= TPF_FLOWC_WR_SENT; 174 t4_wrq_tx(sc, wr); 175 } 176 177 #ifdef RATELIMIT 178 /* 179 * Input is Bytes/second (so_max_pacing_rate), chip counts in Kilobits/second. 
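 * A worked example of the conversion done below: a pacing rate of
 * 125,000,000 Bytes/s (1 Gb/s) becomes 125000000 * 8 / 1000 = 1,000,000 Kb/s
 * for the scheduler class.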
180 */ 181 static int 182 update_tx_rate_limit(struct adapter *sc, struct toepcb *toep, u_int Bps) 183 { 184 int tc_idx, rc; 185 const u_int kbps = (u_int) (uint64_t)Bps * 8ULL / 1000; 186 const int port_id = toep->vi->pi->port_id; 187 188 CTR3(KTR_CXGBE, "%s: tid %u, rate %uKbps", __func__, toep->tid, kbps); 189 190 if (kbps == 0) { 191 /* unbind */ 192 tc_idx = -1; 193 } else { 194 rc = t4_reserve_cl_rl_kbps(sc, port_id, kbps, &tc_idx); 195 if (rc != 0) 196 return (rc); 197 MPASS(tc_idx >= 0 && tc_idx < sc->chip_params->nsched_cls); 198 } 199 200 if (toep->tc_idx != tc_idx) { 201 struct wrqe *wr; 202 struct fw_flowc_wr *flowc; 203 int nparams = 1, flowclen, flowclen16; 204 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; 205 206 flowclen = sizeof(*flowc) + nparams * sizeof(struct 207 fw_flowc_mnemval); 208 flowclen16 = howmany(flowclen, 16); 209 if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0 || 210 (wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq)) == NULL) { 211 if (tc_idx >= 0) 212 t4_release_cl_rl(sc, port_id, tc_idx); 213 return (ENOMEM); 214 } 215 216 flowc = wrtod(wr); 217 memset(flowc, 0, wr->wr_len); 218 219 flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | 220 V_FW_FLOWC_WR_NPARAMS(nparams)); 221 flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) | 222 V_FW_WR_FLOWID(toep->tid)); 223 224 flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS; 225 if (tc_idx == -1) 226 flowc->mnemval[0].val = htobe32(0xff); 227 else 228 flowc->mnemval[0].val = htobe32(tc_idx); 229 230 txsd->tx_credits = flowclen16; 231 txsd->plen = 0; 232 toep->tx_credits -= txsd->tx_credits; 233 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) 234 toep->txsd_pidx = 0; 235 toep->txsd_avail--; 236 t4_wrq_tx(sc, wr); 237 } 238 239 if (toep->tc_idx >= 0) 240 t4_release_cl_rl(sc, port_id, toep->tc_idx); 241 toep->tc_idx = tc_idx; 242 243 return (0); 244 } 245 #endif 246 247 void 248 send_reset(struct adapter *sc, struct toepcb *toep, uint32_t snd_nxt) 249 { 250 struct wrqe *wr; 251 struct cpl_abort_req *req; 252 int tid = toep->tid; 253 struct inpcb *inp = toep->inp; 254 struct tcpcb *tp = intotcpcb(inp); /* don't use if INP_DROPPED */ 255 256 INP_WLOCK_ASSERT(inp); 257 258 CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x%s", 259 __func__, toep->tid, 260 inp->inp_flags & INP_DROPPED ? "inp dropped" : 261 tcpstates[tp->t_state], 262 toep->flags, inp->inp_flags, 263 toep->flags & TPF_ABORT_SHUTDOWN ? 264 " (abort already in progress)" : ""); 265 266 if (toep->flags & TPF_ABORT_SHUTDOWN) 267 return; /* abort already in progress */ 268 269 toep->flags |= TPF_ABORT_SHUTDOWN; 270 271 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 272 ("%s: flowc_wr not sent for tid %d.", __func__, tid)); 273 274 wr = alloc_wrqe(sizeof(*req), toep->ofld_txq); 275 if (wr == NULL) { 276 /* XXX */ 277 panic("%s: allocation failure.", __func__); 278 } 279 req = wrtod(wr); 280 281 INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid); 282 if (inp->inp_flags & INP_DROPPED) 283 req->rsvd0 = htobe32(snd_nxt); 284 else 285 req->rsvd0 = htobe32(tp->snd_nxt); 286 req->rsvd1 = !(toep->flags & TPF_TX_DATA_SENT); 287 req->cmd = CPL_ABORT_SEND_RST; 288 289 /* 290 * XXX: What's the correct way to tell that the inp hasn't been detached 291 * from its socket? Should I even be flushing the snd buffer here? 292 */ 293 if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) { 294 struct socket *so = inp->inp_socket; 295 296 if (so != NULL) /* because I'm not sure. 
See comment above */ 297 sbflush(&so->so_snd); 298 } 299 300 t4_l2t_send(sc, wr, toep->l2te); 301 } 302 303 /* 304 * Called when a connection is established to translate the TCP options 305 * reported by HW to FreeBSD's native format. 306 */ 307 static void 308 assign_rxopt(struct tcpcb *tp, uint16_t opt) 309 { 310 struct toepcb *toep = tp->t_toe; 311 struct inpcb *inp = tp->t_inpcb; 312 struct adapter *sc = td_adapter(toep->td); 313 314 INP_LOCK_ASSERT(inp); 315 316 toep->tcp_opt = opt; 317 toep->mtu_idx = G_TCPOPT_MSS(opt); 318 tp->t_maxseg = sc->params.mtus[toep->mtu_idx]; 319 if (inp->inp_inc.inc_flags & INC_ISIPV6) 320 tp->t_maxseg -= sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 321 else 322 tp->t_maxseg -= sizeof(struct ip) + sizeof(struct tcphdr); 323 324 toep->emss = tp->t_maxseg; 325 if (G_TCPOPT_TSTAMP(opt)) { 326 tp->t_flags |= TF_RCVD_TSTMP; /* timestamps ok */ 327 tp->ts_recent = 0; /* hmmm */ 328 tp->ts_recent_age = tcp_ts_getticks(); 329 toep->emss -= TCPOLEN_TSTAMP_APPA; 330 } 331 332 CTR6(KTR_CXGBE, "%s: tid %d, mtu_idx %u (%u), t_maxseg %u, emss %u", 333 __func__, toep->tid, toep->mtu_idx, 334 sc->params.mtus[G_TCPOPT_MSS(opt)], tp->t_maxseg, toep->emss); 335 336 if (G_TCPOPT_SACK(opt)) 337 tp->t_flags |= TF_SACK_PERMIT; /* should already be set */ 338 else 339 tp->t_flags &= ~TF_SACK_PERMIT; /* sack disallowed by peer */ 340 341 if (G_TCPOPT_WSCALE_OK(opt)) 342 tp->t_flags |= TF_RCVD_SCALE; 343 344 /* Doing window scaling? */ 345 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 346 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 347 tp->rcv_scale = tp->request_r_scale; 348 tp->snd_scale = G_TCPOPT_SND_WSCALE(opt); 349 } 350 } 351 352 /* 353 * Completes some final bits of initialization for just established connections 354 * and changes their state to TCPS_ESTABLISHED. 355 * 356 * The ISNs are from the exchange of SYNs. 
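 * (snd_una, snd_nxt, and snd_max all start at iss + 1 below;
 * tcp_rcvseqinit() does the equivalent for the receive side.)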
357 */ 358 void 359 make_established(struct toepcb *toep, uint32_t iss, uint32_t irs, uint16_t opt) 360 { 361 struct inpcb *inp = toep->inp; 362 struct socket *so = inp->inp_socket; 363 struct tcpcb *tp = intotcpcb(inp); 364 long bufsize; 365 uint16_t tcpopt = be16toh(opt); 366 struct flowc_tx_params ftxp; 367 368 INP_WLOCK_ASSERT(inp); 369 KASSERT(tp->t_state == TCPS_SYN_SENT || 370 tp->t_state == TCPS_SYN_RECEIVED, 371 ("%s: TCP state %s", __func__, tcpstates[tp->t_state])); 372 373 CTR6(KTR_CXGBE, "%s: tid %d, so %p, inp %p, tp %p, toep %p", 374 __func__, toep->tid, so, inp, tp, toep); 375 376 tcp_state_change(tp, TCPS_ESTABLISHED); 377 tp->t_starttime = ticks; 378 TCPSTAT_INC(tcps_connects); 379 380 tp->irs = irs; 381 tcp_rcvseqinit(tp); 382 tp->rcv_wnd = (u_int)toep->opt0_rcv_bufsize << 10; 383 tp->rcv_adv += tp->rcv_wnd; 384 tp->last_ack_sent = tp->rcv_nxt; 385 386 tp->iss = iss; 387 tcp_sendseqinit(tp); 388 tp->snd_una = iss + 1; 389 tp->snd_nxt = iss + 1; 390 tp->snd_max = iss + 1; 391 392 assign_rxopt(tp, tcpopt); 393 394 SOCKBUF_LOCK(&so->so_snd); 395 if (so->so_snd.sb_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf) 396 bufsize = V_tcp_autosndbuf_max; 397 else 398 bufsize = sbspace(&so->so_snd); 399 SOCKBUF_UNLOCK(&so->so_snd); 400 401 ftxp.snd_nxt = tp->snd_nxt; 402 ftxp.rcv_nxt = tp->rcv_nxt; 403 ftxp.snd_space = bufsize; 404 ftxp.mss = toep->emss; 405 send_flowc_wr(toep, &ftxp); 406 407 soisconnected(so); 408 } 409 410 int 411 send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits) 412 { 413 struct wrqe *wr; 414 struct cpl_rx_data_ack *req; 415 uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); 416 417 KASSERT(credits >= 0, ("%s: %d credits", __func__, credits)); 418 419 wr = alloc_wrqe(sizeof(*req), toep->ctrlq); 420 if (wr == NULL) 421 return (0); 422 req = wrtod(wr); 423 424 INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid); 425 req->credit_dack = htobe32(dack | V_RX_CREDITS(credits)); 426 427 t4_wrq_tx(sc, wr); 428 return (credits); 429 } 430 431 void 432 send_rx_modulate(struct adapter *sc, struct toepcb *toep) 433 { 434 struct wrqe *wr; 435 struct cpl_rx_data_ack *req; 436 437 wr = alloc_wrqe(sizeof(*req), toep->ctrlq); 438 if (wr == NULL) 439 return; 440 req = wrtod(wr); 441 442 INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid); 443 req->credit_dack = htobe32(F_RX_MODULATE_RX); 444 445 t4_wrq_tx(sc, wr); 446 } 447 448 void 449 t4_rcvd_locked(struct toedev *tod, struct tcpcb *tp) 450 { 451 struct adapter *sc = tod->tod_softc; 452 struct inpcb *inp = tp->t_inpcb; 453 struct socket *so = inp->inp_socket; 454 struct sockbuf *sb = &so->so_rcv; 455 struct toepcb *toep = tp->t_toe; 456 int rx_credits; 457 458 INP_WLOCK_ASSERT(inp); 459 SOCKBUF_LOCK_ASSERT(sb); 460 461 rx_credits = sbspace(sb) > tp->rcv_wnd ? 
sbspace(sb) - tp->rcv_wnd : 0; 462 if (toep->ulp_mode == ULP_MODE_TLS) { 463 if (toep->tls.rcv_over >= rx_credits) { 464 toep->tls.rcv_over -= rx_credits; 465 rx_credits = 0; 466 } else { 467 rx_credits -= toep->tls.rcv_over; 468 toep->tls.rcv_over = 0; 469 } 470 } 471 472 if (rx_credits > 0 && 473 (tp->rcv_wnd <= 32 * 1024 || rx_credits >= 64 * 1024 || 474 (rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) || 475 sbused(sb) + tp->rcv_wnd < sb->sb_lowat)) { 476 rx_credits = send_rx_credits(sc, toep, rx_credits); 477 tp->rcv_wnd += rx_credits; 478 tp->rcv_adv += rx_credits; 479 } else if (toep->flags & TPF_FORCE_CREDITS) 480 send_rx_modulate(sc, toep); 481 } 482 483 void 484 t4_rcvd(struct toedev *tod, struct tcpcb *tp) 485 { 486 struct inpcb *inp = tp->t_inpcb; 487 struct socket *so = inp->inp_socket; 488 struct sockbuf *sb = &so->so_rcv; 489 490 SOCKBUF_LOCK(sb); 491 t4_rcvd_locked(tod, tp); 492 SOCKBUF_UNLOCK(sb); 493 } 494 495 /* 496 * Close a connection by sending a CPL_CLOSE_CON_REQ message. 497 */ 498 int 499 t4_close_conn(struct adapter *sc, struct toepcb *toep) 500 { 501 struct wrqe *wr; 502 struct cpl_close_con_req *req; 503 unsigned int tid = toep->tid; 504 505 CTR3(KTR_CXGBE, "%s: tid %u%s", __func__, toep->tid, 506 toep->flags & TPF_FIN_SENT ? ", IGNORED" : ""); 507 508 if (toep->flags & TPF_FIN_SENT) 509 return (0); 510 511 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 512 ("%s: flowc_wr not sent for tid %u.", __func__, tid)); 513 514 wr = alloc_wrqe(sizeof(*req), toep->ofld_txq); 515 if (wr == NULL) { 516 /* XXX */ 517 panic("%s: allocation failure.", __func__); 518 } 519 req = wrtod(wr); 520 521 req->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) | 522 V_FW_WR_IMMDLEN(sizeof(*req) - sizeof(req->wr))); 523 req->wr.wr_mid = htonl(V_FW_WR_LEN16(howmany(sizeof(*req), 16)) | 524 V_FW_WR_FLOWID(tid)); 525 req->wr.wr_lo = cpu_to_be64(0); 526 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid)); 527 req->rsvd = 0; 528 529 toep->flags |= TPF_FIN_SENT; 530 toep->flags &= ~TPF_SEND_FIN; 531 t4_l2t_send(sc, wr, toep->l2te); 532 533 return (0); 534 } 535 536 #define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16) 537 #define MIN_OFLD_TX_CREDITS (howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16)) 538 539 /* Maximum amount of immediate data we could stuff in a WR */ 540 static inline int 541 max_imm_payload(int tx_credits) 542 { 543 const int n = 2; /* Use only up to 2 desc for imm. 
data WR */ 544 545 KASSERT(tx_credits >= 0 && 546 tx_credits <= MAX_OFLD_TX_CREDITS, 547 ("%s: %d credits", __func__, tx_credits)); 548 549 if (tx_credits < MIN_OFLD_TX_CREDITS) 550 return (0); 551 552 if (tx_credits >= (n * EQ_ESIZE) / 16) 553 return ((n * EQ_ESIZE) - sizeof(struct fw_ofld_tx_data_wr)); 554 else 555 return (tx_credits * 16 - sizeof(struct fw_ofld_tx_data_wr)); 556 } 557 558 /* Maximum number of SGL entries we could stuff in a WR */ 559 static inline int 560 max_dsgl_nsegs(int tx_credits) 561 { 562 int nseg = 1; /* ulptx_sgl has room for 1, rest ulp_tx_sge_pair */ 563 int sge_pair_credits = tx_credits - MIN_OFLD_TX_CREDITS; 564 565 KASSERT(tx_credits >= 0 && 566 tx_credits <= MAX_OFLD_TX_CREDITS, 567 ("%s: %d credits", __func__, tx_credits)); 568 569 if (tx_credits < MIN_OFLD_TX_CREDITS) 570 return (0); 571 572 nseg += 2 * (sge_pair_credits * 16 / 24); 573 if ((sge_pair_credits * 16) % 24 == 16) 574 nseg++; 575 576 return (nseg); 577 } 578 579 static inline void 580 write_tx_wr(void *dst, struct toepcb *toep, unsigned int immdlen, 581 unsigned int plen, uint8_t credits, int shove, int ulp_submode, int txalign) 582 { 583 struct fw_ofld_tx_data_wr *txwr = dst; 584 585 txwr->op_to_immdlen = htobe32(V_WR_OP(FW_OFLD_TX_DATA_WR) | 586 V_FW_WR_IMMDLEN(immdlen)); 587 txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) | 588 V_FW_WR_LEN16(credits)); 589 txwr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(toep->ulp_mode) | 590 V_TX_ULP_SUBMODE(ulp_submode) | V_TX_URG(0) | V_TX_SHOVE(shove)); 591 txwr->plen = htobe32(plen); 592 593 if (txalign > 0) { 594 struct tcpcb *tp = intotcpcb(toep->inp); 595 596 if (plen < 2 * toep->emss) 597 txwr->lsodisable_to_flags |= 598 htobe32(F_FW_OFLD_TX_DATA_WR_LSODISABLE); 599 else 600 txwr->lsodisable_to_flags |= 601 htobe32(F_FW_OFLD_TX_DATA_WR_ALIGNPLD | 602 (tp->t_flags & TF_NODELAY ? 0 : 603 F_FW_OFLD_TX_DATA_WR_ALIGNPLDSHOVE)); 604 } 605 } 606 607 /* 608 * Generate a DSGL from a starting mbuf. The total number of segments and the 609 * maximum segments in any one mbuf are provided. 610 */ 611 static void 612 write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n) 613 { 614 struct mbuf *m; 615 struct ulptx_sgl *usgl = dst; 616 int i, j, rc; 617 struct sglist sg; 618 struct sglist_seg segs[n]; 619 620 KASSERT(nsegs > 0, ("%s: nsegs 0", __func__)); 621 622 sglist_init(&sg, n, segs); 623 usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) | 624 V_ULPTX_NSGE(nsegs)); 625 626 i = -1; 627 for (m = start; m != stop; m = m->m_next) { 628 if (m->m_flags & M_NOMAP) 629 rc = sglist_append_mb_ext_pgs(&sg, m); 630 else 631 rc = sglist_append(&sg, mtod(m, void *), m->m_len); 632 if (__predict_false(rc != 0)) 633 panic("%s: sglist_append %d", __func__, rc); 634 635 for (j = 0; j < sg.sg_nseg; i++, j++) { 636 if (i < 0) { 637 usgl->len0 = htobe32(segs[j].ss_len); 638 usgl->addr0 = htobe64(segs[j].ss_paddr); 639 } else { 640 usgl->sge[i / 2].len[i & 1] = 641 htobe32(segs[j].ss_len); 642 usgl->sge[i / 2].addr[i & 1] = 643 htobe64(segs[j].ss_paddr); 644 } 645 #ifdef INVARIANTS 646 nsegs--; 647 #endif 648 } 649 sglist_reset(&sg); 650 } 651 if (i & 1) 652 usgl->sge[i / 2].len[1] = htobe32(0); 653 KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, stop %p", 654 __func__, nsegs, start, stop)); 655 } 656 657 /* 658 * Max number of SGL entries an offload tx work request can have. This is 41 659 * (1 + 40) for a full 512B work request. 
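 * The 512B budget breaks down as below; each 24B ulptx_sge_pair carries two
 * entries, so 480B worth of pairs holds the remaining 40: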
660 * fw_ofld_tx_data_wr(16B) + ulptx_sgl(16B, 1) + ulptx_sge_pair(480B, 40) 661 */ 662 #define OFLD_SGL_LEN (41) 663 664 /* 665 * Send data and/or a FIN to the peer. 666 * 667 * The socket's so_snd buffer consists of a stream of data starting with sb_mb 668 * and linked together with m_next. sb_sndptr, if set, is the last mbuf that 669 * was transmitted. 670 * 671 * drop indicates the number of bytes that should be dropped from the head of 672 * the send buffer. It is an optimization that lets do_fw4_ack avoid creating 673 * contention on the send buffer lock (before this change it used to do 674 * sowwakeup and then t4_push_frames right after that when recovering from tx 675 * stalls). When drop is set this function MUST drop the bytes and wake up any 676 * writers. 677 */ 678 void 679 t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop) 680 { 681 struct mbuf *sndptr, *m, *sb_sndptr; 682 struct fw_ofld_tx_data_wr *txwr; 683 struct wrqe *wr; 684 u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf; 685 struct inpcb *inp = toep->inp; 686 struct tcpcb *tp = intotcpcb(inp); 687 struct socket *so = inp->inp_socket; 688 struct sockbuf *sb = &so->so_snd; 689 int tx_credits, shove, compl, sowwakeup; 690 struct ofld_tx_sdesc *txsd; 691 bool nomap_mbuf_seen; 692 693 INP_WLOCK_ASSERT(inp); 694 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 695 ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid)); 696 697 KASSERT(toep->ulp_mode == ULP_MODE_NONE || 698 toep->ulp_mode == ULP_MODE_TCPDDP || 699 toep->ulp_mode == ULP_MODE_TLS || 700 toep->ulp_mode == ULP_MODE_RDMA, 701 ("%s: ulp_mode %u for toep %p", __func__, toep->ulp_mode, toep)); 702 703 #ifdef VERBOSE_TRACES 704 CTR5(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d", 705 __func__, toep->tid, toep->flags, tp->t_flags, drop); 706 #endif 707 if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) 708 return; 709 710 #ifdef RATELIMIT 711 if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) && 712 (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) { 713 inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; 714 } 715 #endif 716 717 /* 718 * This function doesn't resume by itself. Someone else must clear the 719 * flag and call this function. 720 */ 721 if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) { 722 KASSERT(drop == 0, 723 ("%s: drop (%d) != 0 but tx is suspended", __func__, drop)); 724 return; 725 } 726 727 txsd = &toep->txsd[toep->txsd_pidx]; 728 do { 729 tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS); 730 max_imm = max_imm_payload(tx_credits); 731 max_nsegs = max_dsgl_nsegs(tx_credits); 732 733 SOCKBUF_LOCK(sb); 734 sowwakeup = drop; 735 if (drop) { 736 sbdrop_locked(sb, drop); 737 drop = 0; 738 } 739 sb_sndptr = sb->sb_sndptr; 740 sndptr = sb_sndptr ? 
sb_sndptr->m_next : sb->sb_mb; 741 plen = 0; 742 nsegs = 0; 743 max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */ 744 nomap_mbuf_seen = false; 745 for (m = sndptr; m != NULL; m = m->m_next) { 746 int n; 747 748 if (m->m_flags & M_NOMAP) 749 n = sglist_count_mb_ext_pgs(m); 750 else 751 n = sglist_count(mtod(m, void *), m->m_len); 752 753 nsegs += n; 754 plen += m->m_len; 755 756 /* This mbuf sent us _over_ the nsegs limit, back out */ 757 if (plen > max_imm && nsegs > max_nsegs) { 758 nsegs -= n; 759 plen -= m->m_len; 760 if (plen == 0) { 761 /* Too few credits */ 762 toep->flags |= TPF_TX_SUSPENDED; 763 if (sowwakeup) { 764 if (!TAILQ_EMPTY( 765 &toep->aiotx_jobq)) 766 t4_aiotx_queue_toep(so, 767 toep); 768 sowwakeup_locked(so); 769 } else 770 SOCKBUF_UNLOCK(sb); 771 SOCKBUF_UNLOCK_ASSERT(sb); 772 return; 773 } 774 break; 775 } 776 777 if (m->m_flags & M_NOMAP) 778 nomap_mbuf_seen = true; 779 if (max_nsegs_1mbuf < n) 780 max_nsegs_1mbuf = n; 781 sb_sndptr = m; /* new sb->sb_sndptr if all goes well */ 782 783 /* This mbuf put us right at the max_nsegs limit */ 784 if (plen > max_imm && nsegs == max_nsegs) { 785 m = m->m_next; 786 break; 787 } 788 } 789 790 if (sbused(sb) > sb->sb_hiwat * 5 / 8 && 791 toep->plen_nocompl + plen >= sb->sb_hiwat / 4) 792 compl = 1; 793 else 794 compl = 0; 795 796 if (sb->sb_flags & SB_AUTOSIZE && 797 V_tcp_do_autosndbuf && 798 sb->sb_hiwat < V_tcp_autosndbuf_max && 799 sbused(sb) >= sb->sb_hiwat * 7 / 8) { 800 int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc, 801 V_tcp_autosndbuf_max); 802 803 if (!sbreserve_locked(sb, newsize, so, NULL)) 804 sb->sb_flags &= ~SB_AUTOSIZE; 805 else 806 sowwakeup = 1; /* room available */ 807 } 808 if (sowwakeup) { 809 if (!TAILQ_EMPTY(&toep->aiotx_jobq)) 810 t4_aiotx_queue_toep(so, toep); 811 sowwakeup_locked(so); 812 } else 813 SOCKBUF_UNLOCK(sb); 814 SOCKBUF_UNLOCK_ASSERT(sb); 815 816 /* nothing to send */ 817 if (plen == 0) { 818 KASSERT(m == NULL, 819 ("%s: nothing to send, but m != NULL", __func__)); 820 break; 821 } 822 823 if (__predict_false(toep->flags & TPF_FIN_SENT)) 824 panic("%s: excess tx.", __func__); 825 826 shove = m == NULL && !(tp->t_flags & TF_MORETOCOME); 827 if (plen <= max_imm && !nomap_mbuf_seen) { 828 829 /* Immediate data tx */ 830 831 wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16), 832 toep->ofld_txq); 833 if (wr == NULL) { 834 /* XXX: how will we recover from this? */ 835 toep->flags |= TPF_TX_SUSPENDED; 836 return; 837 } 838 txwr = wrtod(wr); 839 credits = howmany(wr->wr_len, 16); 840 write_tx_wr(txwr, toep, plen, plen, credits, shove, 0, 841 sc->tt.tx_align); 842 m_copydata(sndptr, 0, plen, (void *)(txwr + 1)); 843 nsegs = 0; 844 } else { 845 int wr_len; 846 847 /* DSGL tx */ 848 849 wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) + 850 ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8; 851 wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq); 852 if (wr == NULL) { 853 /* XXX: how will we recover from this? 
*/ 854 toep->flags |= TPF_TX_SUSPENDED; 855 return; 856 } 857 txwr = wrtod(wr); 858 credits = howmany(wr_len, 16); 859 write_tx_wr(txwr, toep, 0, plen, credits, shove, 0, 860 sc->tt.tx_align); 861 write_tx_sgl(txwr + 1, sndptr, m, nsegs, 862 max_nsegs_1mbuf); 863 if (wr_len & 0xf) { 864 uint64_t *pad = (uint64_t *) 865 ((uintptr_t)txwr + wr_len); 866 *pad = 0; 867 } 868 } 869 870 KASSERT(toep->tx_credits >= credits, 871 ("%s: not enough credits", __func__)); 872 873 toep->tx_credits -= credits; 874 toep->tx_nocompl += credits; 875 toep->plen_nocompl += plen; 876 if (toep->tx_credits <= toep->tx_total * 3 / 8 && 877 toep->tx_nocompl >= toep->tx_total / 4) 878 compl = 1; 879 880 if (compl || toep->ulp_mode == ULP_MODE_RDMA) { 881 txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL); 882 toep->tx_nocompl = 0; 883 toep->plen_nocompl = 0; 884 } 885 886 tp->snd_nxt += plen; 887 tp->snd_max += plen; 888 889 SOCKBUF_LOCK(sb); 890 KASSERT(sb_sndptr, ("%s: sb_sndptr is NULL", __func__)); 891 sb->sb_sndptr = sb_sndptr; 892 SOCKBUF_UNLOCK(sb); 893 894 toep->flags |= TPF_TX_DATA_SENT; 895 if (toep->tx_credits < MIN_OFLD_TX_CREDITS) 896 toep->flags |= TPF_TX_SUSPENDED; 897 898 KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); 899 txsd->plen = plen; 900 txsd->tx_credits = credits; 901 txsd++; 902 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) { 903 toep->txsd_pidx = 0; 904 txsd = &toep->txsd[0]; 905 } 906 toep->txsd_avail--; 907 908 t4_l2t_send(sc, wr, toep->l2te); 909 } while (m != NULL); 910 911 /* Send a FIN if requested, but only if there's no more data to send */ 912 if (m == NULL && toep->flags & TPF_SEND_FIN) 913 t4_close_conn(sc, toep); 914 } 915 916 static inline void 917 rqdrop_locked(struct mbufq *q, int plen) 918 { 919 struct mbuf *m; 920 921 while (plen > 0) { 922 m = mbufq_dequeue(q); 923 924 /* Too many credits. */ 925 MPASS(m != NULL); 926 M_ASSERTPKTHDR(m); 927 928 /* Partial credits. */ 929 MPASS(plen >= m->m_pkthdr.len); 930 931 plen -= m->m_pkthdr.len; 932 m_freem(m); 933 } 934 } 935 936 void 937 t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop) 938 { 939 struct mbuf *sndptr, *m; 940 struct fw_ofld_tx_data_wr *txwr; 941 struct wrqe *wr; 942 u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf; 943 u_int adjusted_plen, ulp_submode; 944 struct inpcb *inp = toep->inp; 945 struct tcpcb *tp = intotcpcb(inp); 946 int tx_credits, shove; 947 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; 948 struct mbufq *pduq = &toep->ulp_pduq; 949 static const u_int ulp_extra_len[] = {0, 4, 4, 8}; 950 951 INP_WLOCK_ASSERT(inp); 952 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 953 ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid)); 954 KASSERT(toep->ulp_mode == ULP_MODE_ISCSI, 955 ("%s: ulp_mode %u for toep %p", __func__, toep->ulp_mode, toep)); 956 957 if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) 958 return; 959 960 /* 961 * This function doesn't resume by itself. Someone else must clear the 962 * flag and call this function. 
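 * (In practice that is do_fw4_ack, which clears TPF_TX_SUSPENDED once enough
 * tx credits have been returned and then calls back in here.)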
963 */ 964 if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) { 965 KASSERT(drop == 0, 966 ("%s: drop (%d) != 0 but tx is suspended", __func__, drop)); 967 return; 968 } 969 970 if (drop) 971 rqdrop_locked(&toep->ulp_pdu_reclaimq, drop); 972 973 while ((sndptr = mbufq_first(pduq)) != NULL) { 974 M_ASSERTPKTHDR(sndptr); 975 976 tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS); 977 max_imm = max_imm_payload(tx_credits); 978 max_nsegs = max_dsgl_nsegs(tx_credits); 979 980 plen = 0; 981 nsegs = 0; 982 max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */ 983 for (m = sndptr; m != NULL; m = m->m_next) { 984 int n = sglist_count(mtod(m, void *), m->m_len); 985 986 nsegs += n; 987 plen += m->m_len; 988 989 /* 990 * This mbuf would send us _over_ the nsegs limit. 991 * Suspend tx because the PDU can't be sent out. 992 */ 993 if (plen > max_imm && nsegs > max_nsegs) { 994 toep->flags |= TPF_TX_SUSPENDED; 995 return; 996 } 997 998 if (max_nsegs_1mbuf < n) 999 max_nsegs_1mbuf = n; 1000 } 1001 1002 if (__predict_false(toep->flags & TPF_FIN_SENT)) 1003 panic("%s: excess tx.", __func__); 1004 1005 /* 1006 * We have a PDU to send. All of it goes out in one WR so 'm' 1007 * is NULL. A PDU's length is always a multiple of 4. 1008 */ 1009 MPASS(m == NULL); 1010 MPASS((plen & 3) == 0); 1011 MPASS(sndptr->m_pkthdr.len == plen); 1012 1013 shove = !(tp->t_flags & TF_MORETOCOME); 1014 ulp_submode = mbuf_ulp_submode(sndptr); 1015 MPASS(ulp_submode < nitems(ulp_extra_len)); 1016 1017 /* 1018 * plen doesn't include header and data digests, which are 1019 * generated and inserted in the right places by the TOE, but 1020 * they do occupy TCP sequence space and need to be accounted 1021 * for. 1022 */ 1023 adjusted_plen = plen + ulp_extra_len[ulp_submode]; 1024 if (plen <= max_imm) { 1025 1026 /* Immediate data tx */ 1027 1028 wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16), 1029 toep->ofld_txq); 1030 if (wr == NULL) { 1031 /* XXX: how will we recover from this? */ 1032 toep->flags |= TPF_TX_SUSPENDED; 1033 return; 1034 } 1035 txwr = wrtod(wr); 1036 credits = howmany(wr->wr_len, 16); 1037 write_tx_wr(txwr, toep, plen, adjusted_plen, credits, 1038 shove, ulp_submode, sc->tt.tx_align); 1039 m_copydata(sndptr, 0, plen, (void *)(txwr + 1)); 1040 nsegs = 0; 1041 } else { 1042 int wr_len; 1043 1044 /* DSGL tx */ 1045 wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) + 1046 ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8; 1047 wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq); 1048 if (wr == NULL) { 1049 /* XXX: how will we recover from this? 
*/ 1050 toep->flags |= TPF_TX_SUSPENDED; 1051 return; 1052 } 1053 txwr = wrtod(wr); 1054 credits = howmany(wr_len, 16); 1055 write_tx_wr(txwr, toep, 0, adjusted_plen, credits, 1056 shove, ulp_submode, sc->tt.tx_align); 1057 write_tx_sgl(txwr + 1, sndptr, m, nsegs, 1058 max_nsegs_1mbuf); 1059 if (wr_len & 0xf) { 1060 uint64_t *pad = (uint64_t *) 1061 ((uintptr_t)txwr + wr_len); 1062 *pad = 0; 1063 } 1064 } 1065 1066 KASSERT(toep->tx_credits >= credits, 1067 ("%s: not enough credits", __func__)); 1068 1069 m = mbufq_dequeue(pduq); 1070 MPASS(m == sndptr); 1071 mbufq_enqueue(&toep->ulp_pdu_reclaimq, m); 1072 1073 toep->tx_credits -= credits; 1074 toep->tx_nocompl += credits; 1075 toep->plen_nocompl += plen; 1076 if (toep->tx_credits <= toep->tx_total * 3 / 8 && 1077 toep->tx_nocompl >= toep->tx_total / 4) { 1078 txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL); 1079 toep->tx_nocompl = 0; 1080 toep->plen_nocompl = 0; 1081 } 1082 1083 tp->snd_nxt += adjusted_plen; 1084 tp->snd_max += adjusted_plen; 1085 1086 toep->flags |= TPF_TX_DATA_SENT; 1087 if (toep->tx_credits < MIN_OFLD_TX_CREDITS) 1088 toep->flags |= TPF_TX_SUSPENDED; 1089 1090 KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); 1091 txsd->plen = plen; 1092 txsd->tx_credits = credits; 1093 txsd++; 1094 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) { 1095 toep->txsd_pidx = 0; 1096 txsd = &toep->txsd[0]; 1097 } 1098 toep->txsd_avail--; 1099 1100 t4_l2t_send(sc, wr, toep->l2te); 1101 } 1102 1103 /* Send a FIN if requested, but only if there are no more PDUs to send */ 1104 if (mbufq_first(pduq) == NULL && toep->flags & TPF_SEND_FIN) 1105 t4_close_conn(sc, toep); 1106 } 1107 1108 int 1109 t4_tod_output(struct toedev *tod, struct tcpcb *tp) 1110 { 1111 struct adapter *sc = tod->tod_softc; 1112 #ifdef INVARIANTS 1113 struct inpcb *inp = tp->t_inpcb; 1114 #endif 1115 struct toepcb *toep = tp->t_toe; 1116 1117 INP_WLOCK_ASSERT(inp); 1118 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1119 ("%s: inp %p dropped.", __func__, inp)); 1120 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 1121 1122 if (toep->ulp_mode == ULP_MODE_ISCSI) 1123 t4_push_pdus(sc, toep, 0); 1124 else if (tls_tx_key(toep)) 1125 t4_push_tls_records(sc, toep, 0); 1126 else 1127 t4_push_frames(sc, toep, 0); 1128 1129 return (0); 1130 } 1131 1132 int 1133 t4_send_fin(struct toedev *tod, struct tcpcb *tp) 1134 { 1135 struct adapter *sc = tod->tod_softc; 1136 #ifdef INVARIANTS 1137 struct inpcb *inp = tp->t_inpcb; 1138 #endif 1139 struct toepcb *toep = tp->t_toe; 1140 1141 INP_WLOCK_ASSERT(inp); 1142 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1143 ("%s: inp %p dropped.", __func__, inp)); 1144 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 1145 1146 toep->flags |= TPF_SEND_FIN; 1147 if (tp->t_state >= TCPS_ESTABLISHED) { 1148 if (toep->ulp_mode == ULP_MODE_ISCSI) 1149 t4_push_pdus(sc, toep, 0); 1150 else if (tls_tx_key(toep)) 1151 t4_push_tls_records(sc, toep, 0); 1152 else 1153 t4_push_frames(sc, toep, 0); 1154 } 1155 1156 return (0); 1157 } 1158 1159 int 1160 t4_send_rst(struct toedev *tod, struct tcpcb *tp) 1161 { 1162 struct adapter *sc = tod->tod_softc; 1163 #if defined(INVARIANTS) 1164 struct inpcb *inp = tp->t_inpcb; 1165 #endif 1166 struct toepcb *toep = tp->t_toe; 1167 1168 INP_WLOCK_ASSERT(inp); 1169 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1170 ("%s: inp %p dropped.", __func__, inp)); 1171 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 1172 1173 /* hmmmm */ 1174 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 1175 ("%s: flowc for tid %u 
[%s] not sent already", 1176 __func__, toep->tid, tcpstates[tp->t_state])); 1177 1178 send_reset(sc, toep, 0); 1179 return (0); 1180 } 1181 1182 /* 1183 * Peer has sent us a FIN. 1184 */ 1185 static int 1186 do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1187 { 1188 struct adapter *sc = iq->adapter; 1189 const struct cpl_peer_close *cpl = (const void *)(rss + 1); 1190 unsigned int tid = GET_TID(cpl); 1191 struct toepcb *toep = lookup_tid(sc, tid); 1192 struct inpcb *inp = toep->inp; 1193 struct tcpcb *tp = NULL; 1194 struct socket *so; 1195 struct epoch_tracker et; 1196 #ifdef INVARIANTS 1197 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1198 #endif 1199 1200 KASSERT(opcode == CPL_PEER_CLOSE, 1201 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1202 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1203 1204 if (__predict_false(toep->flags & TPF_SYNQE)) { 1205 /* 1206 * do_pass_establish must have run before do_peer_close and if 1207 * this is still a synqe instead of a toepcb then the connection 1208 * must be getting aborted. 1209 */ 1210 MPASS(toep->flags & TPF_ABORT_SHUTDOWN); 1211 CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid, 1212 toep, toep->flags); 1213 return (0); 1214 } 1215 1216 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1217 1218 CURVNET_SET(toep->vnet); 1219 INP_INFO_RLOCK_ET(&V_tcbinfo, et); 1220 INP_WLOCK(inp); 1221 tp = intotcpcb(inp); 1222 1223 CTR6(KTR_CXGBE, 1224 "%s: tid %u (%s), toep_flags 0x%x, ddp_flags 0x%x, inp %p", 1225 __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags, 1226 toep->ddp.flags, inp); 1227 1228 if (toep->flags & TPF_ABORT_SHUTDOWN) 1229 goto done; 1230 1231 tp->rcv_nxt++; /* FIN */ 1232 1233 so = inp->inp_socket; 1234 socantrcvmore(so); 1235 if (toep->ulp_mode == ULP_MODE_TCPDDP) { 1236 DDP_LOCK(toep); 1237 if (__predict_false(toep->ddp.flags & 1238 (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE))) 1239 handle_ddp_close(toep, tp, cpl->rcv_nxt); 1240 DDP_UNLOCK(toep); 1241 } 1242 1243 if (toep->ulp_mode != ULP_MODE_RDMA) { 1244 KASSERT(tp->rcv_nxt == be32toh(cpl->rcv_nxt), 1245 ("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt, 1246 be32toh(cpl->rcv_nxt))); 1247 } 1248 1249 switch (tp->t_state) { 1250 case TCPS_SYN_RECEIVED: 1251 tp->t_starttime = ticks; 1252 /* FALLTHROUGH */ 1253 1254 case TCPS_ESTABLISHED: 1255 tcp_state_change(tp, TCPS_CLOSE_WAIT); 1256 break; 1257 1258 case TCPS_FIN_WAIT_1: 1259 tcp_state_change(tp, TCPS_CLOSING); 1260 break; 1261 1262 case TCPS_FIN_WAIT_2: 1263 tcp_twstart(tp); 1264 INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ 1265 INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); 1266 CURVNET_RESTORE(); 1267 1268 INP_WLOCK(inp); 1269 final_cpl_received(toep); 1270 return (0); 1271 1272 default: 1273 log(LOG_ERR, "%s: TID %u received CPL_PEER_CLOSE in state %d\n", 1274 __func__, tid, tp->t_state); 1275 } 1276 done: 1277 INP_WUNLOCK(inp); 1278 INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); 1279 CURVNET_RESTORE(); 1280 return (0); 1281 } 1282 1283 /* 1284 * Peer has ACK'd our FIN. 
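 * (The reply carries the connection's final snd_nxt; snd_una is set to
 * snd_nxt - 1 below so that the FIN itself is excluded.)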
1285 */ 1286 static int 1287 do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss, 1288 struct mbuf *m) 1289 { 1290 struct adapter *sc = iq->adapter; 1291 const struct cpl_close_con_rpl *cpl = (const void *)(rss + 1); 1292 unsigned int tid = GET_TID(cpl); 1293 struct toepcb *toep = lookup_tid(sc, tid); 1294 struct inpcb *inp = toep->inp; 1295 struct tcpcb *tp = NULL; 1296 struct socket *so = NULL; 1297 struct epoch_tracker et; 1298 #ifdef INVARIANTS 1299 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1300 #endif 1301 1302 KASSERT(opcode == CPL_CLOSE_CON_RPL, 1303 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1304 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1305 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1306 1307 CURVNET_SET(toep->vnet); 1308 INP_INFO_RLOCK_ET(&V_tcbinfo, et); 1309 INP_WLOCK(inp); 1310 tp = intotcpcb(inp); 1311 1312 CTR4(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x", 1313 __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags); 1314 1315 if (toep->flags & TPF_ABORT_SHUTDOWN) 1316 goto done; 1317 1318 so = inp->inp_socket; 1319 tp->snd_una = be32toh(cpl->snd_nxt) - 1; /* exclude FIN */ 1320 1321 switch (tp->t_state) { 1322 case TCPS_CLOSING: /* see TCPS_FIN_WAIT_2 in do_peer_close too */ 1323 tcp_twstart(tp); 1324 release: 1325 INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ 1326 INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); 1327 CURVNET_RESTORE(); 1328 1329 INP_WLOCK(inp); 1330 final_cpl_received(toep); /* no more CPLs expected */ 1331 1332 return (0); 1333 case TCPS_LAST_ACK: 1334 if (tcp_close(tp)) 1335 INP_WUNLOCK(inp); 1336 goto release; 1337 1338 case TCPS_FIN_WAIT_1: 1339 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 1340 soisdisconnected(so); 1341 tcp_state_change(tp, TCPS_FIN_WAIT_2); 1342 break; 1343 1344 default: 1345 log(LOG_ERR, 1346 "%s: TID %u received CPL_CLOSE_CON_RPL in state %s\n", 1347 __func__, tid, tcpstates[tp->t_state]); 1348 } 1349 done: 1350 INP_WUNLOCK(inp); 1351 INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); 1352 CURVNET_RESTORE(); 1353 return (0); 1354 } 1355 1356 void 1357 send_abort_rpl(struct adapter *sc, struct sge_wrq *ofld_txq, int tid, 1358 int rst_status) 1359 { 1360 struct wrqe *wr; 1361 struct cpl_abort_rpl *cpl; 1362 1363 wr = alloc_wrqe(sizeof(*cpl), ofld_txq); 1364 if (wr == NULL) { 1365 /* XXX */ 1366 panic("%s: allocation failure.", __func__); 1367 } 1368 cpl = wrtod(wr); 1369 1370 INIT_TP_WR_MIT_CPL(cpl, CPL_ABORT_RPL, tid); 1371 cpl->cmd = rst_status; 1372 1373 t4_wrq_tx(sc, wr); 1374 } 1375 1376 static int 1377 abort_status_to_errno(struct tcpcb *tp, unsigned int abort_reason) 1378 { 1379 switch (abort_reason) { 1380 case CPL_ERR_BAD_SYN: 1381 case CPL_ERR_CONN_RESET: 1382 return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET); 1383 case CPL_ERR_XMIT_TIMEDOUT: 1384 case CPL_ERR_PERSIST_TIMEDOUT: 1385 case CPL_ERR_FINWAIT2_TIMEDOUT: 1386 case CPL_ERR_KEEPALIVE_TIMEDOUT: 1387 return (ETIMEDOUT); 1388 default: 1389 return (EIO); 1390 } 1391 } 1392 1393 /* 1394 * TCP RST from the peer, timeout, or some other such critical error. 
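 * (Whatever else happens here, the T4 is owed a CPL_ABORT_RPL; it is sent
 * via send_abort_rpl at the end of the handler.)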
1395 */ 1396 static int 1397 do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1398 { 1399 struct adapter *sc = iq->adapter; 1400 const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1); 1401 unsigned int tid = GET_TID(cpl); 1402 struct toepcb *toep = lookup_tid(sc, tid); 1403 struct sge_wrq *ofld_txq = toep->ofld_txq; 1404 struct inpcb *inp; 1405 struct tcpcb *tp; 1406 struct epoch_tracker et; 1407 #ifdef INVARIANTS 1408 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1409 #endif 1410 1411 KASSERT(opcode == CPL_ABORT_REQ_RSS, 1412 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1413 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1414 1415 if (toep->flags & TPF_SYNQE) 1416 return (do_abort_req_synqe(iq, rss, m)); 1417 1418 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1419 1420 if (negative_advice(cpl->status)) { 1421 CTR4(KTR_CXGBE, "%s: negative advice %d for tid %d (0x%x)", 1422 __func__, cpl->status, tid, toep->flags); 1423 return (0); /* Ignore negative advice */ 1424 } 1425 1426 inp = toep->inp; 1427 CURVNET_SET(toep->vnet); 1428 INP_INFO_RLOCK_ET(&V_tcbinfo, et); /* for tcp_close */ 1429 INP_WLOCK(inp); 1430 1431 tp = intotcpcb(inp); 1432 1433 CTR6(KTR_CXGBE, 1434 "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x, status %d", 1435 __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags, 1436 inp->inp_flags, cpl->status); 1437 1438 /* 1439 * If we'd initiated an abort earlier the reply to it is responsible for 1440 * cleaning up resources. Otherwise we tear everything down right here 1441 * right now. We owe the T4 a CPL_ABORT_RPL no matter what. 1442 */ 1443 if (toep->flags & TPF_ABORT_SHUTDOWN) { 1444 INP_WUNLOCK(inp); 1445 goto done; 1446 } 1447 toep->flags |= TPF_ABORT_SHUTDOWN; 1448 1449 if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) { 1450 struct socket *so = inp->inp_socket; 1451 1452 if (so != NULL) 1453 so_error_set(so, abort_status_to_errno(tp, 1454 cpl->status)); 1455 tp = tcp_close(tp); 1456 if (tp == NULL) 1457 INP_WLOCK(inp); /* re-acquire */ 1458 } 1459 1460 final_cpl_received(toep); 1461 done: 1462 INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); 1463 CURVNET_RESTORE(); 1464 send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST); 1465 return (0); 1466 } 1467 1468 /* 1469 * Reply to the CPL_ABORT_REQ (send_reset) 1470 */ 1471 static int 1472 do_abort_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1473 { 1474 struct adapter *sc = iq->adapter; 1475 const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1); 1476 unsigned int tid = GET_TID(cpl); 1477 struct toepcb *toep = lookup_tid(sc, tid); 1478 struct inpcb *inp = toep->inp; 1479 #ifdef INVARIANTS 1480 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1481 #endif 1482 1483 KASSERT(opcode == CPL_ABORT_RPL_RSS, 1484 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1485 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1486 1487 if (toep->flags & TPF_SYNQE) 1488 return (do_abort_rpl_synqe(iq, rss, m)); 1489 1490 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1491 1492 CTR5(KTR_CXGBE, "%s: tid %u, toep %p, inp %p, status %d", 1493 __func__, tid, toep, inp, cpl->status); 1494 1495 KASSERT(toep->flags & TPF_ABORT_SHUTDOWN, 1496 ("%s: wasn't expecting abort reply", __func__)); 1497 1498 INP_WLOCK(inp); 1499 final_cpl_received(toep); 1500 1501 return (0); 1502 } 1503 1504 static int 1505 do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1506 
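/*
 * Payload delivered via CPL_RX_DATA (i.e. not placed directly by DDP) is
 * appended to the socket's receive buffer.
 */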
{ 1507 struct adapter *sc = iq->adapter; 1508 const struct cpl_rx_data *cpl = mtod(m, const void *); 1509 unsigned int tid = GET_TID(cpl); 1510 struct toepcb *toep = lookup_tid(sc, tid); 1511 struct inpcb *inp = toep->inp; 1512 struct tcpcb *tp; 1513 struct socket *so; 1514 struct sockbuf *sb; 1515 struct epoch_tracker et; 1516 int len, rx_credits; 1517 uint32_t ddp_placed = 0; 1518 1519 if (__predict_false(toep->flags & TPF_SYNQE)) { 1520 /* 1521 * do_pass_establish must have run before do_rx_data and if this 1522 * is still a synqe instead of a toepcb then the connection must 1523 * be getting aborted. 1524 */ 1525 MPASS(toep->flags & TPF_ABORT_SHUTDOWN); 1526 CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid, 1527 toep, toep->flags); 1528 m_freem(m); 1529 return (0); 1530 } 1531 1532 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1533 1534 /* strip off CPL header */ 1535 m_adj(m, sizeof(*cpl)); 1536 len = m->m_pkthdr.len; 1537 1538 INP_WLOCK(inp); 1539 if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) { 1540 CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x", 1541 __func__, tid, len, inp->inp_flags); 1542 INP_WUNLOCK(inp); 1543 m_freem(m); 1544 return (0); 1545 } 1546 1547 tp = intotcpcb(inp); 1548 1549 if (__predict_false(tp->rcv_nxt != be32toh(cpl->seq))) 1550 ddp_placed = be32toh(cpl->seq) - tp->rcv_nxt; 1551 1552 tp->rcv_nxt += len; 1553 if (tp->rcv_wnd < len) { 1554 KASSERT(toep->ulp_mode == ULP_MODE_RDMA, 1555 ("%s: negative window size", __func__)); 1556 } 1557 1558 tp->rcv_wnd -= len; 1559 tp->t_rcvtime = ticks; 1560 1561 if (toep->ulp_mode == ULP_MODE_TCPDDP) 1562 DDP_LOCK(toep); 1563 so = inp_inpcbtosocket(inp); 1564 sb = &so->so_rcv; 1565 SOCKBUF_LOCK(sb); 1566 1567 if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) { 1568 CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)", 1569 __func__, tid, len); 1570 m_freem(m); 1571 SOCKBUF_UNLOCK(sb); 1572 if (toep->ulp_mode == ULP_MODE_TCPDDP) 1573 DDP_UNLOCK(toep); 1574 INP_WUNLOCK(inp); 1575 1576 CURVNET_SET(toep->vnet); 1577 INP_INFO_RLOCK_ET(&V_tcbinfo, et); 1578 INP_WLOCK(inp); 1579 tp = tcp_drop(tp, ECONNRESET); 1580 if (tp) 1581 INP_WUNLOCK(inp); 1582 INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); 1583 CURVNET_RESTORE(); 1584 1585 return (0); 1586 } 1587 1588 /* receive buffer autosize */ 1589 MPASS(toep->vnet == so->so_vnet); 1590 CURVNET_SET(toep->vnet); 1591 if (sb->sb_flags & SB_AUTOSIZE && 1592 V_tcp_do_autorcvbuf && 1593 sb->sb_hiwat < V_tcp_autorcvbuf_max && 1594 len > (sbspace(sb) / 8 * 7)) { 1595 unsigned int hiwat = sb->sb_hiwat; 1596 unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc, 1597 V_tcp_autorcvbuf_max); 1598 1599 if (!sbreserve_locked(sb, newsize, so, NULL)) 1600 sb->sb_flags &= ~SB_AUTOSIZE; 1601 } 1602 1603 if (toep->ulp_mode == ULP_MODE_TCPDDP) { 1604 int changed = !(toep->ddp.flags & DDP_ON) ^ cpl->ddp_off; 1605 1606 if (toep->ddp.waiting_count != 0 || toep->ddp.active_count != 0) 1607 CTR3(KTR_CXGBE, "%s: tid %u, non-ddp rx (%d bytes)", 1608 __func__, tid, len); 1609 1610 if (changed) { 1611 if (toep->ddp.flags & DDP_SC_REQ) 1612 toep->ddp.flags ^= DDP_ON | DDP_SC_REQ; 1613 else { 1614 KASSERT(cpl->ddp_off == 1, 1615 ("%s: DDP switched on by itself.", 1616 __func__)); 1617 1618 /* Fell out of DDP mode */ 1619 toep->ddp.flags &= ~DDP_ON; 1620 CTR1(KTR_CXGBE, "%s: fell out of DDP mode", 1621 __func__); 1622 1623 insert_ddp_data(toep, ddp_placed); 1624 } 1625 } 1626 1627 if (toep->ddp.flags & DDP_ON) { 1628 /* 1629 * CPL_RX_DATA with DDP on can only be an indicate. 
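 * (An indicate is a small piece of payload delivered up the normal path to
 * tell the host that data has started arriving for this tid.)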
1630 * Start posting queued AIO requests via DDP. The 1631 * payload that arrived in this indicate is appended 1632 * to the socket buffer as usual. 1633 */ 1634 handle_ddp_indicate(toep); 1635 } 1636 } 1637 1638 sbappendstream_locked(sb, m, 0); 1639 rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0; 1640 if (rx_credits > 0 && sbused(sb) + tp->rcv_wnd < sb->sb_lowat) { 1641 rx_credits = send_rx_credits(sc, toep, rx_credits); 1642 tp->rcv_wnd += rx_credits; 1643 tp->rcv_adv += rx_credits; 1644 } 1645 1646 if (toep->ulp_mode == ULP_MODE_TCPDDP && toep->ddp.waiting_count > 0 && 1647 sbavail(sb) != 0) { 1648 CTR2(KTR_CXGBE, "%s: tid %u queueing AIO task", __func__, 1649 tid); 1650 ddp_queue_toep(toep); 1651 } 1652 sorwakeup_locked(so); 1653 SOCKBUF_UNLOCK_ASSERT(sb); 1654 if (toep->ulp_mode == ULP_MODE_TCPDDP) 1655 DDP_UNLOCK(toep); 1656 1657 INP_WUNLOCK(inp); 1658 CURVNET_RESTORE(); 1659 return (0); 1660 } 1661 1662 static int 1663 do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1664 { 1665 struct adapter *sc = iq->adapter; 1666 const struct cpl_fw4_ack *cpl = (const void *)(rss + 1); 1667 unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl))); 1668 struct toepcb *toep = lookup_tid(sc, tid); 1669 struct inpcb *inp; 1670 struct tcpcb *tp; 1671 struct socket *so; 1672 uint8_t credits = cpl->credits; 1673 struct ofld_tx_sdesc *txsd; 1674 int plen; 1675 #ifdef INVARIANTS 1676 unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl))); 1677 #endif 1678 1679 /* 1680 * Very unusual case: we'd sent a flowc + abort_req for a synq entry and 1681 * now this comes back carrying the credits for the flowc. 1682 */ 1683 if (__predict_false(toep->flags & TPF_SYNQE)) { 1684 KASSERT(toep->flags & TPF_ABORT_SHUTDOWN, 1685 ("%s: credits for a synq entry %p", __func__, toep)); 1686 return (0); 1687 } 1688 1689 inp = toep->inp; 1690 1691 KASSERT(opcode == CPL_FW4_ACK, 1692 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1693 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1694 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1695 1696 INP_WLOCK(inp); 1697 1698 if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) { 1699 INP_WUNLOCK(inp); 1700 return (0); 1701 } 1702 1703 KASSERT((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0, 1704 ("%s: inp_flags 0x%x", __func__, inp->inp_flags)); 1705 1706 tp = intotcpcb(inp); 1707 1708 if (cpl->flags & CPL_FW4_ACK_FLAGS_SEQVAL) { 1709 tcp_seq snd_una = be32toh(cpl->snd_una); 1710 1711 #ifdef INVARIANTS 1712 if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) { 1713 log(LOG_ERR, 1714 "%s: unexpected seq# %x for TID %u, snd_una %x\n", 1715 __func__, snd_una, toep->tid, tp->snd_una); 1716 } 1717 #endif 1718 1719 if (tp->snd_una != snd_una) { 1720 tp->snd_una = snd_una; 1721 tp->ts_recent_age = tcp_ts_getticks(); 1722 } 1723 } 1724 1725 #ifdef VERBOSE_TRACES 1726 CTR3(KTR_CXGBE, "%s: tid %d credits %u", __func__, tid, credits); 1727 #endif 1728 so = inp->inp_socket; 1729 txsd = &toep->txsd[toep->txsd_cidx]; 1730 plen = 0; 1731 while (credits) { 1732 KASSERT(credits >= txsd->tx_credits, 1733 ("%s: too many (or partial) credits", __func__)); 1734 credits -= txsd->tx_credits; 1735 toep->tx_credits += txsd->tx_credits; 1736 plen += txsd->plen; 1737 if (txsd->iv_buffer) { 1738 free(txsd->iv_buffer, M_CXGBE); 1739 txsd->iv_buffer = NULL; 1740 } 1741 txsd++; 1742 toep->txsd_avail++; 1743 KASSERT(toep->txsd_avail <= toep->txsd_total, 1744 ("%s: txsd avail > total", __func__)); 1745 
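		/* Wrap txsd and the consumer index at the end of the ring. */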
if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) { 1746 txsd = &toep->txsd[0]; 1747 toep->txsd_cidx = 0; 1748 } 1749 } 1750 1751 if (toep->tx_credits == toep->tx_total) { 1752 toep->tx_nocompl = 0; 1753 toep->plen_nocompl = 0; 1754 } 1755 1756 if (toep->flags & TPF_TX_SUSPENDED && 1757 toep->tx_credits >= toep->tx_total / 4) { 1758 #ifdef VERBOSE_TRACES 1759 CTR2(KTR_CXGBE, "%s: tid %d calling t4_push_frames", __func__, 1760 tid); 1761 #endif 1762 toep->flags &= ~TPF_TX_SUSPENDED; 1763 CURVNET_SET(toep->vnet); 1764 if (toep->ulp_mode == ULP_MODE_ISCSI) 1765 t4_push_pdus(sc, toep, plen); 1766 else if (tls_tx_key(toep)) 1767 t4_push_tls_records(sc, toep, plen); 1768 else 1769 t4_push_frames(sc, toep, plen); 1770 CURVNET_RESTORE(); 1771 } else if (plen > 0) { 1772 struct sockbuf *sb = &so->so_snd; 1773 int sbu; 1774 1775 SOCKBUF_LOCK(sb); 1776 sbu = sbused(sb); 1777 if (toep->ulp_mode == ULP_MODE_ISCSI) { 1778 1779 if (__predict_false(sbu > 0)) { 1780 /* 1781 * The data trasmitted before the tid's ULP mode 1782 * changed to ISCSI is still in so_snd. 1783 * Incoming credits should account for so_snd 1784 * first. 1785 */ 1786 sbdrop_locked(sb, min(sbu, plen)); 1787 plen -= min(sbu, plen); 1788 } 1789 sowwakeup_locked(so); /* unlocks so_snd */ 1790 rqdrop_locked(&toep->ulp_pdu_reclaimq, plen); 1791 } else { 1792 #ifdef VERBOSE_TRACES 1793 CTR3(KTR_CXGBE, "%s: tid %d dropped %d bytes", __func__, 1794 tid, plen); 1795 #endif 1796 sbdrop_locked(sb, plen); 1797 if (tls_tx_key(toep)) { 1798 struct tls_ofld_info *tls_ofld = &toep->tls; 1799 1800 MPASS(tls_ofld->sb_off >= plen); 1801 tls_ofld->sb_off -= plen; 1802 } 1803 if (!TAILQ_EMPTY(&toep->aiotx_jobq)) 1804 t4_aiotx_queue_toep(so, toep); 1805 sowwakeup_locked(so); /* unlocks so_snd */ 1806 } 1807 SOCKBUF_UNLOCK_ASSERT(sb); 1808 } 1809 1810 INP_WUNLOCK(inp); 1811 1812 return (0); 1813 } 1814 1815 void 1816 t4_set_tcb_field(struct adapter *sc, struct sge_wrq *wrq, struct toepcb *toep, 1817 uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie) 1818 { 1819 struct wrqe *wr; 1820 struct cpl_set_tcb_field *req; 1821 struct ofld_tx_sdesc *txsd; 1822 1823 MPASS((cookie & ~M_COOKIE) == 0); 1824 if (reply) { 1825 MPASS(cookie != CPL_COOKIE_RESERVED); 1826 } 1827 1828 wr = alloc_wrqe(sizeof(*req), wrq); 1829 if (wr == NULL) { 1830 /* XXX */ 1831 panic("%s: allocation failure.", __func__); 1832 } 1833 req = wrtod(wr); 1834 1835 INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, toep->tid); 1836 req->reply_ctrl = htobe16(V_QUEUENO(toep->ofld_rxq->iq.abs_id)); 1837 if (reply == 0) 1838 req->reply_ctrl |= htobe16(F_NO_REPLY); 1839 req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(cookie)); 1840 req->mask = htobe64(mask); 1841 req->val = htobe64(val); 1842 if ((wrq->eq.flags & EQ_TYPEMASK) == EQ_OFLD) { 1843 txsd = &toep->txsd[toep->txsd_pidx]; 1844 txsd->tx_credits = howmany(sizeof(*req), 16); 1845 txsd->plen = 0; 1846 KASSERT(toep->tx_credits >= txsd->tx_credits && 1847 toep->txsd_avail > 0, 1848 ("%s: not enough credits (%d)", __func__, 1849 toep->tx_credits)); 1850 toep->tx_credits -= txsd->tx_credits; 1851 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) 1852 toep->txsd_pidx = 0; 1853 toep->txsd_avail--; 1854 } 1855 1856 t4_wrq_tx(sc, wr); 1857 } 1858 1859 void 1860 t4_init_cpl_io_handlers(void) 1861 { 1862 1863 t4_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close); 1864 t4_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl); 1865 t4_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req); 1866 
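	/*
	 * CPL_ABORT_RPL_RSS and CPL_FW4_ACK are not exclusive to TOM, so they
	 * are registered as shared handlers dispatched by cookie
	 * (CPL_COOKIE_TOM for the handlers in this file).
	 */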
t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl, 1867 CPL_COOKIE_TOM); 1868 t4_register_cpl_handler(CPL_RX_DATA, do_rx_data); 1869 t4_register_shared_cpl_handler(CPL_FW4_ACK, do_fw4_ack, CPL_COOKIE_TOM); 1870 } 1871 1872 void 1873 t4_uninit_cpl_io_handlers(void) 1874 { 1875 1876 t4_register_cpl_handler(CPL_PEER_CLOSE, NULL); 1877 t4_register_cpl_handler(CPL_CLOSE_CON_RPL, NULL); 1878 t4_register_cpl_handler(CPL_ABORT_REQ_RSS, NULL); 1879 t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, NULL, CPL_COOKIE_TOM); 1880 t4_register_cpl_handler(CPL_RX_DATA, NULL); 1881 t4_register_shared_cpl_handler(CPL_FW4_ACK, NULL, CPL_COOKIE_TOM); 1882 } 1883 1884 /* 1885 * Use the 'backend1' field in AIO jobs to hold an error that should 1886 * be reported when the job is completed, the 'backend3' field to 1887 * store the amount of data sent by the AIO job so far, and the 1888 * 'backend4' field to hold a reference count on the job. 1889 * 1890 * Each unmapped mbuf holds a reference on the job as does the queue 1891 * so long as the job is queued. 1892 */ 1893 #define aio_error backend1 1894 #define aio_sent backend3 1895 #define aio_refs backend4 1896 1897 #define jobtotid(job) \ 1898 (((struct toepcb *)(so_sototcpcb((job)->fd_file->f_data)->t_toe))->tid) 1899 1900 static void 1901 aiotx_free_job(struct kaiocb *job) 1902 { 1903 long status; 1904 int error; 1905 1906 if (refcount_release(&job->aio_refs) == 0) 1907 return; 1908 1909 error = (intptr_t)job->aio_error; 1910 status = job->aio_sent; 1911 #ifdef VERBOSE_TRACES 1912 CTR5(KTR_CXGBE, "%s: tid %d completed %p len %ld, error %d", __func__, 1913 jobtotid(job), job, status, error); 1914 #endif 1915 if (error != 0 && status != 0) 1916 error = 0; 1917 if (error == ECANCELED) 1918 aio_cancel(job); 1919 else if (error) 1920 aio_complete(job, -1, error); 1921 else { 1922 job->msgsnd = 1; 1923 aio_complete(job, status, 0); 1924 } 1925 } 1926 1927 static void 1928 aiotx_free_pgs(struct mbuf *m) 1929 { 1930 struct mbuf_ext_pgs *ext_pgs; 1931 struct kaiocb *job; 1932 struct mtx *mtx; 1933 vm_page_t pg; 1934 1935 MBUF_EXT_PGS_ASSERT(m); 1936 ext_pgs = m->m_ext.ext_pgs; 1937 job = m->m_ext.ext_arg1; 1938 #ifdef VERBOSE_TRACES 1939 CTR3(KTR_CXGBE, "%s: completed %d bytes for tid %d", __func__, 1940 m->m_len, jobtotid(job)); 1941 #endif 1942 1943 mtx = NULL; 1944 for (int i = 0; i < ext_pgs->npgs; i++) { 1945 pg = PHYS_TO_VM_PAGE(ext_pgs->pa[i]); 1946 vm_page_change_lock(pg, &mtx); 1947 vm_page_unwire(pg, PQ_ACTIVE); 1948 } 1949 if (mtx != NULL) 1950 mtx_unlock(mtx); 1951 1952 aiotx_free_job(job); 1953 } 1954 1955 /* 1956 * Allocate a chain of unmapped mbufs describing the next 'len' bytes 1957 * of an AIO job. 1958 */ 1959 static struct mbuf * 1960 alloc_aiotx_mbuf(struct kaiocb *job, int len) 1961 { 1962 struct vmspace *vm; 1963 vm_page_t pgs[MBUF_PEXT_MAX_PGS]; 1964 struct mbuf *m, *top, *last; 1965 struct mbuf_ext_pgs *ext_pgs; 1966 vm_map_t map; 1967 vm_offset_t start; 1968 int i, mlen, npages, pgoff; 1969 1970 KASSERT(job->aio_sent + len <= job->uaiocb.aio_nbytes, 1971 ("%s(%p, %d): request to send beyond end of buffer", __func__, 1972 job, len)); 1973 1974 /* 1975 * The AIO subsystem will cancel and drain all requests before 1976 * permitting a process to exit or exec, so p_vmspace should 1977 * be stable here. 
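 * (Each chunk of the buffer is wired with vm_fault_quick_hold_pages() and
 * hung off an unmapped mbuf; aiotx_free_pgs() unwires the pages when that
 * mbuf is freed and drops its reference on the job.)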
static void
aiotx_free_pgs(struct mbuf *m)
{
	struct mbuf_ext_pgs *ext_pgs;
	struct kaiocb *job;
	struct mtx *mtx;
	vm_page_t pg;

	MBUF_EXT_PGS_ASSERT(m);
	ext_pgs = m->m_ext.ext_pgs;
	job = m->m_ext.ext_arg1;
#ifdef VERBOSE_TRACES
	CTR3(KTR_CXGBE, "%s: completed %d bytes for tid %d", __func__,
	    m->m_len, jobtotid(job));
#endif

	mtx = NULL;
	for (int i = 0; i < ext_pgs->npgs; i++) {
		pg = PHYS_TO_VM_PAGE(ext_pgs->pa[i]);
		vm_page_change_lock(pg, &mtx);
		vm_page_unwire(pg, PQ_ACTIVE);
	}
	if (mtx != NULL)
		mtx_unlock(mtx);

	aiotx_free_job(job);
}

/*
 * Allocate a chain of unmapped mbufs describing the next 'len' bytes
 * of an AIO job.
 */
static struct mbuf *
alloc_aiotx_mbuf(struct kaiocb *job, int len)
{
	struct vmspace *vm;
	vm_page_t pgs[MBUF_PEXT_MAX_PGS];
	struct mbuf *m, *top, *last;
	struct mbuf_ext_pgs *ext_pgs;
	vm_map_t map;
	vm_offset_t start;
	int i, mlen, npages, pgoff;

	KASSERT(job->aio_sent + len <= job->uaiocb.aio_nbytes,
	    ("%s(%p, %d): request to send beyond end of buffer", __func__,
	    job, len));

	/*
	 * The AIO subsystem will cancel and drain all requests before
	 * permitting a process to exit or exec, so p_vmspace should
	 * be stable here.
	 */
	vm = job->userproc->p_vmspace;
	map = &vm->vm_map;
	start = (uintptr_t)job->uaiocb.aio_buf + job->aio_sent;
	pgoff = start & PAGE_MASK;

	top = NULL;
	last = NULL;
	while (len > 0) {
		mlen = imin(len, MBUF_PEXT_MAX_PGS * PAGE_SIZE - pgoff);
		KASSERT(mlen == len || ((start + mlen) & PAGE_MASK) == 0,
		    ("%s: next start (%#jx + %#x) is not page aligned",
		    __func__, (uintmax_t)start, mlen));

		npages = vm_fault_quick_hold_pages(map, start, mlen,
		    VM_PROT_WRITE, pgs, nitems(pgs));
		if (npages < 0)
			break;

		m = mb_alloc_ext_pgs(M_WAITOK, false, aiotx_free_pgs);
		if (m == NULL) {
			vm_page_unhold_pages(pgs, npages);
			break;
		}

		ext_pgs = m->m_ext.ext_pgs;
		ext_pgs->first_pg_off = pgoff;
		ext_pgs->npgs = npages;
		if (npages == 1) {
			KASSERT(mlen + pgoff <= PAGE_SIZE,
			    ("%s: single page is too large (off %d len %d)",
			    __func__, pgoff, mlen));
			ext_pgs->last_pg_len = mlen;
		} else {
			ext_pgs->last_pg_len = mlen - (PAGE_SIZE - pgoff) -
			    (npages - 2) * PAGE_SIZE;
		}
		for (i = 0; i < npages; i++)
			ext_pgs->pa[i] = VM_PAGE_TO_PHYS(pgs[i]);

		m->m_len = mlen;
		m->m_ext.ext_size = npages * PAGE_SIZE;
		m->m_ext.ext_arg1 = job;
		refcount_acquire(&job->aio_refs);

#ifdef VERBOSE_TRACES
		CTR5(KTR_CXGBE, "%s: tid %d, new mbuf %p for job %p, npages %d",
		    __func__, jobtotid(job), m, job, npages);
#endif

		if (top == NULL)
			top = m;
		else
			last->m_next = m;
		last = m;

		len -= mlen;
		start += mlen;
		pgoff = 0;
	}

	return (top);
}
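/*
 * Try to transmit (part of) an AIO write job on a writable socket.
 * This open-codes the relevant pieces of sosend_generic() and
 * tcp_usr_send() (see the inline comments below) so the user buffer is
 * wired and appended to so_snd as unmapped mbufs instead of being
 * copied.  If the job cannot make progress it is put back at the head
 * of the aiotx queue with t4_aiotx_cancel() installed as its cancel
 * routine; errors are recorded in the job and reported when its last
 * reference goes away.
 */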
static void
t4_aiotx_process_job(struct toepcb *toep, struct socket *so, struct kaiocb *job)
{
	struct adapter *sc;
	struct sockbuf *sb;
	struct file *fp;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct mbuf *m;
	int error, len;
	bool moretocome, sendmore;

	sc = td_adapter(toep->td);
	sb = &so->so_snd;
	SOCKBUF_UNLOCK(sb);
	fp = job->fd_file;
	m = NULL;

#ifdef MAC
	error = mac_socket_check_send(fp->f_cred, so);
	if (error != 0)
		goto out;
#endif

	/* Inline sosend_generic(). */

	error = sblock(sb, SBL_WAIT);
	MPASS(error == 0);

sendanother:
	SOCKBUF_LOCK(sb);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		if ((so->so_options & SO_NOSIGPIPE) == 0) {
			PROC_LOCK(job->userproc);
			kern_psignal(job->userproc, SIGPIPE);
			PROC_UNLOCK(job->userproc);
		}
		error = EPIPE;
		goto out;
	}
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		goto out;
	}
	if ((so->so_state & SS_ISCONNECTED) == 0) {
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		error = ENOTCONN;
		goto out;
	}
	if (sbspace(sb) < sb->sb_lowat) {
		MPASS(job->aio_sent == 0 || !(so->so_state & SS_NBIO));

		/*
		 * Don't block if there is too little room in the socket
		 * buffer.  Instead, requeue the request.
		 */
		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
			SOCKBUF_UNLOCK(sb);
			sbunlock(sb);
			error = ECANCELED;
			goto out;
		}
		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		goto out;
	}

	/*
	 * Write as much data as the socket permits, but no more than a
	 * single sndbuf at a time.
	 */
	len = sbspace(sb);
	if (len > job->uaiocb.aio_nbytes - job->aio_sent) {
		len = job->uaiocb.aio_nbytes - job->aio_sent;
		moretocome = false;
	} else
		moretocome = true;
	if (len > sc->tt.sndbuf) {
		len = sc->tt.sndbuf;
		sendmore = true;
	} else
		sendmore = false;

	if (!TAILQ_EMPTY(&toep->aiotx_jobq))
		moretocome = true;
	SOCKBUF_UNLOCK(sb);
	MPASS(len != 0);

	m = alloc_aiotx_mbuf(job, len);
	if (m == NULL) {
		sbunlock(sb);
		error = EFAULT;
		goto out;
	}

	/* Inlined tcp_usr_send(). */

	inp = toep->inp;
	INP_WLOCK(inp);
	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
		INP_WUNLOCK(inp);
		sbunlock(sb);
		error = ECONNRESET;
		goto out;
	}

	job->aio_sent += m_length(m, NULL);

	sbappendstream(sb, m, 0);
	m = NULL;

	if (!(inp->inp_flags & INP_DROPPED)) {
		tp = intotcpcb(inp);
		if (moretocome)
			tp->t_flags |= TF_MORETOCOME;
		error = tp->t_fb->tfb_tcp_output(tp);
		if (moretocome)
			tp->t_flags &= ~TF_MORETOCOME;
	}

	INP_WUNLOCK(inp);
	if (sendmore)
		goto sendanother;
	sbunlock(sb);

	if (error)
		goto out;

	/*
	 * If this is a blocking socket and the request has not been
	 * fully completed, requeue it until the socket is ready
	 * again.
	 */
	if (job->aio_sent < job->uaiocb.aio_nbytes &&
	    !(so->so_state & SS_NBIO)) {
		SOCKBUF_LOCK(sb);
		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
			SOCKBUF_UNLOCK(sb);
			error = ECANCELED;
			goto out;
		}
		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
		return;
	}

	/*
	 * If the request will not be requeued, drop the queue's
	 * reference to the job.  Any mbufs in flight should still
	 * hold a reference, but this drops the reference that the
	 * queue owns while it is waiting to queue mbufs to the
	 * socket.
	 */
	aiotx_free_job(job);

out:
	if (error) {
		job->aio_error = (void *)(intptr_t)error;
		aiotx_free_job(job);
	}
	if (m != NULL)
		m_free(m);
	SOCKBUF_LOCK(sb);
}
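/*
 * Task handler scheduled by t4_aiotx_queue_toep().  Drains the aiotx
 * job queue for as long as the socket stays writable and then drops
 * the socket and toepcb references that were taken when the task was
 * queued.
 */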
static void
t4_aiotx_task(void *context, int pending)
{
	struct toepcb *toep = context;
	struct socket *so;
	struct kaiocb *job;

	so = toep->aiotx_so;
	CURVNET_SET(toep->vnet);
	SOCKBUF_LOCK(&so->so_snd);
	while (!TAILQ_EMPTY(&toep->aiotx_jobq) && sowriteable(so)) {
		job = TAILQ_FIRST(&toep->aiotx_jobq);
		TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
		if (!aio_clear_cancel_function(job))
			continue;

		t4_aiotx_process_job(toep, so, job);
	}
	toep->aiotx_so = NULL;
	SOCKBUF_UNLOCK(&so->so_snd);
	CURVNET_RESTORE();

	free_toepcb(toep);
	SOCK_LOCK(so);
	sorele(so);
}

static void
t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep)
{

	SOCKBUF_LOCK_ASSERT(&toep->inp->inp_socket->so_snd);
#ifdef VERBOSE_TRACES
	CTR3(KTR_CXGBE, "%s: queueing aiotx task for tid %d, active = %s",
	    __func__, toep->tid, toep->aiotx_so != NULL ? "true" : "false");
#endif
	if (toep->aiotx_so != NULL)
		return;
	soref(so);
	toep->aiotx_so = so;
	hold_toepcb(toep);
	soaio_enqueue(&toep->aiotx_task);
}

static void
t4_aiotx_cancel(struct kaiocb *job)
{
	struct socket *so;
	struct sockbuf *sb;
	struct tcpcb *tp;
	struct toepcb *toep;

	so = job->fd_file->f_data;
	tp = so_sototcpcb(so);
	toep = tp->t_toe;
	MPASS(job->uaiocb.aio_lio_opcode == LIO_WRITE);
	sb = &so->so_snd;

	SOCKBUF_LOCK(sb);
	if (!aio_cancel_cleared(job))
		TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
	SOCKBUF_UNLOCK(sb);

	job->aio_error = (void *)(intptr_t)ECANCELED;
	aiotx_free_job(job);
}

int
t4_aio_queue_aiotx(struct socket *so, struct kaiocb *job)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct adapter *sc = td_adapter(toep->td);

	/* This only handles writes. */
	if (job->uaiocb.aio_lio_opcode != LIO_WRITE)
		return (EOPNOTSUPP);

	if (!sc->tt.tx_zcopy)
		return (EOPNOTSUPP);

	if (tls_tx_key(toep))
		return (EOPNOTSUPP);

	SOCKBUF_LOCK(&so->so_snd);
#ifdef VERBOSE_TRACES
	CTR3(KTR_CXGBE, "%s: queueing %p for tid %u", __func__, job, toep->tid);
#endif
	if (!aio_set_cancel_function(job, t4_aiotx_cancel))
		panic("new job was cancelled");
	refcount_init(&job->aio_refs, 1);
	TAILQ_INSERT_TAIL(&toep->aiotx_jobq, job, list);
	if (sowriteable(so))
		t4_aiotx_queue_toep(so, toep);
	SOCKBUF_UNLOCK(&so->so_snd);
	return (0);
}

void
aiotx_init_toep(struct toepcb *toep)
{

	TAILQ_INIT(&toep->aiotx_jobq);
	TASK_INIT(&toep->aiotx_task, 0, t4_aiotx_task, toep);
}
#endif