1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2012, 2015 Chelsio Communications, Inc. 5 * All rights reserved. 6 * Written by: Navdeep Parhar <np@FreeBSD.org> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 #include <sys/cdefs.h> 31 __FBSDID("$FreeBSD$"); 32 33 #include "opt_inet.h" 34 #include "opt_inet6.h" 35 #include "opt_kern_tls.h" 36 #include "opt_ratelimit.h" 37 38 #ifdef TCP_OFFLOAD 39 #include <sys/param.h> 40 #include <sys/aio.h> 41 #include <sys/file.h> 42 #include <sys/kernel.h> 43 #include <sys/ktr.h> 44 #include <sys/module.h> 45 #include <sys/proc.h> 46 #include <sys/protosw.h> 47 #include <sys/domain.h> 48 #include <sys/socket.h> 49 #include <sys/socketvar.h> 50 #include <sys/sglist.h> 51 #include <sys/taskqueue.h> 52 #include <netinet/in.h> 53 #include <netinet/in_pcb.h> 54 #include <netinet/ip.h> 55 #include <netinet/ip6.h> 56 #define TCPSTATES 57 #include <netinet/tcp_fsm.h> 58 #include <netinet/tcp_seq.h> 59 #include <netinet/tcp_var.h> 60 #include <netinet/toecore.h> 61 62 #include <security/mac/mac_framework.h> 63 64 #include <vm/vm.h> 65 #include <vm/vm_extern.h> 66 #include <vm/pmap.h> 67 #include <vm/vm_map.h> 68 #include <vm/vm_page.h> 69 70 #include "common/common.h" 71 #include "common/t4_msg.h" 72 #include "common/t4_regs.h" 73 #include "common/t4_tcb.h" 74 #include "tom/t4_tom_l2t.h" 75 #include "tom/t4_tom.h" 76 77 static void t4_aiotx_cancel(struct kaiocb *job); 78 static void t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep); 79 80 void 81 send_flowc_wr(struct toepcb *toep, struct tcpcb *tp) 82 { 83 struct wrqe *wr; 84 struct fw_flowc_wr *flowc; 85 unsigned int nparams, flowclen, paramidx; 86 struct vi_info *vi = toep->vi; 87 struct port_info *pi = vi->pi; 88 struct adapter *sc = pi->adapter; 89 unsigned int pfvf = sc->pf << S_FW_VIID_PFN; 90 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; 91 92 KASSERT(!(toep->flags & TPF_FLOWC_WR_SENT), 93 ("%s: flowc for tid %u sent already", __func__, toep->tid)); 94 95 if (tp != NULL) 96 nparams = 8; 97 else 98 nparams = 6; 99 if (ulp_mode(toep) == ULP_MODE_TLS) 100 nparams++; 101 if (toep->tls.fcplenmax != 0) 102 nparams++; 103 if (toep->params.tc_idx != -1) { 104 MPASS(toep->params.tc_idx >= 0 && 105 
toep->params.tc_idx < sc->chip_params->nsched_cls); 106 nparams++; 107 } 108 109 flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval); 110 111 wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq); 112 if (wr == NULL) { 113 /* XXX */ 114 panic("%s: allocation failure.", __func__); 115 } 116 flowc = wrtod(wr); 117 memset(flowc, 0, wr->wr_len); 118 119 flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | 120 V_FW_FLOWC_WR_NPARAMS(nparams)); 121 flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) | 122 V_FW_WR_FLOWID(toep->tid)); 123 124 #define FLOWC_PARAM(__m, __v) \ 125 do { \ 126 flowc->mnemval[paramidx].mnemonic = FW_FLOWC_MNEM_##__m; \ 127 flowc->mnemval[paramidx].val = htobe32(__v); \ 128 paramidx++; \ 129 } while (0) 130 131 paramidx = 0; 132 133 FLOWC_PARAM(PFNVFN, pfvf); 134 FLOWC_PARAM(CH, pi->tx_chan); 135 FLOWC_PARAM(PORT, pi->tx_chan); 136 FLOWC_PARAM(IQID, toep->ofld_rxq->iq.abs_id); 137 FLOWC_PARAM(SNDBUF, toep->params.sndbuf); 138 if (tp) { 139 FLOWC_PARAM(MSS, toep->params.emss); 140 FLOWC_PARAM(SNDNXT, tp->snd_nxt); 141 FLOWC_PARAM(RCVNXT, tp->rcv_nxt); 142 } else 143 FLOWC_PARAM(MSS, 512); 144 CTR6(KTR_CXGBE, 145 "%s: tid %u, mss %u, sndbuf %u, snd_nxt 0x%x, rcv_nxt 0x%x", 146 __func__, toep->tid, toep->params.emss, toep->params.sndbuf, 147 tp ? tp->snd_nxt : 0, tp ? tp->rcv_nxt : 0); 148 149 if (ulp_mode(toep) == ULP_MODE_TLS) 150 FLOWC_PARAM(ULP_MODE, ulp_mode(toep)); 151 if (toep->tls.fcplenmax != 0) 152 FLOWC_PARAM(TXDATAPLEN_MAX, toep->tls.fcplenmax); 153 if (toep->params.tc_idx != -1) 154 FLOWC_PARAM(SCHEDCLASS, toep->params.tc_idx); 155 #undef FLOWC_PARAM 156 157 KASSERT(paramidx == nparams, ("nparams mismatch")); 158 159 txsd->tx_credits = howmany(flowclen, 16); 160 txsd->plen = 0; 161 KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0, 162 ("%s: not enough credits (%d)", __func__, toep->tx_credits)); 163 toep->tx_credits -= txsd->tx_credits; 164 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) 165 toep->txsd_pidx = 0; 166 toep->txsd_avail--; 167 168 toep->flags |= TPF_FLOWC_WR_SENT; 169 t4_wrq_tx(sc, wr); 170 } 171 172 #ifdef RATELIMIT 173 /* 174 * Input is Bytes/second (so_max_pacing_rate), chip counts in Kilobits/second. 
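 * A so_max_pacing_rate of 1250000 B/s, for instance, is converted below to
 * 1250000 * 8 / 1000 = 10000 Kb/s before a scheduling class is reserved.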
175 */ 176 static int 177 update_tx_rate_limit(struct adapter *sc, struct toepcb *toep, u_int Bps) 178 { 179 int tc_idx, rc; 180 const u_int kbps = (u_int) (uint64_t)Bps * 8ULL / 1000; 181 const int port_id = toep->vi->pi->port_id; 182 183 CTR3(KTR_CXGBE, "%s: tid %u, rate %uKbps", __func__, toep->tid, kbps); 184 185 if (kbps == 0) { 186 /* unbind */ 187 tc_idx = -1; 188 } else { 189 rc = t4_reserve_cl_rl_kbps(sc, port_id, kbps, &tc_idx); 190 if (rc != 0) 191 return (rc); 192 MPASS(tc_idx >= 0 && tc_idx < sc->chip_params->nsched_cls); 193 } 194 195 if (toep->params.tc_idx != tc_idx) { 196 struct wrqe *wr; 197 struct fw_flowc_wr *flowc; 198 int nparams = 1, flowclen, flowclen16; 199 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; 200 201 flowclen = sizeof(*flowc) + nparams * sizeof(struct 202 fw_flowc_mnemval); 203 flowclen16 = howmany(flowclen, 16); 204 if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0 || 205 (wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq)) == NULL) { 206 if (tc_idx >= 0) 207 t4_release_cl_rl(sc, port_id, tc_idx); 208 return (ENOMEM); 209 } 210 211 flowc = wrtod(wr); 212 memset(flowc, 0, wr->wr_len); 213 214 flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | 215 V_FW_FLOWC_WR_NPARAMS(nparams)); 216 flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) | 217 V_FW_WR_FLOWID(toep->tid)); 218 219 flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS; 220 if (tc_idx == -1) 221 flowc->mnemval[0].val = htobe32(0xff); 222 else 223 flowc->mnemval[0].val = htobe32(tc_idx); 224 225 txsd->tx_credits = flowclen16; 226 txsd->plen = 0; 227 toep->tx_credits -= txsd->tx_credits; 228 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) 229 toep->txsd_pidx = 0; 230 toep->txsd_avail--; 231 t4_wrq_tx(sc, wr); 232 } 233 234 if (toep->params.tc_idx >= 0) 235 t4_release_cl_rl(sc, port_id, toep->params.tc_idx); 236 toep->params.tc_idx = tc_idx; 237 238 return (0); 239 } 240 #endif 241 242 void 243 send_reset(struct adapter *sc, struct toepcb *toep, uint32_t snd_nxt) 244 { 245 struct wrqe *wr; 246 struct cpl_abort_req *req; 247 int tid = toep->tid; 248 struct inpcb *inp = toep->inp; 249 struct tcpcb *tp = intotcpcb(inp); /* don't use if INP_DROPPED */ 250 251 INP_WLOCK_ASSERT(inp); 252 253 CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x%s", 254 __func__, toep->tid, 255 inp->inp_flags & INP_DROPPED ? "inp dropped" : 256 tcpstates[tp->t_state], 257 toep->flags, inp->inp_flags, 258 toep->flags & TPF_ABORT_SHUTDOWN ? 259 " (abort already in progress)" : ""); 260 261 if (toep->flags & TPF_ABORT_SHUTDOWN) 262 return; /* abort already in progress */ 263 264 toep->flags |= TPF_ABORT_SHUTDOWN; 265 266 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 267 ("%s: flowc_wr not sent for tid %d.", __func__, tid)); 268 269 wr = alloc_wrqe(sizeof(*req), toep->ofld_txq); 270 if (wr == NULL) { 271 /* XXX */ 272 panic("%s: allocation failure.", __func__); 273 } 274 req = wrtod(wr); 275 276 INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid); 277 if (inp->inp_flags & INP_DROPPED) 278 req->rsvd0 = htobe32(snd_nxt); 279 else 280 req->rsvd0 = htobe32(tp->snd_nxt); 281 req->rsvd1 = !(toep->flags & TPF_TX_DATA_SENT); 282 req->cmd = CPL_ABORT_SEND_RST; 283 284 /* 285 * XXX: What's the correct way to tell that the inp hasn't been detached 286 * from its socket? Should I even be flushing the snd buffer here? 287 */ 288 if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) { 289 struct socket *so = inp->inp_socket; 290 291 if (so != NULL) /* because I'm not sure. 
See comment above */ 292 sbflush(&so->so_snd); 293 } 294 295 t4_l2t_send(sc, wr, toep->l2te); 296 } 297 298 /* 299 * Called when a connection is established to translate the TCP options 300 * reported by HW to FreeBSD's native format. 301 */ 302 static void 303 assign_rxopt(struct tcpcb *tp, uint16_t opt) 304 { 305 struct toepcb *toep = tp->t_toe; 306 struct inpcb *inp = tp->t_inpcb; 307 struct adapter *sc = td_adapter(toep->td); 308 309 INP_LOCK_ASSERT(inp); 310 311 toep->params.mtu_idx = G_TCPOPT_MSS(opt); 312 tp->t_maxseg = sc->params.mtus[toep->params.mtu_idx]; 313 if (inp->inp_inc.inc_flags & INC_ISIPV6) 314 tp->t_maxseg -= sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 315 else 316 tp->t_maxseg -= sizeof(struct ip) + sizeof(struct tcphdr); 317 318 toep->params.emss = tp->t_maxseg; 319 if (G_TCPOPT_TSTAMP(opt)) { 320 toep->params.tstamp = 1; 321 toep->params.emss -= TCPOLEN_TSTAMP_APPA; 322 tp->t_flags |= TF_RCVD_TSTMP; /* timestamps ok */ 323 tp->ts_recent = 0; /* hmmm */ 324 tp->ts_recent_age = tcp_ts_getticks(); 325 } else 326 toep->params.tstamp = 0; 327 328 if (G_TCPOPT_SACK(opt)) { 329 toep->params.sack = 1; 330 tp->t_flags |= TF_SACK_PERMIT; /* should already be set */ 331 } else { 332 toep->params.sack = 0; 333 tp->t_flags &= ~TF_SACK_PERMIT; /* sack disallowed by peer */ 334 } 335 336 if (G_TCPOPT_WSCALE_OK(opt)) 337 tp->t_flags |= TF_RCVD_SCALE; 338 339 /* Doing window scaling? */ 340 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 341 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 342 tp->rcv_scale = tp->request_r_scale; 343 tp->snd_scale = G_TCPOPT_SND_WSCALE(opt); 344 } else 345 toep->params.wscale = 0; 346 347 CTR6(KTR_CXGBE, 348 "assign_rxopt: tid %d, mtu_idx %u, emss %u, ts %u, sack %u, wscale %u", 349 toep->tid, toep->params.mtu_idx, toep->params.emss, 350 toep->params.tstamp, toep->params.sack, toep->params.wscale); 351 } 352 353 /* 354 * Completes some final bits of initialization for just established connections 355 * and changes their state to TCPS_ESTABLISHED. 356 * 357 * The ISNs are from the exchange of SYNs. 
358 */ 359 void 360 make_established(struct toepcb *toep, uint32_t iss, uint32_t irs, uint16_t opt) 361 { 362 struct inpcb *inp = toep->inp; 363 struct socket *so = inp->inp_socket; 364 struct tcpcb *tp = intotcpcb(inp); 365 uint16_t tcpopt = be16toh(opt); 366 367 INP_WLOCK_ASSERT(inp); 368 KASSERT(tp->t_state == TCPS_SYN_SENT || 369 tp->t_state == TCPS_SYN_RECEIVED, 370 ("%s: TCP state %s", __func__, tcpstates[tp->t_state])); 371 372 CTR6(KTR_CXGBE, "%s: tid %d, so %p, inp %p, tp %p, toep %p", 373 __func__, toep->tid, so, inp, tp, toep); 374 375 tcp_state_change(tp, TCPS_ESTABLISHED); 376 tp->t_starttime = ticks; 377 TCPSTAT_INC(tcps_connects); 378 379 tp->irs = irs; 380 tcp_rcvseqinit(tp); 381 tp->rcv_wnd = (u_int)toep->params.opt0_bufsize << 10; 382 tp->rcv_adv += tp->rcv_wnd; 383 tp->last_ack_sent = tp->rcv_nxt; 384 385 tp->iss = iss; 386 tcp_sendseqinit(tp); 387 tp->snd_una = iss + 1; 388 tp->snd_nxt = iss + 1; 389 tp->snd_max = iss + 1; 390 391 assign_rxopt(tp, tcpopt); 392 send_flowc_wr(toep, tp); 393 394 soisconnected(so); 395 } 396 397 int 398 send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits) 399 { 400 struct wrqe *wr; 401 struct cpl_rx_data_ack *req; 402 uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); 403 404 KASSERT(credits >= 0, ("%s: %d credits", __func__, credits)); 405 406 wr = alloc_wrqe(sizeof(*req), toep->ctrlq); 407 if (wr == NULL) 408 return (0); 409 req = wrtod(wr); 410 411 INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid); 412 req->credit_dack = htobe32(dack | V_RX_CREDITS(credits)); 413 414 t4_wrq_tx(sc, wr); 415 return (credits); 416 } 417 418 void 419 send_rx_modulate(struct adapter *sc, struct toepcb *toep) 420 { 421 struct wrqe *wr; 422 struct cpl_rx_data_ack *req; 423 424 wr = alloc_wrqe(sizeof(*req), toep->ctrlq); 425 if (wr == NULL) 426 return; 427 req = wrtod(wr); 428 429 INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid); 430 req->credit_dack = htobe32(F_RX_MODULATE_RX); 431 432 t4_wrq_tx(sc, wr); 433 } 434 435 void 436 t4_rcvd_locked(struct toedev *tod, struct tcpcb *tp) 437 { 438 struct adapter *sc = tod->tod_softc; 439 struct inpcb *inp = tp->t_inpcb; 440 struct socket *so = inp->inp_socket; 441 struct sockbuf *sb = &so->so_rcv; 442 struct toepcb *toep = tp->t_toe; 443 int rx_credits; 444 445 INP_WLOCK_ASSERT(inp); 446 SOCKBUF_LOCK_ASSERT(sb); 447 448 rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0; 449 if (rx_credits > 0 && 450 (tp->rcv_wnd <= 32 * 1024 || rx_credits >= 64 * 1024 || 451 (rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) || 452 sbused(sb) + tp->rcv_wnd < sb->sb_lowat)) { 453 rx_credits = send_rx_credits(sc, toep, rx_credits); 454 tp->rcv_wnd += rx_credits; 455 tp->rcv_adv += rx_credits; 456 } else if (toep->flags & TPF_FORCE_CREDITS) 457 send_rx_modulate(sc, toep); 458 } 459 460 void 461 t4_rcvd(struct toedev *tod, struct tcpcb *tp) 462 { 463 struct inpcb *inp = tp->t_inpcb; 464 struct socket *so = inp->inp_socket; 465 struct sockbuf *sb = &so->so_rcv; 466 467 SOCKBUF_LOCK(sb); 468 t4_rcvd_locked(tod, tp); 469 SOCKBUF_UNLOCK(sb); 470 } 471 472 /* 473 * Close a connection by sending a CPL_CLOSE_CON_REQ message. 474 */ 475 int 476 t4_close_conn(struct adapter *sc, struct toepcb *toep) 477 { 478 struct wrqe *wr; 479 struct cpl_close_con_req *req; 480 unsigned int tid = toep->tid; 481 482 CTR3(KTR_CXGBE, "%s: tid %u%s", __func__, toep->tid, 483 toep->flags & TPF_FIN_SENT ? 
", IGNORED" : ""); 484 485 if (toep->flags & TPF_FIN_SENT) 486 return (0); 487 488 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 489 ("%s: flowc_wr not sent for tid %u.", __func__, tid)); 490 491 wr = alloc_wrqe(sizeof(*req), toep->ofld_txq); 492 if (wr == NULL) { 493 /* XXX */ 494 panic("%s: allocation failure.", __func__); 495 } 496 req = wrtod(wr); 497 498 req->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) | 499 V_FW_WR_IMMDLEN(sizeof(*req) - sizeof(req->wr))); 500 req->wr.wr_mid = htonl(V_FW_WR_LEN16(howmany(sizeof(*req), 16)) | 501 V_FW_WR_FLOWID(tid)); 502 req->wr.wr_lo = cpu_to_be64(0); 503 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid)); 504 req->rsvd = 0; 505 506 toep->flags |= TPF_FIN_SENT; 507 toep->flags &= ~TPF_SEND_FIN; 508 t4_l2t_send(sc, wr, toep->l2te); 509 510 return (0); 511 } 512 513 #define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16) 514 #define MIN_OFLD_TX_CREDITS (howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16)) 515 516 /* Maximum amount of immediate data we could stuff in a WR */ 517 static inline int 518 max_imm_payload(int tx_credits) 519 { 520 const int n = 1; /* Use no more than one desc for imm. data WR */ 521 522 KASSERT(tx_credits >= 0 && 523 tx_credits <= MAX_OFLD_TX_CREDITS, 524 ("%s: %d credits", __func__, tx_credits)); 525 526 if (tx_credits < MIN_OFLD_TX_CREDITS) 527 return (0); 528 529 if (tx_credits >= (n * EQ_ESIZE) / 16) 530 return ((n * EQ_ESIZE) - sizeof(struct fw_ofld_tx_data_wr)); 531 else 532 return (tx_credits * 16 - sizeof(struct fw_ofld_tx_data_wr)); 533 } 534 535 /* Maximum number of SGL entries we could stuff in a WR */ 536 static inline int 537 max_dsgl_nsegs(int tx_credits) 538 { 539 int nseg = 1; /* ulptx_sgl has room for 1, rest ulp_tx_sge_pair */ 540 int sge_pair_credits = tx_credits - MIN_OFLD_TX_CREDITS; 541 542 KASSERT(tx_credits >= 0 && 543 tx_credits <= MAX_OFLD_TX_CREDITS, 544 ("%s: %d credits", __func__, tx_credits)); 545 546 if (tx_credits < MIN_OFLD_TX_CREDITS) 547 return (0); 548 549 nseg += 2 * (sge_pair_credits * 16 / 24); 550 if ((sge_pair_credits * 16) % 24 == 16) 551 nseg++; 552 553 return (nseg); 554 } 555 556 static inline void 557 write_tx_wr(void *dst, struct toepcb *toep, unsigned int immdlen, 558 unsigned int plen, uint8_t credits, int shove, int ulp_submode) 559 { 560 struct fw_ofld_tx_data_wr *txwr = dst; 561 562 txwr->op_to_immdlen = htobe32(V_WR_OP(FW_OFLD_TX_DATA_WR) | 563 V_FW_WR_IMMDLEN(immdlen)); 564 txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) | 565 V_FW_WR_LEN16(credits)); 566 txwr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(ulp_mode(toep)) | 567 V_TX_ULP_SUBMODE(ulp_submode) | V_TX_URG(0) | V_TX_SHOVE(shove)); 568 txwr->plen = htobe32(plen); 569 570 if (toep->params.tx_align > 0) { 571 if (plen < 2 * toep->params.emss) 572 txwr->lsodisable_to_flags |= 573 htobe32(F_FW_OFLD_TX_DATA_WR_LSODISABLE); 574 else 575 txwr->lsodisable_to_flags |= 576 htobe32(F_FW_OFLD_TX_DATA_WR_ALIGNPLD | 577 (toep->params.nagle == 0 ? 0 : 578 F_FW_OFLD_TX_DATA_WR_ALIGNPLDSHOVE)); 579 } 580 } 581 582 /* 583 * Generate a DSGL from a starting mbuf. The total number of segments and the 584 * maximum segments in any one mbuf are provided. 
585 */ 586 static void 587 write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n) 588 { 589 struct mbuf *m; 590 struct ulptx_sgl *usgl = dst; 591 int i, j, rc; 592 struct sglist sg; 593 struct sglist_seg segs[n]; 594 595 KASSERT(nsegs > 0, ("%s: nsegs 0", __func__)); 596 597 sglist_init(&sg, n, segs); 598 usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) | 599 V_ULPTX_NSGE(nsegs)); 600 601 i = -1; 602 for (m = start; m != stop; m = m->m_next) { 603 if (m->m_flags & M_EXTPG) 604 rc = sglist_append_mbuf_epg(&sg, m, 605 mtod(m, vm_offset_t), m->m_len); 606 else 607 rc = sglist_append(&sg, mtod(m, void *), m->m_len); 608 if (__predict_false(rc != 0)) 609 panic("%s: sglist_append %d", __func__, rc); 610 611 for (j = 0; j < sg.sg_nseg; i++, j++) { 612 if (i < 0) { 613 usgl->len0 = htobe32(segs[j].ss_len); 614 usgl->addr0 = htobe64(segs[j].ss_paddr); 615 } else { 616 usgl->sge[i / 2].len[i & 1] = 617 htobe32(segs[j].ss_len); 618 usgl->sge[i / 2].addr[i & 1] = 619 htobe64(segs[j].ss_paddr); 620 } 621 #ifdef INVARIANTS 622 nsegs--; 623 #endif 624 } 625 sglist_reset(&sg); 626 } 627 if (i & 1) 628 usgl->sge[i / 2].len[1] = htobe32(0); 629 KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, stop %p", 630 __func__, nsegs, start, stop)); 631 } 632 633 /* 634 * Max number of SGL entries an offload tx work request can have. This is 41 635 * (1 + 40) for a full 512B work request. 636 * fw_ofld_tx_data_wr(16B) + ulptx_sgl(16B, 1) + ulptx_sge_pair(480B, 40) 637 */ 638 #define OFLD_SGL_LEN (41) 639 640 /* 641 * Send data and/or a FIN to the peer. 642 * 643 * The socket's so_snd buffer consists of a stream of data starting with sb_mb 644 * and linked together with m_next. sb_sndptr, if set, is the last mbuf that 645 * was transmitted. 646 * 647 * drop indicates the number of bytes that should be dropped from the head of 648 * the send buffer. It is an optimization that lets do_fw4_ack avoid creating 649 * contention on the send buffer lock (before this change it used to do 650 * sowwakeup and then t4_push_frames right after that when recovering from tx 651 * stalls). When drop is set this function MUST drop the bytes and wake up any 652 * writers. 
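 * The caller must hold the inp lock throughout; the send buffer lock is
 * taken and dropped internally (see the SOCKBUF_LOCK/UNLOCK pairs below).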
653 */ 654 void 655 t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop) 656 { 657 struct mbuf *sndptr, *m, *sb_sndptr; 658 struct fw_ofld_tx_data_wr *txwr; 659 struct wrqe *wr; 660 u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf; 661 struct inpcb *inp = toep->inp; 662 struct tcpcb *tp = intotcpcb(inp); 663 struct socket *so = inp->inp_socket; 664 struct sockbuf *sb = &so->so_snd; 665 int tx_credits, shove, compl, sowwakeup; 666 struct ofld_tx_sdesc *txsd; 667 bool nomap_mbuf_seen; 668 669 INP_WLOCK_ASSERT(inp); 670 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 671 ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid)); 672 673 KASSERT(ulp_mode(toep) == ULP_MODE_NONE || 674 ulp_mode(toep) == ULP_MODE_TCPDDP || 675 ulp_mode(toep) == ULP_MODE_TLS || 676 ulp_mode(toep) == ULP_MODE_RDMA, 677 ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep)); 678 679 #ifdef VERBOSE_TRACES 680 CTR5(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d", 681 __func__, toep->tid, toep->flags, tp->t_flags, drop); 682 #endif 683 if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) 684 return; 685 686 #ifdef RATELIMIT 687 if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) && 688 (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) { 689 inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; 690 } 691 #endif 692 693 /* 694 * This function doesn't resume by itself. Someone else must clear the 695 * flag and call this function. 696 */ 697 if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) { 698 KASSERT(drop == 0, 699 ("%s: drop (%d) != 0 but tx is suspended", __func__, drop)); 700 return; 701 } 702 703 txsd = &toep->txsd[toep->txsd_pidx]; 704 do { 705 tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS); 706 max_imm = max_imm_payload(tx_credits); 707 max_nsegs = max_dsgl_nsegs(tx_credits); 708 709 SOCKBUF_LOCK(sb); 710 sowwakeup = drop; 711 if (drop) { 712 sbdrop_locked(sb, drop); 713 drop = 0; 714 } 715 sb_sndptr = sb->sb_sndptr; 716 sndptr = sb_sndptr ? 
sb_sndptr->m_next : sb->sb_mb; 717 plen = 0; 718 nsegs = 0; 719 max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */ 720 nomap_mbuf_seen = false; 721 for (m = sndptr; m != NULL; m = m->m_next) { 722 int n; 723 724 if (m->m_flags & M_EXTPG) { 725 #ifdef KERN_TLS 726 if (m->m_epg_tls != NULL) { 727 toep->flags |= TPF_KTLS; 728 if (plen == 0) { 729 SOCKBUF_UNLOCK(sb); 730 t4_push_ktls(sc, toep, 0); 731 return; 732 } 733 break; 734 } 735 #endif 736 n = sglist_count_mbuf_epg(m, 737 mtod(m, vm_offset_t), m->m_len); 738 } else 739 n = sglist_count(mtod(m, void *), m->m_len); 740 741 nsegs += n; 742 plen += m->m_len; 743 744 /* This mbuf sent us _over_ the nsegs limit, back out */ 745 if (plen > max_imm && nsegs > max_nsegs) { 746 nsegs -= n; 747 plen -= m->m_len; 748 if (plen == 0) { 749 /* Too few credits */ 750 toep->flags |= TPF_TX_SUSPENDED; 751 if (sowwakeup) { 752 if (!TAILQ_EMPTY( 753 &toep->aiotx_jobq)) 754 t4_aiotx_queue_toep(so, 755 toep); 756 sowwakeup_locked(so); 757 } else 758 SOCKBUF_UNLOCK(sb); 759 SOCKBUF_UNLOCK_ASSERT(sb); 760 return; 761 } 762 break; 763 } 764 765 if (m->m_flags & M_EXTPG) 766 nomap_mbuf_seen = true; 767 if (max_nsegs_1mbuf < n) 768 max_nsegs_1mbuf = n; 769 sb_sndptr = m; /* new sb->sb_sndptr if all goes well */ 770 771 /* This mbuf put us right at the max_nsegs limit */ 772 if (plen > max_imm && nsegs == max_nsegs) { 773 m = m->m_next; 774 break; 775 } 776 } 777 778 if (sbused(sb) > sb->sb_hiwat * 5 / 8 && 779 toep->plen_nocompl + plen >= sb->sb_hiwat / 4) 780 compl = 1; 781 else 782 compl = 0; 783 784 if (sb->sb_flags & SB_AUTOSIZE && 785 V_tcp_do_autosndbuf && 786 sb->sb_hiwat < V_tcp_autosndbuf_max && 787 sbused(sb) >= sb->sb_hiwat * 7 / 8) { 788 int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc, 789 V_tcp_autosndbuf_max); 790 791 if (!sbreserve_locked(sb, newsize, so, NULL)) 792 sb->sb_flags &= ~SB_AUTOSIZE; 793 else 794 sowwakeup = 1; /* room available */ 795 } 796 if (sowwakeup) { 797 if (!TAILQ_EMPTY(&toep->aiotx_jobq)) 798 t4_aiotx_queue_toep(so, toep); 799 sowwakeup_locked(so); 800 } else 801 SOCKBUF_UNLOCK(sb); 802 SOCKBUF_UNLOCK_ASSERT(sb); 803 804 /* nothing to send */ 805 if (plen == 0) { 806 KASSERT(m == NULL, 807 ("%s: nothing to send, but m != NULL", __func__)); 808 break; 809 } 810 811 if (__predict_false(toep->flags & TPF_FIN_SENT)) 812 panic("%s: excess tx.", __func__); 813 814 shove = m == NULL && !(tp->t_flags & TF_MORETOCOME); 815 if (plen <= max_imm && !nomap_mbuf_seen) { 816 817 /* Immediate data tx */ 818 819 wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16), 820 toep->ofld_txq); 821 if (wr == NULL) { 822 /* XXX: how will we recover from this? */ 823 toep->flags |= TPF_TX_SUSPENDED; 824 return; 825 } 826 txwr = wrtod(wr); 827 credits = howmany(wr->wr_len, 16); 828 write_tx_wr(txwr, toep, plen, plen, credits, shove, 0); 829 m_copydata(sndptr, 0, plen, (void *)(txwr + 1)); 830 nsegs = 0; 831 } else { 832 int wr_len; 833 834 /* DSGL tx */ 835 836 wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) + 837 ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8; 838 wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq); 839 if (wr == NULL) { 840 /* XXX: how will we recover from this? 
*/ 841 toep->flags |= TPF_TX_SUSPENDED; 842 return; 843 } 844 txwr = wrtod(wr); 845 credits = howmany(wr_len, 16); 846 write_tx_wr(txwr, toep, 0, plen, credits, shove, 0); 847 write_tx_sgl(txwr + 1, sndptr, m, nsegs, 848 max_nsegs_1mbuf); 849 if (wr_len & 0xf) { 850 uint64_t *pad = (uint64_t *) 851 ((uintptr_t)txwr + wr_len); 852 *pad = 0; 853 } 854 } 855 856 KASSERT(toep->tx_credits >= credits, 857 ("%s: not enough credits", __func__)); 858 859 toep->tx_credits -= credits; 860 toep->tx_nocompl += credits; 861 toep->plen_nocompl += plen; 862 if (toep->tx_credits <= toep->tx_total * 3 / 8 && 863 toep->tx_nocompl >= toep->tx_total / 4) 864 compl = 1; 865 866 if (compl || ulp_mode(toep) == ULP_MODE_RDMA) { 867 txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL); 868 toep->tx_nocompl = 0; 869 toep->plen_nocompl = 0; 870 } 871 872 tp->snd_nxt += plen; 873 tp->snd_max += plen; 874 875 SOCKBUF_LOCK(sb); 876 KASSERT(sb_sndptr, ("%s: sb_sndptr is NULL", __func__)); 877 sb->sb_sndptr = sb_sndptr; 878 SOCKBUF_UNLOCK(sb); 879 880 toep->flags |= TPF_TX_DATA_SENT; 881 if (toep->tx_credits < MIN_OFLD_TX_CREDITS) 882 toep->flags |= TPF_TX_SUSPENDED; 883 884 KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); 885 txsd->plen = plen; 886 txsd->tx_credits = credits; 887 txsd++; 888 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) { 889 toep->txsd_pidx = 0; 890 txsd = &toep->txsd[0]; 891 } 892 toep->txsd_avail--; 893 894 t4_l2t_send(sc, wr, toep->l2te); 895 } while (m != NULL); 896 897 /* Send a FIN if requested, but only if there's no more data to send */ 898 if (m == NULL && toep->flags & TPF_SEND_FIN) 899 t4_close_conn(sc, toep); 900 } 901 902 static inline void 903 rqdrop_locked(struct mbufq *q, int plen) 904 { 905 struct mbuf *m; 906 907 while (plen > 0) { 908 m = mbufq_dequeue(q); 909 910 /* Too many credits. */ 911 MPASS(m != NULL); 912 M_ASSERTPKTHDR(m); 913 914 /* Partial credits. */ 915 MPASS(plen >= m->m_pkthdr.len); 916 917 plen -= m->m_pkthdr.len; 918 m_freem(m); 919 } 920 } 921 922 void 923 t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop) 924 { 925 struct mbuf *sndptr, *m; 926 struct fw_ofld_tx_data_wr *txwr; 927 struct wrqe *wr; 928 u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf; 929 u_int adjusted_plen, ulp_submode; 930 struct inpcb *inp = toep->inp; 931 struct tcpcb *tp = intotcpcb(inp); 932 int tx_credits, shove; 933 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; 934 struct mbufq *pduq = &toep->ulp_pduq; 935 static const u_int ulp_extra_len[] = {0, 4, 4, 8}; 936 937 INP_WLOCK_ASSERT(inp); 938 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 939 ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid)); 940 KASSERT(ulp_mode(toep) == ULP_MODE_ISCSI, 941 ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep)); 942 943 if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) 944 return; 945 946 /* 947 * This function doesn't resume by itself. Someone else must clear the 948 * flag and call this function. 
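         * (do_fw4_ack clears TPF_TX_SUSPENDED and calls t4_push_data once
         * enough tx credits have been returned.)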
949 */ 950 if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) { 951 KASSERT(drop == 0, 952 ("%s: drop (%d) != 0 but tx is suspended", __func__, drop)); 953 return; 954 } 955 956 if (drop) 957 rqdrop_locked(&toep->ulp_pdu_reclaimq, drop); 958 959 while ((sndptr = mbufq_first(pduq)) != NULL) { 960 M_ASSERTPKTHDR(sndptr); 961 962 tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS); 963 max_imm = max_imm_payload(tx_credits); 964 max_nsegs = max_dsgl_nsegs(tx_credits); 965 966 plen = 0; 967 nsegs = 0; 968 max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */ 969 for (m = sndptr; m != NULL; m = m->m_next) { 970 int n = sglist_count(mtod(m, void *), m->m_len); 971 972 nsegs += n; 973 plen += m->m_len; 974 975 /* 976 * This mbuf would send us _over_ the nsegs limit. 977 * Suspend tx because the PDU can't be sent out. 978 */ 979 if (plen > max_imm && nsegs > max_nsegs) { 980 toep->flags |= TPF_TX_SUSPENDED; 981 return; 982 } 983 984 if (max_nsegs_1mbuf < n) 985 max_nsegs_1mbuf = n; 986 } 987 988 if (__predict_false(toep->flags & TPF_FIN_SENT)) 989 panic("%s: excess tx.", __func__); 990 991 /* 992 * We have a PDU to send. All of it goes out in one WR so 'm' 993 * is NULL. A PDU's length is always a multiple of 4. 994 */ 995 MPASS(m == NULL); 996 MPASS((plen & 3) == 0); 997 MPASS(sndptr->m_pkthdr.len == plen); 998 999 shove = !(tp->t_flags & TF_MORETOCOME); 1000 ulp_submode = mbuf_ulp_submode(sndptr); 1001 MPASS(ulp_submode < nitems(ulp_extra_len)); 1002 1003 /* 1004 * plen doesn't include header and data digests, which are 1005 * generated and inserted in the right places by the TOE, but 1006 * they do occupy TCP sequence space and need to be accounted 1007 * for. 1008 */ 1009 adjusted_plen = plen + ulp_extra_len[ulp_submode]; 1010 if (plen <= max_imm) { 1011 1012 /* Immediate data tx */ 1013 1014 wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16), 1015 toep->ofld_txq); 1016 if (wr == NULL) { 1017 /* XXX: how will we recover from this? */ 1018 toep->flags |= TPF_TX_SUSPENDED; 1019 return; 1020 } 1021 txwr = wrtod(wr); 1022 credits = howmany(wr->wr_len, 16); 1023 write_tx_wr(txwr, toep, plen, adjusted_plen, credits, 1024 shove, ulp_submode); 1025 m_copydata(sndptr, 0, plen, (void *)(txwr + 1)); 1026 nsegs = 0; 1027 } else { 1028 int wr_len; 1029 1030 /* DSGL tx */ 1031 wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) + 1032 ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8; 1033 wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq); 1034 if (wr == NULL) { 1035 /* XXX: how will we recover from this? 
*/ 1036 toep->flags |= TPF_TX_SUSPENDED; 1037 return; 1038 } 1039 txwr = wrtod(wr); 1040 credits = howmany(wr_len, 16); 1041 write_tx_wr(txwr, toep, 0, adjusted_plen, credits, 1042 shove, ulp_submode); 1043 write_tx_sgl(txwr + 1, sndptr, m, nsegs, 1044 max_nsegs_1mbuf); 1045 if (wr_len & 0xf) { 1046 uint64_t *pad = (uint64_t *) 1047 ((uintptr_t)txwr + wr_len); 1048 *pad = 0; 1049 } 1050 } 1051 1052 KASSERT(toep->tx_credits >= credits, 1053 ("%s: not enough credits", __func__)); 1054 1055 m = mbufq_dequeue(pduq); 1056 MPASS(m == sndptr); 1057 mbufq_enqueue(&toep->ulp_pdu_reclaimq, m); 1058 1059 toep->tx_credits -= credits; 1060 toep->tx_nocompl += credits; 1061 toep->plen_nocompl += plen; 1062 if (toep->tx_credits <= toep->tx_total * 3 / 8 && 1063 toep->tx_nocompl >= toep->tx_total / 4) { 1064 txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL); 1065 toep->tx_nocompl = 0; 1066 toep->plen_nocompl = 0; 1067 } 1068 1069 tp->snd_nxt += adjusted_plen; 1070 tp->snd_max += adjusted_plen; 1071 1072 toep->flags |= TPF_TX_DATA_SENT; 1073 if (toep->tx_credits < MIN_OFLD_TX_CREDITS) 1074 toep->flags |= TPF_TX_SUSPENDED; 1075 1076 KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); 1077 txsd->plen = plen; 1078 txsd->tx_credits = credits; 1079 txsd++; 1080 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) { 1081 toep->txsd_pidx = 0; 1082 txsd = &toep->txsd[0]; 1083 } 1084 toep->txsd_avail--; 1085 1086 t4_l2t_send(sc, wr, toep->l2te); 1087 } 1088 1089 /* Send a FIN if requested, but only if there are no more PDUs to send */ 1090 if (mbufq_first(pduq) == NULL && toep->flags & TPF_SEND_FIN) 1091 t4_close_conn(sc, toep); 1092 } 1093 1094 static inline void 1095 t4_push_data(struct adapter *sc, struct toepcb *toep, int drop) 1096 { 1097 1098 if (ulp_mode(toep) == ULP_MODE_ISCSI) 1099 t4_push_pdus(sc, toep, drop); 1100 else if (tls_tx_key(toep) && toep->tls.mode == TLS_MODE_TLSOM) 1101 t4_push_tls_records(sc, toep, drop); 1102 #ifdef KERN_TLS 1103 else if (toep->flags & TPF_KTLS) 1104 t4_push_ktls(sc, toep, drop); 1105 #endif 1106 else 1107 t4_push_frames(sc, toep, drop); 1108 } 1109 1110 int 1111 t4_tod_output(struct toedev *tod, struct tcpcb *tp) 1112 { 1113 struct adapter *sc = tod->tod_softc; 1114 #ifdef INVARIANTS 1115 struct inpcb *inp = tp->t_inpcb; 1116 #endif 1117 struct toepcb *toep = tp->t_toe; 1118 1119 INP_WLOCK_ASSERT(inp); 1120 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1121 ("%s: inp %p dropped.", __func__, inp)); 1122 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 1123 1124 t4_push_data(sc, toep, 0); 1125 1126 return (0); 1127 } 1128 1129 int 1130 t4_send_fin(struct toedev *tod, struct tcpcb *tp) 1131 { 1132 struct adapter *sc = tod->tod_softc; 1133 #ifdef INVARIANTS 1134 struct inpcb *inp = tp->t_inpcb; 1135 #endif 1136 struct toepcb *toep = tp->t_toe; 1137 1138 INP_WLOCK_ASSERT(inp); 1139 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1140 ("%s: inp %p dropped.", __func__, inp)); 1141 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 1142 1143 toep->flags |= TPF_SEND_FIN; 1144 if (tp->t_state >= TCPS_ESTABLISHED) 1145 t4_push_data(sc, toep, 0); 1146 1147 return (0); 1148 } 1149 1150 int 1151 t4_send_rst(struct toedev *tod, struct tcpcb *tp) 1152 { 1153 struct adapter *sc = tod->tod_softc; 1154 #if defined(INVARIANTS) 1155 struct inpcb *inp = tp->t_inpcb; 1156 #endif 1157 struct toepcb *toep = tp->t_toe; 1158 1159 INP_WLOCK_ASSERT(inp); 1160 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1161 ("%s: inp %p dropped.", __func__, inp)); 1162 KASSERT(toep != NULL, ("%s: toep 
is NULL", __func__)); 1163 1164 /* hmmmm */ 1165 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 1166 ("%s: flowc for tid %u [%s] not sent already", 1167 __func__, toep->tid, tcpstates[tp->t_state])); 1168 1169 send_reset(sc, toep, 0); 1170 return (0); 1171 } 1172 1173 /* 1174 * Peer has sent us a FIN. 1175 */ 1176 static int 1177 do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1178 { 1179 struct adapter *sc = iq->adapter; 1180 const struct cpl_peer_close *cpl = (const void *)(rss + 1); 1181 unsigned int tid = GET_TID(cpl); 1182 struct toepcb *toep = lookup_tid(sc, tid); 1183 struct inpcb *inp = toep->inp; 1184 struct tcpcb *tp = NULL; 1185 struct socket *so; 1186 struct epoch_tracker et; 1187 #ifdef INVARIANTS 1188 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1189 #endif 1190 1191 KASSERT(opcode == CPL_PEER_CLOSE, 1192 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1193 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1194 1195 if (__predict_false(toep->flags & TPF_SYNQE)) { 1196 /* 1197 * do_pass_establish must have run before do_peer_close and if 1198 * this is still a synqe instead of a toepcb then the connection 1199 * must be getting aborted. 1200 */ 1201 MPASS(toep->flags & TPF_ABORT_SHUTDOWN); 1202 CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid, 1203 toep, toep->flags); 1204 return (0); 1205 } 1206 1207 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1208 1209 CURVNET_SET(toep->vnet); 1210 NET_EPOCH_ENTER(et); 1211 INP_WLOCK(inp); 1212 tp = intotcpcb(inp); 1213 1214 CTR6(KTR_CXGBE, 1215 "%s: tid %u (%s), toep_flags 0x%x, ddp_flags 0x%x, inp %p", 1216 __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags, 1217 toep->ddp.flags, inp); 1218 1219 if (toep->flags & TPF_ABORT_SHUTDOWN) 1220 goto done; 1221 1222 tp->rcv_nxt++; /* FIN */ 1223 1224 so = inp->inp_socket; 1225 socantrcvmore(so); 1226 if (ulp_mode(toep) == ULP_MODE_TCPDDP) { 1227 DDP_LOCK(toep); 1228 if (__predict_false(toep->ddp.flags & 1229 (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE))) 1230 handle_ddp_close(toep, tp, cpl->rcv_nxt); 1231 DDP_UNLOCK(toep); 1232 } 1233 1234 if (ulp_mode(toep) != ULP_MODE_RDMA) { 1235 KASSERT(tp->rcv_nxt == be32toh(cpl->rcv_nxt), 1236 ("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt, 1237 be32toh(cpl->rcv_nxt))); 1238 } 1239 1240 switch (tp->t_state) { 1241 case TCPS_SYN_RECEIVED: 1242 tp->t_starttime = ticks; 1243 /* FALLTHROUGH */ 1244 1245 case TCPS_ESTABLISHED: 1246 tcp_state_change(tp, TCPS_CLOSE_WAIT); 1247 break; 1248 1249 case TCPS_FIN_WAIT_1: 1250 tcp_state_change(tp, TCPS_CLOSING); 1251 break; 1252 1253 case TCPS_FIN_WAIT_2: 1254 tcp_twstart(tp); 1255 INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ 1256 NET_EPOCH_EXIT(et); 1257 CURVNET_RESTORE(); 1258 1259 INP_WLOCK(inp); 1260 final_cpl_received(toep); 1261 return (0); 1262 1263 default: 1264 log(LOG_ERR, "%s: TID %u received CPL_PEER_CLOSE in state %d\n", 1265 __func__, tid, tp->t_state); 1266 } 1267 done: 1268 INP_WUNLOCK(inp); 1269 NET_EPOCH_EXIT(et); 1270 CURVNET_RESTORE(); 1271 return (0); 1272 } 1273 1274 /* 1275 * Peer has ACK'd our FIN. 
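 * The reply carries the hardware's snd_nxt for the tid; snd_una is updated
 * from it below, with the sequence space consumed by the FIN excluded.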
1276 */ 1277 static int 1278 do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss, 1279 struct mbuf *m) 1280 { 1281 struct adapter *sc = iq->adapter; 1282 const struct cpl_close_con_rpl *cpl = (const void *)(rss + 1); 1283 unsigned int tid = GET_TID(cpl); 1284 struct toepcb *toep = lookup_tid(sc, tid); 1285 struct inpcb *inp = toep->inp; 1286 struct tcpcb *tp = NULL; 1287 struct socket *so = NULL; 1288 struct epoch_tracker et; 1289 #ifdef INVARIANTS 1290 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1291 #endif 1292 1293 KASSERT(opcode == CPL_CLOSE_CON_RPL, 1294 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1295 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1296 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1297 1298 CURVNET_SET(toep->vnet); 1299 NET_EPOCH_ENTER(et); 1300 INP_WLOCK(inp); 1301 tp = intotcpcb(inp); 1302 1303 CTR4(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x", 1304 __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags); 1305 1306 if (toep->flags & TPF_ABORT_SHUTDOWN) 1307 goto done; 1308 1309 so = inp->inp_socket; 1310 tp->snd_una = be32toh(cpl->snd_nxt) - 1; /* exclude FIN */ 1311 1312 switch (tp->t_state) { 1313 case TCPS_CLOSING: /* see TCPS_FIN_WAIT_2 in do_peer_close too */ 1314 tcp_twstart(tp); 1315 release: 1316 INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ 1317 NET_EPOCH_EXIT(et); 1318 CURVNET_RESTORE(); 1319 1320 INP_WLOCK(inp); 1321 final_cpl_received(toep); /* no more CPLs expected */ 1322 1323 return (0); 1324 case TCPS_LAST_ACK: 1325 if (tcp_close(tp)) 1326 INP_WUNLOCK(inp); 1327 goto release; 1328 1329 case TCPS_FIN_WAIT_1: 1330 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 1331 soisdisconnected(so); 1332 tcp_state_change(tp, TCPS_FIN_WAIT_2); 1333 break; 1334 1335 default: 1336 log(LOG_ERR, 1337 "%s: TID %u received CPL_CLOSE_CON_RPL in state %s\n", 1338 __func__, tid, tcpstates[tp->t_state]); 1339 } 1340 done: 1341 INP_WUNLOCK(inp); 1342 NET_EPOCH_EXIT(et); 1343 CURVNET_RESTORE(); 1344 return (0); 1345 } 1346 1347 void 1348 send_abort_rpl(struct adapter *sc, struct sge_wrq *ofld_txq, int tid, 1349 int rst_status) 1350 { 1351 struct wrqe *wr; 1352 struct cpl_abort_rpl *cpl; 1353 1354 wr = alloc_wrqe(sizeof(*cpl), ofld_txq); 1355 if (wr == NULL) { 1356 /* XXX */ 1357 panic("%s: allocation failure.", __func__); 1358 } 1359 cpl = wrtod(wr); 1360 1361 INIT_TP_WR_MIT_CPL(cpl, CPL_ABORT_RPL, tid); 1362 cpl->cmd = rst_status; 1363 1364 t4_wrq_tx(sc, wr); 1365 } 1366 1367 static int 1368 abort_status_to_errno(struct tcpcb *tp, unsigned int abort_reason) 1369 { 1370 switch (abort_reason) { 1371 case CPL_ERR_BAD_SYN: 1372 case CPL_ERR_CONN_RESET: 1373 return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET); 1374 case CPL_ERR_XMIT_TIMEDOUT: 1375 case CPL_ERR_PERSIST_TIMEDOUT: 1376 case CPL_ERR_FINWAIT2_TIMEDOUT: 1377 case CPL_ERR_KEEPALIVE_TIMEDOUT: 1378 return (ETIMEDOUT); 1379 default: 1380 return (EIO); 1381 } 1382 } 1383 1384 /* 1385 * TCP RST from the peer, timeout, or some other such critical error. 
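 * The hardware abort reason is mapped to an errno by abort_status_to_errno()
 * and, where a socket is still attached, reported via so_error before the
 * connection is torn down.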
1386 */ 1387 static int 1388 do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1389 { 1390 struct adapter *sc = iq->adapter; 1391 const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1); 1392 unsigned int tid = GET_TID(cpl); 1393 struct toepcb *toep = lookup_tid(sc, tid); 1394 struct sge_wrq *ofld_txq = toep->ofld_txq; 1395 struct inpcb *inp; 1396 struct tcpcb *tp; 1397 struct epoch_tracker et; 1398 #ifdef INVARIANTS 1399 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1400 #endif 1401 1402 KASSERT(opcode == CPL_ABORT_REQ_RSS, 1403 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1404 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1405 1406 if (toep->flags & TPF_SYNQE) 1407 return (do_abort_req_synqe(iq, rss, m)); 1408 1409 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1410 1411 if (negative_advice(cpl->status)) { 1412 CTR4(KTR_CXGBE, "%s: negative advice %d for tid %d (0x%x)", 1413 __func__, cpl->status, tid, toep->flags); 1414 return (0); /* Ignore negative advice */ 1415 } 1416 1417 inp = toep->inp; 1418 CURVNET_SET(toep->vnet); 1419 NET_EPOCH_ENTER(et); /* for tcp_close */ 1420 INP_WLOCK(inp); 1421 1422 tp = intotcpcb(inp); 1423 1424 CTR6(KTR_CXGBE, 1425 "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x, status %d", 1426 __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags, 1427 inp->inp_flags, cpl->status); 1428 1429 /* 1430 * If we'd initiated an abort earlier the reply to it is responsible for 1431 * cleaning up resources. Otherwise we tear everything down right here 1432 * right now. We owe the T4 a CPL_ABORT_RPL no matter what. 1433 */ 1434 if (toep->flags & TPF_ABORT_SHUTDOWN) { 1435 INP_WUNLOCK(inp); 1436 goto done; 1437 } 1438 toep->flags |= TPF_ABORT_SHUTDOWN; 1439 1440 if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) { 1441 struct socket *so = inp->inp_socket; 1442 1443 if (so != NULL) 1444 so_error_set(so, abort_status_to_errno(tp, 1445 cpl->status)); 1446 tp = tcp_close(tp); 1447 if (tp == NULL) 1448 INP_WLOCK(inp); /* re-acquire */ 1449 } 1450 1451 final_cpl_received(toep); 1452 done: 1453 NET_EPOCH_EXIT(et); 1454 CURVNET_RESTORE(); 1455 send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST); 1456 return (0); 1457 } 1458 1459 /* 1460 * Reply to the CPL_ABORT_REQ (send_reset) 1461 */ 1462 static int 1463 do_abort_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1464 { 1465 struct adapter *sc = iq->adapter; 1466 const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1); 1467 unsigned int tid = GET_TID(cpl); 1468 struct toepcb *toep = lookup_tid(sc, tid); 1469 struct inpcb *inp = toep->inp; 1470 #ifdef INVARIANTS 1471 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1472 #endif 1473 1474 KASSERT(opcode == CPL_ABORT_RPL_RSS, 1475 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1476 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1477 1478 if (toep->flags & TPF_SYNQE) 1479 return (do_abort_rpl_synqe(iq, rss, m)); 1480 1481 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1482 1483 CTR5(KTR_CXGBE, "%s: tid %u, toep %p, inp %p, status %d", 1484 __func__, tid, toep, inp, cpl->status); 1485 1486 KASSERT(toep->flags & TPF_ABORT_SHUTDOWN, 1487 ("%s: wasn't expecting abort reply", __func__)); 1488 1489 INP_WLOCK(inp); 1490 final_cpl_received(toep); 1491 1492 return (0); 1493 } 1494 1495 static int 1496 do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1497 { 1498 struct adapter *sc = 
iq->adapter; 1499 const struct cpl_rx_data *cpl = mtod(m, const void *); 1500 unsigned int tid = GET_TID(cpl); 1501 struct toepcb *toep = lookup_tid(sc, tid); 1502 struct inpcb *inp = toep->inp; 1503 struct tcpcb *tp; 1504 struct socket *so; 1505 struct sockbuf *sb; 1506 struct epoch_tracker et; 1507 int len, rx_credits; 1508 uint32_t ddp_placed = 0; 1509 1510 if (__predict_false(toep->flags & TPF_SYNQE)) { 1511 /* 1512 * do_pass_establish must have run before do_rx_data and if this 1513 * is still a synqe instead of a toepcb then the connection must 1514 * be getting aborted. 1515 */ 1516 MPASS(toep->flags & TPF_ABORT_SHUTDOWN); 1517 CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid, 1518 toep, toep->flags); 1519 m_freem(m); 1520 return (0); 1521 } 1522 1523 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1524 1525 /* strip off CPL header */ 1526 m_adj(m, sizeof(*cpl)); 1527 len = m->m_pkthdr.len; 1528 1529 INP_WLOCK(inp); 1530 if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) { 1531 CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x", 1532 __func__, tid, len, inp->inp_flags); 1533 INP_WUNLOCK(inp); 1534 m_freem(m); 1535 return (0); 1536 } 1537 1538 tp = intotcpcb(inp); 1539 1540 if (__predict_false(ulp_mode(toep) == ULP_MODE_TLS && 1541 toep->flags & TPF_TLS_RECEIVE)) { 1542 /* Received "raw" data on a TLS socket. */ 1543 CTR3(KTR_CXGBE, "%s: tid %u, raw TLS data (%d bytes)", 1544 __func__, tid, len); 1545 do_rx_data_tls(cpl, toep, m); 1546 return (0); 1547 } 1548 1549 if (__predict_false(tp->rcv_nxt != be32toh(cpl->seq))) 1550 ddp_placed = be32toh(cpl->seq) - tp->rcv_nxt; 1551 1552 tp->rcv_nxt += len; 1553 if (tp->rcv_wnd < len) { 1554 KASSERT(ulp_mode(toep) == ULP_MODE_RDMA, 1555 ("%s: negative window size", __func__)); 1556 } 1557 1558 tp->rcv_wnd -= len; 1559 tp->t_rcvtime = ticks; 1560 1561 if (ulp_mode(toep) == ULP_MODE_TCPDDP) 1562 DDP_LOCK(toep); 1563 so = inp_inpcbtosocket(inp); 1564 sb = &so->so_rcv; 1565 SOCKBUF_LOCK(sb); 1566 1567 if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) { 1568 CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)", 1569 __func__, tid, len); 1570 m_freem(m); 1571 SOCKBUF_UNLOCK(sb); 1572 if (ulp_mode(toep) == ULP_MODE_TCPDDP) 1573 DDP_UNLOCK(toep); 1574 INP_WUNLOCK(inp); 1575 1576 CURVNET_SET(toep->vnet); 1577 NET_EPOCH_ENTER(et); 1578 INP_WLOCK(inp); 1579 tp = tcp_drop(tp, ECONNRESET); 1580 if (tp) 1581 INP_WUNLOCK(inp); 1582 NET_EPOCH_EXIT(et); 1583 CURVNET_RESTORE(); 1584 1585 return (0); 1586 } 1587 1588 /* receive buffer autosize */ 1589 MPASS(toep->vnet == so->so_vnet); 1590 CURVNET_SET(toep->vnet); 1591 if (sb->sb_flags & SB_AUTOSIZE && 1592 V_tcp_do_autorcvbuf && 1593 sb->sb_hiwat < V_tcp_autorcvbuf_max && 1594 len > (sbspace(sb) / 8 * 7)) { 1595 unsigned int hiwat = sb->sb_hiwat; 1596 unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc, 1597 V_tcp_autorcvbuf_max); 1598 1599 if (!sbreserve_locked(sb, newsize, so, NULL)) 1600 sb->sb_flags &= ~SB_AUTOSIZE; 1601 } 1602 1603 if (ulp_mode(toep) == ULP_MODE_TCPDDP) { 1604 int changed = !(toep->ddp.flags & DDP_ON) ^ cpl->ddp_off; 1605 1606 if (toep->ddp.waiting_count != 0 || toep->ddp.active_count != 0) 1607 CTR3(KTR_CXGBE, "%s: tid %u, non-ddp rx (%d bytes)", 1608 __func__, tid, len); 1609 1610 if (changed) { 1611 if (toep->ddp.flags & DDP_SC_REQ) 1612 toep->ddp.flags ^= DDP_ON | DDP_SC_REQ; 1613 else { 1614 KASSERT(cpl->ddp_off == 1, 1615 ("%s: DDP switched on by itself.", 1616 __func__)); 1617 1618 /* Fell out of DDP mode */ 1619 toep->ddp.flags &= ~DDP_ON; 
1620 CTR1(KTR_CXGBE, "%s: fell out of DDP mode", 1621 __func__); 1622 1623 insert_ddp_data(toep, ddp_placed); 1624 } 1625 } 1626 1627 if (toep->ddp.flags & DDP_ON) { 1628 /* 1629 * CPL_RX_DATA with DDP on can only be an indicate. 1630 * Start posting queued AIO requests via DDP. The 1631 * payload that arrived in this indicate is appended 1632 * to the socket buffer as usual. 1633 */ 1634 handle_ddp_indicate(toep); 1635 } 1636 } 1637 1638 sbappendstream_locked(sb, m, 0); 1639 rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0; 1640 if (rx_credits > 0 && sbused(sb) + tp->rcv_wnd < sb->sb_lowat) { 1641 rx_credits = send_rx_credits(sc, toep, rx_credits); 1642 tp->rcv_wnd += rx_credits; 1643 tp->rcv_adv += rx_credits; 1644 } 1645 1646 if (ulp_mode(toep) == ULP_MODE_TCPDDP && toep->ddp.waiting_count > 0 && 1647 sbavail(sb) != 0) { 1648 CTR2(KTR_CXGBE, "%s: tid %u queueing AIO task", __func__, 1649 tid); 1650 ddp_queue_toep(toep); 1651 } 1652 sorwakeup_locked(so); 1653 SOCKBUF_UNLOCK_ASSERT(sb); 1654 if (ulp_mode(toep) == ULP_MODE_TCPDDP) 1655 DDP_UNLOCK(toep); 1656 1657 INP_WUNLOCK(inp); 1658 CURVNET_RESTORE(); 1659 return (0); 1660 } 1661 1662 static int 1663 do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1664 { 1665 struct adapter *sc = iq->adapter; 1666 const struct cpl_fw4_ack *cpl = (const void *)(rss + 1); 1667 unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl))); 1668 struct toepcb *toep = lookup_tid(sc, tid); 1669 struct inpcb *inp; 1670 struct tcpcb *tp; 1671 struct socket *so; 1672 uint8_t credits = cpl->credits; 1673 struct ofld_tx_sdesc *txsd; 1674 int plen; 1675 #ifdef INVARIANTS 1676 unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl))); 1677 #endif 1678 1679 /* 1680 * Very unusual case: we'd sent a flowc + abort_req for a synq entry and 1681 * now this comes back carrying the credits for the flowc. 
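 * The credits are ignored in that case; nothing else is expected for the
 * synq entry.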
 */
        if (__predict_false(toep->flags & TPF_SYNQE)) {
                KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
                    ("%s: credits for a synq entry %p", __func__, toep));
                return (0);
        }

        inp = toep->inp;

        KASSERT(opcode == CPL_FW4_ACK,
            ("%s: unexpected opcode 0x%x", __func__, opcode));
        KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
        KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

        INP_WLOCK(inp);

        if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) {
                INP_WUNLOCK(inp);
                return (0);
        }

        KASSERT((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0,
            ("%s: inp_flags 0x%x", __func__, inp->inp_flags));

        tp = intotcpcb(inp);

        if (cpl->flags & CPL_FW4_ACK_FLAGS_SEQVAL) {
                tcp_seq snd_una = be32toh(cpl->snd_una);

#ifdef INVARIANTS
                if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
                        log(LOG_ERR,
                            "%s: unexpected seq# %x for TID %u, snd_una %x\n",
                            __func__, snd_una, toep->tid, tp->snd_una);
                }
#endif

                if (tp->snd_una != snd_una) {
                        tp->snd_una = snd_una;
                        tp->ts_recent_age = tcp_ts_getticks();
                }
        }

#ifdef VERBOSE_TRACES
        CTR3(KTR_CXGBE, "%s: tid %d credits %u", __func__, tid, credits);
#endif
        so = inp->inp_socket;
        txsd = &toep->txsd[toep->txsd_cidx];
        plen = 0;
        while (credits) {
                KASSERT(credits >= txsd->tx_credits,
                    ("%s: too many (or partial) credits", __func__));
                credits -= txsd->tx_credits;
                toep->tx_credits += txsd->tx_credits;
                plen += txsd->plen;
                if (txsd->iv_buffer) {
                        free(txsd->iv_buffer, M_CXGBE);
                        txsd->iv_buffer = NULL;
                }
                txsd++;
                toep->txsd_avail++;
                KASSERT(toep->txsd_avail <= toep->txsd_total,
                    ("%s: txsd avail > total", __func__));
                if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) {
                        txsd = &toep->txsd[0];
                        toep->txsd_cidx = 0;
                }
        }

        if (toep->tx_credits == toep->tx_total) {
                toep->tx_nocompl = 0;
                toep->plen_nocompl = 0;
        }

        if (toep->flags & TPF_TX_SUSPENDED &&
            toep->tx_credits >= toep->tx_total / 4) {
#ifdef VERBOSE_TRACES
                CTR2(KTR_CXGBE, "%s: tid %d calling t4_push_frames", __func__,
                    tid);
#endif
                toep->flags &= ~TPF_TX_SUSPENDED;
                CURVNET_SET(toep->vnet);
                t4_push_data(sc, toep, plen);
                CURVNET_RESTORE();
        } else if (plen > 0) {
                struct sockbuf *sb = &so->so_snd;
                int sbu;

                SOCKBUF_LOCK(sb);
                sbu = sbused(sb);
                if (ulp_mode(toep) == ULP_MODE_ISCSI) {

                        if (__predict_false(sbu > 0)) {
                                /*
                                 * The data transmitted before the tid's ULP
                                 * mode changed to ISCSI is still in so_snd.
                                 * Incoming credits should account for so_snd
                                 * first.
1780 */ 1781 sbdrop_locked(sb, min(sbu, plen)); 1782 plen -= min(sbu, plen); 1783 } 1784 sowwakeup_locked(so); /* unlocks so_snd */ 1785 rqdrop_locked(&toep->ulp_pdu_reclaimq, plen); 1786 } else { 1787 #ifdef VERBOSE_TRACES 1788 CTR3(KTR_CXGBE, "%s: tid %d dropped %d bytes", __func__, 1789 tid, plen); 1790 #endif 1791 sbdrop_locked(sb, plen); 1792 if (tls_tx_key(toep) && 1793 toep->tls.mode == TLS_MODE_TLSOM) { 1794 struct tls_ofld_info *tls_ofld = &toep->tls; 1795 1796 MPASS(tls_ofld->sb_off >= plen); 1797 tls_ofld->sb_off -= plen; 1798 } 1799 if (!TAILQ_EMPTY(&toep->aiotx_jobq)) 1800 t4_aiotx_queue_toep(so, toep); 1801 sowwakeup_locked(so); /* unlocks so_snd */ 1802 } 1803 SOCKBUF_UNLOCK_ASSERT(sb); 1804 } 1805 1806 INP_WUNLOCK(inp); 1807 1808 return (0); 1809 } 1810 1811 void 1812 t4_set_tcb_field(struct adapter *sc, struct sge_wrq *wrq, struct toepcb *toep, 1813 uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie) 1814 { 1815 struct wrqe *wr; 1816 struct cpl_set_tcb_field *req; 1817 struct ofld_tx_sdesc *txsd; 1818 1819 MPASS((cookie & ~M_COOKIE) == 0); 1820 if (reply) { 1821 MPASS(cookie != CPL_COOKIE_RESERVED); 1822 } 1823 1824 wr = alloc_wrqe(sizeof(*req), wrq); 1825 if (wr == NULL) { 1826 /* XXX */ 1827 panic("%s: allocation failure.", __func__); 1828 } 1829 req = wrtod(wr); 1830 1831 INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, toep->tid); 1832 req->reply_ctrl = htobe16(V_QUEUENO(toep->ofld_rxq->iq.abs_id)); 1833 if (reply == 0) 1834 req->reply_ctrl |= htobe16(F_NO_REPLY); 1835 req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(cookie)); 1836 req->mask = htobe64(mask); 1837 req->val = htobe64(val); 1838 if ((wrq->eq.flags & EQ_TYPEMASK) == EQ_OFLD) { 1839 txsd = &toep->txsd[toep->txsd_pidx]; 1840 txsd->tx_credits = howmany(sizeof(*req), 16); 1841 txsd->plen = 0; 1842 KASSERT(toep->tx_credits >= txsd->tx_credits && 1843 toep->txsd_avail > 0, 1844 ("%s: not enough credits (%d)", __func__, 1845 toep->tx_credits)); 1846 toep->tx_credits -= txsd->tx_credits; 1847 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) 1848 toep->txsd_pidx = 0; 1849 toep->txsd_avail--; 1850 } 1851 1852 t4_wrq_tx(sc, wr); 1853 } 1854 1855 void 1856 t4_init_cpl_io_handlers(void) 1857 { 1858 1859 t4_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close); 1860 t4_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl); 1861 t4_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req); 1862 t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl, 1863 CPL_COOKIE_TOM); 1864 t4_register_cpl_handler(CPL_RX_DATA, do_rx_data); 1865 t4_register_shared_cpl_handler(CPL_FW4_ACK, do_fw4_ack, CPL_COOKIE_TOM); 1866 } 1867 1868 void 1869 t4_uninit_cpl_io_handlers(void) 1870 { 1871 1872 t4_register_cpl_handler(CPL_PEER_CLOSE, NULL); 1873 t4_register_cpl_handler(CPL_CLOSE_CON_RPL, NULL); 1874 t4_register_cpl_handler(CPL_ABORT_REQ_RSS, NULL); 1875 t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, NULL, CPL_COOKIE_TOM); 1876 t4_register_cpl_handler(CPL_RX_DATA, NULL); 1877 t4_register_shared_cpl_handler(CPL_FW4_ACK, NULL, CPL_COOKIE_TOM); 1878 } 1879 1880 /* 1881 * Use the 'backend1' field in AIO jobs to hold an error that should 1882 * be reported when the job is completed, the 'backend3' field to 1883 * store the amount of data sent by the AIO job so far, and the 1884 * 'backend4' field to hold a reference count on the job. 1885 * 1886 * Each unmapped mbuf holds a reference on the job as does the queue 1887 * so long as the job is queued. 
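 * aiotx_free_job() drops a reference and completes (or cancels) the job
 * once the last reference goes away.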
1888 */ 1889 #define aio_error backend1 1890 #define aio_sent backend3 1891 #define aio_refs backend4 1892 1893 #define jobtotid(job) \ 1894 (((struct toepcb *)(so_sototcpcb((job)->fd_file->f_data)->t_toe))->tid) 1895 1896 static void 1897 aiotx_free_job(struct kaiocb *job) 1898 { 1899 long status; 1900 int error; 1901 1902 if (refcount_release(&job->aio_refs) == 0) 1903 return; 1904 1905 error = (intptr_t)job->aio_error; 1906 status = job->aio_sent; 1907 #ifdef VERBOSE_TRACES 1908 CTR5(KTR_CXGBE, "%s: tid %d completed %p len %ld, error %d", __func__, 1909 jobtotid(job), job, status, error); 1910 #endif 1911 if (error != 0 && status != 0) 1912 error = 0; 1913 if (error == ECANCELED) 1914 aio_cancel(job); 1915 else if (error) 1916 aio_complete(job, -1, error); 1917 else { 1918 job->msgsnd = 1; 1919 aio_complete(job, status, 0); 1920 } 1921 } 1922 1923 static void 1924 aiotx_free_pgs(struct mbuf *m) 1925 { 1926 struct kaiocb *job; 1927 vm_page_t pg; 1928 1929 M_ASSERTEXTPG(m); 1930 job = m->m_ext.ext_arg1; 1931 #ifdef VERBOSE_TRACES 1932 CTR3(KTR_CXGBE, "%s: completed %d bytes for tid %d", __func__, 1933 m->m_len, jobtotid(job)); 1934 #endif 1935 1936 for (int i = 0; i < m->m_epg_npgs; i++) { 1937 pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]); 1938 vm_page_unwire(pg, PQ_ACTIVE); 1939 } 1940 1941 aiotx_free_job(job); 1942 } 1943 1944 /* 1945 * Allocate a chain of unmapped mbufs describing the next 'len' bytes 1946 * of an AIO job. 1947 */ 1948 static struct mbuf * 1949 alloc_aiotx_mbuf(struct kaiocb *job, int len) 1950 { 1951 struct vmspace *vm; 1952 vm_page_t pgs[MBUF_PEXT_MAX_PGS]; 1953 struct mbuf *m, *top, *last; 1954 vm_map_t map; 1955 vm_offset_t start; 1956 int i, mlen, npages, pgoff; 1957 1958 KASSERT(job->aio_sent + len <= job->uaiocb.aio_nbytes, 1959 ("%s(%p, %d): request to send beyond end of buffer", __func__, 1960 job, len)); 1961 1962 /* 1963 * The AIO subsystem will cancel and drain all requests before 1964 * permitting a process to exit or exec, so p_vmspace should 1965 * be stable here. 
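 * The user pages are held via vm_fault_quick_hold_pages() and released in
 * aiotx_free_pgs() once each unmapped mbuf is freed.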

static void
aiotx_free_job(struct kaiocb *job)
{
	long status;
	int error;

	if (refcount_release(&job->aio_refs) == 0)
		return;

	error = (intptr_t)job->aio_error;
	status = job->aio_sent;
#ifdef VERBOSE_TRACES
	CTR5(KTR_CXGBE, "%s: tid %d completed %p len %ld, error %d", __func__,
	    jobtotid(job), job, status, error);
#endif
	if (error != 0 && status != 0)
		error = 0;
	if (error == ECANCELED)
		aio_cancel(job);
	else if (error)
		aio_complete(job, -1, error);
	else {
		job->msgsnd = 1;
		aio_complete(job, status, 0);
	}
}

static void
aiotx_free_pgs(struct mbuf *m)
{
	struct kaiocb *job;
	vm_page_t pg;

	M_ASSERTEXTPG(m);
	job = m->m_ext.ext_arg1;
#ifdef VERBOSE_TRACES
	CTR3(KTR_CXGBE, "%s: completed %d bytes for tid %d", __func__,
	    m->m_len, jobtotid(job));
#endif

	for (int i = 0; i < m->m_epg_npgs; i++) {
		pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]);
		vm_page_unwire(pg, PQ_ACTIVE);
	}

	aiotx_free_job(job);
}

/*
 * Allocate a chain of unmapped mbufs describing the next 'len' bytes
 * of an AIO job.
 */
static struct mbuf *
alloc_aiotx_mbuf(struct kaiocb *job, int len)
{
	struct vmspace *vm;
	vm_page_t pgs[MBUF_PEXT_MAX_PGS];
	struct mbuf *m, *top, *last;
	vm_map_t map;
	vm_offset_t start;
	int i, mlen, npages, pgoff;

	KASSERT(job->aio_sent + len <= job->uaiocb.aio_nbytes,
	    ("%s(%p, %d): request to send beyond end of buffer", __func__,
	    job, len));

	/*
	 * The AIO subsystem will cancel and drain all requests before
	 * permitting a process to exit or exec, so p_vmspace should
	 * be stable here.
	 */
	vm = job->userproc->p_vmspace;
	map = &vm->vm_map;
	start = (uintptr_t)job->uaiocb.aio_buf + job->aio_sent;
	pgoff = start & PAGE_MASK;

	top = NULL;
	last = NULL;
	while (len > 0) {
		mlen = imin(len, MBUF_PEXT_MAX_PGS * PAGE_SIZE - pgoff);
		KASSERT(mlen == len || ((start + mlen) & PAGE_MASK) == 0,
		    ("%s: next start (%#jx + %#x) is not page aligned",
		    __func__, (uintmax_t)start, mlen));

		npages = vm_fault_quick_hold_pages(map, start, mlen,
		    VM_PROT_WRITE, pgs, nitems(pgs));
		if (npages < 0)
			break;

		m = mb_alloc_ext_pgs(M_WAITOK, aiotx_free_pgs);
		if (m == NULL) {
			vm_page_unhold_pages(pgs, npages);
			break;
		}

		m->m_epg_1st_off = pgoff;
		m->m_epg_npgs = npages;
		if (npages == 1) {
			KASSERT(mlen + pgoff <= PAGE_SIZE,
			    ("%s: single page is too large (off %d len %d)",
			    __func__, pgoff, mlen));
			m->m_epg_last_len = mlen;
		} else {
			m->m_epg_last_len = mlen - (PAGE_SIZE - pgoff) -
			    (npages - 2) * PAGE_SIZE;
		}
		for (i = 0; i < npages; i++)
			m->m_epg_pa[i] = VM_PAGE_TO_PHYS(pgs[i]);

		m->m_len = mlen;
		m->m_ext.ext_size = npages * PAGE_SIZE;
		m->m_ext.ext_arg1 = job;
		refcount_acquire(&job->aio_refs);

#ifdef VERBOSE_TRACES
		CTR5(KTR_CXGBE, "%s: tid %d, new mbuf %p for job %p, npages %d",
		    __func__, jobtotid(job), m, job, npages);
#endif

		if (top == NULL)
			top = m;
		else
			last->m_next = m;
		last = m;

		len -= mlen;
		start += mlen;
		pgoff = 0;
	}

	return (top);
}
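
/*
 * Worked example for the m_epg accounting above, using hypothetical
 * addresses and sizes (4KB pages, and a request small enough to fit in
 * a single ext_pgs mbuf, i.e. len <= MBUF_PEXT_MAX_PGS * PAGE_SIZE - pgoff):
 *
 *   start = aio_buf + aio_sent = 0x800403210  =>  pgoff = 0x210 (528)
 *   len = mlen = 20000
 *   bytes spanned from page start: 528 + 20000 = 20528  =>  npages = 6
 *   m_epg_1st_off  = 528
 *   m_epg_last_len = 20000 - (4096 - 528) - (6 - 2) * 4096 = 48
 *
 * That is, the first page contributes 3568 bytes, the four middle pages
 * 4096 each, and the last page 48, for a total of 20000 = mlen.
 */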

static void
t4_aiotx_process_job(struct toepcb *toep, struct socket *so, struct kaiocb *job)
{
	struct sockbuf *sb;
	struct file *fp;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct mbuf *m;
	int error, len;
	bool moretocome, sendmore;

	sb = &so->so_snd;
	SOCKBUF_UNLOCK(sb);
	fp = job->fd_file;
	m = NULL;

#ifdef MAC
	error = mac_socket_check_send(fp->f_cred, so);
	if (error != 0)
		goto out;
#endif

	/* Inline sosend_generic(). */

	error = sblock(sb, SBL_WAIT);
	MPASS(error == 0);

sendanother:
	SOCKBUF_LOCK(sb);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		if ((so->so_options & SO_NOSIGPIPE) == 0) {
			PROC_LOCK(job->userproc);
			kern_psignal(job->userproc, SIGPIPE);
			PROC_UNLOCK(job->userproc);
		}
		error = EPIPE;
		goto out;
	}
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		goto out;
	}
	if ((so->so_state & SS_ISCONNECTED) == 0) {
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		error = ENOTCONN;
		goto out;
	}
	if (sbspace(sb) < sb->sb_lowat) {
		MPASS(job->aio_sent == 0 || !(so->so_state & SS_NBIO));

		/*
		 * Don't block if there is too little room in the socket
		 * buffer.  Instead, requeue the request.
		 */
		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
			SOCKBUF_UNLOCK(sb);
			sbunlock(sb);
			error = ECANCELED;
			goto out;
		}
		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		goto out;
	}

	/*
	 * Write as much data as the socket permits, but no more than
	 * a single sndbuf at a time.
	 */
	len = sbspace(sb);
	if (len > job->uaiocb.aio_nbytes - job->aio_sent) {
		len = job->uaiocb.aio_nbytes - job->aio_sent;
		moretocome = false;
	} else
		moretocome = true;
	if (len > toep->params.sndbuf) {
		len = toep->params.sndbuf;
		sendmore = true;
	} else
		sendmore = false;

	if (!TAILQ_EMPTY(&toep->aiotx_jobq))
		moretocome = true;
	SOCKBUF_UNLOCK(sb);
	MPASS(len != 0);

	m = alloc_aiotx_mbuf(job, len);
	if (m == NULL) {
		sbunlock(sb);
		error = EFAULT;
		goto out;
	}

	/* Inlined tcp_usr_send(). */

	inp = toep->inp;
	INP_WLOCK(inp);
	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
		INP_WUNLOCK(inp);
		sbunlock(sb);
		error = ECONNRESET;
		goto out;
	}

	job->aio_sent += m_length(m, NULL);

	sbappendstream(sb, m, 0);
	m = NULL;

	if (!(inp->inp_flags & INP_DROPPED)) {
		tp = intotcpcb(inp);
		if (moretocome)
			tp->t_flags |= TF_MORETOCOME;
		error = tp->t_fb->tfb_tcp_output(tp);
		if (moretocome)
			tp->t_flags &= ~TF_MORETOCOME;
	}

	INP_WUNLOCK(inp);
	if (sendmore)
		goto sendanother;
	sbunlock(sb);

	if (error)
		goto out;

	/*
	 * If this is a blocking socket and the request has not been
	 * fully completed, requeue it until the socket is ready
	 * again.
	 */
	if (job->aio_sent < job->uaiocb.aio_nbytes &&
	    !(so->so_state & SS_NBIO)) {
		SOCKBUF_LOCK(sb);
		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
			SOCKBUF_UNLOCK(sb);
			error = ECANCELED;
			goto out;
		}
		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
		return;
	}

	/*
	 * If the request will not be requeued, drop the queue's
	 * reference to the job.  Any mbufs in flight should still
	 * hold a reference, but this drops the reference that the
	 * queue owns while it is waiting to queue mbufs to the
	 * socket.
	 */
	aiotx_free_job(job);

out:
	if (error) {
		job->aio_error = (void *)(intptr_t)error;
		aiotx_free_job(job);
	}
	if (m != NULL)
		m_free(m);
	SOCKBUF_LOCK(sb);
}
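
/*
 * t4_aiotx_process_job() above is only invoked from t4_aiotx_task() below,
 * with so_snd locked; it drops that lock while it wires pages and hands
 * data to TCP, and reacquires it before returning.  A job that cannot make
 * progress (socket buffer below sb_lowat, or a blocking socket that has
 * not yet been written in full) is pushed back onto the head of aiotx_jobq
 * with t4_aiotx_cancel re-armed as its cancel routine, so the task loop or
 * a later write-space wakeup can pick it up again.
 */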
"true" : "false"); 2233 #endif 2234 if (toep->aiotx_so != NULL) 2235 return; 2236 soref(so); 2237 toep->aiotx_so = so; 2238 hold_toepcb(toep); 2239 soaio_enqueue(&toep->aiotx_task); 2240 } 2241 2242 static void 2243 t4_aiotx_cancel(struct kaiocb *job) 2244 { 2245 struct socket *so; 2246 struct sockbuf *sb; 2247 struct tcpcb *tp; 2248 struct toepcb *toep; 2249 2250 so = job->fd_file->f_data; 2251 tp = so_sototcpcb(so); 2252 toep = tp->t_toe; 2253 MPASS(job->uaiocb.aio_lio_opcode == LIO_WRITE); 2254 sb = &so->so_snd; 2255 2256 SOCKBUF_LOCK(sb); 2257 if (!aio_cancel_cleared(job)) 2258 TAILQ_REMOVE(&toep->aiotx_jobq, job, list); 2259 SOCKBUF_UNLOCK(sb); 2260 2261 job->aio_error = (void *)(intptr_t)ECANCELED; 2262 aiotx_free_job(job); 2263 } 2264 2265 int 2266 t4_aio_queue_aiotx(struct socket *so, struct kaiocb *job) 2267 { 2268 struct tcpcb *tp = so_sototcpcb(so); 2269 struct toepcb *toep = tp->t_toe; 2270 struct adapter *sc = td_adapter(toep->td); 2271 2272 /* This only handles writes. */ 2273 if (job->uaiocb.aio_lio_opcode != LIO_WRITE) 2274 return (EOPNOTSUPP); 2275 2276 if (!sc->tt.tx_zcopy) 2277 return (EOPNOTSUPP); 2278 2279 if (tls_tx_key(toep)) 2280 return (EOPNOTSUPP); 2281 2282 SOCKBUF_LOCK(&so->so_snd); 2283 #ifdef VERBOSE_TRACES 2284 CTR3(KTR_CXGBE, "%s: queueing %p for tid %u", __func__, job, toep->tid); 2285 #endif 2286 if (!aio_set_cancel_function(job, t4_aiotx_cancel)) 2287 panic("new job was cancelled"); 2288 refcount_init(&job->aio_refs, 1); 2289 TAILQ_INSERT_TAIL(&toep->aiotx_jobq, job, list); 2290 if (sowriteable(so)) 2291 t4_aiotx_queue_toep(so, toep); 2292 SOCKBUF_UNLOCK(&so->so_snd); 2293 return (0); 2294 } 2295 2296 void 2297 aiotx_init_toep(struct toepcb *toep) 2298 { 2299 2300 TAILQ_INIT(&toep->aiotx_jobq); 2301 TASK_INIT(&toep->aiotx_task, 0, t4_aiotx_task, toep); 2302 } 2303 #endif 2304