1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2012, 2015 Chelsio Communications, Inc. 5 * All rights reserved. 6 * Written by: Navdeep Parhar <np@FreeBSD.org> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 #include <sys/cdefs.h> 31 __FBSDID("$FreeBSD$"); 32 33 #include "opt_inet.h" 34 #include "opt_inet6.h" 35 #include "opt_kern_tls.h" 36 #include "opt_ratelimit.h" 37 38 #ifdef TCP_OFFLOAD 39 #include <sys/param.h> 40 #include <sys/aio.h> 41 #include <sys/file.h> 42 #include <sys/kernel.h> 43 #include <sys/ktr.h> 44 #include <sys/module.h> 45 #include <sys/proc.h> 46 #include <sys/protosw.h> 47 #include <sys/domain.h> 48 #include <sys/socket.h> 49 #include <sys/socketvar.h> 50 #include <sys/sglist.h> 51 #include <sys/taskqueue.h> 52 #include <netinet/in.h> 53 #include <netinet/in_pcb.h> 54 #include <netinet/ip.h> 55 #include <netinet/ip6.h> 56 #define TCPSTATES 57 #include <netinet/tcp_fsm.h> 58 #include <netinet/tcp_seq.h> 59 #include <netinet/tcp_var.h> 60 #include <netinet/toecore.h> 61 62 #include <security/mac/mac_framework.h> 63 64 #include <vm/vm.h> 65 #include <vm/vm_extern.h> 66 #include <vm/pmap.h> 67 #include <vm/vm_map.h> 68 #include <vm/vm_page.h> 69 70 #include "common/common.h" 71 #include "common/t4_msg.h" 72 #include "common/t4_regs.h" 73 #include "common/t4_tcb.h" 74 #include "tom/t4_tom_l2t.h" 75 #include "tom/t4_tom.h" 76 77 static void t4_aiotx_cancel(struct kaiocb *job); 78 static void t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep); 79 80 void 81 send_flowc_wr(struct toepcb *toep, struct tcpcb *tp) 82 { 83 struct wrqe *wr; 84 struct fw_flowc_wr *flowc; 85 unsigned int nparams, flowclen, paramidx; 86 struct vi_info *vi = toep->vi; 87 struct port_info *pi = vi->pi; 88 struct adapter *sc = pi->adapter; 89 unsigned int pfvf = sc->pf << S_FW_VIID_PFN; 90 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; 91 92 KASSERT(!(toep->flags & TPF_FLOWC_WR_SENT), 93 ("%s: flowc for tid %u sent already", __func__, toep->tid)); 94 95 if (tp != NULL) 96 nparams = 8; 97 else 98 nparams = 6; 99 if (ulp_mode(toep) == ULP_MODE_TLS) 100 nparams++; 101 if (toep->tls.fcplenmax != 0) 102 nparams++; 103 if (toep->params.tc_idx != -1) { 104 MPASS(toep->params.tc_idx >= 0 && 105 
toep->params.tc_idx < sc->chip_params->nsched_cls); 106 nparams++; 107 } 108 109 flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval); 110 111 wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq); 112 if (wr == NULL) { 113 /* XXX */ 114 panic("%s: allocation failure.", __func__); 115 } 116 flowc = wrtod(wr); 117 memset(flowc, 0, wr->wr_len); 118 119 flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | 120 V_FW_FLOWC_WR_NPARAMS(nparams)); 121 flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) | 122 V_FW_WR_FLOWID(toep->tid)); 123 124 #define FLOWC_PARAM(__m, __v) \ 125 do { \ 126 flowc->mnemval[paramidx].mnemonic = FW_FLOWC_MNEM_##__m; \ 127 flowc->mnemval[paramidx].val = htobe32(__v); \ 128 paramidx++; \ 129 } while (0) 130 131 paramidx = 0; 132 133 FLOWC_PARAM(PFNVFN, pfvf); 134 FLOWC_PARAM(CH, pi->tx_chan); 135 FLOWC_PARAM(PORT, pi->tx_chan); 136 FLOWC_PARAM(IQID, toep->ofld_rxq->iq.abs_id); 137 FLOWC_PARAM(SNDBUF, toep->params.sndbuf); 138 if (tp) { 139 FLOWC_PARAM(MSS, toep->params.emss); 140 FLOWC_PARAM(SNDNXT, tp->snd_nxt); 141 FLOWC_PARAM(RCVNXT, tp->rcv_nxt); 142 } else 143 FLOWC_PARAM(MSS, 512); 144 CTR6(KTR_CXGBE, 145 "%s: tid %u, mss %u, sndbuf %u, snd_nxt 0x%x, rcv_nxt 0x%x", 146 __func__, toep->tid, toep->params.emss, toep->params.sndbuf, 147 tp ? tp->snd_nxt : 0, tp ? tp->rcv_nxt : 0); 148 149 if (ulp_mode(toep) == ULP_MODE_TLS) 150 FLOWC_PARAM(ULP_MODE, ulp_mode(toep)); 151 if (toep->tls.fcplenmax != 0) 152 FLOWC_PARAM(TXDATAPLEN_MAX, toep->tls.fcplenmax); 153 if (toep->params.tc_idx != -1) 154 FLOWC_PARAM(SCHEDCLASS, toep->params.tc_idx); 155 #undef FLOWC_PARAM 156 157 KASSERT(paramidx == nparams, ("nparams mismatch")); 158 159 txsd->tx_credits = howmany(flowclen, 16); 160 txsd->plen = 0; 161 KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0, 162 ("%s: not enough credits (%d)", __func__, toep->tx_credits)); 163 toep->tx_credits -= txsd->tx_credits; 164 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) 165 toep->txsd_pidx = 0; 166 toep->txsd_avail--; 167 168 toep->flags |= TPF_FLOWC_WR_SENT; 169 t4_wrq_tx(sc, wr); 170 } 171 172 #ifdef RATELIMIT 173 /* 174 * Input is Bytes/second (so_max_pacing_rate), chip counts in Kilobits/second. 
175 */ 176 static int 177 update_tx_rate_limit(struct adapter *sc, struct toepcb *toep, u_int Bps) 178 { 179 int tc_idx, rc; 180 const u_int kbps = (u_int) (uint64_t)Bps * 8ULL / 1000; 181 const int port_id = toep->vi->pi->port_id; 182 183 CTR3(KTR_CXGBE, "%s: tid %u, rate %uKbps", __func__, toep->tid, kbps); 184 185 if (kbps == 0) { 186 /* unbind */ 187 tc_idx = -1; 188 } else { 189 rc = t4_reserve_cl_rl_kbps(sc, port_id, kbps, &tc_idx); 190 if (rc != 0) 191 return (rc); 192 MPASS(tc_idx >= 0 && tc_idx < sc->chip_params->nsched_cls); 193 } 194 195 if (toep->params.tc_idx != tc_idx) { 196 struct wrqe *wr; 197 struct fw_flowc_wr *flowc; 198 int nparams = 1, flowclen, flowclen16; 199 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; 200 201 flowclen = sizeof(*flowc) + nparams * sizeof(struct 202 fw_flowc_mnemval); 203 flowclen16 = howmany(flowclen, 16); 204 if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0 || 205 (wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq)) == NULL) { 206 if (tc_idx >= 0) 207 t4_release_cl_rl(sc, port_id, tc_idx); 208 return (ENOMEM); 209 } 210 211 flowc = wrtod(wr); 212 memset(flowc, 0, wr->wr_len); 213 214 flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | 215 V_FW_FLOWC_WR_NPARAMS(nparams)); 216 flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) | 217 V_FW_WR_FLOWID(toep->tid)); 218 219 flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS; 220 if (tc_idx == -1) 221 flowc->mnemval[0].val = htobe32(0xff); 222 else 223 flowc->mnemval[0].val = htobe32(tc_idx); 224 225 txsd->tx_credits = flowclen16; 226 txsd->plen = 0; 227 toep->tx_credits -= txsd->tx_credits; 228 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) 229 toep->txsd_pidx = 0; 230 toep->txsd_avail--; 231 t4_wrq_tx(sc, wr); 232 } 233 234 if (toep->params.tc_idx >= 0) 235 t4_release_cl_rl(sc, port_id, toep->params.tc_idx); 236 toep->params.tc_idx = tc_idx; 237 238 return (0); 239 } 240 #endif 241 242 void 243 send_reset(struct adapter *sc, struct toepcb *toep, uint32_t snd_nxt) 244 { 245 struct wrqe *wr; 246 struct cpl_abort_req *req; 247 int tid = toep->tid; 248 struct inpcb *inp = toep->inp; 249 struct tcpcb *tp = intotcpcb(inp); /* don't use if INP_DROPPED */ 250 251 INP_WLOCK_ASSERT(inp); 252 253 CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x%s", 254 __func__, toep->tid, 255 inp->inp_flags & INP_DROPPED ? "inp dropped" : 256 tcpstates[tp->t_state], 257 toep->flags, inp->inp_flags, 258 toep->flags & TPF_ABORT_SHUTDOWN ? 259 " (abort already in progress)" : ""); 260 261 if (toep->flags & TPF_ABORT_SHUTDOWN) 262 return; /* abort already in progress */ 263 264 toep->flags |= TPF_ABORT_SHUTDOWN; 265 266 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 267 ("%s: flowc_wr not sent for tid %d.", __func__, tid)); 268 269 wr = alloc_wrqe(sizeof(*req), toep->ofld_txq); 270 if (wr == NULL) { 271 /* XXX */ 272 panic("%s: allocation failure.", __func__); 273 } 274 req = wrtod(wr); 275 276 INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid); 277 if (inp->inp_flags & INP_DROPPED) 278 req->rsvd0 = htobe32(snd_nxt); 279 else 280 req->rsvd0 = htobe32(tp->snd_nxt); 281 req->rsvd1 = !(toep->flags & TPF_TX_DATA_SENT); 282 req->cmd = CPL_ABORT_SEND_RST; 283 284 /* 285 * XXX: What's the correct way to tell that the inp hasn't been detached 286 * from its socket? Should I even be flushing the snd buffer here? 287 */ 288 if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) { 289 struct socket *so = inp->inp_socket; 290 291 if (so != NULL) /* because I'm not sure. 
See comment above */ 292 sbflush(&so->so_snd); 293 } 294 295 t4_l2t_send(sc, wr, toep->l2te); 296 } 297 298 /* 299 * Called when a connection is established to translate the TCP options 300 * reported by HW to FreeBSD's native format. 301 */ 302 static void 303 assign_rxopt(struct tcpcb *tp, uint16_t opt) 304 { 305 struct toepcb *toep = tp->t_toe; 306 struct inpcb *inp = tp->t_inpcb; 307 struct adapter *sc = td_adapter(toep->td); 308 309 INP_LOCK_ASSERT(inp); 310 311 toep->params.mtu_idx = G_TCPOPT_MSS(opt); 312 tp->t_maxseg = sc->params.mtus[toep->params.mtu_idx]; 313 if (inp->inp_inc.inc_flags & INC_ISIPV6) 314 tp->t_maxseg -= sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 315 else 316 tp->t_maxseg -= sizeof(struct ip) + sizeof(struct tcphdr); 317 318 toep->params.emss = tp->t_maxseg; 319 if (G_TCPOPT_TSTAMP(opt)) { 320 toep->params.tstamp = 1; 321 toep->params.emss -= TCPOLEN_TSTAMP_APPA; 322 tp->t_flags |= TF_RCVD_TSTMP; /* timestamps ok */ 323 tp->ts_recent = 0; /* hmmm */ 324 tp->ts_recent_age = tcp_ts_getticks(); 325 } else 326 toep->params.tstamp = 0; 327 328 if (G_TCPOPT_SACK(opt)) { 329 toep->params.sack = 1; 330 tp->t_flags |= TF_SACK_PERMIT; /* should already be set */ 331 } else { 332 toep->params.sack = 0; 333 tp->t_flags &= ~TF_SACK_PERMIT; /* sack disallowed by peer */ 334 } 335 336 if (G_TCPOPT_WSCALE_OK(opt)) 337 tp->t_flags |= TF_RCVD_SCALE; 338 339 /* Doing window scaling? */ 340 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 341 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 342 tp->rcv_scale = tp->request_r_scale; 343 tp->snd_scale = G_TCPOPT_SND_WSCALE(opt); 344 } else 345 toep->params.wscale = 0; 346 347 CTR6(KTR_CXGBE, 348 "assign_rxopt: tid %d, mtu_idx %u, emss %u, ts %u, sack %u, wscale %u", 349 toep->tid, toep->params.mtu_idx, toep->params.emss, 350 toep->params.tstamp, toep->params.sack, toep->params.wscale); 351 } 352 353 /* 354 * Completes some final bits of initialization for just established connections 355 * and changes their state to TCPS_ESTABLISHED. 356 * 357 * The ISNs are from the exchange of SYNs. 
358 */ 359 void 360 make_established(struct toepcb *toep, uint32_t iss, uint32_t irs, uint16_t opt) 361 { 362 struct inpcb *inp = toep->inp; 363 struct socket *so = inp->inp_socket; 364 struct tcpcb *tp = intotcpcb(inp); 365 uint16_t tcpopt = be16toh(opt); 366 367 INP_WLOCK_ASSERT(inp); 368 KASSERT(tp->t_state == TCPS_SYN_SENT || 369 tp->t_state == TCPS_SYN_RECEIVED, 370 ("%s: TCP state %s", __func__, tcpstates[tp->t_state])); 371 372 CTR6(KTR_CXGBE, "%s: tid %d, so %p, inp %p, tp %p, toep %p", 373 __func__, toep->tid, so, inp, tp, toep); 374 375 tcp_state_change(tp, TCPS_ESTABLISHED); 376 tp->t_starttime = ticks; 377 TCPSTAT_INC(tcps_connects); 378 379 tp->irs = irs; 380 tcp_rcvseqinit(tp); 381 tp->rcv_wnd = (u_int)toep->params.opt0_bufsize << 10; 382 tp->rcv_adv += tp->rcv_wnd; 383 tp->last_ack_sent = tp->rcv_nxt; 384 385 tp->iss = iss; 386 tcp_sendseqinit(tp); 387 tp->snd_una = iss + 1; 388 tp->snd_nxt = iss + 1; 389 tp->snd_max = iss + 1; 390 391 assign_rxopt(tp, tcpopt); 392 send_flowc_wr(toep, tp); 393 394 soisconnected(so); 395 396 if (ulp_mode(toep) == ULP_MODE_TLS) 397 tls_establish(toep); 398 } 399 400 int 401 send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits) 402 { 403 struct wrqe *wr; 404 struct cpl_rx_data_ack *req; 405 uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); 406 407 KASSERT(credits >= 0, ("%s: %d credits", __func__, credits)); 408 409 wr = alloc_wrqe(sizeof(*req), toep->ctrlq); 410 if (wr == NULL) 411 return (0); 412 req = wrtod(wr); 413 414 INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid); 415 req->credit_dack = htobe32(dack | V_RX_CREDITS(credits)); 416 417 t4_wrq_tx(sc, wr); 418 return (credits); 419 } 420 421 void 422 send_rx_modulate(struct adapter *sc, struct toepcb *toep) 423 { 424 struct wrqe *wr; 425 struct cpl_rx_data_ack *req; 426 427 wr = alloc_wrqe(sizeof(*req), toep->ctrlq); 428 if (wr == NULL) 429 return; 430 req = wrtod(wr); 431 432 INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid); 433 req->credit_dack = htobe32(F_RX_MODULATE_RX); 434 435 t4_wrq_tx(sc, wr); 436 } 437 438 void 439 t4_rcvd_locked(struct toedev *tod, struct tcpcb *tp) 440 { 441 struct adapter *sc = tod->tod_softc; 442 struct inpcb *inp = tp->t_inpcb; 443 struct socket *so = inp->inp_socket; 444 struct sockbuf *sb = &so->so_rcv; 445 struct toepcb *toep = tp->t_toe; 446 int rx_credits; 447 448 INP_WLOCK_ASSERT(inp); 449 SOCKBUF_LOCK_ASSERT(sb); 450 451 rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0; 452 if (rx_credits > 0 && 453 (tp->rcv_wnd <= 32 * 1024 || rx_credits >= 64 * 1024 || 454 (rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) || 455 sbused(sb) + tp->rcv_wnd < sb->sb_lowat)) { 456 rx_credits = send_rx_credits(sc, toep, rx_credits); 457 tp->rcv_wnd += rx_credits; 458 tp->rcv_adv += rx_credits; 459 } else if (toep->flags & TPF_FORCE_CREDITS) 460 send_rx_modulate(sc, toep); 461 } 462 463 void 464 t4_rcvd(struct toedev *tod, struct tcpcb *tp) 465 { 466 struct inpcb *inp = tp->t_inpcb; 467 struct socket *so = inp->inp_socket; 468 struct sockbuf *sb = &so->so_rcv; 469 470 SOCKBUF_LOCK(sb); 471 t4_rcvd_locked(tod, tp); 472 SOCKBUF_UNLOCK(sb); 473 } 474 475 /* 476 * Close a connection by sending a CPL_CLOSE_CON_REQ message. 477 */ 478 int 479 t4_close_conn(struct adapter *sc, struct toepcb *toep) 480 { 481 struct wrqe *wr; 482 struct cpl_close_con_req *req; 483 unsigned int tid = toep->tid; 484 485 CTR3(KTR_CXGBE, "%s: tid %u%s", __func__, toep->tid, 486 toep->flags & TPF_FIN_SENT ? 
", IGNORED" : ""); 487 488 if (toep->flags & TPF_FIN_SENT) 489 return (0); 490 491 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 492 ("%s: flowc_wr not sent for tid %u.", __func__, tid)); 493 494 wr = alloc_wrqe(sizeof(*req), toep->ofld_txq); 495 if (wr == NULL) { 496 /* XXX */ 497 panic("%s: allocation failure.", __func__); 498 } 499 req = wrtod(wr); 500 501 req->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) | 502 V_FW_WR_IMMDLEN(sizeof(*req) - sizeof(req->wr))); 503 req->wr.wr_mid = htonl(V_FW_WR_LEN16(howmany(sizeof(*req), 16)) | 504 V_FW_WR_FLOWID(tid)); 505 req->wr.wr_lo = cpu_to_be64(0); 506 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid)); 507 req->rsvd = 0; 508 509 toep->flags |= TPF_FIN_SENT; 510 toep->flags &= ~TPF_SEND_FIN; 511 t4_l2t_send(sc, wr, toep->l2te); 512 513 return (0); 514 } 515 516 #define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16) 517 #define MIN_OFLD_TX_CREDITS (howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16)) 518 519 /* Maximum amount of immediate data we could stuff in a WR */ 520 static inline int 521 max_imm_payload(int tx_credits) 522 { 523 const int n = 1; /* Use no more than one desc for imm. data WR */ 524 525 KASSERT(tx_credits >= 0 && 526 tx_credits <= MAX_OFLD_TX_CREDITS, 527 ("%s: %d credits", __func__, tx_credits)); 528 529 if (tx_credits < MIN_OFLD_TX_CREDITS) 530 return (0); 531 532 if (tx_credits >= (n * EQ_ESIZE) / 16) 533 return ((n * EQ_ESIZE) - sizeof(struct fw_ofld_tx_data_wr)); 534 else 535 return (tx_credits * 16 - sizeof(struct fw_ofld_tx_data_wr)); 536 } 537 538 /* Maximum number of SGL entries we could stuff in a WR */ 539 static inline int 540 max_dsgl_nsegs(int tx_credits) 541 { 542 int nseg = 1; /* ulptx_sgl has room for 1, rest ulp_tx_sge_pair */ 543 int sge_pair_credits = tx_credits - MIN_OFLD_TX_CREDITS; 544 545 KASSERT(tx_credits >= 0 && 546 tx_credits <= MAX_OFLD_TX_CREDITS, 547 ("%s: %d credits", __func__, tx_credits)); 548 549 if (tx_credits < MIN_OFLD_TX_CREDITS) 550 return (0); 551 552 nseg += 2 * (sge_pair_credits * 16 / 24); 553 if ((sge_pair_credits * 16) % 24 == 16) 554 nseg++; 555 556 return (nseg); 557 } 558 559 static inline void 560 write_tx_wr(void *dst, struct toepcb *toep, unsigned int immdlen, 561 unsigned int plen, uint8_t credits, int shove, int ulp_submode) 562 { 563 struct fw_ofld_tx_data_wr *txwr = dst; 564 565 txwr->op_to_immdlen = htobe32(V_WR_OP(FW_OFLD_TX_DATA_WR) | 566 V_FW_WR_IMMDLEN(immdlen)); 567 txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) | 568 V_FW_WR_LEN16(credits)); 569 txwr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(ulp_mode(toep)) | 570 V_TX_ULP_SUBMODE(ulp_submode) | V_TX_URG(0) | V_TX_SHOVE(shove)); 571 txwr->plen = htobe32(plen); 572 573 if (toep->params.tx_align > 0) { 574 if (plen < 2 * toep->params.emss) 575 txwr->lsodisable_to_flags |= 576 htobe32(F_FW_OFLD_TX_DATA_WR_LSODISABLE); 577 else 578 txwr->lsodisable_to_flags |= 579 htobe32(F_FW_OFLD_TX_DATA_WR_ALIGNPLD | 580 (toep->params.nagle == 0 ? 0 : 581 F_FW_OFLD_TX_DATA_WR_ALIGNPLDSHOVE)); 582 } 583 } 584 585 /* 586 * Generate a DSGL from a starting mbuf. The total number of segments and the 587 * maximum segments in any one mbuf are provided. 
588 */ 589 static void 590 write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n) 591 { 592 struct mbuf *m; 593 struct ulptx_sgl *usgl = dst; 594 int i, j, rc; 595 struct sglist sg; 596 struct sglist_seg segs[n]; 597 598 KASSERT(nsegs > 0, ("%s: nsegs 0", __func__)); 599 600 sglist_init(&sg, n, segs); 601 usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) | 602 V_ULPTX_NSGE(nsegs)); 603 604 i = -1; 605 for (m = start; m != stop; m = m->m_next) { 606 if (m->m_flags & M_EXTPG) 607 rc = sglist_append_mbuf_epg(&sg, m, 608 mtod(m, vm_offset_t), m->m_len); 609 else 610 rc = sglist_append(&sg, mtod(m, void *), m->m_len); 611 if (__predict_false(rc != 0)) 612 panic("%s: sglist_append %d", __func__, rc); 613 614 for (j = 0; j < sg.sg_nseg; i++, j++) { 615 if (i < 0) { 616 usgl->len0 = htobe32(segs[j].ss_len); 617 usgl->addr0 = htobe64(segs[j].ss_paddr); 618 } else { 619 usgl->sge[i / 2].len[i & 1] = 620 htobe32(segs[j].ss_len); 621 usgl->sge[i / 2].addr[i & 1] = 622 htobe64(segs[j].ss_paddr); 623 } 624 #ifdef INVARIANTS 625 nsegs--; 626 #endif 627 } 628 sglist_reset(&sg); 629 } 630 if (i & 1) 631 usgl->sge[i / 2].len[1] = htobe32(0); 632 KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, stop %p", 633 __func__, nsegs, start, stop)); 634 } 635 636 /* 637 * Max number of SGL entries an offload tx work request can have. This is 41 638 * (1 + 40) for a full 512B work request. 639 * fw_ofld_tx_data_wr(16B) + ulptx_sgl(16B, 1) + ulptx_sge_pair(480B, 40) 640 */ 641 #define OFLD_SGL_LEN (41) 642 643 /* 644 * Send data and/or a FIN to the peer. 645 * 646 * The socket's so_snd buffer consists of a stream of data starting with sb_mb 647 * and linked together with m_next. sb_sndptr, if set, is the last mbuf that 648 * was transmitted. 649 * 650 * drop indicates the number of bytes that should be dropped from the head of 651 * the send buffer. It is an optimization that lets do_fw4_ack avoid creating 652 * contention on the send buffer lock (before this change it used to do 653 * sowwakeup and then t4_push_frames right after that when recovering from tx 654 * stalls). When drop is set this function MUST drop the bytes and wake up any 655 * writers. 
656 */ 657 void 658 t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop) 659 { 660 struct mbuf *sndptr, *m, *sb_sndptr; 661 struct fw_ofld_tx_data_wr *txwr; 662 struct wrqe *wr; 663 u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf; 664 struct inpcb *inp = toep->inp; 665 struct tcpcb *tp = intotcpcb(inp); 666 struct socket *so = inp->inp_socket; 667 struct sockbuf *sb = &so->so_snd; 668 int tx_credits, shove, compl, sowwakeup; 669 struct ofld_tx_sdesc *txsd; 670 bool nomap_mbuf_seen; 671 672 INP_WLOCK_ASSERT(inp); 673 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 674 ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid)); 675 676 KASSERT(ulp_mode(toep) == ULP_MODE_NONE || 677 ulp_mode(toep) == ULP_MODE_TCPDDP || 678 ulp_mode(toep) == ULP_MODE_TLS || 679 ulp_mode(toep) == ULP_MODE_RDMA, 680 ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep)); 681 682 #ifdef VERBOSE_TRACES 683 CTR5(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d", 684 __func__, toep->tid, toep->flags, tp->t_flags, drop); 685 #endif 686 if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) 687 return; 688 689 #ifdef RATELIMIT 690 if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) && 691 (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) { 692 inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; 693 } 694 #endif 695 696 /* 697 * This function doesn't resume by itself. Someone else must clear the 698 * flag and call this function. 699 */ 700 if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) { 701 KASSERT(drop == 0, 702 ("%s: drop (%d) != 0 but tx is suspended", __func__, drop)); 703 return; 704 } 705 706 txsd = &toep->txsd[toep->txsd_pidx]; 707 do { 708 tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS); 709 max_imm = max_imm_payload(tx_credits); 710 max_nsegs = max_dsgl_nsegs(tx_credits); 711 712 SOCKBUF_LOCK(sb); 713 sowwakeup = drop; 714 if (drop) { 715 sbdrop_locked(sb, drop); 716 drop = 0; 717 } 718 sb_sndptr = sb->sb_sndptr; 719 sndptr = sb_sndptr ? 
sb_sndptr->m_next : sb->sb_mb; 720 plen = 0; 721 nsegs = 0; 722 max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */ 723 nomap_mbuf_seen = false; 724 for (m = sndptr; m != NULL; m = m->m_next) { 725 int n; 726 727 if ((m->m_flags & M_NOTAVAIL) != 0) 728 break; 729 if (m->m_flags & M_EXTPG) { 730 #ifdef KERN_TLS 731 if (m->m_epg_tls != NULL) { 732 toep->flags |= TPF_KTLS; 733 if (plen == 0) { 734 SOCKBUF_UNLOCK(sb); 735 t4_push_ktls(sc, toep, 0); 736 return; 737 } 738 break; 739 } 740 #endif 741 n = sglist_count_mbuf_epg(m, 742 mtod(m, vm_offset_t), m->m_len); 743 } else 744 n = sglist_count(mtod(m, void *), m->m_len); 745 746 nsegs += n; 747 plen += m->m_len; 748 749 /* This mbuf sent us _over_ the nsegs limit, back out */ 750 if (plen > max_imm && nsegs > max_nsegs) { 751 nsegs -= n; 752 plen -= m->m_len; 753 if (plen == 0) { 754 /* Too few credits */ 755 toep->flags |= TPF_TX_SUSPENDED; 756 if (sowwakeup) { 757 if (!TAILQ_EMPTY( 758 &toep->aiotx_jobq)) 759 t4_aiotx_queue_toep(so, 760 toep); 761 sowwakeup_locked(so); 762 } else 763 SOCKBUF_UNLOCK(sb); 764 SOCKBUF_UNLOCK_ASSERT(sb); 765 return; 766 } 767 break; 768 } 769 770 if (m->m_flags & M_EXTPG) 771 nomap_mbuf_seen = true; 772 if (max_nsegs_1mbuf < n) 773 max_nsegs_1mbuf = n; 774 sb_sndptr = m; /* new sb->sb_sndptr if all goes well */ 775 776 /* This mbuf put us right at the max_nsegs limit */ 777 if (plen > max_imm && nsegs == max_nsegs) { 778 m = m->m_next; 779 break; 780 } 781 } 782 783 if (sbused(sb) > sb->sb_hiwat * 5 / 8 && 784 toep->plen_nocompl + plen >= sb->sb_hiwat / 4) 785 compl = 1; 786 else 787 compl = 0; 788 789 if (sb->sb_flags & SB_AUTOSIZE && 790 V_tcp_do_autosndbuf && 791 sb->sb_hiwat < V_tcp_autosndbuf_max && 792 sbused(sb) >= sb->sb_hiwat * 7 / 8) { 793 int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc, 794 V_tcp_autosndbuf_max); 795 796 if (!sbreserve_locked(sb, newsize, so, NULL)) 797 sb->sb_flags &= ~SB_AUTOSIZE; 798 else 799 sowwakeup = 1; /* room available */ 800 } 801 if (sowwakeup) { 802 if (!TAILQ_EMPTY(&toep->aiotx_jobq)) 803 t4_aiotx_queue_toep(so, toep); 804 sowwakeup_locked(so); 805 } else 806 SOCKBUF_UNLOCK(sb); 807 SOCKBUF_UNLOCK_ASSERT(sb); 808 809 /* nothing to send */ 810 if (plen == 0) { 811 KASSERT(m == NULL || (m->m_flags & M_NOTAVAIL) != 0, 812 ("%s: nothing to send, but m != NULL is ready", 813 __func__)); 814 break; 815 } 816 817 if (__predict_false(toep->flags & TPF_FIN_SENT)) 818 panic("%s: excess tx.", __func__); 819 820 shove = m == NULL && !(tp->t_flags & TF_MORETOCOME); 821 if (plen <= max_imm && !nomap_mbuf_seen) { 822 823 /* Immediate data tx */ 824 825 wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16), 826 toep->ofld_txq); 827 if (wr == NULL) { 828 /* XXX: how will we recover from this? */ 829 toep->flags |= TPF_TX_SUSPENDED; 830 return; 831 } 832 txwr = wrtod(wr); 833 credits = howmany(wr->wr_len, 16); 834 write_tx_wr(txwr, toep, plen, plen, credits, shove, 0); 835 m_copydata(sndptr, 0, plen, (void *)(txwr + 1)); 836 nsegs = 0; 837 } else { 838 int wr_len; 839 840 /* DSGL tx */ 841 842 wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) + 843 ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8; 844 wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq); 845 if (wr == NULL) { 846 /* XXX: how will we recover from this? 
*/ 847 toep->flags |= TPF_TX_SUSPENDED; 848 return; 849 } 850 txwr = wrtod(wr); 851 credits = howmany(wr_len, 16); 852 write_tx_wr(txwr, toep, 0, plen, credits, shove, 0); 853 write_tx_sgl(txwr + 1, sndptr, m, nsegs, 854 max_nsegs_1mbuf); 855 if (wr_len & 0xf) { 856 uint64_t *pad = (uint64_t *) 857 ((uintptr_t)txwr + wr_len); 858 *pad = 0; 859 } 860 } 861 862 KASSERT(toep->tx_credits >= credits, 863 ("%s: not enough credits", __func__)); 864 865 toep->tx_credits -= credits; 866 toep->tx_nocompl += credits; 867 toep->plen_nocompl += plen; 868 if (toep->tx_credits <= toep->tx_total * 3 / 8 && 869 toep->tx_nocompl >= toep->tx_total / 4) 870 compl = 1; 871 872 if (compl || ulp_mode(toep) == ULP_MODE_RDMA) { 873 txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL); 874 toep->tx_nocompl = 0; 875 toep->plen_nocompl = 0; 876 } 877 878 tp->snd_nxt += plen; 879 tp->snd_max += plen; 880 881 SOCKBUF_LOCK(sb); 882 KASSERT(sb_sndptr, ("%s: sb_sndptr is NULL", __func__)); 883 sb->sb_sndptr = sb_sndptr; 884 SOCKBUF_UNLOCK(sb); 885 886 toep->flags |= TPF_TX_DATA_SENT; 887 if (toep->tx_credits < MIN_OFLD_TX_CREDITS) 888 toep->flags |= TPF_TX_SUSPENDED; 889 890 KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); 891 txsd->plen = plen; 892 txsd->tx_credits = credits; 893 txsd++; 894 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) { 895 toep->txsd_pidx = 0; 896 txsd = &toep->txsd[0]; 897 } 898 toep->txsd_avail--; 899 900 t4_l2t_send(sc, wr, toep->l2te); 901 } while (m != NULL && (m->m_flags & M_NOTAVAIL) == 0); 902 903 /* Send a FIN if requested, but only if there's no more data to send */ 904 if (m == NULL && toep->flags & TPF_SEND_FIN) 905 t4_close_conn(sc, toep); 906 } 907 908 static inline void 909 rqdrop_locked(struct mbufq *q, int plen) 910 { 911 struct mbuf *m; 912 913 while (plen > 0) { 914 m = mbufq_dequeue(q); 915 916 /* Too many credits. */ 917 MPASS(m != NULL); 918 M_ASSERTPKTHDR(m); 919 920 /* Partial credits. */ 921 MPASS(plen >= m->m_pkthdr.len); 922 923 plen -= m->m_pkthdr.len; 924 m_freem(m); 925 } 926 } 927 928 void 929 t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop) 930 { 931 struct mbuf *sndptr, *m; 932 struct fw_ofld_tx_data_wr *txwr; 933 struct wrqe *wr; 934 u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf; 935 u_int adjusted_plen, ulp_submode; 936 struct inpcb *inp = toep->inp; 937 struct tcpcb *tp = intotcpcb(inp); 938 int tx_credits, shove; 939 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; 940 struct mbufq *pduq = &toep->ulp_pduq; 941 static const u_int ulp_extra_len[] = {0, 4, 4, 8}; 942 943 INP_WLOCK_ASSERT(inp); 944 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 945 ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid)); 946 KASSERT(ulp_mode(toep) == ULP_MODE_ISCSI, 947 ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep)); 948 949 if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) 950 return; 951 952 /* 953 * This function doesn't resume by itself. Someone else must clear the 954 * flag and call this function. 
955 */ 956 if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) { 957 KASSERT(drop == 0, 958 ("%s: drop (%d) != 0 but tx is suspended", __func__, drop)); 959 return; 960 } 961 962 if (drop) 963 rqdrop_locked(&toep->ulp_pdu_reclaimq, drop); 964 965 while ((sndptr = mbufq_first(pduq)) != NULL) { 966 M_ASSERTPKTHDR(sndptr); 967 968 tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS); 969 max_imm = max_imm_payload(tx_credits); 970 max_nsegs = max_dsgl_nsegs(tx_credits); 971 972 plen = 0; 973 nsegs = 0; 974 max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */ 975 for (m = sndptr; m != NULL; m = m->m_next) { 976 int n = sglist_count(mtod(m, void *), m->m_len); 977 978 nsegs += n; 979 plen += m->m_len; 980 981 /* 982 * This mbuf would send us _over_ the nsegs limit. 983 * Suspend tx because the PDU can't be sent out. 984 */ 985 if (plen > max_imm && nsegs > max_nsegs) { 986 toep->flags |= TPF_TX_SUSPENDED; 987 return; 988 } 989 990 if (max_nsegs_1mbuf < n) 991 max_nsegs_1mbuf = n; 992 } 993 994 if (__predict_false(toep->flags & TPF_FIN_SENT)) 995 panic("%s: excess tx.", __func__); 996 997 /* 998 * We have a PDU to send. All of it goes out in one WR so 'm' 999 * is NULL. A PDU's length is always a multiple of 4. 1000 */ 1001 MPASS(m == NULL); 1002 MPASS((plen & 3) == 0); 1003 MPASS(sndptr->m_pkthdr.len == plen); 1004 1005 shove = !(tp->t_flags & TF_MORETOCOME); 1006 ulp_submode = mbuf_ulp_submode(sndptr); 1007 MPASS(ulp_submode < nitems(ulp_extra_len)); 1008 1009 /* 1010 * plen doesn't include header and data digests, which are 1011 * generated and inserted in the right places by the TOE, but 1012 * they do occupy TCP sequence space and need to be accounted 1013 * for. 1014 */ 1015 adjusted_plen = plen + ulp_extra_len[ulp_submode]; 1016 if (plen <= max_imm) { 1017 1018 /* Immediate data tx */ 1019 1020 wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16), 1021 toep->ofld_txq); 1022 if (wr == NULL) { 1023 /* XXX: how will we recover from this? */ 1024 toep->flags |= TPF_TX_SUSPENDED; 1025 return; 1026 } 1027 txwr = wrtod(wr); 1028 credits = howmany(wr->wr_len, 16); 1029 write_tx_wr(txwr, toep, plen, adjusted_plen, credits, 1030 shove, ulp_submode); 1031 m_copydata(sndptr, 0, plen, (void *)(txwr + 1)); 1032 nsegs = 0; 1033 } else { 1034 int wr_len; 1035 1036 /* DSGL tx */ 1037 wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) + 1038 ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8; 1039 wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq); 1040 if (wr == NULL) { 1041 /* XXX: how will we recover from this? 
*/ 1042 toep->flags |= TPF_TX_SUSPENDED; 1043 return; 1044 } 1045 txwr = wrtod(wr); 1046 credits = howmany(wr_len, 16); 1047 write_tx_wr(txwr, toep, 0, adjusted_plen, credits, 1048 shove, ulp_submode); 1049 write_tx_sgl(txwr + 1, sndptr, m, nsegs, 1050 max_nsegs_1mbuf); 1051 if (wr_len & 0xf) { 1052 uint64_t *pad = (uint64_t *) 1053 ((uintptr_t)txwr + wr_len); 1054 *pad = 0; 1055 } 1056 } 1057 1058 KASSERT(toep->tx_credits >= credits, 1059 ("%s: not enough credits", __func__)); 1060 1061 m = mbufq_dequeue(pduq); 1062 MPASS(m == sndptr); 1063 mbufq_enqueue(&toep->ulp_pdu_reclaimq, m); 1064 1065 toep->tx_credits -= credits; 1066 toep->tx_nocompl += credits; 1067 toep->plen_nocompl += plen; 1068 if (toep->tx_credits <= toep->tx_total * 3 / 8 && 1069 toep->tx_nocompl >= toep->tx_total / 4) { 1070 txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL); 1071 toep->tx_nocompl = 0; 1072 toep->plen_nocompl = 0; 1073 } 1074 1075 tp->snd_nxt += adjusted_plen; 1076 tp->snd_max += adjusted_plen; 1077 1078 toep->flags |= TPF_TX_DATA_SENT; 1079 if (toep->tx_credits < MIN_OFLD_TX_CREDITS) 1080 toep->flags |= TPF_TX_SUSPENDED; 1081 1082 KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); 1083 txsd->plen = plen; 1084 txsd->tx_credits = credits; 1085 txsd++; 1086 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) { 1087 toep->txsd_pidx = 0; 1088 txsd = &toep->txsd[0]; 1089 } 1090 toep->txsd_avail--; 1091 1092 t4_l2t_send(sc, wr, toep->l2te); 1093 } 1094 1095 /* Send a FIN if requested, but only if there are no more PDUs to send */ 1096 if (mbufq_first(pduq) == NULL && toep->flags & TPF_SEND_FIN) 1097 t4_close_conn(sc, toep); 1098 } 1099 1100 static inline void 1101 t4_push_data(struct adapter *sc, struct toepcb *toep, int drop) 1102 { 1103 1104 if (ulp_mode(toep) == ULP_MODE_ISCSI) 1105 t4_push_pdus(sc, toep, drop); 1106 else if (tls_tx_key(toep) && toep->tls.mode == TLS_MODE_TLSOM) 1107 t4_push_tls_records(sc, toep, drop); 1108 #ifdef KERN_TLS 1109 else if (toep->flags & TPF_KTLS) 1110 t4_push_ktls(sc, toep, drop); 1111 #endif 1112 else 1113 t4_push_frames(sc, toep, drop); 1114 } 1115 1116 int 1117 t4_tod_output(struct toedev *tod, struct tcpcb *tp) 1118 { 1119 struct adapter *sc = tod->tod_softc; 1120 #ifdef INVARIANTS 1121 struct inpcb *inp = tp->t_inpcb; 1122 #endif 1123 struct toepcb *toep = tp->t_toe; 1124 1125 INP_WLOCK_ASSERT(inp); 1126 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1127 ("%s: inp %p dropped.", __func__, inp)); 1128 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 1129 1130 t4_push_data(sc, toep, 0); 1131 1132 return (0); 1133 } 1134 1135 int 1136 t4_send_fin(struct toedev *tod, struct tcpcb *tp) 1137 { 1138 struct adapter *sc = tod->tod_softc; 1139 #ifdef INVARIANTS 1140 struct inpcb *inp = tp->t_inpcb; 1141 #endif 1142 struct toepcb *toep = tp->t_toe; 1143 1144 INP_WLOCK_ASSERT(inp); 1145 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1146 ("%s: inp %p dropped.", __func__, inp)); 1147 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 1148 1149 toep->flags |= TPF_SEND_FIN; 1150 if (tp->t_state >= TCPS_ESTABLISHED) 1151 t4_push_data(sc, toep, 0); 1152 1153 return (0); 1154 } 1155 1156 int 1157 t4_send_rst(struct toedev *tod, struct tcpcb *tp) 1158 { 1159 struct adapter *sc = tod->tod_softc; 1160 #if defined(INVARIANTS) 1161 struct inpcb *inp = tp->t_inpcb; 1162 #endif 1163 struct toepcb *toep = tp->t_toe; 1164 1165 INP_WLOCK_ASSERT(inp); 1166 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1167 ("%s: inp %p dropped.", __func__, inp)); 1168 KASSERT(toep != NULL, ("%s: toep 
is NULL", __func__)); 1169 1170 /* hmmmm */ 1171 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 1172 ("%s: flowc for tid %u [%s] not sent already", 1173 __func__, toep->tid, tcpstates[tp->t_state])); 1174 1175 send_reset(sc, toep, 0); 1176 return (0); 1177 } 1178 1179 /* 1180 * Peer has sent us a FIN. 1181 */ 1182 static int 1183 do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1184 { 1185 struct adapter *sc = iq->adapter; 1186 const struct cpl_peer_close *cpl = (const void *)(rss + 1); 1187 unsigned int tid = GET_TID(cpl); 1188 struct toepcb *toep = lookup_tid(sc, tid); 1189 struct inpcb *inp = toep->inp; 1190 struct tcpcb *tp = NULL; 1191 struct socket *so; 1192 struct epoch_tracker et; 1193 #ifdef INVARIANTS 1194 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1195 #endif 1196 1197 KASSERT(opcode == CPL_PEER_CLOSE, 1198 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1199 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1200 1201 if (__predict_false(toep->flags & TPF_SYNQE)) { 1202 /* 1203 * do_pass_establish must have run before do_peer_close and if 1204 * this is still a synqe instead of a toepcb then the connection 1205 * must be getting aborted. 1206 */ 1207 MPASS(toep->flags & TPF_ABORT_SHUTDOWN); 1208 CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid, 1209 toep, toep->flags); 1210 return (0); 1211 } 1212 1213 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1214 1215 CURVNET_SET(toep->vnet); 1216 NET_EPOCH_ENTER(et); 1217 INP_WLOCK(inp); 1218 tp = intotcpcb(inp); 1219 1220 CTR6(KTR_CXGBE, 1221 "%s: tid %u (%s), toep_flags 0x%x, ddp_flags 0x%x, inp %p", 1222 __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags, 1223 toep->ddp.flags, inp); 1224 1225 if (toep->flags & TPF_ABORT_SHUTDOWN) 1226 goto done; 1227 1228 tp->rcv_nxt++; /* FIN */ 1229 1230 so = inp->inp_socket; 1231 socantrcvmore(so); 1232 if (ulp_mode(toep) == ULP_MODE_TCPDDP) { 1233 DDP_LOCK(toep); 1234 if (__predict_false(toep->ddp.flags & 1235 (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE))) 1236 handle_ddp_close(toep, tp, cpl->rcv_nxt); 1237 DDP_UNLOCK(toep); 1238 } 1239 1240 if (ulp_mode(toep) != ULP_MODE_RDMA) { 1241 KASSERT(tp->rcv_nxt == be32toh(cpl->rcv_nxt), 1242 ("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt, 1243 be32toh(cpl->rcv_nxt))); 1244 } 1245 1246 switch (tp->t_state) { 1247 case TCPS_SYN_RECEIVED: 1248 tp->t_starttime = ticks; 1249 /* FALLTHROUGH */ 1250 1251 case TCPS_ESTABLISHED: 1252 tcp_state_change(tp, TCPS_CLOSE_WAIT); 1253 break; 1254 1255 case TCPS_FIN_WAIT_1: 1256 tcp_state_change(tp, TCPS_CLOSING); 1257 break; 1258 1259 case TCPS_FIN_WAIT_2: 1260 tcp_twstart(tp); 1261 INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ 1262 NET_EPOCH_EXIT(et); 1263 CURVNET_RESTORE(); 1264 1265 INP_WLOCK(inp); 1266 final_cpl_received(toep); 1267 return (0); 1268 1269 default: 1270 log(LOG_ERR, "%s: TID %u received CPL_PEER_CLOSE in state %d\n", 1271 __func__, tid, tp->t_state); 1272 } 1273 done: 1274 INP_WUNLOCK(inp); 1275 NET_EPOCH_EXIT(et); 1276 CURVNET_RESTORE(); 1277 return (0); 1278 } 1279 1280 /* 1281 * Peer has ACK'd our FIN. 
1282 */ 1283 static int 1284 do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss, 1285 struct mbuf *m) 1286 { 1287 struct adapter *sc = iq->adapter; 1288 const struct cpl_close_con_rpl *cpl = (const void *)(rss + 1); 1289 unsigned int tid = GET_TID(cpl); 1290 struct toepcb *toep = lookup_tid(sc, tid); 1291 struct inpcb *inp = toep->inp; 1292 struct tcpcb *tp = NULL; 1293 struct socket *so = NULL; 1294 struct epoch_tracker et; 1295 #ifdef INVARIANTS 1296 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1297 #endif 1298 1299 KASSERT(opcode == CPL_CLOSE_CON_RPL, 1300 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1301 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1302 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1303 1304 CURVNET_SET(toep->vnet); 1305 NET_EPOCH_ENTER(et); 1306 INP_WLOCK(inp); 1307 tp = intotcpcb(inp); 1308 1309 CTR4(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x", 1310 __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags); 1311 1312 if (toep->flags & TPF_ABORT_SHUTDOWN) 1313 goto done; 1314 1315 so = inp->inp_socket; 1316 tp->snd_una = be32toh(cpl->snd_nxt) - 1; /* exclude FIN */ 1317 1318 switch (tp->t_state) { 1319 case TCPS_CLOSING: /* see TCPS_FIN_WAIT_2 in do_peer_close too */ 1320 tcp_twstart(tp); 1321 release: 1322 INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ 1323 NET_EPOCH_EXIT(et); 1324 CURVNET_RESTORE(); 1325 1326 INP_WLOCK(inp); 1327 final_cpl_received(toep); /* no more CPLs expected */ 1328 1329 return (0); 1330 case TCPS_LAST_ACK: 1331 if (tcp_close(tp)) 1332 INP_WUNLOCK(inp); 1333 goto release; 1334 1335 case TCPS_FIN_WAIT_1: 1336 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 1337 soisdisconnected(so); 1338 tcp_state_change(tp, TCPS_FIN_WAIT_2); 1339 break; 1340 1341 default: 1342 log(LOG_ERR, 1343 "%s: TID %u received CPL_CLOSE_CON_RPL in state %s\n", 1344 __func__, tid, tcpstates[tp->t_state]); 1345 } 1346 done: 1347 INP_WUNLOCK(inp); 1348 NET_EPOCH_EXIT(et); 1349 CURVNET_RESTORE(); 1350 return (0); 1351 } 1352 1353 void 1354 send_abort_rpl(struct adapter *sc, struct sge_wrq *ofld_txq, int tid, 1355 int rst_status) 1356 { 1357 struct wrqe *wr; 1358 struct cpl_abort_rpl *cpl; 1359 1360 wr = alloc_wrqe(sizeof(*cpl), ofld_txq); 1361 if (wr == NULL) { 1362 /* XXX */ 1363 panic("%s: allocation failure.", __func__); 1364 } 1365 cpl = wrtod(wr); 1366 1367 INIT_TP_WR_MIT_CPL(cpl, CPL_ABORT_RPL, tid); 1368 cpl->cmd = rst_status; 1369 1370 t4_wrq_tx(sc, wr); 1371 } 1372 1373 static int 1374 abort_status_to_errno(struct tcpcb *tp, unsigned int abort_reason) 1375 { 1376 switch (abort_reason) { 1377 case CPL_ERR_BAD_SYN: 1378 case CPL_ERR_CONN_RESET: 1379 return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET); 1380 case CPL_ERR_XMIT_TIMEDOUT: 1381 case CPL_ERR_PERSIST_TIMEDOUT: 1382 case CPL_ERR_FINWAIT2_TIMEDOUT: 1383 case CPL_ERR_KEEPALIVE_TIMEDOUT: 1384 return (ETIMEDOUT); 1385 default: 1386 return (EIO); 1387 } 1388 } 1389 1390 /* 1391 * TCP RST from the peer, timeout, or some other such critical error. 
1392 */ 1393 static int 1394 do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1395 { 1396 struct adapter *sc = iq->adapter; 1397 const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1); 1398 unsigned int tid = GET_TID(cpl); 1399 struct toepcb *toep = lookup_tid(sc, tid); 1400 struct sge_wrq *ofld_txq = toep->ofld_txq; 1401 struct inpcb *inp; 1402 struct tcpcb *tp; 1403 struct epoch_tracker et; 1404 #ifdef INVARIANTS 1405 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1406 #endif 1407 1408 KASSERT(opcode == CPL_ABORT_REQ_RSS, 1409 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1410 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1411 1412 if (toep->flags & TPF_SYNQE) 1413 return (do_abort_req_synqe(iq, rss, m)); 1414 1415 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1416 1417 if (negative_advice(cpl->status)) { 1418 CTR4(KTR_CXGBE, "%s: negative advice %d for tid %d (0x%x)", 1419 __func__, cpl->status, tid, toep->flags); 1420 return (0); /* Ignore negative advice */ 1421 } 1422 1423 inp = toep->inp; 1424 CURVNET_SET(toep->vnet); 1425 NET_EPOCH_ENTER(et); /* for tcp_close */ 1426 INP_WLOCK(inp); 1427 1428 tp = intotcpcb(inp); 1429 1430 CTR6(KTR_CXGBE, 1431 "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x, status %d", 1432 __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags, 1433 inp->inp_flags, cpl->status); 1434 1435 /* 1436 * If we'd initiated an abort earlier the reply to it is responsible for 1437 * cleaning up resources. Otherwise we tear everything down right here 1438 * right now. We owe the T4 a CPL_ABORT_RPL no matter what. 1439 */ 1440 if (toep->flags & TPF_ABORT_SHUTDOWN) { 1441 INP_WUNLOCK(inp); 1442 goto done; 1443 } 1444 toep->flags |= TPF_ABORT_SHUTDOWN; 1445 1446 if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) { 1447 struct socket *so = inp->inp_socket; 1448 1449 if (so != NULL) 1450 so_error_set(so, abort_status_to_errno(tp, 1451 cpl->status)); 1452 tp = tcp_close(tp); 1453 if (tp == NULL) 1454 INP_WLOCK(inp); /* re-acquire */ 1455 } 1456 1457 final_cpl_received(toep); 1458 done: 1459 NET_EPOCH_EXIT(et); 1460 CURVNET_RESTORE(); 1461 send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST); 1462 return (0); 1463 } 1464 1465 /* 1466 * Reply to the CPL_ABORT_REQ (send_reset) 1467 */ 1468 static int 1469 do_abort_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1470 { 1471 struct adapter *sc = iq->adapter; 1472 const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1); 1473 unsigned int tid = GET_TID(cpl); 1474 struct toepcb *toep = lookup_tid(sc, tid); 1475 struct inpcb *inp = toep->inp; 1476 #ifdef INVARIANTS 1477 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1478 #endif 1479 1480 KASSERT(opcode == CPL_ABORT_RPL_RSS, 1481 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1482 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1483 1484 if (toep->flags & TPF_SYNQE) 1485 return (do_abort_rpl_synqe(iq, rss, m)); 1486 1487 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1488 1489 CTR5(KTR_CXGBE, "%s: tid %u, toep %p, inp %p, status %d", 1490 __func__, tid, toep, inp, cpl->status); 1491 1492 KASSERT(toep->flags & TPF_ABORT_SHUTDOWN, 1493 ("%s: wasn't expecting abort reply", __func__)); 1494 1495 INP_WLOCK(inp); 1496 final_cpl_received(toep); 1497 1498 return (0); 1499 } 1500 1501 static int 1502 do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1503 { 1504 struct adapter *sc = 
iq->adapter; 1505 const struct cpl_rx_data *cpl = mtod(m, const void *); 1506 unsigned int tid = GET_TID(cpl); 1507 struct toepcb *toep = lookup_tid(sc, tid); 1508 struct inpcb *inp = toep->inp; 1509 struct tcpcb *tp; 1510 struct socket *so; 1511 struct sockbuf *sb; 1512 struct epoch_tracker et; 1513 int len, rx_credits; 1514 uint32_t ddp_placed = 0; 1515 1516 if (__predict_false(toep->flags & TPF_SYNQE)) { 1517 /* 1518 * do_pass_establish must have run before do_rx_data and if this 1519 * is still a synqe instead of a toepcb then the connection must 1520 * be getting aborted. 1521 */ 1522 MPASS(toep->flags & TPF_ABORT_SHUTDOWN); 1523 CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid, 1524 toep, toep->flags); 1525 m_freem(m); 1526 return (0); 1527 } 1528 1529 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1530 1531 /* strip off CPL header */ 1532 m_adj(m, sizeof(*cpl)); 1533 len = m->m_pkthdr.len; 1534 1535 INP_WLOCK(inp); 1536 if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) { 1537 CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x", 1538 __func__, tid, len, inp->inp_flags); 1539 INP_WUNLOCK(inp); 1540 m_freem(m); 1541 return (0); 1542 } 1543 1544 tp = intotcpcb(inp); 1545 1546 if (__predict_false(ulp_mode(toep) == ULP_MODE_TLS && 1547 toep->flags & TPF_TLS_RECEIVE)) { 1548 /* Received "raw" data on a TLS socket. */ 1549 CTR3(KTR_CXGBE, "%s: tid %u, raw TLS data (%d bytes)", 1550 __func__, tid, len); 1551 do_rx_data_tls(cpl, toep, m); 1552 return (0); 1553 } 1554 1555 if (__predict_false(tp->rcv_nxt != be32toh(cpl->seq))) 1556 ddp_placed = be32toh(cpl->seq) - tp->rcv_nxt; 1557 1558 tp->rcv_nxt += len; 1559 if (tp->rcv_wnd < len) { 1560 KASSERT(ulp_mode(toep) == ULP_MODE_RDMA, 1561 ("%s: negative window size", __func__)); 1562 } 1563 1564 tp->rcv_wnd -= len; 1565 tp->t_rcvtime = ticks; 1566 1567 if (ulp_mode(toep) == ULP_MODE_TCPDDP) 1568 DDP_LOCK(toep); 1569 so = inp_inpcbtosocket(inp); 1570 sb = &so->so_rcv; 1571 SOCKBUF_LOCK(sb); 1572 1573 if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) { 1574 CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)", 1575 __func__, tid, len); 1576 m_freem(m); 1577 SOCKBUF_UNLOCK(sb); 1578 if (ulp_mode(toep) == ULP_MODE_TCPDDP) 1579 DDP_UNLOCK(toep); 1580 INP_WUNLOCK(inp); 1581 1582 CURVNET_SET(toep->vnet); 1583 NET_EPOCH_ENTER(et); 1584 INP_WLOCK(inp); 1585 tp = tcp_drop(tp, ECONNRESET); 1586 if (tp) 1587 INP_WUNLOCK(inp); 1588 NET_EPOCH_EXIT(et); 1589 CURVNET_RESTORE(); 1590 1591 return (0); 1592 } 1593 1594 /* receive buffer autosize */ 1595 MPASS(toep->vnet == so->so_vnet); 1596 CURVNET_SET(toep->vnet); 1597 if (sb->sb_flags & SB_AUTOSIZE && 1598 V_tcp_do_autorcvbuf && 1599 sb->sb_hiwat < V_tcp_autorcvbuf_max && 1600 len > (sbspace(sb) / 8 * 7)) { 1601 unsigned int hiwat = sb->sb_hiwat; 1602 unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc, 1603 V_tcp_autorcvbuf_max); 1604 1605 if (!sbreserve_locked(sb, newsize, so, NULL)) 1606 sb->sb_flags &= ~SB_AUTOSIZE; 1607 } 1608 1609 if (ulp_mode(toep) == ULP_MODE_TCPDDP) { 1610 int changed = !(toep->ddp.flags & DDP_ON) ^ cpl->ddp_off; 1611 1612 if (toep->ddp.waiting_count != 0 || toep->ddp.active_count != 0) 1613 CTR3(KTR_CXGBE, "%s: tid %u, non-ddp rx (%d bytes)", 1614 __func__, tid, len); 1615 1616 if (changed) { 1617 if (toep->ddp.flags & DDP_SC_REQ) 1618 toep->ddp.flags ^= DDP_ON | DDP_SC_REQ; 1619 else { 1620 KASSERT(cpl->ddp_off == 1, 1621 ("%s: DDP switched on by itself.", 1622 __func__)); 1623 1624 /* Fell out of DDP mode */ 1625 toep->ddp.flags &= ~DDP_ON; 
1626 CTR1(KTR_CXGBE, "%s: fell out of DDP mode", 1627 __func__); 1628 1629 insert_ddp_data(toep, ddp_placed); 1630 } 1631 } 1632 1633 if (toep->ddp.flags & DDP_ON) { 1634 /* 1635 * CPL_RX_DATA with DDP on can only be an indicate. 1636 * Start posting queued AIO requests via DDP. The 1637 * payload that arrived in this indicate is appended 1638 * to the socket buffer as usual. 1639 */ 1640 handle_ddp_indicate(toep); 1641 } 1642 } 1643 1644 sbappendstream_locked(sb, m, 0); 1645 rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0; 1646 if (rx_credits > 0 && sbused(sb) + tp->rcv_wnd < sb->sb_lowat) { 1647 rx_credits = send_rx_credits(sc, toep, rx_credits); 1648 tp->rcv_wnd += rx_credits; 1649 tp->rcv_adv += rx_credits; 1650 } 1651 1652 if (ulp_mode(toep) == ULP_MODE_TCPDDP && toep->ddp.waiting_count > 0 && 1653 sbavail(sb) != 0) { 1654 CTR2(KTR_CXGBE, "%s: tid %u queueing AIO task", __func__, 1655 tid); 1656 ddp_queue_toep(toep); 1657 } 1658 sorwakeup_locked(so); 1659 SOCKBUF_UNLOCK_ASSERT(sb); 1660 if (ulp_mode(toep) == ULP_MODE_TCPDDP) 1661 DDP_UNLOCK(toep); 1662 1663 INP_WUNLOCK(inp); 1664 CURVNET_RESTORE(); 1665 return (0); 1666 } 1667 1668 static int 1669 do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1670 { 1671 struct adapter *sc = iq->adapter; 1672 const struct cpl_fw4_ack *cpl = (const void *)(rss + 1); 1673 unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl))); 1674 struct toepcb *toep = lookup_tid(sc, tid); 1675 struct inpcb *inp; 1676 struct tcpcb *tp; 1677 struct socket *so; 1678 uint8_t credits = cpl->credits; 1679 struct ofld_tx_sdesc *txsd; 1680 int plen; 1681 #ifdef INVARIANTS 1682 unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl))); 1683 #endif 1684 1685 /* 1686 * Very unusual case: we'd sent a flowc + abort_req for a synq entry and 1687 * now this comes back carrying the credits for the flowc. 
1688 */ 1689 if (__predict_false(toep->flags & TPF_SYNQE)) { 1690 KASSERT(toep->flags & TPF_ABORT_SHUTDOWN, 1691 ("%s: credits for a synq entry %p", __func__, toep)); 1692 return (0); 1693 } 1694 1695 inp = toep->inp; 1696 1697 KASSERT(opcode == CPL_FW4_ACK, 1698 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1699 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1700 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1701 1702 INP_WLOCK(inp); 1703 1704 if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) { 1705 INP_WUNLOCK(inp); 1706 return (0); 1707 } 1708 1709 KASSERT((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0, 1710 ("%s: inp_flags 0x%x", __func__, inp->inp_flags)); 1711 1712 tp = intotcpcb(inp); 1713 1714 if (cpl->flags & CPL_FW4_ACK_FLAGS_SEQVAL) { 1715 tcp_seq snd_una = be32toh(cpl->snd_una); 1716 1717 #ifdef INVARIANTS 1718 if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) { 1719 log(LOG_ERR, 1720 "%s: unexpected seq# %x for TID %u, snd_una %x\n", 1721 __func__, snd_una, toep->tid, tp->snd_una); 1722 } 1723 #endif 1724 1725 if (tp->snd_una != snd_una) { 1726 tp->snd_una = snd_una; 1727 tp->ts_recent_age = tcp_ts_getticks(); 1728 } 1729 } 1730 1731 #ifdef VERBOSE_TRACES 1732 CTR3(KTR_CXGBE, "%s: tid %d credits %u", __func__, tid, credits); 1733 #endif 1734 so = inp->inp_socket; 1735 txsd = &toep->txsd[toep->txsd_cidx]; 1736 plen = 0; 1737 while (credits) { 1738 KASSERT(credits >= txsd->tx_credits, 1739 ("%s: too many (or partial) credits", __func__)); 1740 credits -= txsd->tx_credits; 1741 toep->tx_credits += txsd->tx_credits; 1742 plen += txsd->plen; 1743 if (txsd->iv_buffer) { 1744 free(txsd->iv_buffer, M_CXGBE); 1745 txsd->iv_buffer = NULL; 1746 } 1747 txsd++; 1748 toep->txsd_avail++; 1749 KASSERT(toep->txsd_avail <= toep->txsd_total, 1750 ("%s: txsd avail > total", __func__)); 1751 if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) { 1752 txsd = &toep->txsd[0]; 1753 toep->txsd_cidx = 0; 1754 } 1755 } 1756 1757 if (toep->tx_credits == toep->tx_total) { 1758 toep->tx_nocompl = 0; 1759 toep->plen_nocompl = 0; 1760 } 1761 1762 if (toep->flags & TPF_TX_SUSPENDED && 1763 toep->tx_credits >= toep->tx_total / 4) { 1764 #ifdef VERBOSE_TRACES 1765 CTR2(KTR_CXGBE, "%s: tid %d calling t4_push_frames", __func__, 1766 tid); 1767 #endif 1768 toep->flags &= ~TPF_TX_SUSPENDED; 1769 CURVNET_SET(toep->vnet); 1770 t4_push_data(sc, toep, plen); 1771 CURVNET_RESTORE(); 1772 } else if (plen > 0) { 1773 struct sockbuf *sb = &so->so_snd; 1774 int sbu; 1775 1776 SOCKBUF_LOCK(sb); 1777 sbu = sbused(sb); 1778 if (ulp_mode(toep) == ULP_MODE_ISCSI) { 1779 1780 if (__predict_false(sbu > 0)) { 1781 /* 1782 * The data trasmitted before the tid's ULP mode 1783 * changed to ISCSI is still in so_snd. 1784 * Incoming credits should account for so_snd 1785 * first. 
1786 */ 1787 sbdrop_locked(sb, min(sbu, plen)); 1788 plen -= min(sbu, plen); 1789 } 1790 sowwakeup_locked(so); /* unlocks so_snd */ 1791 rqdrop_locked(&toep->ulp_pdu_reclaimq, plen); 1792 } else { 1793 #ifdef VERBOSE_TRACES 1794 CTR3(KTR_CXGBE, "%s: tid %d dropped %d bytes", __func__, 1795 tid, plen); 1796 #endif 1797 sbdrop_locked(sb, plen); 1798 if (tls_tx_key(toep) && 1799 toep->tls.mode == TLS_MODE_TLSOM) { 1800 struct tls_ofld_info *tls_ofld = &toep->tls; 1801 1802 MPASS(tls_ofld->sb_off >= plen); 1803 tls_ofld->sb_off -= plen; 1804 } 1805 if (!TAILQ_EMPTY(&toep->aiotx_jobq)) 1806 t4_aiotx_queue_toep(so, toep); 1807 sowwakeup_locked(so); /* unlocks so_snd */ 1808 } 1809 SOCKBUF_UNLOCK_ASSERT(sb); 1810 } 1811 1812 INP_WUNLOCK(inp); 1813 1814 return (0); 1815 } 1816 1817 void 1818 t4_set_tcb_field(struct adapter *sc, struct sge_wrq *wrq, struct toepcb *toep, 1819 uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie) 1820 { 1821 struct wrqe *wr; 1822 struct cpl_set_tcb_field *req; 1823 struct ofld_tx_sdesc *txsd; 1824 1825 MPASS((cookie & ~M_COOKIE) == 0); 1826 if (reply) { 1827 MPASS(cookie != CPL_COOKIE_RESERVED); 1828 } 1829 1830 wr = alloc_wrqe(sizeof(*req), wrq); 1831 if (wr == NULL) { 1832 /* XXX */ 1833 panic("%s: allocation failure.", __func__); 1834 } 1835 req = wrtod(wr); 1836 1837 INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, toep->tid); 1838 req->reply_ctrl = htobe16(V_QUEUENO(toep->ofld_rxq->iq.abs_id)); 1839 if (reply == 0) 1840 req->reply_ctrl |= htobe16(F_NO_REPLY); 1841 req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(cookie)); 1842 req->mask = htobe64(mask); 1843 req->val = htobe64(val); 1844 if ((wrq->eq.flags & EQ_TYPEMASK) == EQ_OFLD) { 1845 txsd = &toep->txsd[toep->txsd_pidx]; 1846 txsd->tx_credits = howmany(sizeof(*req), 16); 1847 txsd->plen = 0; 1848 KASSERT(toep->tx_credits >= txsd->tx_credits && 1849 toep->txsd_avail > 0, 1850 ("%s: not enough credits (%d)", __func__, 1851 toep->tx_credits)); 1852 toep->tx_credits -= txsd->tx_credits; 1853 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) 1854 toep->txsd_pidx = 0; 1855 toep->txsd_avail--; 1856 } 1857 1858 t4_wrq_tx(sc, wr); 1859 } 1860 1861 void 1862 t4_init_cpl_io_handlers(void) 1863 { 1864 1865 t4_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close); 1866 t4_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl); 1867 t4_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req); 1868 t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl, 1869 CPL_COOKIE_TOM); 1870 t4_register_cpl_handler(CPL_RX_DATA, do_rx_data); 1871 t4_register_shared_cpl_handler(CPL_FW4_ACK, do_fw4_ack, CPL_COOKIE_TOM); 1872 } 1873 1874 void 1875 t4_uninit_cpl_io_handlers(void) 1876 { 1877 1878 t4_register_cpl_handler(CPL_PEER_CLOSE, NULL); 1879 t4_register_cpl_handler(CPL_CLOSE_CON_RPL, NULL); 1880 t4_register_cpl_handler(CPL_ABORT_REQ_RSS, NULL); 1881 t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, NULL, CPL_COOKIE_TOM); 1882 t4_register_cpl_handler(CPL_RX_DATA, NULL); 1883 t4_register_shared_cpl_handler(CPL_FW4_ACK, NULL, CPL_COOKIE_TOM); 1884 } 1885 1886 /* 1887 * Use the 'backend1' field in AIO jobs to hold an error that should 1888 * be reported when the job is completed, the 'backend3' field to 1889 * store the amount of data sent by the AIO job so far, and the 1890 * 'backend4' field to hold a reference count on the job. 1891 * 1892 * Each unmapped mbuf holds a reference on the job as does the queue 1893 * so long as the job is queued. 
1894 */ 1895 #define aio_error backend1 1896 #define aio_sent backend3 1897 #define aio_refs backend4 1898 1899 #define jobtotid(job) \ 1900 (((struct toepcb *)(so_sototcpcb((job)->fd_file->f_data)->t_toe))->tid) 1901 1902 static void 1903 aiotx_free_job(struct kaiocb *job) 1904 { 1905 long status; 1906 int error; 1907 1908 if (refcount_release(&job->aio_refs) == 0) 1909 return; 1910 1911 error = (intptr_t)job->aio_error; 1912 status = job->aio_sent; 1913 #ifdef VERBOSE_TRACES 1914 CTR5(KTR_CXGBE, "%s: tid %d completed %p len %ld, error %d", __func__, 1915 jobtotid(job), job, status, error); 1916 #endif 1917 if (error != 0 && status != 0) 1918 error = 0; 1919 if (error == ECANCELED) 1920 aio_cancel(job); 1921 else if (error) 1922 aio_complete(job, -1, error); 1923 else { 1924 job->msgsnd = 1; 1925 aio_complete(job, status, 0); 1926 } 1927 } 1928 1929 static void 1930 aiotx_free_pgs(struct mbuf *m) 1931 { 1932 struct kaiocb *job; 1933 vm_page_t pg; 1934 1935 M_ASSERTEXTPG(m); 1936 job = m->m_ext.ext_arg1; 1937 #ifdef VERBOSE_TRACES 1938 CTR3(KTR_CXGBE, "%s: completed %d bytes for tid %d", __func__, 1939 m->m_len, jobtotid(job)); 1940 #endif 1941 1942 for (int i = 0; i < m->m_epg_npgs; i++) { 1943 pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]); 1944 vm_page_unwire(pg, PQ_ACTIVE); 1945 } 1946 1947 aiotx_free_job(job); 1948 } 1949 1950 /* 1951 * Allocate a chain of unmapped mbufs describing the next 'len' bytes 1952 * of an AIO job. 1953 */ 1954 static struct mbuf * 1955 alloc_aiotx_mbuf(struct kaiocb *job, int len) 1956 { 1957 struct vmspace *vm; 1958 vm_page_t pgs[MBUF_PEXT_MAX_PGS]; 1959 struct mbuf *m, *top, *last; 1960 vm_map_t map; 1961 vm_offset_t start; 1962 int i, mlen, npages, pgoff; 1963 1964 KASSERT(job->aio_sent + len <= job->uaiocb.aio_nbytes, 1965 ("%s(%p, %d): request to send beyond end of buffer", __func__, 1966 job, len)); 1967 1968 /* 1969 * The AIO subsystem will cancel and drain all requests before 1970 * permitting a process to exit or exec, so p_vmspace should 1971 * be stable here. 

/*
 * Allocate a chain of unmapped mbufs describing the next 'len' bytes
 * of an AIO job.
 */
static struct mbuf *
alloc_aiotx_mbuf(struct kaiocb *job, int len)
{
	struct vmspace *vm;
	vm_page_t pgs[MBUF_PEXT_MAX_PGS];
	struct mbuf *m, *top, *last;
	vm_map_t map;
	vm_offset_t start;
	int i, mlen, npages, pgoff;

	KASSERT(job->aio_sent + len <= job->uaiocb.aio_nbytes,
	    ("%s(%p, %d): request to send beyond end of buffer", __func__,
	    job, len));

	/*
	 * The AIO subsystem will cancel and drain all requests before
	 * permitting a process to exit or exec, so p_vmspace should
	 * be stable here.
	 */
	vm = job->userproc->p_vmspace;
	map = &vm->vm_map;
	start = (uintptr_t)job->uaiocb.aio_buf + job->aio_sent;
	pgoff = start & PAGE_MASK;

	top = NULL;
	last = NULL;
	while (len > 0) {
		mlen = imin(len, MBUF_PEXT_MAX_PGS * PAGE_SIZE - pgoff);
		KASSERT(mlen == len || ((start + mlen) & PAGE_MASK) == 0,
		    ("%s: next start (%#jx + %#x) is not page aligned",
		    __func__, (uintmax_t)start, mlen));

		npages = vm_fault_quick_hold_pages(map, start, mlen,
		    VM_PROT_WRITE, pgs, nitems(pgs));
		if (npages < 0)
			break;

		m = mb_alloc_ext_pgs(M_WAITOK, aiotx_free_pgs);
		if (m == NULL) {
			vm_page_unhold_pages(pgs, npages);
			break;
		}

		m->m_epg_1st_off = pgoff;
		m->m_epg_npgs = npages;
		if (npages == 1) {
			KASSERT(mlen + pgoff <= PAGE_SIZE,
			    ("%s: single page is too large (off %d len %d)",
			    __func__, pgoff, mlen));
			m->m_epg_last_len = mlen;
		} else {
			m->m_epg_last_len = mlen - (PAGE_SIZE - pgoff) -
			    (npages - 2) * PAGE_SIZE;
		}
		for (i = 0; i < npages; i++)
			m->m_epg_pa[i] = VM_PAGE_TO_PHYS(pgs[i]);

		m->m_len = mlen;
		m->m_ext.ext_size = npages * PAGE_SIZE;
		m->m_ext.ext_arg1 = job;
		refcount_acquire(&job->aio_refs);

#ifdef VERBOSE_TRACES
		CTR5(KTR_CXGBE, "%s: tid %d, new mbuf %p for job %p, npages %d",
		    __func__, jobtotid(job), m, job, npages);
#endif

		if (top == NULL)
			top = m;
		else
			last->m_next = m;
		last = m;

		len -= mlen;
		start += mlen;
		pgoff = 0;
	}

	return (top);
}
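
/*
 * Try to send the next chunk of an AIO write request on its socket.  This
 * open-codes the relevant pieces of sosend_generic() and tcp_usr_send() so
 * that wired user pages can be appended to so_snd as unmapped mbufs instead
 * of being copied.  The job is requeued if the socket buffer is too full
 * (or, for a blocking socket, if the request is not yet complete);
 * otherwise the queue's reference on the job is dropped before returning.
 * Called, and returns, with so_snd locked.
 */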

static void
t4_aiotx_process_job(struct toepcb *toep, struct socket *so, struct kaiocb *job)
{
	struct sockbuf *sb;
	struct file *fp;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct mbuf *m;
	int error, len;
	bool moretocome, sendmore;

	sb = &so->so_snd;
	SOCKBUF_UNLOCK(sb);
	fp = job->fd_file;
	m = NULL;

#ifdef MAC
	error = mac_socket_check_send(fp->f_cred, so);
	if (error != 0)
		goto out;
#endif

	/* Inline sosend_generic(). */

	error = sblock(sb, SBL_WAIT);
	MPASS(error == 0);

sendanother:
	SOCKBUF_LOCK(sb);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		if ((so->so_options & SO_NOSIGPIPE) == 0) {
			PROC_LOCK(job->userproc);
			kern_psignal(job->userproc, SIGPIPE);
			PROC_UNLOCK(job->userproc);
		}
		error = EPIPE;
		goto out;
	}
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		goto out;
	}
	if ((so->so_state & SS_ISCONNECTED) == 0) {
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		error = ENOTCONN;
		goto out;
	}
	if (sbspace(sb) < sb->sb_lowat) {
		MPASS(job->aio_sent == 0 || !(so->so_state & SS_NBIO));

		/*
		 * Don't block if there is too little room in the socket
		 * buffer.  Instead, requeue the request.
		 */
		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
			SOCKBUF_UNLOCK(sb);
			sbunlock(sb);
			error = ECANCELED;
			goto out;
		}
		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		goto out;
	}

	/*
	 * Write as much data as the socket permits, but no more than a
	 * single sndbuf at a time.
	 */
	len = sbspace(sb);
	if (len > job->uaiocb.aio_nbytes - job->aio_sent) {
		len = job->uaiocb.aio_nbytes - job->aio_sent;
		moretocome = false;
	} else
		moretocome = true;
	if (len > toep->params.sndbuf) {
		len = toep->params.sndbuf;
		sendmore = true;
	} else
		sendmore = false;

	if (!TAILQ_EMPTY(&toep->aiotx_jobq))
		moretocome = true;
	SOCKBUF_UNLOCK(sb);
	MPASS(len != 0);

	m = alloc_aiotx_mbuf(job, len);
	if (m == NULL) {
		sbunlock(sb);
		error = EFAULT;
		goto out;
	}

	/* Inlined tcp_usr_send(). */

	inp = toep->inp;
	INP_WLOCK(inp);
	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
		INP_WUNLOCK(inp);
		sbunlock(sb);
		error = ECONNRESET;
		goto out;
	}

	job->aio_sent += m_length(m, NULL);

	sbappendstream(sb, m, 0);
	m = NULL;

	if (!(inp->inp_flags & INP_DROPPED)) {
		tp = intotcpcb(inp);
		if (moretocome)
			tp->t_flags |= TF_MORETOCOME;
		error = tp->t_fb->tfb_tcp_output(tp);
		if (moretocome)
			tp->t_flags &= ~TF_MORETOCOME;
	}

	INP_WUNLOCK(inp);
	if (sendmore)
		goto sendanother;
	sbunlock(sb);

	if (error)
		goto out;

	/*
	 * If this is a blocking socket and the request has not been
	 * fully completed, requeue it until the socket is ready
	 * again.
	 */
	if (job->aio_sent < job->uaiocb.aio_nbytes &&
	    !(so->so_state & SS_NBIO)) {
		SOCKBUF_LOCK(sb);
		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
			SOCKBUF_UNLOCK(sb);
			error = ECANCELED;
			goto out;
		}
		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
		return;
	}

	/*
	 * If the request will not be requeued, drop the queue's
	 * reference to the job.  Any mbufs in flight should still
	 * hold a reference, but this drops the reference that the
	 * queue owns while it is waiting to queue mbufs to the
	 * socket.
	 */
	aiotx_free_job(job);

out:
	if (error) {
		job->aio_error = (void *)(intptr_t)error;
		aiotx_free_job(job);
	}
	if (m != NULL)
		m_free(m);
	SOCKBUF_LOCK(sb);
}

static void
t4_aiotx_task(void *context, int pending)
{
	struct toepcb *toep = context;
	struct socket *so;
	struct kaiocb *job;

	so = toep->aiotx_so;
	CURVNET_SET(toep->vnet);
	SOCKBUF_LOCK(&so->so_snd);
	while (!TAILQ_EMPTY(&toep->aiotx_jobq) && sowriteable(so)) {
		job = TAILQ_FIRST(&toep->aiotx_jobq);
		TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
		if (!aio_clear_cancel_function(job))
			continue;

		t4_aiotx_process_job(toep, so, job);
	}
	toep->aiotx_so = NULL;
	SOCKBUF_UNLOCK(&so->so_snd);
	CURVNET_RESTORE();

	free_toepcb(toep);
	SOCK_LOCK(so);
	sorele(so);
}
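
/*
 * Schedule the aiotx task for this connection if it is not already
 * pending.  The task owns a socket reference and a toepcb hold while it
 * is outstanding; both are released at the end of t4_aiotx_task().
 */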
"true" : "false"); 2239 #endif 2240 if (toep->aiotx_so != NULL) 2241 return; 2242 soref(so); 2243 toep->aiotx_so = so; 2244 hold_toepcb(toep); 2245 soaio_enqueue(&toep->aiotx_task); 2246 } 2247 2248 static void 2249 t4_aiotx_cancel(struct kaiocb *job) 2250 { 2251 struct socket *so; 2252 struct sockbuf *sb; 2253 struct tcpcb *tp; 2254 struct toepcb *toep; 2255 2256 so = job->fd_file->f_data; 2257 tp = so_sototcpcb(so); 2258 toep = tp->t_toe; 2259 MPASS(job->uaiocb.aio_lio_opcode == LIO_WRITE); 2260 sb = &so->so_snd; 2261 2262 SOCKBUF_LOCK(sb); 2263 if (!aio_cancel_cleared(job)) 2264 TAILQ_REMOVE(&toep->aiotx_jobq, job, list); 2265 SOCKBUF_UNLOCK(sb); 2266 2267 job->aio_error = (void *)(intptr_t)ECANCELED; 2268 aiotx_free_job(job); 2269 } 2270 2271 int 2272 t4_aio_queue_aiotx(struct socket *so, struct kaiocb *job) 2273 { 2274 struct tcpcb *tp = so_sototcpcb(so); 2275 struct toepcb *toep = tp->t_toe; 2276 struct adapter *sc = td_adapter(toep->td); 2277 2278 /* This only handles writes. */ 2279 if (job->uaiocb.aio_lio_opcode != LIO_WRITE) 2280 return (EOPNOTSUPP); 2281 2282 if (!sc->tt.tx_zcopy) 2283 return (EOPNOTSUPP); 2284 2285 if (tls_tx_key(toep)) 2286 return (EOPNOTSUPP); 2287 2288 SOCKBUF_LOCK(&so->so_snd); 2289 #ifdef VERBOSE_TRACES 2290 CTR3(KTR_CXGBE, "%s: queueing %p for tid %u", __func__, job, toep->tid); 2291 #endif 2292 if (!aio_set_cancel_function(job, t4_aiotx_cancel)) 2293 panic("new job was cancelled"); 2294 refcount_init(&job->aio_refs, 1); 2295 TAILQ_INSERT_TAIL(&toep->aiotx_jobq, job, list); 2296 if (sowriteable(so)) 2297 t4_aiotx_queue_toep(so, toep); 2298 SOCKBUF_UNLOCK(&so->so_snd); 2299 return (0); 2300 } 2301 2302 void 2303 aiotx_init_toep(struct toepcb *toep) 2304 { 2305 2306 TAILQ_INIT(&toep->aiotx_jobq); 2307 TASK_INIT(&toep->aiotx_task, 0, t4_aiotx_task, toep); 2308 } 2309 #endif 2310