/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2012, 2015 Chelsio Communications, Inc.
 * All rights reserved.
 * Written by: Navdeep Parhar <np@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_kern_tls.h"
#include "opt_ratelimit.h"

#ifdef TCP_OFFLOAD
#include <sys/param.h>
#include <sys/aio.h>
#include <sys/file.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/module.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sglist.h>
#include <sys/taskqueue.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#define TCPSTATES
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_var.h>
#include <netinet/toecore.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>

#include "common/common.h"
#include "common/t4_msg.h"
#include "common/t4_regs.h"
#include "common/t4_tcb.h"
#include "tom/t4_tom_l2t.h"
#include "tom/t4_tom.h"

static void t4_aiotx_cancel(struct kaiocb *job);
static void t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep);

void
send_flowc_wr(struct toepcb *toep, struct tcpcb *tp)
{
	struct wrqe *wr;
	struct fw_flowc_wr *flowc;
	unsigned int nparams, flowclen, paramidx;
	struct vi_info *vi = toep->vi;
	struct port_info *pi = vi->pi;
	struct adapter *sc = pi->adapter;
	unsigned int pfvf = sc->pf << S_FW_VIID_PFN;
	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];

	KASSERT(!(toep->flags & TPF_FLOWC_WR_SENT),
	    ("%s: flowc for tid %u sent already", __func__, toep->tid));

	if (tp != NULL)
		nparams = 8;
	else
		nparams = 6;
	if (ulp_mode(toep) == ULP_MODE_TLS)
		nparams++;
	if (toep->tls.fcplenmax != 0)
		nparams++;
	if (toep->params.tc_idx != -1) {
		MPASS(toep->params.tc_idx >= 0 &&
		    toep->params.tc_idx < sc->chip_params->nsched_cls);
		nparams++;
	}

	flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);

	wr = alloc_wrqe(roundup2(flowclen, 16), &toep->ofld_txq->wrq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	flowc = wrtod(wr);
	memset(flowc, 0, wr->wr_len);

	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
	    V_FW_FLOWC_WR_NPARAMS(nparams));
	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
	    V_FW_WR_FLOWID(toep->tid));

#define FLOWC_PARAM(__m, __v) \
	do { \
		flowc->mnemval[paramidx].mnemonic = FW_FLOWC_MNEM_##__m; \
		flowc->mnemval[paramidx].val = htobe32(__v); \
		paramidx++; \
	} while (0)

	paramidx = 0;

	FLOWC_PARAM(PFNVFN, pfvf);
	FLOWC_PARAM(CH, pi->tx_chan);
	FLOWC_PARAM(PORT, pi->tx_chan);
	FLOWC_PARAM(IQID, toep->ofld_rxq->iq.abs_id);
	FLOWC_PARAM(SNDBUF, toep->params.sndbuf);
	if (tp) {
		FLOWC_PARAM(MSS, toep->params.emss);
		FLOWC_PARAM(SNDNXT, tp->snd_nxt);
		FLOWC_PARAM(RCVNXT, tp->rcv_nxt);
	} else
		FLOWC_PARAM(MSS, 512);
	CTR6(KTR_CXGBE,
	    "%s: tid %u, mss %u, sndbuf %u, snd_nxt 0x%x, rcv_nxt 0x%x",
	    __func__, toep->tid, toep->params.emss, toep->params.sndbuf,
	    tp ? tp->snd_nxt : 0, tp ? tp->rcv_nxt : 0);

	if (ulp_mode(toep) == ULP_MODE_TLS)
		FLOWC_PARAM(ULP_MODE, ulp_mode(toep));
	if (toep->tls.fcplenmax != 0)
		FLOWC_PARAM(TXDATAPLEN_MAX, toep->tls.fcplenmax);
	if (toep->params.tc_idx != -1)
		FLOWC_PARAM(SCHEDCLASS, toep->params.tc_idx);
#undef FLOWC_PARAM

	KASSERT(paramidx == nparams, ("nparams mismatch"));

	txsd->tx_credits = howmany(flowclen, 16);
	txsd->plen = 0;
	KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0,
	    ("%s: not enough credits (%d)", __func__, toep->tx_credits));
	toep->tx_credits -= txsd->tx_credits;
	if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
		toep->txsd_pidx = 0;
	toep->txsd_avail--;

	toep->flags |= TPF_FLOWC_WR_SENT;
	t4_wrq_tx(sc, wr);
}
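
/*
 * A note on sizing, for illustration only (the byte counts are assumptions
 * based on the firmware interface definitions, not taken from this file):
 * struct fw_flowc_wr is 8 bytes and each fw_flowc_mnemval is 8 bytes, so a
 * flowc carrying 9 parameters is 8 + 9 * 8 = 80 bytes and costs
 * howmany(80, 16) = 5 tx credits plus one txsd entry, which is exactly what
 * the accounting at the end of send_flowc_wr() records.
 */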

#ifdef RATELIMIT
/*
 * Input is Bytes/second (so_max_pacing_rate), chip counts in Kilobits/second.
 */
static int
update_tx_rate_limit(struct adapter *sc, struct toepcb *toep, u_int Bps)
{
	int tc_idx, rc;
	const u_int kbps = (u_int) (uint64_t)Bps * 8ULL / 1000;
	const int port_id = toep->vi->pi->port_id;

	CTR3(KTR_CXGBE, "%s: tid %u, rate %uKbps", __func__, toep->tid, kbps);

	if (kbps == 0) {
		/* unbind */
		tc_idx = -1;
	} else {
		rc = t4_reserve_cl_rl_kbps(sc, port_id, kbps, &tc_idx);
		if (rc != 0)
			return (rc);
		MPASS(tc_idx >= 0 && tc_idx < sc->chip_params->nsched_cls);
	}

	if (toep->params.tc_idx != tc_idx) {
		struct wrqe *wr;
		struct fw_flowc_wr *flowc;
		int nparams = 1, flowclen, flowclen16;
		struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];

		flowclen = sizeof(*flowc) + nparams * sizeof(struct
		    fw_flowc_mnemval);
		flowclen16 = howmany(flowclen, 16);
		if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0 ||
		    (wr = alloc_wrqe(roundup2(flowclen, 16),
		    &toep->ofld_txq->wrq)) == NULL) {
			if (tc_idx >= 0)
				t4_release_cl_rl(sc, port_id, tc_idx);
			return (ENOMEM);
		}

		flowc = wrtod(wr);
		memset(flowc, 0, wr->wr_len);

		flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
		    V_FW_FLOWC_WR_NPARAMS(nparams));
		flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) |
		    V_FW_WR_FLOWID(toep->tid));

		flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS;
		if (tc_idx == -1)
			flowc->mnemval[0].val = htobe32(0xff);
		else
			flowc->mnemval[0].val = htobe32(tc_idx);

		txsd->tx_credits = flowclen16;
		txsd->plen = 0;
		toep->tx_credits -= txsd->tx_credits;
		if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
			toep->txsd_pidx = 0;
		toep->txsd_avail--;
		t4_wrq_tx(sc, wr);
	}

	if (toep->params.tc_idx >= 0)
		t4_release_cl_rl(sc, port_id, toep->params.tc_idx);
	toep->params.tc_idx = tc_idx;

	return (0);
}
#endif

void
send_reset(struct adapter *sc, struct toepcb *toep, uint32_t snd_nxt)
{
	struct wrqe *wr;
	struct cpl_abort_req *req;
	int tid = toep->tid;
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = intotcpcb(inp);	/* don't use if INP_DROPPED */

	INP_WLOCK_ASSERT(inp);

	CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x%s",
	    __func__, toep->tid,
	    inp->inp_flags & INP_DROPPED ? "inp dropped" :
	    tcpstates[tp->t_state],
	    toep->flags, inp->inp_flags,
	    toep->flags & TPF_ABORT_SHUTDOWN ?
	    " (abort already in progress)" : "");

	if (toep->flags & TPF_ABORT_SHUTDOWN)
		return;	/* abort already in progress */

	toep->flags |= TPF_ABORT_SHUTDOWN;

	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
	    ("%s: flowc_wr not sent for tid %d.", __func__, tid));

	wr = alloc_wrqe(sizeof(*req), &toep->ofld_txq->wrq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	req = wrtod(wr);

	INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid);
	if (inp->inp_flags & INP_DROPPED)
		req->rsvd0 = htobe32(snd_nxt);
	else
		req->rsvd0 = htobe32(tp->snd_nxt);
	req->rsvd1 = !(toep->flags & TPF_TX_DATA_SENT);
	req->cmd = CPL_ABORT_SEND_RST;

	/*
	 * XXX: What's the correct way to tell that the inp hasn't been detached
	 * from its socket?  Should I even be flushing the snd buffer here?
	 */
	if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) {
		struct socket *so = inp->inp_socket;

		if (so != NULL)	/* because I'm not sure.  See comment above */
			sbflush(&so->so_snd);
	}

	t4_l2t_send(sc, wr, toep->l2te);
}

/*
 * Called when a connection is established to translate the TCP options
 * reported by HW to FreeBSD's native format.
 */
static void
assign_rxopt(struct tcpcb *tp, uint16_t opt)
{
	struct toepcb *toep = tp->t_toe;
	struct inpcb *inp = tp->t_inpcb;
	struct adapter *sc = td_adapter(toep->td);

	INP_LOCK_ASSERT(inp);

	toep->params.mtu_idx = G_TCPOPT_MSS(opt);
	tp->t_maxseg = sc->params.mtus[toep->params.mtu_idx];
	if (inp->inp_inc.inc_flags & INC_ISIPV6)
		tp->t_maxseg -= sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
	else
		tp->t_maxseg -= sizeof(struct ip) + sizeof(struct tcphdr);

	toep->params.emss = tp->t_maxseg;
	if (G_TCPOPT_TSTAMP(opt)) {
		toep->params.tstamp = 1;
		toep->params.emss -= TCPOLEN_TSTAMP_APPA;
		tp->t_flags |= TF_RCVD_TSTMP;	/* timestamps ok */
		tp->ts_recent = 0;		/* hmmm */
		tp->ts_recent_age = tcp_ts_getticks();
	} else
		toep->params.tstamp = 0;

	if (G_TCPOPT_SACK(opt)) {
		toep->params.sack = 1;
		tp->t_flags |= TF_SACK_PERMIT;	/* should already be set */
	} else {
		toep->params.sack = 0;
		tp->t_flags &= ~TF_SACK_PERMIT;	/* sack disallowed by peer */
	}

	if (G_TCPOPT_WSCALE_OK(opt))
		tp->t_flags |= TF_RCVD_SCALE;

	/* Doing window scaling? */
	if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
	    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
		tp->rcv_scale = tp->request_r_scale;
		tp->snd_scale = G_TCPOPT_SND_WSCALE(opt);
	} else
		toep->params.wscale = 0;

	CTR6(KTR_CXGBE,
	    "assign_rxopt: tid %d, mtu_idx %u, emss %u, ts %u, sack %u, wscale %u",
	    toep->tid, toep->params.mtu_idx, toep->params.emss,
	    toep->params.tstamp, toep->params.sack, toep->params.wscale);
}
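
/*
 * For illustration (the MTU value is an assumption, not read from the MTU
 * table): on an IPv4 connection whose selected MTU entry is 1500,
 * assign_rxopt() above computes t_maxseg = 1500 - 40 = 1460, and if
 * timestamps were negotiated the effective emss is further reduced by
 * TCPOLEN_TSTAMP_APPA (12) to 1448.
 */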

/*
 * Completes some final bits of initialization for just established connections
 * and changes their state to TCPS_ESTABLISHED.
 *
 * The ISNs are from the exchange of SYNs.
 */
void
make_established(struct toepcb *toep, uint32_t iss, uint32_t irs, uint16_t opt)
{
	struct inpcb *inp = toep->inp;
	struct socket *so = inp->inp_socket;
	struct tcpcb *tp = intotcpcb(inp);
	uint16_t tcpopt = be16toh(opt);

	INP_WLOCK_ASSERT(inp);
	KASSERT(tp->t_state == TCPS_SYN_SENT ||
	    tp->t_state == TCPS_SYN_RECEIVED,
	    ("%s: TCP state %s", __func__, tcpstates[tp->t_state]));

	CTR6(KTR_CXGBE, "%s: tid %d, so %p, inp %p, tp %p, toep %p",
	    __func__, toep->tid, so, inp, tp, toep);

	tcp_state_change(tp, TCPS_ESTABLISHED);
	tp->t_starttime = ticks;
	TCPSTAT_INC(tcps_connects);

	tp->irs = irs;
	tcp_rcvseqinit(tp);
	tp->rcv_wnd = (u_int)toep->params.opt0_bufsize << 10;
	tp->rcv_adv += tp->rcv_wnd;
	tp->last_ack_sent = tp->rcv_nxt;

	tp->iss = iss;
	tcp_sendseqinit(tp);
	tp->snd_una = iss + 1;
	tp->snd_nxt = iss + 1;
	tp->snd_max = iss + 1;

	assign_rxopt(tp, tcpopt);
	send_flowc_wr(toep, tp);

	soisconnected(so);

	if (ulp_mode(toep) == ULP_MODE_TLS)
		tls_establish(toep);
}

int
send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits)
{
	struct wrqe *wr;
	struct cpl_rx_data_ack *req;
	uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);

	KASSERT(credits >= 0, ("%s: %d credits", __func__, credits));

	wr = alloc_wrqe(sizeof(*req), toep->ctrlq);
	if (wr == NULL)
		return (0);
	req = wrtod(wr);

	INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid);
	req->credit_dack = htobe32(dack | V_RX_CREDITS(credits));

	t4_wrq_tx(sc, wr);
	return (credits);
}

void
send_rx_modulate(struct adapter *sc, struct toepcb *toep)
{
	struct wrqe *wr;
	struct cpl_rx_data_ack *req;

	wr = alloc_wrqe(sizeof(*req), toep->ctrlq);
	if (wr == NULL)
		return;
	req = wrtod(wr);

	INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid);
	req->credit_dack = htobe32(F_RX_MODULATE_RX);

	t4_wrq_tx(sc, wr);
}

void
t4_rcvd_locked(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
	struct sockbuf *sb = &so->so_rcv;
	struct toepcb *toep = tp->t_toe;
	int rx_credits;

	INP_WLOCK_ASSERT(inp);
	SOCKBUF_LOCK_ASSERT(sb);

	rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0;
	if (rx_credits > 0 &&
	    (tp->rcv_wnd <= 32 * 1024 || rx_credits >= 64 * 1024 ||
	    (rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) ||
	    sbused(sb) + tp->rcv_wnd < sb->sb_lowat)) {
		rx_credits = send_rx_credits(sc, toep, rx_credits);
		tp->rcv_wnd += rx_credits;
		tp->rcv_adv += rx_credits;
	} else if (toep->flags & TPF_FORCE_CREDITS)
		send_rx_modulate(sc, toep);
}

void
t4_rcvd(struct toedev *tod, struct tcpcb *tp)
{
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
	struct sockbuf *sb = &so->so_rcv;

	SOCKBUF_LOCK(sb);
	t4_rcvd_locked(tod, tp);
	SOCKBUF_UNLOCK(sb);
}
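
/*
 * The condition in t4_rcvd_locked() above is a heuristic for when returning
 * receive window to the chip is worthwhile: always once the window has
 * shrunk to 32KB or less, when at least 64KB can be returned, when at least
 * 16KB can be returned and the window is no larger than 128KB, or when the
 * update is needed to satisfy the receive low water mark.  If none of those
 * hold but credits were being forced earlier (TPF_FORCE_CREDITS), a
 * CPL_RX_DATA_ACK with F_RX_MODULATE_RX is sent instead.
 */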

/*
 * Close a connection by sending a CPL_CLOSE_CON_REQ message.
 */
int
t4_close_conn(struct adapter *sc, struct toepcb *toep)
{
	struct wrqe *wr;
	struct cpl_close_con_req *req;
	unsigned int tid = toep->tid;

	CTR3(KTR_CXGBE, "%s: tid %u%s", __func__, toep->tid,
	    toep->flags & TPF_FIN_SENT ? ", IGNORED" : "");

	if (toep->flags & TPF_FIN_SENT)
		return (0);

	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
	    ("%s: flowc_wr not sent for tid %u.", __func__, tid));

	wr = alloc_wrqe(sizeof(*req), &toep->ofld_txq->wrq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	req = wrtod(wr);

	req->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) |
	    V_FW_WR_IMMDLEN(sizeof(*req) - sizeof(req->wr)));
	req->wr.wr_mid = htonl(V_FW_WR_LEN16(howmany(sizeof(*req), 16)) |
	    V_FW_WR_FLOWID(tid));
	req->wr.wr_lo = cpu_to_be64(0);
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
	req->rsvd = 0;

	toep->flags |= TPF_FIN_SENT;
	toep->flags &= ~TPF_SEND_FIN;
	t4_l2t_send(sc, wr, toep->l2te);

	return (0);
}

#define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16)
#define MIN_OFLD_TX_CREDITS (howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16))

/* Maximum amount of immediate data we could stuff in a WR */
static inline int
max_imm_payload(int tx_credits)
{
	const int n = 1;	/* Use no more than one desc for imm. data WR */

	KASSERT(tx_credits >= 0 &&
	    tx_credits <= MAX_OFLD_TX_CREDITS,
	    ("%s: %d credits", __func__, tx_credits));

	if (tx_credits < MIN_OFLD_TX_CREDITS)
		return (0);

	if (tx_credits >= (n * EQ_ESIZE) / 16)
		return ((n * EQ_ESIZE) - sizeof(struct fw_ofld_tx_data_wr));
	else
		return (tx_credits * 16 - sizeof(struct fw_ofld_tx_data_wr));
}

/* Maximum number of SGL entries we could stuff in a WR */
static inline int
max_dsgl_nsegs(int tx_credits)
{
	int nseg = 1;	/* ulptx_sgl has room for 1, rest ulp_tx_sge_pair */
	int sge_pair_credits = tx_credits - MIN_OFLD_TX_CREDITS;

	KASSERT(tx_credits >= 0 &&
	    tx_credits <= MAX_OFLD_TX_CREDITS,
	    ("%s: %d credits", __func__, tx_credits));

	if (tx_credits < MIN_OFLD_TX_CREDITS)
		return (0);

	nseg += 2 * (sge_pair_credits * 16 / 24);
	if ((sge_pair_credits * 16) % 24 == 16)
		nseg++;

	return (nseg);
}

static inline void
write_tx_wr(void *dst, struct toepcb *toep, unsigned int immdlen,
    unsigned int plen, uint8_t credits, int shove, int ulp_submode)
{
	struct fw_ofld_tx_data_wr *txwr = dst;

	txwr->op_to_immdlen = htobe32(V_WR_OP(FW_OFLD_TX_DATA_WR) |
	    V_FW_WR_IMMDLEN(immdlen));
	txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) |
	    V_FW_WR_LEN16(credits));
	txwr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(ulp_mode(toep)) |
	    V_TX_ULP_SUBMODE(ulp_submode) | V_TX_URG(0) | V_TX_SHOVE(shove));
	txwr->plen = htobe32(plen);

	if (toep->params.tx_align > 0) {
		if (plen < 2 * toep->params.emss)
			txwr->lsodisable_to_flags |=
			    htobe32(F_FW_OFLD_TX_DATA_WR_LSODISABLE);
		else
			txwr->lsodisable_to_flags |=
			    htobe32(F_FW_OFLD_TX_DATA_WR_ALIGNPLD |
				(toep->params.nagle == 0 ? 0 :
				F_FW_OFLD_TX_DATA_WR_ALIGNPLDSHOVE));
	}
}

/*
 * Generate a DSGL from a starting mbuf.  The total number of segments and the
 * maximum segments in any one mbuf are provided.
 */
static void
write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n)
{
	struct mbuf *m;
	struct ulptx_sgl *usgl = dst;
	int i, j, rc;
	struct sglist sg;
	struct sglist_seg segs[n];

	KASSERT(nsegs > 0, ("%s: nsegs 0", __func__));

	sglist_init(&sg, n, segs);
	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
	    V_ULPTX_NSGE(nsegs));

	i = -1;
	for (m = start; m != stop; m = m->m_next) {
		if (m->m_flags & M_EXTPG)
			rc = sglist_append_mbuf_epg(&sg, m,
			    mtod(m, vm_offset_t), m->m_len);
		else
			rc = sglist_append(&sg, mtod(m, void *), m->m_len);
		if (__predict_false(rc != 0))
			panic("%s: sglist_append %d", __func__, rc);

		for (j = 0; j < sg.sg_nseg; i++, j++) {
			if (i < 0) {
				usgl->len0 = htobe32(segs[j].ss_len);
				usgl->addr0 = htobe64(segs[j].ss_paddr);
			} else {
				usgl->sge[i / 2].len[i & 1] =
				    htobe32(segs[j].ss_len);
				usgl->sge[i / 2].addr[i & 1] =
				    htobe64(segs[j].ss_paddr);
			}
#ifdef INVARIANTS
			nsegs--;
#endif
		}
		sglist_reset(&sg);
	}
	if (i & 1)
		usgl->sge[i / 2].len[1] = htobe32(0);
	KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, stop %p",
	    __func__, nsegs, start, stop));
}

/*
 * Max number of SGL entries an offload tx work request can have. This is 41
 * (1 + 40) for a full 512B work request.
 * fw_ofld_tx_data_wr(16B) + ulptx_sgl(16B, 1) + ulptx_sge_pair(480B, 40)
 */
#define OFLD_SGL_LEN (41)
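
/*
 * Worked example, assuming EQ_ESIZE is 64 and SGE_MAX_WR_LEN is 512 (so
 * MAX_OFLD_TX_CREDITS is 32 and MIN_OFLD_TX_CREDITS works out to 2): with a
 * full set of credits max_dsgl_nsegs() returns 1 + 2 * (30 * 16 / 24) = 41,
 * which is exactly OFLD_SGL_LEN, while max_imm_payload() caps immediate
 * data at one descriptor minus the fw_ofld_tx_data_wr header.  The matching
 * DSGL work request length used by t4_push_frames() and
 * write_iscsi_mbuf_wr() is sizeof(*txwr) + sizeof(struct ulptx_sgl) for the
 * header plus the first SGL entry, with every additional pair of entries
 * costing a 24-byte ulptx_sge_pair and a trailing odd entry costing 16
 * bytes, i.e. the ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8 term.
 */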

/*
 * Send data and/or a FIN to the peer.
 *
 * The socket's so_snd buffer consists of a stream of data starting with sb_mb
 * and linked together with m_next.  sb_sndptr, if set, is the last mbuf that
 * was transmitted.
 *
 * drop indicates the number of bytes that should be dropped from the head of
 * the send buffer.  It is an optimization that lets do_fw4_ack avoid creating
 * contention on the send buffer lock (before this change it used to do
 * sowwakeup and then t4_push_frames right after that when recovering from tx
 * stalls).  When drop is set this function MUST drop the bytes and wake up any
 * writers.
 */
void
t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop)
{
	struct mbuf *sndptr, *m, *sb_sndptr;
	struct fw_ofld_tx_data_wr *txwr;
	struct wrqe *wr;
	u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = intotcpcb(inp);
	struct socket *so = inp->inp_socket;
	struct sockbuf *sb = &so->so_snd;
	int tx_credits, shove, compl, sowwakeup;
	struct ofld_tx_sdesc *txsd;
	bool nomap_mbuf_seen;

	INP_WLOCK_ASSERT(inp);
	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));

	KASSERT(ulp_mode(toep) == ULP_MODE_NONE ||
	    ulp_mode(toep) == ULP_MODE_TCPDDP ||
	    ulp_mode(toep) == ULP_MODE_TLS ||
	    ulp_mode(toep) == ULP_MODE_RDMA,
	    ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep));

#ifdef VERBOSE_TRACES
	CTR5(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d",
	    __func__, toep->tid, toep->flags, tp->t_flags, drop);
#endif
	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
		return;

#ifdef RATELIMIT
	if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) &&
	    (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) {
		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
	}
#endif

	/*
	 * This function doesn't resume by itself.  Someone else must clear the
	 * flag and call this function.
	 */
	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
		KASSERT(drop == 0,
		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
		return;
	}

	txsd = &toep->txsd[toep->txsd_pidx];
	do {
		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
		max_imm = max_imm_payload(tx_credits);
		max_nsegs = max_dsgl_nsegs(tx_credits);

		SOCKBUF_LOCK(sb);
		sowwakeup = drop;
		if (drop) {
			sbdrop_locked(sb, drop);
			drop = 0;
		}
		sb_sndptr = sb->sb_sndptr;
		sndptr = sb_sndptr ? sb_sndptr->m_next : sb->sb_mb;
		plen = 0;
		nsegs = 0;
		max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
		nomap_mbuf_seen = false;
		for (m = sndptr; m != NULL; m = m->m_next) {
			int n;

			if ((m->m_flags & M_NOTAVAIL) != 0)
				break;
			if (m->m_flags & M_EXTPG) {
#ifdef KERN_TLS
				if (m->m_epg_tls != NULL) {
					toep->flags |= TPF_KTLS;
					if (plen == 0) {
						SOCKBUF_UNLOCK(sb);
						t4_push_ktls(sc, toep, 0);
						return;
					}
					break;
				}
#endif
				n = sglist_count_mbuf_epg(m,
				    mtod(m, vm_offset_t), m->m_len);
			} else
				n = sglist_count(mtod(m, void *), m->m_len);

			nsegs += n;
			plen += m->m_len;

			/* This mbuf sent us _over_ the nsegs limit, back out */
			if (plen > max_imm && nsegs > max_nsegs) {
				nsegs -= n;
				plen -= m->m_len;
				if (plen == 0) {
					/* Too few credits */
					toep->flags |= TPF_TX_SUSPENDED;
					if (sowwakeup) {
						if (!TAILQ_EMPTY(
						    &toep->aiotx_jobq))
							t4_aiotx_queue_toep(so,
							    toep);
						sowwakeup_locked(so);
					} else
						SOCKBUF_UNLOCK(sb);
					SOCKBUF_UNLOCK_ASSERT(sb);
					return;
				}
				break;
			}

			if (m->m_flags & M_EXTPG)
				nomap_mbuf_seen = true;
			if (max_nsegs_1mbuf < n)
				max_nsegs_1mbuf = n;
			sb_sndptr = m;	/* new sb->sb_sndptr if all goes well */

			/* This mbuf put us right at the max_nsegs limit */
			if (plen > max_imm && nsegs == max_nsegs) {
				m = m->m_next;
				break;
			}
		}

		if (sbused(sb) > sb->sb_hiwat * 5 / 8 &&
		    toep->plen_nocompl + plen >= sb->sb_hiwat / 4)
			compl = 1;
		else
			compl = 0;

		if (sb->sb_flags & SB_AUTOSIZE &&
		    V_tcp_do_autosndbuf &&
		    sb->sb_hiwat < V_tcp_autosndbuf_max &&
		    sbused(sb) >= sb->sb_hiwat * 7 / 8) {
			int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc,
			    V_tcp_autosndbuf_max);

			if (!sbreserve_locked(sb, newsize, so, NULL))
				sb->sb_flags &= ~SB_AUTOSIZE;
			else
				sowwakeup = 1;	/* room available */
		}
		if (sowwakeup) {
			if (!TAILQ_EMPTY(&toep->aiotx_jobq))
				t4_aiotx_queue_toep(so, toep);
			sowwakeup_locked(so);
		} else
			SOCKBUF_UNLOCK(sb);
		SOCKBUF_UNLOCK_ASSERT(sb);

		/* nothing to send */
		if (plen == 0) {
			KASSERT(m == NULL || (m->m_flags & M_NOTAVAIL) != 0,
			    ("%s: nothing to send, but m != NULL is ready",
			    __func__));
			break;
		}

		if (__predict_false(toep->flags & TPF_FIN_SENT))
			panic("%s: excess tx.", __func__);

		shove = m == NULL && !(tp->t_flags & TF_MORETOCOME);
		if (plen <= max_imm && !nomap_mbuf_seen) {

			/* Immediate data tx */

			wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16),
			    &toep->ofld_txq->wrq);
			if (wr == NULL) {
				/* XXX: how will we recover from this? */
				toep->flags |= TPF_TX_SUSPENDED;
				return;
			}
			txwr = wrtod(wr);
			credits = howmany(wr->wr_len, 16);
			write_tx_wr(txwr, toep, plen, plen, credits, shove, 0);
			m_copydata(sndptr, 0, plen, (void *)(txwr + 1));
			nsegs = 0;
		} else {
			int wr_len;

			/* DSGL tx */

			wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) +
			    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
			wr = alloc_wrqe(roundup2(wr_len, 16),
			    &toep->ofld_txq->wrq);
			if (wr == NULL) {
				/* XXX: how will we recover from this? */
				toep->flags |= TPF_TX_SUSPENDED;
				return;
			}
			txwr = wrtod(wr);
			credits = howmany(wr_len, 16);
			write_tx_wr(txwr, toep, 0, plen, credits, shove, 0);
			write_tx_sgl(txwr + 1, sndptr, m, nsegs,
			    max_nsegs_1mbuf);
			if (wr_len & 0xf) {
				uint64_t *pad = (uint64_t *)
				    ((uintptr_t)txwr + wr_len);
				*pad = 0;
			}
		}

		KASSERT(toep->tx_credits >= credits,
		    ("%s: not enough credits", __func__));

		toep->tx_credits -= credits;
		toep->tx_nocompl += credits;
		toep->plen_nocompl += plen;
		if (toep->tx_credits <= toep->tx_total * 3 / 8 &&
		    toep->tx_nocompl >= toep->tx_total / 4)
			compl = 1;

		if (compl || ulp_mode(toep) == ULP_MODE_RDMA) {
			txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL);
			toep->tx_nocompl = 0;
			toep->plen_nocompl = 0;
		}

		tp->snd_nxt += plen;
		tp->snd_max += plen;

		SOCKBUF_LOCK(sb);
		KASSERT(sb_sndptr, ("%s: sb_sndptr is NULL", __func__));
		sb->sb_sndptr = sb_sndptr;
		SOCKBUF_UNLOCK(sb);

		toep->flags |= TPF_TX_DATA_SENT;
		if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
			toep->flags |= TPF_TX_SUSPENDED;

		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
		txsd->plen = plen;
		txsd->tx_credits = credits;
		txsd++;
		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
			toep->txsd_pidx = 0;
			txsd = &toep->txsd[0];
		}
		toep->txsd_avail--;

		t4_l2t_send(sc, wr, toep->l2te);
	} while (m != NULL && (m->m_flags & M_NOTAVAIL) == 0);

	/* Send a FIN if requested, but only if there's no more data to send */
	if (m == NULL && toep->flags & TPF_SEND_FIN)
		t4_close_conn(sc, toep);
}

static inline void
rqdrop_locked(struct mbufq *q, int plen)
{
	struct mbuf *m;

	while (plen > 0) {
		m = mbufq_dequeue(q);

		/* Too many credits. */
		MPASS(m != NULL);
		M_ASSERTPKTHDR(m);

		/* Partial credits. */
		MPASS(plen >= m->m_pkthdr.len);

		plen -= m->m_pkthdr.len;
		m_freem(m);
	}
}

static struct wrqe *
write_iscsi_mbuf_wr(struct toepcb *toep, struct mbuf *sndptr)
{
	struct mbuf *m;
	struct fw_ofld_tx_data_wr *txwr;
	struct wrqe *wr;
	u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
	u_int adjusted_plen, ulp_submode;
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = intotcpcb(inp);
	int tx_credits, shove;
	static const u_int ulp_extra_len[] = {0, 4, 4, 8};

	M_ASSERTPKTHDR(sndptr);

	tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
	if (mbuf_raw_wr(sndptr)) {
		plen = sndptr->m_pkthdr.len;
		KASSERT(plen <= SGE_MAX_WR_LEN,
		    ("raw WR len %u is greater than max WR len", plen));
		if (plen > tx_credits * 16)
			return (NULL);

		wr = alloc_wrqe(roundup2(plen, 16), &toep->ofld_txq->wrq);
		if (__predict_false(wr == NULL))
			return (NULL);

		m_copydata(sndptr, 0, plen, wrtod(wr));
		return (wr);
	}

	max_imm = max_imm_payload(tx_credits);
	max_nsegs = max_dsgl_nsegs(tx_credits);

	plen = 0;
	nsegs = 0;
	max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
	for (m = sndptr; m != NULL; m = m->m_next) {
		int n = sglist_count(mtod(m, void *), m->m_len);

		nsegs += n;
		plen += m->m_len;

		/*
		 * This mbuf would send us _over_ the nsegs limit.
		 * Suspend tx because the PDU can't be sent out.
		 */
		if (plen > max_imm && nsegs > max_nsegs)
			return (NULL);

		if (max_nsegs_1mbuf < n)
			max_nsegs_1mbuf = n;
	}

	if (__predict_false(toep->flags & TPF_FIN_SENT))
		panic("%s: excess tx.", __func__);

	/*
	 * We have a PDU to send.  All of it goes out in one WR so 'm'
	 * is NULL.  A PDU's length is always a multiple of 4.
	 */
	MPASS(m == NULL);
	MPASS((plen & 3) == 0);
	MPASS(sndptr->m_pkthdr.len == plen);

	shove = !(tp->t_flags & TF_MORETOCOME);
	ulp_submode = mbuf_ulp_submode(sndptr);
	MPASS(ulp_submode < nitems(ulp_extra_len));

	/*
	 * plen doesn't include header and data digests, which are
	 * generated and inserted in the right places by the TOE, but
	 * they do occupy TCP sequence space and need to be accounted
	 * for.
	 */
	adjusted_plen = plen + ulp_extra_len[ulp_submode];
	if (plen <= max_imm) {

		/* Immediate data tx */

		wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16),
		    &toep->ofld_txq->wrq);
		if (wr == NULL) {
			/* XXX: how will we recover from this? */
			return (NULL);
		}
		txwr = wrtod(wr);
		credits = howmany(wr->wr_len, 16);
		write_tx_wr(txwr, toep, plen, adjusted_plen, credits,
		    shove, ulp_submode);
		m_copydata(sndptr, 0, plen, (void *)(txwr + 1));
		nsegs = 0;
	} else {
		int wr_len;

		/* DSGL tx */
		wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) +
		    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
		wr = alloc_wrqe(roundup2(wr_len, 16),
		    &toep->ofld_txq->wrq);
		if (wr == NULL) {
			/* XXX: how will we recover from this? */
			return (NULL);
		}
		txwr = wrtod(wr);
		credits = howmany(wr_len, 16);
		write_tx_wr(txwr, toep, 0, adjusted_plen, credits,
		    shove, ulp_submode);
		write_tx_sgl(txwr + 1, sndptr, m, nsegs, max_nsegs_1mbuf);
		if (wr_len & 0xf) {
			uint64_t *pad = (uint64_t *)((uintptr_t)txwr + wr_len);
			*pad = 0;
		}
	}

	tp->snd_nxt += adjusted_plen;
	tp->snd_max += adjusted_plen;

	counter_u64_add(toep->ofld_txq->tx_iscsi_pdus, 1);
	counter_u64_add(toep->ofld_txq->tx_iscsi_octets, plen);

	return (wr);
}

void
t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop)
{
	struct mbuf *sndptr, *m;
	struct fw_wr_hdr *wrhdr;
	struct wrqe *wr;
	u_int plen, credits;
	struct inpcb *inp = toep->inp;
	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
	struct mbufq *pduq = &toep->ulp_pduq;

	INP_WLOCK_ASSERT(inp);
	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
	KASSERT(ulp_mode(toep) == ULP_MODE_ISCSI,
	    ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep));

	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
		return;

	/*
	 * This function doesn't resume by itself.  Someone else must clear the
	 * flag and call this function.
	 */
	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
		KASSERT(drop == 0,
		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
		return;
	}

	if (drop) {
		struct socket *so = inp->inp_socket;
		struct sockbuf *sb = &so->so_snd;
		int sbu;

		/*
		 * An unlocked read is ok here as the data should only
		 * transition from a non-zero value to either another
		 * non-zero value or zero.  Once it is zero it should
		 * stay zero.
		 */
		if (__predict_false(sbused(sb)) > 0) {
			SOCKBUF_LOCK(sb);
			sbu = sbused(sb);
			if (sbu > 0) {
				/*
				 * The data transmitted before the
				 * tid's ULP mode changed to ISCSI is
				 * still in so_snd.  Incoming credits
				 * should account for so_snd first.
				 */
				sbdrop_locked(sb, min(sbu, drop));
				drop -= min(sbu, drop);
			}
			sowwakeup_locked(so);	/* unlocks so_snd */
		}
		rqdrop_locked(&toep->ulp_pdu_reclaimq, drop);
	}

	while ((sndptr = mbufq_first(pduq)) != NULL) {
		wr = write_iscsi_mbuf_wr(toep, sndptr);
		if (wr == NULL) {
			toep->flags |= TPF_TX_SUSPENDED;
			return;
		}

		plen = sndptr->m_pkthdr.len;
		credits = howmany(wr->wr_len, 16);
		KASSERT(toep->tx_credits >= credits,
		    ("%s: not enough credits", __func__));

		m = mbufq_dequeue(pduq);
		MPASS(m == sndptr);
		mbufq_enqueue(&toep->ulp_pdu_reclaimq, m);

		toep->tx_credits -= credits;
		toep->tx_nocompl += credits;
		toep->plen_nocompl += plen;

		/*
		 * Ensure there are enough credits for a full-sized WR
		 * as page pod WRs can be full-sized.
		 */
		if (toep->tx_credits <= SGE_MAX_WR_LEN * 5 / 4 &&
		    toep->tx_nocompl >= toep->tx_total / 4) {
			wrhdr = wrtod(wr);
			wrhdr->hi |= htobe32(F_FW_WR_COMPL);
			toep->tx_nocompl = 0;
			toep->plen_nocompl = 0;
		}

		toep->flags |= TPF_TX_DATA_SENT;
		if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
			toep->flags |= TPF_TX_SUSPENDED;

		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
		txsd->plen = plen;
		txsd->tx_credits = credits;
		txsd++;
		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
			toep->txsd_pidx = 0;
			txsd = &toep->txsd[0];
		}
		toep->txsd_avail--;

		t4_l2t_send(sc, wr, toep->l2te);
	}

	/* Send a FIN if requested, but only if there are no more PDUs to send */
	if (mbufq_first(pduq) == NULL && toep->flags & TPF_SEND_FIN)
		t4_close_conn(sc, toep);
}

static inline void
t4_push_data(struct adapter *sc, struct toepcb *toep, int drop)
{

	if (ulp_mode(toep) == ULP_MODE_ISCSI)
		t4_push_pdus(sc, toep, drop);
	else if (toep->flags & TPF_KTLS)
		t4_push_ktls(sc, toep, drop);
	else
		t4_push_frames(sc, toep, drop);
}

int
t4_tod_output(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
#ifdef INVARIANTS
	struct inpcb *inp = tp->t_inpcb;
#endif
	struct toepcb *toep = tp->t_toe;

	INP_WLOCK_ASSERT(inp);
	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
	    ("%s: inp %p dropped.", __func__, inp));
	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));

	t4_push_data(sc, toep, 0);

	return (0);
}

int
t4_send_fin(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
#ifdef INVARIANTS
	struct inpcb *inp = tp->t_inpcb;
#endif
	struct toepcb *toep = tp->t_toe;

	INP_WLOCK_ASSERT(inp);
	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
	    ("%s: inp %p dropped.", __func__, inp));
	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));

	toep->flags |= TPF_SEND_FIN;
	if (tp->t_state >= TCPS_ESTABLISHED)
		t4_push_data(sc, toep, 0);

	return (0);
}
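
/*
 * Note that t4_send_fin() above only marks the FIN as pending; the
 * CPL_CLOSE_CON_REQ that actually closes the connection is issued by
 * t4_close_conn() once t4_push_frames()/t4_push_pdus() runs out of data to
 * transmit.
 */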

int
t4_send_rst(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
#if defined(INVARIANTS)
	struct inpcb *inp = tp->t_inpcb;
#endif
	struct toepcb *toep = tp->t_toe;

	INP_WLOCK_ASSERT(inp);
	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
	    ("%s: inp %p dropped.", __func__, inp));
	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));

	/* hmmmm */
	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
	    ("%s: flowc for tid %u [%s] not sent already",
	    __func__, toep->tid, tcpstates[tp->t_state]));

	send_reset(sc, toep, 0);
	return (0);
}

/*
 * Peer has sent us a FIN.
 */
static int
do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_peer_close *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = NULL;
	struct socket *so;
	struct epoch_tracker et;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_PEER_CLOSE,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));

	if (__predict_false(toep->flags & TPF_SYNQE)) {
		/*
		 * do_pass_establish must have run before do_peer_close and if
		 * this is still a synqe instead of a toepcb then the connection
		 * must be getting aborted.
		 */
		MPASS(toep->flags & TPF_ABORT_SHUTDOWN);
		CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid,
		    toep, toep->flags);
		return (0);
	}

	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	CURVNET_SET(toep->vnet);
	NET_EPOCH_ENTER(et);
	INP_WLOCK(inp);
	tp = intotcpcb(inp);

	CTR6(KTR_CXGBE,
	    "%s: tid %u (%s), toep_flags 0x%x, ddp_flags 0x%x, inp %p",
	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags,
	    toep->ddp.flags, inp);

	if (toep->flags & TPF_ABORT_SHUTDOWN)
		goto done;

	tp->rcv_nxt++;	/* FIN */

	so = inp->inp_socket;
	socantrcvmore(so);
	if (ulp_mode(toep) == ULP_MODE_TCPDDP) {
		DDP_LOCK(toep);
		if (__predict_false(toep->ddp.flags &
		    (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)))
			handle_ddp_close(toep, tp, cpl->rcv_nxt);
		DDP_UNLOCK(toep);
	}

	if (ulp_mode(toep) != ULP_MODE_RDMA) {
		KASSERT(tp->rcv_nxt == be32toh(cpl->rcv_nxt),
		    ("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt,
		    be32toh(cpl->rcv_nxt)));
	}

	switch (tp->t_state) {
	case TCPS_SYN_RECEIVED:
		tp->t_starttime = ticks;
		/* FALLTHROUGH */

	case TCPS_ESTABLISHED:
		tcp_state_change(tp, TCPS_CLOSE_WAIT);
		break;

	case TCPS_FIN_WAIT_1:
		tcp_state_change(tp, TCPS_CLOSING);
		break;

	case TCPS_FIN_WAIT_2:
		restore_so_proto(so, inp->inp_vflag & INP_IPV6);
		tcp_twstart(tp);
		INP_UNLOCK_ASSERT(inp);	/* safe, we have a ref on the inp */
		NET_EPOCH_EXIT(et);
		CURVNET_RESTORE();

		INP_WLOCK(inp);
		final_cpl_received(toep);
		return (0);

	default:
		log(LOG_ERR, "%s: TID %u received CPL_PEER_CLOSE in state %d\n",
		    __func__, tid, tp->t_state);
	}
done:
	INP_WUNLOCK(inp);
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();
	return (0);
}
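
/*
 * Summary of the transitions driven by the peer's FIN above: SYN_RECEIVED
 * and ESTABLISHED move to CLOSE_WAIT, FIN_WAIT_1 moves to CLOSING, and
 * FIN_WAIT_2 goes straight to TIME_WAIT via tcp_twstart(), mirroring what
 * the software TCP input path does for a received FIN.
 */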

/*
 * Peer has ACK'd our FIN.
 */
static int
do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_close_con_rpl *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = NULL;
	struct socket *so = NULL;
	struct epoch_tracker et;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_CLOSE_CON_RPL,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	CURVNET_SET(toep->vnet);
	NET_EPOCH_ENTER(et);
	INP_WLOCK(inp);
	tp = intotcpcb(inp);

	CTR4(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x",
	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags);

	if (toep->flags & TPF_ABORT_SHUTDOWN)
		goto done;

	so = inp->inp_socket;
	tp->snd_una = be32toh(cpl->snd_nxt) - 1;	/* exclude FIN */

	switch (tp->t_state) {
	case TCPS_CLOSING:	/* see TCPS_FIN_WAIT_2 in do_peer_close too */
		restore_so_proto(so, inp->inp_vflag & INP_IPV6);
		tcp_twstart(tp);
release:
		INP_UNLOCK_ASSERT(inp);	/* safe, we have a ref on the inp */
		NET_EPOCH_EXIT(et);
		CURVNET_RESTORE();

		INP_WLOCK(inp);
		final_cpl_received(toep);	/* no more CPLs expected */

		return (0);
	case TCPS_LAST_ACK:
		if (tcp_close(tp))
			INP_WUNLOCK(inp);
		goto release;

	case TCPS_FIN_WAIT_1:
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
			soisdisconnected(so);
		tcp_state_change(tp, TCPS_FIN_WAIT_2);
		break;

	default:
		log(LOG_ERR,
		    "%s: TID %u received CPL_CLOSE_CON_RPL in state %s\n",
		    __func__, tid, tcpstates[tp->t_state]);
	}
done:
	INP_WUNLOCK(inp);
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();
	return (0);
}

void
send_abort_rpl(struct adapter *sc, struct sge_ofld_txq *ofld_txq, int tid,
    int rst_status)
{
	struct wrqe *wr;
	struct cpl_abort_rpl *cpl;

	wr = alloc_wrqe(sizeof(*cpl), &ofld_txq->wrq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	cpl = wrtod(wr);

	INIT_TP_WR_MIT_CPL(cpl, CPL_ABORT_RPL, tid);
	cpl->cmd = rst_status;

	t4_wrq_tx(sc, wr);
}

static int
abort_status_to_errno(struct tcpcb *tp, unsigned int abort_reason)
{
	switch (abort_reason) {
	case CPL_ERR_BAD_SYN:
	case CPL_ERR_CONN_RESET:
		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
	case CPL_ERR_XMIT_TIMEDOUT:
	case CPL_ERR_PERSIST_TIMEDOUT:
	case CPL_ERR_FINWAIT2_TIMEDOUT:
	case CPL_ERR_KEEPALIVE_TIMEDOUT:
		return (ETIMEDOUT);
	default:
		return (EIO);
	}
}
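
/*
 * An abort can be initiated locally (send_reset()) or arrive from the peer
 * or the hardware as a CPL_ABORT_REQ_RSS (handled below).  Whichever side
 * starts it, TPF_ABORT_SHUTDOWN ensures the teardown runs only once, and
 * the chip is always answered with a CPL_ABORT_RPL; the rst_status passed
 * to send_abort_rpl() tells the hardware whether it should also put an RST
 * on the wire.
 */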

/*
 * TCP RST from the peer, timeout, or some other such critical error.
 */
static int
do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct sge_ofld_txq *ofld_txq = toep->ofld_txq;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct epoch_tracker et;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_ABORT_REQ_RSS,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));

	if (toep->flags & TPF_SYNQE)
		return (do_abort_req_synqe(iq, rss, m));

	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	if (negative_advice(cpl->status)) {
		CTR4(KTR_CXGBE, "%s: negative advice %d for tid %d (0x%x)",
		    __func__, cpl->status, tid, toep->flags);
		return (0);	/* Ignore negative advice */
	}

	inp = toep->inp;
	CURVNET_SET(toep->vnet);
	NET_EPOCH_ENTER(et);	/* for tcp_close */
	INP_WLOCK(inp);

	tp = intotcpcb(inp);

	CTR6(KTR_CXGBE,
	    "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x, status %d",
	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags,
	    inp->inp_flags, cpl->status);

	/*
	 * If we'd initiated an abort earlier the reply to it is responsible for
	 * cleaning up resources.  Otherwise we tear everything down right here
	 * right now.  We owe the T4 a CPL_ABORT_RPL no matter what.
	 */
	if (toep->flags & TPF_ABORT_SHUTDOWN) {
		INP_WUNLOCK(inp);
		goto done;
	}
	toep->flags |= TPF_ABORT_SHUTDOWN;

	if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) {
		struct socket *so = inp->inp_socket;

		if (so != NULL)
			so_error_set(so, abort_status_to_errno(tp,
			    cpl->status));
		tp = tcp_close(tp);
		if (tp == NULL)
			INP_WLOCK(inp);	/* re-acquire */
	}

	final_cpl_received(toep);
done:
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();
	send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
	return (0);
}

/*
 * Reply to the CPL_ABORT_REQ (send_reset)
 */
static int
do_abort_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_ABORT_RPL_RSS,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));

	if (toep->flags & TPF_SYNQE)
		return (do_abort_rpl_synqe(iq, rss, m));

	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	CTR5(KTR_CXGBE, "%s: tid %u, toep %p, inp %p, status %d",
	    __func__, tid, toep, inp, cpl->status);

	KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
	    ("%s: wasn't expecting abort reply", __func__));

	INP_WLOCK(inp);
	final_cpl_received(toep);

	return (0);
}

static int
do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_rx_data *cpl = mtod(m, const void *);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp;
	struct socket *so;
	struct sockbuf *sb;
	struct epoch_tracker et;
	int len, rx_credits;
	uint32_t ddp_placed = 0;

	if (__predict_false(toep->flags & TPF_SYNQE)) {
		/*
		 * do_pass_establish must have run before do_rx_data and if this
		 * is still a synqe instead of a toepcb then the connection must
		 * be getting aborted.
		 */
		MPASS(toep->flags & TPF_ABORT_SHUTDOWN);
		CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid,
		    toep, toep->flags);
		m_freem(m);
		return (0);
	}

	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	/* strip off CPL header */
	m_adj(m, sizeof(*cpl));
	len = m->m_pkthdr.len;

	INP_WLOCK(inp);
	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) {
		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
		    __func__, tid, len, inp->inp_flags);
		INP_WUNLOCK(inp);
		m_freem(m);
		return (0);
	}

	tp = intotcpcb(inp);

	if (__predict_false(ulp_mode(toep) == ULP_MODE_TLS &&
	    toep->flags & TPF_TLS_RECEIVE)) {
		/* Received "raw" data on a TLS socket. */
		CTR3(KTR_CXGBE, "%s: tid %u, raw TLS data (%d bytes)",
		    __func__, tid, len);
		do_rx_data_tls(cpl, toep, m);
		return (0);
	}

	if (__predict_false(tp->rcv_nxt != be32toh(cpl->seq)))
		ddp_placed = be32toh(cpl->seq) - tp->rcv_nxt;

	tp->rcv_nxt += len;
	if (tp->rcv_wnd < len) {
		KASSERT(ulp_mode(toep) == ULP_MODE_RDMA,
		    ("%s: negative window size", __func__));
	}

	tp->rcv_wnd -= len;
	tp->t_rcvtime = ticks;

	if (ulp_mode(toep) == ULP_MODE_TCPDDP)
		DDP_LOCK(toep);
	so = inp_inpcbtosocket(inp);
	sb = &so->so_rcv;
	SOCKBUF_LOCK(sb);

	if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
		CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)",
		    __func__, tid, len);
		m_freem(m);
		SOCKBUF_UNLOCK(sb);
		if (ulp_mode(toep) == ULP_MODE_TCPDDP)
			DDP_UNLOCK(toep);
		INP_WUNLOCK(inp);

		CURVNET_SET(toep->vnet);
		NET_EPOCH_ENTER(et);
		INP_WLOCK(inp);
		tp = tcp_drop(tp, ECONNRESET);
		if (tp)
			INP_WUNLOCK(inp);
		NET_EPOCH_EXIT(et);
		CURVNET_RESTORE();

		return (0);
	}

	/* receive buffer autosize */
	MPASS(toep->vnet == so->so_vnet);
	CURVNET_SET(toep->vnet);
	if (sb->sb_flags & SB_AUTOSIZE &&
	    V_tcp_do_autorcvbuf &&
	    sb->sb_hiwat < V_tcp_autorcvbuf_max &&
	    len > (sbspace(sb) / 8 * 7)) {
		unsigned int hiwat = sb->sb_hiwat;
		unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc,
		    V_tcp_autorcvbuf_max);

		if (!sbreserve_locked(sb, newsize, so, NULL))
			sb->sb_flags &= ~SB_AUTOSIZE;
	}

	if (ulp_mode(toep) == ULP_MODE_TCPDDP) {
		int changed = !(toep->ddp.flags & DDP_ON) ^ cpl->ddp_off;

		if (toep->ddp.waiting_count != 0 || toep->ddp.active_count != 0)
			CTR3(KTR_CXGBE, "%s: tid %u, non-ddp rx (%d bytes)",
			    __func__, tid, len);

		if (changed) {
			if (toep->ddp.flags & DDP_SC_REQ)
				toep->ddp.flags ^= DDP_ON | DDP_SC_REQ;
			else {
				KASSERT(cpl->ddp_off == 1,
				    ("%s: DDP switched on by itself.",
				    __func__));

				/* Fell out of DDP mode */
				toep->ddp.flags &= ~DDP_ON;
				CTR1(KTR_CXGBE, "%s: fell out of DDP mode",
				    __func__);

				insert_ddp_data(toep, ddp_placed);
			}
		}

		if (toep->ddp.flags & DDP_ON) {
			/*
			 * CPL_RX_DATA with DDP on can only be an indicate.
			 * Start posting queued AIO requests via DDP.  The
			 * payload that arrived in this indicate is appended
			 * to the socket buffer as usual.
			 */
			handle_ddp_indicate(toep);
		}
	}

	sbappendstream_locked(sb, m, 0);
	rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0;
	if (rx_credits > 0 && sbused(sb) + tp->rcv_wnd < sb->sb_lowat) {
		rx_credits = send_rx_credits(sc, toep, rx_credits);
		tp->rcv_wnd += rx_credits;
		tp->rcv_adv += rx_credits;
	}

	if (ulp_mode(toep) == ULP_MODE_TCPDDP && toep->ddp.waiting_count > 0 &&
	    sbavail(sb) != 0) {
		CTR2(KTR_CXGBE, "%s: tid %u queueing AIO task", __func__,
		    tid);
		ddp_queue_toep(toep);
	}
	sorwakeup_locked(so);
	SOCKBUF_UNLOCK_ASSERT(sb);
	if (ulp_mode(toep) == ULP_MODE_TCPDDP)
		DDP_UNLOCK(toep);

	INP_WUNLOCK(inp);
	CURVNET_RESTORE();
	return (0);
}
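
/*
 * Note that do_rx_data() shrinks tp->rcv_wnd as payload is delivered and
 * only grows it again when credits are explicitly returned: right above if
 * the socket buffer would otherwise fall below its low water mark, and
 * otherwise from t4_rcvd() as the application consumes data.  Once the
 * advertised window is used up the chip stops delivering, so a stalled
 * reader back-pressures the peer much as the software stack would.
 */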

static int
do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_fw4_ack *cpl = (const void *)(rss + 1);
	unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl)));
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp;
	struct tcpcb *tp;
	struct socket *so;
	uint8_t credits = cpl->credits;
	struct ofld_tx_sdesc *txsd;
	int plen;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	/*
	 * Very unusual case: we'd sent a flowc + abort_req for a synq entry and
	 * now this comes back carrying the credits for the flowc.
	 */
	if (__predict_false(toep->flags & TPF_SYNQE)) {
		KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
		    ("%s: credits for a synq entry %p", __func__, toep));
		return (0);
	}

	inp = toep->inp;

	KASSERT(opcode == CPL_FW4_ACK,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	INP_WLOCK(inp);

	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) {
		INP_WUNLOCK(inp);
		return (0);
	}

	KASSERT((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0,
	    ("%s: inp_flags 0x%x", __func__, inp->inp_flags));

	tp = intotcpcb(inp);

	if (cpl->flags & CPL_FW4_ACK_FLAGS_SEQVAL) {
		tcp_seq snd_una = be32toh(cpl->snd_una);

#ifdef INVARIANTS
		if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
			log(LOG_ERR,
			    "%s: unexpected seq# %x for TID %u, snd_una %x\n",
			    __func__, snd_una, toep->tid, tp->snd_una);
		}
#endif

		if (tp->snd_una != snd_una) {
			tp->snd_una = snd_una;
			tp->ts_recent_age = tcp_ts_getticks();
		}
	}

#ifdef VERBOSE_TRACES
	CTR3(KTR_CXGBE, "%s: tid %d credits %u", __func__, tid, credits);
#endif
	so = inp->inp_socket;
	txsd = &toep->txsd[toep->txsd_cidx];
	plen = 0;
	while (credits) {
		KASSERT(credits >= txsd->tx_credits,
		    ("%s: too many (or partial) credits", __func__));
		credits -= txsd->tx_credits;
		toep->tx_credits += txsd->tx_credits;
		plen += txsd->plen;
		txsd++;
		toep->txsd_avail++;
		KASSERT(toep->txsd_avail <= toep->txsd_total,
		    ("%s: txsd avail > total", __func__));
		if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) {
			txsd = &toep->txsd[0];
			toep->txsd_cidx = 0;
		}
	}

	if (toep->tx_credits == toep->tx_total) {
		toep->tx_nocompl = 0;
		toep->plen_nocompl = 0;
	}

	if (toep->flags & TPF_TX_SUSPENDED &&
	    toep->tx_credits >= toep->tx_total / 4) {
#ifdef VERBOSE_TRACES
		CTR2(KTR_CXGBE, "%s: tid %d calling t4_push_frames", __func__,
		    tid);
#endif
		toep->flags &= ~TPF_TX_SUSPENDED;
		CURVNET_SET(toep->vnet);
		t4_push_data(sc, toep, plen);
		CURVNET_RESTORE();
	} else if (plen > 0) {
		struct sockbuf *sb = &so->so_snd;
		int sbu;

		SOCKBUF_LOCK(sb);
		sbu = sbused(sb);
		if (ulp_mode(toep) == ULP_MODE_ISCSI) {
			if (__predict_false(sbu > 0)) {
				/*
				 * The data transmitted before the
				 * tid's ULP mode changed to ISCSI is
				 * still in so_snd.  Incoming credits
				 * should account for so_snd first.
				 */
				sbdrop_locked(sb, min(sbu, plen));
				plen -= min(sbu, plen);
			}
			sowwakeup_locked(so);	/* unlocks so_snd */
			rqdrop_locked(&toep->ulp_pdu_reclaimq, plen);
		} else {
#ifdef VERBOSE_TRACES
			CTR3(KTR_CXGBE, "%s: tid %d dropped %d bytes", __func__,
			    tid, plen);
#endif
			sbdrop_locked(sb, plen);
			if (!TAILQ_EMPTY(&toep->aiotx_jobq))
				t4_aiotx_queue_toep(so, toep);
			sowwakeup_locked(so);	/* unlocks so_snd */
		}
		SOCKBUF_UNLOCK_ASSERT(sb);
	}

	INP_WUNLOCK(inp);

	return (0);
}

void
t4_set_tcb_field(struct adapter *sc, struct sge_wrq *wrq, struct toepcb *toep,
    uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie)
{
	struct wrqe *wr;
	struct cpl_set_tcb_field *req;
	struct ofld_tx_sdesc *txsd;

	MPASS((cookie & ~M_COOKIE) == 0);
	if (reply) {
		MPASS(cookie != CPL_COOKIE_RESERVED);
	}

	wr = alloc_wrqe(sizeof(*req), wrq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	req = wrtod(wr);

	INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, toep->tid);
	req->reply_ctrl = htobe16(V_QUEUENO(toep->ofld_rxq->iq.abs_id));
	if (reply == 0)
		req->reply_ctrl |= htobe16(F_NO_REPLY);
	req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(cookie));
	req->mask = htobe64(mask);
	req->val = htobe64(val);
	if (wrq->eq.type == EQ_OFLD) {
		txsd = &toep->txsd[toep->txsd_pidx];
		txsd->tx_credits = howmany(sizeof(*req), 16);
		txsd->plen = 0;
		KASSERT(toep->tx_credits >= txsd->tx_credits &&
		    toep->txsd_avail > 0,
		    ("%s: not enough credits (%d)", __func__,
		    toep->tx_credits));
		toep->tx_credits -= txsd->tx_credits;
		if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
			toep->txsd_pidx = 0;
		toep->txsd_avail--;
	}

	t4_wrq_tx(sc, wr);
}

void
t4_init_cpl_io_handlers(void)
{

	t4_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
	t4_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
	t4_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
	t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl,
	    CPL_COOKIE_TOM);
	t4_register_cpl_handler(CPL_RX_DATA, do_rx_data);
	t4_register_shared_cpl_handler(CPL_FW4_ACK, do_fw4_ack, CPL_COOKIE_TOM);
}

void
t4_uninit_cpl_io_handlers(void)
{

	t4_register_cpl_handler(CPL_PEER_CLOSE, NULL);
	t4_register_cpl_handler(CPL_CLOSE_CON_RPL, NULL);
	t4_register_cpl_handler(CPL_ABORT_REQ_RSS, NULL);
	t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, NULL, CPL_COOKIE_TOM);
	t4_register_cpl_handler(CPL_RX_DATA, NULL);
	t4_register_shared_cpl_handler(CPL_FW4_ACK, NULL, CPL_COOKIE_TOM);
}
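
/*
 * t4_set_tcb_field() above is the generic way to poke fields of a
 * connection's TCB; other parts of the TOM code use it to flip
 * per-connection TCB bits (DDP, for example).  When the request is sent on
 * an offload queue it consumes tx credits and a txsd slot just like a data
 * work request, so the caller is expected to have verified that both are
 * available.
 */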
/*
 * Use the 'backend1' field in AIO jobs to hold an error that should
 * be reported when the job is completed, the 'backend3' field to
 * store the amount of data sent by the AIO job so far, and the
 * 'backend4' field to hold a reference count on the job.
 *
 * Each unmapped mbuf holds a reference on the job as does the queue
 * so long as the job is queued.
 */
#define	aio_error	backend1
#define	aio_sent	backend3
#define	aio_refs	backend4

#define	jobtotid(job)							\
	(((struct toepcb *)(so_sototcpcb((job)->fd_file->f_data)->t_toe))->tid)

static void
aiotx_free_job(struct kaiocb *job)
{
	long status;
	int error;

	if (refcount_release(&job->aio_refs) == 0)
		return;

	error = (intptr_t)job->aio_error;
	status = job->aio_sent;
#ifdef VERBOSE_TRACES
	CTR5(KTR_CXGBE, "%s: tid %d completed %p len %ld, error %d", __func__,
	    jobtotid(job), job, status, error);
#endif
	if (error != 0 && status != 0)
		error = 0;
	if (error == ECANCELED)
		aio_cancel(job);
	else if (error)
		aio_complete(job, -1, error);
	else {
		job->msgsnd = 1;
		aio_complete(job, status, 0);
	}
}

static void
aiotx_free_pgs(struct mbuf *m)
{
	struct kaiocb *job;
	vm_page_t pg;

	M_ASSERTEXTPG(m);
	job = m->m_ext.ext_arg1;
#ifdef VERBOSE_TRACES
	CTR3(KTR_CXGBE, "%s: completed %d bytes for tid %d", __func__,
	    m->m_len, jobtotid(job));
#endif

	for (int i = 0; i < m->m_epg_npgs; i++) {
		pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]);
		vm_page_unwire(pg, PQ_ACTIVE);
	}

	aiotx_free_job(job);
}
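/*
 * Job lifecycle: t4_aio_queue_aiotx() initializes aio_refs to 1 for the
 * queue's reference, and alloc_aiotx_mbuf() takes one more reference per
 * unmapped mbuf it builds.  The pages wired by alloc_aiotx_mbuf() are
 * unwired in aiotx_free_pgs() when the mbuf's external storage is freed,
 * and the job completes in aiotx_free_job() once the last reference is
 * dropped.
 */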
/*
 * Allocate a chain of unmapped mbufs describing the next 'len' bytes
 * of an AIO job.
 */
static struct mbuf *
alloc_aiotx_mbuf(struct kaiocb *job, int len)
{
	struct vmspace *vm;
	vm_page_t pgs[MBUF_PEXT_MAX_PGS];
	struct mbuf *m, *top, *last;
	vm_map_t map;
	vm_offset_t start;
	int i, mlen, npages, pgoff;

	KASSERT(job->aio_sent + len <= job->uaiocb.aio_nbytes,
	    ("%s(%p, %d): request to send beyond end of buffer", __func__,
	    job, len));

	/*
	 * The AIO subsystem will cancel and drain all requests before
	 * permitting a process to exit or exec, so p_vmspace should
	 * be stable here.
	 */
	vm = job->userproc->p_vmspace;
	map = &vm->vm_map;
	start = (uintptr_t)job->uaiocb.aio_buf + job->aio_sent;
	pgoff = start & PAGE_MASK;

	top = NULL;
	last = NULL;
	while (len > 0) {
		mlen = imin(len, MBUF_PEXT_MAX_PGS * PAGE_SIZE - pgoff);
		KASSERT(mlen == len || ((start + mlen) & PAGE_MASK) == 0,
		    ("%s: next start (%#jx + %#x) is not page aligned",
		    __func__, (uintmax_t)start, mlen));

		npages = vm_fault_quick_hold_pages(map, start, mlen,
		    VM_PROT_WRITE, pgs, nitems(pgs));
		if (npages < 0)
			break;

		m = mb_alloc_ext_pgs(M_WAITOK, aiotx_free_pgs);
		if (m == NULL) {
			vm_page_unhold_pages(pgs, npages);
			break;
		}

		m->m_epg_1st_off = pgoff;
		m->m_epg_npgs = npages;
		if (npages == 1) {
			KASSERT(mlen + pgoff <= PAGE_SIZE,
			    ("%s: single page is too large (off %d len %d)",
			    __func__, pgoff, mlen));
			m->m_epg_last_len = mlen;
		} else {
			m->m_epg_last_len = mlen - (PAGE_SIZE - pgoff) -
			    (npages - 2) * PAGE_SIZE;
		}
		for (i = 0; i < npages; i++)
			m->m_epg_pa[i] = VM_PAGE_TO_PHYS(pgs[i]);

		m->m_len = mlen;
		m->m_ext.ext_size = npages * PAGE_SIZE;
		m->m_ext.ext_arg1 = job;
		refcount_acquire(&job->aio_refs);

#ifdef VERBOSE_TRACES
		CTR5(KTR_CXGBE, "%s: tid %d, new mbuf %p for job %p, npages %d",
		    __func__, jobtotid(job), m, job, npages);
#endif

		if (top == NULL)
			top = m;
		else
			last->m_next = m;
		last = m;

		len -= mlen;
		start += mlen;
		pgoff = 0;
	}

	return (top);
}
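/*
 * Send as much of 'job' as the socket's send buffer currently allows.
 * Called from the aiotx task with so_snd locked; the lock is dropped while
 * the request is processed and is held again on return.  A job that cannot
 * make progress yet is put back at the head of aiotx_jobq with
 * t4_aiotx_cancel() restored as its cancel routine.
 */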
static void
t4_aiotx_process_job(struct toepcb *toep, struct socket *so, struct kaiocb *job)
{
	struct sockbuf *sb;
	struct file *fp;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct mbuf *m;
	int error, len;
	bool moretocome, sendmore;

	sb = &so->so_snd;
	SOCKBUF_UNLOCK(sb);
	fp = job->fd_file;
	m = NULL;

#ifdef MAC
	error = mac_socket_check_send(fp->f_cred, so);
	if (error != 0)
		goto out;
#endif

	/* Inline sosend_generic(). */

	error = sblock(sb, SBL_WAIT);
	MPASS(error == 0);

sendanother:
	SOCKBUF_LOCK(sb);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		if ((so->so_options & SO_NOSIGPIPE) == 0) {
			PROC_LOCK(job->userproc);
			kern_psignal(job->userproc, SIGPIPE);
			PROC_UNLOCK(job->userproc);
		}
		error = EPIPE;
		goto out;
	}
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		goto out;
	}
	if ((so->so_state & SS_ISCONNECTED) == 0) {
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		error = ENOTCONN;
		goto out;
	}
	if (sbspace(sb) < sb->sb_lowat) {
		MPASS(job->aio_sent == 0 || !(so->so_state & SS_NBIO));

		/*
		 * Don't block if there is too little room in the socket
		 * buffer.  Instead, requeue the request.
		 */
		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
			SOCKBUF_UNLOCK(sb);
			sbunlock(sb);
			error = ECANCELED;
			goto out;
		}
		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		goto out;
	}

	/*
	 * Write as much data as the socket permits, but no more than
	 * a single sndbuf at a time.
	 */
	len = sbspace(sb);
	if (len > job->uaiocb.aio_nbytes - job->aio_sent) {
		len = job->uaiocb.aio_nbytes - job->aio_sent;
		moretocome = false;
	} else
		moretocome = true;
	if (len > toep->params.sndbuf) {
		len = toep->params.sndbuf;
		sendmore = true;
	} else
		sendmore = false;

	if (!TAILQ_EMPTY(&toep->aiotx_jobq))
		moretocome = true;
	SOCKBUF_UNLOCK(sb);
	MPASS(len != 0);

	m = alloc_aiotx_mbuf(job, len);
	if (m == NULL) {
		sbunlock(sb);
		error = EFAULT;
		goto out;
	}

	/* Inlined tcp_usr_send(). */

	inp = toep->inp;
	INP_WLOCK(inp);
	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
		INP_WUNLOCK(inp);
		sbunlock(sb);
		error = ECONNRESET;
		goto out;
	}

	job->aio_sent += m_length(m, NULL);

	sbappendstream(sb, m, 0);
	m = NULL;

	if (!(inp->inp_flags & INP_DROPPED)) {
		tp = intotcpcb(inp);
		if (moretocome)
			tp->t_flags |= TF_MORETOCOME;
		error = tp->t_fb->tfb_tcp_output(tp);
		if (moretocome)
			tp->t_flags &= ~TF_MORETOCOME;
	}

	INP_WUNLOCK(inp);
	if (sendmore)
		goto sendanother;
	sbunlock(sb);

	if (error)
		goto out;

	/*
	 * If this is a blocking socket and the request has not been
	 * fully completed, requeue it until the socket is ready
	 * again.
	 */
	if (job->aio_sent < job->uaiocb.aio_nbytes &&
	    !(so->so_state & SS_NBIO)) {
		SOCKBUF_LOCK(sb);
		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
			SOCKBUF_UNLOCK(sb);
			error = ECANCELED;
			goto out;
		}
		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
		return;
	}

	/*
	 * If the request will not be requeued, drop the queue's
	 * reference to the job.  Any mbufs in flight should still
	 * hold a reference, but this drops the reference that the
	 * queue owns while it is waiting to queue mbufs to the
	 * socket.
	 */
	aiotx_free_job(job);

out:
	if (error) {
		job->aio_error = (void *)(intptr_t)error;
		aiotx_free_job(job);
	}
	m_freem(m);
	SOCKBUF_LOCK(sb);
}

static void
t4_aiotx_task(void *context, int pending)
{
	struct toepcb *toep = context;
	struct socket *so;
	struct kaiocb *job;

	so = toep->aiotx_so;
	CURVNET_SET(toep->vnet);
	SOCKBUF_LOCK(&so->so_snd);
	while (!TAILQ_EMPTY(&toep->aiotx_jobq) && sowriteable(so)) {
		job = TAILQ_FIRST(&toep->aiotx_jobq);
		TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
		if (!aio_clear_cancel_function(job))
			continue;

		t4_aiotx_process_job(toep, so, job);
	}
	toep->aiotx_so = NULL;
	SOCKBUF_UNLOCK(&so->so_snd);
	CURVNET_RESTORE();

	free_toepcb(toep);
	SOCK_LOCK(so);
	sorele(so);
}

/*
 * Schedule the aiotx task for this connection unless it is already
 * pending.  The socket and toepcb references taken here are released by
 * t4_aiotx_task() when it finishes draining aiotx_jobq.
 */
static void
t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep)
{

	SOCKBUF_LOCK_ASSERT(&toep->inp->inp_socket->so_snd);
#ifdef VERBOSE_TRACES
	CTR3(KTR_CXGBE, "%s: queueing aiotx task for tid %d, active = %s",
	    __func__, toep->tid, toep->aiotx_so != NULL ? "true" : "false");
#endif
	if (toep->aiotx_so != NULL)
		return;
	soref(so);
	toep->aiotx_so = so;
	hold_toepcb(toep);
	soaio_enqueue(&toep->aiotx_task);
}

static void
t4_aiotx_cancel(struct kaiocb *job)
{
	struct socket *so;
	struct sockbuf *sb;
	struct tcpcb *tp;
	struct toepcb *toep;

	so = job->fd_file->f_data;
	tp = so_sototcpcb(so);
	toep = tp->t_toe;
	MPASS(job->uaiocb.aio_lio_opcode == LIO_WRITE);
	sb = &so->so_snd;

	SOCKBUF_LOCK(sb);
	if (!aio_cancel_cleared(job))
		TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
	SOCKBUF_UNLOCK(sb);

	job->aio_error = (void *)(intptr_t)ECANCELED;
	aiotx_free_job(job);
}

int
t4_aio_queue_aiotx(struct socket *so, struct kaiocb *job)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct adapter *sc = td_adapter(toep->td);

	/* This only handles writes. */
	if (job->uaiocb.aio_lio_opcode != LIO_WRITE)
		return (EOPNOTSUPP);

	if (!sc->tt.tx_zcopy)
		return (EOPNOTSUPP);

	if (tls_tx_key(toep))
		return (EOPNOTSUPP);

	SOCKBUF_LOCK(&so->so_snd);
#ifdef VERBOSE_TRACES
	CTR3(KTR_CXGBE, "%s: queueing %p for tid %u", __func__, job, toep->tid);
#endif
	if (!aio_set_cancel_function(job, t4_aiotx_cancel))
		panic("new job was cancelled");
	refcount_init(&job->aio_refs, 1);
	TAILQ_INSERT_TAIL(&toep->aiotx_jobq, job, list);
	if (sowriteable(so))
		t4_aiotx_queue_toep(so, toep);
	SOCKBUF_UNLOCK(&so->so_snd);
	return (0);
}

void
aiotx_init_toep(struct toepcb *toep)
{

	TAILQ_INIT(&toep->aiotx_jobq);
	TASK_INIT(&toep->aiotx_task, 0, t4_aiotx_task, toep);
}
#endif