/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2012, 2015 Chelsio Communications, Inc.
 * All rights reserved.
 * Written by: Navdeep Parhar <np@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_kern_tls.h"
#include "opt_ratelimit.h"

#ifdef TCP_OFFLOAD
#include <sys/param.h>
#include <sys/aio.h>
#include <sys/file.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/module.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sglist.h>
#include <sys/taskqueue.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#define TCPSTATES
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_var.h>
#include <netinet/toecore.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>

#include <dev/iscsi/iscsi_proto.h>
#include <dev/nvmf/nvmf_proto.h>

#include "common/common.h"
#include "common/t4_msg.h"
#include "common/t4_regs.h"
#include "common/t4_tcb.h"
#include "tom/t4_tom_l2t.h"
#include "tom/t4_tom.h"

static void t4_aiotx_cancel(struct kaiocb *job);
static void t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep);

void
send_flowc_wr(struct toepcb *toep, struct tcpcb *tp)
{
	struct wrqe *wr;
	struct fw_flowc_wr *flowc;
	unsigned int nparams, flowclen, paramidx;
	struct vi_info *vi = toep->vi;
	struct port_info *pi = vi->pi;
	struct adapter *sc = pi->adapter;
	unsigned int pfvf = sc->pf << S_FW_VIID_PFN;
	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];

	KASSERT(!(toep->flags & TPF_FLOWC_WR_SENT),
	    ("%s: flowc for tid %u sent already", __func__, toep->tid));

	if (tp != NULL)
		nparams = 8;
	else
		nparams = 6;
	if (toep->params.tc_idx != -1) {
		MPASS(toep->params.tc_idx >= 0 &&
		    toep->params.tc_idx < sc->params.nsched_cls);
		nparams++;
	}

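	/*
	 * Size the WR from the parameter count.  A rough sketch of the
	 * arithmetic (assuming the usual 8-byte fw_flowc_wr header and
	 * 8-byte mnemonic/value entries, per the firmware headers):
	 * nparams = 8 gives flowclen = 8 + 8 * 8 = 72 bytes, which costs
	 * howmany(72, 16) = 5 tx credits and is allocated rounded up to
	 * 80 bytes.
	 */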
	flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);

	wr = alloc_wrqe(roundup2(flowclen, 16), &toep->ofld_txq->wrq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	flowc = wrtod(wr);
	memset(flowc, 0, wr->wr_len);

	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
	    V_FW_FLOWC_WR_NPARAMS(nparams));
	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
	    V_FW_WR_FLOWID(toep->tid));

#define FLOWC_PARAM(__m, __v) \
	do { \
		flowc->mnemval[paramidx].mnemonic = FW_FLOWC_MNEM_##__m; \
		flowc->mnemval[paramidx].val = htobe32(__v); \
		paramidx++; \
	} while (0)

	paramidx = 0;

	FLOWC_PARAM(PFNVFN, pfvf);
	/* Firmware expects hw port and will translate to channel itself. */
	FLOWC_PARAM(CH, pi->hw_port);
	FLOWC_PARAM(PORT, pi->hw_port);
	FLOWC_PARAM(IQID, toep->ofld_rxq->iq.abs_id);
	FLOWC_PARAM(SNDBUF, toep->params.sndbuf);
	if (tp) {
		FLOWC_PARAM(MSS, toep->params.emss);
		FLOWC_PARAM(SNDNXT, tp->snd_nxt);
		FLOWC_PARAM(RCVNXT, tp->rcv_nxt);
	} else
		FLOWC_PARAM(MSS, 512);
	CTR6(KTR_CXGBE,
	    "%s: tid %u, mss %u, sndbuf %u, snd_nxt 0x%x, rcv_nxt 0x%x",
	    __func__, toep->tid, toep->params.emss, toep->params.sndbuf,
	    tp ? tp->snd_nxt : 0, tp ? tp->rcv_nxt : 0);

	if (toep->params.tc_idx != -1)
		FLOWC_PARAM(SCHEDCLASS, toep->params.tc_idx);
#undef FLOWC_PARAM

	KASSERT(paramidx == nparams, ("nparams mismatch"));

	KASSERT(howmany(flowclen, 16) <= MAX_OFLD_TX_SDESC_CREDITS,
	    ("%s: tx_credits %u too large", __func__, howmany(flowclen, 16)));
	txsd->tx_credits = howmany(flowclen, 16);
	txsd->plen = 0;
	KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0,
	    ("%s: not enough credits (%d)", __func__, toep->tx_credits));
	toep->tx_credits -= txsd->tx_credits;
	if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
		toep->txsd_pidx = 0;
	toep->txsd_avail--;

	toep->flags |= TPF_FLOWC_WR_SENT;
	t4_wrq_tx(sc, wr);
}

#ifdef RATELIMIT
/*
 * Input is Bytes/second (so_max_pacing_rate), chip counts in Kilobits/second.
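 *
 * For example, a so_max_pacing_rate of 1,250,000 Bytes/second works out to
 * 1250000 * 8 / 1000 = 10000 Kbps, which is the value handed to the traffic
 * class below.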
 */
static int
update_tx_rate_limit(struct adapter *sc, struct toepcb *toep, u_int Bps)
{
	int tc_idx, rc;
	const u_int kbps = (u_int) (uint64_t)Bps * 8ULL / 1000;
	const int port_id = toep->vi->pi->port_id;

	CTR3(KTR_CXGBE, "%s: tid %u, rate %uKbps", __func__, toep->tid, kbps);

	if (kbps == 0) {
		/* unbind */
		tc_idx = -1;
	} else {
		rc = t4_reserve_cl_rl_kbps(sc, port_id, kbps, &tc_idx);
		if (rc != 0)
			return (rc);
		MPASS(tc_idx >= 0 && tc_idx < sc->params.nsched_cls);
	}

	if (toep->params.tc_idx != tc_idx) {
		struct wrqe *wr;
		struct fw_flowc_wr *flowc;
		int nparams = 1, flowclen, flowclen16;
		struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];

		flowclen = sizeof(*flowc) + nparams * sizeof(struct
		    fw_flowc_mnemval);
		flowclen16 = howmany(flowclen, 16);
		if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0 ||
		    (wr = alloc_wrqe(roundup2(flowclen, 16),
		    &toep->ofld_txq->wrq)) == NULL) {
			if (tc_idx >= 0)
				t4_release_cl_rl(sc, port_id, tc_idx);
			return (ENOMEM);
		}

		flowc = wrtod(wr);
		memset(flowc, 0, wr->wr_len);

		flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
		    V_FW_FLOWC_WR_NPARAMS(nparams));
		flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) |
		    V_FW_WR_FLOWID(toep->tid));

		flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS;
		if (tc_idx == -1)
			flowc->mnemval[0].val = htobe32(0xff);
		else
			flowc->mnemval[0].val = htobe32(tc_idx);

		KASSERT(flowclen16 <= MAX_OFLD_TX_SDESC_CREDITS,
		    ("%s: tx_credits %u too large", __func__, flowclen16));
		txsd->tx_credits = flowclen16;
		txsd->plen = 0;
		toep->tx_credits -= txsd->tx_credits;
		if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
			toep->txsd_pidx = 0;
		toep->txsd_avail--;
		t4_wrq_tx(sc, wr);
	}

	if (toep->params.tc_idx >= 0)
		t4_release_cl_rl(sc, port_id, toep->params.tc_idx);
	toep->params.tc_idx = tc_idx;

	return (0);
}
#endif

void
send_reset(struct adapter *sc, struct toepcb *toep, uint32_t snd_nxt)
{
	struct wrqe *wr;
	struct cpl_abort_req *req;
	int tid = toep->tid;
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = intotcpcb(inp);	/* don't use if INP_DROPPED */

	INP_WLOCK_ASSERT(inp);

	CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x%s",
	    __func__, toep->tid,
	    inp->inp_flags & INP_DROPPED ? "inp dropped" :
	    tcpstates[tp->t_state],
	    toep->flags, inp->inp_flags,
	    toep->flags & TPF_ABORT_SHUTDOWN ?
	    " (abort already in progress)" : "");

	if (toep->flags & TPF_ABORT_SHUTDOWN)
		return;			/* abort already in progress */

	toep->flags |= TPF_ABORT_SHUTDOWN;

	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
	    ("%s: flowc_wr not sent for tid %d.", __func__, tid));

	wr = alloc_wrqe(sizeof(*req), &toep->ofld_txq->wrq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	req = wrtod(wr);

	INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid);
	if (inp->inp_flags & INP_DROPPED)
		req->rsvd0 = htobe32(snd_nxt);
	else
		req->rsvd0 = htobe32(tp->snd_nxt);
	req->rsvd1 = !(toep->flags & TPF_TX_DATA_SENT);
	req->cmd = CPL_ABORT_SEND_RST;

	/*
	 * XXX: What's the correct way to tell that the inp hasn't been
	 * detached from its socket?  Should I even be flushing the snd
	 * buffer here?
	 */
	if ((inp->inp_flags & INP_DROPPED) == 0) {
		struct socket *so = inp->inp_socket;

		if (so != NULL)	/* because I'm not sure.  See comment above */
			sbflush(&so->so_snd);
	}

	t4_l2t_send(sc, wr, toep->l2te);
}

/*
 * Called when a connection is established to translate the TCP options
 * reported by HW to FreeBSD's native format.
 */
static void
assign_rxopt(struct tcpcb *tp, uint16_t opt)
{
	struct toepcb *toep = tp->t_toe;
	struct inpcb *inp = tptoinpcb(tp);
	struct adapter *sc = td_adapter(toep->td);

	INP_LOCK_ASSERT(inp);

	toep->params.mtu_idx = G_TCPOPT_MSS(opt);
	tp->t_maxseg = sc->params.mtus[toep->params.mtu_idx];
	if (inp->inp_inc.inc_flags & INC_ISIPV6)
		tp->t_maxseg -= sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
	else
		tp->t_maxseg -= sizeof(struct ip) + sizeof(struct tcphdr);

	toep->params.emss = tp->t_maxseg;
	if (G_TCPOPT_TSTAMP(opt)) {
		toep->params.tstamp = 1;
		toep->params.emss -= TCPOLEN_TSTAMP_APPA;
		tp->t_flags |= TF_RCVD_TSTMP;	/* timestamps ok */
		tp->ts_recent = 0;		/* hmmm */
		tp->ts_recent_age = tcp_ts_getticks();
	} else
		toep->params.tstamp = 0;

	if (G_TCPOPT_SACK(opt)) {
		toep->params.sack = 1;
		tp->t_flags |= TF_SACK_PERMIT;	/* should already be set */
	} else {
		toep->params.sack = 0;
		tp->t_flags &= ~TF_SACK_PERMIT;	/* sack disallowed by peer */
	}

	if (G_TCPOPT_WSCALE_OK(opt))
		tp->t_flags |= TF_RCVD_SCALE;

	/* Doing window scaling? */
	if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
	    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
		tp->rcv_scale = tp->request_r_scale;
		tp->snd_scale = G_TCPOPT_SND_WSCALE(opt);
	} else
		toep->params.wscale = 0;

	CTR6(KTR_CXGBE,
	    "assign_rxopt: tid %d, mtu_idx %u, emss %u, ts %u, sack %u, wscale %u",
	    toep->tid, toep->params.mtu_idx, toep->params.emss,
	    toep->params.tstamp, toep->params.sack, toep->params.wscale);
}

/*
 * Completes some final bits of initialization for just established connections
 * and changes their state to TCPS_ESTABLISHED.
 *
 * The ISNs are from the exchange of SYNs.
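 *
 * 'opt' is the TCP options word reported by the hardware in the CPL (in
 * big-endian form); assign_rxopt() decodes the MSS index, timestamp, SACK,
 * and window scaling bits from it before the flowc WR is sent.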
 */
void
make_established(struct toepcb *toep, uint32_t iss, uint32_t irs, uint16_t opt)
{
	struct inpcb *inp = toep->inp;
	struct socket *so = inp->inp_socket;
	struct tcpcb *tp = intotcpcb(inp);
	uint16_t tcpopt = be16toh(opt);

	INP_WLOCK_ASSERT(inp);
	KASSERT(tp->t_state == TCPS_SYN_SENT ||
	    tp->t_state == TCPS_SYN_RECEIVED,
	    ("%s: TCP state %s", __func__, tcpstates[tp->t_state]));

	CTR6(KTR_CXGBE, "%s: tid %d, so %p, inp %p, tp %p, toep %p",
	    __func__, toep->tid, so, inp, tp, toep);

	tcp_state_change(tp, TCPS_ESTABLISHED);
	tp->t_starttime = ticks;
	TCPSTAT_INC(tcps_connects);

	tp->irs = irs;
	tcp_rcvseqinit(tp);
	tp->rcv_wnd = (u_int)toep->params.opt0_bufsize << 10;
	tp->rcv_adv += tp->rcv_wnd;
	tp->last_ack_sent = tp->rcv_nxt;

	tp->iss = iss;
	tcp_sendseqinit(tp);
	tp->snd_una = iss + 1;
	tp->snd_nxt = iss + 1;
	tp->snd_max = iss + 1;

	assign_rxopt(tp, tcpopt);
	send_flowc_wr(toep, tp);

	soisconnected(so);
}

int
send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits)
{
	struct wrqe *wr;
	struct cpl_rx_data_ack *req;
	uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);

	KASSERT(credits >= 0, ("%s: %d credits", __func__, credits));

	wr = alloc_wrqe(sizeof(*req), toep->ctrlq);
	if (wr == NULL)
		return (0);
	req = wrtod(wr);

	INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid);
	req->credit_dack = htobe32(dack | V_RX_CREDITS(credits));

	t4_wrq_tx(sc, wr);
	return (credits);
}

void
t4_rcvd_locked(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
	struct inpcb *inp = tptoinpcb(tp);
	struct socket *so = inp->inp_socket;
	struct sockbuf *sb = &so->so_rcv;
	struct toepcb *toep = tp->t_toe;
	int rx_credits;

	INP_WLOCK_ASSERT(inp);
	SOCKBUF_LOCK_ASSERT(sb);

	rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0;
	if (rx_credits > 0 &&
	    (tp->rcv_wnd <= 32 * 1024 || rx_credits >= 64 * 1024 ||
	    (rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) ||
	    sbused(sb) + tp->rcv_wnd < sb->sb_lowat)) {
		rx_credits = send_rx_credits(sc, toep, rx_credits);
		tp->rcv_wnd += rx_credits;
		tp->rcv_adv += rx_credits;
	}
}

void
t4_rcvd(struct toedev *tod, struct tcpcb *tp)
{
	struct inpcb *inp = tptoinpcb(tp);
	struct socket *so = inp->inp_socket;
	struct sockbuf *sb = &so->so_rcv;

	SOCKBUF_LOCK(sb);
	t4_rcvd_locked(tod, tp);
	SOCKBUF_UNLOCK(sb);
}

/*
 * Close a connection by sending a CPL_CLOSE_CON_REQ message.
 */
int
t4_close_conn(struct adapter *sc, struct toepcb *toep)
{
	struct wrqe *wr;
	struct cpl_close_con_req *req;
	unsigned int tid = toep->tid;

	CTR3(KTR_CXGBE, "%s: tid %u%s", __func__, toep->tid,
	    toep->flags & TPF_FIN_SENT ? ", IGNORED" : "");

	if (toep->flags & TPF_FIN_SENT)
		return (0);

	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
	    ("%s: flowc_wr not sent for tid %u.", __func__, tid));

	wr = alloc_wrqe(sizeof(*req), &toep->ofld_txq->wrq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	req = wrtod(wr);

	req->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) |
	    V_FW_WR_IMMDLEN(sizeof(*req) - sizeof(req->wr)));
	req->wr.wr_mid = htonl(V_FW_WR_LEN16(howmany(sizeof(*req), 16)) |
	    V_FW_WR_FLOWID(tid));
	req->wr.wr_lo = cpu_to_be64(0);
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
	req->rsvd = 0;

	toep->flags |= TPF_FIN_SENT;
	toep->flags &= ~TPF_SEND_FIN;
	t4_l2t_send(sc, wr, toep->l2te);

	return (0);
}

#define MAX_OFLD_TX_CREDITS	(SGE_MAX_WR_LEN / 16)
#define MIN_OFLD_TX_CREDITS	(howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16))
#define MIN_ISO_TX_CREDITS	(howmany(sizeof(struct cpl_tx_data_iso), 16))
#define MIN_TX_CREDITS(iso)						\
	(MIN_OFLD_TX_CREDITS + ((iso) ? MIN_ISO_TX_CREDITS : 0))
#define MIN_OFLD_TX_V2_CREDITS	(howmany(sizeof(struct fw_ofld_tx_data_v2_wr) + 1, 16))
#define MIN_TX_V2_CREDITS(iso)						\
	(MIN_OFLD_TX_V2_CREDITS + ((iso) ? MIN_ISO_TX_CREDITS : 0))

_Static_assert(MAX_OFLD_TX_CREDITS <= MAX_OFLD_TX_SDESC_CREDITS,
    "MAX_OFLD_TX_SDESC_CREDITS too small");

/* Maximum amount of immediate data we could stuff in a WR */
static inline int
max_imm_payload(int tx_credits, int iso)
{
	const int iso_cpl_size = iso ? sizeof(struct cpl_tx_data_iso) : 0;
	const int n = 1;	/* Use no more than one desc for imm. data WR */

	KASSERT(tx_credits >= 0 &&
	    tx_credits <= MAX_OFLD_TX_CREDITS,
	    ("%s: %d credits", __func__, tx_credits));

	if (tx_credits < MIN_TX_CREDITS(iso))
		return (0);

	if (tx_credits >= (n * EQ_ESIZE) / 16)
		return ((n * EQ_ESIZE) - sizeof(struct fw_ofld_tx_data_wr) -
		    iso_cpl_size);
	else
		return (tx_credits * 16 - sizeof(struct fw_ofld_tx_data_wr) -
		    iso_cpl_size);
}

/* Maximum number of SGL entries we could stuff in a WR */
static inline int
max_dsgl_nsegs(int tx_credits, int iso)
{
	int nseg = 1;	/* ulptx_sgl has room for 1, rest ulp_tx_sge_pair */
	int sge_pair_credits = tx_credits - MIN_TX_CREDITS(iso);

	KASSERT(tx_credits >= 0 &&
	    tx_credits <= MAX_OFLD_TX_CREDITS,
	    ("%s: %d credits", __func__, tx_credits));

	if (tx_credits < MIN_TX_CREDITS(iso))
		return (0);

	nseg += 2 * (sge_pair_credits * 16 / 24);
	if ((sge_pair_credits * 16) % 24 == 16)
		nseg++;

	return (nseg);
}

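/*
 * Worked example for the helper above (a sketch of the arithmetic, not a
 * firmware-mandated figure): with iso = 0 and enough credits that
 * sge_pair_credits comes out to 24, the SGL area is 24 * 16 = 384 bytes.
 * Each 24-byte ulptx_sge_pair describes two segments, so
 * nseg = 1 + 2 * (384 / 24) = 33.  Had the division left a 16-byte
 * remainder, one more (unpaired) segment would fit.
 */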
/* Maximum amount of immediate data we could stuff in a WR */
static inline int
max_imm_payload_v2(int tx_credits, int iso)
{
	const int iso_cpl_size = iso ? sizeof(struct cpl_tx_data_iso) : 0;

	KASSERT(tx_credits >= 0 &&
	    tx_credits <= MAX_OFLD_TX_CREDITS,
	    ("%s: %d credits", __func__, tx_credits));

	if (tx_credits < MIN_TX_V2_CREDITS(iso))
		return (0);

	return (tx_credits * 16 - sizeof(struct fw_ofld_tx_data_v2_wr) -
	    iso_cpl_size);
}

/* Maximum number of SGL entries we could stuff in a WR */
static inline int
max_dsgl_nsegs_v2(int tx_credits, int iso, int imm_payload)
{
	int nseg = 1;	/* ulptx_sgl has room for 1, rest ulp_tx_sge_pair */
	int sge_pair_credits = tx_credits - MIN_TX_V2_CREDITS(iso);

	KASSERT(tx_credits >= 0 &&
	    tx_credits <= MAX_OFLD_TX_CREDITS,
	    ("%s: %d credits", __func__, tx_credits));

	if (tx_credits < MIN_TX_V2_CREDITS(iso) ||
	    sge_pair_credits <= howmany(imm_payload, 16))
		return (0);
	sge_pair_credits -= howmany(imm_payload, 16);

	nseg += 2 * (sge_pair_credits * 16 / 24);
	if ((sge_pair_credits * 16) % 24 == 16)
		nseg++;

	return (nseg);
}

static inline void
write_tx_wr(void *dst, struct toepcb *toep, int fw_wr_opcode,
    unsigned int immdlen, unsigned int plen, uint8_t credits, int shove,
    int ulp_submode)
{
	struct fw_ofld_tx_data_wr *txwr = dst;

	txwr->op_to_immdlen = htobe32(V_WR_OP(fw_wr_opcode) |
	    V_FW_WR_IMMDLEN(immdlen));
	txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) |
	    V_FW_WR_LEN16(credits));
	txwr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(ulp_mode(toep)) |
	    V_TX_ULP_SUBMODE(ulp_submode) | V_TX_URG(0) | V_TX_SHOVE(shove));
	txwr->plen = htobe32(plen);

	if (toep->params.tx_align > 0) {
		if (plen < 2 * toep->params.emss)
			txwr->lsodisable_to_flags |=
			    htobe32(F_FW_OFLD_TX_DATA_WR_LSODISABLE);
		else
			txwr->lsodisable_to_flags |=
			    htobe32(F_FW_OFLD_TX_DATA_WR_ALIGNPLD |
				(toep->params.nagle == 0 ? 0 :
				F_FW_OFLD_TX_DATA_WR_ALIGNPLDSHOVE));
	}
}

static inline void
write_tx_v2_wr(void *dst, struct toepcb *toep, int fw_wr_opcode,
    unsigned int immdlen, unsigned int plen, uint8_t credits, int shove,
    int ulp_submode)
{
	struct fw_ofld_tx_data_v2_wr *txwr = dst;
	uint32_t flags;

	memset(txwr, 0, sizeof(*txwr));
	txwr->op_to_immdlen = htobe32(V_WR_OP(fw_wr_opcode) |
	    V_FW_WR_IMMDLEN(immdlen));
	txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) |
	    V_FW_WR_LEN16(credits));
	txwr->plen = htobe32(plen);
	flags = V_TX_ULP_MODE(ULP_MODE_NVMET) | V_TX_ULP_SUBMODE(ulp_submode) |
	    V_TX_URG(0) | V_TX_SHOVE(shove);

	if (toep->params.tx_align > 0) {
		if (plen < 2 * toep->params.emss)
			flags |= F_FW_OFLD_TX_DATA_WR_LSODISABLE;
		else
			flags |= F_FW_OFLD_TX_DATA_WR_ALIGNPLD |
			    (toep->params.nagle == 0 ? 0 :
			    F_FW_OFLD_TX_DATA_WR_ALIGNPLDSHOVE);
	}

	txwr->lsodisable_to_flags = htobe32(flags);
}

/*
 * Generate a DSGL from a starting mbuf.  The total number of segments and the
 * maximum segments in any one mbuf are provided.
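 *
 * 'n' (the per-mbuf maximum) only sizes the scratch sglist on the stack;
 * the DSGL itself is laid out as len0/addr0 followed by packed
 * ulptx_sge_pair entries, with a zero length written to pad out an odd
 * final pair.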
 */
static void
write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n)
{
	struct mbuf *m;
	struct ulptx_sgl *usgl = dst;
	int i, j, rc;
	struct sglist sg;
	struct sglist_seg segs[n];

	KASSERT(nsegs > 0, ("%s: nsegs 0", __func__));

	sglist_init(&sg, n, segs);
	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
	    V_ULPTX_NSGE(nsegs));

	i = -1;
	for (m = start; m != stop; m = m->m_next) {
		if (m->m_flags & M_EXTPG)
			rc = sglist_append_mbuf_epg(&sg, m,
			    mtod(m, vm_offset_t), m->m_len);
		else
			rc = sglist_append(&sg, mtod(m, void *), m->m_len);
		if (__predict_false(rc != 0))
			panic("%s: sglist_append %d", __func__, rc);

		for (j = 0; j < sg.sg_nseg; i++, j++) {
			if (i < 0) {
				usgl->len0 = htobe32(segs[j].ss_len);
				usgl->addr0 = htobe64(segs[j].ss_paddr);
			} else {
				usgl->sge[i / 2].len[i & 1] =
				    htobe32(segs[j].ss_len);
				usgl->sge[i / 2].addr[i & 1] =
				    htobe64(segs[j].ss_paddr);
			}
#ifdef INVARIANTS
			nsegs--;
#endif
		}
		sglist_reset(&sg);
	}
	if (i & 1)
		usgl->sge[i / 2].len[1] = htobe32(0);
	KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, stop %p",
	    __func__, nsegs, start, stop));
}

bool
t4_push_raw_wr(struct adapter *sc, struct toepcb *toep, struct mbuf *m)
{
#ifdef INVARIANTS
	struct inpcb *inp = toep->inp;
#endif
	struct wrqe *wr;
	struct ofld_tx_sdesc *txsd;
	u_int credits, plen;

	INP_WLOCK_ASSERT(inp);
	MPASS(mbuf_raw_wr(m));
	plen = m->m_pkthdr.len;
	credits = howmany(plen, 16);
	if (credits > toep->tx_credits)
		return (false);

	wr = alloc_wrqe(roundup2(plen, 16), &toep->ofld_txq->wrq);
	if (wr == NULL)
		return (false);

	m_copydata(m, 0, plen, wrtod(wr));
	m_freem(m);

	toep->tx_credits -= credits;
	if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
		toep->flags |= TPF_TX_SUSPENDED;

	KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
	KASSERT(credits <= MAX_OFLD_TX_SDESC_CREDITS,
	    ("%s: tx_credits %u too large", __func__, credits));
	txsd = &toep->txsd[toep->txsd_pidx];
	txsd->plen = 0;
	txsd->tx_credits = credits;
	if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
		toep->txsd_pidx = 0;
	toep->txsd_avail--;

	t4_wrq_tx(sc, wr);
	return (true);
}

/*
 * Max number of SGL entries an offload tx work request can have. This is 41
 * (1 + 40) for a full 512B work request.
 * fw_ofld_tx_data_wr(16B) + ulptx_sgl(16B, 1) + ulptx_sge_pair(480B, 40)
 */
#define OFLD_SGL_LEN (41)

/*
 * Send data and/or a FIN to the peer.
 *
 * The socket's so_snd buffer consists of a stream of data starting with sb_mb
 * and linked together with m_next.  sb_sndptr, if set, is the last mbuf that
 * was transmitted.
 *
 * drop indicates the number of bytes that should be dropped from the head of
 * the send buffer.  It is an optimization that lets do_fw4_ack avoid creating
 * contention on the send buffer lock (before this change it used to do
 * sowwakeup and then t4_push_frames right after that when recovering from tx
 * stalls).  When drop is set this function MUST drop the bytes and wake up any
 * writers.
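 *
 * Each trip through the loop below builds at most one work request: either
 * an immediate-data WR (payload copied into the WR) when everything fits and
 * no unmapped (M_EXTPG) mbufs were seen, or a DSGL WR that points at the
 * mbuf pages.  Raw WRs queued on ulp_pduq are drained first.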
 */
static void
t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop)
{
	struct mbuf *sndptr, *m, *sb_sndptr;
	struct fw_ofld_tx_data_wr *txwr;
	struct wrqe *wr;
	u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = intotcpcb(inp);
	struct socket *so = inp->inp_socket;
	struct sockbuf *sb = &so->so_snd;
	struct mbufq *pduq = &toep->ulp_pduq;
	int tx_credits, shove, compl, sowwakeup;
	struct ofld_tx_sdesc *txsd;
	bool nomap_mbuf_seen;

	INP_WLOCK_ASSERT(inp);
	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));

	KASSERT(ulp_mode(toep) == ULP_MODE_NONE ||
	    ulp_mode(toep) == ULP_MODE_TCPDDP ||
	    ulp_mode(toep) == ULP_MODE_TLS ||
	    ulp_mode(toep) == ULP_MODE_RDMA,
	    ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep));

#ifdef VERBOSE_TRACES
	CTR5(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d",
	    __func__, toep->tid, toep->flags, tp->t_flags, drop);
#endif
	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
		return;

#ifdef RATELIMIT
	if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) &&
	    (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) {
		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
	}
#endif

	/*
	 * This function doesn't resume by itself.  Someone else must clear the
	 * flag and call this function.
	 */
	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
		KASSERT(drop == 0,
		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
		return;
	}

	txsd = &toep->txsd[toep->txsd_pidx];
	do {
		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
		max_imm = max_imm_payload(tx_credits, 0);
		max_nsegs = max_dsgl_nsegs(tx_credits, 0);

		if (__predict_false((sndptr = mbufq_first(pduq)) != NULL)) {
			if (!t4_push_raw_wr(sc, toep, sndptr)) {
				toep->flags |= TPF_TX_SUSPENDED;
				return;
			}

			m = mbufq_dequeue(pduq);
			MPASS(m == sndptr);

			txsd = &toep->txsd[toep->txsd_pidx];
			continue;
		}

		SOCKBUF_LOCK(sb);
		sowwakeup = drop;
		if (drop) {
			sbdrop_locked(sb, drop);
			drop = 0;
		}
		sb_sndptr = sb->sb_sndptr;
		sndptr = sb_sndptr ? sb_sndptr->m_next : sb->sb_mb;
		plen = 0;
		nsegs = 0;
		max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
		nomap_mbuf_seen = false;
		for (m = sndptr; m != NULL; m = m->m_next) {
			int n;

			if ((m->m_flags & M_NOTREADY) != 0)
				break;
			if (plen + m->m_len > MAX_OFLD_TX_SDESC_PLEN)
				break;
			if (m->m_flags & M_EXTPG) {
#ifdef KERN_TLS
				if (m->m_epg_tls != NULL) {
					toep->flags |= TPF_KTLS;
					if (plen == 0) {
						SOCKBUF_UNLOCK(sb);
						t4_push_ktls(sc, toep, 0);
						return;
					}
					break;
				}
#endif
				n = sglist_count_mbuf_epg(m,
				    mtod(m, vm_offset_t), m->m_len);
			} else
				n = sglist_count(mtod(m, void *), m->m_len);

			nsegs += n;
			plen += m->m_len;

			/* This mbuf sent us _over_ the nsegs limit, back out */
			if (plen > max_imm && nsegs > max_nsegs) {
				nsegs -= n;
				plen -= m->m_len;
				if (plen == 0) {
					/* Too few credits */
					toep->flags |= TPF_TX_SUSPENDED;
					if (sowwakeup) {
						if (!TAILQ_EMPTY(
						    &toep->aiotx_jobq))
							t4_aiotx_queue_toep(so,
							    toep);
						sowwakeup_locked(so);
					} else
						SOCKBUF_UNLOCK(sb);
					SOCKBUF_UNLOCK_ASSERT(sb);
					return;
				}
				break;
			}

			if (m->m_flags & M_EXTPG)
				nomap_mbuf_seen = true;
			if (max_nsegs_1mbuf < n)
				max_nsegs_1mbuf = n;
			sb_sndptr = m;	/* new sb->sb_sndptr if all goes well */

			/* This mbuf put us right at the max_nsegs limit */
			if (plen > max_imm && nsegs == max_nsegs) {
				m = m->m_next;
				break;
			}
		}

		if (sbused(sb) > sb->sb_hiwat * 5 / 8 &&
		    toep->plen_nocompl + plen >= sb->sb_hiwat / 4)
			compl = 1;
		else
			compl = 0;

		if (sb->sb_flags & SB_AUTOSIZE &&
		    V_tcp_do_autosndbuf &&
		    sb->sb_hiwat < V_tcp_autosndbuf_max &&
		    sbused(sb) >= sb->sb_hiwat * 7 / 8) {
			int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc,
			    V_tcp_autosndbuf_max);

			if (!sbreserve_locked(so, SO_SND, newsize, NULL))
				sb->sb_flags &= ~SB_AUTOSIZE;
			else
				sowwakeup = 1;	/* room available */
		}
		if (sowwakeup) {
			if (!TAILQ_EMPTY(&toep->aiotx_jobq))
				t4_aiotx_queue_toep(so, toep);
			sowwakeup_locked(so);
		} else
			SOCKBUF_UNLOCK(sb);
		SOCKBUF_UNLOCK_ASSERT(sb);

		/* nothing to send */
		if (plen == 0) {
			KASSERT(m == NULL || (m->m_flags & M_NOTREADY) != 0,
			    ("%s: nothing to send, but m != NULL is ready",
			    __func__));
			break;
		}

		if (__predict_false(toep->flags & TPF_FIN_SENT))
			panic("%s: excess tx.", __func__);

		shove = m == NULL && !(tp->t_flags & TF_MORETOCOME);
		if (plen <= max_imm && !nomap_mbuf_seen) {

			/* Immediate data tx */

			wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16),
			    &toep->ofld_txq->wrq);
			if (wr == NULL) {
				/* XXX: how will we recover from this? */
				toep->flags |= TPF_TX_SUSPENDED;
				return;
			}
			txwr = wrtod(wr);
			credits = howmany(wr->wr_len, 16);
			write_tx_wr(txwr, toep, FW_OFLD_TX_DATA_WR, plen, plen,
			    credits, shove, 0);
			m_copydata(sndptr, 0, plen, (void *)(txwr + 1));
			nsegs = 0;
		} else {
			int wr_len;

			/* DSGL tx */

			wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) +
			    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
			wr = alloc_wrqe(roundup2(wr_len, 16),
			    &toep->ofld_txq->wrq);
			if (wr == NULL) {
				/* XXX: how will we recover from this? */
				toep->flags |= TPF_TX_SUSPENDED;
				return;
			}
			txwr = wrtod(wr);
			credits = howmany(wr_len, 16);
			write_tx_wr(txwr, toep, FW_OFLD_TX_DATA_WR, 0, plen,
			    credits, shove, 0);
			write_tx_sgl(txwr + 1, sndptr, m, nsegs,
			    max_nsegs_1mbuf);
			if (wr_len & 0xf) {
				uint64_t *pad = (uint64_t *)
				    ((uintptr_t)txwr + wr_len);
				*pad = 0;
			}
		}

		KASSERT(toep->tx_credits >= credits,
		    ("%s: not enough credits", __func__));

		toep->tx_credits -= credits;
		toep->tx_nocompl += credits;
		toep->plen_nocompl += plen;
		if (toep->tx_credits <= toep->tx_total * 3 / 8 &&
		    toep->tx_nocompl >= toep->tx_total / 4)
			compl = 1;

		if (compl || ulp_mode(toep) == ULP_MODE_RDMA) {
			txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL);
			toep->tx_nocompl = 0;
			toep->plen_nocompl = 0;
		}

		tp->snd_nxt += plen;
		tp->snd_max += plen;

		SOCKBUF_LOCK(sb);
		KASSERT(sb_sndptr, ("%s: sb_sndptr is NULL", __func__));
		sb->sb_sndptr = sb_sndptr;
		SOCKBUF_UNLOCK(sb);

		toep->flags |= TPF_TX_DATA_SENT;
		if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
			toep->flags |= TPF_TX_SUSPENDED;

		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
		KASSERT(plen <= MAX_OFLD_TX_SDESC_PLEN,
		    ("%s: plen %u too large", __func__, plen));
		txsd->plen = plen;
		txsd->tx_credits = credits;
		txsd++;
		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
			toep->txsd_pidx = 0;
			txsd = &toep->txsd[0];
		}
		toep->txsd_avail--;

		t4_l2t_send(sc, wr, toep->l2te);
	} while (m != NULL && (m->m_flags & M_NOTREADY) == 0);

	/* Send a FIN if requested, but only if there's no more data to send */
	if (m == NULL && toep->flags & TPF_SEND_FIN)
		t4_close_conn(sc, toep);
}

static inline void
rqdrop_locked(struct mbufq *q, int plen)
{
	struct mbuf *m;

	while (plen > 0) {
		m = mbufq_dequeue(q);

		/* Too many credits. */
		MPASS(m != NULL);
		M_ASSERTPKTHDR(m);

		/* Partial credits. */
		MPASS(plen >= m->m_pkthdr.len);

		plen -= m->m_pkthdr.len;
		m_freem(m);
	}
}

/*
 * Not a bit in the TCB, but is a bit in the ulp_submode field of the
 * CPL_TX_DATA flags field in FW_ISCSI_TX_DATA_WR.
 */
#define ULP_ISO	G_TX_ULP_SUBMODE(F_FW_ISCSI_TX_DATA_WR_ULPSUBMODE_ISO)

static void
write_iscsi_tx_data_iso(void *dst, u_int ulp_submode, uint8_t flags,
    uint16_t mss, int len, int npdu)
{
	struct cpl_tx_data_iso *cpl;
	unsigned int burst_size;
	unsigned int last;

	/*
	 * The firmware will set the 'F' bit on the last PDU when
	 * either condition is true:
	 *
	 *	- this large PDU is marked as the "last" slice
	 *
	 *	- the amount of data payload bytes equals the burst_size
	 *
	 * The strategy used here is to always set the burst_size
	 * artificially high (len includes the size of the template
	 * BHS) and only set the "last" flag if the original PDU had
	 * 'F' set.
	 */
	burst_size = len;
	last = !!(flags & CXGBE_ISO_F);

	cpl = (struct cpl_tx_data_iso *)dst;
	cpl->op_to_scsi = htonl(V_CPL_TX_DATA_ISO_OP(CPL_TX_DATA_ISO) |
	    V_CPL_TX_DATA_ISO_FIRST(1) | V_CPL_TX_DATA_ISO_LAST(last) |
	    V_CPL_TX_DATA_ISO_CPLHDRLEN(0) |
	    V_CPL_TX_DATA_ISO_HDRCRC(!!(ulp_submode & ULP_CRC_HEADER)) |
	    V_CPL_TX_DATA_ISO_PLDCRC(!!(ulp_submode & ULP_CRC_DATA)) |
	    V_CPL_TX_DATA_ISO_IMMEDIATE(0) |
	    V_CPL_TX_DATA_ISO_SCSI(CXGBE_ISO_TYPE(flags)));

	cpl->ahs_len = 0;
	cpl->mpdu = htons(DIV_ROUND_UP(mss, 4));
	cpl->burst_size = htonl(DIV_ROUND_UP(burst_size, 4));
	cpl->len = htonl(len);
	cpl->reserved2_seglen_offset = htonl(0);
	cpl->datasn_offset = htonl(0);
	cpl->buffer_offset = htonl(0);
	cpl->reserved3 = 0;
}

static struct wrqe *
write_iscsi_mbuf_wr(struct toepcb *toep, struct mbuf *sndptr)
{
	struct mbuf *m;
	struct fw_ofld_tx_data_wr *txwr;
	struct cpl_tx_data_iso *cpl_iso;
	void *p;
	struct wrqe *wr;
	u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
	u_int adjusted_plen, imm_data, ulp_submode;
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = intotcpcb(inp);
	int tx_credits, shove, npdu, wr_len;
	uint16_t iso_mss;
	static const u_int ulp_extra_len[] = {0, 4, 4, 8};
	bool iso, nomap_mbuf_seen;

	M_ASSERTPKTHDR(sndptr);

	tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
	if (mbuf_raw_wr(sndptr)) {
		plen = sndptr->m_pkthdr.len;
		KASSERT(plen <= SGE_MAX_WR_LEN,
		    ("raw WR len %u is greater than max WR len", plen));
		if (plen > tx_credits * 16)
			return (NULL);

		wr = alloc_wrqe(roundup2(plen, 16), &toep->ofld_txq->wrq);
		if (__predict_false(wr == NULL))
			return (NULL);

		m_copydata(sndptr, 0, plen, wrtod(wr));
		return (wr);
	}

	iso = mbuf_iscsi_iso(sndptr);
	max_imm = max_imm_payload(tx_credits, iso);
	max_nsegs = max_dsgl_nsegs(tx_credits, iso);
	iso_mss = mbuf_iscsi_iso_mss(sndptr);

	plen = 0;
	nsegs = 0;
	max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
	nomap_mbuf_seen = false;
	for (m = sndptr; m != NULL; m = m->m_next) {
		int n;

		if (m->m_flags & M_EXTPG)
			n = sglist_count_mbuf_epg(m, mtod(m, vm_offset_t),
			    m->m_len);
		else
			n = sglist_count(mtod(m, void *), m->m_len);

		nsegs += n;
		plen += m->m_len;

		/*
		 * This mbuf would send us _over_ the nsegs limit.
		 * Suspend tx because the PDU can't be sent out.
		 */
		if ((nomap_mbuf_seen || plen > max_imm) && nsegs > max_nsegs)
			return (NULL);

		if (m->m_flags & M_EXTPG)
			nomap_mbuf_seen = true;
		if (max_nsegs_1mbuf < n)
			max_nsegs_1mbuf = n;
	}

	if (__predict_false(toep->flags & TPF_FIN_SENT))
		panic("%s: excess tx.", __func__);

	/*
	 * We have a PDU to send.  All of it goes out in one WR so 'm'
	 * is NULL.  A PDU's length is always a multiple of 4.
	 */
	MPASS(m == NULL);
	MPASS((plen & 3) == 0);
	MPASS(sndptr->m_pkthdr.len == plen);

	shove = !(tp->t_flags & TF_MORETOCOME);

	/*
	 * plen doesn't include header and data digests, which are
	 * generated and inserted in the right places by the TOE, but
	 * they do occupy TCP sequence space and need to be accounted
	 * for.
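	 *
	 * As a worked example (numbers follow ulp_extra_len[] and the
	 * formula just below): with both header and data digests enabled
	 * the per-PDU overhead is 8 bytes, so a 3-PDU ISO burst adds
	 * 8 * 3 = 24 bytes of digest plus 48 * 2 = 96 bytes for the two
	 * extra BHS copies to adjusted_plen.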
	 */
	ulp_submode = mbuf_ulp_submode(sndptr);
	MPASS(ulp_submode < nitems(ulp_extra_len));
	npdu = iso ? howmany(plen - ISCSI_BHS_SIZE, iso_mss) : 1;
	adjusted_plen = plen + ulp_extra_len[ulp_submode] * npdu;
	if (iso)
		adjusted_plen += ISCSI_BHS_SIZE * (npdu - 1);
	wr_len = sizeof(*txwr);
	if (iso)
		wr_len += sizeof(struct cpl_tx_data_iso);
	if (plen <= max_imm && !nomap_mbuf_seen) {
		/* Immediate data tx */
		imm_data = plen;
		wr_len += plen;
		nsegs = 0;
	} else {
		/* DSGL tx */
		imm_data = 0;
		wr_len += sizeof(struct ulptx_sgl) +
		    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
	}

	wr = alloc_wrqe(roundup2(wr_len, 16), &toep->ofld_txq->wrq);
	if (wr == NULL) {
		/* XXX: how will we recover from this? */
		return (NULL);
	}
	txwr = wrtod(wr);
	credits = howmany(wr->wr_len, 16);

	if (iso) {
		write_tx_wr(txwr, toep, FW_ISCSI_TX_DATA_WR,
		    imm_data + sizeof(struct cpl_tx_data_iso),
		    adjusted_plen, credits, shove, ulp_submode | ULP_ISO);
		cpl_iso = (struct cpl_tx_data_iso *)(txwr + 1);
		MPASS(plen == sndptr->m_pkthdr.len);
		write_iscsi_tx_data_iso(cpl_iso, ulp_submode,
		    mbuf_iscsi_iso_flags(sndptr), iso_mss, plen, npdu);
		p = cpl_iso + 1;
	} else {
		write_tx_wr(txwr, toep, FW_OFLD_TX_DATA_WR, imm_data,
		    adjusted_plen, credits, shove, ulp_submode);
		p = txwr + 1;
	}

	if (imm_data != 0) {
		m_copydata(sndptr, 0, plen, p);
	} else {
		write_tx_sgl(p, sndptr, m, nsegs, max_nsegs_1mbuf);
		if (wr_len & 0xf) {
			uint64_t *pad = (uint64_t *)((uintptr_t)txwr + wr_len);
			*pad = 0;
		}
	}

	KASSERT(toep->tx_credits >= credits,
	    ("%s: not enough credits: credits %u "
		"toep->tx_credits %u tx_credits %u nsegs %u "
		"max_nsegs %u iso %d", __func__, credits,
		toep->tx_credits, tx_credits, nsegs, max_nsegs, iso));

	tp->snd_nxt += adjusted_plen;
	tp->snd_max += adjusted_plen;

	counter_u64_add(toep->ofld_txq->tx_iscsi_pdus, npdu);
	counter_u64_add(toep->ofld_txq->tx_iscsi_octets, plen);
	if (iso)
		counter_u64_add(toep->ofld_txq->tx_iscsi_iso_wrs, 1);

	return (wr);
}

static void
write_nvme_tx_data_iso(void *dst, u_int ulp_submode, u_int iso_type,
    uint16_t mss, int len, int npdu, int pdo)
{
	struct cpl_t7_tx_data_iso *cpl;
	unsigned int burst_size;

	/*
	 * TODO: Need to figure out how the LAST_PDU and SUCCESS flags
	 * are handled.
	 *
	 * - Does len need padding bytes?  (If so, does padding need
	 *   to be in DSGL input?)
	 *
	 * - burst always 0?
	 */
	burst_size = 0;

	cpl = (struct cpl_t7_tx_data_iso *)dst;
	cpl->op_to_scsi = htonl(V_CPL_T7_TX_DATA_ISO_OPCODE(CPL_TX_DATA_ISO) |
	    V_CPL_T7_TX_DATA_ISO_FIRST(1) |
	    V_CPL_T7_TX_DATA_ISO_LAST(1) |
	    V_CPL_T7_TX_DATA_ISO_CPLHDRLEN(0) |
	    V_CPL_T7_TX_DATA_ISO_HDRCRC(!!(ulp_submode & ULP_CRC_HEADER)) |
	    V_CPL_T7_TX_DATA_ISO_PLDCRC(!!(ulp_submode & ULP_CRC_DATA)) |
	    V_CPL_T7_TX_DATA_ISO_IMMEDIATE(0) |
	    V_CPL_T7_TX_DATA_ISO_SCSI(iso_type));

	cpl->nvme_tcp_pkd = F_CPL_T7_TX_DATA_ISO_NVME_TCP;
	cpl->ahs = 0;
	cpl->mpdu = htons(DIV_ROUND_UP(mss, 4));
	cpl->burst = htonl(DIV_ROUND_UP(burst_size, 4));
	cpl->size = htonl(len);
	cpl->num_pi_bytes_seglen_offset = htonl(0);
	cpl->datasn_offset = htonl(0);
	cpl->buffer_offset = htonl(0);
	cpl->pdo_pkd = pdo;
}

static struct wrqe *
write_nvme_mbuf_wr(struct toepcb *toep, struct mbuf *sndptr)
{
	struct mbuf *m;
	const struct nvme_tcp_common_pdu_hdr *hdr;
	struct fw_v2_nvmet_tx_data_wr *txwr;
	struct cpl_tx_data_iso *cpl_iso;
	void *p;
	struct wrqe *wr;
	u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
	u_int adjusted_plen, imm_data, ulp_submode;
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = intotcpcb(inp);
	int tx_credits, shove, npdu, wr_len;
	uint16_t iso_mss;
	bool iso, nomap_mbuf_seen;

	M_ASSERTPKTHDR(sndptr);

	tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
	if (mbuf_raw_wr(sndptr)) {
		plen = sndptr->m_pkthdr.len;
		KASSERT(plen <= SGE_MAX_WR_LEN,
		    ("raw WR len %u is greater than max WR len", plen));
		if (plen > tx_credits * 16)
			return (NULL);

		wr = alloc_wrqe(roundup2(plen, 16), &toep->ofld_txq->wrq);
		if (__predict_false(wr == NULL))
			return (NULL);

		m_copydata(sndptr, 0, plen, wrtod(wr));
		return (wr);
	}

	/*
	 * The first mbuf is the PDU header that is always sent as
	 * immediate data.
	 */
	imm_data = sndptr->m_len;

	iso = mbuf_iscsi_iso(sndptr);
	max_imm = max_imm_payload_v2(tx_credits, iso);

	/*
	 * Not enough credits for the PDU header.
	 */
	if (imm_data > max_imm)
		return (NULL);

	max_nsegs = max_dsgl_nsegs_v2(tx_credits, iso, imm_data);
	iso_mss = mbuf_iscsi_iso_mss(sndptr);

	plen = imm_data;
	nsegs = 0;
	max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
	nomap_mbuf_seen = false;
	for (m = sndptr->m_next; m != NULL; m = m->m_next) {
		int n;

		if (m->m_flags & M_EXTPG)
			n = sglist_count_mbuf_epg(m, mtod(m, vm_offset_t),
			    m->m_len);
		else
			n = sglist_count(mtod(m, void *), m->m_len);

		nsegs += n;
		plen += m->m_len;

		/*
		 * This mbuf would send us _over_ the nsegs limit.
		 * Suspend tx because the PDU can't be sent out.
		 */
		if ((nomap_mbuf_seen || plen > max_imm) && nsegs > max_nsegs)
			return (NULL);

		if (m->m_flags & M_EXTPG)
			nomap_mbuf_seen = true;
		if (max_nsegs_1mbuf < n)
			max_nsegs_1mbuf = n;
	}

	if (__predict_false(toep->flags & TPF_FIN_SENT))
		panic("%s: excess tx.", __func__);

	/*
	 * We have a PDU to send.  All of it goes out in one WR so 'm'
	 * is NULL.  A PDU's length is always a multiple of 4.
	 */
	MPASS(m == NULL);
	MPASS((plen & 3) == 0);
	MPASS(sndptr->m_pkthdr.len == plen);

	shove = !(tp->t_flags & TF_MORETOCOME);

	/*
	 * plen doesn't include header digests, padding, and data
	 * digests which are generated and inserted in the right
	 * places by the TOE, but they do occupy TCP sequence space
	 * and need to be accounted for.
	 *
	 * To determine the overhead, check the PDU header in sndptr.
	 * Note that only certain PDU types can use digests and
	 * padding, and PDO accounts for all but the data digests for
	 * those PDUs.
	 */
	MPASS((sndptr->m_flags & M_EXTPG) == 0);
	ulp_submode = mbuf_ulp_submode(sndptr);
	hdr = mtod(sndptr, const void *);
	switch (hdr->pdu_type) {
	case NVME_TCP_PDU_TYPE_H2C_TERM_REQ:
	case NVME_TCP_PDU_TYPE_C2H_TERM_REQ:
		MPASS(ulp_submode == 0);
		MPASS(!iso);
		break;
	case NVME_TCP_PDU_TYPE_CAPSULE_RESP:
	case NVME_TCP_PDU_TYPE_R2T:
		MPASS((ulp_submode & ULP_CRC_DATA) == 0);
		/* FALLTHROUGH */
	case NVME_TCP_PDU_TYPE_CAPSULE_CMD:
		MPASS(!iso);
		break;
	case NVME_TCP_PDU_TYPE_H2C_DATA:
	case NVME_TCP_PDU_TYPE_C2H_DATA:
		if (le32toh(hdr->plen) + ((ulp_submode & ULP_CRC_DATA) != 0 ?
		    sizeof(uint32_t) : 0) == plen)
			MPASS(!iso);
		break;
	default:
		__assert_unreachable();
	}

	if (iso) {
		npdu = howmany(plen - hdr->hlen, iso_mss);
		adjusted_plen = hdr->pdo * npdu + (plen - hdr->hlen);
		if ((ulp_submode & ULP_CRC_DATA) != 0)
			adjusted_plen += npdu * sizeof(uint32_t);
	} else {
		npdu = 1;
		adjusted_plen = le32toh(hdr->plen);
	}
	wr_len = sizeof(*txwr);
	if (iso)
		wr_len += sizeof(struct cpl_tx_data_iso);
	if (plen <= max_imm && !nomap_mbuf_seen) {
		/* Immediate data tx for full PDU */
		imm_data = plen;
		wr_len += plen;
		nsegs = 0;
	} else {
		/* DSGL tx for PDU data */
		wr_len += roundup2(imm_data, 16);
		wr_len += sizeof(struct ulptx_sgl) +
		    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
	}

	wr = alloc_wrqe(roundup2(wr_len, 16), &toep->ofld_txq->wrq);
	if (wr == NULL) {
		/* XXX: how will we recover from this? */
		return (NULL);
	}
	txwr = wrtod(wr);
	credits = howmany(wr->wr_len, 16);

	if (iso) {
		write_tx_v2_wr(txwr, toep, FW_V2_NVMET_TX_DATA_WR,
		    imm_data + sizeof(struct cpl_tx_data_iso),
		    adjusted_plen, credits, shove, ulp_submode | ULP_ISO);
		cpl_iso = (struct cpl_tx_data_iso *)(txwr + 1);
		MPASS(plen == sndptr->m_pkthdr.len);
		write_nvme_tx_data_iso(cpl_iso, ulp_submode,
		    (hdr->pdu_type & 0x1) == 0 ? 1 : 2, iso_mss, plen, npdu,
		    hdr->pdo);
		p = cpl_iso + 1;
	} else {
		write_tx_v2_wr(txwr, toep, FW_V2_NVMET_TX_DATA_WR, imm_data,
		    adjusted_plen, credits, shove, ulp_submode);
		p = txwr + 1;
	}

	/*
	 * PDU header (and immediate data payload).
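	 *
	 * When the payload did not fit as immediate data, only this header
	 * mbuf is copied into the WR; the rest of the PDU follows as a DSGL
	 * written at the next 16-byte boundary after the header.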
	 */
	m_copydata(sndptr, 0, imm_data, p);
	if (nsegs != 0) {
		p = roundup2((char *)p + imm_data, 16);
		write_tx_sgl(p, sndptr->m_next, NULL, nsegs, max_nsegs_1mbuf);
		if (wr_len & 0xf) {
			uint64_t *pad = (uint64_t *)((uintptr_t)txwr + wr_len);
			*pad = 0;
		}
	}

	KASSERT(toep->tx_credits >= credits,
	    ("%s: not enough credits: credits %u "
		"toep->tx_credits %u tx_credits %u nsegs %u "
		"max_nsegs %u iso %d", __func__, credits,
		toep->tx_credits, tx_credits, nsegs, max_nsegs, iso));

	tp->snd_nxt += adjusted_plen;
	tp->snd_max += adjusted_plen;

	counter_u64_add(toep->ofld_txq->tx_nvme_pdus, npdu);
	counter_u64_add(toep->ofld_txq->tx_nvme_octets, plen);
	if (iso)
		counter_u64_add(toep->ofld_txq->tx_nvme_iso_wrs, 1);

	return (wr);
}

void
t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop)
{
	struct mbuf *sndptr, *m;
	struct fw_wr_hdr *wrhdr;
	struct wrqe *wr;
	u_int plen, credits, mode;
	struct inpcb *inp = toep->inp;
	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
	struct mbufq *pduq = &toep->ulp_pduq;

	INP_WLOCK_ASSERT(inp);
	mode = ulp_mode(toep);
	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
	KASSERT(mode == ULP_MODE_ISCSI || mode == ULP_MODE_NVMET,
	    ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep));

	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
		return;

	/*
	 * This function doesn't resume by itself.  Someone else must clear the
	 * flag and call this function.
	 */
	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
		KASSERT(drop == 0,
		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
		return;
	}

	if (drop) {
		struct socket *so = inp->inp_socket;
		struct sockbuf *sb = &so->so_snd;
		int sbu;

		/*
		 * An unlocked read is ok here as the data should only
		 * transition from a non-zero value to either another
		 * non-zero value or zero.  Once it is zero it should
		 * stay zero.
		 */
		if (__predict_false(sbused(sb)) > 0) {
			SOCKBUF_LOCK(sb);
			sbu = sbused(sb);
			if (sbu > 0) {
				/*
				 * The data transmitted before the
				 * tid's ULP mode changed to ISCSI/NVMET is
				 * still in so_snd.  Incoming credits
				 * should account for so_snd first.
				 */
				sbdrop_locked(sb, min(sbu, drop));
				drop -= min(sbu, drop);
			}
			sowwakeup_locked(so);	/* unlocks so_snd */
		}
		rqdrop_locked(&toep->ulp_pdu_reclaimq, drop);
	}

	while ((sndptr = mbufq_first(pduq)) != NULL) {
		if (mode == ULP_MODE_ISCSI)
			wr = write_iscsi_mbuf_wr(toep, sndptr);
		else
			wr = write_nvme_mbuf_wr(toep, sndptr);
		if (wr == NULL) {
			toep->flags |= TPF_TX_SUSPENDED;
			return;
		}

		plen = sndptr->m_pkthdr.len;
		credits = howmany(wr->wr_len, 16);
		KASSERT(toep->tx_credits >= credits,
		    ("%s: not enough credits", __func__));

		m = mbufq_dequeue(pduq);
		MPASS(m == sndptr);
		mbufq_enqueue(&toep->ulp_pdu_reclaimq, m);

		toep->tx_credits -= credits;
		toep->tx_nocompl += credits;
		toep->plen_nocompl += plen;

		/*
		 * Ensure there are enough credits for a full-sized WR
		 * as page pod WRs can be full-sized.
		 */
		if (toep->tx_credits <= SGE_MAX_WR_LEN * 5 / 4 &&
		    toep->tx_nocompl >= toep->tx_total / 4) {
			wrhdr = wrtod(wr);
			wrhdr->hi |= htobe32(F_FW_WR_COMPL);
			toep->tx_nocompl = 0;
			toep->plen_nocompl = 0;
		}

		toep->flags |= TPF_TX_DATA_SENT;
		if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
			toep->flags |= TPF_TX_SUSPENDED;

		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
		KASSERT(plen <= MAX_OFLD_TX_SDESC_PLEN,
		    ("%s: plen %u too large", __func__, plen));
		txsd->plen = plen;
		txsd->tx_credits = credits;
		txsd++;
		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
			toep->txsd_pidx = 0;
			txsd = &toep->txsd[0];
		}
		toep->txsd_avail--;

		t4_l2t_send(sc, wr, toep->l2te);
	}

	/* Send a FIN if requested, but only if there are no more PDUs to send */
	if (mbufq_first(pduq) == NULL && toep->flags & TPF_SEND_FIN)
		t4_close_conn(sc, toep);
}

static inline void
t4_push_data(struct adapter *sc, struct toepcb *toep, int drop)
{

	if (ulp_mode(toep) == ULP_MODE_ISCSI ||
	    ulp_mode(toep) == ULP_MODE_NVMET)
		t4_push_pdus(sc, toep, drop);
	else if (toep->flags & TPF_KTLS)
		t4_push_ktls(sc, toep, drop);
	else
		t4_push_frames(sc, toep, drop);
}

void
t4_raw_wr_tx(struct adapter *sc, struct toepcb *toep, struct mbuf *m)
{
#ifdef INVARIANTS
	struct inpcb *inp = toep->inp;
#endif

	INP_WLOCK_ASSERT(inp);

	/*
	 * If there are other raw WRs enqueued, enqueue to preserve
	 * FIFO ordering.
	 */
	if (!mbufq_empty(&toep->ulp_pduq)) {
		mbufq_enqueue(&toep->ulp_pduq, m);
		return;
	}

	/*
	 * Cannot call t4_push_data here as that will lock so_snd and
	 * some callers of this run in rx handlers with so_rcv locked.
	 * Instead, just try to transmit this WR.
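	 *
	 * If the WR cannot be sent right now (not enough credits or no
	 * wrqe memory), it is queued on ulp_pduq and tx is marked
	 * suspended so that whoever later clears TPF_TX_SUSPENDED and
	 * resumes transmission will retry it.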
1660 */ 1661 if (!t4_push_raw_wr(sc, toep, m)) { 1662 mbufq_enqueue(&toep->ulp_pduq, m); 1663 toep->flags |= TPF_TX_SUSPENDED; 1664 } 1665 } 1666 1667 int 1668 t4_tod_output(struct toedev *tod, struct tcpcb *tp) 1669 { 1670 struct adapter *sc = tod->tod_softc; 1671 #ifdef INVARIANTS 1672 struct inpcb *inp = tptoinpcb(tp); 1673 #endif 1674 struct toepcb *toep = tp->t_toe; 1675 1676 INP_WLOCK_ASSERT(inp); 1677 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1678 ("%s: inp %p dropped.", __func__, inp)); 1679 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 1680 1681 t4_push_data(sc, toep, 0); 1682 1683 return (0); 1684 } 1685 1686 int 1687 t4_send_fin(struct toedev *tod, struct tcpcb *tp) 1688 { 1689 struct adapter *sc = tod->tod_softc; 1690 #ifdef INVARIANTS 1691 struct inpcb *inp = tptoinpcb(tp); 1692 #endif 1693 struct toepcb *toep = tp->t_toe; 1694 1695 INP_WLOCK_ASSERT(inp); 1696 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1697 ("%s: inp %p dropped.", __func__, inp)); 1698 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 1699 1700 toep->flags |= TPF_SEND_FIN; 1701 if (tp->t_state >= TCPS_ESTABLISHED) 1702 t4_push_data(sc, toep, 0); 1703 1704 return (0); 1705 } 1706 1707 int 1708 t4_send_rst(struct toedev *tod, struct tcpcb *tp) 1709 { 1710 struct adapter *sc = tod->tod_softc; 1711 #if defined(INVARIANTS) 1712 struct inpcb *inp = tptoinpcb(tp); 1713 #endif 1714 struct toepcb *toep = tp->t_toe; 1715 1716 INP_WLOCK_ASSERT(inp); 1717 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1718 ("%s: inp %p dropped.", __func__, inp)); 1719 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 1720 1721 /* hmmmm */ 1722 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 1723 ("%s: flowc for tid %u [%s] not sent already", 1724 __func__, toep->tid, tcpstates[tp->t_state])); 1725 1726 send_reset(sc, toep, 0); 1727 return (0); 1728 } 1729 1730 /* 1731 * Peer has sent us a FIN. 1732 */ 1733 static int 1734 do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1735 { 1736 struct adapter *sc = iq->adapter; 1737 const struct cpl_peer_close *cpl = (const void *)(rss + 1); 1738 unsigned int tid = GET_TID(cpl); 1739 struct toepcb *toep = lookup_tid(sc, tid); 1740 struct inpcb *inp = toep->inp; 1741 struct tcpcb *tp = NULL; 1742 struct socket *so; 1743 struct epoch_tracker et; 1744 #ifdef INVARIANTS 1745 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1746 #endif 1747 1748 KASSERT(opcode == CPL_PEER_CLOSE, 1749 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1750 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1751 1752 if (__predict_false(toep->flags & TPF_SYNQE)) { 1753 /* 1754 * do_pass_establish must have run before do_peer_close and if 1755 * this is still a synqe instead of a toepcb then the connection 1756 * must be getting aborted. 1757 */ 1758 MPASS(toep->flags & TPF_ABORT_SHUTDOWN); 1759 CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid, 1760 toep, toep->flags); 1761 return (0); 1762 } 1763 1764 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1765 1766 CURVNET_SET(toep->vnet); 1767 NET_EPOCH_ENTER(et); 1768 INP_WLOCK(inp); 1769 tp = intotcpcb(inp); 1770 1771 CTR6(KTR_CXGBE, 1772 "%s: tid %u (%s), toep_flags 0x%x, ddp_flags 0x%x, inp %p", 1773 __func__, tid, tp ? 
	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags,
	    toep->ddp.flags, inp);

	if (toep->flags & TPF_ABORT_SHUTDOWN)
		goto done;

	if (ulp_mode(toep) == ULP_MODE_TCPDDP) {
		DDP_LOCK(toep);
		if (__predict_false(toep->ddp.flags &
		    (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)))
			handle_ddp_close(toep, tp, cpl->rcv_nxt);
		DDP_UNLOCK(toep);
	}
	so = inp->inp_socket;
	socantrcvmore(so);

	if (ulp_mode(toep) == ULP_MODE_RDMA ||
	    (ulp_mode(toep) == ULP_MODE_ISCSI && chip_id(sc) >= CHELSIO_T6) ||
	    ulp_mode(toep) == ULP_MODE_NVMET) {
		/*
		 * There might be data received via DDP before the FIN
		 * not reported to the driver.  Just assume the
		 * sequence number in the CPL is correct as the
		 * sequence number of the FIN.
		 */
	} else {
		KASSERT(tp->rcv_nxt + 1 == be32toh(cpl->rcv_nxt),
		    ("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt,
		    be32toh(cpl->rcv_nxt)));
	}

	tp->rcv_nxt = be32toh(cpl->rcv_nxt);

	switch (tp->t_state) {
	case TCPS_SYN_RECEIVED:
		tp->t_starttime = ticks;
		/* FALLTHROUGH */

	case TCPS_ESTABLISHED:
		tcp_state_change(tp, TCPS_CLOSE_WAIT);
		break;

	case TCPS_FIN_WAIT_1:
		tcp_state_change(tp, TCPS_CLOSING);
		break;

	case TCPS_FIN_WAIT_2:
		restore_so_proto(so, inp->inp_vflag & INP_IPV6);
		t4_pcb_detach(NULL, tp);
		tcp_twstart(tp);
		INP_UNLOCK_ASSERT(inp);	/* safe, we have a ref on the inp */
		NET_EPOCH_EXIT(et);
		CURVNET_RESTORE();

		INP_WLOCK(inp);
		final_cpl_received(toep);
		return (0);

	default:
		log(LOG_ERR, "%s: TID %u received CPL_PEER_CLOSE in state %d\n",
		    __func__, tid, tp->t_state);
	}
done:
	INP_WUNLOCK(inp);
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();
	return (0);
}

/*
 * Peer has ACK'd our FIN.
 */
static int
do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_close_con_rpl *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = NULL;
	struct socket *so = NULL;
	struct epoch_tracker et;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_CLOSE_CON_RPL,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	CURVNET_SET(toep->vnet);
	NET_EPOCH_ENTER(et);
	INP_WLOCK(inp);
	tp = intotcpcb(inp);

	CTR4(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x",
	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags);

	if (toep->flags & TPF_ABORT_SHUTDOWN)
		goto done;

	so = inp->inp_socket;
	tp->snd_una = be32toh(cpl->snd_nxt) - 1;	/* exclude FIN */

	switch (tp->t_state) {
	case TCPS_CLOSING:	/* see TCPS_FIN_WAIT_2 in do_peer_close too */
		restore_so_proto(so, inp->inp_vflag & INP_IPV6);
		t4_pcb_detach(NULL, tp);
		tcp_twstart(tp);
release:
		INP_UNLOCK_ASSERT(inp);	/* safe, we have a ref on the inp */
		NET_EPOCH_EXIT(et);
		CURVNET_RESTORE();

		INP_WLOCK(inp);
		final_cpl_received(toep);	/* no more CPLs expected */

		return (0);
	case TCPS_LAST_ACK:
		if (tcp_close(tp))
			INP_WUNLOCK(inp);
		goto release;

	case TCPS_FIN_WAIT_1:
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
			soisdisconnected(so);
		tcp_state_change(tp, TCPS_FIN_WAIT_2);
		break;

	default:
		log(LOG_ERR,
		    "%s: TID %u received CPL_CLOSE_CON_RPL in state %s\n",
		    __func__, tid, tcpstates[tp->t_state]);
	}
done:
	INP_WUNLOCK(inp);
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();
	return (0);
}

void
send_abort_rpl(struct adapter *sc, struct sge_ofld_txq *ofld_txq, int tid,
    int rst_status)
{
	struct wrqe *wr;
	struct cpl_abort_rpl *cpl;

	wr = alloc_wrqe(sizeof(*cpl), &ofld_txq->wrq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	cpl = wrtod(wr);

	INIT_TP_WR_MIT_CPL(cpl, CPL_ABORT_RPL, tid);
	cpl->cmd = rst_status;

	t4_wrq_tx(sc, wr);
}

static int
abort_status_to_errno(struct tcpcb *tp, unsigned int abort_reason)
{
	switch (abort_reason) {
	case CPL_ERR_BAD_SYN:
	case CPL_ERR_CONN_RESET:
		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
	case CPL_ERR_XMIT_TIMEDOUT:
	case CPL_ERR_PERSIST_TIMEDOUT:
	case CPL_ERR_FINWAIT2_TIMEDOUT:
	case CPL_ERR_KEEPALIVE_TIMEDOUT:
		return (ETIMEDOUT);
	default:
		return (EIO);
	}
}

/*
 * TCP RST from the peer, timeout, or some other such critical error.
 */
static int
do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct sge_ofld_txq *ofld_txq = toep->ofld_txq;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct epoch_tracker et;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_ABORT_REQ_RSS,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));

	if (toep->flags & TPF_SYNQE)
		return (do_abort_req_synqe(iq, rss, m));

	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	if (negative_advice(cpl->status)) {
		CTR4(KTR_CXGBE, "%s: negative advice %d for tid %d (0x%x)",
		    __func__, cpl->status, tid, toep->flags);
		return (0);	/* Ignore negative advice */
	}

	inp = toep->inp;
	CURVNET_SET(toep->vnet);
	NET_EPOCH_ENTER(et);	/* for tcp_close */
	INP_WLOCK(inp);

	tp = intotcpcb(inp);

	CTR6(KTR_CXGBE,
	    "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x, status %d",
tcpstates[tp->t_state] : "no tp", toep->flags, 1997 inp->inp_flags, cpl->status); 1998 1999 /* 2000 * If we'd initiated an abort earlier the reply to it is responsible for 2001 * cleaning up resources. Otherwise we tear everything down right here 2002 * right now. We owe the T4 a CPL_ABORT_RPL no matter what. 2003 */ 2004 if (toep->flags & TPF_ABORT_SHUTDOWN) { 2005 INP_WUNLOCK(inp); 2006 goto done; 2007 } 2008 toep->flags |= TPF_ABORT_SHUTDOWN; 2009 2010 if ((inp->inp_flags & INP_DROPPED) == 0) { 2011 struct socket *so = inp->inp_socket; 2012 2013 if (so != NULL) 2014 so_error_set(so, abort_status_to_errno(tp, 2015 cpl->status)); 2016 tp = tcp_close(tp); 2017 if (tp == NULL) 2018 INP_WLOCK(inp); /* re-acquire */ 2019 } 2020 2021 final_cpl_received(toep); 2022 done: 2023 NET_EPOCH_EXIT(et); 2024 CURVNET_RESTORE(); 2025 send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST); 2026 return (0); 2027 } 2028 2029 /* 2030 * Reply to the CPL_ABORT_REQ (send_reset) 2031 */ 2032 static int 2033 do_abort_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 2034 { 2035 struct adapter *sc = iq->adapter; 2036 const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1); 2037 unsigned int tid = GET_TID(cpl); 2038 struct toepcb *toep = lookup_tid(sc, tid); 2039 struct inpcb *inp = toep->inp; 2040 #ifdef INVARIANTS 2041 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 2042 #endif 2043 2044 KASSERT(opcode == CPL_ABORT_RPL_RSS, 2045 ("%s: unexpected opcode 0x%x", __func__, opcode)); 2046 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 2047 2048 if (toep->flags & TPF_SYNQE) 2049 return (do_abort_rpl_synqe(iq, rss, m)); 2050 2051 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 2052 2053 CTR5(KTR_CXGBE, "%s: tid %u, toep %p, inp %p, status %d", 2054 __func__, tid, toep, inp, cpl->status); 2055 2056 KASSERT(toep->flags & TPF_ABORT_SHUTDOWN, 2057 ("%s: wasn't expecting abort reply", __func__)); 2058 2059 INP_WLOCK(inp); 2060 final_cpl_received(toep); 2061 2062 return (0); 2063 } 2064 2065 static int 2066 do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 2067 { 2068 struct adapter *sc = iq->adapter; 2069 const struct cpl_rx_data *cpl = mtod(m, const void *); 2070 unsigned int tid = GET_TID(cpl); 2071 struct toepcb *toep = lookup_tid(sc, tid); 2072 struct inpcb *inp = toep->inp; 2073 struct tcpcb *tp; 2074 struct socket *so; 2075 struct sockbuf *sb; 2076 struct epoch_tracker et; 2077 int len; 2078 uint32_t ddp_placed = 0; 2079 2080 if (__predict_false(toep->flags & TPF_SYNQE)) { 2081 /* 2082 * do_pass_establish must have run before do_rx_data and if this 2083 * is still a synqe instead of a toepcb then the connection must 2084 * be getting aborted. 2085 */ 2086 MPASS(toep->flags & TPF_ABORT_SHUTDOWN); 2087 CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid, 2088 toep, toep->flags); 2089 m_freem(m); 2090 return (0); 2091 } 2092 2093 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 2094 2095 /* strip off CPL header */ 2096 m_adj(m, sizeof(*cpl)); 2097 len = m->m_pkthdr.len; 2098 2099 INP_WLOCK(inp); 2100 if (inp->inp_flags & INP_DROPPED) { 2101 CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x", 2102 __func__, tid, len, inp->inp_flags); 2103 INP_WUNLOCK(inp); 2104 m_freem(m); 2105 return (0); 2106 } 2107 2108 tp = intotcpcb(inp); 2109 2110 if (__predict_false(ulp_mode(toep) == ULP_MODE_TLS && 2111 toep->flags & TPF_TLS_RECEIVE)) { 2112 /* Received "raw" data on a TLS socket. 
*/ 2113 CTR3(KTR_CXGBE, "%s: tid %u, raw TLS data (%d bytes)", 2114 __func__, tid, len); 2115 do_rx_data_tls(cpl, toep, m); 2116 return (0); 2117 } 2118 2119 if (__predict_false(tp->rcv_nxt != be32toh(cpl->seq))) 2120 ddp_placed = be32toh(cpl->seq) - tp->rcv_nxt; 2121 2122 tp->rcv_nxt += len; 2123 if (tp->rcv_wnd < len) { 2124 KASSERT(ulp_mode(toep) == ULP_MODE_RDMA, 2125 ("%s: negative window size", __func__)); 2126 } 2127 2128 tp->rcv_wnd -= len; 2129 tp->t_rcvtime = ticks; 2130 2131 if (ulp_mode(toep) == ULP_MODE_TCPDDP) 2132 DDP_LOCK(toep); 2133 so = inp_inpcbtosocket(inp); 2134 sb = &so->so_rcv; 2135 SOCKBUF_LOCK(sb); 2136 2137 if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) { 2138 CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)", 2139 __func__, tid, len); 2140 m_freem(m); 2141 SOCKBUF_UNLOCK(sb); 2142 if (ulp_mode(toep) == ULP_MODE_TCPDDP) 2143 DDP_UNLOCK(toep); 2144 INP_WUNLOCK(inp); 2145 2146 CURVNET_SET(toep->vnet); 2147 NET_EPOCH_ENTER(et); 2148 INP_WLOCK(inp); 2149 tp = tcp_drop(tp, ECONNRESET); 2150 if (tp) 2151 INP_WUNLOCK(inp); 2152 NET_EPOCH_EXIT(et); 2153 CURVNET_RESTORE(); 2154 2155 return (0); 2156 } 2157 2158 /* receive buffer autosize */ 2159 MPASS(toep->vnet == so->so_vnet); 2160 CURVNET_SET(toep->vnet); 2161 if (sb->sb_flags & SB_AUTOSIZE && 2162 V_tcp_do_autorcvbuf && 2163 sb->sb_hiwat < V_tcp_autorcvbuf_max && 2164 len > (sbspace(sb) / 8 * 7)) { 2165 unsigned int hiwat = sb->sb_hiwat; 2166 unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc, 2167 V_tcp_autorcvbuf_max); 2168 2169 if (!sbreserve_locked(so, SO_RCV, newsize, NULL)) 2170 sb->sb_flags &= ~SB_AUTOSIZE; 2171 } 2172 2173 if (ulp_mode(toep) == ULP_MODE_TCPDDP) { 2174 int changed = !(toep->ddp.flags & DDP_ON) ^ cpl->ddp_off; 2175 2176 if (toep->ddp.waiting_count != 0 || toep->ddp.active_count != 0) 2177 CTR3(KTR_CXGBE, "%s: tid %u, non-ddp rx (%d bytes)", 2178 __func__, tid, len); 2179 2180 if (changed) { 2181 if (toep->ddp.flags & DDP_SC_REQ) 2182 toep->ddp.flags ^= DDP_ON | DDP_SC_REQ; 2183 else if (cpl->ddp_off == 1) { 2184 /* Fell out of DDP mode */ 2185 toep->ddp.flags &= ~DDP_ON; 2186 CTR1(KTR_CXGBE, "%s: fell out of DDP mode", 2187 __func__); 2188 2189 insert_ddp_data(toep, ddp_placed); 2190 } else { 2191 /* 2192 * Data was received while still 2193 * ULP_MODE_NONE, just fall through. 2194 */ 2195 } 2196 } 2197 2198 if (toep->ddp.flags & DDP_ON) { 2199 /* 2200 * CPL_RX_DATA with DDP on can only be an indicate. 2201 * Start posting queued AIO requests via DDP. The 2202 * payload that arrived in this indicate is appended 2203 * to the socket buffer as usual. 
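* (An "indicate" here is a small amount of payload delivered through the regular rx queue so the host notices that data has arrived and can post DDP buffers for what follows.)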
2204 */ 2205 handle_ddp_indicate(toep); 2206 } 2207 } 2208 2209 sbappendstream_locked(sb, m, 0); 2210 t4_rcvd_locked(&toep->td->tod, tp); 2211 2212 if (ulp_mode(toep) == ULP_MODE_TCPDDP && 2213 (toep->ddp.flags & DDP_AIO) != 0 && toep->ddp.waiting_count > 0 && 2214 sbavail(sb) != 0) { 2215 CTR2(KTR_CXGBE, "%s: tid %u queueing AIO task", __func__, 2216 tid); 2217 ddp_queue_toep(toep); 2218 } 2219 if (toep->flags & TPF_TLS_STARTING) 2220 tls_received_starting_data(sc, toep, sb, len); 2221 sorwakeup_locked(so); 2222 SOCKBUF_UNLOCK_ASSERT(sb); 2223 if (ulp_mode(toep) == ULP_MODE_TCPDDP) 2224 DDP_UNLOCK(toep); 2225 2226 INP_WUNLOCK(inp); 2227 CURVNET_RESTORE(); 2228 return (0); 2229 } 2230 2231 static int 2232 do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 2233 { 2234 struct adapter *sc = iq->adapter; 2235 const struct cpl_fw4_ack *cpl = (const void *)(rss + 1); 2236 unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl))); 2237 struct toepcb *toep = lookup_tid(sc, tid); 2238 struct inpcb *inp; 2239 struct tcpcb *tp; 2240 struct socket *so; 2241 uint8_t credits = cpl->credits; 2242 struct ofld_tx_sdesc *txsd; 2243 int plen; 2244 #ifdef INVARIANTS 2245 unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl))); 2246 #endif 2247 2248 /* 2249 * Very unusual case: we'd sent a flowc + abort_req for a synq entry and 2250 * now this comes back carrying the credits for the flowc. 2251 */ 2252 if (__predict_false(toep->flags & TPF_SYNQE)) { 2253 KASSERT(toep->flags & TPF_ABORT_SHUTDOWN, 2254 ("%s: credits for a synq entry %p", __func__, toep)); 2255 return (0); 2256 } 2257 2258 inp = toep->inp; 2259 2260 KASSERT(opcode == CPL_FW4_ACK, 2261 ("%s: unexpected opcode 0x%x", __func__, opcode)); 2262 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 2263 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 2264 2265 INP_WLOCK(inp); 2266 2267 if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) { 2268 INP_WUNLOCK(inp); 2269 return (0); 2270 } 2271 2272 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 2273 ("%s: inp_flags 0x%x", __func__, inp->inp_flags)); 2274 2275 tp = intotcpcb(inp); 2276 2277 if (cpl->flags & CPL_FW4_ACK_FLAGS_SEQVAL) { 2278 tcp_seq snd_una = be32toh(cpl->snd_una); 2279 2280 #ifdef INVARIANTS 2281 if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) { 2282 log(LOG_ERR, 2283 "%s: unexpected seq# %x for TID %u, snd_una %x\n", 2284 __func__, snd_una, toep->tid, tp->snd_una); 2285 } 2286 #endif 2287 2288 if (tp->snd_una != snd_una) { 2289 tp->snd_una = snd_una; 2290 tp->ts_recent_age = tcp_ts_getticks(); 2291 } 2292 } 2293 2294 #ifdef VERBOSE_TRACES 2295 CTR3(KTR_CXGBE, "%s: tid %d credits %u", __func__, tid, credits); 2296 #endif 2297 so = inp->inp_socket; 2298 txsd = &toep->txsd[toep->txsd_cidx]; 2299 plen = 0; 2300 while (credits) { 2301 KASSERT(credits >= txsd->tx_credits, 2302 ("%s: too many (or partial) credits", __func__)); 2303 credits -= txsd->tx_credits; 2304 toep->tx_credits += txsd->tx_credits; 2305 plen += txsd->plen; 2306 txsd++; 2307 toep->txsd_avail++; 2308 KASSERT(toep->txsd_avail <= toep->txsd_total, 2309 ("%s: txsd avail > total", __func__)); 2310 if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) { 2311 txsd = &toep->txsd[0]; 2312 toep->txsd_cidx = 0; 2313 } 2314 } 2315 2316 if (toep->tx_credits == toep->tx_total) { 2317 toep->tx_nocompl = 0; 2318 toep->plen_nocompl = 0; 2319 } 2320 2321 if (toep->flags & TPF_TX_SUSPENDED && 2322 toep->tx_credits >= toep->tx_total / 4) { 2323 #ifdef VERBOSE_TRACES 
2324 CTR2(KTR_CXGBE, "%s: tid %d calling t4_push_frames", __func__, 2325 tid); 2326 #endif 2327 toep->flags &= ~TPF_TX_SUSPENDED; 2328 CURVNET_SET(toep->vnet); 2329 t4_push_data(sc, toep, plen); 2330 CURVNET_RESTORE(); 2331 } else if (plen > 0) { 2332 struct sockbuf *sb = &so->so_snd; 2333 int sbu; 2334 2335 SOCKBUF_LOCK(sb); 2336 sbu = sbused(sb); 2337 if (ulp_mode(toep) == ULP_MODE_ISCSI || 2338 ulp_mode(toep) == ULP_MODE_NVMET) { 2339 if (__predict_false(sbu > 0)) { 2340 /* 2341 * The data transmitted before the 2342 * tid's ULP mode changed to ISCSI is 2343 * still in so_snd. Incoming credits 2344 * should account for so_snd first. 2345 */ 2346 sbdrop_locked(sb, min(sbu, plen)); 2347 plen -= min(sbu, plen); 2348 } 2349 sowwakeup_locked(so); /* unlocks so_snd */ 2350 rqdrop_locked(&toep->ulp_pdu_reclaimq, plen); 2351 } else { 2352 #ifdef VERBOSE_TRACES 2353 CTR3(KTR_CXGBE, "%s: tid %d dropped %d bytes", __func__, 2354 tid, plen); 2355 #endif 2356 sbdrop_locked(sb, plen); 2357 if (!TAILQ_EMPTY(&toep->aiotx_jobq)) 2358 t4_aiotx_queue_toep(so, toep); 2359 sowwakeup_locked(so); /* unlocks so_snd */ 2360 } 2361 SOCKBUF_UNLOCK_ASSERT(sb); 2362 } 2363 2364 INP_WUNLOCK(inp); 2365 2366 return (0); 2367 } 2368 2369 void 2370 write_set_tcb_field(struct adapter *sc, void *dst, struct toepcb *toep, 2371 uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie) 2372 { 2373 struct cpl_set_tcb_field *req = dst; 2374 2375 MPASS((cookie & ~M_COOKIE) == 0); 2376 if (reply) { 2377 MPASS(cookie != CPL_COOKIE_RESERVED); 2378 } 2379 2380 INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, toep->tid); 2381 if (reply == 0) { 2382 req->reply_ctrl = htobe16(F_NO_REPLY); 2383 } else { 2384 const int qid = toep->ofld_rxq->iq.abs_id; 2385 if (chip_id(sc) >= CHELSIO_T7) { 2386 req->reply_ctrl = htobe16(V_T7_QUEUENO(qid) | 2387 V_T7_REPLY_CHAN(0) | V_NO_REPLY(0)); 2388 } else { 2389 req->reply_ctrl = htobe16(V_QUEUENO(qid) | 2390 V_REPLY_CHAN(0) | V_NO_REPLY(0)); 2391 } 2392 } 2393 req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(cookie)); 2394 req->mask = htobe64(mask); 2395 req->val = htobe64(val); 2396 } 2397 2398 void 2399 t4_set_tcb_field(struct adapter *sc, struct sge_wrq *wrq, struct toepcb *toep, 2400 uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie) 2401 { 2402 struct wrqe *wr; 2403 struct ofld_tx_sdesc *txsd; 2404 const u_int len = sizeof(struct cpl_set_tcb_field); 2405 2406 wr = alloc_wrqe(len, wrq); 2407 if (wr == NULL) { 2408 /* XXX */ 2409 panic("%s: allocation failure.", __func__); 2410 } 2411 write_set_tcb_field(sc, wrtod(wr), toep, word, mask, val, reply, 2412 cookie); 2413 2414 if (wrq->eq.type == EQ_OFLD) { 2415 txsd = &toep->txsd[toep->txsd_pidx]; 2416 _Static_assert(howmany(len, 16) <= MAX_OFLD_TX_SDESC_CREDITS, 2417 "MAX_OFLD_TX_SDESC_CREDITS too small"); 2418 txsd->tx_credits = howmany(len, 16); 2419 txsd->plen = 0; 2420 KASSERT(toep->tx_credits >= txsd->tx_credits && 2421 toep->txsd_avail > 0, 2422 ("%s: not enough credits (%d)", __func__, 2423 toep->tx_credits)); 2424 toep->tx_credits -= txsd->tx_credits; 2425 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) 2426 toep->txsd_pidx = 0; 2427 toep->txsd_avail--; 2428 } 2429 2430 t4_wrq_tx(sc, wr); 2431 } 2432 2433 void 2434 t4_init_cpl_io_handlers(void) 2435 { 2436 2437 t4_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close); 2438 t4_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl); 2439 t4_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req); 2440 t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, 
do_abort_rpl, 2441 CPL_COOKIE_TOM); 2442 t4_register_cpl_handler(CPL_RX_DATA, do_rx_data); 2443 t4_register_shared_cpl_handler(CPL_FW4_ACK, do_fw4_ack, CPL_COOKIE_TOM); 2444 } 2445 2446 void 2447 t4_uninit_cpl_io_handlers(void) 2448 { 2449 2450 t4_register_cpl_handler(CPL_PEER_CLOSE, NULL); 2451 t4_register_cpl_handler(CPL_CLOSE_CON_RPL, NULL); 2452 t4_register_cpl_handler(CPL_ABORT_REQ_RSS, NULL); 2453 t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, NULL, CPL_COOKIE_TOM); 2454 t4_register_cpl_handler(CPL_RX_DATA, NULL); 2455 t4_register_shared_cpl_handler(CPL_FW4_ACK, NULL, CPL_COOKIE_TOM); 2456 } 2457 2458 /* 2459 * Use the 'backend1' field in AIO jobs to hold an error that should 2460 * be reported when the job is completed, the 'backend3' field to 2461 * store the amount of data sent by the AIO job so far, and the 2462 * 'backend4' field to hold a reference count on the job. 2463 * 2464 * Each unmapped mbuf holds a reference on the job as does the queue 2465 * so long as the job is queued. 2466 */ 2467 #define aio_error backend1 2468 #define aio_sent backend3 2469 #define aio_refs backend4 2470 2471 #ifdef VERBOSE_TRACES 2472 static int 2473 jobtotid(struct kaiocb *job) 2474 { 2475 struct socket *so; 2476 struct tcpcb *tp; 2477 struct toepcb *toep; 2478 2479 so = job->fd_file->f_data; 2480 tp = sototcpcb(so); 2481 toep = tp->t_toe; 2482 return (toep->tid); 2483 } 2484 #endif 2485 2486 static void 2487 aiotx_free_job(struct kaiocb *job) 2488 { 2489 long status; 2490 int error; 2491 2492 if (refcount_release(&job->aio_refs) == 0) 2493 return; 2494 2495 error = (intptr_t)job->aio_error; 2496 status = job->aio_sent; 2497 #ifdef VERBOSE_TRACES 2498 CTR5(KTR_CXGBE, "%s: tid %d completed %p len %ld, error %d", __func__, 2499 jobtotid(job), job, status, error); 2500 #endif 2501 if (error != 0 && status != 0) 2502 error = 0; 2503 if (error == ECANCELED) 2504 aio_cancel(job); 2505 else if (error) 2506 aio_complete(job, -1, error); 2507 else { 2508 job->msgsnd = 1; 2509 aio_complete(job, status, 0); 2510 } 2511 } 2512 2513 static void 2514 aiotx_free_pgs(struct mbuf *m) 2515 { 2516 struct kaiocb *job; 2517 vm_page_t pg; 2518 2519 M_ASSERTEXTPG(m); 2520 job = m->m_ext.ext_arg1; 2521 #ifdef VERBOSE_TRACES 2522 CTR3(KTR_CXGBE, "%s: completed %d bytes for tid %d", __func__, 2523 m->m_len, jobtotid(job)); 2524 #endif 2525 2526 for (int i = 0; i < m->m_epg_npgs; i++) { 2527 pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]); 2528 vm_page_unwire(pg, PQ_ACTIVE); 2529 } 2530 2531 aiotx_free_job(job); 2532 } 2533 2534 /* 2535 * Allocate a chain of unmapped mbufs describing the next 'len' bytes 2536 * of an AIO job. 2537 */ 2538 static struct mbuf * 2539 alloc_aiotx_mbuf(struct kaiocb *job, int len) 2540 { 2541 struct vmspace *vm; 2542 vm_page_t pgs[MBUF_PEXT_MAX_PGS]; 2543 struct mbuf *m, *top, *last; 2544 vm_map_t map; 2545 vm_offset_t start; 2546 int i, mlen, npages, pgoff; 2547 2548 KASSERT(job->aio_sent + len <= job->uaiocb.aio_nbytes, 2549 ("%s(%p, %d): request to send beyond end of buffer", __func__, 2550 job, len)); 2551 2552 /* 2553 * The AIO subsystem will cancel and drain all requests before 2554 * permitting a process to exit or exec, so p_vmspace should 2555 * be stable here. 
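* Each loop iteration below pins up to MBUF_PEXT_MAX_PGS user pages with vm_fault_quick_hold_pages() and wraps them in an unmapped external-pages mbuf; aiotx_free_pgs() unwires the pages when the mbuf is freed.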
2556 */ 2557 vm = job->userproc->p_vmspace; 2558 map = &vm->vm_map; 2559 start = (uintptr_t)job->uaiocb.aio_buf + job->aio_sent; 2560 pgoff = start & PAGE_MASK; 2561 2562 top = NULL; 2563 last = NULL; 2564 while (len > 0) { 2565 mlen = imin(len, MBUF_PEXT_MAX_PGS * PAGE_SIZE - pgoff); 2566 KASSERT(mlen == len || ((start + mlen) & PAGE_MASK) == 0, 2567 ("%s: next start (%#jx + %#x) is not page aligned", 2568 __func__, (uintmax_t)start, mlen)); 2569 2570 npages = vm_fault_quick_hold_pages(map, start, mlen, 2571 VM_PROT_WRITE, pgs, nitems(pgs)); 2572 if (npages < 0) 2573 break; 2574 2575 m = mb_alloc_ext_pgs(M_WAITOK, aiotx_free_pgs, M_RDONLY); 2576 m->m_epg_1st_off = pgoff; 2577 m->m_epg_npgs = npages; 2578 if (npages == 1) { 2579 KASSERT(mlen + pgoff <= PAGE_SIZE, 2580 ("%s: single page is too large (off %d len %d)", 2581 __func__, pgoff, mlen)); 2582 m->m_epg_last_len = mlen; 2583 } else { 2584 m->m_epg_last_len = mlen - (PAGE_SIZE - pgoff) - 2585 (npages - 2) * PAGE_SIZE; 2586 } 2587 for (i = 0; i < npages; i++) 2588 m->m_epg_pa[i] = VM_PAGE_TO_PHYS(pgs[i]); 2589 2590 m->m_len = mlen; 2591 m->m_ext.ext_size = npages * PAGE_SIZE; 2592 m->m_ext.ext_arg1 = job; 2593 refcount_acquire(&job->aio_refs); 2594 2595 #ifdef VERBOSE_TRACES 2596 CTR5(KTR_CXGBE, "%s: tid %d, new mbuf %p for job %p, npages %d", 2597 __func__, jobtotid(job), m, job, npages); 2598 #endif 2599 2600 if (top == NULL) 2601 top = m; 2602 else 2603 last->m_next = m; 2604 last = m; 2605 2606 len -= mlen; 2607 start += mlen; 2608 pgoff = 0; 2609 } 2610 2611 return (top); 2612 } 2613 2614 static void 2615 t4_aiotx_process_job(struct toepcb *toep, struct socket *so, struct kaiocb *job) 2616 { 2617 struct sockbuf *sb; 2618 struct inpcb *inp; 2619 struct tcpcb *tp; 2620 struct mbuf *m; 2621 u_int sent; 2622 int error, len; 2623 bool moretocome, sendmore; 2624 2625 sb = &so->so_snd; 2626 SOCKBUF_UNLOCK(sb); 2627 m = NULL; 2628 2629 #ifdef MAC 2630 error = mac_socket_check_send(job->fd_file->f_cred, so); 2631 if (error != 0) 2632 goto out; 2633 #endif 2634 2635 /* Inline sosend_generic(). */ 2636 2637 error = SOCK_IO_SEND_LOCK(so, SBL_WAIT); 2638 MPASS(error == 0); 2639 2640 sendanother: 2641 SOCKBUF_LOCK(sb); 2642 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 2643 SOCKBUF_UNLOCK(sb); 2644 SOCK_IO_SEND_UNLOCK(so); 2645 if ((so->so_options & SO_NOSIGPIPE) == 0) { 2646 PROC_LOCK(job->userproc); 2647 kern_psignal(job->userproc, SIGPIPE); 2648 PROC_UNLOCK(job->userproc); 2649 } 2650 error = EPIPE; 2651 goto out; 2652 } 2653 if (so->so_error) { 2654 error = so->so_error; 2655 so->so_error = 0; 2656 SOCKBUF_UNLOCK(sb); 2657 SOCK_IO_SEND_UNLOCK(so); 2658 goto out; 2659 } 2660 if ((so->so_state & SS_ISCONNECTED) == 0) { 2661 SOCKBUF_UNLOCK(sb); 2662 SOCK_IO_SEND_UNLOCK(so); 2663 error = ENOTCONN; 2664 goto out; 2665 } 2666 if (sbspace(sb) < sb->sb_lowat) { 2667 MPASS(job->aio_sent == 0 || !(so->so_state & SS_NBIO)); 2668 2669 /* 2670 * Don't block if there is too little room in the socket 2671 * buffer. Instead, requeue the request. 2672 */ 2673 if (!aio_set_cancel_function(job, t4_aiotx_cancel)) { 2674 SOCKBUF_UNLOCK(sb); 2675 SOCK_IO_SEND_UNLOCK(so); 2676 error = ECANCELED; 2677 goto out; 2678 } 2679 TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list); 2680 SOCKBUF_UNLOCK(sb); 2681 SOCK_IO_SEND_UNLOCK(so); 2682 goto out; 2683 } 2684 2685 /* 2686 * Write as much data as the socket permits, but no more than a 2687 * single sndbuf at a time. 
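* The sendmore flag loops back to sendanother for the next sndbuf-sized chunk, and moretocome sets TF_MORETOCOME so the TCP output path knows more data is about to be appended.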
2688 */ 2689 len = sbspace(sb); 2690 if (len > job->uaiocb.aio_nbytes - job->aio_sent) { 2691 len = job->uaiocb.aio_nbytes - job->aio_sent; 2692 moretocome = false; 2693 } else 2694 moretocome = true; 2695 if (len > toep->params.sndbuf) { 2696 len = toep->params.sndbuf; 2697 sendmore = true; 2698 } else 2699 sendmore = false; 2700 2701 if (!TAILQ_EMPTY(&toep->aiotx_jobq)) 2702 moretocome = true; 2703 SOCKBUF_UNLOCK(sb); 2704 MPASS(len != 0); 2705 2706 m = alloc_aiotx_mbuf(job, len); 2707 if (m == NULL) { 2708 SOCK_IO_SEND_UNLOCK(so); 2709 error = EFAULT; 2710 goto out; 2711 } 2712 2713 /* Inlined tcp_usr_send(). */ 2714 2715 inp = toep->inp; 2716 INP_WLOCK(inp); 2717 if (inp->inp_flags & INP_DROPPED) { 2718 INP_WUNLOCK(inp); 2719 SOCK_IO_SEND_UNLOCK(so); 2720 error = ECONNRESET; 2721 goto out; 2722 } 2723 2724 sent = m_length(m, NULL); 2725 job->aio_sent += sent; 2726 counter_u64_add(toep->ofld_txq->tx_aio_octets, sent); 2727 2728 sbappendstream(sb, m, 0); 2729 m = NULL; 2730 2731 if (!(inp->inp_flags & INP_DROPPED)) { 2732 tp = intotcpcb(inp); 2733 if (moretocome) 2734 tp->t_flags |= TF_MORETOCOME; 2735 error = tcp_output(tp); 2736 if (error < 0) { 2737 INP_UNLOCK_ASSERT(inp); 2738 SOCK_IO_SEND_UNLOCK(so); 2739 error = -error; 2740 goto out; 2741 } 2742 if (moretocome) 2743 tp->t_flags &= ~TF_MORETOCOME; 2744 } 2745 2746 INP_WUNLOCK(inp); 2747 if (sendmore) 2748 goto sendanother; 2749 SOCK_IO_SEND_UNLOCK(so); 2750 2751 if (error) 2752 goto out; 2753 2754 /* 2755 * If this is a blocking socket and the request has not been 2756 * fully completed, requeue it until the socket is ready 2757 * again. 2758 */ 2759 if (job->aio_sent < job->uaiocb.aio_nbytes && 2760 !(so->so_state & SS_NBIO)) { 2761 SOCKBUF_LOCK(sb); 2762 if (!aio_set_cancel_function(job, t4_aiotx_cancel)) { 2763 SOCKBUF_UNLOCK(sb); 2764 error = ECANCELED; 2765 goto out; 2766 } 2767 TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list); 2768 return; 2769 } 2770 2771 /* 2772 * If the request will not be requeued, drop the queue's 2773 * reference to the job. Any mbufs in flight should still 2774 * hold a reference, but this drops the reference that the 2775 * queue owns while it is waiting to queue mbufs to the 2776 * socket. 2777 */ 2778 aiotx_free_job(job); 2779 counter_u64_add(toep->ofld_txq->tx_aio_jobs, 1); 2780 2781 out: 2782 if (error) { 2783 job->aio_error = (void *)(intptr_t)error; 2784 aiotx_free_job(job); 2785 } 2786 m_freem(m); 2787 SOCKBUF_LOCK(sb); 2788 } 2789 2790 static void 2791 t4_aiotx_task(void *context, int pending) 2792 { 2793 struct toepcb *toep = context; 2794 struct socket *so; 2795 struct kaiocb *job; 2796 struct epoch_tracker et; 2797 2798 so = toep->aiotx_so; 2799 CURVNET_SET(toep->vnet); 2800 NET_EPOCH_ENTER(et); 2801 SOCKBUF_LOCK(&so->so_snd); 2802 while (!TAILQ_EMPTY(&toep->aiotx_jobq) && sowriteable(so)) { 2803 job = TAILQ_FIRST(&toep->aiotx_jobq); 2804 TAILQ_REMOVE(&toep->aiotx_jobq, job, list); 2805 if (!aio_clear_cancel_function(job)) 2806 continue; 2807 2808 t4_aiotx_process_job(toep, so, job); 2809 } 2810 toep->aiotx_so = NULL; 2811 SOCKBUF_UNLOCK(&so->so_snd); 2812 NET_EPOCH_EXIT(et); 2813 2814 free_toepcb(toep); 2815 sorele(so); 2816 CURVNET_RESTORE(); 2817 } 2818 2819 static void 2820 t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep) 2821 { 2822 2823 SOCKBUF_LOCK_ASSERT(&toep->inp->inp_socket->so_snd); 2824 #ifdef VERBOSE_TRACES 2825 CTR3(KTR_CXGBE, "%s: queueing aiotx task for tid %d, active = %s", 2826 __func__, toep->tid, toep->aiotx_so != NULL ? 
"true" : "false"); 2827 #endif 2828 if (toep->aiotx_so != NULL) 2829 return; 2830 soref(so); 2831 toep->aiotx_so = so; 2832 hold_toepcb(toep); 2833 soaio_enqueue(&toep->aiotx_task); 2834 } 2835 2836 static void 2837 t4_aiotx_cancel(struct kaiocb *job) 2838 { 2839 struct socket *so; 2840 struct sockbuf *sb; 2841 struct tcpcb *tp; 2842 struct toepcb *toep; 2843 2844 so = job->fd_file->f_data; 2845 tp = sototcpcb(so); 2846 toep = tp->t_toe; 2847 MPASS(job->uaiocb.aio_lio_opcode == LIO_WRITE); 2848 sb = &so->so_snd; 2849 2850 SOCKBUF_LOCK(sb); 2851 if (!aio_cancel_cleared(job)) 2852 TAILQ_REMOVE(&toep->aiotx_jobq, job, list); 2853 SOCKBUF_UNLOCK(sb); 2854 2855 job->aio_error = (void *)(intptr_t)ECANCELED; 2856 aiotx_free_job(job); 2857 } 2858 2859 int 2860 t4_aio_queue_aiotx(struct socket *so, struct kaiocb *job) 2861 { 2862 struct tcpcb *tp = sototcpcb(so); 2863 struct toepcb *toep = tp->t_toe; 2864 struct adapter *sc = td_adapter(toep->td); 2865 2866 /* This only handles writes. */ 2867 if (job->uaiocb.aio_lio_opcode != LIO_WRITE) 2868 return (EOPNOTSUPP); 2869 2870 if (!sc->tt.tx_zcopy) 2871 return (EOPNOTSUPP); 2872 2873 if (tls_tx_key(toep)) 2874 return (EOPNOTSUPP); 2875 2876 SOCKBUF_LOCK(&so->so_snd); 2877 #ifdef VERBOSE_TRACES 2878 CTR3(KTR_CXGBE, "%s: queueing %p for tid %u", __func__, job, toep->tid); 2879 #endif 2880 if (!aio_set_cancel_function(job, t4_aiotx_cancel)) 2881 panic("new job was cancelled"); 2882 refcount_init(&job->aio_refs, 1); 2883 TAILQ_INSERT_TAIL(&toep->aiotx_jobq, job, list); 2884 if (sowriteable(so)) 2885 t4_aiotx_queue_toep(so, toep); 2886 SOCKBUF_UNLOCK(&so->so_snd); 2887 return (0); 2888 } 2889 2890 void 2891 aiotx_init_toep(struct toepcb *toep) 2892 { 2893 2894 TAILQ_INIT(&toep->aiotx_jobq); 2895 TASK_INIT(&toep->aiotx_task, 0, t4_aiotx_task, toep); 2896 } 2897 #endif 2898