/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2012, 2015 Chelsio Communications, Inc.
 * All rights reserved.
 * Written by: Navdeep Parhar <np@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_kern_tls.h"
#include "opt_ratelimit.h"

#ifdef TCP_OFFLOAD
#include <sys/param.h>
#include <sys/aio.h>
#include <sys/file.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/module.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sglist.h>
#include <sys/taskqueue.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#define TCPSTATES
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_var.h>
#include <netinet/toecore.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>

#include <dev/iscsi/iscsi_proto.h>

#include "common/common.h"
#include "common/t4_msg.h"
#include "common/t4_regs.h"
#include "common/t4_tcb.h"
#include "tom/t4_tom_l2t.h"
#include "tom/t4_tom.h"

static void t4_aiotx_cancel(struct kaiocb *job);
static void t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep);

void
send_flowc_wr(struct toepcb *toep, struct tcpcb *tp)
{
        struct wrqe *wr;
        struct fw_flowc_wr *flowc;
        unsigned int nparams, flowclen, paramidx;
        struct vi_info *vi = toep->vi;
        struct port_info *pi = vi->pi;
        struct adapter *sc = pi->adapter;
        unsigned int pfvf = sc->pf << S_FW_VIID_PFN;
        struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];

        KASSERT(!(toep->flags & TPF_FLOWC_WR_SENT),
            ("%s: flowc for tid %u sent already", __func__, toep->tid));

        if (tp != NULL)
                nparams = 8;
        else
                nparams = 6;
        if (toep->params.tc_idx != -1) {
                MPASS(toep->params.tc_idx >= 0 &&
                    toep->params.tc_idx < sc->params.nsched_cls);
                nparams++;
        }

        flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);
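        /*
         * Note on sizing: each FLOWC parameter is an 8-byte mnemonic/value
         * pair (struct fw_flowc_mnemval), so the WR body grows by 8 bytes
         * per parameter beyond the fixed header.  For example, with
         * tp != NULL and a scheduling class bound (nparams == 9) the
         * mnemvals add 72 bytes.  The allocation below pads the WR to a
         * 16-byte multiple and the WR consumes howmany(flowclen, 16) tx
         * credits, which is what txsd->tx_credits records.
         */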

        wr = alloc_wrqe(roundup2(flowclen, 16), &toep->ofld_txq->wrq);
        if (wr == NULL) {
                /* XXX */
                panic("%s: allocation failure.", __func__);
        }
        flowc = wrtod(wr);
        memset(flowc, 0, wr->wr_len);

        flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
            V_FW_FLOWC_WR_NPARAMS(nparams));
        flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
            V_FW_WR_FLOWID(toep->tid));

#define FLOWC_PARAM(__m, __v) \
        do { \
                flowc->mnemval[paramidx].mnemonic = FW_FLOWC_MNEM_##__m; \
                flowc->mnemval[paramidx].val = htobe32(__v); \
                paramidx++; \
        } while (0)

        paramidx = 0;

        FLOWC_PARAM(PFNVFN, pfvf);
        /* Firmware expects hw port and will translate to channel itself. */
        FLOWC_PARAM(CH, pi->hw_port);
        FLOWC_PARAM(PORT, pi->hw_port);
        FLOWC_PARAM(IQID, toep->ofld_rxq->iq.abs_id);
        FLOWC_PARAM(SNDBUF, toep->params.sndbuf);
        if (tp) {
                FLOWC_PARAM(MSS, toep->params.emss);
                FLOWC_PARAM(SNDNXT, tp->snd_nxt);
                FLOWC_PARAM(RCVNXT, tp->rcv_nxt);
        } else
                FLOWC_PARAM(MSS, 512);
        CTR6(KTR_CXGBE,
            "%s: tid %u, mss %u, sndbuf %u, snd_nxt 0x%x, rcv_nxt 0x%x",
            __func__, toep->tid, toep->params.emss, toep->params.sndbuf,
            tp ? tp->snd_nxt : 0, tp ? tp->rcv_nxt : 0);

        if (toep->params.tc_idx != -1)
                FLOWC_PARAM(SCHEDCLASS, toep->params.tc_idx);
#undef FLOWC_PARAM

        KASSERT(paramidx == nparams, ("nparams mismatch"));

        KASSERT(howmany(flowclen, 16) <= MAX_OFLD_TX_SDESC_CREDITS,
            ("%s: tx_credits %u too large", __func__, howmany(flowclen, 16)));
        txsd->tx_credits = howmany(flowclen, 16);
        txsd->plen = 0;
        KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0,
            ("%s: not enough credits (%d)", __func__, toep->tx_credits));
        toep->tx_credits -= txsd->tx_credits;
        if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
                toep->txsd_pidx = 0;
        toep->txsd_avail--;

        toep->flags |= TPF_FLOWC_WR_SENT;
        t4_wrq_tx(sc, wr);
}

#ifdef RATELIMIT
/*
 * Input is Bytes/second (so_max_pacing_rate), chip counts in Kilobits/second.
 */
static int
update_tx_rate_limit(struct adapter *sc, struct toepcb *toep, u_int Bps)
{
        int tc_idx, rc;
        const u_int kbps = (u_int) (uint64_t)Bps * 8ULL / 1000;
        const int port_id = toep->vi->pi->port_id;

        CTR3(KTR_CXGBE, "%s: tid %u, rate %uKbps", __func__, toep->tid, kbps);

        if (kbps == 0) {
                /* unbind */
                tc_idx = -1;
        } else {
                rc = t4_reserve_cl_rl_kbps(sc, port_id, kbps, &tc_idx);
                if (rc != 0)
                        return (rc);
                MPASS(tc_idx >= 0 && tc_idx < sc->params.nsched_cls);
        }

        if (toep->params.tc_idx != tc_idx) {
                struct wrqe *wr;
                struct fw_flowc_wr *flowc;
                int nparams = 1, flowclen, flowclen16;
                struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];

                flowclen = sizeof(*flowc) + nparams * sizeof(struct
                    fw_flowc_mnemval);
                flowclen16 = howmany(flowclen, 16);
                if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0 ||
                    (wr = alloc_wrqe(roundup2(flowclen, 16),
                    &toep->ofld_txq->wrq)) == NULL) {
                        if (tc_idx >= 0)
                                t4_release_cl_rl(sc, port_id, tc_idx);
                        return (ENOMEM);
                }

                flowc = wrtod(wr);
                memset(flowc, 0, wr->wr_len);

                flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
                    V_FW_FLOWC_WR_NPARAMS(nparams));
                flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) |
                    V_FW_WR_FLOWID(toep->tid));

                flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS;
                if (tc_idx == -1)
                        flowc->mnemval[0].val = htobe32(0xff);
                else
                        flowc->mnemval[0].val = htobe32(tc_idx);

                KASSERT(flowclen16 <= MAX_OFLD_TX_SDESC_CREDITS,
                    ("%s: tx_credits %u too large", __func__, flowclen16));
                txsd->tx_credits = flowclen16;
                txsd->plen = 0;
                toep->tx_credits -= txsd->tx_credits;
                if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
                        toep->txsd_pidx = 0;
                toep->txsd_avail--;
                t4_wrq_tx(sc, wr);
        }

        if (toep->params.tc_idx >= 0)
                t4_release_cl_rl(sc, port_id, toep->params.tc_idx);
        toep->params.tc_idx = tc_idx;

        return (0);
}
#endif

void
send_reset(struct adapter *sc, struct toepcb *toep, uint32_t snd_nxt)
{
        struct wrqe *wr;
        struct cpl_abort_req *req;
        int tid = toep->tid;
        struct inpcb *inp = toep->inp;
        struct tcpcb *tp = intotcpcb(inp);      /* don't use if INP_DROPPED */

        INP_WLOCK_ASSERT(inp);

        CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x%s",
            __func__, toep->tid,
            inp->inp_flags & INP_DROPPED ? "inp dropped" :
            tcpstates[tp->t_state],
            toep->flags, inp->inp_flags,
            toep->flags & TPF_ABORT_SHUTDOWN ?
            " (abort already in progress)" : "");

        if (toep->flags & TPF_ABORT_SHUTDOWN)
                return;         /* abort already in progress */

        toep->flags |= TPF_ABORT_SHUTDOWN;

        KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
            ("%s: flowc_wr not sent for tid %d.", __func__, tid));

        wr = alloc_wrqe(sizeof(*req), &toep->ofld_txq->wrq);
        if (wr == NULL) {
                /* XXX */
                panic("%s: allocation failure.", __func__);
        }
        req = wrtod(wr);

        INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid);
        if (inp->inp_flags & INP_DROPPED)
                req->rsvd0 = htobe32(snd_nxt);
        else
                req->rsvd0 = htobe32(tp->snd_nxt);
        req->rsvd1 = !(toep->flags & TPF_TX_DATA_SENT);
        req->cmd = CPL_ABORT_SEND_RST;

        /*
         * XXX: What's the correct way to tell that the inp hasn't been detached
         * from its socket? Should I even be flushing the snd buffer here?
285 */ 286 if ((inp->inp_flags & INP_DROPPED) == 0) { 287 struct socket *so = inp->inp_socket; 288 289 if (so != NULL) /* because I'm not sure. See comment above */ 290 sbflush(&so->so_snd); 291 } 292 293 t4_l2t_send(sc, wr, toep->l2te); 294 } 295 296 /* 297 * Called when a connection is established to translate the TCP options 298 * reported by HW to FreeBSD's native format. 299 */ 300 static void 301 assign_rxopt(struct tcpcb *tp, uint16_t opt) 302 { 303 struct toepcb *toep = tp->t_toe; 304 struct inpcb *inp = tptoinpcb(tp); 305 struct adapter *sc = td_adapter(toep->td); 306 307 INP_LOCK_ASSERT(inp); 308 309 toep->params.mtu_idx = G_TCPOPT_MSS(opt); 310 tp->t_maxseg = sc->params.mtus[toep->params.mtu_idx]; 311 if (inp->inp_inc.inc_flags & INC_ISIPV6) 312 tp->t_maxseg -= sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 313 else 314 tp->t_maxseg -= sizeof(struct ip) + sizeof(struct tcphdr); 315 316 toep->params.emss = tp->t_maxseg; 317 if (G_TCPOPT_TSTAMP(opt)) { 318 toep->params.tstamp = 1; 319 toep->params.emss -= TCPOLEN_TSTAMP_APPA; 320 tp->t_flags |= TF_RCVD_TSTMP; /* timestamps ok */ 321 tp->ts_recent = 0; /* hmmm */ 322 tp->ts_recent_age = tcp_ts_getticks(); 323 } else 324 toep->params.tstamp = 0; 325 326 if (G_TCPOPT_SACK(opt)) { 327 toep->params.sack = 1; 328 tp->t_flags |= TF_SACK_PERMIT; /* should already be set */ 329 } else { 330 toep->params.sack = 0; 331 tp->t_flags &= ~TF_SACK_PERMIT; /* sack disallowed by peer */ 332 } 333 334 if (G_TCPOPT_WSCALE_OK(opt)) 335 tp->t_flags |= TF_RCVD_SCALE; 336 337 /* Doing window scaling? */ 338 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 339 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 340 tp->rcv_scale = tp->request_r_scale; 341 tp->snd_scale = G_TCPOPT_SND_WSCALE(opt); 342 } else 343 toep->params.wscale = 0; 344 345 CTR6(KTR_CXGBE, 346 "assign_rxopt: tid %d, mtu_idx %u, emss %u, ts %u, sack %u, wscale %u", 347 toep->tid, toep->params.mtu_idx, toep->params.emss, 348 toep->params.tstamp, toep->params.sack, toep->params.wscale); 349 } 350 351 /* 352 * Completes some final bits of initialization for just established connections 353 * and changes their state to TCPS_ESTABLISHED. 354 * 355 * The ISNs are from the exchange of SYNs. 
356 */ 357 void 358 make_established(struct toepcb *toep, uint32_t iss, uint32_t irs, uint16_t opt) 359 { 360 struct inpcb *inp = toep->inp; 361 struct socket *so = inp->inp_socket; 362 struct tcpcb *tp = intotcpcb(inp); 363 uint16_t tcpopt = be16toh(opt); 364 365 INP_WLOCK_ASSERT(inp); 366 KASSERT(tp->t_state == TCPS_SYN_SENT || 367 tp->t_state == TCPS_SYN_RECEIVED, 368 ("%s: TCP state %s", __func__, tcpstates[tp->t_state])); 369 370 CTR6(KTR_CXGBE, "%s: tid %d, so %p, inp %p, tp %p, toep %p", 371 __func__, toep->tid, so, inp, tp, toep); 372 373 tcp_state_change(tp, TCPS_ESTABLISHED); 374 tp->t_starttime = ticks; 375 TCPSTAT_INC(tcps_connects); 376 377 tp->irs = irs; 378 tcp_rcvseqinit(tp); 379 tp->rcv_wnd = (u_int)toep->params.opt0_bufsize << 10; 380 tp->rcv_adv += tp->rcv_wnd; 381 tp->last_ack_sent = tp->rcv_nxt; 382 383 tp->iss = iss; 384 tcp_sendseqinit(tp); 385 tp->snd_una = iss + 1; 386 tp->snd_nxt = iss + 1; 387 tp->snd_max = iss + 1; 388 389 assign_rxopt(tp, tcpopt); 390 send_flowc_wr(toep, tp); 391 392 soisconnected(so); 393 } 394 395 int 396 send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits) 397 { 398 struct wrqe *wr; 399 struct cpl_rx_data_ack *req; 400 uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); 401 402 KASSERT(credits >= 0, ("%s: %d credits", __func__, credits)); 403 404 wr = alloc_wrqe(sizeof(*req), toep->ctrlq); 405 if (wr == NULL) 406 return (0); 407 req = wrtod(wr); 408 409 INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid); 410 req->credit_dack = htobe32(dack | V_RX_CREDITS(credits)); 411 412 t4_wrq_tx(sc, wr); 413 return (credits); 414 } 415 416 void 417 t4_rcvd_locked(struct toedev *tod, struct tcpcb *tp) 418 { 419 struct adapter *sc = tod->tod_softc; 420 struct inpcb *inp = tptoinpcb(tp); 421 struct socket *so = inp->inp_socket; 422 struct sockbuf *sb = &so->so_rcv; 423 struct toepcb *toep = tp->t_toe; 424 int rx_credits; 425 426 INP_WLOCK_ASSERT(inp); 427 SOCKBUF_LOCK_ASSERT(sb); 428 429 rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0; 430 if (rx_credits > 0 && 431 (tp->rcv_wnd <= 32 * 1024 || rx_credits >= 64 * 1024 || 432 (rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) || 433 sbused(sb) + tp->rcv_wnd < sb->sb_lowat)) { 434 rx_credits = send_rx_credits(sc, toep, rx_credits); 435 tp->rcv_wnd += rx_credits; 436 tp->rcv_adv += rx_credits; 437 } 438 } 439 440 void 441 t4_rcvd(struct toedev *tod, struct tcpcb *tp) 442 { 443 struct inpcb *inp = tptoinpcb(tp); 444 struct socket *so = inp->inp_socket; 445 struct sockbuf *sb = &so->so_rcv; 446 447 SOCKBUF_LOCK(sb); 448 t4_rcvd_locked(tod, tp); 449 SOCKBUF_UNLOCK(sb); 450 } 451 452 /* 453 * Close a connection by sending a CPL_CLOSE_CON_REQ message. 454 */ 455 int 456 t4_close_conn(struct adapter *sc, struct toepcb *toep) 457 { 458 struct wrqe *wr; 459 struct cpl_close_con_req *req; 460 unsigned int tid = toep->tid; 461 462 CTR3(KTR_CXGBE, "%s: tid %u%s", __func__, toep->tid, 463 toep->flags & TPF_FIN_SENT ? 
", IGNORED" : ""); 464 465 if (toep->flags & TPF_FIN_SENT) 466 return (0); 467 468 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 469 ("%s: flowc_wr not sent for tid %u.", __func__, tid)); 470 471 wr = alloc_wrqe(sizeof(*req), &toep->ofld_txq->wrq); 472 if (wr == NULL) { 473 /* XXX */ 474 panic("%s: allocation failure.", __func__); 475 } 476 req = wrtod(wr); 477 478 req->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) | 479 V_FW_WR_IMMDLEN(sizeof(*req) - sizeof(req->wr))); 480 req->wr.wr_mid = htonl(V_FW_WR_LEN16(howmany(sizeof(*req), 16)) | 481 V_FW_WR_FLOWID(tid)); 482 req->wr.wr_lo = cpu_to_be64(0); 483 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid)); 484 req->rsvd = 0; 485 486 toep->flags |= TPF_FIN_SENT; 487 toep->flags &= ~TPF_SEND_FIN; 488 t4_l2t_send(sc, wr, toep->l2te); 489 490 return (0); 491 } 492 493 #define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16) 494 #define MIN_OFLD_TX_CREDITS (howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16)) 495 #define MIN_ISO_TX_CREDITS (howmany(sizeof(struct cpl_tx_data_iso), 16)) 496 #define MIN_TX_CREDITS(iso) \ 497 (MIN_OFLD_TX_CREDITS + ((iso) ? MIN_ISO_TX_CREDITS : 0)) 498 499 _Static_assert(MAX_OFLD_TX_CREDITS <= MAX_OFLD_TX_SDESC_CREDITS, 500 "MAX_OFLD_TX_SDESC_CREDITS too small"); 501 502 /* Maximum amount of immediate data we could stuff in a WR */ 503 static inline int 504 max_imm_payload(int tx_credits, int iso) 505 { 506 const int iso_cpl_size = iso ? sizeof(struct cpl_tx_data_iso) : 0; 507 const int n = 1; /* Use no more than one desc for imm. data WR */ 508 509 KASSERT(tx_credits >= 0 && 510 tx_credits <= MAX_OFLD_TX_CREDITS, 511 ("%s: %d credits", __func__, tx_credits)); 512 513 if (tx_credits < MIN_TX_CREDITS(iso)) 514 return (0); 515 516 if (tx_credits >= (n * EQ_ESIZE) / 16) 517 return ((n * EQ_ESIZE) - sizeof(struct fw_ofld_tx_data_wr) - 518 iso_cpl_size); 519 else 520 return (tx_credits * 16 - sizeof(struct fw_ofld_tx_data_wr) - 521 iso_cpl_size); 522 } 523 524 /* Maximum number of SGL entries we could stuff in a WR */ 525 static inline int 526 max_dsgl_nsegs(int tx_credits, int iso) 527 { 528 int nseg = 1; /* ulptx_sgl has room for 1, rest ulp_tx_sge_pair */ 529 int sge_pair_credits = tx_credits - MIN_TX_CREDITS(iso); 530 531 KASSERT(tx_credits >= 0 && 532 tx_credits <= MAX_OFLD_TX_CREDITS, 533 ("%s: %d credits", __func__, tx_credits)); 534 535 if (tx_credits < MIN_TX_CREDITS(iso)) 536 return (0); 537 538 nseg += 2 * (sge_pair_credits * 16 / 24); 539 if ((sge_pair_credits * 16) % 24 == 16) 540 nseg++; 541 542 return (nseg); 543 } 544 545 static inline void 546 write_tx_wr(void *dst, struct toepcb *toep, int fw_wr_opcode, 547 unsigned int immdlen, unsigned int plen, uint8_t credits, int shove, 548 int ulp_submode) 549 { 550 struct fw_ofld_tx_data_wr *txwr = dst; 551 552 txwr->op_to_immdlen = htobe32(V_WR_OP(fw_wr_opcode) | 553 V_FW_WR_IMMDLEN(immdlen)); 554 txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) | 555 V_FW_WR_LEN16(credits)); 556 txwr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(ulp_mode(toep)) | 557 V_TX_ULP_SUBMODE(ulp_submode) | V_TX_URG(0) | V_TX_SHOVE(shove)); 558 txwr->plen = htobe32(plen); 559 560 if (toep->params.tx_align > 0) { 561 if (plen < 2 * toep->params.emss) 562 txwr->lsodisable_to_flags |= 563 htobe32(F_FW_OFLD_TX_DATA_WR_LSODISABLE); 564 else 565 txwr->lsodisable_to_flags |= 566 htobe32(F_FW_OFLD_TX_DATA_WR_ALIGNPLD | 567 (toep->params.nagle == 0 ? 0 : 568 F_FW_OFLD_TX_DATA_WR_ALIGNPLDSHOVE)); 569 } 570 } 571 572 /* 573 * Generate a DSGL from a starting mbuf. 
The total number of segments and the 574 * maximum segments in any one mbuf are provided. 575 */ 576 static void 577 write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n) 578 { 579 struct mbuf *m; 580 struct ulptx_sgl *usgl = dst; 581 int i, j, rc; 582 struct sglist sg; 583 struct sglist_seg segs[n]; 584 585 KASSERT(nsegs > 0, ("%s: nsegs 0", __func__)); 586 587 sglist_init(&sg, n, segs); 588 usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) | 589 V_ULPTX_NSGE(nsegs)); 590 591 i = -1; 592 for (m = start; m != stop; m = m->m_next) { 593 if (m->m_flags & M_EXTPG) 594 rc = sglist_append_mbuf_epg(&sg, m, 595 mtod(m, vm_offset_t), m->m_len); 596 else 597 rc = sglist_append(&sg, mtod(m, void *), m->m_len); 598 if (__predict_false(rc != 0)) 599 panic("%s: sglist_append %d", __func__, rc); 600 601 for (j = 0; j < sg.sg_nseg; i++, j++) { 602 if (i < 0) { 603 usgl->len0 = htobe32(segs[j].ss_len); 604 usgl->addr0 = htobe64(segs[j].ss_paddr); 605 } else { 606 usgl->sge[i / 2].len[i & 1] = 607 htobe32(segs[j].ss_len); 608 usgl->sge[i / 2].addr[i & 1] = 609 htobe64(segs[j].ss_paddr); 610 } 611 #ifdef INVARIANTS 612 nsegs--; 613 #endif 614 } 615 sglist_reset(&sg); 616 } 617 if (i & 1) 618 usgl->sge[i / 2].len[1] = htobe32(0); 619 KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, stop %p", 620 __func__, nsegs, start, stop)); 621 } 622 623 bool 624 t4_push_raw_wr(struct adapter *sc, struct toepcb *toep, struct mbuf *m) 625 { 626 #ifdef INVARIANTS 627 struct inpcb *inp = toep->inp; 628 #endif 629 struct wrqe *wr; 630 struct ofld_tx_sdesc *txsd; 631 u_int credits, plen; 632 633 INP_WLOCK_ASSERT(inp); 634 MPASS(mbuf_raw_wr(m)); 635 plen = m->m_pkthdr.len; 636 credits = howmany(plen, 16); 637 if (credits > toep->tx_credits) 638 return (false); 639 640 wr = alloc_wrqe(roundup2(plen, 16), &toep->ofld_txq->wrq); 641 if (wr == NULL) 642 return (false); 643 644 m_copydata(m, 0, plen, wrtod(wr)); 645 m_freem(m); 646 647 toep->tx_credits -= credits; 648 if (toep->tx_credits < MIN_OFLD_TX_CREDITS) 649 toep->flags |= TPF_TX_SUSPENDED; 650 651 KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); 652 KASSERT(credits <= MAX_OFLD_TX_SDESC_CREDITS, 653 ("%s: tx_credits %u too large", __func__, credits)); 654 txsd = &toep->txsd[toep->txsd_pidx]; 655 txsd->plen = 0; 656 txsd->tx_credits = credits; 657 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) 658 toep->txsd_pidx = 0; 659 toep->txsd_avail--; 660 661 t4_wrq_tx(sc, wr); 662 return (true); 663 } 664 665 /* 666 * Max number of SGL entries an offload tx work request can have. This is 41 667 * (1 + 40) for a full 512B work request. 668 * fw_ofld_tx_data_wr(16B) + ulptx_sgl(16B, 1) + ulptx_sge_pair(480B, 40) 669 */ 670 #define OFLD_SGL_LEN (41) 671 672 /* 673 * Send data and/or a FIN to the peer. 674 * 675 * The socket's so_snd buffer consists of a stream of data starting with sb_mb 676 * and linked together with m_next. sb_sndptr, if set, is the last mbuf that 677 * was transmitted. 678 * 679 * drop indicates the number of bytes that should be dropped from the head of 680 * the send buffer. It is an optimization that lets do_fw4_ack avoid creating 681 * contention on the send buffer lock (before this change it used to do 682 * sowwakeup and then t4_push_frames right after that when recovering from tx 683 * stalls). When drop is set this function MUST drop the bytes and wake up any 684 * writers. 
685 */ 686 static void 687 t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop) 688 { 689 struct mbuf *sndptr, *m, *sb_sndptr; 690 struct fw_ofld_tx_data_wr *txwr; 691 struct wrqe *wr; 692 u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf; 693 struct inpcb *inp = toep->inp; 694 struct tcpcb *tp = intotcpcb(inp); 695 struct socket *so = inp->inp_socket; 696 struct sockbuf *sb = &so->so_snd; 697 struct mbufq *pduq = &toep->ulp_pduq; 698 int tx_credits, shove, compl, sowwakeup; 699 struct ofld_tx_sdesc *txsd; 700 bool nomap_mbuf_seen; 701 702 INP_WLOCK_ASSERT(inp); 703 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 704 ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid)); 705 706 KASSERT(ulp_mode(toep) == ULP_MODE_NONE || 707 ulp_mode(toep) == ULP_MODE_TCPDDP || 708 ulp_mode(toep) == ULP_MODE_TLS || 709 ulp_mode(toep) == ULP_MODE_RDMA, 710 ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep)); 711 712 #ifdef VERBOSE_TRACES 713 CTR5(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d", 714 __func__, toep->tid, toep->flags, tp->t_flags, drop); 715 #endif 716 if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) 717 return; 718 719 #ifdef RATELIMIT 720 if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) && 721 (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) { 722 inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; 723 } 724 #endif 725 726 /* 727 * This function doesn't resume by itself. Someone else must clear the 728 * flag and call this function. 729 */ 730 if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) { 731 KASSERT(drop == 0, 732 ("%s: drop (%d) != 0 but tx is suspended", __func__, drop)); 733 return; 734 } 735 736 txsd = &toep->txsd[toep->txsd_pidx]; 737 do { 738 tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS); 739 max_imm = max_imm_payload(tx_credits, 0); 740 max_nsegs = max_dsgl_nsegs(tx_credits, 0); 741 742 if (__predict_false((sndptr = mbufq_first(pduq)) != NULL)) { 743 if (!t4_push_raw_wr(sc, toep, sndptr)) { 744 toep->flags |= TPF_TX_SUSPENDED; 745 return; 746 } 747 748 m = mbufq_dequeue(pduq); 749 MPASS(m == sndptr); 750 751 txsd = &toep->txsd[toep->txsd_pidx]; 752 continue; 753 } 754 755 SOCKBUF_LOCK(sb); 756 sowwakeup = drop; 757 if (drop) { 758 sbdrop_locked(sb, drop); 759 drop = 0; 760 } 761 sb_sndptr = sb->sb_sndptr; 762 sndptr = sb_sndptr ? 
sb_sndptr->m_next : sb->sb_mb; 763 plen = 0; 764 nsegs = 0; 765 max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */ 766 nomap_mbuf_seen = false; 767 for (m = sndptr; m != NULL; m = m->m_next) { 768 int n; 769 770 if ((m->m_flags & M_NOTREADY) != 0) 771 break; 772 if (plen + m->m_len > MAX_OFLD_TX_SDESC_PLEN) 773 break; 774 if (m->m_flags & M_EXTPG) { 775 #ifdef KERN_TLS 776 if (m->m_epg_tls != NULL) { 777 toep->flags |= TPF_KTLS; 778 if (plen == 0) { 779 SOCKBUF_UNLOCK(sb); 780 t4_push_ktls(sc, toep, 0); 781 return; 782 } 783 break; 784 } 785 #endif 786 n = sglist_count_mbuf_epg(m, 787 mtod(m, vm_offset_t), m->m_len); 788 } else 789 n = sglist_count(mtod(m, void *), m->m_len); 790 791 nsegs += n; 792 plen += m->m_len; 793 794 /* This mbuf sent us _over_ the nsegs limit, back out */ 795 if (plen > max_imm && nsegs > max_nsegs) { 796 nsegs -= n; 797 plen -= m->m_len; 798 if (plen == 0) { 799 /* Too few credits */ 800 toep->flags |= TPF_TX_SUSPENDED; 801 if (sowwakeup) { 802 if (!TAILQ_EMPTY( 803 &toep->aiotx_jobq)) 804 t4_aiotx_queue_toep(so, 805 toep); 806 sowwakeup_locked(so); 807 } else 808 SOCKBUF_UNLOCK(sb); 809 SOCKBUF_UNLOCK_ASSERT(sb); 810 return; 811 } 812 break; 813 } 814 815 if (m->m_flags & M_EXTPG) 816 nomap_mbuf_seen = true; 817 if (max_nsegs_1mbuf < n) 818 max_nsegs_1mbuf = n; 819 sb_sndptr = m; /* new sb->sb_sndptr if all goes well */ 820 821 /* This mbuf put us right at the max_nsegs limit */ 822 if (plen > max_imm && nsegs == max_nsegs) { 823 m = m->m_next; 824 break; 825 } 826 } 827 828 if (sbused(sb) > sb->sb_hiwat * 5 / 8 && 829 toep->plen_nocompl + plen >= sb->sb_hiwat / 4) 830 compl = 1; 831 else 832 compl = 0; 833 834 if (sb->sb_flags & SB_AUTOSIZE && 835 V_tcp_do_autosndbuf && 836 sb->sb_hiwat < V_tcp_autosndbuf_max && 837 sbused(sb) >= sb->sb_hiwat * 7 / 8) { 838 int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc, 839 V_tcp_autosndbuf_max); 840 841 if (!sbreserve_locked(so, SO_SND, newsize, NULL)) 842 sb->sb_flags &= ~SB_AUTOSIZE; 843 else 844 sowwakeup = 1; /* room available */ 845 } 846 if (sowwakeup) { 847 if (!TAILQ_EMPTY(&toep->aiotx_jobq)) 848 t4_aiotx_queue_toep(so, toep); 849 sowwakeup_locked(so); 850 } else 851 SOCKBUF_UNLOCK(sb); 852 SOCKBUF_UNLOCK_ASSERT(sb); 853 854 /* nothing to send */ 855 if (plen == 0) { 856 KASSERT(m == NULL || (m->m_flags & M_NOTREADY) != 0, 857 ("%s: nothing to send, but m != NULL is ready", 858 __func__)); 859 break; 860 } 861 862 if (__predict_false(toep->flags & TPF_FIN_SENT)) 863 panic("%s: excess tx.", __func__); 864 865 shove = m == NULL && !(tp->t_flags & TF_MORETOCOME); 866 if (plen <= max_imm && !nomap_mbuf_seen) { 867 868 /* Immediate data tx */ 869 870 wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16), 871 &toep->ofld_txq->wrq); 872 if (wr == NULL) { 873 /* XXX: how will we recover from this? */ 874 toep->flags |= TPF_TX_SUSPENDED; 875 return; 876 } 877 txwr = wrtod(wr); 878 credits = howmany(wr->wr_len, 16); 879 write_tx_wr(txwr, toep, FW_OFLD_TX_DATA_WR, plen, plen, 880 credits, shove, 0); 881 m_copydata(sndptr, 0, plen, (void *)(txwr + 1)); 882 nsegs = 0; 883 } else { 884 int wr_len; 885 886 /* DSGL tx */ 887 888 wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) + 889 ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8; 890 wr = alloc_wrqe(roundup2(wr_len, 16), 891 &toep->ofld_txq->wrq); 892 if (wr == NULL) { 893 /* XXX: how will we recover from this? 
*/ 894 toep->flags |= TPF_TX_SUSPENDED; 895 return; 896 } 897 txwr = wrtod(wr); 898 credits = howmany(wr_len, 16); 899 write_tx_wr(txwr, toep, FW_OFLD_TX_DATA_WR, 0, plen, 900 credits, shove, 0); 901 write_tx_sgl(txwr + 1, sndptr, m, nsegs, 902 max_nsegs_1mbuf); 903 if (wr_len & 0xf) { 904 uint64_t *pad = (uint64_t *) 905 ((uintptr_t)txwr + wr_len); 906 *pad = 0; 907 } 908 } 909 910 KASSERT(toep->tx_credits >= credits, 911 ("%s: not enough credits", __func__)); 912 913 toep->tx_credits -= credits; 914 toep->tx_nocompl += credits; 915 toep->plen_nocompl += plen; 916 if (toep->tx_credits <= toep->tx_total * 3 / 8 && 917 toep->tx_nocompl >= toep->tx_total / 4) 918 compl = 1; 919 920 if (compl || ulp_mode(toep) == ULP_MODE_RDMA) { 921 txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL); 922 toep->tx_nocompl = 0; 923 toep->plen_nocompl = 0; 924 } 925 926 tp->snd_nxt += plen; 927 tp->snd_max += plen; 928 929 SOCKBUF_LOCK(sb); 930 KASSERT(sb_sndptr, ("%s: sb_sndptr is NULL", __func__)); 931 sb->sb_sndptr = sb_sndptr; 932 SOCKBUF_UNLOCK(sb); 933 934 toep->flags |= TPF_TX_DATA_SENT; 935 if (toep->tx_credits < MIN_OFLD_TX_CREDITS) 936 toep->flags |= TPF_TX_SUSPENDED; 937 938 KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); 939 KASSERT(plen <= MAX_OFLD_TX_SDESC_PLEN, 940 ("%s: plen %u too large", __func__, plen)); 941 txsd->plen = plen; 942 txsd->tx_credits = credits; 943 txsd++; 944 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) { 945 toep->txsd_pidx = 0; 946 txsd = &toep->txsd[0]; 947 } 948 toep->txsd_avail--; 949 950 t4_l2t_send(sc, wr, toep->l2te); 951 } while (m != NULL && (m->m_flags & M_NOTREADY) == 0); 952 953 /* Send a FIN if requested, but only if there's no more data to send */ 954 if (m == NULL && toep->flags & TPF_SEND_FIN) 955 t4_close_conn(sc, toep); 956 } 957 958 static inline void 959 rqdrop_locked(struct mbufq *q, int plen) 960 { 961 struct mbuf *m; 962 963 while (plen > 0) { 964 m = mbufq_dequeue(q); 965 966 /* Too many credits. */ 967 MPASS(m != NULL); 968 M_ASSERTPKTHDR(m); 969 970 /* Partial credits. */ 971 MPASS(plen >= m->m_pkthdr.len); 972 973 plen -= m->m_pkthdr.len; 974 m_freem(m); 975 } 976 } 977 978 /* 979 * Not a bit in the TCB, but is a bit in the ulp_submode field of the 980 * CPL_TX_DATA flags field in FW_ISCSI_TX_DATA_WR. 981 */ 982 #define ULP_ISO G_TX_ULP_SUBMODE(F_FW_ISCSI_TX_DATA_WR_ULPSUBMODE_ISO) 983 984 static void 985 write_tx_data_iso(void *dst, u_int ulp_submode, uint8_t flags, uint16_t mss, 986 int len, int npdu) 987 { 988 struct cpl_tx_data_iso *cpl; 989 unsigned int burst_size; 990 unsigned int last; 991 992 /* 993 * The firmware will set the 'F' bit on the last PDU when 994 * either condition is true: 995 * 996 * - this large PDU is marked as the "last" slice 997 * 998 * - the amount of data payload bytes equals the burst_size 999 * 1000 * The strategy used here is to always set the burst_size 1001 * artificially high (len includes the size of the template 1002 * BHS) and only set the "last" flag if the original PDU had 1003 * 'F' set. 
1004 */ 1005 burst_size = len; 1006 last = !!(flags & CXGBE_ISO_F); 1007 1008 cpl = (struct cpl_tx_data_iso *)dst; 1009 cpl->op_to_scsi = htonl(V_CPL_TX_DATA_ISO_OP(CPL_TX_DATA_ISO) | 1010 V_CPL_TX_DATA_ISO_FIRST(1) | V_CPL_TX_DATA_ISO_LAST(last) | 1011 V_CPL_TX_DATA_ISO_CPLHDRLEN(0) | 1012 V_CPL_TX_DATA_ISO_HDRCRC(!!(ulp_submode & ULP_CRC_HEADER)) | 1013 V_CPL_TX_DATA_ISO_PLDCRC(!!(ulp_submode & ULP_CRC_DATA)) | 1014 V_CPL_TX_DATA_ISO_IMMEDIATE(0) | 1015 V_CPL_TX_DATA_ISO_SCSI(CXGBE_ISO_TYPE(flags))); 1016 1017 cpl->ahs_len = 0; 1018 cpl->mpdu = htons(DIV_ROUND_UP(mss, 4)); 1019 cpl->burst_size = htonl(DIV_ROUND_UP(burst_size, 4)); 1020 cpl->len = htonl(len); 1021 cpl->reserved2_seglen_offset = htonl(0); 1022 cpl->datasn_offset = htonl(0); 1023 cpl->buffer_offset = htonl(0); 1024 cpl->reserved3 = 0; 1025 } 1026 1027 static struct wrqe * 1028 write_iscsi_mbuf_wr(struct toepcb *toep, struct mbuf *sndptr) 1029 { 1030 struct mbuf *m; 1031 struct fw_ofld_tx_data_wr *txwr; 1032 struct cpl_tx_data_iso *cpl_iso; 1033 void *p; 1034 struct wrqe *wr; 1035 u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf; 1036 u_int adjusted_plen, imm_data, ulp_submode; 1037 struct inpcb *inp = toep->inp; 1038 struct tcpcb *tp = intotcpcb(inp); 1039 int tx_credits, shove, npdu, wr_len; 1040 uint16_t iso_mss; 1041 static const u_int ulp_extra_len[] = {0, 4, 4, 8}; 1042 bool iso, nomap_mbuf_seen; 1043 1044 M_ASSERTPKTHDR(sndptr); 1045 1046 tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS); 1047 if (mbuf_raw_wr(sndptr)) { 1048 plen = sndptr->m_pkthdr.len; 1049 KASSERT(plen <= SGE_MAX_WR_LEN, 1050 ("raw WR len %u is greater than max WR len", plen)); 1051 if (plen > tx_credits * 16) 1052 return (NULL); 1053 1054 wr = alloc_wrqe(roundup2(plen, 16), &toep->ofld_txq->wrq); 1055 if (__predict_false(wr == NULL)) 1056 return (NULL); 1057 1058 m_copydata(sndptr, 0, plen, wrtod(wr)); 1059 return (wr); 1060 } 1061 1062 iso = mbuf_iscsi_iso(sndptr); 1063 max_imm = max_imm_payload(tx_credits, iso); 1064 max_nsegs = max_dsgl_nsegs(tx_credits, iso); 1065 iso_mss = mbuf_iscsi_iso_mss(sndptr); 1066 1067 plen = 0; 1068 nsegs = 0; 1069 max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */ 1070 nomap_mbuf_seen = false; 1071 for (m = sndptr; m != NULL; m = m->m_next) { 1072 int n; 1073 1074 if (m->m_flags & M_EXTPG) 1075 n = sglist_count_mbuf_epg(m, mtod(m, vm_offset_t), 1076 m->m_len); 1077 else 1078 n = sglist_count(mtod(m, void *), m->m_len); 1079 1080 nsegs += n; 1081 plen += m->m_len; 1082 1083 /* 1084 * This mbuf would send us _over_ the nsegs limit. 1085 * Suspend tx because the PDU can't be sent out. 1086 */ 1087 if ((nomap_mbuf_seen || plen > max_imm) && nsegs > max_nsegs) 1088 return (NULL); 1089 1090 if (m->m_flags & M_EXTPG) 1091 nomap_mbuf_seen = true; 1092 if (max_nsegs_1mbuf < n) 1093 max_nsegs_1mbuf = n; 1094 } 1095 1096 if (__predict_false(toep->flags & TPF_FIN_SENT)) 1097 panic("%s: excess tx.", __func__); 1098 1099 /* 1100 * We have a PDU to send. All of it goes out in one WR so 'm' 1101 * is NULL. A PDU's length is always a multiple of 4. 1102 */ 1103 MPASS(m == NULL); 1104 MPASS((plen & 3) == 0); 1105 MPASS(sndptr->m_pkthdr.len == plen); 1106 1107 shove = !(tp->t_flags & TF_MORETOCOME); 1108 1109 /* 1110 * plen doesn't include header and data digests, which are 1111 * generated and inserted in the right places by the TOE, but 1112 * they do occupy TCP sequence space and need to be accounted 1113 * for. 
1114 */ 1115 ulp_submode = mbuf_ulp_submode(sndptr); 1116 MPASS(ulp_submode < nitems(ulp_extra_len)); 1117 npdu = iso ? howmany(plen - ISCSI_BHS_SIZE, iso_mss) : 1; 1118 adjusted_plen = plen + ulp_extra_len[ulp_submode] * npdu; 1119 if (iso) 1120 adjusted_plen += ISCSI_BHS_SIZE * (npdu - 1); 1121 wr_len = sizeof(*txwr); 1122 if (iso) 1123 wr_len += sizeof(struct cpl_tx_data_iso); 1124 if (plen <= max_imm && !nomap_mbuf_seen) { 1125 /* Immediate data tx */ 1126 imm_data = plen; 1127 wr_len += plen; 1128 nsegs = 0; 1129 } else { 1130 /* DSGL tx */ 1131 imm_data = 0; 1132 wr_len += sizeof(struct ulptx_sgl) + 1133 ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8; 1134 } 1135 1136 wr = alloc_wrqe(roundup2(wr_len, 16), &toep->ofld_txq->wrq); 1137 if (wr == NULL) { 1138 /* XXX: how will we recover from this? */ 1139 return (NULL); 1140 } 1141 txwr = wrtod(wr); 1142 credits = howmany(wr->wr_len, 16); 1143 1144 if (iso) { 1145 write_tx_wr(txwr, toep, FW_ISCSI_TX_DATA_WR, 1146 imm_data + sizeof(struct cpl_tx_data_iso), 1147 adjusted_plen, credits, shove, ulp_submode | ULP_ISO); 1148 cpl_iso = (struct cpl_tx_data_iso *)(txwr + 1); 1149 MPASS(plen == sndptr->m_pkthdr.len); 1150 write_tx_data_iso(cpl_iso, ulp_submode, 1151 mbuf_iscsi_iso_flags(sndptr), iso_mss, plen, npdu); 1152 p = cpl_iso + 1; 1153 } else { 1154 write_tx_wr(txwr, toep, FW_OFLD_TX_DATA_WR, imm_data, 1155 adjusted_plen, credits, shove, ulp_submode); 1156 p = txwr + 1; 1157 } 1158 1159 if (imm_data != 0) { 1160 m_copydata(sndptr, 0, plen, p); 1161 } else { 1162 write_tx_sgl(p, sndptr, m, nsegs, max_nsegs_1mbuf); 1163 if (wr_len & 0xf) { 1164 uint64_t *pad = (uint64_t *)((uintptr_t)txwr + wr_len); 1165 *pad = 0; 1166 } 1167 } 1168 1169 KASSERT(toep->tx_credits >= credits, 1170 ("%s: not enough credits: credits %u " 1171 "toep->tx_credits %u tx_credits %u nsegs %u " 1172 "max_nsegs %u iso %d", __func__, credits, 1173 toep->tx_credits, tx_credits, nsegs, max_nsegs, iso)); 1174 1175 tp->snd_nxt += adjusted_plen; 1176 tp->snd_max += adjusted_plen; 1177 1178 counter_u64_add(toep->ofld_txq->tx_iscsi_pdus, npdu); 1179 counter_u64_add(toep->ofld_txq->tx_iscsi_octets, plen); 1180 if (iso) 1181 counter_u64_add(toep->ofld_txq->tx_iscsi_iso_wrs, 1); 1182 1183 return (wr); 1184 } 1185 1186 void 1187 t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop) 1188 { 1189 struct mbuf *sndptr, *m; 1190 struct fw_wr_hdr *wrhdr; 1191 struct wrqe *wr; 1192 u_int plen, credits; 1193 struct inpcb *inp = toep->inp; 1194 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; 1195 struct mbufq *pduq = &toep->ulp_pduq; 1196 1197 INP_WLOCK_ASSERT(inp); 1198 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 1199 ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid)); 1200 KASSERT(ulp_mode(toep) == ULP_MODE_ISCSI, 1201 ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep)); 1202 1203 if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) 1204 return; 1205 1206 /* 1207 * This function doesn't resume by itself. Someone else must clear the 1208 * flag and call this function. 1209 */ 1210 if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) { 1211 KASSERT(drop == 0, 1212 ("%s: drop (%d) != 0 but tx is suspended", __func__, drop)); 1213 return; 1214 } 1215 1216 if (drop) { 1217 struct socket *so = inp->inp_socket; 1218 struct sockbuf *sb = &so->so_snd; 1219 int sbu; 1220 1221 /* 1222 * An unlocked read is ok here as the data should only 1223 * transition from a non-zero value to either another 1224 * non-zero value or zero. 
Once it is zero it should 1225 * stay zero. 1226 */ 1227 if (__predict_false(sbused(sb)) > 0) { 1228 SOCKBUF_LOCK(sb); 1229 sbu = sbused(sb); 1230 if (sbu > 0) { 1231 /* 1232 * The data transmitted before the 1233 * tid's ULP mode changed to ISCSI is 1234 * still in so_snd. Incoming credits 1235 * should account for so_snd first. 1236 */ 1237 sbdrop_locked(sb, min(sbu, drop)); 1238 drop -= min(sbu, drop); 1239 } 1240 sowwakeup_locked(so); /* unlocks so_snd */ 1241 } 1242 rqdrop_locked(&toep->ulp_pdu_reclaimq, drop); 1243 } 1244 1245 while ((sndptr = mbufq_first(pduq)) != NULL) { 1246 wr = write_iscsi_mbuf_wr(toep, sndptr); 1247 if (wr == NULL) { 1248 toep->flags |= TPF_TX_SUSPENDED; 1249 return; 1250 } 1251 1252 plen = sndptr->m_pkthdr.len; 1253 credits = howmany(wr->wr_len, 16); 1254 KASSERT(toep->tx_credits >= credits, 1255 ("%s: not enough credits", __func__)); 1256 1257 m = mbufq_dequeue(pduq); 1258 MPASS(m == sndptr); 1259 mbufq_enqueue(&toep->ulp_pdu_reclaimq, m); 1260 1261 toep->tx_credits -= credits; 1262 toep->tx_nocompl += credits; 1263 toep->plen_nocompl += plen; 1264 1265 /* 1266 * Ensure there are enough credits for a full-sized WR 1267 * as page pod WRs can be full-sized. 1268 */ 1269 if (toep->tx_credits <= SGE_MAX_WR_LEN * 5 / 4 && 1270 toep->tx_nocompl >= toep->tx_total / 4) { 1271 wrhdr = wrtod(wr); 1272 wrhdr->hi |= htobe32(F_FW_WR_COMPL); 1273 toep->tx_nocompl = 0; 1274 toep->plen_nocompl = 0; 1275 } 1276 1277 toep->flags |= TPF_TX_DATA_SENT; 1278 if (toep->tx_credits < MIN_OFLD_TX_CREDITS) 1279 toep->flags |= TPF_TX_SUSPENDED; 1280 1281 KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); 1282 KASSERT(plen <= MAX_OFLD_TX_SDESC_PLEN, 1283 ("%s: plen %u too large", __func__, plen)); 1284 txsd->plen = plen; 1285 txsd->tx_credits = credits; 1286 txsd++; 1287 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) { 1288 toep->txsd_pidx = 0; 1289 txsd = &toep->txsd[0]; 1290 } 1291 toep->txsd_avail--; 1292 1293 t4_l2t_send(sc, wr, toep->l2te); 1294 } 1295 1296 /* Send a FIN if requested, but only if there are no more PDUs to send */ 1297 if (mbufq_first(pduq) == NULL && toep->flags & TPF_SEND_FIN) 1298 t4_close_conn(sc, toep); 1299 } 1300 1301 static inline void 1302 t4_push_data(struct adapter *sc, struct toepcb *toep, int drop) 1303 { 1304 1305 if (ulp_mode(toep) == ULP_MODE_ISCSI) 1306 t4_push_pdus(sc, toep, drop); 1307 else if (toep->flags & TPF_KTLS) 1308 t4_push_ktls(sc, toep, drop); 1309 else 1310 t4_push_frames(sc, toep, drop); 1311 } 1312 1313 void 1314 t4_raw_wr_tx(struct adapter *sc, struct toepcb *toep, struct mbuf *m) 1315 { 1316 #ifdef INVARIANTS 1317 struct inpcb *inp = toep->inp; 1318 #endif 1319 1320 INP_WLOCK_ASSERT(inp); 1321 1322 /* 1323 * If there are other raw WRs enqueued, enqueue to preserve 1324 * FIFO ordering. 1325 */ 1326 if (!mbufq_empty(&toep->ulp_pduq)) { 1327 mbufq_enqueue(&toep->ulp_pduq, m); 1328 return; 1329 } 1330 1331 /* 1332 * Cannot call t4_push_data here as that will lock so_snd and 1333 * some callers of this run in rx handlers with so_rcv locked. 1334 * Instead, just try to transmit this WR. 
1335 */ 1336 if (!t4_push_raw_wr(sc, toep, m)) { 1337 mbufq_enqueue(&toep->ulp_pduq, m); 1338 toep->flags |= TPF_TX_SUSPENDED; 1339 } 1340 } 1341 1342 int 1343 t4_tod_output(struct toedev *tod, struct tcpcb *tp) 1344 { 1345 struct adapter *sc = tod->tod_softc; 1346 #ifdef INVARIANTS 1347 struct inpcb *inp = tptoinpcb(tp); 1348 #endif 1349 struct toepcb *toep = tp->t_toe; 1350 1351 INP_WLOCK_ASSERT(inp); 1352 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1353 ("%s: inp %p dropped.", __func__, inp)); 1354 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 1355 1356 t4_push_data(sc, toep, 0); 1357 1358 return (0); 1359 } 1360 1361 int 1362 t4_send_fin(struct toedev *tod, struct tcpcb *tp) 1363 { 1364 struct adapter *sc = tod->tod_softc; 1365 #ifdef INVARIANTS 1366 struct inpcb *inp = tptoinpcb(tp); 1367 #endif 1368 struct toepcb *toep = tp->t_toe; 1369 1370 INP_WLOCK_ASSERT(inp); 1371 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1372 ("%s: inp %p dropped.", __func__, inp)); 1373 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 1374 1375 toep->flags |= TPF_SEND_FIN; 1376 if (tp->t_state >= TCPS_ESTABLISHED) 1377 t4_push_data(sc, toep, 0); 1378 1379 return (0); 1380 } 1381 1382 int 1383 t4_send_rst(struct toedev *tod, struct tcpcb *tp) 1384 { 1385 struct adapter *sc = tod->tod_softc; 1386 #if defined(INVARIANTS) 1387 struct inpcb *inp = tptoinpcb(tp); 1388 #endif 1389 struct toepcb *toep = tp->t_toe; 1390 1391 INP_WLOCK_ASSERT(inp); 1392 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1393 ("%s: inp %p dropped.", __func__, inp)); 1394 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 1395 1396 /* hmmmm */ 1397 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 1398 ("%s: flowc for tid %u [%s] not sent already", 1399 __func__, toep->tid, tcpstates[tp->t_state])); 1400 1401 send_reset(sc, toep, 0); 1402 return (0); 1403 } 1404 1405 /* 1406 * Peer has sent us a FIN. 1407 */ 1408 static int 1409 do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1410 { 1411 struct adapter *sc = iq->adapter; 1412 const struct cpl_peer_close *cpl = (const void *)(rss + 1); 1413 unsigned int tid = GET_TID(cpl); 1414 struct toepcb *toep = lookup_tid(sc, tid); 1415 struct inpcb *inp = toep->inp; 1416 struct tcpcb *tp = NULL; 1417 struct socket *so; 1418 struct epoch_tracker et; 1419 #ifdef INVARIANTS 1420 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1421 #endif 1422 1423 KASSERT(opcode == CPL_PEER_CLOSE, 1424 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1425 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1426 1427 if (__predict_false(toep->flags & TPF_SYNQE)) { 1428 /* 1429 * do_pass_establish must have run before do_peer_close and if 1430 * this is still a synqe instead of a toepcb then the connection 1431 * must be getting aborted. 1432 */ 1433 MPASS(toep->flags & TPF_ABORT_SHUTDOWN); 1434 CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid, 1435 toep, toep->flags); 1436 return (0); 1437 } 1438 1439 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1440 1441 CURVNET_SET(toep->vnet); 1442 NET_EPOCH_ENTER(et); 1443 INP_WLOCK(inp); 1444 tp = intotcpcb(inp); 1445 1446 CTR6(KTR_CXGBE, 1447 "%s: tid %u (%s), toep_flags 0x%x, ddp_flags 0x%x, inp %p", 1448 __func__, tid, tp ? 
tcpstates[tp->t_state] : "no tp", toep->flags, 1449 toep->ddp.flags, inp); 1450 1451 if (toep->flags & TPF_ABORT_SHUTDOWN) 1452 goto done; 1453 1454 if (ulp_mode(toep) == ULP_MODE_TCPDDP) { 1455 DDP_LOCK(toep); 1456 if (__predict_false(toep->ddp.flags & 1457 (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE))) 1458 handle_ddp_close(toep, tp, cpl->rcv_nxt); 1459 DDP_UNLOCK(toep); 1460 } 1461 so = inp->inp_socket; 1462 socantrcvmore(so); 1463 1464 if (ulp_mode(toep) == ULP_MODE_RDMA || 1465 (ulp_mode(toep) == ULP_MODE_ISCSI && chip_id(sc) >= CHELSIO_T6)) { 1466 /* 1467 * There might be data received via DDP before the FIN 1468 * not reported to the driver. Just assume the 1469 * sequence number in the CPL is correct as the 1470 * sequence number of the FIN. 1471 */ 1472 } else { 1473 KASSERT(tp->rcv_nxt + 1 == be32toh(cpl->rcv_nxt), 1474 ("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt, 1475 be32toh(cpl->rcv_nxt))); 1476 } 1477 1478 tp->rcv_nxt = be32toh(cpl->rcv_nxt); 1479 1480 switch (tp->t_state) { 1481 case TCPS_SYN_RECEIVED: 1482 tp->t_starttime = ticks; 1483 /* FALLTHROUGH */ 1484 1485 case TCPS_ESTABLISHED: 1486 tcp_state_change(tp, TCPS_CLOSE_WAIT); 1487 break; 1488 1489 case TCPS_FIN_WAIT_1: 1490 tcp_state_change(tp, TCPS_CLOSING); 1491 break; 1492 1493 case TCPS_FIN_WAIT_2: 1494 restore_so_proto(so, inp->inp_vflag & INP_IPV6); 1495 t4_pcb_detach(NULL, tp); 1496 tcp_twstart(tp); 1497 INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ 1498 NET_EPOCH_EXIT(et); 1499 CURVNET_RESTORE(); 1500 1501 INP_WLOCK(inp); 1502 final_cpl_received(toep); 1503 return (0); 1504 1505 default: 1506 log(LOG_ERR, "%s: TID %u received CPL_PEER_CLOSE in state %d\n", 1507 __func__, tid, tp->t_state); 1508 } 1509 done: 1510 INP_WUNLOCK(inp); 1511 NET_EPOCH_EXIT(et); 1512 CURVNET_RESTORE(); 1513 return (0); 1514 } 1515 1516 /* 1517 * Peer has ACK'd our FIN. 1518 */ 1519 static int 1520 do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss, 1521 struct mbuf *m) 1522 { 1523 struct adapter *sc = iq->adapter; 1524 const struct cpl_close_con_rpl *cpl = (const void *)(rss + 1); 1525 unsigned int tid = GET_TID(cpl); 1526 struct toepcb *toep = lookup_tid(sc, tid); 1527 struct inpcb *inp = toep->inp; 1528 struct tcpcb *tp = NULL; 1529 struct socket *so = NULL; 1530 struct epoch_tracker et; 1531 #ifdef INVARIANTS 1532 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1533 #endif 1534 1535 KASSERT(opcode == CPL_CLOSE_CON_RPL, 1536 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1537 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1538 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1539 1540 CURVNET_SET(toep->vnet); 1541 NET_EPOCH_ENTER(et); 1542 INP_WLOCK(inp); 1543 tp = intotcpcb(inp); 1544 1545 CTR4(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x", 1546 __func__, tid, tp ? 
tcpstates[tp->t_state] : "no tp", toep->flags); 1547 1548 if (toep->flags & TPF_ABORT_SHUTDOWN) 1549 goto done; 1550 1551 so = inp->inp_socket; 1552 tp->snd_una = be32toh(cpl->snd_nxt) - 1; /* exclude FIN */ 1553 1554 switch (tp->t_state) { 1555 case TCPS_CLOSING: /* see TCPS_FIN_WAIT_2 in do_peer_close too */ 1556 restore_so_proto(so, inp->inp_vflag & INP_IPV6); 1557 t4_pcb_detach(NULL, tp); 1558 tcp_twstart(tp); 1559 release: 1560 INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ 1561 NET_EPOCH_EXIT(et); 1562 CURVNET_RESTORE(); 1563 1564 INP_WLOCK(inp); 1565 final_cpl_received(toep); /* no more CPLs expected */ 1566 1567 return (0); 1568 case TCPS_LAST_ACK: 1569 if (tcp_close(tp)) 1570 INP_WUNLOCK(inp); 1571 goto release; 1572 1573 case TCPS_FIN_WAIT_1: 1574 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 1575 soisdisconnected(so); 1576 tcp_state_change(tp, TCPS_FIN_WAIT_2); 1577 break; 1578 1579 default: 1580 log(LOG_ERR, 1581 "%s: TID %u received CPL_CLOSE_CON_RPL in state %s\n", 1582 __func__, tid, tcpstates[tp->t_state]); 1583 } 1584 done: 1585 INP_WUNLOCK(inp); 1586 NET_EPOCH_EXIT(et); 1587 CURVNET_RESTORE(); 1588 return (0); 1589 } 1590 1591 void 1592 send_abort_rpl(struct adapter *sc, struct sge_ofld_txq *ofld_txq, int tid, 1593 int rst_status) 1594 { 1595 struct wrqe *wr; 1596 struct cpl_abort_rpl *cpl; 1597 1598 wr = alloc_wrqe(sizeof(*cpl), &ofld_txq->wrq); 1599 if (wr == NULL) { 1600 /* XXX */ 1601 panic("%s: allocation failure.", __func__); 1602 } 1603 cpl = wrtod(wr); 1604 1605 INIT_TP_WR_MIT_CPL(cpl, CPL_ABORT_RPL, tid); 1606 cpl->cmd = rst_status; 1607 1608 t4_wrq_tx(sc, wr); 1609 } 1610 1611 static int 1612 abort_status_to_errno(struct tcpcb *tp, unsigned int abort_reason) 1613 { 1614 switch (abort_reason) { 1615 case CPL_ERR_BAD_SYN: 1616 case CPL_ERR_CONN_RESET: 1617 return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET); 1618 case CPL_ERR_XMIT_TIMEDOUT: 1619 case CPL_ERR_PERSIST_TIMEDOUT: 1620 case CPL_ERR_FINWAIT2_TIMEDOUT: 1621 case CPL_ERR_KEEPALIVE_TIMEDOUT: 1622 return (ETIMEDOUT); 1623 default: 1624 return (EIO); 1625 } 1626 } 1627 1628 /* 1629 * TCP RST from the peer, timeout, or some other such critical error. 1630 */ 1631 static int 1632 do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1633 { 1634 struct adapter *sc = iq->adapter; 1635 const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1); 1636 unsigned int tid = GET_TID(cpl); 1637 struct toepcb *toep = lookup_tid(sc, tid); 1638 struct sge_ofld_txq *ofld_txq = toep->ofld_txq; 1639 struct inpcb *inp; 1640 struct tcpcb *tp; 1641 struct epoch_tracker et; 1642 #ifdef INVARIANTS 1643 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1644 #endif 1645 1646 KASSERT(opcode == CPL_ABORT_REQ_RSS, 1647 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1648 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1649 1650 if (toep->flags & TPF_SYNQE) 1651 return (do_abort_req_synqe(iq, rss, m)); 1652 1653 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1654 1655 if (negative_advice(cpl->status)) { 1656 CTR4(KTR_CXGBE, "%s: negative advice %d for tid %d (0x%x)", 1657 __func__, cpl->status, tid, toep->flags); 1658 return (0); /* Ignore negative advice */ 1659 } 1660 1661 inp = toep->inp; 1662 CURVNET_SET(toep->vnet); 1663 NET_EPOCH_ENTER(et); /* for tcp_close */ 1664 INP_WLOCK(inp); 1665 1666 tp = intotcpcb(inp); 1667 1668 CTR6(KTR_CXGBE, 1669 "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x, status %d", 1670 __func__, tid, tp ? 
tcpstates[tp->t_state] : "no tp", toep->flags, 1671 inp->inp_flags, cpl->status); 1672 1673 /* 1674 * If we'd initiated an abort earlier the reply to it is responsible for 1675 * cleaning up resources. Otherwise we tear everything down right here 1676 * right now. We owe the T4 a CPL_ABORT_RPL no matter what. 1677 */ 1678 if (toep->flags & TPF_ABORT_SHUTDOWN) { 1679 INP_WUNLOCK(inp); 1680 goto done; 1681 } 1682 toep->flags |= TPF_ABORT_SHUTDOWN; 1683 1684 if ((inp->inp_flags & INP_DROPPED) == 0) { 1685 struct socket *so = inp->inp_socket; 1686 1687 if (so != NULL) 1688 so_error_set(so, abort_status_to_errno(tp, 1689 cpl->status)); 1690 tp = tcp_close(tp); 1691 if (tp == NULL) 1692 INP_WLOCK(inp); /* re-acquire */ 1693 } 1694 1695 final_cpl_received(toep); 1696 done: 1697 NET_EPOCH_EXIT(et); 1698 CURVNET_RESTORE(); 1699 send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST); 1700 return (0); 1701 } 1702 1703 /* 1704 * Reply to the CPL_ABORT_REQ (send_reset) 1705 */ 1706 static int 1707 do_abort_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1708 { 1709 struct adapter *sc = iq->adapter; 1710 const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1); 1711 unsigned int tid = GET_TID(cpl); 1712 struct toepcb *toep = lookup_tid(sc, tid); 1713 struct inpcb *inp = toep->inp; 1714 #ifdef INVARIANTS 1715 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1716 #endif 1717 1718 KASSERT(opcode == CPL_ABORT_RPL_RSS, 1719 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1720 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1721 1722 if (toep->flags & TPF_SYNQE) 1723 return (do_abort_rpl_synqe(iq, rss, m)); 1724 1725 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1726 1727 CTR5(KTR_CXGBE, "%s: tid %u, toep %p, inp %p, status %d", 1728 __func__, tid, toep, inp, cpl->status); 1729 1730 KASSERT(toep->flags & TPF_ABORT_SHUTDOWN, 1731 ("%s: wasn't expecting abort reply", __func__)); 1732 1733 INP_WLOCK(inp); 1734 final_cpl_received(toep); 1735 1736 return (0); 1737 } 1738 1739 static int 1740 do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1741 { 1742 struct adapter *sc = iq->adapter; 1743 const struct cpl_rx_data *cpl = mtod(m, const void *); 1744 unsigned int tid = GET_TID(cpl); 1745 struct toepcb *toep = lookup_tid(sc, tid); 1746 struct inpcb *inp = toep->inp; 1747 struct tcpcb *tp; 1748 struct socket *so; 1749 struct sockbuf *sb; 1750 struct epoch_tracker et; 1751 int len; 1752 uint32_t ddp_placed = 0; 1753 1754 if (__predict_false(toep->flags & TPF_SYNQE)) { 1755 /* 1756 * do_pass_establish must have run before do_rx_data and if this 1757 * is still a synqe instead of a toepcb then the connection must 1758 * be getting aborted. 1759 */ 1760 MPASS(toep->flags & TPF_ABORT_SHUTDOWN); 1761 CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid, 1762 toep, toep->flags); 1763 m_freem(m); 1764 return (0); 1765 } 1766 1767 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1768 1769 /* strip off CPL header */ 1770 m_adj(m, sizeof(*cpl)); 1771 len = m->m_pkthdr.len; 1772 1773 INP_WLOCK(inp); 1774 if (inp->inp_flags & INP_DROPPED) { 1775 CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x", 1776 __func__, tid, len, inp->inp_flags); 1777 INP_WUNLOCK(inp); 1778 m_freem(m); 1779 return (0); 1780 } 1781 1782 tp = intotcpcb(inp); 1783 1784 if (__predict_false(ulp_mode(toep) == ULP_MODE_TLS && 1785 toep->flags & TPF_TLS_RECEIVE)) { 1786 /* Received "raw" data on a TLS socket. 
*/ 1787 CTR3(KTR_CXGBE, "%s: tid %u, raw TLS data (%d bytes)", 1788 __func__, tid, len); 1789 do_rx_data_tls(cpl, toep, m); 1790 return (0); 1791 } 1792 1793 if (__predict_false(tp->rcv_nxt != be32toh(cpl->seq))) 1794 ddp_placed = be32toh(cpl->seq) - tp->rcv_nxt; 1795 1796 tp->rcv_nxt += len; 1797 if (tp->rcv_wnd < len) { 1798 KASSERT(ulp_mode(toep) == ULP_MODE_RDMA, 1799 ("%s: negative window size", __func__)); 1800 } 1801 1802 tp->rcv_wnd -= len; 1803 tp->t_rcvtime = ticks; 1804 1805 if (ulp_mode(toep) == ULP_MODE_TCPDDP) 1806 DDP_LOCK(toep); 1807 so = inp_inpcbtosocket(inp); 1808 sb = &so->so_rcv; 1809 SOCKBUF_LOCK(sb); 1810 1811 if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) { 1812 CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)", 1813 __func__, tid, len); 1814 m_freem(m); 1815 SOCKBUF_UNLOCK(sb); 1816 if (ulp_mode(toep) == ULP_MODE_TCPDDP) 1817 DDP_UNLOCK(toep); 1818 INP_WUNLOCK(inp); 1819 1820 CURVNET_SET(toep->vnet); 1821 NET_EPOCH_ENTER(et); 1822 INP_WLOCK(inp); 1823 tp = tcp_drop(tp, ECONNRESET); 1824 if (tp) 1825 INP_WUNLOCK(inp); 1826 NET_EPOCH_EXIT(et); 1827 CURVNET_RESTORE(); 1828 1829 return (0); 1830 } 1831 1832 /* receive buffer autosize */ 1833 MPASS(toep->vnet == so->so_vnet); 1834 CURVNET_SET(toep->vnet); 1835 if (sb->sb_flags & SB_AUTOSIZE && 1836 V_tcp_do_autorcvbuf && 1837 sb->sb_hiwat < V_tcp_autorcvbuf_max && 1838 len > (sbspace(sb) / 8 * 7)) { 1839 unsigned int hiwat = sb->sb_hiwat; 1840 unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc, 1841 V_tcp_autorcvbuf_max); 1842 1843 if (!sbreserve_locked(so, SO_RCV, newsize, NULL)) 1844 sb->sb_flags &= ~SB_AUTOSIZE; 1845 } 1846 1847 if (ulp_mode(toep) == ULP_MODE_TCPDDP) { 1848 int changed = !(toep->ddp.flags & DDP_ON) ^ cpl->ddp_off; 1849 1850 if (toep->ddp.waiting_count != 0 || toep->ddp.active_count != 0) 1851 CTR3(KTR_CXGBE, "%s: tid %u, non-ddp rx (%d bytes)", 1852 __func__, tid, len); 1853 1854 if (changed) { 1855 if (toep->ddp.flags & DDP_SC_REQ) 1856 toep->ddp.flags ^= DDP_ON | DDP_SC_REQ; 1857 else if (cpl->ddp_off == 1) { 1858 /* Fell out of DDP mode */ 1859 toep->ddp.flags &= ~DDP_ON; 1860 CTR1(KTR_CXGBE, "%s: fell out of DDP mode", 1861 __func__); 1862 1863 insert_ddp_data(toep, ddp_placed); 1864 } else { 1865 /* 1866 * Data was received while still 1867 * ULP_MODE_NONE, just fall through. 1868 */ 1869 } 1870 } 1871 1872 if (toep->ddp.flags & DDP_ON) { 1873 /* 1874 * CPL_RX_DATA with DDP on can only be an indicate. 1875 * Start posting queued AIO requests via DDP. The 1876 * payload that arrived in this indicate is appended 1877 * to the socket buffer as usual. 
1878 */ 1879 handle_ddp_indicate(toep); 1880 } 1881 } 1882 1883 sbappendstream_locked(sb, m, 0); 1884 t4_rcvd_locked(&toep->td->tod, tp); 1885 1886 if (ulp_mode(toep) == ULP_MODE_TCPDDP && 1887 (toep->ddp.flags & DDP_AIO) != 0 && toep->ddp.waiting_count > 0 && 1888 sbavail(sb) != 0) { 1889 CTR2(KTR_CXGBE, "%s: tid %u queueing AIO task", __func__, 1890 tid); 1891 ddp_queue_toep(toep); 1892 } 1893 if (toep->flags & TPF_TLS_STARTING) 1894 tls_received_starting_data(sc, toep, sb, len); 1895 sorwakeup_locked(so); 1896 SOCKBUF_UNLOCK_ASSERT(sb); 1897 if (ulp_mode(toep) == ULP_MODE_TCPDDP) 1898 DDP_UNLOCK(toep); 1899 1900 INP_WUNLOCK(inp); 1901 CURVNET_RESTORE(); 1902 return (0); 1903 } 1904 1905 static int 1906 do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1907 { 1908 struct adapter *sc = iq->adapter; 1909 const struct cpl_fw4_ack *cpl = (const void *)(rss + 1); 1910 unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl))); 1911 struct toepcb *toep = lookup_tid(sc, tid); 1912 struct inpcb *inp; 1913 struct tcpcb *tp; 1914 struct socket *so; 1915 uint8_t credits = cpl->credits; 1916 struct ofld_tx_sdesc *txsd; 1917 int plen; 1918 #ifdef INVARIANTS 1919 unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl))); 1920 #endif 1921 1922 /* 1923 * Very unusual case: we'd sent a flowc + abort_req for a synq entry and 1924 * now this comes back carrying the credits for the flowc. 1925 */ 1926 if (__predict_false(toep->flags & TPF_SYNQE)) { 1927 KASSERT(toep->flags & TPF_ABORT_SHUTDOWN, 1928 ("%s: credits for a synq entry %p", __func__, toep)); 1929 return (0); 1930 } 1931 1932 inp = toep->inp; 1933 1934 KASSERT(opcode == CPL_FW4_ACK, 1935 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1936 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1937 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1938 1939 INP_WLOCK(inp); 1940 1941 if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) { 1942 INP_WUNLOCK(inp); 1943 return (0); 1944 } 1945 1946 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1947 ("%s: inp_flags 0x%x", __func__, inp->inp_flags)); 1948 1949 tp = intotcpcb(inp); 1950 1951 if (cpl->flags & CPL_FW4_ACK_FLAGS_SEQVAL) { 1952 tcp_seq snd_una = be32toh(cpl->snd_una); 1953 1954 #ifdef INVARIANTS 1955 if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) { 1956 log(LOG_ERR, 1957 "%s: unexpected seq# %x for TID %u, snd_una %x\n", 1958 __func__, snd_una, toep->tid, tp->snd_una); 1959 } 1960 #endif 1961 1962 if (tp->snd_una != snd_una) { 1963 tp->snd_una = snd_una; 1964 tp->ts_recent_age = tcp_ts_getticks(); 1965 } 1966 } 1967 1968 #ifdef VERBOSE_TRACES 1969 CTR3(KTR_CXGBE, "%s: tid %d credits %u", __func__, tid, credits); 1970 #endif 1971 so = inp->inp_socket; 1972 txsd = &toep->txsd[toep->txsd_cidx]; 1973 plen = 0; 1974 while (credits) { 1975 KASSERT(credits >= txsd->tx_credits, 1976 ("%s: too many (or partial) credits", __func__)); 1977 credits -= txsd->tx_credits; 1978 toep->tx_credits += txsd->tx_credits; 1979 plen += txsd->plen; 1980 txsd++; 1981 toep->txsd_avail++; 1982 KASSERT(toep->txsd_avail <= toep->txsd_total, 1983 ("%s: txsd avail > total", __func__)); 1984 if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) { 1985 txsd = &toep->txsd[0]; 1986 toep->txsd_cidx = 0; 1987 } 1988 } 1989 1990 if (toep->tx_credits == toep->tx_total) { 1991 toep->tx_nocompl = 0; 1992 toep->plen_nocompl = 0; 1993 } 1994 1995 if (toep->flags & TPF_TX_SUSPENDED && 1996 toep->tx_credits >= toep->tx_total / 4) { 1997 #ifdef VERBOSE_TRACES 
void
write_set_tcb_field(struct adapter *sc, void *dst, struct toepcb *toep,
    uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie)
{
	struct cpl_set_tcb_field *req = dst;

	MPASS((cookie & ~M_COOKIE) == 0);
	if (reply) {
		MPASS(cookie != CPL_COOKIE_RESERVED);
	}

	INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, toep->tid);
	if (reply == 0) {
		req->reply_ctrl = htobe16(F_NO_REPLY);
	} else {
		const int qid = toep->ofld_rxq->iq.abs_id;
		if (chip_id(sc) >= CHELSIO_T7) {
			req->reply_ctrl = htobe16(V_T7_QUEUENO(qid) |
			    V_T7_REPLY_CHAN(0) | V_NO_REPLY(0));
		} else {
			req->reply_ctrl = htobe16(V_QUEUENO(qid) |
			    V_REPLY_CHAN(0) | V_NO_REPLY(0));
		}
	}
	req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(cookie));
	req->mask = htobe64(mask);
	req->val = htobe64(val);
}

void
t4_set_tcb_field(struct adapter *sc, struct sge_wrq *wrq, struct toepcb *toep,
    uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie)
{
	struct wrqe *wr;
	struct ofld_tx_sdesc *txsd;
	const u_int len = sizeof(struct cpl_set_tcb_field);

	wr = alloc_wrqe(len, wrq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	write_set_tcb_field(sc, wrtod(wr), toep, word, mask, val, reply,
	    cookie);

	if (wrq->eq.type == EQ_OFLD) {
		txsd = &toep->txsd[toep->txsd_pidx];
		_Static_assert(howmany(len, 16) <= MAX_OFLD_TX_SDESC_CREDITS,
		    "MAX_OFLD_TX_SDESC_CREDITS too small");
		txsd->tx_credits = howmany(len, 16);
		txsd->plen = 0;
		KASSERT(toep->tx_credits >= txsd->tx_credits &&
		    toep->txsd_avail > 0,
		    ("%s: not enough credits (%d)", __func__,
		    toep->tx_credits));
		toep->tx_credits -= txsd->tx_credits;
		if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
			toep->txsd_pidx = 0;
		toep->txsd_avail--;
	}

	t4_wrq_tx(sc, wr);
}

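/*
 * CPL_ABORT_RPL_RSS and CPL_FW4_ACK are shared CPL messages, so TOM's
 * handlers for them are registered under CPL_COOKIE_TOM; the remaining
 * opcodes use the plain per-opcode handlers.
 */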
void
t4_init_cpl_io_handlers(void)
{

	t4_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
	t4_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
	t4_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
	t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl,
	    CPL_COOKIE_TOM);
	t4_register_cpl_handler(CPL_RX_DATA, do_rx_data);
	t4_register_shared_cpl_handler(CPL_FW4_ACK, do_fw4_ack, CPL_COOKIE_TOM);
}

void
t4_uninit_cpl_io_handlers(void)
{

	t4_register_cpl_handler(CPL_PEER_CLOSE, NULL);
	t4_register_cpl_handler(CPL_CLOSE_CON_RPL, NULL);
	t4_register_cpl_handler(CPL_ABORT_REQ_RSS, NULL);
	t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, NULL, CPL_COOKIE_TOM);
	t4_register_cpl_handler(CPL_RX_DATA, NULL);
	t4_register_shared_cpl_handler(CPL_FW4_ACK, NULL, CPL_COOKIE_TOM);
}

/*
 * Use the 'backend1' field in AIO jobs to hold an error that should
 * be reported when the job is completed, the 'backend3' field to
 * store the amount of data sent by the AIO job so far, and the
 * 'backend4' field to hold a reference count on the job.
 *
 * Each unmapped mbuf holds a reference on the job as does the queue
 * so long as the job is queued.
 */
#define aio_error	backend1
#define aio_sent	backend3
#define aio_refs	backend4

#ifdef VERBOSE_TRACES
static int
jobtotid(struct kaiocb *job)
{
	struct socket *so;
	struct tcpcb *tp;
	struct toepcb *toep;

	so = job->fd_file->f_data;
	tp = sototcpcb(so);
	toep = tp->t_toe;
	return (toep->tid);
}
#endif

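/*
 * Drop one reference on an AIO tx job.  The last reference to go away
 * completes the job, reporting either the error stashed in aio_error
 * or the byte count recorded in aio_sent; a job that sent some data
 * before failing is completed as a short write.
 */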
static void
aiotx_free_job(struct kaiocb *job)
{
	long status;
	int error;

	if (refcount_release(&job->aio_refs) == 0)
		return;

	error = (intptr_t)job->aio_error;
	status = job->aio_sent;
#ifdef VERBOSE_TRACES
	CTR5(KTR_CXGBE, "%s: tid %d completed %p len %ld, error %d", __func__,
	    jobtotid(job), job, status, error);
#endif
	if (error != 0 && status != 0)
		error = 0;
	if (error == ECANCELED)
		aio_cancel(job);
	else if (error)
		aio_complete(job, -1, error);
	else {
		job->msgsnd = 1;
		aio_complete(job, status, 0);
	}
}

static void
aiotx_free_pgs(struct mbuf *m)
{
	struct kaiocb *job;
	vm_page_t pg;

	M_ASSERTEXTPG(m);
	job = m->m_ext.ext_arg1;
#ifdef VERBOSE_TRACES
	CTR3(KTR_CXGBE, "%s: completed %d bytes for tid %d", __func__,
	    m->m_len, jobtotid(job));
#endif

	for (int i = 0; i < m->m_epg_npgs; i++) {
		pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]);
		vm_page_unwire(pg, PQ_ACTIVE);
	}

	aiotx_free_job(job);
}

/*
 * Allocate a chain of unmapped mbufs describing the next 'len' bytes
 * of an AIO job.
 */
static struct mbuf *
alloc_aiotx_mbuf(struct kaiocb *job, int len)
{
	struct vmspace *vm;
	vm_page_t pgs[MBUF_PEXT_MAX_PGS];
	struct mbuf *m, *top, *last;
	vm_map_t map;
	vm_offset_t start;
	int i, mlen, npages, pgoff;

	KASSERT(job->aio_sent + len <= job->uaiocb.aio_nbytes,
	    ("%s(%p, %d): request to send beyond end of buffer", __func__,
	    job, len));

	/*
	 * The AIO subsystem will cancel and drain all requests before
	 * permitting a process to exit or exec, so p_vmspace should
	 * be stable here.
	 */
	vm = job->userproc->p_vmspace;
	map = &vm->vm_map;
	start = (uintptr_t)job->uaiocb.aio_buf + job->aio_sent;
	pgoff = start & PAGE_MASK;

	top = NULL;
	last = NULL;
	while (len > 0) {
		mlen = imin(len, MBUF_PEXT_MAX_PGS * PAGE_SIZE - pgoff);
		KASSERT(mlen == len || ((start + mlen) & PAGE_MASK) == 0,
		    ("%s: next start (%#jx + %#x) is not page aligned",
		    __func__, (uintmax_t)start, mlen));

		npages = vm_fault_quick_hold_pages(map, start, mlen,
		    VM_PROT_WRITE, pgs, nitems(pgs));
		if (npages < 0)
			break;

		m = mb_alloc_ext_pgs(M_WAITOK, aiotx_free_pgs, M_RDONLY);
		m->m_epg_1st_off = pgoff;
		m->m_epg_npgs = npages;
		if (npages == 1) {
			KASSERT(mlen + pgoff <= PAGE_SIZE,
			    ("%s: single page is too large (off %d len %d)",
			    __func__, pgoff, mlen));
			m->m_epg_last_len = mlen;
		} else {
			m->m_epg_last_len = mlen - (PAGE_SIZE - pgoff) -
			    (npages - 2) * PAGE_SIZE;
		}
		for (i = 0; i < npages; i++)
			m->m_epg_pa[i] = VM_PAGE_TO_PHYS(pgs[i]);

		m->m_len = mlen;
		m->m_ext.ext_size = npages * PAGE_SIZE;
		m->m_ext.ext_arg1 = job;
		refcount_acquire(&job->aio_refs);

#ifdef VERBOSE_TRACES
		CTR5(KTR_CXGBE, "%s: tid %d, new mbuf %p for job %p, npages %d",
		    __func__, jobtotid(job), m, job, npages);
#endif

		if (top == NULL)
			top = m;
		else
			last->m_next = m;
		last = m;

		len -= mlen;
		start += mlen;
		pgoff = 0;
	}

	return (top);
}

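/*
 * Send as much of a queued AIO write job as the socket allows.  Called
 * from the aiotx task with so_snd locked; the lock is dropped while the
 * user pages are wired and queued for transmit and is held again on
 * return.  Jobs that cannot make progress are put back on aiotx_jobq.
 */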
static void
t4_aiotx_process_job(struct toepcb *toep, struct socket *so, struct kaiocb *job)
{
	struct sockbuf *sb;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct mbuf *m;
	u_int sent;
	int error, len;
	bool moretocome, sendmore;

	sb = &so->so_snd;
	SOCKBUF_UNLOCK(sb);
	m = NULL;

#ifdef MAC
	error = mac_socket_check_send(job->fd_file->f_cred, so);
	if (error != 0)
		goto out;
#endif

	/* Inline sosend_generic(). */

	error = SOCK_IO_SEND_LOCK(so, SBL_WAIT);
	MPASS(error == 0);

sendanother:
	SOCKBUF_LOCK(sb);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		SOCKBUF_UNLOCK(sb);
		SOCK_IO_SEND_UNLOCK(so);
		if ((so->so_options & SO_NOSIGPIPE) == 0) {
			PROC_LOCK(job->userproc);
			kern_psignal(job->userproc, SIGPIPE);
			PROC_UNLOCK(job->userproc);
		}
		error = EPIPE;
		goto out;
	}
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		SOCKBUF_UNLOCK(sb);
		SOCK_IO_SEND_UNLOCK(so);
		goto out;
	}
	if ((so->so_state & SS_ISCONNECTED) == 0) {
		SOCKBUF_UNLOCK(sb);
		SOCK_IO_SEND_UNLOCK(so);
		error = ENOTCONN;
		goto out;
	}
	if (sbspace(sb) < sb->sb_lowat) {
		MPASS(job->aio_sent == 0 || !(so->so_state & SS_NBIO));

		/*
		 * Don't block if there is too little room in the socket
		 * buffer.  Instead, requeue the request.
		 */
		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
			SOCKBUF_UNLOCK(sb);
			SOCK_IO_SEND_UNLOCK(so);
			error = ECANCELED;
			goto out;
		}
		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
		SOCKBUF_UNLOCK(sb);
		SOCK_IO_SEND_UNLOCK(so);
		goto out;
	}

	/*
	 * Write as much data as the socket permits, but no more than a
	 * single sndbuf at a time.
	 */
	len = sbspace(sb);
	if (len > job->uaiocb.aio_nbytes - job->aio_sent) {
		len = job->uaiocb.aio_nbytes - job->aio_sent;
		moretocome = false;
	} else
		moretocome = true;
	if (len > toep->params.sndbuf) {
		len = toep->params.sndbuf;
		sendmore = true;
	} else
		sendmore = false;

	if (!TAILQ_EMPTY(&toep->aiotx_jobq))
		moretocome = true;
	SOCKBUF_UNLOCK(sb);
	MPASS(len != 0);

	m = alloc_aiotx_mbuf(job, len);
	if (m == NULL) {
		SOCK_IO_SEND_UNLOCK(so);
		error = EFAULT;
		goto out;
	}

	/* Inlined tcp_usr_send(). */

	inp = toep->inp;
	INP_WLOCK(inp);
	if (inp->inp_flags & INP_DROPPED) {
		INP_WUNLOCK(inp);
		SOCK_IO_SEND_UNLOCK(so);
		error = ECONNRESET;
		goto out;
	}

	sent = m_length(m, NULL);
	job->aio_sent += sent;
	counter_u64_add(toep->ofld_txq->tx_aio_octets, sent);

	sbappendstream(sb, m, 0);
	m = NULL;

	if (!(inp->inp_flags & INP_DROPPED)) {
		tp = intotcpcb(inp);
		if (moretocome)
			tp->t_flags |= TF_MORETOCOME;
		error = tcp_output(tp);
		if (error < 0) {
			INP_UNLOCK_ASSERT(inp);
			SOCK_IO_SEND_UNLOCK(so);
			error = -error;
			goto out;
		}
		if (moretocome)
			tp->t_flags &= ~TF_MORETOCOME;
	}

	INP_WUNLOCK(inp);
	if (sendmore)
		goto sendanother;
	SOCK_IO_SEND_UNLOCK(so);

	if (error)
		goto out;

	/*
	 * If this is a blocking socket and the request has not been
	 * fully completed, requeue it until the socket is ready
	 * again.
	 */
	if (job->aio_sent < job->uaiocb.aio_nbytes &&
	    !(so->so_state & SS_NBIO)) {
		SOCKBUF_LOCK(sb);
		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
			SOCKBUF_UNLOCK(sb);
			error = ECANCELED;
			goto out;
		}
		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
		return;
	}

	/*
	 * If the request will not be requeued, drop the queue's
	 * reference to the job.  Any mbufs in flight should still
	 * hold a reference, but this drops the reference that the
	 * queue owns while it is waiting to queue mbufs to the
	 * socket.
	 */
	aiotx_free_job(job);
	counter_u64_add(toep->ofld_txq->tx_aio_jobs, 1);

out:
	if (error) {
		job->aio_error = (void *)(intptr_t)error;
		aiotx_free_job(job);
	}
	m_freem(m);
	SOCKBUF_LOCK(sb);
}

static void
t4_aiotx_task(void *context, int pending)
{
	struct toepcb *toep = context;
	struct socket *so;
	struct kaiocb *job;
	struct epoch_tracker et;

	so = toep->aiotx_so;
	CURVNET_SET(toep->vnet);
	NET_EPOCH_ENTER(et);
	SOCKBUF_LOCK(&so->so_snd);
	while (!TAILQ_EMPTY(&toep->aiotx_jobq) && sowriteable(so)) {
		job = TAILQ_FIRST(&toep->aiotx_jobq);
		TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
		if (!aio_clear_cancel_function(job))
			continue;

		t4_aiotx_process_job(toep, so, job);
	}
	toep->aiotx_so = NULL;
	SOCKBUF_UNLOCK(&so->so_snd);
	NET_EPOCH_EXIT(et);

	free_toepcb(toep);
	sorele(so);
	CURVNET_RESTORE();
}

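/*
 * Kick off the aiotx task for this connection unless it is already
 * scheduled.  The task holds a reference on both the socket and the
 * toepcb until it completes.
 */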
"true" : "false"); 2500 #endif 2501 if (toep->aiotx_so != NULL) 2502 return; 2503 soref(so); 2504 toep->aiotx_so = so; 2505 hold_toepcb(toep); 2506 soaio_enqueue(&toep->aiotx_task); 2507 } 2508 2509 static void 2510 t4_aiotx_cancel(struct kaiocb *job) 2511 { 2512 struct socket *so; 2513 struct sockbuf *sb; 2514 struct tcpcb *tp; 2515 struct toepcb *toep; 2516 2517 so = job->fd_file->f_data; 2518 tp = sototcpcb(so); 2519 toep = tp->t_toe; 2520 MPASS(job->uaiocb.aio_lio_opcode == LIO_WRITE); 2521 sb = &so->so_snd; 2522 2523 SOCKBUF_LOCK(sb); 2524 if (!aio_cancel_cleared(job)) 2525 TAILQ_REMOVE(&toep->aiotx_jobq, job, list); 2526 SOCKBUF_UNLOCK(sb); 2527 2528 job->aio_error = (void *)(intptr_t)ECANCELED; 2529 aiotx_free_job(job); 2530 } 2531 2532 int 2533 t4_aio_queue_aiotx(struct socket *so, struct kaiocb *job) 2534 { 2535 struct tcpcb *tp = sototcpcb(so); 2536 struct toepcb *toep = tp->t_toe; 2537 struct adapter *sc = td_adapter(toep->td); 2538 2539 /* This only handles writes. */ 2540 if (job->uaiocb.aio_lio_opcode != LIO_WRITE) 2541 return (EOPNOTSUPP); 2542 2543 if (!sc->tt.tx_zcopy) 2544 return (EOPNOTSUPP); 2545 2546 if (tls_tx_key(toep)) 2547 return (EOPNOTSUPP); 2548 2549 SOCKBUF_LOCK(&so->so_snd); 2550 #ifdef VERBOSE_TRACES 2551 CTR3(KTR_CXGBE, "%s: queueing %p for tid %u", __func__, job, toep->tid); 2552 #endif 2553 if (!aio_set_cancel_function(job, t4_aiotx_cancel)) 2554 panic("new job was cancelled"); 2555 refcount_init(&job->aio_refs, 1); 2556 TAILQ_INSERT_TAIL(&toep->aiotx_jobq, job, list); 2557 if (sowriteable(so)) 2558 t4_aiotx_queue_toep(so, toep); 2559 SOCKBUF_UNLOCK(&so->so_snd); 2560 return (0); 2561 } 2562 2563 void 2564 aiotx_init_toep(struct toepcb *toep) 2565 { 2566 2567 TAILQ_INIT(&toep->aiotx_jobq); 2568 TASK_INIT(&toep->aiotx_task, 0, t4_aiotx_task, toep); 2569 } 2570 #endif 2571