1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2012, 2015 Chelsio Communications, Inc. 5 * All rights reserved. 6 * Written by: Navdeep Parhar <np@FreeBSD.org> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 #include <sys/cdefs.h> 31 #include "opt_inet.h" 32 #include "opt_inet6.h" 33 #include "opt_kern_tls.h" 34 #include "opt_ratelimit.h" 35 36 #ifdef TCP_OFFLOAD 37 #include <sys/param.h> 38 #include <sys/aio.h> 39 #include <sys/file.h> 40 #include <sys/kernel.h> 41 #include <sys/ktr.h> 42 #include <sys/module.h> 43 #include <sys/proc.h> 44 #include <sys/protosw.h> 45 #include <sys/domain.h> 46 #include <sys/socket.h> 47 #include <sys/socketvar.h> 48 #include <sys/sglist.h> 49 #include <sys/taskqueue.h> 50 #include <netinet/in.h> 51 #include <netinet/in_pcb.h> 52 #include <netinet/ip.h> 53 #include <netinet/ip6.h> 54 #define TCPSTATES 55 #include <netinet/tcp_fsm.h> 56 #include <netinet/tcp_seq.h> 57 #include <netinet/tcp_var.h> 58 #include <netinet/toecore.h> 59 60 #include <security/mac/mac_framework.h> 61 62 #include <vm/vm.h> 63 #include <vm/vm_extern.h> 64 #include <vm/pmap.h> 65 #include <vm/vm_map.h> 66 #include <vm/vm_page.h> 67 68 #include <dev/iscsi/iscsi_proto.h> 69 70 #include "common/common.h" 71 #include "common/t4_msg.h" 72 #include "common/t4_regs.h" 73 #include "common/t4_tcb.h" 74 #include "tom/t4_tom_l2t.h" 75 #include "tom/t4_tom.h" 76 77 static void t4_aiotx_cancel(struct kaiocb *job); 78 static void t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep); 79 80 void 81 send_flowc_wr(struct toepcb *toep, struct tcpcb *tp) 82 { 83 struct wrqe *wr; 84 struct fw_flowc_wr *flowc; 85 unsigned int nparams, flowclen, paramidx; 86 struct vi_info *vi = toep->vi; 87 struct port_info *pi = vi->pi; 88 struct adapter *sc = pi->adapter; 89 unsigned int pfvf = sc->pf << S_FW_VIID_PFN; 90 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; 91 92 KASSERT(!(toep->flags & TPF_FLOWC_WR_SENT), 93 ("%s: flowc for tid %u sent already", __func__, toep->tid)); 94 95 if (tp != NULL) 96 nparams = 8; 97 else 98 nparams = 6; 99 if (toep->params.tc_idx != -1) { 100 MPASS(toep->params.tc_idx >= 0 && 101 toep->params.tc_idx < sc->params.nsched_cls); 102 nparams++; 103 } 104 105 flowclen = sizeof(*flowc) + 
nparams * sizeof(struct fw_flowc_mnemval); 106 107 wr = alloc_wrqe(roundup2(flowclen, 16), &toep->ofld_txq->wrq); 108 if (wr == NULL) { 109 /* XXX */ 110 panic("%s: allocation failure.", __func__); 111 } 112 flowc = wrtod(wr); 113 memset(flowc, 0, wr->wr_len); 114 115 flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | 116 V_FW_FLOWC_WR_NPARAMS(nparams)); 117 flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) | 118 V_FW_WR_FLOWID(toep->tid)); 119 120 #define FLOWC_PARAM(__m, __v) \ 121 do { \ 122 flowc->mnemval[paramidx].mnemonic = FW_FLOWC_MNEM_##__m; \ 123 flowc->mnemval[paramidx].val = htobe32(__v); \ 124 paramidx++; \ 125 } while (0) 126 127 paramidx = 0; 128 129 FLOWC_PARAM(PFNVFN, pfvf); 130 FLOWC_PARAM(CH, pi->tx_chan); 131 FLOWC_PARAM(PORT, pi->tx_chan); 132 FLOWC_PARAM(IQID, toep->ofld_rxq->iq.abs_id); 133 FLOWC_PARAM(SNDBUF, toep->params.sndbuf); 134 if (tp) { 135 FLOWC_PARAM(MSS, toep->params.emss); 136 FLOWC_PARAM(SNDNXT, tp->snd_nxt); 137 FLOWC_PARAM(RCVNXT, tp->rcv_nxt); 138 } else 139 FLOWC_PARAM(MSS, 512); 140 CTR6(KTR_CXGBE, 141 "%s: tid %u, mss %u, sndbuf %u, snd_nxt 0x%x, rcv_nxt 0x%x", 142 __func__, toep->tid, toep->params.emss, toep->params.sndbuf, 143 tp ? tp->snd_nxt : 0, tp ? tp->rcv_nxt : 0); 144 145 if (toep->params.tc_idx != -1) 146 FLOWC_PARAM(SCHEDCLASS, toep->params.tc_idx); 147 #undef FLOWC_PARAM 148 149 KASSERT(paramidx == nparams, ("nparams mismatch")); 150 151 txsd->tx_credits = howmany(flowclen, 16); 152 txsd->plen = 0; 153 KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0, 154 ("%s: not enough credits (%d)", __func__, toep->tx_credits)); 155 toep->tx_credits -= txsd->tx_credits; 156 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) 157 toep->txsd_pidx = 0; 158 toep->txsd_avail--; 159 160 toep->flags |= TPF_FLOWC_WR_SENT; 161 t4_wrq_tx(sc, wr); 162 } 163 164 #ifdef RATELIMIT 165 /* 166 * Input is Bytes/second (so_max_pacing_rate), chip counts in Kilobits/second. 
167 */ 168 static int 169 update_tx_rate_limit(struct adapter *sc, struct toepcb *toep, u_int Bps) 170 { 171 int tc_idx, rc; 172 const u_int kbps = (u_int) (uint64_t)Bps * 8ULL / 1000; 173 const int port_id = toep->vi->pi->port_id; 174 175 CTR3(KTR_CXGBE, "%s: tid %u, rate %uKbps", __func__, toep->tid, kbps); 176 177 if (kbps == 0) { 178 /* unbind */ 179 tc_idx = -1; 180 } else { 181 rc = t4_reserve_cl_rl_kbps(sc, port_id, kbps, &tc_idx); 182 if (rc != 0) 183 return (rc); 184 MPASS(tc_idx >= 0 && tc_idx < sc->params.nsched_cls); 185 } 186 187 if (toep->params.tc_idx != tc_idx) { 188 struct wrqe *wr; 189 struct fw_flowc_wr *flowc; 190 int nparams = 1, flowclen, flowclen16; 191 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; 192 193 flowclen = sizeof(*flowc) + nparams * sizeof(struct 194 fw_flowc_mnemval); 195 flowclen16 = howmany(flowclen, 16); 196 if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0 || 197 (wr = alloc_wrqe(roundup2(flowclen, 16), 198 &toep->ofld_txq->wrq)) == NULL) { 199 if (tc_idx >= 0) 200 t4_release_cl_rl(sc, port_id, tc_idx); 201 return (ENOMEM); 202 } 203 204 flowc = wrtod(wr); 205 memset(flowc, 0, wr->wr_len); 206 207 flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | 208 V_FW_FLOWC_WR_NPARAMS(nparams)); 209 flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) | 210 V_FW_WR_FLOWID(toep->tid)); 211 212 flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS; 213 if (tc_idx == -1) 214 flowc->mnemval[0].val = htobe32(0xff); 215 else 216 flowc->mnemval[0].val = htobe32(tc_idx); 217 218 txsd->tx_credits = flowclen16; 219 txsd->plen = 0; 220 toep->tx_credits -= txsd->tx_credits; 221 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) 222 toep->txsd_pidx = 0; 223 toep->txsd_avail--; 224 t4_wrq_tx(sc, wr); 225 } 226 227 if (toep->params.tc_idx >= 0) 228 t4_release_cl_rl(sc, port_id, toep->params.tc_idx); 229 toep->params.tc_idx = tc_idx; 230 231 return (0); 232 } 233 #endif 234 235 void 236 send_reset(struct adapter *sc, struct toepcb *toep, uint32_t snd_nxt) 237 { 238 struct wrqe *wr; 239 struct cpl_abort_req *req; 240 int tid = toep->tid; 241 struct inpcb *inp = toep->inp; 242 struct tcpcb *tp = intotcpcb(inp); /* don't use if INP_DROPPED */ 243 244 INP_WLOCK_ASSERT(inp); 245 246 CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x%s", 247 __func__, toep->tid, 248 inp->inp_flags & INP_DROPPED ? "inp dropped" : 249 tcpstates[tp->t_state], 250 toep->flags, inp->inp_flags, 251 toep->flags & TPF_ABORT_SHUTDOWN ? 252 " (abort already in progress)" : ""); 253 254 if (toep->flags & TPF_ABORT_SHUTDOWN) 255 return; /* abort already in progress */ 256 257 toep->flags |= TPF_ABORT_SHUTDOWN; 258 259 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 260 ("%s: flowc_wr not sent for tid %d.", __func__, tid)); 261 262 wr = alloc_wrqe(sizeof(*req), &toep->ofld_txq->wrq); 263 if (wr == NULL) { 264 /* XXX */ 265 panic("%s: allocation failure.", __func__); 266 } 267 req = wrtod(wr); 268 269 INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid); 270 if (inp->inp_flags & INP_DROPPED) 271 req->rsvd0 = htobe32(snd_nxt); 272 else 273 req->rsvd0 = htobe32(tp->snd_nxt); 274 req->rsvd1 = !(toep->flags & TPF_TX_DATA_SENT); 275 req->cmd = CPL_ABORT_SEND_RST; 276 277 /* 278 * XXX: What's the correct way to tell that the inp hasn't been detached 279 * from its socket? Should I even be flushing the snd buffer here? 280 */ 281 if ((inp->inp_flags & INP_DROPPED) == 0) { 282 struct socket *so = inp->inp_socket; 283 284 if (so != NULL) /* because I'm not sure. 
See comment above */ 285 sbflush(&so->so_snd); 286 } 287 288 t4_l2t_send(sc, wr, toep->l2te); 289 } 290 291 /* 292 * Called when a connection is established to translate the TCP options 293 * reported by HW to FreeBSD's native format. 294 */ 295 static void 296 assign_rxopt(struct tcpcb *tp, uint16_t opt) 297 { 298 struct toepcb *toep = tp->t_toe; 299 struct inpcb *inp = tptoinpcb(tp); 300 struct adapter *sc = td_adapter(toep->td); 301 302 INP_LOCK_ASSERT(inp); 303 304 toep->params.mtu_idx = G_TCPOPT_MSS(opt); 305 tp->t_maxseg = sc->params.mtus[toep->params.mtu_idx]; 306 if (inp->inp_inc.inc_flags & INC_ISIPV6) 307 tp->t_maxseg -= sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 308 else 309 tp->t_maxseg -= sizeof(struct ip) + sizeof(struct tcphdr); 310 311 toep->params.emss = tp->t_maxseg; 312 if (G_TCPOPT_TSTAMP(opt)) { 313 toep->params.tstamp = 1; 314 toep->params.emss -= TCPOLEN_TSTAMP_APPA; 315 tp->t_flags |= TF_RCVD_TSTMP; /* timestamps ok */ 316 tp->ts_recent = 0; /* hmmm */ 317 tp->ts_recent_age = tcp_ts_getticks(); 318 } else 319 toep->params.tstamp = 0; 320 321 if (G_TCPOPT_SACK(opt)) { 322 toep->params.sack = 1; 323 tp->t_flags |= TF_SACK_PERMIT; /* should already be set */ 324 } else { 325 toep->params.sack = 0; 326 tp->t_flags &= ~TF_SACK_PERMIT; /* sack disallowed by peer */ 327 } 328 329 if (G_TCPOPT_WSCALE_OK(opt)) 330 tp->t_flags |= TF_RCVD_SCALE; 331 332 /* Doing window scaling? */ 333 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 334 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 335 tp->rcv_scale = tp->request_r_scale; 336 tp->snd_scale = G_TCPOPT_SND_WSCALE(opt); 337 } else 338 toep->params.wscale = 0; 339 340 CTR6(KTR_CXGBE, 341 "assign_rxopt: tid %d, mtu_idx %u, emss %u, ts %u, sack %u, wscale %u", 342 toep->tid, toep->params.mtu_idx, toep->params.emss, 343 toep->params.tstamp, toep->params.sack, toep->params.wscale); 344 } 345 346 /* 347 * Completes some final bits of initialization for just established connections 348 * and changes their state to TCPS_ESTABLISHED. 349 * 350 * The ISNs are from the exchange of SYNs. 
351 */ 352 void 353 make_established(struct toepcb *toep, uint32_t iss, uint32_t irs, uint16_t opt) 354 { 355 struct inpcb *inp = toep->inp; 356 struct socket *so = inp->inp_socket; 357 struct tcpcb *tp = intotcpcb(inp); 358 uint16_t tcpopt = be16toh(opt); 359 360 INP_WLOCK_ASSERT(inp); 361 KASSERT(tp->t_state == TCPS_SYN_SENT || 362 tp->t_state == TCPS_SYN_RECEIVED, 363 ("%s: TCP state %s", __func__, tcpstates[tp->t_state])); 364 365 CTR6(KTR_CXGBE, "%s: tid %d, so %p, inp %p, tp %p, toep %p", 366 __func__, toep->tid, so, inp, tp, toep); 367 368 tcp_state_change(tp, TCPS_ESTABLISHED); 369 tp->t_starttime = ticks; 370 TCPSTAT_INC(tcps_connects); 371 372 tp->irs = irs; 373 tcp_rcvseqinit(tp); 374 tp->rcv_wnd = (u_int)toep->params.opt0_bufsize << 10; 375 tp->rcv_adv += tp->rcv_wnd; 376 tp->last_ack_sent = tp->rcv_nxt; 377 378 tp->iss = iss; 379 tcp_sendseqinit(tp); 380 tp->snd_una = iss + 1; 381 tp->snd_nxt = iss + 1; 382 tp->snd_max = iss + 1; 383 384 assign_rxopt(tp, tcpopt); 385 send_flowc_wr(toep, tp); 386 387 soisconnected(so); 388 } 389 390 int 391 send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits) 392 { 393 struct wrqe *wr; 394 struct cpl_rx_data_ack *req; 395 uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); 396 397 KASSERT(credits >= 0, ("%s: %d credits", __func__, credits)); 398 399 wr = alloc_wrqe(sizeof(*req), toep->ctrlq); 400 if (wr == NULL) 401 return (0); 402 req = wrtod(wr); 403 404 INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid); 405 req->credit_dack = htobe32(dack | V_RX_CREDITS(credits)); 406 407 t4_wrq_tx(sc, wr); 408 return (credits); 409 } 410 411 void 412 t4_rcvd_locked(struct toedev *tod, struct tcpcb *tp) 413 { 414 struct adapter *sc = tod->tod_softc; 415 struct inpcb *inp = tptoinpcb(tp); 416 struct socket *so = inp->inp_socket; 417 struct sockbuf *sb = &so->so_rcv; 418 struct toepcb *toep = tp->t_toe; 419 int rx_credits; 420 421 INP_WLOCK_ASSERT(inp); 422 SOCKBUF_LOCK_ASSERT(sb); 423 424 rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0; 425 if (rx_credits > 0 && 426 (tp->rcv_wnd <= 32 * 1024 || rx_credits >= 64 * 1024 || 427 (rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) || 428 sbused(sb) + tp->rcv_wnd < sb->sb_lowat)) { 429 rx_credits = send_rx_credits(sc, toep, rx_credits); 430 tp->rcv_wnd += rx_credits; 431 tp->rcv_adv += rx_credits; 432 } 433 } 434 435 void 436 t4_rcvd(struct toedev *tod, struct tcpcb *tp) 437 { 438 struct inpcb *inp = tptoinpcb(tp); 439 struct socket *so = inp->inp_socket; 440 struct sockbuf *sb = &so->so_rcv; 441 442 SOCKBUF_LOCK(sb); 443 t4_rcvd_locked(tod, tp); 444 SOCKBUF_UNLOCK(sb); 445 } 446 447 /* 448 * Close a connection by sending a CPL_CLOSE_CON_REQ message. 449 */ 450 int 451 t4_close_conn(struct adapter *sc, struct toepcb *toep) 452 { 453 struct wrqe *wr; 454 struct cpl_close_con_req *req; 455 unsigned int tid = toep->tid; 456 457 CTR3(KTR_CXGBE, "%s: tid %u%s", __func__, toep->tid, 458 toep->flags & TPF_FIN_SENT ? 
", IGNORED" : ""); 459 460 if (toep->flags & TPF_FIN_SENT) 461 return (0); 462 463 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 464 ("%s: flowc_wr not sent for tid %u.", __func__, tid)); 465 466 wr = alloc_wrqe(sizeof(*req), &toep->ofld_txq->wrq); 467 if (wr == NULL) { 468 /* XXX */ 469 panic("%s: allocation failure.", __func__); 470 } 471 req = wrtod(wr); 472 473 req->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) | 474 V_FW_WR_IMMDLEN(sizeof(*req) - sizeof(req->wr))); 475 req->wr.wr_mid = htonl(V_FW_WR_LEN16(howmany(sizeof(*req), 16)) | 476 V_FW_WR_FLOWID(tid)); 477 req->wr.wr_lo = cpu_to_be64(0); 478 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid)); 479 req->rsvd = 0; 480 481 toep->flags |= TPF_FIN_SENT; 482 toep->flags &= ~TPF_SEND_FIN; 483 t4_l2t_send(sc, wr, toep->l2te); 484 485 return (0); 486 } 487 488 #define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16) 489 #define MIN_OFLD_TX_CREDITS (howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16)) 490 #define MIN_ISO_TX_CREDITS (howmany(sizeof(struct cpl_tx_data_iso), 16)) 491 #define MIN_TX_CREDITS(iso) \ 492 (MIN_OFLD_TX_CREDITS + ((iso) ? MIN_ISO_TX_CREDITS : 0)) 493 494 /* Maximum amount of immediate data we could stuff in a WR */ 495 static inline int 496 max_imm_payload(int tx_credits, int iso) 497 { 498 const int iso_cpl_size = iso ? sizeof(struct cpl_tx_data_iso) : 0; 499 const int n = 1; /* Use no more than one desc for imm. data WR */ 500 501 KASSERT(tx_credits >= 0 && 502 tx_credits <= MAX_OFLD_TX_CREDITS, 503 ("%s: %d credits", __func__, tx_credits)); 504 505 if (tx_credits < MIN_TX_CREDITS(iso)) 506 return (0); 507 508 if (tx_credits >= (n * EQ_ESIZE) / 16) 509 return ((n * EQ_ESIZE) - sizeof(struct fw_ofld_tx_data_wr) - 510 iso_cpl_size); 511 else 512 return (tx_credits * 16 - sizeof(struct fw_ofld_tx_data_wr) - 513 iso_cpl_size); 514 } 515 516 /* Maximum number of SGL entries we could stuff in a WR */ 517 static inline int 518 max_dsgl_nsegs(int tx_credits, int iso) 519 { 520 int nseg = 1; /* ulptx_sgl has room for 1, rest ulp_tx_sge_pair */ 521 int sge_pair_credits = tx_credits - MIN_TX_CREDITS(iso); 522 523 KASSERT(tx_credits >= 0 && 524 tx_credits <= MAX_OFLD_TX_CREDITS, 525 ("%s: %d credits", __func__, tx_credits)); 526 527 if (tx_credits < MIN_TX_CREDITS(iso)) 528 return (0); 529 530 nseg += 2 * (sge_pair_credits * 16 / 24); 531 if ((sge_pair_credits * 16) % 24 == 16) 532 nseg++; 533 534 return (nseg); 535 } 536 537 static inline void 538 write_tx_wr(void *dst, struct toepcb *toep, int fw_wr_opcode, 539 unsigned int immdlen, unsigned int plen, uint8_t credits, int shove, 540 int ulp_submode) 541 { 542 struct fw_ofld_tx_data_wr *txwr = dst; 543 544 txwr->op_to_immdlen = htobe32(V_WR_OP(fw_wr_opcode) | 545 V_FW_WR_IMMDLEN(immdlen)); 546 txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) | 547 V_FW_WR_LEN16(credits)); 548 txwr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(ulp_mode(toep)) | 549 V_TX_ULP_SUBMODE(ulp_submode) | V_TX_URG(0) | V_TX_SHOVE(shove)); 550 txwr->plen = htobe32(plen); 551 552 if (toep->params.tx_align > 0) { 553 if (plen < 2 * toep->params.emss) 554 txwr->lsodisable_to_flags |= 555 htobe32(F_FW_OFLD_TX_DATA_WR_LSODISABLE); 556 else 557 txwr->lsodisable_to_flags |= 558 htobe32(F_FW_OFLD_TX_DATA_WR_ALIGNPLD | 559 (toep->params.nagle == 0 ? 0 : 560 F_FW_OFLD_TX_DATA_WR_ALIGNPLDSHOVE)); 561 } 562 } 563 564 /* 565 * Generate a DSGL from a starting mbuf. The total number of segments and the 566 * maximum segments in any one mbuf are provided. 
567 */ 568 static void 569 write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n) 570 { 571 struct mbuf *m; 572 struct ulptx_sgl *usgl = dst; 573 int i, j, rc; 574 struct sglist sg; 575 struct sglist_seg segs[n]; 576 577 KASSERT(nsegs > 0, ("%s: nsegs 0", __func__)); 578 579 sglist_init(&sg, n, segs); 580 usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) | 581 V_ULPTX_NSGE(nsegs)); 582 583 i = -1; 584 for (m = start; m != stop; m = m->m_next) { 585 if (m->m_flags & M_EXTPG) 586 rc = sglist_append_mbuf_epg(&sg, m, 587 mtod(m, vm_offset_t), m->m_len); 588 else 589 rc = sglist_append(&sg, mtod(m, void *), m->m_len); 590 if (__predict_false(rc != 0)) 591 panic("%s: sglist_append %d", __func__, rc); 592 593 for (j = 0; j < sg.sg_nseg; i++, j++) { 594 if (i < 0) { 595 usgl->len0 = htobe32(segs[j].ss_len); 596 usgl->addr0 = htobe64(segs[j].ss_paddr); 597 } else { 598 usgl->sge[i / 2].len[i & 1] = 599 htobe32(segs[j].ss_len); 600 usgl->sge[i / 2].addr[i & 1] = 601 htobe64(segs[j].ss_paddr); 602 } 603 #ifdef INVARIANTS 604 nsegs--; 605 #endif 606 } 607 sglist_reset(&sg); 608 } 609 if (i & 1) 610 usgl->sge[i / 2].len[1] = htobe32(0); 611 KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, stop %p", 612 __func__, nsegs, start, stop)); 613 } 614 615 /* 616 * Max number of SGL entries an offload tx work request can have. This is 41 617 * (1 + 40) for a full 512B work request. 618 * fw_ofld_tx_data_wr(16B) + ulptx_sgl(16B, 1) + ulptx_sge_pair(480B, 40) 619 */ 620 #define OFLD_SGL_LEN (41) 621 622 /* 623 * Send data and/or a FIN to the peer. 624 * 625 * The socket's so_snd buffer consists of a stream of data starting with sb_mb 626 * and linked together with m_next. sb_sndptr, if set, is the last mbuf that 627 * was transmitted. 628 * 629 * drop indicates the number of bytes that should be dropped from the head of 630 * the send buffer. It is an optimization that lets do_fw4_ack avoid creating 631 * contention on the send buffer lock (before this change it used to do 632 * sowwakeup and then t4_push_frames right after that when recovering from tx 633 * stalls). When drop is set this function MUST drop the bytes and wake up any 634 * writers. 
635 */ 636 void 637 t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop) 638 { 639 struct mbuf *sndptr, *m, *sb_sndptr; 640 struct fw_ofld_tx_data_wr *txwr; 641 struct wrqe *wr; 642 u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf; 643 struct inpcb *inp = toep->inp; 644 struct tcpcb *tp = intotcpcb(inp); 645 struct socket *so = inp->inp_socket; 646 struct sockbuf *sb = &so->so_snd; 647 int tx_credits, shove, compl, sowwakeup; 648 struct ofld_tx_sdesc *txsd; 649 bool nomap_mbuf_seen; 650 651 INP_WLOCK_ASSERT(inp); 652 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 653 ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid)); 654 655 KASSERT(ulp_mode(toep) == ULP_MODE_NONE || 656 ulp_mode(toep) == ULP_MODE_TCPDDP || 657 ulp_mode(toep) == ULP_MODE_TLS || 658 ulp_mode(toep) == ULP_MODE_RDMA, 659 ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep)); 660 661 #ifdef VERBOSE_TRACES 662 CTR5(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d", 663 __func__, toep->tid, toep->flags, tp->t_flags, drop); 664 #endif 665 if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) 666 return; 667 668 #ifdef RATELIMIT 669 if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) && 670 (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) { 671 inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; 672 } 673 #endif 674 675 /* 676 * This function doesn't resume by itself. Someone else must clear the 677 * flag and call this function. 678 */ 679 if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) { 680 KASSERT(drop == 0, 681 ("%s: drop (%d) != 0 but tx is suspended", __func__, drop)); 682 return; 683 } 684 685 txsd = &toep->txsd[toep->txsd_pidx]; 686 do { 687 tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS); 688 max_imm = max_imm_payload(tx_credits, 0); 689 max_nsegs = max_dsgl_nsegs(tx_credits, 0); 690 691 SOCKBUF_LOCK(sb); 692 sowwakeup = drop; 693 if (drop) { 694 sbdrop_locked(sb, drop); 695 drop = 0; 696 } 697 sb_sndptr = sb->sb_sndptr; 698 sndptr = sb_sndptr ? 
sb_sndptr->m_next : sb->sb_mb; 699 plen = 0; 700 nsegs = 0; 701 max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */ 702 nomap_mbuf_seen = false; 703 for (m = sndptr; m != NULL; m = m->m_next) { 704 int n; 705 706 if ((m->m_flags & M_NOTAVAIL) != 0) 707 break; 708 if (m->m_flags & M_EXTPG) { 709 #ifdef KERN_TLS 710 if (m->m_epg_tls != NULL) { 711 toep->flags |= TPF_KTLS; 712 if (plen == 0) { 713 SOCKBUF_UNLOCK(sb); 714 t4_push_ktls(sc, toep, 0); 715 return; 716 } 717 break; 718 } 719 #endif 720 n = sglist_count_mbuf_epg(m, 721 mtod(m, vm_offset_t), m->m_len); 722 } else 723 n = sglist_count(mtod(m, void *), m->m_len); 724 725 nsegs += n; 726 plen += m->m_len; 727 728 /* This mbuf sent us _over_ the nsegs limit, back out */ 729 if (plen > max_imm && nsegs > max_nsegs) { 730 nsegs -= n; 731 plen -= m->m_len; 732 if (plen == 0) { 733 /* Too few credits */ 734 toep->flags |= TPF_TX_SUSPENDED; 735 if (sowwakeup) { 736 if (!TAILQ_EMPTY( 737 &toep->aiotx_jobq)) 738 t4_aiotx_queue_toep(so, 739 toep); 740 sowwakeup_locked(so); 741 } else 742 SOCKBUF_UNLOCK(sb); 743 SOCKBUF_UNLOCK_ASSERT(sb); 744 return; 745 } 746 break; 747 } 748 749 if (m->m_flags & M_EXTPG) 750 nomap_mbuf_seen = true; 751 if (max_nsegs_1mbuf < n) 752 max_nsegs_1mbuf = n; 753 sb_sndptr = m; /* new sb->sb_sndptr if all goes well */ 754 755 /* This mbuf put us right at the max_nsegs limit */ 756 if (plen > max_imm && nsegs == max_nsegs) { 757 m = m->m_next; 758 break; 759 } 760 } 761 762 if (sbused(sb) > sb->sb_hiwat * 5 / 8 && 763 toep->plen_nocompl + plen >= sb->sb_hiwat / 4) 764 compl = 1; 765 else 766 compl = 0; 767 768 if (sb->sb_flags & SB_AUTOSIZE && 769 V_tcp_do_autosndbuf && 770 sb->sb_hiwat < V_tcp_autosndbuf_max && 771 sbused(sb) >= sb->sb_hiwat * 7 / 8) { 772 int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc, 773 V_tcp_autosndbuf_max); 774 775 if (!sbreserve_locked(so, SO_SND, newsize, NULL)) 776 sb->sb_flags &= ~SB_AUTOSIZE; 777 else 778 sowwakeup = 1; /* room available */ 779 } 780 if (sowwakeup) { 781 if (!TAILQ_EMPTY(&toep->aiotx_jobq)) 782 t4_aiotx_queue_toep(so, toep); 783 sowwakeup_locked(so); 784 } else 785 SOCKBUF_UNLOCK(sb); 786 SOCKBUF_UNLOCK_ASSERT(sb); 787 788 /* nothing to send */ 789 if (plen == 0) { 790 KASSERT(m == NULL || (m->m_flags & M_NOTAVAIL) != 0, 791 ("%s: nothing to send, but m != NULL is ready", 792 __func__)); 793 break; 794 } 795 796 if (__predict_false(toep->flags & TPF_FIN_SENT)) 797 panic("%s: excess tx.", __func__); 798 799 shove = m == NULL && !(tp->t_flags & TF_MORETOCOME); 800 if (plen <= max_imm && !nomap_mbuf_seen) { 801 802 /* Immediate data tx */ 803 804 wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16), 805 &toep->ofld_txq->wrq); 806 if (wr == NULL) { 807 /* XXX: how will we recover from this? */ 808 toep->flags |= TPF_TX_SUSPENDED; 809 return; 810 } 811 txwr = wrtod(wr); 812 credits = howmany(wr->wr_len, 16); 813 write_tx_wr(txwr, toep, FW_OFLD_TX_DATA_WR, plen, plen, 814 credits, shove, 0); 815 m_copydata(sndptr, 0, plen, (void *)(txwr + 1)); 816 nsegs = 0; 817 } else { 818 int wr_len; 819 820 /* DSGL tx */ 821 822 wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) + 823 ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8; 824 wr = alloc_wrqe(roundup2(wr_len, 16), 825 &toep->ofld_txq->wrq); 826 if (wr == NULL) { 827 /* XXX: how will we recover from this? 
*/ 828 toep->flags |= TPF_TX_SUSPENDED; 829 return; 830 } 831 txwr = wrtod(wr); 832 credits = howmany(wr_len, 16); 833 write_tx_wr(txwr, toep, FW_OFLD_TX_DATA_WR, 0, plen, 834 credits, shove, 0); 835 write_tx_sgl(txwr + 1, sndptr, m, nsegs, 836 max_nsegs_1mbuf); 837 if (wr_len & 0xf) { 838 uint64_t *pad = (uint64_t *) 839 ((uintptr_t)txwr + wr_len); 840 *pad = 0; 841 } 842 } 843 844 KASSERT(toep->tx_credits >= credits, 845 ("%s: not enough credits", __func__)); 846 847 toep->tx_credits -= credits; 848 toep->tx_nocompl += credits; 849 toep->plen_nocompl += plen; 850 if (toep->tx_credits <= toep->tx_total * 3 / 8 && 851 toep->tx_nocompl >= toep->tx_total / 4) 852 compl = 1; 853 854 if (compl || ulp_mode(toep) == ULP_MODE_RDMA) { 855 txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL); 856 toep->tx_nocompl = 0; 857 toep->plen_nocompl = 0; 858 } 859 860 tp->snd_nxt += plen; 861 tp->snd_max += plen; 862 863 SOCKBUF_LOCK(sb); 864 KASSERT(sb_sndptr, ("%s: sb_sndptr is NULL", __func__)); 865 sb->sb_sndptr = sb_sndptr; 866 SOCKBUF_UNLOCK(sb); 867 868 toep->flags |= TPF_TX_DATA_SENT; 869 if (toep->tx_credits < MIN_OFLD_TX_CREDITS) 870 toep->flags |= TPF_TX_SUSPENDED; 871 872 KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); 873 txsd->plen = plen; 874 txsd->tx_credits = credits; 875 txsd++; 876 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) { 877 toep->txsd_pidx = 0; 878 txsd = &toep->txsd[0]; 879 } 880 toep->txsd_avail--; 881 882 t4_l2t_send(sc, wr, toep->l2te); 883 } while (m != NULL && (m->m_flags & M_NOTAVAIL) == 0); 884 885 /* Send a FIN if requested, but only if there's no more data to send */ 886 if (m == NULL && toep->flags & TPF_SEND_FIN) 887 t4_close_conn(sc, toep); 888 } 889 890 static inline void 891 rqdrop_locked(struct mbufq *q, int plen) 892 { 893 struct mbuf *m; 894 895 while (plen > 0) { 896 m = mbufq_dequeue(q); 897 898 /* Too many credits. */ 899 MPASS(m != NULL); 900 M_ASSERTPKTHDR(m); 901 902 /* Partial credits. */ 903 MPASS(plen >= m->m_pkthdr.len); 904 905 plen -= m->m_pkthdr.len; 906 m_freem(m); 907 } 908 } 909 910 /* 911 * Not a bit in the TCB, but is a bit in the ulp_submode field of the 912 * CPL_TX_DATA flags field in FW_ISCSI_TX_DATA_WR. 913 */ 914 #define ULP_ISO G_TX_ULP_SUBMODE(F_FW_ISCSI_TX_DATA_WR_ULPSUBMODE_ISO) 915 916 static void 917 write_tx_data_iso(void *dst, u_int ulp_submode, uint8_t flags, uint16_t mss, 918 int len, int npdu) 919 { 920 struct cpl_tx_data_iso *cpl; 921 unsigned int burst_size; 922 unsigned int last; 923 924 /* 925 * The firmware will set the 'F' bit on the last PDU when 926 * either condition is true: 927 * 928 * - this large PDU is marked as the "last" slice 929 * 930 * - the amount of data payload bytes equals the burst_size 931 * 932 * The strategy used here is to always set the burst_size 933 * artificially high (len includes the size of the template 934 * BHS) and only set the "last" flag if the original PDU had 935 * 'F' set. 
936 */ 937 burst_size = len; 938 last = !!(flags & CXGBE_ISO_F); 939 940 cpl = (struct cpl_tx_data_iso *)dst; 941 cpl->op_to_scsi = htonl(V_CPL_TX_DATA_ISO_OP(CPL_TX_DATA_ISO) | 942 V_CPL_TX_DATA_ISO_FIRST(1) | V_CPL_TX_DATA_ISO_LAST(last) | 943 V_CPL_TX_DATA_ISO_CPLHDRLEN(0) | 944 V_CPL_TX_DATA_ISO_HDRCRC(!!(ulp_submode & ULP_CRC_HEADER)) | 945 V_CPL_TX_DATA_ISO_PLDCRC(!!(ulp_submode & ULP_CRC_DATA)) | 946 V_CPL_TX_DATA_ISO_IMMEDIATE(0) | 947 V_CPL_TX_DATA_ISO_SCSI(CXGBE_ISO_TYPE(flags))); 948 949 cpl->ahs_len = 0; 950 cpl->mpdu = htons(DIV_ROUND_UP(mss, 4)); 951 cpl->burst_size = htonl(DIV_ROUND_UP(burst_size, 4)); 952 cpl->len = htonl(len); 953 cpl->reserved2_seglen_offset = htonl(0); 954 cpl->datasn_offset = htonl(0); 955 cpl->buffer_offset = htonl(0); 956 cpl->reserved3 = 0; 957 } 958 959 static struct wrqe * 960 write_iscsi_mbuf_wr(struct toepcb *toep, struct mbuf *sndptr) 961 { 962 struct mbuf *m; 963 struct fw_ofld_tx_data_wr *txwr; 964 struct cpl_tx_data_iso *cpl_iso; 965 void *p; 966 struct wrqe *wr; 967 u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf; 968 u_int adjusted_plen, imm_data, ulp_submode; 969 struct inpcb *inp = toep->inp; 970 struct tcpcb *tp = intotcpcb(inp); 971 int tx_credits, shove, npdu, wr_len; 972 uint16_t iso_mss; 973 static const u_int ulp_extra_len[] = {0, 4, 4, 8}; 974 bool iso, nomap_mbuf_seen; 975 976 M_ASSERTPKTHDR(sndptr); 977 978 tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS); 979 if (mbuf_raw_wr(sndptr)) { 980 plen = sndptr->m_pkthdr.len; 981 KASSERT(plen <= SGE_MAX_WR_LEN, 982 ("raw WR len %u is greater than max WR len", plen)); 983 if (plen > tx_credits * 16) 984 return (NULL); 985 986 wr = alloc_wrqe(roundup2(plen, 16), &toep->ofld_txq->wrq); 987 if (__predict_false(wr == NULL)) 988 return (NULL); 989 990 m_copydata(sndptr, 0, plen, wrtod(wr)); 991 return (wr); 992 } 993 994 iso = mbuf_iscsi_iso(sndptr); 995 max_imm = max_imm_payload(tx_credits, iso); 996 max_nsegs = max_dsgl_nsegs(tx_credits, iso); 997 iso_mss = mbuf_iscsi_iso_mss(sndptr); 998 999 plen = 0; 1000 nsegs = 0; 1001 max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */ 1002 nomap_mbuf_seen = false; 1003 for (m = sndptr; m != NULL; m = m->m_next) { 1004 int n; 1005 1006 if (m->m_flags & M_EXTPG) 1007 n = sglist_count_mbuf_epg(m, mtod(m, vm_offset_t), 1008 m->m_len); 1009 else 1010 n = sglist_count(mtod(m, void *), m->m_len); 1011 1012 nsegs += n; 1013 plen += m->m_len; 1014 1015 /* 1016 * This mbuf would send us _over_ the nsegs limit. 1017 * Suspend tx because the PDU can't be sent out. 1018 */ 1019 if ((nomap_mbuf_seen || plen > max_imm) && nsegs > max_nsegs) 1020 return (NULL); 1021 1022 if (m->m_flags & M_EXTPG) 1023 nomap_mbuf_seen = true; 1024 if (max_nsegs_1mbuf < n) 1025 max_nsegs_1mbuf = n; 1026 } 1027 1028 if (__predict_false(toep->flags & TPF_FIN_SENT)) 1029 panic("%s: excess tx.", __func__); 1030 1031 /* 1032 * We have a PDU to send. All of it goes out in one WR so 'm' 1033 * is NULL. A PDU's length is always a multiple of 4. 1034 */ 1035 MPASS(m == NULL); 1036 MPASS((plen & 3) == 0); 1037 MPASS(sndptr->m_pkthdr.len == plen); 1038 1039 shove = !(tp->t_flags & TF_MORETOCOME); 1040 1041 /* 1042 * plen doesn't include header and data digests, which are 1043 * generated and inserted in the right places by the TOE, but 1044 * they do occupy TCP sequence space and need to be accounted 1045 * for. 1046 */ 1047 ulp_submode = mbuf_ulp_submode(sndptr); 1048 MPASS(ulp_submode < nitems(ulp_extra_len)); 1049 npdu = iso ? 
howmany(plen - ISCSI_BHS_SIZE, iso_mss) : 1; 1050 adjusted_plen = plen + ulp_extra_len[ulp_submode] * npdu; 1051 if (iso) 1052 adjusted_plen += ISCSI_BHS_SIZE * (npdu - 1); 1053 wr_len = sizeof(*txwr); 1054 if (iso) 1055 wr_len += sizeof(struct cpl_tx_data_iso); 1056 if (plen <= max_imm && !nomap_mbuf_seen) { 1057 /* Immediate data tx */ 1058 imm_data = plen; 1059 wr_len += plen; 1060 nsegs = 0; 1061 } else { 1062 /* DSGL tx */ 1063 imm_data = 0; 1064 wr_len += sizeof(struct ulptx_sgl) + 1065 ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8; 1066 } 1067 1068 wr = alloc_wrqe(roundup2(wr_len, 16), &toep->ofld_txq->wrq); 1069 if (wr == NULL) { 1070 /* XXX: how will we recover from this? */ 1071 return (NULL); 1072 } 1073 txwr = wrtod(wr); 1074 credits = howmany(wr->wr_len, 16); 1075 1076 if (iso) { 1077 write_tx_wr(txwr, toep, FW_ISCSI_TX_DATA_WR, 1078 imm_data + sizeof(struct cpl_tx_data_iso), 1079 adjusted_plen, credits, shove, ulp_submode | ULP_ISO); 1080 cpl_iso = (struct cpl_tx_data_iso *)(txwr + 1); 1081 MPASS(plen == sndptr->m_pkthdr.len); 1082 write_tx_data_iso(cpl_iso, ulp_submode, 1083 mbuf_iscsi_iso_flags(sndptr), iso_mss, plen, npdu); 1084 p = cpl_iso + 1; 1085 } else { 1086 write_tx_wr(txwr, toep, FW_OFLD_TX_DATA_WR, imm_data, 1087 adjusted_plen, credits, shove, ulp_submode); 1088 p = txwr + 1; 1089 } 1090 1091 if (imm_data != 0) { 1092 m_copydata(sndptr, 0, plen, p); 1093 } else { 1094 write_tx_sgl(p, sndptr, m, nsegs, max_nsegs_1mbuf); 1095 if (wr_len & 0xf) { 1096 uint64_t *pad = (uint64_t *)((uintptr_t)txwr + wr_len); 1097 *pad = 0; 1098 } 1099 } 1100 1101 KASSERT(toep->tx_credits >= credits, 1102 ("%s: not enough credits: credits %u " 1103 "toep->tx_credits %u tx_credits %u nsegs %u " 1104 "max_nsegs %u iso %d", __func__, credits, 1105 toep->tx_credits, tx_credits, nsegs, max_nsegs, iso)); 1106 1107 tp->snd_nxt += adjusted_plen; 1108 tp->snd_max += adjusted_plen; 1109 1110 counter_u64_add(toep->ofld_txq->tx_iscsi_pdus, npdu); 1111 counter_u64_add(toep->ofld_txq->tx_iscsi_octets, plen); 1112 if (iso) 1113 counter_u64_add(toep->ofld_txq->tx_iscsi_iso_wrs, 1); 1114 1115 return (wr); 1116 } 1117 1118 void 1119 t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop) 1120 { 1121 struct mbuf *sndptr, *m; 1122 struct fw_wr_hdr *wrhdr; 1123 struct wrqe *wr; 1124 u_int plen, credits; 1125 struct inpcb *inp = toep->inp; 1126 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; 1127 struct mbufq *pduq = &toep->ulp_pduq; 1128 1129 INP_WLOCK_ASSERT(inp); 1130 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 1131 ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid)); 1132 KASSERT(ulp_mode(toep) == ULP_MODE_ISCSI, 1133 ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep)); 1134 1135 if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) 1136 return; 1137 1138 /* 1139 * This function doesn't resume by itself. Someone else must clear the 1140 * flag and call this function. 1141 */ 1142 if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) { 1143 KASSERT(drop == 0, 1144 ("%s: drop (%d) != 0 but tx is suspended", __func__, drop)); 1145 return; 1146 } 1147 1148 if (drop) { 1149 struct socket *so = inp->inp_socket; 1150 struct sockbuf *sb = &so->so_snd; 1151 int sbu; 1152 1153 /* 1154 * An unlocked read is ok here as the data should only 1155 * transition from a non-zero value to either another 1156 * non-zero value or zero. Once it is zero it should 1157 * stay zero. 
1158 */ 1159 if (__predict_false(sbused(sb)) > 0) { 1160 SOCKBUF_LOCK(sb); 1161 sbu = sbused(sb); 1162 if (sbu > 0) { 1163 /* 1164 * The data transmitted before the 1165 * tid's ULP mode changed to ISCSI is 1166 * still in so_snd. Incoming credits 1167 * should account for so_snd first. 1168 */ 1169 sbdrop_locked(sb, min(sbu, drop)); 1170 drop -= min(sbu, drop); 1171 } 1172 sowwakeup_locked(so); /* unlocks so_snd */ 1173 } 1174 rqdrop_locked(&toep->ulp_pdu_reclaimq, drop); 1175 } 1176 1177 while ((sndptr = mbufq_first(pduq)) != NULL) { 1178 wr = write_iscsi_mbuf_wr(toep, sndptr); 1179 if (wr == NULL) { 1180 toep->flags |= TPF_TX_SUSPENDED; 1181 return; 1182 } 1183 1184 plen = sndptr->m_pkthdr.len; 1185 credits = howmany(wr->wr_len, 16); 1186 KASSERT(toep->tx_credits >= credits, 1187 ("%s: not enough credits", __func__)); 1188 1189 m = mbufq_dequeue(pduq); 1190 MPASS(m == sndptr); 1191 mbufq_enqueue(&toep->ulp_pdu_reclaimq, m); 1192 1193 toep->tx_credits -= credits; 1194 toep->tx_nocompl += credits; 1195 toep->plen_nocompl += plen; 1196 1197 /* 1198 * Ensure there are enough credits for a full-sized WR 1199 * as page pod WRs can be full-sized. 1200 */ 1201 if (toep->tx_credits <= SGE_MAX_WR_LEN * 5 / 4 && 1202 toep->tx_nocompl >= toep->tx_total / 4) { 1203 wrhdr = wrtod(wr); 1204 wrhdr->hi |= htobe32(F_FW_WR_COMPL); 1205 toep->tx_nocompl = 0; 1206 toep->plen_nocompl = 0; 1207 } 1208 1209 toep->flags |= TPF_TX_DATA_SENT; 1210 if (toep->tx_credits < MIN_OFLD_TX_CREDITS) 1211 toep->flags |= TPF_TX_SUSPENDED; 1212 1213 KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); 1214 txsd->plen = plen; 1215 txsd->tx_credits = credits; 1216 txsd++; 1217 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) { 1218 toep->txsd_pidx = 0; 1219 txsd = &toep->txsd[0]; 1220 } 1221 toep->txsd_avail--; 1222 1223 t4_l2t_send(sc, wr, toep->l2te); 1224 } 1225 1226 /* Send a FIN if requested, but only if there are no more PDUs to send */ 1227 if (mbufq_first(pduq) == NULL && toep->flags & TPF_SEND_FIN) 1228 t4_close_conn(sc, toep); 1229 } 1230 1231 static inline void 1232 t4_push_data(struct adapter *sc, struct toepcb *toep, int drop) 1233 { 1234 1235 if (ulp_mode(toep) == ULP_MODE_ISCSI) 1236 t4_push_pdus(sc, toep, drop); 1237 else if (toep->flags & TPF_KTLS) 1238 t4_push_ktls(sc, toep, drop); 1239 else 1240 t4_push_frames(sc, toep, drop); 1241 } 1242 1243 int 1244 t4_tod_output(struct toedev *tod, struct tcpcb *tp) 1245 { 1246 struct adapter *sc = tod->tod_softc; 1247 #ifdef INVARIANTS 1248 struct inpcb *inp = tptoinpcb(tp); 1249 #endif 1250 struct toepcb *toep = tp->t_toe; 1251 1252 INP_WLOCK_ASSERT(inp); 1253 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1254 ("%s: inp %p dropped.", __func__, inp)); 1255 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 1256 1257 t4_push_data(sc, toep, 0); 1258 1259 return (0); 1260 } 1261 1262 int 1263 t4_send_fin(struct toedev *tod, struct tcpcb *tp) 1264 { 1265 struct adapter *sc = tod->tod_softc; 1266 #ifdef INVARIANTS 1267 struct inpcb *inp = tptoinpcb(tp); 1268 #endif 1269 struct toepcb *toep = tp->t_toe; 1270 1271 INP_WLOCK_ASSERT(inp); 1272 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1273 ("%s: inp %p dropped.", __func__, inp)); 1274 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 1275 1276 toep->flags |= TPF_SEND_FIN; 1277 if (tp->t_state >= TCPS_ESTABLISHED) 1278 t4_push_data(sc, toep, 0); 1279 1280 return (0); 1281 } 1282 1283 int 1284 t4_send_rst(struct toedev *tod, struct tcpcb *tp) 1285 { 1286 struct adapter *sc = tod->tod_softc; 1287 #if 
defined(INVARIANTS) 1288 struct inpcb *inp = tptoinpcb(tp); 1289 #endif 1290 struct toepcb *toep = tp->t_toe; 1291 1292 INP_WLOCK_ASSERT(inp); 1293 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1294 ("%s: inp %p dropped.", __func__, inp)); 1295 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 1296 1297 /* hmmmm */ 1298 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 1299 ("%s: flowc for tid %u [%s] not sent already", 1300 __func__, toep->tid, tcpstates[tp->t_state])); 1301 1302 send_reset(sc, toep, 0); 1303 return (0); 1304 } 1305 1306 /* 1307 * Peer has sent us a FIN. 1308 */ 1309 static int 1310 do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1311 { 1312 struct adapter *sc = iq->adapter; 1313 const struct cpl_peer_close *cpl = (const void *)(rss + 1); 1314 unsigned int tid = GET_TID(cpl); 1315 struct toepcb *toep = lookup_tid(sc, tid); 1316 struct inpcb *inp = toep->inp; 1317 struct tcpcb *tp = NULL; 1318 struct socket *so; 1319 struct epoch_tracker et; 1320 #ifdef INVARIANTS 1321 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1322 #endif 1323 1324 KASSERT(opcode == CPL_PEER_CLOSE, 1325 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1326 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1327 1328 if (__predict_false(toep->flags & TPF_SYNQE)) { 1329 /* 1330 * do_pass_establish must have run before do_peer_close and if 1331 * this is still a synqe instead of a toepcb then the connection 1332 * must be getting aborted. 1333 */ 1334 MPASS(toep->flags & TPF_ABORT_SHUTDOWN); 1335 CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid, 1336 toep, toep->flags); 1337 return (0); 1338 } 1339 1340 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1341 1342 CURVNET_SET(toep->vnet); 1343 NET_EPOCH_ENTER(et); 1344 INP_WLOCK(inp); 1345 tp = intotcpcb(inp); 1346 1347 CTR6(KTR_CXGBE, 1348 "%s: tid %u (%s), toep_flags 0x%x, ddp_flags 0x%x, inp %p", 1349 __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags, 1350 toep->ddp.flags, inp); 1351 1352 if (toep->flags & TPF_ABORT_SHUTDOWN) 1353 goto done; 1354 1355 if (ulp_mode(toep) == ULP_MODE_TCPDDP) { 1356 DDP_LOCK(toep); 1357 if (__predict_false(toep->ddp.flags & 1358 (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE))) 1359 handle_ddp_close(toep, tp, cpl->rcv_nxt); 1360 DDP_UNLOCK(toep); 1361 } 1362 so = inp->inp_socket; 1363 socantrcvmore(so); 1364 1365 if (ulp_mode(toep) == ULP_MODE_RDMA || 1366 (ulp_mode(toep) == ULP_MODE_ISCSI && chip_id(sc) >= CHELSIO_T6)) { 1367 /* 1368 * There might be data received via DDP before the FIN 1369 * not reported to the driver. Just assume the 1370 * sequence number in the CPL is correct as the 1371 * sequence number of the FIN. 
1372 */ 1373 } else { 1374 KASSERT(tp->rcv_nxt + 1 == be32toh(cpl->rcv_nxt), 1375 ("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt, 1376 be32toh(cpl->rcv_nxt))); 1377 } 1378 1379 tp->rcv_nxt = be32toh(cpl->rcv_nxt); 1380 1381 switch (tp->t_state) { 1382 case TCPS_SYN_RECEIVED: 1383 tp->t_starttime = ticks; 1384 /* FALLTHROUGH */ 1385 1386 case TCPS_ESTABLISHED: 1387 tcp_state_change(tp, TCPS_CLOSE_WAIT); 1388 break; 1389 1390 case TCPS_FIN_WAIT_1: 1391 tcp_state_change(tp, TCPS_CLOSING); 1392 break; 1393 1394 case TCPS_FIN_WAIT_2: 1395 restore_so_proto(so, inp->inp_vflag & INP_IPV6); 1396 tcp_twstart(tp); 1397 INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ 1398 NET_EPOCH_EXIT(et); 1399 CURVNET_RESTORE(); 1400 1401 INP_WLOCK(inp); 1402 final_cpl_received(toep); 1403 return (0); 1404 1405 default: 1406 log(LOG_ERR, "%s: TID %u received CPL_PEER_CLOSE in state %d\n", 1407 __func__, tid, tp->t_state); 1408 } 1409 done: 1410 INP_WUNLOCK(inp); 1411 NET_EPOCH_EXIT(et); 1412 CURVNET_RESTORE(); 1413 return (0); 1414 } 1415 1416 /* 1417 * Peer has ACK'd our FIN. 1418 */ 1419 static int 1420 do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss, 1421 struct mbuf *m) 1422 { 1423 struct adapter *sc = iq->adapter; 1424 const struct cpl_close_con_rpl *cpl = (const void *)(rss + 1); 1425 unsigned int tid = GET_TID(cpl); 1426 struct toepcb *toep = lookup_tid(sc, tid); 1427 struct inpcb *inp = toep->inp; 1428 struct tcpcb *tp = NULL; 1429 struct socket *so = NULL; 1430 struct epoch_tracker et; 1431 #ifdef INVARIANTS 1432 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1433 #endif 1434 1435 KASSERT(opcode == CPL_CLOSE_CON_RPL, 1436 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1437 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1438 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1439 1440 CURVNET_SET(toep->vnet); 1441 NET_EPOCH_ENTER(et); 1442 INP_WLOCK(inp); 1443 tp = intotcpcb(inp); 1444 1445 CTR4(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x", 1446 __func__, tid, tp ? 
tcpstates[tp->t_state] : "no tp", toep->flags); 1447 1448 if (toep->flags & TPF_ABORT_SHUTDOWN) 1449 goto done; 1450 1451 so = inp->inp_socket; 1452 tp->snd_una = be32toh(cpl->snd_nxt) - 1; /* exclude FIN */ 1453 1454 switch (tp->t_state) { 1455 case TCPS_CLOSING: /* see TCPS_FIN_WAIT_2 in do_peer_close too */ 1456 restore_so_proto(so, inp->inp_vflag & INP_IPV6); 1457 tcp_twstart(tp); 1458 release: 1459 INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ 1460 NET_EPOCH_EXIT(et); 1461 CURVNET_RESTORE(); 1462 1463 INP_WLOCK(inp); 1464 final_cpl_received(toep); /* no more CPLs expected */ 1465 1466 return (0); 1467 case TCPS_LAST_ACK: 1468 if (tcp_close(tp)) 1469 INP_WUNLOCK(inp); 1470 goto release; 1471 1472 case TCPS_FIN_WAIT_1: 1473 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 1474 soisdisconnected(so); 1475 tcp_state_change(tp, TCPS_FIN_WAIT_2); 1476 break; 1477 1478 default: 1479 log(LOG_ERR, 1480 "%s: TID %u received CPL_CLOSE_CON_RPL in state %s\n", 1481 __func__, tid, tcpstates[tp->t_state]); 1482 } 1483 done: 1484 INP_WUNLOCK(inp); 1485 NET_EPOCH_EXIT(et); 1486 CURVNET_RESTORE(); 1487 return (0); 1488 } 1489 1490 void 1491 send_abort_rpl(struct adapter *sc, struct sge_ofld_txq *ofld_txq, int tid, 1492 int rst_status) 1493 { 1494 struct wrqe *wr; 1495 struct cpl_abort_rpl *cpl; 1496 1497 wr = alloc_wrqe(sizeof(*cpl), &ofld_txq->wrq); 1498 if (wr == NULL) { 1499 /* XXX */ 1500 panic("%s: allocation failure.", __func__); 1501 } 1502 cpl = wrtod(wr); 1503 1504 INIT_TP_WR_MIT_CPL(cpl, CPL_ABORT_RPL, tid); 1505 cpl->cmd = rst_status; 1506 1507 t4_wrq_tx(sc, wr); 1508 } 1509 1510 static int 1511 abort_status_to_errno(struct tcpcb *tp, unsigned int abort_reason) 1512 { 1513 switch (abort_reason) { 1514 case CPL_ERR_BAD_SYN: 1515 case CPL_ERR_CONN_RESET: 1516 return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET); 1517 case CPL_ERR_XMIT_TIMEDOUT: 1518 case CPL_ERR_PERSIST_TIMEDOUT: 1519 case CPL_ERR_FINWAIT2_TIMEDOUT: 1520 case CPL_ERR_KEEPALIVE_TIMEDOUT: 1521 return (ETIMEDOUT); 1522 default: 1523 return (EIO); 1524 } 1525 } 1526 1527 /* 1528 * TCP RST from the peer, timeout, or some other such critical error. 1529 */ 1530 static int 1531 do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1532 { 1533 struct adapter *sc = iq->adapter; 1534 const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1); 1535 unsigned int tid = GET_TID(cpl); 1536 struct toepcb *toep = lookup_tid(sc, tid); 1537 struct sge_ofld_txq *ofld_txq = toep->ofld_txq; 1538 struct inpcb *inp; 1539 struct tcpcb *tp; 1540 struct epoch_tracker et; 1541 #ifdef INVARIANTS 1542 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1543 #endif 1544 1545 KASSERT(opcode == CPL_ABORT_REQ_RSS, 1546 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1547 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1548 1549 if (toep->flags & TPF_SYNQE) 1550 return (do_abort_req_synqe(iq, rss, m)); 1551 1552 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1553 1554 if (negative_advice(cpl->status)) { 1555 CTR4(KTR_CXGBE, "%s: negative advice %d for tid %d (0x%x)", 1556 __func__, cpl->status, tid, toep->flags); 1557 return (0); /* Ignore negative advice */ 1558 } 1559 1560 inp = toep->inp; 1561 CURVNET_SET(toep->vnet); 1562 NET_EPOCH_ENTER(et); /* for tcp_close */ 1563 INP_WLOCK(inp); 1564 1565 tp = intotcpcb(inp); 1566 1567 CTR6(KTR_CXGBE, 1568 "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x, status %d", 1569 __func__, tid, tp ? 
tcpstates[tp->t_state] : "no tp", toep->flags, 1570 inp->inp_flags, cpl->status); 1571 1572 /* 1573 * If we'd initiated an abort earlier the reply to it is responsible for 1574 * cleaning up resources. Otherwise we tear everything down right here 1575 * right now. We owe the T4 a CPL_ABORT_RPL no matter what. 1576 */ 1577 if (toep->flags & TPF_ABORT_SHUTDOWN) { 1578 INP_WUNLOCK(inp); 1579 goto done; 1580 } 1581 toep->flags |= TPF_ABORT_SHUTDOWN; 1582 1583 if ((inp->inp_flags & INP_DROPPED) == 0) { 1584 struct socket *so = inp->inp_socket; 1585 1586 if (so != NULL) 1587 so_error_set(so, abort_status_to_errno(tp, 1588 cpl->status)); 1589 tp = tcp_close(tp); 1590 if (tp == NULL) 1591 INP_WLOCK(inp); /* re-acquire */ 1592 } 1593 1594 final_cpl_received(toep); 1595 done: 1596 NET_EPOCH_EXIT(et); 1597 CURVNET_RESTORE(); 1598 send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST); 1599 return (0); 1600 } 1601 1602 /* 1603 * Reply to the CPL_ABORT_REQ (send_reset) 1604 */ 1605 static int 1606 do_abort_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1607 { 1608 struct adapter *sc = iq->adapter; 1609 const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1); 1610 unsigned int tid = GET_TID(cpl); 1611 struct toepcb *toep = lookup_tid(sc, tid); 1612 struct inpcb *inp = toep->inp; 1613 #ifdef INVARIANTS 1614 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1615 #endif 1616 1617 KASSERT(opcode == CPL_ABORT_RPL_RSS, 1618 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1619 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1620 1621 if (toep->flags & TPF_SYNQE) 1622 return (do_abort_rpl_synqe(iq, rss, m)); 1623 1624 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1625 1626 CTR5(KTR_CXGBE, "%s: tid %u, toep %p, inp %p, status %d", 1627 __func__, tid, toep, inp, cpl->status); 1628 1629 KASSERT(toep->flags & TPF_ABORT_SHUTDOWN, 1630 ("%s: wasn't expecting abort reply", __func__)); 1631 1632 INP_WLOCK(inp); 1633 final_cpl_received(toep); 1634 1635 return (0); 1636 } 1637 1638 static int 1639 do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1640 { 1641 struct adapter *sc = iq->adapter; 1642 const struct cpl_rx_data *cpl = mtod(m, const void *); 1643 unsigned int tid = GET_TID(cpl); 1644 struct toepcb *toep = lookup_tid(sc, tid); 1645 struct inpcb *inp = toep->inp; 1646 struct tcpcb *tp; 1647 struct socket *so; 1648 struct sockbuf *sb; 1649 struct epoch_tracker et; 1650 int len; 1651 uint32_t ddp_placed = 0; 1652 1653 if (__predict_false(toep->flags & TPF_SYNQE)) { 1654 /* 1655 * do_pass_establish must have run before do_rx_data and if this 1656 * is still a synqe instead of a toepcb then the connection must 1657 * be getting aborted. 1658 */ 1659 MPASS(toep->flags & TPF_ABORT_SHUTDOWN); 1660 CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid, 1661 toep, toep->flags); 1662 m_freem(m); 1663 return (0); 1664 } 1665 1666 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1667 1668 /* strip off CPL header */ 1669 m_adj(m, sizeof(*cpl)); 1670 len = m->m_pkthdr.len; 1671 1672 INP_WLOCK(inp); 1673 if (inp->inp_flags & INP_DROPPED) { 1674 CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x", 1675 __func__, tid, len, inp->inp_flags); 1676 INP_WUNLOCK(inp); 1677 m_freem(m); 1678 return (0); 1679 } 1680 1681 tp = intotcpcb(inp); 1682 1683 if (__predict_false(ulp_mode(toep) == ULP_MODE_TLS && 1684 toep->flags & TPF_TLS_RECEIVE)) { 1685 /* Received "raw" data on a TLS socket. 
*/ 1686 CTR3(KTR_CXGBE, "%s: tid %u, raw TLS data (%d bytes)", 1687 __func__, tid, len); 1688 do_rx_data_tls(cpl, toep, m); 1689 return (0); 1690 } 1691 1692 if (__predict_false(tp->rcv_nxt != be32toh(cpl->seq))) 1693 ddp_placed = be32toh(cpl->seq) - tp->rcv_nxt; 1694 1695 tp->rcv_nxt += len; 1696 if (tp->rcv_wnd < len) { 1697 KASSERT(ulp_mode(toep) == ULP_MODE_RDMA, 1698 ("%s: negative window size", __func__)); 1699 } 1700 1701 tp->rcv_wnd -= len; 1702 tp->t_rcvtime = ticks; 1703 1704 if (ulp_mode(toep) == ULP_MODE_TCPDDP) 1705 DDP_LOCK(toep); 1706 so = inp_inpcbtosocket(inp); 1707 sb = &so->so_rcv; 1708 SOCKBUF_LOCK(sb); 1709 1710 if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) { 1711 CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)", 1712 __func__, tid, len); 1713 m_freem(m); 1714 SOCKBUF_UNLOCK(sb); 1715 if (ulp_mode(toep) == ULP_MODE_TCPDDP) 1716 DDP_UNLOCK(toep); 1717 INP_WUNLOCK(inp); 1718 1719 CURVNET_SET(toep->vnet); 1720 NET_EPOCH_ENTER(et); 1721 INP_WLOCK(inp); 1722 tp = tcp_drop(tp, ECONNRESET); 1723 if (tp) 1724 INP_WUNLOCK(inp); 1725 NET_EPOCH_EXIT(et); 1726 CURVNET_RESTORE(); 1727 1728 return (0); 1729 } 1730 1731 /* receive buffer autosize */ 1732 MPASS(toep->vnet == so->so_vnet); 1733 CURVNET_SET(toep->vnet); 1734 if (sb->sb_flags & SB_AUTOSIZE && 1735 V_tcp_do_autorcvbuf && 1736 sb->sb_hiwat < V_tcp_autorcvbuf_max && 1737 len > (sbspace(sb) / 8 * 7)) { 1738 unsigned int hiwat = sb->sb_hiwat; 1739 unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc, 1740 V_tcp_autorcvbuf_max); 1741 1742 if (!sbreserve_locked(so, SO_RCV, newsize, NULL)) 1743 sb->sb_flags &= ~SB_AUTOSIZE; 1744 } 1745 1746 if (ulp_mode(toep) == ULP_MODE_TCPDDP) { 1747 int changed = !(toep->ddp.flags & DDP_ON) ^ cpl->ddp_off; 1748 1749 if (toep->ddp.waiting_count != 0 || toep->ddp.active_count != 0) 1750 CTR3(KTR_CXGBE, "%s: tid %u, non-ddp rx (%d bytes)", 1751 __func__, tid, len); 1752 1753 if (changed) { 1754 if (toep->ddp.flags & DDP_SC_REQ) 1755 toep->ddp.flags ^= DDP_ON | DDP_SC_REQ; 1756 else if (cpl->ddp_off == 1) { 1757 /* Fell out of DDP mode */ 1758 toep->ddp.flags &= ~DDP_ON; 1759 CTR1(KTR_CXGBE, "%s: fell out of DDP mode", 1760 __func__); 1761 1762 insert_ddp_data(toep, ddp_placed); 1763 } else { 1764 /* 1765 * Data was received while still 1766 * ULP_MODE_NONE, just fall through. 1767 */ 1768 } 1769 } 1770 1771 if (toep->ddp.flags & DDP_ON) { 1772 /* 1773 * CPL_RX_DATA with DDP on can only be an indicate. 1774 * Start posting queued AIO requests via DDP. The 1775 * payload that arrived in this indicate is appended 1776 * to the socket buffer as usual. 
1777 */ 1778 handle_ddp_indicate(toep); 1779 } 1780 } 1781 1782 sbappendstream_locked(sb, m, 0); 1783 t4_rcvd_locked(&toep->td->tod, tp); 1784 1785 if (ulp_mode(toep) == ULP_MODE_TCPDDP && 1786 (toep->ddp.flags & DDP_AIO) != 0 && toep->ddp.waiting_count > 0 && 1787 sbavail(sb) != 0) { 1788 CTR2(KTR_CXGBE, "%s: tid %u queueing AIO task", __func__, 1789 tid); 1790 ddp_queue_toep(toep); 1791 } 1792 if (toep->flags & TPF_TLS_STARTING) 1793 tls_received_starting_data(sc, toep, sb, len); 1794 sorwakeup_locked(so); 1795 SOCKBUF_UNLOCK_ASSERT(sb); 1796 if (ulp_mode(toep) == ULP_MODE_TCPDDP) 1797 DDP_UNLOCK(toep); 1798 1799 INP_WUNLOCK(inp); 1800 CURVNET_RESTORE(); 1801 return (0); 1802 } 1803 1804 static int 1805 do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1806 { 1807 struct adapter *sc = iq->adapter; 1808 const struct cpl_fw4_ack *cpl = (const void *)(rss + 1); 1809 unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl))); 1810 struct toepcb *toep = lookup_tid(sc, tid); 1811 struct inpcb *inp; 1812 struct tcpcb *tp; 1813 struct socket *so; 1814 uint8_t credits = cpl->credits; 1815 struct ofld_tx_sdesc *txsd; 1816 int plen; 1817 #ifdef INVARIANTS 1818 unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl))); 1819 #endif 1820 1821 /* 1822 * Very unusual case: we'd sent a flowc + abort_req for a synq entry and 1823 * now this comes back carrying the credits for the flowc. 1824 */ 1825 if (__predict_false(toep->flags & TPF_SYNQE)) { 1826 KASSERT(toep->flags & TPF_ABORT_SHUTDOWN, 1827 ("%s: credits for a synq entry %p", __func__, toep)); 1828 return (0); 1829 } 1830 1831 inp = toep->inp; 1832 1833 KASSERT(opcode == CPL_FW4_ACK, 1834 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1835 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1836 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1837 1838 INP_WLOCK(inp); 1839 1840 if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) { 1841 INP_WUNLOCK(inp); 1842 return (0); 1843 } 1844 1845 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1846 ("%s: inp_flags 0x%x", __func__, inp->inp_flags)); 1847 1848 tp = intotcpcb(inp); 1849 1850 if (cpl->flags & CPL_FW4_ACK_FLAGS_SEQVAL) { 1851 tcp_seq snd_una = be32toh(cpl->snd_una); 1852 1853 #ifdef INVARIANTS 1854 if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) { 1855 log(LOG_ERR, 1856 "%s: unexpected seq# %x for TID %u, snd_una %x\n", 1857 __func__, snd_una, toep->tid, tp->snd_una); 1858 } 1859 #endif 1860 1861 if (tp->snd_una != snd_una) { 1862 tp->snd_una = snd_una; 1863 tp->ts_recent_age = tcp_ts_getticks(); 1864 } 1865 } 1866 1867 #ifdef VERBOSE_TRACES 1868 CTR3(KTR_CXGBE, "%s: tid %d credits %u", __func__, tid, credits); 1869 #endif 1870 so = inp->inp_socket; 1871 txsd = &toep->txsd[toep->txsd_cidx]; 1872 plen = 0; 1873 while (credits) { 1874 KASSERT(credits >= txsd->tx_credits, 1875 ("%s: too many (or partial) credits", __func__)); 1876 credits -= txsd->tx_credits; 1877 toep->tx_credits += txsd->tx_credits; 1878 plen += txsd->plen; 1879 txsd++; 1880 toep->txsd_avail++; 1881 KASSERT(toep->txsd_avail <= toep->txsd_total, 1882 ("%s: txsd avail > total", __func__)); 1883 if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) { 1884 txsd = &toep->txsd[0]; 1885 toep->txsd_cidx = 0; 1886 } 1887 } 1888 1889 if (toep->tx_credits == toep->tx_total) { 1890 toep->tx_nocompl = 0; 1891 toep->plen_nocompl = 0; 1892 } 1893 1894 if (toep->flags & TPF_TX_SUSPENDED && 1895 toep->tx_credits >= toep->tx_total / 4) { 1896 #ifdef VERBOSE_TRACES 
1897 CTR2(KTR_CXGBE, "%s: tid %d calling t4_push_frames", __func__, 1898 tid); 1899 #endif 1900 toep->flags &= ~TPF_TX_SUSPENDED; 1901 CURVNET_SET(toep->vnet); 1902 t4_push_data(sc, toep, plen); 1903 CURVNET_RESTORE(); 1904 } else if (plen > 0) { 1905 struct sockbuf *sb = &so->so_snd; 1906 int sbu; 1907 1908 SOCKBUF_LOCK(sb); 1909 sbu = sbused(sb); 1910 if (ulp_mode(toep) == ULP_MODE_ISCSI) { 1911 if (__predict_false(sbu > 0)) { 1912 /* 1913 * The data transmitted before the 1914 * tid's ULP mode changed to ISCSI is 1915 * still in so_snd. Incoming credits 1916 * should account for so_snd first. 1917 */ 1918 sbdrop_locked(sb, min(sbu, plen)); 1919 plen -= min(sbu, plen); 1920 } 1921 sowwakeup_locked(so); /* unlocks so_snd */ 1922 rqdrop_locked(&toep->ulp_pdu_reclaimq, plen); 1923 } else { 1924 #ifdef VERBOSE_TRACES 1925 CTR3(KTR_CXGBE, "%s: tid %d dropped %d bytes", __func__, 1926 tid, plen); 1927 #endif 1928 sbdrop_locked(sb, plen); 1929 if (!TAILQ_EMPTY(&toep->aiotx_jobq)) 1930 t4_aiotx_queue_toep(so, toep); 1931 sowwakeup_locked(so); /* unlocks so_snd */ 1932 } 1933 SOCKBUF_UNLOCK_ASSERT(sb); 1934 } 1935 1936 INP_WUNLOCK(inp); 1937 1938 return (0); 1939 } 1940 1941 void 1942 t4_set_tcb_field(struct adapter *sc, struct sge_wrq *wrq, struct toepcb *toep, 1943 uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie) 1944 { 1945 struct wrqe *wr; 1946 struct cpl_set_tcb_field *req; 1947 struct ofld_tx_sdesc *txsd; 1948 1949 MPASS((cookie & ~M_COOKIE) == 0); 1950 if (reply) { 1951 MPASS(cookie != CPL_COOKIE_RESERVED); 1952 } 1953 1954 wr = alloc_wrqe(sizeof(*req), wrq); 1955 if (wr == NULL) { 1956 /* XXX */ 1957 panic("%s: allocation failure.", __func__); 1958 } 1959 req = wrtod(wr); 1960 1961 INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, toep->tid); 1962 req->reply_ctrl = htobe16(V_QUEUENO(toep->ofld_rxq->iq.abs_id)); 1963 if (reply == 0) 1964 req->reply_ctrl |= htobe16(F_NO_REPLY); 1965 req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(cookie)); 1966 req->mask = htobe64(mask); 1967 req->val = htobe64(val); 1968 if (wrq->eq.type == EQ_OFLD) { 1969 txsd = &toep->txsd[toep->txsd_pidx]; 1970 txsd->tx_credits = howmany(sizeof(*req), 16); 1971 txsd->plen = 0; 1972 KASSERT(toep->tx_credits >= txsd->tx_credits && 1973 toep->txsd_avail > 0, 1974 ("%s: not enough credits (%d)", __func__, 1975 toep->tx_credits)); 1976 toep->tx_credits -= txsd->tx_credits; 1977 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) 1978 toep->txsd_pidx = 0; 1979 toep->txsd_avail--; 1980 } 1981 1982 t4_wrq_tx(sc, wr); 1983 } 1984 1985 void 1986 t4_init_cpl_io_handlers(void) 1987 { 1988 1989 t4_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close); 1990 t4_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl); 1991 t4_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req); 1992 t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl, 1993 CPL_COOKIE_TOM); 1994 t4_register_cpl_handler(CPL_RX_DATA, do_rx_data); 1995 t4_register_shared_cpl_handler(CPL_FW4_ACK, do_fw4_ack, CPL_COOKIE_TOM); 1996 } 1997 1998 void 1999 t4_uninit_cpl_io_handlers(void) 2000 { 2001 2002 t4_register_cpl_handler(CPL_PEER_CLOSE, NULL); 2003 t4_register_cpl_handler(CPL_CLOSE_CON_RPL, NULL); 2004 t4_register_cpl_handler(CPL_ABORT_REQ_RSS, NULL); 2005 t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, NULL, CPL_COOKIE_TOM); 2006 t4_register_cpl_handler(CPL_RX_DATA, NULL); 2007 t4_register_shared_cpl_handler(CPL_FW4_ACK, NULL, CPL_COOKIE_TOM); 2008 } 2009 2010 /* 2011 * Use the 'backend1' field in AIO jobs to hold an error that should 

/*
 * Use the 'backend1' field in AIO jobs to hold an error that should
 * be reported when the job is completed, the 'backend3' field to
 * store the amount of data sent by the AIO job so far, and the
 * 'backend4' field to hold a reference count on the job.
 *
 * Each unmapped mbuf holds a reference on the job as does the queue
 * so long as the job is queued.
 */
#define	aio_error	backend1
#define	aio_sent	backend3
#define	aio_refs	backend4

#ifdef VERBOSE_TRACES
static int
jobtotid(struct kaiocb *job)
{
	struct socket *so;
	struct tcpcb *tp;
	struct toepcb *toep;

	so = job->fd_file->f_data;
	tp = sototcpcb(so);
	toep = tp->t_toe;
	return (toep->tid);
}
#endif

static void
aiotx_free_job(struct kaiocb *job)
{
	long status;
	int error;

	if (refcount_release(&job->aio_refs) == 0)
		return;

	error = (intptr_t)job->aio_error;
	status = job->aio_sent;
#ifdef VERBOSE_TRACES
	CTR5(KTR_CXGBE, "%s: tid %d completed %p len %ld, error %d", __func__,
	    jobtotid(job), job, status, error);
#endif
	if (error != 0 && status != 0)
		error = 0;
	if (error == ECANCELED)
		aio_cancel(job);
	else if (error)
		aio_complete(job, -1, error);
	else {
		job->msgsnd = 1;
		aio_complete(job, status, 0);
	}
}

static void
aiotx_free_pgs(struct mbuf *m)
{
	struct kaiocb *job;
	vm_page_t pg;

	M_ASSERTEXTPG(m);
	job = m->m_ext.ext_arg1;
#ifdef VERBOSE_TRACES
	CTR3(KTR_CXGBE, "%s: completed %d bytes for tid %d", __func__,
	    m->m_len, jobtotid(job));
#endif

	for (int i = 0; i < m->m_epg_npgs; i++) {
		pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]);
		vm_page_unwire(pg, PQ_ACTIVE);
	}

	aiotx_free_job(job);
}
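
/*
 * Job reference accounting used by the functions above and below:
 *
 *  - t4_aio_queue_aiotx() initializes aio_refs to 1; that reference is
 *    owned by aiotx_jobq (or by the code currently processing the job).
 *  - alloc_aiotx_mbuf() acquires one additional reference for every
 *    unmapped mbuf it builds; aiotx_free_pgs() releases it when the
 *    mbuf is eventually freed.
 *  - t4_aiotx_process_job() keeps the queue's reference if the job is
 *    requeued and drops it otherwise; t4_aiotx_cancel() drops it when
 *    a queued job is cancelled.
 *
 * aiotx_free_job() completes or cancels the job only after the last
 * reference has been released.
 */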

/*
 * Allocate a chain of unmapped mbufs describing the next 'len' bytes
 * of an AIO job.
 */
static struct mbuf *
alloc_aiotx_mbuf(struct kaiocb *job, int len)
{
	struct vmspace *vm;
	vm_page_t pgs[MBUF_PEXT_MAX_PGS];
	struct mbuf *m, *top, *last;
	vm_map_t map;
	vm_offset_t start;
	int i, mlen, npages, pgoff;

	KASSERT(job->aio_sent + len <= job->uaiocb.aio_nbytes,
	    ("%s(%p, %d): request to send beyond end of buffer", __func__,
	    job, len));

	/*
	 * The AIO subsystem will cancel and drain all requests before
	 * permitting a process to exit or exec, so p_vmspace should
	 * be stable here.
	 */
	vm = job->userproc->p_vmspace;
	map = &vm->vm_map;
	start = (uintptr_t)job->uaiocb.aio_buf + job->aio_sent;
	pgoff = start & PAGE_MASK;

	top = NULL;
	last = NULL;
	while (len > 0) {
		mlen = imin(len, MBUF_PEXT_MAX_PGS * PAGE_SIZE - pgoff);
		KASSERT(mlen == len || ((start + mlen) & PAGE_MASK) == 0,
		    ("%s: next start (%#jx + %#x) is not page aligned",
		    __func__, (uintmax_t)start, mlen));

		npages = vm_fault_quick_hold_pages(map, start, mlen,
		    VM_PROT_WRITE, pgs, nitems(pgs));
		if (npages < 0)
			break;

		m = mb_alloc_ext_pgs(M_WAITOK, aiotx_free_pgs);
		if (m == NULL) {
			vm_page_unhold_pages(pgs, npages);
			break;
		}

		m->m_epg_1st_off = pgoff;
		m->m_epg_npgs = npages;
		if (npages == 1) {
			KASSERT(mlen + pgoff <= PAGE_SIZE,
			    ("%s: single page is too large (off %d len %d)",
			    __func__, pgoff, mlen));
			m->m_epg_last_len = mlen;
		} else {
			m->m_epg_last_len = mlen - (PAGE_SIZE - pgoff) -
			    (npages - 2) * PAGE_SIZE;
		}
		for (i = 0; i < npages; i++)
			m->m_epg_pa[i] = VM_PAGE_TO_PHYS(pgs[i]);

		m->m_len = mlen;
		m->m_ext.ext_size = npages * PAGE_SIZE;
		m->m_ext.ext_arg1 = job;
		refcount_acquire(&job->aio_refs);

#ifdef VERBOSE_TRACES
		CTR5(KTR_CXGBE, "%s: tid %d, new mbuf %p for job %p, npages %d",
		    __func__, jobtotid(job), m, job, npages);
#endif

		if (top == NULL)
			top = m;
		else
			last->m_next = m;
		last = m;

		len -= mlen;
		start += mlen;
		pgoff = 0;
	}

	return (top);
}
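
/*
 * Worked example of the page layout computed above, assuming a 4KB
 * PAGE_SIZE (illustrative values only):
 *
 *   pgoff = 512, mlen = 10000  =>  the range spans 3 pages (npages = 3)
 *     m_epg_1st_off  = 512
 *     first page holds  4096 - 512 = 3584 bytes
 *     middle page holds 4096 bytes
 *     m_epg_last_len = 10000 - 3584 - (3 - 2) * 4096 = 2320 bytes
 *     3584 + 4096 + 2320 = 10000 = m_len
 *
 * Each iteration covers at most MBUF_PEXT_MAX_PGS pages, so larger
 * requests become a chain of such mbufs.
 */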

static void
t4_aiotx_process_job(struct toepcb *toep, struct socket *so, struct kaiocb *job)
{
	struct sockbuf *sb;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct mbuf *m;
	u_int sent;
	int error, len;
	bool moretocome, sendmore;

	sb = &so->so_snd;
	SOCKBUF_UNLOCK(sb);
	m = NULL;

#ifdef MAC
	error = mac_socket_check_send(job->fd_file->f_cred, so);
	if (error != 0)
		goto out;
#endif

	/* Inline sosend_generic(). */

	error = SOCK_IO_SEND_LOCK(so, SBL_WAIT);
	MPASS(error == 0);

sendanother:
	SOCKBUF_LOCK(sb);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		SOCKBUF_UNLOCK(sb);
		SOCK_IO_SEND_UNLOCK(so);
		if ((so->so_options & SO_NOSIGPIPE) == 0) {
			PROC_LOCK(job->userproc);
			kern_psignal(job->userproc, SIGPIPE);
			PROC_UNLOCK(job->userproc);
		}
		error = EPIPE;
		goto out;
	}
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		SOCKBUF_UNLOCK(sb);
		SOCK_IO_SEND_UNLOCK(so);
		goto out;
	}
	if ((so->so_state & SS_ISCONNECTED) == 0) {
		SOCKBUF_UNLOCK(sb);
		SOCK_IO_SEND_UNLOCK(so);
		error = ENOTCONN;
		goto out;
	}
	if (sbspace(sb) < sb->sb_lowat) {
		MPASS(job->aio_sent == 0 || !(so->so_state & SS_NBIO));

		/*
		 * Don't block if there is too little room in the socket
		 * buffer.  Instead, requeue the request.
		 */
		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
			SOCKBUF_UNLOCK(sb);
			SOCK_IO_SEND_UNLOCK(so);
			error = ECANCELED;
			goto out;
		}
		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
		SOCKBUF_UNLOCK(sb);
		SOCK_IO_SEND_UNLOCK(so);
		goto out;
	}

	/*
	 * Write as much data as the socket permits, but no more than
	 * a single sndbuf at a time.
	 */
	len = sbspace(sb);
	if (len > job->uaiocb.aio_nbytes - job->aio_sent) {
		len = job->uaiocb.aio_nbytes - job->aio_sent;
		moretocome = false;
	} else
		moretocome = true;
	if (len > toep->params.sndbuf) {
		len = toep->params.sndbuf;
		sendmore = true;
	} else
		sendmore = false;

	if (!TAILQ_EMPTY(&toep->aiotx_jobq))
		moretocome = true;
	SOCKBUF_UNLOCK(sb);
	MPASS(len != 0);

	m = alloc_aiotx_mbuf(job, len);
	if (m == NULL) {
		SOCK_IO_SEND_UNLOCK(so);
		error = EFAULT;
		goto out;
	}

	/* Inlined tcp_usr_send(). */

	inp = toep->inp;
	INP_WLOCK(inp);
	if (inp->inp_flags & INP_DROPPED) {
		INP_WUNLOCK(inp);
		SOCK_IO_SEND_UNLOCK(so);
		error = ECONNRESET;
		goto out;
	}

	sent = m_length(m, NULL);
	job->aio_sent += sent;
	counter_u64_add(toep->ofld_txq->tx_aio_octets, sent);

	sbappendstream(sb, m, 0);
	m = NULL;

	if (!(inp->inp_flags & INP_DROPPED)) {
		tp = intotcpcb(inp);
		if (moretocome)
			tp->t_flags |= TF_MORETOCOME;
		error = tcp_output(tp);
		if (error < 0) {
			INP_UNLOCK_ASSERT(inp);
			SOCK_IO_SEND_UNLOCK(so);
			error = -error;
			goto out;
		}
		if (moretocome)
			tp->t_flags &= ~TF_MORETOCOME;
	}

	INP_WUNLOCK(inp);
	if (sendmore)
		goto sendanother;
	SOCK_IO_SEND_UNLOCK(so);

	if (error)
		goto out;

	/*
	 * If this is a blocking socket and the request has not been
	 * fully completed, requeue it until the socket is ready
	 * again.
	 */
	if (job->aio_sent < job->uaiocb.aio_nbytes &&
	    !(so->so_state & SS_NBIO)) {
		SOCKBUF_LOCK(sb);
		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
			SOCKBUF_UNLOCK(sb);
			error = ECANCELED;
			goto out;
		}
		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
		return;
	}

	/*
	 * If the request will not be requeued, drop the queue's
	 * reference to the job.  Any mbufs in flight should still
	 * hold a reference, but this drops the reference that the
	 * queue owns while it is waiting to queue mbufs to the
	 * socket.
	 */
	aiotx_free_job(job);
	counter_u64_add(toep->ofld_txq->tx_aio_jobs, 1);

out:
	if (error) {
		job->aio_error = (void *)(intptr_t)error;
		aiotx_free_job(job);
	}
	m_freem(m);
	SOCKBUF_LOCK(sb);
}
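
/*
 * Notes on the two flags used above:
 *
 *  - 'moretocome' is set when this pass will not finish the job (or
 *    other jobs are still queued).  It is applied as TF_MORETOCOME
 *    around the tcp_output() call, the same hint tcp_usr_send() sets
 *    for PRUS_MORETOCOME, so the stack may defer sending a partial
 *    segment while more data is on the way.
 *  - 'sendmore' is set when the write was clamped to one sndbuf worth
 *    of data; the code then loops back to 'sendanother' to continue
 *    the same job.
 */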
"true" : "false"); 2384 #endif 2385 if (toep->aiotx_so != NULL) 2386 return; 2387 soref(so); 2388 toep->aiotx_so = so; 2389 hold_toepcb(toep); 2390 soaio_enqueue(&toep->aiotx_task); 2391 } 2392 2393 static void 2394 t4_aiotx_cancel(struct kaiocb *job) 2395 { 2396 struct socket *so; 2397 struct sockbuf *sb; 2398 struct tcpcb *tp; 2399 struct toepcb *toep; 2400 2401 so = job->fd_file->f_data; 2402 tp = sototcpcb(so); 2403 toep = tp->t_toe; 2404 MPASS(job->uaiocb.aio_lio_opcode == LIO_WRITE); 2405 sb = &so->so_snd; 2406 2407 SOCKBUF_LOCK(sb); 2408 if (!aio_cancel_cleared(job)) 2409 TAILQ_REMOVE(&toep->aiotx_jobq, job, list); 2410 SOCKBUF_UNLOCK(sb); 2411 2412 job->aio_error = (void *)(intptr_t)ECANCELED; 2413 aiotx_free_job(job); 2414 } 2415 2416 int 2417 t4_aio_queue_aiotx(struct socket *so, struct kaiocb *job) 2418 { 2419 struct tcpcb *tp = sototcpcb(so); 2420 struct toepcb *toep = tp->t_toe; 2421 struct adapter *sc = td_adapter(toep->td); 2422 2423 /* This only handles writes. */ 2424 if (job->uaiocb.aio_lio_opcode != LIO_WRITE) 2425 return (EOPNOTSUPP); 2426 2427 if (!sc->tt.tx_zcopy) 2428 return (EOPNOTSUPP); 2429 2430 if (tls_tx_key(toep)) 2431 return (EOPNOTSUPP); 2432 2433 SOCKBUF_LOCK(&so->so_snd); 2434 #ifdef VERBOSE_TRACES 2435 CTR3(KTR_CXGBE, "%s: queueing %p for tid %u", __func__, job, toep->tid); 2436 #endif 2437 if (!aio_set_cancel_function(job, t4_aiotx_cancel)) 2438 panic("new job was cancelled"); 2439 refcount_init(&job->aio_refs, 1); 2440 TAILQ_INSERT_TAIL(&toep->aiotx_jobq, job, list); 2441 if (sowriteable(so)) 2442 t4_aiotx_queue_toep(so, toep); 2443 SOCKBUF_UNLOCK(&so->so_snd); 2444 return (0); 2445 } 2446 2447 void 2448 aiotx_init_toep(struct toepcb *toep) 2449 { 2450 2451 TAILQ_INIT(&toep->aiotx_jobq); 2452 TASK_INIT(&toep->aiotx_task, 0, t4_aiotx_task, toep); 2453 } 2454 #endif 2455