/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2012, 2015 Chelsio Communications, Inc.
 * All rights reserved.
 * Written by: Navdeep Parhar <np@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_kern_tls.h"
#include "opt_ratelimit.h"

#ifdef TCP_OFFLOAD
#include <sys/param.h>
#include <sys/aio.h>
#include <sys/file.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/module.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sglist.h>
#include <sys/taskqueue.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#define TCPSTATES
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_var.h>
#include <netinet/toecore.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>

#include <dev/iscsi/iscsi_proto.h>

#include "common/common.h"
#include "common/t4_msg.h"
#include "common/t4_regs.h"
#include "common/t4_tcb.h"
#include "tom/t4_tom_l2t.h"
#include "tom/t4_tom.h"

static void t4_aiotx_cancel(struct kaiocb *job);
static void t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep);

void
send_flowc_wr(struct toepcb *toep, struct tcpcb *tp)
{
	struct wrqe *wr;
	struct fw_flowc_wr *flowc;
	unsigned int nparams, flowclen, paramidx;
	struct vi_info *vi = toep->vi;
	struct port_info *pi = vi->pi;
	struct adapter *sc = pi->adapter;
	unsigned int pfvf = sc->pf << S_FW_VIID_PFN;
	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];

	KASSERT(!(toep->flags & TPF_FLOWC_WR_SENT),
	    ("%s: flowc for tid %u sent already", __func__, toep->tid));

	if (tp != NULL)
		nparams = 8;
	else
		nparams = 6;
	if (toep->params.tc_idx != -1) {
		MPASS(toep->params.tc_idx >= 0 &&
		    toep->params.tc_idx < sc->params.nsched_cls);
		nparams++;
	}

	flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);

	wr = alloc_wrqe(roundup2(flowclen, 16), &toep->ofld_txq->wrq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	flowc = wrtod(wr);
	memset(flowc, 0, wr->wr_len);

	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
	    V_FW_FLOWC_WR_NPARAMS(nparams));
	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
	    V_FW_WR_FLOWID(toep->tid));

#define FLOWC_PARAM(__m, __v) \
	do { \
		flowc->mnemval[paramidx].mnemonic = FW_FLOWC_MNEM_##__m; \
		flowc->mnemval[paramidx].val = htobe32(__v); \
		paramidx++; \
	} while (0)

	paramidx = 0;

	FLOWC_PARAM(PFNVFN, pfvf);
	FLOWC_PARAM(CH, pi->tx_chan);
	FLOWC_PARAM(PORT, pi->tx_chan);
	FLOWC_PARAM(IQID, toep->ofld_rxq->iq.abs_id);
	FLOWC_PARAM(SNDBUF, toep->params.sndbuf);
	if (tp) {
		FLOWC_PARAM(MSS, toep->params.emss);
		FLOWC_PARAM(SNDNXT, tp->snd_nxt);
		FLOWC_PARAM(RCVNXT, tp->rcv_nxt);
	} else
		FLOWC_PARAM(MSS, 512);
	CTR6(KTR_CXGBE,
	    "%s: tid %u, mss %u, sndbuf %u, snd_nxt 0x%x, rcv_nxt 0x%x",
	    __func__, toep->tid, toep->params.emss, toep->params.sndbuf,
	    tp ? tp->snd_nxt : 0, tp ? tp->rcv_nxt : 0);

	if (toep->params.tc_idx != -1)
		FLOWC_PARAM(SCHEDCLASS, toep->params.tc_idx);
#undef FLOWC_PARAM

	KASSERT(paramidx == nparams, ("nparams mismatch"));

	txsd->tx_credits = howmany(flowclen, 16);
	txsd->plen = 0;
	KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0,
	    ("%s: not enough credits (%d)", __func__, toep->tx_credits));
	toep->tx_credits -= txsd->tx_credits;
	if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
		toep->txsd_pidx = 0;
	toep->txsd_avail--;

	toep->flags |= TPF_FLOWC_WR_SENT;
	t4_wrq_tx(sc, wr);
}
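
/*
 * For illustration: a work request consumes one tx credit per 16 bytes.
 * Assuming the usual firmware layout (an 8-byte fw_flowc_wr header followed
 * by 8-byte mnemval entries), a flowc carrying 8 parameters is
 * 8 + 8 * 8 = 72 bytes, i.e. howmany(72, 16) = 5 credits, allocated as an
 * 80-byte WR after roundup2.  The same credit count is recorded in the txsd
 * entry above so that do_fw4_ack() can return it when the firmware
 * acknowledges the WR.
 */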

#ifdef RATELIMIT
/*
 * Input is Bytes/second (so_max_pacing_rate), chip counts in Kilobits/second.
 */
static int
update_tx_rate_limit(struct adapter *sc, struct toepcb *toep, u_int Bps)
{
	int tc_idx, rc;
	const u_int kbps = (u_int) (uint64_t)Bps * 8ULL / 1000;
	const int port_id = toep->vi->pi->port_id;

	CTR3(KTR_CXGBE, "%s: tid %u, rate %uKbps", __func__, toep->tid, kbps);

	if (kbps == 0) {
		/* unbind */
		tc_idx = -1;
	} else {
		rc = t4_reserve_cl_rl_kbps(sc, port_id, kbps, &tc_idx);
		if (rc != 0)
			return (rc);
		MPASS(tc_idx >= 0 && tc_idx < sc->params.nsched_cls);
	}

	if (toep->params.tc_idx != tc_idx) {
		struct wrqe *wr;
		struct fw_flowc_wr *flowc;
		int nparams = 1, flowclen, flowclen16;
		struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];

		flowclen = sizeof(*flowc) + nparams * sizeof(struct
		    fw_flowc_mnemval);
		flowclen16 = howmany(flowclen, 16);
		if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0 ||
		    (wr = alloc_wrqe(roundup2(flowclen, 16),
		    &toep->ofld_txq->wrq)) == NULL) {
			if (tc_idx >= 0)
				t4_release_cl_rl(sc, port_id, tc_idx);
			return (ENOMEM);
		}

		flowc = wrtod(wr);
		memset(flowc, 0, wr->wr_len);

		flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
		    V_FW_FLOWC_WR_NPARAMS(nparams));
		flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) |
		    V_FW_WR_FLOWID(toep->tid));

		flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS;
		if (tc_idx == -1)
			flowc->mnemval[0].val = htobe32(0xff);
		else
			flowc->mnemval[0].val = htobe32(tc_idx);

		txsd->tx_credits = flowclen16;
		txsd->plen = 0;
		toep->tx_credits -= txsd->tx_credits;
		if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
			toep->txsd_pidx = 0;
		toep->txsd_avail--;
		t4_wrq_tx(sc, wr);
	}

	if (toep->params.tc_idx >= 0)
		t4_release_cl_rl(sc, port_id, toep->params.tc_idx);
	toep->params.tc_idx = tc_idx;

	return (0);
}
#endif

void
send_reset(struct adapter *sc, struct toepcb *toep, uint32_t snd_nxt)
{
	struct wrqe *wr;
	struct cpl_abort_req *req;
	int tid = toep->tid;
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = intotcpcb(inp);	/* don't use if INP_DROPPED */

	INP_WLOCK_ASSERT(inp);

	CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x%s",
	    __func__, toep->tid,
	    inp->inp_flags & INP_DROPPED ? "inp dropped" :
	    tcpstates[tp->t_state],
	    toep->flags, inp->inp_flags,
	    toep->flags & TPF_ABORT_SHUTDOWN ?
	    " (abort already in progress)" : "");

	if (toep->flags & TPF_ABORT_SHUTDOWN)
		return;	/* abort already in progress */

	toep->flags |= TPF_ABORT_SHUTDOWN;

	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
	    ("%s: flowc_wr not sent for tid %d.", __func__, tid));

	wr = alloc_wrqe(sizeof(*req), &toep->ofld_txq->wrq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	req = wrtod(wr);

	INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid);
	if (inp->inp_flags & INP_DROPPED)
		req->rsvd0 = htobe32(snd_nxt);
	else
		req->rsvd0 = htobe32(tp->snd_nxt);
	req->rsvd1 = !(toep->flags & TPF_TX_DATA_SENT);
	req->cmd = CPL_ABORT_SEND_RST;

	/*
	 * XXX: What's the correct way to tell that the inp hasn't been detached
	 * from its socket?  Should I even be flushing the snd buffer here?
	 */
	if ((inp->inp_flags & INP_DROPPED) == 0) {
		struct socket *so = inp->inp_socket;

		if (so != NULL)	/* because I'm not sure.  See comment above */
			sbflush(&so->so_snd);
	}

	t4_l2t_send(sc, wr, toep->l2te);
}

/*
 * Called when a connection is established to translate the TCP options
 * reported by HW to FreeBSD's native format.
 */
static void
assign_rxopt(struct tcpcb *tp, uint16_t opt)
{
	struct toepcb *toep = tp->t_toe;
	struct inpcb *inp = tptoinpcb(tp);
	struct adapter *sc = td_adapter(toep->td);

	INP_LOCK_ASSERT(inp);

	toep->params.mtu_idx = G_TCPOPT_MSS(opt);
	tp->t_maxseg = sc->params.mtus[toep->params.mtu_idx];
	if (inp->inp_inc.inc_flags & INC_ISIPV6)
		tp->t_maxseg -= sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
	else
		tp->t_maxseg -= sizeof(struct ip) + sizeof(struct tcphdr);

	toep->params.emss = tp->t_maxseg;
	if (G_TCPOPT_TSTAMP(opt)) {
		toep->params.tstamp = 1;
		toep->params.emss -= TCPOLEN_TSTAMP_APPA;
		tp->t_flags |= TF_RCVD_TSTMP;	/* timestamps ok */
		tp->ts_recent = 0;		/* hmmm */
		tp->ts_recent_age = tcp_ts_getticks();
	} else
		toep->params.tstamp = 0;

	if (G_TCPOPT_SACK(opt)) {
		toep->params.sack = 1;
		tp->t_flags |= TF_SACK_PERMIT;	/* should already be set */
	} else {
		toep->params.sack = 0;
		tp->t_flags &= ~TF_SACK_PERMIT;	/* sack disallowed by peer */
	}

	if (G_TCPOPT_WSCALE_OK(opt))
		tp->t_flags |= TF_RCVD_SCALE;

	/* Doing window scaling? */
	if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
	    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
		tp->rcv_scale = tp->request_r_scale;
		tp->snd_scale = G_TCPOPT_SND_WSCALE(opt);
	} else
		toep->params.wscale = 0;

	CTR6(KTR_CXGBE,
	    "assign_rxopt: tid %d, mtu_idx %u, emss %u, ts %u, sack %u, wscale %u",
	    toep->tid, toep->params.mtu_idx, toep->params.emss,
	    toep->params.tstamp, toep->params.sack, toep->params.wscale);
}
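
/*
 * For illustration, assuming a typical 1500-byte MTU entry is selected by the
 * hardware's MSS index: an IPv4 connection ends up with
 * t_maxseg = 1500 - sizeof(struct ip) - sizeof(struct tcphdr) = 1460, and if
 * the peer negotiated timestamps the effective emss drops by
 * TCPOLEN_TSTAMP_APPA (12) to 1448.  The MTU table itself is
 * sc->params.mtus[], so the exact values depend on the adapter configuration.
 */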

/*
 * Completes some final bits of initialization for just established connections
 * and changes their state to TCPS_ESTABLISHED.
 *
 * The ISNs are from the exchange of SYNs.
 */
void
make_established(struct toepcb *toep, uint32_t iss, uint32_t irs, uint16_t opt)
{
	struct inpcb *inp = toep->inp;
	struct socket *so = inp->inp_socket;
	struct tcpcb *tp = intotcpcb(inp);
	uint16_t tcpopt = be16toh(opt);

	INP_WLOCK_ASSERT(inp);
	KASSERT(tp->t_state == TCPS_SYN_SENT ||
	    tp->t_state == TCPS_SYN_RECEIVED,
	    ("%s: TCP state %s", __func__, tcpstates[tp->t_state]));

	CTR6(KTR_CXGBE, "%s: tid %d, so %p, inp %p, tp %p, toep %p",
	    __func__, toep->tid, so, inp, tp, toep);

	tcp_state_change(tp, TCPS_ESTABLISHED);
	tp->t_starttime = ticks;
	TCPSTAT_INC(tcps_connects);

	tp->irs = irs;
	tcp_rcvseqinit(tp);
	tp->rcv_wnd = (u_int)toep->params.opt0_bufsize << 10;
	tp->rcv_adv += tp->rcv_wnd;
	tp->last_ack_sent = tp->rcv_nxt;

	tp->iss = iss;
	tcp_sendseqinit(tp);
	tp->snd_una = iss + 1;
	tp->snd_nxt = iss + 1;
	tp->snd_max = iss + 1;

	assign_rxopt(tp, tcpopt);
	send_flowc_wr(toep, tp);

	soisconnected(so);
}

int
send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits)
{
	struct wrqe *wr;
	struct cpl_rx_data_ack *req;
	uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);

	KASSERT(credits >= 0, ("%s: %d credits", __func__, credits));

	wr = alloc_wrqe(sizeof(*req), toep->ctrlq);
	if (wr == NULL)
		return (0);
	req = wrtod(wr);

	INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid);
	req->credit_dack = htobe32(dack | V_RX_CREDITS(credits));

	t4_wrq_tx(sc, wr);
	return (credits);
}

void
t4_rcvd_locked(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
	struct inpcb *inp = tptoinpcb(tp);
	struct socket *so = inp->inp_socket;
	struct sockbuf *sb = &so->so_rcv;
	struct toepcb *toep = tp->t_toe;
	int rx_credits;

	INP_WLOCK_ASSERT(inp);
	SOCKBUF_LOCK_ASSERT(sb);

	rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0;
	if (rx_credits > 0 &&
	    (tp->rcv_wnd <= 32 * 1024 || rx_credits >= 64 * 1024 ||
	    (rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) ||
	    sbused(sb) + tp->rcv_wnd < sb->sb_lowat)) {
		rx_credits = send_rx_credits(sc, toep, rx_credits);
		tp->rcv_wnd += rx_credits;
		tp->rcv_adv += rx_credits;
	}
}

void
t4_rcvd(struct toedev *tod, struct tcpcb *tp)
{
	struct inpcb *inp = tptoinpcb(tp);
	struct socket *so = inp->inp_socket;
	struct sockbuf *sb = &so->so_rcv;

	SOCKBUF_LOCK(sb);
	t4_rcvd_locked(tod, tp);
	SOCKBUF_UNLOCK(sb);
}

/*
 * Close a connection by sending a CPL_CLOSE_CON_REQ message.
 */
int
t4_close_conn(struct adapter *sc, struct toepcb *toep)
{
	struct wrqe *wr;
	struct cpl_close_con_req *req;
	unsigned int tid = toep->tid;

	CTR3(KTR_CXGBE, "%s: tid %u%s", __func__, toep->tid,
	    toep->flags & TPF_FIN_SENT ? ", IGNORED" : "");

	if (toep->flags & TPF_FIN_SENT)
		return (0);

	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
	    ("%s: flowc_wr not sent for tid %u.", __func__, tid));

	wr = alloc_wrqe(sizeof(*req), &toep->ofld_txq->wrq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	req = wrtod(wr);

	req->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) |
	    V_FW_WR_IMMDLEN(sizeof(*req) - sizeof(req->wr)));
	req->wr.wr_mid = htonl(V_FW_WR_LEN16(howmany(sizeof(*req), 16)) |
	    V_FW_WR_FLOWID(tid));
	req->wr.wr_lo = cpu_to_be64(0);
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
	req->rsvd = 0;

	toep->flags |= TPF_FIN_SENT;
	toep->flags &= ~TPF_SEND_FIN;
	t4_l2t_send(sc, wr, toep->l2te);

	return (0);
}

#define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16)
#define MIN_OFLD_TX_CREDITS (howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16))
#define MIN_ISO_TX_CREDITS  (howmany(sizeof(struct cpl_tx_data_iso), 16))
#define MIN_TX_CREDITS(iso)						\
	(MIN_OFLD_TX_CREDITS + ((iso) ? MIN_ISO_TX_CREDITS : 0))

/* Maximum amount of immediate data we could stuff in a WR */
static inline int
max_imm_payload(int tx_credits, int iso)
{
	const int iso_cpl_size = iso ? sizeof(struct cpl_tx_data_iso) : 0;
	const int n = 1;	/* Use no more than one desc for imm. data WR */

	KASSERT(tx_credits >= 0 &&
	    tx_credits <= MAX_OFLD_TX_CREDITS,
	    ("%s: %d credits", __func__, tx_credits));

	if (tx_credits < MIN_TX_CREDITS(iso))
		return (0);

	if (tx_credits >= (n * EQ_ESIZE) / 16)
		return ((n * EQ_ESIZE) - sizeof(struct fw_ofld_tx_data_wr) -
		    iso_cpl_size);
	else
		return (tx_credits * 16 - sizeof(struct fw_ofld_tx_data_wr) -
		    iso_cpl_size);
}

/* Maximum number of SGL entries we could stuff in a WR */
static inline int
max_dsgl_nsegs(int tx_credits, int iso)
{
	int nseg = 1;	/* ulptx_sgl has room for 1, rest ulp_tx_sge_pair */
	int sge_pair_credits = tx_credits - MIN_TX_CREDITS(iso);

	KASSERT(tx_credits >= 0 &&
	    tx_credits <= MAX_OFLD_TX_CREDITS,
	    ("%s: %d credits", __func__, tx_credits));

	if (tx_credits < MIN_TX_CREDITS(iso))
		return (0);

	nseg += 2 * (sge_pair_credits * 16 / 24);
	if ((sge_pair_credits * 16) % 24 == 16)
		nseg++;

	return (nseg);
}

static inline void
write_tx_wr(void *dst, struct toepcb *toep, int fw_wr_opcode,
    unsigned int immdlen, unsigned int plen, uint8_t credits, int shove,
    int ulp_submode)
{
	struct fw_ofld_tx_data_wr *txwr = dst;

	txwr->op_to_immdlen = htobe32(V_WR_OP(fw_wr_opcode) |
	    V_FW_WR_IMMDLEN(immdlen));
	txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) |
	    V_FW_WR_LEN16(credits));
	txwr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(ulp_mode(toep)) |
	    V_TX_ULP_SUBMODE(ulp_submode) | V_TX_URG(0) | V_TX_SHOVE(shove));
	txwr->plen = htobe32(plen);

	if (toep->params.tx_align > 0) {
		if (plen < 2 * toep->params.emss)
			txwr->lsodisable_to_flags |=
			    htobe32(F_FW_OFLD_TX_DATA_WR_LSODISABLE);
		else
			txwr->lsodisable_to_flags |=
			    htobe32(F_FW_OFLD_TX_DATA_WR_ALIGNPLD |
				(toep->params.nagle == 0 ? 0 :
				F_FW_OFLD_TX_DATA_WR_ALIGNPLDSHOVE));
	}
}

/*
 * Generate a DSGL from a starting mbuf.  The total number of segments and the
 * maximum segments in any one mbuf are provided.
 */
static void
write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n)
{
	struct mbuf *m;
	struct ulptx_sgl *usgl = dst;
	int i, j, rc;
	struct sglist sg;
	struct sglist_seg segs[n];

	KASSERT(nsegs > 0, ("%s: nsegs 0", __func__));

	sglist_init(&sg, n, segs);
	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
	    V_ULPTX_NSGE(nsegs));

	i = -1;
	for (m = start; m != stop; m = m->m_next) {
		if (m->m_flags & M_EXTPG)
			rc = sglist_append_mbuf_epg(&sg, m,
			    mtod(m, vm_offset_t), m->m_len);
		else
			rc = sglist_append(&sg, mtod(m, void *), m->m_len);
		if (__predict_false(rc != 0))
			panic("%s: sglist_append %d", __func__, rc);

		for (j = 0; j < sg.sg_nseg; i++, j++) {
			if (i < 0) {
				usgl->len0 = htobe32(segs[j].ss_len);
				usgl->addr0 = htobe64(segs[j].ss_paddr);
			} else {
				usgl->sge[i / 2].len[i & 1] =
				    htobe32(segs[j].ss_len);
				usgl->sge[i / 2].addr[i & 1] =
				    htobe64(segs[j].ss_paddr);
			}
#ifdef INVARIANTS
			nsegs--;
#endif
		}
		sglist_reset(&sg);
	}
	if (i & 1)
		usgl->sge[i / 2].len[1] = htobe32(0);
	KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, stop %p",
	    __func__, nsegs, start, stop));
}

/*
 * Max number of SGL entries an offload tx work request can have. This is 41
 * (1 + 40) for a full 512B work request.
 * fw_ofld_tx_data_wr(16B) + ulptx_sgl(16B, 1) + ulptx_sge_pair(480B, 40)
 */
#define OFLD_SGL_LEN (41)
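
/*
 * Worked example for the sizing helpers above (illustrative numbers, assuming
 * SGE_MAX_WR_LEN is 512 and the usual 64-byte hardware descriptor behind
 * EQ_ESIZE): a tid holding all 32 tx credits (512 / 16) gets
 * max_imm_payload(32, 0) = 64 - 16 = 48 bytes of immediate payload, since
 * immediate-data WRs are capped at one descriptor, and
 * max_dsgl_nsegs(32, 0) = 1 + 2 * ((32 - MIN_OFLD_TX_CREDITS) * 16 / 24) = 41
 * SGL entries, matching OFLD_SGL_LEN above.  The DSGL portion of a WR is sized
 * as sizeof(struct ulptx_sgl) +
 * ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8 bytes: e.g. 5 segments put
 * one entry in the ulptx_sgl and four more in two 24-byte ulptx_sge_pairs,
 * 16 + 48 = 64 bytes on top of the 16-byte fw_ofld_tx_data_wr header.
 */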

/*
 * Send data and/or a FIN to the peer.
 *
 * The socket's so_snd buffer consists of a stream of data starting with sb_mb
 * and linked together with m_next.  sb_sndptr, if set, is the last mbuf that
 * was transmitted.
 *
 * drop indicates the number of bytes that should be dropped from the head of
 * the send buffer.  It is an optimization that lets do_fw4_ack avoid creating
 * contention on the send buffer lock (before this change it used to do
 * sowwakeup and then t4_push_frames right after that when recovering from tx
 * stalls).  When drop is set this function MUST drop the bytes and wake up any
 * writers.
 */
void
t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop)
{
	struct mbuf *sndptr, *m, *sb_sndptr;
	struct fw_ofld_tx_data_wr *txwr;
	struct wrqe *wr;
	u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = intotcpcb(inp);
	struct socket *so = inp->inp_socket;
	struct sockbuf *sb = &so->so_snd;
	int tx_credits, shove, compl, sowwakeup;
	struct ofld_tx_sdesc *txsd;
	bool nomap_mbuf_seen;

	INP_WLOCK_ASSERT(inp);
	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));

	KASSERT(ulp_mode(toep) == ULP_MODE_NONE ||
	    ulp_mode(toep) == ULP_MODE_TCPDDP ||
	    ulp_mode(toep) == ULP_MODE_TLS ||
	    ulp_mode(toep) == ULP_MODE_RDMA,
	    ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep));

#ifdef VERBOSE_TRACES
	CTR5(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d",
	    __func__, toep->tid, toep->flags, tp->t_flags, drop);
#endif
	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
		return;

#ifdef RATELIMIT
	if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) &&
	    (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) {
		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
	}
#endif

	/*
	 * This function doesn't resume by itself.  Someone else must clear the
	 * flag and call this function.
	 */
	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
		KASSERT(drop == 0,
		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
		return;
	}

	txsd = &toep->txsd[toep->txsd_pidx];
	do {
		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
		max_imm = max_imm_payload(tx_credits, 0);
		max_nsegs = max_dsgl_nsegs(tx_credits, 0);

		SOCKBUF_LOCK(sb);
		sowwakeup = drop;
		if (drop) {
			sbdrop_locked(sb, drop);
			drop = 0;
		}
		sb_sndptr = sb->sb_sndptr;
		sndptr = sb_sndptr ? sb_sndptr->m_next : sb->sb_mb;
		plen = 0;
		nsegs = 0;
		max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
		nomap_mbuf_seen = false;
		for (m = sndptr; m != NULL; m = m->m_next) {
			int n;

			if ((m->m_flags & M_NOTAVAIL) != 0)
				break;
			if (m->m_flags & M_EXTPG) {
#ifdef KERN_TLS
				if (m->m_epg_tls != NULL) {
					toep->flags |= TPF_KTLS;
					if (plen == 0) {
						SOCKBUF_UNLOCK(sb);
						t4_push_ktls(sc, toep, 0);
						return;
					}
					break;
				}
#endif
				n = sglist_count_mbuf_epg(m,
				    mtod(m, vm_offset_t), m->m_len);
			} else
				n = sglist_count(mtod(m, void *), m->m_len);

			nsegs += n;
			plen += m->m_len;

			/* This mbuf sent us _over_ the nsegs limit, back out */
			if (plen > max_imm && nsegs > max_nsegs) {
				nsegs -= n;
				plen -= m->m_len;
				if (plen == 0) {
					/* Too few credits */
					toep->flags |= TPF_TX_SUSPENDED;
					if (sowwakeup) {
						if (!TAILQ_EMPTY(
						    &toep->aiotx_jobq))
							t4_aiotx_queue_toep(so,
							    toep);
						sowwakeup_locked(so);
					} else
						SOCKBUF_UNLOCK(sb);
					SOCKBUF_UNLOCK_ASSERT(sb);
					return;
				}
				break;
			}

			if (m->m_flags & M_EXTPG)
				nomap_mbuf_seen = true;
			if (max_nsegs_1mbuf < n)
				max_nsegs_1mbuf = n;
			sb_sndptr = m;	/* new sb->sb_sndptr if all goes well */

			/* This mbuf put us right at the max_nsegs limit */
			if (plen > max_imm && nsegs == max_nsegs) {
				m = m->m_next;
				break;
			}
		}

		if (sbused(sb) > sb->sb_hiwat * 5 / 8 &&
		    toep->plen_nocompl + plen >= sb->sb_hiwat / 4)
			compl = 1;
		else
			compl = 0;

		if (sb->sb_flags & SB_AUTOSIZE &&
		    V_tcp_do_autosndbuf &&
		    sb->sb_hiwat < V_tcp_autosndbuf_max &&
		    sbused(sb) >= sb->sb_hiwat * 7 / 8) {
			int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc,
			    V_tcp_autosndbuf_max);

			if (!sbreserve_locked(so, SO_SND, newsize, NULL))
				sb->sb_flags &= ~SB_AUTOSIZE;
			else
				sowwakeup = 1;	/* room available */
		}
		if (sowwakeup) {
			if (!TAILQ_EMPTY(&toep->aiotx_jobq))
				t4_aiotx_queue_toep(so, toep);
			sowwakeup_locked(so);
		} else
			SOCKBUF_UNLOCK(sb);
		SOCKBUF_UNLOCK_ASSERT(sb);

		/* nothing to send */
		if (plen == 0) {
			KASSERT(m == NULL || (m->m_flags & M_NOTAVAIL) != 0,
			    ("%s: nothing to send, but m != NULL is ready",
			    __func__));
			break;
		}

		if (__predict_false(toep->flags & TPF_FIN_SENT))
			panic("%s: excess tx.", __func__);

		shove = m == NULL && !(tp->t_flags & TF_MORETOCOME);
		if (plen <= max_imm && !nomap_mbuf_seen) {

			/* Immediate data tx */

			wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16),
			    &toep->ofld_txq->wrq);
			if (wr == NULL) {
				/* XXX: how will we recover from this? */
				toep->flags |= TPF_TX_SUSPENDED;
				return;
			}
			txwr = wrtod(wr);
			credits = howmany(wr->wr_len, 16);
			write_tx_wr(txwr, toep, FW_OFLD_TX_DATA_WR, plen, plen,
			    credits, shove, 0);
			m_copydata(sndptr, 0, plen, (void *)(txwr + 1));
			nsegs = 0;
		} else {
			int wr_len;

			/* DSGL tx */

			wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) +
			    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
			wr = alloc_wrqe(roundup2(wr_len, 16),
			    &toep->ofld_txq->wrq);
			if (wr == NULL) {
				/* XXX: how will we recover from this? */
				toep->flags |= TPF_TX_SUSPENDED;
				return;
			}
			txwr = wrtod(wr);
			credits = howmany(wr_len, 16);
			write_tx_wr(txwr, toep, FW_OFLD_TX_DATA_WR, 0, plen,
			    credits, shove, 0);
			write_tx_sgl(txwr + 1, sndptr, m, nsegs,
			    max_nsegs_1mbuf);
			if (wr_len & 0xf) {
				uint64_t *pad = (uint64_t *)
				    ((uintptr_t)txwr + wr_len);
				*pad = 0;
			}
		}

		KASSERT(toep->tx_credits >= credits,
		    ("%s: not enough credits", __func__));

		toep->tx_credits -= credits;
		toep->tx_nocompl += credits;
		toep->plen_nocompl += plen;
		if (toep->tx_credits <= toep->tx_total * 3 / 8 &&
		    toep->tx_nocompl >= toep->tx_total / 4)
			compl = 1;

		if (compl || ulp_mode(toep) == ULP_MODE_RDMA) {
			txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL);
			toep->tx_nocompl = 0;
			toep->plen_nocompl = 0;
		}

		tp->snd_nxt += plen;
		tp->snd_max += plen;

		SOCKBUF_LOCK(sb);
		KASSERT(sb_sndptr, ("%s: sb_sndptr is NULL", __func__));
		sb->sb_sndptr = sb_sndptr;
		SOCKBUF_UNLOCK(sb);

		toep->flags |= TPF_TX_DATA_SENT;
		if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
			toep->flags |= TPF_TX_SUSPENDED;

		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
		txsd->plen = plen;
		txsd->tx_credits = credits;
		txsd++;
		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
			toep->txsd_pidx = 0;
			txsd = &toep->txsd[0];
		}
		toep->txsd_avail--;

		t4_l2t_send(sc, wr, toep->l2te);
	} while (m != NULL && (m->m_flags & M_NOTAVAIL) == 0);

	/* Send a FIN if requested, but only if there's no more data to send */
	if (m == NULL && toep->flags & TPF_SEND_FIN)
		t4_close_conn(sc, toep);
}

static inline void
rqdrop_locked(struct mbufq *q, int plen)
{
	struct mbuf *m;

	while (plen > 0) {
		m = mbufq_dequeue(q);

		/* Too many credits. */
		MPASS(m != NULL);
		M_ASSERTPKTHDR(m);

		/* Partial credits. */
		MPASS(plen >= m->m_pkthdr.len);

		plen -= m->m_pkthdr.len;
		m_freem(m);
	}
}

/*
 * Not a bit in the TCB, but is a bit in the ulp_submode field of the
 * CPL_TX_DATA flags field in FW_ISCSI_TX_DATA_WR.
 */
#define ULP_ISO G_TX_ULP_SUBMODE(F_FW_ISCSI_TX_DATA_WR_ULPSUBMODE_ISO)
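
/*
 * With ISO the driver queues one large "logical" PDU (a single template BHS
 * followed by the payload of several PDUs) in a single work request; the
 * firmware slices it into wire PDUs of at most iso_mss data bytes each,
 * replicating the BHS and inserting the digests selected by ulp_submode.
 * Whether the final slice carries the iSCSI F (final) bit is driven by
 * CXGBE_ISO_F, as described in write_tx_data_iso() below.
 */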

static void
write_tx_data_iso(void *dst, u_int ulp_submode, uint8_t flags, uint16_t mss,
    int len, int npdu)
{
	struct cpl_tx_data_iso *cpl;
	unsigned int burst_size;
	unsigned int last;

	/*
	 * The firmware will set the 'F' bit on the last PDU when
	 * either condition is true:
	 *
	 * - this large PDU is marked as the "last" slice
	 *
	 * - the amount of data payload bytes equals the burst_size
	 *
	 * The strategy used here is to always set the burst_size
	 * artificially high (len includes the size of the template
	 * BHS) and only set the "last" flag if the original PDU had
	 * 'F' set.
	 */
	burst_size = len;
	last = !!(flags & CXGBE_ISO_F);

	cpl = (struct cpl_tx_data_iso *)dst;
	cpl->op_to_scsi = htonl(V_CPL_TX_DATA_ISO_OP(CPL_TX_DATA_ISO) |
	    V_CPL_TX_DATA_ISO_FIRST(1) | V_CPL_TX_DATA_ISO_LAST(last) |
	    V_CPL_TX_DATA_ISO_CPLHDRLEN(0) |
	    V_CPL_TX_DATA_ISO_HDRCRC(!!(ulp_submode & ULP_CRC_HEADER)) |
	    V_CPL_TX_DATA_ISO_PLDCRC(!!(ulp_submode & ULP_CRC_DATA)) |
	    V_CPL_TX_DATA_ISO_IMMEDIATE(0) |
	    V_CPL_TX_DATA_ISO_SCSI(CXGBE_ISO_TYPE(flags)));

	cpl->ahs_len = 0;
	cpl->mpdu = htons(DIV_ROUND_UP(mss, 4));
	cpl->burst_size = htonl(DIV_ROUND_UP(burst_size, 4));
	cpl->len = htonl(len);
	cpl->reserved2_seglen_offset = htonl(0);
	cpl->datasn_offset = htonl(0);
	cpl->buffer_offset = htonl(0);
	cpl->reserved3 = 0;
}

static struct wrqe *
write_iscsi_mbuf_wr(struct toepcb *toep, struct mbuf *sndptr)
{
	struct mbuf *m;
	struct fw_ofld_tx_data_wr *txwr;
	struct cpl_tx_data_iso *cpl_iso;
	void *p;
	struct wrqe *wr;
	u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
	u_int adjusted_plen, imm_data, ulp_submode;
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = intotcpcb(inp);
	int tx_credits, shove, npdu, wr_len;
	uint16_t iso_mss;
	static const u_int ulp_extra_len[] = {0, 4, 4, 8};
	bool iso, nomap_mbuf_seen;

	M_ASSERTPKTHDR(sndptr);

	tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
	if (mbuf_raw_wr(sndptr)) {
		plen = sndptr->m_pkthdr.len;
		KASSERT(plen <= SGE_MAX_WR_LEN,
		    ("raw WR len %u is greater than max WR len", plen));
		if (plen > tx_credits * 16)
			return (NULL);

		wr = alloc_wrqe(roundup2(plen, 16), &toep->ofld_txq->wrq);
		if (__predict_false(wr == NULL))
			return (NULL);

		m_copydata(sndptr, 0, plen, wrtod(wr));
		return (wr);
	}

	iso = mbuf_iscsi_iso(sndptr);
	max_imm = max_imm_payload(tx_credits, iso);
	max_nsegs = max_dsgl_nsegs(tx_credits, iso);
	iso_mss = mbuf_iscsi_iso_mss(sndptr);

	plen = 0;
	nsegs = 0;
	max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
	nomap_mbuf_seen = false;
	for (m = sndptr; m != NULL; m = m->m_next) {
		int n;

		if (m->m_flags & M_EXTPG)
			n = sglist_count_mbuf_epg(m, mtod(m, vm_offset_t),
			    m->m_len);
		else
			n = sglist_count(mtod(m, void *), m->m_len);

		nsegs += n;
		plen += m->m_len;

		/*
		 * This mbuf would send us _over_ the nsegs limit.
		 * Suspend tx because the PDU can't be sent out.
		 */
		if ((nomap_mbuf_seen || plen > max_imm) && nsegs > max_nsegs)
			return (NULL);

		if (m->m_flags & M_EXTPG)
			nomap_mbuf_seen = true;
		if (max_nsegs_1mbuf < n)
			max_nsegs_1mbuf = n;
	}

	if (__predict_false(toep->flags & TPF_FIN_SENT))
		panic("%s: excess tx.", __func__);

	/*
	 * We have a PDU to send.  All of it goes out in one WR so 'm'
	 * is NULL.  A PDU's length is always a multiple of 4.
	 */
	MPASS(m == NULL);
	MPASS((plen & 3) == 0);
	MPASS(sndptr->m_pkthdr.len == plen);

	shove = !(tp->t_flags & TF_MORETOCOME);

	/*
	 * plen doesn't include header and data digests, which are
	 * generated and inserted in the right places by the TOE, but
	 * they do occupy TCP sequence space and need to be accounted
	 * for.
	 */
	ulp_submode = mbuf_ulp_submode(sndptr);
	MPASS(ulp_submode < nitems(ulp_extra_len));
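
	/*
	 * Worked example (illustrative numbers only): a 16432-byte logical
	 * PDU (48-byte BHS + 16384 bytes of data) with iso_mss = 8192 and
	 * both digests enabled (ulp_extra_len[ulp_submode] == 8) is sliced
	 * into npdu = howmany(16384, 8192) = 2 wire PDUs, so the TCP sequence
	 * space consumed is adjusted_plen = 16432 + 8 * 2 + 48 = 16496.
	 */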
	npdu = iso ? howmany(plen - ISCSI_BHS_SIZE, iso_mss) : 1;
	adjusted_plen = plen + ulp_extra_len[ulp_submode] * npdu;
	if (iso)
		adjusted_plen += ISCSI_BHS_SIZE * (npdu - 1);
	wr_len = sizeof(*txwr);
	if (iso)
		wr_len += sizeof(struct cpl_tx_data_iso);
	if (plen <= max_imm && !nomap_mbuf_seen) {
		/* Immediate data tx */
		imm_data = plen;
		wr_len += plen;
		nsegs = 0;
	} else {
		/* DSGL tx */
		imm_data = 0;
		wr_len += sizeof(struct ulptx_sgl) +
		    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
	}

	wr = alloc_wrqe(roundup2(wr_len, 16), &toep->ofld_txq->wrq);
	if (wr == NULL) {
		/* XXX: how will we recover from this? */
		return (NULL);
	}
	txwr = wrtod(wr);
	credits = howmany(wr->wr_len, 16);

	if (iso) {
		write_tx_wr(txwr, toep, FW_ISCSI_TX_DATA_WR,
		    imm_data + sizeof(struct cpl_tx_data_iso),
		    adjusted_plen, credits, shove, ulp_submode | ULP_ISO);
		cpl_iso = (struct cpl_tx_data_iso *)(txwr + 1);
		MPASS(plen == sndptr->m_pkthdr.len);
		write_tx_data_iso(cpl_iso, ulp_submode,
		    mbuf_iscsi_iso_flags(sndptr), iso_mss, plen, npdu);
		p = cpl_iso + 1;
	} else {
		write_tx_wr(txwr, toep, FW_OFLD_TX_DATA_WR, imm_data,
		    adjusted_plen, credits, shove, ulp_submode);
		p = txwr + 1;
	}

	if (imm_data != 0) {
		m_copydata(sndptr, 0, plen, p);
	} else {
		write_tx_sgl(p, sndptr, m, nsegs, max_nsegs_1mbuf);
		if (wr_len & 0xf) {
			uint64_t *pad = (uint64_t *)((uintptr_t)txwr + wr_len);
			*pad = 0;
		}
	}

	KASSERT(toep->tx_credits >= credits,
	    ("%s: not enough credits: credits %u "
	    "toep->tx_credits %u tx_credits %u nsegs %u "
	    "max_nsegs %u iso %d", __func__, credits,
	    toep->tx_credits, tx_credits, nsegs, max_nsegs, iso));

	tp->snd_nxt += adjusted_plen;
	tp->snd_max += adjusted_plen;

	counter_u64_add(toep->ofld_txq->tx_iscsi_pdus, npdu);
	counter_u64_add(toep->ofld_txq->tx_iscsi_octets, plen);
	if (iso)
		counter_u64_add(toep->ofld_txq->tx_iscsi_iso_wrs, 1);

	return (wr);
}

void
t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop)
{
	struct mbuf *sndptr, *m;
	struct fw_wr_hdr *wrhdr;
	struct wrqe *wr;
	u_int plen, credits;
	struct inpcb *inp = toep->inp;
	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
	struct mbufq *pduq = &toep->ulp_pduq;

	INP_WLOCK_ASSERT(inp);
	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
	KASSERT(ulp_mode(toep) == ULP_MODE_ISCSI,
	    ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep));

	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
		return;

	/*
	 * This function doesn't resume by itself.  Someone else must clear the
	 * flag and call this function.
	 */
	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
		KASSERT(drop == 0,
		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
		return;
	}

	if (drop) {
		struct socket *so = inp->inp_socket;
		struct sockbuf *sb = &so->so_snd;
		int sbu;

		/*
		 * An unlocked read is ok here as the data should only
		 * transition from a non-zero value to either another
		 * non-zero value or zero.  Once it is zero it should
		 * stay zero.
		 */
		if (__predict_false(sbused(sb)) > 0) {
			SOCKBUF_LOCK(sb);
			sbu = sbused(sb);
			if (sbu > 0) {
				/*
				 * The data transmitted before the
				 * tid's ULP mode changed to ISCSI is
				 * still in so_snd.  Incoming credits
				 * should account for so_snd first.
				 */
				sbdrop_locked(sb, min(sbu, drop));
				drop -= min(sbu, drop);
			}
			sowwakeup_locked(so);	/* unlocks so_snd */
		}
		rqdrop_locked(&toep->ulp_pdu_reclaimq, drop);
	}

	while ((sndptr = mbufq_first(pduq)) != NULL) {
		wr = write_iscsi_mbuf_wr(toep, sndptr);
		if (wr == NULL) {
			toep->flags |= TPF_TX_SUSPENDED;
			return;
		}

		plen = sndptr->m_pkthdr.len;
		credits = howmany(wr->wr_len, 16);
		KASSERT(toep->tx_credits >= credits,
		    ("%s: not enough credits", __func__));

		m = mbufq_dequeue(pduq);
		MPASS(m == sndptr);
		mbufq_enqueue(&toep->ulp_pdu_reclaimq, m);

		toep->tx_credits -= credits;
		toep->tx_nocompl += credits;
		toep->plen_nocompl += plen;

		/*
		 * Ensure there are enough credits for a full-sized WR
		 * as page pod WRs can be full-sized.
		 */
		if (toep->tx_credits <= SGE_MAX_WR_LEN * 5 / 4 &&
		    toep->tx_nocompl >= toep->tx_total / 4) {
			wrhdr = wrtod(wr);
			wrhdr->hi |= htobe32(F_FW_WR_COMPL);
			toep->tx_nocompl = 0;
			toep->plen_nocompl = 0;
		}

		toep->flags |= TPF_TX_DATA_SENT;
		if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
			toep->flags |= TPF_TX_SUSPENDED;

		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
		txsd->plen = plen;
		txsd->tx_credits = credits;
		txsd++;
		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
			toep->txsd_pidx = 0;
			txsd = &toep->txsd[0];
		}
		toep->txsd_avail--;

		t4_l2t_send(sc, wr, toep->l2te);
	}

	/* Send a FIN if requested, but only if there are no more PDUs to send */
	if (mbufq_first(pduq) == NULL && toep->flags & TPF_SEND_FIN)
		t4_close_conn(sc, toep);
}

static inline void
t4_push_data(struct adapter *sc, struct toepcb *toep, int drop)
{

	if (ulp_mode(toep) == ULP_MODE_ISCSI)
		t4_push_pdus(sc, toep, drop);
	else if (toep->flags & TPF_KTLS)
		t4_push_ktls(sc, toep, drop);
	else
		t4_push_frames(sc, toep, drop);
}

int
t4_tod_output(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
#ifdef INVARIANTS
	struct inpcb *inp = tptoinpcb(tp);
#endif
	struct toepcb *toep = tp->t_toe;

	INP_WLOCK_ASSERT(inp);
	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
	    ("%s: inp %p dropped.", __func__, inp));
	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));

	t4_push_data(sc, toep, 0);

	return (0);
}

int
t4_send_fin(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
#ifdef INVARIANTS
	struct inpcb *inp = tptoinpcb(tp);
#endif
	struct toepcb *toep = tp->t_toe;

	INP_WLOCK_ASSERT(inp);
	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
	    ("%s: inp %p dropped.", __func__, inp));
	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));

	toep->flags |= TPF_SEND_FIN;
	if (tp->t_state >= TCPS_ESTABLISHED)
		t4_push_data(sc, toep, 0);

	return (0);
}

int
t4_send_rst(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
#if defined(INVARIANTS)
	struct inpcb *inp = tptoinpcb(tp);
#endif
	struct toepcb *toep = tp->t_toe;

	INP_WLOCK_ASSERT(inp);
	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
	    ("%s: inp %p dropped.", __func__, inp));
	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));

	/* hmmmm */
	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
	    ("%s: flowc for tid %u [%s] not sent already",
	    __func__, toep->tid, tcpstates[tp->t_state]));

	send_reset(sc, toep, 0);
	return (0);
}

/*
 * Peer has sent us a FIN.
 */
static int
do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_peer_close *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = NULL;
	struct socket *so;
	struct epoch_tracker et;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_PEER_CLOSE,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));

	if (__predict_false(toep->flags & TPF_SYNQE)) {
		/*
		 * do_pass_establish must have run before do_peer_close and if
		 * this is still a synqe instead of a toepcb then the connection
		 * must be getting aborted.
		 */
		MPASS(toep->flags & TPF_ABORT_SHUTDOWN);
		CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid,
		    toep, toep->flags);
		return (0);
	}

	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	CURVNET_SET(toep->vnet);
	NET_EPOCH_ENTER(et);
	INP_WLOCK(inp);
	tp = intotcpcb(inp);

	CTR6(KTR_CXGBE,
	    "%s: tid %u (%s), toep_flags 0x%x, ddp_flags 0x%x, inp %p",
	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags,
	    toep->ddp.flags, inp);

	if (toep->flags & TPF_ABORT_SHUTDOWN)
		goto done;

	if (ulp_mode(toep) == ULP_MODE_TCPDDP) {
		DDP_LOCK(toep);
		if (__predict_false(toep->ddp.flags &
		    (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)))
			handle_ddp_close(toep, tp, cpl->rcv_nxt);
		DDP_UNLOCK(toep);
	}
	so = inp->inp_socket;
	socantrcvmore(so);

	if (ulp_mode(toep) == ULP_MODE_RDMA ||
	    (ulp_mode(toep) == ULP_MODE_ISCSI && chip_id(sc) >= CHELSIO_T6)) {
		/*
		 * There might be data received via DDP before the FIN
		 * not reported to the driver.  Just assume the
		 * sequence number in the CPL is correct as the
		 * sequence number of the FIN.
		 */
	} else {
		KASSERT(tp->rcv_nxt + 1 == be32toh(cpl->rcv_nxt),
		    ("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt,
		    be32toh(cpl->rcv_nxt)));
	}

	tp->rcv_nxt = be32toh(cpl->rcv_nxt);

	switch (tp->t_state) {
	case TCPS_SYN_RECEIVED:
		tp->t_starttime = ticks;
		/* FALLTHROUGH */

	case TCPS_ESTABLISHED:
		tcp_state_change(tp, TCPS_CLOSE_WAIT);
		break;

	case TCPS_FIN_WAIT_1:
		tcp_state_change(tp, TCPS_CLOSING);
		break;

	case TCPS_FIN_WAIT_2:
		restore_so_proto(so, inp->inp_vflag & INP_IPV6);
		t4_pcb_detach(NULL, tp);
		tcp_twstart(tp);
		INP_UNLOCK_ASSERT(inp);	/* safe, we have a ref on the inp */
		NET_EPOCH_EXIT(et);
		CURVNET_RESTORE();

		INP_WLOCK(inp);
		final_cpl_received(toep);
		return (0);

	default:
		log(LOG_ERR, "%s: TID %u received CPL_PEER_CLOSE in state %d\n",
		    __func__, tid, tp->t_state);
	}
done:
	INP_WUNLOCK(inp);
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();
	return (0);
}

/*
 * Peer has ACK'd our FIN.
 */
static int
do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_close_con_rpl *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = NULL;
	struct socket *so = NULL;
	struct epoch_tracker et;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_CLOSE_CON_RPL,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	CURVNET_SET(toep->vnet);
	NET_EPOCH_ENTER(et);
	INP_WLOCK(inp);
	tp = intotcpcb(inp);

	CTR4(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x",
	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags);

	if (toep->flags & TPF_ABORT_SHUTDOWN)
		goto done;

	so = inp->inp_socket;
	tp->snd_una = be32toh(cpl->snd_nxt) - 1;	/* exclude FIN */

	switch (tp->t_state) {
	case TCPS_CLOSING:	/* see TCPS_FIN_WAIT_2 in do_peer_close too */
		restore_so_proto(so, inp->inp_vflag & INP_IPV6);
		t4_pcb_detach(NULL, tp);
		tcp_twstart(tp);
release:
		INP_UNLOCK_ASSERT(inp);	/* safe, we have a ref on the inp */
		NET_EPOCH_EXIT(et);
		CURVNET_RESTORE();

		INP_WLOCK(inp);
		final_cpl_received(toep);	/* no more CPLs expected */

		return (0);
	case TCPS_LAST_ACK:
		if (tcp_close(tp))
			INP_WUNLOCK(inp);
		goto release;

	case TCPS_FIN_WAIT_1:
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
			soisdisconnected(so);
		tcp_state_change(tp, TCPS_FIN_WAIT_2);
		break;

	default:
		log(LOG_ERR,
		    "%s: TID %u received CPL_CLOSE_CON_RPL in state %s\n",
		    __func__, tid, tcpstates[tp->t_state]);
	}
done:
	INP_WUNLOCK(inp);
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();
	return (0);
}

void
send_abort_rpl(struct adapter *sc, struct sge_ofld_txq *ofld_txq, int tid,
    int rst_status)
{
	struct wrqe *wr;
	struct cpl_abort_rpl *cpl;

	wr = alloc_wrqe(sizeof(*cpl), &ofld_txq->wrq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	cpl = wrtod(wr);

	INIT_TP_WR_MIT_CPL(cpl, CPL_ABORT_RPL, tid);
	cpl->cmd = rst_status;

	t4_wrq_tx(sc, wr);
}

static int
abort_status_to_errno(struct tcpcb *tp, unsigned int abort_reason)
{
	switch (abort_reason) {
	case CPL_ERR_BAD_SYN:
	case CPL_ERR_CONN_RESET:
		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
	case CPL_ERR_XMIT_TIMEDOUT:
	case CPL_ERR_PERSIST_TIMEDOUT:
	case CPL_ERR_FINWAIT2_TIMEDOUT:
	case CPL_ERR_KEEPALIVE_TIMEDOUT:
		return (ETIMEDOUT);
	default:
		return (EIO);
	}
}

/*
 * TCP RST from the peer, timeout, or some other such critical error.
 */
static int
do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct sge_ofld_txq *ofld_txq = toep->ofld_txq;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct epoch_tracker et;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_ABORT_REQ_RSS,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));

	if (toep->flags & TPF_SYNQE)
		return (do_abort_req_synqe(iq, rss, m));

	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	if (negative_advice(cpl->status)) {
		CTR4(KTR_CXGBE, "%s: negative advice %d for tid %d (0x%x)",
		    __func__, cpl->status, tid, toep->flags);
		return (0);	/* Ignore negative advice */
	}

	inp = toep->inp;
	CURVNET_SET(toep->vnet);
	NET_EPOCH_ENTER(et);	/* for tcp_close */
	INP_WLOCK(inp);

	tp = intotcpcb(inp);

	CTR6(KTR_CXGBE,
	    "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x, status %d",
	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags,
	    inp->inp_flags, cpl->status);

	/*
	 * If we'd initiated an abort earlier the reply to it is responsible for
	 * cleaning up resources.  Otherwise we tear everything down right here
	 * right now.  We owe the T4 a CPL_ABORT_RPL no matter what.
	 */
	if (toep->flags & TPF_ABORT_SHUTDOWN) {
		INP_WUNLOCK(inp);
		goto done;
	}
	toep->flags |= TPF_ABORT_SHUTDOWN;

	if ((inp->inp_flags & INP_DROPPED) == 0) {
		struct socket *so = inp->inp_socket;

		if (so != NULL)
			so_error_set(so, abort_status_to_errno(tp,
			    cpl->status));
		tp = tcp_close(tp);
		if (tp == NULL)
			INP_WLOCK(inp);	/* re-acquire */
	}

	final_cpl_received(toep);
done:
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();
	send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
	return (0);
}

/*
 * Reply to the CPL_ABORT_REQ (send_reset)
 */
static int
do_abort_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_ABORT_RPL_RSS,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));

	if (toep->flags & TPF_SYNQE)
		return (do_abort_rpl_synqe(iq, rss, m));

	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	CTR5(KTR_CXGBE, "%s: tid %u, toep %p, inp %p, status %d",
	    __func__, tid, toep, inp, cpl->status);

	KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
	    ("%s: wasn't expecting abort reply", __func__));

	INP_WLOCK(inp);
	final_cpl_received(toep);

	return (0);
}

static int
do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_rx_data *cpl = mtod(m, const void *);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp;
	struct socket *so;
	struct sockbuf *sb;
	struct epoch_tracker et;
	int len;
	uint32_t ddp_placed = 0;

	if (__predict_false(toep->flags & TPF_SYNQE)) {
		/*
		 * do_pass_establish must have run before do_rx_data and if this
		 * is still a synqe instead of a toepcb then the connection must
		 * be getting aborted.
		 */
		MPASS(toep->flags & TPF_ABORT_SHUTDOWN);
		CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid,
		    toep, toep->flags);
		m_freem(m);
		return (0);
	}

	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	/* strip off CPL header */
	m_adj(m, sizeof(*cpl));
	len = m->m_pkthdr.len;

	INP_WLOCK(inp);
	if (inp->inp_flags & INP_DROPPED) {
		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
		    __func__, tid, len, inp->inp_flags);
		INP_WUNLOCK(inp);
		m_freem(m);
		return (0);
	}

	tp = intotcpcb(inp);

	if (__predict_false(ulp_mode(toep) == ULP_MODE_TLS &&
	    toep->flags & TPF_TLS_RECEIVE)) {
		/* Received "raw" data on a TLS socket. */
		CTR3(KTR_CXGBE, "%s: tid %u, raw TLS data (%d bytes)",
		    __func__, tid, len);
		do_rx_data_tls(cpl, toep, m);
		return (0);
	}

	if (__predict_false(tp->rcv_nxt != be32toh(cpl->seq)))
		ddp_placed = be32toh(cpl->seq) - tp->rcv_nxt;

	tp->rcv_nxt += len;
	if (tp->rcv_wnd < len) {
		KASSERT(ulp_mode(toep) == ULP_MODE_RDMA,
		    ("%s: negative window size", __func__));
	}

	tp->rcv_wnd -= len;
	tp->t_rcvtime = ticks;

	if (ulp_mode(toep) == ULP_MODE_TCPDDP)
		DDP_LOCK(toep);
	so = inp_inpcbtosocket(inp);
	sb = &so->so_rcv;
	SOCKBUF_LOCK(sb);

	if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
		CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)",
		    __func__, tid, len);
		m_freem(m);
		SOCKBUF_UNLOCK(sb);
		if (ulp_mode(toep) == ULP_MODE_TCPDDP)
			DDP_UNLOCK(toep);
		INP_WUNLOCK(inp);

		CURVNET_SET(toep->vnet);
		NET_EPOCH_ENTER(et);
		INP_WLOCK(inp);
		tp = tcp_drop(tp, ECONNRESET);
		if (tp)
			INP_WUNLOCK(inp);
		NET_EPOCH_EXIT(et);
		CURVNET_RESTORE();

		return (0);
	}

	/* receive buffer autosize */
	MPASS(toep->vnet == so->so_vnet);
	CURVNET_SET(toep->vnet);
	if (sb->sb_flags & SB_AUTOSIZE &&
	    V_tcp_do_autorcvbuf &&
	    sb->sb_hiwat < V_tcp_autorcvbuf_max &&
	    len > (sbspace(sb) / 8 * 7)) {
		unsigned int hiwat = sb->sb_hiwat;
		unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc,
		    V_tcp_autorcvbuf_max);

		if (!sbreserve_locked(so, SO_RCV, newsize, NULL))
			sb->sb_flags &= ~SB_AUTOSIZE;
	}

	if (ulp_mode(toep) == ULP_MODE_TCPDDP) {
		int changed = !(toep->ddp.flags & DDP_ON) ^ cpl->ddp_off;

		if (toep->ddp.waiting_count != 0 || toep->ddp.active_count != 0)
			CTR3(KTR_CXGBE, "%s: tid %u, non-ddp rx (%d bytes)",
			    __func__, tid, len);

		if (changed) {
			if (toep->ddp.flags & DDP_SC_REQ)
				toep->ddp.flags ^= DDP_ON | DDP_SC_REQ;
			else if (cpl->ddp_off == 1) {
				/* Fell out of DDP mode */
				toep->ddp.flags &= ~DDP_ON;
				CTR1(KTR_CXGBE, "%s: fell out of DDP mode",
				    __func__);

				insert_ddp_data(toep, ddp_placed);
			} else {
				/*
				 * Data was received while still
				 * ULP_MODE_NONE, just fall through.
				 */
			}
		}

		if (toep->ddp.flags & DDP_ON) {
			/*
			 * CPL_RX_DATA with DDP on can only be an indicate.
			 * Start posting queued AIO requests via DDP.  The
			 * payload that arrived in this indicate is appended
			 * to the socket buffer as usual.
			 */
			handle_ddp_indicate(toep);
		}
	}

	sbappendstream_locked(sb, m, 0);
	t4_rcvd_locked(&toep->td->tod, tp);

	if (ulp_mode(toep) == ULP_MODE_TCPDDP &&
	    (toep->ddp.flags & DDP_AIO) != 0 && toep->ddp.waiting_count > 0 &&
	    sbavail(sb) != 0) {
		CTR2(KTR_CXGBE, "%s: tid %u queueing AIO task", __func__,
		    tid);
		ddp_queue_toep(toep);
	}
	if (toep->flags & TPF_TLS_STARTING)
		tls_received_starting_data(sc, toep, sb, len);
	sorwakeup_locked(so);
	SOCKBUF_UNLOCK_ASSERT(sb);
	if (ulp_mode(toep) == ULP_MODE_TCPDDP)
		DDP_UNLOCK(toep);

	INP_WUNLOCK(inp);
	CURVNET_RESTORE();
	return (0);
}

static int
do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_fw4_ack *cpl = (const void *)(rss + 1);
	unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl)));
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp;
	struct tcpcb *tp;
	struct socket *so;
	uint8_t credits = cpl->credits;
	struct ofld_tx_sdesc *txsd;
	int plen;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	/*
	 * Very unusual case: we'd sent a flowc + abort_req for a synq entry and
	 * now this comes back carrying the credits for the flowc.
	 */
	if (__predict_false(toep->flags & TPF_SYNQE)) {
		KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
		    ("%s: credits for a synq entry %p", __func__, toep));
		return (0);
	}

	inp = toep->inp;

	KASSERT(opcode == CPL_FW4_ACK,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	INP_WLOCK(inp);

	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) {
		INP_WUNLOCK(inp);
		return (0);
	}

	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
	    ("%s: inp_flags 0x%x", __func__, inp->inp_flags));

	tp = intotcpcb(inp);

	if (cpl->flags & CPL_FW4_ACK_FLAGS_SEQVAL) {
		tcp_seq snd_una = be32toh(cpl->snd_una);

#ifdef INVARIANTS
		if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
			log(LOG_ERR,
			    "%s: unexpected seq# %x for TID %u, snd_una %x\n",
			    __func__, snd_una, toep->tid, tp->snd_una);
		}
#endif

		if (tp->snd_una != snd_una) {
			tp->snd_una = snd_una;
			tp->ts_recent_age = tcp_ts_getticks();
		}
	}

#ifdef VERBOSE_TRACES
	CTR3(KTR_CXGBE, "%s: tid %d credits %u", __func__, tid, credits);
#endif
	so = inp->inp_socket;
	txsd = &toep->txsd[toep->txsd_cidx];
	plen = 0;
	while (credits) {
		KASSERT(credits >= txsd->tx_credits,
		    ("%s: too many (or partial) credits", __func__));
		credits -= txsd->tx_credits;
		toep->tx_credits += txsd->tx_credits;
		plen += txsd->plen;
		txsd++;
		toep->txsd_avail++;
		KASSERT(toep->txsd_avail <= toep->txsd_total,
		    ("%s: txsd avail > total", __func__));
		if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) {
			txsd = &toep->txsd[0];
			toep->txsd_cidx = 0;
		}
	}

	if (toep->tx_credits == toep->tx_total) {
		toep->tx_nocompl = 0;
		toep->plen_nocompl = 0;
	}
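
	/*
	 * The loop above retires tx descriptors strictly in order: a single
	 * CPL_FW4_ACK carrying, say, 14 credits after WRs of 5 and 9 credits
	 * were sent completes both txsd entries, returns their credits to
	 * toep->tx_credits, and accumulates their payload lengths in plen for
	 * the send-buffer drop below.
	 */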

	if (toep->flags & TPF_TX_SUSPENDED &&
	    toep->tx_credits >= toep->tx_total / 4) {
#ifdef VERBOSE_TRACES
		CTR2(KTR_CXGBE, "%s: tid %d calling t4_push_frames", __func__,
		    tid);
#endif
		toep->flags &= ~TPF_TX_SUSPENDED;
		CURVNET_SET(toep->vnet);
		t4_push_data(sc, toep, plen);
		CURVNET_RESTORE();
	} else if (plen > 0) {
		struct sockbuf *sb = &so->so_snd;
		int sbu;

		SOCKBUF_LOCK(sb);
		sbu = sbused(sb);
		if (ulp_mode(toep) == ULP_MODE_ISCSI) {
			if (__predict_false(sbu > 0)) {
				/*
				 * The data transmitted before the
				 * tid's ULP mode changed to ISCSI is
				 * still in so_snd.  Incoming credits
				 * should account for so_snd first.
				 */
				sbdrop_locked(sb, min(sbu, plen));
				plen -= min(sbu, plen);
			}
			sowwakeup_locked(so);	/* unlocks so_snd */
			rqdrop_locked(&toep->ulp_pdu_reclaimq, plen);
		} else {
#ifdef VERBOSE_TRACES
			CTR3(KTR_CXGBE, "%s: tid %d dropped %d bytes", __func__,
			    tid, plen);
#endif
			sbdrop_locked(sb, plen);
			if (!TAILQ_EMPTY(&toep->aiotx_jobq))
				t4_aiotx_queue_toep(so, toep);
			sowwakeup_locked(so);	/* unlocks so_snd */
		}
		SOCKBUF_UNLOCK_ASSERT(sb);
	}

	INP_WUNLOCK(inp);

	return (0);
}

void
t4_set_tcb_field(struct adapter *sc, struct sge_wrq *wrq, struct toepcb *toep,
    uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie)
{
	struct wrqe *wr;
	struct cpl_set_tcb_field *req;
	struct ofld_tx_sdesc *txsd;

	MPASS((cookie & ~M_COOKIE) == 0);
	if (reply) {
		MPASS(cookie != CPL_COOKIE_RESERVED);
	}

	wr = alloc_wrqe(sizeof(*req), wrq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	req = wrtod(wr);

	INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, toep->tid);
	req->reply_ctrl = htobe16(V_QUEUENO(toep->ofld_rxq->iq.abs_id));
	if (reply == 0)
		req->reply_ctrl |= htobe16(F_NO_REPLY);
	req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(cookie));
	req->mask = htobe64(mask);
	req->val = htobe64(val);
	if (wrq->eq.type == EQ_OFLD) {
		txsd = &toep->txsd[toep->txsd_pidx];
		txsd->tx_credits = howmany(sizeof(*req), 16);
		txsd->plen = 0;
		KASSERT(toep->tx_credits >= txsd->tx_credits &&
		    toep->txsd_avail > 0,
		    ("%s: not enough credits (%d)", __func__,
		    toep->tx_credits));
		toep->tx_credits -= txsd->tx_credits;
		if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
			toep->txsd_pidx = 0;
		toep->txsd_avail--;
	}

	t4_wrq_tx(sc, wr);
}

void
t4_init_cpl_io_handlers(void)
{

	t4_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
	t4_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
	t4_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
	t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl,
	    CPL_COOKIE_TOM);
	t4_register_cpl_handler(CPL_RX_DATA, do_rx_data);
	t4_register_shared_cpl_handler(CPL_FW4_ACK, do_fw4_ack, CPL_COOKIE_TOM);
}

void
t4_uninit_cpl_io_handlers(void)
{

	t4_register_cpl_handler(CPL_PEER_CLOSE, NULL);
	t4_register_cpl_handler(CPL_CLOSE_CON_RPL, NULL);
	t4_register_cpl_handler(CPL_ABORT_REQ_RSS, NULL);
	t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, NULL, CPL_COOKIE_TOM);
	t4_register_cpl_handler(CPL_RX_DATA, NULL);
	t4_register_shared_cpl_handler(CPL_FW4_ACK, NULL, CPL_COOKIE_TOM);
}
 * be reported when the job is completed, the 'backend3' field to
 * store the amount of data sent by the AIO job so far, and the
 * 'backend4' field to hold a reference count on the job.
 *
 * Each unmapped mbuf holds a reference on the job as does the queue
 * so long as the job is queued.
 */
#define aio_error	backend1
#define aio_sent	backend3
#define aio_refs	backend4

#ifdef VERBOSE_TRACES
static int
jobtotid(struct kaiocb *job)
{
	struct socket *so;
	struct tcpcb *tp;
	struct toepcb *toep;

	so = job->fd_file->f_data;
	tp = sototcpcb(so);
	toep = tp->t_toe;
	return (toep->tid);
}
#endif

static void
aiotx_free_job(struct kaiocb *job)
{
	long status;
	int error;

	if (refcount_release(&job->aio_refs) == 0)
		return;

	error = (intptr_t)job->aio_error;
	status = job->aio_sent;
#ifdef VERBOSE_TRACES
	CTR5(KTR_CXGBE, "%s: tid %d completed %p len %ld, error %d", __func__,
	    jobtotid(job), job, status, error);
#endif
	if (error != 0 && status != 0)
		error = 0;
	if (error == ECANCELED)
		aio_cancel(job);
	else if (error)
		aio_complete(job, -1, error);
	else {
		job->msgsnd = 1;
		aio_complete(job, status, 0);
	}
}

static void
aiotx_free_pgs(struct mbuf *m)
{
	struct kaiocb *job;
	vm_page_t pg;

	M_ASSERTEXTPG(m);
	job = m->m_ext.ext_arg1;
#ifdef VERBOSE_TRACES
	CTR3(KTR_CXGBE, "%s: completed %d bytes for tid %d", __func__,
	    m->m_len, jobtotid(job));
#endif

	for (int i = 0; i < m->m_epg_npgs; i++) {
		pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]);
		vm_page_unwire(pg, PQ_ACTIVE);
	}

	aiotx_free_job(job);
}

/*
 * Allocate a chain of unmapped mbufs describing the next 'len' bytes
 * of an AIO job.
 */
static struct mbuf *
alloc_aiotx_mbuf(struct kaiocb *job, int len)
{
	struct vmspace *vm;
	vm_page_t pgs[MBUF_PEXT_MAX_PGS];
	struct mbuf *m, *top, *last;
	vm_map_t map;
	vm_offset_t start;
	int i, mlen, npages, pgoff;

	KASSERT(job->aio_sent + len <= job->uaiocb.aio_nbytes,
	    ("%s(%p, %d): request to send beyond end of buffer", __func__,
	    job, len));

	/*
	 * The AIO subsystem will cancel and drain all requests before
	 * permitting a process to exit or exec, so p_vmspace should
	 * be stable here.
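	 *
	 * The pages backing the user's buffer are wired by
	 * vm_fault_quick_hold_pages() below and stay wired until
	 * aiotx_free_pgs() runs when the unmapped mbuf referencing
	 * them is freed, so the payload is transmitted directly from
	 * the user's address space without being copied into the
	 * socket buffer.  Each such mbuf also takes a reference on the
	 * job so that the job outlives its in-flight data.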
	 */
	vm = job->userproc->p_vmspace;
	map = &vm->vm_map;
	start = (uintptr_t)job->uaiocb.aio_buf + job->aio_sent;
	pgoff = start & PAGE_MASK;

	top = NULL;
	last = NULL;
	while (len > 0) {
		mlen = imin(len, MBUF_PEXT_MAX_PGS * PAGE_SIZE - pgoff);
		KASSERT(mlen == len || ((start + mlen) & PAGE_MASK) == 0,
		    ("%s: next start (%#jx + %#x) is not page aligned",
		    __func__, (uintmax_t)start, mlen));

		npages = vm_fault_quick_hold_pages(map, start, mlen,
		    VM_PROT_WRITE, pgs, nitems(pgs));
		if (npages < 0)
			break;

		m = mb_alloc_ext_pgs(M_WAITOK, aiotx_free_pgs, M_RDONLY);
		m->m_epg_1st_off = pgoff;
		m->m_epg_npgs = npages;
		if (npages == 1) {
			KASSERT(mlen + pgoff <= PAGE_SIZE,
			    ("%s: single page is too large (off %d len %d)",
			    __func__, pgoff, mlen));
			m->m_epg_last_len = mlen;
		} else {
			m->m_epg_last_len = mlen - (PAGE_SIZE - pgoff) -
			    (npages - 2) * PAGE_SIZE;
		}
		for (i = 0; i < npages; i++)
			m->m_epg_pa[i] = VM_PAGE_TO_PHYS(pgs[i]);

		m->m_len = mlen;
		m->m_ext.ext_size = npages * PAGE_SIZE;
		m->m_ext.ext_arg1 = job;
		refcount_acquire(&job->aio_refs);

#ifdef VERBOSE_TRACES
		CTR5(KTR_CXGBE, "%s: tid %d, new mbuf %p for job %p, npages %d",
		    __func__, jobtotid(job), m, job, npages);
#endif

		if (top == NULL)
			top = m;
		else
			last->m_next = m;
		last = m;

		len -= mlen;
		start += mlen;
		pgoff = 0;
	}

	return (top);
}

static void
t4_aiotx_process_job(struct toepcb *toep, struct socket *so, struct kaiocb *job)
{
	struct sockbuf *sb;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct mbuf *m;
	u_int sent;
	int error, len;
	bool moretocome, sendmore;

	sb = &so->so_snd;
	SOCKBUF_UNLOCK(sb);
	m = NULL;

#ifdef MAC
	error = mac_socket_check_send(job->fd_file->f_cred, so);
	if (error != 0)
		goto out;
#endif

	/* Inline sosend_generic(). */

	error = SOCK_IO_SEND_LOCK(so, SBL_WAIT);
	MPASS(error == 0);

sendanother:
	SOCKBUF_LOCK(sb);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		SOCKBUF_UNLOCK(sb);
		SOCK_IO_SEND_UNLOCK(so);
		if ((so->so_options & SO_NOSIGPIPE) == 0) {
			PROC_LOCK(job->userproc);
			kern_psignal(job->userproc, SIGPIPE);
			PROC_UNLOCK(job->userproc);
		}
		error = EPIPE;
		goto out;
	}
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		SOCKBUF_UNLOCK(sb);
		SOCK_IO_SEND_UNLOCK(so);
		goto out;
	}
	if ((so->so_state & SS_ISCONNECTED) == 0) {
		SOCKBUF_UNLOCK(sb);
		SOCK_IO_SEND_UNLOCK(so);
		error = ENOTCONN;
		goto out;
	}
	if (sbspace(sb) < sb->sb_lowat) {
		MPASS(job->aio_sent == 0 || !(so->so_state & SS_NBIO));

		/*
		 * Don't block if there is too little room in the socket
		 * buffer.  Instead, requeue the request.
		 */
		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
			SOCKBUF_UNLOCK(sb);
			SOCK_IO_SEND_UNLOCK(so);
			error = ECANCELED;
			goto out;
		}
		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
		SOCKBUF_UNLOCK(sb);
		SOCK_IO_SEND_UNLOCK(so);
		goto out;
	}

	/*
	 * Write as much data as the socket permits, but no more than
	 * a single sndbuf at a time.
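	 *
	 * If this pass will not exhaust the job, or other jobs are
	 * still queued, TF_MORETOCOME is set around the tcp_output()
	 * call below to indicate that more data will be appended to
	 * the socket buffer shortly.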
	 */
	len = sbspace(sb);
	if (len > job->uaiocb.aio_nbytes - job->aio_sent) {
		len = job->uaiocb.aio_nbytes - job->aio_sent;
		moretocome = false;
	} else
		moretocome = true;
	if (len > toep->params.sndbuf) {
		len = toep->params.sndbuf;
		sendmore = true;
	} else
		sendmore = false;

	if (!TAILQ_EMPTY(&toep->aiotx_jobq))
		moretocome = true;
	SOCKBUF_UNLOCK(sb);
	MPASS(len != 0);

	m = alloc_aiotx_mbuf(job, len);
	if (m == NULL) {
		SOCK_IO_SEND_UNLOCK(so);
		error = EFAULT;
		goto out;
	}

	/* Inlined tcp_usr_send(). */

	inp = toep->inp;
	INP_WLOCK(inp);
	if (inp->inp_flags & INP_DROPPED) {
		INP_WUNLOCK(inp);
		SOCK_IO_SEND_UNLOCK(so);
		error = ECONNRESET;
		goto out;
	}

	sent = m_length(m, NULL);
	job->aio_sent += sent;
	counter_u64_add(toep->ofld_txq->tx_aio_octets, sent);

	sbappendstream(sb, m, 0);
	m = NULL;

	if (!(inp->inp_flags & INP_DROPPED)) {
		tp = intotcpcb(inp);
		if (moretocome)
			tp->t_flags |= TF_MORETOCOME;
		error = tcp_output(tp);
		if (error < 0) {
			INP_UNLOCK_ASSERT(inp);
			SOCK_IO_SEND_UNLOCK(so);
			error = -error;
			goto out;
		}
		if (moretocome)
			tp->t_flags &= ~TF_MORETOCOME;
	}

	INP_WUNLOCK(inp);
	if (sendmore)
		goto sendanother;
	SOCK_IO_SEND_UNLOCK(so);

	if (error)
		goto out;

	/*
	 * If this is a blocking socket and the request has not been
	 * fully completed, requeue it until the socket is ready
	 * again.
	 */
	if (job->aio_sent < job->uaiocb.aio_nbytes &&
	    !(so->so_state & SS_NBIO)) {
		SOCKBUF_LOCK(sb);
		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
			SOCKBUF_UNLOCK(sb);
			error = ECANCELED;
			goto out;
		}
		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
		return;
	}

	/*
	 * If the request will not be requeued, drop the queue's
	 * reference to the job.  Any mbufs in flight should still
	 * hold a reference, but this drops the reference that the
	 * queue owns while it is waiting to queue mbufs to the
	 * socket.
	 */
	aiotx_free_job(job);
	counter_u64_add(toep->ofld_txq->tx_aio_jobs, 1);

out:
	if (error) {
		job->aio_error = (void *)(intptr_t)error;
		aiotx_free_job(job);
	}
	m_freem(m);
	SOCKBUF_LOCK(sb);
}

static void
t4_aiotx_task(void *context, int pending)
{
	struct toepcb *toep = context;
	struct socket *so;
	struct kaiocb *job;
	struct epoch_tracker et;

	so = toep->aiotx_so;
	CURVNET_SET(toep->vnet);
	NET_EPOCH_ENTER(et);
	SOCKBUF_LOCK(&so->so_snd);
	while (!TAILQ_EMPTY(&toep->aiotx_jobq) && sowriteable(so)) {
		job = TAILQ_FIRST(&toep->aiotx_jobq);
		TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
		if (!aio_clear_cancel_function(job))
			continue;

		t4_aiotx_process_job(toep, so, job);
	}
	toep->aiotx_so = NULL;
	SOCKBUF_UNLOCK(&so->so_snd);
	NET_EPOCH_EXIT(et);

	free_toepcb(toep);
	sorele(so);
	CURVNET_RESTORE();
}

static void
t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep)
{

	SOCKBUF_LOCK_ASSERT(&toep->inp->inp_socket->so_snd);
#ifdef VERBOSE_TRACES
	CTR3(KTR_CXGBE, "%s: queueing aiotx task for tid %d, active = %s",
	    __func__, toep->tid, toep->aiotx_so != NULL ? "true" : "false");
#endif
	if (toep->aiotx_so != NULL)
		return;
	soref(so);
	toep->aiotx_so = so;
	hold_toepcb(toep);
	soaio_enqueue(&toep->aiotx_task);
}

static void
t4_aiotx_cancel(struct kaiocb *job)
{
	struct socket *so;
	struct sockbuf *sb;
	struct tcpcb *tp;
	struct toepcb *toep;

	so = job->fd_file->f_data;
	tp = sototcpcb(so);
	toep = tp->t_toe;
	MPASS(job->uaiocb.aio_lio_opcode == LIO_WRITE);
	sb = &so->so_snd;

	SOCKBUF_LOCK(sb);
	if (!aio_cancel_cleared(job))
		TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
	SOCKBUF_UNLOCK(sb);

	job->aio_error = (void *)(intptr_t)ECANCELED;
	aiotx_free_job(job);
}

int
t4_aio_queue_aiotx(struct socket *so, struct kaiocb *job)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct adapter *sc = td_adapter(toep->td);

	/* This only handles writes. */
	if (job->uaiocb.aio_lio_opcode != LIO_WRITE)
		return (EOPNOTSUPP);

	if (!sc->tt.tx_zcopy)
		return (EOPNOTSUPP);

	if (tls_tx_key(toep))
		return (EOPNOTSUPP);

	SOCKBUF_LOCK(&so->so_snd);
#ifdef VERBOSE_TRACES
	CTR3(KTR_CXGBE, "%s: queueing %p for tid %u", __func__, job, toep->tid);
#endif
	if (!aio_set_cancel_function(job, t4_aiotx_cancel))
		panic("new job was cancelled");
	refcount_init(&job->aio_refs, 1);
	TAILQ_INSERT_TAIL(&toep->aiotx_jobq, job, list);
	if (sowriteable(so))
		t4_aiotx_queue_toep(so, toep);
	SOCKBUF_UNLOCK(&so->so_snd);
	return (0);
}

void
aiotx_init_toep(struct toepcb *toep)
{

	TAILQ_INIT(&toep->aiotx_jobq);
	TASK_INIT(&toep->aiotx_task, 0, t4_aiotx_task, toep);
}
#endif