/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2012, 2015 Chelsio Communications, Inc.
 * All rights reserved.
 * Written by: Navdeep Parhar <np@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_kern_tls.h"
#include "opt_ratelimit.h"

#ifdef TCP_OFFLOAD
#include <sys/param.h>
#include <sys/aio.h>
#include <sys/file.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/module.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sglist.h>
#include <sys/taskqueue.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#define TCPSTATES
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_var.h>
#include <netinet/toecore.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>

#include <dev/iscsi/iscsi_proto.h>

#include "common/common.h"
#include "common/t4_msg.h"
#include "common/t4_regs.h"
#include "common/t4_tcb.h"
#include "tom/t4_tom_l2t.h"
#include "tom/t4_tom.h"

static void t4_aiotx_cancel(struct kaiocb *job);
static void t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep);

void
send_flowc_wr(struct toepcb *toep, struct tcpcb *tp)
{
	struct wrqe *wr;
	struct fw_flowc_wr *flowc;
	unsigned int nparams, flowclen, paramidx;
	struct vi_info *vi = toep->vi;
	struct port_info *pi = vi->pi;
	struct adapter *sc = pi->adapter;
	unsigned int pfvf = sc->pf << S_FW_VIID_PFN;
	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];

	KASSERT(!(toep->flags & TPF_FLOWC_WR_SENT),
	    ("%s: flowc for tid %u sent already", __func__, toep->tid));

	if (tp != NULL)
		nparams = 8;
	else
		nparams = 6;
	if (toep->params.tc_idx != -1) {
		MPASS(toep->params.tc_idx >= 0 &&
		    toep->params.tc_idx < sc->params.nsched_cls);
		nparams++;
	}

	flowclen = sizeof(*flowc) +
	    nparams * sizeof(struct fw_flowc_mnemval);

	wr = alloc_wrqe(roundup2(flowclen, 16), &toep->ofld_txq->wrq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	flowc = wrtod(wr);
	memset(flowc, 0, wr->wr_len);

	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
	    V_FW_FLOWC_WR_NPARAMS(nparams));
	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
	    V_FW_WR_FLOWID(toep->tid));

#define FLOWC_PARAM(__m, __v) \
	do { \
		flowc->mnemval[paramidx].mnemonic = FW_FLOWC_MNEM_##__m; \
		flowc->mnemval[paramidx].val = htobe32(__v); \
		paramidx++; \
	} while (0)

	paramidx = 0;

	FLOWC_PARAM(PFNVFN, pfvf);
	FLOWC_PARAM(CH, pi->tx_chan);
	FLOWC_PARAM(PORT, pi->tx_chan);
	FLOWC_PARAM(IQID, toep->ofld_rxq->iq.abs_id);
	FLOWC_PARAM(SNDBUF, toep->params.sndbuf);
	if (tp) {
		FLOWC_PARAM(MSS, toep->params.emss);
		FLOWC_PARAM(SNDNXT, tp->snd_nxt);
		FLOWC_PARAM(RCVNXT, tp->rcv_nxt);
	} else
		FLOWC_PARAM(MSS, 512);
	CTR6(KTR_CXGBE,
	    "%s: tid %u, mss %u, sndbuf %u, snd_nxt 0x%x, rcv_nxt 0x%x",
	    __func__, toep->tid, toep->params.emss, toep->params.sndbuf,
	    tp ? tp->snd_nxt : 0, tp ? tp->rcv_nxt : 0);

	if (toep->params.tc_idx != -1)
		FLOWC_PARAM(SCHEDCLASS, toep->params.tc_idx);
#undef FLOWC_PARAM

	KASSERT(paramidx == nparams, ("nparams mismatch"));

	KASSERT(howmany(flowclen, 16) <= MAX_OFLD_TX_SDESC_CREDITS,
	    ("%s: tx_credits %u too large", __func__, howmany(flowclen, 16)));
	txsd->tx_credits = howmany(flowclen, 16);
	txsd->plen = 0;
	KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0,
	    ("%s: not enough credits (%d)", __func__, toep->tx_credits));
	toep->tx_credits -= txsd->tx_credits;
	if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
		toep->txsd_pidx = 0;
	toep->txsd_avail--;

	toep->flags |= TPF_FLOWC_WR_SENT;
	t4_wrq_tx(sc, wr);
}

#ifdef RATELIMIT
/*
 * Input is Bytes/second (so_max_pacing_rate), chip counts in Kilobits/second.
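 *
 * As an illustrative example (figures are editorial, not from the original
 * comment): a so_max_pacing_rate of 125,000,000 Bytes/s converts to
 * 125000000 * 8 / 1000 = 1,000,000 Kbit/s, i.e. roughly a 1 Gbit/s
 * scheduling class.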
 */
static int
update_tx_rate_limit(struct adapter *sc, struct toepcb *toep, u_int Bps)
{
	int tc_idx, rc;
	const u_int kbps = (u_int) (uint64_t)Bps * 8ULL / 1000;
	const int port_id = toep->vi->pi->port_id;

	CTR3(KTR_CXGBE, "%s: tid %u, rate %uKbps", __func__, toep->tid, kbps);

	if (kbps == 0) {
		/* unbind */
		tc_idx = -1;
	} else {
		rc = t4_reserve_cl_rl_kbps(sc, port_id, kbps, &tc_idx);
		if (rc != 0)
			return (rc);
		MPASS(tc_idx >= 0 && tc_idx < sc->params.nsched_cls);
	}

	if (toep->params.tc_idx != tc_idx) {
		struct wrqe *wr;
		struct fw_flowc_wr *flowc;
		int nparams = 1, flowclen, flowclen16;
		struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];

		flowclen = sizeof(*flowc) + nparams * sizeof(struct
		    fw_flowc_mnemval);
		flowclen16 = howmany(flowclen, 16);
		if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0 ||
		    (wr = alloc_wrqe(roundup2(flowclen, 16),
		    &toep->ofld_txq->wrq)) == NULL) {
			if (tc_idx >= 0)
				t4_release_cl_rl(sc, port_id, tc_idx);
			return (ENOMEM);
		}

		flowc = wrtod(wr);
		memset(flowc, 0, wr->wr_len);

		flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
		    V_FW_FLOWC_WR_NPARAMS(nparams));
		flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) |
		    V_FW_WR_FLOWID(toep->tid));

		flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS;
		if (tc_idx == -1)
			flowc->mnemval[0].val = htobe32(0xff);
		else
			flowc->mnemval[0].val = htobe32(tc_idx);

		KASSERT(flowclen16 <= MAX_OFLD_TX_SDESC_CREDITS,
		    ("%s: tx_credits %u too large", __func__, flowclen16));
		txsd->tx_credits = flowclen16;
		txsd->plen = 0;
		toep->tx_credits -= txsd->tx_credits;
		if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
			toep->txsd_pidx = 0;
		toep->txsd_avail--;
		t4_wrq_tx(sc, wr);
	}

	if (toep->params.tc_idx >= 0)
		t4_release_cl_rl(sc, port_id, toep->params.tc_idx);
	toep->params.tc_idx = tc_idx;

	return (0);
}
#endif

void
send_reset(struct adapter *sc, struct toepcb *toep, uint32_t snd_nxt)
{
	struct wrqe *wr;
	struct cpl_abort_req *req;
	int tid = toep->tid;
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = intotcpcb(inp);	/* don't use if INP_DROPPED */

	INP_WLOCK_ASSERT(inp);

	CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x%s",
	    __func__, toep->tid,
	    inp->inp_flags & INP_DROPPED ? "inp dropped" :
	    tcpstates[tp->t_state],
	    toep->flags, inp->inp_flags,
	    toep->flags & TPF_ABORT_SHUTDOWN ?
	    " (abort already in progress)" : "");

	if (toep->flags & TPF_ABORT_SHUTDOWN)
		return;	/* abort already in progress */

	toep->flags |= TPF_ABORT_SHUTDOWN;

	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
	    ("%s: flowc_wr not sent for tid %d.", __func__, tid));

	wr = alloc_wrqe(sizeof(*req), &toep->ofld_txq->wrq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	req = wrtod(wr);

	INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid);
	if (inp->inp_flags & INP_DROPPED)
		req->rsvd0 = htobe32(snd_nxt);
	else
		req->rsvd0 = htobe32(tp->snd_nxt);
	req->rsvd1 = !(toep->flags & TPF_TX_DATA_SENT);
	req->cmd = CPL_ABORT_SEND_RST;

	/*
	 * XXX: What's the correct way to tell that the inp hasn't been detached
	 * from its socket?  Should I even be flushing the snd buffer here?
	 */
	if ((inp->inp_flags & INP_DROPPED) == 0) {
		struct socket *so = inp->inp_socket;

		if (so != NULL)	/* because I'm not sure.  See comment above */
			sbflush(&so->so_snd);
	}

	t4_l2t_send(sc, wr, toep->l2te);
}

/*
 * Called when a connection is established to translate the TCP options
 * reported by HW to FreeBSD's native format.
 */
static void
assign_rxopt(struct tcpcb *tp, uint16_t opt)
{
	struct toepcb *toep = tp->t_toe;
	struct inpcb *inp = tptoinpcb(tp);
	struct adapter *sc = td_adapter(toep->td);

	INP_LOCK_ASSERT(inp);

	toep->params.mtu_idx = G_TCPOPT_MSS(opt);
	tp->t_maxseg = sc->params.mtus[toep->params.mtu_idx];
	if (inp->inp_inc.inc_flags & INC_ISIPV6)
		tp->t_maxseg -= sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
	else
		tp->t_maxseg -= sizeof(struct ip) + sizeof(struct tcphdr);

	toep->params.emss = tp->t_maxseg;
	if (G_TCPOPT_TSTAMP(opt)) {
		toep->params.tstamp = 1;
		toep->params.emss -= TCPOLEN_TSTAMP_APPA;
		tp->t_flags |= TF_RCVD_TSTMP;	/* timestamps ok */
		tp->ts_recent = 0;		/* hmmm */
		tp->ts_recent_age = tcp_ts_getticks();
	} else
		toep->params.tstamp = 0;

	if (G_TCPOPT_SACK(opt)) {
		toep->params.sack = 1;
		tp->t_flags |= TF_SACK_PERMIT;	/* should already be set */
	} else {
		toep->params.sack = 0;
		tp->t_flags &= ~TF_SACK_PERMIT;	/* sack disallowed by peer */
	}

	if (G_TCPOPT_WSCALE_OK(opt))
		tp->t_flags |= TF_RCVD_SCALE;

	/* Doing window scaling? */
	if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
	    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
		tp->rcv_scale = tp->request_r_scale;
		tp->snd_scale = G_TCPOPT_SND_WSCALE(opt);
	} else
		toep->params.wscale = 0;

	CTR6(KTR_CXGBE,
	    "assign_rxopt: tid %d, mtu_idx %u, emss %u, ts %u, sack %u, wscale %u",
	    toep->tid, toep->params.mtu_idx, toep->params.emss,
	    toep->params.tstamp, toep->params.sack, toep->params.wscale);
}

/*
 * Completes some final bits of initialization for just established connections
 * and changes their state to TCPS_ESTABLISHED.
 *
 * The ISNs are from the exchange of SYNs.
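 *
 * (Editorial note, not part of the original comment: snd_una/snd_nxt/snd_max
 * are seeded with iss + 1 because the SYN itself consumes one unit of
 * sequence space, and the initial receive window comes from the opt0 buffer
 * size, which is kept in 1KB units, hence the << 10 below.)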
 */
void
make_established(struct toepcb *toep, uint32_t iss, uint32_t irs, uint16_t opt)
{
	struct inpcb *inp = toep->inp;
	struct socket *so = inp->inp_socket;
	struct tcpcb *tp = intotcpcb(inp);
	uint16_t tcpopt = be16toh(opt);

	INP_WLOCK_ASSERT(inp);
	KASSERT(tp->t_state == TCPS_SYN_SENT ||
	    tp->t_state == TCPS_SYN_RECEIVED,
	    ("%s: TCP state %s", __func__, tcpstates[tp->t_state]));

	CTR6(KTR_CXGBE, "%s: tid %d, so %p, inp %p, tp %p, toep %p",
	    __func__, toep->tid, so, inp, tp, toep);

	tcp_state_change(tp, TCPS_ESTABLISHED);
	tp->t_starttime = ticks;
	TCPSTAT_INC(tcps_connects);

	tp->irs = irs;
	tcp_rcvseqinit(tp);
	tp->rcv_wnd = (u_int)toep->params.opt0_bufsize << 10;
	tp->rcv_adv += tp->rcv_wnd;
	tp->last_ack_sent = tp->rcv_nxt;

	tp->iss = iss;
	tcp_sendseqinit(tp);
	tp->snd_una = iss + 1;
	tp->snd_nxt = iss + 1;
	tp->snd_max = iss + 1;

	assign_rxopt(tp, tcpopt);
	send_flowc_wr(toep, tp);

	soisconnected(so);
}

int
send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits)
{
	struct wrqe *wr;
	struct cpl_rx_data_ack *req;
	uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);

	KASSERT(credits >= 0, ("%s: %d credits", __func__, credits));

	wr = alloc_wrqe(sizeof(*req), toep->ctrlq);
	if (wr == NULL)
		return (0);
	req = wrtod(wr);

	INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid);
	req->credit_dack = htobe32(dack | V_RX_CREDITS(credits));

	t4_wrq_tx(sc, wr);
	return (credits);
}

void
t4_rcvd_locked(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
	struct inpcb *inp = tptoinpcb(tp);
	struct socket *so = inp->inp_socket;
	struct sockbuf *sb = &so->so_rcv;
	struct toepcb *toep = tp->t_toe;
	int rx_credits;

	INP_WLOCK_ASSERT(inp);
	SOCKBUF_LOCK_ASSERT(sb);

	rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0;
	if (rx_credits > 0 &&
	    (tp->rcv_wnd <= 32 * 1024 || rx_credits >= 64 * 1024 ||
	    (rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) ||
	    sbused(sb) + tp->rcv_wnd < sb->sb_lowat)) {
		rx_credits = send_rx_credits(sc, toep, rx_credits);
		tp->rcv_wnd += rx_credits;
		tp->rcv_adv += rx_credits;
	}
}

void
t4_rcvd(struct toedev *tod, struct tcpcb *tp)
{
	struct inpcb *inp = tptoinpcb(tp);
	struct socket *so = inp->inp_socket;
	struct sockbuf *sb = &so->so_rcv;

	SOCKBUF_LOCK(sb);
	t4_rcvd_locked(tod, tp);
	SOCKBUF_UNLOCK(sb);
}

/*
 * Close a connection by sending a CPL_CLOSE_CON_REQ message.
 */
int
t4_close_conn(struct adapter *sc, struct toepcb *toep)
{
	struct wrqe *wr;
	struct cpl_close_con_req *req;
	unsigned int tid = toep->tid;

	CTR3(KTR_CXGBE, "%s: tid %u%s", __func__, toep->tid,
	    toep->flags & TPF_FIN_SENT ?
	    ", IGNORED" : "");

	if (toep->flags & TPF_FIN_SENT)
		return (0);

	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
	    ("%s: flowc_wr not sent for tid %u.", __func__, tid));

	wr = alloc_wrqe(sizeof(*req), &toep->ofld_txq->wrq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	req = wrtod(wr);

	req->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) |
	    V_FW_WR_IMMDLEN(sizeof(*req) - sizeof(req->wr)));
	req->wr.wr_mid = htonl(V_FW_WR_LEN16(howmany(sizeof(*req), 16)) |
	    V_FW_WR_FLOWID(tid));
	req->wr.wr_lo = cpu_to_be64(0);
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
	req->rsvd = 0;

	toep->flags |= TPF_FIN_SENT;
	toep->flags &= ~TPF_SEND_FIN;
	t4_l2t_send(sc, wr, toep->l2te);

	return (0);
}

#define MAX_OFLD_TX_CREDITS	(SGE_MAX_WR_LEN / 16)
#define MIN_OFLD_TX_CREDITS	(howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16))
#define MIN_ISO_TX_CREDITS	(howmany(sizeof(struct cpl_tx_data_iso), 16))
#define MIN_TX_CREDITS(iso)	\
	(MIN_OFLD_TX_CREDITS + ((iso) ? MIN_ISO_TX_CREDITS : 0))

_Static_assert(MAX_OFLD_TX_CREDITS <= MAX_OFLD_TX_SDESC_CREDITS,
    "MAX_OFLD_TX_SDESC_CREDITS too small");

/* Maximum amount of immediate data we could stuff in a WR */
static inline int
max_imm_payload(int tx_credits, int iso)
{
	const int iso_cpl_size = iso ? sizeof(struct cpl_tx_data_iso) : 0;
	const int n = 1;	/* Use no more than one desc for imm. data WR */

	KASSERT(tx_credits >= 0 &&
	    tx_credits <= MAX_OFLD_TX_CREDITS,
	    ("%s: %d credits", __func__, tx_credits));

	if (tx_credits < MIN_TX_CREDITS(iso))
		return (0);

	if (tx_credits >= (n * EQ_ESIZE) / 16)
		return ((n * EQ_ESIZE) - sizeof(struct fw_ofld_tx_data_wr) -
		    iso_cpl_size);
	else
		return (tx_credits * 16 - sizeof(struct fw_ofld_tx_data_wr) -
		    iso_cpl_size);
}

/* Maximum number of SGL entries we could stuff in a WR */
static inline int
max_dsgl_nsegs(int tx_credits, int iso)
{
	int nseg = 1;	/* ulptx_sgl has room for 1, rest ulp_tx_sge_pair */
	int sge_pair_credits = tx_credits - MIN_TX_CREDITS(iso);

	KASSERT(tx_credits >= 0 &&
	    tx_credits <= MAX_OFLD_TX_CREDITS,
	    ("%s: %d credits", __func__, tx_credits));

	if (tx_credits < MIN_TX_CREDITS(iso))
		return (0);

	nseg += 2 * (sge_pair_credits * 16 / 24);
	if ((sge_pair_credits * 16) % 24 == 16)
		nseg++;

	return (nseg);
}

static inline void
write_tx_wr(void *dst, struct toepcb *toep, int fw_wr_opcode,
    unsigned int immdlen, unsigned int plen, uint8_t credits, int shove,
    int ulp_submode)
{
	struct fw_ofld_tx_data_wr *txwr = dst;

	txwr->op_to_immdlen = htobe32(V_WR_OP(fw_wr_opcode) |
	    V_FW_WR_IMMDLEN(immdlen));
	txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) |
	    V_FW_WR_LEN16(credits));
	txwr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(ulp_mode(toep)) |
	    V_TX_ULP_SUBMODE(ulp_submode) | V_TX_URG(0) | V_TX_SHOVE(shove));
	txwr->plen = htobe32(plen);

	if (toep->params.tx_align > 0) {
		if (plen < 2 * toep->params.emss)
			txwr->lsodisable_to_flags |=
			    htobe32(F_FW_OFLD_TX_DATA_WR_LSODISABLE);
		else
			txwr->lsodisable_to_flags |=
			    htobe32(F_FW_OFLD_TX_DATA_WR_ALIGNPLD |
				(toep->params.nagle == 0 ? 0 :
				F_FW_OFLD_TX_DATA_WR_ALIGNPLDSHOVE));
	}
}

/*
 * Generate a DSGL from a starting mbuf.
 * The total number of segments and the maximum segments in any one mbuf are
 * provided.
 */
static void
write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n)
{
	struct mbuf *m;
	struct ulptx_sgl *usgl = dst;
	int i, j, rc;
	struct sglist sg;
	struct sglist_seg segs[n];

	KASSERT(nsegs > 0, ("%s: nsegs 0", __func__));

	sglist_init(&sg, n, segs);
	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
	    V_ULPTX_NSGE(nsegs));

	i = -1;
	for (m = start; m != stop; m = m->m_next) {
		if (m->m_flags & M_EXTPG)
			rc = sglist_append_mbuf_epg(&sg, m,
			    mtod(m, vm_offset_t), m->m_len);
		else
			rc = sglist_append(&sg, mtod(m, void *), m->m_len);
		if (__predict_false(rc != 0))
			panic("%s: sglist_append %d", __func__, rc);

		for (j = 0; j < sg.sg_nseg; i++, j++) {
			if (i < 0) {
				usgl->len0 = htobe32(segs[j].ss_len);
				usgl->addr0 = htobe64(segs[j].ss_paddr);
			} else {
				usgl->sge[i / 2].len[i & 1] =
				    htobe32(segs[j].ss_len);
				usgl->sge[i / 2].addr[i & 1] =
				    htobe64(segs[j].ss_paddr);
			}
#ifdef INVARIANTS
			nsegs--;
#endif
		}
		sglist_reset(&sg);
	}
	if (i & 1)
		usgl->sge[i / 2].len[1] = htobe32(0);
	KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, stop %p",
	    __func__, nsegs, start, stop));
}

bool
t4_push_raw_wr(struct adapter *sc, struct toepcb *toep, struct mbuf *m)
{
#ifdef INVARIANTS
	struct inpcb *inp = toep->inp;
#endif
	struct wrqe *wr;
	struct ofld_tx_sdesc *txsd;
	u_int credits, plen;

	INP_WLOCK_ASSERT(inp);
	MPASS(mbuf_raw_wr(m));
	plen = m->m_pkthdr.len;
	credits = howmany(plen, 16);
	if (credits > toep->tx_credits)
		return (false);

	wr = alloc_wrqe(roundup2(plen, 16), &toep->ofld_txq->wrq);
	if (wr == NULL)
		return (false);

	m_copydata(m, 0, plen, wrtod(wr));
	m_freem(m);

	toep->tx_credits -= credits;
	if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
		toep->flags |= TPF_TX_SUSPENDED;

	KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
	KASSERT(credits <= MAX_OFLD_TX_SDESC_CREDITS,
	    ("%s: tx_credits %u too large", __func__, credits));
	txsd = &toep->txsd[toep->txsd_pidx];
	txsd->plen = 0;
	txsd->tx_credits = credits;
	if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
		toep->txsd_pidx = 0;
	toep->txsd_avail--;

	t4_wrq_tx(sc, wr);
	return (true);
}

/*
 * Max number of SGL entries an offload tx work request can have.  This is 41
 * (1 + 40) for a full 512B work request.
 * fw_ofld_tx_data_wr(16B) + ulptx_sgl(16B, 1) + ulptx_sge_pair(480B, 40)
 */
#define OFLD_SGL_LEN (41)

/*
 * Send data and/or a FIN to the peer.
 *
 * The socket's so_snd buffer consists of a stream of data starting with sb_mb
 * and linked together with m_next.  sb_sndptr, if set, is the last mbuf that
 * was transmitted.
 *
 * drop indicates the number of bytes that should be dropped from the head of
 * the send buffer.  It is an optimization that lets do_fw4_ack avoid creating
 * contention on the send buffer lock (before this change it used to do
 * sowwakeup and then t4_push_frames right after that when recovering from tx
 * stalls).  When drop is set this function MUST drop the bytes and wake up any
 * writers.
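 *
 * (Editorial note, summarizing the logic below rather than quoting the
 * original comment: each work request consumes howmany(wr_len, 16) tx
 * credits, i.e. one credit per 16 bytes of WR.  A payload that fits within
 * max_imm_payload() is copied into the WR as immediate data; anything larger
 * is described by a DSGL built with write_tx_sgl() and DMA'd by the chip.)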
684 */ 685 static void 686 t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop) 687 { 688 struct mbuf *sndptr, *m, *sb_sndptr; 689 struct fw_ofld_tx_data_wr *txwr; 690 struct wrqe *wr; 691 u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf; 692 struct inpcb *inp = toep->inp; 693 struct tcpcb *tp = intotcpcb(inp); 694 struct socket *so = inp->inp_socket; 695 struct sockbuf *sb = &so->so_snd; 696 struct mbufq *pduq = &toep->ulp_pduq; 697 int tx_credits, shove, compl, sowwakeup; 698 struct ofld_tx_sdesc *txsd; 699 bool nomap_mbuf_seen; 700 701 INP_WLOCK_ASSERT(inp); 702 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 703 ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid)); 704 705 KASSERT(ulp_mode(toep) == ULP_MODE_NONE || 706 ulp_mode(toep) == ULP_MODE_TCPDDP || 707 ulp_mode(toep) == ULP_MODE_TLS || 708 ulp_mode(toep) == ULP_MODE_RDMA, 709 ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep)); 710 711 #ifdef VERBOSE_TRACES 712 CTR5(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d", 713 __func__, toep->tid, toep->flags, tp->t_flags, drop); 714 #endif 715 if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) 716 return; 717 718 #ifdef RATELIMIT 719 if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) && 720 (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) { 721 inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; 722 } 723 #endif 724 725 /* 726 * This function doesn't resume by itself. Someone else must clear the 727 * flag and call this function. 728 */ 729 if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) { 730 KASSERT(drop == 0, 731 ("%s: drop (%d) != 0 but tx is suspended", __func__, drop)); 732 return; 733 } 734 735 txsd = &toep->txsd[toep->txsd_pidx]; 736 do { 737 tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS); 738 max_imm = max_imm_payload(tx_credits, 0); 739 max_nsegs = max_dsgl_nsegs(tx_credits, 0); 740 741 if (__predict_false((sndptr = mbufq_first(pduq)) != NULL)) { 742 if (!t4_push_raw_wr(sc, toep, sndptr)) { 743 toep->flags |= TPF_TX_SUSPENDED; 744 return; 745 } 746 747 m = mbufq_dequeue(pduq); 748 MPASS(m == sndptr); 749 750 txsd = &toep->txsd[toep->txsd_pidx]; 751 continue; 752 } 753 754 SOCKBUF_LOCK(sb); 755 sowwakeup = drop; 756 if (drop) { 757 sbdrop_locked(sb, drop); 758 drop = 0; 759 } 760 sb_sndptr = sb->sb_sndptr; 761 sndptr = sb_sndptr ? 
sb_sndptr->m_next : sb->sb_mb; 762 plen = 0; 763 nsegs = 0; 764 max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */ 765 nomap_mbuf_seen = false; 766 for (m = sndptr; m != NULL; m = m->m_next) { 767 int n; 768 769 if ((m->m_flags & M_NOTREADY) != 0) 770 break; 771 if (plen + m->m_len > MAX_OFLD_TX_SDESC_PLEN) 772 break; 773 if (m->m_flags & M_EXTPG) { 774 #ifdef KERN_TLS 775 if (m->m_epg_tls != NULL) { 776 toep->flags |= TPF_KTLS; 777 if (plen == 0) { 778 SOCKBUF_UNLOCK(sb); 779 t4_push_ktls(sc, toep, 0); 780 return; 781 } 782 break; 783 } 784 #endif 785 n = sglist_count_mbuf_epg(m, 786 mtod(m, vm_offset_t), m->m_len); 787 } else 788 n = sglist_count(mtod(m, void *), m->m_len); 789 790 nsegs += n; 791 plen += m->m_len; 792 793 /* This mbuf sent us _over_ the nsegs limit, back out */ 794 if (plen > max_imm && nsegs > max_nsegs) { 795 nsegs -= n; 796 plen -= m->m_len; 797 if (plen == 0) { 798 /* Too few credits */ 799 toep->flags |= TPF_TX_SUSPENDED; 800 if (sowwakeup) { 801 if (!TAILQ_EMPTY( 802 &toep->aiotx_jobq)) 803 t4_aiotx_queue_toep(so, 804 toep); 805 sowwakeup_locked(so); 806 } else 807 SOCKBUF_UNLOCK(sb); 808 SOCKBUF_UNLOCK_ASSERT(sb); 809 return; 810 } 811 break; 812 } 813 814 if (m->m_flags & M_EXTPG) 815 nomap_mbuf_seen = true; 816 if (max_nsegs_1mbuf < n) 817 max_nsegs_1mbuf = n; 818 sb_sndptr = m; /* new sb->sb_sndptr if all goes well */ 819 820 /* This mbuf put us right at the max_nsegs limit */ 821 if (plen > max_imm && nsegs == max_nsegs) { 822 m = m->m_next; 823 break; 824 } 825 } 826 827 if (sbused(sb) > sb->sb_hiwat * 5 / 8 && 828 toep->plen_nocompl + plen >= sb->sb_hiwat / 4) 829 compl = 1; 830 else 831 compl = 0; 832 833 if (sb->sb_flags & SB_AUTOSIZE && 834 V_tcp_do_autosndbuf && 835 sb->sb_hiwat < V_tcp_autosndbuf_max && 836 sbused(sb) >= sb->sb_hiwat * 7 / 8) { 837 int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc, 838 V_tcp_autosndbuf_max); 839 840 if (!sbreserve_locked(so, SO_SND, newsize, NULL)) 841 sb->sb_flags &= ~SB_AUTOSIZE; 842 else 843 sowwakeup = 1; /* room available */ 844 } 845 if (sowwakeup) { 846 if (!TAILQ_EMPTY(&toep->aiotx_jobq)) 847 t4_aiotx_queue_toep(so, toep); 848 sowwakeup_locked(so); 849 } else 850 SOCKBUF_UNLOCK(sb); 851 SOCKBUF_UNLOCK_ASSERT(sb); 852 853 /* nothing to send */ 854 if (plen == 0) { 855 KASSERT(m == NULL || (m->m_flags & M_NOTREADY) != 0, 856 ("%s: nothing to send, but m != NULL is ready", 857 __func__)); 858 break; 859 } 860 861 if (__predict_false(toep->flags & TPF_FIN_SENT)) 862 panic("%s: excess tx.", __func__); 863 864 shove = m == NULL && !(tp->t_flags & TF_MORETOCOME); 865 if (plen <= max_imm && !nomap_mbuf_seen) { 866 867 /* Immediate data tx */ 868 869 wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16), 870 &toep->ofld_txq->wrq); 871 if (wr == NULL) { 872 /* XXX: how will we recover from this? */ 873 toep->flags |= TPF_TX_SUSPENDED; 874 return; 875 } 876 txwr = wrtod(wr); 877 credits = howmany(wr->wr_len, 16); 878 write_tx_wr(txwr, toep, FW_OFLD_TX_DATA_WR, plen, plen, 879 credits, shove, 0); 880 m_copydata(sndptr, 0, plen, (void *)(txwr + 1)); 881 nsegs = 0; 882 } else { 883 int wr_len; 884 885 /* DSGL tx */ 886 887 wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) + 888 ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8; 889 wr = alloc_wrqe(roundup2(wr_len, 16), 890 &toep->ofld_txq->wrq); 891 if (wr == NULL) { 892 /* XXX: how will we recover from this? 
*/ 893 toep->flags |= TPF_TX_SUSPENDED; 894 return; 895 } 896 txwr = wrtod(wr); 897 credits = howmany(wr_len, 16); 898 write_tx_wr(txwr, toep, FW_OFLD_TX_DATA_WR, 0, plen, 899 credits, shove, 0); 900 write_tx_sgl(txwr + 1, sndptr, m, nsegs, 901 max_nsegs_1mbuf); 902 if (wr_len & 0xf) { 903 uint64_t *pad = (uint64_t *) 904 ((uintptr_t)txwr + wr_len); 905 *pad = 0; 906 } 907 } 908 909 KASSERT(toep->tx_credits >= credits, 910 ("%s: not enough credits", __func__)); 911 912 toep->tx_credits -= credits; 913 toep->tx_nocompl += credits; 914 toep->plen_nocompl += plen; 915 if (toep->tx_credits <= toep->tx_total * 3 / 8 && 916 toep->tx_nocompl >= toep->tx_total / 4) 917 compl = 1; 918 919 if (compl || ulp_mode(toep) == ULP_MODE_RDMA) { 920 txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL); 921 toep->tx_nocompl = 0; 922 toep->plen_nocompl = 0; 923 } 924 925 tp->snd_nxt += plen; 926 tp->snd_max += plen; 927 928 SOCKBUF_LOCK(sb); 929 KASSERT(sb_sndptr, ("%s: sb_sndptr is NULL", __func__)); 930 sb->sb_sndptr = sb_sndptr; 931 SOCKBUF_UNLOCK(sb); 932 933 toep->flags |= TPF_TX_DATA_SENT; 934 if (toep->tx_credits < MIN_OFLD_TX_CREDITS) 935 toep->flags |= TPF_TX_SUSPENDED; 936 937 KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); 938 KASSERT(plen <= MAX_OFLD_TX_SDESC_PLEN, 939 ("%s: plen %u too large", __func__, plen)); 940 txsd->plen = plen; 941 txsd->tx_credits = credits; 942 txsd++; 943 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) { 944 toep->txsd_pidx = 0; 945 txsd = &toep->txsd[0]; 946 } 947 toep->txsd_avail--; 948 949 t4_l2t_send(sc, wr, toep->l2te); 950 } while (m != NULL && (m->m_flags & M_NOTREADY) == 0); 951 952 /* Send a FIN if requested, but only if there's no more data to send */ 953 if (m == NULL && toep->flags & TPF_SEND_FIN) 954 t4_close_conn(sc, toep); 955 } 956 957 static inline void 958 rqdrop_locked(struct mbufq *q, int plen) 959 { 960 struct mbuf *m; 961 962 while (plen > 0) { 963 m = mbufq_dequeue(q); 964 965 /* Too many credits. */ 966 MPASS(m != NULL); 967 M_ASSERTPKTHDR(m); 968 969 /* Partial credits. */ 970 MPASS(plen >= m->m_pkthdr.len); 971 972 plen -= m->m_pkthdr.len; 973 m_freem(m); 974 } 975 } 976 977 /* 978 * Not a bit in the TCB, but is a bit in the ulp_submode field of the 979 * CPL_TX_DATA flags field in FW_ISCSI_TX_DATA_WR. 980 */ 981 #define ULP_ISO G_TX_ULP_SUBMODE(F_FW_ISCSI_TX_DATA_WR_ULPSUBMODE_ISO) 982 983 static void 984 write_tx_data_iso(void *dst, u_int ulp_submode, uint8_t flags, uint16_t mss, 985 int len, int npdu) 986 { 987 struct cpl_tx_data_iso *cpl; 988 unsigned int burst_size; 989 unsigned int last; 990 991 /* 992 * The firmware will set the 'F' bit on the last PDU when 993 * either condition is true: 994 * 995 * - this large PDU is marked as the "last" slice 996 * 997 * - the amount of data payload bytes equals the burst_size 998 * 999 * The strategy used here is to always set the burst_size 1000 * artificially high (len includes the size of the template 1001 * BHS) and only set the "last" flag if the original PDU had 1002 * 'F' set. 
1003 */ 1004 burst_size = len; 1005 last = !!(flags & CXGBE_ISO_F); 1006 1007 cpl = (struct cpl_tx_data_iso *)dst; 1008 cpl->op_to_scsi = htonl(V_CPL_TX_DATA_ISO_OP(CPL_TX_DATA_ISO) | 1009 V_CPL_TX_DATA_ISO_FIRST(1) | V_CPL_TX_DATA_ISO_LAST(last) | 1010 V_CPL_TX_DATA_ISO_CPLHDRLEN(0) | 1011 V_CPL_TX_DATA_ISO_HDRCRC(!!(ulp_submode & ULP_CRC_HEADER)) | 1012 V_CPL_TX_DATA_ISO_PLDCRC(!!(ulp_submode & ULP_CRC_DATA)) | 1013 V_CPL_TX_DATA_ISO_IMMEDIATE(0) | 1014 V_CPL_TX_DATA_ISO_SCSI(CXGBE_ISO_TYPE(flags))); 1015 1016 cpl->ahs_len = 0; 1017 cpl->mpdu = htons(DIV_ROUND_UP(mss, 4)); 1018 cpl->burst_size = htonl(DIV_ROUND_UP(burst_size, 4)); 1019 cpl->len = htonl(len); 1020 cpl->reserved2_seglen_offset = htonl(0); 1021 cpl->datasn_offset = htonl(0); 1022 cpl->buffer_offset = htonl(0); 1023 cpl->reserved3 = 0; 1024 } 1025 1026 static struct wrqe * 1027 write_iscsi_mbuf_wr(struct toepcb *toep, struct mbuf *sndptr) 1028 { 1029 struct mbuf *m; 1030 struct fw_ofld_tx_data_wr *txwr; 1031 struct cpl_tx_data_iso *cpl_iso; 1032 void *p; 1033 struct wrqe *wr; 1034 u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf; 1035 u_int adjusted_plen, imm_data, ulp_submode; 1036 struct inpcb *inp = toep->inp; 1037 struct tcpcb *tp = intotcpcb(inp); 1038 int tx_credits, shove, npdu, wr_len; 1039 uint16_t iso_mss; 1040 static const u_int ulp_extra_len[] = {0, 4, 4, 8}; 1041 bool iso, nomap_mbuf_seen; 1042 1043 M_ASSERTPKTHDR(sndptr); 1044 1045 tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS); 1046 if (mbuf_raw_wr(sndptr)) { 1047 plen = sndptr->m_pkthdr.len; 1048 KASSERT(plen <= SGE_MAX_WR_LEN, 1049 ("raw WR len %u is greater than max WR len", plen)); 1050 if (plen > tx_credits * 16) 1051 return (NULL); 1052 1053 wr = alloc_wrqe(roundup2(plen, 16), &toep->ofld_txq->wrq); 1054 if (__predict_false(wr == NULL)) 1055 return (NULL); 1056 1057 m_copydata(sndptr, 0, plen, wrtod(wr)); 1058 return (wr); 1059 } 1060 1061 iso = mbuf_iscsi_iso(sndptr); 1062 max_imm = max_imm_payload(tx_credits, iso); 1063 max_nsegs = max_dsgl_nsegs(tx_credits, iso); 1064 iso_mss = mbuf_iscsi_iso_mss(sndptr); 1065 1066 plen = 0; 1067 nsegs = 0; 1068 max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */ 1069 nomap_mbuf_seen = false; 1070 for (m = sndptr; m != NULL; m = m->m_next) { 1071 int n; 1072 1073 if (m->m_flags & M_EXTPG) 1074 n = sglist_count_mbuf_epg(m, mtod(m, vm_offset_t), 1075 m->m_len); 1076 else 1077 n = sglist_count(mtod(m, void *), m->m_len); 1078 1079 nsegs += n; 1080 plen += m->m_len; 1081 1082 /* 1083 * This mbuf would send us _over_ the nsegs limit. 1084 * Suspend tx because the PDU can't be sent out. 1085 */ 1086 if ((nomap_mbuf_seen || plen > max_imm) && nsegs > max_nsegs) 1087 return (NULL); 1088 1089 if (m->m_flags & M_EXTPG) 1090 nomap_mbuf_seen = true; 1091 if (max_nsegs_1mbuf < n) 1092 max_nsegs_1mbuf = n; 1093 } 1094 1095 if (__predict_false(toep->flags & TPF_FIN_SENT)) 1096 panic("%s: excess tx.", __func__); 1097 1098 /* 1099 * We have a PDU to send. All of it goes out in one WR so 'm' 1100 * is NULL. A PDU's length is always a multiple of 4. 1101 */ 1102 MPASS(m == NULL); 1103 MPASS((plen & 3) == 0); 1104 MPASS(sndptr->m_pkthdr.len == plen); 1105 1106 shove = !(tp->t_flags & TF_MORETOCOME); 1107 1108 /* 1109 * plen doesn't include header and data digests, which are 1110 * generated and inserted in the right places by the TOE, but 1111 * they do occupy TCP sequence space and need to be accounted 1112 * for. 
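 *
 * (Editorial example, not in the original comment: with both header and data
 * digests enabled, ulp_extra_len[ulp_submode] is 8, so an ISO burst that the
 * firmware will cut into npdu = 3 PDUs advances the sequence space by
 * plen + 8 * 3 plus ISCSI_BHS_SIZE * 2 for the two extra BHS headers the
 * hardware generates; this is the adjusted_plen computed below.)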
1113 */ 1114 ulp_submode = mbuf_ulp_submode(sndptr); 1115 MPASS(ulp_submode < nitems(ulp_extra_len)); 1116 npdu = iso ? howmany(plen - ISCSI_BHS_SIZE, iso_mss) : 1; 1117 adjusted_plen = plen + ulp_extra_len[ulp_submode] * npdu; 1118 if (iso) 1119 adjusted_plen += ISCSI_BHS_SIZE * (npdu - 1); 1120 wr_len = sizeof(*txwr); 1121 if (iso) 1122 wr_len += sizeof(struct cpl_tx_data_iso); 1123 if (plen <= max_imm && !nomap_mbuf_seen) { 1124 /* Immediate data tx */ 1125 imm_data = plen; 1126 wr_len += plen; 1127 nsegs = 0; 1128 } else { 1129 /* DSGL tx */ 1130 imm_data = 0; 1131 wr_len += sizeof(struct ulptx_sgl) + 1132 ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8; 1133 } 1134 1135 wr = alloc_wrqe(roundup2(wr_len, 16), &toep->ofld_txq->wrq); 1136 if (wr == NULL) { 1137 /* XXX: how will we recover from this? */ 1138 return (NULL); 1139 } 1140 txwr = wrtod(wr); 1141 credits = howmany(wr->wr_len, 16); 1142 1143 if (iso) { 1144 write_tx_wr(txwr, toep, FW_ISCSI_TX_DATA_WR, 1145 imm_data + sizeof(struct cpl_tx_data_iso), 1146 adjusted_plen, credits, shove, ulp_submode | ULP_ISO); 1147 cpl_iso = (struct cpl_tx_data_iso *)(txwr + 1); 1148 MPASS(plen == sndptr->m_pkthdr.len); 1149 write_tx_data_iso(cpl_iso, ulp_submode, 1150 mbuf_iscsi_iso_flags(sndptr), iso_mss, plen, npdu); 1151 p = cpl_iso + 1; 1152 } else { 1153 write_tx_wr(txwr, toep, FW_OFLD_TX_DATA_WR, imm_data, 1154 adjusted_plen, credits, shove, ulp_submode); 1155 p = txwr + 1; 1156 } 1157 1158 if (imm_data != 0) { 1159 m_copydata(sndptr, 0, plen, p); 1160 } else { 1161 write_tx_sgl(p, sndptr, m, nsegs, max_nsegs_1mbuf); 1162 if (wr_len & 0xf) { 1163 uint64_t *pad = (uint64_t *)((uintptr_t)txwr + wr_len); 1164 *pad = 0; 1165 } 1166 } 1167 1168 KASSERT(toep->tx_credits >= credits, 1169 ("%s: not enough credits: credits %u " 1170 "toep->tx_credits %u tx_credits %u nsegs %u " 1171 "max_nsegs %u iso %d", __func__, credits, 1172 toep->tx_credits, tx_credits, nsegs, max_nsegs, iso)); 1173 1174 tp->snd_nxt += adjusted_plen; 1175 tp->snd_max += adjusted_plen; 1176 1177 counter_u64_add(toep->ofld_txq->tx_iscsi_pdus, npdu); 1178 counter_u64_add(toep->ofld_txq->tx_iscsi_octets, plen); 1179 if (iso) 1180 counter_u64_add(toep->ofld_txq->tx_iscsi_iso_wrs, 1); 1181 1182 return (wr); 1183 } 1184 1185 void 1186 t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop) 1187 { 1188 struct mbuf *sndptr, *m; 1189 struct fw_wr_hdr *wrhdr; 1190 struct wrqe *wr; 1191 u_int plen, credits; 1192 struct inpcb *inp = toep->inp; 1193 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; 1194 struct mbufq *pduq = &toep->ulp_pduq; 1195 1196 INP_WLOCK_ASSERT(inp); 1197 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 1198 ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid)); 1199 KASSERT(ulp_mode(toep) == ULP_MODE_ISCSI, 1200 ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep)); 1201 1202 if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) 1203 return; 1204 1205 /* 1206 * This function doesn't resume by itself. Someone else must clear the 1207 * flag and call this function. 1208 */ 1209 if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) { 1210 KASSERT(drop == 0, 1211 ("%s: drop (%d) != 0 but tx is suspended", __func__, drop)); 1212 return; 1213 } 1214 1215 if (drop) { 1216 struct socket *so = inp->inp_socket; 1217 struct sockbuf *sb = &so->so_snd; 1218 int sbu; 1219 1220 /* 1221 * An unlocked read is ok here as the data should only 1222 * transition from a non-zero value to either another 1223 * non-zero value or zero. 
Once it is zero it should 1224 * stay zero. 1225 */ 1226 if (__predict_false(sbused(sb)) > 0) { 1227 SOCKBUF_LOCK(sb); 1228 sbu = sbused(sb); 1229 if (sbu > 0) { 1230 /* 1231 * The data transmitted before the 1232 * tid's ULP mode changed to ISCSI is 1233 * still in so_snd. Incoming credits 1234 * should account for so_snd first. 1235 */ 1236 sbdrop_locked(sb, min(sbu, drop)); 1237 drop -= min(sbu, drop); 1238 } 1239 sowwakeup_locked(so); /* unlocks so_snd */ 1240 } 1241 rqdrop_locked(&toep->ulp_pdu_reclaimq, drop); 1242 } 1243 1244 while ((sndptr = mbufq_first(pduq)) != NULL) { 1245 wr = write_iscsi_mbuf_wr(toep, sndptr); 1246 if (wr == NULL) { 1247 toep->flags |= TPF_TX_SUSPENDED; 1248 return; 1249 } 1250 1251 plen = sndptr->m_pkthdr.len; 1252 credits = howmany(wr->wr_len, 16); 1253 KASSERT(toep->tx_credits >= credits, 1254 ("%s: not enough credits", __func__)); 1255 1256 m = mbufq_dequeue(pduq); 1257 MPASS(m == sndptr); 1258 mbufq_enqueue(&toep->ulp_pdu_reclaimq, m); 1259 1260 toep->tx_credits -= credits; 1261 toep->tx_nocompl += credits; 1262 toep->plen_nocompl += plen; 1263 1264 /* 1265 * Ensure there are enough credits for a full-sized WR 1266 * as page pod WRs can be full-sized. 1267 */ 1268 if (toep->tx_credits <= SGE_MAX_WR_LEN * 5 / 4 && 1269 toep->tx_nocompl >= toep->tx_total / 4) { 1270 wrhdr = wrtod(wr); 1271 wrhdr->hi |= htobe32(F_FW_WR_COMPL); 1272 toep->tx_nocompl = 0; 1273 toep->plen_nocompl = 0; 1274 } 1275 1276 toep->flags |= TPF_TX_DATA_SENT; 1277 if (toep->tx_credits < MIN_OFLD_TX_CREDITS) 1278 toep->flags |= TPF_TX_SUSPENDED; 1279 1280 KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); 1281 KASSERT(plen <= MAX_OFLD_TX_SDESC_PLEN, 1282 ("%s: plen %u too large", __func__, plen)); 1283 txsd->plen = plen; 1284 txsd->tx_credits = credits; 1285 txsd++; 1286 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) { 1287 toep->txsd_pidx = 0; 1288 txsd = &toep->txsd[0]; 1289 } 1290 toep->txsd_avail--; 1291 1292 t4_l2t_send(sc, wr, toep->l2te); 1293 } 1294 1295 /* Send a FIN if requested, but only if there are no more PDUs to send */ 1296 if (mbufq_first(pduq) == NULL && toep->flags & TPF_SEND_FIN) 1297 t4_close_conn(sc, toep); 1298 } 1299 1300 static inline void 1301 t4_push_data(struct adapter *sc, struct toepcb *toep, int drop) 1302 { 1303 1304 if (ulp_mode(toep) == ULP_MODE_ISCSI) 1305 t4_push_pdus(sc, toep, drop); 1306 else if (toep->flags & TPF_KTLS) 1307 t4_push_ktls(sc, toep, drop); 1308 else 1309 t4_push_frames(sc, toep, drop); 1310 } 1311 1312 void 1313 t4_raw_wr_tx(struct adapter *sc, struct toepcb *toep, struct mbuf *m) 1314 { 1315 #ifdef INVARIANTS 1316 struct inpcb *inp = toep->inp; 1317 #endif 1318 1319 INP_WLOCK_ASSERT(inp); 1320 1321 /* 1322 * If there are other raw WRs enqueued, enqueue to preserve 1323 * FIFO ordering. 1324 */ 1325 if (!mbufq_empty(&toep->ulp_pduq)) { 1326 mbufq_enqueue(&toep->ulp_pduq, m); 1327 return; 1328 } 1329 1330 /* 1331 * Cannot call t4_push_data here as that will lock so_snd and 1332 * some callers of this run in rx handlers with so_rcv locked. 1333 * Instead, just try to transmit this WR. 
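 *
 * (Editorial note, not in the original comment: if the immediate attempt
 * below fails for lack of credits or memory, the WR is queued on ulp_pduq
 * and TPF_TX_SUSPENDED is set, so it will be retried once do_fw4_ack
 * returns enough credits and clears the suspension.)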
1334 */ 1335 if (!t4_push_raw_wr(sc, toep, m)) { 1336 mbufq_enqueue(&toep->ulp_pduq, m); 1337 toep->flags |= TPF_TX_SUSPENDED; 1338 } 1339 } 1340 1341 int 1342 t4_tod_output(struct toedev *tod, struct tcpcb *tp) 1343 { 1344 struct adapter *sc = tod->tod_softc; 1345 #ifdef INVARIANTS 1346 struct inpcb *inp = tptoinpcb(tp); 1347 #endif 1348 struct toepcb *toep = tp->t_toe; 1349 1350 INP_WLOCK_ASSERT(inp); 1351 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1352 ("%s: inp %p dropped.", __func__, inp)); 1353 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 1354 1355 t4_push_data(sc, toep, 0); 1356 1357 return (0); 1358 } 1359 1360 int 1361 t4_send_fin(struct toedev *tod, struct tcpcb *tp) 1362 { 1363 struct adapter *sc = tod->tod_softc; 1364 #ifdef INVARIANTS 1365 struct inpcb *inp = tptoinpcb(tp); 1366 #endif 1367 struct toepcb *toep = tp->t_toe; 1368 1369 INP_WLOCK_ASSERT(inp); 1370 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1371 ("%s: inp %p dropped.", __func__, inp)); 1372 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 1373 1374 toep->flags |= TPF_SEND_FIN; 1375 if (tp->t_state >= TCPS_ESTABLISHED) 1376 t4_push_data(sc, toep, 0); 1377 1378 return (0); 1379 } 1380 1381 int 1382 t4_send_rst(struct toedev *tod, struct tcpcb *tp) 1383 { 1384 struct adapter *sc = tod->tod_softc; 1385 #if defined(INVARIANTS) 1386 struct inpcb *inp = tptoinpcb(tp); 1387 #endif 1388 struct toepcb *toep = tp->t_toe; 1389 1390 INP_WLOCK_ASSERT(inp); 1391 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1392 ("%s: inp %p dropped.", __func__, inp)); 1393 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 1394 1395 /* hmmmm */ 1396 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 1397 ("%s: flowc for tid %u [%s] not sent already", 1398 __func__, toep->tid, tcpstates[tp->t_state])); 1399 1400 send_reset(sc, toep, 0); 1401 return (0); 1402 } 1403 1404 /* 1405 * Peer has sent us a FIN. 1406 */ 1407 static int 1408 do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1409 { 1410 struct adapter *sc = iq->adapter; 1411 const struct cpl_peer_close *cpl = (const void *)(rss + 1); 1412 unsigned int tid = GET_TID(cpl); 1413 struct toepcb *toep = lookup_tid(sc, tid); 1414 struct inpcb *inp = toep->inp; 1415 struct tcpcb *tp = NULL; 1416 struct socket *so; 1417 struct epoch_tracker et; 1418 #ifdef INVARIANTS 1419 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1420 #endif 1421 1422 KASSERT(opcode == CPL_PEER_CLOSE, 1423 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1424 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1425 1426 if (__predict_false(toep->flags & TPF_SYNQE)) { 1427 /* 1428 * do_pass_establish must have run before do_peer_close and if 1429 * this is still a synqe instead of a toepcb then the connection 1430 * must be getting aborted. 1431 */ 1432 MPASS(toep->flags & TPF_ABORT_SHUTDOWN); 1433 CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid, 1434 toep, toep->flags); 1435 return (0); 1436 } 1437 1438 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1439 1440 CURVNET_SET(toep->vnet); 1441 NET_EPOCH_ENTER(et); 1442 INP_WLOCK(inp); 1443 tp = intotcpcb(inp); 1444 1445 CTR6(KTR_CXGBE, 1446 "%s: tid %u (%s), toep_flags 0x%x, ddp_flags 0x%x, inp %p", 1447 __func__, tid, tp ? 
tcpstates[tp->t_state] : "no tp", toep->flags, 1448 toep->ddp.flags, inp); 1449 1450 if (toep->flags & TPF_ABORT_SHUTDOWN) 1451 goto done; 1452 1453 if (ulp_mode(toep) == ULP_MODE_TCPDDP) { 1454 DDP_LOCK(toep); 1455 if (__predict_false(toep->ddp.flags & 1456 (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE))) 1457 handle_ddp_close(toep, tp, cpl->rcv_nxt); 1458 DDP_UNLOCK(toep); 1459 } 1460 so = inp->inp_socket; 1461 socantrcvmore(so); 1462 1463 if (ulp_mode(toep) == ULP_MODE_RDMA || 1464 (ulp_mode(toep) == ULP_MODE_ISCSI && chip_id(sc) >= CHELSIO_T6)) { 1465 /* 1466 * There might be data received via DDP before the FIN 1467 * not reported to the driver. Just assume the 1468 * sequence number in the CPL is correct as the 1469 * sequence number of the FIN. 1470 */ 1471 } else { 1472 KASSERT(tp->rcv_nxt + 1 == be32toh(cpl->rcv_nxt), 1473 ("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt, 1474 be32toh(cpl->rcv_nxt))); 1475 } 1476 1477 tp->rcv_nxt = be32toh(cpl->rcv_nxt); 1478 1479 switch (tp->t_state) { 1480 case TCPS_SYN_RECEIVED: 1481 tp->t_starttime = ticks; 1482 /* FALLTHROUGH */ 1483 1484 case TCPS_ESTABLISHED: 1485 tcp_state_change(tp, TCPS_CLOSE_WAIT); 1486 break; 1487 1488 case TCPS_FIN_WAIT_1: 1489 tcp_state_change(tp, TCPS_CLOSING); 1490 break; 1491 1492 case TCPS_FIN_WAIT_2: 1493 restore_so_proto(so, inp->inp_vflag & INP_IPV6); 1494 t4_pcb_detach(NULL, tp); 1495 tcp_twstart(tp); 1496 INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ 1497 NET_EPOCH_EXIT(et); 1498 CURVNET_RESTORE(); 1499 1500 INP_WLOCK(inp); 1501 final_cpl_received(toep); 1502 return (0); 1503 1504 default: 1505 log(LOG_ERR, "%s: TID %u received CPL_PEER_CLOSE in state %d\n", 1506 __func__, tid, tp->t_state); 1507 } 1508 done: 1509 INP_WUNLOCK(inp); 1510 NET_EPOCH_EXIT(et); 1511 CURVNET_RESTORE(); 1512 return (0); 1513 } 1514 1515 /* 1516 * Peer has ACK'd our FIN. 1517 */ 1518 static int 1519 do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss, 1520 struct mbuf *m) 1521 { 1522 struct adapter *sc = iq->adapter; 1523 const struct cpl_close_con_rpl *cpl = (const void *)(rss + 1); 1524 unsigned int tid = GET_TID(cpl); 1525 struct toepcb *toep = lookup_tid(sc, tid); 1526 struct inpcb *inp = toep->inp; 1527 struct tcpcb *tp = NULL; 1528 struct socket *so = NULL; 1529 struct epoch_tracker et; 1530 #ifdef INVARIANTS 1531 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1532 #endif 1533 1534 KASSERT(opcode == CPL_CLOSE_CON_RPL, 1535 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1536 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1537 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1538 1539 CURVNET_SET(toep->vnet); 1540 NET_EPOCH_ENTER(et); 1541 INP_WLOCK(inp); 1542 tp = intotcpcb(inp); 1543 1544 CTR4(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x", 1545 __func__, tid, tp ? 
tcpstates[tp->t_state] : "no tp", toep->flags); 1546 1547 if (toep->flags & TPF_ABORT_SHUTDOWN) 1548 goto done; 1549 1550 so = inp->inp_socket; 1551 tp->snd_una = be32toh(cpl->snd_nxt) - 1; /* exclude FIN */ 1552 1553 switch (tp->t_state) { 1554 case TCPS_CLOSING: /* see TCPS_FIN_WAIT_2 in do_peer_close too */ 1555 restore_so_proto(so, inp->inp_vflag & INP_IPV6); 1556 t4_pcb_detach(NULL, tp); 1557 tcp_twstart(tp); 1558 release: 1559 INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ 1560 NET_EPOCH_EXIT(et); 1561 CURVNET_RESTORE(); 1562 1563 INP_WLOCK(inp); 1564 final_cpl_received(toep); /* no more CPLs expected */ 1565 1566 return (0); 1567 case TCPS_LAST_ACK: 1568 if (tcp_close(tp)) 1569 INP_WUNLOCK(inp); 1570 goto release; 1571 1572 case TCPS_FIN_WAIT_1: 1573 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 1574 soisdisconnected(so); 1575 tcp_state_change(tp, TCPS_FIN_WAIT_2); 1576 break; 1577 1578 default: 1579 log(LOG_ERR, 1580 "%s: TID %u received CPL_CLOSE_CON_RPL in state %s\n", 1581 __func__, tid, tcpstates[tp->t_state]); 1582 } 1583 done: 1584 INP_WUNLOCK(inp); 1585 NET_EPOCH_EXIT(et); 1586 CURVNET_RESTORE(); 1587 return (0); 1588 } 1589 1590 void 1591 send_abort_rpl(struct adapter *sc, struct sge_ofld_txq *ofld_txq, int tid, 1592 int rst_status) 1593 { 1594 struct wrqe *wr; 1595 struct cpl_abort_rpl *cpl; 1596 1597 wr = alloc_wrqe(sizeof(*cpl), &ofld_txq->wrq); 1598 if (wr == NULL) { 1599 /* XXX */ 1600 panic("%s: allocation failure.", __func__); 1601 } 1602 cpl = wrtod(wr); 1603 1604 INIT_TP_WR_MIT_CPL(cpl, CPL_ABORT_RPL, tid); 1605 cpl->cmd = rst_status; 1606 1607 t4_wrq_tx(sc, wr); 1608 } 1609 1610 static int 1611 abort_status_to_errno(struct tcpcb *tp, unsigned int abort_reason) 1612 { 1613 switch (abort_reason) { 1614 case CPL_ERR_BAD_SYN: 1615 case CPL_ERR_CONN_RESET: 1616 return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET); 1617 case CPL_ERR_XMIT_TIMEDOUT: 1618 case CPL_ERR_PERSIST_TIMEDOUT: 1619 case CPL_ERR_FINWAIT2_TIMEDOUT: 1620 case CPL_ERR_KEEPALIVE_TIMEDOUT: 1621 return (ETIMEDOUT); 1622 default: 1623 return (EIO); 1624 } 1625 } 1626 1627 /* 1628 * TCP RST from the peer, timeout, or some other such critical error. 1629 */ 1630 static int 1631 do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1632 { 1633 struct adapter *sc = iq->adapter; 1634 const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1); 1635 unsigned int tid = GET_TID(cpl); 1636 struct toepcb *toep = lookup_tid(sc, tid); 1637 struct sge_ofld_txq *ofld_txq = toep->ofld_txq; 1638 struct inpcb *inp; 1639 struct tcpcb *tp; 1640 struct epoch_tracker et; 1641 #ifdef INVARIANTS 1642 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1643 #endif 1644 1645 KASSERT(opcode == CPL_ABORT_REQ_RSS, 1646 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1647 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1648 1649 if (toep->flags & TPF_SYNQE) 1650 return (do_abort_req_synqe(iq, rss, m)); 1651 1652 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1653 1654 if (negative_advice(cpl->status)) { 1655 CTR4(KTR_CXGBE, "%s: negative advice %d for tid %d (0x%x)", 1656 __func__, cpl->status, tid, toep->flags); 1657 return (0); /* Ignore negative advice */ 1658 } 1659 1660 inp = toep->inp; 1661 CURVNET_SET(toep->vnet); 1662 NET_EPOCH_ENTER(et); /* for tcp_close */ 1663 INP_WLOCK(inp); 1664 1665 tp = intotcpcb(inp); 1666 1667 CTR6(KTR_CXGBE, 1668 "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x, status %d", 1669 __func__, tid, tp ? 
tcpstates[tp->t_state] : "no tp", toep->flags, 1670 inp->inp_flags, cpl->status); 1671 1672 /* 1673 * If we'd initiated an abort earlier the reply to it is responsible for 1674 * cleaning up resources. Otherwise we tear everything down right here 1675 * right now. We owe the T4 a CPL_ABORT_RPL no matter what. 1676 */ 1677 if (toep->flags & TPF_ABORT_SHUTDOWN) { 1678 INP_WUNLOCK(inp); 1679 goto done; 1680 } 1681 toep->flags |= TPF_ABORT_SHUTDOWN; 1682 1683 if ((inp->inp_flags & INP_DROPPED) == 0) { 1684 struct socket *so = inp->inp_socket; 1685 1686 if (so != NULL) 1687 so_error_set(so, abort_status_to_errno(tp, 1688 cpl->status)); 1689 tp = tcp_close(tp); 1690 if (tp == NULL) 1691 INP_WLOCK(inp); /* re-acquire */ 1692 } 1693 1694 final_cpl_received(toep); 1695 done: 1696 NET_EPOCH_EXIT(et); 1697 CURVNET_RESTORE(); 1698 send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST); 1699 return (0); 1700 } 1701 1702 /* 1703 * Reply to the CPL_ABORT_REQ (send_reset) 1704 */ 1705 static int 1706 do_abort_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1707 { 1708 struct adapter *sc = iq->adapter; 1709 const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1); 1710 unsigned int tid = GET_TID(cpl); 1711 struct toepcb *toep = lookup_tid(sc, tid); 1712 struct inpcb *inp = toep->inp; 1713 #ifdef INVARIANTS 1714 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1715 #endif 1716 1717 KASSERT(opcode == CPL_ABORT_RPL_RSS, 1718 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1719 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1720 1721 if (toep->flags & TPF_SYNQE) 1722 return (do_abort_rpl_synqe(iq, rss, m)); 1723 1724 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1725 1726 CTR5(KTR_CXGBE, "%s: tid %u, toep %p, inp %p, status %d", 1727 __func__, tid, toep, inp, cpl->status); 1728 1729 KASSERT(toep->flags & TPF_ABORT_SHUTDOWN, 1730 ("%s: wasn't expecting abort reply", __func__)); 1731 1732 INP_WLOCK(inp); 1733 final_cpl_received(toep); 1734 1735 return (0); 1736 } 1737 1738 static int 1739 do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1740 { 1741 struct adapter *sc = iq->adapter; 1742 const struct cpl_rx_data *cpl = mtod(m, const void *); 1743 unsigned int tid = GET_TID(cpl); 1744 struct toepcb *toep = lookup_tid(sc, tid); 1745 struct inpcb *inp = toep->inp; 1746 struct tcpcb *tp; 1747 struct socket *so; 1748 struct sockbuf *sb; 1749 struct epoch_tracker et; 1750 int len; 1751 uint32_t ddp_placed = 0; 1752 1753 if (__predict_false(toep->flags & TPF_SYNQE)) { 1754 /* 1755 * do_pass_establish must have run before do_rx_data and if this 1756 * is still a synqe instead of a toepcb then the connection must 1757 * be getting aborted. 1758 */ 1759 MPASS(toep->flags & TPF_ABORT_SHUTDOWN); 1760 CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid, 1761 toep, toep->flags); 1762 m_freem(m); 1763 return (0); 1764 } 1765 1766 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1767 1768 /* strip off CPL header */ 1769 m_adj(m, sizeof(*cpl)); 1770 len = m->m_pkthdr.len; 1771 1772 INP_WLOCK(inp); 1773 if (inp->inp_flags & INP_DROPPED) { 1774 CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x", 1775 __func__, tid, len, inp->inp_flags); 1776 INP_WUNLOCK(inp); 1777 m_freem(m); 1778 return (0); 1779 } 1780 1781 tp = intotcpcb(inp); 1782 1783 if (__predict_false(ulp_mode(toep) == ULP_MODE_TLS && 1784 toep->flags & TPF_TLS_RECEIVE)) { 1785 /* Received "raw" data on a TLS socket. 
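 * (Editorial note: there is no m_freem() or INP_WUNLOCK() on this path, so
 * do_rx_data_tls() is expected to consume the mbuf and release the inp
 * lock.)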
*/ 1786 CTR3(KTR_CXGBE, "%s: tid %u, raw TLS data (%d bytes)", 1787 __func__, tid, len); 1788 do_rx_data_tls(cpl, toep, m); 1789 return (0); 1790 } 1791 1792 if (__predict_false(tp->rcv_nxt != be32toh(cpl->seq))) 1793 ddp_placed = be32toh(cpl->seq) - tp->rcv_nxt; 1794 1795 tp->rcv_nxt += len; 1796 if (tp->rcv_wnd < len) { 1797 KASSERT(ulp_mode(toep) == ULP_MODE_RDMA, 1798 ("%s: negative window size", __func__)); 1799 } 1800 1801 tp->rcv_wnd -= len; 1802 tp->t_rcvtime = ticks; 1803 1804 if (ulp_mode(toep) == ULP_MODE_TCPDDP) 1805 DDP_LOCK(toep); 1806 so = inp_inpcbtosocket(inp); 1807 sb = &so->so_rcv; 1808 SOCKBUF_LOCK(sb); 1809 1810 if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) { 1811 CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)", 1812 __func__, tid, len); 1813 m_freem(m); 1814 SOCKBUF_UNLOCK(sb); 1815 if (ulp_mode(toep) == ULP_MODE_TCPDDP) 1816 DDP_UNLOCK(toep); 1817 INP_WUNLOCK(inp); 1818 1819 CURVNET_SET(toep->vnet); 1820 NET_EPOCH_ENTER(et); 1821 INP_WLOCK(inp); 1822 tp = tcp_drop(tp, ECONNRESET); 1823 if (tp) 1824 INP_WUNLOCK(inp); 1825 NET_EPOCH_EXIT(et); 1826 CURVNET_RESTORE(); 1827 1828 return (0); 1829 } 1830 1831 /* receive buffer autosize */ 1832 MPASS(toep->vnet == so->so_vnet); 1833 CURVNET_SET(toep->vnet); 1834 if (sb->sb_flags & SB_AUTOSIZE && 1835 V_tcp_do_autorcvbuf && 1836 sb->sb_hiwat < V_tcp_autorcvbuf_max && 1837 len > (sbspace(sb) / 8 * 7)) { 1838 unsigned int hiwat = sb->sb_hiwat; 1839 unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc, 1840 V_tcp_autorcvbuf_max); 1841 1842 if (!sbreserve_locked(so, SO_RCV, newsize, NULL)) 1843 sb->sb_flags &= ~SB_AUTOSIZE; 1844 } 1845 1846 if (ulp_mode(toep) == ULP_MODE_TCPDDP) { 1847 int changed = !(toep->ddp.flags & DDP_ON) ^ cpl->ddp_off; 1848 1849 if (toep->ddp.waiting_count != 0 || toep->ddp.active_count != 0) 1850 CTR3(KTR_CXGBE, "%s: tid %u, non-ddp rx (%d bytes)", 1851 __func__, tid, len); 1852 1853 if (changed) { 1854 if (toep->ddp.flags & DDP_SC_REQ) 1855 toep->ddp.flags ^= DDP_ON | DDP_SC_REQ; 1856 else if (cpl->ddp_off == 1) { 1857 /* Fell out of DDP mode */ 1858 toep->ddp.flags &= ~DDP_ON; 1859 CTR1(KTR_CXGBE, "%s: fell out of DDP mode", 1860 __func__); 1861 1862 insert_ddp_data(toep, ddp_placed); 1863 } else { 1864 /* 1865 * Data was received while still 1866 * ULP_MODE_NONE, just fall through. 1867 */ 1868 } 1869 } 1870 1871 if (toep->ddp.flags & DDP_ON) { 1872 /* 1873 * CPL_RX_DATA with DDP on can only be an indicate. 1874 * Start posting queued AIO requests via DDP. The 1875 * payload that arrived in this indicate is appended 1876 * to the socket buffer as usual. 
1877 */ 1878 handle_ddp_indicate(toep); 1879 } 1880 } 1881 1882 sbappendstream_locked(sb, m, 0); 1883 t4_rcvd_locked(&toep->td->tod, tp); 1884 1885 if (ulp_mode(toep) == ULP_MODE_TCPDDP && 1886 (toep->ddp.flags & DDP_AIO) != 0 && toep->ddp.waiting_count > 0 && 1887 sbavail(sb) != 0) { 1888 CTR2(KTR_CXGBE, "%s: tid %u queueing AIO task", __func__, 1889 tid); 1890 ddp_queue_toep(toep); 1891 } 1892 if (toep->flags & TPF_TLS_STARTING) 1893 tls_received_starting_data(sc, toep, sb, len); 1894 sorwakeup_locked(so); 1895 SOCKBUF_UNLOCK_ASSERT(sb); 1896 if (ulp_mode(toep) == ULP_MODE_TCPDDP) 1897 DDP_UNLOCK(toep); 1898 1899 INP_WUNLOCK(inp); 1900 CURVNET_RESTORE(); 1901 return (0); 1902 } 1903 1904 static int 1905 do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1906 { 1907 struct adapter *sc = iq->adapter; 1908 const struct cpl_fw4_ack *cpl = (const void *)(rss + 1); 1909 unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl))); 1910 struct toepcb *toep = lookup_tid(sc, tid); 1911 struct inpcb *inp; 1912 struct tcpcb *tp; 1913 struct socket *so; 1914 uint8_t credits = cpl->credits; 1915 struct ofld_tx_sdesc *txsd; 1916 int plen; 1917 #ifdef INVARIANTS 1918 unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl))); 1919 #endif 1920 1921 /* 1922 * Very unusual case: we'd sent a flowc + abort_req for a synq entry and 1923 * now this comes back carrying the credits for the flowc. 1924 */ 1925 if (__predict_false(toep->flags & TPF_SYNQE)) { 1926 KASSERT(toep->flags & TPF_ABORT_SHUTDOWN, 1927 ("%s: credits for a synq entry %p", __func__, toep)); 1928 return (0); 1929 } 1930 1931 inp = toep->inp; 1932 1933 KASSERT(opcode == CPL_FW4_ACK, 1934 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1935 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1936 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1937 1938 INP_WLOCK(inp); 1939 1940 if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) { 1941 INP_WUNLOCK(inp); 1942 return (0); 1943 } 1944 1945 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1946 ("%s: inp_flags 0x%x", __func__, inp->inp_flags)); 1947 1948 tp = intotcpcb(inp); 1949 1950 if (cpl->flags & CPL_FW4_ACK_FLAGS_SEQVAL) { 1951 tcp_seq snd_una = be32toh(cpl->snd_una); 1952 1953 #ifdef INVARIANTS 1954 if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) { 1955 log(LOG_ERR, 1956 "%s: unexpected seq# %x for TID %u, snd_una %x\n", 1957 __func__, snd_una, toep->tid, tp->snd_una); 1958 } 1959 #endif 1960 1961 if (tp->snd_una != snd_una) { 1962 tp->snd_una = snd_una; 1963 tp->ts_recent_age = tcp_ts_getticks(); 1964 } 1965 } 1966 1967 #ifdef VERBOSE_TRACES 1968 CTR3(KTR_CXGBE, "%s: tid %d credits %u", __func__, tid, credits); 1969 #endif 1970 so = inp->inp_socket; 1971 txsd = &toep->txsd[toep->txsd_cidx]; 1972 plen = 0; 1973 while (credits) { 1974 KASSERT(credits >= txsd->tx_credits, 1975 ("%s: too many (or partial) credits", __func__)); 1976 credits -= txsd->tx_credits; 1977 toep->tx_credits += txsd->tx_credits; 1978 plen += txsd->plen; 1979 txsd++; 1980 toep->txsd_avail++; 1981 KASSERT(toep->txsd_avail <= toep->txsd_total, 1982 ("%s: txsd avail > total", __func__)); 1983 if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) { 1984 txsd = &toep->txsd[0]; 1985 toep->txsd_cidx = 0; 1986 } 1987 } 1988 1989 if (toep->tx_credits == toep->tx_total) { 1990 toep->tx_nocompl = 0; 1991 toep->plen_nocompl = 0; 1992 } 1993 1994 if (toep->flags & TPF_TX_SUSPENDED && 1995 toep->tx_credits >= toep->tx_total / 4) { 1996 #ifdef VERBOSE_TRACES 
static int
do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_fw4_ack *cpl = (const void *)(rss + 1);
	unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl)));
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp;
	struct tcpcb *tp;
	struct socket *so;
	uint8_t credits = cpl->credits;
	struct ofld_tx_sdesc *txsd;
	int plen;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	/*
	 * Very unusual case: we'd sent a flowc + abort_req for a synq entry
	 * and now this comes back carrying the credits for the flowc.
	 */
	if (__predict_false(toep->flags & TPF_SYNQE)) {
		KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
		    ("%s: credits for a synq entry %p", __func__, toep));
		return (0);
	}

	inp = toep->inp;

	KASSERT(opcode == CPL_FW4_ACK,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	INP_WLOCK(inp);

	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) {
		INP_WUNLOCK(inp);
		return (0);
	}

	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
	    ("%s: inp_flags 0x%x", __func__, inp->inp_flags));

	tp = intotcpcb(inp);

	if (cpl->flags & CPL_FW4_ACK_FLAGS_SEQVAL) {
		tcp_seq snd_una = be32toh(cpl->snd_una);

#ifdef INVARIANTS
		if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
			log(LOG_ERR,
			    "%s: unexpected seq# %x for TID %u, snd_una %x\n",
			    __func__, snd_una, toep->tid, tp->snd_una);
		}
#endif

		if (tp->snd_una != snd_una) {
			tp->snd_una = snd_una;
			tp->ts_recent_age = tcp_ts_getticks();
		}
	}

#ifdef VERBOSE_TRACES
	CTR3(KTR_CXGBE, "%s: tid %d credits %u", __func__, tid, credits);
#endif
	so = inp->inp_socket;
	txsd = &toep->txsd[toep->txsd_cidx];
	plen = 0;
	while (credits) {
		KASSERT(credits >= txsd->tx_credits,
		    ("%s: too many (or partial) credits", __func__));
		credits -= txsd->tx_credits;
		toep->tx_credits += txsd->tx_credits;
		plen += txsd->plen;
		txsd++;
		toep->txsd_avail++;
		KASSERT(toep->txsd_avail <= toep->txsd_total,
		    ("%s: txsd avail > total", __func__));
		if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) {
			txsd = &toep->txsd[0];
			toep->txsd_cidx = 0;
		}
	}

	if (toep->tx_credits == toep->tx_total) {
		toep->tx_nocompl = 0;
		toep->plen_nocompl = 0;
	}

	if (toep->flags & TPF_TX_SUSPENDED &&
	    toep->tx_credits >= toep->tx_total / 4) {
#ifdef VERBOSE_TRACES
		CTR2(KTR_CXGBE, "%s: tid %d calling t4_push_frames", __func__,
		    tid);
#endif
		toep->flags &= ~TPF_TX_SUSPENDED;
		CURVNET_SET(toep->vnet);
		t4_push_data(sc, toep, plen);
		CURVNET_RESTORE();
	} else if (plen > 0) {
		struct sockbuf *sb = &so->so_snd;
		int sbu;

		SOCKBUF_LOCK(sb);
		sbu = sbused(sb);
		if (ulp_mode(toep) == ULP_MODE_ISCSI) {
			if (__predict_false(sbu > 0)) {
				/*
				 * The data transmitted before the
				 * tid's ULP mode changed to ISCSI is
				 * still in so_snd.  Incoming credits
				 * should account for so_snd first.
				 */
				sbdrop_locked(sb, min(sbu, plen));
				plen -= min(sbu, plen);
			}
			sowwakeup_locked(so);	/* unlocks so_snd */
			rqdrop_locked(&toep->ulp_pdu_reclaimq, plen);
		} else {
#ifdef VERBOSE_TRACES
			CTR3(KTR_CXGBE, "%s: tid %d dropped %d bytes", __func__,
			    tid, plen);
#endif
			sbdrop_locked(sb, plen);
			if (!TAILQ_EMPTY(&toep->aiotx_jobq))
				t4_aiotx_queue_toep(so, toep);
			sowwakeup_locked(so);	/* unlocks so_snd */
		}
		SOCKBUF_UNLOCK_ASSERT(sb);
	}

	INP_WUNLOCK(inp);

	return (0);
}

void
write_set_tcb_field(struct adapter *sc, void *dst, struct toepcb *toep,
    uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie)
{
	struct cpl_set_tcb_field *req = dst;

	MPASS((cookie & ~M_COOKIE) == 0);
	if (reply) {
		MPASS(cookie != CPL_COOKIE_RESERVED);
	}

	INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, toep->tid);
	req->reply_ctrl = htobe16(V_QUEUENO(toep->ofld_rxq->iq.abs_id));
	if (reply == 0)
		req->reply_ctrl |= htobe16(F_NO_REPLY);
	req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(cookie));
	req->mask = htobe64(mask);
	req->val = htobe64(val);
}

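/*
 * Send a CPL_SET_TCB_FIELD work request that updates a single word of this
 * connection's TCB.  When the request goes out on an offload queue, the
 * credits it consumes are tracked in the toepcb's tx descriptor ring like
 * any other offload work request; the cookie identifies the consumer of
 * the CPL_SET_TCB_RPL when a reply is requested.  A sketch of a call, with
 * placeholder field macros (the real W_TCB_*/V_*/M_* macros are defined in
 * common/t4_tcb.h):
 *
 *	t4_set_tcb_field(sc, &toep->ofld_txq->wrq, toep, W_TCB_EXAMPLE,
 *	    V_EXAMPLE(M_EXAMPLE), V_EXAMPLE(1), 1, CPL_COOKIE_TOM);
 */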
void
t4_set_tcb_field(struct adapter *sc, struct sge_wrq *wrq, struct toepcb *toep,
    uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie)
{
	struct wrqe *wr;
	struct ofld_tx_sdesc *txsd;
	const u_int len = sizeof(struct cpl_set_tcb_field);

	wr = alloc_wrqe(len, wrq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	write_set_tcb_field(sc, wrtod(wr), toep, word, mask, val, reply,
	    cookie);

	if (wrq->eq.type == EQ_OFLD) {
		txsd = &toep->txsd[toep->txsd_pidx];
		_Static_assert(howmany(len, 16) <= MAX_OFLD_TX_SDESC_CREDITS,
		    "MAX_OFLD_TX_SDESC_CREDITS too small");
		txsd->tx_credits = howmany(len, 16);
		txsd->plen = 0;
		KASSERT(toep->tx_credits >= txsd->tx_credits &&
		    toep->txsd_avail > 0,
		    ("%s: not enough credits (%d)", __func__,
		    toep->tx_credits));
		toep->tx_credits -= txsd->tx_credits;
		if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
			toep->txsd_pidx = 0;
		toep->txsd_avail--;
	}

	t4_wrq_tx(sc, wr);
}

void
t4_init_cpl_io_handlers(void)
{

	t4_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
	t4_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
	t4_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
	t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl,
	    CPL_COOKIE_TOM);
	t4_register_cpl_handler(CPL_RX_DATA, do_rx_data);
	t4_register_shared_cpl_handler(CPL_FW4_ACK, do_fw4_ack, CPL_COOKIE_TOM);
}

void
t4_uninit_cpl_io_handlers(void)
{

	t4_register_cpl_handler(CPL_PEER_CLOSE, NULL);
	t4_register_cpl_handler(CPL_CLOSE_CON_RPL, NULL);
	t4_register_cpl_handler(CPL_ABORT_REQ_RSS, NULL);
	t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, NULL, CPL_COOKIE_TOM);
	t4_register_cpl_handler(CPL_RX_DATA, NULL);
	t4_register_shared_cpl_handler(CPL_FW4_ACK, NULL, CPL_COOKIE_TOM);
}

/*
 * Use the 'backend1' field in AIO jobs to hold an error that should
 * be reported when the job is completed, the 'backend3' field to
 * store the amount of data sent by the AIO job so far, and the
 * 'backend4' field to hold a reference count on the job.
 *
 * Each unmapped mbuf holds a reference on the job as does the queue
 * so long as the job is queued.
 */
#define	aio_error	backend1
#define	aio_sent	backend3
#define	aio_refs	backend4

#ifdef VERBOSE_TRACES
static int
jobtotid(struct kaiocb *job)
{
	struct socket *so;
	struct tcpcb *tp;
	struct toepcb *toep;

	so = job->fd_file->f_data;
	tp = sototcpcb(so);
	toep = tp->t_toe;
	return (toep->tid);
}
#endif

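/*
 * Reference model for zero-copy AIO transmit jobs (see aio_refs above):
 * t4_aio_queue_aiotx() initializes the count to 1 for the reference held by
 * toep->aiotx_jobq, and alloc_aiotx_mbuf() takes one additional reference
 * per unmapped mbuf it builds.  aiotx_free_job() below is called whenever a
 * reference is dropped and completes (or cancels) the job once the last
 * reference is gone.
 */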
static void
aiotx_free_job(struct kaiocb *job)
{
	long status;
	int error;

	if (refcount_release(&job->aio_refs) == 0)
		return;

	error = (intptr_t)job->aio_error;
	status = job->aio_sent;
#ifdef VERBOSE_TRACES
	CTR5(KTR_CXGBE, "%s: tid %d completed %p len %ld, error %d", __func__,
	    jobtotid(job), job, status, error);
#endif
	if (error != 0 && status != 0)
		error = 0;
	if (error == ECANCELED)
		aio_cancel(job);
	else if (error)
		aio_complete(job, -1, error);
	else {
		job->msgsnd = 1;
		aio_complete(job, status, 0);
	}
}

static void
aiotx_free_pgs(struct mbuf *m)
{
	struct kaiocb *job;
	vm_page_t pg;

	M_ASSERTEXTPG(m);
	job = m->m_ext.ext_arg1;
#ifdef VERBOSE_TRACES
	CTR3(KTR_CXGBE, "%s: completed %d bytes for tid %d", __func__,
	    m->m_len, jobtotid(job));
#endif

	for (int i = 0; i < m->m_epg_npgs; i++) {
		pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]);
		vm_page_unwire(pg, PQ_ACTIVE);
	}

	aiotx_free_job(job);
}

/*
 * Allocate a chain of unmapped mbufs describing the next 'len' bytes
 * of an AIO job.
 */
static struct mbuf *
alloc_aiotx_mbuf(struct kaiocb *job, int len)
{
	struct vmspace *vm;
	vm_page_t pgs[MBUF_PEXT_MAX_PGS];
	struct mbuf *m, *top, *last;
	vm_map_t map;
	vm_offset_t start;
	int i, mlen, npages, pgoff;

	KASSERT(job->aio_sent + len <= job->uaiocb.aio_nbytes,
	    ("%s(%p, %d): request to send beyond end of buffer", __func__,
	    job, len));

	/*
	 * The AIO subsystem will cancel and drain all requests before
	 * permitting a process to exit or exec, so p_vmspace should
	 * be stable here.
	 */
	vm = job->userproc->p_vmspace;
	map = &vm->vm_map;
	start = (uintptr_t)job->uaiocb.aio_buf + job->aio_sent;
	pgoff = start & PAGE_MASK;

	top = NULL;
	last = NULL;
	while (len > 0) {
		mlen = imin(len, MBUF_PEXT_MAX_PGS * PAGE_SIZE - pgoff);
		KASSERT(mlen == len || ((start + mlen) & PAGE_MASK) == 0,
		    ("%s: next start (%#jx + %#x) is not page aligned",
		    __func__, (uintmax_t)start, mlen));

		npages = vm_fault_quick_hold_pages(map, start, mlen,
		    VM_PROT_WRITE, pgs, nitems(pgs));
		if (npages < 0)
			break;

		m = mb_alloc_ext_pgs(M_WAITOK, aiotx_free_pgs, M_RDONLY);
		m->m_epg_1st_off = pgoff;
		m->m_epg_npgs = npages;
		if (npages == 1) {
			KASSERT(mlen + pgoff <= PAGE_SIZE,
			    ("%s: single page is too large (off %d len %d)",
			    __func__, pgoff, mlen));
			m->m_epg_last_len = mlen;
		} else {
			m->m_epg_last_len = mlen - (PAGE_SIZE - pgoff) -
			    (npages - 2) * PAGE_SIZE;
		}
		for (i = 0; i < npages; i++)
			m->m_epg_pa[i] = VM_PAGE_TO_PHYS(pgs[i]);

		m->m_len = mlen;
		m->m_ext.ext_size = npages * PAGE_SIZE;
		m->m_ext.ext_arg1 = job;
		refcount_acquire(&job->aio_refs);

#ifdef VERBOSE_TRACES
		CTR5(KTR_CXGBE, "%s: tid %d, new mbuf %p for job %p, npages %d",
		    __func__, jobtotid(job), m, job, npages);
#endif

		if (top == NULL)
			top = m;
		else
			last->m_next = m;
		last = m;

		len -= mlen;
		start += mlen;
		pgoff = 0;
	}

	return (top);
}

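/*
 * Try to push the next chunk of an AIO transmit job out via the socket.
 * This open-codes the relevant parts of sosend_generic() and
 * tcp_usr_send(): the job's user pages are wired into unmapped mbufs,
 * appended to so_snd, and handed to tcp_output().  If the send buffer is
 * too full, or the request could not be finished in one pass on a blocking
 * socket, the job is put back on toep->aiotx_jobq to be retried when space
 * frees up.
 */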
static void
t4_aiotx_process_job(struct toepcb *toep, struct socket *so, struct kaiocb *job)
{
	struct sockbuf *sb;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct mbuf *m;
	u_int sent;
	int error, len;
	bool moretocome, sendmore;

	sb = &so->so_snd;
	SOCKBUF_UNLOCK(sb);
	m = NULL;

#ifdef MAC
	error = mac_socket_check_send(job->fd_file->f_cred, so);
	if (error != 0)
		goto out;
#endif

	/* Inline sosend_generic(). */

	error = SOCK_IO_SEND_LOCK(so, SBL_WAIT);
	MPASS(error == 0);

sendanother:
	SOCKBUF_LOCK(sb);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		SOCKBUF_UNLOCK(sb);
		SOCK_IO_SEND_UNLOCK(so);
		if ((so->so_options & SO_NOSIGPIPE) == 0) {
			PROC_LOCK(job->userproc);
			kern_psignal(job->userproc, SIGPIPE);
			PROC_UNLOCK(job->userproc);
		}
		error = EPIPE;
		goto out;
	}
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		SOCKBUF_UNLOCK(sb);
		SOCK_IO_SEND_UNLOCK(so);
		goto out;
	}
	if ((so->so_state & SS_ISCONNECTED) == 0) {
		SOCKBUF_UNLOCK(sb);
		SOCK_IO_SEND_UNLOCK(so);
		error = ENOTCONN;
		goto out;
	}
	if (sbspace(sb) < sb->sb_lowat) {
		MPASS(job->aio_sent == 0 || !(so->so_state & SS_NBIO));

		/*
		 * Don't block if there is too little room in the socket
		 * buffer.  Instead, requeue the request.
		 */
		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
			SOCKBUF_UNLOCK(sb);
			SOCK_IO_SEND_UNLOCK(so);
			error = ECANCELED;
			goto out;
		}
		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
		SOCKBUF_UNLOCK(sb);
		SOCK_IO_SEND_UNLOCK(so);
		goto out;
	}

	/*
	 * Write as much data as the socket permits, but no more than
	 * a single sndbuf at a time.
	 */
	len = sbspace(sb);
	if (len > job->uaiocb.aio_nbytes - job->aio_sent) {
		len = job->uaiocb.aio_nbytes - job->aio_sent;
		moretocome = false;
	} else
		moretocome = true;
	if (len > toep->params.sndbuf) {
		len = toep->params.sndbuf;
		sendmore = true;
	} else
		sendmore = false;

	if (!TAILQ_EMPTY(&toep->aiotx_jobq))
		moretocome = true;
	SOCKBUF_UNLOCK(sb);
	MPASS(len != 0);

	m = alloc_aiotx_mbuf(job, len);
	if (m == NULL) {
		SOCK_IO_SEND_UNLOCK(so);
		error = EFAULT;
		goto out;
	}

	/* Inlined tcp_usr_send(). */

	inp = toep->inp;
	INP_WLOCK(inp);
	if (inp->inp_flags & INP_DROPPED) {
		INP_WUNLOCK(inp);
		SOCK_IO_SEND_UNLOCK(so);
		error = ECONNRESET;
		goto out;
	}

	sent = m_length(m, NULL);
	job->aio_sent += sent;
	counter_u64_add(toep->ofld_txq->tx_aio_octets, sent);

	sbappendstream(sb, m, 0);
	m = NULL;

	if (!(inp->inp_flags & INP_DROPPED)) {
		tp = intotcpcb(inp);
		if (moretocome)
			tp->t_flags |= TF_MORETOCOME;
		error = tcp_output(tp);
		if (error < 0) {
			INP_UNLOCK_ASSERT(inp);
			SOCK_IO_SEND_UNLOCK(so);
			error = -error;
			goto out;
		}
		if (moretocome)
			tp->t_flags &= ~TF_MORETOCOME;
	}

	INP_WUNLOCK(inp);
	if (sendmore)
		goto sendanother;
	SOCK_IO_SEND_UNLOCK(so);

	if (error)
		goto out;

	/*
	 * If this is a blocking socket and the request has not been
	 * fully completed, requeue it until the socket is ready
	 * again.
	 */
	if (job->aio_sent < job->uaiocb.aio_nbytes &&
	    !(so->so_state & SS_NBIO)) {
		SOCKBUF_LOCK(sb);
		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
			SOCKBUF_UNLOCK(sb);
			error = ECANCELED;
			goto out;
		}
		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
		return;
	}

	/*
	 * If the request will not be requeued, drop the queue's
	 * reference to the job.  Any mbufs in flight should still
	 * hold a reference, but this drops the reference that the
	 * queue owns while it is waiting to queue mbufs to the
	 * socket.
	 */
	aiotx_free_job(job);
	counter_u64_add(toep->ofld_txq->tx_aio_jobs, 1);

out:
	if (error) {
		job->aio_error = (void *)(intptr_t)error;
		aiotx_free_job(job);
	}
	m_freem(m);
	SOCKBUF_LOCK(sb);
}

static void
t4_aiotx_task(void *context, int pending)
{
	struct toepcb *toep = context;
	struct socket *so;
	struct kaiocb *job;
	struct epoch_tracker et;

	so = toep->aiotx_so;
	CURVNET_SET(toep->vnet);
	NET_EPOCH_ENTER(et);
	SOCKBUF_LOCK(&so->so_snd);
	while (!TAILQ_EMPTY(&toep->aiotx_jobq) && sowriteable(so)) {
		job = TAILQ_FIRST(&toep->aiotx_jobq);
		TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
		if (!aio_clear_cancel_function(job))
			continue;

		t4_aiotx_process_job(toep, so, job);
	}
	toep->aiotx_so = NULL;
	SOCKBUF_UNLOCK(&so->so_snd);
	NET_EPOCH_EXIT(et);

	free_toepcb(toep);
	sorele(so);
	CURVNET_RESTORE();
}

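/*
 * Schedule the AIO transmit task for a connection.  The task is queued at
 * most once at a time: toep->aiotx_so is non-NULL while a run is pending or
 * in progress, and the socket and toepcb references taken here are released
 * by t4_aiotx_task() above.
 */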
"true" : "false"); 2490 #endif 2491 if (toep->aiotx_so != NULL) 2492 return; 2493 soref(so); 2494 toep->aiotx_so = so; 2495 hold_toepcb(toep); 2496 soaio_enqueue(&toep->aiotx_task); 2497 } 2498 2499 static void 2500 t4_aiotx_cancel(struct kaiocb *job) 2501 { 2502 struct socket *so; 2503 struct sockbuf *sb; 2504 struct tcpcb *tp; 2505 struct toepcb *toep; 2506 2507 so = job->fd_file->f_data; 2508 tp = sototcpcb(so); 2509 toep = tp->t_toe; 2510 MPASS(job->uaiocb.aio_lio_opcode == LIO_WRITE); 2511 sb = &so->so_snd; 2512 2513 SOCKBUF_LOCK(sb); 2514 if (!aio_cancel_cleared(job)) 2515 TAILQ_REMOVE(&toep->aiotx_jobq, job, list); 2516 SOCKBUF_UNLOCK(sb); 2517 2518 job->aio_error = (void *)(intptr_t)ECANCELED; 2519 aiotx_free_job(job); 2520 } 2521 2522 int 2523 t4_aio_queue_aiotx(struct socket *so, struct kaiocb *job) 2524 { 2525 struct tcpcb *tp = sototcpcb(so); 2526 struct toepcb *toep = tp->t_toe; 2527 struct adapter *sc = td_adapter(toep->td); 2528 2529 /* This only handles writes. */ 2530 if (job->uaiocb.aio_lio_opcode != LIO_WRITE) 2531 return (EOPNOTSUPP); 2532 2533 if (!sc->tt.tx_zcopy) 2534 return (EOPNOTSUPP); 2535 2536 if (tls_tx_key(toep)) 2537 return (EOPNOTSUPP); 2538 2539 SOCKBUF_LOCK(&so->so_snd); 2540 #ifdef VERBOSE_TRACES 2541 CTR3(KTR_CXGBE, "%s: queueing %p for tid %u", __func__, job, toep->tid); 2542 #endif 2543 if (!aio_set_cancel_function(job, t4_aiotx_cancel)) 2544 panic("new job was cancelled"); 2545 refcount_init(&job->aio_refs, 1); 2546 TAILQ_INSERT_TAIL(&toep->aiotx_jobq, job, list); 2547 if (sowriteable(so)) 2548 t4_aiotx_queue_toep(so, toep); 2549 SOCKBUF_UNLOCK(&so->so_snd); 2550 return (0); 2551 } 2552 2553 void 2554 aiotx_init_toep(struct toepcb *toep) 2555 { 2556 2557 TAILQ_INIT(&toep->aiotx_jobq); 2558 TASK_INIT(&toep->aiotx_task, 0, t4_aiotx_task, toep); 2559 } 2560 #endif 2561