1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2012, 2015 Chelsio Communications, Inc. 5 * All rights reserved. 6 * Written by: Navdeep Parhar <np@FreeBSD.org> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 #include <sys/cdefs.h> 31 __FBSDID("$FreeBSD$"); 32 33 #include "opt_inet.h" 34 #include "opt_inet6.h" 35 #include "opt_kern_tls.h" 36 #include "opt_ratelimit.h" 37 38 #ifdef TCP_OFFLOAD 39 #include <sys/param.h> 40 #include <sys/aio.h> 41 #include <sys/file.h> 42 #include <sys/kernel.h> 43 #include <sys/ktr.h> 44 #include <sys/module.h> 45 #include <sys/proc.h> 46 #include <sys/protosw.h> 47 #include <sys/domain.h> 48 #include <sys/socket.h> 49 #include <sys/socketvar.h> 50 #include <sys/sglist.h> 51 #include <sys/taskqueue.h> 52 #include <netinet/in.h> 53 #include <netinet/in_pcb.h> 54 #include <netinet/ip.h> 55 #include <netinet/ip6.h> 56 #define TCPSTATES 57 #include <netinet/tcp_fsm.h> 58 #include <netinet/tcp_seq.h> 59 #include <netinet/tcp_var.h> 60 #include <netinet/toecore.h> 61 62 #include <security/mac/mac_framework.h> 63 64 #include <vm/vm.h> 65 #include <vm/vm_extern.h> 66 #include <vm/pmap.h> 67 #include <vm/vm_map.h> 68 #include <vm/vm_page.h> 69 70 #include "common/common.h" 71 #include "common/t4_msg.h" 72 #include "common/t4_regs.h" 73 #include "common/t4_tcb.h" 74 #include "tom/t4_tom_l2t.h" 75 #include "tom/t4_tom.h" 76 77 static void t4_aiotx_cancel(struct kaiocb *job); 78 static void t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep); 79 80 void 81 send_flowc_wr(struct toepcb *toep, struct tcpcb *tp) 82 { 83 struct wrqe *wr; 84 struct fw_flowc_wr *flowc; 85 unsigned int nparams, flowclen, paramidx; 86 struct vi_info *vi = toep->vi; 87 struct port_info *pi = vi->pi; 88 struct adapter *sc = pi->adapter; 89 unsigned int pfvf = sc->pf << S_FW_VIID_PFN; 90 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; 91 92 KASSERT(!(toep->flags & TPF_FLOWC_WR_SENT), 93 ("%s: flowc for tid %u sent already", __func__, toep->tid)); 94 95 if (tp != NULL) 96 nparams = 8; 97 else 98 nparams = 6; 99 if (ulp_mode(toep) == ULP_MODE_TLS) 100 nparams++; 101 if (toep->tls.fcplenmax != 0) 102 nparams++; 103 if (toep->params.tc_idx != -1) { 104 MPASS(toep->params.tc_idx >= 0 && 105 
toep->params.tc_idx < sc->chip_params->nsched_cls); 106 nparams++; 107 } 108 109 flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval); 110 111 wr = alloc_wrqe(roundup2(flowclen, 16), &toep->ofld_txq->wrq); 112 if (wr == NULL) { 113 /* XXX */ 114 panic("%s: allocation failure.", __func__); 115 } 116 flowc = wrtod(wr); 117 memset(flowc, 0, wr->wr_len); 118 119 flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | 120 V_FW_FLOWC_WR_NPARAMS(nparams)); 121 flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) | 122 V_FW_WR_FLOWID(toep->tid)); 123 124 #define FLOWC_PARAM(__m, __v) \ 125 do { \ 126 flowc->mnemval[paramidx].mnemonic = FW_FLOWC_MNEM_##__m; \ 127 flowc->mnemval[paramidx].val = htobe32(__v); \ 128 paramidx++; \ 129 } while (0) 130 131 paramidx = 0; 132 133 FLOWC_PARAM(PFNVFN, pfvf); 134 FLOWC_PARAM(CH, pi->tx_chan); 135 FLOWC_PARAM(PORT, pi->tx_chan); 136 FLOWC_PARAM(IQID, toep->ofld_rxq->iq.abs_id); 137 FLOWC_PARAM(SNDBUF, toep->params.sndbuf); 138 if (tp) { 139 FLOWC_PARAM(MSS, toep->params.emss); 140 FLOWC_PARAM(SNDNXT, tp->snd_nxt); 141 FLOWC_PARAM(RCVNXT, tp->rcv_nxt); 142 } else 143 FLOWC_PARAM(MSS, 512); 144 CTR6(KTR_CXGBE, 145 "%s: tid %u, mss %u, sndbuf %u, snd_nxt 0x%x, rcv_nxt 0x%x", 146 __func__, toep->tid, toep->params.emss, toep->params.sndbuf, 147 tp ? tp->snd_nxt : 0, tp ? tp->rcv_nxt : 0); 148 149 if (ulp_mode(toep) == ULP_MODE_TLS) 150 FLOWC_PARAM(ULP_MODE, ulp_mode(toep)); 151 if (toep->tls.fcplenmax != 0) 152 FLOWC_PARAM(TXDATAPLEN_MAX, toep->tls.fcplenmax); 153 if (toep->params.tc_idx != -1) 154 FLOWC_PARAM(SCHEDCLASS, toep->params.tc_idx); 155 #undef FLOWC_PARAM 156 157 KASSERT(paramidx == nparams, ("nparams mismatch")); 158 159 txsd->tx_credits = howmany(flowclen, 16); 160 txsd->plen = 0; 161 KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0, 162 ("%s: not enough credits (%d)", __func__, toep->tx_credits)); 163 toep->tx_credits -= txsd->tx_credits; 164 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) 165 toep->txsd_pidx = 0; 166 toep->txsd_avail--; 167 168 toep->flags |= TPF_FLOWC_WR_SENT; 169 t4_wrq_tx(sc, wr); 170 } 171 172 #ifdef RATELIMIT 173 /* 174 * Input is Bytes/second (so_max_pacing_rate), chip counts in Kilobits/second. 
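 * (Purely illustrative example, not from the source: a so_max_pacing_rate of
 * 1,250,000 Bytes/s becomes 1,250,000 * 8 / 1000 = 10,000 Kbit/s, which is
 * what gets passed to t4_reserve_cl_rl_kbps() below.  A rate of 0 means
 * "unbind from the traffic class".)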
175 */ 176 static int 177 update_tx_rate_limit(struct adapter *sc, struct toepcb *toep, u_int Bps) 178 { 179 int tc_idx, rc; 180 const u_int kbps = (u_int) (uint64_t)Bps * 8ULL / 1000; 181 const int port_id = toep->vi->pi->port_id; 182 183 CTR3(KTR_CXGBE, "%s: tid %u, rate %uKbps", __func__, toep->tid, kbps); 184 185 if (kbps == 0) { 186 /* unbind */ 187 tc_idx = -1; 188 } else { 189 rc = t4_reserve_cl_rl_kbps(sc, port_id, kbps, &tc_idx); 190 if (rc != 0) 191 return (rc); 192 MPASS(tc_idx >= 0 && tc_idx < sc->chip_params->nsched_cls); 193 } 194 195 if (toep->params.tc_idx != tc_idx) { 196 struct wrqe *wr; 197 struct fw_flowc_wr *flowc; 198 int nparams = 1, flowclen, flowclen16; 199 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; 200 201 flowclen = sizeof(*flowc) + nparams * sizeof(struct 202 fw_flowc_mnemval); 203 flowclen16 = howmany(flowclen, 16); 204 if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0 || 205 (wr = alloc_wrqe(roundup2(flowclen, 16), 206 &toep->ofld_txq->wrq)) == NULL) { 207 if (tc_idx >= 0) 208 t4_release_cl_rl(sc, port_id, tc_idx); 209 return (ENOMEM); 210 } 211 212 flowc = wrtod(wr); 213 memset(flowc, 0, wr->wr_len); 214 215 flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | 216 V_FW_FLOWC_WR_NPARAMS(nparams)); 217 flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) | 218 V_FW_WR_FLOWID(toep->tid)); 219 220 flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS; 221 if (tc_idx == -1) 222 flowc->mnemval[0].val = htobe32(0xff); 223 else 224 flowc->mnemval[0].val = htobe32(tc_idx); 225 226 txsd->tx_credits = flowclen16; 227 txsd->plen = 0; 228 toep->tx_credits -= txsd->tx_credits; 229 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) 230 toep->txsd_pidx = 0; 231 toep->txsd_avail--; 232 t4_wrq_tx(sc, wr); 233 } 234 235 if (toep->params.tc_idx >= 0) 236 t4_release_cl_rl(sc, port_id, toep->params.tc_idx); 237 toep->params.tc_idx = tc_idx; 238 239 return (0); 240 } 241 #endif 242 243 void 244 send_reset(struct adapter *sc, struct toepcb *toep, uint32_t snd_nxt) 245 { 246 struct wrqe *wr; 247 struct cpl_abort_req *req; 248 int tid = toep->tid; 249 struct inpcb *inp = toep->inp; 250 struct tcpcb *tp = intotcpcb(inp); /* don't use if INP_DROPPED */ 251 252 INP_WLOCK_ASSERT(inp); 253 254 CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x%s", 255 __func__, toep->tid, 256 inp->inp_flags & INP_DROPPED ? "inp dropped" : 257 tcpstates[tp->t_state], 258 toep->flags, inp->inp_flags, 259 toep->flags & TPF_ABORT_SHUTDOWN ? 260 " (abort already in progress)" : ""); 261 262 if (toep->flags & TPF_ABORT_SHUTDOWN) 263 return; /* abort already in progress */ 264 265 toep->flags |= TPF_ABORT_SHUTDOWN; 266 267 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 268 ("%s: flowc_wr not sent for tid %d.", __func__, tid)); 269 270 wr = alloc_wrqe(sizeof(*req), &toep->ofld_txq->wrq); 271 if (wr == NULL) { 272 /* XXX */ 273 panic("%s: allocation failure.", __func__); 274 } 275 req = wrtod(wr); 276 277 INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid); 278 if (inp->inp_flags & INP_DROPPED) 279 req->rsvd0 = htobe32(snd_nxt); 280 else 281 req->rsvd0 = htobe32(tp->snd_nxt); 282 req->rsvd1 = !(toep->flags & TPF_TX_DATA_SENT); 283 req->cmd = CPL_ABORT_SEND_RST; 284 285 /* 286 * XXX: What's the correct way to tell that the inp hasn't been detached 287 * from its socket? Should I even be flushing the snd buffer here? 288 */ 289 if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) { 290 struct socket *so = inp->inp_socket; 291 292 if (so != NULL) /* because I'm not sure. 
See comment above */ 293 sbflush(&so->so_snd); 294 } 295 296 t4_l2t_send(sc, wr, toep->l2te); 297 } 298 299 /* 300 * Called when a connection is established to translate the TCP options 301 * reported by HW to FreeBSD's native format. 302 */ 303 static void 304 assign_rxopt(struct tcpcb *tp, uint16_t opt) 305 { 306 struct toepcb *toep = tp->t_toe; 307 struct inpcb *inp = tp->t_inpcb; 308 struct adapter *sc = td_adapter(toep->td); 309 310 INP_LOCK_ASSERT(inp); 311 312 toep->params.mtu_idx = G_TCPOPT_MSS(opt); 313 tp->t_maxseg = sc->params.mtus[toep->params.mtu_idx]; 314 if (inp->inp_inc.inc_flags & INC_ISIPV6) 315 tp->t_maxseg -= sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 316 else 317 tp->t_maxseg -= sizeof(struct ip) + sizeof(struct tcphdr); 318 319 toep->params.emss = tp->t_maxseg; 320 if (G_TCPOPT_TSTAMP(opt)) { 321 toep->params.tstamp = 1; 322 toep->params.emss -= TCPOLEN_TSTAMP_APPA; 323 tp->t_flags |= TF_RCVD_TSTMP; /* timestamps ok */ 324 tp->ts_recent = 0; /* hmmm */ 325 tp->ts_recent_age = tcp_ts_getticks(); 326 } else 327 toep->params.tstamp = 0; 328 329 if (G_TCPOPT_SACK(opt)) { 330 toep->params.sack = 1; 331 tp->t_flags |= TF_SACK_PERMIT; /* should already be set */ 332 } else { 333 toep->params.sack = 0; 334 tp->t_flags &= ~TF_SACK_PERMIT; /* sack disallowed by peer */ 335 } 336 337 if (G_TCPOPT_WSCALE_OK(opt)) 338 tp->t_flags |= TF_RCVD_SCALE; 339 340 /* Doing window scaling? */ 341 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 342 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 343 tp->rcv_scale = tp->request_r_scale; 344 tp->snd_scale = G_TCPOPT_SND_WSCALE(opt); 345 } else 346 toep->params.wscale = 0; 347 348 CTR6(KTR_CXGBE, 349 "assign_rxopt: tid %d, mtu_idx %u, emss %u, ts %u, sack %u, wscale %u", 350 toep->tid, toep->params.mtu_idx, toep->params.emss, 351 toep->params.tstamp, toep->params.sack, toep->params.wscale); 352 } 353 354 /* 355 * Completes some final bits of initialization for just established connections 356 * and changes their state to TCPS_ESTABLISHED. 357 * 358 * The ISNs are from the exchange of SYNs. 
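 * snd_una/snd_nxt/snd_max are seeded with iss + 1 and rcv_nxt with irs + 1
 * because each SYN consumes one sequence number.  The flowc work request
 * sent at the end is expected to precede any other WR for the tid (see the
 * TPF_FLOWC_WR_SENT assertions elsewhere in this file).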
359 */ 360 void 361 make_established(struct toepcb *toep, uint32_t iss, uint32_t irs, uint16_t opt) 362 { 363 struct inpcb *inp = toep->inp; 364 struct socket *so = inp->inp_socket; 365 struct tcpcb *tp = intotcpcb(inp); 366 uint16_t tcpopt = be16toh(opt); 367 368 INP_WLOCK_ASSERT(inp); 369 KASSERT(tp->t_state == TCPS_SYN_SENT || 370 tp->t_state == TCPS_SYN_RECEIVED, 371 ("%s: TCP state %s", __func__, tcpstates[tp->t_state])); 372 373 CTR6(KTR_CXGBE, "%s: tid %d, so %p, inp %p, tp %p, toep %p", 374 __func__, toep->tid, so, inp, tp, toep); 375 376 tcp_state_change(tp, TCPS_ESTABLISHED); 377 tp->t_starttime = ticks; 378 TCPSTAT_INC(tcps_connects); 379 380 tp->irs = irs; 381 tcp_rcvseqinit(tp); 382 tp->rcv_wnd = (u_int)toep->params.opt0_bufsize << 10; 383 tp->rcv_adv += tp->rcv_wnd; 384 tp->last_ack_sent = tp->rcv_nxt; 385 386 tp->iss = iss; 387 tcp_sendseqinit(tp); 388 tp->snd_una = iss + 1; 389 tp->snd_nxt = iss + 1; 390 tp->snd_max = iss + 1; 391 392 assign_rxopt(tp, tcpopt); 393 send_flowc_wr(toep, tp); 394 395 soisconnected(so); 396 397 if (ulp_mode(toep) == ULP_MODE_TLS) 398 tls_establish(toep); 399 } 400 401 int 402 send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits) 403 { 404 struct wrqe *wr; 405 struct cpl_rx_data_ack *req; 406 uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); 407 408 KASSERT(credits >= 0, ("%s: %d credits", __func__, credits)); 409 410 wr = alloc_wrqe(sizeof(*req), toep->ctrlq); 411 if (wr == NULL) 412 return (0); 413 req = wrtod(wr); 414 415 INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid); 416 req->credit_dack = htobe32(dack | V_RX_CREDITS(credits)); 417 418 t4_wrq_tx(sc, wr); 419 return (credits); 420 } 421 422 void 423 send_rx_modulate(struct adapter *sc, struct toepcb *toep) 424 { 425 struct wrqe *wr; 426 struct cpl_rx_data_ack *req; 427 428 wr = alloc_wrqe(sizeof(*req), toep->ctrlq); 429 if (wr == NULL) 430 return; 431 req = wrtod(wr); 432 433 INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid); 434 req->credit_dack = htobe32(F_RX_MODULATE_RX); 435 436 t4_wrq_tx(sc, wr); 437 } 438 439 void 440 t4_rcvd_locked(struct toedev *tod, struct tcpcb *tp) 441 { 442 struct adapter *sc = tod->tod_softc; 443 struct inpcb *inp = tp->t_inpcb; 444 struct socket *so = inp->inp_socket; 445 struct sockbuf *sb = &so->so_rcv; 446 struct toepcb *toep = tp->t_toe; 447 int rx_credits; 448 449 INP_WLOCK_ASSERT(inp); 450 SOCKBUF_LOCK_ASSERT(sb); 451 452 rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0; 453 if (rx_credits > 0 && 454 (tp->rcv_wnd <= 32 * 1024 || rx_credits >= 64 * 1024 || 455 (rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) || 456 sbused(sb) + tp->rcv_wnd < sb->sb_lowat)) { 457 rx_credits = send_rx_credits(sc, toep, rx_credits); 458 tp->rcv_wnd += rx_credits; 459 tp->rcv_adv += rx_credits; 460 } else if (toep->flags & TPF_FORCE_CREDITS) 461 send_rx_modulate(sc, toep); 462 } 463 464 void 465 t4_rcvd(struct toedev *tod, struct tcpcb *tp) 466 { 467 struct inpcb *inp = tp->t_inpcb; 468 struct socket *so = inp->inp_socket; 469 struct sockbuf *sb = &so->so_rcv; 470 471 SOCKBUF_LOCK(sb); 472 t4_rcvd_locked(tod, tp); 473 SOCKBUF_UNLOCK(sb); 474 } 475 476 /* 477 * Close a connection by sending a CPL_CLOSE_CON_REQ message. 478 */ 479 int 480 t4_close_conn(struct adapter *sc, struct toepcb *toep) 481 { 482 struct wrqe *wr; 483 struct cpl_close_con_req *req; 484 unsigned int tid = toep->tid; 485 486 CTR3(KTR_CXGBE, "%s: tid %u%s", __func__, toep->tid, 487 toep->flags & TPF_FIN_SENT ? 
", IGNORED" : ""); 488 489 if (toep->flags & TPF_FIN_SENT) 490 return (0); 491 492 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 493 ("%s: flowc_wr not sent for tid %u.", __func__, tid)); 494 495 wr = alloc_wrqe(sizeof(*req), &toep->ofld_txq->wrq); 496 if (wr == NULL) { 497 /* XXX */ 498 panic("%s: allocation failure.", __func__); 499 } 500 req = wrtod(wr); 501 502 req->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) | 503 V_FW_WR_IMMDLEN(sizeof(*req) - sizeof(req->wr))); 504 req->wr.wr_mid = htonl(V_FW_WR_LEN16(howmany(sizeof(*req), 16)) | 505 V_FW_WR_FLOWID(tid)); 506 req->wr.wr_lo = cpu_to_be64(0); 507 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid)); 508 req->rsvd = 0; 509 510 toep->flags |= TPF_FIN_SENT; 511 toep->flags &= ~TPF_SEND_FIN; 512 t4_l2t_send(sc, wr, toep->l2te); 513 514 return (0); 515 } 516 517 #define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16) 518 #define MIN_OFLD_TX_CREDITS (howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16)) 519 520 /* Maximum amount of immediate data we could stuff in a WR */ 521 static inline int 522 max_imm_payload(int tx_credits) 523 { 524 const int n = 1; /* Use no more than one desc for imm. data WR */ 525 526 KASSERT(tx_credits >= 0 && 527 tx_credits <= MAX_OFLD_TX_CREDITS, 528 ("%s: %d credits", __func__, tx_credits)); 529 530 if (tx_credits < MIN_OFLD_TX_CREDITS) 531 return (0); 532 533 if (tx_credits >= (n * EQ_ESIZE) / 16) 534 return ((n * EQ_ESIZE) - sizeof(struct fw_ofld_tx_data_wr)); 535 else 536 return (tx_credits * 16 - sizeof(struct fw_ofld_tx_data_wr)); 537 } 538 539 /* Maximum number of SGL entries we could stuff in a WR */ 540 static inline int 541 max_dsgl_nsegs(int tx_credits) 542 { 543 int nseg = 1; /* ulptx_sgl has room for 1, rest ulp_tx_sge_pair */ 544 int sge_pair_credits = tx_credits - MIN_OFLD_TX_CREDITS; 545 546 KASSERT(tx_credits >= 0 && 547 tx_credits <= MAX_OFLD_TX_CREDITS, 548 ("%s: %d credits", __func__, tx_credits)); 549 550 if (tx_credits < MIN_OFLD_TX_CREDITS) 551 return (0); 552 553 nseg += 2 * (sge_pair_credits * 16 / 24); 554 if ((sge_pair_credits * 16) % 24 == 16) 555 nseg++; 556 557 return (nseg); 558 } 559 560 static inline void 561 write_tx_wr(void *dst, struct toepcb *toep, unsigned int immdlen, 562 unsigned int plen, uint8_t credits, int shove, int ulp_submode) 563 { 564 struct fw_ofld_tx_data_wr *txwr = dst; 565 566 txwr->op_to_immdlen = htobe32(V_WR_OP(FW_OFLD_TX_DATA_WR) | 567 V_FW_WR_IMMDLEN(immdlen)); 568 txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) | 569 V_FW_WR_LEN16(credits)); 570 txwr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(ulp_mode(toep)) | 571 V_TX_ULP_SUBMODE(ulp_submode) | V_TX_URG(0) | V_TX_SHOVE(shove)); 572 txwr->plen = htobe32(plen); 573 574 if (toep->params.tx_align > 0) { 575 if (plen < 2 * toep->params.emss) 576 txwr->lsodisable_to_flags |= 577 htobe32(F_FW_OFLD_TX_DATA_WR_LSODISABLE); 578 else 579 txwr->lsodisable_to_flags |= 580 htobe32(F_FW_OFLD_TX_DATA_WR_ALIGNPLD | 581 (toep->params.nagle == 0 ? 0 : 582 F_FW_OFLD_TX_DATA_WR_ALIGNPLDSHOVE)); 583 } 584 } 585 586 /* 587 * Generate a DSGL from a starting mbuf. The total number of segments and the 588 * maximum segments in any one mbuf are provided. 
589 */ 590 static void 591 write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n) 592 { 593 struct mbuf *m; 594 struct ulptx_sgl *usgl = dst; 595 int i, j, rc; 596 struct sglist sg; 597 struct sglist_seg segs[n]; 598 599 KASSERT(nsegs > 0, ("%s: nsegs 0", __func__)); 600 601 sglist_init(&sg, n, segs); 602 usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) | 603 V_ULPTX_NSGE(nsegs)); 604 605 i = -1; 606 for (m = start; m != stop; m = m->m_next) { 607 if (m->m_flags & M_EXTPG) 608 rc = sglist_append_mbuf_epg(&sg, m, 609 mtod(m, vm_offset_t), m->m_len); 610 else 611 rc = sglist_append(&sg, mtod(m, void *), m->m_len); 612 if (__predict_false(rc != 0)) 613 panic("%s: sglist_append %d", __func__, rc); 614 615 for (j = 0; j < sg.sg_nseg; i++, j++) { 616 if (i < 0) { 617 usgl->len0 = htobe32(segs[j].ss_len); 618 usgl->addr0 = htobe64(segs[j].ss_paddr); 619 } else { 620 usgl->sge[i / 2].len[i & 1] = 621 htobe32(segs[j].ss_len); 622 usgl->sge[i / 2].addr[i & 1] = 623 htobe64(segs[j].ss_paddr); 624 } 625 #ifdef INVARIANTS 626 nsegs--; 627 #endif 628 } 629 sglist_reset(&sg); 630 } 631 if (i & 1) 632 usgl->sge[i / 2].len[1] = htobe32(0); 633 KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, stop %p", 634 __func__, nsegs, start, stop)); 635 } 636 637 /* 638 * Max number of SGL entries an offload tx work request can have. This is 41 639 * (1 + 40) for a full 512B work request. 640 * fw_ofld_tx_data_wr(16B) + ulptx_sgl(16B, 1) + ulptx_sge_pair(480B, 40) 641 */ 642 #define OFLD_SGL_LEN (41) 643 644 /* 645 * Send data and/or a FIN to the peer. 646 * 647 * The socket's so_snd buffer consists of a stream of data starting with sb_mb 648 * and linked together with m_next. sb_sndptr, if set, is the last mbuf that 649 * was transmitted. 650 * 651 * drop indicates the number of bytes that should be dropped from the head of 652 * the send buffer. It is an optimization that lets do_fw4_ack avoid creating 653 * contention on the send buffer lock (before this change it used to do 654 * sowwakeup and then t4_push_frames right after that when recovering from tx 655 * stalls). When drop is set this function MUST drop the bytes and wake up any 656 * writers. 
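 * Each work request built here carries its payload either as immediate data
 * (when it fits in max_imm_payload() and no M_EXTPG mbufs are involved) or
 * as a DSGL built by write_tx_sgl(), which stores the first segment in
 * len0/addr0 of the ulptx_sgl and packs the rest two per ulptx_sge_pair.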
657 */ 658 void 659 t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop) 660 { 661 struct mbuf *sndptr, *m, *sb_sndptr; 662 struct fw_ofld_tx_data_wr *txwr; 663 struct wrqe *wr; 664 u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf; 665 struct inpcb *inp = toep->inp; 666 struct tcpcb *tp = intotcpcb(inp); 667 struct socket *so = inp->inp_socket; 668 struct sockbuf *sb = &so->so_snd; 669 int tx_credits, shove, compl, sowwakeup; 670 struct ofld_tx_sdesc *txsd; 671 bool nomap_mbuf_seen; 672 673 INP_WLOCK_ASSERT(inp); 674 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 675 ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid)); 676 677 KASSERT(ulp_mode(toep) == ULP_MODE_NONE || 678 ulp_mode(toep) == ULP_MODE_TCPDDP || 679 ulp_mode(toep) == ULP_MODE_TLS || 680 ulp_mode(toep) == ULP_MODE_RDMA, 681 ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep)); 682 683 #ifdef VERBOSE_TRACES 684 CTR5(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d", 685 __func__, toep->tid, toep->flags, tp->t_flags, drop); 686 #endif 687 if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) 688 return; 689 690 #ifdef RATELIMIT 691 if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) && 692 (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) { 693 inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; 694 } 695 #endif 696 697 /* 698 * This function doesn't resume by itself. Someone else must clear the 699 * flag and call this function. 700 */ 701 if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) { 702 KASSERT(drop == 0, 703 ("%s: drop (%d) != 0 but tx is suspended", __func__, drop)); 704 return; 705 } 706 707 txsd = &toep->txsd[toep->txsd_pidx]; 708 do { 709 tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS); 710 max_imm = max_imm_payload(tx_credits); 711 max_nsegs = max_dsgl_nsegs(tx_credits); 712 713 SOCKBUF_LOCK(sb); 714 sowwakeup = drop; 715 if (drop) { 716 sbdrop_locked(sb, drop); 717 drop = 0; 718 } 719 sb_sndptr = sb->sb_sndptr; 720 sndptr = sb_sndptr ? 
sb_sndptr->m_next : sb->sb_mb; 721 plen = 0; 722 nsegs = 0; 723 max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */ 724 nomap_mbuf_seen = false; 725 for (m = sndptr; m != NULL; m = m->m_next) { 726 int n; 727 728 if ((m->m_flags & M_NOTAVAIL) != 0) 729 break; 730 if (m->m_flags & M_EXTPG) { 731 #ifdef KERN_TLS 732 if (m->m_epg_tls != NULL) { 733 toep->flags |= TPF_KTLS; 734 if (plen == 0) { 735 SOCKBUF_UNLOCK(sb); 736 t4_push_ktls(sc, toep, 0); 737 return; 738 } 739 break; 740 } 741 #endif 742 n = sglist_count_mbuf_epg(m, 743 mtod(m, vm_offset_t), m->m_len); 744 } else 745 n = sglist_count(mtod(m, void *), m->m_len); 746 747 nsegs += n; 748 plen += m->m_len; 749 750 /* This mbuf sent us _over_ the nsegs limit, back out */ 751 if (plen > max_imm && nsegs > max_nsegs) { 752 nsegs -= n; 753 plen -= m->m_len; 754 if (plen == 0) { 755 /* Too few credits */ 756 toep->flags |= TPF_TX_SUSPENDED; 757 if (sowwakeup) { 758 if (!TAILQ_EMPTY( 759 &toep->aiotx_jobq)) 760 t4_aiotx_queue_toep(so, 761 toep); 762 sowwakeup_locked(so); 763 } else 764 SOCKBUF_UNLOCK(sb); 765 SOCKBUF_UNLOCK_ASSERT(sb); 766 return; 767 } 768 break; 769 } 770 771 if (m->m_flags & M_EXTPG) 772 nomap_mbuf_seen = true; 773 if (max_nsegs_1mbuf < n) 774 max_nsegs_1mbuf = n; 775 sb_sndptr = m; /* new sb->sb_sndptr if all goes well */ 776 777 /* This mbuf put us right at the max_nsegs limit */ 778 if (plen > max_imm && nsegs == max_nsegs) { 779 m = m->m_next; 780 break; 781 } 782 } 783 784 if (sbused(sb) > sb->sb_hiwat * 5 / 8 && 785 toep->plen_nocompl + plen >= sb->sb_hiwat / 4) 786 compl = 1; 787 else 788 compl = 0; 789 790 if (sb->sb_flags & SB_AUTOSIZE && 791 V_tcp_do_autosndbuf && 792 sb->sb_hiwat < V_tcp_autosndbuf_max && 793 sbused(sb) >= sb->sb_hiwat * 7 / 8) { 794 int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc, 795 V_tcp_autosndbuf_max); 796 797 if (!sbreserve_locked(sb, newsize, so, NULL)) 798 sb->sb_flags &= ~SB_AUTOSIZE; 799 else 800 sowwakeup = 1; /* room available */ 801 } 802 if (sowwakeup) { 803 if (!TAILQ_EMPTY(&toep->aiotx_jobq)) 804 t4_aiotx_queue_toep(so, toep); 805 sowwakeup_locked(so); 806 } else 807 SOCKBUF_UNLOCK(sb); 808 SOCKBUF_UNLOCK_ASSERT(sb); 809 810 /* nothing to send */ 811 if (plen == 0) { 812 KASSERT(m == NULL || (m->m_flags & M_NOTAVAIL) != 0, 813 ("%s: nothing to send, but m != NULL is ready", 814 __func__)); 815 break; 816 } 817 818 if (__predict_false(toep->flags & TPF_FIN_SENT)) 819 panic("%s: excess tx.", __func__); 820 821 shove = m == NULL && !(tp->t_flags & TF_MORETOCOME); 822 if (plen <= max_imm && !nomap_mbuf_seen) { 823 824 /* Immediate data tx */ 825 826 wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16), 827 &toep->ofld_txq->wrq); 828 if (wr == NULL) { 829 /* XXX: how will we recover from this? */ 830 toep->flags |= TPF_TX_SUSPENDED; 831 return; 832 } 833 txwr = wrtod(wr); 834 credits = howmany(wr->wr_len, 16); 835 write_tx_wr(txwr, toep, plen, plen, credits, shove, 0); 836 m_copydata(sndptr, 0, plen, (void *)(txwr + 1)); 837 nsegs = 0; 838 } else { 839 int wr_len; 840 841 /* DSGL tx */ 842 843 wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) + 844 ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8; 845 wr = alloc_wrqe(roundup2(wr_len, 16), 846 &toep->ofld_txq->wrq); 847 if (wr == NULL) { 848 /* XXX: how will we recover from this? 
*/ 849 toep->flags |= TPF_TX_SUSPENDED; 850 return; 851 } 852 txwr = wrtod(wr); 853 credits = howmany(wr_len, 16); 854 write_tx_wr(txwr, toep, 0, plen, credits, shove, 0); 855 write_tx_sgl(txwr + 1, sndptr, m, nsegs, 856 max_nsegs_1mbuf); 857 if (wr_len & 0xf) { 858 uint64_t *pad = (uint64_t *) 859 ((uintptr_t)txwr + wr_len); 860 *pad = 0; 861 } 862 } 863 864 KASSERT(toep->tx_credits >= credits, 865 ("%s: not enough credits", __func__)); 866 867 toep->tx_credits -= credits; 868 toep->tx_nocompl += credits; 869 toep->plen_nocompl += plen; 870 if (toep->tx_credits <= toep->tx_total * 3 / 8 && 871 toep->tx_nocompl >= toep->tx_total / 4) 872 compl = 1; 873 874 if (compl || ulp_mode(toep) == ULP_MODE_RDMA) { 875 txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL); 876 toep->tx_nocompl = 0; 877 toep->plen_nocompl = 0; 878 } 879 880 tp->snd_nxt += plen; 881 tp->snd_max += plen; 882 883 SOCKBUF_LOCK(sb); 884 KASSERT(sb_sndptr, ("%s: sb_sndptr is NULL", __func__)); 885 sb->sb_sndptr = sb_sndptr; 886 SOCKBUF_UNLOCK(sb); 887 888 toep->flags |= TPF_TX_DATA_SENT; 889 if (toep->tx_credits < MIN_OFLD_TX_CREDITS) 890 toep->flags |= TPF_TX_SUSPENDED; 891 892 KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); 893 txsd->plen = plen; 894 txsd->tx_credits = credits; 895 txsd++; 896 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) { 897 toep->txsd_pidx = 0; 898 txsd = &toep->txsd[0]; 899 } 900 toep->txsd_avail--; 901 902 t4_l2t_send(sc, wr, toep->l2te); 903 } while (m != NULL && (m->m_flags & M_NOTAVAIL) == 0); 904 905 /* Send a FIN if requested, but only if there's no more data to send */ 906 if (m == NULL && toep->flags & TPF_SEND_FIN) 907 t4_close_conn(sc, toep); 908 } 909 910 static inline void 911 rqdrop_locked(struct mbufq *q, int plen) 912 { 913 struct mbuf *m; 914 915 while (plen > 0) { 916 m = mbufq_dequeue(q); 917 918 /* Too many credits. */ 919 MPASS(m != NULL); 920 M_ASSERTPKTHDR(m); 921 922 /* Partial credits. */ 923 MPASS(plen >= m->m_pkthdr.len); 924 925 plen -= m->m_pkthdr.len; 926 m_freem(m); 927 } 928 } 929 930 void 931 t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop) 932 { 933 struct mbuf *sndptr, *m; 934 struct fw_ofld_tx_data_wr *txwr; 935 struct wrqe *wr; 936 u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf; 937 u_int adjusted_plen, ulp_submode; 938 struct inpcb *inp = toep->inp; 939 struct tcpcb *tp = intotcpcb(inp); 940 int tx_credits, shove; 941 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; 942 struct mbufq *pduq = &toep->ulp_pduq; 943 static const u_int ulp_extra_len[] = {0, 4, 4, 8}; 944 945 INP_WLOCK_ASSERT(inp); 946 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 947 ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid)); 948 KASSERT(ulp_mode(toep) == ULP_MODE_ISCSI, 949 ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep)); 950 951 if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) 952 return; 953 954 /* 955 * This function doesn't resume by itself. Someone else must clear the 956 * flag and call this function. 
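 * Note that 'drop' means something slightly different here than in
 * t4_push_frames(): PDUs that have been handed to the hardware sit on
 * ulp_pdu_reclaimq until their credits return, so the drop is applied to
 * that queue (rqdrop_locked()) rather than to so_snd.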
957 */ 958 if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) { 959 KASSERT(drop == 0, 960 ("%s: drop (%d) != 0 but tx is suspended", __func__, drop)); 961 return; 962 } 963 964 if (drop) 965 rqdrop_locked(&toep->ulp_pdu_reclaimq, drop); 966 967 while ((sndptr = mbufq_first(pduq)) != NULL) { 968 M_ASSERTPKTHDR(sndptr); 969 970 tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS); 971 max_imm = max_imm_payload(tx_credits); 972 max_nsegs = max_dsgl_nsegs(tx_credits); 973 974 plen = 0; 975 nsegs = 0; 976 max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */ 977 for (m = sndptr; m != NULL; m = m->m_next) { 978 int n = sglist_count(mtod(m, void *), m->m_len); 979 980 nsegs += n; 981 plen += m->m_len; 982 983 /* 984 * This mbuf would send us _over_ the nsegs limit. 985 * Suspend tx because the PDU can't be sent out. 986 */ 987 if (plen > max_imm && nsegs > max_nsegs) { 988 toep->flags |= TPF_TX_SUSPENDED; 989 return; 990 } 991 992 if (max_nsegs_1mbuf < n) 993 max_nsegs_1mbuf = n; 994 } 995 996 if (__predict_false(toep->flags & TPF_FIN_SENT)) 997 panic("%s: excess tx.", __func__); 998 999 /* 1000 * We have a PDU to send. All of it goes out in one WR so 'm' 1001 * is NULL. A PDU's length is always a multiple of 4. 1002 */ 1003 MPASS(m == NULL); 1004 MPASS((plen & 3) == 0); 1005 MPASS(sndptr->m_pkthdr.len == plen); 1006 1007 shove = !(tp->t_flags & TF_MORETOCOME); 1008 ulp_submode = mbuf_ulp_submode(sndptr); 1009 MPASS(ulp_submode < nitems(ulp_extra_len)); 1010 1011 /* 1012 * plen doesn't include header and data digests, which are 1013 * generated and inserted in the right places by the TOE, but 1014 * they do occupy TCP sequence space and need to be accounted 1015 * for. 1016 */ 1017 adjusted_plen = plen + ulp_extra_len[ulp_submode]; 1018 if (plen <= max_imm) { 1019 1020 /* Immediate data tx */ 1021 1022 wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16), 1023 &toep->ofld_txq->wrq); 1024 if (wr == NULL) { 1025 /* XXX: how will we recover from this? */ 1026 toep->flags |= TPF_TX_SUSPENDED; 1027 return; 1028 } 1029 txwr = wrtod(wr); 1030 credits = howmany(wr->wr_len, 16); 1031 write_tx_wr(txwr, toep, plen, adjusted_plen, credits, 1032 shove, ulp_submode); 1033 m_copydata(sndptr, 0, plen, (void *)(txwr + 1)); 1034 nsegs = 0; 1035 } else { 1036 int wr_len; 1037 1038 /* DSGL tx */ 1039 wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) + 1040 ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8; 1041 wr = alloc_wrqe(roundup2(wr_len, 16), 1042 &toep->ofld_txq->wrq); 1043 if (wr == NULL) { 1044 /* XXX: how will we recover from this? 
*/ 1045 toep->flags |= TPF_TX_SUSPENDED; 1046 return; 1047 } 1048 txwr = wrtod(wr); 1049 credits = howmany(wr_len, 16); 1050 write_tx_wr(txwr, toep, 0, adjusted_plen, credits, 1051 shove, ulp_submode); 1052 write_tx_sgl(txwr + 1, sndptr, m, nsegs, 1053 max_nsegs_1mbuf); 1054 if (wr_len & 0xf) { 1055 uint64_t *pad = (uint64_t *) 1056 ((uintptr_t)txwr + wr_len); 1057 *pad = 0; 1058 } 1059 } 1060 1061 KASSERT(toep->tx_credits >= credits, 1062 ("%s: not enough credits", __func__)); 1063 1064 m = mbufq_dequeue(pduq); 1065 MPASS(m == sndptr); 1066 mbufq_enqueue(&toep->ulp_pdu_reclaimq, m); 1067 1068 toep->tx_credits -= credits; 1069 toep->tx_nocompl += credits; 1070 toep->plen_nocompl += plen; 1071 if (toep->tx_credits <= toep->tx_total * 3 / 8 && 1072 toep->tx_nocompl >= toep->tx_total / 4) { 1073 txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL); 1074 toep->tx_nocompl = 0; 1075 toep->plen_nocompl = 0; 1076 } 1077 1078 tp->snd_nxt += adjusted_plen; 1079 tp->snd_max += adjusted_plen; 1080 1081 toep->flags |= TPF_TX_DATA_SENT; 1082 if (toep->tx_credits < MIN_OFLD_TX_CREDITS) 1083 toep->flags |= TPF_TX_SUSPENDED; 1084 1085 KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); 1086 txsd->plen = plen; 1087 txsd->tx_credits = credits; 1088 txsd++; 1089 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) { 1090 toep->txsd_pidx = 0; 1091 txsd = &toep->txsd[0]; 1092 } 1093 toep->txsd_avail--; 1094 1095 counter_u64_add(toep->ofld_txq->tx_iscsi_pdus, 1); 1096 counter_u64_add(toep->ofld_txq->tx_iscsi_octets, plen); 1097 1098 t4_l2t_send(sc, wr, toep->l2te); 1099 } 1100 1101 /* Send a FIN if requested, but only if there are no more PDUs to send */ 1102 if (mbufq_first(pduq) == NULL && toep->flags & TPF_SEND_FIN) 1103 t4_close_conn(sc, toep); 1104 } 1105 1106 static inline void 1107 t4_push_data(struct adapter *sc, struct toepcb *toep, int drop) 1108 { 1109 1110 if (ulp_mode(toep) == ULP_MODE_ISCSI) 1111 t4_push_pdus(sc, toep, drop); 1112 else if (tls_tx_key(toep) && toep->tls.mode == TLS_MODE_TLSOM) 1113 t4_push_tls_records(sc, toep, drop); 1114 #ifdef KERN_TLS 1115 else if (toep->flags & TPF_KTLS) 1116 t4_push_ktls(sc, toep, drop); 1117 #endif 1118 else 1119 t4_push_frames(sc, toep, drop); 1120 } 1121 1122 int 1123 t4_tod_output(struct toedev *tod, struct tcpcb *tp) 1124 { 1125 struct adapter *sc = tod->tod_softc; 1126 #ifdef INVARIANTS 1127 struct inpcb *inp = tp->t_inpcb; 1128 #endif 1129 struct toepcb *toep = tp->t_toe; 1130 1131 INP_WLOCK_ASSERT(inp); 1132 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1133 ("%s: inp %p dropped.", __func__, inp)); 1134 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 1135 1136 t4_push_data(sc, toep, 0); 1137 1138 return (0); 1139 } 1140 1141 int 1142 t4_send_fin(struct toedev *tod, struct tcpcb *tp) 1143 { 1144 struct adapter *sc = tod->tod_softc; 1145 #ifdef INVARIANTS 1146 struct inpcb *inp = tp->t_inpcb; 1147 #endif 1148 struct toepcb *toep = tp->t_toe; 1149 1150 INP_WLOCK_ASSERT(inp); 1151 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1152 ("%s: inp %p dropped.", __func__, inp)); 1153 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 1154 1155 toep->flags |= TPF_SEND_FIN; 1156 if (tp->t_state >= TCPS_ESTABLISHED) 1157 t4_push_data(sc, toep, 0); 1158 1159 return (0); 1160 } 1161 1162 int 1163 t4_send_rst(struct toedev *tod, struct tcpcb *tp) 1164 { 1165 struct adapter *sc = tod->tod_softc; 1166 #if defined(INVARIANTS) 1167 struct inpcb *inp = tp->t_inpcb; 1168 #endif 1169 struct toepcb *toep = tp->t_toe; 1170 1171 INP_WLOCK_ASSERT(inp); 1172 
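	/*
	 * An offloaded connection is reset with a CPL_ABORT_REQ carrying
	 * CPL_ABORT_SEND_RST (see send_reset() above), and that in turn
	 * requires that the flowc WR has already gone out, which the
	 * TPF_FLOWC_WR_SENT assertion below verifies.
	 */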
KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1173 ("%s: inp %p dropped.", __func__, inp)); 1174 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 1175 1176 /* hmmmm */ 1177 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 1178 ("%s: flowc for tid %u [%s] not sent already", 1179 __func__, toep->tid, tcpstates[tp->t_state])); 1180 1181 send_reset(sc, toep, 0); 1182 return (0); 1183 } 1184 1185 /* 1186 * Peer has sent us a FIN. 1187 */ 1188 static int 1189 do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1190 { 1191 struct adapter *sc = iq->adapter; 1192 const struct cpl_peer_close *cpl = (const void *)(rss + 1); 1193 unsigned int tid = GET_TID(cpl); 1194 struct toepcb *toep = lookup_tid(sc, tid); 1195 struct inpcb *inp = toep->inp; 1196 struct tcpcb *tp = NULL; 1197 struct socket *so; 1198 struct epoch_tracker et; 1199 #ifdef INVARIANTS 1200 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1201 #endif 1202 1203 KASSERT(opcode == CPL_PEER_CLOSE, 1204 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1205 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1206 1207 if (__predict_false(toep->flags & TPF_SYNQE)) { 1208 /* 1209 * do_pass_establish must have run before do_peer_close and if 1210 * this is still a synqe instead of a toepcb then the connection 1211 * must be getting aborted. 1212 */ 1213 MPASS(toep->flags & TPF_ABORT_SHUTDOWN); 1214 CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid, 1215 toep, toep->flags); 1216 return (0); 1217 } 1218 1219 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1220 1221 CURVNET_SET(toep->vnet); 1222 NET_EPOCH_ENTER(et); 1223 INP_WLOCK(inp); 1224 tp = intotcpcb(inp); 1225 1226 CTR6(KTR_CXGBE, 1227 "%s: tid %u (%s), toep_flags 0x%x, ddp_flags 0x%x, inp %p", 1228 __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags, 1229 toep->ddp.flags, inp); 1230 1231 if (toep->flags & TPF_ABORT_SHUTDOWN) 1232 goto done; 1233 1234 tp->rcv_nxt++; /* FIN */ 1235 1236 so = inp->inp_socket; 1237 socantrcvmore(so); 1238 if (ulp_mode(toep) == ULP_MODE_TCPDDP) { 1239 DDP_LOCK(toep); 1240 if (__predict_false(toep->ddp.flags & 1241 (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE))) 1242 handle_ddp_close(toep, tp, cpl->rcv_nxt); 1243 DDP_UNLOCK(toep); 1244 } 1245 1246 if (ulp_mode(toep) != ULP_MODE_RDMA) { 1247 KASSERT(tp->rcv_nxt == be32toh(cpl->rcv_nxt), 1248 ("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt, 1249 be32toh(cpl->rcv_nxt))); 1250 } 1251 1252 switch (tp->t_state) { 1253 case TCPS_SYN_RECEIVED: 1254 tp->t_starttime = ticks; 1255 /* FALLTHROUGH */ 1256 1257 case TCPS_ESTABLISHED: 1258 tcp_state_change(tp, TCPS_CLOSE_WAIT); 1259 break; 1260 1261 case TCPS_FIN_WAIT_1: 1262 tcp_state_change(tp, TCPS_CLOSING); 1263 break; 1264 1265 case TCPS_FIN_WAIT_2: 1266 restore_so_proto(so, inp->inp_vflag & INP_IPV6); 1267 tcp_twstart(tp); 1268 INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ 1269 NET_EPOCH_EXIT(et); 1270 CURVNET_RESTORE(); 1271 1272 INP_WLOCK(inp); 1273 final_cpl_received(toep); 1274 return (0); 1275 1276 default: 1277 log(LOG_ERR, "%s: TID %u received CPL_PEER_CLOSE in state %d\n", 1278 __func__, tid, tp->t_state); 1279 } 1280 done: 1281 INP_WUNLOCK(inp); 1282 NET_EPOCH_EXIT(et); 1283 CURVNET_RESTORE(); 1284 return (0); 1285 } 1286 1287 /* 1288 * Peer has ACK'd our FIN. 
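 * The snd_nxt reported in the CPL includes the FIN, so snd_una is set to one
 * less than that.  The state transitions below mirror the software stack:
 * CLOSING enters time-wait, LAST_ACK closes the connection, and FIN_WAIT_1
 * moves to FIN_WAIT_2.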
1289 */ 1290 static int 1291 do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss, 1292 struct mbuf *m) 1293 { 1294 struct adapter *sc = iq->adapter; 1295 const struct cpl_close_con_rpl *cpl = (const void *)(rss + 1); 1296 unsigned int tid = GET_TID(cpl); 1297 struct toepcb *toep = lookup_tid(sc, tid); 1298 struct inpcb *inp = toep->inp; 1299 struct tcpcb *tp = NULL; 1300 struct socket *so = NULL; 1301 struct epoch_tracker et; 1302 #ifdef INVARIANTS 1303 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1304 #endif 1305 1306 KASSERT(opcode == CPL_CLOSE_CON_RPL, 1307 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1308 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1309 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1310 1311 CURVNET_SET(toep->vnet); 1312 NET_EPOCH_ENTER(et); 1313 INP_WLOCK(inp); 1314 tp = intotcpcb(inp); 1315 1316 CTR4(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x", 1317 __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags); 1318 1319 if (toep->flags & TPF_ABORT_SHUTDOWN) 1320 goto done; 1321 1322 so = inp->inp_socket; 1323 tp->snd_una = be32toh(cpl->snd_nxt) - 1; /* exclude FIN */ 1324 1325 switch (tp->t_state) { 1326 case TCPS_CLOSING: /* see TCPS_FIN_WAIT_2 in do_peer_close too */ 1327 restore_so_proto(so, inp->inp_vflag & INP_IPV6); 1328 tcp_twstart(tp); 1329 release: 1330 INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ 1331 NET_EPOCH_EXIT(et); 1332 CURVNET_RESTORE(); 1333 1334 INP_WLOCK(inp); 1335 final_cpl_received(toep); /* no more CPLs expected */ 1336 1337 return (0); 1338 case TCPS_LAST_ACK: 1339 if (tcp_close(tp)) 1340 INP_WUNLOCK(inp); 1341 goto release; 1342 1343 case TCPS_FIN_WAIT_1: 1344 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 1345 soisdisconnected(so); 1346 tcp_state_change(tp, TCPS_FIN_WAIT_2); 1347 break; 1348 1349 default: 1350 log(LOG_ERR, 1351 "%s: TID %u received CPL_CLOSE_CON_RPL in state %s\n", 1352 __func__, tid, tcpstates[tp->t_state]); 1353 } 1354 done: 1355 INP_WUNLOCK(inp); 1356 NET_EPOCH_EXIT(et); 1357 CURVNET_RESTORE(); 1358 return (0); 1359 } 1360 1361 void 1362 send_abort_rpl(struct adapter *sc, struct sge_ofld_txq *ofld_txq, int tid, 1363 int rst_status) 1364 { 1365 struct wrqe *wr; 1366 struct cpl_abort_rpl *cpl; 1367 1368 wr = alloc_wrqe(sizeof(*cpl), &ofld_txq->wrq); 1369 if (wr == NULL) { 1370 /* XXX */ 1371 panic("%s: allocation failure.", __func__); 1372 } 1373 cpl = wrtod(wr); 1374 1375 INIT_TP_WR_MIT_CPL(cpl, CPL_ABORT_RPL, tid); 1376 cpl->cmd = rst_status; 1377 1378 t4_wrq_tx(sc, wr); 1379 } 1380 1381 static int 1382 abort_status_to_errno(struct tcpcb *tp, unsigned int abort_reason) 1383 { 1384 switch (abort_reason) { 1385 case CPL_ERR_BAD_SYN: 1386 case CPL_ERR_CONN_RESET: 1387 return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET); 1388 case CPL_ERR_XMIT_TIMEDOUT: 1389 case CPL_ERR_PERSIST_TIMEDOUT: 1390 case CPL_ERR_FINWAIT2_TIMEDOUT: 1391 case CPL_ERR_KEEPALIVE_TIMEDOUT: 1392 return (ETIMEDOUT); 1393 default: 1394 return (EIO); 1395 } 1396 } 1397 1398 /* 1399 * TCP RST from the peer, timeout, or some other such critical error. 
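 * The CPL's status is mapped to an errno by abort_status_to_errno() above
 * and set on the socket before the connection is torn down.  Negative
 * advice is ignored outright; in every other case the chip is answered with
 * a CPL_ABORT_RPL.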
1400 */ 1401 static int 1402 do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1403 { 1404 struct adapter *sc = iq->adapter; 1405 const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1); 1406 unsigned int tid = GET_TID(cpl); 1407 struct toepcb *toep = lookup_tid(sc, tid); 1408 struct sge_ofld_txq *ofld_txq = toep->ofld_txq; 1409 struct inpcb *inp; 1410 struct tcpcb *tp; 1411 struct epoch_tracker et; 1412 #ifdef INVARIANTS 1413 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1414 #endif 1415 1416 KASSERT(opcode == CPL_ABORT_REQ_RSS, 1417 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1418 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1419 1420 if (toep->flags & TPF_SYNQE) 1421 return (do_abort_req_synqe(iq, rss, m)); 1422 1423 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1424 1425 if (negative_advice(cpl->status)) { 1426 CTR4(KTR_CXGBE, "%s: negative advice %d for tid %d (0x%x)", 1427 __func__, cpl->status, tid, toep->flags); 1428 return (0); /* Ignore negative advice */ 1429 } 1430 1431 inp = toep->inp; 1432 CURVNET_SET(toep->vnet); 1433 NET_EPOCH_ENTER(et); /* for tcp_close */ 1434 INP_WLOCK(inp); 1435 1436 tp = intotcpcb(inp); 1437 1438 CTR6(KTR_CXGBE, 1439 "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x, status %d", 1440 __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags, 1441 inp->inp_flags, cpl->status); 1442 1443 /* 1444 * If we'd initiated an abort earlier the reply to it is responsible for 1445 * cleaning up resources. Otherwise we tear everything down right here 1446 * right now. We owe the T4 a CPL_ABORT_RPL no matter what. 1447 */ 1448 if (toep->flags & TPF_ABORT_SHUTDOWN) { 1449 INP_WUNLOCK(inp); 1450 goto done; 1451 } 1452 toep->flags |= TPF_ABORT_SHUTDOWN; 1453 1454 if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) { 1455 struct socket *so = inp->inp_socket; 1456 1457 if (so != NULL) 1458 so_error_set(so, abort_status_to_errno(tp, 1459 cpl->status)); 1460 tp = tcp_close(tp); 1461 if (tp == NULL) 1462 INP_WLOCK(inp); /* re-acquire */ 1463 } 1464 1465 final_cpl_received(toep); 1466 done: 1467 NET_EPOCH_EXIT(et); 1468 CURVNET_RESTORE(); 1469 send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST); 1470 return (0); 1471 } 1472 1473 /* 1474 * Reply to the CPL_ABORT_REQ (send_reset) 1475 */ 1476 static int 1477 do_abort_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1478 { 1479 struct adapter *sc = iq->adapter; 1480 const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1); 1481 unsigned int tid = GET_TID(cpl); 1482 struct toepcb *toep = lookup_tid(sc, tid); 1483 struct inpcb *inp = toep->inp; 1484 #ifdef INVARIANTS 1485 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1486 #endif 1487 1488 KASSERT(opcode == CPL_ABORT_RPL_RSS, 1489 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1490 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1491 1492 if (toep->flags & TPF_SYNQE) 1493 return (do_abort_rpl_synqe(iq, rss, m)); 1494 1495 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1496 1497 CTR5(KTR_CXGBE, "%s: tid %u, toep %p, inp %p, status %d", 1498 __func__, tid, toep, inp, cpl->status); 1499 1500 KASSERT(toep->flags & TPF_ABORT_SHUTDOWN, 1501 ("%s: wasn't expecting abort reply", __func__)); 1502 1503 INP_WLOCK(inp); 1504 final_cpl_received(toep); 1505 1506 return (0); 1507 } 1508 1509 static int 1510 do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1511 { 1512 struct adapter *sc 
= iq->adapter; 1513 const struct cpl_rx_data *cpl = mtod(m, const void *); 1514 unsigned int tid = GET_TID(cpl); 1515 struct toepcb *toep = lookup_tid(sc, tid); 1516 struct inpcb *inp = toep->inp; 1517 struct tcpcb *tp; 1518 struct socket *so; 1519 struct sockbuf *sb; 1520 struct epoch_tracker et; 1521 int len, rx_credits; 1522 uint32_t ddp_placed = 0; 1523 1524 if (__predict_false(toep->flags & TPF_SYNQE)) { 1525 /* 1526 * do_pass_establish must have run before do_rx_data and if this 1527 * is still a synqe instead of a toepcb then the connection must 1528 * be getting aborted. 1529 */ 1530 MPASS(toep->flags & TPF_ABORT_SHUTDOWN); 1531 CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid, 1532 toep, toep->flags); 1533 m_freem(m); 1534 return (0); 1535 } 1536 1537 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1538 1539 /* strip off CPL header */ 1540 m_adj(m, sizeof(*cpl)); 1541 len = m->m_pkthdr.len; 1542 1543 INP_WLOCK(inp); 1544 if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) { 1545 CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x", 1546 __func__, tid, len, inp->inp_flags); 1547 INP_WUNLOCK(inp); 1548 m_freem(m); 1549 return (0); 1550 } 1551 1552 tp = intotcpcb(inp); 1553 1554 if (__predict_false(ulp_mode(toep) == ULP_MODE_TLS && 1555 toep->flags & TPF_TLS_RECEIVE)) { 1556 /* Received "raw" data on a TLS socket. */ 1557 CTR3(KTR_CXGBE, "%s: tid %u, raw TLS data (%d bytes)", 1558 __func__, tid, len); 1559 do_rx_data_tls(cpl, toep, m); 1560 return (0); 1561 } 1562 1563 if (__predict_false(tp->rcv_nxt != be32toh(cpl->seq))) 1564 ddp_placed = be32toh(cpl->seq) - tp->rcv_nxt; 1565 1566 tp->rcv_nxt += len; 1567 if (tp->rcv_wnd < len) { 1568 KASSERT(ulp_mode(toep) == ULP_MODE_RDMA, 1569 ("%s: negative window size", __func__)); 1570 } 1571 1572 tp->rcv_wnd -= len; 1573 tp->t_rcvtime = ticks; 1574 1575 if (ulp_mode(toep) == ULP_MODE_TCPDDP) 1576 DDP_LOCK(toep); 1577 so = inp_inpcbtosocket(inp); 1578 sb = &so->so_rcv; 1579 SOCKBUF_LOCK(sb); 1580 1581 if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) { 1582 CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)", 1583 __func__, tid, len); 1584 m_freem(m); 1585 SOCKBUF_UNLOCK(sb); 1586 if (ulp_mode(toep) == ULP_MODE_TCPDDP) 1587 DDP_UNLOCK(toep); 1588 INP_WUNLOCK(inp); 1589 1590 CURVNET_SET(toep->vnet); 1591 NET_EPOCH_ENTER(et); 1592 INP_WLOCK(inp); 1593 tp = tcp_drop(tp, ECONNRESET); 1594 if (tp) 1595 INP_WUNLOCK(inp); 1596 NET_EPOCH_EXIT(et); 1597 CURVNET_RESTORE(); 1598 1599 return (0); 1600 } 1601 1602 /* receive buffer autosize */ 1603 MPASS(toep->vnet == so->so_vnet); 1604 CURVNET_SET(toep->vnet); 1605 if (sb->sb_flags & SB_AUTOSIZE && 1606 V_tcp_do_autorcvbuf && 1607 sb->sb_hiwat < V_tcp_autorcvbuf_max && 1608 len > (sbspace(sb) / 8 * 7)) { 1609 unsigned int hiwat = sb->sb_hiwat; 1610 unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc, 1611 V_tcp_autorcvbuf_max); 1612 1613 if (!sbreserve_locked(sb, newsize, so, NULL)) 1614 sb->sb_flags &= ~SB_AUTOSIZE; 1615 } 1616 1617 if (ulp_mode(toep) == ULP_MODE_TCPDDP) { 1618 int changed = !(toep->ddp.flags & DDP_ON) ^ cpl->ddp_off; 1619 1620 if (toep->ddp.waiting_count != 0 || toep->ddp.active_count != 0) 1621 CTR3(KTR_CXGBE, "%s: tid %u, non-ddp rx (%d bytes)", 1622 __func__, tid, len); 1623 1624 if (changed) { 1625 if (toep->ddp.flags & DDP_SC_REQ) 1626 toep->ddp.flags ^= DDP_ON | DDP_SC_REQ; 1627 else { 1628 KASSERT(cpl->ddp_off == 1, 1629 ("%s: DDP switched on by itself.", 1630 __func__)); 1631 1632 /* Fell out of DDP mode */ 1633 toep->ddp.flags &= 
~DDP_ON; 1634 CTR1(KTR_CXGBE, "%s: fell out of DDP mode", 1635 __func__); 1636 1637 insert_ddp_data(toep, ddp_placed); 1638 } 1639 } 1640 1641 if (toep->ddp.flags & DDP_ON) { 1642 /* 1643 * CPL_RX_DATA with DDP on can only be an indicate. 1644 * Start posting queued AIO requests via DDP. The 1645 * payload that arrived in this indicate is appended 1646 * to the socket buffer as usual. 1647 */ 1648 handle_ddp_indicate(toep); 1649 } 1650 } 1651 1652 sbappendstream_locked(sb, m, 0); 1653 rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0; 1654 if (rx_credits > 0 && sbused(sb) + tp->rcv_wnd < sb->sb_lowat) { 1655 rx_credits = send_rx_credits(sc, toep, rx_credits); 1656 tp->rcv_wnd += rx_credits; 1657 tp->rcv_adv += rx_credits; 1658 } 1659 1660 if (ulp_mode(toep) == ULP_MODE_TCPDDP && toep->ddp.waiting_count > 0 && 1661 sbavail(sb) != 0) { 1662 CTR2(KTR_CXGBE, "%s: tid %u queueing AIO task", __func__, 1663 tid); 1664 ddp_queue_toep(toep); 1665 } 1666 sorwakeup_locked(so); 1667 SOCKBUF_UNLOCK_ASSERT(sb); 1668 if (ulp_mode(toep) == ULP_MODE_TCPDDP) 1669 DDP_UNLOCK(toep); 1670 1671 INP_WUNLOCK(inp); 1672 CURVNET_RESTORE(); 1673 return (0); 1674 } 1675 1676 static int 1677 do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1678 { 1679 struct adapter *sc = iq->adapter; 1680 const struct cpl_fw4_ack *cpl = (const void *)(rss + 1); 1681 unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl))); 1682 struct toepcb *toep = lookup_tid(sc, tid); 1683 struct inpcb *inp; 1684 struct tcpcb *tp; 1685 struct socket *so; 1686 uint8_t credits = cpl->credits; 1687 struct ofld_tx_sdesc *txsd; 1688 int plen; 1689 #ifdef INVARIANTS 1690 unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl))); 1691 #endif 1692 1693 /* 1694 * Very unusual case: we'd sent a flowc + abort_req for a synq entry and 1695 * now this comes back carrying the credits for the flowc. 
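 * Those credits are simply discarded; the TPF_ABORT_SHUTDOWN assertion below
 * confirms that the synq entry is being torn down by the abort path.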
1696 */ 1697 if (__predict_false(toep->flags & TPF_SYNQE)) { 1698 KASSERT(toep->flags & TPF_ABORT_SHUTDOWN, 1699 ("%s: credits for a synq entry %p", __func__, toep)); 1700 return (0); 1701 } 1702 1703 inp = toep->inp; 1704 1705 KASSERT(opcode == CPL_FW4_ACK, 1706 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1707 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1708 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1709 1710 INP_WLOCK(inp); 1711 1712 if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) { 1713 INP_WUNLOCK(inp); 1714 return (0); 1715 } 1716 1717 KASSERT((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0, 1718 ("%s: inp_flags 0x%x", __func__, inp->inp_flags)); 1719 1720 tp = intotcpcb(inp); 1721 1722 if (cpl->flags & CPL_FW4_ACK_FLAGS_SEQVAL) { 1723 tcp_seq snd_una = be32toh(cpl->snd_una); 1724 1725 #ifdef INVARIANTS 1726 if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) { 1727 log(LOG_ERR, 1728 "%s: unexpected seq# %x for TID %u, snd_una %x\n", 1729 __func__, snd_una, toep->tid, tp->snd_una); 1730 } 1731 #endif 1732 1733 if (tp->snd_una != snd_una) { 1734 tp->snd_una = snd_una; 1735 tp->ts_recent_age = tcp_ts_getticks(); 1736 } 1737 } 1738 1739 #ifdef VERBOSE_TRACES 1740 CTR3(KTR_CXGBE, "%s: tid %d credits %u", __func__, tid, credits); 1741 #endif 1742 so = inp->inp_socket; 1743 txsd = &toep->txsd[toep->txsd_cidx]; 1744 plen = 0; 1745 while (credits) { 1746 KASSERT(credits >= txsd->tx_credits, 1747 ("%s: too many (or partial) credits", __func__)); 1748 credits -= txsd->tx_credits; 1749 toep->tx_credits += txsd->tx_credits; 1750 plen += txsd->plen; 1751 if (txsd->iv_buffer) { 1752 free(txsd->iv_buffer, M_CXGBE); 1753 txsd->iv_buffer = NULL; 1754 } 1755 txsd++; 1756 toep->txsd_avail++; 1757 KASSERT(toep->txsd_avail <= toep->txsd_total, 1758 ("%s: txsd avail > total", __func__)); 1759 if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) { 1760 txsd = &toep->txsd[0]; 1761 toep->txsd_cidx = 0; 1762 } 1763 } 1764 1765 if (toep->tx_credits == toep->tx_total) { 1766 toep->tx_nocompl = 0; 1767 toep->plen_nocompl = 0; 1768 } 1769 1770 if (toep->flags & TPF_TX_SUSPENDED && 1771 toep->tx_credits >= toep->tx_total / 4) { 1772 #ifdef VERBOSE_TRACES 1773 CTR2(KTR_CXGBE, "%s: tid %d calling t4_push_frames", __func__, 1774 tid); 1775 #endif 1776 toep->flags &= ~TPF_TX_SUSPENDED; 1777 CURVNET_SET(toep->vnet); 1778 t4_push_data(sc, toep, plen); 1779 CURVNET_RESTORE(); 1780 } else if (plen > 0) { 1781 struct sockbuf *sb = &so->so_snd; 1782 int sbu; 1783 1784 SOCKBUF_LOCK(sb); 1785 sbu = sbused(sb); 1786 if (ulp_mode(toep) == ULP_MODE_ISCSI) { 1787 1788 if (__predict_false(sbu > 0)) { 1789 /* 1790 * The data trasmitted before the tid's ULP mode 1791 * changed to ISCSI is still in so_snd. 1792 * Incoming credits should account for so_snd 1793 * first. 
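 * Whatever remains of plen once so_snd has been drained is then charged
 * against the PDUs parked on ulp_pdu_reclaimq (see rqdrop_locked() below).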
1794 */ 1795 sbdrop_locked(sb, min(sbu, plen)); 1796 plen -= min(sbu, plen); 1797 } 1798 sowwakeup_locked(so); /* unlocks so_snd */ 1799 rqdrop_locked(&toep->ulp_pdu_reclaimq, plen); 1800 } else { 1801 #ifdef VERBOSE_TRACES 1802 CTR3(KTR_CXGBE, "%s: tid %d dropped %d bytes", __func__, 1803 tid, plen); 1804 #endif 1805 sbdrop_locked(sb, plen); 1806 if (tls_tx_key(toep) && 1807 toep->tls.mode == TLS_MODE_TLSOM) { 1808 struct tls_ofld_info *tls_ofld = &toep->tls; 1809 1810 MPASS(tls_ofld->sb_off >= plen); 1811 tls_ofld->sb_off -= plen; 1812 } 1813 if (!TAILQ_EMPTY(&toep->aiotx_jobq)) 1814 t4_aiotx_queue_toep(so, toep); 1815 sowwakeup_locked(so); /* unlocks so_snd */ 1816 } 1817 SOCKBUF_UNLOCK_ASSERT(sb); 1818 } 1819 1820 INP_WUNLOCK(inp); 1821 1822 return (0); 1823 } 1824 1825 void 1826 t4_set_tcb_field(struct adapter *sc, struct sge_wrq *wrq, struct toepcb *toep, 1827 uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie) 1828 { 1829 struct wrqe *wr; 1830 struct cpl_set_tcb_field *req; 1831 struct ofld_tx_sdesc *txsd; 1832 1833 MPASS((cookie & ~M_COOKIE) == 0); 1834 if (reply) { 1835 MPASS(cookie != CPL_COOKIE_RESERVED); 1836 } 1837 1838 wr = alloc_wrqe(sizeof(*req), wrq); 1839 if (wr == NULL) { 1840 /* XXX */ 1841 panic("%s: allocation failure.", __func__); 1842 } 1843 req = wrtod(wr); 1844 1845 INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, toep->tid); 1846 req->reply_ctrl = htobe16(V_QUEUENO(toep->ofld_rxq->iq.abs_id)); 1847 if (reply == 0) 1848 req->reply_ctrl |= htobe16(F_NO_REPLY); 1849 req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(cookie)); 1850 req->mask = htobe64(mask); 1851 req->val = htobe64(val); 1852 if ((wrq->eq.flags & EQ_TYPEMASK) == EQ_OFLD) { 1853 txsd = &toep->txsd[toep->txsd_pidx]; 1854 txsd->tx_credits = howmany(sizeof(*req), 16); 1855 txsd->plen = 0; 1856 KASSERT(toep->tx_credits >= txsd->tx_credits && 1857 toep->txsd_avail > 0, 1858 ("%s: not enough credits (%d)", __func__, 1859 toep->tx_credits)); 1860 toep->tx_credits -= txsd->tx_credits; 1861 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) 1862 toep->txsd_pidx = 0; 1863 toep->txsd_avail--; 1864 } 1865 1866 t4_wrq_tx(sc, wr); 1867 } 1868 1869 void 1870 t4_init_cpl_io_handlers(void) 1871 { 1872 1873 t4_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close); 1874 t4_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl); 1875 t4_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req); 1876 t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl, 1877 CPL_COOKIE_TOM); 1878 t4_register_cpl_handler(CPL_RX_DATA, do_rx_data); 1879 t4_register_shared_cpl_handler(CPL_FW4_ACK, do_fw4_ack, CPL_COOKIE_TOM); 1880 } 1881 1882 void 1883 t4_uninit_cpl_io_handlers(void) 1884 { 1885 1886 t4_register_cpl_handler(CPL_PEER_CLOSE, NULL); 1887 t4_register_cpl_handler(CPL_CLOSE_CON_RPL, NULL); 1888 t4_register_cpl_handler(CPL_ABORT_REQ_RSS, NULL); 1889 t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, NULL, CPL_COOKIE_TOM); 1890 t4_register_cpl_handler(CPL_RX_DATA, NULL); 1891 t4_register_shared_cpl_handler(CPL_FW4_ACK, NULL, CPL_COOKIE_TOM); 1892 } 1893 1894 /* 1895 * Use the 'backend1' field in AIO jobs to hold an error that should 1896 * be reported when the job is completed, the 'backend3' field to 1897 * store the amount of data sent by the AIO job so far, and the 1898 * 'backend4' field to hold a reference count on the job. 1899 * 1900 * Each unmapped mbuf holds a reference on the job as does the queue 1901 * so long as the job is queued. 
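 * The job is completed (or cancelled) in aiotx_free_job() only when the last
 * reference goes away, i.e. after every unmapped mbuf built from its buffer
 * has been freed and the queue has dropped its own reference.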
1902 */ 1903 #define aio_error backend1 1904 #define aio_sent backend3 1905 #define aio_refs backend4 1906 1907 #define jobtotid(job) \ 1908 (((struct toepcb *)(so_sototcpcb((job)->fd_file->f_data)->t_toe))->tid) 1909 1910 static void 1911 aiotx_free_job(struct kaiocb *job) 1912 { 1913 long status; 1914 int error; 1915 1916 if (refcount_release(&job->aio_refs) == 0) 1917 return; 1918 1919 error = (intptr_t)job->aio_error; 1920 status = job->aio_sent; 1921 #ifdef VERBOSE_TRACES 1922 CTR5(KTR_CXGBE, "%s: tid %d completed %p len %ld, error %d", __func__, 1923 jobtotid(job), job, status, error); 1924 #endif 1925 if (error != 0 && status != 0) 1926 error = 0; 1927 if (error == ECANCELED) 1928 aio_cancel(job); 1929 else if (error) 1930 aio_complete(job, -1, error); 1931 else { 1932 job->msgsnd = 1; 1933 aio_complete(job, status, 0); 1934 } 1935 } 1936 1937 static void 1938 aiotx_free_pgs(struct mbuf *m) 1939 { 1940 struct kaiocb *job; 1941 vm_page_t pg; 1942 1943 M_ASSERTEXTPG(m); 1944 job = m->m_ext.ext_arg1; 1945 #ifdef VERBOSE_TRACES 1946 CTR3(KTR_CXGBE, "%s: completed %d bytes for tid %d", __func__, 1947 m->m_len, jobtotid(job)); 1948 #endif 1949 1950 for (int i = 0; i < m->m_epg_npgs; i++) { 1951 pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]); 1952 vm_page_unwire(pg, PQ_ACTIVE); 1953 } 1954 1955 aiotx_free_job(job); 1956 } 1957 1958 /* 1959 * Allocate a chain of unmapped mbufs describing the next 'len' bytes 1960 * of an AIO job. 1961 */ 1962 static struct mbuf * 1963 alloc_aiotx_mbuf(struct kaiocb *job, int len) 1964 { 1965 struct vmspace *vm; 1966 vm_page_t pgs[MBUF_PEXT_MAX_PGS]; 1967 struct mbuf *m, *top, *last; 1968 vm_map_t map; 1969 vm_offset_t start; 1970 int i, mlen, npages, pgoff; 1971 1972 KASSERT(job->aio_sent + len <= job->uaiocb.aio_nbytes, 1973 ("%s(%p, %d): request to send beyond end of buffer", __func__, 1974 job, len)); 1975 1976 /* 1977 * The AIO subsystem will cancel and drain all requests before 1978 * permitting a process to exit or exec, so p_vmspace should 1979 * be stable here. 
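 * Each mbuf built below is an unmapped (M_EXTPG) mbuf spanning at most
 * MBUF_PEXT_MAX_PGS pages held via vm_fault_quick_hold_pages();
 * m_epg_1st_off and m_epg_last_len describe the partial first and last
 * pages, and the pages are unwired in aiotx_free_pgs() when the mbuf is
 * freed.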
	 */
	vm = job->userproc->p_vmspace;
	map = &vm->vm_map;
	start = (uintptr_t)job->uaiocb.aio_buf + job->aio_sent;
	pgoff = start & PAGE_MASK;

	top = NULL;
	last = NULL;
	while (len > 0) {
		mlen = imin(len, MBUF_PEXT_MAX_PGS * PAGE_SIZE - pgoff);
		KASSERT(mlen == len || ((start + mlen) & PAGE_MASK) == 0,
		    ("%s: next start (%#jx + %#x) is not page aligned",
		    __func__, (uintmax_t)start, mlen));

		npages = vm_fault_quick_hold_pages(map, start, mlen,
		    VM_PROT_WRITE, pgs, nitems(pgs));
		if (npages < 0)
			break;

		m = mb_alloc_ext_pgs(M_WAITOK, aiotx_free_pgs);
		if (m == NULL) {
			vm_page_unhold_pages(pgs, npages);
			break;
		}

		m->m_epg_1st_off = pgoff;
		m->m_epg_npgs = npages;
		if (npages == 1) {
			KASSERT(mlen + pgoff <= PAGE_SIZE,
			    ("%s: single page is too large (off %d len %d)",
			    __func__, pgoff, mlen));
			m->m_epg_last_len = mlen;
		} else {
			m->m_epg_last_len = mlen - (PAGE_SIZE - pgoff) -
			    (npages - 2) * PAGE_SIZE;
		}
		for (i = 0; i < npages; i++)
			m->m_epg_pa[i] = VM_PAGE_TO_PHYS(pgs[i]);

		m->m_len = mlen;
		m->m_ext.ext_size = npages * PAGE_SIZE;
		m->m_ext.ext_arg1 = job;
		refcount_acquire(&job->aio_refs);

#ifdef VERBOSE_TRACES
		CTR5(KTR_CXGBE, "%s: tid %d, new mbuf %p for job %p, npages %d",
		    __func__, jobtotid(job), m, job, npages);
#endif

		if (top == NULL)
			top = m;
		else
			last->m_next = m;
		last = m;

		len -= mlen;
		start += mlen;
		pgoff = 0;
	}

	return (top);
}

static void
t4_aiotx_process_job(struct toepcb *toep, struct socket *so, struct kaiocb *job)
{
	struct sockbuf *sb;
	struct file *fp;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct mbuf *m;
	int error, len;
	bool moretocome, sendmore;

	sb = &so->so_snd;
	SOCKBUF_UNLOCK(sb);
	fp = job->fd_file;
	m = NULL;

#ifdef MAC
	error = mac_socket_check_send(fp->f_cred, so);
	if (error != 0)
		goto out;
#endif

	/* Inline sosend_generic(). */

	error = sblock(sb, SBL_WAIT);
	MPASS(error == 0);

sendanother:
	SOCKBUF_LOCK(sb);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		if ((so->so_options & SO_NOSIGPIPE) == 0) {
			PROC_LOCK(job->userproc);
			kern_psignal(job->userproc, SIGPIPE);
			PROC_UNLOCK(job->userproc);
		}
		error = EPIPE;
		goto out;
	}
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		goto out;
	}
	if ((so->so_state & SS_ISCONNECTED) == 0) {
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		error = ENOTCONN;
		goto out;
	}
	if (sbspace(sb) < sb->sb_lowat) {
		MPASS(job->aio_sent == 0 || !(so->so_state & SS_NBIO));

		/*
		 * Don't block if there is too little room in the socket
		 * buffer.  Instead, requeue the request.
		 */
		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
			SOCKBUF_UNLOCK(sb);
			sbunlock(sb);
			error = ECANCELED;
			goto out;
		}
		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		goto out;
	}

	/*
	 * Write as much data as the socket permits, but no more than
	 * a single sndbuf at a time.
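	 *
	 * 'len' is clamped both to the bytes remaining in the request
	 * and to toep->params.sndbuf; when the sndbuf clamp is what
	 * limits this pass, 'sendmore' triggers another trip through
	 * the sendanother loop for the remainder.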
	 */
	len = sbspace(sb);
	if (len > job->uaiocb.aio_nbytes - job->aio_sent) {
		len = job->uaiocb.aio_nbytes - job->aio_sent;
		moretocome = false;
	} else
		moretocome = true;
	if (len > toep->params.sndbuf) {
		len = toep->params.sndbuf;
		sendmore = true;
	} else
		sendmore = false;

	if (!TAILQ_EMPTY(&toep->aiotx_jobq))
		moretocome = true;
	SOCKBUF_UNLOCK(sb);
	MPASS(len != 0);

	m = alloc_aiotx_mbuf(job, len);
	if (m == NULL) {
		sbunlock(sb);
		error = EFAULT;
		goto out;
	}

	/* Inlined tcp_usr_send(). */

	inp = toep->inp;
	INP_WLOCK(inp);
	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
		INP_WUNLOCK(inp);
		sbunlock(sb);
		error = ECONNRESET;
		goto out;
	}

	job->aio_sent += m_length(m, NULL);

	sbappendstream(sb, m, 0);
	m = NULL;

	if (!(inp->inp_flags & INP_DROPPED)) {
		tp = intotcpcb(inp);
		if (moretocome)
			tp->t_flags |= TF_MORETOCOME;
		error = tp->t_fb->tfb_tcp_output(tp);
		if (moretocome)
			tp->t_flags &= ~TF_MORETOCOME;
	}

	INP_WUNLOCK(inp);
	if (sendmore)
		goto sendanother;
	sbunlock(sb);

	if (error)
		goto out;

	/*
	 * If this is a blocking socket and the request has not been
	 * fully completed, requeue it until the socket is ready
	 * again.
	 */
	if (job->aio_sent < job->uaiocb.aio_nbytes &&
	    !(so->so_state & SS_NBIO)) {
		SOCKBUF_LOCK(sb);
		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
			SOCKBUF_UNLOCK(sb);
			error = ECANCELED;
			goto out;
		}
		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
		return;
	}

	/*
	 * If the request will not be requeued, drop the queue's
	 * reference to the job.  Any mbufs in flight should still
	 * hold a reference, but this drops the reference that the
	 * queue owns while it is waiting to queue mbufs to the
	 * socket.
	 */
	aiotx_free_job(job);

out:
	if (error) {
		job->aio_error = (void *)(intptr_t)error;
		aiotx_free_job(job);
	}
	m_freem(m);
	SOCKBUF_LOCK(sb);
}

static void
t4_aiotx_task(void *context, int pending)
{
	struct toepcb *toep = context;
	struct socket *so;
	struct kaiocb *job;

	so = toep->aiotx_so;
	CURVNET_SET(toep->vnet);
	SOCKBUF_LOCK(&so->so_snd);
	while (!TAILQ_EMPTY(&toep->aiotx_jobq) && sowriteable(so)) {
		job = TAILQ_FIRST(&toep->aiotx_jobq);
		TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
		if (!aio_clear_cancel_function(job))
			continue;

		t4_aiotx_process_job(toep, so, job);
	}
	toep->aiotx_so = NULL;
	SOCKBUF_UNLOCK(&so->so_snd);
	CURVNET_RESTORE();

	free_toepcb(toep);
	SOCK_LOCK(so);
	sorele(so);
}

static void
t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep)
{

	SOCKBUF_LOCK_ASSERT(&toep->inp->inp_socket->so_snd);
#ifdef VERBOSE_TRACES
	CTR3(KTR_CXGBE, "%s: queueing aiotx task for tid %d, active = %s",
	    __func__, toep->tid, toep->aiotx_so != NULL ? "true" : "false");
#endif
	if (toep->aiotx_so != NULL)
		return;
	soref(so);
	toep->aiotx_so = so;
	hold_toepcb(toep);
	soaio_enqueue(&toep->aiotx_task);
}

static void
t4_aiotx_cancel(struct kaiocb *job)
{
	struct socket *so;
	struct sockbuf *sb;
	struct tcpcb *tp;
	struct toepcb *toep;

	so = job->fd_file->f_data;
	tp = so_sototcpcb(so);
	toep = tp->t_toe;
	MPASS(job->uaiocb.aio_lio_opcode == LIO_WRITE);
	sb = &so->so_snd;

	SOCKBUF_LOCK(sb);
	if (!aio_cancel_cleared(job))
		TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
	SOCKBUF_UNLOCK(sb);

	job->aio_error = (void *)(intptr_t)ECANCELED;
	aiotx_free_job(job);
}

int
t4_aio_queue_aiotx(struct socket *so, struct kaiocb *job)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct adapter *sc = td_adapter(toep->td);

	/* This only handles writes. */
	if (job->uaiocb.aio_lio_opcode != LIO_WRITE)
		return (EOPNOTSUPP);

	if (!sc->tt.tx_zcopy)
		return (EOPNOTSUPP);

	if (tls_tx_key(toep))
		return (EOPNOTSUPP);

	SOCKBUF_LOCK(&so->so_snd);
#ifdef VERBOSE_TRACES
	CTR3(KTR_CXGBE, "%s: queueing %p for tid %u", __func__, job, toep->tid);
#endif
	if (!aio_set_cancel_function(job, t4_aiotx_cancel))
		panic("new job was cancelled");
	refcount_init(&job->aio_refs, 1);
	TAILQ_INSERT_TAIL(&toep->aiotx_jobq, job, list);
	if (sowriteable(so))
		t4_aiotx_queue_toep(so, toep);
	SOCKBUF_UNLOCK(&so->so_snd);
	return (0);
}

void
aiotx_init_toep(struct toepcb *toep)
{

	TAILQ_INIT(&toep->aiotx_jobq);
	TASK_INIT(&toep->aiotx_task, 0, t4_aiotx_task, toep);
}
#endif