1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2012, 2015 Chelsio Communications, Inc. 5 * All rights reserved. 6 * Written by: Navdeep Parhar <np@FreeBSD.org> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 #include <sys/cdefs.h> 31 __FBSDID("$FreeBSD$"); 32 33 #include "opt_inet.h" 34 #include "opt_inet6.h" 35 #include "opt_kern_tls.h" 36 #include "opt_ratelimit.h" 37 38 #ifdef TCP_OFFLOAD 39 #include <sys/param.h> 40 #include <sys/aio.h> 41 #include <sys/file.h> 42 #include <sys/kernel.h> 43 #include <sys/ktr.h> 44 #include <sys/module.h> 45 #include <sys/proc.h> 46 #include <sys/protosw.h> 47 #include <sys/domain.h> 48 #include <sys/socket.h> 49 #include <sys/socketvar.h> 50 #include <sys/sglist.h> 51 #include <sys/taskqueue.h> 52 #include <netinet/in.h> 53 #include <netinet/in_pcb.h> 54 #include <netinet/ip.h> 55 #include <netinet/ip6.h> 56 #define TCPSTATES 57 #include <netinet/tcp_fsm.h> 58 #include <netinet/tcp_seq.h> 59 #include <netinet/tcp_var.h> 60 #include <netinet/toecore.h> 61 62 #include <security/mac/mac_framework.h> 63 64 #include <vm/vm.h> 65 #include <vm/vm_extern.h> 66 #include <vm/pmap.h> 67 #include <vm/vm_map.h> 68 #include <vm/vm_page.h> 69 70 #include "common/common.h" 71 #include "common/t4_msg.h" 72 #include "common/t4_regs.h" 73 #include "common/t4_tcb.h" 74 #include "tom/t4_tom_l2t.h" 75 #include "tom/t4_tom.h" 76 77 static void t4_aiotx_cancel(struct kaiocb *job); 78 static void t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep); 79 80 void 81 send_flowc_wr(struct toepcb *toep, struct tcpcb *tp) 82 { 83 struct wrqe *wr; 84 struct fw_flowc_wr *flowc; 85 unsigned int nparams, flowclen, paramidx; 86 struct vi_info *vi = toep->vi; 87 struct port_info *pi = vi->pi; 88 struct adapter *sc = pi->adapter; 89 unsigned int pfvf = sc->pf << S_FW_VIID_PFN; 90 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; 91 92 KASSERT(!(toep->flags & TPF_FLOWC_WR_SENT), 93 ("%s: flowc for tid %u sent already", __func__, toep->tid)); 94 95 if (tp != NULL) 96 nparams = 8; 97 else 98 nparams = 6; 99 if (ulp_mode(toep) == ULP_MODE_TLS) 100 nparams++; 101 if (toep->tls.fcplenmax != 0) 102 nparams++; 103 if (toep->params.tc_idx != -1) { 104 MPASS(toep->params.tc_idx >= 0 && 105 
toep->params.tc_idx < sc->chip_params->nsched_cls); 106 nparams++; 107 } 108 109 flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval); 110 111 wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq); 112 if (wr == NULL) { 113 /* XXX */ 114 panic("%s: allocation failure.", __func__); 115 } 116 flowc = wrtod(wr); 117 memset(flowc, 0, wr->wr_len); 118 119 flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | 120 V_FW_FLOWC_WR_NPARAMS(nparams)); 121 flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) | 122 V_FW_WR_FLOWID(toep->tid)); 123 124 #define FLOWC_PARAM(__m, __v) \ 125 do { \ 126 flowc->mnemval[paramidx].mnemonic = FW_FLOWC_MNEM_##__m; \ 127 flowc->mnemval[paramidx].val = htobe32(__v); \ 128 paramidx++; \ 129 } while (0) 130 131 paramidx = 0; 132 133 FLOWC_PARAM(PFNVFN, pfvf); 134 FLOWC_PARAM(CH, pi->tx_chan); 135 FLOWC_PARAM(PORT, pi->tx_chan); 136 FLOWC_PARAM(IQID, toep->ofld_rxq->iq.abs_id); 137 FLOWC_PARAM(SNDBUF, toep->params.sndbuf); 138 if (tp) { 139 FLOWC_PARAM(MSS, toep->params.emss); 140 FLOWC_PARAM(SNDNXT, tp->snd_nxt); 141 FLOWC_PARAM(RCVNXT, tp->rcv_nxt); 142 } else 143 FLOWC_PARAM(MSS, 512); 144 CTR6(KTR_CXGBE, 145 "%s: tid %u, mss %u, sndbuf %u, snd_nxt 0x%x, rcv_nxt 0x%x", 146 __func__, toep->tid, toep->params.emss, toep->params.sndbuf, 147 tp ? tp->snd_nxt : 0, tp ? tp->rcv_nxt : 0); 148 149 if (ulp_mode(toep) == ULP_MODE_TLS) 150 FLOWC_PARAM(ULP_MODE, ulp_mode(toep)); 151 if (toep->tls.fcplenmax != 0) 152 FLOWC_PARAM(TXDATAPLEN_MAX, toep->tls.fcplenmax); 153 if (toep->params.tc_idx != -1) 154 FLOWC_PARAM(SCHEDCLASS, toep->params.tc_idx); 155 #undef FLOWC_PARAM 156 157 KASSERT(paramidx == nparams, ("nparams mismatch")); 158 159 txsd->tx_credits = howmany(flowclen, 16); 160 txsd->plen = 0; 161 KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0, 162 ("%s: not enough credits (%d)", __func__, toep->tx_credits)); 163 toep->tx_credits -= txsd->tx_credits; 164 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) 165 toep->txsd_pidx = 0; 166 toep->txsd_avail--; 167 168 toep->flags |= TPF_FLOWC_WR_SENT; 169 t4_wrq_tx(sc, wr); 170 } 171 172 #ifdef RATELIMIT 173 /* 174 * Input is Bytes/second (so_max_pacing_rate), chip counts in Kilobits/second. 
175 */ 176 static int 177 update_tx_rate_limit(struct adapter *sc, struct toepcb *toep, u_int Bps) 178 { 179 int tc_idx, rc; 180 const u_int kbps = (u_int) (uint64_t)Bps * 8ULL / 1000; 181 const int port_id = toep->vi->pi->port_id; 182 183 CTR3(KTR_CXGBE, "%s: tid %u, rate %uKbps", __func__, toep->tid, kbps); 184 185 if (kbps == 0) { 186 /* unbind */ 187 tc_idx = -1; 188 } else { 189 rc = t4_reserve_cl_rl_kbps(sc, port_id, kbps, &tc_idx); 190 if (rc != 0) 191 return (rc); 192 MPASS(tc_idx >= 0 && tc_idx < sc->chip_params->nsched_cls); 193 } 194 195 if (toep->params.tc_idx != tc_idx) { 196 struct wrqe *wr; 197 struct fw_flowc_wr *flowc; 198 int nparams = 1, flowclen, flowclen16; 199 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; 200 201 flowclen = sizeof(*flowc) + nparams * sizeof(struct 202 fw_flowc_mnemval); 203 flowclen16 = howmany(flowclen, 16); 204 if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0 || 205 (wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq)) == NULL) { 206 if (tc_idx >= 0) 207 t4_release_cl_rl(sc, port_id, tc_idx); 208 return (ENOMEM); 209 } 210 211 flowc = wrtod(wr); 212 memset(flowc, 0, wr->wr_len); 213 214 flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | 215 V_FW_FLOWC_WR_NPARAMS(nparams)); 216 flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) | 217 V_FW_WR_FLOWID(toep->tid)); 218 219 flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS; 220 if (tc_idx == -1) 221 flowc->mnemval[0].val = htobe32(0xff); 222 else 223 flowc->mnemval[0].val = htobe32(tc_idx); 224 225 txsd->tx_credits = flowclen16; 226 txsd->plen = 0; 227 toep->tx_credits -= txsd->tx_credits; 228 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) 229 toep->txsd_pidx = 0; 230 toep->txsd_avail--; 231 t4_wrq_tx(sc, wr); 232 } 233 234 if (toep->params.tc_idx >= 0) 235 t4_release_cl_rl(sc, port_id, toep->params.tc_idx); 236 toep->params.tc_idx = tc_idx; 237 238 return (0); 239 } 240 #endif 241 242 void 243 send_reset(struct adapter *sc, struct toepcb *toep, uint32_t snd_nxt) 244 { 245 struct wrqe *wr; 246 struct cpl_abort_req *req; 247 int tid = toep->tid; 248 struct inpcb *inp = toep->inp; 249 struct tcpcb *tp = intotcpcb(inp); /* don't use if INP_DROPPED */ 250 251 INP_WLOCK_ASSERT(inp); 252 253 CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x%s", 254 __func__, toep->tid, 255 inp->inp_flags & INP_DROPPED ? "inp dropped" : 256 tcpstates[tp->t_state], 257 toep->flags, inp->inp_flags, 258 toep->flags & TPF_ABORT_SHUTDOWN ? 259 " (abort already in progress)" : ""); 260 261 if (toep->flags & TPF_ABORT_SHUTDOWN) 262 return; /* abort already in progress */ 263 264 toep->flags |= TPF_ABORT_SHUTDOWN; 265 266 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 267 ("%s: flowc_wr not sent for tid %d.", __func__, tid)); 268 269 wr = alloc_wrqe(sizeof(*req), toep->ofld_txq); 270 if (wr == NULL) { 271 /* XXX */ 272 panic("%s: allocation failure.", __func__); 273 } 274 req = wrtod(wr); 275 276 INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid); 277 if (inp->inp_flags & INP_DROPPED) 278 req->rsvd0 = htobe32(snd_nxt); 279 else 280 req->rsvd0 = htobe32(tp->snd_nxt); 281 req->rsvd1 = !(toep->flags & TPF_TX_DATA_SENT); 282 req->cmd = CPL_ABORT_SEND_RST; 283 284 /* 285 * XXX: What's the correct way to tell that the inp hasn't been detached 286 * from its socket? Should I even be flushing the snd buffer here? 287 */ 288 if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) { 289 struct socket *so = inp->inp_socket; 290 291 if (so != NULL) /* because I'm not sure. 
See comment above */ 292 sbflush(&so->so_snd); 293 } 294 295 t4_l2t_send(sc, wr, toep->l2te); 296 } 297 298 /* 299 * Called when a connection is established to translate the TCP options 300 * reported by HW to FreeBSD's native format. 301 */ 302 static void 303 assign_rxopt(struct tcpcb *tp, uint16_t opt) 304 { 305 struct toepcb *toep = tp->t_toe; 306 struct inpcb *inp = tp->t_inpcb; 307 struct adapter *sc = td_adapter(toep->td); 308 309 INP_LOCK_ASSERT(inp); 310 311 toep->params.mtu_idx = G_TCPOPT_MSS(opt); 312 tp->t_maxseg = sc->params.mtus[toep->params.mtu_idx]; 313 if (inp->inp_inc.inc_flags & INC_ISIPV6) 314 tp->t_maxseg -= sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 315 else 316 tp->t_maxseg -= sizeof(struct ip) + sizeof(struct tcphdr); 317 318 toep->params.emss = tp->t_maxseg; 319 if (G_TCPOPT_TSTAMP(opt)) { 320 toep->params.tstamp = 1; 321 toep->params.emss -= TCPOLEN_TSTAMP_APPA; 322 tp->t_flags |= TF_RCVD_TSTMP; /* timestamps ok */ 323 tp->ts_recent = 0; /* hmmm */ 324 tp->ts_recent_age = tcp_ts_getticks(); 325 } else 326 toep->params.tstamp = 0; 327 328 if (G_TCPOPT_SACK(opt)) { 329 toep->params.sack = 1; 330 tp->t_flags |= TF_SACK_PERMIT; /* should already be set */ 331 } else { 332 toep->params.sack = 0; 333 tp->t_flags &= ~TF_SACK_PERMIT; /* sack disallowed by peer */ 334 } 335 336 if (G_TCPOPT_WSCALE_OK(opt)) 337 tp->t_flags |= TF_RCVD_SCALE; 338 339 /* Doing window scaling? */ 340 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 341 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 342 tp->rcv_scale = tp->request_r_scale; 343 tp->snd_scale = G_TCPOPT_SND_WSCALE(opt); 344 } else 345 toep->params.wscale = 0; 346 347 CTR6(KTR_CXGBE, 348 "assign_rxopt: tid %d, mtu_idx %u, emss %u, ts %u, sack %u, wscale %u", 349 toep->tid, toep->params.mtu_idx, toep->params.emss, 350 toep->params.tstamp, toep->params.sack, toep->params.wscale); 351 } 352 353 /* 354 * Completes some final bits of initialization for just established connections 355 * and changes their state to TCPS_ESTABLISHED. 356 * 357 * The ISNs are from the exchange of SYNs. 
358 */ 359 void 360 make_established(struct toepcb *toep, uint32_t iss, uint32_t irs, uint16_t opt) 361 { 362 struct inpcb *inp = toep->inp; 363 struct socket *so = inp->inp_socket; 364 struct tcpcb *tp = intotcpcb(inp); 365 uint16_t tcpopt = be16toh(opt); 366 367 INP_WLOCK_ASSERT(inp); 368 KASSERT(tp->t_state == TCPS_SYN_SENT || 369 tp->t_state == TCPS_SYN_RECEIVED, 370 ("%s: TCP state %s", __func__, tcpstates[tp->t_state])); 371 372 CTR6(KTR_CXGBE, "%s: tid %d, so %p, inp %p, tp %p, toep %p", 373 __func__, toep->tid, so, inp, tp, toep); 374 375 tcp_state_change(tp, TCPS_ESTABLISHED); 376 tp->t_starttime = ticks; 377 TCPSTAT_INC(tcps_connects); 378 379 tp->irs = irs; 380 tcp_rcvseqinit(tp); 381 tp->rcv_wnd = (u_int)toep->params.opt0_bufsize << 10; 382 tp->rcv_adv += tp->rcv_wnd; 383 tp->last_ack_sent = tp->rcv_nxt; 384 385 tp->iss = iss; 386 tcp_sendseqinit(tp); 387 tp->snd_una = iss + 1; 388 tp->snd_nxt = iss + 1; 389 tp->snd_max = iss + 1; 390 391 assign_rxopt(tp, tcpopt); 392 send_flowc_wr(toep, tp); 393 394 soisconnected(so); 395 } 396 397 int 398 send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits) 399 { 400 struct wrqe *wr; 401 struct cpl_rx_data_ack *req; 402 uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); 403 404 KASSERT(credits >= 0, ("%s: %d credits", __func__, credits)); 405 406 wr = alloc_wrqe(sizeof(*req), toep->ctrlq); 407 if (wr == NULL) 408 return (0); 409 req = wrtod(wr); 410 411 INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid); 412 req->credit_dack = htobe32(dack | V_RX_CREDITS(credits)); 413 414 t4_wrq_tx(sc, wr); 415 return (credits); 416 } 417 418 void 419 send_rx_modulate(struct adapter *sc, struct toepcb *toep) 420 { 421 struct wrqe *wr; 422 struct cpl_rx_data_ack *req; 423 424 wr = alloc_wrqe(sizeof(*req), toep->ctrlq); 425 if (wr == NULL) 426 return; 427 req = wrtod(wr); 428 429 INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid); 430 req->credit_dack = htobe32(F_RX_MODULATE_RX); 431 432 t4_wrq_tx(sc, wr); 433 } 434 435 void 436 t4_rcvd_locked(struct toedev *tod, struct tcpcb *tp) 437 { 438 struct adapter *sc = tod->tod_softc; 439 struct inpcb *inp = tp->t_inpcb; 440 struct socket *so = inp->inp_socket; 441 struct sockbuf *sb = &so->so_rcv; 442 struct toepcb *toep = tp->t_toe; 443 int rx_credits; 444 445 INP_WLOCK_ASSERT(inp); 446 SOCKBUF_LOCK_ASSERT(sb); 447 448 rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0; 449 if (ulp_mode(toep) == ULP_MODE_TLS) { 450 if (toep->tls.rcv_over >= rx_credits) { 451 toep->tls.rcv_over -= rx_credits; 452 rx_credits = 0; 453 } else { 454 rx_credits -= toep->tls.rcv_over; 455 toep->tls.rcv_over = 0; 456 } 457 } 458 459 if (rx_credits > 0 && 460 (tp->rcv_wnd <= 32 * 1024 || rx_credits >= 64 * 1024 || 461 (rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) || 462 sbused(sb) + tp->rcv_wnd < sb->sb_lowat)) { 463 rx_credits = send_rx_credits(sc, toep, rx_credits); 464 tp->rcv_wnd += rx_credits; 465 tp->rcv_adv += rx_credits; 466 } else if (toep->flags & TPF_FORCE_CREDITS) 467 send_rx_modulate(sc, toep); 468 } 469 470 void 471 t4_rcvd(struct toedev *tod, struct tcpcb *tp) 472 { 473 struct inpcb *inp = tp->t_inpcb; 474 struct socket *so = inp->inp_socket; 475 struct sockbuf *sb = &so->so_rcv; 476 477 SOCKBUF_LOCK(sb); 478 t4_rcvd_locked(tod, tp); 479 SOCKBUF_UNLOCK(sb); 480 } 481 482 /* 483 * Close a connection by sending a CPL_CLOSE_CON_REQ message. 
484 */ 485 int 486 t4_close_conn(struct adapter *sc, struct toepcb *toep) 487 { 488 struct wrqe *wr; 489 struct cpl_close_con_req *req; 490 unsigned int tid = toep->tid; 491 492 CTR3(KTR_CXGBE, "%s: tid %u%s", __func__, toep->tid, 493 toep->flags & TPF_FIN_SENT ? ", IGNORED" : ""); 494 495 if (toep->flags & TPF_FIN_SENT) 496 return (0); 497 498 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 499 ("%s: flowc_wr not sent for tid %u.", __func__, tid)); 500 501 wr = alloc_wrqe(sizeof(*req), toep->ofld_txq); 502 if (wr == NULL) { 503 /* XXX */ 504 panic("%s: allocation failure.", __func__); 505 } 506 req = wrtod(wr); 507 508 req->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) | 509 V_FW_WR_IMMDLEN(sizeof(*req) - sizeof(req->wr))); 510 req->wr.wr_mid = htonl(V_FW_WR_LEN16(howmany(sizeof(*req), 16)) | 511 V_FW_WR_FLOWID(tid)); 512 req->wr.wr_lo = cpu_to_be64(0); 513 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid)); 514 req->rsvd = 0; 515 516 toep->flags |= TPF_FIN_SENT; 517 toep->flags &= ~TPF_SEND_FIN; 518 t4_l2t_send(sc, wr, toep->l2te); 519 520 return (0); 521 } 522 523 #define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16) 524 #define MIN_OFLD_TX_CREDITS (howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16)) 525 526 /* Maximum amount of immediate data we could stuff in a WR */ 527 static inline int 528 max_imm_payload(int tx_credits) 529 { 530 const int n = 1; /* Use no more than one desc for imm. data WR */ 531 532 KASSERT(tx_credits >= 0 && 533 tx_credits <= MAX_OFLD_TX_CREDITS, 534 ("%s: %d credits", __func__, tx_credits)); 535 536 if (tx_credits < MIN_OFLD_TX_CREDITS) 537 return (0); 538 539 if (tx_credits >= (n * EQ_ESIZE) / 16) 540 return ((n * EQ_ESIZE) - sizeof(struct fw_ofld_tx_data_wr)); 541 else 542 return (tx_credits * 16 - sizeof(struct fw_ofld_tx_data_wr)); 543 } 544 545 /* Maximum number of SGL entries we could stuff in a WR */ 546 static inline int 547 max_dsgl_nsegs(int tx_credits) 548 { 549 int nseg = 1; /* ulptx_sgl has room for 1, rest ulp_tx_sge_pair */ 550 int sge_pair_credits = tx_credits - MIN_OFLD_TX_CREDITS; 551 552 KASSERT(tx_credits >= 0 && 553 tx_credits <= MAX_OFLD_TX_CREDITS, 554 ("%s: %d credits", __func__, tx_credits)); 555 556 if (tx_credits < MIN_OFLD_TX_CREDITS) 557 return (0); 558 559 nseg += 2 * (sge_pair_credits * 16 / 24); 560 if ((sge_pair_credits * 16) % 24 == 16) 561 nseg++; 562 563 return (nseg); 564 } 565 566 static inline void 567 write_tx_wr(void *dst, struct toepcb *toep, unsigned int immdlen, 568 unsigned int plen, uint8_t credits, int shove, int ulp_submode) 569 { 570 struct fw_ofld_tx_data_wr *txwr = dst; 571 572 txwr->op_to_immdlen = htobe32(V_WR_OP(FW_OFLD_TX_DATA_WR) | 573 V_FW_WR_IMMDLEN(immdlen)); 574 txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) | 575 V_FW_WR_LEN16(credits)); 576 txwr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(ulp_mode(toep)) | 577 V_TX_ULP_SUBMODE(ulp_submode) | V_TX_URG(0) | V_TX_SHOVE(shove)); 578 txwr->plen = htobe32(plen); 579 580 if (toep->params.tx_align > 0) { 581 if (plen < 2 * toep->params.emss) 582 txwr->lsodisable_to_flags |= 583 htobe32(F_FW_OFLD_TX_DATA_WR_LSODISABLE); 584 else 585 txwr->lsodisable_to_flags |= 586 htobe32(F_FW_OFLD_TX_DATA_WR_ALIGNPLD | 587 (toep->params.nagle == 0 ? 0 : 588 F_FW_OFLD_TX_DATA_WR_ALIGNPLDSHOVE)); 589 } 590 } 591 592 /* 593 * Generate a DSGL from a starting mbuf. The total number of segments and the 594 * maximum segments in any one mbuf are provided. 
595 */ 596 static void 597 write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n) 598 { 599 struct mbuf *m; 600 struct ulptx_sgl *usgl = dst; 601 int i, j, rc; 602 struct sglist sg; 603 struct sglist_seg segs[n]; 604 605 KASSERT(nsegs > 0, ("%s: nsegs 0", __func__)); 606 607 sglist_init(&sg, n, segs); 608 usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) | 609 V_ULPTX_NSGE(nsegs)); 610 611 i = -1; 612 for (m = start; m != stop; m = m->m_next) { 613 if (m->m_flags & M_NOMAP) 614 rc = sglist_append_mb_ext_pgs(&sg, m); 615 else 616 rc = sglist_append(&sg, mtod(m, void *), m->m_len); 617 if (__predict_false(rc != 0)) 618 panic("%s: sglist_append %d", __func__, rc); 619 620 for (j = 0; j < sg.sg_nseg; i++, j++) { 621 if (i < 0) { 622 usgl->len0 = htobe32(segs[j].ss_len); 623 usgl->addr0 = htobe64(segs[j].ss_paddr); 624 } else { 625 usgl->sge[i / 2].len[i & 1] = 626 htobe32(segs[j].ss_len); 627 usgl->sge[i / 2].addr[i & 1] = 628 htobe64(segs[j].ss_paddr); 629 } 630 #ifdef INVARIANTS 631 nsegs--; 632 #endif 633 } 634 sglist_reset(&sg); 635 } 636 if (i & 1) 637 usgl->sge[i / 2].len[1] = htobe32(0); 638 KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, stop %p", 639 __func__, nsegs, start, stop)); 640 } 641 642 /* 643 * Max number of SGL entries an offload tx work request can have. This is 41 644 * (1 + 40) for a full 512B work request. 645 * fw_ofld_tx_data_wr(16B) + ulptx_sgl(16B, 1) + ulptx_sge_pair(480B, 40) 646 */ 647 #define OFLD_SGL_LEN (41) 648 649 /* 650 * Send data and/or a FIN to the peer. 651 * 652 * The socket's so_snd buffer consists of a stream of data starting with sb_mb 653 * and linked together with m_next. sb_sndptr, if set, is the last mbuf that 654 * was transmitted. 655 * 656 * drop indicates the number of bytes that should be dropped from the head of 657 * the send buffer. It is an optimization that lets do_fw4_ack avoid creating 658 * contention on the send buffer lock (before this change it used to do 659 * sowwakeup and then t4_push_frames right after that when recovering from tx 660 * stalls). When drop is set this function MUST drop the bytes and wake up any 661 * writers. 
662 */ 663 void 664 t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop) 665 { 666 struct mbuf *sndptr, *m, *sb_sndptr; 667 struct fw_ofld_tx_data_wr *txwr; 668 struct wrqe *wr; 669 u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf; 670 struct inpcb *inp = toep->inp; 671 struct tcpcb *tp = intotcpcb(inp); 672 struct socket *so = inp->inp_socket; 673 struct sockbuf *sb = &so->so_snd; 674 int tx_credits, shove, compl, sowwakeup; 675 struct ofld_tx_sdesc *txsd; 676 bool nomap_mbuf_seen; 677 678 INP_WLOCK_ASSERT(inp); 679 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 680 ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid)); 681 682 KASSERT(ulp_mode(toep) == ULP_MODE_NONE || 683 ulp_mode(toep) == ULP_MODE_TCPDDP || 684 ulp_mode(toep) == ULP_MODE_TLS || 685 ulp_mode(toep) == ULP_MODE_RDMA, 686 ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep)); 687 688 #ifdef VERBOSE_TRACES 689 CTR5(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d", 690 __func__, toep->tid, toep->flags, tp->t_flags, drop); 691 #endif 692 if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) 693 return; 694 695 #ifdef RATELIMIT 696 if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) && 697 (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) { 698 inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; 699 } 700 #endif 701 702 /* 703 * This function doesn't resume by itself. Someone else must clear the 704 * flag and call this function. 705 */ 706 if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) { 707 KASSERT(drop == 0, 708 ("%s: drop (%d) != 0 but tx is suspended", __func__, drop)); 709 return; 710 } 711 712 txsd = &toep->txsd[toep->txsd_pidx]; 713 do { 714 tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS); 715 max_imm = max_imm_payload(tx_credits); 716 max_nsegs = max_dsgl_nsegs(tx_credits); 717 718 SOCKBUF_LOCK(sb); 719 sowwakeup = drop; 720 if (drop) { 721 sbdrop_locked(sb, drop); 722 drop = 0; 723 } 724 sb_sndptr = sb->sb_sndptr; 725 sndptr = sb_sndptr ? 
sb_sndptr->m_next : sb->sb_mb; 726 plen = 0; 727 nsegs = 0; 728 max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */ 729 nomap_mbuf_seen = false; 730 for (m = sndptr; m != NULL; m = m->m_next) { 731 int n; 732 733 if (m->m_flags & M_NOMAP) { 734 #ifdef KERN_TLS 735 if (m->m_ext.ext_pgs->tls != NULL) { 736 toep->flags |= TPF_KTLS; 737 if (plen == 0) { 738 SOCKBUF_UNLOCK(sb); 739 t4_push_ktls(sc, toep, 0); 740 return; 741 } 742 break; 743 } 744 #endif 745 n = sglist_count_mb_ext_pgs(m); 746 } else 747 n = sglist_count(mtod(m, void *), m->m_len); 748 749 nsegs += n; 750 plen += m->m_len; 751 752 /* This mbuf sent us _over_ the nsegs limit, back out */ 753 if (plen > max_imm && nsegs > max_nsegs) { 754 nsegs -= n; 755 plen -= m->m_len; 756 if (plen == 0) { 757 /* Too few credits */ 758 toep->flags |= TPF_TX_SUSPENDED; 759 if (sowwakeup) { 760 if (!TAILQ_EMPTY( 761 &toep->aiotx_jobq)) 762 t4_aiotx_queue_toep(so, 763 toep); 764 sowwakeup_locked(so); 765 } else 766 SOCKBUF_UNLOCK(sb); 767 SOCKBUF_UNLOCK_ASSERT(sb); 768 return; 769 } 770 break; 771 } 772 773 if (m->m_flags & M_NOMAP) 774 nomap_mbuf_seen = true; 775 if (max_nsegs_1mbuf < n) 776 max_nsegs_1mbuf = n; 777 sb_sndptr = m; /* new sb->sb_sndptr if all goes well */ 778 779 /* This mbuf put us right at the max_nsegs limit */ 780 if (plen > max_imm && nsegs == max_nsegs) { 781 m = m->m_next; 782 break; 783 } 784 } 785 786 if (sbused(sb) > sb->sb_hiwat * 5 / 8 && 787 toep->plen_nocompl + plen >= sb->sb_hiwat / 4) 788 compl = 1; 789 else 790 compl = 0; 791 792 if (sb->sb_flags & SB_AUTOSIZE && 793 V_tcp_do_autosndbuf && 794 sb->sb_hiwat < V_tcp_autosndbuf_max && 795 sbused(sb) >= sb->sb_hiwat * 7 / 8) { 796 int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc, 797 V_tcp_autosndbuf_max); 798 799 if (!sbreserve_locked(sb, newsize, so, NULL)) 800 sb->sb_flags &= ~SB_AUTOSIZE; 801 else 802 sowwakeup = 1; /* room available */ 803 } 804 if (sowwakeup) { 805 if (!TAILQ_EMPTY(&toep->aiotx_jobq)) 806 t4_aiotx_queue_toep(so, toep); 807 sowwakeup_locked(so); 808 } else 809 SOCKBUF_UNLOCK(sb); 810 SOCKBUF_UNLOCK_ASSERT(sb); 811 812 /* nothing to send */ 813 if (plen == 0) { 814 KASSERT(m == NULL, 815 ("%s: nothing to send, but m != NULL", __func__)); 816 break; 817 } 818 819 if (__predict_false(toep->flags & TPF_FIN_SENT)) 820 panic("%s: excess tx.", __func__); 821 822 shove = m == NULL && !(tp->t_flags & TF_MORETOCOME); 823 if (plen <= max_imm && !nomap_mbuf_seen) { 824 825 /* Immediate data tx */ 826 827 wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16), 828 toep->ofld_txq); 829 if (wr == NULL) { 830 /* XXX: how will we recover from this? */ 831 toep->flags |= TPF_TX_SUSPENDED; 832 return; 833 } 834 txwr = wrtod(wr); 835 credits = howmany(wr->wr_len, 16); 836 write_tx_wr(txwr, toep, plen, plen, credits, shove, 0); 837 m_copydata(sndptr, 0, plen, (void *)(txwr + 1)); 838 nsegs = 0; 839 } else { 840 int wr_len; 841 842 /* DSGL tx */ 843 844 wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) + 845 ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8; 846 wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq); 847 if (wr == NULL) { 848 /* XXX: how will we recover from this? 
*/ 849 toep->flags |= TPF_TX_SUSPENDED; 850 return; 851 } 852 txwr = wrtod(wr); 853 credits = howmany(wr_len, 16); 854 write_tx_wr(txwr, toep, 0, plen, credits, shove, 0); 855 write_tx_sgl(txwr + 1, sndptr, m, nsegs, 856 max_nsegs_1mbuf); 857 if (wr_len & 0xf) { 858 uint64_t *pad = (uint64_t *) 859 ((uintptr_t)txwr + wr_len); 860 *pad = 0; 861 } 862 } 863 864 KASSERT(toep->tx_credits >= credits, 865 ("%s: not enough credits", __func__)); 866 867 toep->tx_credits -= credits; 868 toep->tx_nocompl += credits; 869 toep->plen_nocompl += plen; 870 if (toep->tx_credits <= toep->tx_total * 3 / 8 && 871 toep->tx_nocompl >= toep->tx_total / 4) 872 compl = 1; 873 874 if (compl || ulp_mode(toep) == ULP_MODE_RDMA) { 875 txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL); 876 toep->tx_nocompl = 0; 877 toep->plen_nocompl = 0; 878 } 879 880 tp->snd_nxt += plen; 881 tp->snd_max += plen; 882 883 SOCKBUF_LOCK(sb); 884 KASSERT(sb_sndptr, ("%s: sb_sndptr is NULL", __func__)); 885 sb->sb_sndptr = sb_sndptr; 886 SOCKBUF_UNLOCK(sb); 887 888 toep->flags |= TPF_TX_DATA_SENT; 889 if (toep->tx_credits < MIN_OFLD_TX_CREDITS) 890 toep->flags |= TPF_TX_SUSPENDED; 891 892 KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); 893 txsd->plen = plen; 894 txsd->tx_credits = credits; 895 txsd++; 896 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) { 897 toep->txsd_pidx = 0; 898 txsd = &toep->txsd[0]; 899 } 900 toep->txsd_avail--; 901 902 t4_l2t_send(sc, wr, toep->l2te); 903 } while (m != NULL); 904 905 /* Send a FIN if requested, but only if there's no more data to send */ 906 if (m == NULL && toep->flags & TPF_SEND_FIN) 907 t4_close_conn(sc, toep); 908 } 909 910 static inline void 911 rqdrop_locked(struct mbufq *q, int plen) 912 { 913 struct mbuf *m; 914 915 while (plen > 0) { 916 m = mbufq_dequeue(q); 917 918 /* Too many credits. */ 919 MPASS(m != NULL); 920 M_ASSERTPKTHDR(m); 921 922 /* Partial credits. */ 923 MPASS(plen >= m->m_pkthdr.len); 924 925 plen -= m->m_pkthdr.len; 926 m_freem(m); 927 } 928 } 929 930 void 931 t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop) 932 { 933 struct mbuf *sndptr, *m; 934 struct fw_ofld_tx_data_wr *txwr; 935 struct wrqe *wr; 936 u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf; 937 u_int adjusted_plen, ulp_submode; 938 struct inpcb *inp = toep->inp; 939 struct tcpcb *tp = intotcpcb(inp); 940 int tx_credits, shove; 941 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; 942 struct mbufq *pduq = &toep->ulp_pduq; 943 static const u_int ulp_extra_len[] = {0, 4, 4, 8}; 944 945 INP_WLOCK_ASSERT(inp); 946 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 947 ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid)); 948 KASSERT(ulp_mode(toep) == ULP_MODE_ISCSI, 949 ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep)); 950 951 if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) 952 return; 953 954 /* 955 * This function doesn't resume by itself. Someone else must clear the 956 * flag and call this function. 
957 */ 958 if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) { 959 KASSERT(drop == 0, 960 ("%s: drop (%d) != 0 but tx is suspended", __func__, drop)); 961 return; 962 } 963 964 if (drop) 965 rqdrop_locked(&toep->ulp_pdu_reclaimq, drop); 966 967 while ((sndptr = mbufq_first(pduq)) != NULL) { 968 M_ASSERTPKTHDR(sndptr); 969 970 tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS); 971 max_imm = max_imm_payload(tx_credits); 972 max_nsegs = max_dsgl_nsegs(tx_credits); 973 974 plen = 0; 975 nsegs = 0; 976 max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */ 977 for (m = sndptr; m != NULL; m = m->m_next) { 978 int n = sglist_count(mtod(m, void *), m->m_len); 979 980 nsegs += n; 981 plen += m->m_len; 982 983 /* 984 * This mbuf would send us _over_ the nsegs limit. 985 * Suspend tx because the PDU can't be sent out. 986 */ 987 if (plen > max_imm && nsegs > max_nsegs) { 988 toep->flags |= TPF_TX_SUSPENDED; 989 return; 990 } 991 992 if (max_nsegs_1mbuf < n) 993 max_nsegs_1mbuf = n; 994 } 995 996 if (__predict_false(toep->flags & TPF_FIN_SENT)) 997 panic("%s: excess tx.", __func__); 998 999 /* 1000 * We have a PDU to send. All of it goes out in one WR so 'm' 1001 * is NULL. A PDU's length is always a multiple of 4. 1002 */ 1003 MPASS(m == NULL); 1004 MPASS((plen & 3) == 0); 1005 MPASS(sndptr->m_pkthdr.len == plen); 1006 1007 shove = !(tp->t_flags & TF_MORETOCOME); 1008 ulp_submode = mbuf_ulp_submode(sndptr); 1009 MPASS(ulp_submode < nitems(ulp_extra_len)); 1010 1011 /* 1012 * plen doesn't include header and data digests, which are 1013 * generated and inserted in the right places by the TOE, but 1014 * they do occupy TCP sequence space and need to be accounted 1015 * for. 1016 */ 1017 adjusted_plen = plen + ulp_extra_len[ulp_submode]; 1018 if (plen <= max_imm) { 1019 1020 /* Immediate data tx */ 1021 1022 wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16), 1023 toep->ofld_txq); 1024 if (wr == NULL) { 1025 /* XXX: how will we recover from this? */ 1026 toep->flags |= TPF_TX_SUSPENDED; 1027 return; 1028 } 1029 txwr = wrtod(wr); 1030 credits = howmany(wr->wr_len, 16); 1031 write_tx_wr(txwr, toep, plen, adjusted_plen, credits, 1032 shove, ulp_submode); 1033 m_copydata(sndptr, 0, plen, (void *)(txwr + 1)); 1034 nsegs = 0; 1035 } else { 1036 int wr_len; 1037 1038 /* DSGL tx */ 1039 wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) + 1040 ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8; 1041 wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq); 1042 if (wr == NULL) { 1043 /* XXX: how will we recover from this? 
*/ 1044 toep->flags |= TPF_TX_SUSPENDED; 1045 return; 1046 } 1047 txwr = wrtod(wr); 1048 credits = howmany(wr_len, 16); 1049 write_tx_wr(txwr, toep, 0, adjusted_plen, credits, 1050 shove, ulp_submode); 1051 write_tx_sgl(txwr + 1, sndptr, m, nsegs, 1052 max_nsegs_1mbuf); 1053 if (wr_len & 0xf) { 1054 uint64_t *pad = (uint64_t *) 1055 ((uintptr_t)txwr + wr_len); 1056 *pad = 0; 1057 } 1058 } 1059 1060 KASSERT(toep->tx_credits >= credits, 1061 ("%s: not enough credits", __func__)); 1062 1063 m = mbufq_dequeue(pduq); 1064 MPASS(m == sndptr); 1065 mbufq_enqueue(&toep->ulp_pdu_reclaimq, m); 1066 1067 toep->tx_credits -= credits; 1068 toep->tx_nocompl += credits; 1069 toep->plen_nocompl += plen; 1070 if (toep->tx_credits <= toep->tx_total * 3 / 8 && 1071 toep->tx_nocompl >= toep->tx_total / 4) { 1072 txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL); 1073 toep->tx_nocompl = 0; 1074 toep->plen_nocompl = 0; 1075 } 1076 1077 tp->snd_nxt += adjusted_plen; 1078 tp->snd_max += adjusted_plen; 1079 1080 toep->flags |= TPF_TX_DATA_SENT; 1081 if (toep->tx_credits < MIN_OFLD_TX_CREDITS) 1082 toep->flags |= TPF_TX_SUSPENDED; 1083 1084 KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); 1085 txsd->plen = plen; 1086 txsd->tx_credits = credits; 1087 txsd++; 1088 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) { 1089 toep->txsd_pidx = 0; 1090 txsd = &toep->txsd[0]; 1091 } 1092 toep->txsd_avail--; 1093 1094 t4_l2t_send(sc, wr, toep->l2te); 1095 } 1096 1097 /* Send a FIN if requested, but only if there are no more PDUs to send */ 1098 if (mbufq_first(pduq) == NULL && toep->flags & TPF_SEND_FIN) 1099 t4_close_conn(sc, toep); 1100 } 1101 1102 static inline void 1103 t4_push_data(struct adapter *sc, struct toepcb *toep, int drop) 1104 { 1105 1106 if (ulp_mode(toep) == ULP_MODE_ISCSI) 1107 t4_push_pdus(sc, toep, drop); 1108 else if (tls_tx_key(toep) && toep->tls.mode == TLS_MODE_TLSOM) 1109 t4_push_tls_records(sc, toep, drop); 1110 #ifdef KERN_TLS 1111 else if (toep->flags & TPF_KTLS) 1112 t4_push_ktls(sc, toep, drop); 1113 #endif 1114 else 1115 t4_push_frames(sc, toep, drop); 1116 } 1117 1118 int 1119 t4_tod_output(struct toedev *tod, struct tcpcb *tp) 1120 { 1121 struct adapter *sc = tod->tod_softc; 1122 #ifdef INVARIANTS 1123 struct inpcb *inp = tp->t_inpcb; 1124 #endif 1125 struct toepcb *toep = tp->t_toe; 1126 1127 INP_WLOCK_ASSERT(inp); 1128 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1129 ("%s: inp %p dropped.", __func__, inp)); 1130 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 1131 1132 t4_push_data(sc, toep, 0); 1133 1134 return (0); 1135 } 1136 1137 int 1138 t4_send_fin(struct toedev *tod, struct tcpcb *tp) 1139 { 1140 struct adapter *sc = tod->tod_softc; 1141 #ifdef INVARIANTS 1142 struct inpcb *inp = tp->t_inpcb; 1143 #endif 1144 struct toepcb *toep = tp->t_toe; 1145 1146 INP_WLOCK_ASSERT(inp); 1147 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1148 ("%s: inp %p dropped.", __func__, inp)); 1149 KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); 1150 1151 toep->flags |= TPF_SEND_FIN; 1152 if (tp->t_state >= TCPS_ESTABLISHED) 1153 t4_push_data(sc, toep, 0); 1154 1155 return (0); 1156 } 1157 1158 int 1159 t4_send_rst(struct toedev *tod, struct tcpcb *tp) 1160 { 1161 struct adapter *sc = tod->tod_softc; 1162 #if defined(INVARIANTS) 1163 struct inpcb *inp = tp->t_inpcb; 1164 #endif 1165 struct toepcb *toep = tp->t_toe; 1166 1167 INP_WLOCK_ASSERT(inp); 1168 KASSERT((inp->inp_flags & INP_DROPPED) == 0, 1169 ("%s: inp %p dropped.", __func__, inp)); 1170 KASSERT(toep != NULL, ("%s: toep 
is NULL", __func__)); 1171 1172 /* hmmmm */ 1173 KASSERT(toep->flags & TPF_FLOWC_WR_SENT, 1174 ("%s: flowc for tid %u [%s] not sent already", 1175 __func__, toep->tid, tcpstates[tp->t_state])); 1176 1177 send_reset(sc, toep, 0); 1178 return (0); 1179 } 1180 1181 /* 1182 * Peer has sent us a FIN. 1183 */ 1184 static int 1185 do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1186 { 1187 struct adapter *sc = iq->adapter; 1188 const struct cpl_peer_close *cpl = (const void *)(rss + 1); 1189 unsigned int tid = GET_TID(cpl); 1190 struct toepcb *toep = lookup_tid(sc, tid); 1191 struct inpcb *inp = toep->inp; 1192 struct tcpcb *tp = NULL; 1193 struct socket *so; 1194 struct epoch_tracker et; 1195 #ifdef INVARIANTS 1196 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1197 #endif 1198 1199 KASSERT(opcode == CPL_PEER_CLOSE, 1200 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1201 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1202 1203 if (__predict_false(toep->flags & TPF_SYNQE)) { 1204 /* 1205 * do_pass_establish must have run before do_peer_close and if 1206 * this is still a synqe instead of a toepcb then the connection 1207 * must be getting aborted. 1208 */ 1209 MPASS(toep->flags & TPF_ABORT_SHUTDOWN); 1210 CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid, 1211 toep, toep->flags); 1212 return (0); 1213 } 1214 1215 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1216 1217 CURVNET_SET(toep->vnet); 1218 NET_EPOCH_ENTER(et); 1219 INP_WLOCK(inp); 1220 tp = intotcpcb(inp); 1221 1222 CTR6(KTR_CXGBE, 1223 "%s: tid %u (%s), toep_flags 0x%x, ddp_flags 0x%x, inp %p", 1224 __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags, 1225 toep->ddp.flags, inp); 1226 1227 if (toep->flags & TPF_ABORT_SHUTDOWN) 1228 goto done; 1229 1230 tp->rcv_nxt++; /* FIN */ 1231 1232 so = inp->inp_socket; 1233 socantrcvmore(so); 1234 if (ulp_mode(toep) == ULP_MODE_TCPDDP) { 1235 DDP_LOCK(toep); 1236 if (__predict_false(toep->ddp.flags & 1237 (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE))) 1238 handle_ddp_close(toep, tp, cpl->rcv_nxt); 1239 DDP_UNLOCK(toep); 1240 } 1241 1242 if (ulp_mode(toep) != ULP_MODE_RDMA) { 1243 KASSERT(tp->rcv_nxt == be32toh(cpl->rcv_nxt), 1244 ("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt, 1245 be32toh(cpl->rcv_nxt))); 1246 } 1247 1248 switch (tp->t_state) { 1249 case TCPS_SYN_RECEIVED: 1250 tp->t_starttime = ticks; 1251 /* FALLTHROUGH */ 1252 1253 case TCPS_ESTABLISHED: 1254 tcp_state_change(tp, TCPS_CLOSE_WAIT); 1255 break; 1256 1257 case TCPS_FIN_WAIT_1: 1258 tcp_state_change(tp, TCPS_CLOSING); 1259 break; 1260 1261 case TCPS_FIN_WAIT_2: 1262 tcp_twstart(tp); 1263 INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ 1264 NET_EPOCH_EXIT(et); 1265 CURVNET_RESTORE(); 1266 1267 INP_WLOCK(inp); 1268 final_cpl_received(toep); 1269 return (0); 1270 1271 default: 1272 log(LOG_ERR, "%s: TID %u received CPL_PEER_CLOSE in state %d\n", 1273 __func__, tid, tp->t_state); 1274 } 1275 done: 1276 INP_WUNLOCK(inp); 1277 NET_EPOCH_EXIT(et); 1278 CURVNET_RESTORE(); 1279 return (0); 1280 } 1281 1282 /* 1283 * Peer has ACK'd our FIN. 
1284 */ 1285 static int 1286 do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss, 1287 struct mbuf *m) 1288 { 1289 struct adapter *sc = iq->adapter; 1290 const struct cpl_close_con_rpl *cpl = (const void *)(rss + 1); 1291 unsigned int tid = GET_TID(cpl); 1292 struct toepcb *toep = lookup_tid(sc, tid); 1293 struct inpcb *inp = toep->inp; 1294 struct tcpcb *tp = NULL; 1295 struct socket *so = NULL; 1296 struct epoch_tracker et; 1297 #ifdef INVARIANTS 1298 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1299 #endif 1300 1301 KASSERT(opcode == CPL_CLOSE_CON_RPL, 1302 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1303 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1304 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1305 1306 CURVNET_SET(toep->vnet); 1307 NET_EPOCH_ENTER(et); 1308 INP_WLOCK(inp); 1309 tp = intotcpcb(inp); 1310 1311 CTR4(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x", 1312 __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags); 1313 1314 if (toep->flags & TPF_ABORT_SHUTDOWN) 1315 goto done; 1316 1317 so = inp->inp_socket; 1318 tp->snd_una = be32toh(cpl->snd_nxt) - 1; /* exclude FIN */ 1319 1320 switch (tp->t_state) { 1321 case TCPS_CLOSING: /* see TCPS_FIN_WAIT_2 in do_peer_close too */ 1322 tcp_twstart(tp); 1323 release: 1324 INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ 1325 NET_EPOCH_EXIT(et); 1326 CURVNET_RESTORE(); 1327 1328 INP_WLOCK(inp); 1329 final_cpl_received(toep); /* no more CPLs expected */ 1330 1331 return (0); 1332 case TCPS_LAST_ACK: 1333 if (tcp_close(tp)) 1334 INP_WUNLOCK(inp); 1335 goto release; 1336 1337 case TCPS_FIN_WAIT_1: 1338 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 1339 soisdisconnected(so); 1340 tcp_state_change(tp, TCPS_FIN_WAIT_2); 1341 break; 1342 1343 default: 1344 log(LOG_ERR, 1345 "%s: TID %u received CPL_CLOSE_CON_RPL in state %s\n", 1346 __func__, tid, tcpstates[tp->t_state]); 1347 } 1348 done: 1349 INP_WUNLOCK(inp); 1350 NET_EPOCH_EXIT(et); 1351 CURVNET_RESTORE(); 1352 return (0); 1353 } 1354 1355 void 1356 send_abort_rpl(struct adapter *sc, struct sge_wrq *ofld_txq, int tid, 1357 int rst_status) 1358 { 1359 struct wrqe *wr; 1360 struct cpl_abort_rpl *cpl; 1361 1362 wr = alloc_wrqe(sizeof(*cpl), ofld_txq); 1363 if (wr == NULL) { 1364 /* XXX */ 1365 panic("%s: allocation failure.", __func__); 1366 } 1367 cpl = wrtod(wr); 1368 1369 INIT_TP_WR_MIT_CPL(cpl, CPL_ABORT_RPL, tid); 1370 cpl->cmd = rst_status; 1371 1372 t4_wrq_tx(sc, wr); 1373 } 1374 1375 static int 1376 abort_status_to_errno(struct tcpcb *tp, unsigned int abort_reason) 1377 { 1378 switch (abort_reason) { 1379 case CPL_ERR_BAD_SYN: 1380 case CPL_ERR_CONN_RESET: 1381 return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET); 1382 case CPL_ERR_XMIT_TIMEDOUT: 1383 case CPL_ERR_PERSIST_TIMEDOUT: 1384 case CPL_ERR_FINWAIT2_TIMEDOUT: 1385 case CPL_ERR_KEEPALIVE_TIMEDOUT: 1386 return (ETIMEDOUT); 1387 default: 1388 return (EIO); 1389 } 1390 } 1391 1392 /* 1393 * TCP RST from the peer, timeout, or some other such critical error. 
1394 */ 1395 static int 1396 do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1397 { 1398 struct adapter *sc = iq->adapter; 1399 const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1); 1400 unsigned int tid = GET_TID(cpl); 1401 struct toepcb *toep = lookup_tid(sc, tid); 1402 struct sge_wrq *ofld_txq = toep->ofld_txq; 1403 struct inpcb *inp; 1404 struct tcpcb *tp; 1405 struct epoch_tracker et; 1406 #ifdef INVARIANTS 1407 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1408 #endif 1409 1410 KASSERT(opcode == CPL_ABORT_REQ_RSS, 1411 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1412 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1413 1414 if (toep->flags & TPF_SYNQE) 1415 return (do_abort_req_synqe(iq, rss, m)); 1416 1417 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1418 1419 if (negative_advice(cpl->status)) { 1420 CTR4(KTR_CXGBE, "%s: negative advice %d for tid %d (0x%x)", 1421 __func__, cpl->status, tid, toep->flags); 1422 return (0); /* Ignore negative advice */ 1423 } 1424 1425 inp = toep->inp; 1426 CURVNET_SET(toep->vnet); 1427 NET_EPOCH_ENTER(et); /* for tcp_close */ 1428 INP_WLOCK(inp); 1429 1430 tp = intotcpcb(inp); 1431 1432 CTR6(KTR_CXGBE, 1433 "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x, status %d", 1434 __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags, 1435 inp->inp_flags, cpl->status); 1436 1437 /* 1438 * If we'd initiated an abort earlier the reply to it is responsible for 1439 * cleaning up resources. Otherwise we tear everything down right here 1440 * right now. We owe the T4 a CPL_ABORT_RPL no matter what. 1441 */ 1442 if (toep->flags & TPF_ABORT_SHUTDOWN) { 1443 INP_WUNLOCK(inp); 1444 goto done; 1445 } 1446 toep->flags |= TPF_ABORT_SHUTDOWN; 1447 1448 if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) { 1449 struct socket *so = inp->inp_socket; 1450 1451 if (so != NULL) 1452 so_error_set(so, abort_status_to_errno(tp, 1453 cpl->status)); 1454 tp = tcp_close(tp); 1455 if (tp == NULL) 1456 INP_WLOCK(inp); /* re-acquire */ 1457 } 1458 1459 final_cpl_received(toep); 1460 done: 1461 NET_EPOCH_EXIT(et); 1462 CURVNET_RESTORE(); 1463 send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST); 1464 return (0); 1465 } 1466 1467 /* 1468 * Reply to the CPL_ABORT_REQ (send_reset) 1469 */ 1470 static int 1471 do_abort_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1472 { 1473 struct adapter *sc = iq->adapter; 1474 const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1); 1475 unsigned int tid = GET_TID(cpl); 1476 struct toepcb *toep = lookup_tid(sc, tid); 1477 struct inpcb *inp = toep->inp; 1478 #ifdef INVARIANTS 1479 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1480 #endif 1481 1482 KASSERT(opcode == CPL_ABORT_RPL_RSS, 1483 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1484 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1485 1486 if (toep->flags & TPF_SYNQE) 1487 return (do_abort_rpl_synqe(iq, rss, m)); 1488 1489 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1490 1491 CTR5(KTR_CXGBE, "%s: tid %u, toep %p, inp %p, status %d", 1492 __func__, tid, toep, inp, cpl->status); 1493 1494 KASSERT(toep->flags & TPF_ABORT_SHUTDOWN, 1495 ("%s: wasn't expecting abort reply", __func__)); 1496 1497 INP_WLOCK(inp); 1498 final_cpl_received(toep); 1499 1500 return (0); 1501 } 1502 1503 static int 1504 do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1505 { 1506 struct adapter *sc = 
iq->adapter; 1507 const struct cpl_rx_data *cpl = mtod(m, const void *); 1508 unsigned int tid = GET_TID(cpl); 1509 struct toepcb *toep = lookup_tid(sc, tid); 1510 struct inpcb *inp = toep->inp; 1511 struct tcpcb *tp; 1512 struct socket *so; 1513 struct sockbuf *sb; 1514 struct epoch_tracker et; 1515 int len, rx_credits; 1516 uint32_t ddp_placed = 0; 1517 1518 if (__predict_false(toep->flags & TPF_SYNQE)) { 1519 /* 1520 * do_pass_establish must have run before do_rx_data and if this 1521 * is still a synqe instead of a toepcb then the connection must 1522 * be getting aborted. 1523 */ 1524 MPASS(toep->flags & TPF_ABORT_SHUTDOWN); 1525 CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid, 1526 toep, toep->flags); 1527 m_freem(m); 1528 return (0); 1529 } 1530 1531 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1532 1533 /* strip off CPL header */ 1534 m_adj(m, sizeof(*cpl)); 1535 len = m->m_pkthdr.len; 1536 1537 INP_WLOCK(inp); 1538 if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) { 1539 CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x", 1540 __func__, tid, len, inp->inp_flags); 1541 INP_WUNLOCK(inp); 1542 m_freem(m); 1543 return (0); 1544 } 1545 1546 tp = intotcpcb(inp); 1547 1548 if (__predict_false(tp->rcv_nxt != be32toh(cpl->seq))) 1549 ddp_placed = be32toh(cpl->seq) - tp->rcv_nxt; 1550 1551 tp->rcv_nxt += len; 1552 if (tp->rcv_wnd < len) { 1553 KASSERT(ulp_mode(toep) == ULP_MODE_RDMA, 1554 ("%s: negative window size", __func__)); 1555 } 1556 1557 tp->rcv_wnd -= len; 1558 tp->t_rcvtime = ticks; 1559 1560 if (ulp_mode(toep) == ULP_MODE_TCPDDP) 1561 DDP_LOCK(toep); 1562 so = inp_inpcbtosocket(inp); 1563 sb = &so->so_rcv; 1564 SOCKBUF_LOCK(sb); 1565 1566 if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) { 1567 CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)", 1568 __func__, tid, len); 1569 m_freem(m); 1570 SOCKBUF_UNLOCK(sb); 1571 if (ulp_mode(toep) == ULP_MODE_TCPDDP) 1572 DDP_UNLOCK(toep); 1573 INP_WUNLOCK(inp); 1574 1575 CURVNET_SET(toep->vnet); 1576 NET_EPOCH_ENTER(et); 1577 INP_WLOCK(inp); 1578 tp = tcp_drop(tp, ECONNRESET); 1579 if (tp) 1580 INP_WUNLOCK(inp); 1581 NET_EPOCH_EXIT(et); 1582 CURVNET_RESTORE(); 1583 1584 return (0); 1585 } 1586 1587 /* receive buffer autosize */ 1588 MPASS(toep->vnet == so->so_vnet); 1589 CURVNET_SET(toep->vnet); 1590 if (sb->sb_flags & SB_AUTOSIZE && 1591 V_tcp_do_autorcvbuf && 1592 sb->sb_hiwat < V_tcp_autorcvbuf_max && 1593 len > (sbspace(sb) / 8 * 7)) { 1594 unsigned int hiwat = sb->sb_hiwat; 1595 unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc, 1596 V_tcp_autorcvbuf_max); 1597 1598 if (!sbreserve_locked(sb, newsize, so, NULL)) 1599 sb->sb_flags &= ~SB_AUTOSIZE; 1600 } 1601 1602 if (ulp_mode(toep) == ULP_MODE_TCPDDP) { 1603 int changed = !(toep->ddp.flags & DDP_ON) ^ cpl->ddp_off; 1604 1605 if (toep->ddp.waiting_count != 0 || toep->ddp.active_count != 0) 1606 CTR3(KTR_CXGBE, "%s: tid %u, non-ddp rx (%d bytes)", 1607 __func__, tid, len); 1608 1609 if (changed) { 1610 if (toep->ddp.flags & DDP_SC_REQ) 1611 toep->ddp.flags ^= DDP_ON | DDP_SC_REQ; 1612 else { 1613 KASSERT(cpl->ddp_off == 1, 1614 ("%s: DDP switched on by itself.", 1615 __func__)); 1616 1617 /* Fell out of DDP mode */ 1618 toep->ddp.flags &= ~DDP_ON; 1619 CTR1(KTR_CXGBE, "%s: fell out of DDP mode", 1620 __func__); 1621 1622 insert_ddp_data(toep, ddp_placed); 1623 } 1624 } 1625 1626 if (toep->ddp.flags & DDP_ON) { 1627 /* 1628 * CPL_RX_DATA with DDP on can only be an indicate. 1629 * Start posting queued AIO requests via DDP. 
The 1630 * payload that arrived in this indicate is appended 1631 * to the socket buffer as usual. 1632 */ 1633 handle_ddp_indicate(toep); 1634 } 1635 } 1636 1637 sbappendstream_locked(sb, m, 0); 1638 rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0; 1639 if (rx_credits > 0 && sbused(sb) + tp->rcv_wnd < sb->sb_lowat) { 1640 rx_credits = send_rx_credits(sc, toep, rx_credits); 1641 tp->rcv_wnd += rx_credits; 1642 tp->rcv_adv += rx_credits; 1643 } 1644 1645 if (ulp_mode(toep) == ULP_MODE_TCPDDP && toep->ddp.waiting_count > 0 && 1646 sbavail(sb) != 0) { 1647 CTR2(KTR_CXGBE, "%s: tid %u queueing AIO task", __func__, 1648 tid); 1649 ddp_queue_toep(toep); 1650 } 1651 sorwakeup_locked(so); 1652 SOCKBUF_UNLOCK_ASSERT(sb); 1653 if (ulp_mode(toep) == ULP_MODE_TCPDDP) 1654 DDP_UNLOCK(toep); 1655 1656 INP_WUNLOCK(inp); 1657 CURVNET_RESTORE(); 1658 return (0); 1659 } 1660 1661 static int 1662 do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 1663 { 1664 struct adapter *sc = iq->adapter; 1665 const struct cpl_fw4_ack *cpl = (const void *)(rss + 1); 1666 unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl))); 1667 struct toepcb *toep = lookup_tid(sc, tid); 1668 struct inpcb *inp; 1669 struct tcpcb *tp; 1670 struct socket *so; 1671 uint8_t credits = cpl->credits; 1672 struct ofld_tx_sdesc *txsd; 1673 int plen; 1674 #ifdef INVARIANTS 1675 unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl))); 1676 #endif 1677 1678 /* 1679 * Very unusual case: we'd sent a flowc + abort_req for a synq entry and 1680 * now this comes back carrying the credits for the flowc. 1681 */ 1682 if (__predict_false(toep->flags & TPF_SYNQE)) { 1683 KASSERT(toep->flags & TPF_ABORT_SHUTDOWN, 1684 ("%s: credits for a synq entry %p", __func__, toep)); 1685 return (0); 1686 } 1687 1688 inp = toep->inp; 1689 1690 KASSERT(opcode == CPL_FW4_ACK, 1691 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1692 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1693 KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); 1694 1695 INP_WLOCK(inp); 1696 1697 if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) { 1698 INP_WUNLOCK(inp); 1699 return (0); 1700 } 1701 1702 KASSERT((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0, 1703 ("%s: inp_flags 0x%x", __func__, inp->inp_flags)); 1704 1705 tp = intotcpcb(inp); 1706 1707 if (cpl->flags & CPL_FW4_ACK_FLAGS_SEQVAL) { 1708 tcp_seq snd_una = be32toh(cpl->snd_una); 1709 1710 #ifdef INVARIANTS 1711 if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) { 1712 log(LOG_ERR, 1713 "%s: unexpected seq# %x for TID %u, snd_una %x\n", 1714 __func__, snd_una, toep->tid, tp->snd_una); 1715 } 1716 #endif 1717 1718 if (tp->snd_una != snd_una) { 1719 tp->snd_una = snd_una; 1720 tp->ts_recent_age = tcp_ts_getticks(); 1721 } 1722 } 1723 1724 #ifdef VERBOSE_TRACES 1725 CTR3(KTR_CXGBE, "%s: tid %d credits %u", __func__, tid, credits); 1726 #endif 1727 so = inp->inp_socket; 1728 txsd = &toep->txsd[toep->txsd_cidx]; 1729 plen = 0; 1730 while (credits) { 1731 KASSERT(credits >= txsd->tx_credits, 1732 ("%s: too many (or partial) credits", __func__)); 1733 credits -= txsd->tx_credits; 1734 toep->tx_credits += txsd->tx_credits; 1735 plen += txsd->plen; 1736 if (txsd->iv_buffer) { 1737 free(txsd->iv_buffer, M_CXGBE); 1738 txsd->iv_buffer = NULL; 1739 } 1740 txsd++; 1741 toep->txsd_avail++; 1742 KASSERT(toep->txsd_avail <= toep->txsd_total, 1743 ("%s: txsd avail > total", __func__)); 1744 if (__predict_false(++toep->txsd_cidx == 
toep->txsd_total)) { 1745 txsd = &toep->txsd[0]; 1746 toep->txsd_cidx = 0; 1747 } 1748 } 1749 1750 if (toep->tx_credits == toep->tx_total) { 1751 toep->tx_nocompl = 0; 1752 toep->plen_nocompl = 0; 1753 } 1754 1755 if (toep->flags & TPF_TX_SUSPENDED && 1756 toep->tx_credits >= toep->tx_total / 4) { 1757 #ifdef VERBOSE_TRACES 1758 CTR2(KTR_CXGBE, "%s: tid %d calling t4_push_frames", __func__, 1759 tid); 1760 #endif 1761 toep->flags &= ~TPF_TX_SUSPENDED; 1762 CURVNET_SET(toep->vnet); 1763 t4_push_data(sc, toep, plen); 1764 CURVNET_RESTORE(); 1765 } else if (plen > 0) { 1766 struct sockbuf *sb = &so->so_snd; 1767 int sbu; 1768 1769 SOCKBUF_LOCK(sb); 1770 sbu = sbused(sb); 1771 if (ulp_mode(toep) == ULP_MODE_ISCSI) { 1772 1773 if (__predict_false(sbu > 0)) { 1774 /* 1775 * The data trasmitted before the tid's ULP mode 1776 * changed to ISCSI is still in so_snd. 1777 * Incoming credits should account for so_snd 1778 * first. 1779 */ 1780 sbdrop_locked(sb, min(sbu, plen)); 1781 plen -= min(sbu, plen); 1782 } 1783 sowwakeup_locked(so); /* unlocks so_snd */ 1784 rqdrop_locked(&toep->ulp_pdu_reclaimq, plen); 1785 } else { 1786 #ifdef VERBOSE_TRACES 1787 CTR3(KTR_CXGBE, "%s: tid %d dropped %d bytes", __func__, 1788 tid, plen); 1789 #endif 1790 sbdrop_locked(sb, plen); 1791 if (tls_tx_key(toep) && 1792 toep->tls.mode == TLS_MODE_TLSOM) { 1793 struct tls_ofld_info *tls_ofld = &toep->tls; 1794 1795 MPASS(tls_ofld->sb_off >= plen); 1796 tls_ofld->sb_off -= plen; 1797 } 1798 if (!TAILQ_EMPTY(&toep->aiotx_jobq)) 1799 t4_aiotx_queue_toep(so, toep); 1800 sowwakeup_locked(so); /* unlocks so_snd */ 1801 } 1802 SOCKBUF_UNLOCK_ASSERT(sb); 1803 } 1804 1805 INP_WUNLOCK(inp); 1806 1807 return (0); 1808 } 1809 1810 void 1811 t4_set_tcb_field(struct adapter *sc, struct sge_wrq *wrq, struct toepcb *toep, 1812 uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie) 1813 { 1814 struct wrqe *wr; 1815 struct cpl_set_tcb_field *req; 1816 struct ofld_tx_sdesc *txsd; 1817 1818 MPASS((cookie & ~M_COOKIE) == 0); 1819 if (reply) { 1820 MPASS(cookie != CPL_COOKIE_RESERVED); 1821 } 1822 1823 wr = alloc_wrqe(sizeof(*req), wrq); 1824 if (wr == NULL) { 1825 /* XXX */ 1826 panic("%s: allocation failure.", __func__); 1827 } 1828 req = wrtod(wr); 1829 1830 INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, toep->tid); 1831 req->reply_ctrl = htobe16(V_QUEUENO(toep->ofld_rxq->iq.abs_id)); 1832 if (reply == 0) 1833 req->reply_ctrl |= htobe16(F_NO_REPLY); 1834 req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(cookie)); 1835 req->mask = htobe64(mask); 1836 req->val = htobe64(val); 1837 if ((wrq->eq.flags & EQ_TYPEMASK) == EQ_OFLD) { 1838 txsd = &toep->txsd[toep->txsd_pidx]; 1839 txsd->tx_credits = howmany(sizeof(*req), 16); 1840 txsd->plen = 0; 1841 KASSERT(toep->tx_credits >= txsd->tx_credits && 1842 toep->txsd_avail > 0, 1843 ("%s: not enough credits (%d)", __func__, 1844 toep->tx_credits)); 1845 toep->tx_credits -= txsd->tx_credits; 1846 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) 1847 toep->txsd_pidx = 0; 1848 toep->txsd_avail--; 1849 } 1850 1851 t4_wrq_tx(sc, wr); 1852 } 1853 1854 void 1855 t4_init_cpl_io_handlers(void) 1856 { 1857 1858 t4_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close); 1859 t4_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl); 1860 t4_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req); 1861 t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl, 1862 CPL_COOKIE_TOM); 1863 t4_register_cpl_handler(CPL_RX_DATA, do_rx_data); 1864 t4_register_shared_cpl_handler(CPL_FW4_ACK, 
do_fw4_ack, CPL_COOKIE_TOM); 1865 } 1866 1867 void 1868 t4_uninit_cpl_io_handlers(void) 1869 { 1870 1871 t4_register_cpl_handler(CPL_PEER_CLOSE, NULL); 1872 t4_register_cpl_handler(CPL_CLOSE_CON_RPL, NULL); 1873 t4_register_cpl_handler(CPL_ABORT_REQ_RSS, NULL); 1874 t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, NULL, CPL_COOKIE_TOM); 1875 t4_register_cpl_handler(CPL_RX_DATA, NULL); 1876 t4_register_shared_cpl_handler(CPL_FW4_ACK, NULL, CPL_COOKIE_TOM); 1877 } 1878 1879 /* 1880 * Use the 'backend1' field in AIO jobs to hold an error that should 1881 * be reported when the job is completed, the 'backend3' field to 1882 * store the amount of data sent by the AIO job so far, and the 1883 * 'backend4' field to hold a reference count on the job. 1884 * 1885 * Each unmapped mbuf holds a reference on the job as does the queue 1886 * so long as the job is queued. 1887 */ 1888 #define aio_error backend1 1889 #define aio_sent backend3 1890 #define aio_refs backend4 1891 1892 #define jobtotid(job) \ 1893 (((struct toepcb *)(so_sototcpcb((job)->fd_file->f_data)->t_toe))->tid) 1894 1895 static void 1896 aiotx_free_job(struct kaiocb *job) 1897 { 1898 long status; 1899 int error; 1900 1901 if (refcount_release(&job->aio_refs) == 0) 1902 return; 1903 1904 error = (intptr_t)job->aio_error; 1905 status = job->aio_sent; 1906 #ifdef VERBOSE_TRACES 1907 CTR5(KTR_CXGBE, "%s: tid %d completed %p len %ld, error %d", __func__, 1908 jobtotid(job), job, status, error); 1909 #endif 1910 if (error != 0 && status != 0) 1911 error = 0; 1912 if (error == ECANCELED) 1913 aio_cancel(job); 1914 else if (error) 1915 aio_complete(job, -1, error); 1916 else { 1917 job->msgsnd = 1; 1918 aio_complete(job, status, 0); 1919 } 1920 } 1921 1922 static void 1923 aiotx_free_pgs(struct mbuf *m) 1924 { 1925 struct mbuf_ext_pgs *ext_pgs; 1926 struct kaiocb *job; 1927 vm_page_t pg; 1928 1929 MBUF_EXT_PGS_ASSERT(m); 1930 ext_pgs = m->m_ext.ext_pgs; 1931 job = m->m_ext.ext_arg1; 1932 #ifdef VERBOSE_TRACES 1933 CTR3(KTR_CXGBE, "%s: completed %d bytes for tid %d", __func__, 1934 m->m_len, jobtotid(job)); 1935 #endif 1936 1937 for (int i = 0; i < ext_pgs->npgs; i++) { 1938 pg = PHYS_TO_VM_PAGE(ext_pgs->pa[i]); 1939 vm_page_unwire(pg, PQ_ACTIVE); 1940 } 1941 1942 aiotx_free_job(job); 1943 } 1944 1945 /* 1946 * Allocate a chain of unmapped mbufs describing the next 'len' bytes 1947 * of an AIO job. 1948 */ 1949 static struct mbuf * 1950 alloc_aiotx_mbuf(struct kaiocb *job, int len) 1951 { 1952 struct vmspace *vm; 1953 vm_page_t pgs[MBUF_PEXT_MAX_PGS]; 1954 struct mbuf *m, *top, *last; 1955 struct mbuf_ext_pgs *ext_pgs; 1956 vm_map_t map; 1957 vm_offset_t start; 1958 int i, mlen, npages, pgoff; 1959 1960 KASSERT(job->aio_sent + len <= job->uaiocb.aio_nbytes, 1961 ("%s(%p, %d): request to send beyond end of buffer", __func__, 1962 job, len)); 1963 1964 /* 1965 * The AIO subsystem will cancel and drain all requests before 1966 * permitting a process to exit or exec, so p_vmspace should 1967 * be stable here. 
static void
aiotx_free_pgs(struct mbuf *m)
{
	struct mbuf_ext_pgs *ext_pgs;
	struct kaiocb *job;
	vm_page_t pg;

	MBUF_EXT_PGS_ASSERT(m);
	ext_pgs = m->m_ext.ext_pgs;
	job = m->m_ext.ext_arg1;
#ifdef VERBOSE_TRACES
	CTR3(KTR_CXGBE, "%s: completed %d bytes for tid %d", __func__,
	    m->m_len, jobtotid(job));
#endif

	for (int i = 0; i < ext_pgs->npgs; i++) {
		pg = PHYS_TO_VM_PAGE(ext_pgs->pa[i]);
		vm_page_unwire(pg, PQ_ACTIVE);
	}

	aiotx_free_job(job);
}

/*
 * Allocate a chain of unmapped mbufs describing the next 'len' bytes
 * of an AIO job.
 */
static struct mbuf *
alloc_aiotx_mbuf(struct kaiocb *job, int len)
{
	struct vmspace *vm;
	vm_page_t pgs[MBUF_PEXT_MAX_PGS];
	struct mbuf *m, *top, *last;
	struct mbuf_ext_pgs *ext_pgs;
	vm_map_t map;
	vm_offset_t start;
	int i, mlen, npages, pgoff;

	KASSERT(job->aio_sent + len <= job->uaiocb.aio_nbytes,
	    ("%s(%p, %d): request to send beyond end of buffer", __func__,
	    job, len));

	/*
	 * The AIO subsystem will cancel and drain all requests before
	 * permitting a process to exit or exec, so p_vmspace should
	 * be stable here.
	 */
	vm = job->userproc->p_vmspace;
	map = &vm->vm_map;
	start = (uintptr_t)job->uaiocb.aio_buf + job->aio_sent;
	pgoff = start & PAGE_MASK;

	top = NULL;
	last = NULL;
	while (len > 0) {
		mlen = imin(len, MBUF_PEXT_MAX_PGS * PAGE_SIZE - pgoff);
		KASSERT(mlen == len || ((start + mlen) & PAGE_MASK) == 0,
		    ("%s: next start (%#jx + %#x) is not page aligned",
		    __func__, (uintmax_t)start, mlen));

		npages = vm_fault_quick_hold_pages(map, start, mlen,
		    VM_PROT_WRITE, pgs, nitems(pgs));
		if (npages < 0)
			break;

		m = mb_alloc_ext_pgs(M_WAITOK, false, aiotx_free_pgs);
		if (m == NULL) {
			vm_page_unhold_pages(pgs, npages);
			break;
		}

		ext_pgs = m->m_ext.ext_pgs;
		ext_pgs->first_pg_off = pgoff;
		ext_pgs->npgs = npages;
		if (npages == 1) {
			KASSERT(mlen + pgoff <= PAGE_SIZE,
			    ("%s: single page is too large (off %d len %d)",
			    __func__, pgoff, mlen));
			ext_pgs->last_pg_len = mlen;
		} else {
			ext_pgs->last_pg_len = mlen - (PAGE_SIZE - pgoff) -
			    (npages - 2) * PAGE_SIZE;
		}
		for (i = 0; i < npages; i++)
			ext_pgs->pa[i] = VM_PAGE_TO_PHYS(pgs[i]);

		m->m_len = mlen;
		m->m_ext.ext_size = npages * PAGE_SIZE;
		m->m_ext.ext_arg1 = job;
		refcount_acquire(&job->aio_refs);

#ifdef VERBOSE_TRACES
		CTR5(KTR_CXGBE, "%s: tid %d, new mbuf %p for job %p, npages %d",
		    __func__, jobtotid(job), m, job, npages);
#endif

		if (top == NULL)
			top = m;
		else
			last->m_next = m;
		last = m;

		len -= mlen;
		start += mlen;
		pgoff = 0;
	}

	return (top);
}
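
/*
 * Push as much of an AIO write job as the socket will currently accept.
 * This open-codes the relevant parts of sosend_generic() and
 * tcp_usr_send(): the user buffer is wrapped in unmapped mbufs,
 * appended to so_snd, and handed to the TCP output path.  If the
 * socket cannot take the whole job yet, the job is put back on the
 * aiotx queue to be resumed later.  Called, and returns, with so_snd
 * locked.
 */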
static void
t4_aiotx_process_job(struct toepcb *toep, struct socket *so, struct kaiocb *job)
{
	struct sockbuf *sb;
	struct file *fp;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct mbuf *m;
	int error, len;
	bool moretocome, sendmore;

	sb = &so->so_snd;
	SOCKBUF_UNLOCK(sb);
	fp = job->fd_file;
	m = NULL;

#ifdef MAC
	error = mac_socket_check_send(fp->f_cred, so);
	if (error != 0)
		goto out;
#endif

	/* Inline sosend_generic(). */

	error = sblock(sb, SBL_WAIT);
	MPASS(error == 0);

sendanother:
	SOCKBUF_LOCK(sb);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		if ((so->so_options & SO_NOSIGPIPE) == 0) {
			PROC_LOCK(job->userproc);
			kern_psignal(job->userproc, SIGPIPE);
			PROC_UNLOCK(job->userproc);
		}
		error = EPIPE;
		goto out;
	}
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		goto out;
	}
	if ((so->so_state & SS_ISCONNECTED) == 0) {
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		error = ENOTCONN;
		goto out;
	}
	if (sbspace(sb) < sb->sb_lowat) {
		MPASS(job->aio_sent == 0 || !(so->so_state & SS_NBIO));

		/*
		 * Don't block if there is too little room in the socket
		 * buffer.  Instead, requeue the request.
		 */
		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
			SOCKBUF_UNLOCK(sb);
			sbunlock(sb);
			error = ECANCELED;
			goto out;
		}
		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
		SOCKBUF_UNLOCK(sb);
		sbunlock(sb);
		goto out;
	}

	/*
	 * Write as much data as the socket permits, but no more than
	 * a single sndbuf at a time.
	 */
	len = sbspace(sb);
	if (len > job->uaiocb.aio_nbytes - job->aio_sent) {
		len = job->uaiocb.aio_nbytes - job->aio_sent;
		moretocome = false;
	} else
		moretocome = true;
	if (len > toep->params.sndbuf) {
		len = toep->params.sndbuf;
		sendmore = true;
	} else
		sendmore = false;

	if (!TAILQ_EMPTY(&toep->aiotx_jobq))
		moretocome = true;
	SOCKBUF_UNLOCK(sb);
	MPASS(len != 0);

	m = alloc_aiotx_mbuf(job, len);
	if (m == NULL) {
		sbunlock(sb);
		error = EFAULT;
		goto out;
	}

	/* Inline tcp_usr_send(). */

	inp = toep->inp;
	INP_WLOCK(inp);
	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
		INP_WUNLOCK(inp);
		sbunlock(sb);
		error = ECONNRESET;
		goto out;
	}

	job->aio_sent += m_length(m, NULL);

	sbappendstream(sb, m, 0);
	m = NULL;

	if (!(inp->inp_flags & INP_DROPPED)) {
		tp = intotcpcb(inp);
		if (moretocome)
			tp->t_flags |= TF_MORETOCOME;
		error = tp->t_fb->tfb_tcp_output(tp);
		if (moretocome)
			tp->t_flags &= ~TF_MORETOCOME;
	}

	INP_WUNLOCK(inp);
	if (sendmore)
		goto sendanother;
	sbunlock(sb);

	if (error)
		goto out;

	/*
	 * If this is a blocking socket and the request has not been
	 * fully completed, requeue it until the socket is ready
	 * again.
	 */
	if (job->aio_sent < job->uaiocb.aio_nbytes &&
	    !(so->so_state & SS_NBIO)) {
		SOCKBUF_LOCK(sb);
		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
			SOCKBUF_UNLOCK(sb);
			error = ECANCELED;
			goto out;
		}
		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
		return;
	}

	/*
	 * If the request will not be requeued, drop the queue's
	 * reference to the job.  Any mbufs in flight should still
	 * hold a reference, but this drops the reference that the
	 * queue owns while it is waiting to queue mbufs to the
	 * socket.
	 */
	aiotx_free_job(job);

out:
	if (error) {
		job->aio_error = (void *)(intptr_t)error;
		aiotx_free_job(job);
	}
	if (m != NULL)
		m_free(m);
	SOCKBUF_LOCK(sb);
}

static void
t4_aiotx_task(void *context, int pending)
{
	struct toepcb *toep = context;
	struct socket *so;
	struct kaiocb *job;

	so = toep->aiotx_so;
	CURVNET_SET(toep->vnet);
	SOCKBUF_LOCK(&so->so_snd);
	while (!TAILQ_EMPTY(&toep->aiotx_jobq) && sowriteable(so)) {
		job = TAILQ_FIRST(&toep->aiotx_jobq);
		TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
		if (!aio_clear_cancel_function(job))
			continue;

		t4_aiotx_process_job(toep, so, job);
	}
	toep->aiotx_so = NULL;
	SOCKBUF_UNLOCK(&so->so_snd);
	CURVNET_RESTORE();

	free_toepcb(toep);
	SOCK_LOCK(so);
	sorele(so);
}
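
/*
 * Schedule the AIO tx task for this connection if it is not already
 * active.  The caller holds the so_snd lock; the task takes its own
 * references on the socket and the toepcb, both of which are released
 * from t4_aiotx_task() when the task finishes.
 */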
"true" : "false"); 2236 #endif 2237 if (toep->aiotx_so != NULL) 2238 return; 2239 soref(so); 2240 toep->aiotx_so = so; 2241 hold_toepcb(toep); 2242 soaio_enqueue(&toep->aiotx_task); 2243 } 2244 2245 static void 2246 t4_aiotx_cancel(struct kaiocb *job) 2247 { 2248 struct socket *so; 2249 struct sockbuf *sb; 2250 struct tcpcb *tp; 2251 struct toepcb *toep; 2252 2253 so = job->fd_file->f_data; 2254 tp = so_sototcpcb(so); 2255 toep = tp->t_toe; 2256 MPASS(job->uaiocb.aio_lio_opcode == LIO_WRITE); 2257 sb = &so->so_snd; 2258 2259 SOCKBUF_LOCK(sb); 2260 if (!aio_cancel_cleared(job)) 2261 TAILQ_REMOVE(&toep->aiotx_jobq, job, list); 2262 SOCKBUF_UNLOCK(sb); 2263 2264 job->aio_error = (void *)(intptr_t)ECANCELED; 2265 aiotx_free_job(job); 2266 } 2267 2268 int 2269 t4_aio_queue_aiotx(struct socket *so, struct kaiocb *job) 2270 { 2271 struct tcpcb *tp = so_sototcpcb(so); 2272 struct toepcb *toep = tp->t_toe; 2273 struct adapter *sc = td_adapter(toep->td); 2274 2275 /* This only handles writes. */ 2276 if (job->uaiocb.aio_lio_opcode != LIO_WRITE) 2277 return (EOPNOTSUPP); 2278 2279 if (!sc->tt.tx_zcopy) 2280 return (EOPNOTSUPP); 2281 2282 if (tls_tx_key(toep)) 2283 return (EOPNOTSUPP); 2284 2285 SOCKBUF_LOCK(&so->so_snd); 2286 #ifdef VERBOSE_TRACES 2287 CTR3(KTR_CXGBE, "%s: queueing %p for tid %u", __func__, job, toep->tid); 2288 #endif 2289 if (!aio_set_cancel_function(job, t4_aiotx_cancel)) 2290 panic("new job was cancelled"); 2291 refcount_init(&job->aio_refs, 1); 2292 TAILQ_INSERT_TAIL(&toep->aiotx_jobq, job, list); 2293 if (sowriteable(so)) 2294 t4_aiotx_queue_toep(so, toep); 2295 SOCKBUF_UNLOCK(&so->so_snd); 2296 return (0); 2297 } 2298 2299 void 2300 aiotx_init_toep(struct toepcb *toep) 2301 { 2302 2303 TAILQ_INIT(&toep->aiotx_jobq); 2304 TASK_INIT(&toep->aiotx_task, 0, t4_aiotx_task, toep); 2305 } 2306 #endif 2307