xref: /freebsd/sys/dev/cxgbe/tom/t4_cpl_io.c (revision c93b6e5fa24ba172ab271432c6692f9cc604e15a)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2012, 2015 Chelsio Communications, Inc.
5  * All rights reserved.
6  * Written by: Navdeep Parhar <np@FreeBSD.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include "opt_inet.h"
34 #include "opt_inet6.h"
35 #include "opt_ratelimit.h"
36 
37 #ifdef TCP_OFFLOAD
38 #include <sys/param.h>
39 #include <sys/aio.h>
40 #include <sys/file.h>
41 #include <sys/kernel.h>
42 #include <sys/ktr.h>
43 #include <sys/module.h>
44 #include <sys/proc.h>
45 #include <sys/protosw.h>
46 #include <sys/domain.h>
47 #include <sys/socket.h>
48 #include <sys/socketvar.h>
49 #include <sys/sglist.h>
50 #include <sys/taskqueue.h>
51 #include <netinet/in.h>
52 #include <netinet/in_pcb.h>
53 #include <netinet/ip.h>
54 #include <netinet/ip6.h>
55 #define TCPSTATES
56 #include <netinet/tcp_fsm.h>
57 #include <netinet/tcp_seq.h>
58 #include <netinet/tcp_var.h>
59 #include <netinet/toecore.h>
60 
61 #include <security/mac/mac_framework.h>
62 
63 #include <vm/vm.h>
64 #include <vm/vm_extern.h>
65 #include <vm/pmap.h>
66 #include <vm/vm_map.h>
67 #include <vm/vm_page.h>
68 
69 #include "common/common.h"
70 #include "common/t4_msg.h"
71 #include "common/t4_regs.h"
72 #include "common/t4_tcb.h"
73 #include "tom/t4_tom_l2t.h"
74 #include "tom/t4_tom.h"
75 
76 static void	t4_aiotx_cancel(struct kaiocb *job);
77 static void	t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep);
78 
79 void
80 send_flowc_wr(struct toepcb *toep, struct flowc_tx_params *ftxp)
81 {
82 	struct wrqe *wr;
83 	struct fw_flowc_wr *flowc;
84 	unsigned int nparams, flowclen, paramidx;
85 	struct vi_info *vi = toep->vi;
86 	struct port_info *pi = vi->pi;
87 	struct adapter *sc = pi->adapter;
88 	unsigned int pfvf = sc->pf << S_FW_VIID_PFN;
89 	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
90 
91 	KASSERT(!(toep->flags & TPF_FLOWC_WR_SENT),
92 	    ("%s: flowc for tid %u sent already", __func__, toep->tid));
93 
94 	if (ftxp != NULL)
95 		nparams = 8;
96 	else
97 		nparams = 6;
98 	if (toep->ulp_mode == ULP_MODE_TLS)
99 		nparams++;
100 	if (toep->tls.fcplenmax != 0)
101 		nparams++;
102 	if (toep->tc_idx != -1) {
103 		MPASS(toep->tc_idx >= 0 &&
104 		    toep->tc_idx < sc->chip_params->nsched_cls);
105 		nparams++;
106 	}
107 
108 	flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);
109 
110 	wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq);
111 	if (wr == NULL) {
112 		/* XXX */
113 		panic("%s: allocation failure.", __func__);
114 	}
115 	flowc = wrtod(wr);
116 	memset(flowc, 0, wr->wr_len);
117 
118 	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
119 	    V_FW_FLOWC_WR_NPARAMS(nparams));
120 	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
121 	    V_FW_WR_FLOWID(toep->tid));
122 
123 #define FLOWC_PARAM(__m, __v) \
124 	do { \
125 		flowc->mnemval[paramidx].mnemonic = FW_FLOWC_MNEM_##__m; \
126 		flowc->mnemval[paramidx].val = htobe32(__v); \
127 		paramidx++; \
128 	} while (0)
129 
130 	paramidx = 0;
131 
132 	FLOWC_PARAM(PFNVFN, pfvf);
133 	FLOWC_PARAM(CH, pi->tx_chan);
134 	FLOWC_PARAM(PORT, pi->tx_chan);
135 	FLOWC_PARAM(IQID, toep->ofld_rxq->iq.abs_id);
136 	if (ftxp) {
137 		uint32_t sndbuf = min(ftxp->snd_space, sc->tt.sndbuf);
138 
139 		FLOWC_PARAM(SNDNXT, ftxp->snd_nxt);
140 		FLOWC_PARAM(RCVNXT, ftxp->rcv_nxt);
141 		FLOWC_PARAM(SNDBUF, sndbuf);
142 		FLOWC_PARAM(MSS, ftxp->mss);
143 
144 		CTR6(KTR_CXGBE,
145 		    "%s: tid %u, mss %u, sndbuf %u, snd_nxt 0x%x, rcv_nxt 0x%x",
146 		    __func__, toep->tid, ftxp->mss, sndbuf, ftxp->snd_nxt,
147 		    ftxp->rcv_nxt);
148 	} else {
149 		FLOWC_PARAM(SNDBUF, 512);
150 		FLOWC_PARAM(MSS, 512);
151 
152 		CTR2(KTR_CXGBE, "%s: tid %u", __func__, toep->tid);
153 	}
154 	if (toep->ulp_mode == ULP_MODE_TLS)
155 		FLOWC_PARAM(ULP_MODE, toep->ulp_mode);
156 	if (toep->tls.fcplenmax != 0)
157 		FLOWC_PARAM(TXDATAPLEN_MAX, toep->tls.fcplenmax);
158 	if (toep->tc_idx != -1)
159 		FLOWC_PARAM(SCHEDCLASS, toep->tc_idx);
160 #undef FLOWC_PARAM
161 
162 	KASSERT(paramidx == nparams, ("nparams mismatch"));
163 
164 	txsd->tx_credits = howmany(flowclen, 16);
165 	txsd->plen = 0;
166 	KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0,
167 	    ("%s: not enough credits (%d)", __func__, toep->tx_credits));
168 	toep->tx_credits -= txsd->tx_credits;
169 	if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
170 		toep->txsd_pidx = 0;
171 	toep->txsd_avail--;
172 
173 	toep->flags |= TPF_FLOWC_WR_SENT;
174         t4_wrq_tx(sc, wr);
175 }
176 
177 #ifdef RATELIMIT
178 /*
179  * Input is Bytes/second (so_max_pacing_rate), chip counts in Kilobits/second.
180  */
181 static int
182 update_tx_rate_limit(struct adapter *sc, struct toepcb *toep, u_int Bps)
183 {
184 	int tc_idx, rc;
185 	const u_int kbps = (u_int) (uint64_t)Bps * 8ULL / 1000;
186 	const int port_id = toep->vi->pi->port_id;
187 
188 	CTR3(KTR_CXGBE, "%s: tid %u, rate %uKbps", __func__, toep->tid, kbps);
189 
190 	if (kbps == 0) {
191 		/* unbind */
192 		tc_idx = -1;
193 	} else {
194 		rc = t4_reserve_cl_rl_kbps(sc, port_id, kbps, &tc_idx);
195 		if (rc != 0)
196 			return (rc);
197 		MPASS(tc_idx >= 0 && tc_idx < sc->chip_params->nsched_cls);
198 	}
199 
200 	if (toep->tc_idx != tc_idx) {
201 		struct wrqe *wr;
202 		struct fw_flowc_wr *flowc;
203 		int nparams = 1, flowclen, flowclen16;
204 		struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
205 
206 		flowclen = sizeof(*flowc) + nparams * sizeof(struct
207 		    fw_flowc_mnemval);
208 		flowclen16 = howmany(flowclen, 16);
209 		if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0 ||
210 		    (wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq)) == NULL) {
211 			if (tc_idx >= 0)
212 				t4_release_cl_rl(sc, port_id, tc_idx);
213 			return (ENOMEM);
214 		}
215 
216 		flowc = wrtod(wr);
217 		memset(flowc, 0, wr->wr_len);
218 
219 		flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
220 		    V_FW_FLOWC_WR_NPARAMS(nparams));
221 		flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) |
222 		    V_FW_WR_FLOWID(toep->tid));
223 
224 		flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS;
225 		if (tc_idx == -1)
226 			flowc->mnemval[0].val = htobe32(0xff);
227 		else
228 			flowc->mnemval[0].val = htobe32(tc_idx);
229 
230 		txsd->tx_credits = flowclen16;
231 		txsd->plen = 0;
232 		toep->tx_credits -= txsd->tx_credits;
233 		if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
234 			toep->txsd_pidx = 0;
235 		toep->txsd_avail--;
236 		t4_wrq_tx(sc, wr);
237 	}
238 
239 	if (toep->tc_idx >= 0)
240 		t4_release_cl_rl(sc, port_id, toep->tc_idx);
241 	toep->tc_idx = tc_idx;
242 
243 	return (0);
244 }
245 #endif
246 
247 void
248 send_reset(struct adapter *sc, struct toepcb *toep, uint32_t snd_nxt)
249 {
250 	struct wrqe *wr;
251 	struct cpl_abort_req *req;
252 	int tid = toep->tid;
253 	struct inpcb *inp = toep->inp;
254 	struct tcpcb *tp = intotcpcb(inp);	/* don't use if INP_DROPPED */
255 
256 	INP_WLOCK_ASSERT(inp);
257 
258 	CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x%s",
259 	    __func__, toep->tid,
260 	    inp->inp_flags & INP_DROPPED ? "inp dropped" :
261 	    tcpstates[tp->t_state],
262 	    toep->flags, inp->inp_flags,
263 	    toep->flags & TPF_ABORT_SHUTDOWN ?
264 	    " (abort already in progress)" : "");
265 
266 	if (toep->flags & TPF_ABORT_SHUTDOWN)
267 		return;	/* abort already in progress */
268 
269 	toep->flags |= TPF_ABORT_SHUTDOWN;
270 
271 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
272 	    ("%s: flowc_wr not sent for tid %d.", __func__, tid));
273 
274 	wr = alloc_wrqe(sizeof(*req), toep->ofld_txq);
275 	if (wr == NULL) {
276 		/* XXX */
277 		panic("%s: allocation failure.", __func__);
278 	}
279 	req = wrtod(wr);
280 
281 	INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid);
282 	if (inp->inp_flags & INP_DROPPED)
283 		req->rsvd0 = htobe32(snd_nxt);
284 	else
285 		req->rsvd0 = htobe32(tp->snd_nxt);
286 	req->rsvd1 = !(toep->flags & TPF_TX_DATA_SENT);
287 	req->cmd = CPL_ABORT_SEND_RST;
288 
289 	/*
290 	 * XXX: What's the correct way to tell that the inp hasn't been detached
291 	 * from its socket?  Should I even be flushing the snd buffer here?
292 	 */
293 	if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) {
294 		struct socket *so = inp->inp_socket;
295 
296 		if (so != NULL)	/* because I'm not sure.  See comment above */
297 			sbflush(&so->so_snd);
298 	}
299 
300 	t4_l2t_send(sc, wr, toep->l2te);
301 }
302 
303 /*
304  * Called when a connection is established to translate the TCP options
305  * reported by HW to FreeBSD's native format.
306  */
307 static void
308 assign_rxopt(struct tcpcb *tp, uint16_t opt)
309 {
310 	struct toepcb *toep = tp->t_toe;
311 	struct inpcb *inp = tp->t_inpcb;
312 	struct adapter *sc = td_adapter(toep->td);
313 
314 	INP_LOCK_ASSERT(inp);
315 
316 	toep->tcp_opt = opt;
317 	toep->mtu_idx = G_TCPOPT_MSS(opt);
318 	tp->t_maxseg = sc->params.mtus[toep->mtu_idx];
319 	if (inp->inp_inc.inc_flags & INC_ISIPV6)
320 		tp->t_maxseg -= sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
321 	else
322 		tp->t_maxseg -= sizeof(struct ip) + sizeof(struct tcphdr);
323 
324 	toep->emss = tp->t_maxseg;
325 	if (G_TCPOPT_TSTAMP(opt)) {
326 		tp->t_flags |= TF_RCVD_TSTMP;	/* timestamps ok */
327 		tp->ts_recent = 0;		/* hmmm */
328 		tp->ts_recent_age = tcp_ts_getticks();
329 		toep->emss -= TCPOLEN_TSTAMP_APPA;
330 	}
331 
332 	CTR6(KTR_CXGBE, "%s: tid %d, mtu_idx %u (%u), t_maxseg %u, emss %u",
333 	    __func__, toep->tid, toep->mtu_idx,
334 	    sc->params.mtus[G_TCPOPT_MSS(opt)], tp->t_maxseg, toep->emss);
335 
336 	if (G_TCPOPT_SACK(opt))
337 		tp->t_flags |= TF_SACK_PERMIT;	/* should already be set */
338 	else
339 		tp->t_flags &= ~TF_SACK_PERMIT;	/* sack disallowed by peer */
340 
341 	if (G_TCPOPT_WSCALE_OK(opt))
342 		tp->t_flags |= TF_RCVD_SCALE;
343 
344 	/* Doing window scaling? */
345 	if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
346 	    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
347 		tp->rcv_scale = tp->request_r_scale;
348 		tp->snd_scale = G_TCPOPT_SND_WSCALE(opt);
349 	}
350 }
351 
352 /*
353  * Completes some final bits of initialization for just established connections
354  * and changes their state to TCPS_ESTABLISHED.
355  *
356  * The ISNs are from the exchange of SYNs.
357  */
358 void
359 make_established(struct toepcb *toep, uint32_t iss, uint32_t irs, uint16_t opt)
360 {
361 	struct inpcb *inp = toep->inp;
362 	struct socket *so = inp->inp_socket;
363 	struct tcpcb *tp = intotcpcb(inp);
364 	long bufsize;
365 	uint16_t tcpopt = be16toh(opt);
366 	struct flowc_tx_params ftxp;
367 
368 	INP_WLOCK_ASSERT(inp);
369 	KASSERT(tp->t_state == TCPS_SYN_SENT ||
370 	    tp->t_state == TCPS_SYN_RECEIVED,
371 	    ("%s: TCP state %s", __func__, tcpstates[tp->t_state]));
372 
373 	CTR6(KTR_CXGBE, "%s: tid %d, so %p, inp %p, tp %p, toep %p",
374 	    __func__, toep->tid, so, inp, tp, toep);
375 
376 	tcp_state_change(tp, TCPS_ESTABLISHED);
377 	tp->t_starttime = ticks;
378 	TCPSTAT_INC(tcps_connects);
379 
380 	tp->irs = irs;
381 	tcp_rcvseqinit(tp);
382 	tp->rcv_wnd = (u_int)toep->opt0_rcv_bufsize << 10;
383 	tp->rcv_adv += tp->rcv_wnd;
384 	tp->last_ack_sent = tp->rcv_nxt;
385 
386 	tp->iss = iss;
387 	tcp_sendseqinit(tp);
388 	tp->snd_una = iss + 1;
389 	tp->snd_nxt = iss + 1;
390 	tp->snd_max = iss + 1;
391 
392 	assign_rxopt(tp, tcpopt);
393 
394 	SOCKBUF_LOCK(&so->so_snd);
395 	if (so->so_snd.sb_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf)
396 		bufsize = V_tcp_autosndbuf_max;
397 	else
398 		bufsize = sbspace(&so->so_snd);
399 	SOCKBUF_UNLOCK(&so->so_snd);
400 
401 	ftxp.snd_nxt = tp->snd_nxt;
402 	ftxp.rcv_nxt = tp->rcv_nxt;
403 	ftxp.snd_space = bufsize;
404 	ftxp.mss = toep->emss;
405 	send_flowc_wr(toep, &ftxp);
406 
407 	soisconnected(so);
408 }
409 
410 int
411 send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits)
412 {
413 	struct wrqe *wr;
414 	struct cpl_rx_data_ack *req;
415 	uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
416 
417 	KASSERT(credits >= 0, ("%s: %d credits", __func__, credits));
418 
419 	wr = alloc_wrqe(sizeof(*req), toep->ctrlq);
420 	if (wr == NULL)
421 		return (0);
422 	req = wrtod(wr);
423 
424 	INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid);
425 	req->credit_dack = htobe32(dack | V_RX_CREDITS(credits));
426 
427 	t4_wrq_tx(sc, wr);
428 	return (credits);
429 }
430 
431 void
432 send_rx_modulate(struct adapter *sc, struct toepcb *toep)
433 {
434 	struct wrqe *wr;
435 	struct cpl_rx_data_ack *req;
436 
437 	wr = alloc_wrqe(sizeof(*req), toep->ctrlq);
438 	if (wr == NULL)
439 		return;
440 	req = wrtod(wr);
441 
442 	INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid);
443 	req->credit_dack = htobe32(F_RX_MODULATE_RX);
444 
445 	t4_wrq_tx(sc, wr);
446 }
447 
448 void
449 t4_rcvd_locked(struct toedev *tod, struct tcpcb *tp)
450 {
451 	struct adapter *sc = tod->tod_softc;
452 	struct inpcb *inp = tp->t_inpcb;
453 	struct socket *so = inp->inp_socket;
454 	struct sockbuf *sb = &so->so_rcv;
455 	struct toepcb *toep = tp->t_toe;
456 	int rx_credits;
457 
458 	INP_WLOCK_ASSERT(inp);
459 	SOCKBUF_LOCK_ASSERT(sb);
460 
461 	rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0;
462 	if (toep->ulp_mode == ULP_MODE_TLS) {
463 		if (toep->tls.rcv_over >= rx_credits) {
464 			toep->tls.rcv_over -= rx_credits;
465 			rx_credits = 0;
466 		} else {
467 			rx_credits -= toep->tls.rcv_over;
468 			toep->tls.rcv_over = 0;
469 		}
470 	}
471 
472 	if (rx_credits > 0 &&
473 	    (tp->rcv_wnd <= 32 * 1024 || rx_credits >= 64 * 1024 ||
474 	    (rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) ||
475 	    sbused(sb) + tp->rcv_wnd < sb->sb_lowat)) {
476 		rx_credits = send_rx_credits(sc, toep, rx_credits);
477 		tp->rcv_wnd += rx_credits;
478 		tp->rcv_adv += rx_credits;
479 	} else if (toep->flags & TPF_FORCE_CREDITS)
480 		send_rx_modulate(sc, toep);
481 }
482 
483 void
484 t4_rcvd(struct toedev *tod, struct tcpcb *tp)
485 {
486 	struct inpcb *inp = tp->t_inpcb;
487 	struct socket *so = inp->inp_socket;
488 	struct sockbuf *sb = &so->so_rcv;
489 
490 	SOCKBUF_LOCK(sb);
491 	t4_rcvd_locked(tod, tp);
492 	SOCKBUF_UNLOCK(sb);
493 }
494 
495 /*
496  * Close a connection by sending a CPL_CLOSE_CON_REQ message.
497  */
498 int
499 t4_close_conn(struct adapter *sc, struct toepcb *toep)
500 {
501 	struct wrqe *wr;
502 	struct cpl_close_con_req *req;
503 	unsigned int tid = toep->tid;
504 
505 	CTR3(KTR_CXGBE, "%s: tid %u%s", __func__, toep->tid,
506 	    toep->flags & TPF_FIN_SENT ? ", IGNORED" : "");
507 
508 	if (toep->flags & TPF_FIN_SENT)
509 		return (0);
510 
511 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
512 	    ("%s: flowc_wr not sent for tid %u.", __func__, tid));
513 
514 	wr = alloc_wrqe(sizeof(*req), toep->ofld_txq);
515 	if (wr == NULL) {
516 		/* XXX */
517 		panic("%s: allocation failure.", __func__);
518 	}
519 	req = wrtod(wr);
520 
521         req->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) |
522 	    V_FW_WR_IMMDLEN(sizeof(*req) - sizeof(req->wr)));
523 	req->wr.wr_mid = htonl(V_FW_WR_LEN16(howmany(sizeof(*req), 16)) |
524 	    V_FW_WR_FLOWID(tid));
525         req->wr.wr_lo = cpu_to_be64(0);
526         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
527 	req->rsvd = 0;
528 
529 	toep->flags |= TPF_FIN_SENT;
530 	toep->flags &= ~TPF_SEND_FIN;
531 	t4_l2t_send(sc, wr, toep->l2te);
532 
533 	return (0);
534 }
535 
536 #define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16)
537 #define MIN_OFLD_TX_CREDITS (howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16))
538 
539 /* Maximum amount of immediate data we could stuff in a WR */
540 static inline int
541 max_imm_payload(int tx_credits)
542 {
543 	const int n = 2;	/* Use only up to 2 desc for imm. data WR */
544 
545 	KASSERT(tx_credits >= 0 &&
546 		tx_credits <= MAX_OFLD_TX_CREDITS,
547 		("%s: %d credits", __func__, tx_credits));
548 
549 	if (tx_credits < MIN_OFLD_TX_CREDITS)
550 		return (0);
551 
552 	if (tx_credits >= (n * EQ_ESIZE) / 16)
553 		return ((n * EQ_ESIZE) - sizeof(struct fw_ofld_tx_data_wr));
554 	else
555 		return (tx_credits * 16 - sizeof(struct fw_ofld_tx_data_wr));
556 }
557 
558 /* Maximum number of SGL entries we could stuff in a WR */
559 static inline int
560 max_dsgl_nsegs(int tx_credits)
561 {
562 	int nseg = 1;	/* ulptx_sgl has room for 1, rest ulp_tx_sge_pair */
563 	int sge_pair_credits = tx_credits - MIN_OFLD_TX_CREDITS;
564 
565 	KASSERT(tx_credits >= 0 &&
566 		tx_credits <= MAX_OFLD_TX_CREDITS,
567 		("%s: %d credits", __func__, tx_credits));
568 
569 	if (tx_credits < MIN_OFLD_TX_CREDITS)
570 		return (0);
571 
572 	nseg += 2 * (sge_pair_credits * 16 / 24);
573 	if ((sge_pair_credits * 16) % 24 == 16)
574 		nseg++;
575 
576 	return (nseg);
577 }
578 
579 static inline void
580 write_tx_wr(void *dst, struct toepcb *toep, unsigned int immdlen,
581     unsigned int plen, uint8_t credits, int shove, int ulp_submode, int txalign)
582 {
583 	struct fw_ofld_tx_data_wr *txwr = dst;
584 
585 	txwr->op_to_immdlen = htobe32(V_WR_OP(FW_OFLD_TX_DATA_WR) |
586 	    V_FW_WR_IMMDLEN(immdlen));
587 	txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) |
588 	    V_FW_WR_LEN16(credits));
589 	txwr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(toep->ulp_mode) |
590 	    V_TX_ULP_SUBMODE(ulp_submode) | V_TX_URG(0) | V_TX_SHOVE(shove));
591 	txwr->plen = htobe32(plen);
592 
593 	if (txalign > 0) {
594 		struct tcpcb *tp = intotcpcb(toep->inp);
595 
596 		if (plen < 2 * toep->emss)
597 			txwr->lsodisable_to_flags |=
598 			    htobe32(F_FW_OFLD_TX_DATA_WR_LSODISABLE);
599 		else
600 			txwr->lsodisable_to_flags |=
601 			    htobe32(F_FW_OFLD_TX_DATA_WR_ALIGNPLD |
602 				(tp->t_flags & TF_NODELAY ? 0 :
603 				F_FW_OFLD_TX_DATA_WR_ALIGNPLDSHOVE));
604 	}
605 }
606 
607 /*
608  * Generate a DSGL from a starting mbuf.  The total number of segments and the
609  * maximum segments in any one mbuf are provided.
610  */
611 static void
612 write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n)
613 {
614 	struct mbuf *m;
615 	struct ulptx_sgl *usgl = dst;
616 	int i, j, rc;
617 	struct sglist sg;
618 	struct sglist_seg segs[n];
619 
620 	KASSERT(nsegs > 0, ("%s: nsegs 0", __func__));
621 
622 	sglist_init(&sg, n, segs);
623 	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
624 	    V_ULPTX_NSGE(nsegs));
625 
626 	i = -1;
627 	for (m = start; m != stop; m = m->m_next) {
628 		if (m->m_flags & M_NOMAP)
629 			rc = sglist_append_mb_ext_pgs(&sg, m);
630 		else
631 			rc = sglist_append(&sg, mtod(m, void *), m->m_len);
632 		if (__predict_false(rc != 0))
633 			panic("%s: sglist_append %d", __func__, rc);
634 
635 		for (j = 0; j < sg.sg_nseg; i++, j++) {
636 			if (i < 0) {
637 				usgl->len0 = htobe32(segs[j].ss_len);
638 				usgl->addr0 = htobe64(segs[j].ss_paddr);
639 			} else {
640 				usgl->sge[i / 2].len[i & 1] =
641 				    htobe32(segs[j].ss_len);
642 				usgl->sge[i / 2].addr[i & 1] =
643 				    htobe64(segs[j].ss_paddr);
644 			}
645 #ifdef INVARIANTS
646 			nsegs--;
647 #endif
648 		}
649 		sglist_reset(&sg);
650 	}
651 	if (i & 1)
652 		usgl->sge[i / 2].len[1] = htobe32(0);
653 	KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, stop %p",
654 	    __func__, nsegs, start, stop));
655 }
656 
/*
 * Upper bound on the SGL entries in one offload tx work request.  A full 512B
 * work request is laid out as
 *	fw_ofld_tx_data_wr (16B) + ulptx_sgl (16B, 1 entry) +
 *	ulptx_sge_pair (480B, 40 entries)
 * which gives 1 + 40 = 41 entries.
 */
#define	OFLD_SGL_LEN (41)
663 
664 /*
665  * Send data and/or a FIN to the peer.
666  *
667  * The socket's so_snd buffer consists of a stream of data starting with sb_mb
668  * and linked together with m_next.  sb_sndptr, if set, is the last mbuf that
669  * was transmitted.
670  *
671  * drop indicates the number of bytes that should be dropped from the head of
672  * the send buffer.  It is an optimization that lets do_fw4_ack avoid creating
673  * contention on the send buffer lock (before this change it used to do
674  * sowwakeup and then t4_push_frames right after that when recovering from tx
675  * stalls).  When drop is set this function MUST drop the bytes and wake up any
676  * writers.
677  */
678 void
679 t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop)
680 {
681 	struct mbuf *sndptr, *m, *sb_sndptr;
682 	struct fw_ofld_tx_data_wr *txwr;
683 	struct wrqe *wr;
684 	u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
685 	struct inpcb *inp = toep->inp;
686 	struct tcpcb *tp = intotcpcb(inp);
687 	struct socket *so = inp->inp_socket;
688 	struct sockbuf *sb = &so->so_snd;
689 	int tx_credits, shove, compl, sowwakeup;
690 	struct ofld_tx_sdesc *txsd;
691 	bool nomap_mbuf_seen;
692 
693 	INP_WLOCK_ASSERT(inp);
694 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
695 	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
696 
697 	KASSERT(toep->ulp_mode == ULP_MODE_NONE ||
698 	    toep->ulp_mode == ULP_MODE_TCPDDP ||
699 	    toep->ulp_mode == ULP_MODE_TLS ||
700 	    toep->ulp_mode == ULP_MODE_RDMA,
701 	    ("%s: ulp_mode %u for toep %p", __func__, toep->ulp_mode, toep));
702 
703 #ifdef VERBOSE_TRACES
704 	CTR5(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d",
705 	    __func__, toep->tid, toep->flags, tp->t_flags, drop);
706 #endif
707 	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
708 		return;
709 
710 #ifdef RATELIMIT
711 	if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) &&
712 	    (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) {
713 		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
714 	}
715 #endif
716 
717 	/*
718 	 * This function doesn't resume by itself.  Someone else must clear the
719 	 * flag and call this function.
720 	 */
721 	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
722 		KASSERT(drop == 0,
723 		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
724 		return;
725 	}
726 
727 	txsd = &toep->txsd[toep->txsd_pidx];
728 	do {
729 		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
730 		max_imm = max_imm_payload(tx_credits);
731 		max_nsegs = max_dsgl_nsegs(tx_credits);
732 
733 		SOCKBUF_LOCK(sb);
734 		sowwakeup = drop;
735 		if (drop) {
736 			sbdrop_locked(sb, drop);
737 			drop = 0;
738 		}
739 		sb_sndptr = sb->sb_sndptr;
740 		sndptr = sb_sndptr ? sb_sndptr->m_next : sb->sb_mb;
741 		plen = 0;
742 		nsegs = 0;
743 		max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
744 		nomap_mbuf_seen = false;
745 		for (m = sndptr; m != NULL; m = m->m_next) {
746 			int n;
747 
748 			if (m->m_flags & M_NOMAP)
749 				n = sglist_count_mb_ext_pgs(m);
750 			else
751 				n = sglist_count(mtod(m, void *), m->m_len);
752 
753 			nsegs += n;
754 			plen += m->m_len;
755 
756 			/* This mbuf sent us _over_ the nsegs limit, back out */
757 			if (plen > max_imm && nsegs > max_nsegs) {
758 				nsegs -= n;
759 				plen -= m->m_len;
760 				if (plen == 0) {
761 					/* Too few credits */
762 					toep->flags |= TPF_TX_SUSPENDED;
763 					if (sowwakeup) {
764 						if (!TAILQ_EMPTY(
765 						    &toep->aiotx_jobq))
766 							t4_aiotx_queue_toep(so,
767 							    toep);
768 						sowwakeup_locked(so);
769 					} else
770 						SOCKBUF_UNLOCK(sb);
771 					SOCKBUF_UNLOCK_ASSERT(sb);
772 					return;
773 				}
774 				break;
775 			}
776 
777 			if (m->m_flags & M_NOMAP)
778 				nomap_mbuf_seen = true;
779 			if (max_nsegs_1mbuf < n)
780 				max_nsegs_1mbuf = n;
781 			sb_sndptr = m;	/* new sb->sb_sndptr if all goes well */
782 
783 			/* This mbuf put us right at the max_nsegs limit */
784 			if (plen > max_imm && nsegs == max_nsegs) {
785 				m = m->m_next;
786 				break;
787 			}
788 		}
789 
790 		if (sbused(sb) > sb->sb_hiwat * 5 / 8 &&
791 		    toep->plen_nocompl + plen >= sb->sb_hiwat / 4)
792 			compl = 1;
793 		else
794 			compl = 0;
795 
796 		if (sb->sb_flags & SB_AUTOSIZE &&
797 		    V_tcp_do_autosndbuf &&
798 		    sb->sb_hiwat < V_tcp_autosndbuf_max &&
799 		    sbused(sb) >= sb->sb_hiwat * 7 / 8) {
800 			int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc,
801 			    V_tcp_autosndbuf_max);
802 
803 			if (!sbreserve_locked(sb, newsize, so, NULL))
804 				sb->sb_flags &= ~SB_AUTOSIZE;
805 			else
806 				sowwakeup = 1;	/* room available */
807 		}
808 		if (sowwakeup) {
809 			if (!TAILQ_EMPTY(&toep->aiotx_jobq))
810 				t4_aiotx_queue_toep(so, toep);
811 			sowwakeup_locked(so);
812 		} else
813 			SOCKBUF_UNLOCK(sb);
814 		SOCKBUF_UNLOCK_ASSERT(sb);
815 
816 		/* nothing to send */
817 		if (plen == 0) {
818 			KASSERT(m == NULL,
819 			    ("%s: nothing to send, but m != NULL", __func__));
820 			break;
821 		}
822 
823 		if (__predict_false(toep->flags & TPF_FIN_SENT))
824 			panic("%s: excess tx.", __func__);
825 
826 		shove = m == NULL && !(tp->t_flags & TF_MORETOCOME);
827 		if (plen <= max_imm && !nomap_mbuf_seen) {
828 
829 			/* Immediate data tx */
830 
831 			wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16),
832 					toep->ofld_txq);
833 			if (wr == NULL) {
834 				/* XXX: how will we recover from this? */
835 				toep->flags |= TPF_TX_SUSPENDED;
836 				return;
837 			}
838 			txwr = wrtod(wr);
839 			credits = howmany(wr->wr_len, 16);
840 			write_tx_wr(txwr, toep, plen, plen, credits, shove, 0,
841 			    sc->tt.tx_align);
842 			m_copydata(sndptr, 0, plen, (void *)(txwr + 1));
843 			nsegs = 0;
844 		} else {
845 			int wr_len;
846 
847 			/* DSGL tx */
848 
849 			wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) +
850 			    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
851 			wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq);
852 			if (wr == NULL) {
853 				/* XXX: how will we recover from this? */
854 				toep->flags |= TPF_TX_SUSPENDED;
855 				return;
856 			}
857 			txwr = wrtod(wr);
858 			credits = howmany(wr_len, 16);
859 			write_tx_wr(txwr, toep, 0, plen, credits, shove, 0,
860 			    sc->tt.tx_align);
861 			write_tx_sgl(txwr + 1, sndptr, m, nsegs,
862 			    max_nsegs_1mbuf);
863 			if (wr_len & 0xf) {
864 				uint64_t *pad = (uint64_t *)
865 				    ((uintptr_t)txwr + wr_len);
866 				*pad = 0;
867 			}
868 		}
869 
870 		KASSERT(toep->tx_credits >= credits,
871 			("%s: not enough credits", __func__));
872 
873 		toep->tx_credits -= credits;
874 		toep->tx_nocompl += credits;
875 		toep->plen_nocompl += plen;
876 		if (toep->tx_credits <= toep->tx_total * 3 / 8 &&
877 		    toep->tx_nocompl >= toep->tx_total / 4)
878 			compl = 1;
879 
880 		if (compl || toep->ulp_mode == ULP_MODE_RDMA) {
881 			txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL);
882 			toep->tx_nocompl = 0;
883 			toep->plen_nocompl = 0;
884 		}
885 
886 		tp->snd_nxt += plen;
887 		tp->snd_max += plen;
888 
889 		SOCKBUF_LOCK(sb);
890 		KASSERT(sb_sndptr, ("%s: sb_sndptr is NULL", __func__));
891 		sb->sb_sndptr = sb_sndptr;
892 		SOCKBUF_UNLOCK(sb);
893 
894 		toep->flags |= TPF_TX_DATA_SENT;
895 		if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
896 			toep->flags |= TPF_TX_SUSPENDED;
897 
898 		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
899 		txsd->plen = plen;
900 		txsd->tx_credits = credits;
901 		txsd++;
902 		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
903 			toep->txsd_pidx = 0;
904 			txsd = &toep->txsd[0];
905 		}
906 		toep->txsd_avail--;
907 
908 		t4_l2t_send(sc, wr, toep->l2te);
909 	} while (m != NULL);
910 
911 	/* Send a FIN if requested, but only if there's no more data to send */
912 	if (m == NULL && toep->flags & TPF_SEND_FIN)
913 		t4_close_conn(sc, toep);
914 }
915 
916 static inline void
917 rqdrop_locked(struct mbufq *q, int plen)
918 {
919 	struct mbuf *m;
920 
921 	while (plen > 0) {
922 		m = mbufq_dequeue(q);
923 
924 		/* Too many credits. */
925 		MPASS(m != NULL);
926 		M_ASSERTPKTHDR(m);
927 
928 		/* Partial credits. */
929 		MPASS(plen >= m->m_pkthdr.len);
930 
931 		plen -= m->m_pkthdr.len;
932 		m_freem(m);
933 	}
934 }
935 
936 void
937 t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop)
938 {
939 	struct mbuf *sndptr, *m;
940 	struct fw_ofld_tx_data_wr *txwr;
941 	struct wrqe *wr;
942 	u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
943 	u_int adjusted_plen, ulp_submode;
944 	struct inpcb *inp = toep->inp;
945 	struct tcpcb *tp = intotcpcb(inp);
946 	int tx_credits, shove;
947 	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
948 	struct mbufq *pduq = &toep->ulp_pduq;
949 	static const u_int ulp_extra_len[] = {0, 4, 4, 8};
950 
951 	INP_WLOCK_ASSERT(inp);
952 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
953 	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
954 	KASSERT(toep->ulp_mode == ULP_MODE_ISCSI,
955 	    ("%s: ulp_mode %u for toep %p", __func__, toep->ulp_mode, toep));
956 
957 	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
958 		return;
959 
960 	/*
961 	 * This function doesn't resume by itself.  Someone else must clear the
962 	 * flag and call this function.
963 	 */
964 	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
965 		KASSERT(drop == 0,
966 		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
967 		return;
968 	}
969 
970 	if (drop)
971 		rqdrop_locked(&toep->ulp_pdu_reclaimq, drop);
972 
973 	while ((sndptr = mbufq_first(pduq)) != NULL) {
974 		M_ASSERTPKTHDR(sndptr);
975 
976 		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
977 		max_imm = max_imm_payload(tx_credits);
978 		max_nsegs = max_dsgl_nsegs(tx_credits);
979 
980 		plen = 0;
981 		nsegs = 0;
982 		max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
983 		for (m = sndptr; m != NULL; m = m->m_next) {
984 			int n = sglist_count(mtod(m, void *), m->m_len);
985 
986 			nsegs += n;
987 			plen += m->m_len;
988 
989 			/*
990 			 * This mbuf would send us _over_ the nsegs limit.
991 			 * Suspend tx because the PDU can't be sent out.
992 			 */
993 			if (plen > max_imm && nsegs > max_nsegs) {
994 				toep->flags |= TPF_TX_SUSPENDED;
995 				return;
996 			}
997 
998 			if (max_nsegs_1mbuf < n)
999 				max_nsegs_1mbuf = n;
1000 		}
1001 
1002 		if (__predict_false(toep->flags & TPF_FIN_SENT))
1003 			panic("%s: excess tx.", __func__);
1004 
1005 		/*
1006 		 * We have a PDU to send.  All of it goes out in one WR so 'm'
1007 		 * is NULL.  A PDU's length is always a multiple of 4.
1008 		 */
1009 		MPASS(m == NULL);
1010 		MPASS((plen & 3) == 0);
1011 		MPASS(sndptr->m_pkthdr.len == plen);
1012 
1013 		shove = !(tp->t_flags & TF_MORETOCOME);
1014 		ulp_submode = mbuf_ulp_submode(sndptr);
1015 		MPASS(ulp_submode < nitems(ulp_extra_len));
1016 
1017 		/*
1018 		 * plen doesn't include header and data digests, which are
1019 		 * generated and inserted in the right places by the TOE, but
1020 		 * they do occupy TCP sequence space and need to be accounted
1021 		 * for.
1022 		 */
1023 		adjusted_plen = plen + ulp_extra_len[ulp_submode];
1024 		if (plen <= max_imm) {
1025 
1026 			/* Immediate data tx */
1027 
1028 			wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16),
1029 					toep->ofld_txq);
1030 			if (wr == NULL) {
1031 				/* XXX: how will we recover from this? */
1032 				toep->flags |= TPF_TX_SUSPENDED;
1033 				return;
1034 			}
1035 			txwr = wrtod(wr);
1036 			credits = howmany(wr->wr_len, 16);
1037 			write_tx_wr(txwr, toep, plen, adjusted_plen, credits,
1038 			    shove, ulp_submode, sc->tt.tx_align);
1039 			m_copydata(sndptr, 0, plen, (void *)(txwr + 1));
1040 			nsegs = 0;
1041 		} else {
1042 			int wr_len;
1043 
1044 			/* DSGL tx */
1045 			wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) +
1046 			    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
1047 			wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq);
1048 			if (wr == NULL) {
1049 				/* XXX: how will we recover from this? */
1050 				toep->flags |= TPF_TX_SUSPENDED;
1051 				return;
1052 			}
1053 			txwr = wrtod(wr);
1054 			credits = howmany(wr_len, 16);
1055 			write_tx_wr(txwr, toep, 0, adjusted_plen, credits,
1056 			    shove, ulp_submode, sc->tt.tx_align);
1057 			write_tx_sgl(txwr + 1, sndptr, m, nsegs,
1058 			    max_nsegs_1mbuf);
1059 			if (wr_len & 0xf) {
1060 				uint64_t *pad = (uint64_t *)
1061 				    ((uintptr_t)txwr + wr_len);
1062 				*pad = 0;
1063 			}
1064 		}
1065 
1066 		KASSERT(toep->tx_credits >= credits,
1067 			("%s: not enough credits", __func__));
1068 
1069 		m = mbufq_dequeue(pduq);
1070 		MPASS(m == sndptr);
1071 		mbufq_enqueue(&toep->ulp_pdu_reclaimq, m);
1072 
1073 		toep->tx_credits -= credits;
1074 		toep->tx_nocompl += credits;
1075 		toep->plen_nocompl += plen;
1076 		if (toep->tx_credits <= toep->tx_total * 3 / 8 &&
1077 		    toep->tx_nocompl >= toep->tx_total / 4) {
1078 			txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL);
1079 			toep->tx_nocompl = 0;
1080 			toep->plen_nocompl = 0;
1081 		}
1082 
1083 		tp->snd_nxt += adjusted_plen;
1084 		tp->snd_max += adjusted_plen;
1085 
1086 		toep->flags |= TPF_TX_DATA_SENT;
1087 		if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
1088 			toep->flags |= TPF_TX_SUSPENDED;
1089 
1090 		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
1091 		txsd->plen = plen;
1092 		txsd->tx_credits = credits;
1093 		txsd++;
1094 		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
1095 			toep->txsd_pidx = 0;
1096 			txsd = &toep->txsd[0];
1097 		}
1098 		toep->txsd_avail--;
1099 
1100 		t4_l2t_send(sc, wr, toep->l2te);
1101 	}
1102 
1103 	/* Send a FIN if requested, but only if there are no more PDUs to send */
1104 	if (mbufq_first(pduq) == NULL && toep->flags & TPF_SEND_FIN)
1105 		t4_close_conn(sc, toep);
1106 }
1107 
1108 int
1109 t4_tod_output(struct toedev *tod, struct tcpcb *tp)
1110 {
1111 	struct adapter *sc = tod->tod_softc;
1112 #ifdef INVARIANTS
1113 	struct inpcb *inp = tp->t_inpcb;
1114 #endif
1115 	struct toepcb *toep = tp->t_toe;
1116 
1117 	INP_WLOCK_ASSERT(inp);
1118 	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
1119 	    ("%s: inp %p dropped.", __func__, inp));
1120 	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
1121 
1122 	if (toep->ulp_mode == ULP_MODE_ISCSI)
1123 		t4_push_pdus(sc, toep, 0);
1124 	else if (tls_tx_key(toep))
1125 		t4_push_tls_records(sc, toep, 0);
1126 	else
1127 		t4_push_frames(sc, toep, 0);
1128 
1129 	return (0);
1130 }
1131 
1132 int
1133 t4_send_fin(struct toedev *tod, struct tcpcb *tp)
1134 {
1135 	struct adapter *sc = tod->tod_softc;
1136 #ifdef INVARIANTS
1137 	struct inpcb *inp = tp->t_inpcb;
1138 #endif
1139 	struct toepcb *toep = tp->t_toe;
1140 
1141 	INP_WLOCK_ASSERT(inp);
1142 	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
1143 	    ("%s: inp %p dropped.", __func__, inp));
1144 	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
1145 
1146 	toep->flags |= TPF_SEND_FIN;
1147 	if (tp->t_state >= TCPS_ESTABLISHED) {
1148 		if (toep->ulp_mode == ULP_MODE_ISCSI)
1149 			t4_push_pdus(sc, toep, 0);
1150 		else if (tls_tx_key(toep))
1151 			t4_push_tls_records(sc, toep, 0);
1152 		else
1153 			t4_push_frames(sc, toep, 0);
1154 	}
1155 
1156 	return (0);
1157 }
1158 
1159 int
1160 t4_send_rst(struct toedev *tod, struct tcpcb *tp)
1161 {
1162 	struct adapter *sc = tod->tod_softc;
1163 #if defined(INVARIANTS)
1164 	struct inpcb *inp = tp->t_inpcb;
1165 #endif
1166 	struct toepcb *toep = tp->t_toe;
1167 
1168 	INP_WLOCK_ASSERT(inp);
1169 	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
1170 	    ("%s: inp %p dropped.", __func__, inp));
1171 	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
1172 
1173 	/* hmmmm */
1174 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
1175 	    ("%s: flowc for tid %u [%s] not sent already",
1176 	    __func__, toep->tid, tcpstates[tp->t_state]));
1177 
1178 	send_reset(sc, toep, 0);
1179 	return (0);
1180 }
1181 
1182 /*
1183  * Peer has sent us a FIN.
1184  */
1185 static int
1186 do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
1187 {
1188 	struct adapter *sc = iq->adapter;
1189 	const struct cpl_peer_close *cpl = (const void *)(rss + 1);
1190 	unsigned int tid = GET_TID(cpl);
1191 	struct toepcb *toep = lookup_tid(sc, tid);
1192 	struct inpcb *inp = toep->inp;
1193 	struct tcpcb *tp = NULL;
1194 	struct socket *so;
1195 	struct epoch_tracker et;
1196 #ifdef INVARIANTS
1197 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
1198 #endif
1199 
1200 	KASSERT(opcode == CPL_PEER_CLOSE,
1201 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
1202 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
1203 
1204 	if (__predict_false(toep->flags & TPF_SYNQE)) {
1205 		/*
1206 		 * do_pass_establish must have run before do_peer_close and if
1207 		 * this is still a synqe instead of a toepcb then the connection
1208 		 * must be getting aborted.
1209 		 */
1210 		MPASS(toep->flags & TPF_ABORT_SHUTDOWN);
1211 		CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid,
1212 		    toep, toep->flags);
1213 		return (0);
1214 	}
1215 
1216 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
1217 
1218 	CURVNET_SET(toep->vnet);
1219 	INP_INFO_RLOCK_ET(&V_tcbinfo, et);
1220 	INP_WLOCK(inp);
1221 	tp = intotcpcb(inp);
1222 
1223 	CTR6(KTR_CXGBE,
1224 	    "%s: tid %u (%s), toep_flags 0x%x, ddp_flags 0x%x, inp %p",
1225 	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags,
1226 	    toep->ddp.flags, inp);
1227 
1228 	if (toep->flags & TPF_ABORT_SHUTDOWN)
1229 		goto done;
1230 
1231 	tp->rcv_nxt++;	/* FIN */
1232 
1233 	so = inp->inp_socket;
1234 	socantrcvmore(so);
1235 	if (toep->ulp_mode == ULP_MODE_TCPDDP) {
1236 		DDP_LOCK(toep);
1237 		if (__predict_false(toep->ddp.flags &
1238 		    (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)))
1239 			handle_ddp_close(toep, tp, cpl->rcv_nxt);
1240 		DDP_UNLOCK(toep);
1241 	}
1242 
1243 	if (toep->ulp_mode != ULP_MODE_RDMA) {
1244 		KASSERT(tp->rcv_nxt == be32toh(cpl->rcv_nxt),
1245 	    		("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt,
1246 	    		be32toh(cpl->rcv_nxt)));
1247 	}
1248 
1249 	switch (tp->t_state) {
1250 	case TCPS_SYN_RECEIVED:
1251 		tp->t_starttime = ticks;
1252 		/* FALLTHROUGH */
1253 
1254 	case TCPS_ESTABLISHED:
1255 		tcp_state_change(tp, TCPS_CLOSE_WAIT);
1256 		break;
1257 
1258 	case TCPS_FIN_WAIT_1:
1259 		tcp_state_change(tp, TCPS_CLOSING);
1260 		break;
1261 
1262 	case TCPS_FIN_WAIT_2:
1263 		tcp_twstart(tp);
1264 		INP_UNLOCK_ASSERT(inp);	 /* safe, we have a ref on the inp */
1265 		INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
1266 		CURVNET_RESTORE();
1267 
1268 		INP_WLOCK(inp);
1269 		final_cpl_received(toep);
1270 		return (0);
1271 
1272 	default:
1273 		log(LOG_ERR, "%s: TID %u received CPL_PEER_CLOSE in state %d\n",
1274 		    __func__, tid, tp->t_state);
1275 	}
1276 done:
1277 	INP_WUNLOCK(inp);
1278 	INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
1279 	CURVNET_RESTORE();
1280 	return (0);
1281 }
1282 
1283 /*
1284  * Peer has ACK'd our FIN.
1285  */
1286 static int
1287 do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss,
1288     struct mbuf *m)
1289 {
1290 	struct adapter *sc = iq->adapter;
1291 	const struct cpl_close_con_rpl *cpl = (const void *)(rss + 1);
1292 	unsigned int tid = GET_TID(cpl);
1293 	struct toepcb *toep = lookup_tid(sc, tid);
1294 	struct inpcb *inp = toep->inp;
1295 	struct tcpcb *tp = NULL;
1296 	struct socket *so = NULL;
1297 	struct epoch_tracker et;
1298 #ifdef INVARIANTS
1299 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
1300 #endif
1301 
1302 	KASSERT(opcode == CPL_CLOSE_CON_RPL,
1303 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
1304 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
1305 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
1306 
1307 	CURVNET_SET(toep->vnet);
1308 	INP_INFO_RLOCK_ET(&V_tcbinfo, et);
1309 	INP_WLOCK(inp);
1310 	tp = intotcpcb(inp);
1311 
1312 	CTR4(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x",
1313 	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags);
1314 
1315 	if (toep->flags & TPF_ABORT_SHUTDOWN)
1316 		goto done;
1317 
1318 	so = inp->inp_socket;
1319 	tp->snd_una = be32toh(cpl->snd_nxt) - 1;	/* exclude FIN */
1320 
1321 	switch (tp->t_state) {
1322 	case TCPS_CLOSING:	/* see TCPS_FIN_WAIT_2 in do_peer_close too */
1323 		tcp_twstart(tp);
1324 release:
1325 		INP_UNLOCK_ASSERT(inp);	/* safe, we have a ref on the  inp */
1326 		INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
1327 		CURVNET_RESTORE();
1328 
1329 		INP_WLOCK(inp);
1330 		final_cpl_received(toep);	/* no more CPLs expected */
1331 
1332 		return (0);
1333 	case TCPS_LAST_ACK:
1334 		if (tcp_close(tp))
1335 			INP_WUNLOCK(inp);
1336 		goto release;
1337 
1338 	case TCPS_FIN_WAIT_1:
1339 		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
1340 			soisdisconnected(so);
1341 		tcp_state_change(tp, TCPS_FIN_WAIT_2);
1342 		break;
1343 
1344 	default:
1345 		log(LOG_ERR,
1346 		    "%s: TID %u received CPL_CLOSE_CON_RPL in state %s\n",
1347 		    __func__, tid, tcpstates[tp->t_state]);
1348 	}
1349 done:
1350 	INP_WUNLOCK(inp);
1351 	INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
1352 	CURVNET_RESTORE();
1353 	return (0);
1354 }
1355 
1356 void
1357 send_abort_rpl(struct adapter *sc, struct sge_wrq *ofld_txq, int tid,
1358     int rst_status)
1359 {
1360 	struct wrqe *wr;
1361 	struct cpl_abort_rpl *cpl;
1362 
1363 	wr = alloc_wrqe(sizeof(*cpl), ofld_txq);
1364 	if (wr == NULL) {
1365 		/* XXX */
1366 		panic("%s: allocation failure.", __func__);
1367 	}
1368 	cpl = wrtod(wr);
1369 
1370 	INIT_TP_WR_MIT_CPL(cpl, CPL_ABORT_RPL, tid);
1371 	cpl->cmd = rst_status;
1372 
1373 	t4_wrq_tx(sc, wr);
1374 }
1375 
1376 static int
1377 abort_status_to_errno(struct tcpcb *tp, unsigned int abort_reason)
1378 {
1379 	switch (abort_reason) {
1380 	case CPL_ERR_BAD_SYN:
1381 	case CPL_ERR_CONN_RESET:
1382 		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
1383 	case CPL_ERR_XMIT_TIMEDOUT:
1384 	case CPL_ERR_PERSIST_TIMEDOUT:
1385 	case CPL_ERR_FINWAIT2_TIMEDOUT:
1386 	case CPL_ERR_KEEPALIVE_TIMEDOUT:
1387 		return (ETIMEDOUT);
1388 	default:
1389 		return (EIO);
1390 	}
1391 }
1392 
1393 /*
1394  * TCP RST from the peer, timeout, or some other such critical error.
1395  */
1396 static int
1397 do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
1398 {
1399 	struct adapter *sc = iq->adapter;
1400 	const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
1401 	unsigned int tid = GET_TID(cpl);
1402 	struct toepcb *toep = lookup_tid(sc, tid);
1403 	struct sge_wrq *ofld_txq = toep->ofld_txq;
1404 	struct inpcb *inp;
1405 	struct tcpcb *tp;
1406 	struct epoch_tracker et;
1407 #ifdef INVARIANTS
1408 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
1409 #endif
1410 
1411 	KASSERT(opcode == CPL_ABORT_REQ_RSS,
1412 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
1413 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
1414 
1415 	if (toep->flags & TPF_SYNQE)
1416 		return (do_abort_req_synqe(iq, rss, m));
1417 
1418 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
1419 
1420 	if (negative_advice(cpl->status)) {
1421 		CTR4(KTR_CXGBE, "%s: negative advice %d for tid %d (0x%x)",
1422 		    __func__, cpl->status, tid, toep->flags);
1423 		return (0);	/* Ignore negative advice */
1424 	}
1425 
1426 	inp = toep->inp;
1427 	CURVNET_SET(toep->vnet);
1428 	INP_INFO_RLOCK_ET(&V_tcbinfo, et);	/* for tcp_close */
1429 	INP_WLOCK(inp);
1430 
1431 	tp = intotcpcb(inp);
1432 
1433 	CTR6(KTR_CXGBE,
1434 	    "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x, status %d",
1435 	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags,
1436 	    inp->inp_flags, cpl->status);
1437 
1438 	/*
1439 	 * If we'd initiated an abort earlier the reply to it is responsible for
1440 	 * cleaning up resources.  Otherwise we tear everything down right here
1441 	 * right now.  We owe the T4 a CPL_ABORT_RPL no matter what.
1442 	 */
1443 	if (toep->flags & TPF_ABORT_SHUTDOWN) {
1444 		INP_WUNLOCK(inp);
1445 		goto done;
1446 	}
1447 	toep->flags |= TPF_ABORT_SHUTDOWN;
1448 
1449 	if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) {
1450 		struct socket *so = inp->inp_socket;
1451 
1452 		if (so != NULL)
1453 			so_error_set(so, abort_status_to_errno(tp,
1454 			    cpl->status));
1455 		tp = tcp_close(tp);
1456 		if (tp == NULL)
1457 			INP_WLOCK(inp);	/* re-acquire */
1458 	}
1459 
1460 	final_cpl_received(toep);
1461 done:
1462 	INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
1463 	CURVNET_RESTORE();
1464 	send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
1465 	return (0);
1466 }
1467 
1468 /*
1469  * Reply to the CPL_ABORT_REQ (send_reset)
1470  */
1471 static int
1472 do_abort_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
1473 {
1474 	struct adapter *sc = iq->adapter;
1475 	const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
1476 	unsigned int tid = GET_TID(cpl);
1477 	struct toepcb *toep = lookup_tid(sc, tid);
1478 	struct inpcb *inp = toep->inp;
1479 #ifdef INVARIANTS
1480 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
1481 #endif
1482 
1483 	KASSERT(opcode == CPL_ABORT_RPL_RSS,
1484 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
1485 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
1486 
1487 	if (toep->flags & TPF_SYNQE)
1488 		return (do_abort_rpl_synqe(iq, rss, m));
1489 
1490 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
1491 
1492 	CTR5(KTR_CXGBE, "%s: tid %u, toep %p, inp %p, status %d",
1493 	    __func__, tid, toep, inp, cpl->status);
1494 
1495 	KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
1496 	    ("%s: wasn't expecting abort reply", __func__));
1497 
1498 	INP_WLOCK(inp);
1499 	final_cpl_received(toep);
1500 
1501 	return (0);
1502 }
1503 
/*
 * Handler for CPL_RX_DATA: payload has arrived on an offloaded connection.
 *
 * The CPL header is stripped from 'm' and the remaining bytes are accounted
 * for in rcv_nxt/rcv_wnd and appended to the socket's receive buffer.  Along
 * the way the receive buffer may be autosized, DDP state changes signalled by
 * the CPL are processed (ULP_MODE_TCPDDP only), and rx credits may be
 * returned to the chip via send_rx_credits().
 *
 * 'm' is either freed or handed to sbappendstream_locked() on every path.
 * Always returns 0.
 */
1504 static int
1505 do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
1506 {
1507 	struct adapter *sc = iq->adapter;
1508 	const struct cpl_rx_data *cpl = mtod(m, const void *);
1509 	unsigned int tid = GET_TID(cpl);
1510 	struct toepcb *toep = lookup_tid(sc, tid);
1511 	struct inpcb *inp = toep->inp;
1512 	struct tcpcb *tp;
1513 	struct socket *so;
1514 	struct sockbuf *sb;
1515 	struct epoch_tracker et;
1516 	int len, rx_credits;
1517 	uint32_t ddp_placed = 0;
1518 
1519 	if (__predict_false(toep->flags & TPF_SYNQE)) {
1520 		/*
1521 		 * do_pass_establish must have run before do_rx_data and if this
1522 		 * is still a synqe instead of a toepcb then the connection must
1523 		 * be getting aborted.
1524 		 */
1525 		MPASS(toep->flags & TPF_ABORT_SHUTDOWN);
1526 		CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid,
1527 		    toep, toep->flags);
1528 		m_freem(m);
1529 		return (0);
1530 	}
1531 
1532 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
1533 
1534 	/* strip off CPL header */
1535 	m_adj(m, sizeof(*cpl));
1536 	len = m->m_pkthdr.len;
1537 
1538 	INP_WLOCK(inp);
	/* The connection is already gone or in TIME_WAIT: discard the data. */
1539 	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) {
1540 		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
1541 		    __func__, tid, len, inp->inp_flags);
1542 		INP_WUNLOCK(inp);
1543 		m_freem(m);
1544 		return (0);
1545 	}
1546 
1547 	tp = intotcpcb(inp);
1548 
	/*
	 * The CPL's sequence number is not where we expected the next byte.
	 * Remember the difference; it is passed to insert_ddp_data() below if
	 * the connection turns out to have fallen out of DDP mode.
	 */
1549 	if (__predict_false(tp->rcv_nxt != be32toh(cpl->seq)))
1550 		ddp_placed = be32toh(cpl->seq) - tp->rcv_nxt;
1551 
1552 	tp->rcv_nxt += len;
	/* Only RDMA mode is expected to receive more than the window. */
1553 	if (tp->rcv_wnd < len) {
1554 		KASSERT(toep->ulp_mode == ULP_MODE_RDMA,
1555 				("%s: negative window size", __func__));
1556 	}
1557 
1558 	tp->rcv_wnd -= len;
1559 	tp->t_rcvtime = ticks;
1560 
	/* Lock order used here: inp, then DDP (if applicable), then so_rcv. */
1561 	if (toep->ulp_mode == ULP_MODE_TCPDDP)
1562 		DDP_LOCK(toep);
1563 	so = inp_inpcbtosocket(inp);
1564 	sb = &so->so_rcv;
1565 	SOCKBUF_LOCK(sb);
1566 
	/*
	 * Data arrived after the socket was marked as unable to receive any
	 * more.  Discard it, release every lock taken so far, and then drop
	 * the connection with ECONNRESET under the tcbinfo and inp locks.
	 * The inp is unlocked afterwards only if tcp_drop() returned a tp.
	 */
1567 	if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
1568 		CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)",
1569 		    __func__, tid, len);
1570 		m_freem(m);
1571 		SOCKBUF_UNLOCK(sb);
1572 		if (toep->ulp_mode == ULP_MODE_TCPDDP)
1573 			DDP_UNLOCK(toep);
1574 		INP_WUNLOCK(inp);
1575 
1576 		CURVNET_SET(toep->vnet);
1577 		INP_INFO_RLOCK_ET(&V_tcbinfo, et);
1578 		INP_WLOCK(inp);
1579 		tp = tcp_drop(tp, ECONNRESET);
1580 		if (tp)
1581 			INP_WUNLOCK(inp);
1582 		INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
1583 		CURVNET_RESTORE();
1584 
1585 		return (0);
1586 	}
1587 
1588 	/* receive buffer autosize */
1589 	MPASS(toep->vnet == so->so_vnet);
1590 	CURVNET_SET(toep->vnet);
	/*
	 * Grow the buffer by autorcvbuf_inc (capped at the autorcvbuf max) when
	 * this payload exceeds 7/8 of the space left.  Autosizing is turned
	 * off for the socket if the reservation fails.
	 */
1591 	if (sb->sb_flags & SB_AUTOSIZE &&
1592 	    V_tcp_do_autorcvbuf &&
1593 	    sb->sb_hiwat < V_tcp_autorcvbuf_max &&
1594 	    len > (sbspace(sb) / 8 * 7)) {
1595 		unsigned int hiwat = sb->sb_hiwat;
1596 		unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc,
1597 		    V_tcp_autorcvbuf_max);
1598 
1599 		if (!sbreserve_locked(sb, newsize, so, NULL))
1600 			sb->sb_flags &= ~SB_AUTOSIZE;
1601 	}
1602 
1603 	if (toep->ulp_mode == ULP_MODE_TCPDDP) {
		/* Nonzero if the CPL's ddp_off disagrees with our DDP_ON flag. */
1604 		int changed = !(toep->ddp.flags & DDP_ON) ^ cpl->ddp_off;
1605 
1606 		if (toep->ddp.waiting_count != 0 || toep->ddp.active_count != 0)
1607 			CTR3(KTR_CXGBE, "%s: tid %u, non-ddp rx (%d bytes)",
1608 			    __func__, tid, len);
1609 
1610 		if (changed) {
			/*
			 * If a switch had been requested (DDP_SC_REQ) the
			 * request is cleared and DDP_ON is toggled.  Otherwise
			 * the only change expected is DDP going off.
			 */
1611 			if (toep->ddp.flags & DDP_SC_REQ)
1612 				toep->ddp.flags ^= DDP_ON | DDP_SC_REQ;
1613 			else {
1614 				KASSERT(cpl->ddp_off == 1,
1615 				    ("%s: DDP switched on by itself.",
1616 				    __func__));
1617 
1618 				/* Fell out of DDP mode */
1619 				toep->ddp.flags &= ~DDP_ON;
1620 				CTR1(KTR_CXGBE, "%s: fell out of DDP mode",
1621 				    __func__);
1622 
1623 				insert_ddp_data(toep, ddp_placed);
1624 			}
1625 		}
1626 
1627 		if (toep->ddp.flags & DDP_ON) {
1628 			/*
1629 			 * CPL_RX_DATA with DDP on can only be an indicate.
1630 			 * Start posting queued AIO requests via DDP.  The
1631 			 * payload that arrived in this indicate is appended
1632 			 * to the socket buffer as usual.
1633 			 */
1634 			handle_ddp_indicate(toep);
1635 		}
1636 	}
1637 
1638 	sbappendstream_locked(sb, m, 0);
	/*
	 * Credits are returned only if the buffer has more room than the
	 * current window and used space plus window is below the low
	 * watermark.  The window and rcv_adv grow by whatever
	 * send_rx_credits() reports back.
	 */
1639 	rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0;
1640 	if (rx_credits > 0 && sbused(sb) + tp->rcv_wnd < sb->sb_lowat) {
1641 		rx_credits = send_rx_credits(sc, toep, rx_credits);
1642 		tp->rcv_wnd += rx_credits;
1643 		tp->rcv_adv += rx_credits;
1644 	}
1645 
1646 	if (toep->ulp_mode == ULP_MODE_TCPDDP && toep->ddp.waiting_count > 0 &&
1647 	    sbavail(sb) != 0) {
1648 		CTR2(KTR_CXGBE, "%s: tid %u queueing AIO task", __func__,
1649 		    tid);
1650 		ddp_queue_toep(toep);
1651 	}
	/* sorwakeup_locked() releases the so_rcv lock, as asserted next. */
1652 	sorwakeup_locked(so);
1653 	SOCKBUF_UNLOCK_ASSERT(sb);
1654 	if (toep->ulp_mode == ULP_MODE_TCPDDP)
1655 		DDP_UNLOCK(toep);
1656 
1657 	INP_WUNLOCK(inp);
1658 	CURVNET_RESTORE();
1659 	return (0);
1660 }
1661 
/*
 * Handler for CPL_FW4_ACK: tx credits (and optionally a new snd_una) are
 * being returned for work requests sent earlier on this tid.
 *
 * Tx descriptors are retired starting at txsd_cidx until the credits in the
 * CPL are used up; each descriptor's credits go back to the toep and its
 * payload length is added to 'plen'.  Afterwards either a suspended tx is
 * resumed (with 'plen' passed along as the amount to drop) or 'plen' bytes
 * are dropped from the send buffer / PDU reclaim queue and writers are woken
 * up.  Always returns 0.
 */
1662 static int
1663 do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
1664 {
1665 	struct adapter *sc = iq->adapter;
1666 	const struct cpl_fw4_ack *cpl = (const void *)(rss + 1);
1667 	unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl)));
1668 	struct toepcb *toep = lookup_tid(sc, tid);
1669 	struct inpcb *inp;
1670 	struct tcpcb *tp;
1671 	struct socket *so;
1672 	uint8_t credits = cpl->credits;
1673 	struct ofld_tx_sdesc *txsd;
1674 	int plen;
1675 #ifdef INVARIANTS
1676 	unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl)));
1677 #endif
1678 
1679 	/*
1680 	 * Very unusual case: we'd sent a flowc + abort_req for a synq entry and
1681 	 * now this comes back carrying the credits for the flowc.
1682 	 */
1683 	if (__predict_false(toep->flags & TPF_SYNQE)) {
1684 		KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
1685 		    ("%s: credits for a synq entry %p", __func__, toep));
1686 		return (0);
1687 	}
1688 
1689 	inp = toep->inp;
1690 
1691 	KASSERT(opcode == CPL_FW4_ACK,
1692 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
1693 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
1694 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
1695 
1696 	INP_WLOCK(inp);
1697 
	/* The tid is being aborted: the credits are simply ignored. */
1698 	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) {
1699 		INP_WUNLOCK(inp);
1700 		return (0);
1701 	}
1702 
1703 	KASSERT((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0,
1704 	    ("%s: inp_flags 0x%x", __func__, inp->inp_flags));
1705 
1706 	tp = intotcpcb(inp);
1707 
	/* The CPL carries a valid snd_una only when SEQVAL is set. */
1708 	if (cpl->flags & CPL_FW4_ACK_FLAGS_SEQVAL) {
1709 		tcp_seq snd_una = be32toh(cpl->snd_una);
1710 
1711 #ifdef INVARIANTS
		/* snd_una moving backwards is logged but otherwise accepted. */
1712 		if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
1713 			log(LOG_ERR,
1714 			    "%s: unexpected seq# %x for TID %u, snd_una %x\n",
1715 			    __func__, snd_una, toep->tid, tp->snd_una);
1716 		}
1717 #endif
1718 
1719 		if (tp->snd_una != snd_una) {
1720 			tp->snd_una = snd_una;
1721 			tp->ts_recent_age = tcp_ts_getticks();
1722 		}
1723 	}
1724 
1725 #ifdef VERBOSE_TRACES
1726 	CTR3(KTR_CXGBE, "%s: tid %d credits %u", __func__, tid, credits);
1727 #endif
1728 	so = inp->inp_socket;
1729 	txsd = &toep->txsd[toep->txsd_cidx];
1730 	plen = 0;
	/*
	 * Retire one tx descriptor per iteration.  The credits returned are
	 * expected to cover whole descriptors (asserted).  The consumer index
	 * wraps around at txsd_total.
	 */
1731 	while (credits) {
1732 		KASSERT(credits >= txsd->tx_credits,
1733 		    ("%s: too many (or partial) credits", __func__));
1734 		credits -= txsd->tx_credits;
1735 		toep->tx_credits += txsd->tx_credits;
1736 		plen += txsd->plen;
1737 		if (txsd->iv_buffer) {
1738 			free(txsd->iv_buffer, M_CXGBE);
1739 			txsd->iv_buffer = NULL;
1740 		}
1741 		txsd++;
1742 		toep->txsd_avail++;
1743 		KASSERT(toep->txsd_avail <= toep->txsd_total,
1744 		    ("%s: txsd avail > total", __func__));
1745 		if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) {
1746 			txsd = &toep->txsd[0];
1747 			toep->txsd_cidx = 0;
1748 		}
1749 	}
1750 
	/* Every credit is back, so the no-completion counters are reset. */
1751 	if (toep->tx_credits == toep->tx_total) {
1752 		toep->tx_nocompl = 0;
1753 		toep->plen_nocompl = 0;
1754 	}
1755 
	/*
	 * Tx is resumed once at least a quarter of the total credits are
	 * available again.  The push routine is given plen as its drop count.
	 */
1756 	if (toep->flags & TPF_TX_SUSPENDED &&
1757 	    toep->tx_credits >= toep->tx_total / 4) {
1758 #ifdef VERBOSE_TRACES
1759 		CTR2(KTR_CXGBE, "%s: tid %d calling t4_push_frames", __func__,
1760 		    tid);
1761 #endif
1762 		toep->flags &= ~TPF_TX_SUSPENDED;
1763 		CURVNET_SET(toep->vnet);
1764 		if (toep->ulp_mode == ULP_MODE_ISCSI)
1765 			t4_push_pdus(sc, toep, plen);
1766 		else if (tls_tx_key(toep))
1767 			t4_push_tls_records(sc, toep, plen);
1768 		else
1769 			t4_push_frames(sc, toep, plen);
1770 		CURVNET_RESTORE();
1771 	} else if (plen > 0) {
1772 		struct sockbuf *sb = &so->so_snd;
1773 		int sbu;
1774 
1775 		SOCKBUF_LOCK(sb);
1776 		sbu = sbused(sb);
1777 		if (toep->ulp_mode == ULP_MODE_ISCSI) {
1778 
1779 			if (__predict_false(sbu > 0)) {
1780 				/*
1781 				 * The data transmitted before the tid's ULP mode
1782 				 * changed to ISCSI is still in so_snd.
1783 				 * Incoming credits should account for so_snd
1784 				 * first.
1785 				 */
1786 				sbdrop_locked(sb, min(sbu, plen));
1787 				plen -= min(sbu, plen);
1788 			}
1789 			sowwakeup_locked(so);	/* unlocks so_snd */
			/* Whatever is left of plen is reclaimed from the PDU queue. */
1790 			rqdrop_locked(&toep->ulp_pdu_reclaimq, plen);
1791 		} else {
1792 #ifdef VERBOSE_TRACES
1793 			CTR3(KTR_CXGBE, "%s: tid %d dropped %d bytes", __func__,
1794 			    tid, plen);
1795 #endif
1796 			sbdrop_locked(sb, plen);
			/* Keep the TLS send buffer offset in step with the drop. */
1797 			if (tls_tx_key(toep)) {
1798 				struct tls_ofld_info *tls_ofld = &toep->tls;
1799 
1800 				MPASS(tls_ofld->sb_off >= plen);
1801 				tls_ofld->sb_off -= plen;
1802 			}
1803 			if (!TAILQ_EMPTY(&toep->aiotx_jobq))
1804 				t4_aiotx_queue_toep(so, toep);
1805 			sowwakeup_locked(so);	/* unlocks so_snd */
1806 		}
1807 		SOCKBUF_UNLOCK_ASSERT(sb);
1808 	}
1809 
1810 	INP_WUNLOCK(inp);
1811 
1812 	return (0);
1813 }
1814 
1815 void
1816 t4_set_tcb_field(struct adapter *sc, struct sge_wrq *wrq, struct toepcb *toep,
1817     uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie)
1818 {
1819 	struct wrqe *wr;
1820 	struct cpl_set_tcb_field *req;
1821 	struct ofld_tx_sdesc *txsd;
1822 
1823 	MPASS((cookie & ~M_COOKIE) == 0);
1824 	if (reply) {
1825 		MPASS(cookie != CPL_COOKIE_RESERVED);
1826 	}
1827 
1828 	wr = alloc_wrqe(sizeof(*req), wrq);
1829 	if (wr == NULL) {
1830 		/* XXX */
1831 		panic("%s: allocation failure.", __func__);
1832 	}
1833 	req = wrtod(wr);
1834 
1835 	INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, toep->tid);
1836 	req->reply_ctrl = htobe16(V_QUEUENO(toep->ofld_rxq->iq.abs_id));
1837 	if (reply == 0)
1838 		req->reply_ctrl |= htobe16(F_NO_REPLY);
1839 	req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(cookie));
1840 	req->mask = htobe64(mask);
1841 	req->val = htobe64(val);
1842 	if ((wrq->eq.flags & EQ_TYPEMASK) == EQ_OFLD) {
1843 		txsd = &toep->txsd[toep->txsd_pidx];
1844 		txsd->tx_credits = howmany(sizeof(*req), 16);
1845 		txsd->plen = 0;
1846 		KASSERT(toep->tx_credits >= txsd->tx_credits &&
1847 		    toep->txsd_avail > 0,
1848 		    ("%s: not enough credits (%d)", __func__,
1849 		    toep->tx_credits));
1850 		toep->tx_credits -= txsd->tx_credits;
1851 		if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
1852 			toep->txsd_pidx = 0;
1853 		toep->txsd_avail--;
1854 	}
1855 
1856 	t4_wrq_tx(sc, wr);
1857 }
1858 
1859 void
1860 t4_init_cpl_io_handlers(void)
1861 {
1862 
1863 	t4_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
1864 	t4_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
1865 	t4_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
1866 	t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl,
1867 	    CPL_COOKIE_TOM);
1868 	t4_register_cpl_handler(CPL_RX_DATA, do_rx_data);
1869 	t4_register_shared_cpl_handler(CPL_FW4_ACK, do_fw4_ack, CPL_COOKIE_TOM);
1870 }
1871 
1872 void
1873 t4_uninit_cpl_io_handlers(void)
1874 {
1875 
1876 	t4_register_cpl_handler(CPL_PEER_CLOSE, NULL);
1877 	t4_register_cpl_handler(CPL_CLOSE_CON_RPL, NULL);
1878 	t4_register_cpl_handler(CPL_ABORT_REQ_RSS, NULL);
1879 	t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, NULL, CPL_COOKIE_TOM);
1880 	t4_register_cpl_handler(CPL_RX_DATA, NULL);
1881 	t4_register_shared_cpl_handler(CPL_FW4_ACK, NULL, CPL_COOKIE_TOM);
1882 }
1883 
1884 /*
1885  * Use the 'backend1' field in AIO jobs to hold an error that should
1886  * be reported when the job is completed, the 'backend3' field to
1887  * store the amount of data sent by the AIO job so far, and the
1888  * 'backend4' field to hold a reference count on the job.
1889  *
1890  * Each unmapped mbuf holds a reference on the job as does the queue
1891  * so long as the job is queued.
1892  */
1893 #define	aio_error	backend1
1894 #define	aio_sent	backend3
1895 #define	aio_refs	backend4
1896 
/*
 * Map an AIO job to the tid of the offloaded connection behind the job's
 * file: the file's f_data is treated as the socket, and the tid is read from
 * the toepcb hanging off that socket's tcpcb.  Used by the VERBOSE_TRACES
 * messages in the aiotx code.
 */
1897 #define	jobtotid(job)							\
1898 	(((struct toepcb *)(so_sototcpcb((job)->fd_file->f_data)->t_toe))->tid)
1899 
1900 static void
1901 aiotx_free_job(struct kaiocb *job)
1902 {
1903 	long status;
1904 	int error;
1905 
1906 	if (refcount_release(&job->aio_refs) == 0)
1907 		return;
1908 
1909 	error = (intptr_t)job->aio_error;
1910 	status = job->aio_sent;
1911 #ifdef VERBOSE_TRACES
1912 	CTR5(KTR_CXGBE, "%s: tid %d completed %p len %ld, error %d", __func__,
1913 	    jobtotid(job), job, status, error);
1914 #endif
1915 	if (error != 0 && status != 0)
1916 		error = 0;
1917 	if (error == ECANCELED)
1918 		aio_cancel(job);
1919 	else if (error)
1920 		aio_complete(job, -1, error);
1921 	else {
1922 		job->msgsnd = 1;
1923 		aio_complete(job, status, 0);
1924 	}
1925 }
1926 
1927 static void
1928 aiotx_free_pgs(struct mbuf *m)
1929 {
1930 	struct mbuf_ext_pgs *ext_pgs;
1931 	struct kaiocb *job;
1932 	struct mtx *mtx;
1933 	vm_page_t pg;
1934 
1935 	MBUF_EXT_PGS_ASSERT(m);
1936 	ext_pgs = m->m_ext.ext_pgs;
1937 	job = m->m_ext.ext_arg1;
1938 #ifdef VERBOSE_TRACES
1939 	CTR3(KTR_CXGBE, "%s: completed %d bytes for tid %d", __func__,
1940 	    m->m_len, jobtotid(job));
1941 #endif
1942 
1943 	mtx = NULL;
1944 	for (int i = 0; i < ext_pgs->npgs; i++) {
1945 		pg = PHYS_TO_VM_PAGE(ext_pgs->pa[i]);
1946 		vm_page_change_lock(pg, &mtx);
1947 		vm_page_unwire(pg, PQ_ACTIVE);
1948 	}
1949 	if (mtx != NULL)
1950 		mtx_unlock(mtx);
1951 
1952 	aiotx_free_job(job);
1953 }
1954 
1955 /*
1956  * Allocate a chain of unmapped mbufs describing the next 'len' bytes
1957  * of an AIO job.
1958  */
1959 static struct mbuf *
1960 alloc_aiotx_mbuf(struct kaiocb *job, int len)
1961 {
1962 	struct vmspace *vm;
1963 	vm_page_t pgs[MBUF_PEXT_MAX_PGS];
1964 	struct mbuf *m, *top, *last;
1965 	struct mbuf_ext_pgs *ext_pgs;
1966 	vm_map_t map;
1967 	vm_offset_t start;
1968 	int i, mlen, npages, pgoff;
1969 
1970 	KASSERT(job->aio_sent + len <= job->uaiocb.aio_nbytes,
1971 	    ("%s(%p, %d): request to send beyond end of buffer", __func__,
1972 	    job, len));
1973 
1974 	/*
1975 	 * The AIO subsystem will cancel and drain all requests before
1976 	 * permitting a process to exit or exec, so p_vmspace should
1977 	 * be stable here.
1978 	 */
1979 	vm = job->userproc->p_vmspace;
1980 	map = &vm->vm_map;
1981 	start = (uintptr_t)job->uaiocb.aio_buf + job->aio_sent;
1982 	pgoff = start & PAGE_MASK;
1983 
1984 	top = NULL;
1985 	last = NULL;
1986 	while (len > 0) {
1987 		mlen = imin(len, MBUF_PEXT_MAX_PGS * PAGE_SIZE - pgoff);
1988 		KASSERT(mlen == len || ((start + mlen) & PAGE_MASK) == 0,
1989 		    ("%s: next start (%#jx + %#x) is not page aligned",
1990 		    __func__, (uintmax_t)start, mlen));
1991 
1992 		npages = vm_fault_quick_hold_pages(map, start, mlen,
1993 		    VM_PROT_WRITE, pgs, nitems(pgs));
1994 		if (npages < 0)
1995 			break;
1996 
1997 		m = mb_alloc_ext_pgs(M_WAITOK, false, aiotx_free_pgs);
1998 		if (m == NULL) {
1999 			vm_page_unhold_pages(pgs, npages);
2000 			break;
2001 		}
2002 
2003 		ext_pgs = m->m_ext.ext_pgs;
2004 		ext_pgs->first_pg_off = pgoff;
2005 		ext_pgs->npgs = npages;
2006 		if (npages == 1) {
2007 			KASSERT(mlen + pgoff <= PAGE_SIZE,
2008 			    ("%s: single page is too large (off %d len %d)",
2009 			    __func__, pgoff, mlen));
2010 			ext_pgs->last_pg_len = mlen;
2011 		} else {
2012 			ext_pgs->last_pg_len = mlen - (PAGE_SIZE - pgoff) -
2013 			    (npages - 2) * PAGE_SIZE;
2014 		}
2015 		for (i = 0; i < npages; i++)
2016 			ext_pgs->pa[i] = VM_PAGE_TO_PHYS(pgs[i]);
2017 
2018 		m->m_len = mlen;
2019 		m->m_ext.ext_size = npages * PAGE_SIZE;
2020 		m->m_ext.ext_arg1 = job;
2021 		refcount_acquire(&job->aio_refs);
2022 
2023 #ifdef VERBOSE_TRACES
2024 		CTR5(KTR_CXGBE, "%s: tid %d, new mbuf %p for job %p, npages %d",
2025 		    __func__, jobtotid(job), m, job, npages);
2026 #endif
2027 
2028 		if (top == NULL)
2029 			top = m;
2030 		else
2031 			last->m_next = m;
2032 		last = m;
2033 
2034 		len -= mlen;
2035 		start += mlen;
2036 		pgoff = 0;
2037 	}
2038 
2039 	return (top);
2040 }
2041 
2042 static void
2043 t4_aiotx_process_job(struct toepcb *toep, struct socket *so, struct kaiocb *job)
2044 {
2045 	struct adapter *sc;
2046 	struct sockbuf *sb;
2047 	struct file *fp;
2048 	struct inpcb *inp;
2049 	struct tcpcb *tp;
2050 	struct mbuf *m;
2051 	int error, len;
2052 	bool moretocome, sendmore;
2053 
2054 	sc = td_adapter(toep->td);
2055 	sb = &so->so_snd;
2056 	SOCKBUF_UNLOCK(sb);
2057 	fp = job->fd_file;
2058 	m = NULL;
2059 
2060 #ifdef MAC
2061 	error = mac_socket_check_send(fp->f_cred, so);
2062 	if (error != 0)
2063 		goto out;
2064 #endif
2065 
2066 	/* Inline sosend_generic(). */
2067 
2068 	error = sblock(sb, SBL_WAIT);
2069 	MPASS(error == 0);
2070 
2071 sendanother:
2072 	SOCKBUF_LOCK(sb);
2073 	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
2074 		SOCKBUF_UNLOCK(sb);
2075 		sbunlock(sb);
2076 		if ((so->so_options & SO_NOSIGPIPE) == 0) {
2077 			PROC_LOCK(job->userproc);
2078 			kern_psignal(job->userproc, SIGPIPE);
2079 			PROC_UNLOCK(job->userproc);
2080 		}
2081 		error = EPIPE;
2082 		goto out;
2083 	}
2084 	if (so->so_error) {
2085 		error = so->so_error;
2086 		so->so_error = 0;
2087 		SOCKBUF_UNLOCK(sb);
2088 		sbunlock(sb);
2089 		goto out;
2090 	}
2091 	if ((so->so_state & SS_ISCONNECTED) == 0) {
2092 		SOCKBUF_UNLOCK(sb);
2093 		sbunlock(sb);
2094 		error = ENOTCONN;
2095 		goto out;
2096 	}
2097 	if (sbspace(sb) < sb->sb_lowat) {
2098 		MPASS(job->aio_sent == 0 || !(so->so_state & SS_NBIO));
2099 
2100 		/*
2101 		 * Don't block if there is too little room in the socket
2102 		 * buffer.  Instead, requeue the request.
2103 		 */
2104 		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
2105 			SOCKBUF_UNLOCK(sb);
2106 			sbunlock(sb);
2107 			error = ECANCELED;
2108 			goto out;
2109 		}
2110 		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
2111 		SOCKBUF_UNLOCK(sb);
2112 		sbunlock(sb);
2113 		goto out;
2114 	}
2115 
2116 	/*
2117 	 * Write as much data as the socket permits, but no more than a
2118 	 * a single sndbuf at a time.
2119 	 */
2120 	len = sbspace(sb);
2121 	if (len > job->uaiocb.aio_nbytes - job->aio_sent) {
2122 		len = job->uaiocb.aio_nbytes - job->aio_sent;
2123 		moretocome = false;
2124 	} else
2125 		moretocome = true;
2126 	if (len > sc->tt.sndbuf) {
2127 		len = sc->tt.sndbuf;
2128 		sendmore = true;
2129 	} else
2130 		sendmore = false;
2131 
2132 	if (!TAILQ_EMPTY(&toep->aiotx_jobq))
2133 		moretocome = true;
2134 	SOCKBUF_UNLOCK(sb);
2135 	MPASS(len != 0);
2136 
2137 	m = alloc_aiotx_mbuf(job, len);
2138 	if (m == NULL) {
2139 		sbunlock(sb);
2140 		error = EFAULT;
2141 		goto out;
2142 	}
2143 
2144 	/* Inlined tcp_usr_send(). */
2145 
2146 	inp = toep->inp;
2147 	INP_WLOCK(inp);
2148 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
2149 		INP_WUNLOCK(inp);
2150 		sbunlock(sb);
2151 		error = ECONNRESET;
2152 		goto out;
2153 	}
2154 
2155 	job->aio_sent += m_length(m, NULL);
2156 
2157 	sbappendstream(sb, m, 0);
2158 	m = NULL;
2159 
2160 	if (!(inp->inp_flags & INP_DROPPED)) {
2161 		tp = intotcpcb(inp);
2162 		if (moretocome)
2163 			tp->t_flags |= TF_MORETOCOME;
2164 		error = tp->t_fb->tfb_tcp_output(tp);
2165 		if (moretocome)
2166 			tp->t_flags &= ~TF_MORETOCOME;
2167 	}
2168 
2169 	INP_WUNLOCK(inp);
2170 	if (sendmore)
2171 		goto sendanother;
2172 	sbunlock(sb);
2173 
2174 	if (error)
2175 		goto out;
2176 
2177 	/*
2178 	 * If this is a blocking socket and the request has not been
2179 	 * fully completed, requeue it until the socket is ready
2180 	 * again.
2181 	 */
2182 	if (job->aio_sent < job->uaiocb.aio_nbytes &&
2183 	    !(so->so_state & SS_NBIO)) {
2184 		SOCKBUF_LOCK(sb);
2185 		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
2186 			SOCKBUF_UNLOCK(sb);
2187 			error = ECANCELED;
2188 			goto out;
2189 		}
2190 		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
2191 		return;
2192 	}
2193 
2194 	/*
2195 	 * If the request will not be requeued, drop the queue's
2196 	 * reference to the job.  Any mbufs in flight should still
2197 	 * hold a reference, but this drops the reference that the
2198 	 * queue owns while it is waiting to queue mbufs to the
2199 	 * socket.
2200 	 */
2201 	aiotx_free_job(job);
2202 
2203 out:
2204 	if (error) {
2205 		job->aio_error = (void *)(intptr_t)error;
2206 		aiotx_free_job(job);
2207 	}
2208 	if (m != NULL)
2209 		m_free(m);
2210 	SOCKBUF_LOCK(sb);
2211 }
2212 
2213 static void
2214 t4_aiotx_task(void *context, int pending)
2215 {
2216 	struct toepcb *toep = context;
2217 	struct socket *so;
2218 	struct kaiocb *job;
2219 
2220 	so = toep->aiotx_so;
2221 	CURVNET_SET(toep->vnet);
2222 	SOCKBUF_LOCK(&so->so_snd);
2223 	while (!TAILQ_EMPTY(&toep->aiotx_jobq) && sowriteable(so)) {
2224 		job = TAILQ_FIRST(&toep->aiotx_jobq);
2225 		TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
2226 		if (!aio_clear_cancel_function(job))
2227 			continue;
2228 
2229 		t4_aiotx_process_job(toep, so, job);
2230 	}
2231 	toep->aiotx_so = NULL;
2232 	SOCKBUF_UNLOCK(&so->so_snd);
2233 	CURVNET_RESTORE();
2234 
2235 	free_toepcb(toep);
2236 	SOCK_LOCK(so);
2237 	sorele(so);
2238 }
2239 
2240 static void
2241 t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep)
2242 {
2243 
2244 	SOCKBUF_LOCK_ASSERT(&toep->inp->inp_socket->so_snd);
2245 #ifdef VERBOSE_TRACES
2246 	CTR3(KTR_CXGBE, "%s: queueing aiotx task for tid %d, active = %s",
2247 	    __func__, toep->tid, toep->aiotx_so != NULL ? "true" : "false");
2248 #endif
2249 	if (toep->aiotx_so != NULL)
2250 		return;
2251 	soref(so);
2252 	toep->aiotx_so = so;
2253 	hold_toepcb(toep);
2254 	soaio_enqueue(&toep->aiotx_task);
2255 }
2256 
2257 static void
2258 t4_aiotx_cancel(struct kaiocb *job)
2259 {
2260 	struct socket *so;
2261 	struct sockbuf *sb;
2262 	struct tcpcb *tp;
2263 	struct toepcb *toep;
2264 
2265 	so = job->fd_file->f_data;
2266 	tp = so_sototcpcb(so);
2267 	toep = tp->t_toe;
2268 	MPASS(job->uaiocb.aio_lio_opcode == LIO_WRITE);
2269 	sb = &so->so_snd;
2270 
2271 	SOCKBUF_LOCK(sb);
2272 	if (!aio_cancel_cleared(job))
2273 		TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
2274 	SOCKBUF_UNLOCK(sb);
2275 
2276 	job->aio_error = (void *)(intptr_t)ECANCELED;
2277 	aiotx_free_job(job);
2278 }
2279 
2280 int
2281 t4_aio_queue_aiotx(struct socket *so, struct kaiocb *job)
2282 {
2283 	struct tcpcb *tp = so_sototcpcb(so);
2284 	struct toepcb *toep = tp->t_toe;
2285 	struct adapter *sc = td_adapter(toep->td);
2286 
2287 	/* This only handles writes. */
2288 	if (job->uaiocb.aio_lio_opcode != LIO_WRITE)
2289 		return (EOPNOTSUPP);
2290 
2291 	if (!sc->tt.tx_zcopy)
2292 		return (EOPNOTSUPP);
2293 
2294 	if (tls_tx_key(toep))
2295 		return (EOPNOTSUPP);
2296 
2297 	SOCKBUF_LOCK(&so->so_snd);
2298 #ifdef VERBOSE_TRACES
2299 	CTR3(KTR_CXGBE, "%s: queueing %p for tid %u", __func__, job, toep->tid);
2300 #endif
2301 	if (!aio_set_cancel_function(job, t4_aiotx_cancel))
2302 		panic("new job was cancelled");
2303 	refcount_init(&job->aio_refs, 1);
2304 	TAILQ_INSERT_TAIL(&toep->aiotx_jobq, job, list);
2305 	if (sowriteable(so))
2306 		t4_aiotx_queue_toep(so, toep);
2307 	SOCKBUF_UNLOCK(&so->so_snd);
2308 	return (0);
2309 }
2310 
2311 void
2312 aiotx_init_toep(struct toepcb *toep)
2313 {
2314 
2315 	TAILQ_INIT(&toep->aiotx_jobq);
2316 	TASK_INIT(&toep->aiotx_task, 0, t4_aiotx_task, toep);
2317 }
2318 #endif
2319