xref: /freebsd/sys/dev/cxgbe/tom/t4_cpl_io.c (revision ca987d4641cdcd7f27e153db17c5bf064934faf5)
1 /*-
2  * Copyright (c) 2012, 2015 Chelsio Communications, Inc.
3  * All rights reserved.
4  * Written by: Navdeep Parhar <np@FreeBSD.org>
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 
31 #include "opt_inet.h"
32 #include "opt_inet6.h"
33 #include "opt_ratelimit.h"
34 
35 #ifdef TCP_OFFLOAD
36 #include <sys/param.h>
37 #include <sys/aio.h>
38 #include <sys/file.h>
39 #include <sys/kernel.h>
40 #include <sys/ktr.h>
41 #include <sys/module.h>
42 #include <sys/proc.h>
43 #include <sys/protosw.h>
44 #include <sys/domain.h>
45 #include <sys/socket.h>
46 #include <sys/socketvar.h>
47 #include <sys/sglist.h>
48 #include <sys/taskqueue.h>
49 #include <netinet/in.h>
50 #include <netinet/in_pcb.h>
51 #include <netinet/ip.h>
52 #include <netinet/ip6.h>
53 #define TCPSTATES
54 #include <netinet/tcp_fsm.h>
55 #include <netinet/tcp_seq.h>
56 #include <netinet/tcp_var.h>
57 #include <netinet/toecore.h>
58 
59 #include <security/mac/mac_framework.h>
60 
61 #include <vm/vm.h>
62 #include <vm/vm_extern.h>
63 #include <vm/pmap.h>
64 #include <vm/vm_map.h>
65 #include <vm/vm_page.h>
66 
67 #include "common/common.h"
68 #include "common/t4_msg.h"
69 #include "common/t4_regs.h"
70 #include "common/t4_tcb.h"
71 #include "tom/t4_tom_l2t.h"
72 #include "tom/t4_tom.h"
73 
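/* Does this mbuf carry external storage backed by an AIO tx buffer? */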
74 #define	IS_AIOTX_MBUF(m)						\
75 	((m)->m_flags & M_EXT && (m)->m_ext.ext_flags & EXT_FLAG_AIOTX)
76 
77 static void	t4_aiotx_cancel(struct kaiocb *job);
78 static void	t4_aiotx_queue_toep(struct toepcb *toep);
79 
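/*
 * Offset within a page at which this mbuf's payload starts, for an mbuf
 * backed by an AIO tx buffer.
 */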
80 static size_t
81 aiotx_mbuf_pgoff(struct mbuf *m)
82 {
83 	struct aiotx_buffer *ab;
84 
85 	MPASS(IS_AIOTX_MBUF(m));
86 	ab = m->m_ext.ext_arg1;
87 	return ((ab->ps.offset + (uintptr_t)m->m_ext.ext_arg2) % PAGE_SIZE);
88 }
89 
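/*
 * Pointer to the first backing page (within the AIO buffer's page array)
 * that holds this mbuf's payload.
 */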
90 static vm_page_t *
91 aiotx_mbuf_pages(struct mbuf *m)
92 {
93 	struct aiotx_buffer *ab;
94 	int npages;
95 
96 	MPASS(IS_AIOTX_MBUF(m));
97 	ab = m->m_ext.ext_arg1;
98 	npages = (ab->ps.offset + (uintptr_t)m->m_ext.ext_arg2) / PAGE_SIZE;
99 	return (ab->ps.pages + npages);
100 }
101 
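/*
 * Send a FLOWC work request to program the firmware with this tid's tx flow
 * parameters (channel, port, ingress queue, and, if ftxp is provided, the
 * initial snd_nxt/rcv_nxt, send buffer size, and MSS).  Consumes one tx
 * descriptor and the WR's worth of tx credits.
 */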
102 void
103 send_flowc_wr(struct toepcb *toep, struct flowc_tx_params *ftxp)
104 {
105 	struct wrqe *wr;
106 	struct fw_flowc_wr *flowc;
107 	unsigned int nparams = ftxp ? 8 : 6, flowclen;
108 	struct vi_info *vi = toep->vi;
109 	struct port_info *pi = vi->pi;
110 	struct adapter *sc = pi->adapter;
111 	unsigned int pfvf = G_FW_VIID_PFN(vi->viid) << S_FW_VIID_PFN;
112 	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
113 
114 	KASSERT(!(toep->flags & TPF_FLOWC_WR_SENT),
115 	    ("%s: flowc for tid %u sent already", __func__, toep->tid));
116 
117 	flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);
118 
119 	wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq);
120 	if (wr == NULL) {
121 		/* XXX */
122 		panic("%s: allocation failure.", __func__);
123 	}
124 	flowc = wrtod(wr);
125 	memset(flowc, 0, wr->wr_len);
126 
127 	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
128 	    V_FW_FLOWC_WR_NPARAMS(nparams));
129 	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
130 	    V_FW_WR_FLOWID(toep->tid));
131 
132 	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
133 	flowc->mnemval[0].val = htobe32(pfvf);
134 	flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
135 	flowc->mnemval[1].val = htobe32(pi->tx_chan);
136 	flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
137 	flowc->mnemval[2].val = htobe32(pi->tx_chan);
138 	flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
139 	flowc->mnemval[3].val = htobe32(toep->ofld_rxq->iq.abs_id);
140 	if (ftxp) {
141 		uint32_t sndbuf = min(ftxp->snd_space, sc->tt.sndbuf);
142 
143 		flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDNXT;
144 		flowc->mnemval[4].val = htobe32(ftxp->snd_nxt);
145 		flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_RCVNXT;
146 		flowc->mnemval[5].val = htobe32(ftxp->rcv_nxt);
147 		flowc->mnemval[6].mnemonic = FW_FLOWC_MNEM_SNDBUF;
148 		flowc->mnemval[6].val = htobe32(sndbuf);
149 		flowc->mnemval[7].mnemonic = FW_FLOWC_MNEM_MSS;
150 		flowc->mnemval[7].val = htobe32(ftxp->mss);
151 
152 		CTR6(KTR_CXGBE,
153 		    "%s: tid %u, mss %u, sndbuf %u, snd_nxt 0x%x, rcv_nxt 0x%x",
154 		    __func__, toep->tid, ftxp->mss, sndbuf, ftxp->snd_nxt,
155 		    ftxp->rcv_nxt);
156 	} else {
157 		flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDBUF;
158 		flowc->mnemval[4].val = htobe32(512);
159 		flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_MSS;
160 		flowc->mnemval[5].val = htobe32(512);
161 
162 		CTR2(KTR_CXGBE, "%s: tid %u", __func__, toep->tid);
163 	}
164 
165 	txsd->tx_credits = howmany(flowclen, 16);
166 	txsd->plen = 0;
167 	KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0,
168 	    ("%s: not enough credits (%d)", __func__, toep->tx_credits));
169 	toep->tx_credits -= txsd->tx_credits;
170 	if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
171 		toep->txsd_pidx = 0;
172 	toep->txsd_avail--;
173 
174 	toep->flags |= TPF_FLOWC_WR_SENT;
175 	t4_wrq_tx(sc, wr);
176 }
177 
178 #ifdef RATELIMIT
179 /*
180  * Input is Bytes/second (so_max_pacing_rate), chip counts in Kilobits/second.
181  */
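/* e.g. Bps = 1250000 (10 Mbit/s) -> 1250000 * 8 / 1000 = 10000 Kbit/s. */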
182 static int
183 update_tx_rate_limit(struct adapter *sc, struct toepcb *toep, u_int Bps)
184 {
185 	int tc_idx, rc;
186 	const u_int kbps = (u_int) ((uint64_t)Bps * 8ULL / 1000);
187 	const int port_id = toep->vi->pi->port_id;
188 
189 	CTR3(KTR_CXGBE, "%s: tid %u, rate %uKbps", __func__, toep->tid, kbps);
190 
191 	if (kbps == 0) {
192 		/* unbind */
193 		tc_idx = -1;
194 	} else {
195 		rc = t4_reserve_cl_rl_kbps(sc, port_id, kbps, &tc_idx);
196 		if (rc != 0)
197 			return (rc);
198 		MPASS(tc_idx >= 0 && tc_idx < sc->chip_params->nsched_cls);
199 	}
200 
201 	if (toep->tc_idx != tc_idx) {
202 		struct wrqe *wr;
203 		struct fw_flowc_wr *flowc;
204 		int nparams = 1, flowclen, flowclen16;
205 		struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
206 
207 		flowclen = sizeof(*flowc) + nparams * sizeof(struct
208 		    fw_flowc_mnemval);
209 		flowclen16 = howmany(flowclen, 16);
210 		if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0 ||
211 		    (wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq)) == NULL) {
212 			if (tc_idx >= 0)
213 				t4_release_cl_rl_kbps(sc, port_id, tc_idx);
214 			return (ENOMEM);
215 		}
216 
217 		flowc = wrtod(wr);
218 		memset(flowc, 0, wr->wr_len);
219 
220 		flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
221 		    V_FW_FLOWC_WR_NPARAMS(nparams));
222 		flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) |
223 		    V_FW_WR_FLOWID(toep->tid));
224 
225 		flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS;
226 		if (tc_idx == -1)
227 			flowc->mnemval[0].val = htobe32(0xff);
228 		else
229 			flowc->mnemval[0].val = htobe32(tc_idx);
230 
231 		txsd->tx_credits = flowclen16;
232 		txsd->plen = 0;
233 		toep->tx_credits -= txsd->tx_credits;
234 		if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
235 			toep->txsd_pidx = 0;
236 		toep->txsd_avail--;
237 		t4_wrq_tx(sc, wr);
238 	}
239 
240 	if (toep->tc_idx >= 0)
241 		t4_release_cl_rl_kbps(sc, port_id, toep->tc_idx);
242 	toep->tc_idx = tc_idx;
243 
244 	return (0);
245 }
246 #endif
247 
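/*
 * Send a CPL_ABORT_REQ to have the chip reset the connection (TCP RST to the
 * peer).  No-op if an abort is already in progress for this tid.
 */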
248 void
249 send_reset(struct adapter *sc, struct toepcb *toep, uint32_t snd_nxt)
250 {
251 	struct wrqe *wr;
252 	struct cpl_abort_req *req;
253 	int tid = toep->tid;
254 	struct inpcb *inp = toep->inp;
255 	struct tcpcb *tp = intotcpcb(inp);	/* don't use if INP_DROPPED */
256 
257 	INP_WLOCK_ASSERT(inp);
258 
259 	CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x%s",
260 	    __func__, toep->tid,
261 	    inp->inp_flags & INP_DROPPED ? "inp dropped" :
262 	    tcpstates[tp->t_state],
263 	    toep->flags, inp->inp_flags,
264 	    toep->flags & TPF_ABORT_SHUTDOWN ?
265 	    " (abort already in progress)" : "");
266 
267 	if (toep->flags & TPF_ABORT_SHUTDOWN)
268 		return;	/* abort already in progress */
269 
270 	toep->flags |= TPF_ABORT_SHUTDOWN;
271 
272 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
273 	    ("%s: flowc_wr not sent for tid %d.", __func__, tid));
274 
275 	wr = alloc_wrqe(sizeof(*req), toep->ofld_txq);
276 	if (wr == NULL) {
277 		/* XXX */
278 		panic("%s: allocation failure.", __func__);
279 	}
280 	req = wrtod(wr);
281 
282 	INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid);
283 	if (inp->inp_flags & INP_DROPPED)
284 		req->rsvd0 = htobe32(snd_nxt);
285 	else
286 		req->rsvd0 = htobe32(tp->snd_nxt);
287 	req->rsvd1 = !(toep->flags & TPF_TX_DATA_SENT);
288 	req->cmd = CPL_ABORT_SEND_RST;
289 
290 	/*
291 	 * XXX: What's the correct way to tell that the inp hasn't been detached
292 	 * from its socket?  Should I even be flushing the snd buffer here?
293 	 */
294 	if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) {
295 		struct socket *so = inp->inp_socket;
296 
297 		if (so != NULL)	/* because I'm not sure.  See comment above */
298 			sbflush(&so->so_snd);
299 	}
300 
301 	t4_l2t_send(sc, wr, toep->l2te);
302 }
303 
304 /*
305  * Called when a connection is established to translate the TCP options
306  * reported by HW to FreeBSD's native format.
307  */
308 static void
309 assign_rxopt(struct tcpcb *tp, unsigned int opt)
310 {
311 	struct toepcb *toep = tp->t_toe;
312 	struct inpcb *inp = tp->t_inpcb;
313 	struct adapter *sc = td_adapter(toep->td);
314 	int n;
315 
316 	INP_LOCK_ASSERT(inp);
317 
318 	if (inp->inp_inc.inc_flags & INC_ISIPV6)
319 		n = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
320 	else
321 		n = sizeof(struct ip) + sizeof(struct tcphdr);
322 	if (V_tcp_do_rfc1323)
323 		n += TCPOLEN_TSTAMP_APPA;
324 	tp->t_maxseg = sc->params.mtus[G_TCPOPT_MSS(opt)] - n;
325 
326 	CTR4(KTR_CXGBE, "%s: tid %d, mtu_idx %u (%u)", __func__, toep->tid,
327 	    G_TCPOPT_MSS(opt), sc->params.mtus[G_TCPOPT_MSS(opt)]);
328 
329 	if (G_TCPOPT_TSTAMP(opt)) {
330 		tp->t_flags |= TF_RCVD_TSTMP;	/* timestamps ok */
331 		tp->ts_recent = 0;		/* hmmm */
332 		tp->ts_recent_age = tcp_ts_getticks();
333 	}
334 
335 	if (G_TCPOPT_SACK(opt))
336 		tp->t_flags |= TF_SACK_PERMIT;	/* should already be set */
337 	else
338 		tp->t_flags &= ~TF_SACK_PERMIT;	/* sack disallowed by peer */
339 
340 	if (G_TCPOPT_WSCALE_OK(opt))
341 		tp->t_flags |= TF_RCVD_SCALE;
342 
343 	/* Doing window scaling? */
344 	if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
345 	    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
346 		tp->rcv_scale = tp->request_r_scale;
347 		tp->snd_scale = G_TCPOPT_SND_WSCALE(opt);
348 	}
349 }
350 
351 /*
352  * Completes some final bits of initialization for just established connections
353  * and changes their state to TCPS_ESTABLISHED.
354  *
355  * The ISNs are from after the exchange of SYNs, i.e., the true ISN + 1.
356  */
357 void
358 make_established(struct toepcb *toep, uint32_t snd_isn, uint32_t rcv_isn,
359     uint16_t opt)
360 {
361 	struct inpcb *inp = toep->inp;
362 	struct socket *so = inp->inp_socket;
363 	struct tcpcb *tp = intotcpcb(inp);
364 	long bufsize;
365 	uint32_t iss = be32toh(snd_isn) - 1;	/* true ISS */
366 	uint32_t irs = be32toh(rcv_isn) - 1;	/* true IRS */
367 	uint16_t tcpopt = be16toh(opt);
368 	struct flowc_tx_params ftxp;
369 
370 	INP_WLOCK_ASSERT(inp);
371 	KASSERT(tp->t_state == TCPS_SYN_SENT ||
372 	    tp->t_state == TCPS_SYN_RECEIVED,
373 	    ("%s: TCP state %s", __func__, tcpstates[tp->t_state]));
374 
375 	CTR6(KTR_CXGBE, "%s: tid %d, so %p, inp %p, tp %p, toep %p",
376 	    __func__, toep->tid, so, inp, tp, toep);
377 
378 	tp->t_state = TCPS_ESTABLISHED;
379 	tp->t_starttime = ticks;
380 	TCPSTAT_INC(tcps_connects);
381 
382 	tp->irs = irs;
383 	tcp_rcvseqinit(tp);
384 	tp->rcv_wnd = toep->rx_credits << 10;
385 	tp->rcv_adv += tp->rcv_wnd;
386 	tp->last_ack_sent = tp->rcv_nxt;
387 
388 	/*
389 	 * If we were unable to send all rx credits via opt0, save the remainder
390 	 * in rx_credits so that they can be handed over with the next credit
391 	 * update.
392 	 */
393 	SOCKBUF_LOCK(&so->so_rcv);
394 	bufsize = select_rcv_wnd(so);
395 	SOCKBUF_UNLOCK(&so->so_rcv);
396 	toep->rx_credits = bufsize - tp->rcv_wnd;
397 
398 	tp->iss = iss;
399 	tcp_sendseqinit(tp);
400 	tp->snd_una = iss + 1;
401 	tp->snd_nxt = iss + 1;
402 	tp->snd_max = iss + 1;
403 
404 	assign_rxopt(tp, tcpopt);
405 
406 	SOCKBUF_LOCK(&so->so_snd);
407 	if (so->so_snd.sb_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf)
408 		bufsize = V_tcp_autosndbuf_max;
409 	else
410 		bufsize = sbspace(&so->so_snd);
411 	SOCKBUF_UNLOCK(&so->so_snd);
412 
413 	ftxp.snd_nxt = tp->snd_nxt;
414 	ftxp.rcv_nxt = tp->rcv_nxt;
415 	ftxp.snd_space = bufsize;
416 	ftxp.mss = tp->t_maxseg;
417 	send_flowc_wr(toep, &ftxp);
418 
419 	soisconnected(so);
420 }
421 
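/*
 * Return 'credits' bytes of rx window to the chip with a CPL_RX_DATA_ACK.
 * Returns the number of credits actually sent (0 if the work request could
 * not be allocated).
 */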
422 static int
423 send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits)
424 {
425 	struct wrqe *wr;
426 	struct cpl_rx_data_ack *req;
427 	uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
428 
429 	KASSERT(credits >= 0, ("%s: %d credits", __func__, credits));
430 
431 	wr = alloc_wrqe(sizeof(*req), toep->ctrlq);
432 	if (wr == NULL)
433 		return (0);
434 	req = wrtod(wr);
435 
436 	INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid);
437 	req->credit_dack = htobe32(dack | V_RX_CREDITS(credits));
438 
439 	t4_wrq_tx(sc, wr);
440 	return (credits);
441 }
442 
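/*
 * The application has consumed data from the receive socket buffer.  Account
 * for the newly available space and, if enough credits have built up (or the
 * receive window has shrunk too much), hand them back to the chip so it can
 * reopen the window.  Called with the recv sockbuf locked.
 */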
443 void
444 t4_rcvd_locked(struct toedev *tod, struct tcpcb *tp)
445 {
446 	struct adapter *sc = tod->tod_softc;
447 	struct inpcb *inp = tp->t_inpcb;
448 	struct socket *so = inp->inp_socket;
449 	struct sockbuf *sb = &so->so_rcv;
450 	struct toepcb *toep = tp->t_toe;
451 	int credits;
452 
453 	INP_WLOCK_ASSERT(inp);
454 
455 	SOCKBUF_LOCK_ASSERT(sb);
456 	KASSERT(toep->sb_cc >= sbused(sb),
457 	    ("%s: sb %p has more data (%d) than last time (%d).",
458 	    __func__, sb, sbused(sb), toep->sb_cc));
459 
460 	toep->rx_credits += toep->sb_cc - sbused(sb);
461 	toep->sb_cc = sbused(sb);
462 
463 	if (toep->rx_credits > 0 &&
464 	    (tp->rcv_wnd <= 32 * 1024 || toep->rx_credits >= 64 * 1024 ||
465 	    (toep->rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) ||
466 	    toep->sb_cc + tp->rcv_wnd < sb->sb_lowat)) {
467 
468 		credits = send_rx_credits(sc, toep, toep->rx_credits);
469 		toep->rx_credits -= credits;
470 		tp->rcv_wnd += credits;
471 		tp->rcv_adv += credits;
472 	}
473 }
474 
475 void
476 t4_rcvd(struct toedev *tod, struct tcpcb *tp)
477 {
478 	struct inpcb *inp = tp->t_inpcb;
479 	struct socket *so = inp->inp_socket;
480 	struct sockbuf *sb = &so->so_rcv;
481 
482 	SOCKBUF_LOCK(sb);
483 	t4_rcvd_locked(tod, tp);
484 	SOCKBUF_UNLOCK(sb);
485 }
486 
487 /*
488  * Close a connection by sending a CPL_CLOSE_CON_REQ message.
489  */
490 static int
491 close_conn(struct adapter *sc, struct toepcb *toep)
492 {
493 	struct wrqe *wr;
494 	struct cpl_close_con_req *req;
495 	unsigned int tid = toep->tid;
496 
497 	CTR3(KTR_CXGBE, "%s: tid %u%s", __func__, toep->tid,
498 	    toep->flags & TPF_FIN_SENT ? ", IGNORED" : "");
499 
500 	if (toep->flags & TPF_FIN_SENT)
501 		return (0);
502 
503 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
504 	    ("%s: flowc_wr not sent for tid %u.", __func__, tid));
505 
506 	wr = alloc_wrqe(sizeof(*req), toep->ofld_txq);
507 	if (wr == NULL) {
508 		/* XXX */
509 		panic("%s: allocation failure.", __func__);
510 	}
511 	req = wrtod(wr);
512 
513 	req->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) |
514 	    V_FW_WR_IMMDLEN(sizeof(*req) - sizeof(req->wr)));
515 	req->wr.wr_mid = htonl(V_FW_WR_LEN16(howmany(sizeof(*req), 16)) |
516 	    V_FW_WR_FLOWID(tid));
517 	req->wr.wr_lo = cpu_to_be64(0);
518 	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
519 	req->rsvd = 0;
520 
521 	toep->flags |= TPF_FIN_SENT;
522 	toep->flags &= ~TPF_SEND_FIN;
523 	t4_l2t_send(sc, wr, toep->l2te);
524 
525 	return (0);
526 }
527 
528 #define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16)
529 #define MIN_OFLD_TX_CREDITS (howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16))
530 
531 /* Maximum amount of immediate data we could stuff in a WR */
532 static inline int
533 max_imm_payload(int tx_credits)
534 {
535 	const int n = 2;	/* Use only up to 2 desc for imm. data WR */
536 
537 	KASSERT(tx_credits >= 0 &&
538 		tx_credits <= MAX_OFLD_TX_CREDITS,
539 		("%s: %d credits", __func__, tx_credits));
540 
541 	if (tx_credits < MIN_OFLD_TX_CREDITS)
542 		return (0);
543 
544 	if (tx_credits >= (n * EQ_ESIZE) / 16)
545 		return ((n * EQ_ESIZE) - sizeof(struct fw_ofld_tx_data_wr));
546 	else
547 		return (tx_credits * 16 - sizeof(struct fw_ofld_tx_data_wr));
548 }
549 
550 /* Maximum number of SGL entries we could stuff in a WR */
551 static inline int
552 max_dsgl_nsegs(int tx_credits)
553 {
554 	int nseg = 1;	/* ulptx_sgl has room for 1, rest ulp_tx_sge_pair */
555 	int sge_pair_credits = tx_credits - MIN_OFLD_TX_CREDITS;
556 
557 	KASSERT(tx_credits >= 0 &&
558 		tx_credits <= MAX_OFLD_TX_CREDITS,
559 		("%s: %d credits", __func__, tx_credits));
560 
561 	if (tx_credits < MIN_OFLD_TX_CREDITS)
562 		return (0);
563 
564 	nseg += 2 * (sge_pair_credits * 16 / 24);
565 	if ((sge_pair_credits * 16) % 24 == 16)
566 		nseg++;
567 
568 	return (nseg);
569 }
570 
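/*
 * Fill in the header of an FW_OFLD_TX_DATA work request.  immdlen is the
 * amount of immediate data carried in the WR, plen the total payload length,
 * and credits the size of the WR in 16-byte units.  When tx alignment is
 * enabled, small payloads (or 10G ports) disable LSO; larger payloads ask
 * the chip to align the payload, with shove when Nagle is in effect.
 */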
571 static inline void
572 write_tx_wr(void *dst, struct toepcb *toep, unsigned int immdlen,
573     unsigned int plen, uint8_t credits, int shove, int ulp_submode, int txalign)
574 {
575 	struct fw_ofld_tx_data_wr *txwr = dst;
576 
577 	txwr->op_to_immdlen = htobe32(V_WR_OP(FW_OFLD_TX_DATA_WR) |
578 	    V_FW_WR_IMMDLEN(immdlen));
579 	txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) |
580 	    V_FW_WR_LEN16(credits));
581 	txwr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(toep->ulp_mode) |
582 	    V_TX_ULP_SUBMODE(ulp_submode) | V_TX_URG(0) | V_TX_SHOVE(shove));
583 	txwr->plen = htobe32(plen);
584 
585 	if (txalign > 0) {
586 		struct tcpcb *tp = intotcpcb(toep->inp);
587 
588 		if (plen < 2 * tp->t_maxseg || is_10G_port(toep->vi->pi))
589 			txwr->lsodisable_to_flags |=
590 			    htobe32(F_FW_OFLD_TX_DATA_WR_LSODISABLE);
591 		else
592 			txwr->lsodisable_to_flags |=
593 			    htobe32(F_FW_OFLD_TX_DATA_WR_ALIGNPLD |
594 				(tp->t_flags & TF_NODELAY ? 0 :
595 				F_FW_OFLD_TX_DATA_WR_ALIGNPLDSHOVE));
596 	}
597 }
598 
599 /*
600  * Generate a DSGL from a starting mbuf.  The total number of segments and the
601  * maximum segments in any one mbuf are provided.
602  */
603 static void
604 write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n)
605 {
606 	struct mbuf *m;
607 	struct ulptx_sgl *usgl = dst;
608 	int i, j, rc;
609 	struct sglist sg;
610 	struct sglist_seg segs[n];
611 
612 	KASSERT(nsegs > 0, ("%s: nsegs 0", __func__));
613 
614 	sglist_init(&sg, n, segs);
615 	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
616 	    V_ULPTX_NSGE(nsegs));
617 
618 	i = -1;
619 	for (m = start; m != stop; m = m->m_next) {
620 		if (IS_AIOTX_MBUF(m))
621 			rc = sglist_append_vmpages(&sg, aiotx_mbuf_pages(m),
622 			    aiotx_mbuf_pgoff(m), m->m_len);
623 		else
624 			rc = sglist_append(&sg, mtod(m, void *), m->m_len);
625 		if (__predict_false(rc != 0))
626 			panic("%s: sglist_append %d", __func__, rc);
627 
628 		for (j = 0; j < sg.sg_nseg; i++, j++) {
629 			if (i < 0) {
630 				usgl->len0 = htobe32(segs[j].ss_len);
631 				usgl->addr0 = htobe64(segs[j].ss_paddr);
632 			} else {
633 				usgl->sge[i / 2].len[i & 1] =
634 				    htobe32(segs[j].ss_len);
635 				usgl->sge[i / 2].addr[i & 1] =
636 				    htobe64(segs[j].ss_paddr);
637 			}
638 #ifdef INVARIANTS
639 			nsegs--;
640 #endif
641 		}
642 		sglist_reset(&sg);
643 	}
644 	if (i & 1)
645 		usgl->sge[i / 2].len[1] = htobe32(0);
646 	KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, stop %p",
647 	    __func__, nsegs, start, stop));
648 }
649 
650 /*
651  * Max number of SGL entries an offload tx work request can have.  This is 41
652  * (1 + 40) for a full 512B work request.
653  * fw_ofld_tx_data_wr(16B) + ulptx_sgl(16B, 1) + ulptx_sge_pair(480B, 40)
654  */
655 #define OFLD_SGL_LEN (41)
656 
657 /*
658  * Send data and/or a FIN to the peer.
659  *
660  * The socket's so_snd buffer consists of a stream of data starting with sb_mb
661  * and linked together with m_next.  sb_sndptr, if set, is the last mbuf that
662  * was transmitted.
663  *
664  * drop indicates the number of bytes that should be dropped from the head of
665  * the send buffer.  It is an optimization that lets do_fw4_ack avoid creating
666  * contention on the send buffer lock (before this change it used to do
667  * sowwakeup and then t4_push_frames right after that when recovering from tx
668  * stalls).  When drop is set this function MUST drop the bytes and wake up any
669  * writers.
670  */
671 void
672 t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop)
673 {
674 	struct mbuf *sndptr, *m, *sb_sndptr;
675 	struct fw_ofld_tx_data_wr *txwr;
676 	struct wrqe *wr;
677 	u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
678 	struct inpcb *inp = toep->inp;
679 	struct tcpcb *tp = intotcpcb(inp);
680 	struct socket *so = inp->inp_socket;
681 	struct sockbuf *sb = &so->so_snd;
682 	int tx_credits, shove, compl, sowwakeup;
683 	struct ofld_tx_sdesc *txsd;
684 	bool aiotx_mbuf_seen;
685 
686 	INP_WLOCK_ASSERT(inp);
687 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
688 	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
689 
690 	KASSERT(toep->ulp_mode == ULP_MODE_NONE ||
691 	    toep->ulp_mode == ULP_MODE_TCPDDP ||
692 	    toep->ulp_mode == ULP_MODE_RDMA,
693 	    ("%s: ulp_mode %u for toep %p", __func__, toep->ulp_mode, toep));
694 
695 #ifdef VERBOSE_TRACES
696 	CTR5(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d",
697 	    __func__, toep->tid, toep->flags, tp->t_flags, drop);
698 #endif
699 	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
700 		return;
701 
702 #ifdef RATELIMIT
703 	if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) &&
704 	    (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) {
705 		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
706 	}
707 #endif
708 
709 	/*
710 	 * This function doesn't resume by itself.  Someone else must clear the
711 	 * flag and call this function.
712 	 */
713 	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
714 		KASSERT(drop == 0,
715 		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
716 		return;
717 	}
718 
719 	txsd = &toep->txsd[toep->txsd_pidx];
720 	do {
721 		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
722 		max_imm = max_imm_payload(tx_credits);
723 		max_nsegs = max_dsgl_nsegs(tx_credits);
724 
725 		SOCKBUF_LOCK(sb);
726 		sowwakeup = drop;
727 		if (drop) {
728 			sbdrop_locked(sb, drop);
729 			drop = 0;
730 		}
731 		sb_sndptr = sb->sb_sndptr;
732 		sndptr = sb_sndptr ? sb_sndptr->m_next : sb->sb_mb;
733 		plen = 0;
734 		nsegs = 0;
735 		max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
736 		aiotx_mbuf_seen = false;
737 		for (m = sndptr; m != NULL; m = m->m_next) {
738 			int n;
739 
740 			if (IS_AIOTX_MBUF(m))
741 				n = sglist_count_vmpages(aiotx_mbuf_pages(m),
742 				    aiotx_mbuf_pgoff(m), m->m_len);
743 			else
744 				n = sglist_count(mtod(m, void *), m->m_len);
745 
746 			nsegs += n;
747 			plen += m->m_len;
748 
749 			/* This mbuf sent us _over_ the nsegs limit, back out */
750 			if (plen > max_imm && nsegs > max_nsegs) {
751 				nsegs -= n;
752 				plen -= m->m_len;
753 				if (plen == 0) {
754 					/* Too few credits */
755 					toep->flags |= TPF_TX_SUSPENDED;
756 					if (sowwakeup) {
757 						if (!TAILQ_EMPTY(
758 						    &toep->aiotx_jobq))
759 							t4_aiotx_queue_toep(
760 							    toep);
761 						sowwakeup_locked(so);
762 					} else
763 						SOCKBUF_UNLOCK(sb);
764 					SOCKBUF_UNLOCK_ASSERT(sb);
765 					return;
766 				}
767 				break;
768 			}
769 
770 			if (IS_AIOTX_MBUF(m))
771 				aiotx_mbuf_seen = true;
772 			if (max_nsegs_1mbuf < n)
773 				max_nsegs_1mbuf = n;
774 			sb_sndptr = m;	/* new sb->sb_sndptr if all goes well */
775 
776 			/* This mbuf put us right at the max_nsegs limit */
777 			if (plen > max_imm && nsegs == max_nsegs) {
778 				m = m->m_next;
779 				break;
780 			}
781 		}
782 
783 		if (sbused(sb) > sb->sb_hiwat * 5 / 8 &&
784 		    toep->plen_nocompl + plen >= sb->sb_hiwat / 4)
785 			compl = 1;
786 		else
787 			compl = 0;
788 
789 		if (sb->sb_flags & SB_AUTOSIZE &&
790 		    V_tcp_do_autosndbuf &&
791 		    sb->sb_hiwat < V_tcp_autosndbuf_max &&
792 		    sbused(sb) >= sb->sb_hiwat * 7 / 8) {
793 			int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc,
794 			    V_tcp_autosndbuf_max);
795 
796 			if (!sbreserve_locked(sb, newsize, so, NULL))
797 				sb->sb_flags &= ~SB_AUTOSIZE;
798 			else
799 				sowwakeup = 1;	/* room available */
800 		}
801 		if (sowwakeup) {
802 			if (!TAILQ_EMPTY(&toep->aiotx_jobq))
803 				t4_aiotx_queue_toep(toep);
804 			sowwakeup_locked(so);
805 		} else
806 			SOCKBUF_UNLOCK(sb);
807 		SOCKBUF_UNLOCK_ASSERT(sb);
808 
809 		/* nothing to send */
810 		if (plen == 0) {
811 			KASSERT(m == NULL,
812 			    ("%s: nothing to send, but m != NULL", __func__));
813 			break;
814 		}
815 
816 		if (__predict_false(toep->flags & TPF_FIN_SENT))
817 			panic("%s: excess tx.", __func__);
818 
819 		shove = m == NULL && !(tp->t_flags & TF_MORETOCOME);
820 		if (plen <= max_imm && !aiotx_mbuf_seen) {
821 
822 			/* Immediate data tx */
823 
824 			wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16),
825 					toep->ofld_txq);
826 			if (wr == NULL) {
827 				/* XXX: how will we recover from this? */
828 				toep->flags |= TPF_TX_SUSPENDED;
829 				return;
830 			}
831 			txwr = wrtod(wr);
832 			credits = howmany(wr->wr_len, 16);
833 			write_tx_wr(txwr, toep, plen, plen, credits, shove, 0,
834 			    sc->tt.tx_align);
835 			m_copydata(sndptr, 0, plen, (void *)(txwr + 1));
836 			nsegs = 0;
837 		} else {
838 			int wr_len;
839 
840 			/* DSGL tx */
841 
842 			wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) +
843 			    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
844 			wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq);
845 			if (wr == NULL) {
846 				/* XXX: how will we recover from this? */
847 				toep->flags |= TPF_TX_SUSPENDED;
848 				return;
849 			}
850 			txwr = wrtod(wr);
851 			credits = howmany(wr_len, 16);
852 			write_tx_wr(txwr, toep, 0, plen, credits, shove, 0,
853 			    sc->tt.tx_align);
854 			write_tx_sgl(txwr + 1, sndptr, m, nsegs,
855 			    max_nsegs_1mbuf);
856 			if (wr_len & 0xf) {
857 				uint64_t *pad = (uint64_t *)
858 				    ((uintptr_t)txwr + wr_len);
859 				*pad = 0;
860 			}
861 		}
862 
863 		KASSERT(toep->tx_credits >= credits,
864 			("%s: not enough credits", __func__));
865 
866 		toep->tx_credits -= credits;
867 		toep->tx_nocompl += credits;
868 		toep->plen_nocompl += plen;
869 		if (toep->tx_credits <= toep->tx_total * 3 / 8 &&
870 		    toep->tx_nocompl >= toep->tx_total / 4)
871 			compl = 1;
872 
873 		if (compl || toep->ulp_mode == ULP_MODE_RDMA) {
874 			txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL);
875 			toep->tx_nocompl = 0;
876 			toep->plen_nocompl = 0;
877 		}
878 
879 		tp->snd_nxt += plen;
880 		tp->snd_max += plen;
881 
882 		SOCKBUF_LOCK(sb);
883 		KASSERT(sb_sndptr, ("%s: sb_sndptr is NULL", __func__));
884 		sb->sb_sndptr = sb_sndptr;
885 		SOCKBUF_UNLOCK(sb);
886 
887 		toep->flags |= TPF_TX_DATA_SENT;
888 		if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
889 			toep->flags |= TPF_TX_SUSPENDED;
890 
891 		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
892 		txsd->plen = plen;
893 		txsd->tx_credits = credits;
894 		txsd++;
895 		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
896 			toep->txsd_pidx = 0;
897 			txsd = &toep->txsd[0];
898 		}
899 		toep->txsd_avail--;
900 
901 		t4_l2t_send(sc, wr, toep->l2te);
902 	} while (m != NULL);
903 
904 	/* Send a FIN if requested, but only if there's no more data to send */
905 	if (m == NULL && toep->flags & TPF_SEND_FIN)
906 		close_conn(sc, toep);
907 }
908 
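/*
 * Drop completed PDUs from the head of a reclaim queue.  plen is the number
 * of payload bytes acked by the chip; it must cover whole PDUs.
 */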
909 static inline void
910 rqdrop_locked(struct mbufq *q, int plen)
911 {
912 	struct mbuf *m;
913 
914 	while (plen > 0) {
915 		m = mbufq_dequeue(q);
916 
917 		/* Too many credits. */
918 		MPASS(m != NULL);
919 		M_ASSERTPKTHDR(m);
920 
921 		/* Partial credits. */
922 		MPASS(plen >= m->m_pkthdr.len);
923 
924 		plen -= m->m_pkthdr.len;
925 		m_freem(m);
926 	}
927 }
928 
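/*
 * Send queued iSCSI PDUs to the peer.  Like t4_push_frames, but the unit of
 * transmission is a whole PDU from toep->ulp_pduq rather than bytes from the
 * socket's send buffer; 'drop' is the number of payload bytes acked by the
 * chip and is used to free completed PDUs from the reclaim queue.
 */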
929 void
930 t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop)
931 {
932 	struct mbuf *sndptr, *m;
933 	struct fw_ofld_tx_data_wr *txwr;
934 	struct wrqe *wr;
935 	u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
936 	u_int adjusted_plen, ulp_submode;
937 	struct inpcb *inp = toep->inp;
938 	struct tcpcb *tp = intotcpcb(inp);
939 	int tx_credits, shove;
940 	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
941 	struct mbufq *pduq = &toep->ulp_pduq;
942 	static const u_int ulp_extra_len[] = {0, 4, 4, 8};
943 
944 	INP_WLOCK_ASSERT(inp);
945 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
946 	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
947 	KASSERT(toep->ulp_mode == ULP_MODE_ISCSI,
948 	    ("%s: ulp_mode %u for toep %p", __func__, toep->ulp_mode, toep));
949 
950 	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
951 		return;
952 
953 	/*
954 	 * This function doesn't resume by itself.  Someone else must clear the
955 	 * flag and call this function.
956 	 */
957 	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
958 		KASSERT(drop == 0,
959 		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
960 		return;
961 	}
962 
963 	if (drop)
964 		rqdrop_locked(&toep->ulp_pdu_reclaimq, drop);
965 
966 	while ((sndptr = mbufq_first(pduq)) != NULL) {
967 		M_ASSERTPKTHDR(sndptr);
968 
969 		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
970 		max_imm = max_imm_payload(tx_credits);
971 		max_nsegs = max_dsgl_nsegs(tx_credits);
972 
973 		plen = 0;
974 		nsegs = 0;
975 		max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
976 		for (m = sndptr; m != NULL; m = m->m_next) {
977 			int n = sglist_count(mtod(m, void *), m->m_len);
978 
979 			nsegs += n;
980 			plen += m->m_len;
981 
982 			/*
983 			 * This mbuf would send us _over_ the nsegs limit.
984 			 * Suspend tx because the PDU can't be sent out.
985 			 */
986 			if (plen > max_imm && nsegs > max_nsegs) {
987 				toep->flags |= TPF_TX_SUSPENDED;
988 				return;
989 			}
990 
991 			if (max_nsegs_1mbuf < n)
992 				max_nsegs_1mbuf = n;
993 		}
994 
995 		if (__predict_false(toep->flags & TPF_FIN_SENT))
996 			panic("%s: excess tx.", __func__);
997 
998 		/*
999 		 * We have a PDU to send.  All of it goes out in one WR so 'm'
1000 		 * is NULL.  A PDU's length is always a multiple of 4.
1001 		 */
1002 		MPASS(m == NULL);
1003 		MPASS((plen & 3) == 0);
1004 		MPASS(sndptr->m_pkthdr.len == plen);
1005 
1006 		shove = !(tp->t_flags & TF_MORETOCOME);
1007 		ulp_submode = mbuf_ulp_submode(sndptr);
1008 		MPASS(ulp_submode < nitems(ulp_extra_len));
1009 
1010 		/*
1011 		 * plen doesn't include header and data digests, which are
1012 		 * generated and inserted in the right places by the TOE, but
1013 		 * they do occupy TCP sequence space and need to be accounted
1014 		 * for.
1015 		 */
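		/* e.g. ulp_submode 3 (header + data digests) adds 8 bytes. */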
1016 		adjusted_plen = plen + ulp_extra_len[ulp_submode];
1017 		if (plen <= max_imm) {
1018 
1019 			/* Immediate data tx */
1020 
1021 			wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16),
1022 					toep->ofld_txq);
1023 			if (wr == NULL) {
1024 				/* XXX: how will we recover from this? */
1025 				toep->flags |= TPF_TX_SUSPENDED;
1026 				return;
1027 			}
1028 			txwr = wrtod(wr);
1029 			credits = howmany(wr->wr_len, 16);
1030 			write_tx_wr(txwr, toep, plen, adjusted_plen, credits,
1031 			    shove, ulp_submode, sc->tt.tx_align);
1032 			m_copydata(sndptr, 0, plen, (void *)(txwr + 1));
1033 			nsegs = 0;
1034 		} else {
1035 			int wr_len;
1036 
1037 			/* DSGL tx */
1038 			wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) +
1039 			    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
1040 			wr = alloc_wrqe(roundup2(wr_len, 16), toep->ofld_txq);
1041 			if (wr == NULL) {
1042 				/* XXX: how will we recover from this? */
1043 				toep->flags |= TPF_TX_SUSPENDED;
1044 				return;
1045 			}
1046 			txwr = wrtod(wr);
1047 			credits = howmany(wr_len, 16);
1048 			write_tx_wr(txwr, toep, 0, adjusted_plen, credits,
1049 			    shove, ulp_submode, sc->tt.tx_align);
1050 			write_tx_sgl(txwr + 1, sndptr, m, nsegs,
1051 			    max_nsegs_1mbuf);
1052 			if (wr_len & 0xf) {
1053 				uint64_t *pad = (uint64_t *)
1054 				    ((uintptr_t)txwr + wr_len);
1055 				*pad = 0;
1056 			}
1057 		}
1058 
1059 		KASSERT(toep->tx_credits >= credits,
1060 			("%s: not enough credits", __func__));
1061 
1062 		m = mbufq_dequeue(pduq);
1063 		MPASS(m == sndptr);
1064 		mbufq_enqueue(&toep->ulp_pdu_reclaimq, m);
1065 
1066 		toep->tx_credits -= credits;
1067 		toep->tx_nocompl += credits;
1068 		toep->plen_nocompl += plen;
1069 		if (toep->tx_credits <= toep->tx_total * 3 / 8 &&
1070 		    toep->tx_nocompl >= toep->tx_total / 4) {
1071 			txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL);
1072 			toep->tx_nocompl = 0;
1073 			toep->plen_nocompl = 0;
1074 		}
1075 
1076 		tp->snd_nxt += adjusted_plen;
1077 		tp->snd_max += adjusted_plen;
1078 
1079 		toep->flags |= TPF_TX_DATA_SENT;
1080 		if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
1081 			toep->flags |= TPF_TX_SUSPENDED;
1082 
1083 		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
1084 		txsd->plen = plen;
1085 		txsd->tx_credits = credits;
1086 		txsd++;
1087 		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
1088 			toep->txsd_pidx = 0;
1089 			txsd = &toep->txsd[0];
1090 		}
1091 		toep->txsd_avail--;
1092 
1093 		t4_l2t_send(sc, wr, toep->l2te);
1094 	}
1095 
1096 	/* Send a FIN if requested, but only if there are no more PDUs to send */
1097 	if (mbufq_first(pduq) == NULL && toep->flags & TPF_SEND_FIN)
1098 		close_conn(sc, toep);
1099 }
1100 
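/*
 * TOE entry point called by the TCP stack when there may be new data to
 * transmit on an offloaded connection.
 */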
1101 int
1102 t4_tod_output(struct toedev *tod, struct tcpcb *tp)
1103 {
1104 	struct adapter *sc = tod->tod_softc;
1105 #ifdef INVARIANTS
1106 	struct inpcb *inp = tp->t_inpcb;
1107 #endif
1108 	struct toepcb *toep = tp->t_toe;
1109 
1110 	INP_WLOCK_ASSERT(inp);
1111 	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
1112 	    ("%s: inp %p dropped.", __func__, inp));
1113 	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
1114 
1115 	if (toep->ulp_mode == ULP_MODE_ISCSI)
1116 		t4_push_pdus(sc, toep, 0);
1117 	else
1118 		t4_push_frames(sc, toep, 0);
1119 
1120 	return (0);
1121 }
1122 
1123 int
1124 t4_send_fin(struct toedev *tod, struct tcpcb *tp)
1125 {
1126 	struct adapter *sc = tod->tod_softc;
1127 #ifdef INVARIANTS
1128 	struct inpcb *inp = tp->t_inpcb;
1129 #endif
1130 	struct toepcb *toep = tp->t_toe;
1131 
1132 	INP_WLOCK_ASSERT(inp);
1133 	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
1134 	    ("%s: inp %p dropped.", __func__, inp));
1135 	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
1136 
1137 	toep->flags |= TPF_SEND_FIN;
1138 	if (tp->t_state >= TCPS_ESTABLISHED) {
1139 		if (toep->ulp_mode == ULP_MODE_ISCSI)
1140 			t4_push_pdus(sc, toep, 0);
1141 		else
1142 			t4_push_frames(sc, toep, 0);
1143 	}
1144 
1145 	return (0);
1146 }
1147 
1148 int
1149 t4_send_rst(struct toedev *tod, struct tcpcb *tp)
1150 {
1151 	struct adapter *sc = tod->tod_softc;
1152 #if defined(INVARIANTS)
1153 	struct inpcb *inp = tp->t_inpcb;
1154 #endif
1155 	struct toepcb *toep = tp->t_toe;
1156 
1157 	INP_WLOCK_ASSERT(inp);
1158 	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
1159 	    ("%s: inp %p dropped.", __func__, inp));
1160 	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
1161 
1162 	/* hmmmm */
1163 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
1164 	    ("%s: flowc for tid %u [%s] not sent",
1165 	    __func__, toep->tid, tcpstates[tp->t_state]));
1166 
1167 	send_reset(sc, toep, 0);
1168 	return (0);
1169 }
1170 
1171 /*
1172  * Peer has sent us a FIN.
1173  */
1174 static int
1175 do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
1176 {
1177 	struct adapter *sc = iq->adapter;
1178 	const struct cpl_peer_close *cpl = (const void *)(rss + 1);
1179 	unsigned int tid = GET_TID(cpl);
1180 	struct toepcb *toep = lookup_tid(sc, tid);
1181 	struct inpcb *inp = toep->inp;
1182 	struct tcpcb *tp = NULL;
1183 	struct socket *so;
1184 #ifdef INVARIANTS
1185 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
1186 #endif
1187 
1188 	KASSERT(opcode == CPL_PEER_CLOSE,
1189 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
1190 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
1191 
1192 	if (__predict_false(toep->flags & TPF_SYNQE)) {
1193 #ifdef INVARIANTS
1194 		struct synq_entry *synqe = (void *)toep;
1195 
1196 		INP_WLOCK(synqe->lctx->inp);
1197 		if (synqe->flags & TPF_SYNQE_HAS_L2TE) {
1198 			KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN,
1199 			    ("%s: listen socket closed but tid %u not aborted.",
1200 			    __func__, tid));
1201 		} else {
1202 			/*
1203 			 * do_pass_accept_req is still running and will
1204 			 * eventually take care of this tid.
1205 			 */
1206 		}
1207 		INP_WUNLOCK(synqe->lctx->inp);
1208 #endif
1209 		CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid,
1210 		    toep, toep->flags);
1211 		return (0);
1212 	}
1213 
1214 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
1215 
1216 	CURVNET_SET(toep->vnet);
1217 	INP_INFO_RLOCK(&V_tcbinfo);
1218 	INP_WLOCK(inp);
1219 	tp = intotcpcb(inp);
1220 
1221 	CTR5(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x, inp %p", __func__,
1222 	    tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags, inp);
1223 
1224 	if (toep->flags & TPF_ABORT_SHUTDOWN)
1225 		goto done;
1226 
1227 	tp->rcv_nxt++;	/* FIN */
1228 
1229 	so = inp->inp_socket;
1230 	if (toep->ulp_mode == ULP_MODE_TCPDDP) {
1231 		DDP_LOCK(toep);
1232 		if (__predict_false(toep->ddp_flags &
1233 		    (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)))
1234 			handle_ddp_close(toep, tp, cpl->rcv_nxt);
1235 		DDP_UNLOCK(toep);
1236 	}
1237 	socantrcvmore(so);
1238 
1239 	if (toep->ulp_mode != ULP_MODE_RDMA) {
1240 		KASSERT(tp->rcv_nxt == be32toh(cpl->rcv_nxt),
1241 		    ("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt,
1242 		    be32toh(cpl->rcv_nxt)));
1243 	}
1244 
1245 	switch (tp->t_state) {
1246 	case TCPS_SYN_RECEIVED:
1247 		tp->t_starttime = ticks;
1248 		/* FALLTHROUGH */
1249 
1250 	case TCPS_ESTABLISHED:
1251 		tp->t_state = TCPS_CLOSE_WAIT;
1252 		break;
1253 
1254 	case TCPS_FIN_WAIT_1:
1255 		tp->t_state = TCPS_CLOSING;
1256 		break;
1257 
1258 	case TCPS_FIN_WAIT_2:
1259 		tcp_twstart(tp);
1260 		INP_UNLOCK_ASSERT(inp);	 /* safe, we have a ref on the inp */
1261 		INP_INFO_RUNLOCK(&V_tcbinfo);
1262 		CURVNET_RESTORE();
1263 
1264 		INP_WLOCK(inp);
1265 		final_cpl_received(toep);
1266 		return (0);
1267 
1268 	default:
1269 		log(LOG_ERR, "%s: TID %u received CPL_PEER_CLOSE in state %d\n",
1270 		    __func__, tid, tp->t_state);
1271 	}
1272 done:
1273 	INP_WUNLOCK(inp);
1274 	INP_INFO_RUNLOCK(&V_tcbinfo);
1275 	CURVNET_RESTORE();
1276 	return (0);
1277 }
1278 
1279 /*
1280  * Peer has ACK'd our FIN.
1281  */
1282 static int
1283 do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss,
1284     struct mbuf *m)
1285 {
1286 	struct adapter *sc = iq->adapter;
1287 	const struct cpl_close_con_rpl *cpl = (const void *)(rss + 1);
1288 	unsigned int tid = GET_TID(cpl);
1289 	struct toepcb *toep = lookup_tid(sc, tid);
1290 	struct inpcb *inp = toep->inp;
1291 	struct tcpcb *tp = NULL;
1292 	struct socket *so = NULL;
1293 #ifdef INVARIANTS
1294 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
1295 #endif
1296 
1297 	KASSERT(opcode == CPL_CLOSE_CON_RPL,
1298 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
1299 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
1300 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
1301 
1302 	CURVNET_SET(toep->vnet);
1303 	INP_INFO_RLOCK(&V_tcbinfo);
1304 	INP_WLOCK(inp);
1305 	tp = intotcpcb(inp);
1306 
1307 	CTR4(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x",
1308 	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags);
1309 
1310 	if (toep->flags & TPF_ABORT_SHUTDOWN)
1311 		goto done;
1312 
1313 	so = inp->inp_socket;
1314 	tp->snd_una = be32toh(cpl->snd_nxt) - 1;	/* exclude FIN */
1315 
1316 	switch (tp->t_state) {
1317 	case TCPS_CLOSING:	/* see TCPS_FIN_WAIT_2 in do_peer_close too */
1318 		tcp_twstart(tp);
1319 release:
1320 		INP_UNLOCK_ASSERT(inp);	/* safe, we have a ref on the  inp */
1321 		INP_INFO_RUNLOCK(&V_tcbinfo);
1322 		CURVNET_RESTORE();
1323 
1324 		INP_WLOCK(inp);
1325 		final_cpl_received(toep);	/* no more CPLs expected */
1326 
1327 		return (0);
1328 	case TCPS_LAST_ACK:
1329 		if (tcp_close(tp))
1330 			INP_WUNLOCK(inp);
1331 		goto release;
1332 
1333 	case TCPS_FIN_WAIT_1:
1334 		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
1335 			soisdisconnected(so);
1336 		tp->t_state = TCPS_FIN_WAIT_2;
1337 		break;
1338 
1339 	default:
1340 		log(LOG_ERR,
1341 		    "%s: TID %u received CPL_CLOSE_CON_RPL in state %s\n",
1342 		    __func__, tid, tcpstates[tp->t_state]);
1343 	}
1344 done:
1345 	INP_WUNLOCK(inp);
1346 	INP_INFO_RUNLOCK(&V_tcbinfo);
1347 	CURVNET_RESTORE();
1348 	return (0);
1349 }
1350 
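/*
 * Reply to a CPL_ABORT_REQ from the hardware.  rst_status is either
 * CPL_ABORT_NO_RST or CPL_ABORT_SEND_RST.
 */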
1351 void
1352 send_abort_rpl(struct adapter *sc, struct sge_wrq *ofld_txq, int tid,
1353     int rst_status)
1354 {
1355 	struct wrqe *wr;
1356 	struct cpl_abort_rpl *cpl;
1357 
1358 	wr = alloc_wrqe(sizeof(*cpl), ofld_txq);
1359 	if (wr == NULL) {
1360 		/* XXX */
1361 		panic("%s: allocation failure.", __func__);
1362 	}
1363 	cpl = wrtod(wr);
1364 
1365 	INIT_TP_WR_MIT_CPL(cpl, CPL_ABORT_RPL, tid);
1366 	cpl->cmd = rst_status;
1367 
1368 	t4_wrq_tx(sc, wr);
1369 }
1370 
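/* Translate a hardware abort reason into the errno reported to the socket. */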
1371 static int
1372 abort_status_to_errno(struct tcpcb *tp, unsigned int abort_reason)
1373 {
1374 	switch (abort_reason) {
1375 	case CPL_ERR_BAD_SYN:
1376 	case CPL_ERR_CONN_RESET:
1377 		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
1378 	case CPL_ERR_XMIT_TIMEDOUT:
1379 	case CPL_ERR_PERSIST_TIMEDOUT:
1380 	case CPL_ERR_FINWAIT2_TIMEDOUT:
1381 	case CPL_ERR_KEEPALIVE_TIMEDOUT:
1382 		return (ETIMEDOUT);
1383 	default:
1384 		return (EIO);
1385 	}
1386 }
1387 
1388 /*
1389  * TCP RST from the peer, timeout, or some other such critical error.
1390  */
1391 static int
1392 do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
1393 {
1394 	struct adapter *sc = iq->adapter;
1395 	const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
1396 	unsigned int tid = GET_TID(cpl);
1397 	struct toepcb *toep = lookup_tid(sc, tid);
1398 	struct sge_wrq *ofld_txq = toep->ofld_txq;
1399 	struct inpcb *inp;
1400 	struct tcpcb *tp;
1401 #ifdef INVARIANTS
1402 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
1403 #endif
1404 
1405 	KASSERT(opcode == CPL_ABORT_REQ_RSS,
1406 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
1407 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
1408 
1409 	if (toep->flags & TPF_SYNQE)
1410 		return (do_abort_req_synqe(iq, rss, m));
1411 
1412 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
1413 
1414 	if (negative_advice(cpl->status)) {
1415 		CTR4(KTR_CXGBE, "%s: negative advice %d for tid %d (0x%x)",
1416 		    __func__, cpl->status, tid, toep->flags);
1417 		return (0);	/* Ignore negative advice */
1418 	}
1419 
1420 	inp = toep->inp;
1421 	CURVNET_SET(toep->vnet);
1422 	INP_INFO_RLOCK(&V_tcbinfo);	/* for tcp_close */
1423 	INP_WLOCK(inp);
1424 
1425 	tp = intotcpcb(inp);
1426 
1427 	CTR6(KTR_CXGBE,
1428 	    "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x, status %d",
1429 	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags,
1430 	    inp->inp_flags, cpl->status);
1431 
1432 	/*
1433 	 * If we'd initiated an abort earlier the reply to it is responsible for
1434 	 * cleaning up resources.  Otherwise we tear everything down right here
1435 	 * right now.  We owe the T4 a CPL_ABORT_RPL no matter what.
1436 	 */
1437 	if (toep->flags & TPF_ABORT_SHUTDOWN) {
1438 		INP_WUNLOCK(inp);
1439 		goto done;
1440 	}
1441 	toep->flags |= TPF_ABORT_SHUTDOWN;
1442 
1443 	if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) {
1444 		struct socket *so = inp->inp_socket;
1445 
1446 		if (so != NULL)
1447 			so_error_set(so, abort_status_to_errno(tp,
1448 			    cpl->status));
1449 		tp = tcp_close(tp);
1450 		if (tp == NULL)
1451 			INP_WLOCK(inp);	/* re-acquire */
1452 	}
1453 
1454 	final_cpl_received(toep);
1455 done:
1456 	INP_INFO_RUNLOCK(&V_tcbinfo);
1457 	CURVNET_RESTORE();
1458 	send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
1459 	return (0);
1460 }
1461 
1462 /*
1463  * Reply to the CPL_ABORT_REQ (send_reset)
1464  */
1465 static int
1466 do_abort_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
1467 {
1468 	struct adapter *sc = iq->adapter;
1469 	const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
1470 	unsigned int tid = GET_TID(cpl);
1471 	struct toepcb *toep = lookup_tid(sc, tid);
1472 	struct inpcb *inp = toep->inp;
1473 #ifdef INVARIANTS
1474 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
1475 #endif
1476 
1477 	KASSERT(opcode == CPL_ABORT_RPL_RSS,
1478 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
1479 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
1480 
1481 	if (toep->flags & TPF_SYNQE)
1482 		return (do_abort_rpl_synqe(iq, rss, m));
1483 
1484 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
1485 
1486 	CTR5(KTR_CXGBE, "%s: tid %u, toep %p, inp %p, status %d",
1487 	    __func__, tid, toep, inp, cpl->status);
1488 
1489 	KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
1490 	    ("%s: wasn't expecting abort reply", __func__));
1491 
1492 	INP_WLOCK(inp);
1493 	final_cpl_received(toep);
1494 
1495 	return (0);
1496 }
1497 
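/*
 * Payload received on an offloaded connection (CPL_RX_DATA).  Append it to
 * the receive socket buffer, update rcv_nxt and the receive window, and deal
 * with receive buffer autosizing and DDP state changes.
 */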
1498 static int
1499 do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
1500 {
1501 	struct adapter *sc = iq->adapter;
1502 	const struct cpl_rx_data *cpl = mtod(m, const void *);
1503 	unsigned int tid = GET_TID(cpl);
1504 	struct toepcb *toep = lookup_tid(sc, tid);
1505 	struct inpcb *inp = toep->inp;
1506 	struct tcpcb *tp;
1507 	struct socket *so;
1508 	struct sockbuf *sb;
1509 	int len;
1510 	uint32_t ddp_placed = 0;
1511 
1512 	if (__predict_false(toep->flags & TPF_SYNQE)) {
1513 #ifdef INVARIANTS
1514 		struct synq_entry *synqe = (void *)toep;
1515 
1516 		INP_WLOCK(synqe->lctx->inp);
1517 		if (synqe->flags & TPF_SYNQE_HAS_L2TE) {
1518 			KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN,
1519 			    ("%s: listen socket closed but tid %u not aborted.",
1520 			    __func__, tid));
1521 		} else {
1522 			/*
1523 			 * do_pass_accept_req is still running and will
1524 			 * eventually take care of this tid.
1525 			 */
1526 		}
1527 		INP_WUNLOCK(synqe->lctx->inp);
1528 #endif
1529 		CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid,
1530 		    toep, toep->flags);
1531 		m_freem(m);
1532 		return (0);
1533 	}
1534 
1535 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
1536 
1537 	/* strip off CPL header */
1538 	m_adj(m, sizeof(*cpl));
1539 	len = m->m_pkthdr.len;
1540 
1541 	INP_WLOCK(inp);
1542 	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) {
1543 		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
1544 		    __func__, tid, len, inp->inp_flags);
1545 		INP_WUNLOCK(inp);
1546 		m_freem(m);
1547 		return (0);
1548 	}
1549 
1550 	tp = intotcpcb(inp);
1551 
1552 	if (__predict_false(tp->rcv_nxt != be32toh(cpl->seq)))
1553 		ddp_placed = be32toh(cpl->seq) - tp->rcv_nxt;
1554 
1555 	tp->rcv_nxt += len;
1556 	if (tp->rcv_wnd < len) {
1557 		KASSERT(toep->ulp_mode == ULP_MODE_RDMA,
1558 		    ("%s: negative window size", __func__));
1559 	}
1560 
1561 	tp->rcv_wnd -= len;
1562 	tp->t_rcvtime = ticks;
1563 
1564 	if (toep->ulp_mode == ULP_MODE_TCPDDP)
1565 		DDP_LOCK(toep);
1566 	so = inp_inpcbtosocket(inp);
1567 	sb = &so->so_rcv;
1568 	SOCKBUF_LOCK(sb);
1569 
1570 	if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
1571 		CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)",
1572 		    __func__, tid, len);
1573 		m_freem(m);
1574 		SOCKBUF_UNLOCK(sb);
1575 		if (toep->ulp_mode == ULP_MODE_TCPDDP)
1576 			DDP_UNLOCK(toep);
1577 		INP_WUNLOCK(inp);
1578 
1579 		CURVNET_SET(toep->vnet);
1580 		INP_INFO_RLOCK(&V_tcbinfo);
1581 		INP_WLOCK(inp);
1582 		tp = tcp_drop(tp, ECONNRESET);
1583 		if (tp)
1584 			INP_WUNLOCK(inp);
1585 		INP_INFO_RUNLOCK(&V_tcbinfo);
1586 		CURVNET_RESTORE();
1587 
1588 		return (0);
1589 	}
1590 
1591 	/* receive buffer autosize */
1592 	MPASS(toep->vnet == so->so_vnet);
1593 	CURVNET_SET(toep->vnet);
1594 	if (sb->sb_flags & SB_AUTOSIZE &&
1595 	    V_tcp_do_autorcvbuf &&
1596 	    sb->sb_hiwat < V_tcp_autorcvbuf_max &&
1597 	    len > (sbspace(sb) / 8 * 7)) {
1598 		unsigned int hiwat = sb->sb_hiwat;
1599 		unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc,
1600 		    V_tcp_autorcvbuf_max);
1601 
1602 		if (!sbreserve_locked(sb, newsize, so, NULL))
1603 			sb->sb_flags &= ~SB_AUTOSIZE;
1604 		else
1605 			toep->rx_credits += newsize - hiwat;
1606 	}
1607 
1608 	if (toep->ddp_waiting_count != 0 || toep->ddp_active_count != 0)
1609 		CTR3(KTR_CXGBE, "%s: tid %u, non-ddp rx (%d bytes)", __func__,
1610 		    tid, len);
1611 
1612 	if (toep->ulp_mode == ULP_MODE_TCPDDP) {
1613 		int changed = !(toep->ddp_flags & DDP_ON) ^ cpl->ddp_off;
1614 
1615 		if (changed) {
1616 			if (toep->ddp_flags & DDP_SC_REQ)
1617 				toep->ddp_flags ^= DDP_ON | DDP_SC_REQ;
1618 			else {
1619 				KASSERT(cpl->ddp_off == 1,
1620 				    ("%s: DDP switched on by itself.",
1621 				    __func__));
1622 
1623 				/* Fell out of DDP mode */
1624 				toep->ddp_flags &= ~DDP_ON;
1625 				CTR1(KTR_CXGBE, "%s: fell out of DDP mode",
1626 				    __func__);
1627 
1628 				insert_ddp_data(toep, ddp_placed);
1629 			}
1630 		}
1631 
1632 		if (toep->ddp_flags & DDP_ON) {
1633 			/*
1634 			 * CPL_RX_DATA with DDP on can only be an indicate.
1635 			 * Start posting queued AIO requests via DDP.  The
1636 			 * payload that arrived in this indicate is appended
1637 			 * to the socket buffer as usual.
1638 			 */
1639 			handle_ddp_indicate(toep);
1640 		}
1641 	}
1642 
1643 	KASSERT(toep->sb_cc >= sbused(sb),
1644 	    ("%s: sb %p has more data (%d) than last time (%d).",
1645 	    __func__, sb, sbused(sb), toep->sb_cc));
1646 	toep->rx_credits += toep->sb_cc - sbused(sb);
1647 	sbappendstream_locked(sb, m, 0);
1648 	toep->sb_cc = sbused(sb);
1649 	if (toep->rx_credits > 0 && toep->sb_cc + tp->rcv_wnd < sb->sb_lowat) {
1650 		int credits;
1651 
1652 		credits = send_rx_credits(sc, toep, toep->rx_credits);
1653 		toep->rx_credits -= credits;
1654 		tp->rcv_wnd += credits;
1655 		tp->rcv_adv += credits;
1656 	}
1657 
1658 	if (toep->ddp_waiting_count > 0 && sbavail(sb) != 0) {
1659 		CTR2(KTR_CXGBE, "%s: tid %u queueing AIO task", __func__,
1660 		    tid);
1661 		ddp_queue_toep(toep);
1662 	}
1663 	sorwakeup_locked(so);
1664 	SOCKBUF_UNLOCK_ASSERT(sb);
1665 	if (toep->ulp_mode == ULP_MODE_TCPDDP)
1666 		DDP_UNLOCK(toep);
1667 
1668 	INP_WUNLOCK(inp);
1669 	CURVNET_RESTORE();
1670 	return (0);
1671 }
1672 
1673 #define S_CPL_FW4_ACK_OPCODE    24
1674 #define M_CPL_FW4_ACK_OPCODE    0xff
1675 #define V_CPL_FW4_ACK_OPCODE(x) ((x) << S_CPL_FW4_ACK_OPCODE)
1676 #define G_CPL_FW4_ACK_OPCODE(x) \
1677     (((x) >> S_CPL_FW4_ACK_OPCODE) & M_CPL_FW4_ACK_OPCODE)
1678 
1679 #define S_CPL_FW4_ACK_FLOWID    0
1680 #define M_CPL_FW4_ACK_FLOWID    0xffffff
1681 #define V_CPL_FW4_ACK_FLOWID(x) ((x) << S_CPL_FW4_ACK_FLOWID)
1682 #define G_CPL_FW4_ACK_FLOWID(x) \
1683     (((x) >> S_CPL_FW4_ACK_FLOWID) & M_CPL_FW4_ACK_FLOWID)
1684 
1685 #define S_CPL_FW4_ACK_CR        24
1686 #define M_CPL_FW4_ACK_CR        0xff
1687 #define V_CPL_FW4_ACK_CR(x)     ((x) << S_CPL_FW4_ACK_CR)
1688 #define G_CPL_FW4_ACK_CR(x)     (((x) >> S_CPL_FW4_ACK_CR) & M_CPL_FW4_ACK_CR)
1689 
1690 #define S_CPL_FW4_ACK_SEQVAL    0
1691 #define M_CPL_FW4_ACK_SEQVAL    0x1
1692 #define V_CPL_FW4_ACK_SEQVAL(x) ((x) << S_CPL_FW4_ACK_SEQVAL)
1693 #define G_CPL_FW4_ACK_SEQVAL(x) \
1694     (((x) >> S_CPL_FW4_ACK_SEQVAL) & M_CPL_FW4_ACK_SEQVAL)
1695 #define F_CPL_FW4_ACK_SEQVAL    V_CPL_FW4_ACK_SEQVAL(1U)
1696 
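/*
 * Tx credits (and possibly an updated snd_una) returned by the chip.
 * Reclaim completed tx descriptors, release acked data from the send buffer
 * (or the PDU reclaim queue for iSCSI), and resume tx if it was suspended
 * for lack of credits.
 */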
1697 static int
1698 do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
1699 {
1700 	struct adapter *sc = iq->adapter;
1701 	const struct cpl_fw4_ack *cpl = (const void *)(rss + 1);
1702 	unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl)));
1703 	struct toepcb *toep = lookup_tid(sc, tid);
1704 	struct inpcb *inp;
1705 	struct tcpcb *tp;
1706 	struct socket *so;
1707 	uint8_t credits = cpl->credits;
1708 	struct ofld_tx_sdesc *txsd;
1709 	int plen;
1710 #ifdef INVARIANTS
1711 	unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl)));
1712 #endif
1713 
1714 	/*
1715 	 * Very unusual case: we'd sent a flowc + abort_req for a synq entry and
1716 	 * now this comes back carrying the credits for the flowc.
1717 	 */
1718 	if (__predict_false(toep->flags & TPF_SYNQE)) {
1719 		KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
1720 		    ("%s: credits for a synq entry %p", __func__, toep));
1721 		return (0);
1722 	}
1723 
1724 	inp = toep->inp;
1725 
1726 	KASSERT(opcode == CPL_FW4_ACK,
1727 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
1728 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
1729 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
1730 
1731 	INP_WLOCK(inp);
1732 
1733 	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) {
1734 		INP_WUNLOCK(inp);
1735 		return (0);
1736 	}
1737 
1738 	KASSERT((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0,
1739 	    ("%s: inp_flags 0x%x", __func__, inp->inp_flags));
1740 
1741 	tp = intotcpcb(inp);
1742 
1743 	if (cpl->flags & CPL_FW4_ACK_FLAGS_SEQVAL) {
1744 		tcp_seq snd_una = be32toh(cpl->snd_una);
1745 
1746 #ifdef INVARIANTS
1747 		if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
1748 			log(LOG_ERR,
1749 			    "%s: unexpected seq# %x for TID %u, snd_una %x\n",
1750 			    __func__, snd_una, toep->tid, tp->snd_una);
1751 		}
1752 #endif
1753 
1754 		if (tp->snd_una != snd_una) {
1755 			tp->snd_una = snd_una;
1756 			tp->ts_recent_age = tcp_ts_getticks();
1757 		}
1758 	}
1759 
1760 #ifdef VERBOSE_TRACES
1761 	CTR3(KTR_CXGBE, "%s: tid %d credits %u", __func__, tid, credits);
1762 #endif
1763 	so = inp->inp_socket;
1764 	txsd = &toep->txsd[toep->txsd_cidx];
1765 	plen = 0;
1766 	while (credits) {
1767 		KASSERT(credits >= txsd->tx_credits,
1768 		    ("%s: too many (or partial) credits", __func__));
1769 		credits -= txsd->tx_credits;
1770 		toep->tx_credits += txsd->tx_credits;
1771 		plen += txsd->plen;
1772 		txsd++;
1773 		toep->txsd_avail++;
1774 		KASSERT(toep->txsd_avail <= toep->txsd_total,
1775 		    ("%s: txsd avail > total", __func__));
1776 		if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) {
1777 			txsd = &toep->txsd[0];
1778 			toep->txsd_cidx = 0;
1779 		}
1780 	}
1781 
1782 	if (toep->tx_credits == toep->tx_total) {
1783 		toep->tx_nocompl = 0;
1784 		toep->plen_nocompl = 0;
1785 	}
1786 
1787 	if (toep->flags & TPF_TX_SUSPENDED &&
1788 	    toep->tx_credits >= toep->tx_total / 4) {
1789 #ifdef VERBOSE_TRACES
1790 		CTR2(KTR_CXGBE, "%s: tid %d calling t4_push_frames", __func__,
1791 		    tid);
1792 #endif
1793 		toep->flags &= ~TPF_TX_SUSPENDED;
1794 		CURVNET_SET(toep->vnet);
1795 		if (toep->ulp_mode == ULP_MODE_ISCSI)
1796 			t4_push_pdus(sc, toep, plen);
1797 		else
1798 			t4_push_frames(sc, toep, plen);
1799 		CURVNET_RESTORE();
1800 	} else if (plen > 0) {
1801 		struct sockbuf *sb = &so->so_snd;
1802 		int sbu;
1803 
1804 		SOCKBUF_LOCK(sb);
1805 		sbu = sbused(sb);
1806 		if (toep->ulp_mode == ULP_MODE_ISCSI) {
1807 
1808 			if (__predict_false(sbu > 0)) {
1809 				/*
1810 				 * The data transmitted before the tid's ULP mode
1811 				 * changed to ISCSI is still in so_snd.
1812 				 * Incoming credits should account for so_snd
1813 				 * first.
1814 				 */
1815 				sbdrop_locked(sb, min(sbu, plen));
1816 				plen -= min(sbu, plen);
1817 			}
1818 			sowwakeup_locked(so);	/* unlocks so_snd */
1819 			rqdrop_locked(&toep->ulp_pdu_reclaimq, plen);
1820 		} else {
1821 #ifdef VERBOSE_TRACES
1822 			CTR3(KTR_CXGBE, "%s: tid %d dropped %d bytes", __func__,
1823 			    tid, plen);
1824 #endif
1825 			sbdrop_locked(sb, plen);
1826 			if (!TAILQ_EMPTY(&toep->aiotx_jobq))
1827 				t4_aiotx_queue_toep(toep);
1828 			sowwakeup_locked(so);	/* unlocks so_snd */
1829 		}
1830 		SOCKBUF_UNLOCK_ASSERT(sb);
1831 	}
1832 
1833 	INP_WUNLOCK(inp);
1834 
1835 	return (0);
1836 }
1837 
1838 int
1839 do_set_tcb_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
1840 {
1841 	struct adapter *sc = iq->adapter;
1842 	const struct cpl_set_tcb_rpl *cpl = (const void *)(rss + 1);
1843 	unsigned int tid = GET_TID(cpl);
1844 	struct toepcb *toep;
1845 #ifdef INVARIANTS
1846 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
1847 #endif
1848 
1849 	KASSERT(opcode == CPL_SET_TCB_RPL,
1850 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
1851 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
1852 	MPASS(iq != &sc->sge.fwq);
1853 
1854 	toep = lookup_tid(sc, tid);
1855 	if (toep->ulp_mode == ULP_MODE_TCPDDP) {
1856 		handle_ddp_tcb_rpl(toep, cpl);
1857 		return (0);
1858 	}
1859 
1860 	/*
1861 	 * TOM and/or other ULPs don't request replies for CPL_SET_TCB or
1862 	 * CPL_SET_TCB_FIELD requests.  This can easily change, and when it
1863 	 * does, the code to dispatch those replies will go here.
1864 	 */
1865 #ifdef INVARIANTS
1866 	panic("%s: Unexpected CPL_SET_TCB_RPL for tid %u on iq %p", __func__,
1867 	    tid, iq);
1868 #else
1869 	log(LOG_ERR, "%s: Unexpected CPL_SET_TCB_RPL for tid %u on iq %p\n",
1870 	    __func__, tid, iq);
1871 #endif
1872 
1873 	return (0);
1874 }
1875 
1876 void
1877 t4_set_tcb_field(struct adapter *sc, struct sge_wrq *wrq, int tid,
1878     uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie, int iqid)
1879 {
1880 	struct wrqe *wr;
1881 	struct cpl_set_tcb_field *req;
1882 
1883 	MPASS((cookie & ~M_COOKIE) == 0);
1884 	MPASS((iqid & ~M_QUEUENO) == 0);
1885 
1886 	wr = alloc_wrqe(sizeof(*req), wrq);
1887 	if (wr == NULL) {
1888 		/* XXX */
1889 		panic("%s: allocation failure.", __func__);
1890 	}
1891 	req = wrtod(wr);
1892 
1893 	INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, tid);
1894 	req->reply_ctrl = htobe16(V_QUEUENO(iqid));
1895 	if (reply == 0)
1896 		req->reply_ctrl |= htobe16(F_NO_REPLY);
1897 	req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(cookie));
1898 	req->mask = htobe64(mask);
1899 	req->val = htobe64(val);
1900 
1901 	t4_wrq_tx(sc, wr);
1902 }
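
/*
 * Example (illustrative only; the field, values, and queues here are not
 * taken from this file): clear the Nagle bit in a tid's TCB without
 * requesting a reply:
 *
 *	t4_set_tcb_field(sc, toep->ctrlq, toep->tid, W_TCB_T_FLAGS,
 *	    V_TF_NAGLE(1), V_TF_NAGLE(0), 0, 0, toep->ofld_rxq->iq.abs_id);
 */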
1903 
1904 void
1905 t4_init_cpl_io_handlers(void)
1906 {
1907 
1908 	t4_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
1909 	t4_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
1910 	t4_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
1911 	t4_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
1912 	t4_register_cpl_handler(CPL_RX_DATA, do_rx_data);
1913 	t4_register_cpl_handler(CPL_FW4_ACK, do_fw4_ack);
1914 }
1915 
1916 void
1917 t4_uninit_cpl_io_handlers(void)
1918 {
1919 
1920 	t4_register_cpl_handler(CPL_PEER_CLOSE, NULL);
1921 	t4_register_cpl_handler(CPL_CLOSE_CON_RPL, NULL);
1922 	t4_register_cpl_handler(CPL_ABORT_REQ_RSS, NULL);
1923 	t4_register_cpl_handler(CPL_ABORT_RPL_RSS, NULL);
1924 	t4_register_cpl_handler(CPL_RX_DATA, NULL);
1925 	t4_register_cpl_handler(CPL_FW4_ACK, NULL);
1926 }
1927 
1928 /*
1929  * Use the 'backend3' field in AIO jobs to store the amount of data
1930  * sent by the AIO job so far and the 'backend4' field to hold an
1931  * error that should be reported when the job is completed.
1932  */
1933 #define	aio_sent	backend3
1934 #define	aio_error	backend4
1935 
1936 #define	jobtotid(job)							\
1937 	(((struct toepcb *)(so_sototcpcb((job)->fd_file->f_data)->t_toe))->tid)
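
/*
 * jobtotid() walks job -> file -> socket -> tcpcb -> toepcb to recover the
 * tid; it is used only from the VERBOSE_TRACES trace points below.
 */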
1938 
1939 static void
1940 free_aiotx_buffer(struct aiotx_buffer *ab)
1941 {
1942 	struct kaiocb *job;
1943 	long status;
1944 	int error;
1945 
1946 	if (refcount_release(&ab->refcount) == 0)
1947 		return;
1948 
1949 	job = ab->job;
1950 	error = job->aio_error;
1951 	status = job->aio_sent;
1952 	vm_page_unhold_pages(ab->ps.pages, ab->ps.npages);
1953 	free(ab, M_CXGBE);
1954 #ifdef VERBOSE_TRACES
1955 	CTR5(KTR_CXGBE, "%s: tid %d completed %p len %ld, error %d", __func__,
1956 	    jobtotid(job), job, status, error);
1957 #endif
1958 	if (error == ECANCELED && status != 0)
1959 		error = 0;
1960 	if (error == ECANCELED)
1961 		aio_cancel(job);
1962 	else if (error)
1963 		aio_complete(job, -1, error);
1964 	else
1965 		aio_complete(job, status, 0);
1966 }
1967 
1968 static void
1969 t4_aiotx_mbuf_free(struct mbuf *m)
1970 {
1971 	struct aiotx_buffer *ab = m->m_ext.ext_arg1;
1972 
1973 #ifdef VERBOSE_TRACES
1974 	CTR3(KTR_CXGBE, "%s: completed %d bytes for tid %d", __func__,
1975 	    m->m_len, jobtotid(ab->job));
1976 #endif
1977 	free_aiotx_buffer(ab);
1978 }
1979 
1980 /*
1981  * Wire down the pages backing an AIO request's buffer and attach an AIO
1982  * transmit buffer describing them to the job (in backend1).
1983  */
1984 static int
1985 hold_aio(struct kaiocb *job)
1986 {
1987 	struct aiotx_buffer *ab;
1988 	struct vmspace *vm;
1989 	vm_map_t map;
1990 	vm_offset_t start, end, pgoff;
1991 	int n;
1992 
1993 	MPASS(job->backend1 == NULL);
1994 
1995 	/*
1996 	 * The AIO subsystem will cancel and drain all requests before
1997 	 * permitting a process to exit or exec, so p_vmspace should
1998 	 * be stable here.
1999 	 */
2000 	vm = job->userproc->p_vmspace;
2001 	map = &vm->vm_map;
2002 	start = (uintptr_t)job->uaiocb.aio_buf;
2003 	pgoff = start & PAGE_MASK;
2004 	end = round_page(start + job->uaiocb.aio_nbytes);
2005 	start = trunc_page(start);
2006 	n = atop(end - start);
2007 
2008 	ab = malloc(sizeof(*ab) + n * sizeof(vm_page_t), M_CXGBE, M_WAITOK |
2009 	    M_ZERO);
2010 	refcount_init(&ab->refcount, 1);
2011 	ab->ps.pages = (vm_page_t *)(ab + 1);
2012 	ab->ps.npages = vm_fault_quick_hold_pages(map, start, end - start,
2013 	    VM_PROT_WRITE, ab->ps.pages, n);
2014 	if (ab->ps.npages < 0) {
2015 		free(ab, M_CXGBE);
2016 		return (EFAULT);
2017 	}
2018 
2019 	KASSERT(ab->ps.npages == n,
2020 	    ("hold_aio: page count mismatch: %d vs %d", ab->ps.npages, n));
2021 
2022 	ab->ps.offset = pgoff;
2023 	ab->ps.len = job->uaiocb.aio_nbytes;
2024 	ab->job = job;
2025 	job->backend1 = ab;
2026 #ifdef VERBOSE_TRACES
2027 	CTR5(KTR_CXGBE, "%s: tid %d, new pageset %p for job %p, npages %d",
2028 	    __func__, jobtotid(job), &ab->ps, job, ab->ps.npages);
2029 #endif
2030 	return (0);
2031 }
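
/*
 * Pages wired by hold_aio() stay held until the last reference on the aiotx
 * buffer is released, at which point free_aiotx_buffer() unholds them and
 * completes (or cancels) the job.
 */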
2032 
2033 static void
2034 t4_aiotx_process_job(struct toepcb *toep, struct socket *so, struct kaiocb *job)
2035 {
2036 	struct adapter *sc;
2037 	struct sockbuf *sb;
2038 	struct file *fp;
2039 	struct aiotx_buffer *ab;
2040 	struct inpcb *inp;
2041 	struct tcpcb *tp;
2042 	struct mbuf *m;
2043 	int error;
2044 	bool moretocome, sendmore;
2045 
2046 	sc = td_adapter(toep->td);
2047 	sb = &so->so_snd;
2048 	SOCKBUF_UNLOCK(sb);
2049 	fp = job->fd_file;
2050 	ab = job->backend1;
2051 	m = NULL;
2052 
2053 #ifdef MAC
2054 	error = mac_socket_check_send(fp->f_cred, so);
2055 	if (error != 0)
2056 		goto out;
2057 #endif
2058 
2059 	if (ab == NULL) {
2060 		error = hold_aio(job);
2061 		if (error != 0)
2062 			goto out;
2063 		ab = job->backend1;
2064 	}
2065 
2066 	/* Inline sosend_generic(). */
2067 
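	/*
	 * The checks below mirror sosend_generic(): a half-closed socket gets
	 * EPIPE (and SIGPIPE unless SO_NOSIGPIPE is set), a pending so_error
	 * is returned, an unconnected socket gets ENOTCONN, and a send buffer
	 * with less free space than its low-water mark causes the job to be
	 * requeued instead of putting the thread to sleep.
	 */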
2068 	job->msgsnd = 1;
2069 
2070 	error = sblock(sb, SBL_WAIT);
2071 	MPASS(error == 0);
2072 
2073 sendanother:
2074 	m = m_get(M_WAITOK, MT_DATA);
2075 
2076 	SOCKBUF_LOCK(sb);
2077 	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
2078 		SOCKBUF_UNLOCK(sb);
2079 		sbunlock(sb);
2080 		if ((so->so_options & SO_NOSIGPIPE) == 0) {
2081 			PROC_LOCK(job->userproc);
2082 			kern_psignal(job->userproc, SIGPIPE);
2083 			PROC_UNLOCK(job->userproc);
2084 		}
2085 		error = EPIPE;
2086 		goto out;
2087 	}
2088 	if (so->so_error) {
2089 		error = so->so_error;
2090 		so->so_error = 0;
2091 		SOCKBUF_UNLOCK(sb);
2092 		sbunlock(sb);
2093 		goto out;
2094 	}
2095 	if ((so->so_state & SS_ISCONNECTED) == 0) {
2096 		SOCKBUF_UNLOCK(sb);
2097 		sbunlock(sb);
2098 		error = ENOTCONN;
2099 		goto out;
2100 	}
2101 	if (sbspace(sb) < sb->sb_lowat) {
2102 		MPASS(job->aio_sent == 0 || !(so->so_state & SS_NBIO));
2103 
2104 		/*
2105 		 * Don't block if there is too little room in the socket
2106 		 * buffer.  Instead, requeue the request.
2107 		 */
2108 		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
2109 			SOCKBUF_UNLOCK(sb);
2110 			sbunlock(sb);
2111 			error = ECANCELED;
2112 			goto out;
2113 		}
2114 		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
2115 		SOCKBUF_UNLOCK(sb);
2116 		sbunlock(sb);
2117 		goto out;
2118 	}
2119 
2120 	/*
2121 	 * Write as much data as the socket permits, but no more than a
2122 	 * single sndbuf at a time.
2123 	 */
2124 	m->m_len = sbspace(sb);
2125 	if (m->m_len > ab->ps.len - job->aio_sent) {
2126 		m->m_len = ab->ps.len - job->aio_sent;
2127 		moretocome = false;
2128 	} else
2129 		moretocome = true;
2130 	if (m->m_len > sc->tt.sndbuf) {
2131 		m->m_len = sc->tt.sndbuf;
2132 		sendmore = true;
2133 	} else
2134 		sendmore = false;
2135 
2136 	if (!TAILQ_EMPTY(&toep->aiotx_jobq))
2137 		moretocome = true;
2138 	SOCKBUF_UNLOCK(sb);
2139 	MPASS(m->m_len != 0);
2140 
2141 	/* Inlined tcp_usr_send(). */
2142 
2143 	inp = toep->inp;
2144 	INP_WLOCK(inp);
2145 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
2146 		INP_WUNLOCK(inp);
2147 		sbunlock(sb);
2148 		error = ECONNRESET;
2149 		goto out;
2150 	}
2151 
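	/*
	 * Attach the wired AIO pages to the mbuf as external storage:
	 * ext_arg1 points at the aiotx buffer and ext_arg2 records this
	 * mbuf's starting offset within the AIO data, which is how the
	 * backing pages are found when the payload is transmitted.  Each
	 * such mbuf holds a reference on the buffer that is dropped from
	 * t4_aiotx_mbuf_free().
	 */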
2152 	refcount_acquire(&ab->refcount);
2153 	m_extadd(m, NULL, ab->ps.len, t4_aiotx_mbuf_free, ab,
2154 	    (void *)(uintptr_t)job->aio_sent, 0, EXT_NET_DRV);
2155 	m->m_ext.ext_flags |= EXT_FLAG_AIOTX;
2156 	job->aio_sent += m->m_len;
2157 
2158 	sbappendstream(sb, m, 0);
2159 	m = NULL;
2160 
2161 	if (!(inp->inp_flags & INP_DROPPED)) {
2162 		tp = intotcpcb(inp);
2163 		if (moretocome)
2164 			tp->t_flags |= TF_MORETOCOME;
2165 		error = tp->t_fb->tfb_tcp_output(tp);
2166 		if (moretocome)
2167 			tp->t_flags &= ~TF_MORETOCOME;
2168 	}
2169 
2170 	INP_WUNLOCK(inp);
2171 	if (sendmore)
2172 		goto sendanother;
2173 	sbunlock(sb);
2174 
2175 	if (error)
2176 		goto out;
2177 
2178 	/*
2179 	 * If this is a blocking socket and the request has not
2180 	 * been fully completed, requeue it until the socket is ready
2181 	 * again.
2182 	 */
2183 	if (job->aio_sent < job->uaiocb.aio_nbytes &&
2184 	    !(so->so_state & SS_NBIO)) {
2185 		SOCKBUF_LOCK(sb);
2186 		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
2187 			SOCKBUF_UNLOCK(sb);
2188 			error = ECANCELED;
2189 			goto out;
2190 		}
2191 		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
2192 		return;
2193 	}
2194 
2195 	/*
2196 	 * If the request will not be requeued, drop a reference on
2197 	 * the aiotx buffer.  Any mbufs in flight should still
2198 	 * contain a reference, but this drops the reference that the
2199 	 * job owns while it is waiting to queue mbufs to the socket.
2200 	 */
2201 	free_aiotx_buffer(ab);
2202 
2203 out:
2204 	if (error) {
2205 		if (ab != NULL) {
2206 			job->aio_error = error;
2207 			free_aiotx_buffer(ab);
2208 		} else {
2209 			MPASS(job->aio_sent == 0);
2210 			aio_complete(job, -1, error);
2211 		}
2212 	}
2213 	if (m != NULL)
2214 		m_free(m);
2215 	SOCKBUF_LOCK(sb);
2216 }
2217 
2218 static void
2219 t4_aiotx_task(void *context, int pending)
2220 {
2221 	struct toepcb *toep = context;
2222 	struct inpcb *inp = toep->inp;
2223 	struct socket *so = inp->inp_socket;
2224 	struct kaiocb *job;
2225 
2226 	CURVNET_SET(toep->vnet);
2227 	SOCKBUF_LOCK(&so->so_snd);
2228 	while (!TAILQ_EMPTY(&toep->aiotx_jobq) && sowriteable(so)) {
2229 		job = TAILQ_FIRST(&toep->aiotx_jobq);
2230 		TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
2231 		if (!aio_clear_cancel_function(job))
2232 			continue;
2233 
2234 		t4_aiotx_process_job(toep, so, job);
2235 	}
2236 	toep->aiotx_task_active = false;
2237 	SOCKBUF_UNLOCK(&so->so_snd);
2238 	CURVNET_RESTORE();
2239 
2240 	free_toepcb(toep);
2241 }
2242 
2243 static void
2244 t4_aiotx_queue_toep(struct toepcb *toep)
2245 {
2246 
2247 	SOCKBUF_LOCK_ASSERT(&toep->inp->inp_socket->so_snd);
2248 #ifdef VERBOSE_TRACES
2249 	CTR3(KTR_CXGBE, "%s: queueing aiotx task for tid %d, active = %s",
2250 	    __func__, toep->tid, toep->aiotx_task_active ? "true" : "false");
2251 #endif
2252 	if (toep->aiotx_task_active)
2253 		return;
2254 	toep->aiotx_task_active = true;
2255 	hold_toepcb(toep);
2256 	soaio_enqueue(&toep->aiotx_task);
2257 }
2258 
2259 static void
2260 t4_aiotx_cancel(struct kaiocb *job)
2261 {
2262 	struct aiotx_buffer *ab;
2263 	struct socket *so;
2264 	struct sockbuf *sb;
2265 	struct tcpcb *tp;
2266 	struct toepcb *toep;
2267 
2268 	so = job->fd_file->f_data;
2269 	tp = so_sototcpcb(so);
2270 	toep = tp->t_toe;
2271 	MPASS(job->uaiocb.aio_lio_opcode == LIO_WRITE);
2272 	sb = &so->so_snd;
2273 
2274 	SOCKBUF_LOCK(sb);
2275 	if (!aio_cancel_cleared(job))
2276 		TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
2277 	SOCKBUF_UNLOCK(sb);
2278 
2279 	ab = job->backend1;
2280 	if (ab != NULL)
2281 		free_aiotx_buffer(ab);
2282 	else
2283 		aio_cancel(job);
2284 }
2285 
2286 int
2287 t4_aio_queue_aiotx(struct socket *so, struct kaiocb *job)
2288 {
2289 	struct tcpcb *tp = so_sototcpcb(so);
2290 	struct toepcb *toep = tp->t_toe;
2291 	struct adapter *sc = td_adapter(toep->td);
2292 
2293 	/* This only handles writes. */
2294 	if (job->uaiocb.aio_lio_opcode != LIO_WRITE)
2295 		return (EOPNOTSUPP);
2296 
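	/* Zero-copy AIO transmit is opt-in via the adapter's tt.tx_zcopy knob. */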
2297 	if (!sc->tt.tx_zcopy)
2298 		return (EOPNOTSUPP);
2299 
2300 	SOCKBUF_LOCK(&so->so_snd);
2301 #ifdef VERBOSE_TRACES
2302 	CTR2(KTR_CXGBE, "%s: queueing %p", __func__, job);
2303 #endif
2304 	if (!aio_set_cancel_function(job, t4_aiotx_cancel))
2305 		panic("new job was cancelled");
2306 	TAILQ_INSERT_TAIL(&toep->aiotx_jobq, job, list);
2307 	if (sowriteable(so))
2308 		t4_aiotx_queue_toep(toep);
2309 	SOCKBUF_UNLOCK(&so->so_snd);
2310 	return (0);
2311 }
2312 
2313 void
2314 aiotx_init_toep(struct toepcb *toep)
2315 {
2316 
2317 	TAILQ_INIT(&toep->aiotx_jobq);
2318 	TASK_INIT(&toep->aiotx_task, 0, t4_aiotx_task, toep);
2319 }
2320 #endif
2321