xref: /freebsd/sys/dev/cxgbe/cxgbei/cxgbei.c (revision 8c2dd68caa963f1900a8228b0732b04f5d530ffa)
1 /*-
2  * Copyright (c) 2012 Chelsio Communications, Inc.
3  * All rights reserved.
4  *
5  * Chelsio T5xx iSCSI driver
6  *
7  * Written by: Sreenivasa Honnur <shonnur@chelsio.com>
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include "opt_inet.h"
35 #include "opt_inet6.h"
36 
37 #include <sys/types.h>
38 #include <sys/param.h>
39 #include <sys/kernel.h>
40 #include <sys/ktr.h>
41 #include <sys/module.h>
42 #include <sys/systm.h>
43 
44 #ifdef TCP_OFFLOAD
45 #include <sys/errno.h>
46 #include <sys/kthread.h>
47 #include <sys/smp.h>
48 #include <sys/socket.h>
49 #include <sys/socketvar.h>
50 #include <sys/mbuf.h>
51 #include <sys/lock.h>
52 #include <sys/mutex.h>
53 #include <sys/condvar.h>
54 
55 #include <netinet/in.h>
56 #include <netinet/in_pcb.h>
57 #include <netinet/toecore.h>
58 #include <netinet/tcp_var.h>
59 #include <netinet/tcp_fsm.h>
60 
61 #include <cam/scsi/scsi_all.h>
62 #include <cam/scsi/scsi_da.h>
63 #include <cam/ctl/ctl_io.h>
64 #include <cam/ctl/ctl.h>
65 #include <cam/ctl/ctl_backend.h>
66 #include <cam/ctl/ctl_error.h>
67 #include <cam/ctl/ctl_frontend.h>
68 #include <cam/ctl/ctl_debug.h>
69 #include <cam/ctl/ctl_ha.h>
70 #include <cam/ctl/ctl_ioctl.h>
71 
72 #include <dev/iscsi/icl.h>
73 #include <dev/iscsi/iscsi_proto.h>
74 #include <dev/iscsi/iscsi_ioctl.h>
75 #include <dev/iscsi/iscsi.h>
76 #include <cam/ctl/ctl_frontend_iscsi.h>
77 
78 #include <cam/cam.h>
79 #include <cam/cam_ccb.h>
80 #include <cam/cam_xpt.h>
81 #include <cam/cam_debug.h>
82 #include <cam/cam_sim.h>
83 #include <cam/cam_xpt_sim.h>
84 #include <cam/cam_xpt_periph.h>
85 #include <cam/cam_periph.h>
86 #include <cam/cam_compat.h>
87 #include <cam/scsi/scsi_message.h>
88 
89 #include "common/common.h"
90 #include "common/t4_msg.h"
91 #include "common/t4_regs.h"	/* for PCIE_MEM_ACCESS */
92 #include "tom/t4_tom.h"
93 #include "cxgbei.h"
94 
/* Process handle passed to kproc_kthread_add() for every worker thread. */
static struct proc *cxgbei_proc;
/* Array of per-worker-thread state, allocated in start_worker_threads(). */
static struct cxgbei_worker_thread_softc *cwt_softc;
/* Number of entries of cwt_softc that have a running worker thread. */
static int worker_thread_count;
98 
99 static void
100 read_pdu_limits(struct adapter *sc, uint32_t *max_tx_pdu_len,
101     uint32_t *max_rx_pdu_len)
102 {
103 	uint32_t tx_len, rx_len, r, v;
104 
105 	rx_len = t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE);
106 	tx_len = t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE);
107 
108 	r = t4_read_reg(sc, A_TP_PARA_REG2);
109 	rx_len = min(rx_len, G_MAXRXDATA(r));
110 	tx_len = min(tx_len, G_MAXRXDATA(r));
111 
112 	r = t4_read_reg(sc, A_TP_PARA_REG7);
113 	v = min(G_PMMAXXFERLEN0(r), G_PMMAXXFERLEN1(r));
114 	rx_len = min(rx_len, v);
115 	tx_len = min(tx_len, v);
116 
117 	*max_tx_pdu_len = rounddown2(tx_len, 512);
118 	*max_rx_pdu_len = rounddown2(rx_len, 512);
119 }
120 
121 /*
122  * Initialize the software state of the iSCSI ULP driver.
123  *
124  * ENXIO means firmware didn't set up something that it was supposed to.
125  */
126 static int
127 cxgbei_init(struct adapter *sc, struct cxgbei_data *ci)
128 {
129 	struct sysctl_oid *oid;
130 	struct sysctl_oid_list *children;
131 	struct ppod_region *pr;
132 	uint32_t r;
133 	int rc;
134 
135 	MPASS(sc->vres.iscsi.size > 0);
136 	MPASS(ci != NULL);
137 
138 	read_pdu_limits(sc, &ci->max_tx_pdu_len, &ci->max_rx_pdu_len);
139 
140 	pr = &ci->pr;
141 	r = t4_read_reg(sc, A_ULP_RX_ISCSI_PSZ);
142 	rc = t4_init_ppod_region(pr, &sc->vres.iscsi, r, "iSCSI page pods");
143 	if (rc != 0) {
144 		device_printf(sc->dev,
145 		    "%s: failed to initialize the iSCSI page pod region: %u.\n",
146 		    __func__, rc);
147 		return (rc);
148 	}
149 
150 	r = t4_read_reg(sc, A_ULP_RX_ISCSI_TAGMASK);
151 	r &= V_ISCSITAGMASK(M_ISCSITAGMASK);
152 	if (r != pr->pr_tag_mask) {
153 		/*
154 		 * Recent firmwares are supposed to set up the iSCSI tagmask
155 		 * but we'll do it ourselves it the computed value doesn't match
156 		 * what's in the register.
157 		 */
158 		device_printf(sc->dev,
159 		    "tagmask 0x%08x does not match computed mask 0x%08x.\n", r,
160 		    pr->pr_tag_mask);
161 		t4_set_reg_field(sc, A_ULP_RX_ISCSI_TAGMASK,
162 		    V_ISCSITAGMASK(M_ISCSITAGMASK), pr->pr_tag_mask);
163 	}
164 
165 	sysctl_ctx_init(&ci->ctx);
166 	oid = device_get_sysctl_tree(sc->dev);	/* dev.t5nex.X */
167 	children = SYSCTL_CHILDREN(oid);
168 
169 	oid = SYSCTL_ADD_NODE(&ci->ctx, children, OID_AUTO, "iscsi",
170 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "iSCSI ULP settings");
171 	children = SYSCTL_CHILDREN(oid);
172 
173 	ci->ddp_threshold = 2048;
174 	SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "ddp_threshold",
175 	    CTLFLAG_RW, &ci->ddp_threshold, 0, "Rx zero copy threshold");
176 
177 	return (0);
178 }
179 
180 static int
181 do_rx_iscsi_hdr(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
182 {
183 	struct adapter *sc = iq->adapter;
184 	struct cpl_iscsi_hdr *cpl = mtod(m, struct cpl_iscsi_hdr *);
185 	u_int tid = GET_TID(cpl);
186 	struct toepcb *toep = lookup_tid(sc, tid);
187 	struct icl_pdu *ip;
188 	struct icl_cxgbei_pdu *icp;
189 	uint16_t len_ddp = be16toh(cpl->pdu_len_ddp);
190 	uint16_t len = be16toh(cpl->len);
191 
192 	M_ASSERTPKTHDR(m);
193 	MPASS(m->m_pkthdr.len == len + sizeof(*cpl));
194 
195 	ip = icl_cxgbei_new_pdu(M_NOWAIT);
196 	if (ip == NULL)
197 		CXGBE_UNIMPLEMENTED("PDU allocation failure");
198 	m_copydata(m, sizeof(*cpl), ISCSI_BHS_SIZE, (caddr_t)ip->ip_bhs);
199 	ip->ip_data_len = G_ISCSI_PDU_LEN(len_ddp) - len;
200 	icp = ip_to_icp(ip);
201 	icp->icp_seq = ntohl(cpl->seq);
202 	icp->icp_flags = ICPF_RX_HDR;
203 
204 	/* This is the start of a new PDU.  There should be no old state. */
205 	MPASS(toep->ulpcb2 == NULL);
206 	toep->ulpcb2 = icp;
207 
208 #if 0
209 	CTR5(KTR_CXGBE, "%s: tid %u, cpl->len %u, pdu_len_ddp 0x%04x, icp %p",
210 	    __func__, tid, len, len_ddp, icp);
211 #endif
212 
213 	m_freem(m);
214 	return (0);
215 }
216 
217 static int
218 do_rx_iscsi_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
219 {
220 	struct adapter *sc = iq->adapter;
221 	struct cpl_iscsi_data *cpl =  mtod(m, struct cpl_iscsi_data *);
222 	u_int tid = GET_TID(cpl);
223 	struct toepcb *toep = lookup_tid(sc, tid);
224 	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
225 	struct icl_pdu *ip;
226 
227 	M_ASSERTPKTHDR(m);
228 	MPASS(m->m_pkthdr.len == be16toh(cpl->len) + sizeof(*cpl));
229 
230 	if (icp == NULL) {
231 		/*
232 		 * T6 completion enabled, start of a new pdu. Header
233 		 * will come in completion CPL.
234 		 */
235 	        ip = icl_cxgbei_new_pdu(M_NOWAIT);
236 	        if (ip == NULL)
237 			CXGBE_UNIMPLEMENTED("PDU allocation failure");
238 		icp = ip_to_icp(ip);
239 	} else {
240 		/* T5 mode, header is already received. */
241 		MPASS(icp->icp_flags == ICPF_RX_HDR);
242 		MPASS(icp->ip.ip_data_mbuf == NULL);
243 		MPASS(icp->ip.ip_data_len == m->m_pkthdr.len - sizeof(*cpl));
244 	}
245 
246 	/* Trim the cpl header from mbuf. */
247 	m_adj(m, sizeof(*cpl));
248 
249 	icp->icp_flags |= ICPF_RX_FLBUF;
250 	icp->ip.ip_data_mbuf = m;
251 	toep->ofld_rxq->rx_iscsi_fl_pdus++;
252 	toep->ofld_rxq->rx_iscsi_fl_octets += m->m_pkthdr.len;
253 
254 	/*
255 	 * For T6, save the icp for further processing in the
256 	 * completion handler.
257 	 */
258 	if (icp->icp_flags == ICPF_RX_FLBUF) {
259 		MPASS(toep->ulpcb2 == NULL);
260 		toep->ulpcb2 = icp;
261 	}
262 
263 #if 0
264 	CTR4(KTR_CXGBE, "%s: tid %u, cpl->len %u, icp %p", __func__, tid,
265 	    be16toh(cpl->len), icp);
266 #endif
267 
268 	return (0);
269 }
270 
271 static int
272 do_rx_iscsi_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
273 {
274 	struct adapter *sc = iq->adapter;
275 	const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1);
276 	u_int tid = GET_TID(cpl);
277 	struct toepcb *toep = lookup_tid(sc, tid);
278 	struct inpcb *inp = toep->inp;
279 	struct socket *so;
280 	struct sockbuf *sb;
281 	struct tcpcb *tp;
282 	struct icl_cxgbei_conn *icc;
283 	struct icl_conn *ic;
284 	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
285 	struct icl_pdu *ip;
286 	u_int pdu_len, val;
287 	struct epoch_tracker et;
288 
289 	MPASS(m == NULL);
290 
291 	/* Must already be assembling a PDU. */
292 	MPASS(icp != NULL);
293 	MPASS(icp->icp_flags & ICPF_RX_HDR);	/* Data is optional. */
294 	MPASS((icp->icp_flags & ICPF_RX_STATUS) == 0);
295 
296 	pdu_len = be16toh(cpl->len);	/* includes everything. */
297 	val = be32toh(cpl->ddpvld);
298 
299 #if 0
300 	CTR5(KTR_CXGBE,
301 	    "%s: tid %u, cpl->len %u, ddpvld 0x%08x, icp_flags 0x%08x",
302 	    __func__, tid, pdu_len, val, icp->icp_flags);
303 #endif
304 
305 	icp->icp_flags |= ICPF_RX_STATUS;
306 	ip = &icp->ip;
307 	if (val & F_DDP_PADDING_ERR)
308 		icp->icp_flags |= ICPF_PAD_ERR;
309 	if (val & F_DDP_HDRCRC_ERR)
310 		icp->icp_flags |= ICPF_HCRC_ERR;
311 	if (val & F_DDP_DATACRC_ERR)
312 		icp->icp_flags |= ICPF_DCRC_ERR;
313 	if (val & F_DDP_PDU && ip->ip_data_mbuf == NULL) {
314 		MPASS((icp->icp_flags & ICPF_RX_FLBUF) == 0);
315 		MPASS(ip->ip_data_len > 0);
316 		icp->icp_flags |= ICPF_RX_DDP;
317 		toep->ofld_rxq->rx_iscsi_ddp_pdus++;
318 		toep->ofld_rxq->rx_iscsi_ddp_octets += ip->ip_data_len;
319 	}
320 
321 	INP_WLOCK(inp);
322 	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
323 		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
324 		    __func__, tid, pdu_len, inp->inp_flags);
325 		INP_WUNLOCK(inp);
326 		icl_cxgbei_conn_pdu_free(NULL, ip);
327 		toep->ulpcb2 = NULL;
328 		return (0);
329 	}
330 
331 	/*
332 	 * T6+ does not report data PDUs received via DDP without F
333 	 * set.  This can result in gaps in the TCP sequence space.
334 	 */
335 	tp = intotcpcb(inp);
336 	MPASS(chip_id(sc) >= CHELSIO_T6 || icp->icp_seq == tp->rcv_nxt);
337 	tp->rcv_nxt = icp->icp_seq + pdu_len;
338 	tp->t_rcvtime = ticks;
339 
340 	/*
341 	 * Don't update the window size or return credits since RX
342 	 * flow control is disabled.
343 	 */
344 
345 	so = inp->inp_socket;
346 	sb = &so->so_rcv;
347 	SOCKBUF_LOCK(sb);
348 
349 	icc = toep->ulpcb;
350 	if (__predict_false(icc == NULL || sb->sb_state & SBS_CANTRCVMORE)) {
351 		CTR5(KTR_CXGBE,
352 		    "%s: tid %u, excess rx (%d bytes), icc %p, sb_state 0x%x",
353 		    __func__, tid, pdu_len, icc, sb->sb_state);
354 		SOCKBUF_UNLOCK(sb);
355 		INP_WUNLOCK(inp);
356 
357 		CURVNET_SET(so->so_vnet);
358 		NET_EPOCH_ENTER(et);
359 		INP_WLOCK(inp);
360 		tp = tcp_drop(tp, ECONNRESET);
361 		if (tp)
362 			INP_WUNLOCK(inp);
363 		NET_EPOCH_EXIT(et);
364 		CURVNET_RESTORE();
365 
366 		icl_cxgbei_conn_pdu_free(NULL, ip);
367 		toep->ulpcb2 = NULL;
368 		return (0);
369 	}
370 	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
371 	ic = &icc->ic;
372 	icl_cxgbei_new_pdu_set_conn(ip, ic);
373 
374 	MPASS(m == NULL); /* was unused, we'll use it now. */
375 	m = sbcut_locked(sb, sbused(sb)); /* XXXNP: toep->sb_cc accounting? */
376 	if (__predict_false(m != NULL)) {
377 		int len = m_length(m, NULL);
378 
379 		/*
380 		 * PDUs were received before the tid transitioned to ULP mode.
381 		 * Convert them to icl_cxgbei_pdus and send them to ICL before
382 		 * the PDU in icp/ip.
383 		 */
384 		CTR3(KTR_CXGBE, "%s: tid %u, %u bytes in so_rcv", __func__, tid,
385 		    len);
386 
387 		/* XXXNP: needs to be rewritten. */
388 		if (len == sizeof(struct iscsi_bhs) || len == 4 + sizeof(struct
389 		    iscsi_bhs)) {
390 			struct icl_cxgbei_pdu *icp0;
391 			struct icl_pdu *ip0;
392 
393 			ip0 = icl_cxgbei_new_pdu(M_NOWAIT);
394 			if (ip0 == NULL)
395 				CXGBE_UNIMPLEMENTED("PDU allocation failure");
396 			icl_cxgbei_new_pdu_set_conn(ip0, ic);
397 			icp0 = ip_to_icp(ip0);
398 			icp0->icp_seq = 0; /* XXX */
399 			icp0->icp_flags = ICPF_RX_HDR | ICPF_RX_STATUS;
400 			m_copydata(m, 0, sizeof(struct iscsi_bhs), (void *)ip0->ip_bhs);
401 			STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip0, ip_next);
402 		}
403 		m_freem(m);
404 	}
405 
406 	STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next);
407 	if ((icc->rx_flags & RXF_ACTIVE) == 0) {
408 		struct cxgbei_worker_thread_softc *cwt = &cwt_softc[icc->cwt];
409 
410 		mtx_lock(&cwt->cwt_lock);
411 		icc->rx_flags |= RXF_ACTIVE;
412 		TAILQ_INSERT_TAIL(&cwt->rx_head, icc, rx_link);
413 		if (cwt->cwt_state == CWT_SLEEPING) {
414 			cwt->cwt_state = CWT_RUNNING;
415 			cv_signal(&cwt->cwt_cv);
416 		}
417 		mtx_unlock(&cwt->cwt_lock);
418 	}
419 	SOCKBUF_UNLOCK(sb);
420 	INP_WUNLOCK(inp);
421 
422 	toep->ulpcb2 = NULL;
423 
424 	return (0);
425 }
426 
427 static int
428 do_rx_iscsi_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
429 {
430 	struct epoch_tracker et;
431 	struct adapter *sc = iq->adapter;
432 	struct cpl_rx_iscsi_cmp *cpl = mtod(m, struct cpl_rx_iscsi_cmp *);
433 	u_int tid = GET_TID(cpl);
434 	struct toepcb *toep = lookup_tid(sc, tid);
435 	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
436 	struct icl_pdu *ip;
437 	struct cxgbei_cmp *cmp;
438 	struct inpcb *inp = toep->inp;
439 #ifdef INVARIANTS
440 	uint16_t len = be16toh(cpl->len);
441 #endif
442 	struct socket *so;
443 	struct sockbuf *sb;
444 	struct tcpcb *tp;
445 	struct icl_cxgbei_conn *icc;
446 	struct icl_conn *ic;
447 	struct iscsi_bhs_data_out *bhsdo;
448 	u_int val = be32toh(cpl->ddpvld);
449 	u_int npdus, pdu_len, data_digest_len, hdr_digest_len;
450 	uint32_t prev_seg_len;
451 
452 	M_ASSERTPKTHDR(m);
453 	MPASS(m->m_pkthdr.len == len + sizeof(*cpl));
454 
455 	if ((val & F_DDP_PDU) == 0) {
456 		MPASS(icp != NULL);
457 		MPASS((icp->icp_flags & ICPF_RX_STATUS) == 0);
458 		ip = &icp->ip;
459 	}
460 
461 	if (icp == NULL) {
462 		/* T6 completion enabled, start of a new PDU. */
463 		ip = icl_cxgbei_new_pdu(M_NOWAIT);
464 		if (ip == NULL)
465 			CXGBE_UNIMPLEMENTED("PDU allocation failure");
466 		icp = ip_to_icp(ip);
467 	}
468 	pdu_len = G_ISCSI_PDU_LEN(be16toh(cpl->pdu_len_ddp));
469 
470 #if 0
471 	CTR5(KTR_CXGBE,
472 	    "%s: tid %u, cpl->len %u, ddpvld 0x%08x, icp %p",
473 	    __func__, tid, pdu_len, val, icp);
474 #endif
475 
476 	/* Copy header */
477 	m_copydata(m, sizeof(*cpl), ISCSI_BHS_SIZE, (caddr_t)ip->ip_bhs);
478 	bhsdo = (struct iscsi_bhs_data_out *)ip->ip_bhs;
479 	ip->ip_data_len = bhsdo->bhsdo_data_segment_len[0] << 16 |
480 	    bhsdo->bhsdo_data_segment_len[1] << 8 |
481 	    bhsdo->bhsdo_data_segment_len[2];
482 	icp->icp_seq = ntohl(cpl->seq);
483 	icp->icp_flags |= ICPF_RX_HDR;
484 	icp->icp_flags |= ICPF_RX_STATUS;
485 
486 	if (val & F_DDP_PADDING_ERR)
487 		icp->icp_flags |= ICPF_PAD_ERR;
488 	if (val & F_DDP_HDRCRC_ERR)
489 		icp->icp_flags |= ICPF_HCRC_ERR;
490 	if (val & F_DDP_DATACRC_ERR)
491 		icp->icp_flags |= ICPF_DCRC_ERR;
492 
493 	INP_WLOCK(inp);
494 	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
495 		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
496 		    __func__, tid, pdu_len, inp->inp_flags);
497 		INP_WUNLOCK(inp);
498 		icl_cxgbei_conn_pdu_free(NULL, ip);
499 		toep->ulpcb2 = NULL;
500 		m_freem(m);
501 		return (0);
502 	}
503 
504 	tp = intotcpcb(inp);
505 
506 	/*
507 	 * If icc is NULL, the connection is being closed in
508 	 * icl_cxgbei_conn_close(), just drop this data.
509 	 */
510 	icc = toep->ulpcb;
511 	if (__predict_false(icc == NULL)) {
512 		CTR4(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes), icc %p",
513 		    __func__, tid, pdu_len, icc);
514 
515 		/*
516 		 * Update rcv_nxt so the sequence number of the FIN
517 		 * doesn't appear wrong.
518 		 */
519 		tp->rcv_nxt = icp->icp_seq + pdu_len;
520 		tp->t_rcvtime = ticks;
521 		INP_WUNLOCK(inp);
522 
523 		icl_cxgbei_conn_pdu_free(NULL, ip);
524 		toep->ulpcb2 = NULL;
525 		m_freem(m);
526 		return (0);
527 	}
528 
529 	data_digest_len = (icc->ulp_submode & ULP_CRC_DATA) ?
530 	    ISCSI_DATA_DIGEST_SIZE : 0;
531 	hdr_digest_len = (icc->ulp_submode & ULP_CRC_HEADER) ?
532 	    ISCSI_HEADER_DIGEST_SIZE : 0;
533 	MPASS(roundup2(ip->ip_data_len, 4) == pdu_len - len - data_digest_len);
534 
535 	if (val & F_DDP_PDU && ip->ip_data_mbuf == NULL) {
536 		MPASS((icp->icp_flags & ICPF_RX_FLBUF) == 0);
537 		MPASS(ip->ip_data_len > 0);
538 		icp->icp_flags |= ICPF_RX_DDP;
539 		bhsdo = (struct iscsi_bhs_data_out *)ip->ip_bhs;
540 
541 		switch (ip->ip_bhs->bhs_opcode & ~ISCSI_BHS_OPCODE_IMMEDIATE) {
542 		case ISCSI_BHS_OPCODE_SCSI_DATA_IN:
543 			cmp = cxgbei_find_cmp(icc,
544 			    be32toh(bhsdo->bhsdo_initiator_task_tag));
545 			break;
546 		case ISCSI_BHS_OPCODE_SCSI_DATA_OUT:
547 			cmp = cxgbei_find_cmp(icc,
548 			    be32toh(bhsdo->bhsdo_target_transfer_tag));
549 			break;
550 		default:
551 			__assert_unreachable();
552 		}
553 		MPASS(cmp != NULL);
554 
555 		/* Must be the final PDU. */
556 		MPASS(bhsdo->bhsdo_flags & BHSDO_FLAGS_F);
557 
558 		/*
559 		 * The difference between the end of the last burst
560 		 * and the offset of the last PDU in this burst is
561 		 * the additional data received via DDP.
562 		 */
563 		prev_seg_len = be32toh(bhsdo->bhsdo_buffer_offset) -
564 		    cmp->next_buffer_offset;
565 
566 		if (prev_seg_len != 0) {
567 			/*
568 			 * Since cfiscsi doesn't know about previous
569 			 * headers, pretend that the entire r2t data
570 			 * length was received in this single segment.
571 			 */
572 			ip->ip_data_len += prev_seg_len;
573 			bhsdo->bhsdo_data_segment_len[2] = ip->ip_data_len;
574 			bhsdo->bhsdo_data_segment_len[1] = ip->ip_data_len >> 8;
575 			bhsdo->bhsdo_data_segment_len[0] = ip->ip_data_len >> 16;
576 			bhsdo->bhsdo_buffer_offset =
577 			    htobe32(cmp->next_buffer_offset);
578 
579 			npdus = htobe32(bhsdo->bhsdo_datasn) - cmp->last_datasn;
580 		} else {
581 			MPASS(htobe32(bhsdo->bhsdo_datasn) ==
582 			    cmp->last_datasn + 1);
583 			npdus = 1;
584 		}
585 
586 		cmp->next_buffer_offset += ip->ip_data_len;
587 		cmp->last_datasn = htobe32(bhsdo->bhsdo_datasn);
588 		bhsdo->bhsdo_datasn = htobe32(cmp->next_datasn);
589 		cmp->next_datasn++;
590 		toep->ofld_rxq->rx_iscsi_ddp_pdus += npdus;
591 		toep->ofld_rxq->rx_iscsi_ddp_octets += ip->ip_data_len;
592 	} else {
593 		MPASS(icp->icp_flags & (ICPF_RX_FLBUF));
594 		MPASS(ip->ip_data_len == ip->ip_data_mbuf->m_pkthdr.len);
595 		MPASS(icp->icp_seq == tp->rcv_nxt);
596 	}
597 
598 	tp->rcv_nxt = icp->icp_seq + pdu_len;
599 	tp->t_rcvtime = ticks;
600 
601 	/*
602 	 * Don't update the window size or return credits since RX
603 	 * flow control is disabled.
604 	 */
605 
606 	so = inp->inp_socket;
607 	sb = &so->so_rcv;
608 	SOCKBUF_LOCK(sb);
609 	if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
610 		CTR5(KTR_CXGBE,
611 		    "%s: tid %u, excess rx (%d bytes), icc %p, sb_state 0x%x",
612 		    __func__, tid, pdu_len, icc, sb->sb_state);
613 		SOCKBUF_UNLOCK(sb);
614 		INP_WUNLOCK(inp);
615 
616 		CURVNET_SET(so->so_vnet);
617 		NET_EPOCH_ENTER(et);
618 		INP_WLOCK(inp);
619 		tp = tcp_drop(tp, ECONNRESET);
620 		if (tp != NULL)
621 			INP_WUNLOCK(inp);
622 		NET_EPOCH_EXIT(et);
623 		CURVNET_RESTORE();
624 
625 		icl_cxgbei_conn_pdu_free(NULL, ip);
626 		toep->ulpcb2 = NULL;
627 		m_freem(m);
628 		return (0);
629 	}
630 	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
631 	ic = &icc->ic;
632 	icl_cxgbei_new_pdu_set_conn(ip, ic);
633 
634 	/* Enqueue the PDU to the received pdus queue. */
635 	STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next);
636 	if ((icc->rx_flags & RXF_ACTIVE) == 0) {
637 		struct cxgbei_worker_thread_softc *cwt = &cwt_softc[icc->cwt];
638 
639 		mtx_lock(&cwt->cwt_lock);
640 		icc->rx_flags |= RXF_ACTIVE;
641 		TAILQ_INSERT_TAIL(&cwt->rx_head, icc, rx_link);
642 		if (cwt->cwt_state == CWT_SLEEPING) {
643 			cwt->cwt_state = CWT_RUNNING;
644 			cv_signal(&cwt->cwt_cv);
645 		}
646 		mtx_unlock(&cwt->cwt_lock);
647 	}
648 	SOCKBUF_UNLOCK(sb);
649 	INP_WUNLOCK(inp);
650 
651 	toep->ulpcb2 = NULL;
652 	m_freem(m);
653 
654 	return (0);
655 }
656 
657 static int
658 cxgbei_activate(struct adapter *sc)
659 {
660 	struct cxgbei_data *ci;
661 	int rc;
662 
663 	ASSERT_SYNCHRONIZED_OP(sc);
664 
665 	if (uld_active(sc, ULD_ISCSI)) {
666 		KASSERT(0, ("%s: iSCSI offload already enabled on adapter %p",
667 		    __func__, sc));
668 		return (0);
669 	}
670 
671 	if (sc->iscsicaps == 0 || sc->vres.iscsi.size == 0) {
672 		device_printf(sc->dev,
673 		    "not iSCSI offload capable, or capability disabled.\n");
674 		return (ENOSYS);
675 	}
676 
677 	/* per-adapter softc for iSCSI */
678 	ci = malloc(sizeof(*ci), M_CXGBE, M_ZERO | M_WAITOK);
679 	if (ci == NULL)
680 		return (ENOMEM);
681 
682 	rc = cxgbei_init(sc, ci);
683 	if (rc != 0) {
684 		free(ci, M_CXGBE);
685 		return (rc);
686 	}
687 
688 	sc->iscsi_ulp_softc = ci;
689 
690 	return (0);
691 }
692 
693 static int
694 cxgbei_deactivate(struct adapter *sc)
695 {
696 	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
697 
698 	ASSERT_SYNCHRONIZED_OP(sc);
699 
700 	if (ci != NULL) {
701 		sysctl_ctx_free(&ci->ctx);
702 		t4_free_ppod_region(&ci->pr);
703 		free(ci, M_CXGBE);
704 		sc->iscsi_ulp_softc = NULL;
705 	}
706 
707 	return (0);
708 }
709 
710 static void
711 cxgbei_activate_all(struct adapter *sc, void *arg __unused)
712 {
713 
714 	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isact") != 0)
715 		return;
716 
717 	/* Activate iSCSI if any port on this adapter has IFCAP_TOE enabled. */
718 	if (sc->offload_map && !uld_active(sc, ULD_ISCSI))
719 		(void) t4_activate_uld(sc, ULD_ISCSI);
720 
721 	end_synchronized_op(sc, 0);
722 }
723 
724 static void
725 cxgbei_deactivate_all(struct adapter *sc, void *arg __unused)
726 {
727 
728 	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isdea") != 0)
729 		return;
730 
731 	if (uld_active(sc, ULD_ISCSI))
732 	    (void) t4_deactivate_uld(sc, ULD_ISCSI);
733 
734 	end_synchronized_op(sc, 0);
735 }
736 
737 static struct uld_info cxgbei_uld_info = {
738 	.uld_id = ULD_ISCSI,
739 	.activate = cxgbei_activate,
740 	.deactivate = cxgbei_deactivate,
741 };
742 
743 static void
744 cwt_main(void *arg)
745 {
746 	struct cxgbei_worker_thread_softc *cwt = arg;
747 	struct icl_cxgbei_conn *icc = NULL;
748 	struct icl_conn *ic;
749 	struct icl_pdu *ip;
750 	struct sockbuf *sb;
751 	STAILQ_HEAD(, icl_pdu) rx_pdus = STAILQ_HEAD_INITIALIZER(rx_pdus);
752 
753 	MPASS(cwt != NULL);
754 
755 	mtx_lock(&cwt->cwt_lock);
756 	MPASS(cwt->cwt_state == 0);
757 	cwt->cwt_state = CWT_RUNNING;
758 	cv_signal(&cwt->cwt_cv);
759 
760 	while (__predict_true(cwt->cwt_state != CWT_STOP)) {
761 		cwt->cwt_state = CWT_RUNNING;
762 		while ((icc = TAILQ_FIRST(&cwt->rx_head)) != NULL) {
763 			TAILQ_REMOVE(&cwt->rx_head, icc, rx_link);
764 			mtx_unlock(&cwt->cwt_lock);
765 
766 			ic = &icc->ic;
767 			sb = &ic->ic_socket->so_rcv;
768 
769 			SOCKBUF_LOCK(sb);
770 			MPASS(icc->rx_flags & RXF_ACTIVE);
771 			if (__predict_true(!(sb->sb_state & SBS_CANTRCVMORE))) {
772 				MPASS(STAILQ_EMPTY(&rx_pdus));
773 				STAILQ_SWAP(&icc->rcvd_pdus, &rx_pdus, icl_pdu);
774 				SOCKBUF_UNLOCK(sb);
775 
776 				/* Hand over PDUs to ICL. */
777 				while ((ip = STAILQ_FIRST(&rx_pdus)) != NULL) {
778 					STAILQ_REMOVE_HEAD(&rx_pdus, ip_next);
779 					ic->ic_receive(ip);
780 				}
781 
782 				SOCKBUF_LOCK(sb);
783 				MPASS(STAILQ_EMPTY(&rx_pdus));
784 			}
785 			MPASS(icc->rx_flags & RXF_ACTIVE);
786 			if (STAILQ_EMPTY(&icc->rcvd_pdus) ||
787 			    __predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
788 				icc->rx_flags &= ~RXF_ACTIVE;
789 			} else {
790 				/*
791 				 * More PDUs were received while we were busy
792 				 * handing over the previous batch to ICL.
793 				 * Re-add this connection to the end of the
794 				 * queue.
795 				 */
796 				mtx_lock(&cwt->cwt_lock);
797 				TAILQ_INSERT_TAIL(&cwt->rx_head, icc,
798 				    rx_link);
799 				mtx_unlock(&cwt->cwt_lock);
800 			}
801 			SOCKBUF_UNLOCK(sb);
802 
803 			mtx_lock(&cwt->cwt_lock);
804 		}
805 
806 		/* Inner loop doesn't check for CWT_STOP, do that first. */
807 		if (__predict_false(cwt->cwt_state == CWT_STOP))
808 			break;
809 		cwt->cwt_state = CWT_SLEEPING;
810 		cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
811 	}
812 
813 	MPASS(TAILQ_FIRST(&cwt->rx_head) == NULL);
814 	mtx_assert(&cwt->cwt_lock, MA_OWNED);
815 	cwt->cwt_state = CWT_STOPPED;
816 	cv_signal(&cwt->cwt_cv);
817 	mtx_unlock(&cwt->cwt_lock);
818 	kthread_exit();
819 }
820 
821 static int
822 start_worker_threads(void)
823 {
824 	int i, rc;
825 	struct cxgbei_worker_thread_softc *cwt;
826 
827 	worker_thread_count = min(mp_ncpus, 32);
828 	cwt_softc = malloc(worker_thread_count * sizeof(*cwt), M_CXGBE,
829 	    M_WAITOK | M_ZERO);
830 
831 	MPASS(cxgbei_proc == NULL);
832 	for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) {
833 		mtx_init(&cwt->cwt_lock, "cwt lock", NULL, MTX_DEF);
834 		cv_init(&cwt->cwt_cv, "cwt cv");
835 		TAILQ_INIT(&cwt->rx_head);
836 		rc = kproc_kthread_add(cwt_main, cwt, &cxgbei_proc, NULL, 0, 0,
837 		    "cxgbei", "%d", i);
838 		if (rc != 0) {
839 			printf("cxgbei: failed to start thread #%d/%d (%d)\n",
840 			    i + 1, worker_thread_count, rc);
841 			mtx_destroy(&cwt->cwt_lock);
842 			cv_destroy(&cwt->cwt_cv);
843 			bzero(cwt, sizeof(*cwt));
844 			if (i == 0) {
845 				free(cwt_softc, M_CXGBE);
846 				worker_thread_count = 0;
847 
848 				return (rc);
849 			}
850 
851 			/* Not fatal, carry on with fewer threads. */
852 			worker_thread_count = i;
853 			rc = 0;
854 			break;
855 		}
856 
857 		/* Wait for thread to start before moving on to the next one. */
858 		mtx_lock(&cwt->cwt_lock);
859 		while (cwt->cwt_state == 0)
860 			cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
861 		mtx_unlock(&cwt->cwt_lock);
862 	}
863 
864 	MPASS(cwt_softc != NULL);
865 	MPASS(worker_thread_count > 0);
866 	return (0);
867 }
868 
869 static void
870 stop_worker_threads(void)
871 {
872 	int i;
873 	struct cxgbei_worker_thread_softc *cwt = &cwt_softc[0];
874 
875 	MPASS(worker_thread_count >= 0);
876 
877 	for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) {
878 		mtx_lock(&cwt->cwt_lock);
879 		MPASS(cwt->cwt_state == CWT_RUNNING ||
880 		    cwt->cwt_state == CWT_SLEEPING);
881 		cwt->cwt_state = CWT_STOP;
882 		cv_signal(&cwt->cwt_cv);
883 		do {
884 			cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
885 		} while (cwt->cwt_state != CWT_STOPPED);
886 		mtx_unlock(&cwt->cwt_lock);
887 		mtx_destroy(&cwt->cwt_lock);
888 		cv_destroy(&cwt->cwt_cv);
889 	}
890 	free(cwt_softc, M_CXGBE);
891 }
892 
893 /* Select a worker thread for a connection. */
894 u_int
895 cxgbei_select_worker_thread(struct icl_cxgbei_conn *icc)
896 {
897 	struct adapter *sc = icc->sc;
898 	struct toepcb *toep = icc->toep;
899 	u_int i, n;
900 
901 	n = worker_thread_count / sc->sge.nofldrxq;
902 	if (n > 0)
903 		i = toep->vi->pi->port_id * n + arc4random() % n;
904 	else
905 		i = arc4random() % worker_thread_count;
906 
907 	CTR3(KTR_CXGBE, "%s: tid %u, cwt %u", __func__, toep->tid, i);
908 
909 	return (i);
910 }
911 
912 static int
913 cxgbei_mod_load(void)
914 {
915 	int rc;
916 
917 	t4_register_cpl_handler(CPL_ISCSI_HDR, do_rx_iscsi_hdr);
918 	t4_register_cpl_handler(CPL_ISCSI_DATA, do_rx_iscsi_data);
919 	t4_register_cpl_handler(CPL_RX_ISCSI_DDP, do_rx_iscsi_ddp);
920 	t4_register_cpl_handler(CPL_RX_ISCSI_CMP, do_rx_iscsi_cmp);
921 
922 	rc = start_worker_threads();
923 	if (rc != 0)
924 		return (rc);
925 
926 	rc = t4_register_uld(&cxgbei_uld_info);
927 	if (rc != 0) {
928 		stop_worker_threads();
929 		return (rc);
930 	}
931 
932 	t4_iterate(cxgbei_activate_all, NULL);
933 
934 	return (rc);
935 }
936 
937 static int
938 cxgbei_mod_unload(void)
939 {
940 
941 	t4_iterate(cxgbei_deactivate_all, NULL);
942 
943 	if (t4_unregister_uld(&cxgbei_uld_info) == EBUSY)
944 		return (EBUSY);
945 
946 	stop_worker_threads();
947 
948 	t4_register_cpl_handler(CPL_ISCSI_HDR, NULL);
949 	t4_register_cpl_handler(CPL_ISCSI_DATA, NULL);
950 	t4_register_cpl_handler(CPL_RX_ISCSI_DDP, NULL);
951 	t4_register_cpl_handler(CPL_RX_ISCSI_CMP, NULL);
952 
953 	return (0);
954 }
955 #endif
956 
957 static int
958 cxgbei_modevent(module_t mod, int cmd, void *arg)
959 {
960 	int rc = 0;
961 
962 #ifdef TCP_OFFLOAD
963 	switch (cmd) {
964 	case MOD_LOAD:
965 		rc = cxgbei_mod_load();
966 		if (rc == 0)
967 			rc = icl_cxgbei_mod_load();
968 		break;
969 
970 	case MOD_UNLOAD:
971 		rc = icl_cxgbei_mod_unload();
972 		if (rc == 0)
973 			rc = cxgbei_mod_unload();
974 		break;
975 
976 	default:
977 		rc = EINVAL;
978 	}
979 #else
980 	printf("cxgbei: compiled without TCP_OFFLOAD support.\n");
981 	rc = EOPNOTSUPP;
982 #endif
983 
984 	return (rc);
985 }
986 
987 static moduledata_t cxgbei_mod = {
988 	"cxgbei",
989 	cxgbei_modevent,
990 	NULL,
991 };
992 
993 MODULE_VERSION(cxgbei, 1);
994 DECLARE_MODULE(cxgbei, cxgbei_mod, SI_SUB_EXEC, SI_ORDER_ANY);
995 MODULE_DEPEND(cxgbei, t4_tom, 1, 1, 1);
996 MODULE_DEPEND(cxgbei, cxgbe, 1, 1, 1);
997 MODULE_DEPEND(cxgbei, icl, 1, 1, 1);
998