/*-
 * Copyright (c) 2012 Chelsio Communications, Inc.
 * All rights reserved.
 *
 * Chelsio T5xx iSCSI driver
 *
 * Written by: Sreenivasa Honnur <shonnur@chelsio.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"

#include <sys/types.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/module.h>
#include <sys/systm.h>

#ifdef TCP_OFFLOAD
#include <sys/errno.h>
#include <sys/gsb_crc32.h>
#include <sys/kthread.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/mbuf.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/condvar.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/toecore.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>

#include <cam/scsi/scsi_all.h>
#include <cam/scsi/scsi_da.h>
#include <cam/ctl/ctl_io.h>
#include <cam/ctl/ctl.h>
#include <cam/ctl/ctl_backend.h>
#include <cam/ctl/ctl_error.h>
#include <cam/ctl/ctl_frontend.h>
#include <cam/ctl/ctl_debug.h>
#include <cam/ctl/ctl_ha.h>
#include <cam/ctl/ctl_ioctl.h>

#include <dev/iscsi/icl.h>
#include <dev/iscsi/iscsi_proto.h>
#include <dev/iscsi/iscsi_ioctl.h>
#include <dev/iscsi/iscsi.h>
#include <cam/ctl/ctl_frontend_iscsi.h>

#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_xpt.h>
#include <cam/cam_debug.h>
#include <cam/cam_sim.h>
#include <cam/cam_xpt_sim.h>
#include <cam/cam_xpt_periph.h>
#include <cam/cam_periph.h>
#include <cam/cam_compat.h>
#include <cam/scsi/scsi_message.h>

#include "common/common.h"
#include "common/t4_msg.h"
#include "common/t4_regs.h"	/* for PCIE_MEM_ACCESS */
#include "tom/t4_tom.h"
#include "cxgbei.h"

static int worker_thread_count;
static struct cxgbei_worker_thread_softc *cwt_softc;
static struct proc *cxgbei_proc;

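/*
 * Compute the maximum data segment lengths for transmit and receive iSCSI
 * PDUs.  The limits are derived from the PMM page sizes and TP parameters,
 * less the BHS and digest overheads, and the rx limit is further capped by
 * what DDP can place (at most 4 pages of the smallest DDP page size).
 */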
static void
read_pdu_limits(struct adapter *sc, uint32_t *max_tx_data_len,
    uint32_t *max_rx_data_len, struct ppod_region *pr)
{
	uint32_t tx_len, rx_len, r, v;

	rx_len = t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE);
	tx_len = t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE);

	r = t4_read_reg(sc, A_TP_PARA_REG2);
	rx_len = min(rx_len, G_MAXRXDATA(r));
	tx_len = min(tx_len, G_MAXRXDATA(r));

	r = t4_read_reg(sc, A_TP_PARA_REG7);
	v = min(G_PMMAXXFERLEN0(r), G_PMMAXXFERLEN1(r));
	rx_len = min(rx_len, v);
	tx_len = min(tx_len, v);

	/*
	 * AHS is not supported by the kernel, so we don't account for it
	 * in our PDU length -> data segment length conversions either.
	 */
	rx_len -= ISCSI_BHS_SIZE + ISCSI_HEADER_DIGEST_SIZE +
	    ISCSI_DATA_DIGEST_SIZE;
	tx_len -= ISCSI_BHS_SIZE + ISCSI_HEADER_DIGEST_SIZE +
	    ISCSI_DATA_DIGEST_SIZE;

	/*
	 * DDP can place only 4 pages for a single PDU.  A single
	 * request might use larger pages than the smallest page size,
	 * but that cannot be guaranteed.  Assume the smallest DDP
	 * page size for this limit.
	 */
	rx_len = min(rx_len, 4 * (1U << pr->pr_page_shift[0]));

	if (chip_id(sc) == CHELSIO_T5) {
		tx_len = min(tx_len, 15360);

		rx_len = rounddown2(rx_len, 512);
		tx_len = rounddown2(tx_len, 512);
	}

	*max_tx_data_len = tx_len;
	*max_rx_data_len = rx_len;
}

/*
 * Initialize the software state of the iSCSI ULP driver.
 *
 * ENXIO means firmware didn't set up something that it was supposed to.
 */
static int
cxgbei_init(struct adapter *sc, struct cxgbei_data *ci)
{
	struct sysctl_oid *oid;
	struct sysctl_oid_list *children;
	struct ppod_region *pr;
	uint32_t r;
	int rc;

	MPASS(sc->vres.iscsi.size > 0);
	MPASS(ci != NULL);

	pr = &ci->pr;
	r = t4_read_reg(sc, A_ULP_RX_ISCSI_PSZ);
	rc = t4_init_ppod_region(pr, &sc->vres.iscsi, r, "iSCSI page pods");
	if (rc != 0) {
		device_printf(sc->dev,
		    "%s: failed to initialize the iSCSI page pod region: %u.\n",
		    __func__, rc);
		return (rc);
	}

	r = t4_read_reg(sc, A_ULP_RX_ISCSI_TAGMASK);
	r &= V_ISCSITAGMASK(M_ISCSITAGMASK);
	if (r != pr->pr_tag_mask) {
		/*
		 * Recent firmwares are supposed to set up the iSCSI tagmask
		 * but we'll do it ourselves if the computed value doesn't
		 * match what's in the register.
		 */
		device_printf(sc->dev,
		    "tagmask 0x%08x does not match computed mask 0x%08x.\n", r,
		    pr->pr_tag_mask);
		t4_set_reg_field(sc, A_ULP_RX_ISCSI_TAGMASK,
		    V_ISCSITAGMASK(M_ISCSITAGMASK), pr->pr_tag_mask);
	}

	read_pdu_limits(sc, &ci->max_tx_data_len, &ci->max_rx_data_len, pr);

	sysctl_ctx_init(&ci->ctx);
	oid = device_get_sysctl_tree(sc->dev);	/* dev.t5nex.X */
	children = SYSCTL_CHILDREN(oid);

	oid = SYSCTL_ADD_NODE(&ci->ctx, children, OID_AUTO, "iscsi",
	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "iSCSI ULP settings");
	children = SYSCTL_CHILDREN(oid);

	ci->ddp_threshold = 2048;
	SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "ddp_threshold",
	    CTLFLAG_RW, &ci->ddp_threshold, 0, "Rx zero copy threshold");

	SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "max_rx_data_len",
	    CTLFLAG_RD, &ci->max_rx_data_len, 0,
	    "Maximum receive data segment length");
	SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "max_tx_data_len",
	    CTLFLAG_RD, &ci->max_tx_data_len, 0,
	    "Maximum transmit data segment length");

	return (0);
}

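/*
 * CPL_ISCSI_HDR delivers the BHS at the start of a new rx PDU (T5 flow).
 * Allocate an icl_pdu for it and stash the in-progress PDU in toep->ulpcb2
 * until the rest of the PDU has arrived.
 */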
static int
do_rx_iscsi_hdr(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	struct cpl_iscsi_hdr *cpl = mtod(m, struct cpl_iscsi_hdr *);
	u_int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct icl_pdu *ip;
	struct icl_cxgbei_pdu *icp;
	uint16_t len_ddp = be16toh(cpl->pdu_len_ddp);
	uint16_t len = be16toh(cpl->len);

	M_ASSERTPKTHDR(m);
	MPASS(m->m_pkthdr.len == len + sizeof(*cpl));

	ip = icl_cxgbei_new_pdu(M_NOWAIT);
	if (ip == NULL)
		CXGBE_UNIMPLEMENTED("PDU allocation failure");
	m_copydata(m, sizeof(*cpl), ISCSI_BHS_SIZE, (caddr_t)ip->ip_bhs);
	ip->ip_data_len = G_ISCSI_PDU_LEN(len_ddp) - len;
	icp = ip_to_icp(ip);
	icp->icp_seq = ntohl(cpl->seq);
	icp->icp_flags = ICPF_RX_HDR;

	/* This is the start of a new PDU.  There should be no old state. */
	MPASS(toep->ulpcb2 == NULL);
	toep->ulpcb2 = icp;

#if 0
	CTR5(KTR_CXGBE, "%s: tid %u, cpl->len %u, pdu_len_ddp 0x%04x, icp %p",
	    __func__, tid, len, len_ddp, icp);
#endif

	m_freem(m);
	return (0);
}

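/*
 * CPL_ISCSI_DATA carries PDU payload received via the freelist rather than
 * placed by DDP.  On T5 the header has already arrived in CPL_ISCSI_HDR;
 * on T6 with completions enabled this may be the first CPL for the PDU and
 * the header will arrive later in CPL_RX_ISCSI_CMP.
 */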
static int
do_rx_iscsi_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	struct cpl_iscsi_data *cpl = mtod(m, struct cpl_iscsi_data *);
	u_int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
	struct icl_pdu *ip;

	M_ASSERTPKTHDR(m);
	MPASS(m->m_pkthdr.len == be16toh(cpl->len) + sizeof(*cpl));

	if (icp == NULL) {
		/*
		 * T6 completion enabled, start of a new PDU.  Header
		 * will come in completion CPL.
		 */
		ip = icl_cxgbei_new_pdu(M_NOWAIT);
		if (ip == NULL)
			CXGBE_UNIMPLEMENTED("PDU allocation failure");
		icp = ip_to_icp(ip);
	} else {
		/* T5 mode, header is already received. */
		MPASS(icp->icp_flags == ICPF_RX_HDR);
		MPASS(icp->ip.ip_data_mbuf == NULL);
		MPASS(icp->ip.ip_data_len == m->m_pkthdr.len - sizeof(*cpl));
	}

	/* Trim the cpl header from mbuf. */
	m_adj(m, sizeof(*cpl));

	icp->icp_flags |= ICPF_RX_FLBUF;
	icp->ip.ip_data_mbuf = m;
	toep->ofld_rxq->rx_iscsi_fl_pdus++;
	toep->ofld_rxq->rx_iscsi_fl_octets += m->m_pkthdr.len;

	/*
	 * For T6, save the icp for further processing in the
	 * completion handler.
	 */
	if (icp->icp_flags == ICPF_RX_FLBUF) {
		MPASS(toep->ulpcb2 == NULL);
		toep->ulpcb2 = icp;
	}

#if 0
	CTR4(KTR_CXGBE, "%s: tid %u, cpl->len %u, icp %p", __func__, tid,
	    be16toh(cpl->len), icp);
#endif

	return (0);
}

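/* m_apply() callback: fold one mbuf segment into a running CRC32C. */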
static int
mbuf_crc32c_helper(void *arg, void *data, u_int len)
{
	uint32_t *digestp = arg;

	*digestp = calculate_crc32c(*digestp, data, len);
	return (0);
}

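/*
 * Parse PDUs that arrived on the plain TCP stream before the tid switched
 * to ULP mode, verifying any digests in software, and queue them on
 * rcvd_pdus so they reach ICL ahead of the offloaded PDUs.
 */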
static bool
parse_pdus(struct toepcb *toep, struct icl_cxgbei_conn *icc, struct sockbuf *sb)
{
	struct iscsi_bhs bhs;
	struct mbuf *m;
	struct icl_pdu *ip;
	u_int ahs_len, data_len, header_len, pdu_len, total_len;
	uint32_t calc_digest, wire_digest;

	total_len = sbused(sb);
	CTR3(KTR_CXGBE, "%s: tid %u, %u bytes in so_rcv", __func__, toep->tid,
	    total_len);

	m = sbcut_locked(sb, total_len);
	KASSERT(m_length(m, NULL) == total_len,
	    ("sbcut returned less data (%u vs %u)", total_len,
	    m_length(m, NULL)));

	header_len = sizeof(struct iscsi_bhs);
	if (icc->ic.ic_header_crc32c)
		header_len += ISCSI_HEADER_DIGEST_SIZE;
	for (;;) {
		if (total_len < sizeof(struct iscsi_bhs)) {
			ICL_WARN("truncated pre-offload PDU with len %u",
			    total_len);
			m_freem(m);
			return (false);
		}
		m_copydata(m, 0, sizeof(struct iscsi_bhs), (caddr_t)&bhs);

		ahs_len = bhs.bhs_total_ahs_len * 4;
		data_len = bhs.bhs_data_segment_len[0] << 16 |
		    bhs.bhs_data_segment_len[1] << 8 |
		    bhs.bhs_data_segment_len[2];
		pdu_len = header_len + ahs_len + roundup2(data_len, 4);
		if (icc->ic.ic_data_crc32c && data_len != 0)
			pdu_len += ISCSI_DATA_DIGEST_SIZE;

		if (total_len < pdu_len) {
			ICL_WARN("truncated pre-offload PDU len %u vs %u",
			    total_len, pdu_len);
			m_freem(m);
			return (false);
		}

		if (ahs_len != 0) {
			ICL_WARN("received pre-offload PDU with AHS");
			m_freem(m);
			return (false);
		}

		if (icc->ic.ic_header_crc32c) {
			m_copydata(m, sizeof(struct iscsi_bhs),
			    sizeof(wire_digest), (caddr_t)&wire_digest);

			calc_digest = calculate_crc32c(0xffffffff,
			    (caddr_t)&bhs, sizeof(bhs));
			calc_digest ^= 0xffffffff;
			if (calc_digest != wire_digest) {
				ICL_WARN("received pre-offload PDU 0x%02x "
				    "with invalid header digest (0x%x vs 0x%x)",
				    bhs.bhs_opcode, wire_digest, calc_digest);
				toep->ofld_rxq->rx_iscsi_header_digest_errors++;
				m_freem(m);
				return (false);
			}
		}

		m_adj(m, header_len);

		if (icc->ic.ic_data_crc32c && data_len != 0) {
			/* The data digest follows the padded data segment. */
			m_copydata(m, roundup2(data_len, 4),
			    sizeof(wire_digest), (caddr_t)&wire_digest);

			calc_digest = 0xffffffff;
			m_apply(m, 0, roundup2(data_len, 4), mbuf_crc32c_helper,
			    &calc_digest);
			calc_digest ^= 0xffffffff;
			if (calc_digest != wire_digest) {
				ICL_WARN("received pre-offload PDU 0x%02x "
				    "with invalid data digest (0x%x vs 0x%x)",
				    bhs.bhs_opcode, wire_digest, calc_digest);
				toep->ofld_rxq->rx_iscsi_data_digest_errors++;
				m_freem(m);
				return (false);
			}
		}

		ip = icl_cxgbei_new_pdu(M_NOWAIT);
		if (ip == NULL)
			CXGBE_UNIMPLEMENTED("PDU allocation failure");
		icl_cxgbei_new_pdu_set_conn(ip, &icc->ic);
		*ip->ip_bhs = bhs;
		ip->ip_data_len = data_len;
		if (data_len != 0)
			ip->ip_data_mbuf = m;

		STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next);

		total_len -= pdu_len;
		if (total_len == 0) {
			if (data_len == 0)
				m_freem(m);
			return (true);
		}

		if (data_len != 0) {
			m = m_split(m, roundup2(data_len, 4), M_NOWAIT);
			if (m == NULL) {
				ICL_WARN("failed to split mbuf chain for "
				    "pre-offload PDU");

				/* Don't free the mbuf chain as 'ip' owns it. */
				return (false);
			}
			if (icc->ic.ic_data_crc32c)
				m_adj(m, ISCSI_DATA_DIGEST_SIZE);
		}
	}
}

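/*
 * CPL_RX_ISCSI_DDP is the final CPL for an rx PDU.  It reports the PDU's
 * status (padding and digest errors) and whether the payload was placed
 * directly in the host buffer by DDP, and hands the completed PDU off to
 * a worker thread.
 */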
static int
do_rx_iscsi_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1);
	u_int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
	struct socket *so;
	struct sockbuf *sb;
	struct tcpcb *tp;
	struct icl_cxgbei_conn *icc;
	struct icl_conn *ic;
	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
	struct icl_pdu *ip;
	u_int pdu_len, val;
	struct epoch_tracker et;

	MPASS(m == NULL);

	/* Must already be assembling a PDU. */
	MPASS(icp != NULL);
	MPASS(icp->icp_flags & ICPF_RX_HDR);	/* Data is optional. */
	MPASS((icp->icp_flags & ICPF_RX_STATUS) == 0);

	pdu_len = be16toh(cpl->len);	/* includes everything. */
	val = be32toh(cpl->ddpvld);

#if 0
	CTR5(KTR_CXGBE,
	    "%s: tid %u, cpl->len %u, ddpvld 0x%08x, icp_flags 0x%08x",
	    __func__, tid, pdu_len, val, icp->icp_flags);
#endif

	icp->icp_flags |= ICPF_RX_STATUS;
	ip = &icp->ip;
	if (val & F_DDP_PADDING_ERR) {
		ICL_WARN("received PDU 0x%02x with invalid padding",
		    ip->ip_bhs->bhs_opcode);
		toep->ofld_rxq->rx_iscsi_padding_errors++;
	}
	if (val & F_DDP_HDRCRC_ERR) {
		ICL_WARN("received PDU 0x%02x with invalid header digest",
		    ip->ip_bhs->bhs_opcode);
		toep->ofld_rxq->rx_iscsi_header_digest_errors++;
	}
	if (val & F_DDP_DATACRC_ERR) {
		ICL_WARN("received PDU 0x%02x with invalid data digest",
		    ip->ip_bhs->bhs_opcode);
		toep->ofld_rxq->rx_iscsi_data_digest_errors++;
	}
	if (val & F_DDP_PDU && ip->ip_data_mbuf == NULL) {
		MPASS((icp->icp_flags & ICPF_RX_FLBUF) == 0);
		MPASS(ip->ip_data_len > 0);
		icp->icp_flags |= ICPF_RX_DDP;
		toep->ofld_rxq->rx_iscsi_ddp_pdus++;
		toep->ofld_rxq->rx_iscsi_ddp_octets += ip->ip_data_len;
	}

	INP_WLOCK(inp);
	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
		    __func__, tid, pdu_len, inp->inp_flags);
		INP_WUNLOCK(inp);
		icl_cxgbei_conn_pdu_free(NULL, ip);
		toep->ulpcb2 = NULL;
		return (0);
	}

	/*
	 * T6+ does not report data PDUs received via DDP without the F
	 * (Final) flag set.  This can result in gaps in the TCP sequence
	 * space.
	 */
	tp = intotcpcb(inp);
	MPASS(chip_id(sc) >= CHELSIO_T6 || icp->icp_seq == tp->rcv_nxt);
	tp->rcv_nxt = icp->icp_seq + pdu_len;
	tp->t_rcvtime = ticks;

	/*
	 * Don't update the window size or return credits since RX
	 * flow control is disabled.
	 */

	so = inp->inp_socket;
	sb = &so->so_rcv;
	SOCKBUF_LOCK(sb);

	icc = toep->ulpcb;
	if (__predict_false(icc == NULL || sb->sb_state & SBS_CANTRCVMORE)) {
		CTR5(KTR_CXGBE,
		    "%s: tid %u, excess rx (%d bytes), icc %p, sb_state 0x%x",
		    __func__, tid, pdu_len, icc, sb->sb_state);
		SOCKBUF_UNLOCK(sb);
		INP_WUNLOCK(inp);

		CURVNET_SET(so->so_vnet);
		NET_EPOCH_ENTER(et);
		INP_WLOCK(inp);
		tp = tcp_drop(tp, ECONNRESET);
		if (tp != NULL)
			INP_WUNLOCK(inp);
		NET_EPOCH_EXIT(et);
		CURVNET_RESTORE();

		icl_cxgbei_conn_pdu_free(NULL, ip);
		toep->ulpcb2 = NULL;
		return (0);
	}
	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
	ic = &icc->ic;
	if ((val & (F_DDP_PADDING_ERR | F_DDP_HDRCRC_ERR |
	    F_DDP_DATACRC_ERR)) != 0) {
		SOCKBUF_UNLOCK(sb);
		INP_WUNLOCK(inp);

		icl_cxgbei_conn_pdu_free(NULL, ip);
		toep->ulpcb2 = NULL;
		ic->ic_error(ic);
		return (0);
	}

	if (__predict_false(sbused(sb) != 0)) {
		/*
		 * PDUs were received before the tid transitioned to ULP mode.
		 * Convert them to icl_cxgbei_pdus and send them to ICL before
		 * the PDU in icp/ip.
		 */
		if (!parse_pdus(toep, icc, sb)) {
			SOCKBUF_UNLOCK(sb);
			INP_WUNLOCK(inp);

			icl_cxgbei_conn_pdu_free(NULL, ip);
			toep->ulpcb2 = NULL;
			ic->ic_error(ic);
			return (0);
		}
	}
	icl_cxgbei_new_pdu_set_conn(ip, ic);

	STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next);
	if ((icc->rx_flags & RXF_ACTIVE) == 0) {
		struct cxgbei_worker_thread_softc *cwt = &cwt_softc[icc->cwt];

		mtx_lock(&cwt->cwt_lock);
		icc->rx_flags |= RXF_ACTIVE;
		TAILQ_INSERT_TAIL(&cwt->rx_head, icc, rx_link);
		if (cwt->cwt_state == CWT_SLEEPING) {
			cwt->cwt_state = CWT_RUNNING;
			cv_signal(&cwt->cwt_cv);
		}
		mtx_unlock(&cwt->cwt_lock);
	}
	SOCKBUF_UNLOCK(sb);
	INP_WUNLOCK(inp);

	toep->ulpcb2 = NULL;

	return (0);
}

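/*
 * CPL_RX_ISCSI_CMP (T6+) completes a single PDU, or an entire burst of
 * Data-In/Data-Out PDUs whose payloads were placed by DDP.  A burst is
 * coalesced into one "large" PDU that covers the whole buffer range.
 */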
static int
do_rx_iscsi_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct epoch_tracker et;
	struct adapter *sc = iq->adapter;
	struct cpl_rx_iscsi_cmp *cpl = mtod(m, struct cpl_rx_iscsi_cmp *);
	u_int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
	struct icl_pdu *ip;
	struct cxgbei_cmp *cmp;
	struct inpcb *inp = toep->inp;
#ifdef INVARIANTS
	uint16_t len = be16toh(cpl->len);
	u_int data_digest_len;
#endif
	struct socket *so;
	struct sockbuf *sb;
	struct tcpcb *tp;
	struct icl_cxgbei_conn *icc;
	struct icl_conn *ic;
	struct iscsi_bhs_data_out *bhsdo;
	u_int val = be32toh(cpl->ddpvld);
	u_int npdus, pdu_len;
	uint32_t prev_seg_len;

	M_ASSERTPKTHDR(m);
	MPASS(m->m_pkthdr.len == len + sizeof(*cpl));

	if ((val & F_DDP_PDU) == 0) {
		MPASS(icp != NULL);
		MPASS((icp->icp_flags & ICPF_RX_STATUS) == 0);
		ip = &icp->ip;
	}

	if (icp == NULL) {
		/* T6 completion enabled, start of a new PDU. */
		ip = icl_cxgbei_new_pdu(M_NOWAIT);
		if (ip == NULL)
			CXGBE_UNIMPLEMENTED("PDU allocation failure");
		icp = ip_to_icp(ip);
	}
	pdu_len = G_ISCSI_PDU_LEN(be16toh(cpl->pdu_len_ddp));

#if 0
	CTR5(KTR_CXGBE,
	    "%s: tid %u, cpl->len %u, ddpvld 0x%08x, icp %p",
	    __func__, tid, pdu_len, val, icp);
#endif

	/* Copy header */
	m_copydata(m, sizeof(*cpl), ISCSI_BHS_SIZE, (caddr_t)ip->ip_bhs);
	bhsdo = (struct iscsi_bhs_data_out *)ip->ip_bhs;
	ip->ip_data_len = bhsdo->bhsdo_data_segment_len[0] << 16 |
	    bhsdo->bhsdo_data_segment_len[1] << 8 |
	    bhsdo->bhsdo_data_segment_len[2];
	icp->icp_seq = ntohl(cpl->seq);
	icp->icp_flags |= ICPF_RX_HDR;
	icp->icp_flags |= ICPF_RX_STATUS;

	if (val & F_DDP_PADDING_ERR) {
		ICL_WARN("received PDU 0x%02x with invalid padding",
		    ip->ip_bhs->bhs_opcode);
		toep->ofld_rxq->rx_iscsi_padding_errors++;
	}
	if (val & F_DDP_HDRCRC_ERR) {
		ICL_WARN("received PDU 0x%02x with invalid header digest",
		    ip->ip_bhs->bhs_opcode);
		toep->ofld_rxq->rx_iscsi_header_digest_errors++;
	}
	if (val & F_DDP_DATACRC_ERR) {
		ICL_WARN("received PDU 0x%02x with invalid data digest",
		    ip->ip_bhs->bhs_opcode);
		toep->ofld_rxq->rx_iscsi_data_digest_errors++;
	}

	INP_WLOCK(inp);
	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
		    __func__, tid, pdu_len, inp->inp_flags);
		INP_WUNLOCK(inp);
		icl_cxgbei_conn_pdu_free(NULL, ip);
		toep->ulpcb2 = NULL;
		m_freem(m);
		return (0);
	}

	tp = intotcpcb(inp);

	/*
	 * If icc is NULL, the connection is being closed in
	 * icl_cxgbei_conn_close(), just drop this data.
	 */
	icc = toep->ulpcb;
	if (__predict_false(icc == NULL)) {
		CTR4(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes), icc %p",
		    __func__, tid, pdu_len, icc);

		/*
		 * Update rcv_nxt so the sequence number of the FIN
		 * doesn't appear wrong.
		 */
		tp->rcv_nxt = icp->icp_seq + pdu_len;
		tp->t_rcvtime = ticks;
		INP_WUNLOCK(inp);

		icl_cxgbei_conn_pdu_free(NULL, ip);
		toep->ulpcb2 = NULL;
		m_freem(m);
		return (0);
	}

	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
	ic = &icc->ic;
	if ((val & (F_DDP_PADDING_ERR | F_DDP_HDRCRC_ERR |
	    F_DDP_DATACRC_ERR)) != 0) {
		INP_WUNLOCK(inp);

		icl_cxgbei_conn_pdu_free(NULL, ip);
		toep->ulpcb2 = NULL;
		m_freem(m);
		ic->ic_error(ic);
		return (0);
	}

#ifdef INVARIANTS
	data_digest_len = (icc->ulp_submode & ULP_CRC_DATA) ?
	    ISCSI_DATA_DIGEST_SIZE : 0;
	MPASS(roundup2(ip->ip_data_len, 4) == pdu_len - len - data_digest_len);
#endif

	if (val & F_DDP_PDU && ip->ip_data_mbuf == NULL) {
		MPASS((icp->icp_flags & ICPF_RX_FLBUF) == 0);
		MPASS(ip->ip_data_len > 0);
		icp->icp_flags |= ICPF_RX_DDP;
		bhsdo = (struct iscsi_bhs_data_out *)ip->ip_bhs;

		switch (ip->ip_bhs->bhs_opcode & ~ISCSI_BHS_OPCODE_IMMEDIATE) {
		case ISCSI_BHS_OPCODE_SCSI_DATA_IN:
			cmp = cxgbei_find_cmp(icc,
			    be32toh(bhsdo->bhsdo_initiator_task_tag));
			break;
		case ISCSI_BHS_OPCODE_SCSI_DATA_OUT:
			cmp = cxgbei_find_cmp(icc,
			    be32toh(bhsdo->bhsdo_target_transfer_tag));
			break;
		default:
			__assert_unreachable();
		}
		MPASS(cmp != NULL);

		/*
		 * The difference between the end of the last burst
		 * and the offset of the last PDU in this burst is
		 * the additional data received via DDP.
		 */
		prev_seg_len = be32toh(bhsdo->bhsdo_buffer_offset) -
		    cmp->next_buffer_offset;

		if (prev_seg_len != 0) {
			uint32_t orig_datasn;

			/*
			 * Return a "large" PDU representing the burst
			 * of PDUs.  Adjust the offset and length of
			 * this PDU to represent the entire burst.
			 */
			ip->ip_data_len += prev_seg_len;
			bhsdo->bhsdo_data_segment_len[2] = ip->ip_data_len;
			bhsdo->bhsdo_data_segment_len[1] = ip->ip_data_len >> 8;
			bhsdo->bhsdo_data_segment_len[0] = ip->ip_data_len >> 16;
			bhsdo->bhsdo_buffer_offset =
			    htobe32(cmp->next_buffer_offset);

			orig_datasn = be32toh(bhsdo->bhsdo_datasn);
			npdus = orig_datasn - cmp->last_datasn;
			bhsdo->bhsdo_datasn = htobe32(cmp->last_datasn + 1);
			cmp->last_datasn = orig_datasn;
			ip->ip_additional_pdus = npdus - 1;
		} else {
			MPASS(be32toh(bhsdo->bhsdo_datasn) ==
			    cmp->last_datasn + 1);
			npdus = 1;
			cmp->last_datasn = be32toh(bhsdo->bhsdo_datasn);
		}

		cmp->next_buffer_offset += ip->ip_data_len;
		toep->ofld_rxq->rx_iscsi_ddp_pdus += npdus;
		toep->ofld_rxq->rx_iscsi_ddp_octets += ip->ip_data_len;
	} else {
		MPASS(icp->icp_flags & (ICPF_RX_FLBUF));
		MPASS(ip->ip_data_len == ip->ip_data_mbuf->m_pkthdr.len);
	}

	tp->rcv_nxt = icp->icp_seq + pdu_len;
	tp->t_rcvtime = ticks;

	/*
	 * Don't update the window size or return credits since RX
	 * flow control is disabled.
	 */

	so = inp->inp_socket;
	sb = &so->so_rcv;
	SOCKBUF_LOCK(sb);
	if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
		CTR5(KTR_CXGBE,
		    "%s: tid %u, excess rx (%d bytes), icc %p, sb_state 0x%x",
		    __func__, tid, pdu_len, icc, sb->sb_state);
		SOCKBUF_UNLOCK(sb);
		INP_WUNLOCK(inp);

		CURVNET_SET(so->so_vnet);
		NET_EPOCH_ENTER(et);
		INP_WLOCK(inp);
		tp = tcp_drop(tp, ECONNRESET);
		if (tp != NULL)
			INP_WUNLOCK(inp);
		NET_EPOCH_EXIT(et);
		CURVNET_RESTORE();

		icl_cxgbei_conn_pdu_free(NULL, ip);
		toep->ulpcb2 = NULL;
		m_freem(m);
		return (0);
	}

	if (__predict_false(sbused(sb) != 0)) {
		/*
		 * PDUs were received before the tid transitioned to ULP mode.
		 * Convert them to icl_cxgbei_pdus and send them to ICL before
		 * the PDU in icp/ip.
		 */
		if (!parse_pdus(toep, icc, sb)) {
			SOCKBUF_UNLOCK(sb);
			INP_WUNLOCK(inp);

			icl_cxgbei_conn_pdu_free(NULL, ip);
			toep->ulpcb2 = NULL;
			m_freem(m);
			ic->ic_error(ic);
			return (0);
		}
	}
	icl_cxgbei_new_pdu_set_conn(ip, ic);

	/* Enqueue the PDU to the received pdus queue. */
	STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next);
	if ((icc->rx_flags & RXF_ACTIVE) == 0) {
		struct cxgbei_worker_thread_softc *cwt = &cwt_softc[icc->cwt];

		mtx_lock(&cwt->cwt_lock);
		icc->rx_flags |= RXF_ACTIVE;
		TAILQ_INSERT_TAIL(&cwt->rx_head, icc, rx_link);
		if (cwt->cwt_state == CWT_SLEEPING) {
			cwt->cwt_state = CWT_RUNNING;
			cv_signal(&cwt->cwt_cv);
		}
		mtx_unlock(&cwt->cwt_lock);
	}
	SOCKBUF_UNLOCK(sb);
	INP_WUNLOCK(inp);

	toep->ulpcb2 = NULL;
	m_freem(m);

	return (0);
}

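/* ULD activate callback: allocate and set up the per-adapter iSCSI softc. */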
static int
cxgbei_activate(struct adapter *sc)
{
	struct cxgbei_data *ci;
	int rc;

	ASSERT_SYNCHRONIZED_OP(sc);

	if (uld_active(sc, ULD_ISCSI)) {
		KASSERT(0, ("%s: iSCSI offload already enabled on adapter %p",
		    __func__, sc));
		return (0);
	}

	if (sc->iscsicaps == 0 || sc->vres.iscsi.size == 0) {
		device_printf(sc->dev,
		    "not iSCSI offload capable, or capability disabled.\n");
		return (ENOSYS);
	}

	/* per-adapter softc for iSCSI; M_WAITOK means this cannot fail. */
	ci = malloc(sizeof(*ci), M_CXGBE, M_ZERO | M_WAITOK);

	rc = cxgbei_init(sc, ci);
	if (rc != 0) {
		free(ci, M_CXGBE);
		return (rc);
	}

	sc->iscsi_ulp_softc = ci;

	return (0);
}

static int
cxgbei_deactivate(struct adapter *sc)
{
	struct cxgbei_data *ci = sc->iscsi_ulp_softc;

	ASSERT_SYNCHRONIZED_OP(sc);

	if (ci != NULL) {
		sysctl_ctx_free(&ci->ctx);
		t4_free_ppod_region(&ci->pr);
		free(ci, M_CXGBE);
		sc->iscsi_ulp_softc = NULL;
	}

	return (0);
}

static void
cxgbei_activate_all(struct adapter *sc, void *arg __unused)
{

	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isact") != 0)
		return;

	/* Activate iSCSI if any port on this adapter has IFCAP_TOE enabled. */
	if (sc->offload_map && !uld_active(sc, ULD_ISCSI))
		(void) t4_activate_uld(sc, ULD_ISCSI);

	end_synchronized_op(sc, 0);
}

static void
cxgbei_deactivate_all(struct adapter *sc, void *arg __unused)
{

	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isdea") != 0)
		return;

	if (uld_active(sc, ULD_ISCSI))
		(void) t4_deactivate_uld(sc, ULD_ISCSI);

	end_synchronized_op(sc, 0);
}

static struct uld_info cxgbei_uld_info = {
	.uld_id = ULD_ISCSI,
	.activate = cxgbei_activate,
	.deactivate = cxgbei_deactivate,
};

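/*
 * Worker thread main loop: drain rcvd_pdus for each connection queued on
 * rx_head and hand the PDUs to ICL without holding the socket buffer lock.
 */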
static void
cwt_main(void *arg)
{
	struct cxgbei_worker_thread_softc *cwt = arg;
	struct icl_cxgbei_conn *icc = NULL;
	struct icl_conn *ic;
	struct icl_pdu *ip;
	struct sockbuf *sb;
	STAILQ_HEAD(, icl_pdu) rx_pdus = STAILQ_HEAD_INITIALIZER(rx_pdus);

	MPASS(cwt != NULL);

	mtx_lock(&cwt->cwt_lock);
	MPASS(cwt->cwt_state == 0);
	cwt->cwt_state = CWT_RUNNING;
	cv_signal(&cwt->cwt_cv);

	while (__predict_true(cwt->cwt_state != CWT_STOP)) {
		cwt->cwt_state = CWT_RUNNING;
		while ((icc = TAILQ_FIRST(&cwt->rx_head)) != NULL) {
			TAILQ_REMOVE(&cwt->rx_head, icc, rx_link);
			mtx_unlock(&cwt->cwt_lock);

			ic = &icc->ic;
			sb = &ic->ic_socket->so_rcv;

			SOCKBUF_LOCK(sb);
			MPASS(icc->rx_flags & RXF_ACTIVE);
			if (__predict_true(!(sb->sb_state & SBS_CANTRCVMORE))) {
				MPASS(STAILQ_EMPTY(&rx_pdus));
				STAILQ_SWAP(&icc->rcvd_pdus, &rx_pdus, icl_pdu);
				SOCKBUF_UNLOCK(sb);

				/* Hand over PDUs to ICL. */
				while ((ip = STAILQ_FIRST(&rx_pdus)) != NULL) {
					STAILQ_REMOVE_HEAD(&rx_pdus, ip_next);
					ic->ic_receive(ip);
				}

				SOCKBUF_LOCK(sb);
				MPASS(STAILQ_EMPTY(&rx_pdus));
			}
			MPASS(icc->rx_flags & RXF_ACTIVE);
			if (STAILQ_EMPTY(&icc->rcvd_pdus) ||
			    __predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
				icc->rx_flags &= ~RXF_ACTIVE;
			} else {
				/*
				 * More PDUs were received while we were busy
				 * handing over the previous batch to ICL.
				 * Re-add this connection to the end of the
				 * queue.
				 */
				mtx_lock(&cwt->cwt_lock);
				TAILQ_INSERT_TAIL(&cwt->rx_head, icc,
				    rx_link);
				mtx_unlock(&cwt->cwt_lock);
			}
			SOCKBUF_UNLOCK(sb);

			mtx_lock(&cwt->cwt_lock);
		}

		/* Inner loop doesn't check for CWT_STOP, do that first. */
		if (__predict_false(cwt->cwt_state == CWT_STOP))
			break;
		cwt->cwt_state = CWT_SLEEPING;
		cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
	}

	MPASS(TAILQ_FIRST(&cwt->rx_head) == NULL);
	mtx_assert(&cwt->cwt_lock, MA_OWNED);
	cwt->cwt_state = CWT_STOPPED;
	cv_signal(&cwt->cwt_cv);
	mtx_unlock(&cwt->cwt_lock);
	kthread_exit();
}

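/* Start one worker thread per CPU, up to a maximum of 32. */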
static int
start_worker_threads(void)
{
	int i, rc;
	struct cxgbei_worker_thread_softc *cwt;

	worker_thread_count = min(mp_ncpus, 32);
	cwt_softc = malloc(worker_thread_count * sizeof(*cwt), M_CXGBE,
	    M_WAITOK | M_ZERO);

	MPASS(cxgbei_proc == NULL);
	for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) {
		mtx_init(&cwt->cwt_lock, "cwt lock", NULL, MTX_DEF);
		cv_init(&cwt->cwt_cv, "cwt cv");
		TAILQ_INIT(&cwt->rx_head);
		rc = kproc_kthread_add(cwt_main, cwt, &cxgbei_proc, NULL, 0, 0,
		    "cxgbei", "%d", i);
		if (rc != 0) {
			printf("cxgbei: failed to start thread #%d/%d (%d)\n",
			    i + 1, worker_thread_count, rc);
			mtx_destroy(&cwt->cwt_lock);
			cv_destroy(&cwt->cwt_cv);
			bzero(cwt, sizeof(*cwt));
			if (i == 0) {
				free(cwt_softc, M_CXGBE);
				worker_thread_count = 0;

				return (rc);
			}

			/* Not fatal, carry on with fewer threads. */
			worker_thread_count = i;
			rc = 0;
			break;
		}

		/* Wait for thread to start before moving on to the next one. */
		mtx_lock(&cwt->cwt_lock);
		while (cwt->cwt_state == 0)
			cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
		mtx_unlock(&cwt->cwt_lock);
	}

	MPASS(cwt_softc != NULL);
	MPASS(worker_thread_count > 0);
	return (0);
}

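/* Signal all worker threads to stop and wait for each of them to exit. */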
static void
stop_worker_threads(void)
{
	int i;
	struct cxgbei_worker_thread_softc *cwt;

	MPASS(worker_thread_count >= 0);

	for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) {
		mtx_lock(&cwt->cwt_lock);
		MPASS(cwt->cwt_state == CWT_RUNNING ||
		    cwt->cwt_state == CWT_SLEEPING);
		cwt->cwt_state = CWT_STOP;
		cv_signal(&cwt->cwt_cv);
		do {
			cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
		} while (cwt->cwt_state != CWT_STOPPED);
		mtx_unlock(&cwt->cwt_lock);
		mtx_destroy(&cwt->cwt_lock);
		cv_destroy(&cwt->cwt_cv);
	}
	free(cwt_softc, M_CXGBE);
}

/* Select a worker thread for a connection. */
u_int
cxgbei_select_worker_thread(struct icl_cxgbei_conn *icc)
{
	struct adapter *sc = icc->sc;
	struct toepcb *toep = icc->toep;
	u_int i, n;

	n = worker_thread_count / sc->sge.nofldrxq;
	if (n > 0)
		i = toep->vi->pi->port_id * n + arc4random() % n;
	else
		i = arc4random() % worker_thread_count;

	CTR3(KTR_CXGBE, "%s: tid %u, cwt %u", __func__, toep->tid, i);

	return (i);
}

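/*
 * Register the rx CPL handlers, start the worker threads, register the
 * ULD, and activate iSCSI on every adapter that already has TOE enabled.
 */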
static int
cxgbei_mod_load(void)
{
	int rc;

	t4_register_cpl_handler(CPL_ISCSI_HDR, do_rx_iscsi_hdr);
	t4_register_cpl_handler(CPL_ISCSI_DATA, do_rx_iscsi_data);
	t4_register_cpl_handler(CPL_RX_ISCSI_DDP, do_rx_iscsi_ddp);
	t4_register_cpl_handler(CPL_RX_ISCSI_CMP, do_rx_iscsi_cmp);

	rc = start_worker_threads();
	if (rc != 0)
		return (rc);

	rc = t4_register_uld(&cxgbei_uld_info);
	if (rc != 0) {
		stop_worker_threads();
		return (rc);
	}

	t4_iterate(cxgbei_activate_all, NULL);

	return (rc);
}

static int
cxgbei_mod_unload(void)
{

	t4_iterate(cxgbei_deactivate_all, NULL);

	if (t4_unregister_uld(&cxgbei_uld_info) == EBUSY)
		return (EBUSY);

	stop_worker_threads();

	t4_register_cpl_handler(CPL_ISCSI_HDR, NULL);
	t4_register_cpl_handler(CPL_ISCSI_DATA, NULL);
	t4_register_cpl_handler(CPL_RX_ISCSI_DDP, NULL);
	t4_register_cpl_handler(CPL_RX_ISCSI_CMP, NULL);

	return (0);
}
#endif

static int
cxgbei_modevent(module_t mod, int cmd, void *arg)
{
	int rc = 0;

#ifdef TCP_OFFLOAD
	switch (cmd) {
	case MOD_LOAD:
		rc = cxgbei_mod_load();
		if (rc == 0)
			rc = icl_cxgbei_mod_load();
		break;

	case MOD_UNLOAD:
		rc = icl_cxgbei_mod_unload();
		if (rc == 0)
			rc = cxgbei_mod_unload();
		break;

	default:
		rc = EINVAL;
	}
#else
	printf("cxgbei: compiled without TCP_OFFLOAD support.\n");
	rc = EOPNOTSUPP;
#endif

	return (rc);
}

static moduledata_t cxgbei_mod = {
	"cxgbei",
	cxgbei_modevent,
	NULL,
};

MODULE_VERSION(cxgbei, 1);
DECLARE_MODULE(cxgbei, cxgbei_mod, SI_SUB_EXEC, SI_ORDER_ANY);
MODULE_DEPEND(cxgbei, t4_tom, 1, 1, 1);
MODULE_DEPEND(cxgbei, cxgbe, 1, 1, 1);
MODULE_DEPEND(cxgbei, icl, 1, 1, 1);