xref: /freebsd/sys/dev/cxgbe/cxgbei/cxgbei.c (revision 5405b282e1f319b6f3597bb77f68be903e7f248c)
1 /*-
2  * Copyright (c) 2012 Chelsio Communications, Inc.
3  * All rights reserved.
4  *
5  * Chelsio T5xx iSCSI driver
6  *
7  * Written by: Sreenivasa Honnur <shonnur@chelsio.com>
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include "opt_inet.h"
35 #include "opt_inet6.h"
36 
37 #include <sys/types.h>
38 #include <sys/param.h>
39 #include <sys/kernel.h>
40 #include <sys/ktr.h>
41 #include <sys/module.h>
42 #include <sys/systm.h>
43 
44 #ifdef TCP_OFFLOAD
45 #include <sys/errno.h>
46 #include <sys/kthread.h>
47 #include <sys/smp.h>
48 #include <sys/socket.h>
49 #include <sys/socketvar.h>
50 #include <sys/mbuf.h>
51 #include <sys/lock.h>
52 #include <sys/mutex.h>
53 #include <sys/condvar.h>
54 
55 #include <netinet/in.h>
56 #include <netinet/in_pcb.h>
57 #include <netinet/toecore.h>
58 #include <netinet/tcp_var.h>
59 #include <netinet/tcp_fsm.h>
60 
61 #include <cam/scsi/scsi_all.h>
62 #include <cam/scsi/scsi_da.h>
63 #include <cam/ctl/ctl_io.h>
64 #include <cam/ctl/ctl.h>
65 #include <cam/ctl/ctl_backend.h>
66 #include <cam/ctl/ctl_error.h>
67 #include <cam/ctl/ctl_frontend.h>
68 #include <cam/ctl/ctl_debug.h>
69 #include <cam/ctl/ctl_ha.h>
70 #include <cam/ctl/ctl_ioctl.h>
71 
72 #include <dev/iscsi/icl.h>
73 #include <dev/iscsi/iscsi_proto.h>
74 #include <dev/iscsi/iscsi_ioctl.h>
75 #include <dev/iscsi/iscsi.h>
76 #include <cam/ctl/ctl_frontend_iscsi.h>
77 
78 #include <cam/cam.h>
79 #include <cam/cam_ccb.h>
80 #include <cam/cam_xpt.h>
81 #include <cam/cam_debug.h>
82 #include <cam/cam_sim.h>
83 #include <cam/cam_xpt_sim.h>
84 #include <cam/cam_xpt_periph.h>
85 #include <cam/cam_periph.h>
86 #include <cam/cam_compat.h>
87 #include <cam/scsi/scsi_message.h>
88 
89 #include "common/common.h"
90 #include "common/t4_msg.h"
91 #include "common/t4_regs.h"     /* for PCIE_MEM_ACCESS */
92 #include "tom/t4_tom.h"
93 #include "cxgbei.h"
94 
/* Number of worker threads actually running (0 .. min(mp_ncpus, 32)). */
static int worker_thread_count;
/* Per-thread state array with worker_thread_count entries. */
static struct cxgbei_worker_thread_softc *cwt_softc;
/* Kernel process hosting all of the cwt threads. */
static struct proc *cxgbei_proc;

/* XXXNP some header instead. */
struct icl_pdu *icl_cxgbei_new_pdu(int);
void icl_cxgbei_new_pdu_set_conn(struct icl_pdu *, struct icl_conn *);
void icl_cxgbei_conn_pdu_free(struct icl_conn *, struct icl_pdu *);
103 
104 static void
105 free_ci_counters(struct cxgbei_data *ci)
106 {
107 
108 #define FREE_CI_COUNTER(x) do { \
109 	if (ci->x != NULL) { \
110 		counter_u64_free(ci->x); \
111 		ci->x = NULL; \
112 	} \
113 } while (0)
114 
115 	FREE_CI_COUNTER(ddp_setup_ok);
116 	FREE_CI_COUNTER(ddp_setup_error);
117 	FREE_CI_COUNTER(ddp_bytes);
118 	FREE_CI_COUNTER(ddp_pdus);
119 	FREE_CI_COUNTER(fl_bytes);
120 	FREE_CI_COUNTER(fl_pdus);
121 #undef FREE_CI_COUNTER
122 }
123 
/*
 * Allocate all statistics counters of the per-adapter iSCSI softc.
 * Returns 0 on success or ENOMEM after freeing any counters that were
 * already allocated.  Note: counter_u64_alloc(9) with M_WAITOK is not
 * expected to fail, so the error path is defensive.
 */
static int
alloc_ci_counters(struct cxgbei_data *ci)
{

#define ALLOC_CI_COUNTER(x) do { \
	ci->x = counter_u64_alloc(M_WAITOK); \
	if (ci->x == NULL) \
		goto fail; \
} while (0)

	ALLOC_CI_COUNTER(ddp_setup_ok);
	ALLOC_CI_COUNTER(ddp_setup_error);
	ALLOC_CI_COUNTER(ddp_bytes);
	ALLOC_CI_COUNTER(ddp_pdus);
	ALLOC_CI_COUNTER(fl_bytes);
	ALLOC_CI_COUNTER(fl_pdus);
#undef ALLOC_CI_COUNTER

	return (0);
fail:
	/* Undo the partial allocation; free_ci_counters skips NULL entries. */
	free_ci_counters(ci);
	return (ENOMEM);
}
147 
/*
 * Compute the maximum tx and rx PDU lengths the hardware supports, starting
 * from the PMM page sizes and clamping with the various TP limits.  Both
 * results are rounded down to a multiple of 512.
 */
static void
read_pdu_limits(struct adapter *sc, uint32_t *max_tx_pdu_len,
    uint32_t *max_rx_pdu_len)
{
	uint32_t tx_len, rx_len, r, v;

	rx_len = t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE);
	tx_len = t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE);

	/* Clamp to the TP max data limit. */
	r = t4_read_reg(sc, A_TP_PARA_REG2);
	rx_len = min(rx_len, G_MAXRXDATA(r));
	/* NOTE(review): tx is clamped with the rx field too -- presumably
	 * intentional since TP_PARA_REG2 has no separate tx field; confirm. */
	tx_len = min(tx_len, G_MAXRXDATA(r));

	/* Clamp to the smaller of the two PM max transfer lengths. */
	r = t4_read_reg(sc, A_TP_PARA_REG7);
	v = min(G_PMMAXXFERLEN0(r), G_PMMAXXFERLEN1(r));
	rx_len = min(rx_len, v);
	tx_len = min(tx_len, v);

	/* Remove after FW_FLOWC_MNEM_TXDATAPLEN_MAX fix in firmware. */
	tx_len = min(tx_len, 3 * 4096);

	*max_tx_pdu_len = rounddown2(tx_len, 512);
	*max_rx_pdu_len = rounddown2(rx_len, 512);
}
172 
173 /*
174  * Initialize the software state of the iSCSI ULP driver.
175  *
176  * ENXIO means firmware didn't set up something that it was supposed to.
177  */
static int
cxgbei_init(struct adapter *sc, struct cxgbei_data *ci)
{
	struct sysctl_oid *oid;
	struct sysctl_oid_list *children;
	struct ppod_region *pr;
	uint32_t r;
	int rc;

	MPASS(sc->vres.iscsi.size > 0);
	MPASS(ci != NULL);

	rc = alloc_ci_counters(ci);
	if (rc != 0)
		return (rc);

	read_pdu_limits(sc, &ci->max_tx_pdu_len, &ci->max_rx_pdu_len);

	/* Carve the adapter's iSCSI memory region into page pods for DDP. */
	pr = &ci->pr;
	r = t4_read_reg(sc, A_ULP_RX_ISCSI_PSZ);
	rc = t4_init_ppod_region(pr, &sc->vres.iscsi, r, "iSCSI page pods");
	if (rc != 0) {
		device_printf(sc->dev,
		    "%s: failed to initialize the iSCSI page pod region: %u.\n",
		    __func__, rc);
		free_ci_counters(ci);
		return (rc);
	}

	r = t4_read_reg(sc, A_ULP_RX_ISCSI_TAGMASK);
	r &= V_ISCSITAGMASK(M_ISCSITAGMASK);
	if (r != pr->pr_tag_mask) {
		/*
		 * Recent firmwares are supposed to set up the iSCSI tagmask
		 * but we'll do it ourselves if the computed value doesn't match
		 * what's in the register.
		 */
		device_printf(sc->dev,
		    "tagmask 0x%08x does not match computed mask 0x%08x.\n", r,
		    pr->pr_tag_mask);
		t4_set_reg_field(sc, A_ULP_RX_ISCSI_TAGMASK,
		    V_ISCSITAGMASK(M_ISCSITAGMASK), pr->pr_tag_mask);
	}

	/* Hang the iSCSI statistics under the adapter's sysctl tree. */
	sysctl_ctx_init(&ci->ctx);
	oid = device_get_sysctl_tree(sc->dev);	/* dev.t5nex.X */
	children = SYSCTL_CHILDREN(oid);

	oid = SYSCTL_ADD_NODE(&ci->ctx, children, OID_AUTO, "iscsi", CTLFLAG_RD,
	    NULL, "iSCSI ULP statistics");
	children = SYSCTL_CHILDREN(oid);

	SYSCTL_ADD_COUNTER_U64(&ci->ctx, children, OID_AUTO, "ddp_setup_ok",
	    CTLFLAG_RD, &ci->ddp_setup_ok,
	    "# of times DDP buffer was setup successfully.");

	SYSCTL_ADD_COUNTER_U64(&ci->ctx, children, OID_AUTO, "ddp_setup_error",
	    CTLFLAG_RD, &ci->ddp_setup_error,
	    "# of times DDP buffer setup failed.");

	SYSCTL_ADD_COUNTER_U64(&ci->ctx, children, OID_AUTO, "ddp_bytes",
	    CTLFLAG_RD, &ci->ddp_bytes, "# of bytes placed directly");

	SYSCTL_ADD_COUNTER_U64(&ci->ctx, children, OID_AUTO, "ddp_pdus",
	    CTLFLAG_RD, &ci->ddp_pdus, "# of PDUs with data placed directly.");

	SYSCTL_ADD_COUNTER_U64(&ci->ctx, children, OID_AUTO, "fl_bytes",
	    CTLFLAG_RD, &ci->fl_bytes, "# of data bytes delivered in freelist");

	SYSCTL_ADD_COUNTER_U64(&ci->ctx, children, OID_AUTO, "fl_pdus",
	    CTLFLAG_RD, &ci->fl_pdus,
	    "# of PDUs with data delivered in freelist");

	/* Payloads up to this size are received via the freelist, not DDP. */
	ci->ddp_threshold = 2048;
	SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "ddp_threshold",
	    CTLFLAG_RW, &ci->ddp_threshold, 0, "Rx zero copy threshold");

	return (0);
}
257 
/*
 * CPL_ISCSI_HDR handler: the hardware has delivered the BHS of a new iSCSI
 * PDU.  Allocate an icl_cxgbei_pdu, copy the BHS into it, and stash it in
 * toep->ulpcb2 where the subsequent CPL_ISCSI_DATA / CPL_RX_ISCSI_DDP
 * handlers for the same PDU expect to find it.
 */
static int
do_rx_iscsi_hdr(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	struct cpl_iscsi_hdr *cpl = mtod(m, struct cpl_iscsi_hdr *);
	u_int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct icl_pdu *ip;
	struct icl_cxgbei_pdu *icp;
	uint16_t len_ddp = be16toh(cpl->pdu_len_ddp);
	uint16_t len = be16toh(cpl->len);

	M_ASSERTPKTHDR(m);
	MPASS(m->m_pkthdr.len == len + sizeof(*cpl));

	ip = icl_cxgbei_new_pdu(M_NOWAIT);
	if (ip == NULL)
		CXGBE_UNIMPLEMENTED("PDU allocation failure");
	m_copydata(m, sizeof(*cpl), ISCSI_BHS_SIZE, (caddr_t)ip->ip_bhs);
	/* Data length = total PDU length minus what this CPL carried (BHS). */
	ip->ip_data_len = G_ISCSI_PDU_LEN(len_ddp) - len;
	icp = ip_to_icp(ip);
	icp->icp_seq = ntohl(cpl->seq);
	icp->icp_flags = ICPF_RX_HDR;

	/* This is the start of a new PDU.  There should be no old state. */
	MPASS(toep->ulpcb2 == NULL);
	toep->ulpcb2 = icp;

#if 0
	CTR5(KTR_CXGBE, "%s: tid %u, cpl->len %u, pdu_len_ddp 0x%04x, icp %p",
	    __func__, tid, len, len_ddp, icp);
#endif

	m_freem(m);
	return (0);
}
294 
/*
 * CPL_ISCSI_DATA handler: payload of the PDU headed by toep->ulpcb2 that was
 * NOT placed directly (DDP) and arrives via the freelist instead.  The mbuf
 * is attached to the in-progress PDU; it is consumed here, not freed.
 */
static int
do_rx_iscsi_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
	struct cpl_iscsi_data *cpl =  mtod(m, struct cpl_iscsi_data *);
	u_int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct icl_cxgbei_pdu *icp = toep->ulpcb2;

	M_ASSERTPKTHDR(m);
	MPASS(m->m_pkthdr.len == be16toh(cpl->len) + sizeof(*cpl));

	/* Must already have received the header (but not the data). */
	MPASS(icp != NULL);
	MPASS(icp->icp_flags == ICPF_RX_HDR);
	MPASS(icp->ip.ip_data_mbuf == NULL);


	/* Strip the CPL header; what remains is the PDU payload. */
	m_adj(m, sizeof(*cpl));
	MPASS(icp->ip.ip_data_len == m->m_pkthdr.len);

	icp->icp_flags |= ICPF_RX_FLBUF;
	icp->ip.ip_data_mbuf = m;
	counter_u64_add(ci->fl_pdus, 1);
	counter_u64_add(ci->fl_bytes, m->m_pkthdr.len);

#if 0
	CTR3(KTR_CXGBE, "%s: tid %u, cpl->len %u", __func__, tid,
	    be16toh(cpl->len));
#endif

	return (0);
}
329 
/*
 * CPL_RX_ISCSI_DDP handler: final CPL for the PDU being assembled in
 * toep->ulpcb2.  Records CRC/padding status, accounts for DDP-placed data,
 * advances the TCP receive state, and queues the completed PDU to the
 * connection's worker thread for delivery to ICL.
 *
 * Lock order here: INP_WLOCK(inp), then SOCKBUF_LOCK(sb), then cwt_lock.
 */
static int
do_rx_iscsi_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
	const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1);
	u_int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
	struct socket *so;
	struct sockbuf *sb;
	struct tcpcb *tp;
	struct icl_cxgbei_conn *icc;
	struct icl_conn *ic;
	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
	struct icl_pdu *ip;
	u_int pdu_len, val;
	struct epoch_tracker et;

	MPASS(m == NULL);

	/* Must already be assembling a PDU. */
	MPASS(icp != NULL);
	MPASS(icp->icp_flags & ICPF_RX_HDR);	/* Data is optional. */
	MPASS((icp->icp_flags & ICPF_RX_STATUS) == 0);

	pdu_len = be16toh(cpl->len);	/* includes everything. */
	val = be32toh(cpl->ddpvld);

#if 0
	CTR5(KTR_CXGBE,
	    "%s: tid %u, cpl->len %u, ddpvld 0x%08x, icp_flags 0x%08x",
	    __func__, tid, pdu_len, val, icp->icp_flags);
#endif

	/* Translate the hardware status bits into PDU flags. */
	icp->icp_flags |= ICPF_RX_STATUS;
	ip = &icp->ip;
	if (val & F_DDP_PADDING_ERR)
		icp->icp_flags |= ICPF_PAD_ERR;
	if (val & F_DDP_HDRCRC_ERR)
		icp->icp_flags |= ICPF_HCRC_ERR;
	if (val & F_DDP_DATACRC_ERR)
		icp->icp_flags |= ICPF_DCRC_ERR;
	if (val & F_DDP_PDU && ip->ip_data_mbuf == NULL) {
		/* Payload was placed directly; no freelist mbuf involved. */
		MPASS((icp->icp_flags & ICPF_RX_FLBUF) == 0);
		MPASS(ip->ip_data_len > 0);
		icp->icp_flags |= ICPF_RX_DDP;
		counter_u64_add(ci->ddp_pdus, 1);
		counter_u64_add(ci->ddp_bytes, ip->ip_data_len);
	}

	INP_WLOCK(inp);
	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
		/* Connection is going away; discard the PDU. */
		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
		    __func__, tid, pdu_len, inp->inp_flags);
		INP_WUNLOCK(inp);
		icl_cxgbei_conn_pdu_free(NULL, ip);
#ifdef INVARIANTS
		toep->ulpcb2 = NULL;
#endif
		return (0);
	}

	/* Account for the PDU in the TCP sequence/window state. */
	tp = intotcpcb(inp);
	MPASS(icp->icp_seq == tp->rcv_nxt);
	MPASS(tp->rcv_wnd >= pdu_len);
	tp->rcv_nxt += pdu_len;
	tp->rcv_wnd -= pdu_len;
	tp->t_rcvtime = ticks;

	/* update rx credits */
	toep->rx_credits += pdu_len;
	t4_rcvd(&toep->td->tod, tp);	/* XXX: sc->tom_softc.tod */

	so = inp->inp_socket;
	sb = &so->so_rcv;
	SOCKBUF_LOCK(sb);

	icc = toep->ulpcb;
	if (__predict_false(icc == NULL || sb->sb_state & SBS_CANTRCVMORE)) {
		/* No ICL connection to hand the PDU to; reset the TCP
		 * connection and free the PDU. */
		CTR5(KTR_CXGBE,
		    "%s: tid %u, excess rx (%d bytes), icc %p, sb_state 0x%x",
		    __func__, tid, pdu_len, icc, sb->sb_state);
		SOCKBUF_UNLOCK(sb);
		INP_WUNLOCK(inp);

		/* tcp_drop() requires the tcbinfo read lock. */
		INP_INFO_RLOCK_ET(&V_tcbinfo, et);
		INP_WLOCK(inp);
		tp = tcp_drop(tp, ECONNRESET);
		if (tp)
			INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);

		icl_cxgbei_conn_pdu_free(NULL, ip);
#ifdef INVARIANTS
		toep->ulpcb2 = NULL;
#endif
		return (0);
	}
	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
	ic = &icc->ic;
	icl_cxgbei_new_pdu_set_conn(ip, ic);

	MPASS(m == NULL); /* was unused, we'll use it now. */
	m = sbcut_locked(sb, sbused(sb)); /* XXXNP: toep->sb_cc accounting? */
	if (__predict_false(m != NULL)) {
		int len = m_length(m, NULL);

		/*
		 * PDUs were received before the tid transitioned to ULP mode.
		 * Convert them to icl_cxgbei_pdus and send them to ICL before
		 * the PDU in icp/ip.
		 */
		CTR3(KTR_CXGBE, "%s: tid %u, %u bytes in so_rcv", __func__, tid,
		    len);

		/* XXXNP: needs to be rewritten. */
		if (len == sizeof(struct iscsi_bhs) || len == 4 + sizeof(struct
		    iscsi_bhs)) {
			struct icl_cxgbei_pdu *icp0;
			struct icl_pdu *ip0;

			ip0 = icl_cxgbei_new_pdu(M_NOWAIT);
			if (ip0 == NULL)
				CXGBE_UNIMPLEMENTED("PDU allocation failure");
			icl_cxgbei_new_pdu_set_conn(ip0, ic);
			icp0 = ip_to_icp(ip0);
			icp0->icp_seq = 0; /* XXX */
			icp0->icp_flags = ICPF_RX_HDR | ICPF_RX_STATUS;
			m_copydata(m, 0, sizeof(struct iscsi_bhs), (void *)ip0->ip_bhs);
			STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip0, ip_next);
		}
		m_freem(m);
	}

	/* Queue the PDU and wake the connection's worker thread if idle. */
	STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next);
	if ((icc->rx_flags & RXF_ACTIVE) == 0) {
		struct cxgbei_worker_thread_softc *cwt = &cwt_softc[icc->cwt];

		mtx_lock(&cwt->cwt_lock);
		icc->rx_flags |= RXF_ACTIVE;
		TAILQ_INSERT_TAIL(&cwt->rx_head, icc, rx_link);
		if (cwt->cwt_state == CWT_SLEEPING) {
			cwt->cwt_state = CWT_RUNNING;
			cv_signal(&cwt->cwt_cv);
		}
		mtx_unlock(&cwt->cwt_lock);
	}
	SOCKBUF_UNLOCK(sb);
	INP_WUNLOCK(inp);

#ifdef INVARIANTS
	toep->ulpcb2 = NULL;
#endif

	return (0);
}
487 
488 static int
489 cxgbei_activate(struct adapter *sc)
490 {
491 	struct cxgbei_data *ci;
492 	int rc;
493 
494 	ASSERT_SYNCHRONIZED_OP(sc);
495 
496 	if (uld_active(sc, ULD_ISCSI)) {
497 		KASSERT(0, ("%s: iSCSI offload already enabled on adapter %p",
498 		    __func__, sc));
499 		return (0);
500 	}
501 
502 	if (sc->iscsicaps == 0 || sc->vres.iscsi.size == 0) {
503 		device_printf(sc->dev,
504 		    "not iSCSI offload capable, or capability disabled.\n");
505 		return (ENOSYS);
506 	}
507 
508 	/* per-adapter softc for iSCSI */
509 	ci = malloc(sizeof(*ci), M_CXGBE, M_ZERO | M_WAITOK);
510 	if (ci == NULL)
511 		return (ENOMEM);
512 
513 	rc = cxgbei_init(sc, ci);
514 	if (rc != 0) {
515 		free(ci, M_CXGBE);
516 		return (rc);
517 	}
518 
519 	sc->iscsi_ulp_softc = ci;
520 
521 	return (0);
522 }
523 
524 static int
525 cxgbei_deactivate(struct adapter *sc)
526 {
527 	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
528 
529 	ASSERT_SYNCHRONIZED_OP(sc);
530 
531 	if (ci != NULL) {
532 		sysctl_ctx_free(&ci->ctx);
533 		t4_free_ppod_region(&ci->pr);
534 		free_ci_counters(ci);
535 		free(ci, M_CXGBE);
536 		sc->iscsi_ulp_softc = NULL;
537 	}
538 
539 	return (0);
540 }
541 
542 static void
543 cxgbei_activate_all(struct adapter *sc, void *arg __unused)
544 {
545 
546 	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isact") != 0)
547 		return;
548 
549 	/* Activate iSCSI if any port on this adapter has IFCAP_TOE enabled. */
550 	if (sc->offload_map && !uld_active(sc, ULD_ISCSI))
551 		(void) t4_activate_uld(sc, ULD_ISCSI);
552 
553 	end_synchronized_op(sc, 0);
554 }
555 
556 static void
557 cxgbei_deactivate_all(struct adapter *sc, void *arg __unused)
558 {
559 
560 	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isdea") != 0)
561 		return;
562 
563 	if (uld_active(sc, ULD_ISCSI))
564 	    (void) t4_deactivate_uld(sc, ULD_ISCSI);
565 
566 	end_synchronized_op(sc, 0);
567 }
568 
/* Hooks invoked by the base driver when iSCSI offload is (de)activated. */
static struct uld_info cxgbei_uld_info = {
	.uld_id = ULD_ISCSI,
	.activate = cxgbei_activate,
	.deactivate = cxgbei_deactivate,
};
574 
/*
 * Worker thread main loop.  Each thread drains the connections queued on its
 * rx_head list, handing every connection's received PDUs to ICL outside the
 * socket buffer lock, and sleeps on its condvar when there is no work.
 * Exits when cwt_state is set to CWT_STOP by stop_worker_threads().
 */
static void
cwt_main(void *arg)
{
	struct cxgbei_worker_thread_softc *cwt = arg;
	struct icl_cxgbei_conn *icc = NULL;
	struct icl_conn *ic;
	struct icl_pdu *ip;
	struct sockbuf *sb;
	STAILQ_HEAD(, icl_pdu) rx_pdus = STAILQ_HEAD_INITIALIZER(rx_pdus);

	MPASS(cwt != NULL);

	mtx_lock(&cwt->cwt_lock);
	MPASS(cwt->cwt_state == 0);
	cwt->cwt_state = CWT_RUNNING;
	/* Tell start_worker_threads() we are up. */
	cv_signal(&cwt->cwt_cv);

	while (__predict_true(cwt->cwt_state != CWT_STOP)) {
		cwt->cwt_state = CWT_RUNNING;
		while ((icc = TAILQ_FIRST(&cwt->rx_head)) != NULL) {
			TAILQ_REMOVE(&cwt->rx_head, icc, rx_link);
			mtx_unlock(&cwt->cwt_lock);

			ic = &icc->ic;
			sb = &ic->ic_socket->so_rcv;

			SOCKBUF_LOCK(sb);
			MPASS(icc->rx_flags & RXF_ACTIVE);
			if (__predict_true(!(sb->sb_state & SBS_CANTRCVMORE))) {
				MPASS(STAILQ_EMPTY(&rx_pdus));
				/* Take the whole batch so the sockbuf lock
				 * can be dropped while calling into ICL. */
				STAILQ_SWAP(&icc->rcvd_pdus, &rx_pdus, icl_pdu);
				SOCKBUF_UNLOCK(sb);

				/* Hand over PDUs to ICL. */
				while ((ip = STAILQ_FIRST(&rx_pdus)) != NULL) {
					STAILQ_REMOVE_HEAD(&rx_pdus, ip_next);
					ic->ic_receive(ip);
				}

				SOCKBUF_LOCK(sb);
				MPASS(STAILQ_EMPTY(&rx_pdus));
			}
			MPASS(icc->rx_flags & RXF_ACTIVE);
			if (STAILQ_EMPTY(&icc->rcvd_pdus) ||
			    __predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
				icc->rx_flags &= ~RXF_ACTIVE;
			} else {
				/*
				 * More PDUs were received while we were busy
				 * handing over the previous batch to ICL.
				 * Re-add this connection to the end of the
				 * queue.
				 */
				mtx_lock(&cwt->cwt_lock);
				TAILQ_INSERT_TAIL(&cwt->rx_head, icc,
				    rx_link);
				mtx_unlock(&cwt->cwt_lock);
			}
			SOCKBUF_UNLOCK(sb);

			mtx_lock(&cwt->cwt_lock);
		}

		/* Inner loop doesn't check for CWT_STOP, do that first. */
		if (__predict_false(cwt->cwt_state == CWT_STOP))
			break;
		cwt->cwt_state = CWT_SLEEPING;
		cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
	}

	/* Acknowledge the stop request and exit. */
	MPASS(TAILQ_FIRST(&cwt->rx_head) == NULL);
	mtx_assert(&cwt->cwt_lock, MA_OWNED);
	cwt->cwt_state = CWT_STOPPED;
	cv_signal(&cwt->cwt_cv);
	mtx_unlock(&cwt->cwt_lock);
	kthread_exit();
}
652 
/*
 * Start up to min(mp_ncpus, 32) worker threads under a single kernel
 * process.  Returns 0 if at least one thread started (failures after the
 * first simply reduce worker_thread_count), or the kproc_kthread_add()
 * error if no thread could be started.
 */
static int
start_worker_threads(void)
{
	int i, rc;
	struct cxgbei_worker_thread_softc *cwt;

	worker_thread_count = min(mp_ncpus, 32);
	cwt_softc = malloc(worker_thread_count * sizeof(*cwt), M_CXGBE,
	    M_WAITOK | M_ZERO);

	MPASS(cxgbei_proc == NULL);
	for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) {
		mtx_init(&cwt->cwt_lock, "cwt lock", NULL, MTX_DEF);
		cv_init(&cwt->cwt_cv, "cwt cv");
		TAILQ_INIT(&cwt->rx_head);
		rc = kproc_kthread_add(cwt_main, cwt, &cxgbei_proc, NULL, 0, 0,
		    "cxgbei", "%d", i);
		if (rc != 0) {
			printf("cxgbei: failed to start thread #%d/%d (%d)\n",
			    i + 1, worker_thread_count, rc);
			mtx_destroy(&cwt->cwt_lock);
			cv_destroy(&cwt->cwt_cv);
			bzero(cwt, sizeof(*cwt));
			if (i == 0) {
				/* Not even one thread; give up entirely. */
				free(cwt_softc, M_CXGBE);
				worker_thread_count = 0;

				return (rc);
			}

			/* Not fatal, carry on with fewer threads. */
			worker_thread_count = i;
			rc = 0;
			break;
		}

		/* Wait for thread to start before moving on to the next one. */
		mtx_lock(&cwt->cwt_lock);
		while (cwt->cwt_state == 0)
			cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
		mtx_unlock(&cwt->cwt_lock);
	}

	MPASS(cwt_softc != NULL);
	MPASS(worker_thread_count > 0);
	return (0);
}
700 
701 static void
702 stop_worker_threads(void)
703 {
704 	int i;
705 	struct cxgbei_worker_thread_softc *cwt = &cwt_softc[0];
706 
707 	MPASS(worker_thread_count >= 0);
708 
709 	for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) {
710 		mtx_lock(&cwt->cwt_lock);
711 		MPASS(cwt->cwt_state == CWT_RUNNING ||
712 		    cwt->cwt_state == CWT_SLEEPING);
713 		cwt->cwt_state = CWT_STOP;
714 		cv_signal(&cwt->cwt_cv);
715 		do {
716 			cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
717 		} while (cwt->cwt_state != CWT_STOPPED);
718 		mtx_unlock(&cwt->cwt_lock);
719 	}
720 	free(cwt_softc, M_CXGBE);
721 }
722 
/*
 * Select a worker thread for a connection.  When there are at least as many
 * threads as offload rx queues, threads are partitioned per port; otherwise
 * a thread is picked uniformly at random.
 */
u_int
cxgbei_select_worker_thread(struct icl_cxgbei_conn *icc)
{
	struct adapter *sc = icc->sc;
	struct toepcb *toep = icc->toep;
	u_int i, n;

	n = worker_thread_count / sc->sge.nofldrxq;
	if (n > 0)
		/* NOTE(review): assumes port_id * n + n - 1 stays below
		 * worker_thread_count for all ports -- confirm. */
		i = toep->vi->pi->port_id * n + arc4random() % n;
	else
		i = arc4random() % worker_thread_count;

	CTR3(KTR_CXGBE, "%s: tid %u, cwt %u", __func__, toep->tid, i);

	return (i);
}
741 
742 static int
743 cxgbei_mod_load(void)
744 {
745 	int rc;
746 
747 	t4_register_cpl_handler(CPL_ISCSI_HDR, do_rx_iscsi_hdr);
748 	t4_register_cpl_handler(CPL_ISCSI_DATA, do_rx_iscsi_data);
749 	t4_register_cpl_handler(CPL_RX_ISCSI_DDP, do_rx_iscsi_ddp);
750 
751 	rc = start_worker_threads();
752 	if (rc != 0)
753 		return (rc);
754 
755 	rc = t4_register_uld(&cxgbei_uld_info);
756 	if (rc != 0) {
757 		stop_worker_threads();
758 		return (rc);
759 	}
760 
761 	t4_iterate(cxgbei_activate_all, NULL);
762 
763 	return (rc);
764 }
765 
/*
 * Module unload: deactivate iSCSI everywhere, unregister the ULD, stop the
 * worker threads, and remove the CPL handlers.  Returns EBUSY (and leaves
 * everything in place) if the ULD is still in use.
 */
static int
cxgbei_mod_unload(void)
{

	t4_iterate(cxgbei_deactivate_all, NULL);

	if (t4_unregister_uld(&cxgbei_uld_info) == EBUSY)
		return (EBUSY);

	stop_worker_threads();

	/* Uninstall the handlers registered in cxgbei_mod_load(). */
	t4_register_cpl_handler(CPL_ISCSI_HDR, NULL);
	t4_register_cpl_handler(CPL_ISCSI_DATA, NULL);
	t4_register_cpl_handler(CPL_RX_ISCSI_DDP, NULL);

	return (0);
}
783 #endif
784 
/*
 * Module event handler.  Load order: cxgbei core first, then the ICL glue;
 * unload is the reverse.  Without TCP_OFFLOAD the module refuses to load.
 */
static int
cxgbei_modevent(module_t mod, int cmd, void *arg)
{
	int rc = 0;

#ifdef TCP_OFFLOAD
	switch (cmd) {
	case MOD_LOAD:
		rc = cxgbei_mod_load();
		if (rc == 0)
			rc = icl_cxgbei_mod_load();
		break;

	case MOD_UNLOAD:
		rc = icl_cxgbei_mod_unload();
		if (rc == 0)
			rc = cxgbei_mod_unload();
		break;

	default:
		rc = EINVAL;
	}
#else
	printf("cxgbei: compiled without TCP_OFFLOAD support.\n");
	rc = EOPNOTSUPP;
#endif

	return (rc);
}
814 
/* Module glue: name, event handler, no extra argument. */
static moduledata_t cxgbei_mod = {
	"cxgbei",
	cxgbei_modevent,
	NULL,
};

MODULE_VERSION(cxgbei, 1);
DECLARE_MODULE(cxgbei, cxgbei_mod, SI_SUB_EXEC, SI_ORDER_ANY);
/* Requires the TOE driver, the base cxgbe nexus, and the ICL framework. */
MODULE_DEPEND(cxgbei, t4_tom, 1, 1, 1);
MODULE_DEPEND(cxgbei, cxgbe, 1, 1, 1);
MODULE_DEPEND(cxgbei, icl, 1, 1, 1);
826