/*-
 * Copyright (c) 2012 Chelsio Communications, Inc.
 * All rights reserved.
 *
 * Chelsio T5xx iSCSI driver
 *
 * Written by: Sreenivasa Honnur <shonnur@chelsio.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"

#include <sys/types.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/module.h>
#include <sys/systm.h>

#ifdef TCP_OFFLOAD
#include <sys/errno.h>
#include <sys/kthread.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/mbuf.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/condvar.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/toecore.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>

#include <cam/scsi/scsi_all.h>
#include <cam/scsi/scsi_da.h>
#include <cam/ctl/ctl_io.h>
#include <cam/ctl/ctl.h>
#include <cam/ctl/ctl_backend.h>
#include <cam/ctl/ctl_error.h>
#include <cam/ctl/ctl_frontend.h>
#include <cam/ctl/ctl_debug.h>
#include <cam/ctl/ctl_ha.h>
#include <cam/ctl/ctl_ioctl.h>

#include <dev/iscsi/icl.h>
#include <dev/iscsi/iscsi_proto.h>
#include <dev/iscsi/iscsi_ioctl.h>
#include <dev/iscsi/iscsi.h>
#include <cam/ctl/ctl_frontend_iscsi.h>

#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_xpt.h>
#include <cam/cam_debug.h>
#include <cam/cam_sim.h>
#include <cam/cam_xpt_sim.h>
#include <cam/cam_xpt_periph.h>
#include <cam/cam_periph.h>
#include <cam/cam_compat.h>
#include <cam/scsi/scsi_message.h>

#include "common/common.h"
#include "common/t4_msg.h"
#include "common/t4_regs.h"	/* for PCIE_MEM_ACCESS */
#include "tom/t4_tom.h"
#include "cxgbei.h"

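/*
 * A small pool of worker threads (created in start_worker_threads(),
 * at most 32) hands received PDUs to ICL.  All of the threads live in
 * the single "cxgbei" kernel process.
 */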
static int worker_thread_count;
static struct cxgbei_worker_thread_softc *cwt_softc;
static struct proc *cxgbei_proc;

static void
read_pdu_limits(struct adapter *sc, uint32_t *max_tx_data_len,
    uint32_t *max_rx_data_len, struct ppod_region *pr)
{
	uint32_t tx_len, rx_len, r, v;

	rx_len = t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE);
	tx_len = t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE);

	r = t4_read_reg(sc, A_TP_PARA_REG2);
	rx_len = min(rx_len, G_MAXRXDATA(r));
	tx_len = min(tx_len, G_MAXRXDATA(r));

	r = t4_read_reg(sc, A_TP_PARA_REG7);
	v = min(G_PMMAXXFERLEN0(r), G_PMMAXXFERLEN1(r));
	rx_len = min(rx_len, v);
	tx_len = min(tx_len, v);

	/*
	 * AHS is not supported by the kernel so we'll not account for
	 * it either in our PDU len -> data segment len conversions.
	 */
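	/*
	 * E.g., a 16384-byte PDU limit leaves 16384 - 48 (BHS) -
	 * 4 (header digest) - 4 (data digest) = 16328 bytes of data
	 * segment.
	 */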
	rx_len -= ISCSI_BHS_SIZE + ISCSI_HEADER_DIGEST_SIZE +
	    ISCSI_DATA_DIGEST_SIZE;
	tx_len -= ISCSI_BHS_SIZE + ISCSI_HEADER_DIGEST_SIZE +
	    ISCSI_DATA_DIGEST_SIZE;

	/*
	 * DDP can place only 4 pages for a single PDU.  A single
	 * request might use larger pages than the smallest page size,
	 * but that cannot be guaranteed.  Assume the smallest DDP
	 * page size for this limit.
	 */
	rx_len = min(rx_len, 4 * (1U << pr->pr_page_shift[0]));

	if (chip_id(sc) == CHELSIO_T5) {
		rx_len = rounddown2(rx_len, 512);
		tx_len = rounddown2(tx_len, 512);
	}

	*max_tx_data_len = tx_len;
	*max_rx_data_len = rx_len;
}

/*
 * Initialize the software state of the iSCSI ULP driver.
 *
 * ENXIO means firmware didn't set up something that it was supposed to.
 */
static int
cxgbei_init(struct adapter *sc, struct cxgbei_data *ci)
{
	struct sysctl_oid *oid;
	struct sysctl_oid_list *children;
	struct ppod_region *pr;
	uint32_t r;
	int rc;

	MPASS(sc->vres.iscsi.size > 0);
	MPASS(ci != NULL);

	pr = &ci->pr;
	r = t4_read_reg(sc, A_ULP_RX_ISCSI_PSZ);
	rc = t4_init_ppod_region(pr, &sc->vres.iscsi, r, "iSCSI page pods");
	if (rc != 0) {
		device_printf(sc->dev,
		    "%s: failed to initialize the iSCSI page pod region: %u.\n",
		    __func__, rc);
		return (rc);
	}

	r = t4_read_reg(sc, A_ULP_RX_ISCSI_TAGMASK);
	r &= V_ISCSITAGMASK(M_ISCSITAGMASK);
	if (r != pr->pr_tag_mask) {
		/*
		 * Recent firmware is supposed to set up the iSCSI
		 * tagmask, but we'll do it ourselves if the computed
		 * value doesn't match what's in the register.
		 */
		device_printf(sc->dev,
		    "tagmask 0x%08x does not match computed mask 0x%08x.\n", r,
		    pr->pr_tag_mask);
		t4_set_reg_field(sc, A_ULP_RX_ISCSI_TAGMASK,
		    V_ISCSITAGMASK(M_ISCSITAGMASK), pr->pr_tag_mask);
	}

	read_pdu_limits(sc, &ci->max_tx_data_len, &ci->max_rx_data_len, pr);

	sysctl_ctx_init(&ci->ctx);
	oid = device_get_sysctl_tree(sc->dev);	/* dev.t5nex.X */
	children = SYSCTL_CHILDREN(oid);

	oid = SYSCTL_ADD_NODE(&ci->ctx, children, OID_AUTO, "iscsi",
	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "iSCSI ULP settings");
	children = SYSCTL_CHILDREN(oid);

	ci->ddp_threshold = 2048;
	SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "ddp_threshold",
	    CTLFLAG_RW, &ci->ddp_threshold, 0, "Rx zero copy threshold");

	return (0);
}

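/*
 * Start of a new rx PDU: CPL_ISCSI_HDR carries the BHS (and header
 * digest, if any).  The data segment, if any, follows in
 * CPL_ISCSI_DATA and the final status in CPL_RX_ISCSI_DDP.  When the
 * chip is generating iSCSI completions (T6), the header arrives in
 * CPL_RX_ISCSI_CMP instead (see do_rx_iscsi_cmp()).
 */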
static int
do_rx_iscsi_hdr(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	struct cpl_iscsi_hdr *cpl = mtod(m, struct cpl_iscsi_hdr *);
	u_int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct icl_pdu *ip;
	struct icl_cxgbei_pdu *icp;
	uint16_t len_ddp = be16toh(cpl->pdu_len_ddp);
	uint16_t len = be16toh(cpl->len);

	M_ASSERTPKTHDR(m);
	MPASS(m->m_pkthdr.len == len + sizeof(*cpl));

	ip = icl_cxgbei_new_pdu(M_NOWAIT);
	if (ip == NULL)
		CXGBE_UNIMPLEMENTED("PDU allocation failure");
	m_copydata(m, sizeof(*cpl), ISCSI_BHS_SIZE, (caddr_t)ip->ip_bhs);
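	/*
	 * The data segment length is whatever is left of the PDU
	 * length reported in pdu_len_ddp after the bytes that arrived
	 * with this CPL (the BHS and any header digest).
	 */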
	ip->ip_data_len = G_ISCSI_PDU_LEN(len_ddp) - len;
	icp = ip_to_icp(ip);
	icp->icp_seq = ntohl(cpl->seq);
	icp->icp_flags = ICPF_RX_HDR;

	/* This is the start of a new PDU.  There should be no old state. */
	MPASS(toep->ulpcb2 == NULL);
	toep->ulpcb2 = icp;

#if 0
	CTR5(KTR_CXGBE, "%s: tid %u, cpl->len %u, pdu_len_ddp 0x%04x, icp %p",
	    __func__, tid, len, len_ddp, icp);
#endif

	m_freem(m);
	return (0);
}

static int
do_rx_iscsi_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	struct cpl_iscsi_data *cpl = mtod(m, struct cpl_iscsi_data *);
	u_int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
	struct icl_pdu *ip;

	M_ASSERTPKTHDR(m);
	MPASS(m->m_pkthdr.len == be16toh(cpl->len) + sizeof(*cpl));

	if (icp == NULL) {
		/*
		 * T6 completion enabled, start of a new PDU.  Header
		 * will come in completion CPL.
		 */
		ip = icl_cxgbei_new_pdu(M_NOWAIT);
		if (ip == NULL)
			CXGBE_UNIMPLEMENTED("PDU allocation failure");
		icp = ip_to_icp(ip);
	} else {
		/* T5 mode, header is already received. */
		MPASS(icp->icp_flags == ICPF_RX_HDR);
		MPASS(icp->ip.ip_data_mbuf == NULL);
		MPASS(icp->ip.ip_data_len == m->m_pkthdr.len - sizeof(*cpl));
	}

	/* Trim the cpl header from mbuf. */
	m_adj(m, sizeof(*cpl));

	icp->icp_flags |= ICPF_RX_FLBUF;
	icp->ip.ip_data_mbuf = m;
	toep->ofld_rxq->rx_iscsi_fl_pdus++;
	toep->ofld_rxq->rx_iscsi_fl_octets += m->m_pkthdr.len;

	/*
	 * For T6, save the icp for further processing in the
	 * completion handler.
	 */
	if (icp->icp_flags == ICPF_RX_FLBUF) {
		MPASS(toep->ulpcb2 == NULL);
		toep->ulpcb2 = icp;
	}

#if 0
	CTR4(KTR_CXGBE, "%s: tid %u, cpl->len %u, icp %p", __func__, tid,
	    be16toh(cpl->len), icp);
#endif

	return (0);
}

static int
do_rx_iscsi_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1);
	u_int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
	struct socket *so;
	struct sockbuf *sb;
	struct tcpcb *tp;
	struct icl_cxgbei_conn *icc;
	struct icl_conn *ic;
	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
	struct icl_pdu *ip;
	u_int pdu_len, val;
	struct epoch_tracker et;

	MPASS(m == NULL);

	/* Must already be assembling a PDU. */
	MPASS(icp != NULL);
	MPASS(icp->icp_flags & ICPF_RX_HDR);	/* Data is optional. */
	MPASS((icp->icp_flags & ICPF_RX_STATUS) == 0);

	pdu_len = be16toh(cpl->len);	/* includes everything. */
	val = be32toh(cpl->ddpvld);

#if 0
	CTR5(KTR_CXGBE,
	    "%s: tid %u, cpl->len %u, ddpvld 0x%08x, icp_flags 0x%08x",
	    __func__, tid, pdu_len, val, icp->icp_flags);
#endif

	icp->icp_flags |= ICPF_RX_STATUS;
	ip = &icp->ip;
	if (val & F_DDP_PADDING_ERR)
		icp->icp_flags |= ICPF_PAD_ERR;
	if (val & F_DDP_HDRCRC_ERR)
		icp->icp_flags |= ICPF_HCRC_ERR;
	if (val & F_DDP_DATACRC_ERR)
		icp->icp_flags |= ICPF_DCRC_ERR;
	if (val & F_DDP_PDU && ip->ip_data_mbuf == NULL) {
		MPASS((icp->icp_flags & ICPF_RX_FLBUF) == 0);
		MPASS(ip->ip_data_len > 0);
		icp->icp_flags |= ICPF_RX_DDP;
		toep->ofld_rxq->rx_iscsi_ddp_pdus++;
		toep->ofld_rxq->rx_iscsi_ddp_octets += ip->ip_data_len;
	}

	INP_WLOCK(inp);
	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
		    __func__, tid, pdu_len, inp->inp_flags);
		INP_WUNLOCK(inp);
		icl_cxgbei_conn_pdu_free(NULL, ip);
		toep->ulpcb2 = NULL;
		return (0);
	}

	/*
	 * T6+ does not report data PDUs received via DDP without F
	 * set.  This can result in gaps in the TCP sequence space.
	 */
	tp = intotcpcb(inp);
	MPASS(chip_id(sc) >= CHELSIO_T6 || icp->icp_seq == tp->rcv_nxt);
	tp->rcv_nxt = icp->icp_seq + pdu_len;
	tp->t_rcvtime = ticks;

	/*
	 * Don't update the window size or return credits since RX
	 * flow control is disabled.
	 */

	so = inp->inp_socket;
	sb = &so->so_rcv;
	SOCKBUF_LOCK(sb);

	icc = toep->ulpcb;
	if (__predict_false(icc == NULL || sb->sb_state & SBS_CANTRCVMORE)) {
		CTR5(KTR_CXGBE,
		    "%s: tid %u, excess rx (%d bytes), icc %p, sb_state 0x%x",
		    __func__, tid, pdu_len, icc, sb->sb_state);
		SOCKBUF_UNLOCK(sb);
		INP_WUNLOCK(inp);

		CURVNET_SET(so->so_vnet);
		NET_EPOCH_ENTER(et);
		INP_WLOCK(inp);
		tp = tcp_drop(tp, ECONNRESET);
		if (tp != NULL)
			INP_WUNLOCK(inp);
		NET_EPOCH_EXIT(et);
		CURVNET_RESTORE();

		icl_cxgbei_conn_pdu_free(NULL, ip);
		toep->ulpcb2 = NULL;
		return (0);
	}
	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
	ic = &icc->ic;
	icl_cxgbei_new_pdu_set_conn(ip, ic);

	MPASS(m == NULL); /* was unused, we'll use it now. */
	m = sbcut_locked(sb, sbused(sb)); /* XXXNP: toep->sb_cc accounting? */
	if (__predict_false(m != NULL)) {
		int len = m_length(m, NULL);

		/*
		 * PDUs were received before the tid transitioned to ULP mode.
		 * Convert them to icl_cxgbei_pdus and send them to ICL before
		 * the PDU in icp/ip.
		 */
		CTR3(KTR_CXGBE, "%s: tid %u, %u bytes in so_rcv", __func__, tid,
		    len);

		/* XXXNP: needs to be rewritten. */
		if (len == sizeof(struct iscsi_bhs) ||
		    len == 4 + sizeof(struct iscsi_bhs)) {
			struct icl_cxgbei_pdu *icp0;
			struct icl_pdu *ip0;

			ip0 = icl_cxgbei_new_pdu(M_NOWAIT);
			if (ip0 == NULL)
				CXGBE_UNIMPLEMENTED("PDU allocation failure");
			icl_cxgbei_new_pdu_set_conn(ip0, ic);
			icp0 = ip_to_icp(ip0);
			icp0->icp_seq = 0; /* XXX */
			icp0->icp_flags = ICPF_RX_HDR | ICPF_RX_STATUS;
			m_copydata(m, 0, sizeof(struct iscsi_bhs),
			    (void *)ip0->ip_bhs);
			STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip0, ip_next);
		}
		m_freem(m);
	}

	STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next);
	if ((icc->rx_flags & RXF_ACTIVE) == 0) {
		struct cxgbei_worker_thread_softc *cwt = &cwt_softc[icc->cwt];

		mtx_lock(&cwt->cwt_lock);
		icc->rx_flags |= RXF_ACTIVE;
		TAILQ_INSERT_TAIL(&cwt->rx_head, icc, rx_link);
		if (cwt->cwt_state == CWT_SLEEPING) {
			cwt->cwt_state = CWT_RUNNING;
			cv_signal(&cwt->cwt_cv);
		}
		mtx_unlock(&cwt->cwt_lock);
	}
	SOCKBUF_UNLOCK(sb);
	INP_WUNLOCK(inp);

	toep->ulpcb2 = NULL;

	return (0);
}

static int
do_rx_iscsi_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct epoch_tracker et;
	struct adapter *sc = iq->adapter;
	struct cpl_rx_iscsi_cmp *cpl = mtod(m, struct cpl_rx_iscsi_cmp *);
	u_int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
	struct icl_pdu *ip;
	struct cxgbei_cmp *cmp;
	struct inpcb *inp = toep->inp;
#ifdef INVARIANTS
	uint16_t len = be16toh(cpl->len);
#endif
	struct socket *so;
	struct sockbuf *sb;
	struct tcpcb *tp;
	struct icl_cxgbei_conn *icc;
	struct icl_conn *ic;
	struct iscsi_bhs_data_out *bhsdo;
	u_int val = be32toh(cpl->ddpvld);
	u_int npdus, pdu_len, data_digest_len, hdr_digest_len;
	uint32_t prev_seg_len;

	M_ASSERTPKTHDR(m);
	MPASS(m->m_pkthdr.len == len + sizeof(*cpl));

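	/*
	 * If the data segment did not arrive via DDP (F_DDP_PDU is
	 * clear), it was delivered by CPL_ISCSI_DATA and a partially
	 * assembled PDU must already be stashed in ulpcb2.
	 */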
	if ((val & F_DDP_PDU) == 0) {
		MPASS(icp != NULL);
		MPASS((icp->icp_flags & ICPF_RX_STATUS) == 0);
		ip = &icp->ip;
	}

	if (icp == NULL) {
		/* T6 completion enabled, start of a new PDU. */
		ip = icl_cxgbei_new_pdu(M_NOWAIT);
		if (ip == NULL)
			CXGBE_UNIMPLEMENTED("PDU allocation failure");
		icp = ip_to_icp(ip);
	}
	pdu_len = G_ISCSI_PDU_LEN(be16toh(cpl->pdu_len_ddp));

#if 0
	CTR5(KTR_CXGBE,
	    "%s: tid %u, cpl->len %u, ddpvld 0x%08x, icp %p",
	    __func__, tid, pdu_len, val, icp);
#endif

	/* Copy header */
	m_copydata(m, sizeof(*cpl), ISCSI_BHS_SIZE, (caddr_t)ip->ip_bhs);
	bhsdo = (struct iscsi_bhs_data_out *)ip->ip_bhs;
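	/* DataSegmentLength is a 24-bit big-endian field in the BHS. */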
	ip->ip_data_len = bhsdo->bhsdo_data_segment_len[0] << 16 |
	    bhsdo->bhsdo_data_segment_len[1] << 8 |
	    bhsdo->bhsdo_data_segment_len[2];
	icp->icp_seq = ntohl(cpl->seq);
	icp->icp_flags |= ICPF_RX_HDR;
	icp->icp_flags |= ICPF_RX_STATUS;

	if (val & F_DDP_PADDING_ERR)
		icp->icp_flags |= ICPF_PAD_ERR;
	if (val & F_DDP_HDRCRC_ERR)
		icp->icp_flags |= ICPF_HCRC_ERR;
	if (val & F_DDP_DATACRC_ERR)
		icp->icp_flags |= ICPF_DCRC_ERR;

	INP_WLOCK(inp);
	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
		    __func__, tid, pdu_len, inp->inp_flags);
		INP_WUNLOCK(inp);
		icl_cxgbei_conn_pdu_free(NULL, ip);
		toep->ulpcb2 = NULL;
		m_freem(m);
		return (0);
	}

	tp = intotcpcb(inp);

	/*
	 * If icc is NULL, the connection is being closed in
	 * icl_cxgbei_conn_close(), just drop this data.
	 */
	icc = toep->ulpcb;
	if (__predict_false(icc == NULL)) {
		CTR4(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes), icc %p",
		    __func__, tid, pdu_len, icc);

		/*
		 * Update rcv_nxt so the sequence number of the FIN
		 * doesn't appear wrong.
		 */
		tp->rcv_nxt = icp->icp_seq + pdu_len;
		tp->t_rcvtime = ticks;
		INP_WUNLOCK(inp);

		icl_cxgbei_conn_pdu_free(NULL, ip);
		toep->ulpcb2 = NULL;
		m_freem(m);
		return (0);
	}

	data_digest_len = (icc->ulp_submode & ULP_CRC_DATA) ?
	    ISCSI_DATA_DIGEST_SIZE : 0;
	hdr_digest_len = (icc->ulp_submode & ULP_CRC_HEADER) ?
	    ISCSI_HEADER_DIGEST_SIZE : 0;
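	/*
	 * The PDU length reported by the chip covers the bytes that
	 * arrived with this CPL (len: the BHS plus any header digest),
	 * the padded data segment, and the data digest.
	 */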
	MPASS(roundup2(ip->ip_data_len, 4) == pdu_len - len - data_digest_len);

	if (val & F_DDP_PDU && ip->ip_data_mbuf == NULL) {
		MPASS((icp->icp_flags & ICPF_RX_FLBUF) == 0);
		MPASS(ip->ip_data_len > 0);
		icp->icp_flags |= ICPF_RX_DDP;
		bhsdo = (struct iscsi_bhs_data_out *)ip->ip_bhs;

		switch (ip->ip_bhs->bhs_opcode & ~ISCSI_BHS_OPCODE_IMMEDIATE) {
		case ISCSI_BHS_OPCODE_SCSI_DATA_IN:
			cmp = cxgbei_find_cmp(icc,
			    be32toh(bhsdo->bhsdo_initiator_task_tag));
			break;
		case ISCSI_BHS_OPCODE_SCSI_DATA_OUT:
			cmp = cxgbei_find_cmp(icc,
			    be32toh(bhsdo->bhsdo_target_transfer_tag));
			break;
		default:
			__assert_unreachable();
		}
		MPASS(cmp != NULL);

		/*
		 * The difference between the end of the last burst
		 * and the offset of the last PDU in this burst is
		 * the additional data received via DDP.
		 */
		prev_seg_len = be32toh(bhsdo->bhsdo_buffer_offset) -
		    cmp->next_buffer_offset;

		if (prev_seg_len != 0) {
			uint32_t orig_datasn;

			/*
			 * Return a "large" PDU representing the burst
			 * of PDUs.  Adjust the offset and length of
			 * this PDU to represent the entire burst.
			 */
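			/*
			 * E.g., if last_datasn is 3 and this PDU
			 * carries DataSN 7, the burst holds npdus =
			 * 7 - 3 = 4 PDUs; the rewritten header gets
			 * DataSN 4 and ip_additional_pdus is 3.
			 */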
			ip->ip_data_len += prev_seg_len;
			bhsdo->bhsdo_data_segment_len[2] = ip->ip_data_len;
			bhsdo->bhsdo_data_segment_len[1] = ip->ip_data_len >> 8;
			bhsdo->bhsdo_data_segment_len[0] = ip->ip_data_len >> 16;
			bhsdo->bhsdo_buffer_offset =
			    htobe32(cmp->next_buffer_offset);

			orig_datasn = be32toh(bhsdo->bhsdo_datasn);
			npdus = orig_datasn - cmp->last_datasn;
			bhsdo->bhsdo_datasn = htobe32(cmp->last_datasn + 1);
			cmp->last_datasn = orig_datasn;
			ip->ip_additional_pdus = npdus - 1;
		} else {
			MPASS(be32toh(bhsdo->bhsdo_datasn) ==
			    cmp->last_datasn + 1);
			npdus = 1;
			cmp->last_datasn = be32toh(bhsdo->bhsdo_datasn);
		}

		cmp->next_buffer_offset += ip->ip_data_len;
		toep->ofld_rxq->rx_iscsi_ddp_pdus += npdus;
		toep->ofld_rxq->rx_iscsi_ddp_octets += ip->ip_data_len;
	} else {
		MPASS(icp->icp_flags & (ICPF_RX_FLBUF));
		MPASS(ip->ip_data_len == ip->ip_data_mbuf->m_pkthdr.len);
	}

	tp->rcv_nxt = icp->icp_seq + pdu_len;
	tp->t_rcvtime = ticks;

	/*
	 * Don't update the window size or return credits since RX
	 * flow control is disabled.
	 */

	so = inp->inp_socket;
	sb = &so->so_rcv;
	SOCKBUF_LOCK(sb);
	if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
		CTR5(KTR_CXGBE,
		    "%s: tid %u, excess rx (%d bytes), icc %p, sb_state 0x%x",
		    __func__, tid, pdu_len, icc, sb->sb_state);
		SOCKBUF_UNLOCK(sb);
		INP_WUNLOCK(inp);

		CURVNET_SET(so->so_vnet);
		NET_EPOCH_ENTER(et);
		INP_WLOCK(inp);
		tp = tcp_drop(tp, ECONNRESET);
		if (tp != NULL)
			INP_WUNLOCK(inp);
		NET_EPOCH_EXIT(et);
		CURVNET_RESTORE();

		icl_cxgbei_conn_pdu_free(NULL, ip);
		toep->ulpcb2 = NULL;
		m_freem(m);
		return (0);
	}
	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
	ic = &icc->ic;
	icl_cxgbei_new_pdu_set_conn(ip, ic);

	/* Enqueue the PDU to the received pdus queue. */
	STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next);
	if ((icc->rx_flags & RXF_ACTIVE) == 0) {
		struct cxgbei_worker_thread_softc *cwt = &cwt_softc[icc->cwt];

		mtx_lock(&cwt->cwt_lock);
		icc->rx_flags |= RXF_ACTIVE;
		TAILQ_INSERT_TAIL(&cwt->rx_head, icc, rx_link);
		if (cwt->cwt_state == CWT_SLEEPING) {
			cwt->cwt_state = CWT_RUNNING;
			cv_signal(&cwt->cwt_cv);
		}
		mtx_unlock(&cwt->cwt_lock);
	}
	SOCKBUF_UNLOCK(sb);
	INP_WUNLOCK(inp);

	toep->ulpcb2 = NULL;
	m_freem(m);

	return (0);
}

static int
cxgbei_activate(struct adapter *sc)
{
	struct cxgbei_data *ci;
	int rc;

	ASSERT_SYNCHRONIZED_OP(sc);

	if (uld_active(sc, ULD_ISCSI)) {
		KASSERT(0, ("%s: iSCSI offload already enabled on adapter %p",
		    __func__, sc));
		return (0);
	}

	if (sc->iscsicaps == 0 || sc->vres.iscsi.size == 0) {
		device_printf(sc->dev,
		    "not iSCSI offload capable, or capability disabled.\n");
		return (ENOSYS);
	}

	/*
	 * Per-adapter softc for iSCSI.  This is an M_WAITOK
	 * allocation and cannot fail.
	 */
	ci = malloc(sizeof(*ci), M_CXGBE, M_ZERO | M_WAITOK);

	rc = cxgbei_init(sc, ci);
	if (rc != 0) {
		free(ci, M_CXGBE);
		return (rc);
	}

	sc->iscsi_ulp_softc = ci;

	return (0);
}

static int
cxgbei_deactivate(struct adapter *sc)
{
	struct cxgbei_data *ci = sc->iscsi_ulp_softc;

	ASSERT_SYNCHRONIZED_OP(sc);

	if (ci != NULL) {
		sysctl_ctx_free(&ci->ctx);
		t4_free_ppod_region(&ci->pr);
		free(ci, M_CXGBE);
		sc->iscsi_ulp_softc = NULL;
	}

	return (0);
}

static void
cxgbei_activate_all(struct adapter *sc, void *arg __unused)
{

	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isact") != 0)
		return;

	/* Activate iSCSI if any port on this adapter has IFCAP_TOE enabled. */
	if (sc->offload_map && !uld_active(sc, ULD_ISCSI))
		(void) t4_activate_uld(sc, ULD_ISCSI);

	end_synchronized_op(sc, 0);
}

static void
cxgbei_deactivate_all(struct adapter *sc, void *arg __unused)
{

	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isdea") != 0)
		return;

	if (uld_active(sc, ULD_ISCSI))
		(void) t4_deactivate_uld(sc, ULD_ISCSI);

	end_synchronized_op(sc, 0);
}

static struct uld_info cxgbei_uld_info = {
	.uld_id = ULD_ISCSI,
	.activate = cxgbei_activate,
	.deactivate = cxgbei_deactivate,
};

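/*
 * Main loop of an rx worker thread: pull connections with pending rx
 * PDUs off this thread's queue, hand their PDUs to ICL, and sleep
 * until more work arrives or the thread is told to stop.
 */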
static void
cwt_main(void *arg)
{
	struct cxgbei_worker_thread_softc *cwt = arg;
	struct icl_cxgbei_conn *icc = NULL;
	struct icl_conn *ic;
	struct icl_pdu *ip;
	struct sockbuf *sb;
	STAILQ_HEAD(, icl_pdu) rx_pdus = STAILQ_HEAD_INITIALIZER(rx_pdus);

	MPASS(cwt != NULL);

	mtx_lock(&cwt->cwt_lock);
	MPASS(cwt->cwt_state == 0);
	cwt->cwt_state = CWT_RUNNING;
	cv_signal(&cwt->cwt_cv);

	while (__predict_true(cwt->cwt_state != CWT_STOP)) {
		cwt->cwt_state = CWT_RUNNING;
		while ((icc = TAILQ_FIRST(&cwt->rx_head)) != NULL) {
			TAILQ_REMOVE(&cwt->rx_head, icc, rx_link);
			mtx_unlock(&cwt->cwt_lock);

			ic = &icc->ic;
			sb = &ic->ic_socket->so_rcv;

			SOCKBUF_LOCK(sb);
			MPASS(icc->rx_flags & RXF_ACTIVE);
			if (__predict_true(!(sb->sb_state & SBS_CANTRCVMORE))) {
				MPASS(STAILQ_EMPTY(&rx_pdus));
				STAILQ_SWAP(&icc->rcvd_pdus, &rx_pdus, icl_pdu);
				SOCKBUF_UNLOCK(sb);

				/* Hand over PDUs to ICL. */
				while ((ip = STAILQ_FIRST(&rx_pdus)) != NULL) {
					STAILQ_REMOVE_HEAD(&rx_pdus, ip_next);
					ic->ic_receive(ip);
				}

				SOCKBUF_LOCK(sb);
				MPASS(STAILQ_EMPTY(&rx_pdus));
			}
			MPASS(icc->rx_flags & RXF_ACTIVE);
			if (STAILQ_EMPTY(&icc->rcvd_pdus) ||
			    __predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
				icc->rx_flags &= ~RXF_ACTIVE;
			} else {
				/*
				 * More PDUs were received while we were busy
				 * handing over the previous batch to ICL.
				 * Re-add this connection to the end of the
				 * queue.
				 */
				mtx_lock(&cwt->cwt_lock);
				TAILQ_INSERT_TAIL(&cwt->rx_head, icc,
				    rx_link);
				mtx_unlock(&cwt->cwt_lock);
			}
			SOCKBUF_UNLOCK(sb);

			mtx_lock(&cwt->cwt_lock);
		}

		/* Inner loop doesn't check for CWT_STOP, do that first. */
		if (__predict_false(cwt->cwt_state == CWT_STOP))
			break;
		cwt->cwt_state = CWT_SLEEPING;
		cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
	}

	MPASS(TAILQ_FIRST(&cwt->rx_head) == NULL);
	mtx_assert(&cwt->cwt_lock, MA_OWNED);
	cwt->cwt_state = CWT_STOPPED;
	cv_signal(&cwt->cwt_cv);
	mtx_unlock(&cwt->cwt_lock);
	kthread_exit();
}

static int
start_worker_threads(void)
{
	int i, rc;
	struct cxgbei_worker_thread_softc *cwt;

	worker_thread_count = min(mp_ncpus, 32);
	cwt_softc = malloc(worker_thread_count * sizeof(*cwt), M_CXGBE,
	    M_WAITOK | M_ZERO);

	MPASS(cxgbei_proc == NULL);
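	/*
	 * All worker threads run in a single "cxgbei" kernel process:
	 * kproc_kthread_add() creates the process on the first call
	 * and attaches an additional thread to it on each later call.
	 */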
	for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) {
		mtx_init(&cwt->cwt_lock, "cwt lock", NULL, MTX_DEF);
		cv_init(&cwt->cwt_cv, "cwt cv");
		TAILQ_INIT(&cwt->rx_head);
		rc = kproc_kthread_add(cwt_main, cwt, &cxgbei_proc, NULL, 0, 0,
		    "cxgbei", "%d", i);
		if (rc != 0) {
			printf("cxgbei: failed to start thread #%d/%d (%d)\n",
			    i + 1, worker_thread_count, rc);
			mtx_destroy(&cwt->cwt_lock);
			cv_destroy(&cwt->cwt_cv);
			bzero(cwt, sizeof(*cwt));
			if (i == 0) {
				free(cwt_softc, M_CXGBE);
				worker_thread_count = 0;

				return (rc);
			}

			/* Not fatal, carry on with fewer threads. */
			worker_thread_count = i;
			rc = 0;
			break;
		}

		/* Wait for thread to start before moving on to the next one. */
		mtx_lock(&cwt->cwt_lock);
		while (cwt->cwt_state == 0)
			cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
		mtx_unlock(&cwt->cwt_lock);
	}

	MPASS(cwt_softc != NULL);
	MPASS(worker_thread_count > 0);
	return (0);
}

static void
stop_worker_threads(void)
{
	int i;
	struct cxgbei_worker_thread_softc *cwt;

	MPASS(worker_thread_count >= 0);

	for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) {
		mtx_lock(&cwt->cwt_lock);
		MPASS(cwt->cwt_state == CWT_RUNNING ||
		    cwt->cwt_state == CWT_SLEEPING);
		cwt->cwt_state = CWT_STOP;
		cv_signal(&cwt->cwt_cv);
		do {
			cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
		} while (cwt->cwt_state != CWT_STOPPED);
		mtx_unlock(&cwt->cwt_lock);
		mtx_destroy(&cwt->cwt_lock);
		cv_destroy(&cwt->cwt_cv);
	}
	free(cwt_softc, M_CXGBE);
}

/*
 * Select a worker thread for a connection.  If the pool is large
 * enough to give each offload rx queue its own thread (n > 0), pick a
 * random thread from this port's slice of the pool; otherwise pick
 * any thread at random.
 */
u_int
cxgbei_select_worker_thread(struct icl_cxgbei_conn *icc)
{
	struct adapter *sc = icc->sc;
	struct toepcb *toep = icc->toep;
	u_int i, n;

	n = worker_thread_count / sc->sge.nofldrxq;
	if (n > 0)
		i = toep->vi->pi->port_id * n + arc4random() % n;
	else
		i = arc4random() % worker_thread_count;

	CTR3(KTR_CXGBE, "%s: tid %u, cwt %u", __func__, toep->tid, i);

	return (i);
}

static int
cxgbei_mod_load(void)
{
	int rc;

	t4_register_cpl_handler(CPL_ISCSI_HDR, do_rx_iscsi_hdr);
	t4_register_cpl_handler(CPL_ISCSI_DATA, do_rx_iscsi_data);
	t4_register_cpl_handler(CPL_RX_ISCSI_DDP, do_rx_iscsi_ddp);
	t4_register_cpl_handler(CPL_RX_ISCSI_CMP, do_rx_iscsi_cmp);

	rc = start_worker_threads();
	if (rc != 0)
		return (rc);

	rc = t4_register_uld(&cxgbei_uld_info);
	if (rc != 0) {
		stop_worker_threads();
		return (rc);
	}

	t4_iterate(cxgbei_activate_all, NULL);

	return (rc);
}

static int
cxgbei_mod_unload(void)
{

	t4_iterate(cxgbei_deactivate_all, NULL);

	if (t4_unregister_uld(&cxgbei_uld_info) == EBUSY)
		return (EBUSY);

	stop_worker_threads();

	t4_register_cpl_handler(CPL_ISCSI_HDR, NULL);
	t4_register_cpl_handler(CPL_ISCSI_DATA, NULL);
	t4_register_cpl_handler(CPL_RX_ISCSI_DDP, NULL);
	t4_register_cpl_handler(CPL_RX_ISCSI_CMP, NULL);

	return (0);
}
#endif

static int
cxgbei_modevent(module_t mod, int cmd, void *arg)
{
	int rc = 0;

#ifdef TCP_OFFLOAD
	switch (cmd) {
	case MOD_LOAD:
		rc = cxgbei_mod_load();
		if (rc == 0)
			rc = icl_cxgbei_mod_load();
		break;

	case MOD_UNLOAD:
		rc = icl_cxgbei_mod_unload();
		if (rc == 0)
			rc = cxgbei_mod_unload();
		break;

	default:
		rc = EINVAL;
	}
#else
	printf("cxgbei: compiled without TCP_OFFLOAD support.\n");
	rc = EOPNOTSUPP;
#endif

	return (rc);
}

static moduledata_t cxgbei_mod = {
	"cxgbei",
	cxgbei_modevent,
	NULL,
};

MODULE_VERSION(cxgbei, 1);
DECLARE_MODULE(cxgbei, cxgbei_mod, SI_SUB_EXEC, SI_ORDER_ANY);
MODULE_DEPEND(cxgbei, t4_tom, 1, 1, 1);
MODULE_DEPEND(cxgbei, cxgbe, 1, 1, 1);
MODULE_DEPEND(cxgbei, icl, 1, 1, 1);