xref: /freebsd/sys/dev/cxgbe/cxgbei/cxgbei.c (revision 963f5dc7a30624e95d72fb7f87b8892651164e46)
1 /*-
2  * Copyright (c) 2012 Chelsio Communications, Inc.
3  * All rights reserved.
4  *
5  * Chelsio T5xx iSCSI driver
6  *
7  * Written by: Sreenivasa Honnur <shonnur@chelsio.com>
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include "opt_inet.h"
35 #include "opt_inet6.h"
36 
37 #include <sys/types.h>
38 #include <sys/param.h>
39 #include <sys/kernel.h>
40 #include <sys/ktr.h>
41 #include <sys/module.h>
42 #include <sys/systm.h>
43 
44 #ifdef TCP_OFFLOAD
45 #include <sys/errno.h>
46 #include <sys/kthread.h>
47 #include <sys/smp.h>
48 #include <sys/socket.h>
49 #include <sys/socketvar.h>
50 #include <sys/mbuf.h>
51 #include <sys/lock.h>
52 #include <sys/mutex.h>
53 #include <sys/condvar.h>
54 
55 #include <netinet/in.h>
56 #include <netinet/in_pcb.h>
57 #include <netinet/toecore.h>
58 #include <netinet/tcp_var.h>
59 #include <netinet/tcp_fsm.h>
60 
61 #include <cam/scsi/scsi_all.h>
62 #include <cam/scsi/scsi_da.h>
63 #include <cam/ctl/ctl_io.h>
64 #include <cam/ctl/ctl.h>
65 #include <cam/ctl/ctl_backend.h>
66 #include <cam/ctl/ctl_error.h>
67 #include <cam/ctl/ctl_frontend.h>
68 #include <cam/ctl/ctl_debug.h>
69 #include <cam/ctl/ctl_ha.h>
70 #include <cam/ctl/ctl_ioctl.h>
71 
72 #include <dev/iscsi/icl.h>
73 #include <dev/iscsi/iscsi_proto.h>
74 #include <dev/iscsi/iscsi_ioctl.h>
75 #include <dev/iscsi/iscsi.h>
76 #include <cam/ctl/ctl_frontend_iscsi.h>
77 
78 #include <cam/cam.h>
79 #include <cam/cam_ccb.h>
80 #include <cam/cam_xpt.h>
81 #include <cam/cam_debug.h>
82 #include <cam/cam_sim.h>
83 #include <cam/cam_xpt_sim.h>
84 #include <cam/cam_xpt_periph.h>
85 #include <cam/cam_periph.h>
86 #include <cam/cam_compat.h>
87 #include <cam/scsi/scsi_message.h>
88 
89 #include "common/common.h"
90 #include "common/t4_msg.h"
91 #include "common/t4_regs.h"	/* for PCIE_MEM_ACCESS */
92 #include "tom/t4_tom.h"
93 #include "cxgbei.h"
94 
/*
 * State shared by the PDU worker threads; set up in start_worker_threads()
 * and torn down in stop_worker_threads().
 */
static struct proc *cxgbei_proc;	/* proc handed to kproc_kthread_add() */
static struct cxgbei_worker_thread_softc *cwt_softc; /* one per worker */
static int worker_thread_count;		/* number of entries in cwt_softc */
98 
99 static void
100 read_pdu_limits(struct adapter *sc, uint32_t *max_tx_data_len,
101     uint32_t *max_rx_data_len, struct ppod_region *pr)
102 {
103 	uint32_t tx_len, rx_len, r, v;
104 
105 	rx_len = t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE);
106 	tx_len = t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE);
107 
108 	r = t4_read_reg(sc, A_TP_PARA_REG2);
109 	rx_len = min(rx_len, G_MAXRXDATA(r));
110 	tx_len = min(tx_len, G_MAXRXDATA(r));
111 
112 	r = t4_read_reg(sc, A_TP_PARA_REG7);
113 	v = min(G_PMMAXXFERLEN0(r), G_PMMAXXFERLEN1(r));
114 	rx_len = min(rx_len, v);
115 	tx_len = min(tx_len, v);
116 
117 	/*
118 	 * AHS is not supported by the kernel so we'll not account for
119 	 * it either in our PDU len -> data segment len conversions.
120 	 */
121 	rx_len -= ISCSI_BHS_SIZE + ISCSI_HEADER_DIGEST_SIZE +
122 	    ISCSI_DATA_DIGEST_SIZE;
123 	tx_len -= ISCSI_BHS_SIZE + ISCSI_HEADER_DIGEST_SIZE +
124 	    ISCSI_DATA_DIGEST_SIZE;
125 
126 	/*
127 	 * DDP can place only 4 pages for a single PDU.  A single
128 	 * request might use larger pages than the smallest page size,
129 	 * but that cannot be guaranteed.  Assume the smallest DDP
130 	 * page size for this limit.
131 	 */
132 	rx_len = min(rx_len, 4 * (1U << pr->pr_page_shift[0]));
133 
134 	if (chip_id(sc) == CHELSIO_T5) {
135 		tx_len = min(tx_len, 15360);
136 
137 		rx_len = rounddown2(rx_len, 512);
138 		tx_len = rounddown2(tx_len, 512);
139 	}
140 
141 	*max_tx_data_len = tx_len;
142 	*max_rx_data_len = rx_len;
143 }
144 
145 /*
146  * Initialize the software state of the iSCSI ULP driver.
147  *
148  * ENXIO means firmware didn't set up something that it was supposed to.
149  */
150 static int
151 cxgbei_init(struct adapter *sc, struct cxgbei_data *ci)
152 {
153 	struct sysctl_oid *oid;
154 	struct sysctl_oid_list *children;
155 	struct ppod_region *pr;
156 	uint32_t r;
157 	int rc;
158 
159 	MPASS(sc->vres.iscsi.size > 0);
160 	MPASS(ci != NULL);
161 
162 	pr = &ci->pr;
163 	r = t4_read_reg(sc, A_ULP_RX_ISCSI_PSZ);
164 	rc = t4_init_ppod_region(pr, &sc->vres.iscsi, r, "iSCSI page pods");
165 	if (rc != 0) {
166 		device_printf(sc->dev,
167 		    "%s: failed to initialize the iSCSI page pod region: %u.\n",
168 		    __func__, rc);
169 		return (rc);
170 	}
171 
172 	r = t4_read_reg(sc, A_ULP_RX_ISCSI_TAGMASK);
173 	r &= V_ISCSITAGMASK(M_ISCSITAGMASK);
174 	if (r != pr->pr_tag_mask) {
175 		/*
176 		 * Recent firmwares are supposed to set up the iSCSI tagmask
177 		 * but we'll do it ourselves it the computed value doesn't match
178 		 * what's in the register.
179 		 */
180 		device_printf(sc->dev,
181 		    "tagmask 0x%08x does not match computed mask 0x%08x.\n", r,
182 		    pr->pr_tag_mask);
183 		t4_set_reg_field(sc, A_ULP_RX_ISCSI_TAGMASK,
184 		    V_ISCSITAGMASK(M_ISCSITAGMASK), pr->pr_tag_mask);
185 	}
186 
187 	read_pdu_limits(sc, &ci->max_tx_data_len, &ci->max_rx_data_len, pr);
188 
189 	sysctl_ctx_init(&ci->ctx);
190 	oid = device_get_sysctl_tree(sc->dev);	/* dev.t5nex.X */
191 	children = SYSCTL_CHILDREN(oid);
192 
193 	oid = SYSCTL_ADD_NODE(&ci->ctx, children, OID_AUTO, "iscsi",
194 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "iSCSI ULP settings");
195 	children = SYSCTL_CHILDREN(oid);
196 
197 	ci->ddp_threshold = 2048;
198 	SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "ddp_threshold",
199 	    CTLFLAG_RW, &ci->ddp_threshold, 0, "Rx zero copy threshold");
200 
201 	SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "max_rx_data_len",
202 	    CTLFLAG_RD, &ci->max_rx_data_len, 0,
203 	    "Maximum receive data segment length");
204 	SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "max_tx_data_len",
205 	    CTLFLAG_RD, &ci->max_tx_data_len, 0,
206 	    "Maximum transmit data segment length");
207 
208 	return (0);
209 }
210 
211 static int
212 do_rx_iscsi_hdr(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
213 {
214 	struct adapter *sc = iq->adapter;
215 	struct cpl_iscsi_hdr *cpl = mtod(m, struct cpl_iscsi_hdr *);
216 	u_int tid = GET_TID(cpl);
217 	struct toepcb *toep = lookup_tid(sc, tid);
218 	struct icl_pdu *ip;
219 	struct icl_cxgbei_pdu *icp;
220 	uint16_t len_ddp = be16toh(cpl->pdu_len_ddp);
221 	uint16_t len = be16toh(cpl->len);
222 
223 	M_ASSERTPKTHDR(m);
224 	MPASS(m->m_pkthdr.len == len + sizeof(*cpl));
225 
226 	ip = icl_cxgbei_new_pdu(M_NOWAIT);
227 	if (ip == NULL)
228 		CXGBE_UNIMPLEMENTED("PDU allocation failure");
229 	m_copydata(m, sizeof(*cpl), ISCSI_BHS_SIZE, (caddr_t)ip->ip_bhs);
230 	ip->ip_data_len = G_ISCSI_PDU_LEN(len_ddp) - len;
231 	icp = ip_to_icp(ip);
232 	icp->icp_seq = ntohl(cpl->seq);
233 	icp->icp_flags = ICPF_RX_HDR;
234 
235 	/* This is the start of a new PDU.  There should be no old state. */
236 	MPASS(toep->ulpcb2 == NULL);
237 	toep->ulpcb2 = icp;
238 
239 #if 0
240 	CTR5(KTR_CXGBE, "%s: tid %u, cpl->len %u, pdu_len_ddp 0x%04x, icp %p",
241 	    __func__, tid, len, len_ddp, icp);
242 #endif
243 
244 	m_freem(m);
245 	return (0);
246 }
247 
248 static int
249 do_rx_iscsi_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
250 {
251 	struct adapter *sc = iq->adapter;
252 	struct cpl_iscsi_data *cpl =  mtod(m, struct cpl_iscsi_data *);
253 	u_int tid = GET_TID(cpl);
254 	struct toepcb *toep = lookup_tid(sc, tid);
255 	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
256 	struct icl_pdu *ip;
257 
258 	M_ASSERTPKTHDR(m);
259 	MPASS(m->m_pkthdr.len == be16toh(cpl->len) + sizeof(*cpl));
260 
261 	if (icp == NULL) {
262 		/*
263 		 * T6 completion enabled, start of a new pdu. Header
264 		 * will come in completion CPL.
265 		 */
266 	        ip = icl_cxgbei_new_pdu(M_NOWAIT);
267 	        if (ip == NULL)
268 			CXGBE_UNIMPLEMENTED("PDU allocation failure");
269 		icp = ip_to_icp(ip);
270 	} else {
271 		/* T5 mode, header is already received. */
272 		MPASS(icp->icp_flags == ICPF_RX_HDR);
273 		MPASS(icp->ip.ip_data_mbuf == NULL);
274 		MPASS(icp->ip.ip_data_len == m->m_pkthdr.len - sizeof(*cpl));
275 	}
276 
277 	/* Trim the cpl header from mbuf. */
278 	m_adj(m, sizeof(*cpl));
279 
280 	icp->icp_flags |= ICPF_RX_FLBUF;
281 	icp->ip.ip_data_mbuf = m;
282 	toep->ofld_rxq->rx_iscsi_fl_pdus++;
283 	toep->ofld_rxq->rx_iscsi_fl_octets += m->m_pkthdr.len;
284 
285 	/*
286 	 * For T6, save the icp for further processing in the
287 	 * completion handler.
288 	 */
289 	if (icp->icp_flags == ICPF_RX_FLBUF) {
290 		MPASS(toep->ulpcb2 == NULL);
291 		toep->ulpcb2 = icp;
292 	}
293 
294 #if 0
295 	CTR4(KTR_CXGBE, "%s: tid %u, cpl->len %u, icp %p", __func__, tid,
296 	    be16toh(cpl->len), icp);
297 #endif
298 
299 	return (0);
300 }
301 
302 static int
303 do_rx_iscsi_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
304 {
305 	struct adapter *sc = iq->adapter;
306 	const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1);
307 	u_int tid = GET_TID(cpl);
308 	struct toepcb *toep = lookup_tid(sc, tid);
309 	struct inpcb *inp = toep->inp;
310 	struct socket *so;
311 	struct sockbuf *sb;
312 	struct tcpcb *tp;
313 	struct icl_cxgbei_conn *icc;
314 	struct icl_conn *ic;
315 	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
316 	struct icl_pdu *ip;
317 	u_int pdu_len, val;
318 	struct epoch_tracker et;
319 
320 	MPASS(m == NULL);
321 
322 	/* Must already be assembling a PDU. */
323 	MPASS(icp != NULL);
324 	MPASS(icp->icp_flags & ICPF_RX_HDR);	/* Data is optional. */
325 	MPASS((icp->icp_flags & ICPF_RX_STATUS) == 0);
326 
327 	pdu_len = be16toh(cpl->len);	/* includes everything. */
328 	val = be32toh(cpl->ddpvld);
329 
330 #if 0
331 	CTR5(KTR_CXGBE,
332 	    "%s: tid %u, cpl->len %u, ddpvld 0x%08x, icp_flags 0x%08x",
333 	    __func__, tid, pdu_len, val, icp->icp_flags);
334 #endif
335 
336 	icp->icp_flags |= ICPF_RX_STATUS;
337 	ip = &icp->ip;
338 	if (val & F_DDP_PADDING_ERR) {
339 		ICL_WARN("received PDU 0x%02x with invalid padding",
340 		    ip->ip_bhs->bhs_opcode);
341 		toep->ofld_rxq->rx_iscsi_padding_errors++;
342 	}
343 	if (val & F_DDP_HDRCRC_ERR) {
344 		ICL_WARN("received PDU 0x%02x with invalid header digest",
345 		    ip->ip_bhs->bhs_opcode);
346 		toep->ofld_rxq->rx_iscsi_header_digest_errors++;
347 	}
348 	if (val & F_DDP_DATACRC_ERR) {
349 		ICL_WARN("received PDU 0x%02x with invalid data digest",
350 		    ip->ip_bhs->bhs_opcode);
351 		toep->ofld_rxq->rx_iscsi_data_digest_errors++;
352 	}
353 	if (val & F_DDP_PDU && ip->ip_data_mbuf == NULL) {
354 		MPASS((icp->icp_flags & ICPF_RX_FLBUF) == 0);
355 		MPASS(ip->ip_data_len > 0);
356 		icp->icp_flags |= ICPF_RX_DDP;
357 		toep->ofld_rxq->rx_iscsi_ddp_pdus++;
358 		toep->ofld_rxq->rx_iscsi_ddp_octets += ip->ip_data_len;
359 	}
360 
361 	INP_WLOCK(inp);
362 	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
363 		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
364 		    __func__, tid, pdu_len, inp->inp_flags);
365 		INP_WUNLOCK(inp);
366 		icl_cxgbei_conn_pdu_free(NULL, ip);
367 		toep->ulpcb2 = NULL;
368 		return (0);
369 	}
370 
371 	/*
372 	 * T6+ does not report data PDUs received via DDP without F
373 	 * set.  This can result in gaps in the TCP sequence space.
374 	 */
375 	tp = intotcpcb(inp);
376 	MPASS(chip_id(sc) >= CHELSIO_T6 || icp->icp_seq == tp->rcv_nxt);
377 	tp->rcv_nxt = icp->icp_seq + pdu_len;
378 	tp->t_rcvtime = ticks;
379 
380 	/*
381 	 * Don't update the window size or return credits since RX
382 	 * flow control is disabled.
383 	 */
384 
385 	so = inp->inp_socket;
386 	sb = &so->so_rcv;
387 	SOCKBUF_LOCK(sb);
388 
389 	icc = toep->ulpcb;
390 	if (__predict_false(icc == NULL || sb->sb_state & SBS_CANTRCVMORE)) {
391 		CTR5(KTR_CXGBE,
392 		    "%s: tid %u, excess rx (%d bytes), icc %p, sb_state 0x%x",
393 		    __func__, tid, pdu_len, icc, sb->sb_state);
394 		SOCKBUF_UNLOCK(sb);
395 		INP_WUNLOCK(inp);
396 
397 		CURVNET_SET(so->so_vnet);
398 		NET_EPOCH_ENTER(et);
399 		INP_WLOCK(inp);
400 		tp = tcp_drop(tp, ECONNRESET);
401 		if (tp)
402 			INP_WUNLOCK(inp);
403 		NET_EPOCH_EXIT(et);
404 		CURVNET_RESTORE();
405 
406 		icl_cxgbei_conn_pdu_free(NULL, ip);
407 		toep->ulpcb2 = NULL;
408 		return (0);
409 	}
410 	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
411 	ic = &icc->ic;
412 	if ((val & (F_DDP_PADDING_ERR | F_DDP_HDRCRC_ERR |
413 	    F_DDP_DATACRC_ERR)) != 0) {
414 		SOCKBUF_UNLOCK(sb);
415 		INP_WUNLOCK(inp);
416 
417 		icl_cxgbei_conn_pdu_free(NULL, ip);
418 		toep->ulpcb2 = NULL;
419 		ic->ic_error(ic);
420 		return (0);
421 	}
422 	icl_cxgbei_new_pdu_set_conn(ip, ic);
423 
424 	MPASS(m == NULL); /* was unused, we'll use it now. */
425 	m = sbcut_locked(sb, sbused(sb)); /* XXXNP: toep->sb_cc accounting? */
426 	if (__predict_false(m != NULL)) {
427 		int len = m_length(m, NULL);
428 
429 		/*
430 		 * PDUs were received before the tid transitioned to ULP mode.
431 		 * Convert them to icl_cxgbei_pdus and send them to ICL before
432 		 * the PDU in icp/ip.
433 		 */
434 		CTR3(KTR_CXGBE, "%s: tid %u, %u bytes in so_rcv", __func__, tid,
435 		    len);
436 
437 		/* XXXNP: needs to be rewritten. */
438 		if (len == sizeof(struct iscsi_bhs) || len == 4 + sizeof(struct
439 		    iscsi_bhs)) {
440 			struct icl_cxgbei_pdu *icp0;
441 			struct icl_pdu *ip0;
442 
443 			ip0 = icl_cxgbei_new_pdu(M_NOWAIT);
444 			if (ip0 == NULL)
445 				CXGBE_UNIMPLEMENTED("PDU allocation failure");
446 			icl_cxgbei_new_pdu_set_conn(ip0, ic);
447 			icp0 = ip_to_icp(ip0);
448 			icp0->icp_seq = 0; /* XXX */
449 			icp0->icp_flags = ICPF_RX_HDR | ICPF_RX_STATUS;
450 			m_copydata(m, 0, sizeof(struct iscsi_bhs), (void *)ip0->ip_bhs);
451 			STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip0, ip_next);
452 		}
453 		m_freem(m);
454 	}
455 
456 	STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next);
457 	if ((icc->rx_flags & RXF_ACTIVE) == 0) {
458 		struct cxgbei_worker_thread_softc *cwt = &cwt_softc[icc->cwt];
459 
460 		mtx_lock(&cwt->cwt_lock);
461 		icc->rx_flags |= RXF_ACTIVE;
462 		TAILQ_INSERT_TAIL(&cwt->rx_head, icc, rx_link);
463 		if (cwt->cwt_state == CWT_SLEEPING) {
464 			cwt->cwt_state = CWT_RUNNING;
465 			cv_signal(&cwt->cwt_cv);
466 		}
467 		mtx_unlock(&cwt->cwt_lock);
468 	}
469 	SOCKBUF_UNLOCK(sb);
470 	INP_WUNLOCK(inp);
471 
472 	toep->ulpcb2 = NULL;
473 
474 	return (0);
475 }
476 
477 static int
478 do_rx_iscsi_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
479 {
480 	struct epoch_tracker et;
481 	struct adapter *sc = iq->adapter;
482 	struct cpl_rx_iscsi_cmp *cpl = mtod(m, struct cpl_rx_iscsi_cmp *);
483 	u_int tid = GET_TID(cpl);
484 	struct toepcb *toep = lookup_tid(sc, tid);
485 	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
486 	struct icl_pdu *ip;
487 	struct cxgbei_cmp *cmp;
488 	struct inpcb *inp = toep->inp;
489 #ifdef INVARIANTS
490 	uint16_t len = be16toh(cpl->len);
491 #endif
492 	struct socket *so;
493 	struct sockbuf *sb;
494 	struct tcpcb *tp;
495 	struct icl_cxgbei_conn *icc;
496 	struct icl_conn *ic;
497 	struct iscsi_bhs_data_out *bhsdo;
498 	u_int val = be32toh(cpl->ddpvld);
499 	u_int npdus, pdu_len, data_digest_len, hdr_digest_len;
500 	uint32_t prev_seg_len;
501 
502 	M_ASSERTPKTHDR(m);
503 	MPASS(m->m_pkthdr.len == len + sizeof(*cpl));
504 
505 	if ((val & F_DDP_PDU) == 0) {
506 		MPASS(icp != NULL);
507 		MPASS((icp->icp_flags & ICPF_RX_STATUS) == 0);
508 		ip = &icp->ip;
509 	}
510 
511 	if (icp == NULL) {
512 		/* T6 completion enabled, start of a new PDU. */
513 		ip = icl_cxgbei_new_pdu(M_NOWAIT);
514 		if (ip == NULL)
515 			CXGBE_UNIMPLEMENTED("PDU allocation failure");
516 		icp = ip_to_icp(ip);
517 	}
518 	pdu_len = G_ISCSI_PDU_LEN(be16toh(cpl->pdu_len_ddp));
519 
520 #if 0
521 	CTR5(KTR_CXGBE,
522 	    "%s: tid %u, cpl->len %u, ddpvld 0x%08x, icp %p",
523 	    __func__, tid, pdu_len, val, icp);
524 #endif
525 
526 	/* Copy header */
527 	m_copydata(m, sizeof(*cpl), ISCSI_BHS_SIZE, (caddr_t)ip->ip_bhs);
528 	bhsdo = (struct iscsi_bhs_data_out *)ip->ip_bhs;
529 	ip->ip_data_len = bhsdo->bhsdo_data_segment_len[0] << 16 |
530 	    bhsdo->bhsdo_data_segment_len[1] << 8 |
531 	    bhsdo->bhsdo_data_segment_len[2];
532 	icp->icp_seq = ntohl(cpl->seq);
533 	icp->icp_flags |= ICPF_RX_HDR;
534 	icp->icp_flags |= ICPF_RX_STATUS;
535 
536 	if (val & F_DDP_PADDING_ERR) {
537 		ICL_WARN("received PDU 0x%02x with invalid padding",
538 		    ip->ip_bhs->bhs_opcode);
539 		toep->ofld_rxq->rx_iscsi_padding_errors++;
540 	}
541 	if (val & F_DDP_HDRCRC_ERR) {
542 		ICL_WARN("received PDU 0x%02x with invalid header digest",
543 		    ip->ip_bhs->bhs_opcode);
544 		toep->ofld_rxq->rx_iscsi_header_digest_errors++;
545 	}
546 	if (val & F_DDP_DATACRC_ERR) {
547 		ICL_WARN("received PDU 0x%02x with invalid data digest",
548 		    ip->ip_bhs->bhs_opcode);
549 		toep->ofld_rxq->rx_iscsi_data_digest_errors++;
550 	}
551 
552 	INP_WLOCK(inp);
553 	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
554 		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
555 		    __func__, tid, pdu_len, inp->inp_flags);
556 		INP_WUNLOCK(inp);
557 		icl_cxgbei_conn_pdu_free(NULL, ip);
558 		toep->ulpcb2 = NULL;
559 		m_freem(m);
560 		return (0);
561 	}
562 
563 	tp = intotcpcb(inp);
564 
565 	/*
566 	 * If icc is NULL, the connection is being closed in
567 	 * icl_cxgbei_conn_close(), just drop this data.
568 	 */
569 	icc = toep->ulpcb;
570 	if (__predict_false(icc == NULL)) {
571 		CTR4(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes), icc %p",
572 		    __func__, tid, pdu_len, icc);
573 
574 		/*
575 		 * Update rcv_nxt so the sequence number of the FIN
576 		 * doesn't appear wrong.
577 		 */
578 		tp->rcv_nxt = icp->icp_seq + pdu_len;
579 		tp->t_rcvtime = ticks;
580 		INP_WUNLOCK(inp);
581 
582 		icl_cxgbei_conn_pdu_free(NULL, ip);
583 		toep->ulpcb2 = NULL;
584 		m_freem(m);
585 		return (0);
586 	}
587 
588 	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
589 	ic = &icc->ic;
590 	if ((val & (F_DDP_PADDING_ERR | F_DDP_HDRCRC_ERR |
591 	    F_DDP_DATACRC_ERR)) != 0) {
592 		INP_WUNLOCK(inp);
593 
594 		icl_cxgbei_conn_pdu_free(NULL, ip);
595 		toep->ulpcb2 = NULL;
596 		m_freem(m);
597 		ic->ic_error(ic);
598 		return (0);
599 	}
600 
601 	data_digest_len = (icc->ulp_submode & ULP_CRC_DATA) ?
602 	    ISCSI_DATA_DIGEST_SIZE : 0;
603 	hdr_digest_len = (icc->ulp_submode & ULP_CRC_HEADER) ?
604 	    ISCSI_HEADER_DIGEST_SIZE : 0;
605 	MPASS(roundup2(ip->ip_data_len, 4) == pdu_len - len - data_digest_len);
606 
607 	if (val & F_DDP_PDU && ip->ip_data_mbuf == NULL) {
608 		MPASS((icp->icp_flags & ICPF_RX_FLBUF) == 0);
609 		MPASS(ip->ip_data_len > 0);
610 		icp->icp_flags |= ICPF_RX_DDP;
611 		bhsdo = (struct iscsi_bhs_data_out *)ip->ip_bhs;
612 
613 		switch (ip->ip_bhs->bhs_opcode & ~ISCSI_BHS_OPCODE_IMMEDIATE) {
614 		case ISCSI_BHS_OPCODE_SCSI_DATA_IN:
615 			cmp = cxgbei_find_cmp(icc,
616 			    be32toh(bhsdo->bhsdo_initiator_task_tag));
617 			break;
618 		case ISCSI_BHS_OPCODE_SCSI_DATA_OUT:
619 			cmp = cxgbei_find_cmp(icc,
620 			    be32toh(bhsdo->bhsdo_target_transfer_tag));
621 			break;
622 		default:
623 			__assert_unreachable();
624 		}
625 		MPASS(cmp != NULL);
626 
627 		/*
628 		 * The difference between the end of the last burst
629 		 * and the offset of the last PDU in this burst is
630 		 * the additional data received via DDP.
631 		 */
632 		prev_seg_len = be32toh(bhsdo->bhsdo_buffer_offset) -
633 		    cmp->next_buffer_offset;
634 
635 		if (prev_seg_len != 0) {
636 			uint32_t orig_datasn;
637 
638 			/*
639 			 * Return a "large" PDU representing the burst
640 			 * of PDUs.  Adjust the offset and length of
641 			 * this PDU to represent the entire burst.
642 			 */
643 			ip->ip_data_len += prev_seg_len;
644 			bhsdo->bhsdo_data_segment_len[2] = ip->ip_data_len;
645 			bhsdo->bhsdo_data_segment_len[1] = ip->ip_data_len >> 8;
646 			bhsdo->bhsdo_data_segment_len[0] = ip->ip_data_len >> 16;
647 			bhsdo->bhsdo_buffer_offset =
648 			    htobe32(cmp->next_buffer_offset);
649 
650 			orig_datasn = htobe32(bhsdo->bhsdo_datasn);
651 			npdus = orig_datasn - cmp->last_datasn;
652 			bhsdo->bhsdo_datasn = htobe32(cmp->last_datasn + 1);
653 			cmp->last_datasn = orig_datasn;
654 			ip->ip_additional_pdus = npdus - 1;
655 		} else {
656 			MPASS(htobe32(bhsdo->bhsdo_datasn) ==
657 			    cmp->last_datasn + 1);
658 			npdus = 1;
659 			cmp->last_datasn = htobe32(bhsdo->bhsdo_datasn);
660 		}
661 
662 		cmp->next_buffer_offset += ip->ip_data_len;
663 		toep->ofld_rxq->rx_iscsi_ddp_pdus += npdus;
664 		toep->ofld_rxq->rx_iscsi_ddp_octets += ip->ip_data_len;
665 	} else {
666 		MPASS(icp->icp_flags & (ICPF_RX_FLBUF));
667 		MPASS(ip->ip_data_len == ip->ip_data_mbuf->m_pkthdr.len);
668 	}
669 
670 	tp->rcv_nxt = icp->icp_seq + pdu_len;
671 	tp->t_rcvtime = ticks;
672 
673 	/*
674 	 * Don't update the window size or return credits since RX
675 	 * flow control is disabled.
676 	 */
677 
678 	so = inp->inp_socket;
679 	sb = &so->so_rcv;
680 	SOCKBUF_LOCK(sb);
681 	if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
682 		CTR5(KTR_CXGBE,
683 		    "%s: tid %u, excess rx (%d bytes), icc %p, sb_state 0x%x",
684 		    __func__, tid, pdu_len, icc, sb->sb_state);
685 		SOCKBUF_UNLOCK(sb);
686 		INP_WUNLOCK(inp);
687 
688 		CURVNET_SET(so->so_vnet);
689 		NET_EPOCH_ENTER(et);
690 		INP_WLOCK(inp);
691 		tp = tcp_drop(tp, ECONNRESET);
692 		if (tp != NULL)
693 			INP_WUNLOCK(inp);
694 		NET_EPOCH_EXIT(et);
695 		CURVNET_RESTORE();
696 
697 		icl_cxgbei_conn_pdu_free(NULL, ip);
698 		toep->ulpcb2 = NULL;
699 		m_freem(m);
700 		return (0);
701 	}
702 	icl_cxgbei_new_pdu_set_conn(ip, ic);
703 
704 	/* Enqueue the PDU to the received pdus queue. */
705 	STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next);
706 	if ((icc->rx_flags & RXF_ACTIVE) == 0) {
707 		struct cxgbei_worker_thread_softc *cwt = &cwt_softc[icc->cwt];
708 
709 		mtx_lock(&cwt->cwt_lock);
710 		icc->rx_flags |= RXF_ACTIVE;
711 		TAILQ_INSERT_TAIL(&cwt->rx_head, icc, rx_link);
712 		if (cwt->cwt_state == CWT_SLEEPING) {
713 			cwt->cwt_state = CWT_RUNNING;
714 			cv_signal(&cwt->cwt_cv);
715 		}
716 		mtx_unlock(&cwt->cwt_lock);
717 	}
718 	SOCKBUF_UNLOCK(sb);
719 	INP_WUNLOCK(inp);
720 
721 	toep->ulpcb2 = NULL;
722 	m_freem(m);
723 
724 	return (0);
725 }
726 
727 static int
728 cxgbei_activate(struct adapter *sc)
729 {
730 	struct cxgbei_data *ci;
731 	int rc;
732 
733 	ASSERT_SYNCHRONIZED_OP(sc);
734 
735 	if (uld_active(sc, ULD_ISCSI)) {
736 		KASSERT(0, ("%s: iSCSI offload already enabled on adapter %p",
737 		    __func__, sc));
738 		return (0);
739 	}
740 
741 	if (sc->iscsicaps == 0 || sc->vres.iscsi.size == 0) {
742 		device_printf(sc->dev,
743 		    "not iSCSI offload capable, or capability disabled.\n");
744 		return (ENOSYS);
745 	}
746 
747 	/* per-adapter softc for iSCSI */
748 	ci = malloc(sizeof(*ci), M_CXGBE, M_ZERO | M_WAITOK);
749 	if (ci == NULL)
750 		return (ENOMEM);
751 
752 	rc = cxgbei_init(sc, ci);
753 	if (rc != 0) {
754 		free(ci, M_CXGBE);
755 		return (rc);
756 	}
757 
758 	sc->iscsi_ulp_softc = ci;
759 
760 	return (0);
761 }
762 
763 static int
764 cxgbei_deactivate(struct adapter *sc)
765 {
766 	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
767 
768 	ASSERT_SYNCHRONIZED_OP(sc);
769 
770 	if (ci != NULL) {
771 		sysctl_ctx_free(&ci->ctx);
772 		t4_free_ppod_region(&ci->pr);
773 		free(ci, M_CXGBE);
774 		sc->iscsi_ulp_softc = NULL;
775 	}
776 
777 	return (0);
778 }
779 
780 static void
781 cxgbei_activate_all(struct adapter *sc, void *arg __unused)
782 {
783 
784 	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isact") != 0)
785 		return;
786 
787 	/* Activate iSCSI if any port on this adapter has IFCAP_TOE enabled. */
788 	if (sc->offload_map && !uld_active(sc, ULD_ISCSI))
789 		(void) t4_activate_uld(sc, ULD_ISCSI);
790 
791 	end_synchronized_op(sc, 0);
792 }
793 
794 static void
795 cxgbei_deactivate_all(struct adapter *sc, void *arg __unused)
796 {
797 
798 	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isdea") != 0)
799 		return;
800 
801 	if (uld_active(sc, ULD_ISCSI))
802 	    (void) t4_deactivate_uld(sc, ULD_ISCSI);
803 
804 	end_synchronized_op(sc, 0);
805 }
806 
807 static struct uld_info cxgbei_uld_info = {
808 	.uld_id = ULD_ISCSI,
809 	.activate = cxgbei_activate,
810 	.deactivate = cxgbei_deactivate,
811 };
812 
813 static void
814 cwt_main(void *arg)
815 {
816 	struct cxgbei_worker_thread_softc *cwt = arg;
817 	struct icl_cxgbei_conn *icc = NULL;
818 	struct icl_conn *ic;
819 	struct icl_pdu *ip;
820 	struct sockbuf *sb;
821 	STAILQ_HEAD(, icl_pdu) rx_pdus = STAILQ_HEAD_INITIALIZER(rx_pdus);
822 
823 	MPASS(cwt != NULL);
824 
825 	mtx_lock(&cwt->cwt_lock);
826 	MPASS(cwt->cwt_state == 0);
827 	cwt->cwt_state = CWT_RUNNING;
828 	cv_signal(&cwt->cwt_cv);
829 
830 	while (__predict_true(cwt->cwt_state != CWT_STOP)) {
831 		cwt->cwt_state = CWT_RUNNING;
832 		while ((icc = TAILQ_FIRST(&cwt->rx_head)) != NULL) {
833 			TAILQ_REMOVE(&cwt->rx_head, icc, rx_link);
834 			mtx_unlock(&cwt->cwt_lock);
835 
836 			ic = &icc->ic;
837 			sb = &ic->ic_socket->so_rcv;
838 
839 			SOCKBUF_LOCK(sb);
840 			MPASS(icc->rx_flags & RXF_ACTIVE);
841 			if (__predict_true(!(sb->sb_state & SBS_CANTRCVMORE))) {
842 				MPASS(STAILQ_EMPTY(&rx_pdus));
843 				STAILQ_SWAP(&icc->rcvd_pdus, &rx_pdus, icl_pdu);
844 				SOCKBUF_UNLOCK(sb);
845 
846 				/* Hand over PDUs to ICL. */
847 				while ((ip = STAILQ_FIRST(&rx_pdus)) != NULL) {
848 					STAILQ_REMOVE_HEAD(&rx_pdus, ip_next);
849 					ic->ic_receive(ip);
850 				}
851 
852 				SOCKBUF_LOCK(sb);
853 				MPASS(STAILQ_EMPTY(&rx_pdus));
854 			}
855 			MPASS(icc->rx_flags & RXF_ACTIVE);
856 			if (STAILQ_EMPTY(&icc->rcvd_pdus) ||
857 			    __predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
858 				icc->rx_flags &= ~RXF_ACTIVE;
859 			} else {
860 				/*
861 				 * More PDUs were received while we were busy
862 				 * handing over the previous batch to ICL.
863 				 * Re-add this connection to the end of the
864 				 * queue.
865 				 */
866 				mtx_lock(&cwt->cwt_lock);
867 				TAILQ_INSERT_TAIL(&cwt->rx_head, icc,
868 				    rx_link);
869 				mtx_unlock(&cwt->cwt_lock);
870 			}
871 			SOCKBUF_UNLOCK(sb);
872 
873 			mtx_lock(&cwt->cwt_lock);
874 		}
875 
876 		/* Inner loop doesn't check for CWT_STOP, do that first. */
877 		if (__predict_false(cwt->cwt_state == CWT_STOP))
878 			break;
879 		cwt->cwt_state = CWT_SLEEPING;
880 		cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
881 	}
882 
883 	MPASS(TAILQ_FIRST(&cwt->rx_head) == NULL);
884 	mtx_assert(&cwt->cwt_lock, MA_OWNED);
885 	cwt->cwt_state = CWT_STOPPED;
886 	cv_signal(&cwt->cwt_cv);
887 	mtx_unlock(&cwt->cwt_lock);
888 	kthread_exit();
889 }
890 
891 static int
892 start_worker_threads(void)
893 {
894 	int i, rc;
895 	struct cxgbei_worker_thread_softc *cwt;
896 
897 	worker_thread_count = min(mp_ncpus, 32);
898 	cwt_softc = malloc(worker_thread_count * sizeof(*cwt), M_CXGBE,
899 	    M_WAITOK | M_ZERO);
900 
901 	MPASS(cxgbei_proc == NULL);
902 	for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) {
903 		mtx_init(&cwt->cwt_lock, "cwt lock", NULL, MTX_DEF);
904 		cv_init(&cwt->cwt_cv, "cwt cv");
905 		TAILQ_INIT(&cwt->rx_head);
906 		rc = kproc_kthread_add(cwt_main, cwt, &cxgbei_proc, NULL, 0, 0,
907 		    "cxgbei", "%d", i);
908 		if (rc != 0) {
909 			printf("cxgbei: failed to start thread #%d/%d (%d)\n",
910 			    i + 1, worker_thread_count, rc);
911 			mtx_destroy(&cwt->cwt_lock);
912 			cv_destroy(&cwt->cwt_cv);
913 			bzero(cwt, sizeof(*cwt));
914 			if (i == 0) {
915 				free(cwt_softc, M_CXGBE);
916 				worker_thread_count = 0;
917 
918 				return (rc);
919 			}
920 
921 			/* Not fatal, carry on with fewer threads. */
922 			worker_thread_count = i;
923 			rc = 0;
924 			break;
925 		}
926 
927 		/* Wait for thread to start before moving on to the next one. */
928 		mtx_lock(&cwt->cwt_lock);
929 		while (cwt->cwt_state == 0)
930 			cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
931 		mtx_unlock(&cwt->cwt_lock);
932 	}
933 
934 	MPASS(cwt_softc != NULL);
935 	MPASS(worker_thread_count > 0);
936 	return (0);
937 }
938 
939 static void
940 stop_worker_threads(void)
941 {
942 	int i;
943 	struct cxgbei_worker_thread_softc *cwt = &cwt_softc[0];
944 
945 	MPASS(worker_thread_count >= 0);
946 
947 	for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) {
948 		mtx_lock(&cwt->cwt_lock);
949 		MPASS(cwt->cwt_state == CWT_RUNNING ||
950 		    cwt->cwt_state == CWT_SLEEPING);
951 		cwt->cwt_state = CWT_STOP;
952 		cv_signal(&cwt->cwt_cv);
953 		do {
954 			cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
955 		} while (cwt->cwt_state != CWT_STOPPED);
956 		mtx_unlock(&cwt->cwt_lock);
957 		mtx_destroy(&cwt->cwt_lock);
958 		cv_destroy(&cwt->cwt_cv);
959 	}
960 	free(cwt_softc, M_CXGBE);
961 }
962 
963 /* Select a worker thread for a connection. */
964 u_int
965 cxgbei_select_worker_thread(struct icl_cxgbei_conn *icc)
966 {
967 	struct adapter *sc = icc->sc;
968 	struct toepcb *toep = icc->toep;
969 	u_int i, n;
970 
971 	n = worker_thread_count / sc->sge.nofldrxq;
972 	if (n > 0)
973 		i = toep->vi->pi->port_id * n + arc4random() % n;
974 	else
975 		i = arc4random() % worker_thread_count;
976 
977 	CTR3(KTR_CXGBE, "%s: tid %u, cwt %u", __func__, toep->tid, i);
978 
979 	return (i);
980 }
981 
982 static int
983 cxgbei_mod_load(void)
984 {
985 	int rc;
986 
987 	t4_register_cpl_handler(CPL_ISCSI_HDR, do_rx_iscsi_hdr);
988 	t4_register_cpl_handler(CPL_ISCSI_DATA, do_rx_iscsi_data);
989 	t4_register_cpl_handler(CPL_RX_ISCSI_DDP, do_rx_iscsi_ddp);
990 	t4_register_cpl_handler(CPL_RX_ISCSI_CMP, do_rx_iscsi_cmp);
991 
992 	rc = start_worker_threads();
993 	if (rc != 0)
994 		return (rc);
995 
996 	rc = t4_register_uld(&cxgbei_uld_info);
997 	if (rc != 0) {
998 		stop_worker_threads();
999 		return (rc);
1000 	}
1001 
1002 	t4_iterate(cxgbei_activate_all, NULL);
1003 
1004 	return (rc);
1005 }
1006 
1007 static int
1008 cxgbei_mod_unload(void)
1009 {
1010 
1011 	t4_iterate(cxgbei_deactivate_all, NULL);
1012 
1013 	if (t4_unregister_uld(&cxgbei_uld_info) == EBUSY)
1014 		return (EBUSY);
1015 
1016 	stop_worker_threads();
1017 
1018 	t4_register_cpl_handler(CPL_ISCSI_HDR, NULL);
1019 	t4_register_cpl_handler(CPL_ISCSI_DATA, NULL);
1020 	t4_register_cpl_handler(CPL_RX_ISCSI_DDP, NULL);
1021 	t4_register_cpl_handler(CPL_RX_ISCSI_CMP, NULL);
1022 
1023 	return (0);
1024 }
1025 #endif
1026 
1027 static int
1028 cxgbei_modevent(module_t mod, int cmd, void *arg)
1029 {
1030 	int rc = 0;
1031 
1032 #ifdef TCP_OFFLOAD
1033 	switch (cmd) {
1034 	case MOD_LOAD:
1035 		rc = cxgbei_mod_load();
1036 		if (rc == 0)
1037 			rc = icl_cxgbei_mod_load();
1038 		break;
1039 
1040 	case MOD_UNLOAD:
1041 		rc = icl_cxgbei_mod_unload();
1042 		if (rc == 0)
1043 			rc = cxgbei_mod_unload();
1044 		break;
1045 
1046 	default:
1047 		rc = EINVAL;
1048 	}
1049 #else
1050 	printf("cxgbei: compiled without TCP_OFFLOAD support.\n");
1051 	rc = EOPNOTSUPP;
1052 #endif
1053 
1054 	return (rc);
1055 }
1056 
1057 static moduledata_t cxgbei_mod = {
1058 	"cxgbei",
1059 	cxgbei_modevent,
1060 	NULL,
1061 };
1062 
1063 MODULE_VERSION(cxgbei, 1);
1064 DECLARE_MODULE(cxgbei, cxgbei_mod, SI_SUB_EXEC, SI_ORDER_ANY);
1065 MODULE_DEPEND(cxgbei, t4_tom, 1, 1, 1);
1066 MODULE_DEPEND(cxgbei, cxgbe, 1, 1, 1);
1067 MODULE_DEPEND(cxgbei, icl, 1, 1, 1);
1068