/*-
 * Copyright (c) 2012 Chelsio Communications, Inc.
 * All rights reserved.
 *
 * Chelsio T5xx iSCSI driver
 *
 * Written by: Sreenivasa Honnur <shonnur@chelsio.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"

#include <sys/types.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/module.h>
#include <sys/systm.h>

#ifdef TCP_OFFLOAD
#include <sys/errno.h>
#include <sys/gsb_crc32.h>
#include <sys/kthread.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/mbuf.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/condvar.h>
#include <sys/uio.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/toecore.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>

#include <cam/scsi/scsi_all.h>
#include <cam/scsi/scsi_da.h>
#include <cam/ctl/ctl_io.h>
#include <cam/ctl/ctl.h>
#include <cam/ctl/ctl_backend.h>
#include <cam/ctl/ctl_error.h>
#include <cam/ctl/ctl_frontend.h>
#include <cam/ctl/ctl_debug.h>
#include <cam/ctl/ctl_ha.h>
#include <cam/ctl/ctl_ioctl.h>

#include <dev/iscsi/icl.h>
#include <dev/iscsi/iscsi_proto.h>
#include <dev/iscsi/iscsi_ioctl.h>
#include <dev/iscsi/iscsi.h>
#include <cam/ctl/ctl_frontend_iscsi.h>

#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_xpt.h>
#include <cam/cam_debug.h>
#include <cam/cam_sim.h>
#include <cam/cam_xpt_sim.h>
#include <cam/cam_xpt_periph.h>
#include <cam/cam_periph.h>
#include <cam/cam_compat.h>
#include <cam/scsi/scsi_message.h>

#include "common/common.h"
#include "common/t4_msg.h"
#include "common/t4_regs.h"	/* for PCIE_MEM_ACCESS */
#include "tom/t4_tom.h"
#include "cxgbei.h"

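/*
 * Derive the per-direction limits on the iSCSI data segment length
 * from the hardware limits read below: the PMM page sizes, MAXRXDATA,
 * and the PMMAXXFERLEN transfer lengths all cap the PDU size, and the
 * BHS and digest overheads are then subtracted to convert a PDU length
 * limit into a data segment length limit.
 */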
static void
read_pdu_limits(struct adapter *sc, uint32_t *max_tx_data_len,
    uint32_t *max_rx_data_len, struct ppod_region *pr)
{
	uint32_t tx_len, rx_len, r, v;

	rx_len = t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE);
	tx_len = t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE);

	r = t4_read_reg(sc, A_TP_PARA_REG2);
	rx_len = min(rx_len, G_MAXRXDATA(r));
	tx_len = min(tx_len, G_MAXRXDATA(r));

	r = t4_read_reg(sc, A_TP_PARA_REG7);
	v = min(G_PMMAXXFERLEN0(r), G_PMMAXXFERLEN1(r));
	rx_len = min(rx_len, v);
	tx_len = min(tx_len, v);

	/*
	 * AHS is not supported by the kernel, so we do not account
	 * for it either in our PDU length <-> data segment length
	 * conversions.
	 */
	rx_len -= ISCSI_BHS_SIZE + ISCSI_HEADER_DIGEST_SIZE +
	    ISCSI_DATA_DIGEST_SIZE;
	tx_len -= ISCSI_BHS_SIZE + ISCSI_HEADER_DIGEST_SIZE +
	    ISCSI_DATA_DIGEST_SIZE;

	/*
	 * DDP can place only 4 pages for a single PDU.  A single
	 * request might use larger pages than the smallest page size,
	 * but that cannot be guaranteed.  Assume the smallest DDP
	 * page size for this limit.
	 */
	rx_len = min(rx_len, 4 * (1U << pr->pr_page_shift[0]));
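	/*
	 * For example, with a smallest DDP page size of 4KB
	 * (pr_page_shift[0] == 12), this caps rx_len at 16384 bytes.
	 */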

	if (chip_id(sc) == CHELSIO_T5) {
		tx_len = min(tx_len, 15360);

		rx_len = rounddown2(rx_len, 512);
		tx_len = rounddown2(tx_len, 512);
	}

	*max_tx_data_len = tx_len;
	*max_rx_data_len = rx_len;
}

/*
 * Initialize the software state of the iSCSI ULP driver.
 *
 * ENXIO means firmware didn't set up something that it was supposed to.
 */
static int
cxgbei_init(struct adapter *sc, struct cxgbei_data *ci)
{
	struct sysctl_oid *oid;
	struct sysctl_oid_list *children;
	struct ppod_region *pr;
	uint32_t r;
	int rc;

	MPASS(sc->vres.iscsi.size > 0);
	MPASS(ci != NULL);

	pr = &ci->pr;
	r = t4_read_reg(sc, A_ULP_RX_ISCSI_PSZ);
	rc = t4_init_ppod_region(pr, &sc->vres.iscsi, r, "iSCSI page pods");
	if (rc != 0) {
		device_printf(sc->dev,
		    "%s: failed to initialize the iSCSI page pod region: %u.\n",
		    __func__, rc);
		return (rc);
	}

	r = t4_read_reg(sc, A_ULP_RX_ISCSI_TAGMASK);
	r &= V_ISCSITAGMASK(M_ISCSITAGMASK);
	if (r != pr->pr_tag_mask) {
		/*
		 * Recent firmwares are supposed to set up the iSCSI
		 * tagmask, but we'll do it ourselves if the computed
		 * value doesn't match what's in the register.
		 */
		device_printf(sc->dev,
		    "tagmask 0x%08x does not match computed mask 0x%08x.\n", r,
		    pr->pr_tag_mask);
		t4_set_reg_field(sc, A_ULP_RX_ISCSI_TAGMASK,
		    V_ISCSITAGMASK(M_ISCSITAGMASK), pr->pr_tag_mask);
	}

	read_pdu_limits(sc, &ci->max_tx_data_len, &ci->max_rx_data_len, pr);

	sysctl_ctx_init(&ci->ctx);
	oid = device_get_sysctl_tree(sc->dev);	/* dev.t5nex.X */
	children = SYSCTL_CHILDREN(oid);

	oid = SYSCTL_ADD_NODE(&ci->ctx, children, OID_AUTO, "iscsi",
	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "iSCSI ULP settings");
	children = SYSCTL_CHILDREN(oid);

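	/*
	 * Received payloads shorter than this threshold are not
	 * expected to be worth zero-copy placement and should arrive
	 * via the freelist instead.  2048 is just the default; the
	 * sysctl below makes it tunable.
	 */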
	ci->ddp_threshold = 2048;
	SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "ddp_threshold",
	    CTLFLAG_RW, &ci->ddp_threshold, 0, "Rx zero copy threshold");

	SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "max_rx_data_len",
	    CTLFLAG_RW, &ci->max_rx_data_len, 0,
	    "Maximum receive data segment length");
	SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "max_tx_data_len",
	    CTLFLAG_RW, &ci->max_tx_data_len, 0,
	    "Maximum transmit data segment length");

	return (0);
}

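/*
 * Receive handlers.  A PDU is typically delivered as CPL_ISCSI_HDR
 * (BHS), an optional CPL_ISCSI_DATA (non-DDP payload), and a final
 * CPL_RX_ISCSI_DDP (status).  T6 parts with completion moderation
 * instead deliver the header and status together in CPL_RX_ISCSI_CMP.
 * The partially assembled PDU is carried between handlers in
 * toep->ulpcb2.
 */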
static int
do_rx_iscsi_hdr(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	struct cpl_iscsi_hdr *cpl = mtod(m, struct cpl_iscsi_hdr *);
	u_int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct icl_pdu *ip;
	struct icl_cxgbei_pdu *icp;
	uint16_t len_ddp = be16toh(cpl->pdu_len_ddp);
	uint16_t len = be16toh(cpl->len);

	M_ASSERTPKTHDR(m);
	MPASS(m->m_pkthdr.len == len + sizeof(*cpl));

	ip = icl_cxgbei_new_pdu(M_NOWAIT);
	if (ip == NULL)
		CXGBE_UNIMPLEMENTED("PDU allocation failure");
	m_copydata(m, sizeof(*cpl), ISCSI_BHS_SIZE, (caddr_t)ip->ip_bhs);
	ip->ip_data_len = G_ISCSI_PDU_LEN(len_ddp) - len;
	icp = ip_to_icp(ip);
	icp->icp_seq = ntohl(cpl->seq);
	icp->icp_flags = ICPF_RX_HDR;

	/* This is the start of a new PDU.  There should be no old state. */
	MPASS(toep->ulpcb2 == NULL);
	toep->ulpcb2 = icp;

#if 0
	CTR5(KTR_CXGBE, "%s: tid %u, cpl->len %u, pdu_len_ddp 0x%04x, icp %p",
	    __func__, tid, len, len_ddp, icp);
#endif

	m_freem(m);
	return (0);
}

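/*
 * Non-DDP payload for a PDU.  On T5 the matching CPL_ISCSI_HDR has
 * already been processed; on T6 with completions enabled this CPL may
 * arrive first, in which case a new PDU is started here and its header
 * is filled in later by do_rx_iscsi_cmp.
 */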
static int
do_rx_iscsi_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	struct cpl_iscsi_data *cpl = mtod(m, struct cpl_iscsi_data *);
	u_int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
	struct icl_pdu *ip;

	M_ASSERTPKTHDR(m);
	MPASS(m->m_pkthdr.len == be16toh(cpl->len) + sizeof(*cpl));

	if (icp == NULL) {
		/*
		 * T6 completion enabled, start of a new PDU.  The
		 * header will arrive in the completion CPL.
		 */
		ip = icl_cxgbei_new_pdu(M_NOWAIT);
		if (ip == NULL)
			CXGBE_UNIMPLEMENTED("PDU allocation failure");
		icp = ip_to_icp(ip);
	} else {
		/* T5 mode, header is already received. */
		MPASS(icp->icp_flags == ICPF_RX_HDR);
		MPASS(icp->ip.ip_data_mbuf == NULL);
		MPASS(icp->ip.ip_data_len == m->m_pkthdr.len - sizeof(*cpl));
	}

	/* Trim the cpl header from mbuf. */
	m_adj(m, sizeof(*cpl));

	icp->icp_flags |= ICPF_RX_FLBUF;
	icp->ip.ip_data_mbuf = m;
	toep->ofld_rxq->rx_iscsi_fl_pdus++;
	toep->ofld_rxq->rx_iscsi_fl_octets += m->m_pkthdr.len;

	/*
	 * For T6, save the icp for further processing in the
	 * completion handler.
	 */
	if (icp->icp_flags == ICPF_RX_FLBUF) {
		MPASS(toep->ulpcb2 == NULL);
		toep->ulpcb2 = icp;
	}

#if 0
	CTR4(KTR_CXGBE, "%s: tid %u, cpl->len %u, icp %p", __func__, tid,
	    be16toh(cpl->len), icp);
#endif

	return (0);
}

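/*
 * m_apply() callback that folds one mbuf segment into the CRC32C
 * accumulated in *arg.
 */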
static int
mbuf_crc32c_helper(void *arg, void *data, u_int len)
{
	uint32_t *digestp = arg;

	*digestp = calculate_crc32c(*digestp, data, len);
	return (0);
}

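/*
 * Read one iSCSI PDU out of the socket buffer, verifying the header
 * and data digests if the connection uses them.  Returns a newly
 * allocated icl_pdu, or NULL on any parse or digest error (which the
 * callers treat as fatal for the connection).
 */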
static struct icl_pdu *
parse_pdu(struct socket *so, struct toepcb *toep, struct icl_cxgbei_conn *icc,
    struct sockbuf *sb, u_int total_len)
{
	struct uio uio;
	struct iovec iov[2];
	struct iscsi_bhs bhs;
	struct mbuf *m;
	struct icl_pdu *ip;
	u_int ahs_len, data_len, header_len, pdu_len;
	uint32_t calc_digest, wire_digest;
	int error;

	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_READ;
	uio.uio_td = curthread;

	header_len = sizeof(struct iscsi_bhs);
	if (icc->ic.ic_header_crc32c)
		header_len += ISCSI_HEADER_DIGEST_SIZE;

	if (total_len < header_len) {
		ICL_WARN("truncated pre-offload PDU with len %u", total_len);
		return (NULL);
	}

	iov[0].iov_base = &bhs;
	iov[0].iov_len = sizeof(bhs);
	iov[1].iov_base = &wire_digest;
	iov[1].iov_len = sizeof(wire_digest);
	uio.uio_iov = iov;
	uio.uio_iovcnt = 1;
	if (icc->ic.ic_header_crc32c)
		uio.uio_iovcnt++;
	uio.uio_offset = 0;
	uio.uio_resid = header_len;
	error = soreceive(so, NULL, &uio, NULL, NULL, NULL);
	if (error != 0) {
		ICL_WARN("failed to read BHS from pre-offload PDU: %d", error);
		return (NULL);
	}

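	/*
	 * bhs_total_ahs_len is in units of 4-byte words and
	 * bhs_data_segment_len is a 24-bit big-endian value per the
	 * iSCSI BHS layout, e.g. a PDU carrying 8192 data bytes
	 * encodes it as {0x00, 0x20, 0x00}.
	 */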
	ahs_len = bhs.bhs_total_ahs_len * 4;
	data_len = bhs.bhs_data_segment_len[0] << 16 |
	    bhs.bhs_data_segment_len[1] << 8 |
	    bhs.bhs_data_segment_len[2];
	pdu_len = header_len + ahs_len + roundup2(data_len, 4);
	if (icc->ic.ic_data_crc32c && data_len != 0)
		pdu_len += ISCSI_DATA_DIGEST_SIZE;

	if (total_len < pdu_len) {
		ICL_WARN("truncated pre-offload PDU len %u vs %u", total_len,
		    pdu_len);
		return (NULL);
	}

	if (ahs_len != 0) {
		ICL_WARN("received pre-offload PDU with AHS");
		return (NULL);
	}

	if (icc->ic.ic_header_crc32c) {
		calc_digest = calculate_crc32c(0xffffffff, (caddr_t)&bhs,
		    sizeof(bhs));
		calc_digest ^= 0xffffffff;
		if (calc_digest != wire_digest) {
			ICL_WARN("received pre-offload PDU 0x%02x with "
			    "invalid header digest (0x%x vs 0x%x)",
			    bhs.bhs_opcode, wire_digest, calc_digest);
			toep->ofld_rxq->rx_iscsi_header_digest_errors++;
			return (NULL);
		}
	}

	m = NULL;
	if (data_len != 0) {
		uio.uio_iov = NULL;
		uio.uio_resid = roundup2(data_len, 4);
		if (icc->ic.ic_data_crc32c)
			uio.uio_resid += ISCSI_DATA_DIGEST_SIZE;

		error = soreceive(so, NULL, &uio, &m, NULL, NULL);
		if (error != 0) {
			ICL_WARN("failed to read data payload from "
			    "pre-offload PDU: %d", error);
			return (NULL);
		}

		if (icc->ic.ic_data_crc32c) {
			m_copydata(m, roundup2(data_len, 4),
			    sizeof(wire_digest), (caddr_t)&wire_digest);

			calc_digest = 0xffffffff;
			m_apply(m, 0, roundup2(data_len, 4), mbuf_crc32c_helper,
			    &calc_digest);
			calc_digest ^= 0xffffffff;
			if (calc_digest != wire_digest) {
				ICL_WARN("received pre-offload PDU 0x%02x "
				    "with invalid data digest (0x%x vs 0x%x)",
				    bhs.bhs_opcode, wire_digest, calc_digest);
				toep->ofld_rxq->rx_iscsi_data_digest_errors++;
				m_freem(m);
				return (NULL);
			}
		}
	}

	ip = icl_cxgbei_new_pdu(M_WAITOK);
	icl_cxgbei_new_pdu_set_conn(ip, &icc->ic);
	*ip->ip_bhs = bhs;
	ip->ip_data_len = data_len;
	ip->ip_data_mbuf = m;
	return (ip);
}

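/*
 * Split data received on a connection before it was handed off to the
 * TOE into PDUs and append them, in order, to rcvd_pdus.  Called with
 * the socket buffer locked; the lock is dropped around each call into
 * parse_pdu() and reacquired before returning.
 */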
void
parse_pdus(struct icl_cxgbei_conn *icc, struct sockbuf *sb)
{
	struct icl_conn *ic = &icc->ic;
	struct socket *so = ic->ic_socket;
	struct toepcb *toep = icc->toep;
	struct icl_pdu *ip, *lastip;
	u_int total_len;

	SOCKBUF_LOCK_ASSERT(sb);

	CTR3(KTR_CXGBE, "%s: tid %u, %u bytes in so_rcv", __func__, toep->tid,
	    sbused(sb));

	lastip = NULL;
	while (sbused(sb) != 0 && (sb->sb_state & SBS_CANTRCVMORE) == 0) {
		total_len = sbused(sb);
		SOCKBUF_UNLOCK(sb);

		ip = parse_pdu(so, toep, icc, sb, total_len);

		if (ip == NULL) {
			ic->ic_error(ic);
			SOCKBUF_LOCK(sb);
			return;
		}

		if (lastip == NULL)
			STAILQ_INSERT_HEAD(&icc->rcvd_pdus, ip, ip_next);
		else
			STAILQ_INSERT_AFTER(&icc->rcvd_pdus, lastip, ip,
			    ip_next);
		lastip = ip;

		SOCKBUF_LOCK(sb);
	}
}

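/*
 * Final status for a PDU whose header arrived earlier via
 * CPL_ISCSI_HDR.  cpl->ddpvld carries the DDP-placement and
 * padding/digest error flags and cpl->len covers the PDU's entire
 * length on the wire.
 */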
static int
do_rx_iscsi_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1);
	u_int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
	struct socket *so;
	struct sockbuf *sb;
	struct tcpcb *tp;
	struct icl_cxgbei_conn *icc;
	struct icl_conn *ic;
	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
	struct icl_pdu *ip;
	u_int pdu_len, val;
	struct epoch_tracker et;

	MPASS(m == NULL);

	/* Must already be assembling a PDU. */
	MPASS(icp != NULL);
	MPASS(icp->icp_flags & ICPF_RX_HDR);	/* Data is optional. */
	MPASS((icp->icp_flags & ICPF_RX_STATUS) == 0);

	pdu_len = be16toh(cpl->len);	/* includes everything. */
	val = be32toh(cpl->ddpvld);

#if 0
	CTR5(KTR_CXGBE,
	    "%s: tid %u, cpl->len %u, ddpvld 0x%08x, icp_flags 0x%08x",
	    __func__, tid, pdu_len, val, icp->icp_flags);
#endif

	icp->icp_flags |= ICPF_RX_STATUS;
	ip = &icp->ip;
	if (val & F_DDP_PADDING_ERR) {
		ICL_WARN("received PDU 0x%02x with invalid padding",
		    ip->ip_bhs->bhs_opcode);
		toep->ofld_rxq->rx_iscsi_padding_errors++;
	}
	if (val & F_DDP_HDRCRC_ERR) {
		ICL_WARN("received PDU 0x%02x with invalid header digest",
		    ip->ip_bhs->bhs_opcode);
		toep->ofld_rxq->rx_iscsi_header_digest_errors++;
	}
	if (val & F_DDP_DATACRC_ERR) {
		ICL_WARN("received PDU 0x%02x with invalid data digest",
		    ip->ip_bhs->bhs_opcode);
		toep->ofld_rxq->rx_iscsi_data_digest_errors++;
	}
	if (val & F_DDP_PDU && ip->ip_data_mbuf == NULL) {
		MPASS((icp->icp_flags & ICPF_RX_FLBUF) == 0);
		MPASS(ip->ip_data_len > 0);
		icp->icp_flags |= ICPF_RX_DDP;
		toep->ofld_rxq->rx_iscsi_ddp_pdus++;
		toep->ofld_rxq->rx_iscsi_ddp_octets += ip->ip_data_len;
	}

	INP_WLOCK(inp);
	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
		    __func__, tid, pdu_len, inp->inp_flags);
		INP_WUNLOCK(inp);
		icl_cxgbei_conn_pdu_free(NULL, ip);
		toep->ulpcb2 = NULL;
		return (0);
	}

	/*
	 * T6 and later chips do not report data PDUs received via DDP
	 * without the F (final) flag set.  This can result in gaps in
	 * the TCP sequence space.
	 */
	tp = intotcpcb(inp);
	MPASS(chip_id(sc) >= CHELSIO_T6 || icp->icp_seq == tp->rcv_nxt);
	tp->rcv_nxt = icp->icp_seq + pdu_len;
	tp->t_rcvtime = ticks;

	/*
	 * Don't update the window size or return credits since RX
	 * flow control is disabled.
	 */

	so = inp->inp_socket;
	sb = &so->so_rcv;
	SOCKBUF_LOCK(sb);

	icc = toep->ulpcb;
	if (__predict_false(icc == NULL || sb->sb_state & SBS_CANTRCVMORE)) {
		CTR5(KTR_CXGBE,
		    "%s: tid %u, excess rx (%d bytes), icc %p, sb_state 0x%x",
		    __func__, tid, pdu_len, icc, sb->sb_state);
		SOCKBUF_UNLOCK(sb);
		INP_WUNLOCK(inp);

		CURVNET_SET(so->so_vnet);
		NET_EPOCH_ENTER(et);
		INP_WLOCK(inp);
		tp = tcp_drop(tp, ECONNRESET);
		if (tp != NULL)
			INP_WUNLOCK(inp);
		NET_EPOCH_EXIT(et);
		CURVNET_RESTORE();

		icl_cxgbei_conn_pdu_free(NULL, ip);
		toep->ulpcb2 = NULL;
		return (0);
	}
	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
	ic = &icc->ic;
	if ((val & (F_DDP_PADDING_ERR | F_DDP_HDRCRC_ERR |
	    F_DDP_DATACRC_ERR)) != 0) {
		SOCKBUF_UNLOCK(sb);
		INP_WUNLOCK(inp);

		icl_cxgbei_conn_pdu_free(NULL, ip);
		toep->ulpcb2 = NULL;
		ic->ic_error(ic);
		return (0);
	}

	icl_cxgbei_new_pdu_set_conn(ip, ic);

	STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next);
	if (!icc->rx_active) {
		icc->rx_active = true;
		wakeup(&icc->rx_active);
	}
	SOCKBUF_UNLOCK(sb);
	INP_WUNLOCK(inp);

	toep->ulpcb2 = NULL;

	return (0);
}

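/*
 * T6 completion-moderated delivery: one CPL_RX_ISCSI_CMP provides the
 * BHS and final status for a PDU, or for a whole burst of Data-In or
 * Data-Out PDUs placed by DDP, replacing the separate CPL_ISCSI_HDR
 * and CPL_RX_ISCSI_DDP messages used by T5.
 */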
static int
do_rx_iscsi_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct epoch_tracker et;
	struct adapter *sc = iq->adapter;
	struct cpl_rx_iscsi_cmp *cpl = mtod(m, struct cpl_rx_iscsi_cmp *);
	u_int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
	struct icl_pdu *ip;
	struct cxgbei_cmp *cmp;
	struct inpcb *inp = toep->inp;
#ifdef INVARIANTS
	uint16_t len = be16toh(cpl->len);
	u_int data_digest_len;
#endif
	struct socket *so;
	struct sockbuf *sb;
	struct tcpcb *tp;
	struct icl_cxgbei_conn *icc;
	struct icl_conn *ic;
	struct iscsi_bhs_data_out *bhsdo;
	u_int val = be32toh(cpl->ddpvld);
	u_int npdus, pdu_len;
	uint32_t prev_seg_len;

	M_ASSERTPKTHDR(m);
	MPASS(m->m_pkthdr.len == len + sizeof(*cpl));

	if ((val & F_DDP_PDU) == 0) {
		MPASS(icp != NULL);
		MPASS((icp->icp_flags & ICPF_RX_STATUS) == 0);
		ip = &icp->ip;
	}

	if (icp == NULL) {
		/* T6 completion enabled, start of a new PDU. */
		ip = icl_cxgbei_new_pdu(M_NOWAIT);
		if (ip == NULL)
			CXGBE_UNIMPLEMENTED("PDU allocation failure");
		icp = ip_to_icp(ip);
	}
	pdu_len = G_ISCSI_PDU_LEN(be16toh(cpl->pdu_len_ddp));

#if 0
	CTR5(KTR_CXGBE,
	    "%s: tid %u, cpl->len %u, ddpvld 0x%08x, icp %p",
	    __func__, tid, pdu_len, val, icp);
#endif

	/* Copy header */
	m_copydata(m, sizeof(*cpl), ISCSI_BHS_SIZE, (caddr_t)ip->ip_bhs);
	bhsdo = (struct iscsi_bhs_data_out *)ip->ip_bhs;
	ip->ip_data_len = bhsdo->bhsdo_data_segment_len[0] << 16 |
	    bhsdo->bhsdo_data_segment_len[1] << 8 |
	    bhsdo->bhsdo_data_segment_len[2];
	icp->icp_seq = ntohl(cpl->seq);
	icp->icp_flags |= ICPF_RX_HDR;
	icp->icp_flags |= ICPF_RX_STATUS;

	if (val & F_DDP_PADDING_ERR) {
		ICL_WARN("received PDU 0x%02x with invalid padding",
		    ip->ip_bhs->bhs_opcode);
		toep->ofld_rxq->rx_iscsi_padding_errors++;
	}
	if (val & F_DDP_HDRCRC_ERR) {
		ICL_WARN("received PDU 0x%02x with invalid header digest",
		    ip->ip_bhs->bhs_opcode);
		toep->ofld_rxq->rx_iscsi_header_digest_errors++;
	}
	if (val & F_DDP_DATACRC_ERR) {
		ICL_WARN("received PDU 0x%02x with invalid data digest",
		    ip->ip_bhs->bhs_opcode);
		toep->ofld_rxq->rx_iscsi_data_digest_errors++;
	}

	INP_WLOCK(inp);
	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
		    __func__, tid, pdu_len, inp->inp_flags);
		INP_WUNLOCK(inp);
		icl_cxgbei_conn_pdu_free(NULL, ip);
		toep->ulpcb2 = NULL;
		m_freem(m);
		return (0);
	}

	tp = intotcpcb(inp);

	/*
	 * If icc is NULL, the connection is being closed in
	 * icl_cxgbei_conn_close(), just drop this data.
	 */
	icc = toep->ulpcb;
	if (__predict_false(icc == NULL)) {
		CTR4(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes), icc %p",
		    __func__, tid, pdu_len, icc);

		/*
		 * Update rcv_nxt so the sequence number of the FIN
		 * doesn't appear wrong.
		 */
		tp->rcv_nxt = icp->icp_seq + pdu_len;
		tp->t_rcvtime = ticks;
		INP_WUNLOCK(inp);

		icl_cxgbei_conn_pdu_free(NULL, ip);
		toep->ulpcb2 = NULL;
		m_freem(m);
		return (0);
	}

	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
	ic = &icc->ic;
	if ((val & (F_DDP_PADDING_ERR | F_DDP_HDRCRC_ERR |
	    F_DDP_DATACRC_ERR)) != 0) {
		INP_WUNLOCK(inp);

		icl_cxgbei_conn_pdu_free(NULL, ip);
		toep->ulpcb2 = NULL;
		m_freem(m);
		ic->ic_error(ic);
		return (0);
	}

#ifdef INVARIANTS
	data_digest_len = (icc->ulp_submode & ULP_CRC_DATA) ?
	    ISCSI_DATA_DIGEST_SIZE : 0;
	MPASS(roundup2(ip->ip_data_len, 4) == pdu_len - len - data_digest_len);
#endif

	if (val & F_DDP_PDU && ip->ip_data_mbuf == NULL) {
		MPASS((icp->icp_flags & ICPF_RX_FLBUF) == 0);
		MPASS(ip->ip_data_len > 0);
		icp->icp_flags |= ICPF_RX_DDP;
		bhsdo = (struct iscsi_bhs_data_out *)ip->ip_bhs;

		switch (ip->ip_bhs->bhs_opcode & ~ISCSI_BHS_OPCODE_IMMEDIATE) {
		case ISCSI_BHS_OPCODE_SCSI_DATA_IN:
			cmp = cxgbei_find_cmp(icc,
			    be32toh(bhsdo->bhsdo_initiator_task_tag));
			break;
		case ISCSI_BHS_OPCODE_SCSI_DATA_OUT:
			cmp = cxgbei_find_cmp(icc,
			    be32toh(bhsdo->bhsdo_target_transfer_tag));
			break;
		default:
			__assert_unreachable();
		}
		MPASS(cmp != NULL);

		/*
		 * The difference between the end of the last burst
		 * and the offset of the last PDU in this burst is
		 * the additional data received via DDP.
		 */
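		/*
		 * For example, if four 8KB Data-Out PDUs were placed
		 * by DDP and only the last one generated this CPL,
		 * its buffer offset is 24576 while next_buffer_offset
		 * is still 0, so prev_seg_len is 24576 and the PDU is
		 * widened below to cover the full 32768-byte burst.
		 */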
		prev_seg_len = be32toh(bhsdo->bhsdo_buffer_offset) -
		    cmp->next_buffer_offset;

		if (prev_seg_len != 0) {
			uint32_t orig_datasn;

			/*
			 * Return a "large" PDU representing the burst
			 * of PDUs.  Adjust the offset and length of
			 * this PDU to represent the entire burst.
			 */
			ip->ip_data_len += prev_seg_len;
			bhsdo->bhsdo_data_segment_len[2] = ip->ip_data_len;
			bhsdo->bhsdo_data_segment_len[1] = ip->ip_data_len >> 8;
			bhsdo->bhsdo_data_segment_len[0] = ip->ip_data_len >> 16;
			bhsdo->bhsdo_buffer_offset =
			    htobe32(cmp->next_buffer_offset);

			orig_datasn = be32toh(bhsdo->bhsdo_datasn);
			npdus = orig_datasn - cmp->last_datasn;
			bhsdo->bhsdo_datasn = htobe32(cmp->last_datasn + 1);
			cmp->last_datasn = orig_datasn;
			ip->ip_additional_pdus = npdus - 1;
		} else {
			MPASS(be32toh(bhsdo->bhsdo_datasn) ==
			    cmp->last_datasn + 1);
			npdus = 1;
			cmp->last_datasn = be32toh(bhsdo->bhsdo_datasn);
		}

		cmp->next_buffer_offset += ip->ip_data_len;
		toep->ofld_rxq->rx_iscsi_ddp_pdus += npdus;
		toep->ofld_rxq->rx_iscsi_ddp_octets += ip->ip_data_len;
	} else {
		MPASS(icp->icp_flags & (ICPF_RX_FLBUF));
		MPASS(ip->ip_data_len == ip->ip_data_mbuf->m_pkthdr.len);
	}

	tp->rcv_nxt = icp->icp_seq + pdu_len;
	tp->t_rcvtime = ticks;

	/*
	 * Don't update the window size or return credits since RX
	 * flow control is disabled.
	 */

	so = inp->inp_socket;
	sb = &so->so_rcv;
	SOCKBUF_LOCK(sb);
	if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
		CTR5(KTR_CXGBE,
		    "%s: tid %u, excess rx (%d bytes), icc %p, sb_state 0x%x",
		    __func__, tid, pdu_len, icc, sb->sb_state);
		SOCKBUF_UNLOCK(sb);
		INP_WUNLOCK(inp);

		CURVNET_SET(so->so_vnet);
		NET_EPOCH_ENTER(et);
		INP_WLOCK(inp);
		tp = tcp_drop(tp, ECONNRESET);
		if (tp != NULL)
			INP_WUNLOCK(inp);
		NET_EPOCH_EXIT(et);
		CURVNET_RESTORE();

		icl_cxgbei_conn_pdu_free(NULL, ip);
		toep->ulpcb2 = NULL;
		m_freem(m);
		return (0);
	}

	icl_cxgbei_new_pdu_set_conn(ip, ic);

	/* Enqueue the PDU to the received pdus queue. */
	STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next);
	if (!icc->rx_active) {
		icc->rx_active = true;
		wakeup(&icc->rx_active);
	}
	SOCKBUF_UNLOCK(sb);
	INP_WUNLOCK(inp);

	toep->ulpcb2 = NULL;
	m_freem(m);

	return (0);
}

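/*
 * ULD callback to set up the per-adapter iSCSI offload state.  Fails
 * with ENOSYS if the adapter is not iSCSI capable or the capability
 * has been disabled.
 */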
static int
cxgbei_activate(struct adapter *sc)
{
	struct cxgbei_data *ci;
	int rc;

	ASSERT_SYNCHRONIZED_OP(sc);

	if (uld_active(sc, ULD_ISCSI)) {
		KASSERT(0, ("%s: iSCSI offload already enabled on adapter %p",
		    __func__, sc));
		return (0);
	}

	if (sc->iscsicaps == 0 || sc->vres.iscsi.size == 0) {
		device_printf(sc->dev,
		    "not iSCSI offload capable, or capability disabled.\n");
		return (ENOSYS);
	}

	/* per-adapter softc for iSCSI */
	ci = malloc(sizeof(*ci), M_CXGBE, M_ZERO | M_WAITOK);

	rc = cxgbei_init(sc, ci);
	if (rc != 0) {
		free(ci, M_CXGBE);
		return (rc);
	}

	sc->iscsi_ulp_softc = ci;

	return (0);
}

static int
cxgbei_deactivate(struct adapter *sc)
{
	struct cxgbei_data *ci = sc->iscsi_ulp_softc;

	ASSERT_SYNCHRONIZED_OP(sc);

	if (ci != NULL) {
		sysctl_ctx_free(&ci->ctx);
		t4_free_ppod_region(&ci->pr);
		free(ci, M_CXGBE);
		sc->iscsi_ulp_softc = NULL;
	}

	return (0);
}

static void
cxgbei_activate_all(struct adapter *sc, void *arg __unused)
{

	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isact") != 0)
		return;

	/* Activate iSCSI if any port on this adapter has IFCAP_TOE enabled. */
	if (sc->offload_map && !uld_active(sc, ULD_ISCSI))
		(void) t4_activate_uld(sc, ULD_ISCSI);

	end_synchronized_op(sc, 0);
}

static void
cxgbei_deactivate_all(struct adapter *sc, void *arg __unused)
{

	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isdea") != 0)
		return;

	if (uld_active(sc, ULD_ISCSI))
		(void) t4_deactivate_uld(sc, ULD_ISCSI);

	end_synchronized_op(sc, 0);
}

static struct uld_info cxgbei_uld_info = {
	.uld_id = ULD_ISCSI,
	.activate = cxgbei_activate,
	.deactivate = cxgbei_deactivate,
};

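/*
 * Install the CPL handlers before registering the ULD so that no
 * iSCSI CPL can arrive without a handler, then activate the ULD on
 * any adapters that already have offload enabled.
 */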
static int
cxgbei_mod_load(void)
{
	int rc;

	t4_register_cpl_handler(CPL_ISCSI_HDR, do_rx_iscsi_hdr);
	t4_register_cpl_handler(CPL_ISCSI_DATA, do_rx_iscsi_data);
	t4_register_cpl_handler(CPL_RX_ISCSI_DDP, do_rx_iscsi_ddp);
	t4_register_cpl_handler(CPL_RX_ISCSI_CMP, do_rx_iscsi_cmp);

	rc = t4_register_uld(&cxgbei_uld_info);
	if (rc != 0)
		return (rc);

	t4_iterate(cxgbei_activate_all, NULL);

	return (rc);
}

static int
cxgbei_mod_unload(void)
{

	t4_iterate(cxgbei_deactivate_all, NULL);

	if (t4_unregister_uld(&cxgbei_uld_info) == EBUSY)
		return (EBUSY);

	t4_register_cpl_handler(CPL_ISCSI_HDR, NULL);
	t4_register_cpl_handler(CPL_ISCSI_DATA, NULL);
	t4_register_cpl_handler(CPL_RX_ISCSI_DDP, NULL);
	t4_register_cpl_handler(CPL_RX_ISCSI_CMP, NULL);

	return (0);
}
#endif

static int
cxgbei_modevent(module_t mod, int cmd, void *arg)
{
	int rc = 0;

#ifdef TCP_OFFLOAD
	switch (cmd) {
	case MOD_LOAD:
		rc = cxgbei_mod_load();
		if (rc == 0)
			rc = icl_cxgbei_mod_load();
		break;

	case MOD_UNLOAD:
		rc = icl_cxgbei_mod_unload();
		if (rc == 0)
			rc = cxgbei_mod_unload();
		break;

	default:
		rc = EINVAL;
	}
#else
	printf("cxgbei: compiled without TCP_OFFLOAD support.\n");
	rc = EOPNOTSUPP;
#endif

	return (rc);
}

static moduledata_t cxgbei_mod = {
	"cxgbei",
	cxgbei_modevent,
	NULL,
};

MODULE_VERSION(cxgbei, 1);
DECLARE_MODULE(cxgbei, cxgbei_mod, SI_SUB_EXEC, SI_ORDER_ANY);
MODULE_DEPEND(cxgbei, t4_tom, 1, 1, 1);
MODULE_DEPEND(cxgbei, cxgbe, 1, 1, 1);
MODULE_DEPEND(cxgbei, icl, 1, 1, 1);