xref: /freebsd/sys/dev/cxgbe/cxgbei/cxgbei.c (revision 058ac3e8063366dafa634d9107642e12b038bf09)
1 /*-
2  * Copyright (c) 2012 Chelsio Communications, Inc.
3  * All rights reserved.
4  *
5  * Chelsio T5xx iSCSI driver
6  *
7  * Written by: Sreenivasa Honnur <shonnur@chelsio.com>
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include "opt_inet.h"
35 #include "opt_inet6.h"
36 
37 #include <sys/types.h>
38 #include <sys/param.h>
39 #include <sys/kernel.h>
40 #include <sys/ktr.h>
41 #include <sys/module.h>
42 #include <sys/systm.h>
43 
44 #ifdef TCP_OFFLOAD
45 #include <sys/errno.h>
46 #include <sys/gsb_crc32.h>
47 #include <sys/kthread.h>
48 #include <sys/smp.h>
49 #include <sys/socket.h>
50 #include <sys/socketvar.h>
51 #include <sys/mbuf.h>
52 #include <sys/lock.h>
53 #include <sys/mutex.h>
54 #include <sys/condvar.h>
55 #include <sys/uio.h>
56 
57 #include <netinet/in.h>
58 #include <netinet/in_pcb.h>
59 #include <netinet/toecore.h>
60 #include <netinet/tcp_var.h>
61 #include <netinet/tcp_fsm.h>
62 
63 #include <cam/scsi/scsi_all.h>
64 #include <cam/scsi/scsi_da.h>
65 #include <cam/ctl/ctl_io.h>
66 #include <cam/ctl/ctl.h>
67 #include <cam/ctl/ctl_backend.h>
68 #include <cam/ctl/ctl_error.h>
69 #include <cam/ctl/ctl_frontend.h>
70 #include <cam/ctl/ctl_debug.h>
71 #include <cam/ctl/ctl_ha.h>
72 #include <cam/ctl/ctl_ioctl.h>
73 
74 #include <dev/iscsi/icl.h>
75 #include <dev/iscsi/iscsi_proto.h>
76 #include <dev/iscsi/iscsi_ioctl.h>
77 #include <dev/iscsi/iscsi.h>
78 #include <cam/ctl/ctl_frontend_iscsi.h>
79 
80 #include <cam/cam.h>
81 #include <cam/cam_ccb.h>
82 #include <cam/cam_xpt.h>
83 #include <cam/cam_debug.h>
84 #include <cam/cam_sim.h>
85 #include <cam/cam_xpt_sim.h>
86 #include <cam/cam_xpt_periph.h>
87 #include <cam/cam_periph.h>
88 #include <cam/cam_compat.h>
89 #include <cam/scsi/scsi_message.h>
90 
91 #include "common/common.h"
92 #include "common/t4_msg.h"
93 #include "common/t4_regs.h"	/* for PCIE_MEM_ACCESS */
94 #include "tom/t4_tom.h"
95 #include "cxgbei.h"
96 
97 static void
98 read_pdu_limits(struct adapter *sc, uint32_t *max_tx_data_len,
99     uint32_t *max_rx_data_len, struct ppod_region *pr)
100 {
101 	uint32_t tx_len, rx_len, r, v;
102 
103 	rx_len = t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE);
104 	tx_len = t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE);
105 
106 	r = t4_read_reg(sc, A_TP_PARA_REG2);
107 	rx_len = min(rx_len, G_MAXRXDATA(r));
108 	tx_len = min(tx_len, G_MAXRXDATA(r));
109 
110 	r = t4_read_reg(sc, A_TP_PARA_REG7);
111 	v = min(G_PMMAXXFERLEN0(r), G_PMMAXXFERLEN1(r));
112 	rx_len = min(rx_len, v);
113 	tx_len = min(tx_len, v);
114 
115 	/*
116 	 * AHS is not supported by the kernel so we'll not account for
117 	 * it either in our PDU len -> data segment len conversions.
118 	 */
119 	rx_len -= ISCSI_BHS_SIZE + ISCSI_HEADER_DIGEST_SIZE +
120 	    ISCSI_DATA_DIGEST_SIZE;
121 	tx_len -= ISCSI_BHS_SIZE + ISCSI_HEADER_DIGEST_SIZE +
122 	    ISCSI_DATA_DIGEST_SIZE;
123 
124 	/*
125 	 * DDP can place only 4 pages for a single PDU.  A single
126 	 * request might use larger pages than the smallest page size,
127 	 * but that cannot be guaranteed.  Assume the smallest DDP
128 	 * page size for this limit.
129 	 */
130 	rx_len = min(rx_len, 4 * (1U << pr->pr_page_shift[0]));
131 
132 	if (chip_id(sc) == CHELSIO_T5) {
133 		tx_len = min(tx_len, 15360);
134 
135 		rx_len = rounddown2(rx_len, 512);
136 		tx_len = rounddown2(tx_len, 512);
137 	}
138 
139 	*max_tx_data_len = tx_len;
140 	*max_rx_data_len = rx_len;
141 }
142 
143 /*
144  * Initialize the software state of the iSCSI ULP driver.
145  *
146  * ENXIO means firmware didn't set up something that it was supposed to.
147  */
148 static int
149 cxgbei_init(struct adapter *sc, struct cxgbei_data *ci)
150 {
151 	struct sysctl_oid *oid;
152 	struct sysctl_oid_list *children;
153 	struct ppod_region *pr;
154 	uint32_t r;
155 	int rc;
156 
157 	MPASS(sc->vres.iscsi.size > 0);
158 	MPASS(ci != NULL);
159 
160 	pr = &ci->pr;
161 	r = t4_read_reg(sc, A_ULP_RX_ISCSI_PSZ);
162 	rc = t4_init_ppod_region(pr, &sc->vres.iscsi, r, "iSCSI page pods");
163 	if (rc != 0) {
164 		device_printf(sc->dev,
165 		    "%s: failed to initialize the iSCSI page pod region: %u.\n",
166 		    __func__, rc);
167 		return (rc);
168 	}
169 
170 	read_pdu_limits(sc, &ci->max_tx_data_len, &ci->max_rx_data_len, pr);
171 
172 	sysctl_ctx_init(&ci->ctx);
173 	oid = device_get_sysctl_tree(sc->dev);	/* dev.t5nex.X */
174 	children = SYSCTL_CHILDREN(oid);
175 
176 	oid = SYSCTL_ADD_NODE(&ci->ctx, children, OID_AUTO, "iscsi",
177 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "iSCSI ULP settings");
178 	children = SYSCTL_CHILDREN(oid);
179 
180 	ci->ddp_threshold = 2048;
181 	SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "ddp_threshold",
182 	    CTLFLAG_RW, &ci->ddp_threshold, 0, "Rx zero copy threshold");
183 
184 	SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "max_rx_data_len",
185 	    CTLFLAG_RW, &ci->max_rx_data_len, 0,
186 	    "Maximum receive data segment length");
187 	SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "max_tx_data_len",
188 	    CTLFLAG_RW, &ci->max_tx_data_len, 0,
189 	    "Maximum transmit data segment length");
190 
191 	return (0);
192 }
193 
194 static int
195 do_rx_iscsi_hdr(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
196 {
197 	struct adapter *sc = iq->adapter;
198 	struct cpl_iscsi_hdr *cpl = mtod(m, struct cpl_iscsi_hdr *);
199 	u_int tid = GET_TID(cpl);
200 	struct toepcb *toep = lookup_tid(sc, tid);
201 	struct icl_pdu *ip;
202 	struct icl_cxgbei_pdu *icp;
203 	uint16_t len_ddp = be16toh(cpl->pdu_len_ddp);
204 	uint16_t len = be16toh(cpl->len);
205 
206 	M_ASSERTPKTHDR(m);
207 	MPASS(m->m_pkthdr.len == len + sizeof(*cpl));
208 
209 	ip = icl_cxgbei_new_pdu(M_NOWAIT);
210 	if (ip == NULL)
211 		CXGBE_UNIMPLEMENTED("PDU allocation failure");
212 	m_copydata(m, sizeof(*cpl), ISCSI_BHS_SIZE, (caddr_t)ip->ip_bhs);
213 	ip->ip_data_len = G_ISCSI_PDU_LEN(len_ddp) - len;
214 	icp = ip_to_icp(ip);
215 	icp->icp_seq = ntohl(cpl->seq);
216 	icp->icp_flags = ICPF_RX_HDR;
217 
218 	/* This is the start of a new PDU.  There should be no old state. */
219 	MPASS(toep->ulpcb2 == NULL);
220 	toep->ulpcb2 = icp;
221 
222 #if 0
223 	CTR5(KTR_CXGBE, "%s: tid %u, cpl->len %u, pdu_len_ddp 0x%04x, icp %p",
224 	    __func__, tid, len, len_ddp, icp);
225 #endif
226 
227 	m_freem(m);
228 	return (0);
229 }
230 
231 static int
232 do_rx_iscsi_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
233 {
234 	struct adapter *sc = iq->adapter;
235 	struct cpl_iscsi_data *cpl =  mtod(m, struct cpl_iscsi_data *);
236 	u_int tid = GET_TID(cpl);
237 	struct toepcb *toep = lookup_tid(sc, tid);
238 	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
239 	struct icl_pdu *ip;
240 
241 	M_ASSERTPKTHDR(m);
242 	MPASS(m->m_pkthdr.len == be16toh(cpl->len) + sizeof(*cpl));
243 
244 	if (icp == NULL) {
245 		/*
246 		 * T6 completion enabled, start of a new pdu. Header
247 		 * will come in completion CPL.
248 		 */
249 	        ip = icl_cxgbei_new_pdu(M_NOWAIT);
250 	        if (ip == NULL)
251 			CXGBE_UNIMPLEMENTED("PDU allocation failure");
252 		icp = ip_to_icp(ip);
253 	} else {
254 		/* T5 mode, header is already received. */
255 		MPASS(icp->icp_flags == ICPF_RX_HDR);
256 		MPASS(icp->ip.ip_data_mbuf == NULL);
257 		MPASS(icp->ip.ip_data_len == m->m_pkthdr.len - sizeof(*cpl));
258 	}
259 
260 	/* Trim the cpl header from mbuf. */
261 	m_adj(m, sizeof(*cpl));
262 
263 	icp->icp_flags |= ICPF_RX_FLBUF;
264 	icp->ip.ip_data_mbuf = m;
265 	toep->ofld_rxq->rx_iscsi_fl_pdus++;
266 	toep->ofld_rxq->rx_iscsi_fl_octets += m->m_pkthdr.len;
267 
268 	/*
269 	 * For T6, save the icp for further processing in the
270 	 * completion handler.
271 	 */
272 	if (icp->icp_flags == ICPF_RX_FLBUF) {
273 		MPASS(toep->ulpcb2 == NULL);
274 		toep->ulpcb2 = icp;
275 	}
276 
277 #if 0
278 	CTR4(KTR_CXGBE, "%s: tid %u, cpl->len %u, icp %p", __func__, tid,
279 	    be16toh(cpl->len), icp);
280 #endif
281 
282 	return (0);
283 }
284 
285 static int
286 mbuf_crc32c_helper(void *arg, void *data, u_int len)
287 {
288 	uint32_t *digestp = arg;
289 
290 	*digestp = calculate_crc32c(*digestp, data, len);
291 	return (0);
292 }
293 
294 static struct icl_pdu *
295 parse_pdu(struct socket *so, struct toepcb *toep, struct icl_cxgbei_conn *icc,
296     struct sockbuf *sb, u_int total_len)
297 {
298 	struct uio uio;
299 	struct iovec iov[2];
300 	struct iscsi_bhs bhs;
301 	struct mbuf *m;
302 	struct icl_pdu *ip;
303 	u_int ahs_len, data_len, header_len, pdu_len;
304 	uint32_t calc_digest, wire_digest;
305 	int error;
306 
307 	uio.uio_segflg = UIO_SYSSPACE;
308 	uio.uio_rw = UIO_READ;
309 	uio.uio_td = curthread;
310 
311 	header_len = sizeof(struct iscsi_bhs);
312 	if (icc->ic.ic_header_crc32c)
313 		header_len += ISCSI_HEADER_DIGEST_SIZE;
314 
315 	if (total_len < header_len) {
316 		ICL_WARN("truncated pre-offload PDU with len %u", total_len);
317 		return (NULL);
318 	}
319 
320 	iov[0].iov_base = &bhs;
321 	iov[0].iov_len = sizeof(bhs);
322 	iov[1].iov_base = &wire_digest;
323 	iov[1].iov_len = sizeof(wire_digest);
324 	uio.uio_iov = iov;
325 	uio.uio_iovcnt = 1;
326 	uio.uio_offset = 0;
327 	uio.uio_resid = header_len;
328 	error = soreceive(so, NULL, &uio, NULL, NULL, NULL);
329 	if (error != 0) {
330 		ICL_WARN("failed to read BHS from pre-offload PDU: %d", error);
331 		return (NULL);
332 	}
333 
334 	ahs_len = bhs.bhs_total_ahs_len * 4;
335 	data_len = bhs.bhs_data_segment_len[0] << 16 |
336 	    bhs.bhs_data_segment_len[1] << 8 |
337 	    bhs.bhs_data_segment_len[2];
338 	pdu_len = header_len + ahs_len + roundup2(data_len, 4);
339 	if (icc->ic.ic_data_crc32c && data_len != 0)
340 		pdu_len += ISCSI_DATA_DIGEST_SIZE;
341 
342 	if (total_len < pdu_len) {
343 		ICL_WARN("truncated pre-offload PDU len %u vs %u", total_len,
344 		    pdu_len);
345 		return (NULL);
346 	}
347 
348 	if (ahs_len != 0) {
349 		ICL_WARN("received pre-offload PDU with AHS");
350 		return (NULL);
351 	}
352 
353 	if (icc->ic.ic_header_crc32c) {
354 		calc_digest = calculate_crc32c(0xffffffff, (caddr_t)&bhs,
355 		    sizeof(bhs));
356 		calc_digest ^= 0xffffffff;
357 		if (calc_digest != wire_digest) {
358 			ICL_WARN("received pre-offload PDU 0x%02x with "
359 			    "invalid header digest (0x%x vs 0x%x)",
360 			    bhs.bhs_opcode, wire_digest, calc_digest);
361 			toep->ofld_rxq->rx_iscsi_header_digest_errors++;
362 			return (NULL);
363 		}
364 	}
365 
366 	m = NULL;
367 	if (data_len != 0) {
368 		uio.uio_iov = NULL;
369 		uio.uio_resid = roundup2(data_len, 4);
370 		if (icc->ic.ic_data_crc32c)
371 			uio.uio_resid += ISCSI_DATA_DIGEST_SIZE;
372 
373 		error = soreceive(so, NULL, &uio, &m, NULL, NULL);
374 		if (error != 0) {
375 			ICL_WARN("failed to read data payload from "
376 			    "pre-offload PDU: %d", error);
377 			return (NULL);
378 		}
379 
380 		if (icc->ic.ic_data_crc32c) {
381 			m_copydata(m, roundup2(data_len, 4),
382 			    sizeof(wire_digest), (caddr_t)&wire_digest);
383 
384 			calc_digest = 0xffffffff;
385 			m_apply(m, 0, roundup2(data_len, 4), mbuf_crc32c_helper,
386 			    &calc_digest);
387 			calc_digest ^= 0xffffffff;
388 			if (calc_digest != wire_digest) {
389 				ICL_WARN("received pre-offload PDU 0x%02x "
390 				    "with invalid data digest (0x%x vs 0x%x)",
391 				    bhs.bhs_opcode, wire_digest, calc_digest);
392 				toep->ofld_rxq->rx_iscsi_data_digest_errors++;
393 				m_freem(m);
394 				return (NULL);
395 			}
396 		}
397 	}
398 
399 	ip = icl_cxgbei_new_pdu(M_WAITOK);
400 	icl_cxgbei_new_pdu_set_conn(ip, &icc->ic);
401 	*ip->ip_bhs = bhs;
402 	ip->ip_data_len = data_len;
403 	ip->ip_data_mbuf = m;
404 	return (ip);
405 }
406 
407 void
408 parse_pdus(struct icl_cxgbei_conn *icc, struct sockbuf *sb)
409 {
410 	struct icl_conn *ic = &icc->ic;
411 	struct socket *so = ic->ic_socket;
412 	struct toepcb *toep = icc->toep;
413 	struct icl_pdu *ip, *lastip;
414 	u_int total_len;
415 
416 	SOCKBUF_LOCK_ASSERT(sb);
417 
418 	CTR3(KTR_CXGBE, "%s: tid %u, %u bytes in so_rcv", __func__, toep->tid,
419 	    sbused(sb));
420 
421 	lastip = NULL;
422 	while (sbused(sb) != 0 && (sb->sb_state & SBS_CANTRCVMORE) == 0) {
423 		total_len = sbused(sb);
424 		SOCKBUF_UNLOCK(sb);
425 
426 		ip = parse_pdu(so, toep, icc, sb, total_len);
427 
428 		if (ip == NULL) {
429 			ic->ic_error(ic);
430 			SOCKBUF_LOCK(sb);
431 			return;
432 		}
433 
434 		if (lastip == NULL)
435 			STAILQ_INSERT_HEAD(&icc->rcvd_pdus, ip, ip_next);
436 		else
437 			STAILQ_INSERT_AFTER(&icc->rcvd_pdus, lastip, ip,
438 			    ip_next);
439 		lastip = ip;
440 
441 		SOCKBUF_LOCK(sb);
442 	}
443 }
444 
445 static int
446 do_rx_iscsi_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
447 {
448 	struct adapter *sc = iq->adapter;
449 	const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1);
450 	u_int tid = GET_TID(cpl);
451 	struct toepcb *toep = lookup_tid(sc, tid);
452 	struct inpcb *inp = toep->inp;
453 	struct socket *so;
454 	struct sockbuf *sb;
455 	struct tcpcb *tp;
456 	struct icl_cxgbei_conn *icc;
457 	struct icl_conn *ic;
458 	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
459 	struct icl_pdu *ip;
460 	u_int pdu_len, val;
461 	struct epoch_tracker et;
462 
463 	MPASS(m == NULL);
464 
465 	/* Must already be assembling a PDU. */
466 	MPASS(icp != NULL);
467 	MPASS(icp->icp_flags & ICPF_RX_HDR);	/* Data is optional. */
468 	MPASS((icp->icp_flags & ICPF_RX_STATUS) == 0);
469 
470 	pdu_len = be16toh(cpl->len);	/* includes everything. */
471 	val = be32toh(cpl->ddpvld);
472 
473 #if 0
474 	CTR5(KTR_CXGBE,
475 	    "%s: tid %u, cpl->len %u, ddpvld 0x%08x, icp_flags 0x%08x",
476 	    __func__, tid, pdu_len, val, icp->icp_flags);
477 #endif
478 
479 	icp->icp_flags |= ICPF_RX_STATUS;
480 	ip = &icp->ip;
481 	if (val & F_DDP_PADDING_ERR) {
482 		ICL_WARN("received PDU 0x%02x with invalid padding",
483 		    ip->ip_bhs->bhs_opcode);
484 		toep->ofld_rxq->rx_iscsi_padding_errors++;
485 	}
486 	if (val & F_DDP_HDRCRC_ERR) {
487 		ICL_WARN("received PDU 0x%02x with invalid header digest",
488 		    ip->ip_bhs->bhs_opcode);
489 		toep->ofld_rxq->rx_iscsi_header_digest_errors++;
490 	}
491 	if (val & F_DDP_DATACRC_ERR) {
492 		ICL_WARN("received PDU 0x%02x with invalid data digest",
493 		    ip->ip_bhs->bhs_opcode);
494 		toep->ofld_rxq->rx_iscsi_data_digest_errors++;
495 	}
496 	if (val & F_DDP_PDU && ip->ip_data_mbuf == NULL) {
497 		MPASS((icp->icp_flags & ICPF_RX_FLBUF) == 0);
498 		MPASS(ip->ip_data_len > 0);
499 		icp->icp_flags |= ICPF_RX_DDP;
500 		toep->ofld_rxq->rx_iscsi_ddp_pdus++;
501 		toep->ofld_rxq->rx_iscsi_ddp_octets += ip->ip_data_len;
502 	}
503 
504 	INP_WLOCK(inp);
505 	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
506 		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
507 		    __func__, tid, pdu_len, inp->inp_flags);
508 		INP_WUNLOCK(inp);
509 		icl_cxgbei_conn_pdu_free(NULL, ip);
510 		toep->ulpcb2 = NULL;
511 		return (0);
512 	}
513 
514 	/*
515 	 * T6+ does not report data PDUs received via DDP without F
516 	 * set.  This can result in gaps in the TCP sequence space.
517 	 */
518 	tp = intotcpcb(inp);
519 	MPASS(chip_id(sc) >= CHELSIO_T6 || icp->icp_seq == tp->rcv_nxt);
520 	tp->rcv_nxt = icp->icp_seq + pdu_len;
521 	tp->t_rcvtime = ticks;
522 
523 	/*
524 	 * Don't update the window size or return credits since RX
525 	 * flow control is disabled.
526 	 */
527 
528 	so = inp->inp_socket;
529 	sb = &so->so_rcv;
530 	SOCKBUF_LOCK(sb);
531 
532 	icc = toep->ulpcb;
533 	if (__predict_false(icc == NULL || sb->sb_state & SBS_CANTRCVMORE)) {
534 		CTR5(KTR_CXGBE,
535 		    "%s: tid %u, excess rx (%d bytes), icc %p, sb_state 0x%x",
536 		    __func__, tid, pdu_len, icc, sb->sb_state);
537 		SOCKBUF_UNLOCK(sb);
538 		INP_WUNLOCK(inp);
539 
540 		CURVNET_SET(so->so_vnet);
541 		NET_EPOCH_ENTER(et);
542 		INP_WLOCK(inp);
543 		tp = tcp_drop(tp, ECONNRESET);
544 		if (tp)
545 			INP_WUNLOCK(inp);
546 		NET_EPOCH_EXIT(et);
547 		CURVNET_RESTORE();
548 
549 		icl_cxgbei_conn_pdu_free(NULL, ip);
550 		toep->ulpcb2 = NULL;
551 		return (0);
552 	}
553 	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
554 	ic = &icc->ic;
555 	if ((val & (F_DDP_PADDING_ERR | F_DDP_HDRCRC_ERR |
556 	    F_DDP_DATACRC_ERR)) != 0) {
557 		SOCKBUF_UNLOCK(sb);
558 		INP_WUNLOCK(inp);
559 
560 		icl_cxgbei_conn_pdu_free(NULL, ip);
561 		toep->ulpcb2 = NULL;
562 		ic->ic_error(ic);
563 		return (0);
564 	}
565 
566 	icl_cxgbei_new_pdu_set_conn(ip, ic);
567 
568 	STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next);
569 	if (!icc->rx_active) {
570 		icc->rx_active = true;
571 		wakeup(&icc->rx_active);
572 	}
573 	SOCKBUF_UNLOCK(sb);
574 	INP_WUNLOCK(inp);
575 
576 	toep->ulpcb2 = NULL;
577 
578 	return (0);
579 }
580 
581 static int
582 do_rx_iscsi_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
583 {
584 	struct epoch_tracker et;
585 	struct adapter *sc = iq->adapter;
586 	struct cpl_rx_iscsi_cmp *cpl = mtod(m, struct cpl_rx_iscsi_cmp *);
587 	u_int tid = GET_TID(cpl);
588 	struct toepcb *toep = lookup_tid(sc, tid);
589 	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
590 	struct icl_pdu *ip;
591 	struct cxgbei_cmp *cmp;
592 	struct inpcb *inp = toep->inp;
593 #ifdef INVARIANTS
594 	uint16_t len = be16toh(cpl->len);
595 	u_int data_digest_len;
596 #endif
597 	struct socket *so;
598 	struct sockbuf *sb;
599 	struct tcpcb *tp;
600 	struct icl_cxgbei_conn *icc;
601 	struct icl_conn *ic;
602 	struct iscsi_bhs_data_out *bhsdo;
603 	u_int val = be32toh(cpl->ddpvld);
604 	u_int npdus, pdu_len;
605 	uint32_t prev_seg_len;
606 
607 	M_ASSERTPKTHDR(m);
608 	MPASS(m->m_pkthdr.len == len + sizeof(*cpl));
609 
610 	if ((val & F_DDP_PDU) == 0) {
611 		MPASS(icp != NULL);
612 		MPASS((icp->icp_flags & ICPF_RX_STATUS) == 0);
613 		ip = &icp->ip;
614 	}
615 
616 	if (icp == NULL) {
617 		/* T6 completion enabled, start of a new PDU. */
618 		ip = icl_cxgbei_new_pdu(M_NOWAIT);
619 		if (ip == NULL)
620 			CXGBE_UNIMPLEMENTED("PDU allocation failure");
621 		icp = ip_to_icp(ip);
622 	}
623 	pdu_len = G_ISCSI_PDU_LEN(be16toh(cpl->pdu_len_ddp));
624 
625 #if 0
626 	CTR5(KTR_CXGBE,
627 	    "%s: tid %u, cpl->len %u, ddpvld 0x%08x, icp %p",
628 	    __func__, tid, pdu_len, val, icp);
629 #endif
630 
631 	/* Copy header */
632 	m_copydata(m, sizeof(*cpl), ISCSI_BHS_SIZE, (caddr_t)ip->ip_bhs);
633 	bhsdo = (struct iscsi_bhs_data_out *)ip->ip_bhs;
634 	ip->ip_data_len = bhsdo->bhsdo_data_segment_len[0] << 16 |
635 	    bhsdo->bhsdo_data_segment_len[1] << 8 |
636 	    bhsdo->bhsdo_data_segment_len[2];
637 	icp->icp_seq = ntohl(cpl->seq);
638 	icp->icp_flags |= ICPF_RX_HDR;
639 	icp->icp_flags |= ICPF_RX_STATUS;
640 
641 	if (val & F_DDP_PADDING_ERR) {
642 		ICL_WARN("received PDU 0x%02x with invalid padding",
643 		    ip->ip_bhs->bhs_opcode);
644 		toep->ofld_rxq->rx_iscsi_padding_errors++;
645 	}
646 	if (val & F_DDP_HDRCRC_ERR) {
647 		ICL_WARN("received PDU 0x%02x with invalid header digest",
648 		    ip->ip_bhs->bhs_opcode);
649 		toep->ofld_rxq->rx_iscsi_header_digest_errors++;
650 	}
651 	if (val & F_DDP_DATACRC_ERR) {
652 		ICL_WARN("received PDU 0x%02x with invalid data digest",
653 		    ip->ip_bhs->bhs_opcode);
654 		toep->ofld_rxq->rx_iscsi_data_digest_errors++;
655 	}
656 
657 	INP_WLOCK(inp);
658 	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
659 		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
660 		    __func__, tid, pdu_len, inp->inp_flags);
661 		INP_WUNLOCK(inp);
662 		icl_cxgbei_conn_pdu_free(NULL, ip);
663 		toep->ulpcb2 = NULL;
664 		m_freem(m);
665 		return (0);
666 	}
667 
668 	tp = intotcpcb(inp);
669 
670 	/*
671 	 * If icc is NULL, the connection is being closed in
672 	 * icl_cxgbei_conn_close(), just drop this data.
673 	 */
674 	icc = toep->ulpcb;
675 	if (__predict_false(icc == NULL)) {
676 		CTR4(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes), icc %p",
677 		    __func__, tid, pdu_len, icc);
678 
679 		/*
680 		 * Update rcv_nxt so the sequence number of the FIN
681 		 * doesn't appear wrong.
682 		 */
683 		tp->rcv_nxt = icp->icp_seq + pdu_len;
684 		tp->t_rcvtime = ticks;
685 		INP_WUNLOCK(inp);
686 
687 		icl_cxgbei_conn_pdu_free(NULL, ip);
688 		toep->ulpcb2 = NULL;
689 		m_freem(m);
690 		return (0);
691 	}
692 
693 	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
694 	ic = &icc->ic;
695 	if ((val & (F_DDP_PADDING_ERR | F_DDP_HDRCRC_ERR |
696 	    F_DDP_DATACRC_ERR)) != 0) {
697 		INP_WUNLOCK(inp);
698 
699 		icl_cxgbei_conn_pdu_free(NULL, ip);
700 		toep->ulpcb2 = NULL;
701 		m_freem(m);
702 		ic->ic_error(ic);
703 		return (0);
704 	}
705 
706 #ifdef INVARIANTS
707 	data_digest_len = (icc->ulp_submode & ULP_CRC_DATA) ?
708 	    ISCSI_DATA_DIGEST_SIZE : 0;
709 	MPASS(roundup2(ip->ip_data_len, 4) == pdu_len - len - data_digest_len);
710 #endif
711 
712 	if (val & F_DDP_PDU && ip->ip_data_mbuf == NULL) {
713 		MPASS((icp->icp_flags & ICPF_RX_FLBUF) == 0);
714 		MPASS(ip->ip_data_len > 0);
715 		icp->icp_flags |= ICPF_RX_DDP;
716 		bhsdo = (struct iscsi_bhs_data_out *)ip->ip_bhs;
717 
718 		switch (ip->ip_bhs->bhs_opcode & ~ISCSI_BHS_OPCODE_IMMEDIATE) {
719 		case ISCSI_BHS_OPCODE_SCSI_DATA_IN:
720 			cmp = cxgbei_find_cmp(icc,
721 			    be32toh(bhsdo->bhsdo_initiator_task_tag));
722 			break;
723 		case ISCSI_BHS_OPCODE_SCSI_DATA_OUT:
724 			cmp = cxgbei_find_cmp(icc,
725 			    be32toh(bhsdo->bhsdo_target_transfer_tag));
726 			break;
727 		default:
728 			__assert_unreachable();
729 		}
730 		MPASS(cmp != NULL);
731 
732 		/*
733 		 * The difference between the end of the last burst
734 		 * and the offset of the last PDU in this burst is
735 		 * the additional data received via DDP.
736 		 */
737 		prev_seg_len = be32toh(bhsdo->bhsdo_buffer_offset) -
738 		    cmp->next_buffer_offset;
739 
740 		if (prev_seg_len != 0) {
741 			uint32_t orig_datasn;
742 
743 			/*
744 			 * Return a "large" PDU representing the burst
745 			 * of PDUs.  Adjust the offset and length of
746 			 * this PDU to represent the entire burst.
747 			 */
748 			ip->ip_data_len += prev_seg_len;
749 			bhsdo->bhsdo_data_segment_len[2] = ip->ip_data_len;
750 			bhsdo->bhsdo_data_segment_len[1] = ip->ip_data_len >> 8;
751 			bhsdo->bhsdo_data_segment_len[0] = ip->ip_data_len >> 16;
752 			bhsdo->bhsdo_buffer_offset =
753 			    htobe32(cmp->next_buffer_offset);
754 
755 			orig_datasn = htobe32(bhsdo->bhsdo_datasn);
756 			npdus = orig_datasn - cmp->last_datasn;
757 			bhsdo->bhsdo_datasn = htobe32(cmp->last_datasn + 1);
758 			cmp->last_datasn = orig_datasn;
759 			ip->ip_additional_pdus = npdus - 1;
760 		} else {
761 			MPASS(htobe32(bhsdo->bhsdo_datasn) ==
762 			    cmp->last_datasn + 1);
763 			npdus = 1;
764 			cmp->last_datasn = htobe32(bhsdo->bhsdo_datasn);
765 		}
766 
767 		cmp->next_buffer_offset += ip->ip_data_len;
768 		toep->ofld_rxq->rx_iscsi_ddp_pdus += npdus;
769 		toep->ofld_rxq->rx_iscsi_ddp_octets += ip->ip_data_len;
770 	} else {
771 		MPASS(icp->icp_flags & (ICPF_RX_FLBUF));
772 		MPASS(ip->ip_data_len == ip->ip_data_mbuf->m_pkthdr.len);
773 	}
774 
775 	tp->rcv_nxt = icp->icp_seq + pdu_len;
776 	tp->t_rcvtime = ticks;
777 
778 	/*
779 	 * Don't update the window size or return credits since RX
780 	 * flow control is disabled.
781 	 */
782 
783 	so = inp->inp_socket;
784 	sb = &so->so_rcv;
785 	SOCKBUF_LOCK(sb);
786 	if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
787 		CTR5(KTR_CXGBE,
788 		    "%s: tid %u, excess rx (%d bytes), icc %p, sb_state 0x%x",
789 		    __func__, tid, pdu_len, icc, sb->sb_state);
790 		SOCKBUF_UNLOCK(sb);
791 		INP_WUNLOCK(inp);
792 
793 		CURVNET_SET(so->so_vnet);
794 		NET_EPOCH_ENTER(et);
795 		INP_WLOCK(inp);
796 		tp = tcp_drop(tp, ECONNRESET);
797 		if (tp != NULL)
798 			INP_WUNLOCK(inp);
799 		NET_EPOCH_EXIT(et);
800 		CURVNET_RESTORE();
801 
802 		icl_cxgbei_conn_pdu_free(NULL, ip);
803 		toep->ulpcb2 = NULL;
804 		m_freem(m);
805 		return (0);
806 	}
807 
808 	icl_cxgbei_new_pdu_set_conn(ip, ic);
809 
810 	/* Enqueue the PDU to the received pdus queue. */
811 	STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next);
812 	if (!icc->rx_active) {
813 		icc->rx_active = true;
814 		wakeup(&icc->rx_active);
815 	}
816 	SOCKBUF_UNLOCK(sb);
817 	INP_WUNLOCK(inp);
818 
819 	toep->ulpcb2 = NULL;
820 	m_freem(m);
821 
822 	return (0);
823 }
824 
825 static int
826 cxgbei_activate(struct adapter *sc)
827 {
828 	struct cxgbei_data *ci;
829 	int rc;
830 
831 	ASSERT_SYNCHRONIZED_OP(sc);
832 
833 	if (uld_active(sc, ULD_ISCSI)) {
834 		KASSERT(0, ("%s: iSCSI offload already enabled on adapter %p",
835 		    __func__, sc));
836 		return (0);
837 	}
838 
839 	if (sc->iscsicaps == 0 || sc->vres.iscsi.size == 0) {
840 		device_printf(sc->dev,
841 		    "not iSCSI offload capable, or capability disabled.\n");
842 		return (ENOSYS);
843 	}
844 
845 	/* per-adapter softc for iSCSI */
846 	ci = malloc(sizeof(*ci), M_CXGBE, M_ZERO | M_WAITOK);
847 	if (ci == NULL)
848 		return (ENOMEM);
849 
850 	rc = cxgbei_init(sc, ci);
851 	if (rc != 0) {
852 		free(ci, M_CXGBE);
853 		return (rc);
854 	}
855 
856 	sc->iscsi_ulp_softc = ci;
857 
858 	return (0);
859 }
860 
861 static int
862 cxgbei_deactivate(struct adapter *sc)
863 {
864 	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
865 
866 	ASSERT_SYNCHRONIZED_OP(sc);
867 
868 	if (ci != NULL) {
869 		sysctl_ctx_free(&ci->ctx);
870 		t4_free_ppod_region(&ci->pr);
871 		free(ci, M_CXGBE);
872 		sc->iscsi_ulp_softc = NULL;
873 	}
874 
875 	return (0);
876 }
877 
878 static void
879 cxgbei_activate_all(struct adapter *sc, void *arg __unused)
880 {
881 
882 	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isact") != 0)
883 		return;
884 
885 	/* Activate iSCSI if any port on this adapter has IFCAP_TOE enabled. */
886 	if (sc->offload_map && !uld_active(sc, ULD_ISCSI))
887 		(void) t4_activate_uld(sc, ULD_ISCSI);
888 
889 	end_synchronized_op(sc, 0);
890 }
891 
892 static void
893 cxgbei_deactivate_all(struct adapter *sc, void *arg __unused)
894 {
895 
896 	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isdea") != 0)
897 		return;
898 
899 	if (uld_active(sc, ULD_ISCSI))
900 	    (void) t4_deactivate_uld(sc, ULD_ISCSI);
901 
902 	end_synchronized_op(sc, 0);
903 }
904 
905 static struct uld_info cxgbei_uld_info = {
906 	.uld_id = ULD_ISCSI,
907 	.activate = cxgbei_activate,
908 	.deactivate = cxgbei_deactivate,
909 };
910 
911 static int
912 cxgbei_mod_load(void)
913 {
914 	int rc;
915 
916 	t4_register_cpl_handler(CPL_ISCSI_HDR, do_rx_iscsi_hdr);
917 	t4_register_cpl_handler(CPL_ISCSI_DATA, do_rx_iscsi_data);
918 	t4_register_cpl_handler(CPL_RX_ISCSI_DDP, do_rx_iscsi_ddp);
919 	t4_register_cpl_handler(CPL_RX_ISCSI_CMP, do_rx_iscsi_cmp);
920 
921 	rc = t4_register_uld(&cxgbei_uld_info);
922 	if (rc != 0)
923 		return (rc);
924 
925 	t4_iterate(cxgbei_activate_all, NULL);
926 
927 	return (rc);
928 }
929 
930 static int
931 cxgbei_mod_unload(void)
932 {
933 
934 	t4_iterate(cxgbei_deactivate_all, NULL);
935 
936 	if (t4_unregister_uld(&cxgbei_uld_info) == EBUSY)
937 		return (EBUSY);
938 
939 	t4_register_cpl_handler(CPL_ISCSI_HDR, NULL);
940 	t4_register_cpl_handler(CPL_ISCSI_DATA, NULL);
941 	t4_register_cpl_handler(CPL_RX_ISCSI_DDP, NULL);
942 	t4_register_cpl_handler(CPL_RX_ISCSI_CMP, NULL);
943 
944 	return (0);
945 }
946 #endif
947 
948 static int
949 cxgbei_modevent(module_t mod, int cmd, void *arg)
950 {
951 	int rc = 0;
952 
953 #ifdef TCP_OFFLOAD
954 	switch (cmd) {
955 	case MOD_LOAD:
956 		rc = cxgbei_mod_load();
957 		if (rc == 0)
958 			rc = icl_cxgbei_mod_load();
959 		break;
960 
961 	case MOD_UNLOAD:
962 		rc = icl_cxgbei_mod_unload();
963 		if (rc == 0)
964 			rc = cxgbei_mod_unload();
965 		break;
966 
967 	default:
968 		rc = EINVAL;
969 	}
970 #else
971 	printf("cxgbei: compiled without TCP_OFFLOAD support.\n");
972 	rc = EOPNOTSUPP;
973 #endif
974 
975 	return (rc);
976 }
977 
978 static moduledata_t cxgbei_mod = {
979 	"cxgbei",
980 	cxgbei_modevent,
981 	NULL,
982 };
983 
984 MODULE_VERSION(cxgbei, 1);
985 DECLARE_MODULE(cxgbei, cxgbei_mod, SI_SUB_EXEC, SI_ORDER_ANY);
986 MODULE_DEPEND(cxgbei, t4_tom, 1, 1, 1);
987 MODULE_DEPEND(cxgbei, cxgbe, 1, 1, 1);
988 MODULE_DEPEND(cxgbei, icl, 1, 1, 1);
989