xref: /freebsd/sys/dev/cxgbe/cxgbei/cxgbei.c (revision 2c8d04d0228871c24017509cf039e7c5d97d97be)
1 /*-
2  * Copyright (c) 2012 Chelsio Communications, Inc.
3  * All rights reserved.
4  *
5  * Chelsio T5xx iSCSI driver
6  *
7  * Written by: Sreenivasa Honnur <shonnur@chelsio.com>
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include "opt_inet.h"
35 #include "opt_inet6.h"
36 
37 #include <sys/types.h>
38 #include <sys/param.h>
39 #include <sys/kernel.h>
40 #include <sys/module.h>
41 #include <sys/systm.h>
42 
43 #ifdef TCP_OFFLOAD
44 #include <sys/errno.h>
45 #include <sys/kthread.h>
46 #include <sys/smp.h>
47 #include <sys/socket.h>
48 #include <sys/socketvar.h>
49 #include <sys/mbuf.h>
50 #include <sys/lock.h>
51 #include <sys/mutex.h>
52 #include <sys/condvar.h>
53 
54 #include <netinet/in.h>
55 #include <netinet/in_pcb.h>
56 #include <netinet/toecore.h>
57 #include <netinet/tcp_var.h>
58 #include <netinet/tcp_fsm.h>
59 
60 #include <cam/scsi/scsi_all.h>
61 #include <cam/scsi/scsi_da.h>
62 #include <cam/ctl/ctl_io.h>
63 #include <cam/ctl/ctl.h>
64 #include <cam/ctl/ctl_backend.h>
65 #include <cam/ctl/ctl_error.h>
66 #include <cam/ctl/ctl_frontend.h>
67 #include <cam/ctl/ctl_debug.h>
68 #include <cam/ctl/ctl_ha.h>
69 #include <cam/ctl/ctl_ioctl.h>
70 
71 #include <dev/iscsi/icl.h>
72 #include <dev/iscsi/iscsi_proto.h>
73 #include <dev/iscsi/iscsi_ioctl.h>
74 #include <dev/iscsi/iscsi.h>
75 #include <cam/ctl/ctl_frontend_iscsi.h>
76 
77 #include <cam/cam.h>
78 #include <cam/cam_ccb.h>
79 #include <cam/cam_xpt.h>
80 #include <cam/cam_debug.h>
81 #include <cam/cam_sim.h>
82 #include <cam/cam_xpt_sim.h>
83 #include <cam/cam_xpt_periph.h>
84 #include <cam/cam_periph.h>
85 #include <cam/cam_compat.h>
86 #include <cam/scsi/scsi_message.h>
87 
88 #include "common/common.h"
89 #include "common/t4_msg.h"
90 #include "common/t4_regs.h"     /* for PCIE_MEM_ACCESS */
91 #include "tom/t4_tom.h"
92 #include "cxgbei.h"
93 #include "cxgbei_ulp2_ddp.h"
94 
/* Number of rx worker threads started (0 until cxgbei_mod_load). */
static int worker_thread_count;
/* Array of per-thread state, one entry per worker thread. */
static struct cxgbei_worker_thread_softc *cwt_softc;
/* Kernel process that hosts all the cxgbei worker threads. */
static struct proc *cxgbei_proc;

/* XXXNP some header instead. */
struct icl_pdu *icl_cxgbei_new_pdu(int);
void icl_cxgbei_new_pdu_set_conn(struct icl_pdu *, struct icl_conn *);
void icl_cxgbei_conn_pdu_free(struct icl_conn *, struct icl_pdu *);
103 
104 /*
105  * Direct Data Placement -
106  * Directly place the iSCSI Data-In or Data-Out PDU's payload into pre-posted
107  * final destination host-memory buffers based on the Initiator Task Tag (ITT)
108  * in Data-In or Target Task Tag (TTT) in Data-Out PDUs.
109  * The host memory address is programmed into h/w in the format of pagepod
110  * entries.
111  * The location of the pagepod entry is encoded into ddp tag which is used as
112  * the base for ITT/TTT.
113  */
114 
115 /*
116  * functions to program the pagepod in h/w
117  */
118 static void inline
119 ppod_set(struct pagepod *ppod,
120 	struct cxgbei_ulp2_pagepod_hdr *hdr,
121 	struct cxgbei_ulp2_gather_list *gl,
122 	unsigned int pidx)
123 {
124 	int i;
125 
126 	memcpy(ppod, hdr, sizeof(*hdr));
127 
128 	for (i = 0; i < (PPOD_PAGES + 1); i++, pidx++) {
129 		ppod->addr[i] = pidx < gl->nelem ?
130 			cpu_to_be64(gl->dma_sg[pidx].phys_addr) : 0ULL;
131 	}
132 }
133 
134 static void inline
135 ppod_clear(struct pagepod *ppod)
136 {
137 	memset(ppod, 0, sizeof(*ppod));
138 }
139 
/*
 * Initialize the header of a ULP_TX memory-write work request that carries
 * its payload as immediate data.  "req" is followed in memory by a
 * ulptx_idata sub-command and then the payload itself; wr_len is the total
 * (16-byte rounded) work request length and dlen the payload length.
 * pm_addr is the destination address in adapter memory; both it and dlen
 * are expressed to the hardware in 32-byte units (hence the >> 5 shifts).
 */
static inline void
ulp_mem_io_set_hdr(struct adapter *sc, int tid, struct ulp_mem_io *req,
		unsigned int wr_len, unsigned int dlen,
		unsigned int pm_addr)
{
	struct ulptx_idata *idata = (struct ulptx_idata *)(req + 1);

	INIT_ULPTX_WR(req, wr_len, 0, 0);
	/* T4 needs ordered writes; T5 flags the payload as immediate. */
	req->cmd = cpu_to_be32(V_ULPTX_CMD(ULP_TX_MEM_WRITE) |
				V_ULP_MEMIO_ORDER(is_t4(sc)) |
				V_T5_ULP_MEMIO_IMM(is_t5(sc)));
	req->dlen = htonl(V_ULP_MEMIO_DATA_LEN(dlen >> 5));
	req->len16 = htonl(DIV_ROUND_UP(wr_len - sizeof(req->wr), 16)
				| V_FW_WR_FLOWID(tid));
	req->lock_addr = htonl(V_ULP_MEMIO_ADDR(pm_addr >> 5));

	/* Immediate-data sub-command covering the payload that follows. */
	idata->cmd_more = htonl(V_ULPTX_CMD(ULP_TX_SC_IMM));
	idata->len = htonl(dlen);
}
159 
160 #define PPOD_SIZE		sizeof(struct pagepod)
161 #define ULPMEM_IDATA_MAX_NPPODS 1	/* 256/PPOD_SIZE */
162 #define PCIE_MEMWIN_MAX_NPPODS 16	/* 1024/PPOD_SIZE */
163 
164 static int
165 ppod_write_idata(struct cxgbei_data *ci,
166 			struct cxgbei_ulp2_pagepod_hdr *hdr,
167 			unsigned int idx, unsigned int npods,
168 			struct cxgbei_ulp2_gather_list *gl,
169 			unsigned int gl_pidx, struct toepcb *toep)
170 {
171 	u_int dlen = PPOD_SIZE * npods;
172 	u_int pm_addr = idx * PPOD_SIZE + ci->llimit;
173 	u_int wr_len = roundup(sizeof(struct ulp_mem_io) +
174 	    sizeof(struct ulptx_idata) + dlen, 16);
175 	struct ulp_mem_io *req;
176 	struct ulptx_idata *idata;
177 	struct pagepod *ppod;
178 	u_int i;
179 	struct wrqe *wr;
180 	struct adapter *sc = toep->vi->pi->adapter;
181 
182 	wr = alloc_wrqe(wr_len, toep->ctrlq);
183 	if (wr == NULL) {
184 		CXGBE_UNIMPLEMENTED("ppod_write_idata: alloc_wrqe failure");
185 		return (ENOMEM);
186 	}
187 
188 	req = wrtod(wr);
189 	memset(req, 0, wr_len);
190 	ulp_mem_io_set_hdr(sc, toep->tid, req, wr_len, dlen, pm_addr);
191 	idata = (struct ulptx_idata *)(req + 1);
192 
193 	ppod = (struct pagepod *)(idata + 1);
194 	for (i = 0; i < npods; i++, ppod++, gl_pidx += PPOD_PAGES) {
195 		if (!hdr) /* clear the pagepod */
196 			ppod_clear(ppod);
197 		else /* set the pagepod */
198 			ppod_set(ppod, hdr, gl, gl_pidx);
199 	}
200 
201 	t4_wrq_tx(sc, wr);
202 	return 0;
203 }
204 
205 int
206 t4_ddp_set_map(struct cxgbei_data *ci, void *iccp,
207     struct cxgbei_ulp2_pagepod_hdr *hdr, u_int idx, u_int npods,
208     struct cxgbei_ulp2_gather_list *gl, int reply)
209 {
210 	struct icl_cxgbei_conn *icc = (struct icl_cxgbei_conn *)iccp;
211 	struct toepcb *toep = icc->toep;
212 	int err;
213 	unsigned int pidx = 0, w_npods = 0, cnt;
214 
215 	/*
216 	 * on T4, if we use a mix of IMMD and DSGL with ULP_MEM_WRITE,
217 	 * the order would not be guaranteed, so we will stick with IMMD
218 	 */
219 	gl->tid = toep->tid;
220 	gl->port_id = toep->vi->pi->port_id;
221 	gl->egress_dev = (void *)toep->vi->ifp;
222 
223 	/* send via immediate data */
224 	for (; w_npods < npods; idx += cnt, w_npods += cnt,
225 		pidx += PPOD_PAGES) {
226 		cnt = npods - w_npods;
227 		if (cnt > ULPMEM_IDATA_MAX_NPPODS)
228 			cnt = ULPMEM_IDATA_MAX_NPPODS;
229 		err = ppod_write_idata(ci, hdr, idx, cnt, gl, pidx, toep);
230 		if (err) {
231 			printf("%s: ppod_write_idata failed\n", __func__);
232 			break;
233 		}
234 	}
235 	return err;
236 }
237 
238 void
239 t4_ddp_clear_map(struct cxgbei_data *ci, struct cxgbei_ulp2_gather_list *gl,
240     u_int tag, u_int idx, u_int npods, struct icl_cxgbei_conn *icc)
241 {
242 	struct toepcb *toep = icc->toep;
243 	int err = -1;
244 	u_int pidx = 0;
245 	u_int w_npods = 0;
246 	u_int cnt;
247 
248 	for (; w_npods < npods; idx += cnt, w_npods += cnt,
249 		pidx += PPOD_PAGES) {
250 		cnt = npods - w_npods;
251 		if (cnt > ULPMEM_IDATA_MAX_NPPODS)
252 			cnt = ULPMEM_IDATA_MAX_NPPODS;
253 		err = ppod_write_idata(ci, NULL, idx, cnt, gl, 0, toep);
254 		if (err)
255 			break;
256 	}
257 }
258 
259 static int
260 cxgbei_map_sg(struct cxgbei_sgl *sgl, struct ccb_scsiio *csio)
261 {
262 	unsigned int data_len = csio->dxfer_len;
263 	unsigned int sgoffset = (uint64_t)csio->data_ptr & PAGE_MASK;
264 	unsigned int nsge;
265 	unsigned char *sgaddr = csio->data_ptr;
266 	unsigned int len = 0;
267 
268 	nsge = (csio->dxfer_len + sgoffset + PAGE_SIZE - 1) >> PAGE_SHIFT;
269 	sgl->sg_addr = sgaddr;
270 	sgl->sg_offset = sgoffset;
271 	if (data_len <  (PAGE_SIZE - sgoffset))
272 		len = data_len;
273 	else
274 		len = PAGE_SIZE - sgoffset;
275 
276 	sgl->sg_length = len;
277 
278 	data_len -= len;
279 	sgaddr += len;
280 	sgl = sgl+1;
281 
282 	while (data_len > 0) {
283 		sgl->sg_addr = sgaddr;
284 		len = (data_len < PAGE_SIZE)? data_len: PAGE_SIZE;
285 		sgl->sg_length = len;
286 	        sgaddr += len;
287 		data_len -= len;
288 		sgl = sgl + 1;
289 	}
290 
291 	return nsge;
292 }
293 
/*
 * Build a cxgbei_sgl array for a CTL (target-side) I/O's kernel data.
 * The source is either the CTL scatter/gather list (kern_sg_entries > 0)
 * or the single flat buffer at kern_data_ptr.  Output entries are split
 * on page boundaries like cxgbei_map_sg().
 *
 * Returns the number of page-sized entries needed for the total length.
 *
 * NOTE(review): nsge is computed from the total of all CTL segments but
 * uses only the first segment's page offset; subsequent CTL segments
 * that start unaligned are not accounted for — verify against callers.
 */
static int
cxgbei_map_sg_tgt(struct cxgbei_sgl *sgl, union ctl_io *io)
{
	unsigned int data_len, sgoffset, nsge;
	unsigned char *sgaddr;
	unsigned int len = 0, index = 0, ctl_sg_count, i;
	struct ctl_sg_entry ctl_sg_entry, *ctl_sglist;

	/* Normalize the source to a (list, count) pair. */
	if (io->scsiio.kern_sg_entries > 0) {
		ctl_sglist = (struct ctl_sg_entry *)io->scsiio.kern_data_ptr;
		ctl_sg_count = io->scsiio.kern_sg_entries;
	} else {
		/* Flat buffer: fake a one-entry list on the stack. */
		ctl_sglist = &ctl_sg_entry;
		ctl_sglist->addr = io->scsiio.kern_data_ptr;
		ctl_sglist->len = io->scsiio.kern_data_len;
		ctl_sg_count = 1;
	}

	/* First output entry: from start address to end of its page. */
	sgaddr = sgl->sg_addr = ctl_sglist[index].addr;
	sgoffset = sgl->sg_offset = (uint64_t)sgl->sg_addr & PAGE_MASK;
	data_len = ctl_sglist[index].len;

	if (data_len <  (PAGE_SIZE - sgoffset))
		len = data_len;
	else
		len = PAGE_SIZE - sgoffset;

	sgl->sg_length = len;

	data_len -= len;
	sgaddr += len;
	sgl = sgl+1;

	/* Total output entries, based on the sum of all CTL segments. */
	len = 0;
	for (i = 0;  i< ctl_sg_count; i++)
		len += ctl_sglist[i].len;
	nsge = (len + sgoffset + PAGE_SIZE -1) >> PAGE_SHIFT;

	/*
	 * Emit one page-or-less entry at a time, advancing to the next
	 * CTL segment whenever the current one is exhausted.
	 */
	while (data_len > 0) {
		sgl->sg_addr = sgaddr;
		len = (data_len < PAGE_SIZE)? data_len: PAGE_SIZE;
		sgl->sg_length = len;
		sgaddr += len;
		data_len -= len;
		sgl = sgl + 1;
		if (data_len == 0) {
			if (index == ctl_sg_count - 1)
				break;
			index++;
			sgaddr = ctl_sglist[index].addr;
			data_len = ctl_sglist[index].len;
		}
	}

	return nsge;
}
349 
350 static int
351 t4_sk_ddp_tag_reserve(struct cxgbei_data *ci, struct icl_cxgbei_conn *icc,
352     u_int xferlen, struct cxgbei_sgl *sgl, u_int sgcnt, u_int *ddp_tag)
353 {
354 	struct cxgbei_ulp2_gather_list *gl;
355 	int err = -EINVAL;
356 	struct toepcb *toep = icc->toep;
357 
358 	gl = cxgbei_ulp2_ddp_make_gl_from_iscsi_sgvec(xferlen, sgl, sgcnt, ci, 0);
359 	if (gl) {
360 		err = cxgbei_ulp2_ddp_tag_reserve(ci, icc, toep->tid,
361 		    &ci->tag_format, ddp_tag, gl, 0, 0);
362 		if (err) {
363 			cxgbei_ulp2_ddp_release_gl(ci, gl);
364 		}
365 	}
366 
367 	return err;
368 }
369 
370 static unsigned int
371 cxgbei_task_reserve_itt(struct icl_conn *ic, void **prv,
372 			struct ccb_scsiio *scmd, unsigned int *itt)
373 {
374 	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
375 	int xferlen = scmd->dxfer_len;
376 	struct cxgbei_task_data *tdata = NULL;
377 	struct cxgbei_sgl *sge = NULL;
378 	struct toepcb *toep = icc->toep;
379 	struct adapter *sc = td_adapter(toep->td);
380 	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
381 	int err = -1;
382 
383 	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
384 
385 	tdata = (struct cxgbei_task_data *)*prv;
386 	if (xferlen == 0 || tdata == NULL)
387 		goto out;
388 	if (xferlen < DDP_THRESHOLD)
389 		goto out;
390 
391 	if ((scmd->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_IN) {
392 		tdata->nsge = cxgbei_map_sg(tdata->sgl, scmd);
393 		if (tdata->nsge == 0) {
394 			CTR1(KTR_CXGBE, "%s: map_sg failed", __func__);
395 			return 0;
396 		}
397 		sge = tdata->sgl;
398 
399 		tdata->sc_ddp_tag = *itt;
400 
401 		CTR3(KTR_CXGBE, "%s: *itt:0x%x sc_ddp_tag:0x%x",
402 				__func__, *itt, tdata->sc_ddp_tag);
403 		if (cxgbei_ulp2_sw_tag_usable(&ci->tag_format,
404 							tdata->sc_ddp_tag)) {
405 			err = t4_sk_ddp_tag_reserve(ci, icc, scmd->dxfer_len,
406 			    sge, tdata->nsge, &tdata->sc_ddp_tag);
407 		} else {
408 			CTR3(KTR_CXGBE,
409 				"%s: itt:0x%x sc_ddp_tag:0x%x not usable",
410 				__func__, *itt, tdata->sc_ddp_tag);
411 		}
412 	}
413 out:
414 	if (err < 0)
415 		tdata->sc_ddp_tag =
416 			cxgbei_ulp2_set_non_ddp_tag(&ci->tag_format, *itt);
417 
418 	return tdata->sc_ddp_tag;
419 }
420 
421 static unsigned int
422 cxgbei_task_reserve_ttt(struct icl_conn *ic, void **prv, union ctl_io *io,
423 				unsigned int *ttt)
424 {
425 	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
426 	struct toepcb *toep = icc->toep;
427 	struct adapter *sc = td_adapter(toep->td);
428 	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
429 	struct cxgbei_task_data *tdata = NULL;
430 	int xferlen, err = -1;
431 	struct cxgbei_sgl *sge = NULL;
432 
433 	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
434 
435 	xferlen = (io->scsiio.kern_data_len - io->scsiio.ext_data_filled);
436 	tdata = (struct cxgbei_task_data *)*prv;
437 	if ((xferlen == 0) || (tdata == NULL))
438 		goto out;
439 	if (xferlen < DDP_THRESHOLD)
440 		goto out;
441 	tdata->nsge = cxgbei_map_sg_tgt(tdata->sgl, io);
442 	if (tdata->nsge == 0) {
443 		CTR1(KTR_CXGBE, "%s: map_sg failed", __func__);
444 		return 0;
445 	}
446 	sge = tdata->sgl;
447 
448 	tdata->sc_ddp_tag = *ttt;
449 	if (cxgbei_ulp2_sw_tag_usable(&ci->tag_format, tdata->sc_ddp_tag)) {
450 		err = t4_sk_ddp_tag_reserve(ci, icc, xferlen, sge,
451 		    tdata->nsge, &tdata->sc_ddp_tag);
452 	} else {
453 		CTR2(KTR_CXGBE, "%s: sc_ddp_tag:0x%x not usable",
454 				__func__, tdata->sc_ddp_tag);
455 	}
456 out:
457 	if (err < 0)
458 		tdata->sc_ddp_tag =
459 			cxgbei_ulp2_set_non_ddp_tag(&ci->tag_format, *ttt);
460 	return tdata->sc_ddp_tag;
461 }
462 
463 static int
464 t4_sk_ddp_tag_release(struct icl_cxgbei_conn *icc, unsigned int ddp_tag)
465 {
466 	struct toepcb *toep = icc->toep;
467 	struct adapter *sc = td_adapter(toep->td);
468 	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
469 
470 	cxgbei_ulp2_ddp_tag_release(ci, ddp_tag, icc);
471 
472 	return (0);
473 }
474 
/*
 * One-time DDP setup for an adapter: size the pagepod region carved out
 * of adapter memory (sc->vres.iscsi), allocate the host-side tracking
 * arrays, and program the iSCSI tag mask and page-size order into the
 * hardware.  Returns 0 on success or an errno.
 */
static int
cxgbei_ddp_init(struct adapter *sc, struct cxgbei_data *ci)
{
	int nppods, bits, max_sz, rc;
	/* Page size multipliers programmed into hardware: 4K << order. */
	static const u_int pgsz_order[] = {0, 1, 2, 3};

	MPASS(sc->vres.iscsi.size > 0);

	/* Bounds of the iSCSI pagepod region in adapter memory. */
	ci->llimit = sc->vres.iscsi.start;
	ci->ulimit = sc->vres.iscsi.start + sc->vres.iscsi.size - 1;
	max_sz = G_MAXRXDATA(t4_read_reg(sc, A_TP_PARA_REG2));

	/* Not worth the bother if the region holds too few pagepods. */
	nppods = sc->vres.iscsi.size >> IPPOD_SIZE_SHIFT;
	if (nppods <= 1024)
		return (ENXIO);

	/* Round the pagepod count down to a power-of-2 minus 1, capped by
	 * the number of index bits available in a tag. */
	bits = fls(nppods);
	if (bits > IPPOD_IDX_MAX_SIZE)
		bits = IPPOD_IDX_MAX_SIZE;
	nppods = (1 << (bits - 1)) - 1;

	rc = bus_dma_tag_create(NULL, 1, 0, BUS_SPACE_MAXADDR,
	    BUS_SPACE_MAXADDR, NULL, NULL, UINT32_MAX , 8, BUS_SPACE_MAXSIZE,
	    BUS_DMA_ALLOCNOW, NULL, NULL, &ci->ulp_ddp_tag);
	if (rc != 0) {
		device_printf(sc->dev, "%s: failed to create DMA tag: %u.\n",
		    __func__, rc);
		return (rc);
	}

	/* Per-pagepod-index color bytes and gather-list back pointers. */
	ci->colors = malloc(nppods * sizeof(char), M_CXGBE, M_NOWAIT | M_ZERO);
	ci->gl_map = malloc(nppods * sizeof(struct cxgbei_ulp2_gather_list *),
	    M_CXGBE, M_NOWAIT | M_ZERO);
	if (ci->colors == NULL || ci->gl_map == NULL) {
		bus_dma_tag_destroy(ci->ulp_ddp_tag);
		free(ci->colors, M_CXGBE);
		free(ci->gl_map, M_CXGBE);
		return (ENOMEM);
	}

	mtx_init(&ci->map_lock, "ddp lock", NULL, MTX_DEF | MTX_DUPOK);
	ci->max_txsz = ci->max_rxsz = min(max_sz, ULP2_MAX_PKT_SIZE);
	ci->nppods = nppods;
	ci->idx_last = nppods;
	ci->idx_bits = bits;
	ci->idx_mask = (1 << bits) - 1;
	ci->rsvd_tag_mask = (1 << (bits + IPPOD_IDX_SHIFT)) - 1;

	/* Tag layout: pagepod index in the low bits above IPPOD_IDX_SHIFT. */
	ci->tag_format.sw_bits = bits;
	ci->tag_format.rsvd_bits = bits;
	ci->tag_format.rsvd_shift = IPPOD_IDX_SHIFT;
	ci->tag_format.rsvd_mask = ci->idx_mask;

	t4_iscsi_init(sc, ci->idx_mask << IPPOD_IDX_SHIFT, pgsz_order);

	return (rc);
}
532 
/*
 * CPL_ISCSI_HDR handler: the hardware has delivered the BHS of a new
 * iSCSI PDU.  Allocate an icl_cxgbei_pdu, copy the BHS into it, and park
 * it on toep->ulpcb2 until the matching data/status CPLs arrive.
 */
static int
do_rx_iscsi_hdr(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	struct cpl_iscsi_hdr *cpl = mtod(m, struct cpl_iscsi_hdr *);
	u_int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct icl_pdu *ip;
	struct icl_cxgbei_pdu *icp;

	M_ASSERTPKTHDR(m);

	ip = icl_cxgbei_new_pdu(M_NOWAIT);
	if (ip == NULL)
		CXGBE_UNIMPLEMENTED("PDU allocation failure");
	icp = ip_to_icp(ip);
	/* The BHS follows the CPL header in the mbuf. */
	bcopy(mtod(m, caddr_t) + sizeof(*cpl), icp->ip.ip_bhs, sizeof(struct
	    iscsi_bhs));
	/* TCP sequence number; checked against rcv_nxt at status time. */
	icp->pdu_seq = ntohl(cpl->seq);
	icp->pdu_flags = SBUF_ULP_FLAG_HDR_RCVD;

	/* This is the start of a new PDU.  There should be no old state. */
	MPASS(toep->ulpcb2 == NULL);
	toep->ulpcb2 = icp;

#if 0
	CTR4(KTR_CXGBE, "%s: tid %u, cpl->len hlen %u, m->m_len hlen %u",
	    __func__, tid, ntohs(cpl->len), m->m_len);
#endif

	m_freem(m);
	return (0);
}
566 
/*
 * CPL_ISCSI_DATA handler: non-DDP'ed payload for the PDU whose header
 * arrived via CPL_ISCSI_HDR.  Attach the mbuf (minus the CPL header) to
 * the in-progress PDU on toep->ulpcb2; the mbuf is kept, not freed.
 */
static int
do_rx_iscsi_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	struct cpl_iscsi_data *cpl =  mtod(m, struct cpl_iscsi_data *);
	u_int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct icl_cxgbei_pdu *icp = toep->ulpcb2;

	M_ASSERTPKTHDR(m);

	/* Must already have received the header (but not the data). */
	MPASS(icp != NULL);
	MPASS(icp->pdu_flags == SBUF_ULP_FLAG_HDR_RCVD);
	MPASS(icp->ip.ip_data_mbuf == NULL);
	MPASS(icp->ip.ip_data_len == 0);

	/* Strip the CPL header; the rest of the mbuf is PDU payload. */
	m_adj(m, sizeof(*cpl));

	icp->pdu_flags |= SBUF_ULP_FLAG_DATA_RCVD;
	icp->ip.ip_data_mbuf = m;
	icp->ip.ip_data_len = m->m_pkthdr.len;

#if 0
	CTR4(KTR_CXGBE, "%s: tid %u, cpl->len dlen %u, m->m_len dlen %u",
	    __func__, tid, ntohs(cpl->len), m->m_len);
#endif

	return (0);
}
597 
598 static int
599 do_rx_iscsi_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
600 {
601 	struct adapter *sc = iq->adapter;
602 	const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1);
603 	u_int tid = GET_TID(cpl);
604 	struct toepcb *toep = lookup_tid(sc, tid);
605 	struct inpcb *inp = toep->inp;
606 	struct socket *so;
607 	struct sockbuf *sb;
608 	struct tcpcb *tp;
609 	struct icl_cxgbei_conn *icc;
610 	struct icl_conn *ic;
611 	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
612 	struct icl_pdu *ip;
613 	u_int pdu_len, val;
614 
615 	MPASS(m == NULL);
616 
617 	/* Must already be assembling a PDU. */
618 	MPASS(icp != NULL);
619 	MPASS(icp->pdu_flags & SBUF_ULP_FLAG_HDR_RCVD);	/* Data is optional. */
620 	ip = &icp->ip;
621 	icp->pdu_flags |= SBUF_ULP_FLAG_STATUS_RCVD;
622 	val = ntohl(cpl->ddpvld);
623 	if (val & F_DDP_PADDING_ERR)
624 		icp->pdu_flags |= SBUF_ULP_FLAG_PAD_ERROR;
625 	if (val & F_DDP_HDRCRC_ERR)
626 		icp->pdu_flags |= SBUF_ULP_FLAG_HCRC_ERROR;
627 	if (val & F_DDP_DATACRC_ERR)
628 		icp->pdu_flags |= SBUF_ULP_FLAG_DCRC_ERROR;
629 	if (ip->ip_data_mbuf == NULL) {
630 		/* XXXNP: what should ip->ip_data_len be, and why? */
631 		icp->pdu_flags |= SBUF_ULP_FLAG_DATA_DDPED;
632 	}
633 	pdu_len = ntohs(cpl->len);	/* includes everything. */
634 
635 	INP_WLOCK(inp);
636 	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
637 		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
638 		    __func__, tid, pdu_len, inp->inp_flags);
639 		INP_WUNLOCK(inp);
640 		icl_cxgbei_conn_pdu_free(NULL, ip);
641 #ifdef INVARIANTS
642 		toep->ulpcb2 = NULL;
643 #endif
644 		return (0);
645 	}
646 
647 	tp = intotcpcb(inp);
648 	MPASS(icp->pdu_seq == tp->rcv_nxt);
649 	MPASS(tp->rcv_wnd >= pdu_len);
650 	tp->rcv_nxt += pdu_len;
651 	tp->rcv_wnd -= pdu_len;
652 	tp->t_rcvtime = ticks;
653 
654 	/* update rx credits */
655 	toep->rx_credits += pdu_len;
656 	t4_rcvd(&toep->td->tod, tp);	/* XXX: sc->tom_softc.tod */
657 
658 	so = inp->inp_socket;
659 	sb = &so->so_rcv;
660 	SOCKBUF_LOCK(sb);
661 
662 	icc = toep->ulpcb;
663 	if (__predict_false(icc == NULL || sb->sb_state & SBS_CANTRCVMORE)) {
664 		CTR5(KTR_CXGBE,
665 		    "%s: tid %u, excess rx (%d bytes), icc %p, sb_state 0x%x",
666 		    __func__, tid, pdu_len, icc, sb->sb_state);
667 		SOCKBUF_UNLOCK(sb);
668 		INP_WUNLOCK(inp);
669 
670 		INP_INFO_RLOCK(&V_tcbinfo);
671 		INP_WLOCK(inp);
672 		tp = tcp_drop(tp, ECONNRESET);
673 		if (tp)
674 			INP_WUNLOCK(inp);
675 		INP_INFO_RUNLOCK(&V_tcbinfo);
676 
677 		icl_cxgbei_conn_pdu_free(NULL, ip);
678 #ifdef INVARIANTS
679 		toep->ulpcb2 = NULL;
680 #endif
681 		return (0);
682 	}
683 	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
684 	ic = &icc->ic;
685 	icl_cxgbei_new_pdu_set_conn(ip, ic);
686 
687 	MPASS(m == NULL); /* was unused, we'll use it now. */
688 	m = sbcut_locked(sb, sbused(sb)); /* XXXNP: toep->sb_cc accounting? */
689 	if (__predict_false(m != NULL)) {
690 		int len = m_length(m, NULL);
691 
692 		/*
693 		 * PDUs were received before the tid transitioned to ULP mode.
694 		 * Convert them to icl_cxgbei_pdus and send them to ICL before
695 		 * the PDU in icp/ip.
696 		 */
697 		CTR3(KTR_CXGBE, "%s: tid %u, %u bytes in so_rcv", __func__, tid,
698 		    len);
699 
700 		/* XXXNP: needs to be rewritten. */
701 		if (len == sizeof(struct iscsi_bhs) || len == 4 + sizeof(struct
702 		    iscsi_bhs)) {
703 			struct icl_cxgbei_pdu *icp0;
704 			struct icl_pdu *ip0;
705 
706 			ip0 = icl_cxgbei_new_pdu(M_NOWAIT);
707 			icl_cxgbei_new_pdu_set_conn(ip0, ic);
708 			if (ip0 == NULL)
709 				CXGBE_UNIMPLEMENTED("PDU allocation failure");
710 			icp0 = ip_to_icp(ip0);
711 			icp0->pdu_seq = 0; /* XXX */
712 			icp0->pdu_flags = SBUF_ULP_FLAG_HDR_RCVD |
713 			    SBUF_ULP_FLAG_STATUS_RCVD;
714 			m_copydata(m, 0, sizeof(struct iscsi_bhs), (void *)ip0->ip_bhs);
715 			STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip0, ip_next);
716 		}
717 		m_freem(m);
718 	}
719 
720 #if 0
721 	CTR4(KTR_CXGBE, "%s: tid %u, pdu_len %u, pdu_flags 0x%x",
722 	    __func__, tid, pdu_len, icp->pdu_flags);
723 #endif
724 
725 	STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next);
726 	if ((icc->rx_flags & RXF_ACTIVE) == 0) {
727 		struct cxgbei_worker_thread_softc *cwt = &cwt_softc[icc->cwt];
728 
729 		mtx_lock(&cwt->cwt_lock);
730 		icc->rx_flags |= RXF_ACTIVE;
731 		TAILQ_INSERT_TAIL(&cwt->rx_head, icc, rx_link);
732 		if (cwt->cwt_state == CWT_SLEEPING) {
733 			cwt->cwt_state = CWT_RUNNING;
734 			cv_signal(&cwt->cwt_cv);
735 		}
736 		mtx_unlock(&cwt->cwt_lock);
737 	}
738 	SOCKBUF_UNLOCK(sb);
739 	INP_WUNLOCK(inp);
740 
741 #ifdef INVARIANTS
742 	toep->ulpcb2 = NULL;
743 #endif
744 
745 	return (0);
746 }
747 
/* initiator */
void
cxgbei_conn_task_reserve_itt(void *conn, void **prv,
				void *scmd, unsigned int *itt)
{
	unsigned int tag;

	/* A zero tag means "no tag reserved"; leave *itt untouched. */
	tag = cxgbei_task_reserve_itt(conn, prv, scmd, itt);
	if (tag != 0)
		*itt = htonl(tag);
}
759 
/* target */
void
cxgbei_conn_transfer_reserve_ttt(void *conn, void **prv,
				void *scmd, unsigned int *ttt)
{
	unsigned int tag;

	/* A zero tag means "no tag reserved"; leave *ttt untouched. */
	tag = cxgbei_task_reserve_ttt(conn, prv, scmd, ttt);
	if (tag != 0)
		*ttt = htonl(tag);
}
771 
772 void
773 cxgbei_cleanup_task(void *conn, void *ofld_priv)
774 {
775 	struct icl_conn *ic = (struct icl_conn *)conn;
776 	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
777 	struct cxgbei_task_data *tdata = ofld_priv;
778 	struct adapter *sc = icc->sc;
779 	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
780 
781 	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
782 	MPASS(tdata != NULL);
783 
784 	if (cxgbei_ulp2_is_ddp_tag(&ci->tag_format, tdata->sc_ddp_tag))
785 		t4_sk_ddp_tag_release(icc, tdata->sc_ddp_tag);
786 	memset(tdata, 0, sizeof(*tdata));
787 }
788 
789 static int
790 cxgbei_activate(struct adapter *sc)
791 {
792 	struct cxgbei_data *ci;
793 	int rc;
794 
795 	ASSERT_SYNCHRONIZED_OP(sc);
796 
797 	if (uld_active(sc, ULD_ISCSI)) {
798 		KASSERT(0, ("%s: iSCSI offload already enabled on adapter %p",
799 		    __func__, sc));
800 		return (0);
801 	}
802 
803 	if (sc->iscsicaps == 0 || sc->vres.iscsi.size == 0) {
804 		device_printf(sc->dev,
805 		    "not iSCSI offload capable, or capability disabled.\n");
806 		return (ENOSYS);
807 	}
808 
809 	/* per-adapter softc for iSCSI */
810 	ci = malloc(sizeof(*ci), M_CXGBE, M_ZERO | M_NOWAIT);
811 	if (ci == NULL)
812 		return (ENOMEM);
813 
814 	rc = cxgbei_ddp_init(sc, ci);
815 	if (rc != 0) {
816 		free(ci, M_CXGBE);
817 		return (rc);
818 	}
819 
820 	sc->iscsi_ulp_softc = ci;
821 
822 	return (0);
823 }
824 
825 static int
826 cxgbei_deactivate(struct adapter *sc)
827 {
828 
829 	ASSERT_SYNCHRONIZED_OP(sc);
830 
831 	if (sc->iscsi_ulp_softc != NULL) {
832 		cxgbei_ddp_cleanup(sc->iscsi_ulp_softc);
833 		free(sc->iscsi_ulp_softc, M_CXGBE);
834 		sc->iscsi_ulp_softc = NULL;
835 	}
836 
837 	return (0);
838 }
839 
840 static void
841 cxgbei_activate_all(struct adapter *sc, void *arg __unused)
842 {
843 
844 	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isact") != 0)
845 		return;
846 
847 	/* Activate iSCSI if any port on this adapter has IFCAP_TOE enabled. */
848 	if (sc->offload_map && !uld_active(sc, ULD_ISCSI))
849 		(void) t4_activate_uld(sc, ULD_ISCSI);
850 
851 	end_synchronized_op(sc, 0);
852 }
853 
854 static void
855 cxgbei_deactivate_all(struct adapter *sc, void *arg __unused)
856 {
857 
858 	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isdea") != 0)
859 		return;
860 
861 	if (uld_active(sc, ULD_ISCSI))
862 	    (void) t4_deactivate_uld(sc, ULD_ISCSI);
863 
864 	end_synchronized_op(sc, 0);
865 }
866 
/* Registration record for the t4_tom upper-layer-driver framework. */
static struct uld_info cxgbei_uld_info = {
	.uld_id = ULD_ISCSI,
	.activate = cxgbei_activate,
	.deactivate = cxgbei_deactivate,
};
872 
/*
 * Main loop of a cxgbei worker thread.  Connections with received PDUs
 * are queued on cwt->rx_head (by do_rx_iscsi_ddp); this thread drains
 * each connection's rcvd_pdus list and hands the PDUs to ICL, sleeping
 * on cwt_cv when there is no work.  Exits when cwt_state is set to
 * CWT_STOP, acknowledging with CWT_STOPPED.
 *
 * Locking: cwt_lock protects cwt_state and rx_head; the socket buffer
 * lock protects each connection's rcvd_pdus and rx_flags.  Neither lock
 * is held across the ic_receive() upcall.
 */
static void
cwt_main(void *arg)
{
	struct cxgbei_worker_thread_softc *cwt = arg;
	struct icl_cxgbei_conn *icc = NULL;
	struct icl_conn *ic;
	struct icl_pdu *ip;
	struct sockbuf *sb;
	STAILQ_HEAD(, icl_pdu) rx_pdus = STAILQ_HEAD_INITIALIZER(rx_pdus);

	MPASS(cwt != NULL);

	mtx_lock(&cwt->cwt_lock);
	MPASS(cwt->cwt_state == 0);
	/* Tell start_worker_threads() we're up. */
	cwt->cwt_state = CWT_RUNNING;
	cv_signal(&cwt->cwt_cv);

	while (__predict_true(cwt->cwt_state != CWT_STOP)) {
		cwt->cwt_state = CWT_RUNNING;
		while ((icc = TAILQ_FIRST(&cwt->rx_head)) != NULL) {
			TAILQ_REMOVE(&cwt->rx_head, icc, rx_link);
			mtx_unlock(&cwt->cwt_lock);

			ic = &icc->ic;
			sb = &ic->ic_socket->so_rcv;

			SOCKBUF_LOCK(sb);
			MPASS(icc->rx_flags & RXF_ACTIVE);
			if (__predict_true(!(sb->sb_state & SBS_CANTRCVMORE))) {
				MPASS(STAILQ_EMPTY(&rx_pdus));
				/* Grab the whole batch, then deliver unlocked. */
				STAILQ_SWAP(&icc->rcvd_pdus, &rx_pdus, icl_pdu);
				SOCKBUF_UNLOCK(sb);

				/* Hand over PDUs to ICL. */
				while ((ip = STAILQ_FIRST(&rx_pdus)) != NULL) {
					STAILQ_REMOVE_HEAD(&rx_pdus, ip_next);
					ic->ic_receive(ip);
				}

				SOCKBUF_LOCK(sb);
				MPASS(STAILQ_EMPTY(&rx_pdus));
			}
			MPASS(icc->rx_flags & RXF_ACTIVE);
			if (STAILQ_EMPTY(&icc->rcvd_pdus) ||
			    __predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
				icc->rx_flags &= ~RXF_ACTIVE;
			} else {
				/*
				 * More PDUs were received while we were busy
				 * handing over the previous batch to ICL.
				 * Re-add this connection to the end of the
				 * queue.
				 */
				mtx_lock(&cwt->cwt_lock);
				TAILQ_INSERT_TAIL(&cwt->rx_head, icc,
				    rx_link);
				mtx_unlock(&cwt->cwt_lock);
			}
			SOCKBUF_UNLOCK(sb);

			mtx_lock(&cwt->cwt_lock);
		}

		/* Inner loop doesn't check for CWT_STOP, do that first. */
		if (__predict_false(cwt->cwt_state == CWT_STOP))
			break;
		cwt->cwt_state = CWT_SLEEPING;
		cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
	}

	MPASS(TAILQ_FIRST(&cwt->rx_head) == NULL);
	mtx_assert(&cwt->cwt_lock, MA_OWNED);
	/* Acknowledge the stop request for stop_worker_threads(). */
	cwt->cwt_state = CWT_STOPPED;
	cv_signal(&cwt->cwt_cv);
	mtx_unlock(&cwt->cwt_lock);
	kthread_exit();
}
950 
951 static int
952 start_worker_threads(void)
953 {
954 	int i, rc;
955 	struct cxgbei_worker_thread_softc *cwt;
956 
957 	worker_thread_count = min(mp_ncpus, 32);
958 	cwt_softc = malloc(worker_thread_count * sizeof(*cwt), M_CXGBE,
959 	    M_WAITOK | M_ZERO);
960 
961 	MPASS(cxgbei_proc == NULL);
962 	for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) {
963 		mtx_init(&cwt->cwt_lock, "cwt lock", NULL, MTX_DEF);
964 		cv_init(&cwt->cwt_cv, "cwt cv");
965 		TAILQ_INIT(&cwt->rx_head);
966 		rc = kproc_kthread_add(cwt_main, cwt, &cxgbei_proc, NULL, 0, 0,
967 		    "cxgbei", "%d", i);
968 		if (rc != 0) {
969 			printf("cxgbei: failed to start thread #%d/%d (%d)\n",
970 			    i + 1, worker_thread_count, rc);
971 			mtx_destroy(&cwt->cwt_lock);
972 			cv_destroy(&cwt->cwt_cv);
973 			bzero(&cwt, sizeof(*cwt));
974 			if (i == 0) {
975 				free(cwt_softc, M_CXGBE);
976 				worker_thread_count = 0;
977 
978 				return (rc);
979 			}
980 
981 			/* Not fatal, carry on with fewer threads. */
982 			worker_thread_count = i;
983 			rc = 0;
984 			break;
985 		}
986 
987 		/* Wait for thread to start before moving on to the next one. */
988 		mtx_lock(&cwt->cwt_lock);
989 		while (cwt->cwt_state == 0)
990 			cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
991 		mtx_unlock(&cwt->cwt_lock);
992 	}
993 
994 	MPASS(cwt_softc != NULL);
995 	MPASS(worker_thread_count > 0);
996 	return (0);
997 }
998 
999 static void
1000 stop_worker_threads(void)
1001 {
1002 	int i;
1003 	struct cxgbei_worker_thread_softc *cwt = &cwt_softc[0];
1004 
1005 	MPASS(worker_thread_count >= 0);
1006 
1007 	for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) {
1008 		mtx_lock(&cwt->cwt_lock);
1009 		MPASS(cwt->cwt_state == CWT_RUNNING ||
1010 		    cwt->cwt_state == CWT_SLEEPING);
1011 		cwt->cwt_state = CWT_STOP;
1012 		cv_signal(&cwt->cwt_cv);
1013 		do {
1014 			cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
1015 		} while (cwt->cwt_state != CWT_STOPPED);
1016 		mtx_unlock(&cwt->cwt_lock);
1017 	}
1018 	free(cwt_softc, M_CXGBE);
1019 }
1020 
1021 /* Select a worker thread for a connection. */
1022 u_int
1023 cxgbei_select_worker_thread(struct icl_cxgbei_conn *icc)
1024 {
1025 	struct adapter *sc = icc->sc;
1026 	struct toepcb *toep = icc->toep;
1027 	u_int i, n;
1028 
1029 	n = worker_thread_count / sc->sge.nofldrxq;
1030 	if (n > 0)
1031 		i = toep->vi->pi->port_id * n + arc4random() % n;
1032 	else
1033 		i = arc4random() % worker_thread_count;
1034 
1035 	CTR3(KTR_CXGBE, "%s: tid %u, cwt %u", __func__, toep->tid, i);
1036 
1037 	return (i);
1038 }
1039 
1040 static int
1041 cxgbei_mod_load(void)
1042 {
1043 	int rc;
1044 
1045 	t4_register_cpl_handler(CPL_ISCSI_HDR, do_rx_iscsi_hdr);
1046 	t4_register_cpl_handler(CPL_ISCSI_DATA, do_rx_iscsi_data);
1047 	t4_register_cpl_handler(CPL_RX_ISCSI_DDP, do_rx_iscsi_ddp);
1048 
1049 	rc = start_worker_threads();
1050 	if (rc != 0)
1051 		return (rc);
1052 
1053 	rc = t4_register_uld(&cxgbei_uld_info);
1054 	if (rc != 0) {
1055 		stop_worker_threads();
1056 		return (rc);
1057 	}
1058 
1059 	t4_iterate(cxgbei_activate_all, NULL);
1060 
1061 	return (rc);
1062 }
1063 
1064 static int
1065 cxgbei_mod_unload(void)
1066 {
1067 
1068 	t4_iterate(cxgbei_deactivate_all, NULL);
1069 
1070 	if (t4_unregister_uld(&cxgbei_uld_info) == EBUSY)
1071 		return (EBUSY);
1072 
1073 	stop_worker_threads();
1074 
1075 	t4_register_cpl_handler(CPL_ISCSI_HDR, NULL);
1076 	t4_register_cpl_handler(CPL_ISCSI_DATA, NULL);
1077 	t4_register_cpl_handler(CPL_RX_ISCSI_DDP, NULL);
1078 
1079 	return (0);
1080 }
1081 #endif
1082 
1083 static int
1084 cxgbei_modevent(module_t mod, int cmd, void *arg)
1085 {
1086 	int rc = 0;
1087 
1088 #ifdef TCP_OFFLOAD
1089 	switch (cmd) {
1090 	case MOD_LOAD:
1091 		rc = cxgbei_mod_load();
1092 		break;
1093 
1094 	case MOD_UNLOAD:
1095 		rc = cxgbei_mod_unload();
1096 		break;
1097 
1098 	default:
1099 		rc = EINVAL;
1100 	}
1101 #else
1102 	printf("cxgbei: compiled without TCP_OFFLOAD support.\n");
1103 	rc = EOPNOTSUPP;
1104 #endif
1105 
1106 	return (rc);
1107 }
1108 
/* Kernel module glue: no module-specific private data. */
static moduledata_t cxgbei_mod = {
	"cxgbei",
	cxgbei_modevent,
	NULL,
};

MODULE_VERSION(cxgbei, 1);
DECLARE_MODULE(cxgbei, cxgbei_mod, SI_SUB_EXEC, SI_ORDER_ANY);
/* Must load after the TOE driver and cxgbe, and needs the ICL layer. */
MODULE_DEPEND(cxgbei, t4_tom, 1, 1, 1);
MODULE_DEPEND(cxgbei, cxgbe, 1, 1, 1);
MODULE_DEPEND(cxgbei, icl, 1, 1, 1);
1120