xref: /freebsd/sys/dev/cxgb/cxgb_sge.c (revision 63518eccca27064285cf2e680510ba9a4c3e2231)
1 /**************************************************************************
2 
3 Copyright (c) 2007, Chelsio Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Redistributions in binary form must reproduce the above copyright
13     notice, this list of conditions and the following disclaimer in the
14     documentation and/or other materials provided with the distribution.
15 
16  3. Neither the name of the Chelsio Corporation nor the names of its
17     contributors may be used to endorse or promote products derived from
18     this software without specific prior written permission.
19 
20 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
24 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 POSSIBILITY OF SUCH DAMAGE.
31 
32 ***************************************************************************/
33 
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36 
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/module.h>
41 #include <sys/bus.h>
42 #include <sys/conf.h>
43 #include <machine/bus.h>
44 #include <machine/resource.h>
45 #include <sys/bus_dma.h>
46 #include <sys/rman.h>
47 #include <sys/queue.h>
48 #include <sys/sysctl.h>
49 #include <sys/taskqueue.h>
50 
51 
52 #include <sys/proc.h>
53 #include <sys/sched.h>
54 #include <sys/smp.h>
55 #include <sys/systm.h>
56 
57 #include <netinet/in_systm.h>
58 #include <netinet/in.h>
59 #include <netinet/ip.h>
60 #include <netinet/tcp.h>
61 
62 #include <dev/pci/pcireg.h>
63 #include <dev/pci/pcivar.h>
64 #include <dev/cxgb/common/cxgb_common.h>
65 #include <dev/cxgb/common/cxgb_regs.h>
66 #include <dev/cxgb/common/cxgb_sge_defs.h>
67 #include <dev/cxgb/common/cxgb_t3_cpl.h>
68 #include <dev/cxgb/common/cxgb_firmware_exports.h>
69 
70 #define USE_GTS 0
71 
72 #define SGE_RX_SM_BUF_SIZE	1536
73 #define SGE_RX_DROP_THRES	16
74 
75 /*
76  * Period of the Tx buffer reclaim timer.  This timer does not need to run
77  * frequently as Tx buffers are usually reclaimed by new Tx packets.
78  */
79 #define TX_RECLAIM_PERIOD       (hz >> 2)
80 
81 /*
82  * work request size in bytes
83  */
84 #define WR_LEN (WR_FLITS * 8)
85 
86 /*
87  * Values for sge_txq.flags
88  */
89 enum {
90 	TXQ_RUNNING	= 1 << 0,  /* fetch engine is running */
91 	TXQ_LAST_PKT_DB = 1 << 1,  /* last packet rang the doorbell */
92 };
93 
94 struct tx_desc {
95 	uint64_t	flit[TX_DESC_FLITS];
96 } __packed;
97 
98 struct rx_desc {
99 	uint32_t	addr_lo;
100 	uint32_t	len_gen;
101 	uint32_t	gen2;
102 	uint32_t	addr_hi;
103 } __packed;;
104 
105 struct rsp_desc {               /* response queue descriptor */
106 	struct rss_header	rss_hdr;
107 	uint32_t		flags;
108 	uint32_t		len_cq;
109 	uint8_t			imm_data[47];
110 	uint8_t			intr_gen;
111 } __packed;
112 
113 #define RX_SW_DESC_MAP_CREATED	(1 << 0)
114 #define RX_SW_DESC_INUSE        (1 << 3)
115 #define TX_SW_DESC_MAPPED       (1 << 4)
116 
117 #define RSPQ_NSOP_NEOP           G_RSPD_SOP_EOP(0)
118 #define RSPQ_EOP                 G_RSPD_SOP_EOP(F_RSPD_EOP)
119 #define RSPQ_SOP                 G_RSPD_SOP_EOP(F_RSPD_SOP)
120 #define RSPQ_SOP_EOP             G_RSPD_SOP_EOP(F_RSPD_SOP|F_RSPD_EOP)
121 
122 struct tx_sw_desc {                /* SW state per Tx descriptor */
123 	struct mbuf	*m;
124 	bus_dmamap_t	map;
125 	int		flags;
126 };
127 
128 struct rx_sw_desc {                /* SW state per Rx descriptor */
129 	struct mbuf	*m;
130 	bus_dmamap_t	map;
131 	int		flags;
132 };
133 
134 struct txq_state {
135 	unsigned int compl;
136 	unsigned int gen;
137 	unsigned int pidx;
138 };
139 
140 /*
141  * Maps a number of flits to the number of Tx descriptors that can hold them.
142  * The formula is
143  *
144  * desc = 1 + (flits - 2) / (WR_FLITS - 1).
145  *
146  * HW allows up to 4 descriptors to be combined into a WR.
147  */
148 static uint8_t flit_desc_map[] = {
149 	0,
150 #if SGE_NUM_GENBITS == 1
151 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
152 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
153 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
154 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
155 #elif SGE_NUM_GENBITS == 2
156 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
157 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
158 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
159 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
160 #else
161 # error "SGE_NUM_GENBITS must be 1 or 2"
162 #endif
163 };
164 
165 
166 static int lro_default = 0;
167 int cxgb_debug = 0;
168 
169 static void t3_free_qset(adapter_t *sc, struct sge_qset *q);
170 static void sge_timer_cb(void *arg);
171 static void sge_timer_reclaim(void *arg, int ncount);
172 static int free_tx_desc(adapter_t *sc, struct sge_txq *q, int n, struct mbuf **m_vec);
173 
174 /**
175  *	reclaim_completed_tx - reclaims completed Tx descriptors
176  *	@adapter: the adapter
177  *	@q: the Tx queue to reclaim completed descriptors from
178  *
179  *	Reclaims Tx descriptors that the SGE has indicated it has processed,
180  *	and frees the associated buffers if possible.  Called with the Tx
181  *	queue's lock held.
182  */
183 static __inline int
184 reclaim_completed_tx(adapter_t *adapter, struct sge_txq *q, int nbufs, struct mbuf **mvec)
185 {
186 	int reclaimed, reclaim = desc_reclaimable(q);
187 	int n = 0;
188 
189 	mtx_assert(&q->lock, MA_OWNED);
190 
191 	if (reclaim > 0) {
192 		n = free_tx_desc(adapter, q, min(reclaim, nbufs), mvec);
193 		reclaimed = min(reclaim, nbufs);
194 		q->cleaned += reclaimed;
195 		q->in_use -= reclaimed;
196 	}
197 
198 	return (n);
199 }
200 
201 /**
202  *	t3_sge_init - initialize SGE
203  *	@adap: the adapter
204  *	@p: the SGE parameters
205  *
206  *	Performs SGE initialization needed every time after a chip reset.
207  *	We do not initialize any of the queue sets here, instead the driver
208  *	top-level must request those individually.  We also do not enable DMA
209  *	here, that should be done after the queues have been set up.
210  */
211 void
212 t3_sge_init(adapter_t *adap, struct sge_params *p)
213 {
214 	u_int ctrl, ups;
215 
216 	ups = 0; /* = ffs(pci_resource_len(adap->pdev, 2) >> 12); */
217 
218 	ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL |
219 	       F_CQCRDTCTRL |
220 	       V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS |
221 	       V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING;
222 #if SGE_NUM_GENBITS == 1
223 	ctrl |= F_EGRGENCTRL;
224 #endif
225 	if (adap->params.rev > 0) {
226 		if (!(adap->flags & (USING_MSIX | USING_MSI)))
227 			ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ;
228 		ctrl |= F_CQCRDTCTRL | F_AVOIDCQOVFL;
229 	}
230 	t3_write_reg(adap, A_SG_CONTROL, ctrl);
231 	t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) |
232 		     V_LORCQDRBTHRSH(512));
233 	t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10);
234 	t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) |
235 		     V_TIMEOUT(200 * core_ticks_per_usec(adap)));
236 	t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH, 1000);
237 	t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256);
238 	t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000);
239 	t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256);
240 	t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff));
241 	t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024);
242 }
243 
244 
245 /**
246  *	sgl_len - calculates the size of an SGL of the given capacity
247  *	@n: the number of SGL entries
248  *
249  *	Calculates the number of flits needed for a scatter/gather list that
250  *	can hold the given number of entries.
251  */
252 static __inline unsigned int
253 sgl_len(unsigned int n)
254 {
255 	return ((3 * n) / 2 + (n & 1));
256 }
257 
258 /**
259  *	get_imm_packet - return the next ingress packet buffer from a response
260  *	@resp: the response descriptor containing the packet data
261  *
262  *	Return a packet containing the immediate data of the given response.
263  */
264 static __inline int
265 get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct t3_mbuf_hdr *mh)
266 {
267 	struct mbuf *m;
268 	int len;
269 	uint32_t flags = ntohl(resp->flags);
270 	uint8_t sopeop = G_RSPD_SOP_EOP(flags);
271 
272 	/*
273 	 * would be a firmware bug
274 	 */
275 	if (sopeop == RSPQ_NSOP_NEOP || sopeop == RSPQ_SOP)
276 		return (0);
277 
278 	m = m_gethdr(M_NOWAIT, MT_DATA);
279 	len = G_RSPD_LEN(ntohl(resp->len_cq));
280 
281 	if (m) {
282 		MH_ALIGN(m, IMMED_PKT_SIZE);
283 		memcpy(m->m_data, resp->imm_data, IMMED_PKT_SIZE);
284 		m->m_len = len;
285 
286 		switch (sopeop) {
287 		case RSPQ_SOP_EOP:
288 			mh->mh_head = mh->mh_tail = m;
289 			m->m_pkthdr.len = len;
290 			m->m_flags |= M_PKTHDR;
291 			break;
292 		case RSPQ_EOP:
293 			m->m_flags &= ~M_PKTHDR;
294 			mh->mh_head->m_pkthdr.len += len;
295 			mh->mh_tail->m_next = m;
296 			mh->mh_tail = m;
297 			break;
298 		}
299 	}
300 	return (m != NULL);
301 }
302 
303 
304 static __inline u_int
305 flits_to_desc(u_int n)
306 {
307 	return (flit_desc_map[n]);
308 }
309 
310 void
311 t3_sge_err_intr_handler(adapter_t *adapter)
312 {
313 	unsigned int v, status;
314 
315 
316 	status = t3_read_reg(adapter, A_SG_INT_CAUSE);
317 
318 	if (status & F_RSPQCREDITOVERFOW)
319 		CH_ALERT(adapter, "SGE response queue credit overflow\n");
320 
321 	if (status & F_RSPQDISABLED) {
322 		v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS);
323 
324 		CH_ALERT(adapter,
325 			 "packet delivered to disabled response queue (0x%x)\n",
326 			 (v >> S_RSPQ0DISABLED) & 0xff);
327 	}
328 
329 	t3_write_reg(adapter, A_SG_INT_CAUSE, status);
330 	if (status & (F_RSPQCREDITOVERFOW | F_RSPQDISABLED))
331 		t3_fatal_err(adapter);
332 }
333 
334 void
335 t3_sge_prep(adapter_t *adap, struct sge_params *p)
336 {
337 	int i;
338 
339 	/* XXX Does ETHER_ALIGN need to be accounted for here? */
340 	p->max_pkt_size = MJUM16BYTES - sizeof(struct cpl_rx_data);
341 
342 	for (i = 0; i < SGE_QSETS; ++i) {
343 		struct qset_params *q = p->qset + i;
344 
345 		q->polling = adap->params.rev > 0;
346 
347 		if (adap->flags & USING_MSIX)
348 			q->coalesce_nsecs = 6000;
349 		else
350 			q->coalesce_nsecs = 3500;
351 
352 		q->rspq_size = RSPQ_Q_SIZE;
353 		q->fl_size = FL_Q_SIZE;
354 		q->jumbo_size = JUMBO_Q_SIZE;
355 		q->txq_size[TXQ_ETH] = TX_ETH_Q_SIZE;
356 		q->txq_size[TXQ_OFLD] = 1024;
357 		q->txq_size[TXQ_CTRL] = 256;
358 		q->cong_thres = 0;
359 	}
360 }
361 
362 int
363 t3_sge_alloc(adapter_t *sc)
364 {
365 
366 	/* The parent tag. */
367 	if (bus_dma_tag_create( NULL,			/* parent */
368 				1, 0,			/* algnmnt, boundary */
369 				BUS_SPACE_MAXADDR,	/* lowaddr */
370 				BUS_SPACE_MAXADDR,	/* highaddr */
371 				NULL, NULL,		/* filter, filterarg */
372 				BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
373 				BUS_SPACE_UNRESTRICTED, /* nsegments */
374 				BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
375 				0,			/* flags */
376 				NULL, NULL,		/* lock, lockarg */
377 				&sc->parent_dmat)) {
378 		device_printf(sc->dev, "Cannot allocate parent DMA tag\n");
379 		return (ENOMEM);
380 	}
381 
382 	/*
383 	 * DMA tag for normal sized RX frames
384 	 */
385 	if (bus_dma_tag_create(sc->parent_dmat, MCLBYTES, 0, BUS_SPACE_MAXADDR,
386 		BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, 1,
387 		MCLBYTES, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_dmat)) {
388 		device_printf(sc->dev, "Cannot allocate RX DMA tag\n");
389 		return (ENOMEM);
390 	}
391 
392 	/*
393 	 * DMA tag for jumbo sized RX frames.
394 	 */
395 	if (bus_dma_tag_create(sc->parent_dmat, MJUMPAGESIZE, 0, BUS_SPACE_MAXADDR,
396 		BUS_SPACE_MAXADDR, NULL, NULL, MJUMPAGESIZE, 1, MJUMPAGESIZE,
397 		BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_jumbo_dmat)) {
398 		device_printf(sc->dev, "Cannot allocate RX jumbo DMA tag\n");
399 		return (ENOMEM);
400 	}
401 
402 	/*
403 	 * DMA tag for TX frames.
404 	 */
405 	if (bus_dma_tag_create(sc->parent_dmat, 1, 0, BUS_SPACE_MAXADDR,
406 		BUS_SPACE_MAXADDR, NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
407 		TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
408 		NULL, NULL, &sc->tx_dmat)) {
409 		device_printf(sc->dev, "Cannot allocate TX DMA tag\n");
410 		return (ENOMEM);
411 	}
412 
413 	return (0);
414 }
415 
416 int
417 t3_sge_free(struct adapter * sc)
418 {
419 
420 	if (sc->tx_dmat != NULL)
421 		bus_dma_tag_destroy(sc->tx_dmat);
422 
423 	if (sc->rx_jumbo_dmat != NULL)
424 		bus_dma_tag_destroy(sc->rx_jumbo_dmat);
425 
426 	if (sc->rx_dmat != NULL)
427 		bus_dma_tag_destroy(sc->rx_dmat);
428 
429 	if (sc->parent_dmat != NULL)
430 		bus_dma_tag_destroy(sc->parent_dmat);
431 
432 	return (0);
433 }
434 
435 void
436 t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p)
437 {
438 
439 	qs->rspq.holdoff_tmr = max(p->coalesce_nsecs/100, 1U);
440 	qs->rspq.polling = 0 /* p->polling */;
441 }
442 
443 
444 /**
445  *	refill_fl - refill an SGE free-buffer list
446  *	@sc: the controller softc
447  *	@q: the free-list to refill
448  *	@n: the number of new buffers to allocate
449  *
450  *	(Re)populate an SGE free-buffer list with up to @n new packet buffers.
451  *	The caller must assure that @n does not exceed the queue's capacity.
452  */
453 static void
454 refill_fl(adapter_t *sc, struct sge_fl *q, int n)
455 {
456 	bus_dma_segment_t seg;
457 	struct rx_sw_desc *sd = &q->sdesc[q->pidx];
458 	struct rx_desc *d = &q->desc[q->pidx];
459 	struct mbuf *m;
460 	int err, nsegs;
461 
462 	while (n--) {
463 		m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, q->buf_size);
464 
465 		if (m == NULL) {
466 			log(LOG_WARNING, "Failed to allocate mbuf\n");
467 			goto done;
468 		}
469 
470 		if ((sd->flags & RX_SW_DESC_MAP_CREATED) == 0) {
471 			if ((err = bus_dmamap_create(sc->rx_jumbo_dmat, 0, &sd->map))) {
472 				log(LOG_WARNING, "bus_dmamap_create failed %d\n", err);
473 				goto done;
474 			}
475 			sd->flags |= RX_SW_DESC_MAP_CREATED;
476 		}
477 		sd->flags |= RX_SW_DESC_INUSE;
478 
479 		m->m_pkthdr.len = m->m_len = q->buf_size;
480 		err = bus_dmamap_load_mbuf_sg(sc->rx_jumbo_dmat, sd->map, m, &seg,
481 		    &nsegs, BUS_DMA_NOWAIT);
482 		if (err != 0) {
483 			log(LOG_WARNING, "failure in refill_fl %d\n", err);
484 			m_freem(m);
485 			return;
486 		}
487 
488 		sd->m = m;
489 		d->addr_lo = htobe32(seg.ds_addr & 0xffffffff);
490 		d->addr_hi = htobe32(((uint64_t)seg.ds_addr >>32) & 0xffffffff);
491 		d->len_gen = htobe32(V_FLD_GEN1(q->gen));
492 		d->gen2 = htobe32(V_FLD_GEN2(q->gen));
493 
494 		d++;
495 		sd++;
496 
497 		if (++q->pidx == q->size) {
498 			q->pidx = 0;
499 			q->gen ^= 1;
500 			sd = q->sdesc;
501 			d = q->desc;
502 		}
503 		q->credits++;
504 	}
505 
506 done:
507 	t3_write_reg(sc, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
508 }
509 
510 
511 /**
512  *	free_rx_bufs - free the Rx buffers on an SGE free list
513  *	@sc: the controle softc
514  *	@q: the SGE free list to clean up
515  *
516  *	Release the buffers on an SGE free-buffer Rx queue.  HW fetching from
517  *	this queue should be stopped before calling this function.
518  */
519 static void
520 free_rx_bufs(adapter_t *sc, struct sge_fl *q)
521 {
522 	u_int cidx = q->cidx;
523 
524 	while (q->credits--) {
525 		struct rx_sw_desc *d = &q->sdesc[cidx];
526 
527 		if (d->flags & RX_SW_DESC_INUSE) {
528 			bus_dmamap_unload(sc->rx_jumbo_dmat, d->map);
529 			bus_dmamap_destroy(sc->rx_jumbo_dmat, d->map);
530 			m_freem(d->m);
531 		}
532 		d->m = NULL;
533 		if (++cidx == q->size)
534 			cidx = 0;
535 	}
536 }
537 
538 static __inline void
539 __refill_fl(adapter_t *adap, struct sge_fl *fl)
540 {
541 	refill_fl(adap, fl, min(16U, fl->size - fl->credits));
542 }
543 
544 static void
545 alloc_ring_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
546 {
547 	uint32_t *addr;
548 
549 	addr = arg;
550 	*addr = segs[0].ds_addr;
551 }
552 
553 static int
554 alloc_ring(adapter_t *sc, size_t nelem, size_t elem_size, size_t sw_size,
555 	   bus_addr_t *phys, void *desc, void *sdesc, bus_dma_tag_t *tag,
556 	   bus_dmamap_t *map)
557 {
558 	size_t len = nelem * elem_size;
559 	void *s = NULL;
560 	void *p = NULL;
561 	int err;
562 
563 	if ((err = bus_dma_tag_create(sc->parent_dmat, PAGE_SIZE, 0,
564 				      BUS_SPACE_MAXADDR_32BIT,
565 				      BUS_SPACE_MAXADDR, NULL, NULL, len, 1,
566 				      len, 0, NULL, NULL, tag)) != 0) {
567 		device_printf(sc->dev, "Cannot allocate descriptor tag\n");
568 		return (ENOMEM);
569 	}
570 
571 	if ((err = bus_dmamem_alloc(*tag, (void **)&p, BUS_DMA_NOWAIT,
572 				    map)) != 0) {
573 		device_printf(sc->dev, "Cannot allocate descriptor memory\n");
574 		return (ENOMEM);
575 	}
576 
577 	bus_dmamap_load(*tag, *map, p, len, alloc_ring_cb, phys, 0);
578 	bzero(p, len);
579 	*(void **)desc = p;
580 
581 	if (sw_size) {
582 		len = nelem * sw_size;
583 		s = malloc(len, M_DEVBUF, M_WAITOK);
584 		bzero(s, len);
585 		*(void **)sdesc = s;
586 	}
587 	return (0);
588 }
589 
590 static void
591 sge_slow_intr_handler(void *arg, int ncount)
592 {
593 	adapter_t *sc = arg;
594 
595 	t3_slow_intr_handler(sc);
596 }
597 
598 static void
599 sge_timer_cb(void *arg)
600 {
601 	adapter_t *sc = arg;
602 	struct sge_qset *qs;
603 	struct sge_txq  *txq;
604 	int i, j;
605 	int reclaim_eth, reclaim_ofl, refill_rx;
606 
607 	for (i = 0; i < sc->params.nports; i++)
608 		for (j = 0; j < sc->port[i].nqsets; j++) {
609 			qs = &sc->sge.qs[i + j];
610 			txq = &qs->txq[0];
611 			reclaim_eth = txq[TXQ_ETH].processed - txq[TXQ_ETH].cleaned;
612 			reclaim_ofl = txq[TXQ_OFLD].processed - txq[TXQ_OFLD].cleaned;
613 			refill_rx = ((qs->fl[0].credits < qs->fl[0].size) ||
614 			    (qs->fl[1].credits < qs->fl[1].size));
615 			if (reclaim_eth || reclaim_ofl || refill_rx) {
616 				taskqueue_enqueue(sc->tq, &sc->timer_reclaim_task);
617 				goto done;
618 			}
619 		}
620 done:
621 	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
622 }
623 
624 /*
625  * This is meant to be a catch-all function to keep sge state private
626  * to sge.c
627  *
628  */
629 int
630 t3_sge_init_sw(adapter_t *sc)
631 {
632 
633 	callout_init(&sc->sge_timer_ch, CALLOUT_MPSAFE);
634 	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
635 	TASK_INIT(&sc->timer_reclaim_task, 0, sge_timer_reclaim, sc);
636 	TASK_INIT(&sc->slow_intr_task, 0, sge_slow_intr_handler, sc);
637 	return (0);
638 }
639 
640 void
641 t3_sge_deinit_sw(adapter_t *sc)
642 {
643 	callout_drain(&sc->sge_timer_ch);
644 	if (sc->tq) {
645 		taskqueue_drain(sc->tq, &sc->timer_reclaim_task);
646 		taskqueue_drain(sc->tq, &sc->slow_intr_task);
647 	}
648 }
649 
650 /**
651  *	refill_rspq - replenish an SGE response queue
652  *	@adapter: the adapter
653  *	@q: the response queue to replenish
654  *	@credits: how many new responses to make available
655  *
656  *	Replenishes a response queue by making the supplied number of responses
657  *	available to HW.
658  */
659 static __inline void
660 refill_rspq(adapter_t *sc, const struct sge_rspq *q, u_int credits)
661 {
662 
663 	/* mbufs are allocated on demand when a rspq entry is processed. */
664 	t3_write_reg(sc, A_SG_RSPQ_CREDIT_RETURN,
665 		     V_RSPQ(q->cntxt_id) | V_CREDITS(credits));
666 }
667 
668 
669 static void
670 sge_timer_reclaim(void *arg, int ncount)
671 {
672 	adapter_t *sc = arg;
673 	int i, nqsets = 0;
674 	struct sge_qset *qs;
675 	struct sge_txq *txq;
676 	struct mtx *lock;
677 	struct mbuf *m_vec[TX_CLEAN_MAX_DESC];
678 	int n, reclaimable;
679 	/*
680 	 * XXX assuming these quantities are allowed to change during operation
681 	 */
682 	for (i = 0; i < sc->params.nports; i++)
683 		nqsets += sc->port[i].nqsets;
684 
685 	for (i = 0; i < nqsets; i++) {
686 		qs = &sc->sge.qs[i];
687 		txq = &qs->txq[TXQ_ETH];
688 		reclaimable = desc_reclaimable(txq);
689 		if (reclaimable > 0) {
690 			mtx_lock(&txq->lock);
691 			n = reclaim_completed_tx(sc, txq, TX_CLEAN_MAX_DESC, m_vec);
692 			mtx_unlock(&txq->lock);
693 
694 			for (i = 0; i < n; i++) {
695 				m_freem(m_vec[i]);
696 			}
697 		}
698 
699 		txq = &qs->txq[TXQ_OFLD];
700 		reclaimable = desc_reclaimable(txq);
701 		if (reclaimable > 0) {
702 			mtx_lock(&txq->lock);
703 			n = reclaim_completed_tx(sc, txq, TX_CLEAN_MAX_DESC, m_vec);
704 			mtx_unlock(&txq->lock);
705 
706 			for (i = 0; i < n; i++) {
707 				m_freem(m_vec[i]);
708 			}
709 		}
710 
711 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
712 			    &sc->sge.qs[0].rspq.lock;
713 
714 		if (mtx_trylock(lock)) {
715 			/* XXX currently assume that we are *NOT* polling */
716 			uint32_t status = t3_read_reg(sc, A_SG_RSPQ_FL_STATUS);
717 
718 			if (qs->fl[0].credits < qs->fl[0].size - 16)
719 				__refill_fl(sc, &qs->fl[0]);
720 			if (qs->fl[1].credits < qs->fl[1].size - 16)
721 				__refill_fl(sc, &qs->fl[1]);
722 
723 			if (status & (1 << qs->rspq.cntxt_id)) {
724 				if (qs->rspq.credits) {
725 					refill_rspq(sc, &qs->rspq, 1);
726 					qs->rspq.credits--;
727 					t3_write_reg(sc, A_SG_RSPQ_FL_STATUS,
728 					    1 << qs->rspq.cntxt_id);
729 				}
730 			}
731 			mtx_unlock(lock);
732 		}
733 	}
734 }
735 
736 /**
737  *	init_qset_cntxt - initialize an SGE queue set context info
738  *	@qs: the queue set
739  *	@id: the queue set id
740  *
741  *	Initializes the TIDs and context ids for the queues of a queue set.
742  */
743 static void
744 init_qset_cntxt(struct sge_qset *qs, u_int id)
745 {
746 
747 	qs->rspq.cntxt_id = id;
748 	qs->fl[0].cntxt_id = 2 * id;
749 	qs->fl[1].cntxt_id = 2 * id + 1;
750 	qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id;
751 	qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id;
752 	qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id;
753 	qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id;
754 	qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id;
755 }
756 
757 
758 static void
759 txq_prod(struct sge_txq *txq, unsigned int ndesc, struct txq_state *txqs)
760 {
761 	txq->in_use += ndesc;
762 	/*
763 	 * XXX we don't handle stopping of queue
764 	 * presumably start handles this when we bump against the end
765 	 */
766 	txqs->gen = txq->gen;
767 	txq->unacked += ndesc;
768 	txqs->compl = (txq->unacked & 8) << (S_WR_COMPL - 3);
769 	txq->unacked &= 7;
770 	txqs->pidx = txq->pidx;
771 	txq->pidx += ndesc;
772 
773 	if (txq->pidx >= txq->size) {
774 		txq->pidx -= txq->size;
775 		txq->gen ^= 1;
776 	}
777 
778 }
779 
780 /**
781  *	calc_tx_descs - calculate the number of Tx descriptors for a packet
782  *	@m: the packet mbufs
783  *      @nsegs: the number of segments
784  *
785  * 	Returns the number of Tx descriptors needed for the given Ethernet
786  * 	packet.  Ethernet packets require addition of WR and CPL headers.
787  */
788 static __inline unsigned int
789 calc_tx_descs(const struct mbuf *m, int nsegs)
790 {
791 	unsigned int flits;
792 
793 	if (m->m_pkthdr.len <= WR_LEN - sizeof(struct cpl_tx_pkt))
794 		return 1;
795 
796 	flits = sgl_len(nsegs) + 2;
797 #ifdef TSO_SUPPORTED
798 	if (m->m_pkthdr.tso_segsz)
799 		flits++;
800 #endif
801 	return flits_to_desc(flits);
802 }
803 
804 static __inline unsigned int
805 busdma_map_mbufs(struct mbuf **m, adapter_t *sc, struct tx_sw_desc *stx,
806     bus_dma_segment_t *segs, int *nsegs)
807 {
808 	struct mbuf *m0, *mtmp;
809 	int err, pktlen;
810 
811 	m0 = *m;
812 	pktlen = m0->m_pkthdr.len;
813 	err = bus_dmamap_load_mbuf_sg(sc->tx_dmat, stx->map, m0, segs, nsegs, 0);
814 	if (err) {
815 		int n = 0;
816 		mtmp = m0;
817 		while(mtmp) {
818 			n++;
819 			mtmp = mtmp->m_next;
820 		}
821 #ifdef DEBUG
822 		printf("map_mbufs: bus_dmamap_load_mbuf_sg failed with %d - pkthdr.len==%d nmbufs=%d\n",
823 		    err, m0->m_pkthdr.len, n);
824 #endif
825 	}
826 
827 
828 	if (err == EFBIG) {
829 		/* Too many segments, try to defrag */
830 		m0 = m_defrag(m0, M_NOWAIT);
831 		if (m0 == NULL) {
832 			m_freem(*m);
833 			*m = NULL;
834 			return (ENOBUFS);
835 		}
836 		*m = m0;
837 		err = bus_dmamap_load_mbuf_sg(sc->tx_dmat, stx->map, m0, segs, nsegs, 0);
838 	}
839 
840 	if (err == ENOMEM) {
841 		return (err);
842 	}
843 
844 	if (err) {
845 		if (cxgb_debug)
846 			printf("map failure err=%d pktlen=%d\n", err, pktlen);
847 		m_freem(m0);
848 		*m = NULL;
849 		return (err);
850 	}
851 
852 	bus_dmamap_sync(sc->tx_dmat, stx->map, BUS_DMASYNC_PREWRITE);
853 	stx->flags |= TX_SW_DESC_MAPPED;
854 
855 	return (0);
856 }
857 
858 /**
859  *	make_sgl - populate a scatter/gather list for a packet
860  *	@sgp: the SGL to populate
861  *	@segs: the packet dma segments
862  *	@nsegs: the number of segments
863  *
864  *	Generates a scatter/gather list for the buffers that make up a packet
865  *	and returns the SGL size in 8-byte words.  The caller must size the SGL
866  *	appropriately.
867  */
868 static __inline void
869 make_sgl(struct sg_ent *sgp, bus_dma_segment_t *segs, int nsegs)
870 {
871 	int i, idx;
872 
873 	for (idx = 0, i = 0; i < nsegs; i++, idx ^= 1) {
874 		if (i && idx == 0)
875 			++sgp;
876 
877 		sgp->len[idx] = htobe32(segs[i].ds_len);
878 		sgp->addr[idx] = htobe64(segs[i].ds_addr);
879 	}
880 
881 	if (idx)
882 		sgp->len[idx] = 0;
883 }
884 
885 /**
886  *	check_ring_tx_db - check and potentially ring a Tx queue's doorbell
887  *	@adap: the adapter
888  *	@q: the Tx queue
889  *
890  *	Ring the doorbel if a Tx queue is asleep.  There is a natural race,
891  *	where the HW is going to sleep just after we checked, however,
892  *	then the interrupt handler will detect the outstanding TX packet
893  *	and ring the doorbell for us.
894  *
895  *	When GTS is disabled we unconditionally ring the doorbell.
896  */
897 static __inline void
898 check_ring_tx_db(adapter_t *adap, struct sge_txq *q)
899 {
900 #if USE_GTS
901 	clear_bit(TXQ_LAST_PKT_DB, &q->flags);
902 	if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) {
903 		set_bit(TXQ_LAST_PKT_DB, &q->flags);
904 #ifdef T3_TRACE
905 		T3_TRACE1(adap->tb[q->cntxt_id & 7], "doorbell Tx, cntxt %d",
906 			  q->cntxt_id);
907 #endif
908 		t3_write_reg(adap, A_SG_KDOORBELL,
909 			     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
910 	}
911 #else
912 	wmb();            /* write descriptors before telling HW */
913 	t3_write_reg(adap, A_SG_KDOORBELL,
914 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
915 #endif
916 }
917 
918 static __inline void
919 wr_gen2(struct tx_desc *d, unsigned int gen)
920 {
921 #if SGE_NUM_GENBITS == 2
922 	d->flit[TX_DESC_FLITS - 1] = htobe64(gen);
923 #endif
924 }
925 
926 /* sizeof(*eh) + sizeof(*vhdr) + sizeof(*ip) + sizeof(*tcp) */
927 #define TCPPKTHDRSIZE (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + 20 + 20)
928 
929 int
930 t3_encap(struct port_info *p, struct mbuf **m)
931 {
932 	adapter_t *sc;
933 	struct mbuf *m0;
934 	struct sge_qset *qs;
935 	struct sge_txq *txq;
936 	struct tx_sw_desc *stx;
937 	struct txq_state txqs;
938 	unsigned int nsegs, ndesc, flits, cntrl, mlen, tso_info;
939 	int err;
940 
941 	struct work_request_hdr *wrp;
942 	struct tx_sw_desc *txsd;
943 	struct sg_ent *sgp, sgl[TX_MAX_SEGS / 2 + 1];
944 	bus_dma_segment_t segs[TX_MAX_SEGS];
945 	uint32_t wr_hi, wr_lo, sgl_flits;
946 
947 	struct tx_desc *txd;
948 	struct cpl_tx_pkt *cpl;
949 
950 	DPRINTF("t3_encap ");
951 	m0 = *m;
952 	sc = p->adapter;
953 	qs = &sc->sge.qs[p->first_qset];
954 	txq = &qs->txq[TXQ_ETH];
955 	stx = &txq->sdesc[txq->pidx];
956 	txd = &txq->desc[txq->pidx];
957 	cpl = (struct cpl_tx_pkt *)txd;
958 	mlen = m0->m_pkthdr.len;
959 	cpl->len = htonl(mlen | 0x80000000);
960 
961 	DPRINTF("mlen=%d\n", mlen);
962 	/*
963 	 * XXX handle checksum, TSO, and VLAN here
964 	 *
965 	 */
966 	cntrl = V_TXPKT_INTF(p->port);
967 
968 	/*
969 	 * XXX need to add VLAN support for 6.x
970 	 */
971 #ifdef VLAN_SUPPORTED
972 	if (m0->m_flags & M_VLANTAG)
973 		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
974 
975 	tso_info = V_LSO_MSS(m0->m_pkthdr.tso_segsz);
976 #else
977 	tso_info = 0;
978 #endif
979 	if (tso_info) {
980 		int eth_type;
981 		struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *) cpl;
982 		struct ip *ip;
983 		struct tcphdr *tcp;
984 		uint8_t *pkthdr, tmp[TCPPKTHDRSIZE]; /* is this too large for the stack? */
985 
986 		txd->flit[2] = 0;
987 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO);
988 		hdr->cntrl = htonl(cntrl);
989 
990 		if (__predict_false(m0->m_len < TCPPKTHDRSIZE)) {
991 			pkthdr = &tmp[0];
992 			m_copydata(m0, 0, TCPPKTHDRSIZE, pkthdr);
993 		} else {
994 			pkthdr = m0->m_data;
995 		}
996 
997 		if (__predict_false(m0->m_flags & M_VLANTAG)) {
998 			eth_type = CPL_ETH_II_VLAN;
999 			ip = (struct ip *)(pkthdr + ETHER_HDR_LEN +
1000 			    ETHER_VLAN_ENCAP_LEN);
1001 		} else {
1002 			eth_type = CPL_ETH_II;
1003 			ip = (struct ip *)(pkthdr + ETHER_HDR_LEN);
1004 		}
1005 		tcp = (struct tcphdr *)((uint8_t *)ip +
1006 		    sizeof(*ip));
1007 
1008 		tso_info |= V_LSO_ETH_TYPE(eth_type) |
1009 			    V_LSO_IPHDR_WORDS(ip->ip_hl) |
1010 			    V_LSO_TCPHDR_WORDS(tcp->th_off);
1011 		hdr->lso_info = htonl(tso_info);
1012 
1013 		flits = 3;
1014 	} else {
1015 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1016 		cpl->cntrl = htonl(cntrl);
1017 
1018 		if (mlen <= WR_LEN - sizeof(*cpl)) {
1019 			txq_prod(txq, 1, &txqs);
1020 			txq->sdesc[txqs.pidx].m = m0;
1021 
1022 			if (m0->m_len == m0->m_pkthdr.len)
1023 				memcpy(&txd->flit[2], m0->m_data, mlen);
1024 			else
1025 				m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[2]);
1026 
1027 			flits = (mlen + 7) / 8 + 2;
1028 			cpl->wr.wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1029 					  V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1030 					  F_WR_SOP | F_WR_EOP | txqs.compl);
1031 			wmb();
1032 			cpl->wr.wr_lo = htonl(V_WR_LEN(flits) |
1033 			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1034 
1035 			wr_gen2(txd, txqs.gen);
1036 			check_ring_tx_db(sc, txq);
1037 			return (0);
1038 		}
1039 		flits = 2;
1040 	}
1041 
1042 	wrp = (struct work_request_hdr *)txd;
1043 
1044 	if ((err = busdma_map_mbufs(m, sc, stx, segs, &nsegs)) != 0) {
1045 		return (err);
1046 	}
1047 	m0 = *m;
1048 	ndesc = calc_tx_descs(m0, nsegs);
1049 
1050 	sgp = (ndesc == 1) ? (struct sg_ent *)&txd->flit[flits] : &sgl[0];
1051 	make_sgl(sgp, segs, nsegs);
1052 
1053 	sgl_flits = sgl_len(nsegs);
1054 
1055 	DPRINTF("make_sgl success nsegs==%d ndesc==%d\n", nsegs, ndesc);
1056 	txq_prod(txq, ndesc, &txqs);
1057 	txsd = &txq->sdesc[txqs.pidx];
1058 	wr_hi = htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1059 	wr_lo = htonl(V_WR_TID(txq->token));
1060 	txsd->m = m0;
1061 
1062 	if (__predict_true(ndesc == 1)) {
1063 		wrp->wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1064 		    V_WR_SGLSFLT(flits)) | wr_hi;
1065 		wmb();
1066 		wrp->wr_lo = htonl(V_WR_LEN(flits + sgl_flits) |
1067 		    V_WR_GEN(txqs.gen)) | wr_lo;
1068 		/* XXX gen? */
1069 		wr_gen2(txd, txqs.gen);
1070 	} else {
1071 		unsigned int ogen = txqs.gen;
1072 		const uint64_t *fp = (const uint64_t *)sgl;
1073 		struct work_request_hdr *wp = wrp;
1074 
1075 		/* XXX - CHECK ME */
1076 		wrp->wr_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) |
1077 		    V_WR_SGLSFLT(flits)) | wr_hi;
1078 
1079 		while (sgl_flits) {
1080 			unsigned int avail = WR_FLITS - flits;
1081 
1082 			if (avail > sgl_flits)
1083 				avail = sgl_flits;
1084 			memcpy(&txd->flit[flits], fp, avail * sizeof(*fp));
1085 			sgl_flits -= avail;
1086 			ndesc--;
1087 			if (!sgl_flits)
1088 				break;
1089 
1090 			fp += avail;
1091 			txd++;
1092 			txsd++;
1093 			if (++txqs.pidx == txq->size) {
1094 				txqs.pidx = 0;
1095 				txqs.gen ^= 1;
1096 				txd = txq->desc;
1097 				txsd = txq->sdesc;
1098 			}
1099 
1100 			/*
1101 			 * when the head of the mbuf chain
1102 			 * is freed all clusters will be freed
1103 			 * with it
1104 			 */
1105 			txsd->m = NULL;
1106 			wrp = (struct work_request_hdr *)txd;
1107 			wrp->wr_hi = htonl(V_WR_DATATYPE(1) |
1108 			    V_WR_SGLSFLT(1)) | wr_hi;
1109 			wrp->wr_lo = htonl(V_WR_LEN(min(WR_FLITS,
1110 				    sgl_flits + 1)) |
1111 			    V_WR_GEN(txqs.gen)) | wr_lo;
1112 			wr_gen2(txd, txqs.gen);
1113 			flits = 1;
1114 		}
1115 #ifdef WHY
1116 		skb->priority = pidx;
1117 #endif
1118 		wrp->wr_hi |= htonl(F_WR_EOP);
1119 		wmb();
1120 		wp->wr_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo;
1121 		wr_gen2((struct tx_desc *)wp, ogen);
1122 	}
1123 	check_ring_tx_db(p->adapter, txq);
1124 
1125 	return (0);
1126 }
1127 
1128 
1129 /**
1130  *	write_imm - write a packet into a Tx descriptor as immediate data
1131  *	@d: the Tx descriptor to write
1132  *	@m: the packet
1133  *	@len: the length of packet data to write as immediate data
1134  *	@gen: the generation bit value to write
1135  *
1136  *	Writes a packet as immediate data into a Tx descriptor.  The packet
1137  *	contains a work request at its beginning.  We must write the packet
1138  *	carefully so the SGE doesn't read accidentally before it's written in
1139  *	its entirety.
1140  */
1141 static __inline void write_imm(struct tx_desc *d, struct mbuf *m,
1142 			     unsigned int len, unsigned int gen)
1143 {
1144 	struct work_request_hdr *from = (struct work_request_hdr *)m->m_data;
1145 	struct work_request_hdr *to = (struct work_request_hdr *)d;
1146 
1147 	memcpy(&to[1], &from[1], len - sizeof(*from));
1148 	to->wr_hi = from->wr_hi | htonl(F_WR_SOP | F_WR_EOP |
1149 					V_WR_BCNTLFLT(len & 7));
1150 	wmb();
1151 	to->wr_lo = from->wr_lo | htonl(V_WR_GEN(gen) |
1152 					V_WR_LEN((len + 7) / 8));
1153 	wr_gen2(d, gen);
1154 	m_freem(m);
1155 }
1156 
1157 /**
1158  *	check_desc_avail - check descriptor availability on a send queue
1159  *	@adap: the adapter
1160  *	@q: the TX queue
1161  *	@m: the packet needing the descriptors
1162  *	@ndesc: the number of Tx descriptors needed
1163  *	@qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL)
1164  *
1165  *	Checks if the requested number of Tx descriptors is available on an
1166  *	SGE send queue.  If the queue is already suspended or not enough
1167  *	descriptors are available the packet is queued for later transmission.
1168  *	Must be called with the Tx queue locked.
1169  *
1170  *	Returns 0 if enough descriptors are available, 1 if there aren't
1171  *	enough descriptors and the packet has been queued, and 2 if the caller
1172  *	needs to retry because there weren't enough descriptors at the
1173  *	beginning of the call but some freed up in the mean time.
1174  */
1175 static __inline int
1176 check_desc_avail(adapter_t *adap, struct sge_txq *q,
1177 				   struct mbuf *m, unsigned int ndesc,
1178 				   unsigned int qid)
1179 {
1180 	/*
1181 	 * XXX We currently only use this for checking the control queue
1182 	 * the control queue is only used for binding qsets which happens
1183 	 * at init time so we are guaranteed enough descriptors
1184 	 */
1185 #if 0
1186 	if (__predict_false(!skb_queue_empty(&q->sendq))) {
1187 addq_exit:	__skb_queue_tail(&q->sendq, skb);
1188 		return 1;
1189 	}
1190 	if (__predict_false(q->size - q->in_use < ndesc)) {
1191 
1192 		struct sge_qset *qs = txq_to_qset(q, qid);
1193 
1194 		set_bit(qid, &qs->txq_stopped);
1195 		smp_mb__after_clear_bit();
1196 
1197 		if (should_restart_tx(q) &&
1198 		    test_and_clear_bit(qid, &qs->txq_stopped))
1199 			return 2;
1200 
1201 		q->stops++;
1202 		goto addq_exit;
1203 	}
1204 #endif
1205 	return 0;
1206 }
1207 
1208 
1209 /**
1210  *	reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
1211  *	@q: the SGE control Tx queue
1212  *
1213  *	This is a variant of reclaim_completed_tx() that is used for Tx queues
1214  *	that send only immediate data (presently just the control queues) and
1215  *	thus do not have any sk_buffs to release.
1216  */
1217 static __inline void
1218 reclaim_completed_tx_imm(struct sge_txq *q)
1219 {
1220 	unsigned int reclaim = q->processed - q->cleaned;
1221 
1222 	mtx_assert(&q->lock, MA_OWNED);
1223 
1224 	q->in_use -= reclaim;
1225 	q->cleaned += reclaim;
1226 }
1227 
1228 static __inline int
1229 immediate(const struct mbuf *m)
1230 {
1231 	return m->m_len <= WR_LEN  && m->m_pkthdr.len <= WR_LEN ;
1232 }
1233 
1234 /**
1235  *	ctrl_xmit - send a packet through an SGE control Tx queue
1236  *	@adap: the adapter
1237  *	@q: the control queue
1238  *	@m: the packet
1239  *
1240  *	Send a packet through an SGE control Tx queue.  Packets sent through
1241  *	a control queue must fit entirely as immediate data in a single Tx
1242  *	descriptor and have no page fragments.
1243  */
1244 static int
1245 ctrl_xmit(adapter_t *adap, struct sge_txq *q, struct mbuf *m)
1246 {
1247 	int ret;
1248 	struct work_request_hdr *wrp = (struct work_request_hdr *)m->m_data;
1249 
1250 	if (__predict_false(!immediate(m))) {
1251 		m_freem(m);
1252 		return 0;
1253 	}
1254 
1255 	wrp->wr_hi |= htonl(F_WR_SOP | F_WR_EOP);
1256 	wrp->wr_lo = htonl(V_WR_TID(q->token));
1257 
1258 	mtx_lock(&q->lock);
1259 again:	reclaim_completed_tx_imm(q);
1260 
1261 	ret = check_desc_avail(adap, q, m, 1, TXQ_CTRL);
1262 	if (__predict_false(ret)) {
1263 		if (ret == 1) {
1264 			mtx_unlock(&q->lock);
1265 			return (-1);
1266 		}
1267 		goto again;
1268 	}
1269 
1270 	write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
1271 
1272 	q->in_use++;
1273 	if (++q->pidx >= q->size) {
1274 		q->pidx = 0;
1275 		q->gen ^= 1;
1276 	}
1277 	mtx_unlock(&q->lock);
1278 	wmb();
1279 	t3_write_reg(adap, A_SG_KDOORBELL,
1280 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1281 	return (0);
1282 }
1283 
1284 #ifdef RESTART_CTRLQ
1285 /**
1286  *	restart_ctrlq - restart a suspended control queue
1287  *	@qs: the queue set cotaining the control queue
1288  *
1289  *	Resumes transmission on a suspended Tx control queue.
1290  */
1291 static void
1292 restart_ctrlq(unsigned long data)
1293 {
1294 	struct mbuf *m;
1295 	struct sge_qset *qs = (struct sge_qset *)data;
1296 	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1297 	adapter_t *adap = qs->port->adapter;
1298 
1299 	mtx_lock(&q->lock);
1300 again:	reclaim_completed_tx_imm(q);
1301 
1302 	while (q->in_use < q->size &&
1303 	       (skb = __skb_dequeue(&q->sendq)) != NULL) {
1304 
1305 		write_imm(&q->desc[q->pidx], skb, skb->len, q->gen);
1306 
1307 		if (++q->pidx >= q->size) {
1308 			q->pidx = 0;
1309 			q->gen ^= 1;
1310 		}
1311 		q->in_use++;
1312 	}
1313 	if (!skb_queue_empty(&q->sendq)) {
1314 		set_bit(TXQ_CTRL, &qs->txq_stopped);
1315 		smp_mb__after_clear_bit();
1316 
1317 		if (should_restart_tx(q) &&
1318 		    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped))
1319 			goto again;
1320 		q->stops++;
1321 	}
1322 
1323 	mtx_unlock(&q->lock);
1324 	t3_write_reg(adap, A_SG_KDOORBELL,
1325 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1326 }
1327 #endif
1328 
1329 /*
1330  * Send a management message through control queue 0
1331  */
1332 int
1333 t3_mgmt_tx(struct adapter *adap, struct mbuf *m)
1334 {
1335 	return ctrl_xmit(adap, &adap->sge.qs[0].txq[TXQ_CTRL], m);
1336 }
1337 
1338 /**
1339  *	t3_sge_alloc_qset - initialize an SGE queue set
1340  *	@sc: the controller softc
1341  *	@id: the queue set id
1342  *	@nports: how many Ethernet ports will be using this queue set
1343  *	@irq_vec_idx: the IRQ vector index for response queue interrupts
1344  *	@p: configuration parameters for this queue set
1345  *	@ntxq: number of Tx queues for the queue set
1346  *	@pi: port info for queue set
1347  *
1348  *	Allocate resources and initialize an SGE queue set.  A queue set
1349  *	comprises a response queue, two Rx free-buffer queues, and up to 3
1350  *	Tx queues.  The Tx queues are assigned roles in the order Ethernet
1351  *	queue, offload queue, and control queue.
1352  */
1353 int
1354 t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx,
1355 		  const struct qset_params *p, int ntxq, struct port_info *pi)
1356 {
1357 	struct sge_qset *q = &sc->sge.qs[id];
1358 	int i, ret = 0;
1359 
1360 	init_qset_cntxt(q, id);
1361 
1362 	if ((ret = alloc_ring(sc, p->fl_size, sizeof(struct rx_desc),
1363 			      sizeof(struct rx_sw_desc), &q->fl[0].phys_addr,
1364 			      &q->fl[0].desc, &q->fl[0].sdesc,
1365 			      &q->fl[0].desc_tag, &q->fl[0].desc_map)) != 0) {
1366 		printf("error %d from alloc ring fl0\n", ret);
1367 		goto err;
1368 	}
1369 
1370 	if ((ret = alloc_ring(sc, p->jumbo_size, sizeof(struct rx_desc),
1371 			      sizeof(struct rx_sw_desc), &q->fl[1].phys_addr,
1372 			      &q->fl[1].desc, &q->fl[1].sdesc,
1373 			      &q->fl[1].desc_tag, &q->fl[1].desc_map)) != 0) {
1374 		printf("error %d from alloc ring fl1\n", ret);
1375 		goto err;
1376 	}
1377 
1378 	if ((ret = alloc_ring(sc, p->rspq_size, sizeof(struct rsp_desc), 0,
1379 			      &q->rspq.phys_addr, &q->rspq.desc, NULL,
1380 			      &q->rspq.desc_tag, &q->rspq.desc_map)) != 0) {
1381 		printf("error %d from alloc ring rspq\n", ret);
1382 		goto err;
1383 	}
1384 
1385 	for (i = 0; i < ntxq; ++i) {
1386 		/*
1387 		 * The control queue always uses immediate data so does not
1388 		 * need to keep track of any mbufs.
1389 		 * XXX Placeholder for future TOE support.
1390 		 */
1391 		size_t sz = i == TXQ_CTRL ? 0 : sizeof(struct tx_sw_desc);
1392 
1393 		if ((ret = alloc_ring(sc, p->txq_size[i],
1394 				      sizeof(struct tx_desc), sz,
1395 				      &q->txq[i].phys_addr, &q->txq[i].desc,
1396 				      &q->txq[i].sdesc, &q->txq[i].desc_tag,
1397 				      &q->txq[i].desc_map)) != 0) {
1398 			printf("error %d from alloc ring tx %i\n", ret, i);
1399 			goto err;
1400 		}
1401 
1402 		q->txq[i].gen = 1;
1403 		q->txq[i].size = p->txq_size[i];
1404 		mtx_init(&q->txq[i].lock, "t3 txq lock", NULL, MTX_DEF);
1405 	}
1406 
1407 	q->fl[0].gen = q->fl[1].gen = 1;
1408 	q->fl[0].size = p->fl_size;
1409 	q->fl[1].size = p->jumbo_size;
1410 
1411 	q->rspq.gen = 1;
1412 	q->rspq.size = p->rspq_size;
1413 	mtx_init(&q->rspq.lock, "t3 rspq lock", NULL, MTX_DEF);
1414 
1415 	q->txq[TXQ_ETH].stop_thres = nports *
1416 	    flits_to_desc(sgl_len(TX_MAX_SEGS + 1) + 3);
1417 
1418 	q->fl[0].buf_size = MCLBYTES;
1419 	q->fl[1].buf_size = MJUMPAGESIZE;
1420 	q->lro.enabled = lro_default;
1421 
1422 	mtx_lock(&sc->sge.reg_lock);
1423 	ret = -t3_sge_init_rspcntxt(sc, q->rspq.cntxt_id, irq_vec_idx,
1424 				   q->rspq.phys_addr, q->rspq.size,
1425 				   q->fl[0].buf_size, 1, 0);
1426 	if (ret) {
1427 		printf("error %d from t3_sge_init_rspcntxt\n", ret);
1428 		goto err_unlock;
1429 	}
1430 
1431 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
1432 		ret = -t3_sge_init_flcntxt(sc, q->fl[i].cntxt_id, 0,
1433 					  q->fl[i].phys_addr, q->fl[i].size,
1434 					  q->fl[i].buf_size, p->cong_thres, 1,
1435 					  0);
1436 		if (ret) {
1437 			printf("error %d from t3_sge_init_flcntxt for index i=%d\n", ret, i);
1438 			goto err_unlock;
1439 		}
1440 	}
1441 
1442 	ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_ETH].cntxt_id, USE_GTS,
1443 				 SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr,
1444 				 q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token,
1445 				 1, 0);
1446 	if (ret) {
1447 		printf("error %d from t3_sge_init_ecntxt\n", ret);
1448 		goto err_unlock;
1449 	}
1450 
1451 	if (ntxq > 1) {
1452 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_OFLD].cntxt_id,
1453 					 USE_GTS, SGE_CNTXT_OFLD, id,
1454 					 q->txq[TXQ_OFLD].phys_addr,
1455 					 q->txq[TXQ_OFLD].size, 0, 1, 0);
1456 		if (ret) {
1457 			printf("error %d from t3_sge_init_ecntxt\n", ret);
1458 			goto err_unlock;
1459 		}
1460 	}
1461 
1462 	if (ntxq > 2) {
1463 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_CTRL].cntxt_id, 0,
1464 					 SGE_CNTXT_CTRL, id,
1465 					 q->txq[TXQ_CTRL].phys_addr,
1466 					 q->txq[TXQ_CTRL].size,
1467 					 q->txq[TXQ_CTRL].token, 1, 0);
1468 		if (ret) {
1469 			printf("error %d from t3_sge_init_ecntxt\n", ret);
1470 			goto err_unlock;
1471 		}
1472 	}
1473 
1474 	mtx_unlock(&sc->sge.reg_lock);
1475 	t3_update_qset_coalesce(q, p);
1476 	q->port = pi;
1477 
1478 	refill_fl(sc, &q->fl[0], q->fl[0].size);
1479 	refill_fl(sc, &q->fl[1], q->fl[1].size);
1480 	refill_rspq(sc, &q->rspq, q->rspq.size - 1);
1481 
1482 	t3_write_reg(sc, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) |
1483 		     V_NEWTIMER(q->rspq.holdoff_tmr));
1484 
1485 	return (0);
1486 
1487 err_unlock:
1488 	mtx_unlock(&sc->sge.reg_lock);
1489 err:
1490 	t3_free_qset(sc, q);
1491 
1492 	return (ret);
1493 }
1494 
1495 
1496 /**
1497  *	free_qset - free the resources of an SGE queue set
1498  *	@sc: the controller owning the queue set
1499  *	@q: the queue set
1500  *
1501  *	Release the HW and SW resources associated with an SGE queue set, such
1502  *	as HW contexts, packet buffers, and descriptor rings.  Traffic to the
1503  *	queue set must be quiesced prior to calling this.
1504  */
1505 static void
1506 t3_free_qset(adapter_t *sc, struct sge_qset *q)
1507 {
1508 	int i;
1509 
1510 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
1511 		if (q->fl[i].desc) {
1512 			mtx_lock(&sc->sge.reg_lock);
1513 			t3_sge_disable_fl(sc, q->fl[i].cntxt_id);
1514 			mtx_unlock(&sc->sge.reg_lock);
1515 			bus_dmamap_unload(q->fl[i].desc_tag, q->fl[i].desc_map);
1516 			bus_dmamem_free(q->fl[i].desc_tag, q->fl[i].desc,
1517 					q->fl[i].desc_map);
1518 			bus_dma_tag_destroy(q->fl[i].desc_tag);
1519 		}
1520 		if (q->fl[i].sdesc) {
1521 			free_rx_bufs(sc, &q->fl[i]);
1522 			free(q->fl[i].sdesc, M_DEVBUF);
1523 		}
1524 	}
1525 
1526 	for (i = 0; i < SGE_TXQ_PER_SET; ++i) {
1527 		if (q->txq[i].desc) {
1528 			mtx_lock(&sc->sge.reg_lock);
1529 			t3_sge_enable_ecntxt(sc, q->txq[i].cntxt_id, 0);
1530 			mtx_unlock(&sc->sge.reg_lock);
1531 			bus_dmamap_unload(q->txq[i].desc_tag,
1532 					q->txq[i].desc_map);
1533 			bus_dmamem_free(q->txq[i].desc_tag, q->txq[i].desc,
1534 					q->txq[i].desc_map);
1535 			bus_dma_tag_destroy(q->txq[i].desc_tag);
1536 		}
1537 		if (q->txq[i].sdesc) {
1538 			free(q->txq[i].sdesc, M_DEVBUF);
1539 		}
1540 		if (mtx_initialized(&q->txq[i].lock)) {
1541 			mtx_destroy(&q->txq[i].lock);
1542 		}
1543 	}
1544 
1545 	if (q->rspq.desc) {
1546 		mtx_lock(&sc->sge.reg_lock);
1547 		t3_sge_disable_rspcntxt(sc, q->rspq.cntxt_id);
1548 		mtx_unlock(&sc->sge.reg_lock);
1549 
1550 		bus_dmamap_unload(q->rspq.desc_tag, q->rspq.desc_map);
1551 		bus_dmamem_free(q->rspq.desc_tag, q->rspq.desc,
1552 			        q->rspq.desc_map);
1553 		bus_dma_tag_destroy(q->rspq.desc_tag);
1554 	}
1555 	if (mtx_initialized(&q->rspq.lock)) {
1556 		mtx_destroy(&q->rspq.lock);
1557 	}
1558 
1559 	bzero(q, sizeof(*q));
1560 }
1561 
1562 /**
1563  *	t3_free_sge_resources - free SGE resources
1564  *	@sc: the adapter softc
1565  *
1566  *	Frees resources used by the SGE queue sets.
1567  */
1568 void
1569 t3_free_sge_resources(adapter_t *sc)
1570 {
1571 	int i;
1572 
1573 	for (i = 0; i < SGE_QSETS; ++i)
1574 		t3_free_qset(sc, &sc->sge.qs[i]);
1575 }
1576 
1577 /**
1578  *	t3_sge_start - enable SGE
1579  *	@sc: the controller softc
1580  *
1581  *	Enables the SGE for DMAs.  This is the last step in starting packet
1582  *	transfers.
1583  */
1584 void
1585 t3_sge_start(adapter_t *sc)
1586 {
1587 	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE);
1588 }
1589 
1590 
1591 /**
1592  *	free_tx_desc - reclaims Tx descriptors and their buffers
1593  *	@adapter: the adapter
1594  *	@q: the Tx queue to reclaim descriptors from
1595  *	@n: the number of descriptors to reclaim
1596  *
1597  *	Reclaims Tx descriptors from an SGE Tx queue and frees the associated
1598  *	Tx buffers.  Called with the Tx queue lock held.
1599  */
1600 int
1601 free_tx_desc(adapter_t *sc, struct sge_txq *q, int n, struct mbuf **m_vec)
1602 {
1603 	struct tx_sw_desc *d;
1604 	unsigned int cidx = q->cidx;
1605 	int nbufs = 0;
1606 
1607 #ifdef T3_TRACE
1608 	T3_TRACE2(sc->tb[q->cntxt_id & 7],
1609 		  "reclaiming %u Tx descriptors at cidx %u", n, cidx);
1610 #endif
1611 	d = &q->sdesc[cidx];
1612 
1613 	while (n-- > 0) {
1614 		DPRINTF("cidx=%d d=%p\n", cidx, d);
1615 		if (d->m) {
1616 			if (d->flags & TX_SW_DESC_MAPPED) {
1617 				bus_dmamap_unload(sc->tx_dmat, d->map);
1618 				bus_dmamap_destroy(sc->tx_dmat, d->map);
1619 				d->flags &= ~TX_SW_DESC_MAPPED;
1620 			}
1621 			m_vec[nbufs] = d->m;
1622 			d->m = NULL;
1623 			nbufs++;
1624 		}
1625 		++d;
1626 		if (++cidx == q->size) {
1627 			cidx = 0;
1628 			d = q->sdesc;
1629 		}
1630 	}
1631 	q->cidx = cidx;
1632 
1633 	return (nbufs);
1634 }
1635 
1636 /**
1637  *	is_new_response - check if a response is newly written
1638  *	@r: the response descriptor
1639  *	@q: the response queue
1640  *
1641  *	Returns true if a response descriptor contains a yet unprocessed
1642  *	response.
1643  */
1644 static __inline int
1645 is_new_response(const struct rsp_desc *r,
1646     const struct sge_rspq *q)
1647 {
1648 	return (r->intr_gen & F_RSPD_GEN2) == q->gen;
1649 }
1650 
1651 #define RSPD_GTS_MASK  (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS)
1652 #define RSPD_CTRL_MASK (RSPD_GTS_MASK | \
1653 			V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \
1654 			V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \
1655 			V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR))
1656 
1657 /* How long to delay the next interrupt in case of memory shortage, in 0.1us. */
1658 #define NOMEM_INTR_DELAY 2500
1659 
1660 static __inline void
1661 deliver_partial_bundle(struct t3cdev *tdev,
1662 					  struct sge_rspq *q)
1663 {
1664 	;
1665 }
1666 
1667 static __inline void
1668 rx_offload(struct t3cdev *tdev, struct sge_rspq *rq,
1669     struct mbuf *m)
1670 {
1671 #ifdef notyet
1672 	if (rq->polling) {
1673 		rq->offload_skbs[rq->offload_skbs_idx++] = skb;
1674 		if (rq->offload_skbs_idx == RX_BUNDLE_SIZE) {
1675 			cxgb_ofld_recv(tdev, rq->offload_skbs, RX_BUNDLE_SIZE);
1676 			rq->offload_skbs_idx = 0;
1677 			rq->offload_bundles++;
1678 		}
1679 	} else
1680 #endif
1681 	{
1682 		/* XXX */
1683 		panic("implement offload enqueue\n");
1684 	}
1685 
1686 }
1687 
1688 static void
1689 restart_tx(struct sge_qset *qs)
1690 {
1691 	;
1692 }
1693 
1694 void
1695 t3_rx_eth(struct port_info *pi, struct sge_rspq *rq, struct mbuf *m, int ethpad)
1696 {
1697 	struct cpl_rx_pkt *cpl = (struct cpl_rx_pkt *)(m->m_data + ethpad);
1698 	struct ifnet *ifp = pi->ifp;
1699 
1700 	DPRINTF("rx_eth m=%p m->m_data=%p p->iff=%d\n", m, m->m_data, cpl->iff);
1701 	if (&pi->adapter->port[cpl->iff] != pi)
1702 		panic("bad port index %d m->m_data=%p\n", cpl->iff, m->m_data);
1703 
1704 
1705 	m_adj(m, sizeof(*cpl) + ethpad);
1706 
1707 
1708 	if ((ifp->if_capenable & IFCAP_RXCSUM) && !cpl->fragment &&
1709 	    cpl->csum_valid && cpl->csum == 0xffff) {
1710 		m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED|CSUM_IP_VALID);
1711 		rspq_to_qset(rq)->port_stats[SGE_PSTAT_RX_CSUM_GOOD]++;
1712 		m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED|CSUM_IP_VALID|CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
1713 		m->m_pkthdr.csum_data = 0xffff;
1714 	}
1715 	/*
1716 	 * XXX need to add VLAN support for 6.x
1717 	 */
1718 #ifdef VLAN_SUPPORTED
1719 	if (__predict_false(cpl->vlan_valid)) {
1720 		m->m_pkthdr.ether_vtag = ntohs(cpl->vlan);
1721 		m->m_flags |= M_VLANTAG;
1722 	}
1723 #endif
1724 	m->m_pkthdr.rcvif = ifp;
1725 
1726 	(*ifp->if_input)(ifp, m);
1727 }
1728 
1729 /**
1730  *	get_packet - return the next ingress packet buffer from a free list
1731  *	@adap: the adapter that received the packet
1732  *	@drop_thres: # of remaining buffers before we start dropping packets
1733  *	@qs: the qset that the SGE free list holding the packet belongs to
1734  *      @mh: the mbuf header, contains a pointer to the head and tail of the mbuf chain
1735  *      @r: response descriptor
1736  *
1737  *	Get the next packet from a free list and complete setup of the
1738  *	sk_buff.  If the packet is small we make a copy and recycle the
1739  *	original buffer, otherwise we use the original buffer itself.  If a
1740  *	positive drop threshold is supplied packets are dropped and their
1741  *	buffers recycled if (a) the number of remaining buffers is under the
1742  *	threshold and the packet is too big to copy, or (b) the packet should
1743  *	be copied but there is no memory for the copy.
1744  */
1745 static int
1746 get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
1747     struct t3_mbuf_hdr *mh, struct rsp_desc *r)
1748 {
1749 
1750 	struct mbuf *m = NULL;
1751 	unsigned int len_cq =  ntohl(r->len_cq);
1752 	struct sge_fl *fl = (len_cq & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
1753 	struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];
1754 	uint32_t len = G_RSPD_LEN(len_cq);
1755 	uint32_t flags = ntohl(r->flags);
1756 	uint8_t sopeop = G_RSPD_SOP_EOP(flags);
1757 	int ret = 0;
1758 
1759 	prefetch(sd->m->m_data);
1760 
1761 	fl->credits--;
1762 	bus_dmamap_sync(adap->rx_jumbo_dmat, sd->map, BUS_DMASYNC_POSTREAD);
1763 	bus_dmamap_unload(adap->rx_jumbo_dmat, sd->map);
1764 	m = sd->m;
1765 	m->m_len = len;
1766 
1767 	switch(sopeop) {
1768 	case RSPQ_SOP_EOP:
1769 		DBG(DBG_RX, ("get_packet: SOP-EOP m %p\n", m));
1770 		mh->mh_head = mh->mh_tail = m;
1771 		m->m_pkthdr.len = len;
1772 		m->m_flags |= M_PKTHDR;
1773 		ret = 1;
1774 		break;
1775 	case RSPQ_NSOP_NEOP:
1776 		DBG(DBG_RX, ("get_packet: NO_SOP-NO_EOP m %p\n", m));
1777 		m->m_flags &= ~M_PKTHDR;
1778 		if (mh->mh_tail == NULL) {
1779 			if (cxgb_debug)
1780 				printf("discarding intermediate descriptor entry\n");
1781 			m_freem(m);
1782 			break;
1783 		}
1784 		mh->mh_tail->m_next = m;
1785 		mh->mh_tail = m;
1786 		mh->mh_head->m_pkthdr.len += len;
1787 		ret = 0;
1788 		break;
1789 	case RSPQ_SOP:
1790 		DBG(DBG_RX, ("get_packet: SOP m %p\n", m));
1791 		m->m_pkthdr.len = len;
1792 		mh->mh_head = mh->mh_tail = m;
1793 		m->m_flags |= M_PKTHDR;
1794 		ret = 0;
1795 		break;
1796 	case RSPQ_EOP:
1797 		DBG(DBG_RX, ("get_packet: EOP m %p\n", m));
1798 		m->m_flags &= ~M_PKTHDR;
1799 		mh->mh_head->m_pkthdr.len += len;
1800 		mh->mh_tail->m_next = m;
1801 		mh->mh_tail = m;
1802 		ret = 1;
1803 		break;
1804 	}
1805 	if (++fl->cidx == fl->size)
1806 		fl->cidx = 0;
1807 
1808 	return (ret);
1809 }
1810 
1811 
1812 /**
1813  *	handle_rsp_cntrl_info - handles control information in a response
1814  *	@qs: the queue set corresponding to the response
1815  *	@flags: the response control flags
1816  *
1817  *	Handles the control information of an SGE response, such as GTS
1818  *	indications and completion credits for the queue set's Tx queues.
1819  *	HW coalesces credits, we don't do any extra SW coalescing.
1820  */
1821 static __inline void
1822 handle_rsp_cntrl_info(struct sge_qset *qs, uint32_t flags)
1823 {
1824 	unsigned int credits;
1825 
1826 #if USE_GTS
1827 	if (flags & F_RSPD_TXQ0_GTS)
1828 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags);
1829 #endif
1830 	credits = G_RSPD_TXQ0_CR(flags);
1831 	if (credits) {
1832 		qs->txq[TXQ_ETH].processed += credits;
1833 		if (desc_reclaimable(&qs->txq[TXQ_ETH]) > TX_START_MAX_DESC)
1834 			taskqueue_enqueue(qs->port->adapter->tq,
1835 			    &qs->port->adapter->timer_reclaim_task);
1836 	}
1837 
1838 	credits = G_RSPD_TXQ2_CR(flags);
1839 	if (credits)
1840 		qs->txq[TXQ_CTRL].processed += credits;
1841 
1842 # if USE_GTS
1843 	if (flags & F_RSPD_TXQ1_GTS)
1844 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags);
1845 # endif
1846 	credits = G_RSPD_TXQ1_CR(flags);
1847 	if (credits)
1848 		qs->txq[TXQ_OFLD].processed += credits;
1849 }
1850 
1851 static void
1852 check_ring_db(adapter_t *adap, struct sge_qset *qs,
1853     unsigned int sleeping)
1854 {
1855 	;
1856 }
1857 
1858 /*
1859  * This is an awful hack to bind the ithread to CPU 1
1860  * to work around lack of ithread affinity
1861  */
1862 static void
1863 bind_ithread(int cpu)
1864 {
1865 	KASSERT(cpu < mp_ncpus, ("invalid cpu identifier"));
1866 #if 0
1867 	if (mp_ncpus > 1) {
1868 		mtx_lock_spin(&sched_lock);
1869 		sched_bind(curthread, cpu);
1870 		mtx_unlock_spin(&sched_lock);
1871 	}
1872 #endif
1873 }
1874 
1875 /**
1876  *	process_responses - process responses from an SGE response queue
1877  *	@adap: the adapter
1878  *	@qs: the queue set to which the response queue belongs
1879  *	@budget: how many responses can be processed in this round
1880  *
1881  *	Process responses from an SGE response queue up to the supplied budget.
1882  *	Responses include received packets as well as credits and other events
1883  *	for the queues that belong to the response queue's queue set.
1884  *	A negative budget is effectively unlimited.
1885  *
1886  *	Additionally choose the interrupt holdoff time for the next interrupt
1887  *	on this queue.  If the system is under memory shortage use a fairly
1888  *	long delay to help recovery.
1889  */
1890 static int
1891 process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
1892 {
1893 	struct sge_rspq *rspq = &qs->rspq;
1894 	struct rsp_desc *r = &rspq->desc[rspq->cidx];
1895 	int budget_left = budget;
1896 	unsigned int sleeping = 0;
1897 	int lro = qs->lro.enabled;
1898 
1899 	static uint8_t pinned[MAXCPU];
1900 
1901 #ifdef DEBUG
1902 	static int last_holdoff = 0;
1903 	if (rspq->holdoff_tmr != last_holdoff) {
1904 		printf("next_holdoff=%d\n", rspq->holdoff_tmr);
1905 		last_holdoff = rspq->holdoff_tmr;
1906 	}
1907 #endif
1908 	if (pinned[qs->rspq.cntxt_id * adap->params.nports] == 0) {
1909 		/*
1910 		 * Assumes that cntxt_id < mp_ncpus
1911 		 */
1912 		bind_ithread(qs->rspq.cntxt_id);
1913 		pinned[qs->rspq.cntxt_id * adap->params.nports] = 1;
1914 	}
1915 	rspq->next_holdoff = rspq->holdoff_tmr;
1916 
1917 	while (__predict_true(budget_left && is_new_response(r, rspq))) {
1918 		int eth, eop = 0, ethpad = 0;
1919 		uint32_t flags = ntohl(r->flags);
1920 		uint32_t rss_csum = *(const uint32_t *)r;
1921 		uint32_t rss_hash = r->rss_hdr.rss_hash_val;
1922 
1923 		eth = (r->rss_hdr.opcode == CPL_RX_PKT);
1924 
1925 		if (__predict_false(flags & F_RSPD_ASYNC_NOTIF)) {
1926 			/* XXX */
1927 			printf("async notification\n");
1928 
1929 		} else if  (flags & F_RSPD_IMM_DATA_VALID) {
1930 			if (cxgb_debug)
1931 				printf("IMM DATA VALID\n");
1932 
1933 			if(get_imm_packet(adap, r, &rspq->mh) == 0) {
1934 				rspq->next_holdoff = NOMEM_INTR_DELAY;
1935 				budget_left--;
1936 				break;
1937 			} else {
1938 				eop = 1;
1939 			}
1940 
1941 			rspq->imm_data++;
1942 		} else if (r->len_cq) {
1943 			int drop_thresh = eth ? SGE_RX_DROP_THRES : 0;
1944 
1945 			ethpad = 2;
1946 			eop = get_packet(adap, drop_thresh, qs, &rspq->mh, r);
1947 		} else {
1948 			DPRINTF("pure response\n");
1949 			rspq->pure_rsps++;
1950 		}
1951 
1952 		if (flags & RSPD_CTRL_MASK) {
1953 			sleeping |= flags & RSPD_GTS_MASK;
1954 			handle_rsp_cntrl_info(qs, flags);
1955 		}
1956 
1957 		r++;
1958 		if (__predict_false(++rspq->cidx == rspq->size)) {
1959 			rspq->cidx = 0;
1960 			rspq->gen ^= 1;
1961 			r = rspq->desc;
1962 		}
1963 
1964 		prefetch(r);
1965 		if (++rspq->credits >= (rspq->size / 4)) {
1966 			refill_rspq(adap, rspq, rspq->credits);
1967 			rspq->credits = 0;
1968 		}
1969 
1970 		if (eop) {
1971 			prefetch(rspq->mh.mh_head->m_data);
1972 			prefetch(rspq->mh.mh_head->m_data + L1_CACHE_BYTES);
1973 
1974 			if (eth) {
1975 				t3_rx_eth_lro(adap, rspq, &rspq->mh, ethpad,
1976 				    rss_hash, rss_csum, lro);
1977 
1978 				rspq->mh.mh_tail = rspq->mh.mh_head = NULL;
1979 			} else {
1980 #ifdef notyet
1981 				if (__predict_false(r->rss_hdr.opcode == CPL_TRACE_PKT))
1982 					m_adj(m, 2);
1983 
1984 				rx_offload(&adap->tdev, rspq, m);
1985 #endif
1986 			}
1987 #ifdef notyet
1988 			taskqueue_enqueue(adap->tq, &adap->timer_reclaim_task);
1989 #else
1990 			__refill_fl(adap, &qs->fl[0]);
1991 			__refill_fl(adap, &qs->fl[1]);
1992 #endif
1993 
1994 		}
1995 		--budget_left;
1996 	}
1997 	t3_sge_lro_flush_all(adap, qs);
1998 	deliver_partial_bundle(&adap->tdev, rspq);
1999 
2000 	if (sleeping)
2001 		check_ring_db(adap, qs, sleeping);
2002 
2003 	smp_mb();  /* commit Tx queue processed updates */
2004 	if (__predict_false(qs->txq_stopped != 0))
2005 		restart_tx(qs);
2006 
2007 	budget -= budget_left;
2008 	return (budget);
2009 }
2010 
2011 /*
2012  * A helper function that processes responses and issues GTS.
2013  */
2014 static __inline int
2015 process_responses_gts(adapter_t *adap, struct sge_rspq *rq)
2016 {
2017 	int work;
2018 	static int last_holdoff = 0;
2019 
2020 	work = process_responses(adap, rspq_to_qset(rq), -1);
2021 
2022 	if (cxgb_debug && (rq->next_holdoff != last_holdoff)) {
2023 		printf("next_holdoff=%d\n", rq->next_holdoff);
2024 		last_holdoff = rq->next_holdoff;
2025 	}
2026 
2027 	t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) |
2028 		     V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx));
2029 	return work;
2030 }
2031 
2032 
2033 /*
2034  * Interrupt handler for legacy INTx interrupts for T3B-based cards.
2035  * Handles data events from SGE response queues as well as error and other
2036  * async events as they all use the same interrupt pin.  We use one SGE
2037  * response queue per port in this mode and protect all response queues with
2038  * queue 0's lock.
2039  */
2040 void
2041 t3b_intr(void *data)
2042 {
2043 	uint32_t map;
2044 	adapter_t *adap = data;
2045 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
2046 	struct sge_rspq *q1 = &adap->sge.qs[1].rspq;
2047 
2048 
2049 	t3_write_reg(adap, A_PL_CLI, 0);
2050 	map = t3_read_reg(adap, A_SG_DATA_INTR);
2051 
2052 	if (!map)
2053 		return;
2054 
2055 	if (__predict_false(map & F_ERRINTR))
2056 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
2057 
2058 	mtx_lock(&q0->lock);
2059 
2060 	if (__predict_true(map & 1))
2061 		process_responses_gts(adap, q0);
2062 
2063 	if (map & 2)
2064 		process_responses_gts(adap, q1);
2065 
2066 	mtx_unlock(&q0->lock);
2067 }
2068 
2069 /*
2070  * The MSI interrupt handler.  This needs to handle data events from SGE
2071  * response queues as well as error and other async events as they all use
2072  * the same MSI vector.  We use one SGE response queue per port in this mode
2073  * and protect all response queues with queue 0's lock.
2074  */
2075 void
2076 t3_intr_msi(void *data)
2077 {
2078 	adapter_t *adap = data;
2079 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
2080 	struct sge_rspq *q1 = &adap->sge.qs[1].rspq;
2081 	int new_packets = 0;
2082 
2083 	mtx_lock(&q0->lock);
2084 	if (process_responses_gts(adap, q0)) {
2085 		new_packets = 1;
2086 	}
2087 
2088 	if (adap->params.nports == 2 &&
2089 	    process_responses_gts(adap, q1)) {
2090 		new_packets = 1;
2091 	}
2092 
2093 
2094 	mtx_unlock(&q0->lock);
2095 	if (new_packets == 0)
2096 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
2097 }
2098 
2099 void
2100 t3_intr_msix(void *data)
2101 {
2102 	struct sge_qset *qs = data;
2103 	adapter_t *adap = qs->port->adapter;
2104 	struct sge_rspq *rspq = &qs->rspq;
2105 
2106 	mtx_lock(&rspq->lock);
2107 	if (process_responses_gts(adap, rspq) == 0) {
2108 #ifdef notyet
2109 		rspq->unhandled_irqs++;
2110 #endif
2111 	}
2112 	mtx_unlock(&rspq->lock);
2113 }
2114 
2115 static int
2116 t3_lro_enable(SYSCTL_HANDLER_ARGS)
2117 {
2118 	adapter_t *sc;
2119 	int i, j, enabled, err, nqsets = 0;
2120 
2121 	sc = arg1;
2122 	enabled = sc->sge.qs[0].lro.enabled;
2123         err = sysctl_handle_int(oidp, &enabled, arg2, req);
2124 
2125 	if (err != 0) {
2126 		return (err);
2127 	}
2128 	if (enabled == sc->sge.qs[0].lro.enabled)
2129 		return (0);
2130 
2131 	for (i = 0; i < sc->params.nports; i++)
2132 		for (j = 0; j < sc->port[i].nqsets; j++)
2133 			nqsets++;
2134 
2135 	for (i = 0; i < nqsets; i++) {
2136 		sc->sge.qs[i].lro.enabled = enabled;
2137 	}
2138 
2139 	return (0);
2140 }
2141 
2142 static int
2143 t3_set_coalesce_nsecs(SYSCTL_HANDLER_ARGS)
2144 {
2145 	adapter_t *sc = arg1;
2146 	struct qset_params *qsp = &sc->params.sge.qset[0];
2147 	int coalesce_nsecs;
2148 	struct sge_qset *qs;
2149 	int i, j, err, nqsets = 0;
2150 	struct mtx *lock;
2151 
2152 	coalesce_nsecs = qsp->coalesce_nsecs;
2153         err = sysctl_handle_int(oidp, &coalesce_nsecs, arg2, req);
2154 
2155 	if (err != 0) {
2156 		return (err);
2157 	}
2158 	if (coalesce_nsecs == qsp->coalesce_nsecs)
2159 		return (0);
2160 
2161 	for (i = 0; i < sc->params.nports; i++)
2162 		for (j = 0; j < sc->port[i].nqsets; j++)
2163 			nqsets++;
2164 
2165 	coalesce_nsecs = max(100, coalesce_nsecs);
2166 
2167 	for (i = 0; i < nqsets; i++) {
2168 		qs = &sc->sge.qs[i];
2169 		qsp = &sc->params.sge.qset[i];
2170 		qsp->coalesce_nsecs = coalesce_nsecs;
2171 
2172 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
2173 			    &sc->sge.qs[0].rspq.lock;
2174 
2175 		mtx_lock(lock);
2176 		t3_update_qset_coalesce(qs, qsp);
2177 		t3_write_reg(sc, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) |
2178 		    V_NEWTIMER(qs->rspq.holdoff_tmr));
2179 		mtx_unlock(lock);
2180 	}
2181 
2182 	return (0);
2183 }
2184 
2185 
2186 void
2187 t3_add_sysctls(adapter_t *sc)
2188 {
2189 	struct sysctl_ctx_list *ctx;
2190 	struct sysctl_oid_list *children;
2191 
2192 	ctx = device_get_sysctl_ctx(sc->dev);
2193 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
2194 
2195 	/* random information */
2196 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
2197 	    "firmware_version",
2198 	    CTLFLAG_RD, &sc->fw_version,
2199 	    0, "firmware version");
2200 
2201 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
2202 	    "enable_lro",
2203 	    CTLTYPE_INT|CTLFLAG_RW, sc,
2204 	    0, t3_lro_enable,
2205 	    "I", "enable large receive offload");
2206 
2207 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
2208 	    "intr_coal",
2209 	    CTLTYPE_INT|CTLFLAG_RW, sc,
2210 	    0, t3_set_coalesce_nsecs,
2211 	    "I", "interrupt coalescing timer (ns)");
2212 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
2213 	    "enable_debug",
2214 	    CTLFLAG_RW, &cxgb_debug,
2215 	    0, "enable verbose debugging output");
2216 
2217 }
2218 
2219 /**
2220  *	t3_get_desc - dump an SGE descriptor for debugging purposes
2221  *	@qs: the queue set
2222  *	@qnum: identifies the specific queue (0..2: Tx, 3:response, 4..5: Rx)
2223  *	@idx: the descriptor index in the queue
2224  *	@data: where to dump the descriptor contents
2225  *
2226  *	Dumps the contents of a HW descriptor of an SGE queue.  Returns the
2227  *	size of the descriptor.
2228  */
2229 int
2230 t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx,
2231 		unsigned char *data)
2232 {
2233 	if (qnum >= 6)
2234 		return (EINVAL);
2235 
2236 	if (qnum < 3) {
2237 		if (!qs->txq[qnum].desc || idx >= qs->txq[qnum].size)
2238 			return -EINVAL;
2239 		memcpy(data, &qs->txq[qnum].desc[idx], sizeof(struct tx_desc));
2240 		return sizeof(struct tx_desc);
2241 	}
2242 
2243 	if (qnum == 3) {
2244 		if (!qs->rspq.desc || idx >= qs->rspq.size)
2245 			return (EINVAL);
2246 		memcpy(data, &qs->rspq.desc[idx], sizeof(struct rsp_desc));
2247 		return sizeof(struct rsp_desc);
2248 	}
2249 
2250 	qnum -= 4;
2251 	if (!qs->fl[qnum].desc || idx >= qs->fl[qnum].size)
2252 		return (EINVAL);
2253 	memcpy(data, &qs->fl[qnum].desc[idx], sizeof(struct rx_desc));
2254 	return sizeof(struct rx_desc);
2255 }
2256