xref: /freebsd/sys/dev/cxgb/cxgb_sge.c (revision 1aaed33edb24c98a09130cd66667d6a795b6b2a8)
1 /**************************************************************************
2 
3 Copyright (c) 2007-2009, Chelsio Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Neither the name of the Chelsio Corporation nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15 
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27 
28 ***************************************************************************/
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include "opt_inet.h"
34 
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/kernel.h>
38 #include <sys/module.h>
39 #include <sys/bus.h>
40 #include <sys/conf.h>
41 #include <machine/bus.h>
42 #include <machine/resource.h>
43 #include <sys/bus_dma.h>
44 #include <sys/rman.h>
45 #include <sys/queue.h>
46 #include <sys/sysctl.h>
47 #include <sys/taskqueue.h>
48 
49 #include <sys/proc.h>
50 #include <sys/sbuf.h>
51 #include <sys/sched.h>
52 #include <sys/smp.h>
53 #include <sys/systm.h>
54 #include <sys/syslog.h>
55 #include <sys/socket.h>
56 
57 #include <net/bpf.h>
58 #include <net/ethernet.h>
59 #include <net/if.h>
60 #include <net/if_vlan_var.h>
61 
62 #include <netinet/in_systm.h>
63 #include <netinet/in.h>
64 #include <netinet/ip.h>
65 #include <netinet/ip6.h>
66 #include <netinet/tcp.h>
67 
68 #include <dev/pci/pcireg.h>
69 #include <dev/pci/pcivar.h>
70 
71 #include <vm/vm.h>
72 #include <vm/pmap.h>
73 
74 #include <cxgb_include.h>
75 #include <sys/mvec.h>
76 
77 int	txq_fills = 0;
78 int	multiq_tx_enable = 1;
79 
80 extern struct sysctl_oid_list sysctl__hw_cxgb_children;
81 int cxgb_txq_buf_ring_size = TX_ETH_Q_SIZE;
82 TUNABLE_INT("hw.cxgb.txq_mr_size", &cxgb_txq_buf_ring_size);
83 SYSCTL_INT(_hw_cxgb, OID_AUTO, txq_mr_size, CTLFLAG_RDTUN, &cxgb_txq_buf_ring_size, 0,
84     "size of per-queue mbuf ring");
85 
86 static int cxgb_tx_coalesce_force = 0;
87 TUNABLE_INT("hw.cxgb.tx_coalesce_force", &cxgb_tx_coalesce_force);
88 SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_force, CTLFLAG_RW,
89     &cxgb_tx_coalesce_force, 0,
90     "coalesce small packets into a single work request regardless of ring state");
91 
92 #define	COALESCE_START_DEFAULT		(TX_ETH_Q_SIZE>>1)
93 #define	COALESCE_START_MAX		(TX_ETH_Q_SIZE-(TX_ETH_Q_SIZE>>3))
94 #define	COALESCE_STOP_DEFAULT		(TX_ETH_Q_SIZE>>2)
95 #define	COALESCE_STOP_MIN		(TX_ETH_Q_SIZE>>5)
96 #define	TX_RECLAIM_DEFAULT		(TX_ETH_Q_SIZE>>5)
97 #define	TX_RECLAIM_MAX			(TX_ETH_Q_SIZE>>2)
98 #define	TX_RECLAIM_MIN			(TX_ETH_Q_SIZE>>6)
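/*
 * Worked example of the thresholds above (a sketch only; the real value of
 * TX_ETH_Q_SIZE comes from the adapter headers and is assumed here to be
 * 1024 descriptors):
 *
 *	COALESCE_START_DEFAULT	= 1024 >> 1	= 512
 *	COALESCE_START_MAX	= 1024 - 128	= 896
 *	COALESCE_STOP_DEFAULT	= 1024 >> 2	= 256
 *	COALESCE_STOP_MIN	= 1024 >> 5	= 32
 *	TX_RECLAIM_DEFAULT	= 1024 >> 5	= 32
 *	TX_RECLAIM_MAX		= 1024 >> 2	= 256
 *	TX_RECLAIM_MIN		= 1024 >> 6	= 16
 */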
99 
100 
101 static int cxgb_tx_coalesce_enable_start = COALESCE_START_DEFAULT;
102 TUNABLE_INT("hw.cxgb.tx_coalesce_enable_start",
103     &cxgb_tx_coalesce_enable_start);
104 SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_start, CTLFLAG_RW,
105     &cxgb_tx_coalesce_enable_start, 0,
106     "coalesce enable threshold");
107 static int cxgb_tx_coalesce_enable_stop = COALESCE_STOP_DEFAULT;
108 TUNABLE_INT("hw.cxgb.tx_coalesce_enable_stop", &cxgb_tx_coalesce_enable_stop);
109 SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_stop, CTLFLAG_RW,
110     &cxgb_tx_coalesce_enable_stop, 0,
111     "coalesce disable threshold");
112 static int cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT;
113 TUNABLE_INT("hw.cxgb.tx_reclaim_threshold", &cxgb_tx_reclaim_threshold);
114 SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_reclaim_threshold, CTLFLAG_RW,
115     &cxgb_tx_reclaim_threshold, 0,
116     "tx cleaning minimum threshold");
117 
118 /*
119  * XXX don't re-enable this until TOE stops assuming
120  * we have an m_ext
121  */
122 static int recycle_enable = 0;
123 
124 extern int cxgb_use_16k_clusters;
125 extern int nmbjumbop;
126 extern int nmbjumbo9;
127 extern int nmbjumbo16;
128 
129 #define USE_GTS 0
130 
131 #define SGE_RX_SM_BUF_SIZE	1536
132 #define SGE_RX_DROP_THRES	16
133 #define SGE_RX_COPY_THRES	128
134 
135 /*
136  * Period of the Tx buffer reclaim timer.  This timer does not need to run
137  * frequently as Tx buffers are usually reclaimed by new Tx packets.
138  */
139 #define TX_RECLAIM_PERIOD       (hz >> 1)
140 
141 /*
142  * Values for sge_txq.flags
143  */
144 enum {
145 	TXQ_RUNNING	= 1 << 0,  /* fetch engine is running */
146 	TXQ_LAST_PKT_DB = 1 << 1,  /* last packet rang the doorbell */
147 };
148 
149 struct tx_desc {
150 	uint64_t	flit[TX_DESC_FLITS];
151 } __packed;
152 
153 struct rx_desc {
154 	uint32_t	addr_lo;
155 	uint32_t	len_gen;
156 	uint32_t	gen2;
157 	uint32_t	addr_hi;
158 } __packed;
159 
160 struct rsp_desc {               /* response queue descriptor */
161 	struct rss_header	rss_hdr;
162 	uint32_t		flags;
163 	uint32_t		len_cq;
164 	uint8_t			imm_data[47];
165 	uint8_t			intr_gen;
166 } __packed;
167 
168 #define RX_SW_DESC_MAP_CREATED	(1 << 0)
169 #define TX_SW_DESC_MAP_CREATED	(1 << 1)
170 #define RX_SW_DESC_INUSE        (1 << 3)
171 #define TX_SW_DESC_MAPPED       (1 << 4)
172 
173 #define RSPQ_NSOP_NEOP           G_RSPD_SOP_EOP(0)
174 #define RSPQ_EOP                 G_RSPD_SOP_EOP(F_RSPD_EOP)
175 #define RSPQ_SOP                 G_RSPD_SOP_EOP(F_RSPD_SOP)
176 #define RSPQ_SOP_EOP             G_RSPD_SOP_EOP(F_RSPD_SOP|F_RSPD_EOP)
177 
178 struct tx_sw_desc {                /* SW state per Tx descriptor */
179 	struct mbuf	*m;
180 	bus_dmamap_t	map;
181 	int		flags;
182 };
183 
184 struct rx_sw_desc {                /* SW state per Rx descriptor */
185 	caddr_t		rxsd_cl;
186 	struct mbuf	*m;
187 	bus_dmamap_t	map;
188 	int		flags;
189 };
190 
191 struct txq_state {
192 	unsigned int	compl;
193 	unsigned int	gen;
194 	unsigned int	pidx;
195 };
196 
197 struct refill_fl_cb_arg {
198 	int               error;
199 	bus_dma_segment_t seg;
200 	int               nseg;
201 };
202 
203 
204 /*
205  * Maps a number of flits to the number of Tx descriptors that can hold them.
206  * The formula is
207  *
208  * desc = 1 + (flits - 2) / (WR_FLITS - 1).
209  *
210  * HW allows up to 4 descriptors to be combined into a WR.
211  */
212 static uint8_t flit_desc_map[] = {
213 	0,
214 #if SGE_NUM_GENBITS == 1
215 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
216 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
217 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
218 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
219 #elif SGE_NUM_GENBITS == 2
220 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
221 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
222 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
223 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
224 #else
225 # error "SGE_NUM_GENBITS must be 1 or 2"
226 #endif
227 };
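/*
 * Worked example for the map above (a sketch; WR_FLITS is assumed to be
 * TX_DESC_FLITS + 1 - SGE_NUM_GENBITS with TX_DESC_FLITS of 16): with two
 * generation bits WR_FLITS is 15, so a 16-flit request needs
 * 1 + (16 - 2) / 14 = 2 descriptors, matching the table.  Each band of the
 * two-genbit table is one entry shorter than in the one-genbit table because
 * wr_gen2() consumes the last flit of every descriptor for the generation
 * word.
 */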
228 
229 #define	TXQ_LOCK_ASSERT(qs)	mtx_assert(&(qs)->lock, MA_OWNED)
230 #define	TXQ_TRYLOCK(qs)		mtx_trylock(&(qs)->lock)
231 #define	TXQ_LOCK(qs)		mtx_lock(&(qs)->lock)
232 #define	TXQ_UNLOCK(qs)		mtx_unlock(&(qs)->lock)
233 #define	TXQ_RING_EMPTY(qs)	drbr_empty((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
234 #define	TXQ_RING_NEEDS_ENQUEUE(qs)					\
235 	drbr_needs_enqueue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
236 #define	TXQ_RING_FLUSH(qs)	drbr_flush((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
237 #define	TXQ_RING_DEQUEUE_COND(qs, func, arg)				\
238 	drbr_dequeue_cond((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr, func, arg)
239 #define	TXQ_RING_DEQUEUE(qs) \
240 	drbr_dequeue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
241 
242 int cxgb_debug = 0;
243 
244 static void sge_timer_cb(void *arg);
245 static void sge_timer_reclaim(void *arg, int ncount);
246 static void sge_txq_reclaim_handler(void *arg, int ncount);
247 static void cxgb_start_locked(struct sge_qset *qs);
248 
249 /*
250  * XXX need to cope with bursty scheduling by looking at a wider
251  * window than we do now when determining the need for coalescing
252  *
253  */
254 static __inline uint64_t
255 check_pkt_coalesce(struct sge_qset *qs)
256 {
257         struct adapter *sc;
258         struct sge_txq *txq;
259 	uint8_t *fill;
260 
261 	if (__predict_false(cxgb_tx_coalesce_force))
262 		return (1);
263 	txq = &qs->txq[TXQ_ETH];
264         sc = qs->port->adapter;
265 	fill = &sc->tunq_fill[qs->idx];
266 
267 	if (cxgb_tx_coalesce_enable_start > COALESCE_START_MAX)
268 		cxgb_tx_coalesce_enable_start = COALESCE_START_MAX;
269 	if (cxgb_tx_coalesce_enable_stop < COALESCE_STOP_MIN)
270 		cxgb_tx_coalesce_enable_stop = COALESCE_STOP_MIN;
271 	/*
272 	 * If the hardware transmit queue's occupancy reaches the coalesce-start
273 	 * threshold we mark the queue set as coalescing; we drop back out of
274 	 * coalescing once occupancy falls below the coalesce-stop threshold and
275 	 * no packets are enqueued.  This provides some degree of hysteresis.
276 	 */
277         if (*fill != 0 && (txq->in_use <= cxgb_tx_coalesce_enable_stop) &&
278 	    TXQ_RING_EMPTY(qs) && (qs->coalescing == 0))
279                 *fill = 0;
280         else if (*fill == 0 && (txq->in_use >= cxgb_tx_coalesce_enable_start))
281                 *fill = 1;
282 
283 	return (sc->tunq_coalesce);
284 }
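/*
 * Hysteresis sketch for check_pkt_coalesce() with the default thresholds
 * (again assuming TX_ETH_Q_SIZE of 1024): *fill is set once the hardware
 * queue holds at least 512 in-flight descriptors and is cleared only after
 * occupancy falls to 256 or below with the software ring empty and the
 * queue set already out of coalescing mode, so a queue set does not flap in
 * and out of coalescing on every packet.
 */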
285 
286 #ifdef __LP64__
287 static void
288 set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo)
289 {
290 	uint64_t wr_hilo;
291 #if _BYTE_ORDER == _LITTLE_ENDIAN
292 	wr_hilo = wr_hi;
293 	wr_hilo |= (((uint64_t)wr_lo)<<32);
294 #else
295 	wr_hilo = wr_lo;
296 	wr_hilo |= (((uint64_t)wr_hi)<<32);
297 #endif
298 	wrp->wrh_hilo = wr_hilo;
299 }
300 #else
301 static void
302 set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo)
303 {
304 
305 	wrp->wrh_hi = wr_hi;
306 	wmb();
307 	wrp->wrh_lo = wr_lo;
308 }
309 #endif
310 
311 struct coalesce_info {
312 	int count;
313 	int nbytes;
314 };
315 
316 static int
317 coalesce_check(struct mbuf *m, void *arg)
318 {
319 	struct coalesce_info *ci = arg;
320 	int *count = &ci->count;
321 	int *nbytes = &ci->nbytes;
322 
323 	if ((*nbytes == 0) || ((*nbytes + m->m_len <= 10500) &&
324 		(*count < 7) && (m->m_next == NULL))) {
325 		*count += 1;
326 		*nbytes += m->m_len;
327 		return (1);
328 	}
329 	return (0);
330 }
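/*
 * The limits above keep one coalesced work request within a single Tx
 * descriptor: at most 7 single-mbuf packets and 10500 bytes in total.  In
 * the batch path of t3_encap() each packet consumes two flits plus one flit
 * for the WR header, so 7 packets use 15 flits and flit_desc_map[15] is 1.
 */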
331 
332 static struct mbuf *
333 cxgb_dequeue(struct sge_qset *qs)
334 {
335 	struct mbuf *m, *m_head, *m_tail;
336 	struct coalesce_info ci;
337 
338 
339 	if (check_pkt_coalesce(qs) == 0)
340 		return TXQ_RING_DEQUEUE(qs);
341 
342 	m_head = m_tail = NULL;
343 	ci.count = ci.nbytes = 0;
344 	do {
345 		m = TXQ_RING_DEQUEUE_COND(qs, coalesce_check, &ci);
346 		if (m_head == NULL) {
347 			m_tail = m_head = m;
348 		} else if (m != NULL) {
349 			m_tail->m_nextpkt = m;
350 			m_tail = m;
351 		}
352 	} while (m != NULL);
353 	if (ci.count > 7)
354 		panic("trying to coalesce %d packets in to one WR", ci.count);
355 	return (m_head);
356 }
357 
358 /**
359  *	reclaim_completed_tx - reclaims completed Tx descriptors
360  *	@qs: the queue set owning the Tx queue
361  *	@queue: the index of the Tx queue to reclaim completed descriptors from
362  *
363  *	Reclaims Tx descriptors that the SGE has indicated it has processed,
364  *	and frees the associated buffers if possible.  Called with the Tx
365  *	queue's lock held.
366  */
367 static __inline int
368 reclaim_completed_tx(struct sge_qset *qs, int reclaim_min, int queue)
369 {
370 	struct sge_txq *q = &qs->txq[queue];
371 	int reclaim = desc_reclaimable(q);
372 
373 	if ((cxgb_tx_reclaim_threshold > TX_RECLAIM_MAX) ||
374 	    (cxgb_tx_reclaim_threshold < TX_RECLAIM_MIN))
375 		cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT;
376 
377 	if (reclaim < reclaim_min)
378 		return (0);
379 
380 	mtx_assert(&qs->lock, MA_OWNED);
381 	if (reclaim > 0) {
382 		t3_free_tx_desc(qs, reclaim, queue);
383 		q->cleaned += reclaim;
384 		q->in_use -= reclaim;
385 	}
386 	if (isset(&qs->txq_stopped, TXQ_ETH))
387                 clrbit(&qs->txq_stopped, TXQ_ETH);
388 
389 	return (reclaim);
390 }
391 
392 /**
393  *	should_restart_tx - are there enough resources to restart a Tx queue?
394  *	@q: the Tx queue
395  *
396  *	Checks if there are enough descriptors to restart a suspended Tx queue.
397  */
398 static __inline int
399 should_restart_tx(const struct sge_txq *q)
400 {
401 	unsigned int r = q->processed - q->cleaned;
402 
403 	return q->in_use - r < (q->size >> 1);
404 }
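/*
 * Example (assuming a 1024-entry queue): with in_use = 600 and 200
 * descriptors already processed by the SGE but not yet cleaned, only 400
 * descriptors are genuinely outstanding, 400 < 512, so a suspended queue
 * may be restarted.
 */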
405 
406 /**
407  *	t3_sge_init - initialize SGE
408  *	@adap: the adapter
409  *	@p: the SGE parameters
410  *
411  *	Performs SGE initialization needed every time after a chip reset.
412  *	We do not initialize any of the queue sets here, instead the driver
413  *	top-level must request those individually.  We also do not enable DMA
414  *	here, that should be done after the queues have been set up.
415  */
416 void
417 t3_sge_init(adapter_t *adap, struct sge_params *p)
418 {
419 	u_int ctrl, ups;
420 
421 	ups = 0; /* = ffs(pci_resource_len(adap->pdev, 2) >> 12); */
422 
423 	ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL |
424 	       F_CQCRDTCTRL | F_CONGMODE | F_TNLFLMODE | F_FATLPERREN |
425 	       V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS |
426 	       V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING;
427 #if SGE_NUM_GENBITS == 1
428 	ctrl |= F_EGRGENCTRL;
429 #endif
430 	if (adap->params.rev > 0) {
431 		if (!(adap->flags & (USING_MSIX | USING_MSI)))
432 			ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ;
433 	}
434 	t3_write_reg(adap, A_SG_CONTROL, ctrl);
435 	t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) |
436 		     V_LORCQDRBTHRSH(512));
437 	t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10);
438 	t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) |
439 		     V_TIMEOUT(200 * core_ticks_per_usec(adap)));
440 	t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH,
441 		     adap->params.rev < T3_REV_C ? 1000 : 500);
442 	t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256);
443 	t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000);
444 	t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256);
445 	t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff));
446 	t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024);
447 }
448 
449 
450 /**
451  *	sgl_len - calculates the size of an SGL of the given capacity
452  *	@n: the number of SGL entries
453  *
454  *	Calculates the number of flits needed for a scatter/gather list that
455  *	can hold the given number of entries.
456  */
457 static __inline unsigned int
458 sgl_len(unsigned int n)
459 {
460 	return ((3 * n) / 2 + (n & 1));
461 }
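/*
 * Each pair of SGL entries packs into 3 flits (two 64-bit addresses plus
 * two 32-bit lengths); a trailing odd entry needs 2 more flits.  Hence
 * sgl_len(1) = 2, sgl_len(2) = 3, sgl_len(3) = 5 and sgl_len(4) = 6.
 */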
462 
463 /**
464  *	get_imm_packet - return the next ingress packet buffer from a response
465  *	@resp: the response descriptor containing the packet data
466  *
467  *	Copies the immediate data of the given response into the supplied mbuf.
468  */
469 static int
470 get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct mbuf *m)
471 {
472 
473 	m->m_len = m->m_pkthdr.len = IMMED_PKT_SIZE;
474 	m->m_ext.ext_buf = NULL;
475 	m->m_ext.ext_type = 0;
476 	memcpy(mtod(m, uint8_t *), resp->imm_data, IMMED_PKT_SIZE);
477 	return (0);
478 }
479 
480 static __inline u_int
481 flits_to_desc(u_int n)
482 {
483 	return (flit_desc_map[n]);
484 }
485 
486 #define SGE_PARERR (F_CPPARITYERROR | F_OCPARITYERROR | F_RCPARITYERROR | \
487 		    F_IRPARITYERROR | V_ITPARITYERROR(M_ITPARITYERROR) | \
488 		    V_FLPARITYERROR(M_FLPARITYERROR) | F_LODRBPARITYERROR | \
489 		    F_HIDRBPARITYERROR | F_LORCQPARITYERROR | \
490 		    F_HIRCQPARITYERROR)
491 #define SGE_FRAMINGERR (F_UC_REQ_FRAMINGERROR | F_R_REQ_FRAMINGERROR)
492 #define SGE_FATALERR (SGE_PARERR | SGE_FRAMINGERR | F_RSPQCREDITOVERFOW | \
493 		      F_RSPQDISABLED)
494 
495 /**
496  *	t3_sge_err_intr_handler - SGE async event interrupt handler
497  *	@adapter: the adapter
498  *
499  *	Interrupt handler for SGE asynchronous (non-data) events.
500  */
501 void
502 t3_sge_err_intr_handler(adapter_t *adapter)
503 {
504 	unsigned int v, status;
505 
506 	status = t3_read_reg(adapter, A_SG_INT_CAUSE);
507 	if (status & SGE_PARERR)
508 		CH_ALERT(adapter, "SGE parity error (0x%x)\n",
509 			 status & SGE_PARERR);
510 	if (status & SGE_FRAMINGERR)
511 		CH_ALERT(adapter, "SGE framing error (0x%x)\n",
512 			 status & SGE_FRAMINGERR);
513 	if (status & F_RSPQCREDITOVERFOW)
514 		CH_ALERT(adapter, "SGE response queue credit overflow\n");
515 
516 	if (status & F_RSPQDISABLED) {
517 		v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS);
518 
519 		CH_ALERT(adapter,
520 			 "packet delivered to disabled response queue (0x%x)\n",
521 			 (v >> S_RSPQ0DISABLED) & 0xff);
522 	}
523 
524 	t3_write_reg(adapter, A_SG_INT_CAUSE, status);
525 	if (status & SGE_FATALERR)
526 		t3_fatal_err(adapter);
527 }
528 
529 void
530 t3_sge_prep(adapter_t *adap, struct sge_params *p)
531 {
532 	int i, nqsets, fl_q_size, jumbo_q_size, use_16k, jumbo_buf_size;
533 
534 	nqsets = min(SGE_QSETS / adap->params.nports, mp_ncpus);
535 	nqsets *= adap->params.nports;
536 
537 	fl_q_size = min(nmbclusters/(3*nqsets), FL_Q_SIZE);
538 
539 	while (!powerof2(fl_q_size))
540 		fl_q_size--;
541 
542 	use_16k = cxgb_use_16k_clusters != -1 ? cxgb_use_16k_clusters :
543 	    is_offload(adap);
544 
545 #if __FreeBSD_version >= 700111
546 	if (use_16k) {
547 		jumbo_q_size = min(nmbjumbo16/(3*nqsets), JUMBO_Q_SIZE);
548 		jumbo_buf_size = MJUM16BYTES;
549 	} else {
550 		jumbo_q_size = min(nmbjumbo9/(3*nqsets), JUMBO_Q_SIZE);
551 		jumbo_buf_size = MJUM9BYTES;
552 	}
553 #else
554 	jumbo_q_size = min(nmbjumbop/(3*nqsets), JUMBO_Q_SIZE);
555 	jumbo_buf_size = MJUMPAGESIZE;
556 #endif
557 	while (!powerof2(jumbo_q_size))
558 		jumbo_q_size--;
559 
560 	if (fl_q_size < (FL_Q_SIZE / 4) || jumbo_q_size < (JUMBO_Q_SIZE / 2))
561 		device_printf(adap->dev,
562 		    "Insufficient clusters and/or jumbo buffers.\n");
563 
564 	p->max_pkt_size = jumbo_buf_size - sizeof(struct cpl_rx_data);
565 
566 	for (i = 0; i < SGE_QSETS; ++i) {
567 		struct qset_params *q = p->qset + i;
568 
569 		if (adap->params.nports > 2) {
570 			q->coalesce_usecs = 50;
571 		} else {
572 #ifdef INVARIANTS
573 			q->coalesce_usecs = 10;
574 #else
575 			q->coalesce_usecs = 5;
576 #endif
577 		}
578 		q->polling = 0;
579 		q->rspq_size = RSPQ_Q_SIZE;
580 		q->fl_size = fl_q_size;
581 		q->jumbo_size = jumbo_q_size;
582 		q->jumbo_buf_size = jumbo_buf_size;
583 		q->txq_size[TXQ_ETH] = TX_ETH_Q_SIZE;
584 		q->txq_size[TXQ_OFLD] = is_offload(adap) ? TX_OFLD_Q_SIZE : 16;
585 		q->txq_size[TXQ_CTRL] = TX_CTRL_Q_SIZE;
586 		q->cong_thres = 0;
587 	}
588 }
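/*
 * Sizing sketch for t3_sge_prep() (all numbers are illustrative
 * assumptions): on a 2-port adapter with 4 CPUs and SGE_QSETS of 8,
 * nqsets = min(8 / 2, 4) * 2 = 8.  With nmbclusters = 25600 the free list
 * starts at min(25600 / (3 * 8), FL_Q_SIZE) = min(1066, FL_Q_SIZE) and is
 * then rounded down to a power of two, e.g. 1024.  The jumbo free list is
 * sized the same way from the 9k/16k jumbo cluster pool in use.
 */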
589 
590 int
591 t3_sge_alloc(adapter_t *sc)
592 {
593 
594 	/* The parent tag. */
595 	if (bus_dma_tag_create( bus_get_dma_tag(sc->dev),/* PCI parent */
596 				1, 0,			/* algnmnt, boundary */
597 				BUS_SPACE_MAXADDR,	/* lowaddr */
598 				BUS_SPACE_MAXADDR,	/* highaddr */
599 				NULL, NULL,		/* filter, filterarg */
600 				BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
601 				BUS_SPACE_UNRESTRICTED, /* nsegments */
602 				BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
603 				0,			/* flags */
604 				NULL, NULL,		/* lock, lockarg */
605 				&sc->parent_dmat)) {
606 		device_printf(sc->dev, "Cannot allocate parent DMA tag\n");
607 		return (ENOMEM);
608 	}
609 
610 	/*
611 	 * DMA tag for normal sized RX frames
612 	 */
613 	if (bus_dma_tag_create(sc->parent_dmat, MCLBYTES, 0, BUS_SPACE_MAXADDR,
614 		BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, 1,
615 		MCLBYTES, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_dmat)) {
616 		device_printf(sc->dev, "Cannot allocate RX DMA tag\n");
617 		return (ENOMEM);
618 	}
619 
620 	/*
621 	 * DMA tag for jumbo sized RX frames.
622 	 */
623 	if (bus_dma_tag_create(sc->parent_dmat, MJUM16BYTES, 0, BUS_SPACE_MAXADDR,
624 		BUS_SPACE_MAXADDR, NULL, NULL, MJUM16BYTES, 1, MJUM16BYTES,
625 		BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_jumbo_dmat)) {
626 		device_printf(sc->dev, "Cannot allocate RX jumbo DMA tag\n");
627 		return (ENOMEM);
628 	}
629 
630 	/*
631 	 * DMA tag for TX frames.
632 	 */
633 	if (bus_dma_tag_create(sc->parent_dmat, 1, 0, BUS_SPACE_MAXADDR,
634 		BUS_SPACE_MAXADDR, NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
635 		TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
636 		NULL, NULL, &sc->tx_dmat)) {
637 		device_printf(sc->dev, "Cannot allocate TX DMA tag\n");
638 		return (ENOMEM);
639 	}
640 
641 	return (0);
642 }
643 
644 int
645 t3_sge_free(struct adapter * sc)
646 {
647 
648 	if (sc->tx_dmat != NULL)
649 		bus_dma_tag_destroy(sc->tx_dmat);
650 
651 	if (sc->rx_jumbo_dmat != NULL)
652 		bus_dma_tag_destroy(sc->rx_jumbo_dmat);
653 
654 	if (sc->rx_dmat != NULL)
655 		bus_dma_tag_destroy(sc->rx_dmat);
656 
657 	if (sc->parent_dmat != NULL)
658 		bus_dma_tag_destroy(sc->parent_dmat);
659 
660 	return (0);
661 }
662 
663 void
664 t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p)
665 {
666 
667 	qs->rspq.holdoff_tmr = max(p->coalesce_usecs * 10, 1U);
668 	qs->rspq.polling = 0 /* p->polling */;
669 }
670 
671 #if !defined(__i386__) && !defined(__amd64__)
672 static void
673 refill_fl_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
674 {
675 	struct refill_fl_cb_arg *cb_arg = arg;
676 
677 	cb_arg->error = error;
678 	cb_arg->seg = segs[0];
679 	cb_arg->nseg = nseg;
680 
681 }
682 #endif
683 /**
684  *	refill_fl - refill an SGE free-buffer list
685  *	@sc: the controller softc
686  *	@q: the free-list to refill
687  *	@n: the number of new buffers to allocate
688  *
689  *	(Re)populate an SGE free-buffer list with up to @n new packet buffers.
690  *	The caller must assure that @n does not exceed the queue's capacity.
691  */
692 static void
693 refill_fl(adapter_t *sc, struct sge_fl *q, int n)
694 {
695 	struct rx_sw_desc *sd = &q->sdesc[q->pidx];
696 	struct rx_desc *d = &q->desc[q->pidx];
697 	struct refill_fl_cb_arg cb_arg;
698 	struct mbuf *m;
699 	caddr_t cl;
700 	int err;
701 
702 	cb_arg.error = 0;
703 	while (n--) {
704 		/*
705 		 * We only allocate a cluster, mbuf allocation happens after rx
706 		 */
707 		if (q->zone == zone_pack) {
708 			if ((m = m_getcl(M_NOWAIT, MT_NOINIT, M_PKTHDR)) == NULL)
709 				break;
710 			cl = m->m_ext.ext_buf;
711 		} else {
712 			if ((cl = m_cljget(NULL, M_NOWAIT, q->buf_size)) == NULL)
713 				break;
714 			if ((m = m_gethdr(M_NOWAIT, MT_NOINIT)) == NULL) {
715 				uma_zfree(q->zone, cl);
716 				break;
717 			}
718 		}
719 		if ((sd->flags & RX_SW_DESC_MAP_CREATED) == 0) {
720 			if ((err = bus_dmamap_create(q->entry_tag, 0, &sd->map))) {
721 				log(LOG_WARNING, "bus_dmamap_create failed %d\n", err);
722 				uma_zfree(q->zone, cl);
723 				goto done;
724 			}
725 			sd->flags |= RX_SW_DESC_MAP_CREATED;
726 		}
727 #if !defined(__i386__) && !defined(__amd64__)
728 		err = bus_dmamap_load(q->entry_tag, sd->map,
729 		    cl, q->buf_size, refill_fl_cb, &cb_arg, 0);
730 
731 		if (err != 0 || cb_arg.error) {
732 			if (q->zone == zone_pack)
733 				uma_zfree(q->zone, cl);
734 			m_free(m);
735 			goto done;
736 		}
737 #else
738 		cb_arg.seg.ds_addr = pmap_kextract((vm_offset_t)cl);
739 #endif
740 		sd->flags |= RX_SW_DESC_INUSE;
741 		sd->rxsd_cl = cl;
742 		sd->m = m;
743 		d->addr_lo = htobe32(cb_arg.seg.ds_addr & 0xffffffff);
744 		d->addr_hi = htobe32(((uint64_t)cb_arg.seg.ds_addr >>32) & 0xffffffff);
745 		d->len_gen = htobe32(V_FLD_GEN1(q->gen));
746 		d->gen2 = htobe32(V_FLD_GEN2(q->gen));
747 
748 		d++;
749 		sd++;
750 
751 		if (++q->pidx == q->size) {
752 			q->pidx = 0;
753 			q->gen ^= 1;
754 			sd = q->sdesc;
755 			d = q->desc;
756 		}
757 		q->credits++;
758 		q->db_pending++;
759 	}
760 
761 done:
762 	if (q->db_pending >= 32) {
763 		q->db_pending = 0;
764 		t3_write_reg(sc, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
765 	}
766 }
767 
768 
769 /**
770  *	free_rx_bufs - free the Rx buffers on an SGE free list
771  *	@sc: the controller softc
772  *	@q: the SGE free list to clean up
773  *
774  *	Release the buffers on an SGE free-buffer Rx queue.  HW fetching from
775  *	this queue should be stopped before calling this function.
776  */
777 static void
778 free_rx_bufs(adapter_t *sc, struct sge_fl *q)
779 {
780 	u_int cidx = q->cidx;
781 
782 	while (q->credits--) {
783 		struct rx_sw_desc *d = &q->sdesc[cidx];
784 
785 		if (d->flags & RX_SW_DESC_INUSE) {
786 			bus_dmamap_unload(q->entry_tag, d->map);
787 			bus_dmamap_destroy(q->entry_tag, d->map);
788 			if (q->zone == zone_pack) {
789 				m_init(d->m, zone_pack, MCLBYTES,
790 				    M_NOWAIT, MT_DATA, M_EXT);
791 				uma_zfree(zone_pack, d->m);
792 			} else {
793 				m_init(d->m, zone_mbuf, MLEN,
794 				    M_NOWAIT, MT_DATA, 0);
795 				uma_zfree(zone_mbuf, d->m);
796 				uma_zfree(q->zone, d->rxsd_cl);
797 			}
798 		}
799 
800 		d->rxsd_cl = NULL;
801 		d->m = NULL;
802 		if (++cidx == q->size)
803 			cidx = 0;
804 	}
805 }
806 
807 static __inline void
808 __refill_fl(adapter_t *adap, struct sge_fl *fl)
809 {
810 	refill_fl(adap, fl, min(16U, fl->size - fl->credits));
811 }
812 
813 static __inline void
814 __refill_fl_lt(adapter_t *adap, struct sge_fl *fl, int max)
815 {
816 	uint32_t reclaimable = fl->size - fl->credits;
817 
818 	if (reclaimable > 0)
819 		refill_fl(adap, fl, min(max, reclaimable));
820 }
821 
822 /**
823  *	recycle_rx_buf - recycle a receive buffer
824  *	@adapter: the adapter
825  *	@q: the SGE free list
826  *	@idx: index of buffer to recycle
827  *
828  *	Recycles the specified buffer on the given free list by adding it at
829  *	the next available slot on the list.
830  */
831 static void
832 recycle_rx_buf(adapter_t *adap, struct sge_fl *q, unsigned int idx)
833 {
834 	struct rx_desc *from = &q->desc[idx];
835 	struct rx_desc *to   = &q->desc[q->pidx];
836 
837 	q->sdesc[q->pidx] = q->sdesc[idx];
838 	to->addr_lo = from->addr_lo;        // already big endian
839 	to->addr_hi = from->addr_hi;        // likewise
840 	wmb();	/* necessary ? */
841 	to->len_gen = htobe32(V_FLD_GEN1(q->gen));
842 	to->gen2 = htobe32(V_FLD_GEN2(q->gen));
843 	q->credits++;
844 
845 	if (++q->pidx == q->size) {
846 		q->pidx = 0;
847 		q->gen ^= 1;
848 	}
849 	t3_write_reg(adap, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
850 }
851 
852 static void
853 alloc_ring_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
854 {
855 	uint32_t *addr;
856 
857 	addr = arg;
858 	*addr = segs[0].ds_addr;
859 }
860 
861 static int
862 alloc_ring(adapter_t *sc, size_t nelem, size_t elem_size, size_t sw_size,
863     bus_addr_t *phys, void *desc, void *sdesc, bus_dma_tag_t *tag,
864     bus_dmamap_t *map, bus_dma_tag_t parent_entry_tag, bus_dma_tag_t *entry_tag)
865 {
866 	size_t len = nelem * elem_size;
867 	void *s = NULL;
868 	void *p = NULL;
869 	int err;
870 
871 	if ((err = bus_dma_tag_create(sc->parent_dmat, PAGE_SIZE, 0,
872 				      BUS_SPACE_MAXADDR_32BIT,
873 				      BUS_SPACE_MAXADDR, NULL, NULL, len, 1,
874 				      len, 0, NULL, NULL, tag)) != 0) {
875 		device_printf(sc->dev, "Cannot allocate descriptor tag\n");
876 		return (ENOMEM);
877 	}
878 
879 	if ((err = bus_dmamem_alloc(*tag, (void **)&p, BUS_DMA_NOWAIT,
880 				    map)) != 0) {
881 		device_printf(sc->dev, "Cannot allocate descriptor memory\n");
882 		return (ENOMEM);
883 	}
884 
885 	bus_dmamap_load(*tag, *map, p, len, alloc_ring_cb, phys, 0);
886 	bzero(p, len);
887 	*(void **)desc = p;
888 
889 	if (sw_size) {
890 		len = nelem * sw_size;
891 		s = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO);
892 		*(void **)sdesc = s;
893 	}
894 	if (parent_entry_tag == NULL)
895 		return (0);
896 
897 	if ((err = bus_dma_tag_create(parent_entry_tag, 1, 0,
898 				      BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR,
899 		                      NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
900 				      TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
901 		                      NULL, NULL, entry_tag)) != 0) {
902 		device_printf(sc->dev, "Cannot allocate descriptor entry tag\n");
903 		return (ENOMEM);
904 	}
905 	return (0);
906 }
907 
908 static void
909 sge_slow_intr_handler(void *arg, int ncount)
910 {
911 	adapter_t *sc = arg;
912 
913 	t3_slow_intr_handler(sc);
914 	t3_write_reg(sc, A_PL_INT_ENABLE0, sc->slow_intr_mask);
915 	(void) t3_read_reg(sc, A_PL_INT_ENABLE0);
916 }
917 
918 /**
919  *	sge_timer_cb - perform periodic maintenance of the SGE queues
920  *	@arg: the adapter whose queues need maintenance
921  *
922  *	Runs periodically from a timer to perform maintenance of the adapter's
923  *	SGE queues.  It performs the following tasks:
924  *
925  *	a) Cleans up any completed Tx descriptors that may still be pending.
926  *	Normal descriptor cleanup happens when new packets are added to a Tx
927  *	queue so this timer is relatively infrequent and does any cleanup only
928  *	if the Tx queue has not seen any new packets in a while.  We make a
929  *	best effort attempt to reclaim descriptors, in that we don't wait
930  *	around if we cannot get a queue's lock (which most likely is because
931  *	someone else is queueing new packets and so will also handle the clean
932  *	up).  Since control queues use immediate data exclusively we don't
933  *	bother cleaning them up here.
934  *
935  *	b) Replenishes Rx queues that have run out due to memory shortage.
936  *	Normally new Rx buffers are added when existing ones are consumed but
937  *	when out of memory a queue can become empty.  We try to add only a few
938  *	buffers here, the queue will be replenished fully as these new buffers
939  *	are used up if memory shortage has subsided.
940  *
941  *	c) Return coalesced response queue credits in case a response queue is
942  *	starved.
943  *
944  *	d) Ring doorbells for T304 tunnel queues since we have seen doorbell
945  *	fifo overflows and the FW doesn't implement any recovery scheme yet.
946  */
947 static void
948 sge_timer_cb(void *arg)
949 {
950 	adapter_t *sc = arg;
951 	if ((sc->flags & USING_MSIX) == 0) {
952 
953 		struct port_info *pi;
954 		struct sge_qset *qs;
955 		struct sge_txq  *txq;
956 		int i, j;
957 		int reclaim_ofl, refill_rx;
958 
959 		if (sc->open_device_map == 0)
960 			return;
961 
962 		for (i = 0; i < sc->params.nports; i++) {
963 			pi = &sc->port[i];
964 			for (j = 0; j < pi->nqsets; j++) {
965 				qs = &sc->sge.qs[pi->first_qset + j];
966 				txq = &qs->txq[0];
967 				reclaim_ofl = txq[TXQ_OFLD].processed - txq[TXQ_OFLD].cleaned;
968 				refill_rx = ((qs->fl[0].credits < qs->fl[0].size) ||
969 				    (qs->fl[1].credits < qs->fl[1].size));
970 				if (reclaim_ofl || refill_rx) {
971 					taskqueue_enqueue(sc->tq, &pi->timer_reclaim_task);
972 					break;
973 				}
974 			}
975 		}
976 	}
977 
978 	if (sc->params.nports > 2) {
979 		int i;
980 
981 		for_each_port(sc, i) {
982 			struct port_info *pi = &sc->port[i];
983 
984 			t3_write_reg(sc, A_SG_KDOORBELL,
985 				     F_SELEGRCNTX |
986 				     (FW_TUNNEL_SGEEC_START + pi->first_qset));
987 		}
988 	}
989 	if (((sc->flags & USING_MSIX) == 0 || sc->params.nports > 2) &&
990 	    sc->open_device_map != 0)
991 		callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
992 }
993 
994 /*
995  * This is meant to be a catch-all function to keep sge state private
996  * to sge.c
997  *
998  */
999 int
1000 t3_sge_init_adapter(adapter_t *sc)
1001 {
1002 	callout_init(&sc->sge_timer_ch, CALLOUT_MPSAFE);
1003 	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
1004 	TASK_INIT(&sc->slow_intr_task, 0, sge_slow_intr_handler, sc);
1005 	return (0);
1006 }
1007 
1008 int
1009 t3_sge_reset_adapter(adapter_t *sc)
1010 {
1011 	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
1012 	return (0);
1013 }
1014 
1015 int
1016 t3_sge_init_port(struct port_info *pi)
1017 {
1018 	TASK_INIT(&pi->timer_reclaim_task, 0, sge_timer_reclaim, pi);
1019 	return (0);
1020 }
1021 
1022 /**
1023  *	refill_rspq - replenish an SGE response queue
1024  *	@adapter: the adapter
1025  *	@q: the response queue to replenish
1026  *	@credits: how many new responses to make available
1027  *
1028  *	Replenishes a response queue by making the supplied number of responses
1029  *	available to HW.
1030  */
1031 static __inline void
1032 refill_rspq(adapter_t *sc, const struct sge_rspq *q, u_int credits)
1033 {
1034 
1035 	/* mbufs are allocated on demand when a rspq entry is processed. */
1036 	t3_write_reg(sc, A_SG_RSPQ_CREDIT_RETURN,
1037 		     V_RSPQ(q->cntxt_id) | V_CREDITS(credits));
1038 }
1039 
1040 static void
1041 sge_txq_reclaim_handler(void *arg, int ncount)
1042 {
1043 	struct sge_qset *qs = arg;
1044 	int i;
1045 
1046 	for (i = 0; i < 3; i++)
1047 		reclaim_completed_tx(qs, 16, i);
1048 }
1049 
1050 static void
1051 sge_timer_reclaim(void *arg, int ncount)
1052 {
1053 	struct port_info *pi = arg;
1054 	int i, nqsets = pi->nqsets;
1055 	adapter_t *sc = pi->adapter;
1056 	struct sge_qset *qs;
1057 	struct mtx *lock;
1058 
1059 	KASSERT((sc->flags & USING_MSIX) == 0,
1060 	    ("can't call timer reclaim for msi-x"));
1061 
1062 	for (i = 0; i < nqsets; i++) {
1063 		qs = &sc->sge.qs[pi->first_qset + i];
1064 
1065 		reclaim_completed_tx(qs, 16, TXQ_OFLD);
1066 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
1067 			    &sc->sge.qs[0].rspq.lock;
1068 
1069 		if (mtx_trylock(lock)) {
1070 			/* XXX currently assume that we are *NOT* polling */
1071 			uint32_t status = t3_read_reg(sc, A_SG_RSPQ_FL_STATUS);
1072 
1073 			if (qs->fl[0].credits < qs->fl[0].size - 16)
1074 				__refill_fl(sc, &qs->fl[0]);
1075 			if (qs->fl[1].credits < qs->fl[1].size - 16)
1076 				__refill_fl(sc, &qs->fl[1]);
1077 
1078 			if (status & (1 << qs->rspq.cntxt_id)) {
1079 				if (qs->rspq.credits) {
1080 					refill_rspq(sc, &qs->rspq, 1);
1081 					qs->rspq.credits--;
1082 					t3_write_reg(sc, A_SG_RSPQ_FL_STATUS,
1083 					    1 << qs->rspq.cntxt_id);
1084 				}
1085 			}
1086 			mtx_unlock(lock);
1087 		}
1088 	}
1089 }
1090 
1091 /**
1092  *	init_qset_cntxt - initialize an SGE queue set context info
1093  *	@qs: the queue set
1094  *	@id: the queue set id
1095  *
1096  *	Initializes the TIDs and context ids for the queues of a queue set.
1097  */
1098 static void
1099 init_qset_cntxt(struct sge_qset *qs, u_int id)
1100 {
1101 
1102 	qs->rspq.cntxt_id = id;
1103 	qs->fl[0].cntxt_id = 2 * id;
1104 	qs->fl[1].cntxt_id = 2 * id + 1;
1105 	qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id;
1106 	qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id;
1107 	qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id;
1108 	qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id;
1109 	qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id;
1110 
1111 	mbufq_init(&qs->txq[TXQ_ETH].sendq);
1112 	mbufq_init(&qs->txq[TXQ_OFLD].sendq);
1113 	mbufq_init(&qs->txq[TXQ_CTRL].sendq);
1114 }
1115 
1116 
1117 static void
1118 txq_prod(struct sge_txq *txq, unsigned int ndesc, struct txq_state *txqs)
1119 {
1120 	txq->in_use += ndesc;
1121 	/*
1122 	 * XXX we don't handle stopping of the queue here;
1123 	 * presumably the start routine handles it when we bump against the end
1124 	 */
1125 	txqs->gen = txq->gen;
1126 	txq->unacked += ndesc;
1127 	txqs->compl = (txq->unacked & 32) << (S_WR_COMPL - 5);
1128 	txq->unacked &= 31;
1129 	txqs->pidx = txq->pidx;
1130 	txq->pidx += ndesc;
1131 #ifdef INVARIANTS
1132 	if (((txqs->pidx > txq->cidx) &&
1133 		(txq->pidx < txqs->pidx) &&
1134 		(txq->pidx >= txq->cidx)) ||
1135 	    ((txqs->pidx < txq->cidx) &&
1136 		(txq->pidx >= txq-> cidx)) ||
1137 	    ((txqs->pidx < txq->cidx) &&
1138 		(txq->cidx < txqs->pidx)))
1139 		panic("txqs->pidx=%d txq->pidx=%d txq->cidx=%d",
1140 		    txqs->pidx, txq->pidx, txq->cidx);
1141 #endif
1142 	if (txq->pidx >= txq->size) {
1143 		txq->pidx -= txq->size;
1144 		txq->gen ^= 1;
1145 	}
1146 
1147 }
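/*
 * Completion batching example for txq_prod(): txq->unacked counts
 * descriptors issued since the last completion request.  If unacked was 30
 * and ndesc is 4 the sum is 34; bit 5 (value 32) is set, so txqs->compl
 * carries F_WR_COMPL and the counter folds back to 34 & 31 = 2.  Roughly
 * one completion is therefore requested per 32 descriptors.
 */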
1148 
1149 /**
1150  *	calc_tx_descs - calculate the number of Tx descriptors for a packet
1151  *	@m: the packet mbufs
1152  *      @nsegs: the number of segments
1153  *
1154  * 	Returns the number of Tx descriptors needed for the given Ethernet
1155  * 	packet.  Ethernet packets require addition of WR and CPL headers.
1156  */
1157 static __inline unsigned int
1158 calc_tx_descs(const struct mbuf *m, int nsegs)
1159 {
1160 	unsigned int flits;
1161 
1162 	if (m->m_pkthdr.len <= PIO_LEN)
1163 		return 1;
1164 
1165 	flits = sgl_len(nsegs) + 2;
1166 	if (m->m_pkthdr.csum_flags & CSUM_TSO)
1167 		flits++;
1168 
1169 	return flits_to_desc(flits);
1170 }
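/*
 * Example: a packet larger than PIO_LEN that maps to 4 DMA segments needs
 * sgl_len(4) + 2 = 8 flits (9 with TSO); flit_desc_map maps either count to
 * a single Tx descriptor.
 */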
1171 
1172 static unsigned int
1173 busdma_map_mbufs(struct mbuf **m, struct sge_txq *txq,
1174     struct tx_sw_desc *txsd, bus_dma_segment_t *segs, int *nsegs)
1175 {
1176 	struct mbuf *m0;
1177 	int err, pktlen, pass = 0;
1178 	bus_dma_tag_t tag = txq->entry_tag;
1179 
1180 retry:
1181 	err = 0;
1182 	m0 = *m;
1183 	pktlen = m0->m_pkthdr.len;
1184 #if defined(__i386__) || defined(__amd64__)
1185 	if (busdma_map_sg_collapse(tag, txsd->map, m, segs, nsegs) == 0) {
1186 		goto done;
1187 	} else
1188 #endif
1189 		err = bus_dmamap_load_mbuf_sg(tag, txsd->map, m0, segs, nsegs, 0);
1190 
1191 	if (err == 0) {
1192 		goto done;
1193 	}
1194 	if (err == EFBIG && pass == 0) {
1195 		pass = 1;
1196 		/* Too many segments, try to defrag */
1197 		m0 = m_defrag(m0, M_DONTWAIT);
1198 		if (m0 == NULL) {
1199 			m_freem(*m);
1200 			*m = NULL;
1201 			return (ENOBUFS);
1202 		}
1203 		*m = m0;
1204 		goto retry;
1205 	} else if (err == ENOMEM) {
1206 		return (err);
1207 	} else if (err) {
1208 		if (cxgb_debug)
1209 			printf("map failure err=%d pktlen=%d\n", err, pktlen);
1210 		m_freem(m0);
1211 		*m = NULL;
1212 		return (err);
1213 	}
1214 done:
1215 #if !defined(__i386__) && !defined(__amd64__)
1216 	bus_dmamap_sync(tag, txsd->map, BUS_DMASYNC_PREWRITE);
1217 #endif
1218 	txsd->flags |= TX_SW_DESC_MAPPED;
1219 
1220 	return (0);
1221 }
1222 
1223 /**
1224  *	make_sgl - populate a scatter/gather list for a packet
1225  *	@sgp: the SGL to populate
1226  *	@segs: the packet dma segments
1227  *	@nsegs: the number of segments
1228  *
1229  *	Generates a scatter/gather list for the buffers that make up a packet
1230  *	for transmission.  The caller is responsible for sizing the SGL
1231  *	appropriately beforehand (see sgl_len()).
1232  */
1233 static __inline void
1234 make_sgl(struct sg_ent *sgp, bus_dma_segment_t *segs, int nsegs)
1235 {
1236 	int i, idx;
1237 
1238 	for (idx = 0, i = 0; i < nsegs; i++) {
1239 		/*
1240 		 * firmware doesn't like empty segments
1241 		 */
1242 		if (segs[i].ds_len == 0)
1243 			continue;
1244 		if (i && idx == 0)
1245 			++sgp;
1246 
1247 		sgp->len[idx] = htobe32(segs[i].ds_len);
1248 		sgp->addr[idx] = htobe64(segs[i].ds_addr);
1249 		idx ^= 1;
1250 	}
1251 
1252 	if (idx) {
1253 		sgp->len[idx] = 0;
1254 		sgp->addr[idx] = 0;
1255 	}
1256 }
1257 
1258 /**
1259  *	check_ring_tx_db - check and potentially ring a Tx queue's doorbell
1260  *	@adap: the adapter
1261  *	@q: the Tx queue
1262  *
1263  *	Ring the doorbell if a Tx queue is asleep.  There is a natural race,
1264  *	where the HW is going to sleep just after we checked, however,
1265  *	then the interrupt handler will detect the outstanding TX packet
1266  *	and ring the doorbell for us.
1267  *
1268  *	When GTS is disabled we unconditionally ring the doorbell.
1269  */
1270 static __inline void
1271 check_ring_tx_db(adapter_t *adap, struct sge_txq *q, int mustring)
1272 {
1273 #if USE_GTS
1274 	clear_bit(TXQ_LAST_PKT_DB, &q->flags);
1275 	if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) {
1276 		set_bit(TXQ_LAST_PKT_DB, &q->flags);
1277 #ifdef T3_TRACE
1278 		T3_TRACE1(adap->tb[q->cntxt_id & 7], "doorbell Tx, cntxt %d",
1279 			  q->cntxt_id);
1280 #endif
1281 		t3_write_reg(adap, A_SG_KDOORBELL,
1282 			     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1283 	}
1284 #else
1285 	if (mustring || ++q->db_pending >= 32) {
1286 		wmb();            /* write descriptors before telling HW */
1287 		t3_write_reg(adap, A_SG_KDOORBELL,
1288 		    F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1289 		q->db_pending = 0;
1290 	}
1291 #endif
1292 }
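/*
 * With GTS disabled the doorbell is written either immediately (callers such
 * as cxgb_start_locked() pass mustring when they need pending work flushed)
 * or lazily once 32 descriptors have accumulated in db_pending, mirroring
 * the 32-buffer batching used for free-list doorbells in refill_fl().
 */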
1293 
1294 static __inline void
1295 wr_gen2(struct tx_desc *d, unsigned int gen)
1296 {
1297 #if SGE_NUM_GENBITS == 2
1298 	d->flit[TX_DESC_FLITS - 1] = htobe64(gen);
1299 #endif
1300 }
1301 
1302 /**
1303  *	write_wr_hdr_sgl - write a WR header and, optionally, SGL
1304  *	@ndesc: number of Tx descriptors spanned by the SGL
1305  *	@txd: first Tx descriptor to be written
1306  *	@txqs: txq state (generation and producer index)
1307  *	@txq: the SGE Tx queue
1308  *	@sgl: the SGL
1309  *	@flits: number of flits to the start of the SGL in the first descriptor
1310  *	@sgl_flits: the SGL size in flits
1311  *	@wr_hi: top 32 bits of WR header based on WR type (big endian)
1312  *	@wr_lo: low 32 bits of WR header based on WR type (big endian)
1313  *
1314  *	Write a work request header and an associated SGL.  If the SGL is
1315  *	small enough to fit into one Tx descriptor it has already been written
1316  *	and we just need to write the WR header.  Otherwise we distribute the
1317  *	SGL across the number of descriptors it spans.
1318  */
1319 static void
1320 write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs,
1321     const struct sge_txq *txq, const struct sg_ent *sgl, unsigned int flits,
1322     unsigned int sgl_flits, unsigned int wr_hi, unsigned int wr_lo)
1323 {
1324 
1325 	struct work_request_hdr *wrp = (struct work_request_hdr *)txd;
1326 	struct tx_sw_desc *txsd = &txq->sdesc[txqs->pidx];
1327 
1328 	if (__predict_true(ndesc == 1)) {
1329 		set_wr_hdr(wrp, htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1330 			V_WR_SGLSFLT(flits)) | wr_hi,
1331 		    htonl(V_WR_LEN(flits + sgl_flits) |
1332 			V_WR_GEN(txqs->gen)) | wr_lo);
1333 		/* XXX gen? */
1334 		wr_gen2(txd, txqs->gen);
1335 
1336 	} else {
1337 		unsigned int ogen = txqs->gen;
1338 		const uint64_t *fp = (const uint64_t *)sgl;
1339 		struct work_request_hdr *wp = wrp;
1340 
1341 		wrp->wrh_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) |
1342 		    V_WR_SGLSFLT(flits)) | wr_hi;
1343 
1344 		while (sgl_flits) {
1345 			unsigned int avail = WR_FLITS - flits;
1346 
1347 			if (avail > sgl_flits)
1348 				avail = sgl_flits;
1349 			memcpy(&txd->flit[flits], fp, avail * sizeof(*fp));
1350 			sgl_flits -= avail;
1351 			ndesc--;
1352 			if (!sgl_flits)
1353 				break;
1354 
1355 			fp += avail;
1356 			txd++;
1357 			txsd++;
1358 			if (++txqs->pidx == txq->size) {
1359 				txqs->pidx = 0;
1360 				txqs->gen ^= 1;
1361 				txd = txq->desc;
1362 				txsd = txq->sdesc;
1363 			}
1364 
1365 			/*
1366 			 * when the head of the mbuf chain
1367 			 * is freed all clusters will be freed
1368 			 * with it
1369 			 */
1370 			wrp = (struct work_request_hdr *)txd;
1371 			wrp->wrh_hi = htonl(V_WR_DATATYPE(1) |
1372 			    V_WR_SGLSFLT(1)) | wr_hi;
1373 			wrp->wrh_lo = htonl(V_WR_LEN(min(WR_FLITS,
1374 				    sgl_flits + 1)) |
1375 			    V_WR_GEN(txqs->gen)) | wr_lo;
1376 			wr_gen2(txd, txqs->gen);
1377 			flits = 1;
1378 		}
1379 		wrp->wrh_hi |= htonl(F_WR_EOP);
1380 		wmb();
1381 		wp->wrh_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo;
1382 		wr_gen2((struct tx_desc *)wp, ogen);
1383 	}
1384 }
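/*
 * Split example for write_wr_hdr_sgl() (assuming WR_FLITS of 15, i.e. two
 * generation bits): a TSO packet with 12 DMA segments arrives with flits = 3
 * and sgl_flits = sgl_len(12) = 18.  The first descriptor takes
 * 15 - 3 = 12 SGL flits and the second takes the remaining 6 after its
 * one-flit WR header, so the request spans 2 descriptors, consistent with
 * flit_desc_map[18 + 3] = 2.
 */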
1385 
1386 /* sizeof(*eh) + sizeof(*ip) + sizeof(*tcp) */
1387 #define TCPPKTHDRSIZE (ETHER_HDR_LEN + 20 + 20)
1388 
1389 #define GET_VTAG(cntrl, m) \
1390 do { \
1391 	if ((m)->m_flags & M_VLANTAG)					            \
1392 		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN((m)->m_pkthdr.ether_vtag); \
1393 } while (0)
1394 
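/*
 * Overview of the three transmit paths handled by t3_encap() below (summary
 * added for clarity):
 *
 *  1. Coalesced packets (an m_nextpkt chain from cxgb_dequeue()): up to 7
 *     small packets are packed into one tunnel-Tx work request using the
 *     cpl_tx_pkt_batch layout, two flits per packet.
 *  2. TSO packets: a CPL_TX_PKT_LSO header is built from the Ethernet,
 *     IP/IPv6 and TCP headers; TSO packets no larger than PIO_LEN are copied
 *     inline as immediate data.
 *  3. Ordinary packets: a CPL_TX_PKT header is used; packets no larger than
 *     PIO_LEN are copied inline, larger ones are sent via a scatter/gather
 *     list written by make_sgl()/write_wr_hdr_sgl().
 */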
1395 static int
1396 t3_encap(struct sge_qset *qs, struct mbuf **m)
1397 {
1398 	adapter_t *sc;
1399 	struct mbuf *m0;
1400 	struct sge_txq *txq;
1401 	struct txq_state txqs;
1402 	struct port_info *pi;
1403 	unsigned int ndesc, flits, cntrl, mlen;
1404 	int err, nsegs, tso_info = 0;
1405 
1406 	struct work_request_hdr *wrp;
1407 	struct tx_sw_desc *txsd;
1408 	struct sg_ent *sgp, *sgl;
1409 	uint32_t wr_hi, wr_lo, sgl_flits;
1410 	bus_dma_segment_t segs[TX_MAX_SEGS];
1411 
1412 	struct tx_desc *txd;
1413 
1414 	pi = qs->port;
1415 	sc = pi->adapter;
1416 	txq = &qs->txq[TXQ_ETH];
1417 	txd = &txq->desc[txq->pidx];
1418 	txsd = &txq->sdesc[txq->pidx];
1419 	sgl = txq->txq_sgl;
1420 
1421 	prefetch(txd);
1422 	m0 = *m;
1423 
1424 	mtx_assert(&qs->lock, MA_OWNED);
1425 	cntrl = V_TXPKT_INTF(pi->txpkt_intf);
1426 	KASSERT(m0->m_flags & M_PKTHDR, ("not packet header\n"));
1427 
1428 	if  (m0->m_nextpkt == NULL && m0->m_next != NULL &&
1429 	    m0->m_pkthdr.csum_flags & (CSUM_TSO))
1430 		tso_info = V_LSO_MSS(m0->m_pkthdr.tso_segsz);
1431 
1432 	if (m0->m_nextpkt != NULL) {
1433 		busdma_map_sg_vec(txq->entry_tag, txsd->map, m0, segs, &nsegs);
1434 		ndesc = 1;
1435 		mlen = 0;
1436 	} else {
1437 		if ((err = busdma_map_sg_collapse(txq->entry_tag, txsd->map,
1438 		    &m0, segs, &nsegs))) {
1439 			if (cxgb_debug)
1440 				printf("failed ... err=%d\n", err);
1441 			return (err);
1442 		}
1443 		mlen = m0->m_pkthdr.len;
1444 		ndesc = calc_tx_descs(m0, nsegs);
1445 	}
1446 	txq_prod(txq, ndesc, &txqs);
1447 
1448 	KASSERT(m0->m_pkthdr.len, ("empty packet nsegs=%d", nsegs));
1449 	txsd->m = m0;
1450 
1451 	if (m0->m_nextpkt != NULL) {
1452 		struct cpl_tx_pkt_batch *cpl_batch = (struct cpl_tx_pkt_batch *)txd;
1453 		int i, fidx;
1454 
1455 		if (nsegs > 7)
1456 			panic("trying to coalesce %d packets in to one WR", nsegs);
1457 		txq->txq_coalesced += nsegs;
1458 		wrp = (struct work_request_hdr *)txd;
1459 		flits = nsegs*2 + 1;
1460 
1461 		for (fidx = 1, i = 0; i < nsegs; i++, fidx += 2) {
1462 			struct cpl_tx_pkt_batch_entry *cbe;
1463 			uint64_t flit;
1464 			uint32_t *hflit = (uint32_t *)&flit;
1465 			int cflags = m0->m_pkthdr.csum_flags;
1466 
1467 			cntrl = V_TXPKT_INTF(pi->txpkt_intf);
1468 			GET_VTAG(cntrl, m0);
1469 			cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1470 			if (__predict_false(!(cflags & CSUM_IP)))
1471 				cntrl |= F_TXPKT_IPCSUM_DIS;
1472 			if (__predict_false(!(cflags & (CSUM_TCP | CSUM_UDP))))
1473 				cntrl |= F_TXPKT_L4CSUM_DIS;
1474 
1475 			hflit[0] = htonl(cntrl);
1476 			hflit[1] = htonl(segs[i].ds_len | 0x80000000);
1477 			flit |= htobe64(1 << 24);
1478 			cbe = &cpl_batch->pkt_entry[i];
1479 			cbe->cntrl = hflit[0];
1480 			cbe->len = hflit[1];
1481 			cbe->addr = htobe64(segs[i].ds_addr);
1482 		}
1483 
1484 		wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1485 		    V_WR_SGLSFLT(flits)) |
1486 		    htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1487 		wr_lo = htonl(V_WR_LEN(flits) |
1488 		    V_WR_GEN(txqs.gen)) | htonl(V_WR_TID(txq->token));
1489 		set_wr_hdr(wrp, wr_hi, wr_lo);
1490 		wmb();
1491 		ETHER_BPF_MTAP(pi->ifp, m0);
1492 		wr_gen2(txd, txqs.gen);
1493 		check_ring_tx_db(sc, txq, 0);
1494 		return (0);
1495 	} else if (tso_info) {
1496 		uint16_t eth_type;
1497 		struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *)txd;
1498 		struct ether_header *eh;
1499 		void *l3hdr;
1500 		struct tcphdr *tcp;
1501 
1502 		txd->flit[2] = 0;
1503 		GET_VTAG(cntrl, m0);
1504 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO);
1505 		hdr->cntrl = htonl(cntrl);
1506 		hdr->len = htonl(mlen | 0x80000000);
1507 
1508 		if (__predict_false(mlen < TCPPKTHDRSIZE)) {
1509 			printf("mbuf=%p,len=%d,tso_segsz=%d,csum_flags=%#x,flags=%#x",
1510 			    m0, mlen, m0->m_pkthdr.tso_segsz,
1511 			    m0->m_pkthdr.csum_flags, m0->m_flags);
1512 			panic("tx tso packet too small");
1513 		}
1514 
1515 		/* Make sure that ether, ip, tcp headers are all in m0 */
1516 		if (__predict_false(m0->m_len < TCPPKTHDRSIZE)) {
1517 			m0 = m_pullup(m0, TCPPKTHDRSIZE);
1518 			if (__predict_false(m0 == NULL)) {
1519 				/* XXX panic probably an overreaction */
1520 				panic("couldn't fit header into mbuf");
1521 			}
1522 		}
1523 
1524 		eh = mtod(m0, struct ether_header *);
1525 		eth_type = eh->ether_type;
1526 		if (eth_type == htons(ETHERTYPE_VLAN)) {
1527 			struct ether_vlan_header *evh = (void *)eh;
1528 
1529 			tso_info |= V_LSO_ETH_TYPE(CPL_ETH_II_VLAN);
1530 			l3hdr = evh + 1;
1531 			eth_type = evh->evl_proto;
1532 		} else {
1533 			tso_info |= V_LSO_ETH_TYPE(CPL_ETH_II);
1534 			l3hdr = eh + 1;
1535 		}
1536 
1537 		if (eth_type == htons(ETHERTYPE_IP)) {
1538 			struct ip *ip = l3hdr;
1539 
1540 			tso_info |= V_LSO_IPHDR_WORDS(ip->ip_hl);
1541 			tcp = (struct tcphdr *)(ip + 1);
1542 		} else if (eth_type == htons(ETHERTYPE_IPV6)) {
1543 			struct ip6_hdr *ip6 = l3hdr;
1544 
1545 			KASSERT(ip6->ip6_nxt == IPPROTO_TCP,
1546 			    ("%s: CSUM_TSO with ip6_nxt %d",
1547 			    __func__, ip6->ip6_nxt));
1548 
1549 			tso_info |= F_LSO_IPV6;
1550 			tso_info |= V_LSO_IPHDR_WORDS(sizeof(*ip6) >> 2);
1551 			tcp = (struct tcphdr *)(ip6 + 1);
1552 		} else
1553 			panic("%s: CSUM_TSO but neither ip nor ip6", __func__);
1554 
1555 		tso_info |= V_LSO_TCPHDR_WORDS(tcp->th_off);
1556 		hdr->lso_info = htonl(tso_info);
1557 
1558 		if (__predict_false(mlen <= PIO_LEN)) {
1559 			/*
1560 			 * pkt is flagged for TSO yet fits in PIO_LEN;
1561 			 * this indicates a TSO bug at the higher levels.
1562 			 */
1563 			txsd->m = NULL;
1564 			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[3]);
1565 			flits = (mlen + 7) / 8 + 3;
1566 			wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1567 					  V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1568 					  F_WR_SOP | F_WR_EOP | txqs.compl);
1569 			wr_lo = htonl(V_WR_LEN(flits) |
1570 			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1571 			set_wr_hdr(&hdr->wr, wr_hi, wr_lo);
1572 			wmb();
1573 			ETHER_BPF_MTAP(pi->ifp, m0);
1574 			wr_gen2(txd, txqs.gen);
1575 			check_ring_tx_db(sc, txq, 0);
1576 			m_freem(m0);
1577 			return (0);
1578 		}
1579 		flits = 3;
1580 	} else {
1581 		struct cpl_tx_pkt *cpl = (struct cpl_tx_pkt *)txd;
1582 
1583 		GET_VTAG(cntrl, m0);
1584 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1585 		if (__predict_false(!(m0->m_pkthdr.csum_flags & CSUM_IP)))
1586 			cntrl |= F_TXPKT_IPCSUM_DIS;
1587 		if (__predict_false(!(m0->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP))))
1588 			cntrl |= F_TXPKT_L4CSUM_DIS;
1589 		cpl->cntrl = htonl(cntrl);
1590 		cpl->len = htonl(mlen | 0x80000000);
1591 
1592 		if (mlen <= PIO_LEN) {
1593 			txsd->m = NULL;
1594 			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[2]);
1595 			flits = (mlen + 7) / 8 + 2;
1596 
1597 			wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1598 			    V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1599 					  F_WR_SOP | F_WR_EOP | txqs.compl);
1600 			wr_lo = htonl(V_WR_LEN(flits) |
1601 			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1602 			set_wr_hdr(&cpl->wr, wr_hi, wr_lo);
1603 			wmb();
1604 			ETHER_BPF_MTAP(pi->ifp, m0);
1605 			wr_gen2(txd, txqs.gen);
1606 			check_ring_tx_db(sc, txq, 0);
1607 			m_freem(m0);
1608 			return (0);
1609 		}
1610 		flits = 2;
1611 	}
1612 	wrp = (struct work_request_hdr *)txd;
1613 	sgp = (ndesc == 1) ? (struct sg_ent *)&txd->flit[flits] : sgl;
1614 	make_sgl(sgp, segs, nsegs);
1615 
1616 	sgl_flits = sgl_len(nsegs);
1617 
1618 	ETHER_BPF_MTAP(pi->ifp, m0);
1619 
1620 	KASSERT(ndesc <= 4, ("ndesc too large %d", ndesc));
1621 	wr_hi = htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1622 	wr_lo = htonl(V_WR_TID(txq->token));
1623 	write_wr_hdr_sgl(ndesc, txd, &txqs, txq, sgl, flits,
1624 	    sgl_flits, wr_hi, wr_lo);
1625 	check_ring_tx_db(sc, txq, 0);
1626 
1627 	return (0);
1628 }
1629 
1630 void
1631 cxgb_tx_watchdog(void *arg)
1632 {
1633 	struct sge_qset *qs = arg;
1634 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1635 
1636         if (qs->coalescing != 0 &&
1637 	    (txq->in_use <= cxgb_tx_coalesce_enable_stop) &&
1638 	    TXQ_RING_EMPTY(qs))
1639                 qs->coalescing = 0;
1640         else if (qs->coalescing == 0 &&
1641 	    (txq->in_use >= cxgb_tx_coalesce_enable_start))
1642                 qs->coalescing = 1;
1643 	if (TXQ_TRYLOCK(qs)) {
1644 		qs->qs_flags |= QS_FLUSHING;
1645 		cxgb_start_locked(qs);
1646 		qs->qs_flags &= ~QS_FLUSHING;
1647 		TXQ_UNLOCK(qs);
1648 	}
1649 	if (qs->port->ifp->if_drv_flags & IFF_DRV_RUNNING)
1650 		callout_reset_on(&txq->txq_watchdog, hz/4, cxgb_tx_watchdog,
1651 		    qs, txq->txq_watchdog.c_cpu);
1652 }
1653 
1654 static void
1655 cxgb_tx_timeout(void *arg)
1656 {
1657 	struct sge_qset *qs = arg;
1658 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1659 
1660 	if (qs->coalescing == 0 && (txq->in_use >= (txq->size>>3)))
1661                 qs->coalescing = 1;
1662 	if (TXQ_TRYLOCK(qs)) {
1663 		qs->qs_flags |= QS_TIMEOUT;
1664 		cxgb_start_locked(qs);
1665 		qs->qs_flags &= ~QS_TIMEOUT;
1666 		TXQ_UNLOCK(qs);
1667 	}
1668 }
1669 
1670 static void
1671 cxgb_start_locked(struct sge_qset *qs)
1672 {
1673 	struct mbuf *m_head = NULL;
1674 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1675 	struct port_info *pi = qs->port;
1676 	struct ifnet *ifp = pi->ifp;
1677 
1678 	if (qs->qs_flags & (QS_FLUSHING|QS_TIMEOUT))
1679 		reclaim_completed_tx(qs, 0, TXQ_ETH);
1680 
1681 	if (!pi->link_config.link_ok) {
1682 		TXQ_RING_FLUSH(qs);
1683 		return;
1684 	}
1685 	TXQ_LOCK_ASSERT(qs);
1686 	while (!TXQ_RING_EMPTY(qs) && (ifp->if_drv_flags & IFF_DRV_RUNNING) &&
1687 	    pi->link_config.link_ok) {
1688 		reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH);
1689 
1690 		if (txq->size - txq->in_use <= TX_MAX_DESC)
1691 			break;
1692 
1693 		if ((m_head = cxgb_dequeue(qs)) == NULL)
1694 			break;
1695 		/*
1696 		 *  Encapsulation can modify our pointer, and/or make it
1697 		 *  NULL on failure.  In that event, we can't requeue.
1698 		 */
1699 		if (t3_encap(qs, &m_head) || m_head == NULL)
1700 			break;
1701 
1702 		m_head = NULL;
1703 	}
1704 
1705 	if (txq->db_pending)
1706 		check_ring_tx_db(pi->adapter, txq, 1);
1707 
1708 	if (!TXQ_RING_EMPTY(qs) && callout_pending(&txq->txq_timer) == 0 &&
1709 	    pi->link_config.link_ok)
1710 		callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout,
1711 		    qs, txq->txq_timer.c_cpu);
1712 	if (m_head != NULL)
1713 		m_freem(m_head);
1714 }
1715 
1716 static int
1717 cxgb_transmit_locked(struct ifnet *ifp, struct sge_qset *qs, struct mbuf *m)
1718 {
1719 	struct port_info *pi = qs->port;
1720 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1721 	struct buf_ring *br = txq->txq_mr;
1722 	int error, avail;
1723 
1724 	avail = txq->size - txq->in_use;
1725 	TXQ_LOCK_ASSERT(qs);
1726 
1727 	/*
1728 	 * We can only do a direct transmit if the following are true:
1729 	 * - we aren't coalescing (ring < 3/4 full)
1730 	 * - the link is up -- checked in caller
1731 	 * - there are no packets enqueued already
1732 	 * - there is space in hardware transmit queue
1733 	 */
1734 	if (check_pkt_coalesce(qs) == 0 &&
1735 	    !TXQ_RING_NEEDS_ENQUEUE(qs) && avail > TX_MAX_DESC) {
1736 		if (t3_encap(qs, &m)) {
1737 			if (m != NULL &&
1738 			    (error = drbr_enqueue(ifp, br, m)) != 0)
1739 				return (error);
1740 		} else {
1741 			if (txq->db_pending)
1742 				check_ring_tx_db(pi->adapter, txq, 1);
1743 
1744 			/*
1745 			 * We've bypassed the buf ring so we need to update
1746 			 * the stats directly
1747 			 */
1748 			txq->txq_direct_packets++;
1749 			txq->txq_direct_bytes += m->m_pkthdr.len;
1750 		}
1751 	} else if ((error = drbr_enqueue(ifp, br, m)) != 0)
1752 		return (error);
1753 
1754 	reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH);
1755 	if (!TXQ_RING_EMPTY(qs) && pi->link_config.link_ok &&
1756 	    (!check_pkt_coalesce(qs) || (drbr_inuse(ifp, br) >= 7)))
1757 		cxgb_start_locked(qs);
1758 	else if (!TXQ_RING_EMPTY(qs) && !callout_pending(&txq->txq_timer))
1759 		callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout,
1760 		    qs, txq->txq_timer.c_cpu);
1761 	return (0);
1762 }
1763 
1764 int
1765 cxgb_transmit(struct ifnet *ifp, struct mbuf *m)
1766 {
1767 	struct sge_qset *qs;
1768 	struct port_info *pi = ifp->if_softc;
1769 	int error, qidx = pi->first_qset;
1770 
1771 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0
1772 	    || !pi->link_config.link_ok) {
1773 		m_freem(m);
1774 		return (0);
1775 	}
1776 
1777 	if (m->m_flags & M_FLOWID)
1778 		qidx = (m->m_pkthdr.flowid % pi->nqsets) + pi->first_qset;
1779 
1780 	qs = &pi->adapter->sge.qs[qidx];
1781 
1782 	if (TXQ_TRYLOCK(qs)) {
1783 		/* XXX running */
1784 		error = cxgb_transmit_locked(ifp, qs, m);
1785 		TXQ_UNLOCK(qs);
1786 	} else
1787 		error = drbr_enqueue(ifp, qs->txq[TXQ_ETH].txq_mr, m);
1788 	return (error);
1789 }
1790 
1791 void
1792 cxgb_qflush(struct ifnet *ifp)
1793 {
1794 	/*
1795 	 * This should flush any mbufs enqueued in the buf_rings
1796 	 * and in the transmit queues;
1797 	 * it is a no-op for now.
1798 	 */
1799 	return;
1800 }
1801 
1802 /**
1803  *	write_imm - write a packet into a Tx descriptor as immediate data
1804  *	@d: the Tx descriptor to write
1805  *	@m: the packet
1806  *	@len: the length of packet data to write as immediate data
1807  *	@gen: the generation bit value to write
1808  *
1809  *	Writes a packet as immediate data into a Tx descriptor.  The packet
1810  *	contains a work request at its beginning.  We must write the packet
1811  *	carefully so that the SGE doesn't accidentally read it before it has
1812  *	been written in its entirety.
1813  */
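/*
 * Worked example of the length encoding used below: for a 40-byte work
 * request, V_WR_LEN((40 + 7) / 8) is 5 flits and V_WR_BCNTLFLT(40 & 7) is 0.
 * The body is copied first and the header/generation bits are written
 * afterwards (set_wr_hdr(), wmb(), wr_gen2()), so the SGE never sees a
 * half-written descriptor.
 */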
1814 static __inline void
1815 write_imm(struct tx_desc *d, struct mbuf *m,
1816 	  unsigned int len, unsigned int gen)
1817 {
1818 	struct work_request_hdr *from = mtod(m, struct work_request_hdr *);
1819 	struct work_request_hdr *to = (struct work_request_hdr *)d;
1820 	uint32_t wr_hi, wr_lo;
1821 
1822 	if (len > WR_LEN)
1823 		panic("len too big %d\n", len);
1824 	if (len < sizeof(*from))
1825 		panic("len too small %d", len);
1826 
1827 	memcpy(&to[1], &from[1], len - sizeof(*from));
1828 	wr_hi = from->wrh_hi | htonl(F_WR_SOP | F_WR_EOP |
1829 					V_WR_BCNTLFLT(len & 7));
1830 	wr_lo = from->wrh_lo | htonl(V_WR_GEN(gen) |
1831 					V_WR_LEN((len + 7) / 8));
1832 	set_wr_hdr(to, wr_hi, wr_lo);
1833 	wmb();
1834 	wr_gen2(d, gen);
1835 
1836 	/*
1837 	 * This check is a hack; we should really fix the logic so
1838 	 * that this can't happen.
1839 	 */
1840 	if (m->m_type != MT_DONTFREE)
1841 		m_freem(m);
1842 
1843 }
1844 
1845 /**
1846  *	check_desc_avail - check descriptor availability on a send queue
1847  *	@adap: the adapter
1848  *	@q: the TX queue
1849  *	@m: the packet needing the descriptors
1850  *	@ndesc: the number of Tx descriptors needed
1851  *	@qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL)
1852  *
1853  *	Checks if the requested number of Tx descriptors is available on an
1854  *	SGE send queue.  If the queue is already suspended or not enough
1855  *	descriptors are available the packet is queued for later transmission.
1856  *	Must be called with the Tx queue locked.
1857  *
1858  *	Returns 0 if enough descriptors are available, 1 if there aren't
1859  *	enough descriptors and the packet has been queued, and 2 if the caller
1860  *	needs to retry because there weren't enough descriptors at the
1861  *	beginning of the call but some freed up in the meantime.
1862  */
1863 static __inline int
1864 check_desc_avail(adapter_t *adap, struct sge_txq *q,
1865 		 struct mbuf *m, unsigned int ndesc,
1866 		 unsigned int qid)
1867 {
1868 	/*
1869 	 * XXX We currently only use this for checking the control queue;
1870 	 * the control queue is only used for binding qsets, which happens
1871 	 * at init time, so we are guaranteed enough descriptors.
1872 	 */
1873 	if (__predict_false(!mbufq_empty(&q->sendq))) {
1874 addq_exit:	mbufq_tail(&q->sendq, m);
1875 		return 1;
1876 	}
1877 	if (__predict_false(q->size - q->in_use < ndesc)) {
1878 
1879 		struct sge_qset *qs = txq_to_qset(q, qid);
1880 
1881 		setbit(&qs->txq_stopped, qid);
1882 		if (should_restart_tx(q) &&
1883 		    test_and_clear_bit(qid, &qs->txq_stopped))
1884 			return 2;
1885 
1886 		q->stops++;
1887 		goto addq_exit;
1888 	}
1889 	return 0;
1890 }
1891 
1892 
1893 /**
1894  *	reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
1895  *	@q: the SGE control Tx queue
1896  *
1897  *	This is a variant of reclaim_completed_tx() that is used for Tx queues
1898  *	that send only immediate data (presently just the control queues) and
1899  *	thus do not have any mbufs.
1900  */
1901 static __inline void
1902 reclaim_completed_tx_imm(struct sge_txq *q)
1903 {
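	/*
	 * q->processed is advanced by the completion credits returned in
	 * response descriptors (see handle_rsp_cntrl_info()); q->cleaned
	 * records how many of those credits have already been accounted for
	 * here, so the difference is what can be reclaimed now.
	 */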
1904 	unsigned int reclaim = q->processed - q->cleaned;
1905 
1906 	q->in_use -= reclaim;
1907 	q->cleaned += reclaim;
1908 }
1909 
1910 static __inline int
1911 immediate(const struct mbuf *m)
1912 {
1913 	return m->m_len <= WR_LEN && m->m_pkthdr.len <= WR_LEN;
1914 }
1915 
1916 /**
1917  *	ctrl_xmit - send a packet through an SGE control Tx queue
1918  *	@adap: the adapter
1919  *	@q: the control queue
1920  *	@m: the packet
1921  *
1922  *	Send a packet through an SGE control Tx queue.  Packets sent through
1923  *	a control queue must fit entirely as immediate data in a single Tx
1924  *	descriptor and have no page fragments.
1925  */
1926 static int
1927 ctrl_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m)
1928 {
1929 	int ret;
1930 	struct work_request_hdr *wrp = mtod(m, struct work_request_hdr *);
1931 	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1932 
1933 	if (__predict_false(!immediate(m))) {
1934 		m_freem(m);
1935 		return 0;
1936 	}
1937 
1938 	wrp->wrh_hi |= htonl(F_WR_SOP | F_WR_EOP);
1939 	wrp->wrh_lo = htonl(V_WR_TID(q->token));
1940 
1941 	TXQ_LOCK(qs);
1942 again:	reclaim_completed_tx_imm(q);
1943 
1944 	ret = check_desc_avail(adap, q, m, 1, TXQ_CTRL);
1945 	if (__predict_false(ret)) {
1946 		if (ret == 1) {
1947 			TXQ_UNLOCK(qs);
1948 			return (ENOSPC);
1949 		}
1950 		goto again;
1951 	}
1952 	write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
1953 
1954 	q->in_use++;
1955 	if (++q->pidx >= q->size) {
1956 		q->pidx = 0;
1957 		q->gen ^= 1;
1958 	}
1959 	TXQ_UNLOCK(qs);
1960 	wmb();
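	/*
	 * Ring the doorbell for this egress context so the SGE picks up the
	 * newly written descriptor.
	 */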
1961 	t3_write_reg(adap, A_SG_KDOORBELL,
1962 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1963 	return (0);
1964 }
1965 
1966 
1967 /**
1968  *	restart_ctrlq - restart a suspended control queue
1969  *	@qs: the queue set containing the control queue
1970  *
1971  *	Resumes transmission on a suspended Tx control queue.
1972  */
1973 static void
1974 restart_ctrlq(void *data, int npending)
1975 {
1976 	struct mbuf *m;
1977 	struct sge_qset *qs = (struct sge_qset *)data;
1978 	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1979 	adapter_t *adap = qs->port->adapter;
1980 
1981 	TXQ_LOCK(qs);
1982 again:	reclaim_completed_tx_imm(q);
1983 
1984 	while (q->in_use < q->size &&
1985 	       (m = mbufq_dequeue(&q->sendq)) != NULL) {
1986 
1987 		write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
1988 
1989 		if (++q->pidx >= q->size) {
1990 			q->pidx = 0;
1991 			q->gen ^= 1;
1992 		}
1993 		q->in_use++;
1994 	}
1995 	if (!mbufq_empty(&q->sendq)) {
1996 		setbit(&qs->txq_stopped, TXQ_CTRL);
1997 
1998 		if (should_restart_tx(q) &&
1999 		    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped))
2000 			goto again;
2001 		q->stops++;
2002 	}
2003 	TXQ_UNLOCK(qs);
2004 	t3_write_reg(adap, A_SG_KDOORBELL,
2005 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
2006 }
2007 
2008 
2009 /*
2010  * Send a management message through control queue 0
2011  */
2012 int
2013 t3_mgmt_tx(struct adapter *adap, struct mbuf *m)
2014 {
2015 	return ctrl_xmit(adap, &adap->sge.qs[0], m);
2016 }
2017 
2018 /**
2019  *	free_qset - free the resources of an SGE queue set
2020  *	@sc: the controller owning the queue set
2021  *	@q: the queue set
2022  *
2023  *	Release the HW and SW resources associated with an SGE queue set, such
2024  *	as HW contexts, packet buffers, and descriptor rings.  Traffic to the
2025  *	queue set must be quiesced prior to calling this.
2026  */
2027 static void
2028 t3_free_qset(adapter_t *sc, struct sge_qset *q)
2029 {
2030 	int i;
2031 
2032 	reclaim_completed_tx(q, 0, TXQ_ETH);
2033 	if (q->txq[TXQ_ETH].txq_mr != NULL)
2034 		buf_ring_free(q->txq[TXQ_ETH].txq_mr, M_DEVBUF);
2035 	if (q->txq[TXQ_ETH].txq_ifq != NULL) {
2036 		ifq_delete(q->txq[TXQ_ETH].txq_ifq);
2037 		free(q->txq[TXQ_ETH].txq_ifq, M_DEVBUF);
2038 	}
2039 
2040 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
2041 		if (q->fl[i].desc) {
2042 			mtx_lock_spin(&sc->sge.reg_lock);
2043 			t3_sge_disable_fl(sc, q->fl[i].cntxt_id);
2044 			mtx_unlock_spin(&sc->sge.reg_lock);
2045 			bus_dmamap_unload(q->fl[i].desc_tag, q->fl[i].desc_map);
2046 			bus_dmamem_free(q->fl[i].desc_tag, q->fl[i].desc,
2047 					q->fl[i].desc_map);
2048 			bus_dma_tag_destroy(q->fl[i].desc_tag);
2049 			bus_dma_tag_destroy(q->fl[i].entry_tag);
2050 		}
2051 		if (q->fl[i].sdesc) {
2052 			free_rx_bufs(sc, &q->fl[i]);
2053 			free(q->fl[i].sdesc, M_DEVBUF);
2054 		}
2055 	}
2056 
2057 	mtx_unlock(&q->lock);
2058 	MTX_DESTROY(&q->lock);
2059 	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
2060 		if (q->txq[i].desc) {
2061 			mtx_lock_spin(&sc->sge.reg_lock);
2062 			t3_sge_enable_ecntxt(sc, q->txq[i].cntxt_id, 0);
2063 			mtx_unlock_spin(&sc->sge.reg_lock);
2064 			bus_dmamap_unload(q->txq[i].desc_tag,
2065 					q->txq[i].desc_map);
2066 			bus_dmamem_free(q->txq[i].desc_tag, q->txq[i].desc,
2067 					q->txq[i].desc_map);
2068 			bus_dma_tag_destroy(q->txq[i].desc_tag);
2069 			bus_dma_tag_destroy(q->txq[i].entry_tag);
2070 		}
2071 		if (q->txq[i].sdesc) {
2072 			free(q->txq[i].sdesc, M_DEVBUF);
2073 		}
2074 	}
2075 
2076 	if (q->rspq.desc) {
2077 		mtx_lock_spin(&sc->sge.reg_lock);
2078 		t3_sge_disable_rspcntxt(sc, q->rspq.cntxt_id);
2079 		mtx_unlock_spin(&sc->sge.reg_lock);
2080 
2081 		bus_dmamap_unload(q->rspq.desc_tag, q->rspq.desc_map);
2082 		bus_dmamem_free(q->rspq.desc_tag, q->rspq.desc,
2083 			        q->rspq.desc_map);
2084 		bus_dma_tag_destroy(q->rspq.desc_tag);
2085 		MTX_DESTROY(&q->rspq.lock);
2086 	}
2087 
2088 #ifdef INET
2089 	tcp_lro_free(&q->lro.ctrl);
2090 #endif
2091 
2092 	bzero(q, sizeof(*q));
2093 }
2094 
2095 /**
2096  *	t3_free_sge_resources - free SGE resources
2097  *	@sc: the adapter softc
2098  *
2099  *	Frees resources used by the SGE queue sets.
2100  */
2101 void
2102 t3_free_sge_resources(adapter_t *sc, int nqsets)
2103 {
2104 	int i;
2105 
2106 	for (i = 0; i < nqsets; ++i) {
2107 		TXQ_LOCK(&sc->sge.qs[i]);
2108 		t3_free_qset(sc, &sc->sge.qs[i]);
2109 	}
2110 }
2111 
2112 /**
2113  *	t3_sge_start - enable SGE
2114  *	@sc: the controller softc
2115  *
2116  *	Enables the SGE for DMAs.  This is the last step in starting packet
2117  *	transfers.
2118  */
2119 void
2120 t3_sge_start(adapter_t *sc)
2121 {
2122 	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE);
2123 }
2124 
2125 /**
2126  *	t3_sge_stop - disable SGE operation
2127  *	@sc: the adapter
2128  *
2129  *	Disables the DMA engine.  This can be called in emergencies (e.g.,
2130  *	from error interrupts) or from normal process context.  In the latter
2131  *	case it also disables any pending queue restart tasklets.  Note that
2132  *	if it is called in interrupt context it cannot disable the restart
2133  *	tasklets, as it cannot wait; however, the tasklets will have no effect
2134  *	since the doorbells are disabled and the driver will call this again
2135  *	later from process context, at which time the tasklets will be stopped
2136  *	if they are still running.
2137  */
2138 void
2139 t3_sge_stop(adapter_t *sc)
2140 {
2141 	int i, nqsets;
2142 
2143 	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, 0);
2144 
2145 	if (sc->tq == NULL)
2146 		return;
2147 
2148 	for (nqsets = i = 0; i < (sc)->params.nports; i++)
2149 		nqsets += sc->port[i].nqsets;
2150 #ifdef notyet
2151 	/*
2152 	 *
2153 	 * XXX
2154 	 */
2155 	for (i = 0; i < nqsets; ++i) {
2156 		struct sge_qset *qs = &sc->sge.qs[i];
2157 
2158 		taskqueue_drain(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2159 		taskqueue_drain(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2160 	}
2161 #endif
2162 }
2163 
2164 /**
2165  *	t3_free_tx_desc - reclaims Tx descriptors and their buffers
2166  *	@qs: the queue set that owns the Tx queue
2167  *	@reclaimable: the number of descriptors to reclaim
2168  *	@queue: index of the Tx queue within the queue set (e.g. TXQ_ETH)
2169  *
2170  *	Reclaims Tx descriptors from an SGE Tx queue and frees the associated
2171  *	Tx buffers.  Called with the Tx queue lock held.
2176  */
2177 void
2178 t3_free_tx_desc(struct sge_qset *qs, int reclaimable, int queue)
2179 {
2180 	struct tx_sw_desc *txsd;
2181 	unsigned int cidx, mask;
2182 	struct sge_txq *q = &qs->txq[queue];
2183 
2184 #ifdef T3_TRACE
2185 	T3_TRACE2(sc->tb[q->cntxt_id & 7],
2186 		  "reclaiming %u Tx descriptors at cidx %u", reclaimable, cidx);
2187 #endif
2188 	cidx = q->cidx;
2189 	mask = q->size - 1;
2190 	txsd = &q->sdesc[cidx];
2191 
2192 	mtx_assert(&qs->lock, MA_OWNED);
2193 	while (reclaimable--) {
2194 		prefetch(q->sdesc[(cidx + 1) & mask].m);
2195 		prefetch(q->sdesc[(cidx + 2) & mask].m);
2196 
2197 		if (txsd->m != NULL) {
2198 			if (txsd->flags & TX_SW_DESC_MAPPED) {
2199 				bus_dmamap_unload(q->entry_tag, txsd->map);
2200 				txsd->flags &= ~TX_SW_DESC_MAPPED;
2201 			}
2202 			m_freem_list(txsd->m);
2203 			txsd->m = NULL;
2204 		} else
2205 			q->txq_skipped++;
2206 
2207 		++txsd;
2208 		if (++cidx == q->size) {
2209 			cidx = 0;
2210 			txsd = q->sdesc;
2211 		}
2212 	}
2213 	q->cidx = cidx;
2214 
2215 }
2216 
2217 /**
2218  *	is_new_response - check if a response is newly written
2219  *	@r: the response descriptor
2220  *	@q: the response queue
2221  *
2222  *	Returns true if a response descriptor contains an as-yet unprocessed
2223  *	response.
2224  */
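/*
 * This is the usual generation-bit scheme: the hardware toggles F_RSPD_GEN2
 * in the descriptors it writes each time it wraps the ring, and software
 * toggles q->gen when its cidx wraps (see process_responses()), so a matching
 * bit identifies a descriptor written in the current pass around the ring.
 */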
2225 static __inline int
2226 is_new_response(const struct rsp_desc *r,
2227     const struct sge_rspq *q)
2228 {
2229 	return (r->intr_gen & F_RSPD_GEN2) == q->gen;
2230 }
2231 
2232 #define RSPD_GTS_MASK  (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS)
2233 #define RSPD_CTRL_MASK (RSPD_GTS_MASK | \
2234 			V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \
2235 			V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \
2236 			V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR))
2237 
2238 /* How long to delay the next interrupt in case of memory shortage, in 0.1us. */
2239 #define NOMEM_INTR_DELAY 2500
2240 
2241 /**
2242  *	write_ofld_wr - write an offload work request
2243  *	@adap: the adapter
2244  *	@m: the packet to send
2245  *	@q: the Tx queue
2246  *	@pidx: index of the first Tx descriptor to write
2247  *	@gen: the generation value to use
2248  *	@ndesc: number of descriptors the packet will occupy
2249  *
2250  *	Write an offload work request to send the supplied packet.  The packet
2251  *	data already carry the work request with most fields populated.
2252  */
2253 static void
2254 write_ofld_wr(adapter_t *adap, struct mbuf *m,
2255     struct sge_txq *q, unsigned int pidx,
2256     unsigned int gen, unsigned int ndesc,
2257     bus_dma_segment_t *segs, unsigned int nsegs)
2258 {
2259 	unsigned int sgl_flits, flits;
2260 	struct work_request_hdr *from;
2261 	struct sg_ent *sgp, sgl[TX_MAX_SEGS / 2 + 1];
2262 	struct tx_desc *d = &q->desc[pidx];
2263 	struct txq_state txqs;
2264 
2265 	if (immediate(m) && nsegs == 0) {
2266 		write_imm(d, m, m->m_len, gen);
2267 		return;
2268 	}
2269 
2270 	/* Only TX_DATA builds SGLs */
2271 	from = mtod(m, struct work_request_hdr *);
2272 	memcpy(&d->flit[1], &from[1], m->m_len - sizeof(*from));
2273 
2274 	flits = m->m_len / 8;
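	/*
	 * If the request fits in a single descriptor, build the SGL in place
	 * in the descriptor after the header flits; otherwise build it in the
	 * local sgl[] buffer and let write_wr_hdr_sgl() spread it across
	 * descriptors.
	 */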
2275 	sgp = (ndesc == 1) ? (struct sg_ent *)&d->flit[flits] : sgl;
2276 
2277 	make_sgl(sgp, segs, nsegs);
2278 	sgl_flits = sgl_len(nsegs);
2279 
2280 	txqs.gen = gen;
2281 	txqs.pidx = pidx;
2282 	txqs.compl = 0;
2283 
2284 	write_wr_hdr_sgl(ndesc, d, &txqs, q, sgl, flits, sgl_flits,
2285 	    from->wrh_hi, from->wrh_lo);
2286 }
2287 
2288 /**
2289  *	calc_tx_descs_ofld - calculate # of Tx descriptors for an offload packet
2290  *	@m: the packet
2291  *
2292  * 	Returns the number of Tx descriptors needed for the given offload
2293  * 	packet.  These packets are already fully constructed.
2294  */
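/*
 * Rough example, assuming the usual T3 packing of two SGL entries per three
 * flits: a request with a 64-byte header contributes 64 / 8 = 8 flits, three
 * gather entries contribute sgl_len(3) flits, and flits_to_desc() rounds the
 * total up to whole Tx descriptors.
 */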
2295 static __inline unsigned int
2296 calc_tx_descs_ofld(struct mbuf *m, unsigned int nsegs)
2297 {
2298 	unsigned int flits, cnt = 0;
2299 	int ndescs;
2300 
2301 	if (m->m_len <= WR_LEN && nsegs == 0)
2302 		return (1);                 /* packet fits as immediate data */
2303 
2304 	/*
2305 	 * This needs to be re-visited for TOE
2306 	 */
2307 
2308 	cnt = nsegs;
2309 
2310 	/* headers */
2311 	flits = m->m_len / 8;
2312 
2313 	ndescs = flits_to_desc(flits + sgl_len(cnt));
2314 
2315 	return (ndescs);
2316 }
2317 
2318 /**
2319  *	ofld_xmit - send a packet through an offload queue
2320  *	@adap: the adapter
2321  *	@q: the Tx offload queue
2322  *	@m: the packet
2323  *
2324  *	Send an offload packet through an SGE offload queue.
2325  */
2326 static int
2327 ofld_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m)
2328 {
2329 	int ret, nsegs;
2330 	unsigned int ndesc;
2331 	unsigned int pidx, gen;
2332 	struct sge_txq *q = &qs->txq[TXQ_OFLD];
2333 	bus_dma_segment_t segs[TX_MAX_SEGS], *vsegs;
2334 	struct tx_sw_desc *stx;
2335 
2336 	nsegs = m_get_sgllen(m);
2337 	vsegs = m_get_sgl(m);
2338 	ndesc = calc_tx_descs_ofld(m, nsegs);
2339 	busdma_map_sgl(vsegs, segs, nsegs);
2340 
2341 	stx = &q->sdesc[q->pidx];
2342 
2343 	TXQ_LOCK(qs);
2344 again:	reclaim_completed_tx(qs, 16, TXQ_OFLD);
2345 	ret = check_desc_avail(adap, q, m, ndesc, TXQ_OFLD);
2346 	if (__predict_false(ret)) {
2347 		if (ret == 1) {
2348 			printf("no ofld desc avail\n");
2349 
2350 			m_set_priority(m, ndesc);     /* save for restart */
2351 			TXQ_UNLOCK(qs);
2352 			return (EINTR);
2353 		}
2354 		goto again;
2355 	}
2356 
2357 	gen = q->gen;
2358 	q->in_use += ndesc;
2359 	pidx = q->pidx;
2360 	q->pidx += ndesc;
2361 	if (q->pidx >= q->size) {
2362 		q->pidx -= q->size;
2363 		q->gen ^= 1;
2364 	}
2365 #ifdef T3_TRACE
2366 	T3_TRACE5(adap->tb[q->cntxt_id & 7],
2367 		  "ofld_xmit: ndesc %u, pidx %u, len %u, main %u, frags %u",
2368 		  ndesc, pidx, skb->len, skb->len - skb->data_len,
2369 		  skb_shinfo(skb)->nr_frags);
2370 #endif
2371 	TXQ_UNLOCK(qs);
2372 
2373 	write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
2374 	check_ring_tx_db(adap, q, 1);
2375 	return (0);
2376 }
2377 
2378 /**
2379  *	restart_offloadq - restart a suspended offload queue
2380  *	@qs: the queue set containing the offload queue
2381  *
2382  *	Resumes transmission on a suspended Tx offload queue.
2383  */
2384 static void
2385 restart_offloadq(void *data, int npending)
2386 {
2387 	struct mbuf *m;
2388 	struct sge_qset *qs = data;
2389 	struct sge_txq *q = &qs->txq[TXQ_OFLD];
2390 	adapter_t *adap = qs->port->adapter;
2391 	bus_dma_segment_t segs[TX_MAX_SEGS];
2392 	struct tx_sw_desc *stx = &q->sdesc[q->pidx];
2393 	int nsegs, cleaned;
2394 
2395 	TXQ_LOCK(qs);
2396 again:	cleaned = reclaim_completed_tx(qs, 16, TXQ_OFLD);
2397 
2398 	while ((m = mbufq_peek(&q->sendq)) != NULL) {
2399 		unsigned int gen, pidx;
2400 		unsigned int ndesc = m_get_priority(m);
2401 
2402 		if (__predict_false(q->size - q->in_use < ndesc)) {
2403 			setbit(&qs->txq_stopped, TXQ_OFLD);
2404 			if (should_restart_tx(q) &&
2405 			    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped))
2406 				goto again;
2407 			q->stops++;
2408 			break;
2409 		}
2410 
2411 		gen = q->gen;
2412 		q->in_use += ndesc;
2413 		pidx = q->pidx;
2414 		q->pidx += ndesc;
2415 		if (q->pidx >= q->size) {
2416 			q->pidx -= q->size;
2417 			q->gen ^= 1;
2418 		}
2419 
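		/*
		 * The descriptors were reserved above while holding the queue
		 * lock, so the work request itself can be written with the
		 * lock dropped.
		 */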
2420 		(void)mbufq_dequeue(&q->sendq);
2421 		busdma_map_mbufs(&m, q, stx, segs, &nsegs);
2422 		TXQ_UNLOCK(qs);
2423 		write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
2424 		TXQ_LOCK(qs);
2425 	}
2426 #if USE_GTS
2427 	set_bit(TXQ_RUNNING, &q->flags);
2428 	set_bit(TXQ_LAST_PKT_DB, &q->flags);
2429 #endif
2430 	TXQ_UNLOCK(qs);
2431 	wmb();
2432 	t3_write_reg(adap, A_SG_KDOORBELL,
2433 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
2434 }
2435 
2436 /**
2437  *	queue_set - return the queue set a packet should use
2438  *	@m: the packet
2439  *
2440  *	Maps a packet to the SGE queue set it should use.  The desired queue
2441  *	set is carried in bits 1-3 in the packet's priority.
2442  */
2443 static __inline int
2444 queue_set(const struct mbuf *m)
2445 {
2446 	return m_get_priority(m) >> 1;
2447 }
2448 
2449 /**
2450  *	is_ctrl_pkt - return whether an offload packet is a control packet
2451  *	@m: the packet
2452  *
2453  *	Determines whether an offload packet should use an OFLD or a CTRL
2454  *	Tx queue.  This is indicated by bit 0 in the packet's priority.
2455  */
2456 static __inline int
2457 is_ctrl_pkt(const struct mbuf *m)
2458 {
2459 	return m_get_priority(m) & 1;
2460 }
2461 
2462 /**
2463  *	t3_offload_tx - send an offload packet
2464  *	@tdev: the offload device to send to
2465  *	@m: the packet
2466  *
2467  *	Sends an offload packet.  We use the packet priority to select the
2468  *	appropriate Tx queue as follows: bit 0 indicates whether the packet
2469  *	should be sent as regular or control, bits 1-3 select the queue set.
2470  */
2471 int
2472 t3_offload_tx(struct t3cdev *tdev, struct mbuf *m)
2473 {
2474 	adapter_t *adap = tdev2adap(tdev);
2475 	struct sge_qset *qs = &adap->sge.qs[queue_set(m)];
2476 
2477 	if (__predict_false(is_ctrl_pkt(m)))
2478 		return ctrl_xmit(adap, qs, m);
2479 
2480 	return ofld_xmit(adap, qs, m);
2481 }
2482 
2483 /**
2484  *	deliver_partial_bundle - deliver a (partial) bundle of Rx offload pkts
2485  *	@tdev: the offload device that will be receiving the packets
2486  *	@q: the SGE response queue that assembled the bundle
2487  *	@m: the partial bundle
2488  *	@n: the number of packets in the bundle
2489  *
2490  *	Delivers a (partial) bundle of Rx offload packets to an offload device.
2491  */
2492 static __inline void
2493 deliver_partial_bundle(struct t3cdev *tdev,
2494 			struct sge_rspq *q,
2495 			struct mbuf *mbufs[], int n)
2496 {
2497 	if (n) {
2498 		q->offload_bundles++;
2499 		cxgb_ofld_recv(tdev, mbufs, n);
2500 	}
2501 }
2502 
2503 static __inline int
2504 rx_offload(struct t3cdev *tdev, struct sge_rspq *rq,
2505     struct mbuf *m, struct mbuf *rx_gather[],
2506     unsigned int gather_idx)
2507 {
2508 
2509 	rq->offload_pkts++;
2510 	m->m_pkthdr.header = mtod(m, void *);
2511 	rx_gather[gather_idx++] = m;
2512 	if (gather_idx == RX_BUNDLE_SIZE) {
2513 		cxgb_ofld_recv(tdev, rx_gather, RX_BUNDLE_SIZE);
2514 		gather_idx = 0;
2515 		rq->offload_bundles++;
2516 	}
2517 	return (gather_idx);
2518 }
2519 
2520 static void
2521 restart_tx(struct sge_qset *qs)
2522 {
2523 	struct adapter *sc = qs->port->adapter;
2524 
2525 
2526 	if (isset(&qs->txq_stopped, TXQ_OFLD) &&
2527 	    should_restart_tx(&qs->txq[TXQ_OFLD]) &&
2528 	    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) {
2529 		qs->txq[TXQ_OFLD].restarts++;
2530 		DPRINTF("restarting TXQ_OFLD\n");
2531 		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2532 	}
2533 	DPRINTF("stopped=0x%x restart=%d processed=%d cleaned=%d in_use=%d\n",
2534 	    qs->txq_stopped, should_restart_tx(&qs->txq[TXQ_CTRL]),
2535 	    qs->txq[TXQ_CTRL].processed, qs->txq[TXQ_CTRL].cleaned,
2536 	    qs->txq[TXQ_CTRL].in_use);
2537 
2538 	if (isset(&qs->txq_stopped, TXQ_CTRL) &&
2539 	    should_restart_tx(&qs->txq[TXQ_CTRL]) &&
2540 	    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) {
2541 		qs->txq[TXQ_CTRL].restarts++;
2542 		DPRINTF("restarting TXQ_CTRL\n");
2543 		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2544 	}
2545 }
2546 
2547 /**
2548  *	t3_sge_alloc_qset - initialize an SGE queue set
2549  *	@sc: the controller softc
2550  *	@id: the queue set id
2551  *	@nports: how many Ethernet ports will be using this queue set
2552  *	@irq_vec_idx: the IRQ vector index for response queue interrupts
2553  *	@p: configuration parameters for this queue set
2554  *	@ntxq: number of Tx queues for the queue set
2555  *	@pi: port info for queue set
2556  *
2557  *	Allocate resources and initialize an SGE queue set.  A queue set
2558  *	comprises a response queue, two Rx free-buffer queues, and up to 3
2559  *	Tx queues.  The Tx queues are assigned roles in the order Ethernet
2560  *	queue, offload queue, and control queue.
2561  */
2562 int
2563 t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx,
2564 		  const struct qset_params *p, int ntxq, struct port_info *pi)
2565 {
2566 	struct sge_qset *q = &sc->sge.qs[id];
2567 	int i, ret = 0;
2568 
2569 	MTX_INIT(&q->lock, q->namebuf, NULL, MTX_DEF);
2570 	q->port = pi;
2571 
2572 	if ((q->txq[TXQ_ETH].txq_mr = buf_ring_alloc(cxgb_txq_buf_ring_size,
2573 	    M_DEVBUF, M_WAITOK, &q->lock)) == NULL) {
2574 		device_printf(sc->dev, "failed to allocate mbuf ring\n");
2575 		goto err;
2576 	}
2577 	if ((q->txq[TXQ_ETH].txq_ifq = malloc(sizeof(struct ifaltq), M_DEVBUF,
2578 	    M_NOWAIT | M_ZERO)) == NULL) {
2579 		device_printf(sc->dev, "failed to allocate ifq\n");
2580 		goto err;
2581 	}
2582 	ifq_init(q->txq[TXQ_ETH].txq_ifq, pi->ifp);
2583 	callout_init(&q->txq[TXQ_ETH].txq_timer, 1);
2584 	callout_init(&q->txq[TXQ_ETH].txq_watchdog, 1);
2585 	q->txq[TXQ_ETH].txq_timer.c_cpu = id % mp_ncpus;
2586 	q->txq[TXQ_ETH].txq_watchdog.c_cpu = id % mp_ncpus;
2587 
2588 	init_qset_cntxt(q, id);
2589 	q->idx = id;
2590 	if ((ret = alloc_ring(sc, p->fl_size, sizeof(struct rx_desc),
2591 		    sizeof(struct rx_sw_desc), &q->fl[0].phys_addr,
2592 		    &q->fl[0].desc, &q->fl[0].sdesc,
2593 		    &q->fl[0].desc_tag, &q->fl[0].desc_map,
2594 		    sc->rx_dmat, &q->fl[0].entry_tag)) != 0) {
2595 		printf("error %d from alloc ring fl0\n", ret);
2596 		goto err;
2597 	}
2598 
2599 	if ((ret = alloc_ring(sc, p->jumbo_size, sizeof(struct rx_desc),
2600 		    sizeof(struct rx_sw_desc), &q->fl[1].phys_addr,
2601 		    &q->fl[1].desc, &q->fl[1].sdesc,
2602 		    &q->fl[1].desc_tag, &q->fl[1].desc_map,
2603 		    sc->rx_jumbo_dmat, &q->fl[1].entry_tag)) != 0) {
2604 		printf("error %d from alloc ring fl1\n", ret);
2605 		goto err;
2606 	}
2607 
2608 	if ((ret = alloc_ring(sc, p->rspq_size, sizeof(struct rsp_desc), 0,
2609 		    &q->rspq.phys_addr, &q->rspq.desc, NULL,
2610 		    &q->rspq.desc_tag, &q->rspq.desc_map,
2611 		    NULL, NULL)) != 0) {
2612 		printf("error %d from alloc ring rspq\n", ret);
2613 		goto err;
2614 	}
2615 
2616 	for (i = 0; i < ntxq; ++i) {
2617 		size_t sz = i == TXQ_CTRL ? 0 : sizeof(struct tx_sw_desc);
2618 
2619 		if ((ret = alloc_ring(sc, p->txq_size[i],
2620 			    sizeof(struct tx_desc), sz,
2621 			    &q->txq[i].phys_addr, &q->txq[i].desc,
2622 			    &q->txq[i].sdesc, &q->txq[i].desc_tag,
2623 			    &q->txq[i].desc_map,
2624 			    sc->tx_dmat, &q->txq[i].entry_tag)) != 0) {
2625 			printf("error %d from alloc ring tx %i\n", ret, i);
2626 			goto err;
2627 		}
2628 		mbufq_init(&q->txq[i].sendq);
2629 		q->txq[i].gen = 1;
2630 		q->txq[i].size = p->txq_size[i];
2631 	}
2632 
2633 	TASK_INIT(&q->txq[TXQ_OFLD].qresume_task, 0, restart_offloadq, q);
2634 	TASK_INIT(&q->txq[TXQ_CTRL].qresume_task, 0, restart_ctrlq, q);
2635 	TASK_INIT(&q->txq[TXQ_ETH].qreclaim_task, 0, sge_txq_reclaim_handler, q);
2636 	TASK_INIT(&q->txq[TXQ_OFLD].qreclaim_task, 0, sge_txq_reclaim_handler, q);
2637 
2638 	q->fl[0].gen = q->fl[1].gen = 1;
2639 	q->fl[0].size = p->fl_size;
2640 	q->fl[1].size = p->jumbo_size;
2641 
2642 	q->rspq.gen = 1;
2643 	q->rspq.cidx = 0;
2644 	q->rspq.size = p->rspq_size;
2645 
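	/*
	 * Stop the Ethernet Tx queue roughly when fewer descriptors remain
	 * than the worst-case need of one maximally fragmented packet per
	 * port.
	 */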
2646 	q->txq[TXQ_ETH].stop_thres = nports *
2647 	    flits_to_desc(sgl_len(TX_MAX_SEGS + 1) + 3);
2648 
2649 	q->fl[0].buf_size = MCLBYTES;
2650 	q->fl[0].zone = zone_pack;
2651 	q->fl[0].type = EXT_PACKET;
2652 
2653 	if (p->jumbo_buf_size ==  MJUM16BYTES) {
2654 		q->fl[1].zone = zone_jumbo16;
2655 		q->fl[1].type = EXT_JUMBO16;
2656 	} else if (p->jumbo_buf_size ==  MJUM9BYTES) {
2657 		q->fl[1].zone = zone_jumbo9;
2658 		q->fl[1].type = EXT_JUMBO9;
2659 	} else if (p->jumbo_buf_size ==  MJUMPAGESIZE) {
2660 		q->fl[1].zone = zone_jumbop;
2661 		q->fl[1].type = EXT_JUMBOP;
2662 	} else {
2663 		KASSERT(0, ("can't deal with jumbo_buf_size %d.", p->jumbo_buf_size));
2664 		ret = EDOOFUS;
2665 		goto err;
2666 	}
2667 	q->fl[1].buf_size = p->jumbo_buf_size;
2668 
2669 	/* Allocate and setup the lro_ctrl structure */
2670 	q->lro.enabled = !!(pi->ifp->if_capenable & IFCAP_LRO);
2671 #ifdef INET
2672 	ret = tcp_lro_init(&q->lro.ctrl);
2673 	if (ret) {
2674 		printf("error %d from tcp_lro_init\n", ret);
2675 		goto err;
2676 	}
2677 #endif
2678 	q->lro.ctrl.ifp = pi->ifp;
2679 
2680 	mtx_lock_spin(&sc->sge.reg_lock);
2681 	ret = -t3_sge_init_rspcntxt(sc, q->rspq.cntxt_id, irq_vec_idx,
2682 				   q->rspq.phys_addr, q->rspq.size,
2683 				   q->fl[0].buf_size, 1, 0);
2684 	if (ret) {
2685 		printf("error %d from t3_sge_init_rspcntxt\n", ret);
2686 		goto err_unlock;
2687 	}
2688 
2689 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
2690 		ret = -t3_sge_init_flcntxt(sc, q->fl[i].cntxt_id, 0,
2691 					  q->fl[i].phys_addr, q->fl[i].size,
2692 					  q->fl[i].buf_size, p->cong_thres, 1,
2693 					  0);
2694 		if (ret) {
2695 			printf("error %d from t3_sge_init_flcntxt for index i=%d\n", ret, i);
2696 			goto err_unlock;
2697 		}
2698 	}
2699 
2700 	ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_ETH].cntxt_id, USE_GTS,
2701 				 SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr,
2702 				 q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token,
2703 				 1, 0);
2704 	if (ret) {
2705 		printf("error %d from t3_sge_init_ecntxt\n", ret);
2706 		goto err_unlock;
2707 	}
2708 
2709 	if (ntxq > 1) {
2710 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_OFLD].cntxt_id,
2711 					 USE_GTS, SGE_CNTXT_OFLD, id,
2712 					 q->txq[TXQ_OFLD].phys_addr,
2713 					 q->txq[TXQ_OFLD].size, 0, 1, 0);
2714 		if (ret) {
2715 			printf("error %d from t3_sge_init_ecntxt\n", ret);
2716 			goto err_unlock;
2717 		}
2718 	}
2719 
2720 	if (ntxq > 2) {
2721 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_CTRL].cntxt_id, 0,
2722 					 SGE_CNTXT_CTRL, id,
2723 					 q->txq[TXQ_CTRL].phys_addr,
2724 					 q->txq[TXQ_CTRL].size,
2725 					 q->txq[TXQ_CTRL].token, 1, 0);
2726 		if (ret) {
2727 			printf("error %d from t3_sge_init_ecntxt\n", ret);
2728 			goto err_unlock;
2729 		}
2730 	}
2731 
2732 	snprintf(q->rspq.lockbuf, RSPQ_NAME_LEN, "t3 rspq lock %d:%d",
2733 	    device_get_unit(sc->dev), irq_vec_idx);
2734 	MTX_INIT(&q->rspq.lock, q->rspq.lockbuf, NULL, MTX_DEF);
2735 
2736 	mtx_unlock_spin(&sc->sge.reg_lock);
2737 	t3_update_qset_coalesce(q, p);
2738 	q->port = pi;
2739 
2740 	refill_fl(sc, &q->fl[0], q->fl[0].size);
2741 	refill_fl(sc, &q->fl[1], q->fl[1].size);
2742 	refill_rspq(sc, &q->rspq, q->rspq.size - 1);
2743 
2744 	t3_write_reg(sc, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) |
2745 		     V_NEWTIMER(q->rspq.holdoff_tmr));
2746 
2747 	return (0);
2748 
2749 err_unlock:
2750 	mtx_unlock_spin(&sc->sge.reg_lock);
2751 err:
2752 	TXQ_LOCK(q);
2753 	t3_free_qset(sc, q);
2754 
2755 	return (ret);
2756 }
2757 
2758 /*
2759  * Remove CPL_RX_PKT headers from the mbuf and reduce it to a regular mbuf with
2760  * ethernet data.  Hardware assistance with various checksums and any vlan tag
2761  * will also be taken into account here.
2762  */
2763 void
2764 t3_rx_eth(struct adapter *adap, struct sge_rspq *rq, struct mbuf *m, int ethpad)
2765 {
2766 	struct cpl_rx_pkt *cpl = (struct cpl_rx_pkt *)(mtod(m, uint8_t *) + ethpad);
2767 	struct port_info *pi = &adap->port[adap->rxpkt_map[cpl->iff]];
2768 	struct ifnet *ifp = pi->ifp;
2769 
2770 	DPRINTF("rx_eth m=%p m->m_data=%p p->iff=%d\n", m, mtod(m, uint8_t *), cpl->iff);
2771 
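	/*
	 * csum_valid with a returned checksum of 0xffff is taken to mean the
	 * hardware verified both the IP header and the TCP/UDP checksums, so
	 * the mbuf is marked fully checksum-valid and the stack skips its own
	 * verification.
	 */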
2772 	if ((ifp->if_capenable & IFCAP_RXCSUM) && !cpl->fragment &&
2773 	    cpl->csum_valid && cpl->csum == 0xffff) {
2774 		m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED|CSUM_IP_VALID);
2775 		rspq_to_qset(rq)->port_stats[SGE_PSTAT_RX_CSUM_GOOD]++;
2776 		m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED|CSUM_IP_VALID|CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
2777 		m->m_pkthdr.csum_data = 0xffff;
2778 	}
2779 
2780 	if (cpl->vlan_valid) {
2781 		m->m_pkthdr.ether_vtag = ntohs(cpl->vlan);
2782 		m->m_flags |= M_VLANTAG;
2783 	}
2784 
2785 	m->m_pkthdr.rcvif = ifp;
2786 	m->m_pkthdr.header = mtod(m, uint8_t *) + sizeof(*cpl) + ethpad;
2787 	/*
2788 	 * adjust after conversion to mbuf chain
2789 	 */
2790 	m->m_pkthdr.len -= (sizeof(*cpl) + ethpad);
2791 	m->m_len -= (sizeof(*cpl) + ethpad);
2792 	m->m_data += (sizeof(*cpl) + ethpad);
2793 }
2794 
2795 /**
2796  *	get_packet - return the next ingress packet buffer from a free list
2797  *	@adap: the adapter that received the packet
2798  *	@drop_thres: # of remaining buffers before we start dropping packets
2799  *	@qs: the qset that the SGE free list holding the packet belongs to
2800  *      @mh: the mbuf header, contains a pointer to the head and tail of the mbuf chain
2801  *      @r: response descriptor
2802  *
2803  *	Get the next packet from a free list and complete setup of the
2804  *	sk_buff.  If the packet is small we make a copy and recycle the
2805  *	mbuf.  If the packet is small we make a copy and recycle the
2806  *	original buffer, otherwise we use the original buffer itself.  If a
2807  *	positive drop threshold is supplied, packets are dropped and their
2808  *	threshold and the packet is too big to copy, or (b) the packet should
2809  *	be copied but there is no memory for the copy.
2810  */
2811 static int
2812 get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
2813     struct t3_mbuf_hdr *mh, struct rsp_desc *r)
2814 {
2815 
2816 	unsigned int len_cq =  ntohl(r->len_cq);
2817 	struct sge_fl *fl = (len_cq & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
2818 	int mask, cidx = fl->cidx;
2819 	struct rx_sw_desc *sd = &fl->sdesc[cidx];
2820 	uint32_t len = G_RSPD_LEN(len_cq);
2821 	uint32_t flags = M_EXT;
2822 	uint8_t sopeop = G_RSPD_SOP_EOP(ntohl(r->flags));
2823 	caddr_t cl;
2824 	struct mbuf *m;
2825 	int ret = 0;
2826 
2827 	mask = fl->size - 1;
2828 	prefetch(fl->sdesc[(cidx + 1) & mask].m);
2829 	prefetch(fl->sdesc[(cidx + 2) & mask].m);
2830 	prefetch(fl->sdesc[(cidx + 1) & mask].rxsd_cl);
2831 	prefetch(fl->sdesc[(cidx + 2) & mask].rxsd_cl);
2832 
2833 	fl->credits--;
2834 	bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD);
2835 
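	/*
	 * Small single-descriptor packets are copied into a fresh mbuf so the
	 * cluster can be recycled into the free list immediately; larger or
	 * multi-descriptor packets take over the cluster itself.
	 */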
2836 	if (recycle_enable && len <= SGE_RX_COPY_THRES &&
2837 	    sopeop == RSPQ_SOP_EOP) {
2838 		if ((m = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL)
2839 			goto skip_recycle;
2840 		cl = mtod(m, void *);
2841 		memcpy(cl, sd->rxsd_cl, len);
2842 		recycle_rx_buf(adap, fl, fl->cidx);
2843 		m->m_pkthdr.len = m->m_len = len;
2844 		m->m_flags = 0;
2845 		mh->mh_head = mh->mh_tail = m;
2846 		ret = 1;
2847 		goto done;
2848 	} else {
2849 	skip_recycle:
2850 		bus_dmamap_unload(fl->entry_tag, sd->map);
2851 		cl = sd->rxsd_cl;
2852 		m = sd->m;
2853 
2854 		if ((sopeop == RSPQ_SOP_EOP) ||
2855 		    (sopeop == RSPQ_SOP))
2856 			flags |= M_PKTHDR;
2857 		m_init(m, fl->zone, fl->buf_size, M_NOWAIT, MT_DATA, flags);
2858 		if (fl->zone == zone_pack) {
2859 			/*
2860 			 * restore clobbered data pointer
2861 			 */
2862 			m->m_data = m->m_ext.ext_buf;
2863 		} else {
2864 			m_cljset(m, cl, fl->type);
2865 		}
2866 		m->m_len = len;
2867 	}
2868 	switch(sopeop) {
2869 	case RSPQ_SOP_EOP:
2870 		ret = 1;
2871 		/* FALLTHROUGH */
2872 	case RSPQ_SOP:
2873 		mh->mh_head = mh->mh_tail = m;
2874 		m->m_pkthdr.len = len;
2875 		break;
2876 	case RSPQ_EOP:
2877 		ret = 1;
2878 		/* FALLTHROUGH */
2879 	case RSPQ_NSOP_NEOP:
2880 		if (mh->mh_tail == NULL) {
2881 			log(LOG_ERR, "discarding intermediate descriptor entry\n");
2882 			m_freem(m);
2883 			break;
2884 		}
2885 		mh->mh_tail->m_next = m;
2886 		mh->mh_tail = m;
2887 		mh->mh_head->m_pkthdr.len += len;
2888 		break;
2889 	}
2890 	if (cxgb_debug)
2891 		printf("len=%d pktlen=%d\n", m->m_len, m->m_pkthdr.len);
2892 done:
2893 	if (++fl->cidx == fl->size)
2894 		fl->cidx = 0;
2895 
2896 	return (ret);
2897 }
2898 
2899 /**
2900  *	handle_rsp_cntrl_info - handles control information in a response
2901  *	@qs: the queue set corresponding to the response
2902  *	@flags: the response control flags
2903  *
2904  *	Handles the control information of an SGE response, such as GTS
2905  *	indications and completion credits for the queue set's Tx queues.
2906  *	HW coalesces credits, we don't do any extra SW coalescing.
2907  */
2908 static __inline void
2909 handle_rsp_cntrl_info(struct sge_qset *qs, uint32_t flags)
2910 {
2911 	unsigned int credits;
2912 
2913 #if USE_GTS
2914 	if (flags & F_RSPD_TXQ0_GTS)
2915 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags);
2916 #endif
2917 	credits = G_RSPD_TXQ0_CR(flags);
2918 	if (credits)
2919 		qs->txq[TXQ_ETH].processed += credits;
2920 
2921 	credits = G_RSPD_TXQ2_CR(flags);
2922 	if (credits)
2923 		qs->txq[TXQ_CTRL].processed += credits;
2924 
2925 # if USE_GTS
2926 	if (flags & F_RSPD_TXQ1_GTS)
2927 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags);
2928 # endif
2929 	credits = G_RSPD_TXQ1_CR(flags);
2930 	if (credits)
2931 		qs->txq[TXQ_OFLD].processed += credits;
2932 
2933 }
2934 
2935 static void
2936 check_ring_db(adapter_t *adap, struct sge_qset *qs,
2937     unsigned int sleeping)
2938 {
2939 	;
2940 }
2941 
2942 /**
2943  *	process_responses - process responses from an SGE response queue
2944  *	@adap: the adapter
2945  *	@qs: the queue set to which the response queue belongs
2946  *	@budget: how many responses can be processed in this round
2947  *
2948  *	Process responses from an SGE response queue up to the supplied budget.
2949  *	Responses include received packets as well as credits and other events
2950  *	for the queues that belong to the response queue's queue set.
2951  *	A negative budget is effectively unlimited.
2952  *
2953  *	Additionally choose the interrupt holdoff time for the next interrupt
2954  *	on this queue.  If the system is under memory shortage, use a fairly
2955  *	long delay to help recovery.
2956  */
2957 static int
2958 process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
2959 {
2960 	struct sge_rspq *rspq = &qs->rspq;
2961 	struct rsp_desc *r = &rspq->desc[rspq->cidx];
2962 	int budget_left = budget;
2963 	unsigned int sleeping = 0;
2964 	int lro_enabled = qs->lro.enabled;
2965 	int skip_lro;
2966 	struct lro_ctrl *lro_ctrl = &qs->lro.ctrl;
2967 	struct mbuf *offload_mbufs[RX_BUNDLE_SIZE];
2968 	int ngathered = 0;
2969 	struct t3_mbuf_hdr *mh = &rspq->rspq_mh;
2970 #ifdef DEBUG
2971 	static int last_holdoff = 0;
2972 	if (cxgb_debug && rspq->holdoff_tmr != last_holdoff) {
2973 		printf("next_holdoff=%d\n", rspq->holdoff_tmr);
2974 		last_holdoff = rspq->holdoff_tmr;
2975 	}
2976 #endif
2977 	rspq->next_holdoff = rspq->holdoff_tmr;
2978 
2979 	while (__predict_true(budget_left && is_new_response(r, rspq))) {
2980 		int eth, eop = 0, ethpad = 0;
2981 		uint32_t flags = ntohl(r->flags);
2982 		uint32_t rss_csum = *(const uint32_t *)r;
2983 		uint32_t rss_hash = be32toh(r->rss_hdr.rss_hash_val);
2984 
2985 		eth = (r->rss_hdr.opcode == CPL_RX_PKT);
2986 
2987 		if (__predict_false(flags & F_RSPD_ASYNC_NOTIF)) {
2988 			struct mbuf *m;
2989 
2990 			if (cxgb_debug)
2991 				printf("async notification\n");
2992 
2993 			if (mh->mh_head == NULL) {
2994 				mh->mh_head = m_gethdr(M_DONTWAIT, MT_DATA);
2995 				m = mh->mh_head;
2996 			} else {
2997 				m = m_gethdr(M_DONTWAIT, MT_DATA);
2998 			}
2999 			if (m == NULL)
3000 				goto no_mem;
3001 
3002 			memcpy(mtod(m, char *), r, AN_PKT_SIZE);
3003 			m->m_len = m->m_pkthdr.len = AN_PKT_SIZE;
3004 			*mtod(m, char *) = CPL_ASYNC_NOTIF;
3005 			rss_csum = htonl(CPL_ASYNC_NOTIF << 24);
3006 			eop = 1;
3007 			rspq->async_notif++;
3008 			goto skip;
3009 		} else if  (flags & F_RSPD_IMM_DATA_VALID) {
3010 			struct mbuf *m = NULL;
3011 
3012 			DPRINTF("IMM DATA VALID opcode=0x%x rspq->cidx=%d\n",
3013 			    r->rss_hdr.opcode, rspq->cidx);
3014 			if (mh->mh_head == NULL)
3015 				mh->mh_head = m_gethdr(M_DONTWAIT, MT_DATA);
3016 			else
3017 				m = m_gethdr(M_DONTWAIT, MT_DATA);
3018 
3019 			if (mh->mh_head == NULL &&  m == NULL) {
3020 		no_mem:
3021 				rspq->next_holdoff = NOMEM_INTR_DELAY;
3022 				budget_left--;
3023 				break;
3024 			}
3025 			get_imm_packet(adap, r, mh->mh_head);
3026 			eop = 1;
3027 			rspq->imm_data++;
3028 		} else if (r->len_cq) {
3029 			int drop_thresh = eth ? SGE_RX_DROP_THRES : 0;
3030 
3031 			eop = get_packet(adap, drop_thresh, qs, mh, r);
3032 			if (eop) {
3033 				if (r->rss_hdr.hash_type && !adap->timestamp)
3034 					mh->mh_head->m_flags |= M_FLOWID;
3035 				mh->mh_head->m_pkthdr.flowid = rss_hash;
3036 			}
3037 
3038 			ethpad = 2;
3039 		} else {
3040 			rspq->pure_rsps++;
3041 		}
3042 	skip:
3043 		if (flags & RSPD_CTRL_MASK) {
3044 			sleeping |= flags & RSPD_GTS_MASK;
3045 			handle_rsp_cntrl_info(qs, flags);
3046 		}
3047 
3048 		r++;
3049 		if (__predict_false(++rspq->cidx == rspq->size)) {
3050 			rspq->cidx = 0;
3051 			rspq->gen ^= 1;
3052 			r = rspq->desc;
3053 		}
3054 
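		/*
		 * Return response-queue credits to the hardware in batches of
		 * 64 rather than per response, which keeps refill_rspq()'s
		 * register traffic down.
		 */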
3055 		if (++rspq->credits >= 64) {
3056 			refill_rspq(adap, rspq, rspq->credits);
3057 			rspq->credits = 0;
3058 		}
3059 		if (!eth && eop) {
3060 			mh->mh_head->m_pkthdr.csum_data = rss_csum;
3061 			/*
3062 			 * XXX size mismatch
3063 			 */
3064 			m_set_priority(mh->mh_head, rss_hash);
3065 
3066 
3067 			ngathered = rx_offload(&adap->tdev, rspq,
3068 			    mh->mh_head, offload_mbufs, ngathered);
3069 			mh->mh_head = NULL;
3070 			DPRINTF("received offload packet\n");
3071 
3072 		} else if (eth && eop) {
3073 			struct mbuf *m = mh->mh_head;
3074 
3075 			t3_rx_eth(adap, rspq, m, ethpad);
3076 
3077 			/*
3078 			 * The T304 sends incoming packets on any qset.  If LRO
3079 			 * is also enabled, we could end up sending the packet up
3080 			 * lro_ctrl->ifp's input.  That is incorrect.
3081 			 *
3082 			 * The mbuf's rcvif was derived from the cpl header and
3083 			 * is accurate.  Skip LRO and just use that.
3084 			 */
3085 			skip_lro = __predict_false(qs->port->ifp != m->m_pkthdr.rcvif);
3086 
3087 			if (lro_enabled && lro_ctrl->lro_cnt && !skip_lro
3088 #ifdef INET
3089 			    && (tcp_lro_rx(lro_ctrl, m, 0) == 0)
3090 #endif
3091 			    ) {
3092 				/* successfully queued for LRO */
3093 			} else {
3094 				/*
3095 				 * LRO not enabled, packet unsuitable for LRO,
3096 				 * or unable to queue.  Pass it up right now in
3097 				 * either case.
3098 				 */
3099 				struct ifnet *ifp = m->m_pkthdr.rcvif;
3100 				(*ifp->if_input)(ifp, m);
3101 			}
3102 			mh->mh_head = NULL;
3103 
3104 		}
3105 		__refill_fl_lt(adap, &qs->fl[0], 32);
3106 		__refill_fl_lt(adap, &qs->fl[1], 32);
3107 		--budget_left;
3108 	}
3109 
3110 	deliver_partial_bundle(&adap->tdev, rspq, offload_mbufs, ngathered);
3111 
3112 #ifdef INET
3113 	/* Flush LRO */
3114 	while (!SLIST_EMPTY(&lro_ctrl->lro_active)) {
3115 		struct lro_entry *queued = SLIST_FIRST(&lro_ctrl->lro_active);
3116 		SLIST_REMOVE_HEAD(&lro_ctrl->lro_active, next);
3117 		tcp_lro_flush(lro_ctrl, queued);
3118 	}
3119 #endif
3120 
3121 	if (sleeping)
3122 		check_ring_db(adap, qs, sleeping);
3123 
3124 	mb();  /* commit Tx queue processed updates */
3125 	if (__predict_false(qs->txq_stopped > 1))
3126 		restart_tx(qs);
3127 
3128 	__refill_fl_lt(adap, &qs->fl[0], 512);
3129 	__refill_fl_lt(adap, &qs->fl[1], 512);
3130 	budget -= budget_left;
3131 	return (budget);
3132 }
3133 
3134 /*
3135  * A helper function that processes responses and issues GTS.
3136  */
3137 static __inline int
3138 process_responses_gts(adapter_t *adap, struct sge_rspq *rq)
3139 {
3140 	int work;
3141 	static int last_holdoff = 0;
3142 
3143 	work = process_responses(adap, rspq_to_qset(rq), -1);
3144 
3145 	if (cxgb_debug && (rq->next_holdoff != last_holdoff)) {
3146 		printf("next_holdoff=%d\n", rq->next_holdoff);
3147 		last_holdoff = rq->next_holdoff;
3148 	}
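	/*
	 * The GTS write reports the new consumer index to the hardware and
	 * arms the next interrupt holdoff timer for this response queue.
	 */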
3149 	t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) |
3150 	    V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx));
3151 
3152 	return (work);
3153 }
3154 
3155 
3156 /*
3157  * Interrupt handler for legacy INTx interrupts for T3B-based cards.
3158  * Handles data events from SGE response queues as well as error and other
3159  * async events as they all use the same interrupt pin.  We use one SGE
3160  * response queue per port in this mode and protect all response queues with
3161  * queue 0's lock.
3162  */
3163 void
3164 t3b_intr(void *data)
3165 {
3166 	uint32_t i, map;
3167 	adapter_t *adap = data;
3168 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
3169 
3170 	t3_write_reg(adap, A_PL_CLI, 0);
3171 	map = t3_read_reg(adap, A_SG_DATA_INTR);
3172 
3173 	if (!map)
3174 		return;
3175 
3176 	if (__predict_false(map & F_ERRINTR)) {
3177 		t3_write_reg(adap, A_PL_INT_ENABLE0, 0);
3178 		(void) t3_read_reg(adap, A_PL_INT_ENABLE0);
3179 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
3180 	}
3181 
3182 	mtx_lock(&q0->lock);
3183 	for_each_port(adap, i)
3184 	    if (map & (1 << i))
3185 			process_responses_gts(adap, &adap->sge.qs[i].rspq);
3186 	mtx_unlock(&q0->lock);
3187 }
3188 
3189 /*
3190  * The MSI interrupt handler.  This needs to handle data events from SGE
3191  * response queues as well as error and other async events as they all use
3192  * the same MSI vector.  We use one SGE response queue per port in this mode
3193  * and protect all response queues with queue 0's lock.
3194  */
3195 void
3196 t3_intr_msi(void *data)
3197 {
3198 	adapter_t *adap = data;
3199 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
3200 	int i, new_packets = 0;
3201 
3202 	mtx_lock(&q0->lock);
3203 
3204 	for_each_port(adap, i)
3205 	    if (process_responses_gts(adap, &adap->sge.qs[i].rspq))
3206 		    new_packets = 1;
3207 	mtx_unlock(&q0->lock);
3208 	if (new_packets == 0) {
3209 		t3_write_reg(adap, A_PL_INT_ENABLE0, 0);
3210 		(void) t3_read_reg(adap, A_PL_INT_ENABLE0);
3211 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
3212 	}
3213 }
3214 
3215 void
3216 t3_intr_msix(void *data)
3217 {
3218 	struct sge_qset *qs = data;
3219 	adapter_t *adap = qs->port->adapter;
3220 	struct sge_rspq *rspq = &qs->rspq;
3221 
3222 	if (process_responses_gts(adap, rspq) == 0)
3223 		rspq->unhandled_irqs++;
3224 }
3225 
3226 #define QDUMP_SBUF_SIZE		(32 * 400)
3227 static int
3228 t3_dump_rspq(SYSCTL_HANDLER_ARGS)
3229 {
3230 	struct sge_rspq *rspq;
3231 	struct sge_qset *qs;
3232 	int i, err, dump_end, idx;
3233 	struct sbuf *sb;
3234 	struct rsp_desc *rspd;
3235 	uint32_t data[4];
3236 
3237 	rspq = arg1;
3238 	qs = rspq_to_qset(rspq);
3239 	if (rspq->rspq_dump_count == 0)
3240 		return (0);
3241 	if (rspq->rspq_dump_count > RSPQ_Q_SIZE) {
3242 		log(LOG_WARNING,
3243 		    "dump count is too large %d\n", rspq->rspq_dump_count);
3244 		rspq->rspq_dump_count = 0;
3245 		return (EINVAL);
3246 	}
3247 	if (rspq->rspq_dump_start > (RSPQ_Q_SIZE-1)) {
3248 		log(LOG_WARNING,
3249 		    "dump start of %d is greater than queue size\n",
3250 		    rspq->rspq_dump_start);
3251 		rspq->rspq_dump_start = 0;
3252 		return (EINVAL);
3253 	}
3254 	err = t3_sge_read_rspq(qs->port->adapter, rspq->cntxt_id, data);
3255 	if (err)
3256 		return (err);
3257 	err = sysctl_wire_old_buffer(req, 0);
3258 	if (err)
3259 		return (err);
3260 	sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req);
3261 
3262 	sbuf_printf(sb, " \n index=%u size=%u MSI-X/RspQ=%u intr enable=%u intr armed=%u\n",
3263 	    (data[0] & 0xffff), data[0] >> 16, ((data[2] >> 20) & 0x3f),
3264 	    ((data[2] >> 26) & 1), ((data[2] >> 27) & 1));
3265 	sbuf_printf(sb, " generation=%u CQ mode=%u FL threshold=%u\n",
3266 	    ((data[2] >> 28) & 1), ((data[2] >> 31) & 1), data[3]);
3267 
3268 	sbuf_printf(sb, " start=%d -> end=%d\n", rspq->rspq_dump_start,
3269 	    (rspq->rspq_dump_start + rspq->rspq_dump_count) & (RSPQ_Q_SIZE-1));
3270 
3271 	dump_end = rspq->rspq_dump_start + rspq->rspq_dump_count;
3272 	for (i = rspq->rspq_dump_start; i < dump_end; i++) {
3273 		idx = i & (RSPQ_Q_SIZE-1);
3274 
3275 		rspd = &rspq->desc[idx];
3276 		sbuf_printf(sb, "\tidx=%04d opcode=%02x cpu_idx=%x hash_type=%x cq_idx=%x\n",
3277 		    idx, rspd->rss_hdr.opcode, rspd->rss_hdr.cpu_idx,
3278 		    rspd->rss_hdr.hash_type, be16toh(rspd->rss_hdr.cq_idx));
3279 		sbuf_printf(sb, "\trss_hash_val=%x flags=%08x len_cq=%x intr_gen=%x\n",
3280 		    rspd->rss_hdr.rss_hash_val, be32toh(rspd->flags),
3281 		    be32toh(rspd->len_cq), rspd->intr_gen);
3282 	}
3283 
3284 	err = sbuf_finish(sb);
3285 	/* Output a trailing NUL. */
3286 	if (err == 0)
3287 		err = SYSCTL_OUT(req, "", 1);
3288 	sbuf_delete(sb);
3289 	return (err);
3290 }
3291 
3292 static int
3293 t3_dump_txq_eth(SYSCTL_HANDLER_ARGS)
3294 {
3295 	struct sge_txq *txq;
3296 	struct sge_qset *qs;
3297 	int i, j, err, dump_end;
3298 	struct sbuf *sb;
3299 	struct tx_desc *txd;
3300 	uint32_t *WR, wr_hi, wr_lo, gen;
3301 	uint32_t data[4];
3302 
3303 	txq = arg1;
3304 	qs = txq_to_qset(txq, TXQ_ETH);
3305 	if (txq->txq_dump_count == 0) {
3306 		return (0);
3307 	}
3308 	if (txq->txq_dump_count > TX_ETH_Q_SIZE) {
3309 		log(LOG_WARNING,
3310 		    "dump count is too large %d\n", txq->txq_dump_count);
3311 		txq->txq_dump_count = 1;
3312 		return (EINVAL);
3313 	}
3314 	if (txq->txq_dump_start > (TX_ETH_Q_SIZE-1)) {
3315 		log(LOG_WARNING,
3316 		    "dump start of %d is greater than queue size\n",
3317 		    txq->txq_dump_start);
3318 		txq->txq_dump_start = 0;
3319 		return (EINVAL);
3320 	}
3321 	err = t3_sge_read_ecntxt(qs->port->adapter, qs->rspq.cntxt_id, data);
3322 	if (err)
3323 		return (err);
3324 	err = sysctl_wire_old_buffer(req, 0);
3325 	if (err)
3326 		return (err);
3327 	sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req);
3328 
3329 	sbuf_printf(sb, " \n credits=%u GTS=%u index=%u size=%u rspq#=%u cmdq#=%u\n",
3330 	    (data[0] & 0x7fff), ((data[0] >> 15) & 1), (data[0] >> 16),
3331 	    (data[1] & 0xffff), ((data[3] >> 4) & 7), ((data[3] >> 7) & 1));
3332 	sbuf_printf(sb, " TUN=%u TOE=%u generation=%u uP token=%u valid=%u\n",
3333 	    ((data[3] >> 8) & 1), ((data[3] >> 9) & 1), ((data[3] >> 10) & 1),
3334 	    ((data[3] >> 11) & 0xfffff), ((data[3] >> 31) & 1));
3335 	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
3336 	    txq->txq_dump_start,
3337 	    (txq->txq_dump_start + txq->txq_dump_count) & (TX_ETH_Q_SIZE-1));
3338 
3339 	dump_end = txq->txq_dump_start + txq->txq_dump_count;
3340 	for (i = txq->txq_dump_start; i < dump_end; i++) {
3341 		txd = &txq->desc[i & (TX_ETH_Q_SIZE-1)];
3342 		WR = (uint32_t *)txd->flit;
3343 		wr_hi = ntohl(WR[0]);
3344 		wr_lo = ntohl(WR[1]);
3345 		gen = G_WR_GEN(wr_lo);
3346 
3347 		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
3348 		    wr_hi, wr_lo, gen);
3349 		for (j = 2; j < 30; j += 4)
3350 			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
3351 			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
3352 
3353 	}
3354 	err = sbuf_finish(sb);
3355 	/* Output a trailing NUL. */
3356 	if (err == 0)
3357 		err = SYSCTL_OUT(req, "", 1);
3358 	sbuf_delete(sb);
3359 	return (err);
3360 }
3361 
3362 static int
3363 t3_dump_txq_ctrl(SYSCTL_HANDLER_ARGS)
3364 {
3365 	struct sge_txq *txq;
3366 	struct sge_qset *qs;
3367 	int i, j, err, dump_end;
3368 	struct sbuf *sb;
3369 	struct tx_desc *txd;
3370 	uint32_t *WR, wr_hi, wr_lo, gen;
3371 
3372 	txq = arg1;
3373 	qs = txq_to_qset(txq, TXQ_CTRL);
3374 	if (txq->txq_dump_count == 0) {
3375 		return (0);
3376 	}
3377 	if (txq->txq_dump_count > 256) {
3378 		log(LOG_WARNING,
3379 		    "dump count is too large %d\n", txq->txq_dump_count);
3380 		txq->txq_dump_count = 1;
3381 		return (EINVAL);
3382 	}
3383 	if (txq->txq_dump_start > 255) {
3384 		log(LOG_WARNING,
3385 		    "dump start of %d is greater than queue size\n",
3386 		    txq->txq_dump_start);
3387 		txq->txq_dump_start = 0;
3388 		return (EINVAL);
3389 	}
3390 
3391 	err = sysctl_wire_old_buffer(req, 0);
3392 	if (err != 0)
3393 		return (err);
3394 	sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req);
3395 	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
3396 	    txq->txq_dump_start,
3397 	    (txq->txq_dump_start + txq->txq_dump_count) & 255);
3398 
3399 	dump_end = txq->txq_dump_start + txq->txq_dump_count;
3400 	for (i = txq->txq_dump_start; i < dump_end; i++) {
3401 		txd = &txq->desc[i & (255)];
3402 		WR = (uint32_t *)txd->flit;
3403 		wr_hi = ntohl(WR[0]);
3404 		wr_lo = ntohl(WR[1]);
3405 		gen = G_WR_GEN(wr_lo);
3406 
3407 		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
3408 		    wr_hi, wr_lo, gen);
3409 		for (j = 2; j < 30; j += 4)
3410 			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
3411 			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
3412 
3413 	}
3414 	err = sbuf_finish(sb);
3415 	/* Output a trailing NUL. */
3416 	if (err == 0)
3417 		err = SYSCTL_OUT(req, "", 1);
3418 	sbuf_delete(sb);
3419 	return (err);
3420 }
3421 
3422 static int
3423 t3_set_coalesce_usecs(SYSCTL_HANDLER_ARGS)
3424 {
3425 	adapter_t *sc = arg1;
3426 	struct qset_params *qsp = &sc->params.sge.qset[0];
3427 	int coalesce_usecs;
3428 	struct sge_qset *qs;
3429 	int i, j, err, nqsets = 0;
3430 	struct mtx *lock;
3431 
3432 	if ((sc->flags & FULL_INIT_DONE) == 0)
3433 		return (ENXIO);
3434 
3435 	coalesce_usecs = qsp->coalesce_usecs;
3436 	err = sysctl_handle_int(oidp, &coalesce_usecs, arg2, req);
3437 
3438 	if (err != 0) {
3439 		return (err);
3440 	}
3441 	if (coalesce_usecs == qsp->coalesce_usecs)
3442 		return (0);
3443 
3444 	for (i = 0; i < sc->params.nports; i++)
3445 		for (j = 0; j < sc->port[i].nqsets; j++)
3446 			nqsets++;
3447 
3448 	coalesce_usecs = max(1, coalesce_usecs);
3449 
3450 	for (i = 0; i < nqsets; i++) {
3451 		qs = &sc->sge.qs[i];
3452 		qsp = &sc->params.sge.qset[i];
3453 		qsp->coalesce_usecs = coalesce_usecs;
3454 
3455 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
3456 			    &sc->sge.qs[0].rspq.lock;
3457 
3458 		mtx_lock(lock);
3459 		t3_update_qset_coalesce(qs, qsp);
3460 		t3_write_reg(sc, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) |
3461 		    V_NEWTIMER(qs->rspq.holdoff_tmr));
3462 		mtx_unlock(lock);
3463 	}
3464 
3465 	return (0);
3466 }
3467 
3468 static int
3469 t3_pkt_timestamp(SYSCTL_HANDLER_ARGS)
3470 {
3471 	adapter_t *sc = arg1;
3472 	int rc, timestamp;
3473 
3474 	if ((sc->flags & FULL_INIT_DONE) == 0)
3475 		return (ENXIO);
3476 
3477 	timestamp = sc->timestamp;
3478 	rc = sysctl_handle_int(oidp, &timestamp, arg2, req);
3479 
3480 	if (rc != 0)
3481 		return (rc);
3482 
3483 	if (timestamp != sc->timestamp) {
3484 		t3_set_reg_field(sc, A_TP_PC_CONFIG2, F_ENABLERXPKTTMSTPRSS,
3485 		    timestamp ? F_ENABLERXPKTTMSTPRSS : 0);
3486 		sc->timestamp = timestamp;
3487 	}
3488 
3489 	return (0);
3490 }
3491 
3492 void
3493 t3_add_attach_sysctls(adapter_t *sc)
3494 {
3495 	struct sysctl_ctx_list *ctx;
3496 	struct sysctl_oid_list *children;
3497 
3498 	ctx = device_get_sysctl_ctx(sc->dev);
3499 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3500 
3501 	/* random information */
3502 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
3503 	    "firmware_version",
3504 	    CTLFLAG_RD, &sc->fw_version,
3505 	    0, "firmware version");
3506 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO,
3507 	    "hw_revision",
3508 	    CTLFLAG_RD, &sc->params.rev,
3509 	    0, "chip model");
3510 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
3511 	    "port_types",
3512 	    CTLFLAG_RD, &sc->port_types,
3513 	    0, "type of ports");
3514 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3515 	    "enable_debug",
3516 	    CTLFLAG_RW, &cxgb_debug,
3517 	    0, "enable verbose debugging output");
3518 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tunq_coalesce",
3519 	    CTLFLAG_RD, &sc->tunq_coalesce,
3520 	    "#tunneled packets freed");
3521 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3522 	    "txq_overrun",
3523 	    CTLFLAG_RD, &txq_fills,
3524 	    0, "#times txq overrun");
3525 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO,
3526 	    "core_clock",
3527 	    CTLFLAG_RD, &sc->params.vpd.cclk,
3528 	    0, "core clock frequency (in KHz)");
3529 }
3530 
3531 
3532 static const char *rspq_name = "rspq";
3533 static const char *txq_names[] =
3534 {
3535 	"txq_eth",
3536 	"txq_ofld",
3537 	"txq_ctrl"
3538 };
3539 
3540 static int
3541 sysctl_handle_macstat(SYSCTL_HANDLER_ARGS)
3542 {
3543 	struct port_info *p = arg1;
3544 	uint64_t *parg;
3545 
3546 	if (!p)
3547 		return (EINVAL);
3548 
3549 	parg = (uint64_t *) ((uint8_t *)&p->mac.stats + arg2);
3550 	PORT_LOCK(p);
3551 	t3_mac_update_stats(&p->mac);
3552 	PORT_UNLOCK(p);
3553 
3554 	return (sysctl_handle_64(oidp, parg, 0, req));
3555 }
3556 
3557 void
3558 t3_add_configured_sysctls(adapter_t *sc)
3559 {
3560 	struct sysctl_ctx_list *ctx;
3561 	struct sysctl_oid_list *children;
3562 	int i, j;
3563 
3564 	ctx = device_get_sysctl_ctx(sc->dev);
3565 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3566 
3567 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
3568 	    "intr_coal",
3569 	    CTLTYPE_INT|CTLFLAG_RW, sc,
3570 	    0, t3_set_coalesce_usecs,
3571 	    "I", "interrupt coalescing timer (us)");
3572 
3573 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
3574 	    "pkt_timestamp",
3575 	    CTLTYPE_INT | CTLFLAG_RW, sc,
3576 	    0, t3_pkt_timestamp,
3577 	    "I", "provide packet timestamp instead of connection hash");
3578 
3579 	for (i = 0; i < sc->params.nports; i++) {
3580 		struct port_info *pi = &sc->port[i];
3581 		struct sysctl_oid *poid;
3582 		struct sysctl_oid_list *poidlist;
3583 		struct mac_stats *mstats = &pi->mac.stats;
3584 
3585 		snprintf(pi->namebuf, PORT_NAME_LEN, "port%d", i);
3586 		poid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO,
3587 		    pi->namebuf, CTLFLAG_RD, NULL, "port statistics");
3588 		poidlist = SYSCTL_CHILDREN(poid);
3589 		SYSCTL_ADD_UINT(ctx, poidlist, OID_AUTO,
3590 		    "nqsets", CTLFLAG_RD, &pi->nqsets,
3591 		    0, "#queue sets");
3592 
3593 		for (j = 0; j < pi->nqsets; j++) {
3594 			struct sge_qset *qs = &sc->sge.qs[pi->first_qset + j];
3595 			struct sysctl_oid *qspoid, *rspqpoid, *txqpoid,
3596 					  *ctrlqpoid, *lropoid;
3597 			struct sysctl_oid_list *qspoidlist, *rspqpoidlist,
3598 					       *txqpoidlist, *ctrlqpoidlist,
3599 					       *lropoidlist;
3600 			struct sge_txq *txq = &qs->txq[TXQ_ETH];
3601 
3602 			snprintf(qs->namebuf, QS_NAME_LEN, "qs%d", j);
3603 
3604 			qspoid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO,
3605 			    qs->namebuf, CTLFLAG_RD, NULL, "qset statistics");
3606 			qspoidlist = SYSCTL_CHILDREN(qspoid);
3607 
3608 			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl0_empty",
3609 					CTLFLAG_RD, &qs->fl[0].empty, 0,
3610 					"freelist #0 empty");
3611 			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl1_empty",
3612 					CTLFLAG_RD, &qs->fl[1].empty, 0,
3613 					"freelist #1 empty");
3614 
3615 			rspqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3616 			    rspq_name, CTLFLAG_RD, NULL, "rspq statistics");
3617 			rspqpoidlist = SYSCTL_CHILDREN(rspqpoid);
3618 
3619 			txqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3620 			    txq_names[0], CTLFLAG_RD, NULL, "txq statistics");
3621 			txqpoidlist = SYSCTL_CHILDREN(txqpoid);
3622 
3623 			ctrlqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3624 			    txq_names[2], CTLFLAG_RD, NULL, "ctrlq statistics");
3625 			ctrlqpoidlist = SYSCTL_CHILDREN(ctrlqpoid);
3626 
3627 			lropoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3628 			    "lro_stats", CTLFLAG_RD, NULL, "LRO statistics");
3629 			lropoidlist = SYSCTL_CHILDREN(lropoid);
3630 
3631 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "size",
3632 			    CTLFLAG_RD, &qs->rspq.size,
3633 			    0, "#entries in response queue");
3634 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "cidx",
3635 			    CTLFLAG_RD, &qs->rspq.cidx,
3636 			    0, "consumer index");
3637 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "credits",
3638 			    CTLFLAG_RD, &qs->rspq.credits,
3639 			    0, "#credits");
3640 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "starved",
3641 			    CTLFLAG_RD, &qs->rspq.starved,
3642 			    0, "#times starved");
3643 			SYSCTL_ADD_ULONG(ctx, rspqpoidlist, OID_AUTO, "phys_addr",
3644 			    CTLFLAG_RD, &qs->rspq.phys_addr,
3645 			    "physical address of the queue");
3646 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_start",
3647 			    CTLFLAG_RW, &qs->rspq.rspq_dump_start,
3648 			    0, "start rspq dump entry");
3649 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_count",
3650 			    CTLFLAG_RW, &qs->rspq.rspq_dump_count,
3651 			    0, "#rspq entries to dump");
3652 			SYSCTL_ADD_PROC(ctx, rspqpoidlist, OID_AUTO, "qdump",
3653 			    CTLTYPE_STRING | CTLFLAG_RD, &qs->rspq,
3654 			    0, t3_dump_rspq, "A", "dump of the response queue");
3655 
3656 			SYSCTL_ADD_UQUAD(ctx, txqpoidlist, OID_AUTO, "dropped",
3657 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].txq_mr->br_drops,
3658 			    "#tunneled packets dropped");
3659 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "sendqlen",
3660 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].sendq.qlen,
3661 			    0, "#tunneled packets waiting to be sent");
3662 #if 0
3663 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_pidx",
3664 			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_prod,
3665 			    0, "#tunneled packets queue producer index");
3666 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_cidx",
3667 			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_cons,
3668 			    0, "#tunneled packets queue consumer index");
3669 #endif
3670 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "processed",
3671 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].processed,
3672 			    0, "#tunneled packets processed by the card");
3673 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "cleaned",
3674 			    CTLFLAG_RD, &txq->cleaned,
3675 			    0, "#tunneled packets cleaned");
3676 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "in_use",
3677 			    CTLFLAG_RD, &txq->in_use,
3678 			    0, "#tunneled packet slots in use");
3679 			SYSCTL_ADD_ULONG(ctx, txqpoidlist, OID_AUTO, "frees",
3680 			    CTLFLAG_RD, &txq->txq_frees,
3681 			    "#tunneled packets freed");
3682 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "skipped",
3683 			    CTLFLAG_RD, &txq->txq_skipped,
3684 			    0, "#tunneled packet descriptors skipped");
3685 			SYSCTL_ADD_UQUAD(ctx, txqpoidlist, OID_AUTO, "coalesced",
3686 			    CTLFLAG_RD, &txq->txq_coalesced,
3687 			    "#tunneled packets coalesced");
3688 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "enqueued",
3689 			    CTLFLAG_RD, &txq->txq_enqueued,
3690 			    0, "#tunneled packets enqueued to hardware");
3691 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "stopped_flags",
3692 			    CTLFLAG_RD, &qs->txq_stopped,
3693 			    0, "bitmask of stopped tx queues");
3694 			SYSCTL_ADD_ULONG(ctx, txqpoidlist, OID_AUTO, "phys_addr",
3695 			    CTLFLAG_RD, &txq->phys_addr,
3696 			    "physical address of the queue");
3697 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "qgen",
3698 			    CTLFLAG_RW, &qs->txq[TXQ_ETH].gen,
3699 			    0, "txq generation");
3700 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_cidx",
3701 			    CTLFLAG_RD, &txq->cidx,
3702 			    0, "hardware queue cidx");
3703 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_pidx",
3704 			    CTLFLAG_RD, &txq->pidx,
3705 			    0, "hardware queue pidx");
3706 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_start",
3707 			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_start,
3708 			    0, "txq start idx for dump");
3709 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_count",
3710 			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_count,
3711 			    0, "txq #entries to dump");
3712 			SYSCTL_ADD_PROC(ctx, txqpoidlist, OID_AUTO, "qdump",
3713 			    CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_ETH],
3714 			    0, t3_dump_txq_eth, "A", "dump of the transmit queue");
3715 
3716 			SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_start",
3717 			    CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_start,
3718 			    0, "ctrlq start idx for dump");
3719 			SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_count",
3720 			    CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_count,
3721 			    0, "ctrl #entries to dump");
3722 			SYSCTL_ADD_PROC(ctx, ctrlqpoidlist, OID_AUTO, "qdump",
3723 			    CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_CTRL],
3724 			    0, t3_dump_txq_ctrl, "A", "dump of the control queue");
3725 
3726 			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_queued",
3727 			    CTLFLAG_RD, &qs->lro.ctrl.lro_queued, 0, NULL);
3728 			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_flushed",
3729 			    CTLFLAG_RD, &qs->lro.ctrl.lro_flushed, 0, NULL);
3730 			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_bad_csum",
3731 			    CTLFLAG_RD, &qs->lro.ctrl.lro_bad_csum, 0, NULL);
3732 			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_cnt",
3733 			    CTLFLAG_RD, &qs->lro.ctrl.lro_cnt, 0, NULL);
3734 		}
3735 
3736 		/* Now add a node for mac stats. */
3737 		poid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO, "mac_stats",
3738 		    CTLFLAG_RD, NULL, "MAC statistics");
3739 		poidlist = SYSCTL_CHILDREN(poid);
3740 
3741 		/*
3742 		 * We (ab)use the length argument (arg2) to pass on the offset
3743 		 * of the data that we are interested in.  This is only required
3744 		 * for the quad counters that are updated from the hardware (we
3745 		 * make sure that we return the latest value).
3746 		 * sysctl_handle_macstat first updates *all* the counters from
3747 		 * the hardware, and then returns the latest value of the
3748 		 * requested counter.  Best would be to update only the
3749 		 * requested counter from hardware, but t3_mac_update_stats()
3750 		 * hides all the register details and we don't want to dive into
3751 		 * all that here.
3752 		 */
3753 #define CXGB_SYSCTL_ADD_QUAD(a)	SYSCTL_ADD_OID(ctx, poidlist, OID_AUTO, #a, \
3754     (CTLTYPE_U64 | CTLFLAG_RD), pi, offsetof(struct mac_stats, a), \
3755     sysctl_handle_macstat, "QU", 0)
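		/*
		 * For example, CXGB_SYSCTL_ADD_QUAD(tx_octets) creates a
		 * read-only 64-bit node named "tx_octets" whose handler is
		 * passed offsetof(struct mac_stats, tx_octets) in arg2.
		 */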
3756 		CXGB_SYSCTL_ADD_QUAD(tx_octets);
3757 		CXGB_SYSCTL_ADD_QUAD(tx_octets_bad);
3758 		CXGB_SYSCTL_ADD_QUAD(tx_frames);
3759 		CXGB_SYSCTL_ADD_QUAD(tx_mcast_frames);
3760 		CXGB_SYSCTL_ADD_QUAD(tx_bcast_frames);
3761 		CXGB_SYSCTL_ADD_QUAD(tx_pause);
3762 		CXGB_SYSCTL_ADD_QUAD(tx_deferred);
3763 		CXGB_SYSCTL_ADD_QUAD(tx_late_collisions);
3764 		CXGB_SYSCTL_ADD_QUAD(tx_total_collisions);
3765 		CXGB_SYSCTL_ADD_QUAD(tx_excess_collisions);
3766 		CXGB_SYSCTL_ADD_QUAD(tx_underrun);
3767 		CXGB_SYSCTL_ADD_QUAD(tx_len_errs);
3768 		CXGB_SYSCTL_ADD_QUAD(tx_mac_internal_errs);
3769 		CXGB_SYSCTL_ADD_QUAD(tx_excess_deferral);
3770 		CXGB_SYSCTL_ADD_QUAD(tx_fcs_errs);
3771 		CXGB_SYSCTL_ADD_QUAD(tx_frames_64);
3772 		CXGB_SYSCTL_ADD_QUAD(tx_frames_65_127);
3773 		CXGB_SYSCTL_ADD_QUAD(tx_frames_128_255);
3774 		CXGB_SYSCTL_ADD_QUAD(tx_frames_256_511);
3775 		CXGB_SYSCTL_ADD_QUAD(tx_frames_512_1023);
3776 		CXGB_SYSCTL_ADD_QUAD(tx_frames_1024_1518);
3777 		CXGB_SYSCTL_ADD_QUAD(tx_frames_1519_max);
3778 		CXGB_SYSCTL_ADD_QUAD(rx_octets);
3779 		CXGB_SYSCTL_ADD_QUAD(rx_octets_bad);
3780 		CXGB_SYSCTL_ADD_QUAD(rx_frames);
3781 		CXGB_SYSCTL_ADD_QUAD(rx_mcast_frames);
3782 		CXGB_SYSCTL_ADD_QUAD(rx_bcast_frames);
3783 		CXGB_SYSCTL_ADD_QUAD(rx_pause);
3784 		CXGB_SYSCTL_ADD_QUAD(rx_fcs_errs);
3785 		CXGB_SYSCTL_ADD_QUAD(rx_align_errs);
3786 		CXGB_SYSCTL_ADD_QUAD(rx_symbol_errs);
3787 		CXGB_SYSCTL_ADD_QUAD(rx_data_errs);
3788 		CXGB_SYSCTL_ADD_QUAD(rx_sequence_errs);
3789 		CXGB_SYSCTL_ADD_QUAD(rx_runt);
3790 		CXGB_SYSCTL_ADD_QUAD(rx_jabber);
3791 		CXGB_SYSCTL_ADD_QUAD(rx_short);
3792 		CXGB_SYSCTL_ADD_QUAD(rx_too_long);
3793 		CXGB_SYSCTL_ADD_QUAD(rx_mac_internal_errs);
3794 		CXGB_SYSCTL_ADD_QUAD(rx_cong_drops);
3795 		CXGB_SYSCTL_ADD_QUAD(rx_frames_64);
3796 		CXGB_SYSCTL_ADD_QUAD(rx_frames_65_127);
3797 		CXGB_SYSCTL_ADD_QUAD(rx_frames_128_255);
3798 		CXGB_SYSCTL_ADD_QUAD(rx_frames_256_511);
3799 		CXGB_SYSCTL_ADD_QUAD(rx_frames_512_1023);
3800 		CXGB_SYSCTL_ADD_QUAD(rx_frames_1024_1518);
3801 		CXGB_SYSCTL_ADD_QUAD(rx_frames_1519_max);
3802 #undef CXGB_SYSCTL_ADD_QUAD
3803 
3804 #define CXGB_SYSCTL_ADD_ULONG(a) SYSCTL_ADD_ULONG(ctx, poidlist, OID_AUTO, #a, \
3805     CTLFLAG_RD, &mstats->a, 0)
3806 		CXGB_SYSCTL_ADD_ULONG(tx_fifo_parity_err);
3807 		CXGB_SYSCTL_ADD_ULONG(rx_fifo_parity_err);
3808 		CXGB_SYSCTL_ADD_ULONG(tx_fifo_urun);
3809 		CXGB_SYSCTL_ADD_ULONG(rx_fifo_ovfl);
3810 		CXGB_SYSCTL_ADD_ULONG(serdes_signal_loss);
3811 		CXGB_SYSCTL_ADD_ULONG(xaui_pcs_ctc_err);
3812 		CXGB_SYSCTL_ADD_ULONG(xaui_pcs_align_change);
3813 		CXGB_SYSCTL_ADD_ULONG(num_toggled);
3814 		CXGB_SYSCTL_ADD_ULONG(num_resets);
3815 		CXGB_SYSCTL_ADD_ULONG(link_faults);
3816 #undef CXGB_SYSCTL_ADD_ULONG
3817 	}
3818 }
3819 
3820 /**
3821  *	t3_get_desc - dump an SGE descriptor for debugging purposes
3822  *	@qs: the queue set
3823  *	@qnum: identifies the specific queue (0..2: Tx, 3:response, 4..5: Rx)
3824  *	@idx: the descriptor index in the queue
3825  *	@data: where to dump the descriptor contents
3826  *
3827  *	Dumps the contents of a HW descriptor of an SGE queue.  Returns the
3828  *	size of the descriptor.
3829  */
3830 int
3831 t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx,
3832 		unsigned char *data)
3833 {
3834 	if (qnum >= 6)
3835 		return (EINVAL);
3836 
3837 	if (qnum < 3) {
3838 		if (!qs->txq[qnum].desc || idx >= qs->txq[qnum].size)
3839 			return (EINVAL);
3840 		memcpy(data, &qs->txq[qnum].desc[idx], sizeof(struct tx_desc));
3841 		return (sizeof(struct tx_desc));
3842 	}
3843 
3844 	if (qnum == 3) {
3845 		if (!qs->rspq.desc || idx >= qs->rspq.size)
3846 			return (EINVAL);
3847 		memcpy(data, &qs->rspq.desc[idx], sizeof(struct rsp_desc));
3848 		return (sizeof(struct rsp_desc));
3849 	}
3850 
3851 	qnum -= 4;
3852 	if (!qs->fl[qnum].desc || idx >= qs->fl[qnum].size)
3853 		return (EINVAL);
3854 	memcpy(data, &qs->fl[qnum].desc[idx], sizeof(struct rx_desc));
3855 	return (sizeof(struct rx_desc));
3856 }
3857