xref: /freebsd/sys/dev/cxgb/cxgb_sge.c (revision 9517e866259191fcd39434a97ad849a9b59b9b9f)
1 /**************************************************************************
2 
3 Copyright (c) 2007-2009, Chelsio Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Neither the name of the Chelsio Corporation nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15 
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27 
28 ***************************************************************************/
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/kernel.h>
36 #include <sys/module.h>
37 #include <sys/bus.h>
38 #include <sys/conf.h>
39 #include <machine/bus.h>
40 #include <machine/resource.h>
41 #include <sys/bus_dma.h>
42 #include <sys/rman.h>
43 #include <sys/queue.h>
44 #include <sys/sysctl.h>
45 #include <sys/taskqueue.h>
46 
47 #include <sys/proc.h>
48 #include <sys/sbuf.h>
49 #include <sys/sched.h>
50 #include <sys/smp.h>
51 #include <sys/systm.h>
52 #include <sys/syslog.h>
53 
54 #include <net/bpf.h>
55 
56 #include <netinet/in_systm.h>
57 #include <netinet/in.h>
58 #include <netinet/ip.h>
59 #include <netinet/tcp.h>
60 
61 #include <dev/pci/pcireg.h>
62 #include <dev/pci/pcivar.h>
63 
64 #include <vm/vm.h>
65 #include <vm/pmap.h>
66 
67 #include <cxgb_include.h>
68 #include <sys/mvec.h>
69 
70 int	txq_fills = 0;
71 int	multiq_tx_enable = 1;
72 
73 extern struct sysctl_oid_list sysctl__hw_cxgb_children;
74 int cxgb_txq_buf_ring_size = TX_ETH_Q_SIZE;
75 TUNABLE_INT("hw.cxgb.txq_mr_size", &cxgb_txq_buf_ring_size);
76 SYSCTL_UINT(_hw_cxgb, OID_AUTO, txq_mr_size, CTLFLAG_RDTUN, &cxgb_txq_buf_ring_size, 0,
77     "size of per-queue mbuf ring");
78 
79 static int cxgb_tx_coalesce_force = 0;
80 TUNABLE_INT("hw.cxgb.tx_coalesce_force", &cxgb_tx_coalesce_force);
81 SYSCTL_UINT(_hw_cxgb, OID_AUTO, tx_coalesce_force, CTLFLAG_RW,
82     &cxgb_tx_coalesce_force, 0,
83     "coalesce small packets into a single work request regardless of ring state");
84 
85 #define	COALESCE_START_DEFAULT		TX_ETH_Q_SIZE>>1
86 #define	COALESCE_START_MAX		(TX_ETH_Q_SIZE-(TX_ETH_Q_SIZE>>3))
87 #define	COALESCE_STOP_DEFAULT		TX_ETH_Q_SIZE>>2
88 #define	COALESCE_STOP_MIN		TX_ETH_Q_SIZE>>5
89 #define	TX_RECLAIM_DEFAULT		TX_ETH_Q_SIZE>>5
90 #define	TX_RECLAIM_MAX			TX_ETH_Q_SIZE>>2
91 #define	TX_RECLAIM_MIN			TX_ETH_Q_SIZE>>6
92 
93 
94 static int cxgb_tx_coalesce_enable_start = COALESCE_START_DEFAULT;
95 TUNABLE_INT("hw.cxgb.tx_coalesce_enable_start",
96     &cxgb_tx_coalesce_enable_start);
97 SYSCTL_UINT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_start, CTLFLAG_RW,
98     &cxgb_tx_coalesce_enable_start, 0,
99     "coalesce enable threshold");
100 static int cxgb_tx_coalesce_enable_stop = COALESCE_STOP_DEFAULT;
101 TUNABLE_INT("hw.cxgb.tx_coalesce_enable_stop", &cxgb_tx_coalesce_enable_stop);
102 SYSCTL_UINT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_stop, CTLFLAG_RW,
103     &cxgb_tx_coalesce_enable_stop, 0,
104     "coalesce disable threshold");
105 static int cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT;
106 TUNABLE_INT("hw.cxgb.tx_reclaim_threshold", &cxgb_tx_reclaim_threshold);
107 SYSCTL_UINT(_hw_cxgb, OID_AUTO, tx_reclaim_threshold, CTLFLAG_RW,
108     &cxgb_tx_reclaim_threshold, 0,
109     "tx cleaning minimum threshold");
110 
111 /*
112  * XXX don't re-enable this until TOE stops assuming
113  * we have an m_ext
114  */
115 static int recycle_enable = 0;
116 int cxgb_ext_freed = 0;
117 int cxgb_ext_inited = 0;
118 int fl_q_size = 0;
119 int jumbo_q_size = 0;
120 
121 extern int cxgb_use_16k_clusters;
122 extern int nmbjumbo4;
123 extern int nmbjumbo9;
124 extern int nmbjumbo16;
125 
126 #define USE_GTS 0
127 
128 #define SGE_RX_SM_BUF_SIZE	1536
129 #define SGE_RX_DROP_THRES	16
130 #define SGE_RX_COPY_THRES	128
131 
132 /*
133  * Period of the Tx buffer reclaim timer.  This timer does not need to run
134  * frequently as Tx buffers are usually reclaimed by new Tx packets.
135  */
136 #define TX_RECLAIM_PERIOD       (hz >> 1)
137 
138 /*
139  * Values for sge_txq.flags
140  */
141 enum {
142 	TXQ_RUNNING	= 1 << 0,  /* fetch engine is running */
143 	TXQ_LAST_PKT_DB = 1 << 1,  /* last packet rang the doorbell */
144 };
145 
146 struct tx_desc {
147 	uint64_t	flit[TX_DESC_FLITS];
148 } __packed;
149 
150 struct rx_desc {
151 	uint32_t	addr_lo;
152 	uint32_t	len_gen;
153 	uint32_t	gen2;
154 	uint32_t	addr_hi;
155 } __packed;;
156 
157 struct rsp_desc {               /* response queue descriptor */
158 	struct rss_header	rss_hdr;
159 	uint32_t		flags;
160 	uint32_t		len_cq;
161 	uint8_t			imm_data[47];
162 	uint8_t			intr_gen;
163 } __packed;
164 
165 #define RX_SW_DESC_MAP_CREATED	(1 << 0)
166 #define TX_SW_DESC_MAP_CREATED	(1 << 1)
167 #define RX_SW_DESC_INUSE        (1 << 3)
168 #define TX_SW_DESC_MAPPED       (1 << 4)
169 
170 #define RSPQ_NSOP_NEOP           G_RSPD_SOP_EOP(0)
171 #define RSPQ_EOP                 G_RSPD_SOP_EOP(F_RSPD_EOP)
172 #define RSPQ_SOP                 G_RSPD_SOP_EOP(F_RSPD_SOP)
173 #define RSPQ_SOP_EOP             G_RSPD_SOP_EOP(F_RSPD_SOP|F_RSPD_EOP)
174 
175 struct tx_sw_desc {                /* SW state per Tx descriptor */
176 	struct mbuf	*m;
177 	bus_dmamap_t	map;
178 	int		flags;
179 };
180 
181 struct rx_sw_desc {                /* SW state per Rx descriptor */
182 	caddr_t		rxsd_cl;
183 	struct mbuf	*m;
184 	bus_dmamap_t	map;
185 	int		flags;
186 };
187 
188 struct txq_state {
189 	unsigned int	compl;
190 	unsigned int	gen;
191 	unsigned int	pidx;
192 };
193 
194 struct refill_fl_cb_arg {
195 	int               error;
196 	bus_dma_segment_t seg;
197 	int               nseg;
198 };
199 
200 
201 /*
202  * Maps a number of flits to the number of Tx descriptors that can hold them.
203  * The formula is
204  *
205  * desc = 1 + (flits - 2) / (WR_FLITS - 1).
206  *
207  * HW allows up to 4 descriptors to be combined into a WR.
208  */
209 static uint8_t flit_desc_map[] = {
210 	0,
211 #if SGE_NUM_GENBITS == 1
212 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
213 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
214 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
215 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
216 #elif SGE_NUM_GENBITS == 2
217 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
218 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
219 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
220 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
221 #else
222 # error "SGE_NUM_GENBITS must be 1 or 2"
223 #endif
224 };
225 
226 #define	TXQ_LOCK_ASSERT(qs)	mtx_assert(&(qs)->lock, MA_OWNED)
227 #define	TXQ_TRYLOCK(qs)		mtx_trylock(&(qs)->lock)
228 #define	TXQ_LOCK(qs)		mtx_lock(&(qs)->lock)
229 #define	TXQ_UNLOCK(qs)		mtx_unlock(&(qs)->lock)
230 #define	TXQ_RING_EMPTY(qs)	drbr_empty((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
231 #define	TXQ_RING_FLUSH(qs)	drbr_flush((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
232 #define	TXQ_RING_DEQUEUE_COND(qs, func, arg)				\
233 	drbr_dequeue_cond((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr, func, arg)
234 #define	TXQ_RING_DEQUEUE(qs) \
235 	drbr_dequeue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
236 
237 int cxgb_debug = 0;
238 
239 static void sge_timer_cb(void *arg);
240 static void sge_timer_reclaim(void *arg, int ncount);
241 static void sge_txq_reclaim_handler(void *arg, int ncount);
242 static void cxgb_start_locked(struct sge_qset *qs);
243 
244 /*
245  * XXX need to cope with bursty scheduling by looking at a wider
246  * window than we are now for determining the need for coalescing
247  *
248  */
249 static __inline uint64_t
250 check_pkt_coalesce(struct sge_qset *qs)
251 {
252         struct adapter *sc;
253         struct sge_txq *txq;
254 	uint8_t *fill;
255 
256 	if (__predict_false(cxgb_tx_coalesce_force))
257 		return (1);
258 	txq = &qs->txq[TXQ_ETH];
259         sc = qs->port->adapter;
260 	fill = &sc->tunq_fill[qs->idx];
261 
262 	if (cxgb_tx_coalesce_enable_start > COALESCE_START_MAX)
263 		cxgb_tx_coalesce_enable_start = COALESCE_START_MAX;
264 	if (cxgb_tx_coalesce_enable_stop < COALESCE_STOP_MIN)
265 		cxgb_tx_coalesce_enable_start = COALESCE_STOP_MIN;
266 	/*
267 	 * if the hardware transmit queue is more than 1/8 full
268 	 * we mark it as coalescing - we drop back from coalescing
269 	 * when we go below 1/32 full and there are no packets enqueued,
270 	 * this provides us with some degree of hysteresis
271 	 */
272         if (*fill != 0 && (txq->in_use <= cxgb_tx_coalesce_enable_stop) &&
273 	    TXQ_RING_EMPTY(qs) && (qs->coalescing == 0))
274                 *fill = 0;
275         else if (*fill == 0 && (txq->in_use >= cxgb_tx_coalesce_enable_start))
276                 *fill = 1;
277 
278 	return (sc->tunq_coalesce);
279 }
280 
281 #ifdef __LP64__
282 static void
283 set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo)
284 {
285 	uint64_t wr_hilo;
286 #if _BYTE_ORDER == _LITTLE_ENDIAN
287 	wr_hilo = wr_hi;
288 	wr_hilo |= (((uint64_t)wr_lo)<<32);
289 #else
290 	wr_hilo = wr_lo;
291 	wr_hilo |= (((uint64_t)wr_hi)<<32);
292 #endif
293 	wrp->wrh_hilo = wr_hilo;
294 }
295 #else
296 static void
297 set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo)
298 {
299 
300 	wrp->wrh_hi = wr_hi;
301 	wmb();
302 	wrp->wrh_lo = wr_lo;
303 }
304 #endif
305 
306 struct coalesce_info {
307 	int count;
308 	int nbytes;
309 };
310 
311 static int
312 coalesce_check(struct mbuf *m, void *arg)
313 {
314 	struct coalesce_info *ci = arg;
315 	int *count = &ci->count;
316 	int *nbytes = &ci->nbytes;
317 
318 	if ((*nbytes == 0) || ((*nbytes + m->m_len <= 10500) &&
319 		(*count < 7) && (m->m_next == NULL))) {
320 		*count += 1;
321 		*nbytes += m->m_len;
322 		return (1);
323 	}
324 	return (0);
325 }
326 
327 static struct mbuf *
328 cxgb_dequeue(struct sge_qset *qs)
329 {
330 	struct mbuf *m, *m_head, *m_tail;
331 	struct coalesce_info ci;
332 
333 
334 	if (check_pkt_coalesce(qs) == 0)
335 		return TXQ_RING_DEQUEUE(qs);
336 
337 	m_head = m_tail = NULL;
338 	ci.count = ci.nbytes = 0;
339 	do {
340 		m = TXQ_RING_DEQUEUE_COND(qs, coalesce_check, &ci);
341 		if (m_head == NULL) {
342 			m_tail = m_head = m;
343 		} else if (m != NULL) {
344 			m_tail->m_nextpkt = m;
345 			m_tail = m;
346 		}
347 	} while (m != NULL);
348 	if (ci.count > 7)
349 		panic("trying to coalesce %d packets in to one WR", ci.count);
350 	return (m_head);
351 }
352 
353 /**
354  *	reclaim_completed_tx - reclaims completed Tx descriptors
355  *	@adapter: the adapter
356  *	@q: the Tx queue to reclaim completed descriptors from
357  *
358  *	Reclaims Tx descriptors that the SGE has indicated it has processed,
359  *	and frees the associated buffers if possible.  Called with the Tx
360  *	queue's lock held.
361  */
362 static __inline int
363 reclaim_completed_tx(struct sge_qset *qs, int reclaim_min, int queue)
364 {
365 	struct sge_txq *q = &qs->txq[queue];
366 	int reclaim = desc_reclaimable(q);
367 
368 	if ((cxgb_tx_reclaim_threshold > TX_RECLAIM_MAX) ||
369 	    (cxgb_tx_reclaim_threshold < TX_RECLAIM_MIN))
370 		cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT;
371 
372 	if (reclaim < reclaim_min)
373 		return (0);
374 
375 	mtx_assert(&qs->lock, MA_OWNED);
376 	if (reclaim > 0) {
377 		t3_free_tx_desc(qs, reclaim, queue);
378 		q->cleaned += reclaim;
379 		q->in_use -= reclaim;
380 	}
381 	if (isset(&qs->txq_stopped, TXQ_ETH))
382                 clrbit(&qs->txq_stopped, TXQ_ETH);
383 
384 	return (reclaim);
385 }
386 
387 /**
388  *	should_restart_tx - are there enough resources to restart a Tx queue?
389  *	@q: the Tx queue
390  *
391  *	Checks if there are enough descriptors to restart a suspended Tx queue.
392  */
393 static __inline int
394 should_restart_tx(const struct sge_txq *q)
395 {
396 	unsigned int r = q->processed - q->cleaned;
397 
398 	return q->in_use - r < (q->size >> 1);
399 }
400 
401 /**
402  *	t3_sge_init - initialize SGE
403  *	@adap: the adapter
404  *	@p: the SGE parameters
405  *
406  *	Performs SGE initialization needed every time after a chip reset.
407  *	We do not initialize any of the queue sets here, instead the driver
408  *	top-level must request those individually.  We also do not enable DMA
409  *	here, that should be done after the queues have been set up.
410  */
411 void
412 t3_sge_init(adapter_t *adap, struct sge_params *p)
413 {
414 	u_int ctrl, ups;
415 
416 	ups = 0; /* = ffs(pci_resource_len(adap->pdev, 2) >> 12); */
417 
418 	ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL |
419 	       F_CQCRDTCTRL | F_CONGMODE | F_TNLFLMODE | F_FATLPERREN |
420 	       V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS |
421 	       V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING;
422 #if SGE_NUM_GENBITS == 1
423 	ctrl |= F_EGRGENCTRL;
424 #endif
425 	if (adap->params.rev > 0) {
426 		if (!(adap->flags & (USING_MSIX | USING_MSI)))
427 			ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ;
428 	}
429 	t3_write_reg(adap, A_SG_CONTROL, ctrl);
430 	t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) |
431 		     V_LORCQDRBTHRSH(512));
432 	t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10);
433 	t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) |
434 		     V_TIMEOUT(200 * core_ticks_per_usec(adap)));
435 	t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH,
436 		     adap->params.rev < T3_REV_C ? 1000 : 500);
437 	t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256);
438 	t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000);
439 	t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256);
440 	t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff));
441 	t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024);
442 }
443 
444 
445 /**
446  *	sgl_len - calculates the size of an SGL of the given capacity
447  *	@n: the number of SGL entries
448  *
449  *	Calculates the number of flits needed for a scatter/gather list that
450  *	can hold the given number of entries.
451  */
452 static __inline unsigned int
453 sgl_len(unsigned int n)
454 {
455 	return ((3 * n) / 2 + (n & 1));
456 }
457 
458 /**
459  *	get_imm_packet - return the next ingress packet buffer from a response
460  *	@resp: the response descriptor containing the packet data
461  *
462  *	Return a packet containing the immediate data of the given response.
463  */
464 static int
465 get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct mbuf *m)
466 {
467 
468 	m->m_len = m->m_pkthdr.len = IMMED_PKT_SIZE;
469 	m->m_ext.ext_buf = NULL;
470 	m->m_ext.ext_type = 0;
471 	memcpy(mtod(m, uint8_t *), resp->imm_data, IMMED_PKT_SIZE);
472 	return (0);
473 }
474 
475 static __inline u_int
476 flits_to_desc(u_int n)
477 {
478 	return (flit_desc_map[n]);
479 }
480 
481 #define SGE_PARERR (F_CPPARITYERROR | F_OCPARITYERROR | F_RCPARITYERROR | \
482 		    F_IRPARITYERROR | V_ITPARITYERROR(M_ITPARITYERROR) | \
483 		    V_FLPARITYERROR(M_FLPARITYERROR) | F_LODRBPARITYERROR | \
484 		    F_HIDRBPARITYERROR | F_LORCQPARITYERROR | \
485 		    F_HIRCQPARITYERROR)
486 #define SGE_FRAMINGERR (F_UC_REQ_FRAMINGERROR | F_R_REQ_FRAMINGERROR)
487 #define SGE_FATALERR (SGE_PARERR | SGE_FRAMINGERR | F_RSPQCREDITOVERFOW | \
488 		      F_RSPQDISABLED)
489 
490 /**
491  *	t3_sge_err_intr_handler - SGE async event interrupt handler
492  *	@adapter: the adapter
493  *
494  *	Interrupt handler for SGE asynchronous (non-data) events.
495  */
496 void
497 t3_sge_err_intr_handler(adapter_t *adapter)
498 {
499 	unsigned int v, status;
500 
501 	status = t3_read_reg(adapter, A_SG_INT_CAUSE);
502 	if (status & SGE_PARERR)
503 		CH_ALERT(adapter, "SGE parity error (0x%x)\n",
504 			 status & SGE_PARERR);
505 	if (status & SGE_FRAMINGERR)
506 		CH_ALERT(adapter, "SGE framing error (0x%x)\n",
507 			 status & SGE_FRAMINGERR);
508 	if (status & F_RSPQCREDITOVERFOW)
509 		CH_ALERT(adapter, "SGE response queue credit overflow\n");
510 
511 	if (status & F_RSPQDISABLED) {
512 		v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS);
513 
514 		CH_ALERT(adapter,
515 			 "packet delivered to disabled response queue (0x%x)\n",
516 			 (v >> S_RSPQ0DISABLED) & 0xff);
517 	}
518 
519 	t3_write_reg(adapter, A_SG_INT_CAUSE, status);
520 	if (status & SGE_FATALERR)
521 		t3_fatal_err(adapter);
522 }
523 
524 void
525 t3_sge_prep(adapter_t *adap, struct sge_params *p)
526 {
527 	int i, nqsets;
528 
529 	nqsets = min(SGE_QSETS, mp_ncpus*4);
530 
531 	fl_q_size = min(nmbclusters/(3*nqsets), FL_Q_SIZE);
532 
533 	while (!powerof2(fl_q_size))
534 		fl_q_size--;
535 #if __FreeBSD_version >= 700111
536 	if (cxgb_use_16k_clusters)
537 		jumbo_q_size = min(nmbjumbo16/(3*nqsets), JUMBO_Q_SIZE);
538 	else
539 		jumbo_q_size = min(nmbjumbo9/(3*nqsets), JUMBO_Q_SIZE);
540 #else
541 	jumbo_q_size = min(nmbjumbo4/(3*nqsets), JUMBO_Q_SIZE);
542 #endif
543 	while (!powerof2(jumbo_q_size))
544 		jumbo_q_size--;
545 
546 	/* XXX Does ETHER_ALIGN need to be accounted for here? */
547 	p->max_pkt_size = adap->sge.qs[0].fl[1].buf_size - sizeof(struct cpl_rx_data);
548 
549 	for (i = 0; i < SGE_QSETS; ++i) {
550 		struct qset_params *q = p->qset + i;
551 
552 		if (adap->params.nports > 2) {
553 			q->coalesce_usecs = 50;
554 		} else {
555 #ifdef INVARIANTS
556 			q->coalesce_usecs = 10;
557 #else
558 			q->coalesce_usecs = 5;
559 #endif
560 		}
561 		q->polling = 0;
562 		q->rspq_size = RSPQ_Q_SIZE;
563 		q->fl_size = fl_q_size;
564 		q->jumbo_size = jumbo_q_size;
565 		q->txq_size[TXQ_ETH] = TX_ETH_Q_SIZE;
566 		q->txq_size[TXQ_OFLD] = 1024;
567 		q->txq_size[TXQ_CTRL] = 256;
568 		q->cong_thres = 0;
569 	}
570 }
571 
572 int
573 t3_sge_alloc(adapter_t *sc)
574 {
575 
576 	/* The parent tag. */
577 	if (bus_dma_tag_create( NULL,			/* parent */
578 				1, 0,			/* algnmnt, boundary */
579 				BUS_SPACE_MAXADDR,	/* lowaddr */
580 				BUS_SPACE_MAXADDR,	/* highaddr */
581 				NULL, NULL,		/* filter, filterarg */
582 				BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
583 				BUS_SPACE_UNRESTRICTED, /* nsegments */
584 				BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
585 				0,			/* flags */
586 				NULL, NULL,		/* lock, lockarg */
587 				&sc->parent_dmat)) {
588 		device_printf(sc->dev, "Cannot allocate parent DMA tag\n");
589 		return (ENOMEM);
590 	}
591 
592 	/*
593 	 * DMA tag for normal sized RX frames
594 	 */
595 	if (bus_dma_tag_create(sc->parent_dmat, MCLBYTES, 0, BUS_SPACE_MAXADDR,
596 		BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, 1,
597 		MCLBYTES, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_dmat)) {
598 		device_printf(sc->dev, "Cannot allocate RX DMA tag\n");
599 		return (ENOMEM);
600 	}
601 
602 	/*
603 	 * DMA tag for jumbo sized RX frames.
604 	 */
605 	if (bus_dma_tag_create(sc->parent_dmat, MJUM16BYTES, 0, BUS_SPACE_MAXADDR,
606 		BUS_SPACE_MAXADDR, NULL, NULL, MJUM16BYTES, 1, MJUM16BYTES,
607 		BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_jumbo_dmat)) {
608 		device_printf(sc->dev, "Cannot allocate RX jumbo DMA tag\n");
609 		return (ENOMEM);
610 	}
611 
612 	/*
613 	 * DMA tag for TX frames.
614 	 */
615 	if (bus_dma_tag_create(sc->parent_dmat, 1, 0, BUS_SPACE_MAXADDR,
616 		BUS_SPACE_MAXADDR, NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
617 		TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
618 		NULL, NULL, &sc->tx_dmat)) {
619 		device_printf(sc->dev, "Cannot allocate TX DMA tag\n");
620 		return (ENOMEM);
621 	}
622 
623 	return (0);
624 }
625 
626 int
627 t3_sge_free(struct adapter * sc)
628 {
629 
630 	if (sc->tx_dmat != NULL)
631 		bus_dma_tag_destroy(sc->tx_dmat);
632 
633 	if (sc->rx_jumbo_dmat != NULL)
634 		bus_dma_tag_destroy(sc->rx_jumbo_dmat);
635 
636 	if (sc->rx_dmat != NULL)
637 		bus_dma_tag_destroy(sc->rx_dmat);
638 
639 	if (sc->parent_dmat != NULL)
640 		bus_dma_tag_destroy(sc->parent_dmat);
641 
642 	return (0);
643 }
644 
645 void
646 t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p)
647 {
648 
649 	qs->rspq.holdoff_tmr = max(p->coalesce_usecs * 10, 1U);
650 	qs->rspq.polling = 0 /* p->polling */;
651 }
652 
653 #if !defined(__i386__) && !defined(__amd64__)
654 static void
655 refill_fl_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
656 {
657 	struct refill_fl_cb_arg *cb_arg = arg;
658 
659 	cb_arg->error = error;
660 	cb_arg->seg = segs[0];
661 	cb_arg->nseg = nseg;
662 
663 }
664 #endif
665 /**
666  *	refill_fl - refill an SGE free-buffer list
667  *	@sc: the controller softc
668  *	@q: the free-list to refill
669  *	@n: the number of new buffers to allocate
670  *
671  *	(Re)populate an SGE free-buffer list with up to @n new packet buffers.
672  *	The caller must assure that @n does not exceed the queue's capacity.
673  */
674 static void
675 refill_fl(adapter_t *sc, struct sge_fl *q, int n)
676 {
677 	struct rx_sw_desc *sd = &q->sdesc[q->pidx];
678 	struct rx_desc *d = &q->desc[q->pidx];
679 	struct refill_fl_cb_arg cb_arg;
680 	struct mbuf *m;
681 	caddr_t cl;
682 	int err, count = 0;
683 
684 	cb_arg.error = 0;
685 	while (n--) {
686 		/*
687 		 * We only allocate a cluster, mbuf allocation happens after rx
688 		 */
689 		if (q->zone == zone_pack) {
690 			if ((m = m_getcl(M_NOWAIT, MT_NOINIT, M_PKTHDR)) == NULL)
691 				break;
692 			cl = m->m_ext.ext_buf;
693 		} else {
694 			if ((cl = m_cljget(NULL, M_NOWAIT, q->buf_size)) == NULL)
695 				break;
696 			if ((m = m_gethdr(M_NOWAIT, MT_NOINIT)) == NULL) {
697 				uma_zfree(q->zone, cl);
698 				break;
699 			}
700 		}
701 		if ((sd->flags & RX_SW_DESC_MAP_CREATED) == 0) {
702 			if ((err = bus_dmamap_create(q->entry_tag, 0, &sd->map))) {
703 				log(LOG_WARNING, "bus_dmamap_create failed %d\n", err);
704 				uma_zfree(q->zone, cl);
705 				goto done;
706 			}
707 			sd->flags |= RX_SW_DESC_MAP_CREATED;
708 		}
709 #if !defined(__i386__) && !defined(__amd64__)
710 		err = bus_dmamap_load(q->entry_tag, sd->map,
711 		    cl, q->buf_size, refill_fl_cb, &cb_arg, 0);
712 
713 		if (err != 0 || cb_arg.error) {
714 			if (q->zone == zone_pack)
715 				uma_zfree(q->zone, cl);
716 			m_free(m);
717 			goto done;
718 		}
719 #else
720 		cb_arg.seg.ds_addr = pmap_kextract((vm_offset_t)cl);
721 #endif
722 		sd->flags |= RX_SW_DESC_INUSE;
723 		sd->rxsd_cl = cl;
724 		sd->m = m;
725 		d->addr_lo = htobe32(cb_arg.seg.ds_addr & 0xffffffff);
726 		d->addr_hi = htobe32(((uint64_t)cb_arg.seg.ds_addr >>32) & 0xffffffff);
727 		d->len_gen = htobe32(V_FLD_GEN1(q->gen));
728 		d->gen2 = htobe32(V_FLD_GEN2(q->gen));
729 
730 		d++;
731 		sd++;
732 
733 		if (++q->pidx == q->size) {
734 			q->pidx = 0;
735 			q->gen ^= 1;
736 			sd = q->sdesc;
737 			d = q->desc;
738 		}
739 		q->credits++;
740 		count++;
741 	}
742 
743 done:
744 	if (count)
745 		t3_write_reg(sc, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
746 }
747 
748 
749 /**
750  *	free_rx_bufs - free the Rx buffers on an SGE free list
751  *	@sc: the controle softc
752  *	@q: the SGE free list to clean up
753  *
754  *	Release the buffers on an SGE free-buffer Rx queue.  HW fetching from
755  *	this queue should be stopped before calling this function.
756  */
757 static void
758 free_rx_bufs(adapter_t *sc, struct sge_fl *q)
759 {
760 	u_int cidx = q->cidx;
761 
762 	while (q->credits--) {
763 		struct rx_sw_desc *d = &q->sdesc[cidx];
764 
765 		if (d->flags & RX_SW_DESC_INUSE) {
766 			bus_dmamap_unload(q->entry_tag, d->map);
767 			bus_dmamap_destroy(q->entry_tag, d->map);
768 			if (q->zone == zone_pack) {
769 				m_init(d->m, zone_pack, MCLBYTES,
770 				    M_NOWAIT, MT_DATA, M_EXT);
771 				uma_zfree(zone_pack, d->m);
772 			} else {
773 				m_init(d->m, zone_mbuf, MLEN,
774 				    M_NOWAIT, MT_DATA, 0);
775 				uma_zfree(zone_mbuf, d->m);
776 				uma_zfree(q->zone, d->rxsd_cl);
777 			}
778 		}
779 
780 		d->rxsd_cl = NULL;
781 		d->m = NULL;
782 		if (++cidx == q->size)
783 			cidx = 0;
784 	}
785 }
786 
787 static __inline void
788 __refill_fl(adapter_t *adap, struct sge_fl *fl)
789 {
790 	refill_fl(adap, fl, min(16U, fl->size - fl->credits));
791 }
792 
793 static __inline void
794 __refill_fl_lt(adapter_t *adap, struct sge_fl *fl, int max)
795 {
796 	if ((fl->size - fl->credits) < max)
797 		refill_fl(adap, fl, min(max, fl->size - fl->credits));
798 }
799 
800 /**
801  *	recycle_rx_buf - recycle a receive buffer
802  *	@adapter: the adapter
803  *	@q: the SGE free list
804  *	@idx: index of buffer to recycle
805  *
806  *	Recycles the specified buffer on the given free list by adding it at
807  *	the next available slot on the list.
808  */
809 static void
810 recycle_rx_buf(adapter_t *adap, struct sge_fl *q, unsigned int idx)
811 {
812 	struct rx_desc *from = &q->desc[idx];
813 	struct rx_desc *to   = &q->desc[q->pidx];
814 
815 	q->sdesc[q->pidx] = q->sdesc[idx];
816 	to->addr_lo = from->addr_lo;        // already big endian
817 	to->addr_hi = from->addr_hi;        // likewise
818 	wmb();	/* necessary ? */
819 	to->len_gen = htobe32(V_FLD_GEN1(q->gen));
820 	to->gen2 = htobe32(V_FLD_GEN2(q->gen));
821 	q->credits++;
822 
823 	if (++q->pidx == q->size) {
824 		q->pidx = 0;
825 		q->gen ^= 1;
826 	}
827 	t3_write_reg(adap, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
828 }
829 
830 static void
831 alloc_ring_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
832 {
833 	uint32_t *addr;
834 
835 	addr = arg;
836 	*addr = segs[0].ds_addr;
837 }
838 
839 static int
840 alloc_ring(adapter_t *sc, size_t nelem, size_t elem_size, size_t sw_size,
841     bus_addr_t *phys, void *desc, void *sdesc, bus_dma_tag_t *tag,
842     bus_dmamap_t *map, bus_dma_tag_t parent_entry_tag, bus_dma_tag_t *entry_tag)
843 {
844 	size_t len = nelem * elem_size;
845 	void *s = NULL;
846 	void *p = NULL;
847 	int err;
848 
849 	if ((err = bus_dma_tag_create(sc->parent_dmat, PAGE_SIZE, 0,
850 				      BUS_SPACE_MAXADDR_32BIT,
851 				      BUS_SPACE_MAXADDR, NULL, NULL, len, 1,
852 				      len, 0, NULL, NULL, tag)) != 0) {
853 		device_printf(sc->dev, "Cannot allocate descriptor tag\n");
854 		return (ENOMEM);
855 	}
856 
857 	if ((err = bus_dmamem_alloc(*tag, (void **)&p, BUS_DMA_NOWAIT,
858 				    map)) != 0) {
859 		device_printf(sc->dev, "Cannot allocate descriptor memory\n");
860 		return (ENOMEM);
861 	}
862 
863 	bus_dmamap_load(*tag, *map, p, len, alloc_ring_cb, phys, 0);
864 	bzero(p, len);
865 	*(void **)desc = p;
866 
867 	if (sw_size) {
868 		len = nelem * sw_size;
869 		s = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO);
870 		*(void **)sdesc = s;
871 	}
872 	if (parent_entry_tag == NULL)
873 		return (0);
874 
875 	if ((err = bus_dma_tag_create(parent_entry_tag, 1, 0,
876 				      BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR,
877 		                      NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
878 				      TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
879 		                      NULL, NULL, entry_tag)) != 0) {
880 		device_printf(sc->dev, "Cannot allocate descriptor entry tag\n");
881 		return (ENOMEM);
882 	}
883 	return (0);
884 }
885 
886 static void
887 sge_slow_intr_handler(void *arg, int ncount)
888 {
889 	adapter_t *sc = arg;
890 
891 	t3_slow_intr_handler(sc);
892 }
893 
894 /**
895  *	sge_timer_cb - perform periodic maintenance of an SGE qset
896  *	@data: the SGE queue set to maintain
897  *
898  *	Runs periodically from a timer to perform maintenance of an SGE queue
899  *	set.  It performs two tasks:
900  *
901  *	a) Cleans up any completed Tx descriptors that may still be pending.
902  *	Normal descriptor cleanup happens when new packets are added to a Tx
903  *	queue so this timer is relatively infrequent and does any cleanup only
904  *	if the Tx queue has not seen any new packets in a while.  We make a
905  *	best effort attempt to reclaim descriptors, in that we don't wait
906  *	around if we cannot get a queue's lock (which most likely is because
907  *	someone else is queueing new packets and so will also handle the clean
908  *	up).  Since control queues use immediate data exclusively we don't
909  *	bother cleaning them up here.
910  *
911  *	b) Replenishes Rx queues that have run out due to memory shortage.
912  *	Normally new Rx buffers are added when existing ones are consumed but
913  *	when out of memory a queue can become empty.  We try to add only a few
914  *	buffers here, the queue will be replenished fully as these new buffers
915  *	are used up if memory shortage has subsided.
916  *
917  *	c) Return coalesced response queue credits in case a response queue is
918  *	starved.
919  *
920  *	d) Ring doorbells for T304 tunnel queues since we have seen doorbell
921  *	fifo overflows and the FW doesn't implement any recovery scheme yet.
922  */
923 static void
924 sge_timer_cb(void *arg)
925 {
926 	adapter_t *sc = arg;
927 	if ((sc->flags & USING_MSIX) == 0) {
928 
929 		struct port_info *pi;
930 		struct sge_qset *qs;
931 		struct sge_txq  *txq;
932 		int i, j;
933 		int reclaim_ofl, refill_rx;
934 
935 		if (sc->open_device_map == 0)
936 			return;
937 
938 		for (i = 0; i < sc->params.nports; i++) {
939 			pi = &sc->port[i];
940 			for (j = 0; j < pi->nqsets; j++) {
941 				qs = &sc->sge.qs[pi->first_qset + j];
942 				txq = &qs->txq[0];
943 				reclaim_ofl = txq[TXQ_OFLD].processed - txq[TXQ_OFLD].cleaned;
944 				refill_rx = ((qs->fl[0].credits < qs->fl[0].size) ||
945 				    (qs->fl[1].credits < qs->fl[1].size));
946 				if (reclaim_ofl || refill_rx) {
947 					taskqueue_enqueue(sc->tq, &pi->timer_reclaim_task);
948 					break;
949 				}
950 			}
951 		}
952 	}
953 
954 	if (sc->params.nports > 2) {
955 		int i;
956 
957 		for_each_port(sc, i) {
958 			struct port_info *pi = &sc->port[i];
959 
960 			t3_write_reg(sc, A_SG_KDOORBELL,
961 				     F_SELEGRCNTX |
962 				     (FW_TUNNEL_SGEEC_START + pi->first_qset));
963 		}
964 	}
965 	if (((sc->flags & USING_MSIX) == 0 || sc->params.nports > 2) &&
966 	    sc->open_device_map != 0)
967 		callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
968 }
969 
970 /*
971  * This is meant to be a catch-all function to keep sge state private
972  * to sge.c
973  *
974  */
975 int
976 t3_sge_init_adapter(adapter_t *sc)
977 {
978 	callout_init(&sc->sge_timer_ch, CALLOUT_MPSAFE);
979 	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
980 	TASK_INIT(&sc->slow_intr_task, 0, sge_slow_intr_handler, sc);
981 	return (0);
982 }
983 
984 int
985 t3_sge_reset_adapter(adapter_t *sc)
986 {
987 	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
988 	return (0);
989 }
990 
991 int
992 t3_sge_init_port(struct port_info *pi)
993 {
994 	TASK_INIT(&pi->timer_reclaim_task, 0, sge_timer_reclaim, pi);
995 	return (0);
996 }
997 
998 /**
999  *	refill_rspq - replenish an SGE response queue
1000  *	@adapter: the adapter
1001  *	@q: the response queue to replenish
1002  *	@credits: how many new responses to make available
1003  *
1004  *	Replenishes a response queue by making the supplied number of responses
1005  *	available to HW.
1006  */
1007 static __inline void
1008 refill_rspq(adapter_t *sc, const struct sge_rspq *q, u_int credits)
1009 {
1010 
1011 	/* mbufs are allocated on demand when a rspq entry is processed. */
1012 	t3_write_reg(sc, A_SG_RSPQ_CREDIT_RETURN,
1013 		     V_RSPQ(q->cntxt_id) | V_CREDITS(credits));
1014 }
1015 
1016 static void
1017 sge_txq_reclaim_handler(void *arg, int ncount)
1018 {
1019 	struct sge_qset *qs = arg;
1020 	int i;
1021 
1022 	for (i = 0; i < 3; i++)
1023 		reclaim_completed_tx(qs, 16, i);
1024 }
1025 
1026 static void
1027 sge_timer_reclaim(void *arg, int ncount)
1028 {
1029 	struct port_info *pi = arg;
1030 	int i, nqsets = pi->nqsets;
1031 	adapter_t *sc = pi->adapter;
1032 	struct sge_qset *qs;
1033 	struct mtx *lock;
1034 
1035 	KASSERT((sc->flags & USING_MSIX) == 0,
1036 	    ("can't call timer reclaim for msi-x"));
1037 
1038 	for (i = 0; i < nqsets; i++) {
1039 		qs = &sc->sge.qs[pi->first_qset + i];
1040 
1041 		reclaim_completed_tx(qs, 16, TXQ_OFLD);
1042 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
1043 			    &sc->sge.qs[0].rspq.lock;
1044 
1045 		if (mtx_trylock(lock)) {
1046 			/* XXX currently assume that we are *NOT* polling */
1047 			uint32_t status = t3_read_reg(sc, A_SG_RSPQ_FL_STATUS);
1048 
1049 			if (qs->fl[0].credits < qs->fl[0].size - 16)
1050 				__refill_fl(sc, &qs->fl[0]);
1051 			if (qs->fl[1].credits < qs->fl[1].size - 16)
1052 				__refill_fl(sc, &qs->fl[1]);
1053 
1054 			if (status & (1 << qs->rspq.cntxt_id)) {
1055 				if (qs->rspq.credits) {
1056 					refill_rspq(sc, &qs->rspq, 1);
1057 					qs->rspq.credits--;
1058 					t3_write_reg(sc, A_SG_RSPQ_FL_STATUS,
1059 					    1 << qs->rspq.cntxt_id);
1060 				}
1061 			}
1062 			mtx_unlock(lock);
1063 		}
1064 	}
1065 }
1066 
1067 /**
1068  *	init_qset_cntxt - initialize an SGE queue set context info
1069  *	@qs: the queue set
1070  *	@id: the queue set id
1071  *
1072  *	Initializes the TIDs and context ids for the queues of a queue set.
1073  */
1074 static void
1075 init_qset_cntxt(struct sge_qset *qs, u_int id)
1076 {
1077 
1078 	qs->rspq.cntxt_id = id;
1079 	qs->fl[0].cntxt_id = 2 * id;
1080 	qs->fl[1].cntxt_id = 2 * id + 1;
1081 	qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id;
1082 	qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id;
1083 	qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id;
1084 	qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id;
1085 	qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id;
1086 
1087 	mbufq_init(&qs->txq[TXQ_ETH].sendq);
1088 	mbufq_init(&qs->txq[TXQ_OFLD].sendq);
1089 	mbufq_init(&qs->txq[TXQ_CTRL].sendq);
1090 }
1091 
1092 
1093 static void
1094 txq_prod(struct sge_txq *txq, unsigned int ndesc, struct txq_state *txqs)
1095 {
1096 	txq->in_use += ndesc;
1097 	/*
1098 	 * XXX we don't handle stopping of queue
1099 	 * presumably start handles this when we bump against the end
1100 	 */
1101 	txqs->gen = txq->gen;
1102 	txq->unacked += ndesc;
1103 	txqs->compl = (txq->unacked & 32) << (S_WR_COMPL - 5);
1104 	txq->unacked &= 31;
1105 	txqs->pidx = txq->pidx;
1106 	txq->pidx += ndesc;
1107 #ifdef INVARIANTS
1108 	if (((txqs->pidx > txq->cidx) &&
1109 		(txq->pidx < txqs->pidx) &&
1110 		(txq->pidx >= txq->cidx)) ||
1111 	    ((txqs->pidx < txq->cidx) &&
1112 		(txq->pidx >= txq-> cidx)) ||
1113 	    ((txqs->pidx < txq->cidx) &&
1114 		(txq->cidx < txqs->pidx)))
1115 		panic("txqs->pidx=%d txq->pidx=%d txq->cidx=%d",
1116 		    txqs->pidx, txq->pidx, txq->cidx);
1117 #endif
1118 	if (txq->pidx >= txq->size) {
1119 		txq->pidx -= txq->size;
1120 		txq->gen ^= 1;
1121 	}
1122 
1123 }
1124 
1125 /**
1126  *	calc_tx_descs - calculate the number of Tx descriptors for a packet
1127  *	@m: the packet mbufs
1128  *      @nsegs: the number of segments
1129  *
1130  * 	Returns the number of Tx descriptors needed for the given Ethernet
1131  * 	packet.  Ethernet packets require addition of WR and CPL headers.
1132  */
1133 static __inline unsigned int
1134 calc_tx_descs(const struct mbuf *m, int nsegs)
1135 {
1136 	unsigned int flits;
1137 
1138 	if (m->m_pkthdr.len <= PIO_LEN)
1139 		return 1;
1140 
1141 	flits = sgl_len(nsegs) + 2;
1142 #ifdef TSO_SUPPORTED
1143 	if (m->m_pkthdr.csum_flags & CSUM_TSO)
1144 		flits++;
1145 #endif
1146 	return flits_to_desc(flits);
1147 }
1148 
1149 static unsigned int
1150 busdma_map_mbufs(struct mbuf **m, struct sge_txq *txq,
1151     struct tx_sw_desc *txsd, bus_dma_segment_t *segs, int *nsegs)
1152 {
1153 	struct mbuf *m0;
1154 	int err, pktlen, pass = 0;
1155 	bus_dma_tag_t tag = txq->entry_tag;
1156 
1157 retry:
1158 	err = 0;
1159 	m0 = *m;
1160 	pktlen = m0->m_pkthdr.len;
1161 #if defined(__i386__) || defined(__amd64__)
1162 	if (busdma_map_sg_collapse(tag, txsd->map, m, segs, nsegs) == 0) {
1163 		goto done;
1164 	} else
1165 #endif
1166 		err = bus_dmamap_load_mbuf_sg(tag, txsd->map, m0, segs, nsegs, 0);
1167 
1168 	if (err == 0) {
1169 		goto done;
1170 	}
1171 	if (err == EFBIG && pass == 0) {
1172 		pass = 1;
1173 		/* Too many segments, try to defrag */
1174 		m0 = m_defrag(m0, M_DONTWAIT);
1175 		if (m0 == NULL) {
1176 			m_freem(*m);
1177 			*m = NULL;
1178 			return (ENOBUFS);
1179 		}
1180 		*m = m0;
1181 		goto retry;
1182 	} else if (err == ENOMEM) {
1183 		return (err);
1184 	} if (err) {
1185 		if (cxgb_debug)
1186 			printf("map failure err=%d pktlen=%d\n", err, pktlen);
1187 		m_freem(m0);
1188 		*m = NULL;
1189 		return (err);
1190 	}
1191 done:
1192 #if !defined(__i386__) && !defined(__amd64__)
1193 	bus_dmamap_sync(tag, txsd->map, BUS_DMASYNC_PREWRITE);
1194 #endif
1195 	txsd->flags |= TX_SW_DESC_MAPPED;
1196 
1197 	return (0);
1198 }
1199 
1200 /**
1201  *	make_sgl - populate a scatter/gather list for a packet
1202  *	@sgp: the SGL to populate
1203  *	@segs: the packet dma segments
1204  *	@nsegs: the number of segments
1205  *
1206  *	Generates a scatter/gather list for the buffers that make up a packet
1207  *	and returns the SGL size in 8-byte words.  The caller must size the SGL
1208  *	appropriately.
1209  */
1210 static __inline void
1211 make_sgl(struct sg_ent *sgp, bus_dma_segment_t *segs, int nsegs)
1212 {
1213 	int i, idx;
1214 
1215 	for (idx = 0, i = 0; i < nsegs; i++) {
1216 		/*
1217 		 * firmware doesn't like empty segments
1218 		 */
1219 		if (segs[i].ds_len == 0)
1220 			continue;
1221 		if (i && idx == 0)
1222 			++sgp;
1223 
1224 		sgp->len[idx] = htobe32(segs[i].ds_len);
1225 		sgp->addr[idx] = htobe64(segs[i].ds_addr);
1226 		idx ^= 1;
1227 	}
1228 
1229 	if (idx) {
1230 		sgp->len[idx] = 0;
1231 		sgp->addr[idx] = 0;
1232 	}
1233 }
1234 
1235 /**
1236  *	check_ring_tx_db - check and potentially ring a Tx queue's doorbell
1237  *	@adap: the adapter
1238  *	@q: the Tx queue
1239  *
1240  *	Ring the doorbell if a Tx queue is asleep.  There is a natural race,
1241  *	where the HW is going to sleep just after we checked, however,
1242  *	then the interrupt handler will detect the outstanding TX packet
1243  *	and ring the doorbell for us.
1244  *
1245  *	When GTS is disabled we unconditionally ring the doorbell.
1246  */
1247 static __inline void
1248 check_ring_tx_db(adapter_t *adap, struct sge_txq *q)
1249 {
1250 #if USE_GTS
1251 	clear_bit(TXQ_LAST_PKT_DB, &q->flags);
1252 	if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) {
1253 		set_bit(TXQ_LAST_PKT_DB, &q->flags);
1254 #ifdef T3_TRACE
1255 		T3_TRACE1(adap->tb[q->cntxt_id & 7], "doorbell Tx, cntxt %d",
1256 			  q->cntxt_id);
1257 #endif
1258 		t3_write_reg(adap, A_SG_KDOORBELL,
1259 			     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1260 	}
1261 #else
1262 	wmb();            /* write descriptors before telling HW */
1263 	t3_write_reg(adap, A_SG_KDOORBELL,
1264 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1265 #endif
1266 }
1267 
1268 static __inline void
1269 wr_gen2(struct tx_desc *d, unsigned int gen)
1270 {
1271 #if SGE_NUM_GENBITS == 2
1272 	d->flit[TX_DESC_FLITS - 1] = htobe64(gen);
1273 #endif
1274 }
1275 
1276 /**
1277  *	write_wr_hdr_sgl - write a WR header and, optionally, SGL
1278  *	@ndesc: number of Tx descriptors spanned by the SGL
1279  *	@txd: first Tx descriptor to be written
1280  *	@txqs: txq state (generation and producer index)
1281  *	@txq: the SGE Tx queue
1282  *	@sgl: the SGL
1283  *	@flits: number of flits to the start of the SGL in the first descriptor
1284  *	@sgl_flits: the SGL size in flits
1285  *	@wr_hi: top 32 bits of WR header based on WR type (big endian)
1286  *	@wr_lo: low 32 bits of WR header based on WR type (big endian)
1287  *
1288  *	Write a work request header and an associated SGL.  If the SGL is
1289  *	small enough to fit into one Tx descriptor it has already been written
1290  *	and we just need to write the WR header.  Otherwise we distribute the
1291  *	SGL across the number of descriptors it spans.
1292  */
1293 static void
1294 write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs,
1295     const struct sge_txq *txq, const struct sg_ent *sgl, unsigned int flits,
1296     unsigned int sgl_flits, unsigned int wr_hi, unsigned int wr_lo)
1297 {
1298 
1299 	struct work_request_hdr *wrp = (struct work_request_hdr *)txd;
1300 	struct tx_sw_desc *txsd = &txq->sdesc[txqs->pidx];
1301 
1302 	if (__predict_true(ndesc == 1)) {
1303 		set_wr_hdr(wrp, htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1304 			V_WR_SGLSFLT(flits)) | wr_hi,
1305 		    htonl(V_WR_LEN(flits + sgl_flits) |
1306 			V_WR_GEN(txqs->gen)) | wr_lo);
1307 		/* XXX gen? */
1308 		wr_gen2(txd, txqs->gen);
1309 
1310 	} else {
1311 		unsigned int ogen = txqs->gen;
1312 		const uint64_t *fp = (const uint64_t *)sgl;
1313 		struct work_request_hdr *wp = wrp;
1314 
1315 		wrp->wrh_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) |
1316 		    V_WR_SGLSFLT(flits)) | wr_hi;
1317 
1318 		while (sgl_flits) {
1319 			unsigned int avail = WR_FLITS - flits;
1320 
1321 			if (avail > sgl_flits)
1322 				avail = sgl_flits;
1323 			memcpy(&txd->flit[flits], fp, avail * sizeof(*fp));
1324 			sgl_flits -= avail;
1325 			ndesc--;
1326 			if (!sgl_flits)
1327 				break;
1328 
1329 			fp += avail;
1330 			txd++;
1331 			txsd++;
1332 			if (++txqs->pidx == txq->size) {
1333 				txqs->pidx = 0;
1334 				txqs->gen ^= 1;
1335 				txd = txq->desc;
1336 				txsd = txq->sdesc;
1337 			}
1338 
1339 			/*
1340 			 * when the head of the mbuf chain
1341 			 * is freed all clusters will be freed
1342 			 * with it
1343 			 */
1344 			wrp = (struct work_request_hdr *)txd;
1345 			wrp->wrh_hi = htonl(V_WR_DATATYPE(1) |
1346 			    V_WR_SGLSFLT(1)) | wr_hi;
1347 			wrp->wrh_lo = htonl(V_WR_LEN(min(WR_FLITS,
1348 				    sgl_flits + 1)) |
1349 			    V_WR_GEN(txqs->gen)) | wr_lo;
1350 			wr_gen2(txd, txqs->gen);
1351 			flits = 1;
1352 		}
1353 		wrp->wrh_hi |= htonl(F_WR_EOP);
1354 		wmb();
1355 		wp->wrh_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo;
1356 		wr_gen2((struct tx_desc *)wp, ogen);
1357 	}
1358 }
1359 
1360 /* sizeof(*eh) + sizeof(*vhdr) + sizeof(*ip) + sizeof(*tcp) */
1361 #define TCPPKTHDRSIZE (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + 20 + 20)
1362 
1363 #ifdef VLAN_SUPPORTED
1364 #define GET_VTAG(cntrl, m) \
1365 do { \
1366 	if ((m)->m_flags & M_VLANTAG)					            \
1367 		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN((m)->m_pkthdr.ether_vtag); \
1368 } while (0)
1369 
1370 #else
1371 #define GET_VTAG(cntrl, m)
1372 #endif
1373 
1374 static int
1375 t3_encap(struct sge_qset *qs, struct mbuf **m)
1376 {
1377 	adapter_t *sc;
1378 	struct mbuf *m0;
1379 	struct sge_txq *txq;
1380 	struct txq_state txqs;
1381 	struct port_info *pi;
1382 	unsigned int ndesc, flits, cntrl, mlen;
1383 	int err, nsegs, tso_info = 0;
1384 
1385 	struct work_request_hdr *wrp;
1386 	struct tx_sw_desc *txsd;
1387 	struct sg_ent *sgp, *sgl;
1388 	uint32_t wr_hi, wr_lo, sgl_flits;
1389 	bus_dma_segment_t segs[TX_MAX_SEGS];
1390 
1391 	struct tx_desc *txd;
1392 
1393 	pi = qs->port;
1394 	sc = pi->adapter;
1395 	txq = &qs->txq[TXQ_ETH];
1396 	txd = &txq->desc[txq->pidx];
1397 	txsd = &txq->sdesc[txq->pidx];
1398 	sgl = txq->txq_sgl;
1399 
1400 	prefetch(txd);
1401 	m0 = *m;
1402 
1403 	DPRINTF("t3_encap port_id=%d qsidx=%d ", pi->port_id, pi->first_qset);
1404 	DPRINTF("mlen=%d txpkt_intf=%d tx_chan=%d\n", m[0]->m_pkthdr.len, pi->txpkt_intf, pi->tx_chan);
1405 
1406 	mtx_assert(&qs->lock, MA_OWNED);
1407 	cntrl = V_TXPKT_INTF(pi->txpkt_intf);
1408 	KASSERT(m0->m_flags & M_PKTHDR, ("not packet header\n"));
1409 
1410 #ifdef VLAN_SUPPORTED
1411 	if  (m0->m_nextpkt == NULL && m0->m_next != NULL &&
1412 	    m0->m_pkthdr.csum_flags & (CSUM_TSO))
1413 		tso_info = V_LSO_MSS(m0->m_pkthdr.tso_segsz);
1414 #endif
1415 	if (m0->m_nextpkt != NULL) {
1416 		busdma_map_sg_vec(txq->entry_tag, txsd->map, m0, segs, &nsegs);
1417 		ndesc = 1;
1418 		mlen = 0;
1419 	} else {
1420 		if ((err = busdma_map_sg_collapse(txq->entry_tag, txsd->map,
1421 		    &m0, segs, &nsegs))) {
1422 			if (cxgb_debug)
1423 				printf("failed ... err=%d\n", err);
1424 			return (err);
1425 		}
1426 		mlen = m0->m_pkthdr.len;
1427 		ndesc = calc_tx_descs(m0, nsegs);
1428 	}
1429 	txq_prod(txq, ndesc, &txqs);
1430 
1431 	KASSERT(m0->m_pkthdr.len, ("empty packet nsegs=%d", nsegs));
1432 	txsd->m = m0;
1433 
1434 	if (m0->m_nextpkt != NULL) {
1435 		struct cpl_tx_pkt_batch *cpl_batch = (struct cpl_tx_pkt_batch *)txd;
1436 		int i, fidx;
1437 
1438 		if (nsegs > 7)
1439 			panic("trying to coalesce %d packets in to one WR", nsegs);
1440 		txq->txq_coalesced += nsegs;
1441 		wrp = (struct work_request_hdr *)txd;
1442 		flits = nsegs*2 + 1;
1443 
1444 		for (fidx = 1, i = 0; i < nsegs; i++, fidx += 2) {
1445 			struct cpl_tx_pkt_batch_entry *cbe;
1446 			uint64_t flit;
1447 			uint32_t *hflit = (uint32_t *)&flit;
1448 			int cflags = m0->m_pkthdr.csum_flags;
1449 
1450 			cntrl = V_TXPKT_INTF(pi->txpkt_intf);
1451 			GET_VTAG(cntrl, m0);
1452 			cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1453 			if (__predict_false(!(cflags & CSUM_IP)))
1454 				cntrl |= F_TXPKT_IPCSUM_DIS;
1455 			if (__predict_false(!(cflags & (CSUM_TCP | CSUM_UDP))))
1456 				cntrl |= F_TXPKT_L4CSUM_DIS;
1457 
1458 			hflit[0] = htonl(cntrl);
1459 			hflit[1] = htonl(segs[i].ds_len | 0x80000000);
1460 			flit |= htobe64(1 << 24);
1461 			cbe = &cpl_batch->pkt_entry[i];
1462 			cbe->cntrl = hflit[0];
1463 			cbe->len = hflit[1];
1464 			cbe->addr = htobe64(segs[i].ds_addr);
1465 		}
1466 
1467 		wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1468 		    V_WR_SGLSFLT(flits)) |
1469 		    htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1470 		wr_lo = htonl(V_WR_LEN(flits) |
1471 		    V_WR_GEN(txqs.gen)) | htonl(V_WR_TID(txq->token));
1472 		set_wr_hdr(wrp, wr_hi, wr_lo);
1473 		wmb();
1474 		wr_gen2(txd, txqs.gen);
1475 		check_ring_tx_db(sc, txq);
1476 		return (0);
1477 	} else if (tso_info) {
1478 		int min_size = TCPPKTHDRSIZE, eth_type, tagged;
1479 		struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *)txd;
1480 		struct ip *ip;
1481 		struct tcphdr *tcp;
1482 		char *pkthdr;
1483 
1484 		txd->flit[2] = 0;
1485 		GET_VTAG(cntrl, m0);
1486 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO);
1487 		hdr->cntrl = htonl(cntrl);
1488 		hdr->len = htonl(mlen | 0x80000000);
1489 
1490 		DPRINTF("tso buf len=%d\n", mlen);
1491 
1492 		tagged = m0->m_flags & M_VLANTAG;
1493 		if (!tagged)
1494 			min_size -= ETHER_VLAN_ENCAP_LEN;
1495 
1496 		if (__predict_false(mlen < min_size)) {
1497 			printf("mbuf=%p,len=%d,tso_segsz=%d,csum_flags=%#x,flags=%#x",
1498 			    m0, mlen, m0->m_pkthdr.tso_segsz,
1499 			    m0->m_pkthdr.csum_flags, m0->m_flags);
1500 			panic("tx tso packet too small");
1501 		}
1502 
1503 		/* Make sure that ether, ip, tcp headers are all in m0 */
1504 		if (__predict_false(m0->m_len < min_size)) {
1505 			m0 = m_pullup(m0, min_size);
1506 			if (__predict_false(m0 == NULL)) {
1507 				/* XXX panic probably an overreaction */
1508 				panic("couldn't fit header into mbuf");
1509 			}
1510 		}
1511 		pkthdr = m0->m_data;
1512 
1513 		if (tagged) {
1514 			eth_type = CPL_ETH_II_VLAN;
1515 			ip = (struct ip *)(pkthdr + ETHER_HDR_LEN +
1516 			    ETHER_VLAN_ENCAP_LEN);
1517 		} else {
1518 			eth_type = CPL_ETH_II;
1519 			ip = (struct ip *)(pkthdr + ETHER_HDR_LEN);
1520 		}
1521 		tcp = (struct tcphdr *)((uint8_t *)ip +
1522 		    sizeof(*ip));
1523 
1524 		tso_info |= V_LSO_ETH_TYPE(eth_type) |
1525 			    V_LSO_IPHDR_WORDS(ip->ip_hl) |
1526 			    V_LSO_TCPHDR_WORDS(tcp->th_off);
1527 		hdr->lso_info = htonl(tso_info);
1528 
1529 		if (__predict_false(mlen <= PIO_LEN)) {
1530 			/* pkt not undersized but fits in PIO_LEN
1531 			 * Indicates a TSO bug at the higher levels.
1532 			 *
1533 			 */
1534 			DPRINTF("**5592 Fix** mbuf=%p,len=%d,tso_segsz=%d,csum_flags=%#x,flags=%#x",
1535 			    m0, mlen, m0->m_pkthdr.tso_segsz, m0->m_pkthdr.csum_flags, m0->m_flags);
1536 			txsd->m = NULL;
1537 			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[3]);
1538 			flits = (mlen + 7) / 8 + 3;
1539 			wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1540 					  V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1541 					  F_WR_SOP | F_WR_EOP | txqs.compl);
1542 			wr_lo = htonl(V_WR_LEN(flits) |
1543 			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1544 			set_wr_hdr(&hdr->wr, wr_hi, wr_lo);
1545 			wmb();
1546 			wr_gen2(txd, txqs.gen);
1547 			check_ring_tx_db(sc, txq);
1548 			return (0);
1549 		}
1550 		flits = 3;
1551 	} else {
1552 		struct cpl_tx_pkt *cpl = (struct cpl_tx_pkt *)txd;
1553 
1554 		GET_VTAG(cntrl, m0);
1555 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1556 		if (__predict_false(!(m0->m_pkthdr.csum_flags & CSUM_IP)))
1557 			cntrl |= F_TXPKT_IPCSUM_DIS;
1558 		if (__predict_false(!(m0->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP))))
1559 			cntrl |= F_TXPKT_L4CSUM_DIS;
1560 		cpl->cntrl = htonl(cntrl);
1561 		cpl->len = htonl(mlen | 0x80000000);
1562 
1563 		if (mlen <= PIO_LEN) {
1564 			txsd->m = NULL;
1565 			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[2]);
1566 			flits = (mlen + 7) / 8 + 2;
1567 
1568 			wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1569 			    V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1570 					  F_WR_SOP | F_WR_EOP | txqs.compl);
1571 			wr_lo = htonl(V_WR_LEN(flits) |
1572 			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1573 			set_wr_hdr(&cpl->wr, wr_hi, wr_lo);
1574 			wmb();
1575 			wr_gen2(txd, txqs.gen);
1576 			check_ring_tx_db(sc, txq);
1577 			return (0);
1578 		}
1579 		flits = 2;
1580 	}
1581 	wrp = (struct work_request_hdr *)txd;
1582 	sgp = (ndesc == 1) ? (struct sg_ent *)&txd->flit[flits] : sgl;
1583 	make_sgl(sgp, segs, nsegs);
1584 
1585 	sgl_flits = sgl_len(nsegs);
1586 
1587 	KASSERT(ndesc <= 4, ("ndesc too large %d", ndesc));
1588 	wr_hi = htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1589 	wr_lo = htonl(V_WR_TID(txq->token));
1590 	write_wr_hdr_sgl(ndesc, txd, &txqs, txq, sgl, flits,
1591 	    sgl_flits, wr_hi, wr_lo);
1592 	check_ring_tx_db(pi->adapter, txq);
1593 
1594 	return (0);
1595 }
1596 
1597 void
1598 cxgb_tx_watchdog(void *arg)
1599 {
1600 	struct sge_qset *qs = arg;
1601 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1602 
1603         if (qs->coalescing != 0 &&
1604 	    (txq->in_use <= cxgb_tx_coalesce_enable_stop) &&
1605 	    TXQ_RING_EMPTY(qs))
1606                 qs->coalescing = 0;
1607         else if (qs->coalescing == 0 &&
1608 	    (txq->in_use >= cxgb_tx_coalesce_enable_start))
1609                 qs->coalescing = 1;
1610 	if (TXQ_TRYLOCK(qs)) {
1611 		qs->qs_flags |= QS_FLUSHING;
1612 		cxgb_start_locked(qs);
1613 		qs->qs_flags &= ~QS_FLUSHING;
1614 		TXQ_UNLOCK(qs);
1615 	}
1616 	if (qs->port->ifp->if_drv_flags & IFF_DRV_RUNNING)
1617 		callout_reset_on(&txq->txq_watchdog, hz/4, cxgb_tx_watchdog,
1618 		    qs, txq->txq_watchdog.c_cpu);
1619 }
1620 
1621 static void
1622 cxgb_tx_timeout(void *arg)
1623 {
1624 	struct sge_qset *qs = arg;
1625 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1626 
1627 	if (qs->coalescing == 0 && (txq->in_use >= (txq->size>>3)))
1628                 qs->coalescing = 1;
1629 	if (TXQ_TRYLOCK(qs)) {
1630 		qs->qs_flags |= QS_TIMEOUT;
1631 		cxgb_start_locked(qs);
1632 		qs->qs_flags &= ~QS_TIMEOUT;
1633 		TXQ_UNLOCK(qs);
1634 	}
1635 }
1636 
1637 static void
1638 cxgb_start_locked(struct sge_qset *qs)
1639 {
1640 	struct mbuf *m_head = NULL;
1641 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1642 	int avail, txmax;
1643 	int in_use_init = txq->in_use;
1644 	struct port_info *pi = qs->port;
1645 	struct ifnet *ifp = pi->ifp;
1646 	avail = txq->size - txq->in_use - 4;
1647 	txmax = min(TX_START_MAX_DESC, avail);
1648 
1649 	if (qs->qs_flags & (QS_FLUSHING|QS_TIMEOUT))
1650 		reclaim_completed_tx(qs, 0, TXQ_ETH);
1651 
1652 	if (!pi->link_config.link_ok) {
1653 		TXQ_RING_FLUSH(qs);
1654 		return;
1655 	}
1656 	TXQ_LOCK_ASSERT(qs);
1657 	while ((txq->in_use - in_use_init < txmax) &&
1658 	    !TXQ_RING_EMPTY(qs) &&
1659 	    (ifp->if_drv_flags & IFF_DRV_RUNNING) &&
1660 	    pi->link_config.link_ok) {
1661 		reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH);
1662 
1663 		if ((m_head = cxgb_dequeue(qs)) == NULL)
1664 			break;
1665 		/*
1666 		 *  Encapsulation can modify our pointer, and or make it
1667 		 *  NULL on failure.  In that event, we can't requeue.
1668 		 */
1669 		if (t3_encap(qs, &m_head) || m_head == NULL)
1670 			break;
1671 
1672 		/* Send a copy of the frame to the BPF listener */
1673 		ETHER_BPF_MTAP(ifp, m_head);
1674 
1675 		/*
1676 		 * We sent via PIO, no longer need a copy
1677 		 */
1678 		if (m_head->m_nextpkt == NULL &&
1679 		    m_head->m_pkthdr.len <= PIO_LEN)
1680 			m_freem(m_head);
1681 
1682 		m_head = NULL;
1683 	}
1684 	if (!TXQ_RING_EMPTY(qs) && callout_pending(&txq->txq_timer) == 0 &&
1685 	    pi->link_config.link_ok)
1686 		callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout,
1687 		    qs, txq->txq_timer.c_cpu);
1688 	if (m_head != NULL)
1689 		m_freem(m_head);
1690 }
1691 
1692 static int
1693 cxgb_transmit_locked(struct ifnet *ifp, struct sge_qset *qs, struct mbuf *m)
1694 {
1695 	struct port_info *pi = qs->port;
1696 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1697 	struct buf_ring *br = txq->txq_mr;
1698 	int error, avail;
1699 
1700 	avail = txq->size - txq->in_use;
1701 	TXQ_LOCK_ASSERT(qs);
1702 
1703 	/*
1704 	 * We can only do a direct transmit if the following are true:
1705 	 * - we aren't coalescing (ring < 3/4 full)
1706 	 * - the link is up -- checked in caller
1707 	 * - there are no packets enqueued already
1708 	 * - there is space in hardware transmit queue
1709 	 */
1710 	if (check_pkt_coalesce(qs) == 0 &&
1711 	    TXQ_RING_EMPTY(qs) && avail > 4) {
1712 		if (t3_encap(qs, &m)) {
1713 			if (m != NULL &&
1714 			    (error = drbr_enqueue(ifp, br, m)) != 0)
1715 				return (error);
1716 		} else {
1717 			/*
1718 			 * We've bypassed the buf ring so we need to update
1719 			 * the stats directly
1720 			 */
1721 			txq->txq_direct_packets++;
1722 			txq->txq_direct_bytes += m->m_pkthdr.len;
1723 			/*
1724 			** Send a copy of the frame to the BPF
1725 			** listener and set the watchdog on.
1726 			*/
1727 			ETHER_BPF_MTAP(ifp, m);
1728 			/*
1729 			 * We sent via PIO, no longer need a copy
1730 			 */
1731 			if (m->m_pkthdr.len <= PIO_LEN)
1732 				m_freem(m);
1733 
1734 		}
1735 	} else if ((error = drbr_enqueue(ifp, br, m)) != 0)
1736 		return (error);
1737 
1738 	reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH);
1739 	if (!TXQ_RING_EMPTY(qs) && pi->link_config.link_ok &&
1740 	    (!check_pkt_coalesce(qs) || (drbr_inuse(ifp, br) >= 7)))
1741 		cxgb_start_locked(qs);
1742 	else if (!TXQ_RING_EMPTY(qs) && !callout_pending(&txq->txq_timer))
1743 		callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout,
1744 		    qs, txq->txq_timer.c_cpu);
1745 	return (0);
1746 }
1747 
1748 int
1749 cxgb_transmit(struct ifnet *ifp, struct mbuf *m)
1750 {
1751 	struct sge_qset *qs;
1752 	struct port_info *pi = ifp->if_softc;
1753 	int error, qidx = pi->first_qset;
1754 
1755 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0
1756 	    ||(!pi->link_config.link_ok)) {
1757 		m_freem(m);
1758 		return (0);
1759 	}
1760 
1761 	if (m->m_flags & M_FLOWID)
1762 		qidx = (m->m_pkthdr.flowid % pi->nqsets) + pi->first_qset;
1763 
1764 	qs = &pi->adapter->sge.qs[qidx];
1765 
1766 	if (TXQ_TRYLOCK(qs)) {
1767 		/* XXX running */
1768 		error = cxgb_transmit_locked(ifp, qs, m);
1769 		TXQ_UNLOCK(qs);
1770 	} else
1771 		error = drbr_enqueue(ifp, qs->txq[TXQ_ETH].txq_mr, m);
1772 	return (error);
1773 }
1774 void
1775 cxgb_start(struct ifnet *ifp)
1776 {
1777 	struct port_info *pi = ifp->if_softc;
1778 	struct sge_qset *qs = &pi->adapter->sge.qs[pi->first_qset];
1779 
1780 	if (!pi->link_config.link_ok)
1781 		return;
1782 
1783 	TXQ_LOCK(qs);
1784 	cxgb_start_locked(qs);
1785 	TXQ_UNLOCK(qs);
1786 }
1787 
1788 void
1789 cxgb_qflush(struct ifnet *ifp)
1790 {
1791 	/*
1792 	 * flush any enqueued mbufs in the buf_rings
1793 	 * and in the transmit queues
1794 	 * no-op for now
1795 	 */
1796 	return;
1797 }
1798 
1799 /**
1800  *	write_imm - write a packet into a Tx descriptor as immediate data
1801  *	@d: the Tx descriptor to write
1802  *	@m: the packet
1803  *	@len: the length of packet data to write as immediate data
1804  *	@gen: the generation bit value to write
1805  *
1806  *	Writes a packet as immediate data into a Tx descriptor.  The packet
1807  *	contains a work request at its beginning.  We must write the packet
1808  *	carefully so the SGE doesn't read accidentally before it's written in
1809  *	its entirety.
1810  */
1811 static __inline void
1812 write_imm(struct tx_desc *d, struct mbuf *m,
1813 	  unsigned int len, unsigned int gen)
1814 {
1815 	struct work_request_hdr *from = mtod(m, struct work_request_hdr *);
1816 	struct work_request_hdr *to = (struct work_request_hdr *)d;
1817 	uint32_t wr_hi, wr_lo;
1818 
1819 	if (len > WR_LEN)
1820 		panic("len too big %d\n", len);
1821 	if (len < sizeof(*from))
1822 		panic("len too small %d", len);
1823 
1824 	memcpy(&to[1], &from[1], len - sizeof(*from));
1825 	wr_hi = from->wrh_hi | htonl(F_WR_SOP | F_WR_EOP |
1826 					V_WR_BCNTLFLT(len & 7));
1827 	wr_lo = from->wrh_lo | htonl(V_WR_GEN(gen) |
1828 					V_WR_LEN((len + 7) / 8));
1829 	set_wr_hdr(to, wr_hi, wr_lo);
1830 	wmb();
1831 	wr_gen2(d, gen);
1832 
1833 	/*
1834 	 * This check is a hack we should really fix the logic so
1835 	 * that this can't happen
1836 	 */
1837 	if (m->m_type != MT_DONTFREE)
1838 		m_freem(m);
1839 
1840 }
1841 
1842 /**
1843  *	check_desc_avail - check descriptor availability on a send queue
1844  *	@adap: the adapter
1845  *	@q: the TX queue
1846  *	@m: the packet needing the descriptors
1847  *	@ndesc: the number of Tx descriptors needed
1848  *	@qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL)
1849  *
1850  *	Checks if the requested number of Tx descriptors is available on an
1851  *	SGE send queue.  If the queue is already suspended or not enough
1852  *	descriptors are available the packet is queued for later transmission.
1853  *	Must be called with the Tx queue locked.
1854  *
1855  *	Returns 0 if enough descriptors are available, 1 if there aren't
1856  *	enough descriptors and the packet has been queued, and 2 if the caller
1857  *	needs to retry because there weren't enough descriptors at the
1858  *	beginning of the call but some freed up in the mean time.
1859  */
1860 static __inline int
1861 check_desc_avail(adapter_t *adap, struct sge_txq *q,
1862 		 struct mbuf *m, unsigned int ndesc,
1863 		 unsigned int qid)
1864 {
1865 	/*
1866 	 * XXX We currently only use this for checking the control queue
1867 	 * the control queue is only used for binding qsets which happens
1868 	 * at init time so we are guaranteed enough descriptors
1869 	 */
1870 	if (__predict_false(!mbufq_empty(&q->sendq))) {
1871 addq_exit:	mbufq_tail(&q->sendq, m);
1872 		return 1;
1873 	}
1874 	if (__predict_false(q->size - q->in_use < ndesc)) {
1875 
1876 		struct sge_qset *qs = txq_to_qset(q, qid);
1877 
1878 		setbit(&qs->txq_stopped, qid);
1879 		if (should_restart_tx(q) &&
1880 		    test_and_clear_bit(qid, &qs->txq_stopped))
1881 			return 2;
1882 
1883 		q->stops++;
1884 		goto addq_exit;
1885 	}
1886 	return 0;
1887 }
1888 
1889 
1890 /**
1891  *	reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
1892  *	@q: the SGE control Tx queue
1893  *
1894  *	This is a variant of reclaim_completed_tx() that is used for Tx queues
1895  *	that send only immediate data (presently just the control queues) and
1896  *	thus do not have any mbufs
1897  */
1898 static __inline void
1899 reclaim_completed_tx_imm(struct sge_txq *q)
1900 {
1901 	unsigned int reclaim = q->processed - q->cleaned;
1902 
1903 	q->in_use -= reclaim;
1904 	q->cleaned += reclaim;
1905 }
1906 
1907 static __inline int
1908 immediate(const struct mbuf *m)
1909 {
1910 	return m->m_len <= WR_LEN  && m->m_pkthdr.len <= WR_LEN ;
1911 }
1912 
1913 /**
1914  *	ctrl_xmit - send a packet through an SGE control Tx queue
1915  *	@adap: the adapter
1916  *	@q: the control queue
1917  *	@m: the packet
1918  *
1919  *	Send a packet through an SGE control Tx queue.  Packets sent through
1920  *	a control queue must fit entirely as immediate data in a single Tx
1921  *	descriptor and have no page fragments.
1922  */
1923 static int
1924 ctrl_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m)
1925 {
1926 	int ret;
1927 	struct work_request_hdr *wrp = mtod(m, struct work_request_hdr *);
1928 	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1929 
1930 	if (__predict_false(!immediate(m))) {
1931 		m_freem(m);
1932 		return 0;
1933 	}
1934 
1935 	wrp->wrh_hi |= htonl(F_WR_SOP | F_WR_EOP);
1936 	wrp->wrh_lo = htonl(V_WR_TID(q->token));
1937 
1938 	TXQ_LOCK(qs);
1939 again:	reclaim_completed_tx_imm(q);
1940 
1941 	ret = check_desc_avail(adap, q, m, 1, TXQ_CTRL);
1942 	if (__predict_false(ret)) {
1943 		if (ret == 1) {
1944 			TXQ_UNLOCK(qs);
1945 			log(LOG_ERR, "no desc available\n");
1946 			return (ENOSPC);
1947 		}
1948 		goto again;
1949 	}
1950 	write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
1951 
1952 	q->in_use++;
1953 	if (++q->pidx >= q->size) {
1954 		q->pidx = 0;
1955 		q->gen ^= 1;
1956 	}
1957 	TXQ_UNLOCK(qs);
1958 	t3_write_reg(adap, A_SG_KDOORBELL,
1959 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1960 	return (0);
1961 }
1962 
1963 
1964 /**
1965  *	restart_ctrlq - restart a suspended control queue
1966  *	@qs: the queue set cotaining the control queue
1967  *
1968  *	Resumes transmission on a suspended Tx control queue.
1969  */
1970 static void
1971 restart_ctrlq(void *data, int npending)
1972 {
1973 	struct mbuf *m;
1974 	struct sge_qset *qs = (struct sge_qset *)data;
1975 	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1976 	adapter_t *adap = qs->port->adapter;
1977 
1978 	log(LOG_WARNING, "Restart_ctrlq in_use=%d\n", q->in_use);
1979 
1980 	TXQ_LOCK(qs);
1981 again:	reclaim_completed_tx_imm(q);
1982 
1983 	while (q->in_use < q->size &&
1984 	       (m = mbufq_dequeue(&q->sendq)) != NULL) {
1985 
1986 		write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
1987 
1988 		if (++q->pidx >= q->size) {
1989 			q->pidx = 0;
1990 			q->gen ^= 1;
1991 		}
1992 		q->in_use++;
1993 	}
1994 	if (!mbufq_empty(&q->sendq)) {
1995 		setbit(&qs->txq_stopped, TXQ_CTRL);
1996 
1997 		if (should_restart_tx(q) &&
1998 		    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped))
1999 			goto again;
2000 		q->stops++;
2001 	}
2002 	TXQ_UNLOCK(qs);
2003 	t3_write_reg(adap, A_SG_KDOORBELL,
2004 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
2005 }
2006 
2007 
2008 /*
2009  * Send a management message through control queue 0
2010  */
2011 int
2012 t3_mgmt_tx(struct adapter *adap, struct mbuf *m)
2013 {
2014 	return ctrl_xmit(adap, &adap->sge.qs[0], m);
2015 }
2016 
2017 /**
2018  *	free_qset - free the resources of an SGE queue set
2019  *	@sc: the controller owning the queue set
2020  *	@q: the queue set
2021  *
2022  *	Release the HW and SW resources associated with an SGE queue set, such
2023  *	as HW contexts, packet buffers, and descriptor rings.  Traffic to the
2024  *	queue set must be quiesced prior to calling this.
2025  */
2026 static void
2027 t3_free_qset(adapter_t *sc, struct sge_qset *q)
2028 {
2029 	int i;
2030 
2031 	reclaim_completed_tx(q, 0, TXQ_ETH);
2032 	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
2033 		if (q->txq[i].txq_mr != NULL)
2034 			buf_ring_free(q->txq[i].txq_mr, M_DEVBUF);
2035 		if (q->txq[i].txq_ifq != NULL) {
2036 			ifq_delete(q->txq[i].txq_ifq);
2037 			free(q->txq[i].txq_ifq, M_DEVBUF);
2038 		}
2039 	}
2040 
2041 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
2042 		if (q->fl[i].desc) {
2043 			mtx_lock_spin(&sc->sge.reg_lock);
2044 			t3_sge_disable_fl(sc, q->fl[i].cntxt_id);
2045 			mtx_unlock_spin(&sc->sge.reg_lock);
2046 			bus_dmamap_unload(q->fl[i].desc_tag, q->fl[i].desc_map);
2047 			bus_dmamem_free(q->fl[i].desc_tag, q->fl[i].desc,
2048 					q->fl[i].desc_map);
2049 			bus_dma_tag_destroy(q->fl[i].desc_tag);
2050 			bus_dma_tag_destroy(q->fl[i].entry_tag);
2051 		}
2052 		if (q->fl[i].sdesc) {
2053 			free_rx_bufs(sc, &q->fl[i]);
2054 			free(q->fl[i].sdesc, M_DEVBUF);
2055 		}
2056 	}
2057 
2058 	mtx_unlock(&q->lock);
2059 	MTX_DESTROY(&q->lock);
2060 	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
2061 		if (q->txq[i].desc) {
2062 			mtx_lock_spin(&sc->sge.reg_lock);
2063 			t3_sge_enable_ecntxt(sc, q->txq[i].cntxt_id, 0);
2064 			mtx_unlock_spin(&sc->sge.reg_lock);
2065 			bus_dmamap_unload(q->txq[i].desc_tag,
2066 					q->txq[i].desc_map);
2067 			bus_dmamem_free(q->txq[i].desc_tag, q->txq[i].desc,
2068 					q->txq[i].desc_map);
2069 			bus_dma_tag_destroy(q->txq[i].desc_tag);
2070 			bus_dma_tag_destroy(q->txq[i].entry_tag);
2071 		}
2072 		if (q->txq[i].sdesc) {
2073 			free(q->txq[i].sdesc, M_DEVBUF);
2074 		}
2075 	}
2076 
2077 	if (q->rspq.desc) {
2078 		mtx_lock_spin(&sc->sge.reg_lock);
2079 		t3_sge_disable_rspcntxt(sc, q->rspq.cntxt_id);
2080 		mtx_unlock_spin(&sc->sge.reg_lock);
2081 
2082 		bus_dmamap_unload(q->rspq.desc_tag, q->rspq.desc_map);
2083 		bus_dmamem_free(q->rspq.desc_tag, q->rspq.desc,
2084 			        q->rspq.desc_map);
2085 		bus_dma_tag_destroy(q->rspq.desc_tag);
2086 		MTX_DESTROY(&q->rspq.lock);
2087 	}
2088 
2089 #ifdef LRO_SUPPORTED
2090 	tcp_lro_free(&q->lro.ctrl);
2091 #endif
2092 
2093 	bzero(q, sizeof(*q));
2094 }
2095 
2096 /**
2097  *	t3_free_sge_resources - free SGE resources
2098  *	@sc: the adapter softc
2099  *
2100  *	Frees resources used by the SGE queue sets.
2101  */
2102 void
2103 t3_free_sge_resources(adapter_t *sc)
2104 {
2105 	int i, nqsets;
2106 
2107 	for (nqsets = i = 0; i < (sc)->params.nports; i++)
2108 		nqsets += sc->port[i].nqsets;
2109 
2110 	for (i = 0; i < nqsets; ++i) {
2111 		TXQ_LOCK(&sc->sge.qs[i]);
2112 		t3_free_qset(sc, &sc->sge.qs[i]);
2113 	}
2114 
2115 }
2116 
2117 /**
2118  *	t3_sge_start - enable SGE
2119  *	@sc: the controller softc
2120  *
2121  *	Enables the SGE for DMAs.  This is the last step in starting packet
2122  *	transfers.
2123  */
2124 void
2125 t3_sge_start(adapter_t *sc)
2126 {
2127 	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE);
2128 }
2129 
2130 /**
2131  *	t3_sge_stop - disable SGE operation
2132  *	@sc: the adapter
2133  *
2134  *	Disables the DMA engine.  This can be called in emeregencies (e.g.,
2135  *	from error interrupts) or from normal process context.  In the latter
2136  *	case it also disables any pending queue restart tasklets.  Note that
2137  *	if it is called in interrupt context it cannot disable the restart
2138  *	tasklets as it cannot wait, however the tasklets will have no effect
2139  *	since the doorbells are disabled and the driver will call this again
2140  *	later from process context, at which time the tasklets will be stopped
2141  *	if they are still running.
2142  */
2143 void
2144 t3_sge_stop(adapter_t *sc)
2145 {
2146 	int i, nqsets;
2147 
2148 	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, 0);
2149 
2150 	if (sc->tq == NULL)
2151 		return;
2152 
2153 	for (nqsets = i = 0; i < (sc)->params.nports; i++)
2154 		nqsets += sc->port[i].nqsets;
2155 #ifdef notyet
2156 	/*
2157 	 *
2158 	 * XXX
2159 	 */
2160 	for (i = 0; i < nqsets; ++i) {
2161 		struct sge_qset *qs = &sc->sge.qs[i];
2162 
2163 		taskqueue_drain(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2164 		taskqueue_drain(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2165 	}
2166 #endif
2167 }
2168 
2169 /**
2170  *	t3_free_tx_desc - reclaims Tx descriptors and their buffers
2171  *	@adapter: the adapter
2172  *	@q: the Tx queue to reclaim descriptors from
2173  *	@reclaimable: the number of descriptors to reclaim
2174  *      @m_vec_size: maximum number of buffers to reclaim
2175  *      @desc_reclaimed: returns the number of descriptors reclaimed
2176  *
2177  *	Reclaims Tx descriptors from an SGE Tx queue and frees the associated
2178  *	Tx buffers.  Called with the Tx queue lock held.
2179  *
2180  *      Returns number of buffers of reclaimed
2181  */
2182 void
2183 t3_free_tx_desc(struct sge_qset *qs, int reclaimable, int queue)
2184 {
2185 	struct tx_sw_desc *txsd;
2186 	unsigned int cidx, mask;
2187 	struct sge_txq *q = &qs->txq[queue];
2188 
2189 #ifdef T3_TRACE
2190 	T3_TRACE2(sc->tb[q->cntxt_id & 7],
2191 		  "reclaiming %u Tx descriptors at cidx %u", reclaimable, cidx);
2192 #endif
2193 	cidx = q->cidx;
2194 	mask = q->size - 1;
2195 	txsd = &q->sdesc[cidx];
2196 
2197 	mtx_assert(&qs->lock, MA_OWNED);
2198 	while (reclaimable--) {
2199 		prefetch(q->sdesc[(cidx + 1) & mask].m);
2200 		prefetch(q->sdesc[(cidx + 2) & mask].m);
2201 
2202 		if (txsd->m != NULL) {
2203 			if (txsd->flags & TX_SW_DESC_MAPPED) {
2204 				bus_dmamap_unload(q->entry_tag, txsd->map);
2205 				txsd->flags &= ~TX_SW_DESC_MAPPED;
2206 			}
2207 			m_freem_list(txsd->m);
2208 			txsd->m = NULL;
2209 		} else
2210 			q->txq_skipped++;
2211 
2212 		++txsd;
2213 		if (++cidx == q->size) {
2214 			cidx = 0;
2215 			txsd = q->sdesc;
2216 		}
2217 	}
2218 	q->cidx = cidx;
2219 
2220 }
2221 
2222 /**
2223  *	is_new_response - check if a response is newly written
2224  *	@r: the response descriptor
2225  *	@q: the response queue
2226  *
2227  *	Returns true if a response descriptor contains a yet unprocessed
2228  *	response.
2229  */
2230 static __inline int
2231 is_new_response(const struct rsp_desc *r,
2232     const struct sge_rspq *q)
2233 {
2234 	return (r->intr_gen & F_RSPD_GEN2) == q->gen;
2235 }
2236 
2237 #define RSPD_GTS_MASK  (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS)
2238 #define RSPD_CTRL_MASK (RSPD_GTS_MASK | \
2239 			V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \
2240 			V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \
2241 			V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR))
2242 
2243 /* How long to delay the next interrupt in case of memory shortage, in 0.1us. */
2244 #define NOMEM_INTR_DELAY 2500
2245 
2246 /**
2247  *	write_ofld_wr - write an offload work request
2248  *	@adap: the adapter
2249  *	@m: the packet to send
2250  *	@q: the Tx queue
2251  *	@pidx: index of the first Tx descriptor to write
2252  *	@gen: the generation value to use
2253  *	@ndesc: number of descriptors the packet will occupy
2254  *
2255  *	Write an offload work request to send the supplied packet.  The packet
2256  *	data already carry the work request with most fields populated.
2257  */
2258 static void
2259 write_ofld_wr(adapter_t *adap, struct mbuf *m,
2260     struct sge_txq *q, unsigned int pidx,
2261     unsigned int gen, unsigned int ndesc,
2262     bus_dma_segment_t *segs, unsigned int nsegs)
2263 {
2264 	unsigned int sgl_flits, flits;
2265 	struct work_request_hdr *from;
2266 	struct sg_ent *sgp, sgl[TX_MAX_SEGS / 2 + 1];
2267 	struct tx_desc *d = &q->desc[pidx];
2268 	struct txq_state txqs;
2269 
2270 	if (immediate(m) && nsegs == 0) {
2271 		write_imm(d, m, m->m_len, gen);
2272 		return;
2273 	}
2274 
2275 	/* Only TX_DATA builds SGLs */
2276 	from = mtod(m, struct work_request_hdr *);
2277 	memcpy(&d->flit[1], &from[1], m->m_len - sizeof(*from));
2278 
2279 	flits = m->m_len / 8;
2280 	sgp = (ndesc == 1) ? (struct sg_ent *)&d->flit[flits] : sgl;
2281 
2282 	make_sgl(sgp, segs, nsegs);
2283 	sgl_flits = sgl_len(nsegs);
2284 
2285 	txqs.gen = gen;
2286 	txqs.pidx = pidx;
2287 	txqs.compl = 0;
2288 
2289 	write_wr_hdr_sgl(ndesc, d, &txqs, q, sgl, flits, sgl_flits,
2290 	    from->wrh_hi, from->wrh_lo);
2291 }
2292 
2293 /**
2294  *	calc_tx_descs_ofld - calculate # of Tx descriptors for an offload packet
2295  *	@m: the packet
2296  *
2297  * 	Returns the number of Tx descriptors needed for the given offload
2298  * 	packet.  These packets are already fully constructed.
2299  */
2300 static __inline unsigned int
2301 calc_tx_descs_ofld(struct mbuf *m, unsigned int nsegs)
2302 {
2303 	unsigned int flits, cnt = 0;
2304 	int ndescs;
2305 
2306 	if (m->m_len <= WR_LEN && nsegs == 0)
2307 		return (1);                 /* packet fits as immediate data */
2308 
2309 	/*
2310 	 * This needs to be re-visited for TOE
2311 	 */
2312 
2313 	cnt = nsegs;
2314 
2315 	/* headers */
2316 	flits = m->m_len / 8;
2317 
2318 	ndescs = flits_to_desc(flits + sgl_len(cnt));
2319 
2320 	return (ndescs);
2321 }
2322 
2323 /**
2324  *	ofld_xmit - send a packet through an offload queue
2325  *	@adap: the adapter
2326  *	@q: the Tx offload queue
2327  *	@m: the packet
2328  *
2329  *	Send an offload packet through an SGE offload queue.
2330  */
2331 static int
2332 ofld_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m)
2333 {
2334 	int ret, nsegs;
2335 	unsigned int ndesc;
2336 	unsigned int pidx, gen;
2337 	struct sge_txq *q = &qs->txq[TXQ_OFLD];
2338 	bus_dma_segment_t segs[TX_MAX_SEGS], *vsegs;
2339 	struct tx_sw_desc *stx;
2340 
2341 	nsegs = m_get_sgllen(m);
2342 	vsegs = m_get_sgl(m);
2343 	ndesc = calc_tx_descs_ofld(m, nsegs);
2344 	busdma_map_sgl(vsegs, segs, nsegs);
2345 
2346 	stx = &q->sdesc[q->pidx];
2347 
2348 	TXQ_LOCK(qs);
2349 again:	reclaim_completed_tx(qs, 16, TXQ_OFLD);
2350 	ret = check_desc_avail(adap, q, m, ndesc, TXQ_OFLD);
2351 	if (__predict_false(ret)) {
2352 		if (ret == 1) {
2353 			printf("no ofld desc avail\n");
2354 
2355 			m_set_priority(m, ndesc);     /* save for restart */
2356 			TXQ_UNLOCK(qs);
2357 			return (EINTR);
2358 		}
2359 		goto again;
2360 	}
2361 
2362 	gen = q->gen;
2363 	q->in_use += ndesc;
2364 	pidx = q->pidx;
2365 	q->pidx += ndesc;
2366 	if (q->pidx >= q->size) {
2367 		q->pidx -= q->size;
2368 		q->gen ^= 1;
2369 	}
2370 #ifdef T3_TRACE
2371 	T3_TRACE5(adap->tb[q->cntxt_id & 7],
2372 		  "ofld_xmit: ndesc %u, pidx %u, len %u, main %u, frags %u",
2373 		  ndesc, pidx, skb->len, skb->len - skb->data_len,
2374 		  skb_shinfo(skb)->nr_frags);
2375 #endif
2376 	TXQ_UNLOCK(qs);
2377 
2378 	write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
2379 	check_ring_tx_db(adap, q);
2380 	return (0);
2381 }
2382 
2383 /**
2384  *	restart_offloadq - restart a suspended offload queue
2385  *	@qs: the queue set cotaining the offload queue
2386  *
2387  *	Resumes transmission on a suspended Tx offload queue.
2388  */
2389 static void
2390 restart_offloadq(void *data, int npending)
2391 {
2392 	struct mbuf *m;
2393 	struct sge_qset *qs = data;
2394 	struct sge_txq *q = &qs->txq[TXQ_OFLD];
2395 	adapter_t *adap = qs->port->adapter;
2396 	bus_dma_segment_t segs[TX_MAX_SEGS];
2397 	struct tx_sw_desc *stx = &q->sdesc[q->pidx];
2398 	int nsegs, cleaned;
2399 
2400 	TXQ_LOCK(qs);
2401 again:	cleaned = reclaim_completed_tx(qs, 16, TXQ_OFLD);
2402 
2403 	while ((m = mbufq_peek(&q->sendq)) != NULL) {
2404 		unsigned int gen, pidx;
2405 		unsigned int ndesc = m_get_priority(m);
2406 
2407 		if (__predict_false(q->size - q->in_use < ndesc)) {
2408 			setbit(&qs->txq_stopped, TXQ_OFLD);
2409 			if (should_restart_tx(q) &&
2410 			    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped))
2411 				goto again;
2412 			q->stops++;
2413 			break;
2414 		}
2415 
2416 		gen = q->gen;
2417 		q->in_use += ndesc;
2418 		pidx = q->pidx;
2419 		q->pidx += ndesc;
2420 		if (q->pidx >= q->size) {
2421 			q->pidx -= q->size;
2422 			q->gen ^= 1;
2423 		}
2424 
2425 		(void)mbufq_dequeue(&q->sendq);
2426 		busdma_map_mbufs(&m, q, stx, segs, &nsegs);
2427 		TXQ_UNLOCK(qs);
2428 		write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
2429 		TXQ_LOCK(qs);
2430 	}
2431 #if USE_GTS
2432 	set_bit(TXQ_RUNNING, &q->flags);
2433 	set_bit(TXQ_LAST_PKT_DB, &q->flags);
2434 #endif
2435 	TXQ_UNLOCK(qs);
2436 	wmb();
2437 	t3_write_reg(adap, A_SG_KDOORBELL,
2438 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
2439 }
2440 
2441 /**
2442  *	queue_set - return the queue set a packet should use
2443  *	@m: the packet
2444  *
2445  *	Maps a packet to the SGE queue set it should use.  The desired queue
2446  *	set is carried in bits 1-3 in the packet's priority.
2447  */
2448 static __inline int
2449 queue_set(const struct mbuf *m)
2450 {
2451 	return m_get_priority(m) >> 1;
2452 }
2453 
2454 /**
2455  *	is_ctrl_pkt - return whether an offload packet is a control packet
2456  *	@m: the packet
2457  *
2458  *	Determines whether an offload packet should use an OFLD or a CTRL
2459  *	Tx queue.  This is indicated by bit 0 in the packet's priority.
2460  */
2461 static __inline int
2462 is_ctrl_pkt(const struct mbuf *m)
2463 {
2464 	return m_get_priority(m) & 1;
2465 }
2466 
2467 /**
2468  *	t3_offload_tx - send an offload packet
2469  *	@tdev: the offload device to send to
2470  *	@m: the packet
2471  *
2472  *	Sends an offload packet.  We use the packet priority to select the
2473  *	appropriate Tx queue as follows: bit 0 indicates whether the packet
2474  *	should be sent as regular or control, bits 1-3 select the queue set.
2475  */
2476 int
2477 t3_offload_tx(struct t3cdev *tdev, struct mbuf *m)
2478 {
2479 	adapter_t *adap = tdev2adap(tdev);
2480 	struct sge_qset *qs = &adap->sge.qs[queue_set(m)];
2481 
2482 	if (__predict_false(is_ctrl_pkt(m)))
2483 		return ctrl_xmit(adap, qs, m);
2484 
2485 	return ofld_xmit(adap, qs, m);
2486 }
2487 
2488 /**
2489  *	deliver_partial_bundle - deliver a (partial) bundle of Rx offload pkts
2490  *	@tdev: the offload device that will be receiving the packets
2491  *	@q: the SGE response queue that assembled the bundle
2492  *	@m: the partial bundle
2493  *	@n: the number of packets in the bundle
2494  *
2495  *	Delivers a (partial) bundle of Rx offload packets to an offload device.
2496  */
2497 static __inline void
2498 deliver_partial_bundle(struct t3cdev *tdev,
2499 			struct sge_rspq *q,
2500 			struct mbuf *mbufs[], int n)
2501 {
2502 	if (n) {
2503 		q->offload_bundles++;
2504 		cxgb_ofld_recv(tdev, mbufs, n);
2505 	}
2506 }
2507 
2508 static __inline int
2509 rx_offload(struct t3cdev *tdev, struct sge_rspq *rq,
2510     struct mbuf *m, struct mbuf *rx_gather[],
2511     unsigned int gather_idx)
2512 {
2513 
2514 	rq->offload_pkts++;
2515 	m->m_pkthdr.header = mtod(m, void *);
2516 	rx_gather[gather_idx++] = m;
2517 	if (gather_idx == RX_BUNDLE_SIZE) {
2518 		cxgb_ofld_recv(tdev, rx_gather, RX_BUNDLE_SIZE);
2519 		gather_idx = 0;
2520 		rq->offload_bundles++;
2521 	}
2522 	return (gather_idx);
2523 }
2524 
2525 static void
2526 restart_tx(struct sge_qset *qs)
2527 {
2528 	struct adapter *sc = qs->port->adapter;
2529 
2530 
2531 	if (isset(&qs->txq_stopped, TXQ_OFLD) &&
2532 	    should_restart_tx(&qs->txq[TXQ_OFLD]) &&
2533 	    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) {
2534 		qs->txq[TXQ_OFLD].restarts++;
2535 		DPRINTF("restarting TXQ_OFLD\n");
2536 		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2537 	}
2538 	DPRINTF("stopped=0x%x restart=%d processed=%d cleaned=%d in_use=%d\n",
2539 	    qs->txq_stopped, should_restart_tx(&qs->txq[TXQ_CTRL]),
2540 	    qs->txq[TXQ_CTRL].processed, qs->txq[TXQ_CTRL].cleaned,
2541 	    qs->txq[TXQ_CTRL].in_use);
2542 
2543 	if (isset(&qs->txq_stopped, TXQ_CTRL) &&
2544 	    should_restart_tx(&qs->txq[TXQ_CTRL]) &&
2545 	    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) {
2546 		qs->txq[TXQ_CTRL].restarts++;
2547 		DPRINTF("restarting TXQ_CTRL\n");
2548 		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2549 	}
2550 }
2551 
2552 /**
2553  *	t3_sge_alloc_qset - initialize an SGE queue set
2554  *	@sc: the controller softc
2555  *	@id: the queue set id
2556  *	@nports: how many Ethernet ports will be using this queue set
2557  *	@irq_vec_idx: the IRQ vector index for response queue interrupts
2558  *	@p: configuration parameters for this queue set
2559  *	@ntxq: number of Tx queues for the queue set
2560  *	@pi: port info for queue set
2561  *
2562  *	Allocate resources and initialize an SGE queue set.  A queue set
2563  *	comprises a response queue, two Rx free-buffer queues, and up to 3
2564  *	Tx queues.  The Tx queues are assigned roles in the order Ethernet
2565  *	queue, offload queue, and control queue.
2566  */
2567 int
2568 t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx,
2569 		  const struct qset_params *p, int ntxq, struct port_info *pi)
2570 {
2571 	struct sge_qset *q = &sc->sge.qs[id];
2572 	int i, ret = 0;
2573 
2574 	MTX_INIT(&q->lock, q->namebuf, NULL, MTX_DEF);
2575 	q->port = pi;
2576 
2577 	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
2578 
2579 		if ((q->txq[i].txq_mr = buf_ring_alloc(cxgb_txq_buf_ring_size,
2580 			    M_DEVBUF, M_WAITOK, &q->lock)) == NULL) {
2581 			device_printf(sc->dev, "failed to allocate mbuf ring\n");
2582 			goto err;
2583 		}
2584 		if ((q->txq[i].txq_ifq =
2585 			malloc(sizeof(struct ifaltq), M_DEVBUF, M_NOWAIT|M_ZERO))
2586 		    == NULL) {
2587 			device_printf(sc->dev, "failed to allocate ifq\n");
2588 			goto err;
2589 		}
2590 		ifq_init(q->txq[i].txq_ifq, pi->ifp);
2591 		callout_init(&q->txq[i].txq_timer, 1);
2592 		callout_init(&q->txq[i].txq_watchdog, 1);
2593 		q->txq[i].txq_timer.c_cpu = id % mp_ncpus;
2594 		q->txq[i].txq_watchdog.c_cpu = id % mp_ncpus;
2595 	}
2596 	init_qset_cntxt(q, id);
2597 	q->idx = id;
2598 	if ((ret = alloc_ring(sc, p->fl_size, sizeof(struct rx_desc),
2599 		    sizeof(struct rx_sw_desc), &q->fl[0].phys_addr,
2600 		    &q->fl[0].desc, &q->fl[0].sdesc,
2601 		    &q->fl[0].desc_tag, &q->fl[0].desc_map,
2602 		    sc->rx_dmat, &q->fl[0].entry_tag)) != 0) {
2603 		printf("error %d from alloc ring fl0\n", ret);
2604 		goto err;
2605 	}
2606 
2607 	if ((ret = alloc_ring(sc, p->jumbo_size, sizeof(struct rx_desc),
2608 		    sizeof(struct rx_sw_desc), &q->fl[1].phys_addr,
2609 		    &q->fl[1].desc, &q->fl[1].sdesc,
2610 		    &q->fl[1].desc_tag, &q->fl[1].desc_map,
2611 		    sc->rx_jumbo_dmat, &q->fl[1].entry_tag)) != 0) {
2612 		printf("error %d from alloc ring fl1\n", ret);
2613 		goto err;
2614 	}
2615 
2616 	if ((ret = alloc_ring(sc, p->rspq_size, sizeof(struct rsp_desc), 0,
2617 		    &q->rspq.phys_addr, &q->rspq.desc, NULL,
2618 		    &q->rspq.desc_tag, &q->rspq.desc_map,
2619 		    NULL, NULL)) != 0) {
2620 		printf("error %d from alloc ring rspq\n", ret);
2621 		goto err;
2622 	}
2623 
2624 	for (i = 0; i < ntxq; ++i) {
2625 		size_t sz = i == TXQ_CTRL ? 0 : sizeof(struct tx_sw_desc);
2626 
2627 		if ((ret = alloc_ring(sc, p->txq_size[i],
2628 			    sizeof(struct tx_desc), sz,
2629 			    &q->txq[i].phys_addr, &q->txq[i].desc,
2630 			    &q->txq[i].sdesc, &q->txq[i].desc_tag,
2631 			    &q->txq[i].desc_map,
2632 			    sc->tx_dmat, &q->txq[i].entry_tag)) != 0) {
2633 			printf("error %d from alloc ring tx %i\n", ret, i);
2634 			goto err;
2635 		}
2636 		mbufq_init(&q->txq[i].sendq);
2637 		q->txq[i].gen = 1;
2638 		q->txq[i].size = p->txq_size[i];
2639 	}
2640 
2641 	TASK_INIT(&q->txq[TXQ_OFLD].qresume_task, 0, restart_offloadq, q);
2642 	TASK_INIT(&q->txq[TXQ_CTRL].qresume_task, 0, restart_ctrlq, q);
2643 	TASK_INIT(&q->txq[TXQ_ETH].qreclaim_task, 0, sge_txq_reclaim_handler, q);
2644 	TASK_INIT(&q->txq[TXQ_OFLD].qreclaim_task, 0, sge_txq_reclaim_handler, q);
2645 
2646 	q->fl[0].gen = q->fl[1].gen = 1;
2647 	q->fl[0].size = p->fl_size;
2648 	q->fl[1].size = p->jumbo_size;
2649 
2650 	q->rspq.gen = 1;
2651 	q->rspq.cidx = 0;
2652 	q->rspq.size = p->rspq_size;
2653 
2654 	q->txq[TXQ_ETH].stop_thres = nports *
2655 	    flits_to_desc(sgl_len(TX_MAX_SEGS + 1) + 3);
2656 
2657 	q->fl[0].buf_size = MCLBYTES;
2658 	q->fl[0].zone = zone_pack;
2659 	q->fl[0].type = EXT_PACKET;
2660 #if __FreeBSD_version > 800000
2661 	if (cxgb_use_16k_clusters) {
2662 		q->fl[1].buf_size = MJUM16BYTES;
2663 		q->fl[1].zone = zone_jumbo16;
2664 		q->fl[1].type = EXT_JUMBO16;
2665 	} else {
2666 		q->fl[1].buf_size = MJUM9BYTES;
2667 		q->fl[1].zone = zone_jumbo9;
2668 		q->fl[1].type = EXT_JUMBO9;
2669 	}
2670 #else
2671 	q->fl[1].buf_size = MJUMPAGESIZE;
2672 	q->fl[1].zone = zone_jumbop;
2673 	q->fl[1].type = EXT_JUMBOP;
2674 #endif
2675 
2676 #ifdef LRO_SUPPORTED
2677 	/* Allocate and setup the lro_ctrl structure */
2678 	q->lro.enabled = !!(pi->ifp->if_capenable & IFCAP_LRO);
2679 	ret = tcp_lro_init(&q->lro.ctrl);
2680 	if (ret) {
2681 		printf("error %d from tcp_lro_init\n", ret);
2682 		goto err;
2683 	}
2684 	q->lro.ctrl.ifp = pi->ifp;
2685 #endif
2686 
2687 	mtx_lock_spin(&sc->sge.reg_lock);
2688 	ret = -t3_sge_init_rspcntxt(sc, q->rspq.cntxt_id, irq_vec_idx,
2689 				   q->rspq.phys_addr, q->rspq.size,
2690 				   q->fl[0].buf_size, 1, 0);
2691 	if (ret) {
2692 		printf("error %d from t3_sge_init_rspcntxt\n", ret);
2693 		goto err_unlock;
2694 	}
2695 
2696 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
2697 		ret = -t3_sge_init_flcntxt(sc, q->fl[i].cntxt_id, 0,
2698 					  q->fl[i].phys_addr, q->fl[i].size,
2699 					  q->fl[i].buf_size, p->cong_thres, 1,
2700 					  0);
2701 		if (ret) {
2702 			printf("error %d from t3_sge_init_flcntxt for index i=%d\n", ret, i);
2703 			goto err_unlock;
2704 		}
2705 	}
2706 
2707 	ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_ETH].cntxt_id, USE_GTS,
2708 				 SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr,
2709 				 q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token,
2710 				 1, 0);
2711 	if (ret) {
2712 		printf("error %d from t3_sge_init_ecntxt\n", ret);
2713 		goto err_unlock;
2714 	}
2715 
2716 	if (ntxq > 1) {
2717 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_OFLD].cntxt_id,
2718 					 USE_GTS, SGE_CNTXT_OFLD, id,
2719 					 q->txq[TXQ_OFLD].phys_addr,
2720 					 q->txq[TXQ_OFLD].size, 0, 1, 0);
2721 		if (ret) {
2722 			printf("error %d from t3_sge_init_ecntxt\n", ret);
2723 			goto err_unlock;
2724 		}
2725 	}
2726 
2727 	if (ntxq > 2) {
2728 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_CTRL].cntxt_id, 0,
2729 					 SGE_CNTXT_CTRL, id,
2730 					 q->txq[TXQ_CTRL].phys_addr,
2731 					 q->txq[TXQ_CTRL].size,
2732 					 q->txq[TXQ_CTRL].token, 1, 0);
2733 		if (ret) {
2734 			printf("error %d from t3_sge_init_ecntxt\n", ret);
2735 			goto err_unlock;
2736 		}
2737 	}
2738 
2739 	snprintf(q->rspq.lockbuf, RSPQ_NAME_LEN, "t3 rspq lock %d:%d",
2740 	    device_get_unit(sc->dev), irq_vec_idx);
2741 	MTX_INIT(&q->rspq.lock, q->rspq.lockbuf, NULL, MTX_DEF);
2742 
2743 	mtx_unlock_spin(&sc->sge.reg_lock);
2744 	t3_update_qset_coalesce(q, p);
2745 	q->port = pi;
2746 
2747 	refill_fl(sc, &q->fl[0], q->fl[0].size);
2748 	refill_fl(sc, &q->fl[1], q->fl[1].size);
2749 	refill_rspq(sc, &q->rspq, q->rspq.size - 1);
2750 
2751 	t3_write_reg(sc, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) |
2752 		     V_NEWTIMER(q->rspq.holdoff_tmr));
2753 
2754 	return (0);
2755 
2756 err_unlock:
2757 	mtx_unlock_spin(&sc->sge.reg_lock);
2758 err:
2759 	TXQ_LOCK(q);
2760 	t3_free_qset(sc, q);
2761 
2762 	return (ret);
2763 }
2764 
2765 /*
2766  * Remove CPL_RX_PKT headers from the mbuf and reduce it to a regular mbuf with
2767  * ethernet data.  Hardware assistance with various checksums and any vlan tag
2768  * will also be taken into account here.
2769  */
2770 void
2771 t3_rx_eth(struct adapter *adap, struct sge_rspq *rq, struct mbuf *m, int ethpad)
2772 {
2773 	struct cpl_rx_pkt *cpl = (struct cpl_rx_pkt *)(mtod(m, uint8_t *) + ethpad);
2774 	struct port_info *pi = &adap->port[adap->rxpkt_map[cpl->iff]];
2775 	struct ifnet *ifp = pi->ifp;
2776 
2777 	DPRINTF("rx_eth m=%p m->m_data=%p p->iff=%d\n", m, mtod(m, uint8_t *), cpl->iff);
2778 
2779 	if ((ifp->if_capenable & IFCAP_RXCSUM) && !cpl->fragment &&
2780 	    cpl->csum_valid && cpl->csum == 0xffff) {
2781 		m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED|CSUM_IP_VALID);
2782 		rspq_to_qset(rq)->port_stats[SGE_PSTAT_RX_CSUM_GOOD]++;
2783 		m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED|CSUM_IP_VALID|CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
2784 		m->m_pkthdr.csum_data = 0xffff;
2785 	}
2786 	/*
2787 	 * XXX need to add VLAN support for 6.x
2788 	 */
2789 #ifdef VLAN_SUPPORTED
2790 	if (__predict_false(cpl->vlan_valid)) {
2791 		m->m_pkthdr.ether_vtag = ntohs(cpl->vlan);
2792 		m->m_flags |= M_VLANTAG;
2793 	}
2794 #endif
2795 
2796 	m->m_pkthdr.rcvif = ifp;
2797 	m->m_pkthdr.header = mtod(m, uint8_t *) + sizeof(*cpl) + ethpad;
2798 	/*
2799 	 * adjust after conversion to mbuf chain
2800 	 */
2801 	m->m_pkthdr.len -= (sizeof(*cpl) + ethpad);
2802 	m->m_len -= (sizeof(*cpl) + ethpad);
2803 	m->m_data += (sizeof(*cpl) + ethpad);
2804 }
2805 
2806 /**
2807  *	get_packet - return the next ingress packet buffer from a free list
2808  *	@adap: the adapter that received the packet
2809  *	@drop_thres: # of remaining buffers before we start dropping packets
2810  *	@qs: the qset that the SGE free list holding the packet belongs to
2811  *      @mh: the mbuf header, contains a pointer to the head and tail of the mbuf chain
2812  *      @r: response descriptor
2813  *
2814  *	Get the next packet from a free list and complete setup of the
2815  *	sk_buff.  If the packet is small we make a copy and recycle the
2816  *	original buffer, otherwise we use the original buffer itself.  If a
2817  *	positive drop threshold is supplied packets are dropped and their
2818  *	buffers recycled if (a) the number of remaining buffers is under the
2819  *	threshold and the packet is too big to copy, or (b) the packet should
2820  *	be copied but there is no memory for the copy.
2821  */
2822 static int
2823 get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
2824     struct t3_mbuf_hdr *mh, struct rsp_desc *r)
2825 {
2826 
2827 	unsigned int len_cq =  ntohl(r->len_cq);
2828 	struct sge_fl *fl = (len_cq & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
2829 	int mask, cidx = fl->cidx;
2830 	struct rx_sw_desc *sd = &fl->sdesc[cidx];
2831 	uint32_t len = G_RSPD_LEN(len_cq);
2832 	uint32_t flags = M_EXT;
2833 	uint8_t sopeop = G_RSPD_SOP_EOP(ntohl(r->flags));
2834 	caddr_t cl;
2835 	struct mbuf *m;
2836 	int ret = 0;
2837 
2838 	mask = fl->size - 1;
2839 	prefetch(fl->sdesc[(cidx + 1) & mask].m);
2840 	prefetch(fl->sdesc[(cidx + 2) & mask].m);
2841 	prefetch(fl->sdesc[(cidx + 1) & mask].rxsd_cl);
2842 	prefetch(fl->sdesc[(cidx + 2) & mask].rxsd_cl);
2843 
2844 	fl->credits--;
2845 	bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD);
2846 
2847 	if (recycle_enable && len <= SGE_RX_COPY_THRES &&
2848 	    sopeop == RSPQ_SOP_EOP) {
2849 		if ((m = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL)
2850 			goto skip_recycle;
2851 		cl = mtod(m, void *);
2852 		memcpy(cl, sd->rxsd_cl, len);
2853 		recycle_rx_buf(adap, fl, fl->cidx);
2854 		m->m_pkthdr.len = m->m_len = len;
2855 		m->m_flags = 0;
2856 		mh->mh_head = mh->mh_tail = m;
2857 		ret = 1;
2858 		goto done;
2859 	} else {
2860 	skip_recycle:
2861 		bus_dmamap_unload(fl->entry_tag, sd->map);
2862 		cl = sd->rxsd_cl;
2863 		m = sd->m;
2864 
2865 		if ((sopeop == RSPQ_SOP_EOP) ||
2866 		    (sopeop == RSPQ_SOP))
2867 			flags |= M_PKTHDR;
2868 		m_init(m, fl->zone, fl->buf_size, M_NOWAIT, MT_DATA, flags);
2869 		if (fl->zone == zone_pack) {
2870 			/*
2871 			 * restore clobbered data pointer
2872 			 */
2873 			m->m_data = m->m_ext.ext_buf;
2874 		} else {
2875 			m_cljset(m, cl, fl->type);
2876 		}
2877 		m->m_len = len;
2878 	}
2879 	switch(sopeop) {
2880 	case RSPQ_SOP_EOP:
2881 		ret = 1;
2882 		/* FALLTHROUGH */
2883 	case RSPQ_SOP:
2884 		mh->mh_head = mh->mh_tail = m;
2885 		m->m_pkthdr.len = len;
2886 		break;
2887 	case RSPQ_EOP:
2888 		ret = 1;
2889 		/* FALLTHROUGH */
2890 	case RSPQ_NSOP_NEOP:
2891 		if (mh->mh_tail == NULL) {
2892 			log(LOG_ERR, "discarding intermediate descriptor entry\n");
2893 			m_freem(m);
2894 			break;
2895 		}
2896 		mh->mh_tail->m_next = m;
2897 		mh->mh_tail = m;
2898 		mh->mh_head->m_pkthdr.len += len;
2899 		break;
2900 	}
2901 	if (cxgb_debug)
2902 		printf("len=%d pktlen=%d\n", m->m_len, m->m_pkthdr.len);
2903 done:
2904 	if (++fl->cidx == fl->size)
2905 		fl->cidx = 0;
2906 
2907 	return (ret);
2908 }
2909 
2910 /**
2911  *	handle_rsp_cntrl_info - handles control information in a response
2912  *	@qs: the queue set corresponding to the response
2913  *	@flags: the response control flags
2914  *
2915  *	Handles the control information of an SGE response, such as GTS
2916  *	indications and completion credits for the queue set's Tx queues.
2917  *	HW coalesces credits, we don't do any extra SW coalescing.
2918  */
2919 static __inline void
2920 handle_rsp_cntrl_info(struct sge_qset *qs, uint32_t flags)
2921 {
2922 	unsigned int credits;
2923 
2924 #if USE_GTS
2925 	if (flags & F_RSPD_TXQ0_GTS)
2926 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags);
2927 #endif
2928 	credits = G_RSPD_TXQ0_CR(flags);
2929 	if (credits)
2930 		qs->txq[TXQ_ETH].processed += credits;
2931 
2932 	credits = G_RSPD_TXQ2_CR(flags);
2933 	if (credits)
2934 		qs->txq[TXQ_CTRL].processed += credits;
2935 
2936 # if USE_GTS
2937 	if (flags & F_RSPD_TXQ1_GTS)
2938 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags);
2939 # endif
2940 	credits = G_RSPD_TXQ1_CR(flags);
2941 	if (credits)
2942 		qs->txq[TXQ_OFLD].processed += credits;
2943 
2944 }
2945 
2946 static void
2947 check_ring_db(adapter_t *adap, struct sge_qset *qs,
2948     unsigned int sleeping)
2949 {
2950 	;
2951 }
2952 
2953 /**
2954  *	process_responses - process responses from an SGE response queue
2955  *	@adap: the adapter
2956  *	@qs: the queue set to which the response queue belongs
2957  *	@budget: how many responses can be processed in this round
2958  *
2959  *	Process responses from an SGE response queue up to the supplied budget.
2960  *	Responses include received packets as well as credits and other events
2961  *	for the queues that belong to the response queue's queue set.
2962  *	A negative budget is effectively unlimited.
2963  *
2964  *	Additionally choose the interrupt holdoff time for the next interrupt
2965  *	on this queue.  If the system is under memory shortage use a fairly
2966  *	long delay to help recovery.
2967  */
2968 static int
2969 process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
2970 {
2971 	struct sge_rspq *rspq = &qs->rspq;
2972 	struct rsp_desc *r = &rspq->desc[rspq->cidx];
2973 	int budget_left = budget;
2974 	unsigned int sleeping = 0;
2975 #ifdef LRO_SUPPORTED
2976 	int lro_enabled = qs->lro.enabled;
2977 	int skip_lro;
2978 	struct lro_ctrl *lro_ctrl = &qs->lro.ctrl;
2979 #endif
2980 	struct mbuf *offload_mbufs[RX_BUNDLE_SIZE];
2981 	int ngathered = 0;
2982 #ifdef DEBUG
2983 	static int last_holdoff = 0;
2984 	if (cxgb_debug && rspq->holdoff_tmr != last_holdoff) {
2985 		printf("next_holdoff=%d\n", rspq->holdoff_tmr);
2986 		last_holdoff = rspq->holdoff_tmr;
2987 	}
2988 #endif
2989 	rspq->next_holdoff = rspq->holdoff_tmr;
2990 
2991 	while (__predict_true(budget_left && is_new_response(r, rspq))) {
2992 		int eth, eop = 0, ethpad = 0;
2993 		uint32_t flags = ntohl(r->flags);
2994 		uint32_t rss_csum = *(const uint32_t *)r;
2995 		uint32_t rss_hash = be32toh(r->rss_hdr.rss_hash_val);
2996 
2997 		eth = (r->rss_hdr.opcode == CPL_RX_PKT);
2998 
2999 		if (__predict_false(flags & F_RSPD_ASYNC_NOTIF)) {
3000 			struct mbuf *m;
3001 
3002 			if (cxgb_debug)
3003 				printf("async notification\n");
3004 
3005 			if (rspq->rspq_mh.mh_head == NULL) {
3006 				rspq->rspq_mh.mh_head = m_gethdr(M_DONTWAIT, MT_DATA);
3007 				m = rspq->rspq_mh.mh_head;
3008 			} else {
3009 				m = m_gethdr(M_DONTWAIT, MT_DATA);
3010 			}
3011 			if (m == NULL)
3012 				goto no_mem;
3013 
3014                         memcpy(mtod(m, char *), r, AN_PKT_SIZE);
3015 			m->m_len = m->m_pkthdr.len = AN_PKT_SIZE;
3016                         *mtod(m, char *) = CPL_ASYNC_NOTIF;
3017 			rss_csum = htonl(CPL_ASYNC_NOTIF << 24);
3018 			eop = 1;
3019                         rspq->async_notif++;
3020 			goto skip;
3021 		} else if  (flags & F_RSPD_IMM_DATA_VALID) {
3022 			struct mbuf *m = NULL;
3023 
3024 			DPRINTF("IMM DATA VALID opcode=0x%x rspq->cidx=%d\n",
3025 			    r->rss_hdr.opcode, rspq->cidx);
3026 			if (rspq->rspq_mh.mh_head == NULL)
3027 				rspq->rspq_mh.mh_head = m_gethdr(M_DONTWAIT, MT_DATA);
3028                         else
3029 				m = m_gethdr(M_DONTWAIT, MT_DATA);
3030 
3031 			if (rspq->rspq_mh.mh_head == NULL &&  m == NULL) {
3032 		no_mem:
3033 				rspq->next_holdoff = NOMEM_INTR_DELAY;
3034 				budget_left--;
3035 				break;
3036 			}
3037 			get_imm_packet(adap, r, rspq->rspq_mh.mh_head);
3038 			eop = 1;
3039 			rspq->imm_data++;
3040 		} else if (r->len_cq) {
3041 			int drop_thresh = eth ? SGE_RX_DROP_THRES : 0;
3042 
3043 			eop = get_packet(adap, drop_thresh, qs, &rspq->rspq_mh, r);
3044 			if (eop) {
3045 				rspq->rspq_mh.mh_head->m_flags |= M_FLOWID;
3046 				rspq->rspq_mh.mh_head->m_pkthdr.flowid = rss_hash;
3047 			}
3048 
3049 			ethpad = 2;
3050 		} else {
3051 			rspq->pure_rsps++;
3052 		}
3053 	skip:
3054 		if (flags & RSPD_CTRL_MASK) {
3055 			sleeping |= flags & RSPD_GTS_MASK;
3056 			handle_rsp_cntrl_info(qs, flags);
3057 		}
3058 
3059 		r++;
3060 		if (__predict_false(++rspq->cidx == rspq->size)) {
3061 			rspq->cidx = 0;
3062 			rspq->gen ^= 1;
3063 			r = rspq->desc;
3064 		}
3065 
3066 		if (++rspq->credits >= (rspq->size / 4)) {
3067 			refill_rspq(adap, rspq, rspq->credits);
3068 			rspq->credits = 0;
3069 		}
3070 		if (!eth && eop) {
3071 			rspq->rspq_mh.mh_head->m_pkthdr.csum_data = rss_csum;
3072 			/*
3073 			 * XXX size mismatch
3074 			 */
3075 			m_set_priority(rspq->rspq_mh.mh_head, rss_hash);
3076 
3077 
3078 			ngathered = rx_offload(&adap->tdev, rspq,
3079 			    rspq->rspq_mh.mh_head, offload_mbufs, ngathered);
3080 			rspq->rspq_mh.mh_head = NULL;
3081 			DPRINTF("received offload packet\n");
3082 
3083 		} else if (eth && eop) {
3084 			struct mbuf *m = rspq->rspq_mh.mh_head;
3085 
3086 			t3_rx_eth(adap, rspq, m, ethpad);
3087 
3088 #ifdef LRO_SUPPORTED
3089 			/*
3090 			 * The T304 sends incoming packets on any qset.  If LRO
3091 			 * is also enabled, we could end up sending packet up
3092 			 * lro_ctrl->ifp's input.  That is incorrect.
3093 			 *
3094 			 * The mbuf's rcvif was derived from the cpl header and
3095 			 * is accurate.  Skip LRO and just use that.
3096 			 */
3097 			skip_lro = __predict_false(qs->port->ifp != m->m_pkthdr.rcvif);
3098 
3099 			if (lro_enabled && lro_ctrl->lro_cnt && !skip_lro &&
3100 			    (tcp_lro_rx(lro_ctrl, m, 0) == 0)) {
3101 				/* successfully queue'd for LRO */
3102 			} else
3103 #endif
3104 			{
3105 				/*
3106 				 * LRO not enabled, packet unsuitable for LRO,
3107 				 * or unable to queue.  Pass it up right now in
3108 				 * either case.
3109 				 */
3110 				struct ifnet *ifp = m->m_pkthdr.rcvif;
3111 				(*ifp->if_input)(ifp, m);
3112 			}
3113 			rspq->rspq_mh.mh_head = NULL;
3114 
3115 		}
3116 		__refill_fl_lt(adap, &qs->fl[0], 32);
3117 		__refill_fl_lt(adap, &qs->fl[1], 32);
3118 		--budget_left;
3119 	}
3120 
3121 	deliver_partial_bundle(&adap->tdev, rspq, offload_mbufs, ngathered);
3122 
3123 #ifdef LRO_SUPPORTED
3124 	/* Flush LRO */
3125 	while (!SLIST_EMPTY(&lro_ctrl->lro_active)) {
3126 		struct lro_entry *queued = SLIST_FIRST(&lro_ctrl->lro_active);
3127 		SLIST_REMOVE_HEAD(&lro_ctrl->lro_active, next);
3128 		tcp_lro_flush(lro_ctrl, queued);
3129 	}
3130 #endif
3131 
3132 	if (sleeping)
3133 		check_ring_db(adap, qs, sleeping);
3134 
3135 	mb();  /* commit Tx queue processed updates */
3136 	if (__predict_false(qs->txq_stopped > 1)) {
3137 		printf("restarting tx on %p\n", qs);
3138 
3139 		restart_tx(qs);
3140 	}
3141 
3142 	__refill_fl_lt(adap, &qs->fl[0], 512);
3143 	__refill_fl_lt(adap, &qs->fl[1], 512);
3144 	budget -= budget_left;
3145 	return (budget);
3146 }
3147 
3148 /*
3149  * A helper function that processes responses and issues GTS.
3150  */
3151 static __inline int
3152 process_responses_gts(adapter_t *adap, struct sge_rspq *rq)
3153 {
3154 	int work;
3155 	static int last_holdoff = 0;
3156 
3157 	work = process_responses(adap, rspq_to_qset(rq), -1);
3158 
3159 	if (cxgb_debug && (rq->next_holdoff != last_holdoff)) {
3160 		printf("next_holdoff=%d\n", rq->next_holdoff);
3161 		last_holdoff = rq->next_holdoff;
3162 	}
3163 	t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) |
3164 	    V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx));
3165 
3166 	return (work);
3167 }
3168 
3169 
3170 /*
3171  * Interrupt handler for legacy INTx interrupts for T3B-based cards.
3172  * Handles data events from SGE response queues as well as error and other
3173  * async events as they all use the same interrupt pin.  We use one SGE
3174  * response queue per port in this mode and protect all response queues with
3175  * queue 0's lock.
3176  */
3177 void
3178 t3b_intr(void *data)
3179 {
3180 	uint32_t i, map;
3181 	adapter_t *adap = data;
3182 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
3183 
3184 	t3_write_reg(adap, A_PL_CLI, 0);
3185 	map = t3_read_reg(adap, A_SG_DATA_INTR);
3186 
3187 	if (!map)
3188 		return;
3189 
3190 	if (__predict_false(map & F_ERRINTR))
3191 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
3192 
3193 	mtx_lock(&q0->lock);
3194 	for_each_port(adap, i)
3195 	    if (map & (1 << i))
3196 			process_responses_gts(adap, &adap->sge.qs[i].rspq);
3197 	mtx_unlock(&q0->lock);
3198 }
3199 
3200 /*
3201  * The MSI interrupt handler.  This needs to handle data events from SGE
3202  * response queues as well as error and other async events as they all use
3203  * the same MSI vector.  We use one SGE response queue per port in this mode
3204  * and protect all response queues with queue 0's lock.
3205  */
3206 void
3207 t3_intr_msi(void *data)
3208 {
3209 	adapter_t *adap = data;
3210 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
3211 	int i, new_packets = 0;
3212 
3213 	mtx_lock(&q0->lock);
3214 
3215 	for_each_port(adap, i)
3216 	    if (process_responses_gts(adap, &adap->sge.qs[i].rspq))
3217 		    new_packets = 1;
3218 	mtx_unlock(&q0->lock);
3219 	if (new_packets == 0)
3220 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
3221 }
3222 
3223 void
3224 t3_intr_msix(void *data)
3225 {
3226 	struct sge_qset *qs = data;
3227 	adapter_t *adap = qs->port->adapter;
3228 	struct sge_rspq *rspq = &qs->rspq;
3229 
3230 	if (process_responses_gts(adap, rspq) == 0)
3231 		rspq->unhandled_irqs++;
3232 }
3233 
3234 #define QDUMP_SBUF_SIZE		32 * 400
3235 static int
3236 t3_dump_rspq(SYSCTL_HANDLER_ARGS)
3237 {
3238 	struct sge_rspq *rspq;
3239 	struct sge_qset *qs;
3240 	int i, err, dump_end, idx;
3241 	static int multiplier = 1;
3242 	struct sbuf *sb;
3243 	struct rsp_desc *rspd;
3244 	uint32_t data[4];
3245 
3246 	rspq = arg1;
3247 	qs = rspq_to_qset(rspq);
3248 	if (rspq->rspq_dump_count == 0)
3249 		return (0);
3250 	if (rspq->rspq_dump_count > RSPQ_Q_SIZE) {
3251 		log(LOG_WARNING,
3252 		    "dump count is too large %d\n", rspq->rspq_dump_count);
3253 		rspq->rspq_dump_count = 0;
3254 		return (EINVAL);
3255 	}
3256 	if (rspq->rspq_dump_start > (RSPQ_Q_SIZE-1)) {
3257 		log(LOG_WARNING,
3258 		    "dump start of %d is greater than queue size\n",
3259 		    rspq->rspq_dump_start);
3260 		rspq->rspq_dump_start = 0;
3261 		return (EINVAL);
3262 	}
3263 	err = t3_sge_read_rspq(qs->port->adapter, rspq->cntxt_id, data);
3264 	if (err)
3265 		return (err);
3266 retry_sbufops:
3267 	sb = sbuf_new(NULL, NULL, QDUMP_SBUF_SIZE*multiplier, SBUF_FIXEDLEN);
3268 
3269 	sbuf_printf(sb, " \n index=%u size=%u MSI-X/RspQ=%u intr enable=%u intr armed=%u\n",
3270 	    (data[0] & 0xffff), data[0] >> 16, ((data[2] >> 20) & 0x3f),
3271 	    ((data[2] >> 26) & 1), ((data[2] >> 27) & 1));
3272 	sbuf_printf(sb, " generation=%u CQ mode=%u FL threshold=%u\n",
3273 	    ((data[2] >> 28) & 1), ((data[2] >> 31) & 1), data[3]);
3274 
3275 	sbuf_printf(sb, " start=%d -> end=%d\n", rspq->rspq_dump_start,
3276 	    (rspq->rspq_dump_start + rspq->rspq_dump_count) & (RSPQ_Q_SIZE-1));
3277 
3278 	dump_end = rspq->rspq_dump_start + rspq->rspq_dump_count;
3279 	for (i = rspq->rspq_dump_start; i < dump_end; i++) {
3280 		idx = i & (RSPQ_Q_SIZE-1);
3281 
3282 		rspd = &rspq->desc[idx];
3283 		sbuf_printf(sb, "\tidx=%04d opcode=%02x cpu_idx=%x hash_type=%x cq_idx=%x\n",
3284 		    idx, rspd->rss_hdr.opcode, rspd->rss_hdr.cpu_idx,
3285 		    rspd->rss_hdr.hash_type, be16toh(rspd->rss_hdr.cq_idx));
3286 		sbuf_printf(sb, "\trss_hash_val=%x flags=%08x len_cq=%x intr_gen=%x\n",
3287 		    rspd->rss_hdr.rss_hash_val, be32toh(rspd->flags),
3288 		    be32toh(rspd->len_cq), rspd->intr_gen);
3289 	}
3290 	if (sbuf_overflowed(sb)) {
3291 		sbuf_delete(sb);
3292 		multiplier++;
3293 		goto retry_sbufops;
3294 	}
3295 	sbuf_finish(sb);
3296 	err = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
3297 	sbuf_delete(sb);
3298 	return (err);
3299 }
3300 
3301 static int
3302 t3_dump_txq_eth(SYSCTL_HANDLER_ARGS)
3303 {
3304 	struct sge_txq *txq;
3305 	struct sge_qset *qs;
3306 	int i, j, err, dump_end;
3307 	static int multiplier = 1;
3308 	struct sbuf *sb;
3309 	struct tx_desc *txd;
3310 	uint32_t *WR, wr_hi, wr_lo, gen;
3311 	uint32_t data[4];
3312 
3313 	txq = arg1;
3314 	qs = txq_to_qset(txq, TXQ_ETH);
3315 	if (txq->txq_dump_count == 0) {
3316 		return (0);
3317 	}
3318 	if (txq->txq_dump_count > TX_ETH_Q_SIZE) {
3319 		log(LOG_WARNING,
3320 		    "dump count is too large %d\n", txq->txq_dump_count);
3321 		txq->txq_dump_count = 1;
3322 		return (EINVAL);
3323 	}
3324 	if (txq->txq_dump_start > (TX_ETH_Q_SIZE-1)) {
3325 		log(LOG_WARNING,
3326 		    "dump start of %d is greater than queue size\n",
3327 		    txq->txq_dump_start);
3328 		txq->txq_dump_start = 0;
3329 		return (EINVAL);
3330 	}
3331 	err = t3_sge_read_ecntxt(qs->port->adapter, qs->rspq.cntxt_id, data);
3332 	if (err)
3333 		return (err);
3334 
3335 
3336 retry_sbufops:
3337 	sb = sbuf_new(NULL, NULL, QDUMP_SBUF_SIZE*multiplier, SBUF_FIXEDLEN);
3338 
3339 	sbuf_printf(sb, " \n credits=%u GTS=%u index=%u size=%u rspq#=%u cmdq#=%u\n",
3340 	    (data[0] & 0x7fff), ((data[0] >> 15) & 1), (data[0] >> 16),
3341 	    (data[1] & 0xffff), ((data[3] >> 4) & 7), ((data[3] >> 7) & 1));
3342 	sbuf_printf(sb, " TUN=%u TOE=%u generation%u uP token=%u valid=%u\n",
3343 	    ((data[3] >> 8) & 1), ((data[3] >> 9) & 1), ((data[3] >> 10) & 1),
3344 	    ((data[3] >> 11) & 0xfffff), ((data[3] >> 31) & 1));
3345 	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
3346 	    txq->txq_dump_start,
3347 	    (txq->txq_dump_start + txq->txq_dump_count) & (TX_ETH_Q_SIZE-1));
3348 
3349 	dump_end = txq->txq_dump_start + txq->txq_dump_count;
3350 	for (i = txq->txq_dump_start; i < dump_end; i++) {
3351 		txd = &txq->desc[i & (TX_ETH_Q_SIZE-1)];
3352 		WR = (uint32_t *)txd->flit;
3353 		wr_hi = ntohl(WR[0]);
3354 		wr_lo = ntohl(WR[1]);
3355 		gen = G_WR_GEN(wr_lo);
3356 
3357 		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
3358 		    wr_hi, wr_lo, gen);
3359 		for (j = 2; j < 30; j += 4)
3360 			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
3361 			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
3362 
3363 	}
3364 	if (sbuf_overflowed(sb)) {
3365 		sbuf_delete(sb);
3366 		multiplier++;
3367 		goto retry_sbufops;
3368 	}
3369 	sbuf_finish(sb);
3370 	err = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
3371 	sbuf_delete(sb);
3372 	return (err);
3373 }
3374 
3375 static int
3376 t3_dump_txq_ctrl(SYSCTL_HANDLER_ARGS)
3377 {
3378 	struct sge_txq *txq;
3379 	struct sge_qset *qs;
3380 	int i, j, err, dump_end;
3381 	static int multiplier = 1;
3382 	struct sbuf *sb;
3383 	struct tx_desc *txd;
3384 	uint32_t *WR, wr_hi, wr_lo, gen;
3385 
3386 	txq = arg1;
3387 	qs = txq_to_qset(txq, TXQ_CTRL);
3388 	if (txq->txq_dump_count == 0) {
3389 		return (0);
3390 	}
3391 	if (txq->txq_dump_count > 256) {
3392 		log(LOG_WARNING,
3393 		    "dump count is too large %d\n", txq->txq_dump_count);
3394 		txq->txq_dump_count = 1;
3395 		return (EINVAL);
3396 	}
3397 	if (txq->txq_dump_start > 255) {
3398 		log(LOG_WARNING,
3399 		    "dump start of %d is greater than queue size\n",
3400 		    txq->txq_dump_start);
3401 		txq->txq_dump_start = 0;
3402 		return (EINVAL);
3403 	}
3404 
3405 retry_sbufops:
3406 	sb = sbuf_new(NULL, NULL, QDUMP_SBUF_SIZE*multiplier, SBUF_FIXEDLEN);
3407 	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
3408 	    txq->txq_dump_start,
3409 	    (txq->txq_dump_start + txq->txq_dump_count) & 255);
3410 
3411 	dump_end = txq->txq_dump_start + txq->txq_dump_count;
3412 	for (i = txq->txq_dump_start; i < dump_end; i++) {
3413 		txd = &txq->desc[i & (255)];
3414 		WR = (uint32_t *)txd->flit;
3415 		wr_hi = ntohl(WR[0]);
3416 		wr_lo = ntohl(WR[1]);
3417 		gen = G_WR_GEN(wr_lo);
3418 
3419 		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
3420 		    wr_hi, wr_lo, gen);
3421 		for (j = 2; j < 30; j += 4)
3422 			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
3423 			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
3424 
3425 	}
3426 	if (sbuf_overflowed(sb)) {
3427 		sbuf_delete(sb);
3428 		multiplier++;
3429 		goto retry_sbufops;
3430 	}
3431 	sbuf_finish(sb);
3432 	err = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
3433 	sbuf_delete(sb);
3434 	return (err);
3435 }
3436 
3437 static int
3438 t3_set_coalesce_usecs(SYSCTL_HANDLER_ARGS)
3439 {
3440 	adapter_t *sc = arg1;
3441 	struct qset_params *qsp = &sc->params.sge.qset[0];
3442 	int coalesce_usecs;
3443 	struct sge_qset *qs;
3444 	int i, j, err, nqsets = 0;
3445 	struct mtx *lock;
3446 
3447 	if ((sc->flags & FULL_INIT_DONE) == 0)
3448 		return (ENXIO);
3449 
3450 	coalesce_usecs = qsp->coalesce_usecs;
3451         err = sysctl_handle_int(oidp, &coalesce_usecs, arg2, req);
3452 
3453 	if (err != 0) {
3454 		return (err);
3455 	}
3456 	if (coalesce_usecs == qsp->coalesce_usecs)
3457 		return (0);
3458 
3459 	for (i = 0; i < sc->params.nports; i++)
3460 		for (j = 0; j < sc->port[i].nqsets; j++)
3461 			nqsets++;
3462 
3463 	coalesce_usecs = max(1, coalesce_usecs);
3464 
3465 	for (i = 0; i < nqsets; i++) {
3466 		qs = &sc->sge.qs[i];
3467 		qsp = &sc->params.sge.qset[i];
3468 		qsp->coalesce_usecs = coalesce_usecs;
3469 
3470 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
3471 			    &sc->sge.qs[0].rspq.lock;
3472 
3473 		mtx_lock(lock);
3474 		t3_update_qset_coalesce(qs, qsp);
3475 		t3_write_reg(sc, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) |
3476 		    V_NEWTIMER(qs->rspq.holdoff_tmr));
3477 		mtx_unlock(lock);
3478 	}
3479 
3480 	return (0);
3481 }
3482 
3483 
3484 void
3485 t3_add_attach_sysctls(adapter_t *sc)
3486 {
3487 	struct sysctl_ctx_list *ctx;
3488 	struct sysctl_oid_list *children;
3489 
3490 	ctx = device_get_sysctl_ctx(sc->dev);
3491 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3492 
3493 	/* random information */
3494 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
3495 	    "firmware_version",
3496 	    CTLFLAG_RD, &sc->fw_version,
3497 	    0, "firmware version");
3498 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3499 	    "hw_revision",
3500 	    CTLFLAG_RD, &sc->params.rev,
3501 	    0, "chip model");
3502 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
3503 	    "port_types",
3504 	    CTLFLAG_RD, &sc->port_types,
3505 	    0, "type of ports");
3506 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3507 	    "enable_debug",
3508 	    CTLFLAG_RW, &cxgb_debug,
3509 	    0, "enable verbose debugging output");
3510 	SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "tunq_coalesce",
3511 	    CTLFLAG_RD, &sc->tunq_coalesce,
3512 	    "#tunneled packets freed");
3513 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3514 	    "txq_overrun",
3515 	    CTLFLAG_RD, &txq_fills,
3516 	    0, "#times txq overrun");
3517 }
3518 
3519 
3520 static const char *rspq_name = "rspq";
3521 static const char *txq_names[] =
3522 {
3523 	"txq_eth",
3524 	"txq_ofld",
3525 	"txq_ctrl"
3526 };
3527 
3528 static int
3529 sysctl_handle_macstat(SYSCTL_HANDLER_ARGS)
3530 {
3531 	struct port_info *p = arg1;
3532 	uint64_t *parg;
3533 
3534 	if (!p)
3535 		return (EINVAL);
3536 
3537 	parg = (uint64_t *) ((uint8_t *)&p->mac.stats + arg2);
3538 	PORT_LOCK(p);
3539 	t3_mac_update_stats(&p->mac);
3540 	PORT_UNLOCK(p);
3541 
3542 	return (sysctl_handle_quad(oidp, parg, 0, req));
3543 }
3544 
3545 void
3546 t3_add_configured_sysctls(adapter_t *sc)
3547 {
3548 	struct sysctl_ctx_list *ctx;
3549 	struct sysctl_oid_list *children;
3550 	int i, j;
3551 
3552 	ctx = device_get_sysctl_ctx(sc->dev);
3553 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3554 
3555 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
3556 	    "intr_coal",
3557 	    CTLTYPE_INT|CTLFLAG_RW, sc,
3558 	    0, t3_set_coalesce_usecs,
3559 	    "I", "interrupt coalescing timer (us)");
3560 
3561 	for (i = 0; i < sc->params.nports; i++) {
3562 		struct port_info *pi = &sc->port[i];
3563 		struct sysctl_oid *poid;
3564 		struct sysctl_oid_list *poidlist;
3565 		struct mac_stats *mstats = &pi->mac.stats;
3566 
3567 		snprintf(pi->namebuf, PORT_NAME_LEN, "port%d", i);
3568 		poid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO,
3569 		    pi->namebuf, CTLFLAG_RD, NULL, "port statistics");
3570 		poidlist = SYSCTL_CHILDREN(poid);
3571 		SYSCTL_ADD_INT(ctx, poidlist, OID_AUTO,
3572 		    "nqsets", CTLFLAG_RD, &pi->nqsets,
3573 		    0, "#queue sets");
3574 
3575 		for (j = 0; j < pi->nqsets; j++) {
3576 			struct sge_qset *qs = &sc->sge.qs[pi->first_qset + j];
3577 			struct sysctl_oid *qspoid, *rspqpoid, *txqpoid,
3578 					  *ctrlqpoid, *lropoid;
3579 			struct sysctl_oid_list *qspoidlist, *rspqpoidlist,
3580 					       *txqpoidlist, *ctrlqpoidlist,
3581 					       *lropoidlist;
3582 			struct sge_txq *txq = &qs->txq[TXQ_ETH];
3583 
3584 			snprintf(qs->namebuf, QS_NAME_LEN, "qs%d", j);
3585 
3586 			qspoid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO,
3587 			    qs->namebuf, CTLFLAG_RD, NULL, "qset statistics");
3588 			qspoidlist = SYSCTL_CHILDREN(qspoid);
3589 
3590 			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl0_empty",
3591 					CTLFLAG_RD, &qs->fl[0].empty, 0,
3592 					"freelist #0 empty");
3593 			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl1_empty",
3594 					CTLFLAG_RD, &qs->fl[1].empty, 0,
3595 					"freelist #1 empty");
3596 
3597 			rspqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3598 			    rspq_name, CTLFLAG_RD, NULL, "rspq statistics");
3599 			rspqpoidlist = SYSCTL_CHILDREN(rspqpoid);
3600 
3601 			txqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3602 			    txq_names[0], CTLFLAG_RD, NULL, "txq statistics");
3603 			txqpoidlist = SYSCTL_CHILDREN(txqpoid);
3604 
3605 			ctrlqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3606 			    txq_names[2], CTLFLAG_RD, NULL, "ctrlq statistics");
3607 			ctrlqpoidlist = SYSCTL_CHILDREN(ctrlqpoid);
3608 
3609 			lropoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3610 			    "lro_stats", CTLFLAG_RD, NULL, "LRO statistics");
3611 			lropoidlist = SYSCTL_CHILDREN(lropoid);
3612 
3613 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "size",
3614 			    CTLFLAG_RD, &qs->rspq.size,
3615 			    0, "#entries in response queue");
3616 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "cidx",
3617 			    CTLFLAG_RD, &qs->rspq.cidx,
3618 			    0, "consumer index");
3619 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "credits",
3620 			    CTLFLAG_RD, &qs->rspq.credits,
3621 			    0, "#credits");
3622 			SYSCTL_ADD_XLONG(ctx, rspqpoidlist, OID_AUTO, "phys_addr",
3623 			    CTLFLAG_RD, &qs->rspq.phys_addr,
3624 			    "physical_address_of the queue");
3625 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_start",
3626 			    CTLFLAG_RW, &qs->rspq.rspq_dump_start,
3627 			    0, "start rspq dump entry");
3628 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_count",
3629 			    CTLFLAG_RW, &qs->rspq.rspq_dump_count,
3630 			    0, "#rspq entries to dump");
3631 			SYSCTL_ADD_PROC(ctx, rspqpoidlist, OID_AUTO, "qdump",
3632 			    CTLTYPE_STRING | CTLFLAG_RD, &qs->rspq,
3633 			    0, t3_dump_rspq, "A", "dump of the response queue");
3634 
3635 
3636 			SYSCTL_ADD_INT(ctx, txqpoidlist, OID_AUTO, "dropped",
3637 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].txq_drops,
3638 			    0, "#tunneled packets dropped");
3639 			SYSCTL_ADD_INT(ctx, txqpoidlist, OID_AUTO, "sendqlen",
3640 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].sendq.qlen,
3641 			    0, "#tunneled packets waiting to be sent");
3642 #if 0
3643 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_pidx",
3644 			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_prod,
3645 			    0, "#tunneled packets queue producer index");
3646 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_cidx",
3647 			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_cons,
3648 			    0, "#tunneled packets queue consumer index");
3649 #endif
3650 			SYSCTL_ADD_INT(ctx, txqpoidlist, OID_AUTO, "processed",
3651 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].processed,
3652 			    0, "#tunneled packets processed by the card");
3653 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "cleaned",
3654 			    CTLFLAG_RD, &txq->cleaned,
3655 			    0, "#tunneled packets cleaned");
3656 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "in_use",
3657 			    CTLFLAG_RD, &txq->in_use,
3658 			    0, "#tunneled packet slots in use");
3659 			SYSCTL_ADD_ULONG(ctx, txqpoidlist, OID_AUTO, "frees",
3660 			    CTLFLAG_RD, &txq->txq_frees,
3661 			    "#tunneled packets freed");
3662 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "skipped",
3663 			    CTLFLAG_RD, &txq->txq_skipped,
3664 			    0, "#tunneled packet descriptors skipped");
3665 			SYSCTL_ADD_QUAD(ctx, txqpoidlist, OID_AUTO, "coalesced",
3666 			    CTLFLAG_RD, &txq->txq_coalesced,
3667 			    "#tunneled packets coalesced");
3668 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "enqueued",
3669 			    CTLFLAG_RD, &txq->txq_enqueued,
3670 			    0, "#tunneled packets enqueued to hardware");
3671 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "stopped_flags",
3672 			    CTLFLAG_RD, &qs->txq_stopped,
3673 			    0, "tx queues stopped");
3674 			SYSCTL_ADD_XLONG(ctx, txqpoidlist, OID_AUTO, "phys_addr",
3675 			    CTLFLAG_RD, &txq->phys_addr,
3676 			    "physical_address_of the queue");
3677 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "qgen",
3678 			    CTLFLAG_RW, &qs->txq[TXQ_ETH].gen,
3679 			    0, "txq generation");
3680 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_cidx",
3681 			    CTLFLAG_RD, &txq->cidx,
3682 			    0, "hardware queue cidx");
3683 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_pidx",
3684 			    CTLFLAG_RD, &txq->pidx,
3685 			    0, "hardware queue pidx");
3686 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_start",
3687 			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_start,
3688 			    0, "txq start idx for dump");
3689 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_count",
3690 			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_count,
3691 			    0, "txq #entries to dump");
3692 			SYSCTL_ADD_PROC(ctx, txqpoidlist, OID_AUTO, "qdump",
3693 			    CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_ETH],
3694 			    0, t3_dump_txq_eth, "A", "dump of the transmit queue");
3695 
3696 			SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_start",
3697 			    CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_start,
3698 			    0, "ctrlq start idx for dump");
3699 			SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_count",
3700 			    CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_count,
3701 			    0, "ctrl #entries to dump");
3702 			SYSCTL_ADD_PROC(ctx, ctrlqpoidlist, OID_AUTO, "qdump",
3703 			    CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_CTRL],
3704 			    0, t3_dump_txq_ctrl, "A", "dump of the transmit queue");
3705 
3706 #ifdef LRO_SUPPORTED
3707 			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_queued",
3708 			    CTLFLAG_RD, &qs->lro.ctrl.lro_queued, 0, NULL);
3709 			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_flushed",
3710 			    CTLFLAG_RD, &qs->lro.ctrl.lro_flushed, 0, NULL);
3711 			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_bad_csum",
3712 			    CTLFLAG_RD, &qs->lro.ctrl.lro_bad_csum, 0, NULL);
3713 			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_cnt",
3714 			    CTLFLAG_RD, &qs->lro.ctrl.lro_cnt, 0, NULL);
3715 #endif
3716 		}
3717 
3718 		/* Now add a node for mac stats. */
3719 		poid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO, "mac_stats",
3720 		    CTLFLAG_RD, NULL, "MAC statistics");
3721 		poidlist = SYSCTL_CHILDREN(poid);
3722 
3723 		/*
3724 		 * We (ab)use the length argument (arg2) to pass on the offset
3725 		 * of the data that we are interested in.  This is only required
3726 		 * for the quad counters that are updated from the hardware (we
3727 		 * make sure that we return the latest value).
3728 		 * sysctl_handle_macstat first updates *all* the counters from
3729 		 * the hardware, and then returns the latest value of the
3730 		 * requested counter.  Best would be to update only the
3731 		 * requested counter from hardware, but t3_mac_update_stats()
3732 		 * hides all the register details and we don't want to dive into
3733 		 * all that here.
3734 		 */
3735 #define CXGB_SYSCTL_ADD_QUAD(a)	SYSCTL_ADD_OID(ctx, poidlist, OID_AUTO, #a, \
3736     (CTLTYPE_QUAD | CTLFLAG_RD), pi, offsetof(struct mac_stats, a), \
3737     sysctl_handle_macstat, "QU", 0)
3738 		CXGB_SYSCTL_ADD_QUAD(tx_octets);
3739 		CXGB_SYSCTL_ADD_QUAD(tx_octets_bad);
3740 		CXGB_SYSCTL_ADD_QUAD(tx_frames);
3741 		CXGB_SYSCTL_ADD_QUAD(tx_mcast_frames);
3742 		CXGB_SYSCTL_ADD_QUAD(tx_bcast_frames);
3743 		CXGB_SYSCTL_ADD_QUAD(tx_pause);
3744 		CXGB_SYSCTL_ADD_QUAD(tx_deferred);
3745 		CXGB_SYSCTL_ADD_QUAD(tx_late_collisions);
3746 		CXGB_SYSCTL_ADD_QUAD(tx_total_collisions);
3747 		CXGB_SYSCTL_ADD_QUAD(tx_excess_collisions);
3748 		CXGB_SYSCTL_ADD_QUAD(tx_underrun);
3749 		CXGB_SYSCTL_ADD_QUAD(tx_len_errs);
3750 		CXGB_SYSCTL_ADD_QUAD(tx_mac_internal_errs);
3751 		CXGB_SYSCTL_ADD_QUAD(tx_excess_deferral);
3752 		CXGB_SYSCTL_ADD_QUAD(tx_fcs_errs);
3753 		CXGB_SYSCTL_ADD_QUAD(tx_frames_64);
3754 		CXGB_SYSCTL_ADD_QUAD(tx_frames_65_127);
3755 		CXGB_SYSCTL_ADD_QUAD(tx_frames_128_255);
3756 		CXGB_SYSCTL_ADD_QUAD(tx_frames_256_511);
3757 		CXGB_SYSCTL_ADD_QUAD(tx_frames_512_1023);
3758 		CXGB_SYSCTL_ADD_QUAD(tx_frames_1024_1518);
3759 		CXGB_SYSCTL_ADD_QUAD(tx_frames_1519_max);
3760 		CXGB_SYSCTL_ADD_QUAD(rx_octets);
3761 		CXGB_SYSCTL_ADD_QUAD(rx_octets_bad);
3762 		CXGB_SYSCTL_ADD_QUAD(rx_frames);
3763 		CXGB_SYSCTL_ADD_QUAD(rx_mcast_frames);
3764 		CXGB_SYSCTL_ADD_QUAD(rx_bcast_frames);
3765 		CXGB_SYSCTL_ADD_QUAD(rx_pause);
3766 		CXGB_SYSCTL_ADD_QUAD(rx_fcs_errs);
3767 		CXGB_SYSCTL_ADD_QUAD(rx_align_errs);
3768 		CXGB_SYSCTL_ADD_QUAD(rx_symbol_errs);
3769 		CXGB_SYSCTL_ADD_QUAD(rx_data_errs);
3770 		CXGB_SYSCTL_ADD_QUAD(rx_sequence_errs);
3771 		CXGB_SYSCTL_ADD_QUAD(rx_runt);
3772 		CXGB_SYSCTL_ADD_QUAD(rx_jabber);
3773 		CXGB_SYSCTL_ADD_QUAD(rx_short);
3774 		CXGB_SYSCTL_ADD_QUAD(rx_too_long);
3775 		CXGB_SYSCTL_ADD_QUAD(rx_mac_internal_errs);
3776 		CXGB_SYSCTL_ADD_QUAD(rx_cong_drops);
3777 		CXGB_SYSCTL_ADD_QUAD(rx_frames_64);
3778 		CXGB_SYSCTL_ADD_QUAD(rx_frames_65_127);
3779 		CXGB_SYSCTL_ADD_QUAD(rx_frames_128_255);
3780 		CXGB_SYSCTL_ADD_QUAD(rx_frames_256_511);
3781 		CXGB_SYSCTL_ADD_QUAD(rx_frames_512_1023);
3782 		CXGB_SYSCTL_ADD_QUAD(rx_frames_1024_1518);
3783 		CXGB_SYSCTL_ADD_QUAD(rx_frames_1519_max);
3784 #undef CXGB_SYSCTL_ADD_QUAD
3785 
3786 #define CXGB_SYSCTL_ADD_ULONG(a) SYSCTL_ADD_ULONG(ctx, poidlist, OID_AUTO, #a, \
3787     CTLFLAG_RD, &mstats->a, 0)
3788 		CXGB_SYSCTL_ADD_ULONG(tx_fifo_parity_err);
3789 		CXGB_SYSCTL_ADD_ULONG(rx_fifo_parity_err);
3790 		CXGB_SYSCTL_ADD_ULONG(tx_fifo_urun);
3791 		CXGB_SYSCTL_ADD_ULONG(rx_fifo_ovfl);
3792 		CXGB_SYSCTL_ADD_ULONG(serdes_signal_loss);
3793 		CXGB_SYSCTL_ADD_ULONG(xaui_pcs_ctc_err);
3794 		CXGB_SYSCTL_ADD_ULONG(xaui_pcs_align_change);
3795 		CXGB_SYSCTL_ADD_ULONG(num_toggled);
3796 		CXGB_SYSCTL_ADD_ULONG(num_resets);
3797 		CXGB_SYSCTL_ADD_ULONG(link_faults);
3798 #undef CXGB_SYSCTL_ADD_ULONG
3799 	}
3800 }
3801 
3802 /**
3803  *	t3_get_desc - dump an SGE descriptor for debugging purposes
3804  *	@qs: the queue set
3805  *	@qnum: identifies the specific queue (0..2: Tx, 3:response, 4..5: Rx)
3806  *	@idx: the descriptor index in the queue
3807  *	@data: where to dump the descriptor contents
3808  *
3809  *	Dumps the contents of a HW descriptor of an SGE queue.  Returns the
3810  *	size of the descriptor.
3811  */
3812 int
3813 t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx,
3814 		unsigned char *data)
3815 {
3816 	if (qnum >= 6)
3817 		return (EINVAL);
3818 
3819 	if (qnum < 3) {
3820 		if (!qs->txq[qnum].desc || idx >= qs->txq[qnum].size)
3821 			return -EINVAL;
3822 		memcpy(data, &qs->txq[qnum].desc[idx], sizeof(struct tx_desc));
3823 		return sizeof(struct tx_desc);
3824 	}
3825 
3826 	if (qnum == 3) {
3827 		if (!qs->rspq.desc || idx >= qs->rspq.size)
3828 			return (EINVAL);
3829 		memcpy(data, &qs->rspq.desc[idx], sizeof(struct rsp_desc));
3830 		return sizeof(struct rsp_desc);
3831 	}
3832 
3833 	qnum -= 4;
3834 	if (!qs->fl[qnum].desc || idx >= qs->fl[qnum].size)
3835 		return (EINVAL);
3836 	memcpy(data, &qs->fl[qnum].desc[idx], sizeof(struct rx_desc));
3837 	return sizeof(struct rx_desc);
3838 }
3839