xref: /freebsd/sys/dev/cxgb/cxgb_sge.c (revision 9bd497b8354567454e075076d40c996e21bd6095)
1 /**************************************************************************
2 
3 Copyright (c) 2007-2009, Chelsio Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Neither the name of the Chelsio Corporation nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15 
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27 
28 ***************************************************************************/
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/kernel.h>
36 #include <sys/module.h>
37 #include <sys/bus.h>
38 #include <sys/conf.h>
39 #include <machine/bus.h>
40 #include <machine/resource.h>
41 #include <sys/bus_dma.h>
42 #include <sys/rman.h>
43 #include <sys/queue.h>
44 #include <sys/sysctl.h>
45 #include <sys/taskqueue.h>
46 
47 #include <sys/proc.h>
48 #include <sys/sbuf.h>
49 #include <sys/sched.h>
50 #include <sys/smp.h>
52 #include <sys/syslog.h>
53 
54 #include <net/bpf.h>
55 
56 #include <netinet/in_systm.h>
57 #include <netinet/in.h>
58 #include <netinet/ip.h>
59 #include <netinet/tcp.h>
60 
61 #include <dev/pci/pcireg.h>
62 #include <dev/pci/pcivar.h>
63 
64 #include <vm/vm.h>
65 #include <vm/pmap.h>
66 
67 #include <cxgb_include.h>
68 #include <sys/mvec.h>
69 
70 int	txq_fills = 0;
71 int	multiq_tx_enable = 1;
72 
73 extern struct sysctl_oid_list sysctl__hw_cxgb_children;
74 int cxgb_txq_buf_ring_size = TX_ETH_Q_SIZE;
75 TUNABLE_INT("hw.cxgb.txq_mr_size", &cxgb_txq_buf_ring_size);
76 SYSCTL_UINT(_hw_cxgb, OID_AUTO, txq_mr_size, CTLFLAG_RDTUN, &cxgb_txq_buf_ring_size, 0,
77     "size of per-queue mbuf ring");
78 
79 static int cxgb_tx_coalesce_force = 0;
80 TUNABLE_INT("hw.cxgb.tx_coalesce_force", &cxgb_tx_coalesce_force);
81 SYSCTL_UINT(_hw_cxgb, OID_AUTO, tx_coalesce_force, CTLFLAG_RW,
82     &cxgb_tx_coalesce_force, 0,
83     "coalesce small packets into a single work request regardless of ring state");
84 
85 #define	COALESCE_START_DEFAULT		(TX_ETH_Q_SIZE>>1)
86 #define	COALESCE_START_MAX		(TX_ETH_Q_SIZE-(TX_ETH_Q_SIZE>>3))
87 #define	COALESCE_STOP_DEFAULT		(TX_ETH_Q_SIZE>>2)
88 #define	COALESCE_STOP_MIN		(TX_ETH_Q_SIZE>>5)
89 #define	TX_RECLAIM_DEFAULT		(TX_ETH_Q_SIZE>>5)
90 #define	TX_RECLAIM_MAX			(TX_ETH_Q_SIZE>>2)
91 #define	TX_RECLAIM_MIN			(TX_ETH_Q_SIZE>>6)
92 
93 
94 static int cxgb_tx_coalesce_enable_start = COALESCE_START_DEFAULT;
95 TUNABLE_INT("hw.cxgb.tx_coalesce_enable_start",
96     &cxgb_tx_coalesce_enable_start);
97 SYSCTL_UINT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_start, CTLFLAG_RW,
98     &cxgb_tx_coalesce_enable_start, 0,
99     "coalesce enable threshold");
100 static int cxgb_tx_coalesce_enable_stop = COALESCE_STOP_DEFAULT;
101 TUNABLE_INT("hw.cxgb.tx_coalesce_enable_stop", &cxgb_tx_coalesce_enable_stop);
102 SYSCTL_UINT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_stop, CTLFLAG_RW,
103     &cxgb_tx_coalesce_enable_stop, 0,
104     "coalesce disable threshold");
105 static int cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT;
106 TUNABLE_INT("hw.cxgb.tx_reclaim_threshold", &cxgb_tx_reclaim_threshold);
107 SYSCTL_UINT(_hw_cxgb, OID_AUTO, tx_reclaim_threshold, CTLFLAG_RW,
108     &cxgb_tx_reclaim_threshold, 0,
109     "tx cleaning minimum threshold");
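
/*
 * Example settings (values purely illustrative): read-only tunables such as
 * hw.cxgb.txq_mr_size must be set from loader.conf before the driver
 * attaches, e.g. hw.cxgb.txq_mr_size="4096", while the CTLFLAG_RW knobs can
 * also be changed at runtime, e.g. sysctl hw.cxgb.tx_coalesce_force=1.
 */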
110 
111 /*
112  * XXX don't re-enable this until TOE stops assuming
113  * we have an m_ext
114  */
115 static int recycle_enable = 0;
116 int cxgb_ext_freed = 0;
117 int cxgb_ext_inited = 0;
118 int fl_q_size = 0;
119 int jumbo_q_size = 0;
120 
121 extern int cxgb_use_16k_clusters;
122 extern int nmbjumbo4;
123 extern int nmbjumbo9;
124 extern int nmbjumbo16;
125 
126 #define USE_GTS 0
127 
128 #define SGE_RX_SM_BUF_SIZE	1536
129 #define SGE_RX_DROP_THRES	16
130 #define SGE_RX_COPY_THRES	128
131 
132 /*
133  * Period of the Tx buffer reclaim timer.  This timer does not need to run
134  * frequently as Tx buffers are usually reclaimed by new Tx packets.
135  */
136 #define TX_RECLAIM_PERIOD       (hz >> 1)
137 
138 /*
139  * Values for sge_txq.flags
140  */
141 enum {
142 	TXQ_RUNNING	= 1 << 0,  /* fetch engine is running */
143 	TXQ_LAST_PKT_DB = 1 << 1,  /* last packet rang the doorbell */
144 };
145 
146 struct tx_desc {
147 	uint64_t	flit[TX_DESC_FLITS];
148 } __packed;
149 
150 struct rx_desc {
151 	uint32_t	addr_lo;
152 	uint32_t	len_gen;
153 	uint32_t	gen2;
154 	uint32_t	addr_hi;
155 } __packed;
156 
157 struct rsp_desc {               /* response queue descriptor */
158 	struct rss_header	rss_hdr;
159 	uint32_t		flags;
160 	uint32_t		len_cq;
161 	uint8_t			imm_data[47];
162 	uint8_t			intr_gen;
163 } __packed;
164 
165 #define RX_SW_DESC_MAP_CREATED	(1 << 0)
166 #define TX_SW_DESC_MAP_CREATED	(1 << 1)
167 #define RX_SW_DESC_INUSE        (1 << 3)
168 #define TX_SW_DESC_MAPPED       (1 << 4)
169 
170 #define RSPQ_NSOP_NEOP           G_RSPD_SOP_EOP(0)
171 #define RSPQ_EOP                 G_RSPD_SOP_EOP(F_RSPD_EOP)
172 #define RSPQ_SOP                 G_RSPD_SOP_EOP(F_RSPD_SOP)
173 #define RSPQ_SOP_EOP             G_RSPD_SOP_EOP(F_RSPD_SOP|F_RSPD_EOP)
174 
175 struct tx_sw_desc {                /* SW state per Tx descriptor */
176 	struct mbuf	*m;
177 	bus_dmamap_t	map;
178 	int		flags;
179 };
180 
181 struct rx_sw_desc {                /* SW state per Rx descriptor */
182 	caddr_t		rxsd_cl;
183 	struct mbuf	*m;
184 	bus_dmamap_t	map;
185 	int		flags;
186 };
187 
188 struct txq_state {
189 	unsigned int	compl;
190 	unsigned int	gen;
191 	unsigned int	pidx;
192 };
193 
194 struct refill_fl_cb_arg {
195 	int               error;
196 	bus_dma_segment_t seg;
197 	int               nseg;
198 };
199 
200 
201 /*
202  * Maps a number of flits to the number of Tx descriptors that can hold them.
203  * The formula is
204  *
205  * desc = 1 + (flits - 2) / (WR_FLITS - 1).
206  *
207  * HW allows up to 4 descriptors to be combined into a WR.
208  */
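/*
 * For example, with SGE_NUM_GENBITS == 2 the table below is consistent with
 * WR_FLITS == 15 (the last flit of each descriptor carries the second
 * generation bit, see wr_gen2()): a 30-flit request needs
 * 1 + (30 - 2) / 14 = 3 descriptors.
 */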
209 static uint8_t flit_desc_map[] = {
210 	0,
211 #if SGE_NUM_GENBITS == 1
212 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
213 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
214 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
215 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
216 #elif SGE_NUM_GENBITS == 2
217 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
218 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
219 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
220 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
221 #else
222 # error "SGE_NUM_GENBITS must be 1 or 2"
223 #endif
224 };
225 
226 #define	TXQ_LOCK_ASSERT(qs)	mtx_assert(&(qs)->lock, MA_OWNED)
227 #define	TXQ_TRYLOCK(qs)		mtx_trylock(&(qs)->lock)
228 #define	TXQ_LOCK(qs)		mtx_lock(&(qs)->lock)
229 #define	TXQ_UNLOCK(qs)		mtx_unlock(&(qs)->lock)
230 #define	TXQ_RING_EMPTY(qs)	drbr_empty((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
231 #define	TXQ_RING_FLUSH(qs)	drbr_flush((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
232 #define	TXQ_RING_DEQUEUE_COND(qs, func, arg)				\
233 	drbr_dequeue_cond((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr, func, arg)
234 #define	TXQ_RING_DEQUEUE(qs) \
235 	drbr_dequeue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
236 
237 int cxgb_debug = 0;
238 
239 static void sge_timer_cb(void *arg);
240 static void sge_timer_reclaim(void *arg, int ncount);
241 static void sge_txq_reclaim_handler(void *arg, int ncount);
242 static void cxgb_start_locked(struct sge_qset *qs);
243 
244 /*
245  * XXX need to cope with bursty scheduling by looking at a wider
246  * window than we do now when determining the need for coalescing
247  *
248  */
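/*
 * Decide whether packets dequeued from this queue set's buf_ring should be
 * coalesced into a single work request.  Updates the adapter's per-queue
 * fill state according to the enable-start/enable-stop thresholds and
 * returns sc->tunq_coalesce; a non-zero value tells the caller to coalesce.
 */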
249 static __inline uint64_t
250 check_pkt_coalesce(struct sge_qset *qs)
251 {
252         struct adapter *sc;
253         struct sge_txq *txq;
254 	uint8_t *fill;
255 
256 	if (__predict_false(cxgb_tx_coalesce_force))
257 		return (1);
258 	txq = &qs->txq[TXQ_ETH];
259         sc = qs->port->adapter;
260 	fill = &sc->tunq_fill[qs->idx];
261 
262 	if (cxgb_tx_coalesce_enable_start > COALESCE_START_MAX)
263 		cxgb_tx_coalesce_enable_start = COALESCE_START_MAX;
264 	if (cxgb_tx_coalesce_enable_stop < COALESCE_STOP_MIN)
265 		cxgb_tx_coalesce_enable_stop = COALESCE_STOP_MIN;
266 	/*
267 	 * If the hardware transmit queue's fill level rises above the
268 	 * enable-start threshold we mark the queue set as coalescing; we drop
269 	 * back out of coalescing once it falls below the enable-stop threshold
270 	 * and there are no packets enqueued.  This gives us some hysteresis.
271 	 */
272         if (*fill != 0 && (txq->in_use <= cxgb_tx_coalesce_enable_stop) &&
273 	    TXQ_RING_EMPTY(qs) && (qs->coalescing == 0))
274                 *fill = 0;
275         else if (*fill == 0 && (txq->in_use >= cxgb_tx_coalesce_enable_start))
276                 *fill = 1;
277 
278 	return (sc->tunq_coalesce);
279 }
280 
281 #ifdef __LP64__
282 static void
283 set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo)
284 {
285 	uint64_t wr_hilo;
286 #if _BYTE_ORDER == _LITTLE_ENDIAN
287 	wr_hilo = wr_hi;
288 	wr_hilo |= (((uint64_t)wr_lo)<<32);
289 #else
290 	wr_hilo = wr_lo;
291 	wr_hilo |= (((uint64_t)wr_hi)<<32);
292 #endif
293 	wrp->wrh_hilo = wr_hilo;
294 }
295 #else
296 static void
297 set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo)
298 {
299 
300 	wrp->wrh_hi = wr_hi;
301 	wmb();
302 	wrp->wrh_lo = wr_lo;
303 }
304 #endif
305 
306 struct coalesce_info {
307 	int count;
308 	int nbytes;
309 };
310 
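/*
 * Dequeue predicate used with TXQ_RING_DEQUEUE_COND: accept another mbuf for
 * the current coalesced work request only while the running totals stay
 * within the per-WR limits (at most 7 packets of up to 10500 bytes total,
 * single-segment mbufs only); the first packet is always accepted.
 */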
311 static int
312 coalesce_check(struct mbuf *m, void *arg)
313 {
314 	struct coalesce_info *ci = arg;
315 	int *count = &ci->count;
316 	int *nbytes = &ci->nbytes;
317 
318 	if ((*nbytes == 0) || ((*nbytes + m->m_len <= 10500) &&
319 		(*count < 7) && (m->m_next == NULL))) {
320 		*count += 1;
321 		*nbytes += m->m_len;
322 		return (1);
323 	}
324 	return (0);
325 }
326 
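/*
 * Dequeue the next packet from the queue set's buf_ring.  When coalescing,
 * up to 7 small packets are dequeued and chained through m_nextpkt so that
 * t3_encap() can emit them as a single batched work request (the
 * cpl_tx_pkt_batch path).
 */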
327 static struct mbuf *
328 cxgb_dequeue(struct sge_qset *qs)
329 {
330 	struct mbuf *m, *m_head, *m_tail;
331 	struct coalesce_info ci;
332 
333 
334 	if (check_pkt_coalesce(qs) == 0)
335 		return TXQ_RING_DEQUEUE(qs);
336 
337 	m_head = m_tail = NULL;
338 	ci.count = ci.nbytes = 0;
339 	do {
340 		m = TXQ_RING_DEQUEUE_COND(qs, coalesce_check, &ci);
341 		if (m_head == NULL) {
342 			m_tail = m_head = m;
343 		} else if (m != NULL) {
344 			m_tail->m_nextpkt = m;
345 			m_tail = m;
346 		}
347 	} while (m != NULL);
348 	if (ci.count > 7)
349 		panic("trying to coalesce %d packets into one WR", ci.count);
350 	return (m_head);
351 }
352 
353 /**
354  *	reclaim_completed_tx - reclaims completed Tx descriptors
355  *	@qs: the queue set whose Tx queue is to be reclaimed
356  *	@queue: the Tx queue (by index) to reclaim completed descriptors from
357  *
358  *	Reclaims Tx descriptors that the SGE has indicated it has processed,
359  *	and frees the associated buffers if possible.  Called with the Tx
360  *	queue's lock held.
361  */
362 static __inline int
363 reclaim_completed_tx(struct sge_qset *qs, int reclaim_min, int queue)
364 {
365 	struct sge_txq *q = &qs->txq[queue];
366 	int reclaim = desc_reclaimable(q);
367 
368 	if ((cxgb_tx_reclaim_threshold > TX_RECLAIM_MAX) ||
369 	    (cxgb_tx_reclaim_threshold < TX_RECLAIM_MIN))
370 		cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT;
371 
372 	if (reclaim < reclaim_min)
373 		return (0);
374 
375 	mtx_assert(&qs->lock, MA_OWNED);
376 	if (reclaim > 0) {
377 		t3_free_tx_desc(qs, reclaim, queue);
378 		q->cleaned += reclaim;
379 		q->in_use -= reclaim;
380 	}
381 	if (isset(&qs->txq_stopped, TXQ_ETH))
382                 clrbit(&qs->txq_stopped, TXQ_ETH);
383 
384 	return (reclaim);
385 }
386 
387 /**
388  *	should_restart_tx - are there enough resources to restart a Tx queue?
389  *	@q: the Tx queue
390  *
391  *	Checks if there are enough descriptors to restart a suspended Tx queue.
392  */
393 static __inline int
394 should_restart_tx(const struct sge_txq *q)
395 {
396 	unsigned int r = q->processed - q->cleaned;
397 
398 	return q->in_use - r < (q->size >> 1);
399 }
400 
401 /**
402  *	t3_sge_init - initialize SGE
403  *	@adap: the adapter
404  *	@p: the SGE parameters
405  *
406  *	Performs SGE initialization needed every time after a chip reset.
407  *	We do not initialize any of the queue sets here, instead the driver
408  *	top-level must request those individually.  We also do not enable DMA
409  *	here, that should be done after the queues have been set up.
410  */
411 void
412 t3_sge_init(adapter_t *adap, struct sge_params *p)
413 {
414 	u_int ctrl, ups;
415 
416 	ups = 0; /* = ffs(pci_resource_len(adap->pdev, 2) >> 12); */
417 
418 	ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL |
419 	       F_CQCRDTCTRL | F_CONGMODE | F_TNLFLMODE | F_FATLPERREN |
420 	       V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS |
421 	       V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING;
422 #if SGE_NUM_GENBITS == 1
423 	ctrl |= F_EGRGENCTRL;
424 #endif
425 	if (adap->params.rev > 0) {
426 		if (!(adap->flags & (USING_MSIX | USING_MSI)))
427 			ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ;
428 	}
429 	t3_write_reg(adap, A_SG_CONTROL, ctrl);
430 	t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) |
431 		     V_LORCQDRBTHRSH(512));
432 	t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10);
433 	t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) |
434 		     V_TIMEOUT(200 * core_ticks_per_usec(adap)));
435 	t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH,
436 		     adap->params.rev < T3_REV_C ? 1000 : 500);
437 	t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256);
438 	t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000);
439 	t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256);
440 	t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff));
441 	t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024);
442 }
443 
444 
445 /**
446  *	sgl_len - calculates the size of an SGL of the given capacity
447  *	@n: the number of SGL entries
448  *
449  *	Calculates the number of flits needed for a scatter/gather list that
450  *	can hold the given number of entries.
451  */
452 static __inline unsigned int
453 sgl_len(unsigned int n)
454 {
455 	return ((3 * n) / 2 + (n & 1));
456 }
457 
458 /**
459  *	get_imm_packet - return the next ingress packet buffer from a response
460  *	@resp: the response descriptor containing the packet data
461  *
462  *	Copies the immediate data of the given response into the supplied mbuf.
463  */
464 static int
465 get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct mbuf *m)
466 {
467 
468 	m->m_len = m->m_pkthdr.len = IMMED_PKT_SIZE;
469 	m->m_ext.ext_buf = NULL;
470 	m->m_ext.ext_type = 0;
471 	memcpy(mtod(m, uint8_t *), resp->imm_data, IMMED_PKT_SIZE);
472 	return (0);
473 }
474 
475 static __inline u_int
476 flits_to_desc(u_int n)
477 {
478 	return (flit_desc_map[n]);
479 }
480 
481 #define SGE_PARERR (F_CPPARITYERROR | F_OCPARITYERROR | F_RCPARITYERROR | \
482 		    F_IRPARITYERROR | V_ITPARITYERROR(M_ITPARITYERROR) | \
483 		    V_FLPARITYERROR(M_FLPARITYERROR) | F_LODRBPARITYERROR | \
484 		    F_HIDRBPARITYERROR | F_LORCQPARITYERROR | \
485 		    F_HIRCQPARITYERROR)
486 #define SGE_FRAMINGERR (F_UC_REQ_FRAMINGERROR | F_R_REQ_FRAMINGERROR)
487 #define SGE_FATALERR (SGE_PARERR | SGE_FRAMINGERR | F_RSPQCREDITOVERFOW | \
488 		      F_RSPQDISABLED)
489 
490 /**
491  *	t3_sge_err_intr_handler - SGE async event interrupt handler
492  *	@adapter: the adapter
493  *
494  *	Interrupt handler for SGE asynchronous (non-data) events.
495  */
496 void
497 t3_sge_err_intr_handler(adapter_t *adapter)
498 {
499 	unsigned int v, status;
500 
501 	status = t3_read_reg(adapter, A_SG_INT_CAUSE);
502 	if (status & SGE_PARERR)
503 		CH_ALERT(adapter, "SGE parity error (0x%x)\n",
504 			 status & SGE_PARERR);
505 	if (status & SGE_FRAMINGERR)
506 		CH_ALERT(adapter, "SGE framing error (0x%x)\n",
507 			 status & SGE_FRAMINGERR);
508 	if (status & F_RSPQCREDITOVERFOW)
509 		CH_ALERT(adapter, "SGE response queue credit overflow\n");
510 
511 	if (status & F_RSPQDISABLED) {
512 		v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS);
513 
514 		CH_ALERT(adapter,
515 			 "packet delivered to disabled response queue (0x%x)\n",
516 			 (v >> S_RSPQ0DISABLED) & 0xff);
517 	}
518 
519 	t3_write_reg(adapter, A_SG_INT_CAUSE, status);
520 	if (status & SGE_FATALERR)
521 		t3_fatal_err(adapter);
522 }
523 
524 void
525 t3_sge_prep(adapter_t *adap, struct sge_params *p)
526 {
527 	int i, nqsets;
528 
529 	nqsets = min(SGE_QSETS, mp_ncpus*4);
530 
531 	fl_q_size = min(nmbclusters/(3*nqsets), FL_Q_SIZE);
532 
533 	while (!powerof2(fl_q_size))
534 		fl_q_size--;
535 #if __FreeBSD_version >= 700111
536 	if (cxgb_use_16k_clusters)
537 		jumbo_q_size = min(nmbjumbo16/(3*nqsets), JUMBO_Q_SIZE);
538 	else
539 		jumbo_q_size = min(nmbjumbo9/(3*nqsets), JUMBO_Q_SIZE);
540 #else
541 	jumbo_q_size = min(nmbjumbo4/(3*nqsets), JUMBO_Q_SIZE);
542 #endif
543 	while (!powerof2(jumbo_q_size))
544 		jumbo_q_size--;
545 
546 	if (fl_q_size < (FL_Q_SIZE / 4) || jumbo_q_size < (JUMBO_Q_SIZE / 2))
547 		device_printf(adap->dev,
548 		    "Insufficient clusters and/or jumbo buffers.\n");
549 
550 	/* XXX Does ETHER_ALIGN need to be accounted for here? */
551 	p->max_pkt_size = adap->sge.qs[0].fl[1].buf_size - sizeof(struct cpl_rx_data);
552 
553 	for (i = 0; i < SGE_QSETS; ++i) {
554 		struct qset_params *q = p->qset + i;
555 
556 		if (adap->params.nports > 2) {
557 			q->coalesce_usecs = 50;
558 		} else {
559 #ifdef INVARIANTS
560 			q->coalesce_usecs = 10;
561 #else
562 			q->coalesce_usecs = 5;
563 #endif
564 		}
565 		q->polling = 0;
566 		q->rspq_size = RSPQ_Q_SIZE;
567 		q->fl_size = fl_q_size;
568 		q->jumbo_size = jumbo_q_size;
569 		q->txq_size[TXQ_ETH] = TX_ETH_Q_SIZE;
570 		q->txq_size[TXQ_OFLD] = 1024;
571 		q->txq_size[TXQ_CTRL] = 256;
572 		q->cong_thres = 0;
573 	}
574 }
575 
576 int
577 t3_sge_alloc(adapter_t *sc)
578 {
579 
580 	/* The parent tag. */
581 	if (bus_dma_tag_create( NULL,			/* parent */
582 				1, 0,			/* algnmnt, boundary */
583 				BUS_SPACE_MAXADDR,	/* lowaddr */
584 				BUS_SPACE_MAXADDR,	/* highaddr */
585 				NULL, NULL,		/* filter, filterarg */
586 				BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
587 				BUS_SPACE_UNRESTRICTED, /* nsegments */
588 				BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
589 				0,			/* flags */
590 				NULL, NULL,		/* lock, lockarg */
591 				&sc->parent_dmat)) {
592 		device_printf(sc->dev, "Cannot allocate parent DMA tag\n");
593 		return (ENOMEM);
594 	}
595 
596 	/*
597 	 * DMA tag for normal sized RX frames
598 	 */
599 	if (bus_dma_tag_create(sc->parent_dmat, MCLBYTES, 0, BUS_SPACE_MAXADDR,
600 		BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, 1,
601 		MCLBYTES, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_dmat)) {
602 		device_printf(sc->dev, "Cannot allocate RX DMA tag\n");
603 		return (ENOMEM);
604 	}
605 
606 	/*
607 	 * DMA tag for jumbo sized RX frames.
608 	 */
609 	if (bus_dma_tag_create(sc->parent_dmat, MJUM16BYTES, 0, BUS_SPACE_MAXADDR,
610 		BUS_SPACE_MAXADDR, NULL, NULL, MJUM16BYTES, 1, MJUM16BYTES,
611 		BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_jumbo_dmat)) {
612 		device_printf(sc->dev, "Cannot allocate RX jumbo DMA tag\n");
613 		return (ENOMEM);
614 	}
615 
616 	/*
617 	 * DMA tag for TX frames.
618 	 */
619 	if (bus_dma_tag_create(sc->parent_dmat, 1, 0, BUS_SPACE_MAXADDR,
620 		BUS_SPACE_MAXADDR, NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
621 		TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
622 		NULL, NULL, &sc->tx_dmat)) {
623 		device_printf(sc->dev, "Cannot allocate TX DMA tag\n");
624 		return (ENOMEM);
625 	}
626 
627 	return (0);
628 }
629 
630 int
631 t3_sge_free(struct adapter * sc)
632 {
633 
634 	if (sc->tx_dmat != NULL)
635 		bus_dma_tag_destroy(sc->tx_dmat);
636 
637 	if (sc->rx_jumbo_dmat != NULL)
638 		bus_dma_tag_destroy(sc->rx_jumbo_dmat);
639 
640 	if (sc->rx_dmat != NULL)
641 		bus_dma_tag_destroy(sc->rx_dmat);
642 
643 	if (sc->parent_dmat != NULL)
644 		bus_dma_tag_destroy(sc->parent_dmat);
645 
646 	return (0);
647 }
648 
649 void
650 t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p)
651 {
652 
653 	qs->rspq.holdoff_tmr = max(p->coalesce_usecs * 10, 1U);
654 	qs->rspq.polling = 0 /* p->polling */;
655 }
656 
657 #if !defined(__i386__) && !defined(__amd64__)
658 static void
659 refill_fl_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
660 {
661 	struct refill_fl_cb_arg *cb_arg = arg;
662 
663 	cb_arg->error = error;
664 	cb_arg->seg = segs[0];
665 	cb_arg->nseg = nseg;
666 
667 }
668 #endif
669 /**
670  *	refill_fl - refill an SGE free-buffer list
671  *	@sc: the controller softc
672  *	@q: the free-list to refill
673  *	@n: the number of new buffers to allocate
674  *
675  *	(Re)populate an SGE free-buffer list with up to @n new packet buffers.
676  *	The caller must assure that @n does not exceed the queue's capacity.
677  */
678 static void
679 refill_fl(adapter_t *sc, struct sge_fl *q, int n)
680 {
681 	struct rx_sw_desc *sd = &q->sdesc[q->pidx];
682 	struct rx_desc *d = &q->desc[q->pidx];
683 	struct refill_fl_cb_arg cb_arg;
684 	struct mbuf *m;
685 	caddr_t cl;
686 	int err, count = 0;
687 
688 	cb_arg.error = 0;
689 	while (n--) {
690 		/*
691 		 * We only allocate a cluster, mbuf allocation happens after rx
692 		 */
693 		if (q->zone == zone_pack) {
694 			if ((m = m_getcl(M_NOWAIT, MT_NOINIT, M_PKTHDR)) == NULL)
695 				break;
696 			cl = m->m_ext.ext_buf;
697 		} else {
698 			if ((cl = m_cljget(NULL, M_NOWAIT, q->buf_size)) == NULL)
699 				break;
700 			if ((m = m_gethdr(M_NOWAIT, MT_NOINIT)) == NULL) {
701 				uma_zfree(q->zone, cl);
702 				break;
703 			}
704 		}
705 		if ((sd->flags & RX_SW_DESC_MAP_CREATED) == 0) {
706 			if ((err = bus_dmamap_create(q->entry_tag, 0, &sd->map))) {
707 				log(LOG_WARNING, "bus_dmamap_create failed %d\n", err);
708 				uma_zfree(q->zone, cl);
709 				goto done;
710 			}
711 			sd->flags |= RX_SW_DESC_MAP_CREATED;
712 		}
713 #if !defined(__i386__) && !defined(__amd64__)
714 		err = bus_dmamap_load(q->entry_tag, sd->map,
715 		    cl, q->buf_size, refill_fl_cb, &cb_arg, 0);
716 
717 		if (err != 0 || cb_arg.error) {
718 			if (q->zone == zone_pack)
719 				uma_zfree(q->zone, cl);
720 			m_free(m);
721 			goto done;
722 		}
723 #else
724 		cb_arg.seg.ds_addr = pmap_kextract((vm_offset_t)cl);
725 #endif
726 		sd->flags |= RX_SW_DESC_INUSE;
727 		sd->rxsd_cl = cl;
728 		sd->m = m;
729 		d->addr_lo = htobe32(cb_arg.seg.ds_addr & 0xffffffff);
730 		d->addr_hi = htobe32(((uint64_t)cb_arg.seg.ds_addr >>32) & 0xffffffff);
731 		d->len_gen = htobe32(V_FLD_GEN1(q->gen));
732 		d->gen2 = htobe32(V_FLD_GEN2(q->gen));
733 
734 		d++;
735 		sd++;
736 
737 		if (++q->pidx == q->size) {
738 			q->pidx = 0;
739 			q->gen ^= 1;
740 			sd = q->sdesc;
741 			d = q->desc;
742 		}
743 		q->credits++;
744 		count++;
745 	}
746 
747 done:
748 	if (count)
749 		t3_write_reg(sc, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
750 }
751 
752 
753 /**
754  *	free_rx_bufs - free the Rx buffers on an SGE free list
755  *	@sc: the controller softc
756  *	@q: the SGE free list to clean up
757  *
758  *	Release the buffers on an SGE free-buffer Rx queue.  HW fetching from
759  *	this queue should be stopped before calling this function.
760  */
761 static void
762 free_rx_bufs(adapter_t *sc, struct sge_fl *q)
763 {
764 	u_int cidx = q->cidx;
765 
766 	while (q->credits--) {
767 		struct rx_sw_desc *d = &q->sdesc[cidx];
768 
769 		if (d->flags & RX_SW_DESC_INUSE) {
770 			bus_dmamap_unload(q->entry_tag, d->map);
771 			bus_dmamap_destroy(q->entry_tag, d->map);
772 			if (q->zone == zone_pack) {
773 				m_init(d->m, zone_pack, MCLBYTES,
774 				    M_NOWAIT, MT_DATA, M_EXT);
775 				uma_zfree(zone_pack, d->m);
776 			} else {
777 				m_init(d->m, zone_mbuf, MLEN,
778 				    M_NOWAIT, MT_DATA, 0);
779 				uma_zfree(zone_mbuf, d->m);
780 				uma_zfree(q->zone, d->rxsd_cl);
781 			}
782 		}
783 
784 		d->rxsd_cl = NULL;
785 		d->m = NULL;
786 		if (++cidx == q->size)
787 			cidx = 0;
788 	}
789 }
790 
791 static __inline void
792 __refill_fl(adapter_t *adap, struct sge_fl *fl)
793 {
794 	refill_fl(adap, fl, min(16U, fl->size - fl->credits));
795 }
796 
797 static __inline void
798 __refill_fl_lt(adapter_t *adap, struct sge_fl *fl, int max)
799 {
800 	if ((fl->size - fl->credits) < max)
801 		refill_fl(adap, fl, min(max, fl->size - fl->credits));
802 }
803 
804 /**
805  *	recycle_rx_buf - recycle a receive buffer
806  *	@adapter: the adapter
807  *	@q: the SGE free list
808  *	@idx: index of buffer to recycle
809  *
810  *	Recycles the specified buffer on the given free list by adding it at
811  *	the next available slot on the list.
812  */
813 static void
814 recycle_rx_buf(adapter_t *adap, struct sge_fl *q, unsigned int idx)
815 {
816 	struct rx_desc *from = &q->desc[idx];
817 	struct rx_desc *to   = &q->desc[q->pidx];
818 
819 	q->sdesc[q->pidx] = q->sdesc[idx];
820 	to->addr_lo = from->addr_lo;        /* already big endian */
821 	to->addr_hi = from->addr_hi;        /* likewise */
822 	wmb();	/* necessary ? */
823 	to->len_gen = htobe32(V_FLD_GEN1(q->gen));
824 	to->gen2 = htobe32(V_FLD_GEN2(q->gen));
825 	q->credits++;
826 
827 	if (++q->pidx == q->size) {
828 		q->pidx = 0;
829 		q->gen ^= 1;
830 	}
831 	t3_write_reg(adap, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
832 }
833 
834 static void
835 alloc_ring_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
836 {
837 	uint32_t *addr;
838 
839 	addr = arg;
840 	*addr = segs[0].ds_addr;
841 }
842 
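/**
 *	alloc_ring - allocate the DMA memory and software state for a HW ring
 *	@sc: the controller softc
 *	@nelem: the number of descriptors in the ring
 *	@elem_size: the size of each hardware descriptor
 *	@sw_size: the size of the per-descriptor software state (0 for none)
 *	@phys: returns the bus address of the ring
 *	@desc: returns the KVA of the hardware descriptors
 *	@sdesc: returns the KVA of the software state array
 *	@tag, @map: returned DMA tag and map for the descriptor memory
 *	@parent_entry_tag, @entry_tag: optional parent tag and returned
 *	per-entry DMA tag
 */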
843 static int
844 alloc_ring(adapter_t *sc, size_t nelem, size_t elem_size, size_t sw_size,
845     bus_addr_t *phys, void *desc, void *sdesc, bus_dma_tag_t *tag,
846     bus_dmamap_t *map, bus_dma_tag_t parent_entry_tag, bus_dma_tag_t *entry_tag)
847 {
848 	size_t len = nelem * elem_size;
849 	void *s = NULL;
850 	void *p = NULL;
851 	int err;
852 
853 	if ((err = bus_dma_tag_create(sc->parent_dmat, PAGE_SIZE, 0,
854 				      BUS_SPACE_MAXADDR_32BIT,
855 				      BUS_SPACE_MAXADDR, NULL, NULL, len, 1,
856 				      len, 0, NULL, NULL, tag)) != 0) {
857 		device_printf(sc->dev, "Cannot allocate descriptor tag\n");
858 		return (ENOMEM);
859 	}
860 
861 	if ((err = bus_dmamem_alloc(*tag, (void **)&p, BUS_DMA_NOWAIT,
862 				    map)) != 0) {
863 		device_printf(sc->dev, "Cannot allocate descriptor memory\n");
864 		return (ENOMEM);
865 	}
866 
867 	bus_dmamap_load(*tag, *map, p, len, alloc_ring_cb, phys, 0);
868 	bzero(p, len);
869 	*(void **)desc = p;
870 
871 	if (sw_size) {
872 		len = nelem * sw_size;
873 		s = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO);
874 		*(void **)sdesc = s;
875 	}
876 	if (parent_entry_tag == NULL)
877 		return (0);
878 
879 	if ((err = bus_dma_tag_create(parent_entry_tag, 1, 0,
880 				      BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR,
881 		                      NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
882 				      TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
883 		                      NULL, NULL, entry_tag)) != 0) {
884 		device_printf(sc->dev, "Cannot allocate descriptor entry tag\n");
885 		return (ENOMEM);
886 	}
887 	return (0);
888 }
889 
890 static void
891 sge_slow_intr_handler(void *arg, int ncount)
892 {
893 	adapter_t *sc = arg;
894 
895 	t3_slow_intr_handler(sc);
896 }
897 
898 /**
899  *	sge_timer_cb - perform periodic maintenance of the SGE queues
900  *	@arg: the adapter
901  *
902  *	Runs periodically from a timer to perform maintenance of the adapter's
903  *	SGE queue sets.  It performs the following tasks:
904  *
905  *	a) Cleans up any completed Tx descriptors that may still be pending.
906  *	Normal descriptor cleanup happens when new packets are added to a Tx
907  *	queue so this timer is relatively infrequent and does any cleanup only
908  *	if the Tx queue has not seen any new packets in a while.  We make a
909  *	best effort attempt to reclaim descriptors, in that we don't wait
910  *	around if we cannot get a queue's lock (which most likely is because
911  *	someone else is queueing new packets and so will also handle the clean
912  *	up).  Since control queues use immediate data exclusively we don't
913  *	bother cleaning them up here.
914  *
915  *	b) Replenishes Rx queues that have run out due to memory shortage.
916  *	Normally new Rx buffers are added when existing ones are consumed but
917  *	when out of memory a queue can become empty.  We try to add only a few
918  *	buffers here; the queue will be replenished fully as these new buffers
919  *	are used up if memory shortage has subsided.
920  *
921  *	c) Return coalesced response queue credits in case a response queue is
922  *	starved.
923  *
924  *	d) Ring doorbells for T304 tunnel queues since we have seen doorbell
925  *	fifo overflows and the FW doesn't implement any recovery scheme yet.
926  */
927 static void
928 sge_timer_cb(void *arg)
929 {
930 	adapter_t *sc = arg;
931 	if ((sc->flags & USING_MSIX) == 0) {
932 
933 		struct port_info *pi;
934 		struct sge_qset *qs;
935 		struct sge_txq  *txq;
936 		int i, j;
937 		int reclaim_ofl, refill_rx;
938 
939 		if (sc->open_device_map == 0)
940 			return;
941 
942 		for (i = 0; i < sc->params.nports; i++) {
943 			pi = &sc->port[i];
944 			for (j = 0; j < pi->nqsets; j++) {
945 				qs = &sc->sge.qs[pi->first_qset + j];
946 				txq = &qs->txq[0];
947 				reclaim_ofl = txq[TXQ_OFLD].processed - txq[TXQ_OFLD].cleaned;
948 				refill_rx = ((qs->fl[0].credits < qs->fl[0].size) ||
949 				    (qs->fl[1].credits < qs->fl[1].size));
950 				if (reclaim_ofl || refill_rx) {
951 					taskqueue_enqueue(sc->tq, &pi->timer_reclaim_task);
952 					break;
953 				}
954 			}
955 		}
956 	}
957 
958 	if (sc->params.nports > 2) {
959 		int i;
960 
961 		for_each_port(sc, i) {
962 			struct port_info *pi = &sc->port[i];
963 
964 			t3_write_reg(sc, A_SG_KDOORBELL,
965 				     F_SELEGRCNTX |
966 				     (FW_TUNNEL_SGEEC_START + pi->first_qset));
967 		}
968 	}
969 	if (((sc->flags & USING_MSIX) == 0 || sc->params.nports > 2) &&
970 	    sc->open_device_map != 0)
971 		callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
972 }
973 
974 /*
975  * This is meant to be a catch-all function to keep sge state private
976  * to sge.c
977  *
978  */
979 int
980 t3_sge_init_adapter(adapter_t *sc)
981 {
982 	callout_init(&sc->sge_timer_ch, CALLOUT_MPSAFE);
983 	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
984 	TASK_INIT(&sc->slow_intr_task, 0, sge_slow_intr_handler, sc);
985 	return (0);
986 }
987 
988 int
989 t3_sge_reset_adapter(adapter_t *sc)
990 {
991 	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
992 	return (0);
993 }
994 
995 int
996 t3_sge_init_port(struct port_info *pi)
997 {
998 	TASK_INIT(&pi->timer_reclaim_task, 0, sge_timer_reclaim, pi);
999 	return (0);
1000 }
1001 
1002 /**
1003  *	refill_rspq - replenish an SGE response queue
1004  *	@adapter: the adapter
1005  *	@q: the response queue to replenish
1006  *	@credits: how many new responses to make available
1007  *
1008  *	Replenishes a response queue by making the supplied number of responses
1009  *	available to HW.
1010  */
1011 static __inline void
1012 refill_rspq(adapter_t *sc, const struct sge_rspq *q, u_int credits)
1013 {
1014 
1015 	/* mbufs are allocated on demand when a rspq entry is processed. */
1016 	t3_write_reg(sc, A_SG_RSPQ_CREDIT_RETURN,
1017 		     V_RSPQ(q->cntxt_id) | V_CREDITS(credits));
1018 }
1019 
1020 static void
1021 sge_txq_reclaim_handler(void *arg, int ncount)
1022 {
1023 	struct sge_qset *qs = arg;
1024 	int i;
1025 
1026 	for (i = 0; i < 3; i++)
1027 		reclaim_completed_tx(qs, 16, i);
1028 }
1029 
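/*
 * Periodic (non-MSI-X) reclaim task: for each of the port's queue sets,
 * reclaim completed offload Tx descriptors, top up the free lists, and
 * return a response-queue credit if the hardware has flagged starvation.
 */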
1030 static void
1031 sge_timer_reclaim(void *arg, int ncount)
1032 {
1033 	struct port_info *pi = arg;
1034 	int i, nqsets = pi->nqsets;
1035 	adapter_t *sc = pi->adapter;
1036 	struct sge_qset *qs;
1037 	struct mtx *lock;
1038 
1039 	KASSERT((sc->flags & USING_MSIX) == 0,
1040 	    ("can't call timer reclaim for msi-x"));
1041 
1042 	for (i = 0; i < nqsets; i++) {
1043 		qs = &sc->sge.qs[pi->first_qset + i];
1044 
1045 		reclaim_completed_tx(qs, 16, TXQ_OFLD);
1046 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
1047 			    &sc->sge.qs[0].rspq.lock;
1048 
1049 		if (mtx_trylock(lock)) {
1050 			/* XXX currently assume that we are *NOT* polling */
1051 			uint32_t status = t3_read_reg(sc, A_SG_RSPQ_FL_STATUS);
1052 
1053 			if (qs->fl[0].credits < qs->fl[0].size - 16)
1054 				__refill_fl(sc, &qs->fl[0]);
1055 			if (qs->fl[1].credits < qs->fl[1].size - 16)
1056 				__refill_fl(sc, &qs->fl[1]);
1057 
1058 			if (status & (1 << qs->rspq.cntxt_id)) {
1059 				if (qs->rspq.credits) {
1060 					refill_rspq(sc, &qs->rspq, 1);
1061 					qs->rspq.credits--;
1062 					t3_write_reg(sc, A_SG_RSPQ_FL_STATUS,
1063 					    1 << qs->rspq.cntxt_id);
1064 				}
1065 			}
1066 			mtx_unlock(lock);
1067 		}
1068 	}
1069 }
1070 
1071 /**
1072  *	init_qset_cntxt - initialize an SGE queue set context info
1073  *	@qs: the queue set
1074  *	@id: the queue set id
1075  *
1076  *	Initializes the TIDs and context ids for the queues of a queue set.
1077  */
1078 static void
1079 init_qset_cntxt(struct sge_qset *qs, u_int id)
1080 {
1081 
1082 	qs->rspq.cntxt_id = id;
1083 	qs->fl[0].cntxt_id = 2 * id;
1084 	qs->fl[1].cntxt_id = 2 * id + 1;
1085 	qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id;
1086 	qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id;
1087 	qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id;
1088 	qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id;
1089 	qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id;
1090 
1091 	mbufq_init(&qs->txq[TXQ_ETH].sendq);
1092 	mbufq_init(&qs->txq[TXQ_OFLD].sendq);
1093 	mbufq_init(&qs->txq[TXQ_CTRL].sendq);
1094 }
1095 
1096 
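/*
 * Reserve @ndesc descriptors on a Tx queue: advance the producer index and
 * generation bit, snapshot the pre-advance state in @txqs, and request a
 * completion (WR_COMPL) roughly once every 32 unacknowledged descriptors.
 */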
1097 static void
1098 txq_prod(struct sge_txq *txq, unsigned int ndesc, struct txq_state *txqs)
1099 {
1100 	txq->in_use += ndesc;
1101 	/*
1102 	 * XXX we don't handle stopping of the queue;
1103 	 * presumably start handles this when we bump against the end
1104 	 */
1105 	txqs->gen = txq->gen;
1106 	txq->unacked += ndesc;
1107 	txqs->compl = (txq->unacked & 32) << (S_WR_COMPL - 5);
1108 	txq->unacked &= 31;
1109 	txqs->pidx = txq->pidx;
1110 	txq->pidx += ndesc;
1111 #ifdef INVARIANTS
1112 	if (((txqs->pidx > txq->cidx) &&
1113 		(txq->pidx < txqs->pidx) &&
1114 		(txq->pidx >= txq->cidx)) ||
1115 	    ((txqs->pidx < txq->cidx) &&
1116 		(txq->pidx >= txq-> cidx)) ||
1117 	    ((txqs->pidx < txq->cidx) &&
1118 		(txq->cidx < txqs->pidx)))
1119 		panic("txqs->pidx=%d txq->pidx=%d txq->cidx=%d",
1120 		    txqs->pidx, txq->pidx, txq->cidx);
1121 #endif
1122 	if (txq->pidx >= txq->size) {
1123 		txq->pidx -= txq->size;
1124 		txq->gen ^= 1;
1125 	}
1126 
1127 }
1128 
1129 /**
1130  *	calc_tx_descs - calculate the number of Tx descriptors for a packet
1131  *	@m: the packet mbufs
1132  *      @nsegs: the number of segments
1133  *
1134  * 	Returns the number of Tx descriptors needed for the given Ethernet
1135  * 	packet.  Ethernet packets require addition of WR and CPL headers.
1136  */
1137 static __inline unsigned int
1138 calc_tx_descs(const struct mbuf *m, int nsegs)
1139 {
1140 	unsigned int flits;
1141 
1142 	if (m->m_pkthdr.len <= PIO_LEN)
1143 		return 1;
1144 
1145 	flits = sgl_len(nsegs) + 2;
1146 #ifdef TSO_SUPPORTED
1147 	if (m->m_pkthdr.csum_flags & CSUM_TSO)
1148 		flits++;
1149 #endif
1150 	return flits_to_desc(flits);
1151 }
1152 
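/*
 * DMA-map an outgoing mbuf chain for @txq, defragmenting once with
 * m_defrag() if the chain has too many segments.  On success the segment
 * list is returned in @segs/@nsegs and the map is marked TX_SW_DESC_MAPPED;
 * on failure the mbuf is freed (except for ENOMEM) and an errno is returned.
 */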
1153 static unsigned int
1154 busdma_map_mbufs(struct mbuf **m, struct sge_txq *txq,
1155     struct tx_sw_desc *txsd, bus_dma_segment_t *segs, int *nsegs)
1156 {
1157 	struct mbuf *m0;
1158 	int err, pktlen, pass = 0;
1159 	bus_dma_tag_t tag = txq->entry_tag;
1160 
1161 retry:
1162 	err = 0;
1163 	m0 = *m;
1164 	pktlen = m0->m_pkthdr.len;
1165 #if defined(__i386__) || defined(__amd64__)
1166 	if (busdma_map_sg_collapse(tag, txsd->map, m, segs, nsegs) == 0) {
1167 		goto done;
1168 	} else
1169 #endif
1170 		err = bus_dmamap_load_mbuf_sg(tag, txsd->map, m0, segs, nsegs, 0);
1171 
1172 	if (err == 0) {
1173 		goto done;
1174 	}
1175 	if (err == EFBIG && pass == 0) {
1176 		pass = 1;
1177 		/* Too many segments, try to defrag */
1178 		m0 = m_defrag(m0, M_DONTWAIT);
1179 		if (m0 == NULL) {
1180 			m_freem(*m);
1181 			*m = NULL;
1182 			return (ENOBUFS);
1183 		}
1184 		*m = m0;
1185 		goto retry;
1186 	} else if (err == ENOMEM) {
1187 		return (err);
1188 	} else if (err) {
1189 		if (cxgb_debug)
1190 			printf("map failure err=%d pktlen=%d\n", err, pktlen);
1191 		m_freem(m0);
1192 		*m = NULL;
1193 		return (err);
1194 	}
1195 done:
1196 #if !defined(__i386__) && !defined(__amd64__)
1197 	bus_dmamap_sync(tag, txsd->map, BUS_DMASYNC_PREWRITE);
1198 #endif
1199 	txsd->flags |= TX_SW_DESC_MAPPED;
1200 
1201 	return (0);
1202 }
1203 
1204 /**
1205  *	make_sgl - populate a scatter/gather list for a packet
1206  *	@sgp: the SGL to populate
1207  *	@segs: the packet dma segments
1208  *	@nsegs: the number of segments
1209  *
1210  *	Generates a scatter/gather list for the buffers that make up a packet
1211  *	and returns the SGL size in 8-byte words.  The caller must size the SGL
1212  *	appropriately.
1213  */
1214 static __inline void
1215 make_sgl(struct sg_ent *sgp, bus_dma_segment_t *segs, int nsegs)
1216 {
1217 	int i, idx;
1218 
1219 	for (idx = 0, i = 0; i < nsegs; i++) {
1220 		/*
1221 		 * firmware doesn't like empty segments
1222 		 */
1223 		if (segs[i].ds_len == 0)
1224 			continue;
1225 		if (i && idx == 0)
1226 			++sgp;
1227 
1228 		sgp->len[idx] = htobe32(segs[i].ds_len);
1229 		sgp->addr[idx] = htobe64(segs[i].ds_addr);
1230 		idx ^= 1;
1231 	}
1232 
1233 	if (idx) {
1234 		sgp->len[idx] = 0;
1235 		sgp->addr[idx] = 0;
1236 	}
1237 }
1238 
1239 /**
1240  *	check_ring_tx_db - check and potentially ring a Tx queue's doorbell
1241  *	@adap: the adapter
1242  *	@q: the Tx queue
1243  *
1244  *	Ring the doorbell if a Tx queue is asleep.  There is a natural race
1245  *	where the HW may go to sleep just after we check; in that case the
1246  *	interrupt handler will detect the outstanding TX packet and ring the
1247  *	doorbell for us.
1248  *
1249  *	When GTS is disabled we unconditionally ring the doorbell.
1250  */
1251 static __inline void
1252 check_ring_tx_db(adapter_t *adap, struct sge_txq *q)
1253 {
1254 #if USE_GTS
1255 	clear_bit(TXQ_LAST_PKT_DB, &q->flags);
1256 	if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) {
1257 		set_bit(TXQ_LAST_PKT_DB, &q->flags);
1258 #ifdef T3_TRACE
1259 		T3_TRACE1(adap->tb[q->cntxt_id & 7], "doorbell Tx, cntxt %d",
1260 			  q->cntxt_id);
1261 #endif
1262 		t3_write_reg(adap, A_SG_KDOORBELL,
1263 			     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1264 	}
1265 #else
1266 	wmb();            /* write descriptors before telling HW */
1267 	t3_write_reg(adap, A_SG_KDOORBELL,
1268 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1269 #endif
1270 }
1271 
1272 static __inline void
1273 wr_gen2(struct tx_desc *d, unsigned int gen)
1274 {
1275 #if SGE_NUM_GENBITS == 2
1276 	d->flit[TX_DESC_FLITS - 1] = htobe64(gen);
1277 #endif
1278 }
1279 
1280 /**
1281  *	write_wr_hdr_sgl - write a WR header and, optionally, SGL
1282  *	@ndesc: number of Tx descriptors spanned by the SGL
1283  *	@txd: first Tx descriptor to be written
1284  *	@txqs: txq state (generation and producer index)
1285  *	@txq: the SGE Tx queue
1286  *	@sgl: the SGL
1287  *	@flits: number of flits to the start of the SGL in the first descriptor
1288  *	@sgl_flits: the SGL size in flits
1289  *	@wr_hi: top 32 bits of WR header based on WR type (big endian)
1290  *	@wr_lo: low 32 bits of WR header based on WR type (big endian)
1291  *
1292  *	Write a work request header and an associated SGL.  If the SGL is
1293  *	small enough to fit into one Tx descriptor it has already been written
1294  *	and we just need to write the WR header.  Otherwise we distribute the
1295  *	SGL across the number of descriptors it spans.
1296  */
1297 static void
1298 write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs,
1299     const struct sge_txq *txq, const struct sg_ent *sgl, unsigned int flits,
1300     unsigned int sgl_flits, unsigned int wr_hi, unsigned int wr_lo)
1301 {
1302 
1303 	struct work_request_hdr *wrp = (struct work_request_hdr *)txd;
1304 	struct tx_sw_desc *txsd = &txq->sdesc[txqs->pidx];
1305 
1306 	if (__predict_true(ndesc == 1)) {
1307 		set_wr_hdr(wrp, htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1308 			V_WR_SGLSFLT(flits)) | wr_hi,
1309 		    htonl(V_WR_LEN(flits + sgl_flits) |
1310 			V_WR_GEN(txqs->gen)) | wr_lo);
1311 		/* XXX gen? */
1312 		wr_gen2(txd, txqs->gen);
1313 
1314 	} else {
1315 		unsigned int ogen = txqs->gen;
1316 		const uint64_t *fp = (const uint64_t *)sgl;
1317 		struct work_request_hdr *wp = wrp;
1318 
1319 		wrp->wrh_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) |
1320 		    V_WR_SGLSFLT(flits)) | wr_hi;
1321 
1322 		while (sgl_flits) {
1323 			unsigned int avail = WR_FLITS - flits;
1324 
1325 			if (avail > sgl_flits)
1326 				avail = sgl_flits;
1327 			memcpy(&txd->flit[flits], fp, avail * sizeof(*fp));
1328 			sgl_flits -= avail;
1329 			ndesc--;
1330 			if (!sgl_flits)
1331 				break;
1332 
1333 			fp += avail;
1334 			txd++;
1335 			txsd++;
1336 			if (++txqs->pidx == txq->size) {
1337 				txqs->pidx = 0;
1338 				txqs->gen ^= 1;
1339 				txd = txq->desc;
1340 				txsd = txq->sdesc;
1341 			}
1342 
1343 			/*
1344 			 * when the head of the mbuf chain
1345 			 * is freed all clusters will be freed
1346 			 * with it
1347 			 */
1348 			wrp = (struct work_request_hdr *)txd;
1349 			wrp->wrh_hi = htonl(V_WR_DATATYPE(1) |
1350 			    V_WR_SGLSFLT(1)) | wr_hi;
1351 			wrp->wrh_lo = htonl(V_WR_LEN(min(WR_FLITS,
1352 				    sgl_flits + 1)) |
1353 			    V_WR_GEN(txqs->gen)) | wr_lo;
1354 			wr_gen2(txd, txqs->gen);
1355 			flits = 1;
1356 		}
1357 		wrp->wrh_hi |= htonl(F_WR_EOP);
1358 		wmb();
1359 		wp->wrh_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo;
1360 		wr_gen2((struct tx_desc *)wp, ogen);
1361 	}
1362 }
1363 
1364 /* sizeof(*eh) + sizeof(*vhdr) + sizeof(*ip) + sizeof(*tcp) */
1365 #define TCPPKTHDRSIZE (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + 20 + 20)
1366 
1367 #ifdef VLAN_SUPPORTED
1368 #define GET_VTAG(cntrl, m) \
1369 do { \
1370 	if ((m)->m_flags & M_VLANTAG)					            \
1371 		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN((m)->m_pkthdr.ether_vtag); \
1372 } while (0)
1373 
1374 #else
1375 #define GET_VTAG(cntrl, m)
1376 #endif
1377 
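/**
 *	t3_encap - map and hand a packet to the hardware
 *	@qs: the queue set to transmit on
 *	@m: pointer to the packet (may be updated, or set to NULL on failure)
 *
 *	Maps the packet for DMA and writes the appropriate work request into
 *	the Ethernet Tx queue: a cpl_tx_pkt_batch for a coalesced chain, a
 *	CPL_TX_PKT_LSO for TSO, or a plain CPL_TX_PKT (written as immediate
 *	data when the frame fits in PIO_LEN).  Rings the doorbell when done.
 */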
1378 static int
1379 t3_encap(struct sge_qset *qs, struct mbuf **m)
1380 {
1381 	adapter_t *sc;
1382 	struct mbuf *m0;
1383 	struct sge_txq *txq;
1384 	struct txq_state txqs;
1385 	struct port_info *pi;
1386 	unsigned int ndesc, flits, cntrl, mlen;
1387 	int err, nsegs, tso_info = 0;
1388 
1389 	struct work_request_hdr *wrp;
1390 	struct tx_sw_desc *txsd;
1391 	struct sg_ent *sgp, *sgl;
1392 	uint32_t wr_hi, wr_lo, sgl_flits;
1393 	bus_dma_segment_t segs[TX_MAX_SEGS];
1394 
1395 	struct tx_desc *txd;
1396 
1397 	pi = qs->port;
1398 	sc = pi->adapter;
1399 	txq = &qs->txq[TXQ_ETH];
1400 	txd = &txq->desc[txq->pidx];
1401 	txsd = &txq->sdesc[txq->pidx];
1402 	sgl = txq->txq_sgl;
1403 
1404 	prefetch(txd);
1405 	m0 = *m;
1406 
1407 	DPRINTF("t3_encap port_id=%d qsidx=%d ", pi->port_id, pi->first_qset);
1408 	DPRINTF("mlen=%d txpkt_intf=%d tx_chan=%d\n", m[0]->m_pkthdr.len, pi->txpkt_intf, pi->tx_chan);
1409 
1410 	mtx_assert(&qs->lock, MA_OWNED);
1411 	cntrl = V_TXPKT_INTF(pi->txpkt_intf);
1412 	KASSERT(m0->m_flags & M_PKTHDR, ("not packet header\n"));
1413 
1414 #ifdef VLAN_SUPPORTED
1415 	if  (m0->m_nextpkt == NULL && m0->m_next != NULL &&
1416 	    m0->m_pkthdr.csum_flags & (CSUM_TSO))
1417 		tso_info = V_LSO_MSS(m0->m_pkthdr.tso_segsz);
1418 #endif
1419 	if (m0->m_nextpkt != NULL) {
1420 		busdma_map_sg_vec(txq->entry_tag, txsd->map, m0, segs, &nsegs);
1421 		ndesc = 1;
1422 		mlen = 0;
1423 	} else {
1424 		if ((err = busdma_map_sg_collapse(txq->entry_tag, txsd->map,
1425 		    &m0, segs, &nsegs))) {
1426 			if (cxgb_debug)
1427 				printf("failed ... err=%d\n", err);
1428 			return (err);
1429 		}
1430 		mlen = m0->m_pkthdr.len;
1431 		ndesc = calc_tx_descs(m0, nsegs);
1432 	}
1433 	txq_prod(txq, ndesc, &txqs);
1434 
1435 	KASSERT(m0->m_pkthdr.len, ("empty packet nsegs=%d", nsegs));
1436 	txsd->m = m0;
1437 
1438 	if (m0->m_nextpkt != NULL) {
1439 		struct cpl_tx_pkt_batch *cpl_batch = (struct cpl_tx_pkt_batch *)txd;
1440 		int i, fidx;
1441 
1442 		if (nsegs > 7)
1443 			panic("trying to coalesce %d packets into one WR", nsegs);
1444 		txq->txq_coalesced += nsegs;
1445 		wrp = (struct work_request_hdr *)txd;
1446 		flits = nsegs*2 + 1;
1447 
1448 		for (fidx = 1, i = 0; i < nsegs; i++, fidx += 2) {
1449 			struct cpl_tx_pkt_batch_entry *cbe;
1450 			uint64_t flit;
1451 			uint32_t *hflit = (uint32_t *)&flit;
1452 			int cflags = m0->m_pkthdr.csum_flags;
1453 
1454 			cntrl = V_TXPKT_INTF(pi->txpkt_intf);
1455 			GET_VTAG(cntrl, m0);
1456 			cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1457 			if (__predict_false(!(cflags & CSUM_IP)))
1458 				cntrl |= F_TXPKT_IPCSUM_DIS;
1459 			if (__predict_false(!(cflags & (CSUM_TCP | CSUM_UDP))))
1460 				cntrl |= F_TXPKT_L4CSUM_DIS;
1461 
1462 			hflit[0] = htonl(cntrl);
1463 			hflit[1] = htonl(segs[i].ds_len | 0x80000000);
1464 			flit |= htobe64(1 << 24);
1465 			cbe = &cpl_batch->pkt_entry[i];
1466 			cbe->cntrl = hflit[0];
1467 			cbe->len = hflit[1];
1468 			cbe->addr = htobe64(segs[i].ds_addr);
1469 		}
1470 
1471 		wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1472 		    V_WR_SGLSFLT(flits)) |
1473 		    htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1474 		wr_lo = htonl(V_WR_LEN(flits) |
1475 		    V_WR_GEN(txqs.gen)) | htonl(V_WR_TID(txq->token));
1476 		set_wr_hdr(wrp, wr_hi, wr_lo);
1477 		wmb();
1478 		wr_gen2(txd, txqs.gen);
1479 		check_ring_tx_db(sc, txq);
1480 		return (0);
1481 	} else if (tso_info) {
1482 		int min_size = TCPPKTHDRSIZE, eth_type, tagged;
1483 		struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *)txd;
1484 		struct ip *ip;
1485 		struct tcphdr *tcp;
1486 		char *pkthdr;
1487 
1488 		txd->flit[2] = 0;
1489 		GET_VTAG(cntrl, m0);
1490 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO);
1491 		hdr->cntrl = htonl(cntrl);
1492 		hdr->len = htonl(mlen | 0x80000000);
1493 
1494 		DPRINTF("tso buf len=%d\n", mlen);
1495 
1496 		tagged = m0->m_flags & M_VLANTAG;
1497 		if (!tagged)
1498 			min_size -= ETHER_VLAN_ENCAP_LEN;
1499 
1500 		if (__predict_false(mlen < min_size)) {
1501 			printf("mbuf=%p,len=%d,tso_segsz=%d,csum_flags=%#x,flags=%#x",
1502 			    m0, mlen, m0->m_pkthdr.tso_segsz,
1503 			    m0->m_pkthdr.csum_flags, m0->m_flags);
1504 			panic("tx tso packet too small");
1505 		}
1506 
1507 		/* Make sure that ether, ip, tcp headers are all in m0 */
1508 		if (__predict_false(m0->m_len < min_size)) {
1509 			m0 = m_pullup(m0, min_size);
1510 			if (__predict_false(m0 == NULL)) {
1511 				/* XXX panic probably an overreaction */
1512 				panic("couldn't fit header into mbuf");
1513 			}
1514 		}
1515 		pkthdr = m0->m_data;
1516 
1517 		if (tagged) {
1518 			eth_type = CPL_ETH_II_VLAN;
1519 			ip = (struct ip *)(pkthdr + ETHER_HDR_LEN +
1520 			    ETHER_VLAN_ENCAP_LEN);
1521 		} else {
1522 			eth_type = CPL_ETH_II;
1523 			ip = (struct ip *)(pkthdr + ETHER_HDR_LEN);
1524 		}
1525 		tcp = (struct tcphdr *)((uint8_t *)ip +
1526 		    sizeof(*ip));
1527 
1528 		tso_info |= V_LSO_ETH_TYPE(eth_type) |
1529 			    V_LSO_IPHDR_WORDS(ip->ip_hl) |
1530 			    V_LSO_TCPHDR_WORDS(tcp->th_off);
1531 		hdr->lso_info = htonl(tso_info);
1532 
1533 		if (__predict_false(mlen <= PIO_LEN)) {
1534 			/* pkt not undersized but fits in PIO_LEN
1535 			/*
1536 			 * pkt is not undersized but still fits in PIO_LEN;
1537 			 * this indicates a TSO bug at the higher levels.
1538 			 */
1539 			    m0, mlen, m0->m_pkthdr.tso_segsz, m0->m_pkthdr.csum_flags, m0->m_flags);
1540 			txsd->m = NULL;
1541 			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[3]);
1542 			flits = (mlen + 7) / 8 + 3;
1543 			wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1544 					  V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1545 					  F_WR_SOP | F_WR_EOP | txqs.compl);
1546 			wr_lo = htonl(V_WR_LEN(flits) |
1547 			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1548 			set_wr_hdr(&hdr->wr, wr_hi, wr_lo);
1549 			wmb();
1550 			wr_gen2(txd, txqs.gen);
1551 			check_ring_tx_db(sc, txq);
1552 			return (0);
1553 		}
1554 		flits = 3;
1555 	} else {
1556 		struct cpl_tx_pkt *cpl = (struct cpl_tx_pkt *)txd;
1557 
1558 		GET_VTAG(cntrl, m0);
1559 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1560 		if (__predict_false(!(m0->m_pkthdr.csum_flags & CSUM_IP)))
1561 			cntrl |= F_TXPKT_IPCSUM_DIS;
1562 		if (__predict_false(!(m0->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP))))
1563 			cntrl |= F_TXPKT_L4CSUM_DIS;
1564 		cpl->cntrl = htonl(cntrl);
1565 		cpl->len = htonl(mlen | 0x80000000);
1566 
1567 		if (mlen <= PIO_LEN) {
1568 			txsd->m = NULL;
1569 			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[2]);
1570 			flits = (mlen + 7) / 8 + 2;
1571 
1572 			wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1573 			    V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1574 					  F_WR_SOP | F_WR_EOP | txqs.compl);
1575 			wr_lo = htonl(V_WR_LEN(flits) |
1576 			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1577 			set_wr_hdr(&cpl->wr, wr_hi, wr_lo);
1578 			wmb();
1579 			wr_gen2(txd, txqs.gen);
1580 			check_ring_tx_db(sc, txq);
1581 			return (0);
1582 		}
1583 		flits = 2;
1584 	}
1585 	wrp = (struct work_request_hdr *)txd;
1586 	sgp = (ndesc == 1) ? (struct sg_ent *)&txd->flit[flits] : sgl;
1587 	make_sgl(sgp, segs, nsegs);
1588 
1589 	sgl_flits = sgl_len(nsegs);
1590 
1591 	KASSERT(ndesc <= 4, ("ndesc too large %d", ndesc));
1592 	wr_hi = htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1593 	wr_lo = htonl(V_WR_TID(txq->token));
1594 	write_wr_hdr_sgl(ndesc, txd, &txqs, txq, sgl, flits,
1595 	    sgl_flits, wr_hi, wr_lo);
1596 	check_ring_tx_db(pi->adapter, txq);
1597 
1598 	return (0);
1599 }
1600 
1601 void
1602 cxgb_tx_watchdog(void *arg)
1603 {
1604 	struct sge_qset *qs = arg;
1605 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1606 
1607         if (qs->coalescing != 0 &&
1608 	    (txq->in_use <= cxgb_tx_coalesce_enable_stop) &&
1609 	    TXQ_RING_EMPTY(qs))
1610                 qs->coalescing = 0;
1611         else if (qs->coalescing == 0 &&
1612 	    (txq->in_use >= cxgb_tx_coalesce_enable_start))
1613                 qs->coalescing = 1;
1614 	if (TXQ_TRYLOCK(qs)) {
1615 		qs->qs_flags |= QS_FLUSHING;
1616 		cxgb_start_locked(qs);
1617 		qs->qs_flags &= ~QS_FLUSHING;
1618 		TXQ_UNLOCK(qs);
1619 	}
1620 	if (qs->port->ifp->if_drv_flags & IFF_DRV_RUNNING)
1621 		callout_reset_on(&txq->txq_watchdog, hz/4, cxgb_tx_watchdog,
1622 		    qs, txq->txq_watchdog.c_cpu);
1623 }
1624 
1625 static void
1626 cxgb_tx_timeout(void *arg)
1627 {
1628 	struct sge_qset *qs = arg;
1629 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1630 
1631 	if (qs->coalescing == 0 && (txq->in_use >= (txq->size>>3)))
1632                 qs->coalescing = 1;
1633 	if (TXQ_TRYLOCK(qs)) {
1634 		qs->qs_flags |= QS_TIMEOUT;
1635 		cxgb_start_locked(qs);
1636 		qs->qs_flags &= ~QS_TIMEOUT;
1637 		TXQ_UNLOCK(qs);
1638 	}
1639 }
1640 
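/*
 * Drain the queue set's buf_ring while the link is up and descriptors are
 * available, encapsulating at most TX_START_MAX_DESC descriptors' worth of
 * packets per call.  Called with the Tx queue lock held.
 */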
1641 static void
1642 cxgb_start_locked(struct sge_qset *qs)
1643 {
1644 	struct mbuf *m_head = NULL;
1645 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1646 	int avail, txmax;
1647 	int in_use_init = txq->in_use;
1648 	struct port_info *pi = qs->port;
1649 	struct ifnet *ifp = pi->ifp;
1650 	avail = txq->size - txq->in_use - 4;
1651 	txmax = min(TX_START_MAX_DESC, avail);
1652 
1653 	if (qs->qs_flags & (QS_FLUSHING|QS_TIMEOUT))
1654 		reclaim_completed_tx(qs, 0, TXQ_ETH);
1655 
1656 	if (!pi->link_config.link_ok) {
1657 		TXQ_RING_FLUSH(qs);
1658 		return;
1659 	}
1660 	TXQ_LOCK_ASSERT(qs);
1661 	while ((txq->in_use - in_use_init < txmax) &&
1662 	    !TXQ_RING_EMPTY(qs) &&
1663 	    (ifp->if_drv_flags & IFF_DRV_RUNNING) &&
1664 	    pi->link_config.link_ok) {
1665 		reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH);
1666 
1667 		if ((m_head = cxgb_dequeue(qs)) == NULL)
1668 			break;
1669 		/*
1670 		 *  Encapsulation can modify our pointer, and/or make it
1671 		 *  NULL on failure.  In that event, we can't requeue.
1672 		 */
1673 		if (t3_encap(qs, &m_head) || m_head == NULL)
1674 			break;
1675 
1676 		/* Send a copy of the frame to the BPF listener */
1677 		ETHER_BPF_MTAP(ifp, m_head);
1678 
1679 		/*
1680 		 * We sent via PIO, no longer need a copy
1681 		 */
1682 		if (m_head->m_nextpkt == NULL &&
1683 		    m_head->m_pkthdr.len <= PIO_LEN)
1684 			m_freem(m_head);
1685 
1686 		m_head = NULL;
1687 	}
1688 	if (!TXQ_RING_EMPTY(qs) && callout_pending(&txq->txq_timer) == 0 &&
1689 	    pi->link_config.link_ok)
1690 		callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout,
1691 		    qs, txq->txq_timer.c_cpu);
1692 	if (m_head != NULL)
1693 		m_freem(m_head);
1694 }
1695 
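/*
 * Fast path for if_transmit: attempt a direct t3_encap() when the ring is
 * empty and we are not coalescing; otherwise enqueue on the buf_ring and
 * let cxgb_start_locked() or the Tx timer drain it.
 */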
1696 static int
1697 cxgb_transmit_locked(struct ifnet *ifp, struct sge_qset *qs, struct mbuf *m)
1698 {
1699 	struct port_info *pi = qs->port;
1700 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1701 	struct buf_ring *br = txq->txq_mr;
1702 	int error, avail;
1703 
1704 	avail = txq->size - txq->in_use;
1705 	TXQ_LOCK_ASSERT(qs);
1706 
1707 	/*
1708 	 * We can only do a direct transmit if the following are true:
1709 	 * - we aren't coalescing (ring < 3/4 full)
1710 	 * - the link is up -- checked in caller
1711 	 * - there are no packets enqueued already
1712 	 * - there is space in hardware transmit queue
1713 	 */
1714 	if (check_pkt_coalesce(qs) == 0 &&
1715 	    TXQ_RING_EMPTY(qs) && avail > 4) {
1716 		if (t3_encap(qs, &m)) {
1717 			if (m != NULL &&
1718 			    (error = drbr_enqueue(ifp, br, m)) != 0)
1719 				return (error);
1720 		} else {
1721 			/*
1722 			 * We've bypassed the buf ring so we need to update
1723 			 * the stats directly
1724 			 */
1725 			txq->txq_direct_packets++;
1726 			txq->txq_direct_bytes += m->m_pkthdr.len;
1727 			/*
1728 			 * Send a copy of the frame to the BPF
1729 			 * listener.
1730 			 */
1731 			ETHER_BPF_MTAP(ifp, m);
1732 			/*
1733 			 * We sent via PIO, no longer need a copy
1734 			 */
1735 			if (m->m_pkthdr.len <= PIO_LEN)
1736 				m_freem(m);
1737 
1738 		}
1739 	} else if ((error = drbr_enqueue(ifp, br, m)) != 0)
1740 		return (error);
1741 
1742 	reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH);
1743 	if (!TXQ_RING_EMPTY(qs) && pi->link_config.link_ok &&
1744 	    (!check_pkt_coalesce(qs) || (drbr_inuse(ifp, br) >= 7)))
1745 		cxgb_start_locked(qs);
1746 	else if (!TXQ_RING_EMPTY(qs) && !callout_pending(&txq->txq_timer))
1747 		callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout,
1748 		    qs, txq->txq_timer.c_cpu);
1749 	return (0);
1750 }
1751 
1752 int
1753 cxgb_transmit(struct ifnet *ifp, struct mbuf *m)
1754 {
1755 	struct sge_qset *qs;
1756 	struct port_info *pi = ifp->if_softc;
1757 	int error, qidx = pi->first_qset;
1758 
1759 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
1760 	    !pi->link_config.link_ok) {
1761 		m_freem(m);
1762 		return (0);
1763 	}
1764 
1765 	if (m->m_flags & M_FLOWID)
1766 		qidx = (m->m_pkthdr.flowid % pi->nqsets) + pi->first_qset;
1767 
1768 	qs = &pi->adapter->sge.qs[qidx];
1769 
1770 	if (TXQ_TRYLOCK(qs)) {
1771 		/* XXX running */
1772 		error = cxgb_transmit_locked(ifp, qs, m);
1773 		TXQ_UNLOCK(qs);
1774 	} else
1775 		error = drbr_enqueue(ifp, qs->txq[TXQ_ETH].txq_mr, m);
1776 	return (error);
1777 }
1778 void
1779 cxgb_start(struct ifnet *ifp)
1780 {
1781 	struct port_info *pi = ifp->if_softc;
1782 	struct sge_qset *qs = &pi->adapter->sge.qs[pi->first_qset];
1783 
1784 	if (!pi->link_config.link_ok)
1785 		return;
1786 
1787 	TXQ_LOCK(qs);
1788 	cxgb_start_locked(qs);
1789 	TXQ_UNLOCK(qs);
1790 }
1791 
1792 void
1793 cxgb_qflush(struct ifnet *ifp)
1794 {
1795 	/*
1796 	 * Should flush any mbufs enqueued in the buf_rings
1797 	 * and in the transmit queues.
1798 	 * This is a no-op for now.
1799 	 */
1800 	return;
1801 }
1802 
1803 /**
1804  *	write_imm - write a packet into a Tx descriptor as immediate data
1805  *	@d: the Tx descriptor to write
1806  *	@m: the packet
1807  *	@len: the length of packet data to write as immediate data
1808  *	@gen: the generation bit value to write
1809  *
1810  *	Writes a packet as immediate data into a Tx descriptor.  The packet
1811  *	contains a work request at its beginning.  We must write the packet
1812  *	carefully so the SGE doesn't accidentally read it before it has been
1813  *	written in its entirety.
1814  */
1815 static __inline void
1816 write_imm(struct tx_desc *d, struct mbuf *m,
1817 	  unsigned int len, unsigned int gen)
1818 {
1819 	struct work_request_hdr *from = mtod(m, struct work_request_hdr *);
1820 	struct work_request_hdr *to = (struct work_request_hdr *)d;
1821 	uint32_t wr_hi, wr_lo;
1822 
1823 	if (len > WR_LEN)
1824 		panic("len too big %d\n", len);
1825 	if (len < sizeof(*from))
1826 		panic("len too small %d", len);
1827 
1828 	memcpy(&to[1], &from[1], len - sizeof(*from));
1829 	wr_hi = from->wrh_hi | htonl(F_WR_SOP | F_WR_EOP |
1830 					V_WR_BCNTLFLT(len & 7));
1831 	wr_lo = from->wrh_lo | htonl(V_WR_GEN(gen) |
1832 					V_WR_LEN((len + 7) / 8));
1833 	set_wr_hdr(to, wr_hi, wr_lo);
1834 	wmb();
1835 	wr_gen2(d, gen);
1836 
1837 	/*
1838 	 * This check is a hack; we should really fix the logic so
1839 	 * that this can't happen.
1840 	 */
1841 	if (m->m_type != MT_DONTFREE)
1842 		m_freem(m);
1843 
1844 }
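
/*
 * A worked example of the length encoding above (illustrative, not part of
 * the original sources): for an immediate WR of len = 43 bytes,
 * V_WR_LEN((len + 7) / 8) encodes ceil(43 / 8) = 6 flits and
 * V_WR_BCNTLFLT(len & 7) encodes the 3 residual bytes, letting the SGE
 * recover the exact byte count from the flit count plus the remainder.
 */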
1845 
1846 /**
1847  *	check_desc_avail - check descriptor availability on a send queue
1848  *	@adap: the adapter
1849  *	@q: the TX queue
1850  *	@m: the packet needing the descriptors
1851  *	@ndesc: the number of Tx descriptors needed
1852  *	@qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL)
1853  *
1854  *	Checks if the requested number of Tx descriptors is available on an
1855  *	SGE send queue.  If the queue is already suspended or not enough
1856  *	descriptors are available the packet is queued for later transmission.
1857  *	Must be called with the Tx queue locked.
1858  *
1859  *	Returns 0 if enough descriptors are available, 1 if there aren't
1860  *	enough descriptors and the packet has been queued, and 2 if the caller
1861  *	needs to retry because there weren't enough descriptors at the
1862  *	beginning of the call but some freed up in the meantime.
1863  */
1864 static __inline int
1865 check_desc_avail(adapter_t *adap, struct sge_txq *q,
1866 		 struct mbuf *m, unsigned int ndesc,
1867 		 unsigned int qid)
1868 {
1869 	/*
1870 	 * XXX We currently only use this for checking the control queue;
1871 	 * the control queue is only used for binding qsets, which happens
1872 	 * at init time, so we are guaranteed enough descriptors.
1873 	 */
1874 	if (__predict_false(!mbufq_empty(&q->sendq))) {
1875 addq_exit:	mbufq_tail(&q->sendq, m);
1876 		return 1;
1877 	}
1878 	if (__predict_false(q->size - q->in_use < ndesc)) {
1879 
1880 		struct sge_qset *qs = txq_to_qset(q, qid);
1881 
1882 		setbit(&qs->txq_stopped, qid);
1883 		if (should_restart_tx(q) &&
1884 		    test_and_clear_bit(qid, &qs->txq_stopped))
1885 			return 2;
1886 
1887 		q->stops++;
1888 		goto addq_exit;
1889 	}
1890 	return 0;
1891 }
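
/*
 * Typical caller pattern (see ctrl_xmit() and ofld_xmit() below): reclaim
 * completed descriptors, call check_desc_avail(), and on a return of 2 loop
 * back to the reclaim step because descriptors freed up while the queue was
 * being stopped; on a return of 1 the mbuf is already on the sendq, so the
 * caller simply unlocks and backs off.
 */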
1892 
1893 
1894 /**
1895  *	reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
1896  *	@q: the SGE control Tx queue
1897  *
1898  *	This is a variant of reclaim_completed_tx() that is used for Tx queues
1899  *	that send only immediate data (presently just the control queues) and
1900  *	thus do not have any mbufs
1901  */
1902 static __inline void
1903 reclaim_completed_tx_imm(struct sge_txq *q)
1904 {
1905 	unsigned int reclaim = q->processed - q->cleaned;
1906 
1907 	q->in_use -= reclaim;
1908 	q->cleaned += reclaim;
1909 }
1910 
1911 static __inline int
1912 immediate(const struct mbuf *m)
1913 {
1914 	return m->m_len <= WR_LEN && m->m_pkthdr.len <= WR_LEN;
1915 }
1916 
1917 /**
1918  *	ctrl_xmit - send a packet through an SGE control Tx queue
1919  *	@adap: the adapter
1920  *	@q: the control queue
1921  *	@m: the packet
1922  *
1923  *	Send a packet through an SGE control Tx queue.  Packets sent through
1924  *	a control queue must fit entirely as immediate data in a single Tx
1925  *	descriptor and have no page fragments.
1926  */
1927 static int
1928 ctrl_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m)
1929 {
1930 	int ret;
1931 	struct work_request_hdr *wrp = mtod(m, struct work_request_hdr *);
1932 	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1933 
1934 	if (__predict_false(!immediate(m))) {
1935 		m_freem(m);
1936 		return 0;
1937 	}
1938 
1939 	wrp->wrh_hi |= htonl(F_WR_SOP | F_WR_EOP);
1940 	wrp->wrh_lo = htonl(V_WR_TID(q->token));
1941 
1942 	TXQ_LOCK(qs);
1943 again:	reclaim_completed_tx_imm(q);
1944 
1945 	ret = check_desc_avail(adap, q, m, 1, TXQ_CTRL);
1946 	if (__predict_false(ret)) {
1947 		if (ret == 1) {
1948 			TXQ_UNLOCK(qs);
1949 			return (ENOSPC);
1950 		}
1951 		goto again;
1952 	}
1953 	write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
1954 
1955 	q->in_use++;
1956 	if (++q->pidx >= q->size) {
1957 		q->pidx = 0;
1958 		q->gen ^= 1;
1959 	}
1960 	TXQ_UNLOCK(qs);
1961 	wmb();
1962 	t3_write_reg(adap, A_SG_KDOORBELL,
1963 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1964 	return (0);
1965 }
1966 
1967 
1968 /**
1969  *	restart_ctrlq - restart a suspended control queue
1970  *	@qs: the queue set containing the control queue
1971  *
1972  *	Resumes transmission on a suspended Tx control queue.
1973  */
1974 static void
1975 restart_ctrlq(void *data, int npending)
1976 {
1977 	struct mbuf *m;
1978 	struct sge_qset *qs = (struct sge_qset *)data;
1979 	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1980 	adapter_t *adap = qs->port->adapter;
1981 
1982 	TXQ_LOCK(qs);
1983 again:	reclaim_completed_tx_imm(q);
1984 
1985 	while (q->in_use < q->size &&
1986 	       (m = mbufq_dequeue(&q->sendq)) != NULL) {
1987 
1988 		write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
1989 
1990 		if (++q->pidx >= q->size) {
1991 			q->pidx = 0;
1992 			q->gen ^= 1;
1993 		}
1994 		q->in_use++;
1995 	}
1996 	if (!mbufq_empty(&q->sendq)) {
1997 		setbit(&qs->txq_stopped, TXQ_CTRL);
1998 
1999 		if (should_restart_tx(q) &&
2000 		    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped))
2001 			goto again;
2002 		q->stops++;
2003 	}
2004 	TXQ_UNLOCK(qs);
2005 	t3_write_reg(adap, A_SG_KDOORBELL,
2006 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
2007 }
2008 
2009 
2010 /*
2011  * Send a management message through control queue 0
2012  */
2013 int
2014 t3_mgmt_tx(struct adapter *adap, struct mbuf *m)
2015 {
2016 	return ctrl_xmit(adap, &adap->sge.qs[0], m);
2017 }
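
/*
 * A minimal sketch (illustrative only, not part of the original driver) of
 * how a caller might hand a management work request to t3_mgmt_tx().  The
 * WR must fit entirely as immediate data; ctrl_xmit() fills in the SOP/EOP
 * flags and the queue token.
 */
#if 0
static int
example_mgmt_tx(adapter_t *sc)
{
	struct mbuf *m;
	struct work_request_hdr *wrp;

	if ((m = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL)
		return (ENOMEM);
	m->m_len = m->m_pkthdr.len = sizeof(*wrp);	/* must stay <= WR_LEN */
	wrp = mtod(m, struct work_request_hdr *);
	bzero(wrp, sizeof(*wrp));
	/* ... fill in the WR opcode and body for the desired command ... */
	return (t3_mgmt_tx(sc, m));
}
#endif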
2018 
2019 /**
2020  *	free_qset - free the resources of an SGE queue set
2021  *	@sc: the controller owning the queue set
2022  *	@q: the queue set
2023  *
2024  *	Release the HW and SW resources associated with an SGE queue set, such
2025  *	as HW contexts, packet buffers, and descriptor rings.  Traffic to the
2026  *	queue set must be quiesced prior to calling this.
2027  */
2028 static void
2029 t3_free_qset(adapter_t *sc, struct sge_qset *q)
2030 {
2031 	int i;
2032 
2033 	reclaim_completed_tx(q, 0, TXQ_ETH);
2034 	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
2035 		if (q->txq[i].txq_mr != NULL)
2036 			buf_ring_free(q->txq[i].txq_mr, M_DEVBUF);
2037 		if (q->txq[i].txq_ifq != NULL) {
2038 			ifq_delete(q->txq[i].txq_ifq);
2039 			free(q->txq[i].txq_ifq, M_DEVBUF);
2040 		}
2041 	}
2042 
2043 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
2044 		if (q->fl[i].desc) {
2045 			mtx_lock_spin(&sc->sge.reg_lock);
2046 			t3_sge_disable_fl(sc, q->fl[i].cntxt_id);
2047 			mtx_unlock_spin(&sc->sge.reg_lock);
2048 			bus_dmamap_unload(q->fl[i].desc_tag, q->fl[i].desc_map);
2049 			bus_dmamem_free(q->fl[i].desc_tag, q->fl[i].desc,
2050 					q->fl[i].desc_map);
2051 			bus_dma_tag_destroy(q->fl[i].desc_tag);
2052 			bus_dma_tag_destroy(q->fl[i].entry_tag);
2053 		}
2054 		if (q->fl[i].sdesc) {
2055 			free_rx_bufs(sc, &q->fl[i]);
2056 			free(q->fl[i].sdesc, M_DEVBUF);
2057 		}
2058 	}
2059 
2060 	mtx_unlock(&q->lock);
2061 	MTX_DESTROY(&q->lock);
2062 	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
2063 		if (q->txq[i].desc) {
2064 			mtx_lock_spin(&sc->sge.reg_lock);
2065 			t3_sge_enable_ecntxt(sc, q->txq[i].cntxt_id, 0);
2066 			mtx_unlock_spin(&sc->sge.reg_lock);
2067 			bus_dmamap_unload(q->txq[i].desc_tag,
2068 					q->txq[i].desc_map);
2069 			bus_dmamem_free(q->txq[i].desc_tag, q->txq[i].desc,
2070 					q->txq[i].desc_map);
2071 			bus_dma_tag_destroy(q->txq[i].desc_tag);
2072 			bus_dma_tag_destroy(q->txq[i].entry_tag);
2073 		}
2074 		if (q->txq[i].sdesc) {
2075 			free(q->txq[i].sdesc, M_DEVBUF);
2076 		}
2077 	}
2078 
2079 	if (q->rspq.desc) {
2080 		mtx_lock_spin(&sc->sge.reg_lock);
2081 		t3_sge_disable_rspcntxt(sc, q->rspq.cntxt_id);
2082 		mtx_unlock_spin(&sc->sge.reg_lock);
2083 
2084 		bus_dmamap_unload(q->rspq.desc_tag, q->rspq.desc_map);
2085 		bus_dmamem_free(q->rspq.desc_tag, q->rspq.desc,
2086 			        q->rspq.desc_map);
2087 		bus_dma_tag_destroy(q->rspq.desc_tag);
2088 		MTX_DESTROY(&q->rspq.lock);
2089 	}
2090 
2091 #ifdef LRO_SUPPORTED
2092 	tcp_lro_free(&q->lro.ctrl);
2093 #endif
2094 
2095 	bzero(q, sizeof(*q));
2096 }
2097 
2098 /**
2099  *	t3_free_sge_resources - free SGE resources
2100  *	@sc: the adapter softc
2101  *
2102  *	Frees resources used by the SGE queue sets.
2103  */
2104 void
2105 t3_free_sge_resources(adapter_t *sc)
2106 {
2107 	int i, nqsets;
2108 
2109 	for (nqsets = i = 0; i < (sc)->params.nports; i++)
2110 		nqsets += sc->port[i].nqsets;
2111 
2112 	for (i = 0; i < nqsets; ++i) {
2113 		TXQ_LOCK(&sc->sge.qs[i]);
2114 		t3_free_qset(sc, &sc->sge.qs[i]);
2115 	}
2116 
2117 }
2118 
2119 /**
2120  *	t3_sge_start - enable SGE
2121  *	@sc: the controller softc
2122  *
2123  *	Enables the SGE for DMAs.  This is the last step in starting packet
2124  *	transfers.
2125  */
2126 void
2127 t3_sge_start(adapter_t *sc)
2128 {
2129 	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE);
2130 }
2131 
2132 /**
2133  *	t3_sge_stop - disable SGE operation
2134  *	@sc: the adapter
2135  *
2136  *	Disables the DMA engine.  This can be called in emergencies (e.g.,
2137  *	from error interrupts) or from normal process context.  In the latter
2138  *	case it also disables any pending queue restart tasklets.  Note that
2139  *	if it is called in interrupt context it cannot disable the restart
2140  *	tasklets as it cannot wait.  However, the tasklets will have no effect
2141  *	since the doorbells are disabled and the driver will call this again
2142  *	later from process context, at which time the tasklets will be stopped
2143  *	if they are still running.
2144  */
2145 void
2146 t3_sge_stop(adapter_t *sc)
2147 {
2148 	int i, nqsets;
2149 
2150 	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, 0);
2151 
2152 	if (sc->tq == NULL)
2153 		return;
2154 
2155 	for (nqsets = i = 0; i < (sc)->params.nports; i++)
2156 		nqsets += sc->port[i].nqsets;
2157 #ifdef notyet
2158 	/*
2159 	 *
2160 	 * XXX
2161 	 */
2162 	for (i = 0; i < nqsets; ++i) {
2163 		struct sge_qset *qs = &sc->sge.qs[i];
2164 
2165 		taskqueue_drain(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2166 		taskqueue_drain(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2167 	}
2168 #endif
2169 }
2170 
2171 /**
2172  *	t3_free_tx_desc - reclaims Tx descriptors and their buffers
2173  *	@qs: the queue set owning the Tx queue
2174  *	@reclaimable: the number of descriptors to reclaim
2175  *	@queue: the Tx queue within the set (TXQ_ETH, TXQ_OFLD, or TXQ_CTRL)
2176  *
2177  *	Reclaims Tx descriptors from an SGE Tx queue and frees the associated
2178  *	Tx buffers.  Called with the Tx queue lock held.
2179  *
2180  *	Advances the queue's consumer index (cidx) past the reclaimed
2181  *	descriptors; descriptors with no mbuf attached are counted in
2182  *	txq_skipped.
2183  */
2184 void
2185 t3_free_tx_desc(struct sge_qset *qs, int reclaimable, int queue)
2186 {
2187 	struct tx_sw_desc *txsd;
2188 	unsigned int cidx, mask;
2189 	struct sge_txq *q = &qs->txq[queue];
2190 
2191 #ifdef T3_TRACE
2192 	T3_TRACE2(sc->tb[q->cntxt_id & 7],
2193 		  "reclaiming %u Tx descriptors at cidx %u", reclaimable, cidx);
2194 #endif
2195 	cidx = q->cidx;
2196 	mask = q->size - 1;
2197 	txsd = &q->sdesc[cidx];
2198 
2199 	mtx_assert(&qs->lock, MA_OWNED);
2200 	while (reclaimable--) {
2201 		prefetch(q->sdesc[(cidx + 1) & mask].m);
2202 		prefetch(q->sdesc[(cidx + 2) & mask].m);
2203 
2204 		if (txsd->m != NULL) {
2205 			if (txsd->flags & TX_SW_DESC_MAPPED) {
2206 				bus_dmamap_unload(q->entry_tag, txsd->map);
2207 				txsd->flags &= ~TX_SW_DESC_MAPPED;
2208 			}
2209 			m_freem_list(txsd->m);
2210 			txsd->m = NULL;
2211 		} else
2212 			q->txq_skipped++;
2213 
2214 		++txsd;
2215 		if (++cidx == q->size) {
2216 			cidx = 0;
2217 			txsd = q->sdesc;
2218 		}
2219 	}
2220 	q->cidx = cidx;
2221 
2222 }
2223 
2224 /**
2225  *	is_new_response - check if a response is newly written
2226  *	@r: the response descriptor
2227  *	@q: the response queue
2228  *
2229  *	Returns true if a response descriptor contains a yet unprocessed
2230  *	response.
2231  */
2232 static __inline int
2233 is_new_response(const struct rsp_desc *r,
2234     const struct sge_rspq *q)
2235 {
2236 	return (r->intr_gen & F_RSPD_GEN2) == q->gen;
2237 }
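
/*
 * The generation handshake: q->gen toggles each time the consumer index
 * wraps (see process_responses() below), so a descriptor left over from the
 * previous pass around the ring carries the stale generation value and is
 * not treated as new.
 */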
2238 
2239 #define RSPD_GTS_MASK  (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS)
2240 #define RSPD_CTRL_MASK (RSPD_GTS_MASK | \
2241 			V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \
2242 			V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \
2243 			V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR))
2244 
2245 /* How long to delay the next interrupt in case of memory shortage, in 0.1us. */
2246 #define NOMEM_INTR_DELAY 2500
2247 
2248 /**
2249  *	write_ofld_wr - write an offload work request
2250  *	@adap: the adapter
2251  *	@m: the packet to send
2252  *	@q: the Tx queue
2253  *	@pidx: index of the first Tx descriptor to write
2254  *	@gen: the generation value to use
2255  *	@ndesc: number of descriptors the packet will occupy
2256  *
2257  *	Write an offload work request to send the supplied packet.  The packet
2258  *	data already carry the work request with most fields populated.
2259  */
2260 static void
2261 write_ofld_wr(adapter_t *adap, struct mbuf *m,
2262     struct sge_txq *q, unsigned int pidx,
2263     unsigned int gen, unsigned int ndesc,
2264     bus_dma_segment_t *segs, unsigned int nsegs)
2265 {
2266 	unsigned int sgl_flits, flits;
2267 	struct work_request_hdr *from;
2268 	struct sg_ent *sgp, sgl[TX_MAX_SEGS / 2 + 1];
2269 	struct tx_desc *d = &q->desc[pidx];
2270 	struct txq_state txqs;
2271 
2272 	if (immediate(m) && nsegs == 0) {
2273 		write_imm(d, m, m->m_len, gen);
2274 		return;
2275 	}
2276 
2277 	/* Only TX_DATA builds SGLs */
2278 	from = mtod(m, struct work_request_hdr *);
2279 	memcpy(&d->flit[1], &from[1], m->m_len - sizeof(*from));
2280 
2281 	flits = m->m_len / 8;
2282 	sgp = (ndesc == 1) ? (struct sg_ent *)&d->flit[flits] : sgl;
2283 
2284 	make_sgl(sgp, segs, nsegs);
2285 	sgl_flits = sgl_len(nsegs);
2286 
2287 	txqs.gen = gen;
2288 	txqs.pidx = pidx;
2289 	txqs.compl = 0;
2290 
2291 	write_wr_hdr_sgl(ndesc, d, &txqs, q, sgl, flits, sgl_flits,
2292 	    from->wrh_hi, from->wrh_lo);
2293 }
2294 
2295 /**
2296  *	calc_tx_descs_ofld - calculate # of Tx descriptors for an offload packet
2297  *	@m: the packet
2298  *
2299  * 	Returns the number of Tx descriptors needed for the given offload
2300  * 	packet.  These packets are already fully constructed.
2301  */
2302 static __inline unsigned int
2303 calc_tx_descs_ofld(struct mbuf *m, unsigned int nsegs)
2304 {
2305 	unsigned int flits, cnt = 0;
2306 	int ndescs;
2307 
2308 	if (m->m_len <= WR_LEN && nsegs == 0)
2309 		return (1);                 /* packet fits as immediate data */
2310 
2311 	/*
2312 	 * This needs to be re-visited for TOE
2313 	 */
2314 
2315 	cnt = nsegs;
2316 
2317 	/* headers */
2318 	flits = m->m_len / 8;
2319 
2320 	ndescs = flits_to_desc(flits + sgl_len(cnt));
2321 
2322 	return (ndescs);
2323 }
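
/*
 * Illustrative arithmetic (a sketch, assuming sgl_len(n) returns the flit
 * count of an n-entry gather list, as it is used in write_ofld_wr()): an
 * offload packet with a 40-byte header portion (40 / 8 = 5 flits) and 4 DMA
 * segments needs flits_to_desc(5 + sgl_len(4)) descriptors, while a fully
 * immediate packet (m_len <= WR_LEN, no segments) always fits in one.
 */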
2324 
2325 /**
2326  *	ofld_xmit - send a packet through an offload queue
2327  *	@adap: the adapter
2328  *	@q: the Tx offload queue
2329  *	@m: the packet
2330  *
2331  *	Send an offload packet through an SGE offload queue.
2332  */
2333 static int
2334 ofld_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m)
2335 {
2336 	int ret, nsegs;
2337 	unsigned int ndesc;
2338 	unsigned int pidx, gen;
2339 	struct sge_txq *q = &qs->txq[TXQ_OFLD];
2340 	bus_dma_segment_t segs[TX_MAX_SEGS], *vsegs;
2341 	struct tx_sw_desc *stx;
2342 
2343 	nsegs = m_get_sgllen(m);
2344 	vsegs = m_get_sgl(m);
2345 	ndesc = calc_tx_descs_ofld(m, nsegs);
2346 	busdma_map_sgl(vsegs, segs, nsegs);
2347 
2348 	stx = &q->sdesc[q->pidx];
2349 
2350 	TXQ_LOCK(qs);
2351 again:	reclaim_completed_tx(qs, 16, TXQ_OFLD);
2352 	ret = check_desc_avail(adap, q, m, ndesc, TXQ_OFLD);
2353 	if (__predict_false(ret)) {
2354 		if (ret == 1) {
2355 			printf("no ofld desc avail\n");
2356 
2357 			m_set_priority(m, ndesc);     /* save for restart */
2358 			TXQ_UNLOCK(qs);
2359 			return (EINTR);
2360 		}
2361 		goto again;
2362 	}
2363 
2364 	gen = q->gen;
2365 	q->in_use += ndesc;
2366 	pidx = q->pidx;
2367 	q->pidx += ndesc;
2368 	if (q->pidx >= q->size) {
2369 		q->pidx -= q->size;
2370 		q->gen ^= 1;
2371 	}
2372 #ifdef T3_TRACE
2373 	T3_TRACE5(adap->tb[q->cntxt_id & 7],
2374 		  "ofld_xmit: ndesc %u, pidx %u, len %u, main %u, frags %u",
2375 		  ndesc, pidx, skb->len, skb->len - skb->data_len,
2376 		  skb_shinfo(skb)->nr_frags);
2377 #endif
2378 	TXQ_UNLOCK(qs);
2379 
2380 	write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
2381 	check_ring_tx_db(adap, q);
2382 	return (0);
2383 }
2384 
2385 /**
2386  *	restart_offloadq - restart a suspended offload queue
2387  *	@qs: the queue set containing the offload queue
2388  *
2389  *	Resumes transmission on a suspended Tx offload queue.
2390  */
2391 static void
2392 restart_offloadq(void *data, int npending)
2393 {
2394 	struct mbuf *m;
2395 	struct sge_qset *qs = data;
2396 	struct sge_txq *q = &qs->txq[TXQ_OFLD];
2397 	adapter_t *adap = qs->port->adapter;
2398 	bus_dma_segment_t segs[TX_MAX_SEGS];
2399 	struct tx_sw_desc *stx = &q->sdesc[q->pidx];
2400 	int nsegs, cleaned;
2401 
2402 	TXQ_LOCK(qs);
2403 again:	cleaned = reclaim_completed_tx(qs, 16, TXQ_OFLD);
2404 
2405 	while ((m = mbufq_peek(&q->sendq)) != NULL) {
2406 		unsigned int gen, pidx;
2407 		unsigned int ndesc = m_get_priority(m);
2408 
2409 		if (__predict_false(q->size - q->in_use < ndesc)) {
2410 			setbit(&qs->txq_stopped, TXQ_OFLD);
2411 			if (should_restart_tx(q) &&
2412 			    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped))
2413 				goto again;
2414 			q->stops++;
2415 			break;
2416 		}
2417 
2418 		gen = q->gen;
2419 		q->in_use += ndesc;
2420 		pidx = q->pidx;
2421 		q->pidx += ndesc;
2422 		if (q->pidx >= q->size) {
2423 			q->pidx -= q->size;
2424 			q->gen ^= 1;
2425 		}
2426 
2427 		(void)mbufq_dequeue(&q->sendq);
2428 		busdma_map_mbufs(&m, q, stx, segs, &nsegs);
2429 		TXQ_UNLOCK(qs);
2430 		write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
2431 		TXQ_LOCK(qs);
2432 	}
2433 #if USE_GTS
2434 	set_bit(TXQ_RUNNING, &q->flags);
2435 	set_bit(TXQ_LAST_PKT_DB, &q->flags);
2436 #endif
2437 	TXQ_UNLOCK(qs);
2438 	wmb();
2439 	t3_write_reg(adap, A_SG_KDOORBELL,
2440 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
2441 }
2442 
2443 /**
2444  *	queue_set - return the queue set a packet should use
2445  *	@m: the packet
2446  *
2447  *	Maps a packet to the SGE queue set it should use.  The desired queue
2448  *	set is carried in bits 1-3 in the packet's priority.
2449  */
2450 static __inline int
2451 queue_set(const struct mbuf *m)
2452 {
2453 	return m_get_priority(m) >> 1;
2454 }
2455 
2456 /**
2457  *	is_ctrl_pkt - return whether an offload packet is a control packet
2458  *	@m: the packet
2459  *
2460  *	Determines whether an offload packet should use an OFLD or a CTRL
2461  *	Tx queue.  This is indicated by bit 0 in the packet's priority.
2462  */
2463 static __inline int
2464 is_ctrl_pkt(const struct mbuf *m)
2465 {
2466 	return m_get_priority(m) & 1;
2467 }
2468 
2469 /**
2470  *	t3_offload_tx - send an offload packet
2471  *	@tdev: the offload device to send to
2472  *	@m: the packet
2473  *
2474  *	Sends an offload packet.  We use the packet priority to select the
2475  *	appropriate Tx queue as follows: bit 0 indicates whether the packet
2476  *	should be sent as regular or control, bits 1-3 select the queue set.
2477  */
2478 int
2479 t3_offload_tx(struct t3cdev *tdev, struct mbuf *m)
2480 {
2481 	adapter_t *adap = tdev2adap(tdev);
2482 	struct sge_qset *qs = &adap->sge.qs[queue_set(m)];
2483 
2484 	if (__predict_false(is_ctrl_pkt(m)))
2485 		return ctrl_xmit(adap, qs, m);
2486 
2487 	return ofld_xmit(adap, qs, m);
2488 }
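
/*
 * A minimal sketch (illustrative only) of the priority encoding consumed by
 * queue_set() and is_ctrl_pkt() above: bit 0 selects the CTRL vs. OFLD
 * queue, bits 1-3 select the queue set.
 */
#if 0
static void
example_offload_tx(adapter_t *sc, struct mbuf *m)
{
	/* Queue an offload *control* packet on queue set 2. */
	m_set_priority(m, (2 << 1) | 1);
	t3_offload_tx(&sc->tdev, m);
}
#endif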
2489 
2490 /**
2491  *	deliver_partial_bundle - deliver a (partial) bundle of Rx offload pkts
2492  *	@tdev: the offload device that will be receiving the packets
2493  *	@q: the SGE response queue that assembled the bundle
2494  *	@m: the partial bundle
2495  *	@n: the number of packets in the bundle
2496  *
2497  *	Delivers a (partial) bundle of Rx offload packets to an offload device.
2498  */
2499 static __inline void
2500 deliver_partial_bundle(struct t3cdev *tdev,
2501 			struct sge_rspq *q,
2502 			struct mbuf *mbufs[], int n)
2503 {
2504 	if (n) {
2505 		q->offload_bundles++;
2506 		cxgb_ofld_recv(tdev, mbufs, n);
2507 	}
2508 }
2509 
2510 static __inline int
2511 rx_offload(struct t3cdev *tdev, struct sge_rspq *rq,
2512     struct mbuf *m, struct mbuf *rx_gather[],
2513     unsigned int gather_idx)
2514 {
2515 
2516 	rq->offload_pkts++;
2517 	m->m_pkthdr.header = mtod(m, void *);
2518 	rx_gather[gather_idx++] = m;
2519 	if (gather_idx == RX_BUNDLE_SIZE) {
2520 		cxgb_ofld_recv(tdev, rx_gather, RX_BUNDLE_SIZE);
2521 		gather_idx = 0;
2522 		rq->offload_bundles++;
2523 	}
2524 	return (gather_idx);
2525 }
2526 
2527 static void
2528 restart_tx(struct sge_qset *qs)
2529 {
2530 	struct adapter *sc = qs->port->adapter;
2531 
2532 
2533 	if (isset(&qs->txq_stopped, TXQ_OFLD) &&
2534 	    should_restart_tx(&qs->txq[TXQ_OFLD]) &&
2535 	    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) {
2536 		qs->txq[TXQ_OFLD].restarts++;
2537 		DPRINTF("restarting TXQ_OFLD\n");
2538 		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2539 	}
2540 	DPRINTF("stopped=0x%x restart=%d processed=%d cleaned=%d in_use=%d\n",
2541 	    qs->txq_stopped, should_restart_tx(&qs->txq[TXQ_CTRL]),
2542 	    qs->txq[TXQ_CTRL].processed, qs->txq[TXQ_CTRL].cleaned,
2543 	    qs->txq[TXQ_CTRL].in_use);
2544 
2545 	if (isset(&qs->txq_stopped, TXQ_CTRL) &&
2546 	    should_restart_tx(&qs->txq[TXQ_CTRL]) &&
2547 	    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) {
2548 		qs->txq[TXQ_CTRL].restarts++;
2549 		DPRINTF("restarting TXQ_CTRL\n");
2550 		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2551 	}
2552 }
2553 
2554 /**
2555  *	t3_sge_alloc_qset - initialize an SGE queue set
2556  *	@sc: the controller softc
2557  *	@id: the queue set id
2558  *	@nports: how many Ethernet ports will be using this queue set
2559  *	@irq_vec_idx: the IRQ vector index for response queue interrupts
2560  *	@p: configuration parameters for this queue set
2561  *	@ntxq: number of Tx queues for the queue set
2562  *	@pi: port info for queue set
2563  *
2564  *	Allocate resources and initialize an SGE queue set.  A queue set
2565  *	comprises a response queue, two Rx free-buffer queues, and up to 3
2566  *	Tx queues.  The Tx queues are assigned roles in the order Ethernet
2567  *	queue, offload queue, and control queue.
2568  */
2569 int
2570 t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx,
2571 		  const struct qset_params *p, int ntxq, struct port_info *pi)
2572 {
2573 	struct sge_qset *q = &sc->sge.qs[id];
2574 	int i, ret = 0;
2575 
2576 	MTX_INIT(&q->lock, q->namebuf, NULL, MTX_DEF);
2577 	q->port = pi;
2578 
2579 	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
2580 
2581 		if ((q->txq[i].txq_mr = buf_ring_alloc(cxgb_txq_buf_ring_size,
2582 			    M_DEVBUF, M_WAITOK, &q->lock)) == NULL) {
2583 			device_printf(sc->dev, "failed to allocate mbuf ring\n");
2584 			goto err;
2585 		}
2586 		if ((q->txq[i].txq_ifq =
2587 			malloc(sizeof(struct ifaltq), M_DEVBUF, M_NOWAIT|M_ZERO))
2588 		    == NULL) {
2589 			device_printf(sc->dev, "failed to allocate ifq\n");
2590 			goto err;
2591 		}
2592 		ifq_init(q->txq[i].txq_ifq, pi->ifp);
2593 		callout_init(&q->txq[i].txq_timer, 1);
2594 		callout_init(&q->txq[i].txq_watchdog, 1);
2595 		q->txq[i].txq_timer.c_cpu = id % mp_ncpus;
2596 		q->txq[i].txq_watchdog.c_cpu = id % mp_ncpus;
2597 	}
2598 	init_qset_cntxt(q, id);
2599 	q->idx = id;
2600 	if ((ret = alloc_ring(sc, p->fl_size, sizeof(struct rx_desc),
2601 		    sizeof(struct rx_sw_desc), &q->fl[0].phys_addr,
2602 		    &q->fl[0].desc, &q->fl[0].sdesc,
2603 		    &q->fl[0].desc_tag, &q->fl[0].desc_map,
2604 		    sc->rx_dmat, &q->fl[0].entry_tag)) != 0) {
2605 		printf("error %d from alloc ring fl0\n", ret);
2606 		goto err;
2607 	}
2608 
2609 	if ((ret = alloc_ring(sc, p->jumbo_size, sizeof(struct rx_desc),
2610 		    sizeof(struct rx_sw_desc), &q->fl[1].phys_addr,
2611 		    &q->fl[1].desc, &q->fl[1].sdesc,
2612 		    &q->fl[1].desc_tag, &q->fl[1].desc_map,
2613 		    sc->rx_jumbo_dmat, &q->fl[1].entry_tag)) != 0) {
2614 		printf("error %d from alloc ring fl1\n", ret);
2615 		goto err;
2616 	}
2617 
2618 	if ((ret = alloc_ring(sc, p->rspq_size, sizeof(struct rsp_desc), 0,
2619 		    &q->rspq.phys_addr, &q->rspq.desc, NULL,
2620 		    &q->rspq.desc_tag, &q->rspq.desc_map,
2621 		    NULL, NULL)) != 0) {
2622 		printf("error %d from alloc ring rspq\n", ret);
2623 		goto err;
2624 	}
2625 
2626 	for (i = 0; i < ntxq; ++i) {
2627 		size_t sz = i == TXQ_CTRL ? 0 : sizeof(struct tx_sw_desc);
2628 
2629 		if ((ret = alloc_ring(sc, p->txq_size[i],
2630 			    sizeof(struct tx_desc), sz,
2631 			    &q->txq[i].phys_addr, &q->txq[i].desc,
2632 			    &q->txq[i].sdesc, &q->txq[i].desc_tag,
2633 			    &q->txq[i].desc_map,
2634 			    sc->tx_dmat, &q->txq[i].entry_tag)) != 0) {
2635 			printf("error %d from alloc ring tx %i\n", ret, i);
2636 			goto err;
2637 		}
2638 		mbufq_init(&q->txq[i].sendq);
2639 		q->txq[i].gen = 1;
2640 		q->txq[i].size = p->txq_size[i];
2641 	}
2642 
2643 	TASK_INIT(&q->txq[TXQ_OFLD].qresume_task, 0, restart_offloadq, q);
2644 	TASK_INIT(&q->txq[TXQ_CTRL].qresume_task, 0, restart_ctrlq, q);
2645 	TASK_INIT(&q->txq[TXQ_ETH].qreclaim_task, 0, sge_txq_reclaim_handler, q);
2646 	TASK_INIT(&q->txq[TXQ_OFLD].qreclaim_task, 0, sge_txq_reclaim_handler, q);
2647 
2648 	q->fl[0].gen = q->fl[1].gen = 1;
2649 	q->fl[0].size = p->fl_size;
2650 	q->fl[1].size = p->jumbo_size;
2651 
2652 	q->rspq.gen = 1;
2653 	q->rspq.cidx = 0;
2654 	q->rspq.size = p->rspq_size;
2655 
2656 	q->txq[TXQ_ETH].stop_thres = nports *
2657 	    flits_to_desc(sgl_len(TX_MAX_SEGS + 1) + 3);
2658 
2659 	q->fl[0].buf_size = MCLBYTES;
2660 	q->fl[0].zone = zone_pack;
2661 	q->fl[0].type = EXT_PACKET;
2662 #if __FreeBSD_version > 800000
2663 	if (cxgb_use_16k_clusters) {
2664 		q->fl[1].buf_size = MJUM16BYTES;
2665 		q->fl[1].zone = zone_jumbo16;
2666 		q->fl[1].type = EXT_JUMBO16;
2667 	} else {
2668 		q->fl[1].buf_size = MJUM9BYTES;
2669 		q->fl[1].zone = zone_jumbo9;
2670 		q->fl[1].type = EXT_JUMBO9;
2671 	}
2672 #else
2673 	q->fl[1].buf_size = MJUMPAGESIZE;
2674 	q->fl[1].zone = zone_jumbop;
2675 	q->fl[1].type = EXT_JUMBOP;
2676 #endif
2677 
2678 #ifdef LRO_SUPPORTED
2679 	/* Allocate and setup the lro_ctrl structure */
2680 	q->lro.enabled = !!(pi->ifp->if_capenable & IFCAP_LRO);
2681 	ret = tcp_lro_init(&q->lro.ctrl);
2682 	if (ret) {
2683 		printf("error %d from tcp_lro_init\n", ret);
2684 		goto err;
2685 	}
2686 	q->lro.ctrl.ifp = pi->ifp;
2687 #endif
2688 
2689 	mtx_lock_spin(&sc->sge.reg_lock);
2690 	ret = -t3_sge_init_rspcntxt(sc, q->rspq.cntxt_id, irq_vec_idx,
2691 				   q->rspq.phys_addr, q->rspq.size,
2692 				   q->fl[0].buf_size, 1, 0);
2693 	if (ret) {
2694 		printf("error %d from t3_sge_init_rspcntxt\n", ret);
2695 		goto err_unlock;
2696 	}
2697 
2698 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
2699 		ret = -t3_sge_init_flcntxt(sc, q->fl[i].cntxt_id, 0,
2700 					  q->fl[i].phys_addr, q->fl[i].size,
2701 					  q->fl[i].buf_size, p->cong_thres, 1,
2702 					  0);
2703 		if (ret) {
2704 			printf("error %d from t3_sge_init_flcntxt for index i=%d\n", ret, i);
2705 			goto err_unlock;
2706 		}
2707 	}
2708 
2709 	ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_ETH].cntxt_id, USE_GTS,
2710 				 SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr,
2711 				 q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token,
2712 				 1, 0);
2713 	if (ret) {
2714 		printf("error %d from t3_sge_init_ecntxt\n", ret);
2715 		goto err_unlock;
2716 	}
2717 
2718 	if (ntxq > 1) {
2719 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_OFLD].cntxt_id,
2720 					 USE_GTS, SGE_CNTXT_OFLD, id,
2721 					 q->txq[TXQ_OFLD].phys_addr,
2722 					 q->txq[TXQ_OFLD].size, 0, 1, 0);
2723 		if (ret) {
2724 			printf("error %d from t3_sge_init_ecntxt\n", ret);
2725 			goto err_unlock;
2726 		}
2727 	}
2728 
2729 	if (ntxq > 2) {
2730 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_CTRL].cntxt_id, 0,
2731 					 SGE_CNTXT_CTRL, id,
2732 					 q->txq[TXQ_CTRL].phys_addr,
2733 					 q->txq[TXQ_CTRL].size,
2734 					 q->txq[TXQ_CTRL].token, 1, 0);
2735 		if (ret) {
2736 			printf("error %d from t3_sge_init_ecntxt\n", ret);
2737 			goto err_unlock;
2738 		}
2739 	}
2740 
2741 	snprintf(q->rspq.lockbuf, RSPQ_NAME_LEN, "t3 rspq lock %d:%d",
2742 	    device_get_unit(sc->dev), irq_vec_idx);
2743 	MTX_INIT(&q->rspq.lock, q->rspq.lockbuf, NULL, MTX_DEF);
2744 
2745 	mtx_unlock_spin(&sc->sge.reg_lock);
2746 	t3_update_qset_coalesce(q, p);
2747 	q->port = pi;
2748 
2749 	refill_fl(sc, &q->fl[0], q->fl[0].size);
2750 	refill_fl(sc, &q->fl[1], q->fl[1].size);
2751 	refill_rspq(sc, &q->rspq, q->rspq.size - 1);
2752 
2753 	t3_write_reg(sc, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) |
2754 		     V_NEWTIMER(q->rspq.holdoff_tmr));
2755 
2756 	return (0);
2757 
2758 err_unlock:
2759 	mtx_unlock_spin(&sc->sge.reg_lock);
2760 err:
2761 	TXQ_LOCK(q);
2762 	t3_free_qset(sc, q);
2763 
2764 	return (ret);
2765 }
2766 
2767 /*
2768  * Remove CPL_RX_PKT headers from the mbuf and reduce it to a regular mbuf with
2769  * ethernet data.  Hardware assistance with various checksums and any vlan tag
2770  * will also be taken into account here.
2771  */
2772 void
2773 t3_rx_eth(struct adapter *adap, struct sge_rspq *rq, struct mbuf *m, int ethpad)
2774 {
2775 	struct cpl_rx_pkt *cpl = (struct cpl_rx_pkt *)(mtod(m, uint8_t *) + ethpad);
2776 	struct port_info *pi = &adap->port[adap->rxpkt_map[cpl->iff]];
2777 	struct ifnet *ifp = pi->ifp;
2778 
2779 	DPRINTF("rx_eth m=%p m->m_data=%p p->iff=%d\n", m, mtod(m, uint8_t *), cpl->iff);
2780 
2781 	if ((ifp->if_capenable & IFCAP_RXCSUM) && !cpl->fragment &&
2782 	    cpl->csum_valid && cpl->csum == 0xffff) {
2783 		m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED|CSUM_IP_VALID);
2784 		rspq_to_qset(rq)->port_stats[SGE_PSTAT_RX_CSUM_GOOD]++;
2785 		m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED|CSUM_IP_VALID|CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
2786 		m->m_pkthdr.csum_data = 0xffff;
2787 	}
2788 	/*
2789 	 * XXX need to add VLAN support for 6.x
2790 	 */
2791 #ifdef VLAN_SUPPORTED
2792 	if (__predict_false(cpl->vlan_valid)) {
2793 		m->m_pkthdr.ether_vtag = ntohs(cpl->vlan);
2794 		m->m_flags |= M_VLANTAG;
2795 	}
2796 #endif
2797 
2798 	m->m_pkthdr.rcvif = ifp;
2799 	m->m_pkthdr.header = mtod(m, uint8_t *) + sizeof(*cpl) + ethpad;
2800 	/*
2801 	 * adjust after conversion to mbuf chain
2802 	 */
2803 	m->m_pkthdr.len -= (sizeof(*cpl) + ethpad);
2804 	m->m_len -= (sizeof(*cpl) + ethpad);
2805 	m->m_data += (sizeof(*cpl) + ethpad);
2806 }
2807 
2808 /**
2809  *	get_packet - return the next ingress packet buffer from a free list
2810  *	@adap: the adapter that received the packet
2811  *	@drop_thres: # of remaining buffers before we start dropping packets
2812  *	@qs: the qset that the SGE free list holding the packet belongs to
2813  *      @mh: the mbuf header, contains a pointer to the head and tail of the mbuf chain
2814  *      @r: response descriptor
2815  *
2816  *	Get the next packet from a free list and complete setup of the
2817  *	mbuf.  If the packet is small we make a copy and recycle the
2818  *	original buffer, otherwise we use the original buffer itself.  If a
2819  *	positive drop threshold is supplied packets are dropped and their
2820  *	buffers recycled if (a) the number of remaining buffers is under the
2821  *	threshold and the packet is too big to copy, or (b) the packet should
2822  *	be copied but there is no memory for the copy.
2823  */
2824 static int
2825 get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
2826     struct t3_mbuf_hdr *mh, struct rsp_desc *r)
2827 {
2828 
2829 	unsigned int len_cq =  ntohl(r->len_cq);
2830 	struct sge_fl *fl = (len_cq & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
2831 	int mask, cidx = fl->cidx;
2832 	struct rx_sw_desc *sd = &fl->sdesc[cidx];
2833 	uint32_t len = G_RSPD_LEN(len_cq);
2834 	uint32_t flags = M_EXT;
2835 	uint8_t sopeop = G_RSPD_SOP_EOP(ntohl(r->flags));
2836 	caddr_t cl;
2837 	struct mbuf *m;
2838 	int ret = 0;
2839 
2840 	mask = fl->size - 1;
2841 	prefetch(fl->sdesc[(cidx + 1) & mask].m);
2842 	prefetch(fl->sdesc[(cidx + 2) & mask].m);
2843 	prefetch(fl->sdesc[(cidx + 1) & mask].rxsd_cl);
2844 	prefetch(fl->sdesc[(cidx + 2) & mask].rxsd_cl);
2845 
2846 	fl->credits--;
2847 	bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD);
2848 
2849 	if (recycle_enable && len <= SGE_RX_COPY_THRES &&
2850 	    sopeop == RSPQ_SOP_EOP) {
2851 		if ((m = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL)
2852 			goto skip_recycle;
2853 		cl = mtod(m, void *);
2854 		memcpy(cl, sd->rxsd_cl, len);
2855 		recycle_rx_buf(adap, fl, fl->cidx);
2856 		m->m_pkthdr.len = m->m_len = len;
2857 		m->m_flags = 0;
2858 		mh->mh_head = mh->mh_tail = m;
2859 		ret = 1;
2860 		goto done;
2861 	} else {
2862 	skip_recycle:
2863 		bus_dmamap_unload(fl->entry_tag, sd->map);
2864 		cl = sd->rxsd_cl;
2865 		m = sd->m;
2866 
2867 		if ((sopeop == RSPQ_SOP_EOP) ||
2868 		    (sopeop == RSPQ_SOP))
2869 			flags |= M_PKTHDR;
2870 		m_init(m, fl->zone, fl->buf_size, M_NOWAIT, MT_DATA, flags);
2871 		if (fl->zone == zone_pack) {
2872 			/*
2873 			 * restore clobbered data pointer
2874 			 */
2875 			m->m_data = m->m_ext.ext_buf;
2876 		} else {
2877 			m_cljset(m, cl, fl->type);
2878 		}
2879 		m->m_len = len;
2880 	}
2881 	switch(sopeop) {
2882 	case RSPQ_SOP_EOP:
2883 		ret = 1;
2884 		/* FALLTHROUGH */
2885 	case RSPQ_SOP:
2886 		mh->mh_head = mh->mh_tail = m;
2887 		m->m_pkthdr.len = len;
2888 		break;
2889 	case RSPQ_EOP:
2890 		ret = 1;
2891 		/* FALLTHROUGH */
2892 	case RSPQ_NSOP_NEOP:
2893 		if (mh->mh_tail == NULL) {
2894 			log(LOG_ERR, "discarding intermediate descriptor entry\n");
2895 			m_freem(m);
2896 			break;
2897 		}
2898 		mh->mh_tail->m_next = m;
2899 		mh->mh_tail = m;
2900 		mh->mh_head->m_pkthdr.len += len;
2901 		break;
2902 	}
2903 	if (cxgb_debug)
2904 		printf("len=%d pktlen=%d\n", m->m_len, m->m_pkthdr.len);
2905 done:
2906 	if (++fl->cidx == fl->size)
2907 		fl->cidx = 0;
2908 
2909 	return (ret);
2910 }
2911 
2912 /**
2913  *	handle_rsp_cntrl_info - handles control information in a response
2914  *	@qs: the queue set corresponding to the response
2915  *	@flags: the response control flags
2916  *
2917  *	Handles the control information of an SGE response, such as GTS
2918  *	indications and completion credits for the queue set's Tx queues.
2919  *	HW coalesces credits; we don't do any extra SW coalescing.
2920  */
2921 static __inline void
2922 handle_rsp_cntrl_info(struct sge_qset *qs, uint32_t flags)
2923 {
2924 	unsigned int credits;
2925 
2926 #if USE_GTS
2927 	if (flags & F_RSPD_TXQ0_GTS)
2928 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags);
2929 #endif
2930 	credits = G_RSPD_TXQ0_CR(flags);
2931 	if (credits)
2932 		qs->txq[TXQ_ETH].processed += credits;
2933 
2934 	credits = G_RSPD_TXQ2_CR(flags);
2935 	if (credits)
2936 		qs->txq[TXQ_CTRL].processed += credits;
2937 
2938 # if USE_GTS
2939 	if (flags & F_RSPD_TXQ1_GTS)
2940 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags);
2941 # endif
2942 	credits = G_RSPD_TXQ1_CR(flags);
2943 	if (credits)
2944 		qs->txq[TXQ_OFLD].processed += credits;
2945 
2946 }
2947 
2948 static void
2949 check_ring_db(adapter_t *adap, struct sge_qset *qs,
2950     unsigned int sleeping)
2951 {
2952 	;
2953 }
2954 
2955 /**
2956  *	process_responses - process responses from an SGE response queue
2957  *	@adap: the adapter
2958  *	@qs: the queue set to which the response queue belongs
2959  *	@budget: how many responses can be processed in this round
2960  *
2961  *	Process responses from an SGE response queue up to the supplied budget.
2962  *	Responses include received packets as well as credits and other events
2963  *	for the queues that belong to the response queue's queue set.
2964  *	A negative budget is effectively unlimited.
2965  *
2966  *	Additionally choose the interrupt holdoff time for the next interrupt
2967  *	on this queue.  If the system is under memory shortage use a fairly
2968  *	long delay to help recovery.
2969  */
2970 static int
2971 process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
2972 {
2973 	struct sge_rspq *rspq = &qs->rspq;
2974 	struct rsp_desc *r = &rspq->desc[rspq->cidx];
2975 	int budget_left = budget;
2976 	unsigned int sleeping = 0;
2977 #ifdef LRO_SUPPORTED
2978 	int lro_enabled = qs->lro.enabled;
2979 	int skip_lro;
2980 	struct lro_ctrl *lro_ctrl = &qs->lro.ctrl;
2981 #endif
2982 	struct mbuf *offload_mbufs[RX_BUNDLE_SIZE];
2983 	int ngathered = 0;
2984 #ifdef DEBUG
2985 	static int last_holdoff = 0;
2986 	if (cxgb_debug && rspq->holdoff_tmr != last_holdoff) {
2987 		printf("next_holdoff=%d\n", rspq->holdoff_tmr);
2988 		last_holdoff = rspq->holdoff_tmr;
2989 	}
2990 #endif
2991 	rspq->next_holdoff = rspq->holdoff_tmr;
2992 
2993 	while (__predict_true(budget_left && is_new_response(r, rspq))) {
2994 		int eth, eop = 0, ethpad = 0;
2995 		uint32_t flags = ntohl(r->flags);
2996 		uint32_t rss_csum = *(const uint32_t *)r;
2997 		uint32_t rss_hash = be32toh(r->rss_hdr.rss_hash_val);
2998 
2999 		eth = (r->rss_hdr.opcode == CPL_RX_PKT);
3000 
3001 		if (__predict_false(flags & F_RSPD_ASYNC_NOTIF)) {
3002 			struct mbuf *m;
3003 
3004 			if (cxgb_debug)
3005 				printf("async notification\n");
3006 
3007 			if (rspq->rspq_mh.mh_head == NULL) {
3008 				rspq->rspq_mh.mh_head = m_gethdr(M_DONTWAIT, MT_DATA);
3009 				m = rspq->rspq_mh.mh_head;
3010 			} else {
3011 				m = m_gethdr(M_DONTWAIT, MT_DATA);
3012 			}
3013 			if (m == NULL)
3014 				goto no_mem;
3015 
3016 			memcpy(mtod(m, char *), r, AN_PKT_SIZE);
3017 			m->m_len = m->m_pkthdr.len = AN_PKT_SIZE;
3018 			*mtod(m, char *) = CPL_ASYNC_NOTIF;
3019 			rss_csum = htonl(CPL_ASYNC_NOTIF << 24);
3020 			eop = 1;
3021 			rspq->async_notif++;
3022 			goto skip;
3023 		} else if  (flags & F_RSPD_IMM_DATA_VALID) {
3024 			struct mbuf *m = NULL;
3025 
3026 			DPRINTF("IMM DATA VALID opcode=0x%x rspq->cidx=%d\n",
3027 			    r->rss_hdr.opcode, rspq->cidx);
3028 			if (rspq->rspq_mh.mh_head == NULL)
3029 				rspq->rspq_mh.mh_head = m_gethdr(M_DONTWAIT, MT_DATA);
3030 			else
3031 				m = m_gethdr(M_DONTWAIT, MT_DATA);
3032 
3033 			if (rspq->rspq_mh.mh_head == NULL &&  m == NULL) {
3034 		no_mem:
3035 				rspq->next_holdoff = NOMEM_INTR_DELAY;
3036 				budget_left--;
3037 				break;
3038 			}
3039 			get_imm_packet(adap, r, rspq->rspq_mh.mh_head);
3040 			eop = 1;
3041 			rspq->imm_data++;
3042 		} else if (r->len_cq) {
3043 			int drop_thresh = eth ? SGE_RX_DROP_THRES : 0;
3044 
3045 			eop = get_packet(adap, drop_thresh, qs, &rspq->rspq_mh, r);
3046 			if (eop) {
3047 				rspq->rspq_mh.mh_head->m_flags |= M_FLOWID;
3048 				rspq->rspq_mh.mh_head->m_pkthdr.flowid = rss_hash;
3049 			}
3050 
3051 			ethpad = 2;
3052 		} else {
3053 			rspq->pure_rsps++;
3054 		}
3055 	skip:
3056 		if (flags & RSPD_CTRL_MASK) {
3057 			sleeping |= flags & RSPD_GTS_MASK;
3058 			handle_rsp_cntrl_info(qs, flags);
3059 		}
3060 
3061 		r++;
3062 		if (__predict_false(++rspq->cidx == rspq->size)) {
3063 			rspq->cidx = 0;
3064 			rspq->gen ^= 1;
3065 			r = rspq->desc;
3066 		}
3067 
3068 		if (++rspq->credits >= (rspq->size / 4)) {
3069 			refill_rspq(adap, rspq, rspq->credits);
3070 			rspq->credits = 0;
3071 		}
3072 		if (!eth && eop) {
3073 			rspq->rspq_mh.mh_head->m_pkthdr.csum_data = rss_csum;
3074 			/*
3075 			 * XXX size mismatch
3076 			 */
3077 			m_set_priority(rspq->rspq_mh.mh_head, rss_hash);
3078 
3079 
3080 			ngathered = rx_offload(&adap->tdev, rspq,
3081 			    rspq->rspq_mh.mh_head, offload_mbufs, ngathered);
3082 			rspq->rspq_mh.mh_head = NULL;
3083 			DPRINTF("received offload packet\n");
3084 
3085 		} else if (eth && eop) {
3086 			struct mbuf *m = rspq->rspq_mh.mh_head;
3087 
3088 			t3_rx_eth(adap, rspq, m, ethpad);
3089 
3090 #ifdef LRO_SUPPORTED
3091 			/*
3092 			 * The T304 sends incoming packets on any qset.  If LRO
3093 			 * is also enabled, we could end up sending the packet up
3094 			 * lro_ctrl->ifp's input.  That is incorrect.
3095 			 *
3096 			 * The mbuf's rcvif was derived from the cpl header and
3097 			 * is accurate.  Skip LRO and just use that.
3098 			 */
3099 			skip_lro = __predict_false(qs->port->ifp != m->m_pkthdr.rcvif);
3100 
3101 			if (lro_enabled && lro_ctrl->lro_cnt && !skip_lro &&
3102 			    (tcp_lro_rx(lro_ctrl, m, 0) == 0)) {
3103 				/* successfully queued for LRO */
3104 			} else
3105 #endif
3106 			{
3107 				/*
3108 				 * LRO not enabled, packet unsuitable for LRO,
3109 				 * or unable to queue.  Pass it up right now in
3110 				 * either case.
3111 				 */
3112 				struct ifnet *ifp = m->m_pkthdr.rcvif;
3113 				(*ifp->if_input)(ifp, m);
3114 			}
3115 			rspq->rspq_mh.mh_head = NULL;
3116 
3117 		}
3118 		__refill_fl_lt(adap, &qs->fl[0], 32);
3119 		__refill_fl_lt(adap, &qs->fl[1], 32);
3120 		--budget_left;
3121 	}
3122 
3123 	deliver_partial_bundle(&adap->tdev, rspq, offload_mbufs, ngathered);
3124 
3125 #ifdef LRO_SUPPORTED
3126 	/* Flush LRO */
3127 	while (!SLIST_EMPTY(&lro_ctrl->lro_active)) {
3128 		struct lro_entry *queued = SLIST_FIRST(&lro_ctrl->lro_active);
3129 		SLIST_REMOVE_HEAD(&lro_ctrl->lro_active, next);
3130 		tcp_lro_flush(lro_ctrl, queued);
3131 	}
3132 #endif
3133 
3134 	if (sleeping)
3135 		check_ring_db(adap, qs, sleeping);
3136 
3137 	mb();  /* commit Tx queue processed updates */
3138 	if (__predict_false(qs->txq_stopped > 1))
3139 		restart_tx(qs);
3140 
3141 	__refill_fl_lt(adap, &qs->fl[0], 512);
3142 	__refill_fl_lt(adap, &qs->fl[1], 512);
3143 	budget -= budget_left;
3144 	return (budget);
3145 }
3146 
3147 /*
3148  * A helper function that processes responses and issues GTS.
3149  */
3150 static __inline int
3151 process_responses_gts(adapter_t *adap, struct sge_rspq *rq)
3152 {
3153 	int work;
3154 	static int last_holdoff = 0;
3155 
3156 	work = process_responses(adap, rspq_to_qset(rq), -1);
3157 
3158 	if (cxgb_debug && (rq->next_holdoff != last_holdoff)) {
3159 		printf("next_holdoff=%d\n", rq->next_holdoff);
3160 		last_holdoff = rq->next_holdoff;
3161 	}
3162 	t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) |
3163 	    V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx));
3164 
3165 	return (work);
3166 }
3167 
3168 
3169 /*
3170  * Interrupt handler for legacy INTx interrupts for T3B-based cards.
3171  * Handles data events from SGE response queues as well as error and other
3172  * async events as they all use the same interrupt pin.  We use one SGE
3173  * response queue per port in this mode and protect all response queues with
3174  * queue 0's lock.
3175  */
3176 void
3177 t3b_intr(void *data)
3178 {
3179 	uint32_t i, map;
3180 	adapter_t *adap = data;
3181 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
3182 
3183 	t3_write_reg(adap, A_PL_CLI, 0);
3184 	map = t3_read_reg(adap, A_SG_DATA_INTR);
3185 
3186 	if (!map)
3187 		return;
3188 
3189 	if (__predict_false(map & F_ERRINTR))
3190 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
3191 
3192 	mtx_lock(&q0->lock);
3193 	for_each_port(adap, i)
3194 	    if (map & (1 << i))
3195 			process_responses_gts(adap, &adap->sge.qs[i].rspq);
3196 	mtx_unlock(&q0->lock);
3197 }
3198 
3199 /*
3200  * The MSI interrupt handler.  This needs to handle data events from SGE
3201  * response queues as well as error and other async events as they all use
3202  * the same MSI vector.  We use one SGE response queue per port in this mode
3203  * and protect all response queues with queue 0's lock.
3204  */
3205 void
3206 t3_intr_msi(void *data)
3207 {
3208 	adapter_t *adap = data;
3209 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
3210 	int i, new_packets = 0;
3211 
3212 	mtx_lock(&q0->lock);
3213 
3214 	for_each_port(adap, i)
3215 	    if (process_responses_gts(adap, &adap->sge.qs[i].rspq))
3216 		    new_packets = 1;
3217 	mtx_unlock(&q0->lock);
3218 	if (new_packets == 0)
3219 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
3220 }
3221 
3222 void
3223 t3_intr_msix(void *data)
3224 {
3225 	struct sge_qset *qs = data;
3226 	adapter_t *adap = qs->port->adapter;
3227 	struct sge_rspq *rspq = &qs->rspq;
3228 
3229 	if (process_responses_gts(adap, rspq) == 0)
3230 		rspq->unhandled_irqs++;
3231 }
3232 
3233 #define QDUMP_SBUF_SIZE		(32 * 400)
3234 static int
3235 t3_dump_rspq(SYSCTL_HANDLER_ARGS)
3236 {
3237 	struct sge_rspq *rspq;
3238 	struct sge_qset *qs;
3239 	int i, err, dump_end, idx;
3240 	static int multiplier = 1;
3241 	struct sbuf *sb;
3242 	struct rsp_desc *rspd;
3243 	uint32_t data[4];
3244 
3245 	rspq = arg1;
3246 	qs = rspq_to_qset(rspq);
3247 	if (rspq->rspq_dump_count == 0)
3248 		return (0);
3249 	if (rspq->rspq_dump_count > RSPQ_Q_SIZE) {
3250 		log(LOG_WARNING,
3251 		    "dump count is too large %d\n", rspq->rspq_dump_count);
3252 		rspq->rspq_dump_count = 0;
3253 		return (EINVAL);
3254 	}
3255 	if (rspq->rspq_dump_start > (RSPQ_Q_SIZE-1)) {
3256 		log(LOG_WARNING,
3257 		    "dump start of %d is greater than queue size\n",
3258 		    rspq->rspq_dump_start);
3259 		rspq->rspq_dump_start = 0;
3260 		return (EINVAL);
3261 	}
3262 	err = t3_sge_read_rspq(qs->port->adapter, rspq->cntxt_id, data);
3263 	if (err)
3264 		return (err);
3265 retry_sbufops:
3266 	sb = sbuf_new(NULL, NULL, QDUMP_SBUF_SIZE*multiplier, SBUF_FIXEDLEN);
3267 
3268 	sbuf_printf(sb, " \n index=%u size=%u MSI-X/RspQ=%u intr enable=%u intr armed=%u\n",
3269 	    (data[0] & 0xffff), data[0] >> 16, ((data[2] >> 20) & 0x3f),
3270 	    ((data[2] >> 26) & 1), ((data[2] >> 27) & 1));
3271 	sbuf_printf(sb, " generation=%u CQ mode=%u FL threshold=%u\n",
3272 	    ((data[2] >> 28) & 1), ((data[2] >> 31) & 1), data[3]);
3273 
3274 	sbuf_printf(sb, " start=%d -> end=%d\n", rspq->rspq_dump_start,
3275 	    (rspq->rspq_dump_start + rspq->rspq_dump_count) & (RSPQ_Q_SIZE-1));
3276 
3277 	dump_end = rspq->rspq_dump_start + rspq->rspq_dump_count;
3278 	for (i = rspq->rspq_dump_start; i < dump_end; i++) {
3279 		idx = i & (RSPQ_Q_SIZE-1);
3280 
3281 		rspd = &rspq->desc[idx];
3282 		sbuf_printf(sb, "\tidx=%04d opcode=%02x cpu_idx=%x hash_type=%x cq_idx=%x\n",
3283 		    idx, rspd->rss_hdr.opcode, rspd->rss_hdr.cpu_idx,
3284 		    rspd->rss_hdr.hash_type, be16toh(rspd->rss_hdr.cq_idx));
3285 		sbuf_printf(sb, "\trss_hash_val=%x flags=%08x len_cq=%x intr_gen=%x\n",
3286 		    rspd->rss_hdr.rss_hash_val, be32toh(rspd->flags),
3287 		    be32toh(rspd->len_cq), rspd->intr_gen);
3288 	}
3289 	if (sbuf_overflowed(sb)) {
3290 		sbuf_delete(sb);
3291 		multiplier++;
3292 		goto retry_sbufops;
3293 	}
3294 	sbuf_finish(sb);
3295 	err = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
3296 	sbuf_delete(sb);
3297 	return (err);
3298 }
3299 
3300 static int
3301 t3_dump_txq_eth(SYSCTL_HANDLER_ARGS)
3302 {
3303 	struct sge_txq *txq;
3304 	struct sge_qset *qs;
3305 	int i, j, err, dump_end;
3306 	static int multiplier = 1;
3307 	struct sbuf *sb;
3308 	struct tx_desc *txd;
3309 	uint32_t *WR, wr_hi, wr_lo, gen;
3310 	uint32_t data[4];
3311 
3312 	txq = arg1;
3313 	qs = txq_to_qset(txq, TXQ_ETH);
3314 	if (txq->txq_dump_count == 0) {
3315 		return (0);
3316 	}
3317 	if (txq->txq_dump_count > TX_ETH_Q_SIZE) {
3318 		log(LOG_WARNING,
3319 		    "dump count is too large %d\n", txq->txq_dump_count);
3320 		txq->txq_dump_count = 1;
3321 		return (EINVAL);
3322 	}
3323 	if (txq->txq_dump_start > (TX_ETH_Q_SIZE-1)) {
3324 		log(LOG_WARNING,
3325 		    "dump start of %d is greater than queue size\n",
3326 		    txq->txq_dump_start);
3327 		txq->txq_dump_start = 0;
3328 		return (EINVAL);
3329 	}
3330 	err = t3_sge_read_ecntxt(qs->port->adapter, qs->rspq.cntxt_id, data);
3331 	if (err)
3332 		return (err);
3333 
3334 
3335 retry_sbufops:
3336 	sb = sbuf_new(NULL, NULL, QDUMP_SBUF_SIZE*multiplier, SBUF_FIXEDLEN);
3337 
3338 	sbuf_printf(sb, " \n credits=%u GTS=%u index=%u size=%u rspq#=%u cmdq#=%u\n",
3339 	    (data[0] & 0x7fff), ((data[0] >> 15) & 1), (data[0] >> 16),
3340 	    (data[1] & 0xffff), ((data[3] >> 4) & 7), ((data[3] >> 7) & 1));
3341 	sbuf_printf(sb, " TUN=%u TOE=%u generation=%u uP token=%u valid=%u\n",
3342 	    ((data[3] >> 8) & 1), ((data[3] >> 9) & 1), ((data[3] >> 10) & 1),
3343 	    ((data[3] >> 11) & 0xfffff), ((data[3] >> 31) & 1));
3344 	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
3345 	    txq->txq_dump_start,
3346 	    (txq->txq_dump_start + txq->txq_dump_count) & (TX_ETH_Q_SIZE-1));
3347 
3348 	dump_end = txq->txq_dump_start + txq->txq_dump_count;
3349 	for (i = txq->txq_dump_start; i < dump_end; i++) {
3350 		txd = &txq->desc[i & (TX_ETH_Q_SIZE-1)];
3351 		WR = (uint32_t *)txd->flit;
3352 		wr_hi = ntohl(WR[0]);
3353 		wr_lo = ntohl(WR[1]);
3354 		gen = G_WR_GEN(wr_lo);
3355 
3356 		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
3357 		    wr_hi, wr_lo, gen);
3358 		for (j = 2; j < 30; j += 4)
3359 			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
3360 			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
3361 
3362 	}
3363 	if (sbuf_overflowed(sb)) {
3364 		sbuf_delete(sb);
3365 		multiplier++;
3366 		goto retry_sbufops;
3367 	}
3368 	sbuf_finish(sb);
3369 	err = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
3370 	sbuf_delete(sb);
3371 	return (err);
3372 }
3373 
3374 static int
3375 t3_dump_txq_ctrl(SYSCTL_HANDLER_ARGS)
3376 {
3377 	struct sge_txq *txq;
3378 	struct sge_qset *qs;
3379 	int i, j, err, dump_end;
3380 	static int multiplier = 1;
3381 	struct sbuf *sb;
3382 	struct tx_desc *txd;
3383 	uint32_t *WR, wr_hi, wr_lo, gen;
3384 
3385 	txq = arg1;
3386 	qs = txq_to_qset(txq, TXQ_CTRL);
3387 	if (txq->txq_dump_count == 0) {
3388 		return (0);
3389 	}
3390 	if (txq->txq_dump_count > 256) {
3391 		log(LOG_WARNING,
3392 		    "dump count is too large %d\n", txq->txq_dump_count);
3393 		txq->txq_dump_count = 1;
3394 		return (EINVAL);
3395 	}
3396 	if (txq->txq_dump_start > 255) {
3397 		log(LOG_WARNING,
3398 		    "dump start of %d is greater than queue size\n",
3399 		    txq->txq_dump_start);
3400 		txq->txq_dump_start = 0;
3401 		return (EINVAL);
3402 	}
3403 
3404 retry_sbufops:
3405 	sb = sbuf_new(NULL, NULL, QDUMP_SBUF_SIZE*multiplier, SBUF_FIXEDLEN);
3406 	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
3407 	    txq->txq_dump_start,
3408 	    (txq->txq_dump_start + txq->txq_dump_count) & 255);
3409 
3410 	dump_end = txq->txq_dump_start + txq->txq_dump_count;
3411 	for (i = txq->txq_dump_start; i < dump_end; i++) {
3412 		txd = &txq->desc[i & (255)];
3413 		WR = (uint32_t *)txd->flit;
3414 		wr_hi = ntohl(WR[0]);
3415 		wr_lo = ntohl(WR[1]);
3416 		gen = G_WR_GEN(wr_lo);
3417 
3418 		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
3419 		    wr_hi, wr_lo, gen);
3420 		for (j = 2; j < 30; j += 4)
3421 			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
3422 			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
3423 
3424 	}
3425 	if (sbuf_overflowed(sb)) {
3426 		sbuf_delete(sb);
3427 		multiplier++;
3428 		goto retry_sbufops;
3429 	}
3430 	sbuf_finish(sb);
3431 	err = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
3432 	sbuf_delete(sb);
3433 	return (err);
3434 }
3435 
3436 static int
3437 t3_set_coalesce_usecs(SYSCTL_HANDLER_ARGS)
3438 {
3439 	adapter_t *sc = arg1;
3440 	struct qset_params *qsp = &sc->params.sge.qset[0];
3441 	int coalesce_usecs;
3442 	struct sge_qset *qs;
3443 	int i, j, err, nqsets = 0;
3444 	struct mtx *lock;
3445 
3446 	if ((sc->flags & FULL_INIT_DONE) == 0)
3447 		return (ENXIO);
3448 
3449 	coalesce_usecs = qsp->coalesce_usecs;
3450 	err = sysctl_handle_int(oidp, &coalesce_usecs, arg2, req);
3451 
3452 	if (err != 0) {
3453 		return (err);
3454 	}
3455 	if (coalesce_usecs == qsp->coalesce_usecs)
3456 		return (0);
3457 
3458 	for (i = 0; i < sc->params.nports; i++)
3459 		for (j = 0; j < sc->port[i].nqsets; j++)
3460 			nqsets++;
3461 
3462 	coalesce_usecs = max(1, coalesce_usecs);
3463 
3464 	for (i = 0; i < nqsets; i++) {
3465 		qs = &sc->sge.qs[i];
3466 		qsp = &sc->params.sge.qset[i];
3467 		qsp->coalesce_usecs = coalesce_usecs;
3468 
3469 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
3470 			    &sc->sge.qs[0].rspq.lock;
3471 
3472 		mtx_lock(lock);
3473 		t3_update_qset_coalesce(qs, qsp);
3474 		t3_write_reg(sc, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) |
3475 		    V_NEWTIMER(qs->rspq.holdoff_tmr));
3476 		mtx_unlock(lock);
3477 	}
3478 
3479 	return (0);
3480 }
3481 
3482 
3483 void
3484 t3_add_attach_sysctls(adapter_t *sc)
3485 {
3486 	struct sysctl_ctx_list *ctx;
3487 	struct sysctl_oid_list *children;
3488 
3489 	ctx = device_get_sysctl_ctx(sc->dev);
3490 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3491 
3492 	/* random information */
3493 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
3494 	    "firmware_version",
3495 	    CTLFLAG_RD, &sc->fw_version,
3496 	    0, "firmware version");
3497 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3498 	    "hw_revision",
3499 	    CTLFLAG_RD, &sc->params.rev,
3500 	    0, "chip model");
3501 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
3502 	    "port_types",
3503 	    CTLFLAG_RD, &sc->port_types,
3504 	    0, "type of ports");
3505 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3506 	    "enable_debug",
3507 	    CTLFLAG_RW, &cxgb_debug,
3508 	    0, "enable verbose debugging output");
3509 	SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "tunq_coalesce",
3510 	    CTLFLAG_RD, &sc->tunq_coalesce,
3511 	    "#tunneled packets freed");
3512 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3513 	    "txq_overrun",
3514 	    CTLFLAG_RD, &txq_fills,
3515 	    0, "#times txq overrun");
3516 }
3517 
3518 
3519 static const char *rspq_name = "rspq";
3520 static const char *txq_names[] =
3521 {
3522 	"txq_eth",
3523 	"txq_ofld",
3524 	"txq_ctrl"
3525 };
3526 
3527 static int
3528 sysctl_handle_macstat(SYSCTL_HANDLER_ARGS)
3529 {
3530 	struct port_info *p = arg1;
3531 	uint64_t *parg;
3532 
3533 	if (!p)
3534 		return (EINVAL);
3535 
3536 	parg = (uint64_t *) ((uint8_t *)&p->mac.stats + arg2);
3537 	PORT_LOCK(p);
3538 	t3_mac_update_stats(&p->mac);
3539 	PORT_UNLOCK(p);
3540 
3541 	return (sysctl_handle_quad(oidp, parg, 0, req));
3542 }
3543 
3544 void
3545 t3_add_configured_sysctls(adapter_t *sc)
3546 {
3547 	struct sysctl_ctx_list *ctx;
3548 	struct sysctl_oid_list *children;
3549 	int i, j;
3550 
3551 	ctx = device_get_sysctl_ctx(sc->dev);
3552 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3553 
3554 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
3555 	    "intr_coal",
3556 	    CTLTYPE_INT|CTLFLAG_RW, sc,
3557 	    0, t3_set_coalesce_usecs,
3558 	    "I", "interrupt coalescing timer (us)");
3559 
3560 	for (i = 0; i < sc->params.nports; i++) {
3561 		struct port_info *pi = &sc->port[i];
3562 		struct sysctl_oid *poid;
3563 		struct sysctl_oid_list *poidlist;
3564 		struct mac_stats *mstats = &pi->mac.stats;
3565 
3566 		snprintf(pi->namebuf, PORT_NAME_LEN, "port%d", i);
3567 		poid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO,
3568 		    pi->namebuf, CTLFLAG_RD, NULL, "port statistics");
3569 		poidlist = SYSCTL_CHILDREN(poid);
3570 		SYSCTL_ADD_INT(ctx, poidlist, OID_AUTO,
3571 		    "nqsets", CTLFLAG_RD, &pi->nqsets,
3572 		    0, "#queue sets");
3573 
3574 		for (j = 0; j < pi->nqsets; j++) {
3575 			struct sge_qset *qs = &sc->sge.qs[pi->first_qset + j];
3576 			struct sysctl_oid *qspoid, *rspqpoid, *txqpoid,
3577 					  *ctrlqpoid, *lropoid;
3578 			struct sysctl_oid_list *qspoidlist, *rspqpoidlist,
3579 					       *txqpoidlist, *ctrlqpoidlist,
3580 					       *lropoidlist;
3581 			struct sge_txq *txq = &qs->txq[TXQ_ETH];
3582 
3583 			snprintf(qs->namebuf, QS_NAME_LEN, "qs%d", j);
3584 
3585 			qspoid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO,
3586 			    qs->namebuf, CTLFLAG_RD, NULL, "qset statistics");
3587 			qspoidlist = SYSCTL_CHILDREN(qspoid);
3588 
3589 			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl0_empty",
3590 					CTLFLAG_RD, &qs->fl[0].empty, 0,
3591 					"freelist #0 empty");
3592 			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl1_empty",
3593 					CTLFLAG_RD, &qs->fl[1].empty, 0,
3594 					"freelist #1 empty");
3595 
3596 			rspqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3597 			    rspq_name, CTLFLAG_RD, NULL, "rspq statistics");
3598 			rspqpoidlist = SYSCTL_CHILDREN(rspqpoid);
3599 
3600 			txqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3601 			    txq_names[0], CTLFLAG_RD, NULL, "txq statistics");
3602 			txqpoidlist = SYSCTL_CHILDREN(txqpoid);
3603 
3604 			ctrlqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3605 			    txq_names[2], CTLFLAG_RD, NULL, "ctrlq statistics");
3606 			ctrlqpoidlist = SYSCTL_CHILDREN(ctrlqpoid);
3607 
3608 			lropoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3609 			    "lro_stats", CTLFLAG_RD, NULL, "LRO statistics");
3610 			lropoidlist = SYSCTL_CHILDREN(lropoid);
3611 
3612 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "size",
3613 			    CTLFLAG_RD, &qs->rspq.size,
3614 			    0, "#entries in response queue");
3615 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "cidx",
3616 			    CTLFLAG_RD, &qs->rspq.cidx,
3617 			    0, "consumer index");
3618 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "credits",
3619 			    CTLFLAG_RD, &qs->rspq.credits,
3620 			    0, "#credits");
3621 			SYSCTL_ADD_XLONG(ctx, rspqpoidlist, OID_AUTO, "phys_addr",
3622 			    CTLFLAG_RD, &qs->rspq.phys_addr,
3623 			    "physical_address_of the queue");
3624 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_start",
3625 			    CTLFLAG_RW, &qs->rspq.rspq_dump_start,
3626 			    0, "start rspq dump entry");
3627 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_count",
3628 			    CTLFLAG_RW, &qs->rspq.rspq_dump_count,
3629 			    0, "#rspq entries to dump");
3630 			SYSCTL_ADD_PROC(ctx, rspqpoidlist, OID_AUTO, "qdump",
3631 			    CTLTYPE_STRING | CTLFLAG_RD, &qs->rspq,
3632 			    0, t3_dump_rspq, "A", "dump of the response queue");
3633 
3634 
3635 			SYSCTL_ADD_INT(ctx, txqpoidlist, OID_AUTO, "dropped",
3636 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].txq_drops,
3637 			    0, "#tunneled packets dropped");
3638 			SYSCTL_ADD_INT(ctx, txqpoidlist, OID_AUTO, "sendqlen",
3639 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].sendq.qlen,
3640 			    0, "#tunneled packets waiting to be sent");
3641 #if 0
3642 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_pidx",
3643 			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_prod,
3644 			    0, "#tunneled packets queue producer index");
3645 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_cidx",
3646 			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_cons,
3647 			    0, "#tunneled packets queue consumer index");
3648 #endif
3649 			SYSCTL_ADD_INT(ctx, txqpoidlist, OID_AUTO, "processed",
3650 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].processed,
3651 			    0, "#tunneled packets processed by the card");
3652 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "cleaned",
3653 			    CTLFLAG_RD, &txq->cleaned,
3654 			    0, "#tunneled packets cleaned");
3655 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "in_use",
3656 			    CTLFLAG_RD, &txq->in_use,
3657 			    0, "#tunneled packet slots in use");
3658 			SYSCTL_ADD_ULONG(ctx, txqpoidlist, OID_AUTO, "frees",
3659 			    CTLFLAG_RD, &txq->txq_frees,
3660 			    "#tunneled packets freed");
3661 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "skipped",
3662 			    CTLFLAG_RD, &txq->txq_skipped,
3663 			    0, "#tunneled packet descriptors skipped");
3664 			SYSCTL_ADD_QUAD(ctx, txqpoidlist, OID_AUTO, "coalesced",
3665 			    CTLFLAG_RD, &txq->txq_coalesced,
3666 			    "#tunneled packets coalesced");
3667 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "enqueued",
3668 			    CTLFLAG_RD, &txq->txq_enqueued,
3669 			    0, "#tunneled packets enqueued to hardware");
3670 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "stopped_flags",
3671 			    CTLFLAG_RD, &qs->txq_stopped,
3672 			    0, "tx queues stopped");
3673 			SYSCTL_ADD_XLONG(ctx, txqpoidlist, OID_AUTO, "phys_addr",
3674 			    CTLFLAG_RD, &txq->phys_addr,
3675 			    "physical_address_of the queue");
3676 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "qgen",
3677 			    CTLFLAG_RW, &qs->txq[TXQ_ETH].gen,
3678 			    0, "txq generation");
3679 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_cidx",
3680 			    CTLFLAG_RD, &txq->cidx,
3681 			    0, "hardware queue cidx");
3682 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_pidx",
3683 			    CTLFLAG_RD, &txq->pidx,
3684 			    0, "hardware queue pidx");
3685 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_start",
3686 			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_start,
3687 			    0, "txq start idx for dump");
3688 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_count",
3689 			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_count,
3690 			    0, "txq #entries to dump");
3691 			SYSCTL_ADD_PROC(ctx, txqpoidlist, OID_AUTO, "qdump",
3692 			    CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_ETH],
3693 			    0, t3_dump_txq_eth, "A", "dump of the transmit queue");
3694 
3695 			SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_start",
3696 			    CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_start,
3697 			    0, "ctrlq start idx for dump");
3698 			SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_count",
3699 			    CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_count,
3700 			    0, "ctrl #entries to dump");
3701 			SYSCTL_ADD_PROC(ctx, ctrlqpoidlist, OID_AUTO, "qdump",
3702 			    CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_CTRL],
3703 			    0, t3_dump_txq_ctrl, "A", "dump of the control queue");
3704 
3705 #ifdef LRO_SUPPORTED
3706 			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_queued",
3707 			    CTLFLAG_RD, &qs->lro.ctrl.lro_queued, 0, NULL);
3708 			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_flushed",
3709 			    CTLFLAG_RD, &qs->lro.ctrl.lro_flushed, 0, NULL);
3710 			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_bad_csum",
3711 			    CTLFLAG_RD, &qs->lro.ctrl.lro_bad_csum, 0, NULL);
3712 			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_cnt",
3713 			    CTLFLAG_RD, &qs->lro.ctrl.lro_cnt, 0, NULL);
3714 #endif
3715 		}
3716 
3717 		/* Now add a node for mac stats. */
3718 		poid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO, "mac_stats",
3719 		    CTLFLAG_RD, NULL, "MAC statistics");
3720 		poidlist = SYSCTL_CHILDREN(poid);
3721 
3722 		/*
3723 		 * We (ab)use the length argument (arg2) to pass on the offset
3724 		 * of the data that we are interested in.  This is only required
3725 		 * for the quad counters that are updated from the hardware (we
3726 		 * make sure that we return the latest value).
3727 		 * sysctl_handle_macstat first updates *all* the counters from
3728 		 * the hardware, and then returns the latest value of the
3729 		 * requested counter.  Best would be to update only the
3730 		 * requested counter from hardware, but t3_mac_update_stats()
3731 		 * hides all the register details and we don't want to dive into
3732 		 * all that here.
3733 		 */
3734 #define CXGB_SYSCTL_ADD_QUAD(a)	SYSCTL_ADD_OID(ctx, poidlist, OID_AUTO, #a, \
3735     (CTLTYPE_QUAD | CTLFLAG_RD), pi, offsetof(struct mac_stats, a), \
3736     sysctl_handle_macstat, "QU", 0)
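		/*
		 * Example expansion (sketch only): CXGB_SYSCTL_ADD_QUAD(tx_octets)
		 * registers an OID named "tx_octets" that passes pi as arg1 and
		 * offsetof(struct mac_stats, tx_octets) as arg2, which
		 * sysctl_handle_macstat() adds to &p->mac.stats to locate the
		 * counter.
		 */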
3737 		CXGB_SYSCTL_ADD_QUAD(tx_octets);
3738 		CXGB_SYSCTL_ADD_QUAD(tx_octets_bad);
3739 		CXGB_SYSCTL_ADD_QUAD(tx_frames);
3740 		CXGB_SYSCTL_ADD_QUAD(tx_mcast_frames);
3741 		CXGB_SYSCTL_ADD_QUAD(tx_bcast_frames);
3742 		CXGB_SYSCTL_ADD_QUAD(tx_pause);
3743 		CXGB_SYSCTL_ADD_QUAD(tx_deferred);
3744 		CXGB_SYSCTL_ADD_QUAD(tx_late_collisions);
3745 		CXGB_SYSCTL_ADD_QUAD(tx_total_collisions);
3746 		CXGB_SYSCTL_ADD_QUAD(tx_excess_collisions);
3747 		CXGB_SYSCTL_ADD_QUAD(tx_underrun);
3748 		CXGB_SYSCTL_ADD_QUAD(tx_len_errs);
3749 		CXGB_SYSCTL_ADD_QUAD(tx_mac_internal_errs);
3750 		CXGB_SYSCTL_ADD_QUAD(tx_excess_deferral);
3751 		CXGB_SYSCTL_ADD_QUAD(tx_fcs_errs);
3752 		CXGB_SYSCTL_ADD_QUAD(tx_frames_64);
3753 		CXGB_SYSCTL_ADD_QUAD(tx_frames_65_127);
3754 		CXGB_SYSCTL_ADD_QUAD(tx_frames_128_255);
3755 		CXGB_SYSCTL_ADD_QUAD(tx_frames_256_511);
3756 		CXGB_SYSCTL_ADD_QUAD(tx_frames_512_1023);
3757 		CXGB_SYSCTL_ADD_QUAD(tx_frames_1024_1518);
3758 		CXGB_SYSCTL_ADD_QUAD(tx_frames_1519_max);
3759 		CXGB_SYSCTL_ADD_QUAD(rx_octets);
3760 		CXGB_SYSCTL_ADD_QUAD(rx_octets_bad);
3761 		CXGB_SYSCTL_ADD_QUAD(rx_frames);
3762 		CXGB_SYSCTL_ADD_QUAD(rx_mcast_frames);
3763 		CXGB_SYSCTL_ADD_QUAD(rx_bcast_frames);
3764 		CXGB_SYSCTL_ADD_QUAD(rx_pause);
3765 		CXGB_SYSCTL_ADD_QUAD(rx_fcs_errs);
3766 		CXGB_SYSCTL_ADD_QUAD(rx_align_errs);
3767 		CXGB_SYSCTL_ADD_QUAD(rx_symbol_errs);
3768 		CXGB_SYSCTL_ADD_QUAD(rx_data_errs);
3769 		CXGB_SYSCTL_ADD_QUAD(rx_sequence_errs);
3770 		CXGB_SYSCTL_ADD_QUAD(rx_runt);
3771 		CXGB_SYSCTL_ADD_QUAD(rx_jabber);
3772 		CXGB_SYSCTL_ADD_QUAD(rx_short);
3773 		CXGB_SYSCTL_ADD_QUAD(rx_too_long);
3774 		CXGB_SYSCTL_ADD_QUAD(rx_mac_internal_errs);
3775 		CXGB_SYSCTL_ADD_QUAD(rx_cong_drops);
3776 		CXGB_SYSCTL_ADD_QUAD(rx_frames_64);
3777 		CXGB_SYSCTL_ADD_QUAD(rx_frames_65_127);
3778 		CXGB_SYSCTL_ADD_QUAD(rx_frames_128_255);
3779 		CXGB_SYSCTL_ADD_QUAD(rx_frames_256_511);
3780 		CXGB_SYSCTL_ADD_QUAD(rx_frames_512_1023);
3781 		CXGB_SYSCTL_ADD_QUAD(rx_frames_1024_1518);
3782 		CXGB_SYSCTL_ADD_QUAD(rx_frames_1519_max);
3783 #undef CXGB_SYSCTL_ADD_QUAD
3784 
3785 #define CXGB_SYSCTL_ADD_ULONG(a) SYSCTL_ADD_ULONG(ctx, poidlist, OID_AUTO, #a, \
3786     CTLFLAG_RD, &mstats->a, 0)
3787 		CXGB_SYSCTL_ADD_ULONG(tx_fifo_parity_err);
3788 		CXGB_SYSCTL_ADD_ULONG(rx_fifo_parity_err);
3789 		CXGB_SYSCTL_ADD_ULONG(tx_fifo_urun);
3790 		CXGB_SYSCTL_ADD_ULONG(rx_fifo_ovfl);
3791 		CXGB_SYSCTL_ADD_ULONG(serdes_signal_loss);
3792 		CXGB_SYSCTL_ADD_ULONG(xaui_pcs_ctc_err);
3793 		CXGB_SYSCTL_ADD_ULONG(xaui_pcs_align_change);
3794 		CXGB_SYSCTL_ADD_ULONG(num_toggled);
3795 		CXGB_SYSCTL_ADD_ULONG(num_resets);
3796 		CXGB_SYSCTL_ADD_ULONG(link_faults);
3797 #undef CXGB_SYSCTL_ADD_ULONG
3798 	}
3799 }
3800 
3801 /**
3802  *	t3_get_desc - dump an SGE descriptor for debugging purposes
3803  *	@qs: the queue set
3804  *	@qnum: identifies the specific queue (0..2: Tx, 3:response, 4..5: Rx)
3805  *	@idx: the descriptor index in the queue
3806  *	@data: where to dump the descriptor contents
3807  *
3808  *	Dumps the contents of a HW descriptor of an SGE queue.  Returns the
3809  *	size of the descriptor.
3810  */
3811 int
3812 t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx,
3813 		unsigned char *data)
3814 {
3815 	if (qnum >= 6)
3816 		return (EINVAL);
3817 
3818 	if (qnum < 3) {
3819 		if (!qs->txq[qnum].desc || idx >= qs->txq[qnum].size)
3820 			return (EINVAL);
3821 		memcpy(data, &qs->txq[qnum].desc[idx], sizeof(struct tx_desc));
3822 		return (sizeof(struct tx_desc));
3823 	}
3824 
3825 	if (qnum == 3) {
3826 		if (!qs->rspq.desc || idx >= qs->rspq.size)
3827 			return (EINVAL);
3828 		memcpy(data, &qs->rspq.desc[idx], sizeof(struct rsp_desc));
3829 		return (sizeof(struct rsp_desc));
3830 	}
3831 
3832 	qnum -= 4;
3833 	if (!qs->fl[qnum].desc || idx >= qs->fl[qnum].size)
3834 		return (EINVAL);
3835 	memcpy(data, &qs->fl[qnum].desc[idx], sizeof(struct rx_desc));
3836 	return (sizeof(struct rx_desc));
3837 }
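
/*
 * Illustrative usage sketch only: example_dump_eth_txq() is a hypothetical
 * helper, not part of the driver.  It shows how a caller might use
 * t3_get_desc() to copy out the Ethernet Tx descriptors of a queue set one
 * at a time; it is kept under "#if 0" so it is never compiled.
 */
#if 0
static void
example_dump_eth_txq(const struct sge_qset *qs)
{
	unsigned char buf[sizeof(struct tx_desc)];
	unsigned int idx;
	int len;

	for (idx = 0; idx < qs->txq[TXQ_ETH].size; idx++) {
		/* qnum values 0..2 select the Tx queues; 0 is TXQ_ETH. */
		len = t3_get_desc(qs, 0, idx, buf);
		if (len != sizeof(struct tx_desc))
			break;
		/* buf now holds one HW Tx descriptor of 'len' bytes. */
	}
}
#endif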
3838