xref: /freebsd/sys/dev/cxgb/cxgb_sge.c (revision 47cfa99a506970a13471405702e568f394ac8d1a)
1 /**************************************************************************
2 
3 Copyright (c) 2007-2009, Chelsio Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Neither the name of the Chelsio Corporation nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15 
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27 
28 ***************************************************************************/
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include "opt_inet6.h"
34 #include "opt_inet.h"
35 
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/kernel.h>
39 #include <sys/module.h>
40 #include <sys/bus.h>
41 #include <sys/conf.h>
42 #include <machine/bus.h>
43 #include <machine/resource.h>
44 #include <sys/bus_dma.h>
45 #include <sys/rman.h>
46 #include <sys/queue.h>
47 #include <sys/sysctl.h>
48 #include <sys/taskqueue.h>
49 
50 #include <sys/proc.h>
51 #include <sys/sbuf.h>
52 #include <sys/sched.h>
53 #include <sys/smp.h>
54 #include <sys/systm.h>
55 #include <sys/syslog.h>
56 #include <sys/socket.h>
57 
58 #include <net/bpf.h>
59 #include <net/ethernet.h>
60 #include <net/if.h>
61 #include <net/if_vlan_var.h>
62 
63 #include <netinet/in_systm.h>
64 #include <netinet/in.h>
65 #include <netinet/ip.h>
66 #include <netinet/ip6.h>
67 #include <netinet/tcp.h>
68 
69 #include <dev/pci/pcireg.h>
70 #include <dev/pci/pcivar.h>
71 
72 #include <vm/vm.h>
73 #include <vm/pmap.h>
74 
75 #include <cxgb_include.h>
76 #include <sys/mvec.h>
77 
78 int	txq_fills = 0;
79 int	multiq_tx_enable = 1;
80 
81 extern struct sysctl_oid_list sysctl__hw_cxgb_children;
82 int cxgb_txq_buf_ring_size = TX_ETH_Q_SIZE;
83 TUNABLE_INT("hw.cxgb.txq_mr_size", &cxgb_txq_buf_ring_size);
84 SYSCTL_INT(_hw_cxgb, OID_AUTO, txq_mr_size, CTLFLAG_RDTUN, &cxgb_txq_buf_ring_size, 0,
85     "size of per-queue mbuf ring");
86 
87 static int cxgb_tx_coalesce_force = 0;
88 TUNABLE_INT("hw.cxgb.tx_coalesce_force", &cxgb_tx_coalesce_force);
89 SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_force, CTLFLAG_RW,
90     &cxgb_tx_coalesce_force, 0,
91     "coalesce small packets into a single work request regardless of ring state");
92 
93 #define	COALESCE_START_DEFAULT		(TX_ETH_Q_SIZE>>1)
94 #define	COALESCE_START_MAX		(TX_ETH_Q_SIZE-(TX_ETH_Q_SIZE>>3))
95 #define	COALESCE_STOP_DEFAULT		(TX_ETH_Q_SIZE>>2)
96 #define	COALESCE_STOP_MIN		(TX_ETH_Q_SIZE>>5)
97 #define	TX_RECLAIM_DEFAULT		(TX_ETH_Q_SIZE>>5)
98 #define	TX_RECLAIM_MAX			(TX_ETH_Q_SIZE>>2)
99 #define	TX_RECLAIM_MIN			(TX_ETH_Q_SIZE>>6)
100 
101 
102 static int cxgb_tx_coalesce_enable_start = COALESCE_START_DEFAULT;
103 TUNABLE_INT("hw.cxgb.tx_coalesce_enable_start",
104     &cxgb_tx_coalesce_enable_start);
105 SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_start, CTLFLAG_RW,
106     &cxgb_tx_coalesce_enable_start, 0,
107     "coalesce enable threshold");
108 static int cxgb_tx_coalesce_enable_stop = COALESCE_STOP_DEFAULT;
109 TUNABLE_INT("hw.cxgb.tx_coalesce_enable_stop", &cxgb_tx_coalesce_enable_stop);
110 SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_stop, CTLFLAG_RW,
111     &cxgb_tx_coalesce_enable_stop, 0,
112     "coalesce disable threshold");
113 static int cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT;
114 TUNABLE_INT("hw.cxgb.tx_reclaim_threshold", &cxgb_tx_reclaim_threshold);
115 SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_reclaim_threshold, CTLFLAG_RW,
116     &cxgb_tx_reclaim_threshold, 0,
117     "tx cleaning minimum threshold");
118 
119 /*
120  * XXX don't re-enable this until TOE stops assuming
121  * we have an m_ext
122  */
123 static int recycle_enable = 0;
124 
125 extern int cxgb_use_16k_clusters;
126 extern int nmbjumbop;
127 extern int nmbjumbo9;
128 extern int nmbjumbo16;
129 
130 #define USE_GTS 0
131 
132 #define SGE_RX_SM_BUF_SIZE	1536
133 #define SGE_RX_DROP_THRES	16
134 #define SGE_RX_COPY_THRES	128
135 
136 /*
137  * Period of the Tx buffer reclaim timer.  This timer does not need to run
138  * frequently as Tx buffers are usually reclaimed by new Tx packets.
139  */
140 #define TX_RECLAIM_PERIOD       (hz >> 1)
141 
142 /*
143  * Values for sge_txq.flags
144  */
145 enum {
146 	TXQ_RUNNING	= 1 << 0,  /* fetch engine is running */
147 	TXQ_LAST_PKT_DB = 1 << 1,  /* last packet rang the doorbell */
148 };
149 
150 struct tx_desc {
151 	uint64_t	flit[TX_DESC_FLITS];
152 } __packed;
153 
154 struct rx_desc {
155 	uint32_t	addr_lo;
156 	uint32_t	len_gen;
157 	uint32_t	gen2;
158 	uint32_t	addr_hi;
159 } __packed;
160 
161 struct rsp_desc {               /* response queue descriptor */
162 	struct rss_header	rss_hdr;
163 	uint32_t		flags;
164 	uint32_t		len_cq;
165 	uint8_t			imm_data[47];
166 	uint8_t			intr_gen;
167 } __packed;
168 
169 #define RX_SW_DESC_MAP_CREATED	(1 << 0)
170 #define TX_SW_DESC_MAP_CREATED	(1 << 1)
171 #define RX_SW_DESC_INUSE        (1 << 3)
172 #define TX_SW_DESC_MAPPED       (1 << 4)
173 
174 #define RSPQ_NSOP_NEOP           G_RSPD_SOP_EOP(0)
175 #define RSPQ_EOP                 G_RSPD_SOP_EOP(F_RSPD_EOP)
176 #define RSPQ_SOP                 G_RSPD_SOP_EOP(F_RSPD_SOP)
177 #define RSPQ_SOP_EOP             G_RSPD_SOP_EOP(F_RSPD_SOP|F_RSPD_EOP)
178 
179 struct tx_sw_desc {                /* SW state per Tx descriptor */
180 	struct mbuf	*m;
181 	bus_dmamap_t	map;
182 	int		flags;
183 };
184 
185 struct rx_sw_desc {                /* SW state per Rx descriptor */
186 	caddr_t		rxsd_cl;
187 	struct mbuf	*m;
188 	bus_dmamap_t	map;
189 	int		flags;
190 };
191 
192 struct txq_state {
193 	unsigned int	compl;
194 	unsigned int	gen;
195 	unsigned int	pidx;
196 };
197 
198 struct refill_fl_cb_arg {
199 	int               error;
200 	bus_dma_segment_t seg;
201 	int               nseg;
202 };
203 
204 
205 /*
206  * Maps a number of flits to the number of Tx descriptors that can hold them.
207  * The formula is
208  *
209  * desc = 1 + (flits - 2) / (WR_FLITS - 1).
210  *
211  * HW allows up to 4 descriptors to be combined into a WR.
212  */
213 static uint8_t flit_desc_map[] = {
214 	0,
215 #if SGE_NUM_GENBITS == 1
216 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
217 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
218 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
219 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
220 #elif SGE_NUM_GENBITS == 2
221 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
222 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
223 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
224 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
225 #else
226 # error "SGE_NUM_GENBITS must be 1 or 2"
227 #endif
228 };
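/*
 * As a worked example of the formula above, the SGE_NUM_GENBITS == 2 table
 * holds at most 15 flits in one descriptor, so 16 flits map to
 * 1 + (16 - 2) / 14 = 2 descriptors and 30 flits to 1 + (30 - 2) / 14 = 3,
 * matching the table entries.  A rough, non-compiled self-check of the
 * table against the formula (assuming WR_FLITS is the per-WR flit budget
 * implied by the table) might look like:
 */
#if 0
static void
flit_desc_map_selfcheck(void)
{
	u_int flits;

	for (flits = 2; flits < nitems(flit_desc_map); flits++)
		KASSERT(flit_desc_map[flits] ==
		    1 + (flits - 2) / (WR_FLITS - 1),
		    ("flit_desc_map[%u] disagrees with formula", flits));
}
#endif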
229 
230 #define	TXQ_LOCK_ASSERT(qs)	mtx_assert(&(qs)->lock, MA_OWNED)
231 #define	TXQ_TRYLOCK(qs)		mtx_trylock(&(qs)->lock)
232 #define	TXQ_LOCK(qs)		mtx_lock(&(qs)->lock)
233 #define	TXQ_UNLOCK(qs)		mtx_unlock(&(qs)->lock)
234 #define	TXQ_RING_EMPTY(qs)	drbr_empty((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
235 #define	TXQ_RING_NEEDS_ENQUEUE(qs)					\
236 	drbr_needs_enqueue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
237 #define	TXQ_RING_FLUSH(qs)	drbr_flush((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
238 #define	TXQ_RING_DEQUEUE_COND(qs, func, arg)				\
239 	drbr_dequeue_cond((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr, func, arg)
240 #define	TXQ_RING_DEQUEUE(qs) \
241 	drbr_dequeue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
242 
243 int cxgb_debug = 0;
244 
245 static void sge_timer_cb(void *arg);
246 static void sge_timer_reclaim(void *arg, int ncount);
247 static void sge_txq_reclaim_handler(void *arg, int ncount);
248 static void cxgb_start_locked(struct sge_qset *qs);
249 
250 /*
251  * XXX need to cope with bursty scheduling by looking at a wider
252  * window than we do now when determining the need for coalescing
253  *
254  */
255 static __inline uint64_t
256 check_pkt_coalesce(struct sge_qset *qs)
257 {
258         struct adapter *sc;
259         struct sge_txq *txq;
260 	uint8_t *fill;
261 
262 	if (__predict_false(cxgb_tx_coalesce_force))
263 		return (1);
264 	txq = &qs->txq[TXQ_ETH];
265         sc = qs->port->adapter;
266 	fill = &sc->tunq_fill[qs->idx];
267 
268 	if (cxgb_tx_coalesce_enable_start > COALESCE_START_MAX)
269 		cxgb_tx_coalesce_enable_start = COALESCE_START_MAX;
270 	if (cxgb_tx_coalesce_enable_stop < COALESCE_STOP_MIN)
271 		cxgb_tx_coalesce_enable_stop = COALESCE_STOP_MIN;
272 	/*
273 	 * If the hardware transmit queue fills past the coalesce start
274 	 * threshold we mark it as coalescing; we drop back out of coalescing
275 	 * when it falls below the stop threshold and there are no packets
276 	 * enqueued.  This provides some degree of hysteresis.
277 	 */
278         if (*fill != 0 && (txq->in_use <= cxgb_tx_coalesce_enable_stop) &&
279 	    TXQ_RING_EMPTY(qs) && (qs->coalescing == 0))
280                 *fill = 0;
281         else if (*fill == 0 && (txq->in_use >= cxgb_tx_coalesce_enable_start))
282                 *fill = 1;
283 
284 	return (sc->tunq_coalesce);
285 }
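/*
 * A concrete reading of the hysteresis above, using the default thresholds
 * and a purely illustrative TX_ETH_Q_SIZE of 1024: the per-qset fill flag
 * is set once the hardware queue has 512 or more descriptors in use
 * (COALESCE_START_DEFAULT == TX_ETH_Q_SIZE >> 1) and is cleared again only
 * after in_use drops to 256 or below (COALESCE_STOP_DEFAULT ==
 * TX_ETH_Q_SIZE >> 2) with the software ring empty and coalescing already
 * cleared on the qset, so a queue hovering near the start threshold does
 * not flap in and out of coalescing.
 */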
286 
287 #ifdef __LP64__
288 static void
289 set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo)
290 {
291 	uint64_t wr_hilo;
292 #if _BYTE_ORDER == _LITTLE_ENDIAN
293 	wr_hilo = wr_hi;
294 	wr_hilo |= (((uint64_t)wr_lo)<<32);
295 #else
296 	wr_hilo = wr_lo;
297 	wr_hilo |= (((uint64_t)wr_hi)<<32);
298 #endif
299 	wrp->wrh_hilo = wr_hilo;
300 }
301 #else
302 static void
303 set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo)
304 {
305 
306 	wrp->wrh_hi = wr_hi;
307 	wmb();
308 	wrp->wrh_lo = wr_lo;
309 }
310 #endif
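/*
 * Note on the two variants above: the generation bit that tells the SGE a
 * work request header is valid lives in wrh_lo, so the high word must be
 * globally visible no later than the low word.  On LP64 platforms a single
 * 64-bit store publishes both halves at once; on 32-bit platforms the
 * wmb() between the two 32-bit stores preserves that ordering.
 */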
311 
312 struct coalesce_info {
313 	int count;
314 	int nbytes;
315 };
316 
317 static int
318 coalesce_check(struct mbuf *m, void *arg)
319 {
320 	struct coalesce_info *ci = arg;
321 	int *count = &ci->count;
322 	int *nbytes = &ci->nbytes;
323 
324 	if ((*nbytes == 0) || ((*nbytes + m->m_len <= 10500) &&
325 		(*count < 7) && (m->m_next == NULL))) {
326 		*count += 1;
327 		*nbytes += m->m_len;
328 		return (1);
329 	}
330 	return (0);
331 }
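/*
 * For example, under coalesce_check() three back-to-back single-mbuf
 * 1500-byte frames are all accepted into one batch (count stays below 7
 * and 4500 bytes is within the 10500-byte budget), while a chained mbuf
 * (m_next != NULL) is never added to an already started batch and an
 * eighth packet always ends it.  The 7-packet ceiling matches t3_encap(),
 * which refuses to build a cpl_tx_pkt_batch work request with more than
 * seven entries.
 */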
332 
333 static struct mbuf *
334 cxgb_dequeue(struct sge_qset *qs)
335 {
336 	struct mbuf *m, *m_head, *m_tail;
337 	struct coalesce_info ci;
338 
339 
340 	if (check_pkt_coalesce(qs) == 0)
341 		return TXQ_RING_DEQUEUE(qs);
342 
343 	m_head = m_tail = NULL;
344 	ci.count = ci.nbytes = 0;
345 	do {
346 		m = TXQ_RING_DEQUEUE_COND(qs, coalesce_check, &ci);
347 		if (m_head == NULL) {
348 			m_tail = m_head = m;
349 		} else if (m != NULL) {
350 			m_tail->m_nextpkt = m;
351 			m_tail = m;
352 		}
353 	} while (m != NULL);
354 	if (ci.count > 7)
355 		panic("trying to coalesce %d packets in to one WR", ci.count);
356 	return (m_head);
357 }
358 
359 /**
360  *	reclaim_completed_tx - reclaims completed Tx descriptors
361  *	@qs: the queue set containing the Tx queue
362  *	@queue: the index of the Tx queue to reclaim completed descriptors from
363  *
364  *	Reclaims Tx descriptors that the SGE has indicated it has processed,
365  *	and frees the associated buffers if possible.  Called with the Tx
366  *	queue's lock held.
367  */
368 static __inline int
369 reclaim_completed_tx(struct sge_qset *qs, int reclaim_min, int queue)
370 {
371 	struct sge_txq *q = &qs->txq[queue];
372 	int reclaim = desc_reclaimable(q);
373 
374 	if ((cxgb_tx_reclaim_threshold > TX_RECLAIM_MAX) ||
375 	    (cxgb_tx_reclaim_threshold < TX_RECLAIM_MIN))
376 		cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT;
377 
378 	if (reclaim < reclaim_min)
379 		return (0);
380 
381 	mtx_assert(&qs->lock, MA_OWNED);
382 	if (reclaim > 0) {
383 		t3_free_tx_desc(qs, reclaim, queue);
384 		q->cleaned += reclaim;
385 		q->in_use -= reclaim;
386 	}
387 	if (isset(&qs->txq_stopped, TXQ_ETH))
388                 clrbit(&qs->txq_stopped, TXQ_ETH);
389 
390 	return (reclaim);
391 }
392 
393 /**
394  *	should_restart_tx - are there enough resources to restart a Tx queue?
395  *	@q: the Tx queue
396  *
397  *	Checks if there are enough descriptors to restart a suspended Tx queue.
398  */
399 static __inline int
400 should_restart_tx(const struct sge_txq *q)
401 {
402 	unsigned int r = q->processed - q->cleaned;
403 
404 	return q->in_use - r < (q->size >> 1);
405 }
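/*
 * Worked example: with an illustrative 1024-entry queue, in_use == 900 and
 * 500 descriptors processed by the SGE but not yet cleaned, the effective
 * occupancy is 900 - 500 = 400, which is below half the ring (512), so a
 * suspended queue would be eligible for restart.
 */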
406 
407 /**
408  *	t3_sge_init - initialize SGE
409  *	@adap: the adapter
410  *	@p: the SGE parameters
411  *
412  *	Performs SGE initialization needed every time after a chip reset.
413  *	We do not initialize any of the queue sets here, instead the driver
414  *	top-level must request those individually.  We also do not enable DMA
415  *	here, that should be done after the queues have been set up.
416  */
417 void
418 t3_sge_init(adapter_t *adap, struct sge_params *p)
419 {
420 	u_int ctrl, ups;
421 
422 	ups = 0; /* = ffs(pci_resource_len(adap->pdev, 2) >> 12); */
423 
424 	ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL |
425 	       F_CQCRDTCTRL | F_CONGMODE | F_TNLFLMODE | F_FATLPERREN |
426 	       V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS |
427 	       V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING;
428 #if SGE_NUM_GENBITS == 1
429 	ctrl |= F_EGRGENCTRL;
430 #endif
431 	if (adap->params.rev > 0) {
432 		if (!(adap->flags & (USING_MSIX | USING_MSI)))
433 			ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ;
434 	}
435 	t3_write_reg(adap, A_SG_CONTROL, ctrl);
436 	t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) |
437 		     V_LORCQDRBTHRSH(512));
438 	t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10);
439 	t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) |
440 		     V_TIMEOUT(200 * core_ticks_per_usec(adap)));
441 	t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH,
442 		     adap->params.rev < T3_REV_C ? 1000 : 500);
443 	t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256);
444 	t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000);
445 	t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256);
446 	t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff));
447 	t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024);
448 }
449 
450 
451 /**
452  *	sgl_len - calculates the size of an SGL of the given capacity
453  *	@n: the number of SGL entries
454  *
455  *	Calculates the number of flits needed for a scatter/gather list that
456  *	can hold the given number of entries.
457  */
458 static __inline unsigned int
459 sgl_len(unsigned int n)
460 {
461 	return ((3 * n) / 2 + (n & 1));
462 }
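/*
 * Each pair of SGL entries packs into 3 flits (two 32-bit lengths plus two
 * 64-bit addresses, cf. make_sgl() below), hence the 3*n/2 term, with one
 * extra flit when n is odd.  For example sgl_len(3) == 4 + 1 == 5 and
 * sgl_len(4) == 6.
 */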
463 
464 /**
465  *	get_imm_packet - return the next ingress packet buffer from a response
466  *	@resp: the response descriptor containing the packet data
467  *
468  *	Return a packet containing the immediate data of the given response.
469  */
470 static int
471 get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct mbuf *m)
472 {
473 
474 	m->m_len = m->m_pkthdr.len = IMMED_PKT_SIZE;
475 	m->m_ext.ext_buf = NULL;
476 	m->m_ext.ext_type = 0;
477 	memcpy(mtod(m, uint8_t *), resp->imm_data, IMMED_PKT_SIZE);
478 	return (0);
479 }
480 
481 static __inline u_int
482 flits_to_desc(u_int n)
483 {
484 	return (flit_desc_map[n]);
485 }
486 
487 #define SGE_PARERR (F_CPPARITYERROR | F_OCPARITYERROR | F_RCPARITYERROR | \
488 		    F_IRPARITYERROR | V_ITPARITYERROR(M_ITPARITYERROR) | \
489 		    V_FLPARITYERROR(M_FLPARITYERROR) | F_LODRBPARITYERROR | \
490 		    F_HIDRBPARITYERROR | F_LORCQPARITYERROR | \
491 		    F_HIRCQPARITYERROR)
492 #define SGE_FRAMINGERR (F_UC_REQ_FRAMINGERROR | F_R_REQ_FRAMINGERROR)
493 #define SGE_FATALERR (SGE_PARERR | SGE_FRAMINGERR | F_RSPQCREDITOVERFOW | \
494 		      F_RSPQDISABLED)
495 
496 /**
497  *	t3_sge_err_intr_handler - SGE async event interrupt handler
498  *	@adapter: the adapter
499  *
500  *	Interrupt handler for SGE asynchronous (non-data) events.
501  */
502 void
503 t3_sge_err_intr_handler(adapter_t *adapter)
504 {
505 	unsigned int v, status;
506 
507 	status = t3_read_reg(adapter, A_SG_INT_CAUSE);
508 	if (status & SGE_PARERR)
509 		CH_ALERT(adapter, "SGE parity error (0x%x)\n",
510 			 status & SGE_PARERR);
511 	if (status & SGE_FRAMINGERR)
512 		CH_ALERT(adapter, "SGE framing error (0x%x)\n",
513 			 status & SGE_FRAMINGERR);
514 	if (status & F_RSPQCREDITOVERFOW)
515 		CH_ALERT(adapter, "SGE response queue credit overflow\n");
516 
517 	if (status & F_RSPQDISABLED) {
518 		v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS);
519 
520 		CH_ALERT(adapter,
521 			 "packet delivered to disabled response queue (0x%x)\n",
522 			 (v >> S_RSPQ0DISABLED) & 0xff);
523 	}
524 
525 	t3_write_reg(adapter, A_SG_INT_CAUSE, status);
526 	if (status & SGE_FATALERR)
527 		t3_fatal_err(adapter);
528 }
529 
530 void
531 t3_sge_prep(adapter_t *adap, struct sge_params *p)
532 {
533 	int i, nqsets, fl_q_size, jumbo_q_size, use_16k, jumbo_buf_size;
534 
535 	nqsets = min(SGE_QSETS / adap->params.nports, mp_ncpus);
536 	nqsets *= adap->params.nports;
537 
538 	fl_q_size = min(nmbclusters/(3*nqsets), FL_Q_SIZE);
539 
540 	while (!powerof2(fl_q_size))
541 		fl_q_size--;
542 
543 	use_16k = cxgb_use_16k_clusters != -1 ? cxgb_use_16k_clusters :
544 	    is_offload(adap);
545 
546 #if __FreeBSD_version >= 700111
547 	if (use_16k) {
548 		jumbo_q_size = min(nmbjumbo16/(3*nqsets), JUMBO_Q_SIZE);
549 		jumbo_buf_size = MJUM16BYTES;
550 	} else {
551 		jumbo_q_size = min(nmbjumbo9/(3*nqsets), JUMBO_Q_SIZE);
552 		jumbo_buf_size = MJUM9BYTES;
553 	}
554 #else
555 	jumbo_q_size = min(nmbjumbop/(3*nqsets), JUMBO_Q_SIZE);
556 	jumbo_buf_size = MJUMPAGESIZE;
557 #endif
558 	while (!powerof2(jumbo_q_size))
559 		jumbo_q_size--;
560 
561 	if (fl_q_size < (FL_Q_SIZE / 4) || jumbo_q_size < (JUMBO_Q_SIZE / 2))
562 		device_printf(adap->dev,
563 		    "Insufficient clusters and/or jumbo buffers.\n");
564 
565 	p->max_pkt_size = jumbo_buf_size - sizeof(struct cpl_rx_data);
566 
567 	for (i = 0; i < SGE_QSETS; ++i) {
568 		struct qset_params *q = p->qset + i;
569 
570 		if (adap->params.nports > 2) {
571 			q->coalesce_usecs = 50;
572 		} else {
573 #ifdef INVARIANTS
574 			q->coalesce_usecs = 10;
575 #else
576 			q->coalesce_usecs = 5;
577 #endif
578 		}
579 		q->polling = 0;
580 		q->rspq_size = RSPQ_Q_SIZE;
581 		q->fl_size = fl_q_size;
582 		q->jumbo_size = jumbo_q_size;
583 		q->jumbo_buf_size = jumbo_buf_size;
584 		q->txq_size[TXQ_ETH] = TX_ETH_Q_SIZE;
585 		q->txq_size[TXQ_OFLD] = is_offload(adap) ? TX_OFLD_Q_SIZE : 16;
586 		q->txq_size[TXQ_CTRL] = TX_CTRL_Q_SIZE;
587 		q->cong_thres = 0;
588 	}
589 }
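/*
 * Sizing example with purely illustrative numbers: nmbclusters == 100000
 * and nqsets == 16 give min(100000 / 48, FL_Q_SIZE) == min(2083, FL_Q_SIZE)
 * for the regular free list, which is then rounded down to a power of two
 * (2048, assuming FL_Q_SIZE is at least that large).  The jumbo free list
 * gets the same one-third-of-the-pool-per-queue-set budget, drawn from
 * whichever jumbo cluster pool was selected above.
 */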
590 
591 int
592 t3_sge_alloc(adapter_t *sc)
593 {
594 
595 	/* The parent tag. */
596 	if (bus_dma_tag_create( bus_get_dma_tag(sc->dev),/* PCI parent */
597 				1, 0,			/* algnmnt, boundary */
598 				BUS_SPACE_MAXADDR,	/* lowaddr */
599 				BUS_SPACE_MAXADDR,	/* highaddr */
600 				NULL, NULL,		/* filter, filterarg */
601 				BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
602 				BUS_SPACE_UNRESTRICTED, /* nsegments */
603 				BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
604 				0,			/* flags */
605 				NULL, NULL,		/* lock, lockarg */
606 				&sc->parent_dmat)) {
607 		device_printf(sc->dev, "Cannot allocate parent DMA tag\n");
608 		return (ENOMEM);
609 	}
610 
611 	/*
612 	 * DMA tag for normal sized RX frames
613 	 */
614 	if (bus_dma_tag_create(sc->parent_dmat, MCLBYTES, 0, BUS_SPACE_MAXADDR,
615 		BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, 1,
616 		MCLBYTES, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_dmat)) {
617 		device_printf(sc->dev, "Cannot allocate RX DMA tag\n");
618 		return (ENOMEM);
619 	}
620 
621 	/*
622 	 * DMA tag for jumbo sized RX frames.
623 	 */
624 	if (bus_dma_tag_create(sc->parent_dmat, MJUM16BYTES, 0, BUS_SPACE_MAXADDR,
625 		BUS_SPACE_MAXADDR, NULL, NULL, MJUM16BYTES, 1, MJUM16BYTES,
626 		BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_jumbo_dmat)) {
627 		device_printf(sc->dev, "Cannot allocate RX jumbo DMA tag\n");
628 		return (ENOMEM);
629 	}
630 
631 	/*
632 	 * DMA tag for TX frames.
633 	 */
634 	if (bus_dma_tag_create(sc->parent_dmat, 1, 0, BUS_SPACE_MAXADDR,
635 		BUS_SPACE_MAXADDR, NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
636 		TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
637 		NULL, NULL, &sc->tx_dmat)) {
638 		device_printf(sc->dev, "Cannot allocate TX DMA tag\n");
639 		return (ENOMEM);
640 	}
641 
642 	return (0);
643 }
644 
645 int
646 t3_sge_free(struct adapter * sc)
647 {
648 
649 	if (sc->tx_dmat != NULL)
650 		bus_dma_tag_destroy(sc->tx_dmat);
651 
652 	if (sc->rx_jumbo_dmat != NULL)
653 		bus_dma_tag_destroy(sc->rx_jumbo_dmat);
654 
655 	if (sc->rx_dmat != NULL)
656 		bus_dma_tag_destroy(sc->rx_dmat);
657 
658 	if (sc->parent_dmat != NULL)
659 		bus_dma_tag_destroy(sc->parent_dmat);
660 
661 	return (0);
662 }
663 
664 void
665 t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p)
666 {
667 
668 	qs->rspq.holdoff_tmr = max(p->coalesce_usecs * 10, 1U);
669 	qs->rspq.polling = 0 /* p->polling */;
670 }
671 
672 #if !defined(__i386__) && !defined(__amd64__)
673 static void
674 refill_fl_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
675 {
676 	struct refill_fl_cb_arg *cb_arg = arg;
677 
678 	cb_arg->error = error;
679 	cb_arg->seg = segs[0];
680 	cb_arg->nseg = nseg;
681 
682 }
683 #endif
684 /**
685  *	refill_fl - refill an SGE free-buffer list
686  *	@sc: the controller softc
687  *	@q: the free-list to refill
688  *	@n: the number of new buffers to allocate
689  *
690  *	(Re)populate an SGE free-buffer list with up to @n new packet buffers.
691  *	The caller must ensure that @n does not exceed the queue's capacity.
692  */
693 static void
694 refill_fl(adapter_t *sc, struct sge_fl *q, int n)
695 {
696 	struct rx_sw_desc *sd = &q->sdesc[q->pidx];
697 	struct rx_desc *d = &q->desc[q->pidx];
698 	struct refill_fl_cb_arg cb_arg;
699 	struct mbuf *m;
700 	caddr_t cl;
701 	int err;
702 
703 	cb_arg.error = 0;
704 	while (n--) {
705 		/*
706 		 * We only allocate a cluster here; mbuf allocation happens after rx
707 		 */
708 		if (q->zone == zone_pack) {
709 			if ((m = m_getcl(M_NOWAIT, MT_NOINIT, M_PKTHDR)) == NULL)
710 				break;
711 			cl = m->m_ext.ext_buf;
712 		} else {
713 			if ((cl = m_cljget(NULL, M_NOWAIT, q->buf_size)) == NULL)
714 				break;
715 			if ((m = m_gethdr(M_NOWAIT, MT_NOINIT)) == NULL) {
716 				uma_zfree(q->zone, cl);
717 				break;
718 			}
719 		}
720 		if ((sd->flags & RX_SW_DESC_MAP_CREATED) == 0) {
721 			if ((err = bus_dmamap_create(q->entry_tag, 0, &sd->map))) {
722 				log(LOG_WARNING, "bus_dmamap_create failed %d\n", err);
723 				uma_zfree(q->zone, cl);
724 				goto done;
725 			}
726 			sd->flags |= RX_SW_DESC_MAP_CREATED;
727 		}
728 #if !defined(__i386__) && !defined(__amd64__)
729 		err = bus_dmamap_load(q->entry_tag, sd->map,
730 		    cl, q->buf_size, refill_fl_cb, &cb_arg, 0);
731 
732 		if (err != 0 || cb_arg.error) {
733 			if (q->zone == zone_pack)
734 				uma_zfree(q->zone, cl);
735 			m_free(m);
736 			goto done;
737 		}
738 #else
739 		cb_arg.seg.ds_addr = pmap_kextract((vm_offset_t)cl);
740 #endif
741 		sd->flags |= RX_SW_DESC_INUSE;
742 		sd->rxsd_cl = cl;
743 		sd->m = m;
744 		d->addr_lo = htobe32(cb_arg.seg.ds_addr & 0xffffffff);
745 		d->addr_hi = htobe32(((uint64_t)cb_arg.seg.ds_addr >>32) & 0xffffffff);
746 		d->len_gen = htobe32(V_FLD_GEN1(q->gen));
747 		d->gen2 = htobe32(V_FLD_GEN2(q->gen));
748 
749 		d++;
750 		sd++;
751 
752 		if (++q->pidx == q->size) {
753 			q->pidx = 0;
754 			q->gen ^= 1;
755 			sd = q->sdesc;
756 			d = q->desc;
757 		}
758 		q->credits++;
759 		q->db_pending++;
760 	}
761 
762 done:
763 	if (q->db_pending >= 32) {
764 		q->db_pending = 0;
765 		t3_write_reg(sc, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
766 	}
767 }
768 
769 
770 /**
771  *	free_rx_bufs - free the Rx buffers on an SGE free list
772  *	@sc: the controller softc
773  *	@q: the SGE free list to clean up
774  *
775  *	Release the buffers on an SGE free-buffer Rx queue.  HW fetching from
776  *	this queue should be stopped before calling this function.
777  */
778 static void
779 free_rx_bufs(adapter_t *sc, struct sge_fl *q)
780 {
781 	u_int cidx = q->cidx;
782 
783 	while (q->credits--) {
784 		struct rx_sw_desc *d = &q->sdesc[cidx];
785 
786 		if (d->flags & RX_SW_DESC_INUSE) {
787 			bus_dmamap_unload(q->entry_tag, d->map);
788 			bus_dmamap_destroy(q->entry_tag, d->map);
789 			if (q->zone == zone_pack) {
790 				m_init(d->m, zone_pack, MCLBYTES,
791 				    M_NOWAIT, MT_DATA, M_EXT);
792 				uma_zfree(zone_pack, d->m);
793 			} else {
794 				m_init(d->m, zone_mbuf, MLEN,
795 				    M_NOWAIT, MT_DATA, 0);
796 				uma_zfree(zone_mbuf, d->m);
797 				uma_zfree(q->zone, d->rxsd_cl);
798 			}
799 		}
800 
801 		d->rxsd_cl = NULL;
802 		d->m = NULL;
803 		if (++cidx == q->size)
804 			cidx = 0;
805 	}
806 }
807 
808 static __inline void
809 __refill_fl(adapter_t *adap, struct sge_fl *fl)
810 {
811 	refill_fl(adap, fl, min(16U, fl->size - fl->credits));
812 }
813 
814 static __inline void
815 __refill_fl_lt(adapter_t *adap, struct sge_fl *fl, int max)
816 {
817 	uint32_t reclaimable = fl->size - fl->credits;
818 
819 	if (reclaimable > 0)
820 		refill_fl(adap, fl, min(max, reclaimable));
821 }
822 
823 /**
824  *	recycle_rx_buf - recycle a receive buffer
825  *	@adapter: the adapter
826  *	@q: the SGE free list
827  *	@idx: index of buffer to recycle
828  *
829  *	Recycles the specified buffer on the given free list by adding it at
830  *	the next available slot on the list.
831  */
832 static void
833 recycle_rx_buf(adapter_t *adap, struct sge_fl *q, unsigned int idx)
834 {
835 	struct rx_desc *from = &q->desc[idx];
836 	struct rx_desc *to   = &q->desc[q->pidx];
837 
838 	q->sdesc[q->pidx] = q->sdesc[idx];
839 	to->addr_lo = from->addr_lo;        // already big endian
840 	to->addr_hi = from->addr_hi;        // likewise
841 	wmb();	/* necessary ? */
842 	to->len_gen = htobe32(V_FLD_GEN1(q->gen));
843 	to->gen2 = htobe32(V_FLD_GEN2(q->gen));
844 	q->credits++;
845 
846 	if (++q->pidx == q->size) {
847 		q->pidx = 0;
848 		q->gen ^= 1;
849 	}
850 	t3_write_reg(adap, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
851 }
852 
853 static void
854 alloc_ring_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
855 {
856 	uint32_t *addr;
857 
858 	addr = arg;
859 	*addr = segs[0].ds_addr;
860 }
861 
862 static int
863 alloc_ring(adapter_t *sc, size_t nelem, size_t elem_size, size_t sw_size,
864     bus_addr_t *phys, void *desc, void *sdesc, bus_dma_tag_t *tag,
865     bus_dmamap_t *map, bus_dma_tag_t parent_entry_tag, bus_dma_tag_t *entry_tag)
866 {
867 	size_t len = nelem * elem_size;
868 	void *s = NULL;
869 	void *p = NULL;
870 	int err;
871 
872 	if ((err = bus_dma_tag_create(sc->parent_dmat, PAGE_SIZE, 0,
873 				      BUS_SPACE_MAXADDR_32BIT,
874 				      BUS_SPACE_MAXADDR, NULL, NULL, len, 1,
875 				      len, 0, NULL, NULL, tag)) != 0) {
876 		device_printf(sc->dev, "Cannot allocate descriptor tag\n");
877 		return (ENOMEM);
878 	}
879 
880 	if ((err = bus_dmamem_alloc(*tag, (void **)&p, BUS_DMA_NOWAIT,
881 				    map)) != 0) {
882 		device_printf(sc->dev, "Cannot allocate descriptor memory\n");
883 		return (ENOMEM);
884 	}
885 
886 	bus_dmamap_load(*tag, *map, p, len, alloc_ring_cb, phys, 0);
887 	bzero(p, len);
888 	*(void **)desc = p;
889 
890 	if (sw_size) {
891 		len = nelem * sw_size;
892 		s = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO);
893 		*(void **)sdesc = s;
894 	}
895 	if (parent_entry_tag == NULL)
896 		return (0);
897 
898 	if ((err = bus_dma_tag_create(parent_entry_tag, 1, 0,
899 				      BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR,
900 		                      NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
901 				      TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
902 		                      NULL, NULL, entry_tag)) != 0) {
903 		device_printf(sc->dev, "Cannot allocate descriptor entry tag\n");
904 		return (ENOMEM);
905 	}
906 	return (0);
907 }
908 
909 static void
910 sge_slow_intr_handler(void *arg, int ncount)
911 {
912 	adapter_t *sc = arg;
913 
914 	t3_slow_intr_handler(sc);
915 	t3_write_reg(sc, A_PL_INT_ENABLE0, sc->slow_intr_mask);
916 	(void) t3_read_reg(sc, A_PL_INT_ENABLE0);
917 }
918 
919 /**
920  *	sge_timer_cb - perform periodic maintenance of the adapter's SGE queue sets
921  *	@arg: the adapter to maintain
922  *
923  *	Runs periodically from a timer to perform maintenance of an SGE queue
924  *	set.  It performs the following tasks:
925  *
926  *	a) Cleans up any completed Tx descriptors that may still be pending.
927  *	Normal descriptor cleanup happens when new packets are added to a Tx
928  *	queue so this timer is relatively infrequent and does any cleanup only
929  *	if the Tx queue has not seen any new packets in a while.  We make a
930  *	best effort attempt to reclaim descriptors, in that we don't wait
931  *	around if we cannot get a queue's lock (which most likely is because
932  *	someone else is queueing new packets and so will also handle the clean
933  *	up).  Since control queues use immediate data exclusively we don't
934  *	bother cleaning them up here.
935  *
936  *	b) Replenishes Rx queues that have run out due to memory shortage.
937  *	Normally new Rx buffers are added when existing ones are consumed but
938  *	when out of memory a queue can become empty.  We try to add only a few
939  *	buffers here, the queue will be replenished fully as these new buffers
940  *	are used up if memory shortage has subsided.
941  *
942  *	c) Returns coalesced response queue credits in case a response queue is
943  *	starved.
944  *
945  *	d) Rings doorbells for T304 tunnel queues since we have seen doorbell
946  *	fifo overflows and the FW doesn't implement any recovery scheme yet.
947  */
948 static void
949 sge_timer_cb(void *arg)
950 {
951 	adapter_t *sc = arg;
952 	if ((sc->flags & USING_MSIX) == 0) {
953 
954 		struct port_info *pi;
955 		struct sge_qset *qs;
956 		struct sge_txq  *txq;
957 		int i, j;
958 		int reclaim_ofl, refill_rx;
959 
960 		if (sc->open_device_map == 0)
961 			return;
962 
963 		for (i = 0; i < sc->params.nports; i++) {
964 			pi = &sc->port[i];
965 			for (j = 0; j < pi->nqsets; j++) {
966 				qs = &sc->sge.qs[pi->first_qset + j];
967 				txq = &qs->txq[0];
968 				reclaim_ofl = txq[TXQ_OFLD].processed - txq[TXQ_OFLD].cleaned;
969 				refill_rx = ((qs->fl[0].credits < qs->fl[0].size) ||
970 				    (qs->fl[1].credits < qs->fl[1].size));
971 				if (reclaim_ofl || refill_rx) {
972 					taskqueue_enqueue(sc->tq, &pi->timer_reclaim_task);
973 					break;
974 				}
975 			}
976 		}
977 	}
978 
979 	if (sc->params.nports > 2) {
980 		int i;
981 
982 		for_each_port(sc, i) {
983 			struct port_info *pi = &sc->port[i];
984 
985 			t3_write_reg(sc, A_SG_KDOORBELL,
986 				     F_SELEGRCNTX |
987 				     (FW_TUNNEL_SGEEC_START + pi->first_qset));
988 		}
989 	}
990 	if (((sc->flags & USING_MSIX) == 0 || sc->params.nports > 2) &&
991 	    sc->open_device_map != 0)
992 		callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
993 }
994 
995 /*
996  * This is meant to be a catch-all function to keep sge state private
997  * to sge.c
998  *
999  */
1000 int
1001 t3_sge_init_adapter(adapter_t *sc)
1002 {
1003 	callout_init(&sc->sge_timer_ch, CALLOUT_MPSAFE);
1004 	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
1005 	TASK_INIT(&sc->slow_intr_task, 0, sge_slow_intr_handler, sc);
1006 	return (0);
1007 }
1008 
1009 int
1010 t3_sge_reset_adapter(adapter_t *sc)
1011 {
1012 	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
1013 	return (0);
1014 }
1015 
1016 int
1017 t3_sge_init_port(struct port_info *pi)
1018 {
1019 	TASK_INIT(&pi->timer_reclaim_task, 0, sge_timer_reclaim, pi);
1020 	return (0);
1021 }
1022 
1023 /**
1024  *	refill_rspq - replenish an SGE response queue
1025  *	@adapter: the adapter
1026  *	@q: the response queue to replenish
1027  *	@credits: how many new responses to make available
1028  *
1029  *	Replenishes a response queue by making the supplied number of responses
1030  *	available to HW.
1031  */
1032 static __inline void
1033 refill_rspq(adapter_t *sc, const struct sge_rspq *q, u_int credits)
1034 {
1035 
1036 	/* mbufs are allocated on demand when a rspq entry is processed. */
1037 	t3_write_reg(sc, A_SG_RSPQ_CREDIT_RETURN,
1038 		     V_RSPQ(q->cntxt_id) | V_CREDITS(credits));
1039 }
1040 
1041 static void
1042 sge_txq_reclaim_handler(void *arg, int ncount)
1043 {
1044 	struct sge_qset *qs = arg;
1045 	int i;
1046 
1047 	for (i = 0; i < 3; i++)
1048 		reclaim_completed_tx(qs, 16, i);
1049 }
1050 
1051 static void
1052 sge_timer_reclaim(void *arg, int ncount)
1053 {
1054 	struct port_info *pi = arg;
1055 	int i, nqsets = pi->nqsets;
1056 	adapter_t *sc = pi->adapter;
1057 	struct sge_qset *qs;
1058 	struct mtx *lock;
1059 
1060 	KASSERT((sc->flags & USING_MSIX) == 0,
1061 	    ("can't call timer reclaim for msi-x"));
1062 
1063 	for (i = 0; i < nqsets; i++) {
1064 		qs = &sc->sge.qs[pi->first_qset + i];
1065 
1066 		reclaim_completed_tx(qs, 16, TXQ_OFLD);
1067 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
1068 			    &sc->sge.qs[0].rspq.lock;
1069 
1070 		if (mtx_trylock(lock)) {
1071 			/* XXX currently assume that we are *NOT* polling */
1072 			uint32_t status = t3_read_reg(sc, A_SG_RSPQ_FL_STATUS);
1073 
1074 			if (qs->fl[0].credits < qs->fl[0].size - 16)
1075 				__refill_fl(sc, &qs->fl[0]);
1076 			if (qs->fl[1].credits < qs->fl[1].size - 16)
1077 				__refill_fl(sc, &qs->fl[1]);
1078 
1079 			if (status & (1 << qs->rspq.cntxt_id)) {
1080 				if (qs->rspq.credits) {
1081 					refill_rspq(sc, &qs->rspq, 1);
1082 					qs->rspq.credits--;
1083 					t3_write_reg(sc, A_SG_RSPQ_FL_STATUS,
1084 					    1 << qs->rspq.cntxt_id);
1085 				}
1086 			}
1087 			mtx_unlock(lock);
1088 		}
1089 	}
1090 }
1091 
1092 /**
1093  *	init_qset_cntxt - initialize an SGE queue set context info
1094  *	@qs: the queue set
1095  *	@id: the queue set id
1096  *
1097  *	Initializes the TIDs and context ids for the queues of a queue set.
1098  */
1099 static void
1100 init_qset_cntxt(struct sge_qset *qs, u_int id)
1101 {
1102 
1103 	qs->rspq.cntxt_id = id;
1104 	qs->fl[0].cntxt_id = 2 * id;
1105 	qs->fl[1].cntxt_id = 2 * id + 1;
1106 	qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id;
1107 	qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id;
1108 	qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id;
1109 	qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id;
1110 	qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id;
1111 
1112 	mbufq_init(&qs->txq[TXQ_ETH].sendq);
1113 	mbufq_init(&qs->txq[TXQ_OFLD].sendq);
1114 	mbufq_init(&qs->txq[TXQ_CTRL].sendq);
1115 }
1116 
1117 
1118 static void
1119 txq_prod(struct sge_txq *txq, unsigned int ndesc, struct txq_state *txqs)
1120 {
1121 	txq->in_use += ndesc;
1122 	/*
1123 	 * XXX we don't handle stopping of the queue;
1124 	 * presumably start handles this when we bump against the end
1125 	 */
1126 	txqs->gen = txq->gen;
1127 	txq->unacked += ndesc;
1128 	txqs->compl = (txq->unacked & 32) << (S_WR_COMPL - 5);
1129 	txq->unacked &= 31;
1130 	txqs->pidx = txq->pidx;
1131 	txq->pidx += ndesc;
1132 #ifdef INVARIANTS
1133 	if (((txqs->pidx > txq->cidx) &&
1134 		(txq->pidx < txqs->pidx) &&
1135 		(txq->pidx >= txq->cidx)) ||
1136 	    ((txqs->pidx < txq->cidx) &&
1137 		(txq->pidx >= txq-> cidx)) ||
1138 	    ((txqs->pidx < txq->cidx) &&
1139 		(txq->cidx < txqs->pidx)))
1140 		panic("txqs->pidx=%d txq->pidx=%d txq->cidx=%d",
1141 		    txqs->pidx, txq->pidx, txq->cidx);
1142 #endif
1143 	if (txq->pidx >= txq->size) {
1144 		txq->pidx -= txq->size;
1145 		txq->gen ^= 1;
1146 	}
1147 
1148 }
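/*
 * Note on txqs->compl above: unacked accumulates descriptors handed to the
 * hardware and is masked back into 0..31 each time bit 5 becomes set, so
 * roughly every 32nd descriptor the computed compl value requests a
 * work-request completion, giving the driver a periodic indication of how
 * far the SGE has progressed through the ring.
 */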
1149 
1150 /**
1151  *	calc_tx_descs - calculate the number of Tx descriptors for a packet
1152  *	@m: the packet mbufs
1153  *      @nsegs: the number of segments
1154  *
1155  * 	Returns the number of Tx descriptors needed for the given Ethernet
1156  * 	packet.  Ethernet packets require the addition of WR and CPL headers.
1157  */
1158 static __inline unsigned int
1159 calc_tx_descs(const struct mbuf *m, int nsegs)
1160 {
1161 	unsigned int flits;
1162 
1163 	if (m->m_pkthdr.len <= PIO_LEN)
1164 		return 1;
1165 
1166 	flits = sgl_len(nsegs) + 2;
1167 	if (m->m_pkthdr.csum_flags & CSUM_TSO)
1168 		flits++;
1169 
1170 	return flits_to_desc(flits);
1171 }
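/*
 * Example: a packet larger than PIO_LEN that maps to 4 DMA segments needs
 * sgl_len(4) + 2 == 8 flits (9 with TSO), which flits_to_desc() maps to a
 * single descriptor; an illustrative worst case of 30 segments needs
 * sgl_len(30) + 2 == 47 flits and therefore 4 descriptors, the most the
 * hardware will combine into one work request.
 */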
1172 
1173 static unsigned int
1174 busdma_map_mbufs(struct mbuf **m, struct sge_txq *txq,
1175     struct tx_sw_desc *txsd, bus_dma_segment_t *segs, int *nsegs)
1176 {
1177 	struct mbuf *m0;
1178 	int err, pktlen, pass = 0;
1179 	bus_dma_tag_t tag = txq->entry_tag;
1180 
1181 retry:
1182 	err = 0;
1183 	m0 = *m;
1184 	pktlen = m0->m_pkthdr.len;
1185 #if defined(__i386__) || defined(__amd64__)
1186 	if (busdma_map_sg_collapse(tag, txsd->map, m, segs, nsegs) == 0) {
1187 		goto done;
1188 	} else
1189 #endif
1190 		err = bus_dmamap_load_mbuf_sg(tag, txsd->map, m0, segs, nsegs, 0);
1191 
1192 	if (err == 0) {
1193 		goto done;
1194 	}
1195 	if (err == EFBIG && pass == 0) {
1196 		pass = 1;
1197 		/* Too many segments, try to defrag */
1198 		m0 = m_defrag(m0, M_DONTWAIT);
1199 		if (m0 == NULL) {
1200 			m_freem(*m);
1201 			*m = NULL;
1202 			return (ENOBUFS);
1203 		}
1204 		*m = m0;
1205 		goto retry;
1206 	} else if (err == ENOMEM) {
1207 		return (err);
1208 	} else if (err) {
1209 		if (cxgb_debug)
1210 			printf("map failure err=%d pktlen=%d\n", err, pktlen);
1211 		m_freem(m0);
1212 		*m = NULL;
1213 		return (err);
1214 	}
1215 done:
1216 #if !defined(__i386__) && !defined(__amd64__)
1217 	bus_dmamap_sync(tag, txsd->map, BUS_DMASYNC_PREWRITE);
1218 #endif
1219 	txsd->flags |= TX_SW_DESC_MAPPED;
1220 
1221 	return (0);
1222 }
1223 
1224 /**
1225  *	make_sgl - populate a scatter/gather list for a packet
1226  *	@sgp: the SGL to populate
1227  *	@segs: the packet dma segments
1228  *	@nsegs: the number of segments
1229  *
1230  *	Generates a scatter/gather list for the buffers that make up a packet
1231  *	and returns the SGL size in 8-byte words.  The caller must size the SGL
1232  *	appropriately.
1233  */
1234 static __inline void
1235 make_sgl(struct sg_ent *sgp, bus_dma_segment_t *segs, int nsegs)
1236 {
1237 	int i, idx;
1238 
1239 	for (idx = 0, i = 0; i < nsegs; i++) {
1240 		/*
1241 		 * firmware doesn't like empty segments
1242 		 */
1243 		if (segs[i].ds_len == 0)
1244 			continue;
1245 		if (i && idx == 0)
1246 			++sgp;
1247 
1248 		sgp->len[idx] = htobe32(segs[i].ds_len);
1249 		sgp->addr[idx] = htobe64(segs[i].ds_addr);
1250 		idx ^= 1;
1251 	}
1252 
1253 	if (idx) {
1254 		sgp->len[idx] = 0;
1255 		sgp->addr[idx] = 0;
1256 	}
1257 }
1258 
1259 /**
1260  *	check_ring_tx_db - check and potentially ring a Tx queue's doorbell
1261  *	@adap: the adapter
1262  *	@q: the Tx queue
1263  *
1264  *	Ring the doorbell if a Tx queue is asleep.  There is a natural race
1265  *	where the HW may go to sleep just after we check; in that case the
1266  *	interrupt handler will detect the outstanding TX packet and ring
1267  *	the doorbell for us.
1268  *
1269  *	When GTS is disabled we unconditionally ring the doorbell.
1270  */
1271 static __inline void
1272 check_ring_tx_db(adapter_t *adap, struct sge_txq *q, int mustring)
1273 {
1274 #if USE_GTS
1275 	clear_bit(TXQ_LAST_PKT_DB, &q->flags);
1276 	if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) {
1277 		set_bit(TXQ_LAST_PKT_DB, &q->flags);
1278 #ifdef T3_TRACE
1279 		T3_TRACE1(adap->tb[q->cntxt_id & 7], "doorbell Tx, cntxt %d",
1280 			  q->cntxt_id);
1281 #endif
1282 		t3_write_reg(adap, A_SG_KDOORBELL,
1283 			     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1284 	}
1285 #else
1286 	if (mustring || ++q->db_pending >= 32) {
1287 		wmb();            /* write descriptors before telling HW */
1288 		t3_write_reg(adap, A_SG_KDOORBELL,
1289 		    F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1290 		q->db_pending = 0;
1291 	}
1292 #endif
1293 }
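/*
 * With GTS disabled (USE_GTS == 0 above) doorbell writes are batched:
 * t3_encap() calls this with mustring == 0, so A_SG_KDOORBELL is written
 * only once db_pending reaches 32, while cxgb_start_locked() and
 * cxgb_transmit_locked() pass mustring != 0 to flush whatever is still
 * pending before the queue goes idle.
 */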
1294 
1295 static __inline void
1296 wr_gen2(struct tx_desc *d, unsigned int gen)
1297 {
1298 #if SGE_NUM_GENBITS == 2
1299 	d->flit[TX_DESC_FLITS - 1] = htobe64(gen);
1300 #endif
1301 }
1302 
1303 /**
1304  *	write_wr_hdr_sgl - write a WR header and, optionally, SGL
1305  *	@ndesc: number of Tx descriptors spanned by the SGL
1306  *	@txd: first Tx descriptor to be written
1307  *	@txqs: txq state (generation and producer index)
1308  *	@txq: the SGE Tx queue
1309  *	@sgl: the SGL
1310  *	@flits: number of flits to the start of the SGL in the first descriptor
1311  *	@sgl_flits: the SGL size in flits
1312  *	@wr_hi: top 32 bits of WR header based on WR type (big endian)
1313  *	@wr_lo: low 32 bits of WR header based on WR type (big endian)
1314  *
1315  *	Write a work request header and an associated SGL.  If the SGL is
1316  *	small enough to fit into one Tx descriptor it has already been written
1317  *	and we just need to write the WR header.  Otherwise we distribute the
1318  *	SGL across the number of descriptors it spans.
1319  */
1320 static void
1321 write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs,
1322     const struct sge_txq *txq, const struct sg_ent *sgl, unsigned int flits,
1323     unsigned int sgl_flits, unsigned int wr_hi, unsigned int wr_lo)
1324 {
1325 
1326 	struct work_request_hdr *wrp = (struct work_request_hdr *)txd;
1327 	struct tx_sw_desc *txsd = &txq->sdesc[txqs->pidx];
1328 
1329 	if (__predict_true(ndesc == 1)) {
1330 		set_wr_hdr(wrp, htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1331 			V_WR_SGLSFLT(flits)) | wr_hi,
1332 		    htonl(V_WR_LEN(flits + sgl_flits) |
1333 			V_WR_GEN(txqs->gen)) | wr_lo);
1334 		/* XXX gen? */
1335 		wr_gen2(txd, txqs->gen);
1336 
1337 	} else {
1338 		unsigned int ogen = txqs->gen;
1339 		const uint64_t *fp = (const uint64_t *)sgl;
1340 		struct work_request_hdr *wp = wrp;
1341 
1342 		wrp->wrh_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) |
1343 		    V_WR_SGLSFLT(flits)) | wr_hi;
1344 
1345 		while (sgl_flits) {
1346 			unsigned int avail = WR_FLITS - flits;
1347 
1348 			if (avail > sgl_flits)
1349 				avail = sgl_flits;
1350 			memcpy(&txd->flit[flits], fp, avail * sizeof(*fp));
1351 			sgl_flits -= avail;
1352 			ndesc--;
1353 			if (!sgl_flits)
1354 				break;
1355 
1356 			fp += avail;
1357 			txd++;
1358 			txsd++;
1359 			if (++txqs->pidx == txq->size) {
1360 				txqs->pidx = 0;
1361 				txqs->gen ^= 1;
1362 				txd = txq->desc;
1363 				txsd = txq->sdesc;
1364 			}
1365 
1366 			/*
1367 			 * when the head of the mbuf chain
1368 			 * is freed, all clusters will be freed
1369 			 * with it
1370 			 */
1371 			wrp = (struct work_request_hdr *)txd;
1372 			wrp->wrh_hi = htonl(V_WR_DATATYPE(1) |
1373 			    V_WR_SGLSFLT(1)) | wr_hi;
1374 			wrp->wrh_lo = htonl(V_WR_LEN(min(WR_FLITS,
1375 				    sgl_flits + 1)) |
1376 			    V_WR_GEN(txqs->gen)) | wr_lo;
1377 			wr_gen2(txd, txqs->gen);
1378 			flits = 1;
1379 		}
1380 		wrp->wrh_hi |= htonl(F_WR_EOP);
1381 		wmb();
1382 		wp->wrh_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo;
1383 		wr_gen2((struct tx_desc *)wp, ogen);
1384 	}
1385 }
1386 
1387 /* sizeof(*eh) + sizeof(*ip) + sizeof(*tcp) */
1388 #define TCPPKTHDRSIZE (ETHER_HDR_LEN + 20 + 20)
1389 
1390 #define GET_VTAG(cntrl, m) \
1391 do { \
1392 	if ((m)->m_flags & M_VLANTAG)					            \
1393 		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN((m)->m_pkthdr.ether_vtag); \
1394 } while (0)
1395 
1396 static int
1397 t3_encap(struct sge_qset *qs, struct mbuf **m)
1398 {
1399 	adapter_t *sc;
1400 	struct mbuf *m0;
1401 	struct sge_txq *txq;
1402 	struct txq_state txqs;
1403 	struct port_info *pi;
1404 	unsigned int ndesc, flits, cntrl, mlen;
1405 	int err, nsegs, tso_info = 0;
1406 
1407 	struct work_request_hdr *wrp;
1408 	struct tx_sw_desc *txsd;
1409 	struct sg_ent *sgp, *sgl;
1410 	uint32_t wr_hi, wr_lo, sgl_flits;
1411 	bus_dma_segment_t segs[TX_MAX_SEGS];
1412 
1413 	struct tx_desc *txd;
1414 
1415 	pi = qs->port;
1416 	sc = pi->adapter;
1417 	txq = &qs->txq[TXQ_ETH];
1418 	txd = &txq->desc[txq->pidx];
1419 	txsd = &txq->sdesc[txq->pidx];
1420 	sgl = txq->txq_sgl;
1421 
1422 	prefetch(txd);
1423 	m0 = *m;
1424 
1425 	mtx_assert(&qs->lock, MA_OWNED);
1426 	cntrl = V_TXPKT_INTF(pi->txpkt_intf);
1427 	KASSERT(m0->m_flags & M_PKTHDR, ("not packet header\n"));
1428 
1429 	if  (m0->m_nextpkt == NULL && m0->m_next != NULL &&
1430 	    m0->m_pkthdr.csum_flags & (CSUM_TSO))
1431 		tso_info = V_LSO_MSS(m0->m_pkthdr.tso_segsz);
1432 
1433 	if (m0->m_nextpkt != NULL) {
1434 		busdma_map_sg_vec(txq->entry_tag, txsd->map, m0, segs, &nsegs);
1435 		ndesc = 1;
1436 		mlen = 0;
1437 	} else {
1438 		if ((err = busdma_map_sg_collapse(txq->entry_tag, txsd->map,
1439 		    &m0, segs, &nsegs))) {
1440 			if (cxgb_debug)
1441 				printf("failed ... err=%d\n", err);
1442 			return (err);
1443 		}
1444 		mlen = m0->m_pkthdr.len;
1445 		ndesc = calc_tx_descs(m0, nsegs);
1446 	}
1447 	txq_prod(txq, ndesc, &txqs);
1448 
1449 	KASSERT(m0->m_pkthdr.len, ("empty packet nsegs=%d", nsegs));
1450 	txsd->m = m0;
1451 
1452 	if (m0->m_nextpkt != NULL) {
1453 		struct cpl_tx_pkt_batch *cpl_batch = (struct cpl_tx_pkt_batch *)txd;
1454 		int i, fidx;
1455 
1456 		if (nsegs > 7)
1457 			panic("trying to coalesce %d packets in to one WR", nsegs);
1458 		txq->txq_coalesced += nsegs;
1459 		wrp = (struct work_request_hdr *)txd;
1460 		flits = nsegs*2 + 1;
1461 
1462 		for (fidx = 1, i = 0; i < nsegs; i++, fidx += 2) {
1463 			struct cpl_tx_pkt_batch_entry *cbe;
1464 			uint64_t flit;
1465 			uint32_t *hflit = (uint32_t *)&flit;
1466 			int cflags = m0->m_pkthdr.csum_flags;
1467 
1468 			cntrl = V_TXPKT_INTF(pi->txpkt_intf);
1469 			GET_VTAG(cntrl, m0);
1470 			cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1471 			if (__predict_false(!(cflags & CSUM_IP)))
1472 				cntrl |= F_TXPKT_IPCSUM_DIS;
1473 			if (__predict_false(!(cflags & (CSUM_TCP | CSUM_UDP))))
1474 				cntrl |= F_TXPKT_L4CSUM_DIS;
1475 
1476 			hflit[0] = htonl(cntrl);
1477 			hflit[1] = htonl(segs[i].ds_len | 0x80000000);
1478 			flit |= htobe64(1 << 24);
1479 			cbe = &cpl_batch->pkt_entry[i];
1480 			cbe->cntrl = hflit[0];
1481 			cbe->len = hflit[1];
1482 			cbe->addr = htobe64(segs[i].ds_addr);
1483 		}
1484 
1485 		wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1486 		    V_WR_SGLSFLT(flits)) |
1487 		    htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1488 		wr_lo = htonl(V_WR_LEN(flits) |
1489 		    V_WR_GEN(txqs.gen)) | htonl(V_WR_TID(txq->token));
1490 		set_wr_hdr(wrp, wr_hi, wr_lo);
1491 		wmb();
1492 		ETHER_BPF_MTAP(pi->ifp, m0);
1493 		wr_gen2(txd, txqs.gen);
1494 		check_ring_tx_db(sc, txq, 0);
1495 		return (0);
1496 	} else if (tso_info) {
1497 		uint16_t eth_type;
1498 		struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *)txd;
1499 		struct ether_header *eh;
1500 		void *l3hdr;
1501 		struct tcphdr *tcp;
1502 
1503 		txd->flit[2] = 0;
1504 		GET_VTAG(cntrl, m0);
1505 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO);
1506 		hdr->cntrl = htonl(cntrl);
1507 		hdr->len = htonl(mlen | 0x80000000);
1508 
1509 		if (__predict_false(mlen < TCPPKTHDRSIZE)) {
1510 			printf("mbuf=%p,len=%d,tso_segsz=%d,csum_flags=%#x,flags=%#x",
1511 			    m0, mlen, m0->m_pkthdr.tso_segsz,
1512 			    m0->m_pkthdr.csum_flags, m0->m_flags);
1513 			panic("tx tso packet too small");
1514 		}
1515 
1516 		/* Make sure that ether, ip, tcp headers are all in m0 */
1517 		if (__predict_false(m0->m_len < TCPPKTHDRSIZE)) {
1518 			m0 = m_pullup(m0, TCPPKTHDRSIZE);
1519 			if (__predict_false(m0 == NULL)) {
1520 				/* XXX panic probably an overreaction */
1521 				panic("couldn't fit header into mbuf");
1522 			}
1523 		}
1524 
1525 		eh = mtod(m0, struct ether_header *);
1526 		eth_type = eh->ether_type;
1527 		if (eth_type == htons(ETHERTYPE_VLAN)) {
1528 			struct ether_vlan_header *evh = (void *)eh;
1529 
1530 			tso_info |= V_LSO_ETH_TYPE(CPL_ETH_II_VLAN);
1531 			l3hdr = evh + 1;
1532 			eth_type = evh->evl_proto;
1533 		} else {
1534 			tso_info |= V_LSO_ETH_TYPE(CPL_ETH_II);
1535 			l3hdr = eh + 1;
1536 		}
1537 
1538 		if (eth_type == htons(ETHERTYPE_IP)) {
1539 			struct ip *ip = l3hdr;
1540 
1541 			tso_info |= V_LSO_IPHDR_WORDS(ip->ip_hl);
1542 			tcp = (struct tcphdr *)(ip + 1);
1543 		} else if (eth_type == htons(ETHERTYPE_IPV6)) {
1544 			struct ip6_hdr *ip6 = l3hdr;
1545 
1546 			KASSERT(ip6->ip6_nxt == IPPROTO_TCP,
1547 			    ("%s: CSUM_TSO with ip6_nxt %d",
1548 			    __func__, ip6->ip6_nxt));
1549 
1550 			tso_info |= F_LSO_IPV6;
1551 			tso_info |= V_LSO_IPHDR_WORDS(sizeof(*ip6) >> 2);
1552 			tcp = (struct tcphdr *)(ip6 + 1);
1553 		} else
1554 			panic("%s: CSUM_TSO but neither ip nor ip6", __func__);
1555 
1556 		tso_info |= V_LSO_TCPHDR_WORDS(tcp->th_off);
1557 		hdr->lso_info = htonl(tso_info);
1558 
1559 		if (__predict_false(mlen <= PIO_LEN)) {
1560 			/*
1561 			 * pkt is not undersized but still fits in PIO_LEN;
1562 			 * this indicates a TSO bug at the higher levels.
1563 			 */
1564 			txsd->m = NULL;
1565 			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[3]);
1566 			flits = (mlen + 7) / 8 + 3;
1567 			wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1568 					  V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1569 					  F_WR_SOP | F_WR_EOP | txqs.compl);
1570 			wr_lo = htonl(V_WR_LEN(flits) |
1571 			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1572 			set_wr_hdr(&hdr->wr, wr_hi, wr_lo);
1573 			wmb();
1574 			ETHER_BPF_MTAP(pi->ifp, m0);
1575 			wr_gen2(txd, txqs.gen);
1576 			check_ring_tx_db(sc, txq, 0);
1577 			m_freem(m0);
1578 			return (0);
1579 		}
1580 		flits = 3;
1581 	} else {
1582 		struct cpl_tx_pkt *cpl = (struct cpl_tx_pkt *)txd;
1583 
1584 		GET_VTAG(cntrl, m0);
1585 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1586 		if (__predict_false(!(m0->m_pkthdr.csum_flags & CSUM_IP)))
1587 			cntrl |= F_TXPKT_IPCSUM_DIS;
1588 		if (__predict_false(!(m0->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP))))
1589 			cntrl |= F_TXPKT_L4CSUM_DIS;
1590 		cpl->cntrl = htonl(cntrl);
1591 		cpl->len = htonl(mlen | 0x80000000);
1592 
1593 		if (mlen <= PIO_LEN) {
1594 			txsd->m = NULL;
1595 			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[2]);
1596 			flits = (mlen + 7) / 8 + 2;
1597 
1598 			wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1599 			    V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1600 					  F_WR_SOP | F_WR_EOP | txqs.compl);
1601 			wr_lo = htonl(V_WR_LEN(flits) |
1602 			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1603 			set_wr_hdr(&cpl->wr, wr_hi, wr_lo);
1604 			wmb();
1605 			ETHER_BPF_MTAP(pi->ifp, m0);
1606 			wr_gen2(txd, txqs.gen);
1607 			check_ring_tx_db(sc, txq, 0);
1608 			m_freem(m0);
1609 			return (0);
1610 		}
1611 		flits = 2;
1612 	}
1613 	wrp = (struct work_request_hdr *)txd;
1614 	sgp = (ndesc == 1) ? (struct sg_ent *)&txd->flit[flits] : sgl;
1615 	make_sgl(sgp, segs, nsegs);
1616 
1617 	sgl_flits = sgl_len(nsegs);
1618 
1619 	ETHER_BPF_MTAP(pi->ifp, m0);
1620 
1621 	KASSERT(ndesc <= 4, ("ndesc too large %d", ndesc));
1622 	wr_hi = htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1623 	wr_lo = htonl(V_WR_TID(txq->token));
1624 	write_wr_hdr_sgl(ndesc, txd, &txqs, txq, sgl, flits,
1625 	    sgl_flits, wr_hi, wr_lo);
1626 	check_ring_tx_db(sc, txq, 0);
1627 
1628 	return (0);
1629 }
1630 
1631 void
1632 cxgb_tx_watchdog(void *arg)
1633 {
1634 	struct sge_qset *qs = arg;
1635 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1636 
1637         if (qs->coalescing != 0 &&
1638 	    (txq->in_use <= cxgb_tx_coalesce_enable_stop) &&
1639 	    TXQ_RING_EMPTY(qs))
1640                 qs->coalescing = 0;
1641         else if (qs->coalescing == 0 &&
1642 	    (txq->in_use >= cxgb_tx_coalesce_enable_start))
1643                 qs->coalescing = 1;
1644 	if (TXQ_TRYLOCK(qs)) {
1645 		qs->qs_flags |= QS_FLUSHING;
1646 		cxgb_start_locked(qs);
1647 		qs->qs_flags &= ~QS_FLUSHING;
1648 		TXQ_UNLOCK(qs);
1649 	}
1650 	if (qs->port->ifp->if_drv_flags & IFF_DRV_RUNNING)
1651 		callout_reset_on(&txq->txq_watchdog, hz/4, cxgb_tx_watchdog,
1652 		    qs, txq->txq_watchdog.c_cpu);
1653 }
1654 
1655 static void
1656 cxgb_tx_timeout(void *arg)
1657 {
1658 	struct sge_qset *qs = arg;
1659 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1660 
1661 	if (qs->coalescing == 0 && (txq->in_use >= (txq->size>>3)))
1662                 qs->coalescing = 1;
1663 	if (TXQ_TRYLOCK(qs)) {
1664 		qs->qs_flags |= QS_TIMEOUT;
1665 		cxgb_start_locked(qs);
1666 		qs->qs_flags &= ~QS_TIMEOUT;
1667 		TXQ_UNLOCK(qs);
1668 	}
1669 }
1670 
1671 static void
1672 cxgb_start_locked(struct sge_qset *qs)
1673 {
1674 	struct mbuf *m_head = NULL;
1675 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1676 	struct port_info *pi = qs->port;
1677 	struct ifnet *ifp = pi->ifp;
1678 
1679 	if (qs->qs_flags & (QS_FLUSHING|QS_TIMEOUT))
1680 		reclaim_completed_tx(qs, 0, TXQ_ETH);
1681 
1682 	if (!pi->link_config.link_ok) {
1683 		TXQ_RING_FLUSH(qs);
1684 		return;
1685 	}
1686 	TXQ_LOCK_ASSERT(qs);
1687 	while (!TXQ_RING_EMPTY(qs) && (ifp->if_drv_flags & IFF_DRV_RUNNING) &&
1688 	    pi->link_config.link_ok) {
1689 		reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH);
1690 
1691 		if (txq->size - txq->in_use <= TX_MAX_DESC)
1692 			break;
1693 
1694 		if ((m_head = cxgb_dequeue(qs)) == NULL)
1695 			break;
1696 		/*
1697 		 *  Encapsulation can modify our pointer, and/or make it
1698 		 *  NULL on failure.  In that event, we can't requeue.
1699 		 */
1700 		if (t3_encap(qs, &m_head) || m_head == NULL)
1701 			break;
1702 
1703 		m_head = NULL;
1704 	}
1705 
1706 	if (txq->db_pending)
1707 		check_ring_tx_db(pi->adapter, txq, 1);
1708 
1709 	if (!TXQ_RING_EMPTY(qs) && callout_pending(&txq->txq_timer) == 0 &&
1710 	    pi->link_config.link_ok)
1711 		callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout,
1712 		    qs, txq->txq_timer.c_cpu);
1713 	if (m_head != NULL)
1714 		m_freem(m_head);
1715 }
1716 
1717 static int
1718 cxgb_transmit_locked(struct ifnet *ifp, struct sge_qset *qs, struct mbuf *m)
1719 {
1720 	struct port_info *pi = qs->port;
1721 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1722 	struct buf_ring *br = txq->txq_mr;
1723 	int error, avail;
1724 
1725 	avail = txq->size - txq->in_use;
1726 	TXQ_LOCK_ASSERT(qs);
1727 
1728 	/*
1729 	 * We can only do a direct transmit if the following are true:
1730 	 * - we aren't coalescing (ring < 3/4 full)
1731 	 * - the link is up -- checked in caller
1732 	 * - there are no packets enqueued already
1733 	 * - there is space in hardware transmit queue
1734 	 */
1735 	if (check_pkt_coalesce(qs) == 0 &&
1736 	    !TXQ_RING_NEEDS_ENQUEUE(qs) && avail > TX_MAX_DESC) {
1737 		if (t3_encap(qs, &m)) {
1738 			if (m != NULL &&
1739 			    (error = drbr_enqueue(ifp, br, m)) != 0)
1740 				return (error);
1741 		} else {
1742 			if (txq->db_pending)
1743 				check_ring_tx_db(pi->adapter, txq, 1);
1744 
1745 			/*
1746 			 * We've bypassed the buf ring so we need to update
1747 			 * the stats directly
1748 			 */
1749 			txq->txq_direct_packets++;
1750 			txq->txq_direct_bytes += m->m_pkthdr.len;
1751 		}
1752 	} else if ((error = drbr_enqueue(ifp, br, m)) != 0)
1753 		return (error);
1754 
1755 	reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH);
1756 	if (!TXQ_RING_EMPTY(qs) && pi->link_config.link_ok &&
1757 	    (!check_pkt_coalesce(qs) || (drbr_inuse(ifp, br) >= 7)))
1758 		cxgb_start_locked(qs);
1759 	else if (!TXQ_RING_EMPTY(qs) && !callout_pending(&txq->txq_timer))
1760 		callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout,
1761 		    qs, txq->txq_timer.c_cpu);
1762 	return (0);
1763 }
1764 
1765 int
1766 cxgb_transmit(struct ifnet *ifp, struct mbuf *m)
1767 {
1768 	struct sge_qset *qs;
1769 	struct port_info *pi = ifp->if_softc;
1770 	int error, qidx = pi->first_qset;
1771 
1772 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0
1773 	    ||(!pi->link_config.link_ok)) {
1774 		m_freem(m);
1775 		return (0);
1776 	}
1777 
1778 	if (m->m_flags & M_FLOWID)
1779 		qidx = (m->m_pkthdr.flowid % pi->nqsets) + pi->first_qset;
1780 
1781 	qs = &pi->adapter->sge.qs[qidx];
1782 
1783 	if (TXQ_TRYLOCK(qs)) {
1784 		/* XXX running */
1785 		error = cxgb_transmit_locked(ifp, qs, m);
1786 		TXQ_UNLOCK(qs);
1787 	} else
1788 		error = drbr_enqueue(ifp, qs->txq[TXQ_ETH].txq_mr, m);
1789 	return (error);
1790 }
1791 
1792 void
1793 cxgb_qflush(struct ifnet *ifp)
1794 {
1795 	/*
1796 	 * flush any enqueued mbufs in the buf_rings
1797 	 * and in the transmit queues;
1798 	 * a no-op for now
1799 	 */
1800 	return;
1801 }
1802 
1803 /**
1804  *	write_imm - write a packet into a Tx descriptor as immediate data
1805  *	@d: the Tx descriptor to write
1806  *	@m: the packet
1807  *	@len: the length of packet data to write as immediate data
1808  *	@gen: the generation bit value to write
1809  *
1810  *	Writes a packet as immediate data into a Tx descriptor.  The packet
1811  *	contains a work request at its beginning.  We must write the packet
1812  *	carefully so the SGE doesn't accidentally read it before it has been
1813  *	written in its entirety.
1814  */
1815 static __inline void
1816 write_imm(struct tx_desc *d, struct mbuf *m,
1817 	  unsigned int len, unsigned int gen)
1818 {
1819 	struct work_request_hdr *from = mtod(m, struct work_request_hdr *);
1820 	struct work_request_hdr *to = (struct work_request_hdr *)d;
1821 	uint32_t wr_hi, wr_lo;
1822 
1823 	if (len > WR_LEN)
1824 		panic("len too big %d\n", len);
1825 	if (len < sizeof(*from))
1826 		panic("len too small %d", len);
1827 
1828 	memcpy(&to[1], &from[1], len - sizeof(*from));
1829 	wr_hi = from->wrh_hi | htonl(F_WR_SOP | F_WR_EOP |
1830 					V_WR_BCNTLFLT(len & 7));
1831 	wr_lo = from->wrh_lo | htonl(V_WR_GEN(gen) |
1832 					V_WR_LEN((len + 7) / 8));
1833 	set_wr_hdr(to, wr_hi, wr_lo);
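	/*
	 * Note: the WR header above is written first and the generation bit
	 * is updated only after the write barrier below, so the SGE never
	 * sees a partially written descriptor.
	 */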
1834 	wmb();
1835 	wr_gen2(d, gen);
1836 
1837 	/*
1838 	 * This check is a hack; we should really fix the logic so
1839 	 * that this can't happen.
1840 	 */
1841 	if (m->m_type != MT_DONTFREE)
1842 		m_freem(m);
1843 
1844 }
1845 
1846 /**
1847  *	check_desc_avail - check descriptor availability on a send queue
1848  *	@adap: the adapter
1849  *	@q: the TX queue
1850  *	@m: the packet needing the descriptors
1851  *	@ndesc: the number of Tx descriptors needed
1852  *	@qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL)
1853  *
1854  *	Checks if the requested number of Tx descriptors is available on an
1855  *	SGE send queue.  If the queue is already suspended or not enough
1856  *	descriptors are available the packet is queued for later transmission.
1857  *	Must be called with the Tx queue locked.
1858  *
1859  *	Returns 0 if enough descriptors are available, 1 if there aren't
1860  *	enough descriptors and the packet has been queued, and 2 if the caller
1861  *	needs to retry because there weren't enough descriptors at the
1862  *	beginning of the call but some freed up in the mean time.
1863  */
1864 static __inline int
1865 check_desc_avail(adapter_t *adap, struct sge_txq *q,
1866 		 struct mbuf *m, unsigned int ndesc,
1867 		 unsigned int qid)
1868 {
1869 	/*
1870 	 * XXX We currently only use this for checking the control queue;
1871 	 * the control queue is only used for binding qsets, which happens
1872 	 * at init time, so we are guaranteed enough descriptors.
1873 	 */
1874 	if (__predict_false(!mbufq_empty(&q->sendq))) {
1875 addq_exit:	mbufq_tail(&q->sendq, m);
1876 		return 1;
1877 	}
1878 	if (__predict_false(q->size - q->in_use < ndesc)) {
1879 
1880 		struct sge_qset *qs = txq_to_qset(q, qid);
1881 
1882 		setbit(&qs->txq_stopped, qid);
1883 		if (should_restart_tx(q) &&
1884 		    test_and_clear_bit(qid, &qs->txq_stopped))
1885 			return 2;
1886 
1887 		q->stops++;
1888 		goto addq_exit;
1889 	}
1890 	return 0;
1891 }
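
/*
 * Illustrative caller pattern for check_desc_avail() (a sketch only, not
 * compiled; it mirrors ctrl_xmit()/ofld_xmit() below).  The names qs, q,
 * adap, m, ndesc and qid stand in for the caller's own variables.
 */
#if 0
	TXQ_LOCK(qs);
again:	reclaim_completed_tx_imm(q);
	ret = check_desc_avail(adap, q, m, ndesc, qid);
	if (ret == 2)			/* descriptors freed up, retry */
		goto again;
	if (ret == 1) {			/* m was queued on q->sendq */
		TXQ_UNLOCK(qs);
		return (ENOSPC);
	}
	/* ret == 0: ndesc descriptors are available, write the request. */
#endif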
1892 
1893 
1894 /**
1895  *	reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
1896  *	@q: the SGE control Tx queue
1897  *
1898  *	This is a variant of reclaim_completed_tx() that is used for Tx queues
1899  *	that send only immediate data (presently just the control queues) and
1900  *	thus do not have any mbufs.
1901  */
1902 static __inline void
1903 reclaim_completed_tx_imm(struct sge_txq *q)
1904 {
1905 	unsigned int reclaim = q->processed - q->cleaned;
1906 
1907 	q->in_use -= reclaim;
1908 	q->cleaned += reclaim;
1909 }
1910 
1911 static __inline int
1912 immediate(const struct mbuf *m)
1913 {
1914 	return (m->m_len <= WR_LEN && m->m_pkthdr.len <= WR_LEN);
1915 }
1916 
1917 /**
1918  *	ctrl_xmit - send a packet through an SGE control Tx queue
1919  *	@adap: the adapter
1920  *	@q: the control queue
1921  *	@m: the packet
1922  *
1923  *	Send a packet through an SGE control Tx queue.  Packets sent through
1924  *	a control queue must fit entirely as immediate data in a single Tx
1925  *	descriptor and have no page fragments.
1926  */
1927 static int
1928 ctrl_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m)
1929 {
1930 	int ret;
1931 	struct work_request_hdr *wrp = mtod(m, struct work_request_hdr *);
1932 	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1933 
1934 	if (__predict_false(!immediate(m))) {
1935 		m_freem(m);
1936 		return 0;
1937 	}
1938 
1939 	wrp->wrh_hi |= htonl(F_WR_SOP | F_WR_EOP);
1940 	wrp->wrh_lo = htonl(V_WR_TID(q->token));
1941 
1942 	TXQ_LOCK(qs);
1943 again:	reclaim_completed_tx_imm(q);
1944 
1945 	ret = check_desc_avail(adap, q, m, 1, TXQ_CTRL);
1946 	if (__predict_false(ret)) {
1947 		if (ret == 1) {
1948 			TXQ_UNLOCK(qs);
1949 			return (ENOSPC);
1950 		}
1951 		goto again;
1952 	}
1953 	write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
1954 
1955 	q->in_use++;
1956 	if (++q->pidx >= q->size) {
1957 		q->pidx = 0;
1958 		q->gen ^= 1;
1959 	}
1960 	TXQ_UNLOCK(qs);
1961 	wmb();
1962 	t3_write_reg(adap, A_SG_KDOORBELL,
1963 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1964 	return (0);
1965 }
1966 
1967 
1968 /**
1969  *	restart_ctrlq - restart a suspended control queue
1970  *	@qs: the queue set containing the control queue
1971  *
1972  *	Resumes transmission on a suspended Tx control queue.
1973  */
1974 static void
1975 restart_ctrlq(void *data, int npending)
1976 {
1977 	struct mbuf *m;
1978 	struct sge_qset *qs = (struct sge_qset *)data;
1979 	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1980 	adapter_t *adap = qs->port->adapter;
1981 
1982 	TXQ_LOCK(qs);
1983 again:	reclaim_completed_tx_imm(q);
1984 
1985 	while (q->in_use < q->size &&
1986 	       (m = mbufq_dequeue(&q->sendq)) != NULL) {
1987 
1988 		write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
1989 
1990 		if (++q->pidx >= q->size) {
1991 			q->pidx = 0;
1992 			q->gen ^= 1;
1993 		}
1994 		q->in_use++;
1995 	}
1996 	if (!mbufq_empty(&q->sendq)) {
1997 		setbit(&qs->txq_stopped, TXQ_CTRL);
1998 
1999 		if (should_restart_tx(q) &&
2000 		    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped))
2001 			goto again;
2002 		q->stops++;
2003 	}
2004 	TXQ_UNLOCK(qs);
2005 	t3_write_reg(adap, A_SG_KDOORBELL,
2006 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
2007 }
2008 
2009 
2010 /*
2011  * Send a management message through control queue 0
2012  */
2013 int
2014 t3_mgmt_tx(struct adapter *adap, struct mbuf *m)
2015 {
2016 	return ctrl_xmit(adap, &adap->sge.qs[0], m);
2017 }
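
/*
 * A minimal usage sketch for t3_mgmt_tx() (not compiled): the message is a
 * single mbuf whose data starts with a work_request_hdr and fits within
 * WR_LEN, so ctrl_xmit() can copy it into one descriptor as immediate data.
 * The opcode/flags in wrh_hi are whatever the caller's work request needs;
 * ctrl_xmit() ORs in F_WR_SOP | F_WR_EOP and fills in the queue token.
 */
#if 0
	struct mbuf *m;
	struct work_request_hdr *wrp;

	if ((m = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL)
		return (ENOMEM);
	wrp = mtod(m, struct work_request_hdr *);
	wrp->wrh_hi = htonl(/* caller-specific opcode/flags */ 0);
	wrp->wrh_lo = 0;
	m->m_len = m->m_pkthdr.len = sizeof(*wrp);	/* plus payload, <= WR_LEN */
	return (t3_mgmt_tx(adap, m));	/* ENOSPC if control queue 0 is full */
#endif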
2018 
2019 /**
2020  *	free_qset - free the resources of an SGE queue set
2021  *	@sc: the controller owning the queue set
2022  *	@q: the queue set
2023  *
2024  *	Release the HW and SW resources associated with an SGE queue set, such
2025  *	as HW contexts, packet buffers, and descriptor rings.  Traffic to the
2026  *	queue set must be quiesced prior to calling this.  Called with the queue set lock held; the lock is released and destroyed here.
2027  */
2028 static void
2029 t3_free_qset(adapter_t *sc, struct sge_qset *q)
2030 {
2031 	int i;
2032 
2033 	reclaim_completed_tx(q, 0, TXQ_ETH);
2034 	if (q->txq[TXQ_ETH].txq_mr != NULL)
2035 		buf_ring_free(q->txq[TXQ_ETH].txq_mr, M_DEVBUF);
2036 	if (q->txq[TXQ_ETH].txq_ifq != NULL) {
2037 		ifq_delete(q->txq[TXQ_ETH].txq_ifq);
2038 		free(q->txq[TXQ_ETH].txq_ifq, M_DEVBUF);
2039 	}
2040 
2041 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
2042 		if (q->fl[i].desc) {
2043 			mtx_lock_spin(&sc->sge.reg_lock);
2044 			t3_sge_disable_fl(sc, q->fl[i].cntxt_id);
2045 			mtx_unlock_spin(&sc->sge.reg_lock);
2046 			bus_dmamap_unload(q->fl[i].desc_tag, q->fl[i].desc_map);
2047 			bus_dmamem_free(q->fl[i].desc_tag, q->fl[i].desc,
2048 					q->fl[i].desc_map);
2049 			bus_dma_tag_destroy(q->fl[i].desc_tag);
2050 			bus_dma_tag_destroy(q->fl[i].entry_tag);
2051 		}
2052 		if (q->fl[i].sdesc) {
2053 			free_rx_bufs(sc, &q->fl[i]);
2054 			free(q->fl[i].sdesc, M_DEVBUF);
2055 		}
2056 	}
2057 
2058 	mtx_unlock(&q->lock);
2059 	MTX_DESTROY(&q->lock);
2060 	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
2061 		if (q->txq[i].desc) {
2062 			mtx_lock_spin(&sc->sge.reg_lock);
2063 			t3_sge_enable_ecntxt(sc, q->txq[i].cntxt_id, 0);
2064 			mtx_unlock_spin(&sc->sge.reg_lock);
2065 			bus_dmamap_unload(q->txq[i].desc_tag,
2066 					q->txq[i].desc_map);
2067 			bus_dmamem_free(q->txq[i].desc_tag, q->txq[i].desc,
2068 					q->txq[i].desc_map);
2069 			bus_dma_tag_destroy(q->txq[i].desc_tag);
2070 			bus_dma_tag_destroy(q->txq[i].entry_tag);
2071 		}
2072 		if (q->txq[i].sdesc) {
2073 			free(q->txq[i].sdesc, M_DEVBUF);
2074 		}
2075 	}
2076 
2077 	if (q->rspq.desc) {
2078 		mtx_lock_spin(&sc->sge.reg_lock);
2079 		t3_sge_disable_rspcntxt(sc, q->rspq.cntxt_id);
2080 		mtx_unlock_spin(&sc->sge.reg_lock);
2081 
2082 		bus_dmamap_unload(q->rspq.desc_tag, q->rspq.desc_map);
2083 		bus_dmamem_free(q->rspq.desc_tag, q->rspq.desc,
2084 			        q->rspq.desc_map);
2085 		bus_dma_tag_destroy(q->rspq.desc_tag);
2086 		MTX_DESTROY(&q->rspq.lock);
2087 	}
2088 
2089 #if defined(INET6) || defined(INET)
2090 	tcp_lro_free(&q->lro.ctrl);
2091 #endif
2092 
2093 	bzero(q, sizeof(*q));
2094 }
2095 
2096 /**
2097  *	t3_free_sge_resources - free SGE resources
2098  *	@sc: the adapter softc
2099  *
2100  *	Frees resources used by the SGE queue sets.
2101  */
2102 void
2103 t3_free_sge_resources(adapter_t *sc, int nqsets)
2104 {
2105 	int i;
2106 
2107 	for (i = 0; i < nqsets; ++i) {
2108 		TXQ_LOCK(&sc->sge.qs[i]);
2109 		t3_free_qset(sc, &sc->sge.qs[i]);
2110 	}
2111 }
2112 
2113 /**
2114  *	t3_sge_start - enable SGE
2115  *	@sc: the controller softc
2116  *
2117  *	Enables the SGE for DMAs.  This is the last step in starting packet
2118  *	transfers.
2119  */
2120 void
2121 t3_sge_start(adapter_t *sc)
2122 {
2123 	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE);
2124 }
2125 
2126 /**
2127  *	t3_sge_stop - disable SGE operation
2128  *	@sc: the adapter
2129  *
2130  *	Disables the DMA engine.  This can be called in emergencies (e.g.,
2131  *	from error interrupts) or from normal process context.  In the latter
2132  *	case it also disables any pending queue restart tasklets.  Note that
2133  *	if it is called in interrupt context it cannot disable the restart
2134  *	tasklets, as it cannot wait; however, the tasklets will have no effect
2135  *	since the doorbells are disabled and the driver will call this again
2136  *	later from process context, at which time the tasklets will be stopped
2137  *	if they are still running.
2138  */
2139 void
2140 t3_sge_stop(adapter_t *sc)
2141 {
2142 	int i, nqsets;
2143 
2144 	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, 0);
2145 
2146 	if (sc->tq == NULL)
2147 		return;
2148 
2149 	for (nqsets = i = 0; i < (sc)->params.nports; i++)
2150 		nqsets += sc->port[i].nqsets;
2151 #ifdef notyet
2152 	/*
2153 	 *
2154 	 * XXX
2155 	 */
2156 	for (i = 0; i < nqsets; ++i) {
2157 		struct sge_qset *qs = &sc->sge.qs[i];
2158 
2159 		taskqueue_drain(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2160 		taskqueue_drain(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2161 	}
2162 #endif
2163 }
2164 
2165 /**
2166  *	t3_free_tx_desc - reclaims Tx descriptors and their buffers
2167  *	@qs: the queue set owning the Tx queue
2168  *	@reclaimable: the number of descriptors to reclaim
2169  *	@queue: the index of the Tx queue within the queue set (e.g. TXQ_ETH)
2170  *
2171  *	Reclaims Tx descriptors from an SGE Tx queue and frees the associated
2172  *	Tx buffers.  Called with the Tx queue lock held.
2177  */
2178 void
2179 t3_free_tx_desc(struct sge_qset *qs, int reclaimable, int queue)
2180 {
2181 	struct tx_sw_desc *txsd;
2182 	unsigned int cidx, mask;
2183 	struct sge_txq *q = &qs->txq[queue];
2184 
2185 #ifdef T3_TRACE
2186 	T3_TRACE2(sc->tb[q->cntxt_id & 7],
2187 		  "reclaiming %u Tx descriptors at cidx %u", reclaimable, cidx);
2188 #endif
2189 	cidx = q->cidx;
2190 	mask = q->size - 1;
2191 	txsd = &q->sdesc[cidx];
2192 
2193 	mtx_assert(&qs->lock, MA_OWNED);
2194 	while (reclaimable--) {
2195 		prefetch(q->sdesc[(cidx + 1) & mask].m);
2196 		prefetch(q->sdesc[(cidx + 2) & mask].m);
2197 
2198 		if (txsd->m != NULL) {
2199 			if (txsd->flags & TX_SW_DESC_MAPPED) {
2200 				bus_dmamap_unload(q->entry_tag, txsd->map);
2201 				txsd->flags &= ~TX_SW_DESC_MAPPED;
2202 			}
2203 			m_freem_list(txsd->m);
2204 			txsd->m = NULL;
2205 		} else
2206 			q->txq_skipped++;
2207 
2208 		++txsd;
2209 		if (++cidx == q->size) {
2210 			cidx = 0;
2211 			txsd = q->sdesc;
2212 		}
2213 	}
2214 	q->cidx = cidx;
2215 
2216 }
2217 
2218 /**
2219  *	is_new_response - check if a response is newly written
2220  *	@r: the response descriptor
2221  *	@q: the response queue
2222  *
2223  *	Returns true if a response descriptor contains a yet unprocessed
2224  *	response.
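 *
 *	The driver toggles q->gen each time the consumer index wraps (see
 *	process_responses()), so a descriptor whose F_RSPD_GEN2 bit matches
 *	q->gen was written by the hardware after the last wrap and has not
 *	yet been processed.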
2225  */
2226 static __inline int
2227 is_new_response(const struct rsp_desc *r,
2228     const struct sge_rspq *q)
2229 {
2230 	return (r->intr_gen & F_RSPD_GEN2) == q->gen;
2231 }
2232 
2233 #define RSPD_GTS_MASK  (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS)
2234 #define RSPD_CTRL_MASK (RSPD_GTS_MASK | \
2235 			V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \
2236 			V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \
2237 			V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR))
2238 
2239 /* How long to delay the next interrupt in case of memory shortage, in units of 0.1 us (2500 -> 250 us). */
2240 #define NOMEM_INTR_DELAY 2500
2241 
2242 /**
2243  *	write_ofld_wr - write an offload work request
2244  *	@adap: the adapter
2245  *	@m: the packet to send
2246  *	@q: the Tx queue
2247  *	@pidx: index of the first Tx descriptor to write
2248  *	@gen: the generation value to use
2249  *	@ndesc: number of descriptors the packet will occupy
2250  *
2251  *	Write an offload work request to send the supplied packet.  The packet
2252  *	data already carry the work request with most fields populated.
2253  */
2254 static void
2255 write_ofld_wr(adapter_t *adap, struct mbuf *m,
2256     struct sge_txq *q, unsigned int pidx,
2257     unsigned int gen, unsigned int ndesc,
2258     bus_dma_segment_t *segs, unsigned int nsegs)
2259 {
2260 	unsigned int sgl_flits, flits;
2261 	struct work_request_hdr *from;
2262 	struct sg_ent *sgp, sgl[TX_MAX_SEGS / 2 + 1];
2263 	struct tx_desc *d = &q->desc[pidx];
2264 	struct txq_state txqs;
2265 
2266 	if (immediate(m) && nsegs == 0) {
2267 		write_imm(d, m, m->m_len, gen);
2268 		return;
2269 	}
2270 
2271 	/* Only TX_DATA builds SGLs */
2272 	from = mtod(m, struct work_request_hdr *);
2273 	memcpy(&d->flit[1], &from[1], m->m_len - sizeof(*from));
2274 
2275 	flits = m->m_len / 8;
2276 	sgp = (ndesc == 1) ? (struct sg_ent *)&d->flit[flits] : sgl;
2277 
2278 	make_sgl(sgp, segs, nsegs);
2279 	sgl_flits = sgl_len(nsegs);
2280 
2281 	txqs.gen = gen;
2282 	txqs.pidx = pidx;
2283 	txqs.compl = 0;
2284 
2285 	write_wr_hdr_sgl(ndesc, d, &txqs, q, sgl, flits, sgl_flits,
2286 	    from->wrh_hi, from->wrh_lo);
2287 }
2288 
2289 /**
2290  *	calc_tx_descs_ofld - calculate # of Tx descriptors for an offload packet
2291  *	@m: the packet
2292  *
2293  * 	Returns the number of Tx descriptors needed for the given offload
2294  * 	packet.  These packets are already fully constructed.
2295  */
2296 static __inline unsigned int
2297 calc_tx_descs_ofld(struct mbuf *m, unsigned int nsegs)
2298 {
2299 	unsigned int flits, cnt = 0;
2300 	int ndescs;
2301 
2302 	if (m->m_len <= WR_LEN && nsegs == 0)
2303 		return (1);                 /* packet fits as immediate data */
2304 
2305 	/*
2306 	 * This needs to be revisited for TOE.
2307 	 */
2308 
2309 	cnt = nsegs;
2310 
2311 	/* headers */
2312 	flits = m->m_len / 8;
2313 
2314 	ndescs = flits_to_desc(flits + sgl_len(cnt));
2315 
2316 	return (ndescs);
2317 }
2318 
2319 /**
2320  *	ofld_xmit - send a packet through an offload queue
2321  *	@adap: the adapter
2322  *	@q: the Tx offload queue
2323  *	@m: the packet
2324  *
2325  *	Send an offload packet through an SGE offload queue.
2326  */
2327 static int
2328 ofld_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m)
2329 {
2330 	int ret, nsegs;
2331 	unsigned int ndesc;
2332 	unsigned int pidx, gen;
2333 	struct sge_txq *q = &qs->txq[TXQ_OFLD];
2334 	bus_dma_segment_t segs[TX_MAX_SEGS], *vsegs;
2335 	struct tx_sw_desc *stx;
2336 
2337 	nsegs = m_get_sgllen(m);
2338 	vsegs = m_get_sgl(m);
2339 	ndesc = calc_tx_descs_ofld(m, nsegs);
2340 	busdma_map_sgl(vsegs, segs, nsegs);
2341 
2342 	stx = &q->sdesc[q->pidx];
2343 
2344 	TXQ_LOCK(qs);
2345 again:	reclaim_completed_tx(qs, 16, TXQ_OFLD);
2346 	ret = check_desc_avail(adap, q, m, ndesc, TXQ_OFLD);
2347 	if (__predict_false(ret)) {
2348 		if (ret == 1) {
2349 			printf("no ofld desc avail\n");
2350 
2351 			m_set_priority(m, ndesc);     /* save for restart */
2352 			TXQ_UNLOCK(qs);
2353 			return (EINTR);
2354 		}
2355 		goto again;
2356 	}
2357 
2358 	gen = q->gen;
2359 	q->in_use += ndesc;
2360 	pidx = q->pidx;
2361 	q->pidx += ndesc;
2362 	if (q->pidx >= q->size) {
2363 		q->pidx -= q->size;
2364 		q->gen ^= 1;
2365 	}
2366 #ifdef T3_TRACE
2367 	T3_TRACE5(adap->tb[q->cntxt_id & 7],
2368 		  "ofld_xmit: ndesc %u, pidx %u, len %u, main %u, frags %u",
2369 		  ndesc, pidx, skb->len, skb->len - skb->data_len,
2370 		  skb_shinfo(skb)->nr_frags);
2371 #endif
2372 	TXQ_UNLOCK(qs);
2373 
2374 	write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
2375 	check_ring_tx_db(adap, q, 1);
2376 	return (0);
2377 }
2378 
2379 /**
2380  *	restart_offloadq - restart a suspended offload queue
2381  *	@qs: the queue set containing the offload queue
2382  *
2383  *	Resumes transmission on a suspended Tx offload queue.
2384  */
2385 static void
2386 restart_offloadq(void *data, int npending)
2387 {
2388 	struct mbuf *m;
2389 	struct sge_qset *qs = data;
2390 	struct sge_txq *q = &qs->txq[TXQ_OFLD];
2391 	adapter_t *adap = qs->port->adapter;
2392 	bus_dma_segment_t segs[TX_MAX_SEGS];
2393 	struct tx_sw_desc *stx = &q->sdesc[q->pidx];
2394 	int nsegs, cleaned;
2395 
2396 	TXQ_LOCK(qs);
2397 again:	cleaned = reclaim_completed_tx(qs, 16, TXQ_OFLD);
2398 
2399 	while ((m = mbufq_peek(&q->sendq)) != NULL) {
2400 		unsigned int gen, pidx;
2401 		unsigned int ndesc = m_get_priority(m);
2402 
2403 		if (__predict_false(q->size - q->in_use < ndesc)) {
2404 			setbit(&qs->txq_stopped, TXQ_OFLD);
2405 			if (should_restart_tx(q) &&
2406 			    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped))
2407 				goto again;
2408 			q->stops++;
2409 			break;
2410 		}
2411 
2412 		gen = q->gen;
2413 		q->in_use += ndesc;
2414 		pidx = q->pidx;
2415 		q->pidx += ndesc;
2416 		if (q->pidx >= q->size) {
2417 			q->pidx -= q->size;
2418 			q->gen ^= 1;
2419 		}
2420 
2421 		(void)mbufq_dequeue(&q->sendq);
2422 		busdma_map_mbufs(&m, q, stx, segs, &nsegs);
2423 		TXQ_UNLOCK(qs);
2424 		write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
2425 		TXQ_LOCK(qs);
2426 	}
2427 #if USE_GTS
2428 	set_bit(TXQ_RUNNING, &q->flags);
2429 	set_bit(TXQ_LAST_PKT_DB, &q->flags);
2430 #endif
2431 	TXQ_UNLOCK(qs);
2432 	wmb();
2433 	t3_write_reg(adap, A_SG_KDOORBELL,
2434 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
2435 }
2436 
2437 /**
2438  *	queue_set - return the queue set a packet should use
2439  *	@m: the packet
2440  *
2441  *	Maps a packet to the SGE queue set it should use.  The desired queue
2442  *	set is carried in bits 1-3 in the packet's priority.
2443  */
2444 static __inline int
2445 queue_set(const struct mbuf *m)
2446 {
2447 	return m_get_priority(m) >> 1;
2448 }
2449 
2450 /**
2451  *	is_ctrl_pkt - return whether an offload packet is a control packet
2452  *	@m: the packet
2453  *
2454  *	Determines whether an offload packet should use an OFLD or a CTRL
2455  *	Tx queue.  This is indicated by bit 0 in the packet's priority.
2456  */
2457 static __inline int
2458 is_ctrl_pkt(const struct mbuf *m)
2459 {
2460 	return m_get_priority(m) & 1;
2461 }
2462 
2463 /**
2464  *	t3_offload_tx - send an offload packet
2465  *	@tdev: the offload device to send to
2466  *	@m: the packet
2467  *
2468  *	Sends an offload packet.  We use the packet priority to select the
2469  *	appropriate Tx queue as follows: bit 0 indicates whether the packet
2470  *	should be sent as regular or control, bits 1-3 select the queue set.
2471  */
2472 int
2473 t3_offload_tx(struct t3cdev *tdev, struct mbuf *m)
2474 {
2475 	adapter_t *adap = tdev2adap(tdev);
2476 	struct sge_qset *qs = &adap->sge.qs[queue_set(m)];
2477 
2478 	if (__predict_false(is_ctrl_pkt(m)))
2479 		return ctrl_xmit(adap, qs, m);
2480 
2481 	return ofld_xmit(adap, qs, m);
2482 }
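
/*
 * A sketch of the priority encoding t3_offload_tx() consumes (illustrative
 * only, not compiled): bit 0 selects the CTRL vs. OFLD queue and bits 1-3
 * select the queue set.  qset_idx and is_ctrl are hypothetical caller
 * variables.
 */
#if 0
	m_set_priority(m, (qset_idx << 1) | (is_ctrl ? 1 : 0));
	error = t3_offload_tx(tdev, m);
#endif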
2483 
2484 /**
2485  *	deliver_partial_bundle - deliver a (partial) bundle of Rx offload pkts
2486  *	@tdev: the offload device that will be receiving the packets
2487  *	@q: the SGE response queue that assembled the bundle
2488  *	@m: the partial bundle
2489  *	@n: the number of packets in the bundle
2490  *
2491  *	Delivers a (partial) bundle of Rx offload packets to an offload device.
2492  */
2493 static __inline void
2494 deliver_partial_bundle(struct t3cdev *tdev,
2495 			struct sge_rspq *q,
2496 			struct mbuf *mbufs[], int n)
2497 {
2498 	if (n) {
2499 		q->offload_bundles++;
2500 		cxgb_ofld_recv(tdev, mbufs, n);
2501 	}
2502 }
2503 
2504 static __inline int
2505 rx_offload(struct t3cdev *tdev, struct sge_rspq *rq,
2506     struct mbuf *m, struct mbuf *rx_gather[],
2507     unsigned int gather_idx)
2508 {
2509 
2510 	rq->offload_pkts++;
2511 	m->m_pkthdr.header = mtod(m, void *);
2512 	rx_gather[gather_idx++] = m;
2513 	if (gather_idx == RX_BUNDLE_SIZE) {
2514 		cxgb_ofld_recv(tdev, rx_gather, RX_BUNDLE_SIZE);
2515 		gather_idx = 0;
2516 		rq->offload_bundles++;
2517 	}
2518 	return (gather_idx);
2519 }
2520 
2521 static void
2522 restart_tx(struct sge_qset *qs)
2523 {
2524 	struct adapter *sc = qs->port->adapter;
2525 
2526 
2527 	if (isset(&qs->txq_stopped, TXQ_OFLD) &&
2528 	    should_restart_tx(&qs->txq[TXQ_OFLD]) &&
2529 	    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) {
2530 		qs->txq[TXQ_OFLD].restarts++;
2531 		DPRINTF("restarting TXQ_OFLD\n");
2532 		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2533 	}
2534 	DPRINTF("stopped=0x%x restart=%d processed=%d cleaned=%d in_use=%d\n",
2535 	    qs->txq_stopped, should_restart_tx(&qs->txq[TXQ_CTRL]),
2536 	    qs->txq[TXQ_CTRL].processed, qs->txq[TXQ_CTRL].cleaned,
2537 	    qs->txq[TXQ_CTRL].in_use);
2538 
2539 	if (isset(&qs->txq_stopped, TXQ_CTRL) &&
2540 	    should_restart_tx(&qs->txq[TXQ_CTRL]) &&
2541 	    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) {
2542 		qs->txq[TXQ_CTRL].restarts++;
2543 		DPRINTF("restarting TXQ_CTRL\n");
2544 		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2545 	}
2546 }
2547 
2548 /**
2549  *	t3_sge_alloc_qset - initialize an SGE queue set
2550  *	@sc: the controller softc
2551  *	@id: the queue set id
2552  *	@nports: how many Ethernet ports will be using this queue set
2553  *	@irq_vec_idx: the IRQ vector index for response queue interrupts
2554  *	@p: configuration parameters for this queue set
2555  *	@ntxq: number of Tx queues for the queue set
2556  *	@pi: port info for queue set
2557  *
2558  *	Allocate resources and initialize an SGE queue set.  A queue set
2559  *	comprises a response queue, two Rx free-buffer queues, and up to 3
2560  *	Tx queues.  The Tx queues are assigned roles in the order Ethernet
2561  *	queue, offload queue, and control queue.
2562  */
2563 int
2564 t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx,
2565 		  const struct qset_params *p, int ntxq, struct port_info *pi)
2566 {
2567 	struct sge_qset *q = &sc->sge.qs[id];
2568 	int i, ret = 0;
2569 
2570 	MTX_INIT(&q->lock, q->namebuf, NULL, MTX_DEF);
2571 	q->port = pi;
2572 
2573 	if ((q->txq[TXQ_ETH].txq_mr = buf_ring_alloc(cxgb_txq_buf_ring_size,
2574 	    M_DEVBUF, M_WAITOK, &q->lock)) == NULL) {
2575 		device_printf(sc->dev, "failed to allocate mbuf ring\n");
2576 		goto err;
2577 	}
2578 	if ((q->txq[TXQ_ETH].txq_ifq = malloc(sizeof(struct ifaltq), M_DEVBUF,
2579 	    M_NOWAIT | M_ZERO)) == NULL) {
2580 		device_printf(sc->dev, "failed to allocate ifq\n");
2581 		goto err;
2582 	}
2583 	ifq_init(q->txq[TXQ_ETH].txq_ifq, pi->ifp);
2584 	callout_init(&q->txq[TXQ_ETH].txq_timer, 1);
2585 	callout_init(&q->txq[TXQ_ETH].txq_watchdog, 1);
2586 	q->txq[TXQ_ETH].txq_timer.c_cpu = id % mp_ncpus;
2587 	q->txq[TXQ_ETH].txq_watchdog.c_cpu = id % mp_ncpus;
2588 
2589 	init_qset_cntxt(q, id);
2590 	q->idx = id;
2591 	if ((ret = alloc_ring(sc, p->fl_size, sizeof(struct rx_desc),
2592 		    sizeof(struct rx_sw_desc), &q->fl[0].phys_addr,
2593 		    &q->fl[0].desc, &q->fl[0].sdesc,
2594 		    &q->fl[0].desc_tag, &q->fl[0].desc_map,
2595 		    sc->rx_dmat, &q->fl[0].entry_tag)) != 0) {
2596 		printf("error %d from alloc ring fl0\n", ret);
2597 		goto err;
2598 	}
2599 
2600 	if ((ret = alloc_ring(sc, p->jumbo_size, sizeof(struct rx_desc),
2601 		    sizeof(struct rx_sw_desc), &q->fl[1].phys_addr,
2602 		    &q->fl[1].desc, &q->fl[1].sdesc,
2603 		    &q->fl[1].desc_tag, &q->fl[1].desc_map,
2604 		    sc->rx_jumbo_dmat, &q->fl[1].entry_tag)) != 0) {
2605 		printf("error %d from alloc ring fl1\n", ret);
2606 		goto err;
2607 	}
2608 
2609 	if ((ret = alloc_ring(sc, p->rspq_size, sizeof(struct rsp_desc), 0,
2610 		    &q->rspq.phys_addr, &q->rspq.desc, NULL,
2611 		    &q->rspq.desc_tag, &q->rspq.desc_map,
2612 		    NULL, NULL)) != 0) {
2613 		printf("error %d from alloc ring rspq\n", ret);
2614 		goto err;
2615 	}
2616 
2617 	for (i = 0; i < ntxq; ++i) {
2618 		size_t sz = i == TXQ_CTRL ? 0 : sizeof(struct tx_sw_desc);
2619 
2620 		if ((ret = alloc_ring(sc, p->txq_size[i],
2621 			    sizeof(struct tx_desc), sz,
2622 			    &q->txq[i].phys_addr, &q->txq[i].desc,
2623 			    &q->txq[i].sdesc, &q->txq[i].desc_tag,
2624 			    &q->txq[i].desc_map,
2625 			    sc->tx_dmat, &q->txq[i].entry_tag)) != 0) {
2626 			printf("error %d from alloc ring tx %i\n", ret, i);
2627 			goto err;
2628 		}
2629 		mbufq_init(&q->txq[i].sendq);
2630 		q->txq[i].gen = 1;
2631 		q->txq[i].size = p->txq_size[i];
2632 	}
2633 
2634 	TASK_INIT(&q->txq[TXQ_OFLD].qresume_task, 0, restart_offloadq, q);
2635 	TASK_INIT(&q->txq[TXQ_CTRL].qresume_task, 0, restart_ctrlq, q);
2636 	TASK_INIT(&q->txq[TXQ_ETH].qreclaim_task, 0, sge_txq_reclaim_handler, q);
2637 	TASK_INIT(&q->txq[TXQ_OFLD].qreclaim_task, 0, sge_txq_reclaim_handler, q);
2638 
2639 	q->fl[0].gen = q->fl[1].gen = 1;
2640 	q->fl[0].size = p->fl_size;
2641 	q->fl[1].size = p->jumbo_size;
2642 
2643 	q->rspq.gen = 1;
2644 	q->rspq.cidx = 0;
2645 	q->rspq.size = p->rspq_size;
2646 
2647 	q->txq[TXQ_ETH].stop_thres = nports *
2648 	    flits_to_desc(sgl_len(TX_MAX_SEGS + 1) + 3);
2649 
2650 	q->fl[0].buf_size = MCLBYTES;
2651 	q->fl[0].zone = zone_pack;
2652 	q->fl[0].type = EXT_PACKET;
2653 
2654 	if (p->jumbo_buf_size ==  MJUM16BYTES) {
2655 		q->fl[1].zone = zone_jumbo16;
2656 		q->fl[1].type = EXT_JUMBO16;
2657 	} else if (p->jumbo_buf_size ==  MJUM9BYTES) {
2658 		q->fl[1].zone = zone_jumbo9;
2659 		q->fl[1].type = EXT_JUMBO9;
2660 	} else if (p->jumbo_buf_size ==  MJUMPAGESIZE) {
2661 		q->fl[1].zone = zone_jumbop;
2662 		q->fl[1].type = EXT_JUMBOP;
2663 	} else {
2664 		KASSERT(0, ("can't deal with jumbo_buf_size %d.", p->jumbo_buf_size));
2665 		ret = EDOOFUS;
2666 		goto err;
2667 	}
2668 	q->fl[1].buf_size = p->jumbo_buf_size;
2669 
2670 	/* Allocate and setup the lro_ctrl structure */
2671 	q->lro.enabled = !!(pi->ifp->if_capenable & IFCAP_LRO);
2672 #if defined(INET6) || defined(INET)
2673 	ret = tcp_lro_init(&q->lro.ctrl);
2674 	if (ret) {
2675 		printf("error %d from tcp_lro_init\n", ret);
2676 		goto err;
2677 	}
2678 #endif
2679 	q->lro.ctrl.ifp = pi->ifp;
2680 
2681 	mtx_lock_spin(&sc->sge.reg_lock);
2682 	ret = -t3_sge_init_rspcntxt(sc, q->rspq.cntxt_id, irq_vec_idx,
2683 				   q->rspq.phys_addr, q->rspq.size,
2684 				   q->fl[0].buf_size, 1, 0);
2685 	if (ret) {
2686 		printf("error %d from t3_sge_init_rspcntxt\n", ret);
2687 		goto err_unlock;
2688 	}
2689 
2690 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
2691 		ret = -t3_sge_init_flcntxt(sc, q->fl[i].cntxt_id, 0,
2692 					  q->fl[i].phys_addr, q->fl[i].size,
2693 					  q->fl[i].buf_size, p->cong_thres, 1,
2694 					  0);
2695 		if (ret) {
2696 			printf("error %d from t3_sge_init_flcntxt for index i=%d\n", ret, i);
2697 			goto err_unlock;
2698 		}
2699 	}
2700 
2701 	ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_ETH].cntxt_id, USE_GTS,
2702 				 SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr,
2703 				 q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token,
2704 				 1, 0);
2705 	if (ret) {
2706 		printf("error %d from t3_sge_init_ecntxt\n", ret);
2707 		goto err_unlock;
2708 	}
2709 
2710 	if (ntxq > 1) {
2711 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_OFLD].cntxt_id,
2712 					 USE_GTS, SGE_CNTXT_OFLD, id,
2713 					 q->txq[TXQ_OFLD].phys_addr,
2714 					 q->txq[TXQ_OFLD].size, 0, 1, 0);
2715 		if (ret) {
2716 			printf("error %d from t3_sge_init_ecntxt\n", ret);
2717 			goto err_unlock;
2718 		}
2719 	}
2720 
2721 	if (ntxq > 2) {
2722 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_CTRL].cntxt_id, 0,
2723 					 SGE_CNTXT_CTRL, id,
2724 					 q->txq[TXQ_CTRL].phys_addr,
2725 					 q->txq[TXQ_CTRL].size,
2726 					 q->txq[TXQ_CTRL].token, 1, 0);
2727 		if (ret) {
2728 			printf("error %d from t3_sge_init_ecntxt\n", ret);
2729 			goto err_unlock;
2730 		}
2731 	}
2732 
2733 	snprintf(q->rspq.lockbuf, RSPQ_NAME_LEN, "t3 rspq lock %d:%d",
2734 	    device_get_unit(sc->dev), irq_vec_idx);
2735 	MTX_INIT(&q->rspq.lock, q->rspq.lockbuf, NULL, MTX_DEF);
2736 
2737 	mtx_unlock_spin(&sc->sge.reg_lock);
2738 	t3_update_qset_coalesce(q, p);
2739 	q->port = pi;
2740 
2741 	refill_fl(sc, &q->fl[0], q->fl[0].size);
2742 	refill_fl(sc, &q->fl[1], q->fl[1].size);
2743 	refill_rspq(sc, &q->rspq, q->rspq.size - 1);
2744 
2745 	t3_write_reg(sc, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) |
2746 		     V_NEWTIMER(q->rspq.holdoff_tmr));
2747 
2748 	return (0);
2749 
2750 err_unlock:
2751 	mtx_unlock_spin(&sc->sge.reg_lock);
2752 err:
2753 	TXQ_LOCK(q);
2754 	t3_free_qset(sc, q);
2755 
2756 	return (ret);
2757 }
2758 
2759 /*
2760  * Remove CPL_RX_PKT headers from the mbuf and reduce it to a regular mbuf with
2761  * ethernet data.  Hardware assistance with various checksums and any vlan tag
2762  * will also be taken into account here.
2763  */
2764 void
2765 t3_rx_eth(struct adapter *adap, struct sge_rspq *rq, struct mbuf *m, int ethpad)
2766 {
2767 	struct cpl_rx_pkt *cpl = (struct cpl_rx_pkt *)(mtod(m, uint8_t *) + ethpad);
2768 	struct port_info *pi = &adap->port[adap->rxpkt_map[cpl->iff]];
2769 	struct ifnet *ifp = pi->ifp;
2770 
2771 	DPRINTF("rx_eth m=%p m->m_data=%p p->iff=%d\n", m, mtod(m, uint8_t *), cpl->iff);
2772 
2773 	if ((ifp->if_capenable & IFCAP_RXCSUM) && !cpl->fragment &&
2774 	    cpl->csum_valid && cpl->csum == 0xffff) {
2775 		m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED|CSUM_IP_VALID);
2776 		rspq_to_qset(rq)->port_stats[SGE_PSTAT_RX_CSUM_GOOD]++;
2777 		m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED|CSUM_IP_VALID|CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
2778 		m->m_pkthdr.csum_data = 0xffff;
2779 	}
2780 
2781 	if (cpl->vlan_valid) {
2782 		m->m_pkthdr.ether_vtag = ntohs(cpl->vlan);
2783 		m->m_flags |= M_VLANTAG;
2784 	}
2785 
2786 	m->m_pkthdr.rcvif = ifp;
2787 	m->m_pkthdr.header = mtod(m, uint8_t *) + sizeof(*cpl) + ethpad;
2788 	/*
2789 	 * adjust after conversion to mbuf chain
2790 	 */
2791 	m->m_pkthdr.len -= (sizeof(*cpl) + ethpad);
2792 	m->m_len -= (sizeof(*cpl) + ethpad);
2793 	m->m_data += (sizeof(*cpl) + ethpad);
2794 }
2795 
2796 /**
2797  *	get_packet - return the next ingress packet buffer from a free list
2798  *	@adap: the adapter that received the packet
2799  *	@drop_thres: # of remaining buffers before we start dropping packets
2800  *	@qs: the qset that the SGE free list holding the packet belongs to
2801  *	@mh: the mbuf header, which holds pointers to the head and tail of the mbuf chain
2802  *      @r: response descriptor
2803  *
2804  *	Get the next packet from a free list and complete setup of the
2805  *	mbuf.  If the packet is small we make a copy and recycle the
2806  *	original buffer, otherwise we use the original buffer itself.  If a
2807  *	positive drop threshold is supplied packets are dropped and their
2808  *	buffers recycled if (a) the number of remaining buffers is under the
2809  *	threshold and the packet is too big to copy, or (b) the packet should
2810  *	be copied but there is no memory for the copy.
2811  */
2812 static int
2813 get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
2814     struct t3_mbuf_hdr *mh, struct rsp_desc *r)
2815 {
2816 
2817 	unsigned int len_cq =  ntohl(r->len_cq);
2818 	struct sge_fl *fl = (len_cq & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
2819 	int mask, cidx = fl->cidx;
2820 	struct rx_sw_desc *sd = &fl->sdesc[cidx];
2821 	uint32_t len = G_RSPD_LEN(len_cq);
2822 	uint32_t flags = M_EXT;
2823 	uint8_t sopeop = G_RSPD_SOP_EOP(ntohl(r->flags));
2824 	caddr_t cl;
2825 	struct mbuf *m;
2826 	int ret = 0;
2827 
2828 	mask = fl->size - 1;
2829 	prefetch(fl->sdesc[(cidx + 1) & mask].m);
2830 	prefetch(fl->sdesc[(cidx + 2) & mask].m);
2831 	prefetch(fl->sdesc[(cidx + 1) & mask].rxsd_cl);
2832 	prefetch(fl->sdesc[(cidx + 2) & mask].rxsd_cl);
2833 
2834 	fl->credits--;
2835 	bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD);
2836 
2837 	if (recycle_enable && len <= SGE_RX_COPY_THRES &&
2838 	    sopeop == RSPQ_SOP_EOP) {
2839 		if ((m = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL)
2840 			goto skip_recycle;
2841 		cl = mtod(m, void *);
2842 		memcpy(cl, sd->rxsd_cl, len);
2843 		recycle_rx_buf(adap, fl, fl->cidx);
2844 		m->m_pkthdr.len = m->m_len = len;
2845 		m->m_flags = 0;
2846 		mh->mh_head = mh->mh_tail = m;
2847 		ret = 1;
2848 		goto done;
2849 	} else {
2850 	skip_recycle:
2851 		bus_dmamap_unload(fl->entry_tag, sd->map);
2852 		cl = sd->rxsd_cl;
2853 		m = sd->m;
2854 
2855 		if ((sopeop == RSPQ_SOP_EOP) ||
2856 		    (sopeop == RSPQ_SOP))
2857 			flags |= M_PKTHDR;
2858 		m_init(m, fl->zone, fl->buf_size, M_NOWAIT, MT_DATA, flags);
2859 		if (fl->zone == zone_pack) {
2860 			/*
2861 			 * restore clobbered data pointer
2862 			 */
2863 			m->m_data = m->m_ext.ext_buf;
2864 		} else {
2865 			m_cljset(m, cl, fl->type);
2866 		}
2867 		m->m_len = len;
2868 	}
2869 	switch(sopeop) {
2870 	case RSPQ_SOP_EOP:
2871 		ret = 1;
2872 		/* FALLTHROUGH */
2873 	case RSPQ_SOP:
2874 		mh->mh_head = mh->mh_tail = m;
2875 		m->m_pkthdr.len = len;
2876 		break;
2877 	case RSPQ_EOP:
2878 		ret = 1;
2879 		/* FALLTHROUGH */
2880 	case RSPQ_NSOP_NEOP:
2881 		if (mh->mh_tail == NULL) {
2882 			log(LOG_ERR, "discarding intermediate descriptor entry\n");
2883 			m_freem(m);
2884 			break;
2885 		}
2886 		mh->mh_tail->m_next = m;
2887 		mh->mh_tail = m;
2888 		mh->mh_head->m_pkthdr.len += len;
2889 		break;
2890 	}
2891 	if (cxgb_debug)
2892 		printf("len=%d pktlen=%d\n", m->m_len, m->m_pkthdr.len);
2893 done:
2894 	if (++fl->cidx == fl->size)
2895 		fl->cidx = 0;
2896 
2897 	return (ret);
2898 }
2899 
2900 /**
2901  *	handle_rsp_cntrl_info - handles control information in a response
2902  *	@qs: the queue set corresponding to the response
2903  *	@flags: the response control flags
2904  *
2905  *	Handles the control information of an SGE response, such as GTS
2906  *	indications and completion credits for the queue set's Tx queues.
2907  *	HW coalesces credits; we don't do any extra SW coalescing.
2908  */
2909 static __inline void
2910 handle_rsp_cntrl_info(struct sge_qset *qs, uint32_t flags)
2911 {
2912 	unsigned int credits;
2913 
2914 #if USE_GTS
2915 	if (flags & F_RSPD_TXQ0_GTS)
2916 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags);
2917 #endif
2918 	credits = G_RSPD_TXQ0_CR(flags);
2919 	if (credits)
2920 		qs->txq[TXQ_ETH].processed += credits;
2921 
2922 	credits = G_RSPD_TXQ2_CR(flags);
2923 	if (credits)
2924 		qs->txq[TXQ_CTRL].processed += credits;
2925 
2926 # if USE_GTS
2927 	if (flags & F_RSPD_TXQ1_GTS)
2928 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags);
2929 # endif
2930 	credits = G_RSPD_TXQ1_CR(flags);
2931 	if (credits)
2932 		qs->txq[TXQ_OFLD].processed += credits;
2933 
2934 }
2935 
2936 static void
2937 check_ring_db(adapter_t *adap, struct sge_qset *qs,
2938     unsigned int sleeping)
2939 {
2940 	;
2941 }
2942 
2943 /**
2944  *	process_responses - process responses from an SGE response queue
2945  *	@adap: the adapter
2946  *	@qs: the queue set to which the response queue belongs
2947  *	@budget: how many responses can be processed in this round
2948  *
2949  *	Process responses from an SGE response queue up to the supplied budget.
2950  *	Responses include received packets as well as credits and other events
2951  *	for the queues that belong to the response queue's queue set.
2952  *	A negative budget is effectively unlimited.
2953  *
2954  *	Additionally choose the interrupt holdoff time for the next interrupt
2955  *	on this queue.  If the system is under memory pressure, use a fairly
2956  *	long delay to help recovery.
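 *
 *	Returns the number of responses processed.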
2957  */
2958 static int
2959 process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
2960 {
2961 	struct sge_rspq *rspq = &qs->rspq;
2962 	struct rsp_desc *r = &rspq->desc[rspq->cidx];
2963 	int budget_left = budget;
2964 	unsigned int sleeping = 0;
2965 #if defined(INET6) || defined(INET)
2966 	int lro_enabled = qs->lro.enabled;
2967 	int skip_lro;
2968 	struct lro_ctrl *lro_ctrl = &qs->lro.ctrl;
2969 #endif
2970 	struct mbuf *offload_mbufs[RX_BUNDLE_SIZE];
2971 	int ngathered = 0;
2972 	struct t3_mbuf_hdr *mh = &rspq->rspq_mh;
2973 #ifdef DEBUG
2974 	static int last_holdoff = 0;
2975 	if (cxgb_debug && rspq->holdoff_tmr != last_holdoff) {
2976 		printf("next_holdoff=%d\n", rspq->holdoff_tmr);
2977 		last_holdoff = rspq->holdoff_tmr;
2978 	}
2979 #endif
2980 	rspq->next_holdoff = rspq->holdoff_tmr;
2981 
2982 	while (__predict_true(budget_left && is_new_response(r, rspq))) {
2983 		int eth, eop = 0, ethpad = 0;
2984 		uint32_t flags = ntohl(r->flags);
2985 		uint32_t rss_csum = *(const uint32_t *)r;
2986 		uint32_t rss_hash = be32toh(r->rss_hdr.rss_hash_val);
2987 
2988 		eth = (r->rss_hdr.opcode == CPL_RX_PKT);
2989 
2990 		if (__predict_false(flags & F_RSPD_ASYNC_NOTIF)) {
2991 			struct mbuf *m;
2992 
2993 			if (cxgb_debug)
2994 				printf("async notification\n");
2995 
2996 			if (mh->mh_head == NULL) {
2997 				mh->mh_head = m_gethdr(M_DONTWAIT, MT_DATA);
2998 				m = mh->mh_head;
2999 			} else {
3000 				m = m_gethdr(M_DONTWAIT, MT_DATA);
3001 			}
3002 			if (m == NULL)
3003 				goto no_mem;
3004 
3005 			memcpy(mtod(m, char *), r, AN_PKT_SIZE);
3006 			m->m_len = m->m_pkthdr.len = AN_PKT_SIZE;
3007 			*mtod(m, char *) = CPL_ASYNC_NOTIF;
3008 			rss_csum = htonl(CPL_ASYNC_NOTIF << 24);
3009 			eop = 1;
3010 			rspq->async_notif++;
3011 			goto skip;
3012 		} else if  (flags & F_RSPD_IMM_DATA_VALID) {
3013 			struct mbuf *m = NULL;
3014 
3015 			DPRINTF("IMM DATA VALID opcode=0x%x rspq->cidx=%d\n",
3016 			    r->rss_hdr.opcode, rspq->cidx);
3017 			if (mh->mh_head == NULL)
3018 				mh->mh_head = m_gethdr(M_DONTWAIT, MT_DATA);
3019 			else
3020 				m = m_gethdr(M_DONTWAIT, MT_DATA);
3021 
3022 			if (mh->mh_head == NULL &&  m == NULL) {
3023 		no_mem:
3024 				rspq->next_holdoff = NOMEM_INTR_DELAY;
3025 				budget_left--;
3026 				break;
3027 			}
3028 			get_imm_packet(adap, r, mh->mh_head);
3029 			eop = 1;
3030 			rspq->imm_data++;
3031 		} else if (r->len_cq) {
3032 			int drop_thresh = eth ? SGE_RX_DROP_THRES : 0;
3033 
3034 			eop = get_packet(adap, drop_thresh, qs, mh, r);
3035 			if (eop) {
3036 				if (r->rss_hdr.hash_type && !adap->timestamp)
3037 					mh->mh_head->m_flags |= M_FLOWID;
3038 				mh->mh_head->m_pkthdr.flowid = rss_hash;
3039 			}
3040 
3041 			ethpad = 2;
3042 		} else {
3043 			rspq->pure_rsps++;
3044 		}
3045 	skip:
3046 		if (flags & RSPD_CTRL_MASK) {
3047 			sleeping |= flags & RSPD_GTS_MASK;
3048 			handle_rsp_cntrl_info(qs, flags);
3049 		}
3050 
3051 		r++;
3052 		if (__predict_false(++rspq->cidx == rspq->size)) {
3053 			rspq->cidx = 0;
3054 			rspq->gen ^= 1;
3055 			r = rspq->desc;
3056 		}
3057 
3058 		if (++rspq->credits >= 64) {
3059 			refill_rspq(adap, rspq, rspq->credits);
3060 			rspq->credits = 0;
3061 		}
3062 		if (!eth && eop) {
3063 			mh->mh_head->m_pkthdr.csum_data = rss_csum;
3064 			/*
3065 			 * XXX size mismatch
3066 			 */
3067 			m_set_priority(mh->mh_head, rss_hash);
3068 
3069 
3070 			ngathered = rx_offload(&adap->tdev, rspq,
3071 			    mh->mh_head, offload_mbufs, ngathered);
3072 			mh->mh_head = NULL;
3073 			DPRINTF("received offload packet\n");
3074 
3075 		} else if (eth && eop) {
3076 			struct mbuf *m = mh->mh_head;
3077 
3078 			t3_rx_eth(adap, rspq, m, ethpad);
3079 
3080 			/*
3081 			 * The T304 sends incoming packets on any qset.  If LRO
3082 			 * is also enabled, we could end up sending the packet up
3083 			 * lro_ctrl->ifp's input.  That is incorrect.
3084 			 *
3085 			 * The mbuf's rcvif was derived from the cpl header and
3086 			 * is accurate.  Skip LRO and just use that.
3087 			 */
3088 #if defined(INET6) || defined(INET)
3089 			skip_lro = __predict_false(qs->port->ifp != m->m_pkthdr.rcvif);
3090 
3091 			if (lro_enabled && lro_ctrl->lro_cnt && !skip_lro
3092 			    && (tcp_lro_rx(lro_ctrl, m, 0) == 0)
3093 			    ) {
3094 				/* successfully queued for LRO */
3095 			} else
3096 #endif
3097 			{
3098 				/*
3099 				 * LRO not enabled, packet unsuitable for LRO,
3100 				 * or unable to queue.  Pass it up right now in
3101 				 * either case.
3102 				 */
3103 				struct ifnet *ifp = m->m_pkthdr.rcvif;
3104 				(*ifp->if_input)(ifp, m);
3105 			}
3106 			mh->mh_head = NULL;
3107 
3108 		}
3109 		__refill_fl_lt(adap, &qs->fl[0], 32);
3110 		__refill_fl_lt(adap, &qs->fl[1], 32);
3111 		--budget_left;
3112 	}
3113 
3114 	deliver_partial_bundle(&adap->tdev, rspq, offload_mbufs, ngathered);
3115 
3116 #if defined(INET6) || defined(INET)
3117 	/* Flush LRO */
3118 	while (!SLIST_EMPTY(&lro_ctrl->lro_active)) {
3119 		struct lro_entry *queued = SLIST_FIRST(&lro_ctrl->lro_active);
3120 		SLIST_REMOVE_HEAD(&lro_ctrl->lro_active, next);
3121 		tcp_lro_flush(lro_ctrl, queued);
3122 	}
3123 #endif
3124 
3125 	if (sleeping)
3126 		check_ring_db(adap, qs, sleeping);
3127 
3128 	mb();  /* commit Tx queue processed updates */
3129 	if (__predict_false(qs->txq_stopped > 1))
3130 		restart_tx(qs);
3131 
3132 	__refill_fl_lt(adap, &qs->fl[0], 512);
3133 	__refill_fl_lt(adap, &qs->fl[1], 512);
3134 	budget -= budget_left;
3135 	return (budget);
3136 }
3137 
3138 /*
3139  * A helper function that processes responses and issues GTS.
3140  */
3141 static __inline int
3142 process_responses_gts(adapter_t *adap, struct sge_rspq *rq)
3143 {
3144 	int work;
3145 	static int last_holdoff = 0;
3146 
3147 	work = process_responses(adap, rspq_to_qset(rq), -1);
3148 
3149 	if (cxgb_debug && (rq->next_holdoff != last_holdoff)) {
3150 		printf("next_holdoff=%d\n", rq->next_holdoff);
3151 		last_holdoff = rq->next_holdoff;
3152 	}
3153 	t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) |
3154 	    V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx));
3155 
3156 	return (work);
3157 }
3158 
3159 
3160 /*
3161  * Interrupt handler for legacy INTx interrupts for T3B-based cards.
3162  * Handles data events from SGE response queues as well as error and other
3163  * async events as they all use the same interrupt pin.  We use one SGE
3164  * response queue per port in this mode and protect all response queues with
3165  * queue 0's lock.
3166  */
3167 void
3168 t3b_intr(void *data)
3169 {
3170 	uint32_t i, map;
3171 	adapter_t *adap = data;
3172 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
3173 
3174 	t3_write_reg(adap, A_PL_CLI, 0);
3175 	map = t3_read_reg(adap, A_SG_DATA_INTR);
3176 
3177 	if (!map)
3178 		return;
3179 
3180 	if (__predict_false(map & F_ERRINTR)) {
3181 		t3_write_reg(adap, A_PL_INT_ENABLE0, 0);
3182 		(void) t3_read_reg(adap, A_PL_INT_ENABLE0);
3183 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
3184 	}
3185 
3186 	mtx_lock(&q0->lock);
3187 	for_each_port(adap, i)
3188 	    if (map & (1 << i))
3189 			process_responses_gts(adap, &adap->sge.qs[i].rspq);
3190 	mtx_unlock(&q0->lock);
3191 }
3192 
3193 /*
3194  * The MSI interrupt handler.  This needs to handle data events from SGE
3195  * response queues as well as error and other async events as they all use
3196  * the same MSI vector.  We use one SGE response queue per port in this mode
3197  * and protect all response queues with queue 0's lock.
3198  */
3199 void
3200 t3_intr_msi(void *data)
3201 {
3202 	adapter_t *adap = data;
3203 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
3204 	int i, new_packets = 0;
3205 
3206 	mtx_lock(&q0->lock);
3207 
3208 	for_each_port(adap, i)
3209 	    if (process_responses_gts(adap, &adap->sge.qs[i].rspq))
3210 		    new_packets = 1;
3211 	mtx_unlock(&q0->lock);
3212 	if (new_packets == 0) {
3213 		t3_write_reg(adap, A_PL_INT_ENABLE0, 0);
3214 		(void) t3_read_reg(adap, A_PL_INT_ENABLE0);
3215 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
3216 	}
3217 }
3218 
3219 void
3220 t3_intr_msix(void *data)
3221 {
3222 	struct sge_qset *qs = data;
3223 	adapter_t *adap = qs->port->adapter;
3224 	struct sge_rspq *rspq = &qs->rspq;
3225 
3226 	if (process_responses_gts(adap, rspq) == 0)
3227 		rspq->unhandled_irqs++;
3228 }
3229 
3230 #define QDUMP_SBUF_SIZE		(32 * 400)
3231 static int
3232 t3_dump_rspq(SYSCTL_HANDLER_ARGS)
3233 {
3234 	struct sge_rspq *rspq;
3235 	struct sge_qset *qs;
3236 	int i, err, dump_end, idx;
3237 	struct sbuf *sb;
3238 	struct rsp_desc *rspd;
3239 	uint32_t data[4];
3240 
3241 	rspq = arg1;
3242 	qs = rspq_to_qset(rspq);
3243 	if (rspq->rspq_dump_count == 0)
3244 		return (0);
3245 	if (rspq->rspq_dump_count > RSPQ_Q_SIZE) {
3246 		log(LOG_WARNING,
3247 		    "dump count is too large %d\n", rspq->rspq_dump_count);
3248 		rspq->rspq_dump_count = 0;
3249 		return (EINVAL);
3250 	}
3251 	if (rspq->rspq_dump_start > (RSPQ_Q_SIZE-1)) {
3252 		log(LOG_WARNING,
3253 		    "dump start of %d is greater than queue size\n",
3254 		    rspq->rspq_dump_start);
3255 		rspq->rspq_dump_start = 0;
3256 		return (EINVAL);
3257 	}
3258 	err = t3_sge_read_rspq(qs->port->adapter, rspq->cntxt_id, data);
3259 	if (err)
3260 		return (err);
3261 	err = sysctl_wire_old_buffer(req, 0);
3262 	if (err)
3263 		return (err);
3264 	sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req);
3265 
3266 	sbuf_printf(sb, " \n index=%u size=%u MSI-X/RspQ=%u intr enable=%u intr armed=%u\n",
3267 	    (data[0] & 0xffff), data[0] >> 16, ((data[2] >> 20) & 0x3f),
3268 	    ((data[2] >> 26) & 1), ((data[2] >> 27) & 1));
3269 	sbuf_printf(sb, " generation=%u CQ mode=%u FL threshold=%u\n",
3270 	    ((data[2] >> 28) & 1), ((data[2] >> 31) & 1), data[3]);
3271 
3272 	sbuf_printf(sb, " start=%d -> end=%d\n", rspq->rspq_dump_start,
3273 	    (rspq->rspq_dump_start + rspq->rspq_dump_count) & (RSPQ_Q_SIZE-1));
3274 
3275 	dump_end = rspq->rspq_dump_start + rspq->rspq_dump_count;
3276 	for (i = rspq->rspq_dump_start; i < dump_end; i++) {
3277 		idx = i & (RSPQ_Q_SIZE-1);
3278 
3279 		rspd = &rspq->desc[idx];
3280 		sbuf_printf(sb, "\tidx=%04d opcode=%02x cpu_idx=%x hash_type=%x cq_idx=%x\n",
3281 		    idx, rspd->rss_hdr.opcode, rspd->rss_hdr.cpu_idx,
3282 		    rspd->rss_hdr.hash_type, be16toh(rspd->rss_hdr.cq_idx));
3283 		sbuf_printf(sb, "\trss_hash_val=%x flags=%08x len_cq=%x intr_gen=%x\n",
3284 		    rspd->rss_hdr.rss_hash_val, be32toh(rspd->flags),
3285 		    be32toh(rspd->len_cq), rspd->intr_gen);
3286 	}
3287 
3288 	err = sbuf_finish(sb);
3289 	/* Output a trailing NUL. */
3290 	if (err == 0)
3291 		err = SYSCTL_OUT(req, "", 1);
3292 	sbuf_delete(sb);
3293 	return (err);
3294 }
3295 
3296 static int
3297 t3_dump_txq_eth(SYSCTL_HANDLER_ARGS)
3298 {
3299 	struct sge_txq *txq;
3300 	struct sge_qset *qs;
3301 	int i, j, err, dump_end;
3302 	struct sbuf *sb;
3303 	struct tx_desc *txd;
3304 	uint32_t *WR, wr_hi, wr_lo, gen;
3305 	uint32_t data[4];
3306 
3307 	txq = arg1;
3308 	qs = txq_to_qset(txq, TXQ_ETH);
3309 	if (txq->txq_dump_count == 0) {
3310 		return (0);
3311 	}
3312 	if (txq->txq_dump_count > TX_ETH_Q_SIZE) {
3313 		log(LOG_WARNING,
3314 		    "dump count is too large %d\n", txq->txq_dump_count);
3315 		txq->txq_dump_count = 1;
3316 		return (EINVAL);
3317 	}
3318 	if (txq->txq_dump_start > (TX_ETH_Q_SIZE-1)) {
3319 		log(LOG_WARNING,
3320 		    "dump start of %d is greater than queue size\n",
3321 		    txq->txq_dump_start);
3322 		txq->txq_dump_start = 0;
3323 		return (EINVAL);
3324 	}
3325 	err = t3_sge_read_ecntxt(qs->port->adapter, qs->rspq.cntxt_id, data);
3326 	if (err)
3327 		return (err);
3328 	err = sysctl_wire_old_buffer(req, 0);
3329 	if (err)
3330 		return (err);
3331 	sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req);
3332 
3333 	sbuf_printf(sb, " \n credits=%u GTS=%u index=%u size=%u rspq#=%u cmdq#=%u\n",
3334 	    (data[0] & 0x7fff), ((data[0] >> 15) & 1), (data[0] >> 16),
3335 	    (data[1] & 0xffff), ((data[3] >> 4) & 7), ((data[3] >> 7) & 1));
3336 	sbuf_printf(sb, " TUN=%u TOE=%u generation=%u uP token=%u valid=%u\n",
3337 	    ((data[3] >> 8) & 1), ((data[3] >> 9) & 1), ((data[3] >> 10) & 1),
3338 	    ((data[3] >> 11) & 0xfffff), ((data[3] >> 31) & 1));
3339 	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
3340 	    txq->txq_dump_start,
3341 	    (txq->txq_dump_start + txq->txq_dump_count) & (TX_ETH_Q_SIZE-1));
3342 
3343 	dump_end = txq->txq_dump_start + txq->txq_dump_count;
3344 	for (i = txq->txq_dump_start; i < dump_end; i++) {
3345 		txd = &txq->desc[i & (TX_ETH_Q_SIZE-1)];
3346 		WR = (uint32_t *)txd->flit;
3347 		wr_hi = ntohl(WR[0]);
3348 		wr_lo = ntohl(WR[1]);
3349 		gen = G_WR_GEN(wr_lo);
3350 
3351 		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
3352 		    wr_hi, wr_lo, gen);
3353 		for (j = 2; j < 30; j += 4)
3354 			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
3355 			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
3356 
3357 	}
3358 	err = sbuf_finish(sb);
3359 	/* Output a trailing NUL. */
3360 	if (err == 0)
3361 		err = SYSCTL_OUT(req, "", 1);
3362 	sbuf_delete(sb);
3363 	return (err);
3364 }
3365 
3366 static int
3367 t3_dump_txq_ctrl(SYSCTL_HANDLER_ARGS)
3368 {
3369 	struct sge_txq *txq;
3370 	struct sge_qset *qs;
3371 	int i, j, err, dump_end;
3372 	struct sbuf *sb;
3373 	struct tx_desc *txd;
3374 	uint32_t *WR, wr_hi, wr_lo, gen;
3375 
3376 	txq = arg1;
3377 	qs = txq_to_qset(txq, TXQ_CTRL);
3378 	if (txq->txq_dump_count == 0) {
3379 		return (0);
3380 	}
3381 	if (txq->txq_dump_count > 256) {
3382 		log(LOG_WARNING,
3383 		    "dump count is too large %d\n", txq->txq_dump_count);
3384 		txq->txq_dump_count = 1;
3385 		return (EINVAL);
3386 	}
3387 	if (txq->txq_dump_start > 255) {
3388 		log(LOG_WARNING,
3389 		    "dump start of %d is greater than queue size\n",
3390 		    txq->txq_dump_start);
3391 		txq->txq_dump_start = 0;
3392 		return (EINVAL);
3393 	}
3394 
3395 	err = sysctl_wire_old_buffer(req, 0);
3396 	if (err != 0)
3397 		return (err);
3398 	sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req);
3399 	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
3400 	    txq->txq_dump_start,
3401 	    (txq->txq_dump_start + txq->txq_dump_count) & 255);
3402 
3403 	dump_end = txq->txq_dump_start + txq->txq_dump_count;
3404 	for (i = txq->txq_dump_start; i < dump_end; i++) {
3405 		txd = &txq->desc[i & (255)];
3406 		WR = (uint32_t *)txd->flit;
3407 		wr_hi = ntohl(WR[0]);
3408 		wr_lo = ntohl(WR[1]);
3409 		gen = G_WR_GEN(wr_lo);
3410 
3411 		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
3412 		    wr_hi, wr_lo, gen);
3413 		for (j = 2; j < 30; j += 4)
3414 			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
3415 			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
3416 
3417 	}
3418 	err = sbuf_finish(sb);
3419 	/* Output a trailing NUL. */
3420 	if (err == 0)
3421 		err = SYSCTL_OUT(req, "", 1);
3422 	sbuf_delete(sb);
3423 	return (err);
3424 }
3425 
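/*
 * This handler backs the "intr_coal" node added under the adapter's device
 * sysctl tree (see t3_add_configured_sysctls() below); writing the node
 * applies the new holdoff timer to every queue set.
 */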
3426 static int
3427 t3_set_coalesce_usecs(SYSCTL_HANDLER_ARGS)
3428 {
3429 	adapter_t *sc = arg1;
3430 	struct qset_params *qsp = &sc->params.sge.qset[0];
3431 	int coalesce_usecs;
3432 	struct sge_qset *qs;
3433 	int i, j, err, nqsets = 0;
3434 	struct mtx *lock;
3435 
3436 	if ((sc->flags & FULL_INIT_DONE) == 0)
3437 		return (ENXIO);
3438 
3439 	coalesce_usecs = qsp->coalesce_usecs;
3440 	err = sysctl_handle_int(oidp, &coalesce_usecs, arg2, req);
3441 
3442 	if (err != 0) {
3443 		return (err);
3444 	}
3445 	if (coalesce_usecs == qsp->coalesce_usecs)
3446 		return (0);
3447 
3448 	for (i = 0; i < sc->params.nports; i++)
3449 		for (j = 0; j < sc->port[i].nqsets; j++)
3450 			nqsets++;
3451 
3452 	coalesce_usecs = max(1, coalesce_usecs);
3453 
3454 	for (i = 0; i < nqsets; i++) {
3455 		qs = &sc->sge.qs[i];
3456 		qsp = &sc->params.sge.qset[i];
3457 		qsp->coalesce_usecs = coalesce_usecs;
3458 
3459 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
3460 			    &sc->sge.qs[0].rspq.lock;
3461 
3462 		mtx_lock(lock);
3463 		t3_update_qset_coalesce(qs, qsp);
3464 		t3_write_reg(sc, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) |
3465 		    V_NEWTIMER(qs->rspq.holdoff_tmr));
3466 		mtx_unlock(lock);
3467 	}
3468 
3469 	return (0);
3470 }
3471 
3472 static int
3473 t3_pkt_timestamp(SYSCTL_HANDLER_ARGS)
3474 {
3475 	adapter_t *sc = arg1;
3476 	int rc, timestamp;
3477 
3478 	if ((sc->flags & FULL_INIT_DONE) == 0)
3479 		return (ENXIO);
3480 
3481 	timestamp = sc->timestamp;
3482 	rc = sysctl_handle_int(oidp, &timestamp, arg2, req);
3483 
3484 	if (rc != 0)
3485 		return (rc);
3486 
3487 	if (timestamp != sc->timestamp) {
3488 		t3_set_reg_field(sc, A_TP_PC_CONFIG2, F_ENABLERXPKTTMSTPRSS,
3489 		    timestamp ? F_ENABLERXPKTTMSTPRSS : 0);
3490 		sc->timestamp = timestamp;
3491 	}
3492 
3493 	return (0);
3494 }
3495 
3496 void
3497 t3_add_attach_sysctls(adapter_t *sc)
3498 {
3499 	struct sysctl_ctx_list *ctx;
3500 	struct sysctl_oid_list *children;
3501 
3502 	ctx = device_get_sysctl_ctx(sc->dev);
3503 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3504 
3505 	/* random information */
3506 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
3507 	    "firmware_version",
3508 	    CTLFLAG_RD, &sc->fw_version,
3509 	    0, "firmware version");
3510 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO,
3511 	    "hw_revision",
3512 	    CTLFLAG_RD, &sc->params.rev,
3513 	    0, "chip model");
3514 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
3515 	    "port_types",
3516 	    CTLFLAG_RD, &sc->port_types,
3517 	    0, "type of ports");
3518 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3519 	    "enable_debug",
3520 	    CTLFLAG_RW, &cxgb_debug,
3521 	    0, "enable verbose debugging output");
3522 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tunq_coalesce",
3523 	    CTLFLAG_RD, &sc->tunq_coalesce,
3524 	    "#tunneled packets freed");
3525 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3526 	    "txq_overrun",
3527 	    CTLFLAG_RD, &txq_fills,
3528 	    0, "#times txq overrun");
3529 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO,
3530 	    "core_clock",
3531 	    CTLFLAG_RD, &sc->params.vpd.cclk,
3532 	    0, "core clock frequency (in KHz)");
3533 }
3534 
3535 
3536 static const char *rspq_name = "rspq";
3537 static const char *txq_names[] =
3538 {
3539 	"txq_eth",
3540 	"txq_ofld",
3541 	"txq_ctrl"
3542 };
3543 
3544 static int
3545 sysctl_handle_macstat(SYSCTL_HANDLER_ARGS)
3546 {
3547 	struct port_info *p = arg1;
3548 	uint64_t *parg;
3549 
3550 	if (!p)
3551 		return (EINVAL);
3552 
3553 	parg = (uint64_t *) ((uint8_t *)&p->mac.stats + arg2);
3554 	PORT_LOCK(p);
3555 	t3_mac_update_stats(&p->mac);
3556 	PORT_UNLOCK(p);
3557 
3558 	return (sysctl_handle_64(oidp, parg, 0, req));
3559 }
3560 
3561 void
3562 t3_add_configured_sysctls(adapter_t *sc)
3563 {
3564 	struct sysctl_ctx_list *ctx;
3565 	struct sysctl_oid_list *children;
3566 	int i, j;
3567 
3568 	ctx = device_get_sysctl_ctx(sc->dev);
3569 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3570 
3571 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
3572 	    "intr_coal",
3573 	    CTLTYPE_INT|CTLFLAG_RW, sc,
3574 	    0, t3_set_coalesce_usecs,
3575 	    "I", "interrupt coalescing timer (us)");
3576 
3577 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
3578 	    "pkt_timestamp",
3579 	    CTLTYPE_INT | CTLFLAG_RW, sc,
3580 	    0, t3_pkt_timestamp,
3581 	    "I", "provide packet timestamp instead of connection hash");
3582 
3583 	for (i = 0; i < sc->params.nports; i++) {
3584 		struct port_info *pi = &sc->port[i];
3585 		struct sysctl_oid *poid;
3586 		struct sysctl_oid_list *poidlist;
3587 		struct mac_stats *mstats = &pi->mac.stats;
3588 
3589 		snprintf(pi->namebuf, PORT_NAME_LEN, "port%d", i);
3590 		poid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO,
3591 		    pi->namebuf, CTLFLAG_RD, NULL, "port statistics");
3592 		poidlist = SYSCTL_CHILDREN(poid);
3593 		SYSCTL_ADD_UINT(ctx, poidlist, OID_AUTO,
3594 		    "nqsets", CTLFLAG_RD, &pi->nqsets,
3595 		    0, "#queue sets");
3596 
3597 		for (j = 0; j < pi->nqsets; j++) {
3598 			struct sge_qset *qs = &sc->sge.qs[pi->first_qset + j];
3599 			struct sysctl_oid *qspoid, *rspqpoid, *txqpoid,
3600 					  *ctrlqpoid, *lropoid;
3601 			struct sysctl_oid_list *qspoidlist, *rspqpoidlist,
3602 					       *txqpoidlist, *ctrlqpoidlist,
3603 					       *lropoidlist;
3604 			struct sge_txq *txq = &qs->txq[TXQ_ETH];
3605 
3606 			snprintf(qs->namebuf, QS_NAME_LEN, "qs%d", j);
3607 
3608 			qspoid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO,
3609 			    qs->namebuf, CTLFLAG_RD, NULL, "qset statistics");
3610 			qspoidlist = SYSCTL_CHILDREN(qspoid);
3611 
3612 			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl0_empty",
3613 					CTLFLAG_RD, &qs->fl[0].empty, 0,
3614 					"freelist #0 empty");
3615 			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl1_empty",
3616 					CTLFLAG_RD, &qs->fl[1].empty, 0,
3617 					"freelist #1 empty");
3618 
3619 			rspqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3620 			    rspq_name, CTLFLAG_RD, NULL, "rspq statistics");
3621 			rspqpoidlist = SYSCTL_CHILDREN(rspqpoid);
3622 
3623 			txqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3624 			    txq_names[0], CTLFLAG_RD, NULL, "txq statistics");
3625 			txqpoidlist = SYSCTL_CHILDREN(txqpoid);
3626 
3627 			ctrlqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3628 			    txq_names[2], CTLFLAG_RD, NULL, "ctrlq statistics");
3629 			ctrlqpoidlist = SYSCTL_CHILDREN(ctrlqpoid);
3630 
3631 			lropoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3632 			    "lro_stats", CTLFLAG_RD, NULL, "LRO statistics");
3633 			lropoidlist = SYSCTL_CHILDREN(lropoid);
3634 
3635 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "size",
3636 			    CTLFLAG_RD, &qs->rspq.size,
3637 			    0, "#entries in response queue");
3638 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "cidx",
3639 			    CTLFLAG_RD, &qs->rspq.cidx,
3640 			    0, "consumer index");
3641 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "credits",
3642 			    CTLFLAG_RD, &qs->rspq.credits,
3643 			    0, "#credits");
3644 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "starved",
3645 			    CTLFLAG_RD, &qs->rspq.starved,
3646 			    0, "#times starved");
3647 			SYSCTL_ADD_ULONG(ctx, rspqpoidlist, OID_AUTO, "phys_addr",
3648 			    CTLFLAG_RD, &qs->rspq.phys_addr,
3649 			    "physical_address_of the queue");
3650 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_start",
3651 			    CTLFLAG_RW, &qs->rspq.rspq_dump_start,
3652 			    0, "start rspq dump entry");
3653 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_count",
3654 			    CTLFLAG_RW, &qs->rspq.rspq_dump_count,
3655 			    0, "#rspq entries to dump");
3656 			SYSCTL_ADD_PROC(ctx, rspqpoidlist, OID_AUTO, "qdump",
3657 			    CTLTYPE_STRING | CTLFLAG_RD, &qs->rspq,
3658 			    0, t3_dump_rspq, "A", "dump of the response queue");
3659 
3660 			SYSCTL_ADD_UQUAD(ctx, txqpoidlist, OID_AUTO, "dropped",
3661 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].txq_mr->br_drops,
3662 			    "#tunneled packets dropped");
3663 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "sendqlen",
3664 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].sendq.qlen,
3665 			    0, "#tunneled packets waiting to be sent");
3666 #if 0
3667 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_pidx",
3668 			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr->br_prod,
3669 			    0, "#tunneled packets queue producer index");
3670 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_cidx",
3671 			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr->br_cons,
3672 			    0, "#tunneled packets queue consumer index");
3673 #endif
3674 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "processed",
3675 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].processed,
3676 			    0, "#tunneled packets processed by the card");
3677 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "cleaned",
3678 			    CTLFLAG_RD, &txq->cleaned,
3679 			    0, "#tunneled packets cleaned");
3680 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "in_use",
3681 			    CTLFLAG_RD, &txq->in_use,
3682 			    0, "#tunneled packet slots in use");
3683 			SYSCTL_ADD_ULONG(ctx, txqpoidlist, OID_AUTO, "frees",
3684 			    CTLFLAG_RD, &txq->txq_frees,
3685 			    "#tunneled packets freed");
3686 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "skipped",
3687 			    CTLFLAG_RD, &txq->txq_skipped,
3688 			    0, "#tunneled packet descriptors skipped");
3689 			SYSCTL_ADD_UQUAD(ctx, txqpoidlist, OID_AUTO, "coalesced",
3690 			    CTLFLAG_RD, &txq->txq_coalesced,
3691 			    "#tunneled packets coalesced");
3692 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "enqueued",
3693 			    CTLFLAG_RD, &txq->txq_enqueued,
3694 			    0, "#tunneled packets enqueued to hardware");
3695 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "stopped_flags",
3696 			    CTLFLAG_RD, &qs->txq_stopped,
3697 			    0, "tx queues stopped");
3698 			SYSCTL_ADD_ULONG(ctx, txqpoidlist, OID_AUTO, "phys_addr",
3699 			    CTLFLAG_RD, &txq->phys_addr,
3700 			    "physical_address_of the queue");
3701 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "qgen",
3702 			    CTLFLAG_RW, &qs->txq[TXQ_ETH].gen,
3703 			    0, "txq generation");
3704 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_cidx",
3705 			    CTLFLAG_RD, &txq->cidx,
3706 			    0, "hardware queue cidx");
3707 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_pidx",
3708 			    CTLFLAG_RD, &txq->pidx,
3709 			    0, "hardware queue pidx");
3710 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_start",
3711 			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_start,
3712 			    0, "txq start idx for dump");
3713 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_count",
3714 			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_count,
3715 			    0, "txq #entries to dump");
3716 			SYSCTL_ADD_PROC(ctx, txqpoidlist, OID_AUTO, "qdump",
3717 			    CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_ETH],
3718 			    0, t3_dump_txq_eth, "A", "dump of the transmit queue");
3719 
3720 			SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_start",
3721 			    CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_start,
3722 			    0, "ctrlq start idx for dump");
3723 			SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_count",
3724 			    CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_count,
3725 			    0, "ctrl #entries to dump");
3726 			SYSCTL_ADD_PROC(ctx, ctrlqpoidlist, OID_AUTO, "qdump",
3727 			    CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_CTRL],
3728 			    0, t3_dump_txq_ctrl, "A", "dump of the transmit queue");
3729 
3730 			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_queued",
3731 			    CTLFLAG_RD, &qs->lro.ctrl.lro_queued, 0, NULL);
3732 			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_flushed",
3733 			    CTLFLAG_RD, &qs->lro.ctrl.lro_flushed, 0, NULL);
3734 			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_bad_csum",
3735 			    CTLFLAG_RD, &qs->lro.ctrl.lro_bad_csum, 0, NULL);
3736 			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_cnt",
3737 			    CTLFLAG_RD, &qs->lro.ctrl.lro_cnt, 0, NULL);
3738 		}
3739 
3740 		/* Now add a node for mac stats. */
3741 		poid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO, "mac_stats",
3742 		    CTLFLAG_RD, NULL, "MAC statistics");
3743 		poidlist = SYSCTL_CHILDREN(poid);
3744 
3745 		/*
3746 		 * We (ab)use the length argument (arg2) to pass on the offset
3747 		 * of the data that we are interested in.  This is only required
3748 		 * for the quad counters that are updated from the hardware (we
3749 		 * make sure that we return the latest value).
3750 		 * sysctl_handle_macstat first updates *all* the counters from
3751 		 * the hardware, and then returns the latest value of the
3752 		 * requested counter.  Best would be to update only the
3753 		 * requested counter from hardware, but t3_mac_update_stats()
3754 		 * hides all the register details and we don't want to dive into
3755 		 * all that here.
3756 		 */
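		/*
		 * For example, CXGB_SYSCTL_ADD_QUAD(tx_octets) below creates
		 * a read-only 64-bit node named "tx_octets" whose handler
		 * receives offsetof(struct mac_stats, tx_octets) in arg2 and
		 * therefore returns pi->mac.stats.tx_octets after the
		 * refresh done in sysctl_handle_macstat().
		 */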
3757 #define CXGB_SYSCTL_ADD_QUAD(a)	SYSCTL_ADD_OID(ctx, poidlist, OID_AUTO, #a, \
3758     (CTLTYPE_U64 | CTLFLAG_RD), pi, offsetof(struct mac_stats, a), \
3759     sysctl_handle_macstat, "QU", 0)
3760 		CXGB_SYSCTL_ADD_QUAD(tx_octets);
3761 		CXGB_SYSCTL_ADD_QUAD(tx_octets_bad);
3762 		CXGB_SYSCTL_ADD_QUAD(tx_frames);
3763 		CXGB_SYSCTL_ADD_QUAD(tx_mcast_frames);
3764 		CXGB_SYSCTL_ADD_QUAD(tx_bcast_frames);
3765 		CXGB_SYSCTL_ADD_QUAD(tx_pause);
3766 		CXGB_SYSCTL_ADD_QUAD(tx_deferred);
3767 		CXGB_SYSCTL_ADD_QUAD(tx_late_collisions);
3768 		CXGB_SYSCTL_ADD_QUAD(tx_total_collisions);
3769 		CXGB_SYSCTL_ADD_QUAD(tx_excess_collisions);
3770 		CXGB_SYSCTL_ADD_QUAD(tx_underrun);
3771 		CXGB_SYSCTL_ADD_QUAD(tx_len_errs);
3772 		CXGB_SYSCTL_ADD_QUAD(tx_mac_internal_errs);
3773 		CXGB_SYSCTL_ADD_QUAD(tx_excess_deferral);
3774 		CXGB_SYSCTL_ADD_QUAD(tx_fcs_errs);
3775 		CXGB_SYSCTL_ADD_QUAD(tx_frames_64);
3776 		CXGB_SYSCTL_ADD_QUAD(tx_frames_65_127);
3777 		CXGB_SYSCTL_ADD_QUAD(tx_frames_128_255);
3778 		CXGB_SYSCTL_ADD_QUAD(tx_frames_256_511);
3779 		CXGB_SYSCTL_ADD_QUAD(tx_frames_512_1023);
3780 		CXGB_SYSCTL_ADD_QUAD(tx_frames_1024_1518);
3781 		CXGB_SYSCTL_ADD_QUAD(tx_frames_1519_max);
3782 		CXGB_SYSCTL_ADD_QUAD(rx_octets);
3783 		CXGB_SYSCTL_ADD_QUAD(rx_octets_bad);
3784 		CXGB_SYSCTL_ADD_QUAD(rx_frames);
3785 		CXGB_SYSCTL_ADD_QUAD(rx_mcast_frames);
3786 		CXGB_SYSCTL_ADD_QUAD(rx_bcast_frames);
3787 		CXGB_SYSCTL_ADD_QUAD(rx_pause);
3788 		CXGB_SYSCTL_ADD_QUAD(rx_fcs_errs);
3789 		CXGB_SYSCTL_ADD_QUAD(rx_align_errs);
3790 		CXGB_SYSCTL_ADD_QUAD(rx_symbol_errs);
3791 		CXGB_SYSCTL_ADD_QUAD(rx_data_errs);
3792 		CXGB_SYSCTL_ADD_QUAD(rx_sequence_errs);
3793 		CXGB_SYSCTL_ADD_QUAD(rx_runt);
3794 		CXGB_SYSCTL_ADD_QUAD(rx_jabber);
3795 		CXGB_SYSCTL_ADD_QUAD(rx_short);
3796 		CXGB_SYSCTL_ADD_QUAD(rx_too_long);
3797 		CXGB_SYSCTL_ADD_QUAD(rx_mac_internal_errs);
3798 		CXGB_SYSCTL_ADD_QUAD(rx_cong_drops);
3799 		CXGB_SYSCTL_ADD_QUAD(rx_frames_64);
3800 		CXGB_SYSCTL_ADD_QUAD(rx_frames_65_127);
3801 		CXGB_SYSCTL_ADD_QUAD(rx_frames_128_255);
3802 		CXGB_SYSCTL_ADD_QUAD(rx_frames_256_511);
3803 		CXGB_SYSCTL_ADD_QUAD(rx_frames_512_1023);
3804 		CXGB_SYSCTL_ADD_QUAD(rx_frames_1024_1518);
3805 		CXGB_SYSCTL_ADD_QUAD(rx_frames_1519_max);
3806 #undef CXGB_SYSCTL_ADD_QUAD
3807 
3808 #define CXGB_SYSCTL_ADD_ULONG(a) SYSCTL_ADD_ULONG(ctx, poidlist, OID_AUTO, #a, \
3809     CTLFLAG_RD, &mstats->a, 0)
3810 		CXGB_SYSCTL_ADD_ULONG(tx_fifo_parity_err);
3811 		CXGB_SYSCTL_ADD_ULONG(rx_fifo_parity_err);
3812 		CXGB_SYSCTL_ADD_ULONG(tx_fifo_urun);
3813 		CXGB_SYSCTL_ADD_ULONG(rx_fifo_ovfl);
3814 		CXGB_SYSCTL_ADD_ULONG(serdes_signal_loss);
3815 		CXGB_SYSCTL_ADD_ULONG(xaui_pcs_ctc_err);
3816 		CXGB_SYSCTL_ADD_ULONG(xaui_pcs_align_change);
3817 		CXGB_SYSCTL_ADD_ULONG(num_toggled);
3818 		CXGB_SYSCTL_ADD_ULONG(num_resets);
3819 		CXGB_SYSCTL_ADD_ULONG(link_faults);
3820 #undef CXGB_SYSCTL_ADD_ULONG
3821 	}
3822 }
3823 
3824 /**
3825  *	t3_get_desc - dump an SGE descriptor for debugging purposes
3826  *	@qs: the queue set
3827  *	@qnum: identifies the specific queue (0..2: Tx, 3:response, 4..5: Rx)
3828  *	@idx: the descriptor index in the queue
3829  *	@data: where to dump the descriptor contents
3830  *
3831  *	Dumps the contents of a HW descriptor of an SGE queue.  Returns the
3832  *	size of the descriptor.
3833  */
3834 int
3835 t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx,
3836 		unsigned char *data)
3837 {
3838 	if (qnum >= 6)
3839 		return (EINVAL);
3840 
3841 	if (qnum < 3) {
3842 		if (!qs->txq[qnum].desc || idx >= qs->txq[qnum].size)
3843 			return (EINVAL);
3844 		memcpy(data, &qs->txq[qnum].desc[idx], sizeof(struct tx_desc));
3845 		return (sizeof(struct tx_desc));
3846 	}
3847 
3848 	if (qnum == 3) {
3849 		if (!qs->rspq.desc || idx >= qs->rspq.size)
3850 			return (EINVAL);
3851 		memcpy(data, &qs->rspq.desc[idx], sizeof(struct rsp_desc));
3852 		return (sizeof(struct rsp_desc));
3853 	}
3854 
3855 	qnum -= 4;
3856 	if (!qs->fl[qnum].desc || idx >= qs->fl[qnum].size)
3857 		return (EINVAL);
3858 	memcpy(data, &qs->fl[qnum].desc[idx], sizeof(struct rx_desc));
3859 	return (sizeof(struct rx_desc));
3860 }
3861