xref: /freebsd/sys/dev/cxgb/cxgb_sge.c (revision 4ed925457ab06e83238a5db33e89ccc94b99a713)
1 /**************************************************************************
2 
3 Copyright (c) 2007-2009, Chelsio Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Neither the name of the Chelsio Corporation nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15 
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27 
28 ***************************************************************************/
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/kernel.h>
36 #include <sys/module.h>
37 #include <sys/bus.h>
38 #include <sys/conf.h>
39 #include <machine/bus.h>
40 #include <machine/resource.h>
41 #include <sys/bus_dma.h>
42 #include <sys/rman.h>
43 #include <sys/queue.h>
44 #include <sys/sysctl.h>
45 #include <sys/taskqueue.h>
46 
47 #include <sys/proc.h>
48 #include <sys/sbuf.h>
49 #include <sys/sched.h>
50 #include <sys/smp.h>
51 #include <sys/systm.h>
52 #include <sys/syslog.h>
53 
54 #include <net/bpf.h>
55 
56 #include <netinet/in_systm.h>
57 #include <netinet/in.h>
58 #include <netinet/ip.h>
59 #include <netinet/tcp.h>
60 
61 #include <dev/pci/pcireg.h>
62 #include <dev/pci/pcivar.h>
63 
64 #include <vm/vm.h>
65 #include <vm/pmap.h>
66 
67 #include <cxgb_include.h>
68 #include <sys/mvec.h>
69 
70 int	txq_fills = 0;
71 int	multiq_tx_enable = 1;
72 
73 extern struct sysctl_oid_list sysctl__hw_cxgb_children;
74 int cxgb_txq_buf_ring_size = TX_ETH_Q_SIZE;
75 TUNABLE_INT("hw.cxgb.txq_mr_size", &cxgb_txq_buf_ring_size);
76 SYSCTL_UINT(_hw_cxgb, OID_AUTO, txq_mr_size, CTLFLAG_RDTUN, &cxgb_txq_buf_ring_size, 0,
77     "size of per-queue mbuf ring");
78 
79 static int cxgb_tx_coalesce_force = 0;
80 TUNABLE_INT("hw.cxgb.tx_coalesce_force", &cxgb_tx_coalesce_force);
81 SYSCTL_UINT(_hw_cxgb, OID_AUTO, tx_coalesce_force, CTLFLAG_RW,
82     &cxgb_tx_coalesce_force, 0,
83     "coalesce small packets into a single work request regardless of ring state");
84 
85 #define	COALESCE_START_DEFAULT		(TX_ETH_Q_SIZE>>1)
86 #define	COALESCE_START_MAX		(TX_ETH_Q_SIZE-(TX_ETH_Q_SIZE>>3))
87 #define	COALESCE_STOP_DEFAULT		(TX_ETH_Q_SIZE>>2)
88 #define	COALESCE_STOP_MIN		(TX_ETH_Q_SIZE>>5)
89 #define	TX_RECLAIM_DEFAULT		(TX_ETH_Q_SIZE>>5)
90 #define	TX_RECLAIM_MAX			(TX_ETH_Q_SIZE>>2)
91 #define	TX_RECLAIM_MIN			(TX_ETH_Q_SIZE>>6)
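/*
 * Illustrative arithmetic only (editorial note, not compiled): if
 * TX_ETH_Q_SIZE were 1024, the defaults above would work out to a coalesce
 * start threshold of 512 (max 896), a coalesce stop threshold of 256
 * (min 32), and a reclaim threshold of 32 (min 16, max 256).  The actual
 * value of TX_ETH_Q_SIZE is defined elsewhere in the driver headers.
 */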
92 
93 
94 static int cxgb_tx_coalesce_enable_start = COALESCE_START_DEFAULT;
95 TUNABLE_INT("hw.cxgb.tx_coalesce_enable_start",
96     &cxgb_tx_coalesce_enable_start);
97 SYSCTL_UINT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_start, CTLFLAG_RW,
98     &cxgb_tx_coalesce_enable_start, 0,
99     "coalesce enable threshold");
100 static int cxgb_tx_coalesce_enable_stop = COALESCE_STOP_DEFAULT;
101 TUNABLE_INT("hw.cxgb.tx_coalesce_enable_stop", &cxgb_tx_coalesce_enable_stop);
102 SYSCTL_UINT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_stop, CTLFLAG_RW,
103     &cxgb_tx_coalesce_enable_stop, 0,
104     "coalesce disable threshold");
105 static int cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT;
106 TUNABLE_INT("hw.cxgb.tx_reclaim_threshold", &cxgb_tx_reclaim_threshold);
107 SYSCTL_UINT(_hw_cxgb, OID_AUTO, tx_reclaim_threshold, CTLFLAG_RW,
108     &cxgb_tx_reclaim_threshold, 0,
109     "tx cleaning minimum threshold");
110 
111 /*
112  * XXX don't re-enable this until TOE stops assuming
113  * we have an m_ext
114  */
115 static int recycle_enable = 0;
116 int cxgb_ext_freed = 0;
117 int cxgb_ext_inited = 0;
118 int fl_q_size = 0;
119 int jumbo_q_size = 0;
120 
121 extern int cxgb_use_16k_clusters;
122 extern int nmbjumbo4;
123 extern int nmbjumbo9;
124 extern int nmbjumbo16;
125 
126 #define USE_GTS 0
127 
128 #define SGE_RX_SM_BUF_SIZE	1536
129 #define SGE_RX_DROP_THRES	16
130 #define SGE_RX_COPY_THRES	128
131 
132 /*
133  * Period of the Tx buffer reclaim timer.  This timer does not need to run
134  * frequently as Tx buffers are usually reclaimed by new Tx packets.
135  */
136 #define TX_RECLAIM_PERIOD       (hz >> 1)
137 
138 /*
139  * Values for sge_txq.flags
140  */
141 enum {
142 	TXQ_RUNNING	= 1 << 0,  /* fetch engine is running */
143 	TXQ_LAST_PKT_DB = 1 << 1,  /* last packet rang the doorbell */
144 };
145 
146 struct tx_desc {
147 	uint64_t	flit[TX_DESC_FLITS];
148 } __packed;
149 
150 struct rx_desc {
151 	uint32_t	addr_lo;
152 	uint32_t	len_gen;
153 	uint32_t	gen2;
154 	uint32_t	addr_hi;
155 } __packed;
156 
157 struct rsp_desc {               /* response queue descriptor */
158 	struct rss_header	rss_hdr;
159 	uint32_t		flags;
160 	uint32_t		len_cq;
161 	uint8_t			imm_data[47];
162 	uint8_t			intr_gen;
163 } __packed;
164 
165 #define RX_SW_DESC_MAP_CREATED	(1 << 0)
166 #define TX_SW_DESC_MAP_CREATED	(1 << 1)
167 #define RX_SW_DESC_INUSE        (1 << 3)
168 #define TX_SW_DESC_MAPPED       (1 << 4)
169 
170 #define RSPQ_NSOP_NEOP           G_RSPD_SOP_EOP(0)
171 #define RSPQ_EOP                 G_RSPD_SOP_EOP(F_RSPD_EOP)
172 #define RSPQ_SOP                 G_RSPD_SOP_EOP(F_RSPD_SOP)
173 #define RSPQ_SOP_EOP             G_RSPD_SOP_EOP(F_RSPD_SOP|F_RSPD_EOP)
174 
175 struct tx_sw_desc {                /* SW state per Tx descriptor */
176 	struct mbuf	*m;
177 	bus_dmamap_t	map;
178 	int		flags;
179 };
180 
181 struct rx_sw_desc {                /* SW state per Rx descriptor */
182 	caddr_t		rxsd_cl;
183 	struct mbuf	*m;
184 	bus_dmamap_t	map;
185 	int		flags;
186 };
187 
188 struct txq_state {
189 	unsigned int	compl;
190 	unsigned int	gen;
191 	unsigned int	pidx;
192 };
193 
194 struct refill_fl_cb_arg {
195 	int               error;
196 	bus_dma_segment_t seg;
197 	int               nseg;
198 };
199 
200 
201 /*
202  * Maps a number of flits to the number of Tx descriptors that can hold them.
203  * The formula is
204  *
205  * desc = 1 + (flits - 2) / (WR_FLITS - 1).
206  *
207  * HW allows up to 4 descriptors to be combined into a WR.
208  */
209 static uint8_t flit_desc_map[] = {
210 	0,
211 #if SGE_NUM_GENBITS == 1
212 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
213 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
214 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
215 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
216 #elif SGE_NUM_GENBITS == 2
217 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
218 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
219 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
220 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
221 #else
222 # error "SGE_NUM_GENBITS must be 1 or 2"
223 #endif
224 };
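/*
 * Illustrative example (editorial note): with SGE_NUM_GENBITS == 2 the table
 * above maps 2-15 flits to 1 descriptor, 16-29 flits to 2, 30-43 to 3, and
 * 44-57 to 4, which is consistent with desc = 1 + (flits - 2) / (WR_FLITS - 1)
 * when WR_FLITS is 15 in that configuration (WR_FLITS itself is defined
 * elsewhere in the driver headers).
 */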
225 
226 #define	TXQ_LOCK_ASSERT(qs)	mtx_assert(&(qs)->lock, MA_OWNED)
227 #define	TXQ_TRYLOCK(qs)		mtx_trylock(&(qs)->lock)
228 #define	TXQ_LOCK(qs)		mtx_lock(&(qs)->lock)
229 #define	TXQ_UNLOCK(qs)		mtx_unlock(&(qs)->lock)
230 #define	TXQ_RING_EMPTY(qs)	drbr_empty((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
231 #define	TXQ_RING_NEEDS_ENQUEUE(qs)					\
232 	drbr_needs_enqueue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
233 #define	TXQ_RING_FLUSH(qs)	drbr_flush((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
234 #define	TXQ_RING_DEQUEUE_COND(qs, func, arg)				\
235 	drbr_dequeue_cond((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr, func, arg)
236 #define	TXQ_RING_DEQUEUE(qs) \
237 	drbr_dequeue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
238 
239 int cxgb_debug = 0;
240 
241 static void sge_timer_cb(void *arg);
242 static void sge_timer_reclaim(void *arg, int ncount);
243 static void sge_txq_reclaim_handler(void *arg, int ncount);
244 static void cxgb_start_locked(struct sge_qset *qs);
245 
246 /*
247  * XXX need to cope with bursty scheduling by looking at a wider
248  * window than we do now when determining the need for coalescing
249  *
250  */
251 static __inline uint64_t
252 check_pkt_coalesce(struct sge_qset *qs)
253 {
254         struct adapter *sc;
255         struct sge_txq *txq;
256 	uint8_t *fill;
257 
258 	if (__predict_false(cxgb_tx_coalesce_force))
259 		return (1);
260 	txq = &qs->txq[TXQ_ETH];
261         sc = qs->port->adapter;
262 	fill = &sc->tunq_fill[qs->idx];
263 
264 	if (cxgb_tx_coalesce_enable_start > COALESCE_START_MAX)
265 		cxgb_tx_coalesce_enable_start = COALESCE_START_MAX;
266 	if (cxgb_tx_coalesce_enable_stop < COALESCE_STOP_MIN)
267 		cxgb_tx_coalesce_enable_stop = COALESCE_STOP_MIN;
268 	/*
269 	 * If the hardware transmit queue fills past the coalesce-enable
270 	 * start threshold we mark it as coalescing; we drop back out of
271 	 * coalescing once in_use falls below the stop threshold and there
272 	 * are no packets enqueued.  This provides some hysteresis.
273 	 */
274         if (*fill != 0 && (txq->in_use <= cxgb_tx_coalesce_enable_stop) &&
275 	    TXQ_RING_EMPTY(qs) && (qs->coalescing == 0))
276                 *fill = 0;
277         else if (*fill == 0 && (txq->in_use >= cxgb_tx_coalesce_enable_start))
278                 *fill = 1;
279 
280 	return (sc->tunq_coalesce);
281 }
282 
283 #ifdef __LP64__
284 static void
285 set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo)
286 {
287 	uint64_t wr_hilo;
288 #if _BYTE_ORDER == _LITTLE_ENDIAN
289 	wr_hilo = wr_hi;
290 	wr_hilo |= (((uint64_t)wr_lo)<<32);
291 #else
292 	wr_hilo = wr_lo;
293 	wr_hilo |= (((uint64_t)wr_hi)<<32);
294 #endif
295 	wrp->wrh_hilo = wr_hilo;
296 }
297 #else
298 static void
299 set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo)
300 {
301 
302 	wrp->wrh_hi = wr_hi;
303 	wmb();
304 	wrp->wrh_lo = wr_lo;
305 }
306 #endif
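/*
 * Illustrative usage (editorial note): callers build both halves of the WR
 * header and hand them to set_wr_hdr() so the header becomes visible to the
 * SGE in the right order, e.g. from the single-descriptor path in
 * write_wr_hdr_sgl():
 *
 *	set_wr_hdr(wrp, htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
 *	    V_WR_SGLSFLT(flits)) | wr_hi,
 *	    htonl(V_WR_LEN(flits + sgl_flits) | V_WR_GEN(txqs->gen)) | wr_lo);
 *
 * On LP64 the header is published with a single 64-bit store; on 32-bit
 * platforms wrh_hi is written before wrh_lo with a wmb() in between.
 */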
307 
308 struct coalesce_info {
309 	int count;
310 	int nbytes;
311 };
312 
313 static int
314 coalesce_check(struct mbuf *m, void *arg)
315 {
316 	struct coalesce_info *ci = arg;
317 	int *count = &ci->count;
318 	int *nbytes = &ci->nbytes;
319 
320 	if ((*nbytes == 0) || ((*nbytes + m->m_len <= 10500) &&
321 		(*count < 7) && (m->m_next == NULL))) {
322 		*count += 1;
323 		*nbytes += m->m_len;
324 		return (1);
325 	}
326 	return (0);
327 }
328 
329 static struct mbuf *
330 cxgb_dequeue(struct sge_qset *qs)
331 {
332 	struct mbuf *m, *m_head, *m_tail;
333 	struct coalesce_info ci;
334 
335 
336 	if (check_pkt_coalesce(qs) == 0)
337 		return TXQ_RING_DEQUEUE(qs);
338 
339 	m_head = m_tail = NULL;
340 	ci.count = ci.nbytes = 0;
341 	do {
342 		m = TXQ_RING_DEQUEUE_COND(qs, coalesce_check, &ci);
343 		if (m_head == NULL) {
344 			m_tail = m_head = m;
345 		} else if (m != NULL) {
346 			m_tail->m_nextpkt = m;
347 			m_tail = m;
348 		}
349 	} while (m != NULL);
350 	if (ci.count > 7)
351 		panic("trying to coalesce %d packets into one WR", ci.count);
352 	return (m_head);
353 }
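/*
 * Illustrative note (editorial): when coalescing is active, cxgb_dequeue()
 * returns up to 7 single-mbuf packets chained through m_nextpkt (bounded by
 * coalesce_check() above).  t3_encap() detects m_nextpkt != NULL and emits
 * the chain as one cpl_tx_pkt_batch work request instead of one WR per
 * packet.
 */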
354 
355 /**
356  *	reclaim_completed_tx - reclaims completed Tx descriptors
357  *	@adapter: the adapter
358  *	@q: the Tx queue to reclaim completed descriptors from
359  *
360  *	Reclaims Tx descriptors that the SGE has indicated it has processed,
361  *	and frees the associated buffers if possible.  Called with the Tx
362  *	queue's lock held.
363  */
364 static __inline int
365 reclaim_completed_tx(struct sge_qset *qs, int reclaim_min, int queue)
366 {
367 	struct sge_txq *q = &qs->txq[queue];
368 	int reclaim = desc_reclaimable(q);
369 
370 	if ((cxgb_tx_reclaim_threshold > TX_RECLAIM_MAX) ||
371 	    (cxgb_tx_reclaim_threshold < TX_RECLAIM_MIN))
372 		cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT;
373 
374 	if (reclaim < reclaim_min)
375 		return (0);
376 
377 	mtx_assert(&qs->lock, MA_OWNED);
378 	if (reclaim > 0) {
379 		t3_free_tx_desc(qs, reclaim, queue);
380 		q->cleaned += reclaim;
381 		q->in_use -= reclaim;
382 	}
383 	if (isset(&qs->txq_stopped, TXQ_ETH))
384                 clrbit(&qs->txq_stopped, TXQ_ETH);
385 
386 	return (reclaim);
387 }
388 
389 /**
390  *	should_restart_tx - are there enough resources to restart a Tx queue?
391  *	@q: the Tx queue
392  *
393  *	Checks if there are enough descriptors to restart a suspended Tx queue.
394  */
395 static __inline int
396 should_restart_tx(const struct sge_txq *q)
397 {
398 	unsigned int r = q->processed - q->cleaned;
399 
400 	return q->in_use - r < (q->size >> 1);
401 }
402 
403 /**
404  *	t3_sge_init - initialize SGE
405  *	@adap: the adapter
406  *	@p: the SGE parameters
407  *
408  *	Performs SGE initialization needed every time after a chip reset.
409  *	We do not initialize any of the queue sets here; instead the driver
410  *	top-level must request those individually.  We also do not enable DMA
411  *	here, that should be done after the queues have been set up.
412  */
413 void
414 t3_sge_init(adapter_t *adap, struct sge_params *p)
415 {
416 	u_int ctrl, ups;
417 
418 	ups = 0; /* = ffs(pci_resource_len(adap->pdev, 2) >> 12); */
419 
420 	ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL |
421 	       F_CQCRDTCTRL | F_CONGMODE | F_TNLFLMODE | F_FATLPERREN |
422 	       V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS |
423 	       V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING;
424 #if SGE_NUM_GENBITS == 1
425 	ctrl |= F_EGRGENCTRL;
426 #endif
427 	if (adap->params.rev > 0) {
428 		if (!(adap->flags & (USING_MSIX | USING_MSI)))
429 			ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ;
430 	}
431 	t3_write_reg(adap, A_SG_CONTROL, ctrl);
432 	t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) |
433 		     V_LORCQDRBTHRSH(512));
434 	t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10);
435 	t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) |
436 		     V_TIMEOUT(200 * core_ticks_per_usec(adap)));
437 	t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH,
438 		     adap->params.rev < T3_REV_C ? 1000 : 500);
439 	t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256);
440 	t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000);
441 	t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256);
442 	t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff));
443 	t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024);
444 }
445 
446 
447 /**
448  *	sgl_len - calculates the size of an SGL of the given capacity
449  *	@n: the number of SGL entries
450  *
451  *	Calculates the number of flits needed for a scatter/gather list that
452  *	can hold the given number of entries.
453  */
454 static __inline unsigned int
455 sgl_len(unsigned int n)
456 {
457 	return ((3 * n) / 2 + (n & 1));
458 }
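/*
 * Illustrative arithmetic (editorial note): each SGL entry consumes 1.5
 * flits (an 8-byte address plus a 4-byte length), so sgl_len(2) == 3,
 * sgl_len(3) == 5 and sgl_len(4) == 6; odd entry counts round up by one
 * flit.
 */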
459 
460 /**
461  *	get_imm_packet - return the next ingress packet buffer from a response
462  *	@resp: the response descriptor containing the packet data
463  *
464  *	Return a packet containing the immediate data of the given response.
465  */
466 static int
467 get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct mbuf *m)
468 {
469 
470 	m->m_len = m->m_pkthdr.len = IMMED_PKT_SIZE;
471 	m->m_ext.ext_buf = NULL;
472 	m->m_ext.ext_type = 0;
473 	memcpy(mtod(m, uint8_t *), resp->imm_data, IMMED_PKT_SIZE);
474 	return (0);
475 }
476 
477 static __inline u_int
478 flits_to_desc(u_int n)
479 {
480 	return (flit_desc_map[n]);
481 }
482 
483 #define SGE_PARERR (F_CPPARITYERROR | F_OCPARITYERROR | F_RCPARITYERROR | \
484 		    F_IRPARITYERROR | V_ITPARITYERROR(M_ITPARITYERROR) | \
485 		    V_FLPARITYERROR(M_FLPARITYERROR) | F_LODRBPARITYERROR | \
486 		    F_HIDRBPARITYERROR | F_LORCQPARITYERROR | \
487 		    F_HIRCQPARITYERROR)
488 #define SGE_FRAMINGERR (F_UC_REQ_FRAMINGERROR | F_R_REQ_FRAMINGERROR)
489 #define SGE_FATALERR (SGE_PARERR | SGE_FRAMINGERR | F_RSPQCREDITOVERFOW | \
490 		      F_RSPQDISABLED)
491 
492 /**
493  *	t3_sge_err_intr_handler - SGE async event interrupt handler
494  *	@adapter: the adapter
495  *
496  *	Interrupt handler for SGE asynchronous (non-data) events.
497  */
498 void
499 t3_sge_err_intr_handler(adapter_t *adapter)
500 {
501 	unsigned int v, status;
502 
503 	status = t3_read_reg(adapter, A_SG_INT_CAUSE);
504 	if (status & SGE_PARERR)
505 		CH_ALERT(adapter, "SGE parity error (0x%x)\n",
506 			 status & SGE_PARERR);
507 	if (status & SGE_FRAMINGERR)
508 		CH_ALERT(adapter, "SGE framing error (0x%x)\n",
509 			 status & SGE_FRAMINGERR);
510 	if (status & F_RSPQCREDITOVERFOW)
511 		CH_ALERT(adapter, "SGE response queue credit overflow\n");
512 
513 	if (status & F_RSPQDISABLED) {
514 		v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS);
515 
516 		CH_ALERT(adapter,
517 			 "packet delivered to disabled response queue (0x%x)\n",
518 			 (v >> S_RSPQ0DISABLED) & 0xff);
519 	}
520 
521 	t3_write_reg(adapter, A_SG_INT_CAUSE, status);
522 	if (status & SGE_FATALERR)
523 		t3_fatal_err(adapter);
524 }
525 
526 void
527 t3_sge_prep(adapter_t *adap, struct sge_params *p)
528 {
529 	int i, nqsets;
530 
531 	nqsets = min(SGE_QSETS, mp_ncpus*4);
532 
533 	fl_q_size = min(nmbclusters/(3*nqsets), FL_Q_SIZE);
534 
535 	while (!powerof2(fl_q_size))
536 		fl_q_size--;
537 #if __FreeBSD_version >= 700111
538 	if (cxgb_use_16k_clusters)
539 		jumbo_q_size = min(nmbjumbo16/(3*nqsets), JUMBO_Q_SIZE);
540 	else
541 		jumbo_q_size = min(nmbjumbo9/(3*nqsets), JUMBO_Q_SIZE);
542 #else
543 	jumbo_q_size = min(nmbjumbo4/(3*nqsets), JUMBO_Q_SIZE);
544 #endif
545 	while (!powerof2(jumbo_q_size))
546 		jumbo_q_size--;
547 
548 	if (fl_q_size < (FL_Q_SIZE / 4) || jumbo_q_size < (JUMBO_Q_SIZE / 2))
549 		device_printf(adap->dev,
550 		    "Insufficient clusters and/or jumbo buffers.\n");
551 
552 	/* XXX Does ETHER_ALIGN need to be accounted for here? */
553 	p->max_pkt_size = adap->sge.qs[0].fl[1].buf_size - sizeof(struct cpl_rx_data);
554 
555 	for (i = 0; i < SGE_QSETS; ++i) {
556 		struct qset_params *q = p->qset + i;
557 
558 		if (adap->params.nports > 2) {
559 			q->coalesce_usecs = 50;
560 		} else {
561 #ifdef INVARIANTS
562 			q->coalesce_usecs = 10;
563 #else
564 			q->coalesce_usecs = 5;
565 #endif
566 		}
567 		q->polling = 0;
568 		q->rspq_size = RSPQ_Q_SIZE;
569 		q->fl_size = fl_q_size;
570 		q->jumbo_size = jumbo_q_size;
571 		q->txq_size[TXQ_ETH] = TX_ETH_Q_SIZE;
572 		q->txq_size[TXQ_OFLD] = 1024;
573 		q->txq_size[TXQ_CTRL] = 256;
574 		q->cong_thres = 0;
575 	}
576 }
577 
578 int
579 t3_sge_alloc(adapter_t *sc)
580 {
581 
582 	/* The parent tag. */
583 	if (bus_dma_tag_create( NULL,			/* parent */
584 				1, 0,			/* algnmnt, boundary */
585 				BUS_SPACE_MAXADDR,	/* lowaddr */
586 				BUS_SPACE_MAXADDR,	/* highaddr */
587 				NULL, NULL,		/* filter, filterarg */
588 				BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
589 				BUS_SPACE_UNRESTRICTED, /* nsegments */
590 				BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
591 				0,			/* flags */
592 				NULL, NULL,		/* lock, lockarg */
593 				&sc->parent_dmat)) {
594 		device_printf(sc->dev, "Cannot allocate parent DMA tag\n");
595 		return (ENOMEM);
596 	}
597 
598 	/*
599 	 * DMA tag for normal sized RX frames
600 	 */
601 	if (bus_dma_tag_create(sc->parent_dmat, MCLBYTES, 0, BUS_SPACE_MAXADDR,
602 		BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, 1,
603 		MCLBYTES, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_dmat)) {
604 		device_printf(sc->dev, "Cannot allocate RX DMA tag\n");
605 		return (ENOMEM);
606 	}
607 
608 	/*
609 	 * DMA tag for jumbo sized RX frames.
610 	 */
611 	if (bus_dma_tag_create(sc->parent_dmat, MJUM16BYTES, 0, BUS_SPACE_MAXADDR,
612 		BUS_SPACE_MAXADDR, NULL, NULL, MJUM16BYTES, 1, MJUM16BYTES,
613 		BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_jumbo_dmat)) {
614 		device_printf(sc->dev, "Cannot allocate RX jumbo DMA tag\n");
615 		return (ENOMEM);
616 	}
617 
618 	/*
619 	 * DMA tag for TX frames.
620 	 */
621 	if (bus_dma_tag_create(sc->parent_dmat, 1, 0, BUS_SPACE_MAXADDR,
622 		BUS_SPACE_MAXADDR, NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
623 		TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
624 		NULL, NULL, &sc->tx_dmat)) {
625 		device_printf(sc->dev, "Cannot allocate TX DMA tag\n");
626 		return (ENOMEM);
627 	}
628 
629 	return (0);
630 }
631 
632 int
633 t3_sge_free(struct adapter * sc)
634 {
635 
636 	if (sc->tx_dmat != NULL)
637 		bus_dma_tag_destroy(sc->tx_dmat);
638 
639 	if (sc->rx_jumbo_dmat != NULL)
640 		bus_dma_tag_destroy(sc->rx_jumbo_dmat);
641 
642 	if (sc->rx_dmat != NULL)
643 		bus_dma_tag_destroy(sc->rx_dmat);
644 
645 	if (sc->parent_dmat != NULL)
646 		bus_dma_tag_destroy(sc->parent_dmat);
647 
648 	return (0);
649 }
650 
651 void
652 t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p)
653 {
654 
655 	qs->rspq.holdoff_tmr = max(p->coalesce_usecs * 10, 1U);
656 	qs->rspq.polling = 0 /* p->polling */;
657 }
658 
659 #if !defined(__i386__) && !defined(__amd64__)
660 static void
661 refill_fl_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
662 {
663 	struct refill_fl_cb_arg *cb_arg = arg;
664 
665 	cb_arg->error = error;
666 	cb_arg->seg = segs[0];
667 	cb_arg->nseg = nseg;
668 
669 }
670 #endif
671 /**
672  *	refill_fl - refill an SGE free-buffer list
673  *	@sc: the controller softc
674  *	@q: the free-list to refill
675  *	@n: the number of new buffers to allocate
676  *
677  *	(Re)populate an SGE free-buffer list with up to @n new packet buffers.
678  *	The caller must assure that @n does not exceed the queue's capacity.
679  */
680 static void
681 refill_fl(adapter_t *sc, struct sge_fl *q, int n)
682 {
683 	struct rx_sw_desc *sd = &q->sdesc[q->pidx];
684 	struct rx_desc *d = &q->desc[q->pidx];
685 	struct refill_fl_cb_arg cb_arg;
686 	struct mbuf *m;
687 	caddr_t cl;
688 	int err, count = 0;
689 
690 	cb_arg.error = 0;
691 	while (n--) {
692 		/*
693 		 * We only allocate a cluster, mbuf allocation happens after rx
694 		 */
695 		if (q->zone == zone_pack) {
696 			if ((m = m_getcl(M_NOWAIT, MT_NOINIT, M_PKTHDR)) == NULL)
697 				break;
698 			cl = m->m_ext.ext_buf;
699 		} else {
700 			if ((cl = m_cljget(NULL, M_NOWAIT, q->buf_size)) == NULL)
701 				break;
702 			if ((m = m_gethdr(M_NOWAIT, MT_NOINIT)) == NULL) {
703 				uma_zfree(q->zone, cl);
704 				break;
705 			}
706 		}
707 		if ((sd->flags & RX_SW_DESC_MAP_CREATED) == 0) {
708 			if ((err = bus_dmamap_create(q->entry_tag, 0, &sd->map))) {
709 				log(LOG_WARNING, "bus_dmamap_create failed %d\n", err);
710 				uma_zfree(q->zone, cl);
711 				goto done;
712 			}
713 			sd->flags |= RX_SW_DESC_MAP_CREATED;
714 		}
715 #if !defined(__i386__) && !defined(__amd64__)
716 		err = bus_dmamap_load(q->entry_tag, sd->map,
717 		    cl, q->buf_size, refill_fl_cb, &cb_arg, 0);
718 
719 		if (err != 0 || cb_arg.error) {
720 			if (q->zone == zone_pack)
721 				uma_zfree(q->zone, cl);
722 			m_free(m);
723 			goto done;
724 		}
725 #else
726 		cb_arg.seg.ds_addr = pmap_kextract((vm_offset_t)cl);
727 #endif
728 		sd->flags |= RX_SW_DESC_INUSE;
729 		sd->rxsd_cl = cl;
730 		sd->m = m;
731 		d->addr_lo = htobe32(cb_arg.seg.ds_addr & 0xffffffff);
732 		d->addr_hi = htobe32(((uint64_t)cb_arg.seg.ds_addr >>32) & 0xffffffff);
733 		d->len_gen = htobe32(V_FLD_GEN1(q->gen));
734 		d->gen2 = htobe32(V_FLD_GEN2(q->gen));
735 
736 		d++;
737 		sd++;
738 
739 		if (++q->pidx == q->size) {
740 			q->pidx = 0;
741 			q->gen ^= 1;
742 			sd = q->sdesc;
743 			d = q->desc;
744 		}
745 		q->credits++;
746 		count++;
747 	}
748 
749 done:
750 	if (count)
751 		t3_write_reg(sc, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
752 }
753 
754 
755 /**
756  *	free_rx_bufs - free the Rx buffers on an SGE free list
757  *	@sc: the controller softc
758  *	@q: the SGE free list to clean up
759  *
760  *	Release the buffers on an SGE free-buffer Rx queue.  HW fetching from
761  *	this queue should be stopped before calling this function.
762  */
763 static void
764 free_rx_bufs(adapter_t *sc, struct sge_fl *q)
765 {
766 	u_int cidx = q->cidx;
767 
768 	while (q->credits--) {
769 		struct rx_sw_desc *d = &q->sdesc[cidx];
770 
771 		if (d->flags & RX_SW_DESC_INUSE) {
772 			bus_dmamap_unload(q->entry_tag, d->map);
773 			bus_dmamap_destroy(q->entry_tag, d->map);
774 			if (q->zone == zone_pack) {
775 				m_init(d->m, zone_pack, MCLBYTES,
776 				    M_NOWAIT, MT_DATA, M_EXT);
777 				uma_zfree(zone_pack, d->m);
778 			} else {
779 				m_init(d->m, zone_mbuf, MLEN,
780 				    M_NOWAIT, MT_DATA, 0);
781 				uma_zfree(zone_mbuf, d->m);
782 				uma_zfree(q->zone, d->rxsd_cl);
783 			}
784 		}
785 
786 		d->rxsd_cl = NULL;
787 		d->m = NULL;
788 		if (++cidx == q->size)
789 			cidx = 0;
790 	}
791 }
792 
793 static __inline void
794 __refill_fl(adapter_t *adap, struct sge_fl *fl)
795 {
796 	refill_fl(adap, fl, min(16U, fl->size - fl->credits));
797 }
798 
799 static __inline void
800 __refill_fl_lt(adapter_t *adap, struct sge_fl *fl, int max)
801 {
802 	if ((fl->size - fl->credits) < max)
803 		refill_fl(adap, fl, min(max, fl->size - fl->credits));
804 }
805 
806 /**
807  *	recycle_rx_buf - recycle a receive buffer
808  *	@adapter: the adapter
809  *	@q: the SGE free list
810  *	@idx: index of buffer to recycle
811  *
812  *	Recycles the specified buffer on the given free list by adding it at
813  *	the next available slot on the list.
814  */
815 static void
816 recycle_rx_buf(adapter_t *adap, struct sge_fl *q, unsigned int idx)
817 {
818 	struct rx_desc *from = &q->desc[idx];
819 	struct rx_desc *to   = &q->desc[q->pidx];
820 
821 	q->sdesc[q->pidx] = q->sdesc[idx];
822 	to->addr_lo = from->addr_lo;        // already big endian
823 	to->addr_hi = from->addr_hi;        // likewise
824 	wmb();	/* necessary ? */
825 	to->len_gen = htobe32(V_FLD_GEN1(q->gen));
826 	to->gen2 = htobe32(V_FLD_GEN2(q->gen));
827 	q->credits++;
828 
829 	if (++q->pidx == q->size) {
830 		q->pidx = 0;
831 		q->gen ^= 1;
832 	}
833 	t3_write_reg(adap, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
834 }
835 
836 static void
837 alloc_ring_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
838 {
839 	uint32_t *addr;
840 
841 	addr = arg;
842 	*addr = segs[0].ds_addr;
843 }
844 
845 static int
846 alloc_ring(adapter_t *sc, size_t nelem, size_t elem_size, size_t sw_size,
847     bus_addr_t *phys, void *desc, void *sdesc, bus_dma_tag_t *tag,
848     bus_dmamap_t *map, bus_dma_tag_t parent_entry_tag, bus_dma_tag_t *entry_tag)
849 {
850 	size_t len = nelem * elem_size;
851 	void *s = NULL;
852 	void *p = NULL;
853 	int err;
854 
855 	if ((err = bus_dma_tag_create(sc->parent_dmat, PAGE_SIZE, 0,
856 				      BUS_SPACE_MAXADDR_32BIT,
857 				      BUS_SPACE_MAXADDR, NULL, NULL, len, 1,
858 				      len, 0, NULL, NULL, tag)) != 0) {
859 		device_printf(sc->dev, "Cannot allocate descriptor tag\n");
860 		return (ENOMEM);
861 	}
862 
863 	if ((err = bus_dmamem_alloc(*tag, (void **)&p, BUS_DMA_NOWAIT,
864 				    map)) != 0) {
865 		device_printf(sc->dev, "Cannot allocate descriptor memory\n");
866 		return (ENOMEM);
867 	}
868 
869 	bus_dmamap_load(*tag, *map, p, len, alloc_ring_cb, phys, 0);
870 	bzero(p, len);
871 	*(void **)desc = p;
872 
873 	if (sw_size) {
874 		len = nelem * sw_size;
875 		s = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO);
876 		*(void **)sdesc = s;
877 	}
878 	if (parent_entry_tag == NULL)
879 		return (0);
880 
881 	if ((err = bus_dma_tag_create(parent_entry_tag, 1, 0,
882 				      BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR,
883 		                      NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
884 				      TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
885 		                      NULL, NULL, entry_tag)) != 0) {
886 		device_printf(sc->dev, "Cannot allocate descriptor entry tag\n");
887 		return (ENOMEM);
888 	}
889 	return (0);
890 }
891 
892 static void
893 sge_slow_intr_handler(void *arg, int ncount)
894 {
895 	adapter_t *sc = arg;
896 
897 	t3_slow_intr_handler(sc);
898 }
899 
900 /**
901  *	sge_timer_cb - perform periodic maintenance of the SGE queues
902  *	@arg: the adapter
903  *
904  *	Runs periodically from a timer to perform maintenance of the SGE
905  *	queues.  It performs the following tasks:
906  *
907  *	a) Cleans up any completed Tx descriptors that may still be pending.
908  *	Normal descriptor cleanup happens when new packets are added to a Tx
909  *	queue so this timer is relatively infrequent and does any cleanup only
910  *	if the Tx queue has not seen any new packets in a while.  We make a
911  *	best effort attempt to reclaim descriptors, in that we don't wait
912  *	around if we cannot get a queue's lock (which most likely is because
913  *	someone else is queueing new packets and so will also handle the clean
914  *	up).  Since control queues use immediate data exclusively we don't
915  *	bother cleaning them up here.
916  *
917  *	b) Replenishes Rx queues that have run out due to memory shortage.
918  *	Normally new Rx buffers are added when existing ones are consumed but
919  *	when out of memory a queue can become empty.  We try to add only a few
920  *	buffers here, the queue will be replenished fully as these new buffers
921  *	are used up if memory shortage has subsided.
922  *
923  *	c) Return coalesced response queue credits in case a response queue is
924  *	starved.
925  *
926  *	d) Ring doorbells for T304 tunnel queues since we have seen doorbell
927  *	fifo overflows and the FW doesn't implement any recovery scheme yet.
928  */
929 static void
930 sge_timer_cb(void *arg)
931 {
932 	adapter_t *sc = arg;
933 	if ((sc->flags & USING_MSIX) == 0) {
934 
935 		struct port_info *pi;
936 		struct sge_qset *qs;
937 		struct sge_txq  *txq;
938 		int i, j;
939 		int reclaim_ofl, refill_rx;
940 
941 		if (sc->open_device_map == 0)
942 			return;
943 
944 		for (i = 0; i < sc->params.nports; i++) {
945 			pi = &sc->port[i];
946 			for (j = 0; j < pi->nqsets; j++) {
947 				qs = &sc->sge.qs[pi->first_qset + j];
948 				txq = &qs->txq[0];
949 				reclaim_ofl = txq[TXQ_OFLD].processed - txq[TXQ_OFLD].cleaned;
950 				refill_rx = ((qs->fl[0].credits < qs->fl[0].size) ||
951 				    (qs->fl[1].credits < qs->fl[1].size));
952 				if (reclaim_ofl || refill_rx) {
953 					taskqueue_enqueue(sc->tq, &pi->timer_reclaim_task);
954 					break;
955 				}
956 			}
957 		}
958 	}
959 
960 	if (sc->params.nports > 2) {
961 		int i;
962 
963 		for_each_port(sc, i) {
964 			struct port_info *pi = &sc->port[i];
965 
966 			t3_write_reg(sc, A_SG_KDOORBELL,
967 				     F_SELEGRCNTX |
968 				     (FW_TUNNEL_SGEEC_START + pi->first_qset));
969 		}
970 	}
971 	if (((sc->flags & USING_MSIX) == 0 || sc->params.nports > 2) &&
972 	    sc->open_device_map != 0)
973 		callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
974 }
975 
976 /*
977  * This is meant to be a catch-all function to keep sge state private
978  * to sge.c
979  *
980  */
981 int
982 t3_sge_init_adapter(adapter_t *sc)
983 {
984 	callout_init(&sc->sge_timer_ch, CALLOUT_MPSAFE);
985 	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
986 	TASK_INIT(&sc->slow_intr_task, 0, sge_slow_intr_handler, sc);
987 	return (0);
988 }
989 
990 int
991 t3_sge_reset_adapter(adapter_t *sc)
992 {
993 	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
994 	return (0);
995 }
996 
997 int
998 t3_sge_init_port(struct port_info *pi)
999 {
1000 	TASK_INIT(&pi->timer_reclaim_task, 0, sge_timer_reclaim, pi);
1001 	return (0);
1002 }
1003 
1004 /**
1005  *	refill_rspq - replenish an SGE response queue
1006  *	@adapter: the adapter
1007  *	@q: the response queue to replenish
1008  *	@credits: how many new responses to make available
1009  *
1010  *	Replenishes a response queue by making the supplied number of responses
1011  *	available to HW.
1012  */
1013 static __inline void
1014 refill_rspq(adapter_t *sc, const struct sge_rspq *q, u_int credits)
1015 {
1016 
1017 	/* mbufs are allocated on demand when a rspq entry is processed. */
1018 	t3_write_reg(sc, A_SG_RSPQ_CREDIT_RETURN,
1019 		     V_RSPQ(q->cntxt_id) | V_CREDITS(credits));
1020 }
1021 
1022 static void
1023 sge_txq_reclaim_handler(void *arg, int ncount)
1024 {
1025 	struct sge_qset *qs = arg;
1026 	int i;
1027 
1028 	for (i = 0; i < 3; i++)
1029 		reclaim_completed_tx(qs, 16, i);
1030 }
1031 
1032 static void
1033 sge_timer_reclaim(void *arg, int ncount)
1034 {
1035 	struct port_info *pi = arg;
1036 	int i, nqsets = pi->nqsets;
1037 	adapter_t *sc = pi->adapter;
1038 	struct sge_qset *qs;
1039 	struct mtx *lock;
1040 
1041 	KASSERT((sc->flags & USING_MSIX) == 0,
1042 	    ("can't call timer reclaim for msi-x"));
1043 
1044 	for (i = 0; i < nqsets; i++) {
1045 		qs = &sc->sge.qs[pi->first_qset + i];
1046 
1047 		reclaim_completed_tx(qs, 16, TXQ_OFLD);
1048 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
1049 			    &sc->sge.qs[0].rspq.lock;
1050 
1051 		if (mtx_trylock(lock)) {
1052 			/* XXX currently assume that we are *NOT* polling */
1053 			uint32_t status = t3_read_reg(sc, A_SG_RSPQ_FL_STATUS);
1054 
1055 			if (qs->fl[0].credits < qs->fl[0].size - 16)
1056 				__refill_fl(sc, &qs->fl[0]);
1057 			if (qs->fl[1].credits < qs->fl[1].size - 16)
1058 				__refill_fl(sc, &qs->fl[1]);
1059 
1060 			if (status & (1 << qs->rspq.cntxt_id)) {
1061 				if (qs->rspq.credits) {
1062 					refill_rspq(sc, &qs->rspq, 1);
1063 					qs->rspq.credits--;
1064 					t3_write_reg(sc, A_SG_RSPQ_FL_STATUS,
1065 					    1 << qs->rspq.cntxt_id);
1066 				}
1067 			}
1068 			mtx_unlock(lock);
1069 		}
1070 	}
1071 }
1072 
1073 /**
1074  *	init_qset_cntxt - initialize an SGE queue set context info
1075  *	@qs: the queue set
1076  *	@id: the queue set id
1077  *
1078  *	Initializes the TIDs and context ids for the queues of a queue set.
1079  */
1080 static void
1081 init_qset_cntxt(struct sge_qset *qs, u_int id)
1082 {
1083 
1084 	qs->rspq.cntxt_id = id;
1085 	qs->fl[0].cntxt_id = 2 * id;
1086 	qs->fl[1].cntxt_id = 2 * id + 1;
1087 	qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id;
1088 	qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id;
1089 	qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id;
1090 	qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id;
1091 	qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id;
1092 
1093 	mbufq_init(&qs->txq[TXQ_ETH].sendq);
1094 	mbufq_init(&qs->txq[TXQ_OFLD].sendq);
1095 	mbufq_init(&qs->txq[TXQ_CTRL].sendq);
1096 }
1097 
1098 
1099 static void
1100 txq_prod(struct sge_txq *txq, unsigned int ndesc, struct txq_state *txqs)
1101 {
1102 	txq->in_use += ndesc;
1103 	/*
1104 	 * XXX we don't handle stopping of the queue;
1105 	 * presumably start handles this when we bump against the end
1106 	 */
1107 	txqs->gen = txq->gen;
1108 	txq->unacked += ndesc;
1109 	txqs->compl = (txq->unacked & 32) << (S_WR_COMPL - 5);
1110 	txq->unacked &= 31;
1111 	txqs->pidx = txq->pidx;
1112 	txq->pidx += ndesc;
1113 #ifdef INVARIANTS
1114 	if (((txqs->pidx > txq->cidx) &&
1115 		(txq->pidx < txqs->pidx) &&
1116 		(txq->pidx >= txq->cidx)) ||
1117 	    ((txqs->pidx < txq->cidx) &&
1118 		(txq->pidx >= txq->cidx)) ||
1119 	    ((txqs->pidx < txq->cidx) &&
1120 		(txq->cidx < txqs->pidx)))
1121 		panic("txqs->pidx=%d txq->pidx=%d txq->cidx=%d",
1122 		    txqs->pidx, txq->pidx, txq->cidx);
1123 #endif
1124 	if (txq->pidx >= txq->size) {
1125 		txq->pidx -= txq->size;
1126 		txq->gen ^= 1;
1127 	}
1128 
1129 }
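/*
 * Illustrative example (editorial note): with txq->size == 256, pidx == 254
 * and ndesc == 4, txqs->pidx is recorded as 254, txq->pidx wraps to 2 and
 * txq->gen flips, so the descriptors written at the old index carry the old
 * generation value saved in txqs->gen.
 */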
1130 
1131 /**
1132  *	calc_tx_descs - calculate the number of Tx descriptors for a packet
1133  *	@m: the packet mbufs
1134  *      @nsegs: the number of segments
1135  *
1136  * 	Returns the number of Tx descriptors needed for the given Ethernet
1137  * 	packet.  Ethernet packets require addition of WR and CPL headers.
1138  */
1139 static __inline unsigned int
1140 calc_tx_descs(const struct mbuf *m, int nsegs)
1141 {
1142 	unsigned int flits;
1143 
1144 	if (m->m_pkthdr.len <= PIO_LEN)
1145 		return 1;
1146 
1147 	flits = sgl_len(nsegs) + 2;
1148 #ifdef TSO_SUPPORTED
1149 	if (m->m_pkthdr.csum_flags & CSUM_TSO)
1150 		flits++;
1151 #endif
1152 	return flits_to_desc(flits);
1153 }
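/*
 * Illustrative arithmetic (editorial note): a packet larger than PIO_LEN
 * that maps to 4 DMA segments needs sgl_len(4) + 2 == 8 flits (9 if TSO
 * adds an LSO flit), which flits_to_desc() then converts to a descriptor
 * count via flit_desc_map[] above.
 */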
1154 
1155 static unsigned int
1156 busdma_map_mbufs(struct mbuf **m, struct sge_txq *txq,
1157     struct tx_sw_desc *txsd, bus_dma_segment_t *segs, int *nsegs)
1158 {
1159 	struct mbuf *m0;
1160 	int err, pktlen, pass = 0;
1161 	bus_dma_tag_t tag = txq->entry_tag;
1162 
1163 retry:
1164 	err = 0;
1165 	m0 = *m;
1166 	pktlen = m0->m_pkthdr.len;
1167 #if defined(__i386__) || defined(__amd64__)
1168 	if (busdma_map_sg_collapse(tag, txsd->map, m, segs, nsegs) == 0) {
1169 		goto done;
1170 	} else
1171 #endif
1172 		err = bus_dmamap_load_mbuf_sg(tag, txsd->map, m0, segs, nsegs, 0);
1173 
1174 	if (err == 0) {
1175 		goto done;
1176 	}
1177 	if (err == EFBIG && pass == 0) {
1178 		pass = 1;
1179 		/* Too many segments, try to defrag */
1180 		m0 = m_defrag(m0, M_DONTWAIT);
1181 		if (m0 == NULL) {
1182 			m_freem(*m);
1183 			*m = NULL;
1184 			return (ENOBUFS);
1185 		}
1186 		*m = m0;
1187 		goto retry;
1188 	} else if (err == ENOMEM) {
1189 		return (err);
1190 	} else if (err) {
1191 		if (cxgb_debug)
1192 			printf("map failure err=%d pktlen=%d\n", err, pktlen);
1193 		m_freem(m0);
1194 		*m = NULL;
1195 		return (err);
1196 	}
1197 done:
1198 #if !defined(__i386__) && !defined(__amd64__)
1199 	bus_dmamap_sync(tag, txsd->map, BUS_DMASYNC_PREWRITE);
1200 #endif
1201 	txsd->flags |= TX_SW_DESC_MAPPED;
1202 
1203 	return (0);
1204 }
1205 
1206 /**
1207  *	make_sgl - populate a scatter/gather list for a packet
1208  *	@sgp: the SGL to populate
1209  *	@segs: the packet dma segments
1210  *	@nsegs: the number of segments
1211  *
1212  *	Generates a scatter/gather list for the buffers that make up a packet.
1213  *	The caller must size the SGL appropriately; sgl_len() gives the number
1214  *	of flits it will occupy for a given segment count.
1215  */
1216 static __inline void
1217 make_sgl(struct sg_ent *sgp, bus_dma_segment_t *segs, int nsegs)
1218 {
1219 	int i, idx;
1220 
1221 	for (idx = 0, i = 0; i < nsegs; i++) {
1222 		/*
1223 		 * firmware doesn't like empty segments
1224 		 */
1225 		if (segs[i].ds_len == 0)
1226 			continue;
1227 		if (i && idx == 0)
1228 			++sgp;
1229 
1230 		sgp->len[idx] = htobe32(segs[i].ds_len);
1231 		sgp->addr[idx] = htobe64(segs[i].ds_addr);
1232 		idx ^= 1;
1233 	}
1234 
1235 	if (idx) {
1236 		sgp->len[idx] = 0;
1237 		sgp->addr[idx] = 0;
1238 	}
1239 }
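/*
 * Illustrative layout (editorial note): entries are packed two per sg_ent,
 * so for segments A, B, C the first sg_ent holds { len[0]=A.len,
 * len[1]=B.len, addr[0]=A.addr, addr[1]=B.addr } and a second sg_ent holds
 * C, with its unused second slot zeroed by the (idx != 0) tail fix-up above.
 */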
1240 
1241 /**
1242  *	check_ring_tx_db - check and potentially ring a Tx queue's doorbell
1243  *	@adap: the adapter
1244  *	@q: the Tx queue
1245  *
1246  *	Ring the doorbell if a Tx queue is asleep.  There is a natural race
1247  *	where the HW may go to sleep just after we check; in that case the
1248  *	interrupt handler will detect the outstanding TX packet and ring the
1249  *	doorbell for us.
1250  *
1251  *	When GTS is disabled we unconditionally ring the doorbell.
1252  */
1253 static __inline void
1254 check_ring_tx_db(adapter_t *adap, struct sge_txq *q)
1255 {
1256 #if USE_GTS
1257 	clear_bit(TXQ_LAST_PKT_DB, &q->flags);
1258 	if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) {
1259 		set_bit(TXQ_LAST_PKT_DB, &q->flags);
1260 #ifdef T3_TRACE
1261 		T3_TRACE1(adap->tb[q->cntxt_id & 7], "doorbell Tx, cntxt %d",
1262 			  q->cntxt_id);
1263 #endif
1264 		t3_write_reg(adap, A_SG_KDOORBELL,
1265 			     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1266 	}
1267 #else
1268 	wmb();            /* write descriptors before telling HW */
1269 	t3_write_reg(adap, A_SG_KDOORBELL,
1270 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1271 #endif
1272 }
1273 
1274 static __inline void
1275 wr_gen2(struct tx_desc *d, unsigned int gen)
1276 {
1277 #if SGE_NUM_GENBITS == 2
1278 	d->flit[TX_DESC_FLITS - 1] = htobe64(gen);
1279 #endif
1280 }
1281 
1282 /**
1283  *	write_wr_hdr_sgl - write a WR header and, optionally, SGL
1284  *	@ndesc: number of Tx descriptors spanned by the SGL
1285  *	@txd: first Tx descriptor to be written
1286  *	@txqs: txq state (generation and producer index)
1287  *	@txq: the SGE Tx queue
1288  *	@sgl: the SGL
1289  *	@flits: number of flits to the start of the SGL in the first descriptor
1290  *	@sgl_flits: the SGL size in flits
1291  *	@wr_hi: top 32 bits of WR header based on WR type (big endian)
1292  *	@wr_lo: low 32 bits of WR header based on WR type (big endian)
1293  *
1294  *	Write a work request header and an associated SGL.  If the SGL is
1295  *	small enough to fit into one Tx descriptor it has already been written
1296  *	and we just need to write the WR header.  Otherwise we distribute the
1297  *	SGL across the number of descriptors it spans.
1298  */
1299 static void
1300 write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs,
1301     const struct sge_txq *txq, const struct sg_ent *sgl, unsigned int flits,
1302     unsigned int sgl_flits, unsigned int wr_hi, unsigned int wr_lo)
1303 {
1304 
1305 	struct work_request_hdr *wrp = (struct work_request_hdr *)txd;
1306 	struct tx_sw_desc *txsd = &txq->sdesc[txqs->pidx];
1307 
1308 	if (__predict_true(ndesc == 1)) {
1309 		set_wr_hdr(wrp, htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1310 			V_WR_SGLSFLT(flits)) | wr_hi,
1311 		    htonl(V_WR_LEN(flits + sgl_flits) |
1312 			V_WR_GEN(txqs->gen)) | wr_lo);
1313 		/* XXX gen? */
1314 		wr_gen2(txd, txqs->gen);
1315 
1316 	} else {
1317 		unsigned int ogen = txqs->gen;
1318 		const uint64_t *fp = (const uint64_t *)sgl;
1319 		struct work_request_hdr *wp = wrp;
1320 
1321 		wrp->wrh_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) |
1322 		    V_WR_SGLSFLT(flits)) | wr_hi;
1323 
1324 		while (sgl_flits) {
1325 			unsigned int avail = WR_FLITS - flits;
1326 
1327 			if (avail > sgl_flits)
1328 				avail = sgl_flits;
1329 			memcpy(&txd->flit[flits], fp, avail * sizeof(*fp));
1330 			sgl_flits -= avail;
1331 			ndesc--;
1332 			if (!sgl_flits)
1333 				break;
1334 
1335 			fp += avail;
1336 			txd++;
1337 			txsd++;
1338 			if (++txqs->pidx == txq->size) {
1339 				txqs->pidx = 0;
1340 				txqs->gen ^= 1;
1341 				txd = txq->desc;
1342 				txsd = txq->sdesc;
1343 			}
1344 
1345 			/*
1346 			 * when the head of the mbuf chain
1347 			 * is freed all clusters will be freed
1348 			 * with it
1349 			 */
1350 			wrp = (struct work_request_hdr *)txd;
1351 			wrp->wrh_hi = htonl(V_WR_DATATYPE(1) |
1352 			    V_WR_SGLSFLT(1)) | wr_hi;
1353 			wrp->wrh_lo = htonl(V_WR_LEN(min(WR_FLITS,
1354 				    sgl_flits + 1)) |
1355 			    V_WR_GEN(txqs->gen)) | wr_lo;
1356 			wr_gen2(txd, txqs->gen);
1357 			flits = 1;
1358 		}
1359 		wrp->wrh_hi |= htonl(F_WR_EOP);
1360 		wmb();
1361 		wp->wrh_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo;
1362 		wr_gen2((struct tx_desc *)wp, ogen);
1363 	}
1364 }
1365 
1366 /* sizeof(*eh) + sizeof(*vhdr) + sizeof(*ip) + sizeof(*tcp) */
1367 #define TCPPKTHDRSIZE (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + 20 + 20)
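/*
 * Editorial note: with the standard 14-byte Ethernet header, 4-byte VLAN
 * encapsulation, and minimal 20-byte IP and TCP headers this works out to
 * 58 bytes; the untagged case in t3_encap() subtracts ETHER_VLAN_ENCAP_LEN
 * again.
 */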
1368 
1369 #ifdef VLAN_SUPPORTED
1370 #define GET_VTAG(cntrl, m) \
1371 do { \
1372 	if ((m)->m_flags & M_VLANTAG)					            \
1373 		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN((m)->m_pkthdr.ether_vtag); \
1374 } while (0)
1375 
1376 #else
1377 #define GET_VTAG(cntrl, m)
1378 #endif
1379 
1380 static int
1381 t3_encap(struct sge_qset *qs, struct mbuf **m)
1382 {
1383 	adapter_t *sc;
1384 	struct mbuf *m0;
1385 	struct sge_txq *txq;
1386 	struct txq_state txqs;
1387 	struct port_info *pi;
1388 	unsigned int ndesc, flits, cntrl, mlen;
1389 	int err, nsegs, tso_info = 0;
1390 
1391 	struct work_request_hdr *wrp;
1392 	struct tx_sw_desc *txsd;
1393 	struct sg_ent *sgp, *sgl;
1394 	uint32_t wr_hi, wr_lo, sgl_flits;
1395 	bus_dma_segment_t segs[TX_MAX_SEGS];
1396 
1397 	struct tx_desc *txd;
1398 
1399 	pi = qs->port;
1400 	sc = pi->adapter;
1401 	txq = &qs->txq[TXQ_ETH];
1402 	txd = &txq->desc[txq->pidx];
1403 	txsd = &txq->sdesc[txq->pidx];
1404 	sgl = txq->txq_sgl;
1405 
1406 	prefetch(txd);
1407 	m0 = *m;
1408 
1409 	DPRINTF("t3_encap port_id=%d qsidx=%d ", pi->port_id, pi->first_qset);
1410 	DPRINTF("mlen=%d txpkt_intf=%d tx_chan=%d\n", m[0]->m_pkthdr.len, pi->txpkt_intf, pi->tx_chan);
1411 
1412 	mtx_assert(&qs->lock, MA_OWNED);
1413 	cntrl = V_TXPKT_INTF(pi->txpkt_intf);
1414 	KASSERT(m0->m_flags & M_PKTHDR, ("not packet header\n"));
1415 
1416 #ifdef VLAN_SUPPORTED
1417 	if  (m0->m_nextpkt == NULL && m0->m_next != NULL &&
1418 	    m0->m_pkthdr.csum_flags & (CSUM_TSO))
1419 		tso_info = V_LSO_MSS(m0->m_pkthdr.tso_segsz);
1420 #endif
1421 	if (m0->m_nextpkt != NULL) {
1422 		busdma_map_sg_vec(txq->entry_tag, txsd->map, m0, segs, &nsegs);
1423 		ndesc = 1;
1424 		mlen = 0;
1425 	} else {
1426 		if ((err = busdma_map_sg_collapse(txq->entry_tag, txsd->map,
1427 		    &m0, segs, &nsegs))) {
1428 			if (cxgb_debug)
1429 				printf("failed ... err=%d\n", err);
1430 			return (err);
1431 		}
1432 		mlen = m0->m_pkthdr.len;
1433 		ndesc = calc_tx_descs(m0, nsegs);
1434 	}
1435 	txq_prod(txq, ndesc, &txqs);
1436 
1437 	KASSERT(m0->m_pkthdr.len, ("empty packet nsegs=%d", nsegs));
1438 	txsd->m = m0;
1439 
1440 	if (m0->m_nextpkt != NULL) {
1441 		struct cpl_tx_pkt_batch *cpl_batch = (struct cpl_tx_pkt_batch *)txd;
1442 		int i, fidx;
1443 
1444 		if (nsegs > 7)
1445 			panic("trying to coalesce %d packets into one WR", nsegs);
1446 		txq->txq_coalesced += nsegs;
1447 		wrp = (struct work_request_hdr *)txd;
1448 		flits = nsegs*2 + 1;
1449 
1450 		for (fidx = 1, i = 0; i < nsegs; i++, fidx += 2) {
1451 			struct cpl_tx_pkt_batch_entry *cbe;
1452 			uint64_t flit;
1453 			uint32_t *hflit = (uint32_t *)&flit;
1454 			int cflags = m0->m_pkthdr.csum_flags;
1455 
1456 			cntrl = V_TXPKT_INTF(pi->txpkt_intf);
1457 			GET_VTAG(cntrl, m0);
1458 			cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1459 			if (__predict_false(!(cflags & CSUM_IP)))
1460 				cntrl |= F_TXPKT_IPCSUM_DIS;
1461 			if (__predict_false(!(cflags & (CSUM_TCP | CSUM_UDP))))
1462 				cntrl |= F_TXPKT_L4CSUM_DIS;
1463 
1464 			hflit[0] = htonl(cntrl);
1465 			hflit[1] = htonl(segs[i].ds_len | 0x80000000);
1466 			flit |= htobe64(1 << 24);
1467 			cbe = &cpl_batch->pkt_entry[i];
1468 			cbe->cntrl = hflit[0];
1469 			cbe->len = hflit[1];
1470 			cbe->addr = htobe64(segs[i].ds_addr);
1471 		}
1472 
1473 		wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1474 		    V_WR_SGLSFLT(flits)) |
1475 		    htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1476 		wr_lo = htonl(V_WR_LEN(flits) |
1477 		    V_WR_GEN(txqs.gen)) | htonl(V_WR_TID(txq->token));
1478 		set_wr_hdr(wrp, wr_hi, wr_lo);
1479 		wmb();
1480 		wr_gen2(txd, txqs.gen);
1481 		check_ring_tx_db(sc, txq);
1482 		return (0);
1483 	} else if (tso_info) {
1484 		int min_size = TCPPKTHDRSIZE, eth_type, tagged;
1485 		struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *)txd;
1486 		struct ip *ip;
1487 		struct tcphdr *tcp;
1488 		char *pkthdr;
1489 
1490 		txd->flit[2] = 0;
1491 		GET_VTAG(cntrl, m0);
1492 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO);
1493 		hdr->cntrl = htonl(cntrl);
1494 		hdr->len = htonl(mlen | 0x80000000);
1495 
1496 		DPRINTF("tso buf len=%d\n", mlen);
1497 
1498 		tagged = m0->m_flags & M_VLANTAG;
1499 		if (!tagged)
1500 			min_size -= ETHER_VLAN_ENCAP_LEN;
1501 
1502 		if (__predict_false(mlen < min_size)) {
1503 			printf("mbuf=%p,len=%d,tso_segsz=%d,csum_flags=%#x,flags=%#x",
1504 			    m0, mlen, m0->m_pkthdr.tso_segsz,
1505 			    m0->m_pkthdr.csum_flags, m0->m_flags);
1506 			panic("tx tso packet too small");
1507 		}
1508 
1509 		/* Make sure that ether, ip, tcp headers are all in m0 */
1510 		if (__predict_false(m0->m_len < min_size)) {
1511 			m0 = m_pullup(m0, min_size);
1512 			if (__predict_false(m0 == NULL)) {
1513 				/* XXX panic probably an overreaction */
1514 				panic("couldn't fit header into mbuf");
1515 			}
1516 		}
1517 		pkthdr = m0->m_data;
1518 
1519 		if (tagged) {
1520 			eth_type = CPL_ETH_II_VLAN;
1521 			ip = (struct ip *)(pkthdr + ETHER_HDR_LEN +
1522 			    ETHER_VLAN_ENCAP_LEN);
1523 		} else {
1524 			eth_type = CPL_ETH_II;
1525 			ip = (struct ip *)(pkthdr + ETHER_HDR_LEN);
1526 		}
1527 		tcp = (struct tcphdr *)((uint8_t *)ip +
1528 		    sizeof(*ip));
1529 
1530 		tso_info |= V_LSO_ETH_TYPE(eth_type) |
1531 			    V_LSO_IPHDR_WORDS(ip->ip_hl) |
1532 			    V_LSO_TCPHDR_WORDS(tcp->th_off);
1533 		hdr->lso_info = htonl(tso_info);
1534 
1535 		if (__predict_false(mlen <= PIO_LEN)) {
1536 			/*
1537 			 * Packet is not undersized but still fits in PIO_LEN;
1538 			 * this indicates a TSO bug at the higher layers.
1539 			 */
1540 			DPRINTF("**5592 Fix** mbuf=%p,len=%d,tso_segsz=%d,csum_flags=%#x,flags=%#x",
1541 			    m0, mlen, m0->m_pkthdr.tso_segsz, m0->m_pkthdr.csum_flags, m0->m_flags);
1542 			txsd->m = NULL;
1543 			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[3]);
1544 			flits = (mlen + 7) / 8 + 3;
1545 			wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1546 					  V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1547 					  F_WR_SOP | F_WR_EOP | txqs.compl);
1548 			wr_lo = htonl(V_WR_LEN(flits) |
1549 			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1550 			set_wr_hdr(&hdr->wr, wr_hi, wr_lo);
1551 			wmb();
1552 			wr_gen2(txd, txqs.gen);
1553 			check_ring_tx_db(sc, txq);
1554 			return (0);
1555 		}
1556 		flits = 3;
1557 	} else {
1558 		struct cpl_tx_pkt *cpl = (struct cpl_tx_pkt *)txd;
1559 
1560 		GET_VTAG(cntrl, m0);
1561 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1562 		if (__predict_false(!(m0->m_pkthdr.csum_flags & CSUM_IP)))
1563 			cntrl |= F_TXPKT_IPCSUM_DIS;
1564 		if (__predict_false(!(m0->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP))))
1565 			cntrl |= F_TXPKT_L4CSUM_DIS;
1566 		cpl->cntrl = htonl(cntrl);
1567 		cpl->len = htonl(mlen | 0x80000000);
1568 
1569 		if (mlen <= PIO_LEN) {
1570 			txsd->m = NULL;
1571 			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[2]);
1572 			flits = (mlen + 7) / 8 + 2;
1573 
1574 			wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1575 			    V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1576 					  F_WR_SOP | F_WR_EOP | txqs.compl);
1577 			wr_lo = htonl(V_WR_LEN(flits) |
1578 			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1579 			set_wr_hdr(&cpl->wr, wr_hi, wr_lo);
1580 			wmb();
1581 			wr_gen2(txd, txqs.gen);
1582 			check_ring_tx_db(sc, txq);
1583 			return (0);
1584 		}
1585 		flits = 2;
1586 	}
1587 	wrp = (struct work_request_hdr *)txd;
1588 	sgp = (ndesc == 1) ? (struct sg_ent *)&txd->flit[flits] : sgl;
1589 	make_sgl(sgp, segs, nsegs);
1590 
1591 	sgl_flits = sgl_len(nsegs);
1592 
1593 	KASSERT(ndesc <= 4, ("ndesc too large %d", ndesc));
1594 	wr_hi = htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1595 	wr_lo = htonl(V_WR_TID(txq->token));
1596 	write_wr_hdr_sgl(ndesc, txd, &txqs, txq, sgl, flits,
1597 	    sgl_flits, wr_hi, wr_lo);
1598 	check_ring_tx_db(pi->adapter, txq);
1599 
1600 	return (0);
1601 }
1602 
1603 void
1604 cxgb_tx_watchdog(void *arg)
1605 {
1606 	struct sge_qset *qs = arg;
1607 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1608 
1609         if (qs->coalescing != 0 &&
1610 	    (txq->in_use <= cxgb_tx_coalesce_enable_stop) &&
1611 	    TXQ_RING_EMPTY(qs))
1612                 qs->coalescing = 0;
1613         else if (qs->coalescing == 0 &&
1614 	    (txq->in_use >= cxgb_tx_coalesce_enable_start))
1615                 qs->coalescing = 1;
1616 	if (TXQ_TRYLOCK(qs)) {
1617 		qs->qs_flags |= QS_FLUSHING;
1618 		cxgb_start_locked(qs);
1619 		qs->qs_flags &= ~QS_FLUSHING;
1620 		TXQ_UNLOCK(qs);
1621 	}
1622 	if (qs->port->ifp->if_drv_flags & IFF_DRV_RUNNING)
1623 		callout_reset_on(&txq->txq_watchdog, hz/4, cxgb_tx_watchdog,
1624 		    qs, txq->txq_watchdog.c_cpu);
1625 }
1626 
1627 static void
1628 cxgb_tx_timeout(void *arg)
1629 {
1630 	struct sge_qset *qs = arg;
1631 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1632 
1633 	if (qs->coalescing == 0 && (txq->in_use >= (txq->size>>3)))
1634                 qs->coalescing = 1;
1635 	if (TXQ_TRYLOCK(qs)) {
1636 		qs->qs_flags |= QS_TIMEOUT;
1637 		cxgb_start_locked(qs);
1638 		qs->qs_flags &= ~QS_TIMEOUT;
1639 		TXQ_UNLOCK(qs);
1640 	}
1641 }
1642 
1643 static void
1644 cxgb_start_locked(struct sge_qset *qs)
1645 {
1646 	struct mbuf *m_head = NULL;
1647 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1648 	int avail, txmax;
1649 	int in_use_init = txq->in_use;
1650 	struct port_info *pi = qs->port;
1651 	struct ifnet *ifp = pi->ifp;
1652 	avail = txq->size - txq->in_use - 4;
1653 	txmax = min(TX_START_MAX_DESC, avail);
1654 
1655 	if (qs->qs_flags & (QS_FLUSHING|QS_TIMEOUT))
1656 		reclaim_completed_tx(qs, 0, TXQ_ETH);
1657 
1658 	if (!pi->link_config.link_ok) {
1659 		TXQ_RING_FLUSH(qs);
1660 		return;
1661 	}
1662 	TXQ_LOCK_ASSERT(qs);
1663 	while ((txq->in_use - in_use_init < txmax) &&
1664 	    !TXQ_RING_EMPTY(qs) &&
1665 	    (ifp->if_drv_flags & IFF_DRV_RUNNING) &&
1666 	    pi->link_config.link_ok) {
1667 		reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH);
1668 
1669 		if ((m_head = cxgb_dequeue(qs)) == NULL)
1670 			break;
1671 		/*
1672 		 *  Encapsulation can modify our pointer, and/or make it
1673 		 *  NULL on failure.  In that event, we can't requeue.
1674 		 */
1675 		if (t3_encap(qs, &m_head) || m_head == NULL)
1676 			break;
1677 
1678 		/* Send a copy of the frame to the BPF listener */
1679 		ETHER_BPF_MTAP(ifp, m_head);
1680 
1681 		/*
1682 		 * We sent via PIO, no longer need a copy
1683 		 */
1684 		if (m_head->m_nextpkt == NULL &&
1685 		    m_head->m_pkthdr.len <= PIO_LEN)
1686 			m_freem(m_head);
1687 
1688 		m_head = NULL;
1689 	}
1690 	if (!TXQ_RING_EMPTY(qs) && callout_pending(&txq->txq_timer) == 0 &&
1691 	    pi->link_config.link_ok)
1692 		callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout,
1693 		    qs, txq->txq_timer.c_cpu);
1694 	if (m_head != NULL)
1695 		m_freem(m_head);
1696 }
1697 
1698 static int
1699 cxgb_transmit_locked(struct ifnet *ifp, struct sge_qset *qs, struct mbuf *m)
1700 {
1701 	struct port_info *pi = qs->port;
1702 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1703 	struct buf_ring *br = txq->txq_mr;
1704 	int error, avail;
1705 
1706 	avail = txq->size - txq->in_use;
1707 	TXQ_LOCK_ASSERT(qs);
1708 
1709 	/*
1710 	 * We can only do a direct transmit if the following are true:
1711 	 * - we aren't coalescing (ring < 3/4 full)
1712 	 * - the link is up -- checked in caller
1713 	 * - there are no packets enqueued already
1714 	 * - there is space in hardware transmit queue
1715 	 */
1716 	if (check_pkt_coalesce(qs) == 0 &&
1717 	    !TXQ_RING_NEEDS_ENQUEUE(qs) && avail > 4) {
1718 		if (t3_encap(qs, &m)) {
1719 			if (m != NULL &&
1720 			    (error = drbr_enqueue(ifp, br, m)) != 0)
1721 				return (error);
1722 		} else {
1723 			/*
1724 			 * We've bypassed the buf ring so we need to update
1725 			 * the stats directly
1726 			 */
1727 			txq->txq_direct_packets++;
1728 			txq->txq_direct_bytes += m->m_pkthdr.len;
1729 			/*
1730 			** Send a copy of the frame to the BPF
1731 			** listener and set the watchdog on.
1732 			*/
1733 			ETHER_BPF_MTAP(ifp, m);
1734 			/*
1735 			 * We sent via PIO, no longer need a copy
1736 			 */
1737 			if (m->m_pkthdr.len <= PIO_LEN)
1738 				m_freem(m);
1739 
1740 		}
1741 	} else if ((error = drbr_enqueue(ifp, br, m)) != 0)
1742 		return (error);
1743 
1744 	reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH);
1745 	if (!TXQ_RING_EMPTY(qs) && pi->link_config.link_ok &&
1746 	    (!check_pkt_coalesce(qs) || (drbr_inuse(ifp, br) >= 7)))
1747 		cxgb_start_locked(qs);
1748 	else if (!TXQ_RING_EMPTY(qs) && !callout_pending(&txq->txq_timer))
1749 		callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout,
1750 		    qs, txq->txq_timer.c_cpu);
1751 	return (0);
1752 }
1753 
1754 int
1755 cxgb_transmit(struct ifnet *ifp, struct mbuf *m)
1756 {
1757 	struct sge_qset *qs;
1758 	struct port_info *pi = ifp->if_softc;
1759 	int error, qidx = pi->first_qset;
1760 
1761 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0
1762 	    || (!pi->link_config.link_ok)) {
1763 		m_freem(m);
1764 		return (0);
1765 	}
1766 
1767 	if (m->m_flags & M_FLOWID)
1768 		qidx = (m->m_pkthdr.flowid % pi->nqsets) + pi->first_qset;
1769 
1770 	qs = &pi->adapter->sge.qs[qidx];
1771 
1772 	if (TXQ_TRYLOCK(qs)) {
1773 		/* XXX running */
1774 		error = cxgb_transmit_locked(ifp, qs, m);
1775 		TXQ_UNLOCK(qs);
1776 	} else
1777 		error = drbr_enqueue(ifp, qs->txq[TXQ_ETH].txq_mr, m);
1778 	return (error);
1779 }
1780 void
1781 cxgb_start(struct ifnet *ifp)
1782 {
1783 	struct port_info *pi = ifp->if_softc;
1784 	struct sge_qset *qs = &pi->adapter->sge.qs[pi->first_qset];
1785 
1786 	if (!pi->link_config.link_ok)
1787 		return;
1788 
1789 	TXQ_LOCK(qs);
1790 	cxgb_start_locked(qs);
1791 	TXQ_UNLOCK(qs);
1792 }
1793 
1794 void
1795 cxgb_qflush(struct ifnet *ifp)
1796 {
1797 	/*
1798 	 * Flush any enqueued mbufs in the buf_rings
1799 	 * and in the transmit queues.
1800 	 * This is a no-op for now.
1801 	 */
1802 	return;
1803 }
1804 
1805 /**
1806  *	write_imm - write a packet into a Tx descriptor as immediate data
1807  *	@d: the Tx descriptor to write
1808  *	@m: the packet
1809  *	@len: the length of packet data to write as immediate data
1810  *	@gen: the generation bit value to write
1811  *
1812  *	Writes a packet as immediate data into a Tx descriptor.  The packet
1813  *	contains a work request at its beginning.  We must write the packet
1814  *	carefully so the SGE doesn't accidentally read it before it has been
1815  *	written in its entirety.
1816  */
1817 static __inline void
1818 write_imm(struct tx_desc *d, struct mbuf *m,
1819 	  unsigned int len, unsigned int gen)
1820 {
1821 	struct work_request_hdr *from = mtod(m, struct work_request_hdr *);
1822 	struct work_request_hdr *to = (struct work_request_hdr *)d;
1823 	uint32_t wr_hi, wr_lo;
1824 
1825 	if (len > WR_LEN)
1826 		panic("len too big %d\n", len);
1827 	if (len < sizeof(*from))
1828 		panic("len too small %d", len);
1829 
1830 	memcpy(&to[1], &from[1], len - sizeof(*from));
1831 	wr_hi = from->wrh_hi | htonl(F_WR_SOP | F_WR_EOP |
1832 					V_WR_BCNTLFLT(len & 7));
1833 	wr_lo = from->wrh_lo | htonl(V_WR_GEN(gen) |
1834 					V_WR_LEN((len + 7) / 8));
1835 	set_wr_hdr(to, wr_hi, wr_lo);
1836 	wmb();
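	/*
	 * The write barrier ensures the body of the work request is visible
	 * in memory before wr_gen2() stamps the generation value into the
	 * descriptor's final flit, which is what marks it valid to the SGE.
	 */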
1837 	wr_gen2(d, gen);
1838 
1839 	/*
1840 	 * This check is a hack; we should really fix the logic so
1841 	 * that this can't happen.
1842 	 */
1843 	if (m->m_type != MT_DONTFREE)
1844 		m_freem(m);
1845 
1846 }
1847 
1848 /**
1849  *	check_desc_avail - check descriptor availability on a send queue
1850  *	@adap: the adapter
1851  *	@q: the TX queue
1852  *	@m: the packet needing the descriptors
1853  *	@ndesc: the number of Tx descriptors needed
1854  *	@qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL)
1855  *
1856  *	Checks if the requested number of Tx descriptors is available on an
1857  *	SGE send queue.  If the queue is already suspended or not enough
1858  *	descriptors are available the packet is queued for later transmission.
1859  *	Must be called with the Tx queue locked.
1860  *
1861  *	Returns 0 if enough descriptors are available, 1 if there aren't
1862  *	enough descriptors and the packet has been queued, and 2 if the caller
1863  *	needs to retry because there weren't enough descriptors at the
1864  *	beginning of the call but some freed up in the meantime.
1865  */
1866 static __inline int
1867 check_desc_avail(adapter_t *adap, struct sge_txq *q,
1868 		 struct mbuf *m, unsigned int ndesc,
1869 		 unsigned int qid)
1870 {
1871 	/*
1872 	 * XXX We currently only use this for checking the control queue;
1873 	 * the control queue is only used for binding qsets, which happens
1874 	 * at init time, so we are guaranteed enough descriptors.
1875 	 */
1876 	if (__predict_false(!mbufq_empty(&q->sendq))) {
1877 addq_exit:	mbufq_tail(&q->sendq, m);
1878 		return 1;
1879 	}
1880 	if (__predict_false(q->size - q->in_use < ndesc)) {
1881 
1882 		struct sge_qset *qs = txq_to_qset(q, qid);
1883 
1884 		setbit(&qs->txq_stopped, qid);
1885 		if (should_restart_tx(q) &&
1886 		    test_and_clear_bit(qid, &qs->txq_stopped))
1887 			return 2;
1888 
1889 		q->stops++;
1890 		goto addq_exit;
1891 	}
1892 	return 0;
1893 }
1894 
1895 
1896 /**
1897  *	reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
1898  *	@q: the SGE control Tx queue
1899  *
1900  *	This is a variant of reclaim_completed_tx() that is used for Tx queues
1901  *	that send only immediate data (presently just the control queues) and
1902  *	thus do not have any mbufs.
1903  */
1904 static __inline void
1905 reclaim_completed_tx_imm(struct sge_txq *q)
1906 {
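	/*
	 * q->processed is advanced by completion credits returned by the
	 * hardware; q->cleaned tracks what we have already accounted for.
	 * The difference is the number of descriptors we can hand back.
	 */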
1907 	unsigned int reclaim = q->processed - q->cleaned;
1908 
1909 	q->in_use -= reclaim;
1910 	q->cleaned += reclaim;
1911 }
1912 
1913 static __inline int
1914 immediate(const struct mbuf *m)
1915 {
1916 	return (m->m_len <= WR_LEN && m->m_pkthdr.len <= WR_LEN);
1917 }
1918 
1919 /**
1920  *	ctrl_xmit - send a packet through an SGE control Tx queue
1921  *	@adap: the adapter
1922  *	@q: the control queue
1923  *	@m: the packet
1924  *
1925  *	Send a packet through an SGE control Tx queue.  Packets sent through
1926  *	a control queue must fit entirely as immediate data in a single Tx
1927  *	descriptor and have no page fragments.
1928  */
1929 static int
1930 ctrl_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m)
1931 {
1932 	int ret;
1933 	struct work_request_hdr *wrp = mtod(m, struct work_request_hdr *);
1934 	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1935 
1936 	if (__predict_false(!immediate(m))) {
1937 		m_freem(m);
1938 		return 0;
1939 	}
1940 
1941 	wrp->wrh_hi |= htonl(F_WR_SOP | F_WR_EOP);
1942 	wrp->wrh_lo = htonl(V_WR_TID(q->token));
1943 
1944 	TXQ_LOCK(qs);
1945 again:	reclaim_completed_tx_imm(q);
1946 
1947 	ret = check_desc_avail(adap, q, m, 1, TXQ_CTRL);
1948 	if (__predict_false(ret)) {
1949 		if (ret == 1) {
1950 			TXQ_UNLOCK(qs);
1951 			return (ENOSPC);
1952 		}
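		/*
		 * check_desc_avail() returned 2: descriptors freed up while
		 * we were checking, so reclaim again and retry.
		 */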
1953 		goto again;
1954 	}
1955 	write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
1956 
1957 	q->in_use++;
1958 	if (++q->pidx >= q->size) {
1959 		q->pidx = 0;
1960 		q->gen ^= 1;
1961 	}
1962 	TXQ_UNLOCK(qs);
1963 	wmb();
1964 	t3_write_reg(adap, A_SG_KDOORBELL,
1965 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1966 	return (0);
1967 }
1968 
1969 
1970 /**
1971  *	restart_ctrlq - restart a suspended control queue
1972  *	@qs: the queue set containing the control queue
1973  *
1974  *	Resumes transmission on a suspended Tx control queue.
1975  */
1976 static void
1977 restart_ctrlq(void *data, int npending)
1978 {
1979 	struct mbuf *m;
1980 	struct sge_qset *qs = (struct sge_qset *)data;
1981 	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1982 	adapter_t *adap = qs->port->adapter;
1983 
1984 	TXQ_LOCK(qs);
1985 again:	reclaim_completed_tx_imm(q);
1986 
1987 	while (q->in_use < q->size &&
1988 	       (m = mbufq_dequeue(&q->sendq)) != NULL) {
1989 
1990 		write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
1991 
1992 		if (++q->pidx >= q->size) {
1993 			q->pidx = 0;
1994 			q->gen ^= 1;
1995 		}
1996 		q->in_use++;
1997 	}
1998 	if (!mbufq_empty(&q->sendq)) {
1999 		setbit(&qs->txq_stopped, TXQ_CTRL);
2000 
2001 		if (should_restart_tx(q) &&
2002 		    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped))
2003 			goto again;
2004 		q->stops++;
2005 	}
2006 	TXQ_UNLOCK(qs);
2007 	t3_write_reg(adap, A_SG_KDOORBELL,
2008 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
2009 }
2010 
2011 
2012 /*
2013  * Send a management message through control queue 0
2014  */
2015 int
2016 t3_mgmt_tx(struct adapter *adap, struct mbuf *m)
2017 {
2018 	return ctrl_xmit(adap, &adap->sge.qs[0], m);
2019 }
2020 
2021 /**
2022  *	free_qset - free the resources of an SGE queue set
2023  *	@sc: the controller owning the queue set
2024  *	@q: the queue set
2025  *
2026  *	Release the HW and SW resources associated with an SGE queue set, such
2027  *	as HW contexts, packet buffers, and descriptor rings.  Traffic to the
2028  *	queue set must be quiesced prior to calling this.
2029  */
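 *	Called with the queue set's Tx queue lock held; the lock is released
 *	and destroyed here.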
2030 static void
2031 t3_free_qset(adapter_t *sc, struct sge_qset *q)
2032 {
2033 	int i;
2034 
2035 	reclaim_completed_tx(q, 0, TXQ_ETH);
2036 	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
2037 		if (q->txq[i].txq_mr != NULL)
2038 			buf_ring_free(q->txq[i].txq_mr, M_DEVBUF);
2039 		if (q->txq[i].txq_ifq != NULL) {
2040 			ifq_delete(q->txq[i].txq_ifq);
2041 			free(q->txq[i].txq_ifq, M_DEVBUF);
2042 		}
2043 	}
2044 
2045 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
2046 		if (q->fl[i].desc) {
2047 			mtx_lock_spin(&sc->sge.reg_lock);
2048 			t3_sge_disable_fl(sc, q->fl[i].cntxt_id);
2049 			mtx_unlock_spin(&sc->sge.reg_lock);
2050 			bus_dmamap_unload(q->fl[i].desc_tag, q->fl[i].desc_map);
2051 			bus_dmamem_free(q->fl[i].desc_tag, q->fl[i].desc,
2052 					q->fl[i].desc_map);
2053 			bus_dma_tag_destroy(q->fl[i].desc_tag);
2054 			bus_dma_tag_destroy(q->fl[i].entry_tag);
2055 		}
2056 		if (q->fl[i].sdesc) {
2057 			free_rx_bufs(sc, &q->fl[i]);
2058 			free(q->fl[i].sdesc, M_DEVBUF);
2059 		}
2060 	}
2061 
2062 	mtx_unlock(&q->lock);
2063 	MTX_DESTROY(&q->lock);
2064 	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
2065 		if (q->txq[i].desc) {
2066 			mtx_lock_spin(&sc->sge.reg_lock);
2067 			t3_sge_enable_ecntxt(sc, q->txq[i].cntxt_id, 0);
2068 			mtx_unlock_spin(&sc->sge.reg_lock);
2069 			bus_dmamap_unload(q->txq[i].desc_tag,
2070 					q->txq[i].desc_map);
2071 			bus_dmamem_free(q->txq[i].desc_tag, q->txq[i].desc,
2072 					q->txq[i].desc_map);
2073 			bus_dma_tag_destroy(q->txq[i].desc_tag);
2074 			bus_dma_tag_destroy(q->txq[i].entry_tag);
2075 		}
2076 		if (q->txq[i].sdesc) {
2077 			free(q->txq[i].sdesc, M_DEVBUF);
2078 		}
2079 	}
2080 
2081 	if (q->rspq.desc) {
2082 		mtx_lock_spin(&sc->sge.reg_lock);
2083 		t3_sge_disable_rspcntxt(sc, q->rspq.cntxt_id);
2084 		mtx_unlock_spin(&sc->sge.reg_lock);
2085 
2086 		bus_dmamap_unload(q->rspq.desc_tag, q->rspq.desc_map);
2087 		bus_dmamem_free(q->rspq.desc_tag, q->rspq.desc,
2088 			        q->rspq.desc_map);
2089 		bus_dma_tag_destroy(q->rspq.desc_tag);
2090 		MTX_DESTROY(&q->rspq.lock);
2091 	}
2092 
2093 #ifdef LRO_SUPPORTED
2094 	tcp_lro_free(&q->lro.ctrl);
2095 #endif
2096 
2097 	bzero(q, sizeof(*q));
2098 }
2099 
2100 /**
2101  *	t3_free_sge_resources - free SGE resources
2102  *	@sc: the adapter softc
2103  *
2104  *	Frees resources used by the SGE queue sets.
2105  */
2106 void
2107 t3_free_sge_resources(adapter_t *sc)
2108 {
2109 	int i, nqsets;
2110 
2111 	for (nqsets = i = 0; i < (sc)->params.nports; i++)
2112 		nqsets += sc->port[i].nqsets;
2113 
2114 	for (i = 0; i < nqsets; ++i) {
2115 		TXQ_LOCK(&sc->sge.qs[i]);
2116 		t3_free_qset(sc, &sc->sge.qs[i]);
2117 	}
2118 
2119 }
2120 
2121 /**
2122  *	t3_sge_start - enable SGE
2123  *	@sc: the controller softc
2124  *
2125  *	Enables the SGE for DMAs.  This is the last step in starting packet
2126  *	transfers.
2127  */
2128 void
2129 t3_sge_start(adapter_t *sc)
2130 {
2131 	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE);
2132 }
2133 
2134 /**
2135  *	t3_sge_stop - disable SGE operation
2136  *	@sc: the adapter
2137  *
2138  *	Disables the DMA engine.  This can be called in emergencies (e.g.,
2139  *	from error interrupts) or from normal process context.  In the latter
2140  *	case it also disables any pending queue restart tasklets.  Note that
2141  *	if it is called in interrupt context it cannot disable the restart
2142  *	tasklets as it cannot wait; however, the tasklets will have no effect
2143  *	since the doorbells are disabled, and the driver will call this again
2144  *	later from process context, at which time the tasklets will be stopped
2145  *	if they are still running.
2146  */
2147 void
2148 t3_sge_stop(adapter_t *sc)
2149 {
2150 	int i, nqsets;
2151 
2152 	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, 0);
2153 
2154 	if (sc->tq == NULL)
2155 		return;
2156 
2157 	for (nqsets = i = 0; i < (sc)->params.nports; i++)
2158 		nqsets += sc->port[i].nqsets;
2159 #ifdef notyet
2160 	/*
2161 	 *
2162 	 * XXX
2163 	 */
2164 	for (i = 0; i < nqsets; ++i) {
2165 		struct sge_qset *qs = &sc->sge.qs[i];
2166 
2167 		taskqueue_drain(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2168 		taskqueue_drain(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2169 	}
2170 #endif
2171 }
2172 
2173 /**
2174  *	t3_free_tx_desc - reclaims Tx descriptors and their buffers
2175  *	@qs: the queue set owning the Tx queue
2176  *	@reclaimable: the number of descriptors to reclaim
2177  *	@queue: the index of the Tx queue within the queue set
2178  *
2179  *	Reclaims Tx descriptors from an SGE Tx queue and frees the associated
2180  *	Tx buffers.  Called with the Tx queue lock held.
2185  */
2186 void
2187 t3_free_tx_desc(struct sge_qset *qs, int reclaimable, int queue)
2188 {
2189 	struct tx_sw_desc *txsd;
2190 	unsigned int cidx, mask;
2191 	struct sge_txq *q = &qs->txq[queue];
2192 
2193 #ifdef T3_TRACE
2194 	T3_TRACE2(sc->tb[q->cntxt_id & 7],
2195 		  "reclaiming %u Tx descriptors at cidx %u", reclaimable, cidx);
2196 #endif
2197 	cidx = q->cidx;
2198 	mask = q->size - 1;
2199 	txsd = &q->sdesc[cidx];
2200 
2201 	mtx_assert(&qs->lock, MA_OWNED);
2202 	while (reclaimable--) {
2203 		prefetch(q->sdesc[(cidx + 1) & mask].m);
2204 		prefetch(q->sdesc[(cidx + 2) & mask].m);
2205 
2206 		if (txsd->m != NULL) {
2207 			if (txsd->flags & TX_SW_DESC_MAPPED) {
2208 				bus_dmamap_unload(q->entry_tag, txsd->map);
2209 				txsd->flags &= ~TX_SW_DESC_MAPPED;
2210 			}
2211 			m_freem_list(txsd->m);
2212 			txsd->m = NULL;
2213 		} else
2214 			q->txq_skipped++;
2215 
2216 		++txsd;
2217 		if (++cidx == q->size) {
2218 			cidx = 0;
2219 			txsd = q->sdesc;
2220 		}
2221 	}
2222 	q->cidx = cidx;
2223 
2224 }
2225 
2226 /**
2227  *	is_new_response - check if a response is newly written
2228  *	@r: the response descriptor
2229  *	@q: the response queue
2230  *
2231  *	Returns true if a response descriptor contains a yet unprocessed
2232  *	response.
2233  */
2234 static __inline int
2235 is_new_response(const struct rsp_desc *r,
2236     const struct sge_rspq *q)
2237 {
2238 	return (r->intr_gen & F_RSPD_GEN2) == q->gen;
2239 }
2240 
2241 #define RSPD_GTS_MASK  (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS)
2242 #define RSPD_CTRL_MASK (RSPD_GTS_MASK | \
2243 			V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \
2244 			V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \
2245 			V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR))
2246 
2247 /* How long to delay the next interrupt in case of memory shortage, in 0.1us. */
2248 #define NOMEM_INTR_DELAY 2500
2249 
2250 /**
2251  *	write_ofld_wr - write an offload work request
2252  *	@adap: the adapter
2253  *	@m: the packet to send
2254  *	@q: the Tx queue
2255  *	@pidx: index of the first Tx descriptor to write
2256  *	@gen: the generation value to use
2257  *	@ndesc: number of descriptors the packet will occupy
2258  *
2259  *	Write an offload work request to send the supplied packet.  The packet
2260  *	data already carry the work request with most fields populated.
2261  */
2262 static void
2263 write_ofld_wr(adapter_t *adap, struct mbuf *m,
2264     struct sge_txq *q, unsigned int pidx,
2265     unsigned int gen, unsigned int ndesc,
2266     bus_dma_segment_t *segs, unsigned int nsegs)
2267 {
2268 	unsigned int sgl_flits, flits;
2269 	struct work_request_hdr *from;
2270 	struct sg_ent *sgp, sgl[TX_MAX_SEGS / 2 + 1];
2271 	struct tx_desc *d = &q->desc[pidx];
2272 	struct txq_state txqs;
2273 
2274 	if (immediate(m) && nsegs == 0) {
2275 		write_imm(d, m, m->m_len, gen);
2276 		return;
2277 	}
2278 
2279 	/* Only TX_DATA builds SGLs */
2280 	from = mtod(m, struct work_request_hdr *);
2281 	memcpy(&d->flit[1], &from[1], m->m_len - sizeof(*from));
2282 
2283 	flits = m->m_len / 8;
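	/*
	 * If the work request fits in a single descriptor, build the SGL in
	 * place right after the header flits; otherwise stage it in the
	 * local sgl[] array and let write_wr_hdr_sgl() lay it out across
	 * descriptors.
	 */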
2284 	sgp = (ndesc == 1) ? (struct sg_ent *)&d->flit[flits] : sgl;
2285 
2286 	make_sgl(sgp, segs, nsegs);
2287 	sgl_flits = sgl_len(nsegs);
2288 
2289 	txqs.gen = gen;
2290 	txqs.pidx = pidx;
2291 	txqs.compl = 0;
2292 
2293 	write_wr_hdr_sgl(ndesc, d, &txqs, q, sgl, flits, sgl_flits,
2294 	    from->wrh_hi, from->wrh_lo);
2295 }
2296 
2297 /**
2298  *	calc_tx_descs_ofld - calculate # of Tx descriptors for an offload packet
2299  *	@m: the packet
2300  *
2301  * 	Returns the number of Tx descriptors needed for the given offload
2302  * 	packet.  These packets are already fully constructed.
2303  */
2304 static __inline unsigned int
2305 calc_tx_descs_ofld(struct mbuf *m, unsigned int nsegs)
2306 {
2307 	unsigned int flits, cnt = 0;
2308 	int ndescs;
2309 
2310 	if (m->m_len <= WR_LEN && nsegs == 0)
2311 		return (1);                 /* packet fits as immediate data */
2312 
2313 	/*
2314 	 * This needs to be revisited for TOE.
2315 	 */
2316 
2317 	cnt = nsegs;
2318 
2319 	/* headers */
2320 	flits = m->m_len / 8;
2321 
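	/*
	 * Total flits = the WR header already present in the mbuf (m_len
	 * bytes, 8 bytes per flit) plus the SGL entries covering 'cnt' DMA
	 * segments; flits_to_desc() rounds that up to whole Tx descriptors.
	 */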
2322 	ndescs = flits_to_desc(flits + sgl_len(cnt));
2323 
2324 	return (ndescs);
2325 }
2326 
2327 /**
2328  *	ofld_xmit - send a packet through an offload queue
2329  *	@adap: the adapter
2330  *	@q: the Tx offload queue
2331  *	@m: the packet
2332  *
2333  *	Send an offload packet through an SGE offload queue.
2334  */
2335 static int
2336 ofld_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m)
2337 {
2338 	int ret, nsegs;
2339 	unsigned int ndesc;
2340 	unsigned int pidx, gen;
2341 	struct sge_txq *q = &qs->txq[TXQ_OFLD];
2342 	bus_dma_segment_t segs[TX_MAX_SEGS], *vsegs;
2343 	struct tx_sw_desc *stx;
2344 
2345 	nsegs = m_get_sgllen(m);
2346 	vsegs = m_get_sgl(m);
2347 	ndesc = calc_tx_descs_ofld(m, nsegs);
2348 	busdma_map_sgl(vsegs, segs, nsegs);
2349 
2350 	stx = &q->sdesc[q->pidx];
2351 
2352 	TXQ_LOCK(qs);
2353 again:	reclaim_completed_tx(qs, 16, TXQ_OFLD);
2354 	ret = check_desc_avail(adap, q, m, ndesc, TXQ_OFLD);
2355 	if (__predict_false(ret)) {
2356 		if (ret == 1) {
2357 			printf("no ofld desc avail\n");
2358 
2359 			m_set_priority(m, ndesc);     /* save for restart */
2360 			TXQ_UNLOCK(qs);
2361 			return (EINTR);
2362 		}
2363 		goto again;
2364 	}
2365 
2366 	gen = q->gen;
2367 	q->in_use += ndesc;
2368 	pidx = q->pidx;
2369 	q->pidx += ndesc;
2370 	if (q->pidx >= q->size) {
2371 		q->pidx -= q->size;
2372 		q->gen ^= 1;
2373 	}
2374 #ifdef T3_TRACE
2375 	T3_TRACE5(adap->tb[q->cntxt_id & 7],
2376 		  "ofld_xmit: ndesc %u, pidx %u, len %u, main %u, frags %u",
2377 		  ndesc, pidx, skb->len, skb->len - skb->data_len,
2378 		  skb_shinfo(skb)->nr_frags);
2379 #endif
2380 	TXQ_UNLOCK(qs);
2381 
2382 	write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
2383 	check_ring_tx_db(adap, q);
2384 	return (0);
2385 }
2386 
2387 /**
2388  *	restart_offloadq - restart a suspended offload queue
2389  *	@qs: the queue set containing the offload queue
2390  *
2391  *	Resumes transmission on a suspended Tx offload queue.
2392  */
2393 static void
2394 restart_offloadq(void *data, int npending)
2395 {
2396 	struct mbuf *m;
2397 	struct sge_qset *qs = data;
2398 	struct sge_txq *q = &qs->txq[TXQ_OFLD];
2399 	adapter_t *adap = qs->port->adapter;
2400 	bus_dma_segment_t segs[TX_MAX_SEGS];
2401 	struct tx_sw_desc *stx = &q->sdesc[q->pidx];
2402 	int nsegs, cleaned;
2403 
2404 	TXQ_LOCK(qs);
2405 again:	cleaned = reclaim_completed_tx(qs, 16, TXQ_OFLD);
2406 
2407 	while ((m = mbufq_peek(&q->sendq)) != NULL) {
2408 		unsigned int gen, pidx;
2409 		unsigned int ndesc = m_get_priority(m);
2410 
2411 		if (__predict_false(q->size - q->in_use < ndesc)) {
2412 			setbit(&qs->txq_stopped, TXQ_OFLD);
2413 			if (should_restart_tx(q) &&
2414 			    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped))
2415 				goto again;
2416 			q->stops++;
2417 			break;
2418 		}
2419 
2420 		gen = q->gen;
2421 		q->in_use += ndesc;
2422 		pidx = q->pidx;
2423 		q->pidx += ndesc;
2424 		if (q->pidx >= q->size) {
2425 			q->pidx -= q->size;
2426 			q->gen ^= 1;
2427 		}
2428 
2429 		(void)mbufq_dequeue(&q->sendq);
2430 		busdma_map_mbufs(&m, q, stx, segs, &nsegs);
2431 		TXQ_UNLOCK(qs);
2432 		write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
2433 		TXQ_LOCK(qs);
2434 	}
2435 #if USE_GTS
2436 	set_bit(TXQ_RUNNING, &q->flags);
2437 	set_bit(TXQ_LAST_PKT_DB, &q->flags);
2438 #endif
2439 	TXQ_UNLOCK(qs);
2440 	wmb();
2441 	t3_write_reg(adap, A_SG_KDOORBELL,
2442 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
2443 }
2444 
2445 /**
2446  *	queue_set - return the queue set a packet should use
2447  *	@m: the packet
2448  *
2449  *	Maps a packet to the SGE queue set it should use.  The desired queue
2450  *	set is carried in bits 1-3 in the packet's priority.
2451  */
2452 static __inline int
2453 queue_set(const struct mbuf *m)
2454 {
2455 	return m_get_priority(m) >> 1;
2456 }
2457 
2458 /**
2459  *	is_ctrl_pkt - return whether an offload packet is a control packet
2460  *	@m: the packet
2461  *
2462  *	Determines whether an offload packet should use an OFLD or a CTRL
2463  *	Tx queue.  This is indicated by bit 0 in the packet's priority.
2464  */
2465 static __inline int
2466 is_ctrl_pkt(const struct mbuf *m)
2467 {
2468 	return m_get_priority(m) & 1;
2469 }
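/*
 * For example, a hypothetical caller targeting the control queue of queue
 * set 2 would encode the priority as (2 << 1) | 1 before handing the mbuf
 * to t3_offload_tx():
 *
 *	m_set_priority(m, (2 << 1) | 1);
 *	t3_offload_tx(tdev, m);
 */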
2470 
2471 /**
2472  *	t3_offload_tx - send an offload packet
2473  *	@tdev: the offload device to send to
2474  *	@m: the packet
2475  *
2476  *	Sends an offload packet.  We use the packet priority to select the
2477  *	appropriate Tx queue as follows: bit 0 indicates whether the packet
2478  *	should be sent as regular or control, bits 1-3 select the queue set.
2479  */
2480 int
2481 t3_offload_tx(struct t3cdev *tdev, struct mbuf *m)
2482 {
2483 	adapter_t *adap = tdev2adap(tdev);
2484 	struct sge_qset *qs = &adap->sge.qs[queue_set(m)];
2485 
2486 	if (__predict_false(is_ctrl_pkt(m)))
2487 		return ctrl_xmit(adap, qs, m);
2488 
2489 	return ofld_xmit(adap, qs, m);
2490 }
2491 
2492 /**
2493  *	deliver_partial_bundle - deliver a (partial) bundle of Rx offload pkts
2494  *	@tdev: the offload device that will be receiving the packets
2495  *	@q: the SGE response queue that assembled the bundle
2496  *	@m: the partial bundle
2497  *	@n: the number of packets in the bundle
2498  *
2499  *	Delivers a (partial) bundle of Rx offload packets to an offload device.
2500  */
2501 static __inline void
2502 deliver_partial_bundle(struct t3cdev *tdev,
2503 			struct sge_rspq *q,
2504 			struct mbuf *mbufs[], int n)
2505 {
2506 	if (n) {
2507 		q->offload_bundles++;
2508 		cxgb_ofld_recv(tdev, mbufs, n);
2509 	}
2510 }
2511 
2512 static __inline int
2513 rx_offload(struct t3cdev *tdev, struct sge_rspq *rq,
2514     struct mbuf *m, struct mbuf *rx_gather[],
2515     unsigned int gather_idx)
2516 {
2517 
2518 	rq->offload_pkts++;
2519 	m->m_pkthdr.header = mtod(m, void *);
2520 	rx_gather[gather_idx++] = m;
2521 	if (gather_idx == RX_BUNDLE_SIZE) {
2522 		cxgb_ofld_recv(tdev, rx_gather, RX_BUNDLE_SIZE);
2523 		gather_idx = 0;
2524 		rq->offload_bundles++;
2525 	}
2526 	return (gather_idx);
2527 }
2528 
2529 static void
2530 restart_tx(struct sge_qset *qs)
2531 {
2532 	struct adapter *sc = qs->port->adapter;
2533 
2534 
2535 	if (isset(&qs->txq_stopped, TXQ_OFLD) &&
2536 	    should_restart_tx(&qs->txq[TXQ_OFLD]) &&
2537 	    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) {
2538 		qs->txq[TXQ_OFLD].restarts++;
2539 		DPRINTF("restarting TXQ_OFLD\n");
2540 		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2541 	}
2542 	DPRINTF("stopped=0x%x restart=%d processed=%d cleaned=%d in_use=%d\n",
2543 	    qs->txq_stopped, should_restart_tx(&qs->txq[TXQ_CTRL]),
2544 	    qs->txq[TXQ_CTRL].processed, qs->txq[TXQ_CTRL].cleaned,
2545 	    qs->txq[TXQ_CTRL].in_use);
2546 
2547 	if (isset(&qs->txq_stopped, TXQ_CTRL) &&
2548 	    should_restart_tx(&qs->txq[TXQ_CTRL]) &&
2549 	    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) {
2550 		qs->txq[TXQ_CTRL].restarts++;
2551 		DPRINTF("restarting TXQ_CTRL\n");
2552 		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2553 	}
2554 }
2555 
2556 /**
2557  *	t3_sge_alloc_qset - initialize an SGE queue set
2558  *	@sc: the controller softc
2559  *	@id: the queue set id
2560  *	@nports: how many Ethernet ports will be using this queue set
2561  *	@irq_vec_idx: the IRQ vector index for response queue interrupts
2562  *	@p: configuration parameters for this queue set
2563  *	@ntxq: number of Tx queues for the queue set
2564  *	@pi: port info for queue set
2565  *
2566  *	Allocate resources and initialize an SGE queue set.  A queue set
2567  *	comprises a response queue, two Rx free-buffer queues, and up to 3
2568  *	Tx queues.  The Tx queues are assigned roles in the order Ethernet
2569  *	queue, offload queue, and control queue.
2570  */
2571 int
2572 t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx,
2573 		  const struct qset_params *p, int ntxq, struct port_info *pi)
2574 {
2575 	struct sge_qset *q = &sc->sge.qs[id];
2576 	int i, ret = 0;
2577 
2578 	MTX_INIT(&q->lock, q->namebuf, NULL, MTX_DEF);
2579 	q->port = pi;
2580 
2581 	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
2582 
2583 		if ((q->txq[i].txq_mr = buf_ring_alloc(cxgb_txq_buf_ring_size,
2584 			    M_DEVBUF, M_WAITOK, &q->lock)) == NULL) {
2585 			device_printf(sc->dev, "failed to allocate mbuf ring\n");
2586 			goto err;
2587 		}
2588 		if ((q->txq[i].txq_ifq =
2589 			malloc(sizeof(struct ifaltq), M_DEVBUF, M_NOWAIT|M_ZERO))
2590 		    == NULL) {
2591 			device_printf(sc->dev, "failed to allocate ifq\n");
2592 			goto err;
2593 		}
2594 		ifq_init(q->txq[i].txq_ifq, pi->ifp);
2595 		callout_init(&q->txq[i].txq_timer, 1);
2596 		callout_init(&q->txq[i].txq_watchdog, 1);
2597 		q->txq[i].txq_timer.c_cpu = id % mp_ncpus;
2598 		q->txq[i].txq_watchdog.c_cpu = id % mp_ncpus;
2599 	}
2600 	init_qset_cntxt(q, id);
2601 	q->idx = id;
2602 	if ((ret = alloc_ring(sc, p->fl_size, sizeof(struct rx_desc),
2603 		    sizeof(struct rx_sw_desc), &q->fl[0].phys_addr,
2604 		    &q->fl[0].desc, &q->fl[0].sdesc,
2605 		    &q->fl[0].desc_tag, &q->fl[0].desc_map,
2606 		    sc->rx_dmat, &q->fl[0].entry_tag)) != 0) {
2607 		printf("error %d from alloc ring fl0\n", ret);
2608 		goto err;
2609 	}
2610 
2611 	if ((ret = alloc_ring(sc, p->jumbo_size, sizeof(struct rx_desc),
2612 		    sizeof(struct rx_sw_desc), &q->fl[1].phys_addr,
2613 		    &q->fl[1].desc, &q->fl[1].sdesc,
2614 		    &q->fl[1].desc_tag, &q->fl[1].desc_map,
2615 		    sc->rx_jumbo_dmat, &q->fl[1].entry_tag)) != 0) {
2616 		printf("error %d from alloc ring fl1\n", ret);
2617 		goto err;
2618 	}
2619 
2620 	if ((ret = alloc_ring(sc, p->rspq_size, sizeof(struct rsp_desc), 0,
2621 		    &q->rspq.phys_addr, &q->rspq.desc, NULL,
2622 		    &q->rspq.desc_tag, &q->rspq.desc_map,
2623 		    NULL, NULL)) != 0) {
2624 		printf("error %d from alloc ring rspq\n", ret);
2625 		goto err;
2626 	}
2627 
2628 	for (i = 0; i < ntxq; ++i) {
2629 		size_t sz = i == TXQ_CTRL ? 0 : sizeof(struct tx_sw_desc);
2630 
2631 		if ((ret = alloc_ring(sc, p->txq_size[i],
2632 			    sizeof(struct tx_desc), sz,
2633 			    &q->txq[i].phys_addr, &q->txq[i].desc,
2634 			    &q->txq[i].sdesc, &q->txq[i].desc_tag,
2635 			    &q->txq[i].desc_map,
2636 			    sc->tx_dmat, &q->txq[i].entry_tag)) != 0) {
2637 			printf("error %d from alloc ring tx %i\n", ret, i);
2638 			goto err;
2639 		}
2640 		mbufq_init(&q->txq[i].sendq);
2641 		q->txq[i].gen = 1;
2642 		q->txq[i].size = p->txq_size[i];
2643 	}
2644 
2645 	TASK_INIT(&q->txq[TXQ_OFLD].qresume_task, 0, restart_offloadq, q);
2646 	TASK_INIT(&q->txq[TXQ_CTRL].qresume_task, 0, restart_ctrlq, q);
2647 	TASK_INIT(&q->txq[TXQ_ETH].qreclaim_task, 0, sge_txq_reclaim_handler, q);
2648 	TASK_INIT(&q->txq[TXQ_OFLD].qreclaim_task, 0, sge_txq_reclaim_handler, q);
2649 
2650 	q->fl[0].gen = q->fl[1].gen = 1;
2651 	q->fl[0].size = p->fl_size;
2652 	q->fl[1].size = p->jumbo_size;
2653 
2654 	q->rspq.gen = 1;
2655 	q->rspq.cidx = 0;
2656 	q->rspq.size = p->rspq_size;
2657 
2658 	q->txq[TXQ_ETH].stop_thres = nports *
2659 	    flits_to_desc(sgl_len(TX_MAX_SEGS + 1) + 3);
2660 
2661 	q->fl[0].buf_size = MCLBYTES;
2662 	q->fl[0].zone = zone_pack;
2663 	q->fl[0].type = EXT_PACKET;
2664 #if __FreeBSD_version > 800000
2665 	if (cxgb_use_16k_clusters) {
2666 		q->fl[1].buf_size = MJUM16BYTES;
2667 		q->fl[1].zone = zone_jumbo16;
2668 		q->fl[1].type = EXT_JUMBO16;
2669 	} else {
2670 		q->fl[1].buf_size = MJUM9BYTES;
2671 		q->fl[1].zone = zone_jumbo9;
2672 		q->fl[1].type = EXT_JUMBO9;
2673 	}
2674 #else
2675 	q->fl[1].buf_size = MJUMPAGESIZE;
2676 	q->fl[1].zone = zone_jumbop;
2677 	q->fl[1].type = EXT_JUMBOP;
2678 #endif
2679 
2680 #ifdef LRO_SUPPORTED
2681 	/* Allocate and setup the lro_ctrl structure */
2682 	q->lro.enabled = !!(pi->ifp->if_capenable & IFCAP_LRO);
2683 	ret = tcp_lro_init(&q->lro.ctrl);
2684 	if (ret) {
2685 		printf("error %d from tcp_lro_init\n", ret);
2686 		goto err;
2687 	}
2688 	q->lro.ctrl.ifp = pi->ifp;
2689 #endif
2690 
2691 	mtx_lock_spin(&sc->sge.reg_lock);
2692 	ret = -t3_sge_init_rspcntxt(sc, q->rspq.cntxt_id, irq_vec_idx,
2693 				   q->rspq.phys_addr, q->rspq.size,
2694 				   q->fl[0].buf_size, 1, 0);
2695 	if (ret) {
2696 		printf("error %d from t3_sge_init_rspcntxt\n", ret);
2697 		goto err_unlock;
2698 	}
2699 
2700 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
2701 		ret = -t3_sge_init_flcntxt(sc, q->fl[i].cntxt_id, 0,
2702 					  q->fl[i].phys_addr, q->fl[i].size,
2703 					  q->fl[i].buf_size, p->cong_thres, 1,
2704 					  0);
2705 		if (ret) {
2706 			printf("error %d from t3_sge_init_flcntxt for index i=%d\n", ret, i);
2707 			goto err_unlock;
2708 		}
2709 	}
2710 
2711 	ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_ETH].cntxt_id, USE_GTS,
2712 				 SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr,
2713 				 q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token,
2714 				 1, 0);
2715 	if (ret) {
2716 		printf("error %d from t3_sge_init_ecntxt\n", ret);
2717 		goto err_unlock;
2718 	}
2719 
2720 	if (ntxq > 1) {
2721 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_OFLD].cntxt_id,
2722 					 USE_GTS, SGE_CNTXT_OFLD, id,
2723 					 q->txq[TXQ_OFLD].phys_addr,
2724 					 q->txq[TXQ_OFLD].size, 0, 1, 0);
2725 		if (ret) {
2726 			printf("error %d from t3_sge_init_ecntxt\n", ret);
2727 			goto err_unlock;
2728 		}
2729 	}
2730 
2731 	if (ntxq > 2) {
2732 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_CTRL].cntxt_id, 0,
2733 					 SGE_CNTXT_CTRL, id,
2734 					 q->txq[TXQ_CTRL].phys_addr,
2735 					 q->txq[TXQ_CTRL].size,
2736 					 q->txq[TXQ_CTRL].token, 1, 0);
2737 		if (ret) {
2738 			printf("error %d from t3_sge_init_ecntxt\n", ret);
2739 			goto err_unlock;
2740 		}
2741 	}
2742 
2743 	snprintf(q->rspq.lockbuf, RSPQ_NAME_LEN, "t3 rspq lock %d:%d",
2744 	    device_get_unit(sc->dev), irq_vec_idx);
2745 	MTX_INIT(&q->rspq.lock, q->rspq.lockbuf, NULL, MTX_DEF);
2746 
2747 	mtx_unlock_spin(&sc->sge.reg_lock);
2748 	t3_update_qset_coalesce(q, p);
2749 	q->port = pi;
2750 
2751 	refill_fl(sc, &q->fl[0], q->fl[0].size);
2752 	refill_fl(sc, &q->fl[1], q->fl[1].size);
2753 	refill_rspq(sc, &q->rspq, q->rspq.size - 1);
2754 
2755 	t3_write_reg(sc, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) |
2756 		     V_NEWTIMER(q->rspq.holdoff_tmr));
2757 
2758 	return (0);
2759 
2760 err_unlock:
2761 	mtx_unlock_spin(&sc->sge.reg_lock);
2762 err:
2763 	TXQ_LOCK(q);
2764 	t3_free_qset(sc, q);
2765 
2766 	return (ret);
2767 }
2768 
2769 /*
2770  * Remove CPL_RX_PKT headers from the mbuf and reduce it to a regular mbuf with
2771  * ethernet data.  Hardware assistance with various checksums and any vlan tag
2772  * will also be taken into account here.
2773  */
2774 void
2775 t3_rx_eth(struct adapter *adap, struct sge_rspq *rq, struct mbuf *m, int ethpad)
2776 {
2777 	struct cpl_rx_pkt *cpl = (struct cpl_rx_pkt *)(mtod(m, uint8_t *) + ethpad);
2778 	struct port_info *pi = &adap->port[adap->rxpkt_map[cpl->iff]];
2779 	struct ifnet *ifp = pi->ifp;
2780 
2781 	DPRINTF("rx_eth m=%p m->m_data=%p p->iff=%d\n", m, mtod(m, uint8_t *), cpl->iff);
2782 
2783 	if ((ifp->if_capenable & IFCAP_RXCSUM) && !cpl->fragment &&
2784 	    cpl->csum_valid && cpl->csum == 0xffff) {
2785 		m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED|CSUM_IP_VALID);
2786 		rspq_to_qset(rq)->port_stats[SGE_PSTAT_RX_CSUM_GOOD]++;
2787 		m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED|CSUM_IP_VALID|CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
2788 		m->m_pkthdr.csum_data = 0xffff;
2789 	}
2790 	/*
2791 	 * XXX need to add VLAN support for 6.x
2792 	 */
2793 #ifdef VLAN_SUPPORTED
2794 	if (__predict_false(cpl->vlan_valid)) {
2795 		m->m_pkthdr.ether_vtag = ntohs(cpl->vlan);
2796 		m->m_flags |= M_VLANTAG;
2797 	}
2798 #endif
2799 
2800 	m->m_pkthdr.rcvif = ifp;
2801 	m->m_pkthdr.header = mtod(m, uint8_t *) + sizeof(*cpl) + ethpad;
2802 	/*
2803 	 * adjust after conversion to mbuf chain
2804 	 */
2805 	m->m_pkthdr.len -= (sizeof(*cpl) + ethpad);
2806 	m->m_len -= (sizeof(*cpl) + ethpad);
2807 	m->m_data += (sizeof(*cpl) + ethpad);
2808 }
2809 
2810 /**
2811  *	get_packet - return the next ingress packet buffer from a free list
2812  *	@adap: the adapter that received the packet
2813  *	@drop_thres: # of remaining buffers before we start dropping packets
2814  *	@qs: the qset that the SGE free list holding the packet belongs to
2815  *      @mh: the mbuf header, contains a pointer to the head and tail of the mbuf chain
2816  *      @r: response descriptor
2817  *
2818  *	Get the next packet from a free list and complete setup of the
2819  *	mbuf.  If the packet is small we make a copy and recycle the
2820  *	original buffer, otherwise we use the original buffer itself.  If a
2821  *	positive drop threshold is supplied packets are dropped and their
2822  *	buffers recycled if (a) the number of remaining buffers is under the
2823  *	threshold and the packet is too big to copy, or (b) the packet should
2824  *	be copied but there is no memory for the copy.
2825  */
2826 static int
2827 get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
2828     struct t3_mbuf_hdr *mh, struct rsp_desc *r)
2829 {
2830 
2831 	unsigned int len_cq =  ntohl(r->len_cq);
2832 	struct sge_fl *fl = (len_cq & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
2833 	int mask, cidx = fl->cidx;
2834 	struct rx_sw_desc *sd = &fl->sdesc[cidx];
2835 	uint32_t len = G_RSPD_LEN(len_cq);
2836 	uint32_t flags = M_EXT;
2837 	uint8_t sopeop = G_RSPD_SOP_EOP(ntohl(r->flags));
2838 	caddr_t cl;
2839 	struct mbuf *m;
2840 	int ret = 0;
2841 
2842 	mask = fl->size - 1;
2843 	prefetch(fl->sdesc[(cidx + 1) & mask].m);
2844 	prefetch(fl->sdesc[(cidx + 2) & mask].m);
2845 	prefetch(fl->sdesc[(cidx + 1) & mask].rxsd_cl);
2846 	prefetch(fl->sdesc[(cidx + 2) & mask].rxsd_cl);
2847 
2848 	fl->credits--;
2849 	bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD);
2850 
2851 	if (recycle_enable && len <= SGE_RX_COPY_THRES &&
2852 	    sopeop == RSPQ_SOP_EOP) {
2853 		if ((m = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL)
2854 			goto skip_recycle;
2855 		cl = mtod(m, void *);
2856 		memcpy(cl, sd->rxsd_cl, len);
2857 		recycle_rx_buf(adap, fl, fl->cidx);
2858 		m->m_pkthdr.len = m->m_len = len;
2859 		m->m_flags = 0;
2860 		mh->mh_head = mh->mh_tail = m;
2861 		ret = 1;
2862 		goto done;
2863 	} else {
2864 	skip_recycle:
2865 		bus_dmamap_unload(fl->entry_tag, sd->map);
2866 		cl = sd->rxsd_cl;
2867 		m = sd->m;
2868 
2869 		if ((sopeop == RSPQ_SOP_EOP) ||
2870 		    (sopeop == RSPQ_SOP))
2871 			flags |= M_PKTHDR;
2872 		m_init(m, fl->zone, fl->buf_size, M_NOWAIT, MT_DATA, flags);
2873 		if (fl->zone == zone_pack) {
2874 			/*
2875 			 * restore clobbered data pointer
2876 			 */
2877 			m->m_data = m->m_ext.ext_buf;
2878 		} else {
2879 			m_cljset(m, cl, fl->type);
2880 		}
2881 		m->m_len = len;
2882 	}
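	/*
	 * Stitch the buffer into the per-queue mbuf chain according to its
	 * SOP/EOP markers: SOP starts a new chain (SOP_EOP is a complete
	 * single-buffer packet), EOP completes the chain, and middle
	 * fragments are appended to the tail.
	 */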
2883 	switch(sopeop) {
2884 	case RSPQ_SOP_EOP:
2885 		ret = 1;
2886 		/* FALLTHROUGH */
2887 	case RSPQ_SOP:
2888 		mh->mh_head = mh->mh_tail = m;
2889 		m->m_pkthdr.len = len;
2890 		break;
2891 	case RSPQ_EOP:
2892 		ret = 1;
2893 		/* FALLTHROUGH */
2894 	case RSPQ_NSOP_NEOP:
2895 		if (mh->mh_tail == NULL) {
2896 			log(LOG_ERR, "discarding intermediate descriptor entry\n");
2897 			m_freem(m);
2898 			break;
2899 		}
2900 		mh->mh_tail->m_next = m;
2901 		mh->mh_tail = m;
2902 		mh->mh_head->m_pkthdr.len += len;
2903 		break;
2904 	}
2905 	if (cxgb_debug)
2906 		printf("len=%d pktlen=%d\n", m->m_len, m->m_pkthdr.len);
2907 done:
2908 	if (++fl->cidx == fl->size)
2909 		fl->cidx = 0;
2910 
2911 	return (ret);
2912 }
2913 
2914 /**
2915  *	handle_rsp_cntrl_info - handles control information in a response
2916  *	@qs: the queue set corresponding to the response
2917  *	@flags: the response control flags
2918  *
2919  *	Handles the control information of an SGE response, such as GTS
2920  *	indications and completion credits for the queue set's Tx queues.
2921  *	HW coalesces credits, we don't do any extra SW coalescing.
2922  */
2923 static __inline void
2924 handle_rsp_cntrl_info(struct sge_qset *qs, uint32_t flags)
2925 {
2926 	unsigned int credits;
2927 
2928 #if USE_GTS
2929 	if (flags & F_RSPD_TXQ0_GTS)
2930 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags);
2931 #endif
2932 	credits = G_RSPD_TXQ0_CR(flags);
2933 	if (credits)
2934 		qs->txq[TXQ_ETH].processed += credits;
2935 
2936 	credits = G_RSPD_TXQ2_CR(flags);
2937 	if (credits)
2938 		qs->txq[TXQ_CTRL].processed += credits;
2939 
2940 # if USE_GTS
2941 	if (flags & F_RSPD_TXQ1_GTS)
2942 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags);
2943 # endif
2944 	credits = G_RSPD_TXQ1_CR(flags);
2945 	if (credits)
2946 		qs->txq[TXQ_OFLD].processed += credits;
2947 
2948 }
2949 
2950 static void
2951 check_ring_db(adapter_t *adap, struct sge_qset *qs,
2952     unsigned int sleeping)
2953 {
2954 	;
2955 }
2956 
2957 /**
2958  *	process_responses - process responses from an SGE response queue
2959  *	@adap: the adapter
2960  *	@qs: the queue set to which the response queue belongs
2961  *	@budget: how many responses can be processed in this round
2962  *
2963  *	Process responses from an SGE response queue up to the supplied budget.
2964  *	Responses include received packets as well as credits and other events
2965  *	for the queues that belong to the response queue's queue set.
2966  *	A negative budget is effectively unlimited.
2967  *
2968  *	Additionally choose the interrupt holdoff time for the next interrupt
2969  *	on this queue.  If the system is under memory shortage use a fairly
2970  *	long delay to help recovery.
2971  */
2972 static int
2973 process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
2974 {
2975 	struct sge_rspq *rspq = &qs->rspq;
2976 	struct rsp_desc *r = &rspq->desc[rspq->cidx];
2977 	int budget_left = budget;
2978 	unsigned int sleeping = 0;
2979 #ifdef LRO_SUPPORTED
2980 	int lro_enabled = qs->lro.enabled;
2981 	int skip_lro;
2982 	struct lro_ctrl *lro_ctrl = &qs->lro.ctrl;
2983 #endif
2984 	struct mbuf *offload_mbufs[RX_BUNDLE_SIZE];
2985 	int ngathered = 0;
2986 #ifdef DEBUG
2987 	static int last_holdoff = 0;
2988 	if (cxgb_debug && rspq->holdoff_tmr != last_holdoff) {
2989 		printf("next_holdoff=%d\n", rspq->holdoff_tmr);
2990 		last_holdoff = rspq->holdoff_tmr;
2991 	}
2992 #endif
2993 	rspq->next_holdoff = rspq->holdoff_tmr;
2994 
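	/*
	 * Each iteration consumes one response descriptor.  is_new_response()
	 * compares the descriptor's generation bit against the queue's
	 * current generation to detect entries written by the hardware since
	 * our last pass around the ring.
	 */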
2995 	while (__predict_true(budget_left && is_new_response(r, rspq))) {
2996 		int eth, eop = 0, ethpad = 0;
2997 		uint32_t flags = ntohl(r->flags);
2998 		uint32_t rss_csum = *(const uint32_t *)r;
2999 		uint32_t rss_hash = be32toh(r->rss_hdr.rss_hash_val);
3000 
3001 		eth = (r->rss_hdr.opcode == CPL_RX_PKT);
3002 
3003 		if (__predict_false(flags & F_RSPD_ASYNC_NOTIF)) {
3004 			struct mbuf *m;
3005 
3006 			if (cxgb_debug)
3007 				printf("async notification\n");
3008 
3009 			if (rspq->rspq_mh.mh_head == NULL) {
3010 				rspq->rspq_mh.mh_head = m_gethdr(M_DONTWAIT, MT_DATA);
3011 				m = rspq->rspq_mh.mh_head;
3012 			} else {
3013 				m = m_gethdr(M_DONTWAIT, MT_DATA);
3014 			}
3015 			if (m == NULL)
3016 				goto no_mem;
3017 
3018 			memcpy(mtod(m, char *), r, AN_PKT_SIZE);
3019 			m->m_len = m->m_pkthdr.len = AN_PKT_SIZE;
3020 			*mtod(m, char *) = CPL_ASYNC_NOTIF;
3021 			rss_csum = htonl(CPL_ASYNC_NOTIF << 24);
3022 			eop = 1;
3023 			rspq->async_notif++;
3024 			goto skip;
3025 		} else if (flags & F_RSPD_IMM_DATA_VALID) {
3026 			struct mbuf *m = NULL;
3027 
3028 			DPRINTF("IMM DATA VALID opcode=0x%x rspq->cidx=%d\n",
3029 			    r->rss_hdr.opcode, rspq->cidx);
3030 			if (rspq->rspq_mh.mh_head == NULL)
3031 				rspq->rspq_mh.mh_head = m_gethdr(M_DONTWAIT, MT_DATA);
3032 			else
3033 				m = m_gethdr(M_DONTWAIT, MT_DATA);
3034 
3035 			if (rspq->rspq_mh.mh_head == NULL &&  m == NULL) {
3036 		no_mem:
3037 				rspq->next_holdoff = NOMEM_INTR_DELAY;
3038 				budget_left--;
3039 				break;
3040 			}
3041 			get_imm_packet(adap, r, rspq->rspq_mh.mh_head);
3042 			eop = 1;
3043 			rspq->imm_data++;
3044 		} else if (r->len_cq) {
3045 			int drop_thresh = eth ? SGE_RX_DROP_THRES : 0;
3046 
3047 			eop = get_packet(adap, drop_thresh, qs, &rspq->rspq_mh, r);
3048 			if (eop) {
3049 				rspq->rspq_mh.mh_head->m_flags |= M_FLOWID;
3050 				rspq->rspq_mh.mh_head->m_pkthdr.flowid = rss_hash;
3051 			}
3052 
3053 			ethpad = 2;
3054 		} else {
3055 			rspq->pure_rsps++;
3056 		}
3057 	skip:
3058 		if (flags & RSPD_CTRL_MASK) {
3059 			sleeping |= flags & RSPD_GTS_MASK;
3060 			handle_rsp_cntrl_info(qs, flags);
3061 		}
3062 
3063 		r++;
3064 		if (__predict_false(++rspq->cidx == rspq->size)) {
3065 			rspq->cidx = 0;
3066 			rspq->gen ^= 1;
3067 			r = rspq->desc;
3068 		}
3069 
3070 		if (++rspq->credits >= (rspq->size / 4)) {
3071 			refill_rspq(adap, rspq, rspq->credits);
3072 			rspq->credits = 0;
3073 		}
3074 		if (!eth && eop) {
3075 			rspq->rspq_mh.mh_head->m_pkthdr.csum_data = rss_csum;
3076 			/*
3077 			 * XXX size mismatch
3078 			 */
3079 			m_set_priority(rspq->rspq_mh.mh_head, rss_hash);
3080 
3081 
3082 			ngathered = rx_offload(&adap->tdev, rspq,
3083 			    rspq->rspq_mh.mh_head, offload_mbufs, ngathered);
3084 			rspq->rspq_mh.mh_head = NULL;
3085 			DPRINTF("received offload packet\n");
3086 
3087 		} else if (eth && eop) {
3088 			struct mbuf *m = rspq->rspq_mh.mh_head;
3089 
3090 			t3_rx_eth(adap, rspq, m, ethpad);
3091 
3092 #ifdef LRO_SUPPORTED
3093 			/*
3094 			 * The T304 sends incoming packets on any qset.  If LRO
3095 			 * is also enabled, we could end up sending the packet up
3096 			 * lro_ctrl->ifp's input routine.  That would be incorrect.
3097 			 *
3098 			 * The mbuf's rcvif was derived from the cpl header and
3099 			 * is accurate.  Skip LRO and just use that.
3100 			 */
3101 			skip_lro = __predict_false(qs->port->ifp != m->m_pkthdr.rcvif);
3102 
3103 			if (lro_enabled && lro_ctrl->lro_cnt && !skip_lro &&
3104 			    (tcp_lro_rx(lro_ctrl, m, 0) == 0)) {
3105 				/* successfully queued for LRO */
3106 			} else
3107 #endif
3108 			{
3109 				/*
3110 				 * LRO not enabled, packet unsuitable for LRO,
3111 				 * or unable to queue.  Pass it up right now in
3112 				 * either case.
3113 				 */
3114 				struct ifnet *ifp = m->m_pkthdr.rcvif;
3115 				(*ifp->if_input)(ifp, m);
3116 			}
3117 			rspq->rspq_mh.mh_head = NULL;
3118 
3119 		}
3120 		__refill_fl_lt(adap, &qs->fl[0], 32);
3121 		__refill_fl_lt(adap, &qs->fl[1], 32);
3122 		--budget_left;
3123 	}
3124 
3125 	deliver_partial_bundle(&adap->tdev, rspq, offload_mbufs, ngathered);
3126 
3127 #ifdef LRO_SUPPORTED
3128 	/* Flush LRO */
3129 	while (!SLIST_EMPTY(&lro_ctrl->lro_active)) {
3130 		struct lro_entry *queued = SLIST_FIRST(&lro_ctrl->lro_active);
3131 		SLIST_REMOVE_HEAD(&lro_ctrl->lro_active, next);
3132 		tcp_lro_flush(lro_ctrl, queued);
3133 	}
3134 #endif
3135 
3136 	if (sleeping)
3137 		check_ring_db(adap, qs, sleeping);
3138 
3139 	mb();  /* commit Tx queue processed updates */
3140 	if (__predict_false(qs->txq_stopped > 1))
3141 		restart_tx(qs);
3142 
3143 	__refill_fl_lt(adap, &qs->fl[0], 512);
3144 	__refill_fl_lt(adap, &qs->fl[1], 512);
3145 	budget -= budget_left;
3146 	return (budget);
3147 }
3148 
3149 /*
3150  * A helper function that processes responses and issues GTS.
3151  */
3152 static __inline int
3153 process_responses_gts(adapter_t *adap, struct sge_rspq *rq)
3154 {
3155 	int work;
3156 	static int last_holdoff = 0;
3157 
3158 	work = process_responses(adap, rspq_to_qset(rq), -1);
3159 
3160 	if (cxgb_debug && (rq->next_holdoff != last_holdoff)) {
3161 		printf("next_holdoff=%d\n", rq->next_holdoff);
3162 		last_holdoff = rq->next_holdoff;
3163 	}
3164 	t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) |
3165 	    V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx));
3166 
3167 	return (work);
3168 }
3169 
3170 
3171 /*
3172  * Interrupt handler for legacy INTx interrupts for T3B-based cards.
3173  * Handles data events from SGE response queues as well as error and other
3174  * async events as they all use the same interrupt pin.  We use one SGE
3175  * response queue per port in this mode and protect all response queues with
3176  * queue 0's lock.
3177  */
3178 void
3179 t3b_intr(void *data)
3180 {
3181 	uint32_t i, map;
3182 	adapter_t *adap = data;
3183 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
3184 
3185 	t3_write_reg(adap, A_PL_CLI, 0);
3186 	map = t3_read_reg(adap, A_SG_DATA_INTR);
3187 
3188 	if (!map)
3189 		return;
3190 
3191 	if (__predict_false(map & F_ERRINTR))
3192 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
3193 
3194 	mtx_lock(&q0->lock);
3195 	for_each_port(adap, i)
3196 	    if (map & (1 << i))
3197 			process_responses_gts(adap, &adap->sge.qs[i].rspq);
3198 	mtx_unlock(&q0->lock);
3199 }
3200 
3201 /*
3202  * The MSI interrupt handler.  This needs to handle data events from SGE
3203  * response queues as well as error and other async events as they all use
3204  * the same MSI vector.  We use one SGE response queue per port in this mode
3205  * and protect all response queues with queue 0's lock.
3206  */
3207 void
3208 t3_intr_msi(void *data)
3209 {
3210 	adapter_t *adap = data;
3211 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
3212 	int i, new_packets = 0;
3213 
3214 	mtx_lock(&q0->lock);
3215 
3216 	for_each_port(adap, i)
3217 	    if (process_responses_gts(adap, &adap->sge.qs[i].rspq))
3218 		    new_packets = 1;
3219 	mtx_unlock(&q0->lock);
3220 	if (new_packets == 0)
3221 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
3222 }
3223 
3224 void
3225 t3_intr_msix(void *data)
3226 {
3227 	struct sge_qset *qs = data;
3228 	adapter_t *adap = qs->port->adapter;
3229 	struct sge_rspq *rspq = &qs->rspq;
3230 
3231 	if (process_responses_gts(adap, rspq) == 0)
3232 		rspq->unhandled_irqs++;
3233 }
3234 
3235 #define QDUMP_SBUF_SIZE		(32 * 400)
3236 static int
3237 t3_dump_rspq(SYSCTL_HANDLER_ARGS)
3238 {
3239 	struct sge_rspq *rspq;
3240 	struct sge_qset *qs;
3241 	int i, err, dump_end, idx;
3242 	static int multiplier = 1;
3243 	struct sbuf *sb;
3244 	struct rsp_desc *rspd;
3245 	uint32_t data[4];
3246 
3247 	rspq = arg1;
3248 	qs = rspq_to_qset(rspq);
3249 	if (rspq->rspq_dump_count == 0)
3250 		return (0);
3251 	if (rspq->rspq_dump_count > RSPQ_Q_SIZE) {
3252 		log(LOG_WARNING,
3253 		    "dump count is too large %d\n", rspq->rspq_dump_count);
3254 		rspq->rspq_dump_count = 0;
3255 		return (EINVAL);
3256 	}
3257 	if (rspq->rspq_dump_start > (RSPQ_Q_SIZE-1)) {
3258 		log(LOG_WARNING,
3259 		    "dump start of %d is greater than queue size\n",
3260 		    rspq->rspq_dump_start);
3261 		rspq->rspq_dump_start = 0;
3262 		return (EINVAL);
3263 	}
3264 	err = t3_sge_read_rspq(qs->port->adapter, rspq->cntxt_id, data);
3265 	if (err)
3266 		return (err);
3267 retry_sbufops:
3268 	sb = sbuf_new(NULL, NULL, QDUMP_SBUF_SIZE*multiplier, SBUF_FIXEDLEN);
3269 
3270 	sbuf_printf(sb, " \n index=%u size=%u MSI-X/RspQ=%u intr enable=%u intr armed=%u\n",
3271 	    (data[0] & 0xffff), data[0] >> 16, ((data[2] >> 20) & 0x3f),
3272 	    ((data[2] >> 26) & 1), ((data[2] >> 27) & 1));
3273 	sbuf_printf(sb, " generation=%u CQ mode=%u FL threshold=%u\n",
3274 	    ((data[2] >> 28) & 1), ((data[2] >> 31) & 1), data[3]);
3275 
3276 	sbuf_printf(sb, " start=%d -> end=%d\n", rspq->rspq_dump_start,
3277 	    (rspq->rspq_dump_start + rspq->rspq_dump_count) & (RSPQ_Q_SIZE-1));
3278 
3279 	dump_end = rspq->rspq_dump_start + rspq->rspq_dump_count;
3280 	for (i = rspq->rspq_dump_start; i < dump_end; i++) {
3281 		idx = i & (RSPQ_Q_SIZE-1);
3282 
3283 		rspd = &rspq->desc[idx];
3284 		sbuf_printf(sb, "\tidx=%04d opcode=%02x cpu_idx=%x hash_type=%x cq_idx=%x\n",
3285 		    idx, rspd->rss_hdr.opcode, rspd->rss_hdr.cpu_idx,
3286 		    rspd->rss_hdr.hash_type, be16toh(rspd->rss_hdr.cq_idx));
3287 		sbuf_printf(sb, "\trss_hash_val=%x flags=%08x len_cq=%x intr_gen=%x\n",
3288 		    rspd->rss_hdr.rss_hash_val, be32toh(rspd->flags),
3289 		    be32toh(rspd->len_cq), rspd->intr_gen);
3290 	}
3291 	if (sbuf_overflowed(sb)) {
3292 		sbuf_delete(sb);
3293 		multiplier++;
3294 		goto retry_sbufops;
3295 	}
3296 	sbuf_finish(sb);
3297 	err = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
3298 	sbuf_delete(sb);
3299 	return (err);
3300 }
3301 
3302 static int
3303 t3_dump_txq_eth(SYSCTL_HANDLER_ARGS)
3304 {
3305 	struct sge_txq *txq;
3306 	struct sge_qset *qs;
3307 	int i, j, err, dump_end;
3308 	static int multiplier = 1;
3309 	struct sbuf *sb;
3310 	struct tx_desc *txd;
3311 	uint32_t *WR, wr_hi, wr_lo, gen;
3312 	uint32_t data[4];
3313 
3314 	txq = arg1;
3315 	qs = txq_to_qset(txq, TXQ_ETH);
3316 	if (txq->txq_dump_count == 0) {
3317 		return (0);
3318 	}
3319 	if (txq->txq_dump_count > TX_ETH_Q_SIZE) {
3320 		log(LOG_WARNING,
3321 		    "dump count is too large %d\n", txq->txq_dump_count);
3322 		txq->txq_dump_count = 1;
3323 		return (EINVAL);
3324 	}
3325 	if (txq->txq_dump_start > (TX_ETH_Q_SIZE-1)) {
3326 		log(LOG_WARNING,
3327 		    "dump start of %d is greater than queue size\n",
3328 		    txq->txq_dump_start);
3329 		txq->txq_dump_start = 0;
3330 		return (EINVAL);
3331 	}
3332 	err = t3_sge_read_ecntxt(qs->port->adapter, qs->rspq.cntxt_id, data);
3333 	if (err)
3334 		return (err);
3335 
3336 
3337 retry_sbufops:
3338 	sb = sbuf_new(NULL, NULL, QDUMP_SBUF_SIZE*multiplier, SBUF_FIXEDLEN);
3339 
3340 	sbuf_printf(sb, " \n credits=%u GTS=%u index=%u size=%u rspq#=%u cmdq#=%u\n",
3341 	    (data[0] & 0x7fff), ((data[0] >> 15) & 1), (data[0] >> 16),
3342 	    (data[1] & 0xffff), ((data[3] >> 4) & 7), ((data[3] >> 7) & 1));
3343 	sbuf_printf(sb, " TUN=%u TOE=%u generation=%u uP token=%u valid=%u\n",
3344 	    ((data[3] >> 8) & 1), ((data[3] >> 9) & 1), ((data[3] >> 10) & 1),
3345 	    ((data[3] >> 11) & 0xfffff), ((data[3] >> 31) & 1));
3346 	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
3347 	    txq->txq_dump_start,
3348 	    (txq->txq_dump_start + txq->txq_dump_count) & (TX_ETH_Q_SIZE-1));
3349 
3350 	dump_end = txq->txq_dump_start + txq->txq_dump_count;
3351 	for (i = txq->txq_dump_start; i < dump_end; i++) {
3352 		txd = &txq->desc[i & (TX_ETH_Q_SIZE-1)];
3353 		WR = (uint32_t *)txd->flit;
3354 		wr_hi = ntohl(WR[0]);
3355 		wr_lo = ntohl(WR[1]);
3356 		gen = G_WR_GEN(wr_lo);
3357 
3358 		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
3359 		    wr_hi, wr_lo, gen);
3360 		for (j = 2; j < 30; j += 4)
3361 			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
3362 			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
3363 
3364 	}
3365 	if (sbuf_overflowed(sb)) {
3366 		sbuf_delete(sb);
3367 		multiplier++;
3368 		goto retry_sbufops;
3369 	}
3370 	sbuf_finish(sb);
3371 	err = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
3372 	sbuf_delete(sb);
3373 	return (err);
3374 }
3375 
3376 static int
3377 t3_dump_txq_ctrl(SYSCTL_HANDLER_ARGS)
3378 {
3379 	struct sge_txq *txq;
3380 	struct sge_qset *qs;
3381 	int i, j, err, dump_end;
3382 	static int multiplier = 1;
3383 	struct sbuf *sb;
3384 	struct tx_desc *txd;
3385 	uint32_t *WR, wr_hi, wr_lo, gen;
3386 
3387 	txq = arg1;
3388 	qs = txq_to_qset(txq, TXQ_CTRL);
3389 	if (txq->txq_dump_count == 0) {
3390 		return (0);
3391 	}
3392 	if (txq->txq_dump_count > 256) {
3393 		log(LOG_WARNING,
3394 		    "dump count is too large %d\n", txq->txq_dump_count);
3395 		txq->txq_dump_count = 1;
3396 		return (EINVAL);
3397 	}
3398 	if (txq->txq_dump_start > 255) {
3399 		log(LOG_WARNING,
3400 		    "dump start of %d is greater than queue size\n",
3401 		    txq->txq_dump_start);
3402 		txq->txq_dump_start = 0;
3403 		return (EINVAL);
3404 	}
3405 
3406 retry_sbufops:
3407 	sb = sbuf_new(NULL, NULL, QDUMP_SBUF_SIZE*multiplier, SBUF_FIXEDLEN);
3408 	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
3409 	    txq->txq_dump_start,
3410 	    (txq->txq_dump_start + txq->txq_dump_count) & 255);
3411 
3412 	dump_end = txq->txq_dump_start + txq->txq_dump_count;
3413 	for (i = txq->txq_dump_start; i < dump_end; i++) {
3414 		txd = &txq->desc[i & (255)];
3415 		WR = (uint32_t *)txd->flit;
3416 		wr_hi = ntohl(WR[0]);
3417 		wr_lo = ntohl(WR[1]);
3418 		gen = G_WR_GEN(wr_lo);
3419 
3420 		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
3421 		    wr_hi, wr_lo, gen);
3422 		for (j = 2; j < 30; j += 4)
3423 			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
3424 			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
3425 
3426 	}
3427 	if (sbuf_overflowed(sb)) {
3428 		sbuf_delete(sb);
3429 		multiplier++;
3430 		goto retry_sbufops;
3431 	}
3432 	sbuf_finish(sb);
3433 	err = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
3434 	sbuf_delete(sb);
3435 	return (err);
3436 }
3437 
3438 static int
3439 t3_set_coalesce_usecs(SYSCTL_HANDLER_ARGS)
3440 {
3441 	adapter_t *sc = arg1;
3442 	struct qset_params *qsp = &sc->params.sge.qset[0];
3443 	int coalesce_usecs;
3444 	struct sge_qset *qs;
3445 	int i, j, err, nqsets = 0;
3446 	struct mtx *lock;
3447 
3448 	if ((sc->flags & FULL_INIT_DONE) == 0)
3449 		return (ENXIO);
3450 
3451 	coalesce_usecs = qsp->coalesce_usecs;
3452 	err = sysctl_handle_int(oidp, &coalesce_usecs, arg2, req);
3453 
3454 	if (err != 0) {
3455 		return (err);
3456 	}
3457 	if (coalesce_usecs == qsp->coalesce_usecs)
3458 		return (0);
3459 
3460 	for (i = 0; i < sc->params.nports; i++)
3461 		for (j = 0; j < sc->port[i].nqsets; j++)
3462 			nqsets++;
3463 
3464 	coalesce_usecs = max(1, coalesce_usecs);
3465 
3466 	for (i = 0; i < nqsets; i++) {
3467 		qs = &sc->sge.qs[i];
3468 		qsp = &sc->params.sge.qset[i];
3469 		qsp->coalesce_usecs = coalesce_usecs;
3470 
3471 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
3472 			    &sc->sge.qs[0].rspq.lock;
3473 
3474 		mtx_lock(lock);
3475 		t3_update_qset_coalesce(qs, qsp);
3476 		t3_write_reg(sc, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) |
3477 		    V_NEWTIMER(qs->rspq.holdoff_tmr));
3478 		mtx_unlock(lock);
3479 	}
3480 
3481 	return (0);
3482 }
3483 
3484 
3485 void
3486 t3_add_attach_sysctls(adapter_t *sc)
3487 {
3488 	struct sysctl_ctx_list *ctx;
3489 	struct sysctl_oid_list *children;
3490 
3491 	ctx = device_get_sysctl_ctx(sc->dev);
3492 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3493 
3494 	/* random information */
3495 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
3496 	    "firmware_version",
3497 	    CTLFLAG_RD, &sc->fw_version,
3498 	    0, "firmware version");
3499 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3500 	    "hw_revision",
3501 	    CTLFLAG_RD, &sc->params.rev,
3502 	    0, "chip model");
3503 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
3504 	    "port_types",
3505 	    CTLFLAG_RD, &sc->port_types,
3506 	    0, "type of ports");
3507 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3508 	    "enable_debug",
3509 	    CTLFLAG_RW, &cxgb_debug,
3510 	    0, "enable verbose debugging output");
3511 	SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "tunq_coalesce",
3512 	    CTLFLAG_RD, &sc->tunq_coalesce,
3513 	    "#tunneled packets freed");
3514 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3515 	    "txq_overrun",
3516 	    CTLFLAG_RD, &txq_fills,
3517 	    0, "#times txq overrun");
3518 }
3519 
3520 
3521 static const char *rspq_name = "rspq";
3522 static const char *txq_names[] =
3523 {
3524 	"txq_eth",
3525 	"txq_ofld",
3526 	"txq_ctrl"
3527 };
3528 
3529 static int
3530 sysctl_handle_macstat(SYSCTL_HANDLER_ARGS)
3531 {
3532 	struct port_info *p = arg1;
3533 	uint64_t *parg;
3534 
3535 	if (!p)
3536 		return (EINVAL);
3537 
3538 	parg = (uint64_t *) ((uint8_t *)&p->mac.stats + arg2);
3539 	PORT_LOCK(p);
3540 	t3_mac_update_stats(&p->mac);
3541 	PORT_UNLOCK(p);
3542 
3543 	return (sysctl_handle_quad(oidp, parg, 0, req));
3544 }
3545 
3546 void
3547 t3_add_configured_sysctls(adapter_t *sc)
3548 {
3549 	struct sysctl_ctx_list *ctx;
3550 	struct sysctl_oid_list *children;
3551 	int i, j;
3552 
3553 	ctx = device_get_sysctl_ctx(sc->dev);
3554 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3555 
3556 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
3557 	    "intr_coal",
3558 	    CTLTYPE_INT|CTLFLAG_RW, sc,
3559 	    0, t3_set_coalesce_usecs,
3560 	    "I", "interrupt coalescing timer (us)");
3561 
3562 	for (i = 0; i < sc->params.nports; i++) {
3563 		struct port_info *pi = &sc->port[i];
3564 		struct sysctl_oid *poid;
3565 		struct sysctl_oid_list *poidlist;
3566 		struct mac_stats *mstats = &pi->mac.stats;
3567 
3568 		snprintf(pi->namebuf, PORT_NAME_LEN, "port%d", i);
3569 		poid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO,
3570 		    pi->namebuf, CTLFLAG_RD, NULL, "port statistics");
3571 		poidlist = SYSCTL_CHILDREN(poid);
3572 		SYSCTL_ADD_INT(ctx, poidlist, OID_AUTO,
3573 		    "nqsets", CTLFLAG_RD, &pi->nqsets,
3574 		    0, "#queue sets");
3575 
3576 		for (j = 0; j < pi->nqsets; j++) {
3577 			struct sge_qset *qs = &sc->sge.qs[pi->first_qset + j];
3578 			struct sysctl_oid *qspoid, *rspqpoid, *txqpoid,
3579 					  *ctrlqpoid, *lropoid;
3580 			struct sysctl_oid_list *qspoidlist, *rspqpoidlist,
3581 					       *txqpoidlist, *ctrlqpoidlist,
3582 					       *lropoidlist;
3583 			struct sge_txq *txq = &qs->txq[TXQ_ETH];
3584 
3585 			snprintf(qs->namebuf, QS_NAME_LEN, "qs%d", j);
3586 
3587 			qspoid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO,
3588 			    qs->namebuf, CTLFLAG_RD, NULL, "qset statistics");
3589 			qspoidlist = SYSCTL_CHILDREN(qspoid);
3590 
3591 			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl0_empty",
3592 					CTLFLAG_RD, &qs->fl[0].empty, 0,
3593 					"freelist #0 empty");
3594 			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl1_empty",
3595 					CTLFLAG_RD, &qs->fl[1].empty, 0,
3596 					"freelist #1 empty");
3597 
3598 			rspqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3599 			    rspq_name, CTLFLAG_RD, NULL, "rspq statistics");
3600 			rspqpoidlist = SYSCTL_CHILDREN(rspqpoid);
3601 
3602 			txqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3603 			    txq_names[0], CTLFLAG_RD, NULL, "txq statistics");
3604 			txqpoidlist = SYSCTL_CHILDREN(txqpoid);
3605 
3606 			ctrlqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3607 			    txq_names[2], CTLFLAG_RD, NULL, "ctrlq statistics");
3608 			ctrlqpoidlist = SYSCTL_CHILDREN(ctrlqpoid);
3609 
3610 			lropoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3611 			    "lro_stats", CTLFLAG_RD, NULL, "LRO statistics");
3612 			lropoidlist = SYSCTL_CHILDREN(lropoid);
3613 
3614 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "size",
3615 			    CTLFLAG_RD, &qs->rspq.size,
3616 			    0, "#entries in response queue");
3617 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "cidx",
3618 			    CTLFLAG_RD, &qs->rspq.cidx,
3619 			    0, "consumer index");
3620 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "credits",
3621 			    CTLFLAG_RD, &qs->rspq.credits,
3622 			    0, "#credits");
3623 			SYSCTL_ADD_XLONG(ctx, rspqpoidlist, OID_AUTO, "phys_addr",
3624 			    CTLFLAG_RD, &qs->rspq.phys_addr,
3625 			    "physical address of the queue");
3626 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_start",
3627 			    CTLFLAG_RW, &qs->rspq.rspq_dump_start,
3628 			    0, "start rspq dump entry");
3629 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_count",
3630 			    CTLFLAG_RW, &qs->rspq.rspq_dump_count,
3631 			    0, "#rspq entries to dump");
3632 			SYSCTL_ADD_PROC(ctx, rspqpoidlist, OID_AUTO, "qdump",
3633 			    CTLTYPE_STRING | CTLFLAG_RD, &qs->rspq,
3634 			    0, t3_dump_rspq, "A", "dump of the response queue");
3635 
3636 
3637 			SYSCTL_ADD_INT(ctx, txqpoidlist, OID_AUTO, "dropped",
3638 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].txq_drops,
3639 			    0, "#tunneled packets dropped");
3640 			SYSCTL_ADD_INT(ctx, txqpoidlist, OID_AUTO, "sendqlen",
3641 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].sendq.qlen,
3642 			    0, "#tunneled packets waiting to be sent");
3643 #if 0
3644 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_pidx",
3645 			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_prod,
3646 			    0, "#tunneled packets queue producer index");
3647 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_cidx",
3648 			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_cons,
3649 			    0, "#tunneled packets queue consumer index");
3650 #endif
3651 			SYSCTL_ADD_INT(ctx, txqpoidlist, OID_AUTO, "processed",
3652 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].processed,
3653 			    0, "#tunneled packets processed by the card");
3654 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "cleaned",
3655 			    CTLFLAG_RD, &txq->cleaned,
3656 			    0, "#tunneled packets cleaned");
3657 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "in_use",
3658 			    CTLFLAG_RD, &txq->in_use,
3659 			    0, "#tunneled packet slots in use");
3660 			SYSCTL_ADD_ULONG(ctx, txqpoidlist, OID_AUTO, "frees",
3661 			    CTLFLAG_RD, &txq->txq_frees,
3662 			    "#tunneled packets freed");
3663 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "skipped",
3664 			    CTLFLAG_RD, &txq->txq_skipped,
3665 			    0, "#tunneled packet descriptors skipped");
3666 			SYSCTL_ADD_QUAD(ctx, txqpoidlist, OID_AUTO, "coalesced",
3667 			    CTLFLAG_RD, &txq->txq_coalesced,
3668 			    "#tunneled packets coalesced");
3669 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "enqueued",
3670 			    CTLFLAG_RD, &txq->txq_enqueued,
3671 			    0, "#tunneled packets enqueued to hardware");
3672 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "stopped_flags",
3673 			    CTLFLAG_RD, &qs->txq_stopped,
3674 			    0, "bitmask of stopped tx queues");
3675 			SYSCTL_ADD_XLONG(ctx, txqpoidlist, OID_AUTO, "phys_addr",
3676 			    CTLFLAG_RD, &txq->phys_addr,
3677 			    "physical address of the queue");
3678 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "qgen",
3679 			    CTLFLAG_RW, &qs->txq[TXQ_ETH].gen,
3680 			    0, "txq generation");
3681 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_cidx",
3682 			    CTLFLAG_RD, &txq->cidx,
3683 			    0, "hardware queue cidx");
3684 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_pidx",
3685 			    CTLFLAG_RD, &txq->pidx,
3686 			    0, "hardware queue pidx");
3687 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_start",
3688 			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_start,
3689 			    0, "txq start idx for dump");
3690 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_count",
3691 			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_count,
3692 			    0, "txq #entries to dump");
3693 			SYSCTL_ADD_PROC(ctx, txqpoidlist, OID_AUTO, "qdump",
3694 			    CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_ETH],
3695 			    0, t3_dump_txq_eth, "A", "dump of the transmit queue");
3696 
3697 			SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_start",
3698 			    CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_start,
3699 			    0, "ctrlq start idx for dump");
3700 			SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_count",
3701 			    CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_count,
3702 			    0, "ctrlq #entries to dump");
3703 			SYSCTL_ADD_PROC(ctx, ctrlqpoidlist, OID_AUTO, "qdump",
3704 			    CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_CTRL],
3705 			    0, t3_dump_txq_ctrl, "A", "dump of the control queue");
3706 
3707 #ifdef LRO_SUPPORTED
3708 			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_queued",
3709 			    CTLFLAG_RD, &qs->lro.ctrl.lro_queued, 0, "#packets queued for LRO");
3710 			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_flushed",
3711 			    CTLFLAG_RD, &qs->lro.ctrl.lro_flushed, 0, "#LRO chains flushed");
3712 			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_bad_csum",
3713 			    CTLFLAG_RD, &qs->lro.ctrl.lro_bad_csum, 0, "#packets with bad checksum seen by LRO");
3714 			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_cnt",
3715 			    CTLFLAG_RD, &qs->lro.ctrl.lro_cnt, 0, "#LRO entries");
3716 #endif
3717 		}
3718 
3719 		/* Now add a node for mac stats. */
3720 		poid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO, "mac_stats",
3721 		    CTLFLAG_RD, NULL, "MAC statistics");
3722 		poidlist = SYSCTL_CHILDREN(poid);
3723 
3724 		/*
3725 		 * We (ab)use the length argument (arg2) to pass on the offset
3726 		 * of the data that we are interested in.  This is only required
3727 		 * for the quad counters that are updated from the hardware (we
3728 		 * make sure that we return the latest value).
3729 		 * sysctl_handle_macstat first updates *all* the counters from
3730 		 * the hardware, and then returns the latest value of the
3731 		 * requested counter.  Best would be to update only the
3732 		 * requested counter from hardware, but t3_mac_update_stats()
3733 		 * hides all the register details and we don't want to dive into
3734 		 * all that here.
3735 		 */
3736 #define CXGB_SYSCTL_ADD_QUAD(a)	SYSCTL_ADD_OID(ctx, poidlist, OID_AUTO, #a, \
3737     (CTLTYPE_QUAD | CTLFLAG_RD), pi, offsetof(struct mac_stats, a), \
3738     sysctl_handle_macstat, "QU", 0)
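		/*
		 * For reference, CXGB_SYSCTL_ADD_QUAD(tx_octets) expands to roughly:
		 *
		 *	SYSCTL_ADD_OID(ctx, poidlist, OID_AUTO, "tx_octets",
		 *	    (CTLTYPE_QUAD | CTLFLAG_RD), pi,
		 *	    offsetof(struct mac_stats, tx_octets),
		 *	    sysctl_handle_macstat, "QU", 0);
		 *
		 * i.e. the byte offset of the field travels in arg2 so that
		 * sysctl_handle_macstat() can locate the counter within p->mac.stats.
		 */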
3739 		CXGB_SYSCTL_ADD_QUAD(tx_octets);
3740 		CXGB_SYSCTL_ADD_QUAD(tx_octets_bad);
3741 		CXGB_SYSCTL_ADD_QUAD(tx_frames);
3742 		CXGB_SYSCTL_ADD_QUAD(tx_mcast_frames);
3743 		CXGB_SYSCTL_ADD_QUAD(tx_bcast_frames);
3744 		CXGB_SYSCTL_ADD_QUAD(tx_pause);
3745 		CXGB_SYSCTL_ADD_QUAD(tx_deferred);
3746 		CXGB_SYSCTL_ADD_QUAD(tx_late_collisions);
3747 		CXGB_SYSCTL_ADD_QUAD(tx_total_collisions);
3748 		CXGB_SYSCTL_ADD_QUAD(tx_excess_collisions);
3749 		CXGB_SYSCTL_ADD_QUAD(tx_underrun);
3750 		CXGB_SYSCTL_ADD_QUAD(tx_len_errs);
3751 		CXGB_SYSCTL_ADD_QUAD(tx_mac_internal_errs);
3752 		CXGB_SYSCTL_ADD_QUAD(tx_excess_deferral);
3753 		CXGB_SYSCTL_ADD_QUAD(tx_fcs_errs);
3754 		CXGB_SYSCTL_ADD_QUAD(tx_frames_64);
3755 		CXGB_SYSCTL_ADD_QUAD(tx_frames_65_127);
3756 		CXGB_SYSCTL_ADD_QUAD(tx_frames_128_255);
3757 		CXGB_SYSCTL_ADD_QUAD(tx_frames_256_511);
3758 		CXGB_SYSCTL_ADD_QUAD(tx_frames_512_1023);
3759 		CXGB_SYSCTL_ADD_QUAD(tx_frames_1024_1518);
3760 		CXGB_SYSCTL_ADD_QUAD(tx_frames_1519_max);
3761 		CXGB_SYSCTL_ADD_QUAD(rx_octets);
3762 		CXGB_SYSCTL_ADD_QUAD(rx_octets_bad);
3763 		CXGB_SYSCTL_ADD_QUAD(rx_frames);
3764 		CXGB_SYSCTL_ADD_QUAD(rx_mcast_frames);
3765 		CXGB_SYSCTL_ADD_QUAD(rx_bcast_frames);
3766 		CXGB_SYSCTL_ADD_QUAD(rx_pause);
3767 		CXGB_SYSCTL_ADD_QUAD(rx_fcs_errs);
3768 		CXGB_SYSCTL_ADD_QUAD(rx_align_errs);
3769 		CXGB_SYSCTL_ADD_QUAD(rx_symbol_errs);
3770 		CXGB_SYSCTL_ADD_QUAD(rx_data_errs);
3771 		CXGB_SYSCTL_ADD_QUAD(rx_sequence_errs);
3772 		CXGB_SYSCTL_ADD_QUAD(rx_runt);
3773 		CXGB_SYSCTL_ADD_QUAD(rx_jabber);
3774 		CXGB_SYSCTL_ADD_QUAD(rx_short);
3775 		CXGB_SYSCTL_ADD_QUAD(rx_too_long);
3776 		CXGB_SYSCTL_ADD_QUAD(rx_mac_internal_errs);
3777 		CXGB_SYSCTL_ADD_QUAD(rx_cong_drops);
3778 		CXGB_SYSCTL_ADD_QUAD(rx_frames_64);
3779 		CXGB_SYSCTL_ADD_QUAD(rx_frames_65_127);
3780 		CXGB_SYSCTL_ADD_QUAD(rx_frames_128_255);
3781 		CXGB_SYSCTL_ADD_QUAD(rx_frames_256_511);
3782 		CXGB_SYSCTL_ADD_QUAD(rx_frames_512_1023);
3783 		CXGB_SYSCTL_ADD_QUAD(rx_frames_1024_1518);
3784 		CXGB_SYSCTL_ADD_QUAD(rx_frames_1519_max);
3785 #undef CXGB_SYSCTL_ADD_QUAD
3786 
3787 #define CXGB_SYSCTL_ADD_ULONG(a) SYSCTL_ADD_ULONG(ctx, poidlist, OID_AUTO, #a, \
3788     CTLFLAG_RD, &mstats->a, 0)
3789 		CXGB_SYSCTL_ADD_ULONG(tx_fifo_parity_err);
3790 		CXGB_SYSCTL_ADD_ULONG(rx_fifo_parity_err);
3791 		CXGB_SYSCTL_ADD_ULONG(tx_fifo_urun);
3792 		CXGB_SYSCTL_ADD_ULONG(rx_fifo_ovfl);
3793 		CXGB_SYSCTL_ADD_ULONG(serdes_signal_loss);
3794 		CXGB_SYSCTL_ADD_ULONG(xaui_pcs_ctc_err);
3795 		CXGB_SYSCTL_ADD_ULONG(xaui_pcs_align_change);
3796 		CXGB_SYSCTL_ADD_ULONG(num_toggled);
3797 		CXGB_SYSCTL_ADD_ULONG(num_resets);
3798 		CXGB_SYSCTL_ADD_ULONG(link_faults);
3799 #undef CXGB_SYSCTL_ADD_ULONG
3800 	}
3801 }
3802 
3803 /**
3804  *	t3_get_desc - dump an SGE descriptor for debugging purposes
3805  *	@qs: the queue set
3806  *	@qnum: identifies the specific queue (0..2: Tx, 3: response, 4..5: Rx)
3807  *	@idx: the descriptor index in the queue
3808  *	@data: where to dump the descriptor contents
3809  *
3810  *	Dumps the contents of a HW descriptor of an SGE queue.  Returns the
3811  *	size of the descriptor.
3812  */
3813 int
3814 t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx,
3815 		unsigned char *data)
3816 {
3817 	if (qnum >= 6)
3818 		return (EINVAL);
3819 
3820 	if (qnum < 3) {
3821 		if (!qs->txq[qnum].desc || idx >= qs->txq[qnum].size)
3822 			return (EINVAL);
3823 		memcpy(data, &qs->txq[qnum].desc[idx], sizeof(struct tx_desc));
3824 		return sizeof(struct tx_desc);
3825 	}
3826 
3827 	if (qnum == 3) {
3828 		if (!qs->rspq.desc || idx >= qs->rspq.size)
3829 			return (EINVAL);
3830 		memcpy(data, &qs->rspq.desc[idx], sizeof(struct rsp_desc));
3831 		return sizeof(struct rsp_desc);
3832 	}
3833 
3834 	qnum -= 4;
3835 	if (!qs->fl[qnum].desc || idx >= qs->fl[qnum].size)
3836 		return (EINVAL);
3837 	memcpy(data, &qs->fl[qnum].desc[idx], sizeof(struct rx_desc));
3838 	return sizeof(struct rx_desc);
3839 }
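
/*
 * Example (sketch): dumping the first ethernet TX descriptor of queue set 0,
 * assuming "sc" points at an attached adapter.  The caller supplies a buffer
 * large enough for the descriptor being requested; the return value is the
 * descriptor size on success or EINVAL for a bad queue number or index.
 *
 *	unsigned char buf[sizeof(struct tx_desc)];
 *	int len;
 *
 *	len = t3_get_desc(&sc->sge.qs[0], 0, 0, buf);
 */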
3840