xref: /freebsd/sys/dev/cxgb/cxgb_sge.c (revision 884a2a699669ec61e2366e3e358342dbc94be24a)
1 /**************************************************************************
2 
3 Copyright (c) 2007-2009, Chelsio Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Neither the name of the Chelsio Corporation nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15 
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27 
28 ***************************************************************************/
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include "opt_inet.h"
34 
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/kernel.h>
38 #include <sys/module.h>
39 #include <sys/bus.h>
40 #include <sys/conf.h>
41 #include <machine/bus.h>
42 #include <machine/resource.h>
43 #include <sys/bus_dma.h>
44 #include <sys/rman.h>
45 #include <sys/queue.h>
46 #include <sys/sysctl.h>
47 #include <sys/taskqueue.h>
48 
49 #include <sys/proc.h>
50 #include <sys/sbuf.h>
51 #include <sys/sched.h>
52 #include <sys/smp.h>
53 #include <sys/systm.h>
54 #include <sys/syslog.h>
55 #include <sys/socket.h>
56 
57 #include <net/bpf.h>
58 #include <net/ethernet.h>
59 #include <net/if.h>
60 #include <net/if_vlan_var.h>
61 
62 #include <netinet/in_systm.h>
63 #include <netinet/in.h>
64 #include <netinet/ip.h>
65 #include <netinet/tcp.h>
66 
67 #include <dev/pci/pcireg.h>
68 #include <dev/pci/pcivar.h>
69 
70 #include <vm/vm.h>
71 #include <vm/pmap.h>
72 
73 #include <cxgb_include.h>
74 #include <sys/mvec.h>
75 
/* NOTE(review): not referenced in the visible part of this file; purpose unconfirmed. */
int	txq_fills = 0;
/* NOTE(review): presumably gates multiqueue transmit; not referenced in the visible part of this file. */
int	multiq_tx_enable = 1;

extern struct sysctl_oid_list sysctl__hw_cxgb_children;
/*
 * Size of the per-queue mbuf ring.  Settable as the loader tunable
 * hw.cxgb.txq_mr_size and exported through sysctl with CTLFLAG_RDTUN.
 */
int cxgb_txq_buf_ring_size = TX_ETH_Q_SIZE;
TUNABLE_INT("hw.cxgb.txq_mr_size", &cxgb_txq_buf_ring_size);
SYSCTL_INT(_hw_cxgb, OID_AUTO, txq_mr_size, CTLFLAG_RDTUN, &cxgb_txq_buf_ring_size, 0,
    "size of per-queue mbuf ring");

/*
 * When non-zero, check_pkt_coalesce() returns 1 immediately, without
 * looking at the state of the transmit queue.
 */
static int cxgb_tx_coalesce_force = 0;
TUNABLE_INT("hw.cxgb.tx_coalesce_force", &cxgb_tx_coalesce_force);
SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_force, CTLFLAG_RW,
    &cxgb_tx_coalesce_force, 0,
    "coalesce small packets into a single work request regardless of ring state");
90 
/*
 * Bounds and defaults for the Tx coalescing and reclaim tunables, all
 * expressed as fractions of the Ethernet Tx queue size.  Every expansion is
 * fully parenthesized so the macros can be used inside larger expressions
 * without the shift binding to a neighbouring operator.
 */
#define	COALESCE_START_DEFAULT		(TX_ETH_Q_SIZE >> 1)
#define	COALESCE_START_MAX		(TX_ETH_Q_SIZE - (TX_ETH_Q_SIZE >> 3))
#define	COALESCE_STOP_DEFAULT		(TX_ETH_Q_SIZE >> 2)
#define	COALESCE_STOP_MIN		(TX_ETH_Q_SIZE >> 5)
#define	TX_RECLAIM_DEFAULT		(TX_ETH_Q_SIZE >> 5)
#define	TX_RECLAIM_MAX			(TX_ETH_Q_SIZE >> 2)
#define	TX_RECLAIM_MIN			(TX_ETH_Q_SIZE >> 6)
98 
99 
/*
 * check_pkt_coalesce() marks a queue as coalescing once the Ethernet Tx
 * queue's in_use count reaches this value.  That function caps it at
 * COALESCE_START_MAX.
 */
static int cxgb_tx_coalesce_enable_start = COALESCE_START_DEFAULT;
TUNABLE_INT("hw.cxgb.tx_coalesce_enable_start",
    &cxgb_tx_coalesce_enable_start);
SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_start, CTLFLAG_RW,
    &cxgb_tx_coalesce_enable_start, 0,
    "coalesce enable threshold");
/*
 * check_pkt_coalesce() clears the coalescing mark once in_use has dropped to
 * this value or below, the mbuf ring is empty and qs->coalescing is zero.
 */
static int cxgb_tx_coalesce_enable_stop = COALESCE_STOP_DEFAULT;
TUNABLE_INT("hw.cxgb.tx_coalesce_enable_stop", &cxgb_tx_coalesce_enable_stop);
SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_stop, CTLFLAG_RW,
    &cxgb_tx_coalesce_enable_stop, 0,
    "coalesce disable threshold");
/*
 * reclaim_completed_tx() resets this to TX_RECLAIM_DEFAULT whenever it is
 * found outside [TX_RECLAIM_MIN, TX_RECLAIM_MAX].
 * NOTE(review): no other use is visible in this part of the file.
 */
static int cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT;
TUNABLE_INT("hw.cxgb.tx_reclaim_threshold", &cxgb_tx_reclaim_threshold);
SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_reclaim_threshold, CTLFLAG_RW,
    &cxgb_tx_reclaim_threshold, 0,
    "tx cleaning minimum threshold");
116 
/*
 * Rx buffer recycling switch, currently off.
 * XXX don't re-enable this until TOE stops assuming
 * we have an m_ext
 */
static int recycle_enable = 0;

/* Defined elsewhere; t3_sge_prep() reads these to size the free lists. */
extern int cxgb_use_16k_clusters;
extern int nmbjumbop;
extern int nmbjumbo9;
extern int nmbjumbo16;

/* NOTE(review): not referenced in the visible part of this file. */
#define USE_GTS 0

/* NOTE(review): Rx sizing/threshold constants; their users are outside the visible part of this file. */
#define SGE_RX_SM_BUF_SIZE	1536
#define SGE_RX_DROP_THRES	16
#define SGE_RX_COPY_THRES	128

/*
 * Period of the Tx buffer reclaim timer (hz >> 1 ticks).  This timer does
 * not need to run frequently as Tx buffers are usually reclaimed by new Tx
 * packets.  Used as the callout interval for sge_timer_cb().
 */
#define TX_RECLAIM_PERIOD       (hz >> 1)
139 
/*
 * Values for sge_txq.flags.  Each enumerator is a distinct bit so that the
 * values can be combined in a single flags word.
 */
enum {
	TXQ_RUNNING	= 1 << 0,  /* fetch engine is running */
	TXQ_LAST_PKT_DB = 1 << 1,  /* last packet rang the doorbell */
};
147 
/* Tx descriptor: TX_DESC_FLITS 64-bit flits, packed with no padding. */
struct tx_desc {
	uint64_t	flit[TX_DESC_FLITS];
} __packed;
151 
/*
 * Free-list (Rx buffer) descriptor.  refill_fl() stores every field in
 * big-endian byte order.
 */
struct rx_desc {
	uint32_t	addr_lo;	/* low 32 bits of the buffer's bus address */
	uint32_t	len_gen;	/* written with V_FLD_GEN1(generation) */
	uint32_t	gen2;		/* written with V_FLD_GEN2(generation) */
	uint32_t	addr_hi;	/* high 32 bits of the buffer's bus address */
} __packed;
158 
struct rsp_desc {               /* response queue descriptor */
	struct rss_header	rss_hdr;
	uint32_t		flags;
	uint32_t		len_cq;
	/* Inline payload; get_imm_packet() copies IMMED_PKT_SIZE bytes from here. */
	uint8_t			imm_data[47];
	uint8_t			intr_gen;
} __packed;
166 
/* Bits kept in the flags field of rx_sw_desc / tx_sw_desc. */
#define RX_SW_DESC_MAP_CREATED	(1 << 0)	/* set by refill_fl() after bus_dmamap_create() succeeds */
#define TX_SW_DESC_MAP_CREATED	(1 << 1)
#define RX_SW_DESC_INUSE        (1 << 3)	/* set by refill_fl() once a buffer is attached */
#define TX_SW_DESC_MAPPED       (1 << 4)

/* Start-of-packet / end-of-packet combinations, extracted with G_RSPD_SOP_EOP(). */
#define RSPQ_NSOP_NEOP           G_RSPD_SOP_EOP(0)
#define RSPQ_EOP                 G_RSPD_SOP_EOP(F_RSPD_EOP)
#define RSPQ_SOP                 G_RSPD_SOP_EOP(F_RSPD_SOP)
#define RSPQ_SOP_EOP             G_RSPD_SOP_EOP(F_RSPD_SOP|F_RSPD_EOP)
176 
struct tx_sw_desc {                /* SW state per Tx descriptor */
	struct mbuf	*m;		/* mbuf associated with the descriptor */
	bus_dmamap_t	map;		/* DMA map for the descriptor's buffer */
	int		flags;		/* TX_SW_DESC_* bits */
};
182 
struct rx_sw_desc {                /* SW state per Rx descriptor */
	caddr_t		rxsd_cl;	/* cluster posted for this descriptor */
	struct mbuf	*m;		/* mbuf header allocated alongside the cluster */
	bus_dmamap_t	map;		/* DMA map, created lazily by refill_fl() */
	int		flags;		/* RX_SW_DESC_* bits */
};
189 
/*
 * Snapshot of Tx queue bookkeeping values.
 * NOTE(review): the users of this struct are outside the visible part of the
 * file; field meanings below are inferred from their names -- confirm.
 */
struct txq_state {
	unsigned int	compl;	/* presumably a completion-request indicator */
	unsigned int	gen;	/* presumably the generation bit */
	unsigned int	pidx;	/* presumably the producer index */
};
195 
/* Result slot filled in by refill_fl_cb() for a bus_dmamap_load() call. */
struct refill_fl_cb_arg {
	int               error;	/* error code passed to the callback */
	bus_dma_segment_t seg;		/* first segment of the mapping */
	int               nseg;		/* segment count passed to the callback */
};
201 
202 
/*
 * Maps a number of flits to the number of Tx descriptors that can hold them.
 * The formula is
 *
 * desc = 1 + (flits - 2) / (WR_FLITS - 1).
 *
 * HW allows up to 4 descriptors to be combined into a WR.
 *
 * The table is indexed directly by the flit count (see flits_to_desc());
 * entry 0 is 0.  Which of the two layouts is compiled in depends on
 * SGE_NUM_GENBITS; any other value is a build error.
 */
static uint8_t flit_desc_map[] = {
	0,
#if SGE_NUM_GENBITS == 1
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
#elif SGE_NUM_GENBITS == 2
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
#else
# error "SGE_NUM_GENBITS must be 1 or 2"
#endif
};
227 
/* Queue-set lock helpers; every one of them operates on (qs)->lock. */
#define	TXQ_LOCK_ASSERT(qs)	mtx_assert(&(qs)->lock, MA_OWNED)
#define	TXQ_TRYLOCK(qs)		mtx_trylock(&(qs)->lock)
#define	TXQ_LOCK(qs)		mtx_lock(&(qs)->lock)
#define	TXQ_UNLOCK(qs)		mtx_unlock(&(qs)->lock)
/*
 * Wrappers around the drbr_*() routines for the Ethernet Tx queue's mbuf
 * ring, (qs)->txq[TXQ_ETH].txq_mr, on the qset's port interface.
 */
#define	TXQ_RING_EMPTY(qs)	drbr_empty((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
#define	TXQ_RING_NEEDS_ENQUEUE(qs)					\
	drbr_needs_enqueue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
#define	TXQ_RING_FLUSH(qs)	drbr_flush((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
#define	TXQ_RING_DEQUEUE_COND(qs, func, arg)				\
	drbr_dequeue_cond((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr, func, arg)
#define	TXQ_RING_DEQUEUE(qs) \
	drbr_dequeue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)

/* NOTE(review): presumably a global debug switch; not referenced in the visible part of this file. */
int cxgb_debug = 0;

/* Forward declarations for handlers that are referenced before they are defined. */
static void sge_timer_cb(void *arg);
static void sge_timer_reclaim(void *arg, int ncount);
static void sge_txq_reclaim_handler(void *arg, int ncount);
static void cxgb_start_locked(struct sge_qset *qs);
247 
248 /*
249  * XXX need to cope with bursty scheduling by looking at a wider
250  * window than we are now for determining the need for coalescing
251  *
252  */
253 static __inline uint64_t
254 check_pkt_coalesce(struct sge_qset *qs)
255 {
256         struct adapter *sc;
257         struct sge_txq *txq;
258 	uint8_t *fill;
259 
260 	if (__predict_false(cxgb_tx_coalesce_force))
261 		return (1);
262 	txq = &qs->txq[TXQ_ETH];
263         sc = qs->port->adapter;
264 	fill = &sc->tunq_fill[qs->idx];
265 
266 	if (cxgb_tx_coalesce_enable_start > COALESCE_START_MAX)
267 		cxgb_tx_coalesce_enable_start = COALESCE_START_MAX;
268 	if (cxgb_tx_coalesce_enable_stop < COALESCE_STOP_MIN)
269 		cxgb_tx_coalesce_enable_start = COALESCE_STOP_MIN;
270 	/*
271 	 * if the hardware transmit queue is more than 1/8 full
272 	 * we mark it as coalescing - we drop back from coalescing
273 	 * when we go below 1/32 full and there are no packets enqueued,
274 	 * this provides us with some degree of hysteresis
275 	 */
276         if (*fill != 0 && (txq->in_use <= cxgb_tx_coalesce_enable_stop) &&
277 	    TXQ_RING_EMPTY(qs) && (qs->coalescing == 0))
278                 *fill = 0;
279         else if (*fill == 0 && (txq->in_use >= cxgb_tx_coalesce_enable_start))
280                 *fill = 1;
281 
282 	return (sc->tunq_coalesce);
283 }
284 
285 #ifdef __LP64__
286 static void
287 set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo)
288 {
289 	uint64_t wr_hilo;
290 #if _BYTE_ORDER == _LITTLE_ENDIAN
291 	wr_hilo = wr_hi;
292 	wr_hilo |= (((uint64_t)wr_lo)<<32);
293 #else
294 	wr_hilo = wr_lo;
295 	wr_hilo |= (((uint64_t)wr_hi)<<32);
296 #endif
297 	wrp->wrh_hilo = wr_hilo;
298 }
299 #else
300 static void
301 set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo)
302 {
303 
304 	wrp->wrh_hi = wr_hi;
305 	wmb();
306 	wrp->wrh_lo = wr_lo;
307 }
308 #endif
309 
/* Running totals that coalesce_check() maintains across one dequeue batch. */
struct coalesce_info {
	int count;	/* packets accepted so far */
	int nbytes;	/* sum of m_len over the accepted packets */
};
314 
315 static int
316 coalesce_check(struct mbuf *m, void *arg)
317 {
318 	struct coalesce_info *ci = arg;
319 	int *count = &ci->count;
320 	int *nbytes = &ci->nbytes;
321 
322 	if ((*nbytes == 0) || ((*nbytes + m->m_len <= 10500) &&
323 		(*count < 7) && (m->m_next == NULL))) {
324 		*count += 1;
325 		*nbytes += m->m_len;
326 		return (1);
327 	}
328 	return (0);
329 }
330 
331 static struct mbuf *
332 cxgb_dequeue(struct sge_qset *qs)
333 {
334 	struct mbuf *m, *m_head, *m_tail;
335 	struct coalesce_info ci;
336 
337 
338 	if (check_pkt_coalesce(qs) == 0)
339 		return TXQ_RING_DEQUEUE(qs);
340 
341 	m_head = m_tail = NULL;
342 	ci.count = ci.nbytes = 0;
343 	do {
344 		m = TXQ_RING_DEQUEUE_COND(qs, coalesce_check, &ci);
345 		if (m_head == NULL) {
346 			m_tail = m_head = m;
347 		} else if (m != NULL) {
348 			m_tail->m_nextpkt = m;
349 			m_tail = m;
350 		}
351 	} while (m != NULL);
352 	if (ci.count > 7)
353 		panic("trying to coalesce %d packets in to one WR", ci.count);
354 	return (m_head);
355 }
356 
357 /**
358  *	reclaim_completed_tx - reclaims completed Tx descriptors
359  *	@adapter: the adapter
360  *	@q: the Tx queue to reclaim completed descriptors from
361  *
362  *	Reclaims Tx descriptors that the SGE has indicated it has processed,
363  *	and frees the associated buffers if possible.  Called with the Tx
364  *	queue's lock held.
365  */
366 static __inline int
367 reclaim_completed_tx(struct sge_qset *qs, int reclaim_min, int queue)
368 {
369 	struct sge_txq *q = &qs->txq[queue];
370 	int reclaim = desc_reclaimable(q);
371 
372 	if ((cxgb_tx_reclaim_threshold > TX_RECLAIM_MAX) ||
373 	    (cxgb_tx_reclaim_threshold < TX_RECLAIM_MIN))
374 		cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT;
375 
376 	if (reclaim < reclaim_min)
377 		return (0);
378 
379 	mtx_assert(&qs->lock, MA_OWNED);
380 	if (reclaim > 0) {
381 		t3_free_tx_desc(qs, reclaim, queue);
382 		q->cleaned += reclaim;
383 		q->in_use -= reclaim;
384 	}
385 	if (isset(&qs->txq_stopped, TXQ_ETH))
386                 clrbit(&qs->txq_stopped, TXQ_ETH);
387 
388 	return (reclaim);
389 }
390 
391 /**
392  *	should_restart_tx - are there enough resources to restart a Tx queue?
393  *	@q: the Tx queue
394  *
395  *	Checks if there are enough descriptors to restart a suspended Tx queue.
396  */
397 static __inline int
398 should_restart_tx(const struct sge_txq *q)
399 {
400 	unsigned int r = q->processed - q->cleaned;
401 
402 	return q->in_use - r < (q->size >> 1);
403 }
404 
405 /**
406  *	t3_sge_init - initialize SGE
407  *	@adap: the adapter
408  *	@p: the SGE parameters
409  *
410  *	Performs SGE initialization needed every time after a chip reset.
411  *	We do not initialize any of the queue sets here, instead the driver
412  *	top-level must request those individually.  We also do not enable DMA
413  *	here, that should be done after the queues have been set up.
414  */
415 void
416 t3_sge_init(adapter_t *adap, struct sge_params *p)
417 {
418 	u_int ctrl, ups;
419 
420 	ups = 0; /* = ffs(pci_resource_len(adap->pdev, 2) >> 12); */
421 
422 	ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL |
423 	       F_CQCRDTCTRL | F_CONGMODE | F_TNLFLMODE | F_FATLPERREN |
424 	       V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS |
425 	       V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING;
426 #if SGE_NUM_GENBITS == 1
427 	ctrl |= F_EGRGENCTRL;
428 #endif
429 	if (adap->params.rev > 0) {
430 		if (!(adap->flags & (USING_MSIX | USING_MSI)))
431 			ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ;
432 	}
433 	t3_write_reg(adap, A_SG_CONTROL, ctrl);
434 	t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) |
435 		     V_LORCQDRBTHRSH(512));
436 	t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10);
437 	t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) |
438 		     V_TIMEOUT(200 * core_ticks_per_usec(adap)));
439 	t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH,
440 		     adap->params.rev < T3_REV_C ? 1000 : 500);
441 	t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256);
442 	t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000);
443 	t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256);
444 	t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff));
445 	t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024);
446 }
447 
448 
/**
 *	sgl_len - calculates the size of an SGL of the given capacity
 *	@n: the number of SGL entries
 *
 *	Returns the number of flits needed for a scatter/gather list with
 *	@n entries: three flits for every two entries, with an odd trailing
 *	entry taking two flits.
 */
static __inline unsigned int
sgl_len(unsigned int n)
{
	unsigned int flits = (3 * n) / 2;

	/* An odd entry count needs one more flit than the truncated 3n/2. */
	if (n & 1)
		flits += 1;
	return (flits);
}
461 
462 /**
463  *	get_imm_packet - return the next ingress packet buffer from a response
464  *	@resp: the response descriptor containing the packet data
465  *
466  *	Return a packet containing the immediate data of the given response.
467  */
468 static int
469 get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct mbuf *m)
470 {
471 
472 	m->m_len = m->m_pkthdr.len = IMMED_PKT_SIZE;
473 	m->m_ext.ext_buf = NULL;
474 	m->m_ext.ext_type = 0;
475 	memcpy(mtod(m, uint8_t *), resp->imm_data, IMMED_PKT_SIZE);
476 	return (0);
477 }
478 
479 static __inline u_int
480 flits_to_desc(u_int n)
481 {
482 	return (flit_desc_map[n]);
483 }
484 
/* A_SG_INT_CAUSE bits that t3_sge_err_intr_handler() reports as parity errors. */
#define SGE_PARERR (F_CPPARITYERROR | F_OCPARITYERROR | F_RCPARITYERROR | \
		    F_IRPARITYERROR | V_ITPARITYERROR(M_ITPARITYERROR) | \
		    V_FLPARITYERROR(M_FLPARITYERROR) | F_LODRBPARITYERROR | \
		    F_HIDRBPARITYERROR | F_LORCQPARITYERROR | \
		    F_HIRCQPARITYERROR)
/* A_SG_INT_CAUSE bits reported as framing errors. */
#define SGE_FRAMINGERR (F_UC_REQ_FRAMINGERROR | F_R_REQ_FRAMINGERROR)
/* Any of these bits makes t3_sge_err_intr_handler() call t3_fatal_err(). */
#define SGE_FATALERR (SGE_PARERR | SGE_FRAMINGERR | F_RSPQCREDITOVERFOW | \
		      F_RSPQDISABLED)
493 
494 /**
495  *	t3_sge_err_intr_handler - SGE async event interrupt handler
496  *	@adapter: the adapter
497  *
498  *	Interrupt handler for SGE asynchronous (non-data) events.
499  */
500 void
501 t3_sge_err_intr_handler(adapter_t *adapter)
502 {
503 	unsigned int v, status;
504 
505 	status = t3_read_reg(adapter, A_SG_INT_CAUSE);
506 	if (status & SGE_PARERR)
507 		CH_ALERT(adapter, "SGE parity error (0x%x)\n",
508 			 status & SGE_PARERR);
509 	if (status & SGE_FRAMINGERR)
510 		CH_ALERT(adapter, "SGE framing error (0x%x)\n",
511 			 status & SGE_FRAMINGERR);
512 	if (status & F_RSPQCREDITOVERFOW)
513 		CH_ALERT(adapter, "SGE response queue credit overflow\n");
514 
515 	if (status & F_RSPQDISABLED) {
516 		v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS);
517 
518 		CH_ALERT(adapter,
519 			 "packet delivered to disabled response queue (0x%x)\n",
520 			 (v >> S_RSPQ0DISABLED) & 0xff);
521 	}
522 
523 	t3_write_reg(adapter, A_SG_INT_CAUSE, status);
524 	if (status & SGE_FATALERR)
525 		t3_fatal_err(adapter);
526 }
527 
528 void
529 t3_sge_prep(adapter_t *adap, struct sge_params *p)
530 {
531 	int i, nqsets, fl_q_size, jumbo_q_size, use_16k, jumbo_buf_size;
532 
533 	nqsets = min(SGE_QSETS / adap->params.nports, mp_ncpus);
534 	nqsets *= adap->params.nports;
535 
536 	fl_q_size = min(nmbclusters/(3*nqsets), FL_Q_SIZE);
537 
538 	while (!powerof2(fl_q_size))
539 		fl_q_size--;
540 
541 	use_16k = cxgb_use_16k_clusters != -1 ? cxgb_use_16k_clusters :
542 	    is_offload(adap);
543 
544 #if __FreeBSD_version >= 700111
545 	if (use_16k) {
546 		jumbo_q_size = min(nmbjumbo16/(3*nqsets), JUMBO_Q_SIZE);
547 		jumbo_buf_size = MJUM16BYTES;
548 	} else {
549 		jumbo_q_size = min(nmbjumbo9/(3*nqsets), JUMBO_Q_SIZE);
550 		jumbo_buf_size = MJUM9BYTES;
551 	}
552 #else
553 	jumbo_q_size = min(nmbjumbop/(3*nqsets), JUMBO_Q_SIZE);
554 	jumbo_buf_size = MJUMPAGESIZE;
555 #endif
556 	while (!powerof2(jumbo_q_size))
557 		jumbo_q_size--;
558 
559 	if (fl_q_size < (FL_Q_SIZE / 4) || jumbo_q_size < (JUMBO_Q_SIZE / 2))
560 		device_printf(adap->dev,
561 		    "Insufficient clusters and/or jumbo buffers.\n");
562 
563 	p->max_pkt_size = jumbo_buf_size - sizeof(struct cpl_rx_data);
564 
565 	for (i = 0; i < SGE_QSETS; ++i) {
566 		struct qset_params *q = p->qset + i;
567 
568 		if (adap->params.nports > 2) {
569 			q->coalesce_usecs = 50;
570 		} else {
571 #ifdef INVARIANTS
572 			q->coalesce_usecs = 10;
573 #else
574 			q->coalesce_usecs = 5;
575 #endif
576 		}
577 		q->polling = 0;
578 		q->rspq_size = RSPQ_Q_SIZE;
579 		q->fl_size = fl_q_size;
580 		q->jumbo_size = jumbo_q_size;
581 		q->jumbo_buf_size = jumbo_buf_size;
582 		q->txq_size[TXQ_ETH] = TX_ETH_Q_SIZE;
583 		q->txq_size[TXQ_OFLD] = is_offload(adap) ? TX_OFLD_Q_SIZE : 16;
584 		q->txq_size[TXQ_CTRL] = TX_CTRL_Q_SIZE;
585 		q->cong_thres = 0;
586 	}
587 }
588 
589 int
590 t3_sge_alloc(adapter_t *sc)
591 {
592 
593 	/* The parent tag. */
594 	if (bus_dma_tag_create( NULL,			/* parent */
595 				1, 0,			/* algnmnt, boundary */
596 				BUS_SPACE_MAXADDR,	/* lowaddr */
597 				BUS_SPACE_MAXADDR,	/* highaddr */
598 				NULL, NULL,		/* filter, filterarg */
599 				BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
600 				BUS_SPACE_UNRESTRICTED, /* nsegments */
601 				BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
602 				0,			/* flags */
603 				NULL, NULL,		/* lock, lockarg */
604 				&sc->parent_dmat)) {
605 		device_printf(sc->dev, "Cannot allocate parent DMA tag\n");
606 		return (ENOMEM);
607 	}
608 
609 	/*
610 	 * DMA tag for normal sized RX frames
611 	 */
612 	if (bus_dma_tag_create(sc->parent_dmat, MCLBYTES, 0, BUS_SPACE_MAXADDR,
613 		BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, 1,
614 		MCLBYTES, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_dmat)) {
615 		device_printf(sc->dev, "Cannot allocate RX DMA tag\n");
616 		return (ENOMEM);
617 	}
618 
619 	/*
620 	 * DMA tag for jumbo sized RX frames.
621 	 */
622 	if (bus_dma_tag_create(sc->parent_dmat, MJUM16BYTES, 0, BUS_SPACE_MAXADDR,
623 		BUS_SPACE_MAXADDR, NULL, NULL, MJUM16BYTES, 1, MJUM16BYTES,
624 		BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_jumbo_dmat)) {
625 		device_printf(sc->dev, "Cannot allocate RX jumbo DMA tag\n");
626 		return (ENOMEM);
627 	}
628 
629 	/*
630 	 * DMA tag for TX frames.
631 	 */
632 	if (bus_dma_tag_create(sc->parent_dmat, 1, 0, BUS_SPACE_MAXADDR,
633 		BUS_SPACE_MAXADDR, NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
634 		TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
635 		NULL, NULL, &sc->tx_dmat)) {
636 		device_printf(sc->dev, "Cannot allocate TX DMA tag\n");
637 		return (ENOMEM);
638 	}
639 
640 	return (0);
641 }
642 
643 int
644 t3_sge_free(struct adapter * sc)
645 {
646 
647 	if (sc->tx_dmat != NULL)
648 		bus_dma_tag_destroy(sc->tx_dmat);
649 
650 	if (sc->rx_jumbo_dmat != NULL)
651 		bus_dma_tag_destroy(sc->rx_jumbo_dmat);
652 
653 	if (sc->rx_dmat != NULL)
654 		bus_dma_tag_destroy(sc->rx_dmat);
655 
656 	if (sc->parent_dmat != NULL)
657 		bus_dma_tag_destroy(sc->parent_dmat);
658 
659 	return (0);
660 }
661 
662 void
663 t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p)
664 {
665 
666 	qs->rspq.holdoff_tmr = max(p->coalesce_usecs * 10, 1U);
667 	qs->rspq.polling = 0 /* p->polling */;
668 }
669 
#if !defined(__i386__) && !defined(__amd64__)
/*
 * bus_dmamap_load() callback used by refill_fl(): records the error code,
 * the first segment and the segment count in the caller's
 * struct refill_fl_cb_arg.
 */
static void
refill_fl_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
{
	struct refill_fl_cb_arg *result = arg;

	result->nseg = nseg;
	result->seg = segs[0];
	result->error = error;
}
#endif
682 /**
683  *	refill_fl - refill an SGE free-buffer list
684  *	@sc: the controller softc
685  *	@q: the free-list to refill
686  *	@n: the number of new buffers to allocate
687  *
688  *	(Re)populate an SGE free-buffer list with up to @n new packet buffers.
689  *	The caller must assure that @n does not exceed the queue's capacity.
690  */
691 static void
692 refill_fl(adapter_t *sc, struct sge_fl *q, int n)
693 {
694 	struct rx_sw_desc *sd = &q->sdesc[q->pidx];
695 	struct rx_desc *d = &q->desc[q->pidx];
696 	struct refill_fl_cb_arg cb_arg;
697 	struct mbuf *m;
698 	caddr_t cl;
699 	int err;
700 
701 	cb_arg.error = 0;
702 	while (n--) {
703 		/*
704 		 * We only allocate a cluster, mbuf allocation happens after rx
705 		 */
706 		if (q->zone == zone_pack) {
707 			if ((m = m_getcl(M_NOWAIT, MT_NOINIT, M_PKTHDR)) == NULL)
708 				break;
709 			cl = m->m_ext.ext_buf;
710 		} else {
711 			if ((cl = m_cljget(NULL, M_NOWAIT, q->buf_size)) == NULL)
712 				break;
713 			if ((m = m_gethdr(M_NOWAIT, MT_NOINIT)) == NULL) {
714 				uma_zfree(q->zone, cl);
715 				break;
716 			}
717 		}
718 		if ((sd->flags & RX_SW_DESC_MAP_CREATED) == 0) {
719 			if ((err = bus_dmamap_create(q->entry_tag, 0, &sd->map))) {
720 				log(LOG_WARNING, "bus_dmamap_create failed %d\n", err);
721 				uma_zfree(q->zone, cl);
722 				goto done;
723 			}
724 			sd->flags |= RX_SW_DESC_MAP_CREATED;
725 		}
726 #if !defined(__i386__) && !defined(__amd64__)
727 		err = bus_dmamap_load(q->entry_tag, sd->map,
728 		    cl, q->buf_size, refill_fl_cb, &cb_arg, 0);
729 
730 		if (err != 0 || cb_arg.error) {
731 			if (q->zone == zone_pack)
732 				uma_zfree(q->zone, cl);
733 			m_free(m);
734 			goto done;
735 		}
736 #else
737 		cb_arg.seg.ds_addr = pmap_kextract((vm_offset_t)cl);
738 #endif
739 		sd->flags |= RX_SW_DESC_INUSE;
740 		sd->rxsd_cl = cl;
741 		sd->m = m;
742 		d->addr_lo = htobe32(cb_arg.seg.ds_addr & 0xffffffff);
743 		d->addr_hi = htobe32(((uint64_t)cb_arg.seg.ds_addr >>32) & 0xffffffff);
744 		d->len_gen = htobe32(V_FLD_GEN1(q->gen));
745 		d->gen2 = htobe32(V_FLD_GEN2(q->gen));
746 
747 		d++;
748 		sd++;
749 
750 		if (++q->pidx == q->size) {
751 			q->pidx = 0;
752 			q->gen ^= 1;
753 			sd = q->sdesc;
754 			d = q->desc;
755 		}
756 		q->credits++;
757 		q->db_pending++;
758 	}
759 
760 done:
761 	if (q->db_pending >= 32) {
762 		q->db_pending = 0;
763 		t3_write_reg(sc, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
764 	}
765 }
766 
767 
768 /**
769  *	free_rx_bufs - free the Rx buffers on an SGE free list
770  *	@sc: the controle softc
771  *	@q: the SGE free list to clean up
772  *
773  *	Release the buffers on an SGE free-buffer Rx queue.  HW fetching from
774  *	this queue should be stopped before calling this function.
775  */
776 static void
777 free_rx_bufs(adapter_t *sc, struct sge_fl *q)
778 {
779 	u_int cidx = q->cidx;
780 
781 	while (q->credits--) {
782 		struct rx_sw_desc *d = &q->sdesc[cidx];
783 
784 		if (d->flags & RX_SW_DESC_INUSE) {
785 			bus_dmamap_unload(q->entry_tag, d->map);
786 			bus_dmamap_destroy(q->entry_tag, d->map);
787 			if (q->zone == zone_pack) {
788 				m_init(d->m, zone_pack, MCLBYTES,
789 				    M_NOWAIT, MT_DATA, M_EXT);
790 				uma_zfree(zone_pack, d->m);
791 			} else {
792 				m_init(d->m, zone_mbuf, MLEN,
793 				    M_NOWAIT, MT_DATA, 0);
794 				uma_zfree(zone_mbuf, d->m);
795 				uma_zfree(q->zone, d->rxsd_cl);
796 			}
797 		}
798 
799 		d->rxsd_cl = NULL;
800 		d->m = NULL;
801 		if (++cidx == q->size)
802 			cidx = 0;
803 	}
804 }
805 
806 static __inline void
807 __refill_fl(adapter_t *adap, struct sge_fl *fl)
808 {
809 	refill_fl(adap, fl, min(16U, fl->size - fl->credits));
810 }
811 
812 static __inline void
813 __refill_fl_lt(adapter_t *adap, struct sge_fl *fl, int max)
814 {
815 	uint32_t reclaimable = fl->size - fl->credits;
816 
817 	if (reclaimable > 0)
818 		refill_fl(adap, fl, min(max, reclaimable));
819 }
820 
821 /**
822  *	recycle_rx_buf - recycle a receive buffer
823  *	@adapter: the adapter
824  *	@q: the SGE free list
825  *	@idx: index of buffer to recycle
826  *
827  *	Recycles the specified buffer on the given free list by adding it at
828  *	the next available slot on the list.
829  */
830 static void
831 recycle_rx_buf(adapter_t *adap, struct sge_fl *q, unsigned int idx)
832 {
833 	struct rx_desc *from = &q->desc[idx];
834 	struct rx_desc *to   = &q->desc[q->pidx];
835 
836 	q->sdesc[q->pidx] = q->sdesc[idx];
837 	to->addr_lo = from->addr_lo;        // already big endian
838 	to->addr_hi = from->addr_hi;        // likewise
839 	wmb();	/* necessary ? */
840 	to->len_gen = htobe32(V_FLD_GEN1(q->gen));
841 	to->gen2 = htobe32(V_FLD_GEN2(q->gen));
842 	q->credits++;
843 
844 	if (++q->pidx == q->size) {
845 		q->pidx = 0;
846 		q->gen ^= 1;
847 	}
848 	t3_write_reg(adap, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
849 }
850 
851 static void
852 alloc_ring_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
853 {
854 	uint32_t *addr;
855 
856 	addr = arg;
857 	*addr = segs[0].ds_addr;
858 }
859 
860 static int
861 alloc_ring(adapter_t *sc, size_t nelem, size_t elem_size, size_t sw_size,
862     bus_addr_t *phys, void *desc, void *sdesc, bus_dma_tag_t *tag,
863     bus_dmamap_t *map, bus_dma_tag_t parent_entry_tag, bus_dma_tag_t *entry_tag)
864 {
865 	size_t len = nelem * elem_size;
866 	void *s = NULL;
867 	void *p = NULL;
868 	int err;
869 
870 	if ((err = bus_dma_tag_create(sc->parent_dmat, PAGE_SIZE, 0,
871 				      BUS_SPACE_MAXADDR_32BIT,
872 				      BUS_SPACE_MAXADDR, NULL, NULL, len, 1,
873 				      len, 0, NULL, NULL, tag)) != 0) {
874 		device_printf(sc->dev, "Cannot allocate descriptor tag\n");
875 		return (ENOMEM);
876 	}
877 
878 	if ((err = bus_dmamem_alloc(*tag, (void **)&p, BUS_DMA_NOWAIT,
879 				    map)) != 0) {
880 		device_printf(sc->dev, "Cannot allocate descriptor memory\n");
881 		return (ENOMEM);
882 	}
883 
884 	bus_dmamap_load(*tag, *map, p, len, alloc_ring_cb, phys, 0);
885 	bzero(p, len);
886 	*(void **)desc = p;
887 
888 	if (sw_size) {
889 		len = nelem * sw_size;
890 		s = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO);
891 		*(void **)sdesc = s;
892 	}
893 	if (parent_entry_tag == NULL)
894 		return (0);
895 
896 	if ((err = bus_dma_tag_create(parent_entry_tag, 1, 0,
897 				      BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR,
898 		                      NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
899 				      TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
900 		                      NULL, NULL, entry_tag)) != 0) {
901 		device_printf(sc->dev, "Cannot allocate descriptor entry tag\n");
902 		return (ENOMEM);
903 	}
904 	return (0);
905 }
906 
907 static void
908 sge_slow_intr_handler(void *arg, int ncount)
909 {
910 	adapter_t *sc = arg;
911 
912 	t3_slow_intr_handler(sc);
913 	t3_write_reg(sc, A_PL_INT_ENABLE0, sc->slow_intr_mask);
914 	(void) t3_read_reg(sc, A_PL_INT_ENABLE0);
915 }
916 
917 /**
918  *	sge_timer_cb - perform periodic maintenance of an SGE qset
919  *	@data: the SGE queue set to maintain
920  *
921  *	Runs periodically from a timer to perform maintenance of an SGE queue
922  *	set.  It performs two tasks:
923  *
924  *	a) Cleans up any completed Tx descriptors that may still be pending.
925  *	Normal descriptor cleanup happens when new packets are added to a Tx
926  *	queue so this timer is relatively infrequent and does any cleanup only
927  *	if the Tx queue has not seen any new packets in a while.  We make a
928  *	best effort attempt to reclaim descriptors, in that we don't wait
929  *	around if we cannot get a queue's lock (which most likely is because
930  *	someone else is queueing new packets and so will also handle the clean
931  *	up).  Since control queues use immediate data exclusively we don't
932  *	bother cleaning them up here.
933  *
934  *	b) Replenishes Rx queues that have run out due to memory shortage.
935  *	Normally new Rx buffers are added when existing ones are consumed but
936  *	when out of memory a queue can become empty.  We try to add only a few
937  *	buffers here, the queue will be replenished fully as these new buffers
938  *	are used up if memory shortage has subsided.
939  *
940  *	c) Return coalesced response queue credits in case a response queue is
941  *	starved.
942  *
943  *	d) Ring doorbells for T304 tunnel queues since we have seen doorbell
944  *	fifo overflows and the FW doesn't implement any recovery scheme yet.
945  */
946 static void
947 sge_timer_cb(void *arg)
948 {
949 	adapter_t *sc = arg;
950 	if ((sc->flags & USING_MSIX) == 0) {
951 
952 		struct port_info *pi;
953 		struct sge_qset *qs;
954 		struct sge_txq  *txq;
955 		int i, j;
956 		int reclaim_ofl, refill_rx;
957 
958 		if (sc->open_device_map == 0)
959 			return;
960 
961 		for (i = 0; i < sc->params.nports; i++) {
962 			pi = &sc->port[i];
963 			for (j = 0; j < pi->nqsets; j++) {
964 				qs = &sc->sge.qs[pi->first_qset + j];
965 				txq = &qs->txq[0];
966 				reclaim_ofl = txq[TXQ_OFLD].processed - txq[TXQ_OFLD].cleaned;
967 				refill_rx = ((qs->fl[0].credits < qs->fl[0].size) ||
968 				    (qs->fl[1].credits < qs->fl[1].size));
969 				if (reclaim_ofl || refill_rx) {
970 					taskqueue_enqueue(sc->tq, &pi->timer_reclaim_task);
971 					break;
972 				}
973 			}
974 		}
975 	}
976 
977 	if (sc->params.nports > 2) {
978 		int i;
979 
980 		for_each_port(sc, i) {
981 			struct port_info *pi = &sc->port[i];
982 
983 			t3_write_reg(sc, A_SG_KDOORBELL,
984 				     F_SELEGRCNTX |
985 				     (FW_TUNNEL_SGEEC_START + pi->first_qset));
986 		}
987 	}
988 	if (((sc->flags & USING_MSIX) == 0 || sc->params.nports > 2) &&
989 	    sc->open_device_map != 0)
990 		callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
991 }
992 
993 /*
994  * This is meant to be a catch-all function to keep sge state private
995  * to sge.c
996  *
997  */
998 int
999 t3_sge_init_adapter(adapter_t *sc)
1000 {
1001 	callout_init(&sc->sge_timer_ch, CALLOUT_MPSAFE);
1002 	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
1003 	TASK_INIT(&sc->slow_intr_task, 0, sge_slow_intr_handler, sc);
1004 	return (0);
1005 }
1006 
1007 int
1008 t3_sge_reset_adapter(adapter_t *sc)
1009 {
1010 	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
1011 	return (0);
1012 }
1013 
1014 int
1015 t3_sge_init_port(struct port_info *pi)
1016 {
1017 	TASK_INIT(&pi->timer_reclaim_task, 0, sge_timer_reclaim, pi);
1018 	return (0);
1019 }
1020 
1021 /**
1022  *	refill_rspq - replenish an SGE response queue
1023  *	@adapter: the adapter
1024  *	@q: the response queue to replenish
1025  *	@credits: how many new responses to make available
1026  *
1027  *	Replenishes a response queue by making the supplied number of responses
1028  *	available to HW.
1029  */
1030 static __inline void
1031 refill_rspq(adapter_t *sc, const struct sge_rspq *q, u_int credits)
1032 {
1033 
1034 	/* mbufs are allocated on demand when a rspq entry is processed. */
1035 	t3_write_reg(sc, A_SG_RSPQ_CREDIT_RETURN,
1036 		     V_RSPQ(q->cntxt_id) | V_CREDITS(credits));
1037 }
1038 
/*
 * Task handler: run reclaim_completed_tx() with a threshold argument
 * of 16 on Tx queues 0, 1 and 2 of the queue set passed in @arg.
 * @ncount is unused.
 */
static void
sge_txq_reclaim_handler(void *arg, int ncount)
{
	struct sge_qset *qs = arg;
	int qid = 0;

	while (qid < 3) {
		reclaim_completed_tx(qs, 16, qid);
		qid++;
	}
}
1048 
1049 static void
1050 sge_timer_reclaim(void *arg, int ncount)
1051 {
1052 	struct port_info *pi = arg;
1053 	int i, nqsets = pi->nqsets;
1054 	adapter_t *sc = pi->adapter;
1055 	struct sge_qset *qs;
1056 	struct mtx *lock;
1057 
1058 	KASSERT((sc->flags & USING_MSIX) == 0,
1059 	    ("can't call timer reclaim for msi-x"));
1060 
1061 	for (i = 0; i < nqsets; i++) {
1062 		qs = &sc->sge.qs[pi->first_qset + i];
1063 
1064 		reclaim_completed_tx(qs, 16, TXQ_OFLD);
1065 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
1066 			    &sc->sge.qs[0].rspq.lock;
1067 
1068 		if (mtx_trylock(lock)) {
1069 			/* XXX currently assume that we are *NOT* polling */
1070 			uint32_t status = t3_read_reg(sc, A_SG_RSPQ_FL_STATUS);
1071 
1072 			if (qs->fl[0].credits < qs->fl[0].size - 16)
1073 				__refill_fl(sc, &qs->fl[0]);
1074 			if (qs->fl[1].credits < qs->fl[1].size - 16)
1075 				__refill_fl(sc, &qs->fl[1]);
1076 
1077 			if (status & (1 << qs->rspq.cntxt_id)) {
1078 				if (qs->rspq.credits) {
1079 					refill_rspq(sc, &qs->rspq, 1);
1080 					qs->rspq.credits--;
1081 					t3_write_reg(sc, A_SG_RSPQ_FL_STATUS,
1082 					    1 << qs->rspq.cntxt_id);
1083 				}
1084 			}
1085 			mtx_unlock(lock);
1086 		}
1087 	}
1088 }
1089 
1090 /**
1091  *	init_qset_cntxt - initialize an SGE queue set context info
1092  *	@qs: the queue set
1093  *	@id: the queue set id
1094  *
1095  *	Initializes the TIDs and context ids for the queues of a queue set.
1096  */
1097 static void
1098 init_qset_cntxt(struct sge_qset *qs, u_int id)
1099 {
1100 
1101 	qs->rspq.cntxt_id = id;
1102 	qs->fl[0].cntxt_id = 2 * id;
1103 	qs->fl[1].cntxt_id = 2 * id + 1;
1104 	qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id;
1105 	qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id;
1106 	qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id;
1107 	qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id;
1108 	qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id;
1109 
1110 	mbufq_init(&qs->txq[TXQ_ETH].sendq);
1111 	mbufq_init(&qs->txq[TXQ_OFLD].sendq);
1112 	mbufq_init(&qs->txq[TXQ_CTRL].sendq);
1113 }
1114 
1115 
1116 static void
1117 txq_prod(struct sge_txq *txq, unsigned int ndesc, struct txq_state *txqs)
1118 {
1119 	txq->in_use += ndesc;
1120 	/*
1121 	 * XXX we don't handle stopping of queue
1122 	 * presumably start handles this when we bump against the end
1123 	 */
1124 	txqs->gen = txq->gen;
1125 	txq->unacked += ndesc;
1126 	txqs->compl = (txq->unacked & 32) << (S_WR_COMPL - 5);
1127 	txq->unacked &= 31;
1128 	txqs->pidx = txq->pidx;
1129 	txq->pidx += ndesc;
1130 #ifdef INVARIANTS
1131 	if (((txqs->pidx > txq->cidx) &&
1132 		(txq->pidx < txqs->pidx) &&
1133 		(txq->pidx >= txq->cidx)) ||
1134 	    ((txqs->pidx < txq->cidx) &&
1135 		(txq->pidx >= txq-> cidx)) ||
1136 	    ((txqs->pidx < txq->cidx) &&
1137 		(txq->cidx < txqs->pidx)))
1138 		panic("txqs->pidx=%d txq->pidx=%d txq->cidx=%d",
1139 		    txqs->pidx, txq->pidx, txq->cidx);
1140 #endif
1141 	if (txq->pidx >= txq->size) {
1142 		txq->pidx -= txq->size;
1143 		txq->gen ^= 1;
1144 	}
1145 
1146 }
1147 
1148 /**
1149  *	calc_tx_descs - calculate the number of Tx descriptors for a packet
1150  *	@m: the packet mbufs
1151  *      @nsegs: the number of segments
1152  *
1153  * 	Returns the number of Tx descriptors needed for the given Ethernet
1154  * 	packet.  Ethernet packets require addition of WR and CPL headers.
1155  */
1156 static __inline unsigned int
1157 calc_tx_descs(const struct mbuf *m, int nsegs)
1158 {
1159 	unsigned int flits;
1160 
1161 	if (m->m_pkthdr.len <= PIO_LEN)
1162 		return 1;
1163 
1164 	flits = sgl_len(nsegs) + 2;
1165 	if (m->m_pkthdr.csum_flags & CSUM_TSO)
1166 		flits++;
1167 
1168 	return flits_to_desc(flits);
1169 }
1170 
1171 static unsigned int
1172 busdma_map_mbufs(struct mbuf **m, struct sge_txq *txq,
1173     struct tx_sw_desc *txsd, bus_dma_segment_t *segs, int *nsegs)
1174 {
1175 	struct mbuf *m0;
1176 	int err, pktlen, pass = 0;
1177 	bus_dma_tag_t tag = txq->entry_tag;
1178 
1179 retry:
1180 	err = 0;
1181 	m0 = *m;
1182 	pktlen = m0->m_pkthdr.len;
1183 #if defined(__i386__) || defined(__amd64__)
1184 	if (busdma_map_sg_collapse(tag, txsd->map, m, segs, nsegs) == 0) {
1185 		goto done;
1186 	} else
1187 #endif
1188 		err = bus_dmamap_load_mbuf_sg(tag, txsd->map, m0, segs, nsegs, 0);
1189 
1190 	if (err == 0) {
1191 		goto done;
1192 	}
1193 	if (err == EFBIG && pass == 0) {
1194 		pass = 1;
1195 		/* Too many segments, try to defrag */
1196 		m0 = m_defrag(m0, M_DONTWAIT);
1197 		if (m0 == NULL) {
1198 			m_freem(*m);
1199 			*m = NULL;
1200 			return (ENOBUFS);
1201 		}
1202 		*m = m0;
1203 		goto retry;
1204 	} else if (err == ENOMEM) {
1205 		return (err);
1206 	} if (err) {
1207 		if (cxgb_debug)
1208 			printf("map failure err=%d pktlen=%d\n", err, pktlen);
1209 		m_freem(m0);
1210 		*m = NULL;
1211 		return (err);
1212 	}
1213 done:
1214 #if !defined(__i386__) && !defined(__amd64__)
1215 	bus_dmamap_sync(tag, txsd->map, BUS_DMASYNC_PREWRITE);
1216 #endif
1217 	txsd->flags |= TX_SW_DESC_MAPPED;
1218 
1219 	return (0);
1220 }
1221 
1222 /**
1223  *	make_sgl - populate a scatter/gather list for a packet
1224  *	@sgp: the SGL to populate
1225  *	@segs: the packet dma segments
1226  *	@nsegs: the number of segments
1227  *
1228  *	Generates a scatter/gather list for the buffers that make up a packet
1229  *	and returns the SGL size in 8-byte words.  The caller must size the SGL
1230  *	appropriately.
1231  */
1232 static __inline void
1233 make_sgl(struct sg_ent *sgp, bus_dma_segment_t *segs, int nsegs)
1234 {
1235 	int i, idx;
1236 
1237 	for (idx = 0, i = 0; i < nsegs; i++) {
1238 		/*
1239 		 * firmware doesn't like empty segments
1240 		 */
1241 		if (segs[i].ds_len == 0)
1242 			continue;
1243 		if (i && idx == 0)
1244 			++sgp;
1245 
1246 		sgp->len[idx] = htobe32(segs[i].ds_len);
1247 		sgp->addr[idx] = htobe64(segs[i].ds_addr);
1248 		idx ^= 1;
1249 	}
1250 
1251 	if (idx) {
1252 		sgp->len[idx] = 0;
1253 		sgp->addr[idx] = 0;
1254 	}
1255 }
1256 
1257 /**
1258  *	check_ring_tx_db - check and potentially ring a Tx queue's doorbell
1259  *	@adap: the adapter
1260  *	@q: the Tx queue
1261  *
1262  *	Ring the doorbell if a Tx queue is asleep.  There is a natural race,
1263  *	where the HW is going to sleep just after we checked, however,
1264  *	then the interrupt handler will detect the outstanding TX packet
1265  *	and ring the doorbell for us.
1266  *
1267  *	When GTS is disabled we unconditionally ring the doorbell.
1268  */
1269 static __inline void
1270 check_ring_tx_db(adapter_t *adap, struct sge_txq *q, int mustring)
1271 {
1272 #if USE_GTS
1273 	clear_bit(TXQ_LAST_PKT_DB, &q->flags);
1274 	if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) {
1275 		set_bit(TXQ_LAST_PKT_DB, &q->flags);
1276 #ifdef T3_TRACE
1277 		T3_TRACE1(adap->tb[q->cntxt_id & 7], "doorbell Tx, cntxt %d",
1278 			  q->cntxt_id);
1279 #endif
1280 		t3_write_reg(adap, A_SG_KDOORBELL,
1281 			     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1282 	}
1283 #else
1284 	if (mustring || ++q->db_pending >= 32) {
1285 		wmb();            /* write descriptors before telling HW */
1286 		t3_write_reg(adap, A_SG_KDOORBELL,
1287 		    F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1288 		q->db_pending = 0;
1289 	}
1290 #endif
1291 }
1292 
/*
 * When the SGE is built with two generation bits, store @gen (big
 * endian) in the last flit of descriptor @d.  Otherwise this is a no-op.
 */
static __inline void
wr_gen2(struct tx_desc *d, unsigned int gen)
{
#if SGE_NUM_GENBITS == 2
	const unsigned int last_flit = TX_DESC_FLITS - 1;

	d->flit[last_flit] = htobe64(gen);
#endif
}
1300 
1301 /**
1302  *	write_wr_hdr_sgl - write a WR header and, optionally, SGL
1303  *	@ndesc: number of Tx descriptors spanned by the SGL
1304  *	@txd: first Tx descriptor to be written
1305  *	@txqs: txq state (generation and producer index)
1306  *	@txq: the SGE Tx queue
1307  *	@sgl: the SGL
1308  *	@flits: number of flits to the start of the SGL in the first descriptor
1309  *	@sgl_flits: the SGL size in flits
1310  *	@wr_hi: top 32 bits of WR header based on WR type (big endian)
1311  *	@wr_lo: low 32 bits of WR header based on WR type (big endian)
1312  *
1313  *	Write a work request header and an associated SGL.  If the SGL is
1314  *	small enough to fit into one Tx descriptor it has already been written
1315  *	and we just need to write the WR header.  Otherwise we distribute the
1316  *	SGL across the number of descriptors it spans.
1317  */
1318 static void
1319 write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs,
1320     const struct sge_txq *txq, const struct sg_ent *sgl, unsigned int flits,
1321     unsigned int sgl_flits, unsigned int wr_hi, unsigned int wr_lo)
1322 {
1323 
1324 	struct work_request_hdr *wrp = (struct work_request_hdr *)txd;
1325 	struct tx_sw_desc *txsd = &txq->sdesc[txqs->pidx];
1326 
1327 	if (__predict_true(ndesc == 1)) {
1328 		set_wr_hdr(wrp, htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1329 			V_WR_SGLSFLT(flits)) | wr_hi,
1330 		    htonl(V_WR_LEN(flits + sgl_flits) |
1331 			V_WR_GEN(txqs->gen)) | wr_lo);
1332 		/* XXX gen? */
1333 		wr_gen2(txd, txqs->gen);
1334 
1335 	} else {
1336 		unsigned int ogen = txqs->gen;
1337 		const uint64_t *fp = (const uint64_t *)sgl;
1338 		struct work_request_hdr *wp = wrp;
1339 
1340 		wrp->wrh_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) |
1341 		    V_WR_SGLSFLT(flits)) | wr_hi;
1342 
1343 		while (sgl_flits) {
1344 			unsigned int avail = WR_FLITS - flits;
1345 
1346 			if (avail > sgl_flits)
1347 				avail = sgl_flits;
1348 			memcpy(&txd->flit[flits], fp, avail * sizeof(*fp));
1349 			sgl_flits -= avail;
1350 			ndesc--;
1351 			if (!sgl_flits)
1352 				break;
1353 
1354 			fp += avail;
1355 			txd++;
1356 			txsd++;
1357 			if (++txqs->pidx == txq->size) {
1358 				txqs->pidx = 0;
1359 				txqs->gen ^= 1;
1360 				txd = txq->desc;
1361 				txsd = txq->sdesc;
1362 			}
1363 
1364 			/*
1365 			 * when the head of the mbuf chain
1366 			 * is freed all clusters will be freed
1367 			 * with it
1368 			 */
1369 			wrp = (struct work_request_hdr *)txd;
1370 			wrp->wrh_hi = htonl(V_WR_DATATYPE(1) |
1371 			    V_WR_SGLSFLT(1)) | wr_hi;
1372 			wrp->wrh_lo = htonl(V_WR_LEN(min(WR_FLITS,
1373 				    sgl_flits + 1)) |
1374 			    V_WR_GEN(txqs->gen)) | wr_lo;
1375 			wr_gen2(txd, txqs->gen);
1376 			flits = 1;
1377 		}
1378 		wrp->wrh_hi |= htonl(F_WR_EOP);
1379 		wmb();
1380 		wp->wrh_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo;
1381 		wr_gen2((struct tx_desc *)wp, ogen);
1382 	}
1383 }
1384 
/*
 * Header bytes a TSO frame must carry:
 * sizeof(*eh) + sizeof(*ip) + sizeof(*tcp)
 */
#define TCPPKTHDRSIZE (ETHER_HDR_LEN + 20 + 20)

/*
 * If mbuf m has M_VLANTAG set, OR the VLAN-valid flag and the mbuf's
 * VLAN tag into cntrl; otherwise cntrl is left untouched.
 */
#define GET_VTAG(cntrl, m)						\
do {									\
	if ((m)->m_flags & M_VLANTAG)					\
		(cntrl) |= F_TXPKT_VLAN_VLD |				\
		    V_TXPKT_VLAN((m)->m_pkthdr.ether_vtag);		\
} while (0)
1393 
/*
 * Encapsulate one outgoing packet -- or an m_nextpkt-linked batch of
 * packets -- into work request(s) on the queue set's Ethernet Tx queue
 * and call check_ring_tx_db().
 *
 * qs->lock must be held (asserted below).
 *
 * Three forms are produced:
 *  - batch:  *m has m_nextpkt set; one descriptor with one entry per
 *            DMA segment returned by busdma_map_sg_vec();
 *  - TSO:    single multi-mbuf packet with CSUM_TSO; CPL_TX_PKT_LSO header;
 *  - plain:  CPL_TX_PKT header.
 * In the TSO and plain forms a packet of at most PIO_LEN bytes is copied
 * into the descriptor and freed here; larger packets are described by a
 * scatter/gather list and remain referenced from the software descriptor.
 *
 * Returns 0 on success, or the error from busdma_map_sg_collapse().
 *
 * NOTE(review): *m is read but never written back in this function.
 */
static int
t3_encap(struct sge_qset *qs, struct mbuf **m)
{
	adapter_t *sc;
	struct mbuf *m0;
	struct sge_txq *txq;
	struct txq_state txqs;
	struct port_info *pi;
	unsigned int ndesc, flits, cntrl, mlen;
	int err, nsegs, tso_info = 0;

	struct work_request_hdr *wrp;
	struct tx_sw_desc *txsd;
	struct sg_ent *sgp, *sgl;
	uint32_t wr_hi, wr_lo, sgl_flits;
	bus_dma_segment_t segs[TX_MAX_SEGS];

	struct tx_desc *txd;

	pi = qs->port;
	sc = pi->adapter;
	txq = &qs->txq[TXQ_ETH];
	/* Hardware and software descriptor at the current producer index. */
	txd = &txq->desc[txq->pidx];
	txsd = &txq->sdesc[txq->pidx];
	sgl = txq->txq_sgl;

	prefetch(txd);
	m0 = *m;

	mtx_assert(&qs->lock, MA_OWNED);
	cntrl = V_TXPKT_INTF(pi->txpkt_intf);
	KASSERT(m0->m_flags & M_PKTHDR, ("not packet header\n"));

	/* TSO is only set up for a single packet made of more than one mbuf. */
	if  (m0->m_nextpkt == NULL && m0->m_next != NULL &&
	    m0->m_pkthdr.csum_flags & (CSUM_TSO))
		tso_info = V_LSO_MSS(m0->m_pkthdr.tso_segsz);

	if (m0->m_nextpkt != NULL) {
		/* Batch: always a single descriptor. */
		busdma_map_sg_vec(txq->entry_tag, txsd->map, m0, segs, &nsegs);
		ndesc = 1;
		mlen = 0;
	} else {
		/* NOTE(review): m0 is passed by address and may be changed -- confirm. */
		if ((err = busdma_map_sg_collapse(txq->entry_tag, txsd->map,
		    &m0, segs, &nsegs))) {
			if (cxgb_debug)
				printf("failed ... err=%d\n", err);
			return (err);
		}
		mlen = m0->m_pkthdr.len;
		ndesc = calc_tx_descs(m0, nsegs);
	}
	/* Reserve ndesc descriptors; txqs gets the starting pidx/gen. */
	txq_prod(txq, ndesc, &txqs);

	KASSERT(m0->m_pkthdr.len, ("empty packet nsegs=%d", nsegs));
	txsd->m = m0;

	if (m0->m_nextpkt != NULL) {
		struct cpl_tx_pkt_batch *cpl_batch = (struct cpl_tx_pkt_batch *)txd;
		int i, fidx;

		if (nsegs > 7)
			panic("trying to coalesce %d packets in to one WR", nsegs);
		txq->txq_coalesced += nsegs;
		wrp = (struct work_request_hdr *)txd;
		/* One header flit plus two flits per batch entry. */
		flits = nsegs*2 + 1;

		for (fidx = 1, i = 0; i < nsegs; i++, fidx += 2) {
			struct cpl_tx_pkt_batch_entry *cbe;
			uint64_t flit;
			uint32_t *hflit = (uint32_t *)&flit;
			/* NOTE(review): flags and VLAN tag come from m0 for every entry. */
			int cflags = m0->m_pkthdr.csum_flags;

			cntrl = V_TXPKT_INTF(pi->txpkt_intf);
			GET_VTAG(cntrl, m0);
			cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
			if (__predict_false(!(cflags & CSUM_IP)))
				cntrl |= F_TXPKT_IPCSUM_DIS;
			if (__predict_false(!(cflags & (CSUM_TCP | CSUM_UDP))))
				cntrl |= F_TXPKT_L4CSUM_DIS;

			hflit[0] = htonl(cntrl);
			hflit[1] = htonl(segs[i].ds_len | 0x80000000);
			/* hflit aliases flit, so this OR is visible in the copies below. */
			flit |= htobe64(1 << 24);
			cbe = &cpl_batch->pkt_entry[i];
			cbe->cntrl = hflit[0];
			cbe->len = hflit[1];
			cbe->addr = htobe64(segs[i].ds_addr);
		}

		wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
		    V_WR_SGLSFLT(flits)) |
		    htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
		wr_lo = htonl(V_WR_LEN(flits) |
		    V_WR_GEN(txqs.gen)) | htonl(V_WR_TID(txq->token));
		set_wr_hdr(wrp, wr_hi, wr_lo);
		wmb();
		ETHER_BPF_MTAP(pi->ifp, m0);
		wr_gen2(txd, txqs.gen);
		check_ring_tx_db(sc, txq, 0);
		return (0);
	} else if (tso_info) {
		int eth_type;
		struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *)txd;
		struct ether_header *eh;
		struct ip *ip;
		struct tcphdr *tcp;

		txd->flit[2] = 0;
		GET_VTAG(cntrl, m0);
		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO);
		hdr->cntrl = htonl(cntrl);
		hdr->len = htonl(mlen | 0x80000000);

		if (__predict_false(mlen < TCPPKTHDRSIZE)) {
			printf("mbuf=%p,len=%d,tso_segsz=%d,csum_flags=%#x,flags=%#x",
			    m0, mlen, m0->m_pkthdr.tso_segsz,
			    m0->m_pkthdr.csum_flags, m0->m_flags);
			panic("tx tso packet too small");
		}

		/* Make sure that ether, ip, tcp headers are all in m0 */
		if (__predict_false(m0->m_len < TCPPKTHDRSIZE)) {
			/*
			 * NOTE(review): m0 is reassigned here after it was
			 * stored in txsd->m and DMA-mapped above -- confirm
			 * that is safe.
			 */
			m0 = m_pullup(m0, TCPPKTHDRSIZE);
			if (__predict_false(m0 == NULL)) {
				/* XXX panic probably an overreaction */
				panic("couldn't fit header into mbuf");
			}
		}

		/* Locate the IP header, allowing for an in-frame VLAN header. */
		eh = mtod(m0, struct ether_header *);
		if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
			eth_type = CPL_ETH_II_VLAN;
			ip = (struct ip *)((struct ether_vlan_header *)eh + 1);
		} else {
			eth_type = CPL_ETH_II;
			ip = (struct ip *)(eh + 1);
		}
		/* The TCP header is taken to follow a struct ip directly. */
		tcp = (struct tcphdr *)(ip + 1);

		tso_info |= V_LSO_ETH_TYPE(eth_type) |
			    V_LSO_IPHDR_WORDS(ip->ip_hl) |
			    V_LSO_TCPHDR_WORDS(tcp->th_off);
		hdr->lso_info = htonl(tso_info);

		if (__predict_false(mlen <= PIO_LEN)) {
			/*
			 * pkt not undersized but fits in PIO_LEN
			 * Indicates a TSO bug at the higher levels.
			 */
			/* Data is copied after the 3 header flits; mbuf freed below. */
			txsd->m = NULL;
			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[3]);
			flits = (mlen + 7) / 8 + 3;
			wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
					  V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
					  F_WR_SOP | F_WR_EOP | txqs.compl);
			wr_lo = htonl(V_WR_LEN(flits) |
			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
			set_wr_hdr(&hdr->wr, wr_hi, wr_lo);
			wmb();
			ETHER_BPF_MTAP(pi->ifp, m0);
			wr_gen2(txd, txqs.gen);
			check_ring_tx_db(sc, txq, 0);
			m_freem(m0);
			return (0);
		}
		/* The SGL starts after the 3 LSO header flits. */
		flits = 3;
	} else {
		struct cpl_tx_pkt *cpl = (struct cpl_tx_pkt *)txd;

		GET_VTAG(cntrl, m0);
		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
		if (__predict_false(!(m0->m_pkthdr.csum_flags & CSUM_IP)))
			cntrl |= F_TXPKT_IPCSUM_DIS;
		if (__predict_false(!(m0->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP))))
			cntrl |= F_TXPKT_L4CSUM_DIS;
		cpl->cntrl = htonl(cntrl);
		cpl->len = htonl(mlen | 0x80000000);

		if (mlen <= PIO_LEN) {
			/* Data is copied after the 2 header flits; mbuf freed below. */
			txsd->m = NULL;
			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[2]);
			flits = (mlen + 7) / 8 + 2;

			wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
			    V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
					  F_WR_SOP | F_WR_EOP | txqs.compl);
			wr_lo = htonl(V_WR_LEN(flits) |
			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
			set_wr_hdr(&cpl->wr, wr_hi, wr_lo);
			wmb();
			ETHER_BPF_MTAP(pi->ifp, m0);
			wr_gen2(txd, txqs.gen);
			check_ring_tx_db(sc, txq, 0);
			m_freem(m0);
			return (0);
		}
		/* The SGL starts after the 2 header flits. */
		flits = 2;
	}
	/*
	 * Scatter/gather path.  With a single descriptor the SGL is built
	 * in place; otherwise it is built in the queue's scratch SGL and
	 * copied out by write_wr_hdr_sgl().
	 */
	wrp = (struct work_request_hdr *)txd;
	sgp = (ndesc == 1) ? (struct sg_ent *)&txd->flit[flits] : sgl;
	make_sgl(sgp, segs, nsegs);

	sgl_flits = sgl_len(nsegs);

	ETHER_BPF_MTAP(pi->ifp, m0);

	KASSERT(ndesc <= 4, ("ndesc too large %d", ndesc));
	wr_hi = htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
	wr_lo = htonl(V_WR_TID(txq->token));
	write_wr_hdr_sgl(ndesc, txd, &txqs, txq, sgl, flits,
	    sgl_flits, wr_hi, wr_lo);
	check_ring_tx_db(sc, txq, 0);

	return (0);
}
1609 
1610 void
1611 cxgb_tx_watchdog(void *arg)
1612 {
1613 	struct sge_qset *qs = arg;
1614 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1615 
1616         if (qs->coalescing != 0 &&
1617 	    (txq->in_use <= cxgb_tx_coalesce_enable_stop) &&
1618 	    TXQ_RING_EMPTY(qs))
1619                 qs->coalescing = 0;
1620         else if (qs->coalescing == 0 &&
1621 	    (txq->in_use >= cxgb_tx_coalesce_enable_start))
1622                 qs->coalescing = 1;
1623 	if (TXQ_TRYLOCK(qs)) {
1624 		qs->qs_flags |= QS_FLUSHING;
1625 		cxgb_start_locked(qs);
1626 		qs->qs_flags &= ~QS_FLUSHING;
1627 		TXQ_UNLOCK(qs);
1628 	}
1629 	if (qs->port->ifp->if_drv_flags & IFF_DRV_RUNNING)
1630 		callout_reset_on(&txq->txq_watchdog, hz/4, cxgb_tx_watchdog,
1631 		    qs, txq->txq_watchdog.c_cpu);
1632 }
1633 
1634 static void
1635 cxgb_tx_timeout(void *arg)
1636 {
1637 	struct sge_qset *qs = arg;
1638 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1639 
1640 	if (qs->coalescing == 0 && (txq->in_use >= (txq->size>>3)))
1641                 qs->coalescing = 1;
1642 	if (TXQ_TRYLOCK(qs)) {
1643 		qs->qs_flags |= QS_TIMEOUT;
1644 		cxgb_start_locked(qs);
1645 		qs->qs_flags &= ~QS_TIMEOUT;
1646 		TXQ_UNLOCK(qs);
1647 	}
1648 }
1649 
1650 static void
1651 cxgb_start_locked(struct sge_qset *qs)
1652 {
1653 	struct mbuf *m_head = NULL;
1654 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1655 	struct port_info *pi = qs->port;
1656 	struct ifnet *ifp = pi->ifp;
1657 
1658 	if (qs->qs_flags & (QS_FLUSHING|QS_TIMEOUT))
1659 		reclaim_completed_tx(qs, 0, TXQ_ETH);
1660 
1661 	if (!pi->link_config.link_ok) {
1662 		TXQ_RING_FLUSH(qs);
1663 		return;
1664 	}
1665 	TXQ_LOCK_ASSERT(qs);
1666 	while (!TXQ_RING_EMPTY(qs) && (ifp->if_drv_flags & IFF_DRV_RUNNING) &&
1667 	    pi->link_config.link_ok) {
1668 		reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH);
1669 
1670 		if (txq->size - txq->in_use <= TX_MAX_DESC)
1671 			break;
1672 
1673 		if ((m_head = cxgb_dequeue(qs)) == NULL)
1674 			break;
1675 		/*
1676 		 *  Encapsulation can modify our pointer, and or make it
1677 		 *  NULL on failure.  In that event, we can't requeue.
1678 		 */
1679 		if (t3_encap(qs, &m_head) || m_head == NULL)
1680 			break;
1681 
1682 		m_head = NULL;
1683 	}
1684 
1685 	if (txq->db_pending)
1686 		check_ring_tx_db(pi->adapter, txq, 1);
1687 
1688 	if (!TXQ_RING_EMPTY(qs) && callout_pending(&txq->txq_timer) == 0 &&
1689 	    pi->link_config.link_ok)
1690 		callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout,
1691 		    qs, txq->txq_timer.c_cpu);
1692 	if (m_head != NULL)
1693 		m_freem(m_head);
1694 }
1695 
1696 static int
1697 cxgb_transmit_locked(struct ifnet *ifp, struct sge_qset *qs, struct mbuf *m)
1698 {
1699 	struct port_info *pi = qs->port;
1700 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1701 	struct buf_ring *br = txq->txq_mr;
1702 	int error, avail;
1703 
1704 	avail = txq->size - txq->in_use;
1705 	TXQ_LOCK_ASSERT(qs);
1706 
1707 	/*
1708 	 * We can only do a direct transmit if the following are true:
1709 	 * - we aren't coalescing (ring < 3/4 full)
1710 	 * - the link is up -- checked in caller
1711 	 * - there are no packets enqueued already
1712 	 * - there is space in hardware transmit queue
1713 	 */
1714 	if (check_pkt_coalesce(qs) == 0 &&
1715 	    !TXQ_RING_NEEDS_ENQUEUE(qs) && avail > TX_MAX_DESC) {
1716 		if (t3_encap(qs, &m)) {
1717 			if (m != NULL &&
1718 			    (error = drbr_enqueue(ifp, br, m)) != 0)
1719 				return (error);
1720 		} else {
1721 			if (txq->db_pending)
1722 				check_ring_tx_db(pi->adapter, txq, 1);
1723 
1724 			/*
1725 			 * We've bypassed the buf ring so we need to update
1726 			 * the stats directly
1727 			 */
1728 			txq->txq_direct_packets++;
1729 			txq->txq_direct_bytes += m->m_pkthdr.len;
1730 		}
1731 	} else if ((error = drbr_enqueue(ifp, br, m)) != 0)
1732 		return (error);
1733 
1734 	reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH);
1735 	if (!TXQ_RING_EMPTY(qs) && pi->link_config.link_ok &&
1736 	    (!check_pkt_coalesce(qs) || (drbr_inuse(ifp, br) >= 7)))
1737 		cxgb_start_locked(qs);
1738 	else if (!TXQ_RING_EMPTY(qs) && !callout_pending(&txq->txq_timer))
1739 		callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout,
1740 		    qs, txq->txq_timer.c_cpu);
1741 	return (0);
1742 }
1743 
1744 int
1745 cxgb_transmit(struct ifnet *ifp, struct mbuf *m)
1746 {
1747 	struct sge_qset *qs;
1748 	struct port_info *pi = ifp->if_softc;
1749 	int error, qidx = pi->first_qset;
1750 
1751 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0
1752 	    ||(!pi->link_config.link_ok)) {
1753 		m_freem(m);
1754 		return (0);
1755 	}
1756 
1757 	if (m->m_flags & M_FLOWID)
1758 		qidx = (m->m_pkthdr.flowid % pi->nqsets) + pi->first_qset;
1759 
1760 	qs = &pi->adapter->sge.qs[qidx];
1761 
1762 	if (TXQ_TRYLOCK(qs)) {
1763 		/* XXX running */
1764 		error = cxgb_transmit_locked(ifp, qs, m);
1765 		TXQ_UNLOCK(qs);
1766 	} else
1767 		error = drbr_enqueue(ifp, qs->txq[TXQ_ETH].txq_mr, m);
1768 	return (error);
1769 }
1770 void
1771 cxgb_start(struct ifnet *ifp)
1772 {
1773 	struct port_info *pi = ifp->if_softc;
1774 	struct sge_qset *qs = &pi->adapter->sge.qs[pi->first_qset];
1775 
1776 	if (!pi->link_config.link_ok)
1777 		return;
1778 
1779 	TXQ_LOCK(qs);
1780 	cxgb_start_locked(qs);
1781 	TXQ_UNLOCK(qs);
1782 }
1783 
/*
 * Queue-flush entry point for the interface.  Intended to flush mbufs
 * held in the buf_rings and in the transmit queues; currently it does
 * nothing.
 */
void
cxgb_qflush(struct ifnet *ifp)
{

	/* no-op for now */
}
1794 
1795 /**
1796  *	write_imm - write a packet into a Tx descriptor as immediate data
1797  *	@d: the Tx descriptor to write
1798  *	@m: the packet
1799  *	@len: the length of packet data to write as immediate data
1800  *	@gen: the generation bit value to write
1801  *
1802  *	Writes a packet as immediate data into a Tx descriptor.  The packet
1803  *	contains a work request at its beginning.  We must write the packet
1804  *	carefully so the SGE doesn't read accidentally before it's written in
1805  *	its entirety.
1806  */
1807 static __inline void
1808 write_imm(struct tx_desc *d, struct mbuf *m,
1809 	  unsigned int len, unsigned int gen)
1810 {
1811 	struct work_request_hdr *from = mtod(m, struct work_request_hdr *);
1812 	struct work_request_hdr *to = (struct work_request_hdr *)d;
1813 	uint32_t wr_hi, wr_lo;
1814 
1815 	if (len > WR_LEN)
1816 		panic("len too big %d\n", len);
1817 	if (len < sizeof(*from))
1818 		panic("len too small %d", len);
1819 
1820 	memcpy(&to[1], &from[1], len - sizeof(*from));
1821 	wr_hi = from->wrh_hi | htonl(F_WR_SOP | F_WR_EOP |
1822 					V_WR_BCNTLFLT(len & 7));
1823 	wr_lo = from->wrh_lo | htonl(V_WR_GEN(gen) |
1824 					V_WR_LEN((len + 7) / 8));
1825 	set_wr_hdr(to, wr_hi, wr_lo);
1826 	wmb();
1827 	wr_gen2(d, gen);
1828 
1829 	/*
1830 	 * This check is a hack we should really fix the logic so
1831 	 * that this can't happen
1832 	 */
1833 	if (m->m_type != MT_DONTFREE)
1834 		m_freem(m);
1835 
1836 }
1837 
1838 /**
1839  *	check_desc_avail - check descriptor availability on a send queue
1840  *	@adap: the adapter
1841  *	@q: the TX queue
1842  *	@m: the packet needing the descriptors
1843  *	@ndesc: the number of Tx descriptors needed
1844  *	@qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL)
1845  *
1846  *	Checks if the requested number of Tx descriptors is available on an
1847  *	SGE send queue.  If the queue is already suspended or not enough
1848  *	descriptors are available the packet is queued for later transmission.
1849  *	Must be called with the Tx queue locked.
1850  *
1851  *	Returns 0 if enough descriptors are available, 1 if there aren't
1852  *	enough descriptors and the packet has been queued, and 2 if the caller
1853  *	needs to retry because there weren't enough descriptors at the
1854  *	beginning of the call but some freed up in the mean time.
1855  */
1856 static __inline int
1857 check_desc_avail(adapter_t *adap, struct sge_txq *q,
1858 		 struct mbuf *m, unsigned int ndesc,
1859 		 unsigned int qid)
1860 {
1861 	/*
1862 	 * XXX We currently only use this for checking the control queue
1863 	 * the control queue is only used for binding qsets which happens
1864 	 * at init time so we are guaranteed enough descriptors
1865 	 */
1866 	if (__predict_false(!mbufq_empty(&q->sendq))) {
1867 addq_exit:	mbufq_tail(&q->sendq, m);
1868 		return 1;
1869 	}
1870 	if (__predict_false(q->size - q->in_use < ndesc)) {
1871 
1872 		struct sge_qset *qs = txq_to_qset(q, qid);
1873 
1874 		setbit(&qs->txq_stopped, qid);
1875 		if (should_restart_tx(q) &&
1876 		    test_and_clear_bit(qid, &qs->txq_stopped))
1877 			return 2;
1878 
1879 		q->stops++;
1880 		goto addq_exit;
1881 	}
1882 	return 0;
1883 }
1884 
1885 
1886 /**
1887  *	reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
1888  *	@q: the SGE control Tx queue
1889  *
1890  *	This is a variant of reclaim_completed_tx() that is used for Tx queues
1891  *	that send only immediate data (presently just the control queues) and
1892  *	thus do not have any mbufs
1893  */
1894 static __inline void
1895 reclaim_completed_tx_imm(struct sge_txq *q)
1896 {
1897 	unsigned int reclaim = q->processed - q->cleaned;
1898 
1899 	q->in_use -= reclaim;
1900 	q->cleaned += reclaim;
1901 }
1902 
1903 static __inline int
1904 immediate(const struct mbuf *m)
1905 {
1906 	return m->m_len <= WR_LEN  && m->m_pkthdr.len <= WR_LEN ;
1907 }
1908 
1909 /**
1910  *	ctrl_xmit - send a packet through an SGE control Tx queue
1911  *	@adap: the adapter
1912  *	@q: the control queue
1913  *	@m: the packet
1914  *
1915  *	Send a packet through an SGE control Tx queue.  Packets sent through
1916  *	a control queue must fit entirely as immediate data in a single Tx
1917  *	descriptor and have no page fragments.
1918  */
1919 static int
1920 ctrl_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m)
1921 {
1922 	int ret;
1923 	struct work_request_hdr *wrp = mtod(m, struct work_request_hdr *);
1924 	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1925 
1926 	if (__predict_false(!immediate(m))) {
1927 		m_freem(m);
1928 		return 0;
1929 	}
1930 
1931 	wrp->wrh_hi |= htonl(F_WR_SOP | F_WR_EOP);
1932 	wrp->wrh_lo = htonl(V_WR_TID(q->token));
1933 
1934 	TXQ_LOCK(qs);
1935 again:	reclaim_completed_tx_imm(q);
1936 
1937 	ret = check_desc_avail(adap, q, m, 1, TXQ_CTRL);
1938 	if (__predict_false(ret)) {
1939 		if (ret == 1) {
1940 			TXQ_UNLOCK(qs);
1941 			return (ENOSPC);
1942 		}
1943 		goto again;
1944 	}
1945 	write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
1946 
1947 	q->in_use++;
1948 	if (++q->pidx >= q->size) {
1949 		q->pidx = 0;
1950 		q->gen ^= 1;
1951 	}
1952 	TXQ_UNLOCK(qs);
1953 	wmb();
1954 	t3_write_reg(adap, A_SG_KDOORBELL,
1955 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1956 	return (0);
1957 }
1958 
1959 
1960 /**
1961  *	restart_ctrlq - restart a suspended control queue
1962  *	@qs: the queue set cotaining the control queue
1963  *
1964  *	Resumes transmission on a suspended Tx control queue.
1965  */
1966 static void
1967 restart_ctrlq(void *data, int npending)
1968 {
1969 	struct mbuf *m;
1970 	struct sge_qset *qs = (struct sge_qset *)data;
1971 	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1972 	adapter_t *adap = qs->port->adapter;
1973 
1974 	TXQ_LOCK(qs);
1975 again:	reclaim_completed_tx_imm(q);
1976 
1977 	while (q->in_use < q->size &&
1978 	       (m = mbufq_dequeue(&q->sendq)) != NULL) {
1979 
1980 		write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
1981 
1982 		if (++q->pidx >= q->size) {
1983 			q->pidx = 0;
1984 			q->gen ^= 1;
1985 		}
1986 		q->in_use++;
1987 	}
1988 	if (!mbufq_empty(&q->sendq)) {
1989 		setbit(&qs->txq_stopped, TXQ_CTRL);
1990 
1991 		if (should_restart_tx(q) &&
1992 		    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped))
1993 			goto again;
1994 		q->stops++;
1995 	}
1996 	TXQ_UNLOCK(qs);
1997 	t3_write_reg(adap, A_SG_KDOORBELL,
1998 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1999 }
2000 
2001 
2002 /*
2003  * Send a management message through control queue 0
2004  */
2005 int
2006 t3_mgmt_tx(struct adapter *adap, struct mbuf *m)
2007 {
2008 	return ctrl_xmit(adap, &adap->sge.qs[0], m);
2009 }
2010 
2011 /**
2012  *	free_qset - free the resources of an SGE queue set
2013  *	@sc: the controller owning the queue set
2014  *	@q: the queue set
2015  *
2016  *	Release the HW and SW resources associated with an SGE queue set, such
2017  *	as HW contexts, packet buffers, and descriptor rings.  Traffic to the
2018  *	queue set must be quiesced prior to calling this.
2019  */
2020 static void
2021 t3_free_qset(adapter_t *sc, struct sge_qset *q)
2022 {
2023 	int i;
2024 
2025 	reclaim_completed_tx(q, 0, TXQ_ETH);
2026 	if (q->txq[TXQ_ETH].txq_mr != NULL)
2027 		buf_ring_free(q->txq[TXQ_ETH].txq_mr, M_DEVBUF);
2028 	if (q->txq[TXQ_ETH].txq_ifq != NULL) {
2029 		ifq_delete(q->txq[TXQ_ETH].txq_ifq);
2030 		free(q->txq[TXQ_ETH].txq_ifq, M_DEVBUF);
2031 	}
2032 
2033 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
2034 		if (q->fl[i].desc) {
2035 			mtx_lock_spin(&sc->sge.reg_lock);
2036 			t3_sge_disable_fl(sc, q->fl[i].cntxt_id);
2037 			mtx_unlock_spin(&sc->sge.reg_lock);
2038 			bus_dmamap_unload(q->fl[i].desc_tag, q->fl[i].desc_map);
2039 			bus_dmamem_free(q->fl[i].desc_tag, q->fl[i].desc,
2040 					q->fl[i].desc_map);
2041 			bus_dma_tag_destroy(q->fl[i].desc_tag);
2042 			bus_dma_tag_destroy(q->fl[i].entry_tag);
2043 		}
2044 		if (q->fl[i].sdesc) {
2045 			free_rx_bufs(sc, &q->fl[i]);
2046 			free(q->fl[i].sdesc, M_DEVBUF);
2047 		}
2048 	}
2049 
2050 	mtx_unlock(&q->lock);
2051 	MTX_DESTROY(&q->lock);
2052 	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
2053 		if (q->txq[i].desc) {
2054 			mtx_lock_spin(&sc->sge.reg_lock);
2055 			t3_sge_enable_ecntxt(sc, q->txq[i].cntxt_id, 0);
2056 			mtx_unlock_spin(&sc->sge.reg_lock);
2057 			bus_dmamap_unload(q->txq[i].desc_tag,
2058 					q->txq[i].desc_map);
2059 			bus_dmamem_free(q->txq[i].desc_tag, q->txq[i].desc,
2060 					q->txq[i].desc_map);
2061 			bus_dma_tag_destroy(q->txq[i].desc_tag);
2062 			bus_dma_tag_destroy(q->txq[i].entry_tag);
2063 		}
2064 		if (q->txq[i].sdesc) {
2065 			free(q->txq[i].sdesc, M_DEVBUF);
2066 		}
2067 	}
2068 
2069 	if (q->rspq.desc) {
2070 		mtx_lock_spin(&sc->sge.reg_lock);
2071 		t3_sge_disable_rspcntxt(sc, q->rspq.cntxt_id);
2072 		mtx_unlock_spin(&sc->sge.reg_lock);
2073 
2074 		bus_dmamap_unload(q->rspq.desc_tag, q->rspq.desc_map);
2075 		bus_dmamem_free(q->rspq.desc_tag, q->rspq.desc,
2076 			        q->rspq.desc_map);
2077 		bus_dma_tag_destroy(q->rspq.desc_tag);
2078 		MTX_DESTROY(&q->rspq.lock);
2079 	}
2080 
2081 #ifdef INET
2082 	tcp_lro_free(&q->lro.ctrl);
2083 #endif
2084 
2085 	bzero(q, sizeof(*q));
2086 }
2087 
2088 /**
2089  *	t3_free_sge_resources - free SGE resources
2090  *	@sc: the adapter softc
2091  *
2092  *	Frees resources used by the SGE queue sets.
2093  */
2094 void
2095 t3_free_sge_resources(adapter_t *sc, int nqsets)
2096 {
2097 	int i;
2098 
2099 	for (i = 0; i < nqsets; ++i) {
2100 		TXQ_LOCK(&sc->sge.qs[i]);
2101 		t3_free_qset(sc, &sc->sge.qs[i]);
2102 	}
2103 }
2104 
2105 /**
2106  *	t3_sge_start - enable SGE
2107  *	@sc: the controller softc
2108  *
2109  *	Enables the SGE for DMAs.  This is the last step in starting packet
2110  *	transfers.
2111  */
2112 void
2113 t3_sge_start(adapter_t *sc)
2114 {
2115 	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE);
2116 }
2117 
2118 /**
2119  *	t3_sge_stop - disable SGE operation
2120  *	@sc: the adapter
2121  *
2122  *	Disables the DMA engine.  This can be called in emeregencies (e.g.,
2123  *	from error interrupts) or from normal process context.  In the latter
2124  *	case it also disables any pending queue restart tasklets.  Note that
2125  *	if it is called in interrupt context it cannot disable the restart
2126  *	tasklets as it cannot wait, however the tasklets will have no effect
2127  *	since the doorbells are disabled and the driver will call this again
2128  *	later from process context, at which time the tasklets will be stopped
2129  *	if they are still running.
2130  */
2131 void
2132 t3_sge_stop(adapter_t *sc)
2133 {
2134 	int i, nqsets;
2135 
2136 	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, 0);
2137 
2138 	if (sc->tq == NULL)
2139 		return;
2140 
2141 	for (nqsets = i = 0; i < (sc)->params.nports; i++)
2142 		nqsets += sc->port[i].nqsets;
2143 #ifdef notyet
2144 	/*
2145 	 *
2146 	 * XXX
2147 	 */
2148 	for (i = 0; i < nqsets; ++i) {
2149 		struct sge_qset *qs = &sc->sge.qs[i];
2150 
2151 		taskqueue_drain(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2152 		taskqueue_drain(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2153 	}
2154 #endif
2155 }
2156 
2157 /**
2158  *	t3_free_tx_desc - reclaims Tx descriptors and their buffers
2159  *	@adapter: the adapter
2160  *	@q: the Tx queue to reclaim descriptors from
2161  *	@reclaimable: the number of descriptors to reclaim
2162  *      @m_vec_size: maximum number of buffers to reclaim
2163  *      @desc_reclaimed: returns the number of descriptors reclaimed
2164  *
2165  *	Reclaims Tx descriptors from an SGE Tx queue and frees the associated
2166  *	Tx buffers.  Called with the Tx queue lock held.
2167  *
2168  *      Returns number of buffers of reclaimed
2169  */
2170 void
2171 t3_free_tx_desc(struct sge_qset *qs, int reclaimable, int queue)
2172 {
2173 	struct tx_sw_desc *txsd;
2174 	unsigned int cidx, mask;
2175 	struct sge_txq *q = &qs->txq[queue];
2176 
2177 #ifdef T3_TRACE
2178 	T3_TRACE2(sc->tb[q->cntxt_id & 7],
2179 		  "reclaiming %u Tx descriptors at cidx %u", reclaimable, cidx);
2180 #endif
2181 	cidx = q->cidx;
2182 	mask = q->size - 1;
2183 	txsd = &q->sdesc[cidx];
2184 
2185 	mtx_assert(&qs->lock, MA_OWNED);
2186 	while (reclaimable--) {
2187 		prefetch(q->sdesc[(cidx + 1) & mask].m);
2188 		prefetch(q->sdesc[(cidx + 2) & mask].m);
2189 
2190 		if (txsd->m != NULL) {
2191 			if (txsd->flags & TX_SW_DESC_MAPPED) {
2192 				bus_dmamap_unload(q->entry_tag, txsd->map);
2193 				txsd->flags &= ~TX_SW_DESC_MAPPED;
2194 			}
2195 			m_freem_list(txsd->m);
2196 			txsd->m = NULL;
2197 		} else
2198 			q->txq_skipped++;
2199 
2200 		++txsd;
2201 		if (++cidx == q->size) {
2202 			cidx = 0;
2203 			txsd = q->sdesc;
2204 		}
2205 	}
2206 	q->cidx = cidx;
2207 
2208 }
2209 
2210 /**
2211  *	is_new_response - check if a response is newly written
2212  *	@r: the response descriptor
2213  *	@q: the response queue
2214  *
2215  *	Returns true if a response descriptor contains a yet unprocessed
2216  *	response.
2217  */
2218 static __inline int
2219 is_new_response(const struct rsp_desc *r,
2220     const struct sge_rspq *q)
2221 {
2222 	return (r->intr_gen & F_RSPD_GEN2) == q->gen;
2223 }
2224 
/* GTS indication bits for Tx queues 0 and 1 in a response's flags word. */
#define RSPD_GTS_MASK	(F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS)

/*
 * Every control bit a response can carry: the GTS indications above plus the
 * completion-credit fields of Tx queues 0, 1 and 2.
 */
#define RSPD_CTRL_MASK							\
	(RSPD_GTS_MASK |						\
	 V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) |				\
	 V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) |				\
	 V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR))

/* How long to delay the next interrupt in case of memory shortage, in 0.1us. */
#define NOMEM_INTR_DELAY	2500
2233 
2234 /**
2235  *	write_ofld_wr - write an offload work request
2236  *	@adap: the adapter
2237  *	@m: the packet to send
2238  *	@q: the Tx queue
2239  *	@pidx: index of the first Tx descriptor to write
2240  *	@gen: the generation value to use
2241  *	@ndesc: number of descriptors the packet will occupy
2242  *
2243  *	Write an offload work request to send the supplied packet.  The packet
2244  *	data already carry the work request with most fields populated.
2245  */
2246 static void
2247 write_ofld_wr(adapter_t *adap, struct mbuf *m,
2248     struct sge_txq *q, unsigned int pidx,
2249     unsigned int gen, unsigned int ndesc,
2250     bus_dma_segment_t *segs, unsigned int nsegs)
2251 {
2252 	unsigned int sgl_flits, flits;
2253 	struct work_request_hdr *from;
2254 	struct sg_ent *sgp, sgl[TX_MAX_SEGS / 2 + 1];
2255 	struct tx_desc *d = &q->desc[pidx];
2256 	struct txq_state txqs;
2257 
2258 	if (immediate(m) && nsegs == 0) {
2259 		write_imm(d, m, m->m_len, gen);
2260 		return;
2261 	}
2262 
2263 	/* Only TX_DATA builds SGLs */
2264 	from = mtod(m, struct work_request_hdr *);
2265 	memcpy(&d->flit[1], &from[1], m->m_len - sizeof(*from));
2266 
2267 	flits = m->m_len / 8;
2268 	sgp = (ndesc == 1) ? (struct sg_ent *)&d->flit[flits] : sgl;
2269 
2270 	make_sgl(sgp, segs, nsegs);
2271 	sgl_flits = sgl_len(nsegs);
2272 
2273 	txqs.gen = gen;
2274 	txqs.pidx = pidx;
2275 	txqs.compl = 0;
2276 
2277 	write_wr_hdr_sgl(ndesc, d, &txqs, q, sgl, flits, sgl_flits,
2278 	    from->wrh_hi, from->wrh_lo);
2279 }
2280 
2281 /**
2282  *	calc_tx_descs_ofld - calculate # of Tx descriptors for an offload packet
2283  *	@m: the packet
2284  *
2285  * 	Returns the number of Tx descriptors needed for the given offload
2286  * 	packet.  These packets are already fully constructed.
2287  */
2288 static __inline unsigned int
2289 calc_tx_descs_ofld(struct mbuf *m, unsigned int nsegs)
2290 {
2291 	unsigned int flits, cnt = 0;
2292 	int ndescs;
2293 
2294 	if (m->m_len <= WR_LEN && nsegs == 0)
2295 		return (1);                 /* packet fits as immediate data */
2296 
2297 	/*
2298 	 * This needs to be re-visited for TOE
2299 	 */
2300 
2301 	cnt = nsegs;
2302 
2303 	/* headers */
2304 	flits = m->m_len / 8;
2305 
2306 	ndescs = flits_to_desc(flits + sgl_len(cnt));
2307 
2308 	return (ndescs);
2309 }
2310 
2311 /**
2312  *	ofld_xmit - send a packet through an offload queue
2313  *	@adap: the adapter
2314  *	@q: the Tx offload queue
2315  *	@m: the packet
2316  *
2317  *	Send an offload packet through an SGE offload queue.
2318  */
2319 static int
2320 ofld_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m)
2321 {
2322 	int ret, nsegs;
2323 	unsigned int ndesc;
2324 	unsigned int pidx, gen;
2325 	struct sge_txq *q = &qs->txq[TXQ_OFLD];
2326 	bus_dma_segment_t segs[TX_MAX_SEGS], *vsegs;
2327 	struct tx_sw_desc *stx;
2328 
2329 	nsegs = m_get_sgllen(m);
2330 	vsegs = m_get_sgl(m);
2331 	ndesc = calc_tx_descs_ofld(m, nsegs);
2332 	busdma_map_sgl(vsegs, segs, nsegs);
2333 
2334 	stx = &q->sdesc[q->pidx];
2335 
2336 	TXQ_LOCK(qs);
2337 again:	reclaim_completed_tx(qs, 16, TXQ_OFLD);
2338 	ret = check_desc_avail(adap, q, m, ndesc, TXQ_OFLD);
2339 	if (__predict_false(ret)) {
2340 		if (ret == 1) {
2341 			printf("no ofld desc avail\n");
2342 
2343 			m_set_priority(m, ndesc);     /* save for restart */
2344 			TXQ_UNLOCK(qs);
2345 			return (EINTR);
2346 		}
2347 		goto again;
2348 	}
2349 
2350 	gen = q->gen;
2351 	q->in_use += ndesc;
2352 	pidx = q->pidx;
2353 	q->pidx += ndesc;
2354 	if (q->pidx >= q->size) {
2355 		q->pidx -= q->size;
2356 		q->gen ^= 1;
2357 	}
2358 #ifdef T3_TRACE
2359 	T3_TRACE5(adap->tb[q->cntxt_id & 7],
2360 		  "ofld_xmit: ndesc %u, pidx %u, len %u, main %u, frags %u",
2361 		  ndesc, pidx, skb->len, skb->len - skb->data_len,
2362 		  skb_shinfo(skb)->nr_frags);
2363 #endif
2364 	TXQ_UNLOCK(qs);
2365 
2366 	write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
2367 	check_ring_tx_db(adap, q, 1);
2368 	return (0);
2369 }
2370 
2371 /**
2372  *	restart_offloadq - restart a suspended offload queue
2373  *	@qs: the queue set cotaining the offload queue
2374  *
2375  *	Resumes transmission on a suspended Tx offload queue.
2376  */
2377 static void
2378 restart_offloadq(void *data, int npending)
2379 {
2380 	struct mbuf *m;
2381 	struct sge_qset *qs = data;
2382 	struct sge_txq *q = &qs->txq[TXQ_OFLD];
2383 	adapter_t *adap = qs->port->adapter;
2384 	bus_dma_segment_t segs[TX_MAX_SEGS];
2385 	struct tx_sw_desc *stx = &q->sdesc[q->pidx];
2386 	int nsegs, cleaned;
2387 
2388 	TXQ_LOCK(qs);
2389 again:	cleaned = reclaim_completed_tx(qs, 16, TXQ_OFLD);
2390 
2391 	while ((m = mbufq_peek(&q->sendq)) != NULL) {
2392 		unsigned int gen, pidx;
2393 		unsigned int ndesc = m_get_priority(m);
2394 
2395 		if (__predict_false(q->size - q->in_use < ndesc)) {
2396 			setbit(&qs->txq_stopped, TXQ_OFLD);
2397 			if (should_restart_tx(q) &&
2398 			    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped))
2399 				goto again;
2400 			q->stops++;
2401 			break;
2402 		}
2403 
2404 		gen = q->gen;
2405 		q->in_use += ndesc;
2406 		pidx = q->pidx;
2407 		q->pidx += ndesc;
2408 		if (q->pidx >= q->size) {
2409 			q->pidx -= q->size;
2410 			q->gen ^= 1;
2411 		}
2412 
2413 		(void)mbufq_dequeue(&q->sendq);
2414 		busdma_map_mbufs(&m, q, stx, segs, &nsegs);
2415 		TXQ_UNLOCK(qs);
2416 		write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
2417 		TXQ_LOCK(qs);
2418 	}
2419 #if USE_GTS
2420 	set_bit(TXQ_RUNNING, &q->flags);
2421 	set_bit(TXQ_LAST_PKT_DB, &q->flags);
2422 #endif
2423 	TXQ_UNLOCK(qs);
2424 	wmb();
2425 	t3_write_reg(adap, A_SG_KDOORBELL,
2426 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
2427 }
2428 
/**
 *	queue_set - return the queue set a packet should use
 *	@m: the packet
 *
 *	The packet's priority value carries a control flag in bit 0; the
 *	remaining bits, shifted down by one, give the index of the SGE queue
 *	set to use.
 */
static __inline int
queue_set(const struct mbuf *m)
{
	int qset_idx = m_get_priority(m) >> 1;

	return (qset_idx);
}
2441 
/**
 *	is_ctrl_pkt - return whether an offload packet is a control packet
 *	@m: the packet
 *
 *	Returns 1 when bit 0 of the packet's priority is set, meaning the
 *	packet should go out on a CTRL Tx queue, and 0 when it should use an
 *	OFLD Tx queue.
 */
static __inline int
is_ctrl_pkt(const struct mbuf *m)
{

	return ((m_get_priority(m) & 1) != 0 ? 1 : 0);
}
2454 
2455 /**
2456  *	t3_offload_tx - send an offload packet
2457  *	@tdev: the offload device to send to
2458  *	@m: the packet
2459  *
2460  *	Sends an offload packet.  We use the packet priority to select the
2461  *	appropriate Tx queue as follows: bit 0 indicates whether the packet
2462  *	should be sent as regular or control, bits 1-3 select the queue set.
2463  */
2464 int
2465 t3_offload_tx(struct t3cdev *tdev, struct mbuf *m)
2466 {
2467 	adapter_t *adap = tdev2adap(tdev);
2468 	struct sge_qset *qs = &adap->sge.qs[queue_set(m)];
2469 
2470 	if (__predict_false(is_ctrl_pkt(m)))
2471 		return ctrl_xmit(adap, qs, m);
2472 
2473 	return ofld_xmit(adap, qs, m);
2474 }
2475 
2476 /**
2477  *	deliver_partial_bundle - deliver a (partial) bundle of Rx offload pkts
2478  *	@tdev: the offload device that will be receiving the packets
2479  *	@q: the SGE response queue that assembled the bundle
2480  *	@m: the partial bundle
2481  *	@n: the number of packets in the bundle
2482  *
2483  *	Delivers a (partial) bundle of Rx offload packets to an offload device.
2484  */
2485 static __inline void
2486 deliver_partial_bundle(struct t3cdev *tdev,
2487 			struct sge_rspq *q,
2488 			struct mbuf *mbufs[], int n)
2489 {
2490 	if (n) {
2491 		q->offload_bundles++;
2492 		cxgb_ofld_recv(tdev, mbufs, n);
2493 	}
2494 }
2495 
2496 static __inline int
2497 rx_offload(struct t3cdev *tdev, struct sge_rspq *rq,
2498     struct mbuf *m, struct mbuf *rx_gather[],
2499     unsigned int gather_idx)
2500 {
2501 
2502 	rq->offload_pkts++;
2503 	m->m_pkthdr.header = mtod(m, void *);
2504 	rx_gather[gather_idx++] = m;
2505 	if (gather_idx == RX_BUNDLE_SIZE) {
2506 		cxgb_ofld_recv(tdev, rx_gather, RX_BUNDLE_SIZE);
2507 		gather_idx = 0;
2508 		rq->offload_bundles++;
2509 	}
2510 	return (gather_idx);
2511 }
2512 
2513 static void
2514 restart_tx(struct sge_qset *qs)
2515 {
2516 	struct adapter *sc = qs->port->adapter;
2517 
2518 
2519 	if (isset(&qs->txq_stopped, TXQ_OFLD) &&
2520 	    should_restart_tx(&qs->txq[TXQ_OFLD]) &&
2521 	    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) {
2522 		qs->txq[TXQ_OFLD].restarts++;
2523 		DPRINTF("restarting TXQ_OFLD\n");
2524 		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2525 	}
2526 	DPRINTF("stopped=0x%x restart=%d processed=%d cleaned=%d in_use=%d\n",
2527 	    qs->txq_stopped, should_restart_tx(&qs->txq[TXQ_CTRL]),
2528 	    qs->txq[TXQ_CTRL].processed, qs->txq[TXQ_CTRL].cleaned,
2529 	    qs->txq[TXQ_CTRL].in_use);
2530 
2531 	if (isset(&qs->txq_stopped, TXQ_CTRL) &&
2532 	    should_restart_tx(&qs->txq[TXQ_CTRL]) &&
2533 	    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) {
2534 		qs->txq[TXQ_CTRL].restarts++;
2535 		DPRINTF("restarting TXQ_CTRL\n");
2536 		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2537 	}
2538 }
2539 
2540 /**
2541  *	t3_sge_alloc_qset - initialize an SGE queue set
2542  *	@sc: the controller softc
2543  *	@id: the queue set id
2544  *	@nports: how many Ethernet ports will be using this queue set
2545  *	@irq_vec_idx: the IRQ vector index for response queue interrupts
2546  *	@p: configuration parameters for this queue set
2547  *	@ntxq: number of Tx queues for the queue set
2548  *	@pi: port info for queue set
2549  *
2550  *	Allocate resources and initialize an SGE queue set.  A queue set
2551  *	comprises a response queue, two Rx free-buffer queues, and up to 3
2552  *	Tx queues.  The Tx queues are assigned roles in the order Ethernet
2553  *	queue, offload queue, and control queue.
2554  */
2555 int
2556 t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx,
2557 		  const struct qset_params *p, int ntxq, struct port_info *pi)
2558 {
2559 	struct sge_qset *q = &sc->sge.qs[id];
2560 	int i, ret = 0;
2561 
2562 	MTX_INIT(&q->lock, q->namebuf, NULL, MTX_DEF);
2563 	q->port = pi;
2564 
2565 	if ((q->txq[TXQ_ETH].txq_mr = buf_ring_alloc(cxgb_txq_buf_ring_size,
2566 	    M_DEVBUF, M_WAITOK, &q->lock)) == NULL) {
2567 		device_printf(sc->dev, "failed to allocate mbuf ring\n");
2568 		goto err;
2569 	}
2570 	if ((q->txq[TXQ_ETH].txq_ifq = malloc(sizeof(struct ifaltq), M_DEVBUF,
2571 	    M_NOWAIT | M_ZERO)) == NULL) {
2572 		device_printf(sc->dev, "failed to allocate ifq\n");
2573 		goto err;
2574 	}
2575 	ifq_init(q->txq[TXQ_ETH].txq_ifq, pi->ifp);
2576 	callout_init(&q->txq[TXQ_ETH].txq_timer, 1);
2577 	callout_init(&q->txq[TXQ_ETH].txq_watchdog, 1);
2578 	q->txq[TXQ_ETH].txq_timer.c_cpu = id % mp_ncpus;
2579 	q->txq[TXQ_ETH].txq_watchdog.c_cpu = id % mp_ncpus;
2580 
2581 	init_qset_cntxt(q, id);
2582 	q->idx = id;
2583 	if ((ret = alloc_ring(sc, p->fl_size, sizeof(struct rx_desc),
2584 		    sizeof(struct rx_sw_desc), &q->fl[0].phys_addr,
2585 		    &q->fl[0].desc, &q->fl[0].sdesc,
2586 		    &q->fl[0].desc_tag, &q->fl[0].desc_map,
2587 		    sc->rx_dmat, &q->fl[0].entry_tag)) != 0) {
2588 		printf("error %d from alloc ring fl0\n", ret);
2589 		goto err;
2590 	}
2591 
2592 	if ((ret = alloc_ring(sc, p->jumbo_size, sizeof(struct rx_desc),
2593 		    sizeof(struct rx_sw_desc), &q->fl[1].phys_addr,
2594 		    &q->fl[1].desc, &q->fl[1].sdesc,
2595 		    &q->fl[1].desc_tag, &q->fl[1].desc_map,
2596 		    sc->rx_jumbo_dmat, &q->fl[1].entry_tag)) != 0) {
2597 		printf("error %d from alloc ring fl1\n", ret);
2598 		goto err;
2599 	}
2600 
2601 	if ((ret = alloc_ring(sc, p->rspq_size, sizeof(struct rsp_desc), 0,
2602 		    &q->rspq.phys_addr, &q->rspq.desc, NULL,
2603 		    &q->rspq.desc_tag, &q->rspq.desc_map,
2604 		    NULL, NULL)) != 0) {
2605 		printf("error %d from alloc ring rspq\n", ret);
2606 		goto err;
2607 	}
2608 
2609 	for (i = 0; i < ntxq; ++i) {
2610 		size_t sz = i == TXQ_CTRL ? 0 : sizeof(struct tx_sw_desc);
2611 
2612 		if ((ret = alloc_ring(sc, p->txq_size[i],
2613 			    sizeof(struct tx_desc), sz,
2614 			    &q->txq[i].phys_addr, &q->txq[i].desc,
2615 			    &q->txq[i].sdesc, &q->txq[i].desc_tag,
2616 			    &q->txq[i].desc_map,
2617 			    sc->tx_dmat, &q->txq[i].entry_tag)) != 0) {
2618 			printf("error %d from alloc ring tx %i\n", ret, i);
2619 			goto err;
2620 		}
2621 		mbufq_init(&q->txq[i].sendq);
2622 		q->txq[i].gen = 1;
2623 		q->txq[i].size = p->txq_size[i];
2624 	}
2625 
2626 	TASK_INIT(&q->txq[TXQ_OFLD].qresume_task, 0, restart_offloadq, q);
2627 	TASK_INIT(&q->txq[TXQ_CTRL].qresume_task, 0, restart_ctrlq, q);
2628 	TASK_INIT(&q->txq[TXQ_ETH].qreclaim_task, 0, sge_txq_reclaim_handler, q);
2629 	TASK_INIT(&q->txq[TXQ_OFLD].qreclaim_task, 0, sge_txq_reclaim_handler, q);
2630 
2631 	q->fl[0].gen = q->fl[1].gen = 1;
2632 	q->fl[0].size = p->fl_size;
2633 	q->fl[1].size = p->jumbo_size;
2634 
2635 	q->rspq.gen = 1;
2636 	q->rspq.cidx = 0;
2637 	q->rspq.size = p->rspq_size;
2638 
2639 	q->txq[TXQ_ETH].stop_thres = nports *
2640 	    flits_to_desc(sgl_len(TX_MAX_SEGS + 1) + 3);
2641 
2642 	q->fl[0].buf_size = MCLBYTES;
2643 	q->fl[0].zone = zone_pack;
2644 	q->fl[0].type = EXT_PACKET;
2645 
2646 	if (p->jumbo_buf_size ==  MJUM16BYTES) {
2647 		q->fl[1].zone = zone_jumbo16;
2648 		q->fl[1].type = EXT_JUMBO16;
2649 	} else if (p->jumbo_buf_size ==  MJUM9BYTES) {
2650 		q->fl[1].zone = zone_jumbo9;
2651 		q->fl[1].type = EXT_JUMBO9;
2652 	} else if (p->jumbo_buf_size ==  MJUMPAGESIZE) {
2653 		q->fl[1].zone = zone_jumbop;
2654 		q->fl[1].type = EXT_JUMBOP;
2655 	} else {
2656 		KASSERT(0, ("can't deal with jumbo_buf_size %d.", p->jumbo_buf_size));
2657 		ret = EDOOFUS;
2658 		goto err;
2659 	}
2660 	q->fl[1].buf_size = p->jumbo_buf_size;
2661 
2662 	/* Allocate and setup the lro_ctrl structure */
2663 	q->lro.enabled = !!(pi->ifp->if_capenable & IFCAP_LRO);
2664 #ifdef INET
2665 	ret = tcp_lro_init(&q->lro.ctrl);
2666 	if (ret) {
2667 		printf("error %d from tcp_lro_init\n", ret);
2668 		goto err;
2669 	}
2670 #endif
2671 	q->lro.ctrl.ifp = pi->ifp;
2672 
2673 	mtx_lock_spin(&sc->sge.reg_lock);
2674 	ret = -t3_sge_init_rspcntxt(sc, q->rspq.cntxt_id, irq_vec_idx,
2675 				   q->rspq.phys_addr, q->rspq.size,
2676 				   q->fl[0].buf_size, 1, 0);
2677 	if (ret) {
2678 		printf("error %d from t3_sge_init_rspcntxt\n", ret);
2679 		goto err_unlock;
2680 	}
2681 
2682 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
2683 		ret = -t3_sge_init_flcntxt(sc, q->fl[i].cntxt_id, 0,
2684 					  q->fl[i].phys_addr, q->fl[i].size,
2685 					  q->fl[i].buf_size, p->cong_thres, 1,
2686 					  0);
2687 		if (ret) {
2688 			printf("error %d from t3_sge_init_flcntxt for index i=%d\n", ret, i);
2689 			goto err_unlock;
2690 		}
2691 	}
2692 
2693 	ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_ETH].cntxt_id, USE_GTS,
2694 				 SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr,
2695 				 q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token,
2696 				 1, 0);
2697 	if (ret) {
2698 		printf("error %d from t3_sge_init_ecntxt\n", ret);
2699 		goto err_unlock;
2700 	}
2701 
2702 	if (ntxq > 1) {
2703 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_OFLD].cntxt_id,
2704 					 USE_GTS, SGE_CNTXT_OFLD, id,
2705 					 q->txq[TXQ_OFLD].phys_addr,
2706 					 q->txq[TXQ_OFLD].size, 0, 1, 0);
2707 		if (ret) {
2708 			printf("error %d from t3_sge_init_ecntxt\n", ret);
2709 			goto err_unlock;
2710 		}
2711 	}
2712 
2713 	if (ntxq > 2) {
2714 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_CTRL].cntxt_id, 0,
2715 					 SGE_CNTXT_CTRL, id,
2716 					 q->txq[TXQ_CTRL].phys_addr,
2717 					 q->txq[TXQ_CTRL].size,
2718 					 q->txq[TXQ_CTRL].token, 1, 0);
2719 		if (ret) {
2720 			printf("error %d from t3_sge_init_ecntxt\n", ret);
2721 			goto err_unlock;
2722 		}
2723 	}
2724 
2725 	snprintf(q->rspq.lockbuf, RSPQ_NAME_LEN, "t3 rspq lock %d:%d",
2726 	    device_get_unit(sc->dev), irq_vec_idx);
2727 	MTX_INIT(&q->rspq.lock, q->rspq.lockbuf, NULL, MTX_DEF);
2728 
2729 	mtx_unlock_spin(&sc->sge.reg_lock);
2730 	t3_update_qset_coalesce(q, p);
2731 	q->port = pi;
2732 
2733 	refill_fl(sc, &q->fl[0], q->fl[0].size);
2734 	refill_fl(sc, &q->fl[1], q->fl[1].size);
2735 	refill_rspq(sc, &q->rspq, q->rspq.size - 1);
2736 
2737 	t3_write_reg(sc, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) |
2738 		     V_NEWTIMER(q->rspq.holdoff_tmr));
2739 
2740 	return (0);
2741 
2742 err_unlock:
2743 	mtx_unlock_spin(&sc->sge.reg_lock);
2744 err:
2745 	TXQ_LOCK(q);
2746 	t3_free_qset(sc, q);
2747 
2748 	return (ret);
2749 }
2750 
2751 /*
2752  * Remove CPL_RX_PKT headers from the mbuf and reduce it to a regular mbuf with
2753  * ethernet data.  Hardware assistance with various checksums and any vlan tag
2754  * will also be taken into account here.
2755  */
2756 void
2757 t3_rx_eth(struct adapter *adap, struct sge_rspq *rq, struct mbuf *m, int ethpad)
2758 {
2759 	struct cpl_rx_pkt *cpl = (struct cpl_rx_pkt *)(mtod(m, uint8_t *) + ethpad);
2760 	struct port_info *pi = &adap->port[adap->rxpkt_map[cpl->iff]];
2761 	struct ifnet *ifp = pi->ifp;
2762 
2763 	DPRINTF("rx_eth m=%p m->m_data=%p p->iff=%d\n", m, mtod(m, uint8_t *), cpl->iff);
2764 
2765 	if ((ifp->if_capenable & IFCAP_RXCSUM) && !cpl->fragment &&
2766 	    cpl->csum_valid && cpl->csum == 0xffff) {
2767 		m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED|CSUM_IP_VALID);
2768 		rspq_to_qset(rq)->port_stats[SGE_PSTAT_RX_CSUM_GOOD]++;
2769 		m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED|CSUM_IP_VALID|CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
2770 		m->m_pkthdr.csum_data = 0xffff;
2771 	}
2772 
2773 	if (cpl->vlan_valid) {
2774 		m->m_pkthdr.ether_vtag = ntohs(cpl->vlan);
2775 		m->m_flags |= M_VLANTAG;
2776 	}
2777 
2778 	m->m_pkthdr.rcvif = ifp;
2779 	m->m_pkthdr.header = mtod(m, uint8_t *) + sizeof(*cpl) + ethpad;
2780 	/*
2781 	 * adjust after conversion to mbuf chain
2782 	 */
2783 	m->m_pkthdr.len -= (sizeof(*cpl) + ethpad);
2784 	m->m_len -= (sizeof(*cpl) + ethpad);
2785 	m->m_data += (sizeof(*cpl) + ethpad);
2786 }
2787 
2788 /**
2789  *	get_packet - return the next ingress packet buffer from a free list
2790  *	@adap: the adapter that received the packet
2791  *	@drop_thres: # of remaining buffers before we start dropping packets
2792  *	@qs: the qset that the SGE free list holding the packet belongs to
2793  *      @mh: the mbuf header, contains a pointer to the head and tail of the mbuf chain
2794  *      @r: response descriptor
2795  *
2796  *	Get the next packet from a free list and complete setup of the
2797  *	sk_buff.  If the packet is small we make a copy and recycle the
2798  *	original buffer, otherwise we use the original buffer itself.  If a
2799  *	positive drop threshold is supplied packets are dropped and their
2800  *	buffers recycled if (a) the number of remaining buffers is under the
2801  *	threshold and the packet is too big to copy, or (b) the packet should
2802  *	be copied but there is no memory for the copy.
2803  */
2804 static int
2805 get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
2806     struct t3_mbuf_hdr *mh, struct rsp_desc *r)
2807 {
2808 
2809 	unsigned int len_cq =  ntohl(r->len_cq);
2810 	struct sge_fl *fl = (len_cq & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
2811 	int mask, cidx = fl->cidx;
2812 	struct rx_sw_desc *sd = &fl->sdesc[cidx];
2813 	uint32_t len = G_RSPD_LEN(len_cq);
2814 	uint32_t flags = M_EXT;
2815 	uint8_t sopeop = G_RSPD_SOP_EOP(ntohl(r->flags));
2816 	caddr_t cl;
2817 	struct mbuf *m;
2818 	int ret = 0;
2819 
2820 	mask = fl->size - 1;
2821 	prefetch(fl->sdesc[(cidx + 1) & mask].m);
2822 	prefetch(fl->sdesc[(cidx + 2) & mask].m);
2823 	prefetch(fl->sdesc[(cidx + 1) & mask].rxsd_cl);
2824 	prefetch(fl->sdesc[(cidx + 2) & mask].rxsd_cl);
2825 
2826 	fl->credits--;
2827 	bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD);
2828 
2829 	if (recycle_enable && len <= SGE_RX_COPY_THRES &&
2830 	    sopeop == RSPQ_SOP_EOP) {
2831 		if ((m = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL)
2832 			goto skip_recycle;
2833 		cl = mtod(m, void *);
2834 		memcpy(cl, sd->rxsd_cl, len);
2835 		recycle_rx_buf(adap, fl, fl->cidx);
2836 		m->m_pkthdr.len = m->m_len = len;
2837 		m->m_flags = 0;
2838 		mh->mh_head = mh->mh_tail = m;
2839 		ret = 1;
2840 		goto done;
2841 	} else {
2842 	skip_recycle:
2843 		bus_dmamap_unload(fl->entry_tag, sd->map);
2844 		cl = sd->rxsd_cl;
2845 		m = sd->m;
2846 
2847 		if ((sopeop == RSPQ_SOP_EOP) ||
2848 		    (sopeop == RSPQ_SOP))
2849 			flags |= M_PKTHDR;
2850 		m_init(m, fl->zone, fl->buf_size, M_NOWAIT, MT_DATA, flags);
2851 		if (fl->zone == zone_pack) {
2852 			/*
2853 			 * restore clobbered data pointer
2854 			 */
2855 			m->m_data = m->m_ext.ext_buf;
2856 		} else {
2857 			m_cljset(m, cl, fl->type);
2858 		}
2859 		m->m_len = len;
2860 	}
2861 	switch(sopeop) {
2862 	case RSPQ_SOP_EOP:
2863 		ret = 1;
2864 		/* FALLTHROUGH */
2865 	case RSPQ_SOP:
2866 		mh->mh_head = mh->mh_tail = m;
2867 		m->m_pkthdr.len = len;
2868 		break;
2869 	case RSPQ_EOP:
2870 		ret = 1;
2871 		/* FALLTHROUGH */
2872 	case RSPQ_NSOP_NEOP:
2873 		if (mh->mh_tail == NULL) {
2874 			log(LOG_ERR, "discarding intermediate descriptor entry\n");
2875 			m_freem(m);
2876 			break;
2877 		}
2878 		mh->mh_tail->m_next = m;
2879 		mh->mh_tail = m;
2880 		mh->mh_head->m_pkthdr.len += len;
2881 		break;
2882 	}
2883 	if (cxgb_debug)
2884 		printf("len=%d pktlen=%d\n", m->m_len, m->m_pkthdr.len);
2885 done:
2886 	if (++fl->cidx == fl->size)
2887 		fl->cidx = 0;
2888 
2889 	return (ret);
2890 }
2891 
2892 /**
2893  *	handle_rsp_cntrl_info - handles control information in a response
2894  *	@qs: the queue set corresponding to the response
2895  *	@flags: the response control flags
2896  *
2897  *	Handles the control information of an SGE response, such as GTS
2898  *	indications and completion credits for the queue set's Tx queues.
2899  *	HW coalesces credits, we don't do any extra SW coalescing.
2900  */
2901 static __inline void
2902 handle_rsp_cntrl_info(struct sge_qset *qs, uint32_t flags)
2903 {
2904 	unsigned int credits;
2905 
2906 #if USE_GTS
2907 	if (flags & F_RSPD_TXQ0_GTS)
2908 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags);
2909 #endif
2910 	credits = G_RSPD_TXQ0_CR(flags);
2911 	if (credits)
2912 		qs->txq[TXQ_ETH].processed += credits;
2913 
2914 	credits = G_RSPD_TXQ2_CR(flags);
2915 	if (credits)
2916 		qs->txq[TXQ_CTRL].processed += credits;
2917 
2918 # if USE_GTS
2919 	if (flags & F_RSPD_TXQ1_GTS)
2920 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags);
2921 # endif
2922 	credits = G_RSPD_TXQ1_CR(flags);
2923 	if (credits)
2924 		qs->txq[TXQ_OFLD].processed += credits;
2925 
2926 }
2927 
2928 static void
2929 check_ring_db(adapter_t *adap, struct sge_qset *qs,
2930     unsigned int sleeping)
2931 {
2932 	;
2933 }
2934 
2935 /**
2936  *	process_responses - process responses from an SGE response queue
2937  *	@adap: the adapter
2938  *	@qs: the queue set to which the response queue belongs
2939  *	@budget: how many responses can be processed in this round
2940  *
2941  *	Process responses from an SGE response queue up to the supplied budget.
2942  *	Responses include received packets as well as credits and other events
2943  *	for the queues that belong to the response queue's queue set.
2944  *	A negative budget is effectively unlimited.
2945  *
2946  *	Additionally choose the interrupt holdoff time for the next interrupt
2947  *	on this queue.  If the system is under memory shortage use a fairly
2948  *	long delay to help recovery.
2949  */
2950 static int
2951 process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
2952 {
2953 	struct sge_rspq *rspq = &qs->rspq;
2954 	struct rsp_desc *r = &rspq->desc[rspq->cidx];
2955 	int budget_left = budget;
2956 	unsigned int sleeping = 0;
2957 	int lro_enabled = qs->lro.enabled;
2958 	int skip_lro;
2959 	struct lro_ctrl *lro_ctrl = &qs->lro.ctrl;
2960 	struct mbuf *offload_mbufs[RX_BUNDLE_SIZE];
2961 	int ngathered = 0;
2962 	struct t3_mbuf_hdr *mh = &rspq->rspq_mh;
2963 #ifdef DEBUG
2964 	static int last_holdoff = 0;
2965 	if (cxgb_debug && rspq->holdoff_tmr != last_holdoff) {
2966 		printf("next_holdoff=%d\n", rspq->holdoff_tmr);
2967 		last_holdoff = rspq->holdoff_tmr;
2968 	}
2969 #endif
2970 	rspq->next_holdoff = rspq->holdoff_tmr;
2971 
2972 	while (__predict_true(budget_left && is_new_response(r, rspq))) {
2973 		int eth, eop = 0, ethpad = 0;
2974 		uint32_t flags = ntohl(r->flags);
2975 		uint32_t rss_csum = *(const uint32_t *)r;
2976 		uint32_t rss_hash = be32toh(r->rss_hdr.rss_hash_val);
2977 
2978 		eth = (r->rss_hdr.opcode == CPL_RX_PKT);
2979 
2980 		if (__predict_false(flags & F_RSPD_ASYNC_NOTIF)) {
2981 			struct mbuf *m;
2982 
2983 			if (cxgb_debug)
2984 				printf("async notification\n");
2985 
2986 			if (mh->mh_head == NULL) {
2987 				mh->mh_head = m_gethdr(M_DONTWAIT, MT_DATA);
2988 				m = mh->mh_head;
2989 			} else {
2990 				m = m_gethdr(M_DONTWAIT, MT_DATA);
2991 			}
2992 			if (m == NULL)
2993 				goto no_mem;
2994 
2995                         memcpy(mtod(m, char *), r, AN_PKT_SIZE);
2996 			m->m_len = m->m_pkthdr.len = AN_PKT_SIZE;
2997                         *mtod(m, char *) = CPL_ASYNC_NOTIF;
2998 			rss_csum = htonl(CPL_ASYNC_NOTIF << 24);
2999 			eop = 1;
3000                         rspq->async_notif++;
3001 			goto skip;
3002 		} else if  (flags & F_RSPD_IMM_DATA_VALID) {
3003 			struct mbuf *m = NULL;
3004 
3005 			DPRINTF("IMM DATA VALID opcode=0x%x rspq->cidx=%d\n",
3006 			    r->rss_hdr.opcode, rspq->cidx);
3007 			if (mh->mh_head == NULL)
3008 				mh->mh_head = m_gethdr(M_DONTWAIT, MT_DATA);
3009                         else
3010 				m = m_gethdr(M_DONTWAIT, MT_DATA);
3011 
3012 			if (mh->mh_head == NULL &&  m == NULL) {
3013 		no_mem:
3014 				rspq->next_holdoff = NOMEM_INTR_DELAY;
3015 				budget_left--;
3016 				break;
3017 			}
3018 			get_imm_packet(adap, r, mh->mh_head);
3019 			eop = 1;
3020 			rspq->imm_data++;
3021 		} else if (r->len_cq) {
3022 			int drop_thresh = eth ? SGE_RX_DROP_THRES : 0;
3023 
3024 			eop = get_packet(adap, drop_thresh, qs, mh, r);
3025 			if (eop) {
3026 				if (r->rss_hdr.hash_type && !adap->timestamp)
3027 					mh->mh_head->m_flags |= M_FLOWID;
3028 				mh->mh_head->m_pkthdr.flowid = rss_hash;
3029 			}
3030 
3031 			ethpad = 2;
3032 		} else {
3033 			rspq->pure_rsps++;
3034 		}
3035 	skip:
3036 		if (flags & RSPD_CTRL_MASK) {
3037 			sleeping |= flags & RSPD_GTS_MASK;
3038 			handle_rsp_cntrl_info(qs, flags);
3039 		}
3040 
3041 		r++;
3042 		if (__predict_false(++rspq->cidx == rspq->size)) {
3043 			rspq->cidx = 0;
3044 			rspq->gen ^= 1;
3045 			r = rspq->desc;
3046 		}
3047 
3048 		if (++rspq->credits >= 64) {
3049 			refill_rspq(adap, rspq, rspq->credits);
3050 			rspq->credits = 0;
3051 		}
3052 		if (!eth && eop) {
3053 			mh->mh_head->m_pkthdr.csum_data = rss_csum;
3054 			/*
3055 			 * XXX size mismatch
3056 			 */
3057 			m_set_priority(mh->mh_head, rss_hash);
3058 
3059 
3060 			ngathered = rx_offload(&adap->tdev, rspq,
3061 			    mh->mh_head, offload_mbufs, ngathered);
3062 			mh->mh_head = NULL;
3063 			DPRINTF("received offload packet\n");
3064 
3065 		} else if (eth && eop) {
3066 			struct mbuf *m = mh->mh_head;
3067 
3068 			t3_rx_eth(adap, rspq, m, ethpad);
3069 
3070 			/*
3071 			 * The T304 sends incoming packets on any qset.  If LRO
3072 			 * is also enabled, we could end up sending packet up
3073 			 * lro_ctrl->ifp's input.  That is incorrect.
3074 			 *
3075 			 * The mbuf's rcvif was derived from the cpl header and
3076 			 * is accurate.  Skip LRO and just use that.
3077 			 */
3078 			skip_lro = __predict_false(qs->port->ifp != m->m_pkthdr.rcvif);
3079 
3080 			if (lro_enabled && lro_ctrl->lro_cnt && !skip_lro
3081 #ifdef INET
3082 			    && (tcp_lro_rx(lro_ctrl, m, 0) == 0)
3083 #endif
3084 			    ) {
3085 				/* successfully queue'd for LRO */
3086 			} else {
3087 				/*
3088 				 * LRO not enabled, packet unsuitable for LRO,
3089 				 * or unable to queue.  Pass it up right now in
3090 				 * either case.
3091 				 */
3092 				struct ifnet *ifp = m->m_pkthdr.rcvif;
3093 				(*ifp->if_input)(ifp, m);
3094 			}
3095 			mh->mh_head = NULL;
3096 
3097 		}
3098 		__refill_fl_lt(adap, &qs->fl[0], 32);
3099 		__refill_fl_lt(adap, &qs->fl[1], 32);
3100 		--budget_left;
3101 	}
3102 
3103 	deliver_partial_bundle(&adap->tdev, rspq, offload_mbufs, ngathered);
3104 
3105 #ifdef INET
3106 	/* Flush LRO */
3107 	while (!SLIST_EMPTY(&lro_ctrl->lro_active)) {
3108 		struct lro_entry *queued = SLIST_FIRST(&lro_ctrl->lro_active);
3109 		SLIST_REMOVE_HEAD(&lro_ctrl->lro_active, next);
3110 		tcp_lro_flush(lro_ctrl, queued);
3111 	}
3112 #endif
3113 
3114 	if (sleeping)
3115 		check_ring_db(adap, qs, sleeping);
3116 
3117 	mb();  /* commit Tx queue processed updates */
3118 	if (__predict_false(qs->txq_stopped > 1))
3119 		restart_tx(qs);
3120 
3121 	__refill_fl_lt(adap, &qs->fl[0], 512);
3122 	__refill_fl_lt(adap, &qs->fl[1], 512);
3123 	budget -= budget_left;
3124 	return (budget);
3125 }
3126 
3127 /*
3128  * A helper function that processes responses and issues GTS.
3129  */
3130 static __inline int
3131 process_responses_gts(adapter_t *adap, struct sge_rspq *rq)
3132 {
3133 	int work;
3134 	static int last_holdoff = 0;
3135 
3136 	work = process_responses(adap, rspq_to_qset(rq), -1);
3137 
3138 	if (cxgb_debug && (rq->next_holdoff != last_holdoff)) {
3139 		printf("next_holdoff=%d\n", rq->next_holdoff);
3140 		last_holdoff = rq->next_holdoff;
3141 	}
3142 	t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) |
3143 	    V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx));
3144 
3145 	return (work);
3146 }
3147 
3148 
3149 /*
3150  * Interrupt handler for legacy INTx interrupts for T3B-based cards.
3151  * Handles data events from SGE response queues as well as error and other
3152  * async events as they all use the same interrupt pin.  We use one SGE
3153  * response queue per port in this mode and protect all response queues with
3154  * queue 0's lock.
3155  */
3156 void
3157 t3b_intr(void *data)
3158 {
3159 	uint32_t i, map;
3160 	adapter_t *adap = data;
3161 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
3162 
3163 	t3_write_reg(adap, A_PL_CLI, 0);
3164 	map = t3_read_reg(adap, A_SG_DATA_INTR);
3165 
3166 	if (!map)
3167 		return;
3168 
3169 	if (__predict_false(map & F_ERRINTR)) {
3170 		t3_write_reg(adap, A_PL_INT_ENABLE0, 0);
3171 		(void) t3_read_reg(adap, A_PL_INT_ENABLE0);
3172 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
3173 	}
3174 
3175 	mtx_lock(&q0->lock);
3176 	for_each_port(adap, i)
3177 	    if (map & (1 << i))
3178 			process_responses_gts(adap, &adap->sge.qs[i].rspq);
3179 	mtx_unlock(&q0->lock);
3180 }
3181 
3182 /*
3183  * The MSI interrupt handler.  This needs to handle data events from SGE
3184  * response queues as well as error and other async events as they all use
3185  * the same MSI vector.  We use one SGE response queue per port in this mode
3186  * and protect all response queues with queue 0's lock.
3187  */
3188 void
3189 t3_intr_msi(void *data)
3190 {
3191 	adapter_t *adap = data;
3192 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
3193 	int i, new_packets = 0;
3194 
3195 	mtx_lock(&q0->lock);
3196 
3197 	for_each_port(adap, i)
3198 	    if (process_responses_gts(adap, &adap->sge.qs[i].rspq))
3199 		    new_packets = 1;
3200 	mtx_unlock(&q0->lock);
3201 	if (new_packets == 0) {
3202 		t3_write_reg(adap, A_PL_INT_ENABLE0, 0);
3203 		(void) t3_read_reg(adap, A_PL_INT_ENABLE0);
3204 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
3205 	}
3206 }
3207 
3208 void
3209 t3_intr_msix(void *data)
3210 {
3211 	struct sge_qset *qs = data;
3212 	adapter_t *adap = qs->port->adapter;
3213 	struct sge_rspq *rspq = &qs->rspq;
3214 
3215 	if (process_responses_gts(adap, rspq) == 0)
3216 		rspq->unhandled_irqs++;
3217 }
3218 
3219 #define QDUMP_SBUF_SIZE		32 * 400
3220 static int
3221 t3_dump_rspq(SYSCTL_HANDLER_ARGS)
3222 {
3223 	struct sge_rspq *rspq;
3224 	struct sge_qset *qs;
3225 	int i, err, dump_end, idx;
3226 	struct sbuf *sb;
3227 	struct rsp_desc *rspd;
3228 	uint32_t data[4];
3229 
3230 	rspq = arg1;
3231 	qs = rspq_to_qset(rspq);
3232 	if (rspq->rspq_dump_count == 0)
3233 		return (0);
3234 	if (rspq->rspq_dump_count > RSPQ_Q_SIZE) {
3235 		log(LOG_WARNING,
3236 		    "dump count is too large %d\n", rspq->rspq_dump_count);
3237 		rspq->rspq_dump_count = 0;
3238 		return (EINVAL);
3239 	}
3240 	if (rspq->rspq_dump_start > (RSPQ_Q_SIZE-1)) {
3241 		log(LOG_WARNING,
3242 		    "dump start of %d is greater than queue size\n",
3243 		    rspq->rspq_dump_start);
3244 		rspq->rspq_dump_start = 0;
3245 		return (EINVAL);
3246 	}
3247 	err = t3_sge_read_rspq(qs->port->adapter, rspq->cntxt_id, data);
3248 	if (err)
3249 		return (err);
3250 	err = sysctl_wire_old_buffer(req, 0);
3251 	if (err)
3252 		return (err);
3253 	sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req);
3254 
3255 	sbuf_printf(sb, " \n index=%u size=%u MSI-X/RspQ=%u intr enable=%u intr armed=%u\n",
3256 	    (data[0] & 0xffff), data[0] >> 16, ((data[2] >> 20) & 0x3f),
3257 	    ((data[2] >> 26) & 1), ((data[2] >> 27) & 1));
3258 	sbuf_printf(sb, " generation=%u CQ mode=%u FL threshold=%u\n",
3259 	    ((data[2] >> 28) & 1), ((data[2] >> 31) & 1), data[3]);
3260 
3261 	sbuf_printf(sb, " start=%d -> end=%d\n", rspq->rspq_dump_start,
3262 	    (rspq->rspq_dump_start + rspq->rspq_dump_count) & (RSPQ_Q_SIZE-1));
3263 
3264 	dump_end = rspq->rspq_dump_start + rspq->rspq_dump_count;
3265 	for (i = rspq->rspq_dump_start; i < dump_end; i++) {
3266 		idx = i & (RSPQ_Q_SIZE-1);
3267 
3268 		rspd = &rspq->desc[idx];
3269 		sbuf_printf(sb, "\tidx=%04d opcode=%02x cpu_idx=%x hash_type=%x cq_idx=%x\n",
3270 		    idx, rspd->rss_hdr.opcode, rspd->rss_hdr.cpu_idx,
3271 		    rspd->rss_hdr.hash_type, be16toh(rspd->rss_hdr.cq_idx));
3272 		sbuf_printf(sb, "\trss_hash_val=%x flags=%08x len_cq=%x intr_gen=%x\n",
3273 		    rspd->rss_hdr.rss_hash_val, be32toh(rspd->flags),
3274 		    be32toh(rspd->len_cq), rspd->intr_gen);
3275 	}
3276 
3277 	err = sbuf_finish(sb);
3278 	/* Output a trailing NUL. */
3279 	if (err == 0)
3280 		err = SYSCTL_OUT(req, "", 1);
3281 	sbuf_delete(sb);
3282 	return (err);
3283 }
3284 
3285 static int
3286 t3_dump_txq_eth(SYSCTL_HANDLER_ARGS)
3287 {
3288 	struct sge_txq *txq;
3289 	struct sge_qset *qs;
3290 	int i, j, err, dump_end;
3291 	struct sbuf *sb;
3292 	struct tx_desc *txd;
3293 	uint32_t *WR, wr_hi, wr_lo, gen;
3294 	uint32_t data[4];
3295 
3296 	txq = arg1;
3297 	qs = txq_to_qset(txq, TXQ_ETH);
3298 	if (txq->txq_dump_count == 0) {
3299 		return (0);
3300 	}
3301 	if (txq->txq_dump_count > TX_ETH_Q_SIZE) {
3302 		log(LOG_WARNING,
3303 		    "dump count is too large %d\n", txq->txq_dump_count);
3304 		txq->txq_dump_count = 1;
3305 		return (EINVAL);
3306 	}
3307 	if (txq->txq_dump_start > (TX_ETH_Q_SIZE-1)) {
3308 		log(LOG_WARNING,
3309 		    "dump start of %d is greater than queue size\n",
3310 		    txq->txq_dump_start);
3311 		txq->txq_dump_start = 0;
3312 		return (EINVAL);
3313 	}
3314 	err = t3_sge_read_ecntxt(qs->port->adapter, qs->rspq.cntxt_id, data);
3315 	if (err)
3316 		return (err);
3317 	err = sysctl_wire_old_buffer(req, 0);
3318 	if (err)
3319 		return (err);
3320 	sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req);
3321 
3322 	sbuf_printf(sb, " \n credits=%u GTS=%u index=%u size=%u rspq#=%u cmdq#=%u\n",
3323 	    (data[0] & 0x7fff), ((data[0] >> 15) & 1), (data[0] >> 16),
3324 	    (data[1] & 0xffff), ((data[3] >> 4) & 7), ((data[3] >> 7) & 1));
3325 	sbuf_printf(sb, " TUN=%u TOE=%u generation%u uP token=%u valid=%u\n",
3326 	    ((data[3] >> 8) & 1), ((data[3] >> 9) & 1), ((data[3] >> 10) & 1),
3327 	    ((data[3] >> 11) & 0xfffff), ((data[3] >> 31) & 1));
3328 	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
3329 	    txq->txq_dump_start,
3330 	    (txq->txq_dump_start + txq->txq_dump_count) & (TX_ETH_Q_SIZE-1));
3331 
3332 	dump_end = txq->txq_dump_start + txq->txq_dump_count;
3333 	for (i = txq->txq_dump_start; i < dump_end; i++) {
3334 		txd = &txq->desc[i & (TX_ETH_Q_SIZE-1)];
3335 		WR = (uint32_t *)txd->flit;
3336 		wr_hi = ntohl(WR[0]);
3337 		wr_lo = ntohl(WR[1]);
3338 		gen = G_WR_GEN(wr_lo);
3339 
3340 		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
3341 		    wr_hi, wr_lo, gen);
3342 		for (j = 2; j < 30; j += 4)
3343 			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
3344 			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
3345 
3346 	}
3347 	err = sbuf_finish(sb);
3348 	/* Output a trailing NUL. */
3349 	if (err == 0)
3350 		err = SYSCTL_OUT(req, "", 1);
3351 	sbuf_delete(sb);
3352 	return (err);
3353 }
3354 
3355 static int
3356 t3_dump_txq_ctrl(SYSCTL_HANDLER_ARGS)
3357 {
3358 	struct sge_txq *txq;
3359 	struct sge_qset *qs;
3360 	int i, j, err, dump_end;
3361 	struct sbuf *sb;
3362 	struct tx_desc *txd;
3363 	uint32_t *WR, wr_hi, wr_lo, gen;
3364 
3365 	txq = arg1;
3366 	qs = txq_to_qset(txq, TXQ_CTRL);
3367 	if (txq->txq_dump_count == 0) {
3368 		return (0);
3369 	}
3370 	if (txq->txq_dump_count > 256) {
3371 		log(LOG_WARNING,
3372 		    "dump count is too large %d\n", txq->txq_dump_count);
3373 		txq->txq_dump_count = 1;
3374 		return (EINVAL);
3375 	}
3376 	if (txq->txq_dump_start > 255) {
3377 		log(LOG_WARNING,
3378 		    "dump start of %d is greater than queue size\n",
3379 		    txq->txq_dump_start);
3380 		txq->txq_dump_start = 0;
3381 		return (EINVAL);
3382 	}
3383 
3384 	err = sysctl_wire_old_buffer(req, 0);
3385 	if (err != 0)
3386 		return (err);
3387 	sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req);
3388 	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
3389 	    txq->txq_dump_start,
3390 	    (txq->txq_dump_start + txq->txq_dump_count) & 255);
3391 
3392 	dump_end = txq->txq_dump_start + txq->txq_dump_count;
3393 	for (i = txq->txq_dump_start; i < dump_end; i++) {
3394 		txd = &txq->desc[i & (255)];
3395 		WR = (uint32_t *)txd->flit;
3396 		wr_hi = ntohl(WR[0]);
3397 		wr_lo = ntohl(WR[1]);
3398 		gen = G_WR_GEN(wr_lo);
3399 
3400 		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
3401 		    wr_hi, wr_lo, gen);
3402 		for (j = 2; j < 30; j += 4)
3403 			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
3404 			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
3405 
3406 	}
3407 	err = sbuf_finish(sb);
3408 	/* Output a trailing NUL. */
3409 	if (err == 0)
3410 		err = SYSCTL_OUT(req, "", 1);
3411 	sbuf_delete(sb);
3412 	return (err);
3413 }
3414 
3415 static int
3416 t3_set_coalesce_usecs(SYSCTL_HANDLER_ARGS)
3417 {
3418 	adapter_t *sc = arg1;
3419 	struct qset_params *qsp = &sc->params.sge.qset[0];
3420 	int coalesce_usecs;
3421 	struct sge_qset *qs;
3422 	int i, j, err, nqsets = 0;
3423 	struct mtx *lock;
3424 
3425 	if ((sc->flags & FULL_INIT_DONE) == 0)
3426 		return (ENXIO);
3427 
3428 	coalesce_usecs = qsp->coalesce_usecs;
3429         err = sysctl_handle_int(oidp, &coalesce_usecs, arg2, req);
3430 
3431 	if (err != 0) {
3432 		return (err);
3433 	}
3434 	if (coalesce_usecs == qsp->coalesce_usecs)
3435 		return (0);
3436 
3437 	for (i = 0; i < sc->params.nports; i++)
3438 		for (j = 0; j < sc->port[i].nqsets; j++)
3439 			nqsets++;
3440 
3441 	coalesce_usecs = max(1, coalesce_usecs);
3442 
3443 	for (i = 0; i < nqsets; i++) {
3444 		qs = &sc->sge.qs[i];
3445 		qsp = &sc->params.sge.qset[i];
3446 		qsp->coalesce_usecs = coalesce_usecs;
3447 
3448 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
3449 			    &sc->sge.qs[0].rspq.lock;
3450 
3451 		mtx_lock(lock);
3452 		t3_update_qset_coalesce(qs, qsp);
3453 		t3_write_reg(sc, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) |
3454 		    V_NEWTIMER(qs->rspq.holdoff_tmr));
3455 		mtx_unlock(lock);
3456 	}
3457 
3458 	return (0);
3459 }
3460 
3461 static int
3462 t3_pkt_timestamp(SYSCTL_HANDLER_ARGS)
3463 {
3464 	adapter_t *sc = arg1;
3465 	int rc, timestamp;
3466 
3467 	if ((sc->flags & FULL_INIT_DONE) == 0)
3468 		return (ENXIO);
3469 
3470 	timestamp = sc->timestamp;
3471 	rc = sysctl_handle_int(oidp, &timestamp, arg2, req);
3472 
3473 	if (rc != 0)
3474 		return (rc);
3475 
3476 	if (timestamp != sc->timestamp) {
3477 		t3_set_reg_field(sc, A_TP_PC_CONFIG2, F_ENABLERXPKTTMSTPRSS,
3478 		    timestamp ? F_ENABLERXPKTTMSTPRSS : 0);
3479 		sc->timestamp = timestamp;
3480 	}
3481 
3482 	return (0);
3483 }
3484 
3485 void
3486 t3_add_attach_sysctls(adapter_t *sc)
3487 {
3488 	struct sysctl_ctx_list *ctx;
3489 	struct sysctl_oid_list *children;
3490 
3491 	ctx = device_get_sysctl_ctx(sc->dev);
3492 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3493 
3494 	/* random information */
3495 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
3496 	    "firmware_version",
3497 	    CTLFLAG_RD, &sc->fw_version,
3498 	    0, "firmware version");
3499 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO,
3500 	    "hw_revision",
3501 	    CTLFLAG_RD, &sc->params.rev,
3502 	    0, "chip model");
3503 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
3504 	    "port_types",
3505 	    CTLFLAG_RD, &sc->port_types,
3506 	    0, "type of ports");
3507 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3508 	    "enable_debug",
3509 	    CTLFLAG_RW, &cxgb_debug,
3510 	    0, "enable verbose debugging output");
3511 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tunq_coalesce",
3512 	    CTLFLAG_RD, &sc->tunq_coalesce,
3513 	    "#tunneled packets freed");
3514 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3515 	    "txq_overrun",
3516 	    CTLFLAG_RD, &txq_fills,
3517 	    0, "#times txq overrun");
3518 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO,
3519 	    "core_clock",
3520 	    CTLFLAG_RD, &sc->params.vpd.cclk,
3521 	    0, "core clock frequency (in KHz)");
3522 }
3523 
3524 
/* Names used for the per-queue-set sysctl nodes. */
static const char *rspq_name = "rspq";
static const char *txq_names[] = { "txq_eth", "txq_ofld", "txq_ctrl" };
3532 
3533 static int
3534 sysctl_handle_macstat(SYSCTL_HANDLER_ARGS)
3535 {
3536 	struct port_info *p = arg1;
3537 	uint64_t *parg;
3538 
3539 	if (!p)
3540 		return (EINVAL);
3541 
3542 	parg = (uint64_t *) ((uint8_t *)&p->mac.stats + arg2);
3543 	PORT_LOCK(p);
3544 	t3_mac_update_stats(&p->mac);
3545 	PORT_UNLOCK(p);
3546 
3547 	return (sysctl_handle_64(oidp, parg, 0, req));
3548 }
3549 
3550 void
3551 t3_add_configured_sysctls(adapter_t *sc)
3552 {
3553 	struct sysctl_ctx_list *ctx;
3554 	struct sysctl_oid_list *children;
3555 	int i, j;
3556 
3557 	ctx = device_get_sysctl_ctx(sc->dev);
3558 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3559 
3560 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
3561 	    "intr_coal",
3562 	    CTLTYPE_INT|CTLFLAG_RW, sc,
3563 	    0, t3_set_coalesce_usecs,
3564 	    "I", "interrupt coalescing timer (us)");
3565 
3566 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
3567 	    "pkt_timestamp",
3568 	    CTLTYPE_INT | CTLFLAG_RW, sc,
3569 	    0, t3_pkt_timestamp,
3570 	    "I", "provide packet timestamp instead of connection hash");
3571 
3572 	for (i = 0; i < sc->params.nports; i++) {
3573 		struct port_info *pi = &sc->port[i];
3574 		struct sysctl_oid *poid;
3575 		struct sysctl_oid_list *poidlist;
3576 		struct mac_stats *mstats = &pi->mac.stats;
3577 
3578 		snprintf(pi->namebuf, PORT_NAME_LEN, "port%d", i);
3579 		poid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO,
3580 		    pi->namebuf, CTLFLAG_RD, NULL, "port statistics");
3581 		poidlist = SYSCTL_CHILDREN(poid);
3582 		SYSCTL_ADD_UINT(ctx, poidlist, OID_AUTO,
3583 		    "nqsets", CTLFLAG_RD, &pi->nqsets,
3584 		    0, "#queue sets");
3585 
3586 		for (j = 0; j < pi->nqsets; j++) {
3587 			struct sge_qset *qs = &sc->sge.qs[pi->first_qset + j];
3588 			struct sysctl_oid *qspoid, *rspqpoid, *txqpoid,
3589 					  *ctrlqpoid, *lropoid;
3590 			struct sysctl_oid_list *qspoidlist, *rspqpoidlist,
3591 					       *txqpoidlist, *ctrlqpoidlist,
3592 					       *lropoidlist;
3593 			struct sge_txq *txq = &qs->txq[TXQ_ETH];
3594 
3595 			snprintf(qs->namebuf, QS_NAME_LEN, "qs%d", j);
3596 
3597 			qspoid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO,
3598 			    qs->namebuf, CTLFLAG_RD, NULL, "qset statistics");
3599 			qspoidlist = SYSCTL_CHILDREN(qspoid);
3600 
3601 			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl0_empty",
3602 					CTLFLAG_RD, &qs->fl[0].empty, 0,
3603 					"freelist #0 empty");
3604 			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl1_empty",
3605 					CTLFLAG_RD, &qs->fl[1].empty, 0,
3606 					"freelist #1 empty");
3607 
3608 			rspqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3609 			    rspq_name, CTLFLAG_RD, NULL, "rspq statistics");
3610 			rspqpoidlist = SYSCTL_CHILDREN(rspqpoid);
3611 
3612 			txqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3613 			    txq_names[0], CTLFLAG_RD, NULL, "txq statistics");
3614 			txqpoidlist = SYSCTL_CHILDREN(txqpoid);
3615 
3616 			ctrlqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3617 			    txq_names[2], CTLFLAG_RD, NULL, "ctrlq statistics");
3618 			ctrlqpoidlist = SYSCTL_CHILDREN(ctrlqpoid);
3619 
3620 			lropoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3621 			    "lro_stats", CTLFLAG_RD, NULL, "LRO statistics");
3622 			lropoidlist = SYSCTL_CHILDREN(lropoid);
3623 
3624 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "size",
3625 			    CTLFLAG_RD, &qs->rspq.size,
3626 			    0, "#entries in response queue");
3627 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "cidx",
3628 			    CTLFLAG_RD, &qs->rspq.cidx,
3629 			    0, "consumer index");
3630 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "credits",
3631 			    CTLFLAG_RD, &qs->rspq.credits,
3632 			    0, "#credits");
3633 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "starved",
3634 			    CTLFLAG_RD, &qs->rspq.starved,
3635 			    0, "#times starved");
3636 			SYSCTL_ADD_ULONG(ctx, rspqpoidlist, OID_AUTO, "phys_addr",
3637 			    CTLFLAG_RD, &qs->rspq.phys_addr,
3638 			    "physical_address_of the queue");
3639 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_start",
3640 			    CTLFLAG_RW, &qs->rspq.rspq_dump_start,
3641 			    0, "start rspq dump entry");
3642 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_count",
3643 			    CTLFLAG_RW, &qs->rspq.rspq_dump_count,
3644 			    0, "#rspq entries to dump");
3645 			SYSCTL_ADD_PROC(ctx, rspqpoidlist, OID_AUTO, "qdump",
3646 			    CTLTYPE_STRING | CTLFLAG_RD, &qs->rspq,
3647 			    0, t3_dump_rspq, "A", "dump of the response queue");
3648 
3649 			SYSCTL_ADD_UQUAD(ctx, txqpoidlist, OID_AUTO, "dropped",
3650 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].txq_mr->br_drops,
3651 			    "#tunneled packets dropped");
3652 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "sendqlen",
3653 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].sendq.qlen,
3654 			    0, "#tunneled packets waiting to be sent");
3655 #if 0
3656 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_pidx",
3657 			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_prod,
3658 			    0, "#tunneled packets queue producer index");
3659 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_cidx",
3660 			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_cons,
3661 			    0, "#tunneled packets queue consumer index");
3662 #endif
3663 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "processed",
3664 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].processed,
3665 			    0, "#tunneled packets processed by the card");
3666 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "cleaned",
3667 			    CTLFLAG_RD, &txq->cleaned,
3668 			    0, "#tunneled packets cleaned");
3669 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "in_use",
3670 			    CTLFLAG_RD, &txq->in_use,
3671 			    0, "#tunneled packet slots in use");
3672 			SYSCTL_ADD_ULONG(ctx, txqpoidlist, OID_AUTO, "frees",
3673 			    CTLFLAG_RD, &txq->txq_frees,
3674 			    "#tunneled packets freed");
3675 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "skipped",
3676 			    CTLFLAG_RD, &txq->txq_skipped,
3677 			    0, "#tunneled packet descriptors skipped");
3678 			SYSCTL_ADD_UQUAD(ctx, txqpoidlist, OID_AUTO, "coalesced",
3679 			    CTLFLAG_RD, &txq->txq_coalesced,
3680 			    "#tunneled packets coalesced");
3681 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "enqueued",
3682 			    CTLFLAG_RD, &txq->txq_enqueued,
3683 			    0, "#tunneled packets enqueued to hardware");
3684 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "stopped_flags",
3685 			    CTLFLAG_RD, &qs->txq_stopped,
3686 			    0, "tx queues stopped");
3687 			SYSCTL_ADD_ULONG(ctx, txqpoidlist, OID_AUTO, "phys_addr",
3688 			    CTLFLAG_RD, &txq->phys_addr,
3689 			    "physical_address_of the queue");
3690 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "qgen",
3691 			    CTLFLAG_RW, &qs->txq[TXQ_ETH].gen,
3692 			    0, "txq generation");
3693 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_cidx",
3694 			    CTLFLAG_RD, &txq->cidx,
3695 			    0, "hardware queue cidx");
3696 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_pidx",
3697 			    CTLFLAG_RD, &txq->pidx,
3698 			    0, "hardware queue pidx");
3699 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_start",
3700 			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_start,
3701 			    0, "txq start idx for dump");
3702 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_count",
3703 			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_count,
3704 			    0, "txq #entries to dump");
3705 			SYSCTL_ADD_PROC(ctx, txqpoidlist, OID_AUTO, "qdump",
3706 			    CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_ETH],
3707 			    0, t3_dump_txq_eth, "A", "dump of the transmit queue");
3708 
3709 			SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_start",
3710 			    CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_start,
3711 			    0, "ctrlq start idx for dump");
3712 			SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_count",
3713 			    CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_count,
3714 			    0, "ctrl #entries to dump");
3715 			SYSCTL_ADD_PROC(ctx, ctrlqpoidlist, OID_AUTO, "qdump",
3716 			    CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_CTRL],
3717 			    0, t3_dump_txq_ctrl, "A", "dump of the transmit queue");
3718 
3719 			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_queued",
3720 			    CTLFLAG_RD, &qs->lro.ctrl.lro_queued, 0, NULL);
3721 			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_flushed",
3722 			    CTLFLAG_RD, &qs->lro.ctrl.lro_flushed, 0, NULL);
3723 			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_bad_csum",
3724 			    CTLFLAG_RD, &qs->lro.ctrl.lro_bad_csum, 0, NULL);
3725 			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_cnt",
3726 			    CTLFLAG_RD, &qs->lro.ctrl.lro_cnt, 0, NULL);
3727 		}
3728 
3729 		/* Now add a node for mac stats. */
3730 		poid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO, "mac_stats",
3731 		    CTLFLAG_RD, NULL, "MAC statistics");
3732 		poidlist = SYSCTL_CHILDREN(poid);
3733 
3734 		/*
3735 		 * We (ab)use the length argument (arg2) to pass on the offset
3736 		 * of the data that we are interested in.  This is only required
3737 		 * for the quad counters that are updated from the hardware (we
3738 		 * make sure that we return the latest value).
3739 		 * sysctl_handle_macstat first updates *all* the counters from
3740 		 * the hardware, and then returns the latest value of the
3741 		 * requested counter.  Best would be to update only the
3742 		 * requested counter from hardware, but t3_mac_update_stats()
3743 		 * hides all the register details and we don't want to dive into
3744 		 * all that here.
3745 		 */
3746 #define CXGB_SYSCTL_ADD_QUAD(a)	SYSCTL_ADD_OID(ctx, poidlist, OID_AUTO, #a, \
3747     (CTLTYPE_U64 | CTLFLAG_RD), pi, offsetof(struct mac_stats, a), \
3748     sysctl_handle_macstat, "QU", 0)
3749 		CXGB_SYSCTL_ADD_QUAD(tx_octets);
3750 		CXGB_SYSCTL_ADD_QUAD(tx_octets_bad);
3751 		CXGB_SYSCTL_ADD_QUAD(tx_frames);
3752 		CXGB_SYSCTL_ADD_QUAD(tx_mcast_frames);
3753 		CXGB_SYSCTL_ADD_QUAD(tx_bcast_frames);
3754 		CXGB_SYSCTL_ADD_QUAD(tx_pause);
3755 		CXGB_SYSCTL_ADD_QUAD(tx_deferred);
3756 		CXGB_SYSCTL_ADD_QUAD(tx_late_collisions);
3757 		CXGB_SYSCTL_ADD_QUAD(tx_total_collisions);
3758 		CXGB_SYSCTL_ADD_QUAD(tx_excess_collisions);
3759 		CXGB_SYSCTL_ADD_QUAD(tx_underrun);
3760 		CXGB_SYSCTL_ADD_QUAD(tx_len_errs);
3761 		CXGB_SYSCTL_ADD_QUAD(tx_mac_internal_errs);
3762 		CXGB_SYSCTL_ADD_QUAD(tx_excess_deferral);
3763 		CXGB_SYSCTL_ADD_QUAD(tx_fcs_errs);
3764 		CXGB_SYSCTL_ADD_QUAD(tx_frames_64);
3765 		CXGB_SYSCTL_ADD_QUAD(tx_frames_65_127);
3766 		CXGB_SYSCTL_ADD_QUAD(tx_frames_128_255);
3767 		CXGB_SYSCTL_ADD_QUAD(tx_frames_256_511);
3768 		CXGB_SYSCTL_ADD_QUAD(tx_frames_512_1023);
3769 		CXGB_SYSCTL_ADD_QUAD(tx_frames_1024_1518);
3770 		CXGB_SYSCTL_ADD_QUAD(tx_frames_1519_max);
3771 		CXGB_SYSCTL_ADD_QUAD(rx_octets);
3772 		CXGB_SYSCTL_ADD_QUAD(rx_octets_bad);
3773 		CXGB_SYSCTL_ADD_QUAD(rx_frames);
3774 		CXGB_SYSCTL_ADD_QUAD(rx_mcast_frames);
3775 		CXGB_SYSCTL_ADD_QUAD(rx_bcast_frames);
3776 		CXGB_SYSCTL_ADD_QUAD(rx_pause);
3777 		CXGB_SYSCTL_ADD_QUAD(rx_fcs_errs);
3778 		CXGB_SYSCTL_ADD_QUAD(rx_align_errs);
3779 		CXGB_SYSCTL_ADD_QUAD(rx_symbol_errs);
3780 		CXGB_SYSCTL_ADD_QUAD(rx_data_errs);
3781 		CXGB_SYSCTL_ADD_QUAD(rx_sequence_errs);
3782 		CXGB_SYSCTL_ADD_QUAD(rx_runt);
3783 		CXGB_SYSCTL_ADD_QUAD(rx_jabber);
3784 		CXGB_SYSCTL_ADD_QUAD(rx_short);
3785 		CXGB_SYSCTL_ADD_QUAD(rx_too_long);
3786 		CXGB_SYSCTL_ADD_QUAD(rx_mac_internal_errs);
3787 		CXGB_SYSCTL_ADD_QUAD(rx_cong_drops);
3788 		CXGB_SYSCTL_ADD_QUAD(rx_frames_64);
3789 		CXGB_SYSCTL_ADD_QUAD(rx_frames_65_127);
3790 		CXGB_SYSCTL_ADD_QUAD(rx_frames_128_255);
3791 		CXGB_SYSCTL_ADD_QUAD(rx_frames_256_511);
3792 		CXGB_SYSCTL_ADD_QUAD(rx_frames_512_1023);
3793 		CXGB_SYSCTL_ADD_QUAD(rx_frames_1024_1518);
3794 		CXGB_SYSCTL_ADD_QUAD(rx_frames_1519_max);
3795 #undef CXGB_SYSCTL_ADD_QUAD
3796 
3797 #define CXGB_SYSCTL_ADD_ULONG(a) SYSCTL_ADD_ULONG(ctx, poidlist, OID_AUTO, #a, \
3798     CTLFLAG_RD, &mstats->a, 0)
3799 		CXGB_SYSCTL_ADD_ULONG(tx_fifo_parity_err);
3800 		CXGB_SYSCTL_ADD_ULONG(rx_fifo_parity_err);
3801 		CXGB_SYSCTL_ADD_ULONG(tx_fifo_urun);
3802 		CXGB_SYSCTL_ADD_ULONG(rx_fifo_ovfl);
3803 		CXGB_SYSCTL_ADD_ULONG(serdes_signal_loss);
3804 		CXGB_SYSCTL_ADD_ULONG(xaui_pcs_ctc_err);
3805 		CXGB_SYSCTL_ADD_ULONG(xaui_pcs_align_change);
3806 		CXGB_SYSCTL_ADD_ULONG(num_toggled);
3807 		CXGB_SYSCTL_ADD_ULONG(num_resets);
3808 		CXGB_SYSCTL_ADD_ULONG(link_faults);
3809 #undef CXGB_SYSCTL_ADD_ULONG
3810 	}
3811 }
3812 
3813 /**
3814  *	t3_get_desc - dump an SGE descriptor for debugging purposes
3815  *	@qs: the queue set
3816  *	@qnum: identifies the specific queue (0..2: Tx, 3:response, 4..5: Rx)
3817  *	@idx: the descriptor index in the queue
3818  *	@data: where to dump the descriptor contents
3819  *
3820  *	Dumps the contents of a HW descriptor of an SGE queue.  Returns the
3821  *	size of the descriptor.
3822  */
3823 int
3824 t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx,
3825 		unsigned char *data)
3826 {
3827 	if (qnum >= 6)
3828 		return (EINVAL);
3829 
3830 	if (qnum < 3) {
3831 		if (!qs->txq[qnum].desc || idx >= qs->txq[qnum].size)
3832 			return -EINVAL;
3833 		memcpy(data, &qs->txq[qnum].desc[idx], sizeof(struct tx_desc));
3834 		return sizeof(struct tx_desc);
3835 	}
3836 
3837 	if (qnum == 3) {
3838 		if (!qs->rspq.desc || idx >= qs->rspq.size)
3839 			return (EINVAL);
3840 		memcpy(data, &qs->rspq.desc[idx], sizeof(struct rsp_desc));
3841 		return sizeof(struct rsp_desc);
3842 	}
3843 
3844 	qnum -= 4;
3845 	if (!qs->fl[qnum].desc || idx >= qs->fl[qnum].size)
3846 		return (EINVAL);
3847 	memcpy(data, &qs->fl[qnum].desc[idx], sizeof(struct rx_desc));
3848 	return sizeof(struct rx_desc);
3849 }
3850