xref: /freebsd/sys/dev/cxgb/cxgb_sge.c (revision d8b878873e7aa8df1972cc6a642804b17eb61087)
1 /**************************************************************************
2 
3 Copyright (c) 2007-2009, Chelsio Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Neither the name of the Chelsio Corporation nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15 
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27 
28 ***************************************************************************/
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include "opt_inet.h"
34 
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/kernel.h>
38 #include <sys/module.h>
39 #include <sys/bus.h>
40 #include <sys/conf.h>
41 #include <machine/bus.h>
42 #include <machine/resource.h>
43 #include <sys/bus_dma.h>
44 #include <sys/rman.h>
45 #include <sys/queue.h>
46 #include <sys/sysctl.h>
47 #include <sys/taskqueue.h>
48 
49 #include <sys/proc.h>
50 #include <sys/sbuf.h>
51 #include <sys/sched.h>
52 #include <sys/smp.h>
54 #include <sys/syslog.h>
55 #include <sys/socket.h>
56 
57 #include <net/bpf.h>
58 #include <net/ethernet.h>
59 #include <net/if.h>
60 #include <net/if_vlan_var.h>
61 
62 #include <netinet/in_systm.h>
63 #include <netinet/in.h>
64 #include <netinet/ip.h>
65 #include <netinet/tcp.h>
66 
67 #include <dev/pci/pcireg.h>
68 #include <dev/pci/pcivar.h>
69 
70 #include <vm/vm.h>
71 #include <vm/pmap.h>
72 
73 #include <cxgb_include.h>
74 #include <sys/mvec.h>
75 
76 int	txq_fills = 0;
77 int	multiq_tx_enable = 1;
78 
79 extern struct sysctl_oid_list sysctl__hw_cxgb_children;
80 int cxgb_txq_buf_ring_size = TX_ETH_Q_SIZE;
81 TUNABLE_INT("hw.cxgb.txq_mr_size", &cxgb_txq_buf_ring_size);
82 SYSCTL_UINT(_hw_cxgb, OID_AUTO, txq_mr_size, CTLFLAG_RDTUN, &cxgb_txq_buf_ring_size, 0,
83     "size of per-queue mbuf ring");
84 
85 static int cxgb_tx_coalesce_force = 0;
86 TUNABLE_INT("hw.cxgb.tx_coalesce_force", &cxgb_tx_coalesce_force);
87 SYSCTL_UINT(_hw_cxgb, OID_AUTO, tx_coalesce_force, CTLFLAG_RW,
88     &cxgb_tx_coalesce_force, 0,
89     "coalesce small packets into a single work request regardless of ring state");
90 
91 #define	COALESCE_START_DEFAULT		(TX_ETH_Q_SIZE>>1)
92 #define	COALESCE_START_MAX		(TX_ETH_Q_SIZE-(TX_ETH_Q_SIZE>>3))
93 #define	COALESCE_STOP_DEFAULT		(TX_ETH_Q_SIZE>>2)
94 #define	COALESCE_STOP_MIN		(TX_ETH_Q_SIZE>>5)
95 #define	TX_RECLAIM_DEFAULT		(TX_ETH_Q_SIZE>>5)
96 #define	TX_RECLAIM_MAX			(TX_ETH_Q_SIZE>>2)
97 #define	TX_RECLAIM_MIN			(TX_ETH_Q_SIZE>>6)
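/*
 * For reference, the thresholds above are all fractions of TX_ETH_Q_SIZE:
 * the coalesce-start threshold defaults to 1/2 of the ring (clamped to at
 * most 7/8), the coalesce-stop threshold to 1/4 (clamped to at least 1/32),
 * and the Tx reclaim threshold to 1/32, kept between 1/64 and 1/4.
 */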
98 
99 
100 static int cxgb_tx_coalesce_enable_start = COALESCE_START_DEFAULT;
101 TUNABLE_INT("hw.cxgb.tx_coalesce_enable_start",
102     &cxgb_tx_coalesce_enable_start);
103 SYSCTL_UINT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_start, CTLFLAG_RW,
104     &cxgb_tx_coalesce_enable_start, 0,
105     "coalesce enable threshold");
106 static int cxgb_tx_coalesce_enable_stop = COALESCE_STOP_DEFAULT;
107 TUNABLE_INT("hw.cxgb.tx_coalesce_enable_stop", &cxgb_tx_coalesce_enable_stop);
108 SYSCTL_UINT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_stop, CTLFLAG_RW,
109     &cxgb_tx_coalesce_enable_stop, 0,
110     "coalesce disable threshold");
111 static int cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT;
112 TUNABLE_INT("hw.cxgb.tx_reclaim_threshold", &cxgb_tx_reclaim_threshold);
113 SYSCTL_UINT(_hw_cxgb, OID_AUTO, tx_reclaim_threshold, CTLFLAG_RW,
114     &cxgb_tx_reclaim_threshold, 0,
115     "tx cleaning minimum threshold");
116 
117 /*
118  * XXX don't re-enable this until TOE stops assuming
119  * we have an m_ext
120  */
121 static int recycle_enable = 0;
122 
123 extern int cxgb_use_16k_clusters;
124 extern int nmbjumbop;
125 extern int nmbjumbo9;
126 extern int nmbjumbo16;
127 
128 #define USE_GTS 0
129 
130 #define SGE_RX_SM_BUF_SIZE	1536
131 #define SGE_RX_DROP_THRES	16
132 #define SGE_RX_COPY_THRES	128
133 
134 /*
135  * Period of the Tx buffer reclaim timer.  This timer does not need to run
136  * frequently as Tx buffers are usually reclaimed by new Tx packets.
137  */
138 #define TX_RECLAIM_PERIOD       (hz >> 1)
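/* Note: hz >> 1 callout ticks works out to roughly half a second per run. */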
139 
140 /*
141  * Values for sge_txq.flags
142  */
143 enum {
144 	TXQ_RUNNING	= 1 << 0,  /* fetch engine is running */
145 	TXQ_LAST_PKT_DB = 1 << 1,  /* last packet rang the doorbell */
146 };
147 
148 struct tx_desc {
149 	uint64_t	flit[TX_DESC_FLITS];
150 } __packed;
151 
152 struct rx_desc {
153 	uint32_t	addr_lo;
154 	uint32_t	len_gen;
155 	uint32_t	gen2;
156 	uint32_t	addr_hi;
157 } __packed;
158 
159 struct rsp_desc {               /* response queue descriptor */
160 	struct rss_header	rss_hdr;
161 	uint32_t		flags;
162 	uint32_t		len_cq;
163 	uint8_t			imm_data[47];
164 	uint8_t			intr_gen;
165 } __packed;
166 
167 #define RX_SW_DESC_MAP_CREATED	(1 << 0)
168 #define TX_SW_DESC_MAP_CREATED	(1 << 1)
169 #define RX_SW_DESC_INUSE        (1 << 3)
170 #define TX_SW_DESC_MAPPED       (1 << 4)
171 
172 #define RSPQ_NSOP_NEOP           G_RSPD_SOP_EOP(0)
173 #define RSPQ_EOP                 G_RSPD_SOP_EOP(F_RSPD_EOP)
174 #define RSPQ_SOP                 G_RSPD_SOP_EOP(F_RSPD_SOP)
175 #define RSPQ_SOP_EOP             G_RSPD_SOP_EOP(F_RSPD_SOP|F_RSPD_EOP)
176 
177 struct tx_sw_desc {                /* SW state per Tx descriptor */
178 	struct mbuf	*m;
179 	bus_dmamap_t	map;
180 	int		flags;
181 };
182 
183 struct rx_sw_desc {                /* SW state per Rx descriptor */
184 	caddr_t		rxsd_cl;
185 	struct mbuf	*m;
186 	bus_dmamap_t	map;
187 	int		flags;
188 };
189 
190 struct txq_state {
191 	unsigned int	compl;
192 	unsigned int	gen;
193 	unsigned int	pidx;
194 };
195 
196 struct refill_fl_cb_arg {
197 	int               error;
198 	bus_dma_segment_t seg;
199 	int               nseg;
200 };
201 
202 
203 /*
204  * Maps a number of flits to the number of Tx descriptors that can hold them.
205  * The formula is
206  *
207  * desc = 1 + (flits - 2) / (WR_FLITS - 1).
208  *
209  * HW allows up to 4 descriptors to be combined into a WR.
210  */
211 static uint8_t flit_desc_map[] = {
212 	0,
213 #if SGE_NUM_GENBITS == 1
214 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
215 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
216 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
217 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
218 #elif SGE_NUM_GENBITS == 2
219 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
220 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
221 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
222 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
223 #else
224 # error "SGE_NUM_GENBITS must be 1 or 2"
225 #endif
226 };
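/*
 * Reading the table above as a worked example (with SGE_NUM_GENBITS == 2):
 * flit_desc_map[15] == 1 and flit_desc_map[16] == 2, i.e. a work request of
 * up to 15 flits fits in a single Tx descriptor while a 16-flit WR spills
 * into a second descriptor, consistent with the formula quoted above.
 */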
227 
228 #define	TXQ_LOCK_ASSERT(qs)	mtx_assert(&(qs)->lock, MA_OWNED)
229 #define	TXQ_TRYLOCK(qs)		mtx_trylock(&(qs)->lock)
230 #define	TXQ_LOCK(qs)		mtx_lock(&(qs)->lock)
231 #define	TXQ_UNLOCK(qs)		mtx_unlock(&(qs)->lock)
232 #define	TXQ_RING_EMPTY(qs)	drbr_empty((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
233 #define	TXQ_RING_NEEDS_ENQUEUE(qs)					\
234 	drbr_needs_enqueue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
235 #define	TXQ_RING_FLUSH(qs)	drbr_flush((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
236 #define	TXQ_RING_DEQUEUE_COND(qs, func, arg)				\
237 	drbr_dequeue_cond((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr, func, arg)
238 #define	TXQ_RING_DEQUEUE(qs) \
239 	drbr_dequeue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
240 
241 int cxgb_debug = 0;
242 
243 static void sge_timer_cb(void *arg);
244 static void sge_timer_reclaim(void *arg, int ncount);
245 static void sge_txq_reclaim_handler(void *arg, int ncount);
246 static void cxgb_start_locked(struct sge_qset *qs);
247 
248 /*
249  * XXX need to cope with bursty scheduling by looking at a wider
250  * window than we do now when determining the need for
251  * coalescing.
252  */
253 static __inline uint64_t
254 check_pkt_coalesce(struct sge_qset *qs)
255 {
256         struct adapter *sc;
257         struct sge_txq *txq;
258 	uint8_t *fill;
259 
260 	if (__predict_false(cxgb_tx_coalesce_force))
261 		return (1);
262 	txq = &qs->txq[TXQ_ETH];
263         sc = qs->port->adapter;
264 	fill = &sc->tunq_fill[qs->idx];
265 
266 	if (cxgb_tx_coalesce_enable_start > COALESCE_START_MAX)
267 		cxgb_tx_coalesce_enable_start = COALESCE_START_MAX;
268 	if (cxgb_tx_coalesce_enable_stop < COALESCE_STOP_MIN)
269 		cxgb_tx_coalesce_enable_stop = COALESCE_STOP_MIN;
270 	/*
271 	 * Once the hardware transmit queue fills to the enable-start
272 	 * threshold we mark it as coalescing; we drop back out of
273 	 * coalescing only when it drains to the enable-stop threshold
274 	 * and there are no packets enqueued, which gives us some hysteresis.
275 	 */
276         if (*fill != 0 && (txq->in_use <= cxgb_tx_coalesce_enable_stop) &&
277 	    TXQ_RING_EMPTY(qs) && (qs->coalescing == 0))
278                 *fill = 0;
279         else if (*fill == 0 && (txq->in_use >= cxgb_tx_coalesce_enable_start))
280                 *fill = 1;
281 
282 	return (sc->tunq_coalesce);
283 }
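/*
 * With the default tunables this behaves as follows (a sketch, assuming the
 * defaults above are left untouched): the fill flag is raised once the
 * hardware queue reaches COALESCE_START_DEFAULT (half the ring) and is only
 * cleared again after the queue drains to COALESCE_STOP_DEFAULT (a quarter
 * of the ring) with the software ring empty and coalescing already off.
 */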
284 
285 #ifdef __LP64__
286 static void
287 set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo)
288 {
289 	uint64_t wr_hilo;
290 #if _BYTE_ORDER == _LITTLE_ENDIAN
291 	wr_hilo = wr_hi;
292 	wr_hilo |= (((uint64_t)wr_lo)<<32);
293 #else
294 	wr_hilo = wr_lo;
295 	wr_hilo |= (((uint64_t)wr_hi)<<32);
296 #endif
297 	wrp->wrh_hilo = wr_hilo;
298 }
299 #else
300 static void
301 set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo)
302 {
303 
304 	wrp->wrh_hi = wr_hi;
305 	wmb();
306 	wrp->wrh_lo = wr_lo;
307 }
308 #endif
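/*
 * Note on the two variants above: on LP64 the work request header is
 * published with a single 64-bit store, so both halves become visible
 * atomically; the 32-bit variant writes wrh_hi first and issues wmb()
 * before wrh_lo so the SGE never observes the low word (which carries the
 * generation bit) ahead of the high word.
 */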
309 
310 struct coalesce_info {
311 	int count;
312 	int nbytes;
313 };
314 
315 static int
316 coalesce_check(struct mbuf *m, void *arg)
317 {
318 	struct coalesce_info *ci = arg;
319 	int *count = &ci->count;
320 	int *nbytes = &ci->nbytes;
321 
322 	if ((*nbytes == 0) || ((*nbytes + m->m_len <= 10500) &&
323 		(*count < 7) && (m->m_next == NULL))) {
324 		*count += 1;
325 		*nbytes += m->m_len;
326 		return (1);
327 	}
328 	return (0);
329 }
330 
331 static struct mbuf *
332 cxgb_dequeue(struct sge_qset *qs)
333 {
334 	struct mbuf *m, *m_head, *m_tail;
335 	struct coalesce_info ci;
336 
337 
338 	if (check_pkt_coalesce(qs) == 0)
339 		return TXQ_RING_DEQUEUE(qs);
340 
341 	m_head = m_tail = NULL;
342 	ci.count = ci.nbytes = 0;
343 	do {
344 		m = TXQ_RING_DEQUEUE_COND(qs, coalesce_check, &ci);
345 		if (m_head == NULL) {
346 			m_tail = m_head = m;
347 		} else if (m != NULL) {
348 			m_tail->m_nextpkt = m;
349 			m_tail = m;
350 		}
351 	} while (m != NULL);
352 	if (ci.count > 7)
353 		panic("trying to coalesce %d packets into one WR", ci.count);
354 	return (m_head);
355 }
356 
357 /**
358  *	reclaim_completed_tx - reclaims completed Tx descriptors
359  *	@qs: the queue set owning the Tx queue
360  *	@reclaim_min: don't reclaim unless at least this many descriptors are done
361  *	@queue: the Tx queue to reclaim completed descriptors from
362  *
363  *	Reclaims Tx descriptors that the SGE has indicated it has processed and
364  *	frees the associated buffers if possible.  Called with the queue set's lock held.
365  */
366 static __inline int
367 reclaim_completed_tx(struct sge_qset *qs, int reclaim_min, int queue)
368 {
369 	struct sge_txq *q = &qs->txq[queue];
370 	int reclaim = desc_reclaimable(q);
371 
372 	if ((cxgb_tx_reclaim_threshold > TX_RECLAIM_MAX) ||
373 	    (cxgb_tx_reclaim_threshold < TX_RECLAIM_MIN))
374 		cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT;
375 
376 	if (reclaim < reclaim_min)
377 		return (0);
378 
379 	mtx_assert(&qs->lock, MA_OWNED);
380 	if (reclaim > 0) {
381 		t3_free_tx_desc(qs, reclaim, queue);
382 		q->cleaned += reclaim;
383 		q->in_use -= reclaim;
384 	}
385 	if (isset(&qs->txq_stopped, TXQ_ETH))
386                 clrbit(&qs->txq_stopped, TXQ_ETH);
387 
388 	return (reclaim);
389 }
390 
391 /**
392  *	should_restart_tx - are there enough resources to restart a Tx queue?
393  *	@q: the Tx queue
394  *
395  *	Checks if there are enough descriptors to restart a suspended Tx queue.
396  */
397 static __inline int
398 should_restart_tx(const struct sge_txq *q)
399 {
400 	unsigned int r = q->processed - q->cleaned;
401 
402 	return q->in_use - r < (q->size >> 1);
403 }
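/*
 * In other words: counting descriptors that have completed but not yet been
 * cleaned as free, the queue is considered restartable once less than half
 * of it would still be in use.
 */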
404 
405 /**
406  *	t3_sge_init - initialize SGE
407  *	@adap: the adapter
408  *	@p: the SGE parameters
409  *
410  *	Performs SGE initialization needed every time after a chip reset.
411  *	We do not initialize any of the queue sets here; instead the driver
412  *	top-level must request those individually.  We also do not enable DMA
413  *	here, that should be done after the queues have been set up.
414  */
415 void
416 t3_sge_init(adapter_t *adap, struct sge_params *p)
417 {
418 	u_int ctrl, ups;
419 
420 	ups = 0; /* = ffs(pci_resource_len(adap->pdev, 2) >> 12); */
421 
422 	ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL |
423 	       F_CQCRDTCTRL | F_CONGMODE | F_TNLFLMODE | F_FATLPERREN |
424 	       V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS |
425 	       V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING;
426 #if SGE_NUM_GENBITS == 1
427 	ctrl |= F_EGRGENCTRL;
428 #endif
429 	if (adap->params.rev > 0) {
430 		if (!(adap->flags & (USING_MSIX | USING_MSI)))
431 			ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ;
432 	}
433 	t3_write_reg(adap, A_SG_CONTROL, ctrl);
434 	t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) |
435 		     V_LORCQDRBTHRSH(512));
436 	t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10);
437 	t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) |
438 		     V_TIMEOUT(200 * core_ticks_per_usec(adap)));
439 	t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH,
440 		     adap->params.rev < T3_REV_C ? 1000 : 500);
441 	t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256);
442 	t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000);
443 	t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256);
444 	t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff));
445 	t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024);
446 }
447 
448 
449 /**
450  *	sgl_len - calculates the size of an SGL of the given capacity
451  *	@n: the number of SGL entries
452  *
453  *	Calculates the number of flits needed for a scatter/gather list that
454  *	can hold the given number of entries.
455  */
456 static __inline unsigned int
457 sgl_len(unsigned int n)
458 {
459 	return ((3 * n) / 2 + (n & 1));
460 }
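/*
 * Worked values, following directly from the expression above: sgl_len(1) == 2,
 * sgl_len(2) == 3, sgl_len(3) == 5 and sgl_len(4) == 6, since each pair of SGL
 * entries packs into 3 flits and a trailing odd entry takes 2 more.
 */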
461 
462 /**
463  *	get_imm_packet - extract an immediate-data packet from a response
464  *	@sc: the adapter
465  *	@resp: the response descriptor containing the packet data
466  *	@m: the mbuf that receives a copy of the immediate data
467  */
468 static int
469 get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct mbuf *m)
470 {
471 
472 	m->m_len = m->m_pkthdr.len = IMMED_PKT_SIZE;
473 	m->m_ext.ext_buf = NULL;
474 	m->m_ext.ext_type = 0;
475 	memcpy(mtod(m, uint8_t *), resp->imm_data, IMMED_PKT_SIZE);
476 	return (0);
477 }
478 
479 static __inline u_int
480 flits_to_desc(u_int n)
481 {
482 	return (flit_desc_map[n]);
483 }
484 
485 #define SGE_PARERR (F_CPPARITYERROR | F_OCPARITYERROR | F_RCPARITYERROR | \
486 		    F_IRPARITYERROR | V_ITPARITYERROR(M_ITPARITYERROR) | \
487 		    V_FLPARITYERROR(M_FLPARITYERROR) | F_LODRBPARITYERROR | \
488 		    F_HIDRBPARITYERROR | F_LORCQPARITYERROR | \
489 		    F_HIRCQPARITYERROR)
490 #define SGE_FRAMINGERR (F_UC_REQ_FRAMINGERROR | F_R_REQ_FRAMINGERROR)
491 #define SGE_FATALERR (SGE_PARERR | SGE_FRAMINGERR | F_RSPQCREDITOVERFOW | \
492 		      F_RSPQDISABLED)
493 
494 /**
495  *	t3_sge_err_intr_handler - SGE async event interrupt handler
496  *	@adapter: the adapter
497  *
498  *	Interrupt handler for SGE asynchronous (non-data) events.
499  */
500 void
501 t3_sge_err_intr_handler(adapter_t *adapter)
502 {
503 	unsigned int v, status;
504 
505 	status = t3_read_reg(adapter, A_SG_INT_CAUSE);
506 	if (status & SGE_PARERR)
507 		CH_ALERT(adapter, "SGE parity error (0x%x)\n",
508 			 status & SGE_PARERR);
509 	if (status & SGE_FRAMINGERR)
510 		CH_ALERT(adapter, "SGE framing error (0x%x)\n",
511 			 status & SGE_FRAMINGERR);
512 	if (status & F_RSPQCREDITOVERFOW)
513 		CH_ALERT(adapter, "SGE response queue credit overflow\n");
514 
515 	if (status & F_RSPQDISABLED) {
516 		v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS);
517 
518 		CH_ALERT(adapter,
519 			 "packet delivered to disabled response queue (0x%x)\n",
520 			 (v >> S_RSPQ0DISABLED) & 0xff);
521 	}
522 
523 	t3_write_reg(adapter, A_SG_INT_CAUSE, status);
524 	if (status & SGE_FATALERR)
525 		t3_fatal_err(adapter);
526 }
527 
528 void
529 t3_sge_prep(adapter_t *adap, struct sge_params *p)
530 {
531 	int i, nqsets, fl_q_size, jumbo_q_size, use_16k, jumbo_buf_size;
532 
533 	nqsets = min(SGE_QSETS / adap->params.nports, mp_ncpus);
534 	nqsets *= adap->params.nports;
535 
536 	fl_q_size = min(nmbclusters/(3*nqsets), FL_Q_SIZE);
537 
538 	while (!powerof2(fl_q_size))
539 		fl_q_size--;
540 
541 	use_16k = cxgb_use_16k_clusters != -1 ? cxgb_use_16k_clusters :
542 	    is_offload(adap);
543 
544 #if __FreeBSD_version >= 700111
545 	if (use_16k) {
546 		jumbo_q_size = min(nmbjumbo16/(3*nqsets), JUMBO_Q_SIZE);
547 		jumbo_buf_size = MJUM16BYTES;
548 	} else {
549 		jumbo_q_size = min(nmbjumbo9/(3*nqsets), JUMBO_Q_SIZE);
550 		jumbo_buf_size = MJUM9BYTES;
551 	}
552 #else
553 	jumbo_q_size = min(nmbjumbop/(3*nqsets), JUMBO_Q_SIZE);
554 	jumbo_buf_size = MJUMPAGESIZE;
555 #endif
556 	while (!powerof2(jumbo_q_size))
557 		jumbo_q_size--;
558 
559 	if (fl_q_size < (FL_Q_SIZE / 4) || jumbo_q_size < (JUMBO_Q_SIZE / 2))
560 		device_printf(adap->dev,
561 		    "Insufficient clusters and/or jumbo buffers.\n");
562 
563 	p->max_pkt_size = jumbo_buf_size - sizeof(struct cpl_rx_data);
564 
565 	for (i = 0; i < SGE_QSETS; ++i) {
566 		struct qset_params *q = p->qset + i;
567 
568 		if (adap->params.nports > 2) {
569 			q->coalesce_usecs = 50;
570 		} else {
571 #ifdef INVARIANTS
572 			q->coalesce_usecs = 10;
573 #else
574 			q->coalesce_usecs = 5;
575 #endif
576 		}
577 		q->polling = 0;
578 		q->rspq_size = RSPQ_Q_SIZE;
579 		q->fl_size = fl_q_size;
580 		q->jumbo_size = jumbo_q_size;
581 		q->jumbo_buf_size = jumbo_buf_size;
582 		q->txq_size[TXQ_ETH] = TX_ETH_Q_SIZE;
583 		q->txq_size[TXQ_OFLD] = is_offload(adap) ? TX_OFLD_Q_SIZE : 16;
584 		q->txq_size[TXQ_CTRL] = TX_CTRL_Q_SIZE;
585 		q->cong_thres = 0;
586 	}
587 }
588 
589 int
590 t3_sge_alloc(adapter_t *sc)
591 {
592 
593 	/* The parent tag. */
594 	if (bus_dma_tag_create( NULL,			/* parent */
595 				1, 0,			/* algnmnt, boundary */
596 				BUS_SPACE_MAXADDR,	/* lowaddr */
597 				BUS_SPACE_MAXADDR,	/* highaddr */
598 				NULL, NULL,		/* filter, filterarg */
599 				BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
600 				BUS_SPACE_UNRESTRICTED, /* nsegments */
601 				BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
602 				0,			/* flags */
603 				NULL, NULL,		/* lock, lockarg */
604 				&sc->parent_dmat)) {
605 		device_printf(sc->dev, "Cannot allocate parent DMA tag\n");
606 		return (ENOMEM);
607 	}
608 
609 	/*
610 	 * DMA tag for normal sized RX frames
611 	 */
612 	if (bus_dma_tag_create(sc->parent_dmat, MCLBYTES, 0, BUS_SPACE_MAXADDR,
613 		BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, 1,
614 		MCLBYTES, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_dmat)) {
615 		device_printf(sc->dev, "Cannot allocate RX DMA tag\n");
616 		return (ENOMEM);
617 	}
618 
619 	/*
620 	 * DMA tag for jumbo sized RX frames.
621 	 */
622 	if (bus_dma_tag_create(sc->parent_dmat, MJUM16BYTES, 0, BUS_SPACE_MAXADDR,
623 		BUS_SPACE_MAXADDR, NULL, NULL, MJUM16BYTES, 1, MJUM16BYTES,
624 		BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_jumbo_dmat)) {
625 		device_printf(sc->dev, "Cannot allocate RX jumbo DMA tag\n");
626 		return (ENOMEM);
627 	}
628 
629 	/*
630 	 * DMA tag for TX frames.
631 	 */
632 	if (bus_dma_tag_create(sc->parent_dmat, 1, 0, BUS_SPACE_MAXADDR,
633 		BUS_SPACE_MAXADDR, NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
634 		TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
635 		NULL, NULL, &sc->tx_dmat)) {
636 		device_printf(sc->dev, "Cannot allocate TX DMA tag\n");
637 		return (ENOMEM);
638 	}
639 
640 	return (0);
641 }
642 
643 int
644 t3_sge_free(struct adapter * sc)
645 {
646 
647 	if (sc->tx_dmat != NULL)
648 		bus_dma_tag_destroy(sc->tx_dmat);
649 
650 	if (sc->rx_jumbo_dmat != NULL)
651 		bus_dma_tag_destroy(sc->rx_jumbo_dmat);
652 
653 	if (sc->rx_dmat != NULL)
654 		bus_dma_tag_destroy(sc->rx_dmat);
655 
656 	if (sc->parent_dmat != NULL)
657 		bus_dma_tag_destroy(sc->parent_dmat);
658 
659 	return (0);
660 }
661 
662 void
663 t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p)
664 {
665 
666 	qs->rspq.holdoff_tmr = max(p->coalesce_usecs * 10, 1U);
667 	qs->rspq.polling = 0 /* p->polling */;
668 }
669 
670 #if !defined(__i386__) && !defined(__amd64__)
671 static void
672 refill_fl_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
673 {
674 	struct refill_fl_cb_arg *cb_arg = arg;
675 
676 	cb_arg->error = error;
677 	cb_arg->seg = segs[0];
678 	cb_arg->nseg = nseg;
679 
680 }
681 #endif
682 /**
683  *	refill_fl - refill an SGE free-buffer list
684  *	@sc: the controller softc
685  *	@q: the free-list to refill
686  *	@n: the number of new buffers to allocate
687  *
688  *	(Re)populate an SGE free-buffer list with up to @n new packet buffers.
689  *	The caller must assure that @n does not exceed the queue's capacity.
690  */
691 static void
692 refill_fl(adapter_t *sc, struct sge_fl *q, int n)
693 {
694 	struct rx_sw_desc *sd = &q->sdesc[q->pidx];
695 	struct rx_desc *d = &q->desc[q->pidx];
696 	struct refill_fl_cb_arg cb_arg;
697 	struct mbuf *m;
698 	caddr_t cl;
699 	int err, count = 0;
700 
701 	cb_arg.error = 0;
702 	while (n--) {
703 		/*
704 		 * We only allocate a cluster, mbuf allocation happens after rx
705 		 */
706 		if (q->zone == zone_pack) {
707 			if ((m = m_getcl(M_NOWAIT, MT_NOINIT, M_PKTHDR)) == NULL)
708 				break;
709 			cl = m->m_ext.ext_buf;
710 		} else {
711 			if ((cl = m_cljget(NULL, M_NOWAIT, q->buf_size)) == NULL)
712 				break;
713 			if ((m = m_gethdr(M_NOWAIT, MT_NOINIT)) == NULL) {
714 				uma_zfree(q->zone, cl);
715 				break;
716 			}
717 		}
718 		if ((sd->flags & RX_SW_DESC_MAP_CREATED) == 0) {
719 			if ((err = bus_dmamap_create(q->entry_tag, 0, &sd->map))) {
720 				log(LOG_WARNING, "bus_dmamap_create failed %d\n", err);
721 				uma_zfree(q->zone, cl);
722 				goto done;
723 			}
724 			sd->flags |= RX_SW_DESC_MAP_CREATED;
725 		}
726 #if !defined(__i386__) && !defined(__amd64__)
727 		err = bus_dmamap_load(q->entry_tag, sd->map,
728 		    cl, q->buf_size, refill_fl_cb, &cb_arg, 0);
729 
730 		if (err != 0 || cb_arg.error) {
731 			if (q->zone == zone_pack)
732 				uma_zfree(q->zone, cl);
733 			m_free(m);
734 			goto done;
735 		}
736 #else
737 		cb_arg.seg.ds_addr = pmap_kextract((vm_offset_t)cl);
738 #endif
739 		sd->flags |= RX_SW_DESC_INUSE;
740 		sd->rxsd_cl = cl;
741 		sd->m = m;
742 		d->addr_lo = htobe32(cb_arg.seg.ds_addr & 0xffffffff);
743 		d->addr_hi = htobe32(((uint64_t)cb_arg.seg.ds_addr >>32) & 0xffffffff);
744 		d->len_gen = htobe32(V_FLD_GEN1(q->gen));
745 		d->gen2 = htobe32(V_FLD_GEN2(q->gen));
746 
747 		d++;
748 		sd++;
749 
750 		if (++q->pidx == q->size) {
751 			q->pidx = 0;
752 			q->gen ^= 1;
753 			sd = q->sdesc;
754 			d = q->desc;
755 		}
756 		q->credits++;
757 		count++;
758 	}
759 
760 done:
761 	if (count)
762 		t3_write_reg(sc, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
763 }
764 
765 
766 /**
767  *	free_rx_bufs - free the Rx buffers on an SGE free list
768  *	@sc: the controller softc
769  *	@q: the SGE free list to clean up
770  *
771  *	Release the buffers on an SGE free-buffer Rx queue.  HW fetching from
772  *	this queue should be stopped before calling this function.
773  */
774 static void
775 free_rx_bufs(adapter_t *sc, struct sge_fl *q)
776 {
777 	u_int cidx = q->cidx;
778 
779 	while (q->credits--) {
780 		struct rx_sw_desc *d = &q->sdesc[cidx];
781 
782 		if (d->flags & RX_SW_DESC_INUSE) {
783 			bus_dmamap_unload(q->entry_tag, d->map);
784 			bus_dmamap_destroy(q->entry_tag, d->map);
785 			if (q->zone == zone_pack) {
786 				m_init(d->m, zone_pack, MCLBYTES,
787 				    M_NOWAIT, MT_DATA, M_EXT);
788 				uma_zfree(zone_pack, d->m);
789 			} else {
790 				m_init(d->m, zone_mbuf, MLEN,
791 				    M_NOWAIT, MT_DATA, 0);
792 				uma_zfree(zone_mbuf, d->m);
793 				uma_zfree(q->zone, d->rxsd_cl);
794 			}
795 		}
796 
797 		d->rxsd_cl = NULL;
798 		d->m = NULL;
799 		if (++cidx == q->size)
800 			cidx = 0;
801 	}
802 }
803 
804 static __inline void
805 __refill_fl(adapter_t *adap, struct sge_fl *fl)
806 {
807 	refill_fl(adap, fl, min(16U, fl->size - fl->credits));
808 }
809 
810 static __inline void
811 __refill_fl_lt(adapter_t *adap, struct sge_fl *fl, int max)
812 {
813 	if ((fl->size - fl->credits) < max)
814 		refill_fl(adap, fl, min(max, fl->size - fl->credits));
815 }
816 
817 /**
818  *	recycle_rx_buf - recycle a receive buffer
819  *	@adapter: the adapter
820  *	@q: the SGE free list
821  *	@idx: index of buffer to recycle
822  *
823  *	Recycles the specified buffer on the given free list by adding it at
824  *	the next available slot on the list.
825  */
826 static void
827 recycle_rx_buf(adapter_t *adap, struct sge_fl *q, unsigned int idx)
828 {
829 	struct rx_desc *from = &q->desc[idx];
830 	struct rx_desc *to   = &q->desc[q->pidx];
831 
832 	q->sdesc[q->pidx] = q->sdesc[idx];
833 	to->addr_lo = from->addr_lo;        // already big endian
834 	to->addr_hi = from->addr_hi;        // likewise
835 	wmb();	/* necessary ? */
836 	to->len_gen = htobe32(V_FLD_GEN1(q->gen));
837 	to->gen2 = htobe32(V_FLD_GEN2(q->gen));
838 	q->credits++;
839 
840 	if (++q->pidx == q->size) {
841 		q->pidx = 0;
842 		q->gen ^= 1;
843 	}
844 	t3_write_reg(adap, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
845 }
846 
847 static void
848 alloc_ring_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
849 {
850 	uint32_t *addr;
851 
852 	addr = arg;
853 	*addr = segs[0].ds_addr;
854 }
855 
856 static int
857 alloc_ring(adapter_t *sc, size_t nelem, size_t elem_size, size_t sw_size,
858     bus_addr_t *phys, void *desc, void *sdesc, bus_dma_tag_t *tag,
859     bus_dmamap_t *map, bus_dma_tag_t parent_entry_tag, bus_dma_tag_t *entry_tag)
860 {
861 	size_t len = nelem * elem_size;
862 	void *s = NULL;
863 	void *p = NULL;
864 	int err;
865 
866 	if ((err = bus_dma_tag_create(sc->parent_dmat, PAGE_SIZE, 0,
867 				      BUS_SPACE_MAXADDR_32BIT,
868 				      BUS_SPACE_MAXADDR, NULL, NULL, len, 1,
869 				      len, 0, NULL, NULL, tag)) != 0) {
870 		device_printf(sc->dev, "Cannot allocate descriptor tag\n");
871 		return (ENOMEM);
872 	}
873 
874 	if ((err = bus_dmamem_alloc(*tag, (void **)&p, BUS_DMA_NOWAIT,
875 				    map)) != 0) {
876 		device_printf(sc->dev, "Cannot allocate descriptor memory\n");
877 		return (ENOMEM);
878 	}
879 
880 	bus_dmamap_load(*tag, *map, p, len, alloc_ring_cb, phys, 0);
881 	bzero(p, len);
882 	*(void **)desc = p;
883 
884 	if (sw_size) {
885 		len = nelem * sw_size;
886 		s = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO);
887 		*(void **)sdesc = s;
888 	}
889 	if (parent_entry_tag == NULL)
890 		return (0);
891 
892 	if ((err = bus_dma_tag_create(parent_entry_tag, 1, 0,
893 				      BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR,
894 		                      NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
895 				      TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
896 		                      NULL, NULL, entry_tag)) != 0) {
897 		device_printf(sc->dev, "Cannot allocate descriptor entry tag\n");
898 		return (ENOMEM);
899 	}
900 	return (0);
901 }
902 
903 static void
904 sge_slow_intr_handler(void *arg, int ncount)
905 {
906 	adapter_t *sc = arg;
907 
908 	t3_slow_intr_handler(sc);
909 }
910 
911 /**
912  *	sge_timer_cb - perform periodic maintenance of the SGE queues
913  *	@arg: the adapter whose SGE queue sets are maintained
914  *
915  *	Runs periodically from a timer to perform maintenance of the SGE queue
916  *	sets.  It performs the following tasks:
917  *
918  *	a) Cleans up any completed Tx descriptors that may still be pending.
919  *	Normal descriptor cleanup happens when new packets are added to a Tx
920  *	queue so this timer is relatively infrequent and does any cleanup only
921  *	if the Tx queue has not seen any new packets in a while.  We make a
922  *	best effort attempt to reclaim descriptors, in that we don't wait
923  *	around if we cannot get a queue's lock (which most likely is because
924  *	someone else is queueing new packets and so will also handle the clean
925  *	up).  Since control queues use immediate data exclusively we don't
926  *	bother cleaning them up here.
927  *
928  *	b) Replenishes Rx queues that have run out due to memory shortage.
929  *	Normally new Rx buffers are added when existing ones are consumed but
930  *	when out of memory a queue can become empty.  We try to add only a few
931  *	buffers here, the queue will be replenished fully as these new buffers
932  *	are used up if memory shortage has subsided.
933  *
934  *	c) Return coalesced response queue credits in case a response queue is
935  *	starved.
936  *
937  *	d) Ring doorbells for T304 tunnel queues since we have seen doorbell
938  *	fifo overflows and the FW doesn't implement any recovery scheme yet.
939  */
940 static void
941 sge_timer_cb(void *arg)
942 {
943 	adapter_t *sc = arg;
944 	if ((sc->flags & USING_MSIX) == 0) {
945 
946 		struct port_info *pi;
947 		struct sge_qset *qs;
948 		struct sge_txq  *txq;
949 		int i, j;
950 		int reclaim_ofl, refill_rx;
951 
952 		if (sc->open_device_map == 0)
953 			return;
954 
955 		for (i = 0; i < sc->params.nports; i++) {
956 			pi = &sc->port[i];
957 			for (j = 0; j < pi->nqsets; j++) {
958 				qs = &sc->sge.qs[pi->first_qset + j];
959 				txq = &qs->txq[0];
960 				reclaim_ofl = txq[TXQ_OFLD].processed - txq[TXQ_OFLD].cleaned;
961 				refill_rx = ((qs->fl[0].credits < qs->fl[0].size) ||
962 				    (qs->fl[1].credits < qs->fl[1].size));
963 				if (reclaim_ofl || refill_rx) {
964 					taskqueue_enqueue(sc->tq, &pi->timer_reclaim_task);
965 					break;
966 				}
967 			}
968 		}
969 	}
970 
971 	if (sc->params.nports > 2) {
972 		int i;
973 
974 		for_each_port(sc, i) {
975 			struct port_info *pi = &sc->port[i];
976 
977 			t3_write_reg(sc, A_SG_KDOORBELL,
978 				     F_SELEGRCNTX |
979 				     (FW_TUNNEL_SGEEC_START + pi->first_qset));
980 		}
981 	}
982 	if (((sc->flags & USING_MSIX) == 0 || sc->params.nports > 2) &&
983 	    sc->open_device_map != 0)
984 		callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
985 }
986 
987 /*
988  * This is meant to be a catch-all function to keep sge state private
989  * to sge.c
990  *
991  */
992 int
993 t3_sge_init_adapter(adapter_t *sc)
994 {
995 	callout_init(&sc->sge_timer_ch, CALLOUT_MPSAFE);
996 	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
997 	TASK_INIT(&sc->slow_intr_task, 0, sge_slow_intr_handler, sc);
998 	return (0);
999 }
1000 
1001 int
1002 t3_sge_reset_adapter(adapter_t *sc)
1003 {
1004 	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
1005 	return (0);
1006 }
1007 
1008 int
1009 t3_sge_init_port(struct port_info *pi)
1010 {
1011 	TASK_INIT(&pi->timer_reclaim_task, 0, sge_timer_reclaim, pi);
1012 	return (0);
1013 }
1014 
1015 /**
1016  *	refill_rspq - replenish an SGE response queue
1017  *	@adapter: the adapter
1018  *	@q: the response queue to replenish
1019  *	@credits: how many new responses to make available
1020  *
1021  *	Replenishes a response queue by making the supplied number of responses
1022  *	available to HW.
1023  */
1024 static __inline void
1025 refill_rspq(adapter_t *sc, const struct sge_rspq *q, u_int credits)
1026 {
1027 
1028 	/* mbufs are allocated on demand when a rspq entry is processed. */
1029 	t3_write_reg(sc, A_SG_RSPQ_CREDIT_RETURN,
1030 		     V_RSPQ(q->cntxt_id) | V_CREDITS(credits));
1031 }
1032 
1033 static void
1034 sge_txq_reclaim_handler(void *arg, int ncount)
1035 {
1036 	struct sge_qset *qs = arg;
1037 	int i;
1038 
1039 	for (i = 0; i < 3; i++)
1040 		reclaim_completed_tx(qs, 16, i);
1041 }
1042 
1043 static void
1044 sge_timer_reclaim(void *arg, int ncount)
1045 {
1046 	struct port_info *pi = arg;
1047 	int i, nqsets = pi->nqsets;
1048 	adapter_t *sc = pi->adapter;
1049 	struct sge_qset *qs;
1050 	struct mtx *lock;
1051 
1052 	KASSERT((sc->flags & USING_MSIX) == 0,
1053 	    ("can't call timer reclaim for msi-x"));
1054 
1055 	for (i = 0; i < nqsets; i++) {
1056 		qs = &sc->sge.qs[pi->first_qset + i];
1057 
1058 		reclaim_completed_tx(qs, 16, TXQ_OFLD);
1059 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
1060 			    &sc->sge.qs[0].rspq.lock;
1061 
1062 		if (mtx_trylock(lock)) {
1063 			/* XXX currently assume that we are *NOT* polling */
1064 			uint32_t status = t3_read_reg(sc, A_SG_RSPQ_FL_STATUS);
1065 
1066 			if (qs->fl[0].credits < qs->fl[0].size - 16)
1067 				__refill_fl(sc, &qs->fl[0]);
1068 			if (qs->fl[1].credits < qs->fl[1].size - 16)
1069 				__refill_fl(sc, &qs->fl[1]);
1070 
1071 			if (status & (1 << qs->rspq.cntxt_id)) {
1072 				if (qs->rspq.credits) {
1073 					refill_rspq(sc, &qs->rspq, 1);
1074 					qs->rspq.credits--;
1075 					t3_write_reg(sc, A_SG_RSPQ_FL_STATUS,
1076 					    1 << qs->rspq.cntxt_id);
1077 				}
1078 			}
1079 			mtx_unlock(lock);
1080 		}
1081 	}
1082 }
1083 
1084 /**
1085  *	init_qset_cntxt - initialize an SGE queue set context info
1086  *	@qs: the queue set
1087  *	@id: the queue set id
1088  *
1089  *	Initializes the TIDs and context ids for the queues of a queue set.
1090  */
1091 static void
1092 init_qset_cntxt(struct sge_qset *qs, u_int id)
1093 {
1094 
1095 	qs->rspq.cntxt_id = id;
1096 	qs->fl[0].cntxt_id = 2 * id;
1097 	qs->fl[1].cntxt_id = 2 * id + 1;
1098 	qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id;
1099 	qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id;
1100 	qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id;
1101 	qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id;
1102 	qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id;
1103 
1104 	mbufq_init(&qs->txq[TXQ_ETH].sendq);
1105 	mbufq_init(&qs->txq[TXQ_OFLD].sendq);
1106 	mbufq_init(&qs->txq[TXQ_CTRL].sendq);
1107 }
1108 
1109 
1110 static void
1111 txq_prod(struct sge_txq *txq, unsigned int ndesc, struct txq_state *txqs)
1112 {
1113 	txq->in_use += ndesc;
1114 	/*
1115 	 * XXX we don't handle stopping of the queue; presumably start
1116 	 * handles this when we bump against the end.
1117 	 */
1118 	txqs->gen = txq->gen;
1119 	txq->unacked += ndesc;
1120 	txqs->compl = (txq->unacked & 32) << (S_WR_COMPL - 5);
1121 	txq->unacked &= 31;
1122 	txqs->pidx = txq->pidx;
1123 	txq->pidx += ndesc;
1124 #ifdef INVARIANTS
1125 	if (((txqs->pidx > txq->cidx) &&
1126 		(txq->pidx < txqs->pidx) &&
1127 		(txq->pidx >= txq->cidx)) ||
1128 	    ((txqs->pidx < txq->cidx) &&
1129 		(txq->pidx >= txq->cidx)) ||
1130 	    ((txqs->pidx < txq->cidx) &&
1131 		(txq->cidx < txqs->pidx)))
1132 		panic("txqs->pidx=%d txq->pidx=%d txq->cidx=%d",
1133 		    txqs->pidx, txq->pidx, txq->cidx);
1134 #endif
1135 	if (txq->pidx >= txq->size) {
1136 		txq->pidx -= txq->size;
1137 		txq->gen ^= 1;
1138 	}
1139 
1140 }
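/*
 * Note: the compl calculation above requests a work request completion
 * whenever the running unacked count crosses 32 descriptors, so the SGE
 * reports Tx progress roughly once per 32 descriptors rather than per packet.
 */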
1141 
1142 /**
1143  *	calc_tx_descs - calculate the number of Tx descriptors for a packet
1144  *	@m: the packet mbufs
1145  *      @nsegs: the number of segments
1146  *
1147  * 	Returns the number of Tx descriptors needed for the given Ethernet
1148  * 	packet.  Ethernet packets require addition of WR and CPL headers.
1149  */
1150 static __inline unsigned int
1151 calc_tx_descs(const struct mbuf *m, int nsegs)
1152 {
1153 	unsigned int flits;
1154 
1155 	if (m->m_pkthdr.len <= PIO_LEN)
1156 		return 1;
1157 
1158 	flits = sgl_len(nsegs) + 2;
1159 	if (m->m_pkthdr.csum_flags & CSUM_TSO)
1160 		flits++;
1161 
1162 	return flits_to_desc(flits);
1163 }
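/*
 * Rough example using the helpers above: a non-TSO packet larger than PIO_LEN
 * that maps to 3 DMA segments needs sgl_len(3) + 2 == 7 flits, and
 * flit_desc_map[7] == 1, so it fits in a single Tx descriptor; TSO adds one
 * more flit for the LSO information.
 */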
1164 
1165 static unsigned int
1166 busdma_map_mbufs(struct mbuf **m, struct sge_txq *txq,
1167     struct tx_sw_desc *txsd, bus_dma_segment_t *segs, int *nsegs)
1168 {
1169 	struct mbuf *m0;
1170 	int err, pktlen, pass = 0;
1171 	bus_dma_tag_t tag = txq->entry_tag;
1172 
1173 retry:
1174 	err = 0;
1175 	m0 = *m;
1176 	pktlen = m0->m_pkthdr.len;
1177 #if defined(__i386__) || defined(__amd64__)
1178 	if (busdma_map_sg_collapse(tag, txsd->map, m, segs, nsegs) == 0) {
1179 		goto done;
1180 	} else
1181 #endif
1182 		err = bus_dmamap_load_mbuf_sg(tag, txsd->map, m0, segs, nsegs, 0);
1183 
1184 	if (err == 0) {
1185 		goto done;
1186 	}
1187 	if (err == EFBIG && pass == 0) {
1188 		pass = 1;
1189 		/* Too many segments, try to defrag */
1190 		m0 = m_defrag(m0, M_DONTWAIT);
1191 		if (m0 == NULL) {
1192 			m_freem(*m);
1193 			*m = NULL;
1194 			return (ENOBUFS);
1195 		}
1196 		*m = m0;
1197 		goto retry;
1198 	} else if (err == ENOMEM) {
1199 		return (err);
1200 	} else if (err) {
1201 		if (cxgb_debug)
1202 			printf("map failure err=%d pktlen=%d\n", err, pktlen);
1203 		m_freem(m0);
1204 		*m = NULL;
1205 		return (err);
1206 	}
1207 done:
1208 #if !defined(__i386__) && !defined(__amd64__)
1209 	bus_dmamap_sync(tag, txsd->map, BUS_DMASYNC_PREWRITE);
1210 #endif
1211 	txsd->flags |= TX_SW_DESC_MAPPED;
1212 
1213 	return (0);
1214 }
1215 
1216 /**
1217  *	make_sgl - populate a scatter/gather list for a packet
1218  *	@sgp: the SGL to populate
1219  *	@segs: the packet dma segments
1220  *	@nsegs: the number of segments
1221  *
1222  *	Generates a scatter/gather list for the buffers that make up a packet.
1223  *	The caller must size the SGL appropriately in advance; sgl_len() gives
1224  *	its size in 8-byte words (flits).
1225  */
1226 static __inline void
1227 make_sgl(struct sg_ent *sgp, bus_dma_segment_t *segs, int nsegs)
1228 {
1229 	int i, idx;
1230 
1231 	for (idx = 0, i = 0; i < nsegs; i++) {
1232 		/*
1233 		 * firmware doesn't like empty segments
1234 		 */
1235 		if (segs[i].ds_len == 0)
1236 			continue;
1237 		if (i && idx == 0)
1238 			++sgp;
1239 
1240 		sgp->len[idx] = htobe32(segs[i].ds_len);
1241 		sgp->addr[idx] = htobe64(segs[i].ds_addr);
1242 		idx ^= 1;
1243 	}
1244 
1245 	if (idx) {
1246 		sgp->len[idx] = 0;
1247 		sgp->addr[idx] = 0;
1248 	}
1249 }
1250 
1251 /**
1252  *	check_ring_tx_db - check and potentially ring a Tx queue's doorbell
1253  *	@adap: the adapter
1254  *	@q: the Tx queue
1255  *
1256  *	Ring the doorbell if a Tx queue is asleep.  There is a natural race
1257  *	where the HW may go to sleep just after we check; in that case the
1258  *	interrupt handler will detect the outstanding TX packet and ring the
1259  *	doorbell for us.
1260  *
1261  *	When GTS is disabled we unconditionally ring the doorbell.
1262  */
1263 static __inline void
1264 check_ring_tx_db(adapter_t *adap, struct sge_txq *q)
1265 {
1266 #if USE_GTS
1267 	clear_bit(TXQ_LAST_PKT_DB, &q->flags);
1268 	if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) {
1269 		set_bit(TXQ_LAST_PKT_DB, &q->flags);
1270 #ifdef T3_TRACE
1271 		T3_TRACE1(adap->tb[q->cntxt_id & 7], "doorbell Tx, cntxt %d",
1272 			  q->cntxt_id);
1273 #endif
1274 		t3_write_reg(adap, A_SG_KDOORBELL,
1275 			     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1276 	}
1277 #else
1278 	wmb();            /* write descriptors before telling HW */
1279 	t3_write_reg(adap, A_SG_KDOORBELL,
1280 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1281 #endif
1282 }
1283 
1284 static __inline void
1285 wr_gen2(struct tx_desc *d, unsigned int gen)
1286 {
1287 #if SGE_NUM_GENBITS == 2
1288 	d->flit[TX_DESC_FLITS - 1] = htobe64(gen);
1289 #endif
1290 }
1291 
1292 /**
1293  *	write_wr_hdr_sgl - write a WR header and, optionally, SGL
1294  *	@ndesc: number of Tx descriptors spanned by the SGL
1295  *	@txd: first Tx descriptor to be written
1296  *	@txqs: txq state (generation and producer index)
1297  *	@txq: the SGE Tx queue
1298  *	@sgl: the SGL
1299  *	@flits: number of flits to the start of the SGL in the first descriptor
1300  *	@sgl_flits: the SGL size in flits
1301  *	@wr_hi: top 32 bits of WR header based on WR type (big endian)
1302  *	@wr_lo: low 32 bits of WR header based on WR type (big endian)
1303  *
1304  *	Write a work request header and an associated SGL.  If the SGL is
1305  *	small enough to fit into one Tx descriptor it has already been written
1306  *	and we just need to write the WR header.  Otherwise we distribute the
1307  *	SGL across the number of descriptors it spans.
1308  */
1309 static void
1310 write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs,
1311     const struct sge_txq *txq, const struct sg_ent *sgl, unsigned int flits,
1312     unsigned int sgl_flits, unsigned int wr_hi, unsigned int wr_lo)
1313 {
1314 
1315 	struct work_request_hdr *wrp = (struct work_request_hdr *)txd;
1316 	struct tx_sw_desc *txsd = &txq->sdesc[txqs->pidx];
1317 
1318 	if (__predict_true(ndesc == 1)) {
1319 		set_wr_hdr(wrp, htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1320 			V_WR_SGLSFLT(flits)) | wr_hi,
1321 		    htonl(V_WR_LEN(flits + sgl_flits) |
1322 			V_WR_GEN(txqs->gen)) | wr_lo);
1323 		/* XXX gen? */
1324 		wr_gen2(txd, txqs->gen);
1325 
1326 	} else {
1327 		unsigned int ogen = txqs->gen;
1328 		const uint64_t *fp = (const uint64_t *)sgl;
1329 		struct work_request_hdr *wp = wrp;
1330 
1331 		wrp->wrh_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) |
1332 		    V_WR_SGLSFLT(flits)) | wr_hi;
1333 
1334 		while (sgl_flits) {
1335 			unsigned int avail = WR_FLITS - flits;
1336 
1337 			if (avail > sgl_flits)
1338 				avail = sgl_flits;
1339 			memcpy(&txd->flit[flits], fp, avail * sizeof(*fp));
1340 			sgl_flits -= avail;
1341 			ndesc--;
1342 			if (!sgl_flits)
1343 				break;
1344 
1345 			fp += avail;
1346 			txd++;
1347 			txsd++;
1348 			if (++txqs->pidx == txq->size) {
1349 				txqs->pidx = 0;
1350 				txqs->gen ^= 1;
1351 				txd = txq->desc;
1352 				txsd = txq->sdesc;
1353 			}
1354 
1355 			/*
1356 			 * when the head of the mbuf chain
1357 			 * is freed all clusters will be freed
1358 			 * with it
1359 			 */
1360 			wrp = (struct work_request_hdr *)txd;
1361 			wrp->wrh_hi = htonl(V_WR_DATATYPE(1) |
1362 			    V_WR_SGLSFLT(1)) | wr_hi;
1363 			wrp->wrh_lo = htonl(V_WR_LEN(min(WR_FLITS,
1364 				    sgl_flits + 1)) |
1365 			    V_WR_GEN(txqs->gen)) | wr_lo;
1366 			wr_gen2(txd, txqs->gen);
1367 			flits = 1;
1368 		}
1369 		wrp->wrh_hi |= htonl(F_WR_EOP);
1370 		wmb();
1371 		wp->wrh_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo;
1372 		wr_gen2((struct tx_desc *)wp, ogen);
1373 	}
1374 }
1375 
1376 /* sizeof(*eh) + sizeof(*ip) + sizeof(*tcp) */
1377 #define TCPPKTHDRSIZE (ETHER_HDR_LEN + 20 + 20)
1378 
1379 #define GET_VTAG(cntrl, m) \
1380 do { \
1381 	if ((m)->m_flags & M_VLANTAG)					            \
1382 		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN((m)->m_pkthdr.ether_vtag); \
1383 } while (0)
1384 
1385 static int
1386 t3_encap(struct sge_qset *qs, struct mbuf **m)
1387 {
1388 	adapter_t *sc;
1389 	struct mbuf *m0;
1390 	struct sge_txq *txq;
1391 	struct txq_state txqs;
1392 	struct port_info *pi;
1393 	unsigned int ndesc, flits, cntrl, mlen;
1394 	int err, nsegs, tso_info = 0;
1395 
1396 	struct work_request_hdr *wrp;
1397 	struct tx_sw_desc *txsd;
1398 	struct sg_ent *sgp, *sgl;
1399 	uint32_t wr_hi, wr_lo, sgl_flits;
1400 	bus_dma_segment_t segs[TX_MAX_SEGS];
1401 
1402 	struct tx_desc *txd;
1403 
1404 	pi = qs->port;
1405 	sc = pi->adapter;
1406 	txq = &qs->txq[TXQ_ETH];
1407 	txd = &txq->desc[txq->pidx];
1408 	txsd = &txq->sdesc[txq->pidx];
1409 	sgl = txq->txq_sgl;
1410 
1411 	prefetch(txd);
1412 	m0 = *m;
1413 
1414 	mtx_assert(&qs->lock, MA_OWNED);
1415 	cntrl = V_TXPKT_INTF(pi->txpkt_intf);
1416 	KASSERT(m0->m_flags & M_PKTHDR, ("not packet header\n"));
1417 
1418 	if (m0->m_nextpkt == NULL && m0->m_next != NULL &&
1419 	    m0->m_pkthdr.csum_flags & (CSUM_TSO))
1420 		tso_info = V_LSO_MSS(m0->m_pkthdr.tso_segsz);
1421 
1422 	if (m0->m_nextpkt != NULL) {
1423 		busdma_map_sg_vec(txq->entry_tag, txsd->map, m0, segs, &nsegs);
1424 		ndesc = 1;
1425 		mlen = 0;
1426 	} else {
1427 		if ((err = busdma_map_sg_collapse(txq->entry_tag, txsd->map,
1428 		    &m0, segs, &nsegs))) {
1429 			if (cxgb_debug)
1430 				printf("failed ... err=%d\n", err);
1431 			return (err);
1432 		}
1433 		mlen = m0->m_pkthdr.len;
1434 		ndesc = calc_tx_descs(m0, nsegs);
1435 	}
1436 	txq_prod(txq, ndesc, &txqs);
1437 
1438 	KASSERT(m0->m_pkthdr.len, ("empty packet nsegs=%d", nsegs));
1439 	txsd->m = m0;
1440 
1441 	if (m0->m_nextpkt != NULL) {
1442 		struct cpl_tx_pkt_batch *cpl_batch = (struct cpl_tx_pkt_batch *)txd;
1443 		int i, fidx;
1444 
1445 		if (nsegs > 7)
1446 			panic("trying to coalesce %d packets into one WR", nsegs);
1447 		txq->txq_coalesced += nsegs;
1448 		wrp = (struct work_request_hdr *)txd;
1449 		flits = nsegs*2 + 1;
1450 
1451 		for (fidx = 1, i = 0; i < nsegs; i++, fidx += 2) {
1452 			struct cpl_tx_pkt_batch_entry *cbe;
1453 			uint64_t flit;
1454 			uint32_t *hflit = (uint32_t *)&flit;
1455 			int cflags = m0->m_pkthdr.csum_flags;
1456 
1457 			cntrl = V_TXPKT_INTF(pi->txpkt_intf);
1458 			GET_VTAG(cntrl, m0);
1459 			cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1460 			if (__predict_false(!(cflags & CSUM_IP)))
1461 				cntrl |= F_TXPKT_IPCSUM_DIS;
1462 			if (__predict_false(!(cflags & (CSUM_TCP | CSUM_UDP))))
1463 				cntrl |= F_TXPKT_L4CSUM_DIS;
1464 
1465 			hflit[0] = htonl(cntrl);
1466 			hflit[1] = htonl(segs[i].ds_len | 0x80000000);
1467 			flit |= htobe64(1 << 24);
1468 			cbe = &cpl_batch->pkt_entry[i];
1469 			cbe->cntrl = hflit[0];
1470 			cbe->len = hflit[1];
1471 			cbe->addr = htobe64(segs[i].ds_addr);
1472 		}
1473 
1474 		wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1475 		    V_WR_SGLSFLT(flits)) |
1476 		    htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1477 		wr_lo = htonl(V_WR_LEN(flits) |
1478 		    V_WR_GEN(txqs.gen)) | htonl(V_WR_TID(txq->token));
1479 		set_wr_hdr(wrp, wr_hi, wr_lo);
1480 		wmb();
1481 		ETHER_BPF_MTAP(pi->ifp, m0);
1482 		wr_gen2(txd, txqs.gen);
1483 		check_ring_tx_db(sc, txq);
1484 		return (0);
1485 	} else if (tso_info) {
1486 		int eth_type;
1487 		struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *)txd;
1488 		struct ether_header *eh;
1489 		struct ip *ip;
1490 		struct tcphdr *tcp;
1491 
1492 		txd->flit[2] = 0;
1493 		GET_VTAG(cntrl, m0);
1494 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO);
1495 		hdr->cntrl = htonl(cntrl);
1496 		hdr->len = htonl(mlen | 0x80000000);
1497 
1498 		if (__predict_false(mlen < TCPPKTHDRSIZE)) {
1499 			printf("mbuf=%p,len=%d,tso_segsz=%d,csum_flags=%#x,flags=%#x",
1500 			    m0, mlen, m0->m_pkthdr.tso_segsz,
1501 			    m0->m_pkthdr.csum_flags, m0->m_flags);
1502 			panic("tx tso packet too small");
1503 		}
1504 
1505 		/* Make sure that ether, ip, tcp headers are all in m0 */
1506 		if (__predict_false(m0->m_len < TCPPKTHDRSIZE)) {
1507 			m0 = m_pullup(m0, TCPPKTHDRSIZE);
1508 			if (__predict_false(m0 == NULL)) {
1509 				/* XXX panic probably an overreaction */
1510 				panic("couldn't fit header into mbuf");
1511 			}
1512 		}
1513 
1514 		eh = mtod(m0, struct ether_header *);
1515 		if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
1516 			eth_type = CPL_ETH_II_VLAN;
1517 			ip = (struct ip *)((struct ether_vlan_header *)eh + 1);
1518 		} else {
1519 			eth_type = CPL_ETH_II;
1520 			ip = (struct ip *)(eh + 1);
1521 		}
1522 		tcp = (struct tcphdr *)(ip + 1);
1523 
1524 		tso_info |= V_LSO_ETH_TYPE(eth_type) |
1525 			    V_LSO_IPHDR_WORDS(ip->ip_hl) |
1526 			    V_LSO_TCPHDR_WORDS(tcp->th_off);
1527 		hdr->lso_info = htonl(tso_info);
1528 
1529 		if (__predict_false(mlen <= PIO_LEN)) {
1530 			/*
1531 			 * Packet is not undersized but still fits in PIO_LEN;
1532 			 * this indicates a TSO bug at the higher levels.
1533 			 */
1534 			txsd->m = NULL;
1535 			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[3]);
1536 			flits = (mlen + 7) / 8 + 3;
1537 			wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1538 					  V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1539 					  F_WR_SOP | F_WR_EOP | txqs.compl);
1540 			wr_lo = htonl(V_WR_LEN(flits) |
1541 			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1542 			set_wr_hdr(&hdr->wr, wr_hi, wr_lo);
1543 			wmb();
1544 			ETHER_BPF_MTAP(pi->ifp, m0);
1545 			wr_gen2(txd, txqs.gen);
1546 			check_ring_tx_db(sc, txq);
1547 			m_freem(m0);
1548 			return (0);
1549 		}
1550 		flits = 3;
1551 	} else {
1552 		struct cpl_tx_pkt *cpl = (struct cpl_tx_pkt *)txd;
1553 
1554 		GET_VTAG(cntrl, m0);
1555 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1556 		if (__predict_false(!(m0->m_pkthdr.csum_flags & CSUM_IP)))
1557 			cntrl |= F_TXPKT_IPCSUM_DIS;
1558 		if (__predict_false(!(m0->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP))))
1559 			cntrl |= F_TXPKT_L4CSUM_DIS;
1560 		cpl->cntrl = htonl(cntrl);
1561 		cpl->len = htonl(mlen | 0x80000000);
1562 
1563 		if (mlen <= PIO_LEN) {
1564 			txsd->m = NULL;
1565 			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[2]);
1566 			flits = (mlen + 7) / 8 + 2;
1567 
1568 			wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1569 			    V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1570 					  F_WR_SOP | F_WR_EOP | txqs.compl);
1571 			wr_lo = htonl(V_WR_LEN(flits) |
1572 			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1573 			set_wr_hdr(&cpl->wr, wr_hi, wr_lo);
1574 			wmb();
1575 			ETHER_BPF_MTAP(pi->ifp, m0);
1576 			wr_gen2(txd, txqs.gen);
1577 			check_ring_tx_db(sc, txq);
1578 			m_freem(m0);
1579 			return (0);
1580 		}
1581 		flits = 2;
1582 	}
1583 	wrp = (struct work_request_hdr *)txd;
1584 	sgp = (ndesc == 1) ? (struct sg_ent *)&txd->flit[flits] : sgl;
1585 	make_sgl(sgp, segs, nsegs);
1586 
1587 	sgl_flits = sgl_len(nsegs);
1588 
1589 	ETHER_BPF_MTAP(pi->ifp, m0);
1590 
1591 	KASSERT(ndesc <= 4, ("ndesc too large %d", ndesc));
1592 	wr_hi = htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1593 	wr_lo = htonl(V_WR_TID(txq->token));
1594 	write_wr_hdr_sgl(ndesc, txd, &txqs, txq, sgl, flits,
1595 	    sgl_flits, wr_hi, wr_lo);
1596 	check_ring_tx_db(sc, txq);
1597 
1598 	return (0);
1599 }
1600 
1601 void
1602 cxgb_tx_watchdog(void *arg)
1603 {
1604 	struct sge_qset *qs = arg;
1605 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1606 
1607         if (qs->coalescing != 0 &&
1608 	    (txq->in_use <= cxgb_tx_coalesce_enable_stop) &&
1609 	    TXQ_RING_EMPTY(qs))
1610                 qs->coalescing = 0;
1611         else if (qs->coalescing == 0 &&
1612 	    (txq->in_use >= cxgb_tx_coalesce_enable_start))
1613                 qs->coalescing = 1;
1614 	if (TXQ_TRYLOCK(qs)) {
1615 		qs->qs_flags |= QS_FLUSHING;
1616 		cxgb_start_locked(qs);
1617 		qs->qs_flags &= ~QS_FLUSHING;
1618 		TXQ_UNLOCK(qs);
1619 	}
1620 	if (qs->port->ifp->if_drv_flags & IFF_DRV_RUNNING)
1621 		callout_reset_on(&txq->txq_watchdog, hz/4, cxgb_tx_watchdog,
1622 		    qs, txq->txq_watchdog.c_cpu);
1623 }
1624 
1625 static void
1626 cxgb_tx_timeout(void *arg)
1627 {
1628 	struct sge_qset *qs = arg;
1629 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1630 
1631 	if (qs->coalescing == 0 && (txq->in_use >= (txq->size>>3)))
1632                 qs->coalescing = 1;
1633 	if (TXQ_TRYLOCK(qs)) {
1634 		qs->qs_flags |= QS_TIMEOUT;
1635 		cxgb_start_locked(qs);
1636 		qs->qs_flags &= ~QS_TIMEOUT;
1637 		TXQ_UNLOCK(qs);
1638 	}
1639 }
1640 
1641 static void
1642 cxgb_start_locked(struct sge_qset *qs)
1643 {
1644 	struct mbuf *m_head = NULL;
1645 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1646 	int in_use_init = txq->in_use;
1647 	struct port_info *pi = qs->port;
1648 	struct ifnet *ifp = pi->ifp;
1649 
1650 	if (qs->qs_flags & (QS_FLUSHING|QS_TIMEOUT))
1651 		reclaim_completed_tx(qs, 0, TXQ_ETH);
1652 
1653 	if (!pi->link_config.link_ok) {
1654 		TXQ_RING_FLUSH(qs);
1655 		return;
1656 	}
1657 	TXQ_LOCK_ASSERT(qs);
1658 	while ((txq->in_use - in_use_init < TX_START_MAX_DESC) &&
1659 	    !TXQ_RING_EMPTY(qs) && (ifp->if_drv_flags & IFF_DRV_RUNNING) &&
1660 	    pi->link_config.link_ok) {
1661 		reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH);
1662 
1663 		if (txq->size - txq->in_use <= TX_MAX_DESC)
1664 			break;
1665 
1666 		if ((m_head = cxgb_dequeue(qs)) == NULL)
1667 			break;
1668 		/*
1669 		 *  Encapsulation can modify our pointer, and/or make it
1670 		 *  NULL on failure.  In that event, we can't requeue.
1671 		 */
1672 		if (t3_encap(qs, &m_head) || m_head == NULL)
1673 			break;
1674 
1675 		m_head = NULL;
1676 	}
1677 	if (!TXQ_RING_EMPTY(qs) && callout_pending(&txq->txq_timer) == 0 &&
1678 	    pi->link_config.link_ok)
1679 		callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout,
1680 		    qs, txq->txq_timer.c_cpu);
1681 	if (m_head != NULL)
1682 		m_freem(m_head);
1683 }
1684 
1685 static int
1686 cxgb_transmit_locked(struct ifnet *ifp, struct sge_qset *qs, struct mbuf *m)
1687 {
1688 	struct port_info *pi = qs->port;
1689 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1690 	struct buf_ring *br = txq->txq_mr;
1691 	int error, avail;
1692 
1693 	avail = txq->size - txq->in_use;
1694 	TXQ_LOCK_ASSERT(qs);
1695 
1696 	/*
1697 	 * We can only do a direct transmit if the following are true:
1698 	 * - we aren't coalescing (ring < 3/4 full)
1699 	 * - the link is up -- checked in caller
1700 	 * - there are no packets enqueued already
1701 	 * - there is space in hardware transmit queue
1702 	 */
1703 	if (check_pkt_coalesce(qs) == 0 &&
1704 	    !TXQ_RING_NEEDS_ENQUEUE(qs) && avail > TX_MAX_DESC) {
1705 		if (t3_encap(qs, &m)) {
1706 			if (m != NULL &&
1707 			    (error = drbr_enqueue(ifp, br, m)) != 0)
1708 				return (error);
1709 		} else {
1710 			/*
1711 			 * We've bypassed the buf ring so we need to update
1712 			 * the stats directly
1713 			 */
1714 			txq->txq_direct_packets++;
1715 			txq->txq_direct_bytes += m->m_pkthdr.len;
1716 		}
1717 	} else if ((error = drbr_enqueue(ifp, br, m)) != 0)
1718 		return (error);
1719 
1720 	reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH);
1721 	if (!TXQ_RING_EMPTY(qs) && pi->link_config.link_ok &&
1722 	    (!check_pkt_coalesce(qs) || (drbr_inuse(ifp, br) >= 7)))
1723 		cxgb_start_locked(qs);
1724 	else if (!TXQ_RING_EMPTY(qs) && !callout_pending(&txq->txq_timer))
1725 		callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout,
1726 		    qs, txq->txq_timer.c_cpu);
1727 	return (0);
1728 }
1729 
1730 int
1731 cxgb_transmit(struct ifnet *ifp, struct mbuf *m)
1732 {
1733 	struct sge_qset *qs;
1734 	struct port_info *pi = ifp->if_softc;
1735 	int error, qidx = pi->first_qset;
1736 
1737 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
1738 	    !pi->link_config.link_ok) {
1739 		m_freem(m);
1740 		return (0);
1741 	}
1742 
1743 	if (m->m_flags & M_FLOWID)
1744 		qidx = (m->m_pkthdr.flowid % pi->nqsets) + pi->first_qset;
1745 
1746 	qs = &pi->adapter->sge.qs[qidx];
1747 
1748 	if (TXQ_TRYLOCK(qs)) {
1749 		/* XXX running */
1750 		error = cxgb_transmit_locked(ifp, qs, m);
1751 		TXQ_UNLOCK(qs);
1752 	} else
1753 		error = drbr_enqueue(ifp, qs->txq[TXQ_ETH].txq_mr, m);
1754 	return (error);
1755 }

1756 void
1757 cxgb_start(struct ifnet *ifp)
1758 {
1759 	struct port_info *pi = ifp->if_softc;
1760 	struct sge_qset *qs = &pi->adapter->sge.qs[pi->first_qset];
1761 
1762 	if (!pi->link_config.link_ok)
1763 		return;
1764 
1765 	TXQ_LOCK(qs);
1766 	cxgb_start_locked(qs);
1767 	TXQ_UNLOCK(qs);
1768 }
1769 
1770 void
1771 cxgb_qflush(struct ifnet *ifp)
1772 {
1773 	/*
1774 	 * Flush any enqueued mbufs in the buf_rings
1775 	 * and in the transmit queues.
1776 	 * This is currently a no-op.
1777 	 */
1778 	return;
1779 }
1780 
1781 /**
1782  *	write_imm - write a packet into a Tx descriptor as immediate data
1783  *	@d: the Tx descriptor to write
1784  *	@m: the packet
1785  *	@len: the length of packet data to write as immediate data
1786  *	@gen: the generation bit value to write
1787  *
1788  *	Writes a packet as immediate data into a Tx descriptor.  The packet
1789  *	contains a work request at its beginning.  We must write the packet
1790  *	carefully so the SGE doesn't accidentally read it before it has been
1791  *	written in its entirety.
1792  */
1793 static __inline void
1794 write_imm(struct tx_desc *d, struct mbuf *m,
1795 	  unsigned int len, unsigned int gen)
1796 {
1797 	struct work_request_hdr *from = mtod(m, struct work_request_hdr *);
1798 	struct work_request_hdr *to = (struct work_request_hdr *)d;
1799 	uint32_t wr_hi, wr_lo;
1800 
1801 	if (len > WR_LEN)
1802 		panic("len too big %d\n", len);
1803 	if (len < sizeof(*from))
1804 		panic("len too small %d", len);
1805 
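	/*
	 * Copy the WR body first; the header (with SOP/EOP and the generation
	 * bit) is written last so the SGE never sees a partially built
	 * descriptor.
	 */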
1806 	memcpy(&to[1], &from[1], len - sizeof(*from));
1807 	wr_hi = from->wrh_hi | htonl(F_WR_SOP | F_WR_EOP |
1808 					V_WR_BCNTLFLT(len & 7));
1809 	wr_lo = from->wrh_lo | htonl(V_WR_GEN(gen) |
1810 					V_WR_LEN((len + 7) / 8));
1811 	set_wr_hdr(to, wr_hi, wr_lo);
1812 	wmb();
1813 	wr_gen2(d, gen);
1814 
1815 	/*
1816 	 * This check is a hack; we should really fix the logic so
1817 	 * that this can't happen.
1818 	 */
1819 	if (m->m_type != MT_DONTFREE)
1820 		m_freem(m);
1821 
1822 }
1823 
1824 /**
1825  *	check_desc_avail - check descriptor availability on a send queue
1826  *	@adap: the adapter
1827  *	@q: the TX queue
1828  *	@m: the packet needing the descriptors
1829  *	@ndesc: the number of Tx descriptors needed
1830  *	@qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL)
1831  *
1832  *	Checks if the requested number of Tx descriptors is available on an
1833  *	SGE send queue.  If the queue is already suspended or not enough
1834  *	descriptors are available the packet is queued for later transmission.
1835  *	Must be called with the Tx queue locked.
1836  *
1837  *	Returns 0 if enough descriptors are available, 1 if there aren't
1838  *	enough descriptors and the packet has been queued, and 2 if the caller
1839  *	needs to retry because there weren't enough descriptors at the
1840  *	beginning of the call but some freed up in the meantime.
1841  */
1842 static __inline int
1843 check_desc_avail(adapter_t *adap, struct sge_txq *q,
1844 		 struct mbuf *m, unsigned int ndesc,
1845 		 unsigned int qid)
1846 {
1847 	/*
1848 	 * XXX We currently only use this for checking the control queue;
1849 	 * the control queue is only used for binding qsets, which happens
1850 	 * at init time, so we are guaranteed enough descriptors.
1851 	 */
1852 	if (__predict_false(!mbufq_empty(&q->sendq))) {
1853 addq_exit:	mbufq_tail(&q->sendq, m);
1854 		return 1;
1855 	}
1856 	if (__predict_false(q->size - q->in_use < ndesc)) {
1857 
1858 		struct sge_qset *qs = txq_to_qset(q, qid);
1859 
1860 		setbit(&qs->txq_stopped, qid);
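		/*
		 * Re-check after marking the queue stopped; if descriptors
		 * freed up in the meantime, clear the stop bit and tell the
		 * caller to retry rather than queueing the packet.
		 */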
1861 		if (should_restart_tx(q) &&
1862 		    test_and_clear_bit(qid, &qs->txq_stopped))
1863 			return 2;
1864 
1865 		q->stops++;
1866 		goto addq_exit;
1867 	}
1868 	return 0;
1869 }
1870 
1871 
1872 /**
1873  *	reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
1874  *	@q: the SGE control Tx queue
1875  *
1876  *	This is a variant of reclaim_completed_tx() that is used for Tx queues
1877  *	that send only immediate data (presently just the control queues) and
1878  *	thus do not have any mbufs
1879  *	thus do not have any mbufs.
1880 static __inline void
1881 reclaim_completed_tx_imm(struct sge_txq *q)
1882 {
1883 	unsigned int reclaim = q->processed - q->cleaned;
1884 
1885 	q->in_use -= reclaim;
1886 	q->cleaned += reclaim;
1887 }
1888 
1889 static __inline int
1890 immediate(const struct mbuf *m)
1891 {
1892 	return (m->m_len <= WR_LEN && m->m_pkthdr.len <= WR_LEN);
1893 }
1894 
1895 /**
1896  *	ctrl_xmit - send a packet through an SGE control Tx queue
1897  *	@adap: the adapter
1898  *	@qs: the queue set containing the control queue
1899  *	@m: the packet
1900  *
1901  *	Send a packet through an SGE control Tx queue.  Packets sent through
1902  *	a control queue must fit entirely as immediate data in a single Tx
1903  *	descriptor and have no page fragments.
1904  */
1905 static int
1906 ctrl_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m)
1907 {
1908 	int ret;
1909 	struct work_request_hdr *wrp = mtod(m, struct work_request_hdr *);
1910 	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1911 
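	/*
	 * Control-queue packets must fit entirely as immediate data in a
	 * single descriptor; anything larger is dropped here.
	 */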
1912 	if (__predict_false(!immediate(m))) {
1913 		m_freem(m);
1914 		return 0;
1915 	}
1916 
1917 	wrp->wrh_hi |= htonl(F_WR_SOP | F_WR_EOP);
1918 	wrp->wrh_lo = htonl(V_WR_TID(q->token));
1919 
1920 	TXQ_LOCK(qs);
1921 again:	reclaim_completed_tx_imm(q);
1922 
1923 	ret = check_desc_avail(adap, q, m, 1, TXQ_CTRL);
1924 	if (__predict_false(ret)) {
1925 		if (ret == 1) {
1926 			TXQ_UNLOCK(qs);
1927 			return (ENOSPC);
1928 		}
1929 		goto again;
1930 	}
1931 	write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
1932 
1933 	q->in_use++;
1934 	if (++q->pidx >= q->size) {
1935 		q->pidx = 0;
1936 		q->gen ^= 1;
1937 	}
1938 	TXQ_UNLOCK(qs);
1939 	wmb();
1940 	t3_write_reg(adap, A_SG_KDOORBELL,
1941 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1942 	return (0);
1943 }
1944 
1945 
1946 /**
1947  *	restart_ctrlq - restart a suspended control queue
1948  *	@qs: the queue set containing the control queue
1949  *
1950  *	Resumes transmission on a suspended Tx control queue.
1951  */
1952 static void
1953 restart_ctrlq(void *data, int npending)
1954 {
1955 	struct mbuf *m;
1956 	struct sge_qset *qs = (struct sge_qset *)data;
1957 	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1958 	adapter_t *adap = qs->port->adapter;
1959 
1960 	TXQ_LOCK(qs);
1961 again:	reclaim_completed_tx_imm(q);
1962 
1963 	while (q->in_use < q->size &&
1964 	       (m = mbufq_dequeue(&q->sendq)) != NULL) {
1965 
1966 		write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
1967 
1968 		if (++q->pidx >= q->size) {
1969 			q->pidx = 0;
1970 			q->gen ^= 1;
1971 		}
1972 		q->in_use++;
1973 	}
1974 	if (!mbufq_empty(&q->sendq)) {
1975 		setbit(&qs->txq_stopped, TXQ_CTRL);
1976 
1977 		if (should_restart_tx(q) &&
1978 		    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped))
1979 			goto again;
1980 		q->stops++;
1981 	}
1982 	TXQ_UNLOCK(qs);
1983 	t3_write_reg(adap, A_SG_KDOORBELL,
1984 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1985 }
1986 
1987 
1988 /*
1989  * Send a management message through control queue 0
1990  */
1991 int
1992 t3_mgmt_tx(struct adapter *adap, struct mbuf *m)
1993 {
1994 	return ctrl_xmit(adap, &adap->sge.qs[0], m);
1995 }
1996 
1997 /**
1998  *	free_qset - free the resources of an SGE queue set
1999  *	@sc: the controller owning the queue set
2000  *	@q: the queue set
2001  *
2002  *	Release the HW and SW resources associated with an SGE queue set, such
2003  *	as HW contexts, packet buffers, and descriptor rings.  Traffic to the
2004  *	queue set must be quiesced prior to calling this.
2005  */
2006 static void
2007 t3_free_qset(adapter_t *sc, struct sge_qset *q)
2008 {
2009 	int i;
2010 
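	/*
	 * The caller holds the qset's Tx lock; it is released and destroyed
	 * below once the Tx resources have been torn down.
	 */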
2011 	reclaim_completed_tx(q, 0, TXQ_ETH);
2012 	if (q->txq[TXQ_ETH].txq_mr != NULL)
2013 		buf_ring_free(q->txq[TXQ_ETH].txq_mr, M_DEVBUF);
2014 	if (q->txq[TXQ_ETH].txq_ifq != NULL) {
2015 		ifq_delete(q->txq[TXQ_ETH].txq_ifq);
2016 		free(q->txq[TXQ_ETH].txq_ifq, M_DEVBUF);
2017 	}
2018 
2019 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
2020 		if (q->fl[i].desc) {
2021 			mtx_lock_spin(&sc->sge.reg_lock);
2022 			t3_sge_disable_fl(sc, q->fl[i].cntxt_id);
2023 			mtx_unlock_spin(&sc->sge.reg_lock);
2024 			bus_dmamap_unload(q->fl[i].desc_tag, q->fl[i].desc_map);
2025 			bus_dmamem_free(q->fl[i].desc_tag, q->fl[i].desc,
2026 					q->fl[i].desc_map);
2027 			bus_dma_tag_destroy(q->fl[i].desc_tag);
2028 			bus_dma_tag_destroy(q->fl[i].entry_tag);
2029 		}
2030 		if (q->fl[i].sdesc) {
2031 			free_rx_bufs(sc, &q->fl[i]);
2032 			free(q->fl[i].sdesc, M_DEVBUF);
2033 		}
2034 	}
2035 
2036 	mtx_unlock(&q->lock);
2037 	MTX_DESTROY(&q->lock);
2038 	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
2039 		if (q->txq[i].desc) {
2040 			mtx_lock_spin(&sc->sge.reg_lock);
2041 			t3_sge_enable_ecntxt(sc, q->txq[i].cntxt_id, 0);
2042 			mtx_unlock_spin(&sc->sge.reg_lock);
2043 			bus_dmamap_unload(q->txq[i].desc_tag,
2044 					q->txq[i].desc_map);
2045 			bus_dmamem_free(q->txq[i].desc_tag, q->txq[i].desc,
2046 					q->txq[i].desc_map);
2047 			bus_dma_tag_destroy(q->txq[i].desc_tag);
2048 			bus_dma_tag_destroy(q->txq[i].entry_tag);
2049 		}
2050 		if (q->txq[i].sdesc) {
2051 			free(q->txq[i].sdesc, M_DEVBUF);
2052 		}
2053 	}
2054 
2055 	if (q->rspq.desc) {
2056 		mtx_lock_spin(&sc->sge.reg_lock);
2057 		t3_sge_disable_rspcntxt(sc, q->rspq.cntxt_id);
2058 		mtx_unlock_spin(&sc->sge.reg_lock);
2059 
2060 		bus_dmamap_unload(q->rspq.desc_tag, q->rspq.desc_map);
2061 		bus_dmamem_free(q->rspq.desc_tag, q->rspq.desc,
2062 			        q->rspq.desc_map);
2063 		bus_dma_tag_destroy(q->rspq.desc_tag);
2064 		MTX_DESTROY(&q->rspq.lock);
2065 	}
2066 
2067 #ifdef INET
2068 	tcp_lro_free(&q->lro.ctrl);
2069 #endif
2070 
2071 	bzero(q, sizeof(*q));
2072 }
2073 
2074 /**
2075  *	t3_free_sge_resources - free SGE resources
2076  *	@sc: the adapter softc
2077  *
2078  *	Frees resources used by the SGE queue sets.
2079  */
2080 void
2081 t3_free_sge_resources(adapter_t *sc)
2082 {
2083 	int i, nqsets;
2084 
2085 	for (nqsets = i = 0; i < (sc)->params.nports; i++)
2086 		nqsets += sc->port[i].nqsets;
2087 
2088 	for (i = 0; i < nqsets; ++i) {
2089 		TXQ_LOCK(&sc->sge.qs[i]);
2090 		t3_free_qset(sc, &sc->sge.qs[i]);
2091 	}
2092 
2093 }
2094 
2095 /**
2096  *	t3_sge_start - enable SGE
2097  *	@sc: the controller softc
2098  *
2099  *	Enables the SGE for DMAs.  This is the last step in starting packet
2100  *	transfers.
2101  */
2102 void
2103 t3_sge_start(adapter_t *sc)
2104 {
2105 	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE);
2106 }
2107 
2108 /**
2109  *	t3_sge_stop - disable SGE operation
2110  *	@sc: the adapter
2111  *
2112  *	Disables the DMA engine.  This can be called in emergencies (e.g.,
2113  *	from error interrupts) or from normal process context.  In the latter
2114  *	case it also disables any pending queue restart tasklets.  Note that
2115  *	if it is called in interrupt context it cannot disable the restart
2116  *	tasklets as it cannot wait; however, the tasklets will have no effect
2117  *	since the doorbells are disabled and the driver will call this again
2118  *	later from process context, at which time the tasklets will be stopped
2119  *	if they are still running.
2120  */
2121 void
2122 t3_sge_stop(adapter_t *sc)
2123 {
2124 	int i, nqsets;
2125 
2126 	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, 0);
2127 
2128 	if (sc->tq == NULL)
2129 		return;
2130 
2131 	for (nqsets = i = 0; i < (sc)->params.nports; i++)
2132 		nqsets += sc->port[i].nqsets;
2133 #ifdef notyet
2134 	/* XXX */
2138 	for (i = 0; i < nqsets; ++i) {
2139 		struct sge_qset *qs = &sc->sge.qs[i];
2140 
2141 		taskqueue_drain(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2142 		taskqueue_drain(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2143 	}
2144 #endif
2145 }
2146 
2147 /**
2148  *	t3_free_tx_desc - reclaims Tx descriptors and their buffers
2149  *	@qs: the queue set owning the Tx queue
2150  *	@reclaimable: the number of descriptors to reclaim
2151  *	@queue: the index of the Tx queue within the queue set
2154  *
2155  *	Reclaims Tx descriptors from an SGE Tx queue and frees the associated
2156  *	Tx buffers.  Called with the Tx queue lock held.
2159  */
2160 void
2161 t3_free_tx_desc(struct sge_qset *qs, int reclaimable, int queue)
2162 {
2163 	struct tx_sw_desc *txsd;
2164 	unsigned int cidx, mask;
2165 	struct sge_txq *q = &qs->txq[queue];
2166 
2167 #ifdef T3_TRACE
2168 	T3_TRACE2(qs->port->adapter->tb[q->cntxt_id & 7],
2169 		  "reclaiming %u Tx descriptors at cidx %u", reclaimable, q->cidx);
2170 #endif
2171 	cidx = q->cidx;
2172 	mask = q->size - 1;
2173 	txsd = &q->sdesc[cidx];
2174 
2175 	mtx_assert(&qs->lock, MA_OWNED);
2176 	while (reclaimable--) {
2177 		prefetch(q->sdesc[(cidx + 1) & mask].m);
2178 		prefetch(q->sdesc[(cidx + 2) & mask].m);
2179 
2180 		if (txsd->m != NULL) {
2181 			if (txsd->flags & TX_SW_DESC_MAPPED) {
2182 				bus_dmamap_unload(q->entry_tag, txsd->map);
2183 				txsd->flags &= ~TX_SW_DESC_MAPPED;
2184 			}
2185 			m_freem_list(txsd->m);
2186 			txsd->m = NULL;
2187 		} else
2188 			q->txq_skipped++;
2189 
2190 		++txsd;
2191 		if (++cidx == q->size) {
2192 			cidx = 0;
2193 			txsd = q->sdesc;
2194 		}
2195 	}
2196 	q->cidx = cidx;
2197 
2198 }
2199 
2200 /**
2201  *	is_new_response - check if a response is newly written
2202  *	@r: the response descriptor
2203  *	@q: the response queue
2204  *
2205  *	Returns true if a response descriptor contains a yet unprocessed
2206  *	response.
2207  */
2208 static __inline int
2209 is_new_response(const struct rsp_desc *r,
2210     const struct sge_rspq *q)
2211 {
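	/*
	 * The generation bit written by the hardware flips on every pass
	 * through the ring, so a descriptor holds a new response only when
	 * its generation matches the one the driver currently expects.
	 */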
2212 	return (r->intr_gen & F_RSPD_GEN2) == q->gen;
2213 }
2214 
2215 #define RSPD_GTS_MASK  (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS)
2216 #define RSPD_CTRL_MASK (RSPD_GTS_MASK | \
2217 			V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \
2218 			V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \
2219 			V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR))
2220 
2221 /* How long to delay the next interrupt in case of memory shortage, in 0.1us. */
2222 #define NOMEM_INTR_DELAY 2500
2223 
2224 /**
2225  *	write_ofld_wr - write an offload work request
2226  *	@adap: the adapter
2227  *	@m: the packet to send
2228  *	@q: the Tx queue
2229  *	@pidx: index of the first Tx descriptor to write
2230  *	@gen: the generation value to use
2231  *	@ndesc: number of descriptors the packet will occupy
 *	@segs: the DMA segments for the packet body
 *	@nsegs: the number of DMA segments
2232  *
2233  *	Write an offload work request to send the supplied packet.  The packet
2234  *	data already carry the work request with most fields populated.
2235  */
2236 static void
2237 write_ofld_wr(adapter_t *adap, struct mbuf *m,
2238     struct sge_txq *q, unsigned int pidx,
2239     unsigned int gen, unsigned int ndesc,
2240     bus_dma_segment_t *segs, unsigned int nsegs)
2241 {
2242 	unsigned int sgl_flits, flits;
2243 	struct work_request_hdr *from;
2244 	struct sg_ent *sgp, sgl[TX_MAX_SEGS / 2 + 1];
2245 	struct tx_desc *d = &q->desc[pidx];
2246 	struct txq_state txqs;
2247 
2248 	if (immediate(m) && nsegs == 0) {
2249 		write_imm(d, m, m->m_len, gen);
2250 		return;
2251 	}
2252 
2253 	/* Only TX_DATA builds SGLs */
2254 	from = mtod(m, struct work_request_hdr *);
2255 	memcpy(&d->flit[1], &from[1], m->m_len - sizeof(*from));
2256 
2257 	flits = m->m_len / 8;
2258 	sgp = (ndesc == 1) ? (struct sg_ent *)&d->flit[flits] : sgl;
2259 
2260 	make_sgl(sgp, segs, nsegs);
2261 	sgl_flits = sgl_len(nsegs);
2262 
2263 	txqs.gen = gen;
2264 	txqs.pidx = pidx;
2265 	txqs.compl = 0;
2266 
2267 	write_wr_hdr_sgl(ndesc, d, &txqs, q, sgl, flits, sgl_flits,
2268 	    from->wrh_hi, from->wrh_lo);
2269 }
2270 
2271 /**
2272  *	calc_tx_descs_ofld - calculate # of Tx descriptors for an offload packet
2273  *	@m: the packet
2274  *
2275  * 	Returns the number of Tx descriptors needed for the given offload
2276  * 	packet.  These packets are already fully constructed.
2277  */
2278 static __inline unsigned int
2279 calc_tx_descs_ofld(struct mbuf *m, unsigned int nsegs)
2280 {
2281 	unsigned int flits, cnt = 0;
2282 	int ndescs;
2283 
2284 	if (m->m_len <= WR_LEN && nsegs == 0)
2285 		return (1);                 /* packet fits as immediate data */
2286 
2287 	/*
2288 	 * This needs to be revisited for TOE.
2289 	 */
2290 
2291 	cnt = nsegs;
2292 
2293 	/* headers */
2294 	flits = m->m_len / 8;
2295 
2296 	ndescs = flits_to_desc(flits + sgl_len(cnt));
2297 
2298 	return (ndescs);
2299 }
2300 
2301 /**
2302  *	ofld_xmit - send a packet through an offload queue
2303  *	@adap: the adapter
2304  *	@qs: the queue set containing the offload Tx queue
2305  *	@m: the packet
2306  *
2307  *	Send an offload packet through an SGE offload queue.
2308  */
2309 static int
2310 ofld_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m)
2311 {
2312 	int ret, nsegs;
2313 	unsigned int ndesc;
2314 	unsigned int pidx, gen;
2315 	struct sge_txq *q = &qs->txq[TXQ_OFLD];
2316 	bus_dma_segment_t segs[TX_MAX_SEGS], *vsegs;
2317 	struct tx_sw_desc *stx;
2318 
2319 	nsegs = m_get_sgllen(m);
2320 	vsegs = m_get_sgl(m);
2321 	ndesc = calc_tx_descs_ofld(m, nsegs);
2322 	busdma_map_sgl(vsegs, segs, nsegs);
2323 
2324 	stx = &q->sdesc[q->pidx];
2325 
2326 	TXQ_LOCK(qs);
2327 again:	reclaim_completed_tx(qs, 16, TXQ_OFLD);
2328 	ret = check_desc_avail(adap, q, m, ndesc, TXQ_OFLD);
2329 	if (__predict_false(ret)) {
2330 		if (ret == 1) {
2331 			printf("no ofld desc avail\n");
2332 
2333 			m_set_priority(m, ndesc);     /* save for restart */
2334 			TXQ_UNLOCK(qs);
2335 			return (EINTR);
2336 		}
2337 		goto again;
2338 	}
2339 
2340 	gen = q->gen;
2341 	q->in_use += ndesc;
2342 	pidx = q->pidx;
2343 	q->pidx += ndesc;
2344 	if (q->pidx >= q->size) {
2345 		q->pidx -= q->size;
2346 		q->gen ^= 1;
2347 	}
2348 #ifdef T3_TRACE
2349 	T3_TRACE5(adap->tb[q->cntxt_id & 7],
2350 		  "ofld_xmit: ndesc %u, pidx %u, len %u, hdr %u, nsegs %u",
2351 		  ndesc, pidx, m->m_pkthdr.len, m->m_len, nsegs);
2353 #endif
2354 	TXQ_UNLOCK(qs);
2355 
2356 	write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
2357 	check_ring_tx_db(adap, q);
2358 	return (0);
2359 }
2360 
2361 /**
2362  *	restart_offloadq - restart a suspended offload queue
2363  *	@qs: the queue set containing the offload queue
2364  *
2365  *	Resumes transmission on a suspended Tx offload queue.
2366  */
2367 static void
2368 restart_offloadq(void *data, int npending)
2369 {
2370 	struct mbuf *m;
2371 	struct sge_qset *qs = data;
2372 	struct sge_txq *q = &qs->txq[TXQ_OFLD];
2373 	adapter_t *adap = qs->port->adapter;
2374 	bus_dma_segment_t segs[TX_MAX_SEGS];
2375 	struct tx_sw_desc *stx = &q->sdesc[q->pidx];
2376 	int nsegs, cleaned;
2377 
2378 	TXQ_LOCK(qs);
2379 again:	cleaned = reclaim_completed_tx(qs, 16, TXQ_OFLD);
2380 
2381 	while ((m = mbufq_peek(&q->sendq)) != NULL) {
2382 		unsigned int gen, pidx;
2383 		unsigned int ndesc = m_get_priority(m);
2384 
2385 		if (__predict_false(q->size - q->in_use < ndesc)) {
2386 			setbit(&qs->txq_stopped, TXQ_OFLD);
2387 			if (should_restart_tx(q) &&
2388 			    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped))
2389 				goto again;
2390 			q->stops++;
2391 			break;
2392 		}
2393 
2394 		gen = q->gen;
2395 		q->in_use += ndesc;
2396 		pidx = q->pidx;
2397 		q->pidx += ndesc;
2398 		if (q->pidx >= q->size) {
2399 			q->pidx -= q->size;
2400 			q->gen ^= 1;
2401 		}
2402 
2403 		(void)mbufq_dequeue(&q->sendq);
2404 		busdma_map_mbufs(&m, q, stx, segs, &nsegs);
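		/*
		 * The descriptor slots were reserved above, so the work
		 * request can be written without holding the queue lock.
		 */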
2405 		TXQ_UNLOCK(qs);
2406 		write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
2407 		TXQ_LOCK(qs);
2408 	}
2409 #if USE_GTS
2410 	set_bit(TXQ_RUNNING, &q->flags);
2411 	set_bit(TXQ_LAST_PKT_DB, &q->flags);
2412 #endif
2413 	TXQ_UNLOCK(qs);
2414 	wmb();
2415 	t3_write_reg(adap, A_SG_KDOORBELL,
2416 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
2417 }
2418 
2419 /**
2420  *	queue_set - return the queue set a packet should use
2421  *	@m: the packet
2422  *
2423  *	Maps a packet to the SGE queue set it should use.  The desired queue
2424  *	set is carried in bits 1-3 in the packet's priority.
2425  */
2426 static __inline int
2427 queue_set(const struct mbuf *m)
2428 {
2429 	return m_get_priority(m) >> 1;
2430 }
2431 
2432 /**
2433  *	is_ctrl_pkt - return whether an offload packet is a control packet
2434  *	@m: the packet
2435  *
2436  *	Determines whether an offload packet should use an OFLD or a CTRL
2437  *	Tx queue.  This is indicated by bit 0 in the packet's priority.
2438  */
2439 static __inline int
2440 is_ctrl_pkt(const struct mbuf *m)
2441 {
2442 	return m_get_priority(m) & 1;
2443 }
2444 
2445 /**
2446  *	t3_offload_tx - send an offload packet
2447  *	@tdev: the offload device to send to
2448  *	@m: the packet
2449  *
2450  *	Sends an offload packet.  We use the packet priority to select the
2451  *	appropriate Tx queue as follows: bit 0 indicates whether the packet
2452  *	should be sent as regular or control, bits 1-3 select the queue set.
2453  */
2454 int
2455 t3_offload_tx(struct t3cdev *tdev, struct mbuf *m)
2456 {
2457 	adapter_t *adap = tdev2adap(tdev);
2458 	struct sge_qset *qs = &adap->sge.qs[queue_set(m)];
2459 
2460 	if (__predict_false(is_ctrl_pkt(m)))
2461 		return ctrl_xmit(adap, qs, m);
2462 
2463 	return ofld_xmit(adap, qs, m);
2464 }
2465 
2466 /**
2467  *	deliver_partial_bundle - deliver a (partial) bundle of Rx offload pkts
2468  *	@tdev: the offload device that will be receiving the packets
2469  *	@q: the SGE response queue that assembled the bundle
2470  *	@m: the partial bundle
2471  *	@n: the number of packets in the bundle
2472  *
2473  *	Delivers a (partial) bundle of Rx offload packets to an offload device.
2474  */
2475 static __inline void
2476 deliver_partial_bundle(struct t3cdev *tdev,
2477 			struct sge_rspq *q,
2478 			struct mbuf *mbufs[], int n)
2479 {
2480 	if (n) {
2481 		q->offload_bundles++;
2482 		cxgb_ofld_recv(tdev, mbufs, n);
2483 	}
2484 }
2485 
2486 static __inline int
2487 rx_offload(struct t3cdev *tdev, struct sge_rspq *rq,
2488     struct mbuf *m, struct mbuf *rx_gather[],
2489     unsigned int gather_idx)
2490 {
2491 
2492 	rq->offload_pkts++;
2493 	m->m_pkthdr.header = mtod(m, void *);
2494 	rx_gather[gather_idx++] = m;
2495 	if (gather_idx == RX_BUNDLE_SIZE) {
2496 		cxgb_ofld_recv(tdev, rx_gather, RX_BUNDLE_SIZE);
2497 		gather_idx = 0;
2498 		rq->offload_bundles++;
2499 	}
2500 	return (gather_idx);
2501 }
2502 
2503 static void
2504 restart_tx(struct sge_qset *qs)
2505 {
2506 	struct adapter *sc = qs->port->adapter;
2507 
2508 
2509 	if (isset(&qs->txq_stopped, TXQ_OFLD) &&
2510 	    should_restart_tx(&qs->txq[TXQ_OFLD]) &&
2511 	    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) {
2512 		qs->txq[TXQ_OFLD].restarts++;
2513 		DPRINTF("restarting TXQ_OFLD\n");
2514 		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2515 	}
2516 	DPRINTF("stopped=0x%x restart=%d processed=%d cleaned=%d in_use=%d\n",
2517 	    qs->txq_stopped, should_restart_tx(&qs->txq[TXQ_CTRL]),
2518 	    qs->txq[TXQ_CTRL].processed, qs->txq[TXQ_CTRL].cleaned,
2519 	    qs->txq[TXQ_CTRL].in_use);
2520 
2521 	if (isset(&qs->txq_stopped, TXQ_CTRL) &&
2522 	    should_restart_tx(&qs->txq[TXQ_CTRL]) &&
2523 	    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) {
2524 		qs->txq[TXQ_CTRL].restarts++;
2525 		DPRINTF("restarting TXQ_CTRL\n");
2526 		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2527 	}
2528 }
2529 
2530 /**
2531  *	t3_sge_alloc_qset - initialize an SGE queue set
2532  *	@sc: the controller softc
2533  *	@id: the queue set id
2534  *	@nports: how many Ethernet ports will be using this queue set
2535  *	@irq_vec_idx: the IRQ vector index for response queue interrupts
2536  *	@p: configuration parameters for this queue set
2537  *	@ntxq: number of Tx queues for the queue set
2538  *	@pi: port info for queue set
2539  *
2540  *	Allocate resources and initialize an SGE queue set.  A queue set
2541  *	comprises a response queue, two Rx free-buffer queues, and up to 3
2542  *	Tx queues.  The Tx queues are assigned roles in the order Ethernet
2543  *	queue, offload queue, and control queue.
2544  */
2545 int
2546 t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx,
2547 		  const struct qset_params *p, int ntxq, struct port_info *pi)
2548 {
2549 	struct sge_qset *q = &sc->sge.qs[id];
2550 	int i, ret = 0;
2551 
2552 	MTX_INIT(&q->lock, q->namebuf, NULL, MTX_DEF);
2553 	q->port = pi;
2554 
2555 	if ((q->txq[TXQ_ETH].txq_mr = buf_ring_alloc(cxgb_txq_buf_ring_size,
2556 	    M_DEVBUF, M_WAITOK, &q->lock)) == NULL) {
2557 		device_printf(sc->dev, "failed to allocate mbuf ring\n");
		ret = ENOMEM;
2558 		goto err;
2559 	}
2560 	if ((q->txq[TXQ_ETH].txq_ifq = malloc(sizeof(struct ifaltq), M_DEVBUF,
2561 	    M_NOWAIT | M_ZERO)) == NULL) {
2562 		device_printf(sc->dev, "failed to allocate ifq\n");
		ret = ENOMEM;
2563 		goto err;
2564 	}
2565 	ifq_init(q->txq[TXQ_ETH].txq_ifq, pi->ifp);
2566 	callout_init(&q->txq[TXQ_ETH].txq_timer, 1);
2567 	callout_init(&q->txq[TXQ_ETH].txq_watchdog, 1);
2568 	q->txq[TXQ_ETH].txq_timer.c_cpu = id % mp_ncpus;
2569 	q->txq[TXQ_ETH].txq_watchdog.c_cpu = id % mp_ncpus;
2570 
2571 	init_qset_cntxt(q, id);
2572 	q->idx = id;
2573 	if ((ret = alloc_ring(sc, p->fl_size, sizeof(struct rx_desc),
2574 		    sizeof(struct rx_sw_desc), &q->fl[0].phys_addr,
2575 		    &q->fl[0].desc, &q->fl[0].sdesc,
2576 		    &q->fl[0].desc_tag, &q->fl[0].desc_map,
2577 		    sc->rx_dmat, &q->fl[0].entry_tag)) != 0) {
2578 		printf("error %d from alloc ring fl0\n", ret);
2579 		goto err;
2580 	}
2581 
2582 	if ((ret = alloc_ring(sc, p->jumbo_size, sizeof(struct rx_desc),
2583 		    sizeof(struct rx_sw_desc), &q->fl[1].phys_addr,
2584 		    &q->fl[1].desc, &q->fl[1].sdesc,
2585 		    &q->fl[1].desc_tag, &q->fl[1].desc_map,
2586 		    sc->rx_jumbo_dmat, &q->fl[1].entry_tag)) != 0) {
2587 		printf("error %d from alloc ring fl1\n", ret);
2588 		goto err;
2589 	}
2590 
2591 	if ((ret = alloc_ring(sc, p->rspq_size, sizeof(struct rsp_desc), 0,
2592 		    &q->rspq.phys_addr, &q->rspq.desc, NULL,
2593 		    &q->rspq.desc_tag, &q->rspq.desc_map,
2594 		    NULL, NULL)) != 0) {
2595 		printf("error %d from alloc ring rspq\n", ret);
2596 		goto err;
2597 	}
2598 
2599 	for (i = 0; i < ntxq; ++i) {
2600 		size_t sz = i == TXQ_CTRL ? 0 : sizeof(struct tx_sw_desc);
2601 
2602 		if ((ret = alloc_ring(sc, p->txq_size[i],
2603 			    sizeof(struct tx_desc), sz,
2604 			    &q->txq[i].phys_addr, &q->txq[i].desc,
2605 			    &q->txq[i].sdesc, &q->txq[i].desc_tag,
2606 			    &q->txq[i].desc_map,
2607 			    sc->tx_dmat, &q->txq[i].entry_tag)) != 0) {
2608 			printf("error %d from alloc ring tx %i\n", ret, i);
2609 			goto err;
2610 		}
2611 		mbufq_init(&q->txq[i].sendq);
2612 		q->txq[i].gen = 1;
2613 		q->txq[i].size = p->txq_size[i];
2614 	}
2615 
2616 	TASK_INIT(&q->txq[TXQ_OFLD].qresume_task, 0, restart_offloadq, q);
2617 	TASK_INIT(&q->txq[TXQ_CTRL].qresume_task, 0, restart_ctrlq, q);
2618 	TASK_INIT(&q->txq[TXQ_ETH].qreclaim_task, 0, sge_txq_reclaim_handler, q);
2619 	TASK_INIT(&q->txq[TXQ_OFLD].qreclaim_task, 0, sge_txq_reclaim_handler, q);
2620 
2621 	q->fl[0].gen = q->fl[1].gen = 1;
2622 	q->fl[0].size = p->fl_size;
2623 	q->fl[1].size = p->jumbo_size;
2624 
2625 	q->rspq.gen = 1;
2626 	q->rspq.cidx = 0;
2627 	q->rspq.size = p->rspq_size;
2628 
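	/*
	 * Reserve enough descriptors for one worst-case (maximally
	 * fragmented) packet per port before stopping the Ethernet Tx queue.
	 */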
2629 	q->txq[TXQ_ETH].stop_thres = nports *
2630 	    flits_to_desc(sgl_len(TX_MAX_SEGS + 1) + 3);
2631 
2632 	q->fl[0].buf_size = MCLBYTES;
2633 	q->fl[0].zone = zone_pack;
2634 	q->fl[0].type = EXT_PACKET;
2635 
2636 	if (p->jumbo_buf_size == MJUM16BYTES) {
2637 		q->fl[1].zone = zone_jumbo16;
2638 		q->fl[1].type = EXT_JUMBO16;
2639 	} else if (p->jumbo_buf_size == MJUM9BYTES) {
2640 		q->fl[1].zone = zone_jumbo9;
2641 		q->fl[1].type = EXT_JUMBO9;
2642 	} else if (p->jumbo_buf_size == MJUMPAGESIZE) {
2643 		q->fl[1].zone = zone_jumbop;
2644 		q->fl[1].type = EXT_JUMBOP;
2645 	} else {
2646 		KASSERT(0, ("can't deal with jumbo_buf_size %d.", p->jumbo_buf_size));
2647 		ret = EDOOFUS;
2648 		goto err;
2649 	}
2650 	q->fl[1].buf_size = p->jumbo_buf_size;
2651 
2652 	/* Allocate and setup the lro_ctrl structure */
2653 	q->lro.enabled = !!(pi->ifp->if_capenable & IFCAP_LRO);
2654 #ifdef INET
2655 	ret = tcp_lro_init(&q->lro.ctrl);
2656 	if (ret) {
2657 		printf("error %d from tcp_lro_init\n", ret);
2658 		goto err;
2659 	}
2660 #endif
2661 	q->lro.ctrl.ifp = pi->ifp;
2662 
2663 	mtx_lock_spin(&sc->sge.reg_lock);
2664 	ret = -t3_sge_init_rspcntxt(sc, q->rspq.cntxt_id, irq_vec_idx,
2665 				   q->rspq.phys_addr, q->rspq.size,
2666 				   q->fl[0].buf_size, 1, 0);
2667 	if (ret) {
2668 		printf("error %d from t3_sge_init_rspcntxt\n", ret);
2669 		goto err_unlock;
2670 	}
2671 
2672 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
2673 		ret = -t3_sge_init_flcntxt(sc, q->fl[i].cntxt_id, 0,
2674 					  q->fl[i].phys_addr, q->fl[i].size,
2675 					  q->fl[i].buf_size, p->cong_thres, 1,
2676 					  0);
2677 		if (ret) {
2678 			printf("error %d from t3_sge_init_flcntxt for index i=%d\n", ret, i);
2679 			goto err_unlock;
2680 		}
2681 	}
2682 
2683 	ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_ETH].cntxt_id, USE_GTS,
2684 				 SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr,
2685 				 q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token,
2686 				 1, 0);
2687 	if (ret) {
2688 		printf("error %d from t3_sge_init_ecntxt\n", ret);
2689 		goto err_unlock;
2690 	}
2691 
2692 	if (ntxq > 1) {
2693 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_OFLD].cntxt_id,
2694 					 USE_GTS, SGE_CNTXT_OFLD, id,
2695 					 q->txq[TXQ_OFLD].phys_addr,
2696 					 q->txq[TXQ_OFLD].size, 0, 1, 0);
2697 		if (ret) {
2698 			printf("error %d from t3_sge_init_ecntxt\n", ret);
2699 			goto err_unlock;
2700 		}
2701 	}
2702 
2703 	if (ntxq > 2) {
2704 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_CTRL].cntxt_id, 0,
2705 					 SGE_CNTXT_CTRL, id,
2706 					 q->txq[TXQ_CTRL].phys_addr,
2707 					 q->txq[TXQ_CTRL].size,
2708 					 q->txq[TXQ_CTRL].token, 1, 0);
2709 		if (ret) {
2710 			printf("error %d from t3_sge_init_ecntxt\n", ret);
2711 			goto err_unlock;
2712 		}
2713 	}
2714 
2715 	snprintf(q->rspq.lockbuf, RSPQ_NAME_LEN, "t3 rspq lock %d:%d",
2716 	    device_get_unit(sc->dev), irq_vec_idx);
2717 	MTX_INIT(&q->rspq.lock, q->rspq.lockbuf, NULL, MTX_DEF);
2718 
2719 	mtx_unlock_spin(&sc->sge.reg_lock);
2720 	t3_update_qset_coalesce(q, p);
2721 	q->port = pi;
2722 
2723 	refill_fl(sc, &q->fl[0], q->fl[0].size);
2724 	refill_fl(sc, &q->fl[1], q->fl[1].size);
2725 	refill_rspq(sc, &q->rspq, q->rspq.size - 1);
2726 
2727 	t3_write_reg(sc, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) |
2728 		     V_NEWTIMER(q->rspq.holdoff_tmr));
2729 
2730 	return (0);
2731 
2732 err_unlock:
2733 	mtx_unlock_spin(&sc->sge.reg_lock);
2734 err:
2735 	TXQ_LOCK(q);
2736 	t3_free_qset(sc, q);
2737 
2738 	return (ret);
2739 }
2740 
2741 /*
2742  * Remove CPL_RX_PKT headers from the mbuf and reduce it to a regular mbuf with
2743  * ethernet data.  Hardware assistance with various checksums and any vlan tag
2744  * will also be taken into account here.
2745  */
2746 void
2747 t3_rx_eth(struct adapter *adap, struct sge_rspq *rq, struct mbuf *m, int ethpad)
2748 {
2749 	struct cpl_rx_pkt *cpl = (struct cpl_rx_pkt *)(mtod(m, uint8_t *) + ethpad);
2750 	struct port_info *pi = &adap->port[adap->rxpkt_map[cpl->iff]];
2751 	struct ifnet *ifp = pi->ifp;
2752 
2753 	DPRINTF("rx_eth m=%p m->m_data=%p p->iff=%d\n", m, mtod(m, uint8_t *), cpl->iff);
2754 
2755 	if ((ifp->if_capenable & IFCAP_RXCSUM) && !cpl->fragment &&
2756 	    cpl->csum_valid && cpl->csum == 0xffff) {
2758 		rspq_to_qset(rq)->port_stats[SGE_PSTAT_RX_CSUM_GOOD]++;
2759 		m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED|CSUM_IP_VALID|CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
2760 		m->m_pkthdr.csum_data = 0xffff;
2761 	}
2762 
2763 	if (cpl->vlan_valid) {
2764 		m->m_pkthdr.ether_vtag = ntohs(cpl->vlan);
2765 		m->m_flags |= M_VLANTAG;
2766 	}
2767 
2768 	m->m_pkthdr.rcvif = ifp;
2769 	m->m_pkthdr.header = mtod(m, uint8_t *) + sizeof(*cpl) + ethpad;
2770 	/*
2771 	 * adjust after conversion to mbuf chain
2772 	 */
2773 	m->m_pkthdr.len -= (sizeof(*cpl) + ethpad);
2774 	m->m_len -= (sizeof(*cpl) + ethpad);
2775 	m->m_data += (sizeof(*cpl) + ethpad);
2776 }
2777 
2778 /**
2779  *	get_packet - return the next ingress packet buffer from a free list
2780  *	@adap: the adapter that received the packet
2781  *	@drop_thres: # of remaining buffers before we start dropping packets
2782  *	@qs: the qset that the SGE free list holding the packet belongs to
2783  *      @mh: the mbuf header, contains a pointer to the head and tail of the mbuf chain
2784  *      @r: response descriptor
2785  *
2786  *	Get the next packet from a free list and complete setup of the
2787  *	mbuf.  If the packet is small we make a copy and recycle the
2788  *	original buffer, otherwise we use the original buffer itself.  If a
2789  *	positive drop threshold is supplied packets are dropped and their
2790  *	buffers recycled if (a) the number of remaining buffers is under the
2791  *	threshold and the packet is too big to copy, or (b) the packet should
2792  *	be copied but there is no memory for the copy.
2793  */
2794 static int
2795 get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
2796     struct t3_mbuf_hdr *mh, struct rsp_desc *r)
2797 {
2798 
2799 	unsigned int len_cq =  ntohl(r->len_cq);
2800 	struct sge_fl *fl = (len_cq & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
2801 	int mask, cidx = fl->cidx;
2802 	struct rx_sw_desc *sd = &fl->sdesc[cidx];
2803 	uint32_t len = G_RSPD_LEN(len_cq);
2804 	uint32_t flags = M_EXT;
2805 	uint8_t sopeop = G_RSPD_SOP_EOP(ntohl(r->flags));
2806 	caddr_t cl;
2807 	struct mbuf *m;
2808 	int ret = 0;
2809 
2810 	mask = fl->size - 1;
2811 	prefetch(fl->sdesc[(cidx + 1) & mask].m);
2812 	prefetch(fl->sdesc[(cidx + 2) & mask].m);
2813 	prefetch(fl->sdesc[(cidx + 1) & mask].rxsd_cl);
2814 	prefetch(fl->sdesc[(cidx + 2) & mask].rxsd_cl);
2815 
2816 	fl->credits--;
2817 	bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD);
2818 
2819 	if (recycle_enable && len <= SGE_RX_COPY_THRES &&
2820 	    sopeop == RSPQ_SOP_EOP) {
2821 		if ((m = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL)
2822 			goto skip_recycle;
2823 		cl = mtod(m, void *);
2824 		memcpy(cl, sd->rxsd_cl, len);
2825 		recycle_rx_buf(adap, fl, fl->cidx);
2826 		m->m_pkthdr.len = m->m_len = len;
2827 		m->m_flags = 0;
2828 		mh->mh_head = mh->mh_tail = m;
2829 		ret = 1;
2830 		goto done;
2831 	} else {
2832 	skip_recycle:
2833 		bus_dmamap_unload(fl->entry_tag, sd->map);
2834 		cl = sd->rxsd_cl;
2835 		m = sd->m;
2836 
2837 		if ((sopeop == RSPQ_SOP_EOP) ||
2838 		    (sopeop == RSPQ_SOP))
2839 			flags |= M_PKTHDR;
2840 		m_init(m, fl->zone, fl->buf_size, M_NOWAIT, MT_DATA, flags);
2841 		if (fl->zone == zone_pack) {
2842 			/*
2843 			 * restore clobbered data pointer
2844 			 */
2845 			m->m_data = m->m_ext.ext_buf;
2846 		} else {
2847 			m_cljset(m, cl, fl->type);
2848 		}
2849 		m->m_len = len;
2850 	}
2851 	switch(sopeop) {
2852 	case RSPQ_SOP_EOP:
2853 		ret = 1;
2854 		/* FALLTHROUGH */
2855 	case RSPQ_SOP:
2856 		mh->mh_head = mh->mh_tail = m;
2857 		m->m_pkthdr.len = len;
2858 		break;
2859 	case RSPQ_EOP:
2860 		ret = 1;
2861 		/* FALLTHROUGH */
2862 	case RSPQ_NSOP_NEOP:
2863 		if (mh->mh_tail == NULL) {
2864 			log(LOG_ERR, "discarding intermediate descriptor entry\n");
2865 			m_freem(m);
2866 			break;
2867 		}
2868 		mh->mh_tail->m_next = m;
2869 		mh->mh_tail = m;
2870 		mh->mh_head->m_pkthdr.len += len;
2871 		break;
2872 	}
2873 	if (cxgb_debug)
2874 		printf("len=%d pktlen=%d\n", m->m_len, m->m_pkthdr.len);
2875 done:
2876 	if (++fl->cidx == fl->size)
2877 		fl->cidx = 0;
2878 
2879 	return (ret);
2880 }
2881 
2882 /**
2883  *	handle_rsp_cntrl_info - handles control information in a response
2884  *	@qs: the queue set corresponding to the response
2885  *	@flags: the response control flags
2886  *
2887  *	Handles the control information of an SGE response, such as GTS
2888  *	indications and completion credits for the queue set's Tx queues.
2889  *	HW coalesces credits, we don't do any extra SW coalescing.
2890  */
2891 static __inline void
2892 handle_rsp_cntrl_info(struct sge_qset *qs, uint32_t flags)
2893 {
2894 	unsigned int credits;
2895 
2896 #if USE_GTS
2897 	if (flags & F_RSPD_TXQ0_GTS)
2898 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags);
2899 #endif
2900 	credits = G_RSPD_TXQ0_CR(flags);
2901 	if (credits)
2902 		qs->txq[TXQ_ETH].processed += credits;
2903 
2904 	credits = G_RSPD_TXQ2_CR(flags);
2905 	if (credits)
2906 		qs->txq[TXQ_CTRL].processed += credits;
2907 
2908 #if USE_GTS
2909 	if (flags & F_RSPD_TXQ1_GTS)
2910 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags);
2911 #endif
2912 	credits = G_RSPD_TXQ1_CR(flags);
2913 	if (credits)
2914 		qs->txq[TXQ_OFLD].processed += credits;
2915 
2916 }
2917 
2918 static void
2919 check_ring_db(adapter_t *adap, struct sge_qset *qs,
2920     unsigned int sleeping)
2921 {
2922 	;
2923 }
2924 
2925 /**
2926  *	process_responses - process responses from an SGE response queue
2927  *	@adap: the adapter
2928  *	@qs: the queue set to which the response queue belongs
2929  *	@budget: how many responses can be processed in this round
2930  *
2931  *	Process responses from an SGE response queue up to the supplied budget.
2932  *	Responses include received packets as well as credits and other events
2933  *	for the queues that belong to the response queue's queue set.
2934  *	A negative budget is effectively unlimited.
2935  *
2936  *	Additionally choose the interrupt holdoff time for the next interrupt
2937  *	on this queue.  If the system is under memory shortage use a fairly
2938  *	long delay to help recovery.
2939  */
2940 static int
2941 process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
2942 {
2943 	struct sge_rspq *rspq = &qs->rspq;
2944 	struct rsp_desc *r = &rspq->desc[rspq->cidx];
2945 	int budget_left = budget;
2946 	unsigned int sleeping = 0;
2947 	int lro_enabled = qs->lro.enabled;
2948 	int skip_lro;
2949 	struct lro_ctrl *lro_ctrl = &qs->lro.ctrl;
2950 	struct mbuf *offload_mbufs[RX_BUNDLE_SIZE];
2951 	int ngathered = 0;
2952 #ifdef DEBUG
2953 	static int last_holdoff = 0;
2954 	if (cxgb_debug && rspq->holdoff_tmr != last_holdoff) {
2955 		printf("next_holdoff=%d\n", rspq->holdoff_tmr);
2956 		last_holdoff = rspq->holdoff_tmr;
2957 	}
2958 #endif
2959 	rspq->next_holdoff = rspq->holdoff_tmr;
2960 
2961 	while (__predict_true(budget_left && is_new_response(r, rspq))) {
2962 		int eth, eop = 0, ethpad = 0;
2963 		uint32_t flags = ntohl(r->flags);
2964 		uint32_t rss_csum = *(const uint32_t *)r;
2965 		uint32_t rss_hash = be32toh(r->rss_hdr.rss_hash_val);
2966 
2967 		eth = (r->rss_hdr.opcode == CPL_RX_PKT);
2968 
2969 		if (__predict_false(flags & F_RSPD_ASYNC_NOTIF)) {
2970 			struct mbuf *m;
2971 
2972 			if (cxgb_debug)
2973 				printf("async notification\n");
2974 
2975 			if (rspq->rspq_mh.mh_head == NULL) {
2976 				rspq->rspq_mh.mh_head = m_gethdr(M_DONTWAIT, MT_DATA);
2977 				m = rspq->rspq_mh.mh_head;
2978 			} else {
2979 				m = m_gethdr(M_DONTWAIT, MT_DATA);
2980 			}
2981 			if (m == NULL)
2982 				goto no_mem;
2983 
2984 			memcpy(mtod(m, char *), r, AN_PKT_SIZE);
2985 			m->m_len = m->m_pkthdr.len = AN_PKT_SIZE;
2986 			*mtod(m, char *) = CPL_ASYNC_NOTIF;
2987 			rss_csum = htonl(CPL_ASYNC_NOTIF << 24);
2988 			eop = 1;
2989 			rspq->async_notif++;
2990 			goto skip;
2991 		} else if (flags & F_RSPD_IMM_DATA_VALID) {
2992 			struct mbuf *m = NULL;
2993 
2994 			DPRINTF("IMM DATA VALID opcode=0x%x rspq->cidx=%d\n",
2995 			    r->rss_hdr.opcode, rspq->cidx);
2996 			if (rspq->rspq_mh.mh_head == NULL)
2997 				rspq->rspq_mh.mh_head = m_gethdr(M_DONTWAIT, MT_DATA);
2998 			else
2999 				m = m_gethdr(M_DONTWAIT, MT_DATA);
3000 
3001 			if (rspq->rspq_mh.mh_head == NULL &&  m == NULL) {
3002 		no_mem:
3003 				rspq->next_holdoff = NOMEM_INTR_DELAY;
3004 				budget_left--;
3005 				break;
3006 			}
3007 			get_imm_packet(adap, r, rspq->rspq_mh.mh_head);
3008 			eop = 1;
3009 			rspq->imm_data++;
3010 		} else if (r->len_cq) {
3011 			int drop_thresh = eth ? SGE_RX_DROP_THRES : 0;
3012 
3013 			eop = get_packet(adap, drop_thresh, qs, &rspq->rspq_mh, r);
3014 			if (eop) {
3015 				rspq->rspq_mh.mh_head->m_flags |= M_FLOWID;
3016 				rspq->rspq_mh.mh_head->m_pkthdr.flowid = rss_hash;
3017 			}
3018 
3019 			ethpad = 2;
3020 		} else {
3021 			rspq->pure_rsps++;
3022 		}
3023 	skip:
3024 		if (flags & RSPD_CTRL_MASK) {
3025 			sleeping |= flags & RSPD_GTS_MASK;
3026 			handle_rsp_cntrl_info(qs, flags);
3027 		}
3028 
3029 		r++;
3030 		if (__predict_false(++rspq->cidx == rspq->size)) {
3031 			rspq->cidx = 0;
3032 			rspq->gen ^= 1;
3033 			r = rspq->desc;
3034 		}
3035 
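		/*
		 * Return accumulated response-queue credits to the hardware a
		 * quarter of the ring at a time.
		 */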
3036 		if (++rspq->credits >= (rspq->size / 4)) {
3037 			refill_rspq(adap, rspq, rspq->credits);
3038 			rspq->credits = 0;
3039 		}
3040 		if (!eth && eop) {
3041 			rspq->rspq_mh.mh_head->m_pkthdr.csum_data = rss_csum;
3042 			/*
3043 			 * XXX size mismatch
3044 			 */
3045 			m_set_priority(rspq->rspq_mh.mh_head, rss_hash);
3046 
3048 			ngathered = rx_offload(&adap->tdev, rspq,
3049 			    rspq->rspq_mh.mh_head, offload_mbufs, ngathered);
3050 			rspq->rspq_mh.mh_head = NULL;
3051 			DPRINTF("received offload packet\n");
3052 
3053 		} else if (eth && eop) {
3054 			struct mbuf *m = rspq->rspq_mh.mh_head;
3055 
3056 			t3_rx_eth(adap, rspq, m, ethpad);
3057 
3058 			/*
3059 			 * The T304 sends incoming packets on any qset.  If LRO
3060 			 * is also enabled, we could end up sending the packet up
3061 			 * lro_ctrl->ifp's input.  That is incorrect.
3062 			 *
3063 			 * The mbuf's rcvif was derived from the cpl header and
3064 			 * is accurate.  Skip LRO and just use that.
3065 			 */
3066 			skip_lro = __predict_false(qs->port->ifp != m->m_pkthdr.rcvif);
3067 
3068 			if (lro_enabled && lro_ctrl->lro_cnt && !skip_lro
3069 #ifdef INET
3070 			    && (tcp_lro_rx(lro_ctrl, m, 0) == 0)
3071 #endif
3072 			    ) {
3073 				/* successfully queued for LRO */
3074 			} else {
3075 				/*
3076 				 * LRO not enabled, packet unsuitable for LRO,
3077 				 * or unable to queue.  Pass it up right now in
3078 				 * either case.
3079 				 */
3080 				struct ifnet *ifp = m->m_pkthdr.rcvif;
3081 				(*ifp->if_input)(ifp, m);
3082 			}
3083 			rspq->rspq_mh.mh_head = NULL;
3084 
3085 		}
3086 		__refill_fl_lt(adap, &qs->fl[0], 32);
3087 		__refill_fl_lt(adap, &qs->fl[1], 32);
3088 		--budget_left;
3089 	}
3090 
3091 	deliver_partial_bundle(&adap->tdev, rspq, offload_mbufs, ngathered);
3092 
3093 #ifdef INET
3094 	/* Flush LRO */
3095 	while (!SLIST_EMPTY(&lro_ctrl->lro_active)) {
3096 		struct lro_entry *queued = SLIST_FIRST(&lro_ctrl->lro_active);
3097 		SLIST_REMOVE_HEAD(&lro_ctrl->lro_active, next);
3098 		tcp_lro_flush(lro_ctrl, queued);
3099 	}
3100 #endif
3101 
3102 	if (sleeping)
3103 		check_ring_db(adap, qs, sleeping);
3104 
3105 	mb();  /* commit Tx queue processed updates */
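	/*
	 * txq_stopped is a bitmask indexed by Tx queue; a value greater than
	 * 1 means the offload or control queue is stopped, which is what
	 * restart_tx() handles.
	 */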
3106 	if (__predict_false(qs->txq_stopped > 1))
3107 		restart_tx(qs);
3108 
3109 	__refill_fl_lt(adap, &qs->fl[0], 512);
3110 	__refill_fl_lt(adap, &qs->fl[1], 512);
3111 	budget -= budget_left;
3112 	return (budget);
3113 }
3114 
3115 /*
3116  * A helper function that processes responses and issues GTS.
3117  */
3118 static __inline int
3119 process_responses_gts(adapter_t *adap, struct sge_rspq *rq)
3120 {
3121 	int work;
3122 	static int last_holdoff = 0;
3123 
3124 	work = process_responses(adap, rspq_to_qset(rq), -1);
3125 
3126 	if (cxgb_debug && (rq->next_holdoff != last_holdoff)) {
3127 		printf("next_holdoff=%d\n", rq->next_holdoff);
3128 		last_holdoff = rq->next_holdoff;
3129 	}
3130 	t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) |
3131 	    V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx));
3132 
3133 	return (work);
3134 }
3135 
3136 
3137 /*
3138  * Interrupt handler for legacy INTx interrupts for T3B-based cards.
3139  * Handles data events from SGE response queues as well as error and other
3140  * async events as they all use the same interrupt pin.  We use one SGE
3141  * response queue per port in this mode and protect all response queues with
3142  * queue 0's lock.
3143  */
3144 void
3145 t3b_intr(void *data)
3146 {
3147 	uint32_t i, map;
3148 	adapter_t *adap = data;
3149 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
3150 
3151 	t3_write_reg(adap, A_PL_CLI, 0);
3152 	map = t3_read_reg(adap, A_SG_DATA_INTR);
3153 
3154 	if (!map)
3155 		return;
3156 
3157 	if (__predict_false(map & F_ERRINTR))
3158 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
3159 
3160 	mtx_lock(&q0->lock);
3161 	for_each_port(adap, i)
3162 	    if (map & (1 << i))
3163 			process_responses_gts(adap, &adap->sge.qs[i].rspq);
3164 	mtx_unlock(&q0->lock);
3165 }
3166 
3167 /*
3168  * The MSI interrupt handler.  This needs to handle data events from SGE
3169  * response queues as well as error and other async events as they all use
3170  * the same MSI vector.  We use one SGE response queue per port in this mode
3171  * and protect all response queues with queue 0's lock.
3172  */
3173 void
3174 t3_intr_msi(void *data)
3175 {
3176 	adapter_t *adap = data;
3177 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
3178 	int i, new_packets = 0;
3179 
3180 	mtx_lock(&q0->lock);
3181 
3182 	for_each_port(adap, i)
3183 	    if (process_responses_gts(adap, &adap->sge.qs[i].rspq))
3184 		    new_packets = 1;
3185 	mtx_unlock(&q0->lock);
3186 	if (new_packets == 0)
3187 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
3188 }
3189 
3190 void
3191 t3_intr_msix(void *data)
3192 {
3193 	struct sge_qset *qs = data;
3194 	adapter_t *adap = qs->port->adapter;
3195 	struct sge_rspq *rspq = &qs->rspq;
3196 
3197 	if (process_responses_gts(adap, rspq) == 0)
3198 		rspq->unhandled_irqs++;
3199 }
3200 
3201 #define QDUMP_SBUF_SIZE		(32 * 400)
3202 static int
3203 t3_dump_rspq(SYSCTL_HANDLER_ARGS)
3204 {
3205 	struct sge_rspq *rspq;
3206 	struct sge_qset *qs;
3207 	int i, err, dump_end, idx;
3208 	static int multiplier = 1;
3209 	struct sbuf *sb;
3210 	struct rsp_desc *rspd;
3211 	uint32_t data[4];
3212 
3213 	rspq = arg1;
3214 	qs = rspq_to_qset(rspq);
3215 	if (rspq->rspq_dump_count == 0)
3216 		return (0);
3217 	if (rspq->rspq_dump_count > RSPQ_Q_SIZE) {
3218 		log(LOG_WARNING,
3219 		    "dump count is too large %d\n", rspq->rspq_dump_count);
3220 		rspq->rspq_dump_count = 0;
3221 		return (EINVAL);
3222 	}
3223 	if (rspq->rspq_dump_start > (RSPQ_Q_SIZE-1)) {
3224 		log(LOG_WARNING,
3225 		    "dump start of %d is greater than queue size\n",
3226 		    rspq->rspq_dump_start);
3227 		rspq->rspq_dump_start = 0;
3228 		return (EINVAL);
3229 	}
3230 	err = t3_sge_read_rspq(qs->port->adapter, rspq->cntxt_id, data);
3231 	if (err)
3232 		return (err);
3233 retry_sbufops:
3234 	sb = sbuf_new(NULL, NULL, QDUMP_SBUF_SIZE*multiplier, SBUF_FIXEDLEN);
3235 
3236 	sbuf_printf(sb, " \n index=%u size=%u MSI-X/RspQ=%u intr enable=%u intr armed=%u\n",
3237 	    (data[0] & 0xffff), data[0] >> 16, ((data[2] >> 20) & 0x3f),
3238 	    ((data[2] >> 26) & 1), ((data[2] >> 27) & 1));
3239 	sbuf_printf(sb, " generation=%u CQ mode=%u FL threshold=%u\n",
3240 	    ((data[2] >> 28) & 1), ((data[2] >> 31) & 1), data[3]);
3241 
3242 	sbuf_printf(sb, " start=%d -> end=%d\n", rspq->rspq_dump_start,
3243 	    (rspq->rspq_dump_start + rspq->rspq_dump_count) & (RSPQ_Q_SIZE-1));
3244 
3245 	dump_end = rspq->rspq_dump_start + rspq->rspq_dump_count;
3246 	for (i = rspq->rspq_dump_start; i < dump_end; i++) {
3247 		idx = i & (RSPQ_Q_SIZE-1);
3248 
3249 		rspd = &rspq->desc[idx];
3250 		sbuf_printf(sb, "\tidx=%04d opcode=%02x cpu_idx=%x hash_type=%x cq_idx=%x\n",
3251 		    idx, rspd->rss_hdr.opcode, rspd->rss_hdr.cpu_idx,
3252 		    rspd->rss_hdr.hash_type, be16toh(rspd->rss_hdr.cq_idx));
3253 		sbuf_printf(sb, "\trss_hash_val=%x flags=%08x len_cq=%x intr_gen=%x\n",
3254 		    rspd->rss_hdr.rss_hash_val, be32toh(rspd->flags),
3255 		    be32toh(rspd->len_cq), rspd->intr_gen);
3256 	}
3257 	if (sbuf_overflowed(sb)) {
3258 		sbuf_delete(sb);
3259 		multiplier++;
3260 		goto retry_sbufops;
3261 	}
3262 	sbuf_finish(sb);
3263 	err = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
3264 	sbuf_delete(sb);
3265 	return (err);
3266 }
3267 
3268 static int
3269 t3_dump_txq_eth(SYSCTL_HANDLER_ARGS)
3270 {
3271 	struct sge_txq *txq;
3272 	struct sge_qset *qs;
3273 	int i, j, err, dump_end;
3274 	static int multiplier = 1;
3275 	struct sbuf *sb;
3276 	struct tx_desc *txd;
3277 	uint32_t *WR, wr_hi, wr_lo, gen;
3278 	uint32_t data[4];
3279 
3280 	txq = arg1;
3281 	qs = txq_to_qset(txq, TXQ_ETH);
3282 	if (txq->txq_dump_count == 0) {
3283 		return (0);
3284 	}
3285 	if (txq->txq_dump_count > TX_ETH_Q_SIZE) {
3286 		log(LOG_WARNING,
3287 		    "dump count is too large %d\n", txq->txq_dump_count);
3288 		txq->txq_dump_count = 1;
3289 		return (EINVAL);
3290 	}
3291 	if (txq->txq_dump_start > (TX_ETH_Q_SIZE-1)) {
3292 		log(LOG_WARNING,
3293 		    "dump start of %d is greater than queue size\n",
3294 		    txq->txq_dump_start);
3295 		txq->txq_dump_start = 0;
3296 		return (EINVAL);
3297 	}
3298 	err = t3_sge_read_ecntxt(qs->port->adapter, qs->rspq.cntxt_id, data);
3299 	if (err)
3300 		return (err);
3301 
3302 
3303 retry_sbufops:
3304 	sb = sbuf_new(NULL, NULL, QDUMP_SBUF_SIZE*multiplier, SBUF_FIXEDLEN);
3305 
3306 	sbuf_printf(sb, " \n credits=%u GTS=%u index=%u size=%u rspq#=%u cmdq#=%u\n",
3307 	    (data[0] & 0x7fff), ((data[0] >> 15) & 1), (data[0] >> 16),
3308 	    (data[1] & 0xffff), ((data[3] >> 4) & 7), ((data[3] >> 7) & 1));
3309 	sbuf_printf(sb, " TUN=%u TOE=%u generation=%u uP token=%u valid=%u\n",
3310 	    ((data[3] >> 8) & 1), ((data[3] >> 9) & 1), ((data[3] >> 10) & 1),
3311 	    ((data[3] >> 11) & 0xfffff), ((data[3] >> 31) & 1));
3312 	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
3313 	    txq->txq_dump_start,
3314 	    (txq->txq_dump_start + txq->txq_dump_count) & (TX_ETH_Q_SIZE-1));
3315 
3316 	dump_end = txq->txq_dump_start + txq->txq_dump_count;
3317 	for (i = txq->txq_dump_start; i < dump_end; i++) {
3318 		txd = &txq->desc[i & (TX_ETH_Q_SIZE-1)];
3319 		WR = (uint32_t *)txd->flit;
3320 		wr_hi = ntohl(WR[0]);
3321 		wr_lo = ntohl(WR[1]);
3322 		gen = G_WR_GEN(wr_lo);
3323 
3324 		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
3325 		    wr_hi, wr_lo, gen);
3326 		for (j = 2; j < 30; j += 4)
3327 			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
3328 			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
3329 
3330 	}
3331 	if (sbuf_overflowed(sb)) {
3332 		sbuf_delete(sb);
3333 		multiplier++;
3334 		goto retry_sbufops;
3335 	}
3336 	sbuf_finish(sb);
3337 	err = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
3338 	sbuf_delete(sb);
3339 	return (err);
3340 }
3341 
3342 static int
3343 t3_dump_txq_ctrl(SYSCTL_HANDLER_ARGS)
3344 {
3345 	struct sge_txq *txq;
3346 	struct sge_qset *qs;
3347 	int i, j, err, dump_end;
3348 	static int multiplier = 1;
3349 	struct sbuf *sb;
3350 	struct tx_desc *txd;
3351 	uint32_t *WR, wr_hi, wr_lo, gen;
3352 
3353 	txq = arg1;
3354 	qs = txq_to_qset(txq, TXQ_CTRL);
3355 	if (txq->txq_dump_count == 0) {
3356 		return (0);
3357 	}
3358 	if (txq->txq_dump_count > 256) {
3359 		log(LOG_WARNING,
3360 		    "dump count is too large %d\n", txq->txq_dump_count);
3361 		txq->txq_dump_count = 1;
3362 		return (EINVAL);
3363 	}
3364 	if (txq->txq_dump_start > 255) {
3365 		log(LOG_WARNING,
3366 		    "dump start of %d is greater than queue size\n",
3367 		    txq->txq_dump_start);
3368 		txq->txq_dump_start = 0;
3369 		return (EINVAL);
3370 	}
3371 
3372 retry_sbufops:
3373 	sb = sbuf_new(NULL, NULL, QDUMP_SBUF_SIZE*multiplier, SBUF_FIXEDLEN);
3374 	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
3375 	    txq->txq_dump_start,
3376 	    (txq->txq_dump_start + txq->txq_dump_count) & 255);
3377 
3378 	dump_end = txq->txq_dump_start + txq->txq_dump_count;
3379 	for (i = txq->txq_dump_start; i < dump_end; i++) {
3380 		txd = &txq->desc[i & (255)];
3381 		WR = (uint32_t *)txd->flit;
3382 		wr_hi = ntohl(WR[0]);
3383 		wr_lo = ntohl(WR[1]);
3384 		gen = G_WR_GEN(wr_lo);
3385 
3386 		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
3387 		    wr_hi, wr_lo, gen);
3388 		for (j = 2; j < 30; j += 4)
3389 			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
3390 			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
3391 
3392 	}
3393 	if (sbuf_overflowed(sb)) {
3394 		sbuf_delete(sb);
3395 		multiplier++;
3396 		goto retry_sbufops;
3397 	}
3398 	sbuf_finish(sb);
3399 	err = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
3400 	sbuf_delete(sb);
3401 	return (err);
3402 }
3403 
3404 static int
3405 t3_set_coalesce_usecs(SYSCTL_HANDLER_ARGS)
3406 {
3407 	adapter_t *sc = arg1;
3408 	struct qset_params *qsp = &sc->params.sge.qset[0];
3409 	int coalesce_usecs;
3410 	struct sge_qset *qs;
3411 	int i, j, err, nqsets = 0;
3412 	struct mtx *lock;
3413 
3414 	if ((sc->flags & FULL_INIT_DONE) == 0)
3415 		return (ENXIO);
3416 
3417 	coalesce_usecs = qsp->coalesce_usecs;
3418 	err = sysctl_handle_int(oidp, &coalesce_usecs, arg2, req);
3419 
3420 	if (err != 0) {
3421 		return (err);
3422 	}
3423 	if (coalesce_usecs == qsp->coalesce_usecs)
3424 		return (0);
3425 
3426 	for (i = 0; i < sc->params.nports; i++)
3427 		for (j = 0; j < sc->port[i].nqsets; j++)
3428 			nqsets++;
3429 
3430 	coalesce_usecs = max(1, coalesce_usecs);
3431 
3432 	for (i = 0; i < nqsets; i++) {
3433 		qs = &sc->sge.qs[i];
3434 		qsp = &sc->params.sge.qset[i];
3435 		qsp->coalesce_usecs = coalesce_usecs;
3436 
3437 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
3438 			    &sc->sge.qs[0].rspq.lock;
3439 
3440 		mtx_lock(lock);
3441 		t3_update_qset_coalesce(qs, qsp);
3442 		t3_write_reg(sc, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) |
3443 		    V_NEWTIMER(qs->rspq.holdoff_tmr));
3444 		mtx_unlock(lock);
3445 	}
3446 
3447 	return (0);
3448 }
3449 
3450 
3451 void
3452 t3_add_attach_sysctls(adapter_t *sc)
3453 {
3454 	struct sysctl_ctx_list *ctx;
3455 	struct sysctl_oid_list *children;
3456 
3457 	ctx = device_get_sysctl_ctx(sc->dev);
3458 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3459 
3460 	/* random information */
3461 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
3462 	    "firmware_version",
3463 	    CTLFLAG_RD, &sc->fw_version,
3464 	    0, "firmware version");
3465 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3466 	    "hw_revision",
3467 	    CTLFLAG_RD, &sc->params.rev,
3468 	    0, "chip model");
3469 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
3470 	    "port_types",
3471 	    CTLFLAG_RD, &sc->port_types,
3472 	    0, "type of ports");
3473 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3474 	    "enable_debug",
3475 	    CTLFLAG_RW, &cxgb_debug,
3476 	    0, "enable verbose debugging output");
3477 	SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "tunq_coalesce",
3478 	    CTLFLAG_RD, &sc->tunq_coalesce,
3479 	    "#tunneled packets freed");
3480 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3481 	    "txq_overrun",
3482 	    CTLFLAG_RD, &txq_fills,
3483 	    0, "#times txq overrun");
3484 }
3485 
3486 
3487 static const char *rspq_name = "rspq";
3488 static const char *txq_names[] =
3489 {
3490 	"txq_eth",
3491 	"txq_ofld",
3492 	"txq_ctrl"
3493 };
3494 
3495 static int
3496 sysctl_handle_macstat(SYSCTL_HANDLER_ARGS)
3497 {
3498 	struct port_info *p = arg1;
3499 	uint64_t *parg;
3500 
3501 	if (!p)
3502 		return (EINVAL);
3503 
3504 	parg = (uint64_t *) ((uint8_t *)&p->mac.stats + arg2);
3505 	PORT_LOCK(p);
3506 	t3_mac_update_stats(&p->mac);
3507 	PORT_UNLOCK(p);
3508 
3509 	return (sysctl_handle_quad(oidp, parg, 0, req));
3510 }
3511 
3512 void
3513 t3_add_configured_sysctls(adapter_t *sc)
3514 {
3515 	struct sysctl_ctx_list *ctx;
3516 	struct sysctl_oid_list *children;
3517 	int i, j;
3518 
3519 	ctx = device_get_sysctl_ctx(sc->dev);
3520 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3521 
3522 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
3523 	    "intr_coal",
3524 	    CTLTYPE_INT|CTLFLAG_RW, sc,
3525 	    0, t3_set_coalesce_usecs,
3526 	    "I", "interrupt coalescing timer (us)");
3527 
3528 	for (i = 0; i < sc->params.nports; i++) {
3529 		struct port_info *pi = &sc->port[i];
3530 		struct sysctl_oid *poid;
3531 		struct sysctl_oid_list *poidlist;
3532 		struct mac_stats *mstats = &pi->mac.stats;
3533 
3534 		snprintf(pi->namebuf, PORT_NAME_LEN, "port%d", i);
3535 		poid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO,
3536 		    pi->namebuf, CTLFLAG_RD, NULL, "port statistics");
3537 		poidlist = SYSCTL_CHILDREN(poid);
3538 		SYSCTL_ADD_INT(ctx, poidlist, OID_AUTO,
3539 		    "nqsets", CTLFLAG_RD, &pi->nqsets,
3540 		    0, "#queue sets");
3541 
3542 		for (j = 0; j < pi->nqsets; j++) {
3543 			struct sge_qset *qs = &sc->sge.qs[pi->first_qset + j];
3544 			struct sysctl_oid *qspoid, *rspqpoid, *txqpoid,
3545 					  *ctrlqpoid, *lropoid;
3546 			struct sysctl_oid_list *qspoidlist, *rspqpoidlist,
3547 					       *txqpoidlist, *ctrlqpoidlist,
3548 					       *lropoidlist;
3549 			struct sge_txq *txq = &qs->txq[TXQ_ETH];
3550 
3551 			snprintf(qs->namebuf, QS_NAME_LEN, "qs%d", j);
3552 
3553 			qspoid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO,
3554 			    qs->namebuf, CTLFLAG_RD, NULL, "qset statistics");
3555 			qspoidlist = SYSCTL_CHILDREN(qspoid);
3556 
3557 			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl0_empty",
3558 					CTLFLAG_RD, &qs->fl[0].empty, 0,
3559 					"freelist #0 empty");
3560 			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl1_empty",
3561 					CTLFLAG_RD, &qs->fl[1].empty, 0,
3562 					"freelist #1 empty");
3563 
3564 			rspqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3565 			    rspq_name, CTLFLAG_RD, NULL, "rspq statistics");
3566 			rspqpoidlist = SYSCTL_CHILDREN(rspqpoid);
3567 
3568 			txqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3569 			    txq_names[0], CTLFLAG_RD, NULL, "txq statistics");
3570 			txqpoidlist = SYSCTL_CHILDREN(txqpoid);
3571 
3572 			ctrlqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3573 			    txq_names[2], CTLFLAG_RD, NULL, "ctrlq statistics");
3574 			ctrlqpoidlist = SYSCTL_CHILDREN(ctrlqpoid);
3575 
3576 			lropoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3577 			    "lro_stats", CTLFLAG_RD, NULL, "LRO statistics");
3578 			lropoidlist = SYSCTL_CHILDREN(lropoid);
3579 
3580 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "size",
3581 			    CTLFLAG_RD, &qs->rspq.size,
3582 			    0, "#entries in response queue");
3583 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "cidx",
3584 			    CTLFLAG_RD, &qs->rspq.cidx,
3585 			    0, "consumer index");
3586 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "credits",
3587 			    CTLFLAG_RD, &qs->rspq.credits,
3588 			    0, "#credits");
3589 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "starved",
3590 			    CTLFLAG_RD, &qs->rspq.starved,
3591 			    0, "#times starved");
3592 			SYSCTL_ADD_XLONG(ctx, rspqpoidlist, OID_AUTO, "phys_addr",
3593 			    CTLFLAG_RD, &qs->rspq.phys_addr,
3594 			    "physical address of the queue");
3595 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_start",
3596 			    CTLFLAG_RW, &qs->rspq.rspq_dump_start,
3597 			    0, "start rspq dump entry");
3598 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_count",
3599 			    CTLFLAG_RW, &qs->rspq.rspq_dump_count,
3600 			    0, "#rspq entries to dump");
3601 			SYSCTL_ADD_PROC(ctx, rspqpoidlist, OID_AUTO, "qdump",
3602 			    CTLTYPE_STRING | CTLFLAG_RD, &qs->rspq,
3603 			    0, t3_dump_rspq, "A", "dump of the response queue");
3604 
3605 			SYSCTL_ADD_QUAD(ctx, txqpoidlist, OID_AUTO, "dropped",
3606 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].txq_mr->br_drops,
3607 			    "#tunneled packets dropped");
3608 			SYSCTL_ADD_INT(ctx, txqpoidlist, OID_AUTO, "sendqlen",
3609 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].sendq.qlen,
3610 			    0, "#tunneled packets waiting to be sent");
3611 #if 0
3612 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_pidx",
3613 			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr->br_prod_tail,
3614 			    0, "#tunneled packets queue producer index");
3615 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_cidx",
3616 			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr->br_cons_tail,
3617 			    0, "#tunneled packets queue consumer index");
3618 #endif
3619 			SYSCTL_ADD_INT(ctx, txqpoidlist, OID_AUTO, "processed",
3620 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].processed,
3621 			    0, "#tunneled packets processed by the card");
3622 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "cleaned",
3623 			    CTLFLAG_RD, &txq->cleaned,
3624 			    0, "#tunneled packets cleaned");
3625 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "in_use",
3626 			    CTLFLAG_RD, &txq->in_use,
3627 			    0, "#tunneled packet slots in use");
3628 			SYSCTL_ADD_ULONG(ctx, txqpoidlist, OID_AUTO, "frees",
3629 			    CTLFLAG_RD, &txq->txq_frees,
3630 			    "#tunneled packets freed");
3631 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "skipped",
3632 			    CTLFLAG_RD, &txq->txq_skipped,
3633 			    0, "#tunneled packet descriptors skipped");
3634 			SYSCTL_ADD_QUAD(ctx, txqpoidlist, OID_AUTO, "coalesced",
3635 			    CTLFLAG_RD, &txq->txq_coalesced,
3636 			    "#tunneled packets coalesced");
3637 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "enqueued",
3638 			    CTLFLAG_RD, &txq->txq_enqueued,
3639 			    0, "#tunneled packets enqueued to hardware");
3640 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "stopped_flags",
3641 			    CTLFLAG_RD, &qs->txq_stopped,
3642 			    0, "tx queues stopped");
3643 			SYSCTL_ADD_XLONG(ctx, txqpoidlist, OID_AUTO, "phys_addr",
3644 			    CTLFLAG_RD, &txq->phys_addr,
3645 			    "physical address of the queue");
3646 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "qgen",
3647 			    CTLFLAG_RW, &qs->txq[TXQ_ETH].gen,
3648 			    0, "txq generation");
3649 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_cidx",
3650 			    CTLFLAG_RD, &txq->cidx,
3651 			    0, "hardware queue cidx");
3652 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_pidx",
3653 			    CTLFLAG_RD, &txq->pidx,
3654 			    0, "hardware queue pidx");
3655 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_start",
3656 			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_start,
3657 			    0, "txq start idx for dump");
3658 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_count",
3659 			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_count,
3660 			    0, "txq #entries to dump");
3661 			SYSCTL_ADD_PROC(ctx, txqpoidlist, OID_AUTO, "qdump",
3662 			    CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_ETH],
3663 			    0, t3_dump_txq_eth, "A", "dump of the transmit queue");
3664 
3665 			SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_start",
3666 			    CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_start,
3667 			    0, "ctrlq start idx for dump");
3668 			SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_count",
3669 			    CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_count,
3670 			    0, "ctrl #entries to dump");
3671 			SYSCTL_ADD_PROC(ctx, ctrlqpoidlist, OID_AUTO, "qdump",
3672 			    CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_CTRL],
3673 			    0, t3_dump_txq_ctrl, "A", "dump of the control queue");
3674 
3675 			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_queued",
3676 			    CTLFLAG_RD, &qs->lro.ctrl.lro_queued, 0, "#packets queued for LRO");
3677 			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_flushed",
3678 			    CTLFLAG_RD, &qs->lro.ctrl.lro_flushed, 0, "#LRO aggregated packets flushed");
3679 			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_bad_csum",
3680 			    CTLFLAG_RD, &qs->lro.ctrl.lro_bad_csum, 0, "#packets with bad checksum rejected by LRO");
3681 			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_cnt",
3682 			    CTLFLAG_RD, &qs->lro.ctrl.lro_cnt, 0, "#LRO entries allocated");
3683 		}
3684 
3685 		/* Now add a node for mac stats. */
3686 		poid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO, "mac_stats",
3687 		    CTLFLAG_RD, NULL, "MAC statistics");
3688 		poidlist = SYSCTL_CHILDREN(poid);
3689 
3690 		/*
3691 		 * We (ab)use the length argument (arg2) to pass on the offset
3692 		 * of the data that we are interested in.  This is only required
3693 		 * for the quad counters that are updated from the hardware (we
3694 		 * make sure that we return the latest value).
3695 		 * sysctl_handle_macstat first updates *all* the counters from
3696 		 * the hardware, and then returns the latest value of the
3697 		 * requested counter.  It would be better to update only the
3698 		 * requested counter from hardware, but t3_mac_update_stats()
3699 		 * hides all the register details and we don't want to dive into
3700 		 * all that here.
3701 		 */
3702 #define CXGB_SYSCTL_ADD_QUAD(a)	SYSCTL_ADD_OID(ctx, poidlist, OID_AUTO, #a, \
3703     (CTLTYPE_QUAD | CTLFLAG_RD), pi, offsetof(struct mac_stats, a), \
3704     sysctl_handle_macstat, "QU", 0)
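		/*
		 * For reference, CXGB_SYSCTL_ADD_QUAD(tx_octets) expands to
		 *
		 *	SYSCTL_ADD_OID(ctx, poidlist, OID_AUTO, "tx_octets",
		 *	    (CTLTYPE_QUAD | CTLFLAG_RD), pi,
		 *	    offsetof(struct mac_stats, tx_octets),
		 *	    sysctl_handle_macstat, "QU", 0);
		 *
		 * i.e. arg1 is the port and arg2 is the offset of the counter
		 * within struct mac_stats.
		 */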
3705 		CXGB_SYSCTL_ADD_QUAD(tx_octets);
3706 		CXGB_SYSCTL_ADD_QUAD(tx_octets_bad);
3707 		CXGB_SYSCTL_ADD_QUAD(tx_frames);
3708 		CXGB_SYSCTL_ADD_QUAD(tx_mcast_frames);
3709 		CXGB_SYSCTL_ADD_QUAD(tx_bcast_frames);
3710 		CXGB_SYSCTL_ADD_QUAD(tx_pause);
3711 		CXGB_SYSCTL_ADD_QUAD(tx_deferred);
3712 		CXGB_SYSCTL_ADD_QUAD(tx_late_collisions);
3713 		CXGB_SYSCTL_ADD_QUAD(tx_total_collisions);
3714 		CXGB_SYSCTL_ADD_QUAD(tx_excess_collisions);
3715 		CXGB_SYSCTL_ADD_QUAD(tx_underrun);
3716 		CXGB_SYSCTL_ADD_QUAD(tx_len_errs);
3717 		CXGB_SYSCTL_ADD_QUAD(tx_mac_internal_errs);
3718 		CXGB_SYSCTL_ADD_QUAD(tx_excess_deferral);
3719 		CXGB_SYSCTL_ADD_QUAD(tx_fcs_errs);
3720 		CXGB_SYSCTL_ADD_QUAD(tx_frames_64);
3721 		CXGB_SYSCTL_ADD_QUAD(tx_frames_65_127);
3722 		CXGB_SYSCTL_ADD_QUAD(tx_frames_128_255);
3723 		CXGB_SYSCTL_ADD_QUAD(tx_frames_256_511);
3724 		CXGB_SYSCTL_ADD_QUAD(tx_frames_512_1023);
3725 		CXGB_SYSCTL_ADD_QUAD(tx_frames_1024_1518);
3726 		CXGB_SYSCTL_ADD_QUAD(tx_frames_1519_max);
3727 		CXGB_SYSCTL_ADD_QUAD(rx_octets);
3728 		CXGB_SYSCTL_ADD_QUAD(rx_octets_bad);
3729 		CXGB_SYSCTL_ADD_QUAD(rx_frames);
3730 		CXGB_SYSCTL_ADD_QUAD(rx_mcast_frames);
3731 		CXGB_SYSCTL_ADD_QUAD(rx_bcast_frames);
3732 		CXGB_SYSCTL_ADD_QUAD(rx_pause);
3733 		CXGB_SYSCTL_ADD_QUAD(rx_fcs_errs);
3734 		CXGB_SYSCTL_ADD_QUAD(rx_align_errs);
3735 		CXGB_SYSCTL_ADD_QUAD(rx_symbol_errs);
3736 		CXGB_SYSCTL_ADD_QUAD(rx_data_errs);
3737 		CXGB_SYSCTL_ADD_QUAD(rx_sequence_errs);
3738 		CXGB_SYSCTL_ADD_QUAD(rx_runt);
3739 		CXGB_SYSCTL_ADD_QUAD(rx_jabber);
3740 		CXGB_SYSCTL_ADD_QUAD(rx_short);
3741 		CXGB_SYSCTL_ADD_QUAD(rx_too_long);
3742 		CXGB_SYSCTL_ADD_QUAD(rx_mac_internal_errs);
3743 		CXGB_SYSCTL_ADD_QUAD(rx_cong_drops);
3744 		CXGB_SYSCTL_ADD_QUAD(rx_frames_64);
3745 		CXGB_SYSCTL_ADD_QUAD(rx_frames_65_127);
3746 		CXGB_SYSCTL_ADD_QUAD(rx_frames_128_255);
3747 		CXGB_SYSCTL_ADD_QUAD(rx_frames_256_511);
3748 		CXGB_SYSCTL_ADD_QUAD(rx_frames_512_1023);
3749 		CXGB_SYSCTL_ADD_QUAD(rx_frames_1024_1518);
3750 		CXGB_SYSCTL_ADD_QUAD(rx_frames_1519_max);
3751 #undef CXGB_SYSCTL_ADD_QUAD
3752 
3753 #define CXGB_SYSCTL_ADD_ULONG(a) SYSCTL_ADD_ULONG(ctx, poidlist, OID_AUTO, #a, \
3754     CTLFLAG_RD, &mstats->a, 0)
3755 		CXGB_SYSCTL_ADD_ULONG(tx_fifo_parity_err);
3756 		CXGB_SYSCTL_ADD_ULONG(rx_fifo_parity_err);
3757 		CXGB_SYSCTL_ADD_ULONG(tx_fifo_urun);
3758 		CXGB_SYSCTL_ADD_ULONG(rx_fifo_ovfl);
3759 		CXGB_SYSCTL_ADD_ULONG(serdes_signal_loss);
3760 		CXGB_SYSCTL_ADD_ULONG(xaui_pcs_ctc_err);
3761 		CXGB_SYSCTL_ADD_ULONG(xaui_pcs_align_change);
3762 		CXGB_SYSCTL_ADD_ULONG(num_toggled);
3763 		CXGB_SYSCTL_ADD_ULONG(num_resets);
3764 		CXGB_SYSCTL_ADD_ULONG(link_faults);
3765 #undef CXGB_SYSCTL_ADD_ULONG
3766 	}
3767 }
3768 
3769 /**
3770  *	t3_get_desc - dump an SGE descriptor for debugging purposes
3771  *	@qs: the queue set
3772  *	@qnum: identifies the specific queue (0..2: Tx, 3:response, 4..5: Rx)
3773  *	@idx: the descriptor index in the queue
3774  *	@data: where to dump the descriptor contents
3775  *
3776  *	Dumps the contents of a HW descriptor of an SGE queue.  Returns the
3777  *	size of the descriptor.
3778  */
3779 int
3780 t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx,
3781 		unsigned char *data)
3782 {
3783 	if (qnum >= 6)
3784 		return (EINVAL);
3785 
3786 	if (qnum < 3) {
3787 		if (!qs->txq[qnum].desc || idx >= qs->txq[qnum].size)
3788 			return (EINVAL);
3789 		memcpy(data, &qs->txq[qnum].desc[idx], sizeof(struct tx_desc));
3790 		return (sizeof(struct tx_desc));
3791 	}
3792 
3793 	if (qnum == 3) {
3794 		if (!qs->rspq.desc || idx >= qs->rspq.size)
3795 			return (EINVAL);
3796 		memcpy(data, &qs->rspq.desc[idx], sizeof(struct rsp_desc));
3797 		return (sizeof(struct rsp_desc));
3798 	}
3799 
3800 	qnum -= 4;
3801 	if (!qs->fl[qnum].desc || idx >= qs->fl[qnum].size)
3802 		return (EINVAL);
3803 	memcpy(data, &qs->fl[qnum].desc[idx], sizeof(struct rx_desc));
3804 	return (sizeof(struct rx_desc));
3805 }
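
/*
 * Usage sketch (hypothetical caller, for illustration only): dump the
 * first descriptor of queue set 0's Ethernet Tx queue (qnum 0, i.e.
 * TXQ_ETH), assuming "sc" is the adapter softc:
 *
 *	unsigned char buf[sizeof(struct tx_desc)];
 *	int len;
 *
 *	len = t3_get_desc(&sc->sge.qs[0], TXQ_ETH, 0, buf);
 *	if (len == sizeof(struct tx_desc))
 *		hexdump(buf, len, NULL, 0);
 *
 * A failing lookup returns EINVAL rather than a descriptor size.
 */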
3806