xref: /freebsd/sys/dev/cxgb/cxgb_sge.c (revision ce6a89e27cd190313be39bb479880aeda4778436)
1 /**************************************************************************
2 SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 
4 Copyright (c) 2007-2009, Chelsio Inc.
5 All rights reserved.
6 
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions are met:
9 
10  1. Redistributions of source code must retain the above copyright notice,
11     this list of conditions and the following disclaimer.
12 
13  2. Neither the name of the Chelsio Corporation nor the names of its
14     contributors may be used to endorse or promote products derived from
15     this software without specific prior written permission.
16 
17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 POSSIBILITY OF SUCH DAMAGE.
28 
29 ***************************************************************************/
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include "opt_inet6.h"
35 #include "opt_inet.h"
36 
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/module.h>
41 #include <sys/bus.h>
42 #include <sys/conf.h>
43 #include <machine/bus.h>
44 #include <machine/resource.h>
45 #include <sys/rman.h>
46 #include <sys/queue.h>
47 #include <sys/sysctl.h>
48 #include <sys/taskqueue.h>
49 
50 #include <sys/proc.h>
51 #include <sys/sbuf.h>
52 #include <sys/sched.h>
53 #include <sys/smp.h>
54 #include <sys/systm.h>
55 #include <sys/syslog.h>
56 #include <sys/socket.h>
57 #include <sys/sglist.h>
58 
59 #include <net/if.h>
60 #include <net/if_var.h>
61 #include <net/bpf.h>
62 #include <net/ethernet.h>
63 #include <net/if_vlan_var.h>
64 
65 #include <netinet/in_systm.h>
66 #include <netinet/in.h>
67 #include <netinet/ip.h>
68 #include <netinet/ip6.h>
69 #include <netinet/tcp.h>
70 
71 #include <dev/pci/pcireg.h>
72 #include <dev/pci/pcivar.h>
73 
74 #include <vm/vm.h>
75 #include <vm/pmap.h>
76 
77 #include <cxgb_include.h>
78 #include <sys/mvec.h>
79 
80 int	txq_fills = 0;
81 int	multiq_tx_enable = 1;
82 
83 #ifdef TCP_OFFLOAD
84 CTASSERT(NUM_CPL_HANDLERS >= NUM_CPL_CMDS);
85 #endif
86 
87 extern struct sysctl_oid_list sysctl__hw_cxgb_children;
88 int cxgb_txq_buf_ring_size = TX_ETH_Q_SIZE;
89 SYSCTL_INT(_hw_cxgb, OID_AUTO, txq_mr_size, CTLFLAG_RDTUN, &cxgb_txq_buf_ring_size, 0,
90     "size of per-queue mbuf ring");
91 
92 static int cxgb_tx_coalesce_force = 0;
93 SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_force, CTLFLAG_RWTUN,
94     &cxgb_tx_coalesce_force, 0,
95     "coalesce small packets into a single work request regardless of ring state");
96 
97 #define	COALESCE_START_DEFAULT		(TX_ETH_Q_SIZE>>1)
98 #define	COALESCE_START_MAX		(TX_ETH_Q_SIZE-(TX_ETH_Q_SIZE>>3))
99 #define	COALESCE_STOP_DEFAULT		(TX_ETH_Q_SIZE>>2)
100 #define	COALESCE_STOP_MIN		(TX_ETH_Q_SIZE>>5)
101 #define	TX_RECLAIM_DEFAULT		(TX_ETH_Q_SIZE>>5)
102 #define	TX_RECLAIM_MAX			(TX_ETH_Q_SIZE>>2)
103 #define	TX_RECLAIM_MIN			(TX_ETH_Q_SIZE>>6)
104 
105 
106 static int cxgb_tx_coalesce_enable_start = COALESCE_START_DEFAULT;
107 SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_start, CTLFLAG_RWTUN,
108     &cxgb_tx_coalesce_enable_start, 0,
109     "coalesce enable threshold");
110 static int cxgb_tx_coalesce_enable_stop = COALESCE_STOP_DEFAULT;
111 SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_stop, CTLFLAG_RWTUN,
112     &cxgb_tx_coalesce_enable_stop, 0,
113     "coalesce disable threshold");
114 static int cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT;
115 SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_reclaim_threshold, CTLFLAG_RWTUN,
116     &cxgb_tx_reclaim_threshold, 0,
117     "tx cleaning minimum threshold");
118 
119 /*
120  * XXX don't re-enable this until TOE stops assuming
121  * we have an m_ext
122  */
123 static int recycle_enable = 0;
124 
125 extern int cxgb_use_16k_clusters;
126 extern int nmbjumbop;
127 extern int nmbjumbo9;
128 extern int nmbjumbo16;
129 
130 #define USE_GTS 0
131 
132 #define SGE_RX_SM_BUF_SIZE	1536
133 #define SGE_RX_DROP_THRES	16
134 #define SGE_RX_COPY_THRES	128
135 
136 /*
137  * Period of the Tx buffer reclaim timer.  This timer does not need to run
138  * frequently as Tx buffers are usually reclaimed by new Tx packets.
139  */
140 #define TX_RECLAIM_PERIOD       (hz >> 1)
141 
142 /*
143  * Values for sge_txq.flags
144  */
145 enum {
146 	TXQ_RUNNING	= 1 << 0,  /* fetch engine is running */
147 	TXQ_LAST_PKT_DB = 1 << 1,  /* last packet rang the doorbell */
148 };
149 
150 struct tx_desc {
151 	uint64_t	flit[TX_DESC_FLITS];
152 } __packed;
153 
154 struct rx_desc {
155 	uint32_t	addr_lo;
156 	uint32_t	len_gen;
157 	uint32_t	gen2;
158 	uint32_t	addr_hi;
159 } __packed;
160 
161 struct rsp_desc {               /* response queue descriptor */
162 	struct rss_header	rss_hdr;
163 	uint32_t		flags;
164 	uint32_t		len_cq;
165 	uint8_t			imm_data[47];
166 	uint8_t			intr_gen;
167 } __packed;
168 
169 #define RX_SW_DESC_MAP_CREATED	(1 << 0)
170 #define TX_SW_DESC_MAP_CREATED	(1 << 1)
171 #define RX_SW_DESC_INUSE        (1 << 3)
172 #define TX_SW_DESC_MAPPED       (1 << 4)
173 
174 #define RSPQ_NSOP_NEOP           G_RSPD_SOP_EOP(0)
175 #define RSPQ_EOP                 G_RSPD_SOP_EOP(F_RSPD_EOP)
176 #define RSPQ_SOP                 G_RSPD_SOP_EOP(F_RSPD_SOP)
177 #define RSPQ_SOP_EOP             G_RSPD_SOP_EOP(F_RSPD_SOP|F_RSPD_EOP)
178 
179 struct tx_sw_desc {                /* SW state per Tx descriptor */
180 	struct mbuf	*m;
181 	bus_dmamap_t	map;
182 	int		flags;
183 };
184 
185 struct rx_sw_desc {                /* SW state per Rx descriptor */
186 	caddr_t		rxsd_cl;
187 	struct mbuf	*m;
188 	bus_dmamap_t	map;
189 	int		flags;
190 };
191 
192 struct txq_state {
193 	unsigned int	compl;
194 	unsigned int	gen;
195 	unsigned int	pidx;
196 };
197 
198 struct refill_fl_cb_arg {
199 	int               error;
200 	bus_dma_segment_t seg;
201 	int               nseg;
202 };
203 
204 
205 /*
206  * Maps a number of flits to the number of Tx descriptors that can hold them.
207  * The formula is
208  *
209  * desc = 1 + (flits - 2) / (WR_FLITS - 1).
210  *
211  * HW allows up to 4 descriptors to be combined into a WR.
212  */
213 static uint8_t flit_desc_map[] = {
214 	0,
215 #if SGE_NUM_GENBITS == 1
216 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
217 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
218 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
219 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
220 #elif SGE_NUM_GENBITS == 2
221 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
222 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
223 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
224 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
225 #else
226 # error "SGE_NUM_GENBITS must be 1 or 2"
227 #endif
228 };
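
/*
 * Worked example (illustration only): with SGE_NUM_GENBITS == 2 the table
 * above implies WR_FLITS == 15, so applying the formula from the comment,
 *
 *	desc = 1 + (flits - 2) / (WR_FLITS - 1)
 *
 * a WR of 15 flits needs 1 + 13/14 = 1 descriptor while 16 flits need
 * 1 + 14/14 = 2 descriptors -- exactly the transition encoded at
 * flit_desc_map[15] and flit_desc_map[16] above.
 */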
229 
230 #define	TXQ_LOCK_ASSERT(qs)	mtx_assert(&(qs)->lock, MA_OWNED)
231 #define	TXQ_TRYLOCK(qs)		mtx_trylock(&(qs)->lock)
232 #define	TXQ_LOCK(qs)		mtx_lock(&(qs)->lock)
233 #define	TXQ_UNLOCK(qs)		mtx_unlock(&(qs)->lock)
234 #define	TXQ_RING_EMPTY(qs)	drbr_empty((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
235 #define	TXQ_RING_NEEDS_ENQUEUE(qs)					\
236 	drbr_needs_enqueue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
237 #define	TXQ_RING_FLUSH(qs)	drbr_flush((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
238 #define	TXQ_RING_DEQUEUE_COND(qs, func, arg)				\
239 	drbr_dequeue_cond((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr, func, arg)
240 #define	TXQ_RING_DEQUEUE(qs) \
241 	drbr_dequeue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
242 
243 int cxgb_debug = 0;
244 
245 static void sge_timer_cb(void *arg);
246 static void sge_timer_reclaim(void *arg, int ncount);
247 static void sge_txq_reclaim_handler(void *arg, int ncount);
248 static void cxgb_start_locked(struct sge_qset *qs);
249 
250 /*
251  * XXX need to cope with bursty scheduling by looking at a wider
252  * window than we do now when determining the need for coalescing
253  *
254  */
255 static __inline uint64_t
256 check_pkt_coalesce(struct sge_qset *qs)
257 {
258         struct adapter *sc;
259         struct sge_txq *txq;
260 	uint8_t *fill;
261 
262 	if (__predict_false(cxgb_tx_coalesce_force))
263 		return (1);
264 	txq = &qs->txq[TXQ_ETH];
265         sc = qs->port->adapter;
266 	fill = &sc->tunq_fill[qs->idx];
267 
268 	if (cxgb_tx_coalesce_enable_start > COALESCE_START_MAX)
269 		cxgb_tx_coalesce_enable_start = COALESCE_START_MAX;
270 	if (cxgb_tx_coalesce_enable_stop < COALESCE_STOP_MIN)
271 		cxgb_tx_coalesce_enable_stop = COALESCE_STOP_MIN;
272 	/*
273 	 * If the hardware transmit queue is fuller than the enable-start
274 	 * threshold we mark it as coalescing; we drop back from coalescing
275 	 * when we go below the enable-stop threshold and there are no
276 	 * packets enqueued.  This provides us with some degree of hysteresis.
277 	 */
278         if (*fill != 0 && (txq->in_use <= cxgb_tx_coalesce_enable_stop) &&
279 	    TXQ_RING_EMPTY(qs) && (qs->coalescing == 0))
280                 *fill = 0;
281         else if (*fill == 0 && (txq->in_use >= cxgb_tx_coalesce_enable_start))
282                 *fill = 1;
283 
284 	return (sc->tunq_coalesce);
285 }
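
/*
 * Illustration only, assuming a hypothetical TX_ETH_Q_SIZE of 1024: the
 * default thresholds above set the per-queue-set fill flag once the
 * hardware queue has 512 or more descriptors in use
 * (COALESCE_START_DEFAULT) and clear it again only after in_use drops to
 * 256 or fewer (COALESCE_STOP_DEFAULT) with an empty software ring and no
 * coalescing in progress; the gap between the two thresholds is the
 * hysteresis described in the comment above.
 */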
286 
287 #ifdef __LP64__
288 static void
289 set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo)
290 {
291 	uint64_t wr_hilo;
292 #if _BYTE_ORDER == _LITTLE_ENDIAN
293 	wr_hilo = wr_hi;
294 	wr_hilo |= (((uint64_t)wr_lo)<<32);
295 #else
296 	wr_hilo = wr_lo;
297 	wr_hilo |= (((uint64_t)wr_hi)<<32);
298 #endif
299 	wrp->wrh_hilo = wr_hilo;
300 }
301 #else
302 static void
303 set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo)
304 {
305 
306 	wrp->wrh_hi = wr_hi;
307 	wmb();
308 	wrp->wrh_lo = wr_lo;
309 }
310 #endif
311 
312 struct coalesce_info {
313 	int count;
314 	int nbytes;
315 };
316 
317 static int
318 coalesce_check(struct mbuf *m, void *arg)
319 {
320 	struct coalesce_info *ci = arg;
321 	int *count = &ci->count;
322 	int *nbytes = &ci->nbytes;
323 
324 	if ((*nbytes == 0) || ((*nbytes + m->m_len <= 10500) &&
325 		(*count < 7) && (m->m_next == NULL))) {
326 		*count += 1;
327 		*nbytes += m->m_len;
328 		return (1);
329 	}
330 	return (0);
331 }
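
/*
 * For context (hedged inference, not stated elsewhere in this file):
 * coalesce_check() admits at most 7 single-mbuf packets totalling no more
 * than 10500 bytes into one batch.  This appears to match the
 * CPL_TX_PKT_BATCH work request built in t3_encap(), which carries one
 * header flit plus two flits per packet and therefore tops out at
 * 7 * 2 + 1 = 15 flits, which still maps to a single Tx descriptor
 * (see flit_desc_map above).
 */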
332 
333 static struct mbuf *
334 cxgb_dequeue(struct sge_qset *qs)
335 {
336 	struct mbuf *m, *m_head, *m_tail;
337 	struct coalesce_info ci;
338 
339 
340 	if (check_pkt_coalesce(qs) == 0)
341 		return TXQ_RING_DEQUEUE(qs);
342 
343 	m_head = m_tail = NULL;
344 	ci.count = ci.nbytes = 0;
345 	do {
346 		m = TXQ_RING_DEQUEUE_COND(qs, coalesce_check, &ci);
347 		if (m_head == NULL) {
348 			m_tail = m_head = m;
349 		} else if (m != NULL) {
350 			m_tail->m_nextpkt = m;
351 			m_tail = m;
352 		}
353 	} while (m != NULL);
354 	if (ci.count > 7)
355 		panic("trying to coalesce %d packets into one WR", ci.count);
356 	return (m_head);
357 }
358 
359 /**
360  *	reclaim_completed_tx - reclaims completed Tx descriptors
361  *	@qs: the queue set that owns the Tx queue
362  *	@reclaim_min: do nothing unless at least this many descriptors are reclaimable
363  *	@queue: which Tx queue within the set (TXQ_ETH, TXQ_OFLD, or TXQ_CTRL)
363  *
364  *	Reclaims Tx descriptors that the SGE has indicated it has processed,
365  *	and frees the associated buffers if possible.  Called with the Tx
366  *	queue's lock held.
367  */
368 static __inline int
369 reclaim_completed_tx(struct sge_qset *qs, int reclaim_min, int queue)
370 {
371 	struct sge_txq *q = &qs->txq[queue];
372 	int reclaim = desc_reclaimable(q);
373 
374 	if ((cxgb_tx_reclaim_threshold > TX_RECLAIM_MAX) ||
375 	    (cxgb_tx_reclaim_threshold < TX_RECLAIM_MIN))
376 		cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT;
377 
378 	if (reclaim < reclaim_min)
379 		return (0);
380 
381 	mtx_assert(&qs->lock, MA_OWNED);
382 	if (reclaim > 0) {
383 		t3_free_tx_desc(qs, reclaim, queue);
384 		q->cleaned += reclaim;
385 		q->in_use -= reclaim;
386 	}
387 	if (isset(&qs->txq_stopped, TXQ_ETH))
388                 clrbit(&qs->txq_stopped, TXQ_ETH);
389 
390 	return (reclaim);
391 }
392 
393 #ifdef DEBUGNET
394 int
395 cxgb_debugnet_poll_tx(struct sge_qset *qs)
396 {
397 
398 	return (reclaim_completed_tx(qs, TX_RECLAIM_MAX, TXQ_ETH));
399 }
400 #endif
401 
402 /**
403  *	should_restart_tx - are there enough resources to restart a Tx queue?
404  *	@q: the Tx queue
405  *
406  *	Checks if there are enough descriptors to restart a suspended Tx queue.
407  */
408 static __inline int
409 should_restart_tx(const struct sge_txq *q)
410 {
411 	unsigned int r = q->processed - q->cleaned;
412 
413 	return q->in_use - r < (q->size >> 1);
414 }
415 
416 /**
417  *	t3_sge_init - initialize SGE
418  *	@adap: the adapter
419  *	@p: the SGE parameters
420  *
421  *	Performs SGE initialization needed every time after a chip reset.
422  *	We do not initialize any of the queue sets here, instead the driver
423  *	top-level must request those individually.  We also do not enable DMA
424  *	here, that should be done after the queues have been set up.
425  */
426 void
427 t3_sge_init(adapter_t *adap, struct sge_params *p)
428 {
429 	u_int ctrl, ups;
430 
431 	ups = 0; /* = ffs(pci_resource_len(adap->pdev, 2) >> 12); */
432 
433 	ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL |
434 	       F_CQCRDTCTRL | F_CONGMODE | F_TNLFLMODE | F_FATLPERREN |
435 	       V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS |
436 	       V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING;
437 #if SGE_NUM_GENBITS == 1
438 	ctrl |= F_EGRGENCTRL;
439 #endif
440 	if (adap->params.rev > 0) {
441 		if (!(adap->flags & (USING_MSIX | USING_MSI)))
442 			ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ;
443 	}
444 	t3_write_reg(adap, A_SG_CONTROL, ctrl);
445 	t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) |
446 		     V_LORCQDRBTHRSH(512));
447 	t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10);
448 	t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) |
449 		     V_TIMEOUT(200 * core_ticks_per_usec(adap)));
450 	t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH,
451 		     adap->params.rev < T3_REV_C ? 1000 : 500);
452 	t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256);
453 	t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000);
454 	t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256);
455 	t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff));
456 	t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024);
457 }
458 
459 
460 /**
461  *	sgl_len - calculates the size of an SGL of the given capacity
462  *	@n: the number of SGL entries
463  *
464  *	Calculates the number of flits needed for a scatter/gather list that
465  *	can hold the given number of entries.
466  */
467 static __inline unsigned int
468 sgl_len(unsigned int n)
469 {
470 	return ((3 * n) / 2 + (n & 1));
471 }
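
/*
 * Worked example (illustration only): each pair of SGL entries packs into
 * 3 flits (two 64-bit addresses plus two 32-bit lengths = 24 bytes), and a
 * trailing odd entry needs 2 more flits, hence 3n/2 + (n & 1).
 * For n = 3 segments: (3 * 3) / 2 + 1 = 5 flits.
 */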
472 
473 /**
474  *	get_imm_packet - return the next ingress packet buffer from a response
475  *	@resp: the response descriptor containing the packet data
476  *
477  *	Return a packet containing the immediate data of the given response.
478  */
479 static int
480 get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct mbuf *m)
481 {
482 
483 	if (resp->rss_hdr.opcode == CPL_RX_DATA) {
484 		const struct cpl_rx_data *cpl = (const void *)&resp->imm_data[0];
485 		m->m_len = sizeof(*cpl) + ntohs(cpl->len);
486 	} else if (resp->rss_hdr.opcode == CPL_RX_PKT) {
487 		const struct cpl_rx_pkt *cpl = (const void *)&resp->imm_data[0];
488 		m->m_len = sizeof(*cpl) + ntohs(cpl->len);
489 	} else
490 		m->m_len = IMMED_PKT_SIZE;
491 	m->m_ext.ext_buf = NULL;
492 	m->m_ext.ext_type = 0;
493 	memcpy(mtod(m, uint8_t *), resp->imm_data, m->m_len);
494 	return (0);
495 }
496 
497 static __inline u_int
498 flits_to_desc(u_int n)
499 {
500 	return (flit_desc_map[n]);
501 }
502 
503 #define SGE_PARERR (F_CPPARITYERROR | F_OCPARITYERROR | F_RCPARITYERROR | \
504 		    F_IRPARITYERROR | V_ITPARITYERROR(M_ITPARITYERROR) | \
505 		    V_FLPARITYERROR(M_FLPARITYERROR) | F_LODRBPARITYERROR | \
506 		    F_HIDRBPARITYERROR | F_LORCQPARITYERROR | \
507 		    F_HIRCQPARITYERROR)
508 #define SGE_FRAMINGERR (F_UC_REQ_FRAMINGERROR | F_R_REQ_FRAMINGERROR)
509 #define SGE_FATALERR (SGE_PARERR | SGE_FRAMINGERR | F_RSPQCREDITOVERFOW | \
510 		      F_RSPQDISABLED)
511 
512 /**
513  *	t3_sge_err_intr_handler - SGE async event interrupt handler
514  *	@adapter: the adapter
515  *
516  *	Interrupt handler for SGE asynchronous (non-data) events.
517  */
518 void
519 t3_sge_err_intr_handler(adapter_t *adapter)
520 {
521 	unsigned int v, status;
522 
523 	status = t3_read_reg(adapter, A_SG_INT_CAUSE);
524 	if (status & SGE_PARERR)
525 		CH_ALERT(adapter, "SGE parity error (0x%x)\n",
526 			 status & SGE_PARERR);
527 	if (status & SGE_FRAMINGERR)
528 		CH_ALERT(adapter, "SGE framing error (0x%x)\n",
529 			 status & SGE_FRAMINGERR);
530 	if (status & F_RSPQCREDITOVERFOW)
531 		CH_ALERT(adapter, "SGE response queue credit overflow\n");
532 
533 	if (status & F_RSPQDISABLED) {
534 		v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS);
535 
536 		CH_ALERT(adapter,
537 			 "packet delivered to disabled response queue (0x%x)\n",
538 			 (v >> S_RSPQ0DISABLED) & 0xff);
539 	}
540 
541 	t3_write_reg(adapter, A_SG_INT_CAUSE, status);
542 	if (status & SGE_FATALERR)
543 		t3_fatal_err(adapter);
544 }
545 
546 void
547 t3_sge_prep(adapter_t *adap, struct sge_params *p)
548 {
549 	int i, nqsets, fl_q_size, jumbo_q_size, use_16k, jumbo_buf_size;
550 
551 	nqsets = min(SGE_QSETS / adap->params.nports, mp_ncpus);
552 	nqsets *= adap->params.nports;
553 
554 	fl_q_size = min(nmbclusters/(3*nqsets), FL_Q_SIZE);
555 
556 	while (!powerof2(fl_q_size))
557 		fl_q_size--;
558 
559 	use_16k = cxgb_use_16k_clusters != -1 ? cxgb_use_16k_clusters :
560 	    is_offload(adap);
561 
562 	if (use_16k) {
563 		jumbo_q_size = min(nmbjumbo16/(3*nqsets), JUMBO_Q_SIZE);
564 		jumbo_buf_size = MJUM16BYTES;
565 	} else {
566 		jumbo_q_size = min(nmbjumbo9/(3*nqsets), JUMBO_Q_SIZE);
567 		jumbo_buf_size = MJUM9BYTES;
568 	}
569 	while (!powerof2(jumbo_q_size))
570 		jumbo_q_size--;
571 
572 	if (fl_q_size < (FL_Q_SIZE / 4) || jumbo_q_size < (JUMBO_Q_SIZE / 2))
573 		device_printf(adap->dev,
574 		    "Insufficient clusters and/or jumbo buffers.\n");
575 
576 	p->max_pkt_size = jumbo_buf_size - sizeof(struct cpl_rx_data);
577 
578 	for (i = 0; i < SGE_QSETS; ++i) {
579 		struct qset_params *q = p->qset + i;
580 
581 		if (adap->params.nports > 2) {
582 			q->coalesce_usecs = 50;
583 		} else {
584 #ifdef INVARIANTS
585 			q->coalesce_usecs = 10;
586 #else
587 			q->coalesce_usecs = 5;
588 #endif
589 		}
590 		q->polling = 0;
591 		q->rspq_size = RSPQ_Q_SIZE;
592 		q->fl_size = fl_q_size;
593 		q->jumbo_size = jumbo_q_size;
594 		q->jumbo_buf_size = jumbo_buf_size;
595 		q->txq_size[TXQ_ETH] = TX_ETH_Q_SIZE;
596 		q->txq_size[TXQ_OFLD] = is_offload(adap) ? TX_OFLD_Q_SIZE : 16;
597 		q->txq_size[TXQ_CTRL] = TX_CTRL_Q_SIZE;
598 		q->cong_thres = 0;
599 	}
600 }
601 
602 int
603 t3_sge_alloc(adapter_t *sc)
604 {
605 
606 	/* The parent tag. */
607 	if (bus_dma_tag_create( bus_get_dma_tag(sc->dev),/* PCI parent */
608 				1, 0,			/* algnmnt, boundary */
609 				BUS_SPACE_MAXADDR,	/* lowaddr */
610 				BUS_SPACE_MAXADDR,	/* highaddr */
611 				NULL, NULL,		/* filter, filterarg */
612 				BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
613 				BUS_SPACE_UNRESTRICTED, /* nsegments */
614 				BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
615 				0,			/* flags */
616 				NULL, NULL,		/* lock, lockarg */
617 				&sc->parent_dmat)) {
618 		device_printf(sc->dev, "Cannot allocate parent DMA tag\n");
619 		return (ENOMEM);
620 	}
621 
622 	/*
623 	 * DMA tag for normal sized RX frames
624 	 */
625 	if (bus_dma_tag_create(sc->parent_dmat, MCLBYTES, 0, BUS_SPACE_MAXADDR,
626 		BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, 1,
627 		MCLBYTES, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_dmat)) {
628 		device_printf(sc->dev, "Cannot allocate RX DMA tag\n");
629 		return (ENOMEM);
630 	}
631 
632 	/*
633 	 * DMA tag for jumbo sized RX frames.
634 	 */
635 	if (bus_dma_tag_create(sc->parent_dmat, MJUM16BYTES, 0, BUS_SPACE_MAXADDR,
636 		BUS_SPACE_MAXADDR, NULL, NULL, MJUM16BYTES, 1, MJUM16BYTES,
637 		BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_jumbo_dmat)) {
638 		device_printf(sc->dev, "Cannot allocate RX jumbo DMA tag\n");
639 		return (ENOMEM);
640 	}
641 
642 	/*
643 	 * DMA tag for TX frames.
644 	 */
645 	if (bus_dma_tag_create(sc->parent_dmat, 1, 0, BUS_SPACE_MAXADDR,
646 		BUS_SPACE_MAXADDR, NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
647 		TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
648 		NULL, NULL, &sc->tx_dmat)) {
649 		device_printf(sc->dev, "Cannot allocate TX DMA tag\n");
650 		return (ENOMEM);
651 	}
652 
653 	return (0);
654 }
655 
656 int
657 t3_sge_free(struct adapter * sc)
658 {
659 
660 	if (sc->tx_dmat != NULL)
661 		bus_dma_tag_destroy(sc->tx_dmat);
662 
663 	if (sc->rx_jumbo_dmat != NULL)
664 		bus_dma_tag_destroy(sc->rx_jumbo_dmat);
665 
666 	if (sc->rx_dmat != NULL)
667 		bus_dma_tag_destroy(sc->rx_dmat);
668 
669 	if (sc->parent_dmat != NULL)
670 		bus_dma_tag_destroy(sc->parent_dmat);
671 
672 	return (0);
673 }
674 
675 void
676 t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p)
677 {
678 
679 	qs->rspq.holdoff_tmr = max(p->coalesce_usecs * 10, 1U);
680 	qs->rspq.polling = 0 /* p->polling */;
681 }
682 
683 #if !defined(__i386__) && !defined(__amd64__)
684 static void
685 refill_fl_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
686 {
687 	struct refill_fl_cb_arg *cb_arg = arg;
688 
689 	cb_arg->error = error;
690 	cb_arg->seg = segs[0];
691 	cb_arg->nseg = nseg;
692 
693 }
694 #endif
695 /**
696  *	refill_fl - refill an SGE free-buffer list
697  *	@sc: the controller softc
698  *	@q: the free-list to refill
699  *	@n: the number of new buffers to allocate
700  *
701  *	(Re)populate an SGE free-buffer list with up to @n new packet buffers.
702  *	The caller must assure that @n does not exceed the queue's capacity.
703  */
704 static void
705 refill_fl(adapter_t *sc, struct sge_fl *q, int n)
706 {
707 	struct rx_sw_desc *sd = &q->sdesc[q->pidx];
708 	struct rx_desc *d = &q->desc[q->pidx];
709 	struct refill_fl_cb_arg cb_arg;
710 	struct mbuf *m;
711 	caddr_t cl;
712 	int err;
713 
714 	cb_arg.error = 0;
715 	while (n--) {
716 		/*
717 		 * We allocate an uninitialized mbuf + cluster, mbuf is
718 		 * initialized after rx.
719 		 */
720 		if (q->zone == zone_pack) {
721 			if ((m = m_getcl(M_NOWAIT, MT_NOINIT, M_PKTHDR)) == NULL)
722 				break;
723 			cl = m->m_ext.ext_buf;
724 		} else {
725 			if ((cl = m_cljget(NULL, M_NOWAIT, q->buf_size)) == NULL)
726 				break;
727 			if ((m = m_gethdr(M_NOWAIT, MT_NOINIT)) == NULL) {
728 				uma_zfree(q->zone, cl);
729 				break;
730 			}
731 		}
732 		if ((sd->flags & RX_SW_DESC_MAP_CREATED) == 0) {
733 			if ((err = bus_dmamap_create(q->entry_tag, 0, &sd->map))) {
734 				log(LOG_WARNING, "bus_dmamap_create failed %d\n", err);
735 				uma_zfree(q->zone, cl);
736 				goto done;
737 			}
738 			sd->flags |= RX_SW_DESC_MAP_CREATED;
739 		}
740 #if !defined(__i386__) && !defined(__amd64__)
741 		err = bus_dmamap_load(q->entry_tag, sd->map,
742 		    cl, q->buf_size, refill_fl_cb, &cb_arg, 0);
743 
744 		if (err != 0 || cb_arg.error) {
745 			if (q->zone != zone_pack)
746 				uma_zfree(q->zone, cl);
747 			m_free(m);
748 			goto done;
749 		}
750 #else
751 		cb_arg.seg.ds_addr = pmap_kextract((vm_offset_t)cl);
752 #endif
753 		sd->flags |= RX_SW_DESC_INUSE;
754 		sd->rxsd_cl = cl;
755 		sd->m = m;
756 		d->addr_lo = htobe32(cb_arg.seg.ds_addr & 0xffffffff);
757 		d->addr_hi = htobe32(((uint64_t)cb_arg.seg.ds_addr >>32) & 0xffffffff);
758 		d->len_gen = htobe32(V_FLD_GEN1(q->gen));
759 		d->gen2 = htobe32(V_FLD_GEN2(q->gen));
760 
761 		d++;
762 		sd++;
763 
764 		if (++q->pidx == q->size) {
765 			q->pidx = 0;
766 			q->gen ^= 1;
767 			sd = q->sdesc;
768 			d = q->desc;
769 		}
770 		q->credits++;
771 		q->db_pending++;
772 	}
773 
774 done:
775 	if (q->db_pending >= 32) {
776 		q->db_pending = 0;
777 		t3_write_reg(sc, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
778 	}
779 }
780 
781 
782 /**
783  *	free_rx_bufs - free the Rx buffers on an SGE free list
784  *	@sc: the controller softc
785  *	@q: the SGE free list to clean up
786  *
787  *	Release the buffers on an SGE free-buffer Rx queue.  HW fetching from
788  *	this queue should be stopped before calling this function.
789  */
790 static void
791 free_rx_bufs(adapter_t *sc, struct sge_fl *q)
792 {
793 	u_int cidx = q->cidx;
794 
795 	while (q->credits--) {
796 		struct rx_sw_desc *d = &q->sdesc[cidx];
797 
798 		if (d->flags & RX_SW_DESC_INUSE) {
799 			bus_dmamap_unload(q->entry_tag, d->map);
800 			bus_dmamap_destroy(q->entry_tag, d->map);
801 			if (q->zone == zone_pack) {
802 				m_init(d->m, M_NOWAIT, MT_DATA, M_EXT);
803 				uma_zfree(zone_pack, d->m);
804 			} else {
805 				m_init(d->m, M_NOWAIT, MT_DATA, 0);
806 				uma_zfree(zone_mbuf, d->m);
807 				uma_zfree(q->zone, d->rxsd_cl);
808 			}
809 		}
810 
811 		d->rxsd_cl = NULL;
812 		d->m = NULL;
813 		if (++cidx == q->size)
814 			cidx = 0;
815 	}
816 }
817 
818 static __inline void
819 __refill_fl(adapter_t *adap, struct sge_fl *fl)
820 {
821 	refill_fl(adap, fl, min(16U, fl->size - fl->credits));
822 }
823 
824 static __inline void
825 __refill_fl_lt(adapter_t *adap, struct sge_fl *fl, int max)
826 {
827 	uint32_t reclaimable = fl->size - fl->credits;
828 
829 	if (reclaimable > 0)
830 		refill_fl(adap, fl, min(max, reclaimable));
831 }
832 
833 /**
834  *	recycle_rx_buf - recycle a receive buffer
835  *	@adapter: the adapter
836  *	@q: the SGE free list
837  *	@idx: index of buffer to recycle
838  *
839  *	Recycles the specified buffer on the given free list by adding it at
840  *	the next available slot on the list.
841  */
842 static void
843 recycle_rx_buf(adapter_t *adap, struct sge_fl *q, unsigned int idx)
844 {
845 	struct rx_desc *from = &q->desc[idx];
846 	struct rx_desc *to   = &q->desc[q->pidx];
847 
848 	q->sdesc[q->pidx] = q->sdesc[idx];
849 	to->addr_lo = from->addr_lo;        // already big endian
850 	to->addr_hi = from->addr_hi;        // likewise
851 	wmb();	/* necessary ? */
852 	to->len_gen = htobe32(V_FLD_GEN1(q->gen));
853 	to->gen2 = htobe32(V_FLD_GEN2(q->gen));
854 	q->credits++;
855 
856 	if (++q->pidx == q->size) {
857 		q->pidx = 0;
858 		q->gen ^= 1;
859 	}
860 	t3_write_reg(adap, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
861 }
862 
863 static void
864 alloc_ring_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
865 {
866 	uint32_t *addr;
867 
868 	addr = arg;
869 	*addr = segs[0].ds_addr;
870 }
871 
872 static int
873 alloc_ring(adapter_t *sc, size_t nelem, size_t elem_size, size_t sw_size,
874     bus_addr_t *phys, void *desc, void *sdesc, bus_dma_tag_t *tag,
875     bus_dmamap_t *map, bus_dma_tag_t parent_entry_tag, bus_dma_tag_t *entry_tag)
876 {
877 	size_t len = nelem * elem_size;
878 	void *s = NULL;
879 	void *p = NULL;
880 	int err;
881 
882 	if ((err = bus_dma_tag_create(sc->parent_dmat, PAGE_SIZE, 0,
883 				      BUS_SPACE_MAXADDR_32BIT,
884 				      BUS_SPACE_MAXADDR, NULL, NULL, len, 1,
885 				      len, 0, NULL, NULL, tag)) != 0) {
886 		device_printf(sc->dev, "Cannot allocate descriptor tag\n");
887 		return (ENOMEM);
888 	}
889 
890 	if ((err = bus_dmamem_alloc(*tag, (void **)&p, BUS_DMA_NOWAIT,
891 				    map)) != 0) {
892 		device_printf(sc->dev, "Cannot allocate descriptor memory\n");
893 		return (ENOMEM);
894 	}
895 
896 	bus_dmamap_load(*tag, *map, p, len, alloc_ring_cb, phys, 0);
897 	bzero(p, len);
898 	*(void **)desc = p;
899 
900 	if (sw_size) {
901 		len = nelem * sw_size;
902 		s = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO);
903 		*(void **)sdesc = s;
904 	}
905 	if (parent_entry_tag == NULL)
906 		return (0);
907 
908 	if ((err = bus_dma_tag_create(parent_entry_tag, 1, 0,
909 				      BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR,
910 		                      NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
911 				      TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
912 		                      NULL, NULL, entry_tag)) != 0) {
913 		device_printf(sc->dev, "Cannot allocate descriptor entry tag\n");
914 		return (ENOMEM);
915 	}
916 	return (0);
917 }
918 
919 static void
920 sge_slow_intr_handler(void *arg, int ncount)
921 {
922 	adapter_t *sc = arg;
923 
924 	t3_slow_intr_handler(sc);
925 	t3_write_reg(sc, A_PL_INT_ENABLE0, sc->slow_intr_mask);
926 	(void) t3_read_reg(sc, A_PL_INT_ENABLE0);
927 }
928 
929 /**
930  *	sge_timer_cb - perform periodic maintenance of an SGE qset
931  *	@data: the SGE queue set to maintain
932  *
933  *	Runs periodically from a timer to perform maintenance of an SGE queue
934  *	set.  It performs the following tasks:
935  *
936  *	a) Cleans up any completed Tx descriptors that may still be pending.
937  *	Normal descriptor cleanup happens when new packets are added to a Tx
938  *	queue so this timer is relatively infrequent and does any cleanup only
939  *	if the Tx queue has not seen any new packets in a while.  We make a
940  *	best effort attempt to reclaim descriptors, in that we don't wait
941  *	around if we cannot get a queue's lock (which most likely is because
942  *	someone else is queueing new packets and so will also handle the clean
943  *	up).  Since control queues use immediate data exclusively we don't
944  *	bother cleaning them up here.
945  *
946  *	b) Replenishes Rx queues that have run out due to memory shortage.
947  *	Normally new Rx buffers are added when existing ones are consumed but
948  *	when out of memory a queue can become empty.  We try to add only a few
949  *	buffers here, the queue will be replenished fully as these new buffers
950  *	are used up if memory shortage has subsided.
951  *
952  *	c) Returns coalesced response queue credits in case a response queue is
953  *	starved.
954  *
955  *	d) Rings doorbells for T304 tunnel queues since we have seen doorbell
956  *	FIFO overflows and the FW doesn't implement any recovery scheme yet.
957  */
958 static void
959 sge_timer_cb(void *arg)
960 {
961 	adapter_t *sc = arg;
962 	if ((sc->flags & USING_MSIX) == 0) {
963 
964 		struct port_info *pi;
965 		struct sge_qset *qs;
966 		struct sge_txq  *txq;
967 		int i, j;
968 		int reclaim_ofl, refill_rx;
969 
970 		if (sc->open_device_map == 0)
971 			return;
972 
973 		for (i = 0; i < sc->params.nports; i++) {
974 			pi = &sc->port[i];
975 			for (j = 0; j < pi->nqsets; j++) {
976 				qs = &sc->sge.qs[pi->first_qset + j];
977 				txq = &qs->txq[0];
978 				reclaim_ofl = txq[TXQ_OFLD].processed - txq[TXQ_OFLD].cleaned;
979 				refill_rx = ((qs->fl[0].credits < qs->fl[0].size) ||
980 				    (qs->fl[1].credits < qs->fl[1].size));
981 				if (reclaim_ofl || refill_rx) {
982 					taskqueue_enqueue(sc->tq, &pi->timer_reclaim_task);
983 					break;
984 				}
985 			}
986 		}
987 	}
988 
989 	if (sc->params.nports > 2) {
990 		int i;
991 
992 		for_each_port(sc, i) {
993 			struct port_info *pi = &sc->port[i];
994 
995 			t3_write_reg(sc, A_SG_KDOORBELL,
996 				     F_SELEGRCNTX |
997 				     (FW_TUNNEL_SGEEC_START + pi->first_qset));
998 		}
999 	}
1000 	if (((sc->flags & USING_MSIX) == 0 || sc->params.nports > 2) &&
1001 	    sc->open_device_map != 0)
1002 		callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
1003 }
1004 
1005 /*
1006  * This is meant to be a catch-all function to keep sge state private
1007  * to sge.c
1008  *
1009  */
1010 int
1011 t3_sge_init_adapter(adapter_t *sc)
1012 {
1013 	callout_init(&sc->sge_timer_ch, 1);
1014 	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
1015 	TASK_INIT(&sc->slow_intr_task, 0, sge_slow_intr_handler, sc);
1016 	return (0);
1017 }
1018 
1019 int
1020 t3_sge_reset_adapter(adapter_t *sc)
1021 {
1022 	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
1023 	return (0);
1024 }
1025 
1026 int
1027 t3_sge_init_port(struct port_info *pi)
1028 {
1029 	TASK_INIT(&pi->timer_reclaim_task, 0, sge_timer_reclaim, pi);
1030 	return (0);
1031 }
1032 
1033 /**
1034  *	refill_rspq - replenish an SGE response queue
1035  *	@adapter: the adapter
1036  *	@q: the response queue to replenish
1037  *	@credits: how many new responses to make available
1038  *
1039  *	Replenishes a response queue by making the supplied number of responses
1040  *	available to HW.
1041  */
1042 static __inline void
1043 refill_rspq(adapter_t *sc, const struct sge_rspq *q, u_int credits)
1044 {
1045 
1046 	/* mbufs are allocated on demand when a rspq entry is processed. */
1047 	t3_write_reg(sc, A_SG_RSPQ_CREDIT_RETURN,
1048 		     V_RSPQ(q->cntxt_id) | V_CREDITS(credits));
1049 }
1050 
1051 static void
1052 sge_txq_reclaim_handler(void *arg, int ncount)
1053 {
1054 	struct sge_qset *qs = arg;
1055 	int i;
1056 
1057 	for (i = 0; i < 3; i++)
1058 		reclaim_completed_tx(qs, 16, i);
1059 }
1060 
1061 static void
1062 sge_timer_reclaim(void *arg, int ncount)
1063 {
1064 	struct port_info *pi = arg;
1065 	int i, nqsets = pi->nqsets;
1066 	adapter_t *sc = pi->adapter;
1067 	struct sge_qset *qs;
1068 	struct mtx *lock;
1069 
1070 	KASSERT((sc->flags & USING_MSIX) == 0,
1071 	    ("can't call timer reclaim for msi-x"));
1072 
1073 	for (i = 0; i < nqsets; i++) {
1074 		qs = &sc->sge.qs[pi->first_qset + i];
1075 
1076 		reclaim_completed_tx(qs, 16, TXQ_OFLD);
1077 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
1078 			    &sc->sge.qs[0].rspq.lock;
1079 
1080 		if (mtx_trylock(lock)) {
1081 			/* XXX currently assume that we are *NOT* polling */
1082 			uint32_t status = t3_read_reg(sc, A_SG_RSPQ_FL_STATUS);
1083 
1084 			if (qs->fl[0].credits < qs->fl[0].size - 16)
1085 				__refill_fl(sc, &qs->fl[0]);
1086 			if (qs->fl[1].credits < qs->fl[1].size - 16)
1087 				__refill_fl(sc, &qs->fl[1]);
1088 
1089 			if (status & (1 << qs->rspq.cntxt_id)) {
1090 				if (qs->rspq.credits) {
1091 					refill_rspq(sc, &qs->rspq, 1);
1092 					qs->rspq.credits--;
1093 					t3_write_reg(sc, A_SG_RSPQ_FL_STATUS,
1094 					    1 << qs->rspq.cntxt_id);
1095 				}
1096 			}
1097 			mtx_unlock(lock);
1098 		}
1099 	}
1100 }
1101 
1102 /**
1103  *	init_qset_cntxt - initialize an SGE queue set context info
1104  *	@qs: the queue set
1105  *	@id: the queue set id
1106  *
1107  *	Initializes the TIDs and context ids for the queues of a queue set.
1108  */
1109 static void
1110 init_qset_cntxt(struct sge_qset *qs, u_int id)
1111 {
1112 
1113 	qs->rspq.cntxt_id = id;
1114 	qs->fl[0].cntxt_id = 2 * id;
1115 	qs->fl[1].cntxt_id = 2 * id + 1;
1116 	qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id;
1117 	qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id;
1118 	qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id;
1119 	qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id;
1120 	qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id;
1121 
1122 	/* XXX: a sane limit is needed instead of INT_MAX */
1123 	mbufq_init(&qs->txq[TXQ_ETH].sendq, INT_MAX);
1124 	mbufq_init(&qs->txq[TXQ_OFLD].sendq, INT_MAX);
1125 	mbufq_init(&qs->txq[TXQ_CTRL].sendq, INT_MAX);
1126 }
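
/*
 * Illustration only: for queue set id 1 this yields response queue context
 * 1, free-list contexts 2 and 3, and egress contexts at fixed
 * firmware-defined offsets, e.g. FW_TUNNEL_SGEEC_START + 1 for the Ethernet
 * Tx queue.  The actual base values come from the firmware headers.
 */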
1127 
1128 
1129 static void
1130 txq_prod(struct sge_txq *txq, unsigned int ndesc, struct txq_state *txqs)
1131 {
1132 	txq->in_use += ndesc;
1133 	/*
1134 	 * XXX we don't handle stopping of the queue;
1135 	 * presumably start handles this when we bump against the end
1136 	 */
1137 	txqs->gen = txq->gen;
1138 	txq->unacked += ndesc;
1139 	txqs->compl = (txq->unacked & 32) << (S_WR_COMPL - 5);
1140 	txq->unacked &= 31;
1141 	txqs->pidx = txq->pidx;
1142 	txq->pidx += ndesc;
1143 #ifdef INVARIANTS
1144 	if (((txqs->pidx > txq->cidx) &&
1145 		(txq->pidx < txqs->pidx) &&
1146 		(txq->pidx >= txq->cidx)) ||
1147 	    ((txqs->pidx < txq->cidx) &&
1148 		(txq->pidx >= txq-> cidx)) ||
1149 	    ((txqs->pidx < txq->cidx) &&
1150 		(txq->cidx < txqs->pidx)))
1151 		panic("txqs->pidx=%d txq->pidx=%d txq->cidx=%d",
1152 		    txqs->pidx, txq->pidx, txq->cidx);
1153 #endif
1154 	if (txq->pidx >= txq->size) {
1155 		txq->pidx -= txq->size;
1156 		txq->gen ^= 1;
1157 	}
1158 
1159 }
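
/*
 * Illustration only: txqs->compl latches a completion request roughly every
 * 32 descriptors -- (unacked & 32) is shifted into the WR_COMPL bit and
 * unacked is then masked back to 0-31 -- so the hardware reports Tx
 * completions in batches rather than per packet.
 */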
1160 
1161 /**
1162  *	calc_tx_descs - calculate the number of Tx descriptors for a packet
1163  *	@m: the packet mbufs
1164  *      @nsegs: the number of segments
1165  *
1166  * 	Returns the number of Tx descriptors needed for the given Ethernet
1167  * 	packet.  Ethernet packets require addition of WR and CPL headers.
1168  */
1169 static __inline unsigned int
1170 calc_tx_descs(const struct mbuf *m, int nsegs)
1171 {
1172 	unsigned int flits;
1173 
1174 	if (m->m_pkthdr.len <= PIO_LEN)
1175 		return 1;
1176 
1177 	flits = sgl_len(nsegs) + 2;
1178 	if (m->m_pkthdr.csum_flags & CSUM_TSO)
1179 		flits++;
1180 
1181 	return flits_to_desc(flits);
1182 }
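
/*
 * Worked example (illustration only): a non-TSO packet larger than PIO_LEN
 * that maps to 3 DMA segments needs sgl_len(3) + 2 = 7 flits for the
 * WR/CPL headers and the SGL, and flit_desc_map[7] == 1, so it fits in a
 * single Tx descriptor.
 */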
1183 
1184 /**
1185  *	make_sgl - populate a scatter/gather list for a packet
1186  *	@sgp: the SGL to populate
1187  *	@segs: the packet dma segments
1188  *	@nsegs: the number of segments
1189  *
1190  *	Generates a scatter/gather list for the buffers that make up a packet.
1191  *	The caller must size the SGL appropriately; see sgl_len().
1193  */
1194 static __inline void
1195 make_sgl(struct sg_ent *sgp, bus_dma_segment_t *segs, int nsegs)
1196 {
1197 	int i, idx;
1198 
1199 	for (idx = 0, i = 0; i < nsegs; i++) {
1200 		/*
1201 		 * firmware doesn't like empty segments
1202 		 */
1203 		if (segs[i].ds_len == 0)
1204 			continue;
1205 		if (i && idx == 0)
1206 			++sgp;
1207 
1208 		sgp->len[idx] = htobe32(segs[i].ds_len);
1209 		sgp->addr[idx] = htobe64(segs[i].ds_addr);
1210 		idx ^= 1;
1211 	}
1212 
1213 	if (idx) {
1214 		sgp->len[idx] = 0;
1215 		sgp->addr[idx] = 0;
1216 	}
1217 }
1218 
1219 /**
1220  *	check_ring_tx_db - check and potentially ring a Tx queue's doorbell
1221  *	@adap: the adapter
1222  *	@q: the Tx queue
1223  *
1224  *	Ring the doorbell if a Tx queue is asleep.  There is a natural race,
1225  *	where the HW is going to sleep just after we checked, however,
1226  *	then the interrupt handler will detect the outstanding TX packet
1227  *	and ring the doorbell for us.
1228  *
1229  *	When GTS is disabled we unconditionally ring the doorbell.
1230  */
1231 static __inline void
1232 check_ring_tx_db(adapter_t *adap, struct sge_txq *q, int mustring)
1233 {
1234 #if USE_GTS
1235 	clear_bit(TXQ_LAST_PKT_DB, &q->flags);
1236 	if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) {
1237 		set_bit(TXQ_LAST_PKT_DB, &q->flags);
1238 #ifdef T3_TRACE
1239 		T3_TRACE1(adap->tb[q->cntxt_id & 7], "doorbell Tx, cntxt %d",
1240 			  q->cntxt_id);
1241 #endif
1242 		t3_write_reg(adap, A_SG_KDOORBELL,
1243 			     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1244 	}
1245 #else
1246 	if (mustring || ++q->db_pending >= 32) {
1247 		wmb();            /* write descriptors before telling HW */
1248 		t3_write_reg(adap, A_SG_KDOORBELL,
1249 		    F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1250 		q->db_pending = 0;
1251 	}
1252 #endif
1253 }
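
/*
 * Usage sketch (illustration only): fast-path callers pass mustring == 0 so
 * the doorbell write is batched until 32 descriptors are pending, while the
 * flush paths (e.g. cxgb_start_locked() and cxgb_debugnet_encap()) pass
 * mustring == 1 to push out whatever is queued immediately:
 *
 *	check_ring_tx_db(sc, txq, 0);	// batched, from t3_encap()
 *	check_ring_tx_db(sc, txq, 1);	// forced, when draining the queue
 */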
1254 
1255 static __inline void
1256 wr_gen2(struct tx_desc *d, unsigned int gen)
1257 {
1258 #if SGE_NUM_GENBITS == 2
1259 	d->flit[TX_DESC_FLITS - 1] = htobe64(gen);
1260 #endif
1261 }
1262 
1263 /**
1264  *	write_wr_hdr_sgl - write a WR header and, optionally, SGL
1265  *	@ndesc: number of Tx descriptors spanned by the SGL
1266  *	@txd: first Tx descriptor to be written
1267  *	@txqs: txq state (generation and producer index)
1268  *	@txq: the SGE Tx queue
1269  *	@sgl: the SGL
1270  *	@flits: number of flits to the start of the SGL in the first descriptor
1271  *	@sgl_flits: the SGL size in flits
1272  *	@wr_hi: top 32 bits of WR header based on WR type (big endian)
1273  *	@wr_lo: low 32 bits of WR header based on WR type (big endian)
1274  *
1275  *	Write a work request header and an associated SGL.  If the SGL is
1276  *	small enough to fit into one Tx descriptor it has already been written
1277  *	and we just need to write the WR header.  Otherwise we distribute the
1278  *	SGL across the number of descriptors it spans.
1279  */
1280 static void
1281 write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs,
1282     const struct sge_txq *txq, const struct sg_ent *sgl, unsigned int flits,
1283     unsigned int sgl_flits, unsigned int wr_hi, unsigned int wr_lo)
1284 {
1285 
1286 	struct work_request_hdr *wrp = (struct work_request_hdr *)txd;
1287 	struct tx_sw_desc *txsd = &txq->sdesc[txqs->pidx];
1288 
1289 	if (__predict_true(ndesc == 1)) {
1290 		set_wr_hdr(wrp, htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1291 		    V_WR_SGLSFLT(flits)) | wr_hi,
1292 		    htonl(V_WR_LEN(flits + sgl_flits) | V_WR_GEN(txqs->gen)) |
1293 		    wr_lo);
1294 
1295 		wr_gen2(txd, txqs->gen);
1296 
1297 	} else {
1298 		unsigned int ogen = txqs->gen;
1299 		const uint64_t *fp = (const uint64_t *)sgl;
1300 		struct work_request_hdr *wp = wrp;
1301 
1302 		wrp->wrh_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) |
1303 		    V_WR_SGLSFLT(flits)) | wr_hi;
1304 
1305 		while (sgl_flits) {
1306 			unsigned int avail = WR_FLITS - flits;
1307 
1308 			if (avail > sgl_flits)
1309 				avail = sgl_flits;
1310 			memcpy(&txd->flit[flits], fp, avail * sizeof(*fp));
1311 			sgl_flits -= avail;
1312 			ndesc--;
1313 			if (!sgl_flits)
1314 				break;
1315 
1316 			fp += avail;
1317 			txd++;
1318 			txsd++;
1319 			if (++txqs->pidx == txq->size) {
1320 				txqs->pidx = 0;
1321 				txqs->gen ^= 1;
1322 				txd = txq->desc;
1323 				txsd = txq->sdesc;
1324 			}
1325 
1326 			/*
1327 			 * when the head of the mbuf chain
1328 			 * is freed all clusters will be freed
1329 			 * with it
1330 			 */
1331 			wrp = (struct work_request_hdr *)txd;
1332 			wrp->wrh_hi = htonl(V_WR_DATATYPE(1) |
1333 			    V_WR_SGLSFLT(1)) | wr_hi;
1334 			wrp->wrh_lo = htonl(V_WR_LEN(min(WR_FLITS,
1335 				    sgl_flits + 1)) |
1336 			    V_WR_GEN(txqs->gen)) | wr_lo;
1337 			wr_gen2(txd, txqs->gen);
1338 			flits = 1;
1339 		}
1340 		wrp->wrh_hi |= htonl(F_WR_EOP);
1341 		wmb();
1342 		wp->wrh_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo;
1343 		wr_gen2((struct tx_desc *)wp, ogen);
1344 	}
1345 }
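
/*
 * Illustration only: for an SGL spanning two descriptors the first
 * descriptor carries the SOP half of the WR header plus as many SGL flits
 * as fit, and the continuation descriptor restarts at flit 1 with its own
 * header.  Only after all flits are copied, and after a write barrier, is
 * the original header's generation/length word (wp->wrh_lo) written, so the
 * SGE never processes a partially written multi-descriptor WR.
 */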
1346 
1347 /* sizeof(*eh) + sizeof(*ip) + sizeof(*tcp) */
1348 #define TCPPKTHDRSIZE (ETHER_HDR_LEN + 20 + 20)
1349 
1350 #define GET_VTAG(cntrl, m) \
1351 do { \
1352 	if ((m)->m_flags & M_VLANTAG)					            \
1353 		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN((m)->m_pkthdr.ether_vtag); \
1354 } while (0)
1355 
1356 static int
1357 t3_encap(struct sge_qset *qs, struct mbuf **m)
1358 {
1359 	adapter_t *sc;
1360 	struct mbuf *m0;
1361 	struct sge_txq *txq;
1362 	struct txq_state txqs;
1363 	struct port_info *pi;
1364 	unsigned int ndesc, flits, cntrl, mlen;
1365 	int err, nsegs, tso_info = 0;
1366 
1367 	struct work_request_hdr *wrp;
1368 	struct tx_sw_desc *txsd;
1369 	struct sg_ent *sgp, *sgl;
1370 	uint32_t wr_hi, wr_lo, sgl_flits;
1371 	bus_dma_segment_t segs[TX_MAX_SEGS];
1372 
1373 	struct tx_desc *txd;
1374 
1375 	pi = qs->port;
1376 	sc = pi->adapter;
1377 	txq = &qs->txq[TXQ_ETH];
1378 	txd = &txq->desc[txq->pidx];
1379 	txsd = &txq->sdesc[txq->pidx];
1380 	sgl = txq->txq_sgl;
1381 
1382 	prefetch(txd);
1383 	m0 = *m;
1384 
1385 	mtx_assert(&qs->lock, MA_OWNED);
1386 	cntrl = V_TXPKT_INTF(pi->txpkt_intf);
1387 	KASSERT(m0->m_flags & M_PKTHDR, ("not packet header\n"));
1388 
1389 	if  (m0->m_nextpkt == NULL && m0->m_next != NULL &&
1390 	    m0->m_pkthdr.csum_flags & (CSUM_TSO))
1391 		tso_info = V_LSO_MSS(m0->m_pkthdr.tso_segsz);
1392 
1393 	if (m0->m_nextpkt != NULL) {
1394 		busdma_map_sg_vec(txq->entry_tag, txsd->map, m0, segs, &nsegs);
1395 		ndesc = 1;
1396 		mlen = 0;
1397 	} else {
1398 		if ((err = busdma_map_sg_collapse(txq->entry_tag, txsd->map,
1399 		    &m0, segs, &nsegs))) {
1400 			if (cxgb_debug)
1401 				printf("failed ... err=%d\n", err);
1402 			return (err);
1403 		}
1404 		mlen = m0->m_pkthdr.len;
1405 		ndesc = calc_tx_descs(m0, nsegs);
1406 	}
1407 	txq_prod(txq, ndesc, &txqs);
1408 
1409 	KASSERT(m0->m_pkthdr.len, ("empty packet nsegs=%d", nsegs));
1410 	txsd->m = m0;
1411 
1412 	if (m0->m_nextpkt != NULL) {
1413 		struct cpl_tx_pkt_batch *cpl_batch = (struct cpl_tx_pkt_batch *)txd;
1414 		int i, fidx;
1415 
1416 		if (nsegs > 7)
1417 			panic("trying to coalesce %d packets into one WR", nsegs);
1418 		txq->txq_coalesced += nsegs;
1419 		wrp = (struct work_request_hdr *)txd;
1420 		flits = nsegs*2 + 1;
1421 
1422 		for (fidx = 1, i = 0; i < nsegs; i++, fidx += 2) {
1423 			struct cpl_tx_pkt_batch_entry *cbe;
1424 			uint64_t flit;
1425 			uint32_t *hflit = (uint32_t *)&flit;
1426 			int cflags = m0->m_pkthdr.csum_flags;
1427 
1428 			cntrl = V_TXPKT_INTF(pi->txpkt_intf);
1429 			GET_VTAG(cntrl, m0);
1430 			cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1431 			if (__predict_false(!(cflags & CSUM_IP)))
1432 				cntrl |= F_TXPKT_IPCSUM_DIS;
1433 			if (__predict_false(!(cflags & (CSUM_TCP | CSUM_UDP |
1434 			    CSUM_UDP_IPV6 | CSUM_TCP_IPV6))))
1435 				cntrl |= F_TXPKT_L4CSUM_DIS;
1436 
1437 			hflit[0] = htonl(cntrl);
1438 			hflit[1] = htonl(segs[i].ds_len | 0x80000000);
1439 			flit |= htobe64(1 << 24);
1440 			cbe = &cpl_batch->pkt_entry[i];
1441 			cbe->cntrl = hflit[0];
1442 			cbe->len = hflit[1];
1443 			cbe->addr = htobe64(segs[i].ds_addr);
1444 		}
1445 
1446 		wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1447 		    V_WR_SGLSFLT(flits)) |
1448 		    htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1449 		wr_lo = htonl(V_WR_LEN(flits) |
1450 		    V_WR_GEN(txqs.gen)) | htonl(V_WR_TID(txq->token));
1451 		set_wr_hdr(wrp, wr_hi, wr_lo);
1452 		wmb();
1453 		ETHER_BPF_MTAP(pi->ifp, m0);
1454 		wr_gen2(txd, txqs.gen);
1455 		check_ring_tx_db(sc, txq, 0);
1456 		return (0);
1457 	} else if (tso_info) {
1458 		uint16_t eth_type;
1459 		struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *)txd;
1460 		struct ether_header *eh;
1461 		void *l3hdr;
1462 		struct tcphdr *tcp;
1463 
1464 		txd->flit[2] = 0;
1465 		GET_VTAG(cntrl, m0);
1466 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO);
1467 		hdr->cntrl = htonl(cntrl);
1468 		hdr->len = htonl(mlen | 0x80000000);
1469 
1470 		if (__predict_false(mlen < TCPPKTHDRSIZE)) {
1471 			printf("mbuf=%p,len=%d,tso_segsz=%d,csum_flags=%b,flags=%#x",
1472 			    m0, mlen, m0->m_pkthdr.tso_segsz,
1473 			    (int)m0->m_pkthdr.csum_flags, CSUM_BITS, m0->m_flags);
1474 			panic("tx tso packet too small");
1475 		}
1476 
1477 		/* Make sure that ether, ip, tcp headers are all in m0 */
1478 		if (__predict_false(m0->m_len < TCPPKTHDRSIZE)) {
1479 			m0 = m_pullup(m0, TCPPKTHDRSIZE);
1480 			if (__predict_false(m0 == NULL)) {
1481 				/* XXX panic probably an overreaction */
1482 				panic("couldn't fit header into mbuf");
1483 			}
1484 		}
1485 
1486 		eh = mtod(m0, struct ether_header *);
1487 		eth_type = eh->ether_type;
1488 		if (eth_type == htons(ETHERTYPE_VLAN)) {
1489 			struct ether_vlan_header *evh = (void *)eh;
1490 
1491 			tso_info |= V_LSO_ETH_TYPE(CPL_ETH_II_VLAN);
1492 			l3hdr = evh + 1;
1493 			eth_type = evh->evl_proto;
1494 		} else {
1495 			tso_info |= V_LSO_ETH_TYPE(CPL_ETH_II);
1496 			l3hdr = eh + 1;
1497 		}
1498 
1499 		if (eth_type == htons(ETHERTYPE_IP)) {
1500 			struct ip *ip = l3hdr;
1501 
1502 			tso_info |= V_LSO_IPHDR_WORDS(ip->ip_hl);
1503 			tcp = (struct tcphdr *)(ip + 1);
1504 		} else if (eth_type == htons(ETHERTYPE_IPV6)) {
1505 			struct ip6_hdr *ip6 = l3hdr;
1506 
1507 			KASSERT(ip6->ip6_nxt == IPPROTO_TCP,
1508 			    ("%s: CSUM_TSO with ip6_nxt %d",
1509 			    __func__, ip6->ip6_nxt));
1510 
1511 			tso_info |= F_LSO_IPV6;
1512 			tso_info |= V_LSO_IPHDR_WORDS(sizeof(*ip6) >> 2);
1513 			tcp = (struct tcphdr *)(ip6 + 1);
1514 		} else
1515 			panic("%s: CSUM_TSO but neither ip nor ip6", __func__);
1516 
1517 		tso_info |= V_LSO_TCPHDR_WORDS(tcp->th_off);
1518 		hdr->lso_info = htonl(tso_info);
1519 
1520 		if (__predict_false(mlen <= PIO_LEN)) {
1521 			/*
1522 			 * The packet is not undersized but still fits in PIO_LEN.
1523 			 * This indicates a TSO bug at the higher levels.
1524 			 */
1525 			txsd->m = NULL;
1526 			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[3]);
1527 			flits = (mlen + 7) / 8 + 3;
1528 			wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1529 					  V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1530 					  F_WR_SOP | F_WR_EOP | txqs.compl);
1531 			wr_lo = htonl(V_WR_LEN(flits) |
1532 			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1533 			set_wr_hdr(&hdr->wr, wr_hi, wr_lo);
1534 			wmb();
1535 			ETHER_BPF_MTAP(pi->ifp, m0);
1536 			wr_gen2(txd, txqs.gen);
1537 			check_ring_tx_db(sc, txq, 0);
1538 			m_freem(m0);
1539 			return (0);
1540 		}
1541 		flits = 3;
1542 	} else {
1543 		struct cpl_tx_pkt *cpl = (struct cpl_tx_pkt *)txd;
1544 
1545 		GET_VTAG(cntrl, m0);
1546 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1547 		if (__predict_false(!(m0->m_pkthdr.csum_flags & CSUM_IP)))
1548 			cntrl |= F_TXPKT_IPCSUM_DIS;
1549 		if (__predict_false(!(m0->m_pkthdr.csum_flags & (CSUM_TCP |
1550 		    CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6))))
1551 			cntrl |= F_TXPKT_L4CSUM_DIS;
1552 		cpl->cntrl = htonl(cntrl);
1553 		cpl->len = htonl(mlen | 0x80000000);
1554 
1555 		if (mlen <= PIO_LEN) {
1556 			txsd->m = NULL;
1557 			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[2]);
1558 			flits = (mlen + 7) / 8 + 2;
1559 
1560 			wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1561 			    V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1562 					  F_WR_SOP | F_WR_EOP | txqs.compl);
1563 			wr_lo = htonl(V_WR_LEN(flits) |
1564 			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1565 			set_wr_hdr(&cpl->wr, wr_hi, wr_lo);
1566 			wmb();
1567 			ETHER_BPF_MTAP(pi->ifp, m0);
1568 			wr_gen2(txd, txqs.gen);
1569 			check_ring_tx_db(sc, txq, 0);
1570 			m_freem(m0);
1571 			return (0);
1572 		}
1573 		flits = 2;
1574 	}
1575 	wrp = (struct work_request_hdr *)txd;
1576 	sgp = (ndesc == 1) ? (struct sg_ent *)&txd->flit[flits] : sgl;
1577 	make_sgl(sgp, segs, nsegs);
1578 
1579 	sgl_flits = sgl_len(nsegs);
1580 
1581 	ETHER_BPF_MTAP(pi->ifp, m0);
1582 
1583 	KASSERT(ndesc <= 4, ("ndesc too large %d", ndesc));
1584 	wr_hi = htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1585 	wr_lo = htonl(V_WR_TID(txq->token));
1586 	write_wr_hdr_sgl(ndesc, txd, &txqs, txq, sgl, flits,
1587 	    sgl_flits, wr_hi, wr_lo);
1588 	check_ring_tx_db(sc, txq, 0);
1589 
1590 	return (0);
1591 }
1592 
1593 #ifdef DEBUGNET
1594 int
1595 cxgb_debugnet_encap(struct sge_qset *qs, struct mbuf **m)
1596 {
1597 	int error;
1598 
1599 	error = t3_encap(qs, m);
1600 	if (error == 0)
1601 		check_ring_tx_db(qs->port->adapter, &qs->txq[TXQ_ETH], 1);
1602 	else if (*m != NULL) {
1603 		m_freem(*m);
1604 		*m = NULL;
1605 	}
1606 	return (error);
1607 }
1608 #endif
1609 
1610 void
1611 cxgb_tx_watchdog(void *arg)
1612 {
1613 	struct sge_qset *qs = arg;
1614 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1615 
1616         if (qs->coalescing != 0 &&
1617 	    (txq->in_use <= cxgb_tx_coalesce_enable_stop) &&
1618 	    TXQ_RING_EMPTY(qs))
1619                 qs->coalescing = 0;
1620         else if (qs->coalescing == 0 &&
1621 	    (txq->in_use >= cxgb_tx_coalesce_enable_start))
1622                 qs->coalescing = 1;
1623 	if (TXQ_TRYLOCK(qs)) {
1624 		qs->qs_flags |= QS_FLUSHING;
1625 		cxgb_start_locked(qs);
1626 		qs->qs_flags &= ~QS_FLUSHING;
1627 		TXQ_UNLOCK(qs);
1628 	}
1629 	if (qs->port->ifp->if_drv_flags & IFF_DRV_RUNNING)
1630 		callout_reset_on(&txq->txq_watchdog, hz/4, cxgb_tx_watchdog,
1631 		    qs, txq->txq_watchdog.c_cpu);
1632 }
1633 
1634 static void
1635 cxgb_tx_timeout(void *arg)
1636 {
1637 	struct sge_qset *qs = arg;
1638 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1639 
1640 	if (qs->coalescing == 0 && (txq->in_use >= (txq->size>>3)))
1641                 qs->coalescing = 1;
1642 	if (TXQ_TRYLOCK(qs)) {
1643 		qs->qs_flags |= QS_TIMEOUT;
1644 		cxgb_start_locked(qs);
1645 		qs->qs_flags &= ~QS_TIMEOUT;
1646 		TXQ_UNLOCK(qs);
1647 	}
1648 }
1649 
1650 static void
1651 cxgb_start_locked(struct sge_qset *qs)
1652 {
1653 	struct mbuf *m_head = NULL;
1654 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1655 	struct port_info *pi = qs->port;
1656 	struct ifnet *ifp = pi->ifp;
1657 
1658 	if (qs->qs_flags & (QS_FLUSHING|QS_TIMEOUT))
1659 		reclaim_completed_tx(qs, 0, TXQ_ETH);
1660 
1661 	if (!pi->link_config.link_ok) {
1662 		TXQ_RING_FLUSH(qs);
1663 		return;
1664 	}
1665 	TXQ_LOCK_ASSERT(qs);
1666 	while (!TXQ_RING_EMPTY(qs) && (ifp->if_drv_flags & IFF_DRV_RUNNING) &&
1667 	    pi->link_config.link_ok) {
1668 		reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH);
1669 
1670 		if (txq->size - txq->in_use <= TX_MAX_DESC)
1671 			break;
1672 
1673 		if ((m_head = cxgb_dequeue(qs)) == NULL)
1674 			break;
1675 		/*
1676 		 *  Encapsulation can modify our pointer, and/or make it
1677 		 *  NULL on failure.  In that event, we can't requeue.
1678 		 */
1679 		if (t3_encap(qs, &m_head) || m_head == NULL)
1680 			break;
1681 
1682 		m_head = NULL;
1683 	}
1684 
1685 	if (txq->db_pending)
1686 		check_ring_tx_db(pi->adapter, txq, 1);
1687 
1688 	if (!TXQ_RING_EMPTY(qs) && callout_pending(&txq->txq_timer) == 0 &&
1689 	    pi->link_config.link_ok)
1690 		callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout,
1691 		    qs, txq->txq_timer.c_cpu);
1692 	if (m_head != NULL)
1693 		m_freem(m_head);
1694 }
1695 
1696 static int
1697 cxgb_transmit_locked(struct ifnet *ifp, struct sge_qset *qs, struct mbuf *m)
1698 {
1699 	struct port_info *pi = qs->port;
1700 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1701 	struct buf_ring *br = txq->txq_mr;
1702 	int error, avail;
1703 
1704 	avail = txq->size - txq->in_use;
1705 	TXQ_LOCK_ASSERT(qs);
1706 
1707 	/*
1708 	 * We can only do a direct transmit if the following are true:
1709 	 * - we aren't coalescing (ring < 3/4 full)
1710 	 * - the link is up -- checked in caller
1711 	 * - there are no packets enqueued already
1712 	 * - there is space in hardware transmit queue
1713 	 */
1714 	if (check_pkt_coalesce(qs) == 0 &&
1715 	    !TXQ_RING_NEEDS_ENQUEUE(qs) && avail > TX_MAX_DESC) {
1716 		if (t3_encap(qs, &m)) {
1717 			if (m != NULL &&
1718 			    (error = drbr_enqueue(ifp, br, m)) != 0)
1719 				return (error);
1720 		} else {
1721 			if (txq->db_pending)
1722 				check_ring_tx_db(pi->adapter, txq, 1);
1723 
1724 			/*
1725 			 * We've bypassed the buf ring so we need to update
1726 			 * the stats directly
1727 			 */
1728 			txq->txq_direct_packets++;
1729 			txq->txq_direct_bytes += m->m_pkthdr.len;
1730 		}
1731 	} else if ((error = drbr_enqueue(ifp, br, m)) != 0)
1732 		return (error);
1733 
1734 	reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH);
1735 	if (!TXQ_RING_EMPTY(qs) && pi->link_config.link_ok &&
1736 	    (!check_pkt_coalesce(qs) || (drbr_inuse(ifp, br) >= 7)))
1737 		cxgb_start_locked(qs);
1738 	else if (!TXQ_RING_EMPTY(qs) && !callout_pending(&txq->txq_timer))
1739 		callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout,
1740 		    qs, txq->txq_timer.c_cpu);
1741 	return (0);
1742 }
1743 
1744 int
1745 cxgb_transmit(struct ifnet *ifp, struct mbuf *m)
1746 {
1747 	struct sge_qset *qs;
1748 	struct port_info *pi = ifp->if_softc;
1749 	int error, qidx = pi->first_qset;
1750 
1751 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0
1752 	    ||(!pi->link_config.link_ok)) {
1753 		m_freem(m);
1754 		return (0);
1755 	}
1756 
1757 	/* check if flowid is set */
1758 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
1759 		qidx = (m->m_pkthdr.flowid % pi->nqsets) + pi->first_qset;
1760 
1761 	qs = &pi->adapter->sge.qs[qidx];
1762 
1763 	if (TXQ_TRYLOCK(qs)) {
1764 		/* XXX running */
1765 		error = cxgb_transmit_locked(ifp, qs, m);
1766 		TXQ_UNLOCK(qs);
1767 	} else
1768 		error = drbr_enqueue(ifp, qs->txq[TXQ_ETH].txq_mr, m);
1769 	return (error);
1770 }
1771 
1772 void
1773 cxgb_qflush(struct ifnet *ifp)
1774 {
1775 	/*
1776 	 * This should flush any mbufs enqueued in the buf_rings
1777 	 * and in the transmit queues.
1778 	 * It is a no-op for now.
1779 	 */
1780 	return;
1781 }
1782 
1783 /**
1784  *	write_imm - write a packet into a Tx descriptor as immediate data
1785  *	@d: the Tx descriptor to write
1786  *	@m: the packet
1787  *	@len: the length of packet data to write as immediate data
1788  *	@gen: the generation bit value to write
1789  *
1790  *	Writes a packet as immediate data into a Tx descriptor.  The packet
1791  *	contains a work request at its beginning.  We must write the packet
1792  *	carefully so the SGE doesn't read accidentally before it's written in
1793  *	its entirety.
1794  */
1795 static __inline void
1796 write_imm(struct tx_desc *d, caddr_t src,
1797 	  unsigned int len, unsigned int gen)
1798 {
1799 	struct work_request_hdr *from = (struct work_request_hdr *)src;
1800 	struct work_request_hdr *to = (struct work_request_hdr *)d;
1801 	uint32_t wr_hi, wr_lo;
1802 
1803 	KASSERT(len <= WR_LEN && len >= sizeof(*from),
1804 	    ("%s: invalid len %d", __func__, len));
1805 
1806 	memcpy(&to[1], &from[1], len - sizeof(*from));
1807 	wr_hi = from->wrh_hi | htonl(F_WR_SOP | F_WR_EOP |
1808 	    V_WR_BCNTLFLT(len & 7));
1809 	wr_lo = from->wrh_lo | htonl(V_WR_GEN(gen) | V_WR_LEN((len + 7) / 8));
1810 	set_wr_hdr(to, wr_hi, wr_lo);
1811 	wmb();
1812 	wr_gen2(d, gen);
1813 }
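
/*
 * Illustrative sketch (not part of the driver): the length encoding used by
 * write_imm() above.  The low three bits of the byte count go into the
 * BCNTLFLT field and the overall work request length is expressed in 8-byte
 * flits, rounded up.  The helper name is hypothetical.
 */
#if 0
static void
example_imm_len_fields(u_int len, u_int *bcntlflt, u_int *len_in_flits)
{

	*bcntlflt = len & 7;		/* residual bytes in the last flit */
	*len_in_flits = (len + 7) / 8;	/* flit count, rounded up */
}
#endif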
1814 
1815 /**
1816  *	check_desc_avail - check descriptor availability on a send queue
1817  *	@adap: the adapter
1818  *	@q: the TX queue
1819  *	@m: the packet needing the descriptors
1820  *	@ndesc: the number of Tx descriptors needed
1821  *	@qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL)
1822  *
1823  *	Checks if the requested number of Tx descriptors is available on an
1824  *	SGE send queue.  If the queue is already suspended or not enough
1825  *	descriptors are available the packet is queued for later transmission.
1826  *	descriptors are available, the packet is queued for later transmission.
1827  *
1828  *	Returns 0 if enough descriptors are available, 1 if there aren't
1829  *	enough descriptors and the packet has been queued, and 2 if the caller
1830  *	needs to retry because there weren't enough descriptors at the
1831  *	beginning of the call but some freed up in the meantime.
1832  */
1833 static __inline int
1834 check_desc_avail(adapter_t *adap, struct sge_txq *q,
1835 		 struct mbuf *m, unsigned int ndesc,
1836 		 unsigned int qid)
1837 {
1838 	/*
1839 	 * XXX We currently only use this for checking the control queue;
1840 	 * the control queue is only used for binding qsets, which happens
1841 	 * at init time, so we are guaranteed enough descriptors.
1842 	 */
1843 	if (__predict_false(mbufq_len(&q->sendq))) {
1844 addq_exit:	(void)mbufq_enqueue(&q->sendq, m);
1845 		return 1;
1846 	}
1847 	if (__predict_false(q->size - q->in_use < ndesc)) {
1848 
1849 		struct sge_qset *qs = txq_to_qset(q, qid);
1850 
1851 		setbit(&qs->txq_stopped, qid);
1852 		if (should_restart_tx(q) &&
1853 		    test_and_clear_bit(qid, &qs->txq_stopped))
1854 			return 2;
1855 
1856 		q->stops++;
1857 		goto addq_exit;
1858 	}
1859 	return 0;
1860 }
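
/*
 * Illustrative sketch (not part of the driver): how a caller is expected to
 * act on the three return values of check_desc_avail().  ctrl_xmit() and
 * ofld_xmit() below follow this shape; the fragment is schematic and the
 * surrounding variables are assumed to exist.
 */
#if 0
	TXQ_LOCK(qs);
retry:	reclaim_completed_tx_imm(q);
	switch (check_desc_avail(adap, q, m, ndesc, qid)) {
	case 0:			/* enough descriptors, write the WR now */
		break;
	case 1:			/* packet parked on q->sendq for later */
		TXQ_UNLOCK(qs);
		return (ENOSPC);
	case 2:			/* some descriptors freed up, try again */
		goto retry;
	}
#endif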
1861 
1862 
1863 /**
1864  *	reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
1865  *	@q: the SGE control Tx queue
1866  *
1867  *	This is a variant of reclaim_completed_tx() that is used for Tx queues
1868  *	that send only immediate data (presently just the control queues) and
1869  *	thus do not have any mbufs
1870  *	thus do not have any mbufs.
1871 static __inline void
1872 reclaim_completed_tx_imm(struct sge_txq *q)
1873 {
1874 	unsigned int reclaim = q->processed - q->cleaned;
1875 
1876 	q->in_use -= reclaim;
1877 	q->cleaned += reclaim;
1878 }
1879 
1880 /**
1881  *	ctrl_xmit - send a packet through an SGE control Tx queue
1882  *	@adap: the adapter
1883  *	@q: the control queue
1884  *	@m: the packet
1885  *
1886  *	Send a packet through an SGE control Tx queue.  Packets sent through
1887  *	a control queue must fit entirely as immediate data in a single Tx
1888  *	descriptor and have no page fragments.
1889  */
1890 static int
1891 ctrl_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m)
1892 {
1893 	int ret;
1894 	struct work_request_hdr *wrp = mtod(m, struct work_request_hdr *);
1895 	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1896 
1897 	KASSERT(m->m_len <= WR_LEN, ("%s: bad tx data", __func__));
1898 
1899 	wrp->wrh_hi |= htonl(F_WR_SOP | F_WR_EOP);
1900 	wrp->wrh_lo = htonl(V_WR_TID(q->token));
1901 
1902 	TXQ_LOCK(qs);
1903 again:	reclaim_completed_tx_imm(q);
1904 
1905 	ret = check_desc_avail(adap, q, m, 1, TXQ_CTRL);
1906 	if (__predict_false(ret)) {
1907 		if (ret == 1) {
1908 			TXQ_UNLOCK(qs);
1909 			return (ENOSPC);
1910 		}
1911 		goto again;
1912 	}
1913 	write_imm(&q->desc[q->pidx], m->m_data, m->m_len, q->gen);
1914 
1915 	q->in_use++;
1916 	if (++q->pidx >= q->size) {
1917 		q->pidx = 0;
1918 		q->gen ^= 1;
1919 	}
1920 	TXQ_UNLOCK(qs);
1921 	wmb();
1922 	t3_write_reg(adap, A_SG_KDOORBELL,
1923 	    F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1924 
1925 	m_free(m);
1926 	return (0);
1927 }
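
/*
 * Illustrative sketch (not part of the driver): the producer-index and
 * generation-bit bookkeeping that ctrl_xmit() above (and ofld_xmit() below)
 * perform after writing descriptors.  Every wrap of the ring flips the
 * generation bit, which is how the SGE distinguishes freshly written
 * descriptors from stale ones.  The standalone type and names are
 * hypothetical.
 */
#if 0
struct example_txring {
	u_int	pidx;	/* producer index */
	u_int	size;	/* number of descriptors */
	u_int	gen;	/* current generation bit */
};

static void
example_txring_produce(struct example_txring *r, u_int ndesc)
{

	r->pidx += ndesc;
	if (r->pidx >= r->size) {	/* wrapped: flip the generation bit */
		r->pidx -= r->size;
		r->gen ^= 1;
	}
}
#endif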
1928 
1929 
1930 /**
1931  *	restart_ctrlq - restart a suspended control queue
1932  *	@qs: the queue set containing the control queue
1933  *
1934  *	Resumes transmission on a suspended Tx control queue.
1935  */
1936 static void
1937 restart_ctrlq(void *data, int npending)
1938 {
1939 	struct mbuf *m;
1940 	struct sge_qset *qs = (struct sge_qset *)data;
1941 	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1942 	adapter_t *adap = qs->port->adapter;
1943 
1944 	TXQ_LOCK(qs);
1945 again:	reclaim_completed_tx_imm(q);
1946 
1947 	while (q->in_use < q->size &&
1948 	       (m = mbufq_dequeue(&q->sendq)) != NULL) {
1949 
1950 		write_imm(&q->desc[q->pidx], m->m_data, m->m_len, q->gen);
1951 		m_free(m);
1952 
1953 		if (++q->pidx >= q->size) {
1954 			q->pidx = 0;
1955 			q->gen ^= 1;
1956 		}
1957 		q->in_use++;
1958 	}
1959 	if (mbufq_len(&q->sendq)) {
1960 		setbit(&qs->txq_stopped, TXQ_CTRL);
1961 
1962 		if (should_restart_tx(q) &&
1963 		    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped))
1964 			goto again;
1965 		q->stops++;
1966 	}
1967 	TXQ_UNLOCK(qs);
1968 	t3_write_reg(adap, A_SG_KDOORBELL,
1969 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1970 }
1971 
1972 
1973 /*
1974  * Send a management message through control queue 0
1975  */
1976 int
1977 t3_mgmt_tx(struct adapter *adap, struct mbuf *m)
1978 {
1979 	return ctrl_xmit(adap, &adap->sge.qs[0], m);
1980 }
1981 
1982 /**
1983  *	free_qset - free the resources of an SGE queue set
1984  *	@sc: the controller owning the queue set
1985  *	@q: the queue set
1986  *
1987  *	Release the HW and SW resources associated with an SGE queue set, such
1988  *	as HW contexts, packet buffers, and descriptor rings.  Traffic to the
1989  *	queue set must be quiesced prior to calling this.
1990  */
1991 static void
1992 t3_free_qset(adapter_t *sc, struct sge_qset *q)
1993 {
1994 	int i;
1995 
1996 	reclaim_completed_tx(q, 0, TXQ_ETH);
1997 	if (q->txq[TXQ_ETH].txq_mr != NULL)
1998 		buf_ring_free(q->txq[TXQ_ETH].txq_mr, M_DEVBUF);
1999 	if (q->txq[TXQ_ETH].txq_ifq != NULL) {
2000 		ifq_delete(q->txq[TXQ_ETH].txq_ifq);
2001 		free(q->txq[TXQ_ETH].txq_ifq, M_DEVBUF);
2002 	}
2003 
2004 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
2005 		if (q->fl[i].desc) {
2006 			mtx_lock_spin(&sc->sge.reg_lock);
2007 			t3_sge_disable_fl(sc, q->fl[i].cntxt_id);
2008 			mtx_unlock_spin(&sc->sge.reg_lock);
2009 			bus_dmamap_unload(q->fl[i].desc_tag, q->fl[i].desc_map);
2010 			bus_dmamem_free(q->fl[i].desc_tag, q->fl[i].desc,
2011 					q->fl[i].desc_map);
2012 			bus_dma_tag_destroy(q->fl[i].desc_tag);
2013 			bus_dma_tag_destroy(q->fl[i].entry_tag);
2014 		}
2015 		if (q->fl[i].sdesc) {
2016 			free_rx_bufs(sc, &q->fl[i]);
2017 			free(q->fl[i].sdesc, M_DEVBUF);
2018 		}
2019 	}
2020 
2021 	mtx_unlock(&q->lock);
2022 	MTX_DESTROY(&q->lock);
2023 	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
2024 		if (q->txq[i].desc) {
2025 			mtx_lock_spin(&sc->sge.reg_lock);
2026 			t3_sge_enable_ecntxt(sc, q->txq[i].cntxt_id, 0);
2027 			mtx_unlock_spin(&sc->sge.reg_lock);
2028 			bus_dmamap_unload(q->txq[i].desc_tag,
2029 					q->txq[i].desc_map);
2030 			bus_dmamem_free(q->txq[i].desc_tag, q->txq[i].desc,
2031 					q->txq[i].desc_map);
2032 			bus_dma_tag_destroy(q->txq[i].desc_tag);
2033 			bus_dma_tag_destroy(q->txq[i].entry_tag);
2034 		}
2035 		if (q->txq[i].sdesc) {
2036 			free(q->txq[i].sdesc, M_DEVBUF);
2037 		}
2038 	}
2039 
2040 	if (q->rspq.desc) {
2041 		mtx_lock_spin(&sc->sge.reg_lock);
2042 		t3_sge_disable_rspcntxt(sc, q->rspq.cntxt_id);
2043 		mtx_unlock_spin(&sc->sge.reg_lock);
2044 
2045 		bus_dmamap_unload(q->rspq.desc_tag, q->rspq.desc_map);
2046 		bus_dmamem_free(q->rspq.desc_tag, q->rspq.desc,
2047 			        q->rspq.desc_map);
2048 		bus_dma_tag_destroy(q->rspq.desc_tag);
2049 		MTX_DESTROY(&q->rspq.lock);
2050 	}
2051 
2052 #if defined(INET6) || defined(INET)
2053 	tcp_lro_free(&q->lro.ctrl);
2054 #endif
2055 
2056 	bzero(q, sizeof(*q));
2057 }
2058 
2059 /**
2060  *	t3_free_sge_resources - free SGE resources
2061  *	@sc: the adapter softc
2062  *
2063  *	Frees resources used by the SGE queue sets.
2064  */
2065 void
2066 t3_free_sge_resources(adapter_t *sc, int nqsets)
2067 {
2068 	int i;
2069 
2070 	for (i = 0; i < nqsets; ++i) {
2071 		TXQ_LOCK(&sc->sge.qs[i]);
2072 		t3_free_qset(sc, &sc->sge.qs[i]);
2073 	}
2074 }
2075 
2076 /**
2077  *	t3_sge_start - enable SGE
2078  *	@sc: the controller softc
2079  *
2080  *	Enables the SGE for DMAs.  This is the last step in starting packet
2081  *	transfers.
2082  */
2083 void
2084 t3_sge_start(adapter_t *sc)
2085 {
2086 	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE);
2087 }
2088 
2089 /**
2090  *	t3_sge_stop - disable SGE operation
2091  *	@sc: the adapter
2092  *
2093  *	Disables the DMA engine.  This can be called in emergencies (e.g.,
2094  *	from error interrupts) or from normal process context.  In the latter
2095  *	case it also disables any pending queue restart tasklets.  Note that
2096  *	if it is called in interrupt context it cannot disable the restart
2097  *	tasklets as it cannot wait; however, the tasklets will have no effect
2098  *	since the doorbells are disabled and the driver will call this again
2099  *	later from process context, at which time the tasklets will be stopped
2100  *	if they are still running.
2101  */
2102 void
2103 t3_sge_stop(adapter_t *sc)
2104 {
2105 	int i, nqsets;
2106 
2107 	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, 0);
2108 
2109 	if (sc->tq == NULL)
2110 		return;
2111 
2112 	for (nqsets = i = 0; i < (sc)->params.nports; i++)
2113 		nqsets += sc->port[i].nqsets;
2114 #ifdef notyet
2115 	/*
2116 	 *
2117 	 * XXX
2118 	 */
2119 	for (i = 0; i < nqsets; ++i) {
2120 		struct sge_qset *qs = &sc->sge.qs[i];
2121 
2122 		taskqueue_drain(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2123 		taskqueue_drain(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2124 	}
2125 #endif
2126 }
2127 
2128 /**
2129  *	t3_free_tx_desc - reclaims Tx descriptors and their buffers
2130  *	@qs: the queue set containing the Tx queue
2131  *	@reclaimable: the number of descriptors to reclaim
2132  *	@queue: the index of the Tx queue within the queue set
2133  *	    (TXQ_ETH, TXQ_OFLD, or TXQ_CTRL)
2134  *
2135  *	Reclaims Tx descriptors from an SGE Tx queue and frees the
2136  *	associated Tx buffers.  Called with the Tx queue lock held.
2137  *	Descriptors without an attached mbuf are counted in txq_skipped,
2138  *	and the queue's consumer index is advanced past everything
2139  *	reclaimed.
2140  */
2141 void
2142 t3_free_tx_desc(struct sge_qset *qs, int reclaimable, int queue)
2143 {
2144 	struct tx_sw_desc *txsd;
2145 	unsigned int cidx, mask;
2146 	struct sge_txq *q = &qs->txq[queue];
2147 
2148 #ifdef T3_TRACE
2149 	T3_TRACE2(sc->tb[q->cntxt_id & 7],
2150 		  "reclaiming %u Tx descriptors at cidx %u", reclaimable, cidx);
2151 #endif
2152 	cidx = q->cidx;
2153 	mask = q->size - 1;
2154 	txsd = &q->sdesc[cidx];
2155 
2156 	mtx_assert(&qs->lock, MA_OWNED);
2157 	while (reclaimable--) {
2158 		prefetch(q->sdesc[(cidx + 1) & mask].m);
2159 		prefetch(q->sdesc[(cidx + 2) & mask].m);
2160 
2161 		if (txsd->m != NULL) {
2162 			if (txsd->flags & TX_SW_DESC_MAPPED) {
2163 				bus_dmamap_unload(q->entry_tag, txsd->map);
2164 				txsd->flags &= ~TX_SW_DESC_MAPPED;
2165 			}
2166 			m_freem_list(txsd->m);
2167 			txsd->m = NULL;
2168 		} else
2169 			q->txq_skipped++;
2170 
2171 		++txsd;
2172 		if (++cidx == q->size) {
2173 			cidx = 0;
2174 			txsd = q->sdesc;
2175 		}
2176 	}
2177 	q->cidx = cidx;
2178 
2179 }
2180 
2181 /**
2182  *	is_new_response - check if a response is newly written
2183  *	@r: the response descriptor
2184  *	@q: the response queue
2185  *
2186  *	Returns true if a response descriptor contains a yet unprocessed
2187  *	response.
2188  */
2189 static __inline int
2190 is_new_response(const struct rsp_desc *r,
2191     const struct sge_rspq *q)
2192 {
2193 	return (r->intr_gen & F_RSPD_GEN2) == q->gen;
2194 }
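
/*
 * Illustrative sketch (not part of the driver): the consumer side of the
 * generation-bit protocol tested by is_new_response() above.  The consumer
 * keeps its own expected generation and flips it whenever its index wraps,
 * exactly as process_responses() does below.  The helper name is
 * hypothetical.
 */
#if 0
static void
example_rspq_advance(struct sge_rspq *q)
{

	if (++q->cidx == q->size) {
		q->cidx = 0;
		q->gen ^= 1;	/* next lap expects the opposite gen value */
	}
}
#endif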
2195 
2196 #define RSPD_GTS_MASK  (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS)
2197 #define RSPD_CTRL_MASK (RSPD_GTS_MASK | \
2198 			V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \
2199 			V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \
2200 			V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR))
2201 
2202 /* How long to delay the next interrupt in case of memory shortage, in 0.1us. */
2203 #define NOMEM_INTR_DELAY 2500
2204 
2205 #ifdef TCP_OFFLOAD
2206 /**
2207  *	write_ofld_wr - write an offload work request
2208  *	@adap: the adapter
2209  *	@m: the packet to send
2210  *	@q: the Tx queue
2211  *	@pidx: index of the first Tx descriptor to write
2212  *	@gen: the generation value to use
2213  *	@ndesc: number of descriptors the packet will occupy
2214  *
2215  *	Write an offload work request to send the supplied packet.  The packet
2216  *	data already carry the work request with most fields populated.
2217  */
2218 static void
2219 write_ofld_wr(adapter_t *adap, struct mbuf *m, struct sge_txq *q,
2220     unsigned int pidx, unsigned int gen, unsigned int ndesc)
2221 {
2222 	unsigned int sgl_flits, flits;
2223 	int i, idx, nsegs, wrlen;
2224 	struct work_request_hdr *from;
2225 	struct sg_ent *sgp, t3sgl[TX_MAX_SEGS / 2 + 1];
2226 	struct tx_desc *d = &q->desc[pidx];
2227 	struct txq_state txqs;
2228 	struct sglist_seg *segs;
2229 	struct ofld_hdr *oh = mtod(m, struct ofld_hdr *);
2230 	struct sglist *sgl;
2231 
2232 	from = (void *)(oh + 1);	/* Start of WR within mbuf */
2233 	wrlen = m->m_len - sizeof(*oh);
2234 
2235 	if (!(oh->flags & F_HDR_SGL)) {
2236 		write_imm(d, (caddr_t)from, wrlen, gen);
2237 
2238 		/*
2239 		 * mbuf with "real" immediate tx data will be enqueue_wr'd by
2240 		 * t3_push_frames and freed in wr_ack.  Others, like those sent
2241 		 * down by close_conn, t3_send_reset, etc. should be freed here.
2242 		 */
2243 		if (!(oh->flags & F_HDR_DF))
2244 			m_free(m);
2245 		return;
2246 	}
2247 
2248 	memcpy(&d->flit[1], &from[1], wrlen - sizeof(*from));
2249 
2250 	sgl = oh->sgl;
2251 	flits = wrlen / 8;
2252 	sgp = (ndesc == 1) ? (struct sg_ent *)&d->flit[flits] : t3sgl;
2253 
2254 	nsegs = sgl->sg_nseg;
2255 	segs = sgl->sg_segs;
2256 	for (idx = 0, i = 0; i < nsegs; i++) {
2257 		KASSERT(segs[i].ss_len, ("%s: 0 len in sgl", __func__));
2258 		if (i && idx == 0)
2259 			++sgp;
2260 		sgp->len[idx] = htobe32(segs[i].ss_len);
2261 		sgp->addr[idx] = htobe64(segs[i].ss_paddr);
2262 		idx ^= 1;
2263 	}
2264 	if (idx) {
2265 		sgp->len[idx] = 0;
2266 		sgp->addr[idx] = 0;
2267 	}
2268 
2269 	sgl_flits = sgl_len(nsegs);
2270 	txqs.gen = gen;
2271 	txqs.pidx = pidx;
2272 	txqs.compl = 0;
2273 
2274 	write_wr_hdr_sgl(ndesc, d, &txqs, q, t3sgl, flits, sgl_flits,
2275 	    from->wrh_hi, from->wrh_lo);
2276 }
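
/*
 * Illustrative sketch (not part of the driver): how write_ofld_wr() above
 * packs scatter/gather segments two at a time into sg_ent-style entries,
 * each of which holds a pair of length/address slots.  The standalone type
 * and names are hypothetical; the real code walks a struct sglist.
 */
#if 0
struct example_sg_ent {
	uint32_t len[2];
	uint64_t addr[2];
};

static void
example_pack_sgl(struct example_sg_ent *sgp, const uint64_t *paddr,
    const uint32_t *seglen, int nsegs)
{
	int i, idx;

	for (idx = 0, i = 0; i < nsegs; i++) {
		if (i && idx == 0)
			++sgp;		/* both slots filled, move on */
		sgp->len[idx] = htobe32(seglen[i]);
		sgp->addr[idx] = htobe64(paddr[i]);
		idx ^= 1;
	}
	if (idx) {			/* odd segment count: zero the spare slot */
		sgp->len[idx] = 0;
		sgp->addr[idx] = 0;
	}
}
#endif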
2277 
2278 /**
2279  *	ofld_xmit - send a packet through an offload queue
2280  *	@adap: the adapter
2281  *	@q: the Tx offload queue
2282  *	@m: the packet
2283  *
2284  *	Send an offload packet through an SGE offload queue.
2285  */
2286 static int
2287 ofld_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m)
2288 {
2289 	int ret;
2290 	unsigned int ndesc;
2291 	unsigned int pidx, gen;
2292 	struct sge_txq *q = &qs->txq[TXQ_OFLD];
2293 	struct ofld_hdr *oh = mtod(m, struct ofld_hdr *);
2294 
2295 	ndesc = G_HDR_NDESC(oh->flags);
2296 
2297 	TXQ_LOCK(qs);
2298 again:	reclaim_completed_tx(qs, 16, TXQ_OFLD);
2299 	ret = check_desc_avail(adap, q, m, ndesc, TXQ_OFLD);
2300 	if (__predict_false(ret)) {
2301 		if (ret == 1) {
2302 			TXQ_UNLOCK(qs);
2303 			return (EINTR);
2304 		}
2305 		goto again;
2306 	}
2307 
2308 	gen = q->gen;
2309 	q->in_use += ndesc;
2310 	pidx = q->pidx;
2311 	q->pidx += ndesc;
2312 	if (q->pidx >= q->size) {
2313 		q->pidx -= q->size;
2314 		q->gen ^= 1;
2315 	}
2316 
2317 	write_ofld_wr(adap, m, q, pidx, gen, ndesc);
2318 	check_ring_tx_db(adap, q, 1);
2319 	TXQ_UNLOCK(qs);
2320 
2321 	return (0);
2322 }
2323 
2324 /**
2325  *	restart_offloadq - restart a suspended offload queue
2326  *	@qs: the queue set containing the offload queue
2327  *
2328  *	Resumes transmission on a suspended Tx offload queue.
2329  */
2330 static void
2331 restart_offloadq(void *data, int npending)
2332 {
2333 	struct mbuf *m;
2334 	struct sge_qset *qs = data;
2335 	struct sge_txq *q = &qs->txq[TXQ_OFLD];
2336 	adapter_t *adap = qs->port->adapter;
2337 	int cleaned;
2338 
2339 	TXQ_LOCK(qs);
2340 again:	cleaned = reclaim_completed_tx(qs, 16, TXQ_OFLD);
2341 
2342 	while ((m = mbufq_first(&q->sendq)) != NULL) {
2343 		unsigned int gen, pidx;
2344 		struct ofld_hdr *oh = mtod(m, struct ofld_hdr *);
2345 		unsigned int ndesc = G_HDR_NDESC(oh->flags);
2346 
2347 		if (__predict_false(q->size - q->in_use < ndesc)) {
2348 			setbit(&qs->txq_stopped, TXQ_OFLD);
2349 			if (should_restart_tx(q) &&
2350 			    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped))
2351 				goto again;
2352 			q->stops++;
2353 			break;
2354 		}
2355 
2356 		gen = q->gen;
2357 		q->in_use += ndesc;
2358 		pidx = q->pidx;
2359 		q->pidx += ndesc;
2360 		if (q->pidx >= q->size) {
2361 			q->pidx -= q->size;
2362 			q->gen ^= 1;
2363 		}
2364 
2365 		(void)mbufq_dequeue(&q->sendq);
2366 		TXQ_UNLOCK(qs);
2367 		write_ofld_wr(adap, m, q, pidx, gen, ndesc);
2368 		TXQ_LOCK(qs);
2369 	}
2370 #if USE_GTS
2371 	set_bit(TXQ_RUNNING, &q->flags);
2372 	set_bit(TXQ_LAST_PKT_DB, &q->flags);
2373 #endif
2374 	TXQ_UNLOCK(qs);
2375 	wmb();
2376 	t3_write_reg(adap, A_SG_KDOORBELL,
2377 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
2378 }
2379 
2380 /**
2381  *	t3_offload_tx - send an offload packet
2382  *	@sc: the adapter
2383  *	@m: the packet
2384  *
2385  *	Sends an offload packet.  The ofld_hdr flags select the destination
2386  *	queue set and whether it goes out on the offload or the control queue.
2387  */
2388 int
2389 t3_offload_tx(struct adapter *sc, struct mbuf *m)
2390 {
2391 	struct ofld_hdr *oh = mtod(m, struct ofld_hdr *);
2392 	struct sge_qset *qs = &sc->sge.qs[G_HDR_QSET(oh->flags)];
2393 
2394 	if (oh->flags & F_HDR_CTRL) {
2395 		m_adj(m, sizeof (*oh));	/* trim ofld_hdr off */
2396 		return (ctrl_xmit(sc, qs, m));
2397 	} else
2398 		return (ofld_xmit(sc, qs, m));
2399 }
2400 #endif
2401 
2402 static void
2403 restart_tx(struct sge_qset *qs)
2404 {
2405 	struct adapter *sc = qs->port->adapter;
2406 
2407 	if (isset(&qs->txq_stopped, TXQ_OFLD) &&
2408 	    should_restart_tx(&qs->txq[TXQ_OFLD]) &&
2409 	    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) {
2410 		qs->txq[TXQ_OFLD].restarts++;
2411 		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2412 	}
2413 
2414 	if (isset(&qs->txq_stopped, TXQ_CTRL) &&
2415 	    should_restart_tx(&qs->txq[TXQ_CTRL]) &&
2416 	    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) {
2417 		qs->txq[TXQ_CTRL].restarts++;
2418 		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2419 	}
2420 }
2421 
2422 /**
2423  *	t3_sge_alloc_qset - initialize an SGE queue set
2424  *	@sc: the controller softc
2425  *	@id: the queue set id
2426  *	@nports: how many Ethernet ports will be using this queue set
2427  *	@irq_vec_idx: the IRQ vector index for response queue interrupts
2428  *	@p: configuration parameters for this queue set
2429  *	@ntxq: number of Tx queues for the queue set
2430  *	@pi: port info for queue set
2431  *
2432  *	Allocate resources and initialize an SGE queue set.  A queue set
2433  *	comprises a response queue, two Rx free-buffer queues, and up to 3
2434  *	Tx queues.  The Tx queues are assigned roles in the order Ethernet
2435  *	queue, offload queue, and control queue.
2436  */
2437 int
2438 t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx,
2439 		  const struct qset_params *p, int ntxq, struct port_info *pi)
2440 {
2441 	struct sge_qset *q = &sc->sge.qs[id];
2442 	int i, ret = 0;
2443 
2444 	MTX_INIT(&q->lock, q->namebuf, NULL, MTX_DEF);
2445 	q->port = pi;
2446 	q->adap = sc;
2447 
2448 	if ((q->txq[TXQ_ETH].txq_mr = buf_ring_alloc(cxgb_txq_buf_ring_size,
2449 	    M_DEVBUF, M_WAITOK, &q->lock)) == NULL) {
2450 		device_printf(sc->dev, "failed to allocate mbuf ring\n");
2451 		goto err;
2452 	}
2453 	if ((q->txq[TXQ_ETH].txq_ifq = malloc(sizeof(struct ifaltq), M_DEVBUF,
2454 	    M_NOWAIT | M_ZERO)) == NULL) {
2455 		device_printf(sc->dev, "failed to allocate ifq\n");
2456 		goto err;
2457 	}
2458 	ifq_init(q->txq[TXQ_ETH].txq_ifq, pi->ifp);
2459 	callout_init(&q->txq[TXQ_ETH].txq_timer, 1);
2460 	callout_init(&q->txq[TXQ_ETH].txq_watchdog, 1);
2461 	q->txq[TXQ_ETH].txq_timer.c_cpu = id % mp_ncpus;
2462 	q->txq[TXQ_ETH].txq_watchdog.c_cpu = id % mp_ncpus;
2463 
2464 	init_qset_cntxt(q, id);
2465 	q->idx = id;
2466 	if ((ret = alloc_ring(sc, p->fl_size, sizeof(struct rx_desc),
2467 		    sizeof(struct rx_sw_desc), &q->fl[0].phys_addr,
2468 		    &q->fl[0].desc, &q->fl[0].sdesc,
2469 		    &q->fl[0].desc_tag, &q->fl[0].desc_map,
2470 		    sc->rx_dmat, &q->fl[0].entry_tag)) != 0) {
2471 		printf("error %d from alloc ring fl0\n", ret);
2472 		goto err;
2473 	}
2474 
2475 	if ((ret = alloc_ring(sc, p->jumbo_size, sizeof(struct rx_desc),
2476 		    sizeof(struct rx_sw_desc), &q->fl[1].phys_addr,
2477 		    &q->fl[1].desc, &q->fl[1].sdesc,
2478 		    &q->fl[1].desc_tag, &q->fl[1].desc_map,
2479 		    sc->rx_jumbo_dmat, &q->fl[1].entry_tag)) != 0) {
2480 		printf("error %d from alloc ring fl1\n", ret);
2481 		goto err;
2482 	}
2483 
2484 	if ((ret = alloc_ring(sc, p->rspq_size, sizeof(struct rsp_desc), 0,
2485 		    &q->rspq.phys_addr, &q->rspq.desc, NULL,
2486 		    &q->rspq.desc_tag, &q->rspq.desc_map,
2487 		    NULL, NULL)) != 0) {
2488 		printf("error %d from alloc ring rspq\n", ret);
2489 		goto err;
2490 	}
2491 
2492 	snprintf(q->rspq.lockbuf, RSPQ_NAME_LEN, "t3 rspq lock %d:%d",
2493 	    device_get_unit(sc->dev), irq_vec_idx);
2494 	MTX_INIT(&q->rspq.lock, q->rspq.lockbuf, NULL, MTX_DEF);
2495 
2496 	for (i = 0; i < ntxq; ++i) {
2497 		size_t sz = i == TXQ_CTRL ? 0 : sizeof(struct tx_sw_desc);
2498 
2499 		if ((ret = alloc_ring(sc, p->txq_size[i],
2500 			    sizeof(struct tx_desc), sz,
2501 			    &q->txq[i].phys_addr, &q->txq[i].desc,
2502 			    &q->txq[i].sdesc, &q->txq[i].desc_tag,
2503 			    &q->txq[i].desc_map,
2504 			    sc->tx_dmat, &q->txq[i].entry_tag)) != 0) {
2505 			printf("error %d from alloc ring tx %i\n", ret, i);
2506 			goto err;
2507 		}
2508 		mbufq_init(&q->txq[i].sendq, INT_MAX);
2509 		q->txq[i].gen = 1;
2510 		q->txq[i].size = p->txq_size[i];
2511 	}
2512 
2513 #ifdef TCP_OFFLOAD
2514 	TASK_INIT(&q->txq[TXQ_OFLD].qresume_task, 0, restart_offloadq, q);
2515 #endif
2516 	TASK_INIT(&q->txq[TXQ_CTRL].qresume_task, 0, restart_ctrlq, q);
2517 	TASK_INIT(&q->txq[TXQ_ETH].qreclaim_task, 0, sge_txq_reclaim_handler, q);
2518 	TASK_INIT(&q->txq[TXQ_OFLD].qreclaim_task, 0, sge_txq_reclaim_handler, q);
2519 
2520 	q->fl[0].gen = q->fl[1].gen = 1;
2521 	q->fl[0].size = p->fl_size;
2522 	q->fl[1].size = p->jumbo_size;
2523 
2524 	q->rspq.gen = 1;
2525 	q->rspq.cidx = 0;
2526 	q->rspq.size = p->rspq_size;
2527 
2528 	q->txq[TXQ_ETH].stop_thres = nports *
2529 	    flits_to_desc(sgl_len(TX_MAX_SEGS + 1) + 3);
2530 
2531 	q->fl[0].buf_size = MCLBYTES;
2532 	q->fl[0].zone = zone_pack;
2533 	q->fl[0].type = EXT_PACKET;
2534 
2535 	if (p->jumbo_buf_size ==  MJUM16BYTES) {
2536 		q->fl[1].zone = zone_jumbo16;
2537 		q->fl[1].type = EXT_JUMBO16;
2538 	} else if (p->jumbo_buf_size ==  MJUM9BYTES) {
2539 		q->fl[1].zone = zone_jumbo9;
2540 		q->fl[1].type = EXT_JUMBO9;
2541 	} else if (p->jumbo_buf_size ==  MJUMPAGESIZE) {
2542 		q->fl[1].zone = zone_jumbop;
2543 		q->fl[1].type = EXT_JUMBOP;
2544 	} else {
2545 		KASSERT(0, ("can't deal with jumbo_buf_size %d.", p->jumbo_buf_size));
2546 		ret = EDOOFUS;
2547 		goto err;
2548 	}
2549 	q->fl[1].buf_size = p->jumbo_buf_size;
2550 
2551 	/* Allocate and setup the lro_ctrl structure */
2552 	q->lro.enabled = !!(pi->ifp->if_capenable & IFCAP_LRO);
2553 #if defined(INET6) || defined(INET)
2554 	ret = tcp_lro_init(&q->lro.ctrl);
2555 	if (ret) {
2556 		printf("error %d from tcp_lro_init\n", ret);
2557 		goto err;
2558 	}
2559 #endif
2560 	q->lro.ctrl.ifp = pi->ifp;
2561 
2562 	mtx_lock_spin(&sc->sge.reg_lock);
2563 	ret = -t3_sge_init_rspcntxt(sc, q->rspq.cntxt_id, irq_vec_idx,
2564 				   q->rspq.phys_addr, q->rspq.size,
2565 				   q->fl[0].buf_size, 1, 0);
2566 	if (ret) {
2567 		printf("error %d from t3_sge_init_rspcntxt\n", ret);
2568 		goto err_unlock;
2569 	}
2570 
2571 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
2572 		ret = -t3_sge_init_flcntxt(sc, q->fl[i].cntxt_id, 0,
2573 					  q->fl[i].phys_addr, q->fl[i].size,
2574 					  q->fl[i].buf_size, p->cong_thres, 1,
2575 					  0);
2576 		if (ret) {
2577 			printf("error %d from t3_sge_init_flcntxt for index i=%d\n", ret, i);
2578 			goto err_unlock;
2579 		}
2580 	}
2581 
2582 	ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_ETH].cntxt_id, USE_GTS,
2583 				 SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr,
2584 				 q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token,
2585 				 1, 0);
2586 	if (ret) {
2587 		printf("error %d from t3_sge_init_ecntxt\n", ret);
2588 		goto err_unlock;
2589 	}
2590 
2591 	if (ntxq > 1) {
2592 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_OFLD].cntxt_id,
2593 					 USE_GTS, SGE_CNTXT_OFLD, id,
2594 					 q->txq[TXQ_OFLD].phys_addr,
2595 					 q->txq[TXQ_OFLD].size, 0, 1, 0);
2596 		if (ret) {
2597 			printf("error %d from t3_sge_init_ecntxt\n", ret);
2598 			goto err_unlock;
2599 		}
2600 	}
2601 
2602 	if (ntxq > 2) {
2603 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_CTRL].cntxt_id, 0,
2604 					 SGE_CNTXT_CTRL, id,
2605 					 q->txq[TXQ_CTRL].phys_addr,
2606 					 q->txq[TXQ_CTRL].size,
2607 					 q->txq[TXQ_CTRL].token, 1, 0);
2608 		if (ret) {
2609 			printf("error %d from t3_sge_init_ecntxt\n", ret);
2610 			goto err_unlock;
2611 		}
2612 	}
2613 
2614 	mtx_unlock_spin(&sc->sge.reg_lock);
2615 	t3_update_qset_coalesce(q, p);
2616 
2617 	refill_fl(sc, &q->fl[0], q->fl[0].size);
2618 	refill_fl(sc, &q->fl[1], q->fl[1].size);
2619 	refill_rspq(sc, &q->rspq, q->rspq.size - 1);
2620 
2621 	t3_write_reg(sc, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) |
2622 		     V_NEWTIMER(q->rspq.holdoff_tmr));
2623 
2624 	return (0);
2625 
2626 err_unlock:
2627 	mtx_unlock_spin(&sc->sge.reg_lock);
2628 err:
2629 	TXQ_LOCK(q);
2630 	t3_free_qset(sc, q);
2631 
2632 	return (ret);
2633 }
2634 
2635 /*
2636  * Remove CPL_RX_PKT headers from the mbuf and reduce it to a regular mbuf with
2637  * ethernet data.  Hardware assistance with various checksums and any vlan tag
2638  * will also be taken into account here.
2639  */
2640 void
2641 t3_rx_eth(struct adapter *adap, struct mbuf *m, int ethpad)
2642 {
2643 	struct cpl_rx_pkt *cpl = (struct cpl_rx_pkt *)(mtod(m, uint8_t *) + ethpad);
2644 	struct port_info *pi = &adap->port[adap->rxpkt_map[cpl->iff]];
2645 	struct ifnet *ifp = pi->ifp;
2646 
2647 	if (cpl->vlan_valid) {
2648 		m->m_pkthdr.ether_vtag = ntohs(cpl->vlan);
2649 		m->m_flags |= M_VLANTAG;
2650 	}
2651 
2652 	m->m_pkthdr.rcvif = ifp;
2653 	/*
2654 	 * adjust after conversion to mbuf chain
2655 	 */
2656 	m->m_pkthdr.len -= (sizeof(*cpl) + ethpad);
2657 	m->m_len -= (sizeof(*cpl) + ethpad);
2658 	m->m_data += (sizeof(*cpl) + ethpad);
2659 
2660 	if (!cpl->fragment && cpl->csum_valid && cpl->csum == 0xffff) {
2661 		struct ether_header *eh = mtod(m, void *);
2662 		uint16_t eh_type;
2663 
2664 		if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2665 			struct ether_vlan_header *evh = mtod(m, void *);
2666 
2667 			eh_type = evh->evl_proto;
2668 		} else
2669 			eh_type = eh->ether_type;
2670 
2671 		if (ifp->if_capenable & IFCAP_RXCSUM &&
2672 		    eh_type == htons(ETHERTYPE_IP)) {
2673 			m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED |
2674 			    CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2675 			m->m_pkthdr.csum_data = 0xffff;
2676 		} else if (ifp->if_capenable & IFCAP_RXCSUM_IPV6 &&
2677 		    eh_type == htons(ETHERTYPE_IPV6)) {
2678 			m->m_pkthdr.csum_flags = (CSUM_DATA_VALID_IPV6 |
2679 			    CSUM_PSEUDO_HDR);
2680 			m->m_pkthdr.csum_data = 0xffff;
2681 		}
2682 	}
2683 }
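
/*
 * Illustrative sketch (not part of the driver): the ethertype selection that
 * t3_rx_eth() above performs before choosing the receive-checksum flags.
 * For 802.1Q frames the encapsulated protocol field is used instead of the
 * outer ethertype.  The helper name is hypothetical.
 */
#if 0
static uint16_t
example_l3_ethertype(const struct ether_header *eh)
{
	const struct ether_vlan_header *evh;

	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		evh = (const struct ether_vlan_header *)eh;
		return (evh->evl_proto);	/* inner protocol, network order */
	}
	return (eh->ether_type);
}
#endif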
2684 
2685 /**
2686  *	get_packet - return the next ingress packet buffer from a free list
2687  *	@adap: the adapter that received the packet
2688  *	@drop_thres: # of remaining buffers before we start dropping packets
2689  *	@qs: the qset that the SGE free list holding the packet belongs to
2690  *      @mh: the mbuf header, contains a pointer to the head and tail of the mbuf chain
2691  *      @r: response descriptor
2692  *
2693  *	Get the next packet from a free list and complete setup of the
2694  *	sk_buff.  If the packet is small we make a copy and recycle the
2695  *	mbuf.  If the packet is small we make a copy and recycle the
2696  *	positive drop threshold is supplied packets are dropped and their
2697  *	buffers recycled if (a) the number of remaining buffers is under the
2698  *	threshold and the packet is too big to copy, or (b) the packet should
2699  *	be copied but there is no memory for the copy.
2700  */
2701 static int
2702 get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
2703     struct t3_mbuf_hdr *mh, struct rsp_desc *r)
2704 {
2705 
2706 	unsigned int len_cq =  ntohl(r->len_cq);
2707 	struct sge_fl *fl = (len_cq & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
2708 	int mask, cidx = fl->cidx;
2709 	struct rx_sw_desc *sd = &fl->sdesc[cidx];
2710 	uint32_t len = G_RSPD_LEN(len_cq);
2711 	uint32_t flags = M_EXT;
2712 	uint8_t sopeop = G_RSPD_SOP_EOP(ntohl(r->flags));
2713 	caddr_t cl;
2714 	struct mbuf *m;
2715 	int ret = 0;
2716 
2717 	mask = fl->size - 1;
2718 	prefetch(fl->sdesc[(cidx + 1) & mask].m);
2719 	prefetch(fl->sdesc[(cidx + 2) & mask].m);
2720 	prefetch(fl->sdesc[(cidx + 1) & mask].rxsd_cl);
2721 	prefetch(fl->sdesc[(cidx + 2) & mask].rxsd_cl);
2722 
2723 	fl->credits--;
2724 	bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD);
2725 
2726 	if (recycle_enable && len <= SGE_RX_COPY_THRES &&
2727 	    sopeop == RSPQ_SOP_EOP) {
2728 		if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL)
2729 			goto skip_recycle;
2730 		cl = mtod(m, void *);
2731 		memcpy(cl, sd->rxsd_cl, len);
2732 		recycle_rx_buf(adap, fl, fl->cidx);
2733 		m->m_pkthdr.len = m->m_len = len;
2734 		m->m_flags = 0;
2735 		mh->mh_head = mh->mh_tail = m;
2736 		ret = 1;
2737 		goto done;
2738 	} else {
2739 	skip_recycle:
2740 		bus_dmamap_unload(fl->entry_tag, sd->map);
2741 		cl = sd->rxsd_cl;
2742 		m = sd->m;
2743 
2744 		if ((sopeop == RSPQ_SOP_EOP) ||
2745 		    (sopeop == RSPQ_SOP))
2746 			flags |= M_PKTHDR;
2747 		m_init(m, M_NOWAIT, MT_DATA, flags);
2748 		if (fl->zone == zone_pack) {
2749 			/*
2750 			 * restore clobbered data pointer
2751 			 */
2752 			m->m_data = m->m_ext.ext_buf;
2753 		} else {
2754 			m_cljset(m, cl, fl->type);
2755 		}
2756 		m->m_len = len;
2757 	}
2758 	switch(sopeop) {
2759 	case RSPQ_SOP_EOP:
2760 		ret = 1;
2761 		/* FALLTHROUGH */
2762 	case RSPQ_SOP:
2763 		mh->mh_head = mh->mh_tail = m;
2764 		m->m_pkthdr.len = len;
2765 		break;
2766 	case RSPQ_EOP:
2767 		ret = 1;
2768 		/* FALLTHROUGH */
2769 	case RSPQ_NSOP_NEOP:
2770 		if (mh->mh_tail == NULL) {
2771 			log(LOG_ERR, "discarding intermediate descriptor entry\n");
2772 			m_freem(m);
2773 			break;
2774 		}
2775 		mh->mh_tail->m_next = m;
2776 		mh->mh_tail = m;
2777 		mh->mh_head->m_pkthdr.len += len;
2778 		break;
2779 	}
2780 	if (cxgb_debug)
2781 		printf("len=%d pktlen=%d\n", m->m_len, m->m_pkthdr.len);
2782 done:
2783 	if (++fl->cidx == fl->size)
2784 		fl->cidx = 0;
2785 
2786 	return (ret);
2787 }
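
/*
 * Illustrative sketch (not part of the driver): the copy-versus-zero-copy
 * decision get_packet() above makes.  Small single-descriptor packets are
 * copied into a fresh mbuf so the receive cluster can be recycled in place;
 * everything else hands the original cluster up the stack.  The helper name
 * is hypothetical.
 */
#if 0
static int
example_should_copy(int recycle_enable, uint32_t len, uint8_t sopeop)
{

	return (recycle_enable && len <= SGE_RX_COPY_THRES &&
	    sopeop == RSPQ_SOP_EOP);
}
#endif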
2788 
2789 /**
2790  *	handle_rsp_cntrl_info - handles control information in a response
2791  *	@qs: the queue set corresponding to the response
2792  *	@flags: the response control flags
2793  *
2794  *	Handles the control information of an SGE response, such as GTS
2795  *	indications and completion credits for the queue set's Tx queues.
2796  *	HW coalesces credits, we don't do any extra SW coalescing.
2797  */
2798 static __inline void
2799 handle_rsp_cntrl_info(struct sge_qset *qs, uint32_t flags)
2800 {
2801 	unsigned int credits;
2802 
2803 #if USE_GTS
2804 	if (flags & F_RSPD_TXQ0_GTS)
2805 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags);
2806 #endif
2807 	credits = G_RSPD_TXQ0_CR(flags);
2808 	if (credits)
2809 		qs->txq[TXQ_ETH].processed += credits;
2810 
2811 	credits = G_RSPD_TXQ2_CR(flags);
2812 	if (credits)
2813 		qs->txq[TXQ_CTRL].processed += credits;
2814 
2815 # if USE_GTS
2816 	if (flags & F_RSPD_TXQ1_GTS)
2817 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags);
2818 # endif
2819 	credits = G_RSPD_TXQ1_CR(flags);
2820 	if (credits)
2821 		qs->txq[TXQ_OFLD].processed += credits;
2822 
2823 }
2824 
2825 static void
2826 check_ring_db(adapter_t *adap, struct sge_qset *qs,
2827     unsigned int sleeping)
2828 {
2829 	;
2830 }
2831 
2832 /**
2833  *	process_responses - process responses from an SGE response queue
2834  *	@adap: the adapter
2835  *	@qs: the queue set to which the response queue belongs
2836  *	@budget: how many responses can be processed in this round
2837  *
2838  *	Process responses from an SGE response queue up to the supplied budget.
2839  *	Responses include received packets as well as credits and other events
2840  *	for the queues that belong to the response queue's queue set.
2841  *	A negative budget is effectively unlimited.
2842  *
2843  *	Additionally choose the interrupt holdoff time for the next interrupt
2844  *	on this queue.  If the system is under memory shortage use a fairly
2845  *	long delay to help recovery.
2846  */
2847 static int
2848 process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
2849 {
2850 	struct sge_rspq *rspq = &qs->rspq;
2851 	struct rsp_desc *r = &rspq->desc[rspq->cidx];
2852 	int budget_left = budget;
2853 	unsigned int sleeping = 0;
2854 #if defined(INET6) || defined(INET)
2855 	int lro_enabled = qs->lro.enabled;
2856 	int skip_lro;
2857 	struct lro_ctrl *lro_ctrl = &qs->lro.ctrl;
2858 #endif
2859 	struct t3_mbuf_hdr *mh = &rspq->rspq_mh;
2860 #ifdef DEBUG
2861 	static int last_holdoff = 0;
2862 	if (cxgb_debug && rspq->holdoff_tmr != last_holdoff) {
2863 		printf("next_holdoff=%d\n", rspq->holdoff_tmr);
2864 		last_holdoff = rspq->holdoff_tmr;
2865 	}
2866 #endif
2867 	rspq->next_holdoff = rspq->holdoff_tmr;
2868 
2869 	while (__predict_true(budget_left && is_new_response(r, rspq))) {
2870 		int eth, eop = 0, ethpad = 0;
2871 		uint32_t flags = ntohl(r->flags);
2872 		uint32_t rss_hash = be32toh(r->rss_hdr.rss_hash_val);
2873 		uint8_t opcode = r->rss_hdr.opcode;
2874 
2875 		eth = (opcode == CPL_RX_PKT);
2876 
2877 		if (__predict_false(flags & F_RSPD_ASYNC_NOTIF)) {
2878 			struct mbuf *m;
2879 
2880 			if (cxgb_debug)
2881 				printf("async notification\n");
2882 
2883 			if (mh->mh_head == NULL) {
2884 				mh->mh_head = m_gethdr(M_NOWAIT, MT_DATA);
2885 				m = mh->mh_head;
2886 			} else {
2887 				m = m_gethdr(M_NOWAIT, MT_DATA);
2888 			}
2889 			if (m == NULL)
2890 				goto no_mem;
2891 
2892 			memcpy(mtod(m, char *), r, AN_PKT_SIZE);
2893 			m->m_len = m->m_pkthdr.len = AN_PKT_SIZE;
2894 			*mtod(m, uint8_t *) = CPL_ASYNC_NOTIF;
2895 			opcode = CPL_ASYNC_NOTIF;
2896 			eop = 1;
2897 			rspq->async_notif++;
2898 			goto skip;
2899 		} else if  (flags & F_RSPD_IMM_DATA_VALID) {
2900 			struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);
2901 
2902 			if (m == NULL) {
2903 		no_mem:
2904 				rspq->next_holdoff = NOMEM_INTR_DELAY;
2905 				budget_left--;
2906 				break;
2907 			}
2908 			if (mh->mh_head == NULL)
2909 				mh->mh_head = m;
2910                         else
2911 			else
2912 			mh->mh_tail = m;
2913 
2914 			get_imm_packet(adap, r, m);
2915 			mh->mh_head->m_pkthdr.len += m->m_len;
2916 			eop = 1;
2917 			rspq->imm_data++;
2918 		} else if (r->len_cq) {
2919 			int drop_thresh = eth ? SGE_RX_DROP_THRES : 0;
2920 
2921 			eop = get_packet(adap, drop_thresh, qs, mh, r);
2922 			if (eop) {
2923 				if (r->rss_hdr.hash_type && !adap->timestamp) {
2924 					M_HASHTYPE_SET(mh->mh_head,
2925 					    M_HASHTYPE_OPAQUE_HASH);
2926 					mh->mh_head->m_pkthdr.flowid = rss_hash;
2927 				}
2928 			}
2929 
2930 			ethpad = 2;
2931 		} else {
2932 			rspq->pure_rsps++;
2933 		}
2934 	skip:
2935 		if (flags & RSPD_CTRL_MASK) {
2936 			sleeping |= flags & RSPD_GTS_MASK;
2937 			handle_rsp_cntrl_info(qs, flags);
2938 		}
2939 
2940 		if (!eth && eop) {
2941 			rspq->offload_pkts++;
2942 #ifdef TCP_OFFLOAD
2943 			adap->cpl_handler[opcode](qs, r, mh->mh_head);
2944 #else
2945 			m_freem(mh->mh_head);
2946 #endif
2947 			mh->mh_head = NULL;
2948 		} else if (eth && eop) {
2949 			struct mbuf *m = mh->mh_head;
2950 
2951 			t3_rx_eth(adap, m, ethpad);
2952 
2953 			/*
2954 			 * The T304 sends incoming packets on any qset.  If LRO
2955 			 * is also enabled, we could end up sending packet up
2956 			 * is also enabled, we could end up sending the packet up
2957 			 *
2958 			 * The mbuf's rcvif was derived from the cpl header and
2959 			 * is accurate.  Skip LRO and just use that.
2960 			 */
2961 #if defined(INET6) || defined(INET)
2962 			skip_lro = __predict_false(qs->port->ifp != m->m_pkthdr.rcvif);
2963 
2964 			if (lro_enabled && lro_ctrl->lro_cnt && !skip_lro
2965 			    && (tcp_lro_rx(lro_ctrl, m, 0) == 0)
2966 			    ) {
2967 				/* successfully queued for LRO */
2968 			} else
2969 #endif
2970 			{
2971 				/*
2972 				 * LRO not enabled, packet unsuitable for LRO,
2973 				 * or unable to queue.  Pass it up right now in
2974 				 * either case.
2975 				 */
2976 				struct ifnet *ifp = m->m_pkthdr.rcvif;
2977 				(*ifp->if_input)(ifp, m);
2978 			}
2979 			mh->mh_head = NULL;
2980 
2981 		}
2982 
2983 		r++;
2984 		if (__predict_false(++rspq->cidx == rspq->size)) {
2985 			rspq->cidx = 0;
2986 			rspq->gen ^= 1;
2987 			r = rspq->desc;
2988 		}
2989 
2990 		if (++rspq->credits >= 64) {
2991 			refill_rspq(adap, rspq, rspq->credits);
2992 			rspq->credits = 0;
2993 		}
2994 		__refill_fl_lt(adap, &qs->fl[0], 32);
2995 		__refill_fl_lt(adap, &qs->fl[1], 32);
2996 		--budget_left;
2997 	}
2998 
2999 #if defined(INET6) || defined(INET)
3000 	/* Flush LRO */
3001 	tcp_lro_flush_all(lro_ctrl);
3002 #endif
3003 
3004 	if (sleeping)
3005 		check_ring_db(adap, qs, sleeping);
3006 
3007 	mb();  /* commit Tx queue processed updates */
3008 	if (__predict_false(qs->txq_stopped > 1))
3009 		restart_tx(qs);
3010 
3011 	__refill_fl_lt(adap, &qs->fl[0], 512);
3012 	__refill_fl_lt(adap, &qs->fl[1], 512);
3013 	budget -= budget_left;
3014 	return (budget);
3015 }
3016 
3017 /*
3018  * A helper function that processes responses and issues GTS.
3019  */
3020 static __inline int
3021 process_responses_gts(adapter_t *adap, struct sge_rspq *rq)
3022 {
3023 	int work;
3024 	static int last_holdoff = 0;
3025 
3026 	work = process_responses(adap, rspq_to_qset(rq), -1);
3027 
3028 	if (cxgb_debug && (rq->next_holdoff != last_holdoff)) {
3029 		printf("next_holdoff=%d\n", rq->next_holdoff);
3030 		last_holdoff = rq->next_holdoff;
3031 	}
3032 	t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) |
3033 	    V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx));
3034 
3035 	return (work);
3036 }
3037 
3038 #ifdef DEBUGNET
3039 int
3040 cxgb_debugnet_poll_rx(adapter_t *adap, struct sge_qset *qs)
3041 {
3042 
3043 	return (process_responses_gts(adap, &qs->rspq));
3044 }
3045 #endif
3046 
3047 /*
3048  * Interrupt handler for legacy INTx interrupts for T3B-based cards.
3049  * Handles data events from SGE response queues as well as error and other
3050  * async events as they all use the same interrupt pin.  We use one SGE
3051  * response queue per port in this mode and protect all response queues with
3052  * queue 0's lock.
3053  */
3054 void
3055 t3b_intr(void *data)
3056 {
3057 	uint32_t i, map;
3058 	adapter_t *adap = data;
3059 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
3060 
3061 	t3_write_reg(adap, A_PL_CLI, 0);
3062 	map = t3_read_reg(adap, A_SG_DATA_INTR);
3063 
3064 	if (!map)
3065 		return;
3066 
3067 	if (__predict_false(map & F_ERRINTR)) {
3068 		t3_write_reg(adap, A_PL_INT_ENABLE0, 0);
3069 		(void) t3_read_reg(adap, A_PL_INT_ENABLE0);
3070 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
3071 	}
3072 
3073 	mtx_lock(&q0->lock);
3074 	for_each_port(adap, i)
3075 	    if (map & (1 << i))
3076 			process_responses_gts(adap, &adap->sge.qs[i].rspq);
3077 	mtx_unlock(&q0->lock);
3078 }
3079 
3080 /*
3081  * The MSI interrupt handler.  This needs to handle data events from SGE
3082  * response queues as well as error and other async events as they all use
3083  * the same MSI vector.  We use one SGE response queue per port in this mode
3084  * and protect all response queues with queue 0's lock.
3085  */
3086 void
3087 t3_intr_msi(void *data)
3088 {
3089 	adapter_t *adap = data;
3090 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
3091 	int i, new_packets = 0;
3092 
3093 	mtx_lock(&q0->lock);
3094 
3095 	for_each_port(adap, i)
3096 	    if (process_responses_gts(adap, &adap->sge.qs[i].rspq))
3097 		    new_packets = 1;
3098 	mtx_unlock(&q0->lock);
3099 	if (new_packets == 0) {
3100 		t3_write_reg(adap, A_PL_INT_ENABLE0, 0);
3101 		(void) t3_read_reg(adap, A_PL_INT_ENABLE0);
3102 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
3103 	}
3104 }
3105 
3106 void
3107 t3_intr_msix(void *data)
3108 {
3109 	struct sge_qset *qs = data;
3110 	adapter_t *adap = qs->port->adapter;
3111 	struct sge_rspq *rspq = &qs->rspq;
3112 
3113 	if (process_responses_gts(adap, rspq) == 0)
3114 		rspq->unhandled_irqs++;
3115 }
3116 
3117 #define QDUMP_SBUF_SIZE		(32 * 400)
3118 static int
3119 t3_dump_rspq(SYSCTL_HANDLER_ARGS)
3120 {
3121 	struct sge_rspq *rspq;
3122 	struct sge_qset *qs;
3123 	int i, err, dump_end, idx;
3124 	struct sbuf *sb;
3125 	struct rsp_desc *rspd;
3126 	uint32_t data[4];
3127 
3128 	rspq = arg1;
3129 	qs = rspq_to_qset(rspq);
3130 	if (rspq->rspq_dump_count == 0)
3131 		return (0);
3132 	if (rspq->rspq_dump_count > RSPQ_Q_SIZE) {
3133 		log(LOG_WARNING,
3134 		    "dump count is too large %d\n", rspq->rspq_dump_count);
3135 		rspq->rspq_dump_count = 0;
3136 		return (EINVAL);
3137 	}
3138 	if (rspq->rspq_dump_start > (RSPQ_Q_SIZE-1)) {
3139 		log(LOG_WARNING,
3140 		    "dump start of %d is greater than queue size\n",
3141 		    rspq->rspq_dump_start);
3142 		rspq->rspq_dump_start = 0;
3143 		return (EINVAL);
3144 	}
3145 	err = t3_sge_read_rspq(qs->port->adapter, rspq->cntxt_id, data);
3146 	if (err)
3147 		return (err);
3148 	err = sysctl_wire_old_buffer(req, 0);
3149 	if (err)
3150 		return (err);
3151 	sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req);
3152 
3153 	sbuf_printf(sb, " \n index=%u size=%u MSI-X/RspQ=%u intr enable=%u intr armed=%u\n",
3154 	    (data[0] & 0xffff), data[0] >> 16, ((data[2] >> 20) & 0x3f),
3155 	    ((data[2] >> 26) & 1), ((data[2] >> 27) & 1));
3156 	sbuf_printf(sb, " generation=%u CQ mode=%u FL threshold=%u\n",
3157 	    ((data[2] >> 28) & 1), ((data[2] >> 31) & 1), data[3]);
3158 
3159 	sbuf_printf(sb, " start=%d -> end=%d\n", rspq->rspq_dump_start,
3160 	    (rspq->rspq_dump_start + rspq->rspq_dump_count) & (RSPQ_Q_SIZE-1));
3161 
3162 	dump_end = rspq->rspq_dump_start + rspq->rspq_dump_count;
3163 	for (i = rspq->rspq_dump_start; i < dump_end; i++) {
3164 		idx = i & (RSPQ_Q_SIZE-1);
3165 
3166 		rspd = &rspq->desc[idx];
3167 		sbuf_printf(sb, "\tidx=%04d opcode=%02x cpu_idx=%x hash_type=%x cq_idx=%x\n",
3168 		    idx, rspd->rss_hdr.opcode, rspd->rss_hdr.cpu_idx,
3169 		    rspd->rss_hdr.hash_type, be16toh(rspd->rss_hdr.cq_idx));
3170 		sbuf_printf(sb, "\trss_hash_val=%x flags=%08x len_cq=%x intr_gen=%x\n",
3171 		    rspd->rss_hdr.rss_hash_val, be32toh(rspd->flags),
3172 		    be32toh(rspd->len_cq), rspd->intr_gen);
3173 	}
3174 
3175 	err = sbuf_finish(sb);
3176 	sbuf_delete(sb);
3177 	return (err);
3178 }
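
/*
 * Illustrative sketch (not part of the driver): reading the response-queue
 * dump from userland.  The dump_count and qdump nodes are created in
 * t3_add_configured_sysctls() below; the sysctl prefix used here is an
 * assumption and depends on how the controller attached.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

static int
example_dump_rspq(void)
{
	/* Hypothetical OID prefix; adjust to the attached device's tree. */
	const char *base = "dev.cxgbc.0.port0.qs0.rspq";
	char name[128], buf[32 * 400];
	size_t len = sizeof(buf);
	int count = 64;

	snprintf(name, sizeof(name), "%s.dump_count", base);
	if (sysctlbyname(name, NULL, NULL, &count, sizeof(count)) != 0)
		return (-1);
	snprintf(name, sizeof(name), "%s.qdump", base);
	if (sysctlbyname(name, buf, &len, NULL, 0) != 0)
		return (-1);
	fwrite(buf, 1, len, stdout);
	return (0);
}
#endif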
3179 
3180 static int
3181 t3_dump_txq_eth(SYSCTL_HANDLER_ARGS)
3182 {
3183 	struct sge_txq *txq;
3184 	struct sge_qset *qs;
3185 	int i, j, err, dump_end;
3186 	struct sbuf *sb;
3187 	struct tx_desc *txd;
3188 	uint32_t *WR, wr_hi, wr_lo, gen;
3189 	uint32_t data[4];
3190 
3191 	txq = arg1;
3192 	qs = txq_to_qset(txq, TXQ_ETH);
3193 	if (txq->txq_dump_count == 0) {
3194 		return (0);
3195 	}
3196 	if (txq->txq_dump_count > TX_ETH_Q_SIZE) {
3197 		log(LOG_WARNING,
3198 		    "dump count is too large %d\n", txq->txq_dump_count);
3199 		txq->txq_dump_count = 1;
3200 		return (EINVAL);
3201 	}
3202 	if (txq->txq_dump_start > (TX_ETH_Q_SIZE-1)) {
3203 		log(LOG_WARNING,
3204 		    "dump start of %d is greater than queue size\n",
3205 		    txq->txq_dump_start);
3206 		txq->txq_dump_start = 0;
3207 		return (EINVAL);
3208 	}
3209 	err = t3_sge_read_ecntxt(qs->port->adapter, qs->rspq.cntxt_id, data);
3210 	if (err)
3211 		return (err);
3212 	err = sysctl_wire_old_buffer(req, 0);
3213 	if (err)
3214 		return (err);
3215 	sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req);
3216 
3217 	sbuf_printf(sb, " \n credits=%u GTS=%u index=%u size=%u rspq#=%u cmdq#=%u\n",
3218 	    (data[0] & 0x7fff), ((data[0] >> 15) & 1), (data[0] >> 16),
3219 	    (data[1] & 0xffff), ((data[3] >> 4) & 7), ((data[3] >> 7) & 1));
3220 	sbuf_printf(sb, " TUN=%u TOE=%u generation=%u uP token=%u valid=%u\n",
3221 	    ((data[3] >> 8) & 1), ((data[3] >> 9) & 1), ((data[3] >> 10) & 1),
3222 	    ((data[3] >> 11) & 0xfffff), ((data[3] >> 31) & 1));
3223 	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
3224 	    txq->txq_dump_start,
3225 	    (txq->txq_dump_start + txq->txq_dump_count) & (TX_ETH_Q_SIZE-1));
3226 
3227 	dump_end = txq->txq_dump_start + txq->txq_dump_count;
3228 	for (i = txq->txq_dump_start; i < dump_end; i++) {
3229 		txd = &txq->desc[i & (TX_ETH_Q_SIZE-1)];
3230 		WR = (uint32_t *)txd->flit;
3231 		wr_hi = ntohl(WR[0]);
3232 		wr_lo = ntohl(WR[1]);
3233 		gen = G_WR_GEN(wr_lo);
3234 
3235 		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
3236 		    wr_hi, wr_lo, gen);
3237 		for (j = 2; j < 30; j += 4)
3238 			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
3239 			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
3240 
3241 	}
3242 	err = sbuf_finish(sb);
3243 	sbuf_delete(sb);
3244 	return (err);
3245 }
3246 
3247 static int
3248 t3_dump_txq_ctrl(SYSCTL_HANDLER_ARGS)
3249 {
3250 	struct sge_txq *txq;
3251 	struct sge_qset *qs;
3252 	int i, j, err, dump_end;
3253 	struct sbuf *sb;
3254 	struct tx_desc *txd;
3255 	uint32_t *WR, wr_hi, wr_lo, gen;
3256 
3257 	txq = arg1;
3258 	qs = txq_to_qset(txq, TXQ_CTRL);
3259 	if (txq->txq_dump_count == 0) {
3260 		return (0);
3261 	}
3262 	if (txq->txq_dump_count > 256) {
3263 		log(LOG_WARNING,
3264 		    "dump count is too large %d\n", txq->txq_dump_count);
3265 		txq->txq_dump_count = 1;
3266 		return (EINVAL);
3267 	}
3268 	if (txq->txq_dump_start > 255) {
3269 		log(LOG_WARNING,
3270 		    "dump start of %d is greater than queue size\n",
3271 		    txq->txq_dump_start);
3272 		txq->txq_dump_start = 0;
3273 		return (EINVAL);
3274 	}
3275 
3276 	err = sysctl_wire_old_buffer(req, 0);
3277 	if (err != 0)
3278 		return (err);
3279 	sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req);
3280 	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
3281 	    txq->txq_dump_start,
3282 	    (txq->txq_dump_start + txq->txq_dump_count) & 255);
3283 
3284 	dump_end = txq->txq_dump_start + txq->txq_dump_count;
3285 	for (i = txq->txq_dump_start; i < dump_end; i++) {
3286 		txd = &txq->desc[i & (255)];
3287 		WR = (uint32_t *)txd->flit;
3288 		wr_hi = ntohl(WR[0]);
3289 		wr_lo = ntohl(WR[1]);
3290 		gen = G_WR_GEN(wr_lo);
3291 
3292 		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
3293 		    wr_hi, wr_lo, gen);
3294 		for (j = 2; j < 30; j += 4)
3295 			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
3296 			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
3297 
3298 	}
3299 	err = sbuf_finish(sb);
3300 	sbuf_delete(sb);
3301 	return (err);
3302 }
3303 
3304 static int
3305 t3_set_coalesce_usecs(SYSCTL_HANDLER_ARGS)
3306 {
3307 	adapter_t *sc = arg1;
3308 	struct qset_params *qsp = &sc->params.sge.qset[0];
3309 	int coalesce_usecs;
3310 	struct sge_qset *qs;
3311 	int i, j, err, nqsets = 0;
3312 	struct mtx *lock;
3313 
3314 	if ((sc->flags & FULL_INIT_DONE) == 0)
3315 		return (ENXIO);
3316 
3317 	coalesce_usecs = qsp->coalesce_usecs;
3318 	err = sysctl_handle_int(oidp, &coalesce_usecs, arg2, req);
3319 
3320 	if (err != 0) {
3321 		return (err);
3322 	}
3323 	if (coalesce_usecs == qsp->coalesce_usecs)
3324 		return (0);
3325 
3326 	for (i = 0; i < sc->params.nports; i++)
3327 		for (j = 0; j < sc->port[i].nqsets; j++)
3328 			nqsets++;
3329 
3330 	coalesce_usecs = max(1, coalesce_usecs);
3331 
3332 	for (i = 0; i < nqsets; i++) {
3333 		qs = &sc->sge.qs[i];
3334 		qsp = &sc->params.sge.qset[i];
3335 		qsp->coalesce_usecs = coalesce_usecs;
3336 
3337 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
3338 			    &sc->sge.qs[0].rspq.lock;
3339 
3340 		mtx_lock(lock);
3341 		t3_update_qset_coalesce(qs, qsp);
3342 		t3_write_reg(sc, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) |
3343 		    V_NEWTIMER(qs->rspq.holdoff_tmr));
3344 		mtx_unlock(lock);
3345 	}
3346 
3347 	return (0);
3348 }
3349 
3350 static int
3351 t3_pkt_timestamp(SYSCTL_HANDLER_ARGS)
3352 {
3353 	adapter_t *sc = arg1;
3354 	int rc, timestamp;
3355 
3356 	if ((sc->flags & FULL_INIT_DONE) == 0)
3357 		return (ENXIO);
3358 
3359 	timestamp = sc->timestamp;
3360 	rc = sysctl_handle_int(oidp, &timestamp, arg2, req);
3361 
3362 	if (rc != 0)
3363 		return (rc);
3364 
3365 	if (timestamp != sc->timestamp) {
3366 		t3_set_reg_field(sc, A_TP_PC_CONFIG2, F_ENABLERXPKTTMSTPRSS,
3367 		    timestamp ? F_ENABLERXPKTTMSTPRSS : 0);
3368 		sc->timestamp = timestamp;
3369 	}
3370 
3371 	return (0);
3372 }
3373 
3374 void
3375 t3_add_attach_sysctls(adapter_t *sc)
3376 {
3377 	struct sysctl_ctx_list *ctx;
3378 	struct sysctl_oid_list *children;
3379 
3380 	ctx = device_get_sysctl_ctx(sc->dev);
3381 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3382 
3383 	/* random information */
3384 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
3385 	    "firmware_version",
3386 	    CTLFLAG_RD, sc->fw_version,
3387 	    0, "firmware version");
3388 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO,
3389 	    "hw_revision",
3390 	    CTLFLAG_RD, &sc->params.rev,
3391 	    0, "chip model");
3392 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
3393 	    "port_types",
3394 	    CTLFLAG_RD, sc->port_types,
3395 	    0, "type of ports");
3396 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3397 	    "enable_debug",
3398 	    CTLFLAG_RW, &cxgb_debug,
3399 	    0, "enable verbose debugging output");
3400 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tunq_coalesce",
3401 	    CTLFLAG_RD, &sc->tunq_coalesce,
3402 	    "#tunneled packets freed");
3403 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3404 	    "txq_overrun",
3405 	    CTLFLAG_RD, &txq_fills,
3406 	    0, "#times txq overrun");
3407 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO,
3408 	    "core_clock",
3409 	    CTLFLAG_RD, &sc->params.vpd.cclk,
3410 	    0, "core clock frequency (in kHz)");
3411 }
3412 
3413 
3414 static const char *rspq_name = "rspq";
3415 static const char *txq_names[] =
3416 {
3417 	"txq_eth",
3418 	"txq_ofld",
3419 	"txq_ctrl"
3420 };
3421 
3422 static int
3423 sysctl_handle_macstat(SYSCTL_HANDLER_ARGS)
3424 {
3425 	struct port_info *p = arg1;
3426 	uint64_t *parg;
3427 
3428 	if (!p)
3429 		return (EINVAL);
3430 
3431 	cxgb_refresh_stats(p);
3432 	parg = (uint64_t *) ((uint8_t *)&p->mac.stats + arg2);
3433 
3434 	return (sysctl_handle_64(oidp, parg, 0, req));
3435 }
3436 
3437 void
3438 t3_add_configured_sysctls(adapter_t *sc)
3439 {
3440 	struct sysctl_ctx_list *ctx;
3441 	struct sysctl_oid_list *children;
3442 	int i, j;
3443 
3444 	ctx = device_get_sysctl_ctx(sc->dev);
3445 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3446 
3447 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
3448 	    "intr_coal",
3449 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, sc,
3450 	    0, t3_set_coalesce_usecs,
3451 	    "I", "interrupt coalescing timer (us)");
3452 
3453 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
3454 	    "pkt_timestamp",
3455 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, sc,
3456 	    0, t3_pkt_timestamp,
3457 	    "I", "provide packet timestamp instead of connection hash");
3458 
3459 	for (i = 0; i < sc->params.nports; i++) {
3460 		struct port_info *pi = &sc->port[i];
3461 		struct sysctl_oid *poid;
3462 		struct sysctl_oid_list *poidlist;
3463 		struct mac_stats *mstats = &pi->mac.stats;
3464 
3465 		snprintf(pi->namebuf, PORT_NAME_LEN, "port%d", i);
3466 		poid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO,
3467 		    pi->namebuf, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
3468 		    "port statistics");
3469 		poidlist = SYSCTL_CHILDREN(poid);
3470 		SYSCTL_ADD_UINT(ctx, poidlist, OID_AUTO,
3471 		    "nqsets", CTLFLAG_RD, &pi->nqsets,
3472 		    0, "#queue sets");
3473 
3474 		for (j = 0; j < pi->nqsets; j++) {
3475 			struct sge_qset *qs = &sc->sge.qs[pi->first_qset + j];
3476 			struct sysctl_oid *qspoid, *rspqpoid, *txqpoid,
3477 					  *ctrlqpoid, *lropoid;
3478 			struct sysctl_oid_list *qspoidlist, *rspqpoidlist,
3479 					       *txqpoidlist, *ctrlqpoidlist,
3480 					       *lropoidlist;
3481 			struct sge_txq *txq = &qs->txq[TXQ_ETH];
3482 
3483 			snprintf(qs->namebuf, QS_NAME_LEN, "qs%d", j);
3484 
3485 			qspoid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO,
3486 			    qs->namebuf, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
3487 			    "qset statistics");
3488 			qspoidlist = SYSCTL_CHILDREN(qspoid);
3489 
3490 			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl0_empty",
3491 					CTLFLAG_RD, &qs->fl[0].empty, 0,
3492 					"freelist #0 empty");
3493 			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl1_empty",
3494 					CTLFLAG_RD, &qs->fl[1].empty, 0,
3495 					"freelist #1 empty");
3496 
3497 			rspqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3498 			    rspq_name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
3499 			    "rspq statistics");
3500 			rspqpoidlist = SYSCTL_CHILDREN(rspqpoid);
3501 
3502 			txqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3503 			    txq_names[0], CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
3504 			    "txq statistics");
3505 			txqpoidlist = SYSCTL_CHILDREN(txqpoid);
3506 
3507 			ctrlqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3508 			    txq_names[2], CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
3509 			    "ctrlq statistics");
3510 			ctrlqpoidlist = SYSCTL_CHILDREN(ctrlqpoid);
3511 
3512 			lropoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3513 			    "lro_stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
3514 			    "LRO statistics");
3515 			lropoidlist = SYSCTL_CHILDREN(lropoid);
3516 
3517 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "size",
3518 			    CTLFLAG_RD, &qs->rspq.size,
3519 			    0, "#entries in response queue");
3520 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "cidx",
3521 			    CTLFLAG_RD, &qs->rspq.cidx,
3522 			    0, "consumer index");
3523 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "credits",
3524 			    CTLFLAG_RD, &qs->rspq.credits,
3525 			    0, "#credits");
3526 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "starved",
3527 			    CTLFLAG_RD, &qs->rspq.starved,
3528 			    0, "#times starved");
3529 			SYSCTL_ADD_UAUTO(ctx, rspqpoidlist, OID_AUTO, "phys_addr",
3530 			    CTLFLAG_RD, &qs->rspq.phys_addr,
3531 	    "physical address of the queue");
3532 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_start",
3533 			    CTLFLAG_RW, &qs->rspq.rspq_dump_start,
3534 			    0, "start rspq dump entry");
3535 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_count",
3536 			    CTLFLAG_RW, &qs->rspq.rspq_dump_count,
3537 			    0, "#rspq entries to dump");
3538 			SYSCTL_ADD_PROC(ctx, rspqpoidlist, OID_AUTO, "qdump",
3539 			    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
3540 			    &qs->rspq, 0, t3_dump_rspq, "A",
3541 			    "dump of the response queue");
3542 
3543 			SYSCTL_ADD_UQUAD(ctx, txqpoidlist, OID_AUTO, "dropped",
3544 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].txq_mr->br_drops,
3545 			    "#tunneled packets dropped");
3546 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "sendqlen",
3547 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].sendq.mq_len,
3548 			    0, "#tunneled packets waiting to be sent");
3549 #if 0
3550 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_pidx",
3551 			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_prod,
3552 			    0, "#tunneled packets queue producer index");
3553 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_cidx",
3554 			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_cons,
3555 			    0, "#tunneled packets queue consumer index");
3556 #endif
3557 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "processed",
3558 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].processed,
3559 			    0, "#tunneled packets processed by the card");
3560 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "cleaned",
3561 			    CTLFLAG_RD, &txq->cleaned,
3562 			    0, "#tunneled packets cleaned");
3563 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "in_use",
3564 			    CTLFLAG_RD, &txq->in_use,
3565 			    0, "#tunneled packet slots in use");
3566 			SYSCTL_ADD_UQUAD(ctx, txqpoidlist, OID_AUTO, "frees",
3567 			    CTLFLAG_RD, &txq->txq_frees,
3568 			    "#tunneled packets freed");
3569 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "skipped",
3570 			    CTLFLAG_RD, &txq->txq_skipped,
3571 			    0, "#tunneled packet descriptors skipped");
3572 			SYSCTL_ADD_UQUAD(ctx, txqpoidlist, OID_AUTO, "coalesced",
3573 			    CTLFLAG_RD, &txq->txq_coalesced,
3574 			    "#tunneled packets coalesced");
3575 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "enqueued",
3576 			    CTLFLAG_RD, &txq->txq_enqueued,
3577 			    0, "#tunneled packets enqueued to hardware");
3578 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "stopped_flags",
3579 			    CTLFLAG_RD, &qs->txq_stopped,
3580 			    0, "bitmask of stopped tx queues");
3581 			SYSCTL_ADD_UAUTO(ctx, txqpoidlist, OID_AUTO, "phys_addr",
3582 			    CTLFLAG_RD, &txq->phys_addr,
3583 			    "physical address of the queue");
3584 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "qgen",
3585 			    CTLFLAG_RW, &qs->txq[TXQ_ETH].gen,
3586 			    0, "txq generation");
3587 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_cidx",
3588 			    CTLFLAG_RD, &txq->cidx,
3589 			    0, "hardware queue cidx");
3590 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_pidx",
3591 			    CTLFLAG_RD, &txq->pidx,
3592 			    0, "hardware queue pidx");
3593 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_start",
3594 			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_start,
3595 			    0, "txq start idx for dump");
3596 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_count",
3597 			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_count,
3598 			    0, "txq #entries to dump");
3599 			SYSCTL_ADD_PROC(ctx, txqpoidlist, OID_AUTO, "qdump",
3600 			    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
3601 			    &qs->txq[TXQ_ETH], 0, t3_dump_txq_eth, "A",
3602 			    "dump of the transmit queue");
3603 
3604 			SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_start",
3605 			    CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_start,
3606 			    0, "ctrlq start idx for dump");
3607 			SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_count",
3608 			    CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_count,
3609 			    0, "ctrlq #entries to dump");
3610 			SYSCTL_ADD_PROC(ctx, ctrlqpoidlist, OID_AUTO, "qdump",
3611 			    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
3612 			    &qs->txq[TXQ_CTRL], 0, t3_dump_txq_ctrl, "A",
3613 			    "dump of the control queue");
3614 
3615 			SYSCTL_ADD_U64(ctx, lropoidlist, OID_AUTO, "lro_queued",
3616 			    CTLFLAG_RD, &qs->lro.ctrl.lro_queued, 0, NULL);
3617 			SYSCTL_ADD_U64(ctx, lropoidlist, OID_AUTO, "lro_flushed",
3618 			    CTLFLAG_RD, &qs->lro.ctrl.lro_flushed, 0, NULL);
3619 			SYSCTL_ADD_U64(ctx, lropoidlist, OID_AUTO, "lro_bad_csum",
3620 			    CTLFLAG_RD, &qs->lro.ctrl.lro_bad_csum, 0, NULL);
3621 			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_cnt",
3622 			    CTLFLAG_RD, &qs->lro.ctrl.lro_cnt, 0, NULL);
3623 		}
3624 
3625 		/* Now add a node for mac stats. */
3626 		poid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO, "mac_stats",
3627 		    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "MAC statistics");
3628 		poidlist = SYSCTL_CHILDREN(poid);
3629 
3630 		/*
3631 		 * We (ab)use the length argument (arg2) to pass the offset of
3632 		 * the counter we are interested in.  This is only needed for
3633 		 * the quad counters, which are refreshed from the hardware so
3634 		 * that the reported value is always current:
3635 		 * sysctl_handle_macstat first updates *all* the counters from
3636 		 * the hardware and then returns the requested one.  Ideally we
3637 		 * would update only the requested counter, but
3638 		 * t3_mac_update_stats() hides the register details and we
3639 		 * don't want to dive into that here.  (An illustrative,
3640 		 * disabled sketch of such a handler follows this function.)
3641 		 */
3642 #define CXGB_SYSCTL_ADD_QUAD(a)	SYSCTL_ADD_OID(ctx, poidlist, OID_AUTO, #a, \
3643     CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_NEEDGIANT, pi, \
3644     offsetof(struct mac_stats, a), sysctl_handle_macstat, "QU", 0)
3645 		CXGB_SYSCTL_ADD_QUAD(tx_octets);
3646 		CXGB_SYSCTL_ADD_QUAD(tx_octets_bad);
3647 		CXGB_SYSCTL_ADD_QUAD(tx_frames);
3648 		CXGB_SYSCTL_ADD_QUAD(tx_mcast_frames);
3649 		CXGB_SYSCTL_ADD_QUAD(tx_bcast_frames);
3650 		CXGB_SYSCTL_ADD_QUAD(tx_pause);
3651 		CXGB_SYSCTL_ADD_QUAD(tx_deferred);
3652 		CXGB_SYSCTL_ADD_QUAD(tx_late_collisions);
3653 		CXGB_SYSCTL_ADD_QUAD(tx_total_collisions);
3654 		CXGB_SYSCTL_ADD_QUAD(tx_excess_collisions);
3655 		CXGB_SYSCTL_ADD_QUAD(tx_underrun);
3656 		CXGB_SYSCTL_ADD_QUAD(tx_len_errs);
3657 		CXGB_SYSCTL_ADD_QUAD(tx_mac_internal_errs);
3658 		CXGB_SYSCTL_ADD_QUAD(tx_excess_deferral);
3659 		CXGB_SYSCTL_ADD_QUAD(tx_fcs_errs);
3660 		CXGB_SYSCTL_ADD_QUAD(tx_frames_64);
3661 		CXGB_SYSCTL_ADD_QUAD(tx_frames_65_127);
3662 		CXGB_SYSCTL_ADD_QUAD(tx_frames_128_255);
3663 		CXGB_SYSCTL_ADD_QUAD(tx_frames_256_511);
3664 		CXGB_SYSCTL_ADD_QUAD(tx_frames_512_1023);
3665 		CXGB_SYSCTL_ADD_QUAD(tx_frames_1024_1518);
3666 		CXGB_SYSCTL_ADD_QUAD(tx_frames_1519_max);
3667 		CXGB_SYSCTL_ADD_QUAD(rx_octets);
3668 		CXGB_SYSCTL_ADD_QUAD(rx_octets_bad);
3669 		CXGB_SYSCTL_ADD_QUAD(rx_frames);
3670 		CXGB_SYSCTL_ADD_QUAD(rx_mcast_frames);
3671 		CXGB_SYSCTL_ADD_QUAD(rx_bcast_frames);
3672 		CXGB_SYSCTL_ADD_QUAD(rx_pause);
3673 		CXGB_SYSCTL_ADD_QUAD(rx_fcs_errs);
3674 		CXGB_SYSCTL_ADD_QUAD(rx_align_errs);
3675 		CXGB_SYSCTL_ADD_QUAD(rx_symbol_errs);
3676 		CXGB_SYSCTL_ADD_QUAD(rx_data_errs);
3677 		CXGB_SYSCTL_ADD_QUAD(rx_sequence_errs);
3678 		CXGB_SYSCTL_ADD_QUAD(rx_runt);
3679 		CXGB_SYSCTL_ADD_QUAD(rx_jabber);
3680 		CXGB_SYSCTL_ADD_QUAD(rx_short);
3681 		CXGB_SYSCTL_ADD_QUAD(rx_too_long);
3682 		CXGB_SYSCTL_ADD_QUAD(rx_mac_internal_errs);
3683 		CXGB_SYSCTL_ADD_QUAD(rx_cong_drops);
3684 		CXGB_SYSCTL_ADD_QUAD(rx_frames_64);
3685 		CXGB_SYSCTL_ADD_QUAD(rx_frames_65_127);
3686 		CXGB_SYSCTL_ADD_QUAD(rx_frames_128_255);
3687 		CXGB_SYSCTL_ADD_QUAD(rx_frames_256_511);
3688 		CXGB_SYSCTL_ADD_QUAD(rx_frames_512_1023);
3689 		CXGB_SYSCTL_ADD_QUAD(rx_frames_1024_1518);
3690 		CXGB_SYSCTL_ADD_QUAD(rx_frames_1519_max);
3691 #undef CXGB_SYSCTL_ADD_QUAD
3692 
3693 #define CXGB_SYSCTL_ADD_ULONG(a) SYSCTL_ADD_ULONG(ctx, poidlist, OID_AUTO, #a, \
3694     CTLFLAG_RD, &mstats->a, 0)
3695 		CXGB_SYSCTL_ADD_ULONG(tx_fifo_parity_err);
3696 		CXGB_SYSCTL_ADD_ULONG(rx_fifo_parity_err);
3697 		CXGB_SYSCTL_ADD_ULONG(tx_fifo_urun);
3698 		CXGB_SYSCTL_ADD_ULONG(rx_fifo_ovfl);
3699 		CXGB_SYSCTL_ADD_ULONG(serdes_signal_loss);
3700 		CXGB_SYSCTL_ADD_ULONG(xaui_pcs_ctc_err);
3701 		CXGB_SYSCTL_ADD_ULONG(xaui_pcs_align_change);
3702 		CXGB_SYSCTL_ADD_ULONG(num_toggled);
3703 		CXGB_SYSCTL_ADD_ULONG(num_resets);
3704 		CXGB_SYSCTL_ADD_ULONG(link_faults);
3705 #undef CXGB_SYSCTL_ADD_ULONG
3706 	}
3707 }
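
/*
 * The MAC-statistics sysctl tree above routes every per-counter read
 * through a single handler and (ab)uses arg2 as a byte offset into
 * struct mac_stats, as noted in the comment preceding
 * CXGB_SYSCTL_ADD_QUAD.  The disabled sketch below only illustrates that
 * convention: the function name, the lack of locking around the hardware
 * access, and the use of sysctl_handle_64() are assumptions made for the
 * example, not the driver's actual sysctl_handle_macstat.
 */
#if 0
static int
sysctl_macstat_offset_sketch(SYSCTL_HANDLER_ARGS)
{
	struct port_info *p = arg1;	/* the pi passed by the macro */
	uint64_t val;

	if (p == NULL)
		return (EINVAL);

	/*
	 * Refresh *all* counters from the MAC, then read the one whose
	 * offsetof() value arrived in arg2.  The quad counters are 64 bits
	 * wide and 64-bit aligned within struct mac_stats.
	 */
	t3_mac_update_stats(&p->mac);
	val = *(const uint64_t *)((const char *)&p->mac.stats + arg2);

	/* Read-only node, so hand the snapshot to the generic handler. */
	return (sysctl_handle_64(oidp, &val, 0, req));
}
#endif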
3708 
3709 /**
3710  *	t3_get_desc - dump an SGE descriptor for debugging purposes
3711  *	@qs: the queue set
3712  *	@qnum: identifies the specific queue (0..2: Tx, 3: response, 4..5: Rx)
3713  *	@idx: the descriptor index in the queue
3714  *	@data: where to dump the descriptor contents
3715  *
3716  *	Dumps the contents of a HW descriptor of an SGE queue.  Returns the
3717  *	size of the descriptor, or EINVAL if the queue number or index is invalid.
3718  */
3719 int
3720 t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx,
3721 		unsigned char *data)
3722 {
3723 	if (qnum >= 6)
3724 		return (EINVAL);
3725 
3726 	if (qnum < 3) {
3727 		if (!qs->txq[qnum].desc || idx >= qs->txq[qnum].size)
3728 			return (EINVAL);
3729 		memcpy(data, &qs->txq[qnum].desc[idx], sizeof(struct tx_desc));
3730 		return (sizeof(struct tx_desc));
3731 	}
3732 
3733 	if (qnum == 3) {
3734 		if (!qs->rspq.desc || idx >= qs->rspq.size)
3735 			return (EINVAL);
3736 		memcpy(data, &qs->rspq.desc[idx], sizeof(struct rsp_desc));
3737 		return (sizeof(struct rsp_desc));
3738 	}
3739 
3740 	qnum -= 4;
3741 	if (!qs->fl[qnum].desc || idx >= qs->fl[qnum].size)
3742 		return (EINVAL);
3743 	memcpy(data, &qs->fl[qnum].desc[idx], sizeof(struct rx_desc));
3744 	return (sizeof(struct rx_desc));
3745 }
3746
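
/*
 * Disabled usage sketch for t3_get_desc() above, e.g. from an ad-hoc
 * debugging path: copy one descriptor out and hex-dump it.  The function
 * name, the buffer sizing, and the output format are illustrative
 * assumptions and not part of the driver.
 */
#if 0
static void
dump_one_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx)
{
	/* Large enough for any of the three descriptor types. */
	unsigned char buf[MAX(sizeof(struct tx_desc),
	    MAX(sizeof(struct rsp_desc), sizeof(struct rx_desc)))];
	int i, len;

	len = t3_get_desc(qs, qnum, idx, buf);
	if (len == EINVAL)	/* invalid queue number or descriptor index */
		return;

	printf("desc %u/%u:", qnum, idx);
	for (i = 0; i < len; i++)
		printf("%s%02x", (i % 16) ? " " : "\n\t", buf[i]);
	printf("\n");
}
#endif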