xref: /freebsd/sys/dev/cxgb/cxgb_sge.c (revision aa1a8ff2d6dbc51ef058f46f3db5a8bb77967145)
1 /**************************************************************************
2 SPDX-License-Identifier: BSD-2-Clause
3 
4 Copyright (c) 2007-2009, Chelsio Inc.
5 All rights reserved.
6 
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions are met:
9 
10  1. Redistributions of source code must retain the above copyright notice,
11     this list of conditions and the following disclaimer.
12 
13  2. Neither the name of the Chelsio Corporation nor the names of its
14     contributors may be used to endorse or promote products derived from
15     this software without specific prior written permission.
16 
17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 POSSIBILITY OF SUCH DAMAGE.
28 
29 ***************************************************************************/
30 
31 #include <sys/cdefs.h>
32 #include "opt_inet6.h"
33 #include "opt_inet.h"
34 
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/kernel.h>
38 #include <sys/module.h>
39 #include <sys/bus.h>
40 #include <sys/conf.h>
41 #include <machine/bus.h>
42 #include <machine/resource.h>
43 #include <sys/rman.h>
44 #include <sys/queue.h>
45 #include <sys/sysctl.h>
46 #include <sys/taskqueue.h>
47 
48 #include <sys/proc.h>
49 #include <sys/sbuf.h>
50 #include <sys/sched.h>
51 #include <sys/smp.h>
52 #include <sys/systm.h>
53 #include <sys/syslog.h>
54 #include <sys/socket.h>
55 #include <sys/sglist.h>
56 
57 #include <net/if.h>
58 #include <net/if_var.h>
59 #include <net/bpf.h>
60 #include <net/ethernet.h>
61 #include <net/if_vlan_var.h>
62 
63 #include <netinet/in_systm.h>
64 #include <netinet/in.h>
65 #include <netinet/ip.h>
66 #include <netinet/ip6.h>
67 #include <netinet/tcp.h>
68 
69 #include <dev/pci/pcireg.h>
70 #include <dev/pci/pcivar.h>
71 
72 #include <vm/vm.h>
73 #include <vm/pmap.h>
74 
75 #include <cxgb_include.h>
76 #include <sys/mvec.h>
77 
78 int	txq_fills = 0;
79 int	multiq_tx_enable = 1;
80 
81 #ifdef TCP_OFFLOAD
82 CTASSERT(NUM_CPL_HANDLERS >= NUM_CPL_CMDS);
83 #endif
84 
85 extern struct sysctl_oid_list sysctl__hw_cxgb_children;
86 int cxgb_txq_buf_ring_size = TX_ETH_Q_SIZE;
87 SYSCTL_INT(_hw_cxgb, OID_AUTO, txq_mr_size, CTLFLAG_RDTUN, &cxgb_txq_buf_ring_size, 0,
88     "size of per-queue mbuf ring");
89 
90 static int cxgb_tx_coalesce_force = 0;
91 SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_force, CTLFLAG_RWTUN,
92     &cxgb_tx_coalesce_force, 0,
93     "coalesce small packets into a single work request regardless of ring state");
94 
/*
 * Thresholds expressed as fractions of the Ethernet Tx queue size.  They are
 * compared against Tx descriptor counts (txq->in_use, desc_reclaimable()).
 * Every expansion is fully parenthesized so the macros stay correct when
 * they appear inside a larger expression (e.g. "X + 1" or "a * X").
 */
#define	COALESCE_START_DEFAULT		(TX_ETH_Q_SIZE >> 1)
#define	COALESCE_START_MAX		(TX_ETH_Q_SIZE - (TX_ETH_Q_SIZE >> 3))
#define	COALESCE_STOP_DEFAULT		(TX_ETH_Q_SIZE >> 2)
#define	COALESCE_STOP_MIN		(TX_ETH_Q_SIZE >> 5)
#define	TX_RECLAIM_DEFAULT		(TX_ETH_Q_SIZE >> 5)
#define	TX_RECLAIM_MAX			(TX_ETH_Q_SIZE >> 2)
#define	TX_RECLAIM_MIN			(TX_ETH_Q_SIZE >> 6)
102 
103 
104 static int cxgb_tx_coalesce_enable_start = COALESCE_START_DEFAULT;
105 SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_start, CTLFLAG_RWTUN,
106     &cxgb_tx_coalesce_enable_start, 0,
107     "coalesce enable threshold");
108 static int cxgb_tx_coalesce_enable_stop = COALESCE_STOP_DEFAULT;
109 SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_stop, CTLFLAG_RWTUN,
110     &cxgb_tx_coalesce_enable_stop, 0,
111     "coalesce disable threshold");
112 static int cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT;
113 SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_reclaim_threshold, CTLFLAG_RWTUN,
114     &cxgb_tx_reclaim_threshold, 0,
115     "tx cleaning minimum threshold");
116 
/*
 * XXX Rx buffer recycling stays disabled: do not turn it back on until
 * TOE stops assuming that an m_ext is present.
 */
static int recycle_enable = 0;

/* Settings and mbuf-cluster limits declared here, defined elsewhere. */
extern int cxgb_use_16k_clusters;
extern int nmbjumbop;
extern int nmbjumbo9;
extern int nmbjumbo16;
127 
#define USE_GTS 0

/* Rx sizing constants. */
#define SGE_RX_SM_BUF_SIZE	1536
#define SGE_RX_DROP_THRES	16
#define SGE_RX_COPY_THRES	128

/*
 * How often the Tx buffer reclaim timer fires: hz >> 1 ticks.  A slow timer
 * is enough because Tx buffers are normally reclaimed when new packets are
 * transmitted.
 */
#define TX_RECLAIM_PERIOD       (hz >> 1)
139 
/*
 * Bit flags stored in sge_txq.flags.
 */
enum {
	/* fetch engine is running */
	TXQ_RUNNING	= 0x01,
	/* last packet rang the doorbell */
	TXQ_LAST_PKT_DB = 0x02,
};
147 
148 struct tx_desc {
149 	uint64_t	flit[TX_DESC_FLITS];
150 } __packed;
151 
152 struct rx_desc {
153 	uint32_t	addr_lo;
154 	uint32_t	len_gen;
155 	uint32_t	gen2;
156 	uint32_t	addr_hi;
157 } __packed;
158 
159 struct rsp_desc {               /* response queue descriptor */
160 	struct rss_header	rss_hdr;
161 	uint32_t		flags;
162 	uint32_t		len_cq;
163 	uint8_t			imm_data[47];
164 	uint8_t			intr_gen;
165 } __packed;
166 
/* Bits kept in the "flags" member of rx_sw_desc / tx_sw_desc. */
#define RX_SW_DESC_MAP_CREATED	(1 << 0)	/* DMA map has been created */
#define TX_SW_DESC_MAP_CREATED	(1 << 1)
#define RX_SW_DESC_INUSE        (1 << 3)	/* slot currently holds a buffer */
#define TX_SW_DESC_MAPPED       (1 << 4)

/* The four SOP/EOP combinations, extracted with G_RSPD_SOP_EOP(). */
#define RSPQ_NSOP_NEOP           G_RSPD_SOP_EOP(0)
#define RSPQ_EOP                 G_RSPD_SOP_EOP(F_RSPD_EOP)
#define RSPQ_SOP                 G_RSPD_SOP_EOP(F_RSPD_SOP)
#define RSPQ_SOP_EOP             G_RSPD_SOP_EOP(F_RSPD_SOP|F_RSPD_EOP)
176 
177 struct tx_sw_desc {                /* SW state per Tx descriptor */
178 	struct mbuf	*m;
179 	bus_dmamap_t	map;
180 	int		flags;
181 };
182 
183 struct rx_sw_desc {                /* SW state per Rx descriptor */
184 	caddr_t		rxsd_cl;
185 	struct mbuf	*m;
186 	bus_dmamap_t	map;
187 	int		flags;
188 };
189 
190 struct txq_state {
191 	unsigned int	compl;
192 	unsigned int	gen;
193 	unsigned int	pidx;
194 };
195 
196 struct refill_fl_cb_arg {
197 	int               error;
198 	bus_dma_segment_t seg;
199 	int               nseg;
200 };
201 
202 
203 /*
204  * Maps a number of flits to the number of Tx descriptors that can hold them.
205  * The formula is
206  *
207  * desc = 1 + (flits - 2) / (WR_FLITS - 1).
208  *
209  * HW allows up to 4 descriptors to be combined into a WR.
210  */
211 static uint8_t flit_desc_map[] = {
212 	0,
213 #if SGE_NUM_GENBITS == 1
214 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
215 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
216 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
217 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
218 #elif SGE_NUM_GENBITS == 2
219 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
220 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
221 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
222 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
223 #else
224 # error "SGE_NUM_GENBITS must be 1 or 2"
225 #endif
226 };
227 
/* Queue-set lock helpers; all of them operate on (qs)->lock. */
#define	TXQ_LOCK_ASSERT(qs)	mtx_assert(&(qs)->lock, MA_OWNED)
#define	TXQ_TRYLOCK(qs)		mtx_trylock(&(qs)->lock)
#define	TXQ_LOCK(qs)		mtx_lock(&(qs)->lock)
#define	TXQ_UNLOCK(qs)		mtx_unlock(&(qs)->lock)

/* drbr_*() wrappers bound to the Ethernet Tx queue's mbuf ring (txq_mr). */
#define	TXQ_RING_EMPTY(qs)						\
	drbr_empty((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
#define	TXQ_RING_NEEDS_ENQUEUE(qs)					\
	drbr_needs_enqueue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
#define	TXQ_RING_FLUSH(qs)						\
	drbr_flush((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
#define	TXQ_RING_DEQUEUE_COND(qs, func, arg)				\
	drbr_dequeue_cond((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr, func, arg)
#define	TXQ_RING_DEQUEUE(qs)						\
	drbr_dequeue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
240 
int cxgb_debug = 0;

/* Forward declarations for routines defined later in this file. */
static void sge_timer_cb(void *arg);
static void sge_timer_reclaim(void *arg, int ncount);
static void sge_txq_reclaim_handler(void *arg, int ncount);
static void cxgb_start_locked(struct sge_qset *qs);
247 
248 /*
249  * XXX need to cope with bursty scheduling by looking at a wider
250  * window than we are now for determining the need for coalescing
251  *
252  */
253 static __inline uint64_t
254 check_pkt_coalesce(struct sge_qset *qs)
255 {
256         struct adapter *sc;
257         struct sge_txq *txq;
258 	uint8_t *fill;
259 
260 	if (__predict_false(cxgb_tx_coalesce_force))
261 		return (1);
262 	txq = &qs->txq[TXQ_ETH];
263         sc = qs->port->adapter;
264 	fill = &sc->tunq_fill[qs->idx];
265 
266 	if (cxgb_tx_coalesce_enable_start > COALESCE_START_MAX)
267 		cxgb_tx_coalesce_enable_start = COALESCE_START_MAX;
268 	if (cxgb_tx_coalesce_enable_stop < COALESCE_STOP_MIN)
269 		cxgb_tx_coalesce_enable_start = COALESCE_STOP_MIN;
270 	/*
271 	 * if the hardware transmit queue is more than 1/8 full
272 	 * we mark it as coalescing - we drop back from coalescing
273 	 * when we go below 1/32 full and there are no packets enqueued,
274 	 * this provides us with some degree of hysteresis
275 	 */
276         if (*fill != 0 && (txq->in_use <= cxgb_tx_coalesce_enable_stop) &&
277 	    TXQ_RING_EMPTY(qs) && (qs->coalescing == 0))
278                 *fill = 0;
279         else if (*fill == 0 && (txq->in_use >= cxgb_tx_coalesce_enable_start))
280                 *fill = 1;
281 
282 	return (sc->tunq_coalesce);
283 }
284 
285 #ifdef __LP64__
286 static void
287 set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo)
288 {
289 	uint64_t wr_hilo;
290 #if _BYTE_ORDER == _LITTLE_ENDIAN
291 	wr_hilo = wr_hi;
292 	wr_hilo |= (((uint64_t)wr_lo)<<32);
293 #else
294 	wr_hilo = wr_lo;
295 	wr_hilo |= (((uint64_t)wr_hi)<<32);
296 #endif
297 	wrp->wrh_hilo = wr_hilo;
298 }
299 #else
300 static void
301 set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo)
302 {
303 
304 	wrp->wrh_hi = wr_hi;
305 	wmb();
306 	wrp->wrh_lo = wr_lo;
307 }
308 #endif
309 
/* Running totals shared between cxgb_dequeue() and coalesce_check(). */
struct coalesce_info {
	int count;	/* packets accepted so far */
	int nbytes;	/* sum of m_len of the accepted packets */
	int noncoal;	/* set once a packet unsuitable for coalescing is seen */
};
315 
316 static int
317 coalesce_check(struct mbuf *m, void *arg)
318 {
319 	struct coalesce_info *ci = arg;
320 
321 	if ((m->m_next != NULL) ||
322 	    ((mtod(m, vm_offset_t) & PAGE_MASK) + m->m_len > PAGE_SIZE))
323 		ci->noncoal = 1;
324 
325 	if ((ci->count == 0) || (ci->noncoal == 0 && (ci->count < 7) &&
326 	    (ci->nbytes + m->m_len <= 10500))) {
327 		ci->count++;
328 		ci->nbytes += m->m_len;
329 		return (1);
330 	}
331 	return (0);
332 }
333 
334 static struct mbuf *
335 cxgb_dequeue(struct sge_qset *qs)
336 {
337 	struct mbuf *m, *m_head, *m_tail;
338 	struct coalesce_info ci;
339 
340 
341 	if (check_pkt_coalesce(qs) == 0)
342 		return TXQ_RING_DEQUEUE(qs);
343 
344 	m_head = m_tail = NULL;
345 	ci.count = ci.nbytes = ci.noncoal = 0;
346 	do {
347 		m = TXQ_RING_DEQUEUE_COND(qs, coalesce_check, &ci);
348 		if (m_head == NULL) {
349 			m_tail = m_head = m;
350 		} else if (m != NULL) {
351 			m_tail->m_nextpkt = m;
352 			m_tail = m;
353 		}
354 	} while (m != NULL);
355 	if (ci.count > 7)
356 		panic("trying to coalesce %d packets in to one WR", ci.count);
357 	return (m_head);
358 }
359 
360 /**
361  *	reclaim_completed_tx - reclaims completed Tx descriptors
362  *	@adapter: the adapter
363  *	@q: the Tx queue to reclaim completed descriptors from
364  *
365  *	Reclaims Tx descriptors that the SGE has indicated it has processed,
366  *	and frees the associated buffers if possible.  Called with the Tx
367  *	queue's lock held.
368  */
369 static __inline int
370 reclaim_completed_tx(struct sge_qset *qs, int reclaim_min, int queue)
371 {
372 	struct sge_txq *q = &qs->txq[queue];
373 	int reclaim = desc_reclaimable(q);
374 
375 	if ((cxgb_tx_reclaim_threshold > TX_RECLAIM_MAX) ||
376 	    (cxgb_tx_reclaim_threshold < TX_RECLAIM_MIN))
377 		cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT;
378 
379 	if (reclaim < reclaim_min)
380 		return (0);
381 
382 	mtx_assert(&qs->lock, MA_OWNED);
383 	if (reclaim > 0) {
384 		t3_free_tx_desc(qs, reclaim, queue);
385 		q->cleaned += reclaim;
386 		q->in_use -= reclaim;
387 	}
388 	if (isset(&qs->txq_stopped, TXQ_ETH))
389                 clrbit(&qs->txq_stopped, TXQ_ETH);
390 
391 	return (reclaim);
392 }
393 
#ifdef DEBUGNET
/*
 * Debugnet Tx poll hook: reclaim completed descriptors on the Ethernet Tx
 * queue of @qs, using TX_RECLAIM_MAX as the minimum batch, and return what
 * reclaim_completed_tx() reports.
 */
int
cxgb_debugnet_poll_tx(struct sge_qset *qs)
{
	int reclaimed;

	reclaimed = reclaim_completed_tx(qs, TX_RECLAIM_MAX, TXQ_ETH);
	return (reclaimed);
}
#endif
402 
403 /**
404  *	should_restart_tx - are there enough resources to restart a Tx queue?
405  *	@q: the Tx queue
406  *
407  *	Checks if there are enough descriptors to restart a suspended Tx queue.
408  */
409 static __inline int
410 should_restart_tx(const struct sge_txq *q)
411 {
412 	unsigned int r = q->processed - q->cleaned;
413 
414 	return q->in_use - r < (q->size >> 1);
415 }
416 
417 /**
418  *	t3_sge_init - initialize SGE
419  *	@adap: the adapter
420  *	@p: the SGE parameters
421  *
422  *	Performs SGE initialization needed every time after a chip reset.
423  *	We do not initialize any of the queue sets here, instead the driver
424  *	top-level must request those individually.  We also do not enable DMA
425  *	here, that should be done after the queues have been set up.
426  */
427 void
428 t3_sge_init(adapter_t *adap, struct sge_params *p)
429 {
430 	u_int ctrl, ups;
431 
432 	ups = 0; /* = ffs(pci_resource_len(adap->pdev, 2) >> 12); */
433 
434 	ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL |
435 	       F_CQCRDTCTRL | F_CONGMODE | F_TNLFLMODE | F_FATLPERREN |
436 	       V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS |
437 	       V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING;
438 #if SGE_NUM_GENBITS == 1
439 	ctrl |= F_EGRGENCTRL;
440 #endif
441 	if (adap->params.rev > 0) {
442 		if (!(adap->flags & (USING_MSIX | USING_MSI)))
443 			ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ;
444 	}
445 	t3_write_reg(adap, A_SG_CONTROL, ctrl);
446 	t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) |
447 		     V_LORCQDRBTHRSH(512));
448 	t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10);
449 	t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) |
450 		     V_TIMEOUT(200 * core_ticks_per_usec(adap)));
451 	t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH,
452 		     adap->params.rev < T3_REV_C ? 1000 : 500);
453 	t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256);
454 	t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000);
455 	t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256);
456 	t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff));
457 	t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024);
458 }
459 
460 
/**
 *	sgl_len - calculates the size of an SGL of the given capacity
 *	@n: the number of SGL entries
 *
 *	Returns the flit count of a scatter/gather list with @n entries:
 *	three flits for every pair of entries, plus two flits for an odd
 *	trailing entry.
 */
static __inline unsigned int
sgl_len(unsigned int n)
{
	unsigned int flits = (3 * n) / 2;

	if (n & 1)
		flits++;
	return (flits);
}
473 
474 /**
475  *	get_imm_packet - return the next ingress packet buffer from a response
476  *	@resp: the response descriptor containing the packet data
477  *
478  *	Return a packet containing the immediate data of the given response.
479  */
480 static int
481 get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct mbuf *m)
482 {
483 
484 	if (resp->rss_hdr.opcode == CPL_RX_DATA) {
485 		const struct cpl_rx_data *cpl = (const void *)&resp->imm_data[0];
486 		m->m_len = sizeof(*cpl) + ntohs(cpl->len);
487 	} else if (resp->rss_hdr.opcode == CPL_RX_PKT) {
488 		const struct cpl_rx_pkt *cpl = (const void *)&resp->imm_data[0];
489 		m->m_len = sizeof(*cpl) + ntohs(cpl->len);
490 	} else
491 		m->m_len = IMMED_PKT_SIZE;
492 	m->m_ext.ext_buf = NULL;
493 	m->m_ext.ext_type = 0;
494 	memcpy(mtod(m, uint8_t *), resp->imm_data, m->m_len);
495 	return (0);
496 }
497 
498 static __inline u_int
499 flits_to_desc(u_int n)
500 {
501 	return (flit_desc_map[n]);
502 }
503 
/* A_SG_INT_CAUSE bits grouped by kind; used by t3_sge_err_intr_handler(). */
#define SGE_PARERR							\
	(F_CPPARITYERROR | F_OCPARITYERROR | F_RCPARITYERROR |		\
	 F_IRPARITYERROR | V_ITPARITYERROR(M_ITPARITYERROR) |		\
	 V_FLPARITYERROR(M_FLPARITYERROR) | F_LODRBPARITYERROR |	\
	 F_HIDRBPARITYERROR | F_LORCQPARITYERROR |			\
	 F_HIRCQPARITYERROR)
#define SGE_FRAMINGERR (F_UC_REQ_FRAMINGERROR | F_R_REQ_FRAMINGERROR)
/* Any of these causes makes the handler call t3_fatal_err(). */
#define SGE_FATALERR							\
	(SGE_PARERR | SGE_FRAMINGERR | F_RSPQCREDITOVERFOW |		\
	 F_RSPQDISABLED)
512 
513 /**
514  *	t3_sge_err_intr_handler - SGE async event interrupt handler
515  *	@adapter: the adapter
516  *
517  *	Interrupt handler for SGE asynchronous (non-data) events.
518  */
519 void
520 t3_sge_err_intr_handler(adapter_t *adapter)
521 {
522 	unsigned int v, status;
523 
524 	status = t3_read_reg(adapter, A_SG_INT_CAUSE);
525 	if (status & SGE_PARERR)
526 		CH_ALERT(adapter, "SGE parity error (0x%x)\n",
527 			 status & SGE_PARERR);
528 	if (status & SGE_FRAMINGERR)
529 		CH_ALERT(adapter, "SGE framing error (0x%x)\n",
530 			 status & SGE_FRAMINGERR);
531 	if (status & F_RSPQCREDITOVERFOW)
532 		CH_ALERT(adapter, "SGE response queue credit overflow\n");
533 
534 	if (status & F_RSPQDISABLED) {
535 		v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS);
536 
537 		CH_ALERT(adapter,
538 			 "packet delivered to disabled response queue (0x%x)\n",
539 			 (v >> S_RSPQ0DISABLED) & 0xff);
540 	}
541 
542 	t3_write_reg(adapter, A_SG_INT_CAUSE, status);
543 	if (status & SGE_FATALERR)
544 		t3_fatal_err(adapter);
545 }
546 
547 void
548 t3_sge_prep(adapter_t *adap, struct sge_params *p)
549 {
550 	int i, nqsets, fl_q_size, jumbo_q_size, use_16k, jumbo_buf_size;
551 
552 	nqsets = min(SGE_QSETS / adap->params.nports, mp_ncpus);
553 	nqsets *= adap->params.nports;
554 
555 	fl_q_size = min(nmbclusters/(3*nqsets), FL_Q_SIZE);
556 
557 	while (!powerof2(fl_q_size))
558 		fl_q_size--;
559 
560 	use_16k = cxgb_use_16k_clusters != -1 ? cxgb_use_16k_clusters :
561 	    is_offload(adap);
562 
563 	if (use_16k) {
564 		jumbo_q_size = min(nmbjumbo16/(3*nqsets), JUMBO_Q_SIZE);
565 		jumbo_buf_size = MJUM16BYTES;
566 	} else {
567 		jumbo_q_size = min(nmbjumbo9/(3*nqsets), JUMBO_Q_SIZE);
568 		jumbo_buf_size = MJUM9BYTES;
569 	}
570 	while (!powerof2(jumbo_q_size))
571 		jumbo_q_size--;
572 
573 	if (fl_q_size < (FL_Q_SIZE / 4) || jumbo_q_size < (JUMBO_Q_SIZE / 2))
574 		device_printf(adap->dev,
575 		    "Insufficient clusters and/or jumbo buffers.\n");
576 
577 	p->max_pkt_size = jumbo_buf_size - sizeof(struct cpl_rx_data);
578 
579 	for (i = 0; i < SGE_QSETS; ++i) {
580 		struct qset_params *q = p->qset + i;
581 
582 		if (adap->params.nports > 2) {
583 			q->coalesce_usecs = 50;
584 		} else {
585 #ifdef INVARIANTS
586 			q->coalesce_usecs = 10;
587 #else
588 			q->coalesce_usecs = 5;
589 #endif
590 		}
591 		q->polling = 0;
592 		q->rspq_size = RSPQ_Q_SIZE;
593 		q->fl_size = fl_q_size;
594 		q->jumbo_size = jumbo_q_size;
595 		q->jumbo_buf_size = jumbo_buf_size;
596 		q->txq_size[TXQ_ETH] = TX_ETH_Q_SIZE;
597 		q->txq_size[TXQ_OFLD] = is_offload(adap) ? TX_OFLD_Q_SIZE : 16;
598 		q->txq_size[TXQ_CTRL] = TX_CTRL_Q_SIZE;
599 		q->cong_thres = 0;
600 	}
601 }
602 
603 int
604 t3_sge_alloc(adapter_t *sc)
605 {
606 
607 	/* The parent tag. */
608 	if (bus_dma_tag_create( bus_get_dma_tag(sc->dev),/* PCI parent */
609 				1, 0,			/* algnmnt, boundary */
610 				BUS_SPACE_MAXADDR,	/* lowaddr */
611 				BUS_SPACE_MAXADDR,	/* highaddr */
612 				NULL, NULL,		/* filter, filterarg */
613 				BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
614 				BUS_SPACE_UNRESTRICTED, /* nsegments */
615 				BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
616 				0,			/* flags */
617 				NULL, NULL,		/* lock, lockarg */
618 				&sc->parent_dmat)) {
619 		device_printf(sc->dev, "Cannot allocate parent DMA tag\n");
620 		return (ENOMEM);
621 	}
622 
623 	/*
624 	 * DMA tag for normal sized RX frames
625 	 */
626 	if (bus_dma_tag_create(sc->parent_dmat, MCLBYTES, 0, BUS_SPACE_MAXADDR,
627 		BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, 1,
628 		MCLBYTES, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_dmat)) {
629 		device_printf(sc->dev, "Cannot allocate RX DMA tag\n");
630 		return (ENOMEM);
631 	}
632 
633 	/*
634 	 * DMA tag for jumbo sized RX frames.
635 	 */
636 	if (bus_dma_tag_create(sc->parent_dmat, MJUM16BYTES, 0, BUS_SPACE_MAXADDR,
637 		BUS_SPACE_MAXADDR, NULL, NULL, MJUM16BYTES, 1, MJUM16BYTES,
638 		BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_jumbo_dmat)) {
639 		device_printf(sc->dev, "Cannot allocate RX jumbo DMA tag\n");
640 		return (ENOMEM);
641 	}
642 
643 	/*
644 	 * DMA tag for TX frames.
645 	 */
646 	if (bus_dma_tag_create(sc->parent_dmat, 1, 0, BUS_SPACE_MAXADDR,
647 		BUS_SPACE_MAXADDR, NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
648 		TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
649 		NULL, NULL, &sc->tx_dmat)) {
650 		device_printf(sc->dev, "Cannot allocate TX DMA tag\n");
651 		return (ENOMEM);
652 	}
653 
654 	return (0);
655 }
656 
657 int
658 t3_sge_free(struct adapter * sc)
659 {
660 
661 	if (sc->tx_dmat != NULL)
662 		bus_dma_tag_destroy(sc->tx_dmat);
663 
664 	if (sc->rx_jumbo_dmat != NULL)
665 		bus_dma_tag_destroy(sc->rx_jumbo_dmat);
666 
667 	if (sc->rx_dmat != NULL)
668 		bus_dma_tag_destroy(sc->rx_dmat);
669 
670 	if (sc->parent_dmat != NULL)
671 		bus_dma_tag_destroy(sc->parent_dmat);
672 
673 	return (0);
674 }
675 
676 void
677 t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p)
678 {
679 
680 	qs->rspq.holdoff_tmr = max(p->coalesce_usecs * 10, 1U);
681 	qs->rspq.polling = 0 /* p->polling */;
682 }
683 
#if !defined(__i386__) && !defined(__amd64__)
/*
 * bus_dmamap_load() callback used by refill_fl(): records the error code,
 * the segment count and the first segment in the caller's
 * struct refill_fl_cb_arg.
 */
static void
refill_fl_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
{
	struct refill_fl_cb_arg *result = arg;

	result->error = error;
	result->nseg = nseg;
	result->seg = segs[0];
}
#endif
696 /**
697  *	refill_fl - refill an SGE free-buffer list
698  *	@sc: the controller softc
699  *	@q: the free-list to refill
700  *	@n: the number of new buffers to allocate
701  *
702  *	(Re)populate an SGE free-buffer list with up to @n new packet buffers.
703  *	The caller must assure that @n does not exceed the queue's capacity.
704  */
705 static void
706 refill_fl(adapter_t *sc, struct sge_fl *q, int n)
707 {
708 	struct rx_sw_desc *sd = &q->sdesc[q->pidx];
709 	struct rx_desc *d = &q->desc[q->pidx];
710 	struct refill_fl_cb_arg cb_arg;
711 	struct mbuf *m;
712 	caddr_t cl;
713 	int err;
714 
715 	cb_arg.error = 0;
716 	while (n--) {
717 		/*
718 		 * We allocate an uninitialized mbuf + cluster, mbuf is
719 		 * initialized after rx.
720 		 */
721 		if (q->zone == zone_pack) {
722 			if ((m = m_getcl(M_NOWAIT, MT_NOINIT, M_PKTHDR)) == NULL)
723 				break;
724 			cl = m->m_ext.ext_buf;
725 		} else {
726 			if ((cl = m_cljget(NULL, M_NOWAIT, q->buf_size)) == NULL)
727 				break;
728 			if ((m = m_gethdr_raw(M_NOWAIT, 0)) == NULL) {
729 				uma_zfree(q->zone, cl);
730 				break;
731 			}
732 		}
733 		if ((sd->flags & RX_SW_DESC_MAP_CREATED) == 0) {
734 			if ((err = bus_dmamap_create(q->entry_tag, 0, &sd->map))) {
735 				log(LOG_WARNING, "bus_dmamap_create failed %d\n", err);
736 				uma_zfree(q->zone, cl);
737 				goto done;
738 			}
739 			sd->flags |= RX_SW_DESC_MAP_CREATED;
740 		}
741 #if !defined(__i386__) && !defined(__amd64__)
742 		err = bus_dmamap_load(q->entry_tag, sd->map,
743 		    cl, q->buf_size, refill_fl_cb, &cb_arg, 0);
744 
745 		if (err != 0 || cb_arg.error) {
746 			if (q->zone != zone_pack)
747 				uma_zfree(q->zone, cl);
748 			m_free(m);
749 			goto done;
750 		}
751 #else
752 		cb_arg.seg.ds_addr = pmap_kextract((vm_offset_t)cl);
753 #endif
754 		sd->flags |= RX_SW_DESC_INUSE;
755 		sd->rxsd_cl = cl;
756 		sd->m = m;
757 		d->addr_lo = htobe32(cb_arg.seg.ds_addr & 0xffffffff);
758 		d->addr_hi = htobe32(((uint64_t)cb_arg.seg.ds_addr >>32) & 0xffffffff);
759 		d->len_gen = htobe32(V_FLD_GEN1(q->gen));
760 		d->gen2 = htobe32(V_FLD_GEN2(q->gen));
761 
762 		d++;
763 		sd++;
764 
765 		if (++q->pidx == q->size) {
766 			q->pidx = 0;
767 			q->gen ^= 1;
768 			sd = q->sdesc;
769 			d = q->desc;
770 		}
771 		q->credits++;
772 		q->db_pending++;
773 	}
774 
775 done:
776 	if (q->db_pending >= 32) {
777 		q->db_pending = 0;
778 		t3_write_reg(sc, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
779 	}
780 }
781 
782 
783 /**
784  *	free_rx_bufs - free the Rx buffers on an SGE free list
785  *	@sc: the controle softc
786  *	@q: the SGE free list to clean up
787  *
788  *	Release the buffers on an SGE free-buffer Rx queue.  HW fetching from
789  *	this queue should be stopped before calling this function.
790  */
791 static void
792 free_rx_bufs(adapter_t *sc, struct sge_fl *q)
793 {
794 	u_int cidx = q->cidx;
795 
796 	while (q->credits--) {
797 		struct rx_sw_desc *d = &q->sdesc[cidx];
798 
799 		if (d->flags & RX_SW_DESC_INUSE) {
800 			bus_dmamap_unload(q->entry_tag, d->map);
801 			bus_dmamap_destroy(q->entry_tag, d->map);
802 			if (q->zone == zone_pack) {
803 				m_init(d->m, M_NOWAIT, MT_DATA, M_EXT);
804 				uma_zfree(zone_pack, d->m);
805 			} else {
806 				m_init(d->m, M_NOWAIT, MT_DATA, 0);
807 				m_free_raw(d->m);
808 				uma_zfree(q->zone, d->rxsd_cl);
809 			}
810 		}
811 
812 		d->rxsd_cl = NULL;
813 		d->m = NULL;
814 		if (++cidx == q->size)
815 			cidx = 0;
816 	}
817 }
818 
819 static __inline void
820 __refill_fl(adapter_t *adap, struct sge_fl *fl)
821 {
822 	refill_fl(adap, fl, min(16U, fl->size - fl->credits));
823 }
824 
825 static __inline void
826 __refill_fl_lt(adapter_t *adap, struct sge_fl *fl, int max)
827 {
828 	uint32_t reclaimable = fl->size - fl->credits;
829 
830 	if (reclaimable > 0)
831 		refill_fl(adap, fl, min(max, reclaimable));
832 }
833 
834 /**
835  *	recycle_rx_buf - recycle a receive buffer
836  *	@adapter: the adapter
837  *	@q: the SGE free list
838  *	@idx: index of buffer to recycle
839  *
840  *	Recycles the specified buffer on the given free list by adding it at
841  *	the next available slot on the list.
842  */
843 static void
844 recycle_rx_buf(adapter_t *adap, struct sge_fl *q, unsigned int idx)
845 {
846 	struct rx_desc *from = &q->desc[idx];
847 	struct rx_desc *to   = &q->desc[q->pidx];
848 
849 	q->sdesc[q->pidx] = q->sdesc[idx];
850 	to->addr_lo = from->addr_lo;        // already big endian
851 	to->addr_hi = from->addr_hi;        // likewise
852 	wmb();	/* necessary ? */
853 	to->len_gen = htobe32(V_FLD_GEN1(q->gen));
854 	to->gen2 = htobe32(V_FLD_GEN2(q->gen));
855 	q->credits++;
856 
857 	if (++q->pidx == q->size) {
858 		q->pidx = 0;
859 		q->gen ^= 1;
860 	}
861 	t3_write_reg(adap, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
862 }
863 
864 static void
865 alloc_ring_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
866 {
867 	uint32_t *addr;
868 
869 	addr = arg;
870 	*addr = segs[0].ds_addr;
871 }
872 
873 static int
874 alloc_ring(adapter_t *sc, size_t nelem, size_t elem_size, size_t sw_size,
875     bus_addr_t *phys, void *desc, void *sdesc, bus_dma_tag_t *tag,
876     bus_dmamap_t *map, bus_dma_tag_t parent_entry_tag, bus_dma_tag_t *entry_tag)
877 {
878 	size_t len = nelem * elem_size;
879 	void *s = NULL;
880 	void *p = NULL;
881 	int err;
882 
883 	if ((err = bus_dma_tag_create(sc->parent_dmat, PAGE_SIZE, 0,
884 				      BUS_SPACE_MAXADDR_32BIT,
885 				      BUS_SPACE_MAXADDR, NULL, NULL, len, 1,
886 				      len, 0, NULL, NULL, tag)) != 0) {
887 		device_printf(sc->dev, "Cannot allocate descriptor tag\n");
888 		return (ENOMEM);
889 	}
890 
891 	if ((err = bus_dmamem_alloc(*tag, (void **)&p, BUS_DMA_NOWAIT,
892 				    map)) != 0) {
893 		device_printf(sc->dev, "Cannot allocate descriptor memory\n");
894 		return (ENOMEM);
895 	}
896 
897 	bus_dmamap_load(*tag, *map, p, len, alloc_ring_cb, phys, 0);
898 	bzero(p, len);
899 	*(void **)desc = p;
900 
901 	if (sw_size) {
902 		len = nelem * sw_size;
903 		s = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO);
904 		*(void **)sdesc = s;
905 	}
906 	if (parent_entry_tag == NULL)
907 		return (0);
908 
909 	if ((err = bus_dma_tag_create(parent_entry_tag, 1, 0,
910 				      BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR,
911 		                      NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
912 				      TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
913 		                      NULL, NULL, entry_tag)) != 0) {
914 		device_printf(sc->dev, "Cannot allocate descriptor entry tag\n");
915 		return (ENOMEM);
916 	}
917 	return (0);
918 }
919 
920 static void
921 sge_slow_intr_handler(void *arg, int ncount)
922 {
923 	adapter_t *sc = arg;
924 
925 	t3_slow_intr_handler(sc);
926 	t3_write_reg(sc, A_PL_INT_ENABLE0, sc->slow_intr_mask);
927 	(void) t3_read_reg(sc, A_PL_INT_ENABLE0);
928 }
929 
930 /**
931  *	sge_timer_cb - perform periodic maintenance of an SGE qset
932  *	@data: the SGE queue set to maintain
933  *
934  *	Runs periodically from a timer to perform maintenance of an SGE queue
935  *	set.  It performs two tasks:
936  *
937  *	a) Cleans up any completed Tx descriptors that may still be pending.
938  *	Normal descriptor cleanup happens when new packets are added to a Tx
939  *	queue so this timer is relatively infrequent and does any cleanup only
940  *	if the Tx queue has not seen any new packets in a while.  We make a
941  *	best effort attempt to reclaim descriptors, in that we don't wait
942  *	around if we cannot get a queue's lock (which most likely is because
943  *	someone else is queueing new packets and so will also handle the clean
944  *	up).  Since control queues use immediate data exclusively we don't
945  *	bother cleaning them up here.
946  *
947  *	b) Replenishes Rx queues that have run out due to memory shortage.
948  *	Normally new Rx buffers are added when existing ones are consumed but
949  *	when out of memory a queue can become empty.  We try to add only a few
950  *	buffers here, the queue will be replenished fully as these new buffers
951  *	are used up if memory shortage has subsided.
952  *
953  *	c) Return coalesced response queue credits in case a response queue is
954  *	starved.
955  *
956  *	d) Ring doorbells for T304 tunnel queues since we have seen doorbell
957  *	fifo overflows and the FW doesn't implement any recovery scheme yet.
958  */
959 static void
960 sge_timer_cb(void *arg)
961 {
962 	adapter_t *sc = arg;
963 	if ((sc->flags & USING_MSIX) == 0) {
964 
965 		struct port_info *pi;
966 		struct sge_qset *qs;
967 		struct sge_txq  *txq;
968 		int i, j;
969 		int reclaim_ofl, refill_rx;
970 
971 		if (sc->open_device_map == 0)
972 			return;
973 
974 		for (i = 0; i < sc->params.nports; i++) {
975 			pi = &sc->port[i];
976 			for (j = 0; j < pi->nqsets; j++) {
977 				qs = &sc->sge.qs[pi->first_qset + j];
978 				txq = &qs->txq[0];
979 				reclaim_ofl = txq[TXQ_OFLD].processed - txq[TXQ_OFLD].cleaned;
980 				refill_rx = ((qs->fl[0].credits < qs->fl[0].size) ||
981 				    (qs->fl[1].credits < qs->fl[1].size));
982 				if (reclaim_ofl || refill_rx) {
983 					taskqueue_enqueue(sc->tq, &pi->timer_reclaim_task);
984 					break;
985 				}
986 			}
987 		}
988 	}
989 
990 	if (sc->params.nports > 2) {
991 		int i;
992 
993 		for_each_port(sc, i) {
994 			struct port_info *pi = &sc->port[i];
995 
996 			t3_write_reg(sc, A_SG_KDOORBELL,
997 				     F_SELEGRCNTX |
998 				     (FW_TUNNEL_SGEEC_START + pi->first_qset));
999 		}
1000 	}
1001 	if (((sc->flags & USING_MSIX) == 0 || sc->params.nports > 2) &&
1002 	    sc->open_device_map != 0)
1003 		callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
1004 }
1005 
1006 /*
1007  * This is meant to be a catch-all function to keep sge state private
1008  * to sge.c
1009  *
1010  */
1011 int
1012 t3_sge_init_adapter(adapter_t *sc)
1013 {
1014 	callout_init(&sc->sge_timer_ch, 1);
1015 	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
1016 	TASK_INIT(&sc->slow_intr_task, 0, sge_slow_intr_handler, sc);
1017 	return (0);
1018 }
1019 
1020 int
1021 t3_sge_reset_adapter(adapter_t *sc)
1022 {
1023 	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
1024 	return (0);
1025 }
1026 
1027 int
1028 t3_sge_init_port(struct port_info *pi)
1029 {
1030 	TASK_INIT(&pi->timer_reclaim_task, 0, sge_timer_reclaim, pi);
1031 	return (0);
1032 }
1033 
1034 /**
1035  *	refill_rspq - replenish an SGE response queue
1036  *	@adapter: the adapter
1037  *	@q: the response queue to replenish
1038  *	@credits: how many new responses to make available
1039  *
1040  *	Replenishes a response queue by making the supplied number of responses
1041  *	available to HW.
1042  */
1043 static __inline void
1044 refill_rspq(adapter_t *sc, const struct sge_rspq *q, u_int credits)
1045 {
1046 
1047 	/* mbufs are allocated on demand when a rspq entry is processed. */
1048 	t3_write_reg(sc, A_SG_RSPQ_CREDIT_RETURN,
1049 		     V_RSPQ(q->cntxt_id) | V_CREDITS(credits));
1050 }
1051 
/*
 * Task handler: run reclaim_completed_tx() with a threshold of 16 on Tx
 * queues 0, 1 and 2 of the queue set passed in @arg.  @ncount is unused.
 */
static void
sge_txq_reclaim_handler(void *arg, int ncount)
{
	struct sge_qset *qs = arg;
	int qidx = 0;

	while (qidx < 3) {
		reclaim_completed_tx(qs, 16, qidx);
		qidx++;
	}
}
1061 
1062 static void
1063 sge_timer_reclaim(void *arg, int ncount)
1064 {
1065 	struct port_info *pi = arg;
1066 	int i, nqsets = pi->nqsets;
1067 	adapter_t *sc = pi->adapter;
1068 	struct sge_qset *qs;
1069 	struct mtx *lock;
1070 
1071 	KASSERT((sc->flags & USING_MSIX) == 0,
1072 	    ("can't call timer reclaim for msi-x"));
1073 
1074 	for (i = 0; i < nqsets; i++) {
1075 		qs = &sc->sge.qs[pi->first_qset + i];
1076 
1077 		reclaim_completed_tx(qs, 16, TXQ_OFLD);
1078 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
1079 			    &sc->sge.qs[0].rspq.lock;
1080 
1081 		if (mtx_trylock(lock)) {
1082 			/* XXX currently assume that we are *NOT* polling */
1083 			uint32_t status = t3_read_reg(sc, A_SG_RSPQ_FL_STATUS);
1084 
1085 			if (qs->fl[0].credits < qs->fl[0].size - 16)
1086 				__refill_fl(sc, &qs->fl[0]);
1087 			if (qs->fl[1].credits < qs->fl[1].size - 16)
1088 				__refill_fl(sc, &qs->fl[1]);
1089 
1090 			if (status & (1 << qs->rspq.cntxt_id)) {
1091 				if (qs->rspq.credits) {
1092 					refill_rspq(sc, &qs->rspq, 1);
1093 					qs->rspq.credits--;
1094 					t3_write_reg(sc, A_SG_RSPQ_FL_STATUS,
1095 					    1 << qs->rspq.cntxt_id);
1096 				}
1097 			}
1098 			mtx_unlock(lock);
1099 		}
1100 	}
1101 }
1102 
1103 /**
1104  *	init_qset_cntxt - initialize an SGE queue set context info
1105  *	@qs: the queue set
1106  *	@id: the queue set id
1107  *
1108  *	Initializes the TIDs and context ids for the queues of a queue set.
1109  */
1110 static void
1111 init_qset_cntxt(struct sge_qset *qs, u_int id)
1112 {
1113 
1114 	qs->rspq.cntxt_id = id;
1115 	qs->fl[0].cntxt_id = 2 * id;
1116 	qs->fl[1].cntxt_id = 2 * id + 1;
1117 	qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id;
1118 	qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id;
1119 	qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id;
1120 	qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id;
1121 	qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id;
1122 
1123 	/* XXX: a sane limit is needed instead of INT_MAX */
1124 	mbufq_init(&qs->txq[TXQ_ETH].sendq, INT_MAX);
1125 	mbufq_init(&qs->txq[TXQ_OFLD].sendq, INT_MAX);
1126 	mbufq_init(&qs->txq[TXQ_CTRL].sendq, INT_MAX);
1127 }
1128 
1129 
1130 static void
1131 txq_prod(struct sge_txq *txq, unsigned int ndesc, struct txq_state *txqs)
1132 {
1133 	txq->in_use += ndesc;
1134 	/*
1135 	 * XXX we don't handle stopping of queue
1136 	 * presumably start handles this when we bump against the end
1137 	 */
1138 	txqs->gen = txq->gen;
1139 	txq->unacked += ndesc;
1140 	txqs->compl = (txq->unacked & 32) << (S_WR_COMPL - 5);
1141 	txq->unacked &= 31;
1142 	txqs->pidx = txq->pidx;
1143 	txq->pidx += ndesc;
1144 #ifdef INVARIANTS
1145 	if (((txqs->pidx > txq->cidx) &&
1146 		(txq->pidx < txqs->pidx) &&
1147 		(txq->pidx >= txq->cidx)) ||
1148 	    ((txqs->pidx < txq->cidx) &&
1149 		(txq->pidx >= txq-> cidx)) ||
1150 	    ((txqs->pidx < txq->cidx) &&
1151 		(txq->cidx < txqs->pidx)))
1152 		panic("txqs->pidx=%d txq->pidx=%d txq->cidx=%d",
1153 		    txqs->pidx, txq->pidx, txq->cidx);
1154 #endif
1155 	if (txq->pidx >= txq->size) {
1156 		txq->pidx -= txq->size;
1157 		txq->gen ^= 1;
1158 	}
1159 
1160 }
1161 
1162 /**
1163  *	calc_tx_descs - calculate the number of Tx descriptors for a packet
1164  *	@m: the packet mbufs
1165  *      @nsegs: the number of segments
1166  *
1167  * 	Returns the number of Tx descriptors needed for the given Ethernet
1168  * 	packet.  Ethernet packets require addition of WR and CPL headers.
1169  */
1170 static __inline unsigned int
1171 calc_tx_descs(const struct mbuf *m, int nsegs)
1172 {
1173 	unsigned int flits;
1174 
1175 	if (m->m_pkthdr.len <= PIO_LEN)
1176 		return 1;
1177 
1178 	flits = sgl_len(nsegs) + 2;
1179 	if (m->m_pkthdr.csum_flags & CSUM_TSO)
1180 		flits++;
1181 
1182 	return flits_to_desc(flits);
1183 }
1184 
1185 /**
1186  *	make_sgl - populate a scatter/gather list for a packet
1187  *	@sgp: the SGL to populate
1188  *	@segs: the packet dma segments
1189  *	@nsegs: the number of segments
1190  *
1191  *	Generates a scatter/gather list for the buffers that make up a packet
1192  *	and returns the SGL size in 8-byte words.  The caller must size the SGL
1193  *	appropriately.
1194  */
1195 static __inline void
1196 make_sgl(struct sg_ent *sgp, bus_dma_segment_t *segs, int nsegs)
1197 {
1198 	int i, idx;
1199 
1200 	for (idx = 0, i = 0; i < nsegs; i++) {
1201 		/*
1202 		 * firmware doesn't like empty segments
1203 		 */
1204 		if (segs[i].ds_len == 0)
1205 			continue;
1206 		if (i && idx == 0)
1207 			++sgp;
1208 
1209 		sgp->len[idx] = htobe32(segs[i].ds_len);
1210 		sgp->addr[idx] = htobe64(segs[i].ds_addr);
1211 		idx ^= 1;
1212 	}
1213 
1214 	if (idx) {
1215 		sgp->len[idx] = 0;
1216 		sgp->addr[idx] = 0;
1217 	}
1218 }
1219 
/**
 *	check_ring_tx_db - check and potentially ring a Tx queue's doorbell
 *	@adap: the adapter
 *	@q: the Tx queue
 *	@mustring: non-zero to ring the doorbell now rather than batching;
 *	    only consulted when USE_GTS is not set
 *
 *	Ring the doorbell if a Tx queue is asleep.  There is a natural race,
 *	where the HW is going to sleep just after we checked, however,
 *	then the interrupt handler will detect the outstanding TX packet
 *	and ring the doorbell for us.
 *
 *	When GTS is disabled the doorbell is rung if @mustring is set or once
 *	32 calls have accumulated in q->db_pending since the last ring; the
 *	pending count is then reset.
 */
static __inline void
check_ring_tx_db(adapter_t *adap, struct sge_txq *q, int mustring)
{
#if USE_GTS
	clear_bit(TXQ_LAST_PKT_DB, &q->flags);
	/* Only ring if the queue was not already marked running. */
	if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) {
		set_bit(TXQ_LAST_PKT_DB, &q->flags);
#ifdef T3_TRACE
		T3_TRACE1(adap->tb[q->cntxt_id & 7], "doorbell Tx, cntxt %d",
			  q->cntxt_id);
#endif
		t3_write_reg(adap, A_SG_KDOORBELL,
			     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
	}
#else
	/* Note: db_pending is only incremented when mustring is zero. */
	if (mustring || ++q->db_pending >= 32) {
		wmb();            /* write descriptors before telling HW */
		t3_write_reg(adap, A_SG_KDOORBELL,
		    F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
		q->db_pending = 0;
	}
#endif
}
1255 
/*
 * When the SGE is built with two generation bits, store @gen (big-endian)
 * in the last flit of Tx descriptor @d.  Otherwise this is a no-op.
 */
static __inline void
wr_gen2(struct tx_desc *d, unsigned int gen)
{
#if SGE_NUM_GENBITS == 2
	const unsigned int last = TX_DESC_FLITS - 1;

	d->flit[last] = htobe64(gen);
#endif
}
1263 
/**
 *	write_wr_hdr_sgl - write a WR header and, optionally, SGL
 *	@ndesc: number of Tx descriptors spanned by the SGL
 *	@txd: first Tx descriptor to be written
 *	@txqs: txq state (generation and producer index)
 *	@txq: the SGE Tx queue
 *	@sgl: the SGL
 *	@flits: number of flits to the start of the SGL in the first descriptor
 *	@sgl_flits: the SGL size in flits
 *	@wr_hi: top 32 bits of WR header based on WR type (big endian)
 *	@wr_lo: low 32 bits of WR header based on WR type (big endian)
 *
 *	Write a work request header and an associated SGL.  If the SGL is
 *	small enough to fit into one Tx descriptor it has already been written
 *	and we just need to write the WR header.  Otherwise we distribute the
 *	SGL across the number of descriptors it spans.
 *
 *	In the multi-descriptor case @txqs->pidx and @txqs->gen are advanced
 *	as descriptors are consumed, and the low header word of the first
 *	descriptor (which carries its generation bit) is written last, after
 *	a write barrier.
 */
static void
write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs,
    const struct sge_txq *txq, const struct sg_ent *sgl, unsigned int flits,
    unsigned int sgl_flits, unsigned int wr_hi, unsigned int wr_lo)
{

	struct work_request_hdr *wrp = (struct work_request_hdr *)txd;

	if (__predict_true(ndesc == 1)) {
		/* Common case: SGL already sits in this descriptor. */
		set_wr_hdr(wrp, htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
		    V_WR_SGLSFLT(flits)) | wr_hi,
		    htonl(V_WR_LEN(flits + sgl_flits) | V_WR_GEN(txqs->gen)) |
		    wr_lo);

		wr_gen2(txd, txqs->gen);

	} else {
		/* Remember the first descriptor and its generation. */
		unsigned int ogen = txqs->gen;
		const uint64_t *fp = (const uint64_t *)sgl;
		struct work_request_hdr *wp = wrp;

		/* First descriptor: start-of-packet, low word deferred. */
		wrp->wrh_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) |
		    V_WR_SGLSFLT(flits)) | wr_hi;

		while (sgl_flits) {
			/* Room left in this descriptor after its header flits. */
			unsigned int avail = WR_FLITS - flits;

			if (avail > sgl_flits)
				avail = sgl_flits;
			memcpy(&txd->flit[flits], fp, avail * sizeof(*fp));
			sgl_flits -= avail;
			ndesc--;
			if (!sgl_flits)
				break;

			fp += avail;
			txd++;
			/* Wrap to the start of the ring and flip generation. */
			if (++txqs->pidx == txq->size) {
				txqs->pidx = 0;
				txqs->gen ^= 1;
				txd = txq->desc;
			}

			/*
			 * when the head of the mbuf chain
			 * is freed all clusters will be freed
			 * with it
			 */
			/* Continuation descriptor: one header flit, then SGL. */
			wrp = (struct work_request_hdr *)txd;
			wrp->wrh_hi = htonl(V_WR_DATATYPE(1) |
			    V_WR_SGLSFLT(1)) | wr_hi;
			wrp->wrh_lo = htonl(V_WR_LEN(min(WR_FLITS,
				    sgl_flits + 1)) |
			    V_WR_GEN(txqs->gen)) | wr_lo;
			wr_gen2(txd, txqs->gen);
			flits = 1;
		}
		/* Mark the last descriptor written as end-of-packet. */
		wrp->wrh_hi |= htonl(F_WR_EOP);
		/* Everything else must be visible before the first gen bit. */
		wmb();
		wp->wrh_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo;
		wr_gen2((struct tx_desc *)wp, ogen);
	}
}
1344 
/*
 * sizeof(*eh) + sizeof(*ip) + sizeof(*tcp)
 *
 * Ethernet header plus 20 bytes each for the IP and TCP headers.
 */
#define TCPPKTHDRSIZE (ETHER_HDR_LEN + 20 + 20)

/*
 * If mbuf (m) carries M_VLANTAG, OR the VLAN-valid flag and the mbuf's
 * ether_vtag into cntrl.  cntrl is assigned to, so it must be a
 * modifiable lvalue.
 */
#define GET_VTAG(cntrl, m) \
do { \
	if ((m)->m_flags & M_VLANTAG)					            \
		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN((m)->m_pkthdr.ether_vtag); \
} while (0)
1353 
/**
 *	t3_encap - write an Ethernet packet (or packet batch) to the Tx ring
 *	@qs: the queue set whose TXQ_ETH queue is used; qs->lock must be held
 *	@m: address of the pointer to the packet header mbuf
 *
 *	DMA-maps the packet, reserves descriptors with txq_prod() and writes
 *	the work request.  Three shapes are handled:
 *	 - a chain linked through m_nextpkt is sent as one batch work request
 *	   (at most 7 entries, panics otherwise);
 *	 - a packet with CSUM_TSO set (and more than one mbuf) gets an LSO CPL
 *	   header built from its Ethernet/IP/TCP headers;
 *	 - anything else gets a plain CPL_TX_PKT header.
 *	In the latter two cases a packet of at most PIO_LEN bytes is copied
 *	into the descriptor as immediate data and the mbuf is freed here;
 *	longer packets are described by a scatter/gather list and the mbuf is
 *	recorded in the software descriptor.
 *
 *	Returns 0 on success or the error from busdma_map_sg_collapse().
 *	*@m is read but never written back by this function.
 */
static int
t3_encap(struct sge_qset *qs, struct mbuf **m)
{
	adapter_t *sc;
	struct mbuf *m0;
	struct sge_txq *txq;
	struct txq_state txqs;
	struct port_info *pi;
	unsigned int ndesc, flits, cntrl, mlen;
	int err, nsegs, tso_info = 0;

	struct work_request_hdr *wrp;
	struct tx_sw_desc *txsd;
	struct sg_ent *sgp, *sgl;
	uint32_t wr_hi, wr_lo, sgl_flits;
	bus_dma_segment_t segs[TX_MAX_SEGS];

	struct tx_desc *txd;

	pi = qs->port;
	sc = pi->adapter;
	txq = &qs->txq[TXQ_ETH];
	/* Hardware and software descriptors at the current producer index. */
	txd = &txq->desc[txq->pidx];
	txsd = &txq->sdesc[txq->pidx];
	sgl = txq->txq_sgl;

	prefetch(txd);
	m0 = *m;

	mtx_assert(&qs->lock, MA_OWNED);
	cntrl = V_TXPKT_INTF(pi->txpkt_intf);
	KASSERT(m0->m_flags & M_PKTHDR, ("not packet header\n"));

	/* TSO is only considered for a single, multi-mbuf packet. */
	if  (m0->m_nextpkt == NULL && m0->m_next != NULL &&
	    m0->m_pkthdr.csum_flags & (CSUM_TSO))
		tso_info = V_LSO_MSS(m0->m_pkthdr.tso_segsz);

	if (m0->m_nextpkt != NULL) {
		/* Batch of packets: always a single descriptor. */
		busdma_map_sg_vec(txq->entry_tag, txsd->map, m0, segs, &nsegs);
		ndesc = 1;
		mlen = 0;
	} else {
		if ((err = busdma_map_sg_collapse(txq->entry_tag, txsd->map,
		    &m0, segs, &nsegs))) {
			if (cxgb_debug)
				printf("failed ... err=%d\n", err);
			return (err);
		}
		mlen = m0->m_pkthdr.len;
		ndesc = calc_tx_descs(m0, nsegs);
	}
	/* Reserve the descriptors; txqs records gen/pidx/compl for the WR. */
	txq_prod(txq, ndesc, &txqs);

	KASSERT(m0->m_pkthdr.len, ("empty packet nsegs=%d", nsegs));
	txsd->m = m0;

	if (m0->m_nextpkt != NULL) {
		struct cpl_tx_pkt_batch *cpl_batch = (struct cpl_tx_pkt_batch *)txd;
		int i, fidx;

		if (nsegs > 7)
			panic("trying to coalesce %d packets in to one WR", nsegs);
		txq->txq_coalesced += nsegs;
		wrp = (struct work_request_hdr *)txd;
		/* One header flit plus two flits per batch entry. */
		flits = nsegs*2 + 1;

		/*
		 * NOTE(review): the checksum flags and VLAN tag are taken from
		 * m0 for every entry; the loop does not step through
		 * m_nextpkt -- confirm this is intended.
		 */
		for (fidx = 1, i = 0; i < nsegs; i++, fidx += 2) {
			struct cpl_tx_pkt_batch_entry *cbe;
			uint64_t flit;
			uint32_t *hflit = (uint32_t *)&flit;
			int cflags = m0->m_pkthdr.csum_flags;

			cntrl = V_TXPKT_INTF(pi->txpkt_intf);
			GET_VTAG(cntrl, m0);
			cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
			if (__predict_false(!(cflags & CSUM_IP)))
				cntrl |= F_TXPKT_IPCSUM_DIS;
			if (__predict_false(!(cflags & (CSUM_TCP | CSUM_UDP |
			    CSUM_UDP_IPV6 | CSUM_TCP_IPV6))))
				cntrl |= F_TXPKT_L4CSUM_DIS;

			/* hflit aliases flit, so the OR below alters both halves' storage. */
			hflit[0] = htonl(cntrl);
			hflit[1] = htonl(segs[i].ds_len | 0x80000000);
			flit |= htobe64(1 << 24);
			cbe = &cpl_batch->pkt_entry[i];
			cbe->cntrl = hflit[0];
			cbe->len = hflit[1];
			cbe->addr = htobe64(segs[i].ds_addr);
		}

		wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
		    V_WR_SGLSFLT(flits)) |
		    htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
		wr_lo = htonl(V_WR_LEN(flits) |
		    V_WR_GEN(txqs.gen)) | htonl(V_WR_TID(txq->token));
		set_wr_hdr(wrp, wr_hi, wr_lo);
		wmb();
		ETHER_BPF_MTAP(pi->ifp, m0);
		wr_gen2(txd, txqs.gen);
		check_ring_tx_db(sc, txq, 0);
		return (0);
	} else if (tso_info) {
		uint16_t eth_type;
		struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *)txd;
		struct ether_header *eh;
		void *l3hdr;
		struct tcphdr *tcp;

		txd->flit[2] = 0;
		GET_VTAG(cntrl, m0);
		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO);
		hdr->cntrl = htonl(cntrl);
		hdr->len = htonl(mlen | 0x80000000);

		if (__predict_false(mlen < TCPPKTHDRSIZE)) {
			printf("mbuf=%p,len=%d,tso_segsz=%d,csum_flags=%b,flags=%#x",
			    m0, mlen, m0->m_pkthdr.tso_segsz,
			    (int)m0->m_pkthdr.csum_flags, CSUM_BITS, m0->m_flags);
			panic("tx tso packet too small");
		}

		/* Make sure that ether, ip, tcp headers are all in m0 */
		/*
		 * NOTE(review): this runs after the DMA mapping and after
		 * txsd->m was set; if m_pullup() returns a different mbuf,
		 * txsd->m and *m still refer to the old one -- confirm.
		 */
		if (__predict_false(m0->m_len < TCPPKTHDRSIZE)) {
			m0 = m_pullup(m0, TCPPKTHDRSIZE);
			if (__predict_false(m0 == NULL)) {
				/* XXX panic probably an overreaction */
				panic("couldn't fit header into mbuf");
			}
		}

		/* Locate the L3 header, allowing for an in-frame VLAN header. */
		eh = mtod(m0, struct ether_header *);
		eth_type = eh->ether_type;
		if (eth_type == htons(ETHERTYPE_VLAN)) {
			struct ether_vlan_header *evh = (void *)eh;

			tso_info |= V_LSO_ETH_TYPE(CPL_ETH_II_VLAN);
			l3hdr = evh + 1;
			eth_type = evh->evl_proto;
		} else {
			tso_info |= V_LSO_ETH_TYPE(CPL_ETH_II);
			l3hdr = eh + 1;
		}

		if (eth_type == htons(ETHERTYPE_IP)) {
			struct ip *ip = l3hdr;

			tso_info |= V_LSO_IPHDR_WORDS(ip->ip_hl);
			/* TCP header is taken to follow a struct ip directly. */
			tcp = (struct tcphdr *)(ip + 1);
		} else if (eth_type == htons(ETHERTYPE_IPV6)) {
			struct ip6_hdr *ip6 = l3hdr;

			KASSERT(ip6->ip6_nxt == IPPROTO_TCP,
			    ("%s: CSUM_TSO with ip6_nxt %d",
			    __func__, ip6->ip6_nxt));

			tso_info |= F_LSO_IPV6;
			tso_info |= V_LSO_IPHDR_WORDS(sizeof(*ip6) >> 2);
			tcp = (struct tcphdr *)(ip6 + 1);
		} else
			panic("%s: CSUM_TSO but neither ip nor ip6", __func__);

		tso_info |= V_LSO_TCPHDR_WORDS(tcp->th_off);
		hdr->lso_info = htonl(tso_info);

		if (__predict_false(mlen <= PIO_LEN)) {
			/*
			 * pkt not undersized but fits in PIO_LEN
			 * Indicates a TSO bug at the higher levels.
			 */
			/* Immediate data: nothing to reclaim later, mbuf freed below. */
			txsd->m = NULL;
			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[3]);
			flits = (mlen + 7) / 8 + 3;
			wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
					  V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
					  F_WR_SOP | F_WR_EOP | txqs.compl);
			wr_lo = htonl(V_WR_LEN(flits) |
			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
			set_wr_hdr(&hdr->wr, wr_hi, wr_lo);
			wmb();
			ETHER_BPF_MTAP(pi->ifp, m0);
			wr_gen2(txd, txqs.gen);
			check_ring_tx_db(sc, txq, 0);
			m_freem(m0);
			return (0);
		}
		/* WR header, CPL header and LSO info precede the SGL. */
		flits = 3;
	} else {
		struct cpl_tx_pkt *cpl = (struct cpl_tx_pkt *)txd;

		GET_VTAG(cntrl, m0);
		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
		/* Disable checksum insertion that the stack did not request. */
		if (__predict_false(!(m0->m_pkthdr.csum_flags & CSUM_IP)))
			cntrl |= F_TXPKT_IPCSUM_DIS;
		if (__predict_false(!(m0->m_pkthdr.csum_flags & (CSUM_TCP |
		    CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6))))
			cntrl |= F_TXPKT_L4CSUM_DIS;
		cpl->cntrl = htonl(cntrl);
		cpl->len = htonl(mlen | 0x80000000);

		if (mlen <= PIO_LEN) {
			/* Immediate data: copy the packet and free the mbuf here. */
			txsd->m = NULL;
			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[2]);
			flits = (mlen + 7) / 8 + 2;

			wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
			    V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
					  F_WR_SOP | F_WR_EOP | txqs.compl);
			wr_lo = htonl(V_WR_LEN(flits) |
			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
			set_wr_hdr(&cpl->wr, wr_hi, wr_lo);
			wmb();
			ETHER_BPF_MTAP(pi->ifp, m0);
			wr_gen2(txd, txqs.gen);
			check_ring_tx_db(sc, txq, 0);
			m_freem(m0);
			return (0);
		}
		flits = 2;
	}
	wrp = (struct work_request_hdr *)txd;
	/* Single descriptor: build the SGL in place; else in the staging SGL. */
	sgp = (ndesc == 1) ? (struct sg_ent *)&txd->flit[flits] : sgl;
	make_sgl(sgp, segs, nsegs);

	sgl_flits = sgl_len(nsegs);

	ETHER_BPF_MTAP(pi->ifp, m0);

	KASSERT(ndesc <= 4, ("ndesc too large %d", ndesc));
	wr_hi = htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
	wr_lo = htonl(V_WR_TID(txq->token));
	write_wr_hdr_sgl(ndesc, txd, &txqs, txq, sgl, flits,
	    sgl_flits, wr_hi, wr_lo);
	check_ring_tx_db(sc, txq, 0);

	return (0);
}
1590 
#ifdef DEBUGNET
/*
 * debugnet transmit hook: encapsulate *m on the queue set's Ethernet Tx
 * queue.  On success the doorbell is rung unconditionally; on failure any
 * mbuf still referenced by *m is freed and *m is cleared.  Returns the
 * result of t3_encap().
 */
int
cxgb_debugnet_encap(struct sge_qset *qs, struct mbuf **m)
{
	int error = t3_encap(qs, m);

	if (error != 0) {
		if (*m != NULL) {
			m_freem(*m);
			*m = NULL;
		}
		return (error);
	}

	check_ring_tx_db(qs->port->adapter, &qs->txq[TXQ_ETH], 1);
	return (error);
}
#endif
1607 
1608 void
1609 cxgb_tx_watchdog(void *arg)
1610 {
1611 	struct sge_qset *qs = arg;
1612 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1613 
1614         if (qs->coalescing != 0 &&
1615 	    (txq->in_use <= cxgb_tx_coalesce_enable_stop) &&
1616 	    TXQ_RING_EMPTY(qs))
1617                 qs->coalescing = 0;
1618         else if (qs->coalescing == 0 &&
1619 	    (txq->in_use >= cxgb_tx_coalesce_enable_start))
1620                 qs->coalescing = 1;
1621 	if (TXQ_TRYLOCK(qs)) {
1622 		qs->qs_flags |= QS_FLUSHING;
1623 		cxgb_start_locked(qs);
1624 		qs->qs_flags &= ~QS_FLUSHING;
1625 		TXQ_UNLOCK(qs);
1626 	}
1627 	if (if_getdrvflags(qs->port->ifp) & IFF_DRV_RUNNING)
1628 		callout_reset_on(&txq->txq_watchdog, hz/4, cxgb_tx_watchdog,
1629 		    qs, txq->txq_watchdog.c_cpu);
1630 }
1631 
1632 static void
1633 cxgb_tx_timeout(void *arg)
1634 {
1635 	struct sge_qset *qs = arg;
1636 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1637 
1638 	if (qs->coalescing == 0 && (txq->in_use >= (txq->size>>3)))
1639                 qs->coalescing = 1;
1640 	if (TXQ_TRYLOCK(qs)) {
1641 		qs->qs_flags |= QS_TIMEOUT;
1642 		cxgb_start_locked(qs);
1643 		qs->qs_flags &= ~QS_TIMEOUT;
1644 		TXQ_UNLOCK(qs);
1645 	}
1646 }
1647 
/*
 * Drain the queue set's software ring into the Ethernet Tx queue.
 *
 * Called with the queue lock held (asserted below).  If the link is down
 * the ring is flushed instead.  Otherwise packets are dequeued and
 * encapsulated until the ring is empty, the interface stops running, the
 * link drops, fewer than TX_MAX_DESC + 1 descriptors remain free, or
 * encapsulation fails.  Any batched doorbell is rung afterwards and the
 * Tx timer is armed if packets are left over.
 */
static void
cxgb_start_locked(struct sge_qset *qs)
{
	struct mbuf *m_head = NULL;
	struct sge_txq *txq = &qs->txq[TXQ_ETH];
	struct port_info *pi = qs->port;
	if_t ifp = pi->ifp;

	/* Watchdog/timeout callers ask for a full reclaim first. */
	if (qs->qs_flags & (QS_FLUSHING|QS_TIMEOUT))
		reclaim_completed_tx(qs, 0, TXQ_ETH);

	if (!pi->link_config.link_ok) {
		TXQ_RING_FLUSH(qs);
		return;
	}
	TXQ_LOCK_ASSERT(qs);
	while (!TXQ_RING_EMPTY(qs) && (if_getdrvflags(ifp) & IFF_DRV_RUNNING) &&
	    pi->link_config.link_ok) {
		reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH);

		/* Keep enough free descriptors for a worst-case packet. */
		if (txq->size - txq->in_use <= TX_MAX_DESC)
			break;

		if ((m_head = cxgb_dequeue(qs)) == NULL)
			break;
		/*
		 *  Encapsulation can modify our pointer, and or make it
		 *  NULL on failure.  In that event, we can't requeue.
		 */
		if (t3_encap(qs, &m_head) || m_head == NULL)
			break;

		/* Packet handed off; nothing left for the cleanup below. */
		m_head = NULL;
	}

	/* Flush any doorbell that was deferred during the loop. */
	if (txq->db_pending)
		check_ring_tx_db(pi->adapter, txq, 1);

	if (!TXQ_RING_EMPTY(qs) && callout_pending(&txq->txq_timer) == 0 &&
	    pi->link_config.link_ok)
		callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout,
		    qs, txq->txq_timer.c_cpu);
	/* Only non-NULL when t3_encap() failed and left the mbuf behind. */
	if (m_head != NULL)
		m_freem(m_head);
}
1693 
1694 static int
1695 cxgb_transmit_locked(if_t ifp, struct sge_qset *qs, struct mbuf *m)
1696 {
1697 	struct port_info *pi = qs->port;
1698 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1699 	struct buf_ring *br = txq->txq_mr;
1700 	int error, avail;
1701 
1702 	avail = txq->size - txq->in_use;
1703 	TXQ_LOCK_ASSERT(qs);
1704 
1705 	/*
1706 	 * We can only do a direct transmit if the following are true:
1707 	 * - we aren't coalescing (ring < 3/4 full)
1708 	 * - the link is up -- checked in caller
1709 	 * - there are no packets enqueued already
1710 	 * - there is space in hardware transmit queue
1711 	 */
1712 	if (check_pkt_coalesce(qs) == 0 &&
1713 	    !TXQ_RING_NEEDS_ENQUEUE(qs) && avail > TX_MAX_DESC) {
1714 		if (t3_encap(qs, &m)) {
1715 			if (m != NULL &&
1716 			    (error = drbr_enqueue(ifp, br, m)) != 0)
1717 				return (error);
1718 		} else {
1719 			if (txq->db_pending)
1720 				check_ring_tx_db(pi->adapter, txq, 1);
1721 
1722 			/*
1723 			 * We've bypassed the buf ring so we need to update
1724 			 * the stats directly
1725 			 */
1726 			txq->txq_direct_packets++;
1727 			txq->txq_direct_bytes += m->m_pkthdr.len;
1728 		}
1729 	} else if ((error = drbr_enqueue(ifp, br, m)) != 0)
1730 		return (error);
1731 
1732 	reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH);
1733 	if (!TXQ_RING_EMPTY(qs) && pi->link_config.link_ok &&
1734 	    (!check_pkt_coalesce(qs) || (drbr_inuse(ifp, br) >= 7)))
1735 		cxgb_start_locked(qs);
1736 	else if (!TXQ_RING_EMPTY(qs) && !callout_pending(&txq->txq_timer))
1737 		callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout,
1738 		    qs, txq->txq_timer.c_cpu);
1739 	return (0);
1740 }
1741 
1742 int
1743 cxgb_transmit(if_t ifp, struct mbuf *m)
1744 {
1745 	struct sge_qset *qs;
1746 	struct port_info *pi = if_getsoftc(ifp);
1747 	int error, qidx = pi->first_qset;
1748 
1749 	if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0
1750 	    ||(!pi->link_config.link_ok)) {
1751 		m_freem(m);
1752 		return (0);
1753 	}
1754 
1755 	/* check if flowid is set */
1756 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
1757 		qidx = (m->m_pkthdr.flowid % pi->nqsets) + pi->first_qset;
1758 
1759 	qs = &pi->adapter->sge.qs[qidx];
1760 
1761 	if (TXQ_TRYLOCK(qs)) {
1762 		/* XXX running */
1763 		error = cxgb_transmit_locked(ifp, qs, m);
1764 		TXQ_UNLOCK(qs);
1765 	} else
1766 		error = drbr_enqueue(ifp, qs->txq[TXQ_ETH].txq_mr, m);
1767 	return (error);
1768 }
1769 
1770 void
1771 cxgb_qflush(if_t ifp)
1772 {
1773 	/*
1774 	 * flush any enqueued mbufs in the buf_rings
1775 	 * and in the transmit queues
1776 	 * no-op for now
1777 	 */
1778 	return;
1779 }
1780 
/**
 *	write_imm - write a packet into a Tx descriptor as immediate data
 *	@d: the Tx descriptor to write
 *	@src: start of the packet data, which begins with a work request header
 *	@len: the length of packet data to write as immediate data
 *	@gen: the generation bit value to write
 *
 *	Writes a packet as immediate data into a Tx descriptor.  The packet
 *	contains a work request at its beginning.  We must write the packet
 *	carefully so the SGE doesn't read accidentally before it's written in
 *	its entirety.
 *
 *	@len must be at least the size of the work request header and at most
 *	WR_LEN (asserted).
 */
static __inline void
write_imm(struct tx_desc *d, caddr_t src,
	  unsigned int len, unsigned int gen)
{
	struct work_request_hdr *from = (struct work_request_hdr *)src;
	struct work_request_hdr *to = (struct work_request_hdr *)d;
	uint32_t wr_hi, wr_lo;

	KASSERT(len <= WR_LEN && len >= sizeof(*from),
	    ("%s: invalid len %d", __func__, len));

	/* Payload first; the header is written afterwards. */
	memcpy(&to[1], &from[1], len - sizeof(*from));
	/* Merge SOP/EOP, the trailing byte count, generation and length. */
	wr_hi = from->wrh_hi | htonl(F_WR_SOP | F_WR_EOP |
	    V_WR_BCNTLFLT(len & 7));
	wr_lo = from->wrh_lo | htonl(V_WR_GEN(gen) | V_WR_LEN((len + 7) / 8));
	set_wr_hdr(to, wr_hi, wr_lo);
	/* Header must be visible before the second generation bit. */
	wmb();
	wr_gen2(d, gen);
}
1812 
1813 /**
1814  *	check_desc_avail - check descriptor availability on a send queue
1815  *	@adap: the adapter
1816  *	@q: the TX queue
1817  *	@m: the packet needing the descriptors
1818  *	@ndesc: the number of Tx descriptors needed
1819  *	@qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL)
1820  *
1821  *	Checks if the requested number of Tx descriptors is available on an
1822  *	SGE send queue.  If the queue is already suspended or not enough
1823  *	descriptors are available the packet is queued for later transmission.
1824  *	Must be called with the Tx queue locked.
1825  *
1826  *	Returns 0 if enough descriptors are available, 1 if there aren't
1827  *	enough descriptors and the packet has been queued, and 2 if the caller
1828  *	needs to retry because there weren't enough descriptors at the
1829  *	beginning of the call but some freed up in the mean time.
1830  */
1831 static __inline int
1832 check_desc_avail(adapter_t *adap, struct sge_txq *q,
1833 		 struct mbuf *m, unsigned int ndesc,
1834 		 unsigned int qid)
1835 {
1836 	/*
1837 	 * XXX We currently only use this for checking the control queue
1838 	 * the control queue is only used for binding qsets which happens
1839 	 * at init time so we are guaranteed enough descriptors
1840 	 */
1841 	if (__predict_false(!mbufq_empty(&q->sendq))) {
1842 addq_exit:	(void )mbufq_enqueue(&q->sendq, m);
1843 		return 1;
1844 	}
1845 	if (__predict_false(q->size - q->in_use < ndesc)) {
1846 
1847 		struct sge_qset *qs = txq_to_qset(q, qid);
1848 
1849 		setbit(&qs->txq_stopped, qid);
1850 		if (should_restart_tx(q) &&
1851 		    test_and_clear_bit(qid, &qs->txq_stopped))
1852 			return 2;
1853 
1854 		q->stops++;
1855 		goto addq_exit;
1856 	}
1857 	return 0;
1858 }
1859 
1860 
1861 /**
1862  *	reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
1863  *	@q: the SGE control Tx queue
1864  *
1865  *	This is a variant of reclaim_completed_tx() that is used for Tx queues
1866  *	that send only immediate data (presently just the control queues) and
1867  *	thus do not have any mbufs
1868  */
1869 static __inline void
1870 reclaim_completed_tx_imm(struct sge_txq *q)
1871 {
1872 	unsigned int reclaim = q->processed - q->cleaned;
1873 
1874 	q->in_use -= reclaim;
1875 	q->cleaned += reclaim;
1876 }
1877 
1878 /**
1879  *	ctrl_xmit - send a packet through an SGE control Tx queue
1880  *	@adap: the adapter
1881  *	@q: the control queue
1882  *	@m: the packet
1883  *
1884  *	Send a packet through an SGE control Tx queue.  Packets sent through
1885  *	a control queue must fit entirely as immediate data in a single Tx
1886  *	descriptor and have no page fragments.
1887  */
1888 static int
1889 ctrl_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m)
1890 {
1891 	int ret;
1892 	struct work_request_hdr *wrp = mtod(m, struct work_request_hdr *);
1893 	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1894 
1895 	KASSERT(m->m_len <= WR_LEN, ("%s: bad tx data", __func__));
1896 
1897 	wrp->wrh_hi |= htonl(F_WR_SOP | F_WR_EOP);
1898 	wrp->wrh_lo = htonl(V_WR_TID(q->token));
1899 
1900 	TXQ_LOCK(qs);
1901 again:	reclaim_completed_tx_imm(q);
1902 
1903 	ret = check_desc_avail(adap, q, m, 1, TXQ_CTRL);
1904 	if (__predict_false(ret)) {
1905 		if (ret == 1) {
1906 			TXQ_UNLOCK(qs);
1907 			return (ENOSPC);
1908 		}
1909 		goto again;
1910 	}
1911 	write_imm(&q->desc[q->pidx], m->m_data, m->m_len, q->gen);
1912 
1913 	q->in_use++;
1914 	if (++q->pidx >= q->size) {
1915 		q->pidx = 0;
1916 		q->gen ^= 1;
1917 	}
1918 	TXQ_UNLOCK(qs);
1919 	wmb();
1920 	t3_write_reg(adap, A_SG_KDOORBELL,
1921 	    F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1922 
1923 	m_free(m);
1924 	return (0);
1925 }
1926 
1927 
1928 /**
1929  *	restart_ctrlq - restart a suspended control queue
1930  *	@qs: the queue set cotaining the control queue
1931  *
1932  *	Resumes transmission on a suspended Tx control queue.
1933  */
1934 static void
1935 restart_ctrlq(void *data, int npending)
1936 {
1937 	struct mbuf *m;
1938 	struct sge_qset *qs = (struct sge_qset *)data;
1939 	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1940 	adapter_t *adap = qs->port->adapter;
1941 
1942 	TXQ_LOCK(qs);
1943 again:	reclaim_completed_tx_imm(q);
1944 
1945 	while (q->in_use < q->size &&
1946 	       (m = mbufq_dequeue(&q->sendq)) != NULL) {
1947 
1948 		write_imm(&q->desc[q->pidx], m->m_data, m->m_len, q->gen);
1949 		m_free(m);
1950 
1951 		if (++q->pidx >= q->size) {
1952 			q->pidx = 0;
1953 			q->gen ^= 1;
1954 		}
1955 		q->in_use++;
1956 	}
1957 	if (!mbufq_empty(&q->sendq)) {
1958 		setbit(&qs->txq_stopped, TXQ_CTRL);
1959 
1960 		if (should_restart_tx(q) &&
1961 		    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped))
1962 			goto again;
1963 		q->stops++;
1964 	}
1965 	TXQ_UNLOCK(qs);
1966 	t3_write_reg(adap, A_SG_KDOORBELL,
1967 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1968 }
1969 
1970 
1971 /*
1972  * Send a management message through control queue 0
1973  */
1974 int
1975 t3_mgmt_tx(struct adapter *adap, struct mbuf *m)
1976 {
1977 	return ctrl_xmit(adap, &adap->sge.qs[0], m);
1978 }
1979 
/**
 *	t3_free_qset - free the resources of an SGE queue set
 *	@sc: the controller owning the queue set
 *	@q: the queue set
 *
 *	Release the HW and SW resources associated with an SGE queue set, such
 *	as HW contexts, packet buffers, and descriptor rings.  Traffic to the
 *	queue set must be quiesced prior to calling this.
 *
 *	Must be entered with @q's lock held: the lock is released and
 *	destroyed here.  On return the whole queue set structure has been
 *	zeroed.
 */
static void
t3_free_qset(adapter_t *sc, struct sge_qset *q)
{
	int i;

	/* Reclaim whatever Ethernet Tx work has completed, then drop the rings. */
	reclaim_completed_tx(q, 0, TXQ_ETH);
	if (q->txq[TXQ_ETH].txq_mr != NULL)
		buf_ring_free(q->txq[TXQ_ETH].txq_mr, M_DEVBUF);
	if (q->txq[TXQ_ETH].txq_ifq != NULL) {
		ifq_delete(q->txq[TXQ_ETH].txq_ifq);
		free(q->txq[TXQ_ETH].txq_ifq, M_DEVBUF);
	}

	/* Free lists: disable the HW context, then release DMA resources. */
	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
		if (q->fl[i].desc) {
			mtx_lock_spin(&sc->sge.reg_lock);
			t3_sge_disable_fl(sc, q->fl[i].cntxt_id);
			mtx_unlock_spin(&sc->sge.reg_lock);
			bus_dmamap_unload(q->fl[i].desc_tag, q->fl[i].desc_map);
			bus_dmamem_free(q->fl[i].desc_tag, q->fl[i].desc,
					q->fl[i].desc_map);
			bus_dma_tag_destroy(q->fl[i].desc_tag);
			bus_dma_tag_destroy(q->fl[i].entry_tag);
		}
		if (q->fl[i].sdesc) {
			free_rx_bufs(sc, &q->fl[i]);
			free(q->fl[i].sdesc, M_DEVBUF);
		}
	}

	/* The caller's hold on the queue lock ends here. */
	mtx_unlock(&q->lock);
	MTX_DESTROY(&q->lock);
	/* Tx queues: disable each egress context, then release its ring. */
	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
		if (q->txq[i].desc) {
			mtx_lock_spin(&sc->sge.reg_lock);
			t3_sge_enable_ecntxt(sc, q->txq[i].cntxt_id, 0);
			mtx_unlock_spin(&sc->sge.reg_lock);
			bus_dmamap_unload(q->txq[i].desc_tag,
					q->txq[i].desc_map);
			bus_dmamem_free(q->txq[i].desc_tag, q->txq[i].desc,
					q->txq[i].desc_map);
			bus_dma_tag_destroy(q->txq[i].desc_tag);
			bus_dma_tag_destroy(q->txq[i].entry_tag);
		}
		if (q->txq[i].sdesc) {
			free(q->txq[i].sdesc, M_DEVBUF);
		}
	}

	/* Response queue: disable its context, free the ring and its lock. */
	if (q->rspq.desc) {
		mtx_lock_spin(&sc->sge.reg_lock);
		t3_sge_disable_rspcntxt(sc, q->rspq.cntxt_id);
		mtx_unlock_spin(&sc->sge.reg_lock);

		bus_dmamap_unload(q->rspq.desc_tag, q->rspq.desc_map);
		bus_dmamem_free(q->rspq.desc_tag, q->rspq.desc,
			        q->rspq.desc_map);
		bus_dma_tag_destroy(q->rspq.desc_tag);
		MTX_DESTROY(&q->rspq.lock);
	}

#if defined(INET6) || defined(INET)
	tcp_lro_free(&q->lro.ctrl);
#endif

	/* Leave no stale pointers behind. */
	bzero(q, sizeof(*q));
}
2056 
2057 /**
2058  *	t3_free_sge_resources - free SGE resources
2059  *	@sc: the adapter softc
2060  *
2061  *	Frees resources used by the SGE queue sets.
2062  */
2063 void
2064 t3_free_sge_resources(adapter_t *sc, int nqsets)
2065 {
2066 	int i;
2067 
2068 	for (i = 0; i < nqsets; ++i) {
2069 		TXQ_LOCK(&sc->sge.qs[i]);
2070 		t3_free_qset(sc, &sc->sge.qs[i]);
2071 	}
2072 }
2073 
2074 /**
2075  *	t3_sge_start - enable SGE
2076  *	@sc: the controller softc
2077  *
2078  *	Enables the SGE for DMAs.  This is the last step in starting packet
2079  *	transfers.
2080  */
2081 void
2082 t3_sge_start(adapter_t *sc)
2083 {
2084 	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE);
2085 }
2086 
2087 /**
2088  *	t3_sge_stop - disable SGE operation
2089  *	@sc: the adapter
2090  *
2091  *	Disables the DMA engine.  This can be called in emeregencies (e.g.,
2092  *	from error interrupts) or from normal process context.  In the latter
2093  *	case it also disables any pending queue restart tasklets.  Note that
2094  *	if it is called in interrupt context it cannot disable the restart
2095  *	tasklets as it cannot wait, however the tasklets will have no effect
2096  *	since the doorbells are disabled and the driver will call this again
2097  *	later from process context, at which time the tasklets will be stopped
2098  *	if they are still running.
2099  */
2100 void
2101 t3_sge_stop(adapter_t *sc)
2102 {
2103 
2104 	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, 0);
2105 }
2106 
2107 /**
2108  *	t3_free_tx_desc - reclaims Tx descriptors and their buffers
2109  *	@adapter: the adapter
2110  *	@q: the Tx queue to reclaim descriptors from
2111  *	@reclaimable: the number of descriptors to reclaim
2112  *      @m_vec_size: maximum number of buffers to reclaim
2113  *      @desc_reclaimed: returns the number of descriptors reclaimed
2114  *
2115  *	Reclaims Tx descriptors from an SGE Tx queue and frees the associated
2116  *	Tx buffers.  Called with the Tx queue lock held.
2117  *
2118  *      Returns number of buffers of reclaimed
2119  */
2120 void
2121 t3_free_tx_desc(struct sge_qset *qs, int reclaimable, int queue)
2122 {
2123 	struct tx_sw_desc *txsd;
2124 	unsigned int cidx, mask;
2125 	struct sge_txq *q = &qs->txq[queue];
2126 
2127 #ifdef T3_TRACE
2128 	T3_TRACE2(sc->tb[q->cntxt_id & 7],
2129 		  "reclaiming %u Tx descriptors at cidx %u", reclaimable, cidx);
2130 #endif
2131 	cidx = q->cidx;
2132 	mask = q->size - 1;
2133 	txsd = &q->sdesc[cidx];
2134 
2135 	mtx_assert(&qs->lock, MA_OWNED);
2136 	while (reclaimable--) {
2137 		prefetch(q->sdesc[(cidx + 1) & mask].m);
2138 		prefetch(q->sdesc[(cidx + 2) & mask].m);
2139 
2140 		if (txsd->m != NULL) {
2141 			if (txsd->flags & TX_SW_DESC_MAPPED) {
2142 				bus_dmamap_unload(q->entry_tag, txsd->map);
2143 				txsd->flags &= ~TX_SW_DESC_MAPPED;
2144 			}
2145 			m_freem_list(txsd->m);
2146 			txsd->m = NULL;
2147 		} else
2148 			q->txq_skipped++;
2149 
2150 		++txsd;
2151 		if (++cidx == q->size) {
2152 			cidx = 0;
2153 			txsd = q->sdesc;
2154 		}
2155 	}
2156 	q->cidx = cidx;
2157 
2158 }
2159 
2160 /**
2161  *	is_new_response - check if a response is newly written
2162  *	@r: the response descriptor
2163  *	@q: the response queue
2164  *
2165  *	Returns true if a response descriptor contains a yet unprocessed
2166  *	response.
2167  */
2168 static __inline int
2169 is_new_response(const struct rsp_desc *r,
2170     const struct sge_rspq *q)
2171 {
2172 	return (r->intr_gen & F_RSPD_GEN2) == q->gen;
2173 }
2174 
/* Response flag bits that carry a GTS indication for Tx queue 0 or 1. */
#define RSPD_GTS_MASK	(F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS)

/*
 * Every response flag bit that carries Tx control information: the GTS bits
 * plus the full credit-return fields of Tx queues 0, 1 and 2.
 */
#define RSPD_CTRL_MASK					\
	(RSPD_GTS_MASK |				\
	 V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) |		\
	 V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) |		\
	 V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR))

/* How long to delay the next interrupt in case of memory shortage, in 0.1us. */
#define NOMEM_INTR_DELAY	2500
2183 
2184 #ifdef TCP_OFFLOAD
2185 /**
2186  *	write_ofld_wr - write an offload work request
2187  *	@adap: the adapter
2188  *	@m: the packet to send
2189  *	@q: the Tx queue
2190  *	@pidx: index of the first Tx descriptor to write
2191  *	@gen: the generation value to use
2192  *	@ndesc: number of descriptors the packet will occupy
2193  *
2194  *	Write an offload work request to send the supplied packet.  The packet
2195  *	data already carry the work request with most fields populated.
2196  */
2197 static void
2198 write_ofld_wr(adapter_t *adap, struct mbuf *m, struct sge_txq *q,
2199     unsigned int pidx, unsigned int gen, unsigned int ndesc)
2200 {
2201 	unsigned int sgl_flits, flits;
2202 	int i, idx, nsegs, wrlen;
2203 	struct work_request_hdr *from;
2204 	struct sg_ent *sgp, t3sgl[TX_MAX_SEGS / 2 + 1];
2205 	struct tx_desc *d = &q->desc[pidx];
2206 	struct txq_state txqs;
2207 	struct sglist_seg *segs;
2208 	struct ofld_hdr *oh = mtod(m, struct ofld_hdr *);
2209 	struct sglist *sgl;
2210 
2211 	from = (void *)(oh + 1);	/* Start of WR within mbuf */
2212 	wrlen = m->m_len - sizeof(*oh);
2213 
2214 	if (!(oh->flags & F_HDR_SGL)) {
2215 		write_imm(d, (caddr_t)from, wrlen, gen);
2216 
2217 		/*
2218 		 * mbuf with "real" immediate tx data will be enqueue_wr'd by
2219 		 * t3_push_frames and freed in wr_ack.  Others, like those sent
2220 		 * down by close_conn, t3_send_reset, etc. should be freed here.
2221 		 */
2222 		if (!(oh->flags & F_HDR_DF))
2223 			m_free(m);
2224 		return;
2225 	}
2226 
2227 	memcpy(&d->flit[1], &from[1], wrlen - sizeof(*from));
2228 
2229 	sgl = oh->sgl;
2230 	flits = wrlen / 8;
2231 	sgp = (ndesc == 1) ? (struct sg_ent *)&d->flit[flits] : t3sgl;
2232 
2233 	nsegs = sgl->sg_nseg;
2234 	segs = sgl->sg_segs;
2235 	for (idx = 0, i = 0; i < nsegs; i++) {
2236 		KASSERT(segs[i].ss_len, ("%s: 0 len in sgl", __func__));
2237 		if (i && idx == 0)
2238 			++sgp;
2239 		sgp->len[idx] = htobe32(segs[i].ss_len);
2240 		sgp->addr[idx] = htobe64(segs[i].ss_paddr);
2241 		idx ^= 1;
2242 	}
2243 	if (idx) {
2244 		sgp->len[idx] = 0;
2245 		sgp->addr[idx] = 0;
2246 	}
2247 
2248 	sgl_flits = sgl_len(nsegs);
2249 	txqs.gen = gen;
2250 	txqs.pidx = pidx;
2251 	txqs.compl = 0;
2252 
2253 	write_wr_hdr_sgl(ndesc, d, &txqs, q, t3sgl, flits, sgl_flits,
2254 	    from->wrh_hi, from->wrh_lo);
2255 }
2256 
2257 /**
2258  *	ofld_xmit - send a packet through an offload queue
2259  *	@adap: the adapter
2260  *	@q: the Tx offload queue
2261  *	@m: the packet
2262  *
2263  *	Send an offload packet through an SGE offload queue.
2264  */
2265 static int
2266 ofld_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m)
2267 {
2268 	int ret;
2269 	unsigned int ndesc;
2270 	unsigned int pidx, gen;
2271 	struct sge_txq *q = &qs->txq[TXQ_OFLD];
2272 	struct ofld_hdr *oh = mtod(m, struct ofld_hdr *);
2273 
2274 	ndesc = G_HDR_NDESC(oh->flags);
2275 
2276 	TXQ_LOCK(qs);
2277 again:	reclaim_completed_tx(qs, 16, TXQ_OFLD);
2278 	ret = check_desc_avail(adap, q, m, ndesc, TXQ_OFLD);
2279 	if (__predict_false(ret)) {
2280 		if (ret == 1) {
2281 			TXQ_UNLOCK(qs);
2282 			return (EINTR);
2283 		}
2284 		goto again;
2285 	}
2286 
2287 	gen = q->gen;
2288 	q->in_use += ndesc;
2289 	pidx = q->pidx;
2290 	q->pidx += ndesc;
2291 	if (q->pidx >= q->size) {
2292 		q->pidx -= q->size;
2293 		q->gen ^= 1;
2294 	}
2295 
2296 	write_ofld_wr(adap, m, q, pidx, gen, ndesc);
2297 	check_ring_tx_db(adap, q, 1);
2298 	TXQ_UNLOCK(qs);
2299 
2300 	return (0);
2301 }
2302 
2303 /**
2304  *	restart_offloadq - restart a suspended offload queue
2305  *	@qs: the queue set cotaining the offload queue
2306  *
2307  *	Resumes transmission on a suspended Tx offload queue.
2308  */
2309 static void
2310 restart_offloadq(void *data, int npending)
2311 {
2312 	struct mbuf *m;
2313 	struct sge_qset *qs = data;
2314 	struct sge_txq *q = &qs->txq[TXQ_OFLD];
2315 	adapter_t *adap = qs->port->adapter;
2316 
2317 	TXQ_LOCK(qs);
2318 again:
2319 	while ((m = mbufq_first(&q->sendq)) != NULL) {
2320 		unsigned int gen, pidx;
2321 		struct ofld_hdr *oh = mtod(m, struct ofld_hdr *);
2322 		unsigned int ndesc = G_HDR_NDESC(oh->flags);
2323 
2324 		if (__predict_false(q->size - q->in_use < ndesc)) {
2325 			setbit(&qs->txq_stopped, TXQ_OFLD);
2326 			if (should_restart_tx(q) &&
2327 			    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped))
2328 				goto again;
2329 			q->stops++;
2330 			break;
2331 		}
2332 
2333 		gen = q->gen;
2334 		q->in_use += ndesc;
2335 		pidx = q->pidx;
2336 		q->pidx += ndesc;
2337 		if (q->pidx >= q->size) {
2338 			q->pidx -= q->size;
2339 			q->gen ^= 1;
2340 		}
2341 
2342 		(void)mbufq_dequeue(&q->sendq);
2343 		TXQ_UNLOCK(qs);
2344 		write_ofld_wr(adap, m, q, pidx, gen, ndesc);
2345 		TXQ_LOCK(qs);
2346 	}
2347 #if USE_GTS
2348 	set_bit(TXQ_RUNNING, &q->flags);
2349 	set_bit(TXQ_LAST_PKT_DB, &q->flags);
2350 #endif
2351 	TXQ_UNLOCK(qs);
2352 	wmb();
2353 	t3_write_reg(adap, A_SG_KDOORBELL,
2354 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
2355 }
2356 
2357 /**
2358  *	t3_offload_tx - send an offload packet
2359  *	@m: the packet
2360  *
2361  *	Sends an offload packet.  We use the packet priority to select the
2362  *	appropriate Tx queue as follows: bit 0 indicates whether the packet
2363  *	should be sent as regular or control, bits 1-3 select the queue set.
2364  */
2365 int
2366 t3_offload_tx(struct adapter *sc, struct mbuf *m)
2367 {
2368 	struct ofld_hdr *oh = mtod(m, struct ofld_hdr *);
2369 	struct sge_qset *qs = &sc->sge.qs[G_HDR_QSET(oh->flags)];
2370 
2371 	if (oh->flags & F_HDR_CTRL) {
2372 		m_adj(m, sizeof (*oh));	/* trim ofld_hdr off */
2373 		return (ctrl_xmit(sc, qs, m));
2374 	} else
2375 		return (ofld_xmit(sc, qs, m));
2376 }
2377 #endif
2378 
2379 static void
2380 restart_tx(struct sge_qset *qs)
2381 {
2382 	struct adapter *sc = qs->port->adapter;
2383 
2384 	if (isset(&qs->txq_stopped, TXQ_OFLD) &&
2385 	    should_restart_tx(&qs->txq[TXQ_OFLD]) &&
2386 	    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) {
2387 		qs->txq[TXQ_OFLD].restarts++;
2388 		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2389 	}
2390 
2391 	if (isset(&qs->txq_stopped, TXQ_CTRL) &&
2392 	    should_restart_tx(&qs->txq[TXQ_CTRL]) &&
2393 	    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) {
2394 		qs->txq[TXQ_CTRL].restarts++;
2395 		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2396 	}
2397 }
2398 
2399 /**
2400  *	t3_sge_alloc_qset - initialize an SGE queue set
2401  *	@sc: the controller softc
2402  *	@id: the queue set id
2403  *	@nports: how many Ethernet ports will be using this queue set
2404  *	@irq_vec_idx: the IRQ vector index for response queue interrupts
2405  *	@p: configuration parameters for this queue set
2406  *	@ntxq: number of Tx queues for the queue set
2407  *	@pi: port info for queue set
2408  *
2409  *	Allocate resources and initialize an SGE queue set.  A queue set
2410  *	comprises a response queue, two Rx free-buffer queues, and up to 3
2411  *	Tx queues.  The Tx queues are assigned roles in the order Ethernet
2412  *	queue, offload queue, and control queue.
2413  */
2414 int
2415 t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx,
2416 		  const struct qset_params *p, int ntxq, struct port_info *pi)
2417 {
2418 	struct sge_qset *q = &sc->sge.qs[id];
2419 	int i, ret = 0;
2420 
2421 	MTX_INIT(&q->lock, q->namebuf, NULL, MTX_DEF);
2422 	q->port = pi;
2423 	q->adap = sc;
2424 
2425 	if ((q->txq[TXQ_ETH].txq_mr = buf_ring_alloc(cxgb_txq_buf_ring_size,
2426 	    M_DEVBUF, M_WAITOK, &q->lock)) == NULL) {
2427 		device_printf(sc->dev, "failed to allocate mbuf ring\n");
2428 		goto err;
2429 	}
2430 	if ((q->txq[TXQ_ETH].txq_ifq = malloc(sizeof(struct ifaltq), M_DEVBUF,
2431 	    M_NOWAIT | M_ZERO)) == NULL) {
2432 		device_printf(sc->dev, "failed to allocate ifq\n");
2433 		goto err;
2434 	}
2435 	ifq_init(q->txq[TXQ_ETH].txq_ifq, pi->ifp);
2436 	callout_init(&q->txq[TXQ_ETH].txq_timer, 1);
2437 	callout_init(&q->txq[TXQ_ETH].txq_watchdog, 1);
2438 	q->txq[TXQ_ETH].txq_timer.c_cpu = id % mp_ncpus;
2439 	q->txq[TXQ_ETH].txq_watchdog.c_cpu = id % mp_ncpus;
2440 
2441 	init_qset_cntxt(q, id);
2442 	q->idx = id;
2443 	if ((ret = alloc_ring(sc, p->fl_size, sizeof(struct rx_desc),
2444 		    sizeof(struct rx_sw_desc), &q->fl[0].phys_addr,
2445 		    &q->fl[0].desc, &q->fl[0].sdesc,
2446 		    &q->fl[0].desc_tag, &q->fl[0].desc_map,
2447 		    sc->rx_dmat, &q->fl[0].entry_tag)) != 0) {
2448 		printf("error %d from alloc ring fl0\n", ret);
2449 		goto err;
2450 	}
2451 
2452 	if ((ret = alloc_ring(sc, p->jumbo_size, sizeof(struct rx_desc),
2453 		    sizeof(struct rx_sw_desc), &q->fl[1].phys_addr,
2454 		    &q->fl[1].desc, &q->fl[1].sdesc,
2455 		    &q->fl[1].desc_tag, &q->fl[1].desc_map,
2456 		    sc->rx_jumbo_dmat, &q->fl[1].entry_tag)) != 0) {
2457 		printf("error %d from alloc ring fl1\n", ret);
2458 		goto err;
2459 	}
2460 
2461 	if ((ret = alloc_ring(sc, p->rspq_size, sizeof(struct rsp_desc), 0,
2462 		    &q->rspq.phys_addr, &q->rspq.desc, NULL,
2463 		    &q->rspq.desc_tag, &q->rspq.desc_map,
2464 		    NULL, NULL)) != 0) {
2465 		printf("error %d from alloc ring rspq\n", ret);
2466 		goto err;
2467 	}
2468 
2469 	snprintf(q->rspq.lockbuf, RSPQ_NAME_LEN, "t3 rspq lock %d:%d",
2470 	    device_get_unit(sc->dev), irq_vec_idx);
2471 	MTX_INIT(&q->rspq.lock, q->rspq.lockbuf, NULL, MTX_DEF);
2472 
2473 	for (i = 0; i < ntxq; ++i) {
2474 		size_t sz = i == TXQ_CTRL ? 0 : sizeof(struct tx_sw_desc);
2475 
2476 		if ((ret = alloc_ring(sc, p->txq_size[i],
2477 			    sizeof(struct tx_desc), sz,
2478 			    &q->txq[i].phys_addr, &q->txq[i].desc,
2479 			    &q->txq[i].sdesc, &q->txq[i].desc_tag,
2480 			    &q->txq[i].desc_map,
2481 			    sc->tx_dmat, &q->txq[i].entry_tag)) != 0) {
2482 			printf("error %d from alloc ring tx %i\n", ret, i);
2483 			goto err;
2484 		}
2485 		mbufq_init(&q->txq[i].sendq, INT_MAX);
2486 		q->txq[i].gen = 1;
2487 		q->txq[i].size = p->txq_size[i];
2488 	}
2489 
2490 #ifdef TCP_OFFLOAD
2491 	TASK_INIT(&q->txq[TXQ_OFLD].qresume_task, 0, restart_offloadq, q);
2492 #endif
2493 	TASK_INIT(&q->txq[TXQ_CTRL].qresume_task, 0, restart_ctrlq, q);
2494 	TASK_INIT(&q->txq[TXQ_ETH].qreclaim_task, 0, sge_txq_reclaim_handler, q);
2495 	TASK_INIT(&q->txq[TXQ_OFLD].qreclaim_task, 0, sge_txq_reclaim_handler, q);
2496 
2497 	q->fl[0].gen = q->fl[1].gen = 1;
2498 	q->fl[0].size = p->fl_size;
2499 	q->fl[1].size = p->jumbo_size;
2500 
2501 	q->rspq.gen = 1;
2502 	q->rspq.cidx = 0;
2503 	q->rspq.size = p->rspq_size;
2504 
2505 	q->txq[TXQ_ETH].stop_thres = nports *
2506 	    flits_to_desc(sgl_len(TX_MAX_SEGS + 1) + 3);
2507 
2508 	q->fl[0].buf_size = MCLBYTES;
2509 	q->fl[0].zone = zone_pack;
2510 	q->fl[0].type = EXT_PACKET;
2511 
2512 	if (p->jumbo_buf_size ==  MJUM16BYTES) {
2513 		q->fl[1].zone = zone_jumbo16;
2514 		q->fl[1].type = EXT_JUMBO16;
2515 	} else if (p->jumbo_buf_size ==  MJUM9BYTES) {
2516 		q->fl[1].zone = zone_jumbo9;
2517 		q->fl[1].type = EXT_JUMBO9;
2518 	} else if (p->jumbo_buf_size ==  MJUMPAGESIZE) {
2519 		q->fl[1].zone = zone_jumbop;
2520 		q->fl[1].type = EXT_JUMBOP;
2521 	} else {
2522 		KASSERT(0, ("can't deal with jumbo_buf_size %d.", p->jumbo_buf_size));
2523 		ret = EDOOFUS;
2524 		goto err;
2525 	}
2526 	q->fl[1].buf_size = p->jumbo_buf_size;
2527 
2528 	/* Allocate and setup the lro_ctrl structure */
2529 	q->lro.enabled = !!(if_getcapenable(pi->ifp) & IFCAP_LRO);
2530 #if defined(INET6) || defined(INET)
2531 	ret = tcp_lro_init(&q->lro.ctrl);
2532 	if (ret) {
2533 		printf("error %d from tcp_lro_init\n", ret);
2534 		goto err;
2535 	}
2536 #endif
2537 	q->lro.ctrl.ifp = pi->ifp;
2538 
2539 	mtx_lock_spin(&sc->sge.reg_lock);
2540 	ret = -t3_sge_init_rspcntxt(sc, q->rspq.cntxt_id, irq_vec_idx,
2541 				   q->rspq.phys_addr, q->rspq.size,
2542 				   q->fl[0].buf_size, 1, 0);
2543 	if (ret) {
2544 		printf("error %d from t3_sge_init_rspcntxt\n", ret);
2545 		goto err_unlock;
2546 	}
2547 
2548 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
2549 		ret = -t3_sge_init_flcntxt(sc, q->fl[i].cntxt_id, 0,
2550 					  q->fl[i].phys_addr, q->fl[i].size,
2551 					  q->fl[i].buf_size, p->cong_thres, 1,
2552 					  0);
2553 		if (ret) {
2554 			printf("error %d from t3_sge_init_flcntxt for index i=%d\n", ret, i);
2555 			goto err_unlock;
2556 		}
2557 	}
2558 
2559 	ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_ETH].cntxt_id, USE_GTS,
2560 				 SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr,
2561 				 q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token,
2562 				 1, 0);
2563 	if (ret) {
2564 		printf("error %d from t3_sge_init_ecntxt\n", ret);
2565 		goto err_unlock;
2566 	}
2567 
2568 	if (ntxq > 1) {
2569 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_OFLD].cntxt_id,
2570 					 USE_GTS, SGE_CNTXT_OFLD, id,
2571 					 q->txq[TXQ_OFLD].phys_addr,
2572 					 q->txq[TXQ_OFLD].size, 0, 1, 0);
2573 		if (ret) {
2574 			printf("error %d from t3_sge_init_ecntxt\n", ret);
2575 			goto err_unlock;
2576 		}
2577 	}
2578 
2579 	if (ntxq > 2) {
2580 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_CTRL].cntxt_id, 0,
2581 					 SGE_CNTXT_CTRL, id,
2582 					 q->txq[TXQ_CTRL].phys_addr,
2583 					 q->txq[TXQ_CTRL].size,
2584 					 q->txq[TXQ_CTRL].token, 1, 0);
2585 		if (ret) {
2586 			printf("error %d from t3_sge_init_ecntxt\n", ret);
2587 			goto err_unlock;
2588 		}
2589 	}
2590 
2591 	mtx_unlock_spin(&sc->sge.reg_lock);
2592 	t3_update_qset_coalesce(q, p);
2593 
2594 	refill_fl(sc, &q->fl[0], q->fl[0].size);
2595 	refill_fl(sc, &q->fl[1], q->fl[1].size);
2596 	refill_rspq(sc, &q->rspq, q->rspq.size - 1);
2597 
2598 	t3_write_reg(sc, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) |
2599 		     V_NEWTIMER(q->rspq.holdoff_tmr));
2600 
2601 	return (0);
2602 
2603 err_unlock:
2604 	mtx_unlock_spin(&sc->sge.reg_lock);
2605 err:
2606 	TXQ_LOCK(q);
2607 	t3_free_qset(sc, q);
2608 
2609 	return (ret);
2610 }
2611 
2612 /*
2613  * Remove CPL_RX_PKT headers from the mbuf and reduce it to a regular mbuf with
2614  * ethernet data.  Hardware assistance with various checksums and any vlan tag
2615  * will also be taken into account here.
2616  */
2617 void
2618 t3_rx_eth(struct adapter *adap, struct mbuf *m, int ethpad)
2619 {
2620 	struct cpl_rx_pkt *cpl = (struct cpl_rx_pkt *)(mtod(m, uint8_t *) + ethpad);
2621 	struct port_info *pi = &adap->port[adap->rxpkt_map[cpl->iff]];
2622 	if_t ifp = pi->ifp;
2623 
2624 	if (cpl->vlan_valid) {
2625 		m->m_pkthdr.ether_vtag = ntohs(cpl->vlan);
2626 		m->m_flags |= M_VLANTAG;
2627 	}
2628 
2629 	m->m_pkthdr.rcvif = ifp;
2630 	/*
2631 	 * adjust after conversion to mbuf chain
2632 	 */
2633 	m->m_pkthdr.len -= (sizeof(*cpl) + ethpad);
2634 	m->m_len -= (sizeof(*cpl) + ethpad);
2635 	m->m_data += (sizeof(*cpl) + ethpad);
2636 
2637 	if (!cpl->fragment && cpl->csum_valid && cpl->csum == 0xffff) {
2638 		struct ether_header *eh = mtod(m, void *);
2639 		uint16_t eh_type;
2640 
2641 		if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2642 			struct ether_vlan_header *evh = mtod(m, void *);
2643 
2644 			eh_type = evh->evl_proto;
2645 		} else
2646 			eh_type = eh->ether_type;
2647 
2648 		if (if_getcapenable(ifp) & IFCAP_RXCSUM &&
2649 		    eh_type == htons(ETHERTYPE_IP)) {
2650 			m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED |
2651 			    CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2652 			m->m_pkthdr.csum_data = 0xffff;
2653 		} else if (if_getcapenable(ifp) & IFCAP_RXCSUM_IPV6 &&
2654 		    eh_type == htons(ETHERTYPE_IPV6)) {
2655 			m->m_pkthdr.csum_flags = (CSUM_DATA_VALID_IPV6 |
2656 			    CSUM_PSEUDO_HDR);
2657 			m->m_pkthdr.csum_data = 0xffff;
2658 		}
2659 	}
2660 }
2661 
2662 /**
2663  *	get_packet - return the next ingress packet buffer from a free list
2664  *	@adap: the adapter that received the packet
2665  *	@drop_thres: # of remaining buffers before we start dropping packets
2666  *	@qs: the qset that the SGE free list holding the packet belongs to
2667  *      @mh: the mbuf header, contains a pointer to the head and tail of the mbuf chain
2668  *      @r: response descriptor
2669  *
2670  *	Get the next packet from a free list and complete setup of the
2671  *	sk_buff.  If the packet is small we make a copy and recycle the
2672  *	original buffer, otherwise we use the original buffer itself.  If a
2673  *	positive drop threshold is supplied packets are dropped and their
2674  *	buffers recycled if (a) the number of remaining buffers is under the
2675  *	threshold and the packet is too big to copy, or (b) the packet should
2676  *	be copied but there is no memory for the copy.
2677  */
2678 static int
2679 get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
2680     struct t3_mbuf_hdr *mh, struct rsp_desc *r)
2681 {
2682 
2683 	unsigned int len_cq =  ntohl(r->len_cq);
2684 	struct sge_fl *fl = (len_cq & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
2685 	int mask, cidx = fl->cidx;
2686 	struct rx_sw_desc *sd = &fl->sdesc[cidx];
2687 	uint32_t len = G_RSPD_LEN(len_cq);
2688 	uint32_t flags = M_EXT;
2689 	uint8_t sopeop = G_RSPD_SOP_EOP(ntohl(r->flags));
2690 	caddr_t cl;
2691 	struct mbuf *m;
2692 	int ret = 0;
2693 
2694 	mask = fl->size - 1;
2695 	prefetch(fl->sdesc[(cidx + 1) & mask].m);
2696 	prefetch(fl->sdesc[(cidx + 2) & mask].m);
2697 	prefetch(fl->sdesc[(cidx + 1) & mask].rxsd_cl);
2698 	prefetch(fl->sdesc[(cidx + 2) & mask].rxsd_cl);
2699 
2700 	fl->credits--;
2701 	bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD);
2702 
2703 	if (recycle_enable && len <= SGE_RX_COPY_THRES &&
2704 	    sopeop == RSPQ_SOP_EOP) {
2705 		if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL)
2706 			goto skip_recycle;
2707 		cl = mtod(m, void *);
2708 		memcpy(cl, sd->rxsd_cl, len);
2709 		recycle_rx_buf(adap, fl, fl->cidx);
2710 		m->m_pkthdr.len = m->m_len = len;
2711 		m->m_flags = 0;
2712 		mh->mh_head = mh->mh_tail = m;
2713 		ret = 1;
2714 		goto done;
2715 	} else {
2716 	skip_recycle:
2717 		bus_dmamap_unload(fl->entry_tag, sd->map);
2718 		cl = sd->rxsd_cl;
2719 		m = sd->m;
2720 
2721 		if ((sopeop == RSPQ_SOP_EOP) ||
2722 		    (sopeop == RSPQ_SOP))
2723 			flags |= M_PKTHDR;
2724 		m_init(m, M_NOWAIT, MT_DATA, flags);
2725 		if (fl->zone == zone_pack) {
2726 			/*
2727 			 * restore clobbered data pointer
2728 			 */
2729 			m->m_data = m->m_ext.ext_buf;
2730 		} else {
2731 			m_cljset(m, cl, fl->type);
2732 		}
2733 		m->m_len = len;
2734 	}
2735 	switch(sopeop) {
2736 	case RSPQ_SOP_EOP:
2737 		ret = 1;
2738 		/* FALLTHROUGH */
2739 	case RSPQ_SOP:
2740 		mh->mh_head = mh->mh_tail = m;
2741 		m->m_pkthdr.len = len;
2742 		break;
2743 	case RSPQ_EOP:
2744 		ret = 1;
2745 		/* FALLTHROUGH */
2746 	case RSPQ_NSOP_NEOP:
2747 		if (mh->mh_tail == NULL) {
2748 			log(LOG_ERR, "discarding intermediate descriptor entry\n");
2749 			m_freem(m);
2750 			m = NULL;
2751 			break;
2752 		}
2753 		mh->mh_tail->m_next = m;
2754 		mh->mh_tail = m;
2755 		mh->mh_head->m_pkthdr.len += len;
2756 		break;
2757 	}
2758 	if (cxgb_debug && m != NULL)
2759 		printf("len=%d pktlen=%d\n", m->m_len, m->m_pkthdr.len);
2760 done:
2761 	if (++fl->cidx == fl->size)
2762 		fl->cidx = 0;
2763 
2764 	return (ret);
2765 }
2766 
2767 /**
2768  *	handle_rsp_cntrl_info - handles control information in a response
2769  *	@qs: the queue set corresponding to the response
2770  *	@flags: the response control flags
2771  *
2772  *	Handles the control information of an SGE response, such as GTS
2773  *	indications and completion credits for the queue set's Tx queues.
2774  *	HW coalesces credits, we don't do any extra SW coalescing.
2775  */
2776 static __inline void
2777 handle_rsp_cntrl_info(struct sge_qset *qs, uint32_t flags)
2778 {
2779 	unsigned int credits;
2780 
2781 #if USE_GTS
2782 	if (flags & F_RSPD_TXQ0_GTS)
2783 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags);
2784 #endif
2785 	credits = G_RSPD_TXQ0_CR(flags);
2786 	if (credits)
2787 		qs->txq[TXQ_ETH].processed += credits;
2788 
2789 	credits = G_RSPD_TXQ2_CR(flags);
2790 	if (credits)
2791 		qs->txq[TXQ_CTRL].processed += credits;
2792 
2793 # if USE_GTS
2794 	if (flags & F_RSPD_TXQ1_GTS)
2795 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags);
2796 # endif
2797 	credits = G_RSPD_TXQ1_CR(flags);
2798 	if (credits)
2799 		qs->txq[TXQ_OFLD].processed += credits;
2800 
2801 }
2802 
2803 static void
2804 check_ring_db(adapter_t *adap, struct sge_qset *qs,
2805     unsigned int sleeping)
2806 {
2807 	;
2808 }
2809 
2810 /**
2811  *	process_responses - process responses from an SGE response queue
2812  *	@adap: the adapter
2813  *	@qs: the queue set to which the response queue belongs
2814  *	@budget: how many responses can be processed in this round
2815  *
2816  *	Process responses from an SGE response queue up to the supplied budget.
2817  *	Responses include received packets as well as credits and other events
2818  *	for the queues that belong to the response queue's queue set.
2819  *	A negative budget is effectively unlimited.
2820  *
2821  *	Additionally choose the interrupt holdoff time for the next interrupt
2822  *	on this queue.  If the system is under memory shortage use a fairly
2823  *	long delay to help recovery.
2824  */
2825 static int
2826 process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
2827 {
2828 	struct sge_rspq *rspq = &qs->rspq;
2829 	struct rsp_desc *r = &rspq->desc[rspq->cidx];
2830 	int budget_left = budget;
2831 	unsigned int sleeping = 0;
2832 #if defined(INET6) || defined(INET)
2833 	int lro_enabled = qs->lro.enabled;
2834 	int skip_lro;
2835 	struct lro_ctrl *lro_ctrl = &qs->lro.ctrl;
2836 #endif
2837 	struct t3_mbuf_hdr *mh = &rspq->rspq_mh;
2838 #ifdef DEBUG
2839 	static int last_holdoff = 0;
2840 	if (cxgb_debug && rspq->holdoff_tmr != last_holdoff) {
2841 		printf("next_holdoff=%d\n", rspq->holdoff_tmr);
2842 		last_holdoff = rspq->holdoff_tmr;
2843 	}
2844 #endif
2845 	rspq->next_holdoff = rspq->holdoff_tmr;
2846 
2847 	while (__predict_true(budget_left && is_new_response(r, rspq))) {
2848 		int eth, eop = 0, ethpad = 0;
2849 		uint32_t flags = ntohl(r->flags);
2850 		uint32_t rss_hash = be32toh(r->rss_hdr.rss_hash_val);
2851 		uint8_t opcode = r->rss_hdr.opcode;
2852 
2853 		eth = (opcode == CPL_RX_PKT);
2854 
2855 		if (__predict_false(flags & F_RSPD_ASYNC_NOTIF)) {
2856 			struct mbuf *m;
2857 
2858 			if (cxgb_debug)
2859 				printf("async notification\n");
2860 
2861 			if (mh->mh_head == NULL) {
2862 				mh->mh_head = m_gethdr(M_NOWAIT, MT_DATA);
2863 				m = mh->mh_head;
2864 			} else {
2865 				m = m_gethdr(M_NOWAIT, MT_DATA);
2866 			}
2867 			if (m == NULL)
2868 				goto no_mem;
2869 
2870                         memcpy(mtod(m, char *), r, AN_PKT_SIZE);
2871 			m->m_len = m->m_pkthdr.len = AN_PKT_SIZE;
2872                         *mtod(m, uint8_t *) = CPL_ASYNC_NOTIF;
2873 			opcode = CPL_ASYNC_NOTIF;
2874 			eop = 1;
2875                         rspq->async_notif++;
2876 			goto skip;
2877 		} else if  (flags & F_RSPD_IMM_DATA_VALID) {
2878 			struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);
2879 
2880 			if (m == NULL) {
2881 		no_mem:
2882 				rspq->next_holdoff = NOMEM_INTR_DELAY;
2883 				budget_left--;
2884 				break;
2885 			}
2886 			if (mh->mh_head == NULL)
2887 				mh->mh_head = m;
2888                         else
2889 				mh->mh_tail->m_next = m;
2890 			mh->mh_tail = m;
2891 
2892 			get_imm_packet(adap, r, m);
2893 			mh->mh_head->m_pkthdr.len += m->m_len;
2894 			eop = 1;
2895 			rspq->imm_data++;
2896 		} else if (r->len_cq) {
2897 			int drop_thresh = eth ? SGE_RX_DROP_THRES : 0;
2898 
2899 			eop = get_packet(adap, drop_thresh, qs, mh, r);
2900 			if (eop) {
2901 				if (r->rss_hdr.hash_type && !adap->timestamp) {
2902 					M_HASHTYPE_SET(mh->mh_head,
2903 					    M_HASHTYPE_OPAQUE_HASH);
2904 					mh->mh_head->m_pkthdr.flowid = rss_hash;
2905 				}
2906 			}
2907 
2908 			ethpad = 2;
2909 		} else {
2910 			rspq->pure_rsps++;
2911 		}
2912 	skip:
2913 		if (flags & RSPD_CTRL_MASK) {
2914 			sleeping |= flags & RSPD_GTS_MASK;
2915 			handle_rsp_cntrl_info(qs, flags);
2916 		}
2917 
2918 		if (!eth && eop) {
2919 			rspq->offload_pkts++;
2920 #ifdef TCP_OFFLOAD
2921 			adap->cpl_handler[opcode](qs, r, mh->mh_head);
2922 #else
2923 			m_freem(mh->mh_head);
2924 #endif
2925 			mh->mh_head = NULL;
2926 		} else if (eth && eop) {
2927 			struct mbuf *m = mh->mh_head;
2928 
2929 			t3_rx_eth(adap, m, ethpad);
2930 
2931 			/*
2932 			 * The T304 sends incoming packets on any qset.  If LRO
2933 			 * is also enabled, we could end up sending packet up
2934 			 * lro_ctrl->ifp's input.  That is incorrect.
2935 			 *
2936 			 * The mbuf's rcvif was derived from the cpl header and
2937 			 * is accurate.  Skip LRO and just use that.
2938 			 */
2939 #if defined(INET6) || defined(INET)
2940 			skip_lro = __predict_false(qs->port->ifp != m->m_pkthdr.rcvif);
2941 
2942 			if (lro_enabled && lro_ctrl->lro_cnt && !skip_lro
2943 			    && (tcp_lro_rx(lro_ctrl, m, 0) == 0)
2944 			    ) {
2945 				/* successfully queue'd for LRO */
2946 			} else
2947 #endif
2948 			{
2949 				/*
2950 				 * LRO not enabled, packet unsuitable for LRO,
2951 				 * or unable to queue.  Pass it up right now in
2952 				 * either case.
2953 				 */
2954 				if_t ifp = m->m_pkthdr.rcvif;
2955 				if_input(ifp, m);
2956 			}
2957 			mh->mh_head = NULL;
2958 
2959 		}
2960 
2961 		r++;
2962 		if (__predict_false(++rspq->cidx == rspq->size)) {
2963 			rspq->cidx = 0;
2964 			rspq->gen ^= 1;
2965 			r = rspq->desc;
2966 		}
2967 
2968 		if (++rspq->credits >= 64) {
2969 			refill_rspq(adap, rspq, rspq->credits);
2970 			rspq->credits = 0;
2971 		}
2972 		__refill_fl_lt(adap, &qs->fl[0], 32);
2973 		__refill_fl_lt(adap, &qs->fl[1], 32);
2974 		--budget_left;
2975 	}
2976 
2977 #if defined(INET6) || defined(INET)
2978 	/* Flush LRO */
2979 	tcp_lro_flush_all(lro_ctrl);
2980 #endif
2981 
2982 	if (sleeping)
2983 		check_ring_db(adap, qs, sleeping);
2984 
2985 	mb();  /* commit Tx queue processed updates */
2986 	if (__predict_false(qs->txq_stopped > 1))
2987 		restart_tx(qs);
2988 
2989 	__refill_fl_lt(adap, &qs->fl[0], 512);
2990 	__refill_fl_lt(adap, &qs->fl[1], 512);
2991 	budget -= budget_left;
2992 	return (budget);
2993 }
2994 
2995 /*
2996  * A helper function that processes responses and issues GTS.
2997  */
2998 static __inline int
2999 process_responses_gts(adapter_t *adap, struct sge_rspq *rq)
3000 {
3001 	int work;
3002 	static int last_holdoff = 0;
3003 
3004 	work = process_responses(adap, rspq_to_qset(rq), -1);
3005 
3006 	if (cxgb_debug && (rq->next_holdoff != last_holdoff)) {
3007 		printf("next_holdoff=%d\n", rq->next_holdoff);
3008 		last_holdoff = rq->next_holdoff;
3009 	}
3010 	t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) |
3011 	    V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx));
3012 
3013 	return (work);
3014 }
3015 
3016 #ifdef DEBUGNET
3017 int
3018 cxgb_debugnet_poll_rx(adapter_t *adap, struct sge_qset *qs)
3019 {
3020 
3021 	return (process_responses_gts(adap, &qs->rspq));
3022 }
3023 #endif
3024 
3025 /*
3026  * Interrupt handler for legacy INTx interrupts for T3B-based cards.
3027  * Handles data events from SGE response queues as well as error and other
3028  * async events as they all use the same interrupt pin.  We use one SGE
3029  * response queue per port in this mode and protect all response queues with
3030  * queue 0's lock.
3031  */
3032 void
3033 t3b_intr(void *data)
3034 {
3035 	uint32_t i, map;
3036 	adapter_t *adap = data;
3037 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
3038 
3039 	t3_write_reg(adap, A_PL_CLI, 0);
3040 	map = t3_read_reg(adap, A_SG_DATA_INTR);
3041 
3042 	if (!map)
3043 		return;
3044 
3045 	if (__predict_false(map & F_ERRINTR)) {
3046 		t3_write_reg(adap, A_PL_INT_ENABLE0, 0);
3047 		(void) t3_read_reg(adap, A_PL_INT_ENABLE0);
3048 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
3049 	}
3050 
3051 	mtx_lock(&q0->lock);
3052 	for_each_port(adap, i)
3053 	    if (map & (1 << i))
3054 			process_responses_gts(adap, &adap->sge.qs[i].rspq);
3055 	mtx_unlock(&q0->lock);
3056 }
3057 
3058 /*
3059  * The MSI interrupt handler.  This needs to handle data events from SGE
3060  * response queues as well as error and other async events as they all use
3061  * the same MSI vector.  We use one SGE response queue per port in this mode
3062  * and protect all response queues with queue 0's lock.
3063  */
3064 void
3065 t3_intr_msi(void *data)
3066 {
3067 	adapter_t *adap = data;
3068 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
3069 	int i, new_packets = 0;
3070 
3071 	mtx_lock(&q0->lock);
3072 
3073 	for_each_port(adap, i)
3074 	    if (process_responses_gts(adap, &adap->sge.qs[i].rspq))
3075 		    new_packets = 1;
3076 	mtx_unlock(&q0->lock);
3077 	if (new_packets == 0) {
3078 		t3_write_reg(adap, A_PL_INT_ENABLE0, 0);
3079 		(void) t3_read_reg(adap, A_PL_INT_ENABLE0);
3080 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
3081 	}
3082 }
3083 
3084 void
3085 t3_intr_msix(void *data)
3086 {
3087 	struct sge_qset *qs = data;
3088 	adapter_t *adap = qs->port->adapter;
3089 	struct sge_rspq *rspq = &qs->rspq;
3090 
3091 	if (process_responses_gts(adap, rspq) == 0)
3092 		rspq->unhandled_irqs++;
3093 }
3094 
3095 #define QDUMP_SBUF_SIZE		32 * 400
3096 static int
3097 t3_dump_rspq(SYSCTL_HANDLER_ARGS)
3098 {
3099 	struct sge_rspq *rspq;
3100 	struct sge_qset *qs;
3101 	int i, err, dump_end, idx;
3102 	struct sbuf *sb;
3103 	struct rsp_desc *rspd;
3104 	uint32_t data[4];
3105 
3106 	rspq = arg1;
3107 	qs = rspq_to_qset(rspq);
3108 	if (rspq->rspq_dump_count == 0)
3109 		return (0);
3110 	if (rspq->rspq_dump_count > RSPQ_Q_SIZE) {
3111 		log(LOG_WARNING,
3112 		    "dump count is too large %d\n", rspq->rspq_dump_count);
3113 		rspq->rspq_dump_count = 0;
3114 		return (EINVAL);
3115 	}
3116 	if (rspq->rspq_dump_start > (RSPQ_Q_SIZE-1)) {
3117 		log(LOG_WARNING,
3118 		    "dump start of %d is greater than queue size\n",
3119 		    rspq->rspq_dump_start);
3120 		rspq->rspq_dump_start = 0;
3121 		return (EINVAL);
3122 	}
3123 	err = t3_sge_read_rspq(qs->port->adapter, rspq->cntxt_id, data);
3124 	if (err)
3125 		return (err);
3126 	err = sysctl_wire_old_buffer(req, 0);
3127 	if (err)
3128 		return (err);
3129 	sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req);
3130 
3131 	sbuf_printf(sb, " \n index=%u size=%u MSI-X/RspQ=%u intr enable=%u intr armed=%u\n",
3132 	    (data[0] & 0xffff), data[0] >> 16, ((data[2] >> 20) & 0x3f),
3133 	    ((data[2] >> 26) & 1), ((data[2] >> 27) & 1));
3134 	sbuf_printf(sb, " generation=%u CQ mode=%u FL threshold=%u\n",
3135 	    ((data[2] >> 28) & 1), ((data[2] >> 31) & 1), data[3]);
3136 
3137 	sbuf_printf(sb, " start=%d -> end=%d\n", rspq->rspq_dump_start,
3138 	    (rspq->rspq_dump_start + rspq->rspq_dump_count) & (RSPQ_Q_SIZE-1));
3139 
3140 	dump_end = rspq->rspq_dump_start + rspq->rspq_dump_count;
3141 	for (i = rspq->rspq_dump_start; i < dump_end; i++) {
3142 		idx = i & (RSPQ_Q_SIZE-1);
3143 
3144 		rspd = &rspq->desc[idx];
3145 		sbuf_printf(sb, "\tidx=%04d opcode=%02x cpu_idx=%x hash_type=%x cq_idx=%x\n",
3146 		    idx, rspd->rss_hdr.opcode, rspd->rss_hdr.cpu_idx,
3147 		    rspd->rss_hdr.hash_type, be16toh(rspd->rss_hdr.cq_idx));
3148 		sbuf_printf(sb, "\trss_hash_val=%x flags=%08x len_cq=%x intr_gen=%x\n",
3149 		    rspd->rss_hdr.rss_hash_val, be32toh(rspd->flags),
3150 		    be32toh(rspd->len_cq), rspd->intr_gen);
3151 	}
3152 
3153 	err = sbuf_finish(sb);
3154 	sbuf_delete(sb);
3155 	return (err);
3156 }
3157 
3158 static int
3159 t3_dump_txq_eth(SYSCTL_HANDLER_ARGS)
3160 {
3161 	struct sge_txq *txq;
3162 	struct sge_qset *qs;
3163 	int i, j, err, dump_end;
3164 	struct sbuf *sb;
3165 	struct tx_desc *txd;
3166 	uint32_t *WR, wr_hi, wr_lo, gen;
3167 	uint32_t data[4];
3168 
3169 	txq = arg1;
3170 	qs = txq_to_qset(txq, TXQ_ETH);
3171 	if (txq->txq_dump_count == 0) {
3172 		return (0);
3173 	}
3174 	if (txq->txq_dump_count > TX_ETH_Q_SIZE) {
3175 		log(LOG_WARNING,
3176 		    "dump count is too large %d\n", txq->txq_dump_count);
3177 		txq->txq_dump_count = 1;
3178 		return (EINVAL);
3179 	}
3180 	if (txq->txq_dump_start > (TX_ETH_Q_SIZE-1)) {
3181 		log(LOG_WARNING,
3182 		    "dump start of %d is greater than queue size\n",
3183 		    txq->txq_dump_start);
3184 		txq->txq_dump_start = 0;
3185 		return (EINVAL);
3186 	}
3187 	err = t3_sge_read_ecntxt(qs->port->adapter, qs->rspq.cntxt_id, data);
3188 	if (err)
3189 		return (err);
3190 	err = sysctl_wire_old_buffer(req, 0);
3191 	if (err)
3192 		return (err);
3193 	sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req);
3194 
3195 	sbuf_printf(sb, " \n credits=%u GTS=%u index=%u size=%u rspq#=%u cmdq#=%u\n",
3196 	    (data[0] & 0x7fff), ((data[0] >> 15) & 1), (data[0] >> 16),
3197 	    (data[1] & 0xffff), ((data[3] >> 4) & 7), ((data[3] >> 7) & 1));
3198 	sbuf_printf(sb, " TUN=%u TOE=%u generation%u uP token=%u valid=%u\n",
3199 	    ((data[3] >> 8) & 1), ((data[3] >> 9) & 1), ((data[3] >> 10) & 1),
3200 	    ((data[3] >> 11) & 0xfffff), ((data[3] >> 31) & 1));
3201 	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
3202 	    txq->txq_dump_start,
3203 	    (txq->txq_dump_start + txq->txq_dump_count) & (TX_ETH_Q_SIZE-1));
3204 
3205 	dump_end = txq->txq_dump_start + txq->txq_dump_count;
3206 	for (i = txq->txq_dump_start; i < dump_end; i++) {
3207 		txd = &txq->desc[i & (TX_ETH_Q_SIZE-1)];
3208 		WR = (uint32_t *)txd->flit;
3209 		wr_hi = ntohl(WR[0]);
3210 		wr_lo = ntohl(WR[1]);
3211 		gen = G_WR_GEN(wr_lo);
3212 
3213 		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
3214 		    wr_hi, wr_lo, gen);
3215 		for (j = 2; j < 30; j += 4)
3216 			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
3217 			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
3218 
3219 	}
3220 	err = sbuf_finish(sb);
3221 	sbuf_delete(sb);
3222 	return (err);
3223 }
3224 
3225 static int
3226 t3_dump_txq_ctrl(SYSCTL_HANDLER_ARGS)
3227 {
3228 	struct sge_txq *txq;
3229 	struct sge_qset *qs;
3230 	int i, j, err, dump_end;
3231 	struct sbuf *sb;
3232 	struct tx_desc *txd;
3233 	uint32_t *WR, wr_hi, wr_lo, gen;
3234 
3235 	txq = arg1;
3236 	qs = txq_to_qset(txq, TXQ_CTRL);
3237 	if (txq->txq_dump_count == 0) {
3238 		return (0);
3239 	}
3240 	if (txq->txq_dump_count > 256) {
3241 		log(LOG_WARNING,
3242 		    "dump count is too large %d\n", txq->txq_dump_count);
3243 		txq->txq_dump_count = 1;
3244 		return (EINVAL);
3245 	}
3246 	if (txq->txq_dump_start > 255) {
3247 		log(LOG_WARNING,
3248 		    "dump start of %d is greater than queue size\n",
3249 		    txq->txq_dump_start);
3250 		txq->txq_dump_start = 0;
3251 		return (EINVAL);
3252 	}
3253 
3254 	err = sysctl_wire_old_buffer(req, 0);
3255 	if (err != 0)
3256 		return (err);
3257 	sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req);
3258 	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
3259 	    txq->txq_dump_start,
3260 	    (txq->txq_dump_start + txq->txq_dump_count) & 255);
3261 
3262 	dump_end = txq->txq_dump_start + txq->txq_dump_count;
3263 	for (i = txq->txq_dump_start; i < dump_end; i++) {
3264 		txd = &txq->desc[i & (255)];
3265 		WR = (uint32_t *)txd->flit;
3266 		wr_hi = ntohl(WR[0]);
3267 		wr_lo = ntohl(WR[1]);
3268 		gen = G_WR_GEN(wr_lo);
3269 
3270 		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
3271 		    wr_hi, wr_lo, gen);
3272 		for (j = 2; j < 30; j += 4)
3273 			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
3274 			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
3275 
3276 	}
3277 	err = sbuf_finish(sb);
3278 	sbuf_delete(sb);
3279 	return (err);
3280 }
3281 
3282 static int
3283 t3_set_coalesce_usecs(SYSCTL_HANDLER_ARGS)
3284 {
3285 	adapter_t *sc = arg1;
3286 	struct qset_params *qsp = &sc->params.sge.qset[0];
3287 	int coalesce_usecs;
3288 	struct sge_qset *qs;
3289 	int i, j, err, nqsets = 0;
3290 	struct mtx *lock;
3291 
3292 	if ((sc->flags & FULL_INIT_DONE) == 0)
3293 		return (ENXIO);
3294 
3295 	coalesce_usecs = qsp->coalesce_usecs;
3296         err = sysctl_handle_int(oidp, &coalesce_usecs, arg2, req);
3297 
3298 	if (err != 0) {
3299 		return (err);
3300 	}
3301 	if (coalesce_usecs == qsp->coalesce_usecs)
3302 		return (0);
3303 
3304 	for (i = 0; i < sc->params.nports; i++)
3305 		for (j = 0; j < sc->port[i].nqsets; j++)
3306 			nqsets++;
3307 
3308 	coalesce_usecs = max(1, coalesce_usecs);
3309 
3310 	for (i = 0; i < nqsets; i++) {
3311 		qs = &sc->sge.qs[i];
3312 		qsp = &sc->params.sge.qset[i];
3313 		qsp->coalesce_usecs = coalesce_usecs;
3314 
3315 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
3316 			    &sc->sge.qs[0].rspq.lock;
3317 
3318 		mtx_lock(lock);
3319 		t3_update_qset_coalesce(qs, qsp);
3320 		t3_write_reg(sc, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) |
3321 		    V_NEWTIMER(qs->rspq.holdoff_tmr));
3322 		mtx_unlock(lock);
3323 	}
3324 
3325 	return (0);
3326 }
3327 
3328 static int
3329 t3_pkt_timestamp(SYSCTL_HANDLER_ARGS)
3330 {
3331 	adapter_t *sc = arg1;
3332 	int rc, timestamp;
3333 
3334 	if ((sc->flags & FULL_INIT_DONE) == 0)
3335 		return (ENXIO);
3336 
3337 	timestamp = sc->timestamp;
3338 	rc = sysctl_handle_int(oidp, &timestamp, arg2, req);
3339 
3340 	if (rc != 0)
3341 		return (rc);
3342 
3343 	if (timestamp != sc->timestamp) {
3344 		t3_set_reg_field(sc, A_TP_PC_CONFIG2, F_ENABLERXPKTTMSTPRSS,
3345 		    timestamp ? F_ENABLERXPKTTMSTPRSS : 0);
3346 		sc->timestamp = timestamp;
3347 	}
3348 
3349 	return (0);
3350 }
3351 
3352 void
3353 t3_add_attach_sysctls(adapter_t *sc)
3354 {
3355 	struct sysctl_ctx_list *ctx;
3356 	struct sysctl_oid_list *children;
3357 
3358 	ctx = device_get_sysctl_ctx(sc->dev);
3359 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3360 
3361 	/* random information */
3362 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
3363 	    "firmware_version",
3364 	    CTLFLAG_RD, sc->fw_version,
3365 	    0, "firmware version");
3366 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO,
3367 	    "hw_revision",
3368 	    CTLFLAG_RD, &sc->params.rev,
3369 	    0, "chip model");
3370 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
3371 	    "port_types",
3372 	    CTLFLAG_RD, sc->port_types,
3373 	    0, "type of ports");
3374 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3375 	    "enable_debug",
3376 	    CTLFLAG_RW, &cxgb_debug,
3377 	    0, "enable verbose debugging output");
3378 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tunq_coalesce",
3379 	    CTLFLAG_RD, &sc->tunq_coalesce,
3380 	    "#tunneled packets freed");
3381 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3382 	    "txq_overrun",
3383 	    CTLFLAG_RD, &txq_fills,
3384 	    0, "#times txq overrun");
3385 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO,
3386 	    "core_clock",
3387 	    CTLFLAG_RD, &sc->params.vpd.cclk,
3388 	    0, "core clock frequency (in KHz)");
3389 }
3390 
3391 
/* sysctl node name used for a queue set's response queue. */
static const char *rspq_name = "rspq";

/* sysctl node names for the Tx queues; entries 0 and 2 are used below. */
static const char *txq_names[] = {
	"txq_eth",
	"txq_ofld",
	"txq_ctrl",
};
3399 
3400 static int
3401 sysctl_handle_macstat(SYSCTL_HANDLER_ARGS)
3402 {
3403 	struct port_info *p = arg1;
3404 	uint64_t *parg;
3405 
3406 	if (!p)
3407 		return (EINVAL);
3408 
3409 	cxgb_refresh_stats(p);
3410 	parg = (uint64_t *) ((uint8_t *)&p->mac.stats + arg2);
3411 
3412 	return (sysctl_handle_64(oidp, parg, 0, req));
3413 }
3414 
3415 void
3416 t3_add_configured_sysctls(adapter_t *sc)
3417 {
3418 	struct sysctl_ctx_list *ctx;
3419 	struct sysctl_oid_list *children;
3420 	int i, j;
3421 
3422 	ctx = device_get_sysctl_ctx(sc->dev);
3423 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3424 
3425 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
3426 	    "intr_coal",
3427 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, sc,
3428 	    0, t3_set_coalesce_usecs,
3429 	    "I", "interrupt coalescing timer (us)");
3430 
3431 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
3432 	    "pkt_timestamp",
3433 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, sc,
3434 	    0, t3_pkt_timestamp,
3435 	    "I", "provide packet timestamp instead of connection hash");
3436 
3437 	for (i = 0; i < sc->params.nports; i++) {
3438 		struct port_info *pi = &sc->port[i];
3439 		struct sysctl_oid *poid;
3440 		struct sysctl_oid_list *poidlist;
3441 		struct mac_stats *mstats = &pi->mac.stats;
3442 
3443 		snprintf(pi->namebuf, PORT_NAME_LEN, "port%d", i);
3444 		poid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO,
3445 		    pi->namebuf, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
3446 		    "port statistics");
3447 		poidlist = SYSCTL_CHILDREN(poid);
3448 		SYSCTL_ADD_UINT(ctx, poidlist, OID_AUTO,
3449 		    "nqsets", CTLFLAG_RD, &pi->nqsets,
3450 		    0, "#queue sets");
3451 
3452 		for (j = 0; j < pi->nqsets; j++) {
3453 			struct sge_qset *qs = &sc->sge.qs[pi->first_qset + j];
3454 			struct sysctl_oid *qspoid, *rspqpoid, *txqpoid,
3455 					  *ctrlqpoid, *lropoid;
3456 			struct sysctl_oid_list *qspoidlist, *rspqpoidlist,
3457 					       *txqpoidlist, *ctrlqpoidlist,
3458 					       *lropoidlist;
3459 			struct sge_txq *txq = &qs->txq[TXQ_ETH];
3460 
3461 			snprintf(qs->namebuf, QS_NAME_LEN, "qs%d", j);
3462 
3463 			qspoid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO,
3464 			    qs->namebuf, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
3465 			    "qset statistics");
3466 			qspoidlist = SYSCTL_CHILDREN(qspoid);
3467 
3468 			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl0_empty",
3469 					CTLFLAG_RD, &qs->fl[0].empty, 0,
3470 					"freelist #0 empty");
3471 			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl1_empty",
3472 					CTLFLAG_RD, &qs->fl[1].empty, 0,
3473 					"freelist #1 empty");
3474 
3475 			rspqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3476 			    rspq_name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
3477 			    "rspq statistics");
3478 			rspqpoidlist = SYSCTL_CHILDREN(rspqpoid);
3479 
3480 			txqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3481 			    txq_names[0], CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
3482 			    "txq statistics");
3483 			txqpoidlist = SYSCTL_CHILDREN(txqpoid);
3484 
3485 			ctrlqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3486 			    txq_names[2], CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
3487 			    "ctrlq statistics");
3488 			ctrlqpoidlist = SYSCTL_CHILDREN(ctrlqpoid);
3489 
3490 			lropoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3491 			    "lro_stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
3492 			    "LRO statistics");
3493 			lropoidlist = SYSCTL_CHILDREN(lropoid);
3494 
3495 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "size",
3496 			    CTLFLAG_RD, &qs->rspq.size,
3497 			    0, "#entries in response queue");
3498 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "cidx",
3499 			    CTLFLAG_RD, &qs->rspq.cidx,
3500 			    0, "consumer index");
3501 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "credits",
3502 			    CTLFLAG_RD, &qs->rspq.credits,
3503 			    0, "#credits");
3504 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "starved",
3505 			    CTLFLAG_RD, &qs->rspq.starved,
3506 			    0, "#times starved");
3507 			SYSCTL_ADD_UAUTO(ctx, rspqpoidlist, OID_AUTO, "phys_addr",
3508 			    CTLFLAG_RD, &qs->rspq.phys_addr,
3509 			    "physical_address_of the queue");
3510 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_start",
3511 			    CTLFLAG_RW, &qs->rspq.rspq_dump_start,
3512 			    0, "start rspq dump entry");
3513 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_count",
3514 			    CTLFLAG_RW, &qs->rspq.rspq_dump_count,
3515 			    0, "#rspq entries to dump");
3516 			SYSCTL_ADD_PROC(ctx, rspqpoidlist, OID_AUTO, "qdump",
3517 			    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
3518 			    &qs->rspq, 0, t3_dump_rspq, "A",
3519 			    "dump of the response queue");
3520 
3521 			SYSCTL_ADD_UQUAD(ctx, txqpoidlist, OID_AUTO, "dropped",
3522 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].txq_mr->br_drops,
3523 			    "#tunneled packets dropped");
3524 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "sendqlen",
3525 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].sendq.mq_len,
3526 			    0, "#tunneled packets waiting to be sent");
3527 #if 0
3528 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_pidx",
3529 			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_prod,
3530 			    0, "#tunneled packets queue producer index");
3531 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_cidx",
3532 			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_cons,
3533 			    0, "#tunneled packets queue consumer index");
3534 #endif
3535 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "processed",
3536 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].processed,
3537 			    0, "#tunneled packets processed by the card");
3538 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "cleaned",
3539 			    CTLFLAG_RD, &txq->cleaned,
3540 			    0, "#tunneled packets cleaned");
3541 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "in_use",
3542 			    CTLFLAG_RD, &txq->in_use,
3543 			    0, "#tunneled packet slots in use");
3544 			SYSCTL_ADD_UQUAD(ctx, txqpoidlist, OID_AUTO, "frees",
3545 			    CTLFLAG_RD, &txq->txq_frees,
3546 			    "#tunneled packets freed");
3547 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "skipped",
3548 			    CTLFLAG_RD, &txq->txq_skipped,
3549 			    0, "#tunneled packet descriptors skipped");
3550 			SYSCTL_ADD_UQUAD(ctx, txqpoidlist, OID_AUTO, "coalesced",
3551 			    CTLFLAG_RD, &txq->txq_coalesced,
3552 			    "#tunneled packets coalesced");
3553 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "enqueued",
3554 			    CTLFLAG_RD, &txq->txq_enqueued,
3555 			    0, "#tunneled packets enqueued to hardware");
3556 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "stopped_flags",
3557 			    CTLFLAG_RD, &qs->txq_stopped,
3558 			    0, "tx queues stopped");
3559 			SYSCTL_ADD_UAUTO(ctx, txqpoidlist, OID_AUTO, "phys_addr",
3560 			    CTLFLAG_RD, &txq->phys_addr,
3561 			    "physical_address_of the queue");
3562 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "qgen",
3563 			    CTLFLAG_RW, &qs->txq[TXQ_ETH].gen,
3564 			    0, "txq generation");
3565 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_cidx",
3566 			    CTLFLAG_RD, &txq->cidx,
3567 			    0, "hardware queue cidx");
3568 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_pidx",
3569 			    CTLFLAG_RD, &txq->pidx,
3570 			    0, "hardware queue pidx");
3571 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_start",
3572 			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_start,
3573 			    0, "txq start idx for dump");
3574 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_count",
3575 			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_count,
3576 			    0, "txq #entries to dump");
3577 			SYSCTL_ADD_PROC(ctx, txqpoidlist, OID_AUTO, "qdump",
3578 			    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
3579 			    &qs->txq[TXQ_ETH], 0, t3_dump_txq_eth, "A",
3580 			    "dump of the transmit queue");
3581 
3582 			SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_start",
3583 			    CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_start,
3584 			    0, "ctrlq start idx for dump");
3585 			SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_count",
3586 			    CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_count,
3587 			    0, "ctrl #entries to dump");
3588 			SYSCTL_ADD_PROC(ctx, ctrlqpoidlist, OID_AUTO, "qdump",
3589 			    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
3590 			    &qs->txq[TXQ_CTRL], 0, t3_dump_txq_ctrl, "A",
3591 			    "dump of the transmit queue");
3592 
3593 			SYSCTL_ADD_U64(ctx, lropoidlist, OID_AUTO, "lro_queued",
3594 			    CTLFLAG_RD, &qs->lro.ctrl.lro_queued, 0, NULL);
3595 			SYSCTL_ADD_U64(ctx, lropoidlist, OID_AUTO, "lro_flushed",
3596 			    CTLFLAG_RD, &qs->lro.ctrl.lro_flushed, 0, NULL);
3597 			SYSCTL_ADD_U64(ctx, lropoidlist, OID_AUTO, "lro_bad_csum",
3598 			    CTLFLAG_RD, &qs->lro.ctrl.lro_bad_csum, 0, NULL);
3599 			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_cnt",
3600 			    CTLFLAG_RD, &qs->lro.ctrl.lro_cnt, 0, NULL);
3601 		}
3602 
3603 		/* Now add a node for mac stats. */
3604 		poid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO, "mac_stats",
3605 		    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "MAC statistics");
3606 		poidlist = SYSCTL_CHILDREN(poid);
3607 
3608 		/*
3609 		 * We (ab)use the length argument (arg2) to pass on the offset
3610 		 * of the data that we are interested in.  This is only required
3611 		 * for the quad counters that are updated from the hardware (we
3612 		 * make sure that we return the latest value).
3613 		 * sysctl_handle_macstat first updates *all* the counters from
3614 		 * the hardware, and then returns the latest value of the
3615 		 * requested counter.  Best would be to update only the
3616 		 * requested counter from hardware, but t3_mac_update_stats()
3617 		 * hides all the register details and we don't want to dive into
3618 		 * all that here.
3619 		 */
3620 #define CXGB_SYSCTL_ADD_QUAD(a)	SYSCTL_ADD_OID(ctx, poidlist, OID_AUTO, #a, \
3621     CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_NEEDGIANT, pi, \
3622     offsetof(struct mac_stats, a), sysctl_handle_macstat, "QU", 0)
3623 		CXGB_SYSCTL_ADD_QUAD(tx_octets);
3624 		CXGB_SYSCTL_ADD_QUAD(tx_octets_bad);
3625 		CXGB_SYSCTL_ADD_QUAD(tx_frames);
3626 		CXGB_SYSCTL_ADD_QUAD(tx_mcast_frames);
3627 		CXGB_SYSCTL_ADD_QUAD(tx_bcast_frames);
3628 		CXGB_SYSCTL_ADD_QUAD(tx_pause);
3629 		CXGB_SYSCTL_ADD_QUAD(tx_deferred);
3630 		CXGB_SYSCTL_ADD_QUAD(tx_late_collisions);
3631 		CXGB_SYSCTL_ADD_QUAD(tx_total_collisions);
3632 		CXGB_SYSCTL_ADD_QUAD(tx_excess_collisions);
3633 		CXGB_SYSCTL_ADD_QUAD(tx_underrun);
3634 		CXGB_SYSCTL_ADD_QUAD(tx_len_errs);
3635 		CXGB_SYSCTL_ADD_QUAD(tx_mac_internal_errs);
3636 		CXGB_SYSCTL_ADD_QUAD(tx_excess_deferral);
3637 		CXGB_SYSCTL_ADD_QUAD(tx_fcs_errs);
3638 		CXGB_SYSCTL_ADD_QUAD(tx_frames_64);
3639 		CXGB_SYSCTL_ADD_QUAD(tx_frames_65_127);
3640 		CXGB_SYSCTL_ADD_QUAD(tx_frames_128_255);
3641 		CXGB_SYSCTL_ADD_QUAD(tx_frames_256_511);
3642 		CXGB_SYSCTL_ADD_QUAD(tx_frames_512_1023);
3643 		CXGB_SYSCTL_ADD_QUAD(tx_frames_1024_1518);
3644 		CXGB_SYSCTL_ADD_QUAD(tx_frames_1519_max);
3645 		CXGB_SYSCTL_ADD_QUAD(rx_octets);
3646 		CXGB_SYSCTL_ADD_QUAD(rx_octets_bad);
3647 		CXGB_SYSCTL_ADD_QUAD(rx_frames);
3648 		CXGB_SYSCTL_ADD_QUAD(rx_mcast_frames);
3649 		CXGB_SYSCTL_ADD_QUAD(rx_bcast_frames);
3650 		CXGB_SYSCTL_ADD_QUAD(rx_pause);
3651 		CXGB_SYSCTL_ADD_QUAD(rx_fcs_errs);
3652 		CXGB_SYSCTL_ADD_QUAD(rx_align_errs);
3653 		CXGB_SYSCTL_ADD_QUAD(rx_symbol_errs);
3654 		CXGB_SYSCTL_ADD_QUAD(rx_data_errs);
3655 		CXGB_SYSCTL_ADD_QUAD(rx_sequence_errs);
3656 		CXGB_SYSCTL_ADD_QUAD(rx_runt);
3657 		CXGB_SYSCTL_ADD_QUAD(rx_jabber);
3658 		CXGB_SYSCTL_ADD_QUAD(rx_short);
3659 		CXGB_SYSCTL_ADD_QUAD(rx_too_long);
3660 		CXGB_SYSCTL_ADD_QUAD(rx_mac_internal_errs);
3661 		CXGB_SYSCTL_ADD_QUAD(rx_cong_drops);
3662 		CXGB_SYSCTL_ADD_QUAD(rx_frames_64);
3663 		CXGB_SYSCTL_ADD_QUAD(rx_frames_65_127);
3664 		CXGB_SYSCTL_ADD_QUAD(rx_frames_128_255);
3665 		CXGB_SYSCTL_ADD_QUAD(rx_frames_256_511);
3666 		CXGB_SYSCTL_ADD_QUAD(rx_frames_512_1023);
3667 		CXGB_SYSCTL_ADD_QUAD(rx_frames_1024_1518);
3668 		CXGB_SYSCTL_ADD_QUAD(rx_frames_1519_max);
3669 #undef CXGB_SYSCTL_ADD_QUAD
3670 
3671 #define CXGB_SYSCTL_ADD_ULONG(a) SYSCTL_ADD_ULONG(ctx, poidlist, OID_AUTO, #a, \
3672     CTLFLAG_RD, &mstats->a, 0)
3673 		CXGB_SYSCTL_ADD_ULONG(tx_fifo_parity_err);
3674 		CXGB_SYSCTL_ADD_ULONG(rx_fifo_parity_err);
3675 		CXGB_SYSCTL_ADD_ULONG(tx_fifo_urun);
3676 		CXGB_SYSCTL_ADD_ULONG(rx_fifo_ovfl);
3677 		CXGB_SYSCTL_ADD_ULONG(serdes_signal_loss);
3678 		CXGB_SYSCTL_ADD_ULONG(xaui_pcs_ctc_err);
3679 		CXGB_SYSCTL_ADD_ULONG(xaui_pcs_align_change);
3680 		CXGB_SYSCTL_ADD_ULONG(num_toggled);
3681 		CXGB_SYSCTL_ADD_ULONG(num_resets);
3682 		CXGB_SYSCTL_ADD_ULONG(link_faults);
3683 #undef CXGB_SYSCTL_ADD_ULONG
3684 	}
3685 }
3686 
3687 /**
3688  *	t3_get_desc - dump an SGE descriptor for debugging purposes
3689  *	@qs: the queue set
3690  *	@qnum: identifies the specific queue (0..2: Tx, 3:response, 4..5: Rx)
3691  *	@idx: the descriptor index in the queue
3692  *	@data: where to dump the descriptor contents
3693  *
3694  *	Dumps the contents of a HW descriptor of an SGE queue.  Returns the
3695  *	size of the descriptor.
3696  */
3697 int
3698 t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx,
3699 		unsigned char *data)
3700 {
3701 	if (qnum >= 6)
3702 		return (EINVAL);
3703 
3704 	if (qnum < 3) {
3705 		if (!qs->txq[qnum].desc || idx >= qs->txq[qnum].size)
3706 			return -EINVAL;
3707 		memcpy(data, &qs->txq[qnum].desc[idx], sizeof(struct tx_desc));
3708 		return sizeof(struct tx_desc);
3709 	}
3710 
3711 	if (qnum == 3) {
3712 		if (!qs->rspq.desc || idx >= qs->rspq.size)
3713 			return (EINVAL);
3714 		memcpy(data, &qs->rspq.desc[idx], sizeof(struct rsp_desc));
3715 		return sizeof(struct rsp_desc);
3716 	}
3717 
3718 	qnum -= 4;
3719 	if (!qs->fl[qnum].desc || idx >= qs->fl[qnum].size)
3720 		return (EINVAL);
3721 	memcpy(data, &qs->fl[qnum].desc[idx], sizeof(struct rx_desc));
3722 	return sizeof(struct rx_desc);
3723 }
3724