xref: /freebsd/sys/dev/cxgb/cxgb_sge.c (revision aa24f48b361effe51163877d84f1b70d32b77e04)
1 /**************************************************************************
2 
3 Copyright (c) 2007-2009, Chelsio Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Neither the name of the Chelsio Corporation nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15 
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27 
28 ***************************************************************************/
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include "opt_inet6.h"
34 #include "opt_inet.h"
35 
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/kernel.h>
39 #include <sys/module.h>
40 #include <sys/bus.h>
41 #include <sys/conf.h>
42 #include <machine/bus.h>
43 #include <machine/resource.h>
44 #include <sys/rman.h>
45 #include <sys/queue.h>
46 #include <sys/sysctl.h>
47 #include <sys/taskqueue.h>
48 
49 #include <sys/proc.h>
50 #include <sys/sbuf.h>
51 #include <sys/sched.h>
52 #include <sys/smp.h>
53 #include <sys/systm.h>
54 #include <sys/syslog.h>
55 #include <sys/socket.h>
56 #include <sys/sglist.h>
57 
58 #include <net/if.h>
59 #include <net/if_var.h>
60 #include <net/bpf.h>
61 #include <net/ethernet.h>
62 #include <net/if_vlan_var.h>
63 
64 #include <netinet/in_systm.h>
65 #include <netinet/in.h>
66 #include <netinet/ip.h>
67 #include <netinet/ip6.h>
68 #include <netinet/tcp.h>
69 
70 #include <dev/pci/pcireg.h>
71 #include <dev/pci/pcivar.h>
72 
73 #include <vm/vm.h>
74 #include <vm/pmap.h>
75 
76 #include <cxgb_include.h>
77 #include <sys/mvec.h>
78 
79 int	txq_fills = 0;
80 int	multiq_tx_enable = 1;
81 
82 #ifdef TCP_OFFLOAD
83 CTASSERT(NUM_CPL_HANDLERS >= NUM_CPL_CMDS);
84 #endif
85 
86 extern struct sysctl_oid_list sysctl__hw_cxgb_children;
87 int cxgb_txq_buf_ring_size = TX_ETH_Q_SIZE;
88 SYSCTL_INT(_hw_cxgb, OID_AUTO, txq_mr_size, CTLFLAG_RDTUN, &cxgb_txq_buf_ring_size, 0,
89     "size of per-queue mbuf ring");
90 
91 static int cxgb_tx_coalesce_force = 0;
92 SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_force, CTLFLAG_RWTUN,
93     &cxgb_tx_coalesce_force, 0,
94     "coalesce small packets into a single work request regardless of ring state");
95 
96 #define	COALESCE_START_DEFAULT		TX_ETH_Q_SIZE>>1
97 #define	COALESCE_START_MAX		(TX_ETH_Q_SIZE-(TX_ETH_Q_SIZE>>3))
98 #define	COALESCE_STOP_DEFAULT		TX_ETH_Q_SIZE>>2
99 #define	COALESCE_STOP_MIN		TX_ETH_Q_SIZE>>5
100 #define	TX_RECLAIM_DEFAULT		TX_ETH_Q_SIZE>>5
101 #define	TX_RECLAIM_MAX			TX_ETH_Q_SIZE>>2
102 #define	TX_RECLAIM_MIN			TX_ETH_Q_SIZE>>6
103 
104 
105 static int cxgb_tx_coalesce_enable_start = COALESCE_START_DEFAULT;
106 SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_start, CTLFLAG_RWTUN,
107     &cxgb_tx_coalesce_enable_start, 0,
108     "coalesce enable threshold");
109 static int cxgb_tx_coalesce_enable_stop = COALESCE_STOP_DEFAULT;
110 SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_stop, CTLFLAG_RWTUN,
111     &cxgb_tx_coalesce_enable_stop, 0,
112     "coalesce disable threshold");
113 static int cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT;
114 SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_reclaim_threshold, CTLFLAG_RWTUN,
115     &cxgb_tx_reclaim_threshold, 0,
116     "tx cleaning minimum threshold");
117 
118 /*
119  * XXX don't re-enable this until TOE stops assuming
120  * we have an m_ext
121  */
122 static int recycle_enable = 0;
123 
124 extern int cxgb_use_16k_clusters;
125 extern int nmbjumbop;
126 extern int nmbjumbo9;
127 extern int nmbjumbo16;
128 
129 #define USE_GTS 0
130 
131 #define SGE_RX_SM_BUF_SIZE	1536
132 #define SGE_RX_DROP_THRES	16
133 #define SGE_RX_COPY_THRES	128
134 
135 /*
136  * Period of the Tx buffer reclaim timer.  This timer does not need to run
137  * frequently as Tx buffers are usually reclaimed by new Tx packets.
138  */
139 #define TX_RECLAIM_PERIOD       (hz >> 1)
140 
141 /*
142  * Values for sge_txq.flags
143  */
144 enum {
145 	TXQ_RUNNING	= 1 << 0,  /* fetch engine is running */
146 	TXQ_LAST_PKT_DB = 1 << 1,  /* last packet rang the doorbell */
147 };
148 
149 struct tx_desc {
150 	uint64_t	flit[TX_DESC_FLITS];
151 } __packed;
152 
153 struct rx_desc {
154 	uint32_t	addr_lo;
155 	uint32_t	len_gen;
156 	uint32_t	gen2;
157 	uint32_t	addr_hi;
158 } __packed;
159 
160 struct rsp_desc {               /* response queue descriptor */
161 	struct rss_header	rss_hdr;
162 	uint32_t		flags;
163 	uint32_t		len_cq;
164 	uint8_t			imm_data[47];
165 	uint8_t			intr_gen;
166 } __packed;
167 
168 #define RX_SW_DESC_MAP_CREATED	(1 << 0)
169 #define TX_SW_DESC_MAP_CREATED	(1 << 1)
170 #define RX_SW_DESC_INUSE        (1 << 3)
171 #define TX_SW_DESC_MAPPED       (1 << 4)
172 
173 #define RSPQ_NSOP_NEOP           G_RSPD_SOP_EOP(0)
174 #define RSPQ_EOP                 G_RSPD_SOP_EOP(F_RSPD_EOP)
175 #define RSPQ_SOP                 G_RSPD_SOP_EOP(F_RSPD_SOP)
176 #define RSPQ_SOP_EOP             G_RSPD_SOP_EOP(F_RSPD_SOP|F_RSPD_EOP)
177 
178 struct tx_sw_desc {                /* SW state per Tx descriptor */
179 	struct mbuf	*m;
180 	bus_dmamap_t	map;
181 	int		flags;
182 };
183 
184 struct rx_sw_desc {                /* SW state per Rx descriptor */
185 	caddr_t		rxsd_cl;
186 	struct mbuf	*m;
187 	bus_dmamap_t	map;
188 	int		flags;
189 };
190 
191 struct txq_state {
192 	unsigned int	compl;
193 	unsigned int	gen;
194 	unsigned int	pidx;
195 };
196 
197 struct refill_fl_cb_arg {
198 	int               error;
199 	bus_dma_segment_t seg;
200 	int               nseg;
201 };
202 
203 
204 /*
205  * Maps a number of flits to the number of Tx descriptors that can hold them.
206  * The formula is
207  *
208  * desc = 1 + (flits - 2) / (WR_FLITS - 1).
209  *
210  * HW allows up to 4 descriptors to be combined into a WR.
211  */
212 static uint8_t flit_desc_map[] = {
213 	0,
214 #if SGE_NUM_GENBITS == 1
215 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
216 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
217 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
218 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
219 #elif SGE_NUM_GENBITS == 2
220 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
221 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
222 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
223 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
224 #else
225 # error "SGE_NUM_GENBITS must be 1 or 2"
226 #endif
227 };
228 
229 #define	TXQ_LOCK_ASSERT(qs)	mtx_assert(&(qs)->lock, MA_OWNED)
230 #define	TXQ_TRYLOCK(qs)		mtx_trylock(&(qs)->lock)
231 #define	TXQ_LOCK(qs)		mtx_lock(&(qs)->lock)
232 #define	TXQ_UNLOCK(qs)		mtx_unlock(&(qs)->lock)
233 #define	TXQ_RING_EMPTY(qs)	drbr_empty((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
234 #define	TXQ_RING_NEEDS_ENQUEUE(qs)					\
235 	drbr_needs_enqueue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
236 #define	TXQ_RING_FLUSH(qs)	drbr_flush((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
237 #define	TXQ_RING_DEQUEUE_COND(qs, func, arg)				\
238 	drbr_dequeue_cond((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr, func, arg)
239 #define	TXQ_RING_DEQUEUE(qs) \
240 	drbr_dequeue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
241 
242 int cxgb_debug = 0;
243 
244 static void sge_timer_cb(void *arg);
245 static void sge_timer_reclaim(void *arg, int ncount);
246 static void sge_txq_reclaim_handler(void *arg, int ncount);
247 static void cxgb_start_locked(struct sge_qset *qs);
248 
249 /*
250  * XXX need to cope with bursty scheduling by looking at a wider
251  * window than we are now for determining the need for coalescing
252  *
253  */
254 static __inline uint64_t
255 check_pkt_coalesce(struct sge_qset *qs)
256 {
257         struct adapter *sc;
258         struct sge_txq *txq;
259 	uint8_t *fill;
260 
261 	if (__predict_false(cxgb_tx_coalesce_force))
262 		return (1);
263 	txq = &qs->txq[TXQ_ETH];
264         sc = qs->port->adapter;
265 	fill = &sc->tunq_fill[qs->idx];
266 
267 	if (cxgb_tx_coalesce_enable_start > COALESCE_START_MAX)
268 		cxgb_tx_coalesce_enable_start = COALESCE_START_MAX;
269 	if (cxgb_tx_coalesce_enable_stop < COALESCE_STOP_MIN)
270 		cxgb_tx_coalesce_enable_start = COALESCE_STOP_MIN;
271 	/*
272 	 * if the hardware transmit queue is more than 1/8 full
273 	 * we mark it as coalescing - we drop back from coalescing
274 	 * when we go below 1/32 full and there are no packets enqueued,
275 	 * this provides us with some degree of hysteresis
276 	 */
277         if (*fill != 0 && (txq->in_use <= cxgb_tx_coalesce_enable_stop) &&
278 	    TXQ_RING_EMPTY(qs) && (qs->coalescing == 0))
279                 *fill = 0;
280         else if (*fill == 0 && (txq->in_use >= cxgb_tx_coalesce_enable_start))
281                 *fill = 1;
282 
283 	return (sc->tunq_coalesce);
284 }
285 
286 #ifdef __LP64__
287 static void
288 set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo)
289 {
290 	uint64_t wr_hilo;
291 #if _BYTE_ORDER == _LITTLE_ENDIAN
292 	wr_hilo = wr_hi;
293 	wr_hilo |= (((uint64_t)wr_lo)<<32);
294 #else
295 	wr_hilo = wr_lo;
296 	wr_hilo |= (((uint64_t)wr_hi)<<32);
297 #endif
298 	wrp->wrh_hilo = wr_hilo;
299 }
300 #else
301 static void
302 set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo)
303 {
304 
305 	wrp->wrh_hi = wr_hi;
306 	wmb();
307 	wrp->wrh_lo = wr_lo;
308 }
309 #endif
310 
311 struct coalesce_info {
312 	int count;
313 	int nbytes;
314 };
315 
316 static int
317 coalesce_check(struct mbuf *m, void *arg)
318 {
319 	struct coalesce_info *ci = arg;
320 	int *count = &ci->count;
321 	int *nbytes = &ci->nbytes;
322 
323 	if ((*nbytes == 0) || ((*nbytes + m->m_len <= 10500) &&
324 		(*count < 7) && (m->m_next == NULL))) {
325 		*count += 1;
326 		*nbytes += m->m_len;
327 		return (1);
328 	}
329 	return (0);
330 }
331 
332 static struct mbuf *
333 cxgb_dequeue(struct sge_qset *qs)
334 {
335 	struct mbuf *m, *m_head, *m_tail;
336 	struct coalesce_info ci;
337 
338 
339 	if (check_pkt_coalesce(qs) == 0)
340 		return TXQ_RING_DEQUEUE(qs);
341 
342 	m_head = m_tail = NULL;
343 	ci.count = ci.nbytes = 0;
344 	do {
345 		m = TXQ_RING_DEQUEUE_COND(qs, coalesce_check, &ci);
346 		if (m_head == NULL) {
347 			m_tail = m_head = m;
348 		} else if (m != NULL) {
349 			m_tail->m_nextpkt = m;
350 			m_tail = m;
351 		}
352 	} while (m != NULL);
353 	if (ci.count > 7)
354 		panic("trying to coalesce %d packets in to one WR", ci.count);
355 	return (m_head);
356 }
357 
358 /**
359  *	reclaim_completed_tx - reclaims completed Tx descriptors
360  *	@adapter: the adapter
361  *	@q: the Tx queue to reclaim completed descriptors from
362  *
363  *	Reclaims Tx descriptors that the SGE has indicated it has processed,
364  *	and frees the associated buffers if possible.  Called with the Tx
365  *	queue's lock held.
366  */
367 static __inline int
368 reclaim_completed_tx(struct sge_qset *qs, int reclaim_min, int queue)
369 {
370 	struct sge_txq *q = &qs->txq[queue];
371 	int reclaim = desc_reclaimable(q);
372 
373 	if ((cxgb_tx_reclaim_threshold > TX_RECLAIM_MAX) ||
374 	    (cxgb_tx_reclaim_threshold < TX_RECLAIM_MIN))
375 		cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT;
376 
377 	if (reclaim < reclaim_min)
378 		return (0);
379 
380 	mtx_assert(&qs->lock, MA_OWNED);
381 	if (reclaim > 0) {
382 		t3_free_tx_desc(qs, reclaim, queue);
383 		q->cleaned += reclaim;
384 		q->in_use -= reclaim;
385 	}
386 	if (isset(&qs->txq_stopped, TXQ_ETH))
387                 clrbit(&qs->txq_stopped, TXQ_ETH);
388 
389 	return (reclaim);
390 }
391 
392 /**
393  *	should_restart_tx - are there enough resources to restart a Tx queue?
394  *	@q: the Tx queue
395  *
396  *	Checks if there are enough descriptors to restart a suspended Tx queue.
397  */
398 static __inline int
399 should_restart_tx(const struct sge_txq *q)
400 {
401 	unsigned int r = q->processed - q->cleaned;
402 
403 	return q->in_use - r < (q->size >> 1);
404 }
405 
406 /**
407  *	t3_sge_init - initialize SGE
408  *	@adap: the adapter
409  *	@p: the SGE parameters
410  *
411  *	Performs SGE initialization needed every time after a chip reset.
412  *	We do not initialize any of the queue sets here, instead the driver
413  *	top-level must request those individually.  We also do not enable DMA
414  *	here, that should be done after the queues have been set up.
415  */
416 void
417 t3_sge_init(adapter_t *adap, struct sge_params *p)
418 {
419 	u_int ctrl, ups;
420 
421 	ups = 0; /* = ffs(pci_resource_len(adap->pdev, 2) >> 12); */
422 
423 	ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL |
424 	       F_CQCRDTCTRL | F_CONGMODE | F_TNLFLMODE | F_FATLPERREN |
425 	       V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS |
426 	       V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING;
427 #if SGE_NUM_GENBITS == 1
428 	ctrl |= F_EGRGENCTRL;
429 #endif
430 	if (adap->params.rev > 0) {
431 		if (!(adap->flags & (USING_MSIX | USING_MSI)))
432 			ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ;
433 	}
434 	t3_write_reg(adap, A_SG_CONTROL, ctrl);
435 	t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) |
436 		     V_LORCQDRBTHRSH(512));
437 	t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10);
438 	t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) |
439 		     V_TIMEOUT(200 * core_ticks_per_usec(adap)));
440 	t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH,
441 		     adap->params.rev < T3_REV_C ? 1000 : 500);
442 	t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256);
443 	t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000);
444 	t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256);
445 	t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff));
446 	t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024);
447 }
448 
449 
450 /**
451  *	sgl_len - calculates the size of an SGL of the given capacity
452  *	@n: the number of SGL entries
453  *
454  *	Calculates the number of flits needed for a scatter/gather list that
455  *	can hold the given number of entries.
456  */
457 static __inline unsigned int
458 sgl_len(unsigned int n)
459 {
460 	return ((3 * n) / 2 + (n & 1));
461 }
462 
463 /**
464  *	get_imm_packet - return the next ingress packet buffer from a response
465  *	@resp: the response descriptor containing the packet data
466  *
467  *	Return a packet containing the immediate data of the given response.
468  */
469 static int
470 get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct mbuf *m)
471 {
472 
473 	if (resp->rss_hdr.opcode == CPL_RX_DATA) {
474 		const struct cpl_rx_data *cpl = (const void *)&resp->imm_data[0];
475 		m->m_len = sizeof(*cpl) + ntohs(cpl->len);
476 	} else if (resp->rss_hdr.opcode == CPL_RX_PKT) {
477 		const struct cpl_rx_pkt *cpl = (const void *)&resp->imm_data[0];
478 		m->m_len = sizeof(*cpl) + ntohs(cpl->len);
479 	} else
480 		m->m_len = IMMED_PKT_SIZE;
481 	m->m_ext.ext_buf = NULL;
482 	m->m_ext.ext_type = 0;
483 	memcpy(mtod(m, uint8_t *), resp->imm_data, m->m_len);
484 	return (0);
485 }
486 
487 static __inline u_int
488 flits_to_desc(u_int n)
489 {
490 	return (flit_desc_map[n]);
491 }
492 
493 #define SGE_PARERR (F_CPPARITYERROR | F_OCPARITYERROR | F_RCPARITYERROR | \
494 		    F_IRPARITYERROR | V_ITPARITYERROR(M_ITPARITYERROR) | \
495 		    V_FLPARITYERROR(M_FLPARITYERROR) | F_LODRBPARITYERROR | \
496 		    F_HIDRBPARITYERROR | F_LORCQPARITYERROR | \
497 		    F_HIRCQPARITYERROR)
498 #define SGE_FRAMINGERR (F_UC_REQ_FRAMINGERROR | F_R_REQ_FRAMINGERROR)
499 #define SGE_FATALERR (SGE_PARERR | SGE_FRAMINGERR | F_RSPQCREDITOVERFOW | \
500 		      F_RSPQDISABLED)
501 
502 /**
503  *	t3_sge_err_intr_handler - SGE async event interrupt handler
504  *	@adapter: the adapter
505  *
506  *	Interrupt handler for SGE asynchronous (non-data) events.
507  */
508 void
509 t3_sge_err_intr_handler(adapter_t *adapter)
510 {
511 	unsigned int v, status;
512 
513 	status = t3_read_reg(adapter, A_SG_INT_CAUSE);
514 	if (status & SGE_PARERR)
515 		CH_ALERT(adapter, "SGE parity error (0x%x)\n",
516 			 status & SGE_PARERR);
517 	if (status & SGE_FRAMINGERR)
518 		CH_ALERT(adapter, "SGE framing error (0x%x)\n",
519 			 status & SGE_FRAMINGERR);
520 	if (status & F_RSPQCREDITOVERFOW)
521 		CH_ALERT(adapter, "SGE response queue credit overflow\n");
522 
523 	if (status & F_RSPQDISABLED) {
524 		v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS);
525 
526 		CH_ALERT(adapter,
527 			 "packet delivered to disabled response queue (0x%x)\n",
528 			 (v >> S_RSPQ0DISABLED) & 0xff);
529 	}
530 
531 	t3_write_reg(adapter, A_SG_INT_CAUSE, status);
532 	if (status & SGE_FATALERR)
533 		t3_fatal_err(adapter);
534 }
535 
536 void
537 t3_sge_prep(adapter_t *adap, struct sge_params *p)
538 {
539 	int i, nqsets, fl_q_size, jumbo_q_size, use_16k, jumbo_buf_size;
540 
541 	nqsets = min(SGE_QSETS / adap->params.nports, mp_ncpus);
542 	nqsets *= adap->params.nports;
543 
544 	fl_q_size = min(nmbclusters/(3*nqsets), FL_Q_SIZE);
545 
546 	while (!powerof2(fl_q_size))
547 		fl_q_size--;
548 
549 	use_16k = cxgb_use_16k_clusters != -1 ? cxgb_use_16k_clusters :
550 	    is_offload(adap);
551 
552 #if __FreeBSD_version >= 700111
553 	if (use_16k) {
554 		jumbo_q_size = min(nmbjumbo16/(3*nqsets), JUMBO_Q_SIZE);
555 		jumbo_buf_size = MJUM16BYTES;
556 	} else {
557 		jumbo_q_size = min(nmbjumbo9/(3*nqsets), JUMBO_Q_SIZE);
558 		jumbo_buf_size = MJUM9BYTES;
559 	}
560 #else
561 	jumbo_q_size = min(nmbjumbop/(3*nqsets), JUMBO_Q_SIZE);
562 	jumbo_buf_size = MJUMPAGESIZE;
563 #endif
564 	while (!powerof2(jumbo_q_size))
565 		jumbo_q_size--;
566 
567 	if (fl_q_size < (FL_Q_SIZE / 4) || jumbo_q_size < (JUMBO_Q_SIZE / 2))
568 		device_printf(adap->dev,
569 		    "Insufficient clusters and/or jumbo buffers.\n");
570 
571 	p->max_pkt_size = jumbo_buf_size - sizeof(struct cpl_rx_data);
572 
573 	for (i = 0; i < SGE_QSETS; ++i) {
574 		struct qset_params *q = p->qset + i;
575 
576 		if (adap->params.nports > 2) {
577 			q->coalesce_usecs = 50;
578 		} else {
579 #ifdef INVARIANTS
580 			q->coalesce_usecs = 10;
581 #else
582 			q->coalesce_usecs = 5;
583 #endif
584 		}
585 		q->polling = 0;
586 		q->rspq_size = RSPQ_Q_SIZE;
587 		q->fl_size = fl_q_size;
588 		q->jumbo_size = jumbo_q_size;
589 		q->jumbo_buf_size = jumbo_buf_size;
590 		q->txq_size[TXQ_ETH] = TX_ETH_Q_SIZE;
591 		q->txq_size[TXQ_OFLD] = is_offload(adap) ? TX_OFLD_Q_SIZE : 16;
592 		q->txq_size[TXQ_CTRL] = TX_CTRL_Q_SIZE;
593 		q->cong_thres = 0;
594 	}
595 }
596 
597 int
598 t3_sge_alloc(adapter_t *sc)
599 {
600 
601 	/* The parent tag. */
602 	if (bus_dma_tag_create( bus_get_dma_tag(sc->dev),/* PCI parent */
603 				1, 0,			/* algnmnt, boundary */
604 				BUS_SPACE_MAXADDR,	/* lowaddr */
605 				BUS_SPACE_MAXADDR,	/* highaddr */
606 				NULL, NULL,		/* filter, filterarg */
607 				BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
608 				BUS_SPACE_UNRESTRICTED, /* nsegments */
609 				BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
610 				0,			/* flags */
611 				NULL, NULL,		/* lock, lockarg */
612 				&sc->parent_dmat)) {
613 		device_printf(sc->dev, "Cannot allocate parent DMA tag\n");
614 		return (ENOMEM);
615 	}
616 
617 	/*
618 	 * DMA tag for normal sized RX frames
619 	 */
620 	if (bus_dma_tag_create(sc->parent_dmat, MCLBYTES, 0, BUS_SPACE_MAXADDR,
621 		BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, 1,
622 		MCLBYTES, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_dmat)) {
623 		device_printf(sc->dev, "Cannot allocate RX DMA tag\n");
624 		return (ENOMEM);
625 	}
626 
627 	/*
628 	 * DMA tag for jumbo sized RX frames.
629 	 */
630 	if (bus_dma_tag_create(sc->parent_dmat, MJUM16BYTES, 0, BUS_SPACE_MAXADDR,
631 		BUS_SPACE_MAXADDR, NULL, NULL, MJUM16BYTES, 1, MJUM16BYTES,
632 		BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_jumbo_dmat)) {
633 		device_printf(sc->dev, "Cannot allocate RX jumbo DMA tag\n");
634 		return (ENOMEM);
635 	}
636 
637 	/*
638 	 * DMA tag for TX frames.
639 	 */
640 	if (bus_dma_tag_create(sc->parent_dmat, 1, 0, BUS_SPACE_MAXADDR,
641 		BUS_SPACE_MAXADDR, NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
642 		TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
643 		NULL, NULL, &sc->tx_dmat)) {
644 		device_printf(sc->dev, "Cannot allocate TX DMA tag\n");
645 		return (ENOMEM);
646 	}
647 
648 	return (0);
649 }
650 
651 int
652 t3_sge_free(struct adapter * sc)
653 {
654 
655 	if (sc->tx_dmat != NULL)
656 		bus_dma_tag_destroy(sc->tx_dmat);
657 
658 	if (sc->rx_jumbo_dmat != NULL)
659 		bus_dma_tag_destroy(sc->rx_jumbo_dmat);
660 
661 	if (sc->rx_dmat != NULL)
662 		bus_dma_tag_destroy(sc->rx_dmat);
663 
664 	if (sc->parent_dmat != NULL)
665 		bus_dma_tag_destroy(sc->parent_dmat);
666 
667 	return (0);
668 }
669 
670 void
671 t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p)
672 {
673 
674 	qs->rspq.holdoff_tmr = max(p->coalesce_usecs * 10, 1U);
675 	qs->rspq.polling = 0 /* p->polling */;
676 }
677 
678 #if !defined(__i386__) && !defined(__amd64__)
679 static void
680 refill_fl_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
681 {
682 	struct refill_fl_cb_arg *cb_arg = arg;
683 
684 	cb_arg->error = error;
685 	cb_arg->seg = segs[0];
686 	cb_arg->nseg = nseg;
687 
688 }
689 #endif
690 /**
691  *	refill_fl - refill an SGE free-buffer list
692  *	@sc: the controller softc
693  *	@q: the free-list to refill
694  *	@n: the number of new buffers to allocate
695  *
696  *	(Re)populate an SGE free-buffer list with up to @n new packet buffers.
697  *	The caller must assure that @n does not exceed the queue's capacity.
698  */
699 static void
700 refill_fl(adapter_t *sc, struct sge_fl *q, int n)
701 {
702 	struct rx_sw_desc *sd = &q->sdesc[q->pidx];
703 	struct rx_desc *d = &q->desc[q->pidx];
704 	struct refill_fl_cb_arg cb_arg;
705 	struct mbuf *m;
706 	caddr_t cl;
707 	int err;
708 
709 	cb_arg.error = 0;
710 	while (n--) {
711 		/*
712 		 * We allocate an uninitialized mbuf + cluster, mbuf is
713 		 * initialized after rx.
714 		 */
715 		if (q->zone == zone_pack) {
716 			if ((m = m_getcl(M_NOWAIT, MT_NOINIT, M_PKTHDR)) == NULL)
717 				break;
718 			cl = m->m_ext.ext_buf;
719 		} else {
720 			if ((cl = m_cljget(NULL, M_NOWAIT, q->buf_size)) == NULL)
721 				break;
722 			if ((m = m_gethdr(M_NOWAIT, MT_NOINIT)) == NULL) {
723 				uma_zfree(q->zone, cl);
724 				break;
725 			}
726 		}
727 		if ((sd->flags & RX_SW_DESC_MAP_CREATED) == 0) {
728 			if ((err = bus_dmamap_create(q->entry_tag, 0, &sd->map))) {
729 				log(LOG_WARNING, "bus_dmamap_create failed %d\n", err);
730 				uma_zfree(q->zone, cl);
731 				goto done;
732 			}
733 			sd->flags |= RX_SW_DESC_MAP_CREATED;
734 		}
735 #if !defined(__i386__) && !defined(__amd64__)
736 		err = bus_dmamap_load(q->entry_tag, sd->map,
737 		    cl, q->buf_size, refill_fl_cb, &cb_arg, 0);
738 
739 		if (err != 0 || cb_arg.error) {
740 			if (q->zone != zone_pack)
741 				uma_zfree(q->zone, cl);
742 			m_free(m);
743 			goto done;
744 		}
745 #else
746 		cb_arg.seg.ds_addr = pmap_kextract((vm_offset_t)cl);
747 #endif
748 		sd->flags |= RX_SW_DESC_INUSE;
749 		sd->rxsd_cl = cl;
750 		sd->m = m;
751 		d->addr_lo = htobe32(cb_arg.seg.ds_addr & 0xffffffff);
752 		d->addr_hi = htobe32(((uint64_t)cb_arg.seg.ds_addr >>32) & 0xffffffff);
753 		d->len_gen = htobe32(V_FLD_GEN1(q->gen));
754 		d->gen2 = htobe32(V_FLD_GEN2(q->gen));
755 
756 		d++;
757 		sd++;
758 
759 		if (++q->pidx == q->size) {
760 			q->pidx = 0;
761 			q->gen ^= 1;
762 			sd = q->sdesc;
763 			d = q->desc;
764 		}
765 		q->credits++;
766 		q->db_pending++;
767 	}
768 
769 done:
770 	if (q->db_pending >= 32) {
771 		q->db_pending = 0;
772 		t3_write_reg(sc, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
773 	}
774 }
775 
776 
777 /**
778  *	free_rx_bufs - free the Rx buffers on an SGE free list
779  *	@sc: the controle softc
780  *	@q: the SGE free list to clean up
781  *
782  *	Release the buffers on an SGE free-buffer Rx queue.  HW fetching from
783  *	this queue should be stopped before calling this function.
784  */
785 static void
786 free_rx_bufs(adapter_t *sc, struct sge_fl *q)
787 {
788 	u_int cidx = q->cidx;
789 
790 	while (q->credits--) {
791 		struct rx_sw_desc *d = &q->sdesc[cidx];
792 
793 		if (d->flags & RX_SW_DESC_INUSE) {
794 			bus_dmamap_unload(q->entry_tag, d->map);
795 			bus_dmamap_destroy(q->entry_tag, d->map);
796 			if (q->zone == zone_pack) {
797 				m_init(d->m, M_NOWAIT, MT_DATA, M_EXT);
798 				uma_zfree(zone_pack, d->m);
799 			} else {
800 				m_init(d->m, M_NOWAIT, MT_DATA, 0);
801 				uma_zfree(zone_mbuf, d->m);
802 				uma_zfree(q->zone, d->rxsd_cl);
803 			}
804 		}
805 
806 		d->rxsd_cl = NULL;
807 		d->m = NULL;
808 		if (++cidx == q->size)
809 			cidx = 0;
810 	}
811 }
812 
813 static __inline void
814 __refill_fl(adapter_t *adap, struct sge_fl *fl)
815 {
816 	refill_fl(adap, fl, min(16U, fl->size - fl->credits));
817 }
818 
819 static __inline void
820 __refill_fl_lt(adapter_t *adap, struct sge_fl *fl, int max)
821 {
822 	uint32_t reclaimable = fl->size - fl->credits;
823 
824 	if (reclaimable > 0)
825 		refill_fl(adap, fl, min(max, reclaimable));
826 }
827 
828 /**
829  *	recycle_rx_buf - recycle a receive buffer
830  *	@adapter: the adapter
831  *	@q: the SGE free list
832  *	@idx: index of buffer to recycle
833  *
834  *	Recycles the specified buffer on the given free list by adding it at
835  *	the next available slot on the list.
836  */
837 static void
838 recycle_rx_buf(adapter_t *adap, struct sge_fl *q, unsigned int idx)
839 {
840 	struct rx_desc *from = &q->desc[idx];
841 	struct rx_desc *to   = &q->desc[q->pidx];
842 
843 	q->sdesc[q->pidx] = q->sdesc[idx];
844 	to->addr_lo = from->addr_lo;        // already big endian
845 	to->addr_hi = from->addr_hi;        // likewise
846 	wmb();	/* necessary ? */
847 	to->len_gen = htobe32(V_FLD_GEN1(q->gen));
848 	to->gen2 = htobe32(V_FLD_GEN2(q->gen));
849 	q->credits++;
850 
851 	if (++q->pidx == q->size) {
852 		q->pidx = 0;
853 		q->gen ^= 1;
854 	}
855 	t3_write_reg(adap, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
856 }
857 
858 static void
859 alloc_ring_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
860 {
861 	uint32_t *addr;
862 
863 	addr = arg;
864 	*addr = segs[0].ds_addr;
865 }
866 
867 static int
868 alloc_ring(adapter_t *sc, size_t nelem, size_t elem_size, size_t sw_size,
869     bus_addr_t *phys, void *desc, void *sdesc, bus_dma_tag_t *tag,
870     bus_dmamap_t *map, bus_dma_tag_t parent_entry_tag, bus_dma_tag_t *entry_tag)
871 {
872 	size_t len = nelem * elem_size;
873 	void *s = NULL;
874 	void *p = NULL;
875 	int err;
876 
877 	if ((err = bus_dma_tag_create(sc->parent_dmat, PAGE_SIZE, 0,
878 				      BUS_SPACE_MAXADDR_32BIT,
879 				      BUS_SPACE_MAXADDR, NULL, NULL, len, 1,
880 				      len, 0, NULL, NULL, tag)) != 0) {
881 		device_printf(sc->dev, "Cannot allocate descriptor tag\n");
882 		return (ENOMEM);
883 	}
884 
885 	if ((err = bus_dmamem_alloc(*tag, (void **)&p, BUS_DMA_NOWAIT,
886 				    map)) != 0) {
887 		device_printf(sc->dev, "Cannot allocate descriptor memory\n");
888 		return (ENOMEM);
889 	}
890 
891 	bus_dmamap_load(*tag, *map, p, len, alloc_ring_cb, phys, 0);
892 	bzero(p, len);
893 	*(void **)desc = p;
894 
895 	if (sw_size) {
896 		len = nelem * sw_size;
897 		s = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO);
898 		*(void **)sdesc = s;
899 	}
900 	if (parent_entry_tag == NULL)
901 		return (0);
902 
903 	if ((err = bus_dma_tag_create(parent_entry_tag, 1, 0,
904 				      BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR,
905 		                      NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
906 				      TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
907 		                      NULL, NULL, entry_tag)) != 0) {
908 		device_printf(sc->dev, "Cannot allocate descriptor entry tag\n");
909 		return (ENOMEM);
910 	}
911 	return (0);
912 }
913 
914 static void
915 sge_slow_intr_handler(void *arg, int ncount)
916 {
917 	adapter_t *sc = arg;
918 
919 	t3_slow_intr_handler(sc);
920 	t3_write_reg(sc, A_PL_INT_ENABLE0, sc->slow_intr_mask);
921 	(void) t3_read_reg(sc, A_PL_INT_ENABLE0);
922 }
923 
924 /**
925  *	sge_timer_cb - perform periodic maintenance of an SGE qset
926  *	@data: the SGE queue set to maintain
927  *
928  *	Runs periodically from a timer to perform maintenance of an SGE queue
929  *	set.  It performs two tasks:
930  *
931  *	a) Cleans up any completed Tx descriptors that may still be pending.
932  *	Normal descriptor cleanup happens when new packets are added to a Tx
933  *	queue so this timer is relatively infrequent and does any cleanup only
934  *	if the Tx queue has not seen any new packets in a while.  We make a
935  *	best effort attempt to reclaim descriptors, in that we don't wait
936  *	around if we cannot get a queue's lock (which most likely is because
937  *	someone else is queueing new packets and so will also handle the clean
938  *	up).  Since control queues use immediate data exclusively we don't
939  *	bother cleaning them up here.
940  *
941  *	b) Replenishes Rx queues that have run out due to memory shortage.
942  *	Normally new Rx buffers are added when existing ones are consumed but
943  *	when out of memory a queue can become empty.  We try to add only a few
944  *	buffers here, the queue will be replenished fully as these new buffers
945  *	are used up if memory shortage has subsided.
946  *
947  *	c) Return coalesced response queue credits in case a response queue is
948  *	starved.
949  *
950  *	d) Ring doorbells for T304 tunnel queues since we have seen doorbell
951  *	fifo overflows and the FW doesn't implement any recovery scheme yet.
952  */
953 static void
954 sge_timer_cb(void *arg)
955 {
956 	adapter_t *sc = arg;
957 	if ((sc->flags & USING_MSIX) == 0) {
958 
959 		struct port_info *pi;
960 		struct sge_qset *qs;
961 		struct sge_txq  *txq;
962 		int i, j;
963 		int reclaim_ofl, refill_rx;
964 
965 		if (sc->open_device_map == 0)
966 			return;
967 
968 		for (i = 0; i < sc->params.nports; i++) {
969 			pi = &sc->port[i];
970 			for (j = 0; j < pi->nqsets; j++) {
971 				qs = &sc->sge.qs[pi->first_qset + j];
972 				txq = &qs->txq[0];
973 				reclaim_ofl = txq[TXQ_OFLD].processed - txq[TXQ_OFLD].cleaned;
974 				refill_rx = ((qs->fl[0].credits < qs->fl[0].size) ||
975 				    (qs->fl[1].credits < qs->fl[1].size));
976 				if (reclaim_ofl || refill_rx) {
977 					taskqueue_enqueue(sc->tq, &pi->timer_reclaim_task);
978 					break;
979 				}
980 			}
981 		}
982 	}
983 
984 	if (sc->params.nports > 2) {
985 		int i;
986 
987 		for_each_port(sc, i) {
988 			struct port_info *pi = &sc->port[i];
989 
990 			t3_write_reg(sc, A_SG_KDOORBELL,
991 				     F_SELEGRCNTX |
992 				     (FW_TUNNEL_SGEEC_START + pi->first_qset));
993 		}
994 	}
995 	if (((sc->flags & USING_MSIX) == 0 || sc->params.nports > 2) &&
996 	    sc->open_device_map != 0)
997 		callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
998 }
999 
1000 /*
1001  * This is meant to be a catch-all function to keep sge state private
1002  * to sge.c
1003  *
1004  */
1005 int
1006 t3_sge_init_adapter(adapter_t *sc)
1007 {
1008 	callout_init(&sc->sge_timer_ch, 1);
1009 	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
1010 	TASK_INIT(&sc->slow_intr_task, 0, sge_slow_intr_handler, sc);
1011 	return (0);
1012 }
1013 
1014 int
1015 t3_sge_reset_adapter(adapter_t *sc)
1016 {
1017 	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
1018 	return (0);
1019 }
1020 
1021 int
1022 t3_sge_init_port(struct port_info *pi)
1023 {
1024 	TASK_INIT(&pi->timer_reclaim_task, 0, sge_timer_reclaim, pi);
1025 	return (0);
1026 }
1027 
1028 /**
1029  *	refill_rspq - replenish an SGE response queue
1030  *	@adapter: the adapter
1031  *	@q: the response queue to replenish
1032  *	@credits: how many new responses to make available
1033  *
1034  *	Replenishes a response queue by making the supplied number of responses
1035  *	available to HW.
1036  */
1037 static __inline void
1038 refill_rspq(adapter_t *sc, const struct sge_rspq *q, u_int credits)
1039 {
1040 
1041 	/* mbufs are allocated on demand when a rspq entry is processed. */
1042 	t3_write_reg(sc, A_SG_RSPQ_CREDIT_RETURN,
1043 		     V_RSPQ(q->cntxt_id) | V_CREDITS(credits));
1044 }
1045 
1046 static void
1047 sge_txq_reclaim_handler(void *arg, int ncount)
1048 {
1049 	struct sge_qset *qs = arg;
1050 	int i;
1051 
1052 	for (i = 0; i < 3; i++)
1053 		reclaim_completed_tx(qs, 16, i);
1054 }
1055 
1056 static void
1057 sge_timer_reclaim(void *arg, int ncount)
1058 {
1059 	struct port_info *pi = arg;
1060 	int i, nqsets = pi->nqsets;
1061 	adapter_t *sc = pi->adapter;
1062 	struct sge_qset *qs;
1063 	struct mtx *lock;
1064 
1065 	KASSERT((sc->flags & USING_MSIX) == 0,
1066 	    ("can't call timer reclaim for msi-x"));
1067 
1068 	for (i = 0; i < nqsets; i++) {
1069 		qs = &sc->sge.qs[pi->first_qset + i];
1070 
1071 		reclaim_completed_tx(qs, 16, TXQ_OFLD);
1072 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
1073 			    &sc->sge.qs[0].rspq.lock;
1074 
1075 		if (mtx_trylock(lock)) {
1076 			/* XXX currently assume that we are *NOT* polling */
1077 			uint32_t status = t3_read_reg(sc, A_SG_RSPQ_FL_STATUS);
1078 
1079 			if (qs->fl[0].credits < qs->fl[0].size - 16)
1080 				__refill_fl(sc, &qs->fl[0]);
1081 			if (qs->fl[1].credits < qs->fl[1].size - 16)
1082 				__refill_fl(sc, &qs->fl[1]);
1083 
1084 			if (status & (1 << qs->rspq.cntxt_id)) {
1085 				if (qs->rspq.credits) {
1086 					refill_rspq(sc, &qs->rspq, 1);
1087 					qs->rspq.credits--;
1088 					t3_write_reg(sc, A_SG_RSPQ_FL_STATUS,
1089 					    1 << qs->rspq.cntxt_id);
1090 				}
1091 			}
1092 			mtx_unlock(lock);
1093 		}
1094 	}
1095 }
1096 
1097 /**
1098  *	init_qset_cntxt - initialize an SGE queue set context info
1099  *	@qs: the queue set
1100  *	@id: the queue set id
1101  *
1102  *	Initializes the TIDs and context ids for the queues of a queue set.
1103  */
1104 static void
1105 init_qset_cntxt(struct sge_qset *qs, u_int id)
1106 {
1107 
1108 	qs->rspq.cntxt_id = id;
1109 	qs->fl[0].cntxt_id = 2 * id;
1110 	qs->fl[1].cntxt_id = 2 * id + 1;
1111 	qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id;
1112 	qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id;
1113 	qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id;
1114 	qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id;
1115 	qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id;
1116 
1117 	/* XXX: a sane limit is needed instead of INT_MAX */
1118 	mbufq_init(&qs->txq[TXQ_ETH].sendq, INT_MAX);
1119 	mbufq_init(&qs->txq[TXQ_OFLD].sendq, INT_MAX);
1120 	mbufq_init(&qs->txq[TXQ_CTRL].sendq, INT_MAX);
1121 }
1122 
1123 
1124 static void
1125 txq_prod(struct sge_txq *txq, unsigned int ndesc, struct txq_state *txqs)
1126 {
1127 	txq->in_use += ndesc;
1128 	/*
1129 	 * XXX we don't handle stopping of queue
1130 	 * presumably start handles this when we bump against the end
1131 	 */
1132 	txqs->gen = txq->gen;
1133 	txq->unacked += ndesc;
1134 	txqs->compl = (txq->unacked & 32) << (S_WR_COMPL - 5);
1135 	txq->unacked &= 31;
1136 	txqs->pidx = txq->pidx;
1137 	txq->pidx += ndesc;
1138 #ifdef INVARIANTS
1139 	if (((txqs->pidx > txq->cidx) &&
1140 		(txq->pidx < txqs->pidx) &&
1141 		(txq->pidx >= txq->cidx)) ||
1142 	    ((txqs->pidx < txq->cidx) &&
1143 		(txq->pidx >= txq-> cidx)) ||
1144 	    ((txqs->pidx < txq->cidx) &&
1145 		(txq->cidx < txqs->pidx)))
1146 		panic("txqs->pidx=%d txq->pidx=%d txq->cidx=%d",
1147 		    txqs->pidx, txq->pidx, txq->cidx);
1148 #endif
1149 	if (txq->pidx >= txq->size) {
1150 		txq->pidx -= txq->size;
1151 		txq->gen ^= 1;
1152 	}
1153 
1154 }
1155 
1156 /**
1157  *	calc_tx_descs - calculate the number of Tx descriptors for a packet
1158  *	@m: the packet mbufs
1159  *      @nsegs: the number of segments
1160  *
1161  * 	Returns the number of Tx descriptors needed for the given Ethernet
1162  * 	packet.  Ethernet packets require addition of WR and CPL headers.
1163  */
1164 static __inline unsigned int
1165 calc_tx_descs(const struct mbuf *m, int nsegs)
1166 {
1167 	unsigned int flits;
1168 
1169 	if (m->m_pkthdr.len <= PIO_LEN)
1170 		return 1;
1171 
1172 	flits = sgl_len(nsegs) + 2;
1173 	if (m->m_pkthdr.csum_flags & CSUM_TSO)
1174 		flits++;
1175 
1176 	return flits_to_desc(flits);
1177 }
1178 
1179 /**
1180  *	make_sgl - populate a scatter/gather list for a packet
1181  *	@sgp: the SGL to populate
1182  *	@segs: the packet dma segments
1183  *	@nsegs: the number of segments
1184  *
1185  *	Generates a scatter/gather list for the buffers that make up a packet
1186  *	and returns the SGL size in 8-byte words.  The caller must size the SGL
1187  *	appropriately.
1188  */
1189 static __inline void
1190 make_sgl(struct sg_ent *sgp, bus_dma_segment_t *segs, int nsegs)
1191 {
1192 	int i, idx;
1193 
1194 	for (idx = 0, i = 0; i < nsegs; i++) {
1195 		/*
1196 		 * firmware doesn't like empty segments
1197 		 */
1198 		if (segs[i].ds_len == 0)
1199 			continue;
1200 		if (i && idx == 0)
1201 			++sgp;
1202 
1203 		sgp->len[idx] = htobe32(segs[i].ds_len);
1204 		sgp->addr[idx] = htobe64(segs[i].ds_addr);
1205 		idx ^= 1;
1206 	}
1207 
1208 	if (idx) {
1209 		sgp->len[idx] = 0;
1210 		sgp->addr[idx] = 0;
1211 	}
1212 }
1213 
1214 /**
1215  *	check_ring_tx_db - check and potentially ring a Tx queue's doorbell
1216  *	@adap: the adapter
1217  *	@q: the Tx queue
1218  *
1219  *	Ring the doorbell if a Tx queue is asleep.  There is a natural race,
1220  *	where the HW is going to sleep just after we checked, however,
1221  *	then the interrupt handler will detect the outstanding TX packet
1222  *	and ring the doorbell for us.
1223  *
1224  *	When GTS is disabled we unconditionally ring the doorbell.
1225  */
1226 static __inline void
1227 check_ring_tx_db(adapter_t *adap, struct sge_txq *q, int mustring)
1228 {
1229 #if USE_GTS
1230 	clear_bit(TXQ_LAST_PKT_DB, &q->flags);
1231 	if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) {
1232 		set_bit(TXQ_LAST_PKT_DB, &q->flags);
1233 #ifdef T3_TRACE
1234 		T3_TRACE1(adap->tb[q->cntxt_id & 7], "doorbell Tx, cntxt %d",
1235 			  q->cntxt_id);
1236 #endif
1237 		t3_write_reg(adap, A_SG_KDOORBELL,
1238 			     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1239 	}
1240 #else
1241 	if (mustring || ++q->db_pending >= 32) {
1242 		wmb();            /* write descriptors before telling HW */
1243 		t3_write_reg(adap, A_SG_KDOORBELL,
1244 		    F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1245 		q->db_pending = 0;
1246 	}
1247 #endif
1248 }
1249 
1250 static __inline void
1251 wr_gen2(struct tx_desc *d, unsigned int gen)
1252 {
1253 #if SGE_NUM_GENBITS == 2
1254 	d->flit[TX_DESC_FLITS - 1] = htobe64(gen);
1255 #endif
1256 }
1257 
1258 /**
1259  *	write_wr_hdr_sgl - write a WR header and, optionally, SGL
1260  *	@ndesc: number of Tx descriptors spanned by the SGL
1261  *	@txd: first Tx descriptor to be written
1262  *	@txqs: txq state (generation and producer index)
1263  *	@txq: the SGE Tx queue
1264  *	@sgl: the SGL
1265  *	@flits: number of flits to the start of the SGL in the first descriptor
1266  *	@sgl_flits: the SGL size in flits
1267  *	@wr_hi: top 32 bits of WR header based on WR type (big endian)
1268  *	@wr_lo: low 32 bits of WR header based on WR type (big endian)
1269  *
1270  *	Write a work request header and an associated SGL.  If the SGL is
1271  *	small enough to fit into one Tx descriptor it has already been written
1272  *	and we just need to write the WR header.  Otherwise we distribute the
1273  *	SGL across the number of descriptors it spans.
1274  */
1275 static void
1276 write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs,
1277     const struct sge_txq *txq, const struct sg_ent *sgl, unsigned int flits,
1278     unsigned int sgl_flits, unsigned int wr_hi, unsigned int wr_lo)
1279 {
1280 
1281 	struct work_request_hdr *wrp = (struct work_request_hdr *)txd;
1282 	struct tx_sw_desc *txsd = &txq->sdesc[txqs->pidx];
1283 
1284 	if (__predict_true(ndesc == 1)) {
1285 		set_wr_hdr(wrp, htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1286 		    V_WR_SGLSFLT(flits)) | wr_hi,
1287 		    htonl(V_WR_LEN(flits + sgl_flits) | V_WR_GEN(txqs->gen)) |
1288 		    wr_lo);
1289 
1290 		wr_gen2(txd, txqs->gen);
1291 
1292 	} else {
1293 		unsigned int ogen = txqs->gen;
1294 		const uint64_t *fp = (const uint64_t *)sgl;
1295 		struct work_request_hdr *wp = wrp;
1296 
1297 		wrp->wrh_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) |
1298 		    V_WR_SGLSFLT(flits)) | wr_hi;
1299 
1300 		while (sgl_flits) {
1301 			unsigned int avail = WR_FLITS - flits;
1302 
1303 			if (avail > sgl_flits)
1304 				avail = sgl_flits;
1305 			memcpy(&txd->flit[flits], fp, avail * sizeof(*fp));
1306 			sgl_flits -= avail;
1307 			ndesc--;
1308 			if (!sgl_flits)
1309 				break;
1310 
1311 			fp += avail;
1312 			txd++;
1313 			txsd++;
1314 			if (++txqs->pidx == txq->size) {
1315 				txqs->pidx = 0;
1316 				txqs->gen ^= 1;
1317 				txd = txq->desc;
1318 				txsd = txq->sdesc;
1319 			}
1320 
1321 			/*
1322 			 * when the head of the mbuf chain
1323 			 * is freed all clusters will be freed
1324 			 * with it
1325 			 */
1326 			wrp = (struct work_request_hdr *)txd;
1327 			wrp->wrh_hi = htonl(V_WR_DATATYPE(1) |
1328 			    V_WR_SGLSFLT(1)) | wr_hi;
1329 			wrp->wrh_lo = htonl(V_WR_LEN(min(WR_FLITS,
1330 				    sgl_flits + 1)) |
1331 			    V_WR_GEN(txqs->gen)) | wr_lo;
1332 			wr_gen2(txd, txqs->gen);
1333 			flits = 1;
1334 		}
1335 		wrp->wrh_hi |= htonl(F_WR_EOP);
1336 		wmb();
1337 		wp->wrh_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo;
1338 		wr_gen2((struct tx_desc *)wp, ogen);
1339 	}
1340 }
1341 
1342 /* sizeof(*eh) + sizeof(*ip) + sizeof(*tcp) */
1343 #define TCPPKTHDRSIZE (ETHER_HDR_LEN + 20 + 20)
1344 
1345 #define GET_VTAG(cntrl, m) \
1346 do { \
1347 	if ((m)->m_flags & M_VLANTAG)					            \
1348 		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN((m)->m_pkthdr.ether_vtag); \
1349 } while (0)
1350 
1351 static int
1352 t3_encap(struct sge_qset *qs, struct mbuf **m)
1353 {
1354 	adapter_t *sc;
1355 	struct mbuf *m0;
1356 	struct sge_txq *txq;
1357 	struct txq_state txqs;
1358 	struct port_info *pi;
1359 	unsigned int ndesc, flits, cntrl, mlen;
1360 	int err, nsegs, tso_info = 0;
1361 
1362 	struct work_request_hdr *wrp;
1363 	struct tx_sw_desc *txsd;
1364 	struct sg_ent *sgp, *sgl;
1365 	uint32_t wr_hi, wr_lo, sgl_flits;
1366 	bus_dma_segment_t segs[TX_MAX_SEGS];
1367 
1368 	struct tx_desc *txd;
1369 
1370 	pi = qs->port;
1371 	sc = pi->adapter;
1372 	txq = &qs->txq[TXQ_ETH];
1373 	txd = &txq->desc[txq->pidx];
1374 	txsd = &txq->sdesc[txq->pidx];
1375 	sgl = txq->txq_sgl;
1376 
1377 	prefetch(txd);
1378 	m0 = *m;
1379 
1380 	mtx_assert(&qs->lock, MA_OWNED);
1381 	cntrl = V_TXPKT_INTF(pi->txpkt_intf);
1382 	KASSERT(m0->m_flags & M_PKTHDR, ("not packet header\n"));
1383 
1384 	if  (m0->m_nextpkt == NULL && m0->m_next != NULL &&
1385 	    m0->m_pkthdr.csum_flags & (CSUM_TSO))
1386 		tso_info = V_LSO_MSS(m0->m_pkthdr.tso_segsz);
1387 
1388 	if (m0->m_nextpkt != NULL) {
1389 		busdma_map_sg_vec(txq->entry_tag, txsd->map, m0, segs, &nsegs);
1390 		ndesc = 1;
1391 		mlen = 0;
1392 	} else {
1393 		if ((err = busdma_map_sg_collapse(txq->entry_tag, txsd->map,
1394 		    &m0, segs, &nsegs))) {
1395 			if (cxgb_debug)
1396 				printf("failed ... err=%d\n", err);
1397 			return (err);
1398 		}
1399 		mlen = m0->m_pkthdr.len;
1400 		ndesc = calc_tx_descs(m0, nsegs);
1401 	}
1402 	txq_prod(txq, ndesc, &txqs);
1403 
1404 	KASSERT(m0->m_pkthdr.len, ("empty packet nsegs=%d", nsegs));
1405 	txsd->m = m0;
1406 
1407 	if (m0->m_nextpkt != NULL) {
1408 		struct cpl_tx_pkt_batch *cpl_batch = (struct cpl_tx_pkt_batch *)txd;
1409 		int i, fidx;
1410 
1411 		if (nsegs > 7)
1412 			panic("trying to coalesce %d packets in to one WR", nsegs);
1413 		txq->txq_coalesced += nsegs;
1414 		wrp = (struct work_request_hdr *)txd;
1415 		flits = nsegs*2 + 1;
1416 
1417 		for (fidx = 1, i = 0; i < nsegs; i++, fidx += 2) {
1418 			struct cpl_tx_pkt_batch_entry *cbe;
1419 			uint64_t flit;
1420 			uint32_t *hflit = (uint32_t *)&flit;
1421 			int cflags = m0->m_pkthdr.csum_flags;
1422 
1423 			cntrl = V_TXPKT_INTF(pi->txpkt_intf);
1424 			GET_VTAG(cntrl, m0);
1425 			cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1426 			if (__predict_false(!(cflags & CSUM_IP)))
1427 				cntrl |= F_TXPKT_IPCSUM_DIS;
1428 			if (__predict_false(!(cflags & (CSUM_TCP | CSUM_UDP |
1429 			    CSUM_UDP_IPV6 | CSUM_TCP_IPV6))))
1430 				cntrl |= F_TXPKT_L4CSUM_DIS;
1431 
1432 			hflit[0] = htonl(cntrl);
1433 			hflit[1] = htonl(segs[i].ds_len | 0x80000000);
1434 			flit |= htobe64(1 << 24);
1435 			cbe = &cpl_batch->pkt_entry[i];
1436 			cbe->cntrl = hflit[0];
1437 			cbe->len = hflit[1];
1438 			cbe->addr = htobe64(segs[i].ds_addr);
1439 		}
1440 
1441 		wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1442 		    V_WR_SGLSFLT(flits)) |
1443 		    htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1444 		wr_lo = htonl(V_WR_LEN(flits) |
1445 		    V_WR_GEN(txqs.gen)) | htonl(V_WR_TID(txq->token));
1446 		set_wr_hdr(wrp, wr_hi, wr_lo);
1447 		wmb();
1448 		ETHER_BPF_MTAP(pi->ifp, m0);
1449 		wr_gen2(txd, txqs.gen);
1450 		check_ring_tx_db(sc, txq, 0);
1451 		return (0);
1452 	} else if (tso_info) {
1453 		uint16_t eth_type;
1454 		struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *)txd;
1455 		struct ether_header *eh;
1456 		void *l3hdr;
1457 		struct tcphdr *tcp;
1458 
1459 		txd->flit[2] = 0;
1460 		GET_VTAG(cntrl, m0);
1461 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO);
1462 		hdr->cntrl = htonl(cntrl);
1463 		hdr->len = htonl(mlen | 0x80000000);
1464 
1465 		if (__predict_false(mlen < TCPPKTHDRSIZE)) {
1466 			printf("mbuf=%p,len=%d,tso_segsz=%d,csum_flags=%b,flags=%#x",
1467 			    m0, mlen, m0->m_pkthdr.tso_segsz,
1468 			    (int)m0->m_pkthdr.csum_flags, CSUM_BITS, m0->m_flags);
1469 			panic("tx tso packet too small");
1470 		}
1471 
1472 		/* Make sure that ether, ip, tcp headers are all in m0 */
1473 		if (__predict_false(m0->m_len < TCPPKTHDRSIZE)) {
1474 			m0 = m_pullup(m0, TCPPKTHDRSIZE);
1475 			if (__predict_false(m0 == NULL)) {
1476 				/* XXX panic probably an overreaction */
1477 				panic("couldn't fit header into mbuf");
1478 			}
1479 		}
1480 
1481 		eh = mtod(m0, struct ether_header *);
1482 		eth_type = eh->ether_type;
1483 		if (eth_type == htons(ETHERTYPE_VLAN)) {
1484 			struct ether_vlan_header *evh = (void *)eh;
1485 
1486 			tso_info |= V_LSO_ETH_TYPE(CPL_ETH_II_VLAN);
1487 			l3hdr = evh + 1;
1488 			eth_type = evh->evl_proto;
1489 		} else {
1490 			tso_info |= V_LSO_ETH_TYPE(CPL_ETH_II);
1491 			l3hdr = eh + 1;
1492 		}
1493 
1494 		if (eth_type == htons(ETHERTYPE_IP)) {
1495 			struct ip *ip = l3hdr;
1496 
1497 			tso_info |= V_LSO_IPHDR_WORDS(ip->ip_hl);
1498 			tcp = (struct tcphdr *)(ip + 1);
1499 		} else if (eth_type == htons(ETHERTYPE_IPV6)) {
1500 			struct ip6_hdr *ip6 = l3hdr;
1501 
1502 			KASSERT(ip6->ip6_nxt == IPPROTO_TCP,
1503 			    ("%s: CSUM_TSO with ip6_nxt %d",
1504 			    __func__, ip6->ip6_nxt));
1505 
1506 			tso_info |= F_LSO_IPV6;
1507 			tso_info |= V_LSO_IPHDR_WORDS(sizeof(*ip6) >> 2);
1508 			tcp = (struct tcphdr *)(ip6 + 1);
1509 		} else
1510 			panic("%s: CSUM_TSO but neither ip nor ip6", __func__);
1511 
1512 		tso_info |= V_LSO_TCPHDR_WORDS(tcp->th_off);
1513 		hdr->lso_info = htonl(tso_info);
1514 
1515 		if (__predict_false(mlen <= PIO_LEN)) {
1516 			/*
1517 			 * pkt not undersized but fits in PIO_LEN
1518 			 * Indicates a TSO bug at the higher levels.
1519 			 */
1520 			txsd->m = NULL;
1521 			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[3]);
1522 			flits = (mlen + 7) / 8 + 3;
1523 			wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1524 					  V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1525 					  F_WR_SOP | F_WR_EOP | txqs.compl);
1526 			wr_lo = htonl(V_WR_LEN(flits) |
1527 			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1528 			set_wr_hdr(&hdr->wr, wr_hi, wr_lo);
1529 			wmb();
1530 			ETHER_BPF_MTAP(pi->ifp, m0);
1531 			wr_gen2(txd, txqs.gen);
1532 			check_ring_tx_db(sc, txq, 0);
1533 			m_freem(m0);
1534 			return (0);
1535 		}
1536 		flits = 3;
1537 	} else {
1538 		struct cpl_tx_pkt *cpl = (struct cpl_tx_pkt *)txd;
1539 
1540 		GET_VTAG(cntrl, m0);
1541 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1542 		if (__predict_false(!(m0->m_pkthdr.csum_flags & CSUM_IP)))
1543 			cntrl |= F_TXPKT_IPCSUM_DIS;
1544 		if (__predict_false(!(m0->m_pkthdr.csum_flags & (CSUM_TCP |
1545 		    CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6))))
1546 			cntrl |= F_TXPKT_L4CSUM_DIS;
1547 		cpl->cntrl = htonl(cntrl);
1548 		cpl->len = htonl(mlen | 0x80000000);
1549 
1550 		if (mlen <= PIO_LEN) {
1551 			txsd->m = NULL;
1552 			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[2]);
1553 			flits = (mlen + 7) / 8 + 2;
1554 
1555 			wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1556 			    V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1557 					  F_WR_SOP | F_WR_EOP | txqs.compl);
1558 			wr_lo = htonl(V_WR_LEN(flits) |
1559 			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1560 			set_wr_hdr(&cpl->wr, wr_hi, wr_lo);
1561 			wmb();
1562 			ETHER_BPF_MTAP(pi->ifp, m0);
1563 			wr_gen2(txd, txqs.gen);
1564 			check_ring_tx_db(sc, txq, 0);
1565 			m_freem(m0);
1566 			return (0);
1567 		}
1568 		flits = 2;
1569 	}
1570 	wrp = (struct work_request_hdr *)txd;
1571 	sgp = (ndesc == 1) ? (struct sg_ent *)&txd->flit[flits] : sgl;
1572 	make_sgl(sgp, segs, nsegs);
1573 
1574 	sgl_flits = sgl_len(nsegs);
1575 
1576 	ETHER_BPF_MTAP(pi->ifp, m0);
1577 
1578 	KASSERT(ndesc <= 4, ("ndesc too large %d", ndesc));
1579 	wr_hi = htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1580 	wr_lo = htonl(V_WR_TID(txq->token));
1581 	write_wr_hdr_sgl(ndesc, txd, &txqs, txq, sgl, flits,
1582 	    sgl_flits, wr_hi, wr_lo);
1583 	check_ring_tx_db(sc, txq, 0);
1584 
1585 	return (0);
1586 }
1587 
1588 void
1589 cxgb_tx_watchdog(void *arg)
1590 {
1591 	struct sge_qset *qs = arg;
1592 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1593 
1594         if (qs->coalescing != 0 &&
1595 	    (txq->in_use <= cxgb_tx_coalesce_enable_stop) &&
1596 	    TXQ_RING_EMPTY(qs))
1597                 qs->coalescing = 0;
1598         else if (qs->coalescing == 0 &&
1599 	    (txq->in_use >= cxgb_tx_coalesce_enable_start))
1600                 qs->coalescing = 1;
1601 	if (TXQ_TRYLOCK(qs)) {
1602 		qs->qs_flags |= QS_FLUSHING;
1603 		cxgb_start_locked(qs);
1604 		qs->qs_flags &= ~QS_FLUSHING;
1605 		TXQ_UNLOCK(qs);
1606 	}
1607 	if (qs->port->ifp->if_drv_flags & IFF_DRV_RUNNING)
1608 		callout_reset_on(&txq->txq_watchdog, hz/4, cxgb_tx_watchdog,
1609 		    qs, txq->txq_watchdog.c_cpu);
1610 }
1611 
1612 static void
1613 cxgb_tx_timeout(void *arg)
1614 {
1615 	struct sge_qset *qs = arg;
1616 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1617 
1618 	if (qs->coalescing == 0 && (txq->in_use >= (txq->size>>3)))
1619                 qs->coalescing = 1;
1620 	if (TXQ_TRYLOCK(qs)) {
1621 		qs->qs_flags |= QS_TIMEOUT;
1622 		cxgb_start_locked(qs);
1623 		qs->qs_flags &= ~QS_TIMEOUT;
1624 		TXQ_UNLOCK(qs);
1625 	}
1626 }
1627 
1628 static void
1629 cxgb_start_locked(struct sge_qset *qs)
1630 {
1631 	struct mbuf *m_head = NULL;
1632 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1633 	struct port_info *pi = qs->port;
1634 	struct ifnet *ifp = pi->ifp;
1635 
1636 	if (qs->qs_flags & (QS_FLUSHING|QS_TIMEOUT))
1637 		reclaim_completed_tx(qs, 0, TXQ_ETH);
1638 
1639 	if (!pi->link_config.link_ok) {
1640 		TXQ_RING_FLUSH(qs);
1641 		return;
1642 	}
1643 	TXQ_LOCK_ASSERT(qs);
1644 	while (!TXQ_RING_EMPTY(qs) && (ifp->if_drv_flags & IFF_DRV_RUNNING) &&
1645 	    pi->link_config.link_ok) {
1646 		reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH);
1647 
1648 		if (txq->size - txq->in_use <= TX_MAX_DESC)
1649 			break;
1650 
1651 		if ((m_head = cxgb_dequeue(qs)) == NULL)
1652 			break;
1653 		/*
1654 		 *  Encapsulation can modify our pointer, and or make it
1655 		 *  NULL on failure.  In that event, we can't requeue.
1656 		 */
1657 		if (t3_encap(qs, &m_head) || m_head == NULL)
1658 			break;
1659 
1660 		m_head = NULL;
1661 	}
1662 
1663 	if (txq->db_pending)
1664 		check_ring_tx_db(pi->adapter, txq, 1);
1665 
1666 	if (!TXQ_RING_EMPTY(qs) && callout_pending(&txq->txq_timer) == 0 &&
1667 	    pi->link_config.link_ok)
1668 		callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout,
1669 		    qs, txq->txq_timer.c_cpu);
1670 	if (m_head != NULL)
1671 		m_freem(m_head);
1672 }
1673 
1674 static int
1675 cxgb_transmit_locked(struct ifnet *ifp, struct sge_qset *qs, struct mbuf *m)
1676 {
1677 	struct port_info *pi = qs->port;
1678 	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1679 	struct buf_ring *br = txq->txq_mr;
1680 	int error, avail;
1681 
1682 	avail = txq->size - txq->in_use;
1683 	TXQ_LOCK_ASSERT(qs);
1684 
1685 	/*
1686 	 * We can only do a direct transmit if the following are true:
1687 	 * - we aren't coalescing (ring < 3/4 full)
1688 	 * - the link is up -- checked in caller
1689 	 * - there are no packets enqueued already
1690 	 * - there is space in hardware transmit queue
1691 	 */
1692 	if (check_pkt_coalesce(qs) == 0 &&
1693 	    !TXQ_RING_NEEDS_ENQUEUE(qs) && avail > TX_MAX_DESC) {
1694 		if (t3_encap(qs, &m)) {
1695 			if (m != NULL &&
1696 			    (error = drbr_enqueue(ifp, br, m)) != 0)
1697 				return (error);
1698 		} else {
1699 			if (txq->db_pending)
1700 				check_ring_tx_db(pi->adapter, txq, 1);
1701 
1702 			/*
1703 			 * We've bypassed the buf ring so we need to update
1704 			 * the stats directly
1705 			 */
1706 			txq->txq_direct_packets++;
1707 			txq->txq_direct_bytes += m->m_pkthdr.len;
1708 		}
1709 	} else if ((error = drbr_enqueue(ifp, br, m)) != 0)
1710 		return (error);
1711 
1712 	reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH);
1713 	if (!TXQ_RING_EMPTY(qs) && pi->link_config.link_ok &&
1714 	    (!check_pkt_coalesce(qs) || (drbr_inuse(ifp, br) >= 7)))
1715 		cxgb_start_locked(qs);
1716 	else if (!TXQ_RING_EMPTY(qs) && !callout_pending(&txq->txq_timer))
1717 		callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout,
1718 		    qs, txq->txq_timer.c_cpu);
1719 	return (0);
1720 }
1721 
1722 int
1723 cxgb_transmit(struct ifnet *ifp, struct mbuf *m)
1724 {
1725 	struct sge_qset *qs;
1726 	struct port_info *pi = ifp->if_softc;
1727 	int error, qidx = pi->first_qset;
1728 
1729 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0
1730 	    ||(!pi->link_config.link_ok)) {
1731 		m_freem(m);
1732 		return (0);
1733 	}
1734 
1735 	/* check if flowid is set */
1736 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
1737 		qidx = (m->m_pkthdr.flowid % pi->nqsets) + pi->first_qset;
1738 
1739 	qs = &pi->adapter->sge.qs[qidx];
1740 
1741 	if (TXQ_TRYLOCK(qs)) {
1742 		/* XXX running */
1743 		error = cxgb_transmit_locked(ifp, qs, m);
1744 		TXQ_UNLOCK(qs);
1745 	} else
1746 		error = drbr_enqueue(ifp, qs->txq[TXQ_ETH].txq_mr, m);
1747 	return (error);
1748 }
1749 
1750 void
1751 cxgb_qflush(struct ifnet *ifp)
1752 {
1753 	/*
1754 	 * flush any enqueued mbufs in the buf_rings
1755 	 * and in the transmit queues
1756 	 * no-op for now
1757 	 */
1758 	return;
1759 }
1760 
1761 /**
1762  *	write_imm - write a packet into a Tx descriptor as immediate data
1763  *	@d: the Tx descriptor to write
1764  *	@m: the packet
1765  *	@len: the length of packet data to write as immediate data
1766  *	@gen: the generation bit value to write
1767  *
1768  *	Writes a packet as immediate data into a Tx descriptor.  The packet
1769  *	contains a work request at its beginning.  We must write the packet
1770  *	carefully so the SGE doesn't read accidentally before it's written in
1771  *	its entirety.
1772  */
1773 static __inline void
1774 write_imm(struct tx_desc *d, caddr_t src,
1775 	  unsigned int len, unsigned int gen)
1776 {
1777 	struct work_request_hdr *from = (struct work_request_hdr *)src;
1778 	struct work_request_hdr *to = (struct work_request_hdr *)d;
1779 	uint32_t wr_hi, wr_lo;
1780 
1781 	KASSERT(len <= WR_LEN && len >= sizeof(*from),
1782 	    ("%s: invalid len %d", __func__, len));
1783 
1784 	memcpy(&to[1], &from[1], len - sizeof(*from));
1785 	wr_hi = from->wrh_hi | htonl(F_WR_SOP | F_WR_EOP |
1786 	    V_WR_BCNTLFLT(len & 7));
1787 	wr_lo = from->wrh_lo | htonl(V_WR_GEN(gen) | V_WR_LEN((len + 7) / 8));
1788 	set_wr_hdr(to, wr_hi, wr_lo);
1789 	wmb();
1790 	wr_gen2(d, gen);
1791 }
1792 
1793 /**
1794  *	check_desc_avail - check descriptor availability on a send queue
1795  *	@adap: the adapter
1796  *	@q: the TX queue
1797  *	@m: the packet needing the descriptors
1798  *	@ndesc: the number of Tx descriptors needed
1799  *	@qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL)
1800  *
1801  *	Checks if the requested number of Tx descriptors is available on an
1802  *	SGE send queue.  If the queue is already suspended or not enough
1803  *	descriptors are available the packet is queued for later transmission.
1804  *	Must be called with the Tx queue locked.
1805  *
1806  *	Returns 0 if enough descriptors are available, 1 if there aren't
1807  *	enough descriptors and the packet has been queued, and 2 if the caller
1808  *	needs to retry because there weren't enough descriptors at the
1809  *	beginning of the call but some freed up in the mean time.
1810  */
1811 static __inline int
1812 check_desc_avail(adapter_t *adap, struct sge_txq *q,
1813 		 struct mbuf *m, unsigned int ndesc,
1814 		 unsigned int qid)
1815 {
1816 	/*
1817 	 * XXX We currently only use this for checking the control queue
1818 	 * the control queue is only used for binding qsets which happens
1819 	 * at init time so we are guaranteed enough descriptors
1820 	 */
1821 	if (__predict_false(mbufq_len(&q->sendq))) {
1822 addq_exit:	(void )mbufq_enqueue(&q->sendq, m);
1823 		return 1;
1824 	}
1825 	if (__predict_false(q->size - q->in_use < ndesc)) {
1826 
1827 		struct sge_qset *qs = txq_to_qset(q, qid);
1828 
1829 		setbit(&qs->txq_stopped, qid);
1830 		if (should_restart_tx(q) &&
1831 		    test_and_clear_bit(qid, &qs->txq_stopped))
1832 			return 2;
1833 
1834 		q->stops++;
1835 		goto addq_exit;
1836 	}
1837 	return 0;
1838 }
1839 
1840 
1841 /**
1842  *	reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
1843  *	@q: the SGE control Tx queue
1844  *
1845  *	This is a variant of reclaim_completed_tx() that is used for Tx queues
1846  *	that send only immediate data (presently just the control queues) and
1847  *	thus do not have any mbufs
1848  */
1849 static __inline void
1850 reclaim_completed_tx_imm(struct sge_txq *q)
1851 {
1852 	unsigned int reclaim = q->processed - q->cleaned;
1853 
1854 	q->in_use -= reclaim;
1855 	q->cleaned += reclaim;
1856 }
1857 
1858 /**
1859  *	ctrl_xmit - send a packet through an SGE control Tx queue
1860  *	@adap: the adapter
1861  *	@q: the control queue
1862  *	@m: the packet
1863  *
1864  *	Send a packet through an SGE control Tx queue.  Packets sent through
1865  *	a control queue must fit entirely as immediate data in a single Tx
1866  *	descriptor and have no page fragments.
1867  */
1868 static int
1869 ctrl_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m)
1870 {
1871 	int ret;
1872 	struct work_request_hdr *wrp = mtod(m, struct work_request_hdr *);
1873 	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1874 
1875 	KASSERT(m->m_len <= WR_LEN, ("%s: bad tx data", __func__));
1876 
1877 	wrp->wrh_hi |= htonl(F_WR_SOP | F_WR_EOP);
1878 	wrp->wrh_lo = htonl(V_WR_TID(q->token));
1879 
1880 	TXQ_LOCK(qs);
1881 again:	reclaim_completed_tx_imm(q);
1882 
1883 	ret = check_desc_avail(adap, q, m, 1, TXQ_CTRL);
1884 	if (__predict_false(ret)) {
1885 		if (ret == 1) {
1886 			TXQ_UNLOCK(qs);
1887 			return (ENOSPC);
1888 		}
1889 		goto again;
1890 	}
1891 	write_imm(&q->desc[q->pidx], m->m_data, m->m_len, q->gen);
1892 
1893 	q->in_use++;
1894 	if (++q->pidx >= q->size) {
1895 		q->pidx = 0;
1896 		q->gen ^= 1;
1897 	}
1898 	TXQ_UNLOCK(qs);
1899 	wmb();
1900 	t3_write_reg(adap, A_SG_KDOORBELL,
1901 	    F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1902 
1903 	m_free(m);
1904 	return (0);
1905 }
1906 
1907 
1908 /**
1909  *	restart_ctrlq - restart a suspended control queue
1910  *	@qs: the queue set cotaining the control queue
1911  *
1912  *	Resumes transmission on a suspended Tx control queue.
1913  */
1914 static void
1915 restart_ctrlq(void *data, int npending)
1916 {
1917 	struct mbuf *m;
1918 	struct sge_qset *qs = (struct sge_qset *)data;
1919 	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1920 	adapter_t *adap = qs->port->adapter;
1921 
1922 	TXQ_LOCK(qs);
1923 again:	reclaim_completed_tx_imm(q);
1924 
1925 	while (q->in_use < q->size &&
1926 	       (m = mbufq_dequeue(&q->sendq)) != NULL) {
1927 
1928 		write_imm(&q->desc[q->pidx], m->m_data, m->m_len, q->gen);
1929 		m_free(m);
1930 
1931 		if (++q->pidx >= q->size) {
1932 			q->pidx = 0;
1933 			q->gen ^= 1;
1934 		}
1935 		q->in_use++;
1936 	}
1937 	if (mbufq_len(&q->sendq)) {
1938 		setbit(&qs->txq_stopped, TXQ_CTRL);
1939 
1940 		if (should_restart_tx(q) &&
1941 		    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped))
1942 			goto again;
1943 		q->stops++;
1944 	}
1945 	TXQ_UNLOCK(qs);
1946 	t3_write_reg(adap, A_SG_KDOORBELL,
1947 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1948 }
1949 
1950 
1951 /*
1952  * Send a management message through control queue 0
1953  */
1954 int
1955 t3_mgmt_tx(struct adapter *adap, struct mbuf *m)
1956 {
1957 	return ctrl_xmit(adap, &adap->sge.qs[0], m);
1958 }
1959 
1960 /**
1961  *	free_qset - free the resources of an SGE queue set
1962  *	@sc: the controller owning the queue set
1963  *	@q: the queue set
1964  *
1965  *	Release the HW and SW resources associated with an SGE queue set, such
1966  *	as HW contexts, packet buffers, and descriptor rings.  Traffic to the
1967  *	queue set must be quiesced prior to calling this.
1968  */
1969 static void
1970 t3_free_qset(adapter_t *sc, struct sge_qset *q)
1971 {
1972 	int i;
1973 
1974 	reclaim_completed_tx(q, 0, TXQ_ETH);
1975 	if (q->txq[TXQ_ETH].txq_mr != NULL)
1976 		buf_ring_free(q->txq[TXQ_ETH].txq_mr, M_DEVBUF);
1977 	if (q->txq[TXQ_ETH].txq_ifq != NULL) {
1978 		ifq_delete(q->txq[TXQ_ETH].txq_ifq);
1979 		free(q->txq[TXQ_ETH].txq_ifq, M_DEVBUF);
1980 	}
1981 
1982 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
1983 		if (q->fl[i].desc) {
1984 			mtx_lock_spin(&sc->sge.reg_lock);
1985 			t3_sge_disable_fl(sc, q->fl[i].cntxt_id);
1986 			mtx_unlock_spin(&sc->sge.reg_lock);
1987 			bus_dmamap_unload(q->fl[i].desc_tag, q->fl[i].desc_map);
1988 			bus_dmamem_free(q->fl[i].desc_tag, q->fl[i].desc,
1989 					q->fl[i].desc_map);
1990 			bus_dma_tag_destroy(q->fl[i].desc_tag);
1991 			bus_dma_tag_destroy(q->fl[i].entry_tag);
1992 		}
1993 		if (q->fl[i].sdesc) {
1994 			free_rx_bufs(sc, &q->fl[i]);
1995 			free(q->fl[i].sdesc, M_DEVBUF);
1996 		}
1997 	}
1998 
1999 	mtx_unlock(&q->lock);
2000 	MTX_DESTROY(&q->lock);
2001 	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
2002 		if (q->txq[i].desc) {
2003 			mtx_lock_spin(&sc->sge.reg_lock);
2004 			t3_sge_enable_ecntxt(sc, q->txq[i].cntxt_id, 0);
2005 			mtx_unlock_spin(&sc->sge.reg_lock);
2006 			bus_dmamap_unload(q->txq[i].desc_tag,
2007 					q->txq[i].desc_map);
2008 			bus_dmamem_free(q->txq[i].desc_tag, q->txq[i].desc,
2009 					q->txq[i].desc_map);
2010 			bus_dma_tag_destroy(q->txq[i].desc_tag);
2011 			bus_dma_tag_destroy(q->txq[i].entry_tag);
2012 		}
2013 		if (q->txq[i].sdesc) {
2014 			free(q->txq[i].sdesc, M_DEVBUF);
2015 		}
2016 	}
2017 
2018 	if (q->rspq.desc) {
2019 		mtx_lock_spin(&sc->sge.reg_lock);
2020 		t3_sge_disable_rspcntxt(sc, q->rspq.cntxt_id);
2021 		mtx_unlock_spin(&sc->sge.reg_lock);
2022 
2023 		bus_dmamap_unload(q->rspq.desc_tag, q->rspq.desc_map);
2024 		bus_dmamem_free(q->rspq.desc_tag, q->rspq.desc,
2025 			        q->rspq.desc_map);
2026 		bus_dma_tag_destroy(q->rspq.desc_tag);
2027 		MTX_DESTROY(&q->rspq.lock);
2028 	}
2029 
2030 #if defined(INET6) || defined(INET)
2031 	tcp_lro_free(&q->lro.ctrl);
2032 #endif
2033 
2034 	bzero(q, sizeof(*q));
2035 }
2036 
2037 /**
2038  *	t3_free_sge_resources - free SGE resources
2039  *	@sc: the adapter softc
2040  *
2041  *	Frees resources used by the SGE queue sets.
2042  */
2043 void
2044 t3_free_sge_resources(adapter_t *sc, int nqsets)
2045 {
2046 	int i;
2047 
2048 	for (i = 0; i < nqsets; ++i) {
2049 		TXQ_LOCK(&sc->sge.qs[i]);
2050 		t3_free_qset(sc, &sc->sge.qs[i]);
2051 	}
2052 }
2053 
2054 /**
2055  *	t3_sge_start - enable SGE
2056  *	@sc: the controller softc
2057  *
2058  *	Enables the SGE for DMAs.  This is the last step in starting packet
2059  *	transfers.
2060  */
2061 void
2062 t3_sge_start(adapter_t *sc)
2063 {
2064 	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE);
2065 }
2066 
2067 /**
2068  *	t3_sge_stop - disable SGE operation
2069  *	@sc: the adapter
2070  *
2071  *	Disables the DMA engine.  This can be called in emeregencies (e.g.,
2072  *	from error interrupts) or from normal process context.  In the latter
2073  *	case it also disables any pending queue restart tasklets.  Note that
2074  *	if it is called in interrupt context it cannot disable the restart
2075  *	tasklets as it cannot wait, however the tasklets will have no effect
2076  *	since the doorbells are disabled and the driver will call this again
2077  *	later from process context, at which time the tasklets will be stopped
2078  *	if they are still running.
2079  */
2080 void
2081 t3_sge_stop(adapter_t *sc)
2082 {
2083 	int i, nqsets;
2084 
2085 	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, 0);
2086 
2087 	if (sc->tq == NULL)
2088 		return;
2089 
2090 	for (nqsets = i = 0; i < (sc)->params.nports; i++)
2091 		nqsets += sc->port[i].nqsets;
2092 #ifdef notyet
2093 	/*
2094 	 *
2095 	 * XXX
2096 	 */
2097 	for (i = 0; i < nqsets; ++i) {
2098 		struct sge_qset *qs = &sc->sge.qs[i];
2099 
2100 		taskqueue_drain(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2101 		taskqueue_drain(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2102 	}
2103 #endif
2104 }
2105 
2106 /**
2107  *	t3_free_tx_desc - reclaims Tx descriptors and their buffers
2108  *	@adapter: the adapter
2109  *	@q: the Tx queue to reclaim descriptors from
2110  *	@reclaimable: the number of descriptors to reclaim
2111  *      @m_vec_size: maximum number of buffers to reclaim
2112  *      @desc_reclaimed: returns the number of descriptors reclaimed
2113  *
2114  *	Reclaims Tx descriptors from an SGE Tx queue and frees the associated
2115  *	Tx buffers.  Called with the Tx queue lock held.
2116  *
2117  *      Returns number of buffers of reclaimed
2118  */
2119 void
2120 t3_free_tx_desc(struct sge_qset *qs, int reclaimable, int queue)
2121 {
2122 	struct tx_sw_desc *txsd;
2123 	unsigned int cidx, mask;
2124 	struct sge_txq *q = &qs->txq[queue];
2125 
2126 #ifdef T3_TRACE
2127 	T3_TRACE2(sc->tb[q->cntxt_id & 7],
2128 		  "reclaiming %u Tx descriptors at cidx %u", reclaimable, cidx);
2129 #endif
2130 	cidx = q->cidx;
2131 	mask = q->size - 1;
2132 	txsd = &q->sdesc[cidx];
2133 
2134 	mtx_assert(&qs->lock, MA_OWNED);
2135 	while (reclaimable--) {
2136 		prefetch(q->sdesc[(cidx + 1) & mask].m);
2137 		prefetch(q->sdesc[(cidx + 2) & mask].m);
2138 
2139 		if (txsd->m != NULL) {
2140 			if (txsd->flags & TX_SW_DESC_MAPPED) {
2141 				bus_dmamap_unload(q->entry_tag, txsd->map);
2142 				txsd->flags &= ~TX_SW_DESC_MAPPED;
2143 			}
2144 			m_freem_list(txsd->m);
2145 			txsd->m = NULL;
2146 		} else
2147 			q->txq_skipped++;
2148 
2149 		++txsd;
2150 		if (++cidx == q->size) {
2151 			cidx = 0;
2152 			txsd = q->sdesc;
2153 		}
2154 	}
2155 	q->cidx = cidx;
2156 
2157 }
2158 
2159 /**
2160  *	is_new_response - check if a response is newly written
2161  *	@r: the response descriptor
2162  *	@q: the response queue
2163  *
2164  *	Returns true if a response descriptor contains a yet unprocessed
2165  *	response.
2166  */
2167 static __inline int
2168 is_new_response(const struct rsp_desc *r,
2169     const struct sge_rspq *q)
2170 {
2171 	return (r->intr_gen & F_RSPD_GEN2) == q->gen;
2172 }
2173 
2174 #define RSPD_GTS_MASK  (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS)
2175 #define RSPD_CTRL_MASK (RSPD_GTS_MASK | \
2176 			V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \
2177 			V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \
2178 			V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR))
2179 
2180 /* How long to delay the next interrupt in case of memory shortage, in 0.1us. */
2181 #define NOMEM_INTR_DELAY 2500
2182 
2183 #ifdef TCP_OFFLOAD
2184 /**
2185  *	write_ofld_wr - write an offload work request
2186  *	@adap: the adapter
2187  *	@m: the packet to send
2188  *	@q: the Tx queue
2189  *	@pidx: index of the first Tx descriptor to write
2190  *	@gen: the generation value to use
2191  *	@ndesc: number of descriptors the packet will occupy
2192  *
2193  *	Write an offload work request to send the supplied packet.  The packet
2194  *	data already carry the work request with most fields populated.
2195  */
2196 static void
2197 write_ofld_wr(adapter_t *adap, struct mbuf *m, struct sge_txq *q,
2198     unsigned int pidx, unsigned int gen, unsigned int ndesc)
2199 {
2200 	unsigned int sgl_flits, flits;
2201 	int i, idx, nsegs, wrlen;
2202 	struct work_request_hdr *from;
2203 	struct sg_ent *sgp, t3sgl[TX_MAX_SEGS / 2 + 1];
2204 	struct tx_desc *d = &q->desc[pidx];
2205 	struct txq_state txqs;
2206 	struct sglist_seg *segs;
2207 	struct ofld_hdr *oh = mtod(m, struct ofld_hdr *);
2208 	struct sglist *sgl;
2209 
2210 	from = (void *)(oh + 1);	/* Start of WR within mbuf */
2211 	wrlen = m->m_len - sizeof(*oh);
2212 
2213 	if (!(oh->flags & F_HDR_SGL)) {
2214 		write_imm(d, (caddr_t)from, wrlen, gen);
2215 
2216 		/*
2217 		 * mbuf with "real" immediate tx data will be enqueue_wr'd by
2218 		 * t3_push_frames and freed in wr_ack.  Others, like those sent
2219 		 * down by close_conn, t3_send_reset, etc. should be freed here.
2220 		 */
2221 		if (!(oh->flags & F_HDR_DF))
2222 			m_free(m);
2223 		return;
2224 	}
2225 
2226 	memcpy(&d->flit[1], &from[1], wrlen - sizeof(*from));
2227 
2228 	sgl = oh->sgl;
2229 	flits = wrlen / 8;
2230 	sgp = (ndesc == 1) ? (struct sg_ent *)&d->flit[flits] : t3sgl;
2231 
2232 	nsegs = sgl->sg_nseg;
2233 	segs = sgl->sg_segs;
2234 	for (idx = 0, i = 0; i < nsegs; i++) {
2235 		KASSERT(segs[i].ss_len, ("%s: 0 len in sgl", __func__));
2236 		if (i && idx == 0)
2237 			++sgp;
2238 		sgp->len[idx] = htobe32(segs[i].ss_len);
2239 		sgp->addr[idx] = htobe64(segs[i].ss_paddr);
2240 		idx ^= 1;
2241 	}
2242 	if (idx) {
2243 		sgp->len[idx] = 0;
2244 		sgp->addr[idx] = 0;
2245 	}
2246 
2247 	sgl_flits = sgl_len(nsegs);
2248 	txqs.gen = gen;
2249 	txqs.pidx = pidx;
2250 	txqs.compl = 0;
2251 
2252 	write_wr_hdr_sgl(ndesc, d, &txqs, q, t3sgl, flits, sgl_flits,
2253 	    from->wrh_hi, from->wrh_lo);
2254 }
2255 
2256 /**
2257  *	ofld_xmit - send a packet through an offload queue
2258  *	@adap: the adapter
2259  *	@q: the Tx offload queue
2260  *	@m: the packet
2261  *
2262  *	Send an offload packet through an SGE offload queue.
2263  */
2264 static int
2265 ofld_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m)
2266 {
2267 	int ret;
2268 	unsigned int ndesc;
2269 	unsigned int pidx, gen;
2270 	struct sge_txq *q = &qs->txq[TXQ_OFLD];
2271 	struct ofld_hdr *oh = mtod(m, struct ofld_hdr *);
2272 
2273 	ndesc = G_HDR_NDESC(oh->flags);
2274 
2275 	TXQ_LOCK(qs);
2276 again:	reclaim_completed_tx(qs, 16, TXQ_OFLD);
2277 	ret = check_desc_avail(adap, q, m, ndesc, TXQ_OFLD);
2278 	if (__predict_false(ret)) {
2279 		if (ret == 1) {
2280 			TXQ_UNLOCK(qs);
2281 			return (EINTR);
2282 		}
2283 		goto again;
2284 	}
2285 
2286 	gen = q->gen;
2287 	q->in_use += ndesc;
2288 	pidx = q->pidx;
2289 	q->pidx += ndesc;
2290 	if (q->pidx >= q->size) {
2291 		q->pidx -= q->size;
2292 		q->gen ^= 1;
2293 	}
2294 
2295 	write_ofld_wr(adap, m, q, pidx, gen, ndesc);
2296 	check_ring_tx_db(adap, q, 1);
2297 	TXQ_UNLOCK(qs);
2298 
2299 	return (0);
2300 }
2301 
2302 /**
2303  *	restart_offloadq - restart a suspended offload queue
2304  *	@qs: the queue set cotaining the offload queue
2305  *
2306  *	Resumes transmission on a suspended Tx offload queue.
2307  */
2308 static void
2309 restart_offloadq(void *data, int npending)
2310 {
2311 	struct mbuf *m;
2312 	struct sge_qset *qs = data;
2313 	struct sge_txq *q = &qs->txq[TXQ_OFLD];
2314 	adapter_t *adap = qs->port->adapter;
2315 	int cleaned;
2316 
2317 	TXQ_LOCK(qs);
2318 again:	cleaned = reclaim_completed_tx(qs, 16, TXQ_OFLD);
2319 
2320 	while ((m = mbufq_first(&q->sendq)) != NULL) {
2321 		unsigned int gen, pidx;
2322 		struct ofld_hdr *oh = mtod(m, struct ofld_hdr *);
2323 		unsigned int ndesc = G_HDR_NDESC(oh->flags);
2324 
2325 		if (__predict_false(q->size - q->in_use < ndesc)) {
2326 			setbit(&qs->txq_stopped, TXQ_OFLD);
2327 			if (should_restart_tx(q) &&
2328 			    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped))
2329 				goto again;
2330 			q->stops++;
2331 			break;
2332 		}
2333 
2334 		gen = q->gen;
2335 		q->in_use += ndesc;
2336 		pidx = q->pidx;
2337 		q->pidx += ndesc;
2338 		if (q->pidx >= q->size) {
2339 			q->pidx -= q->size;
2340 			q->gen ^= 1;
2341 		}
2342 
2343 		(void)mbufq_dequeue(&q->sendq);
2344 		TXQ_UNLOCK(qs);
2345 		write_ofld_wr(adap, m, q, pidx, gen, ndesc);
2346 		TXQ_LOCK(qs);
2347 	}
2348 #if USE_GTS
2349 	set_bit(TXQ_RUNNING, &q->flags);
2350 	set_bit(TXQ_LAST_PKT_DB, &q->flags);
2351 #endif
2352 	TXQ_UNLOCK(qs);
2353 	wmb();
2354 	t3_write_reg(adap, A_SG_KDOORBELL,
2355 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
2356 }
2357 
2358 /**
2359  *	t3_offload_tx - send an offload packet
2360  *	@m: the packet
2361  *
2362  *	Sends an offload packet.  We use the packet priority to select the
2363  *	appropriate Tx queue as follows: bit 0 indicates whether the packet
2364  *	should be sent as regular or control, bits 1-3 select the queue set.
2365  */
2366 int
2367 t3_offload_tx(struct adapter *sc, struct mbuf *m)
2368 {
2369 	struct ofld_hdr *oh = mtod(m, struct ofld_hdr *);
2370 	struct sge_qset *qs = &sc->sge.qs[G_HDR_QSET(oh->flags)];
2371 
2372 	if (oh->flags & F_HDR_CTRL) {
2373 		m_adj(m, sizeof (*oh));	/* trim ofld_hdr off */
2374 		return (ctrl_xmit(sc, qs, m));
2375 	} else
2376 		return (ofld_xmit(sc, qs, m));
2377 }
2378 #endif
2379 
2380 static void
2381 restart_tx(struct sge_qset *qs)
2382 {
2383 	struct adapter *sc = qs->port->adapter;
2384 
2385 	if (isset(&qs->txq_stopped, TXQ_OFLD) &&
2386 	    should_restart_tx(&qs->txq[TXQ_OFLD]) &&
2387 	    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) {
2388 		qs->txq[TXQ_OFLD].restarts++;
2389 		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2390 	}
2391 
2392 	if (isset(&qs->txq_stopped, TXQ_CTRL) &&
2393 	    should_restart_tx(&qs->txq[TXQ_CTRL]) &&
2394 	    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) {
2395 		qs->txq[TXQ_CTRL].restarts++;
2396 		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2397 	}
2398 }
2399 
2400 /**
2401  *	t3_sge_alloc_qset - initialize an SGE queue set
2402  *	@sc: the controller softc
2403  *	@id: the queue set id
2404  *	@nports: how many Ethernet ports will be using this queue set
2405  *	@irq_vec_idx: the IRQ vector index for response queue interrupts
2406  *	@p: configuration parameters for this queue set
2407  *	@ntxq: number of Tx queues for the queue set
2408  *	@pi: port info for queue set
2409  *
2410  *	Allocate resources and initialize an SGE queue set.  A queue set
2411  *	comprises a response queue, two Rx free-buffer queues, and up to 3
2412  *	Tx queues.  The Tx queues are assigned roles in the order Ethernet
2413  *	queue, offload queue, and control queue.
2414  */
2415 int
2416 t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx,
2417 		  const struct qset_params *p, int ntxq, struct port_info *pi)
2418 {
2419 	struct sge_qset *q = &sc->sge.qs[id];
2420 	int i, ret = 0;
2421 
2422 	MTX_INIT(&q->lock, q->namebuf, NULL, MTX_DEF);
2423 	q->port = pi;
2424 	q->adap = sc;
2425 
2426 	if ((q->txq[TXQ_ETH].txq_mr = buf_ring_alloc(cxgb_txq_buf_ring_size,
2427 	    M_DEVBUF, M_WAITOK, &q->lock)) == NULL) {
2428 		device_printf(sc->dev, "failed to allocate mbuf ring\n");
2429 		goto err;
2430 	}
2431 	if ((q->txq[TXQ_ETH].txq_ifq = malloc(sizeof(struct ifaltq), M_DEVBUF,
2432 	    M_NOWAIT | M_ZERO)) == NULL) {
2433 		device_printf(sc->dev, "failed to allocate ifq\n");
2434 		goto err;
2435 	}
2436 	ifq_init(q->txq[TXQ_ETH].txq_ifq, pi->ifp);
2437 	callout_init(&q->txq[TXQ_ETH].txq_timer, 1);
2438 	callout_init(&q->txq[TXQ_ETH].txq_watchdog, 1);
2439 	q->txq[TXQ_ETH].txq_timer.c_cpu = id % mp_ncpus;
2440 	q->txq[TXQ_ETH].txq_watchdog.c_cpu = id % mp_ncpus;
2441 
2442 	init_qset_cntxt(q, id);
2443 	q->idx = id;
2444 	if ((ret = alloc_ring(sc, p->fl_size, sizeof(struct rx_desc),
2445 		    sizeof(struct rx_sw_desc), &q->fl[0].phys_addr,
2446 		    &q->fl[0].desc, &q->fl[0].sdesc,
2447 		    &q->fl[0].desc_tag, &q->fl[0].desc_map,
2448 		    sc->rx_dmat, &q->fl[0].entry_tag)) != 0) {
2449 		printf("error %d from alloc ring fl0\n", ret);
2450 		goto err;
2451 	}
2452 
2453 	if ((ret = alloc_ring(sc, p->jumbo_size, sizeof(struct rx_desc),
2454 		    sizeof(struct rx_sw_desc), &q->fl[1].phys_addr,
2455 		    &q->fl[1].desc, &q->fl[1].sdesc,
2456 		    &q->fl[1].desc_tag, &q->fl[1].desc_map,
2457 		    sc->rx_jumbo_dmat, &q->fl[1].entry_tag)) != 0) {
2458 		printf("error %d from alloc ring fl1\n", ret);
2459 		goto err;
2460 	}
2461 
2462 	if ((ret = alloc_ring(sc, p->rspq_size, sizeof(struct rsp_desc), 0,
2463 		    &q->rspq.phys_addr, &q->rspq.desc, NULL,
2464 		    &q->rspq.desc_tag, &q->rspq.desc_map,
2465 		    NULL, NULL)) != 0) {
2466 		printf("error %d from alloc ring rspq\n", ret);
2467 		goto err;
2468 	}
2469 
2470 	snprintf(q->rspq.lockbuf, RSPQ_NAME_LEN, "t3 rspq lock %d:%d",
2471 	    device_get_unit(sc->dev), irq_vec_idx);
2472 	MTX_INIT(&q->rspq.lock, q->rspq.lockbuf, NULL, MTX_DEF);
2473 
2474 	for (i = 0; i < ntxq; ++i) {
2475 		size_t sz = i == TXQ_CTRL ? 0 : sizeof(struct tx_sw_desc);
2476 
2477 		if ((ret = alloc_ring(sc, p->txq_size[i],
2478 			    sizeof(struct tx_desc), sz,
2479 			    &q->txq[i].phys_addr, &q->txq[i].desc,
2480 			    &q->txq[i].sdesc, &q->txq[i].desc_tag,
2481 			    &q->txq[i].desc_map,
2482 			    sc->tx_dmat, &q->txq[i].entry_tag)) != 0) {
2483 			printf("error %d from alloc ring tx %i\n", ret, i);
2484 			goto err;
2485 		}
2486 		mbufq_init(&q->txq[i].sendq, INT_MAX);
2487 		q->txq[i].gen = 1;
2488 		q->txq[i].size = p->txq_size[i];
2489 	}
2490 
2491 #ifdef TCP_OFFLOAD
2492 	TASK_INIT(&q->txq[TXQ_OFLD].qresume_task, 0, restart_offloadq, q);
2493 #endif
2494 	TASK_INIT(&q->txq[TXQ_CTRL].qresume_task, 0, restart_ctrlq, q);
2495 	TASK_INIT(&q->txq[TXQ_ETH].qreclaim_task, 0, sge_txq_reclaim_handler, q);
2496 	TASK_INIT(&q->txq[TXQ_OFLD].qreclaim_task, 0, sge_txq_reclaim_handler, q);
2497 
2498 	q->fl[0].gen = q->fl[1].gen = 1;
2499 	q->fl[0].size = p->fl_size;
2500 	q->fl[1].size = p->jumbo_size;
2501 
2502 	q->rspq.gen = 1;
2503 	q->rspq.cidx = 0;
2504 	q->rspq.size = p->rspq_size;
2505 
2506 	q->txq[TXQ_ETH].stop_thres = nports *
2507 	    flits_to_desc(sgl_len(TX_MAX_SEGS + 1) + 3);
2508 
2509 	q->fl[0].buf_size = MCLBYTES;
2510 	q->fl[0].zone = zone_pack;
2511 	q->fl[0].type = EXT_PACKET;
2512 
2513 	if (p->jumbo_buf_size ==  MJUM16BYTES) {
2514 		q->fl[1].zone = zone_jumbo16;
2515 		q->fl[1].type = EXT_JUMBO16;
2516 	} else if (p->jumbo_buf_size ==  MJUM9BYTES) {
2517 		q->fl[1].zone = zone_jumbo9;
2518 		q->fl[1].type = EXT_JUMBO9;
2519 	} else if (p->jumbo_buf_size ==  MJUMPAGESIZE) {
2520 		q->fl[1].zone = zone_jumbop;
2521 		q->fl[1].type = EXT_JUMBOP;
2522 	} else {
2523 		KASSERT(0, ("can't deal with jumbo_buf_size %d.", p->jumbo_buf_size));
2524 		ret = EDOOFUS;
2525 		goto err;
2526 	}
2527 	q->fl[1].buf_size = p->jumbo_buf_size;
2528 
2529 	/* Allocate and setup the lro_ctrl structure */
2530 	q->lro.enabled = !!(pi->ifp->if_capenable & IFCAP_LRO);
2531 #if defined(INET6) || defined(INET)
2532 	ret = tcp_lro_init(&q->lro.ctrl);
2533 	if (ret) {
2534 		printf("error %d from tcp_lro_init\n", ret);
2535 		goto err;
2536 	}
2537 #endif
2538 	q->lro.ctrl.ifp = pi->ifp;
2539 
2540 	mtx_lock_spin(&sc->sge.reg_lock);
2541 	ret = -t3_sge_init_rspcntxt(sc, q->rspq.cntxt_id, irq_vec_idx,
2542 				   q->rspq.phys_addr, q->rspq.size,
2543 				   q->fl[0].buf_size, 1, 0);
2544 	if (ret) {
2545 		printf("error %d from t3_sge_init_rspcntxt\n", ret);
2546 		goto err_unlock;
2547 	}
2548 
2549 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
2550 		ret = -t3_sge_init_flcntxt(sc, q->fl[i].cntxt_id, 0,
2551 					  q->fl[i].phys_addr, q->fl[i].size,
2552 					  q->fl[i].buf_size, p->cong_thres, 1,
2553 					  0);
2554 		if (ret) {
2555 			printf("error %d from t3_sge_init_flcntxt for index i=%d\n", ret, i);
2556 			goto err_unlock;
2557 		}
2558 	}
2559 
2560 	ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_ETH].cntxt_id, USE_GTS,
2561 				 SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr,
2562 				 q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token,
2563 				 1, 0);
2564 	if (ret) {
2565 		printf("error %d from t3_sge_init_ecntxt\n", ret);
2566 		goto err_unlock;
2567 	}
2568 
2569 	if (ntxq > 1) {
2570 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_OFLD].cntxt_id,
2571 					 USE_GTS, SGE_CNTXT_OFLD, id,
2572 					 q->txq[TXQ_OFLD].phys_addr,
2573 					 q->txq[TXQ_OFLD].size, 0, 1, 0);
2574 		if (ret) {
2575 			printf("error %d from t3_sge_init_ecntxt\n", ret);
2576 			goto err_unlock;
2577 		}
2578 	}
2579 
2580 	if (ntxq > 2) {
2581 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_CTRL].cntxt_id, 0,
2582 					 SGE_CNTXT_CTRL, id,
2583 					 q->txq[TXQ_CTRL].phys_addr,
2584 					 q->txq[TXQ_CTRL].size,
2585 					 q->txq[TXQ_CTRL].token, 1, 0);
2586 		if (ret) {
2587 			printf("error %d from t3_sge_init_ecntxt\n", ret);
2588 			goto err_unlock;
2589 		}
2590 	}
2591 
2592 	mtx_unlock_spin(&sc->sge.reg_lock);
2593 	t3_update_qset_coalesce(q, p);
2594 
2595 	refill_fl(sc, &q->fl[0], q->fl[0].size);
2596 	refill_fl(sc, &q->fl[1], q->fl[1].size);
2597 	refill_rspq(sc, &q->rspq, q->rspq.size - 1);
2598 
2599 	t3_write_reg(sc, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) |
2600 		     V_NEWTIMER(q->rspq.holdoff_tmr));
2601 
2602 	return (0);
2603 
2604 err_unlock:
2605 	mtx_unlock_spin(&sc->sge.reg_lock);
2606 err:
2607 	TXQ_LOCK(q);
2608 	t3_free_qset(sc, q);
2609 
2610 	return (ret);
2611 }
2612 
2613 /*
2614  * Remove CPL_RX_PKT headers from the mbuf and reduce it to a regular mbuf with
2615  * ethernet data.  Hardware assistance with various checksums and any vlan tag
2616  * will also be taken into account here.
2617  */
2618 void
2619 t3_rx_eth(struct adapter *adap, struct mbuf *m, int ethpad)
2620 {
2621 	struct cpl_rx_pkt *cpl = (struct cpl_rx_pkt *)(mtod(m, uint8_t *) + ethpad);
2622 	struct port_info *pi = &adap->port[adap->rxpkt_map[cpl->iff]];
2623 	struct ifnet *ifp = pi->ifp;
2624 
2625 	if (cpl->vlan_valid) {
2626 		m->m_pkthdr.ether_vtag = ntohs(cpl->vlan);
2627 		m->m_flags |= M_VLANTAG;
2628 	}
2629 
2630 	m->m_pkthdr.rcvif = ifp;
2631 	/*
2632 	 * adjust after conversion to mbuf chain
2633 	 */
2634 	m->m_pkthdr.len -= (sizeof(*cpl) + ethpad);
2635 	m->m_len -= (sizeof(*cpl) + ethpad);
2636 	m->m_data += (sizeof(*cpl) + ethpad);
2637 
2638 	if (!cpl->fragment && cpl->csum_valid && cpl->csum == 0xffff) {
2639 		struct ether_header *eh = mtod(m, void *);
2640 		uint16_t eh_type;
2641 
2642 		if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2643 			struct ether_vlan_header *evh = mtod(m, void *);
2644 
2645 			eh_type = evh->evl_proto;
2646 		} else
2647 			eh_type = eh->ether_type;
2648 
2649 		if (ifp->if_capenable & IFCAP_RXCSUM &&
2650 		    eh_type == htons(ETHERTYPE_IP)) {
2651 			m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED |
2652 			    CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2653 			m->m_pkthdr.csum_data = 0xffff;
2654 		} else if (ifp->if_capenable & IFCAP_RXCSUM_IPV6 &&
2655 		    eh_type == htons(ETHERTYPE_IPV6)) {
2656 			m->m_pkthdr.csum_flags = (CSUM_DATA_VALID_IPV6 |
2657 			    CSUM_PSEUDO_HDR);
2658 			m->m_pkthdr.csum_data = 0xffff;
2659 		}
2660 	}
2661 }
2662 
2663 /**
2664  *	get_packet - return the next ingress packet buffer from a free list
2665  *	@adap: the adapter that received the packet
2666  *	@drop_thres: # of remaining buffers before we start dropping packets
2667  *	@qs: the qset that the SGE free list holding the packet belongs to
2668  *      @mh: the mbuf header, contains a pointer to the head and tail of the mbuf chain
2669  *      @r: response descriptor
2670  *
2671  *	Get the next packet from a free list and complete setup of the
2672  *	sk_buff.  If the packet is small we make a copy and recycle the
2673  *	original buffer, otherwise we use the original buffer itself.  If a
2674  *	positive drop threshold is supplied packets are dropped and their
2675  *	buffers recycled if (a) the number of remaining buffers is under the
2676  *	threshold and the packet is too big to copy, or (b) the packet should
2677  *	be copied but there is no memory for the copy.
2678  */
2679 static int
2680 get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
2681     struct t3_mbuf_hdr *mh, struct rsp_desc *r)
2682 {
2683 
2684 	unsigned int len_cq =  ntohl(r->len_cq);
2685 	struct sge_fl *fl = (len_cq & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
2686 	int mask, cidx = fl->cidx;
2687 	struct rx_sw_desc *sd = &fl->sdesc[cidx];
2688 	uint32_t len = G_RSPD_LEN(len_cq);
2689 	uint32_t flags = M_EXT;
2690 	uint8_t sopeop = G_RSPD_SOP_EOP(ntohl(r->flags));
2691 	caddr_t cl;
2692 	struct mbuf *m;
2693 	int ret = 0;
2694 
2695 	mask = fl->size - 1;
2696 	prefetch(fl->sdesc[(cidx + 1) & mask].m);
2697 	prefetch(fl->sdesc[(cidx + 2) & mask].m);
2698 	prefetch(fl->sdesc[(cidx + 1) & mask].rxsd_cl);
2699 	prefetch(fl->sdesc[(cidx + 2) & mask].rxsd_cl);
2700 
2701 	fl->credits--;
2702 	bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD);
2703 
2704 	if (recycle_enable && len <= SGE_RX_COPY_THRES &&
2705 	    sopeop == RSPQ_SOP_EOP) {
2706 		if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL)
2707 			goto skip_recycle;
2708 		cl = mtod(m, void *);
2709 		memcpy(cl, sd->rxsd_cl, len);
2710 		recycle_rx_buf(adap, fl, fl->cidx);
2711 		m->m_pkthdr.len = m->m_len = len;
2712 		m->m_flags = 0;
2713 		mh->mh_head = mh->mh_tail = m;
2714 		ret = 1;
2715 		goto done;
2716 	} else {
2717 	skip_recycle:
2718 		bus_dmamap_unload(fl->entry_tag, sd->map);
2719 		cl = sd->rxsd_cl;
2720 		m = sd->m;
2721 
2722 		if ((sopeop == RSPQ_SOP_EOP) ||
2723 		    (sopeop == RSPQ_SOP))
2724 			flags |= M_PKTHDR;
2725 		m_init(m, M_NOWAIT, MT_DATA, flags);
2726 		if (fl->zone == zone_pack) {
2727 			/*
2728 			 * restore clobbered data pointer
2729 			 */
2730 			m->m_data = m->m_ext.ext_buf;
2731 		} else {
2732 			m_cljset(m, cl, fl->type);
2733 		}
2734 		m->m_len = len;
2735 	}
2736 	switch(sopeop) {
2737 	case RSPQ_SOP_EOP:
2738 		ret = 1;
2739 		/* FALLTHROUGH */
2740 	case RSPQ_SOP:
2741 		mh->mh_head = mh->mh_tail = m;
2742 		m->m_pkthdr.len = len;
2743 		break;
2744 	case RSPQ_EOP:
2745 		ret = 1;
2746 		/* FALLTHROUGH */
2747 	case RSPQ_NSOP_NEOP:
2748 		if (mh->mh_tail == NULL) {
2749 			log(LOG_ERR, "discarding intermediate descriptor entry\n");
2750 			m_freem(m);
2751 			break;
2752 		}
2753 		mh->mh_tail->m_next = m;
2754 		mh->mh_tail = m;
2755 		mh->mh_head->m_pkthdr.len += len;
2756 		break;
2757 	}
2758 	if (cxgb_debug)
2759 		printf("len=%d pktlen=%d\n", m->m_len, m->m_pkthdr.len);
2760 done:
2761 	if (++fl->cidx == fl->size)
2762 		fl->cidx = 0;
2763 
2764 	return (ret);
2765 }
2766 
2767 /**
2768  *	handle_rsp_cntrl_info - handles control information in a response
2769  *	@qs: the queue set corresponding to the response
2770  *	@flags: the response control flags
2771  *
2772  *	Handles the control information of an SGE response, such as GTS
2773  *	indications and completion credits for the queue set's Tx queues.
2774  *	HW coalesces credits, we don't do any extra SW coalescing.
2775  */
2776 static __inline void
2777 handle_rsp_cntrl_info(struct sge_qset *qs, uint32_t flags)
2778 {
2779 	unsigned int credits;
2780 
2781 #if USE_GTS
2782 	if (flags & F_RSPD_TXQ0_GTS)
2783 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags);
2784 #endif
2785 	credits = G_RSPD_TXQ0_CR(flags);
2786 	if (credits)
2787 		qs->txq[TXQ_ETH].processed += credits;
2788 
2789 	credits = G_RSPD_TXQ2_CR(flags);
2790 	if (credits)
2791 		qs->txq[TXQ_CTRL].processed += credits;
2792 
2793 # if USE_GTS
2794 	if (flags & F_RSPD_TXQ1_GTS)
2795 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags);
2796 # endif
2797 	credits = G_RSPD_TXQ1_CR(flags);
2798 	if (credits)
2799 		qs->txq[TXQ_OFLD].processed += credits;
2800 
2801 }
2802 
2803 static void
2804 check_ring_db(adapter_t *adap, struct sge_qset *qs,
2805     unsigned int sleeping)
2806 {
2807 	;
2808 }
2809 
2810 /**
2811  *	process_responses - process responses from an SGE response queue
2812  *	@adap: the adapter
2813  *	@qs: the queue set to which the response queue belongs
2814  *	@budget: how many responses can be processed in this round
2815  *
2816  *	Process responses from an SGE response queue up to the supplied budget.
2817  *	Responses include received packets as well as credits and other events
2818  *	for the queues that belong to the response queue's queue set.
2819  *	A negative budget is effectively unlimited.
2820  *
2821  *	Additionally choose the interrupt holdoff time for the next interrupt
2822  *	on this queue.  If the system is under memory shortage use a fairly
2823  *	long delay to help recovery.
2824  */
2825 static int
2826 process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
2827 {
2828 	struct sge_rspq *rspq = &qs->rspq;
2829 	struct rsp_desc *r = &rspq->desc[rspq->cidx];
2830 	int budget_left = budget;
2831 	unsigned int sleeping = 0;
2832 #if defined(INET6) || defined(INET)
2833 	int lro_enabled = qs->lro.enabled;
2834 	int skip_lro;
2835 	struct lro_ctrl *lro_ctrl = &qs->lro.ctrl;
2836 #endif
2837 	struct t3_mbuf_hdr *mh = &rspq->rspq_mh;
2838 #ifdef DEBUG
2839 	static int last_holdoff = 0;
2840 	if (cxgb_debug && rspq->holdoff_tmr != last_holdoff) {
2841 		printf("next_holdoff=%d\n", rspq->holdoff_tmr);
2842 		last_holdoff = rspq->holdoff_tmr;
2843 	}
2844 #endif
2845 	rspq->next_holdoff = rspq->holdoff_tmr;
2846 
2847 	while (__predict_true(budget_left && is_new_response(r, rspq))) {
2848 		int eth, eop = 0, ethpad = 0;
2849 		uint32_t flags = ntohl(r->flags);
2850 		uint32_t rss_hash = be32toh(r->rss_hdr.rss_hash_val);
2851 		uint8_t opcode = r->rss_hdr.opcode;
2852 
2853 		eth = (opcode == CPL_RX_PKT);
2854 
2855 		if (__predict_false(flags & F_RSPD_ASYNC_NOTIF)) {
2856 			struct mbuf *m;
2857 
2858 			if (cxgb_debug)
2859 				printf("async notification\n");
2860 
2861 			if (mh->mh_head == NULL) {
2862 				mh->mh_head = m_gethdr(M_NOWAIT, MT_DATA);
2863 				m = mh->mh_head;
2864 			} else {
2865 				m = m_gethdr(M_NOWAIT, MT_DATA);
2866 			}
2867 			if (m == NULL)
2868 				goto no_mem;
2869 
2870                         memcpy(mtod(m, char *), r, AN_PKT_SIZE);
2871 			m->m_len = m->m_pkthdr.len = AN_PKT_SIZE;
2872                         *mtod(m, uint8_t *) = CPL_ASYNC_NOTIF;
2873 			opcode = CPL_ASYNC_NOTIF;
2874 			eop = 1;
2875                         rspq->async_notif++;
2876 			goto skip;
2877 		} else if  (flags & F_RSPD_IMM_DATA_VALID) {
2878 			struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);
2879 
2880 			if (m == NULL) {
2881 		no_mem:
2882 				rspq->next_holdoff = NOMEM_INTR_DELAY;
2883 				budget_left--;
2884 				break;
2885 			}
2886 			if (mh->mh_head == NULL)
2887 				mh->mh_head = m;
2888                         else
2889 				mh->mh_tail->m_next = m;
2890 			mh->mh_tail = m;
2891 
2892 			get_imm_packet(adap, r, m);
2893 			mh->mh_head->m_pkthdr.len += m->m_len;
2894 			eop = 1;
2895 			rspq->imm_data++;
2896 		} else if (r->len_cq) {
2897 			int drop_thresh = eth ? SGE_RX_DROP_THRES : 0;
2898 
2899 			eop = get_packet(adap, drop_thresh, qs, mh, r);
2900 			if (eop) {
2901 				if (r->rss_hdr.hash_type && !adap->timestamp) {
2902 					M_HASHTYPE_SET(mh->mh_head,
2903 					    M_HASHTYPE_OPAQUE_HASH);
2904 					mh->mh_head->m_pkthdr.flowid = rss_hash;
2905 				}
2906 			}
2907 
2908 			ethpad = 2;
2909 		} else {
2910 			rspq->pure_rsps++;
2911 		}
2912 	skip:
2913 		if (flags & RSPD_CTRL_MASK) {
2914 			sleeping |= flags & RSPD_GTS_MASK;
2915 			handle_rsp_cntrl_info(qs, flags);
2916 		}
2917 
2918 		if (!eth && eop) {
2919 			rspq->offload_pkts++;
2920 #ifdef TCP_OFFLOAD
2921 			adap->cpl_handler[opcode](qs, r, mh->mh_head);
2922 #else
2923 			m_freem(mh->mh_head);
2924 #endif
2925 			mh->mh_head = NULL;
2926 		} else if (eth && eop) {
2927 			struct mbuf *m = mh->mh_head;
2928 
2929 			t3_rx_eth(adap, m, ethpad);
2930 
2931 			/*
2932 			 * The T304 sends incoming packets on any qset.  If LRO
2933 			 * is also enabled, we could end up sending packet up
2934 			 * lro_ctrl->ifp's input.  That is incorrect.
2935 			 *
2936 			 * The mbuf's rcvif was derived from the cpl header and
2937 			 * is accurate.  Skip LRO and just use that.
2938 			 */
2939 #if defined(INET6) || defined(INET)
2940 			skip_lro = __predict_false(qs->port->ifp != m->m_pkthdr.rcvif);
2941 
2942 			if (lro_enabled && lro_ctrl->lro_cnt && !skip_lro
2943 			    && (tcp_lro_rx(lro_ctrl, m, 0) == 0)
2944 			    ) {
2945 				/* successfully queue'd for LRO */
2946 			} else
2947 #endif
2948 			{
2949 				/*
2950 				 * LRO not enabled, packet unsuitable for LRO,
2951 				 * or unable to queue.  Pass it up right now in
2952 				 * either case.
2953 				 */
2954 				struct ifnet *ifp = m->m_pkthdr.rcvif;
2955 				(*ifp->if_input)(ifp, m);
2956 			}
2957 			mh->mh_head = NULL;
2958 
2959 		}
2960 
2961 		r++;
2962 		if (__predict_false(++rspq->cidx == rspq->size)) {
2963 			rspq->cidx = 0;
2964 			rspq->gen ^= 1;
2965 			r = rspq->desc;
2966 		}
2967 
2968 		if (++rspq->credits >= 64) {
2969 			refill_rspq(adap, rspq, rspq->credits);
2970 			rspq->credits = 0;
2971 		}
2972 		__refill_fl_lt(adap, &qs->fl[0], 32);
2973 		__refill_fl_lt(adap, &qs->fl[1], 32);
2974 		--budget_left;
2975 	}
2976 
2977 #if defined(INET6) || defined(INET)
2978 	/* Flush LRO */
2979 	tcp_lro_flush_all(lro_ctrl);
2980 #endif
2981 
2982 	if (sleeping)
2983 		check_ring_db(adap, qs, sleeping);
2984 
2985 	mb();  /* commit Tx queue processed updates */
2986 	if (__predict_false(qs->txq_stopped > 1))
2987 		restart_tx(qs);
2988 
2989 	__refill_fl_lt(adap, &qs->fl[0], 512);
2990 	__refill_fl_lt(adap, &qs->fl[1], 512);
2991 	budget -= budget_left;
2992 	return (budget);
2993 }
2994 
2995 /*
2996  * A helper function that processes responses and issues GTS.
2997  */
2998 static __inline int
2999 process_responses_gts(adapter_t *adap, struct sge_rspq *rq)
3000 {
3001 	int work;
3002 	static int last_holdoff = 0;
3003 
3004 	work = process_responses(adap, rspq_to_qset(rq), -1);
3005 
3006 	if (cxgb_debug && (rq->next_holdoff != last_holdoff)) {
3007 		printf("next_holdoff=%d\n", rq->next_holdoff);
3008 		last_holdoff = rq->next_holdoff;
3009 	}
3010 	t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) |
3011 	    V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx));
3012 
3013 	return (work);
3014 }
3015 
3016 
3017 /*
3018  * Interrupt handler for legacy INTx interrupts for T3B-based cards.
3019  * Handles data events from SGE response queues as well as error and other
3020  * async events as they all use the same interrupt pin.  We use one SGE
3021  * response queue per port in this mode and protect all response queues with
3022  * queue 0's lock.
3023  */
3024 void
3025 t3b_intr(void *data)
3026 {
3027 	uint32_t i, map;
3028 	adapter_t *adap = data;
3029 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
3030 
3031 	t3_write_reg(adap, A_PL_CLI, 0);
3032 	map = t3_read_reg(adap, A_SG_DATA_INTR);
3033 
3034 	if (!map)
3035 		return;
3036 
3037 	if (__predict_false(map & F_ERRINTR)) {
3038 		t3_write_reg(adap, A_PL_INT_ENABLE0, 0);
3039 		(void) t3_read_reg(adap, A_PL_INT_ENABLE0);
3040 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
3041 	}
3042 
3043 	mtx_lock(&q0->lock);
3044 	for_each_port(adap, i)
3045 	    if (map & (1 << i))
3046 			process_responses_gts(adap, &adap->sge.qs[i].rspq);
3047 	mtx_unlock(&q0->lock);
3048 }
3049 
3050 /*
3051  * The MSI interrupt handler.  This needs to handle data events from SGE
3052  * response queues as well as error and other async events as they all use
3053  * the same MSI vector.  We use one SGE response queue per port in this mode
3054  * and protect all response queues with queue 0's lock.
3055  */
3056 void
3057 t3_intr_msi(void *data)
3058 {
3059 	adapter_t *adap = data;
3060 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
3061 	int i, new_packets = 0;
3062 
3063 	mtx_lock(&q0->lock);
3064 
3065 	for_each_port(adap, i)
3066 	    if (process_responses_gts(adap, &adap->sge.qs[i].rspq))
3067 		    new_packets = 1;
3068 	mtx_unlock(&q0->lock);
3069 	if (new_packets == 0) {
3070 		t3_write_reg(adap, A_PL_INT_ENABLE0, 0);
3071 		(void) t3_read_reg(adap, A_PL_INT_ENABLE0);
3072 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
3073 	}
3074 }
3075 
3076 void
3077 t3_intr_msix(void *data)
3078 {
3079 	struct sge_qset *qs = data;
3080 	adapter_t *adap = qs->port->adapter;
3081 	struct sge_rspq *rspq = &qs->rspq;
3082 
3083 	if (process_responses_gts(adap, rspq) == 0)
3084 		rspq->unhandled_irqs++;
3085 }
3086 
3087 #define QDUMP_SBUF_SIZE		32 * 400
3088 static int
3089 t3_dump_rspq(SYSCTL_HANDLER_ARGS)
3090 {
3091 	struct sge_rspq *rspq;
3092 	struct sge_qset *qs;
3093 	int i, err, dump_end, idx;
3094 	struct sbuf *sb;
3095 	struct rsp_desc *rspd;
3096 	uint32_t data[4];
3097 
3098 	rspq = arg1;
3099 	qs = rspq_to_qset(rspq);
3100 	if (rspq->rspq_dump_count == 0)
3101 		return (0);
3102 	if (rspq->rspq_dump_count > RSPQ_Q_SIZE) {
3103 		log(LOG_WARNING,
3104 		    "dump count is too large %d\n", rspq->rspq_dump_count);
3105 		rspq->rspq_dump_count = 0;
3106 		return (EINVAL);
3107 	}
3108 	if (rspq->rspq_dump_start > (RSPQ_Q_SIZE-1)) {
3109 		log(LOG_WARNING,
3110 		    "dump start of %d is greater than queue size\n",
3111 		    rspq->rspq_dump_start);
3112 		rspq->rspq_dump_start = 0;
3113 		return (EINVAL);
3114 	}
3115 	err = t3_sge_read_rspq(qs->port->adapter, rspq->cntxt_id, data);
3116 	if (err)
3117 		return (err);
3118 	err = sysctl_wire_old_buffer(req, 0);
3119 	if (err)
3120 		return (err);
3121 	sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req);
3122 
3123 	sbuf_printf(sb, " \n index=%u size=%u MSI-X/RspQ=%u intr enable=%u intr armed=%u\n",
3124 	    (data[0] & 0xffff), data[0] >> 16, ((data[2] >> 20) & 0x3f),
3125 	    ((data[2] >> 26) & 1), ((data[2] >> 27) & 1));
3126 	sbuf_printf(sb, " generation=%u CQ mode=%u FL threshold=%u\n",
3127 	    ((data[2] >> 28) & 1), ((data[2] >> 31) & 1), data[3]);
3128 
3129 	sbuf_printf(sb, " start=%d -> end=%d\n", rspq->rspq_dump_start,
3130 	    (rspq->rspq_dump_start + rspq->rspq_dump_count) & (RSPQ_Q_SIZE-1));
3131 
3132 	dump_end = rspq->rspq_dump_start + rspq->rspq_dump_count;
3133 	for (i = rspq->rspq_dump_start; i < dump_end; i++) {
3134 		idx = i & (RSPQ_Q_SIZE-1);
3135 
3136 		rspd = &rspq->desc[idx];
3137 		sbuf_printf(sb, "\tidx=%04d opcode=%02x cpu_idx=%x hash_type=%x cq_idx=%x\n",
3138 		    idx, rspd->rss_hdr.opcode, rspd->rss_hdr.cpu_idx,
3139 		    rspd->rss_hdr.hash_type, be16toh(rspd->rss_hdr.cq_idx));
3140 		sbuf_printf(sb, "\trss_hash_val=%x flags=%08x len_cq=%x intr_gen=%x\n",
3141 		    rspd->rss_hdr.rss_hash_val, be32toh(rspd->flags),
3142 		    be32toh(rspd->len_cq), rspd->intr_gen);
3143 	}
3144 
3145 	err = sbuf_finish(sb);
3146 	sbuf_delete(sb);
3147 	return (err);
3148 }
3149 
3150 static int
3151 t3_dump_txq_eth(SYSCTL_HANDLER_ARGS)
3152 {
3153 	struct sge_txq *txq;
3154 	struct sge_qset *qs;
3155 	int i, j, err, dump_end;
3156 	struct sbuf *sb;
3157 	struct tx_desc *txd;
3158 	uint32_t *WR, wr_hi, wr_lo, gen;
3159 	uint32_t data[4];
3160 
3161 	txq = arg1;
3162 	qs = txq_to_qset(txq, TXQ_ETH);
3163 	if (txq->txq_dump_count == 0) {
3164 		return (0);
3165 	}
3166 	if (txq->txq_dump_count > TX_ETH_Q_SIZE) {
3167 		log(LOG_WARNING,
3168 		    "dump count is too large %d\n", txq->txq_dump_count);
3169 		txq->txq_dump_count = 1;
3170 		return (EINVAL);
3171 	}
3172 	if (txq->txq_dump_start > (TX_ETH_Q_SIZE-1)) {
3173 		log(LOG_WARNING,
3174 		    "dump start of %d is greater than queue size\n",
3175 		    txq->txq_dump_start);
3176 		txq->txq_dump_start = 0;
3177 		return (EINVAL);
3178 	}
3179 	err = t3_sge_read_ecntxt(qs->port->adapter, qs->rspq.cntxt_id, data);
3180 	if (err)
3181 		return (err);
3182 	err = sysctl_wire_old_buffer(req, 0);
3183 	if (err)
3184 		return (err);
3185 	sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req);
3186 
3187 	sbuf_printf(sb, " \n credits=%u GTS=%u index=%u size=%u rspq#=%u cmdq#=%u\n",
3188 	    (data[0] & 0x7fff), ((data[0] >> 15) & 1), (data[0] >> 16),
3189 	    (data[1] & 0xffff), ((data[3] >> 4) & 7), ((data[3] >> 7) & 1));
3190 	sbuf_printf(sb, " TUN=%u TOE=%u generation%u uP token=%u valid=%u\n",
3191 	    ((data[3] >> 8) & 1), ((data[3] >> 9) & 1), ((data[3] >> 10) & 1),
3192 	    ((data[3] >> 11) & 0xfffff), ((data[3] >> 31) & 1));
3193 	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
3194 	    txq->txq_dump_start,
3195 	    (txq->txq_dump_start + txq->txq_dump_count) & (TX_ETH_Q_SIZE-1));
3196 
3197 	dump_end = txq->txq_dump_start + txq->txq_dump_count;
3198 	for (i = txq->txq_dump_start; i < dump_end; i++) {
3199 		txd = &txq->desc[i & (TX_ETH_Q_SIZE-1)];
3200 		WR = (uint32_t *)txd->flit;
3201 		wr_hi = ntohl(WR[0]);
3202 		wr_lo = ntohl(WR[1]);
3203 		gen = G_WR_GEN(wr_lo);
3204 
3205 		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
3206 		    wr_hi, wr_lo, gen);
3207 		for (j = 2; j < 30; j += 4)
3208 			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
3209 			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
3210 
3211 	}
3212 	err = sbuf_finish(sb);
3213 	sbuf_delete(sb);
3214 	return (err);
3215 }
3216 
3217 static int
3218 t3_dump_txq_ctrl(SYSCTL_HANDLER_ARGS)
3219 {
3220 	struct sge_txq *txq;
3221 	struct sge_qset *qs;
3222 	int i, j, err, dump_end;
3223 	struct sbuf *sb;
3224 	struct tx_desc *txd;
3225 	uint32_t *WR, wr_hi, wr_lo, gen;
3226 
3227 	txq = arg1;
3228 	qs = txq_to_qset(txq, TXQ_CTRL);
3229 	if (txq->txq_dump_count == 0) {
3230 		return (0);
3231 	}
3232 	if (txq->txq_dump_count > 256) {
3233 		log(LOG_WARNING,
3234 		    "dump count is too large %d\n", txq->txq_dump_count);
3235 		txq->txq_dump_count = 1;
3236 		return (EINVAL);
3237 	}
3238 	if (txq->txq_dump_start > 255) {
3239 		log(LOG_WARNING,
3240 		    "dump start of %d is greater than queue size\n",
3241 		    txq->txq_dump_start);
3242 		txq->txq_dump_start = 0;
3243 		return (EINVAL);
3244 	}
3245 
3246 	err = sysctl_wire_old_buffer(req, 0);
3247 	if (err != 0)
3248 		return (err);
3249 	sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req);
3250 	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
3251 	    txq->txq_dump_start,
3252 	    (txq->txq_dump_start + txq->txq_dump_count) & 255);
3253 
3254 	dump_end = txq->txq_dump_start + txq->txq_dump_count;
3255 	for (i = txq->txq_dump_start; i < dump_end; i++) {
3256 		txd = &txq->desc[i & (255)];
3257 		WR = (uint32_t *)txd->flit;
3258 		wr_hi = ntohl(WR[0]);
3259 		wr_lo = ntohl(WR[1]);
3260 		gen = G_WR_GEN(wr_lo);
3261 
3262 		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
3263 		    wr_hi, wr_lo, gen);
3264 		for (j = 2; j < 30; j += 4)
3265 			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
3266 			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
3267 
3268 	}
3269 	err = sbuf_finish(sb);
3270 	sbuf_delete(sb);
3271 	return (err);
3272 }
3273 
3274 static int
3275 t3_set_coalesce_usecs(SYSCTL_HANDLER_ARGS)
3276 {
3277 	adapter_t *sc = arg1;
3278 	struct qset_params *qsp = &sc->params.sge.qset[0];
3279 	int coalesce_usecs;
3280 	struct sge_qset *qs;
3281 	int i, j, err, nqsets = 0;
3282 	struct mtx *lock;
3283 
3284 	if ((sc->flags & FULL_INIT_DONE) == 0)
3285 		return (ENXIO);
3286 
3287 	coalesce_usecs = qsp->coalesce_usecs;
3288         err = sysctl_handle_int(oidp, &coalesce_usecs, arg2, req);
3289 
3290 	if (err != 0) {
3291 		return (err);
3292 	}
3293 	if (coalesce_usecs == qsp->coalesce_usecs)
3294 		return (0);
3295 
3296 	for (i = 0; i < sc->params.nports; i++)
3297 		for (j = 0; j < sc->port[i].nqsets; j++)
3298 			nqsets++;
3299 
3300 	coalesce_usecs = max(1, coalesce_usecs);
3301 
3302 	for (i = 0; i < nqsets; i++) {
3303 		qs = &sc->sge.qs[i];
3304 		qsp = &sc->params.sge.qset[i];
3305 		qsp->coalesce_usecs = coalesce_usecs;
3306 
3307 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
3308 			    &sc->sge.qs[0].rspq.lock;
3309 
3310 		mtx_lock(lock);
3311 		t3_update_qset_coalesce(qs, qsp);
3312 		t3_write_reg(sc, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) |
3313 		    V_NEWTIMER(qs->rspq.holdoff_tmr));
3314 		mtx_unlock(lock);
3315 	}
3316 
3317 	return (0);
3318 }
3319 
3320 static int
3321 t3_pkt_timestamp(SYSCTL_HANDLER_ARGS)
3322 {
3323 	adapter_t *sc = arg1;
3324 	int rc, timestamp;
3325 
3326 	if ((sc->flags & FULL_INIT_DONE) == 0)
3327 		return (ENXIO);
3328 
3329 	timestamp = sc->timestamp;
3330 	rc = sysctl_handle_int(oidp, &timestamp, arg2, req);
3331 
3332 	if (rc != 0)
3333 		return (rc);
3334 
3335 	if (timestamp != sc->timestamp) {
3336 		t3_set_reg_field(sc, A_TP_PC_CONFIG2, F_ENABLERXPKTTMSTPRSS,
3337 		    timestamp ? F_ENABLERXPKTTMSTPRSS : 0);
3338 		sc->timestamp = timestamp;
3339 	}
3340 
3341 	return (0);
3342 }
3343 
3344 void
3345 t3_add_attach_sysctls(adapter_t *sc)
3346 {
3347 	struct sysctl_ctx_list *ctx;
3348 	struct sysctl_oid_list *children;
3349 
3350 	ctx = device_get_sysctl_ctx(sc->dev);
3351 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3352 
3353 	/* random information */
3354 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
3355 	    "firmware_version",
3356 	    CTLFLAG_RD, sc->fw_version,
3357 	    0, "firmware version");
3358 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO,
3359 	    "hw_revision",
3360 	    CTLFLAG_RD, &sc->params.rev,
3361 	    0, "chip model");
3362 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
3363 	    "port_types",
3364 	    CTLFLAG_RD, sc->port_types,
3365 	    0, "type of ports");
3366 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3367 	    "enable_debug",
3368 	    CTLFLAG_RW, &cxgb_debug,
3369 	    0, "enable verbose debugging output");
3370 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tunq_coalesce",
3371 	    CTLFLAG_RD, &sc->tunq_coalesce,
3372 	    "#tunneled packets freed");
3373 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3374 	    "txq_overrun",
3375 	    CTLFLAG_RD, &txq_fills,
3376 	    0, "#times txq overrun");
3377 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO,
3378 	    "core_clock",
3379 	    CTLFLAG_RD, &sc->params.vpd.cclk,
3380 	    0, "core clock frequency (in KHz)");
3381 }
3382 
3383 
3384 static const char *rspq_name = "rspq";
3385 static const char *txq_names[] =
3386 {
3387 	"txq_eth",
3388 	"txq_ofld",
3389 	"txq_ctrl"
3390 };
3391 
3392 static int
3393 sysctl_handle_macstat(SYSCTL_HANDLER_ARGS)
3394 {
3395 	struct port_info *p = arg1;
3396 	uint64_t *parg;
3397 
3398 	if (!p)
3399 		return (EINVAL);
3400 
3401 	cxgb_refresh_stats(p);
3402 	parg = (uint64_t *) ((uint8_t *)&p->mac.stats + arg2);
3403 
3404 	return (sysctl_handle_64(oidp, parg, 0, req));
3405 }
3406 
3407 void
3408 t3_add_configured_sysctls(adapter_t *sc)
3409 {
3410 	struct sysctl_ctx_list *ctx;
3411 	struct sysctl_oid_list *children;
3412 	int i, j;
3413 
3414 	ctx = device_get_sysctl_ctx(sc->dev);
3415 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3416 
3417 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
3418 	    "intr_coal",
3419 	    CTLTYPE_INT|CTLFLAG_RW, sc,
3420 	    0, t3_set_coalesce_usecs,
3421 	    "I", "interrupt coalescing timer (us)");
3422 
3423 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
3424 	    "pkt_timestamp",
3425 	    CTLTYPE_INT | CTLFLAG_RW, sc,
3426 	    0, t3_pkt_timestamp,
3427 	    "I", "provide packet timestamp instead of connection hash");
3428 
3429 	for (i = 0; i < sc->params.nports; i++) {
3430 		struct port_info *pi = &sc->port[i];
3431 		struct sysctl_oid *poid;
3432 		struct sysctl_oid_list *poidlist;
3433 		struct mac_stats *mstats = &pi->mac.stats;
3434 
3435 		snprintf(pi->namebuf, PORT_NAME_LEN, "port%d", i);
3436 		poid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO,
3437 		    pi->namebuf, CTLFLAG_RD, NULL, "port statistics");
3438 		poidlist = SYSCTL_CHILDREN(poid);
3439 		SYSCTL_ADD_UINT(ctx, poidlist, OID_AUTO,
3440 		    "nqsets", CTLFLAG_RD, &pi->nqsets,
3441 		    0, "#queue sets");
3442 
3443 		for (j = 0; j < pi->nqsets; j++) {
3444 			struct sge_qset *qs = &sc->sge.qs[pi->first_qset + j];
3445 			struct sysctl_oid *qspoid, *rspqpoid, *txqpoid,
3446 					  *ctrlqpoid, *lropoid;
3447 			struct sysctl_oid_list *qspoidlist, *rspqpoidlist,
3448 					       *txqpoidlist, *ctrlqpoidlist,
3449 					       *lropoidlist;
3450 			struct sge_txq *txq = &qs->txq[TXQ_ETH];
3451 
3452 			snprintf(qs->namebuf, QS_NAME_LEN, "qs%d", j);
3453 
3454 			qspoid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO,
3455 			    qs->namebuf, CTLFLAG_RD, NULL, "qset statistics");
3456 			qspoidlist = SYSCTL_CHILDREN(qspoid);
3457 
3458 			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl0_empty",
3459 					CTLFLAG_RD, &qs->fl[0].empty, 0,
3460 					"freelist #0 empty");
3461 			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl1_empty",
3462 					CTLFLAG_RD, &qs->fl[1].empty, 0,
3463 					"freelist #1 empty");
3464 
3465 			rspqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3466 			    rspq_name, CTLFLAG_RD, NULL, "rspq statistics");
3467 			rspqpoidlist = SYSCTL_CHILDREN(rspqpoid);
3468 
3469 			txqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3470 			    txq_names[0], CTLFLAG_RD, NULL, "txq statistics");
3471 			txqpoidlist = SYSCTL_CHILDREN(txqpoid);
3472 
3473 			ctrlqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3474 			    txq_names[2], CTLFLAG_RD, NULL, "ctrlq statistics");
3475 			ctrlqpoidlist = SYSCTL_CHILDREN(ctrlqpoid);
3476 
3477 			lropoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3478 			    "lro_stats", CTLFLAG_RD, NULL, "LRO statistics");
3479 			lropoidlist = SYSCTL_CHILDREN(lropoid);
3480 
3481 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "size",
3482 			    CTLFLAG_RD, &qs->rspq.size,
3483 			    0, "#entries in response queue");
3484 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "cidx",
3485 			    CTLFLAG_RD, &qs->rspq.cidx,
3486 			    0, "consumer index");
3487 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "credits",
3488 			    CTLFLAG_RD, &qs->rspq.credits,
3489 			    0, "#credits");
3490 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "starved",
3491 			    CTLFLAG_RD, &qs->rspq.starved,
3492 			    0, "#times starved");
3493 			SYSCTL_ADD_UAUTO(ctx, rspqpoidlist, OID_AUTO, "phys_addr",
3494 			    CTLFLAG_RD, &qs->rspq.phys_addr,
3495 			    "physical_address_of the queue");
3496 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_start",
3497 			    CTLFLAG_RW, &qs->rspq.rspq_dump_start,
3498 			    0, "start rspq dump entry");
3499 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_count",
3500 			    CTLFLAG_RW, &qs->rspq.rspq_dump_count,
3501 			    0, "#rspq entries to dump");
3502 			SYSCTL_ADD_PROC(ctx, rspqpoidlist, OID_AUTO, "qdump",
3503 			    CTLTYPE_STRING | CTLFLAG_RD, &qs->rspq,
3504 			    0, t3_dump_rspq, "A", "dump of the response queue");
3505 
3506 			SYSCTL_ADD_UQUAD(ctx, txqpoidlist, OID_AUTO, "dropped",
3507 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].txq_mr->br_drops,
3508 			    "#tunneled packets dropped");
3509 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "sendqlen",
3510 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].sendq.mq_len,
3511 			    0, "#tunneled packets waiting to be sent");
3512 #if 0
3513 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_pidx",
3514 			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_prod,
3515 			    0, "#tunneled packets queue producer index");
3516 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_cidx",
3517 			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_cons,
3518 			    0, "#tunneled packets queue consumer index");
3519 #endif
3520 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "processed",
3521 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].processed,
3522 			    0, "#tunneled packets processed by the card");
3523 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "cleaned",
3524 			    CTLFLAG_RD, &txq->cleaned,
3525 			    0, "#tunneled packets cleaned");
3526 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "in_use",
3527 			    CTLFLAG_RD, &txq->in_use,
3528 			    0, "#tunneled packet slots in use");
3529 			SYSCTL_ADD_UQUAD(ctx, txqpoidlist, OID_AUTO, "frees",
3530 			    CTLFLAG_RD, &txq->txq_frees,
3531 			    "#tunneled packets freed");
3532 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "skipped",
3533 			    CTLFLAG_RD, &txq->txq_skipped,
3534 			    0, "#tunneled packet descriptors skipped");
3535 			SYSCTL_ADD_UQUAD(ctx, txqpoidlist, OID_AUTO, "coalesced",
3536 			    CTLFLAG_RD, &txq->txq_coalesced,
3537 			    "#tunneled packets coalesced");
3538 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "enqueued",
3539 			    CTLFLAG_RD, &txq->txq_enqueued,
3540 			    0, "#tunneled packets enqueued to hardware");
3541 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "stopped_flags",
3542 			    CTLFLAG_RD, &qs->txq_stopped,
3543 			    0, "tx queues stopped");
3544 			SYSCTL_ADD_UAUTO(ctx, txqpoidlist, OID_AUTO, "phys_addr",
3545 			    CTLFLAG_RD, &txq->phys_addr,
3546 			    "physical_address_of the queue");
3547 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "qgen",
3548 			    CTLFLAG_RW, &qs->txq[TXQ_ETH].gen,
3549 			    0, "txq generation");
3550 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_cidx",
3551 			    CTLFLAG_RD, &txq->cidx,
3552 			    0, "hardware queue cidx");
3553 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_pidx",
3554 			    CTLFLAG_RD, &txq->pidx,
3555 			    0, "hardware queue pidx");
3556 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_start",
3557 			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_start,
3558 			    0, "txq start idx for dump");
3559 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_count",
3560 			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_count,
3561 			    0, "txq #entries to dump");
3562 			SYSCTL_ADD_PROC(ctx, txqpoidlist, OID_AUTO, "qdump",
3563 			    CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_ETH],
3564 			    0, t3_dump_txq_eth, "A", "dump of the transmit queue");
3565 
3566 			SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_start",
3567 			    CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_start,
3568 			    0, "ctrlq start idx for dump");
3569 			SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_count",
3570 			    CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_count,
3571 			    0, "ctrl #entries to dump");
3572 			SYSCTL_ADD_PROC(ctx, ctrlqpoidlist, OID_AUTO, "qdump",
3573 			    CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_CTRL],
3574 			    0, t3_dump_txq_ctrl, "A", "dump of the transmit queue");
3575 
3576 			SYSCTL_ADD_U64(ctx, lropoidlist, OID_AUTO, "lro_queued",
3577 			    CTLFLAG_RD, &qs->lro.ctrl.lro_queued, 0, NULL);
3578 			SYSCTL_ADD_U64(ctx, lropoidlist, OID_AUTO, "lro_flushed",
3579 			    CTLFLAG_RD, &qs->lro.ctrl.lro_flushed, 0, NULL);
3580 			SYSCTL_ADD_U64(ctx, lropoidlist, OID_AUTO, "lro_bad_csum",
3581 			    CTLFLAG_RD, &qs->lro.ctrl.lro_bad_csum, 0, NULL);
3582 			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_cnt",
3583 			    CTLFLAG_RD, &qs->lro.ctrl.lro_cnt, 0, NULL);
3584 		}
3585 
3586 		/* Now add a node for mac stats. */
3587 		poid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO, "mac_stats",
3588 		    CTLFLAG_RD, NULL, "MAC statistics");
3589 		poidlist = SYSCTL_CHILDREN(poid);
3590 
3591 		/*
3592 		 * We (ab)use the length argument (arg2) to pass on the offset
3593 		 * of the data that we are interested in.  This is only required
3594 		 * for the quad counters that are updated from the hardware (we
3595 		 * make sure that we return the latest value).
3596 		 * sysctl_handle_macstat first updates *all* the counters from
3597 		 * the hardware, and then returns the latest value of the
3598 		 * requested counter.  Best would be to update only the
3599 		 * requested counter from hardware, but t3_mac_update_stats()
3600 		 * hides all the register details and we don't want to dive into
3601 		 * all that here.
3602 		 */
3603 #define CXGB_SYSCTL_ADD_QUAD(a)	SYSCTL_ADD_OID(ctx, poidlist, OID_AUTO, #a, \
3604     (CTLTYPE_U64 | CTLFLAG_RD), pi, offsetof(struct mac_stats, a), \
3605     sysctl_handle_macstat, "QU", 0)
3606 		CXGB_SYSCTL_ADD_QUAD(tx_octets);
3607 		CXGB_SYSCTL_ADD_QUAD(tx_octets_bad);
3608 		CXGB_SYSCTL_ADD_QUAD(tx_frames);
3609 		CXGB_SYSCTL_ADD_QUAD(tx_mcast_frames);
3610 		CXGB_SYSCTL_ADD_QUAD(tx_bcast_frames);
3611 		CXGB_SYSCTL_ADD_QUAD(tx_pause);
3612 		CXGB_SYSCTL_ADD_QUAD(tx_deferred);
3613 		CXGB_SYSCTL_ADD_QUAD(tx_late_collisions);
3614 		CXGB_SYSCTL_ADD_QUAD(tx_total_collisions);
3615 		CXGB_SYSCTL_ADD_QUAD(tx_excess_collisions);
3616 		CXGB_SYSCTL_ADD_QUAD(tx_underrun);
3617 		CXGB_SYSCTL_ADD_QUAD(tx_len_errs);
3618 		CXGB_SYSCTL_ADD_QUAD(tx_mac_internal_errs);
3619 		CXGB_SYSCTL_ADD_QUAD(tx_excess_deferral);
3620 		CXGB_SYSCTL_ADD_QUAD(tx_fcs_errs);
3621 		CXGB_SYSCTL_ADD_QUAD(tx_frames_64);
3622 		CXGB_SYSCTL_ADD_QUAD(tx_frames_65_127);
3623 		CXGB_SYSCTL_ADD_QUAD(tx_frames_128_255);
3624 		CXGB_SYSCTL_ADD_QUAD(tx_frames_256_511);
3625 		CXGB_SYSCTL_ADD_QUAD(tx_frames_512_1023);
3626 		CXGB_SYSCTL_ADD_QUAD(tx_frames_1024_1518);
3627 		CXGB_SYSCTL_ADD_QUAD(tx_frames_1519_max);
3628 		CXGB_SYSCTL_ADD_QUAD(rx_octets);
3629 		CXGB_SYSCTL_ADD_QUAD(rx_octets_bad);
3630 		CXGB_SYSCTL_ADD_QUAD(rx_frames);
3631 		CXGB_SYSCTL_ADD_QUAD(rx_mcast_frames);
3632 		CXGB_SYSCTL_ADD_QUAD(rx_bcast_frames);
3633 		CXGB_SYSCTL_ADD_QUAD(rx_pause);
3634 		CXGB_SYSCTL_ADD_QUAD(rx_fcs_errs);
3635 		CXGB_SYSCTL_ADD_QUAD(rx_align_errs);
3636 		CXGB_SYSCTL_ADD_QUAD(rx_symbol_errs);
3637 		CXGB_SYSCTL_ADD_QUAD(rx_data_errs);
3638 		CXGB_SYSCTL_ADD_QUAD(rx_sequence_errs);
3639 		CXGB_SYSCTL_ADD_QUAD(rx_runt);
3640 		CXGB_SYSCTL_ADD_QUAD(rx_jabber);
3641 		CXGB_SYSCTL_ADD_QUAD(rx_short);
3642 		CXGB_SYSCTL_ADD_QUAD(rx_too_long);
3643 		CXGB_SYSCTL_ADD_QUAD(rx_mac_internal_errs);
3644 		CXGB_SYSCTL_ADD_QUAD(rx_cong_drops);
3645 		CXGB_SYSCTL_ADD_QUAD(rx_frames_64);
3646 		CXGB_SYSCTL_ADD_QUAD(rx_frames_65_127);
3647 		CXGB_SYSCTL_ADD_QUAD(rx_frames_128_255);
3648 		CXGB_SYSCTL_ADD_QUAD(rx_frames_256_511);
3649 		CXGB_SYSCTL_ADD_QUAD(rx_frames_512_1023);
3650 		CXGB_SYSCTL_ADD_QUAD(rx_frames_1024_1518);
3651 		CXGB_SYSCTL_ADD_QUAD(rx_frames_1519_max);
3652 #undef CXGB_SYSCTL_ADD_QUAD
3653 
3654 #define CXGB_SYSCTL_ADD_ULONG(a) SYSCTL_ADD_ULONG(ctx, poidlist, OID_AUTO, #a, \
3655     CTLFLAG_RD, &mstats->a, 0)
3656 		CXGB_SYSCTL_ADD_ULONG(tx_fifo_parity_err);
3657 		CXGB_SYSCTL_ADD_ULONG(rx_fifo_parity_err);
3658 		CXGB_SYSCTL_ADD_ULONG(tx_fifo_urun);
3659 		CXGB_SYSCTL_ADD_ULONG(rx_fifo_ovfl);
3660 		CXGB_SYSCTL_ADD_ULONG(serdes_signal_loss);
3661 		CXGB_SYSCTL_ADD_ULONG(xaui_pcs_ctc_err);
3662 		CXGB_SYSCTL_ADD_ULONG(xaui_pcs_align_change);
3663 		CXGB_SYSCTL_ADD_ULONG(num_toggled);
3664 		CXGB_SYSCTL_ADD_ULONG(num_resets);
3665 		CXGB_SYSCTL_ADD_ULONG(link_faults);
3666 #undef CXGB_SYSCTL_ADD_ULONG
3667 	}
3668 }
3669 
3670 /**
3671  *	t3_get_desc - dump an SGE descriptor for debugging purposes
3672  *	@qs: the queue set
3673  *	@qnum: identifies the specific queue (0..2: Tx, 3:response, 4..5: Rx)
3674  *	@idx: the descriptor index in the queue
3675  *	@data: where to dump the descriptor contents
3676  *
3677  *	Dumps the contents of a HW descriptor of an SGE queue.  Returns the
3678  *	size of the descriptor.
3679  */
3680 int
3681 t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx,
3682 		unsigned char *data)
3683 {
3684 	if (qnum >= 6)
3685 		return (EINVAL);
3686 
3687 	if (qnum < 3) {
3688 		if (!qs->txq[qnum].desc || idx >= qs->txq[qnum].size)
3689 			return -EINVAL;
3690 		memcpy(data, &qs->txq[qnum].desc[idx], sizeof(struct tx_desc));
3691 		return sizeof(struct tx_desc);
3692 	}
3693 
3694 	if (qnum == 3) {
3695 		if (!qs->rspq.desc || idx >= qs->rspq.size)
3696 			return (EINVAL);
3697 		memcpy(data, &qs->rspq.desc[idx], sizeof(struct rsp_desc));
3698 		return sizeof(struct rsp_desc);
3699 	}
3700 
3701 	qnum -= 4;
3702 	if (!qs->fl[qnum].desc || idx >= qs->fl[qnum].size)
3703 		return (EINVAL);
3704 	memcpy(data, &qs->fl[qnum].desc[idx], sizeof(struct rx_desc));
3705 	return sizeof(struct rx_desc);
3706 }
3707