xref: /freebsd/sys/dev/cxgb/cxgb_sge.c (revision 6356dba0b403daa023dec24559ab1f8e602e4f14)
1 /**************************************************************************
2 
3 Copyright (c) 2007, Chelsio Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Neither the name of the Chelsio Corporation nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15 
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27 
28 ***************************************************************************/
29 #define DEBUG_BUFRING
30 
31 
32 #include <sys/cdefs.h>
33 __FBSDID("$FreeBSD$");
34 
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/kernel.h>
38 #include <sys/module.h>
39 #include <sys/bus.h>
40 #include <sys/conf.h>
41 #include <machine/bus.h>
42 #include <machine/resource.h>
43 #include <sys/bus_dma.h>
44 #include <sys/rman.h>
45 #include <sys/queue.h>
46 #include <sys/sysctl.h>
47 #include <sys/taskqueue.h>
48 
49 #include <sys/proc.h>
50 #include <sys/sbuf.h>
51 #include <sys/sched.h>
52 #include <sys/smp.h>
53 #include <sys/systm.h>
54 #include <sys/syslog.h>
55 
56 #include <netinet/in_systm.h>
57 #include <netinet/in.h>
58 #include <netinet/ip.h>
59 #include <netinet/tcp.h>
60 
61 #include <dev/pci/pcireg.h>
62 #include <dev/pci/pcivar.h>
63 
64 #include <vm/vm.h>
65 #include <vm/pmap.h>
66 
67 #ifdef CONFIG_DEFINED
68 #include <cxgb_include.h>
69 #include <sys/mvec.h>
70 #else
71 #include <dev/cxgb/cxgb_include.h>
72 #include <dev/cxgb/sys/mvec.h>
73 #endif
74 
75 int      txq_fills = 0;
76 /*
77  * XXX don't re-enable this until TOE stops assuming
78  * we have an m_ext
79  */
80 static int recycle_enable = 0;
81 extern int cxgb_txq_buf_ring_size;
82 int cxgb_cached_allocations;
83 int cxgb_cached;
84 int cxgb_ext_freed = 0;
85 int cxgb_ext_inited = 0;
86 int fl_q_size = 0;
87 int jumbo_q_size = 0;
88 
89 extern int cxgb_use_16k_clusters;
90 extern int cxgb_pcpu_cache_enable;
91 extern int nmbjumbo4;
92 extern int nmbjumbo9;
93 extern int nmbjumbo16;
94 
95 
96 
97 
98 #define USE_GTS 0
99 
100 #define SGE_RX_SM_BUF_SIZE	1536
101 #define SGE_RX_DROP_THRES	16
102 #define SGE_RX_COPY_THRES	128
103 
104 /*
105  * Period of the Tx buffer reclaim timer.  This timer does not need to run
106  * frequently as Tx buffers are usually reclaimed by new Tx packets.
107  */
108 #define TX_RECLAIM_PERIOD       (hz >> 1)
109 
110 /*
111  * Values for sge_txq.flags
112  */
113 enum {
114 	TXQ_RUNNING	= 1 << 0,  /* fetch engine is running */
115 	TXQ_LAST_PKT_DB = 1 << 1,  /* last packet rang the doorbell */
116 };
117 
118 struct tx_desc {
119 	uint64_t	flit[TX_DESC_FLITS];
120 } __packed;
121 
122 struct rx_desc {
123 	uint32_t	addr_lo;
124 	uint32_t	len_gen;
125 	uint32_t	gen2;
126 	uint32_t	addr_hi;
127 } __packed;
128 
129 struct rsp_desc {               /* response queue descriptor */
130 	struct rss_header	rss_hdr;
131 	uint32_t		flags;
132 	uint32_t		len_cq;
133 	uint8_t			imm_data[47];
134 	uint8_t			intr_gen;
135 } __packed;
136 
137 #define RX_SW_DESC_MAP_CREATED	(1 << 0)
138 #define TX_SW_DESC_MAP_CREATED	(1 << 1)
139 #define RX_SW_DESC_INUSE        (1 << 3)
140 #define TX_SW_DESC_MAPPED       (1 << 4)
141 
142 #define RSPQ_NSOP_NEOP           G_RSPD_SOP_EOP(0)
143 #define RSPQ_EOP                 G_RSPD_SOP_EOP(F_RSPD_EOP)
144 #define RSPQ_SOP                 G_RSPD_SOP_EOP(F_RSPD_SOP)
145 #define RSPQ_SOP_EOP             G_RSPD_SOP_EOP(F_RSPD_SOP|F_RSPD_EOP)
146 
147 struct tx_sw_desc {                /* SW state per Tx descriptor */
148 	struct mbuf_iovec mi;
149 	bus_dmamap_t	map;
150 	int		flags;
151 };
152 
153 struct rx_sw_desc {                /* SW state per Rx descriptor */
154 	caddr_t	         rxsd_cl;
155 	caddr_t	         data;
156 	bus_dmamap_t	  map;
157 	int		  flags;
158 };
159 
160 struct txq_state {
161 	unsigned int compl;
162 	unsigned int gen;
163 	unsigned int pidx;
164 };
165 
166 struct refill_fl_cb_arg {
167 	int               error;
168 	bus_dma_segment_t seg;
169 	int               nseg;
170 };
171 
172 /*
173  * Maps a number of flits to the number of Tx descriptors that can hold them.
174  * The formula is
175  *
176  * desc = 1 + (flits - 2) / (WR_FLITS - 1).
177  *
178  * HW allows up to 4 descriptors to be combined into a WR.
179  */
180 static uint8_t flit_desc_map[] = {
181 	0,
182 #if SGE_NUM_GENBITS == 1
183 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
184 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
185 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
186 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
187 #elif SGE_NUM_GENBITS == 2
188 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
189 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
190 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
191 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
192 #else
193 # error "SGE_NUM_GENBITS must be 1 or 2"
194 #endif
195 };
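
/*
 * Worked example (added for illustration, not part of the original source):
 * with SGE_NUM_GENBITS == 2 the table above is consistent with WR_FLITS == 15,
 * which would correspond to the last flit of each descriptor being reserved
 * for the generation bits (see wr_gen2() below).  The formula then gives e.g.
 *
 *	flits = 15 -> desc = 1 + (15 - 2) / 14 = 1
 *	flits = 16 -> desc = 1 + (16 - 2) / 14 = 2
 *	flits = 30 -> desc = 1 + (30 - 2) / 14 = 3
 *
 * matching flit_desc_map[15] == 1, flit_desc_map[16] == 2 and
 * flit_desc_map[30] == 3.
 */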
196 
197 
198 int cxgb_debug = 0;
199 
200 static void sge_timer_cb(void *arg);
201 static void sge_timer_reclaim(void *arg, int ncount);
202 static void sge_txq_reclaim_handler(void *arg, int ncount);
203 
204 /**
205  *	reclaim_completed_tx_ - reclaims completed Tx descriptors
206  *	@q: the Tx queue to reclaim completed descriptors from
207  *	@reclaim_min: do not reclaim unless at least this many descriptors can be reclaimed
208  *
209  *	Reclaims Tx descriptors that the SGE has indicated it has processed,
210  *	and frees the associated buffers if possible.  Called with the Tx
211  *	queue's lock held.
212  */
213 static __inline int
214 reclaim_completed_tx_(struct sge_txq *q, int reclaim_min)
215 {
216 	int reclaim = desc_reclaimable(q);
217 
218 	if (reclaim < reclaim_min)
219 		return (0);
220 
221 	mtx_assert(&q->lock, MA_OWNED);
222 	if (reclaim > 0) {
223 		t3_free_tx_desc(q, reclaim);
224 		q->cleaned += reclaim;
225 		q->in_use -= reclaim;
226 	}
227 	return (reclaim);
228 }
229 
230 /**
231  *	should_restart_tx - are there enough resources to restart a Tx queue?
232  *	@q: the Tx queue
233  *
234  *	Checks if there are enough descriptors to restart a suspended Tx queue.
235  */
236 static __inline int
237 should_restart_tx(const struct sge_txq *q)
238 {
239 	unsigned int r = q->processed - q->cleaned;
240 
241 	return q->in_use - r < (q->size >> 1);
242 }
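
/*
 * Added note: the in-use count is adjusted by the descriptors the SGE has
 * already processed but that have not yet been cleaned, and the queue is
 * considered restartable once fewer than half of its descriptors remain in
 * use, e.g. fewer than 512 outstanding entries for a 1024-entry queue.
 */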
243 
244 /**
245  *	t3_sge_init - initialize SGE
246  *	@adap: the adapter
247  *	@p: the SGE parameters
248  *
249  *	Performs SGE initialization needed every time after a chip reset.
250  *	We do not initialize any of the queue sets here, instead the driver
251  *	top-level must request those individually.  We also do not enable DMA
252  *	here, that should be done after the queues have been set up.
253  */
254 void
255 t3_sge_init(adapter_t *adap, struct sge_params *p)
256 {
257 	u_int ctrl, ups;
258 
259 	ups = 0; /* = ffs(pci_resource_len(adap->pdev, 2) >> 12); */
260 
261 	ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL |
262 	       F_CQCRDTCTRL | F_CONGMODE | F_TNLFLMODE | F_FATLPERREN |
263 	       V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS |
264 	       V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING;
265 #if SGE_NUM_GENBITS == 1
266 	ctrl |= F_EGRGENCTRL;
267 #endif
268 	if (adap->params.rev > 0) {
269 		if (!(adap->flags & (USING_MSIX | USING_MSI)))
270 			ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ;
271 	}
272 	t3_write_reg(adap, A_SG_CONTROL, ctrl);
273 	t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) |
274 		     V_LORCQDRBTHRSH(512));
275 	t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10);
276 	t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) |
277 		     V_TIMEOUT(200 * core_ticks_per_usec(adap)));
278 	t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH,
279 		     adap->params.rev < T3_REV_C ? 1000 : 500);
280 	t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256);
281 	t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000);
282 	t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256);
283 	t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff));
284 	t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024);
285 }
286 
287 
288 /**
289  *	sgl_len - calculates the size of an SGL of the given capacity
290  *	@n: the number of SGL entries
291  *
292  *	Calculates the number of flits needed for a scatter/gather list that
293  *	can hold the given number of entries.
294  */
295 static __inline unsigned int
296 sgl_len(unsigned int n)
297 {
298 	return ((3 * n) / 2 + (n & 1));
299 }
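
/*
 * Worked example (added for illustration): each struct sg_ent packs two
 * length/address pairs into 3 flits, so 4 SGL entries need (3 * 4) / 2 = 6
 * flits while 3 entries need (3 * 3) / 2 + 1 = 5, since the odd final entry
 * still occupies a full address flit.
 */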
300 
301 /**
302  *	get_imm_packet - copy an immediate-data response into an mbuf
303  *	@resp: the response descriptor containing the packet data
304  *	@m: the mbuf to receive the immediate data
305  *
306  *	Copies the immediate data of the given response into the supplied mbuf.
306  */
307 static int
308 get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct mbuf *m)
309 {
310 
311 	m->m_len = m->m_pkthdr.len = IMMED_PKT_SIZE;
312 	m->m_ext.ext_buf = NULL;
313 	m->m_ext.ext_type = 0;
314 	memcpy(mtod(m, uint8_t *), resp->imm_data, IMMED_PKT_SIZE);
315 	return (0);
316 }
317 
318 static __inline u_int
319 flits_to_desc(u_int n)
320 {
321 	return (flit_desc_map[n]);
322 }
323 
324 #define SGE_PARERR (F_CPPARITYERROR | F_OCPARITYERROR | F_RCPARITYERROR | \
325 		    F_IRPARITYERROR | V_ITPARITYERROR(M_ITPARITYERROR) | \
326 		    V_FLPARITYERROR(M_FLPARITYERROR) | F_LODRBPARITYERROR | \
327 		    F_HIDRBPARITYERROR | F_LORCQPARITYERROR | \
328 		    F_HIRCQPARITYERROR)
329 #define SGE_FRAMINGERR (F_UC_REQ_FRAMINGERROR | F_R_REQ_FRAMINGERROR)
330 #define SGE_FATALERR (SGE_PARERR | SGE_FRAMINGERR | F_RSPQCREDITOVERFOW | \
331 		      F_RSPQDISABLED)
332 
333 /**
334  *	t3_sge_err_intr_handler - SGE async event interrupt handler
335  *	@adapter: the adapter
336  *
337  *	Interrupt handler for SGE asynchronous (non-data) events.
338  */
339 void
340 t3_sge_err_intr_handler(adapter_t *adapter)
341 {
342 	unsigned int v, status;
343 
344 	status = t3_read_reg(adapter, A_SG_INT_CAUSE);
345 	if (status & SGE_PARERR)
346 		CH_ALERT(adapter, "SGE parity error (0x%x)\n",
347 			 status & SGE_PARERR);
348 	if (status & SGE_FRAMINGERR)
349 		CH_ALERT(adapter, "SGE framing error (0x%x)\n",
350 			 status & SGE_FRAMINGERR);
351 	if (status & F_RSPQCREDITOVERFOW)
352 		CH_ALERT(adapter, "SGE response queue credit overflow\n");
353 
354 	if (status & F_RSPQDISABLED) {
355 		v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS);
356 
357 		CH_ALERT(adapter,
358 			 "packet delivered to disabled response queue (0x%x)\n",
359 			 (v >> S_RSPQ0DISABLED) & 0xff);
360 	}
361 
362 	t3_write_reg(adapter, A_SG_INT_CAUSE, status);
363 	if (status & SGE_FATALERR)
364 		t3_fatal_err(adapter);
365 }
366 
367 void
368 t3_sge_prep(adapter_t *adap, struct sge_params *p)
369 {
370 	int i, nqsets;
371 
372 	nqsets = min(SGE_QSETS, mp_ncpus*4);
373 
374 	fl_q_size = min(nmbclusters/(3*nqsets), FL_Q_SIZE);
375 
376 	while (!powerof2(fl_q_size))
377 		fl_q_size--;
378 #if __FreeBSD_version > 800000
379 	if (cxgb_use_16k_clusters)
380 		jumbo_q_size = min(nmbjumbo16/(3*nqsets), JUMBO_Q_SIZE);
381 	else
382 		jumbo_q_size = min(nmbjumbo9/(3*nqsets), JUMBO_Q_SIZE);
383 #else
384 	jumbo_q_size = min(nmbjumbo4/(3*nqsets), JUMBO_Q_SIZE);
385 #endif
386 	while (!powerof2(jumbo_q_size))
387 		jumbo_q_size--;
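	/*
	 * Sizing example (added for illustration; the values are assumptions,
	 * not configuration defaults): with nmbclusters == 32768 and
	 * nqsets == 8, fl_q_size starts as min(32768 / 24, FL_Q_SIZE) = 1365
	 * (assuming FL_Q_SIZE is at least that large) and is rounded down to
	 * 1024 by the power-of-two loop above; jumbo_q_size is derived the
	 * same way from the jumbo-cluster limit for the cluster size in use.
	 */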
388 
389 	/* XXX Does ETHER_ALIGN need to be accounted for here? */
390 	p->max_pkt_size = adap->sge.qs[0].fl[1].buf_size - sizeof(struct cpl_rx_data);
391 
392 	for (i = 0; i < SGE_QSETS; ++i) {
393 		struct qset_params *q = p->qset + i;
394 
395 		if (adap->params.nports > 2) {
396 			q->coalesce_usecs = 50;
397 		} else {
398 #ifdef INVARIANTS
399 			q->coalesce_usecs = 10;
400 #else
401 			q->coalesce_usecs = 5;
402 #endif
403 		}
404 		q->polling = adap->params.rev > 0;
405 		q->rspq_size = RSPQ_Q_SIZE;
406 		q->fl_size = fl_q_size;
407 		q->jumbo_size = jumbo_q_size;
408 		q->txq_size[TXQ_ETH] = TX_ETH_Q_SIZE;
409 		q->txq_size[TXQ_OFLD] = 1024;
410 		q->txq_size[TXQ_CTRL] = 256;
411 		q->cong_thres = 0;
412 	}
413 }
414 
415 int
416 t3_sge_alloc(adapter_t *sc)
417 {
418 
419 	/* The parent tag. */
420 	if (bus_dma_tag_create( NULL,			/* parent */
421 				1, 0,			/* algnmnt, boundary */
422 				BUS_SPACE_MAXADDR,	/* lowaddr */
423 				BUS_SPACE_MAXADDR,	/* highaddr */
424 				NULL, NULL,		/* filter, filterarg */
425 				BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
426 				BUS_SPACE_UNRESTRICTED, /* nsegments */
427 				BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
428 				0,			/* flags */
429 				NULL, NULL,		/* lock, lockarg */
430 				&sc->parent_dmat)) {
431 		device_printf(sc->dev, "Cannot allocate parent DMA tag\n");
432 		return (ENOMEM);
433 	}
434 
435 	/*
436 	 * DMA tag for normal sized RX frames
437 	 */
438 	if (bus_dma_tag_create(sc->parent_dmat, MCLBYTES, 0, BUS_SPACE_MAXADDR,
439 		BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, 1,
440 		MCLBYTES, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_dmat)) {
441 		device_printf(sc->dev, "Cannot allocate RX DMA tag\n");
442 		return (ENOMEM);
443 	}
444 
445 	/*
446 	 * DMA tag for jumbo sized RX frames.
447 	 */
448 	if (bus_dma_tag_create(sc->parent_dmat, MJUM16BYTES, 0, BUS_SPACE_MAXADDR,
449 		BUS_SPACE_MAXADDR, NULL, NULL, MJUM16BYTES, 1, MJUM16BYTES,
450 		BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_jumbo_dmat)) {
451 		device_printf(sc->dev, "Cannot allocate RX jumbo DMA tag\n");
452 		return (ENOMEM);
453 	}
454 
455 	/*
456 	 * DMA tag for TX frames.
457 	 */
458 	if (bus_dma_tag_create(sc->parent_dmat, 1, 0, BUS_SPACE_MAXADDR,
459 		BUS_SPACE_MAXADDR, NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
460 		TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
461 		NULL, NULL, &sc->tx_dmat)) {
462 		device_printf(sc->dev, "Cannot allocate TX DMA tag\n");
463 		return (ENOMEM);
464 	}
465 
466 	return (0);
467 }
468 
469 int
470 t3_sge_free(struct adapter * sc)
471 {
472 
473 	if (sc->tx_dmat != NULL)
474 		bus_dma_tag_destroy(sc->tx_dmat);
475 
476 	if (sc->rx_jumbo_dmat != NULL)
477 		bus_dma_tag_destroy(sc->rx_jumbo_dmat);
478 
479 	if (sc->rx_dmat != NULL)
480 		bus_dma_tag_destroy(sc->rx_dmat);
481 
482 	if (sc->parent_dmat != NULL)
483 		bus_dma_tag_destroy(sc->parent_dmat);
484 
485 	return (0);
486 }
487 
488 void
489 t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p)
490 {
491 
492 	qs->rspq.holdoff_tmr = max(p->coalesce_usecs * 10, 1U);
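	/*
	 * Added note: the holdoff timer is programmed in SGE timer ticks;
	 * t3_sge_init() sets A_SG_TIMER_TICK to core_ticks_per_usec() / 10,
	 * i.e. one tick is roughly 0.1us, hence the multiplication by 10.
	 */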
493 	qs->rspq.polling = 0 /* p->polling */;
494 }
495 
496 #if !defined(__i386__) && !defined(__amd64__)
497 static void
498 refill_fl_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
499 {
500 	struct refill_fl_cb_arg *cb_arg = arg;
501 
502 	cb_arg->error = error;
503 	cb_arg->seg = segs[0];
504 	cb_arg->nseg = nseg;
505 
506 }
507 #endif
508 /**
509  *	refill_fl - refill an SGE free-buffer list
510  *	@sc: the controller softc
511  *	@q: the free-list to refill
512  *	@n: the number of new buffers to allocate
513  *
514  *	(Re)populate an SGE free-buffer list with up to @n new packet buffers.
515  *	The caller must assure that @n does not exceed the queue's capacity.
516  */
517 static void
518 refill_fl(adapter_t *sc, struct sge_fl *q, int n)
519 {
520 	struct rx_sw_desc *sd = &q->sdesc[q->pidx];
521 	struct rx_desc *d = &q->desc[q->pidx];
522 	struct refill_fl_cb_arg cb_arg;
523 	caddr_t cl;
524 	int err, count = 0;
525 	int header_size = sizeof(struct m_hdr) + sizeof(struct pkthdr) + sizeof(struct m_ext_) + sizeof(uint32_t);
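	/*
	 * Added note: header_size reserves space at the front of each cluster
	 * for an mbuf header, packet header, external-storage bookkeeping and
	 * a reference-count word, so only cl + header_size onward is mapped
	 * for DMA and handed to the hardware; the mbuf itself is built later,
	 * when the buffer is received.
	 */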
526 
527 	cb_arg.error = 0;
528 	while (n--) {
529 		/*
530 		 * We only allocate a cluster, mbuf allocation happens after rx
531 		 */
532 		if ((cl = cxgb_cache_get(q->zone)) == NULL) {
533 			log(LOG_WARNING, "Failed to allocate cluster\n");
534 			goto done;
535 		}
536 
537 		if ((sd->flags & RX_SW_DESC_MAP_CREATED) == 0) {
538 			if ((err = bus_dmamap_create(q->entry_tag, 0, &sd->map))) {
539 				log(LOG_WARNING, "bus_dmamap_create failed %d\n", err);
540 				uma_zfree(q->zone, cl);
541 				goto done;
542 			}
543 			sd->flags |= RX_SW_DESC_MAP_CREATED;
544 		}
545 #if !defined(__i386__) && !defined(__amd64__)
546 		err = bus_dmamap_load(q->entry_tag, sd->map,
547 		    cl + header_size, q->buf_size,
548 		    refill_fl_cb, &cb_arg, 0);
549 
550 		if (err != 0 || cb_arg.error) {
551 			log(LOG_WARNING, "failure in refill_fl %d\n", cb_arg.error);
552 			/*
553 			 * XXX free cluster
554 			 */
555 			return;
556 		}
557 #else
558 		cb_arg.seg.ds_addr = pmap_kextract((vm_offset_t)(cl + header_size));
559 #endif
560 		sd->flags |= RX_SW_DESC_INUSE;
561 		sd->rxsd_cl = cl;
562 		sd->data = cl + header_size;
563 		d->addr_lo = htobe32(cb_arg.seg.ds_addr & 0xffffffff);
564 		d->addr_hi = htobe32(((uint64_t)cb_arg.seg.ds_addr >>32) & 0xffffffff);
565 		d->len_gen = htobe32(V_FLD_GEN1(q->gen));
566 		d->gen2 = htobe32(V_FLD_GEN2(q->gen));
567 
568 		d++;
569 		sd++;
570 
571 		if (++q->pidx == q->size) {
572 			q->pidx = 0;
573 			q->gen ^= 1;
574 			sd = q->sdesc;
575 			d = q->desc;
576 		}
577 		q->credits++;
578 		count++;
579 	}
580 
581 done:
582 	if (count)
583 		t3_write_reg(sc, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
584 }
585 
586 
587 /**
588  *	free_rx_bufs - free the Rx buffers on an SGE free list
589  *	@sc: the controller softc
590  *	@q: the SGE free list to clean up
591  *
592  *	Release the buffers on an SGE free-buffer Rx queue.  HW fetching from
593  *	this queue should be stopped before calling this function.
594  */
595 static void
596 free_rx_bufs(adapter_t *sc, struct sge_fl *q)
597 {
598 	u_int cidx = q->cidx;
599 
600 	while (q->credits--) {
601 		struct rx_sw_desc *d = &q->sdesc[cidx];
602 
603 		if (d->flags & RX_SW_DESC_INUSE) {
604 			bus_dmamap_unload(q->entry_tag, d->map);
605 			bus_dmamap_destroy(q->entry_tag, d->map);
606 			uma_zfree(q->zone, d->rxsd_cl);
607 		}
608 		d->rxsd_cl = NULL;
609 		if (++cidx == q->size)
610 			cidx = 0;
611 	}
612 }
613 
614 static __inline void
615 __refill_fl(adapter_t *adap, struct sge_fl *fl)
616 {
617 	refill_fl(adap, fl, min(16U, fl->size - fl->credits));
618 }
619 
620 static __inline void
621 __refill_fl_lt(adapter_t *adap, struct sge_fl *fl, int max)
622 {
623 	if ((fl->size - fl->credits) < max)
624 		refill_fl(adap, fl, min(max, fl->size - fl->credits));
625 }
626 
627 void
628 refill_fl_service(adapter_t *adap, struct sge_fl *fl)
629 {
630 	__refill_fl_lt(adap, fl, 512);
631 }
632 
633 /**
634  *	recycle_rx_buf - recycle a receive buffer
635  *	@adapter: the adapter
636  *	@q: the SGE free list
637  *	@idx: index of buffer to recycle
638  *
639  *	Recycles the specified buffer on the given free list by adding it at
640  *	the next available slot on the list.
641  */
642 static void
643 recycle_rx_buf(adapter_t *adap, struct sge_fl *q, unsigned int idx)
644 {
645 	struct rx_desc *from = &q->desc[idx];
646 	struct rx_desc *to   = &q->desc[q->pidx];
647 
648 	q->sdesc[q->pidx] = q->sdesc[idx];
649 	to->addr_lo = from->addr_lo;        // already big endian
650 	to->addr_hi = from->addr_hi;        // likewise
651 	wmb();
652 	to->len_gen = htobe32(V_FLD_GEN1(q->gen));
653 	to->gen2 = htobe32(V_FLD_GEN2(q->gen));
654 	q->credits++;
655 
656 	if (++q->pidx == q->size) {
657 		q->pidx = 0;
658 		q->gen ^= 1;
659 	}
660 	t3_write_reg(adap, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
661 }
662 
663 static void
664 alloc_ring_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
665 {
666 	uint32_t *addr;
667 
668 	addr = arg;
669 	*addr = segs[0].ds_addr;
670 }
671 
672 static int
673 alloc_ring(adapter_t *sc, size_t nelem, size_t elem_size, size_t sw_size,
674     bus_addr_t *phys, void *desc, void *sdesc, bus_dma_tag_t *tag,
675     bus_dmamap_t *map, bus_dma_tag_t parent_entry_tag, bus_dma_tag_t *entry_tag)
676 {
677 	size_t len = nelem * elem_size;
678 	void *s = NULL;
679 	void *p = NULL;
680 	int err;
681 
682 	if ((err = bus_dma_tag_create(sc->parent_dmat, PAGE_SIZE, 0,
683 				      BUS_SPACE_MAXADDR_32BIT,
684 				      BUS_SPACE_MAXADDR, NULL, NULL, len, 1,
685 				      len, 0, NULL, NULL, tag)) != 0) {
686 		device_printf(sc->dev, "Cannot allocate descriptor tag\n");
687 		return (ENOMEM);
688 	}
689 
690 	if ((err = bus_dmamem_alloc(*tag, (void **)&p, BUS_DMA_NOWAIT,
691 				    map)) != 0) {
692 		device_printf(sc->dev, "Cannot allocate descriptor memory\n");
693 		return (ENOMEM);
694 	}
695 
696 	bus_dmamap_load(*tag, *map, p, len, alloc_ring_cb, phys, 0);
697 	bzero(p, len);
698 	*(void **)desc = p;
699 
700 	if (sw_size) {
701 		len = nelem * sw_size;
702 		s = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO);
703 		*(void **)sdesc = s;
704 	}
705 	if (parent_entry_tag == NULL)
706 		return (0);
707 
708 	if ((err = bus_dma_tag_create(parent_entry_tag, 1, 0,
709 				      BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR,
710 		                      NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
711 				      TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
712 		                      NULL, NULL, entry_tag)) != 0) {
713 		device_printf(sc->dev, "Cannot allocate descriptor entry tag\n");
714 		return (ENOMEM);
715 	}
716 	return (0);
717 }
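
/*
 * Usage sketch (added for illustration; the field names are assumptions and
 * the real callers live in the queue-set allocation code, which is not part
 * of this excerpt):
 *
 *	err = alloc_ring(sc, p->fl_size, sizeof(struct rx_desc),
 *	    sizeof(struct rx_sw_desc), &q->fl[0].phys_addr, &q->fl[0].desc,
 *	    &q->fl[0].sdesc, &q->fl[0].desc_tag, &q->fl[0].desc_map,
 *	    sc->rx_dmat, &q->fl[0].entry_tag);
 *
 * A single call allocates the DMA-able descriptor ring, the parallel software
 * descriptor array and, when a parent tag is supplied, a per-entry tag used
 * later to map individual buffers.
 */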
718 
719 static void
720 sge_slow_intr_handler(void *arg, int ncount)
721 {
722 	adapter_t *sc = arg;
723 
724 	t3_slow_intr_handler(sc);
725 }
726 
727 /**
728  *	sge_timer_cb - perform periodic maintenance of the SGE queue sets
729  *	@arg: the adapter to maintain
730  *
731  *	Runs periodically from a timer to perform maintenance of an SGE queue
732  *	set.  It performs two tasks:
733  *
734  *	a) Cleans up any completed Tx descriptors that may still be pending.
735  *	Normal descriptor cleanup happens when new packets are added to a Tx
736  *	queue so this timer is relatively infrequent and does any cleanup only
737  *	if the Tx queue has not seen any new packets in a while.  We make a
738  *	best effort attempt to reclaim descriptors, in that we don't wait
739  *	around if we cannot get a queue's lock (which most likely is because
740  *	someone else is queueing new packets and so will also handle the clean
741  *	up).  Since control queues use immediate data exclusively we don't
742  *	bother cleaning them up here.
743  *
744  *	b) Replenishes Rx queues that have run out due to memory shortage.
745  *	Normally new Rx buffers are added when existing ones are consumed but
746  *	when out of memory a queue can become empty.  We try to add only a few
747  *	buffers here, the queue will be replenished fully as these new buffers
748  *	are used up if memory shortage has subsided.
749  *
750  *	c) Return coalesced response queue credits in case a response queue is
751  *	starved.
752  *
753  *	d) Ring doorbells for T304 tunnel queues since we have seen doorbell
754  *	fifo overflows and the FW doesn't implement any recovery scheme yet.
755  */
756 static void
757 sge_timer_cb(void *arg)
758 {
759 	adapter_t *sc = arg;
760 #ifndef IFNET_MULTIQUEUE
761 	struct port_info *pi;
762 	struct sge_qset *qs;
763 	struct sge_txq  *txq;
764 	int i, j;
765 	int reclaim_ofl, refill_rx;
766 
767 	for (i = 0; i < sc->params.nports; i++)
768 		for (j = 0; j < sc->port[i].nqsets; j++) {
769 			qs = &sc->sge.qs[i + j];
770 			txq = &qs->txq[0];
771 			reclaim_ofl = txq[TXQ_OFLD].processed - txq[TXQ_OFLD].cleaned;
772 			refill_rx = ((qs->fl[0].credits < qs->fl[0].size) ||
773 			    (qs->fl[1].credits < qs->fl[1].size));
774 			if (reclaim_ofl || refill_rx) {
775 				pi = &sc->port[i];
776 				taskqueue_enqueue(pi->tq, &pi->timer_reclaim_task);
777 				break;
778 			}
779 		}
780 #endif
781 	if (sc->params.nports > 2) {
782 		int i;
783 
784 		for_each_port(sc, i) {
785 			struct port_info *pi = &sc->port[i];
786 
787 			t3_write_reg(sc, A_SG_KDOORBELL,
788 				     F_SELEGRCNTX |
789 				     (FW_TUNNEL_SGEEC_START + pi->first_qset));
790 		}
791 	}
792 	if (sc->open_device_map != 0)
793 		callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
794 }
795 
796 /*
797  * This is meant to be a catch-all function to keep sge state private
798  * to sge.c
799  *
800  */
801 int
802 t3_sge_init_adapter(adapter_t *sc)
803 {
804 	callout_init(&sc->sge_timer_ch, CALLOUT_MPSAFE);
805 	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
806 	TASK_INIT(&sc->slow_intr_task, 0, sge_slow_intr_handler, sc);
807 	mi_init();
808 	cxgb_cache_init();
809 	return (0);
810 }
811 
812 int
813 t3_sge_reset_adapter(adapter_t *sc)
814 {
815 	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
816 	return (0);
817 }
818 
819 int
820 t3_sge_init_port(struct port_info *pi)
821 {
822 	TASK_INIT(&pi->timer_reclaim_task, 0, sge_timer_reclaim, pi);
823 	return (0);
824 }
825 
826 void
827 t3_sge_deinit_sw(adapter_t *sc)
828 {
829 
830 	mi_deinit();
831 }
832 
833 /**
834  *	refill_rspq - replenish an SGE response queue
835  *	@adapter: the adapter
836  *	@q: the response queue to replenish
837  *	@credits: how many new responses to make available
838  *
839  *	Replenishes a response queue by making the supplied number of responses
840  *	available to HW.
841  */
842 static __inline void
843 refill_rspq(adapter_t *sc, const struct sge_rspq *q, u_int credits)
844 {
845 
846 	/* mbufs are allocated on demand when a rspq entry is processed. */
847 	t3_write_reg(sc, A_SG_RSPQ_CREDIT_RETURN,
848 		     V_RSPQ(q->cntxt_id) | V_CREDITS(credits));
849 }
850 
851 static __inline void
852 sge_txq_reclaim_(struct sge_txq *txq, int force)
853 {
854 
855 	if (desc_reclaimable(txq) < 16)
856 		return;
857 	if (mtx_trylock(&txq->lock) == 0)
858 		return;
859 	reclaim_completed_tx_(txq, 16);
860 	mtx_unlock(&txq->lock);
861 
862 }
863 
864 static void
865 sge_txq_reclaim_handler(void *arg, int ncount)
866 {
867 	struct sge_txq *q = arg;
868 
869 	sge_txq_reclaim_(q, TRUE);
870 }
871 
872 
873 
874 static void
875 sge_timer_reclaim(void *arg, int ncount)
876 {
877 	struct port_info *pi = arg;
878 	int i, nqsets = pi->nqsets;
879 	adapter_t *sc = pi->adapter;
880 	struct sge_qset *qs;
881 	struct sge_txq *txq;
882 	struct mtx *lock;
883 
884 #ifdef IFNET_MULTIQUEUE
885 	panic("%s should not be called with multiqueue support\n", __FUNCTION__);
886 #endif
887 	for (i = 0; i < nqsets; i++) {
888 		qs = &sc->sge.qs[i];
889 
890 		txq = &qs->txq[TXQ_OFLD];
891 		sge_txq_reclaim_(txq, FALSE);
892 
893 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
894 			    &sc->sge.qs[0].rspq.lock;
895 
896 		if (mtx_trylock(lock)) {
897 			/* XXX currently assume that we are *NOT* polling */
898 			uint32_t status = t3_read_reg(sc, A_SG_RSPQ_FL_STATUS);
899 
900 			if (qs->fl[0].credits < qs->fl[0].size - 16)
901 				__refill_fl(sc, &qs->fl[0]);
902 			if (qs->fl[1].credits < qs->fl[1].size - 16)
903 				__refill_fl(sc, &qs->fl[1]);
904 
905 			if (status & (1 << qs->rspq.cntxt_id)) {
906 				if (qs->rspq.credits) {
907 					refill_rspq(sc, &qs->rspq, 1);
908 					qs->rspq.credits--;
909 					t3_write_reg(sc, A_SG_RSPQ_FL_STATUS,
910 					    1 << qs->rspq.cntxt_id);
911 				}
912 			}
913 			mtx_unlock(lock);
914 		}
915 	}
916 }
917 
918 /**
919  *	init_qset_cntxt - initialize an SGE queue set context info
920  *	@qs: the queue set
921  *	@id: the queue set id
922  *
923  *	Initializes the TIDs and context ids for the queues of a queue set.
924  */
925 static void
926 init_qset_cntxt(struct sge_qset *qs, u_int id)
927 {
928 
929 	qs->rspq.cntxt_id = id;
930 	qs->fl[0].cntxt_id = 2 * id;
931 	qs->fl[1].cntxt_id = 2 * id + 1;
932 	qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id;
933 	qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id;
934 	qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id;
935 	qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id;
936 	qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id;
937 
938 	mbufq_init(&qs->txq[TXQ_ETH].sendq);
939 	mbufq_init(&qs->txq[TXQ_OFLD].sendq);
940 	mbufq_init(&qs->txq[TXQ_CTRL].sendq);
941 }
942 
943 
944 static void
945 txq_prod(struct sge_txq *txq, unsigned int ndesc, struct txq_state *txqs)
946 {
947 	txq->in_use += ndesc;
948 	/*
949 	 * XXX we don't handle stopping of queue
950 	 * presumably start handles this when we bump against the end
951 	 */
952 	txqs->gen = txq->gen;
953 	txq->unacked += ndesc;
954 	txqs->compl = (txq->unacked & 32) << (S_WR_COMPL - 5);
955 	txq->unacked &= 31;
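	/*
	 * Added note: the two lines above request a work-request completion
	 * (credit return) roughly once every 32 descriptors; bit 5 of the
	 * unacked count is shifted into the WR_COMPL position and then
	 * cleared, so the SGE is not asked to report every single WR.
	 */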
956 	txqs->pidx = txq->pidx;
957 	txq->pidx += ndesc;
958 #ifdef INVARIANTS
959 	if (((txqs->pidx > txq->cidx) &&
960 		(txq->pidx < txqs->pidx) &&
961 		(txq->pidx >= txq->cidx)) ||
962 	    ((txqs->pidx < txq->cidx) &&
963 		(txq->pidx >= txq-> cidx)) ||
964 	    ((txqs->pidx < txq->cidx) &&
965 		(txq->cidx < txqs->pidx)))
966 		panic("txqs->pidx=%d txq->pidx=%d txq->cidx=%d",
967 		    txqs->pidx, txq->pidx, txq->cidx);
968 #endif
969 	if (txq->pidx >= txq->size) {
970 		txq->pidx -= txq->size;
971 		txq->gen ^= 1;
972 	}
973 
974 }
975 
976 /**
977  *	calc_tx_descs - calculate the number of Tx descriptors for a packet
978  *	@m: the packet mbufs
979  *      @nsegs: the number of segments
980  *
981  * 	Returns the number of Tx descriptors needed for the given Ethernet
982  * 	packet.  Ethernet packets require addition of WR and CPL headers.
983  */
984 static __inline unsigned int
985 calc_tx_descs(const struct mbuf *m, int nsegs)
986 {
987 	unsigned int flits;
988 
989 	if (m->m_pkthdr.len <= WR_LEN - sizeof(struct cpl_tx_pkt))
990 		return 1;
991 
992 	flits = sgl_len(nsegs) + 2;
993 #ifdef TSO_SUPPORTED
994 	if (m->m_pkthdr.csum_flags & CSUM_TSO)
995 		flits++;
996 #endif
997 	return flits_to_desc(flits);
998 }
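
/*
 * Worked example (added for illustration): a packet mapped into 4 DMA
 * segments that does not fit as immediate data needs sgl_len(4) + 2 = 8
 * flits (SGL plus the WR and CPL_TX_PKT headers), which flits_to_desc()
 * maps to a single Tx descriptor; a TSO packet needs one additional flit
 * for the LSO header.
 */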
999 
1000 static unsigned int
1001 busdma_map_mbufs(struct mbuf **m, struct sge_txq *txq,
1002     struct tx_sw_desc *txsd, bus_dma_segment_t *segs, int *nsegs)
1003 {
1004 	struct mbuf *m0;
1005 	int err, pktlen, pass = 0;
1006 
1007 retry:
1008 	err = 0;
1009 	m0 = *m;
1010 	pktlen = m0->m_pkthdr.len;
1011 #if defined(__i386__) || defined(__amd64__)
1012 	if (busdma_map_sg_collapse(m, segs, nsegs) == 0) {
1013 		goto done;
1014 	} else
1015 #endif
1016 		err = bus_dmamap_load_mbuf_sg(txq->entry_tag, txsd->map, m0, segs, nsegs, 0);
1017 
1018 	if (err == 0) {
1019 		goto done;
1020 	}
1021 	if (err == EFBIG && pass == 0) {
1022 		pass = 1;
1023 		/* Too many segments, try to defrag */
1024 		m0 = m_defrag(m0, M_DONTWAIT);
1025 		if (m0 == NULL) {
1026 			m_freem(*m);
1027 			*m = NULL;
1028 			return (ENOBUFS);
1029 		}
1030 		*m = m0;
1031 		goto retry;
1032 	} else if (err == ENOMEM) {
1033 		return (err);
1034 	} else if (err) {
1035 		if (cxgb_debug)
1036 			printf("map failure err=%d pktlen=%d\n", err, pktlen);
1037 		m_freem(m0);
1038 		*m = NULL;
1039 		return (err);
1040 	}
1041 done:
1042 #if !defined(__i386__) && !defined(__amd64__)
1043 	bus_dmamap_sync(txq->entry_tag, txsd->map, BUS_DMASYNC_PREWRITE);
1044 #endif
1045 	txsd->flags |= TX_SW_DESC_MAPPED;
1046 
1047 	return (0);
1048 }
1049 
1050 /**
1051  *	make_sgl - populate a scatter/gather list for a packet
1052  *	@sgp: the SGL to populate
1053  *	@segs: the packet dma segments
1054  *	@nsegs: the number of segments
1055  *
1056  *	Generates a scatter/gather list for the buffers that make up a packet
1057  *	and returns the SGL size in 8-byte words.  The caller must size the SGL
1058  *	appropriately.
1059  */
1060 static __inline void
1061 make_sgl(struct sg_ent *sgp, bus_dma_segment_t *segs, int nsegs)
1062 {
1063 	int i, idx;
1064 
1065 	for (idx = 0, i = 0; i < nsegs; i++) {
1066 		/*
1067 		 * firmware doesn't like empty segments
1068 		 */
1069 		if (segs[i].ds_len == 0)
1070 			continue;
1071 		if (i && idx == 0)
1072 			++sgp;
1073 
1074 		sgp->len[idx] = htobe32(segs[i].ds_len);
1075 		sgp->addr[idx] = htobe64(segs[i].ds_addr);
1076 		idx ^= 1;
1077 	}
1078 
1079 	if (idx) {
1080 		sgp->len[idx] = 0;
1081 		sgp->addr[idx] = 0;
1082 	}
1083 }
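
/*
 * Added note: segments are packed two per struct sg_ent by toggling idx
 * between 0 and 1, empty segments are skipped, and when the number of used
 * segments is odd the unused second slot of the last entry is zeroed so the
 * hardware sees a cleanly terminated list.
 */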
1084 
1085 /**
1086  *	check_ring_tx_db - check and potentially ring a Tx queue's doorbell
1087  *	@adap: the adapter
1088  *	@q: the Tx queue
1089  *
1090  *	Ring the doorbell if a Tx queue is asleep.  There is a natural race
1091  *	where the HW may go to sleep just after we checked; in that case
1092  *	the interrupt handler will detect the outstanding Tx packet
1093  *	and ring the doorbell for us.
1094  *
1095  *	When GTS is disabled we unconditionally ring the doorbell.
1096  */
1097 static __inline void
1098 check_ring_tx_db(adapter_t *adap, struct sge_txq *q)
1099 {
1100 #if USE_GTS
1101 	clear_bit(TXQ_LAST_PKT_DB, &q->flags);
1102 	if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) {
1103 		set_bit(TXQ_LAST_PKT_DB, &q->flags);
1104 #ifdef T3_TRACE
1105 		T3_TRACE1(adap->tb[q->cntxt_id & 7], "doorbell Tx, cntxt %d",
1106 			  q->cntxt_id);
1107 #endif
1108 		t3_write_reg(adap, A_SG_KDOORBELL,
1109 			     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1110 	}
1111 #else
1112 	wmb();            /* write descriptors before telling HW */
1113 	t3_write_reg(adap, A_SG_KDOORBELL,
1114 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1115 #endif
1116 }
1117 
1118 static __inline void
1119 wr_gen2(struct tx_desc *d, unsigned int gen)
1120 {
1121 #if SGE_NUM_GENBITS == 2
1122 	d->flit[TX_DESC_FLITS - 1] = htobe64(gen);
1123 #endif
1124 }
1125 
1126 /**
1127  *	write_wr_hdr_sgl - write a WR header and, optionally, SGL
1128  *	@ndesc: number of Tx descriptors spanned by the SGL
1129  *	@txd: first Tx descriptor to be written
1130  *	@txqs: txq state (generation and producer index)
1131  *	@txq: the SGE Tx queue
1132  *	@sgl: the SGL
1133  *	@flits: number of flits to the start of the SGL in the first descriptor
1134  *	@sgl_flits: the SGL size in flits
1135  *	@wr_hi: top 32 bits of WR header based on WR type (big endian)
1136  *	@wr_lo: low 32 bits of WR header based on WR type (big endian)
1137  *
1138  *	Write a work request header and an associated SGL.  If the SGL is
1139  *	small enough to fit into one Tx descriptor it has already been written
1140  *	and we just need to write the WR header.  Otherwise we distribute the
1141  *	SGL across the number of descriptors it spans.
1142  */
1143 static void
1144 write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs,
1145     const struct sge_txq *txq, const struct sg_ent *sgl, unsigned int flits,
1146     unsigned int sgl_flits, unsigned int wr_hi, unsigned int wr_lo)
1147 {
1148 
1149 	struct work_request_hdr *wrp = (struct work_request_hdr *)txd;
1150 	struct tx_sw_desc *txsd = &txq->sdesc[txqs->pidx];
1151 
1152 	if (__predict_true(ndesc == 1)) {
1153 		wrp->wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1154 		    V_WR_SGLSFLT(flits)) | wr_hi;
1155 		wmb();
1156 		wrp->wr_lo = htonl(V_WR_LEN(flits + sgl_flits) |
1157 		    V_WR_GEN(txqs->gen)) | wr_lo;
1158 		/* XXX gen? */
1159 		wr_gen2(txd, txqs->gen);
1160 
1161 	} else {
1162 		unsigned int ogen = txqs->gen;
1163 		const uint64_t *fp = (const uint64_t *)sgl;
1164 		struct work_request_hdr *wp = wrp;
1165 
1166 		wrp->wr_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) |
1167 		    V_WR_SGLSFLT(flits)) | wr_hi;
1168 
1169 		while (sgl_flits) {
1170 			unsigned int avail = WR_FLITS - flits;
1171 
1172 			if (avail > sgl_flits)
1173 				avail = sgl_flits;
1174 			memcpy(&txd->flit[flits], fp, avail * sizeof(*fp));
1175 			sgl_flits -= avail;
1176 			ndesc--;
1177 			if (!sgl_flits)
1178 				break;
1179 
1180 			fp += avail;
1181 			txd++;
1182 			txsd++;
1183 			if (++txqs->pidx == txq->size) {
1184 				txqs->pidx = 0;
1185 				txqs->gen ^= 1;
1186 				txd = txq->desc;
1187 				txsd = txq->sdesc;
1188 			}
1189 
1190 			/*
1191 			 * when the head of the mbuf chain
1192 			 * is freed all clusters will be freed
1193 			 * with it
1194 			 */
1195 			KASSERT(txsd->mi.mi_base == NULL,
1196 			    ("overwriting valid entry mi_base==%p", txsd->mi.mi_base));
1197 			wrp = (struct work_request_hdr *)txd;
1198 			wrp->wr_hi = htonl(V_WR_DATATYPE(1) |
1199 			    V_WR_SGLSFLT(1)) | wr_hi;
1200 			wrp->wr_lo = htonl(V_WR_LEN(min(WR_FLITS,
1201 				    sgl_flits + 1)) |
1202 			    V_WR_GEN(txqs->gen)) | wr_lo;
1203 			wr_gen2(txd, txqs->gen);
1204 			flits = 1;
1205 		}
1206 		wrp->wr_hi |= htonl(F_WR_EOP);
1207 		wmb();
1208 		wp->wr_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo;
1209 		wr_gen2((struct tx_desc *)wp, ogen);
1210 	}
1211 }
1212 
1213 /* sizeof(*eh) + sizeof(*vhdr) + sizeof(*ip) + sizeof(*tcp) */
1214 #define TCPPKTHDRSIZE (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + 20 + 20)
1215 
1216 #ifdef VLAN_SUPPORTED
1217 #define GET_VTAG(cntrl, m) \
1218 do { \
1219 	if ((m)->m_flags & M_VLANTAG)					            \
1220 		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN((m)->m_pkthdr.ether_vtag); \
1221 } while (0)
1222 
1223 #define GET_VTAG_MI(cntrl, mi) \
1224 do { \
1225 	if ((mi)->mi_flags & M_VLANTAG)					\
1226 		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN((mi)->mi_ether_vtag); \
1227 } while (0)
1228 #else
1229 #define GET_VTAG(cntrl, m)
1230 #define GET_VTAG_MI(cntrl, m)
1231 #endif
1232 
1233 int
1234 t3_encap(struct sge_qset *qs, struct mbuf **m, int count)
1235 {
1236 	adapter_t *sc;
1237 	struct mbuf *m0;
1238 	struct sge_txq *txq;
1239 	struct txq_state txqs;
1240 	struct port_info *pi;
1241 	unsigned int ndesc, flits, cntrl, mlen;
1242 	int err, nsegs, tso_info = 0;
1243 
1244 	struct work_request_hdr *wrp;
1245 	struct tx_sw_desc *txsd;
1246 	struct sg_ent *sgp, *sgl;
1247 	uint32_t wr_hi, wr_lo, sgl_flits;
1248 	bus_dma_segment_t segs[TX_MAX_SEGS];
1249 
1250 	struct tx_desc *txd;
1251 	struct mbuf_vec *mv;
1252 	struct mbuf_iovec *mi;
1253 
1254 	DPRINTF("t3_encap cpu=%d ", curcpu);
1255 
1256 	mi = NULL;
1257 	pi = qs->port;
1258 	sc = pi->adapter;
1259 	txq = &qs->txq[TXQ_ETH];
1260 	txd = &txq->desc[txq->pidx];
1261 	txsd = &txq->sdesc[txq->pidx];
1262 	sgl = txq->txq_sgl;
1263 	m0 = *m;
1264 
1265 	DPRINTF("t3_encap port_id=%d qsidx=%d ", pi->port_id, pi->first_qset);
1266 	DPRINTF("mlen=%d txpkt_intf=%d tx_chan=%d\n", m[0]->m_pkthdr.len, pi->txpkt_intf, pi->tx_chan);
1267 	if (cxgb_debug)
1268 		printf("mi_base=%p cidx=%d pidx=%d\n\n", txsd->mi.mi_base, txq->cidx, txq->pidx);
1269 
1270 	mtx_assert(&txq->lock, MA_OWNED);
1271 	cntrl = V_TXPKT_INTF(pi->txpkt_intf);
1272 /*
1273  * XXX need to add VLAN support for 6.x
1274  */
1275 #ifdef VLAN_SUPPORTED
1276 	if  (m0->m_pkthdr.csum_flags & (CSUM_TSO))
1277 		tso_info = V_LSO_MSS(m0->m_pkthdr.tso_segsz);
1278 #endif
1279 	KASSERT(txsd->mi.mi_base == NULL,
1280 	    ("overwriting valid entry mi_base==%p", txsd->mi.mi_base));
1281 	if (count > 1) {
1282 		panic("count > 1 not support in CVS\n");
1283 		if ((err = busdma_map_sg_vec(m, &m0, segs, count)))
1284 			return (err);
1285 		nsegs = count;
1286 	} else if ((err = busdma_map_sg_collapse(&m0, segs, &nsegs))) {
1287 		if (cxgb_debug)
1288 			printf("failed ... err=%d\n", err);
1289 		return (err);
1290 	}
1291 	KASSERT(m0->m_pkthdr.len, ("empty packet nsegs=%d count=%d", nsegs, count));
1292 
1293 	if (!(m0->m_pkthdr.len <= PIO_LEN)) {
1294 		mi_collapse_mbuf(&txsd->mi, m0);
1295 		mi = &txsd->mi;
1296 	}
1297 	if (count > 1) {
1298 		struct cpl_tx_pkt_batch *cpl_batch = (struct cpl_tx_pkt_batch *)txd;
1299 		int i, fidx;
1300 		struct mbuf_iovec *batchmi;
1301 
1302 		mv = mtomv(m0);
1303 		batchmi = mv->mv_vec;
1304 
1305 		wrp = (struct work_request_hdr *)txd;
1306 
1307 		flits = count*2 + 1;
1308 		txq_prod(txq, 1, &txqs);
1309 
1310 		for (fidx = 1, i = 0; i < count; i++, batchmi++, fidx += 2) {
1311 			struct cpl_tx_pkt_batch_entry *cbe = &cpl_batch->pkt_entry[i];
1312 
1313 			cntrl = V_TXPKT_INTF(pi->txpkt_intf);
1314 			GET_VTAG_MI(cntrl, batchmi);
1315 			cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1316 			if (__predict_false(!(m0->m_pkthdr.csum_flags & CSUM_IP)))
1317 				cntrl |= F_TXPKT_IPCSUM_DIS;
1318 			if (__predict_false(!(m0->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP))))
1319 				cntrl |= F_TXPKT_L4CSUM_DIS;
1320 			cbe->cntrl = htonl(cntrl);
1321 			cbe->len = htonl(batchmi->mi_len | 0x80000000);
1322 			cbe->addr = htobe64(segs[i].ds_addr);
1323 			txd->flit[fidx] |= htobe64(1 << 24);
1324 		}
1325 
1326 		wrp->wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1327 		    V_WR_SGLSFLT(flits)) | htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1328 		wmb();
1329 		wrp->wr_lo = htonl(V_WR_LEN(flits) |
1330 		    V_WR_GEN(txqs.gen)) | htonl(V_WR_TID(txq->token));
1331 		/* XXX gen? */
1332 		wr_gen2(txd, txqs.gen);
1333 		check_ring_tx_db(sc, txq);
1334 
1335 		return (0);
1336 	} else if (tso_info) {
1337 		int min_size = TCPPKTHDRSIZE, eth_type, tagged;
1338 		struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *)txd;
1339 		struct ip *ip;
1340 		struct tcphdr *tcp;
1341 		char *pkthdr;
1342 
1343 		txd->flit[2] = 0;
1344 		GET_VTAG(cntrl, m0);
1345 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO);
1346 		hdr->cntrl = htonl(cntrl);
1347 		mlen = m0->m_pkthdr.len;
1348 		hdr->len = htonl(mlen | 0x80000000);
1349 
1350 		DPRINTF("tso buf len=%d\n", mlen);
1351 
1352 		tagged = m0->m_flags & M_VLANTAG;
1353 		if (!tagged)
1354 			min_size -= ETHER_VLAN_ENCAP_LEN;
1355 
1356 		if (__predict_false(mlen < min_size)) {
1357 			printf("mbuf=%p,len=%d,tso_segsz=%d,csum_flags=%#x,flags=%#x",
1358 			    m0, mlen, m0->m_pkthdr.tso_segsz,
1359 			    m0->m_pkthdr.csum_flags, m0->m_flags);
1360 			panic("tx tso packet too small");
1361 		}
1362 
1363 		/* Make sure that ether, ip, tcp headers are all in m0 */
1364 		if (__predict_false(m0->m_len < min_size)) {
1365 			m0 = m_pullup(m0, min_size);
1366 			if (__predict_false(m0 == NULL)) {
1367 				/* XXX panic probably an overreaction */
1368 				panic("couldn't fit header into mbuf");
1369 			}
1370 		}
1371 		pkthdr = m0->m_data;
1372 
1373 		if (tagged) {
1374 			eth_type = CPL_ETH_II_VLAN;
1375 			ip = (struct ip *)(pkthdr + ETHER_HDR_LEN +
1376 			    ETHER_VLAN_ENCAP_LEN);
1377 		} else {
1378 			eth_type = CPL_ETH_II;
1379 			ip = (struct ip *)(pkthdr + ETHER_HDR_LEN);
1380 		}
1381 		tcp = (struct tcphdr *)((uint8_t *)ip +
1382 		    sizeof(*ip));
1383 
1384 		tso_info |= V_LSO_ETH_TYPE(eth_type) |
1385 			    V_LSO_IPHDR_WORDS(ip->ip_hl) |
1386 			    V_LSO_TCPHDR_WORDS(tcp->th_off);
1387 		hdr->lso_info = htonl(tso_info);
1388 
1389 		if (__predict_false(mlen <= PIO_LEN)) {
1390 			/* pkt not undersized but fits in PIO_LEN */
1391 			printf("**5592 Fix** mbuf=%p,len=%d,tso_segsz=%d,csum_flags=%#x,flags=%#x",
1392 			    m0, mlen, m0->m_pkthdr.tso_segsz, m0->m_pkthdr.csum_flags, m0->m_flags);
1393 			txq_prod(txq, 1, &txqs);
1394 			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[3]);
1395 			m_freem(m0);
1396 			m0 = NULL;
1397 			flits = (mlen + 7) / 8 + 3;
1398 			hdr->wr.wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1399 					  V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1400 					  F_WR_SOP | F_WR_EOP | txqs.compl);
1401 			wmb();
1402 			hdr->wr.wr_lo = htonl(V_WR_LEN(flits) |
1403 			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1404 
1405 			wr_gen2(txd, txqs.gen);
1406 			check_ring_tx_db(sc, txq);
1407 			return (0);
1408 		}
1409 		flits = 3;
1410 	} else {
1411 		struct cpl_tx_pkt *cpl = (struct cpl_tx_pkt *)txd;
1412 
1413 		GET_VTAG(cntrl, m0);
1414 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1415 		if (__predict_false(!(m0->m_pkthdr.csum_flags & CSUM_IP)))
1416 			cntrl |= F_TXPKT_IPCSUM_DIS;
1417 		if (__predict_false(!(m0->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP))))
1418 			cntrl |= F_TXPKT_L4CSUM_DIS;
1419 		cpl->cntrl = htonl(cntrl);
1420 		mlen = m0->m_pkthdr.len;
1421 		cpl->len = htonl(mlen | 0x80000000);
1422 
1423 		if (mlen <= PIO_LEN) {
1424 			txq_prod(txq, 1, &txqs);
1425 			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[2]);
1426 			m_freem(m0);
1427 			m0 = NULL;
1428 			flits = (mlen + 7) / 8 + 2;
1429 			cpl->wr.wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1430 					  V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1431 					  F_WR_SOP | F_WR_EOP | txqs.compl);
1432 			wmb();
1433 			cpl->wr.wr_lo = htonl(V_WR_LEN(flits) |
1434 			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1435 
1436 			wr_gen2(txd, txqs.gen);
1437 			check_ring_tx_db(sc, txq);
1438 			DPRINTF("pio buf\n");
1439 			return (0);
1440 		}
1441 		DPRINTF("regular buf\n");
1442 		flits = 2;
1443 	}
1444 	wrp = (struct work_request_hdr *)txd;
1445 
1446 #ifdef	nomore
1447 	/*
1448 	 * XXX need to move into one of the helper routines above
1449 	 *
1450 	 */
1451 	if ((err = busdma_map_mbufs(m, txq, txsd, segs, &nsegs)) != 0)
1452 		return (err);
1453 	m0 = *m;
1454 #endif
1455 	ndesc = calc_tx_descs(m0, nsegs);
1456 
1457 	sgp = (ndesc == 1) ? (struct sg_ent *)&txd->flit[flits] : sgl;
1458 	make_sgl(sgp, segs, nsegs);
1459 
1460 	sgl_flits = sgl_len(nsegs);
1461 
1462 	DPRINTF("make_sgl success nsegs==%d ndesc==%d\n", nsegs, ndesc);
1463 	txq_prod(txq, ndesc, &txqs);
1464 	wr_hi = htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1465 	wr_lo = htonl(V_WR_TID(txq->token));
1466 	write_wr_hdr_sgl(ndesc, txd, &txqs, txq, sgl, flits, sgl_flits, wr_hi, wr_lo);
1467 	check_ring_tx_db(pi->adapter, txq);
1468 
1469 	if ((m0->m_type == MT_DATA) &&
1470 	    ((m0->m_flags & (M_EXT|M_NOFREE)) == M_EXT) &&
1471 	    (m0->m_ext.ext_type != EXT_PACKET)) {
1472 		m0->m_flags &= ~M_EXT ;
1473 		cxgb_mbufs_outstanding--;
1474 		m_free(m0);
1475 	}
1476 
1477 	return (0);
1478 }
1479 
1480 
1481 /**
1482  *	write_imm - write a packet into a Tx descriptor as immediate data
1483  *	@d: the Tx descriptor to write
1484  *	@m: the packet
1485  *	@len: the length of packet data to write as immediate data
1486  *	@gen: the generation bit value to write
1487  *
1488  *	Writes a packet as immediate data into a Tx descriptor.  The packet
1489  *	contains a work request at its beginning.  We must write the packet
1490  *	carefully so the SGE doesn't read accidentally before it's written in
1491  *	its entirety.
1492  */
1493 static __inline void
1494 write_imm(struct tx_desc *d, struct mbuf *m,
1495 	  unsigned int len, unsigned int gen)
1496 {
1497 	struct work_request_hdr *from = mtod(m, struct work_request_hdr *);
1498 	struct work_request_hdr *to = (struct work_request_hdr *)d;
1499 
1500 	if (len > WR_LEN)
1501 		panic("len too big %d\n", len);
1502 	if (len < sizeof(*from))
1503 		panic("len too small %d", len);
1504 
1505 	memcpy(&to[1], &from[1], len - sizeof(*from));
1506 	to->wr_hi = from->wr_hi | htonl(F_WR_SOP | F_WR_EOP |
1507 					V_WR_BCNTLFLT(len & 7));
1508 	wmb();
1509 	to->wr_lo = from->wr_lo | htonl(V_WR_GEN(gen) |
1510 					V_WR_LEN((len + 7) / 8));
1511 	wr_gen2(d, gen);
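	/*
	 * Added note: the WR body is copied first and wr_lo, which carries
	 * the generation bit that marks the descriptor valid, is written only
	 * after the wmb() above, so the SGE can never observe a partially
	 * written work request.
	 */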
1512 
1513 	/*
1514 	 * This check is a hack we should really fix the logic so
1515 	 * that this can't happen
1516 	 */
1517 	if (m->m_type != MT_DONTFREE)
1518 		m_freem(m);
1519 
1520 }
1521 
1522 /**
1523  *	check_desc_avail - check descriptor availability on a send queue
1524  *	@adap: the adapter
1525  *	@q: the TX queue
1526  *	@m: the packet needing the descriptors
1527  *	@ndesc: the number of Tx descriptors needed
1528  *	@qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL)
1529  *
1530  *	Checks if the requested number of Tx descriptors is available on an
1531  *	SGE send queue.  If the queue is already suspended or not enough
1532  *	descriptors are available the packet is queued for later transmission.
1533  *	Must be called with the Tx queue locked.
1534  *
1535  *	Returns 0 if enough descriptors are available, 1 if there aren't
1536  *	enough descriptors and the packet has been queued, and 2 if the caller
1537  *	needs to retry because there weren't enough descriptors at the
1538  *	beginning of the call but some freed up in the mean time.
1539  */
1540 static __inline int
1541 check_desc_avail(adapter_t *adap, struct sge_txq *q,
1542 		 struct mbuf *m, unsigned int ndesc,
1543 		 unsigned int qid)
1544 {
1545 	/*
1546 	 * XXX We currently only use this for checking the control queue
1547 	 * the control queue is only used for binding qsets which happens
1548 	 * at init time so we are guaranteed enough descriptors
1549 	 */
1550 	if (__predict_false(!mbufq_empty(&q->sendq))) {
1551 addq_exit:	mbufq_tail(&q->sendq, m);
1552 		return 1;
1553 	}
1554 	if (__predict_false(q->size - q->in_use < ndesc)) {
1555 
1556 		struct sge_qset *qs = txq_to_qset(q, qid);
1557 
1558 		printf("stopping q\n");
1559 
1560 		setbit(&qs->txq_stopped, qid);
1561 		smp_mb();
1562 
1563 		if (should_restart_tx(q) &&
1564 		    test_and_clear_bit(qid, &qs->txq_stopped))
1565 			return 2;
1566 
1567 		q->stops++;
1568 		goto addq_exit;
1569 	}
1570 	return 0;
1571 }
1572 
1573 
1574 /**
1575  *	reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
1576  *	@q: the SGE control Tx queue
1577  *
1578  *	This is a variant of reclaim_completed_tx() that is used for Tx queues
1579  *	that send only immediate data (presently just the control queues) and
1580  *	thus do not have any mbufs
1581  */
1582 static __inline void
1583 reclaim_completed_tx_imm(struct sge_txq *q)
1584 {
1585 	unsigned int reclaim = q->processed - q->cleaned;
1586 
1587 	mtx_assert(&q->lock, MA_OWNED);
1588 
1589 	q->in_use -= reclaim;
1590 	q->cleaned += reclaim;
1591 }
1592 
1593 static __inline int
1594 immediate(const struct mbuf *m)
1595 {
1596 	return (m->m_len <= WR_LEN && m->m_pkthdr.len <= WR_LEN);
1597 }
1598 
1599 /**
1600  *	ctrl_xmit - send a packet through an SGE control Tx queue
1601  *	@adap: the adapter
1602  *	@q: the control queue
1603  *	@m: the packet
1604  *
1605  *	Send a packet through an SGE control Tx queue.  Packets sent through
1606  *	a control queue must fit entirely as immediate data in a single Tx
1607  *	descriptor and have no page fragments.
1608  */
1609 static int
1610 ctrl_xmit(adapter_t *adap, struct sge_txq *q, struct mbuf *m)
1611 {
1612 	int ret;
1613 	struct work_request_hdr *wrp = mtod(m, struct work_request_hdr *);
1614 
1615 	if (__predict_false(!immediate(m))) {
1616 		m_freem(m);
1617 		return 0;
1618 	}
1619 
1620 	wrp->wr_hi |= htonl(F_WR_SOP | F_WR_EOP);
1621 	wrp->wr_lo = htonl(V_WR_TID(q->token));
1622 
1623 	mtx_lock(&q->lock);
1624 again:	reclaim_completed_tx_imm(q);
1625 
1626 	ret = check_desc_avail(adap, q, m, 1, TXQ_CTRL);
1627 	if (__predict_false(ret)) {
1628 		if (ret == 1) {
1629 			mtx_unlock(&q->lock);
1630 			log(LOG_ERR, "no desc available\n");
1631 			return (ENOSPC);
1632 		}
1633 		goto again;
1634 	}
1635 	write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
1636 
1637 	q->in_use++;
1638 	if (++q->pidx >= q->size) {
1639 		q->pidx = 0;
1640 		q->gen ^= 1;
1641 	}
1642 	mtx_unlock(&q->lock);
1643 	wmb();
1644 	t3_write_reg(adap, A_SG_KDOORBELL,
1645 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1646 	return (0);
1647 }
1648 
1649 
1650 /**
1651  *	restart_ctrlq - restart a suspended control queue
1652  *	@qs: the queue set containing the control queue
1653  *
1654  *	Resumes transmission on a suspended Tx control queue.
1655  */
1656 static void
1657 restart_ctrlq(void *data, int npending)
1658 {
1659 	struct mbuf *m;
1660 	struct sge_qset *qs = (struct sge_qset *)data;
1661 	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1662 	adapter_t *adap = qs->port->adapter;
1663 
1664 	log(LOG_WARNING, "Restart_ctrlq in_use=%d\n", q->in_use);
1665 
1666 	mtx_lock(&q->lock);
1667 again:	reclaim_completed_tx_imm(q);
1668 
1669 	while (q->in_use < q->size &&
1670 	       (m = mbufq_dequeue(&q->sendq)) != NULL) {
1671 
1672 		write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
1673 
1674 		if (++q->pidx >= q->size) {
1675 			q->pidx = 0;
1676 			q->gen ^= 1;
1677 		}
1678 		q->in_use++;
1679 	}
1680 	if (!mbufq_empty(&q->sendq)) {
1681 		setbit(&qs->txq_stopped, TXQ_CTRL);
1682 		smp_mb();
1683 
1684 		if (should_restart_tx(q) &&
1685 		    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped))
1686 			goto again;
1687 		q->stops++;
1688 	}
1689 	mtx_unlock(&q->lock);
1690 	wmb();
1691 	t3_write_reg(adap, A_SG_KDOORBELL,
1692 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1693 }
1694 
1695 
1696 /*
1697  * Send a management message through control queue 0
1698  */
1699 int
1700 t3_mgmt_tx(struct adapter *adap, struct mbuf *m)
1701 {
1702 	return ctrl_xmit(adap, &adap->sge.qs[0].txq[TXQ_CTRL], m);
1703 }
1704 
1705 
1706 /**
1707  *	free_qset - free the resources of an SGE queue set
1708  *	@sc: the controller owning the queue set
1709  *	@q: the queue set
1710  *
1711  *	Release the HW and SW resources associated with an SGE queue set, such
1712  *	as HW contexts, packet buffers, and descriptor rings.  Traffic to the
1713  *	queue set must be quiesced prior to calling this.
1714  */
1715 void
1716 t3_free_qset(adapter_t *sc, struct sge_qset *q)
1717 {
1718 	int i;
1719 
1720 	t3_free_tx_desc_all(&q->txq[TXQ_ETH]);
1721 
1722 	for (i = 0; i < SGE_TXQ_PER_SET; i++)
1723 		if (q->txq[i].txq_mr.br_ring != NULL) {
1724 			free(q->txq[i].txq_mr.br_ring, M_DEVBUF);
1725 			mtx_destroy(&q->txq[i].txq_mr.br_lock);
1726 		}
1727 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
1728 		if (q->fl[i].desc) {
1729 			mtx_lock_spin(&sc->sge.reg_lock);
1730 			t3_sge_disable_fl(sc, q->fl[i].cntxt_id);
1731 			mtx_unlock_spin(&sc->sge.reg_lock);
1732 			bus_dmamap_unload(q->fl[i].desc_tag, q->fl[i].desc_map);
1733 			bus_dmamem_free(q->fl[i].desc_tag, q->fl[i].desc,
1734 					q->fl[i].desc_map);
1735 			bus_dma_tag_destroy(q->fl[i].desc_tag);
1736 			bus_dma_tag_destroy(q->fl[i].entry_tag);
1737 		}
1738 		if (q->fl[i].sdesc) {
1739 			free_rx_bufs(sc, &q->fl[i]);
1740 			free(q->fl[i].sdesc, M_DEVBUF);
1741 		}
1742 	}
1743 
1744 	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
1745 		if (q->txq[i].desc) {
1746 			mtx_lock_spin(&sc->sge.reg_lock);
1747 			t3_sge_enable_ecntxt(sc, q->txq[i].cntxt_id, 0);
1748 			mtx_unlock_spin(&sc->sge.reg_lock);
1749 			bus_dmamap_unload(q->txq[i].desc_tag,
1750 					q->txq[i].desc_map);
1751 			bus_dmamem_free(q->txq[i].desc_tag, q->txq[i].desc,
1752 					q->txq[i].desc_map);
1753 			bus_dma_tag_destroy(q->txq[i].desc_tag);
1754 			bus_dma_tag_destroy(q->txq[i].entry_tag);
1755 			MTX_DESTROY(&q->txq[i].lock);
1756 		}
1757 		if (q->txq[i].sdesc) {
1758 			free(q->txq[i].sdesc, M_DEVBUF);
1759 		}
1760 	}
1761 
1762 	if (q->rspq.desc) {
1763 		mtx_lock_spin(&sc->sge.reg_lock);
1764 		t3_sge_disable_rspcntxt(sc, q->rspq.cntxt_id);
1765 		mtx_unlock_spin(&sc->sge.reg_lock);
1766 
1767 		bus_dmamap_unload(q->rspq.desc_tag, q->rspq.desc_map);
1768 		bus_dmamem_free(q->rspq.desc_tag, q->rspq.desc,
1769 			        q->rspq.desc_map);
1770 		bus_dma_tag_destroy(q->rspq.desc_tag);
1771 		MTX_DESTROY(&q->rspq.lock);
1772 	}
1773 
1774 	tcp_lro_free(&q->lro.ctrl);
1775 
1776 	bzero(q, sizeof(*q));
1777 }
1778 
1779 /**
1780  *	t3_free_sge_resources - free SGE resources
1781  *	@sc: the adapter softc
1782  *
1783  *	Frees resources used by the SGE queue sets.
1784  */
1785 void
1786 t3_free_sge_resources(adapter_t *sc)
1787 {
1788 	int i, nqsets;
1789 
1790 #ifdef IFNET_MULTIQUEUE
1791 	panic("%s should not be called when IFNET_MULTIQUEUE is defined", __FUNCTION__);
1792 #endif
1793 	for (nqsets = i = 0; i < (sc)->params.nports; i++)
1794 		nqsets += sc->port[i].nqsets;
1795 
1796 	for (i = 0; i < nqsets; ++i)
1797 		t3_free_qset(sc, &sc->sge.qs[i]);
1798 }
1799 
1800 /**
1801  *	t3_sge_start - enable SGE
1802  *	@sc: the controller softc
1803  *
1804  *	Enables the SGE for DMAs.  This is the last step in starting packet
1805  *	transfers.
1806  */
1807 void
1808 t3_sge_start(adapter_t *sc)
1809 {
1810 	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE);
1811 }
1812 
1813 /**
1814  *	t3_sge_stop - disable SGE operation
1815  *	@sc: the adapter
1816  *
1817  *	Disables the DMA engine.  This can be called in emergencies (e.g.,
1818  *	from error interrupts) or from normal process context.  In the latter
1819  *	case it also disables any pending queue restart tasklets.  Note that
1820  *	if it is called in interrupt context it cannot disable the restart
1821  *	tasklets as it cannot wait; however, the tasklets will have no effect
1822  *	since the doorbells are disabled and the driver will call this again
1823  *	later from process context, at which time the tasklets will be stopped
1824  *	if they are still running.
1825  */
1826 void
1827 t3_sge_stop(adapter_t *sc)
1828 {
1829 	int i, nqsets;
1830 
1831 	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, 0);
1832 
1833 	if (sc->tq == NULL)
1834 		return;
1835 
1836 	for (nqsets = i = 0; i < (sc)->params.nports; i++)
1837 		nqsets += sc->port[i].nqsets;
1838 #ifdef notyet
1839 	/*
1840 	 *
1841 	 * XXX
1842 	 */
1843 	for (i = 0; i < nqsets; ++i) {
1844 		struct sge_qset *qs = &sc->sge.qs[i];
1845 
1846 		taskqueue_drain(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
1847 		taskqueue_drain(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
1848 	}
1849 #endif
1850 }
1851 
1852 /**
1853  *	t3_free_tx_desc - reclaims Tx descriptors and their buffers
1854  *	@adapter: the adapter
1855  *	@q: the Tx queue to reclaim descriptors from
1856  *	@reclaimable: the number of descriptors to reclaim
1857  *      @m_vec_size: maximum number of buffers to reclaim
1858  *      @desc_reclaimed: returns the number of descriptors reclaimed
1859  *
1860  *	Reclaims Tx descriptors from an SGE Tx queue and frees the associated
1861  *	Tx buffers.  Called with the Tx queue lock held.
1862  *
1863  *      Returns number of buffers of reclaimed
1864  */
1865 void
1866 t3_free_tx_desc(struct sge_txq *q, int reclaimable)
1867 {
1868 	struct tx_sw_desc *txsd;
1869 	unsigned int cidx;
1870 
	cidx = q->cidx;
#ifdef T3_TRACE
	T3_TRACE2(sc->tb[q->cntxt_id & 7],
		  "reclaiming %u Tx descriptors at cidx %u", reclaimable, cidx);
#endif
	txsd = &q->sdesc[cidx];
1877 	DPRINTF("reclaiming %d WR\n", reclaimable);
1878 	mtx_assert(&q->lock, MA_OWNED);
1879 	while (reclaimable--) {
1880 		DPRINTF("cidx=%d d=%p\n", cidx, txsd);
1881 		if (txsd->mi.mi_base != NULL) {
1882 			if (txsd->flags & TX_SW_DESC_MAPPED) {
1883 				bus_dmamap_unload(q->entry_tag, txsd->map);
1884 				txsd->flags &= ~TX_SW_DESC_MAPPED;
1885 			}
1886 			m_freem_iovec(&txsd->mi);
1887 			buf_ring_scan(&q->txq_mr, txsd->mi.mi_base, __FILE__, __LINE__);
1888 			txsd->mi.mi_base = NULL;
1889 
1890 #if defined(DIAGNOSTIC) && 0
1891 			if (m_get_priority(txsd->m[0]) != cidx)
1892 				printf("pri=%d cidx=%d\n",
1893 				    (int)m_get_priority(txsd->m[0]), cidx);
1894 #endif
1895 
1896 		} else
1897 			q->txq_skipped++;
1898 
1899 		++txsd;
1900 		if (++cidx == q->size) {
1901 			cidx = 0;
1902 			txsd = q->sdesc;
1903 		}
1904 	}
1905 	q->cidx = cidx;
1906 
1907 }
1908 
1909 void
1910 t3_free_tx_desc_all(struct sge_txq *q)
1911 {
1912 	int i;
1913 	struct tx_sw_desc *txsd;
1914 
1915 	for (i = 0; i < q->size; i++) {
1916 		txsd = &q->sdesc[i];
1917 		if (txsd->mi.mi_base != NULL) {
1918 			if (txsd->flags & TX_SW_DESC_MAPPED) {
1919 				bus_dmamap_unload(q->entry_tag, txsd->map);
1920 				txsd->flags &= ~TX_SW_DESC_MAPPED;
1921 			}
1922 			m_freem_iovec(&txsd->mi);
1923 			bzero(&txsd->mi, sizeof(txsd->mi));
1924 		}
1925 	}
1926 }
1927 
1928 /**
1929  *	is_new_response - check if a response is newly written
1930  *	@r: the response descriptor
1931  *	@q: the response queue
1932  *
1933  *	Returns true if a response descriptor contains a yet unprocessed
1934  *	response.
1935  */
1936 static __inline int
1937 is_new_response(const struct rsp_desc *r,
1938     const struct sge_rspq *q)
1939 {
1940 	return (r->intr_gen & F_RSPD_GEN2) == q->gen;
1941 }
1942 
1943 #define RSPD_GTS_MASK  (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS)
1944 #define RSPD_CTRL_MASK (RSPD_GTS_MASK | \
1945 			V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \
1946 			V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \
1947 			V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR))
1948 
1949 /* How long to delay the next interrupt in case of memory shortage, in 0.1us. */
1950 #define NOMEM_INTR_DELAY 2500
1951 
1952 /**
1953  *	write_ofld_wr - write an offload work request
1954  *	@adap: the adapter
1955  *	@m: the packet to send
1956  *	@q: the Tx queue
1957  *	@pidx: index of the first Tx descriptor to write
1958  *	@gen: the generation value to use
1959  *	@ndesc: number of descriptors the packet will occupy
1960  *
1961  *	Write an offload work request to send the supplied packet.  The packet
1962  *	data already carry the work request with most fields populated.
1963  */
1964 static void
1965 write_ofld_wr(adapter_t *adap, struct mbuf *m,
1966     struct sge_txq *q, unsigned int pidx,
1967     unsigned int gen, unsigned int ndesc,
1968     bus_dma_segment_t *segs, unsigned int nsegs)
1969 {
1970 	unsigned int sgl_flits, flits;
1971 	struct work_request_hdr *from;
1972 	struct sg_ent *sgp, sgl[TX_MAX_SEGS / 2 + 1];
1973 	struct tx_desc *d = &q->desc[pidx];
1974 	struct txq_state txqs;
1975 
1976 	if (immediate(m) && nsegs == 0) {
1977 		write_imm(d, m, m->m_len, gen);
1978 		return;
1979 	}
1980 
1981 	/* Only TX_DATA builds SGLs */
1982 	from = mtod(m, struct work_request_hdr *);
1983 	memcpy(&d->flit[1], &from[1], m->m_len - sizeof(*from));
1984 
1985 	flits = m->m_len / 8;
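	/*
	 * If the WR fits in a single descriptor the SGL is built in place,
	 * right after the header flits copied above; otherwise it is built in
	 * the on-stack sgl[] buffer for write_wr_hdr_sgl() below to copy out
	 * across the remaining descriptors.
	 */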
1986 	sgp = (ndesc == 1) ? (struct sg_ent *)&d->flit[flits] : sgl;
1987 
1988 	make_sgl(sgp, segs, nsegs);
1989 	sgl_flits = sgl_len(nsegs);
1990 
1991 	txqs.gen = gen;
1992 	txqs.pidx = pidx;
1993 	txqs.compl = 0;
1994 
1995 	write_wr_hdr_sgl(ndesc, d, &txqs, q, sgl, flits, sgl_flits,
1996 	    from->wr_hi, from->wr_lo);
1997 }
1998 
1999 /**
2000  *	calc_tx_descs_ofld - calculate # of Tx descriptors for an offload packet
2001  *	@m: the packet
2002  *
2003  * 	Returns the number of Tx descriptors needed for the given offload
2004  * 	packet.  These packets are already fully constructed.
2005  */
2006 static __inline unsigned int
2007 calc_tx_descs_ofld(struct mbuf *m, unsigned int nsegs)
2008 {
2009 	unsigned int flits, cnt = 0;
2010 	int ndescs;
2011 
2012 	if (m->m_len <= WR_LEN && nsegs == 0)
2013 		return (1);                 /* packet fits as immediate data */
2014 
2015 	if (m->m_flags & M_IOVEC)
2016 		cnt = mtomv(m)->mv_count;
2017 	else
2018 		cnt = nsegs;
2019 
2020 	/* headers */
2021 	flits = m->m_len / 8;
2022 
2023 	ndescs = flits_to_desc(flits + sgl_len(cnt));
2024 
2025 	CTR4(KTR_CXGB, "flits=%d sgl_len=%d nsegs=%d ndescs=%d",
2026 	    flits, sgl_len(cnt), nsegs, ndescs);
2027 
2028 	return (ndescs);
2029 }
2030 
2031 /**
2032  *	ofld_xmit - send a packet through an offload queue
2033  *	@adap: the adapter
2034  *	@q: the Tx offload queue
2035  *	@m: the packet
2036  *
2037  *	Send an offload packet through an SGE offload queue.
2038  */
2039 static int
2040 ofld_xmit(adapter_t *adap, struct sge_txq *q, struct mbuf *m)
2041 {
2042 	int ret, nsegs;
2043 	unsigned int ndesc;
2044 	unsigned int pidx, gen;
2045 	bus_dma_segment_t segs[TX_MAX_SEGS], *vsegs;
2046 	struct tx_sw_desc *stx;
2047 
2048 	nsegs = m_get_sgllen(m);
2049 	vsegs = m_get_sgl(m);
2050 	ndesc = calc_tx_descs_ofld(m, nsegs);
2051 	busdma_map_sgl(vsegs, segs, nsegs);
2052 
2053 	stx = &q->sdesc[q->pidx];
2054 	KASSERT(stx->mi.mi_base == NULL, ("mi_base set"));
2055 
2056 	mtx_lock(&q->lock);
2057 again:	reclaim_completed_tx_(q, 16);
2058 	ret = check_desc_avail(adap, q, m, ndesc, TXQ_OFLD);
2059 	if (__predict_false(ret)) {
2060 		if (ret == 1) {
2061 			printf("no ofld desc avail\n");
2062 
2063 			m_set_priority(m, ndesc);     /* save for restart */
2064 			mtx_unlock(&q->lock);
2065 			return (EINTR);
2066 		}
2067 		goto again;
2068 	}
2069 
2070 	gen = q->gen;
2071 	q->in_use += ndesc;
2072 	pidx = q->pidx;
2073 	q->pidx += ndesc;
2074 	if (q->pidx >= q->size) {
2075 		q->pidx -= q->size;
2076 		q->gen ^= 1;
2077 	}
2078 #ifdef T3_TRACE
2079 	T3_TRACE5(adap->tb[q->cntxt_id & 7],
2080 		  "ofld_xmit: ndesc %u, pidx %u, len %u, main %u, frags %u",
2081 		  ndesc, pidx, skb->len, skb->len - skb->data_len,
2082 		  skb_shinfo(skb)->nr_frags);
2083 #endif
2084 	mtx_unlock(&q->lock);
2085 
2086 	write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
2087 	check_ring_tx_db(adap, q);
2088 	return (0);
2089 }
2090 
2091 /**
2092  *	restart_offloadq - restart a suspended offload queue
2093  *	@qs: the queue set cotaining the offload queue
2094  *
2095  *	Resumes transmission on a suspended Tx offload queue.
2096  */
2097 static void
2098 restart_offloadq(void *data, int npending)
2099 {
2100 	struct mbuf *m;
2101 	struct sge_qset *qs = data;
2102 	struct sge_txq *q = &qs->txq[TXQ_OFLD];
2103 	adapter_t *adap = qs->port->adapter;
2104 	bus_dma_segment_t segs[TX_MAX_SEGS];
2105 	struct tx_sw_desc *stx = &q->sdesc[q->pidx];
2106 	int nsegs, cleaned;
2107 
2108 	mtx_lock(&q->lock);
2109 again:	cleaned = reclaim_completed_tx_(q, 16);
2110 
2111 	while ((m = mbufq_peek(&q->sendq)) != NULL) {
2112 		unsigned int gen, pidx;
2113 		unsigned int ndesc = m_get_priority(m);
2114 
2115 		if (__predict_false(q->size - q->in_use < ndesc)) {
2116 			setbit(&qs->txq_stopped, TXQ_OFLD);
2117 			smp_mb();
2118 
2119 			if (should_restart_tx(q) &&
2120 			    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped))
2121 				goto again;
2122 			q->stops++;
2123 			break;
2124 		}
2125 
2126 		gen = q->gen;
2127 		q->in_use += ndesc;
2128 		pidx = q->pidx;
2129 		q->pidx += ndesc;
2130 		if (q->pidx >= q->size) {
2131 			q->pidx -= q->size;
2132 			q->gen ^= 1;
2133 		}
2134 
2135 		(void)mbufq_dequeue(&q->sendq);
2136 		busdma_map_mbufs(&m, q, stx, segs, &nsegs);
2137 		mtx_unlock(&q->lock);
2138 		write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
2139 		mtx_lock(&q->lock);
2140 	}
2141 	mtx_unlock(&q->lock);
2142 
2143 #if USE_GTS
2144 	set_bit(TXQ_RUNNING, &q->flags);
2145 	set_bit(TXQ_LAST_PKT_DB, &q->flags);
2146 #endif
2147 	wmb();
2148 	t3_write_reg(adap, A_SG_KDOORBELL,
2149 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
2150 }
2151 
2152 /**
2153  *	queue_set - return the queue set a packet should use
2154  *	@m: the packet
2155  *
2156  *	Maps a packet to the SGE queue set it should use.  The desired queue
2157  *	set is carried in bits 1-3 in the packet's priority.
2158  */
2159 static __inline int
2160 queue_set(const struct mbuf *m)
2161 {
2162 	return m_get_priority(m) >> 1;
2163 }
2164 
2165 /**
2166  *	is_ctrl_pkt - return whether an offload packet is a control packet
2167  *	@m: the packet
2168  *
2169  *	Determines whether an offload packet should use an OFLD or a CTRL
2170  *	Tx queue.  This is indicated by bit 0 in the packet's priority.
2171  */
2172 static __inline int
2173 is_ctrl_pkt(const struct mbuf *m)
2174 {
2175 	return m_get_priority(m) & 1;
2176 }
2177 
2178 /**
2179  *	t3_offload_tx - send an offload packet
2180  *	@tdev: the offload device to send to
2181  *	@m: the packet
2182  *
2183  *	Sends an offload packet.  We use the packet priority to select the
2184  *	appropriate Tx queue as follows: bit 0 indicates whether the packet
2185  *	should be sent as regular or control, bits 1-3 select the queue set.
2186  */
2187 int
2188 t3_offload_tx(struct t3cdev *tdev, struct mbuf *m)
2189 {
2190 	adapter_t *adap = tdev2adap(tdev);
2191 	struct sge_qset *qs = &adap->sge.qs[queue_set(m)];
2192 
2193 	if (__predict_false(is_ctrl_pkt(m)))
2194 		return ctrl_xmit(adap, &qs->txq[TXQ_CTRL], m);
2195 
2196 	return ofld_xmit(adap, &qs->txq[TXQ_OFLD], m);
2197 }
2198 
2199 /**
2200  *	deliver_partial_bundle - deliver a (partial) bundle of Rx offload pkts
2201  *	@tdev: the offload device that will be receiving the packets
2202  *	@q: the SGE response queue that assembled the bundle
2203  *	@m: the partial bundle
2204  *	@n: the number of packets in the bundle
2205  *
2206  *	Delivers a (partial) bundle of Rx offload packets to an offload device.
2207  */
2208 static __inline void
2209 deliver_partial_bundle(struct t3cdev *tdev,
2210 			struct sge_rspq *q,
2211 			struct mbuf *mbufs[], int n)
2212 {
2213 	if (n) {
2214 		q->offload_bundles++;
2215 		cxgb_ofld_recv(tdev, mbufs, n);
2216 	}
2217 }
2218 
2219 static __inline int
2220 rx_offload(struct t3cdev *tdev, struct sge_rspq *rq,
2221     struct mbuf *m, struct mbuf *rx_gather[],
2222     unsigned int gather_idx)
2223 {
2224 
2225 	rq->offload_pkts++;
2226 	m->m_pkthdr.header = mtod(m, void *);
2227 	rx_gather[gather_idx++] = m;
2228 	if (gather_idx == RX_BUNDLE_SIZE) {
2229 		cxgb_ofld_recv(tdev, rx_gather, RX_BUNDLE_SIZE);
2230 		gather_idx = 0;
2231 		rq->offload_bundles++;
2232 	}
2233 	return (gather_idx);
2234 }
2235 
2236 static void
2237 restart_tx(struct sge_qset *qs)
2238 {
2239 	struct adapter *sc = qs->port->adapter;
2240 
2241 
2242 	if (isset(&qs->txq_stopped, TXQ_OFLD) &&
2243 	    should_restart_tx(&qs->txq[TXQ_OFLD]) &&
2244 	    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) {
2245 		qs->txq[TXQ_OFLD].restarts++;
2246 		DPRINTF("restarting TXQ_OFLD\n");
2247 		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2248 	}
2249 	DPRINTF("stopped=0x%x restart=%d processed=%d cleaned=%d in_use=%d\n",
2250 	    qs->txq_stopped, should_restart_tx(&qs->txq[TXQ_CTRL]),
2251 	    qs->txq[TXQ_CTRL].processed, qs->txq[TXQ_CTRL].cleaned,
2252 	    qs->txq[TXQ_CTRL].in_use);
2253 
2254 	if (isset(&qs->txq_stopped, TXQ_CTRL) &&
2255 	    should_restart_tx(&qs->txq[TXQ_CTRL]) &&
2256 	    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) {
2257 		qs->txq[TXQ_CTRL].restarts++;
2258 		DPRINTF("restarting TXQ_CTRL\n");
2259 		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2260 	}
2261 }
2262 
2263 /**
2264  *	t3_sge_alloc_qset - initialize an SGE queue set
2265  *	@sc: the controller softc
2266  *	@id: the queue set id
2267  *	@nports: how many Ethernet ports will be using this queue set
2268  *	@irq_vec_idx: the IRQ vector index for response queue interrupts
2269  *	@p: configuration parameters for this queue set
2270  *	@ntxq: number of Tx queues for the queue set
2271  *	@pi: port info for queue set
2272  *
2273  *	Allocate resources and initialize an SGE queue set.  A queue set
2274  *	comprises a response queue, two Rx free-buffer queues, and up to 3
2275  *	Tx queues.  The Tx queues are assigned roles in the order Ethernet
2276  *	queue, offload queue, and control queue.
2277  */
2278 int
2279 t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx,
2280 		  const struct qset_params *p, int ntxq, struct port_info *pi)
2281 {
2282 	struct sge_qset *q = &sc->sge.qs[id];
2283 	int i, header_size, ret = 0;
2284 
2285 	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
2286 		if ((q->txq[i].txq_mr.br_ring = malloc(cxgb_txq_buf_ring_size*sizeof(struct mbuf *),
2287 			    M_DEVBUF, M_WAITOK|M_ZERO)) == NULL) {
2288 			device_printf(sc->dev, "failed to allocate mbuf ring\n");
2289 			goto err;
2290 		}
2291 		q->txq[i].txq_mr.br_prod = q->txq[i].txq_mr.br_cons = 0;
2292 		q->txq[i].txq_mr.br_size = cxgb_txq_buf_ring_size;
2293 		mtx_init(&q->txq[i].txq_mr.br_lock, "txq mbuf ring", NULL, MTX_DEF);
2294 	}
2295 
2296 	init_qset_cntxt(q, id);
2297 	q->idx = id;
2298 
2299 	if ((ret = alloc_ring(sc, p->fl_size, sizeof(struct rx_desc),
2300 		    sizeof(struct rx_sw_desc), &q->fl[0].phys_addr,
2301 		    &q->fl[0].desc, &q->fl[0].sdesc,
2302 		    &q->fl[0].desc_tag, &q->fl[0].desc_map,
2303 		    sc->rx_dmat, &q->fl[0].entry_tag)) != 0) {
2304 		printf("error %d from alloc ring fl0\n", ret);
2305 		goto err;
2306 	}
2307 
2308 	if ((ret = alloc_ring(sc, p->jumbo_size, sizeof(struct rx_desc),
2309 		    sizeof(struct rx_sw_desc), &q->fl[1].phys_addr,
2310 		    &q->fl[1].desc, &q->fl[1].sdesc,
2311 		    &q->fl[1].desc_tag, &q->fl[1].desc_map,
2312 		    sc->rx_jumbo_dmat, &q->fl[1].entry_tag)) != 0) {
2313 		printf("error %d from alloc ring fl1\n", ret);
2314 		goto err;
2315 	}
2316 
2317 	if ((ret = alloc_ring(sc, p->rspq_size, sizeof(struct rsp_desc), 0,
2318 		    &q->rspq.phys_addr, &q->rspq.desc, NULL,
2319 		    &q->rspq.desc_tag, &q->rspq.desc_map,
2320 		    NULL, NULL)) != 0) {
2321 		printf("error %d from alloc ring rspq\n", ret);
2322 		goto err;
2323 	}
2324 
2325 	for (i = 0; i < ntxq; ++i) {
2326 		/*
2327 		 * The control queue always uses immediate data so does not
2328 		 * need to keep track of any mbufs.
2329 		 * XXX Placeholder for future TOE support.
2330 		 */
2331 		size_t sz = i == TXQ_CTRL ? 0 : sizeof(struct tx_sw_desc);
2332 
2333 		if ((ret = alloc_ring(sc, p->txq_size[i],
2334 			    sizeof(struct tx_desc), sz,
2335 			    &q->txq[i].phys_addr, &q->txq[i].desc,
2336 			    &q->txq[i].sdesc, &q->txq[i].desc_tag,
2337 			    &q->txq[i].desc_map,
2338 			    sc->tx_dmat, &q->txq[i].entry_tag)) != 0) {
2339 			printf("error %d from alloc ring tx %i\n", ret, i);
2340 			goto err;
2341 		}
2342 		mbufq_init(&q->txq[i].sendq);
2343 		q->txq[i].gen = 1;
2344 		q->txq[i].size = p->txq_size[i];
2345 		snprintf(q->txq[i].lockbuf, TXQ_NAME_LEN, "t3 txq lock %d:%d:%d",
2346 		    device_get_unit(sc->dev), irq_vec_idx, i);
2347 		MTX_INIT(&q->txq[i].lock, q->txq[i].lockbuf, NULL, MTX_DEF);
2348 	}
2349 
2350 	q->txq[TXQ_ETH].port = pi;
2351 
2352 	TASK_INIT(&q->txq[TXQ_OFLD].qresume_task, 0, restart_offloadq, q);
2353 	TASK_INIT(&q->txq[TXQ_CTRL].qresume_task, 0, restart_ctrlq, q);
2354 	TASK_INIT(&q->txq[TXQ_ETH].qreclaim_task, 0, sge_txq_reclaim_handler, &q->txq[TXQ_ETH]);
2355 	TASK_INIT(&q->txq[TXQ_OFLD].qreclaim_task, 0, sge_txq_reclaim_handler, &q->txq[TXQ_OFLD]);
2356 
2357 	q->fl[0].gen = q->fl[1].gen = 1;
2358 	q->fl[0].size = p->fl_size;
2359 	q->fl[1].size = p->jumbo_size;
2360 
2361 	q->rspq.gen = 1;
2362 	q->rspq.cidx = 0;
2363 	q->rspq.size = p->rspq_size;
2364 
2365 
2366 	header_size = sizeof(struct m_hdr) + sizeof(struct pkthdr) + sizeof(struct m_ext_) + sizeof(uint32_t);
2367 	q->txq[TXQ_ETH].stop_thres = nports *
2368 	    flits_to_desc(sgl_len(TX_MAX_SEGS + 1) + 3);
2369 
2370 	q->fl[0].buf_size = (MCLBYTES - header_size);
2371 	q->fl[0].zone = zone_clust;
2372 	q->fl[0].type = EXT_CLUSTER;
2373 #if __FreeBSD_version > 800000
2374 	if (cxgb_use_16k_clusters) {
2375 		q->fl[1].buf_size = MJUM16BYTES - header_size;
2376 		q->fl[1].zone = zone_jumbo16;
2377 		q->fl[1].type = EXT_JUMBO16;
2378 	} else {
2379 		q->fl[1].buf_size = MJUM9BYTES - header_size;
2380 		q->fl[1].zone = zone_jumbo9;
2381 		q->fl[1].type = EXT_JUMBO9;
2382 	}
2383 #else
2384 	q->fl[1].buf_size = MJUMPAGESIZE - header_size;
2385 	q->fl[1].zone = zone_jumbop;
2386 	q->fl[1].type = EXT_JUMBOP;
2387 #endif
2388 
2389 	/*
	 * We allocate and set up the lro_ctrl structure regardless of whether
	 * LRO is available and/or enabled.
2392 	 */
2393 	q->lro.enabled = !!(pi->ifp->if_capenable & IFCAP_LRO);
2394 	ret = tcp_lro_init(&q->lro.ctrl);
2395 	if (ret) {
2396 		printf("error %d from tcp_lro_init\n", ret);
2397 		goto err;
2398 	}
2399 	q->lro.ctrl.ifp = pi->ifp;
2400 
2401 	mtx_lock_spin(&sc->sge.reg_lock);
2402 	ret = -t3_sge_init_rspcntxt(sc, q->rspq.cntxt_id, irq_vec_idx,
2403 				   q->rspq.phys_addr, q->rspq.size,
2404 				   q->fl[0].buf_size, 1, 0);
2405 	if (ret) {
2406 		printf("error %d from t3_sge_init_rspcntxt\n", ret);
2407 		goto err_unlock;
2408 	}
2409 
2410 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
2411 		ret = -t3_sge_init_flcntxt(sc, q->fl[i].cntxt_id, 0,
2412 					  q->fl[i].phys_addr, q->fl[i].size,
2413 					  q->fl[i].buf_size, p->cong_thres, 1,
2414 					  0);
2415 		if (ret) {
2416 			printf("error %d from t3_sge_init_flcntxt for index i=%d\n", ret, i);
2417 			goto err_unlock;
2418 		}
2419 	}
2420 
2421 	ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_ETH].cntxt_id, USE_GTS,
2422 				 SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr,
2423 				 q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token,
2424 				 1, 0);
2425 	if (ret) {
2426 		printf("error %d from t3_sge_init_ecntxt\n", ret);
2427 		goto err_unlock;
2428 	}
2429 
2430 	if (ntxq > 1) {
2431 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_OFLD].cntxt_id,
2432 					 USE_GTS, SGE_CNTXT_OFLD, id,
2433 					 q->txq[TXQ_OFLD].phys_addr,
2434 					 q->txq[TXQ_OFLD].size, 0, 1, 0);
2435 		if (ret) {
2436 			printf("error %d from t3_sge_init_ecntxt\n", ret);
2437 			goto err_unlock;
2438 		}
2439 	}
2440 
2441 	if (ntxq > 2) {
2442 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_CTRL].cntxt_id, 0,
2443 					 SGE_CNTXT_CTRL, id,
2444 					 q->txq[TXQ_CTRL].phys_addr,
2445 					 q->txq[TXQ_CTRL].size,
2446 					 q->txq[TXQ_CTRL].token, 1, 0);
2447 		if (ret) {
2448 			printf("error %d from t3_sge_init_ecntxt\n", ret);
2449 			goto err_unlock;
2450 		}
2451 	}
2452 
2453 	snprintf(q->rspq.lockbuf, RSPQ_NAME_LEN, "t3 rspq lock %d:%d",
2454 	    device_get_unit(sc->dev), irq_vec_idx);
2455 	MTX_INIT(&q->rspq.lock, q->rspq.lockbuf, NULL, MTX_DEF);
2456 
2457 	mtx_unlock_spin(&sc->sge.reg_lock);
2458 	t3_update_qset_coalesce(q, p);
2459 	q->port = pi;
2460 
2461 	refill_fl(sc, &q->fl[0], q->fl[0].size);
2462 	refill_fl(sc, &q->fl[1], q->fl[1].size);
2463 	refill_rspq(sc, &q->rspq, q->rspq.size - 1);
2464 
2465 	t3_write_reg(sc, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) |
2466 		     V_NEWTIMER(q->rspq.holdoff_tmr));
2467 
2468 	return (0);
2469 
2470 err_unlock:
2471 	mtx_unlock_spin(&sc->sge.reg_lock);
2472 err:
2473 	t3_free_qset(sc, q);
2474 
2475 	return (ret);
2476 }
2477 
2478 /*
2479  * Remove CPL_RX_PKT headers from the mbuf and reduce it to a regular mbuf with
2480  * ethernet data.  Hardware assistance with various checksums and any vlan tag
2481  * will also be taken into account here.
2482  */
2483 void
2484 t3_rx_eth(struct adapter *adap, struct sge_rspq *rq, struct mbuf *m, int ethpad)
2485 {
2486 	struct cpl_rx_pkt *cpl = (struct cpl_rx_pkt *)(mtod(m, uint8_t *) + ethpad);
2487 	struct port_info *pi = &adap->port[adap->rxpkt_map[cpl->iff]];
2488 	struct ifnet *ifp = pi->ifp;
2489 
2490 	DPRINTF("rx_eth m=%p m->m_data=%p p->iff=%d\n", m, mtod(m, uint8_t *), cpl->iff);
2491 
2492 	if ((ifp->if_capenable & IFCAP_RXCSUM) && !cpl->fragment &&
2493 	    cpl->csum_valid && cpl->csum == 0xffff) {
		rspq_to_qset(rq)->port_stats[SGE_PSTAT_RX_CSUM_GOOD]++;
		m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED | CSUM_IP_VALID |
		    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
		m->m_pkthdr.csum_data = 0xffff;
2498 	}
2499 	/*
2500 	 * XXX need to add VLAN support for 6.x
2501 	 */
2502 #ifdef VLAN_SUPPORTED
2503 	if (__predict_false(cpl->vlan_valid)) {
2504 		m->m_pkthdr.ether_vtag = ntohs(cpl->vlan);
2505 		m->m_flags |= M_VLANTAG;
2506 	}
2507 #endif
2508 
2509 	m->m_pkthdr.rcvif = ifp;
2510 	m->m_pkthdr.header = mtod(m, uint8_t *) + sizeof(*cpl) + ethpad;
2511 #ifndef DISABLE_MBUF_IOVEC
2512 	m_explode(m);
2513 #endif
2514 	/*
2515 	 * adjust after conversion to mbuf chain
2516 	 */
2517 	m->m_pkthdr.len -= (sizeof(*cpl) + ethpad);
2518 	m->m_len -= (sizeof(*cpl) + ethpad);
2519 	m->m_data += (sizeof(*cpl) + ethpad);
2520 }
2521 
2522 static void
2523 ext_free_handler(void *arg1, void * arg2)
2524 {
2525 	uintptr_t type = (uintptr_t)arg2;
2526 	uma_zone_t zone;
2527 	struct mbuf *m;
2528 
2529 	m = arg1;
2530 	zone = m_getzonefromtype(type);
2531 	m->m_ext.ext_type = (int)type;
2532 	cxgb_ext_freed++;
2533 	cxgb_cache_put(zone, m);
2534 }
2535 
2536 static void
2537 init_cluster_mbuf(caddr_t cl, int flags, int type, uma_zone_t zone)
2538 {
2539 	struct mbuf *m;
2540 	int header_size;
2541 
2542 	header_size = sizeof(struct m_hdr) + sizeof(struct pkthdr) +
2543 	    sizeof(struct m_ext_) + sizeof(uint32_t);
2544 
2545 	bzero(cl, header_size);
2546 	m = (struct mbuf *)cl;
2547 
2548 	cxgb_ext_inited++;
2549 	SLIST_INIT(&m->m_pkthdr.tags);
2550 	m->m_type = MT_DATA;
2551 	m->m_flags = flags | M_NOFREE | M_EXT;
2552 	m->m_data = cl + header_size;
2553 	m->m_ext.ext_buf = cl;
2554 	m->m_ext.ref_cnt = (uint32_t *)(cl + header_size - sizeof(uint32_t));
2555 	m->m_ext.ext_size = m_getsizefromtype(type);
2556 	m->m_ext.ext_free = ext_free_handler;
2557 	m->m_ext.ext_arg1 = cl;
2558 	m->m_ext.ext_arg2 = (void *)(uintptr_t)type;
2559 	m->m_ext.ext_type = EXT_EXTREF;
2560 	*(m->m_ext.ref_cnt) = 1;
2561 	DPRINTF("data=%p ref_cnt=%p\n", m->m_data, m->m_ext.ref_cnt);
2562 }
2563 
2564 
2565 /**
2566  *	get_packet - return the next ingress packet buffer from a free list
2567  *	@adap: the adapter that received the packet
2568  *	@drop_thres: # of remaining buffers before we start dropping packets
2569  *	@qs: the qset that the SGE free list holding the packet belongs to
2570  *      @mh: the mbuf header, contains a pointer to the head and tail of the mbuf chain
2571  *      @r: response descriptor
2572  *
2573  *	Get the next packet from a free list and complete setup of the
2574  *	sk_buff.  If the packet is small we make a copy and recycle the
2575  *	original buffer, otherwise we use the original buffer itself.  If a
2576  *	positive drop threshold is supplied packets are dropped and their
2577  *	buffers recycled if (a) the number of remaining buffers is under the
2578  *	threshold and the packet is too big to copy, or (b) the packet should
2579  *	be copied but there is no memory for the copy.
2580  */
2581 #ifdef DISABLE_MBUF_IOVEC
2582 
2583 static int
2584 get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
2585     struct t3_mbuf_hdr *mh, struct rsp_desc *r)
2586 {
2587 
2588 	unsigned int len_cq =  ntohl(r->len_cq);
2589 	struct sge_fl *fl = (len_cq & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
2590 	struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];
2591 	uint32_t len = G_RSPD_LEN(len_cq);
2592 	uint32_t flags = ntohl(r->flags);
2593 	uint8_t sopeop = G_RSPD_SOP_EOP(flags);
2594 	caddr_t cl;
2595 	struct mbuf *m, *m0;
2596 	int ret = 0;
2597 
2598 	prefetch(sd->rxsd_cl);
2599 
2600 	fl->credits--;
2601 	bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD);
2602 
2603 	if (recycle_enable && len <= SGE_RX_COPY_THRES && sopeop == RSPQ_SOP_EOP) {
2604 		if ((m0 = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL)
2605 			goto skip_recycle;
2606 		cl = mtod(m0, void *);
2607 		memcpy(cl, sd->data, len);
2608 		recycle_rx_buf(adap, fl, fl->cidx);
2609 		m = m0;
2610 		m0->m_len = len;
2611 	} else {
2612 	skip_recycle:
2613 
2614 		bus_dmamap_unload(fl->entry_tag, sd->map);
2615 		cl = sd->rxsd_cl;
2616 		m = m0 = (struct mbuf *)cl;
2617 
2618 		if ((sopeop == RSPQ_SOP_EOP) ||
2619 		    (sopeop == RSPQ_SOP))
2620 			flags = M_PKTHDR;
2621 		init_cluster_mbuf(cl, flags, fl->type, fl->zone);
2622 		m0->m_len = len;
2623 	}
2624 	switch(sopeop) {
2625 	case RSPQ_SOP_EOP:
2626 		DBG(DBG_RX, ("get_packet: SOP-EOP m %p\n", m));
2627 		mh->mh_head = mh->mh_tail = m;
2628 		m->m_pkthdr.len = len;
2629 		ret = 1;
2630 		break;
2631 	case RSPQ_NSOP_NEOP:
2632 		DBG(DBG_RX, ("get_packet: NO_SOP-NO_EOP m %p\n", m));
2633 		if (mh->mh_tail == NULL) {
2634 			log(LOG_ERR, "discarding intermediate descriptor entry\n");
2635 			m_freem(m);
2636 			break;
2637 		}
2638 		mh->mh_tail->m_next = m;
2639 		mh->mh_tail = m;
2640 		mh->mh_head->m_pkthdr.len += len;
2641 		ret = 0;
2642 		break;
2643 	case RSPQ_SOP:
2644 		DBG(DBG_RX, ("get_packet: SOP m %p\n", m));
2645 		m->m_pkthdr.len = len;
2646 		mh->mh_head = mh->mh_tail = m;
2647 		ret = 0;
2648 		break;
2649 	case RSPQ_EOP:
2650 		DBG(DBG_RX, ("get_packet: EOP m %p\n", m));
2651 		mh->mh_head->m_pkthdr.len += len;
2652 		mh->mh_tail->m_next = m;
2653 		mh->mh_tail = m;
2654 		ret = 1;
2655 		break;
2656 	}
2657 	if (++fl->cidx == fl->size)
2658 		fl->cidx = 0;
2659 
2660 	return (ret);
2661 }
2662 
2663 #else
2664 
2665 static int
2666 get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
2667     struct mbuf **m, struct rsp_desc *r)
2668 {
2669 
2670 	unsigned int len_cq =  ntohl(r->len_cq);
2671 	struct sge_fl *fl = (len_cq & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
2672 	struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];
2673 	uint32_t len = G_RSPD_LEN(len_cq);
2674 	uint32_t flags = ntohl(r->flags);
2675 	uint8_t sopeop = G_RSPD_SOP_EOP(flags);
2676 	void *cl;
2677 	int ret = 0;
2678 	struct mbuf *m0;
2679 #if 0
2680 	if ((sd + 1 )->rxsd_cl)
2681 		prefetch((sd + 1)->rxsd_cl);
2682 	if ((sd + 2)->rxsd_cl)
2683 		prefetch((sd + 2)->rxsd_cl);
2684 #endif
2685 	DPRINTF("rx cpu=%d\n", curcpu);
2686 	fl->credits--;
2687 	bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD);
2688 
2689 	if (recycle_enable && len <= SGE_RX_COPY_THRES && sopeop == RSPQ_SOP_EOP) {
2690 		if ((m0 = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL)
2691 			goto skip_recycle;
2692 		cl = mtod(m0, void *);
2693 		memcpy(cl, sd->data, len);
2694 		recycle_rx_buf(adap, fl, fl->cidx);
2695 		*m = m0;
2696 	} else {
2697 	skip_recycle:
2698 		bus_dmamap_unload(fl->entry_tag, sd->map);
2699 		cl = sd->rxsd_cl;
2700 		*m = m0 = (struct mbuf *)cl;
2701 	}
2702 
2703 	switch(sopeop) {
2704 	case RSPQ_SOP_EOP:
2705 		DBG(DBG_RX, ("get_packet: SOP-EOP m %p\n", m));
2706 		if (cl == sd->rxsd_cl)
2707 			init_cluster_mbuf(cl, M_PKTHDR, fl->type, fl->zone);
2708 		m0->m_len = m0->m_pkthdr.len = len;
2709 		ret = 1;
2710 		goto done;
2711 		break;
2712 	case RSPQ_NSOP_NEOP:
2713 		DBG(DBG_RX, ("get_packet: NO_SOP-NO_EOP m %p\n", m));
2714 		panic("chaining unsupported");
2715 		ret = 0;
2716 		break;
2717 	case RSPQ_SOP:
2718 		DBG(DBG_RX, ("get_packet: SOP m %p\n", m));
2719 		panic("chaining unsupported");
2720 		m_iovinit(m0);
2721 		ret = 0;
2722 		break;
2723 	case RSPQ_EOP:
2724 		DBG(DBG_RX, ("get_packet: EOP m %p\n", m));
2725 		panic("chaining unsupported");
2726 		ret = 1;
2727 		break;
2728 	}
2729 	panic("append not supported");
2730 #if 0
2731 	m_iovappend(m0, cl, fl->buf_size, len, sizeof(uint32_t), sd->rxsd_ref);
2732 #endif
2733 done:
2734 	if (++fl->cidx == fl->size)
2735 		fl->cidx = 0;
2736 
2737 	return (ret);
2738 }
2739 #endif
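
/*
 * Two variants of get_packet() are compiled above: with DISABLE_MBUF_IOVEC
 * defined, buffers are chained onto a t3_mbuf_hdr as SOP/EOP descriptors
 * arrive; the default variant expects each packet to fit in a single buffer
 * and panics on any multi-descriptor (chained) packet.
 */
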
2740 /**
2741  *	handle_rsp_cntrl_info - handles control information in a response
2742  *	@qs: the queue set corresponding to the response
2743  *	@flags: the response control flags
2744  *
2745  *	Handles the control information of an SGE response, such as GTS
2746  *	indications and completion credits for the queue set's Tx queues.
2747  *	HW coalesces credits, we don't do any extra SW coalescing.
2748  */
2749 static __inline void
2750 handle_rsp_cntrl_info(struct sge_qset *qs, uint32_t flags)
2751 {
2752 	unsigned int credits;
2753 
2754 #if USE_GTS
2755 	if (flags & F_RSPD_TXQ0_GTS)
2756 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags);
2757 #endif
2758 	credits = G_RSPD_TXQ0_CR(flags);
2759 	if (credits)
2760 		qs->txq[TXQ_ETH].processed += credits;
2761 
2762 	credits = G_RSPD_TXQ2_CR(flags);
2763 	if (credits)
2764 		qs->txq[TXQ_CTRL].processed += credits;
2765 
2766 # if USE_GTS
2767 	if (flags & F_RSPD_TXQ1_GTS)
2768 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags);
2769 # endif
2770 	credits = G_RSPD_TXQ1_CR(flags);
2771 	if (credits)
2772 		qs->txq[TXQ_OFLD].processed += credits;
2773 
2774 }
2775 
2776 static void
2777 check_ring_db(adapter_t *adap, struct sge_qset *qs,
2778     unsigned int sleeping)
2779 {
2780 	;
2781 }
2782 
2783 /**
2784  *	process_responses - process responses from an SGE response queue
2785  *	@adap: the adapter
2786  *	@qs: the queue set to which the response queue belongs
2787  *	@budget: how many responses can be processed in this round
2788  *
2789  *	Process responses from an SGE response queue up to the supplied budget.
2790  *	Responses include received packets as well as credits and other events
2791  *	for the queues that belong to the response queue's queue set.
2792  *	A negative budget is effectively unlimited.
2793  *
2794  *	Additionally choose the interrupt holdoff time for the next interrupt
2795  *	on this queue.  If the system is under memory shortage use a fairly
2796  *	long delay to help recovery.
2797  */
2798 int
2799 process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
2800 {
2801 	struct sge_rspq *rspq = &qs->rspq;
2802 	struct rsp_desc *r = &rspq->desc[rspq->cidx];
2803 	int budget_left = budget;
2804 	unsigned int sleeping = 0;
2805 	int lro_enabled = qs->lro.enabled;
2806 	struct lro_ctrl *lro_ctrl = &qs->lro.ctrl;
2807 	struct mbuf *offload_mbufs[RX_BUNDLE_SIZE];
2808 	int ngathered = 0;
2809 #ifdef DEBUG
2810 	static int last_holdoff = 0;
2811 	if (cxgb_debug && rspq->holdoff_tmr != last_holdoff) {
2812 		printf("next_holdoff=%d\n", rspq->holdoff_tmr);
2813 		last_holdoff = rspq->holdoff_tmr;
2814 	}
2815 #endif
2816 	rspq->next_holdoff = rspq->holdoff_tmr;
2817 
2818 	while (__predict_true(budget_left && is_new_response(r, rspq))) {
2819 		int eth, eop = 0, ethpad = 0;
2820 		uint32_t flags = ntohl(r->flags);
2821 		uint32_t rss_csum = *(const uint32_t *)r;
2822 		uint32_t rss_hash = be32toh(r->rss_hdr.rss_hash_val);
2823 
2824 		eth = (r->rss_hdr.opcode == CPL_RX_PKT);
2825 
2826 		if (__predict_false(flags & F_RSPD_ASYNC_NOTIF)) {
2827 			struct mbuf *m;
2828 
2829 			if (cxgb_debug)
2830 				printf("async notification\n");
2831 
2832 			if (rspq->rspq_mh.mh_head == NULL) {
2833 				rspq->rspq_mh.mh_head = m_gethdr(M_DONTWAIT, MT_DATA);
2834 				m = rspq->rspq_mh.mh_head;
2835 			} else {
2836 				m = m_gethdr(M_DONTWAIT, MT_DATA);
2837 			}
2838 
			/* XXX m is leaked here if rspq->rspq_mh.mh_head was already non-NULL */
2840 
2841 			if (m == NULL)
2842 				goto no_mem;
2843 
			memcpy(mtod(m, char *), r, AN_PKT_SIZE);
			m->m_len = m->m_pkthdr.len = AN_PKT_SIZE;
			*mtod(m, char *) = CPL_ASYNC_NOTIF;
			rss_csum = htonl(CPL_ASYNC_NOTIF << 24);
			eop = 1;
			rspq->async_notif++;
2850 			goto skip;
2851 		} else if  (flags & F_RSPD_IMM_DATA_VALID) {
2852 			struct mbuf *m = NULL;
2853 
2854 			DPRINTF("IMM DATA VALID opcode=0x%x rspq->cidx=%d\n",
2855 			    r->rss_hdr.opcode, rspq->cidx);
2856 			if (rspq->rspq_mh.mh_head == NULL)
2857 				rspq->rspq_mh.mh_head = m_gethdr(M_DONTWAIT, MT_DATA);
			else
				m = m_gethdr(M_DONTWAIT, MT_DATA);

			if (rspq->rspq_mh.mh_head == NULL && m == NULL) {
2862 		no_mem:
2863 				rspq->next_holdoff = NOMEM_INTR_DELAY;
2864 				budget_left--;
2865 				break;
2866 			}
2867 			get_imm_packet(adap, r, rspq->rspq_mh.mh_head);
2868 			eop = 1;
2869 			rspq->imm_data++;
2870 		} else if (r->len_cq) {
2871 			int drop_thresh = eth ? SGE_RX_DROP_THRES : 0;
2872 
2873 #ifdef DISABLE_MBUF_IOVEC
2874 			eop = get_packet(adap, drop_thresh, qs, &rspq->rspq_mh, r);
2875 #else
2876 			eop = get_packet(adap, drop_thresh, qs, &rspq->rspq_mbuf, r);
2877 #endif
2878 #ifdef IFNET_MULTIQUEUE
2879 			rspq->rspq_mh.mh_head->m_pkthdr.rss_hash = rss_hash;
2880 #endif
2881 			ethpad = 2;
2882 		} else {
2883 			DPRINTF("pure response\n");
2884 			rspq->pure_rsps++;
2885 		}
2886 	skip:
2887 		if (flags & RSPD_CTRL_MASK) {
2888 			sleeping |= flags & RSPD_GTS_MASK;
2889 			handle_rsp_cntrl_info(qs, flags);
2890 		}
2891 
2892 		r++;
2893 		if (__predict_false(++rspq->cidx == rspq->size)) {
2894 			rspq->cidx = 0;
2895 			rspq->gen ^= 1;
2896 			r = rspq->desc;
2897 		}
2898 		prefetch(r);
2899 		if (++rspq->credits >= (rspq->size / 4)) {
2900 			refill_rspq(adap, rspq, rspq->credits);
2901 			rspq->credits = 0;
2902 		}
2903 		DPRINTF("eth=%d eop=%d flags=0x%x\n", eth, eop, flags);
2904 
2905 		if (!eth && eop) {
2906 			rspq->rspq_mh.mh_head->m_pkthdr.csum_data = rss_csum;
2907 			/*
2908 			 * XXX size mismatch
2909 			 */
2910 			m_set_priority(rspq->rspq_mh.mh_head, rss_hash);
2911 
2912 
2913 			ngathered = rx_offload(&adap->tdev, rspq,
2914 			    rspq->rspq_mh.mh_head, offload_mbufs, ngathered);
2915 			rspq->rspq_mh.mh_head = NULL;
2916 			DPRINTF("received offload packet\n");
2917 
2918 		} else if (eth && eop) {
2919 			struct mbuf *m = rspq->rspq_mh.mh_head;
2920 			prefetch(mtod(m, uint8_t *));
2921 			prefetch(mtod(m, uint8_t *) + L1_CACHE_BYTES);
2922 
2923 			t3_rx_eth(adap, rspq, m, ethpad);
2924 			if (lro_enabled && lro_ctrl->lro_cnt &&
2925 			    (tcp_lro_rx(lro_ctrl, m, 0) == 0)) {
				/* successfully queued for LRO */
2927 			} else {
2928 				/*
2929 				 * LRO not enabled, packet unsuitable for LRO,
2930 				 * or unable to queue.  Pass it up right now in
2931 				 * either case.
2932 				 */
2933 				struct ifnet *ifp = m->m_pkthdr.rcvif;
2934 				(*ifp->if_input)(ifp, m);
2935 			}
2936 			DPRINTF("received tunnel packet\n");
2937 			rspq->rspq_mh.mh_head = NULL;
2938 
2939 		}
2940 		__refill_fl_lt(adap, &qs->fl[0], 32);
2941 		__refill_fl_lt(adap, &qs->fl[1], 32);
2942 		--budget_left;
2943 	}
2944 
2945 	deliver_partial_bundle(&adap->tdev, rspq, offload_mbufs, ngathered);
2946 
2947 	/* Flush LRO */
2948 	while (!SLIST_EMPTY(&lro_ctrl->lro_active)) {
2949 		struct lro_entry *queued = SLIST_FIRST(&lro_ctrl->lro_active);
2950 		SLIST_REMOVE_HEAD(&lro_ctrl->lro_active, next);
2951 		tcp_lro_flush(lro_ctrl, queued);
2952 	}
2953 
2954 	if (sleeping)
2955 		check_ring_db(adap, qs, sleeping);
2956 
2957 	smp_mb();  /* commit Tx queue processed updates */
2958 	if (__predict_false(qs->txq_stopped > 1)) {
2959 		printf("restarting tx on %p\n", qs);
2960 
2961 		restart_tx(qs);
2962 	}
2963 
2964 	__refill_fl_lt(adap, &qs->fl[0], 512);
2965 	__refill_fl_lt(adap, &qs->fl[1], 512);
2966 	budget -= budget_left;
2967 	return (budget);
2968 }
2969 
2970 /*
2971  * A helper function that processes responses and issues GTS.
2972  */
2973 static __inline int
2974 process_responses_gts(adapter_t *adap, struct sge_rspq *rq)
2975 {
2976 	int work;
2977 	static int last_holdoff = 0;
2978 
2979 	work = process_responses(adap, rspq_to_qset(rq), -1);
2980 
2981 	if (cxgb_debug && (rq->next_holdoff != last_holdoff)) {
2982 		printf("next_holdoff=%d\n", rq->next_holdoff);
2983 		last_holdoff = rq->next_holdoff;
2984 	}
2985 	t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) |
2986 	    V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx));
2987 
2988 	return (work);
2989 }
2990 
2991 
2992 /*
2993  * Interrupt handler for legacy INTx interrupts for T3B-based cards.
2994  * Handles data events from SGE response queues as well as error and other
2995  * async events as they all use the same interrupt pin.  We use one SGE
2996  * response queue per port in this mode and protect all response queues with
2997  * queue 0's lock.
2998  */
2999 void
3000 t3b_intr(void *data)
3001 {
3002 	uint32_t i, map;
3003 	adapter_t *adap = data;
3004 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
3005 
3006 	t3_write_reg(adap, A_PL_CLI, 0);
3007 	map = t3_read_reg(adap, A_SG_DATA_INTR);
3008 
3009 	if (!map)
3010 		return;
3011 
3012 	if (__predict_false(map & F_ERRINTR))
3013 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
3014 
3015 	mtx_lock(&q0->lock);
3016 	for_each_port(adap, i)
3017 	    if (map & (1 << i))
3018 			process_responses_gts(adap, &adap->sge.qs[i].rspq);
3019 	mtx_unlock(&q0->lock);
3020 }
3021 
3022 /*
3023  * The MSI interrupt handler.  This needs to handle data events from SGE
3024  * response queues as well as error and other async events as they all use
3025  * the same MSI vector.  We use one SGE response queue per port in this mode
3026  * and protect all response queues with queue 0's lock.
3027  */
3028 void
3029 t3_intr_msi(void *data)
3030 {
3031 	adapter_t *adap = data;
3032 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
3033 	int i, new_packets = 0;
3034 
3035 	mtx_lock(&q0->lock);
3036 
3037 	for_each_port(adap, i)
3038 	    if (process_responses_gts(adap, &adap->sge.qs[i].rspq))
3039 		    new_packets = 1;
3040 	mtx_unlock(&q0->lock);
3041 	if (new_packets == 0)
3042 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
3043 }
3044 
3045 void
3046 t3_intr_msix(void *data)
3047 {
3048 	struct sge_qset *qs = data;
3049 	adapter_t *adap = qs->port->adapter;
3050 	struct sge_rspq *rspq = &qs->rspq;
3051 #ifndef IFNET_MULTIQUEUE
3052 	mtx_lock(&rspq->lock);
3053 #else
3054 	if (mtx_trylock(&rspq->lock))
3055 #endif
3056 	{
3057 
3058 		if (process_responses_gts(adap, rspq) == 0)
3059 			rspq->unhandled_irqs++;
3060 		mtx_unlock(&rspq->lock);
3061 	}
3062 }
3063 
#define QDUMP_SBUF_SIZE		(32 * 400)
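
/*
 * Sysctl handlers that dump raw response/Tx descriptors.  The window to dump
 * is selected through the companion dump_start/dump_count sysctls registered
 * in t3_add_configured_sysctls(), e.g. (assuming the controller attaches as
 * cxgbc0):
 *
 *   sysctl dev.cxgbc.0.port0.qs0.rspq.dump_count=32
 *   sysctl dev.cxgbc.0.port0.qs0.rspq.qdump
 */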
3065 static int
3066 t3_dump_rspq(SYSCTL_HANDLER_ARGS)
3067 {
3068 	struct sge_rspq *rspq;
3069 	struct sge_qset *qs;
3070 	int i, err, dump_end, idx;
3071 	static int multiplier = 1;
3072 	struct sbuf *sb;
3073 	struct rsp_desc *rspd;
3074 	uint32_t data[4];
3075 
3076 	rspq = arg1;
3077 	qs = rspq_to_qset(rspq);
3078 	if (rspq->rspq_dump_count == 0)
3079 		return (0);
3080 	if (rspq->rspq_dump_count > RSPQ_Q_SIZE) {
3081 		log(LOG_WARNING,
3082 		    "dump count is too large %d\n", rspq->rspq_dump_count);
3083 		rspq->rspq_dump_count = 0;
3084 		return (EINVAL);
3085 	}
3086 	if (rspq->rspq_dump_start > (RSPQ_Q_SIZE-1)) {
3087 		log(LOG_WARNING,
3088 		    "dump start of %d is greater than queue size\n",
3089 		    rspq->rspq_dump_start);
3090 		rspq->rspq_dump_start = 0;
3091 		return (EINVAL);
3092 	}
3093 	err = t3_sge_read_rspq(qs->port->adapter, rspq->cntxt_id, data);
3094 	if (err)
3095 		return (err);
3096 retry_sbufops:
3097 	sb = sbuf_new(NULL, NULL, QDUMP_SBUF_SIZE*multiplier, SBUF_FIXEDLEN);
3098 
3099 	sbuf_printf(sb, " \n index=%u size=%u MSI-X/RspQ=%u intr enable=%u intr armed=%u\n",
3100 	    (data[0] & 0xffff), data[0] >> 16, ((data[2] >> 20) & 0x3f),
3101 	    ((data[2] >> 26) & 1), ((data[2] >> 27) & 1));
3102 	sbuf_printf(sb, " generation=%u CQ mode=%u FL threshold=%u\n",
3103 	    ((data[2] >> 28) & 1), ((data[2] >> 31) & 1), data[3]);
3104 
3105 	sbuf_printf(sb, " start=%d -> end=%d\n", rspq->rspq_dump_start,
3106 	    (rspq->rspq_dump_start + rspq->rspq_dump_count) & (RSPQ_Q_SIZE-1));
3107 
3108 	dump_end = rspq->rspq_dump_start + rspq->rspq_dump_count;
3109 	for (i = rspq->rspq_dump_start; i < dump_end; i++) {
3110 		idx = i & (RSPQ_Q_SIZE-1);
3111 
3112 		rspd = &rspq->desc[idx];
3113 		sbuf_printf(sb, "\tidx=%04d opcode=%02x cpu_idx=%x hash_type=%x cq_idx=%x\n",
3114 		    idx, rspd->rss_hdr.opcode, rspd->rss_hdr.cpu_idx,
3115 		    rspd->rss_hdr.hash_type, be16toh(rspd->rss_hdr.cq_idx));
3116 		sbuf_printf(sb, "\trss_hash_val=%x flags=%08x len_cq=%x intr_gen=%x\n",
3117 		    rspd->rss_hdr.rss_hash_val, be32toh(rspd->flags),
3118 		    be32toh(rspd->len_cq), rspd->intr_gen);
3119 	}
3120 	if (sbuf_overflowed(sb)) {
3121 		sbuf_delete(sb);
3122 		multiplier++;
3123 		goto retry_sbufops;
3124 	}
3125 	sbuf_finish(sb);
3126 	err = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
3127 	sbuf_delete(sb);
3128 	return (err);
3129 }
3130 
3131 static int
3132 t3_dump_txq_eth(SYSCTL_HANDLER_ARGS)
3133 {
3134 	struct sge_txq *txq;
3135 	struct sge_qset *qs;
3136 	int i, j, err, dump_end;
3137 	static int multiplier = 1;
3138 	struct sbuf *sb;
3139 	struct tx_desc *txd;
3140 	uint32_t *WR, wr_hi, wr_lo, gen;
3141 	uint32_t data[4];
3142 
3143 	txq = arg1;
3144 	qs = txq_to_qset(txq, TXQ_ETH);
3145 	if (txq->txq_dump_count == 0) {
3146 		return (0);
3147 	}
3148 	if (txq->txq_dump_count > TX_ETH_Q_SIZE) {
3149 		log(LOG_WARNING,
3150 		    "dump count is too large %d\n", txq->txq_dump_count);
3151 		txq->txq_dump_count = 1;
3152 		return (EINVAL);
3153 	}
3154 	if (txq->txq_dump_start > (TX_ETH_Q_SIZE-1)) {
3155 		log(LOG_WARNING,
3156 		    "dump start of %d is greater than queue size\n",
3157 		    txq->txq_dump_start);
3158 		txq->txq_dump_start = 0;
3159 		return (EINVAL);
3160 	}
3161 	err = t3_sge_read_ecntxt(qs->port->adapter, qs->rspq.cntxt_id, data);
3162 	if (err)
3163 		return (err);
3164 
3165 
3166 retry_sbufops:
3167 	sb = sbuf_new(NULL, NULL, QDUMP_SBUF_SIZE*multiplier, SBUF_FIXEDLEN);
3168 
3169 	sbuf_printf(sb, " \n credits=%u GTS=%u index=%u size=%u rspq#=%u cmdq#=%u\n",
3170 	    (data[0] & 0x7fff), ((data[0] >> 15) & 1), (data[0] >> 16),
3171 	    (data[1] & 0xffff), ((data[3] >> 4) & 7), ((data[3] >> 7) & 1));
3172 	sbuf_printf(sb, " TUN=%u TOE=%u generation%u uP token=%u valid=%u\n",
3173 	    ((data[3] >> 8) & 1), ((data[3] >> 9) & 1), ((data[3] >> 10) & 1),
3174 	    ((data[3] >> 11) & 0xfffff), ((data[3] >> 31) & 1));
3175 	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
3176 	    txq->txq_dump_start,
3177 	    (txq->txq_dump_start + txq->txq_dump_count) & (TX_ETH_Q_SIZE-1));
3178 
3179 	dump_end = txq->txq_dump_start + txq->txq_dump_count;
3180 	for (i = txq->txq_dump_start; i < dump_end; i++) {
3181 		txd = &txq->desc[i & (TX_ETH_Q_SIZE-1)];
3182 		WR = (uint32_t *)txd->flit;
3183 		wr_hi = ntohl(WR[0]);
3184 		wr_lo = ntohl(WR[1]);
3185 		gen = G_WR_GEN(wr_lo);
3186 
3187 		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
3188 		    wr_hi, wr_lo, gen);
3189 		for (j = 2; j < 30; j += 4)
3190 			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
3191 			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
3192 
3193 	}
3194 	if (sbuf_overflowed(sb)) {
3195 		sbuf_delete(sb);
3196 		multiplier++;
3197 		goto retry_sbufops;
3198 	}
3199 	sbuf_finish(sb);
3200 	err = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
3201 	sbuf_delete(sb);
3202 	return (err);
3203 }
3204 
3205 static int
3206 t3_dump_txq_ctrl(SYSCTL_HANDLER_ARGS)
3207 {
3208 	struct sge_txq *txq;
3209 	struct sge_qset *qs;
3210 	int i, j, err, dump_end;
3211 	static int multiplier = 1;
3212 	struct sbuf *sb;
3213 	struct tx_desc *txd;
3214 	uint32_t *WR, wr_hi, wr_lo, gen;
3215 
3216 	txq = arg1;
3217 	qs = txq_to_qset(txq, TXQ_CTRL);
3218 	if (txq->txq_dump_count == 0) {
3219 		return (0);
3220 	}
3221 	if (txq->txq_dump_count > 256) {
3222 		log(LOG_WARNING,
3223 		    "dump count is too large %d\n", txq->txq_dump_count);
3224 		txq->txq_dump_count = 1;
3225 		return (EINVAL);
3226 	}
3227 	if (txq->txq_dump_start > 255) {
3228 		log(LOG_WARNING,
3229 		    "dump start of %d is greater than queue size\n",
3230 		    txq->txq_dump_start);
3231 		txq->txq_dump_start = 0;
3232 		return (EINVAL);
3233 	}
3234 
3235 retry_sbufops:
3236 	sb = sbuf_new(NULL, NULL, QDUMP_SBUF_SIZE*multiplier, SBUF_FIXEDLEN);
3237 	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
3238 	    txq->txq_dump_start,
3239 	    (txq->txq_dump_start + txq->txq_dump_count) & 255);
3240 
3241 	dump_end = txq->txq_dump_start + txq->txq_dump_count;
3242 	for (i = txq->txq_dump_start; i < dump_end; i++) {
3243 		txd = &txq->desc[i & (255)];
3244 		WR = (uint32_t *)txd->flit;
3245 		wr_hi = ntohl(WR[0]);
3246 		wr_lo = ntohl(WR[1]);
3247 		gen = G_WR_GEN(wr_lo);
3248 
3249 		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
3250 		    wr_hi, wr_lo, gen);
3251 		for (j = 2; j < 30; j += 4)
3252 			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
3253 			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
3254 
3255 	}
3256 	if (sbuf_overflowed(sb)) {
3257 		sbuf_delete(sb);
3258 		multiplier++;
3259 		goto retry_sbufops;
3260 	}
3261 	sbuf_finish(sb);
3262 	err = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
3263 	sbuf_delete(sb);
3264 	return (err);
3265 }
3266 
3267 static int
3268 t3_set_coalesce_usecs(SYSCTL_HANDLER_ARGS)
3269 {
3270 	adapter_t *sc = arg1;
3271 	struct qset_params *qsp = &sc->params.sge.qset[0];
3272 	int coalesce_usecs;
3273 	struct sge_qset *qs;
3274 	int i, j, err, nqsets = 0;
3275 	struct mtx *lock;
3276 
3277 	if ((sc->flags & FULL_INIT_DONE) == 0)
3278 		return (ENXIO);
3279 
3280 	coalesce_usecs = qsp->coalesce_usecs;
3281         err = sysctl_handle_int(oidp, &coalesce_usecs, arg2, req);
3282 
3283 	if (err != 0) {
3284 		return (err);
3285 	}
3286 	if (coalesce_usecs == qsp->coalesce_usecs)
3287 		return (0);
3288 
3289 	for (i = 0; i < sc->params.nports; i++)
3290 		for (j = 0; j < sc->port[i].nqsets; j++)
3291 			nqsets++;
3292 
3293 	coalesce_usecs = max(1, coalesce_usecs);
3294 
3295 	for (i = 0; i < nqsets; i++) {
3296 		qs = &sc->sge.qs[i];
3297 		qsp = &sc->params.sge.qset[i];
3298 		qsp->coalesce_usecs = coalesce_usecs;
3299 
3300 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
3301 			    &sc->sge.qs[0].rspq.lock;
3302 
3303 		mtx_lock(lock);
3304 		t3_update_qset_coalesce(qs, qsp);
3305 		t3_write_reg(sc, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) |
3306 		    V_NEWTIMER(qs->rspq.holdoff_tmr));
3307 		mtx_unlock(lock);
3308 	}
3309 
3310 	return (0);
3311 }
3312 
3313 
3314 void
3315 t3_add_attach_sysctls(adapter_t *sc)
3316 {
3317 	struct sysctl_ctx_list *ctx;
3318 	struct sysctl_oid_list *children;
3319 
3320 	ctx = device_get_sysctl_ctx(sc->dev);
3321 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3322 
3323 	/* random information */
3324 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
3325 	    "firmware_version",
3326 	    CTLFLAG_RD, &sc->fw_version,
3327 	    0, "firmware version");
3328 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3329 	    "hw_revision",
3330 	    CTLFLAG_RD, &sc->params.rev,
3331 	    0, "chip model");
3332 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3333 	    "enable_debug",
3334 	    CTLFLAG_RW, &cxgb_debug,
3335 	    0, "enable verbose debugging output");
3336 	SYSCTL_ADD_ULONG(ctx, children, OID_AUTO, "tunq_coalesce",
3337 	    CTLFLAG_RD, &sc->tunq_coalesce,
3338 	    "#tunneled packets freed");
3339 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3340 	    "txq_overrun",
3341 	    CTLFLAG_RD, &txq_fills,
3342 	    0, "#times txq overrun");
3343 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3344 	    "pcpu_cache_enable",
3345 	    CTLFLAG_RW, &cxgb_pcpu_cache_enable,
3346 	    0, "#enable driver local pcpu caches");
3347 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3348 	    "cache_alloc",
3349 	    CTLFLAG_RD, &cxgb_cached_allocations,
3350 	    0, "#times a cluster was allocated from cache");
3351 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3352 	    "cached",
3353 	    CTLFLAG_RD, &cxgb_cached,
3354 	    0, "#times a cluster was cached");
3355 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3356 	    "ext_freed",
3357 	    CTLFLAG_RD, &cxgb_ext_freed,
3358 	    0, "#times a cluster was freed through ext_free");
3359 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3360 	    "ext_inited",
3361 	    CTLFLAG_RD, &cxgb_ext_inited,
3362 	    0, "#times a cluster was initialized for ext_free");
3363 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3364 	    "mbufs_outstanding",
3365 	    CTLFLAG_RD, &cxgb_mbufs_outstanding,
3366 	    0, "#mbufs in flight in the driver");
3367 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3368 	    "pack_outstanding",
3369 	    CTLFLAG_RD, &cxgb_pack_outstanding,
3370 	    0, "#packet in flight in the driver");
3371 }
3372 
3373 
3374 static const char *rspq_name = "rspq";
3375 static const char *txq_names[] =
3376 {
3377 	"txq_eth",
3378 	"txq_ofld",
3379 	"txq_ctrl"
3380 };
3381 
3382 static int
3383 sysctl_handle_macstat(SYSCTL_HANDLER_ARGS)
3384 {
3385 	struct port_info *p = arg1;
3386 	uint64_t *parg;
3387 
3388 	if (!p)
3389 		return (EINVAL);
3390 
3391 	parg = (uint64_t *) ((uint8_t *)&p->mac.stats + arg2);
3392 
3393 	PORT_LOCK(p);
3394 	t3_mac_update_stats(&p->mac);
3395 	PORT_UNLOCK(p);
3396 
3397 	return (sysctl_handle_quad(oidp, parg, 0, req));
3398 }
3399 
3400 void
3401 t3_add_configured_sysctls(adapter_t *sc)
3402 {
3403 	struct sysctl_ctx_list *ctx;
3404 	struct sysctl_oid_list *children;
3405 	int i, j;
3406 
3407 	ctx = device_get_sysctl_ctx(sc->dev);
3408 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3409 
3410 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
3411 	    "intr_coal",
3412 	    CTLTYPE_INT|CTLFLAG_RW, sc,
3413 	    0, t3_set_coalesce_usecs,
3414 	    "I", "interrupt coalescing timer (us)");
3415 
3416 	for (i = 0; i < sc->params.nports; i++) {
3417 		struct port_info *pi = &sc->port[i];
3418 		struct sysctl_oid *poid;
3419 		struct sysctl_oid_list *poidlist;
3420 		struct mac_stats *mstats = &pi->mac.stats;
3421 
3422 		snprintf(pi->namebuf, PORT_NAME_LEN, "port%d", i);
3423 		poid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO,
3424 		    pi->namebuf, CTLFLAG_RD, NULL, "port statistics");
3425 		poidlist = SYSCTL_CHILDREN(poid);
3426 		SYSCTL_ADD_INT(ctx, poidlist, OID_AUTO,
3427 		    "nqsets", CTLFLAG_RD, &pi->nqsets,
3428 		    0, "#queue sets");
3429 
3430 		for (j = 0; j < pi->nqsets; j++) {
3431 			struct sge_qset *qs = &sc->sge.qs[pi->first_qset + j];
3432 			struct sysctl_oid *qspoid, *rspqpoid, *txqpoid, *ctrlqpoid, *lropoid;
3433 			struct sysctl_oid_list *qspoidlist, *rspqpoidlist, *txqpoidlist, *ctrlqpoidlist, *lropoidlist;
3434 			struct sge_txq *txq = &qs->txq[TXQ_ETH];
3435 
3436 			snprintf(qs->namebuf, QS_NAME_LEN, "qs%d", j);
3437 
3438 			qspoid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO,
3439 			    qs->namebuf, CTLFLAG_RD, NULL, "qset statistics");
3440 			qspoidlist = SYSCTL_CHILDREN(qspoid);
3441 
3442 			rspqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3443 			    rspq_name, CTLFLAG_RD, NULL, "rspq statistics");
3444 			rspqpoidlist = SYSCTL_CHILDREN(rspqpoid);
3445 
3446 			txqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3447 			    txq_names[0], CTLFLAG_RD, NULL, "txq statistics");
3448 			txqpoidlist = SYSCTL_CHILDREN(txqpoid);
3449 
3450 			ctrlqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3451 			    txq_names[2], CTLFLAG_RD, NULL, "ctrlq statistics");
3452 			ctrlqpoidlist = SYSCTL_CHILDREN(ctrlqpoid);
3453 
3454 			lropoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3455 			    "lro_stats", CTLFLAG_RD, NULL, "LRO statistics");
3456 			lropoidlist = SYSCTL_CHILDREN(lropoid);
3457 
3458 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "size",
3459 			    CTLFLAG_RD, &qs->rspq.size,
3460 			    0, "#entries in response queue");
3461 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "cidx",
3462 			    CTLFLAG_RD, &qs->rspq.cidx,
3463 			    0, "consumer index");
3464 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "credits",
3465 			    CTLFLAG_RD, &qs->rspq.credits,
3466 			    0, "#credits");
3467 			SYSCTL_ADD_XLONG(ctx, rspqpoidlist, OID_AUTO, "phys_addr",
3468 			    CTLFLAG_RD, &qs->rspq.phys_addr,
3469 			    "physical_address_of the queue");
3470 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_start",
3471 			    CTLFLAG_RW, &qs->rspq.rspq_dump_start,
3472 			    0, "start rspq dump entry");
3473 			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_count",
3474 			    CTLFLAG_RW, &qs->rspq.rspq_dump_count,
3475 			    0, "#rspq entries to dump");
3476 			SYSCTL_ADD_PROC(ctx, rspqpoidlist, OID_AUTO, "qdump",
3477 			    CTLTYPE_STRING | CTLFLAG_RD, &qs->rspq,
3478 			    0, t3_dump_rspq, "A", "dump of the response queue");
3479 
3480 
3481 			SYSCTL_ADD_INT(ctx, txqpoidlist, OID_AUTO, "dropped",
3482 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].txq_drops,
3483 			    0, "#tunneled packets dropped");
3484 			SYSCTL_ADD_INT(ctx, txqpoidlist, OID_AUTO, "sendqlen",
3485 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].sendq.qlen,
3486 			    0, "#tunneled packets waiting to be sent");
3487 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_pidx",
3488 			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_prod,
3489 			    0, "#tunneled packets queue producer index");
3490 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_cidx",
3491 			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_cons,
3492 			    0, "#tunneled packets queue consumer index");
3493 			SYSCTL_ADD_INT(ctx, txqpoidlist, OID_AUTO, "processed",
3494 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].processed,
3495 			    0, "#tunneled packets processed by the card");
3496 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "cleaned",
3497 			    CTLFLAG_RD, &txq->cleaned,
3498 			    0, "#tunneled packets cleaned");
3499 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "in_use",
3500 			    CTLFLAG_RD, &txq->in_use,
3501 			    0, "#tunneled packet slots in use");
3502 			SYSCTL_ADD_ULONG(ctx, txqpoidlist, OID_AUTO, "frees",
3503 			    CTLFLAG_RD, &txq->txq_frees,
3504 			    "#tunneled packets freed");
3505 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "skipped",
3506 			    CTLFLAG_RD, &txq->txq_skipped,
3507 			    0, "#tunneled packet descriptors skipped");
3508 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "coalesced",
3509 			    CTLFLAG_RD, &txq->txq_coalesced,
3510 			    0, "#tunneled packets coalesced");
3511 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "enqueued",
3512 			    CTLFLAG_RD, &txq->txq_enqueued,
3513 			    0, "#tunneled packets enqueued to hardware");
3514 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "stopped_flags",
3515 			    CTLFLAG_RD, &qs->txq_stopped,
3516 			    0, "tx queues stopped");
3517 			SYSCTL_ADD_XLONG(ctx, txqpoidlist, OID_AUTO, "phys_addr",
3518 			    CTLFLAG_RD, &txq->phys_addr,
3519 			    "physical_address_of the queue");
3520 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "qgen",
3521 			    CTLFLAG_RW, &qs->txq[TXQ_ETH].gen,
3522 			    0, "txq generation");
3523 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_cidx",
3524 			    CTLFLAG_RD, &txq->cidx,
3525 			    0, "hardware queue cidx");
3526 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_pidx",
3527 			    CTLFLAG_RD, &txq->pidx,
3528 			    0, "hardware queue pidx");
3529 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_start",
3530 			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_start,
3531 			    0, "txq start idx for dump");
3532 			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_count",
3533 			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_count,
3534 			    0, "txq #entries to dump");
3535 			SYSCTL_ADD_PROC(ctx, txqpoidlist, OID_AUTO, "qdump",
3536 			    CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_ETH],
3537 			    0, t3_dump_txq_eth, "A", "dump of the transmit queue");
3538 
3539 			SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_start",
3540 			    CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_start,
3541 			    0, "ctrlq start idx for dump");
3542 			SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_count",
3543 			    CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_count,
3544 			    0, "ctrl #entries to dump");
3545 			SYSCTL_ADD_PROC(ctx, ctrlqpoidlist, OID_AUTO, "qdump",
3546 			    CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_CTRL],
3547 			    0, t3_dump_txq_ctrl, "A", "dump of the control queue");
3548 
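			/*
			 * Usage sketch (illustrative only): dump_start and
			 * dump_count select a window of descriptors and
			 * "qdump" renders that window as text, so a userland
			 * consumer could drive the knobs via sysctlbyname(3)
			 * roughly as below.  The OID prefix shown is
			 * hypothetical; the real path depends on the parent
			 * sysctl nodes created earlier in this function.
			 *
			 *	char buf[65536];
			 *	size_t len = sizeof(buf);
			 *	uint32_t start = 0, count = 32;
			 *
			 *	sysctlbyname("dev.cxgbc.0.qs0.txq.dump_start",
			 *	    NULL, NULL, &start, sizeof(start));
			 *	sysctlbyname("dev.cxgbc.0.qs0.txq.dump_count",
			 *	    NULL, NULL, &count, sizeof(count));
			 *	sysctlbyname("dev.cxgbc.0.qs0.txq.qdump",
			 *	    buf, &len, NULL, 0);
			 */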
3549 			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_queued",
3550 			    CTLFLAG_RD, &qs->lro.ctrl.lro_queued, 0, "#packets queued for LRO");
3551 			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_flushed",
3552 			    CTLFLAG_RD, &qs->lro.ctrl.lro_flushed, 0, "#LRO chains flushed");
3553 			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_bad_csum",
3554 			    CTLFLAG_RD, &qs->lro.ctrl.lro_bad_csum, 0, "#LRO bad checksums");
3555 			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_cnt",
3556 			    CTLFLAG_RD, &qs->lro.ctrl.lro_cnt, 0, "#LRO entries allocated");
3557 		}
3558 
3559 		/* Now add a node for mac stats. */
3560 		poid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO, "mac_stats",
3561 		    CTLFLAG_RD, NULL, "MAC statistics");
3562 		poidlist = SYSCTL_CHILDREN(poid);
3563 
3564 		/*
3565 		 * We (ab)use the length argument (arg2) to pass on the offset
3566 		 * of the data that we are interested in.  This is only required
3567 		 * for the quad counters that are updated from the hardware (we
3568 		 * make sure that we return the latest value).
3569 		 * sysctl_handle_macstat first updates *all* the counters from
3570 		 * the hardware, and then returns the latest value of the
3571 		 * requested counter.  Best would be to update only the
3572 		 * requested counter from hardware, but t3_mac_update_stats()
3573 		 * hides all the register details and we don't want to dive into
3574 		 * all that here.
3575 		 */
3576 #define CXGB_SYSCTL_ADD_QUAD(a)	SYSCTL_ADD_OID(ctx, poidlist, OID_AUTO, #a, \
3577     (CTLTYPE_QUAD | CTLFLAG_RD), pi, offsetof(struct mac_stats, a), \
3578     sysctl_handle_macstat, "QU", 0)
3579 		CXGB_SYSCTL_ADD_QUAD(tx_octets);
3580 		CXGB_SYSCTL_ADD_QUAD(tx_octets_bad);
3581 		CXGB_SYSCTL_ADD_QUAD(tx_frames);
3582 		CXGB_SYSCTL_ADD_QUAD(tx_mcast_frames);
3583 		CXGB_SYSCTL_ADD_QUAD(tx_bcast_frames);
3584 		CXGB_SYSCTL_ADD_QUAD(tx_pause);
3585 		CXGB_SYSCTL_ADD_QUAD(tx_deferred);
3586 		CXGB_SYSCTL_ADD_QUAD(tx_late_collisions);
3587 		CXGB_SYSCTL_ADD_QUAD(tx_total_collisions);
3588 		CXGB_SYSCTL_ADD_QUAD(tx_excess_collisions);
3589 		CXGB_SYSCTL_ADD_QUAD(tx_underrun);
3590 		CXGB_SYSCTL_ADD_QUAD(tx_len_errs);
3591 		CXGB_SYSCTL_ADD_QUAD(tx_mac_internal_errs);
3592 		CXGB_SYSCTL_ADD_QUAD(tx_excess_deferral);
3593 		CXGB_SYSCTL_ADD_QUAD(tx_fcs_errs);
3594 		CXGB_SYSCTL_ADD_QUAD(tx_frames_64);
3595 		CXGB_SYSCTL_ADD_QUAD(tx_frames_65_127);
3596 		CXGB_SYSCTL_ADD_QUAD(tx_frames_128_255);
3597 		CXGB_SYSCTL_ADD_QUAD(tx_frames_256_511);
3598 		CXGB_SYSCTL_ADD_QUAD(tx_frames_512_1023);
3599 		CXGB_SYSCTL_ADD_QUAD(tx_frames_1024_1518);
3600 		CXGB_SYSCTL_ADD_QUAD(tx_frames_1519_max);
3601 		CXGB_SYSCTL_ADD_QUAD(rx_octets);
3602 		CXGB_SYSCTL_ADD_QUAD(rx_octets_bad);
3603 		CXGB_SYSCTL_ADD_QUAD(rx_frames);
3604 		CXGB_SYSCTL_ADD_QUAD(rx_mcast_frames);
3605 		CXGB_SYSCTL_ADD_QUAD(rx_bcast_frames);
3606 		CXGB_SYSCTL_ADD_QUAD(rx_pause);
3607 		CXGB_SYSCTL_ADD_QUAD(rx_align_errs);
3608 		CXGB_SYSCTL_ADD_QUAD(rx_symbol_errs);
3609 		CXGB_SYSCTL_ADD_QUAD(rx_data_errs);
3610 		CXGB_SYSCTL_ADD_QUAD(rx_sequence_errs);
3611 		CXGB_SYSCTL_ADD_QUAD(rx_runt);
3612 		CXGB_SYSCTL_ADD_QUAD(rx_jabber);
3613 		CXGB_SYSCTL_ADD_QUAD(rx_short);
3614 		CXGB_SYSCTL_ADD_QUAD(rx_too_long);
3615 		CXGB_SYSCTL_ADD_QUAD(rx_mac_internal_errs);
3616 		CXGB_SYSCTL_ADD_QUAD(rx_cong_drops);
3617 		CXGB_SYSCTL_ADD_QUAD(rx_frames_64);
3618 		CXGB_SYSCTL_ADD_QUAD(rx_frames_65_127);
3619 		CXGB_SYSCTL_ADD_QUAD(rx_frames_128_255);
3620 		CXGB_SYSCTL_ADD_QUAD(rx_frames_256_511);
3621 		CXGB_SYSCTL_ADD_QUAD(rx_frames_512_1023);
3622 		CXGB_SYSCTL_ADD_QUAD(rx_frames_1024_1518);
3623 		CXGB_SYSCTL_ADD_QUAD(rx_frames_1519_max);
3624 #undef CXGB_SYSCTL_ADD_QUAD
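		/*
		 * Illustrative sketch only, not the handler registered above:
		 * a sysctl handler following the arg2-as-offset convention
		 * described in the comment before CXGB_SYSCTL_ADD_QUAD could
		 * look roughly like this.  It assumes arg1 is the port_info
		 * registered above and that its MAC statistics sit in a
		 * struct mac_stats reachable from it (as the mstats pointer
		 * used below suggests); names and locking are simplified.
		 *
		 *	static int
		 *	example_macstat_handler(SYSCTL_HANDLER_ARGS)
		 *	{
		 *		struct port_info *p = arg1;
		 *		struct mac_stats *s = &p->mac.stats;
		 *		uint64_t val;
		 *
		 *		// refresh every counter from the hardware first
		 *		t3_mac_update_stats(&p->mac);
		 *
		 *		// arg2 carries offsetof(struct mac_stats, <counter>)
		 *		val = *(uint64_t *)((uintptr_t)s + arg2);
		 *		return (sysctl_handle_quad(oidp, &val, 0, req));
		 *	}
		 */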
3625 
3626 #define CXGB_SYSCTL_ADD_ULONG(a) SYSCTL_ADD_ULONG(ctx, poidlist, OID_AUTO, #a, \
3627     CTLFLAG_RD, &mstats->a, 0)
3628 		CXGB_SYSCTL_ADD_ULONG(tx_fifo_parity_err);
3629 		CXGB_SYSCTL_ADD_ULONG(rx_fifo_parity_err);
3630 		CXGB_SYSCTL_ADD_ULONG(tx_fifo_urun);
3631 		CXGB_SYSCTL_ADD_ULONG(rx_fifo_ovfl);
3632 		CXGB_SYSCTL_ADD_ULONG(serdes_signal_loss);
3633 		CXGB_SYSCTL_ADD_ULONG(xaui_pcs_ctc_err);
3634 		CXGB_SYSCTL_ADD_ULONG(xaui_pcs_align_change);
3635 		CXGB_SYSCTL_ADD_ULONG(num_toggled);
3636 		CXGB_SYSCTL_ADD_ULONG(num_resets);
3637 #undef CXGB_SYSCTL_ADD_ULONG
3638 	}
3639 }
3640 
3641 /**
3642  *	t3_get_desc - dump an SGE descriptor for debugging purposes
3643  *	@qs: the queue set
3644  *	@qnum: identifies the specific queue (0..2: Tx, 3: response, 4..5: Rx)
3645  *	@idx: the descriptor index in the queue
3646  *	@data: where to dump the descriptor contents
3647  *
3648  *	Dumps the contents of a HW descriptor of an SGE queue.  Returns the
3649  *	size of the descriptor, or EINVAL if qnum or idx is out of range.
3650  */
3651 int
3652 t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx,
3653 		unsigned char *data)
3654 {
3655 	if (qnum >= 6)
3656 		return (EINVAL);
3657 
3658 	if (qnum < 3) {
3659 		if (!qs->txq[qnum].desc || idx >= qs->txq[qnum].size)
3660 			return (EINVAL);
3661 		memcpy(data, &qs->txq[qnum].desc[idx], sizeof(struct tx_desc));
3662 		return (sizeof(struct tx_desc));
3663 	}
3664 
3665 	if (qnum == 3) {
3666 		if (!qs->rspq.desc || idx >= qs->rspq.size)
3667 			return (EINVAL);
3668 		memcpy(data, &qs->rspq.desc[idx], sizeof(struct rsp_desc));
3669 		return (sizeof(struct rsp_desc));
3670 	}
3671 
3672 	qnum -= 4;
3673 	if (!qs->fl[qnum].desc || idx >= qs->fl[qnum].size)
3674 		return (EINVAL);
3675 	memcpy(data, &qs->fl[qnum].desc[idx], sizeof(struct rx_desc));
3676 	return (sizeof(struct rx_desc));
3677 }
3678
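/*
 * Usage sketch (illustrative only): a debug or ioctl path could use
 * t3_get_desc() to copy out a single raw descriptor.  The caller, queue
 * set pointer and index below are hypothetical; the buffer is sized for
 * the largest of the three descriptor types.
 *
 *	unsigned char buf[MAX(sizeof(struct tx_desc),
 *	    MAX(sizeof(struct rsp_desc), sizeof(struct rx_desc)))];
 *	int len;
 *
 *	len = t3_get_desc(qs, 3, idx, buf);	// qnum 3 == response queue
 *	if (len == EINVAL)
 *		return (EINVAL);
 *	// buf now holds len bytes of the raw response descriptor
 */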