xref: /freebsd/sys/dev/cxgb/cxgb_sge.c (revision 8090c9f504c0c19831713ab2392d0993a5fc5b36)
1 /**************************************************************************
2 
3 Copyright (c) 2007, Chelsio Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Neither the name of the Chelsio Corporation nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15 
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27 
28 ***************************************************************************/
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/kernel.h>
36 #include <sys/module.h>
37 #include <sys/bus.h>
38 #include <sys/conf.h>
39 #include <machine/bus.h>
40 #include <machine/resource.h>
41 #include <sys/bus_dma.h>
42 #include <sys/rman.h>
43 #include <sys/queue.h>
44 #include <sys/sysctl.h>
45 #include <sys/taskqueue.h>
46 
47 #include <sys/proc.h>
48 #include <sys/sched.h>
49 #include <sys/smp.h>
50 #include <sys/systm.h>
51 #include <sys/syslog.h>
52 
53 #include <netinet/in_systm.h>
54 #include <netinet/in.h>
55 #include <netinet/ip.h>
56 #include <netinet/tcp.h>
57 
58 #include <dev/pci/pcireg.h>
59 #include <dev/pci/pcivar.h>
60 
61 #include <vm/vm.h>
62 #include <vm/pmap.h>
63 
64 #ifdef CONFIG_DEFINED
65 #include <cxgb_include.h>
66 #include <sys/mvec.h>
67 #else
68 #include <dev/cxgb/cxgb_include.h>
69 #include <dev/cxgb/sys/mvec.h>
70 #endif
71 
72 int      txq_fills = 0;
73 static int bogus_imm = 0;
74 #ifndef DISABLE_MBUF_IOVEC
75 static int recycle_enable = 1;
76 #endif
77 extern int cxgb_txq_buf_ring_size;
78 int cxgb_cached_allocations;
79 int cxgb_cached;
80 int cxgb_ext_freed;
81 
82 #define USE_GTS 0
83 
84 #define SGE_RX_SM_BUF_SIZE	1536
85 #define SGE_RX_DROP_THRES	16
86 #define SGE_RX_COPY_THRES	128
87 
88 /*
89  * Period of the Tx buffer reclaim timer.  This timer does not need to run
90  * frequently as Tx buffers are usually reclaimed by new Tx packets.
91  */
92 #define TX_RECLAIM_PERIOD       (hz >> 1)
93 
94 /*
95  * work request size in bytes
96  */
97 #define WR_LEN (WR_FLITS * 8)
98 
99 /*
100  * Values for sge_txq.flags
101  */
102 enum {
103 	TXQ_RUNNING	= 1 << 0,  /* fetch engine is running */
104 	TXQ_LAST_PKT_DB = 1 << 1,  /* last packet rang the doorbell */
105 };
106 
107 struct tx_desc {
108 	uint64_t	flit[TX_DESC_FLITS];
109 } __packed;
110 
111 struct rx_desc {
112 	uint32_t	addr_lo;
113 	uint32_t	len_gen;
114 	uint32_t	gen2;
115 	uint32_t	addr_hi;
116 } __packed;;
117 
118 struct rsp_desc {               /* response queue descriptor */
119 	struct rss_header	rss_hdr;
120 	uint32_t		flags;
121 	uint32_t		len_cq;
122 	uint8_t			imm_data[47];
123 	uint8_t			intr_gen;
124 } __packed;
125 
126 #define RX_SW_DESC_MAP_CREATED	(1 << 0)
127 #define TX_SW_DESC_MAP_CREATED	(1 << 1)
128 #define RX_SW_DESC_INUSE        (1 << 3)
129 #define TX_SW_DESC_MAPPED       (1 << 4)
130 
131 #define RSPQ_NSOP_NEOP           G_RSPD_SOP_EOP(0)
132 #define RSPQ_EOP                 G_RSPD_SOP_EOP(F_RSPD_EOP)
133 #define RSPQ_SOP                 G_RSPD_SOP_EOP(F_RSPD_SOP)
134 #define RSPQ_SOP_EOP             G_RSPD_SOP_EOP(F_RSPD_SOP|F_RSPD_EOP)
135 
136 struct tx_sw_desc {                /* SW state per Tx descriptor */
137 	struct mbuf_iovec mi;
138 	bus_dmamap_t	map;
139 	int		flags;
140 };
141 
142 struct rx_sw_desc {                /* SW state per Rx descriptor */
143 	caddr_t	         rxsd_cl;
144 	uint32_t         *rxsd_ref;
145 	caddr_t	         data;
146 	bus_dmamap_t	  map;
147 	int		  flags;
148 };
149 
150 struct txq_state {
151 	unsigned int compl;
152 	unsigned int gen;
153 	unsigned int pidx;
154 };
155 
156 struct refill_fl_cb_arg {
157 	int               error;
158 	bus_dma_segment_t seg;
159 	int               nseg;
160 };
161 
162 /*
163  * Maps a number of flits to the number of Tx descriptors that can hold them.
164  * The formula is
165  *
166  * desc = 1 + (flits - 2) / (WR_FLITS - 1).
167  *
168  * HW allows up to 4 descriptors to be combined into a WR.
169  */
170 static uint8_t flit_desc_map[] = {
171 	0,
172 #if SGE_NUM_GENBITS == 1
173 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
174 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
175 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
176 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
177 #elif SGE_NUM_GENBITS == 2
178 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
179 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
180 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
181 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
182 #else
183 # error "SGE_NUM_GENBITS must be 1 or 2"
184 #endif
185 };
186 
187 
188 static int lro_default = 0;
189 int cxgb_debug = 0;
190 
191 static void sge_timer_cb(void *arg);
192 static void sge_timer_reclaim(void *arg, int ncount);
193 static void sge_txq_reclaim_handler(void *arg, int ncount);
194 
195 /**
196  *	reclaim_completed_tx - reclaims completed Tx descriptors
197  *	@adapter: the adapter
198  *	@q: the Tx queue to reclaim completed descriptors from
199  *
200  *	Reclaims Tx descriptors that the SGE has indicated it has processed,
201  *	and frees the associated buffers if possible.  Called with the Tx
202  *	queue's lock held.
203  */
204 static __inline int
205 reclaim_completed_tx(struct sge_txq *q)
206 {
207 	int reclaim = desc_reclaimable(q);
208 
209 	mtx_assert(&q->lock, MA_OWNED);
210 	if (reclaim > 0) {
211 		t3_free_tx_desc(q, reclaim);
212 		q->cleaned += reclaim;
213 		q->in_use -= reclaim;
214 	}
215 	return (reclaim);
216 }
217 
218 /**
219  *	should_restart_tx - are there enough resources to restart a Tx queue?
220  *	@q: the Tx queue
221  *
222  *	Checks if there are enough descriptors to restart a suspended Tx queue.
223  */
224 static __inline int
225 should_restart_tx(const struct sge_txq *q)
226 {
227 	unsigned int r = q->processed - q->cleaned;
228 
229 	return q->in_use - r < (q->size >> 1);
230 }
231 
232 /**
233  *	t3_sge_init - initialize SGE
234  *	@adap: the adapter
235  *	@p: the SGE parameters
236  *
237  *	Performs SGE initialization needed every time after a chip reset.
238  *	We do not initialize any of the queue sets here, instead the driver
239  *	top-level must request those individually.  We also do not enable DMA
240  *	here, that should be done after the queues have been set up.
241  */
242 void
243 t3_sge_init(adapter_t *adap, struct sge_params *p)
244 {
245 	u_int ctrl, ups;
246 
247 	ups = 0; /* = ffs(pci_resource_len(adap->pdev, 2) >> 12); */
248 
249 	ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL |
250 	       F_CQCRDTCTRL |
251 	       V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS |
252 	       V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING;
253 #if SGE_NUM_GENBITS == 1
254 	ctrl |= F_EGRGENCTRL;
255 #endif
256 	if (adap->params.rev > 0) {
257 		if (!(adap->flags & (USING_MSIX | USING_MSI)))
258 			ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ;
259 		ctrl |= F_CQCRDTCTRL | F_AVOIDCQOVFL;
260 	}
261 	t3_write_reg(adap, A_SG_CONTROL, ctrl);
262 	t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) |
263 		     V_LORCQDRBTHRSH(512));
264 	t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10);
265 	t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) |
266 		     V_TIMEOUT(200 * core_ticks_per_usec(adap)));
267 	t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH, 1000);
268 	t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256);
269 	t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000);
270 	t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256);
271 	t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff));
272 	t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024);
273 }
274 
275 
276 /**
277  *	sgl_len - calculates the size of an SGL of the given capacity
278  *	@n: the number of SGL entries
279  *
280  *	Calculates the number of flits needed for a scatter/gather list that
281  *	can hold the given number of entries.
282  */
283 static __inline unsigned int
284 sgl_len(unsigned int n)
285 {
286 	return ((3 * n) / 2 + (n & 1));
287 }
288 
289 /**
290  *	get_imm_packet - return the next ingress packet buffer from a response
291  *	@resp: the response descriptor containing the packet data
292  *
293  *	Return a packet containing the immediate data of the given response.
294  */
295 #ifdef DISABLE_MBUF_IOVEC
296 static __inline int
297 get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct t3_mbuf_hdr *mh)
298 {
299 		return (0);
300 
301 	m = m_gethdr(M_DONTWAIT, MT_DATA);
302 	len = IMMED_PKT_SIZE;
303 
304 	if (m) {
305 		memcpy(m->m_data, resp->imm_data, IMMED_PKT_SIZE);
306 		m->m_pkthdr.len = m->m_len = len;
307 	}
308 	return (m != NULL);
309 }
310 
311 #else
312 static int
313 get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct mbuf *m, void *cl, uint32_t flags)
314 {
315 
316 	m->m_len = m->m_pkthdr.len = IMMED_PKT_SIZE;
317 	memcpy(mtod(m, uint8_t *), resp->imm_data, IMMED_PKT_SIZE);
318 	return (0);
319 
320 }
321 #endif
322 
323 static __inline u_int
324 flits_to_desc(u_int n)
325 {
326 	return (flit_desc_map[n]);
327 }
328 
329 void
330 t3_sge_err_intr_handler(adapter_t *adapter)
331 {
332 	unsigned int v, status;
333 
334 
335 	status = t3_read_reg(adapter, A_SG_INT_CAUSE);
336 
337 	if (status & F_RSPQCREDITOVERFOW)
338 		CH_ALERT(adapter, "SGE response queue credit overflow\n");
339 
340 	if (status & F_RSPQDISABLED) {
341 		v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS);
342 
343 		CH_ALERT(adapter,
344 			 "packet delivered to disabled response queue (0x%x)\n",
345 			 (v >> S_RSPQ0DISABLED) & 0xff);
346 	}
347 
348 	t3_write_reg(adapter, A_SG_INT_CAUSE, status);
349 	if (status & (F_RSPQCREDITOVERFOW | F_RSPQDISABLED))
350 		t3_fatal_err(adapter);
351 }
352 
353 void
354 t3_sge_prep(adapter_t *adap, struct sge_params *p)
355 {
356 	int i;
357 
358 	/* XXX Does ETHER_ALIGN need to be accounted for here? */
359 	p->max_pkt_size = MJUM16BYTES - sizeof(struct cpl_rx_data);
360 
361 	for (i = 0; i < SGE_QSETS; ++i) {
362 		struct qset_params *q = p->qset + i;
363 
364 		q->polling = adap->params.rev > 0;
365 
366 		if (adap->params.nports > 2) {
367 			q->coalesce_nsecs = 50000;
368 		} else {
369 #ifdef INVARIANTS
370 			q->coalesce_nsecs = 20000;
371 #else
372 			q->coalesce_nsecs = 5000;
373 #endif
374 		}
375 		q->rspq_size = RSPQ_Q_SIZE;
376 		q->fl_size = FL_Q_SIZE;
377 		q->jumbo_size = JUMBO_Q_SIZE;
378 		q->txq_size[TXQ_ETH] = TX_ETH_Q_SIZE;
379 		q->txq_size[TXQ_OFLD] = 1024;
380 		q->txq_size[TXQ_CTRL] = 256;
381 		q->cong_thres = 0;
382 	}
383 }
384 
385 int
386 t3_sge_alloc(adapter_t *sc)
387 {
388 
389 	/* The parent tag. */
390 	if (bus_dma_tag_create( NULL,			/* parent */
391 				1, 0,			/* algnmnt, boundary */
392 				BUS_SPACE_MAXADDR,	/* lowaddr */
393 				BUS_SPACE_MAXADDR,	/* highaddr */
394 				NULL, NULL,		/* filter, filterarg */
395 				BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
396 				BUS_SPACE_UNRESTRICTED, /* nsegments */
397 				BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
398 				0,			/* flags */
399 				NULL, NULL,		/* lock, lockarg */
400 				&sc->parent_dmat)) {
401 		device_printf(sc->dev, "Cannot allocate parent DMA tag\n");
402 		return (ENOMEM);
403 	}
404 
405 	/*
406 	 * DMA tag for normal sized RX frames
407 	 */
408 	if (bus_dma_tag_create(sc->parent_dmat, MCLBYTES, 0, BUS_SPACE_MAXADDR,
409 		BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, 1,
410 		MCLBYTES, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_dmat)) {
411 		device_printf(sc->dev, "Cannot allocate RX DMA tag\n");
412 		return (ENOMEM);
413 	}
414 
415 	/*
416 	 * DMA tag for jumbo sized RX frames.
417 	 */
418 	if (bus_dma_tag_create(sc->parent_dmat, MJUMPAGESIZE, 0, BUS_SPACE_MAXADDR,
419 		BUS_SPACE_MAXADDR, NULL, NULL, MJUMPAGESIZE, 1, MJUMPAGESIZE,
420 		BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_jumbo_dmat)) {
421 		device_printf(sc->dev, "Cannot allocate RX jumbo DMA tag\n");
422 		return (ENOMEM);
423 	}
424 
425 	/*
426 	 * DMA tag for TX frames.
427 	 */
428 	if (bus_dma_tag_create(sc->parent_dmat, 1, 0, BUS_SPACE_MAXADDR,
429 		BUS_SPACE_MAXADDR, NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
430 		TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
431 		NULL, NULL, &sc->tx_dmat)) {
432 		device_printf(sc->dev, "Cannot allocate TX DMA tag\n");
433 		return (ENOMEM);
434 	}
435 
436 	return (0);
437 }
438 
439 int
440 t3_sge_free(struct adapter * sc)
441 {
442 
443 	if (sc->tx_dmat != NULL)
444 		bus_dma_tag_destroy(sc->tx_dmat);
445 
446 	if (sc->rx_jumbo_dmat != NULL)
447 		bus_dma_tag_destroy(sc->rx_jumbo_dmat);
448 
449 	if (sc->rx_dmat != NULL)
450 		bus_dma_tag_destroy(sc->rx_dmat);
451 
452 	if (sc->parent_dmat != NULL)
453 		bus_dma_tag_destroy(sc->parent_dmat);
454 
455 	return (0);
456 }
457 
458 void
459 t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p)
460 {
461 
462 	qs->rspq.holdoff_tmr = max(p->coalesce_nsecs/100, 1U);
463 	qs->rspq.polling = 0 /* p->polling */;
464 }
465 
466 #if !defined(__i386__) && !defined(__amd64__)
467 static void
468 refill_fl_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
469 {
470 	struct refill_fl_cb_arg *cb_arg = arg;
471 
472 	cb_arg->error = error;
473 	cb_arg->seg = segs[0];
474 	cb_arg->nseg = nseg;
475 
476 }
477 #endif
478 /**
479  *	refill_fl - refill an SGE free-buffer list
480  *	@sc: the controller softc
481  *	@q: the free-list to refill
482  *	@n: the number of new buffers to allocate
483  *
484  *	(Re)populate an SGE free-buffer list with up to @n new packet buffers.
485  *	The caller must assure that @n does not exceed the queue's capacity.
486  */
487 static void
488 refill_fl(adapter_t *sc, struct sge_fl *q, int n)
489 {
490 	struct rx_sw_desc *sd = &q->sdesc[q->pidx];
491 	struct rx_desc *d = &q->desc[q->pidx];
492 	struct refill_fl_cb_arg cb_arg;
493 	caddr_t cl;
494 	int err;
495 
496 	cb_arg.error = 0;
497 	while (n--) {
498 		/*
499 		 * We only allocate a cluster, mbuf allocation happens after rx
500 		 */
501 		if ((cl = cxgb_cache_get(q->zone)) == NULL) {
502 			log(LOG_WARNING, "Failed to allocate cluster\n");
503 			goto done;
504 		}
505 
506 		if ((sd->flags & RX_SW_DESC_MAP_CREATED) == 0) {
507 			if ((err = bus_dmamap_create(q->entry_tag, 0, &sd->map))) {
508 				log(LOG_WARNING, "bus_dmamap_create failed %d\n", err);
509 				uma_zfree(q->zone, cl);
510 				goto done;
511 			}
512 			sd->flags |= RX_SW_DESC_MAP_CREATED;
513 		}
514 #if !defined(__i386__) && !defined(__amd64__)
515 		err = bus_dmamap_load(q->entry_tag, sd->map,
516 		    cl + sizeof(struct m_hdr) + sizeof(struct pkthdr) + sizeof(struct m_ext_) + sizeof(uint32_t), q->buf_size,
517 		    refill_fl_cb, &cb_arg, 0);
518 
519 		if (err != 0 || cb_arg.error) {
520 			log(LOG_WARNING, "failure in refill_fl %d\n", cb_arg.error);
521 			/*
522 			 * XXX free cluster
523 			 */
524 			return;
525 		}
526 #else
527 		cb_arg.seg.ds_addr = pmap_kextract((vm_offset_t)(cl + sizeof(struct m_hdr) +
528 			sizeof(struct pkthdr) + sizeof(struct m_ext_) + sizeof(uint32_t)));
529 #endif
530 		sd->flags |= RX_SW_DESC_INUSE;
531 		sd->rxsd_cl = cl;
532 		sd->rxsd_ref = (uint32_t *)(cl + sizeof(struct m_hdr) + sizeof(struct pkthdr) + sizeof(struct m_ext_));
533 		sd->data = cl + sizeof(struct m_hdr) + sizeof(struct pkthdr) + sizeof(struct m_ext_) + sizeof(uint32_t);
534 		d->addr_lo = htobe32(cb_arg.seg.ds_addr & 0xffffffff);
535 		d->addr_hi = htobe32(((uint64_t)cb_arg.seg.ds_addr >>32) & 0xffffffff);
536 		d->len_gen = htobe32(V_FLD_GEN1(q->gen));
537 		d->gen2 = htobe32(V_FLD_GEN2(q->gen));
538 
539 		d++;
540 		sd++;
541 
542 		if (++q->pidx == q->size) {
543 			q->pidx = 0;
544 			q->gen ^= 1;
545 			sd = q->sdesc;
546 			d = q->desc;
547 		}
548 		q->credits++;
549 	}
550 
551 done:
552 	t3_write_reg(sc, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
553 }
554 
555 
556 /**
557  *	free_rx_bufs - free the Rx buffers on an SGE free list
558  *	@sc: the controle softc
559  *	@q: the SGE free list to clean up
560  *
561  *	Release the buffers on an SGE free-buffer Rx queue.  HW fetching from
562  *	this queue should be stopped before calling this function.
563  */
564 static void
565 free_rx_bufs(adapter_t *sc, struct sge_fl *q)
566 {
567 	u_int cidx = q->cidx;
568 
569 	while (q->credits--) {
570 		struct rx_sw_desc *d = &q->sdesc[cidx];
571 
572 		if (d->flags & RX_SW_DESC_INUSE) {
573 			bus_dmamap_unload(q->entry_tag, d->map);
574 			bus_dmamap_destroy(q->entry_tag, d->map);
575 			uma_zfree(q->zone, d->rxsd_cl);
576 		}
577 		d->rxsd_cl = NULL;
578 		if (++cidx == q->size)
579 			cidx = 0;
580 	}
581 }
582 
583 static __inline void
584 __refill_fl(adapter_t *adap, struct sge_fl *fl)
585 {
586 	refill_fl(adap, fl, min(16U, fl->size - fl->credits));
587 }
588 
589 static __inline void
590 __refill_fl_lt(adapter_t *adap, struct sge_fl *fl, int max)
591 {
592 	if ((fl->size - fl->credits) < max)
593 		refill_fl(adap, fl, min(max, fl->size - fl->credits));
594 }
595 
596 void
597 refill_fl_service(adapter_t *adap, struct sge_fl *fl)
598 {
599 	__refill_fl_lt(adap, fl, 512);
600 }
601 
602 #ifndef DISABLE_MBUF_IOVEC
603 /**
604  *	recycle_rx_buf - recycle a receive buffer
605  *	@adapter: the adapter
606  *	@q: the SGE free list
607  *	@idx: index of buffer to recycle
608  *
609  *	Recycles the specified buffer on the given free list by adding it at
610  *	the next available slot on the list.
611  */
612 static void
613 recycle_rx_buf(adapter_t *adap, struct sge_fl *q, unsigned int idx)
614 {
615 	struct rx_desc *from = &q->desc[idx];
616 	struct rx_desc *to   = &q->desc[q->pidx];
617 
618 	q->sdesc[q->pidx] = q->sdesc[idx];
619 	to->addr_lo = from->addr_lo;        // already big endian
620 	to->addr_hi = from->addr_hi;        // likewise
621 	wmb();
622 	to->len_gen = htobe32(V_FLD_GEN1(q->gen));
623 	to->gen2 = htobe32(V_FLD_GEN2(q->gen));
624 	q->credits++;
625 
626 	if (++q->pidx == q->size) {
627 		q->pidx = 0;
628 		q->gen ^= 1;
629 	}
630 	t3_write_reg(adap, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
631 }
632 #endif
633 
634 static void
635 alloc_ring_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
636 {
637 	uint32_t *addr;
638 
639 	addr = arg;
640 	*addr = segs[0].ds_addr;
641 }
642 
643 static int
644 alloc_ring(adapter_t *sc, size_t nelem, size_t elem_size, size_t sw_size,
645     bus_addr_t *phys, void *desc, void *sdesc, bus_dma_tag_t *tag,
646     bus_dmamap_t *map, bus_dma_tag_t parent_entry_tag, bus_dma_tag_t *entry_tag)
647 {
648 	size_t len = nelem * elem_size;
649 	void *s = NULL;
650 	void *p = NULL;
651 	int err;
652 
653 	if ((err = bus_dma_tag_create(sc->parent_dmat, PAGE_SIZE, 0,
654 				      BUS_SPACE_MAXADDR_32BIT,
655 				      BUS_SPACE_MAXADDR, NULL, NULL, len, 1,
656 				      len, 0, NULL, NULL, tag)) != 0) {
657 		device_printf(sc->dev, "Cannot allocate descriptor tag\n");
658 		return (ENOMEM);
659 	}
660 
661 	if ((err = bus_dmamem_alloc(*tag, (void **)&p, BUS_DMA_NOWAIT,
662 				    map)) != 0) {
663 		device_printf(sc->dev, "Cannot allocate descriptor memory\n");
664 		return (ENOMEM);
665 	}
666 
667 	bus_dmamap_load(*tag, *map, p, len, alloc_ring_cb, phys, 0);
668 	bzero(p, len);
669 	*(void **)desc = p;
670 
671 	if (sw_size) {
672 		len = nelem * sw_size;
673 		s = malloc(len, M_DEVBUF, M_WAITOK);
674 		bzero(s, len);
675 		*(void **)sdesc = s;
676 	}
677 	if (parent_entry_tag == NULL)
678 		return (0);
679 
680 	if ((err = bus_dma_tag_create(parent_entry_tag, 1, 0,
681 				      BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR,
682 		                      NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
683 				      TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
684 		                      NULL, NULL, entry_tag)) != 0) {
685 		device_printf(sc->dev, "Cannot allocate descriptor entry tag\n");
686 		return (ENOMEM);
687 	}
688 	return (0);
689 }
690 
691 static void
692 sge_slow_intr_handler(void *arg, int ncount)
693 {
694 	adapter_t *sc = arg;
695 
696 	t3_slow_intr_handler(sc);
697 }
698 
699 /**
700  *	sge_timer_cb - perform periodic maintenance of an SGE qset
701  *	@data: the SGE queue set to maintain
702  *
703  *	Runs periodically from a timer to perform maintenance of an SGE queue
704  *	set.  It performs two tasks:
705  *
706  *	a) Cleans up any completed Tx descriptors that may still be pending.
707  *	Normal descriptor cleanup happens when new packets are added to a Tx
708  *	queue so this timer is relatively infrequent and does any cleanup only
709  *	if the Tx queue has not seen any new packets in a while.  We make a
710  *	best effort attempt to reclaim descriptors, in that we don't wait
711  *	around if we cannot get a queue's lock (which most likely is because
712  *	someone else is queueing new packets and so will also handle the clean
713  *	up).  Since control queues use immediate data exclusively we don't
714  *	bother cleaning them up here.
715  *
716  *	b) Replenishes Rx queues that have run out due to memory shortage.
717  *	Normally new Rx buffers are added when existing ones are consumed but
718  *	when out of memory a queue can become empty.  We try to add only a few
719  *	buffers here, the queue will be replenished fully as these new buffers
720  *	are used up if memory shortage has subsided.
721  *
722  *	c) Return coalesced response queue credits in case a response queue is
723  *	starved.
724  *
725  *	d) Ring doorbells for T304 tunnel queues since we have seen doorbell
726  *	fifo overflows and the FW doesn't implement any recovery scheme yet.
727  */
728 static void
729 sge_timer_cb(void *arg)
730 {
731 	adapter_t *sc = arg;
732 #ifndef IFNET_MULTIQUEUE
733 	struct port_info *pi;
734 	struct sge_qset *qs;
735 	struct sge_txq  *txq;
736 	int i, j;
737 	int reclaim_eth, reclaim_ofl, refill_rx;
738 
739 	for (i = 0; i < sc->params.nports; i++)
740 		for (j = 0; j < sc->port[i].nqsets; j++) {
741 			qs = &sc->sge.qs[i + j];
742 			txq = &qs->txq[0];
743 			reclaim_eth = txq[TXQ_ETH].processed - txq[TXQ_ETH].cleaned;
744 			reclaim_ofl = txq[TXQ_OFLD].processed - txq[TXQ_OFLD].cleaned;
745 			refill_rx = ((qs->fl[0].credits < qs->fl[0].size) ||
746 			    (qs->fl[1].credits < qs->fl[1].size));
747 			if (reclaim_eth || reclaim_ofl || refill_rx) {
748 				pi = &sc->port[i];
749 				taskqueue_enqueue(pi->tq, &pi->timer_reclaim_task);
750 				break;
751 			}
752 		}
753 #endif
754 	if (sc->params.nports > 2) {
755 		int i;
756 
757 		for_each_port(sc, i) {
758 			struct port_info *pi = &sc->port[i];
759 
760 			t3_write_reg(sc, A_SG_KDOORBELL,
761 				     F_SELEGRCNTX |
762 				     (FW_TUNNEL_SGEEC_START + pi->first_qset));
763 		}
764 	}
765 	if (sc->open_device_map != 0)
766 		callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
767 }
768 
769 /*
770  * This is meant to be a catch-all function to keep sge state private
771  * to sge.c
772  *
773  */
774 int
775 t3_sge_init_adapter(adapter_t *sc)
776 {
777 	callout_init(&sc->sge_timer_ch, CALLOUT_MPSAFE);
778 	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
779 	TASK_INIT(&sc->slow_intr_task, 0, sge_slow_intr_handler, sc);
780 	mi_init();
781 	cxgb_cache_init();
782 	return (0);
783 }
784 
785 int
786 t3_sge_init_port(struct port_info *pi)
787 {
788 	TASK_INIT(&pi->timer_reclaim_task, 0, sge_timer_reclaim, pi);
789 	return (0);
790 }
791 
792 void
793 t3_sge_deinit_sw(adapter_t *sc)
794 {
795 	int i;
796 
797 	callout_drain(&sc->sge_timer_ch);
798 	if (sc->tq)
799 		taskqueue_drain(sc->tq, &sc->slow_intr_task);
800 	for (i = 0; i < sc->params.nports; i++)
801 		if (sc->port[i].tq != NULL)
802 			taskqueue_drain(sc->port[i].tq, &sc->port[i].timer_reclaim_task);
803 
804 	mi_deinit();
805 }
806 
807 /**
808  *	refill_rspq - replenish an SGE response queue
809  *	@adapter: the adapter
810  *	@q: the response queue to replenish
811  *	@credits: how many new responses to make available
812  *
813  *	Replenishes a response queue by making the supplied number of responses
814  *	available to HW.
815  */
816 static __inline void
817 refill_rspq(adapter_t *sc, const struct sge_rspq *q, u_int credits)
818 {
819 
820 	/* mbufs are allocated on demand when a rspq entry is processed. */
821 	t3_write_reg(sc, A_SG_RSPQ_CREDIT_RETURN,
822 		     V_RSPQ(q->cntxt_id) | V_CREDITS(credits));
823 }
824 
825 static __inline void
826 sge_txq_reclaim_(struct sge_txq *txq)
827 {
828 	int reclaimable, n;
829 	struct port_info *pi;
830 
831 	pi = txq->port;
832 reclaim_more:
833 	n = 0;
834 	reclaimable = desc_reclaimable(txq);
835 	if (reclaimable > 0 && mtx_trylock(&txq->lock)) {
836 		n = reclaim_completed_tx(txq);
837 		mtx_unlock(&txq->lock);
838 	}
839 	if (pi && pi->ifp->if_drv_flags & IFF_DRV_OACTIVE &&
840 	    txq->size - txq->in_use >= TX_START_MAX_DESC) {
841 		txq_fills++;
842 		pi->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
843 		taskqueue_enqueue(pi->tq, &pi->start_task);
844 	}
845 
846 	if (n)
847 		goto reclaim_more;
848 }
849 
850 static void
851 sge_txq_reclaim_handler(void *arg, int ncount)
852 {
853 	struct sge_txq *q = arg;
854 
855 	sge_txq_reclaim_(q);
856 }
857 
858 static void
859 sge_timer_reclaim(void *arg, int ncount)
860 {
861 	struct port_info *pi = arg;
862 	int i, nqsets = pi->nqsets;
863 	adapter_t *sc = pi->adapter;
864 	struct sge_qset *qs;
865 	struct sge_txq *txq;
866 	struct mtx *lock;
867 
868 #ifdef IFNET_MULTIQUEUE
869 	panic("%s should not be called with multiqueue support\n", __FUNCTION__);
870 #endif
871 	for (i = 0; i < nqsets; i++) {
872 		qs = &sc->sge.qs[i];
873 		txq = &qs->txq[TXQ_ETH];
874 		sge_txq_reclaim_(txq);
875 
876 		txq = &qs->txq[TXQ_OFLD];
877 		sge_txq_reclaim_(txq);
878 
879 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
880 			    &sc->sge.qs[0].rspq.lock;
881 
882 		if (mtx_trylock(lock)) {
883 			/* XXX currently assume that we are *NOT* polling */
884 			uint32_t status = t3_read_reg(sc, A_SG_RSPQ_FL_STATUS);
885 
886 			if (qs->fl[0].credits < qs->fl[0].size - 16)
887 				__refill_fl(sc, &qs->fl[0]);
888 			if (qs->fl[1].credits < qs->fl[1].size - 16)
889 				__refill_fl(sc, &qs->fl[1]);
890 
891 			if (status & (1 << qs->rspq.cntxt_id)) {
892 				if (qs->rspq.credits) {
893 					refill_rspq(sc, &qs->rspq, 1);
894 					qs->rspq.credits--;
895 					t3_write_reg(sc, A_SG_RSPQ_FL_STATUS,
896 					    1 << qs->rspq.cntxt_id);
897 				}
898 			}
899 			mtx_unlock(lock);
900 		}
901 	}
902 }
903 
904 /**
905  *	init_qset_cntxt - initialize an SGE queue set context info
906  *	@qs: the queue set
907  *	@id: the queue set id
908  *
909  *	Initializes the TIDs and context ids for the queues of a queue set.
910  */
911 static void
912 init_qset_cntxt(struct sge_qset *qs, u_int id)
913 {
914 
915 	qs->rspq.cntxt_id = id;
916 	qs->fl[0].cntxt_id = 2 * id;
917 	qs->fl[1].cntxt_id = 2 * id + 1;
918 	qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id;
919 	qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id;
920 	qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id;
921 	qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id;
922 	qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id;
923 
924 	mbufq_init(&qs->txq[TXQ_ETH].sendq);
925 	mbufq_init(&qs->txq[TXQ_OFLD].sendq);
926 	mbufq_init(&qs->txq[TXQ_CTRL].sendq);
927 }
928 
929 
930 static void
931 txq_prod(struct sge_txq *txq, unsigned int ndesc, struct txq_state *txqs)
932 {
933 	txq->in_use += ndesc;
934 	/*
935 	 * XXX we don't handle stopping of queue
936 	 * presumably start handles this when we bump against the end
937 	 */
938 	txqs->gen = txq->gen;
939 	txq->unacked += ndesc;
940 	txqs->compl = (txq->unacked & 8) << (S_WR_COMPL - 3);
941 	txq->unacked &= 7;
942 	txqs->pidx = txq->pidx;
943 	txq->pidx += ndesc;
944 
945 	if (txq->pidx >= txq->size) {
946 		txq->pidx -= txq->size;
947 		txq->gen ^= 1;
948 	}
949 
950 }
951 
952 /**
953  *	calc_tx_descs - calculate the number of Tx descriptors for a packet
954  *	@m: the packet mbufs
955  *      @nsegs: the number of segments
956  *
957  * 	Returns the number of Tx descriptors needed for the given Ethernet
958  * 	packet.  Ethernet packets require addition of WR and CPL headers.
959  */
960 static __inline unsigned int
961 calc_tx_descs(const struct mbuf *m, int nsegs)
962 {
963 	unsigned int flits;
964 
965 	if (m->m_pkthdr.len <= WR_LEN - sizeof(struct cpl_tx_pkt))
966 		return 1;
967 
968 	flits = sgl_len(nsegs) + 2;
969 #ifdef TSO_SUPPORTED
970 	if (m->m_pkthdr.csum_flags & CSUM_TSO)
971 		flits++;
972 #endif
973 	return flits_to_desc(flits);
974 }
975 
976 static unsigned int
977 busdma_map_mbufs(struct mbuf **m, struct sge_txq *txq,
978     struct tx_sw_desc *txsd, bus_dma_segment_t *segs, int *nsegs)
979 {
980 	struct mbuf *m0;
981 	int err, pktlen, pass = 0;
982 
983 retry:
984 	err = 0;
985 	m0 = *m;
986 	pktlen = m0->m_pkthdr.len;
987 #if defined(__i386__) || defined(__amd64__)
988 	if (busdma_map_sg_collapse(m, segs, nsegs) == 0) {
989 		goto done;
990 	} else
991 #endif
992 		err = bus_dmamap_load_mbuf_sg(txq->entry_tag, txsd->map, m0, segs, nsegs, 0);
993 
994 	if (err == 0) {
995 		goto done;
996 	}
997 	if (err == EFBIG && pass == 0) {
998 		pass = 1;
999 		/* Too many segments, try to defrag */
1000 		m0 = m_defrag(m0, M_DONTWAIT);
1001 		if (m0 == NULL) {
1002 			m_freem(*m);
1003 			*m = NULL;
1004 			return (ENOBUFS);
1005 		}
1006 		*m = m0;
1007 		goto retry;
1008 	} else if (err == ENOMEM) {
1009 		return (err);
1010 	} if (err) {
1011 		if (cxgb_debug)
1012 			printf("map failure err=%d pktlen=%d\n", err, pktlen);
1013 		m_freem(m0);
1014 		*m = NULL;
1015 		return (err);
1016 	}
1017 done:
1018 #if !defined(__i386__) && !defined(__amd64__)
1019 	bus_dmamap_sync(txq->entry_tag, txsd->map, BUS_DMASYNC_PREWRITE);
1020 #endif
1021 	txsd->flags |= TX_SW_DESC_MAPPED;
1022 
1023 	return (0);
1024 }
1025 
1026 /**
1027  *	make_sgl - populate a scatter/gather list for a packet
1028  *	@sgp: the SGL to populate
1029  *	@segs: the packet dma segments
1030  *	@nsegs: the number of segments
1031  *
1032  *	Generates a scatter/gather list for the buffers that make up a packet
1033  *	and returns the SGL size in 8-byte words.  The caller must size the SGL
1034  *	appropriately.
1035  */
1036 static __inline void
1037 make_sgl(struct sg_ent *sgp, bus_dma_segment_t *segs, int nsegs)
1038 {
1039 	int i, idx;
1040 
1041 	for (idx = 0, i = 0; i < nsegs; i++) {
1042 		/*
1043 		 * firmware doesn't like empty segments
1044 		 */
1045 		if (segs[i].ds_len == 0)
1046 			continue;
1047 		if (i && idx == 0)
1048 			++sgp;
1049 
1050 		sgp->len[idx] = htobe32(segs[i].ds_len);
1051 		sgp->addr[idx] = htobe64(segs[i].ds_addr);
1052 		idx ^= 1;
1053 	}
1054 
1055 	if (idx)
1056 		sgp->len[idx] = 0;
1057 }
1058 
1059 /**
1060  *	check_ring_tx_db - check and potentially ring a Tx queue's doorbell
1061  *	@adap: the adapter
1062  *	@q: the Tx queue
1063  *
1064  *	Ring the doorbel if a Tx queue is asleep.  There is a natural race,
1065  *	where the HW is going to sleep just after we checked, however,
1066  *	then the interrupt handler will detect the outstanding TX packet
1067  *	and ring the doorbell for us.
1068  *
1069  *	When GTS is disabled we unconditionally ring the doorbell.
1070  */
1071 static __inline void
1072 check_ring_tx_db(adapter_t *adap, struct sge_txq *q)
1073 {
1074 #if USE_GTS
1075 	clear_bit(TXQ_LAST_PKT_DB, &q->flags);
1076 	if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) {
1077 		set_bit(TXQ_LAST_PKT_DB, &q->flags);
1078 #ifdef T3_TRACE
1079 		T3_TRACE1(adap->tb[q->cntxt_id & 7], "doorbell Tx, cntxt %d",
1080 			  q->cntxt_id);
1081 #endif
1082 		t3_write_reg(adap, A_SG_KDOORBELL,
1083 			     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1084 	}
1085 #else
1086 	wmb();            /* write descriptors before telling HW */
1087 	t3_write_reg(adap, A_SG_KDOORBELL,
1088 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1089 #endif
1090 }
1091 
1092 static __inline void
1093 wr_gen2(struct tx_desc *d, unsigned int gen)
1094 {
1095 #if SGE_NUM_GENBITS == 2
1096 	d->flit[TX_DESC_FLITS - 1] = htobe64(gen);
1097 #endif
1098 }
1099 
1100 #if 0
1101 static int print_wr = 0;
1102 static __inline void
1103 do_print_wr(struct tx_desc *d, int flits)
1104 {
1105 	int i = 0;
1106 
1107 	if (print_wr)
1108 		while (flits--) {
1109 			printf("flit[%d]: 0x%016lx\n", i, d->flit[i]);
1110 			i++;
1111 		}
1112 }
1113 #endif
1114 
1115 
1116 /**
1117  *	write_wr_hdr_sgl - write a WR header and, optionally, SGL
1118  *	@ndesc: number of Tx descriptors spanned by the SGL
1119  *	@txd: first Tx descriptor to be written
1120  *	@txqs: txq state (generation and producer index)
1121  *	@txq: the SGE Tx queue
1122  *	@sgl: the SGL
1123  *	@flits: number of flits to the start of the SGL in the first descriptor
1124  *	@sgl_flits: the SGL size in flits
1125  *	@wr_hi: top 32 bits of WR header based on WR type (big endian)
1126  *	@wr_lo: low 32 bits of WR header based on WR type (big endian)
1127  *
1128  *	Write a work request header and an associated SGL.  If the SGL is
1129  *	small enough to fit into one Tx descriptor it has already been written
1130  *	and we just need to write the WR header.  Otherwise we distribute the
1131  *	SGL across the number of descriptors it spans.
1132  */
1133 static void
1134 write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs,
1135     const struct sge_txq *txq, const struct sg_ent *sgl, unsigned int flits,
1136     unsigned int sgl_flits, unsigned int wr_hi, unsigned int wr_lo)
1137 {
1138 
1139 	struct work_request_hdr *wrp = (struct work_request_hdr *)txd;
1140 	struct tx_sw_desc *txsd = &txq->sdesc[txqs->pidx];
1141 
1142 	if (__predict_true(ndesc == 1)) {
1143 		wrp->wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1144 		    V_WR_SGLSFLT(flits)) | wr_hi;
1145 		wmb();
1146 		wrp->wr_lo = htonl(V_WR_LEN(flits + sgl_flits) |
1147 		    V_WR_GEN(txqs->gen)) | wr_lo;
1148 		/* XXX gen? */
1149 		wr_gen2(txd, txqs->gen);
1150 
1151 	} else {
1152 		unsigned int ogen = txqs->gen;
1153 		const uint64_t *fp = (const uint64_t *)sgl;
1154 		struct work_request_hdr *wp = wrp;
1155 
1156 		wrp->wr_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) |
1157 		    V_WR_SGLSFLT(flits)) | wr_hi;
1158 
1159 		while (sgl_flits) {
1160 			unsigned int avail = WR_FLITS - flits;
1161 
1162 			if (avail > sgl_flits)
1163 				avail = sgl_flits;
1164 			memcpy(&txd->flit[flits], fp, avail * sizeof(*fp));
1165 			sgl_flits -= avail;
1166 			ndesc--;
1167 			if (!sgl_flits)
1168 				break;
1169 
1170 			fp += avail;
1171 			txd++;
1172 			txsd++;
1173 			if (++txqs->pidx == txq->size) {
1174 				txqs->pidx = 0;
1175 				txqs->gen ^= 1;
1176 				txd = txq->desc;
1177 				txsd = txq->sdesc;
1178 			}
1179 
1180 			/*
1181 			 * when the head of the mbuf chain
1182 			 * is freed all clusters will be freed
1183 			 * with it
1184 			 */
1185 			txsd->mi.mi_base = NULL;
1186 			wrp = (struct work_request_hdr *)txd;
1187 			wrp->wr_hi = htonl(V_WR_DATATYPE(1) |
1188 			    V_WR_SGLSFLT(1)) | wr_hi;
1189 			wrp->wr_lo = htonl(V_WR_LEN(min(WR_FLITS,
1190 				    sgl_flits + 1)) |
1191 			    V_WR_GEN(txqs->gen)) | wr_lo;
1192 			wr_gen2(txd, txqs->gen);
1193 			flits = 1;
1194 		}
1195 		wrp->wr_hi |= htonl(F_WR_EOP);
1196 		wmb();
1197 		wp->wr_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo;
1198 		wr_gen2((struct tx_desc *)wp, ogen);
1199 	}
1200 }
1201 
1202 /* sizeof(*eh) + sizeof(*vhdr) + sizeof(*ip) + sizeof(*tcp) */
1203 #define TCPPKTHDRSIZE (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + 20 + 20)
1204 
1205 #ifdef VLAN_SUPPORTED
1206 #define GET_VTAG(cntrl, m) \
1207 do { \
1208 	if ((m)->m_flags & M_VLANTAG)					            \
1209 		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN((m)->m_pkthdr.ether_vtag); \
1210 } while (0)
1211 
1212 #define GET_VTAG_MI(cntrl, mi) \
1213 do { \
1214 	if ((mi)->mi_flags & M_VLANTAG)					\
1215 		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN((mi)->mi_ether_vtag); \
1216 } while (0)
1217 #else
1218 #define GET_VTAG(cntrl, m)
1219 #define GET_VTAG_MI(cntrl, m)
1220 #endif
1221 
1222 int
1223 t3_encap(struct sge_qset *qs, struct mbuf **m, int count)
1224 {
1225 	adapter_t *sc;
1226 	struct mbuf *m0;
1227 	struct sge_txq *txq;
1228 	struct txq_state txqs;
1229 	struct port_info *pi;
1230 	unsigned int ndesc, flits, cntrl, mlen;
1231 	int err, nsegs, tso_info = 0;
1232 
1233 	struct work_request_hdr *wrp;
1234 	struct tx_sw_desc *txsd;
1235 	struct sg_ent *sgp, *sgl;
1236 	bus_dma_segment_t *segs;
1237 	uint32_t wr_hi, wr_lo, sgl_flits;
1238 
1239 	struct tx_desc *txd;
1240 	struct mbuf_vec *mv;
1241 	struct mbuf_iovec *mi;
1242 
1243 	DPRINTF("t3_encap cpu=%d ", curcpu);
1244 
1245 	pi = qs->port;
1246 	sc = pi->adapter;
1247 	txq = &qs->txq[TXQ_ETH];
1248 	txsd = &txq->sdesc[txq->pidx];
1249 	txd = &txq->desc[txq->pidx];
1250 	sgl = txq->txq_sgl;
1251 	segs = txq->txq_segs;
1252 	m0 = *m;
1253 	DPRINTF("t3_encap port_id=%d qsidx=%d ", pi->port_id, pi->first_qset);
1254 	DPRINTF("mlen=%d txpkt_intf=%d tx_chan=%d\n", m[0]->m_pkthdr.len, pi->txpkt_intf, pi->tx_chan);
1255 
1256 	cntrl = V_TXPKT_INTF(pi->txpkt_intf);
1257 /*
1258  * XXX need to add VLAN support for 6.x
1259  */
1260 #ifdef VLAN_SUPPORTED
1261 	if  (m0->m_pkthdr.csum_flags & (CSUM_TSO))
1262 		tso_info = V_LSO_MSS(m0->m_pkthdr.tso_segsz);
1263 #endif
1264 
1265 	if (count > 1) {
1266 		if ((err = busdma_map_sg_vec(m, &m0, segs, count)))
1267 			return (err);
1268 		nsegs = count;
1269 	} else if ((err = busdma_map_sg_collapse(&m0, segs, &nsegs))) {
1270 		if (cxgb_debug)
1271 			printf("failed ... err=%d\n", err);
1272 		return (err);
1273 	}
1274 	KASSERT(m0->m_pkthdr.len, ("empty packet nsegs=%d count=%d", nsegs, count));
1275 
1276 	if (m0->m_type == MT_DATA)
1277 		DPRINTF("mbuf type=%d tags:%d head=%p", m0->m_type, !SLIST_EMPTY(&m0->m_pkthdr.tags),
1278 		    SLIST_FIRST(&m0->m_pkthdr.tags));
1279 
1280 	mi_collapse_mbuf(&txsd->mi, m0);
1281 	mi = &txsd->mi;
1282 
1283 	if (count > 1) {
1284 		struct cpl_tx_pkt_batch *cpl_batch = (struct cpl_tx_pkt_batch *)txd;
1285 		int i, fidx;
1286 		struct mbuf_iovec *batchmi;
1287 
1288 		mv = mtomv(m0);
1289 		batchmi = mv->mv_vec;
1290 
1291 		wrp = (struct work_request_hdr *)txd;
1292 
1293 		flits = count*2 + 1;
1294 		txq_prod(txq, 1, &txqs);
1295 
1296 		for (fidx = 1, i = 0; i < count; i++, batchmi++, fidx += 2) {
1297 			struct cpl_tx_pkt_batch_entry *cbe = &cpl_batch->pkt_entry[i];
1298 
1299 			cntrl = V_TXPKT_INTF(pi->txpkt_intf);
1300 			GET_VTAG_MI(cntrl, batchmi);
1301 			cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1302 			cbe->cntrl = htonl(cntrl);
1303 			cbe->len = htonl(batchmi->mi_len | 0x80000000);
1304 			cbe->addr = htobe64(segs[i].ds_addr);
1305 			txd->flit[fidx] |= htobe64(1 << 24);
1306 		}
1307 
1308 		wrp->wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1309 		    V_WR_SGLSFLT(flits)) | htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1310 		wmb();
1311 		wrp->wr_lo = htonl(V_WR_LEN(flits) |
1312 		    V_WR_GEN(txqs.gen)) | htonl(V_WR_TID(txq->token));
1313 		/* XXX gen? */
1314 		wr_gen2(txd, txqs.gen);
1315 		check_ring_tx_db(sc, txq);
1316 
1317 		return (0);
1318 	} else if (tso_info) {
1319 		int undersized, eth_type;
1320 		struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *)txd;
1321 		struct ip *ip;
1322 		struct tcphdr *tcp;
1323 		char *pkthdr, tmp[TCPPKTHDRSIZE];
1324 		struct mbuf_vec *mv;
1325 		struct mbuf_iovec *tmpmi;
1326 
1327 		mv = mtomv(m0);
1328 		tmpmi = mv->mv_vec;
1329 
1330 		txd->flit[2] = 0;
1331 		GET_VTAG_MI(cntrl, mi);
1332 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO);
1333 		hdr->cntrl = htonl(cntrl);
1334 		mlen = m0->m_pkthdr.len;
1335 		hdr->len = htonl(mlen | 0x80000000);
1336 
1337 		DPRINTF("tso buf len=%d\n", mlen);
1338 		undersized = (((tmpmi->mi_len < TCPPKTHDRSIZE) &&
1339 			(m0->m_flags & M_VLANTAG)) ||
1340 		    (tmpmi->mi_len < TCPPKTHDRSIZE - ETHER_VLAN_ENCAP_LEN));
1341 		if (__predict_false(undersized)) {
1342 			pkthdr = tmp;
1343 			dump_mi(mi);
1344 			panic("discontig packet - fixxorz");
1345 		} else
1346 			pkthdr = m0->m_data;
1347 
1348 		if (__predict_false(m0->m_flags & M_VLANTAG)) {
1349 			eth_type = CPL_ETH_II_VLAN;
1350 			ip = (struct ip *)(pkthdr + ETHER_HDR_LEN +
1351 			    ETHER_VLAN_ENCAP_LEN);
1352 		} else {
1353 			eth_type = CPL_ETH_II;
1354 			ip = (struct ip *)(pkthdr + ETHER_HDR_LEN);
1355 		}
1356 		tcp = (struct tcphdr *)((uint8_t *)ip +
1357 		    sizeof(*ip));
1358 
1359 		tso_info |= V_LSO_ETH_TYPE(eth_type) |
1360 			    V_LSO_IPHDR_WORDS(ip->ip_hl) |
1361 			    V_LSO_TCPHDR_WORDS(tcp->th_off);
1362 		hdr->lso_info = htonl(tso_info);
1363 		flits = 3;
1364 	} else {
1365 		struct cpl_tx_pkt *cpl = (struct cpl_tx_pkt *)txd;
1366 
1367 		GET_VTAG(cntrl, m0);
1368 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1369 		cpl->cntrl = htonl(cntrl);
1370 		mlen = m0->m_pkthdr.len;
1371 		cpl->len = htonl(mlen | 0x80000000);
1372 
1373 		if (mlen <= WR_LEN - sizeof(*cpl)) {
1374 			txq_prod(txq, 1, &txqs);
1375 
1376 			DPRINTF("mlen==%d max=%ld\n", mlen, (WR_LEN - sizeof(*cpl)));
1377 			if (mi->mi_type != MT_IOVEC &&
1378 			    mi->mi_type != MT_CLIOVEC)
1379 				memcpy(&txd->flit[2], mi->mi_data, mlen);
1380 			else {
1381 				/*
1382 				 * XXX mbuf_iovec
1383 				 */
1384 #if 0
1385 				m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[2]);
1386 #endif
1387 				printf("bailing on m_copydata\n");
1388 			}
1389 			m_freem_iovec(&txsd->mi);
1390 			txsd->mi.mi_base = NULL;
1391 
1392 			flits = (mlen + 7) / 8 + 2;
1393 			cpl->wr.wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1394 					  V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1395 					  F_WR_SOP | F_WR_EOP | txqs.compl);
1396 			wmb();
1397 			cpl->wr.wr_lo = htonl(V_WR_LEN(flits) |
1398 			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1399 
1400 			wr_gen2(txd, txqs.gen);
1401 			check_ring_tx_db(sc, txq);
1402 			DPRINTF("pio buf\n");
1403 			return (0);
1404 		}
1405 		DPRINTF("regular buf\n");
1406 		flits = 2;
1407 	}
1408 	wrp = (struct work_request_hdr *)txd;
1409 
1410 #ifdef	nomore
1411 	/*
1412 	 * XXX need to move into one of the helper routines above
1413 	 *
1414 	 */
1415 	if ((err = busdma_map_mbufs(m, txq, txsd, segs, &nsegs)) != 0)
1416 		return (err);
1417 	m0 = *m;
1418 #endif
1419 	ndesc = calc_tx_descs(m0, nsegs);
1420 
1421 	sgp = (ndesc == 1) ? (struct sg_ent *)&txd->flit[flits] : sgl;
1422 	make_sgl(sgp, segs, nsegs);
1423 
1424 	sgl_flits = sgl_len(nsegs);
1425 
1426 	DPRINTF("make_sgl success nsegs==%d ndesc==%d\n", nsegs, ndesc);
1427 	txq_prod(txq, ndesc, &txqs);
1428 	wr_hi = htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1429 	wr_lo = htonl(V_WR_TID(txq->token));
1430 	write_wr_hdr_sgl(ndesc, txd, &txqs, txq, sgl, flits, sgl_flits, wr_hi, wr_lo);
1431 	check_ring_tx_db(pi->adapter, txq);
1432 
1433 	if ((m0->m_type == MT_DATA) && ((m0->m_flags & (M_EXT|M_NOFREE)) == M_EXT)) {
1434 		m0->m_flags &= ~M_EXT ;
1435 		m_free(m0);
1436 	}
1437 
1438 	return (0);
1439 }
1440 
1441 
1442 /**
1443  *	write_imm - write a packet into a Tx descriptor as immediate data
1444  *	@d: the Tx descriptor to write
1445  *	@m: the packet
1446  *	@len: the length of packet data to write as immediate data
1447  *	@gen: the generation bit value to write
1448  *
1449  *	Writes a packet as immediate data into a Tx descriptor.  The packet
1450  *	contains a work request at its beginning.  We must write the packet
1451  *	carefully so the SGE doesn't read accidentally before it's written in
1452  *	its entirety.
1453  */
1454 static __inline void
1455 write_imm(struct tx_desc *d, struct mbuf *m,
1456 	  unsigned int len, unsigned int gen)
1457 {
1458 	struct work_request_hdr *from = mtod(m, struct work_request_hdr *);
1459 	struct work_request_hdr *to = (struct work_request_hdr *)d;
1460 
1461 	if (len > WR_LEN)
1462 		panic("len too big %d\n", len);
1463 	if (len < sizeof(*from))
1464 		panic("len too small %d", len);
1465 
1466 	memcpy(&to[1], &from[1], len - sizeof(*from));
1467 	to->wr_hi = from->wr_hi | htonl(F_WR_SOP | F_WR_EOP |
1468 					V_WR_BCNTLFLT(len & 7));
1469 	wmb();
1470 	to->wr_lo = from->wr_lo | htonl(V_WR_GEN(gen) |
1471 					V_WR_LEN((len + 7) / 8));
1472 	wr_gen2(d, gen);
1473 
1474 	/*
1475 	 * This check is a hack we should really fix the logic so
1476 	 * that this can't happen
1477 	 */
1478 	if (m->m_type != MT_DONTFREE)
1479 		m_freem(m);
1480 
1481 }
1482 
1483 /**
1484  *	check_desc_avail - check descriptor availability on a send queue
1485  *	@adap: the adapter
1486  *	@q: the TX queue
1487  *	@m: the packet needing the descriptors
1488  *	@ndesc: the number of Tx descriptors needed
1489  *	@qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL)
1490  *
1491  *	Checks if the requested number of Tx descriptors is available on an
1492  *	SGE send queue.  If the queue is already suspended or not enough
1493  *	descriptors are available the packet is queued for later transmission.
1494  *	Must be called with the Tx queue locked.
1495  *
1496  *	Returns 0 if enough descriptors are available, 1 if there aren't
1497  *	enough descriptors and the packet has been queued, and 2 if the caller
1498  *	needs to retry because there weren't enough descriptors at the
1499  *	beginning of the call but some freed up in the mean time.
1500  */
1501 static __inline int
1502 check_desc_avail(adapter_t *adap, struct sge_txq *q,
1503 		 struct mbuf *m, unsigned int ndesc,
1504 		 unsigned int qid)
1505 {
1506 	/*
1507 	 * XXX We currently only use this for checking the control queue
1508 	 * the control queue is only used for binding qsets which happens
1509 	 * at init time so we are guaranteed enough descriptors
1510 	 */
1511 	if (__predict_false(!mbufq_empty(&q->sendq))) {
1512 addq_exit:	mbufq_tail(&q->sendq, m);
1513 		return 1;
1514 	}
1515 	if (__predict_false(q->size - q->in_use < ndesc)) {
1516 
1517 		struct sge_qset *qs = txq_to_qset(q, qid);
1518 
1519 		printf("stopping q\n");
1520 
1521 		setbit(&qs->txq_stopped, qid);
1522 		smp_mb();
1523 
1524 		if (should_restart_tx(q) &&
1525 		    test_and_clear_bit(qid, &qs->txq_stopped))
1526 			return 2;
1527 
1528 		q->stops++;
1529 		goto addq_exit;
1530 	}
1531 	return 0;
1532 }
1533 
1534 
1535 /**
1536  *	reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
1537  *	@q: the SGE control Tx queue
1538  *
1539  *	This is a variant of reclaim_completed_tx() that is used for Tx queues
1540  *	that send only immediate data (presently just the control queues) and
1541  *	thus do not have any mbufs
1542  */
1543 static __inline void
1544 reclaim_completed_tx_imm(struct sge_txq *q)
1545 {
1546 	unsigned int reclaim = q->processed - q->cleaned;
1547 
1548 	mtx_assert(&q->lock, MA_OWNED);
1549 
1550 	q->in_use -= reclaim;
1551 	q->cleaned += reclaim;
1552 }
1553 
1554 static __inline int
1555 immediate(const struct mbuf *m)
1556 {
1557 	return m->m_len <= WR_LEN  && m->m_pkthdr.len <= WR_LEN ;
1558 }
1559 
1560 /**
1561  *	ctrl_xmit - send a packet through an SGE control Tx queue
1562  *	@adap: the adapter
1563  *	@q: the control queue
1564  *	@m: the packet
1565  *
1566  *	Send a packet through an SGE control Tx queue.  Packets sent through
1567  *	a control queue must fit entirely as immediate data in a single Tx
1568  *	descriptor and have no page fragments.
1569  */
1570 static int
1571 ctrl_xmit(adapter_t *adap, struct sge_txq *q, struct mbuf *m)
1572 {
1573 	int ret;
1574 	struct work_request_hdr *wrp = mtod(m, struct work_request_hdr *);
1575 
1576 	if (__predict_false(!immediate(m))) {
1577 		m_freem(m);
1578 		return 0;
1579 	}
1580 
1581 	wrp->wr_hi |= htonl(F_WR_SOP | F_WR_EOP);
1582 	wrp->wr_lo = htonl(V_WR_TID(q->token));
1583 
1584 	mtx_lock(&q->lock);
1585 again:	reclaim_completed_tx_imm(q);
1586 
1587 	ret = check_desc_avail(adap, q, m, 1, TXQ_CTRL);
1588 	if (__predict_false(ret)) {
1589 		if (ret == 1) {
1590 			mtx_unlock(&q->lock);
1591 			log(LOG_ERR, "no desc available\n");
1592 
1593 			return (ENOSPC);
1594 		}
1595 		goto again;
1596 	}
1597 	write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
1598 
1599 	q->in_use++;
1600 	if (++q->pidx >= q->size) {
1601 		q->pidx = 0;
1602 		q->gen ^= 1;
1603 	}
1604 	mtx_unlock(&q->lock);
1605 	wmb();
1606 	t3_write_reg(adap, A_SG_KDOORBELL,
1607 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1608 	return (0);
1609 }
1610 
1611 
1612 /**
1613  *	restart_ctrlq - restart a suspended control queue
1614  *	@qs: the queue set cotaining the control queue
1615  *
1616  *	Resumes transmission on a suspended Tx control queue.
1617  */
1618 static void
1619 restart_ctrlq(void *data, int npending)
1620 {
1621 	struct mbuf *m;
1622 	struct sge_qset *qs = (struct sge_qset *)data;
1623 	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1624 	adapter_t *adap = qs->port->adapter;
1625 
1626 	log(LOG_WARNING, "Restart_ctrlq in_use=%d\n", q->in_use);
1627 
1628 	mtx_lock(&q->lock);
1629 again:	reclaim_completed_tx_imm(q);
1630 
1631 	while (q->in_use < q->size &&
1632 	       (m = mbufq_dequeue(&q->sendq)) != NULL) {
1633 
1634 		write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
1635 
1636 		if (++q->pidx >= q->size) {
1637 			q->pidx = 0;
1638 			q->gen ^= 1;
1639 		}
1640 		q->in_use++;
1641 	}
1642 	if (!mbufq_empty(&q->sendq)) {
1643 		setbit(&qs->txq_stopped, TXQ_CTRL);
1644 		smp_mb();
1645 
1646 		if (should_restart_tx(q) &&
1647 		    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped))
1648 			goto again;
1649 		q->stops++;
1650 	}
1651 	mtx_unlock(&q->lock);
1652 	t3_write_reg(adap, A_SG_KDOORBELL,
1653 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1654 }
1655 
1656 
1657 /*
1658  * Send a management message through control queue 0
1659  */
1660 int
1661 t3_mgmt_tx(struct adapter *adap, struct mbuf *m)
1662 {
1663 	return ctrl_xmit(adap, &adap->sge.qs[0].txq[TXQ_CTRL], m);
1664 }
1665 
1666 
1667 /**
1668  *	free_qset - free the resources of an SGE queue set
1669  *	@sc: the controller owning the queue set
1670  *	@q: the queue set
1671  *
1672  *	Release the HW and SW resources associated with an SGE queue set, such
1673  *	as HW contexts, packet buffers, and descriptor rings.  Traffic to the
1674  *	queue set must be quiesced prior to calling this.
1675  */
1676 void
1677 t3_free_qset(adapter_t *sc, struct sge_qset *q)
1678 {
1679 	int i;
1680 
1681 	t3_free_tx_desc_all(&q->txq[TXQ_ETH]);
1682 
1683 	for (i = 0; i < SGE_TXQ_PER_SET; i++)
1684 		if (q->txq[i].txq_mr.br_ring != NULL) {
1685 			free(q->txq[i].txq_mr.br_ring, M_DEVBUF);
1686 			mtx_destroy(&q->txq[i].txq_mr.br_lock);
1687 		}
1688 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
1689 		if (q->fl[i].desc) {
1690 			mtx_lock(&sc->sge.reg_lock);
1691 			t3_sge_disable_fl(sc, q->fl[i].cntxt_id);
1692 			mtx_unlock(&sc->sge.reg_lock);
1693 			bus_dmamap_unload(q->fl[i].desc_tag, q->fl[i].desc_map);
1694 			bus_dmamem_free(q->fl[i].desc_tag, q->fl[i].desc,
1695 					q->fl[i].desc_map);
1696 			bus_dma_tag_destroy(q->fl[i].desc_tag);
1697 			bus_dma_tag_destroy(q->fl[i].entry_tag);
1698 		}
1699 		if (q->fl[i].sdesc) {
1700 			free_rx_bufs(sc, &q->fl[i]);
1701 			free(q->fl[i].sdesc, M_DEVBUF);
1702 		}
1703 	}
1704 
1705 	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
1706 		if (q->txq[i].desc) {
1707 			mtx_lock(&sc->sge.reg_lock);
1708 			t3_sge_enable_ecntxt(sc, q->txq[i].cntxt_id, 0);
1709 			mtx_unlock(&sc->sge.reg_lock);
1710 			bus_dmamap_unload(q->txq[i].desc_tag,
1711 					q->txq[i].desc_map);
1712 			bus_dmamem_free(q->txq[i].desc_tag, q->txq[i].desc,
1713 					q->txq[i].desc_map);
1714 			bus_dma_tag_destroy(q->txq[i].desc_tag);
1715 			bus_dma_tag_destroy(q->txq[i].entry_tag);
1716 			MTX_DESTROY(&q->txq[i].lock);
1717 		}
1718 		if (q->txq[i].sdesc) {
1719 			free(q->txq[i].sdesc, M_DEVBUF);
1720 		}
1721 	}
1722 
1723 	if (q->rspq.desc) {
1724 		mtx_lock(&sc->sge.reg_lock);
1725 		t3_sge_disable_rspcntxt(sc, q->rspq.cntxt_id);
1726 		mtx_unlock(&sc->sge.reg_lock);
1727 
1728 		bus_dmamap_unload(q->rspq.desc_tag, q->rspq.desc_map);
1729 		bus_dmamem_free(q->rspq.desc_tag, q->rspq.desc,
1730 			        q->rspq.desc_map);
1731 		bus_dma_tag_destroy(q->rspq.desc_tag);
1732 		MTX_DESTROY(&q->rspq.lock);
1733 	}
1734 
1735 	bzero(q, sizeof(*q));
1736 }
1737 
1738 /**
1739  *	t3_free_sge_resources - free SGE resources
1740  *	@sc: the adapter softc
1741  *
1742  *	Frees resources used by the SGE queue sets.
1743  */
1744 void
1745 t3_free_sge_resources(adapter_t *sc)
1746 {
1747 	int i, nqsets;
1748 
1749 #ifdef IFNET_MULTIQUEUE
1750 	panic("%s should not be called when IFNET_MULTIQUEUE is defined", __FUNCTION__);
1751 #endif
1752 	for (nqsets = i = 0; i < (sc)->params.nports; i++)
1753 		nqsets += sc->port[i].nqsets;
1754 
1755 	for (i = 0; i < nqsets; ++i)
1756 		t3_free_qset(sc, &sc->sge.qs[i]);
1757 }
1758 
1759 /**
1760  *	t3_sge_start - enable SGE
1761  *	@sc: the controller softc
1762  *
1763  *	Enables the SGE for DMAs.  This is the last step in starting packet
1764  *	transfers.
1765  */
1766 void
1767 t3_sge_start(adapter_t *sc)
1768 {
1769 	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE);
1770 }
1771 
1772 /**
1773  *	t3_sge_stop - disable SGE operation
1774  *	@sc: the adapter
1775  *
1776  *	Disables the DMA engine.  This can be called in emeregencies (e.g.,
1777  *	from error interrupts) or from normal process context.  In the latter
1778  *	case it also disables any pending queue restart tasklets.  Note that
1779  *	if it is called in interrupt context it cannot disable the restart
1780  *	tasklets as it cannot wait, however the tasklets will have no effect
1781  *	since the doorbells are disabled and the driver will call this again
1782  *	later from process context, at which time the tasklets will be stopped
1783  *	if they are still running.
1784  */
1785 void
1786 t3_sge_stop(adapter_t *sc)
1787 {
1788 	int i, nqsets;
1789 
1790 	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, 0);
1791 
1792 	if (sc->tq == NULL)
1793 		return;
1794 
1795 	for (nqsets = i = 0; i < (sc)->params.nports; i++)
1796 		nqsets += sc->port[i].nqsets;
1797 
1798 	for (i = 0; i < nqsets; ++i) {
1799 		struct sge_qset *qs = &sc->sge.qs[i];
1800 
1801 		taskqueue_drain(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
1802 		taskqueue_drain(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
1803 	}
1804 }
1805 
1806 
1807 /**
1808  *	t3_free_tx_desc - reclaims Tx descriptors and their buffers
1809  *	@adapter: the adapter
1810  *	@q: the Tx queue to reclaim descriptors from
1811  *	@reclaimable: the number of descriptors to reclaim
1812  *      @m_vec_size: maximum number of buffers to reclaim
1813  *      @desc_reclaimed: returns the number of descriptors reclaimed
1814  *
1815  *	Reclaims Tx descriptors from an SGE Tx queue and frees the associated
1816  *	Tx buffers.  Called with the Tx queue lock held.
1817  *
1818  *      Returns number of buffers of reclaimed
1819  */
1820 void
1821 t3_free_tx_desc(struct sge_txq *q, int reclaimable)
1822 {
1823 	struct tx_sw_desc *txsd;
1824 	unsigned int cidx;
1825 
1826 #ifdef T3_TRACE
1827 	T3_TRACE2(sc->tb[q->cntxt_id & 7],
1828 		  "reclaiming %u Tx descriptors at cidx %u", reclaimable, cidx);
1829 #endif
1830 	cidx = q->cidx;
1831 	txsd = &q->sdesc[cidx];
1832 	DPRINTF("reclaiming %d WR\n", reclaimable);
1833 	while (reclaimable--) {
1834 		DPRINTF("cidx=%d d=%p\n", cidx, txsd);
1835 		if (txsd->mi.mi_base != NULL) {
1836 			if (txsd->flags & TX_SW_DESC_MAPPED) {
1837 				bus_dmamap_unload(q->entry_tag, txsd->map);
1838 				txsd->flags &= ~TX_SW_DESC_MAPPED;
1839 			}
1840 			m_freem_iovec(&txsd->mi);
1841 			txsd->mi.mi_base = NULL;
1842 
1843 #if defined(DIAGNOSTIC) && 0
1844 			if (m_get_priority(txsd->m[0]) != cidx)
1845 				printf("pri=%d cidx=%d\n", (int)m_get_priority(txsd->m[0]), cidx);
1846 #endif
1847 
1848 		} else
1849 			q->txq_skipped++;
1850 
1851 		++txsd;
1852 		if (++cidx == q->size) {
1853 			cidx = 0;
1854 			txsd = q->sdesc;
1855 		}
1856 	}
1857 	q->cidx = cidx;
1858 
1859 }
1860 
1861 void
1862 t3_free_tx_desc_all(struct sge_txq *q)
1863 {
1864 	int i;
1865 	struct tx_sw_desc *txsd;
1866 
1867 	for (i = 0; i < q->size; i++) {
1868 		txsd = &q->sdesc[i];
1869 		if (txsd->mi.mi_base != NULL) {
1870 			if (txsd->flags & TX_SW_DESC_MAPPED) {
1871 				bus_dmamap_unload(q->entry_tag, txsd->map);
1872 				txsd->flags &= ~TX_SW_DESC_MAPPED;
1873 			}
1874 			m_freem_iovec(&txsd->mi);
1875 			bzero(&txsd->mi, sizeof(txsd->mi));
1876 		}
1877 	}
1878 }
1879 
1880 /**
1881  *	is_new_response - check if a response is newly written
1882  *	@r: the response descriptor
1883  *	@q: the response queue
1884  *
1885  *	Returns true if a response descriptor contains a yet unprocessed
1886  *	response.
1887  */
1888 static __inline int
1889 is_new_response(const struct rsp_desc *r,
1890     const struct sge_rspq *q)
1891 {
1892 	return (r->intr_gen & F_RSPD_GEN2) == q->gen;
1893 }
1894 
1895 #define RSPD_GTS_MASK  (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS)
1896 #define RSPD_CTRL_MASK (RSPD_GTS_MASK | \
1897 			V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \
1898 			V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \
1899 			V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR))
1900 
1901 /* How long to delay the next interrupt in case of memory shortage, in 0.1us. */
1902 #define NOMEM_INTR_DELAY 2500
1903 
1904 /**
1905  *	write_ofld_wr - write an offload work request
1906  *	@adap: the adapter
1907  *	@m: the packet to send
1908  *	@q: the Tx queue
1909  *	@pidx: index of the first Tx descriptor to write
1910  *	@gen: the generation value to use
1911  *	@ndesc: number of descriptors the packet will occupy
1912  *
1913  *	Write an offload work request to send the supplied packet.  The packet
1914  *	data already carry the work request with most fields populated.
1915  */
1916 static void
1917 write_ofld_wr(adapter_t *adap, struct mbuf *m,
1918     struct sge_txq *q, unsigned int pidx,
1919     unsigned int gen, unsigned int ndesc,
1920     bus_dma_segment_t *segs, unsigned int nsegs)
1921 {
1922 	unsigned int sgl_flits, flits;
1923 	struct work_request_hdr *from;
1924 	struct sg_ent *sgp, sgl[TX_MAX_SEGS / 2 + 1];
1925 	struct tx_desc *d = &q->desc[pidx];
1926 	struct txq_state txqs;
1927 
1928 	if (immediate(m) && segs == NULL) {
1929 		write_imm(d, m, m->m_len, gen);
1930 		return;
1931 	}
1932 
1933 	/* Only TX_DATA builds SGLs */
1934 	from = mtod(m, struct work_request_hdr *);
1935 	memcpy(&d->flit[1], &from[1], m->m_len - sizeof(*from));
1936 
1937 	flits = m->m_len / 8;
1938 	sgp = (ndesc == 1) ? (struct sg_ent *)&d->flit[flits] : sgl;
1939 
1940 	make_sgl(sgp, segs, nsegs);
1941 	sgl_flits = sgl_len(nsegs);
1942 
1943 	txqs.gen = gen;
1944 	txqs.pidx = pidx;
1945 	txqs.compl = 0;
1946 
1947 	write_wr_hdr_sgl(ndesc, d, &txqs, q, sgl, flits, sgl_flits,
1948 	    from->wr_hi, from->wr_lo);
1949 }
1950 
1951 
1952 
1953 /**
1954  *	calc_tx_descs_ofld - calculate # of Tx descriptors for an offload packet
1955  *	@m: the packet
1956  *
1957  * 	Returns the number of Tx descriptors needed for the given offload
1958  * 	packet.  These packets are already fully constructed.
1959  */
1960 static __inline unsigned int
1961 calc_tx_descs_ofld(struct mbuf *m, unsigned int nsegs)
1962 {
1963 	unsigned int flits, cnt = 0;
1964 
1965 
1966 	if (m->m_len <= WR_LEN)
1967 		return 1;                 /* packet fits as immediate data */
1968 
1969 	if (m->m_flags & M_IOVEC)
1970 		cnt = mtomv(m)->mv_count;
1971 
1972 	flits = ((uint8_t *)m->m_pkthdr.header - mtod(m, uint8_t *)) / 8;   /* headers */
1973 
1974 	return flits_to_desc(flits + sgl_len(cnt));
1975 }
1976 
1977 /**
1978  *	ofld_xmit - send a packet through an offload queue
1979  *	@adap: the adapter
1980  *	@q: the Tx offload queue
1981  *	@m: the packet
1982  *
1983  *	Send an offload packet through an SGE offload queue.
1984  */
1985 static int
1986 ofld_xmit(adapter_t *adap, struct sge_txq *q, struct mbuf *m)
1987 {
1988 	int ret, nsegs;
1989 	unsigned int ndesc;
1990 	unsigned int pidx, gen;
1991 	bus_dma_segment_t segs[TX_MAX_SEGS], *vsegs;
1992 	struct tx_sw_desc *stx;
1993 
1994 	nsegs = m_get_sgllen(m);
1995 	vsegs = m_get_sgl(m);
1996 	ndesc = calc_tx_descs_ofld(m, nsegs);
1997 	busdma_map_sgl(vsegs, segs, nsegs);
1998 
1999 	stx = &q->sdesc[q->pidx];
2000 	KASSERT(stx->mi.mi_base == NULL, ("mi_base set"));
2001 
2002 	mtx_lock(&q->lock);
2003 again:	reclaim_completed_tx(q);
2004 	ret = check_desc_avail(adap, q, m, ndesc, TXQ_OFLD);
2005 	if (__predict_false(ret)) {
2006 		if (ret == 1) {
2007 			printf("no ofld desc avail\n");
2008 
2009 			m_set_priority(m, ndesc);     /* save for restart */
2010 			mtx_unlock(&q->lock);
2011 			return (EINTR);
2012 		}
2013 		goto again;
2014 	}
2015 
2016 	gen = q->gen;
2017 	q->in_use += ndesc;
2018 	pidx = q->pidx;
2019 	q->pidx += ndesc;
2020 	if (q->pidx >= q->size) {
2021 		q->pidx -= q->size;
2022 		q->gen ^= 1;
2023 	}
2024 #ifdef T3_TRACE
2025 	T3_TRACE5(adap->tb[q->cntxt_id & 7],
2026 		  "ofld_xmit: ndesc %u, pidx %u, len %u, main %u, frags %u",
2027 		  ndesc, pidx, skb->len, skb->len - skb->data_len,
2028 		  skb_shinfo(skb)->nr_frags);
2029 #endif
2030 	mtx_unlock(&q->lock);
2031 
2032 	write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
2033 	check_ring_tx_db(adap, q);
2034 
2035 	return (0);
2036 }
2037 
2038 /**
2039  *	restart_offloadq - restart a suspended offload queue
2040  *	@qs: the queue set cotaining the offload queue
2041  *
2042  *	Resumes transmission on a suspended Tx offload queue.
2043  */
2044 static void
2045 restart_offloadq(void *data, int npending)
2046 {
2047 	struct mbuf *m;
2048 	struct sge_qset *qs = data;
2049 	struct sge_txq *q = &qs->txq[TXQ_OFLD];
2050 	adapter_t *adap = qs->port->adapter;
2051 	bus_dma_segment_t segs[TX_MAX_SEGS];
2052 	struct tx_sw_desc *stx = &q->sdesc[q->pidx];
2053 	int nsegs, cleaned;
2054 
2055 	mtx_lock(&q->lock);
2056 again:	cleaned = reclaim_completed_tx(q);
2057 
2058 	while ((m = mbufq_peek(&q->sendq)) != NULL) {
2059 		unsigned int gen, pidx;
2060 		unsigned int ndesc = m_get_priority(m);
2061 
2062 		if (__predict_false(q->size - q->in_use < ndesc)) {
2063 			setbit(&qs->txq_stopped, TXQ_OFLD);
2064 			smp_mb();
2065 
2066 			if (should_restart_tx(q) &&
2067 			    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped))
2068 				goto again;
2069 			q->stops++;
2070 			break;
2071 		}
2072 
2073 		gen = q->gen;
2074 		q->in_use += ndesc;
2075 		pidx = q->pidx;
2076 		q->pidx += ndesc;
2077 		if (q->pidx >= q->size) {
2078 			q->pidx -= q->size;
2079 			q->gen ^= 1;
2080 		}
2081 
2082 		(void)mbufq_dequeue(&q->sendq);
2083 		busdma_map_mbufs(&m, q, stx, segs, &nsegs);
2084 		mtx_unlock(&q->lock);
2085 		write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
2086 		mtx_lock(&q->lock);
2087 	}
2088 	mtx_unlock(&q->lock);
2089 
2090 #if USE_GTS
2091 	set_bit(TXQ_RUNNING, &q->flags);
2092 	set_bit(TXQ_LAST_PKT_DB, &q->flags);
2093 #endif
2094 	t3_write_reg(adap, A_SG_KDOORBELL,
2095 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
2096 #if 0
2097 
2098 	for (i = 0; i < cleaned; i++) {
2099 		m_freem_vec(m_vec[i]);
2100 	}
2101 #endif
2102 }
2103 
2104 /**
2105  *	queue_set - return the queue set a packet should use
2106  *	@m: the packet
2107  *
2108  *	Maps a packet to the SGE queue set it should use.  The desired queue
2109  *	set is carried in bits 1-3 in the packet's priority.
2110  */
2111 static __inline int
2112 queue_set(const struct mbuf *m)
2113 {
2114 	return m_get_priority(m) >> 1;
2115 }
2116 
2117 /**
2118  *	is_ctrl_pkt - return whether an offload packet is a control packet
2119  *	@m: the packet
2120  *
2121  *	Determines whether an offload packet should use an OFLD or a CTRL
2122  *	Tx queue.  This is indicated by bit 0 in the packet's priority.
2123  */
2124 static __inline int
2125 is_ctrl_pkt(const struct mbuf *m)
2126 {
2127 	return m_get_priority(m) & 1;
2128 }
2129 
2130 /**
2131  *	t3_offload_tx - send an offload packet
2132  *	@tdev: the offload device to send to
2133  *	@m: the packet
2134  *
2135  *	Sends an offload packet.  We use the packet priority to select the
2136  *	appropriate Tx queue as follows: bit 0 indicates whether the packet
2137  *	should be sent as regular or control, bits 1-3 select the queue set.
2138  */
2139 int
2140 t3_offload_tx(struct t3cdev *tdev, struct mbuf *m)
2141 {
2142 	adapter_t *adap = tdev2adap(tdev);
2143 	struct sge_qset *qs = &adap->sge.qs[queue_set(m)];
2144 
2145 	if (__predict_false(is_ctrl_pkt(m)))
2146 		return ctrl_xmit(adap, &qs->txq[TXQ_CTRL], m);
2147 
2148 	return ofld_xmit(adap, &qs->txq[TXQ_OFLD], m);
2149 }
2150 
2151 /**
2152  *	deliver_partial_bundle - deliver a (partial) bundle of Rx offload pkts
2153  *	@tdev: the offload device that will be receiving the packets
2154  *	@q: the SGE response queue that assembled the bundle
2155  *	@m: the partial bundle
2156  *	@n: the number of packets in the bundle
2157  *
2158  *	Delivers a (partial) bundle of Rx offload packets to an offload device.
2159  */
2160 static __inline void
2161 deliver_partial_bundle(struct t3cdev *tdev,
2162 			struct sge_rspq *q,
2163 			struct mbuf *mbufs[], int n)
2164 {
2165 	if (n) {
2166 		q->offload_bundles++;
2167 		cxgb_ofld_recv(tdev, mbufs, n);
2168 	}
2169 }
2170 
2171 static __inline int
2172 rx_offload(struct t3cdev *tdev, struct sge_rspq *rq,
2173     struct mbuf *m, struct mbuf *rx_gather[],
2174     unsigned int gather_idx)
2175 {
2176 
2177 	rq->offload_pkts++;
2178 	m->m_pkthdr.header = mtod(m, void *);
2179 	rx_gather[gather_idx++] = m;
2180 	if (gather_idx == RX_BUNDLE_SIZE) {
2181 		cxgb_ofld_recv(tdev, rx_gather, RX_BUNDLE_SIZE);
2182 		gather_idx = 0;
2183 		rq->offload_bundles++;
2184 	}
2185 	return (gather_idx);
2186 }
2187 
2188 static void
2189 restart_tx(struct sge_qset *qs)
2190 {
2191 	struct adapter *sc = qs->port->adapter;
2192 
2193 
2194 	if (isset(&qs->txq_stopped, TXQ_OFLD) &&
2195 	    should_restart_tx(&qs->txq[TXQ_OFLD]) &&
2196 	    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) {
2197 		qs->txq[TXQ_OFLD].restarts++;
2198 		printf("restarting TXQ_OFLD\n");
2199 		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2200 	}
2201 	printf("stopped=0x%x restart=%d processed=%d cleaned=%d in_use=%d\n",
2202 	    qs->txq_stopped, should_restart_tx(&qs->txq[TXQ_CTRL]),
2203 	    qs->txq[TXQ_CTRL].processed, qs->txq[TXQ_CTRL].cleaned,
2204 	    qs->txq[TXQ_CTRL].in_use);
2205 
2206 	if (isset(&qs->txq_stopped, TXQ_CTRL) &&
2207 	    should_restart_tx(&qs->txq[TXQ_CTRL]) &&
2208 	    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) {
2209 		qs->txq[TXQ_CTRL].restarts++;
2210 		printf("restarting TXQ_CTRL\n");
2211 		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2212 	}
2213 }
2214 
2215 /**
2216  *	t3_sge_alloc_qset - initialize an SGE queue set
2217  *	@sc: the controller softc
2218  *	@id: the queue set id
2219  *	@nports: how many Ethernet ports will be using this queue set
2220  *	@irq_vec_idx: the IRQ vector index for response queue interrupts
2221  *	@p: configuration parameters for this queue set
2222  *	@ntxq: number of Tx queues for the queue set
2223  *	@pi: port info for queue set
2224  *
2225  *	Allocate resources and initialize an SGE queue set.  A queue set
2226  *	comprises a response queue, two Rx free-buffer queues, and up to 3
2227  *	Tx queues.  The Tx queues are assigned roles in the order Ethernet
2228  *	queue, offload queue, and control queue.
2229  */
2230 int
2231 t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx,
2232 		  const struct qset_params *p, int ntxq, struct port_info *pi)
2233 {
2234 	struct sge_qset *q = &sc->sge.qs[id];
2235 	int i, ret = 0;
2236 
2237 	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
2238 		if ((q->txq[i].txq_mr.br_ring = malloc(cxgb_txq_buf_ring_size*sizeof(struct mbuf *),
2239 			    M_DEVBUF, M_WAITOK|M_ZERO)) == NULL) {
2240 			device_printf(sc->dev, "failed to allocate mbuf ring\n");
2241 			goto err;
2242 		}
2243 		q->txq[i].txq_mr.br_prod = q->txq[i].txq_mr.br_cons = 0;
2244 		q->txq[i].txq_mr.br_size = cxgb_txq_buf_ring_size;
2245 		mtx_init(&q->txq[i].txq_mr.br_lock, "txq mbuf ring", NULL, MTX_DEF);
2246 	}
2247 
2248 	init_qset_cntxt(q, id);
2249 
2250 	if ((ret = alloc_ring(sc, p->fl_size, sizeof(struct rx_desc),
2251 		    sizeof(struct rx_sw_desc), &q->fl[0].phys_addr,
2252 		    &q->fl[0].desc, &q->fl[0].sdesc,
2253 		    &q->fl[0].desc_tag, &q->fl[0].desc_map,
2254 		    sc->rx_dmat, &q->fl[0].entry_tag)) != 0) {
2255 		printf("error %d from alloc ring fl0\n", ret);
2256 		goto err;
2257 	}
2258 
2259 	if ((ret = alloc_ring(sc, p->jumbo_size, sizeof(struct rx_desc),
2260 		    sizeof(struct rx_sw_desc), &q->fl[1].phys_addr,
2261 		    &q->fl[1].desc, &q->fl[1].sdesc,
2262 		    &q->fl[1].desc_tag, &q->fl[1].desc_map,
2263 		    sc->rx_jumbo_dmat, &q->fl[1].entry_tag)) != 0) {
2264 		printf("error %d from alloc ring fl1\n", ret);
2265 		goto err;
2266 	}
2267 
2268 	if ((ret = alloc_ring(sc, p->rspq_size, sizeof(struct rsp_desc), 0,
2269 		    &q->rspq.phys_addr, &q->rspq.desc, NULL,
2270 		    &q->rspq.desc_tag, &q->rspq.desc_map,
2271 		    NULL, NULL)) != 0) {
2272 		printf("error %d from alloc ring rspq\n", ret);
2273 		goto err;
2274 	}
2275 
2276 	for (i = 0; i < ntxq; ++i) {
2277 		/*
2278 		 * The control queue always uses immediate data so does not
2279 		 * need to keep track of any mbufs.
2280 		 * XXX Placeholder for future TOE support.
2281 		 */
2282 		size_t sz = i == TXQ_CTRL ? 0 : sizeof(struct tx_sw_desc);
2283 
2284 		if ((ret = alloc_ring(sc, p->txq_size[i],
2285 			    sizeof(struct tx_desc), sz,
2286 			    &q->txq[i].phys_addr, &q->txq[i].desc,
2287 			    &q->txq[i].sdesc, &q->txq[i].desc_tag,
2288 			    &q->txq[i].desc_map,
2289 			    sc->tx_dmat, &q->txq[i].entry_tag)) != 0) {
2290 			printf("error %d from alloc ring tx %i\n", ret, i);
2291 			goto err;
2292 		}
2293 		mbufq_init(&q->txq[i].sendq);
2294 		q->txq[i].gen = 1;
2295 		q->txq[i].size = p->txq_size[i];
2296 		snprintf(q->txq[i].lockbuf, TXQ_NAME_LEN, "t3 txq lock %d:%d:%d",
2297 		    device_get_unit(sc->dev), irq_vec_idx, i);
2298 		MTX_INIT(&q->txq[i].lock, q->txq[i].lockbuf, NULL, MTX_DEF);
2299 	}
2300 
2301 	q->txq[TXQ_ETH].port = pi;
2302 
2303 	TASK_INIT(&q->txq[TXQ_OFLD].qresume_task, 0, restart_offloadq, q);
2304 	TASK_INIT(&q->txq[TXQ_CTRL].qresume_task, 0, restart_ctrlq, q);
2305 	TASK_INIT(&q->txq[TXQ_ETH].qreclaim_task, 0, sge_txq_reclaim_handler, &q->txq[TXQ_ETH]);
2306 	TASK_INIT(&q->txq[TXQ_OFLD].qreclaim_task, 0, sge_txq_reclaim_handler, &q->txq[TXQ_OFLD]);
2307 
2308 	q->fl[0].gen = q->fl[1].gen = 1;
2309 	q->fl[0].size = p->fl_size;
2310 	q->fl[1].size = p->jumbo_size;
2311 
2312 	q->rspq.gen = 1;
2313 	q->rspq.cidx = 0;
2314 	q->rspq.size = p->rspq_size;
2315 
2316 	q->txq[TXQ_ETH].stop_thres = nports *
2317 	    flits_to_desc(sgl_len(TX_MAX_SEGS + 1) + 3);
2318 
2319 	q->fl[0].buf_size = (MCLBYTES - sizeof(uint32_t) - sizeof(struct m_hdr) - sizeof(struct pkthdr) - sizeof(struct m_ext_));
2320 	q->fl[0].zone = zone_clust;
2321 	q->fl[0].type = EXT_CLUSTER;
2322 #if __FreeBSD_version > 800000
2323 		q->fl[1].buf_size = MJUM16BYTES - sizeof(uint32_t) - sizeof(struct m_hdr) - sizeof(struct pkthdr) - sizeof(struct m_ext_);
2324 		q->fl[1].zone = zone_jumbo16;
2325 		q->fl[1].type = EXT_JUMBO16;
2326 #else
2327 		q->fl[1].buf_size = MJUMPAGESIZE - sizeof(uint32_t) - sizeof(struct m_hdr) - sizeof(struct pkthdr) - sizeof(struct m_ext_);
2328 		q->fl[1].zone = zone_jumbop;
2329 		q->fl[1].type = EXT_JUMBOP;
2330 #endif
2331 	q->lro.enabled = lro_default;
2332 
2333 	mtx_lock(&sc->sge.reg_lock);
2334 	ret = -t3_sge_init_rspcntxt(sc, q->rspq.cntxt_id, irq_vec_idx,
2335 				   q->rspq.phys_addr, q->rspq.size,
2336 				   q->fl[0].buf_size, 1, 0);
2337 	if (ret) {
2338 		printf("error %d from t3_sge_init_rspcntxt\n", ret);
2339 		goto err_unlock;
2340 	}
2341 
2342 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
2343 		ret = -t3_sge_init_flcntxt(sc, q->fl[i].cntxt_id, 0,
2344 					  q->fl[i].phys_addr, q->fl[i].size,
2345 					  q->fl[i].buf_size, p->cong_thres, 1,
2346 					  0);
2347 		if (ret) {
2348 			printf("error %d from t3_sge_init_flcntxt for index i=%d\n", ret, i);
2349 			goto err_unlock;
2350 		}
2351 	}
2352 
2353 	ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_ETH].cntxt_id, USE_GTS,
2354 				 SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr,
2355 				 q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token,
2356 				 1, 0);
2357 	if (ret) {
2358 		printf("error %d from t3_sge_init_ecntxt\n", ret);
2359 		goto err_unlock;
2360 	}
2361 
2362 	if (ntxq > 1) {
2363 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_OFLD].cntxt_id,
2364 					 USE_GTS, SGE_CNTXT_OFLD, id,
2365 					 q->txq[TXQ_OFLD].phys_addr,
2366 					 q->txq[TXQ_OFLD].size, 0, 1, 0);
2367 		if (ret) {
2368 			printf("error %d from t3_sge_init_ecntxt\n", ret);
2369 			goto err_unlock;
2370 		}
2371 	}
2372 
2373 	if (ntxq > 2) {
2374 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_CTRL].cntxt_id, 0,
2375 					 SGE_CNTXT_CTRL, id,
2376 					 q->txq[TXQ_CTRL].phys_addr,
2377 					 q->txq[TXQ_CTRL].size,
2378 					 q->txq[TXQ_CTRL].token, 1, 0);
2379 		if (ret) {
2380 			printf("error %d from t3_sge_init_ecntxt\n", ret);
2381 			goto err_unlock;
2382 		}
2383 	}
2384 
2385 	snprintf(q->rspq.lockbuf, RSPQ_NAME_LEN, "t3 rspq lock %d:%d",
2386 	    device_get_unit(sc->dev), irq_vec_idx);
2387 	MTX_INIT(&q->rspq.lock, q->rspq.lockbuf, NULL, MTX_DEF);
2388 
2389 	mtx_unlock(&sc->sge.reg_lock);
2390 	t3_update_qset_coalesce(q, p);
2391 	q->port = pi;
2392 
2393 	refill_fl(sc, &q->fl[0], q->fl[0].size);
2394 	refill_fl(sc, &q->fl[1], q->fl[1].size);
2395 	refill_rspq(sc, &q->rspq, q->rspq.size - 1);
2396 
2397 	t3_write_reg(sc, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) |
2398 		     V_NEWTIMER(q->rspq.holdoff_tmr));
2399 
2400 	return (0);
2401 
2402 err_unlock:
2403 	mtx_unlock(&sc->sge.reg_lock);
2404 err:
2405 	t3_free_qset(sc, q);
2406 
2407 	return (ret);
2408 }
2409 
2410 void
2411 t3_rx_eth(struct adapter *adap, struct sge_rspq *rq, struct mbuf *m, int ethpad)
2412 {
2413 	struct cpl_rx_pkt *cpl = (struct cpl_rx_pkt *)(mtod(m, uint8_t *) + ethpad);
2414 	struct port_info *pi = &adap->port[adap->rxpkt_map[cpl->iff]];
2415 	struct ifnet *ifp = pi->ifp;
2416 
2417 	DPRINTF("rx_eth m=%p m->m_data=%p p->iff=%d\n", m, mtod(m, uint8_t *), cpl->iff);
2418 
2419 	if ((ifp->if_capenable & IFCAP_RXCSUM) && !cpl->fragment &&
2420 	    cpl->csum_valid && cpl->csum == 0xffff) {
2421 		m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED|CSUM_IP_VALID);
2422 		rspq_to_qset(rq)->port_stats[SGE_PSTAT_RX_CSUM_GOOD]++;
2423 		m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED|CSUM_IP_VALID|CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
2424 		m->m_pkthdr.csum_data = 0xffff;
2425 	}
2426 	/*
2427 	 * XXX need to add VLAN support for 6.x
2428 	 */
2429 #ifdef VLAN_SUPPORTED
2430 	if (__predict_false(cpl->vlan_valid)) {
2431 		m->m_pkthdr.ether_vtag = ntohs(cpl->vlan);
2432 		m->m_flags |= M_VLANTAG;
2433 	}
2434 #endif
2435 
2436 	m->m_pkthdr.rcvif = ifp;
2437 	m->m_pkthdr.header = mtod(m, uint8_t *) + sizeof(*cpl) + ethpad;
2438 #ifndef DISABLE_MBUF_IOVEC
2439 	m_explode(m);
2440 #endif
2441 	/*
2442 	 * adjust after conversion to mbuf chain
2443 	 */
2444 	m->m_pkthdr.len -= (sizeof(*cpl) + ethpad);
2445 	m->m_len -= (sizeof(*cpl) + ethpad);
2446 	m->m_data += (sizeof(*cpl) + ethpad);
2447 
2448 	(*ifp->if_input)(ifp, m);
2449 }
2450 
2451 /**
2452  *	get_packet - return the next ingress packet buffer from a free list
2453  *	@adap: the adapter that received the packet
2454  *	@drop_thres: # of remaining buffers before we start dropping packets
2455  *	@qs: the qset that the SGE free list holding the packet belongs to
2456  *      @mh: the mbuf header, contains a pointer to the head and tail of the mbuf chain
2457  *      @r: response descriptor
2458  *
2459  *	Get the next packet from a free list and complete setup of the
2460  *	sk_buff.  If the packet is small we make a copy and recycle the
2461  *	original buffer, otherwise we use the original buffer itself.  If a
2462  *	positive drop threshold is supplied packets are dropped and their
2463  *	buffers recycled if (a) the number of remaining buffers is under the
2464  *	threshold and the packet is too big to copy, or (b) the packet should
2465  *	be copied but there is no memory for the copy.
2466  */
2467 #ifdef DISABLE_MBUF_IOVEC
2468 
2469 static int
2470 get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
2471     struct t3_mbuf_hdr *mh, struct rsp_desc *r, struct mbuf *m)
2472 {
2473 
2474 	unsigned int len_cq =  ntohl(r->len_cq);
2475 	struct sge_fl *fl = (len_cq & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
2476 	struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];
2477 	uint32_t len = G_RSPD_LEN(len_cq);
2478 	uint32_t flags = ntohl(r->flags);
2479 	uint8_t sopeop = G_RSPD_SOP_EOP(flags);
2480 	uint32_t *ref;
2481 	int ret = 0;
2482 
2483 	prefetch(sd->rxsd_cl);
2484 
2485 	fl->credits--;
2486 	bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD);
2487 	bus_dmamap_unload(fl->entry_tag, sd->map);
2488 
2489 	ref = sd->rxsd_ref;
2490 	m_cljset(m, sd->rxsd_cl, fl->type, sd->rxsd_ref);
2491 	*ref = 1;
2492 	m->m_len = len;
2493 	/*
2494 	 * bump past the refcnt address
2495 	 */
2496 	m->m_data = sd->data;
2497 
2498 	switch(sopeop) {
2499 	case RSPQ_SOP_EOP:
2500 		DBG(DBG_RX, ("get_packet: SOP-EOP m %p\n", m));
2501 		mh->mh_head = mh->mh_tail = m;
2502 		m->m_pkthdr.len = len;
2503 		m->m_flags |= M_PKTHDR;
2504 		ret = 1;
2505 		break;
2506 	case RSPQ_NSOP_NEOP:
2507 		DBG(DBG_RX, ("get_packet: NO_SOP-NO_EOP m %p\n", m));
2508 		m->m_flags &= ~M_PKTHDR;
2509 		if (mh->mh_tail == NULL) {
2510 			if (cxgb_debug)
2511 				printf("discarding intermediate descriptor entry\n");
2512 			m_freem(m);
2513 			break;
2514 		}
2515 		mh->mh_tail->m_next = m;
2516 		mh->mh_tail = m;
2517 		mh->mh_head->m_pkthdr.len += len;
2518 		ret = 0;
2519 		break;
2520 	case RSPQ_SOP:
2521 		DBG(DBG_RX, ("get_packet: SOP m %p\n", m));
2522 		m->m_pkthdr.len = len;
2523 		mh->mh_head = mh->mh_tail = m;
2524 		m->m_flags |= M_PKTHDR;
2525 		ret = 0;
2526 		break;
2527 	case RSPQ_EOP:
2528 		DBG(DBG_RX, ("get_packet: EOP m %p\n", m));
2529 		m->m_flags &= ~M_PKTHDR;
2530 		mh->mh_head->m_pkthdr.len += len;
2531 		mh->mh_tail->m_next = m;
2532 		mh->mh_tail = m;
2533 		ret = 1;
2534 		break;
2535 	}
2536 	if (++fl->cidx == fl->size)
2537 		fl->cidx = 0;
2538 
2539 	return (ret);
2540 }
2541 
2542 #else
2543 static void
2544 ext_free_handler(void *cl, void * arg)
2545 {
2546 	uintptr_t type = (uintptr_t)arg;
2547 	uma_zone_t zone;
2548 	struct mbuf *m;
2549 
2550 	m = cl;
2551 	zone = m_getzonefromtype(type);
2552 	m->m_ext.ext_type = (int)type;
2553 	cxgb_ext_freed++;
2554 	cxgb_cache_put(zone, cl);
2555 }
2556 
2557 static void
2558 init_cluster_mbuf(caddr_t cl, int flags, int type, uma_zone_t zone)
2559 {
2560 	struct mbuf *m;
2561 	int header_size;
2562 
2563 	header_size = sizeof(struct m_hdr) + sizeof(struct pkthdr) + sizeof(struct m_ext_) + sizeof(uint32_t);
2564 
2565 	bzero(cl, header_size);
2566 	m = (struct mbuf *)cl;
2567 
2568 	SLIST_INIT(&m->m_pkthdr.tags);
2569 	m->m_type = MT_DATA;
2570 	m->m_flags = flags | M_NOFREE | M_EXT;
2571 	m->m_data = cl + header_size;
2572 	m->m_ext.ext_buf = cl;
2573 	m->m_ext.ref_cnt = (uint32_t *)(cl + header_size - sizeof(uint32_t));
2574 	m->m_ext.ext_size = m_getsizefromtype(type);
2575 	m->m_ext.ext_free = ext_free_handler;
2576 	m->m_ext.ext_args = (void *)(uintptr_t)type;
2577 	m->m_ext.ext_type = EXT_EXTREF;
2578 	*(m->m_ext.ref_cnt) = 1;
2579 	DPRINTF("data=%p ref_cnt=%p\n", m->m_data, m->m_ext.ref_cnt);
2580 }
2581 
2582 static int
2583 get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
2584     struct mbuf **m, struct rsp_desc *r)
2585 {
2586 
2587 	unsigned int len_cq =  ntohl(r->len_cq);
2588 	struct sge_fl *fl = (len_cq & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
2589 	struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];
2590 	uint32_t len = G_RSPD_LEN(len_cq);
2591 	uint32_t flags = ntohl(r->flags);
2592 	uint8_t sopeop = G_RSPD_SOP_EOP(flags);
2593 	void *cl;
2594 	int ret = 0;
2595 	struct mbuf *m0;
2596 #if 0
2597 	if ((sd + 1 )->rxsd_cl)
2598 		prefetch((sd + 1)->rxsd_cl);
2599 	if ((sd + 2)->rxsd_cl)
2600 		prefetch((sd + 2)->rxsd_cl);
2601 #endif
2602 	DPRINTF("rx cpu=%d\n", curcpu);
2603 	fl->credits--;
2604 	bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD);
2605 
2606 	if (recycle_enable && len <= SGE_RX_COPY_THRES && sopeop == RSPQ_SOP_EOP) {
2607 		if ((m0 = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL)
2608 			goto skip_recycle;
2609 		cl = mtod(m0, void *);
2610 		memcpy(cl, sd->data, len);
2611 		recycle_rx_buf(adap, fl, fl->cidx);
2612 		*m = m0;
2613 	} else {
2614 	skip_recycle:
2615 		bus_dmamap_unload(fl->entry_tag, sd->map);
2616 		cl = sd->rxsd_cl;
2617 		*m = m0 = (struct mbuf *)cl;
2618 	}
2619 
2620 	switch(sopeop) {
2621 	case RSPQ_SOP_EOP:
2622 		DBG(DBG_RX, ("get_packet: SOP-EOP m %p\n", m));
2623 		if (cl == sd->rxsd_cl)
2624 			init_cluster_mbuf(cl, M_PKTHDR, fl->type, fl->zone);
2625 		m0->m_len = m0->m_pkthdr.len = len;
2626 		ret = 1;
2627 		goto done;
2628 		break;
2629 	case RSPQ_NSOP_NEOP:
2630 		DBG(DBG_RX, ("get_packet: NO_SOP-NO_EOP m %p\n", m));
2631 		panic("chaining unsupported");
2632 		ret = 0;
2633 		break;
2634 	case RSPQ_SOP:
2635 		DBG(DBG_RX, ("get_packet: SOP m %p\n", m));
2636 		panic("chaining unsupported");
2637 		m_iovinit(m0);
2638 		ret = 0;
2639 		break;
2640 	case RSPQ_EOP:
2641 		DBG(DBG_RX, ("get_packet: EOP m %p\n", m));
2642 		panic("chaining unsupported");
2643 		ret = 1;
2644 		break;
2645 	}
2646 	panic("append not supported");
2647 #if 0
2648 	m_iovappend(m0, cl, fl->buf_size, len, sizeof(uint32_t), sd->rxsd_ref);
2649 #endif
2650 done:
2651 	if (++fl->cidx == fl->size)
2652 		fl->cidx = 0;
2653 
2654 	return (ret);
2655 }
2656 #endif
2657 /**
2658  *	handle_rsp_cntrl_info - handles control information in a response
2659  *	@qs: the queue set corresponding to the response
2660  *	@flags: the response control flags
2661  *
2662  *	Handles the control information of an SGE response, such as GTS
2663  *	indications and completion credits for the queue set's Tx queues.
2664  *	HW coalesces credits, we don't do any extra SW coalescing.
2665  */
2666 static __inline void
2667 handle_rsp_cntrl_info(struct sge_qset *qs, uint32_t flags)
2668 {
2669 	unsigned int credits;
2670 
2671 #if USE_GTS
2672 	if (flags & F_RSPD_TXQ0_GTS)
2673 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags);
2674 #endif
2675 	credits = G_RSPD_TXQ0_CR(flags);
2676 	if (credits) {
2677 		qs->txq[TXQ_ETH].processed += credits;
2678 #ifndef	IFNET_MULTIQUEUE
2679 		if (desc_reclaimable(&qs->txq[TXQ_ETH]) > TX_START_MAX_DESC)
2680 			taskqueue_enqueue(qs->port->adapter->tq,
2681 			    &qs->port->timer_reclaim_task);
2682 #endif
2683 	}
2684 
2685 	credits = G_RSPD_TXQ2_CR(flags);
2686 	if (credits)
2687 		qs->txq[TXQ_CTRL].processed += credits;
2688 
2689 # if USE_GTS
2690 	if (flags & F_RSPD_TXQ1_GTS)
2691 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags);
2692 # endif
2693 	credits = G_RSPD_TXQ1_CR(flags);
2694 	if (credits)
2695 		qs->txq[TXQ_OFLD].processed += credits;
2696 
2697 }
2698 
2699 static void
2700 check_ring_db(adapter_t *adap, struct sge_qset *qs,
2701     unsigned int sleeping)
2702 {
2703 	;
2704 }
2705 
2706 /**
2707  *	process_responses - process responses from an SGE response queue
2708  *	@adap: the adapter
2709  *	@qs: the queue set to which the response queue belongs
2710  *	@budget: how many responses can be processed in this round
2711  *
2712  *	Process responses from an SGE response queue up to the supplied budget.
2713  *	Responses include received packets as well as credits and other events
2714  *	for the queues that belong to the response queue's queue set.
2715  *	A negative budget is effectively unlimited.
2716  *
2717  *	Additionally choose the interrupt holdoff time for the next interrupt
2718  *	on this queue.  If the system is under memory shortage use a fairly
2719  *	long delay to help recovery.
2720  */
2721 int
2722 process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
2723 {
2724 	struct sge_rspq *rspq = &qs->rspq;
2725 	struct rsp_desc *r = &rspq->desc[rspq->cidx];
2726 	int budget_left = budget;
2727 	unsigned int sleeping = 0;
2728 	int lro = qs->lro.enabled;
2729 	struct mbuf *offload_mbufs[RX_BUNDLE_SIZE];
2730 	int ngathered = 0;
2731 #ifdef DEBUG
2732 	static int last_holdoff = 0;
2733 	if (cxgb_debug && rspq->holdoff_tmr != last_holdoff) {
2734 		printf("next_holdoff=%d\n", rspq->holdoff_tmr);
2735 		last_holdoff = rspq->holdoff_tmr;
2736 	}
2737 #endif
2738 	rspq->next_holdoff = rspq->holdoff_tmr;
2739 
2740 	while (__predict_true(budget_left && is_new_response(r, rspq))) {
2741 		int eth, eop = 0, ethpad = 0;
2742 		uint32_t flags = ntohl(r->flags);
2743 		uint32_t rss_csum = *(const uint32_t *)r;
2744 		uint32_t rss_hash = be32toh(r->rss_hdr.rss_hash_val);
2745 
2746 		eth = (r->rss_hdr.opcode == CPL_RX_PKT);
2747 
2748 		if (__predict_false(flags & F_RSPD_ASYNC_NOTIF)) {
2749 			/* XXX */
2750 			printf("async notification\n");
2751 
2752 		} else if  (flags & F_RSPD_IMM_DATA_VALID) {
2753 #ifdef DISABLE_MBUF_IOVEC
2754 
2755 			DPRINTF("IMM DATA VALID opcode=0x%x rspq->cidx=%d\n", r->rss_hdr.opcode, rspq->cidx);
2756 
2757 			if(get_imm_packet(adap, r, &rspq->rspq_mh) == 0) {
2758 				rspq->next_holdoff = NOMEM_INTR_DELAY;
2759 				budget_left--;
2760 				break;
2761 			} else {
2762 				eop = 1;
2763 			}
2764 #else
2765 			struct mbuf *m = NULL;
2766 
2767 			DPRINTF("IMM DATA VALID opcode=0x%x rspq->cidx=%d\n", r->rss_hdr.opcode, rspq->cidx);
2768 			if (rspq->rspq_mbuf == NULL)
2769 				rspq->rspq_mbuf = m_gethdr(M_DONTWAIT, MT_DATA);
2770                         else
2771 				m = m_gethdr(M_DONTWAIT, MT_DATA);
2772 
2773 			/*
2774 			 * XXX revisit me
2775 			 */
2776 			if (rspq->rspq_mbuf == NULL &&  m == NULL) {
2777 				rspq->next_holdoff = NOMEM_INTR_DELAY;
2778 				budget_left--;
2779 				break;
2780 			}
2781 			get_imm_packet(adap, r, rspq->rspq_mbuf, m, flags);
2782 
2783 			eop = 1;
2784 			rspq->imm_data++;
2785 #endif
2786 		} else if (r->len_cq) {
2787 			int drop_thresh = eth ? SGE_RX_DROP_THRES : 0;
2788 
2789 #ifdef DISABLE_MBUF_IOVEC
2790 			struct mbuf *m;
2791 			m = m_gethdr(M_DONTWAIT, MT_DATA);
2792 
2793 			if (m == NULL) {
2794 				log(LOG_WARNING, "failed to get mbuf for packet\n");
2795 				break;
2796 			} else {
2797 				m->m_next = m->m_nextpkt = NULL;
2798 			}
2799 
2800 			eop = get_packet(adap, drop_thresh, qs, &rspq->rspq_mh, r, m);
2801 #else
2802 			eop = get_packet(adap, drop_thresh, qs, &rspq->rspq_mbuf, r);
2803 #ifdef IFNET_MULTIQUEUE
2804 			rspq->rspq_mbuf->m_pkthdr.rss_hash = rss_hash;
2805 #endif
2806 #endif
2807 			ethpad = 2;
2808 		} else {
2809 			DPRINTF("pure response\n");
2810 			rspq->pure_rsps++;
2811 		}
2812 		if (flags & RSPD_CTRL_MASK) {
2813 			sleeping |= flags & RSPD_GTS_MASK;
2814 			handle_rsp_cntrl_info(qs, flags);
2815 		}
2816 
2817 		r++;
2818 		if (__predict_false(++rspq->cidx == rspq->size)) {
2819 			rspq->cidx = 0;
2820 			rspq->gen ^= 1;
2821 			r = rspq->desc;
2822 		}
2823 		prefetch(r);
2824 		if (++rspq->credits >= (rspq->size / 4)) {
2825 			refill_rspq(adap, rspq, rspq->credits);
2826 			rspq->credits = 0;
2827 		}
2828 		DPRINTF("eth=%d eop=%d flags=0x%x\n", eth, eop, flags);
2829 
2830 		if (!eth && eop) {
2831 			rspq->rspq_mh.mh_head->m_pkthdr.csum_data = rss_csum;
2832 			/*
2833 			 * XXX size mismatch
2834 			 */
2835 			m_set_priority(rspq->rspq_mh.mh_head, rss_hash);
2836 
2837 			ngathered = rx_offload(&adap->tdev, rspq,
2838 			    rspq->rspq_mh.mh_head, offload_mbufs, ngathered);
2839 			rspq->rspq_mh.mh_head = NULL;
2840 			DPRINTF("received offload packet\n");
2841 
2842 		} else if (eth && eop) {
2843 			prefetch(mtod(rspq->rspq_mh.mh_head, uint8_t *));
2844 			prefetch(mtod(rspq->rspq_mh.mh_head, uint8_t *) + L1_CACHE_BYTES);
2845 
2846 			t3_rx_eth_lro(adap, rspq, rspq->rspq_mh.mh_head, ethpad,
2847 			    rss_hash, rss_csum, lro);
2848 			DPRINTF("received tunnel packet\n");
2849 				rspq->rspq_mh.mh_head = NULL;
2850 
2851 		}
2852 		__refill_fl_lt(adap, &qs->fl[0], 32);
2853 		__refill_fl_lt(adap, &qs->fl[1], 32);
2854 		--budget_left;
2855 	}
2856 
2857 	deliver_partial_bundle(&adap->tdev, rspq, offload_mbufs, ngathered);
2858 	t3_lro_flush(adap, qs, &qs->lro);
2859 
2860 	if (sleeping)
2861 		check_ring_db(adap, qs, sleeping);
2862 
2863 	smp_mb();  /* commit Tx queue processed updates */
2864 	if (__predict_false(qs->txq_stopped != 0)) {
2865 		printf("restarting tx on %p\n", qs);
2866 
2867 		restart_tx(qs);
2868 	}
2869 
2870 	__refill_fl_lt(adap, &qs->fl[0], 512);
2871 	__refill_fl_lt(adap, &qs->fl[1], 512);
2872 	budget -= budget_left;
2873 	return (budget);
2874 }
2875 
2876 /*
2877  * A helper function that processes responses and issues GTS.
2878  */
2879 static __inline int
2880 process_responses_gts(adapter_t *adap, struct sge_rspq *rq)
2881 {
2882 	int work;
2883 	static int last_holdoff = 0;
2884 
2885 	work = process_responses(adap, rspq_to_qset(rq), -1);
2886 
2887 	if (cxgb_debug && (rq->next_holdoff != last_holdoff)) {
2888 		printf("next_holdoff=%d\n", rq->next_holdoff);
2889 		last_holdoff = rq->next_holdoff;
2890 	}
2891 	if (work)
2892 		t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) |
2893 		    V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx));
2894 	return work;
2895 }
2896 
2897 
2898 /*
2899  * Interrupt handler for legacy INTx interrupts for T3B-based cards.
2900  * Handles data events from SGE response queues as well as error and other
2901  * async events as they all use the same interrupt pin.  We use one SGE
2902  * response queue per port in this mode and protect all response queues with
2903  * queue 0's lock.
2904  */
2905 void
2906 t3b_intr(void *data)
2907 {
2908 	uint32_t i, map;
2909 	adapter_t *adap = data;
2910 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
2911 
2912 	t3_write_reg(adap, A_PL_CLI, 0);
2913 	map = t3_read_reg(adap, A_SG_DATA_INTR);
2914 
2915 	if (!map)
2916 		return;
2917 
2918 	if (__predict_false(map & F_ERRINTR))
2919 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
2920 
2921 	mtx_lock(&q0->lock);
2922 	for_each_port(adap, i)
2923 	    if (map & (1 << i))
2924 			process_responses_gts(adap, &adap->sge.qs[i].rspq);
2925 	mtx_unlock(&q0->lock);
2926 }
2927 
2928 /*
2929  * The MSI interrupt handler.  This needs to handle data events from SGE
2930  * response queues as well as error and other async events as they all use
2931  * the same MSI vector.  We use one SGE response queue per port in this mode
2932  * and protect all response queues with queue 0's lock.
2933  */
2934 void
2935 t3_intr_msi(void *data)
2936 {
2937 	adapter_t *adap = data;
2938 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
2939 	int i, new_packets = 0;
2940 
2941 	mtx_lock(&q0->lock);
2942 
2943 	for_each_port(adap, i)
2944 	    if (process_responses_gts(adap, &adap->sge.qs[i].rspq))
2945 		    new_packets = 1;
2946 	mtx_unlock(&q0->lock);
2947 	if (new_packets == 0)
2948 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
2949 }
2950 
2951 void
2952 t3_intr_msix(void *data)
2953 {
2954 	struct sge_qset *qs = data;
2955 	adapter_t *adap = qs->port->adapter;
2956 	struct sge_rspq *rspq = &qs->rspq;
2957 
2958 	if (mtx_trylock(&rspq->lock)) {
2959 		if (process_responses_gts(adap, rspq) == 0)
2960 			rspq->unhandled_irqs++;
2961 		mtx_unlock(&rspq->lock);
2962 	}
2963 }
2964 
2965 /*
2966  * broken by recent mbuf changes
2967  */
2968 static int
2969 t3_lro_enable(SYSCTL_HANDLER_ARGS)
2970 {
2971 	adapter_t *sc;
2972 	int i, j, enabled, err, nqsets = 0;
2973 
2974 #ifndef LRO_WORKING
2975 	return (0);
2976 #endif
2977 
2978 	sc = arg1;
2979 	enabled = sc->sge.qs[0].lro.enabled;
2980         err = sysctl_handle_int(oidp, &enabled, arg2, req);
2981 
2982 	if (err != 0)
2983 		return (err);
2984 	if (enabled == sc->sge.qs[0].lro.enabled)
2985 		return (0);
2986 
2987 	for (i = 0; i < sc->params.nports; i++)
2988 		for (j = 0; j < sc->port[i].nqsets; j++)
2989 			nqsets++;
2990 
2991 	for (i = 0; i < nqsets; i++)
2992 		sc->sge.qs[i].lro.enabled = enabled;
2993 
2994 	return (0);
2995 }
2996 
2997 static int
2998 t3_set_coalesce_nsecs(SYSCTL_HANDLER_ARGS)
2999 {
3000 	adapter_t *sc = arg1;
3001 	struct qset_params *qsp = &sc->params.sge.qset[0];
3002 	int coalesce_nsecs;
3003 	struct sge_qset *qs;
3004 	int i, j, err, nqsets = 0;
3005 	struct mtx *lock;
3006 
3007 	if ((sc->flags & FULL_INIT_DONE) == 0)
3008 		return (ENXIO);
3009 
3010 	coalesce_nsecs = qsp->coalesce_nsecs;
3011         err = sysctl_handle_int(oidp, &coalesce_nsecs, arg2, req);
3012 
3013 	if (err != 0) {
3014 		return (err);
3015 	}
3016 	if (coalesce_nsecs == qsp->coalesce_nsecs)
3017 		return (0);
3018 
3019 	for (i = 0; i < sc->params.nports; i++)
3020 		for (j = 0; j < sc->port[i].nqsets; j++)
3021 			nqsets++;
3022 
3023 	coalesce_nsecs = max(100, coalesce_nsecs);
3024 
3025 	for (i = 0; i < nqsets; i++) {
3026 		qs = &sc->sge.qs[i];
3027 		qsp = &sc->params.sge.qset[i];
3028 		qsp->coalesce_nsecs = coalesce_nsecs;
3029 
3030 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
3031 			    &sc->sge.qs[0].rspq.lock;
3032 
3033 		mtx_lock(lock);
3034 		t3_update_qset_coalesce(qs, qsp);
3035 		t3_write_reg(sc, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) |
3036 		    V_NEWTIMER(qs->rspq.holdoff_tmr));
3037 		mtx_unlock(lock);
3038 	}
3039 
3040 	return (0);
3041 }
3042 
3043 
3044 void
3045 t3_add_attach_sysctls(adapter_t *sc)
3046 {
3047 	struct sysctl_ctx_list *ctx;
3048 	struct sysctl_oid_list *children;
3049 
3050 	ctx = device_get_sysctl_ctx(sc->dev);
3051 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3052 
3053 	/* random information */
3054 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
3055 	    "firmware_version",
3056 	    CTLFLAG_RD, &sc->fw_version,
3057 	    0, "firmware version");
3058 
3059 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
3060 	    "enable_lro",
3061 	    CTLTYPE_INT|CTLFLAG_RW, sc,
3062 	    0, t3_lro_enable,
3063 	    "I", "enable large receive offload");
3064 
3065 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3066 	    "enable_debug",
3067 	    CTLFLAG_RW, &cxgb_debug,
3068 	    0, "enable verbose debugging output");
3069 	SYSCTL_ADD_ULONG(ctx, children, OID_AUTO, "tunq_coalesce",
3070 	    CTLFLAG_RD, &sc->tunq_coalesce,
3071 	    "#tunneled packets freed");
3072 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3073 	    "txq_overrun",
3074 	    CTLFLAG_RD, &txq_fills,
3075 	    0, "#times txq overrun");
3076 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3077 	    "bogus_imm",
3078 	    CTLFLAG_RD, &bogus_imm,
3079 	    0, "#times a bogus immediate response was seen");
3080 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3081 	    "cache_alloc",
3082 	    CTLFLAG_RD, &cxgb_cached_allocations,
3083 	    0, "#times a cluster was allocated from cache");
3084 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3085 	    "cached",
3086 	    CTLFLAG_RD, &cxgb_cached,
3087 	    0, "#times a cluster was cached");
3088 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3089 	    "ext_freed",
3090 	    CTLFLAG_RD, &cxgb_ext_freed,
3091 	    0, "#times a cluster was freed through ext_free");
3092 
3093 }
3094 
3095 void
3096 t3_add_configured_sysctls(adapter_t *sc)
3097 {
3098 	struct sysctl_ctx_list *ctx;
3099 	struct sysctl_oid_list *children;
3100 	int i, j;
3101 
3102 	ctx = device_get_sysctl_ctx(sc->dev);
3103 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3104 
3105 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
3106 	    "intr_coal",
3107 	    CTLTYPE_INT|CTLFLAG_RW, sc,
3108 	    0, t3_set_coalesce_nsecs,
3109 	    "I", "interrupt coalescing timer (ns)");
3110 
3111 	for (i = 0; i < sc->params.nports; i++) {
3112 		struct port_info *pi = &sc->port[i];
3113 		struct sysctl_oid *poid;
3114 		struct sysctl_oid_list *poidlist;
3115 
3116 		snprintf(pi->namebuf, PORT_NAME_LEN, "port%d", i);
3117 		poid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO,
3118 		    pi->namebuf, CTLFLAG_RD, NULL, "port statistics");
3119 		poidlist = SYSCTL_CHILDREN(poid);
3120 		SYSCTL_ADD_INT(ctx, poidlist, OID_AUTO,
3121 		    "nqsets", CTLFLAG_RD, &pi->nqsets,
3122 		    0, "#queue sets");
3123 
3124 		for (j = 0; j < pi->nqsets; j++) {
3125 			struct sge_qset *qs = &sc->sge.qs[pi->first_qset + j];
3126 			struct sysctl_oid *qspoid;
3127 			struct sysctl_oid_list *qspoidlist;
3128 			struct sge_txq *txq = &qs->txq[TXQ_ETH];
3129 
3130 			snprintf(qs->namebuf, QS_NAME_LEN, "qs%d", j);
3131 
3132 			qspoid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO,
3133 			    qs->namebuf, CTLFLAG_RD, NULL, "qset statistics");
3134 			qspoidlist = SYSCTL_CHILDREN(qspoid);
3135 
3136 			SYSCTL_ADD_INT(ctx, qspoidlist, OID_AUTO, "dropped",
3137 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].txq_drops,
3138 			    0, "#tunneled packets dropped");
3139 			SYSCTL_ADD_INT(ctx, qspoidlist, OID_AUTO, "sendqlen",
3140 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].sendq.qlen,
3141 			    0, "#tunneled packets waiting to be sent");
3142 			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "queue_pidx",
3143 			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_prod,
3144 			    0, "#tunneled packets queue producer index");
3145 			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "queue_cidx",
3146 			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_cons,
3147 			    0, "#tunneled packets queue consumer index");
3148 			SYSCTL_ADD_INT(ctx, qspoidlist, OID_AUTO, "processed",
3149 			    CTLFLAG_RD, &qs->txq[TXQ_ETH].processed,
3150 			    0, "#tunneled packets processed by the card");
3151 			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "cleaned",
3152 			    CTLFLAG_RD, &txq->cleaned,
3153 			    0, "#tunneled packets cleaned");
3154 			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "in_use",
3155 			    CTLFLAG_RD, &txq->in_use,
3156 			    0, "#tunneled packet slots in use");
3157 			SYSCTL_ADD_ULONG(ctx, qspoidlist, OID_AUTO, "frees",
3158 			    CTLFLAG_RD, &txq->txq_frees,
3159 			    "#tunneled packets freed");
3160 			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "skipped",
3161 			    CTLFLAG_RD, &txq->txq_skipped,
3162 			    0, "#tunneled packet descriptors skipped");
3163 			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "coalesced",
3164 			    CTLFLAG_RD, &txq->txq_coalesced,
3165 			    0, "#tunneled packets coalesced");
3166 			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "enqueued",
3167 			    CTLFLAG_RD, &txq->txq_enqueued,
3168 			    0, "#tunneled packets enqueued to hardware");
3169 			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "stopped_flags",
3170 			    CTLFLAG_RD, &qs->txq_stopped,
3171 			    0, "tx queues stopped");
3172 
3173 		}
3174 	}
3175 }
3176 
3177 /**
3178  *	t3_get_desc - dump an SGE descriptor for debugging purposes
3179  *	@qs: the queue set
3180  *	@qnum: identifies the specific queue (0..2: Tx, 3:response, 4..5: Rx)
3181  *	@idx: the descriptor index in the queue
3182  *	@data: where to dump the descriptor contents
3183  *
3184  *	Dumps the contents of a HW descriptor of an SGE queue.  Returns the
3185  *	size of the descriptor.
3186  */
3187 int
3188 t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx,
3189 		unsigned char *data)
3190 {
3191 	if (qnum >= 6)
3192 		return (EINVAL);
3193 
3194 	if (qnum < 3) {
3195 		if (!qs->txq[qnum].desc || idx >= qs->txq[qnum].size)
3196 			return -EINVAL;
3197 		memcpy(data, &qs->txq[qnum].desc[idx], sizeof(struct tx_desc));
3198 		return sizeof(struct tx_desc);
3199 	}
3200 
3201 	if (qnum == 3) {
3202 		if (!qs->rspq.desc || idx >= qs->rspq.size)
3203 			return (EINVAL);
3204 		memcpy(data, &qs->rspq.desc[idx], sizeof(struct rsp_desc));
3205 		return sizeof(struct rsp_desc);
3206 	}
3207 
3208 	qnum -= 4;
3209 	if (!qs->fl[qnum].desc || idx >= qs->fl[qnum].size)
3210 		return (EINVAL);
3211 	memcpy(data, &qs->fl[qnum].desc[idx], sizeof(struct rx_desc));
3212 	return sizeof(struct rx_desc);
3213 }
3214