xref: /freebsd/sys/dev/cxgb/cxgb_sge.c (revision 18242d3b09dbc3f5e278e39baaa3c3b76624c901)
1 /**************************************************************************
2 
3 Copyright (c) 2007, Chelsio Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Redistributions in binary form must reproduce the above copyright
13     notice, this list of conditions and the following disclaimer in the
14     documentation and/or other materials provided with the distribution.
15 
16  3. Neither the name of the Chelsio Corporation nor the names of its
17     contributors may be used to endorse or promote products derived from
18     this software without specific prior written permission.
19 
20 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
24 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 POSSIBILITY OF SUCH DAMAGE.
31 
32 ***************************************************************************/
33 
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36 
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/module.h>
41 #include <sys/bus.h>
42 #include <sys/conf.h>
43 #include <machine/bus.h>
44 #include <machine/resource.h>
45 #include <sys/bus_dma.h>
46 #include <sys/rman.h>
47 #include <sys/queue.h>
48 #include <sys/sysctl.h>
49 #include <sys/taskqueue.h>
50 
51 
52 #include <sys/proc.h>
53 #include <sys/sched.h>
54 #include <sys/smp.h>
55 #include <sys/systm.h>
56 
57 #include <netinet/in_systm.h>
58 #include <netinet/in.h>
59 #include <netinet/ip.h>
60 #include <netinet/tcp.h>
61 
62 #include <dev/pci/pcireg.h>
63 #include <dev/pci/pcivar.h>
64 #include <dev/cxgb/common/cxgb_common.h>
65 #include <dev/cxgb/common/cxgb_regs.h>
66 #include <dev/cxgb/common/cxgb_sge_defs.h>
67 #include <dev/cxgb/common/cxgb_t3_cpl.h>
68 #include <dev/cxgb/common/cxgb_firmware_exports.h>
69 
70 #include <dev/cxgb/sys/mvec.h>
71 
72 uint32_t collapse_free = 0;
73 uint32_t mb_free_vec_free = 0;
74 int      collapse_mbufs = 1;
75 
76 #define USE_GTS 0
77 
78 #define SGE_RX_SM_BUF_SIZE	1536
79 #define SGE_RX_DROP_THRES	16
80 
81 /*
82  * Period of the Tx buffer reclaim timer.  This timer does not need to run
83  * frequently as Tx buffers are usually reclaimed by new Tx packets.
84  */
85 #define TX_RECLAIM_PERIOD       (hz >> 2)
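/*
 * With the common default of hz = 1000 this evaluates to 250 ticks, i.e. the
 * reclaim timer fires roughly every 250 ms.
 */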
86 
87 /*
88  * Work request size in bytes (one flit is an 8-byte word)
89  */
90 #define WR_LEN (WR_FLITS * 8)
91 
92 /*
93  * Values for sge_txq.flags
94  */
95 enum {
96 	TXQ_RUNNING	= 1 << 0,  /* fetch engine is running */
97 	TXQ_LAST_PKT_DB = 1 << 1,  /* last packet rang the doorbell */
98 };
99 
100 struct tx_desc {
101 	uint64_t	flit[TX_DESC_FLITS];
102 } __packed;
103 
104 struct rx_desc {
105 	uint32_t	addr_lo;
106 	uint32_t	len_gen;
107 	uint32_t	gen2;
108 	uint32_t	addr_hi;
109 } __packed;
110 
111 struct rsp_desc {               /* response queue descriptor */
112 	struct rss_header	rss_hdr;
113 	uint32_t		flags;
114 	uint32_t		len_cq;
115 	uint8_t			imm_data[47];
116 	uint8_t			intr_gen;
117 } __packed;
118 
119 #define RX_SW_DESC_MAP_CREATED	(1 << 0)
120 #define TX_SW_DESC_MAP_CREATED	(1 << 1)
121 #define RX_SW_DESC_INUSE        (1 << 3)
122 #define TX_SW_DESC_MAPPED       (1 << 4)
123 
124 #define RSPQ_NSOP_NEOP           G_RSPD_SOP_EOP(0)
125 #define RSPQ_EOP                 G_RSPD_SOP_EOP(F_RSPD_EOP)
126 #define RSPQ_SOP                 G_RSPD_SOP_EOP(F_RSPD_SOP)
127 #define RSPQ_SOP_EOP             G_RSPD_SOP_EOP(F_RSPD_SOP|F_RSPD_EOP)
128 
129 struct tx_sw_desc {                /* SW state per Tx descriptor */
130 	struct mbuf	*m;
131 	bus_dmamap_t	map;
132 	int		flags;
133 };
134 
135 struct rx_sw_desc {                /* SW state per Rx descriptor */
136 	void	        *cl;
137 	bus_dmamap_t	map;
138 	int		flags;
139 };
140 
141 struct txq_state {
142 	unsigned int compl;
143 	unsigned int gen;
144 	unsigned int pidx;
145 };
146 
147 struct refill_fl_cb_arg {
148 	int               error;
149 	bus_dma_segment_t seg;
150 	int               nseg;
151 };
152 
153 /*
154  * Maps a number of flits to the number of Tx descriptors that can hold them.
155  * The formula is
156  *
157  * desc = 1 + (flits - 2) / (WR_FLITS - 1).
158  *
159  * HW allows up to 4 descriptors to be combined into a WR.
160  */
161 static uint8_t flit_desc_map[] = {
162 	0,
163 #if SGE_NUM_GENBITS == 1
164 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
165 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
166 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
167 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
168 #elif SGE_NUM_GENBITS == 2
169 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
170 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
171 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
172 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
173 #else
174 # error "SGE_NUM_GENBITS must be 1 or 2"
175 #endif
176 };
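
/*
 * Worked example: the SGE_NUM_GENBITS == 2 table above is consistent with
 * WR_FLITS == 15, so a 16-flit work request needs
 * 1 + (16 - 2) / (15 - 1) == 2 descriptors (flit_desc_map[16] == 2), while
 * anything up to 15 flits fits in a single descriptor.
 */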
177 
178 
179 static int lro_default = 0;
180 int cxgb_debug = 0;
181 
182 static void t3_free_qset(adapter_t *sc, struct sge_qset *q);
183 static void sge_timer_cb(void *arg);
184 static void sge_timer_reclaim(void *arg, int ncount);
185 static int free_tx_desc(adapter_t *sc, struct sge_txq *q, int n, struct mbuf **m_vec);
186 
187 /**
188  *	reclaim_completed_tx - reclaims completed Tx descriptors
189  *	@adapter: the adapter
190  *	@q: the Tx queue to reclaim completed descriptors from
191  *
192  *	Reclaims Tx descriptors that the SGE has indicated it has processed and
193  *	collects the associated mbufs (up to @nbufs) into @mvec for the caller to
194  *	free.  Returns the number collected.  Called with the Tx queue lock held.
195  */
196 static __inline int
197 reclaim_completed_tx(adapter_t *adapter, struct sge_txq *q, int nbufs, struct mbuf **mvec)
198 {
199 	int reclaimed, reclaim = desc_reclaimable(q);
200 	int n = 0;
201 
202 	mtx_assert(&q->lock, MA_OWNED);
203 
204 	if (reclaim > 0) {
205 		n = free_tx_desc(adapter, q, min(reclaim, nbufs), mvec);
206 		reclaimed = min(reclaim, nbufs);
207 		q->cleaned += reclaimed;
208 		q->in_use -= reclaimed;
209 	}
210 
211 	return (n);
212 }
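
/*
 * Typical calling pattern (as in sge_timer_reclaim() below): reclaim under
 * the Tx queue lock, then free the collected mbufs after dropping it:
 *
 *	mtx_lock(&txq->lock);
 *	n = reclaim_completed_tx(sc, txq, TX_CLEAN_MAX_DESC, m_vec);
 *	mtx_unlock(&txq->lock);
 *	for (j = 0; j < n; j++)
 *		m_freem_vec(m_vec[j]);
 */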
213 
214 /**
215  *	t3_sge_init - initialize SGE
216  *	@adap: the adapter
217  *	@p: the SGE parameters
218  *
219  *	Performs SGE initialization needed every time after a chip reset.
220  *	We do not initialize any of the queue sets here; instead, the driver's
221  *	top level must request those individually.  We also do not enable DMA
222  *	here; that should be done after the queues have been set up.
223  */
224 void
225 t3_sge_init(adapter_t *adap, struct sge_params *p)
226 {
227 	u_int ctrl, ups;
228 
229 	ups = 0; /* = ffs(pci_resource_len(adap->pdev, 2) >> 12); */
230 
231 	ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL |
232 	       F_CQCRDTCTRL |
233 	       V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS |
234 	       V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING;
235 #if SGE_NUM_GENBITS == 1
236 	ctrl |= F_EGRGENCTRL;
237 #endif
238 	if (adap->params.rev > 0) {
239 		if (!(adap->flags & (USING_MSIX | USING_MSI)))
240 			ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ;
241 		ctrl |= F_CQCRDTCTRL | F_AVOIDCQOVFL;
242 	}
243 	t3_write_reg(adap, A_SG_CONTROL, ctrl);
244 	t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) |
245 		     V_LORCQDRBTHRSH(512));
246 	t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10);
247 	t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) |
248 		     V_TIMEOUT(200 * core_ticks_per_usec(adap)));
249 	t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH, 1000);
250 	t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256);
251 	t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000);
252 	t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256);
253 	t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff));
254 	t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024);
255 }
256 
257 
258 /**
259  *	sgl_len - calculates the size of an SGL of the given capacity
260  *	@n: the number of SGL entries
261  *
262  *	Calculates the number of flits needed for a scatter/gather list that
263  *	can hold the given number of entries.
264  */
265 static __inline unsigned int
266 sgl_len(unsigned int n)
267 {
268 	return ((3 * n) / 2 + (n & 1));
269 }
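
/*
 * Each struct sg_ent packs two address/length pairs into 3 flits (one flit
 * holding both 32-bit lengths plus one flit per 64-bit address), and an odd
 * trailing entry needs only 2 flits.  For example,
 * sgl_len(3) == (3 * 3) / 2 + 1 == 5 flits.
 */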
270 
271 /**
272  *	get_imm_packet - return the next ingress packet buffer from a response
273  *	@resp: the response descriptor containing the packet data
274  *
275  *	Copies the immediate data of the given response into the supplied mbuf.
276  */
277 static __inline void
278 get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct mbuf *m, void *cl)
279 {
280 	int len;
281 	uint32_t flags = ntohl(resp->flags);
282 	uint8_t sopeop = G_RSPD_SOP_EOP(flags);
283 
284 	/*
285 	 * A response without EOP here would indicate a firmware bug
286 	 */
287 	if (sopeop == RSPQ_NSOP_NEOP || sopeop == RSPQ_SOP)
288 		return;
289 
290 	len = G_RSPD_LEN(ntohl(resp->len_cq));
291 	switch (sopeop) {
292 	case RSPQ_SOP_EOP:
293 		m->m_len = m->m_pkthdr.len = len;
294 		memcpy(m->m_data, resp->imm_data, len);
295 		break;
296 	case RSPQ_EOP:
297 		memcpy(cl, resp->imm_data, len);
298 		m_iovappend(m, cl, MSIZE, len, 0);
299 		break;
300 	}
301 }
302 
303 
304 static __inline u_int
305 flits_to_desc(u_int n)
306 {
307 	return (flit_desc_map[n]);
308 }
309 
310 void
311 t3_sge_err_intr_handler(adapter_t *adapter)
312 {
313 	unsigned int v, status;
314 
315 
316 	status = t3_read_reg(adapter, A_SG_INT_CAUSE);
317 
318 	if (status & F_RSPQCREDITOVERFOW)
319 		CH_ALERT(adapter, "SGE response queue credit overflow\n");
320 
321 	if (status & F_RSPQDISABLED) {
322 		v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS);
323 
324 		CH_ALERT(adapter,
325 			 "packet delivered to disabled response queue (0x%x)\n",
326 			 (v >> S_RSPQ0DISABLED) & 0xff);
327 	}
328 
329 	t3_write_reg(adapter, A_SG_INT_CAUSE, status);
330 	if (status & (F_RSPQCREDITOVERFOW | F_RSPQDISABLED))
331 		t3_fatal_err(adapter);
332 }
333 
334 void
335 t3_sge_prep(adapter_t *adap, struct sge_params *p)
336 {
337 	int i;
338 
339 	/* XXX Does ETHER_ALIGN need to be accounted for here? */
340 	p->max_pkt_size = MJUM16BYTES - sizeof(struct cpl_rx_data);
341 
342 	for (i = 0; i < SGE_QSETS; ++i) {
343 		struct qset_params *q = p->qset + i;
344 
345 		q->polling = adap->params.rev > 0;
346 
347 		if (adap->flags & USING_MSIX)
348 			q->coalesce_nsecs = 6000;
349 		else
350 			q->coalesce_nsecs = 3500;
351 
352 		q->rspq_size = RSPQ_Q_SIZE;
353 		q->fl_size = FL_Q_SIZE;
354 		q->jumbo_size = JUMBO_Q_SIZE;
355 		q->txq_size[TXQ_ETH] = TX_ETH_Q_SIZE;
356 		q->txq_size[TXQ_OFLD] = 1024;
357 		q->txq_size[TXQ_CTRL] = 256;
358 		q->cong_thres = 0;
359 	}
360 }
361 
362 int
363 t3_sge_alloc(adapter_t *sc)
364 {
365 
366 	/* The parent tag. */
367 	if (bus_dma_tag_create( NULL,			/* parent */
368 				1, 0,			/* algnmnt, boundary */
369 				BUS_SPACE_MAXADDR,	/* lowaddr */
370 				BUS_SPACE_MAXADDR,	/* highaddr */
371 				NULL, NULL,		/* filter, filterarg */
372 				BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
373 				BUS_SPACE_UNRESTRICTED, /* nsegments */
374 				BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
375 				0,			/* flags */
376 				NULL, NULL,		/* lock, lockarg */
377 				&sc->parent_dmat)) {
378 		device_printf(sc->dev, "Cannot allocate parent DMA tag\n");
379 		return (ENOMEM);
380 	}
381 
382 	/*
383 	 * DMA tag for normal sized RX frames
384 	 */
385 	if (bus_dma_tag_create(sc->parent_dmat, MCLBYTES, 0, BUS_SPACE_MAXADDR,
386 		BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, 1,
387 		MCLBYTES, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_dmat)) {
388 		device_printf(sc->dev, "Cannot allocate RX DMA tag\n");
389 		return (ENOMEM);
390 	}
391 
392 	/*
393 	 * DMA tag for jumbo sized RX frames.
394 	 */
395 	if (bus_dma_tag_create(sc->parent_dmat, MJUMPAGESIZE, 0, BUS_SPACE_MAXADDR,
396 		BUS_SPACE_MAXADDR, NULL, NULL, MJUMPAGESIZE, 1, MJUMPAGESIZE,
397 		BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_jumbo_dmat)) {
398 		device_printf(sc->dev, "Cannot allocate RX jumbo DMA tag\n");
399 		return (ENOMEM);
400 	}
401 
402 	/*
403 	 * DMA tag for TX frames.
404 	 */
405 	if (bus_dma_tag_create(sc->parent_dmat, 1, 0, BUS_SPACE_MAXADDR,
406 		BUS_SPACE_MAXADDR, NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
407 		TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
408 		NULL, NULL, &sc->tx_dmat)) {
409 		device_printf(sc->dev, "Cannot allocate TX DMA tag\n");
410 		return (ENOMEM);
411 	}
412 
413 	return (0);
414 }
415 
416 int
417 t3_sge_free(struct adapter * sc)
418 {
419 
420 	if (sc->tx_dmat != NULL)
421 		bus_dma_tag_destroy(sc->tx_dmat);
422 
423 	if (sc->rx_jumbo_dmat != NULL)
424 		bus_dma_tag_destroy(sc->rx_jumbo_dmat);
425 
426 	if (sc->rx_dmat != NULL)
427 		bus_dma_tag_destroy(sc->rx_dmat);
428 
429 	if (sc->parent_dmat != NULL)
430 		bus_dma_tag_destroy(sc->parent_dmat);
431 
432 	return (0);
433 }
434 
435 void
436 t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p)
437 {
438 
439 	qs->rspq.holdoff_tmr = max(p->coalesce_nsecs/100, 1U);
440 	qs->rspq.polling = 0 /* p->polling */;
441 }
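
/*
 * Example: t3_sge_prep() defaults coalesce_nsecs to 3500, giving a
 * holdoff_tmr of 35.  The SG timer tick is programmed in t3_sge_init() to
 * core_ticks_per_usec(adap) / 10, i.e. one tick per 100 ns, so this
 * corresponds to an interrupt holdoff of about 3.5 us.
 */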
442 
443 static void
444 refill_fl_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
445 {
446 	struct refill_fl_cb_arg *cb_arg = arg;
447 
448 	cb_arg->error = error;
449 	cb_arg->seg = segs[0];
450 	cb_arg->nseg = nseg;
451 
452 }
453 
454 /**
455  *	refill_fl - refill an SGE free-buffer list
456  *	@sc: the controller softc
457  *	@q: the free-list to refill
458  *	@n: the number of new buffers to allocate
459  *
460  *	(Re)populate an SGE free-buffer list with up to @n new packet buffers.
461  *	The caller must ensure that @n does not exceed the queue's capacity.
462  */
463 static void
464 refill_fl(adapter_t *sc, struct sge_fl *q, int n)
465 {
466 	struct rx_sw_desc *sd = &q->sdesc[q->pidx];
467 	struct rx_desc *d = &q->desc[q->pidx];
468 	struct refill_fl_cb_arg cb_arg;
469 	void *cl;
470 	int err;
471 
472 	cb_arg.error = 0;
473 	while (n--) {
474 		/*
475 		 * We only allocate a cluster; mbuf allocation happens after rx
476 		 */
477 		if ((cl = m_cljget(NULL, M_DONTWAIT, q->buf_size)) == NULL) {
478 			log(LOG_WARNING, "Failed to allocate cluster\n");
479 			goto done;
480 		}
481 		if ((sd->flags & RX_SW_DESC_MAP_CREATED) == 0) {
482 			if ((err = bus_dmamap_create(q->entry_tag, 0, &sd->map))) {
483 				log(LOG_WARNING, "bus_dmamap_create failed %d\n", err);
484 				/*
485 				 * XXX free cluster
486 				 */
487 				goto done;
488 			}
489 			sd->flags |= RX_SW_DESC_MAP_CREATED;
490 		}
491 		err = bus_dmamap_load(q->entry_tag, sd->map, cl, q->buf_size,
492 		    refill_fl_cb, &cb_arg, 0);
493 
494 		if (err != 0 || cb_arg.error) {
495 			log(LOG_WARNING, "failure in refill_fl %d\n", cb_arg.error);
496 			/*
497 			 * XXX free cluster
498 			 */
499 			return;
500 		}
501 
502 		sd->flags |= RX_SW_DESC_INUSE;
503 		sd->cl = cl;
504 		d->addr_lo = htobe32(cb_arg.seg.ds_addr & 0xffffffff);
505 		d->addr_hi = htobe32(((uint64_t)cb_arg.seg.ds_addr >>32) & 0xffffffff);
506 		d->len_gen = htobe32(V_FLD_GEN1(q->gen));
507 		d->gen2 = htobe32(V_FLD_GEN2(q->gen));
508 
509 		d++;
510 		sd++;
511 
512 		if (++q->pidx == q->size) {
513 			q->pidx = 0;
514 			q->gen ^= 1;
515 			sd = q->sdesc;
516 			d = q->desc;
517 		}
518 		q->credits++;
519 	}
520 
521 done:
522 	t3_write_reg(sc, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
523 }
524 
525 
526 /**
527  *	free_rx_bufs - free the Rx buffers on an SGE free list
528  *	@sc: the controller softc
529  *	@q: the SGE free list to clean up
530  *
531  *	Release the buffers on an SGE free-buffer Rx queue.  HW fetching from
532  *	this queue should be stopped before calling this function.
533  */
534 static void
535 free_rx_bufs(adapter_t *sc, struct sge_fl *q)
536 {
537 	u_int cidx = q->cidx;
538 
539 	while (q->credits--) {
540 		struct rx_sw_desc *d = &q->sdesc[cidx];
541 
542 		if (d->flags & RX_SW_DESC_INUSE) {
543 			bus_dmamap_unload(q->entry_tag, d->map);
544 			bus_dmamap_destroy(q->entry_tag, d->map);
545 			uma_zfree(q->zone, d->cl);
546 		}
547 		d->cl = NULL;
548 		if (++cidx == q->size)
549 			cidx = 0;
550 	}
551 }
552 
553 static __inline void
554 __refill_fl(adapter_t *adap, struct sge_fl *fl)
555 {
556 	refill_fl(adap, fl, min(16U, fl->size - fl->credits));
557 }
558 
559 static void
560 alloc_ring_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
561 {
562 	uint32_t *addr;
563 
564 	addr = arg;
565 	*addr = segs[0].ds_addr;
566 }
567 
568 static int
569 alloc_ring(adapter_t *sc, size_t nelem, size_t elem_size, size_t sw_size,
570     bus_addr_t *phys, void *desc, void *sdesc, bus_dma_tag_t *tag,
571     bus_dmamap_t *map, bus_dma_tag_t parent_entry_tag, bus_dma_tag_t *entry_tag)
572 {
573 	size_t len = nelem * elem_size;
574 	void *s = NULL;
575 	void *p = NULL;
576 	int err;
577 
578 	if ((err = bus_dma_tag_create(sc->parent_dmat, PAGE_SIZE, 0,
579 				      BUS_SPACE_MAXADDR_32BIT,
580 				      BUS_SPACE_MAXADDR, NULL, NULL, len, 1,
581 				      len, 0, NULL, NULL, tag)) != 0) {
582 		device_printf(sc->dev, "Cannot allocate descriptor tag\n");
583 		return (ENOMEM);
584 	}
585 
586 	if ((err = bus_dmamem_alloc(*tag, (void **)&p, BUS_DMA_NOWAIT,
587 				    map)) != 0) {
588 		device_printf(sc->dev, "Cannot allocate descriptor memory\n");
589 		return (ENOMEM);
590 	}
591 
592 	bus_dmamap_load(*tag, *map, p, len, alloc_ring_cb, phys, 0);
593 	bzero(p, len);
594 	*(void **)desc = p;
595 
596 	if (sw_size) {
597 		len = nelem * sw_size;
598 		s = malloc(len, M_DEVBUF, M_WAITOK);
599 		bzero(s, len);
600 		*(void **)sdesc = s;
601 	}
602 	if (parent_entry_tag == NULL)
603 		return (0);
604 
605 	if ((err = bus_dma_tag_create(parent_entry_tag, 1, 0,
606 				      BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR,
607 		                      NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
608 				      TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
609 		                      NULL, NULL, entry_tag)) != 0) {
610 		device_printf(sc->dev, "Cannot allocate descriptor entry tag\n");
611 		return (ENOMEM);
612 	}
613 	return (0);
614 }
615 
616 static void
617 sge_slow_intr_handler(void *arg, int ncount)
618 {
619 	adapter_t *sc = arg;
620 
621 	t3_slow_intr_handler(sc);
622 }
623 
624 static void
625 sge_timer_cb(void *arg)
626 {
627 	adapter_t *sc = arg;
628 	struct sge_qset *qs;
629 	struct sge_txq  *txq;
630 	int i, j;
631 	int reclaim_eth, reclaim_ofl, refill_rx;
632 
633 	for (i = 0; i < sc->params.nports; i++)
634 		for (j = 0; j < sc->port[i].nqsets; j++) {
635 			qs = &sc->sge.qs[i + j];
636 			txq = &qs->txq[0];
637 			reclaim_eth = txq[TXQ_ETH].processed - txq[TXQ_ETH].cleaned;
638 			reclaim_ofl = txq[TXQ_OFLD].processed - txq[TXQ_OFLD].cleaned;
639 			refill_rx = ((qs->fl[0].credits < qs->fl[0].size) ||
640 			    (qs->fl[1].credits < qs->fl[1].size));
641 			if (reclaim_eth || reclaim_ofl || refill_rx) {
642 				taskqueue_enqueue(sc->tq, &sc->timer_reclaim_task);
643 				goto done;
644 			}
645 		}
646 done:
647 	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
648 }
649 
650 /*
651  * This is meant to be a catch-all function to keep sge state private
652  * to sge.c
653  *
654  */
655 int
656 t3_sge_init_sw(adapter_t *sc)
657 {
658 
659 	callout_init(&sc->sge_timer_ch, CALLOUT_MPSAFE);
660 	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
661 	TASK_INIT(&sc->timer_reclaim_task, 0, sge_timer_reclaim, sc);
662 	TASK_INIT(&sc->slow_intr_task, 0, sge_slow_intr_handler, sc);
663 	return (0);
664 }
665 
666 void
667 t3_sge_deinit_sw(adapter_t *sc)
668 {
669 	callout_drain(&sc->sge_timer_ch);
670 	if (sc->tq) {
671 		taskqueue_drain(sc->tq, &sc->timer_reclaim_task);
672 		taskqueue_drain(sc->tq, &sc->slow_intr_task);
673 	}
674 }
675 
676 /**
677  *	refill_rspq - replenish an SGE response queue
678  *	@adapter: the adapter
679  *	@q: the response queue to replenish
680  *	@credits: how many new responses to make available
681  *
682  *	Replenishes a response queue by making the supplied number of responses
683  *	available to HW.
684  */
685 static __inline void
686 refill_rspq(adapter_t *sc, const struct sge_rspq *q, u_int credits)
687 {
688 
689 	/* mbufs are allocated on demand when a rspq entry is processed. */
690 	t3_write_reg(sc, A_SG_RSPQ_CREDIT_RETURN,
691 		     V_RSPQ(q->cntxt_id) | V_CREDITS(credits));
692 }
693 
694 
695 static void
696 sge_timer_reclaim(void *arg, int ncount)
697 {
698 	adapter_t *sc = arg;
699 	int i, j, nqsets = 0;
700 	struct sge_qset *qs;
701 	struct sge_txq *txq;
702 	struct mtx *lock;
703 	struct mbuf *m_vec[TX_CLEAN_MAX_DESC];
704 	int n, reclaimable;
705 	/*
706 	 * XXX assuming these quantities are allowed to change during operation
707 	 */
708 	for (i = 0; i < sc->params.nports; i++)
709 		nqsets += sc->port[i].nqsets;
710 
711 	for (i = 0; i < nqsets; i++) {
712 		qs = &sc->sge.qs[i];
713 		txq = &qs->txq[TXQ_ETH];
714 		reclaimable = desc_reclaimable(txq);
715 		if (reclaimable > 0) {
716 			mtx_lock(&txq->lock);
717 			n = reclaim_completed_tx(sc, txq, TX_CLEAN_MAX_DESC, m_vec);
718 			mtx_unlock(&txq->lock);
719 
720 			for (j = 0; j < n; j++) {
721 				m_freem_vec(m_vec[j]);
722 			}
723 		}
724 
725 		txq = &qs->txq[TXQ_OFLD];
726 		reclaimable = desc_reclaimable(txq);
727 		if (reclaimable > 0) {
728 			mtx_lock(&txq->lock);
729 			n = reclaim_completed_tx(sc, txq, TX_CLEAN_MAX_DESC, m_vec);
730 			mtx_unlock(&txq->lock);
731 
732 			for (j = 0; j < n; j++) {
733 				m_freem_vec(m_vec[j]);
734 			}
735 		}
736 
737 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
738 			    &sc->sge.qs[0].rspq.lock;
739 
740 		if (mtx_trylock(lock)) {
741 			/* XXX currently assume that we are *NOT* polling */
742 			uint32_t status = t3_read_reg(sc, A_SG_RSPQ_FL_STATUS);
743 
744 			if (qs->fl[0].credits < qs->fl[0].size - 16)
745 				__refill_fl(sc, &qs->fl[0]);
746 			if (qs->fl[1].credits < qs->fl[1].size - 16)
747 				__refill_fl(sc, &qs->fl[1]);
748 
749 			if (status & (1 << qs->rspq.cntxt_id)) {
750 				if (qs->rspq.credits) {
751 					refill_rspq(sc, &qs->rspq, 1);
752 					qs->rspq.credits--;
753 					t3_write_reg(sc, A_SG_RSPQ_FL_STATUS,
754 					    1 << qs->rspq.cntxt_id);
755 				}
756 			}
757 			mtx_unlock(lock);
758 		}
759 	}
760 }
761 
762 /**
763  *	init_qset_cntxt - initialize an SGE queue set context info
764  *	@qs: the queue set
765  *	@id: the queue set id
766  *
767  *	Initializes the TIDs and context ids for the queues of a queue set.
768  */
769 static void
770 init_qset_cntxt(struct sge_qset *qs, u_int id)
771 {
772 
773 	qs->rspq.cntxt_id = id;
774 	qs->fl[0].cntxt_id = 2 * id;
775 	qs->fl[1].cntxt_id = 2 * id + 1;
776 	qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id;
777 	qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id;
778 	qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id;
779 	qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id;
780 	qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id;
781 }
782 
783 
784 static void
785 txq_prod(struct sge_txq *txq, unsigned int ndesc, struct txq_state *txqs)
786 {
787 	txq->in_use += ndesc;
788 	/*
789 	 * XXX we don't handle stopping of queue
790 	 * presumably start handles this when we bump against the end
791 	 */
792 	txqs->gen = txq->gen;
793 	txq->unacked += ndesc;
794 	txqs->compl = (txq->unacked & 8) << (S_WR_COMPL - 3);
795 	txq->unacked &= 7;
796 	txqs->pidx = txq->pidx;
797 	txq->pidx += ndesc;
798 
799 	if (txq->pidx >= txq->size) {
800 		txq->pidx -= txq->size;
801 		txq->gen ^= 1;
802 	}
803 
804 }
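
/*
 * Completion-request bookkeeping: txq->unacked counts descriptors issued
 * since the last requested completion.  Whenever bit 3 of the running count
 * becomes set (roughly every 8 descriptors), (unacked & 8) << (S_WR_COMPL - 3)
 * yields the work request completion flag (assuming F_WR_COMPL is bit
 * S_WR_COMPL), and unacked &= 7 keeps only the low three bits.  For example,
 * unacked == 6 and ndesc == 3 gives unacked == 9, so compl is set and
 * unacked is reduced to 1.
 */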
805 
806 /**
807  *	calc_tx_descs - calculate the number of Tx descriptors for a packet
808  *	@m: the packet mbuf chain
809  *      @nsegs: the number of segments
810  *
811  * 	Returns the number of Tx descriptors needed for the given Ethernet
812  * 	packet.  Ethernet packets require addition of WR and CPL headers.
813  */
814 static __inline unsigned int
815 calc_tx_descs(const struct mbuf *m, int nsegs)
816 {
817 	unsigned int flits;
818 
819 	if (m->m_pkthdr.len <= WR_LEN - sizeof(struct cpl_tx_pkt))
820 		return 1;
821 
822 	flits = sgl_len(nsegs) + 2;
823 #ifdef TSO_SUPPORTED
824 	if  (m->m_pkthdr.csum_flags & (CSUM_TSO))
825 		flits++;
826 #endif
827 	return flits_to_desc(flits);
828 }
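
/*
 * Example: a packet short enough for immediate data
 * (pkthdr.len <= WR_LEN - sizeof(struct cpl_tx_pkt)) always takes one
 * descriptor.  Otherwise a 3-segment packet without TSO needs
 * sgl_len(3) + 2 == 7 flits, which flit_desc_map[] still maps to a single
 * descriptor in either generation-bit configuration.
 */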
829 
830 static unsigned int
831 busdma_map_mbufs(struct mbuf **m, struct sge_txq *txq,
832     struct tx_sw_desc *stx, bus_dma_segment_t *segs, int *nsegs)
833 {
834 	struct mbuf *m0;
835 	int err, pktlen;
836 
837 	m0 = *m;
838 	pktlen = m0->m_pkthdr.len;
839 
840 	err = bus_dmamap_load_mvec_sg(txq->entry_tag, stx->map, m0, segs, nsegs, 0);
841 #ifdef DEBUG
842 	if (err) {
843 		int n = 0;
844 		struct mbuf *mtmp = m0;
845 		while(mtmp) {
846 			n++;
847 			mtmp = mtmp->m_next;
848 		}
849 		printf("map_mbufs: bus_dmamap_load_mbuf_sg failed with %d - pkthdr.len==%d nmbufs=%d\n",
850 		    err, m0->m_pkthdr.len, n);
851 	}
852 #endif
853 	if (err == EFBIG) {
854 		/* Too many segments, try to defrag */
855 		m0 = m_defrag(m0, M_NOWAIT);
856 		if (m0 == NULL) {
857 			m_freem(*m);
858 			*m = NULL;
859 			return (ENOBUFS);
860 		}
861 		*m = m0;
862 		err = bus_dmamap_load_mbuf_sg(txq->entry_tag, stx->map, m0, segs, nsegs, 0);
863 	}
864 
865 	if (err == ENOMEM) {
866 		return (err);
867 	}
868 
869 	if (err) {
870 		if (cxgb_debug)
871 			printf("map failure err=%d pktlen=%d\n", err, pktlen);
872 		m_freem_vec(m0);
873 		*m = NULL;
874 		return (err);
875 	}
876 
877 	bus_dmamap_sync(txq->entry_tag, stx->map, BUS_DMASYNC_PREWRITE);
878 	stx->flags |= TX_SW_DESC_MAPPED;
879 
880 	return (0);
881 }
882 
883 /**
884  *	make_sgl - populate a scatter/gather list for a packet
885  *	@sgp: the SGL to populate
886  *	@segs: the packet dma segments
887  *	@nsegs: the number of segments
888  *
889  *	Generates a scatter/gather list for the buffers that make up a packet.
890  *	The SGL size in 8-byte words is given by sgl_len(@nsegs); the caller
891  *	must size the SGL appropriately.
892  */
893 static __inline void
894 make_sgl(struct sg_ent *sgp, bus_dma_segment_t *segs, int nsegs)
895 {
896 	int i, idx;
897 
898 	for (idx = 0, i = 0; i < nsegs; i++, idx ^= 1) {
899 		if (i && idx == 0)
900 			++sgp;
901 
902 		sgp->len[idx] = htobe32(segs[i].ds_len);
903 		sgp->addr[idx] = htobe64(segs[i].ds_addr);
904 	}
905 
906 	if (idx)
907 		sgp->len[idx] = 0;
908 }
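
/*
 * Example layout: with nsegs == 3, sgl[0] holds the length/address pairs of
 * segments 0 and 1, sgl[1] holds segment 2 in its first slot, and the final
 * "if (idx)" zeroes sgl[1].len[1] to terminate the list.  An even segment
 * count needs no terminator.
 */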
909 
910 /**
911  *	check_ring_tx_db - check and potentially ring a Tx queue's doorbell
912  *	@adap: the adapter
913  *	@q: the Tx queue
914  *
915  *	Ring the doorbell if a Tx queue is asleep.  There is a natural race
916  *	where the HW may go to sleep just after we checked; in that case
917  *	the interrupt handler will detect the outstanding TX packet
918  *	and ring the doorbell for us.
919  *
920  *	When GTS is disabled we unconditionally ring the doorbell.
921  */
922 static __inline void
923 check_ring_tx_db(adapter_t *adap, struct sge_txq *q)
924 {
925 #if USE_GTS
926 	clear_bit(TXQ_LAST_PKT_DB, &q->flags);
927 	if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) {
928 		set_bit(TXQ_LAST_PKT_DB, &q->flags);
929 #ifdef T3_TRACE
930 		T3_TRACE1(adap->tb[q->cntxt_id & 7], "doorbell Tx, cntxt %d",
931 			  q->cntxt_id);
932 #endif
933 		t3_write_reg(adap, A_SG_KDOORBELL,
934 			     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
935 	}
936 #else
937 	wmb();            /* write descriptors before telling HW */
938 	t3_write_reg(adap, A_SG_KDOORBELL,
939 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
940 #endif
941 }
942 
943 static __inline void
944 wr_gen2(struct tx_desc *d, unsigned int gen)
945 {
946 #if SGE_NUM_GENBITS == 2
947 	d->flit[TX_DESC_FLITS - 1] = htobe64(gen);
948 #endif
949 }
950 
951 /* sizeof(*eh) + sizeof(*vhdr) + sizeof(*ip) + sizeof(*tcp) */
952 #define TCPPKTHDRSIZE (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + 20 + 20)
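/*
 * That is ETHER_HDR_LEN (14) + ETHER_VLAN_ENCAP_LEN (4) + a minimal IP
 * header (20) + a minimal TCP header (20) == 58 bytes, so the on-stack
 * copy made in t3_encap() for short first mbufs stays small.
 */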
953 
954 int
955 t3_encap(struct port_info *p, struct mbuf **m)
956 {
957 	adapter_t *sc;
958 	struct mbuf *m0;
959 	struct sge_qset *qs;
960 	struct sge_txq *txq;
961 	struct tx_sw_desc *stx;
962 	struct txq_state txqs;
963 	unsigned int nsegs, ndesc, flits, cntrl, mlen;
964 	int err, tso_info = 0;
965 
966 	struct work_request_hdr *wrp;
967 	struct tx_sw_desc *txsd;
968 	struct sg_ent *sgp, sgl[TX_MAX_SEGS / 2 + 1];
969 	bus_dma_segment_t segs[TX_MAX_SEGS];
970 	uint32_t wr_hi, wr_lo, sgl_flits;
971 
972 	struct tx_desc *txd;
973 	struct cpl_tx_pkt *cpl;
974 
975 	DPRINTF("t3_encap ");
976 	m0 = *m;
977 	sc = p->adapter;
978 	qs = &sc->sge.qs[p->first_qset];
979 	txq = &qs->txq[TXQ_ETH];
980 	stx = &txq->sdesc[txq->pidx];
981 	txd = &txq->desc[txq->pidx];
982 	cpl = (struct cpl_tx_pkt *)txd;
983 	mlen = m0->m_pkthdr.len;
984 	cpl->len = htonl(mlen | 0x80000000);
985 
986 	DPRINTF("mlen=%d\n", mlen);
987 	/*
988 	 * XXX handle checksum, TSO, and VLAN here
989 	 *
990 	 */
991 	cntrl = V_TXPKT_INTF(p->port);
992 
993 	/*
994 	 * XXX need to add VLAN support for 6.x
995 	 */
996 #ifdef VLAN_SUPPORTED
997 	if (m0->m_flags & M_VLANTAG)
998 		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
999 	if  (m0->m_pkthdr.csum_flags & (CSUM_TSO))
1000 		tso_info = V_LSO_MSS(m0->m_pkthdr.tso_segsz);
1001 #endif
1002 	if (tso_info) {
1003 		int eth_type;
1004 		struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *) cpl;
1005 		struct ip *ip;
1006 		struct tcphdr *tcp;
1007 		uint8_t *pkthdr, tmp[TCPPKTHDRSIZE]; /* is this too large for the stack? */
1008 
1009 		txd->flit[2] = 0;
1010 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO);
1011 		hdr->cntrl = htonl(cntrl);
1012 
1013 		if (__predict_false(m0->m_len < TCPPKTHDRSIZE)) {
1014 			pkthdr = &tmp[0];
1015 			m_copydata(m0, 0, TCPPKTHDRSIZE, pkthdr);
1016 		} else {
1017 			pkthdr = m0->m_data;
1018 		}
1019 
1020 		if (__predict_false(m0->m_flags & M_VLANTAG)) {
1021 			eth_type = CPL_ETH_II_VLAN;
1022 			ip = (struct ip *)(pkthdr + ETHER_HDR_LEN +
1023 			    ETHER_VLAN_ENCAP_LEN);
1024 		} else {
1025 			eth_type = CPL_ETH_II;
1026 			ip = (struct ip *)(pkthdr + ETHER_HDR_LEN);
1027 		}
1028 		tcp = (struct tcphdr *)((uint8_t *)ip +
1029 		    sizeof(*ip));
1030 
1031 		tso_info |= V_LSO_ETH_TYPE(eth_type) |
1032 			    V_LSO_IPHDR_WORDS(ip->ip_hl) |
1033 			    V_LSO_TCPHDR_WORDS(tcp->th_off);
1034 		hdr->lso_info = htonl(tso_info);
1035 		flits = 3;
1036 	} else {
1037 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1038 		cpl->cntrl = htonl(cntrl);
1039 
1040 		if (mlen <= WR_LEN - sizeof(*cpl)) {
1041 			txq_prod(txq, 1, &txqs);
1042 			txq->sdesc[txqs.pidx].m = m0;
1043 
1044 			if (m0->m_len == m0->m_pkthdr.len)
1045 				memcpy(&txd->flit[2], m0->m_data, mlen);
1046 			else
1047 				m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[2]);
1048 
1049 			flits = (mlen + 7) / 8 + 2;
1050 			cpl->wr.wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1051 					  V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1052 					  F_WR_SOP | F_WR_EOP | txqs.compl);
1053 			wmb();
1054 			cpl->wr.wr_lo = htonl(V_WR_LEN(flits) |
1055 			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1056 
1057 			wr_gen2(txd, txqs.gen);
1058 			check_ring_tx_db(sc, txq);
1059 			return (0);
1060 		}
1061 		flits = 2;
1062 	}
1063 
1064 	wrp = (struct work_request_hdr *)txd;
1065 
1066 	if ((err = busdma_map_mbufs(m, txq, stx, segs, &nsegs)) != 0) {
1067 		return (err);
1068 	}
1069 	m0 = *m;
1070 	ndesc = calc_tx_descs(m0, nsegs);
1071 
1072 	sgp = (ndesc == 1) ? (struct sg_ent *)&txd->flit[flits] : &sgl[0];
1073 	make_sgl(sgp, segs, nsegs);
1074 
1075 	sgl_flits = sgl_len(nsegs);
1076 
1077 	DPRINTF("make_sgl success nsegs==%d ndesc==%d\n", nsegs, ndesc);
1078 	txq_prod(txq, ndesc, &txqs);
1079 	txsd = &txq->sdesc[txqs.pidx];
1080 	wr_hi = htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1081 	wr_lo = htonl(V_WR_TID(txq->token));
1082 	txsd->m = m0;
1083 
1084 	if (__predict_true(ndesc == 1)) {
1085 		wrp->wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1086 		    V_WR_SGLSFLT(flits)) | wr_hi;
1087 		wmb();
1088 		wrp->wr_lo = htonl(V_WR_LEN(flits + sgl_flits) |
1089 		    V_WR_GEN(txqs.gen)) | wr_lo;
1090 		/* XXX gen? */
1091 		wr_gen2(txd, txqs.gen);
1092 	} else {
1093 		unsigned int ogen = txqs.gen;
1094 		const uint64_t *fp = (const uint64_t *)sgl;
1095 		struct work_request_hdr *wp = wrp;
1096 
1097 		/* XXX - CHECK ME */
1098 		wrp->wr_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) |
1099 		    V_WR_SGLSFLT(flits)) | wr_hi;
1100 
1101 		while (sgl_flits) {
1102 			unsigned int avail = WR_FLITS - flits;
1103 
1104 			if (avail > sgl_flits)
1105 				avail = sgl_flits;
1106 			memcpy(&txd->flit[flits], fp, avail * sizeof(*fp));
1107 			sgl_flits -= avail;
1108 			ndesc--;
1109 			if (!sgl_flits)
1110 				break;
1111 
1112 			fp += avail;
1113 			txd++;
1114 			txsd++;
1115 			if (++txqs.pidx == txq->size) {
1116 				txqs.pidx = 0;
1117 				txqs.gen ^= 1;
1118 				txd = txq->desc;
1119 				txsd = txq->sdesc;
1120 			}
1121 
1122 			/*
1123 			 * when the head of the mbuf chain
1124 			 * is freed all clusters will be freed
1125 			 * with it
1126 			 */
1127 			txsd->m = NULL;
1128 			wrp = (struct work_request_hdr *)txd;
1129 			wrp->wr_hi = htonl(V_WR_DATATYPE(1) |
1130 			    V_WR_SGLSFLT(1)) | wr_hi;
1131 			wrp->wr_lo = htonl(V_WR_LEN(min(WR_FLITS,
1132 				    sgl_flits + 1)) |
1133 			    V_WR_GEN(txqs.gen)) | wr_lo;
1134 			wr_gen2(txd, txqs.gen);
1135 			flits = 1;
1136 		}
1137 #ifdef WHY
1138 		skb->priority = pidx;
1139 #endif
1140 		wrp->wr_hi |= htonl(F_WR_EOP);
1141 		wmb();
1142 		wp->wr_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo;
1143 		wr_gen2((struct tx_desc *)wp, ogen);
1144 	}
1145 	check_ring_tx_db(p->adapter, txq);
1146 
1147 	return (0);
1148 }
1149 
1150 
1151 /**
1152  *	write_imm - write a packet into a Tx descriptor as immediate data
1153  *	@d: the Tx descriptor to write
1154  *	@m: the packet
1155  *	@len: the length of packet data to write as immediate data
1156  *	@gen: the generation bit value to write
1157  *
1158  *	Writes a packet as immediate data into a Tx descriptor.  The packet
1159  *	contains a work request at its beginning.  We must write the packet
1160  *	carefully so the SGE doesn't read it before it has been written in
1161  *	its entirety.
1162  */
1163 static __inline void write_imm(struct tx_desc *d, struct mbuf *m,
1164 			     unsigned int len, unsigned int gen)
1165 {
1166 	struct work_request_hdr *from = (struct work_request_hdr *)m->m_data;
1167 	struct work_request_hdr *to = (struct work_request_hdr *)d;
1168 
1169 	memcpy(&to[1], &from[1], len - sizeof(*from));
1170 	to->wr_hi = from->wr_hi | htonl(F_WR_SOP | F_WR_EOP |
1171 					V_WR_BCNTLFLT(len & 7));
1172 	wmb();
1173 	to->wr_lo = from->wr_lo | htonl(V_WR_GEN(gen) |
1174 					V_WR_LEN((len + 7) / 8));
1175 	wr_gen2(d, gen);
1176 	m_freem(m);
1177 }
1178 
1179 /**
1180  *	check_desc_avail - check descriptor availability on a send queue
1181  *	@adap: the adapter
1182  *	@q: the TX queue
1183  *	@m: the packet needing the descriptors
1184  *	@ndesc: the number of Tx descriptors needed
1185  *	@qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL)
1186  *
1187  *	Checks if the requested number of Tx descriptors is available on an
1188  *	SGE send queue.  If the queue is already suspended or not enough
1189  *	descriptors are available the packet is queued for later transmission.
1190  *	Must be called with the Tx queue locked.
1191  *
1192  *	Returns 0 if enough descriptors are available, 1 if there aren't
1193  *	enough descriptors and the packet has been queued, and 2 if the caller
1194  *	needs to retry because there weren't enough descriptors at the
1195  *	beginning of the call but some freed up in the mean time.
1196  */
1197 static __inline int
1198 check_desc_avail(adapter_t *adap, struct sge_txq *q,
1199 				   struct mbuf *m, unsigned int ndesc,
1200 				   unsigned int qid)
1201 {
1202 	/*
1203 	 * XXX We currently only use this for checking the control queue;
1204 	 * the control queue is only used for binding qsets, which happens
1205 	 * at init time, so we are guaranteed enough descriptors.
1206 	 */
1207 #if 0
1208 	if (__predict_false(!skb_queue_empty(&q->sendq))) {
1209 addq_exit:	__skb_queue_tail(&q->sendq, skb);
1210 		return 1;
1211 	}
1212 	if (__predict_false(q->size - q->in_use < ndesc)) {
1213 
1214 		struct sge_qset *qs = txq_to_qset(q, qid);
1215 
1216 		set_bit(qid, &qs->txq_stopped);
1217 		smp_mb__after_clear_bit();
1218 
1219 		if (should_restart_tx(q) &&
1220 		    test_and_clear_bit(qid, &qs->txq_stopped))
1221 			return 2;
1222 
1223 		q->stops++;
1224 		goto addq_exit;
1225 	}
1226 #endif
1227 	return 0;
1228 }
1229 
1230 
1231 /**
1232  *	reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
1233  *	@q: the SGE control Tx queue
1234  *
1235  *	This is a variant of reclaim_completed_tx() that is used for Tx queues
1236  *	that send only immediate data (presently just the control queues) and
1237  *	thus do not have any mbufs to release.
1238  */
1239 static __inline void
1240 reclaim_completed_tx_imm(struct sge_txq *q)
1241 {
1242 	unsigned int reclaim = q->processed - q->cleaned;
1243 
1244 	mtx_assert(&q->lock, MA_OWNED);
1245 
1246 	q->in_use -= reclaim;
1247 	q->cleaned += reclaim;
1248 }
1249 
1250 static __inline int
1251 immediate(const struct mbuf *m)
1252 {
1253 	return (m->m_len <= WR_LEN && m->m_pkthdr.len <= WR_LEN);
1254 }
1255 
1256 /**
1257  *	ctrl_xmit - send a packet through an SGE control Tx queue
1258  *	@adap: the adapter
1259  *	@q: the control queue
1260  *	@m: the packet
1261  *
1262  *	Send a packet through an SGE control Tx queue.  Packets sent through
1263  *	a control queue must fit entirely as immediate data in a single Tx
1264  *	descriptor and have no page fragments.
1265  */
1266 static int
1267 ctrl_xmit(adapter_t *adap, struct sge_txq *q, struct mbuf *m)
1268 {
1269 	int ret;
1270 	struct work_request_hdr *wrp = (struct work_request_hdr *)m->m_data;
1271 
1272 	if (__predict_false(!immediate(m))) {
1273 		m_freem(m);
1274 		return 0;
1275 	}
1276 
1277 	wrp->wr_hi |= htonl(F_WR_SOP | F_WR_EOP);
1278 	wrp->wr_lo = htonl(V_WR_TID(q->token));
1279 
1280 	mtx_lock(&q->lock);
1281 again:	reclaim_completed_tx_imm(q);
1282 
1283 	ret = check_desc_avail(adap, q, m, 1, TXQ_CTRL);
1284 	if (__predict_false(ret)) {
1285 		if (ret == 1) {
1286 			mtx_unlock(&q->lock);
1287 			return (-1);
1288 		}
1289 		goto again;
1290 	}
1291 
1292 	write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
1293 
1294 	q->in_use++;
1295 	if (++q->pidx >= q->size) {
1296 		q->pidx = 0;
1297 		q->gen ^= 1;
1298 	}
1299 	mtx_unlock(&q->lock);
1300 	wmb();
1301 	t3_write_reg(adap, A_SG_KDOORBELL,
1302 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1303 	return (0);
1304 }
1305 
1306 #ifdef RESTART_CTRLQ
1307 /**
1308  *	restart_ctrlq - restart a suspended control queue
1309  *	@qs: the queue set containing the control queue
1310  *
1311  *	Resumes transmission on a suspended Tx control queue.
1312  */
1313 static void
1314 restart_ctrlq(unsigned long data)
1315 {
1316 	struct mbuf *m;
1317 	struct sge_qset *qs = (struct sge_qset *)data;
1318 	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1319 	adapter_t *adap = qs->port->adapter;
1320 
1321 	mtx_lock(&q->lock);
1322 again:	reclaim_completed_tx_imm(q);
1323 
1324 	while (q->in_use < q->size &&
1325 	       (skb = __skb_dequeue(&q->sendq)) != NULL) {
1326 
1327 		write_imm(&q->desc[q->pidx], skb, skb->len, q->gen);
1328 
1329 		if (++q->pidx >= q->size) {
1330 			q->pidx = 0;
1331 			q->gen ^= 1;
1332 		}
1333 		q->in_use++;
1334 	}
1335 	if (!skb_queue_empty(&q->sendq)) {
1336 		set_bit(TXQ_CTRL, &qs->txq_stopped);
1337 		smp_mb__after_clear_bit();
1338 
1339 		if (should_restart_tx(q) &&
1340 		    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped))
1341 			goto again;
1342 		q->stops++;
1343 	}
1344 
1345 	mtx_unlock(&q->lock);
1346 	t3_write_reg(adap, A_SG_KDOORBELL,
1347 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1348 }
1349 #endif
1350 
1351 /*
1352  * Send a management message through control queue 0
1353  */
1354 int
1355 t3_mgmt_tx(struct adapter *adap, struct mbuf *m)
1356 {
1357 	return ctrl_xmit(adap, &adap->sge.qs[0].txq[TXQ_CTRL], m);
1358 }
1359 
1360 /**
1361  *	t3_sge_alloc_qset - initialize an SGE queue set
1362  *	@sc: the controller softc
1363  *	@id: the queue set id
1364  *	@nports: how many Ethernet ports will be using this queue set
1365  *	@irq_vec_idx: the IRQ vector index for response queue interrupts
1366  *	@p: configuration parameters for this queue set
1367  *	@ntxq: number of Tx queues for the queue set
1368  *	@pi: port info for queue set
1369  *
1370  *	Allocate resources and initialize an SGE queue set.  A queue set
1371  *	comprises a response queue, two Rx free-buffer queues, and up to 3
1372  *	Tx queues.  The Tx queues are assigned roles in the order Ethernet
1373  *	queue, offload queue, and control queue.
1374  */
1375 int
1376 t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx,
1377 		  const struct qset_params *p, int ntxq, struct port_info *pi)
1378 {
1379 	struct sge_qset *q = &sc->sge.qs[id];
1380 	int i, ret = 0;
1381 
1382 	init_qset_cntxt(q, id);
1383 
1384 	if ((ret = alloc_ring(sc, p->fl_size, sizeof(struct rx_desc),
1385 		    sizeof(struct rx_sw_desc), &q->fl[0].phys_addr,
1386 		    &q->fl[0].desc, &q->fl[0].sdesc,
1387 		    &q->fl[0].desc_tag, &q->fl[0].desc_map,
1388 		    sc->rx_dmat, &q->fl[0].entry_tag)) != 0) {
1389 		printf("error %d from alloc ring fl0\n", ret);
1390 		goto err;
1391 	}
1392 
1393 	if ((ret = alloc_ring(sc, p->jumbo_size, sizeof(struct rx_desc),
1394 		    sizeof(struct rx_sw_desc), &q->fl[1].phys_addr,
1395 		    &q->fl[1].desc, &q->fl[1].sdesc,
1396 		    &q->fl[1].desc_tag, &q->fl[1].desc_map,
1397 		    sc->rx_jumbo_dmat, &q->fl[1].entry_tag)) != 0) {
1398 		printf("error %d from alloc ring fl1\n", ret);
1399 		goto err;
1400 	}
1401 
1402 	if ((ret = alloc_ring(sc, p->rspq_size, sizeof(struct rsp_desc), 0,
1403 		    &q->rspq.phys_addr, &q->rspq.desc, NULL,
1404 		    &q->rspq.desc_tag, &q->rspq.desc_map,
1405 		    NULL, NULL)) != 0) {
1406 		printf("error %d from alloc ring rspq\n", ret);
1407 		goto err;
1408 	}
1409 
1410 	for (i = 0; i < ntxq; ++i) {
1411 		/*
1412 		 * The control queue always uses immediate data, so it does not
1413 		 * need to keep track of any mbufs.
1414 		 * XXX Placeholder for future TOE support.
1415 		 */
1416 		size_t sz = i == TXQ_CTRL ? 0 : sizeof(struct tx_sw_desc);
1417 
1418 		if ((ret = alloc_ring(sc, p->txq_size[i],
1419 			    sizeof(struct tx_desc), sz,
1420 			    &q->txq[i].phys_addr, &q->txq[i].desc,
1421 			    &q->txq[i].sdesc, &q->txq[i].desc_tag,
1422 			    &q->txq[i].desc_map,
1423 			    sc->tx_dmat, &q->txq[i].entry_tag)) != 0) {
1424 			printf("error %d from alloc ring tx %i\n", ret, i);
1425 			goto err;
1426 		}
1427 		q->txq[i].gen = 1;
1428 		q->txq[i].size = p->txq_size[i];
1429 		mtx_init(&q->txq[i].lock, "t3 txq lock", NULL, MTX_DEF);
1430 	}
1431 
1432 	q->fl[0].gen = q->fl[1].gen = 1;
1433 	q->fl[0].size = p->fl_size;
1434 	q->fl[1].size = p->jumbo_size;
1435 
1436 	q->rspq.gen = 1;
1437 	q->rspq.size = p->rspq_size;
1438 	mtx_init(&q->rspq.lock, "t3 rspq lock", NULL, MTX_DEF);
1439 
1440 	q->txq[TXQ_ETH].stop_thres = nports *
1441 	    flits_to_desc(sgl_len(TX_MAX_SEGS + 1) + 3);
1442 
1443 	q->fl[0].buf_size = MCLBYTES;
1444 	q->fl[0].zone = zone_clust;
1445 	q->fl[0].type = EXT_CLUSTER;
1446 	q->fl[1].buf_size = MJUMPAGESIZE;
1447 	q->fl[1].zone = zone_jumbop;
1448 	q->fl[1].type = EXT_JUMBOP;
1449 
1450 	q->lro.enabled = lro_default;
1451 
1452 	mtx_lock(&sc->sge.reg_lock);
1453 	ret = -t3_sge_init_rspcntxt(sc, q->rspq.cntxt_id, irq_vec_idx,
1454 				   q->rspq.phys_addr, q->rspq.size,
1455 				   q->fl[0].buf_size, 1, 0);
1456 	if (ret) {
1457 		printf("error %d from t3_sge_init_rspcntxt\n", ret);
1458 		goto err_unlock;
1459 	}
1460 
1461 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
1462 		ret = -t3_sge_init_flcntxt(sc, q->fl[i].cntxt_id, 0,
1463 					  q->fl[i].phys_addr, q->fl[i].size,
1464 					  q->fl[i].buf_size, p->cong_thres, 1,
1465 					  0);
1466 		if (ret) {
1467 			printf("error %d from t3_sge_init_flcntxt for index i=%d\n", ret, i);
1468 			goto err_unlock;
1469 		}
1470 	}
1471 
1472 	ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_ETH].cntxt_id, USE_GTS,
1473 				 SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr,
1474 				 q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token,
1475 				 1, 0);
1476 	if (ret) {
1477 		printf("error %d from t3_sge_init_ecntxt\n", ret);
1478 		goto err_unlock;
1479 	}
1480 
1481 	if (ntxq > 1) {
1482 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_OFLD].cntxt_id,
1483 					 USE_GTS, SGE_CNTXT_OFLD, id,
1484 					 q->txq[TXQ_OFLD].phys_addr,
1485 					 q->txq[TXQ_OFLD].size, 0, 1, 0);
1486 		if (ret) {
1487 			printf("error %d from t3_sge_init_ecntxt\n", ret);
1488 			goto err_unlock;
1489 		}
1490 	}
1491 
1492 	if (ntxq > 2) {
1493 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_CTRL].cntxt_id, 0,
1494 					 SGE_CNTXT_CTRL, id,
1495 					 q->txq[TXQ_CTRL].phys_addr,
1496 					 q->txq[TXQ_CTRL].size,
1497 					 q->txq[TXQ_CTRL].token, 1, 0);
1498 		if (ret) {
1499 			printf("error %d from t3_sge_init_ecntxt\n", ret);
1500 			goto err_unlock;
1501 		}
1502 	}
1503 
1504 	mtx_unlock(&sc->sge.reg_lock);
1505 	t3_update_qset_coalesce(q, p);
1506 	q->port = pi;
1507 
1508 	refill_fl(sc, &q->fl[0], q->fl[0].size);
1509 	refill_fl(sc, &q->fl[1], q->fl[1].size);
1510 	refill_rspq(sc, &q->rspq, q->rspq.size - 1);
1511 
1512 	t3_write_reg(sc, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) |
1513 		     V_NEWTIMER(q->rspq.holdoff_tmr));
1514 
1515 	return (0);
1516 
1517 err_unlock:
1518 	mtx_unlock(&sc->sge.reg_lock);
1519 err:
1520 	t3_free_qset(sc, q);
1521 
1522 	return (ret);
1523 }
1524 
1525 
1526 /**
1527  *	t3_free_qset - free the resources of an SGE queue set
1528  *	@sc: the controller owning the queue set
1529  *	@q: the queue set
1530  *
1531  *	Release the HW and SW resources associated with an SGE queue set, such
1532  *	as HW contexts, packet buffers, and descriptor rings.  Traffic to the
1533  *	queue set must be quiesced prior to calling this.
1534  */
1535 static void
1536 t3_free_qset(adapter_t *sc, struct sge_qset *q)
1537 {
1538 	int i;
1539 
1540 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
1541 		if (q->fl[i].desc) {
1542 			mtx_lock(&sc->sge.reg_lock);
1543 			t3_sge_disable_fl(sc, q->fl[i].cntxt_id);
1544 			mtx_unlock(&sc->sge.reg_lock);
1545 			bus_dmamap_unload(q->fl[i].desc_tag, q->fl[i].desc_map);
1546 			bus_dmamem_free(q->fl[i].desc_tag, q->fl[i].desc,
1547 					q->fl[i].desc_map);
1548 			bus_dma_tag_destroy(q->fl[i].desc_tag);
1549 			bus_dma_tag_destroy(q->fl[i].entry_tag);
1550 		}
1551 		if (q->fl[i].sdesc) {
1552 			free_rx_bufs(sc, &q->fl[i]);
1553 			free(q->fl[i].sdesc, M_DEVBUF);
1554 		}
1555 	}
1556 
1557 	for (i = 0; i < SGE_TXQ_PER_SET; ++i) {
1558 		if (q->txq[i].desc) {
1559 			mtx_lock(&sc->sge.reg_lock);
1560 			t3_sge_enable_ecntxt(sc, q->txq[i].cntxt_id, 0);
1561 			mtx_unlock(&sc->sge.reg_lock);
1562 			bus_dmamap_unload(q->txq[i].desc_tag,
1563 					q->txq[i].desc_map);
1564 			bus_dmamem_free(q->txq[i].desc_tag, q->txq[i].desc,
1565 					q->txq[i].desc_map);
1566 			bus_dma_tag_destroy(q->txq[i].desc_tag);
1567 			bus_dma_tag_destroy(q->txq[i].entry_tag);
1568 		}
1569 		if (q->txq[i].sdesc) {
1570 			free(q->txq[i].sdesc, M_DEVBUF);
1571 		}
1572 		if (mtx_initialized(&q->txq[i].lock)) {
1573 			mtx_destroy(&q->txq[i].lock);
1574 		}
1575 	}
1576 
1577 	if (q->rspq.desc) {
1578 		mtx_lock(&sc->sge.reg_lock);
1579 		t3_sge_disable_rspcntxt(sc, q->rspq.cntxt_id);
1580 		mtx_unlock(&sc->sge.reg_lock);
1581 
1582 		bus_dmamap_unload(q->rspq.desc_tag, q->rspq.desc_map);
1583 		bus_dmamem_free(q->rspq.desc_tag, q->rspq.desc,
1584 			        q->rspq.desc_map);
1585 		bus_dma_tag_destroy(q->rspq.desc_tag);
1586 	}
1587 
1588 	if (mtx_initialized(&q->rspq.lock))
1589 		mtx_destroy(&q->rspq.lock);
1590 
1591 	bzero(q, sizeof(*q));
1592 }
1593 
1594 /**
1595  *	t3_free_sge_resources - free SGE resources
1596  *	@sc: the adapter softc
1597  *
1598  *	Frees resources used by the SGE queue sets.
1599  */
1600 void
1601 t3_free_sge_resources(adapter_t *sc)
1602 {
1603 	int i;
1604 
1605 	for (i = 0; i < SGE_QSETS; ++i)
1606 		t3_free_qset(sc, &sc->sge.qs[i]);
1607 }
1608 
1609 /**
1610  *	t3_sge_start - enable SGE
1611  *	@sc: the controller softc
1612  *
1613  *	Enables the SGE for DMAs.  This is the last step in starting packet
1614  *	transfers.
1615  */
1616 void
1617 t3_sge_start(adapter_t *sc)
1618 {
1619 	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE);
1620 }
1621 
1622 
1623 /**
1624  *	free_tx_desc - reclaims Tx descriptors and their buffers
1625  *	@sc: the adapter softc
1626  *	@q: the Tx queue to reclaim descriptors from
1627  *	@n: the number of descriptors to reclaim
1628  *
1629  *	Reclaims Tx descriptors from an SGE Tx queue and frees the associated
1630  *	Tx buffers.  Called with the Tx queue lock held.
1631  */
1632 int
1633 free_tx_desc(adapter_t *sc, struct sge_txq *q, int n, struct mbuf **m_vec)
1634 {
1635 	struct tx_sw_desc *d;
1636 	unsigned int cidx = q->cidx;
1637 	int nbufs = 0;
1638 
1639 #ifdef T3_TRACE
1640 	T3_TRACE2(sc->tb[q->cntxt_id & 7],
1641 		  "reclaiming %u Tx descriptors at cidx %u", n, cidx);
1642 #endif
1643 	d = &q->sdesc[cidx];
1644 
1645 	while (n-- > 0) {
1646 		DPRINTF("cidx=%d d=%p\n", cidx, d);
1647 		if (d->m) {
1648 			if (d->flags & TX_SW_DESC_MAPPED) {
1649 				bus_dmamap_unload(q->entry_tag, d->map);
1650 				bus_dmamap_destroy(q->entry_tag, d->map);
1651 				d->flags &= ~TX_SW_DESC_MAPPED;
1652 			}
1653 			m_vec[nbufs] = d->m;
1654 			d->m = NULL;
1655 			nbufs++;
1656 		}
1657 		++d;
1658 		if (++cidx == q->size) {
1659 			cidx = 0;
1660 			d = q->sdesc;
1661 		}
1662 	}
1663 	q->cidx = cidx;
1664 
1665 	return (nbufs);
1666 }
1667 
1668 /**
1669  *	is_new_response - check if a response is newly written
1670  *	@r: the response descriptor
1671  *	@q: the response queue
1672  *
1673  *	Returns true if a response descriptor contains a yet unprocessed
1674  *	response.
1675  */
1676 static __inline int
1677 is_new_response(const struct rsp_desc *r,
1678     const struct sge_rspq *q)
1679 {
1680 	return (r->intr_gen & F_RSPD_GEN2) == q->gen;
1681 }
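
/*
 * The response ring uses a generation bit: q->gen is toggled each time the
 * consumer index wraps, and the hardware writes the current generation into
 * every descriptor it produces.  A descriptor whose F_RSPD_GEN2 bit still
 * carries the previous generation is stale, not a new response.
 */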
1682 
1683 #define RSPD_GTS_MASK  (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS)
1684 #define RSPD_CTRL_MASK (RSPD_GTS_MASK | \
1685 			V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \
1686 			V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \
1687 			V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR))
1688 
1689 /* How long to delay the next interrupt in case of memory shortage, in 0.1us. */
1690 #define NOMEM_INTR_DELAY 2500
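/* 2500 ticks of 0.1 us each, i.e. a holdoff of roughly 250 us. */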
1691 
1692 static __inline void
1693 deliver_partial_bundle(struct t3cdev *tdev, struct sge_rspq *q)
1694 {
1695 	;
1696 }
1697 
1698 static __inline void
1699 rx_offload(struct t3cdev *tdev, struct sge_rspq *rq,
1700     struct mbuf *m)
1701 {
1702 #ifdef notyet
1703 	if (rq->polling) {
1704 		rq->offload_skbs[rq->offload_skbs_idx++] = skb;
1705 		if (rq->offload_skbs_idx == RX_BUNDLE_SIZE) {
1706 			cxgb_ofld_recv(tdev, rq->offload_skbs, RX_BUNDLE_SIZE);
1707 			rq->offload_skbs_idx = 0;
1708 			rq->offload_bundles++;
1709 		}
1710 	} else
1711 #endif
1712 	{
1713 		/* XXX */
1714 		panic("implement offload enqueue\n");
1715 	}
1716 
1717 }
1718 
1719 static void
1720 restart_tx(struct sge_qset *qs)
1721 {
1722 	;
1723 }
1724 
1725 void
1726 t3_rx_eth(struct port_info *pi, struct sge_rspq *rq, struct mbuf *m, int ethpad)
1727 {
1728 	struct cpl_rx_pkt *cpl = (struct cpl_rx_pkt *)(m->m_data + ethpad);
1729 	struct ifnet *ifp = pi->ifp;
1730 
1731 	DPRINTF("rx_eth m=%p m->m_data=%p p->iff=%d\n", m, m->m_data, cpl->iff);
1732 	if (&pi->adapter->port[cpl->iff] != pi)
1733 		panic("bad port index %d m->m_data=%p\n", cpl->iff, m->m_data);
1734 
1735 	if ((ifp->if_capenable & IFCAP_RXCSUM) && !cpl->fragment &&
1736 	    cpl->csum_valid && cpl->csum == 0xffff) {
1737 		m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED|CSUM_IP_VALID);
1738 		rspq_to_qset(rq)->port_stats[SGE_PSTAT_RX_CSUM_GOOD]++;
1739 		m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED|CSUM_IP_VALID|CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
1740 		m->m_pkthdr.csum_data = 0xffff;
1741 	}
1742 	/*
1743 	 * XXX need to add VLAN support for 6.x
1744 	 */
1745 #ifdef VLAN_SUPPORTED
1746 	if (__predict_false(cpl->vlan_valid)) {
1747 		m->m_pkthdr.ether_vtag = ntohs(cpl->vlan);
1748 		m->m_flags |= M_VLANTAG;
1749 	}
1750 #endif
1751 
1752 	m->m_pkthdr.rcvif = ifp;
1753 	m->m_pkthdr.header = m->m_data + sizeof(*cpl) + ethpad;
1754 	m_explode(m);
1755 	/*
1756 	 * adjust after conversion to mbuf chain
1757 	 */
1758 	m_adj(m, sizeof(*cpl) + ethpad);
1759 
1760 	(*ifp->if_input)(ifp, m);
1761 }
1762 
1763 /**
1764  *	get_packet - return the next ingress packet buffer from a free list
1765  *	@adap: the adapter that received the packet
1766  *	@drop_thres: # of remaining buffers before we start dropping packets
1767  *	@qs: the qset that the SGE free list holding the packet belongs to
1768  *      @m: the mbuf that will hold or reference the packet data
1769  *      @r: response descriptor
1770  *
1771  *	Get the next packet from a free list and complete setup of the
1772  *	sk_buff.  If the packet is small we make a copy and recycle the
1773  *	original buffer, otherwise we use the original buffer itself.  If a
1774  *	positive drop threshold is supplied packets are dropped and their
1775  *	buffers recycled if (a) the number of remaining buffers is under the
1776  *	threshold and the packet is too big to copy, or (b) the packet should
1777  *	be copied but there is no memory for the copy.
1778  */
1779 
1780 static int
1781 get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
1782     struct mbuf *m, struct rsp_desc *r)
1783 {
1784 
1785 	unsigned int len_cq =  ntohl(r->len_cq);
1786 	struct sge_fl *fl = (len_cq & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
1787 	struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];
1788 	uint32_t len = G_RSPD_LEN(len_cq);
1789 	uint32_t flags = ntohl(r->flags);
1790 	uint8_t sopeop = G_RSPD_SOP_EOP(flags);
1791 	int ret = 0;
1792 
1793 	prefetch(sd->cl);
1794 
1795 	fl->credits--;
1796 	bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD);
1797 	bus_dmamap_unload(fl->entry_tag, sd->map);
1798 
1799 
1800 	switch(sopeop) {
1801 	case RSPQ_SOP_EOP:
1802 		DBG(DBG_RX, ("get_packet: SOP-EOP m %p\n", m));
1803 		m_cljset(m, sd->cl, fl->type);
1804 		m->m_len = m->m_pkthdr.len = len;
1805 		ret = 1;
1806 		goto done;
1807 		break;
1808 	case RSPQ_NSOP_NEOP:
1809 		DBG(DBG_RX, ("get_packet: NO_SOP-NO_EOP m %p\n", m));
1810 		ret = 0;
1811 		break;
1812 	case RSPQ_SOP:
1813 		DBG(DBG_RX, ("get_packet: SOP m %p\n", m));
1814 		m_iovinit(m);
1815 		ret = 0;
1816 		break;
1817 	case RSPQ_EOP:
1818 		DBG(DBG_RX, ("get_packet: EOP m %p\n", m));
1819 		ret = 1;
1820 		break;
1821 	}
1822 	m_iovappend(m, sd->cl, fl->buf_size, len, 0);
1823 
1824 done:
1825 	if (++fl->cidx == fl->size)
1826 		fl->cidx = 0;
1827 
1828 	return (ret);
1829 }
1830 
1831 
1832 /**
1833  *	handle_rsp_cntrl_info - handles control information in a response
1834  *	@qs: the queue set corresponding to the response
1835  *	@flags: the response control flags
1836  *
1837  *	Handles the control information of an SGE response, such as GTS
1838  *	indications and completion credits for the queue set's Tx queues.
1839  *	HW coalesces credits; we don't do any extra SW coalescing.
1840  */
1841 static __inline void
1842 handle_rsp_cntrl_info(struct sge_qset *qs, uint32_t flags)
1843 {
1844 	unsigned int credits;
1845 
1846 #if USE_GTS
1847 	if (flags & F_RSPD_TXQ0_GTS)
1848 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags);
1849 #endif
1850 	credits = G_RSPD_TXQ0_CR(flags);
1851 	if (credits) {
1852 		qs->txq[TXQ_ETH].processed += credits;
1853 		if (desc_reclaimable(&qs->txq[TXQ_ETH]) > TX_START_MAX_DESC)
1854 			taskqueue_enqueue(qs->port->adapter->tq,
1855 			    &qs->port->adapter->timer_reclaim_task);
1856 	}
1857 
1858 	credits = G_RSPD_TXQ2_CR(flags);
1859 	if (credits)
1860 		qs->txq[TXQ_CTRL].processed += credits;
1861 
1862 # if USE_GTS
1863 	if (flags & F_RSPD_TXQ1_GTS)
1864 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags);
1865 # endif
1866 	credits = G_RSPD_TXQ1_CR(flags);
1867 	if (credits)
1868 		qs->txq[TXQ_OFLD].processed += credits;
1869 }
1870 
1871 static void
1872 check_ring_db(adapter_t *adap, struct sge_qset *qs,
1873     unsigned int sleeping)
1874 {
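	/*
	 * Nothing to do here: doorbell recovery for Tx queues that went to
	 * sleep (GTS) is not implemented in this driver.
	 */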
1875 	;
1876 }
1877 
1878 /*
1879  * This is an awful hack to bind the ithread to CPU 1
1880  * to work around lack of ithread affinity
1881  */
1882 static void
1883 bind_ithread(int cpu)
1884 {
1885 #if 0
1886 	KASSERT(cpu < mp_ncpus, ("invalid cpu identifier"));
1887 	if (mp_ncpus > 1) {
1888 		mtx_lock_spin(&sched_lock);
1889 		sched_bind(curthread, cpu);
1890 		mtx_unlock_spin(&sched_lock);
1891 	}
1892 #endif
1893 }
1894 
1895 /**
1896  *	process_responses - process responses from an SGE response queue
1897  *	@adap: the adapter
1898  *	@qs: the queue set to which the response queue belongs
1899  *	@budget: how many responses can be processed in this round
1900  *
1901  *	Process responses from an SGE response queue up to the supplied budget.
1902  *	Responses include received packets as well as credits and other events
1903  *	for the queues that belong to the response queue's queue set.
1904  *	A negative budget is effectively unlimited.
1905  *
1906  *	Additionally choose the interrupt holdoff time for the next interrupt
1907  *	on this queue.  If the system is under memory shortage use a fairly
1908  *	long delay to help recovery.
1909  */
1910 static int
1911 process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
1912 {
1913 	struct sge_rspq *rspq = &qs->rspq;
1914 	struct rsp_desc *r = &rspq->desc[rspq->cidx];
1915 	int budget_left = budget;
1916 	unsigned int sleeping = 0;
1917 	int lro = qs->lro.enabled;
1918 
1919 	static uint8_t pinned[MAXCPU];
1920 
1921 #ifdef DEBUG
1922 	static int last_holdoff = 0;
1923 	if (rspq->holdoff_tmr != last_holdoff) {
1924 		printf("next_holdoff=%d\n", rspq->holdoff_tmr);
1925 		last_holdoff = rspq->holdoff_tmr;
1926 	}
1927 #endif
1928 	if (pinned[qs->rspq.cntxt_id * adap->params.nports] == 0) {
1929 		/*
1930 		 * Assumes that cntxt_id < mp_ncpus
1931 		 */
1932 		bind_ithread(qs->rspq.cntxt_id);
1933 		pinned[qs->rspq.cntxt_id * adap->params.nports] = 1;
1934 	}
1935 	rspq->next_holdoff = rspq->holdoff_tmr;
1936 
1937 	while (__predict_true(budget_left && is_new_response(r, rspq))) {
1938 		int eth, eop = 0, ethpad = 0;
1939 		uint32_t flags = ntohl(r->flags);
1940 		uint32_t rss_csum = *(const uint32_t *)r;
1941 		uint32_t rss_hash = r->rss_hdr.rss_hash_val;
1942 
1943 		eth = (r->rss_hdr.opcode == CPL_RX_PKT);
1944 
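		/*
		 * A response is one of: an async notification, immediate
		 * data embedded in the descriptor, a reference to a
		 * free-list buffer, or a pure (credit/control only)
		 * response.
		 */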
1945 		if (__predict_false(flags & F_RSPD_ASYNC_NOTIF)) {
1946 			/* XXX */
1947 			printf("async notification\n");
1948 
1949 		} else if  (flags & F_RSPD_IMM_DATA_VALID) {
1950 			struct mbuf *m = NULL;
1951 			if (cxgb_debug)
1952 				printf("IMM DATA VALID\n");
1953 			if (rspq->m == NULL)
1954 				rspq->m = m_gethdr(M_NOWAIT, MT_DATA);
1955 			else
1956 				m = m_gethdr(M_NOWAIT, MT_DATA);
1957 
1958 			if (rspq->m == NULL || m == NULL) {
1959 				rspq->next_holdoff = NOMEM_INTR_DELAY;
1960 				budget_left--;
1961 				break;
1962 			}
1963 			get_imm_packet(adap, r, rspq->m, m);
1964 			eop = 1;
1965 			rspq->imm_data++;
1966 		} else if (r->len_cq) {
1967 			int drop_thresh = eth ? SGE_RX_DROP_THRES : 0;
1968 
1969 			if (rspq->m == NULL)
1970 				rspq->m = m_gethdr(M_NOWAIT, MT_DATA);
1971 			if (rspq->m == NULL) {
1972 				log(LOG_WARNING, "failed to get mbuf for packet\n");
1973 				break;
1974 			}
1975 
1976 			ethpad = 2;
1977 			eop = get_packet(adap, drop_thresh, qs, rspq->m, r);
1978 		} else {
1979 			DPRINTF("pure response\n");
1980 			rspq->pure_rsps++;
1981 		}
1982 
1983 		if (flags & RSPD_CTRL_MASK) {
1984 			sleeping |= flags & RSPD_GTS_MASK;
1985 			handle_rsp_cntrl_info(qs, flags);
1986 		}
1987 
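		/*
		 * Advance to the next response descriptor, wrapping the
		 * consumer index and flipping the generation bit at the end
		 * of the ring.
		 */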
1988 		r++;
1989 		if (__predict_false(++rspq->cidx == rspq->size)) {
1990 			rspq->cidx = 0;
1991 			rspq->gen ^= 1;
1992 			r = rspq->desc;
1993 		}
1994 
1995 		prefetch(r);
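		/*
		 * Return accumulated credits to the response queue once a
		 * quarter of the ring's worth of entries has been consumed.
		 */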
1996 		if (++rspq->credits >= (rspq->size / 4)) {
1997 			refill_rspq(adap, rspq, rspq->credits);
1998 			rspq->credits = 0;
1999 		}
2000 
2001 		if (eop) {
2002 			prefetch(rspq->m->m_data);
2003 			prefetch(rspq->m->m_data + L1_CACHE_BYTES);
2004 
2005 			if (eth) {
2006 				t3_rx_eth_lro(adap, rspq, rspq->m, ethpad,
2007 				    rss_hash, rss_csum, lro);
2008 
2009 				rspq->m = NULL;
2010 			} else {
2011 #ifdef notyet
2012 				if (__predict_false(r->rss_hdr.opcode == CPL_TRACE_PKT))
2013 					m_adj(m, 2);
2014 
2015 				rx_offload(&adap->tdev, rspq, m);
2016 #endif
2017 			}
2018 #ifdef notyet
2019 			taskqueue_enqueue(adap->tq, &adap->timer_reclaim_task);
2020 #else
2021 			__refill_fl(adap, &qs->fl[0]);
2022 			__refill_fl(adap, &qs->fl[1]);
2023 #endif
2024 		}
2025 		--budget_left;
2026 	}
2027 	t3_sge_lro_flush_all(adap, qs);
2028 	deliver_partial_bundle(&adap->tdev, rspq);
2029 
2030 	if (sleeping)
2031 		check_ring_db(adap, qs, sleeping);
2032 
2033 	smp_mb();  /* commit Tx queue processed updates */
2034 	if (__predict_false(qs->txq_stopped != 0))
2035 		restart_tx(qs);
2036 
2037 	budget -= budget_left;
2038 	return (budget);
2039 }
2040 
2041 /*
2042  * A helper function that processes responses and issues GTS.
2043  */
2044 static __inline int
2045 process_responses_gts(adapter_t *adap, struct sge_rspq *rq)
2046 {
2047 	int work;
2048 	static int last_holdoff = 0;
2049 
2050 	work = process_responses(adap, rspq_to_qset(rq), -1);
2051 
2052 	if (cxgb_debug && (rq->next_holdoff != last_holdoff)) {
2053 		printf("next_holdoff=%d\n", rq->next_holdoff);
2054 		last_holdoff = rq->next_holdoff;
2055 	}
2056 
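	/*
	 * Tell the HW the new consumer index and arm the holdoff timer for
	 * the next interrupt on this queue (GTS).
	 */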
2057 	t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) |
2058 		     V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx));
2059 	return (work);
2060 }
2061 
2062 
2063 /*
2064  * Interrupt handler for legacy INTx interrupts for T3B-based cards.
2065  * Handles data events from SGE response queues as well as error and other
2066  * async events as they all use the same interrupt pin.  We use one SGE
2067  * response queue per port in this mode and protect all response queues with
2068  * queue 0's lock.
2069  */
2070 void
2071 t3b_intr(void *data)
2072 {
2073 	uint32_t map;
2074 	adapter_t *adap = data;
2075 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
2076 	struct sge_rspq *q1 = &adap->sge.qs[1].rspq;
2077 
2078 	t3_write_reg(adap, A_PL_CLI, 0);
2079 	map = t3_read_reg(adap, A_SG_DATA_INTR);
2080 
2081 	if (!map)
2082 		return;
2083 
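	/*
	 * F_ERRINTR flags error/async events for the slow interrupt task;
	 * bits 0 and 1 indicate pending responses on queues 0 and 1.
	 */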
2084 	if (__predict_false(map & F_ERRINTR))
2085 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
2086 
2087 	mtx_lock(&q0->lock);
2088 
2089 	if (__predict_true(map & 1))
2090 		process_responses_gts(adap, q0);
2091 
2092 	if (map & 2)
2093 		process_responses_gts(adap, q1);
2094 
2095 	mtx_unlock(&q0->lock);
2096 }
2097 
2098 /*
2099  * The MSI interrupt handler.  This needs to handle data events from SGE
2100  * response queues as well as error and other async events as they all use
2101  * the same MSI vector.  We use one SGE response queue per port in this mode
2102  * and protect all response queues with queue 0's lock.
2103  */
2104 void
2105 t3_intr_msi(void *data)
2106 {
2107 	adapter_t *adap = data;
2108 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
2109 	struct sge_rspq *q1 = &adap->sge.qs[1].rspq;
2110 	int new_packets = 0;
2111 
2112 	mtx_lock(&q0->lock);
2113 	if (process_responses_gts(adap, q0)) {
2114 		new_packets = 1;
2115 	}
2116 
2117 	if (adap->params.nports == 2 &&
2118 	    process_responses_gts(adap, q1)) {
2119 		new_packets = 1;
2120 	}
2121 
2122 	mtx_unlock(&q0->lock);
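	/*
	 * No new packets on either queue: the interrupt was presumably for
	 * an error or other async event, so defer to the slow task.
	 */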
2123 	if (new_packets == 0)
2124 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
2125 }
2126 
2127 void
2128 t3_intr_msix(void *data)
2129 {
2130 	struct sge_qset *qs = data;
2131 	adapter_t *adap = qs->port->adapter;
2132 	struct sge_rspq *rspq = &qs->rspq;
2133 
2134 	mtx_lock(&rspq->lock);
2135 	if (process_responses_gts(adap, rspq) == 0) {
2136 #ifdef notyet
2137 		rspq->unhandled_irqs++;
2138 #endif
2139 	}
2140 	mtx_unlock(&rspq->lock);
2141 }
2142 
2143 /*
2144  * Broken by recent mbuf changes: a no-op unless LRO_WORKING is defined.
2145  */
2146 static int
2147 t3_lro_enable(SYSCTL_HANDLER_ARGS)
2148 {
2149 	adapter_t *sc;
2150 	int i, j, enabled, err, nqsets = 0;
2151 
2152 #ifndef LRO_WORKING
2153 	return (0);
2154 #endif
2155 
2156 	sc = arg1;
2157 	enabled = sc->sge.qs[0].lro.enabled;
2158 	err = sysctl_handle_int(oidp, &enabled, arg2, req);
2159 
2160 	if (err != 0) {
2161 		return (err);
2162 	}
2163 	if (enabled == sc->sge.qs[0].lro.enabled)
2164 		return (0);
2165 
2166 	for (i = 0; i < sc->params.nports; i++)
2167 		for (j = 0; j < sc->port[i].nqsets; j++)
2168 			nqsets++;
2169 
2170 	for (i = 0; i < nqsets; i++) {
2171 		sc->sge.qs[i].lro.enabled = enabled;
2172 	}
2173 
2174 	return (0);
2175 }
2176 
2177 static int
2178 t3_set_coalesce_nsecs(SYSCTL_HANDLER_ARGS)
2179 {
2180 	adapter_t *sc = arg1;
2181 	struct qset_params *qsp = &sc->params.sge.qset[0];
2182 	int coalesce_nsecs;
2183 	struct sge_qset *qs;
2184 	int i, j, err, nqsets = 0;
2185 	struct mtx *lock;
2186 
2187 	coalesce_nsecs = qsp->coalesce_nsecs;
2188 	err = sysctl_handle_int(oidp, &coalesce_nsecs, arg2, req);
2189 
2190 	if (err != 0) {
2191 		return (err);
2192 	}
2193 	if (coalesce_nsecs == qsp->coalesce_nsecs)
2194 		return (0);
2195 
2196 	for (i = 0; i < sc->params.nports; i++)
2197 		for (j = 0; j < sc->port[i].nqsets; j++)
2198 			nqsets++;
2199 
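	/* Enforce a 100ns floor, then apply the value to every queue set. */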
2200 	coalesce_nsecs = max(100, coalesce_nsecs);
2201 
2202 	for (i = 0; i < nqsets; i++) {
2203 		qs = &sc->sge.qs[i];
2204 		qsp = &sc->params.sge.qset[i];
2205 		qsp->coalesce_nsecs = coalesce_nsecs;
2206 
2207 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
2208 			    &sc->sge.qs[0].rspq.lock;
2209 
2210 		mtx_lock(lock);
2211 		t3_update_qset_coalesce(qs, qsp);
2212 		t3_write_reg(sc, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) |
2213 		    V_NEWTIMER(qs->rspq.holdoff_tmr));
2214 		mtx_unlock(lock);
2215 	}
2216 
2217 	return (0);
2218 }
2219 
2220 
2221 void
2222 t3_add_sysctls(adapter_t *sc)
2223 {
2224 	struct sysctl_ctx_list *ctx;
2225 	struct sysctl_oid_list *children;
2226 
2227 	ctx = device_get_sysctl_ctx(sc->dev);
2228 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
2229 
2230 	/* general adapter information */
2231 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
2232 	    "firmware_version",
2233 	    CTLFLAG_RD, &sc->fw_version,
2234 	    0, "firmware version");
2235 
2236 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
2237 	    "enable_lro",
2238 	    CTLTYPE_INT|CTLFLAG_RW, sc,
2239 	    0, t3_lro_enable,
2240 	    "I", "enable large receive offload");
2241 
2242 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
2243 	    "intr_coal",
2244 	    CTLTYPE_INT|CTLFLAG_RW, sc,
2245 	    0, t3_set_coalesce_nsecs,
2246 	    "I", "interrupt coalescing timer (ns)");
2247 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
2248 	    "enable_debug",
2249 	    CTLFLAG_RW, &cxgb_debug,
2250 	    0, "enable verbose debugging output");
2251 
2252 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
2253 	    "collapse_free",
2254 	    CTLFLAG_RD, &collapse_free,
2255 	    0, "frees during collapse");
2256 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
2257 	    "mb_free_vec_free",
2258 	    CTLFLAG_RD, &mb_free_vec_free,
2259 	    0, "frees during mb_free_vec");
2260 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
2261 	    "collapse_mbufs",
2262 	    CTLFLAG_RW, &collapse_mbufs,
2263 	    0, "collapse mbuf chains into iovecs");
2264 }
2265 
2266 /**
2267  *	t3_get_desc - dump an SGE descriptor for debugging purposes
2268  *	@qs: the queue set
2269  *	@qnum: identifies the specific queue (0..2: Tx, 3:response, 4..5: Rx)
2270  *	@idx: the descriptor index in the queue
2271  *	@data: where to dump the descriptor contents
2272  *
2273  *	Dumps the contents of a HW descriptor of an SGE queue.  Returns the
2274  *	size of the descriptor.
2275  */
2276 int
2277 t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx,
2278 		unsigned char *data)
2279 {
2280 	if (qnum >= 6)
2281 		return (EINVAL);
2282 
2283 	if (qnum < 3) {
2284 		if (!qs->txq[qnum].desc || idx >= qs->txq[qnum].size)
2285 			return (EINVAL);
2286 		memcpy(data, &qs->txq[qnum].desc[idx], sizeof(struct tx_desc));
2287 		return (sizeof(struct tx_desc));
2288 	}
2289 
2290 	if (qnum == 3) {
2291 		if (!qs->rspq.desc || idx >= qs->rspq.size)
2292 			return (EINVAL);
2293 		memcpy(data, &qs->rspq.desc[idx], sizeof(struct rsp_desc));
2294 		return (sizeof(struct rsp_desc));
2295 	}
2296 
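	/* Queue numbers 4 and 5 select free lists 0 and 1. */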
2297 	qnum -= 4;
2298 	if (!qs->fl[qnum].desc || idx >= qs->fl[qnum].size)
2299 		return (EINVAL);
2300 	memcpy(data, &qs->fl[qnum].desc[idx], sizeof(struct rx_desc));
2301 	return (sizeof(struct rx_desc));
2302 }
2303