xref: /freebsd/sys/dev/cxgb/cxgb_sge.c (revision 78007886c995898a9494648343e5236bca1cbba3)
1 /**************************************************************************
2 
3 Copyright (c) 2007, Chelsio Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Redistributions in binary form must reproduce the above copyright
13     notice, this list of conditions and the following disclaimer in the
14     documentation and/or other materials provided with the distribution.
15 
16  3. Neither the name of the Chelsio Corporation nor the names of its
17     contributors may be used to endorse or promote products derived from
18     this software without specific prior written permission.
19 
20 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
24 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 POSSIBILITY OF SUCH DAMAGE.
31 
32 ***************************************************************************/
33 
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36 
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/module.h>
41 #include <sys/bus.h>
42 #include <sys/conf.h>
43 #include <machine/bus.h>
44 #include <machine/resource.h>
45 #include <sys/bus_dma.h>
46 #include <sys/rman.h>
47 #include <sys/queue.h>
48 #include <sys/sysctl.h>
49 #include <sys/taskqueue.h>
50 
51 
52 #include <sys/proc.h>
53 #include <sys/sched.h>
54 #include <sys/smp.h>
56 
57 #include <netinet/in_systm.h>
58 #include <netinet/in.h>
59 #include <netinet/ip.h>
60 #include <netinet/tcp.h>
61 
62 #include <dev/pci/pcireg.h>
63 #include <dev/pci/pcivar.h>
64 #include <dev/cxgb/common/cxgb_common.h>
65 #include <dev/cxgb/common/cxgb_regs.h>
66 #include <dev/cxgb/common/cxgb_sge_defs.h>
67 #include <dev/cxgb/common/cxgb_t3_cpl.h>
68 #include <dev/cxgb/common/cxgb_firmware_exports.h>
69 
70 #include <dev/cxgb/sys/mvec.h>
71 
72 uint32_t collapse_free = 0;
73 uint32_t mb_free_vec_free = 0;
74 int      collapse_mbufs = 1;
75 
76 #define USE_GTS 0
77 
78 #define SGE_RX_SM_BUF_SIZE	1536
79 #define SGE_RX_DROP_THRES	16
80 
81 /*
82  * Period of the Tx buffer reclaim timer.  This timer does not need to run
83  * frequently as Tx buffers are usually reclaimed by new Tx packets.
84  */
85 #define TX_RECLAIM_PERIOD       (hz >> 2)
86 
87 /*
88  * work request size in bytes
89  */
90 #define WR_LEN (WR_FLITS * 8)
91 
92 /*
93  * Values for sge_txq.flags
94  */
95 enum {
96 	TXQ_RUNNING	= 1 << 0,  /* fetch engine is running */
97 	TXQ_LAST_PKT_DB = 1 << 1,  /* last packet rang the doorbell */
98 };
99 
100 struct tx_desc {
101 	uint64_t	flit[TX_DESC_FLITS];
102 } __packed;
103 
104 struct rx_desc {
105 	uint32_t	addr_lo;
106 	uint32_t	len_gen;
107 	uint32_t	gen2;
108 	uint32_t	addr_hi;
109 } __packed;
110 
111 struct rsp_desc {               /* response queue descriptor */
112 	struct rss_header	rss_hdr;
113 	uint32_t		flags;
114 	uint32_t		len_cq;
115 	uint8_t			imm_data[47];
116 	uint8_t			intr_gen;
117 } __packed;
118 
119 #define RX_SW_DESC_MAP_CREATED	(1 << 0)
120 #define TX_SW_DESC_MAP_CREATED	(1 << 1)
121 #define RX_SW_DESC_INUSE        (1 << 3)
122 #define TX_SW_DESC_MAPPED       (1 << 4)
123 
124 #define RSPQ_NSOP_NEOP           G_RSPD_SOP_EOP(0)
125 #define RSPQ_EOP                 G_RSPD_SOP_EOP(F_RSPD_EOP)
126 #define RSPQ_SOP                 G_RSPD_SOP_EOP(F_RSPD_SOP)
127 #define RSPQ_SOP_EOP             G_RSPD_SOP_EOP(F_RSPD_SOP|F_RSPD_EOP)
128 
129 struct tx_sw_desc {                /* SW state per Tx descriptor */
130 	struct mbuf	*m;
131 	bus_dmamap_t	map;
132 	int		flags;
133 };
134 
135 struct rx_sw_desc {                /* SW state per Rx descriptor */
136 	void	        *cl;
137 	bus_dmamap_t	map;
138 	int		flags;
139 };
140 
141 struct txq_state {
142 	unsigned int compl;
143 	unsigned int gen;
144 	unsigned int pidx;
145 };
146 
147 struct refill_fl_cb_arg {
148 	int               error;
149 	bus_dma_segment_t seg;
150 	int               nseg;
151 };
152 
153 /*
154  * Maps a number of flits to the number of Tx descriptors that can hold them.
155  * The formula is
156  *
157  * desc = 1 + (flits - 2) / (WR_FLITS - 1).
158  *
159  * HW allows up to 4 descriptors to be combined into a WR.
160  */
161 static uint8_t flit_desc_map[] = {
162 	0,
163 #if SGE_NUM_GENBITS == 1
164 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
165 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
166 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
167 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
168 #elif SGE_NUM_GENBITS == 2
169 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
170 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
171 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
172 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
173 #else
174 # error "SGE_NUM_GENBITS must be 1 or 2"
175 #endif
176 };
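#ifdef notyet
/*
 * Illustrative sketch only (guarded out, not part of the driver): it
 * cross-checks the table above against the closed-form expression from the
 * comment, desc = 1 + (flits - 2) / (WR_FLITS - 1).  The helper name is an
 * invention for this example.
 */
static void
check_flit_desc_map(void)
{
	int flits;

	for (flits = 2; flits < (int)sizeof(flit_desc_map); flits++)
		KASSERT(flit_desc_map[flits] ==
		    1 + (flits - 2) / (WR_FLITS - 1),
		    ("flit_desc_map[%d] disagrees with the formula", flits));
}
#endif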
177 
178 
179 static int lro_default = 0;
180 int cxgb_debug = 0;
181 
182 static void t3_free_qset(adapter_t *sc, struct sge_qset *q);
183 static void sge_timer_cb(void *arg);
184 static void sge_timer_reclaim(void *arg, int ncount);
185 static int free_tx_desc(adapter_t *sc, struct sge_txq *q, int n, struct mbuf **m_vec);
186 
187 /**
188  *	reclaim_completed_tx - reclaims completed Tx descriptors
189  *	@adapter: the adapter
190  *	@q: the Tx queue to reclaim completed descriptors from
191  *
192  *	Reclaims Tx descriptors that the SGE has indicated it has processed and
193  *	collects up to @nbufs of the associated mbufs in @mvec for the caller to
194  *	free.  Returns that count.  Called with the Tx queue's lock held.
195  */
196 static __inline int
197 reclaim_completed_tx(adapter_t *adapter, struct sge_txq *q, int nbufs, struct mbuf **mvec)
198 {
199 	int reclaimed, reclaim = desc_reclaimable(q);
200 	int n = 0;
201 
202 	mtx_assert(&q->lock, MA_OWNED);
203 
204 	if (reclaim > 0) {
205 		n = free_tx_desc(adapter, q, min(reclaim, nbufs), mvec);
206 		reclaimed = min(reclaim, nbufs);
207 		q->cleaned += reclaimed;
208 		q->in_use -= reclaimed;
209 	}
210 
211 	return (n);
212 }
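/*
 * Typical usage, as in sge_timer_reclaim() below: take the queue lock, call
 * reclaim_completed_tx() with a scratch mbuf array, drop the lock, and only
 * then m_freem_vec() each returned mbuf, keeping the free work outside the
 * lock.
 */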
213 
214 /**
215  *	t3_sge_init - initialize SGE
216  *	@adap: the adapter
217  *	@p: the SGE parameters
218  *
219  *	Performs SGE initialization needed every time after a chip reset.
220  *	We do not initialize any of the queue sets here, instead the driver
221  *	We do not initialize any of the queue sets here; instead, the driver
222  *	top-level must request those individually.  We also do not enable DMA
223  *	here; that should be done after the queues have been set up.
224 void
225 t3_sge_init(adapter_t *adap, struct sge_params *p)
226 {
227 	u_int ctrl, ups;
228 
229 	ups = 0; /* = ffs(pci_resource_len(adap->pdev, 2) >> 12); */
230 
231 	ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL |
232 	       F_CQCRDTCTRL |
233 	       V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS |
234 	       V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING;
235 #if SGE_NUM_GENBITS == 1
236 	ctrl |= F_EGRGENCTRL;
237 #endif
238 	if (adap->params.rev > 0) {
239 		if (!(adap->flags & (USING_MSIX | USING_MSI)))
240 			ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ;
241 		ctrl |= F_CQCRDTCTRL | F_AVOIDCQOVFL;
242 	}
243 	t3_write_reg(adap, A_SG_CONTROL, ctrl);
244 	t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) |
245 		     V_LORCQDRBTHRSH(512));
246 	t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10);
247 	t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) |
248 		     V_TIMEOUT(200 * core_ticks_per_usec(adap)));
249 	t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH, 1000);
250 	t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256);
251 	t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000);
252 	t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256);
253 	t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff));
254 	t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024);
255 }
256 
257 
258 /**
259  *	sgl_len - calculates the size of an SGL of the given capacity
260  *	@n: the number of SGL entries
261  *
262  *	Calculates the number of flits needed for a scatter/gather list that
263  *	can hold the given number of entries.
264  */
265 static __inline unsigned int
266 sgl_len(unsigned int n)
267 {
268 	return ((3 * n) / 2 + (n & 1));
269 }
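/*
 * Worked example: the hardware SGL packs two entries into three flits (a
 * flit holding a pair of 32-bit lengths followed by two 64-bit address
 * flits, as make_sgl() below suggests), i.e. 1.5 flits per entry.  Hence
 * sgl_len(2) == 3 and sgl_len(3) == 5, an odd final entry rounding up to a
 * whole flit.
 */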
270 
271 /**
272  *	get_imm_packet - return the next ingress packet buffer from a response
273  *	@resp: the response descriptor containing the packet data
274  *
275  *	Return a packet containing the immediate data of the given response.
276  */
277 static __inline void
278 get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct mbuf *m, void *cl)
279 {
280 	int len;
281 	uint32_t flags = ntohl(resp->flags);
282 	uint8_t sopeop = G_RSPD_SOP_EOP(flags);
283 
284 	/*
285 	 * would be a firmware bug
286 	 */
287 	if (sopeop == RSPQ_NSOP_NEOP || sopeop == RSPQ_SOP)
288 		return;
289 
290 	len = G_RSPD_LEN(ntohl(resp->len_cq));
291 	switch (sopeop) {
292 	case RSPQ_SOP_EOP:
293 		m->m_len = m->m_pkthdr.len = len;
294 		memcpy(m->m_data, resp->imm_data, len);
295 		break;
296 	case RSPQ_EOP:
297 		memcpy(cl, resp->imm_data, len);
298 		m_iovappend(m, cl, MSIZE, len, 0);
299 		break;
300 	}
301 }
302 
303 
304 static __inline u_int
305 flits_to_desc(u_int n)
306 {
307 	return (flit_desc_map[n]);
308 }
309 
310 void
311 t3_sge_err_intr_handler(adapter_t *adapter)
312 {
313 	unsigned int v, status;
314 
315 
316 	status = t3_read_reg(adapter, A_SG_INT_CAUSE);
317 
318 	if (status & F_RSPQCREDITOVERFOW)
319 		CH_ALERT(adapter, "SGE response queue credit overflow\n");
320 
321 	if (status & F_RSPQDISABLED) {
322 		v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS);
323 
324 		CH_ALERT(adapter,
325 			 "packet delivered to disabled response queue (0x%x)\n",
326 			 (v >> S_RSPQ0DISABLED) & 0xff);
327 	}
328 
329 	t3_write_reg(adapter, A_SG_INT_CAUSE, status);
330 	if (status & (F_RSPQCREDITOVERFOW | F_RSPQDISABLED))
331 		t3_fatal_err(adapter);
332 }
333 
334 void
335 t3_sge_prep(adapter_t *adap, struct sge_params *p)
336 {
337 	int i;
338 
339 	/* XXX Does ETHER_ALIGN need to be accounted for here? */
340 	p->max_pkt_size = MJUM16BYTES - sizeof(struct cpl_rx_data);
341 
342 	for (i = 0; i < SGE_QSETS; ++i) {
343 		struct qset_params *q = p->qset + i;
344 
345 		q->polling = adap->params.rev > 0;
346 
347 		if (adap->flags & USING_MSIX)
348 			q->coalesce_nsecs = 6000;
349 		else
350 			q->coalesce_nsecs = 3500;
351 
352 		q->rspq_size = RSPQ_Q_SIZE;
353 		q->fl_size = FL_Q_SIZE;
354 		q->jumbo_size = JUMBO_Q_SIZE;
355 		q->txq_size[TXQ_ETH] = TX_ETH_Q_SIZE;
356 		q->txq_size[TXQ_OFLD] = 1024;
357 		q->txq_size[TXQ_CTRL] = 256;
358 		q->cong_thres = 0;
359 	}
360 }
361 
362 int
363 t3_sge_alloc(adapter_t *sc)
364 {
365 
366 	/* The parent tag. */
367 	if (bus_dma_tag_create( NULL,			/* parent */
368 				1, 0,			/* algnmnt, boundary */
369 				BUS_SPACE_MAXADDR,	/* lowaddr */
370 				BUS_SPACE_MAXADDR,	/* highaddr */
371 				NULL, NULL,		/* filter, filterarg */
372 				BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
373 				BUS_SPACE_UNRESTRICTED, /* nsegments */
374 				BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
375 				0,			/* flags */
376 				NULL, NULL,		/* lock, lockarg */
377 				&sc->parent_dmat)) {
378 		device_printf(sc->dev, "Cannot allocate parent DMA tag\n");
379 		return (ENOMEM);
380 	}
381 
382 	/*
383 	 * DMA tag for normal sized RX frames
384 	 */
385 	if (bus_dma_tag_create(sc->parent_dmat, MCLBYTES, 0, BUS_SPACE_MAXADDR,
386 		BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, 1,
387 		MCLBYTES, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_dmat)) {
388 		device_printf(sc->dev, "Cannot allocate RX DMA tag\n");
389 		return (ENOMEM);
390 	}
391 
392 	/*
393 	 * DMA tag for jumbo sized RX frames.
394 	 */
395 	if (bus_dma_tag_create(sc->parent_dmat, MJUMPAGESIZE, 0, BUS_SPACE_MAXADDR,
396 		BUS_SPACE_MAXADDR, NULL, NULL, MJUMPAGESIZE, 1, MJUMPAGESIZE,
397 		BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_jumbo_dmat)) {
398 		device_printf(sc->dev, "Cannot allocate RX jumbo DMA tag\n");
399 		return (ENOMEM);
400 	}
401 
402 	/*
403 	 * DMA tag for TX frames.
404 	 */
405 	if (bus_dma_tag_create(sc->parent_dmat, 1, 0, BUS_SPACE_MAXADDR,
406 		BUS_SPACE_MAXADDR, NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
407 		TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
408 		NULL, NULL, &sc->tx_dmat)) {
409 		device_printf(sc->dev, "Cannot allocate TX DMA tag\n");
410 		return (ENOMEM);
411 	}
412 
413 	return (0);
414 }
415 
416 int
417 t3_sge_free(struct adapter * sc)
418 {
419 
420 	if (sc->tx_dmat != NULL)
421 		bus_dma_tag_destroy(sc->tx_dmat);
422 
423 	if (sc->rx_jumbo_dmat != NULL)
424 		bus_dma_tag_destroy(sc->rx_jumbo_dmat);
425 
426 	if (sc->rx_dmat != NULL)
427 		bus_dma_tag_destroy(sc->rx_dmat);
428 
429 	if (sc->parent_dmat != NULL)
430 		bus_dma_tag_destroy(sc->parent_dmat);
431 
432 	return (0);
433 }
434 
435 void
436 t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p)
437 {
438 
439 	qs->rspq.holdoff_tmr = max(p->coalesce_nsecs/100, 1U);
440 	qs->rspq.polling = 0 /* p->polling */;
441 }
442 
443 static void
444 refill_fl_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
445 {
446 	struct refill_fl_cb_arg *cb_arg = arg;
447 
448 	cb_arg->error = error;
449 	cb_arg->seg = segs[0];
450 	cb_arg->nseg = nseg;
451 
452 }
453 
454 /**
455  *	refill_fl - refill an SGE free-buffer list
456  *	@sc: the controller softc
457  *	@q: the free-list to refill
458  *	@n: the number of new buffers to allocate
459  *
460  *	(Re)populate an SGE free-buffer list with up to @n new packet buffers.
461  *	The caller must ensure that @n does not exceed the queue's capacity.
462  */
463 static void
464 refill_fl(adapter_t *sc, struct sge_fl *q, int n)
465 {
466 	struct rx_sw_desc *sd = &q->sdesc[q->pidx];
467 	struct rx_desc *d = &q->desc[q->pidx];
468 	struct refill_fl_cb_arg cb_arg;
469 	void *cl;
470 	int err;
471 
472 	cb_arg.error = 0;
473 	while (n--) {
474 		/*
475 		 * We only allocate a cluster, mbuf allocation happens after rx
476 		 */
477 		if ((cl = m_cljget(NULL, M_DONTWAIT, q->buf_size)) == NULL) {
478 			log(LOG_WARNING, "Failed to allocate cluster\n");
479 			goto done;
480 		}
481 		if ((sd->flags & RX_SW_DESC_MAP_CREATED) == 0) {
482 			if ((err = bus_dmamap_create(q->entry_tag, 0, &sd->map))) {
483 				log(LOG_WARNING, "bus_dmamap_create failed %d\n", err);
484 				uma_zfree(q->zone, cl);
485 				goto done;
486 			}
487 			sd->flags |= RX_SW_DESC_MAP_CREATED;
488 		}
489 		err = bus_dmamap_load(q->entry_tag, sd->map, cl, q->buf_size,
490 		    refill_fl_cb, &cb_arg, 0);
491 
492 		if (err != 0 || cb_arg.error) {
493 			log(LOG_WARNING, "failure in refill_fl %d\n", cb_arg.error);
494 			/* Free the unused cluster and stop refilling; the doorbell */
495 			/* write below still covers buffers already added.          */
496 			uma_zfree(q->zone, cl);
497 			goto done;
498 		}
499 
500 		sd->flags |= RX_SW_DESC_INUSE;
501 		sd->cl = cl;
502 		d->addr_lo = htobe32(cb_arg.seg.ds_addr & 0xffffffff);
503 		d->addr_hi = htobe32(((uint64_t)cb_arg.seg.ds_addr >>32) & 0xffffffff);
504 		d->len_gen = htobe32(V_FLD_GEN1(q->gen));
505 		d->gen2 = htobe32(V_FLD_GEN2(q->gen));
506 
507 		d++;
508 		sd++;
509 
510 		if (++q->pidx == q->size) {
511 			q->pidx = 0;
512 			q->gen ^= 1;
513 			sd = q->sdesc;
514 			d = q->desc;
515 		}
516 		q->credits++;
517 	}
518 
519 done:
520 	t3_write_reg(sc, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
521 }
522 
523 
524 /**
525  *	free_rx_bufs - free the Rx buffers on an SGE free list
526  *	@sc: the controller softc
527  *	@q: the SGE free list to clean up
528  *
529  *	Release the buffers on an SGE free-buffer Rx queue.  HW fetching from
530  *	this queue should be stopped before calling this function.
531  */
532 static void
533 free_rx_bufs(adapter_t *sc, struct sge_fl *q)
534 {
535 	u_int cidx = q->cidx;
536 
537 	while (q->credits--) {
538 		struct rx_sw_desc *d = &q->sdesc[cidx];
539 
540 		if (d->flags & RX_SW_DESC_INUSE) {
541 			bus_dmamap_unload(q->entry_tag, d->map);
542 			bus_dmamap_destroy(q->entry_tag, d->map);
543 			uma_zfree(q->zone, d->cl);
544 		}
545 		d->cl = NULL;
546 		if (++cidx == q->size)
547 			cidx = 0;
548 	}
549 }
550 
551 static __inline void
552 __refill_fl(adapter_t *adap, struct sge_fl *fl)
553 {
554 	refill_fl(adap, fl, min(16U, fl->size - fl->credits));
555 }
556 
557 static void
558 alloc_ring_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
559 {
560 	uint32_t *addr;
561 
562 	addr = arg;
563 	*addr = segs[0].ds_addr;
564 }
565 
566 static int
567 alloc_ring(adapter_t *sc, size_t nelem, size_t elem_size, size_t sw_size,
568     bus_addr_t *phys, void *desc, void *sdesc, bus_dma_tag_t *tag,
569     bus_dmamap_t *map, bus_dma_tag_t parent_entry_tag, bus_dma_tag_t *entry_tag)
570 {
571 	size_t len = nelem * elem_size;
572 	void *s = NULL;
573 	void *p = NULL;
574 	int err;
575 
576 	if ((err = bus_dma_tag_create(sc->parent_dmat, PAGE_SIZE, 0,
577 				      BUS_SPACE_MAXADDR_32BIT,
578 				      BUS_SPACE_MAXADDR, NULL, NULL, len, 1,
579 				      len, 0, NULL, NULL, tag)) != 0) {
580 		device_printf(sc->dev, "Cannot allocate descriptor tag\n");
581 		return (ENOMEM);
582 	}
583 
584 	if ((err = bus_dmamem_alloc(*tag, (void **)&p, BUS_DMA_NOWAIT,
585 				    map)) != 0) {
586 		device_printf(sc->dev, "Cannot allocate descriptor memory\n");
587 		return (ENOMEM);
588 	}
589 
590 	bus_dmamap_load(*tag, *map, p, len, alloc_ring_cb, phys, 0);
591 	bzero(p, len);
592 	*(void **)desc = p;
593 
594 	if (sw_size) {
595 		len = nelem * sw_size;
596 		s = malloc(len, M_DEVBUF, M_WAITOK);
597 		bzero(s, len);
598 		*(void **)sdesc = s;
599 	}
600 	if (parent_entry_tag == NULL)
601 		return (0);
602 
603 	if ((err = bus_dma_tag_create(parent_entry_tag, 1, 0,
604 				      BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR,
605 		                      NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
606 				      TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
607 		                      NULL, NULL, entry_tag)) != 0) {
608 		device_printf(sc->dev, "Cannot allocate descriptor entry tag\n");
609 		return (ENOMEM);
610 	}
611 	return (0);
612 }
613 
614 static void
615 sge_slow_intr_handler(void *arg, int ncount)
616 {
617 	adapter_t *sc = arg;
618 
619 	t3_slow_intr_handler(sc);
620 }
621 
622 static void
623 sge_timer_cb(void *arg)
624 {
625 	adapter_t *sc = arg;
626 	struct sge_qset *qs;
627 	struct sge_txq  *txq;
628 	int i, j;
629 	int reclaim_eth, reclaim_ofl, refill_rx;
630 
631 	for (i = 0; i < sc->params.nports; i++)
632 		for (j = 0; j < sc->port[i].nqsets; j++) {
633 			qs = &sc->sge.qs[sc->port[i].first_qset + j];
634 			txq = &qs->txq[0];
635 			reclaim_eth = txq[TXQ_ETH].processed - txq[TXQ_ETH].cleaned;
636 			reclaim_ofl = txq[TXQ_OFLD].processed - txq[TXQ_OFLD].cleaned;
637 			refill_rx = ((qs->fl[0].credits < qs->fl[0].size) ||
638 			    (qs->fl[1].credits < qs->fl[1].size));
639 			if (reclaim_eth || reclaim_ofl || refill_rx) {
640 				taskqueue_enqueue(sc->tq, &sc->timer_reclaim_task);
641 				goto done;
642 			}
643 		}
644 done:
645 	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
646 }
647 
648 /*
649  * This is meant to be a catch-all function to keep sge state private
650  * to sge.c
651  *
652  */
653 int
654 t3_sge_init_sw(adapter_t *sc)
655 {
656 
657 	callout_init(&sc->sge_timer_ch, CALLOUT_MPSAFE);
658 	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
659 	TASK_INIT(&sc->timer_reclaim_task, 0, sge_timer_reclaim, sc);
660 	TASK_INIT(&sc->slow_intr_task, 0, sge_slow_intr_handler, sc);
661 	return (0);
662 }
663 
664 void
665 t3_sge_deinit_sw(adapter_t *sc)
666 {
667 	callout_drain(&sc->sge_timer_ch);
668 	if (sc->tq) {
669 		taskqueue_drain(sc->tq, &sc->timer_reclaim_task);
670 		taskqueue_drain(sc->tq, &sc->slow_intr_task);
671 	}
672 }
673 
674 /**
675  *	refill_rspq - replenish an SGE response queue
676  *	@adapter: the adapter
677  *	@q: the response queue to replenish
678  *	@credits: how many new responses to make available
679  *
680  *	Replenishes a response queue by making the supplied number of responses
681  *	available to HW.
682  */
683 static __inline void
684 refill_rspq(adapter_t *sc, const struct sge_rspq *q, u_int credits)
685 {
686 
687 	/* mbufs are allocated on demand when a rspq entry is processed. */
688 	t3_write_reg(sc, A_SG_RSPQ_CREDIT_RETURN,
689 		     V_RSPQ(q->cntxt_id) | V_CREDITS(credits));
690 }
691 
692 
693 static void
694 sge_timer_reclaim(void *arg, int ncount)
695 {
696 	adapter_t *sc = arg;
697 	int i, j, nqsets = 0;
698 	struct sge_qset *qs;
699 	struct sge_txq *txq;
700 	struct mtx *lock;
701 	struct mbuf *m_vec[TX_CLEAN_MAX_DESC];
702 	int n, reclaimable;
703 	/*
704 	 * XXX assuming these quantities are allowed to change during operation
705 	 */
706 	for (i = 0; i < sc->params.nports; i++)
707 		nqsets += sc->port[i].nqsets;
708 
709 	for (i = 0; i < nqsets; i++) {
710 		qs = &sc->sge.qs[i];
711 		txq = &qs->txq[TXQ_ETH];
712 		reclaimable = desc_reclaimable(txq);
713 		if (reclaimable > 0) {
714 			mtx_lock(&txq->lock);
715 			n = reclaim_completed_tx(sc, txq, TX_CLEAN_MAX_DESC, m_vec);
716 			mtx_unlock(&txq->lock);
717 
718 			for (j = 0; j < n; j++) {
719 				m_freem_vec(m_vec[j]);
720 			}
721 		}
722 
723 		txq = &qs->txq[TXQ_OFLD];
724 		reclaimable = desc_reclaimable(txq);
725 		if (reclaimable > 0) {
726 			mtx_lock(&txq->lock);
727 			n = reclaim_completed_tx(sc, txq, TX_CLEAN_MAX_DESC, m_vec);
728 			mtx_unlock(&txq->lock);
729 
730 			for (j = 0; j < n; j++) {
731 				m_freem_vec(m_vec[j]);
732 			}
733 		}
734 
735 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
736 			    &sc->sge.qs[0].rspq.lock;
737 
738 		if (mtx_trylock(lock)) {
739 			/* XXX currently assume that we are *NOT* polling */
740 			uint32_t status = t3_read_reg(sc, A_SG_RSPQ_FL_STATUS);
741 
742 			if (qs->fl[0].credits < qs->fl[0].size - 16)
743 				__refill_fl(sc, &qs->fl[0]);
744 			if (qs->fl[1].credits < qs->fl[1].size - 16)
745 				__refill_fl(sc, &qs->fl[1]);
746 
747 			if (status & (1 << qs->rspq.cntxt_id)) {
748 				if (qs->rspq.credits) {
749 					refill_rspq(sc, &qs->rspq, 1);
750 					qs->rspq.credits--;
751 					t3_write_reg(sc, A_SG_RSPQ_FL_STATUS,
752 					    1 << qs->rspq.cntxt_id);
753 				}
754 			}
755 			mtx_unlock(lock);
756 		}
757 	}
758 }
759 
760 /**
761  *	init_qset_cntxt - initialize an SGE queue set context info
762  *	@qs: the queue set
763  *	@id: the queue set id
764  *
765  *	Initializes the TIDs and context ids for the queues of a queue set.
766  */
767 static void
768 init_qset_cntxt(struct sge_qset *qs, u_int id)
769 {
770 
771 	qs->rspq.cntxt_id = id;
772 	qs->fl[0].cntxt_id = 2 * id;
773 	qs->fl[1].cntxt_id = 2 * id + 1;
774 	qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id;
775 	qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id;
776 	qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id;
777 	qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id;
778 	qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id;
779 }
780 
781 
782 static void
783 txq_prod(struct sge_txq *txq, unsigned int ndesc, struct txq_state *txqs)
784 {
785 	txq->in_use += ndesc;
786 	/*
787 	 * XXX we don't handle stopping of queue
788 	 * presumably start handles this when we bump against the end
789 	 */
790 	txqs->gen = txq->gen;
791 	txq->unacked += ndesc;
792 	txqs->compl = (txq->unacked & 8) << (S_WR_COMPL - 3);
793 	txq->unacked &= 7;
794 	txqs->pidx = txq->pidx;
795 	txq->pidx += ndesc;
796 
797 	if (txq->pidx >= txq->size) {
798 		txq->pidx -= txq->size;
799 		txq->gen ^= 1;
800 	}
801 
802 }
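/*
 * Note on the accounting above: txqs->compl requests a work-request
 * completion roughly once every eight descriptors (the "unacked & 8" test),
 * so the SGE returns Tx credits in batches rather than per packet, while
 * pidx and gen wrap together so the hardware can tell fresh descriptors
 * from stale ones.
 */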
803 
804 /**
805  *	calc_tx_descs - calculate the number of Tx descriptors for a packet
806  *	@m: the packet mbufs
807  *      @nsegs: the number of segments
808  *
809  * 	Returns the number of Tx descriptors needed for the given Ethernet
810  * 	packet.  Ethernet packets require addition of WR and CPL headers.
811  */
812 static __inline unsigned int
813 calc_tx_descs(const struct mbuf *m, int nsegs)
814 {
815 	unsigned int flits;
816 
817 	if (m->m_pkthdr.len <= WR_LEN - sizeof(struct cpl_tx_pkt))
818 		return 1;
819 
820 	flits = sgl_len(nsegs) + 2;
821 #ifdef TSO_SUPPORTED
822 	if  (m->m_pkthdr.csum_flags & (CSUM_TSO))
823 		flits++;
824 #endif
825 	return flits_to_desc(flits);
826 }
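/*
 * Example: a frame too large for immediate data that maps to three DMA
 * segments needs sgl_len(3) + 2 = 7 flits (SGL plus WR and CPL headers),
 * which flits_to_desc() maps to a single Tx descriptor.
 */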
827 
828 static unsigned int
829 busdma_map_mbufs(struct mbuf **m, struct sge_txq *txq,
830     struct tx_sw_desc *stx, bus_dma_segment_t *segs, int *nsegs)
831 {
832 	struct mbuf *m0;
833 	int err, pktlen;
834 
835 	m0 = *m;
836 	pktlen = m0->m_pkthdr.len;
837 
838 	err = bus_dmamap_load_mvec_sg(txq->entry_tag, stx->map, m0, segs, nsegs, 0);
839 #ifdef DEBUG
840 	if (err) {
841 		int n = 0;
842 		struct mbuf *mtmp = m0;
843 		while(mtmp) {
844 			n++;
845 			mtmp = mtmp->m_next;
846 		}
847 		printf("map_mbufs: bus_dmamap_load_mbuf_sg failed with %d - pkthdr.len==%d nmbufs=%d\n",
848 		    err, m0->m_pkthdr.len, n);
849 	}
850 #endif
851 	if (err == EFBIG) {
852 		/* Too many segments, try to defrag */
853 		m0 = m_defrag(m0, M_NOWAIT);
854 		if (m0 == NULL) {
855 			m_freem(*m);
856 			*m = NULL;
857 			return (ENOBUFS);
858 		}
859 		*m = m0;
860 		err = bus_dmamap_load_mbuf_sg(txq->entry_tag, stx->map, m0, segs, nsegs, 0);
861 	}
862 
863 	if (err == ENOMEM) {
864 		return (err);
865 	}
866 
867 	if (err) {
868 		if (cxgb_debug)
869 			printf("map failure err=%d pktlen=%d\n", err, pktlen);
870 		m_freem_vec(m0);
871 		*m = NULL;
872 		return (err);
873 	}
874 
875 	bus_dmamap_sync(txq->entry_tag, stx->map, BUS_DMASYNC_PREWRITE);
876 	stx->flags |= TX_SW_DESC_MAPPED;
877 
878 	return (0);
879 }
880 
881 /**
882  *	make_sgl - populate a scatter/gather list for a packet
883  *	@sgp: the SGL to populate
884  *	@segs: the packet dma segments
885  *	@nsegs: the number of segments
886  *
887  *	Generates a scatter/gather list for the buffers that make up a packet
888  *	and returns the SGL size in 8-byte words.  The caller must size the SGL
889  *	appropriately.
890  */
891 static __inline void
892 make_sgl(struct sg_ent *sgp, bus_dma_segment_t *segs, int nsegs)
893 {
894 	int i, idx;
895 
896 	for (idx = 0, i = 0; i < nsegs; i++, idx ^= 1) {
897 		if (i && idx == 0)
898 			++sgp;
899 
900 		sgp->len[idx] = htobe32(segs[i].ds_len);
901 		sgp->addr[idx] = htobe64(segs[i].ds_addr);
902 	}
903 
904 	if (idx)
905 		sgp->len[idx] = 0;
906 }
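/*
 * Informal sketch of the result for segments A, B, C, assuming the usual T3
 * sg_ent layout of a 32-bit length pair followed by two 64-bit addresses:
 *
 *	sg_ent[0]: { len A, len B } { addr A } { addr B }
 *	sg_ent[1]: { len C, 0     } { addr C } (second address unused)
 *
 * The zero length written above is what terminates an odd-count list.
 */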
907 
908 /**
909  *	check_ring_tx_db - check and potentially ring a Tx queue's doorbell
910  *	@adap: the adapter
911  *	@q: the Tx queue
912  *
913  *	Ring the doorbel if a Tx queue is asleep.  There is a natural race,
914  *	where the HW is going to sleep just after we checked, however,
915  *	then the interrupt handler will detect the outstanding TX packet
916  *	and ring the doorbell for us.
917  *
918  *	When GTS is disabled we unconditionally ring the doorbell.
919  */
920 static __inline void
921 check_ring_tx_db(adapter_t *adap, struct sge_txq *q)
922 {
923 #if USE_GTS
924 	clear_bit(TXQ_LAST_PKT_DB, &q->flags);
925 	if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) {
926 		set_bit(TXQ_LAST_PKT_DB, &q->flags);
927 #ifdef T3_TRACE
928 		T3_TRACE1(adap->tb[q->cntxt_id & 7], "doorbell Tx, cntxt %d",
929 			  q->cntxt_id);
930 #endif
931 		t3_write_reg(adap, A_SG_KDOORBELL,
932 			     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
933 	}
934 #else
935 	wmb();            /* write descriptors before telling HW */
936 	t3_write_reg(adap, A_SG_KDOORBELL,
937 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
938 #endif
939 }
940 
941 static __inline void
942 wr_gen2(struct tx_desc *d, unsigned int gen)
943 {
944 #if SGE_NUM_GENBITS == 2
945 	d->flit[TX_DESC_FLITS - 1] = htobe64(gen);
946 #endif
947 }
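/*
 * With two generation bits the second copy lives in the last flit of the
 * descriptor and is written last: txq_prod() flips q->gen each time pidx
 * wraps, which is how the SGE distinguishes descriptors written on the
 * current pass over the ring from stale ones left from the previous pass.
 */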
948 
949 /* sizeof(*eh) + sizeof(*vhdr) + sizeof(*ip) + sizeof(*tcp) */
950 #define TCPPKTHDRSIZE (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + 20 + 20)
951 
952 int
953 t3_encap(struct port_info *p, struct mbuf **m)
954 {
955 	adapter_t *sc;
956 	struct mbuf *m0;
957 	struct sge_qset *qs;
958 	struct sge_txq *txq;
959 	struct tx_sw_desc *stx;
960 	struct txq_state txqs;
961 	unsigned int nsegs, ndesc, flits, cntrl, mlen;
962 	int err, tso_info = 0;
963 
964 	struct work_request_hdr *wrp;
965 	struct tx_sw_desc *txsd;
966 	struct sg_ent *sgp, sgl[TX_MAX_SEGS / 2 + 1];
967 	bus_dma_segment_t segs[TX_MAX_SEGS];
968 	uint32_t wr_hi, wr_lo, sgl_flits;
969 
970 	struct tx_desc *txd;
971 	struct cpl_tx_pkt *cpl;
972 
973 	DPRINTF("t3_encap ");
974 	m0 = *m;
975 	sc = p->adapter;
976 	qs = &sc->sge.qs[p->first_qset];
977 	txq = &qs->txq[TXQ_ETH];
978 	stx = &txq->sdesc[txq->pidx];
979 	txd = &txq->desc[txq->pidx];
980 	cpl = (struct cpl_tx_pkt *)txd;
981 	mlen = m0->m_pkthdr.len;
982 	cpl->len = htonl(mlen | 0x80000000);
983 
984 	DPRINTF("mlen=%d\n", mlen);
985 	/*
986 	 * XXX handle checksum, TSO, and VLAN here
987 	 *
988 	 */
989 	cntrl = V_TXPKT_INTF(p->port);
990 
991 	/*
992 	 * XXX need to add VLAN support for 6.x
993 	 */
994 #ifdef VLAN_SUPPORTED
995 	if (m0->m_flags & M_VLANTAG)
996 		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
997 	if  (m0->m_pkthdr.csum_flags & (CSUM_TSO))
998 		tso_info = V_LSO_MSS(m0->m_pkthdr.tso_segsz);
999 #endif
1000 	if (tso_info) {
1001 		int eth_type;
1002 		struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *) cpl;
1003 		struct ip *ip;
1004 		struct tcphdr *tcp;
1005 		uint8_t *pkthdr, tmp[TCPPKTHDRSIZE]; /* is this too large for the stack? */
1006 
1007 		txd->flit[2] = 0;
1008 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO);
1009 		hdr->cntrl = htonl(cntrl);
1010 
1011 		if (__predict_false(m0->m_len < TCPPKTHDRSIZE)) {
1012 			pkthdr = &tmp[0];
1013 			m_copydata(m0, 0, TCPPKTHDRSIZE, pkthdr);
1014 		} else {
1015 			pkthdr = m0->m_data;
1016 		}
1017 
1018 		if (__predict_false(m0->m_flags & M_VLANTAG)) {
1019 			eth_type = CPL_ETH_II_VLAN;
1020 			ip = (struct ip *)(pkthdr + ETHER_HDR_LEN +
1021 			    ETHER_VLAN_ENCAP_LEN);
1022 		} else {
1023 			eth_type = CPL_ETH_II;
1024 			ip = (struct ip *)(pkthdr + ETHER_HDR_LEN);
1025 		}
1026 		tcp = (struct tcphdr *)((uint8_t *)ip +
1027 		    sizeof(*ip));
1028 
1029 		tso_info |= V_LSO_ETH_TYPE(eth_type) |
1030 			    V_LSO_IPHDR_WORDS(ip->ip_hl) |
1031 			    V_LSO_TCPHDR_WORDS(tcp->th_off);
1032 		hdr->lso_info = htonl(tso_info);
1033 		flits = 3;
1034 	} else {
1035 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1036 		cpl->cntrl = htonl(cntrl);
1037 
1038 		if (mlen <= WR_LEN - sizeof(*cpl)) {
1039 			txq_prod(txq, 1, &txqs);
1040 			txq->sdesc[txqs.pidx].m = m0;
1041 
1042 			if (m0->m_len == m0->m_pkthdr.len)
1043 				memcpy(&txd->flit[2], m0->m_data, mlen);
1044 			else
1045 				m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[2]);
1046 
1047 			flits = (mlen + 7) / 8 + 2;
1048 			cpl->wr.wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1049 					  V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1050 					  F_WR_SOP | F_WR_EOP | txqs.compl);
1051 			wmb();
1052 			cpl->wr.wr_lo = htonl(V_WR_LEN(flits) |
1053 			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1054 
1055 			wr_gen2(txd, txqs.gen);
1056 			check_ring_tx_db(sc, txq);
1057 			return (0);
1058 		}
1059 		flits = 2;
1060 	}
1061 
1062 	wrp = (struct work_request_hdr *)txd;
1063 
1064 	if ((err = busdma_map_mbufs(m, txq, stx, segs, &nsegs)) != 0) {
1065 		return (err);
1066 	}
1067 	m0 = *m;
1068 	ndesc = calc_tx_descs(m0, nsegs);
1069 
1070 	sgp = (ndesc == 1) ? (struct sg_ent *)&txd->flit[flits] : &sgl[0];
1071 	make_sgl(sgp, segs, nsegs);
1072 
1073 	sgl_flits = sgl_len(nsegs);
1074 
1075 	DPRINTF("make_sgl success nsegs==%d ndesc==%d\n", nsegs, ndesc);
1076 	txq_prod(txq, ndesc, &txqs);
1077 	txsd = &txq->sdesc[txqs.pidx];
1078 	wr_hi = htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1079 	wr_lo = htonl(V_WR_TID(txq->token));
1080 	txsd->m = m0;
1081 
1082 	if (__predict_true(ndesc == 1)) {
1083 		wrp->wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1084 		    V_WR_SGLSFLT(flits)) | wr_hi;
1085 		wmb();
1086 		wrp->wr_lo = htonl(V_WR_LEN(flits + sgl_flits) |
1087 		    V_WR_GEN(txqs.gen)) | wr_lo;
1088 		/* XXX gen? */
1089 		wr_gen2(txd, txqs.gen);
1090 	} else {
1091 		unsigned int ogen = txqs.gen;
1092 		const uint64_t *fp = (const uint64_t *)sgl;
1093 		struct work_request_hdr *wp = wrp;
1094 
1095 		/* XXX - CHECK ME */
1096 		wrp->wr_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) |
1097 		    V_WR_SGLSFLT(flits)) | wr_hi;
1098 
1099 		while (sgl_flits) {
1100 			unsigned int avail = WR_FLITS - flits;
1101 
1102 			if (avail > sgl_flits)
1103 				avail = sgl_flits;
1104 			memcpy(&txd->flit[flits], fp, avail * sizeof(*fp));
1105 			sgl_flits -= avail;
1106 			ndesc--;
1107 			if (!sgl_flits)
1108 				break;
1109 
1110 			fp += avail;
1111 			txd++;
1112 			txsd++;
1113 			if (++txqs.pidx == txq->size) {
1114 				txqs.pidx = 0;
1115 				txqs.gen ^= 1;
1116 				txd = txq->desc;
1117 				txsd = txq->sdesc;
1118 			}
1119 
1120 			/*
1121 			 * when the head of the mbuf chain
1122 			 * is freed all clusters will be freed
1123 			 * with it
1124 			 */
1125 			txsd->m = NULL;
1126 			wrp = (struct work_request_hdr *)txd;
1127 			wrp->wr_hi = htonl(V_WR_DATATYPE(1) |
1128 			    V_WR_SGLSFLT(1)) | wr_hi;
1129 			wrp->wr_lo = htonl(V_WR_LEN(min(WR_FLITS,
1130 				    sgl_flits + 1)) |
1131 			    V_WR_GEN(txqs.gen)) | wr_lo;
1132 			wr_gen2(txd, txqs.gen);
1133 			flits = 1;
1134 		}
1135 #ifdef WHY
1136 		skb->priority = pidx;
1137 #endif
1138 		wrp->wr_hi |= htonl(F_WR_EOP);
1139 		wmb();
1140 		wp->wr_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo;
1141 		wr_gen2((struct tx_desc *)wp, ogen);
1142 	}
1143 	check_ring_tx_db(p->adapter, txq);
1144 
1145 	return (0);
1146 }
1147 
1148 
1149 /**
1150  *	write_imm - write a packet into a Tx descriptor as immediate data
1151  *	@d: the Tx descriptor to write
1152  *	@m: the packet
1153  *	@len: the length of packet data to write as immediate data
1154  *	@gen: the generation bit value to write
1155  *
1156  *	Writes a packet as immediate data into a Tx descriptor.  The packet
1157  *	contains a work request at its beginning.  We must write the packet
1158  *	carefully so the SGE doesn't read it before it has been written in
1159  *	its entirety.
1160  */
1161 static __inline void write_imm(struct tx_desc *d, struct mbuf *m,
1162 			     unsigned int len, unsigned int gen)
1163 {
1164 	struct work_request_hdr *from = (struct work_request_hdr *)m->m_data;
1165 	struct work_request_hdr *to = (struct work_request_hdr *)d;
1166 
1167 	memcpy(&to[1], &from[1], len - sizeof(*from));
1168 	to->wr_hi = from->wr_hi | htonl(F_WR_SOP | F_WR_EOP |
1169 					V_WR_BCNTLFLT(len & 7));
1170 	wmb();
1171 	to->wr_lo = from->wr_lo | htonl(V_WR_GEN(gen) |
1172 					V_WR_LEN((len + 7) / 8));
1173 	wr_gen2(d, gen);
1174 	m_freem(m);
1175 }
1176 
1177 /**
1178  *	check_desc_avail - check descriptor availability on a send queue
1179  *	@adap: the adapter
1180  *	@q: the TX queue
1181  *	@m: the packet needing the descriptors
1182  *	@ndesc: the number of Tx descriptors needed
1183  *	@qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL)
1184  *
1185  *	Checks if the requested number of Tx descriptors is available on an
1186  *	SGE send queue.  If the queue is already suspended or not enough
1187  *	descriptors are available the packet is queued for later transmission.
1188  *	Must be called with the Tx queue locked.
1189  *
1190  *	Returns 0 if enough descriptors are available, 1 if there aren't
1191  *	enough descriptors and the packet has been queued, and 2 if the caller
1192  *	needs to retry because there weren't enough descriptors at the
1193  *	beginning of the call but some freed up in the mean time.
1194  */
1195 static __inline int
1196 check_desc_avail(adapter_t *adap, struct sge_txq *q,
1197 				   struct mbuf *m, unsigned int ndesc,
1198 				   unsigned int qid)
1199 {
1200 	/*
1201 	 * XXX We currently only use this for checking the control queue; the
1202 	 * control queue is only used for binding qsets, which happens at init
1203 	 * time, so we are guaranteed enough descriptors.
1204 	 */
1205 #if 0
1206 	if (__predict_false(!skb_queue_empty(&q->sendq))) {
1207 addq_exit:	__skb_queue_tail(&q->sendq, skb);
1208 		return 1;
1209 	}
1210 	if (__predict_false(q->size - q->in_use < ndesc)) {
1211 
1212 		struct sge_qset *qs = txq_to_qset(q, qid);
1213 
1214 		set_bit(qid, &qs->txq_stopped);
1215 		smp_mb__after_clear_bit();
1216 
1217 		if (should_restart_tx(q) &&
1218 		    test_and_clear_bit(qid, &qs->txq_stopped))
1219 			return 2;
1220 
1221 		q->stops++;
1222 		goto addq_exit;
1223 	}
1224 #endif
1225 	return 0;
1226 }
1227 
1228 
1229 /**
1230  *	reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
1231  *	@q: the SGE control Tx queue
1232  *
1233  *	This is a variant of reclaim_completed_tx() that is used for Tx queues
1234  *	that send only immediate data (presently just the control queues) and
1235  *	thus do not have any sk_buffs to release.
1236  *	thus do not have any mbufs to release.
1237 static __inline void
1238 reclaim_completed_tx_imm(struct sge_txq *q)
1239 {
1240 	unsigned int reclaim = q->processed - q->cleaned;
1241 
1242 	mtx_assert(&q->lock, MA_OWNED);
1243 
1244 	q->in_use -= reclaim;
1245 	q->cleaned += reclaim;
1246 }
1247 
1248 static __inline int
1249 immediate(const struct mbuf *m)
1250 {
1251 	return (m->m_len <= WR_LEN && m->m_pkthdr.len <= WR_LEN);
1252 }
1253 
1254 /**
1255  *	ctrl_xmit - send a packet through an SGE control Tx queue
1256  *	@adap: the adapter
1257  *	@q: the control queue
1258  *	@m: the packet
1259  *
1260  *	Send a packet through an SGE control Tx queue.  Packets sent through
1261  *	a control queue must fit entirely as immediate data in a single Tx
1262  *	descriptor and have no page fragments.
1263  */
1264 static int
1265 ctrl_xmit(adapter_t *adap, struct sge_txq *q, struct mbuf *m)
1266 {
1267 	int ret;
1268 	struct work_request_hdr *wrp = (struct work_request_hdr *)m->m_data;
1269 
1270 	if (__predict_false(!immediate(m))) {
1271 		m_freem(m);
1272 		return 0;
1273 	}
1274 
1275 	wrp->wr_hi |= htonl(F_WR_SOP | F_WR_EOP);
1276 	wrp->wr_lo = htonl(V_WR_TID(q->token));
1277 
1278 	mtx_lock(&q->lock);
1279 again:	reclaim_completed_tx_imm(q);
1280 
1281 	ret = check_desc_avail(adap, q, m, 1, TXQ_CTRL);
1282 	if (__predict_false(ret)) {
1283 		if (ret == 1) {
1284 			mtx_unlock(&q->lock);
1285 			return (-1);
1286 		}
1287 		goto again;
1288 	}
1289 
1290 	write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
1291 
1292 	q->in_use++;
1293 	if (++q->pidx >= q->size) {
1294 		q->pidx = 0;
1295 		q->gen ^= 1;
1296 	}
1297 	mtx_unlock(&q->lock);
1298 	wmb();
1299 	t3_write_reg(adap, A_SG_KDOORBELL,
1300 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1301 	return (0);
1302 }
1303 
1304 #ifdef RESTART_CTRLQ
1305 /**
1306  *	restart_ctrlq - restart a suspended control queue
1307  *	@qs: the queue set containing the control queue
1308  *
1309  *	Resumes transmission on a suspended Tx control queue.
1310  */
1311 static void
1312 restart_ctrlq(unsigned long data)
1313 {
1314 	struct mbuf *m;
1315 	struct sge_qset *qs = (struct sge_qset *)data;
1316 	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1317 	adapter_t *adap = qs->port->adapter;
1318 
1319 	mtx_lock(&q->lock);
1320 again:	reclaim_completed_tx_imm(q);
1321 
1322 	while (q->in_use < q->size &&
1323 	       (skb = __skb_dequeue(&q->sendq)) != NULL) {
1324 
1325 		write_imm(&q->desc[q->pidx], skb, skb->len, q->gen);
1326 
1327 		if (++q->pidx >= q->size) {
1328 			q->pidx = 0;
1329 			q->gen ^= 1;
1330 		}
1331 		q->in_use++;
1332 	}
1333 	if (!skb_queue_empty(&q->sendq)) {
1334 		set_bit(TXQ_CTRL, &qs->txq_stopped);
1335 		smp_mb__after_clear_bit();
1336 
1337 		if (should_restart_tx(q) &&
1338 		    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped))
1339 			goto again;
1340 		q->stops++;
1341 	}
1342 
1343 	mtx_unlock(&q->lock);
1344 	t3_write_reg(adap, A_SG_KDOORBELL,
1345 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1346 }
1347 #endif
1348 
1349 /*
1350  * Send a management message through control queue 0
1351  */
1352 int
1353 t3_mgmt_tx(struct adapter *adap, struct mbuf *m)
1354 {
1355 	return ctrl_xmit(adap, &adap->sge.qs[0].txq[TXQ_CTRL], m);
1356 }
1357 
1358 /**
1359  *	t3_sge_alloc_qset - initialize an SGE queue set
1360  *	@sc: the controller softc
1361  *	@id: the queue set id
1362  *	@nports: how many Ethernet ports will be using this queue set
1363  *	@irq_vec_idx: the IRQ vector index for response queue interrupts
1364  *	@p: configuration parameters for this queue set
1365  *	@ntxq: number of Tx queues for the queue set
1366  *	@pi: port info for queue set
1367  *
1368  *	Allocate resources and initialize an SGE queue set.  A queue set
1369  *	comprises a response queue, two Rx free-buffer queues, and up to 3
1370  *	Tx queues.  The Tx queues are assigned roles in the order Ethernet
1371  *	queue, offload queue, and control queue.
1372  */
1373 int
1374 t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx,
1375 		  const struct qset_params *p, int ntxq, struct port_info *pi)
1376 {
1377 	struct sge_qset *q = &sc->sge.qs[id];
1378 	int i, ret = 0;
1379 
1380 	init_qset_cntxt(q, id);
1381 
1382 	if ((ret = alloc_ring(sc, p->fl_size, sizeof(struct rx_desc),
1383 		    sizeof(struct rx_sw_desc), &q->fl[0].phys_addr,
1384 		    &q->fl[0].desc, &q->fl[0].sdesc,
1385 		    &q->fl[0].desc_tag, &q->fl[0].desc_map,
1386 		    sc->rx_dmat, &q->fl[0].entry_tag)) != 0) {
1387 		printf("error %d from alloc ring fl0\n", ret);
1388 		goto err;
1389 	}
1390 
1391 	if ((ret = alloc_ring(sc, p->jumbo_size, sizeof(struct rx_desc),
1392 		    sizeof(struct rx_sw_desc), &q->fl[1].phys_addr,
1393 		    &q->fl[1].desc, &q->fl[1].sdesc,
1394 		    &q->fl[1].desc_tag, &q->fl[1].desc_map,
1395 		    sc->rx_jumbo_dmat, &q->fl[1].entry_tag)) != 0) {
1396 		printf("error %d from alloc ring fl1\n", ret);
1397 		goto err;
1398 	}
1399 
1400 	if ((ret = alloc_ring(sc, p->rspq_size, sizeof(struct rsp_desc), 0,
1401 		    &q->rspq.phys_addr, &q->rspq.desc, NULL,
1402 		    &q->rspq.desc_tag, &q->rspq.desc_map,
1403 		    NULL, NULL)) != 0) {
1404 		printf("error %d from alloc ring rspq\n", ret);
1405 		goto err;
1406 	}
1407 
1408 	for (i = 0; i < ntxq; ++i) {
1409 		/*
1410 		 * The control queue always uses immediate data so does not
1411 		 * need to keep track of any mbufs.
1412 		 * XXX Placeholder for future TOE support.
1413 		 */
1414 		size_t sz = i == TXQ_CTRL ? 0 : sizeof(struct tx_sw_desc);
1415 
1416 		if ((ret = alloc_ring(sc, p->txq_size[i],
1417 			    sizeof(struct tx_desc), sz,
1418 			    &q->txq[i].phys_addr, &q->txq[i].desc,
1419 			    &q->txq[i].sdesc, &q->txq[i].desc_tag,
1420 			    &q->txq[i].desc_map,
1421 			    sc->tx_dmat, &q->txq[i].entry_tag)) != 0) {
1422 			printf("error %d from alloc ring tx %i\n", ret, i);
1423 			goto err;
1424 		}
1425 		q->txq[i].gen = 1;
1426 		q->txq[i].size = p->txq_size[i];
1427 		mtx_init(&q->txq[i].lock, "t3 txq lock", NULL, MTX_DEF);
1428 	}
1429 
1430 	q->fl[0].gen = q->fl[1].gen = 1;
1431 	q->fl[0].size = p->fl_size;
1432 	q->fl[1].size = p->jumbo_size;
1433 
1434 	q->rspq.gen = 1;
1435 	q->rspq.size = p->rspq_size;
1436 	mtx_init(&q->rspq.lock, "t3 rspq lock", NULL, MTX_DEF);
1437 
1438 	q->txq[TXQ_ETH].stop_thres = nports *
1439 	    flits_to_desc(sgl_len(TX_MAX_SEGS + 1) + 3);
1440 
1441 	q->fl[0].buf_size = MCLBYTES;
1442 	q->fl[0].zone = zone_clust;
1443 	q->fl[0].type = EXT_CLUSTER;
1444 	q->fl[1].buf_size = MJUMPAGESIZE;
1445 	q->fl[1].zone = zone_jumbop;
1446 	q->fl[1].type = EXT_JUMBOP;
1447 
1448 	q->lro.enabled = lro_default;
1449 
1450 	mtx_lock(&sc->sge.reg_lock);
1451 	ret = -t3_sge_init_rspcntxt(sc, q->rspq.cntxt_id, irq_vec_idx,
1452 				   q->rspq.phys_addr, q->rspq.size,
1453 				   q->fl[0].buf_size, 1, 0);
1454 	if (ret) {
1455 		printf("error %d from t3_sge_init_rspcntxt\n", ret);
1456 		goto err_unlock;
1457 	}
1458 
1459 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
1460 		ret = -t3_sge_init_flcntxt(sc, q->fl[i].cntxt_id, 0,
1461 					  q->fl[i].phys_addr, q->fl[i].size,
1462 					  q->fl[i].buf_size, p->cong_thres, 1,
1463 					  0);
1464 		if (ret) {
1465 			printf("error %d from t3_sge_init_flcntxt for index i=%d\n", ret, i);
1466 			goto err_unlock;
1467 		}
1468 	}
1469 
1470 	ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_ETH].cntxt_id, USE_GTS,
1471 				 SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr,
1472 				 q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token,
1473 				 1, 0);
1474 	if (ret) {
1475 		printf("error %d from t3_sge_init_ecntxt\n", ret);
1476 		goto err_unlock;
1477 	}
1478 
1479 	if (ntxq > 1) {
1480 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_OFLD].cntxt_id,
1481 					 USE_GTS, SGE_CNTXT_OFLD, id,
1482 					 q->txq[TXQ_OFLD].phys_addr,
1483 					 q->txq[TXQ_OFLD].size, 0, 1, 0);
1484 		if (ret) {
1485 			printf("error %d from t3_sge_init_ecntxt\n", ret);
1486 			goto err_unlock;
1487 		}
1488 	}
1489 
1490 	if (ntxq > 2) {
1491 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_CTRL].cntxt_id, 0,
1492 					 SGE_CNTXT_CTRL, id,
1493 					 q->txq[TXQ_CTRL].phys_addr,
1494 					 q->txq[TXQ_CTRL].size,
1495 					 q->txq[TXQ_CTRL].token, 1, 0);
1496 		if (ret) {
1497 			printf("error %d from t3_sge_init_ecntxt\n", ret);
1498 			goto err_unlock;
1499 		}
1500 	}
1501 
1502 	mtx_unlock(&sc->sge.reg_lock);
1503 	t3_update_qset_coalesce(q, p);
1504 	q->port = pi;
1505 
1506 	refill_fl(sc, &q->fl[0], q->fl[0].size);
1507 	refill_fl(sc, &q->fl[1], q->fl[1].size);
1508 	refill_rspq(sc, &q->rspq, q->rspq.size - 1);
1509 
1510 	t3_write_reg(sc, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) |
1511 		     V_NEWTIMER(q->rspq.holdoff_tmr));
1512 
1513 	return (0);
1514 
1515 err_unlock:
1516 	mtx_unlock(&sc->sge.reg_lock);
1517 err:
1518 	t3_free_qset(sc, q);
1519 
1520 	return (ret);
1521 }
1522 
1523 
1524 /**
1525  *	t3_free_qset - free the resources of an SGE queue set
1526  *	@sc: the controller owning the queue set
1527  *	@q: the queue set
1528  *
1529  *	Release the HW and SW resources associated with an SGE queue set, such
1530  *	as HW contexts, packet buffers, and descriptor rings.  Traffic to the
1531  *	queue set must be quiesced prior to calling this.
1532  */
1533 static void
1534 t3_free_qset(adapter_t *sc, struct sge_qset *q)
1535 {
1536 	int i;
1537 
1538 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
1539 		if (q->fl[i].desc) {
1540 			mtx_lock(&sc->sge.reg_lock);
1541 			t3_sge_disable_fl(sc, q->fl[i].cntxt_id);
1542 			mtx_unlock(&sc->sge.reg_lock);
1543 			bus_dmamap_unload(q->fl[i].desc_tag, q->fl[i].desc_map);
1544 			bus_dmamem_free(q->fl[i].desc_tag, q->fl[i].desc,
1545 					q->fl[i].desc_map);
1546 			bus_dma_tag_destroy(q->fl[i].desc_tag);
1547 			bus_dma_tag_destroy(q->fl[i].entry_tag);
1548 		}
1549 		if (q->fl[i].sdesc) {
1550 			free_rx_bufs(sc, &q->fl[i]);
1551 			free(q->fl[i].sdesc, M_DEVBUF);
1552 		}
1553 	}
1554 
1555 	for (i = 0; i < SGE_TXQ_PER_SET; ++i) {
1556 		if (q->txq[i].desc) {
1557 			mtx_lock(&sc->sge.reg_lock);
1558 			t3_sge_enable_ecntxt(sc, q->txq[i].cntxt_id, 0);
1559 			mtx_unlock(&sc->sge.reg_lock);
1560 			bus_dmamap_unload(q->txq[i].desc_tag,
1561 					q->txq[i].desc_map);
1562 			bus_dmamem_free(q->txq[i].desc_tag, q->txq[i].desc,
1563 					q->txq[i].desc_map);
1564 			bus_dma_tag_destroy(q->txq[i].desc_tag);
1565 			bus_dma_tag_destroy(q->txq[i].entry_tag);
1566 		}
1567 		if (q->txq[i].sdesc) {
1568 			free(q->txq[i].sdesc, M_DEVBUF);
1569 		}
1570 		if (mtx_initialized(&q->txq[i].lock)) {
1571 			mtx_destroy(&q->txq[i].lock);
1572 		}
1573 	}
1574 
1575 	if (q->rspq.desc) {
1576 		mtx_lock(&sc->sge.reg_lock);
1577 		t3_sge_disable_rspcntxt(sc, q->rspq.cntxt_id);
1578 		mtx_unlock(&sc->sge.reg_lock);
1579 
1580 		bus_dmamap_unload(q->rspq.desc_tag, q->rspq.desc_map);
1581 		bus_dmamem_free(q->rspq.desc_tag, q->rspq.desc,
1582 			        q->rspq.desc_map);
1583 		bus_dma_tag_destroy(q->rspq.desc_tag);
1584 	}
1585 
1586 	if (mtx_initialized(&q->rspq.lock))
1587 		mtx_destroy(&q->rspq.lock);
1588 
1589 	bzero(q, sizeof(*q));
1590 }
1591 
1592 /**
1593  *	t3_free_sge_resources - free SGE resources
1594  *	@sc: the adapter softc
1595  *
1596  *	Frees resources used by the SGE queue sets.
1597  */
1598 void
1599 t3_free_sge_resources(adapter_t *sc)
1600 {
1601 	int i;
1602 
1603 	for (i = 0; i < SGE_QSETS; ++i)
1604 		t3_free_qset(sc, &sc->sge.qs[i]);
1605 }
1606 
1607 /**
1608  *	t3_sge_start - enable SGE
1609  *	@sc: the controller softc
1610  *
1611  *	Enables the SGE for DMAs.  This is the last step in starting packet
1612  *	transfers.
1613  */
1614 void
1615 t3_sge_start(adapter_t *sc)
1616 {
1617 	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE);
1618 }
1619 
1620 
1621 /**
1622  *	free_tx_desc - reclaims Tx descriptors and their buffers
1623  *	@sc: the adapter softc
1624  *	@q: the Tx queue to reclaim descriptors from
1625  *	@n: the number of descriptors to reclaim
1626  *
1627  *	Reclaims Tx descriptors from an SGE Tx queue and collects the associated
1628  *	mbufs in @m_vec for the caller to free.  Called with the Tx queue lock held.
1629  */
1630 int
1631 free_tx_desc(adapter_t *sc, struct sge_txq *q, int n, struct mbuf **m_vec)
1632 {
1633 	struct tx_sw_desc *d;
1634 	unsigned int cidx = q->cidx;
1635 	int nbufs = 0;
1636 
1637 #ifdef T3_TRACE
1638 	T3_TRACE2(sc->tb[q->cntxt_id & 7],
1639 		  "reclaiming %u Tx descriptors at cidx %u", n, cidx);
1640 #endif
1641 	d = &q->sdesc[cidx];
1642 
1643 	while (n-- > 0) {
1644 		DPRINTF("cidx=%d d=%p\n", cidx, d);
1645 		if (d->m) {
1646 			if (d->flags & TX_SW_DESC_MAPPED) {
1647 				bus_dmamap_unload(q->entry_tag, d->map);
1648 				bus_dmamap_destroy(q->entry_tag, d->map);
1649 				d->flags &= ~TX_SW_DESC_MAPPED;
1650 			}
1651 			m_vec[nbufs] = d->m;
1652 			d->m = NULL;
1653 			nbufs++;
1654 		}
1655 		++d;
1656 		if (++cidx == q->size) {
1657 			cidx = 0;
1658 			d = q->sdesc;
1659 		}
1660 	}
1661 	q->cidx = cidx;
1662 
1663 	return (nbufs);
1664 }
1665 
1666 /**
1667  *	is_new_response - check if a response is newly written
1668  *	@r: the response descriptor
1669  *	@q: the response queue
1670  *
1671  *	Returns true if a response descriptor contains a yet unprocessed
1672  *	response.
1673  */
1674 static __inline int
1675 is_new_response(const struct rsp_desc *r,
1676     const struct sge_rspq *q)
1677 {
1678 	return (r->intr_gen & F_RSPD_GEN2) == q->gen;
1679 }
1680 
1681 #define RSPD_GTS_MASK  (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS)
1682 #define RSPD_CTRL_MASK (RSPD_GTS_MASK | \
1683 			V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \
1684 			V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \
1685 			V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR))
1686 
1687 /* How long to delay the next interrupt in case of memory shortage, in 0.1us. */
1688 #define NOMEM_INTR_DELAY 2500
1689 
1690 static __inline void
1691 deliver_partial_bundle(struct t3cdev *tdev, struct sge_rspq *q)
1692 {
1693 	;
1694 }
1695 
1696 static __inline void
1697 rx_offload(struct t3cdev *tdev, struct sge_rspq *rq,
1698     struct mbuf *m)
1699 {
1700 #ifdef notyet
1701 	if (rq->polling) {
1702 		rq->offload_skbs[rq->offload_skbs_idx++] = skb;
1703 		if (rq->offload_skbs_idx == RX_BUNDLE_SIZE) {
1704 			cxgb_ofld_recv(tdev, rq->offload_skbs, RX_BUNDLE_SIZE);
1705 			rq->offload_skbs_idx = 0;
1706 			rq->offload_bundles++;
1707 		}
1708 	} else
1709 #endif
1710 	{
1711 		/* XXX */
1712 		panic("implement offload enqueue\n");
1713 	}
1714 
1715 }
1716 
1717 static void
1718 restart_tx(struct sge_qset *qs)
1719 {
1720 	;
1721 }
1722 
1723 void
1724 t3_rx_eth(struct port_info *pi, struct sge_rspq *rq, struct mbuf *m, int ethpad)
1725 {
1726 	struct cpl_rx_pkt *cpl = (struct cpl_rx_pkt *)(m->m_data + ethpad);
1727 	struct ifnet *ifp = pi->ifp;
1728 
1729 	DPRINTF("rx_eth m=%p m->m_data=%p p->iff=%d\n", m, m->m_data, cpl->iff);
1730 	if (&pi->adapter->port[cpl->iff] != pi)
1731 		panic("bad port index %d m->m_data=%p\n", cpl->iff, m->m_data);
1732 
1733 	if ((ifp->if_capenable & IFCAP_RXCSUM) && !cpl->fragment &&
1734 	    cpl->csum_valid && cpl->csum == 0xffff) {
1735 		m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED | CSUM_IP_VALID |
1736 		    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
1737 		m->m_pkthdr.csum_data = 0xffff;
1738 		rspq_to_qset(rq)->port_stats[SGE_PSTAT_RX_CSUM_GOOD]++;
1739 	}
1740 	/*
1741 	 * XXX need to add VLAN support for 6.x
1742 	 */
1743 #ifdef VLAN_SUPPORTED
1744 	if (__predict_false(cpl->vlan_valid)) {
1745 		m->m_pkthdr.ether_vtag = ntohs(cpl->vlan);
1746 		m->m_flags |= M_VLANTAG;
1747 	}
1748 #endif
1749 
1750 	m->m_pkthdr.rcvif = ifp;
1751 	m->m_pkthdr.header = m->m_data + sizeof(*cpl) + ethpad;
1752 	m_explode(m);
1753 	/*
1754 	 * adjust after conversion to mbuf chain
1755 	 */
1756 	m_adj(m, sizeof(*cpl) + ethpad);
1757 
1758 	(*ifp->if_input)(ifp, m);
1759 }
1760 
1761 /**
1762  *	get_packet - return the next ingress packet buffer from a free list
1763  *	@adap: the adapter that received the packet
1764  *	@drop_thres: # of remaining buffers before we start dropping packets
1765  *	@qs: the qset that the SGE free list holding the packet belongs to
1766  *      @m: the mbuf that will hold the packet data
1767  *      @r: response descriptor
1768  *
1769  *	Get the next packet from a free list and complete setup of the
1770  *	sk_buff.  If the packet is small we make a copy and recycle the
1771  *	mbuf.  If the packet is small we make a copy and recycle the
1772  *	positive drop threshold is supplied packets are dropped and their
1773  *	buffers recycled if (a) the number of remaining buffers is under the
1774  *	threshold and the packet is too big to copy, or (b) the packet should
1775  *	be copied but there is no memory for the copy.
1776  */
1777 
1778 static int
1779 get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
1780     struct mbuf *m, struct rsp_desc *r)
1781 {
1782 
1783 	unsigned int len_cq =  ntohl(r->len_cq);
1784 	struct sge_fl *fl = (len_cq & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
1785 	struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];
1786 	uint32_t len = G_RSPD_LEN(len_cq);
1787 	uint32_t flags = ntohl(r->flags);
1788 	uint8_t sopeop = G_RSPD_SOP_EOP(flags);
1789 	int ret = 0;
1790 
1791 	prefetch(sd->cl);
1792 
1793 	fl->credits--;
1794 	bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD);
1795 	bus_dmamap_unload(fl->entry_tag, sd->map);
1796 
1797 
1798 	switch(sopeop) {
1799 	case RSPQ_SOP_EOP:
1800 		DBG(DBG_RX, ("get_packet: SOP-EOP m %p\n", m));
1801 		m_cljset(m, sd->cl, fl->type);
1802 		m->m_len = m->m_pkthdr.len = len;
1803 		ret = 1;
1804 		goto done;
1805 		break;
1806 	case RSPQ_NSOP_NEOP:
1807 		DBG(DBG_RX, ("get_packet: NO_SOP-NO_EOP m %p\n", m));
1808 		ret = 0;
1809 		break;
1810 	case RSPQ_SOP:
1811 		DBG(DBG_RX, ("get_packet: SOP m %p\n", m));
1812 		m_iovinit(m);
1813 		ret = 0;
1814 		break;
1815 	case RSPQ_EOP:
1816 		DBG(DBG_RX, ("get_packet: EOP m %p\n", m));
1817 		ret = 1;
1818 		break;
1819 	}
1820 	m_iovappend(m, sd->cl, fl->buf_size, len, 0);
1821 
1822 done:
1823 	if (++fl->cidx == fl->size)
1824 		fl->cidx = 0;
1825 
1826 	return (ret);
1827 }
1828 
1829 
1830 /**
1831  *	handle_rsp_cntrl_info - handles control information in a response
1832  *	@qs: the queue set corresponding to the response
1833  *	@flags: the response control flags
1834  *
1835  *	Handles the control information of an SGE response, such as GTS
1836  *	indications and completion credits for the queue set's Tx queues.
1837  *	HW coalesces credits, we don't do any extra SW coalescing.
1838  */
1839 static __inline void
1840 handle_rsp_cntrl_info(struct sge_qset *qs, uint32_t flags)
1841 {
1842 	unsigned int credits;
1843 
1844 #if USE_GTS
1845 	if (flags & F_RSPD_TXQ0_GTS)
1846 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags);
1847 #endif
1848 	credits = G_RSPD_TXQ0_CR(flags);
1849 	if (credits) {
1850 		qs->txq[TXQ_ETH].processed += credits;
1851 		if (desc_reclaimable(&qs->txq[TXQ_ETH]) > TX_START_MAX_DESC)
1852 			taskqueue_enqueue(qs->port->adapter->tq,
1853 			    &qs->port->adapter->timer_reclaim_task);
1854 	}
1855 
1856 	credits = G_RSPD_TXQ2_CR(flags);
1857 	if (credits)
1858 		qs->txq[TXQ_CTRL].processed += credits;
1859 
1860 # if USE_GTS
1861 	if (flags & F_RSPD_TXQ1_GTS)
1862 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags);
1863 # endif
1864 	credits = G_RSPD_TXQ1_CR(flags);
1865 	if (credits)
1866 		qs->txq[TXQ_OFLD].processed += credits;
1867 }
1868 
1869 static void
1870 check_ring_db(adapter_t *adap, struct sge_qset *qs,
1871     unsigned int sleeping)
1872 {
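	/*
	 * Currently a no-op: ringing the doorbell to restart Tx queues that
	 * were put to sleep via GTS is not implemented in this version.
	 */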
1873 	;
1874 }
1875 
1876 /*
1877  * This is an awful hack to bind the ithread to the given CPU to work
1878  * around the lack of ithread affinity; the body is currently disabled.
1879  */
1880 static void
1881 bind_ithread(int cpu)
1882 {
1883 #if 0
1884 	KASSERT(cpu < mp_ncpus, ("invalid cpu identifier"));
1885 	if (mp_ncpus > 1) {
1886 		mtx_lock_spin(&sched_lock);
1887 		sched_bind(curthread, cpu);
1888 		mtx_unlock_spin(&sched_lock);
1889 	}
1890 #endif
1891 }
1892 
1893 /**
1894  *	process_responses - process responses from an SGE response queue
1895  *	@adap: the adapter
1896  *	@qs: the queue set to which the response queue belongs
1897  *	@budget: how many responses can be processed in this round
1898  *
1899  *	Process responses from an SGE response queue up to the supplied budget.
1900  *	Responses include received packets as well as credits and other events
1901  *	for the queues that belong to the response queue's queue set.
1902  *	A negative budget is effectively unlimited.
1903  *
1904  *	Additionally choose the interrupt holdoff time for the next interrupt
1905  *	on this queue.  If the system is under memory shortage use a fairly
1906  *	long delay to help recovery.
1907  */
1908 static int
1909 process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
1910 {
1911 	struct sge_rspq *rspq = &qs->rspq;
1912 	struct rsp_desc *r = &rspq->desc[rspq->cidx];
1913 	int budget_left = budget;
1914 	unsigned int sleeping = 0;
1915 	int lro = qs->lro.enabled;
1916 
1917 	static uint8_t pinned[MAXCPU];
1918 
1919 #ifdef DEBUG
1920 	static int last_holdoff = 0;
1921 	if (rspq->holdoff_tmr != last_holdoff) {
1922 		printf("next_holdoff=%d\n", rspq->holdoff_tmr);
1923 		last_holdoff = rspq->holdoff_tmr;
1924 	}
1925 #endif
1926 	if (pinned[qs->rspq.cntxt_id * adap->params.nports] == 0) {
1927 		/*
1928 		 * Bind this rspq's ithread once; assumes cntxt_id < mp_ncpus.
1929 		 */
1930 		bind_ithread(qs->rspq.cntxt_id);
1931 		pinned[qs->rspq.cntxt_id * adap->params.nports] = 1;
1932 	}
1933 	rspq->next_holdoff = rspq->holdoff_tmr;
1934 
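	/*
	 * Main response processing loop: consume new response descriptors
	 * until the budget is exhausted or no new responses remain, handling
	 * immediate data, free-list packets, and pure control responses.
	 */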
1935 	while (__predict_true(budget_left && is_new_response(r, rspq))) {
1936 		int eth, eop = 0, ethpad = 0;
1937 		uint32_t flags = ntohl(r->flags);
1938 		uint32_t rss_csum = *(const uint32_t *)r;
1939 		uint32_t rss_hash = r->rss_hdr.rss_hash_val;
1940 
1941 		eth = (r->rss_hdr.opcode == CPL_RX_PKT);
1942 
1943 		if (__predict_false(flags & F_RSPD_ASYNC_NOTIF)) {
1944 			/* XXX */
1945 			printf("async notification\n");
1946 
1947 		} else if (flags & F_RSPD_IMM_DATA_VALID) {
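			/*
			 * Immediate data: the packet contents are carried in
			 * the response descriptor itself and are copied into
			 * an mbuf by get_imm_packet().
			 */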
1948 			struct mbuf *m = NULL;
1949 			if (cxgb_debug)
1950 				printf("IMM DATA VALID\n");
1951 			if (rspq->m == NULL)
1952 				rspq->m = m_gethdr(M_NOWAIT, MT_DATA);
1953 			else
1954 				m = m_gethdr(M_NOWAIT, MT_DATA);
1955 
1956 			if (rspq->m == NULL || m == NULL) {
1957 				rspq->next_holdoff = NOMEM_INTR_DELAY;
1958 				budget_left--;
1959 				break;
1960 			}
1961 			get_imm_packet(adap, r, rspq->m, m);
1962 			eop = 1;
1963 			rspq->imm_data++;
1964 		} else if (r->len_cq) {
1965 			int drop_thresh = eth ? SGE_RX_DROP_THRES : 0;
1966 
1967 			if (rspq->m == NULL)
1968 				rspq->m = m_gethdr(M_NOWAIT, MT_DATA);
1969 			if (rspq->m == NULL) {
1970 				log(LOG_WARNING, "failed to get mbuf for packet\n");
1971 				break;
1972 			}
1973 
1974 			ethpad = 2;
1975 			eop = get_packet(adap, drop_thresh, qs, rspq->m, r);
1976 		} else {
1977 			DPRINTF("pure response\n");
1978 			rspq->pure_rsps++;
1979 		}
1980 
1981 		if (flags & RSPD_CTRL_MASK) {
1982 			sleeping |= flags & RSPD_GTS_MASK;
1983 			handle_rsp_cntrl_info(qs, flags);
1984 		}
1985 
1986 		r++;
1987 		if (__predict_false(++rspq->cidx == rspq->size)) {
1988 			rspq->cidx = 0;
1989 			rspq->gen ^= 1;
1990 			r = rspq->desc;
1991 		}
1992 
1993 		prefetch(r);
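		/*
		 * Return response-queue credits to the hardware a quarter of
		 * the ring at a time instead of one descriptor at a time.
		 */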
1994 		if (++rspq->credits >= (rspq->size / 4)) {
1995 			refill_rspq(adap, rspq, rspq->credits);
1996 			rspq->credits = 0;
1997 		}
1998 
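		/*
		 * A frame is complete: hand Ethernet packets to the
		 * LRO/receive path and top up both free lists.
		 */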
1999 		if (eop) {
2000 			prefetch(rspq->m->m_data);
2001 			prefetch(rspq->m->m_data + L1_CACHE_BYTES);
2002 
2003 			if (eth) {
2004 				t3_rx_eth_lro(adap, rspq, rspq->m, ethpad,
2005 				    rss_hash, rss_csum, lro);
2006 
2007 				rspq->m = NULL;
2008 			} else {
2009 #ifdef notyet
2010 				if (__predict_false(r->rss_hdr.opcode == CPL_TRACE_PKT))
2011 					m_adj(m, 2);
2012 
2013 				rx_offload(&adap->tdev, rspq, m);
2014 #endif
2015 			}
2016 #ifdef notyet
2017 			taskqueue_enqueue(adap->tq, &adap->timer_reclaim_task);
2018 #else
2019 			__refill_fl(adap, &qs->fl[0]);
2020 			__refill_fl(adap, &qs->fl[1]);
2021 #endif
2022 		}
2023 		--budget_left;
2024 	}
2025 	t3_sge_lro_flush_all(adap, qs);
2026 	deliver_partial_bundle(&adap->tdev, rspq);
2027 
2028 	if (sleeping)
2029 		check_ring_db(adap, qs, sleeping);
2030 
2031 	smp_mb();  /* commit Tx queue processed updates */
2032 	if (__predict_false(qs->txq_stopped != 0))
2033 		restart_tx(qs);
2034 
2035 	budget -= budget_left;
2036 	return (budget);
2037 }
2038 
2039 /*
2040  * A helper function that processes responses and issues GTS.
2041  */
2042 static __inline int
2043 process_responses_gts(adapter_t *adap, struct sge_rspq *rq)
2044 {
2045 	int work;
2046 	static int last_holdoff = 0;
2047 
2048 	work = process_responses(adap, rspq_to_qset(rq), -1);
2049 
2050 	if (cxgb_debug && (rq->next_holdoff != last_holdoff)) {
2051 		printf("next_holdoff=%d\n", rq->next_holdoff);
2052 		last_holdoff = rq->next_holdoff;
2053 	}
2054 
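	/*
	 * The GTS write acknowledges the entries consumed so far (new cidx)
	 * and programs the holdoff timer for the next interrupt.
	 */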
2055 	t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) |
2056 		     V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx));
2057 	return (work);
2058 }
2059 
2060 
2061 /*
2062  * Interrupt handler for legacy INTx interrupts for T3B-based cards.
2063  * Handles data events from SGE response queues as well as error and other
2064  * async events as they all use the same interrupt pin.  We use one SGE
2065  * response queue per port in this mode and protect all response queues with
2066  * queue 0's lock.
2067  */
2068 void
2069 t3b_intr(void *data)
2070 {
2071 	uint32_t map;
2072 	adapter_t *adap = data;
2073 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
2074 	struct sge_rspq *q1 = &adap->sge.qs[1].rspq;
2075 
2076 	t3_write_reg(adap, A_PL_CLI, 0);
2077 	map = t3_read_reg(adap, A_SG_DATA_INTR);
2078 
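	/*
	 * Each data-interrupt bit identifies a response queue with new
	 * entries: bit 0 for queue set 0, bit 1 for queue set 1.  F_ERRINTR
	 * flags error/async events, which are deferred to the slow task.
	 */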
2079 	if (!map)
2080 		return;
2081 
2082 	if (__predict_false(map & F_ERRINTR))
2083 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
2084 
2085 	mtx_lock(&q0->lock);
2086 
2087 	if (__predict_true(map & 1))
2088 		process_responses_gts(adap, q0);
2089 
2090 	if (map & 2)
2091 		process_responses_gts(adap, q1);
2092 
2093 	mtx_unlock(&q0->lock);
2094 }
2095 
2096 /*
2097  * The MSI interrupt handler.  This needs to handle data events from SGE
2098  * response queues as well as error and other async events as they all use
2099  * the same MSI vector.  We use one SGE response queue per port in this mode
2100  * and protect all response queues with queue 0's lock.
2101  */
2102 void
2103 t3_intr_msi(void *data)
2104 {
2105 	adapter_t *adap = data;
2106 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
2107 	struct sge_rspq *q1 = &adap->sge.qs[1].rspq;
2108 	int new_packets = 0;
2109 
2110 	mtx_lock(&q0->lock);
2111 	if (process_responses_gts(adap, q0)) {
2112 		new_packets = 1;
2113 	}
2114 
2115 	if (adap->params.nports == 2 &&
2116 	    process_responses_gts(adap, q1)) {
2117 		new_packets = 1;
2118 	}
2119 
2120 	mtx_unlock(&q0->lock);
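	/*
	 * No new responses on either queue: the MSI most likely signalled an
	 * error or other async event, so defer to the slow interrupt task.
	 */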
2121 	if (new_packets == 0)
2122 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
2123 }
2124 
2125 void
2126 t3_intr_msix(void *data)
2127 {
2128 	struct sge_qset *qs = data;
2129 	adapter_t *adap = qs->port->adapter;
2130 	struct sge_rspq *rspq = &qs->rspq;
2131 
2132 	mtx_lock(&rspq->lock);
2133 	if (process_responses_gts(adap, rspq) == 0) {
2134 #ifdef notyet
2135 		rspq->unhandled_irqs++;
2136 #endif
2137 	}
2138 	mtx_unlock(&rspq->lock);
2139 }
2140 
2141 /*
2142  * Sysctl handler for enabling LRO; currently broken by recent mbuf changes.
2143  */
2144 static int
2145 t3_lro_enable(SYSCTL_HANDLER_ARGS)
2146 {
2147 	adapter_t *sc;
2148 	int i, j, enabled, err, nqsets = 0;
2149 
2150 #ifndef LRO_WORKING
2151 	return (0);
2152 #endif
2153 
2154 	sc = arg1;
2155 	enabled = sc->sge.qs[0].lro.enabled;
2156 	err = sysctl_handle_int(oidp, &enabled, arg2, req);
2157 
2158 	if (err != 0) {
2159 		return (err);
2160 	}
2161 	if (enabled == sc->sge.qs[0].lro.enabled)
2162 		return (0);
2163 
2164 	for (i = 0; i < sc->params.nports; i++)
2165 		for (j = 0; j < sc->port[i].nqsets; j++)
2166 			nqsets++;
2167 
2168 	for (i = 0; i < nqsets; i++) {
2169 		sc->sge.qs[i].lro.enabled = enabled;
2170 	}
2171 
2172 	return (0);
2173 }
2174 
2175 static int
2176 t3_set_coalesce_nsecs(SYSCTL_HANDLER_ARGS)
2177 {
2178 	adapter_t *sc = arg1;
2179 	struct qset_params *qsp = &sc->params.sge.qset[0];
2180 	int coalesce_nsecs;
2181 	struct sge_qset *qs;
2182 	int i, j, err, nqsets = 0;
2183 	struct mtx *lock;
2184 
2185 	coalesce_nsecs = qsp->coalesce_nsecs;
2186 	err = sysctl_handle_int(oidp, &coalesce_nsecs, arg2, req);
2187 
2188 	if (err != 0) {
2189 		return (err);
2190 	}
2191 	if (coalesce_nsecs == qsp->coalesce_nsecs)
2192 		return (0);
2193 
2194 	for (i = 0; i < sc->params.nports; i++)
2195 		for (j = 0; j < sc->port[i].nqsets; j++)
2196 			nqsets++;
2197 
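	/* Enforce a 100ns floor on the coalescing timer before applying it. */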
2198 	coalesce_nsecs = max(100, coalesce_nsecs);
2199 
2200 	for (i = 0; i < nqsets; i++) {
2201 		qs = &sc->sge.qs[i];
2202 		qsp = &sc->params.sge.qset[i];
2203 		qsp->coalesce_nsecs = coalesce_nsecs;
2204 
2205 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
2206 			    &sc->sge.qs[0].rspq.lock;
2207 
2208 		mtx_lock(lock);
2209 		t3_update_qset_coalesce(qs, qsp);
2210 		t3_write_reg(sc, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) |
2211 		    V_NEWTIMER(qs->rspq.holdoff_tmr));
2212 		mtx_unlock(lock);
2213 	}
2214 
2215 	return (0);
2216 }
2217 
2218 
2219 void
2220 t3_add_sysctls(adapter_t *sc)
2221 {
2222 	struct sysctl_ctx_list *ctx;
2223 	struct sysctl_oid_list *children;
2224 
2225 	ctx = device_get_sysctl_ctx(sc->dev);
2226 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
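	/*
	 * All nodes below hang off the adapter device's sysctl tree, so they
	 * appear under dev.<driver>.<unit> in the sysctl namespace.
	 */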
2227 
2228 	/* random information */
2229 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
2230 	    "firmware_version",
2231 	    CTLFLAG_RD, &sc->fw_version,
2232 	    0, "firmware version");
2233 
2234 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
2235 	    "enable_lro",
2236 	    CTLTYPE_INT|CTLFLAG_RW, sc,
2237 	    0, t3_lro_enable,
2238 	    "I", "enable large receive offload");
2239 
2240 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
2241 	    "intr_coal",
2242 	    CTLTYPE_INT|CTLFLAG_RW, sc,
2243 	    0, t3_set_coalesce_nsecs,
2244 	    "I", "interrupt coalescing timer (ns)");
2245 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
2246 	    "enable_debug",
2247 	    CTLFLAG_RW, &cxgb_debug,
2248 	    0, "enable verbose debugging output");
2249 
2250 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO,
2251 	    "collapse_free",
2252 	    CTLFLAG_RD, &collapse_free,
2253 	    0, "frees during collapse");
2254 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO,
2255 	    "mb_free_vec_free",
2256 	    CTLFLAG_RD, &mb_free_vec_free,
2257 	    0, "frees during mb_free_vec");
2258 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
2259 	    "collapse_mbufs",
2260 	    CTLFLAG_RW, &collapse_mbufs,
2261 	    0, "collapse mbuf chains into iovecs");
2262 }
2263 
2264 /**
2265  *	t3_get_desc - dump an SGE descriptor for debugging purposes
2266  *	@qs: the queue set
2267  *	@qnum: identifies the specific queue (0..2: Tx, 3:response, 4..5: Rx)
2268  *	@idx: the descriptor index in the queue
2269  *	@data: where to dump the descriptor contents
2270  *
2271  *	Dumps the contents of a HW descriptor of an SGE queue.  Returns the
2272  *	size of the descriptor.
2273  */
2274 int
2275 t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx,
2276 		unsigned char *data)
2277 {
2278 	if (qnum >= 6)
2279 		return (EINVAL);
2280 
2281 	if (qnum < 3) {
2282 		if (!qs->txq[qnum].desc || idx >= qs->txq[qnum].size)
2283 			return (EINVAL);
2284 		memcpy(data, &qs->txq[qnum].desc[idx], sizeof(struct tx_desc));
2285 		return (sizeof(struct tx_desc));
2286 	}
2287 
2288 	if (qnum == 3) {
2289 		if (!qs->rspq.desc || idx >= qs->rspq.size)
2290 			return (EINVAL);
2291 		memcpy(data, &qs->rspq.desc[idx], sizeof(struct rsp_desc));
2292 		return (sizeof(struct rsp_desc));
2293 	}
2294 
2295 	qnum -= 4;
2296 	if (!qs->fl[qnum].desc || idx >= qs->fl[qnum].size)
2297 		return (EINVAL);
2298 	memcpy(data, &qs->fl[qnum].desc[idx], sizeof(struct rx_desc));
2299 	return (sizeof(struct rx_desc));
2300 }
2301