xref: /freebsd/sys/dev/cxgb/cxgb_sge.c (revision bfe691b2f75de2224c7ceb304ebcdef2b42d4179)
1 /**************************************************************************
2 
3 Copyright (c) 2007, Chelsio Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Redistributions in binary form must reproduce the above copyright
13     notice, this list of conditions and the following disclaimer in the
14     documentation and/or other materials provided with the distribution.
15 
16  3. Neither the name of the Chelsio Corporation nor the names of its
17     contributors may be used to endorse or promote products derived from
18     this software without specific prior written permission.
19 
20 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
24 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 POSSIBILITY OF SUCH DAMAGE.
31 
32 ***************************************************************************/
33 
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36 
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/module.h>
41 #include <sys/bus.h>
42 #include <sys/conf.h>
43 #include <machine/bus.h>
44 #include <machine/resource.h>
45 #include <sys/bus_dma.h>
46 #include <sys/rman.h>
47 #include <sys/queue.h>
48 #include <sys/sysctl.h>
49 #include <sys/taskqueue.h>
50 
51 
52 #include <sys/proc.h>
53 #include <sys/sched.h>
54 #include <sys/smp.h>
55 #include <sys/systm.h>
56 
57 #include <netinet/in_systm.h>
58 #include <netinet/in.h>
59 #include <netinet/ip.h>
60 #include <netinet/tcp.h>
61 
62 #include <dev/pci/pcireg.h>
63 #include <dev/pci/pcivar.h>
64 #include <dev/cxgb/common/cxgb_common.h>
65 #include <dev/cxgb/common/cxgb_regs.h>
66 #include <dev/cxgb/common/cxgb_sge_defs.h>
67 #include <dev/cxgb/common/cxgb_t3_cpl.h>
68 #include <dev/cxgb/common/cxgb_firmware_exports.h>
69 
70 #include <dev/cxgb/sys/mvec.h>
71 
72 #define USE_GTS 0
73 
74 #define SGE_RX_SM_BUF_SIZE	1536
75 #define SGE_RX_DROP_THRES	16
76 
77 /*
78  * Period of the Tx buffer reclaim timer.  This timer does not need to run
79  * frequently as Tx buffers are usually reclaimed by new Tx packets.
80  */
81 #define TX_RECLAIM_PERIOD       (hz >> 2)
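/*
 * Illustrative arithmetic (not used by the driver): with the common
 * setting of hz = 1000 this works out to 250 ticks, i.e. the reclaim
 * callout fires roughly every 250ms; the period scales with hz, so at
 * hz = 100 it is 25 ticks and still about a quarter of a second.
 */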
82 
83 /*
84  * work request size in bytes
85  */
86 #define WR_LEN (WR_FLITS * 8)
87 
88 /*
89  * Values for sge_txq.flags
90  */
91 enum {
92 	TXQ_RUNNING	= 1 << 0,  /* fetch engine is running */
93 	TXQ_LAST_PKT_DB = 1 << 1,  /* last packet rang the doorbell */
94 };
95 
96 struct tx_desc {
97 	uint64_t	flit[TX_DESC_FLITS];
98 } __packed;
99 
100 struct rx_desc {
101 	uint32_t	addr_lo;
102 	uint32_t	len_gen;
103 	uint32_t	gen2;
104 	uint32_t	addr_hi;
105 } __packed;
106 
107 struct rsp_desc {               /* response queue descriptor */
108 	struct rss_header	rss_hdr;
109 	uint32_t		flags;
110 	uint32_t		len_cq;
111 	uint8_t			imm_data[47];
112 	uint8_t			intr_gen;
113 } __packed;
114 
115 #define RX_SW_DESC_MAP_CREATED	(1 << 0)
116 #define RX_SW_DESC_INUSE        (1 << 3)
117 #define TX_SW_DESC_MAPPED       (1 << 4)
118 
119 #define RSPQ_NSOP_NEOP           G_RSPD_SOP_EOP(0)
120 #define RSPQ_EOP                 G_RSPD_SOP_EOP(F_RSPD_EOP)
121 #define RSPQ_SOP                 G_RSPD_SOP_EOP(F_RSPD_SOP)
122 #define RSPQ_SOP_EOP             G_RSPD_SOP_EOP(F_RSPD_SOP|F_RSPD_EOP)
123 
124 struct tx_sw_desc {                /* SW state per Tx descriptor */
125 	struct mbuf	*m;
126 	bus_dmamap_t	map;
127 	int		flags;
128 };
129 
130 struct rx_sw_desc {                /* SW state per Rx descriptor */
131 	void	        *cl;
132 	bus_dmamap_t	map;
133 	int		flags;
134 };
135 
136 struct txq_state {
137 	unsigned int compl;
138 	unsigned int gen;
139 	unsigned int pidx;
140 };
141 
142 struct refill_fl_cb_arg {
143 	int               error;
144 	bus_dma_segment_t seg;
145 	int               nseg;
146 };
147 
148 /*
149  * Maps a number of flits to the number of Tx descriptors that can hold them.
150  * The formula is
151  *
152  * desc = 1 + (flits - 2) / (WR_FLITS - 1).
153  *
154  * HW allows up to 4 descriptors to be combined into a WR.
155  */
156 static uint8_t flit_desc_map[] = {
157 	0,
158 #if SGE_NUM_GENBITS == 1
159 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
160 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
161 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
162 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
163 #elif SGE_NUM_GENBITS == 2
164 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
165 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
166 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
167 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
168 #else
169 # error "SGE_NUM_GENBITS must be 1 or 2"
170 #endif
171 };
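/*
 * Worked example (illustrative only; WR_FLITS is defined elsewhere and
 * assumed here to be 15 for SGE_NUM_GENBITS == 2): the formula gives
 * flits_to_desc(16) = 1 + (16 - 2) / (15 - 1) = 2, matching the table,
 * where index 16 is the first entry mapped to 2 descriptors.  The table
 * is just the formula precomputed for every legal flit count.
 */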
172 
173 
174 static int lro_default = 0;
175 int cxgb_debug = 0;
176 
177 static void t3_free_qset(adapter_t *sc, struct sge_qset *q);
178 static void sge_timer_cb(void *arg);
179 static void sge_timer_reclaim(void *arg, int ncount);
180 static int free_tx_desc(adapter_t *sc, struct sge_txq *q, int n, struct mbuf **m_vec);
181 
182 /**
183  *	reclaim_completed_tx - reclaims completed Tx descriptors
184  *	@adapter: the adapter
185  *	@q: the Tx queue to reclaim completed descriptors from
186  *
187  *	Reclaims Tx descriptors that the SGE has indicated it has processed,
188  *	and frees the associated buffers if possible.  Called with the Tx
189  *	queue's lock held.
190  */
191 static __inline int
192 reclaim_completed_tx(adapter_t *adapter, struct sge_txq *q, int nbufs, struct mbuf **mvec)
193 {
194 	int reclaimed, reclaim = desc_reclaimable(q);
195 	int n = 0;
196 
197 	mtx_assert(&q->lock, MA_OWNED);
198 
199 	if (reclaim > 0) {
200 		n = free_tx_desc(adapter, q, min(reclaim, nbufs), mvec);
201 		reclaimed = min(reclaim, nbufs);
202 		q->cleaned += reclaimed;
203 		q->in_use -= reclaimed;
204 	}
205 
206 	return (n);
207 }
208 
209 /**
210  *	t3_sge_init - initialize SGE
211  *	@adap: the adapter
212  *	@p: the SGE parameters
213  *
214  *	Performs SGE initialization needed every time after a chip reset.
215  *	We do not initialize any of the queue sets here, instead the driver
216  *	top-level must request those individually.  We also do not enable DMA
217  *	here, that should be done after the queues have been set up.
218  */
219 void
220 t3_sge_init(adapter_t *adap, struct sge_params *p)
221 {
222 	u_int ctrl, ups;
223 
224 	ups = 0; /* = ffs(pci_resource_len(adap->pdev, 2) >> 12); */
225 
226 	ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL |
227 	       F_CQCRDTCTRL |
228 	       V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS |
229 	       V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING;
230 #if SGE_NUM_GENBITS == 1
231 	ctrl |= F_EGRGENCTRL;
232 #endif
233 	if (adap->params.rev > 0) {
234 		if (!(adap->flags & (USING_MSIX | USING_MSI)))
235 			ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ;
236 		ctrl |= F_CQCRDTCTRL | F_AVOIDCQOVFL;
237 	}
238 	t3_write_reg(adap, A_SG_CONTROL, ctrl);
239 	t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) |
240 		     V_LORCQDRBTHRSH(512));
241 	t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10);
242 	t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) |
243 		     V_TIMEOUT(200 * core_ticks_per_usec(adap)));
244 	t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH, 1000);
245 	t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256);
246 	t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000);
247 	t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256);
248 	t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff));
249 	t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024);
250 }
251 
252 
253 /**
254  *	sgl_len - calculates the size of an SGL of the given capacity
255  *	@n: the number of SGL entries
256  *
257  *	Calculates the number of flits needed for a scatter/gather list that
258  *	can hold the given number of entries.
259  */
260 static __inline unsigned int
261 sgl_len(unsigned int n)
262 {
263 	return ((3 * n) / 2 + (n & 1));
264 }
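/*
 * Worked example (illustrative only): an SGL entry is a 4-byte length
 * plus an 8-byte address, and two entries pack into 3 flits (24 bytes).
 * Hence sgl_len(4) = (3 * 4) / 2 + (4 & 1) = 6 flits, while an odd count
 * such as sgl_len(3) = 4 + 1 = 5 pays for a 2-flit tail holding the
 * unpaired entry.
 */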
265 
266 /**
267  *	get_imm_packet - return the next ingress packet buffer from a response
268  *	@resp: the response descriptor containing the packet data
269  *
270  *	Return a packet containing the immediate data of the given response.
271  */
272 static __inline int
273 get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct mbuf *m, void *cl)
274 {
275 	int len;
276 	uint32_t flags = ntohl(resp->flags);
277 	uint8_t sopeop = G_RSPD_SOP_EOP(flags);
278 
279 	/*
280 	 * would be a firmware bug
281 	 */
282 	if (sopeop == RSPQ_NSOP_NEOP || sopeop == RSPQ_SOP)
283 		return (0);
284 
285 	m = m_gethdr(M_NOWAIT, MT_DATA);
286 	len = G_RSPD_LEN(ntohl(resp->len_cq));
287 
288 	if (m) {
289 		MH_ALIGN(m, IMMED_PKT_SIZE);
290 		memcpy(m->m_data, resp->imm_data, IMMED_PKT_SIZE);
291 		m->m_len = len;
292 
293 		switch (sopeop) {
294 		case RSPQ_SOP_EOP:
295 			m->m_len = m->m_pkthdr.len = len;
296 			m->m_flags |= M_PKTHDR;
299 			break;
300 		case RSPQ_EOP:
301 			memcpy(cl, resp->imm_data, len);
302 			m_iovappend(m, cl, MSIZE, len, 0);
303 			break;
304 		}
305 	}
306 	return (m != NULL);
307 }
308 
309 
310 static __inline u_int
311 flits_to_desc(u_int n)
312 {
313 	return (flit_desc_map[n]);
314 }
315 
316 void
317 t3_sge_err_intr_handler(adapter_t *adapter)
318 {
319 	unsigned int v, status;
320 
321 
322 	status = t3_read_reg(adapter, A_SG_INT_CAUSE);
323 
324 	if (status & F_RSPQCREDITOVERFOW)
325 		CH_ALERT(adapter, "SGE response queue credit overflow\n");
326 
327 	if (status & F_RSPQDISABLED) {
328 		v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS);
329 
330 		CH_ALERT(adapter,
331 			 "packet delivered to disabled response queue (0x%x)\n",
332 			 (v >> S_RSPQ0DISABLED) & 0xff);
333 	}
334 
335 	t3_write_reg(adapter, A_SG_INT_CAUSE, status);
336 	if (status & (F_RSPQCREDITOVERFOW | F_RSPQDISABLED))
337 		t3_fatal_err(adapter);
338 }
339 
340 void
341 t3_sge_prep(adapter_t *adap, struct sge_params *p)
342 {
343 	int i;
344 
345 	/* XXX Does ETHER_ALIGN need to be accounted for here? */
346 	p->max_pkt_size = MJUM16BYTES - sizeof(struct cpl_rx_data);
347 
348 	for (i = 0; i < SGE_QSETS; ++i) {
349 		struct qset_params *q = p->qset + i;
350 
351 		q->polling = adap->params.rev > 0;
352 
353 		if (adap->flags & USING_MSIX)
354 			q->coalesce_nsecs = 6000;
355 		else
356 			q->coalesce_nsecs = 3500;
357 
358 		q->rspq_size = RSPQ_Q_SIZE;
359 		q->fl_size = FL_Q_SIZE;
360 		q->jumbo_size = JUMBO_Q_SIZE;
361 		q->txq_size[TXQ_ETH] = TX_ETH_Q_SIZE;
362 		q->txq_size[TXQ_OFLD] = 1024;
363 		q->txq_size[TXQ_CTRL] = 256;
364 		q->cong_thres = 0;
365 	}
366 }
367 
368 int
369 t3_sge_alloc(adapter_t *sc)
370 {
371 
372 	/* The parent tag. */
373 	if (bus_dma_tag_create( NULL,			/* parent */
374 				1, 0,			/* algnmnt, boundary */
375 				BUS_SPACE_MAXADDR,	/* lowaddr */
376 				BUS_SPACE_MAXADDR,	/* highaddr */
377 				NULL, NULL,		/* filter, filterarg */
378 				BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
379 				BUS_SPACE_UNRESTRICTED, /* nsegments */
380 				BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
381 				0,			/* flags */
382 				NULL, NULL,		/* lock, lockarg */
383 				&sc->parent_dmat)) {
384 		device_printf(sc->dev, "Cannot allocate parent DMA tag\n");
385 		return (ENOMEM);
386 	}
387 
388 	/*
389 	 * DMA tag for normal sized RX frames
390 	 */
391 	if (bus_dma_tag_create(sc->parent_dmat, MCLBYTES, 0, BUS_SPACE_MAXADDR,
392 		BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, 1,
393 		MCLBYTES, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_dmat)) {
394 		device_printf(sc->dev, "Cannot allocate RX DMA tag\n");
395 		return (ENOMEM);
396 	}
397 
398 	/*
399 	 * DMA tag for jumbo sized RX frames.
400 	 */
401 	if (bus_dma_tag_create(sc->parent_dmat, MJUMPAGESIZE, 0, BUS_SPACE_MAXADDR,
402 		BUS_SPACE_MAXADDR, NULL, NULL, MJUMPAGESIZE, 1, MJUMPAGESIZE,
403 		BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_jumbo_dmat)) {
404 		device_printf(sc->dev, "Cannot allocate RX jumbo DMA tag\n");
405 		return (ENOMEM);
406 	}
407 
408 	/*
409 	 * DMA tag for TX frames.
410 	 */
411 	if (bus_dma_tag_create(sc->parent_dmat, 1, 0, BUS_SPACE_MAXADDR,
412 		BUS_SPACE_MAXADDR, NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
413 		TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
414 		NULL, NULL, &sc->tx_dmat)) {
415 		device_printf(sc->dev, "Cannot allocate TX DMA tag\n");
416 		return (ENOMEM);
417 	}
418 
419 	return (0);
420 }
421 
422 int
423 t3_sge_free(struct adapter * sc)
424 {
425 
426 	if (sc->tx_dmat != NULL)
427 		bus_dma_tag_destroy(sc->tx_dmat);
428 
429 	if (sc->rx_jumbo_dmat != NULL)
430 		bus_dma_tag_destroy(sc->rx_jumbo_dmat);
431 
432 	if (sc->rx_dmat != NULL)
433 		bus_dma_tag_destroy(sc->rx_dmat);
434 
435 	if (sc->parent_dmat != NULL)
436 		bus_dma_tag_destroy(sc->parent_dmat);
437 
438 	return (0);
439 }
440 
441 void
442 t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p)
443 {
444 
445 	qs->rspq.holdoff_tmr = max(p->coalesce_nsecs/100, 1U);
446 	qs->rspq.polling = 0 /* p->polling */;
447 }
448 
449 static void
450 refill_fl_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
451 {
452 	struct refill_fl_cb_arg *cb_arg = arg;
453 
454 	cb_arg->error = error;
455 	cb_arg->seg = segs[0];
456 	cb_arg->nseg = nseg;
457 
458 }
459 
460 /**
461  *	refill_fl - refill an SGE free-buffer list
462  *	@sc: the controller softc
463  *	@q: the free-list to refill
464  *	@n: the number of new buffers to allocate
465  *
466  *	(Re)populate an SGE free-buffer list with up to @n new packet buffers.
467  *	The caller must ensure that @n does not exceed the queue's capacity.
468  */
469 static void
470 refill_fl(adapter_t *sc, struct sge_fl *q, int n)
471 {
472 	struct rx_sw_desc *sd = &q->sdesc[q->pidx];
473 	struct rx_desc *d = &q->desc[q->pidx];
474 	struct refill_fl_cb_arg cb_arg;
475 	void *cl;
476 	int err;
477 
478 	cb_arg.error = 0;
479 	while (n--) {
480 		/*
481 		 * We only allocate a cluster, mbuf allocation happens after rx
482 		 */
483 		if ((cl = m_cljget(NULL, M_DONTWAIT, q->buf_size)) == NULL) {
484 			log(LOG_WARNING, "Failed to allocate cluster\n");
485 			goto done;
486 		}
487 		if ((sd->flags & RX_SW_DESC_MAP_CREATED) == 0) {
488 			if ((err = bus_dmamap_create(q->entry_tag, 0, &sd->map))) {
489 				log(LOG_WARNING, "bus_dmamap_create failed %d\n", err);
490 				/*
491 				 * XXX free cluster
492 				 */
493 				goto done;
494 			}
495 			sd->flags |= RX_SW_DESC_MAP_CREATED;
496 		}
497 		err = bus_dmamap_load(q->entry_tag, sd->map, cl, q->buf_size,
498 		    refill_fl_cb, &cb_arg, 0);
499 
500 		if (err != 0 || cb_arg.error) {
501 			log(LOG_WARNING, "failure in refill_fl %d\n", cb_arg.error);
502 			/*
503 			 * XXX free cluster
504 			 */
505 			return;
506 		}
507 
508 		sd->flags |= RX_SW_DESC_INUSE;
509 		sd->cl = cl;
510 		d->addr_lo = htobe32(cb_arg.seg.ds_addr & 0xffffffff);
511 		d->addr_hi = htobe32(((uint64_t)cb_arg.seg.ds_addr >>32) & 0xffffffff);
512 		d->len_gen = htobe32(V_FLD_GEN1(q->gen));
513 		d->gen2 = htobe32(V_FLD_GEN2(q->gen));
514 
515 		d++;
516 		sd++;
517 
518 		if (++q->pidx == q->size) {
519 			q->pidx = 0;
520 			q->gen ^= 1;
521 			sd = q->sdesc;
522 			d = q->desc;
523 		}
524 		q->credits++;
525 	}
526 
527 done:
528 	t3_write_reg(sc, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
529 }
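/*
 * Usage sketch (drawn from this file, not new logic): queue-set setup in
 * t3_sge_alloc_qset() fills each list completely once, e.g.
 *
 *	refill_fl(sc, &q->fl[0], q->fl[0].size);
 *
 * whereas the periodic reclaim path tops lists up through __refill_fl(),
 * which caps each call at 16 buffers so a single pass stays short.
 */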
530 
531 
532 /**
533  *	free_rx_bufs - free the Rx buffers on an SGE free list
534  *	@sc: the controller softc
535  *	@q: the SGE free list to clean up
536  *
537  *	Release the buffers on an SGE free-buffer Rx queue.  HW fetching from
538  *	this queue should be stopped before calling this function.
539  */
540 static void
541 free_rx_bufs(adapter_t *sc, struct sge_fl *q)
542 {
543 	u_int cidx = q->cidx;
544 
545 	while (q->credits--) {
546 		struct rx_sw_desc *d = &q->sdesc[cidx];
547 
548 		if (d->flags & RX_SW_DESC_INUSE) {
549 			bus_dmamap_unload(q->entry_tag, d->map);
550 			bus_dmamap_destroy(q->entry_tag, d->map);
551 			uma_zfree(q->zone, d->cl);
552 		}
553 		d->cl = NULL;
554 		if (++cidx == q->size)
555 			cidx = 0;
556 	}
557 }
558 
559 static __inline void
560 __refill_fl(adapter_t *adap, struct sge_fl *fl)
561 {
562 	refill_fl(adap, fl, min(16U, fl->size - fl->credits));
563 }
564 
565 static void
566 alloc_ring_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
567 {
568 	uint32_t *addr;
569 
570 	addr = arg;
571 	*addr = segs[0].ds_addr;
572 }
573 
574 static int
575 alloc_ring(adapter_t *sc, size_t nelem, size_t elem_size, size_t sw_size,
576     bus_addr_t *phys, void *desc, void *sdesc, bus_dma_tag_t *tag,
577     bus_dmamap_t *map, bus_dma_tag_t parent_entry_tag, bus_dma_tag_t *entry_tag)
578 {
579 	size_t len = nelem * elem_size;
580 	void *s = NULL;
581 	void *p = NULL;
582 	int err;
583 
584 	if ((err = bus_dma_tag_create(sc->parent_dmat, PAGE_SIZE, 0,
585 				      BUS_SPACE_MAXADDR_32BIT,
586 				      BUS_SPACE_MAXADDR, NULL, NULL, len, 1,
587 				      len, 0, NULL, NULL, tag)) != 0) {
588 		device_printf(sc->dev, "Cannot allocate descriptor tag\n");
589 		return (ENOMEM);
590 	}
591 
592 	if ((err = bus_dmamem_alloc(*tag, (void **)&p, BUS_DMA_NOWAIT,
593 				    map)) != 0) {
594 		device_printf(sc->dev, "Cannot allocate descriptor memory\n");
595 		return (ENOMEM);
596 	}
597 
598 	bus_dmamap_load(*tag, *map, p, len, alloc_ring_cb, phys, 0);
599 	bzero(p, len);
600 	*(void **)desc = p;
601 
602 	if (sw_size) {
603 		len = nelem * sw_size;
604 		s = malloc(len, M_DEVBUF, M_WAITOK);
605 		bzero(s, len);
606 		*(void **)sdesc = s;
607 	}
608 	if (parent_entry_tag == NULL)
609 		return (0);
610 
611 	if ((err = bus_dma_tag_create(parent_entry_tag, PAGE_SIZE, 0,
612 				      BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR,
613 		                      NULL, NULL, PAGE_SIZE, 1,
614 				      PAGE_SIZE, BUS_DMA_ALLOCNOW,
615 		                      NULL, NULL, entry_tag)) != 0) {
616 		device_printf(sc->dev, "Cannot allocate descriptor entry tag\n");
617 		return (ENOMEM);
618 	}
619 	return (0);
620 }
621 
622 static void
623 sge_slow_intr_handler(void *arg, int ncount)
624 {
625 	adapter_t *sc = arg;
626 
627 	t3_slow_intr_handler(sc);
628 }
629 
630 static void
631 sge_timer_cb(void *arg)
632 {
633 	adapter_t *sc = arg;
634 	struct sge_qset *qs;
635 	struct sge_txq  *txq;
636 	int i, j;
637 	int reclaim_eth, reclaim_ofl, refill_rx;
638 
639 	for (i = 0; i < sc->params.nports; i++)
640 		for (j = 0; j < sc->port[i].nqsets; j++) {
641 			qs = &sc->sge.qs[i + j];
642 			txq = &qs->txq[0];
643 			reclaim_eth = txq[TXQ_ETH].processed - txq[TXQ_ETH].cleaned;
644 			reclaim_ofl = txq[TXQ_OFLD].processed - txq[TXQ_OFLD].cleaned;
645 			refill_rx = ((qs->fl[0].credits < qs->fl[0].size) ||
646 			    (qs->fl[1].credits < qs->fl[1].size));
647 			if (reclaim_eth || reclaim_ofl || refill_rx) {
648 				taskqueue_enqueue(sc->tq, &sc->timer_reclaim_task);
649 				goto done;
650 			}
651 		}
652 done:
653 	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
654 }
655 
656 /*
657  * This is meant to be a catch-all function to keep sge state private
658  * to sge.c
659  *
660  */
661 int
662 t3_sge_init_sw(adapter_t *sc)
663 {
664 
665 	callout_init(&sc->sge_timer_ch, CALLOUT_MPSAFE);
666 	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
667 	TASK_INIT(&sc->timer_reclaim_task, 0, sge_timer_reclaim, sc);
668 	TASK_INIT(&sc->slow_intr_task, 0, sge_slow_intr_handler, sc);
669 	return (0);
670 }
671 
672 void
673 t3_sge_deinit_sw(adapter_t *sc)
674 {
675 	callout_drain(&sc->sge_timer_ch);
676 	if (sc->tq) {
677 		taskqueue_drain(sc->tq, &sc->timer_reclaim_task);
678 		taskqueue_drain(sc->tq, &sc->slow_intr_task);
679 	}
680 }
681 
682 /**
683  *	refill_rspq - replenish an SGE response queue
684  *	@adapter: the adapter
685  *	@q: the response queue to replenish
686  *	@credits: how many new responses to make available
687  *
688  *	Replenishes a response queue by making the supplied number of responses
689  *	available to HW.
690  */
691 static __inline void
692 refill_rspq(adapter_t *sc, const struct sge_rspq *q, u_int credits)
693 {
694 
695 	/* mbufs are allocated on demand when a rspq entry is processed. */
696 	t3_write_reg(sc, A_SG_RSPQ_CREDIT_RETURN,
697 		     V_RSPQ(q->cntxt_id) | V_CREDITS(credits));
698 }
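/*
 * Usage sketch (drawn from this file): t3_sge_alloc_qset() hands the HW
 * nearly all credits up front with
 *
 *	refill_rspq(sc, &q->rspq, q->rspq.size - 1);
 *
 * while sge_timer_reclaim() below returns them one at a time as response
 * entries are consumed.
 */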
699 
700 
701 static void
702 sge_timer_reclaim(void *arg, int ncount)
703 {
704 	adapter_t *sc = arg;
705 	int i, j, nqsets = 0;
706 	struct sge_qset *qs;
707 	struct sge_txq *txq;
708 	struct mtx *lock;
709 	struct mbuf *m_vec[TX_CLEAN_MAX_DESC];
710 	int n, reclaimable;
711 	/*
712 	 * XXX assuming these quantities are allowed to change during operation
713 	 */
714 	for (i = 0; i < sc->params.nports; i++)
715 		nqsets += sc->port[i].nqsets;
716 
717 	for (i = 0; i < nqsets; i++) {
718 		qs = &sc->sge.qs[i];
719 		txq = &qs->txq[TXQ_ETH];
720 		reclaimable = desc_reclaimable(txq);
721 		if (reclaimable > 0) {
722 			mtx_lock(&txq->lock);
723 			n = reclaim_completed_tx(sc, txq, TX_CLEAN_MAX_DESC, m_vec);
724 			mtx_unlock(&txq->lock);
725 
726 			for (j = 0; j < n; j++) {
727 				m_freem(m_vec[j]);
728 			}
729 		}
730 
731 		txq = &qs->txq[TXQ_OFLD];
732 		reclaimable = desc_reclaimable(txq);
733 		if (reclaimable > 0) {
734 			mtx_lock(&txq->lock);
735 			n = reclaim_completed_tx(sc, txq, TX_CLEAN_MAX_DESC, m_vec);
736 			mtx_unlock(&txq->lock);
737 
738 			for (j = 0; j < n; j++) {
739 				m_freem(m_vec[j]);
740 			}
741 		}
742 
743 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
744 			    &sc->sge.qs[0].rspq.lock;
745 
746 		if (mtx_trylock(lock)) {
747 			/* XXX currently assume that we are *NOT* polling */
748 			uint32_t status = t3_read_reg(sc, A_SG_RSPQ_FL_STATUS);
749 
750 			if (qs->fl[0].credits < qs->fl[0].size - 16)
751 				__refill_fl(sc, &qs->fl[0]);
752 			if (qs->fl[1].credits < qs->fl[1].size - 16)
753 				__refill_fl(sc, &qs->fl[1]);
754 
755 			if (status & (1 << qs->rspq.cntxt_id)) {
756 				if (qs->rspq.credits) {
757 					refill_rspq(sc, &qs->rspq, 1);
758 					qs->rspq.credits--;
759 					t3_write_reg(sc, A_SG_RSPQ_FL_STATUS,
760 					    1 << qs->rspq.cntxt_id);
761 				}
762 			}
763 			mtx_unlock(lock);
764 		}
765 	}
766 }
767 
768 /**
769  *	init_qset_cntxt - initialize an SGE queue set context info
770  *	@qs: the queue set
771  *	@id: the queue set id
772  *
773  *	Initializes the TIDs and context ids for the queues of a queue set.
774  */
775 static void
776 init_qset_cntxt(struct sge_qset *qs, u_int id)
777 {
778 
779 	qs->rspq.cntxt_id = id;
780 	qs->fl[0].cntxt_id = 2 * id;
781 	qs->fl[1].cntxt_id = 2 * id + 1;
782 	qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id;
783 	qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id;
784 	qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id;
785 	qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id;
786 	qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id;
787 }
788 
789 
790 static void
791 txq_prod(struct sge_txq *txq, unsigned int ndesc, struct txq_state *txqs)
792 {
793 	txq->in_use += ndesc;
794 	/*
795 	 * XXX we don't handle stopping of queue
796 	 * presumably start handles this when we bump against the end
797 	 */
798 	txqs->gen = txq->gen;
799 	txq->unacked += ndesc;
800 	txqs->compl = (txq->unacked & 8) << (S_WR_COMPL - 3);
801 	txq->unacked &= 7;
802 	txqs->pidx = txq->pidx;
803 	txq->pidx += ndesc;
804 
805 	if (txq->pidx >= txq->size) {
806 		txq->pidx -= txq->size;
807 		txq->gen ^= 1;
808 	}
809 
810 }
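/*
 * Worked example (illustrative only): on a 256-entry ring with
 * txq->pidx == 254 and txq->gen == 1, reserving ndesc == 4 stores the
 * pre-advance values (pidx 254, gen 1) in *txqs for the caller to use,
 * then wraps the ring to pidx == 2 and flips gen to 0 for the next
 * reservation.
 */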
811 
812 /**
813  *	calc_tx_descs - calculate the number of Tx descriptors for a packet
814  *	@m: the packet mbufs
815  *      @nsegs: the number of segments
816  *
817  * 	Returns the number of Tx descriptors needed for the given Ethernet
818  * 	packet.  Ethernet packets require addition of WR and CPL headers.
819  */
820 static __inline unsigned int
821 calc_tx_descs(const struct mbuf *m, int nsegs)
822 {
823 	unsigned int flits;
824 
825 	if (m->m_pkthdr.len <= WR_LEN - sizeof(struct cpl_tx_pkt))
826 		return 1;
827 
828 	flits = sgl_len(nsegs) + 2;
829 #ifdef TSO_SUPPORTED
830 	if (m->m_pkthdr.tso_segsz)
831 		flits++;
832 #endif
833 	return flits_to_desc(flits);
834 }
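/*
 * Worked example (illustrative only): a packet mapped to 3 DMA segments
 * that is too big for immediate data needs sgl_len(3) + 2 = 7 flits for
 * the SGL plus the WR and CPL headers, and flits_to_desc(7) folds that
 * into a single Tx descriptor.
 */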
835 
836 static __inline unsigned int
837 busdma_map_mbufs(struct mbuf **m, adapter_t *sc, struct tx_sw_desc *stx,
838     bus_dma_segment_t *segs, int *nsegs)
839 {
840 	struct mbuf *m0, *mtmp;
841 	int err, pktlen;
842 
843 	m0 = *m;
844 	pktlen = m0->m_pkthdr.len;
845 	err = bus_dmamap_load_mbuf_sg(sc->tx_dmat, stx->map, m0, segs, nsegs, 0);
846 	if (err) {
847 		int n = 0;
848 		mtmp = m0;
849 		while(mtmp) {
850 			n++;
851 			mtmp = mtmp->m_next;
852 		}
853 #ifdef DEBUG
854 		printf("map_mbufs: bus_dmamap_load_mbuf_sg failed with %d - pkthdr.len==%d nmbufs=%d\n",
855 		    err, m0->m_pkthdr.len, n);
856 #endif
857 	}
858 
859 	if (err == EFBIG) {
860 		/* Too many segments, try to defrag */
861 		m0 = m_defrag(m0, M_NOWAIT);
862 		if (m0 == NULL) {
863 			m_freem(*m);
864 			*m = NULL;
865 			return (ENOBUFS);
866 		}
867 		*m = m0;
868 		err = bus_dmamap_load_mbuf_sg(sc->tx_dmat, stx->map, m0, segs, nsegs, 0);
869 	}
870 
871 	if (err == ENOMEM) {
872 		return (err);
873 	}
874 
875 	if (err) {
876 		if (cxgb_debug)
877 			printf("map failure err=%d pktlen=%d\n", err, pktlen);
878 		m_freem(m0);
879 		*m = NULL;
880 		return (err);
881 	}
882 
883 	bus_dmamap_sync(sc->tx_dmat, stx->map, BUS_DMASYNC_PREWRITE);
884 	stx->flags |= TX_SW_DESC_MAPPED;
885 
886 	return (0);
887 }
888 
889 /**
890  *	make_sgl - populate a scatter/gather list for a packet
891  *	@sgp: the SGL to populate
892  *	@segs: the packet dma segments
893  *	@nsegs: the number of segments
894  *
895  *	Generates a scatter/gather list for the buffers that make up a packet
896  *	and writes it into @sgp.  The caller must size the SGL (see sgl_len())
897  *	appropriately.
898  */
899 static __inline void
900 make_sgl(struct sg_ent *sgp, bus_dma_segment_t *segs, int nsegs)
901 {
902 	int i, idx;
903 
904 	for (idx = 0, i = 0; i < nsegs; i++, idx ^= 1) {
905 		if (i && idx == 0)
906 			++sgp;
907 
908 		sgp->len[idx] = htobe32(segs[i].ds_len);
909 		sgp->addr[idx] = htobe64(segs[i].ds_addr);
910 	}
911 
912 	if (idx)
913 		sgp->len[idx] = 0;
914 }
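/*
 * Layout sketch (illustrative only): segments are written in pairs, so
 * for nsegs == 3 the first sg_ent holds {len0, addr0, len1, addr1} and
 * the second holds {len2, addr2, 0, unused}; the trailing zero length
 * marks the empty slot, which is why sgl_len() charges a 2-flit tail for
 * an odd segment count.
 */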
915 
916 /**
917  *	check_ring_tx_db - check and potentially ring a Tx queue's doorbell
918  *	@adap: the adapter
919  *	@q: the Tx queue
920  *
921  *	Ring the doorbell if a Tx queue is asleep.  There is a natural race
922  *	where the HW may go to sleep just after we check; in that case the
923  *	interrupt handler will detect the outstanding TX packet and ring the
924  *	doorbell for us.
925  *
926  *	When GTS is disabled we unconditionally ring the doorbell.
927  */
928 static __inline void
929 check_ring_tx_db(adapter_t *adap, struct sge_txq *q)
930 {
931 #if USE_GTS
932 	clear_bit(TXQ_LAST_PKT_DB, &q->flags);
933 	if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) {
934 		set_bit(TXQ_LAST_PKT_DB, &q->flags);
935 #ifdef T3_TRACE
936 		T3_TRACE1(adap->tb[q->cntxt_id & 7], "doorbell Tx, cntxt %d",
937 			  q->cntxt_id);
938 #endif
939 		t3_write_reg(adap, A_SG_KDOORBELL,
940 			     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
941 	}
942 #else
943 	wmb();            /* write descriptors before telling HW */
944 	t3_write_reg(adap, A_SG_KDOORBELL,
945 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
946 #endif
947 }
948 
949 static __inline void
950 wr_gen2(struct tx_desc *d, unsigned int gen)
951 {
952 #if SGE_NUM_GENBITS == 2
953 	d->flit[TX_DESC_FLITS - 1] = htobe64(gen);
954 #endif
955 }
956 
957 /* sizeof(*eh) + sizeof(*vhdr) + sizeof(*ip) + sizeof(*tcp) */
958 #define TCPPKTHDRSIZE (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + 20 + 20)
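/*
 * Illustrative arithmetic: 14 (Ethernet) + 4 (VLAN encap) + 20 (minimal
 * IP) + 20 (minimal TCP) = 58 bytes, the largest header block the LSO
 * path below may have to copy out of a split mbuf chain before it can
 * parse the IP and TCP headers.
 */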
959 
960 int
961 t3_encap(struct port_info *p, struct mbuf **m)
962 {
963 	adapter_t *sc;
964 	struct mbuf *m0;
965 	struct sge_qset *qs;
966 	struct sge_txq *txq;
967 	struct tx_sw_desc *stx;
968 	struct txq_state txqs;
969 	unsigned int nsegs, ndesc, flits, cntrl, mlen, tso_info;
970 	int err;
971 
972 	struct work_request_hdr *wrp;
973 	struct tx_sw_desc *txsd;
974 	struct sg_ent *sgp, sgl[TX_MAX_SEGS / 2 + 1];
975 	bus_dma_segment_t segs[TX_MAX_SEGS];
976 	uint32_t wr_hi, wr_lo, sgl_flits;
977 
978 	struct tx_desc *txd;
979 	struct cpl_tx_pkt *cpl;
980 
981 	DPRINTF("t3_encap ");
982 	m0 = *m;
983 	sc = p->adapter;
984 	qs = &sc->sge.qs[p->first_qset];
985 	txq = &qs->txq[TXQ_ETH];
986 	stx = &txq->sdesc[txq->pidx];
987 	txd = &txq->desc[txq->pidx];
988 	cpl = (struct cpl_tx_pkt *)txd;
989 	mlen = m0->m_pkthdr.len;
990 	cpl->len = htonl(mlen | 0x80000000);
991 
992 	DPRINTF("mlen=%d\n", mlen);
993 	/*
994 	 * XXX handle checksum, TSO, and VLAN here
995 	 *
996 	 */
997 	cntrl = V_TXPKT_INTF(p->port);
998 
999 	/*
1000 	 * XXX need to add VLAN support for 6.x
1001 	 */
1002 #ifdef VLAN_SUPPORTED
1003 	if (m0->m_flags & M_VLANTAG)
1004 		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
1005 
1006 	tso_info = V_LSO_MSS(m0->m_pkthdr.tso_segsz);
1007 #else
1008 	tso_info = 0;
1009 #endif
1010 	if (tso_info) {
1011 		int eth_type;
1012 		struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *) cpl;
1013 		struct ip *ip;
1014 		struct tcphdr *tcp;
1015 		uint8_t *pkthdr, tmp[TCPPKTHDRSIZE]; /* is this too large for the stack? */
1016 
1017 		txd->flit[2] = 0;
1018 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO);
1019 		hdr->cntrl = htonl(cntrl);
1020 
1021 		if (__predict_false(m0->m_len < TCPPKTHDRSIZE)) {
1022 			pkthdr = &tmp[0];
1023 			m_copydata(m0, 0, TCPPKTHDRSIZE, pkthdr);
1024 		} else {
1025 			pkthdr = m0->m_data;
1026 		}
1027 
1028 		if (__predict_false(m0->m_flags & M_VLANTAG)) {
1029 			eth_type = CPL_ETH_II_VLAN;
1030 			ip = (struct ip *)(pkthdr + ETHER_HDR_LEN +
1031 			    ETHER_VLAN_ENCAP_LEN);
1032 		} else {
1033 			eth_type = CPL_ETH_II;
1034 			ip = (struct ip *)(pkthdr + ETHER_HDR_LEN);
1035 		}
1036 		tcp = (struct tcphdr *)((uint8_t *)ip +
1037 		    sizeof(*ip));
1038 
1039 		tso_info |= V_LSO_ETH_TYPE(eth_type) |
1040 			    V_LSO_IPHDR_WORDS(ip->ip_hl) |
1041 			    V_LSO_TCPHDR_WORDS(tcp->th_off);
1042 		hdr->lso_info = htonl(tso_info);
1043 
1044 		flits = 3;
1045 	} else {
1046 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1047 		cpl->cntrl = htonl(cntrl);
1048 
1049 		if (mlen <= WR_LEN - sizeof(*cpl)) {
1050 			txq_prod(txq, 1, &txqs);
1051 			txq->sdesc[txqs.pidx].m = m0;
1052 
1053 			if (m0->m_len == m0->m_pkthdr.len)
1054 				memcpy(&txd->flit[2], m0->m_data, mlen);
1055 			else
1056 				m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[2]);
1057 
1058 			flits = (mlen + 7) / 8 + 2;
1059 			cpl->wr.wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1060 					  V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1061 					  F_WR_SOP | F_WR_EOP | txqs.compl);
1062 			wmb();
1063 			cpl->wr.wr_lo = htonl(V_WR_LEN(flits) |
1064 			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1065 
1066 			wr_gen2(txd, txqs.gen);
1067 			check_ring_tx_db(sc, txq);
1068 			return (0);
1069 		}
1070 		flits = 2;
1071 	}
1072 
1073 	wrp = (struct work_request_hdr *)txd;
1074 
1075 	if ((err = busdma_map_mbufs(m, sc, stx, segs, &nsegs)) != 0) {
1076 		return (err);
1077 	}
1078 	m0 = *m;
1079 	ndesc = calc_tx_descs(m0, nsegs);
1080 
1081 	sgp = (ndesc == 1) ? (struct sg_ent *)&txd->flit[flits] : &sgl[0];
1082 	make_sgl(sgp, segs, nsegs);
1083 
1084 	sgl_flits = sgl_len(nsegs);
1085 
1086 	DPRINTF("make_sgl success nsegs==%d ndesc==%d\n", nsegs, ndesc);
1087 	txq_prod(txq, ndesc, &txqs);
1088 	txsd = &txq->sdesc[txqs.pidx];
1089 	wr_hi = htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1090 	wr_lo = htonl(V_WR_TID(txq->token));
1091 	txsd->m = m0;
1092 
1093 	if (__predict_true(ndesc == 1)) {
1094 		wrp->wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1095 		    V_WR_SGLSFLT(flits)) | wr_hi;
1096 		wmb();
1097 		wrp->wr_lo = htonl(V_WR_LEN(flits + sgl_flits) |
1098 		    V_WR_GEN(txqs.gen)) | wr_lo;
1099 		/* XXX gen? */
1100 		wr_gen2(txd, txqs.gen);
1101 	} else {
1102 		unsigned int ogen = txqs.gen;
1103 		const uint64_t *fp = (const uint64_t *)sgl;
1104 		struct work_request_hdr *wp = wrp;
1105 
1106 		/* XXX - CHECK ME */
1107 		wrp->wr_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) |
1108 		    V_WR_SGLSFLT(flits)) | wr_hi;
1109 
1110 		while (sgl_flits) {
1111 			unsigned int avail = WR_FLITS - flits;
1112 
1113 			if (avail > sgl_flits)
1114 				avail = sgl_flits;
1115 			memcpy(&txd->flit[flits], fp, avail * sizeof(*fp));
1116 			sgl_flits -= avail;
1117 			ndesc--;
1118 			if (!sgl_flits)
1119 				break;
1120 
1121 			fp += avail;
1122 			txd++;
1123 			txsd++;
1124 			if (++txqs.pidx == txq->size) {
1125 				txqs.pidx = 0;
1126 				txqs.gen ^= 1;
1127 				txd = txq->desc;
1128 				txsd = txq->sdesc;
1129 			}
1130 
1131 			/*
1132 			 * when the head of the mbuf chain
1133 			 * is freed all clusters will be freed
1134 			 * with it
1135 			 */
1136 			txsd->m = NULL;
1137 			wrp = (struct work_request_hdr *)txd;
1138 			wrp->wr_hi = htonl(V_WR_DATATYPE(1) |
1139 			    V_WR_SGLSFLT(1)) | wr_hi;
1140 			wrp->wr_lo = htonl(V_WR_LEN(min(WR_FLITS,
1141 				    sgl_flits + 1)) |
1142 			    V_WR_GEN(txqs.gen)) | wr_lo;
1143 			wr_gen2(txd, txqs.gen);
1144 			flits = 1;
1145 		}
1146 #ifdef WHY
1147 		skb->priority = pidx;
1148 #endif
1149 		wrp->wr_hi |= htonl(F_WR_EOP);
1150 		wmb();
1151 		wp->wr_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo;
1152 		wr_gen2((struct tx_desc *)wp, ogen);
1153 	}
1154 	check_ring_tx_db(p->adapter, txq);
1155 
1156 	return (0);
1157 }
1158 
1159 
1160 /**
1161  *	write_imm - write a packet into a Tx descriptor as immediate data
1162  *	@d: the Tx descriptor to write
1163  *	@m: the packet
1164  *	@len: the length of packet data to write as immediate data
1165  *	@gen: the generation bit value to write
1166  *
1167  *	Writes a packet as immediate data into a Tx descriptor.  The packet
1168  *	contains a work request at its beginning.  We must write the packet
1169  *	carefully so the SGE doesn't accidentally read it before it is written in
1170  *	its entirety.
1171  */
1172 static __inline void write_imm(struct tx_desc *d, struct mbuf *m,
1173 			     unsigned int len, unsigned int gen)
1174 {
1175 	struct work_request_hdr *from = (struct work_request_hdr *)m->m_data;
1176 	struct work_request_hdr *to = (struct work_request_hdr *)d;
1177 
1178 	memcpy(&to[1], &from[1], len - sizeof(*from));
1179 	to->wr_hi = from->wr_hi | htonl(F_WR_SOP | F_WR_EOP |
1180 					V_WR_BCNTLFLT(len & 7));
1181 	wmb();
1182 	to->wr_lo = from->wr_lo | htonl(V_WR_GEN(gen) |
1183 					V_WR_LEN((len + 7) / 8));
1184 	wr_gen2(d, gen);
1185 	m_freem(m);
1186 }
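/*
 * Ordering sketch (illustrative only): the body and wr_hi land first,
 * the wmb() makes them visible, and only then is wr_lo written with the
 * generation value, so the SGE cannot observe a descriptor whose
 * generation matches before the rest of the work request is in place.
 */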
1187 
1188 /**
1189  *	check_desc_avail - check descriptor availability on a send queue
1190  *	@adap: the adapter
1191  *	@q: the TX queue
1192  *	@m: the packet needing the descriptors
1193  *	@ndesc: the number of Tx descriptors needed
1194  *	@qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL)
1195  *
1196  *	Checks if the requested number of Tx descriptors is available on an
1197  *	SGE send queue.  If the queue is already suspended or not enough
1198  *	descriptors are available the packet is queued for later transmission.
1199  *	Must be called with the Tx queue locked.
1200  *
1201  *	Returns 0 if enough descriptors are available, 1 if there aren't
1202  *	enough descriptors and the packet has been queued, and 2 if the caller
1203  *	needs to retry because there weren't enough descriptors at the
1204  *	beginning of the call but some freed up in the meantime.
1205  */
1206 static __inline int
1207 check_desc_avail(adapter_t *adap, struct sge_txq *q,
1208 				   struct mbuf *m, unsigned int ndesc,
1209 				   unsigned int qid)
1210 {
1211 	/*
1212 	 * XXX We currently only use this for checking the control queue
1213 	 * the control queue is only used for binding qsets which happens
1214 	 * at init time so we are guaranteed enough descriptors
1215 	 */
1216 #if 0
1217 	if (__predict_false(!skb_queue_empty(&q->sendq))) {
1218 addq_exit:	__skb_queue_tail(&q->sendq, skb);
1219 		return 1;
1220 	}
1221 	if (__predict_false(q->size - q->in_use < ndesc)) {
1222 
1223 		struct sge_qset *qs = txq_to_qset(q, qid);
1224 
1225 		set_bit(qid, &qs->txq_stopped);
1226 		smp_mb__after_clear_bit();
1227 
1228 		if (should_restart_tx(q) &&
1229 		    test_and_clear_bit(qid, &qs->txq_stopped))
1230 			return 2;
1231 
1232 		q->stops++;
1233 		goto addq_exit;
1234 	}
1235 #endif
1236 	return 0;
1237 }
1238 
1239 
1240 /**
1241  *	reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
1242  *	@q: the SGE control Tx queue
1243  *
1244  *	This is a variant of reclaim_completed_tx() that is used for Tx queues
1245  *	that send only immediate data (presently just the control queues) and
1246  *	thus do not have any mbufs to release.
1247  */
1248 static __inline void
1249 reclaim_completed_tx_imm(struct sge_txq *q)
1250 {
1251 	unsigned int reclaim = q->processed - q->cleaned;
1252 
1253 	mtx_assert(&q->lock, MA_OWNED);
1254 
1255 	q->in_use -= reclaim;
1256 	q->cleaned += reclaim;
1257 }
1258 
1259 static __inline int
1260 immediate(const struct mbuf *m)
1261 {
1262 	return (m->m_len <= WR_LEN && m->m_pkthdr.len <= WR_LEN);
1263 }
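/*
 * Illustrative example: WR_LEN is WR_FLITS * 8 bytes, so a contiguous
 * mbuf no longer than that counts as "immediate" and can be handed to
 * ctrl_xmit() below without any DMA mapping; ctrl_xmit() simply drops
 * anything larger, since a control work request must fit in one
 * descriptor.
 */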
1264 
1265 /**
1266  *	ctrl_xmit - send a packet through an SGE control Tx queue
1267  *	@adap: the adapter
1268  *	@q: the control queue
1269  *	@m: the packet
1270  *
1271  *	Send a packet through an SGE control Tx queue.  Packets sent through
1272  *	a control queue must fit entirely as immediate data in a single Tx
1273  *	descriptor and have no page fragments.
1274  */
1275 static int
1276 ctrl_xmit(adapter_t *adap, struct sge_txq *q, struct mbuf *m)
1277 {
1278 	int ret;
1279 	struct work_request_hdr *wrp = (struct work_request_hdr *)m->m_data;
1280 
1281 	if (__predict_false(!immediate(m))) {
1282 		m_freem(m);
1283 		return 0;
1284 	}
1285 
1286 	wrp->wr_hi |= htonl(F_WR_SOP | F_WR_EOP);
1287 	wrp->wr_lo = htonl(V_WR_TID(q->token));
1288 
1289 	mtx_lock(&q->lock);
1290 again:	reclaim_completed_tx_imm(q);
1291 
1292 	ret = check_desc_avail(adap, q, m, 1, TXQ_CTRL);
1293 	if (__predict_false(ret)) {
1294 		if (ret == 1) {
1295 			mtx_unlock(&q->lock);
1296 			return (-1);
1297 		}
1298 		goto again;
1299 	}
1300 
1301 	write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
1302 
1303 	q->in_use++;
1304 	if (++q->pidx >= q->size) {
1305 		q->pidx = 0;
1306 		q->gen ^= 1;
1307 	}
1308 	mtx_unlock(&q->lock);
1309 	wmb();
1310 	t3_write_reg(adap, A_SG_KDOORBELL,
1311 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1312 	return (0);
1313 }
1314 
1315 #ifdef RESTART_CTRLQ
1316 /**
1317  *	restart_ctrlq - restart a suspended control queue
1318  *	@qs: the queue set containing the control queue
1319  *
1320  *	Resumes transmission on a suspended Tx control queue.
1321  */
1322 static void
1323 restart_ctrlq(unsigned long data)
1324 {
1325 	struct mbuf *m;
1326 	struct sge_qset *qs = (struct sge_qset *)data;
1327 	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1328 	adapter_t *adap = qs->port->adapter;
1329 
1330 	mtx_lock(&q->lock);
1331 again:	reclaim_completed_tx_imm(q);
1332 
1333 	while (q->in_use < q->size &&
1334 	       (skb = __skb_dequeue(&q->sendq)) != NULL) {
1335 
1336 		write_imm(&q->desc[q->pidx], skb, skb->len, q->gen);
1337 
1338 		if (++q->pidx >= q->size) {
1339 			q->pidx = 0;
1340 			q->gen ^= 1;
1341 		}
1342 		q->in_use++;
1343 	}
1344 	if (!skb_queue_empty(&q->sendq)) {
1345 		set_bit(TXQ_CTRL, &qs->txq_stopped);
1346 		smp_mb__after_clear_bit();
1347 
1348 		if (should_restart_tx(q) &&
1349 		    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped))
1350 			goto again;
1351 		q->stops++;
1352 	}
1353 
1354 	mtx_unlock(&q->lock);
1355 	t3_write_reg(adap, A_SG_KDOORBELL,
1356 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1357 }
1358 #endif
1359 
1360 /*
1361  * Send a management message through control queue 0
1362  */
1363 int
1364 t3_mgmt_tx(struct adapter *adap, struct mbuf *m)
1365 {
1366 	return ctrl_xmit(adap, &adap->sge.qs[0].txq[TXQ_CTRL], m);
1367 }
1368 
1369 /**
1370  *	t3_sge_alloc_qset - initialize an SGE queue set
1371  *	@sc: the controller softc
1372  *	@id: the queue set id
1373  *	@nports: how many Ethernet ports will be using this queue set
1374  *	@irq_vec_idx: the IRQ vector index for response queue interrupts
1375  *	@p: configuration parameters for this queue set
1376  *	@ntxq: number of Tx queues for the queue set
1377  *	@pi: port info for queue set
1378  *
1379  *	Allocate resources and initialize an SGE queue set.  A queue set
1380  *	comprises a response queue, two Rx free-buffer queues, and up to 3
1381  *	Tx queues.  The Tx queues are assigned roles in the order Ethernet
1382  *	queue, offload queue, and control queue.
1383  */
1384 int
1385 t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx,
1386 		  const struct qset_params *p, int ntxq, struct port_info *pi)
1387 {
1388 	struct sge_qset *q = &sc->sge.qs[id];
1389 	int i, ret = 0;
1390 
1391 	init_qset_cntxt(q, id);
1392 
1393 	if ((ret = alloc_ring(sc, p->fl_size, sizeof(struct rx_desc),
1394 		    sizeof(struct rx_sw_desc), &q->fl[0].phys_addr,
1395 		    &q->fl[0].desc, &q->fl[0].sdesc,
1396 		    &q->fl[0].desc_tag, &q->fl[0].desc_map,
1397 		    sc->rx_dmat, &q->fl[0].entry_tag)) != 0) {
1398 		printf("error %d from alloc ring fl0\n", ret);
1399 		goto err;
1400 	}
1401 
1402 	if ((ret = alloc_ring(sc, p->jumbo_size, sizeof(struct rx_desc),
1403 		    sizeof(struct rx_sw_desc), &q->fl[1].phys_addr,
1404 		    &q->fl[1].desc, &q->fl[1].sdesc,
1405 		    &q->fl[1].desc_tag, &q->fl[1].desc_map,
1406 		    sc->rx_jumbo_dmat, &q->fl[1].entry_tag)) != 0) {
1407 		printf("error %d from alloc ring fl1\n", ret);
1408 		goto err;
1409 	}
1410 
1411 	if ((ret = alloc_ring(sc, p->rspq_size, sizeof(struct rsp_desc), 0,
1412 		    &q->rspq.phys_addr, &q->rspq.desc, NULL,
1413 		    &q->rspq.desc_tag, &q->rspq.desc_map,
1414 		    NULL, NULL)) != 0) {
1415 		printf("error %d from alloc ring rspq\n", ret);
1416 		goto err;
1417 	}
1418 
1419 	for (i = 0; i < ntxq; ++i) {
1420 		/*
1421 		 * The control queue always uses immediate data so does not
1422 		 * need to keep track of any mbufs.
1423 		 * XXX Placeholder for future TOE support.
1424 		 */
1425 		size_t sz = i == TXQ_CTRL ? 0 : sizeof(struct tx_sw_desc);
1426 
1427 		if ((ret = alloc_ring(sc, p->txq_size[i],
1428 			    sizeof(struct tx_desc), sz,
1429 			    &q->txq[i].phys_addr, &q->txq[i].desc,
1430 			    &q->txq[i].sdesc, &q->txq[i].desc_tag,
1431 			    &q->txq[i].desc_map,
1432 			    sc->tx_dmat, &q->txq[i].entry_tag)) != 0) {
1433 			printf("error %d from alloc ring tx %i\n", ret, i);
1434 			goto err;
1435 		}
1436 
1437 		q->txq[i].gen = 1;
1438 		q->txq[i].size = p->txq_size[i];
1439 		mtx_init(&q->txq[i].lock, "t3 txq lock", NULL, MTX_DEF);
1440 	}
1441 
1442 	q->fl[0].gen = q->fl[1].gen = 1;
1443 	q->fl[0].size = p->fl_size;
1444 	q->fl[1].size = p->jumbo_size;
1445 
1446 	q->rspq.gen = 1;
1447 	q->rspq.size = p->rspq_size;
1448 	mtx_init(&q->rspq.lock, "t3 rspq lock", NULL, MTX_DEF);
1449 
1450 	q->txq[TXQ_ETH].stop_thres = nports *
1451 	    flits_to_desc(sgl_len(TX_MAX_SEGS + 1) + 3);
1452 
1453 	q->fl[0].buf_size = MCLBYTES;
1454 	q->fl[0].zone = zone_clust;
1455 	q->fl[0].type = EXT_CLUSTER;
1456 	q->fl[1].buf_size = MJUMPAGESIZE;
1457 	q->fl[1].zone = zone_jumbop;
1458 	q->fl[1].type = EXT_JUMBOP;
1459 
1460 	q->lro.enabled = lro_default;
1461 
1462 	mtx_lock(&sc->sge.reg_lock);
1463 	ret = -t3_sge_init_rspcntxt(sc, q->rspq.cntxt_id, irq_vec_idx,
1464 				   q->rspq.phys_addr, q->rspq.size,
1465 				   q->fl[0].buf_size, 1, 0);
1466 	if (ret) {
1467 		printf("error %d from t3_sge_init_rspcntxt\n", ret);
1468 		goto err_unlock;
1469 	}
1470 
1471 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
1472 		ret = -t3_sge_init_flcntxt(sc, q->fl[i].cntxt_id, 0,
1473 					  q->fl[i].phys_addr, q->fl[i].size,
1474 					  q->fl[i].buf_size, p->cong_thres, 1,
1475 					  0);
1476 		if (ret) {
1477 			printf("error %d from t3_sge_init_flcntxt for index i=%d\n", ret, i);
1478 			goto err_unlock;
1479 		}
1480 	}
1481 
1482 	ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_ETH].cntxt_id, USE_GTS,
1483 				 SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr,
1484 				 q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token,
1485 				 1, 0);
1486 	if (ret) {
1487 		printf("error %d from t3_sge_init_ecntxt\n", ret);
1488 		goto err_unlock;
1489 	}
1490 
1491 	if (ntxq > 1) {
1492 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_OFLD].cntxt_id,
1493 					 USE_GTS, SGE_CNTXT_OFLD, id,
1494 					 q->txq[TXQ_OFLD].phys_addr,
1495 					 q->txq[TXQ_OFLD].size, 0, 1, 0);
1496 		if (ret) {
1497 			printf("error %d from t3_sge_init_ecntxt\n", ret);
1498 			goto err_unlock;
1499 		}
1500 	}
1501 
1502 	if (ntxq > 2) {
1503 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_CTRL].cntxt_id, 0,
1504 					 SGE_CNTXT_CTRL, id,
1505 					 q->txq[TXQ_CTRL].phys_addr,
1506 					 q->txq[TXQ_CTRL].size,
1507 					 q->txq[TXQ_CTRL].token, 1, 0);
1508 		if (ret) {
1509 			printf("error %d from t3_sge_init_ecntxt\n", ret);
1510 			goto err_unlock;
1511 		}
1512 	}
1513 
1514 	mtx_unlock(&sc->sge.reg_lock);
1515 	t3_update_qset_coalesce(q, p);
1516 	q->port = pi;
1517 
1518 	refill_fl(sc, &q->fl[0], q->fl[0].size);
1519 	refill_fl(sc, &q->fl[1], q->fl[1].size);
1520 	refill_rspq(sc, &q->rspq, q->rspq.size - 1);
1521 
1522 	t3_write_reg(sc, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) |
1523 		     V_NEWTIMER(q->rspq.holdoff_tmr));
1524 
1525 	return (0);
1526 
1527 err_unlock:
1528 	mtx_unlock(&sc->sge.reg_lock);
1529 err:
1530 	t3_free_qset(sc, q);
1531 
1532 	return (ret);
1533 }
1534 
1535 
1536 /**
1537  *	t3_free_qset - free the resources of an SGE queue set
1538  *	@sc: the controller owning the queue set
1539  *	@q: the queue set
1540  *
1541  *	Release the HW and SW resources associated with an SGE queue set, such
1542  *	as HW contexts, packet buffers, and descriptor rings.  Traffic to the
1543  *	queue set must be quiesced prior to calling this.
1544  */
1545 static void
1546 t3_free_qset(adapter_t *sc, struct sge_qset *q)
1547 {
1548 	int i;
1549 
1550 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
1551 		if (q->fl[i].desc) {
1552 			mtx_lock(&sc->sge.reg_lock);
1553 			t3_sge_disable_fl(sc, q->fl[i].cntxt_id);
1554 			mtx_unlock(&sc->sge.reg_lock);
1555 			bus_dmamap_unload(q->fl[i].desc_tag, q->fl[i].desc_map);
1556 			bus_dmamem_free(q->fl[i].desc_tag, q->fl[i].desc,
1557 					q->fl[i].desc_map);
1558 			bus_dma_tag_destroy(q->fl[i].desc_tag);
1559 			bus_dma_tag_destroy(q->fl[i].entry_tag);
1560 		}
1561 		if (q->fl[i].sdesc) {
1562 			free_rx_bufs(sc, &q->fl[i]);
1563 			free(q->fl[i].sdesc, M_DEVBUF);
1564 		}
1565 	}
1566 
1567 	for (i = 0; i < SGE_TXQ_PER_SET; ++i) {
1568 		if (q->txq[i].desc) {
1569 			mtx_lock(&sc->sge.reg_lock);
1570 			t3_sge_enable_ecntxt(sc, q->txq[i].cntxt_id, 0);
1571 			mtx_unlock(&sc->sge.reg_lock);
1572 			bus_dmamap_unload(q->txq[i].desc_tag,
1573 					q->txq[i].desc_map);
1574 			bus_dmamem_free(q->txq[i].desc_tag, q->txq[i].desc,
1575 					q->txq[i].desc_map);
1576 			bus_dma_tag_destroy(q->txq[i].desc_tag);
1577 			bus_dma_tag_destroy(q->txq[i].entry_tag);
1578 		}
1579 		if (q->txq[i].sdesc) {
1580 			free(q->txq[i].sdesc, M_DEVBUF);
1581 		}
1582 		if (mtx_initialized(&q->txq[i].lock)) {
1583 			mtx_destroy(&q->txq[i].lock);
1584 		}
1585 	}
1586 
1587 	if (q->rspq.desc) {
1588 		mtx_lock(&sc->sge.reg_lock);
1589 		t3_sge_disable_rspcntxt(sc, q->rspq.cntxt_id);
1590 		mtx_unlock(&sc->sge.reg_lock);
1591 
1592 		bus_dmamap_unload(q->rspq.desc_tag, q->rspq.desc_map);
1593 		bus_dmamem_free(q->rspq.desc_tag, q->rspq.desc,
1594 			        q->rspq.desc_map);
1595 		bus_dma_tag_destroy(q->rspq.desc_tag);
1596 	}
1597 
1598 	if (mtx_initialized(&q->rspq.lock))
1599 		mtx_destroy(&q->rspq.lock);
1600 
1601 	bzero(q, sizeof(*q));
1602 }
1603 
1604 /**
1605  *	t3_free_sge_resources - free SGE resources
1606  *	@sc: the adapter softc
1607  *
1608  *	Frees resources used by the SGE queue sets.
1609  */
1610 void
1611 t3_free_sge_resources(adapter_t *sc)
1612 {
1613 	int i;
1614 
1615 	for (i = 0; i < SGE_QSETS; ++i)
1616 		t3_free_qset(sc, &sc->sge.qs[i]);
1617 }
1618 
1619 /**
1620  *	t3_sge_start - enable SGE
1621  *	@sc: the controller softc
1622  *
1623  *	Enables the SGE for DMAs.  This is the last step in starting packet
1624  *	transfers.
1625  */
1626 void
1627 t3_sge_start(adapter_t *sc)
1628 {
1629 	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE);
1630 }
1631 
1632 
1633 /**
1634  *	free_tx_desc - reclaims Tx descriptors and their buffers
1635  *	@adapter: the adapter
1636  *	@q: the Tx queue to reclaim descriptors from
1637  *	@n: the number of descriptors to reclaim
1638  *
1639  *	Reclaims Tx descriptors from an SGE Tx queue and frees the associated
1640  *	Tx buffers.  Called with the Tx queue lock held.
1641  */
1642 int
1643 free_tx_desc(adapter_t *sc, struct sge_txq *q, int n, struct mbuf **m_vec)
1644 {
1645 	struct tx_sw_desc *d;
1646 	unsigned int cidx = q->cidx;
1647 	int nbufs = 0;
1648 
1649 #ifdef T3_TRACE
1650 	T3_TRACE2(sc->tb[q->cntxt_id & 7],
1651 		  "reclaiming %u Tx descriptors at cidx %u", n, cidx);
1652 #endif
1653 	d = &q->sdesc[cidx];
1654 
1655 	while (n-- > 0) {
1656 		DPRINTF("cidx=%d d=%p\n", cidx, d);
1657 		if (d->m) {
1658 			if (d->flags & TX_SW_DESC_MAPPED) {
1659 				bus_dmamap_unload(q->entry_tag, d->map);
1660 				bus_dmamap_destroy(q->entry_tag, d->map);
1661 				d->flags &= ~TX_SW_DESC_MAPPED;
1662 			}
1663 			m_vec[nbufs] = d->m;
1664 			d->m = NULL;
1665 			nbufs++;
1666 		}
1667 		++d;
1668 		if (++cidx == q->size) {
1669 			cidx = 0;
1670 			d = q->sdesc;
1671 		}
1672 	}
1673 	q->cidx = cidx;
1674 
1675 	return (nbufs);
1676 }
1677 
1678 /**
1679  *	is_new_response - check if a response is newly written
1680  *	@r: the response descriptor
1681  *	@q: the response queue
1682  *
1683  *	Returns true if a response descriptor contains a yet unprocessed
1684  *	response.
1685  */
1686 static __inline int
1687 is_new_response(const struct rsp_desc *r,
1688     const struct sge_rspq *q)
1689 {
1690 	return (r->intr_gen & F_RSPD_GEN2) == q->gen;
1691 }
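/*
 * Behaviour sketch (illustrative only): the hardware stamps each response
 * with the generation it was written under, and software flips its own
 * q->gen every time the consumer index wraps, so entries left over from
 * the previous lap of the ring compare unequal and are ignored until the
 * hardware overwrites them.
 */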
1692 
1693 #define RSPD_GTS_MASK  (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS)
1694 #define RSPD_CTRL_MASK (RSPD_GTS_MASK | \
1695 			V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \
1696 			V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \
1697 			V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR))
1698 
1699 /* How long to delay the next interrupt in case of memory shortage, in 0.1us. */
1700 #define NOMEM_INTR_DELAY 2500
1701 
1702 static __inline void
1703 deliver_partial_bundle(struct t3cdev *tdev, struct sge_rspq *q)
1704 {
1705 	;
1706 }
1707 
1708 static __inline void
1709 rx_offload(struct t3cdev *tdev, struct sge_rspq *rq,
1710     struct mbuf *m)
1711 {
1712 #ifdef notyet
1713 	if (rq->polling) {
1714 		rq->offload_skbs[rq->offload_skbs_idx++] = skb;
1715 		if (rq->offload_skbs_idx == RX_BUNDLE_SIZE) {
1716 			cxgb_ofld_recv(tdev, rq->offload_skbs, RX_BUNDLE_SIZE);
1717 			rq->offload_skbs_idx = 0;
1718 			rq->offload_bundles++;
1719 		}
1720 	} else
1721 #endif
1722 	{
1723 		/* XXX */
1724 		panic("implement offload enqueue\n");
1725 	}
1726 
1727 }
1728 
1729 static void
1730 restart_tx(struct sge_qset *qs)
1731 {
1732 	;
1733 }
1734 
1735 void
1736 t3_rx_eth(struct port_info *pi, struct sge_rspq *rq, struct mbuf *m, int ethpad)
1737 {
1738 	struct cpl_rx_pkt *cpl = (struct cpl_rx_pkt *)(m->m_data + ethpad);
1739 	struct ifnet *ifp = pi->ifp;
1740 
1741 	DPRINTF("rx_eth m=%p m->m_data=%p p->iff=%d\n", m, m->m_data, cpl->iff);
1742 	if (&pi->adapter->port[cpl->iff] != pi)
1743 		panic("bad port index %d m->m_data=%p\n", cpl->iff, m->m_data);
1744 
1745 	if ((ifp->if_capenable & IFCAP_RXCSUM) && !cpl->fragment &&
1746 	    cpl->csum_valid && cpl->csum == 0xffff) {
1747 		rspq_to_qset(rq)->port_stats[SGE_PSTAT_RX_CSUM_GOOD]++;
1748 		m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED|CSUM_IP_VALID|CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
1750 		m->m_pkthdr.csum_data = 0xffff;
1751 	}
1752 	/*
1753 	 * XXX need to add VLAN support for 6.x
1754 	 */
1755 #ifdef VLAN_SUPPORTED
1756 	if (__predict_false(cpl->vlan_valid)) {
1757 		m->m_pkthdr.ether_vtag = ntohs(cpl->vlan);
1758 		m->m_flags |= M_VLANTAG;
1759 	}
1760 #endif
1761 	m->m_pkthdr.rcvif = ifp;
1762 
1763 	m_explode(m);
1764 	m_adj(m, sizeof(*cpl) + ethpad);
1765 
1766 	(*ifp->if_input)(ifp, m);
1767 }
1768 
1769 /**
1770  *	get_packet - return the next ingress packet buffer from a free list
1771  *	@adap: the adapter that received the packet
1772  *	@drop_thres: # of remaining buffers before we start dropping packets
1773  *	@qs: the qset that the SGE free list holding the packet belongs to
1774  *      @m: the mbuf into which the packet data is placed
1775  *      @r: response descriptor
1776  *
1777  *	Get the next packet from a free list and complete setup of the
1778  *	mbuf.  If the packet is small we make a copy and recycle the
1779  *	original buffer, otherwise we use the original buffer itself.  If a
1780  *	positive drop threshold is supplied packets are dropped and their
1781  *	buffers recycled if (a) the number of remaining buffers is under the
1782  *	threshold and the packet is too big to copy, or (b) the packet should
1783  *	be copied but there is no memory for the copy.
1784  */
1785 
1786 static int
1787 get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
1788     struct mbuf *m, struct rsp_desc *r)
1789 {
1790 
1791 	unsigned int len_cq =  ntohl(r->len_cq);
1792 	struct sge_fl *fl = (len_cq & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
1793 	struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];
1794 	uint32_t len = G_RSPD_LEN(len_cq);
1795 	uint32_t flags = ntohl(r->flags);
1796 	uint8_t sopeop = G_RSPD_SOP_EOP(flags);
1797 	int ret = 0;
1798 
1799 	prefetch(sd->cl);
1800 
1801 	fl->credits--;
1802 	bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD);
1803 	bus_dmamap_unload(fl->entry_tag, sd->map);
1804 
1805 
1806 	switch(sopeop) {
1807 	case RSPQ_SOP_EOP:
1808 		DBG(DBG_RX, ("get_packet: SOP-EOP m %p\n", m));
1809 		m_cljset(m, sd->cl, fl->type);
1810 		m->m_len = m->m_pkthdr.len = len;
1811 		m->m_flags |= M_PKTHDR;
1812 		ret = 1;
1813 		goto done;
1814 		break;
1815 	case RSPQ_NSOP_NEOP:
1816 		DBG(DBG_RX, ("get_packet: NO_SOP-NO_EOP m %p\n", m));
1817 		ret = 0;
1818 		break;
1819 	case RSPQ_SOP:
1820 		DBG(DBG_RX, ("get_packet: SOP m %p\n", m));
1821 		m->m_flags |= M_PKTHDR;
1822 		m_iovinit(m);
1823 		ret = 0;
1824 		break;
1825 	case RSPQ_EOP:
1826 		DBG(DBG_RX, ("get_packet: EOP m %p\n", m));
1827 		ret = 1;
1828 		break;
1829 	}
1830 	m_iovappend(m, sd->cl, fl->buf_size, len, 0);
1831 
1832 done:
1833 	if (++fl->cidx == fl->size)
1834 		fl->cidx = 0;
1835 
1836 	return (ret);
1837 }
1838 
1839 
1840 /**
1841  *	handle_rsp_cntrl_info - handles control information in a response
1842  *	@qs: the queue set corresponding to the response
1843  *	@flags: the response control flags
1844  *
1845  *	Handles the control information of an SGE response, such as GTS
1846  *	indications and completion credits for the queue set's Tx queues.
1847  *	HW coalesces credits, we don't do any extra SW coalescing.
1848  */
1849 static __inline void
1850 handle_rsp_cntrl_info(struct sge_qset *qs, uint32_t flags)
1851 {
1852 	unsigned int credits;
1853 
1854 #if USE_GTS
1855 	if (flags & F_RSPD_TXQ0_GTS)
1856 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags);
1857 #endif
1858 	credits = G_RSPD_TXQ0_CR(flags);
1859 	if (credits) {
1860 		qs->txq[TXQ_ETH].processed += credits;
1861 		if (desc_reclaimable(&qs->txq[TXQ_ETH]) > TX_START_MAX_DESC)
1862 			taskqueue_enqueue(qs->port->adapter->tq,
1863 			    &qs->port->adapter->timer_reclaim_task);
1864 	}
1865 
1866 	credits = G_RSPD_TXQ2_CR(flags);
1867 	if (credits)
1868 		qs->txq[TXQ_CTRL].processed += credits;
1869 
1870 # if USE_GTS
1871 	if (flags & F_RSPD_TXQ1_GTS)
1872 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags);
1873 # endif
1874 	credits = G_RSPD_TXQ1_CR(flags);
1875 	if (credits)
1876 		qs->txq[TXQ_OFLD].processed += credits;
1877 }
1878 
1879 static void
1880 check_ring_db(adapter_t *adap, struct sge_qset *qs,
1881     unsigned int sleeping)
1882 {
1883 	;	/* XXX stub: doorbell kicks for queues in 'sleeping' not implemented yet */
1884 }
1885 
1886 /*
1887  * This is an awful hack to bind the ithread to a particular CPU
1888  * to work around the lack of ithread affinity.
1889  */
1890 static void
1891 bind_ithread(int cpu)
1892 {
1893 	KASSERT(cpu < mp_ncpus, ("invalid cpu identifier"));
1894 #if 0
1895 	if (mp_ncpus > 1) {
1896 		mtx_lock_spin(&sched_lock);
1897 		sched_bind(curthread, cpu);
1898 		mtx_unlock_spin(&sched_lock);
1899 	}
1900 #endif
1901 }
1902 
1903 /**
1904  *	process_responses - process responses from an SGE response queue
1905  *	@adap: the adapter
1906  *	@qs: the queue set to which the response queue belongs
1907  *	@budget: how many responses can be processed in this round
1908  *
1909  *	Process responses from an SGE response queue up to the supplied budget.
1910  *	Responses include received packets as well as credits and other events
1911  *	for the queues that belong to the response queue's queue set.
1912  *	A negative budget is effectively unlimited.
1913  *
1914  *	Additionally choose the interrupt holdoff time for the next interrupt
1915  *	on this queue.  If the system is under memory shortage use a fairly
1916  *	long delay to help recovery.
1917  */
1918 static int
1919 process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
1920 {
1921 	struct sge_rspq *rspq = &qs->rspq;
1922 	struct rsp_desc *r = &rspq->desc[rspq->cidx];
1923 	int budget_left = budget;
1924 	unsigned int sleeping = 0;
1925 	int lro = qs->lro.enabled;
1926 
1927 	static uint8_t pinned[MAXCPU];
1928 
1929 #ifdef DEBUG
1930 	static int last_holdoff = 0;
1931 	if (rspq->holdoff_tmr != last_holdoff) {
1932 		printf("next_holdoff=%d\n", rspq->holdoff_tmr);
1933 		last_holdoff = rspq->holdoff_tmr;
1934 	}
1935 #endif
1936 	if (pinned[qs->rspq.cntxt_id * adap->params.nports] == 0) {
1937 		/*
1938 		 * Assumes that cntxt_id < mp_ncpus
1939 		 */
1940 		bind_ithread(qs->rspq.cntxt_id);
1941 		pinned[qs->rspq.cntxt_id * adap->params.nports] = 1;
1942 	}
1943 	rspq->next_holdoff = rspq->holdoff_tmr;
1944 
1945 	while (__predict_true(budget_left && is_new_response(r, rspq))) {
1946 		int eth, eop = 0, ethpad = 0;
1947 		uint32_t flags = ntohl(r->flags);
1948 		uint32_t rss_csum = *(const uint32_t *)r;
1949 		uint32_t rss_hash = r->rss_hdr.rss_hash_val;
1950 
1951 		eth = (r->rss_hdr.opcode == CPL_RX_PKT);
1952 
1953 		if (__predict_false(flags & F_RSPD_ASYNC_NOTIF)) {
1954 			/* XXX */
1955 			printf("async notification\n");
1956 
1957 		} else if  (flags & F_RSPD_IMM_DATA_VALID) {
1958 			struct mbuf *m = NULL;
1959 			if (cxgb_debug)
1960 				printf("IMM DATA VALID\n");
1961 			if (rspq->m == NULL)
1962 				rspq->m = m_gethdr(M_NOWAIT, MT_DATA);
1963 			else
1964 				m = m_gethdr(M_NOWAIT, MT_DATA);
1965 
1966 			if (rspq->m == NULL || m == NULL) {
1967 				rspq->next_holdoff = NOMEM_INTR_DELAY;
1968 				budget_left--;
1969 				break;
1970 			}
1971 			get_imm_packet(adap, r, rspq->m, m);
1972 			eop = 1;
1973 			rspq->imm_data++;
1974 		} else if (r->len_cq) {
1975 			int drop_thresh = eth ? SGE_RX_DROP_THRES : 0;
1976 
1977 			if (rspq->m == NULL &&
1978 			    (rspq->m = m_gethdr(M_NOWAIT, MT_DATA)) != NULL)
1979 				rspq->m->m_flags = 0;
1980 			if (rspq->m == NULL) {
1981 				log(LOG_WARNING, "failed to get mbuf for packet\n");
1982 				break;
1983 			}
1984 
1985 			ethpad = 2;
1986 			eop = get_packet(adap, drop_thresh, qs, rspq->m, r);
1987 		} else {
1988 			DPRINTF("pure response\n");
1989 			rspq->pure_rsps++;
1990 		}
1991 
1992 		if (flags & RSPD_CTRL_MASK) {
1993 			sleeping |= flags & RSPD_GTS_MASK;
1994 			handle_rsp_cntrl_info(qs, flags);
1995 		}
1996 
1997 		r++;
1998 		if (__predict_false(++rspq->cidx == rspq->size)) {
1999 			rspq->cidx = 0;
2000 			rspq->gen ^= 1;
2001 			r = rspq->desc;
2002 		}
2003 
2004 		prefetch(r);
2005 		if (++rspq->credits >= (rspq->size / 4)) {
2006 			refill_rspq(adap, rspq, rspq->credits);
2007 			rspq->credits = 0;
2008 		}
2009 
2010 		if (eop) {
2011 			prefetch(rspq->m->m_data);
2012 			prefetch(rspq->m->m_data + L1_CACHE_BYTES);
2013 
2014 			if (eth) {
2015 				t3_rx_eth_lro(adap, rspq, rspq->m, ethpad,
2016 				    rss_hash, rss_csum, lro);
2017 
2018 				rspq->m = NULL;
2019 			} else {
2020 #ifdef notyet
2021 				if (__predict_false(r->rss_hdr.opcode == CPL_TRACE_PKT))
2022 					m_adj(m, 2);
2023 
2024 				rx_offload(&adap->tdev, rspq, m);
2025 #endif
2026 			}
2027 #ifdef notyet
2028 			taskqueue_enqueue(adap->tq, &adap->timer_reclaim_task);
2029 #else
2030 			__refill_fl(adap, &qs->fl[0]);
2031 			__refill_fl(adap, &qs->fl[1]);
2032 #endif
2033 		}
2034 		--budget_left;
2035 	}
2036 	t3_sge_lro_flush_all(adap, qs);
2037 	deliver_partial_bundle(&adap->tdev, rspq);
2038 
2039 	if (sleeping)
2040 		check_ring_db(adap, qs, sleeping);
2041 
2042 	smp_mb();  /* commit Tx queue processed updates */
2043 	if (__predict_false(qs->txq_stopped != 0))
2044 		restart_tx(qs);
2045 
2046 	budget -= budget_left;
2047 	return (budget);
2048 }
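
/*
 * Illustrative sketch (not part of this driver): since process_responses()
 * returns the number of responses it consumed, a polling-style caller could
 * compare that against its budget to decide whether more work is pending.
 * EXAMPLE_POLL_BUDGET and example_reschedule() are assumed names for the
 * sake of the sketch, not interfaces of this driver.
 */
#if 0	/* example only, kept out of the build */
#define EXAMPLE_POLL_BUDGET	64

static void
example_poll(adapter_t *adap, struct sge_qset *qs)
{
	int done;

	done = process_responses(adap, qs, EXAMPLE_POLL_BUDGET);
	if (done == EXAMPLE_POLL_BUDGET)
		example_reschedule(qs);	/* budget exhausted, likely more work */
}
#endif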
2049 
2050 /*
2051  * A helper function that processes responses and issues GTS.
2052  */
2053 static __inline int
2054 process_responses_gts(adapter_t *adap, struct sge_rspq *rq)
2055 {
2056 	int work;
2057 	static int last_holdoff = 0;
2058 
2059 	work = process_responses(adap, rspq_to_qset(rq), -1);
2060 
2061 	if (cxgb_debug && (rq->next_holdoff != last_holdoff)) {
2062 		printf("next_holdoff=%d\n", rq->next_holdoff);
2063 		last_holdoff = rq->next_holdoff;
2064 	}
2065 
2066 	t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) |
2067 		     V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx));
2068 	return (work);
2069 }
2070 
2071 
2072 /*
2073  * Interrupt handler for legacy INTx interrupts for T3B-based cards.
2074  * Handles data events from SGE response queues as well as error and other
2075  * async events as they all use the same interrupt pin.  We use one SGE
2076  * response queue per port in this mode and protect all response queues with
2077  * queue 0's lock.
2078  */
2079 void
2080 t3b_intr(void *data)
2081 {
2082 	uint32_t map;
2083 	adapter_t *adap = data;
2084 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
2085 	struct sge_rspq *q1 = &adap->sge.qs[1].rspq;
2086 
2087 	t3_write_reg(adap, A_PL_CLI, 0);
2088 	map = t3_read_reg(adap, A_SG_DATA_INTR);
2089 
2090 	if (!map)
2091 		return;
2092 
2093 	if (__predict_false(map & F_ERRINTR))
2094 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
2095 
2096 	mtx_lock(&q0->lock);
2097 
2098 	if (__predict_true(map & 1))
2099 		process_responses_gts(adap, q0);
2100 
2101 	if (map & 2)
2102 		process_responses_gts(adap, q1);
2103 
2104 	mtx_unlock(&q0->lock);
2105 }
2106 
2107 /*
2108  * The MSI interrupt handler.  This needs to handle data events from SGE
2109  * response queues as well as error and other async events as they all use
2110  * the same MSI vector.  We use one SGE response queue per port in this mode
2111  * and protect all response queues with queue 0's lock.
2112  */
2113 void
2114 t3_intr_msi(void *data)
2115 {
2116 	adapter_t *adap = data;
2117 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
2118 	struct sge_rspq *q1 = &adap->sge.qs[1].rspq;
2119 	int new_packets = 0;
2120 
2121 	mtx_lock(&q0->lock);
2122 	if (process_responses_gts(adap, q0)) {
2123 		new_packets = 1;
2124 	}
2125 
2126 	if (adap->params.nports == 2 &&
2127 	    process_responses_gts(adap, q1)) {
2128 		new_packets = 1;
2129 	}
2130 
2131 	mtx_unlock(&q0->lock);
2132 	if (new_packets == 0)
2133 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
2134 }
2135 
2136 void
2137 t3_intr_msix(void *data)
2138 {
2139 	struct sge_qset *qs = data;
2140 	adapter_t *adap = qs->port->adapter;
2141 	struct sge_rspq *rspq = &qs->rspq;
2142 
2143 	mtx_lock(&rspq->lock);
2144 	if (process_responses_gts(adap, rspq) == 0) {
2145 #ifdef notyet
2146 		rspq->unhandled_irqs++;
2147 #endif
2148 	}
2149 	mtx_unlock(&rspq->lock);
2150 }
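
/*
 * Illustrative sketch (not part of this driver): how the three handlers
 * above might be wired up.  With MSI-X each queue set gets its own vector
 * and t3_intr_msix() is registered once per qset; with MSI or legacy INTx a
 * single vector services the whole adapter.  bus_setup_intr() is shown in
 * its pre-FreeBSD-7 six-argument form, and the irq_res/intr_tag fields are
 * assumptions about the adapter softc, so treat this purely as a sketch.
 */
#if 0	/* example only, kept out of the build */
static int
example_setup_interrupt(adapter_t *adap)
{
	driver_intr_t *handler;
	void *arg;

	if (adap->flags & USING_MSIX) {
		/* real code would loop over all vectors/queue sets */
		handler = t3_intr_msix;
		arg = &adap->sge.qs[0];
	} else if (adap->flags & USING_MSI) {
		handler = t3_intr_msi;
		arg = adap;
	} else {
		handler = t3b_intr;	/* legacy INTx on T3B parts */
		arg = adap;
	}
	return (bus_setup_intr(adap->dev, adap->irq_res,
	    INTR_TYPE_NET | INTR_MPSAFE, handler, arg, &adap->intr_tag));
}
#endif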
2151 
2152 /*
2153  * broken by recent mbuf changes
2154  */
2155 static int
2156 t3_lro_enable(SYSCTL_HANDLER_ARGS)
2157 {
2158 	adapter_t *sc;
2159 	int i, j, enabled, err, nqsets = 0;
2160 
2161 #ifndef LRO_WORKING
2162 	return (0);
2163 #endif
2164 
2165 	sc = arg1;
2166 	enabled = sc->sge.qs[0].lro.enabled;
2167 	err = sysctl_handle_int(oidp, &enabled, arg2, req);
2168 
2169 	if (err != 0) {
2170 		return (err);
2171 	}
2172 	if (enabled == sc->sge.qs[0].lro.enabled)
2173 		return (0);
2174 
2175 	for (i = 0; i < sc->params.nports; i++)
2176 		for (j = 0; j < sc->port[i].nqsets; j++)
2177 			nqsets++;
2178 
2179 	for (i = 0; i < nqsets; i++) {
2180 		sc->sge.qs[i].lro.enabled = enabled;
2181 	}
2182 
2183 	return (0);
2184 }
2185 
2186 static int
2187 t3_set_coalesce_nsecs(SYSCTL_HANDLER_ARGS)
2188 {
2189 	adapter_t *sc = arg1;
2190 	struct qset_params *qsp = &sc->params.sge.qset[0];
2191 	int coalesce_nsecs;
2192 	struct sge_qset *qs;
2193 	int i, j, err, nqsets = 0;
2194 	struct mtx *lock;
2195 
2196 	coalesce_nsecs = qsp->coalesce_nsecs;
2197 	err = sysctl_handle_int(oidp, &coalesce_nsecs, arg2, req);
2198 
2199 	if (err != 0) {
2200 		return (err);
2201 	}
2202 	if (coalesce_nsecs == qsp->coalesce_nsecs)
2203 		return (0);
2204 
2205 	for (i = 0; i < sc->params.nports; i++)
2206 		for (j = 0; j < sc->port[i].nqsets; j++)
2207 			nqsets++;
2208 
2209 	coalesce_nsecs = max(100, coalesce_nsecs);
2210 
2211 	for (i = 0; i < nqsets; i++) {
2212 		qs = &sc->sge.qs[i];
2213 		qsp = &sc->params.sge.qset[i];
2214 		qsp->coalesce_nsecs = coalesce_nsecs;
2215 
2216 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
2217 			    &sc->sge.qs[0].rspq.lock;
2218 
2219 		mtx_lock(lock);
2220 		t3_update_qset_coalesce(qs, qsp);
2221 		t3_write_reg(sc, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) |
2222 		    V_NEWTIMER(qs->rspq.holdoff_tmr));
2223 		mtx_unlock(lock);
2224 	}
2225 
2226 	return (0);
2227 }
2228 
2229 
2230 void
2231 t3_add_sysctls(adapter_t *sc)
2232 {
2233 	struct sysctl_ctx_list *ctx;
2234 	struct sysctl_oid_list *children;
2235 
2236 	ctx = device_get_sysctl_ctx(sc->dev);
2237 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
2238 
2239 	/* random information */
2240 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
2241 	    "firmware_version",
2242 	    CTLFLAG_RD, &sc->fw_version,
2243 	    0, "firmware version");
2244 
2245 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
2246 	    "enable_lro",
2247 	    CTLTYPE_INT|CTLFLAG_RW, sc,
2248 	    0, t3_lro_enable,
2249 	    "I", "enable large receive offload");
2250 
2251 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
2252 	    "intr_coal",
2253 	    CTLTYPE_INT|CTLFLAG_RW, sc,
2254 	    0, t3_set_coalesce_nsecs,
2255 	    "I", "interrupt coalescing timer (ns)");
2256 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
2257 	    "enable_debug",
2258 	    CTLFLAG_RW, &cxgb_debug,
2259 	    0, "enable verbose debugging output");
2260 
2261 }
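
/*
 * Usage note (assuming the controller device attaches as "cxgbc", so its
 * sysctl tree lives under dev.cxgbc.<unit>): the knobs added above can be
 * inspected and set from userland, e.g.
 *
 *	sysctl dev.cxgbc.0.firmware_version
 *	sysctl dev.cxgbc.0.intr_coal=50000	# coalescing timer, in ns
 *	sysctl dev.cxgbc.0.enable_debug=1
 */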
2262 
2263 /**
2264  *	t3_get_desc - dump an SGE descriptor for debugging purposes
2265  *	@qs: the queue set
2266  *	@qnum: identifies the specific queue (0..2: Tx, 3:response, 4..5: Rx)
2267  *	@idx: the descriptor index in the queue
2268  *	@data: where to dump the descriptor contents
2269  *
2270  *	Dumps the contents of a HW descriptor of an SGE queue.  Returns the
2271  *	size of the descriptor.
2272  */
2273 int
2274 t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx,
2275 		unsigned char *data)
2276 {
2277 	if (qnum >= 6)
2278 		return (EINVAL);
2279 
2280 	if (qnum < 3) {
2281 		if (!qs->txq[qnum].desc || idx >= qs->txq[qnum].size)
2282 			return (EINVAL);
2283 		memcpy(data, &qs->txq[qnum].desc[idx], sizeof(struct tx_desc));
2284 		return sizeof(struct tx_desc);
2285 	}
2286 
2287 	if (qnum == 3) {
2288 		if (!qs->rspq.desc || idx >= qs->rspq.size)
2289 			return (EINVAL);
2290 		memcpy(data, &qs->rspq.desc[idx], sizeof(struct rsp_desc));
2291 		return sizeof(struct rsp_desc);
2292 	}
2293 
2294 	qnum -= 4;
2295 	if (!qs->fl[qnum].desc || idx >= qs->fl[qnum].size)
2296 		return (EINVAL);
2297 	memcpy(data, &qs->fl[qnum].desc[idx], sizeof(struct rx_desc));
2298 	return sizeof(struct rx_desc);
2299 }
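
/*
 * Illustrative sketch (not part of this driver): a small debugging helper
 * that hex-dumps one descriptor via t3_get_desc().  The union sizes the
 * buffer for whichever descriptor type is largest.
 */
#if 0	/* example only, kept out of the build */
static void
example_dump_desc(const struct sge_qset *qs, unsigned int qnum,
    unsigned int idx)
{
	union {
		struct tx_desc txd;
		struct rsp_desc rspd;
		struct rx_desc rxd;
	} buf;
	int i, len;

	len = t3_get_desc(qs, qnum, idx, (unsigned char *)&buf);
	if (len <= 0 || len == EINVAL)	/* bad queue number or index */
		return;
	for (i = 0; i < len; i++)
		printf("%02x%s", ((unsigned char *)&buf)[i],
		    ((i & 15) == 15) ? "\n" : " ");
	if (len & 15)
		printf("\n");
}
#endif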
2300