xref: /freebsd/sys/dev/cxgb/cxgb_sge.c (revision f0a75d274af375d15b97b830966b99a02b7db911)
1 /**************************************************************************
2 
3 Copyright (c) 2007, Chelsio Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Redistributions in binary form must reproduce the above copyright
13     notice, this list of conditions and the following disclaimer in the
14     documentation and/or other materials provided with the distribution.
15 
16  3. Neither the name of the Chelsio Corporation nor the names of its
17     contributors may be used to endorse or promote products derived from
18     this software without specific prior written permission.
19 
20 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
24 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 POSSIBILITY OF SUCH DAMAGE.
31 
32 ***************************************************************************/
33 
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36 
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/module.h>
41 #include <sys/bus.h>
42 #include <sys/conf.h>
43 #include <machine/bus.h>
44 #include <machine/resource.h>
45 #include <sys/bus_dma.h>
46 #include <sys/rman.h>
47 #include <sys/queue.h>
48 #include <sys/sysctl.h>
49 #include <sys/taskqueue.h>
50 
51 
52 #include <sys/proc.h>
53 #include <sys/sched.h>
54 #include <sys/smp.h>
55 #include <sys/systm.h>
56 
57 #include <netinet/in_systm.h>
58 #include <netinet/in.h>
59 #include <netinet/ip.h>
60 #include <netinet/tcp.h>
61 
62 #include <dev/pci/pcireg.h>
63 #include <dev/pci/pcivar.h>
64 #include <dev/cxgb/common/cxgb_common.h>
65 #include <dev/cxgb/common/cxgb_regs.h>
66 #include <dev/cxgb/common/cxgb_sge_defs.h>
67 #include <dev/cxgb/common/cxgb_t3_cpl.h>
68 #include <dev/cxgb/common/cxgb_firmware_exports.h>
69 
70 #define USE_GTS 0
71 
72 #define SGE_RX_SM_BUF_SIZE	1536
73 #define SGE_RX_DROP_THRES	16
74 
75 /*
76  * Period of the Tx buffer reclaim timer.  This timer does not need to run
77  * frequently as Tx buffers are usually reclaimed by new Tx packets.
78  */
79 #define TX_RECLAIM_PERIOD       (hz >> 2)
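/*
 * hz >> 2 is hz / 4, i.e. the reclaim timer fires roughly four times a
 * second (every 250 ms at hz = 1000).
 */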
80 
81 /*
82  * work request size in bytes
83  */
84 #define WR_LEN (WR_FLITS * 8)
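/*
 * A flit is one 64-bit (8-byte) word of a Tx descriptor, so this is simply
 * the maximum number of flits in a work request converted to bytes.
 */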
85 
86 /*
87  * Values for sge_txq.flags
88  */
89 enum {
90 	TXQ_RUNNING	= 1 << 0,  /* fetch engine is running */
91 	TXQ_LAST_PKT_DB = 1 << 1,  /* last packet rang the doorbell */
92 };
93 
94 struct tx_desc {
95 	uint64_t	flit[TX_DESC_FLITS];
96 } __packed;
97 
98 struct rx_desc {
99 	uint32_t	addr_lo;
100 	uint32_t	len_gen;
101 	uint32_t	gen2;
102 	uint32_t	addr_hi;
103 } __packed;
104 
105 struct rsp_desc {               /* response queue descriptor */
106 	struct rss_header	rss_hdr;
107 	uint32_t		flags;
108 	uint32_t		len_cq;
109 	uint8_t			imm_data[47];
110 	uint8_t			intr_gen;
111 } __packed;
112 
113 #define RX_SW_DESC_MAP_CREATED	(1 << 0)
114 #define RX_SW_DESC_INUSE        (1 << 3)
115 #define TX_SW_DESC_MAPPED       (1 << 4)
116 
117 #define RSPQ_NSOP_NEOP           G_RSPD_SOP_EOP(0)
118 #define RSPQ_EOP                 G_RSPD_SOP_EOP(F_RSPD_EOP)
119 #define RSPQ_SOP                 G_RSPD_SOP_EOP(F_RSPD_SOP)
120 #define RSPQ_SOP_EOP             G_RSPD_SOP_EOP(F_RSPD_SOP|F_RSPD_EOP)
121 
122 struct tx_sw_desc {                /* SW state per Tx descriptor */
123 	struct mbuf	*m;
124 	bus_dmamap_t	map;
125 	int		flags;
126 };
127 
128 struct rx_sw_desc {                /* SW state per Rx descriptor */
129 	void	        *cl;
130 	bus_dmamap_t	map;
131 	int		flags;
132 };
133 
134 struct txq_state {
135 	unsigned int compl;
136 	unsigned int gen;
137 	unsigned int pidx;
138 };
139 
140 struct refill_fl_cb_arg {
141 	int               error;
142 	bus_dma_segment_t seg;
143 	int               nseg;
144 };
145 
146 /*
147  * Maps a number of flits to the number of Tx descriptors that can hold them.
148  * The formula is
149  *
150  * desc = 1 + (flits - 2) / (WR_FLITS - 1).
151  *
152  * HW allows up to 4 descriptors to be combined into a WR.
153  */
154 static uint8_t flit_desc_map[] = {
155 	0,
156 #if SGE_NUM_GENBITS == 1
157 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
158 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
159 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
160 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
161 #elif SGE_NUM_GENBITS == 2
162 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
163 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
164 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
165 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
166 #else
167 # error "SGE_NUM_GENBITS must be 1 or 2"
168 #endif
169 };
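/*
 * Example, reading straight from the table above: a 17-flit work request
 * maps to flit_desc_map[17] == 2, i.e. it needs a second Tx descriptor
 * with either SGE_NUM_GENBITS setting.
 */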
170 
171 
172 static int lro_default = 0;
173 int cxgb_debug = 0;
174 
175 static void t3_free_qset(adapter_t *sc, struct sge_qset *q);
176 static void sge_timer_cb(void *arg);
177 static void sge_timer_reclaim(void *arg, int ncount);
178 static int free_tx_desc(adapter_t *sc, struct sge_txq *q, int n, struct mbuf **m_vec);
179 
180 /**
181  *	reclaim_completed_tx - reclaims completed Tx descriptors
182  *	@adapter: the adapter
183  *	@q: the Tx queue to reclaim completed descriptors from
184  *
185  *	Reclaims Tx descriptors that the SGE has indicated it has processed
186  *	and collects the associated mbufs in @mvec for the caller to free.
187  *	Called with the Tx queue's lock held.
188  */
189 static __inline int
190 reclaim_completed_tx(adapter_t *adapter, struct sge_txq *q, int nbufs, struct mbuf **mvec)
191 {
192 	int reclaimed, reclaim = desc_reclaimable(q);
193 	int n = 0;
194 
195 	mtx_assert(&q->lock, MA_OWNED);
196 
197 	if (reclaim > 0) {
198 		n = free_tx_desc(adapter, q, min(reclaim, nbufs), mvec);
199 		reclaimed = min(reclaim, nbufs);
200 		q->cleaned += reclaimed;
201 		q->in_use -= reclaimed;
202 	}
203 
204 	return (n);
205 }
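/*
 * Callers follow the pattern used by sge_timer_reclaim() below: take the
 * Tx queue lock, reclaim into a local mbuf vector, drop the lock, and only
 * then m_freem() the returned mbufs so that freeing happens unlocked.
 */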
206 
207 /**
208  *	t3_sge_init - initialize SGE
209  *	@adap: the adapter
210  *	@p: the SGE parameters
211  *
212  *	Performs SGE initialization needed every time after a chip reset.
213  *	We do not initialize any of the queue sets here; instead, the driver's
214  *	top level must request those individually.  We also do not enable DMA
215  *	here; that should be done after the queues have been set up.
216  */
217 void
218 t3_sge_init(adapter_t *adap, struct sge_params *p)
219 {
220 	u_int ctrl, ups;
221 
222 	ups = 0; /* = ffs(pci_resource_len(adap->pdev, 2) >> 12); */
223 
224 	ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL |
225 	       F_CQCRDTCTRL |
226 	       V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS |
227 	       V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING;
228 #if SGE_NUM_GENBITS == 1
229 	ctrl |= F_EGRGENCTRL;
230 #endif
231 	if (adap->params.rev > 0) {
232 		if (!(adap->flags & (USING_MSIX | USING_MSI)))
233 			ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ;
234 		ctrl |= F_CQCRDTCTRL | F_AVOIDCQOVFL;
235 	}
236 	t3_write_reg(adap, A_SG_CONTROL, ctrl);
237 	t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) |
238 		     V_LORCQDRBTHRSH(512));
239 	t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10);
240 	t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) |
241 		     V_TIMEOUT(200 * core_ticks_per_usec(adap)));
242 	t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH, 1000);
243 	t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256);
244 	t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000);
245 	t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256);
246 	t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff));
247 	t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024);
248 }
249 
250 
251 /**
252  *	sgl_len - calculates the size of an SGL of the given capacity
253  *	@n: the number of SGL entries
254  *
255  *	Calculates the number of flits needed for a scatter/gather list that
256  *	can hold the given number of entries.
257  */
258 static __inline unsigned int
259 sgl_len(unsigned int n)
260 {
261 	return ((3 * n) / 2 + (n & 1));
262 }
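/*
 * Each pair of SGL entries packs into three flits (two 64-bit addresses
 * plus two 32-bit lengths) and a lone trailing entry takes two, hence the
 * formula above; e.g. sgl_len(3) = (3 * 3) / 2 + 1 = 5 flits.
 */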
263 
264 /**
265  *	get_imm_packet - return the next ingress packet buffer from a response
266  *	@resp: the response descriptor containing the packet data
267  *
268  *	Return a packet containing the immediate data of the given response.
269  */
270 static __inline int
271 get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct t3_mbuf_hdr *mh)
272 {
273 	struct mbuf *m;
274 	int len;
275 	uint32_t flags = ntohl(resp->flags);
276 	uint8_t sopeop = G_RSPD_SOP_EOP(flags);
277 
278 	/*
279 	 * would be a firmware bug
280 	 */
281 	if (sopeop == RSPQ_NSOP_NEOP || sopeop == RSPQ_SOP)
282 		return (0);
283 
284 	m = m_gethdr(M_NOWAIT, MT_DATA);
285 	len = G_RSPD_LEN(ntohl(resp->len_cq));
286 
287 	if (m) {
288 		MH_ALIGN(m, IMMED_PKT_SIZE);
289 		memcpy(m->m_data, resp->imm_data, IMMED_PKT_SIZE);
290 		m->m_len = len;
291 
292 		switch (sopeop) {
293 		case RSPQ_SOP_EOP:
294 			mh->mh_head = mh->mh_tail = m;
295 			m->m_pkthdr.len = len;
296 			m->m_flags |= M_PKTHDR;
297 			break;
298 		case RSPQ_EOP:
299 			m->m_flags &= ~M_PKTHDR;
300 			mh->mh_head->m_pkthdr.len += len;
301 			mh->mh_tail->m_next = m;
302 			mh->mh_tail = m;
303 			break;
304 		}
305 	}
306 	return (m != NULL);
307 }
308 
309 
310 static __inline u_int
311 flits_to_desc(u_int n)
312 {
313 	return (flit_desc_map[n]);
314 }
315 
316 void
317 t3_sge_err_intr_handler(adapter_t *adapter)
318 {
319 	unsigned int v, status;
320 
321 
322 	status = t3_read_reg(adapter, A_SG_INT_CAUSE);
323 
324 	if (status & F_RSPQCREDITOVERFOW)
325 		CH_ALERT(adapter, "SGE response queue credit overflow\n");
326 
327 	if (status & F_RSPQDISABLED) {
328 		v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS);
329 
330 		CH_ALERT(adapter,
331 			 "packet delivered to disabled response queue (0x%x)\n",
332 			 (v >> S_RSPQ0DISABLED) & 0xff);
333 	}
334 
335 	t3_write_reg(adapter, A_SG_INT_CAUSE, status);
336 	if (status & (F_RSPQCREDITOVERFOW | F_RSPQDISABLED))
337 		t3_fatal_err(adapter);
338 }
339 
340 void
341 t3_sge_prep(adapter_t *adap, struct sge_params *p)
342 {
343 	int i;
344 
345 	/* XXX Does ETHER_ALIGN need to be accounted for here? */
346 	p->max_pkt_size = MJUM16BYTES - sizeof(struct cpl_rx_data);
347 
348 	for (i = 0; i < SGE_QSETS; ++i) {
349 		struct qset_params *q = p->qset + i;
350 
351 		q->polling = adap->params.rev > 0;
352 
353 		if (adap->flags & USING_MSIX)
354 			q->coalesce_nsecs = 6000;
355 		else
356 			q->coalesce_nsecs = 3500;
357 
358 		q->rspq_size = RSPQ_Q_SIZE;
359 		q->fl_size = FL_Q_SIZE;
360 		q->jumbo_size = JUMBO_Q_SIZE;
361 		q->txq_size[TXQ_ETH] = TX_ETH_Q_SIZE;
362 		q->txq_size[TXQ_OFLD] = 1024;
363 		q->txq_size[TXQ_CTRL] = 256;
364 		q->cong_thres = 0;
365 	}
366 }
367 
368 int
369 t3_sge_alloc(adapter_t *sc)
370 {
371 
372 	/* The parent tag. */
373 	if (bus_dma_tag_create( NULL,			/* parent */
374 				1, 0,			/* algnmnt, boundary */
375 				BUS_SPACE_MAXADDR,	/* lowaddr */
376 				BUS_SPACE_MAXADDR,	/* highaddr */
377 				NULL, NULL,		/* filter, filterarg */
378 				BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
379 				BUS_SPACE_UNRESTRICTED, /* nsegments */
380 				BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
381 				0,			/* flags */
382 				NULL, NULL,		/* lock, lockarg */
383 				&sc->parent_dmat)) {
384 		device_printf(sc->dev, "Cannot allocate parent DMA tag\n");
385 		return (ENOMEM);
386 	}
387 
388 	/*
389 	 * DMA tag for normal sized RX frames
390 	 */
391 	if (bus_dma_tag_create(sc->parent_dmat, MCLBYTES, 0, BUS_SPACE_MAXADDR,
392 		BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, 1,
393 		MCLBYTES, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_dmat)) {
394 		device_printf(sc->dev, "Cannot allocate RX DMA tag\n");
395 		return (ENOMEM);
396 	}
397 
398 	/*
399 	 * DMA tag for jumbo sized RX frames.
400 	 */
401 	if (bus_dma_tag_create(sc->parent_dmat, MJUMPAGESIZE, 0, BUS_SPACE_MAXADDR,
402 		BUS_SPACE_MAXADDR, NULL, NULL, MJUMPAGESIZE, 1, MJUMPAGESIZE,
403 		BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_jumbo_dmat)) {
404 		device_printf(sc->dev, "Cannot allocate RX jumbo DMA tag\n");
405 		return (ENOMEM);
406 	}
407 
408 	/*
409 	 * DMA tag for TX frames.
410 	 */
411 	if (bus_dma_tag_create(sc->parent_dmat, 1, 0, BUS_SPACE_MAXADDR,
412 		BUS_SPACE_MAXADDR, NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
413 		TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
414 		NULL, NULL, &sc->tx_dmat)) {
415 		device_printf(sc->dev, "Cannot allocate TX DMA tag\n");
416 		return (ENOMEM);
417 	}
418 
419 	return (0);
420 }
421 
422 int
423 t3_sge_free(struct adapter * sc)
424 {
425 
426 	if (sc->tx_dmat != NULL)
427 		bus_dma_tag_destroy(sc->tx_dmat);
428 
429 	if (sc->rx_jumbo_dmat != NULL)
430 		bus_dma_tag_destroy(sc->rx_jumbo_dmat);
431 
432 	if (sc->rx_dmat != NULL)
433 		bus_dma_tag_destroy(sc->rx_dmat);
434 
435 	if (sc->parent_dmat != NULL)
436 		bus_dma_tag_destroy(sc->parent_dmat);
437 
438 	return (0);
439 }
440 
441 void
442 t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p)
443 {
444 
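	/*
	 * The division by 100 converts coalesce_nsecs into SGE holdoff-timer
	 * ticks; t3_sge_init() programs A_SG_TIMER_TICK to one tenth of the
	 * core ticks per microsecond, which appears to give a tick of roughly
	 * 100 ns.
	 */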
445 	qs->rspq.holdoff_tmr = max(p->coalesce_nsecs/100, 1U);
446 	qs->rspq.polling = 0 /* p->polling */;
447 }
448 
449 static void
450 refill_fl_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
451 {
452 	struct refill_fl_cb_arg *cb_arg = arg;
453 
454 	cb_arg->error = error;
455 	cb_arg->seg = segs[0];
456 	cb_arg->nseg = nseg;
457 
458 }
459 
460 /**
461  *	refill_fl - refill an SGE free-buffer list
462  *	@sc: the controller softc
463  *	@q: the free-list to refill
464  *	@n: the number of new buffers to allocate
465  *
466  *	(Re)populate an SGE free-buffer list with up to @n new packet buffers.
467  *	The caller must ensure that @n does not exceed the queue's capacity.
468  */
469 static void
470 refill_fl(adapter_t *sc, struct sge_fl *q, int n)
471 {
472 	struct rx_sw_desc *sd = &q->sdesc[q->pidx];
473 	struct rx_desc *d = &q->desc[q->pidx];
474 	struct refill_fl_cb_arg cb_arg;
475 	void *cl;
476 	int err;
477 
478 	cb_arg.error = 0;
479 	while (n--) {
480 		/*
481 		 * We only allocate a cluster, mbuf allocation happens after rx
482 		 */
483 		if ((cl = m_cljget(NULL, M_DONTWAIT, q->buf_size)) == NULL) {
484 			log(LOG_WARNING, "Failed to allocate cluster\n");
485 			goto done;
486 		}
487 		if ((sd->flags & RX_SW_DESC_MAP_CREATED) == 0) {
488 			if ((err = bus_dmamap_create(q->entry_tag, 0, &sd->map))) {
489 				log(LOG_WARNING, "bus_dmamap_create failed %d\n", err);
490 				/*
491 				 * XXX free cluster
492 				 */
493 				goto done;
494 			}
495 			sd->flags |= RX_SW_DESC_MAP_CREATED;
496 		}
497 		err = bus_dmamap_load(q->entry_tag, sd->map, cl, q->buf_size,
498 		    refill_fl_cb, &cb_arg, 0);
499 
500 		if (err != 0 || cb_arg.error) {
501 			log(LOG_WARNING, "failure in refill_fl %d\n", cb_arg.error);
502 			/*
503 			 * XXX free cluster
504 			 */
505 			return;
506 		}
507 
508 		sd->flags |= RX_SW_DESC_INUSE;
509 		sd->cl = cl;
510 		d->addr_lo = htobe32(cb_arg.seg.ds_addr & 0xffffffff);
511 		d->addr_hi = htobe32(((uint64_t)cb_arg.seg.ds_addr >> 32) & 0xffffffff);
512 		d->len_gen = htobe32(V_FLD_GEN1(q->gen));
513 		d->gen2 = htobe32(V_FLD_GEN2(q->gen));
514 
515 		d++;
516 		sd++;
517 
518 		if (++q->pidx == q->size) {
519 			q->pidx = 0;
520 			q->gen ^= 1;
521 			sd = q->sdesc;
522 			d = q->desc;
523 		}
524 		q->credits++;
525 	}
526 
527 done:
528 	t3_write_reg(sc, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
529 }
530 
531 
532 /**
533  *	free_rx_bufs - free the Rx buffers on an SGE free list
534  *	@sc: the controller softc
535  *	@q: the SGE free list to clean up
536  *
537  *	Release the buffers on an SGE free-buffer Rx queue.  HW fetching from
538  *	this queue should be stopped before calling this function.
539  */
540 static void
541 free_rx_bufs(adapter_t *sc, struct sge_fl *q)
542 {
543 	u_int cidx = q->cidx;
544 
545 	while (q->credits--) {
546 		struct rx_sw_desc *d = &q->sdesc[cidx];
547 
548 		if (d->flags & RX_SW_DESC_INUSE) {
549 			bus_dmamap_unload(q->entry_tag, d->map);
550 			bus_dmamap_destroy(q->entry_tag, d->map);
551 			uma_zfree(q->zone, d->cl);
552 		}
553 		d->cl = NULL;
554 		if (++cidx == q->size)
555 			cidx = 0;
556 	}
557 }
558 
559 static __inline void
560 __refill_fl(adapter_t *adap, struct sge_fl *fl)
561 {
562 	refill_fl(adap, fl, min(16U, fl->size - fl->credits));
563 }
564 
565 static void
566 alloc_ring_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
567 {
568 	uint32_t *addr;
569 
570 	addr = arg;
571 	*addr = segs[0].ds_addr;
572 }
573 
574 static int
575 alloc_ring(adapter_t *sc, size_t nelem, size_t elem_size, size_t sw_size,
576     bus_addr_t *phys, void *desc, void *sdesc, bus_dma_tag_t *tag,
577     bus_dmamap_t *map, bus_dma_tag_t parent_entry_tag, bus_dma_tag_t *entry_tag)
578 {
579 	size_t len = nelem * elem_size;
580 	void *s = NULL;
581 	void *p = NULL;
582 	int err;
583 
584 	if ((err = bus_dma_tag_create(sc->parent_dmat, PAGE_SIZE, 0,
585 				      BUS_SPACE_MAXADDR_32BIT,
586 				      BUS_SPACE_MAXADDR, NULL, NULL, len, 1,
587 				      len, 0, NULL, NULL, tag)) != 0) {
588 		device_printf(sc->dev, "Cannot allocate descriptor tag\n");
589 		return (ENOMEM);
590 	}
591 
592 	if ((err = bus_dmamem_alloc(*tag, (void **)&p, BUS_DMA_NOWAIT,
593 				    map)) != 0) {
594 		device_printf(sc->dev, "Cannot allocate descriptor memory\n");
595 		return (ENOMEM);
596 	}
597 
598 	bus_dmamap_load(*tag, *map, p, len, alloc_ring_cb, phys, 0);
599 	bzero(p, len);
600 	*(void **)desc = p;
601 
602 	if (sw_size) {
603 		len = nelem * sw_size;
604 		s = malloc(len, M_DEVBUF, M_WAITOK);
605 		bzero(s, len);
606 		*(void **)sdesc = s;
607 	}
608 	if (parent_entry_tag == NULL)
609 		return (0);
610 
611 	if ((err = bus_dma_tag_create(parent_entry_tag, PAGE_SIZE, 0,
612 				      BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR,
613 		                      NULL, NULL, PAGE_SIZE, 1,
614 				      PAGE_SIZE, BUS_DMA_ALLOCNOW,
615 		                      NULL, NULL, entry_tag)) != 0) {
616 		device_printf(sc->dev, "Cannot allocate descriptor entry tag\n");
617 		return (ENOMEM);
618 	}
619 	return (0);
620 }
621 
622 static void
623 sge_slow_intr_handler(void *arg, int ncount)
624 {
625 	adapter_t *sc = arg;
626 
627 	t3_slow_intr_handler(sc);
628 }
629 
630 static void
631 sge_timer_cb(void *arg)
632 {
633 	adapter_t *sc = arg;
634 	struct sge_qset *qs;
635 	struct sge_txq  *txq;
636 	int i, j;
637 	int reclaim_eth, reclaim_ofl, refill_rx;
638 
639 	for (i = 0; i < sc->params.nports; i++)
640 		for (j = 0; j < sc->port[i].nqsets; j++) {
641 			qs = &sc->sge.qs[i + j];
642 			txq = &qs->txq[0];
643 			reclaim_eth = txq[TXQ_ETH].processed - txq[TXQ_ETH].cleaned;
644 			reclaim_ofl = txq[TXQ_OFLD].processed - txq[TXQ_OFLD].cleaned;
645 			refill_rx = ((qs->fl[0].credits < qs->fl[0].size) ||
646 			    (qs->fl[1].credits < qs->fl[1].size));
647 			if (reclaim_eth || reclaim_ofl || refill_rx) {
648 				taskqueue_enqueue(sc->tq, &sc->timer_reclaim_task);
649 				goto done;
650 			}
651 		}
652 done:
653 	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
654 }
655 
656 /*
657  * This is meant to be a catch-all function to keep sge state private
658  * to sge.c
659  *
660  */
661 int
662 t3_sge_init_sw(adapter_t *sc)
663 {
664 
665 	callout_init(&sc->sge_timer_ch, CALLOUT_MPSAFE);
666 	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
667 	TASK_INIT(&sc->timer_reclaim_task, 0, sge_timer_reclaim, sc);
668 	TASK_INIT(&sc->slow_intr_task, 0, sge_slow_intr_handler, sc);
669 	return (0);
670 }
671 
672 void
673 t3_sge_deinit_sw(adapter_t *sc)
674 {
675 	callout_drain(&sc->sge_timer_ch);
676 	if (sc->tq) {
677 		taskqueue_drain(sc->tq, &sc->timer_reclaim_task);
678 		taskqueue_drain(sc->tq, &sc->slow_intr_task);
679 	}
680 }
681 
682 /**
683  *	refill_rspq - replenish an SGE response queue
684  *	@adapter: the adapter
685  *	@q: the response queue to replenish
686  *	@credits: how many new responses to make available
687  *
688  *	Replenishes a response queue by making the supplied number of responses
689  *	available to HW.
690  */
691 static __inline void
692 refill_rspq(adapter_t *sc, const struct sge_rspq *q, u_int credits)
693 {
694 
695 	/* mbufs are allocated on demand when a rspq entry is processed. */
696 	t3_write_reg(sc, A_SG_RSPQ_CREDIT_RETURN,
697 		     V_RSPQ(q->cntxt_id) | V_CREDITS(credits));
698 }
699 
700 
701 static void
702 sge_timer_reclaim(void *arg, int ncount)
703 {
704 	adapter_t *sc = arg;
705 	int i, j, nqsets = 0;
706 	struct sge_qset *qs;
707 	struct sge_txq *txq;
708 	struct mtx *lock;
709 	struct mbuf *m_vec[TX_CLEAN_MAX_DESC];
710 	int n, reclaimable;
711 	/*
712 	 * XXX assuming these quantities are allowed to change during operation
713 	 */
714 	for (i = 0; i < sc->params.nports; i++)
715 		nqsets += sc->port[i].nqsets;
716 
717 	for (i = 0; i < nqsets; i++) {
718 		qs = &sc->sge.qs[i];
719 		txq = &qs->txq[TXQ_ETH];
720 		reclaimable = desc_reclaimable(txq);
721 		if (reclaimable > 0) {
722 			mtx_lock(&txq->lock);
723 			n = reclaim_completed_tx(sc, txq, TX_CLEAN_MAX_DESC, m_vec);
724 			mtx_unlock(&txq->lock);
725 
726 			for (j = 0; j < n; j++) {
727 				m_freem(m_vec[j]);
728 			}
729 		}
730 
731 		txq = &qs->txq[TXQ_OFLD];
732 		reclaimable = desc_reclaimable(txq);
733 		if (reclaimable > 0) {
734 			mtx_lock(&txq->lock);
735 			n = reclaim_completed_tx(sc, txq, TX_CLEAN_MAX_DESC, m_vec);
736 			mtx_unlock(&txq->lock);
737 
738 			for (j = 0; j < n; j++) {
739 				m_freem(m_vec[j]);
740 			}
741 		}
742 
743 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
744 			    &sc->sge.qs[0].rspq.lock;
745 
746 		if (mtx_trylock(lock)) {
747 			/* XXX currently assume that we are *NOT* polling */
748 			uint32_t status = t3_read_reg(sc, A_SG_RSPQ_FL_STATUS);
749 
750 			if (qs->fl[0].credits < qs->fl[0].size - 16)
751 				__refill_fl(sc, &qs->fl[0]);
752 			if (qs->fl[1].credits < qs->fl[1].size - 16)
753 				__refill_fl(sc, &qs->fl[1]);
754 
755 			if (status & (1 << qs->rspq.cntxt_id)) {
756 				if (qs->rspq.credits) {
757 					refill_rspq(sc, &qs->rspq, 1);
758 					qs->rspq.credits--;
759 					t3_write_reg(sc, A_SG_RSPQ_FL_STATUS,
760 					    1 << qs->rspq.cntxt_id);
761 				}
762 			}
763 			mtx_unlock(lock);
764 		}
765 	}
766 }
767 
768 /**
769  *	init_qset_cntxt - initialize an SGE queue set's context info
770  *	@qs: the queue set
771  *	@id: the queue set id
772  *
773  *	Initializes the TIDs and context ids for the queues of a queue set.
774  */
775 static void
776 init_qset_cntxt(struct sge_qset *qs, u_int id)
777 {
778 
779 	qs->rspq.cntxt_id = id;
780 	qs->fl[0].cntxt_id = 2 * id;
781 	qs->fl[1].cntxt_id = 2 * id + 1;
782 	qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id;
783 	qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id;
784 	qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id;
785 	qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id;
786 	qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id;
787 }
788 
789 
790 static void
791 txq_prod(struct sge_txq *txq, unsigned int ndesc, struct txq_state *txqs)
792 {
793 	txq->in_use += ndesc;
794 	/*
795 	 * XXX we don't handle stopping of queue
796 	 * presumably start handles this when we bump against the end
797 	 */
798 	txqs->gen = txq->gen;
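	/*
	 * Request a work-request completion roughly once every 8 descriptors:
	 * once the running unacked count reaches 8, that bit is shifted up
	 * into the WR_COMPL flag position and the counter wraps back to 0-7.
	 */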
799 	txq->unacked += ndesc;
800 	txqs->compl = (txq->unacked & 8) << (S_WR_COMPL - 3);
801 	txq->unacked &= 7;
802 	txqs->pidx = txq->pidx;
803 	txq->pidx += ndesc;
804 
805 	if (txq->pidx >= txq->size) {
806 		txq->pidx -= txq->size;
807 		txq->gen ^= 1;
808 	}
809 
810 }
811 
812 /**
813  *	calc_tx_descs - calculate the number of Tx descriptors for a packet
814  *	@m: the packet mbufs
815  *      @nsegs: the number of segments
816  *
817  * 	Returns the number of Tx descriptors needed for the given Ethernet
818  * 	packet.  Ethernet packets require addition of WR and CPL headers.
819  */
820 static __inline unsigned int
821 calc_tx_descs(const struct mbuf *m, int nsegs)
822 {
823 	unsigned int flits;
824 
825 	if (m->m_pkthdr.len <= WR_LEN - sizeof(struct cpl_tx_pkt))
826 		return 1;
827 
828 	flits = sgl_len(nsegs) + 2;
829 #ifdef TSO_SUPPORTED
830 	if (m->m_pkthdr.tso_segsz)
831 		flits++;
832 #endif
833 	return flits_to_desc(flits);
834 }
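/*
 * For example, a 3-segment packet that is too large for immediate data
 * needs sgl_len(3) + 2 = 7 flits, and flit_desc_map[7] == 1, so it still
 * fits in a single Tx descriptor.
 */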
835 
836 static __inline unsigned int
837 busdma_map_mbufs(struct mbuf **m, adapter_t *sc, struct tx_sw_desc *stx,
838     bus_dma_segment_t *segs, int *nsegs)
839 {
840 	struct mbuf *m0, *mtmp;
841 	int err, pktlen;
842 
843 	m0 = *m;
844 	pktlen = m0->m_pkthdr.len;
845 	err = bus_dmamap_load_mbuf_sg(sc->tx_dmat, stx->map, m0, segs, nsegs, 0);
846 	if (err) {
847 		int n = 0;
848 		mtmp = m0;
849 		while(mtmp) {
850 			n++;
851 			mtmp = mtmp->m_next;
852 		}
853 #ifdef DEBUG
854 		printf("map_mbufs: bus_dmamap_load_mbuf_sg failed with %d - pkthdr.len==%d nmbufs=%d\n",
855 		    err, m0->m_pkthdr.len, n);
856 #endif
857 	}
858 
859 	if (err == EFBIG) {
860 		/* Too many segments, try to defrag */
861 		m0 = m_defrag(m0, M_NOWAIT);
862 		if (m0 == NULL) {
863 			m_freem(*m);
864 			*m = NULL;
865 			return (ENOBUFS);
866 		}
867 		*m = m0;
868 		err = bus_dmamap_load_mbuf_sg(sc->tx_dmat, stx->map, m0, segs, nsegs, 0);
869 	}
870 
871 	if (err == ENOMEM) {
872 		return (err);
873 	}
874 
875 	if (err) {
876 		if (cxgb_debug)
877 			printf("map failure err=%d pktlen=%d\n", err, pktlen);
878 		m_freem(m0);
879 		*m = NULL;
880 		return (err);
881 	}
882 
883 	bus_dmamap_sync(sc->tx_dmat, stx->map, BUS_DMASYNC_PREWRITE);
884 	stx->flags |= TX_SW_DESC_MAPPED;
885 
886 	return (0);
887 }
888 
889 /**
890  *	make_sgl - populate a scatter/gather list for a packet
891  *	@sgp: the SGL to populate
892  *	@segs: the packet dma segments
893  *	@nsegs: the number of segments
894  *
895  *	Generates a scatter/gather list for the buffers that make up a packet.
896  *	The caller must have sized the SGL appropriately for @nsegs entries
897  *	(see sgl_len() for the resulting size in 8-byte flits).
898  */
899 static __inline void
900 make_sgl(struct sg_ent *sgp, bus_dma_segment_t *segs, int nsegs)
901 {
902 	int i, idx;
903 
904 	for (idx = 0, i = 0; i < nsegs; i++, idx ^= 1) {
905 		if (i && idx == 0)
906 			++sgp;
907 
908 		sgp->len[idx] = htobe32(segs[i].ds_len);
909 		sgp->addr[idx] = htobe64(segs[i].ds_addr);
910 	}
911 
912 	if (idx)
913 		sgp->len[idx] = 0;
914 }
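/*
 * For instance, with nsegs == 3 the loop above fills sgp[0].len[0]/addr[0]
 * and sgp[0].len[1]/addr[1] with the first two segments, advances to the
 * next sg_ent for the third, and finally zeroes the unused sgp[1].len[1].
 */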
915 
916 /**
917  *	check_ring_tx_db - check and potentially ring a Tx queue's doorbell
918  *	@adap: the adapter
919  *	@q: the Tx queue
920  *
921  *	Ring the doorbell if a Tx queue is asleep.  There is a natural race
922  *	where the HW may go to sleep just after we check; in that case the
923  *	interrupt handler will detect the outstanding Tx packet and ring the
924  *	doorbell for us.
925  *
926  *	When GTS is disabled we unconditionally ring the doorbell.
927  */
928 static __inline void
929 check_ring_tx_db(adapter_t *adap, struct sge_txq *q)
930 {
931 #if USE_GTS
932 	clear_bit(TXQ_LAST_PKT_DB, &q->flags);
933 	if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) {
934 		set_bit(TXQ_LAST_PKT_DB, &q->flags);
935 #ifdef T3_TRACE
936 		T3_TRACE1(adap->tb[q->cntxt_id & 7], "doorbell Tx, cntxt %d",
937 			  q->cntxt_id);
938 #endif
939 		t3_write_reg(adap, A_SG_KDOORBELL,
940 			     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
941 	}
942 #else
943 	wmb();            /* write descriptors before telling HW */
944 	t3_write_reg(adap, A_SG_KDOORBELL,
945 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
946 #endif
947 }
948 
949 static __inline void
950 wr_gen2(struct tx_desc *d, unsigned int gen)
951 {
952 #if SGE_NUM_GENBITS == 2
953 	d->flit[TX_DESC_FLITS - 1] = htobe64(gen);
954 #endif
955 }
956 
957 /* sizeof(*eh) + sizeof(*vhdr) + sizeof(*ip) + sizeof(*tcp) */
958 #define TCPPKTHDRSIZE (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + 20 + 20)
959 
960 int
961 t3_encap(struct port_info *p, struct mbuf **m)
962 {
963 	adapter_t *sc;
964 	struct mbuf *m0;
965 	struct sge_qset *qs;
966 	struct sge_txq *txq;
967 	struct tx_sw_desc *stx;
968 	struct txq_state txqs;
969 	unsigned int nsegs, ndesc, flits, cntrl, mlen, tso_info;
970 	int err;
971 
972 	struct work_request_hdr *wrp;
973 	struct tx_sw_desc *txsd;
974 	struct sg_ent *sgp, sgl[TX_MAX_SEGS / 2 + 1];
975 	bus_dma_segment_t segs[TX_MAX_SEGS];
976 	uint32_t wr_hi, wr_lo, sgl_flits;
977 
978 	struct tx_desc *txd;
979 	struct cpl_tx_pkt *cpl;
980 
981 	DPRINTF("t3_encap ");
982 	m0 = *m;
983 	sc = p->adapter;
984 	qs = &sc->sge.qs[p->first_qset];
985 	txq = &qs->txq[TXQ_ETH];
986 	stx = &txq->sdesc[txq->pidx];
987 	txd = &txq->desc[txq->pidx];
988 	cpl = (struct cpl_tx_pkt *)txd;
989 	mlen = m0->m_pkthdr.len;
990 	cpl->len = htonl(mlen | 0x80000000);
991 
992 	DPRINTF("mlen=%d\n", mlen);
993 	/*
994 	 * XXX handle checksum, TSO, and VLAN here
995 	 *
996 	 */
997 	cntrl = V_TXPKT_INTF(p->port);
998 
999 	/*
1000 	 * XXX need to add VLAN support for 6.x
1001 	 */
1002 #ifdef VLAN_SUPPORTED
1003 	if (m0->m_flags & M_VLANTAG)
1004 		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
1005 
1006 	tso_info = V_LSO_MSS(m0->m_pkthdr.tso_segsz);
1007 #else
1008 	tso_info = 0;
1009 #endif
1010 	if (tso_info) {
1011 		int eth_type;
1012 		struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *) cpl;
1013 		struct ip *ip;
1014 		struct tcphdr *tcp;
1015 		uint8_t *pkthdr, tmp[TCPPKTHDRSIZE]; /* is this too large for the stack? */
1016 
1017 		txd->flit[2] = 0;
1018 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO);
1019 		hdr->cntrl = htonl(cntrl);
1020 
1021 		if (__predict_false(m0->m_len < TCPPKTHDRSIZE)) {
1022 			pkthdr = &tmp[0];
1023 			m_copydata(m0, 0, TCPPKTHDRSIZE, pkthdr);
1024 		} else {
1025 			pkthdr = m0->m_data;
1026 		}
1027 
1028 		if (__predict_false(m0->m_flags & M_VLANTAG)) {
1029 			eth_type = CPL_ETH_II_VLAN;
1030 			ip = (struct ip *)(pkthdr + ETHER_HDR_LEN +
1031 			    ETHER_VLAN_ENCAP_LEN);
1032 		} else {
1033 			eth_type = CPL_ETH_II;
1034 			ip = (struct ip *)(pkthdr + ETHER_HDR_LEN);
1035 		}
1036 		tcp = (struct tcphdr *)((uint8_t *)ip +
1037 		    sizeof(*ip));
1038 
1039 		tso_info |= V_LSO_ETH_TYPE(eth_type) |
1040 			    V_LSO_IPHDR_WORDS(ip->ip_hl) |
1041 			    V_LSO_TCPHDR_WORDS(tcp->th_off);
1042 		hdr->lso_info = htonl(tso_info);
1043 
1044 		flits = 3;
1045 	} else {
1046 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1047 		cpl->cntrl = htonl(cntrl);
1048 
1049 		if (mlen <= WR_LEN - sizeof(*cpl)) {
1050 			txq_prod(txq, 1, &txqs);
1051 			txq->sdesc[txqs.pidx].m = m0;
1052 
1053 			if (m0->m_len == m0->m_pkthdr.len)
1054 				memcpy(&txd->flit[2], m0->m_data, mlen);
1055 			else
1056 				m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[2]);
1057 
1058 			flits = (mlen + 7) / 8 + 2;
1059 			cpl->wr.wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1060 					  V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1061 					  F_WR_SOP | F_WR_EOP | txqs.compl);
1062 			wmb();
1063 			cpl->wr.wr_lo = htonl(V_WR_LEN(flits) |
1064 			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1065 
1066 			wr_gen2(txd, txqs.gen);
1067 			check_ring_tx_db(sc, txq);
1068 			return (0);
1069 		}
1070 		flits = 2;
1071 	}
1072 
1073 	wrp = (struct work_request_hdr *)txd;
1074 
1075 	if ((err = busdma_map_mbufs(m, sc, stx, segs, &nsegs)) != 0) {
1076 		return (err);
1077 	}
1078 	m0 = *m;
1079 	ndesc = calc_tx_descs(m0, nsegs);
1080 
1081 	sgp = (ndesc == 1) ? (struct sg_ent *)&txd->flit[flits] : &sgl[0];
1082 	make_sgl(sgp, segs, nsegs);
1083 
1084 	sgl_flits = sgl_len(nsegs);
1085 
1086 	DPRINTF("make_sgl success nsegs==%d ndesc==%d\n", nsegs, ndesc);
1087 	txq_prod(txq, ndesc, &txqs);
1088 	txsd = &txq->sdesc[txqs.pidx];
1089 	wr_hi = htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1090 	wr_lo = htonl(V_WR_TID(txq->token));
1091 	txsd->m = m0;
1092 
1093 	if (__predict_true(ndesc == 1)) {
1094 		wrp->wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1095 		    V_WR_SGLSFLT(flits)) | wr_hi;
1096 		wmb();
1097 		wrp->wr_lo = htonl(V_WR_LEN(flits + sgl_flits) |
1098 		    V_WR_GEN(txqs.gen)) | wr_lo;
1099 		/* XXX gen? */
1100 		wr_gen2(txd, txqs.gen);
1101 	} else {
1102 		unsigned int ogen = txqs.gen;
1103 		const uint64_t *fp = (const uint64_t *)sgl;
1104 		struct work_request_hdr *wp = wrp;
1105 
1106 		/* XXX - CHECK ME */
1107 		wrp->wr_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) |
1108 		    V_WR_SGLSFLT(flits)) | wr_hi;
1109 
1110 		while (sgl_flits) {
1111 			unsigned int avail = WR_FLITS - flits;
1112 
1113 			if (avail > sgl_flits)
1114 				avail = sgl_flits;
1115 			memcpy(&txd->flit[flits], fp, avail * sizeof(*fp));
1116 			sgl_flits -= avail;
1117 			ndesc--;
1118 			if (!sgl_flits)
1119 				break;
1120 
1121 			fp += avail;
1122 			txd++;
1123 			txsd++;
1124 			if (++txqs.pidx == txq->size) {
1125 				txqs.pidx = 0;
1126 				txqs.gen ^= 1;
1127 				txd = txq->desc;
1128 				txsd = txq->sdesc;
1129 			}
1130 
1131 			/*
1132 			 * when the head of the mbuf chain
1133 			 * is freed all clusters will be freed
1134 			 * with it
1135 			 */
1136 			txsd->m = NULL;
1137 			wrp = (struct work_request_hdr *)txd;
1138 			wrp->wr_hi = htonl(V_WR_DATATYPE(1) |
1139 			    V_WR_SGLSFLT(1)) | wr_hi;
1140 			wrp->wr_lo = htonl(V_WR_LEN(min(WR_FLITS,
1141 				    sgl_flits + 1)) |
1142 			    V_WR_GEN(txqs.gen)) | wr_lo;
1143 			wr_gen2(txd, txqs.gen);
1144 			flits = 1;
1145 		}
1146 #ifdef WHY
1147 		skb->priority = pidx;
1148 #endif
1149 		wrp->wr_hi |= htonl(F_WR_EOP);
1150 		wmb();
1151 		wp->wr_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo;
1152 		wr_gen2((struct tx_desc *)wp, ogen);
1153 	}
1154 	check_ring_tx_db(p->adapter, txq);
1155 
1156 	return (0);
1157 }
1158 
1159 
1160 /**
1161  *	write_imm - write a packet into a Tx descriptor as immediate data
1162  *	@d: the Tx descriptor to write
1163  *	@m: the packet
1164  *	@len: the length of packet data to write as immediate data
1165  *	@gen: the generation bit value to write
1166  *
1167  *	Writes a packet as immediate data into a Tx descriptor.  The packet
1168  *	contains a work request at its beginning.  We must write the packet
1169  *	carefully so the SGE doesn't read it before it has been written in
1170  *	its entirety.
1171  */
1172 static __inline void write_imm(struct tx_desc *d, struct mbuf *m,
1173 			     unsigned int len, unsigned int gen)
1174 {
1175 	struct work_request_hdr *from = (struct work_request_hdr *)m->m_data;
1176 	struct work_request_hdr *to = (struct work_request_hdr *)d;
1177 
1178 	memcpy(&to[1], &from[1], len - sizeof(*from));
1179 	to->wr_hi = from->wr_hi | htonl(F_WR_SOP | F_WR_EOP |
1180 					V_WR_BCNTLFLT(len & 7));
1181 	wmb();
1182 	to->wr_lo = from->wr_lo | htonl(V_WR_GEN(gen) |
1183 					V_WR_LEN((len + 7) / 8));
1184 	wr_gen2(d, gen);
1185 	m_freem(m);
1186 }
1187 
1188 /**
1189  *	check_desc_avail - check descriptor availability on a send queue
1190  *	@adap: the adapter
1191  *	@q: the TX queue
1192  *	@m: the packet needing the descriptors
1193  *	@ndesc: the number of Tx descriptors needed
1194  *	@qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL)
1195  *
1196  *	Checks if the requested number of Tx descriptors is available on an
1197  *	SGE send queue.  If the queue is already suspended or not enough
1198  *	descriptors are available the packet is queued for later transmission.
1199  *	Must be called with the Tx queue locked.
1200  *
1201  *	Returns 0 if enough descriptors are available, 1 if there aren't
1202  *	enough descriptors and the packet has been queued, and 2 if the caller
1203  *	needs to retry because there weren't enough descriptors at the
1204  *	beginning of the call but some freed up in the meantime.
1205  */
1206 static __inline int
1207 check_desc_avail(adapter_t *adap, struct sge_txq *q,
1208 				   struct mbuf *m, unsigned int ndesc,
1209 				   unsigned int qid)
1210 {
1211 	/*
1212 	 * XXX We currently only use this for checking the control queue;
1213 	 * the control queue is only used for binding qsets, which happens
1214 	 * at init time, so we are guaranteed enough descriptors.
1215 	 */
1216 #if 0
1217 	if (__predict_false(!skb_queue_empty(&q->sendq))) {
1218 addq_exit:	__skb_queue_tail(&q->sendq, skb);
1219 		return 1;
1220 	}
1221 	if (__predict_false(q->size - q->in_use < ndesc)) {
1222 
1223 		struct sge_qset *qs = txq_to_qset(q, qid);
1224 
1225 		set_bit(qid, &qs->txq_stopped);
1226 		smp_mb__after_clear_bit();
1227 
1228 		if (should_restart_tx(q) &&
1229 		    test_and_clear_bit(qid, &qs->txq_stopped))
1230 			return 2;
1231 
1232 		q->stops++;
1233 		goto addq_exit;
1234 	}
1235 #endif
1236 	return 0;
1237 }
1238 
1239 
1240 /**
1241  *	reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
1242  *	@q: the SGE control Tx queue
1243  *
1244  *	This is a variant of reclaim_completed_tx() that is used for Tx queues
1245  *	that send only immediate data (presently just the control queues) and
1246  *	thus do not have any sk_buffs to release.
1247  *	thus do not have any mbufs to release.
1248 static __inline void
1249 reclaim_completed_tx_imm(struct sge_txq *q)
1250 {
1251 	unsigned int reclaim = q->processed - q->cleaned;
1252 
1253 	mtx_assert(&q->lock, MA_OWNED);
1254 
1255 	q->in_use -= reclaim;
1256 	q->cleaned += reclaim;
1257 }
1258 
1259 static __inline int
1260 immediate(const struct mbuf *m)
1261 {
1262 	return (m->m_len <= WR_LEN && m->m_pkthdr.len <= WR_LEN);
1263 }
1264 
1265 /**
1266  *	ctrl_xmit - send a packet through an SGE control Tx queue
1267  *	@adap: the adapter
1268  *	@q: the control queue
1269  *	@m: the packet
1270  *
1271  *	Send a packet through an SGE control Tx queue.  Packets sent through
1272  *	a control queue must fit entirely as immediate data in a single Tx
1273  *	descriptor and have no page fragments.
1274  */
1275 static int
1276 ctrl_xmit(adapter_t *adap, struct sge_txq *q, struct mbuf *m)
1277 {
1278 	int ret;
1279 	struct work_request_hdr *wrp = (struct work_request_hdr *)m->m_data;
1280 
1281 	if (__predict_false(!immediate(m))) {
1282 		m_freem(m);
1283 		return 0;
1284 	}
1285 
1286 	wrp->wr_hi |= htonl(F_WR_SOP | F_WR_EOP);
1287 	wrp->wr_lo = htonl(V_WR_TID(q->token));
1288 
1289 	mtx_lock(&q->lock);
1290 again:	reclaim_completed_tx_imm(q);
1291 
1292 	ret = check_desc_avail(adap, q, m, 1, TXQ_CTRL);
1293 	if (__predict_false(ret)) {
1294 		if (ret == 1) {
1295 			mtx_unlock(&q->lock);
1296 			return (-1);
1297 		}
1298 		goto again;
1299 	}
1300 
1301 	write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
1302 
1303 	q->in_use++;
1304 	if (++q->pidx >= q->size) {
1305 		q->pidx = 0;
1306 		q->gen ^= 1;
1307 	}
1308 	mtx_unlock(&q->lock);
1309 	wmb();
1310 	t3_write_reg(adap, A_SG_KDOORBELL,
1311 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1312 	return (0);
1313 }
1314 
1315 #ifdef RESTART_CTRLQ
1316 /**
1317  *	restart_ctrlq - restart a suspended control queue
1318  *	@qs: the queue set containing the control queue
1319  *
1320  *	Resumes transmission on a suspended Tx control queue.
1321  */
1322 static void
1323 restart_ctrlq(unsigned long data)
1324 {
1325 	struct mbuf *m;
1326 	struct sge_qset *qs = (struct sge_qset *)data;
1327 	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1328 	adapter_t *adap = qs->port->adapter;
1329 
1330 	mtx_lock(&q->lock);
1331 again:	reclaim_completed_tx_imm(q);
1332 
1333 	while (q->in_use < q->size &&
1334 	       (skb = __skb_dequeue(&q->sendq)) != NULL) {
1335 
1336 		write_imm(&q->desc[q->pidx], skb, skb->len, q->gen);
1337 
1338 		if (++q->pidx >= q->size) {
1339 			q->pidx = 0;
1340 			q->gen ^= 1;
1341 		}
1342 		q->in_use++;
1343 	}
1344 	if (!skb_queue_empty(&q->sendq)) {
1345 		set_bit(TXQ_CTRL, &qs->txq_stopped);
1346 		smp_mb__after_clear_bit();
1347 
1348 		if (should_restart_tx(q) &&
1349 		    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped))
1350 			goto again;
1351 		q->stops++;
1352 	}
1353 
1354 	mtx_unlock(&q->lock);
1355 	t3_write_reg(adap, A_SG_KDOORBELL,
1356 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1357 }
1358 #endif
1359 
1360 /*
1361  * Send a management message through control queue 0
1362  */
1363 int
1364 t3_mgmt_tx(struct adapter *adap, struct mbuf *m)
1365 {
1366 	return ctrl_xmit(adap, &adap->sge.qs[0].txq[TXQ_CTRL], m);
1367 }
1368 
1369 /**
1370  *	t3_sge_alloc_qset - initialize an SGE queue set
1371  *	@sc: the controller softc
1372  *	@id: the queue set id
1373  *	@nports: how many Ethernet ports will be using this queue set
1374  *	@irq_vec_idx: the IRQ vector index for response queue interrupts
1375  *	@p: configuration parameters for this queue set
1376  *	@ntxq: number of Tx queues for the queue set
1377  *	@pi: port info for queue set
1378  *
1379  *	Allocate resources and initialize an SGE queue set.  A queue set
1380  *	comprises a response queue, two Rx free-buffer queues, and up to 3
1381  *	Tx queues.  The Tx queues are assigned roles in the order Ethernet
1382  *	queue, offload queue, and control queue.
1383  */
1384 int
1385 t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx,
1386 		  const struct qset_params *p, int ntxq, struct port_info *pi)
1387 {
1388 	struct sge_qset *q = &sc->sge.qs[id];
1389 	int i, ret = 0;
1390 
1391 	init_qset_cntxt(q, id);
1392 
1393 	if ((ret = alloc_ring(sc, p->fl_size, sizeof(struct rx_desc),
1394 		    sizeof(struct rx_sw_desc), &q->fl[0].phys_addr,
1395 		    &q->fl[0].desc, &q->fl[0].sdesc,
1396 		    &q->fl[0].desc_tag, &q->fl[0].desc_map,
1397 		    sc->rx_dmat, &q->fl[0].entry_tag)) != 0) {
1398 		printf("error %d from alloc ring fl0\n", ret);
1399 		goto err;
1400 	}
1401 
1402 	if ((ret = alloc_ring(sc, p->jumbo_size, sizeof(struct rx_desc),
1403 		    sizeof(struct rx_sw_desc), &q->fl[1].phys_addr,
1404 		    &q->fl[1].desc, &q->fl[1].sdesc,
1405 		    &q->fl[1].desc_tag, &q->fl[1].desc_map,
1406 		    sc->rx_jumbo_dmat, &q->fl[1].entry_tag)) != 0) {
1407 		printf("error %d from alloc ring fl1\n", ret);
1408 		goto err;
1409 	}
1410 
1411 	if ((ret = alloc_ring(sc, p->rspq_size, sizeof(struct rsp_desc), 0,
1412 		    &q->rspq.phys_addr, &q->rspq.desc, NULL,
1413 		    &q->rspq.desc_tag, &q->rspq.desc_map,
1414 		    NULL, NULL)) != 0) {
1415 		printf("error %d from alloc ring rspq\n", ret);
1416 		goto err;
1417 	}
1418 
1419 	for (i = 0; i < ntxq; ++i) {
1420 		/*
1421 		 * The control queue always uses immediate data so does not
1422 		 * need to keep track of any mbufs.
1423 		 * XXX Placeholder for future TOE support.
1424 		 */
1425 		size_t sz = i == TXQ_CTRL ? 0 : sizeof(struct tx_sw_desc);
1426 
1427 		if ((ret = alloc_ring(sc, p->txq_size[i],
1428 			    sizeof(struct tx_desc), sz,
1429 			    &q->txq[i].phys_addr, &q->txq[i].desc,
1430 			    &q->txq[i].sdesc, &q->txq[i].desc_tag,
1431 			    &q->txq[i].desc_map,
1432 			    sc->tx_dmat, &q->txq[i].entry_tag)) != 0) {
1433 			printf("error %d from alloc ring tx %i\n", ret, i);
1434 			goto err;
1435 		}
1436 
1437 		q->txq[i].gen = 1;
1438 		q->txq[i].size = p->txq_size[i];
1439 		mtx_init(&q->txq[i].lock, "t3 txq lock", NULL, MTX_DEF);
1440 	}
1441 
1442 	q->fl[0].gen = q->fl[1].gen = 1;
1443 	q->fl[0].size = p->fl_size;
1444 	q->fl[1].size = p->jumbo_size;
1445 
1446 	q->rspq.gen = 1;
1447 	q->rspq.size = p->rspq_size;
1448 	mtx_init(&q->rspq.lock, "t3 rspq lock", NULL, MTX_DEF);
1449 
1450 	q->txq[TXQ_ETH].stop_thres = nports *
1451 	    flits_to_desc(sgl_len(TX_MAX_SEGS + 1) + 3);
1452 
1453 	q->fl[0].buf_size = MCLBYTES;
1454 	q->fl[0].zone = zone_clust;
1455 	q->fl[0].type = EXT_CLUSTER;
1456 	q->fl[1].buf_size = MJUMPAGESIZE;
1457 	q->fl[1].zone = zone_jumbop;
1458 	q->fl[1].type = EXT_JUMBOP;
1459 
1460 	q->lro.enabled = lro_default;
1461 
1462 	mtx_lock(&sc->sge.reg_lock);
1463 	ret = -t3_sge_init_rspcntxt(sc, q->rspq.cntxt_id, irq_vec_idx,
1464 				   q->rspq.phys_addr, q->rspq.size,
1465 				   q->fl[0].buf_size, 1, 0);
1466 	if (ret) {
1467 		printf("error %d from t3_sge_init_rspcntxt\n", ret);
1468 		goto err_unlock;
1469 	}
1470 
1471 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
1472 		ret = -t3_sge_init_flcntxt(sc, q->fl[i].cntxt_id, 0,
1473 					  q->fl[i].phys_addr, q->fl[i].size,
1474 					  q->fl[i].buf_size, p->cong_thres, 1,
1475 					  0);
1476 		if (ret) {
1477 			printf("error %d from t3_sge_init_flcntxt for index i=%d\n", ret, i);
1478 			goto err_unlock;
1479 		}
1480 	}
1481 
1482 	ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_ETH].cntxt_id, USE_GTS,
1483 				 SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr,
1484 				 q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token,
1485 				 1, 0);
1486 	if (ret) {
1487 		printf("error %d from t3_sge_init_ecntxt\n", ret);
1488 		goto err_unlock;
1489 	}
1490 
1491 	if (ntxq > 1) {
1492 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_OFLD].cntxt_id,
1493 					 USE_GTS, SGE_CNTXT_OFLD, id,
1494 					 q->txq[TXQ_OFLD].phys_addr,
1495 					 q->txq[TXQ_OFLD].size, 0, 1, 0);
1496 		if (ret) {
1497 			printf("error %d from t3_sge_init_ecntxt\n", ret);
1498 			goto err_unlock;
1499 		}
1500 	}
1501 
1502 	if (ntxq > 2) {
1503 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_CTRL].cntxt_id, 0,
1504 					 SGE_CNTXT_CTRL, id,
1505 					 q->txq[TXQ_CTRL].phys_addr,
1506 					 q->txq[TXQ_CTRL].size,
1507 					 q->txq[TXQ_CTRL].token, 1, 0);
1508 		if (ret) {
1509 			printf("error %d from t3_sge_init_ecntxt\n", ret);
1510 			goto err_unlock;
1511 		}
1512 	}
1513 
1514 	mtx_unlock(&sc->sge.reg_lock);
1515 	t3_update_qset_coalesce(q, p);
1516 	q->port = pi;
1517 
1518 	refill_fl(sc, &q->fl[0], q->fl[0].size);
1519 	refill_fl(sc, &q->fl[1], q->fl[1].size);
1520 	refill_rspq(sc, &q->rspq, q->rspq.size - 1);
1521 
1522 	t3_write_reg(sc, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) |
1523 		     V_NEWTIMER(q->rspq.holdoff_tmr));
1524 
1525 	return (0);
1526 
1527 err_unlock:
1528 	mtx_unlock(&sc->sge.reg_lock);
1529 err:
1530 	t3_free_qset(sc, q);
1531 
1532 	return (ret);
1533 }
1534 
1535 
1536 /**
1537  *	t3_free_qset - free the resources of an SGE queue set
1538  *	@sc: the controller owning the queue set
1539  *	@q: the queue set
1540  *
1541  *	Release the HW and SW resources associated with an SGE queue set, such
1542  *	as HW contexts, packet buffers, and descriptor rings.  Traffic to the
1543  *	queue set must be quiesced prior to calling this.
1544  */
1545 static void
1546 t3_free_qset(adapter_t *sc, struct sge_qset *q)
1547 {
1548 	int i;
1549 
1550 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
1551 		if (q->fl[i].desc) {
1552 			mtx_lock(&sc->sge.reg_lock);
1553 			t3_sge_disable_fl(sc, q->fl[i].cntxt_id);
1554 			mtx_unlock(&sc->sge.reg_lock);
1555 			bus_dmamap_unload(q->fl[i].desc_tag, q->fl[i].desc_map);
1556 			bus_dmamem_free(q->fl[i].desc_tag, q->fl[i].desc,
1557 					q->fl[i].desc_map);
1558 			bus_dma_tag_destroy(q->fl[i].desc_tag);
1559 			bus_dma_tag_destroy(q->fl[i].entry_tag);
1560 		}
1561 		if (q->fl[i].sdesc) {
1562 			free_rx_bufs(sc, &q->fl[i]);
1563 			free(q->fl[i].sdesc, M_DEVBUF);
1564 		}
1565 	}
1566 
1567 	for (i = 0; i < SGE_TXQ_PER_SET; ++i) {
1568 		if (q->txq[i].desc) {
1569 			mtx_lock(&sc->sge.reg_lock);
1570 			t3_sge_enable_ecntxt(sc, q->txq[i].cntxt_id, 0);
1571 			mtx_unlock(&sc->sge.reg_lock);
1572 			bus_dmamap_unload(q->txq[i].desc_tag,
1573 					q->txq[i].desc_map);
1574 			bus_dmamem_free(q->txq[i].desc_tag, q->txq[i].desc,
1575 					q->txq[i].desc_map);
1576 			bus_dma_tag_destroy(q->txq[i].desc_tag);
1577 			bus_dma_tag_destroy(q->txq[i].entry_tag);
1578 		}
1579 		if (q->txq[i].sdesc) {
1580 			free(q->txq[i].sdesc, M_DEVBUF);
1581 		}
1582 		if (mtx_initialized(&q->txq[i].lock)) {
1583 			mtx_destroy(&q->txq[i].lock);
1584 		}
1585 	}
1586 
1587 	if (q->rspq.desc) {
1588 		mtx_lock(&sc->sge.reg_lock);
1589 		t3_sge_disable_rspcntxt(sc, q->rspq.cntxt_id);
1590 		mtx_unlock(&sc->sge.reg_lock);
1591 
1592 		bus_dmamap_unload(q->rspq.desc_tag, q->rspq.desc_map);
1593 		bus_dmamem_free(q->rspq.desc_tag, q->rspq.desc,
1594 			        q->rspq.desc_map);
1595 		bus_dma_tag_destroy(q->rspq.desc_tag);
1596 	}
1597 
1598 	if (mtx_initialized(&q->rspq.lock))
1599 		mtx_destroy(&q->rspq.lock);
1600 
1601 	bzero(q, sizeof(*q));
1602 }
1603 
1604 /**
1605  *	t3_free_sge_resources - free SGE resources
1606  *	@sc: the adapter softc
1607  *
1608  *	Frees resources used by the SGE queue sets.
1609  */
1610 void
1611 t3_free_sge_resources(adapter_t *sc)
1612 {
1613 	int i;
1614 
1615 	for (i = 0; i < SGE_QSETS; ++i)
1616 		t3_free_qset(sc, &sc->sge.qs[i]);
1617 }
1618 
1619 /**
1620  *	t3_sge_start - enable SGE
1621  *	@sc: the controller softc
1622  *
1623  *	Enables the SGE for DMAs.  This is the last step in starting packet
1624  *	transfers.
1625  */
1626 void
1627 t3_sge_start(adapter_t *sc)
1628 {
1629 	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE);
1630 }
1631 
1632 
1633 /**
1634  *	free_tx_desc - reclaims Tx descriptors and their buffers
1635  *	@sc: the adapter
1636  *	@q: the Tx queue to reclaim descriptors from
1637  *	@n: the number of descriptors to reclaim
1638  *
1639  *	Reclaims Tx descriptors from an SGE Tx queue and collects the associated
1640  *	mbufs in @m_vec for the caller to free.  Called with the Tx queue lock held.
1641  */
1642 int
1643 free_tx_desc(adapter_t *sc, struct sge_txq *q, int n, struct mbuf **m_vec)
1644 {
1645 	struct tx_sw_desc *d;
1646 	unsigned int cidx = q->cidx;
1647 	int nbufs = 0;
1648 
1649 #ifdef T3_TRACE
1650 	T3_TRACE2(sc->tb[q->cntxt_id & 7],
1651 		  "reclaiming %u Tx descriptors at cidx %u", n, cidx);
1652 #endif
1653 	d = &q->sdesc[cidx];
1654 
1655 	while (n-- > 0) {
1656 		DPRINTF("cidx=%d d=%p\n", cidx, d);
1657 		if (d->m) {
1658 			if (d->flags & TX_SW_DESC_MAPPED) {
1659 				bus_dmamap_unload(q->entry_tag, d->map);
1660 				bus_dmamap_destroy(q->entry_tag, d->map);
1661 				d->flags &= ~TX_SW_DESC_MAPPED;
1662 			}
1663 			m_vec[nbufs] = d->m;
1664 			d->m = NULL;
1665 			nbufs++;
1666 		}
1667 		++d;
1668 		if (++cidx == q->size) {
1669 			cidx = 0;
1670 			d = q->sdesc;
1671 		}
1672 	}
1673 	q->cidx = cidx;
1674 
1675 	return (nbufs);
1676 }
1677 
1678 /**
1679  *	is_new_response - check if a response is newly written
1680  *	@r: the response descriptor
1681  *	@q: the response queue
1682  *
1683  *	Returns true if a response descriptor contains a yet unprocessed
1684  *	Returns true if a response descriptor contains an as-yet unprocessed
1685  */
1686 static __inline int
1687 is_new_response(const struct rsp_desc *r,
1688     const struct sge_rspq *q)
1689 {
1690 	return (r->intr_gen & F_RSPD_GEN2) == q->gen;
1691 }
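/*
 * The response generation bit flips on every pass around the ring, so an
 * entry left over from the previous pass still carries the old generation
 * value and is not reported as new.
 */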
1692 
1693 #define RSPD_GTS_MASK  (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS)
1694 #define RSPD_CTRL_MASK (RSPD_GTS_MASK | \
1695 			V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \
1696 			V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \
1697 			V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR))
1698 
1699 /* How long to delay the next interrupt in case of memory shortage, in 0.1us. */
1700 #define NOMEM_INTR_DELAY 2500
1701 
1702 static __inline void
1703 deliver_partial_bundle(struct t3cdev *tdev, struct sge_rspq *q)
1704 {
1705 	;
1706 }
1707 
1708 static __inline void
1709 rx_offload(struct t3cdev *tdev, struct sge_rspq *rq,
1710     struct mbuf *m)
1711 {
1712 #ifdef notyet
1713 	if (rq->polling) {
1714 		rq->offload_skbs[rq->offload_skbs_idx++] = skb;
1715 		if (rq->offload_skbs_idx == RX_BUNDLE_SIZE) {
1716 			cxgb_ofld_recv(tdev, rq->offload_skbs, RX_BUNDLE_SIZE);
1717 			rq->offload_skbs_idx = 0;
1718 			rq->offload_bundles++;
1719 		}
1720 	} else
1721 #endif
1722 	{
1723 		/* XXX */
1724 		panic("implement offload enqueue\n");
1725 	}
1726 
1727 }
1728 
1729 static void
1730 restart_tx(struct sge_qset *qs)
1731 {
1732 	;
1733 }
1734 
1735 void
1736 t3_rx_eth(struct port_info *pi, struct sge_rspq *rq, struct mbuf *m, int ethpad)
1737 {
1738 	struct cpl_rx_pkt *cpl = (struct cpl_rx_pkt *)(m->m_data + ethpad);
1739 	struct ifnet *ifp = pi->ifp;
1740 
1741 	DPRINTF("rx_eth m=%p m->m_data=%p p->iff=%d\n", m, m->m_data, cpl->iff);
1742 	if (&pi->adapter->port[cpl->iff] != pi)
1743 		panic("bad port index %d m->m_data=%p\n", cpl->iff, m->m_data);
1744 
1745 	if ((ifp->if_capenable & IFCAP_RXCSUM) && !cpl->fragment &&
1746 	    cpl->csum_valid && cpl->csum == 0xffff) {
1747 		m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED|CSUM_IP_VALID);
1748 		rspq_to_qset(rq)->port_stats[SGE_PSTAT_RX_CSUM_GOOD]++;
1749 		m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED|CSUM_IP_VALID|CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
1750 		m->m_pkthdr.csum_data = 0xffff;
1751 	}
1752 	/*
1753 	 * XXX need to add VLAN support for 6.x
1754 	 */
1755 #ifdef VLAN_SUPPORTED
1756 	if (__predict_false(cpl->vlan_valid)) {
1757 		m->m_pkthdr.ether_vtag = ntohs(cpl->vlan);
1758 		m->m_flags |= M_VLANTAG;
1759 	}
1760 #endif
1761 	m->m_pkthdr.rcvif = ifp;
1762 
1763 	m_adj(m, sizeof(*cpl) + ethpad);
1764 
1765 	(*ifp->if_input)(ifp, m);
1766 }
1767 
1768 /**
1769  *	get_packet - return the next ingress packet buffer from a free list
1770  *	@adap: the adapter that received the packet
1771  *	@drop_thres: # of remaining buffers before we start dropping packets
1772  *	@qs: the qset that the SGE free list holding the packet belongs to
1773  *      @mh: the mbuf header, contains a pointer to the head and tail of the mbuf chain
1774  *      @r: response descriptor
1775  *
1776  *	Get the next packet from a free list and complete setup of the
1777  *	sk_buff.  If the packet is small we make a copy and recycle the
1778  *	mbuf chain.  If the packet is small we make a copy and recycle the
1779  *	positive drop threshold is supplied packets are dropped and their
1780  *	buffers recycled if (a) the number of remaining buffers is under the
1781  *	threshold and the packet is too big to copy, or (b) the packet should
1782  *	be copied but there is no memory for the copy.
1783  */
1784 
1785 static int
1786 get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
1787     struct t3_mbuf_hdr *mh, struct rsp_desc *r, struct mbuf *m)
1788 {
1789 
1790 	unsigned int len_cq =  ntohl(r->len_cq);
1791 	struct sge_fl *fl = (len_cq & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
1792 	struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];
1793 	uint32_t len = G_RSPD_LEN(len_cq);
1794 	uint32_t flags = ntohl(r->flags);
1795 	uint8_t sopeop = G_RSPD_SOP_EOP(flags);
1796 	int ret = 0;
1797 
1798 	prefetch(sd->cl);
1799 
1800 	fl->credits--;
1801 	bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD);
1802 	bus_dmamap_unload(fl->entry_tag, sd->map);
1803 
1804 	m_cljset(m, sd->cl, fl->type);
1805 	m->m_len = len;
1806 
1807 	switch(sopeop) {
1808 	case RSPQ_SOP_EOP:
1809 		DBG(DBG_RX, ("get_packet: SOP-EOP m %p\n", m));
1810 		mh->mh_head = mh->mh_tail = m;
1811 		m->m_pkthdr.len = len;
1812 		m->m_flags |= M_PKTHDR;
1813 		ret = 1;
1814 		break;
1815 	case RSPQ_NSOP_NEOP:
1816 		DBG(DBG_RX, ("get_packet: NO_SOP-NO_EOP m %p\n", m));
1817 		m->m_flags &= ~M_PKTHDR;
1818 		if (mh->mh_tail == NULL) {
1819 			if (cxgb_debug)
1820 				printf("discarding intermediate descriptor entry\n");
1821 			m_freem(m);
1822 			break;
1823 		}
1824 		mh->mh_tail->m_next = m;
1825 		mh->mh_tail = m;
1826 		mh->mh_head->m_pkthdr.len += len;
1827 		ret = 0;
1828 		break;
1829 	case RSPQ_SOP:
1830 		DBG(DBG_RX, ("get_packet: SOP m %p\n", m));
1831 		m->m_pkthdr.len = len;
1832 		mh->mh_head = mh->mh_tail = m;
1833 		m->m_flags |= M_PKTHDR;
1834 		ret = 0;
1835 		break;
1836 	case RSPQ_EOP:
1837 		DBG(DBG_RX, ("get_packet: EOP m %p\n", m));
1838 		m->m_flags &= ~M_PKTHDR;
1839 		mh->mh_head->m_pkthdr.len += len;
1840 		mh->mh_tail->m_next = m;
1841 		mh->mh_tail = m;
1842 		ret = 1;
1843 		break;
1844 	}
1845 	if (++fl->cidx == fl->size)
1846 		fl->cidx = 0;
1847 
1848 	return (ret);
1849 }
1850 
1851 
1852 /**
1853  *	handle_rsp_cntrl_info - handles control information in a response
1854  *	@qs: the queue set corresponding to the response
1855  *	@flags: the response control flags
1856  *
1857  *	Handles the control information of an SGE response, such as GTS
1858  *	indications and completion credits for the queue set's Tx queues.
1859  *	HW coalesces credits, we don't do any extra SW coalescing.
1860  *	HW coalesces credits; we don't do any extra SW coalescing.
1861 static __inline void
1862 handle_rsp_cntrl_info(struct sge_qset *qs, uint32_t flags)
1863 {
1864 	unsigned int credits;
1865 
1866 #if USE_GTS
1867 	if (flags & F_RSPD_TXQ0_GTS)
1868 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags);
1869 #endif
1870 	credits = G_RSPD_TXQ0_CR(flags);
1871 	if (credits) {
1872 		qs->txq[TXQ_ETH].processed += credits;
1873 		if (desc_reclaimable(&qs->txq[TXQ_ETH]) > TX_START_MAX_DESC)
1874 			taskqueue_enqueue(qs->port->adapter->tq,
1875 			    &qs->port->adapter->timer_reclaim_task);
1876 	}
1877 
1878 	credits = G_RSPD_TXQ2_CR(flags);
1879 	if (credits)
1880 		qs->txq[TXQ_CTRL].processed += credits;
1881 
1882 #if USE_GTS
1883 	if (flags & F_RSPD_TXQ1_GTS)
1884 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags);
1885 #endif
1886 	credits = G_RSPD_TXQ1_CR(flags);
1887 	if (credits)
1888 		qs->txq[TXQ_OFLD].processed += credits;
1889 }
1890 
1891 static void
1892 check_ring_db(adapter_t *adap, struct sge_qset *qs,
1893     unsigned int sleeping)
1894 {
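	/*
	 * Intentionally a no-op in this revision: the GTS "sleeping" bits
	 * collected by the caller are ignored and no doorbell is rung.
	 */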
1895 	;
1896 }
1897 
1898 /*
1899  * This is an awful hack to bind the ithread to the requested CPU
1900  * to work around the lack of ithread affinity
1901  */
1902 static void
1903 bind_ithread(int cpu)
1904 {
1905 	KASSERT(cpu < mp_ncpus, ("invalid cpu identifier"));
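	/*
	 * The actual sched_bind() call is compiled out below, so for now
	 * this is a no-op beyond the sanity check.
	 */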
1906 #if 0
1907 	if (mp_ncpus > 1) {
1908 		mtx_lock_spin(&sched_lock);
1909 		sched_bind(curthread, cpu);
1910 		mtx_unlock_spin(&sched_lock);
1911 	}
1912 #endif
1913 }
1914 
1915 /**
1916  *	process_responses - process responses from an SGE response queue
1917  *	@adap: the adapter
1918  *	@qs: the queue set to which the response queue belongs
1919  *	@budget: how many responses can be processed in this round
1920  *
1921  *	Process responses from an SGE response queue up to the supplied budget.
1922  *	Responses include received packets as well as credits and other events
1923  *	for the queues that belong to the response queue's queue set.
1924  *	A negative budget is effectively unlimited.
1925  *
1926  *	Additionally choose the interrupt holdoff time for the next interrupt
1927  *	on this queue.  If the system is under memory shortage use a fairly
1928  *	long delay to help recovery.
1929  */
1930 static int
1931 process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
1932 {
1933 	struct sge_rspq *rspq = &qs->rspq;
1934 	struct rsp_desc *r = &rspq->desc[rspq->cidx];
1935 	int budget_left = budget;
1936 	unsigned int sleeping = 0;
1937 	int lro = qs->lro.enabled;
1938 
1939 	static uint8_t pinned[MAXCPU];
1940 
1941 #ifdef DEBUG
1942 	static int last_holdoff = 0;
1943 	if (rspq->holdoff_tmr != last_holdoff) {
1944 		printf("next_holdoff=%d\n", rspq->holdoff_tmr);
1945 		last_holdoff = rspq->holdoff_tmr;
1946 	}
1947 #endif
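	/*
	 * Pin the ithread servicing this response queue the first time we
	 * run on it; the index into pinned[] is derived from the queue's
	 * context id.
	 */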
1948 	if (pinned[qs->rspq.cntxt_id * adap->params.nports] == 0) {
1949 		/*
1950 		 * Assumes that cntxt_id < mp_ncpus
1951 		 */
1952 		bind_ithread(qs->rspq.cntxt_id);
1953 		pinned[qs->rspq.cntxt_id * adap->params.nports] = 1;
1954 	}
1955 	rspq->next_holdoff = rspq->holdoff_tmr;
1956 
1957 	while (__predict_true(budget_left && is_new_response(r, rspq))) {
1958 		int eth, eop = 0, ethpad = 0;
1959 		uint32_t flags = ntohl(r->flags);
1960 		uint32_t rss_csum = *(const uint32_t *)r;
1961 		uint32_t rss_hash = r->rss_hdr.rss_hash_val;
1962 
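		/* CPL_RX_PKT marks an ordinary Ethernet ingress packet. */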
1963 		eth = (r->rss_hdr.opcode == CPL_RX_PKT);
1964 
1965 		if (__predict_false(flags & F_RSPD_ASYNC_NOTIF)) {
1966 			/* XXX */
1967 			printf("async notification\n");
1968 
1969 		} else if  (flags & F_RSPD_IMM_DATA_VALID) {
1970 			if (cxgb_debug)
1971 				printf("IMM DATA VALID\n");
1972 
1973 			if (get_imm_packet(adap, r, &rspq->mh) == 0) {
1974 				rspq->next_holdoff = NOMEM_INTR_DELAY;
1975 				budget_left--;
1976 				break;
1977 			} else {
1978 				eop = 1;
1979 			}
1980 
1981 			rspq->imm_data++;
1982 		} else if (r->len_cq) {
1983 			int drop_thresh = eth ? SGE_RX_DROP_THRES : 0;
1984 			struct mbuf *m;
1985 
1986 			m = m_gethdr(M_NOWAIT, MT_DATA);
1987 
1988 			if (m == NULL) {
1989 				log(LOG_WARNING, "failed to get mbuf for packet\n");
1990 				break;
1991 			}
1992 
1993 			ethpad = 2;
1994 			eop = get_packet(adap, drop_thresh, qs, &rspq->mh, r, m);
1995 		} else {
1996 			DPRINTF("pure response\n");
1997 			rspq->pure_rsps++;
1998 		}
1999 
2000 		if (flags & RSPD_CTRL_MASK) {
2001 			sleeping |= flags & RSPD_GTS_MASK;
2002 			handle_rsp_cntrl_info(qs, flags);
2003 		}
2004 
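		/*
		 * Advance to the next response descriptor, wrapping around
		 * the ring and flipping the generation bit so that
		 * is_new_response() can tell fresh entries from stale ones.
		 */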
2005 		r++;
2006 		if (__predict_false(++rspq->cidx == rspq->size)) {
2007 			rspq->cidx = 0;
2008 			rspq->gen ^= 1;
2009 			r = rspq->desc;
2010 		}
2011 
2012 		prefetch(r);
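		/*
		 * Return credits to the response queue once we have consumed
		 * a quarter of the ring.
		 */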
2013 		if (++rspq->credits >= (rspq->size / 4)) {
2014 			refill_rspq(adap, rspq, rspq->credits);
2015 			rspq->credits = 0;
2016 		}
2017 
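		/*
		 * On the final fragment of a packet, hand the assembled mbuf
		 * chain to the Ethernet receive/LRO path (offload delivery is
		 * still stubbed out) and top up both free lists.
		 */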
2018 		if (eop) {
2019 			prefetch(rspq->mh.mh_head->m_data);
2020 			prefetch(rspq->mh.mh_head->m_data + L1_CACHE_BYTES);
2021 
2022 			if (eth) {
2023 				t3_rx_eth_lro(adap, rspq, &rspq->mh, ethpad,
2024 				    rss_hash, rss_csum, lro);
2025 
2026 				rspq->mh.mh_tail = rspq->mh.mh_head = NULL;
2027 			} else {
2028 #ifdef notyet
2029 				if (__predict_false(r->rss_hdr.opcode == CPL_TRACE_PKT))
2030 					m_adj(m, 2);
2031 
2032 				rx_offload(&adap->tdev, rspq, m);
2033 #endif
2034 			}
2035 #ifdef notyet
2036 			taskqueue_enqueue(adap->tq, &adap->timer_reclaim_task);
2037 #else
2038 			__refill_fl(adap, &qs->fl[0]);
2039 			__refill_fl(adap, &qs->fl[1]);
2040 #endif
2041 
2042 		}
2043 		--budget_left;
2044 	}
2045 	t3_sge_lro_flush_all(adap, qs);
2046 	deliver_partial_bundle(&adap->tdev, rspq);
2047 
2048 	if (sleeping)
2049 		check_ring_db(adap, qs, sleeping);
2050 
2051 	smp_mb();  /* commit Tx queue processed updates */
2052 	if (__predict_false(qs->txq_stopped != 0))
2053 		restart_tx(qs);
2054 
2055 	budget -= budget_left;
2056 	return (budget);
2057 }
2058 
2059 /*
2060  * A helper function that processes responses and issues GTS.
2061  */
2062 static __inline int
2063 process_responses_gts(adapter_t *adap, struct sge_rspq *rq)
2064 {
2065 	int work;
2066 	static int last_holdoff = 0;
2067 
2068 	work = process_responses(adap, rspq_to_qset(rq), -1);
2069 
2070 	if (cxgb_debug && (rq->next_holdoff != last_holdoff)) {
2071 		printf("next_holdoff=%d\n", rq->next_holdoff);
2072 		last_holdoff = rq->next_holdoff;
2073 	}
2074 
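	/*
	 * Acknowledge the work done: update the queue's consumer index and
	 * re-arm the interrupt with the chosen holdoff timer via the GTS
	 * register.
	 */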
2075 	t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) |
2076 		     V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx));
2077 	return (work);
2078 }
2079 
2080 
2081 /*
2082  * Interrupt handler for legacy INTx interrupts for T3B-based cards.
2083  * Handles data events from SGE response queues as well as error and other
2084  * async events as they all use the same interrupt pin.  We use one SGE
2085  * response queue per port in this mode and protect all response queues with
2086  * queue 0's lock.
2087  */
2088 void
2089 t3b_intr(void *data)
2090 {
2091 	uint32_t map;
2092 	adapter_t *adap = data;
2093 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
2094 	struct sge_rspq *q1 = &adap->sge.qs[1].rspq;
2095 
2096 	t3_write_reg(adap, A_PL_CLI, 0);
2097 	map = t3_read_reg(adap, A_SG_DATA_INTR);
2098 
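	/*
	 * The data-interrupt register is a bitmap of response queues with
	 * pending work: bit 0 is queue 0, bit 1 is queue 1, and F_ERRINTR
	 * flags error/async events handled by the slow interrupt task.
	 */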
2099 	if (!map)
2100 		return;
2101 
2102 	if (__predict_false(map & F_ERRINTR))
2103 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
2104 
2105 	mtx_lock(&q0->lock);
2106 
2107 	if (__predict_true(map & 1))
2108 		process_responses_gts(adap, q0);
2109 
2110 	if (map & 2)
2111 		process_responses_gts(adap, q1);
2112 
2113 	mtx_unlock(&q0->lock);
2114 }
2115 
2116 /*
2117  * The MSI interrupt handler.  This needs to handle data events from SGE
2118  * response queues as well as error and other async events as they all use
2119  * the same MSI vector.  We use one SGE response queue per port in this mode
2120  * and protect all response queues with queue 0's lock.
2121  */
2122 void
2123 t3_intr_msi(void *data)
2124 {
2125 	adapter_t *adap = data;
2126 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
2127 	struct sge_rspq *q1 = &adap->sge.qs[1].rspq;
2128 	int new_packets = 0;
2129 
2130 	mtx_lock(&q0->lock);
2131 	if (process_responses_gts(adap, q0)) {
2132 		new_packets = 1;
2133 	}
2134 
2135 	if (adap->params.nports == 2 &&
2136 	    process_responses_gts(adap, q1)) {
2137 		new_packets = 1;
2138 	}
2139 
2140 	mtx_unlock(&q0->lock);
2141 	if (new_packets == 0)
2142 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
2143 }
2144 
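/*
 * The MSI-X interrupt handler.  Each queue set has its own vector, so this
 * handler only needs to service the response queue it was registered for,
 * under that queue's own lock.
 */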
2145 void
2146 t3_intr_msix(void *data)
2147 {
2148 	struct sge_qset *qs = data;
2149 	adapter_t *adap = qs->port->adapter;
2150 	struct sge_rspq *rspq = &qs->rspq;
2151 
2152 	mtx_lock(&rspq->lock);
2153 	if (process_responses_gts(adap, rspq) == 0) {
2154 #ifdef notyet
2155 		rspq->unhandled_irqs++;
2156 #endif
2157 	}
2158 	mtx_unlock(&rspq->lock);
2159 }
2160 
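/*
 * Sysctl handler that toggles large receive offload.  The new setting is
 * applied to every queue set on the adapter.
 */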
2161 static int
2162 t3_lro_enable(SYSCTL_HANDLER_ARGS)
2163 {
2164 	adapter_t *sc;
2165 	int i, j, enabled, err, nqsets = 0;
2166 
2167 	sc = arg1;
2168 	enabled = sc->sge.qs[0].lro.enabled;
2169 	err = sysctl_handle_int(oidp, &enabled, arg2, req);
2170 
2171 	if (err != 0) {
2172 		return (err);
2173 	}
2174 	if (enabled == sc->sge.qs[0].lro.enabled)
2175 		return (0);
2176 
2177 	for (i = 0; i < sc->params.nports; i++)
2178 		for (j = 0; j < sc->port[i].nqsets; j++)
2179 			nqsets++;
2180 
2181 	for (i = 0; i < nqsets; i++) {
2182 		sc->sge.qs[i].lro.enabled = enabled;
2183 	}
2184 
2185 	return (0);
2186 }
2187 
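/*
 * Sysctl handler for the interrupt coalescing timer.  The value (in
 * nanoseconds, clamped to a minimum of 100) is applied to every queue set
 * and pushed to the hardware through the GTS register.
 */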
2188 static int
2189 t3_set_coalesce_nsecs(SYSCTL_HANDLER_ARGS)
2190 {
2191 	adapter_t *sc = arg1;
2192 	struct qset_params *qsp = &sc->params.sge.qset[0];
2193 	int coalesce_nsecs;
2194 	struct sge_qset *qs;
2195 	int i, j, err, nqsets = 0;
2196 	struct mtx *lock;
2197 
2198 	coalesce_nsecs = qsp->coalesce_nsecs;
2199 	err = sysctl_handle_int(oidp, &coalesce_nsecs, arg2, req);
2200 
2201 	if (err != 0) {
2202 		return (err);
2203 	}
2204 	if (coalesce_nsecs == qsp->coalesce_nsecs)
2205 		return (0);
2206 
2207 	for (i = 0; i < sc->params.nports; i++)
2208 		for (j = 0; j < sc->port[i].nqsets; j++)
2209 			nqsets++;
2210 
2211 	coalesce_nsecs = max(100, coalesce_nsecs);
2212 
2213 	for (i = 0; i < nqsets; i++) {
2214 		qs = &sc->sge.qs[i];
2215 		qsp = &sc->params.sge.qset[i];
2216 		qsp->coalesce_nsecs = coalesce_nsecs;
2217 
2218 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
2219 			    &sc->sge.qs[0].rspq.lock;
2220 
2221 		mtx_lock(lock);
2222 		t3_update_qset_coalesce(qs, qsp);
2223 		t3_write_reg(sc, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) |
2224 		    V_NEWTIMER(qs->rspq.holdoff_tmr));
2225 		mtx_unlock(lock);
2226 	}
2227 
2228 	return (0);
2229 }
2230 
2231 
2232 void
2233 t3_add_sysctls(adapter_t *sc)
2234 {
2235 	struct sysctl_ctx_list *ctx;
2236 	struct sysctl_oid_list *children;
2237 
2238 	ctx = device_get_sysctl_ctx(sc->dev);
2239 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
2240 
2241 	/* random information */
2242 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
2243 	    "firmware_version",
2244 	    CTLFLAG_RD, &sc->fw_version,
2245 	    0, "firmware version");
2246 
2247 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
2248 	    "enable_lro",
2249 	    CTLTYPE_INT|CTLFLAG_RW, sc,
2250 	    0, t3_lro_enable,
2251 	    "I", "enable large receive offload");
2252 
2253 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
2254 	    "intr_coal",
2255 	    CTLTYPE_INT|CTLFLAG_RW, sc,
2256 	    0, t3_set_coalesce_nsecs,
2257 	    "I", "interrupt coalescing timer (ns)");
2258 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
2259 	    "enable_debug",
2260 	    CTLFLAG_RW, &cxgb_debug,
2261 	    0, "enable verbose debugging output");
2262 
2263 }
2264 
2265 /**
2266  *	t3_get_desc - dump an SGE descriptor for debugging purposes
2267  *	@qs: the queue set
2268  *	@qnum: identifies the specific queue (0..2: Tx, 3:response, 4..5: Rx)
2269  *	@idx: the descriptor index in the queue
2270  *	@data: where to dump the descriptor contents
2271  *
2272  *	Dumps the contents of a HW descriptor of an SGE queue.  Returns the
2273  *	size of the descriptor.
2274  */
2275 int
2276 t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx,
2277 		unsigned char *data)
2278 {
2279 	if (qnum >= 6)
2280 		return (EINVAL);
2281 
2282 	if (qnum < 3) {
2283 		if (!qs->txq[qnum].desc || idx >= qs->txq[qnum].size)
2284 			return (EINVAL);
2285 		memcpy(data, &qs->txq[qnum].desc[idx], sizeof(struct tx_desc));
2286 		return (sizeof(struct tx_desc));
2287 	}
2288 
2289 	if (qnum == 3) {
2290 		if (!qs->rspq.desc || idx >= qs->rspq.size)
2291 			return (EINVAL);
2292 		memcpy(data, &qs->rspq.desc[idx], sizeof(struct rsp_desc));
2293 		return (sizeof(struct rsp_desc));
2294 	}
2295 
2296 	qnum -= 4;
2297 	if (!qs->fl[qnum].desc || idx >= qs->fl[qnum].size)
2298 		return (EINVAL);
2299 	memcpy(data, &qs->fl[qnum].desc[idx], sizeof(struct rx_desc));
2300 	return (sizeof(struct rx_desc));
2301 }
2302