xref: /freebsd/sys/dev/cxgb/cxgb_sge.c (revision 0bb263df82e129f5f8c82da6deb55dfe10daa677)
1 /**************************************************************************
2 
3 Copyright (c) 2007, Chelsio Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Neither the name of the Chelsio Corporation nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15 
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27 
28 ***************************************************************************/
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/kernel.h>
36 #include <sys/module.h>
37 #include <sys/bus.h>
38 #include <sys/conf.h>
39 #include <machine/bus.h>
40 #include <machine/resource.h>
41 #include <sys/bus_dma.h>
42 #include <sys/rman.h>
43 #include <sys/queue.h>
44 #include <sys/sysctl.h>
45 #include <sys/taskqueue.h>
46 
47 
48 #include <sys/proc.h>
49 #include <sys/sched.h>
50 #include <sys/smp.h>
51 #include <sys/systm.h>
52 
53 #include <netinet/in_systm.h>
54 #include <netinet/in.h>
55 #include <netinet/ip.h>
56 #include <netinet/tcp.h>
57 
58 #include <dev/pci/pcireg.h>
59 #include <dev/pci/pcivar.h>
60 
61 #ifdef CONFIG_DEFINED
62 #include <cxgb_include.h>
63 #else
64 #include <dev/cxgb/cxgb_include.h>
65 #endif
66 
67 uint32_t collapse_free = 0;
68 uint32_t mb_free_vec_free = 0;
69 int      collapse_mbufs = 0;
70 static int recycle_enable = 1;
71 
72 
73 /*
74  * XXX GC
75  */
76 #define NET_XMIT_CN 2
77 #define NET_XMIT_SUCCESS 0
78 
79 #define USE_GTS 0
80 
81 #define SGE_RX_SM_BUF_SIZE	1536
82 #define SGE_RX_DROP_THRES	16
83 #define SGE_RX_COPY_THRES	128
84 
85 /*
86  * Period of the Tx buffer reclaim timer.  This timer does not need to run
87  * frequently as Tx buffers are usually reclaimed by new Tx packets.
88  */
89 #define TX_RECLAIM_PERIOD       (hz >> 1)
90 
91 /*
92  * work request size in bytes
93  */
94 #define WR_LEN (WR_FLITS * 8)
95 
96 /*
97  * Values for sge_txq.flags
98  */
99 enum {
100 	TXQ_RUNNING	= 1 << 0,  /* fetch engine is running */
101 	TXQ_LAST_PKT_DB = 1 << 1,  /* last packet rang the doorbell */
102 };
103 
104 struct tx_desc {
105 	uint64_t	flit[TX_DESC_FLITS];
106 } __packed;
107 
108 struct rx_desc {
109 	uint32_t	addr_lo;
110 	uint32_t	len_gen;
111 	uint32_t	gen2;
112 	uint32_t	addr_hi;
113 } __packed;
114 
115 struct rsp_desc {               /* response queue descriptor */
116 	struct rss_header	rss_hdr;
117 	uint32_t		flags;
118 	uint32_t		len_cq;
119 	uint8_t			imm_data[47];
120 	uint8_t			intr_gen;
121 } __packed;
122 
123 #define RX_SW_DESC_MAP_CREATED	(1 << 0)
124 #define TX_SW_DESC_MAP_CREATED	(1 << 1)
125 #define RX_SW_DESC_INUSE        (1 << 3)
126 #define TX_SW_DESC_MAPPED       (1 << 4)
127 
128 #define RSPQ_NSOP_NEOP           G_RSPD_SOP_EOP(0)
129 #define RSPQ_EOP                 G_RSPD_SOP_EOP(F_RSPD_EOP)
130 #define RSPQ_SOP                 G_RSPD_SOP_EOP(F_RSPD_SOP)
131 #define RSPQ_SOP_EOP             G_RSPD_SOP_EOP(F_RSPD_SOP|F_RSPD_EOP)
132 
133 struct tx_sw_desc {                /* SW state per Tx descriptor */
134 	struct mbuf	*m;
135 	bus_dmamap_t	map;
136 	int		flags;
137 };
138 
139 struct rx_sw_desc {                /* SW state per Rx descriptor */
140 	void	        *cl;
141 	bus_dmamap_t	map;
142 	int		flags;
143 };
144 
145 struct txq_state {
146 	unsigned int compl;
147 	unsigned int gen;
148 	unsigned int pidx;
149 };
150 
151 struct refill_fl_cb_arg {
152 	int               error;
153 	bus_dma_segment_t seg;
154 	int               nseg;
155 };
156 
157 /*
158  * Maps a number of flits to the number of Tx descriptors that can hold them.
159  * The formula is
160  *
161  * desc = 1 + (flits - 2) / (WR_FLITS - 1).
162  *
163  * HW allows up to 4 descriptors to be combined into a WR.
164  */
165 static uint8_t flit_desc_map[] = {
166 	0,
167 #if SGE_NUM_GENBITS == 1
168 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
169 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
170 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
171 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
172 #elif SGE_NUM_GENBITS == 2
173 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
174 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
175 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
176 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
177 #else
178 # error "SGE_NUM_GENBITS must be 1 or 2"
179 #endif
180 };
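/*
 * Illustrative example of the mapping (assuming SGE_NUM_GENBITS == 2, where
 * the last flit of each descriptor is reserved for the generation bit and
 * WR_FLITS is therefore 15): a work request of 16 flits needs
 * 1 + (16 - 2) / (15 - 1) = 2 descriptors, matching flit_desc_map[16] == 2
 * in the table above.
 */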
181 
182 
183 static int lro_default = 0;
184 int cxgb_debug = 0;
185 
186 static void t3_free_qset(adapter_t *sc, struct sge_qset *q);
187 static void sge_timer_cb(void *arg);
188 static void sge_timer_reclaim(void *arg, int ncount);
189 static int free_tx_desc(adapter_t *sc, struct sge_txq *q, int n, struct mbuf **m_vec);
190 
191 /**
192  *	reclaim_completed_tx - reclaims completed Tx descriptors
193  *	@adapter: the adapter
194  *	@q: the Tx queue to reclaim completed descriptors from
195  *
196  *	Reclaims Tx descriptors that the SGE has indicated it has processed,
197  *	and collects up to @nbufs of the associated mbufs into @mvec for the
198  *	caller to free.  Called with the Tx queue's lock held.
199  */
200 static __inline int
201 reclaim_completed_tx(adapter_t *adapter, struct sge_txq *q, int nbufs, struct mbuf **mvec)
202 {
203 	int reclaimed, reclaim = desc_reclaimable(q);
204 	int n = 0;
205 
206 	mtx_assert(&q->lock, MA_OWNED);
207 	if (reclaim > 0) {
208 		n = free_tx_desc(adapter, q, min(reclaim, nbufs), mvec);
209 		reclaimed = min(reclaim, nbufs);
210 		q->cleaned += reclaimed;
211 		q->in_use -= reclaimed;
212 	}
213 	return (n);
214 }
215 
216 /**
217  *	should_restart_tx - are there enough resources to restart a Tx queue?
218  *	@q: the Tx queue
219  *
220  *	Checks if there are enough descriptors to restart a suspended Tx queue.
221  */
222 static __inline int
223 should_restart_tx(const struct sge_txq *q)
224 {
225 	unsigned int r = q->processed - q->cleaned;
226 
227 	return q->in_use - r < (q->size >> 1);
228 }
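/*
 * In other words, a suspended queue is restarted once the number of
 * descriptors still outstanding (in_use minus those processed by the SGE
 * but not yet cleaned) has dropped below half of the ring size.
 */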
229 
230 /**
231  *	t3_sge_init - initialize SGE
232  *	@adap: the adapter
233  *	@p: the SGE parameters
234  *
235  *	Performs SGE initialization needed every time after a chip reset.
236  *	We do not initialize any of the queue sets here, instead the driver
237  *	top-level must request those individually.  We also do not enable DMA
238  *	here, that should be done after the queues have been set up.
239  */
240 void
241 t3_sge_init(adapter_t *adap, struct sge_params *p)
242 {
243 	u_int ctrl, ups;
244 
245 	ups = 0; /* = ffs(pci_resource_len(adap->pdev, 2) >> 12); */
246 
247 	ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL |
248 	       F_CQCRDTCTRL |
249 	       V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS |
250 	       V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING;
251 #if SGE_NUM_GENBITS == 1
252 	ctrl |= F_EGRGENCTRL;
253 #endif
254 	if (adap->params.rev > 0) {
255 		if (!(adap->flags & (USING_MSIX | USING_MSI)))
256 			ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ;
257 		ctrl |= F_CQCRDTCTRL | F_AVOIDCQOVFL;
258 	}
259 	t3_write_reg(adap, A_SG_CONTROL, ctrl);
260 	t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) |
261 		     V_LORCQDRBTHRSH(512));
262 	t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10);
263 	t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) |
264 		     V_TIMEOUT(200 * core_ticks_per_usec(adap)));
265 	t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH, 1000);
266 	t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256);
267 	t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000);
268 	t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256);
269 	t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff));
270 	t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024);
271 }
272 
273 
274 /**
275  *	sgl_len - calculates the size of an SGL of the given capacity
276  *	@n: the number of SGL entries
277  *
278  *	Calculates the number of flits needed for a scatter/gather list that
279  *	can hold the given number of entries.
280  */
281 static __inline unsigned int
282 sgl_len(unsigned int n)
283 {
284 	return ((3 * n) / 2 + (n & 1));
285 }
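/*
 * Each sg_ent holds two address/length pairs in 3 flits (one flit for the two
 * 32-bit lengths, two flits for the two 64-bit addresses); an odd trailing
 * entry needs only 2 flits.  For example, sgl_len(3) = (3 * 3) / 2 + 1 = 5.
 */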
286 
287 /**
288  *	get_imm_packet - extract the immediate data from a response
289  *	@resp: the response descriptor containing the packet data
290  *
291  *	Copies the immediate data of the given response into the supplied mbuf.
292  */
293 static __inline void
294 get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct mbuf *m, void *cl)
295 {
296 	int len;
297 	uint32_t flags = ntohl(resp->flags);
298 	uint8_t sopeop = G_RSPD_SOP_EOP(flags);
299 
300 	/*
301 	 * would be a firmware bug
302 	 */
303 	if (sopeop == RSPQ_NSOP_NEOP || sopeop == RSPQ_SOP)
304 		return;
305 
306 	len = G_RSPD_LEN(ntohl(resp->len_cq));
307 	switch (sopeop) {
308 	case RSPQ_SOP_EOP:
309 		m->m_len = m->m_pkthdr.len = len;
310 		memcpy(mtod(m, uint8_t *), resp->imm_data, len);
311 		break;
312 	case RSPQ_EOP:
313 		memcpy(cl, resp->imm_data, len);
314 		m_iovappend(m, cl, MSIZE, len, 0);
315 		break;
316 	}
317 }
318 
319 
320 static __inline u_int
321 flits_to_desc(u_int n)
322 {
323 	return (flit_desc_map[n]);
324 }
325 
326 void
327 t3_sge_err_intr_handler(adapter_t *adapter)
328 {
329 	unsigned int v, status;
330 
331 
332 	status = t3_read_reg(adapter, A_SG_INT_CAUSE);
333 
334 	if (status & F_RSPQCREDITOVERFOW)
335 		CH_ALERT(adapter, "SGE response queue credit overflow\n");
336 
337 	if (status & F_RSPQDISABLED) {
338 		v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS);
339 
340 		CH_ALERT(adapter,
341 			 "packet delivered to disabled response queue (0x%x)\n",
342 			 (v >> S_RSPQ0DISABLED) & 0xff);
343 	}
344 
345 	t3_write_reg(adapter, A_SG_INT_CAUSE, status);
346 	if (status & (F_RSPQCREDITOVERFOW | F_RSPQDISABLED))
347 		t3_fatal_err(adapter);
348 }
349 
350 void
351 t3_sge_prep(adapter_t *adap, struct sge_params *p)
352 {
353 	int i;
354 
355 	/* XXX Does ETHER_ALIGN need to be accounted for here? */
356 	p->max_pkt_size = MJUM16BYTES - sizeof(struct cpl_rx_data);
357 
358 	for (i = 0; i < SGE_QSETS; ++i) {
359 		struct qset_params *q = p->qset + i;
360 
361 		q->polling = adap->params.rev > 0;
362 
363 		q->coalesce_nsecs = 5000;
364 
365 		q->rspq_size = RSPQ_Q_SIZE;
366 		q->fl_size = FL_Q_SIZE;
367 		q->jumbo_size = JUMBO_Q_SIZE;
368 		q->txq_size[TXQ_ETH] = TX_ETH_Q_SIZE;
369 		q->txq_size[TXQ_OFLD] = 1024;
370 		q->txq_size[TXQ_CTRL] = 256;
371 		q->cong_thres = 0;
372 	}
373 }
374 
375 int
376 t3_sge_alloc(adapter_t *sc)
377 {
378 
379 	/* The parent tag. */
380 	if (bus_dma_tag_create( NULL,			/* parent */
381 				1, 0,			/* algnmnt, boundary */
382 				BUS_SPACE_MAXADDR,	/* lowaddr */
383 				BUS_SPACE_MAXADDR,	/* highaddr */
384 				NULL, NULL,		/* filter, filterarg */
385 				BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
386 				BUS_SPACE_UNRESTRICTED, /* nsegments */
387 				BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
388 				0,			/* flags */
389 				NULL, NULL,		/* lock, lockarg */
390 				&sc->parent_dmat)) {
391 		device_printf(sc->dev, "Cannot allocate parent DMA tag\n");
392 		return (ENOMEM);
393 	}
394 
395 	/*
396 	 * DMA tag for normal sized RX frames
397 	 */
398 	if (bus_dma_tag_create(sc->parent_dmat, MCLBYTES, 0, BUS_SPACE_MAXADDR,
399 		BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, 1,
400 		MCLBYTES, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_dmat)) {
401 		device_printf(sc->dev, "Cannot allocate RX DMA tag\n");
402 		return (ENOMEM);
403 	}
404 
405 	/*
406 	 * DMA tag for jumbo sized RX frames.
407 	 */
408 	if (bus_dma_tag_create(sc->parent_dmat, MJUMPAGESIZE, 0, BUS_SPACE_MAXADDR,
409 		BUS_SPACE_MAXADDR, NULL, NULL, MJUMPAGESIZE, 1, MJUMPAGESIZE,
410 		BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_jumbo_dmat)) {
411 		device_printf(sc->dev, "Cannot allocate RX jumbo DMA tag\n");
412 		return (ENOMEM);
413 	}
414 
415 	/*
416 	 * DMA tag for TX frames.
417 	 */
418 	if (bus_dma_tag_create(sc->parent_dmat, 1, 0, BUS_SPACE_MAXADDR,
419 		BUS_SPACE_MAXADDR, NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
420 		TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
421 		NULL, NULL, &sc->tx_dmat)) {
422 		device_printf(sc->dev, "Cannot allocate TX DMA tag\n");
423 		return (ENOMEM);
424 	}
425 
426 	return (0);
427 }
428 
429 int
430 t3_sge_free(struct adapter * sc)
431 {
432 
433 	if (sc->tx_dmat != NULL)
434 		bus_dma_tag_destroy(sc->tx_dmat);
435 
436 	if (sc->rx_jumbo_dmat != NULL)
437 		bus_dma_tag_destroy(sc->rx_jumbo_dmat);
438 
439 	if (sc->rx_dmat != NULL)
440 		bus_dma_tag_destroy(sc->rx_dmat);
441 
442 	if (sc->parent_dmat != NULL)
443 		bus_dma_tag_destroy(sc->parent_dmat);
444 
445 	return (0);
446 }
447 
448 void
449 t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p)
450 {
451 
452 	qs->rspq.holdoff_tmr = max(p->coalesce_nsecs/100, 1U);
453 	qs->rspq.polling = 0 /* p->polling */;
454 }
455 
456 static void
457 refill_fl_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
458 {
459 	struct refill_fl_cb_arg *cb_arg = arg;
460 
461 	cb_arg->error = error;
462 	cb_arg->seg = segs[0];
463 	cb_arg->nseg = nseg;
464 
465 }
466 
467 /**
468  *	refill_fl - refill an SGE free-buffer list
469  *	@sc: the controller softc
470  *	@q: the free-list to refill
471  *	@n: the number of new buffers to allocate
472  *
473  *	(Re)populate an SGE free-buffer list with up to @n new packet buffers.
474  *	The caller must assure that @n does not exceed the queue's capacity.
475  */
476 static void
477 refill_fl(adapter_t *sc, struct sge_fl *q, int n)
478 {
479 	struct rx_sw_desc *sd = &q->sdesc[q->pidx];
480 	struct rx_desc *d = &q->desc[q->pidx];
481 	struct refill_fl_cb_arg cb_arg;
482 	void *cl;
483 	int err;
484 
485 	cb_arg.error = 0;
486 	while (n--) {
487 		/*
488 		 * We only allocate a cluster, mbuf allocation happens after rx
489 		 */
490 		if ((cl = m_cljget(NULL, M_DONTWAIT, q->buf_size)) == NULL) {
491 			log(LOG_WARNING, "Failed to allocate cluster\n");
492 			goto done;
493 		}
494 		if ((sd->flags & RX_SW_DESC_MAP_CREATED) == 0) {
495 			if ((err = bus_dmamap_create(q->entry_tag, 0, &sd->map))) {
496 				log(LOG_WARNING, "bus_dmamap_create failed %d\n", err);
497 				uma_zfree(q->zone, cl);
498 				goto done;
499 			}
500 			sd->flags |= RX_SW_DESC_MAP_CREATED;
501 		}
502 		err = bus_dmamap_load(q->entry_tag, sd->map, cl, q->buf_size,
503 		    refill_fl_cb, &cb_arg, 0);
504 
505 		if (err != 0 || cb_arg.error) {
506 			log(LOG_WARNING, "failure in refill_fl %d\n", cb_arg.error);
507 			/* free the cluster rather than leaking it */
508 			uma_zfree(q->zone, cl);
509 			return;
511 		}
512 
513 		sd->flags |= RX_SW_DESC_INUSE;
514 		sd->cl = cl;
515 		d->addr_lo = htobe32(cb_arg.seg.ds_addr & 0xffffffff);
516 		d->addr_hi = htobe32(((uint64_t)cb_arg.seg.ds_addr >>32) & 0xffffffff);
517 		d->len_gen = htobe32(V_FLD_GEN1(q->gen));
518 		d->gen2 = htobe32(V_FLD_GEN2(q->gen));
519 
520 		d++;
521 		sd++;
522 
523 		if (++q->pidx == q->size) {
524 			q->pidx = 0;
525 			q->gen ^= 1;
526 			sd = q->sdesc;
527 			d = q->desc;
528 		}
529 		q->credits++;
530 	}
531 
532 done:
533 	t3_write_reg(sc, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
534 }
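/*
 * Each free-list descriptor written above carries the cluster's bus address
 * split across addr_lo/addr_hi and the queue's current generation bit encoded
 * in both len_gen and gen2; the generation bits let the hardware distinguish
 * freshly written descriptors from stale ones after the producer index wraps.
 */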
535 
536 
537 /**
538  *	free_rx_bufs - free the Rx buffers on an SGE free list
539  *	@sc: the controller softc
540  *	@q: the SGE free list to clean up
541  *
542  *	Release the buffers on an SGE free-buffer Rx queue.  HW fetching from
543  *	this queue should be stopped before calling this function.
544  */
545 static void
546 free_rx_bufs(adapter_t *sc, struct sge_fl *q)
547 {
548 	u_int cidx = q->cidx;
549 
550 	while (q->credits--) {
551 		struct rx_sw_desc *d = &q->sdesc[cidx];
552 
553 		if (d->flags & RX_SW_DESC_INUSE) {
554 			bus_dmamap_unload(q->entry_tag, d->map);
555 			bus_dmamap_destroy(q->entry_tag, d->map);
556 			uma_zfree(q->zone, d->cl);
557 		}
558 		d->cl = NULL;
559 		if (++cidx == q->size)
560 			cidx = 0;
561 	}
562 }
563 
564 static __inline void
565 __refill_fl(adapter_t *adap, struct sge_fl *fl)
566 {
567 	refill_fl(adap, fl, min(16U, fl->size - fl->credits));
568 }
569 
570 /**
571  *	recycle_rx_buf - recycle a receive buffer
572  *	@adapter: the adapter
573  *	@q: the SGE free list
574  *	@idx: index of buffer to recycle
575  *
576  *	Recycles the specified buffer on the given free list by adding it at
577  *	the next available slot on the list.
578  */
579 static void
580 recycle_rx_buf(adapter_t *adap, struct sge_fl *q, unsigned int idx)
581 {
582 	struct rx_desc *from = &q->desc[idx];
583 	struct rx_desc *to   = &q->desc[q->pidx];
584 
585 	q->sdesc[q->pidx] = q->sdesc[idx];
586 	to->addr_lo = from->addr_lo;        // already big endian
587 	to->addr_hi = from->addr_hi;        // likewise
588 	wmb();
589 	to->len_gen = htobe32(V_FLD_GEN1(q->gen));
590 	to->gen2 = htobe32(V_FLD_GEN2(q->gen));
591 	q->credits++;
592 
593 	if (++q->pidx == q->size) {
594 		q->pidx = 0;
595 		q->gen ^= 1;
596 	}
597 	t3_write_reg(adap, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
598 }
599 
600 static void
601 alloc_ring_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
602 {
603 	uint32_t *addr;
604 
605 	addr = arg;
606 	*addr = segs[0].ds_addr;
607 }
608 
609 static int
610 alloc_ring(adapter_t *sc, size_t nelem, size_t elem_size, size_t sw_size,
611     bus_addr_t *phys, void *desc, void *sdesc, bus_dma_tag_t *tag,
612     bus_dmamap_t *map, bus_dma_tag_t parent_entry_tag, bus_dma_tag_t *entry_tag)
613 {
614 	size_t len = nelem * elem_size;
615 	void *s = NULL;
616 	void *p = NULL;
617 	int err;
618 
619 	if ((err = bus_dma_tag_create(sc->parent_dmat, PAGE_SIZE, 0,
620 				      BUS_SPACE_MAXADDR_32BIT,
621 				      BUS_SPACE_MAXADDR, NULL, NULL, len, 1,
622 				      len, 0, NULL, NULL, tag)) != 0) {
623 		device_printf(sc->dev, "Cannot allocate descriptor tag\n");
624 		return (ENOMEM);
625 	}
626 
627 	if ((err = bus_dmamem_alloc(*tag, (void **)&p, BUS_DMA_NOWAIT,
628 				    map)) != 0) {
629 		device_printf(sc->dev, "Cannot allocate descriptor memory\n");
630 		return (ENOMEM);
631 	}
632 
633 	bus_dmamap_load(*tag, *map, p, len, alloc_ring_cb, phys, 0);
634 	bzero(p, len);
635 	*(void **)desc = p;
636 
637 	if (sw_size) {
638 		len = nelem * sw_size;
639 		s = malloc(len, M_DEVBUF, M_WAITOK);
640 		bzero(s, len);
641 		*(void **)sdesc = s;
642 	}
643 	if (parent_entry_tag == NULL)
644 		return (0);
645 
646 	if ((err = bus_dma_tag_create(parent_entry_tag, 1, 0,
647 				      BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR,
648 		                      NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
649 				      TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
650 		                      NULL, NULL, entry_tag)) != 0) {
651 		device_printf(sc->dev, "Cannot allocate descriptor entry tag\n");
652 		return (ENOMEM);
653 	}
654 	return (0);
655 }
656 
657 static void
658 sge_slow_intr_handler(void *arg, int ncount)
659 {
660 	adapter_t *sc = arg;
661 
662 	t3_slow_intr_handler(sc);
663 }
664 
665 static void
666 sge_timer_cb(void *arg)
667 {
668 	adapter_t *sc = arg;
669 	struct port_info *p;
670 	struct sge_qset *qs;
671 	struct sge_txq  *txq;
672 	int i, j;
673 	int reclaim_eth, reclaim_ofl, refill_rx;
674 
675 	for (i = 0; i < sc->params.nports; i++)
676 		for (j = 0; j < sc->port[i].nqsets; j++) {
677 			qs = &sc->sge.qs[sc->port[i].first_qset + j];
678 			txq = &qs->txq[0];
679 			reclaim_eth = txq[TXQ_ETH].processed - txq[TXQ_ETH].cleaned;
680 			reclaim_ofl = txq[TXQ_OFLD].processed - txq[TXQ_OFLD].cleaned;
681 			refill_rx = ((qs->fl[0].credits < qs->fl[0].size) ||
682 			    (qs->fl[1].credits < qs->fl[1].size));
683 			if (reclaim_eth || reclaim_ofl || refill_rx) {
684 				p = &sc->port[i];
685 				taskqueue_enqueue(p->tq, &p->timer_reclaim_task);
686 				break;
687 			}
688 		}
689 	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
690 }
691 
692 /*
693  * This is meant to be a catch-all function to keep sge state private
694  * to sge.c
695  *
696  */
697 int
698 t3_sge_init_adapter(adapter_t *sc)
699 {
700 	callout_init(&sc->sge_timer_ch, CALLOUT_MPSAFE);
701 	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
702 	TASK_INIT(&sc->slow_intr_task, 0, sge_slow_intr_handler, sc);
703 	return (0);
704 }
705 
706 int
707 t3_sge_init_port(struct port_info *p)
708 {
709 	TASK_INIT(&p->timer_reclaim_task, 0, sge_timer_reclaim, p);
710 	TASK_INIT(&p->timer_reclaim_task, 0, sge_timer_reclaim, p);
	return (0);
711 
712 void
713 t3_sge_deinit_sw(adapter_t *sc)
714 {
715 	int i;
716 
717 	callout_drain(&sc->sge_timer_ch);
718 	if (sc->tq)
719 		taskqueue_drain(sc->tq, &sc->slow_intr_task);
720 	for (i = 0; i < sc->params.nports; i++)
721 		if (sc->port[i].tq != NULL)
722 			taskqueue_drain(sc->port[i].tq, &sc->port[i].timer_reclaim_task);
723 }
724 
725 /**
726  *	refill_rspq - replenish an SGE response queue
727  *	@adapter: the adapter
728  *	@q: the response queue to replenish
729  *	@credits: how many new responses to make available
730  *
731  *	Replenishes a response queue by making the supplied number of responses
732  *	available to HW.
733  */
734 static __inline void
735 refill_rspq(adapter_t *sc, const struct sge_rspq *q, u_int credits)
736 {
737 
738 	/* mbufs are allocated on demand when a rspq entry is processed. */
739 	t3_write_reg(sc, A_SG_RSPQ_CREDIT_RETURN,
740 		     V_RSPQ(q->cntxt_id) | V_CREDITS(credits));
741 }
742 
743 
744 static void
745 sge_timer_reclaim(void *arg, int ncount)
746 {
747 	struct port_info *p = arg;
748 	int i, nqsets = p->nqsets;
749 	adapter_t *sc = p->adapter;
750 	struct sge_qset *qs;
751 	struct sge_txq *txq;
752 	struct mtx *lock;
753 	struct mbuf *m_vec[TX_CLEAN_MAX_DESC];
754 	int j, n, reclaimable;
755 
756 	for (i = 0; i < nqsets; i++) {
757 		qs = &sc->sge.qs[i];
758 		txq = &qs->txq[TXQ_ETH];
759 		reclaimable = desc_reclaimable(txq);
760 		if (reclaimable > 0) {
761 			mtx_lock(&txq->lock);
762 			n = reclaim_completed_tx(sc, txq, TX_CLEAN_MAX_DESC, m_vec);
763 			mtx_unlock(&txq->lock);
764 
765 			for (j = 0; j < n; j++)
766 				m_freem_vec(m_vec[j]);
767 
768 			if (p->ifp->if_drv_flags & IFF_DRV_OACTIVE &&
769 			    txq->size - txq->in_use >= TX_START_MAX_DESC) {
770 				p->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
771 				taskqueue_enqueue(p->tq, &p->start_task);
772 			}
773 		}
774 
775 		txq = &qs->txq[TXQ_OFLD];
776 		reclaimable = desc_reclaimable(txq);
777 		if (reclaimable > 0) {
778 			mtx_lock(&txq->lock);
779 			n = reclaim_completed_tx(sc, txq, TX_CLEAN_MAX_DESC, m_vec);
780 			mtx_unlock(&txq->lock);
781 
782 			for (j = 0; j < n; j++)
783 				m_freem_vec(m_vec[j]);
784 		}
785 
786 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
787 			    &sc->sge.qs[0].rspq.lock;
788 
789 		if (mtx_trylock(lock)) {
790 			/* XXX currently assume that we are *NOT* polling */
791 			uint32_t status = t3_read_reg(sc, A_SG_RSPQ_FL_STATUS);
792 
793 			if (qs->fl[0].credits < qs->fl[0].size - 16)
794 				__refill_fl(sc, &qs->fl[0]);
795 			if (qs->fl[1].credits < qs->fl[1].size - 16)
796 				__refill_fl(sc, &qs->fl[1]);
797 
798 			if (status & (1 << qs->rspq.cntxt_id)) {
799 				if (qs->rspq.credits) {
800 					refill_rspq(sc, &qs->rspq, 1);
801 					qs->rspq.credits--;
802 					t3_write_reg(sc, A_SG_RSPQ_FL_STATUS,
803 					    1 << qs->rspq.cntxt_id);
804 				}
805 			}
806 			mtx_unlock(lock);
807 		}
808 	}
809 }
810 
811 /**
812  *	init_qset_cntxt - initialize an SGE queue set context info
813  *	@qs: the queue set
814  *	@id: the queue set id
815  *
816  *	Initializes the TIDs and context ids for the queues of a queue set.
817  */
818 static void
819 init_qset_cntxt(struct sge_qset *qs, u_int id)
820 {
821 
822 	qs->rspq.cntxt_id = id;
823 	qs->fl[0].cntxt_id = 2 * id;
824 	qs->fl[1].cntxt_id = 2 * id + 1;
825 	qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id;
826 	qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id;
827 	qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id;
828 	qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id;
829 	qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id;
830 }
831 
832 
833 static void
834 txq_prod(struct sge_txq *txq, unsigned int ndesc, struct txq_state *txqs)
835 {
836 	txq->in_use += ndesc;
837 	/*
838 	 * XXX we don't handle stopping of queue
839 	 * presumably start handles this when we bump against the end
840 	 */
841 	txqs->gen = txq->gen;
842 	txq->unacked += ndesc;
843 	txqs->compl = (txq->unacked & 8) << (S_WR_COMPL - 3);
844 	txq->unacked &= 7;
845 	txqs->pidx = txq->pidx;
846 	txq->pidx += ndesc;
847 
848 	if (txq->pidx >= txq->size) {
849 		txq->pidx -= txq->size;
850 		txq->gen ^= 1;
851 	}
852 
853 }
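/*
 * Completion-request bookkeeping for txq_prod() above: txq->unacked
 * accumulates issued descriptors, and once bit 3 becomes set it is shifted
 * into place as the WR completion flag in txqs->compl and the counter is
 * masked back down to 3 bits, so a completion is requested roughly once
 * every 8 descriptors.
 */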
854 
855 /**
856  *	calc_tx_descs - calculate the number of Tx descriptors for a packet
857  *	@m: the packet mbufs
858  *      @nsegs: the number of segments
859  *
860  * 	Returns the number of Tx descriptors needed for the given Ethernet
861  * 	packet.  Ethernet packets require addition of WR and CPL headers.
862  */
863 static __inline unsigned int
864 calc_tx_descs(const struct mbuf *m, int nsegs)
865 {
866 	unsigned int flits;
867 
868 	if (m->m_pkthdr.len <= WR_LEN - sizeof(struct cpl_tx_pkt))
869 		return 1;
870 
871 	flits = sgl_len(nsegs) + 2;
872 #ifdef TSO_SUPPORTED
873 	if  (m->m_pkthdr.csum_flags & (CSUM_TSO))
874 		flits++;
875 #endif
876 	return flits_to_desc(flits);
877 }
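/*
 * Example (illustrative): a non-TSO packet mapped to 3 DMA segments needs
 * sgl_len(3) + 2 = 7 flits (SGL plus the WR and CPL_TX_PKT headers), which
 * flits_to_desc() maps to a single Tx descriptor.
 */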
878 
879 static unsigned int
880 busdma_map_mbufs(struct mbuf **m, struct sge_txq *txq,
881     struct tx_sw_desc *stx, bus_dma_segment_t *segs, int *nsegs)
882 {
883 	struct mbuf *m0;
884 	int err, pktlen;
885 
886 	m0 = *m;
887 	pktlen = m0->m_pkthdr.len;
888 
889 	err = bus_dmamap_load_mvec_sg(txq->entry_tag, stx->map, m0, segs, nsegs, 0);
890 #ifdef DEBUG
891 	if (err) {
892 		int n = 0;
893 		struct mbuf *mtmp = m0;
894 		while(mtmp) {
895 			n++;
896 			mtmp = mtmp->m_next;
897 		}
898 		printf("map_mbufs: bus_dmamap_load_mbuf_sg failed with %d - pkthdr.len==%d nmbufs=%d\n",
899 		    err, m0->m_pkthdr.len, n);
900 	}
901 #endif
902 	if (err == EFBIG) {
903 		/* Too many segments, try to defrag */
904 		m0 = m_defrag(m0, M_NOWAIT);
905 		if (m0 == NULL) {
906 			m_freem(*m);
907 			*m = NULL;
908 			return (ENOBUFS);
909 		}
910 		*m = m0;
911 		err = bus_dmamap_load_mbuf_sg(txq->entry_tag, stx->map, m0, segs, nsegs, 0);
912 	}
913 
914 	if (err == ENOMEM) {
915 		return (err);
916 	}
917 
918 	if (err) {
919 		if (cxgb_debug)
920 			printf("map failure err=%d pktlen=%d\n", err, pktlen);
921 		m_freem_vec(m0);
922 		*m = NULL;
923 		return (err);
924 	}
925 
926 	bus_dmamap_sync(txq->entry_tag, stx->map, BUS_DMASYNC_PREWRITE);
927 	stx->flags |= TX_SW_DESC_MAPPED;
928 
929 	return (0);
930 }
931 
932 /**
933  *	make_sgl - populate a scatter/gather list for a packet
934  *	@sgp: the SGL to populate
935  *	@segs: the packet dma segments
936  *	@nsegs: the number of segments
937  *
938  *	Generates a scatter/gather list for the buffers that make up a packet
939  *	and returns the SGL size in 8-byte words.  The caller must size the SGL
940  *	appropriately.
941  */
942 static __inline void
943 make_sgl(struct sg_ent *sgp, bus_dma_segment_t *segs, int nsegs)
944 {
945 	int i, idx;
946 
947 	for (idx = 0, i = 0; i < nsegs; i++, idx ^= 1) {
948 		if (i && idx == 0)
949 			++sgp;
950 
951 		sgp->len[idx] = htobe32(segs[i].ds_len);
952 		sgp->addr[idx] = htobe64(segs[i].ds_addr);
953 	}
954 
955 	if (idx)
956 		sgp->len[idx] = 0;
957 }
958 
959 /**
960  *	check_ring_tx_db - check and potentially ring a Tx queue's doorbell
961  *	@adap: the adapter
962  *	@q: the Tx queue
963  *
964  *	Ring the doorbell if a Tx queue is asleep.  There is a natural race
965  *	where the HW may go to sleep just after we check; in that case the
966  *	interrupt handler will detect the outstanding Tx packet and ring the
967  *	doorbell for us.
968  *
969  *	When GTS is disabled we unconditionally ring the doorbell.
970  */
971 static __inline void
972 check_ring_tx_db(adapter_t *adap, struct sge_txq *q)
973 {
974 #if USE_GTS
975 	clear_bit(TXQ_LAST_PKT_DB, &q->flags);
976 	if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) {
977 		set_bit(TXQ_LAST_PKT_DB, &q->flags);
978 #ifdef T3_TRACE
979 		T3_TRACE1(adap->tb[q->cntxt_id & 7], "doorbell Tx, cntxt %d",
980 			  q->cntxt_id);
981 #endif
982 		t3_write_reg(adap, A_SG_KDOORBELL,
983 			     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
984 	}
985 #else
986 	wmb();            /* write descriptors before telling HW */
987 	t3_write_reg(adap, A_SG_KDOORBELL,
988 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
989 #endif
990 }
991 
992 static __inline void
993 wr_gen2(struct tx_desc *d, unsigned int gen)
994 {
995 #if SGE_NUM_GENBITS == 2
996 	d->flit[TX_DESC_FLITS - 1] = htobe64(gen);
997 #endif
998 }
999 
1000 
1001 
1002 /**
1003  *	write_wr_hdr_sgl - write a WR header and, optionally, SGL
1004  *	@ndesc: number of Tx descriptors spanned by the SGL
1005  *	@txd: first Tx descriptor to be written
1006  *	@txqs: txq state (generation and producer index)
1007  *	@txq: the SGE Tx queue
1008  *	@sgl: the SGL
1009  *	@flits: number of flits to the start of the SGL in the first descriptor
1010  *	@sgl_flits: the SGL size in flits
1011  *	@wr_hi: top 32 bits of WR header based on WR type (big endian)
1012  *	@wr_lo: low 32 bits of WR header based on WR type (big endian)
1013  *
1014  *	Write a work request header and an associated SGL.  If the SGL is
1015  *	small enough to fit into one Tx descriptor it has already been written
1016  *	and we just need to write the WR header.  Otherwise we distribute the
1017  *	SGL across the number of descriptors it spans.
1018  */
1019 
1020 static void
1021 write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs,
1022     const struct sge_txq *txq, const struct sg_ent *sgl, unsigned int flits,
1023     unsigned int sgl_flits, unsigned int wr_hi, unsigned int wr_lo)
1024 {
1025 
1026 	struct work_request_hdr *wrp = (struct work_request_hdr *)txd;
1027 	struct tx_sw_desc *txsd = &txq->sdesc[txqs->pidx];
1028 
1029 	if (__predict_true(ndesc == 1)) {
1030 		wrp->wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1031 		    V_WR_SGLSFLT(flits)) | wr_hi;
1032 		wmb();
1033 		wrp->wr_lo = htonl(V_WR_LEN(flits + sgl_flits) |
1034 		    V_WR_GEN(txqs->gen)) | wr_lo;
1035 		/* XXX gen? */
1036 		wr_gen2(txd, txqs->gen);
1037 	} else {
1038 		unsigned int ogen = txqs->gen;
1039 		const uint64_t *fp = (const uint64_t *)sgl;
1040 		struct work_request_hdr *wp = wrp;
1041 
1042 		wrp->wr_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) |
1043 		    V_WR_SGLSFLT(flits)) | wr_hi;
1044 
1045 		while (sgl_flits) {
1046 			unsigned int avail = WR_FLITS - flits;
1047 
1048 			if (avail > sgl_flits)
1049 				avail = sgl_flits;
1050 			memcpy(&txd->flit[flits], fp, avail * sizeof(*fp));
1051 			sgl_flits -= avail;
1052 			ndesc--;
1053 			if (!sgl_flits)
1054 				break;
1055 
1056 			fp += avail;
1057 			txd++;
1058 			txsd++;
1059 			if (++txqs->pidx == txq->size) {
1060 				txqs->pidx = 0;
1061 				txqs->gen ^= 1;
1062 				txd = txq->desc;
1063 				txsd = txq->sdesc;
1064 			}
1065 
1066 			/*
1067 			 * when the head of the mbuf chain
1068 			 * is freed all clusters will be freed
1069 			 * with it
1070 			 */
1071 			txsd->m = NULL;
1072 			wrp = (struct work_request_hdr *)txd;
1073 			wrp->wr_hi = htonl(V_WR_DATATYPE(1) |
1074 			    V_WR_SGLSFLT(1)) | wr_hi;
1075 			wrp->wr_lo = htonl(V_WR_LEN(min(WR_FLITS,
1076 				    sgl_flits + 1)) |
1077 			    V_WR_GEN(txqs->gen)) | wr_lo;
1078 			wr_gen2(txd, txqs->gen);
1079 			flits = 1;
1080 		}
1081 		wrp->wr_hi |= htonl(F_WR_EOP);
1082 		wmb();
1083 		wp->wr_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo;
1084 		wr_gen2((struct tx_desc *)wp, ogen);
1085 	}
1086 }
1087 
1088 
1089 /* sizeof(*eh) + sizeof(*vhdr) + sizeof(*ip) + sizeof(*tcp) */
1090 #define TCPPKTHDRSIZE (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + 20 + 20)
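/*
 * Note that both TCPPKTHDRSIZE and the LSO header parsing in t3_encap()
 * assume minimal 20-byte IP and TCP headers: the TCP header is located at a
 * fixed sizeof(struct ip) offset, so IP options are not accounted for.
 */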
1091 
1092 int
1093 t3_encap(struct port_info *p, struct mbuf **m)
1094 {
1095 	adapter_t *sc;
1096 	struct mbuf *m0;
1097 	struct sge_qset *qs;
1098 	struct sge_txq *txq;
1099 	struct tx_sw_desc *stx;
1100 	struct txq_state txqs;
1101 	unsigned int nsegs, ndesc, flits, cntrl, mlen;
1102 	int err, tso_info = 0;
1103 
1104 	struct work_request_hdr *wrp;
1105 	struct tx_sw_desc *txsd;
1106 	struct sg_ent *sgp, sgl[TX_MAX_SEGS / 2 + 1];
1107 	bus_dma_segment_t segs[TX_MAX_SEGS];
1108 	uint32_t wr_hi, wr_lo, sgl_flits;
1109 
1110 	struct tx_desc *txd;
1111 	struct cpl_tx_pkt *cpl;
1112 
1113 	DPRINTF("t3_encap ");
1114 	m0 = *m;
1115 	sc = p->adapter;
1116 	qs = &sc->sge.qs[p->first_qset];
1117 	txq = &qs->txq[TXQ_ETH];
1118 	stx = &txq->sdesc[txq->pidx];
1119 	txd = &txq->desc[txq->pidx];
1120 	cpl = (struct cpl_tx_pkt *)txd;
1121 	mlen = m0->m_pkthdr.len;
1122 	cpl->len = htonl(mlen | 0x80000000);
1123 
1124 	DPRINTF("mlen=%d\n", mlen);
1125 	/*
1126 	 * XXX handle checksum, TSO, and VLAN here
1127 	 *
1128 	 */
1129 	cntrl = V_TXPKT_INTF(p->port);
1130 
1131 	/*
1132 	 * XXX need to add VLAN support for 6.x
1133 	 */
1134 #ifdef VLAN_SUPPORTED
1135 	if (m0->m_flags & M_VLANTAG)
1136 		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
1137 	if  (m0->m_pkthdr.csum_flags & (CSUM_TSO))
1138 		tso_info = V_LSO_MSS(m0->m_pkthdr.tso_segsz);
1139 #endif
1140 	if (tso_info) {
1141 		int eth_type;
1142 		struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *) cpl;
1143 		struct ip *ip;
1144 		struct tcphdr *tcp;
1145 		uint8_t *pkthdr, tmp[TCPPKTHDRSIZE]; /* is this too large for the stack? */
1146 
1147 		txd->flit[2] = 0;
1148 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO);
1149 		hdr->cntrl = htonl(cntrl);
1150 
1151 		if (__predict_false(m0->m_len < TCPPKTHDRSIZE)) {
1152 			pkthdr = &tmp[0];
1153 			m_copydata(m0, 0, TCPPKTHDRSIZE, pkthdr);
1154 		} else {
1155 			pkthdr = mtod(m0, uint8_t *);
1156 		}
1157 
1158 		if (__predict_false(m0->m_flags & M_VLANTAG)) {
1159 			eth_type = CPL_ETH_II_VLAN;
1160 			ip = (struct ip *)(pkthdr + ETHER_HDR_LEN +
1161 			    ETHER_VLAN_ENCAP_LEN);
1162 		} else {
1163 			eth_type = CPL_ETH_II;
1164 			ip = (struct ip *)(pkthdr + ETHER_HDR_LEN);
1165 		}
1166 		tcp = (struct tcphdr *)((uint8_t *)ip +
1167 		    sizeof(*ip));
1168 
1169 		tso_info |= V_LSO_ETH_TYPE(eth_type) |
1170 			    V_LSO_IPHDR_WORDS(ip->ip_hl) |
1171 			    V_LSO_TCPHDR_WORDS(tcp->th_off);
1172 		hdr->lso_info = htonl(tso_info);
1173 		flits = 3;
1174 	} else {
1175 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1176 		cpl->cntrl = htonl(cntrl);
1177 
1178 		if (mlen <= WR_LEN - sizeof(*cpl)) {
1179 			txq_prod(txq, 1, &txqs);
1180 			txq->sdesc[txqs.pidx].m = m0;
1181 			m_set_priority(m0, txqs.pidx);
1182 
1183 			if (m0->m_len == m0->m_pkthdr.len)
1184 				memcpy(&txd->flit[2], mtod(m0, uint8_t *), mlen);
1185 			else
1186 				m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[2]);
1187 
1188 			flits = (mlen + 7) / 8 + 2;
1189 			cpl->wr.wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1190 					  V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1191 					  F_WR_SOP | F_WR_EOP | txqs.compl);
1192 			wmb();
1193 			cpl->wr.wr_lo = htonl(V_WR_LEN(flits) |
1194 			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1195 
1196 			wr_gen2(txd, txqs.gen);
1197 			check_ring_tx_db(sc, txq);
1198 			return (0);
1199 		}
1200 		flits = 2;
1201 	}
1202 
1203 	wrp = (struct work_request_hdr *)txd;
1204 
1205 	if ((err = busdma_map_mbufs(m, txq, stx, segs, &nsegs)) != 0) {
1206 		return (err);
1207 	}
1208 	m0 = *m;
1209 	ndesc = calc_tx_descs(m0, nsegs);
1210 
1211 	sgp = (ndesc == 1) ? (struct sg_ent *)&txd->flit[flits] : sgl;
1212 	make_sgl(sgp, segs, nsegs);
1213 
1214 	sgl_flits = sgl_len(nsegs);
1215 
1216 	DPRINTF("make_sgl success nsegs==%d ndesc==%d\n", nsegs, ndesc);
1217 	txq_prod(txq, ndesc, &txqs);
1218 	txsd = &txq->sdesc[txqs.pidx];
1219 	wr_hi = htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1220 	wr_lo = htonl(V_WR_TID(txq->token));
1221 	txsd->m = m0;
1222 	m_set_priority(m0, txqs.pidx);
1223 
1224 	write_wr_hdr_sgl(ndesc, txd, &txqs, txq, sgl, flits, sgl_flits, wr_hi, wr_lo);
1225 	check_ring_tx_db(p->adapter, txq);
1226 
1227 	return (0);
1228 }
1229 
1230 
1231 /**
1232  *	write_imm - write a packet into a Tx descriptor as immediate data
1233  *	@d: the Tx descriptor to write
1234  *	@m: the packet
1235  *	@len: the length of packet data to write as immediate data
1236  *	@gen: the generation bit value to write
1237  *
1238  *	Writes a packet as immediate data into a Tx descriptor.  The packet
1239  *	contains a work request at its beginning.  We must write the packet
1240  *	carefully so the SGE doesn't read it before it has been written in
1241  *	its entirety.
1242  */
1243 static __inline void
1244 write_imm(struct tx_desc *d, struct mbuf *m,
1245 	  unsigned int len, unsigned int gen)
1246 {
1247 	struct work_request_hdr *from = mtod(m, struct work_request_hdr *);
1248 	struct work_request_hdr *to = (struct work_request_hdr *)d;
1249 
1250 	memcpy(&to[1], &from[1], len - sizeof(*from));
1251 	to->wr_hi = from->wr_hi | htonl(F_WR_SOP | F_WR_EOP |
1252 					V_WR_BCNTLFLT(len & 7));
1253 	wmb();
1254 	to->wr_lo = from->wr_lo | htonl(V_WR_GEN(gen) |
1255 					V_WR_LEN((len + 7) / 8));
1256 	wr_gen2(d, gen);
1257 	m_freem(m);
1258 }
1259 
1260 /**
1261  *	check_desc_avail - check descriptor availability on a send queue
1262  *	@adap: the adapter
1263  *	@q: the TX queue
1264  *	@m: the packet needing the descriptors
1265  *	@ndesc: the number of Tx descriptors needed
1266  *	@qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL)
1267  *
1268  *	Checks if the requested number of Tx descriptors is available on an
1269  *	SGE send queue.  If the queue is already suspended or not enough
1270  *	descriptors are available the packet is queued for later transmission.
1271  *	Must be called with the Tx queue locked.
1272  *
1273  *	Returns 0 if enough descriptors are available, 1 if there aren't
1274  *	enough descriptors and the packet has been queued, and 2 if the caller
1275  *	needs to retry because there weren't enough descriptors at the
1276  *	beginning of the call but some freed up in the mean time.
1277  */
1278 static __inline int
1279 check_desc_avail(adapter_t *adap, struct sge_txq *q,
1280 		 struct mbuf *m, unsigned int ndesc,
1281 		 unsigned int qid)
1282 {
1283 	/*
1284 	 * XXX We currently only use this for checking the control queue
1285 	 * the control queue is only used for binding qsets which happens
1286 	 * at init time so we are guaranteed enough descriptors
1287 	 */
1288 	if (__predict_false(!mbufq_empty(&q->sendq))) {
1289 addq_exit:	mbufq_tail(&q->sendq, m);
1290 		return 1;
1291 	}
1292 	if (__predict_false(q->size - q->in_use < ndesc)) {
1293 
1294 		struct sge_qset *qs = txq_to_qset(q, qid);
1295 
1296 		setbit(&qs->txq_stopped, qid);
1297 		smp_mb();
1298 
1299 		if (should_restart_tx(q) &&
1300 		    test_and_clear_bit(qid, &qs->txq_stopped))
1301 			return 2;
1302 
1303 		q->stops++;
1304 		goto addq_exit;
1305 	}
1306 	return 0;
1307 }
1308 
1309 
1310 /**
1311  *	reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
1312  *	@q: the SGE control Tx queue
1313  *
1314  *	This is a variant of reclaim_completed_tx() that is used for Tx queues
1315  *	that send only immediate data (presently just the control queues) and
1316  *	thus do not have any mbufs.
1317  */
1318 static __inline void
1319 reclaim_completed_tx_imm(struct sge_txq *q)
1320 {
1321 	unsigned int reclaim = q->processed - q->cleaned;
1322 
1323 	mtx_assert(&q->lock, MA_OWNED);
1324 
1325 	q->in_use -= reclaim;
1326 	q->cleaned += reclaim;
1327 }
1328 
1329 static __inline int
1330 immediate(const struct mbuf *m)
1331 {
1332 	return m->m_len <= WR_LEN && m->m_pkthdr.len <= WR_LEN;
1333 }
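/*
 * A packet qualifies as immediate data when both its first mbuf and its
 * total length fit within a single work request (WR_FLITS * 8 bytes);
 * write_imm() then copies it directly into the Tx descriptor.
 */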
1334 
1335 /**
1336  *	ctrl_xmit - send a packet through an SGE control Tx queue
1337  *	@adap: the adapter
1338  *	@q: the control queue
1339  *	@m: the packet
1340  *
1341  *	Send a packet through an SGE control Tx queue.  Packets sent through
1342  *	a control queue must fit entirely as immediate data in a single Tx
1343  *	descriptor and have no page fragments.
1344  */
1345 static int
1346 ctrl_xmit(adapter_t *adap, struct sge_txq *q, struct mbuf *m)
1347 {
1348 	int ret;
1349 	struct work_request_hdr *wrp = mtod(m, struct work_request_hdr *);
1350 
1351 	if (__predict_false(!immediate(m))) {
1352 		m_freem(m);
1353 		return 0;
1354 	}
1355 
1356 	wrp->wr_hi |= htonl(F_WR_SOP | F_WR_EOP);
1357 	wrp->wr_lo = htonl(V_WR_TID(q->token));
1358 
1359 	mtx_lock(&q->lock);
1360 again:	reclaim_completed_tx_imm(q);
1361 
1362 	ret = check_desc_avail(adap, q, m, 1, TXQ_CTRL);
1363 	if (__predict_false(ret)) {
1364 		if (ret == 1) {
1365 			mtx_unlock(&q->lock);
1366 			return (-1);
1367 		}
1368 		goto again;
1369 	}
1370 
1371 	write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
1372 
1373 	q->in_use++;
1374 	if (++q->pidx >= q->size) {
1375 		q->pidx = 0;
1376 		q->gen ^= 1;
1377 	}
1378 	mtx_unlock(&q->lock);
1379 	wmb();
1380 	t3_write_reg(adap, A_SG_KDOORBELL,
1381 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1382 	return (0);
1383 }
1384 
1385 
1386 /**
1387  *	restart_ctrlq - restart a suspended control queue
1388  *	@qs: the queue set containing the control queue
1389  *
1390  *	Resumes transmission on a suspended Tx control queue.
1391  */
1392 static void
1393 restart_ctrlq(void *data, int npending)
1394 {
1395 	struct mbuf *m;
1396 	struct sge_qset *qs = (struct sge_qset *)data;
1397 	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1398 	adapter_t *adap = qs->port->adapter;
1399 
1400 	mtx_lock(&q->lock);
1401 again:	reclaim_completed_tx_imm(q);
1402 
1403 	while (q->in_use < q->size &&
1404 	       (m = mbufq_dequeue(&q->sendq)) != NULL) {
1405 
1406 		write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
1407 
1408 		if (++q->pidx >= q->size) {
1409 			q->pidx = 0;
1410 			q->gen ^= 1;
1411 		}
1412 		q->in_use++;
1413 	}
1414 	if (!mbufq_empty(&q->sendq)) {
1415 		setbit(&qs->txq_stopped, TXQ_CTRL);
1416 		smp_mb();
1417 
1418 		if (should_restart_tx(q) &&
1419 		    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped))
1420 			goto again;
1421 		q->stops++;
1422 	}
1423 	mtx_unlock(&q->lock);
1424 	t3_write_reg(adap, A_SG_KDOORBELL,
1425 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1426 }
1427 
1428 
1429 /*
1430  * Send a management message through control queue 0
1431  */
1432 int
1433 t3_mgmt_tx(struct adapter *adap, struct mbuf *m)
1434 {
1435 	return ctrl_xmit(adap, &adap->sge.qs[0].txq[TXQ_CTRL], m);
1436 }
1437 
1438 /**
1439  *	free_qset - free the resources of an SGE queue set
1440  *	@sc: the controller owning the queue set
1441  *	@q: the queue set
1442  *
1443  *	Release the HW and SW resources associated with an SGE queue set, such
1444  *	as HW contexts, packet buffers, and descriptor rings.  Traffic to the
1445  *	queue set must be quiesced prior to calling this.
1446  */
1447 static void
1448 t3_free_qset(adapter_t *sc, struct sge_qset *q)
1449 {
1450 	int i;
1451 
1452 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
1453 		if (q->fl[i].desc) {
1454 			mtx_lock(&sc->sge.reg_lock);
1455 			t3_sge_disable_fl(sc, q->fl[i].cntxt_id);
1456 			mtx_unlock(&sc->sge.reg_lock);
1457 			bus_dmamap_unload(q->fl[i].desc_tag, q->fl[i].desc_map);
1458 			bus_dmamem_free(q->fl[i].desc_tag, q->fl[i].desc,
1459 					q->fl[i].desc_map);
1460 			bus_dma_tag_destroy(q->fl[i].desc_tag);
1461 			bus_dma_tag_destroy(q->fl[i].entry_tag);
1462 		}
1463 		if (q->fl[i].sdesc) {
1464 			free_rx_bufs(sc, &q->fl[i]);
1465 			free(q->fl[i].sdesc, M_DEVBUF);
1466 		}
1467 	}
1468 
1469 	for (i = 0; i < SGE_TXQ_PER_SET; ++i) {
1470 		if (q->txq[i].desc) {
1471 			mtx_lock(&sc->sge.reg_lock);
1472 			t3_sge_enable_ecntxt(sc, q->txq[i].cntxt_id, 0);
1473 			mtx_unlock(&sc->sge.reg_lock);
1474 			bus_dmamap_unload(q->txq[i].desc_tag,
1475 					q->txq[i].desc_map);
1476 			bus_dmamem_free(q->txq[i].desc_tag, q->txq[i].desc,
1477 					q->txq[i].desc_map);
1478 			bus_dma_tag_destroy(q->txq[i].desc_tag);
1479 			bus_dma_tag_destroy(q->txq[i].entry_tag);
1480 		}
1481 		if (q->txq[i].sdesc) {
1482 			free(q->txq[i].sdesc, M_DEVBUF);
1483 		}
1484 		if (mtx_initialized(&q->txq[i].lock)) {
1485 			mtx_destroy(&q->txq[i].lock);
1486 		}
1487 	}
1488 
1489 	if (q->rspq.desc) {
1490 		mtx_lock(&sc->sge.reg_lock);
1491 		t3_sge_disable_rspcntxt(sc, q->rspq.cntxt_id);
1492 		mtx_unlock(&sc->sge.reg_lock);
1493 
1494 		bus_dmamap_unload(q->rspq.desc_tag, q->rspq.desc_map);
1495 		bus_dmamem_free(q->rspq.desc_tag, q->rspq.desc,
1496 			        q->rspq.desc_map);
1497 		bus_dma_tag_destroy(q->rspq.desc_tag);
1498 	}
1499 
1500 	if (mtx_initialized(&q->rspq.lock))
1501 		mtx_destroy(&q->rspq.lock);
1502 
1503 	bzero(q, sizeof(*q));
1504 }
1505 
1506 /**
1507  *	t3_free_sge_resources - free SGE resources
1508  *	@sc: the adapter softc
1509  *
1510  *	Frees resources used by the SGE queue sets.
1511  */
1512 void
1513 t3_free_sge_resources(adapter_t *sc)
1514 {
1515 	int i;
1516 
1517 	for (i = 0; i < SGE_QSETS; ++i)
1518 		t3_free_qset(sc, &sc->sge.qs[i]);
1519 }
1520 
1521 /**
1522  *	t3_sge_start - enable SGE
1523  *	@sc: the controller softc
1524  *
1525  *	Enables the SGE for DMAs.  This is the last step in starting packet
1526  *	transfers.
1527  */
1528 void
1529 t3_sge_start(adapter_t *sc)
1530 {
1531 	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE);
1532 }
1533 
1534 /**
1535  *	t3_sge_stop - disable SGE operation
1536  *	@sc: the adapter
1537  *
1538  *	Disables the DMA engine.  This can be called in emergencies (e.g.,
1539  *	from error interrupts) or from normal process context.  In the latter
1540  *	case it also disables any pending queue restart tasklets.  Note that
1541  *	if it is called in interrupt context it cannot disable the restart
1542  *	tasklets as it cannot wait, however the tasklets will have no effect
1543  *	since the doorbells are disabled and the driver will call this again
1544  *	later from process context, at which time the tasklets will be stopped
1545  *	if they are still running.
1546  */
1547 void
1548 t3_sge_stop(adapter_t *sc)
1549 {
1550 	int i;
1551 	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, 0);
1552 
1553 	if (sc->tq == NULL)
1554 		return;
1555 
1556 	for (i = 0; i < SGE_QSETS; ++i) {
1557 		struct sge_qset *qs = &sc->sge.qs[i];
1558 
1559 		taskqueue_drain(sc->tq, &qs->txq[TXQ_OFLD].qresume_tsk);
1560 		taskqueue_drain(sc->tq, &qs->txq[TXQ_CTRL].qresume_tsk);
1561 	}
1562 }
1563 
1564 
1565 /**
1566  *	free_tx_desc - reclaims Tx descriptors and their buffers
1567  *	@adapter: the adapter
1568  *	@q: the Tx queue to reclaim descriptors from
1569  *	@n: the number of descriptors to reclaim
1570  *
1571  *	Reclaims Tx descriptors from an SGE Tx queue and collects the associated
1572  *	mbufs into @m_vec for the caller to free.  Called with the Tx queue lock held.
1573  */
1574 int
1575 free_tx_desc(adapter_t *sc, struct sge_txq *q, int n, struct mbuf **m_vec)
1576 {
1577 	struct tx_sw_desc *d;
1578 	unsigned int cidx = q->cidx;
1579 	int nbufs = 0;
1580 
1581 #ifdef T3_TRACE
1582 	T3_TRACE2(sc->tb[q->cntxt_id & 7],
1583 		  "reclaiming %u Tx descriptors at cidx %u", n, cidx);
1584 #endif
1585 	d = &q->sdesc[cidx];
1586 
1587 	while (n-- > 0) {
1588 		DPRINTF("cidx=%d d=%p\n", cidx, d);
1589 		if (d->m) {
1590 			if (d->flags & TX_SW_DESC_MAPPED) {
1591 				bus_dmamap_unload(q->entry_tag, d->map);
1592 				bus_dmamap_destroy(q->entry_tag, d->map);
1593 				d->flags &= ~TX_SW_DESC_MAPPED;
1594 			}
1595 			if (m_get_priority(d->m) == cidx) {
1596 				m_vec[nbufs] = d->m;
1597 				d->m = NULL;
1598 				nbufs++;
1599 			} else {
1600 				printf("pri=%d cidx=%d\n", (int)m_get_priority(d->m), cidx);
1601 			}
1602 		}
1603 		++d;
1604 		if (++cidx == q->size) {
1605 			cidx = 0;
1606 			d = q->sdesc;
1607 		}
1608 	}
1609 	q->cidx = cidx;
1610 
1611 	return (nbufs);
1612 }
1613 
1614 /**
1615  *	is_new_response - check if a response is newly written
1616  *	@r: the response descriptor
1617  *	@q: the response queue
1618  *
1619  *	Returns true if a response descriptor contains a yet unprocessed
1620  *	response.
1621  */
1622 static __inline int
1623 is_new_response(const struct rsp_desc *r,
1624     const struct sge_rspq *q)
1625 {
1626 	return (r->intr_gen & F_RSPD_GEN2) == q->gen;
1627 }
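/*
 * The response queue uses the same generation-bit scheme as the free lists
 * above: the gen bit written by the hardware flips on every ring wrap-around,
 * so a descriptor whose F_RSPD_GEN2 value matches q->gen was written after
 * the driver last passed this slot.
 */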
1628 
1629 #define RSPD_GTS_MASK  (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS)
1630 #define RSPD_CTRL_MASK (RSPD_GTS_MASK | \
1631 			V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \
1632 			V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \
1633 			V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR))
1634 
1635 /* How long to delay the next interrupt in case of memory shortage, in 0.1us. */
1636 #define NOMEM_INTR_DELAY 2500
1637 
1638 /**
1639  *	write_ofld_wr - write an offload work request
1640  *	@adap: the adapter
1641  *	@m: the packet to send
1642  *	@q: the Tx queue
1643  *	@pidx: index of the first Tx descriptor to write
1644  *	@gen: the generation value to use
1645  *	@ndesc: number of descriptors the packet will occupy
1646  *
1647  *	Write an offload work request to send the supplied packet.  The packet
1648  *	data already carry the work request with most fields populated.
1649  */
1650 static void
1651 write_ofld_wr(adapter_t *adap, struct mbuf *m,
1652     struct sge_txq *q, unsigned int pidx,
1653     unsigned int gen, unsigned int ndesc,
1654     bus_dma_segment_t *segs, unsigned int nsegs)
1655 {
1656 	unsigned int sgl_flits, flits;
1657 	struct work_request_hdr *from;
1658 	struct sg_ent *sgp, sgl[TX_MAX_SEGS / 2 + 1];
1659 	struct tx_desc *d = &q->desc[pidx];
1660 	struct txq_state txqs;
1661 
1662 	if (immediate(m)) {
1663 		q->sdesc[pidx].m = NULL;
1664 		write_imm(d, m, m->m_len, gen);
1665 		return;
1666 	}
1667 
1668 	/* Only TX_DATA builds SGLs */
1669 
1670 	from = mtod(m, struct work_request_hdr *);
1671 	memcpy(&d->flit[1], &from[1],
1672 	    (uint8_t *)m->m_pkthdr.header - mtod(m, uint8_t *) - sizeof(*from));
1673 
1674 	flits = ((uint8_t *)m->m_pkthdr.header - mtod(m, uint8_t *)) / 8;
1675 	sgp = (ndesc == 1) ? (struct sg_ent *)&d->flit[flits] : sgl;
1676 
1677 	make_sgl(sgp, segs, nsegs);
1678 	sgl_flits = sgl_len(nsegs);
1679 
1680 	txqs.gen = q->gen;
1681 	txqs.pidx = q->pidx;
1682 	txqs.compl = (q->unacked & 8) << (S_WR_COMPL - 3);
1683 	write_wr_hdr_sgl(ndesc, d, &txqs, q, sgl, flits, sgl_flits,
1684 	    from->wr_hi, from->wr_lo);
1685 }
1686 
1687 /**
1688  *	calc_tx_descs_ofld - calculate # of Tx descriptors for an offload packet
1689  *	@m: the packet
1690  *
1691  * 	Returns the number of Tx descriptors needed for the given offload
1692  * 	packet.  These packets are already fully constructed.
1693  */
1694 static __inline unsigned int
1695 calc_tx_descs_ofld(struct mbuf *m, unsigned int nsegs)
1696 {
1697 	unsigned int flits, cnt = 0;
1698 
1699 
1700 	if (m->m_len <= WR_LEN)
1701 		return 1;                 /* packet fits as immediate data */
1702 
1703 	if (m->m_flags & M_IOVEC)
1704 		cnt = mtomv(m)->mv_count;
1705 
1706 	flits = ((uint8_t *)m->m_pkthdr.header - mtod(m, uint8_t *)) / 8;   /* headers */
1707 
1708 	return flits_to_desc(flits + sgl_len(cnt));
1709 }
1710 
1711 /**
1712  *	ofld_xmit - send a packet through an offload queue
1713  *	@adap: the adapter
1714  *	@q: the Tx offload queue
1715  *	@m: the packet
1716  *
1717  *	Send an offload packet through an SGE offload queue.
1718  */
1719 static int
1720 ofld_xmit(adapter_t *adap, struct sge_txq *q, struct mbuf *m)
1721 {
1722 	int ret;
1723 	unsigned int pidx, gen, nsegs;
1724 	unsigned int ndesc;
1725 	struct mbuf *m_vec[TX_CLEAN_MAX_DESC];
1726 	bus_dma_segment_t segs[TX_MAX_SEGS];
1727 	int i, cleaned;
1728 	struct tx_sw_desc *stx = &q->sdesc[q->pidx];
1729 
1730 	mtx_lock(&q->lock);
1731 	if ((ret = busdma_map_mbufs(&m, q, stx, segs, &nsegs)) != 0) {
1732 		mtx_unlock(&q->lock);
1733 		return (ret);
1734 	}
1735 	ndesc = calc_tx_descs_ofld(m, nsegs);
1736 again:	cleaned = reclaim_completed_tx(adap, q, TX_CLEAN_MAX_DESC, m_vec);
1737 
1738 	ret = check_desc_avail(adap, q, m, ndesc, TXQ_OFLD);
1739 	if (__predict_false(ret)) {
1740 		if (ret == 1) {
1741 			m_set_priority(m, ndesc);     /* save for restart */
1742 			mtx_unlock(&q->lock);
1743 			return NET_XMIT_CN;
1744 		}
1745 		goto again;
1746 	}
1747 
1748 	gen = q->gen;
1749 	q->in_use += ndesc;
1750 	pidx = q->pidx;
1751 	q->pidx += ndesc;
1752 	if (q->pidx >= q->size) {
1753 		q->pidx -= q->size;
1754 		q->gen ^= 1;
1755 	}
1756 #ifdef T3_TRACE
1757 	T3_TRACE5(adap->tb[q->cntxt_id & 7],
1758 		  "ofld_xmit: ndesc %u, pidx %u, len %u, main %u, frags %u",
1759 		  ndesc, pidx, skb->len, skb->len - skb->data_len,
1760 		  skb_shinfo(skb)->nr_frags);
1761 #endif
1762 	mtx_unlock(&q->lock);
1763 
1764 	write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
1765 	check_ring_tx_db(adap, q);
1766 
1767 	for (i = 0; i < cleaned; i++) {
1768 		m_freem_vec(m_vec[i]);
1769 	}
1770 	return NET_XMIT_SUCCESS;
1771 }
1772 
1773 /**
1774  *	restart_offloadq - restart a suspended offload queue
1775  *	@qs: the queue set containing the offload queue
1776  *
1777  *	Resumes transmission on a suspended Tx offload queue.
1778  */
1779 static void
1780 restart_offloadq(void *data, int npending)
1781 {
1782 
1783 	struct mbuf *m;
1784 	struct sge_qset *qs = data;
1785 	struct sge_txq *q = &qs->txq[TXQ_OFLD];
1786 	adapter_t *adap = qs->port->adapter;
1787 	struct mbuf *m_vec[TX_CLEAN_MAX_DESC];
1788 	bus_dma_segment_t segs[TX_MAX_SEGS];
1789 	int nsegs, i, cleaned;
1790 	struct tx_sw_desc *stx = &q->sdesc[q->pidx];
1791 
1792 	mtx_lock(&q->lock);
1793 again:	cleaned = reclaim_completed_tx(adap, q, TX_CLEAN_MAX_DESC, m_vec);
1794 
1795 	while ((m = mbufq_peek(&q->sendq)) != NULL) {
1796 		unsigned int gen, pidx;
1797 		unsigned int ndesc = m_get_priority(m);
1798 
1799 		if (__predict_false(q->size - q->in_use < ndesc)) {
1800 			setbit(&qs->txq_stopped, TXQ_OFLD);
1801 			smp_mb();
1802 
1803 			if (should_restart_tx(q) &&
1804 			    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped))
1805 				goto again;
1806 			q->stops++;
1807 			break;
1808 		}
1809 
1810 		gen = q->gen;
1811 		q->in_use += ndesc;
1812 		pidx = q->pidx;
1813 		q->pidx += ndesc;
1814 		if (q->pidx >= q->size) {
1815 			q->pidx -= q->size;
1816 			q->gen ^= 1;
1817 		}
1818 
1819 		(void)mbufq_dequeue(&q->sendq);
1820 		busdma_map_mbufs(&m, q, stx, segs, &nsegs);
1821 		mtx_unlock(&q->lock);
1822 		write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
1823 		mtx_lock(&q->lock);
1824 	}
1825 	mtx_unlock(&q->lock);
1826 
1827 #if USE_GTS
1828 	set_bit(TXQ_RUNNING, &q->flags);
1829 	set_bit(TXQ_LAST_PKT_DB, &q->flags);
1830 #endif
1831 	t3_write_reg(adap, A_SG_KDOORBELL,
1832 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1833 
1834 	for (i = 0; i < cleaned; i++) {
1835 		m_freem_vec(m_vec[i]);
1836 	}
1837 }
1838 
1839 /**
1840  *	queue_set - return the queue set a packet should use
1841  *	@m: the packet
1842  *
1843  *	Maps a packet to the SGE queue set it should use.  The desired queue
1844  *	set is carried in bits 1-3 in the packet's priority.
1845  */
1846 static __inline int
1847 queue_set(const struct mbuf *m)
1848 {
1849 	return m_get_priority(m) >> 1;
1850 }
1851 
1852 /**
1853  *	is_ctrl_pkt - return whether an offload packet is a control packet
1854  *	@m: the packet
1855  *
1856  *	Determines whether an offload packet should use an OFLD or a CTRL
1857  *	Tx queue.  This is indicated by bit 0 in the packet's priority.
1858  */
1859 static __inline int
1860 is_ctrl_pkt(const struct mbuf *m)
1861 {
1862 	return m_get_priority(m) & 1;
1863 }
1864 
1865 /**
1866  *	t3_offload_tx - send an offload packet
1867  *	@tdev: the offload device to send to
1868  *	@m: the packet
1869  *
1870  *	Sends an offload packet.  We use the packet priority to select the
1871  *	appropriate Tx queue as follows: bit 0 indicates whether the packet
1872  *	should be sent as regular or control, bits 1-3 select the queue set.
1873  */
1874 int
1875 t3_offload_tx(struct toedev *tdev, struct mbuf *m)
1876 {
1877 	adapter_t *adap = tdev2adap(tdev);
1878 	struct sge_qset *qs = &adap->sge.qs[queue_set(m)];
1879 
1880 	if (__predict_false(is_ctrl_pkt(m)))
1881 		return ctrl_xmit(adap, &qs->txq[TXQ_CTRL], m);
1882 
1883 	return ofld_xmit(adap, &qs->txq[TXQ_OFLD], m);
1884 }
1885 
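/*
 * Illustrative sketch, not part of the driver: how an offload caller could
 * encode the priority that queue_set() and is_ctrl_pkt() above decode and
 * that t3_offload_tx() uses to pick a Tx queue.  Bit 0 selects CTRL vs.
 * OFLD, bits 1-3 select the queue set; the helper name is an assumption.
 */
static __inline void
set_offload_priority(struct mbuf *m, unsigned int qset, int is_ctrl)
{
	m_set_priority(m, (qset << 1) | (is_ctrl ? 1 : 0));
}
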
1886 /**
1887  *	deliver_partial_bundle - deliver a (partial) bundle of Rx offload pkts
1888  *	@tdev: the offload device that will be receiving the packets
1889  *	@q: the SGE response queue that assembled the bundle
1890  *	@m: the partial bundle
1891  *	@mbufs: the partial bundle
1892  *
1893  *	Delivers a (partial) bundle of Rx offload packets to an offload device.
1894  */
1895 static __inline void
1896 deliver_partial_bundle(struct toedev *tdev,
1897 			struct sge_rspq *q,
1898 			struct mbuf *mbufs[], int n)
1899 {
1900 	if (n) {
1901 		q->offload_bundles++;
1902 		cxgb_ofld_recv(tdev, mbufs, n);
1903 	}
1904 }
1905 
1906 static __inline int
1907 rx_offload(struct toedev *tdev, struct sge_rspq *rq,
1908     struct mbuf *m, struct mbuf *rx_gather[],
1909     unsigned int gather_idx)
1910 {
1911 	rq->offload_pkts++;
1912 	m->m_pkthdr.header = mtod(m, void *);
1913 
1914 	rx_gather[gather_idx++] = m;
1915 	if (gather_idx == RX_BUNDLE_SIZE) {
1916 		cxgb_ofld_recv(tdev, rx_gather, RX_BUNDLE_SIZE);
1917 		gather_idx = 0;
1918 		rq->offload_bundles++;
1919 	}
1920 	return (gather_idx);
1921 }
1922 
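/*
 * Illustrative sketch, not part of the driver: the bundling pattern that
 * rx_offload() and deliver_partial_bundle() implement together.  The caller
 * threads the gather index through rx_offload() for each offload packet and
 * flushes any leftover partial bundle at the end, much as process_responses()
 * below does with its offload_mbufs[] array.  The function name and the
 * pkts[] parameter are assumptions for the example.
 */
static __inline void
example_deliver_offload(struct toedev *tdev, struct sge_rspq *rq,
    struct mbuf **pkts, int npkts)
{
	struct mbuf *bundle[RX_BUNDLE_SIZE];
	int i, n = 0;

	for (i = 0; i < npkts; i++)
		n = rx_offload(tdev, rq, pkts[i], bundle, n);
	/* hand over whatever did not fill a complete bundle */
	deliver_partial_bundle(tdev, rq, bundle, n);
}
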
1923 static void
1924 restart_tx(struct sge_qset *qs)
1925 {
1926 	struct adapter *sc = qs->port->adapter;
1927 
1928 	if (isset(&qs->txq_stopped, TXQ_OFLD) &&
1929 	    should_restart_tx(&qs->txq[TXQ_OFLD]) &&
1930 	    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) {
1931 		qs->txq[TXQ_OFLD].restarts++;
1932 		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_OFLD].qresume_tsk);
1933 	}
1934 	if (isset(&qs->txq_stopped, TXQ_CTRL) &&
1935 	    should_restart_tx(&qs->txq[TXQ_CTRL]) &&
1936 	    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) {
1937 		qs->txq[TXQ_CTRL].restarts++;
1938 		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_CTRL].qresume_tsk);
1939 	}
1940 }
1941 
1942 /**
1943  *	t3_sge_alloc_qset - initialize an SGE queue set
1944  *	@sc: the controller softc
1945  *	@id: the queue set id
1946  *	@nports: how many Ethernet ports will be using this queue set
1947  *	@irq_vec_idx: the IRQ vector index for response queue interrupts
1948  *	@p: configuration parameters for this queue set
1949  *	@ntxq: number of Tx queues for the queue set
1950  *	@pi: port info for queue set
1951  *
1952  *	Allocate resources and initialize an SGE queue set.  A queue set
1953  *	comprises a response queue, two Rx free-buffer queues, and up to 3
1954  *	Tx queues.  The Tx queues are assigned roles in the order Ethernet
1955  *	queue, offload queue, and control queue.
1956  */
1957 int
1958 t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx,
1959 		  const struct qset_params *p, int ntxq, struct port_info *pi)
1960 {
1961 	struct sge_qset *q = &sc->sge.qs[id];
1962 	int i, ret = 0;
1963 
1964 	init_qset_cntxt(q, id);
1965 
1966 	if ((ret = alloc_ring(sc, p->fl_size, sizeof(struct rx_desc),
1967 		    sizeof(struct rx_sw_desc), &q->fl[0].phys_addr,
1968 		    &q->fl[0].desc, &q->fl[0].sdesc,
1969 		    &q->fl[0].desc_tag, &q->fl[0].desc_map,
1970 		    sc->rx_dmat, &q->fl[0].entry_tag)) != 0) {
1971 		printf("error %d from alloc ring fl0\n", ret);
1972 		goto err;
1973 	}
1974 
1975 	if ((ret = alloc_ring(sc, p->jumbo_size, sizeof(struct rx_desc),
1976 		    sizeof(struct rx_sw_desc), &q->fl[1].phys_addr,
1977 		    &q->fl[1].desc, &q->fl[1].sdesc,
1978 		    &q->fl[1].desc_tag, &q->fl[1].desc_map,
1979 		    sc->rx_jumbo_dmat, &q->fl[1].entry_tag)) != 0) {
1980 		printf("error %d from alloc ring fl1\n", ret);
1981 		goto err;
1982 	}
1983 
1984 	if ((ret = alloc_ring(sc, p->rspq_size, sizeof(struct rsp_desc), 0,
1985 		    &q->rspq.phys_addr, &q->rspq.desc, NULL,
1986 		    &q->rspq.desc_tag, &q->rspq.desc_map,
1987 		    NULL, NULL)) != 0) {
1988 		printf("error %d from alloc ring rspq\n", ret);
1989 		goto err;
1990 	}
1991 
1992 	for (i = 0; i < ntxq; ++i) {
1993 		/*
1994 		 * The control queue always uses immediate data so does not
1995 		 * need to keep track of any mbufs.
1996 		 * XXX Placeholder for future TOE support.
1997 		 */
1998 		size_t sz = i == TXQ_CTRL ? 0 : sizeof(struct tx_sw_desc);
1999 
2000 		if ((ret = alloc_ring(sc, p->txq_size[i],
2001 			    sizeof(struct tx_desc), sz,
2002 			    &q->txq[i].phys_addr, &q->txq[i].desc,
2003 			    &q->txq[i].sdesc, &q->txq[i].desc_tag,
2004 			    &q->txq[i].desc_map,
2005 			    sc->tx_dmat, &q->txq[i].entry_tag)) != 0) {
2006 			printf("error %d from alloc ring tx %i\n", ret, i);
2007 			goto err;
2008 		}
2009 		mbufq_init(&q->txq[i].sendq);
2010 		q->txq[i].gen = 1;
2011 		q->txq[i].size = p->txq_size[i];
2012 		mtx_init(&q->txq[i].lock, "t3 txq lock", NULL, MTX_DEF);
2013 	}
2014 
2015 	TASK_INIT(&q->txq[TXQ_OFLD].qresume_tsk, 0, restart_offloadq, q);
2016 	TASK_INIT(&q->txq[TXQ_CTRL].qresume_tsk, 0, restart_ctrlq, q);
2017 
2018 	q->fl[0].gen = q->fl[1].gen = 1;
2019 	q->fl[0].size = p->fl_size;
2020 	q->fl[1].size = p->jumbo_size;
2021 
2022 	q->rspq.gen = 1;
2023 	q->rspq.size = p->rspq_size;
2024 	mtx_init(&q->rspq.lock, "t3 rspq lock", NULL, MTX_DEF);
2025 
2026 	q->txq[TXQ_ETH].stop_thres = nports *
2027 	    flits_to_desc(sgl_len(TX_MAX_SEGS + 1) + 3);
2028 
2029 	q->fl[0].buf_size = MCLBYTES;
2030 	q->fl[0].zone = zone_clust;
2031 	q->fl[0].type = EXT_CLUSTER;
2032 	q->fl[1].buf_size = MJUMPAGESIZE;
2033 	q->fl[1].zone = zone_jumbop;
2034 	q->fl[1].type = EXT_JUMBOP;
2035 
2036 	q->lro.enabled = lro_default;
2037 
2038 	mtx_lock(&sc->sge.reg_lock);
2039 	ret = -t3_sge_init_rspcntxt(sc, q->rspq.cntxt_id, irq_vec_idx,
2040 				   q->rspq.phys_addr, q->rspq.size,
2041 				   q->fl[0].buf_size, 1, 0);
2042 	if (ret) {
2043 		printf("error %d from t3_sge_init_rspcntxt\n", ret);
2044 		goto err_unlock;
2045 	}
2046 
2047 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
2048 		ret = -t3_sge_init_flcntxt(sc, q->fl[i].cntxt_id, 0,
2049 					  q->fl[i].phys_addr, q->fl[i].size,
2050 					  q->fl[i].buf_size, p->cong_thres, 1,
2051 					  0);
2052 		if (ret) {
2053 			printf("error %d from t3_sge_init_flcntxt for index i=%d\n", ret, i);
2054 			goto err_unlock;
2055 		}
2056 	}
2057 
2058 	ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_ETH].cntxt_id, USE_GTS,
2059 				 SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr,
2060 				 q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token,
2061 				 1, 0);
2062 	if (ret) {
2063 		printf("error %d from t3_sge_init_ecntxt\n", ret);
2064 		goto err_unlock;
2065 	}
2066 
2067 	if (ntxq > 1) {
2068 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_OFLD].cntxt_id,
2069 					 USE_GTS, SGE_CNTXT_OFLD, id,
2070 					 q->txq[TXQ_OFLD].phys_addr,
2071 					 q->txq[TXQ_OFLD].size, 0, 1, 0);
2072 		if (ret) {
2073 			printf("error %d from t3_sge_init_ecntxt\n", ret);
2074 			goto err_unlock;
2075 		}
2076 	}
2077 
2078 	if (ntxq > 2) {
2079 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_CTRL].cntxt_id, 0,
2080 					 SGE_CNTXT_CTRL, id,
2081 					 q->txq[TXQ_CTRL].phys_addr,
2082 					 q->txq[TXQ_CTRL].size,
2083 					 q->txq[TXQ_CTRL].token, 1, 0);
2084 		if (ret) {
2085 			printf("error %d from t3_sge_init_ecntxt\n", ret);
2086 			goto err_unlock;
2087 		}
2088 	}
2089 
2090 	mtx_unlock(&sc->sge.reg_lock);
2091 	t3_update_qset_coalesce(q, p);
2092 	q->port = pi;
2093 
2094 	refill_fl(sc, &q->fl[0], q->fl[0].size);
2095 	refill_fl(sc, &q->fl[1], q->fl[1].size);
2096 	refill_rspq(sc, &q->rspq, q->rspq.size - 1);
2097 
2098 	t3_write_reg(sc, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) |
2099 		     V_NEWTIMER(q->rspq.holdoff_tmr));
2100 
2101 	return (0);
2102 
2103 err_unlock:
2104 	mtx_unlock(&sc->sge.reg_lock);
2105 err:
2106 	t3_free_qset(sc, q);
2107 
2108 	return (ret);
2109 }
2110 
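/*
 * Illustrative sketch, not part of the driver: how an attach-time caller
 * might create one queue set per port with the allocator above.  The loop
 * structure, the IRQ vector assignment, and the function name are
 * assumptions for the example; SGE_TXQ_PER_SET and the per-qset parameters
 * come from the driver headers.
 */
static int
example_setup_qsets(adapter_t *sc)
{
	int i, err;

	for (i = 0; i < sc->params.nports; i++) {
		err = t3_sge_alloc_qset(sc, i, 1, i,
		    &sc->params.sge.qset[i], SGE_TXQ_PER_SET, &sc->port[i]);
		if (err != 0)
			return (err);
	}
	return (0);
}
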
2111 void
2112 t3_rx_eth(struct port_info *pi, struct sge_rspq *rq, struct mbuf *m, int ethpad)
2113 {
2114 	struct cpl_rx_pkt *cpl = (struct cpl_rx_pkt *)(mtod(m, uint8_t *) + ethpad);
2115 	struct ifnet *ifp = pi->ifp;
2116 
2117 	DPRINTF("rx_eth m=%p m->m_data=%p p->iff=%d\n", m, mtod(m, uint8_t *), cpl->iff);
2118 	if (&pi->adapter->port[cpl->iff] != pi)
2119 		panic("bad port index %d m->m_data=%p\n", cpl->iff, mtod(m, uint8_t *));
2120 
2121 	if ((ifp->if_capenable & IFCAP_RXCSUM) && !cpl->fragment &&
2122 	    cpl->csum_valid && cpl->csum == 0xffff) {
2123 		m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED|CSUM_IP_VALID);
2124 		rspq_to_qset(rq)->port_stats[SGE_PSTAT_RX_CSUM_GOOD]++;
2125 		m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED|CSUM_IP_VALID|CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
2126 		m->m_pkthdr.csum_data = 0xffff;
2127 	}
2128 	/*
2129 	 * XXX need to add VLAN support for 6.x
2130 	 */
2131 #ifdef VLAN_SUPPORTED
2132 	if (__predict_false(cpl->vlan_valid)) {
2133 		m->m_pkthdr.ether_vtag = ntohs(cpl->vlan);
2134 		m->m_flags |= M_VLANTAG;
2135 	}
2136 #endif
2137 
2138 	m->m_pkthdr.rcvif = ifp;
2139 	m->m_pkthdr.header = mtod(m, uint8_t *) + sizeof(*cpl) + ethpad;
2140 	m_explode(m);
2141 	/*
2142 	 * adjust after conversion to mbuf chain
2143 	 */
2144 	m_adj(m, sizeof(*cpl) + ethpad);
2145 
2146 	(*ifp->if_input)(ifp, m);
2147 }
2148 
2149 /**
2150  *	get_packet - return the next ingress packet buffer from a free list
2151  *	@adap: the adapter that received the packet
2152  *	@drop_thres: # of remaining buffers before we start dropping packets
2153  *	@qs: the qset that the SGE free list holding the packet belongs to
2154  *	@m: the mbuf to be filled in with the packet data
2155  *	@r: the response descriptor for the packet
2156  *
2157  *	Get the next packet from a free list and complete setup of the
2158  *	mbuf.  If the packet is small we make a copy and recycle the
2159  *	original buffer, otherwise we use the original buffer itself.  If a
2160  *	positive drop threshold is supplied packets are dropped and their
2161  *	buffers recycled if (a) the number of remaining buffers is under the
2162  *	threshold and the packet is too big to copy, or (b) the packet should
2163  *	be copied but there is no memory for the copy.
2164  */
2165 static int
2166 get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
2167     struct mbuf *m, struct rsp_desc *r)
2168 {
2169 
2170 	unsigned int len_cq =  ntohl(r->len_cq);
2171 	struct sge_fl *fl = (len_cq & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
2172 	struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];
2173 	uint32_t len = G_RSPD_LEN(len_cq);
2174 	uint32_t flags = ntohl(r->flags);
2175 	uint8_t sopeop = G_RSPD_SOP_EOP(flags);
2176 	void *cl;
2177 	int ret = 0;
2178 
2179 	prefetch(sd->cl);
2180 
2181 	fl->credits--;
2182 	bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD);
2183 
2184 	if (recycle_enable && len <= SGE_RX_COPY_THRES && sopeop == RSPQ_SOP_EOP) {
2185 		cl = mtod(m, void *);
2186 		memcpy(cl, sd->cl, len);
2187 		recycle_rx_buf(adap, fl, fl->cidx);
2188 	} else {
2189 		cl = sd->cl;
2190 		bus_dmamap_unload(fl->entry_tag, sd->map);
2191 	}
2192 	switch(sopeop) {
2193 	case RSPQ_SOP_EOP:
2194 		DBG(DBG_RX, ("get_packet: SOP-EOP m %p\n", m));
2195 		if (cl == sd->cl)
2196 			m_cljset(m, cl, fl->type);
2197 		m->m_len = m->m_pkthdr.len = len;
2198 		ret = 1;
2199 		goto done;
2200 		break;
2201 	case RSPQ_NSOP_NEOP:
2202 		DBG(DBG_RX, ("get_packet: NO_SOP-NO_EOP m %p\n", m));
2203 		ret = 0;
2204 		break;
2205 	case RSPQ_SOP:
2206 		DBG(DBG_RX, ("get_packet: SOP m %p\n", m));
2207 		m_iovinit(m);
2208 		ret = 0;
2209 		break;
2210 	case RSPQ_EOP:
2211 		DBG(DBG_RX, ("get_packet: EOP m %p\n", m));
2212 		ret = 1;
2213 		break;
2214 	}
2215 	m_iovappend(m, cl, fl->buf_size, len, 0);
2216 
2217 done:
2218 	if (++fl->cidx == fl->size)
2219 		fl->cidx = 0;
2220 
2221 	return (ret);
2222 }
2223 
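/*
 * Illustrative sketch, not part of the driver: the copy-versus-zero-copy
 * test get_packet() applies to a complete (SOP+EOP) frame above.  Small
 * frames are copied so the receive buffer can be recycled in place; larger
 * frames pass the cluster itself up the stack.  The helper name is an
 * assumption for the example.
 */
static __inline int
example_should_copy_rx_buf(unsigned int len, uint8_t sopeop)
{
	return (recycle_enable && len <= SGE_RX_COPY_THRES &&
	    sopeop == RSPQ_SOP_EOP);
}
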
2224 /**
2225  *	handle_rsp_cntrl_info - handles control information in a response
2226  *	@qs: the queue set corresponding to the response
2227  *	@flags: the response control flags
2228  *
2229  *	Handles the control information of an SGE response, such as GTS
2230  *	indications and completion credits for the queue set's Tx queues.
2231  *	HW coalesces credits; we don't do any extra SW coalescing.
2232  */
2233 static __inline void
2234 handle_rsp_cntrl_info(struct sge_qset *qs, uint32_t flags)
2235 {
2236 	unsigned int credits;
2237 
2238 #if USE_GTS
2239 	if (flags & F_RSPD_TXQ0_GTS)
2240 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags);
2241 #endif
2242 	credits = G_RSPD_TXQ0_CR(flags);
2243 	if (credits) {
2244 		qs->txq[TXQ_ETH].processed += credits;
2245 		if (desc_reclaimable(&qs->txq[TXQ_ETH]) > TX_START_MAX_DESC)
2246 			taskqueue_enqueue(qs->port->adapter->tq,
2247 			    &qs->port->timer_reclaim_task);
2248 	}
2249 
2250 	credits = G_RSPD_TXQ2_CR(flags);
2251 	if (credits)
2252 		qs->txq[TXQ_CTRL].processed += credits;
2253 
2254 # if USE_GTS
2255 	if (flags & F_RSPD_TXQ1_GTS)
2256 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags);
2257 # endif
2258 	credits = G_RSPD_TXQ1_CR(flags);
2259 	if (credits)
2260 		qs->txq[TXQ_OFLD].processed += credits;
2261 }
2262 
2263 static void
2264 check_ring_db(adapter_t *adap, struct sge_qset *qs,
2265     unsigned int sleeping)
2266 {
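	/*
	 * Intentionally a no-op: ringing doorbells for queues that GTS has
	 * put to sleep is only needed when USE_GTS is enabled, and this
	 * driver builds with USE_GTS set to 0.
	 */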
2267 	;
2268 }
2269 
2270 /**
2271  *	process_responses - process responses from an SGE response queue
2272  *	@adap: the adapter
2273  *	@qs: the queue set to which the response queue belongs
2274  *	@budget: how many responses can be processed in this round
2275  *
2276  *	Process responses from an SGE response queue up to the supplied budget.
2277  *	Responses include received packets as well as credits and other events
2278  *	for the queues that belong to the response queue's queue set.
2279  *	A negative budget is effectively unlimited.
2280  *
2281  *	Additionally choose the interrupt holdoff time for the next interrupt
2282  *	on this queue.  If the system is under memory pressure we use a fairly
2283  *	long delay to help recovery.
2284  */
2285 static int
2286 process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
2287 {
2288 	struct sge_rspq *rspq = &qs->rspq;
2289 	struct rsp_desc *r = &rspq->desc[rspq->cidx];
2290 	int budget_left = budget;
2291 	unsigned int sleeping = 0;
2292 	int lro = qs->lro.enabled;
2293 	struct mbuf *offload_mbufs[RX_BUNDLE_SIZE];
2294 	int ngathered = 0;
2295 #ifdef DEBUG
2296 	static int last_holdoff = 0;
2297 	if (rspq->holdoff_tmr != last_holdoff) {
2298 		printf("next_holdoff=%d\n", rspq->holdoff_tmr);
2299 		last_holdoff = rspq->holdoff_tmr;
2300 	}
2301 #endif
2302 	rspq->next_holdoff = rspq->holdoff_tmr;
2303 
2304 	while (__predict_true(budget_left && is_new_response(r, rspq))) {
2305 		int eth, eop = 0, ethpad = 0;
2306 		uint32_t flags = ntohl(r->flags);
2307 		uint32_t rss_csum = *(const uint32_t *)r;
2308 		uint32_t rss_hash = r->rss_hdr.rss_hash_val;
2309 
2310 		eth = (r->rss_hdr.opcode == CPL_RX_PKT);
2311 
2312 		if (__predict_false(flags & F_RSPD_ASYNC_NOTIF)) {
2313 			/* XXX */
2314 			printf("async notification\n");
2315 
2316 		} else if  (flags & F_RSPD_IMM_DATA_VALID) {
2317 			struct mbuf *m = NULL;
2318 			if (cxgb_debug)
2319 				printf("IMM DATA VALID\n");
2320 			if (rspq->m == NULL)
2321 				rspq->m = m_gethdr(M_NOWAIT, MT_DATA);
2322 			else
2323 				m = m_gethdr(M_NOWAIT, MT_DATA);
2324 
2325 			if (rspq->m == NULL || m == NULL) {
2326 				rspq->next_holdoff = NOMEM_INTR_DELAY;
2327 				budget_left--;
2328 				break;
2329 			}
2330 			get_imm_packet(adap, r, rspq->m, m);
2331 			eop = 1;
2332 			rspq->imm_data++;
2333 		} else if (r->len_cq) {
2334 			int drop_thresh = eth ? SGE_RX_DROP_THRES : 0;
2335 
2336 			if (rspq->m == NULL)
2337 				rspq->m = m_gethdr(M_NOWAIT, MT_DATA);
2338 			if (rspq->m == NULL) {
2339 				log(LOG_WARNING, "failed to get mbuf for packet\n");
2340 				break;
2341 			}
2342 
2343 			ethpad = 2;
2344 			eop = get_packet(adap, drop_thresh, qs, rspq->m, r);
2345 		} else {
2346 			DPRINTF("pure response\n");
2347 			rspq->pure_rsps++;
2348 		}
2349 
2350 		if (flags & RSPD_CTRL_MASK) {
2351 			sleeping |= flags & RSPD_GTS_MASK;
2352 			handle_rsp_cntrl_info(qs, flags);
2353 		}
2354 
2355 		r++;
2356 		if (__predict_false(++rspq->cidx == rspq->size)) {
2357 			rspq->cidx = 0;
2358 			rspq->gen ^= 1;
2359 			r = rspq->desc;
2360 		}
2361 
2362 		prefetch(r);
2363 		if (++rspq->credits >= (rspq->size / 4)) {
2364 			refill_rspq(adap, rspq, rspq->credits);
2365 			rspq->credits = 0;
2366 		}
2367 
2368 		if (eop) {
2369 			prefetch(mtod(rspq->m, uint8_t *));
2370 			prefetch(mtod(rspq->m, uint8_t *) + L1_CACHE_BYTES);
2371 
2372 			if (eth) {
2373 				t3_rx_eth_lro(adap, rspq, rspq->m, ethpad,
2374 				    rss_hash, rss_csum, lro);
2375 
2376 				rspq->m = NULL;
2377 			} else {
2378 				rspq->m->m_pkthdr.csum_data = rss_csum;
2379 				/*
2380 				 * XXX size mismatch
2381 				 */
2382 				m_set_priority(rspq->m, rss_hash);
2383 
2384 				ngathered = rx_offload(&adap->tdev, rspq, rspq->m,
2385 				    offload_mbufs, ngathered);
2386 			}
2387 #ifdef notyet
2388 			taskqueue_enqueue(adap->tq, &adap->timer_reclaim_task);
2389 #else
2390 			__refill_fl(adap, &qs->fl[0]);
2391 			__refill_fl(adap, &qs->fl[1]);
2392 #endif
2393 		}
2394 		--budget_left;
2395 	}
2396 
2397 	deliver_partial_bundle(&adap->tdev, rspq, offload_mbufs, ngathered);
2398 	t3_lro_flush(adap, qs, &qs->lro);
2399 
2400 	if (sleeping)
2401 		check_ring_db(adap, qs, sleeping);
2402 
2403 	smp_mb();  /* commit Tx queue processed updates */
2404 	if (__predict_false(qs->txq_stopped != 0))
2405 		restart_tx(qs);
2406 
2407 	budget -= budget_left;
2408 	return (budget);
2409 }
2410 
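/*
 * Illustrative sketch, not part of the driver: a budgeted polling loop on
 * top of process_responses().  The return value is the number of responses
 * consumed, so a poller can keep going while each pass uses its whole
 * budget; the function name and the budget of 64 are assumptions.
 */
static void
example_poll_qset(adapter_t *adap, struct sge_qset *qs)
{
	int work;

	do {
		work = process_responses(adap, qs, 64);
	} while (work == 64);	/* a full pass may have left more pending */
}
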
2411 /*
2412  * A helper function that processes responses and issues GTS.
2413  */
2414 static __inline int
2415 process_responses_gts(adapter_t *adap, struct sge_rspq *rq)
2416 {
2417 	int work;
2418 	static int last_holdoff = 0;
2419 
2420 	work = process_responses(adap, rspq_to_qset(rq), -1);
2421 
2422 	if (cxgb_debug && (rq->next_holdoff != last_holdoff)) {
2423 		printf("next_holdoff=%d\n", rq->next_holdoff);
2424 		last_holdoff = rq->next_holdoff;
2425 	}
2426 
2427 	t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) |
2428 		     V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx));
2429 	return work;
2430 }
2431 
2432 
2433 /*
2434  * Interrupt handler for legacy INTx interrupts for T3B-based cards.
2435  * Handles data events from SGE response queues as well as error and other
2436  * async events as they all use the same interrupt pin.  We use one SGE
2437  * response queue per port in this mode and protect all response queues with
2438  * queue 0's lock.
2439  */
2440 void
2441 t3b_intr(void *data)
2442 {
2443 	uint32_t map;
2444 	adapter_t *adap = data;
2445 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
2446 	struct sge_rspq *q1 = &adap->sge.qs[1].rspq;
2447 
2448 	t3_write_reg(adap, A_PL_CLI, 0);
2449 	map = t3_read_reg(adap, A_SG_DATA_INTR);
2450 
2451 	if (!map)
2452 		return;
2453 
2454 	if (__predict_false(map & F_ERRINTR))
2455 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
2456 
2457 	mtx_lock(&q0->lock);
2458 
2459 	if (__predict_true(map & 1))
2460 		process_responses_gts(adap, q0);
2461 
2462 	if (map & 2)
2463 		process_responses_gts(adap, q1);
2464 
2465 	mtx_unlock(&q0->lock);
2466 }
2467 
2468 /*
2469  * The MSI interrupt handler.  This needs to handle data events from SGE
2470  * response queues as well as error and other async events as they all use
2471  * the same MSI vector.  We use one SGE response queue per port in this mode
2472  * and protect all response queues with queue 0's lock.
2473  */
2474 void
2475 t3_intr_msi(void *data)
2476 {
2477 	adapter_t *adap = data;
2478 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
2479 	struct sge_rspq *q1 = &adap->sge.qs[1].rspq;
2480 	int new_packets = 0;
2481 
2482 	mtx_lock(&q0->lock);
2483 	if (process_responses_gts(adap, q0)) {
2484 		new_packets = 1;
2485 	}
2486 
2487 	if (adap->params.nports == 2 &&
2488 	    process_responses_gts(adap, q1)) {
2489 		new_packets = 1;
2490 	}
2491 
2492 	mtx_unlock(&q0->lock);
2493 	if (new_packets == 0)
2494 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
2495 }
2496 
2497 void
2498 t3_intr_msix(void *data)
2499 {
2500 	struct sge_qset *qs = data;
2501 	adapter_t *adap = qs->port->adapter;
2502 	struct sge_rspq *rspq = &qs->rspq;
2503 
2504 	mtx_lock(&rspq->lock);
2505 	if (process_responses_gts(adap, rspq) == 0)
2506 		rspq->unhandled_irqs++;
2507 	mtx_unlock(&rspq->lock);
2508 }
2509 
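/*
 * Illustrative sketch, not part of the driver: picking one of the three
 * handlers above for interrupt registration.  USING_MSI is assumed to be
 * defined alongside USING_MSIX in the adapter flags, and note that the
 * MSI-X handler takes a queue set argument while the other two take the
 * adapter itself.
 */
static __inline driver_intr_t *
example_pick_intr_handler(adapter_t *sc)
{
	if (sc->flags & USING_MSIX)
		return (t3_intr_msix);	/* one vector per queue set */
	if (sc->flags & USING_MSI)
		return (t3_intr_msi);	/* single vector shared by all queues */
	return (t3b_intr);		/* legacy INTx on T3B parts */
}
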
2510 /*
2511  * broken by recent mbuf changes
2512  */
2513 static int
2514 t3_lro_enable(SYSCTL_HANDLER_ARGS)
2515 {
2516 	adapter_t *sc;
2517 	int i, j, enabled, err, nqsets = 0;
2518 
2519 #ifndef LRO_WORKING
2520 	return (0);
2521 #endif
2522 
2523 	sc = arg1;
2524 	enabled = sc->sge.qs[0].lro.enabled;
2525 	err = sysctl_handle_int(oidp, &enabled, arg2, req);
2526 
2527 	if (err != 0)
2528 		return (err);
2529 	if (enabled == sc->sge.qs[0].lro.enabled)
2530 		return (0);
2531 
2532 	for (i = 0; i < sc->params.nports; i++)
2533 		for (j = 0; j < sc->port[i].nqsets; j++)
2534 			nqsets++;
2535 
2536 	for (i = 0; i < nqsets; i++)
2537 		sc->sge.qs[i].lro.enabled = enabled;
2538 
2539 	return (0);
2540 }
2541 
2542 static int
2543 t3_set_coalesce_nsecs(SYSCTL_HANDLER_ARGS)
2544 {
2545 	adapter_t *sc = arg1;
2546 	struct qset_params *qsp = &sc->params.sge.qset[0];
2547 	int coalesce_nsecs;
2548 	struct sge_qset *qs;
2549 	int i, j, err, nqsets = 0;
2550 	struct mtx *lock;
2551 
2552 	coalesce_nsecs = qsp->coalesce_nsecs;
2553 	err = sysctl_handle_int(oidp, &coalesce_nsecs, arg2, req);
2554 
2555 	if (err != 0) {
2556 		return (err);
2557 	}
2558 	if (coalesce_nsecs == qsp->coalesce_nsecs)
2559 		return (0);
2560 
2561 	for (i = 0; i < sc->params.nports; i++)
2562 		for (j = 0; j < sc->port[i].nqsets; j++)
2563 			nqsets++;
2564 
2565 	coalesce_nsecs = max(100, coalesce_nsecs);
2566 
2567 	for (i = 0; i < nqsets; i++) {
2568 		qs = &sc->sge.qs[i];
2569 		qsp = &sc->params.sge.qset[i];
2570 		qsp->coalesce_nsecs = coalesce_nsecs;
2571 
2572 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
2573 			    &sc->sge.qs[0].rspq.lock;
2574 
2575 		mtx_lock(lock);
2576 		t3_update_qset_coalesce(qs, qsp);
2577 		t3_write_reg(sc, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) |
2578 		    V_NEWTIMER(qs->rspq.holdoff_tmr));
2579 		mtx_unlock(lock);
2580 	}
2581 
2582 	return (0);
2583 }
2584 
2585 
2586 void
2587 t3_add_sysctls(adapter_t *sc)
2588 {
2589 	struct sysctl_ctx_list *ctx;
2590 	struct sysctl_oid_list *children;
2591 
2592 	ctx = device_get_sysctl_ctx(sc->dev);
2593 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
2594 
2595 	/* random information */
2596 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
2597 	    "firmware_version",
2598 	    CTLFLAG_RD, &sc->fw_version,
2599 	    0, "firmware version");
2600 
2601 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
2602 	    "enable_lro",
2603 	    CTLTYPE_INT|CTLFLAG_RW, sc,
2604 	    0, t3_lro_enable,
2605 	    "I", "enable large receive offload");
2606 
2607 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
2608 	    "intr_coal",
2609 	    CTLTYPE_INT|CTLFLAG_RW, sc,
2610 	    0, t3_set_coalesce_nsecs,
2611 	    "I", "interrupt coalescing timer (ns)");
2612 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
2613 	    "enable_debug",
2614 	    CTLFLAG_RW, &cxgb_debug,
2615 	    0, "enable verbose debugging output");
2616 
2617 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
2618 	    "collapse_free",
2619 	    CTLFLAG_RD, &collapse_free,
2620 	    0, "frees during collapse");
2621 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
2622 	    "mb_free_vec_free",
2623 	    CTLFLAG_RD, &mb_free_vec_free,
2624 	    0, "frees during mb_free_vec");
2625 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
2626 	    "collapse_mbufs",
2627 	    CTLFLAG_RW, &collapse_mbufs,
2628 	    0, "collapse mbuf chains into iovecs");
2629 }
2630 
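/*
 * The nodes registered above hang off the controller's device sysctl tree
 * and can be read or tuned from userland with sysctl(8).  The exact prefix
 * depends on the device name and unit number; purely as an illustration:
 *
 *	sysctl dev.cxgbc.0.intr_coal=50000
 *	sysctl dev.cxgbc.0.enable_debug=1
 */
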
2631 /**
2632  *	t3_get_desc - dump an SGE descriptor for debugging purposes
2633  *	@qs: the queue set
2634  *	@qnum: identifies the specific queue (0..2: Tx, 3:response, 4..5: Rx)
2635  *	@idx: the descriptor index in the queue
2636  *	@data: where to dump the descriptor contents
2637  *
2638  *	Dumps the contents of a HW descriptor of an SGE queue.  Returns the
2639  *	size of the descriptor.
2640  */
2641 int
2642 t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx,
2643 		unsigned char *data)
2644 {
2645 	if (qnum >= 6)
2646 		return (EINVAL);
2647 
2648 	if (qnum < 3) {
2649 		if (!qs->txq[qnum].desc || idx >= qs->txq[qnum].size)
2650 			return (EINVAL);
2651 		memcpy(data, &qs->txq[qnum].desc[idx], sizeof(struct tx_desc));
2652 		return sizeof(struct tx_desc);
2653 	}
2654 
2655 	if (qnum == 3) {
2656 		if (!qs->rspq.desc || idx >= qs->rspq.size)
2657 			return (EINVAL);
2658 		memcpy(data, &qs->rspq.desc[idx], sizeof(struct rsp_desc));
2659 		return sizeof(struct rsp_desc);
2660 	}
2661 
2662 	qnum -= 4;
2663 	if (!qs->fl[qnum].desc || idx >= qs->fl[qnum].size)
2664 		return (EINVAL);
2665 	memcpy(data, &qs->fl[qnum].desc[idx], sizeof(struct rx_desc));
2666 	return sizeof(struct rx_desc);
2667 }
2668 }
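
/*
 * Illustrative sketch, not part of the driver: dumping the first Ethernet
 * Tx descriptor of queue set 0 with the helper above.  The buffer is sized
 * for the largest of the three descriptor types; the function name is an
 * assumption for the example.
 */
static void
example_dump_first_tx_desc(const adapter_t *sc)
{
	unsigned char buf[MAX(sizeof(struct tx_desc),
	    MAX(sizeof(struct rsp_desc), sizeof(struct rx_desc)))];
	int len;

	len = t3_get_desc(&sc->sge.qs[0], TXQ_ETH, 0, buf);
	if (len == sizeof(struct tx_desc))
		printf("TXQ_ETH descriptor 0 dumped (%d bytes)\n", len);
}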