xref: /freebsd/sys/dev/cxgb/cxgb_sge.c (revision 282a3889ebf826db9839be296ff1dd903f6d6d6e)
1 /**************************************************************************
2 
3 Copyright (c) 2007, Chelsio Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Neither the name of the Chelsio Corporation nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15 
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27 
28 ***************************************************************************/
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/kernel.h>
36 #include <sys/module.h>
37 #include <sys/bus.h>
38 #include <sys/conf.h>
39 #include <machine/bus.h>
40 #include <machine/resource.h>
41 #include <sys/bus_dma.h>
42 #include <sys/rman.h>
43 #include <sys/queue.h>
44 #include <sys/sysctl.h>
45 #include <sys/taskqueue.h>
46 
47 
48 #include <sys/proc.h>
49 #include <sys/sched.h>
50 #include <sys/smp.h>
52 
53 #include <netinet/in_systm.h>
54 #include <netinet/in.h>
55 #include <netinet/ip.h>
56 #include <netinet/tcp.h>
57 
58 #include <dev/pci/pcireg.h>
59 #include <dev/pci/pcivar.h>
60 
61 #ifdef CONFIG_DEFINED
62 #include <cxgb_include.h>
63 #else
64 #include <dev/cxgb/cxgb_include.h>
65 #endif
66 
67 uint32_t collapse_free = 0;
68 uint32_t mb_free_vec_free = 0;
69 int      txq_fills = 0;
70 int      collapse_mbufs = 0;
71 static int recycle_enable = 1;
72 static int bogus_imm = 0;
73 
74 /*
75  * XXX GC
76  */
77 #define NET_XMIT_CN 2
78 #define NET_XMIT_SUCCESS 0
79 
80 #define USE_GTS 0
81 
82 #define SGE_RX_SM_BUF_SIZE	1536
83 #define SGE_RX_DROP_THRES	16
84 #define SGE_RX_COPY_THRES	128
85 
86 /*
87  * Period of the Tx buffer reclaim timer.  This timer does not need to run
88  * frequently as Tx buffers are usually reclaimed by new Tx packets.
89  */
90 #define TX_RECLAIM_PERIOD       (hz >> 1)
91 
92 /*
93  * work request size in bytes
94  */
95 #define WR_LEN (WR_FLITS * 8)
96 
97 /*
98  * Values for sge_txq.flags
99  */
100 enum {
101 	TXQ_RUNNING	= 1 << 0,  /* fetch engine is running */
102 	TXQ_LAST_PKT_DB = 1 << 1,  /* last packet rang the doorbell */
103 };
104 
105 struct tx_desc {
106 	uint64_t	flit[TX_DESC_FLITS];
107 } __packed;
108 
109 struct rx_desc {
110 	uint32_t	addr_lo;
111 	uint32_t	len_gen;
112 	uint32_t	gen2;
113 	uint32_t	addr_hi;
113 } __packed;
115 
116 struct rsp_desc {               /* response queue descriptor */
117 	struct rss_header	rss_hdr;
118 	uint32_t		flags;
119 	uint32_t		len_cq;
120 	uint8_t			imm_data[47];
121 	uint8_t			intr_gen;
122 } __packed;
123 
124 #define RX_SW_DESC_MAP_CREATED	(1 << 0)
125 #define TX_SW_DESC_MAP_CREATED	(1 << 1)
126 #define RX_SW_DESC_INUSE        (1 << 3)
127 #define TX_SW_DESC_MAPPED       (1 << 4)
128 
129 #define RSPQ_NSOP_NEOP           G_RSPD_SOP_EOP(0)
130 #define RSPQ_EOP                 G_RSPD_SOP_EOP(F_RSPD_EOP)
131 #define RSPQ_SOP                 G_RSPD_SOP_EOP(F_RSPD_SOP)
132 #define RSPQ_SOP_EOP             G_RSPD_SOP_EOP(F_RSPD_SOP|F_RSPD_EOP)
133 
134 struct tx_sw_desc {                /* SW state per Tx descriptor */
135 	struct mbuf	*m;
136 	bus_dmamap_t	map;
137 	int		flags;
138 };
139 
140 struct rx_sw_desc {                /* SW state per Rx descriptor */
141 	void	        *cl;
142 	bus_dmamap_t	map;
143 	int		flags;
144 };
145 
146 struct txq_state {
147 	unsigned int compl;
148 	unsigned int gen;
149 	unsigned int pidx;
150 };
151 
152 struct refill_fl_cb_arg {
153 	int               error;
154 	bus_dma_segment_t seg;
155 	int               nseg;
156 };
157 
158 /*
159  * Maps a number of flits to the number of Tx descriptors that can hold them.
160  * The formula is
161  *
162  * desc = 1 + (flits - 2) / (WR_FLITS - 1).
163  *
164  * HW allows up to 4 descriptors to be combined into a WR.
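 * For example, reading the SGE_NUM_GENBITS == 2 table below: 1-15 flits fit
 * in one descriptor, 16-29 in two, 30-43 in three, and 44-57 in four, which
 * is consistent with WR_FLITS == 15 (the last flit of each descriptor is
 * reserved for the second generation bit in that configuration).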
165  */
166 static uint8_t flit_desc_map[] = {
167 	0,
168 #if SGE_NUM_GENBITS == 1
169 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
170 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
171 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
172 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
173 #elif SGE_NUM_GENBITS == 2
174 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
175 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
176 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
177 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
178 #else
179 # error "SGE_NUM_GENBITS must be 1 or 2"
180 #endif
181 };
182 
183 
184 static int lro_default = 0;
185 int cxgb_debug = 0;
186 
187 static void t3_free_qset(adapter_t *sc, struct sge_qset *q);
188 static void sge_timer_cb(void *arg);
189 static void sge_timer_reclaim(void *arg, int ncount);
190 static void sge_txq_reclaim_handler(void *arg, int ncount);
191 static int free_tx_desc(struct sge_txq *q, int n, struct mbuf **m_vec);
192 
193 /**
194  *	reclaim_completed_tx - reclaims completed Tx descriptors
195  *	@q: the Tx queue to reclaim completed descriptors from
196  *	@nbufs: the maximum number of mbufs that can be returned in @mvec
197  *	@mvec: the vector in which the reclaimed mbufs are returned
198  *
199  *	Reclaims Tx descriptors that the SGE has indicated it has processed,
200  *	collecting the mbufs in @mvec.  Called with the Tx queue's lock held.
201  */
202 static __inline int
203 reclaim_completed_tx(struct sge_txq *q, int nbufs, struct mbuf **mvec)
204 {
205 	int reclaimed, reclaim = desc_reclaimable(q);
206 	int n = 0;
207 
208 	mtx_assert(&q->lock, MA_OWNED);
209 	if (reclaim > 0) {
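		/*
		 * At most nbufs descriptors are reclaimed per call; that is
		 * all the mbuf pointers the caller's vector can hold.
		 */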
210 		n = free_tx_desc(q, min(reclaim, nbufs), mvec);
211 		reclaimed = min(reclaim, nbufs);
212 		q->cleaned += reclaimed;
213 		q->in_use -= reclaimed;
214 	}
215 	return (n);
216 }
217 
218 /**
219  *	should_restart_tx - are there enough resources to restart a Tx queue?
220  *	@q: the Tx queue
221  *
222  *	Checks if there are enough descriptors to restart a suspended Tx queue.
223  */
224 static __inline int
225 should_restart_tx(const struct sge_txq *q)
226 {
227 	unsigned int r = q->processed - q->cleaned;
228 
229 	return q->in_use - r < (q->size >> 1);
230 }
231 
232 /**
233  *	t3_sge_init - initialize SGE
234  *	@adap: the adapter
235  *	@p: the SGE parameters
236  *
237  *	Performs SGE initialization needed every time after a chip reset.
238  *	We do not initialize any of the queue sets here; instead, the driver
239  *	top level must request those individually.  We also do not enable DMA
240  *	here; that should be done after the queues have been set up.
241  */
242 void
243 t3_sge_init(adapter_t *adap, struct sge_params *p)
244 {
245 	u_int ctrl, ups;
246 
247 	ups = 0; /* = ffs(pci_resource_len(adap->pdev, 2) >> 12); */
248 
249 	ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL |
250 	       F_CQCRDTCTRL |
251 	       V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS |
252 	       V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING;
253 #if SGE_NUM_GENBITS == 1
254 	ctrl |= F_EGRGENCTRL;
255 #endif
256 	if (adap->params.rev > 0) {
257 		if (!(adap->flags & (USING_MSIX | USING_MSI)))
258 			ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ;
259 		ctrl |= F_CQCRDTCTRL | F_AVOIDCQOVFL;
260 	}
261 	t3_write_reg(adap, A_SG_CONTROL, ctrl);
262 	t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) |
263 		     V_LORCQDRBTHRSH(512));
264 	t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10);
265 	t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) |
266 		     V_TIMEOUT(200 * core_ticks_per_usec(adap)));
267 	t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH, 1000);
268 	t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256);
269 	t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000);
270 	t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256);
271 	t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff));
272 	t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024);
273 }
274 
275 
276 /**
277  *	sgl_len - calculates the size of an SGL of the given capacity
278  *	@n: the number of SGL entries
279  *
280  *	Calculates the number of flits needed for a scatter/gather list that
281  *	can hold the given number of entries.
282  */
283 static __inline unsigned int
284 sgl_len(unsigned int n)
285 {
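	/*
	 * Each pair of SGL entries packs into 3 flits (two 64-bit addresses
	 * plus two 32-bit lengths); a final odd entry needs 2 flits on its own.
	 */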
286 	return ((3 * n) / 2 + (n & 1));
287 }
288 
289 /**
290  *	get_imm_packet - return the next ingress packet buffer from a response
291  *	@resp: the response descriptor containing the packet data
292  *
293  *	Return a packet containing the immediate data of the given response.
294  */
295 static int
296 get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct mbuf *m, void *cl, uint32_t flags)
297 {
298 	int len, error;
299 	uint8_t sopeop = G_RSPD_SOP_EOP(flags);
300 
301 	/*
302 	 * An unexpected SOP/EOP combination here would be a firmware bug.
303 	 */
304 	len = G_RSPD_LEN(ntohl(resp->len_cq));
305 	if (sopeop == RSPQ_NSOP_NEOP || sopeop == RSPQ_SOP) {
306 		if (cxgb_debug)
307 			device_printf(sc->dev, "unexpected value sopeop=%d flags=0x%x len=%d in get_imm_packet\n", sopeop, flags, len);
308 		bogus_imm++;
309 		return (EINVAL);
310 	}
311 	error = 0;
312 	switch (sopeop) {
313 	case RSPQ_SOP_EOP:
314 		m->m_len = m->m_pkthdr.len = len;
315 		memcpy(mtod(m, uint8_t *), resp->imm_data, len);
316 		break;
317 	case RSPQ_EOP:
318 		memcpy(cl, resp->imm_data, len);
319 		m_iovappend(m, cl, MSIZE, len, 0);
320 		break;
321 	default:
322 		bogus_imm++;
323 		error = EINVAL;
324 	}
325 
326 	return (error);
327 }
328 
329 
330 static __inline u_int
331 flits_to_desc(u_int n)
332 {
333 	return (flit_desc_map[n]);
334 }
335 
336 void
337 t3_sge_err_intr_handler(adapter_t *adapter)
338 {
339 	unsigned int v, status;
340 
341 
342 	status = t3_read_reg(adapter, A_SG_INT_CAUSE);
343 
344 	if (status & F_RSPQCREDITOVERFOW)
345 		CH_ALERT(adapter, "SGE response queue credit overflow\n");
346 
347 	if (status & F_RSPQDISABLED) {
348 		v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS);
349 
350 		CH_ALERT(adapter,
351 			 "packet delivered to disabled response queue (0x%x)\n",
352 			 (v >> S_RSPQ0DISABLED) & 0xff);
353 	}
354 
355 	t3_write_reg(adapter, A_SG_INT_CAUSE, status);
356 	if (status & (F_RSPQCREDITOVERFOW | F_RSPQDISABLED))
357 		t3_fatal_err(adapter);
358 }
359 
360 void
361 t3_sge_prep(adapter_t *adap, struct sge_params *p)
362 {
363 	int i;
364 
365 	/* XXX Does ETHER_ALIGN need to be accounted for here? */
366 	p->max_pkt_size = MJUM16BYTES - sizeof(struct cpl_rx_data);
367 
368 	for (i = 0; i < SGE_QSETS; ++i) {
369 		struct qset_params *q = p->qset + i;
370 
371 		q->polling = adap->params.rev > 0;
372 
373 		if (adap->params.nports > 2)
374 			q->coalesce_nsecs = 50000;
375 		else
376 			q->coalesce_nsecs = 5000;
377 
378 		q->rspq_size = RSPQ_Q_SIZE;
379 		q->fl_size = FL_Q_SIZE;
380 		q->jumbo_size = JUMBO_Q_SIZE;
381 		q->txq_size[TXQ_ETH] = TX_ETH_Q_SIZE;
382 		q->txq_size[TXQ_OFLD] = 1024;
383 		q->txq_size[TXQ_CTRL] = 256;
384 		q->cong_thres = 0;
385 	}
386 }
387 
388 int
389 t3_sge_alloc(adapter_t *sc)
390 {
391 
392 	/* The parent tag. */
393 	if (bus_dma_tag_create( NULL,			/* parent */
394 				1, 0,			/* algnmnt, boundary */
395 				BUS_SPACE_MAXADDR,	/* lowaddr */
396 				BUS_SPACE_MAXADDR,	/* highaddr */
397 				NULL, NULL,		/* filter, filterarg */
398 				BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
399 				BUS_SPACE_UNRESTRICTED, /* nsegments */
400 				BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
401 				0,			/* flags */
402 				NULL, NULL,		/* lock, lockarg */
403 				&sc->parent_dmat)) {
404 		device_printf(sc->dev, "Cannot allocate parent DMA tag\n");
405 		return (ENOMEM);
406 	}
407 
408 	/*
409 	 * DMA tag for normal sized RX frames
410 	 */
411 	if (bus_dma_tag_create(sc->parent_dmat, MCLBYTES, 0, BUS_SPACE_MAXADDR,
412 		BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, 1,
413 		MCLBYTES, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_dmat)) {
414 		device_printf(sc->dev, "Cannot allocate RX DMA tag\n");
415 		return (ENOMEM);
416 	}
417 
418 	/*
419 	 * DMA tag for jumbo sized RX frames.
420 	 */
421 	if (bus_dma_tag_create(sc->parent_dmat, MJUMPAGESIZE, 0, BUS_SPACE_MAXADDR,
422 		BUS_SPACE_MAXADDR, NULL, NULL, MJUMPAGESIZE, 1, MJUMPAGESIZE,
423 		BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_jumbo_dmat)) {
424 		device_printf(sc->dev, "Cannot allocate RX jumbo DMA tag\n");
425 		return (ENOMEM);
426 	}
427 
428 	/*
429 	 * DMA tag for TX frames.
430 	 */
431 	if (bus_dma_tag_create(sc->parent_dmat, 1, 0, BUS_SPACE_MAXADDR,
432 		BUS_SPACE_MAXADDR, NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
433 		TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
434 		NULL, NULL, &sc->tx_dmat)) {
435 		device_printf(sc->dev, "Cannot allocate TX DMA tag\n");
436 		return (ENOMEM);
437 	}
438 
439 	return (0);
440 }
441 
442 int
443 t3_sge_free(struct adapter * sc)
444 {
445 
446 	if (sc->tx_dmat != NULL)
447 		bus_dma_tag_destroy(sc->tx_dmat);
448 
449 	if (sc->rx_jumbo_dmat != NULL)
450 		bus_dma_tag_destroy(sc->rx_jumbo_dmat);
451 
452 	if (sc->rx_dmat != NULL)
453 		bus_dma_tag_destroy(sc->rx_dmat);
454 
455 	if (sc->parent_dmat != NULL)
456 		bus_dma_tag_destroy(sc->parent_dmat);
457 
458 	return (0);
459 }
460 
461 void
462 t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p)
463 {
464 
465 	qs->rspq.holdoff_tmr = max(p->coalesce_nsecs/100, 1U);
466 	qs->rspq.polling = 0 /* p->polling */;
467 }
468 
469 static void
470 refill_fl_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
471 {
472 	struct refill_fl_cb_arg *cb_arg = arg;
473 
474 	cb_arg->error = error;
475 	cb_arg->seg = segs[0];
476 	cb_arg->nseg = nseg;
477 
478 }
479 
480 /**
481  *	refill_fl - refill an SGE free-buffer list
482  *	@sc: the controller softc
483  *	@q: the free-list to refill
484  *	@n: the number of new buffers to allocate
485  *
486  *	(Re)populate an SGE free-buffer list with up to @n new packet buffers.
487  *	The caller must ensure that @n does not exceed the queue's capacity.
488  */
489 static void
490 refill_fl(adapter_t *sc, struct sge_fl *q, int n)
491 {
492 	struct rx_sw_desc *sd = &q->sdesc[q->pidx];
493 	struct rx_desc *d = &q->desc[q->pidx];
494 	struct refill_fl_cb_arg cb_arg;
495 	void *cl;
496 	int err;
497 
498 	cb_arg.error = 0;
499 	while (n--) {
500 		/*
501 		 * We only allocate a cluster, mbuf allocation happens after rx
502 		 */
503 		if ((cl = m_cljget(NULL, M_DONTWAIT, q->buf_size)) == NULL) {
504 			log(LOG_WARNING, "Failed to allocate cluster\n");
505 			goto done;
506 		}
507 		if ((sd->flags & RX_SW_DESC_MAP_CREATED) == 0) {
508 			if ((err = bus_dmamap_create(q->entry_tag, 0, &sd->map))) {
509 				log(LOG_WARNING, "bus_dmamap_create failed %d\n", err);
510 				uma_zfree(q->zone, cl);
511 				goto done;
512 			}
513 			sd->flags |= RX_SW_DESC_MAP_CREATED;
514 		}
515 		err = bus_dmamap_load(q->entry_tag, sd->map, cl, q->buf_size,
516 		    refill_fl_cb, &cb_arg, 0);
517 
518 		if (err != 0 || cb_arg.error) {
519 			log(LOG_WARNING, "failure in refill_fl %d\n", cb_arg.error);
520 			/*
521 			 * XXX free cluster
522 			 */
523 			return;
524 		}
525 
526 		sd->flags |= RX_SW_DESC_INUSE;
527 		sd->cl = cl;
528 		d->addr_lo = htobe32(cb_arg.seg.ds_addr & 0xffffffff);
529 		d->addr_hi = htobe32(((uint64_t)cb_arg.seg.ds_addr >>32) & 0xffffffff);
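		/*
		 * The generation bits let the SGE tell freshly written
		 * descriptors apart from stale ones left over from the
		 * previous pass around the ring.
		 */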
530 		d->len_gen = htobe32(V_FLD_GEN1(q->gen));
531 		d->gen2 = htobe32(V_FLD_GEN2(q->gen));
532 
533 		d++;
534 		sd++;
535 
536 		if (++q->pidx == q->size) {
537 			q->pidx = 0;
538 			q->gen ^= 1;
539 			sd = q->sdesc;
540 			d = q->desc;
541 		}
542 		q->credits++;
543 	}
544 
545 done:
546 	t3_write_reg(sc, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
547 }
548 
549 
550 /**
551  *	free_rx_bufs - free the Rx buffers on an SGE free list
552  *	@sc: the controller softc
553  *	@q: the SGE free list to clean up
554  *
555  *	Release the buffers on an SGE free-buffer Rx queue.  HW fetching from
556  *	this queue should be stopped before calling this function.
557  */
558 static void
559 free_rx_bufs(adapter_t *sc, struct sge_fl *q)
560 {
561 	u_int cidx = q->cidx;
562 
563 	while (q->credits--) {
564 		struct rx_sw_desc *d = &q->sdesc[cidx];
565 
566 		if (d->flags & RX_SW_DESC_INUSE) {
567 			bus_dmamap_unload(q->entry_tag, d->map);
568 			bus_dmamap_destroy(q->entry_tag, d->map);
569 			uma_zfree(q->zone, d->cl);
570 		}
571 		d->cl = NULL;
572 		if (++cidx == q->size)
573 			cidx = 0;
574 	}
575 }
576 
577 static __inline void
578 __refill_fl(adapter_t *adap, struct sge_fl *fl)
579 {
580 	refill_fl(adap, fl, min(16U, fl->size - fl->credits));
581 }
582 
583 /**
584  *	recycle_rx_buf - recycle a receive buffer
585  *	@adapter: the adapter
586  *	@q: the SGE free list
587  *	@idx: index of buffer to recycle
588  *
589  *	Recycles the specified buffer on the given free list by adding it at
590  *	the next available slot on the list.
591  */
592 static void
593 recycle_rx_buf(adapter_t *adap, struct sge_fl *q, unsigned int idx)
594 {
595 	struct rx_desc *from = &q->desc[idx];
596 	struct rx_desc *to   = &q->desc[q->pidx];
597 
598 	q->sdesc[q->pidx] = q->sdesc[idx];
599 	to->addr_lo = from->addr_lo;        /* already big endian */
600 	to->addr_hi = from->addr_hi;        /* likewise */
601 	wmb();
602 	to->len_gen = htobe32(V_FLD_GEN1(q->gen));
603 	to->gen2 = htobe32(V_FLD_GEN2(q->gen));
604 	q->credits++;
605 
606 	if (++q->pidx == q->size) {
607 		q->pidx = 0;
608 		q->gen ^= 1;
609 	}
610 	t3_write_reg(adap, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
611 }
612 
613 static void
614 alloc_ring_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
615 {
616 	uint32_t *addr;
617 
618 	addr = arg;
619 	*addr = segs[0].ds_addr;
620 }
621 
622 static int
623 alloc_ring(adapter_t *sc, size_t nelem, size_t elem_size, size_t sw_size,
624     bus_addr_t *phys, void *desc, void *sdesc, bus_dma_tag_t *tag,
625     bus_dmamap_t *map, bus_dma_tag_t parent_entry_tag, bus_dma_tag_t *entry_tag)
626 {
627 	size_t len = nelem * elem_size;
628 	void *s = NULL;
629 	void *p = NULL;
630 	int err;
631 
632 	if ((err = bus_dma_tag_create(sc->parent_dmat, PAGE_SIZE, 0,
633 				      BUS_SPACE_MAXADDR_32BIT,
634 				      BUS_SPACE_MAXADDR, NULL, NULL, len, 1,
635 				      len, 0, NULL, NULL, tag)) != 0) {
636 		device_printf(sc->dev, "Cannot allocate descriptor tag\n");
637 		return (ENOMEM);
638 	}
639 
640 	if ((err = bus_dmamem_alloc(*tag, (void **)&p, BUS_DMA_NOWAIT,
641 				    map)) != 0) {
642 		device_printf(sc->dev, "Cannot allocate descriptor memory\n");
643 		return (ENOMEM);
644 	}
645 
646 	bus_dmamap_load(*tag, *map, p, len, alloc_ring_cb, phys, 0);
647 	bzero(p, len);
648 	*(void **)desc = p;
649 
650 	if (sw_size) {
651 		len = nelem * sw_size;
652 		s = malloc(len, M_DEVBUF, M_WAITOK);
653 		bzero(s, len);
654 		*(void **)sdesc = s;
655 	}
656 	if (parent_entry_tag == NULL)
657 		return (0);
658 
659 	if ((err = bus_dma_tag_create(parent_entry_tag, 1, 0,
660 				      BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR,
661 		                      NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
662 				      TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
663 		                      NULL, NULL, entry_tag)) != 0) {
664 		device_printf(sc->dev, "Cannot allocate descriptor entry tag\n");
665 		return (ENOMEM);
666 	}
667 	return (0);
668 }
669 
670 static void
671 sge_slow_intr_handler(void *arg, int ncount)
672 {
673 	adapter_t *sc = arg;
674 
675 	t3_slow_intr_handler(sc);
676 }
677 
678 /**
679  *	sge_timer_cb - perform periodic maintenance of the SGE queue sets
680  *	@arg: the adapter
681  *
682  *	Runs periodically from a timer to perform maintenance of the adapter's
683  *	SGE queue sets.  It performs the following tasks:
684  *
685  *	a) Cleans up any completed Tx descriptors that may still be pending.
686  *	Normal descriptor cleanup happens when new packets are added to a Tx
687  *	queue so this timer is relatively infrequent and does any cleanup only
688  *	if the Tx queue has not seen any new packets in a while.  We make a
689  *	best effort attempt to reclaim descriptors, in that we don't wait
690  *	around if we cannot get a queue's lock (which most likely is because
691  *	someone else is queueing new packets and so will also handle the clean
692  *	up).  Since control queues use immediate data exclusively we don't
693  *	bother cleaning them up here.
694  *
695  *	b) Replenishes Rx queues that have run out due to memory shortage.
696  *	Normally new Rx buffers are added when existing ones are consumed but
697  *	when out of memory a queue can become empty.  We try to add only a few
698  *	buffers here; the queue will be replenished fully as these new buffers
699  *	are used up, provided the memory shortage has subsided.
700  *
701  *	c) Return coalesced response queue credits in case a response queue is
702  *	starved.
703  *
704  *	d) Ring doorbells for T304 tunnel queues since we have seen doorbell
705  *	fifo overflows and the FW doesn't implement any recovery scheme yet.
706  */
707 static void
708 sge_timer_cb(void *arg)
709 {
710 	adapter_t *sc = arg;
711 	struct port_info *p;
712 	struct sge_qset *qs;
713 	struct sge_txq  *txq;
714 	int i, j;
715 	int reclaim_eth, reclaim_ofl, refill_rx;
716 
717 	for (i = 0; i < sc->params.nports; i++)
718 		for (j = 0; j < sc->port[i].nqsets; j++) {
719 			qs = &sc->sge.qs[i + j];
720 			txq = &qs->txq[0];
721 			reclaim_eth = txq[TXQ_ETH].processed - txq[TXQ_ETH].cleaned;
722 			reclaim_ofl = txq[TXQ_OFLD].processed - txq[TXQ_OFLD].cleaned;
723 			refill_rx = ((qs->fl[0].credits < qs->fl[0].size) ||
724 			    (qs->fl[1].credits < qs->fl[1].size));
725 			if (reclaim_eth || reclaim_ofl || refill_rx) {
726 				p = &sc->port[i];
727 				taskqueue_enqueue(p->tq, &p->timer_reclaim_task);
728 				break;
729 			}
730 		}
731 	if (sc->params.nports > 2) {
732 		int i;
733 
734 		for_each_port(sc, i) {
735 			struct port_info *pi = &sc->port[i];
736 
737 			t3_write_reg(sc, A_SG_KDOORBELL,
738 				     F_SELEGRCNTX |
739 				     (FW_TUNNEL_SGEEC_START + pi->first_qset));
740 		}
741 	}
742 	if (sc->open_device_map != 0)
743 		callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
744 }
745 
746 /*
747  * This is meant to be a catch-all function to keep sge state private
748  * to sge.c
749  *
750  */
751 int
752 t3_sge_init_adapter(adapter_t *sc)
753 {
754 	callout_init(&sc->sge_timer_ch, CALLOUT_MPSAFE);
755 	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
756 	TASK_INIT(&sc->slow_intr_task, 0, sge_slow_intr_handler, sc);
757 	return (0);
758 }
759 
760 int
761 t3_sge_init_port(struct port_info *p)
762 {
763 	TASK_INIT(&p->timer_reclaim_task, 0, sge_timer_reclaim, p);
764 	return (0);
765 }
766 
767 void
768 t3_sge_deinit_sw(adapter_t *sc)
769 {
770 	int i;
771 
772 	callout_drain(&sc->sge_timer_ch);
773 	if (sc->tq)
774 		taskqueue_drain(sc->tq, &sc->slow_intr_task);
775 	for (i = 0; i < sc->params.nports; i++)
776 		if (sc->port[i].tq != NULL)
777 			taskqueue_drain(sc->port[i].tq, &sc->port[i].timer_reclaim_task);
778 }
779 
780 /**
781  *	refill_rspq - replenish an SGE response queue
782  *	@adapter: the adapter
783  *	@q: the response queue to replenish
784  *	@credits: how many new responses to make available
785  *
786  *	Replenishes a response queue by making the supplied number of responses
787  *	available to HW.
788  */
789 static __inline void
790 refill_rspq(adapter_t *sc, const struct sge_rspq *q, u_int credits)
791 {
792 
793 	/* mbufs are allocated on demand when a rspq entry is processed. */
794 	t3_write_reg(sc, A_SG_RSPQ_CREDIT_RETURN,
795 		     V_RSPQ(q->cntxt_id) | V_CREDITS(credits));
796 }
797 
798 static __inline void
799 sge_txq_reclaim_(struct sge_txq *txq)
800 {
801 	int reclaimable, i, n;
802 	struct mbuf *m_vec[TX_CLEAN_MAX_DESC];
803 	struct port_info *p;
804 
805 	p = txq->port;
806 reclaim_more:
807 	n = 0;
808 	reclaimable = desc_reclaimable(txq);
809 	if (reclaimable > 0 && mtx_trylock(&txq->lock)) {
810 		n = reclaim_completed_tx(txq, TX_CLEAN_MAX_DESC, m_vec);
811 		mtx_unlock(&txq->lock);
812 	}
813 	if (n == 0)
814 		return;
815 
816 	for (i = 0; i < n; i++) {
817 		m_freem_vec(m_vec[i]);
818 	}
819 	if (p && p->ifp->if_drv_flags & IFF_DRV_OACTIVE &&
820 	    txq->size - txq->in_use >= TX_START_MAX_DESC) {
821 		txq_fills++;
822 		p->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
823 		taskqueue_enqueue(p->tq, &p->start_task);
824 	}
825 
826 	if (n)
827 		goto reclaim_more;
828 }
829 
830 static void
831 sge_txq_reclaim_handler(void *arg, int ncount)
832 {
833 	struct sge_txq *q = arg;
834 
835 	sge_txq_reclaim_(q);
836 }
837 
838 static void
839 sge_timer_reclaim(void *arg, int ncount)
840 {
841 	struct port_info *p = arg;
842 	int i, nqsets = p->nqsets;
843 	adapter_t *sc = p->adapter;
844 	struct sge_qset *qs;
845 	struct sge_txq *txq;
846 	struct mtx *lock;
847 
848 	for (i = 0; i < nqsets; i++) {
849 		qs = &sc->sge.qs[i];
850 		txq = &qs->txq[TXQ_ETH];
851 		sge_txq_reclaim_(txq);
852 
853 		txq = &qs->txq[TXQ_OFLD];
854 		sge_txq_reclaim_(txq);
855 
856 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
857 			    &sc->sge.qs[0].rspq.lock;
858 
859 		if (mtx_trylock(lock)) {
860 			/* XXX currently assume that we are *NOT* polling */
861 			uint32_t status = t3_read_reg(sc, A_SG_RSPQ_FL_STATUS);
862 
863 			if (qs->fl[0].credits < qs->fl[0].size - 16)
864 				__refill_fl(sc, &qs->fl[0]);
865 			if (qs->fl[1].credits < qs->fl[1].size - 16)
866 				__refill_fl(sc, &qs->fl[1]);
867 
868 			if (status & (1 << qs->rspq.cntxt_id)) {
869 				if (qs->rspq.credits) {
870 					refill_rspq(sc, &qs->rspq, 1);
871 					qs->rspq.credits--;
872 					t3_write_reg(sc, A_SG_RSPQ_FL_STATUS,
873 					    1 << qs->rspq.cntxt_id);
874 				}
875 			}
876 			mtx_unlock(lock);
877 		}
878 	}
879 }
880 
881 /**
882  *	init_qset_cntxt - initialize an SGE queue set context info
883  *	@qs: the queue set
884  *	@id: the queue set id
885  *
886  *	Initializes the TIDs and context ids for the queues of a queue set.
887  */
888 static void
889 init_qset_cntxt(struct sge_qset *qs, u_int id)
890 {
891 
892 	qs->rspq.cntxt_id = id;
893 	qs->fl[0].cntxt_id = 2 * id;
894 	qs->fl[1].cntxt_id = 2 * id + 1;
895 	qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id;
896 	qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id;
897 	qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id;
898 	qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id;
899 	qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id;
900 }
901 
902 
903 static void
904 txq_prod(struct sge_txq *txq, unsigned int ndesc, struct txq_state *txqs)
905 {
906 	txq->in_use += ndesc;
907 	/*
908 	 * XXX we don't handle stopping of queue
909 	 * presumably start handles this when we bump against the end
910 	 */
911 	txqs->gen = txq->gen;
912 	txq->unacked += ndesc;
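	/*
	 * Request a completion from the SGE roughly every 8 descriptors'
	 * worth of work: bit 3 of the running unacked count is shifted into
	 * the WR completion flag position and then cleared below.
	 */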
913 	txqs->compl = (txq->unacked & 8) << (S_WR_COMPL - 3);
914 	txq->unacked &= 7;
915 	txqs->pidx = txq->pidx;
916 	txq->pidx += ndesc;
917 
918 	if (txq->pidx >= txq->size) {
919 		txq->pidx -= txq->size;
920 		txq->gen ^= 1;
921 	}
922 
923 }
924 
925 /**
926  *	calc_tx_descs - calculate the number of Tx descriptors for a packet
927  *	@m: the packet mbufs
928  *      @nsegs: the number of segments
929  *
930  * 	Returns the number of Tx descriptors needed for the given Ethernet
931  * 	packet.  Ethernet packets require addition of WR and CPL headers.
932  */
933 static __inline unsigned int
934 calc_tx_descs(const struct mbuf *m, int nsegs)
935 {
936 	unsigned int flits;
937 
938 	if (m->m_pkthdr.len <= WR_LEN - sizeof(struct cpl_tx_pkt))
939 		return 1;
940 
941 	flits = sgl_len(nsegs) + 2;
942 #ifdef TSO_SUPPORTED
943 	if  (m->m_pkthdr.csum_flags & (CSUM_TSO))
944 		flits++;
945 #endif
946 	return flits_to_desc(flits);
947 }
948 
949 static unsigned int
950 busdma_map_mbufs(struct mbuf **m, struct sge_txq *txq,
951     struct tx_sw_desc *stx, bus_dma_segment_t *segs, int *nsegs)
952 {
953 	struct mbuf *m0;
954 	int err, pktlen;
955 
956 	m0 = *m;
957 	pktlen = m0->m_pkthdr.len;
958 
959 	err = bus_dmamap_load_mvec_sg(txq->entry_tag, stx->map, m0, segs, nsegs, 0);
960 #ifdef DEBUG
961 	if (err) {
962 		int n = 0;
963 		struct mbuf *mtmp = m0;
964 		while(mtmp) {
965 			n++;
966 			mtmp = mtmp->m_next;
967 		}
968 		printf("map_mbufs: bus_dmamap_load_mvec_sg failed with %d - pkthdr.len==%d nmbufs=%d\n",
969 		    err, m0->m_pkthdr.len, n);
970 	}
971 #endif
972 	if (err == EFBIG) {
973 		/* Too many segments, try to defrag */
974 		m0 = m_defrag(m0, M_DONTWAIT);
975 		if (m0 == NULL) {
976 			m_freem(*m);
977 			*m = NULL;
978 			return (ENOBUFS);
979 		}
980 		*m = m0;
981 		err = bus_dmamap_load_mbuf_sg(txq->entry_tag, stx->map, m0, segs, nsegs, 0);
982 	}
983 
984 	if (err == ENOMEM) {
985 		return (err);
986 	}
987 
988 	if (err) {
989 		if (cxgb_debug)
990 			printf("map failure err=%d pktlen=%d\n", err, pktlen);
991 		m_freem_vec(m0);
992 		*m = NULL;
993 		return (err);
994 	}
995 
996 	bus_dmamap_sync(txq->entry_tag, stx->map, BUS_DMASYNC_PREWRITE);
997 	stx->flags |= TX_SW_DESC_MAPPED;
998 
999 	return (0);
1000 }
1001 
1002 /**
1003  *	make_sgl - populate a scatter/gather list for a packet
1004  *	@sgp: the SGL to populate
1005  *	@segs: the packet dma segments
1006  *	@nsegs: the number of segments
1007  *
1008  *	Populates a scatter/gather list for the buffers that make up a packet.
1009  *	The caller must provide an SGL large enough to hold the list (see
1010  *	sgl_len()).
1011  */
1012 static __inline void
1013 make_sgl(struct sg_ent *sgp, bus_dma_segment_t *segs, int nsegs)
1014 {
1015 	int i, idx;
1016 
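	/*
	 * Entries are packed two per sg_ent; idx toggles between the two
	 * slots and sgp advances after every completed pair.
	 */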
1017 	for (idx = 0, i = 0; i < nsegs; i++, idx ^= 1) {
1018 		if (i && idx == 0)
1019 			++sgp;
1020 
1021 		sgp->len[idx] = htobe32(segs[i].ds_len);
1022 		sgp->addr[idx] = htobe64(segs[i].ds_addr);
1023 	}
1024 
1025 	if (idx)
1026 		sgp->len[idx] = 0;
1027 }
1028 
1029 /**
1030  *	check_ring_tx_db - check and potentially ring a Tx queue's doorbell
1031  *	@adap: the adapter
1032  *	@q: the Tx queue
1033  *
1034  *	Ring the doorbell if a Tx queue is asleep.  There is a natural race
1035  *	where the HW may go to sleep just after we check; in that case the
1036  *	interrupt handler will detect the outstanding TX packet and ring the
1037  *	doorbell for us.
1038  *
1039  *	When GTS is disabled we unconditionally ring the doorbell.
1040  */
1041 static __inline void
1042 check_ring_tx_db(adapter_t *adap, struct sge_txq *q)
1043 {
1044 #if USE_GTS
1045 	clear_bit(TXQ_LAST_PKT_DB, &q->flags);
1046 	if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) {
1047 		set_bit(TXQ_LAST_PKT_DB, &q->flags);
1048 #ifdef T3_TRACE
1049 		T3_TRACE1(adap->tb[q->cntxt_id & 7], "doorbell Tx, cntxt %d",
1050 			  q->cntxt_id);
1051 #endif
1052 		t3_write_reg(adap, A_SG_KDOORBELL,
1053 			     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1054 	}
1055 #else
1056 	wmb();            /* write descriptors before telling HW */
1057 	t3_write_reg(adap, A_SG_KDOORBELL,
1058 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1059 #endif
1060 }
1061 
1062 static __inline void
1063 wr_gen2(struct tx_desc *d, unsigned int gen)
1064 {
1065 #if SGE_NUM_GENBITS == 2
1066 	d->flit[TX_DESC_FLITS - 1] = htobe64(gen);
1067 #endif
1068 }
1069 
1070 
1071 
1072 /**
1073  *	write_wr_hdr_sgl - write a WR header and, optionally, SGL
1074  *	@ndesc: number of Tx descriptors spanned by the SGL
1075  *	@txd: first Tx descriptor to be written
1076  *	@txqs: txq state (generation and producer index)
1077  *	@txq: the SGE Tx queue
1078  *	@sgl: the SGL
1079  *	@flits: number of flits to the start of the SGL in the first descriptor
1080  *	@sgl_flits: the SGL size in flits
1081  *	@wr_hi: top 32 bits of WR header based on WR type (big endian)
1082  *	@wr_lo: low 32 bits of WR header based on WR type (big endian)
1083  *
1084  *	Write a work request header and an associated SGL.  If the SGL is
1085  *	small enough to fit into one Tx descriptor it has already been written
1086  *	and we just need to write the WR header.  Otherwise we distribute the
1087  *	SGL across the number of descriptors it spans.
1088  */
1089 
1090 static void
1091 write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs,
1092     const struct sge_txq *txq, const struct sg_ent *sgl, unsigned int flits,
1093     unsigned int sgl_flits, unsigned int wr_hi, unsigned int wr_lo)
1094 {
1095 
1096 	struct work_request_hdr *wrp = (struct work_request_hdr *)txd;
1097 	struct tx_sw_desc *txsd = &txq->sdesc[txqs->pidx];
1098 
1099 	if (__predict_true(ndesc == 1)) {
1100 		wrp->wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1101 		    V_WR_SGLSFLT(flits)) | wr_hi;
1102 		wmb();
1103 		wrp->wr_lo = htonl(V_WR_LEN(flits + sgl_flits) |
1104 		    V_WR_GEN(txqs->gen)) | wr_lo;
1105 		/* XXX gen? */
1106 		wr_gen2(txd, txqs->gen);
1107 	} else {
1108 		unsigned int ogen = txqs->gen;
1109 		const uint64_t *fp = (const uint64_t *)sgl;
1110 		struct work_request_hdr *wp = wrp;
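		/*
		 * The SGL spans several descriptors.  Each continuation
		 * descriptor gets its own header, and the first WR's wr_lo is
		 * written last (with the original generation, ogen) so the SGE
		 * cannot start on a partially written multi-descriptor WR.
		 */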
1111 
1112 		wrp->wr_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) |
1113 		    V_WR_SGLSFLT(flits)) | wr_hi;
1114 
1115 		while (sgl_flits) {
1116 			unsigned int avail = WR_FLITS - flits;
1117 
1118 			if (avail > sgl_flits)
1119 				avail = sgl_flits;
1120 			memcpy(&txd->flit[flits], fp, avail * sizeof(*fp));
1121 			sgl_flits -= avail;
1122 			ndesc--;
1123 			if (!sgl_flits)
1124 				break;
1125 
1126 			fp += avail;
1127 			txd++;
1128 			txsd++;
1129 			if (++txqs->pidx == txq->size) {
1130 				txqs->pidx = 0;
1131 				txqs->gen ^= 1;
1132 				txd = txq->desc;
1133 				txsd = txq->sdesc;
1134 			}
1135 
1136 			/*
1137 			 * when the head of the mbuf chain
1138 			 * is freed all clusters will be freed
1139 			 * with it
1140 			 */
1141 			txsd->m = NULL;
1142 			wrp = (struct work_request_hdr *)txd;
1143 			wrp->wr_hi = htonl(V_WR_DATATYPE(1) |
1144 			    V_WR_SGLSFLT(1)) | wr_hi;
1145 			wrp->wr_lo = htonl(V_WR_LEN(min(WR_FLITS,
1146 				    sgl_flits + 1)) |
1147 			    V_WR_GEN(txqs->gen)) | wr_lo;
1148 			wr_gen2(txd, txqs->gen);
1149 			flits = 1;
1150 		}
1151 		wrp->wr_hi |= htonl(F_WR_EOP);
1152 		wmb();
1153 		wp->wr_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo;
1154 		wr_gen2((struct tx_desc *)wp, ogen);
1155 	}
1156 }
1157 
1158 
1159 /* sizeof(*eh) + sizeof(*vhdr) + sizeof(*ip) + sizeof(*tcp) */
1160 #define TCPPKTHDRSIZE (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + 20 + 20)
1161 
1162 int
1163 t3_encap(struct port_info *p, struct mbuf **m)
1164 {
1165 	adapter_t *sc;
1166 	struct mbuf *m0;
1167 	struct sge_qset *qs;
1168 	struct sge_txq *txq;
1169 	struct tx_sw_desc *stx;
1170 	struct txq_state txqs;
1171 	unsigned int nsegs, ndesc, flits, cntrl, mlen;
1172 	int err, tso_info = 0;
1173 
1174 	struct work_request_hdr *wrp;
1175 	struct tx_sw_desc *txsd;
1176 	struct sg_ent *sgp, sgl[TX_MAX_SEGS / 2 + 1];
1177 	bus_dma_segment_t segs[TX_MAX_SEGS];
1178 	uint32_t wr_hi, wr_lo, sgl_flits;
1179 
1180 	struct tx_desc *txd;
1181 	struct cpl_tx_pkt *cpl;
1182 
1183 	DPRINTF("t3_encap ");
1184 	m0 = *m;
1185 	sc = p->adapter;
1186 	qs = &sc->sge.qs[p->first_qset];
1187 	txq = &qs->txq[TXQ_ETH];
1188 	stx = &txq->sdesc[txq->pidx];
1189 	txd = &txq->desc[txq->pidx];
1190 	cpl = (struct cpl_tx_pkt *)txd;
1191 	mlen = m0->m_pkthdr.len;
1192 	cpl->len = htonl(mlen | 0x80000000);
1193 
1194 	DPRINTF("mlen=%d\n", mlen);
1195 	/*
1196 	 * XXX handle checksum, TSO, and VLAN here
1197 	 *
1198 	 */
1199 	cntrl = V_TXPKT_INTF(p->port);
1200 
1201 	/*
1202 	 * XXX need to add VLAN support for 6.x
1203 	 */
1204 #ifdef VLAN_SUPPORTED
1205 	if (m0->m_flags & M_VLANTAG)
1206 		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
1207 	if  (m0->m_pkthdr.csum_flags & (CSUM_TSO))
1208 		tso_info = V_LSO_MSS(m0->m_pkthdr.tso_segsz);
1209 #endif
1210 	if (tso_info) {
1211 		int eth_type;
1212 		struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *) cpl;
1213 		struct ip *ip;
1214 		struct tcphdr *tcp;
1215 		uint8_t *pkthdr, tmp[TCPPKTHDRSIZE]; /* is this too large for the stack? */
1216 
1217 		txd->flit[2] = 0;
1218 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO);
1219 		hdr->cntrl = htonl(cntrl);
1220 
1221 		if (__predict_false(m0->m_len < TCPPKTHDRSIZE)) {
1222 			pkthdr = &tmp[0];
1223 			m_copydata(m0, 0, TCPPKTHDRSIZE, pkthdr);
1224 		} else {
1225 			pkthdr = mtod(m0, uint8_t *);
1226 		}
1227 
1228 		if (__predict_false(m0->m_flags & M_VLANTAG)) {
1229 			eth_type = CPL_ETH_II_VLAN;
1230 			ip = (struct ip *)(pkthdr + ETHER_HDR_LEN +
1231 			    ETHER_VLAN_ENCAP_LEN);
1232 		} else {
1233 			eth_type = CPL_ETH_II;
1234 			ip = (struct ip *)(pkthdr + ETHER_HDR_LEN);
1235 		}
1236 		tcp = (struct tcphdr *)((uint8_t *)ip +
1237 		    sizeof(*ip));
1238 
1239 		tso_info |= V_LSO_ETH_TYPE(eth_type) |
1240 			    V_LSO_IPHDR_WORDS(ip->ip_hl) |
1241 			    V_LSO_TCPHDR_WORDS(tcp->th_off);
1242 		hdr->lso_info = htonl(tso_info);
1243 		flits = 3;
1244 	} else {
1245 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1246 		cpl->cntrl = htonl(cntrl);
1247 
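		/*
		 * Small packets are copied into the descriptor as immediate
		 * data, avoiding a DMA mapping entirely.
		 */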
1248 		if (mlen <= WR_LEN - sizeof(*cpl)) {
1249 			txq_prod(txq, 1, &txqs);
1250 			txq->sdesc[txqs.pidx].m = m0;
1251 			m_set_priority(m0, txqs.pidx);
1252 
1253 			if (m0->m_len == m0->m_pkthdr.len)
1254 				memcpy(&txd->flit[2], mtod(m0, uint8_t *), mlen);
1255 			else
1256 				m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[2]);
1257 
1258 			flits = (mlen + 7) / 8 + 2;
1259 			cpl->wr.wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1260 					  V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1261 					  F_WR_SOP | F_WR_EOP | txqs.compl);
1262 			wmb();
1263 			cpl->wr.wr_lo = htonl(V_WR_LEN(flits) |
1264 			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1265 
1266 			wr_gen2(txd, txqs.gen);
1267 			check_ring_tx_db(sc, txq);
1268 			return (0);
1269 		}
1270 		flits = 2;
1271 	}
1272 
1273 	wrp = (struct work_request_hdr *)txd;
1274 
1275 	if ((err = busdma_map_mbufs(m, txq, stx, segs, &nsegs)) != 0) {
1276 		return (err);
1277 	}
1278 	m0 = *m;
1279 	ndesc = calc_tx_descs(m0, nsegs);
1280 
1281 	sgp = (ndesc == 1) ? (struct sg_ent *)&txd->flit[flits] : sgl;
1282 	make_sgl(sgp, segs, nsegs);
1283 
1284 	sgl_flits = sgl_len(nsegs);
1285 
1286 	DPRINTF("make_sgl success nsegs==%d ndesc==%d\n", nsegs, ndesc);
1287 	txq_prod(txq, ndesc, &txqs);
1288 	txsd = &txq->sdesc[txqs.pidx];
1289 	wr_hi = htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1290 	wr_lo = htonl(V_WR_TID(txq->token));
1291 	txsd->m = m0;
1292 	m_set_priority(m0, txqs.pidx);
1293 
1294 	write_wr_hdr_sgl(ndesc, txd, &txqs, txq, sgl, flits, sgl_flits, wr_hi, wr_lo);
1295 	check_ring_tx_db(p->adapter, txq);
1296 
1297 	return (0);
1298 }
1299 
1300 
1301 /**
1302  *	write_imm - write a packet into a Tx descriptor as immediate data
1303  *	@d: the Tx descriptor to write
1304  *	@m: the packet
1305  *	@len: the length of packet data to write as immediate data
1306  *	@gen: the generation bit value to write
1307  *
1308  *	Writes a packet as immediate data into a Tx descriptor.  The packet
1309  *	contains a work request at its beginning.  We must write the packet
1310  *	carefully so the SGE doesn't accidentally read it before it has been
1311  *	written in its entirety.
1312  */
1313 static __inline void
1314 write_imm(struct tx_desc *d, struct mbuf *m,
1315 	  unsigned int len, unsigned int gen)
1316 {
1317 	struct work_request_hdr *from = mtod(m, struct work_request_hdr *);
1318 	struct work_request_hdr *to = (struct work_request_hdr *)d;
1319 
1320 	memcpy(&to[1], &from[1], len - sizeof(*from));
1321 	to->wr_hi = from->wr_hi | htonl(F_WR_SOP | F_WR_EOP |
1322 					V_WR_BCNTLFLT(len & 7));
1323 	wmb();
1324 	to->wr_lo = from->wr_lo | htonl(V_WR_GEN(gen) |
1325 					V_WR_LEN((len + 7) / 8));
1326 	wr_gen2(d, gen);
1327 	m_freem(m);
1328 }
1329 
1330 /**
1331  *	check_desc_avail - check descriptor availability on a send queue
1332  *	@adap: the adapter
1333  *	@q: the TX queue
1334  *	@m: the packet needing the descriptors
1335  *	@ndesc: the number of Tx descriptors needed
1336  *	@qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL)
1337  *
1338  *	Checks if the requested number of Tx descriptors is available on an
1339  *	SGE send queue.  If the queue is already suspended or not enough
1340  *	descriptors are available the packet is queued for later transmission.
1341  *	Must be called with the Tx queue locked.
1342  *
1343  *	Returns 0 if enough descriptors are available, 1 if there aren't
1344  *	enough descriptors and the packet has been queued, and 2 if the caller
1345  *	needs to retry because there weren't enough descriptors at the
1346  *	beginning of the call but some freed up in the mean time.
1347  */
1348 static __inline int
1349 check_desc_avail(adapter_t *adap, struct sge_txq *q,
1350 		 struct mbuf *m, unsigned int ndesc,
1351 		 unsigned int qid)
1352 {
1353 	/*
1354 	 * XXX We currently only use this for checking the control queue; the
1355 	 * control queue is only used for binding qsets, which happens at init
1356 	 * time, so we are guaranteed enough descriptors.
1357 	 */
1358 	if (__predict_false(!mbufq_empty(&q->sendq))) {
1359 addq_exit:	mbufq_tail(&q->sendq, m);
1360 		return 1;
1361 	}
1362 	if (__predict_false(q->size - q->in_use < ndesc)) {
1363 
1364 		struct sge_qset *qs = txq_to_qset(q, qid);
1365 
1366 		setbit(&qs->txq_stopped, qid);
1367 		smp_mb();
1368 
1369 		if (should_restart_tx(q) &&
1370 		    test_and_clear_bit(qid, &qs->txq_stopped))
1371 			return 2;
1372 
1373 		q->stops++;
1374 		goto addq_exit;
1375 	}
1376 	return 0;
1377 }
1378 
1379 
1380 /**
1381  *	reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
1382  *	@q: the SGE control Tx queue
1383  *
1384  *	This is a variant of reclaim_completed_tx() that is used for Tx queues
1385  *	that send only immediate data (presently just the control queues) and
1386  *	thus do not have any mbufs
1387  */
1388 static __inline void
1389 reclaim_completed_tx_imm(struct sge_txq *q)
1390 {
1391 	unsigned int reclaim = q->processed - q->cleaned;
1392 
1393 	mtx_assert(&q->lock, MA_OWNED);
1394 
1395 	q->in_use -= reclaim;
1396 	q->cleaned += reclaim;
1397 }
1398 
1399 static __inline int
1400 immediate(const struct mbuf *m)
1401 {
1402 	return m->m_len <= WR_LEN && m->m_pkthdr.len <= WR_LEN;
1403 }
1404 
1405 /**
1406  *	ctrl_xmit - send a packet through an SGE control Tx queue
1407  *	@adap: the adapter
1408  *	@q: the control queue
1409  *	@m: the packet
1410  *
1411  *	Send a packet through an SGE control Tx queue.  Packets sent through
1412  *	a control queue must fit entirely as immediate data in a single Tx
1413  *	descriptor and have no page fragments.
1414  */
1415 static int
1416 ctrl_xmit(adapter_t *adap, struct sge_txq *q, struct mbuf *m)
1417 {
1418 	int ret;
1419 	struct work_request_hdr *wrp = mtod(m, struct work_request_hdr *);
1420 
1421 	if (__predict_false(!immediate(m))) {
1422 		m_freem(m);
1423 		return 0;
1424 	}
1425 
1426 	wrp->wr_hi |= htonl(F_WR_SOP | F_WR_EOP);
1427 	wrp->wr_lo = htonl(V_WR_TID(q->token));
1428 
1429 	mtx_lock(&q->lock);
1430 again:	reclaim_completed_tx_imm(q);
1431 
1432 	ret = check_desc_avail(adap, q, m, 1, TXQ_CTRL);
1433 	if (__predict_false(ret)) {
1434 		if (ret == 1) {
1435 			mtx_unlock(&q->lock);
1436 			return (-1);
1437 		}
1438 		goto again;
1439 	}
1440 
1441 	write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
1442 
1443 	q->in_use++;
1444 	if (++q->pidx >= q->size) {
1445 		q->pidx = 0;
1446 		q->gen ^= 1;
1447 	}
1448 	mtx_unlock(&q->lock);
1449 	wmb();
1450 	t3_write_reg(adap, A_SG_KDOORBELL,
1451 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1452 	return (0);
1453 }
1454 
1455 
1456 /**
1457  *	restart_ctrlq - restart a suspended control queue
1458  *	@qs: the queue set containing the control queue
1459  *
1460  *	Resumes transmission on a suspended Tx control queue.
1461  */
1462 static void
1463 restart_ctrlq(void *data, int npending)
1464 {
1465 	struct mbuf *m;
1466 	struct sge_qset *qs = (struct sge_qset *)data;
1467 	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1468 	adapter_t *adap = qs->port->adapter;
1469 
1470 	mtx_lock(&q->lock);
1471 again:	reclaim_completed_tx_imm(q);
1472 
1473 	while (q->in_use < q->size &&
1474 	       (m = mbufq_dequeue(&q->sendq)) != NULL) {
1475 
1476 		write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
1477 
1478 		if (++q->pidx >= q->size) {
1479 			q->pidx = 0;
1480 			q->gen ^= 1;
1481 		}
1482 		q->in_use++;
1483 	}
1484 	if (!mbufq_empty(&q->sendq)) {
1485 		setbit(&qs->txq_stopped, TXQ_CTRL);
1486 		smp_mb();
1487 
1488 		if (should_restart_tx(q) &&
1489 		    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped))
1490 			goto again;
1491 		q->stops++;
1492 	}
1493 	mtx_unlock(&q->lock);
1494 	t3_write_reg(adap, A_SG_KDOORBELL,
1495 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1496 }
1497 
1498 
1499 /*
1500  * Send a management message through control queue 0
1501  */
1502 int
1503 t3_mgmt_tx(struct adapter *adap, struct mbuf *m)
1504 {
1505 	return ctrl_xmit(adap, &adap->sge.qs[0].txq[TXQ_CTRL], m);
1506 }
1507 
1508 /**
1509  *	free_qset - free the resources of an SGE queue set
1510  *	@sc: the controller owning the queue set
1511  *	@q: the queue set
1512  *
1513  *	Release the HW and SW resources associated with an SGE queue set, such
1514  *	as HW contexts, packet buffers, and descriptor rings.  Traffic to the
1515  *	queue set must be quiesced prior to calling this.
1516  */
1517 static void
1518 t3_free_qset(adapter_t *sc, struct sge_qset *q)
1519 {
1520 	int i;
1521 
1522 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
1523 		if (q->fl[i].desc) {
1524 			mtx_lock(&sc->sge.reg_lock);
1525 			t3_sge_disable_fl(sc, q->fl[i].cntxt_id);
1526 			mtx_unlock(&sc->sge.reg_lock);
1527 			bus_dmamap_unload(q->fl[i].desc_tag, q->fl[i].desc_map);
1528 			bus_dmamem_free(q->fl[i].desc_tag, q->fl[i].desc,
1529 					q->fl[i].desc_map);
1530 			bus_dma_tag_destroy(q->fl[i].desc_tag);
1531 			bus_dma_tag_destroy(q->fl[i].entry_tag);
1532 		}
1533 		if (q->fl[i].sdesc) {
1534 			free_rx_bufs(sc, &q->fl[i]);
1535 			free(q->fl[i].sdesc, M_DEVBUF);
1536 		}
1537 	}
1538 
1539 	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
1540 		if (q->txq[i].desc) {
1541 			mtx_lock(&sc->sge.reg_lock);
1542 			t3_sge_enable_ecntxt(sc, q->txq[i].cntxt_id, 0);
1543 			mtx_unlock(&sc->sge.reg_lock);
1544 			bus_dmamap_unload(q->txq[i].desc_tag,
1545 					q->txq[i].desc_map);
1546 			bus_dmamem_free(q->txq[i].desc_tag, q->txq[i].desc,
1547 					q->txq[i].desc_map);
1548 			bus_dma_tag_destroy(q->txq[i].desc_tag);
1549 			bus_dma_tag_destroy(q->txq[i].entry_tag);
1550 			MTX_DESTROY(&q->txq[i].lock);
1551 		}
1552 		if (q->txq[i].sdesc) {
1553 			free(q->txq[i].sdesc, M_DEVBUF);
1554 		}
1555 	}
1556 
1557 	if (q->rspq.desc) {
1558 		mtx_lock(&sc->sge.reg_lock);
1559 		t3_sge_disable_rspcntxt(sc, q->rspq.cntxt_id);
1560 		mtx_unlock(&sc->sge.reg_lock);
1561 
1562 		bus_dmamap_unload(q->rspq.desc_tag, q->rspq.desc_map);
1563 		bus_dmamem_free(q->rspq.desc_tag, q->rspq.desc,
1564 			        q->rspq.desc_map);
1565 		bus_dma_tag_destroy(q->rspq.desc_tag);
1566 		MTX_DESTROY(&q->rspq.lock);
1567 	}
1568 
1569 	bzero(q, sizeof(*q));
1570 }
1571 
1572 /**
1573  *	t3_free_sge_resources - free SGE resources
1574  *	@sc: the adapter softc
1575  *
1576  *	Frees resources used by the SGE queue sets.
1577  */
1578 void
1579 t3_free_sge_resources(adapter_t *sc)
1580 {
1581 	int i, nqsets;
1582 
1583 	for (nqsets = i = 0; i < (sc)->params.nports; i++)
1584 		nqsets += sc->port[i].nqsets;
1585 
1586 	for (i = 0; i < nqsets; ++i)
1587 		t3_free_qset(sc, &sc->sge.qs[i]);
1588 }
1589 
1590 /**
1591  *	t3_sge_start - enable SGE
1592  *	@sc: the controller softc
1593  *
1594  *	Enables the SGE for DMAs.  This is the last step in starting packet
1595  *	transfers.
1596  */
1597 void
1598 t3_sge_start(adapter_t *sc)
1599 {
1600 	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE);
1601 }
1602 
1603 /**
1604  *	t3_sge_stop - disable SGE operation
1605  *	@sc: the adapter
1606  *
1607  *	Disables the DMA engine.  This can be called in emergencies (e.g.,
1608  *	from error interrupts) or from normal process context.  In the latter
1609  *	case it also disables any pending queue restart tasks.  Note that
1610  *	if it is called in interrupt context it cannot disable the restart
1611  *	tasks, as it cannot wait; however, the tasks will have no effect
1612  *	since the doorbells are disabled.  The driver will call this again
1613  *	later from process context, at which time the tasks will be stopped
1614  *	if they are still running.
1615  */
1616 void
1617 t3_sge_stop(adapter_t *sc)
1618 {
1619 	int i, nqsets;
1620 
1621 	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, 0);
1622 
1623 	if (sc->tq == NULL)
1624 		return;
1625 
1626 	for (nqsets = i = 0; i < (sc)->params.nports; i++)
1627 		nqsets += sc->port[i].nqsets;
1628 
1629 	for (i = 0; i < nqsets; ++i) {
1630 		struct sge_qset *qs = &sc->sge.qs[i];
1631 
1632 		taskqueue_drain(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
1633 		taskqueue_drain(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
1634 	}
1635 }
1636 
1637 
1638 /**
1639  *	free_tx_desc - reclaims Tx descriptors and their buffers
1640  *	@q: the Tx queue to reclaim descriptors from
1641  *	@n: the number of descriptors to reclaim
1642  *	@m_vec: vector in which the unmapped mbufs are returned
1643  *
1644  *	Reclaims Tx descriptors from an SGE Tx queue and returns the associated
1645  *	mbufs in @m_vec for the caller to free.  Called with the Tx queue lock held.
1646  */
1647 int
1648 free_tx_desc(struct sge_txq *q, int n, struct mbuf **m_vec)
1649 {
1650 	struct tx_sw_desc *d;
1651 	unsigned int cidx = q->cidx;
1652 	int nbufs = 0;
1653 
1654 #ifdef T3_TRACE
1655 	T3_TRACE2(sc->tb[q->cntxt_id & 7],
1656 		  "reclaiming %u Tx descriptors at cidx %u", n, cidx);
1657 #endif
1658 	d = &q->sdesc[cidx];
1659 
1660 	while (n-- > 0) {
1661 		DPRINTF("cidx=%d d=%p\n", cidx, d);
1662 		if (d->m) {
1663 			if (d->flags & TX_SW_DESC_MAPPED) {
1664 				bus_dmamap_unload(q->entry_tag, d->map);
1665 				bus_dmamap_destroy(q->entry_tag, d->map);
1666 				d->flags &= ~TX_SW_DESC_MAPPED;
1667 			}
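			/*
			 * The priority field was set to the producer index
			 * when the mbuf was queued (see t3_encap); only return
			 * it for freeing at the matching consumer index.
			 */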
1668 			if (m_get_priority(d->m) == cidx) {
1669 				m_vec[nbufs] = d->m;
1670 				d->m = NULL;
1671 				nbufs++;
1672 			} else {
1673 				printf("pri=%d cidx=%d\n", (int)m_get_priority(d->m), cidx);
1674 			}
1675 		}
1676 		++d;
1677 		if (++cidx == q->size) {
1678 			cidx = 0;
1679 			d = q->sdesc;
1680 		}
1681 	}
1682 	q->cidx = cidx;
1683 
1684 	return (nbufs);
1685 }
1686 
1687 /**
1688  *	is_new_response - check if a response is newly written
1689  *	@r: the response descriptor
1690  *	@q: the response queue
1691  *
1692  *	Returns true if a response descriptor contains a yet unprocessed
1693  *	response.
1694  */
1695 static __inline int
1696 is_new_response(const struct rsp_desc *r,
1697     const struct sge_rspq *q)
1698 {
1699 	return (r->intr_gen & F_RSPD_GEN2) == q->gen;
1700 }
1701 
1702 #define RSPD_GTS_MASK  (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS)
1703 #define RSPD_CTRL_MASK (RSPD_GTS_MASK | \
1704 			V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \
1705 			V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \
1706 			V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR))
1707 
1708 /* How long to delay the next interrupt in case of memory shortage, in 0.1us. */
1709 #define NOMEM_INTR_DELAY 2500
1710 
1711 /**
1712  *	write_ofld_wr - write an offload work request
1713  *	@adap: the adapter
1714  *	@m: the packet to send
1715  *	@q: the Tx queue
1716  *	@pidx: index of the first Tx descriptor to write
1717  *	@gen: the generation value to use
1718  *	@ndesc: number of descriptors the packet will occupy
1719  *
1720  *	Write an offload work request to send the supplied packet.  The packet
1721  *	data already carry the work request with most fields populated.
1722  */
1723 static void
1724 write_ofld_wr(adapter_t *adap, struct mbuf *m,
1725     struct sge_txq *q, unsigned int pidx,
1726     unsigned int gen, unsigned int ndesc,
1727     bus_dma_segment_t *segs, unsigned int nsegs)
1728 {
1729 	unsigned int sgl_flits, flits;
1730 	struct work_request_hdr *from;
1731 	struct sg_ent *sgp, sgl[TX_MAX_SEGS / 2 + 1];
1732 	struct tx_desc *d = &q->desc[pidx];
1733 	struct txq_state txqs;
1734 
1735 	if (immediate(m)) {
1736 		q->sdesc[pidx].m = NULL;
1737 		write_imm(d, m, m->m_len, gen);
1738 		return;
1739 	}
1740 
1741 	/* Only TX_DATA builds SGLs */
1742 
1743 	from = mtod(m, struct work_request_hdr *);
1744 	memcpy(&d->flit[1], &from[1],
1745 	    (uint8_t *)m->m_pkthdr.header - mtod(m, uint8_t *) - sizeof(*from));
1746 
1747 	flits = ((uint8_t *)m->m_pkthdr.header - mtod(m, uint8_t *)) / 8;
1748 	sgp = (ndesc == 1) ? (struct sg_ent *)&d->flit[flits] : sgl;
1749 
1750 	make_sgl(sgp, segs, nsegs);
1751 	sgl_flits = sgl_len(nsegs);
1752 
1753 	txqs.gen = q->gen;
1754 	txqs.pidx = q->pidx;
1755 	txqs.compl = (q->unacked & 8) << (S_WR_COMPL - 3);
1756 	write_wr_hdr_sgl(ndesc, d, &txqs, q, sgl, flits, sgl_flits,
1757 	    from->wr_hi, from->wr_lo);
1758 }
1759 
1760 /**
1761  *	calc_tx_descs_ofld - calculate # of Tx descriptors for an offload packet
1762  *	@m: the packet
1763  *
1764  * 	Returns the number of Tx descriptors needed for the given offload
1765  * 	packet.  These packets are already fully constructed.
1766  */
1767 static __inline unsigned int
1768 calc_tx_descs_ofld(struct mbuf *m, unsigned int nsegs)
1769 {
1770 	unsigned int flits, cnt = 0;
1771 
1772 
1773 	if (m->m_len <= WR_LEN)
1774 		return 1;                 /* packet fits as immediate data */
1775 
1776 	if (m->m_flags & M_IOVEC)
1777 		cnt = mtomv(m)->mv_count;
1778 
1779 	flits = ((uint8_t *)m->m_pkthdr.header - mtod(m, uint8_t *)) / 8;   /* headers */
1780 
1781 	return flits_to_desc(flits + sgl_len(cnt));
1782 }
1783 
1784 /**
1785  *	ofld_xmit - send a packet through an offload queue
1786  *	@adap: the adapter
1787  *	@q: the Tx offload queue
1788  *	@m: the packet
1789  *
1790  *	Send an offload packet through an SGE offload queue.
1791  */
1792 static int
1793 ofld_xmit(adapter_t *adap, struct sge_txq *q, struct mbuf *m)
1794 {
1795 	int ret;
1796 	unsigned int pidx, gen, nsegs;
1797 	unsigned int ndesc;
1798 	struct mbuf *m_vec[TX_CLEAN_MAX_DESC];
1799 	bus_dma_segment_t segs[TX_MAX_SEGS];
1800 	int i, cleaned;
1801 	struct tx_sw_desc *stx = &q->sdesc[q->pidx];
1802 
1803 	mtx_lock(&q->lock);
1804 	if ((ret = busdma_map_mbufs(&m, q, stx, segs, &nsegs)) != 0) {
1805 		mtx_unlock(&q->lock);
1806 		return (ret);
1807 	}
1808 	ndesc = calc_tx_descs_ofld(m, nsegs);
1809 again:	cleaned = reclaim_completed_tx(q, TX_CLEAN_MAX_DESC, m_vec);
1810 
1811 	ret = check_desc_avail(adap, q, m, ndesc, TXQ_OFLD);
1812 	if (__predict_false(ret)) {
1813 		if (ret == 1) {
1814 			m_set_priority(m, ndesc);     /* save for restart */
1815 			mtx_unlock(&q->lock);
1816 			return NET_XMIT_CN;
1817 		}
1818 		goto again;
1819 	}
1820 
1821 	gen = q->gen;
1822 	q->in_use += ndesc;
1823 	pidx = q->pidx;
1824 	q->pidx += ndesc;
1825 	if (q->pidx >= q->size) {
1826 		q->pidx -= q->size;
1827 		q->gen ^= 1;
1828 	}
1829 #ifdef T3_TRACE
1830 	T3_TRACE5(adap->tb[q->cntxt_id & 7],
1831 		  "ofld_xmit: ndesc %u, pidx %u, len %u, main %u, frags %u",
1832 		  ndesc, pidx, skb->len, skb->len - skb->data_len,
1833 		  skb_shinfo(skb)->nr_frags);
1834 #endif
1835 	mtx_unlock(&q->lock);
1836 
1837 	write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
1838 	check_ring_tx_db(adap, q);
1839 
1840 	for (i = 0; i < cleaned; i++) {
1841 		m_freem_vec(m_vec[i]);
1842 	}
1843 	return NET_XMIT_SUCCESS;
1844 }
1845 
1846 /**
1847  *	restart_offloadq - restart a suspended offload queue
1848  *	@qs: the queue set containing the offload queue
1849  *
1850  *	Resumes transmission on a suspended Tx offload queue.
1851  */
1852 static void
1853 restart_offloadq(void *data, int npending)
1854 {
1855 
1856 	struct mbuf *m;
1857 	struct sge_qset *qs = data;
1858 	struct sge_txq *q = &qs->txq[TXQ_OFLD];
1859 	adapter_t *adap = qs->port->adapter;
1860 	struct mbuf *m_vec[TX_CLEAN_MAX_DESC];
1861 	bus_dma_segment_t segs[TX_MAX_SEGS];
1862 	int nsegs, i, cleaned;
1863 	struct tx_sw_desc *stx = &q->sdesc[q->pidx];
1864 
1865 	mtx_lock(&q->lock);
1866 again:	cleaned = reclaim_completed_tx(q, TX_CLEAN_MAX_DESC, m_vec);
1867 
1868 	while ((m = mbufq_peek(&q->sendq)) != NULL) {
1869 		unsigned int gen, pidx;
1870 		unsigned int ndesc = m_get_priority(m);
1871 
1872 		if (__predict_false(q->size - q->in_use < ndesc)) {
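			/*
			 * Out of descriptors: mark the queue stopped, then
			 * re-check after the barrier so a racing reclaim
			 * cannot leave the queue stalled.
			 */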
1873 			setbit(&qs->txq_stopped, TXQ_OFLD);
1874 			smp_mb();
1875 
1876 			if (should_restart_tx(q) &&
1877 			    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped))
1878 				goto again;
1879 			q->stops++;
1880 			break;
1881 		}
1882 
1883 		gen = q->gen;
1884 		q->in_use += ndesc;
1885 		pidx = q->pidx;
1886 		q->pidx += ndesc;
1887 		if (q->pidx >= q->size) {
1888 			q->pidx -= q->size;
1889 			q->gen ^= 1;
1890 		}
1891 
1892 		(void)mbufq_dequeue(&q->sendq);
1893 		busdma_map_mbufs(&m, q, stx, segs, &nsegs);
1894 		mtx_unlock(&q->lock);
1895 		write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
1896 		mtx_lock(&q->lock);
1897 	}
1898 	mtx_unlock(&q->lock);
1899 
1900 #if USE_GTS
1901 	set_bit(TXQ_RUNNING, &q->flags);
1902 	set_bit(TXQ_LAST_PKT_DB, &q->flags);
1903 #endif
1904 	t3_write_reg(adap, A_SG_KDOORBELL,
1905 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1906 
1907 	for (i = 0; i < cleaned; i++) {
1908 		m_freem_vec(m_vec[i]);
1909 	}
1910 }
1911 
1912 /**
1913  *	queue_set - return the queue set a packet should use
1914  *	@m: the packet
1915  *
1916  *	Maps a packet to the SGE queue set it should use.  The desired queue
1917  *	set is carried in bits 1-3 in the packet's priority.
1918  */
1919 static __inline int
1920 queue_set(const struct mbuf *m)
1921 {
1922 	return m_get_priority(m) >> 1;
1923 }
1924 
1925 /**
1926  *	is_ctrl_pkt - return whether an offload packet is a control packet
1927  *	@m: the packet
1928  *
1929  *	Determines whether an offload packet should use an OFLD or a CTRL
1930  *	Tx queue.  This is indicated by bit 0 in the packet's priority.
1931  */
1932 static __inline int
1933 is_ctrl_pkt(const struct mbuf *m)
1934 {
1935 	return m_get_priority(m) & 1;
1936 }
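
/*
 * Taken together, the two helpers above decode the packet priority as
 * (queue_set << 1) | is_ctrl; e.g. priority 5 selects queue set 2 and
 * the control queue.
 */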
1937 
1938 /**
1939  *	t3_offload_tx - send an offload packet
1940  *	@tdev: the offload device to send to
1941  *	@m: the packet
1942  *
1943  *	Sends an offload packet.  We use the packet priority to select the
1944  *	appropriate Tx queue as follows: bit 0 indicates whether the packet
1945  *	should be sent as regular or control, bits 1-3 select the queue set.
1946  */
1947 int
1948 t3_offload_tx(struct toedev *tdev, struct mbuf *m)
1949 {
1950 	adapter_t *adap = tdev2adap(tdev);
1951 	struct sge_qset *qs = &adap->sge.qs[queue_set(m)];
1952 
1953 	if (__predict_false(is_ctrl_pkt(m)))
1954 		return ctrl_xmit(adap, &qs->txq[TXQ_CTRL], m);
1955 
1956 	return ofld_xmit(adap, &qs->txq[TXQ_OFLD], m);
1957 }
1958 
1959 /**
1960  *	deliver_partial_bundle - deliver a (partial) bundle of Rx offload pkts
1961  *	@tdev: the offload device that will be receiving the packets
1962  *	@q: the SGE response queue that assembled the bundle
1963  *	@m: the partial bundle
1964  *	@n: the number of packets in the bundle
1965  *
1966  *	Delivers a (partial) bundle of Rx offload packets to an offload device.
1967  */
1968 static __inline void
1969 deliver_partial_bundle(struct toedev *tdev,
1970 			struct sge_rspq *q,
1971 			struct mbuf *mbufs[], int n)
1972 {
1973 	if (n) {
1974 		q->offload_bundles++;
1975 		cxgb_ofld_recv(tdev, mbufs, n);
1976 	}
1977 }
1978 
1979 static __inline int
1980 rx_offload(struct toedev *tdev, struct sge_rspq *rq,
1981     struct mbuf *m, struct mbuf *rx_gather[],
1982     unsigned int gather_idx)
1983 {
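	/*
	 * Offload packets are gathered into bundles of up to RX_BUNDLE_SIZE
	 * mbufs before being handed to the offload device, amortizing the
	 * cost of cxgb_ofld_recv().
	 */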
1984 	rq->offload_pkts++;
1985 	m->m_pkthdr.header = mtod(m, void *);
1986 
1987 	rx_gather[gather_idx++] = m;
1988 	if (gather_idx == RX_BUNDLE_SIZE) {
1989 		cxgb_ofld_recv(tdev, rx_gather, RX_BUNDLE_SIZE);
1990 		gather_idx = 0;
1991 		rq->offload_bundles++;
1992 	}
1993 	return (gather_idx);
1994 }
1995 
1996 static void
1997 restart_tx(struct sge_qset *qs)
1998 {
1999 	struct adapter *sc = qs->port->adapter;
2000 
2001 	if (isset(&qs->txq_stopped, TXQ_OFLD) &&
2002 	    should_restart_tx(&qs->txq[TXQ_OFLD]) &&
2003 	    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) {
2004 		qs->txq[TXQ_OFLD].restarts++;
2005 		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2006 	}
2007 	if (isset(&qs->txq_stopped, TXQ_CTRL) &&
2008 	    should_restart_tx(&qs->txq[TXQ_CTRL]) &&
2009 	    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) {
2010 		qs->txq[TXQ_CTRL].restarts++;
2011 		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2012 	}
2013 }
2014 
2015 /**
2016  *	t3_sge_alloc_qset - initialize an SGE queue set
2017  *	@sc: the controller softc
2018  *	@id: the queue set id
2019  *	@nports: how many Ethernet ports will be using this queue set
2020  *	@irq_vec_idx: the IRQ vector index for response queue interrupts
2021  *	@p: configuration parameters for this queue set
2022  *	@ntxq: number of Tx queues for the queue set
2023  *	@pi: port info for queue set
2024  *
2025  *	Allocate resources and initialize an SGE queue set.  A queue set
2026  *	comprises a response queue, two Rx free-buffer queues, and up to 3
2027  *	Tx queues.  The Tx queues are assigned roles in the order Ethernet
2028  *	queue, offload queue, and control queue.
2029  */
2030 int
2031 t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx,
2032 		  const struct qset_params *p, int ntxq, struct port_info *pi)
2033 {
2034 	struct sge_qset *q = &sc->sge.qs[id];
2035 	int i, ret = 0;
2036 
2037 	init_qset_cntxt(q, id);
2038 
2039 	if ((ret = alloc_ring(sc, p->fl_size, sizeof(struct rx_desc),
2040 		    sizeof(struct rx_sw_desc), &q->fl[0].phys_addr,
2041 		    &q->fl[0].desc, &q->fl[0].sdesc,
2042 		    &q->fl[0].desc_tag, &q->fl[0].desc_map,
2043 		    sc->rx_dmat, &q->fl[0].entry_tag)) != 0) {
2044 		printf("error %d from alloc ring fl0\n", ret);
2045 		goto err;
2046 	}
2047 
2048 	if ((ret = alloc_ring(sc, p->jumbo_size, sizeof(struct rx_desc),
2049 		    sizeof(struct rx_sw_desc), &q->fl[1].phys_addr,
2050 		    &q->fl[1].desc, &q->fl[1].sdesc,
2051 		    &q->fl[1].desc_tag, &q->fl[1].desc_map,
2052 		    sc->rx_jumbo_dmat, &q->fl[1].entry_tag)) != 0) {
2053 		printf("error %d from alloc ring fl1\n", ret);
2054 		goto err;
2055 	}
2056 
2057 	if ((ret = alloc_ring(sc, p->rspq_size, sizeof(struct rsp_desc), 0,
2058 		    &q->rspq.phys_addr, &q->rspq.desc, NULL,
2059 		    &q->rspq.desc_tag, &q->rspq.desc_map,
2060 		    NULL, NULL)) != 0) {
2061 		printf("error %d from alloc ring rspq\n", ret);
2062 		goto err;
2063 	}
2064 
2065 	for (i = 0; i < ntxq; ++i) {
2066 		/*
2067 		 * The control queue always uses immediate data so does not
2068 		 * need to keep track of any mbufs.
2069 		 * XXX Placeholder for future TOE support.
2070 		 */
2071 		size_t sz = i == TXQ_CTRL ? 0 : sizeof(struct tx_sw_desc);
2072 
2073 		if ((ret = alloc_ring(sc, p->txq_size[i],
2074 			    sizeof(struct tx_desc), sz,
2075 			    &q->txq[i].phys_addr, &q->txq[i].desc,
2076 			    &q->txq[i].sdesc, &q->txq[i].desc_tag,
2077 			    &q->txq[i].desc_map,
2078 			    sc->tx_dmat, &q->txq[i].entry_tag)) != 0) {
2079 			printf("error %d from alloc ring tx %i\n", ret, i);
2080 			goto err;
2081 		}
2082 		mbufq_init(&q->txq[i].sendq);
2083 		q->txq[i].gen = 1;
2084 		q->txq[i].size = p->txq_size[i];
2085 		snprintf(q->txq[i].lockbuf, TXQ_NAME_LEN, "t3 txq lock %d:%d:%d",
2086 		    device_get_unit(sc->dev), irq_vec_idx, i);
2087 		MTX_INIT(&q->txq[i].lock, q->txq[i].lockbuf, NULL, MTX_DEF);
2088 	}
2089 
2090 	q->txq[TXQ_ETH].port = pi;
2091 
2092 	TASK_INIT(&q->txq[TXQ_OFLD].qresume_task, 0, restart_offloadq, q);
2093 	TASK_INIT(&q->txq[TXQ_CTRL].qresume_task, 0, restart_ctrlq, q);
2094 	TASK_INIT(&q->txq[TXQ_ETH].qreclaim_task, 0, sge_txq_reclaim_handler, &q->txq[TXQ_ETH]);
2095 	TASK_INIT(&q->txq[TXQ_OFLD].qreclaim_task, 0, sge_txq_reclaim_handler, &q->txq[TXQ_OFLD]);
2096 
2097 
2098 
2099 
2100 	q->fl[0].gen = q->fl[1].gen = 1;
2101 	q->fl[0].size = p->fl_size;
2102 	q->fl[1].size = p->jumbo_size;
2103 
2104 	q->rspq.gen = 1;
2105 	q->rspq.cidx = 0;
2106 	q->rspq.size = p->rspq_size;
2107 
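	/*
	 * Stop the Ethernet Tx queue once fewer descriptors remain than
	 * nports worst-case (TX_MAX_SEGS + 1 segment) packets would need.
	 */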
2108 	q->txq[TXQ_ETH].stop_thres = nports *
2109 	    flits_to_desc(sgl_len(TX_MAX_SEGS + 1) + 3);
2110 
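	/*
	 * Free list 0 is stocked with standard mbuf clusters, free list 1
	 * with page-sized jumbo clusters.
	 */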
2111 	q->fl[0].buf_size = MCLBYTES;
2112 	q->fl[0].zone = zone_clust;
2113 	q->fl[0].type = EXT_CLUSTER;
2114 	q->fl[1].buf_size = MJUMPAGESIZE;
2115 	q->fl[1].zone = zone_jumbop;
2116 	q->fl[1].type = EXT_JUMBOP;
2117 
2118 	q->lro.enabled = lro_default;
2119 
2120 	mtx_lock(&sc->sge.reg_lock);
2121 	ret = -t3_sge_init_rspcntxt(sc, q->rspq.cntxt_id, irq_vec_idx,
2122 				   q->rspq.phys_addr, q->rspq.size,
2123 				   q->fl[0].buf_size, 1, 0);
2124 	if (ret) {
2125 		printf("error %d from t3_sge_init_rspcntxt\n", ret);
2126 		goto err_unlock;
2127 	}
2128 
2129 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
2130 		ret = -t3_sge_init_flcntxt(sc, q->fl[i].cntxt_id, 0,
2131 					  q->fl[i].phys_addr, q->fl[i].size,
2132 					  q->fl[i].buf_size, p->cong_thres, 1,
2133 					  0);
2134 		if (ret) {
2135 			printf("error %d from t3_sge_init_flcntxt for index i=%d\n", ret, i);
2136 			goto err_unlock;
2137 		}
2138 	}
2139 
2140 	ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_ETH].cntxt_id, USE_GTS,
2141 				 SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr,
2142 				 q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token,
2143 				 1, 0);
2144 	if (ret) {
2145 		printf("error %d from t3_sge_init_ecntxt\n", ret);
2146 		goto err_unlock;
2147 	}
2148 
2149 	if (ntxq > 1) {
2150 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_OFLD].cntxt_id,
2151 					 USE_GTS, SGE_CNTXT_OFLD, id,
2152 					 q->txq[TXQ_OFLD].phys_addr,
2153 					 q->txq[TXQ_OFLD].size, 0, 1, 0);
2154 		if (ret) {
2155 			printf("error %d from t3_sge_init_ecntxt\n", ret);
2156 			goto err_unlock;
2157 		}
2158 	}
2159 
2160 	if (ntxq > 2) {
2161 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_CTRL].cntxt_id, 0,
2162 					 SGE_CNTXT_CTRL, id,
2163 					 q->txq[TXQ_CTRL].phys_addr,
2164 					 q->txq[TXQ_CTRL].size,
2165 					 q->txq[TXQ_CTRL].token, 1, 0);
2166 		if (ret) {
2167 			printf("error %d from t3_sge_init_ecntxt\n", ret);
2168 			goto err_unlock;
2169 		}
2170 	}
2171 
2172 	snprintf(q->rspq.lockbuf, RSPQ_NAME_LEN, "t3 rspq lock %d:%d",
2173 	    device_get_unit(sc->dev), irq_vec_idx);
2174 	MTX_INIT(&q->rspq.lock, q->rspq.lockbuf, NULL, MTX_DEF);
2175 
2176 	mtx_unlock(&sc->sge.reg_lock);
2177 	t3_update_qset_coalesce(q, p);
2178 	q->port = pi;
2179 
2180 	refill_fl(sc, &q->fl[0], q->fl[0].size);
2181 	refill_fl(sc, &q->fl[1], q->fl[1].size);
2182 	refill_rspq(sc, &q->rspq, q->rspq.size - 1);
2183 
2184 	t3_write_reg(sc, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) |
2185 		     V_NEWTIMER(q->rspq.holdoff_tmr));
2186 
2187 	return (0);
2188 
2189 err_unlock:
2190 	mtx_unlock(&sc->sge.reg_lock);
2191 err:
2192 	t3_free_qset(sc, q);
2193 
2194 	return (ret);
2195 }
2196 
2197 void
2198 t3_rx_eth(struct port_info *pi, struct sge_rspq *rq, struct mbuf *m, int ethpad)
2199 {
2200 	struct cpl_rx_pkt *cpl = (struct cpl_rx_pkt *)(mtod(m, uint8_t *) + ethpad);
2201 	struct ifnet *ifp = pi->ifp;
2202 
2203 	DPRINTF("rx_eth m=%p m->m_data=%p p->iff=%d\n", m, mtod(m, uint8_t *), cpl->iff);
2204 	if (&pi->adapter->port[cpl->iff] != pi)
2205 		panic("bad port index %d m->m_data=%p\n", cpl->iff, mtod(m, uint8_t *));
2206 
2207 	if ((ifp->if_capenable & IFCAP_RXCSUM) && !cpl->fragment &&
2208 	    cpl->csum_valid && cpl->csum == 0xffff) {
2209 		rspq_to_qset(rq)->port_stats[SGE_PSTAT_RX_CSUM_GOOD]++;
2210 		m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED|CSUM_IP_VALID|
2211 		    CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
2212 		m->m_pkthdr.csum_data = 0xffff;
2213 	}
2214 	/*
2215 	 * XXX need to add VLAN support for 6.x
2216 	 */
2217 #ifdef VLAN_SUPPORTED
2218 	if (__predict_false(cpl->vlan_valid)) {
2219 		m->m_pkthdr.ether_vtag = ntohs(cpl->vlan);
2220 		m->m_flags |= M_VLANTAG;
2221 	}
2222 #endif
2223 
2224 	m->m_pkthdr.rcvif = ifp;
2225 	m->m_pkthdr.header = mtod(m, uint8_t *) + sizeof(*cpl) + ethpad;
2226 	m_explode(m);
2227 	/*
2228 	 * adjust after conversion to mbuf chain
2229 	 */
2230 	m_adj(m, sizeof(*cpl) + ethpad);
2231 
2232 	(*ifp->if_input)(ifp, m);
2233 }
2234 
2235 /**
2236  *	get_packet - return the next ingress packet buffer from a free list
2237  *	@adap: the adapter that received the packet
2238  *	@drop_thres: # of remaining buffers before we start dropping packets
2239  *	@qs: the qset that the SGE free list holding the packet belongs to
2240  *      @m: the mbuf to receive the packet data
2241  *      @r: response descriptor
2242  *
2243  *	Get the next packet from a free list and complete setup of the
2244  *	sk_buff.  If the packet is small we make a copy and recycle the
2245  *	mbuf.  If the packet is small we make a copy and recycle the
2246  *	positive drop threshold is supplied packets are dropped and their
2247  *	buffers recycled if (a) the number of remaining buffers is under the
2248  *	threshold and the packet is too big to copy, or (b) the packet should
2249  *	be copied but there is no memory for the copy.
2250  */
2251 static int
2252 get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
2253     struct mbuf *m, struct rsp_desc *r)
2254 {
2255 
2256 	unsigned int len_cq =  ntohl(r->len_cq);
2257 	struct sge_fl *fl = (len_cq & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
2258 	struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];
2259 	uint32_t len = G_RSPD_LEN(len_cq);
2260 	uint32_t flags = ntohl(r->flags);
2261 	uint8_t sopeop = G_RSPD_SOP_EOP(flags);
2262 	void *cl;
2263 	int ret = 0;
2264 
2265 	prefetch(sd->cl);
2266 
2267 	fl->credits--;
2268 	bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD);
2269 
2270 	if (recycle_enable && len <= SGE_RX_COPY_THRES && sopeop == RSPQ_SOP_EOP) {
2271 		cl = mtod(m, void *);
2272 		memcpy(cl, sd->cl, len);
2273 		recycle_rx_buf(adap, fl, fl->cidx);
2274 	} else {
2275 		cl = sd->cl;
2276 		bus_dmamap_unload(fl->entry_tag, sd->map);
2277 	}
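	/*
	 * The SOP/EOP bits indicate whether this buffer starts, continues,
	 * or completes a packet; a non-zero return tells the caller that a
	 * complete packet is now ready in the mbuf.
	 */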
2278 	switch(sopeop) {
2279 	case RSPQ_SOP_EOP:
2280 		DBG(DBG_RX, ("get_packet: SOP-EOP m %p\n", m));
2281 		if (cl == sd->cl)
2282 			m_cljset(m, cl, fl->type);
2283 		m->m_len = m->m_pkthdr.len = len;
2284 		ret = 1;
2285 		goto done;
2286 		break;
2287 	case RSPQ_NSOP_NEOP:
2288 		DBG(DBG_RX, ("get_packet: NO_SOP-NO_EOP m %p\n", m));
2289 		ret = 0;
2290 		break;
2291 	case RSPQ_SOP:
2292 		DBG(DBG_RX, ("get_packet: SOP m %p\n", m));
2293 		m_iovinit(m);
2294 		ret = 0;
2295 		break;
2296 	case RSPQ_EOP:
2297 		DBG(DBG_RX, ("get_packet: EOP m %p\n", m));
2298 		ret = 1;
2299 		break;
2300 	}
2301 	m_iovappend(m, cl, fl->buf_size, len, 0);
2302 
2303 done:
2304 	if (++fl->cidx == fl->size)
2305 		fl->cidx = 0;
2306 
2307 	return (ret);
2308 }
2309 
2310 /**
2311  *	handle_rsp_cntrl_info - handles control information in a response
2312  *	@qs: the queue set corresponding to the response
2313  *	@flags: the response control flags
2314  *
2315  *	Handles the control information of an SGE response, such as GTS
2316  *	indications and completion credits for the queue set's Tx queues.
2317  *	HW coalesces credits, we don't do any extra SW coalescing.
2318  */
2319 static __inline void
2320 handle_rsp_cntrl_info(struct sge_qset *qs, uint32_t flags)
2321 {
2322 	unsigned int credits;
2323 
2324 #if USE_GTS
2325 	if (flags & F_RSPD_TXQ0_GTS)
2326 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags);
2327 #endif
2328 	credits = G_RSPD_TXQ0_CR(flags);
2329 	if (credits) {
2330 		qs->txq[TXQ_ETH].processed += credits;
2331 		if (desc_reclaimable(&qs->txq[TXQ_ETH]) > TX_START_MAX_DESC)
2332 			taskqueue_enqueue(qs->port->adapter->tq,
2333 			    &qs->port->timer_reclaim_task);
2334 	}
2335 
2336 	credits = G_RSPD_TXQ2_CR(flags);
2337 	if (credits)
2338 		qs->txq[TXQ_CTRL].processed += credits;
2339 
2340 #if USE_GTS
2341 	if (flags & F_RSPD_TXQ1_GTS)
2342 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags);
2343 #endif
2344 	credits = G_RSPD_TXQ1_CR(flags);
2345 	if (credits)
2346 		qs->txq[TXQ_OFLD].processed += credits;
2347 }
2348 
2349 static void
2350 check_ring_db(adapter_t *adap, struct sge_qset *qs,
2351     unsigned int sleeping)
2352 {
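	/*
	 * XXX placeholder: doorbell handling for the queues flagged in
	 * 'sleeping' is not implemented.
	 */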
2353 	;
2354 }
2355 
2356 /**
2357  *	process_responses - process responses from an SGE response queue
2358  *	@adap: the adapter
2359  *	@qs: the queue set to which the response queue belongs
2360  *	@budget: how many responses can be processed in this round
2361  *
2362  *	Process responses from an SGE response queue up to the supplied budget.
2363  *	Responses include received packets as well as credits and other events
2364  *	for the queues that belong to the response queue's queue set.
2365  *	A negative budget is effectively unlimited.
2366  *
2367  *	Additionally choose the interrupt holdoff time for the next interrupt
2368  *	on this queue.  If the system is under memory shortage use a fairly
2369  *	long delay to help recovery.
2370  */
2371 static int
2372 process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
2373 {
2374 	struct sge_rspq *rspq = &qs->rspq;
2375 	struct rsp_desc *r = &rspq->desc[rspq->cidx];
2376 	int budget_left = budget;
2377 	unsigned int sleeping = 0;
2378 	int lro = qs->lro.enabled;
2379 	struct mbuf *offload_mbufs[RX_BUNDLE_SIZE];
2380 	int ngathered = 0;
2381 #ifdef DEBUG
2382 	static int last_holdoff = 0;
2383 	if (cxgb_debug && rspq->holdoff_tmr != last_holdoff) {
2384 		printf("next_holdoff=%d\n", rspq->holdoff_tmr);
2385 		last_holdoff = rspq->holdoff_tmr;
2386 	}
2387 #endif
2388 	rspq->next_holdoff = rspq->holdoff_tmr;
2389 
2390 	while (__predict_true(budget_left && is_new_response(r, rspq))) {
2391 		int eth, eop = 0, ethpad = 0;
2392 		uint32_t flags = ntohl(r->flags);
2393 		uint32_t rss_csum = *(const uint32_t *)r;
2394 		uint32_t rss_hash = r->rss_hdr.rss_hash_val;
2395 
2396 		eth = (r->rss_hdr.opcode == CPL_RX_PKT);
2397 
2398 		if (__predict_false(flags & F_RSPD_ASYNC_NOTIF)) {
2399 			/* XXX */
2400 			printf("async notification\n");
2401 
2402 		} else if  (flags & F_RSPD_IMM_DATA_VALID) {
2403 			struct mbuf *m = NULL;
2404 
2405 			if (cxgb_debug)
2406 				printf("IMM DATA VALID opcode=0x%x rspq->cidx=%d\n", r->rss_hdr.opcode, rspq->cidx);
2407 			if (rspq->m == NULL)
2408 				rspq->m = m_gethdr(M_DONTWAIT, MT_DATA);
2409 			else
2410 				m = m_gethdr(M_DONTWAIT, MT_DATA);
2411 
2412 			/*
2413 			 * XXX revisit me
2414 			 */
2415 			if (rspq->m == NULL &&  m == NULL) {
2416 				rspq->next_holdoff = NOMEM_INTR_DELAY;
2417 				budget_left--;
2418 				break;
2419 			}
2420 			if (get_imm_packet(adap, r, rspq->m, m, flags))
2421 				goto skip;
2422 			eop = 1;
2423 			rspq->imm_data++;
2424 		} else if (r->len_cq) {
2425 			int drop_thresh = eth ? SGE_RX_DROP_THRES : 0;
2426 
2427 			if (rspq->m == NULL)
2428 				rspq->m = m_gethdr(M_DONTWAIT, MT_DATA);
2429 			if (rspq->m == NULL) {
2430 				log(LOG_WARNING, "failed to get mbuf for packet\n");
2431 				break;
2432 			}
2433 
2434 			ethpad = 2;
2435 			eop = get_packet(adap, drop_thresh, qs, rspq->m, r);
2436 		} else {
2437 			DPRINTF("pure response\n");
2438 			rspq->pure_rsps++;
2439 		}
2440 
2441 		if (flags & RSPD_CTRL_MASK) {
2442 			sleeping |= flags & RSPD_GTS_MASK;
2443 			handle_rsp_cntrl_info(qs, flags);
2444 		}
2445 	skip:
2446 		r++;
2447 		if (__predict_false(++rspq->cidx == rspq->size)) {
2448 			rspq->cidx = 0;
2449 			rspq->gen ^= 1;
2450 			r = rspq->desc;
2451 		}
2452 
2453 		prefetch(r);
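		/*
		 * Return accumulated credits to the response queue a quarter
		 * of the ring at a time.
		 */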
2454 		if (++rspq->credits >= (rspq->size / 4)) {
2455 			refill_rspq(adap, rspq, rspq->credits);
2456 			rspq->credits = 0;
2457 		}
2458 
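		/*
		 * On end-of-packet, hand Ethernet traffic to the LRO/receive
		 * path and gather offload traffic into a bundle, then top up
		 * both free lists.
		 */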
2459 		if (eop) {
2460 			prefetch(mtod(rspq->m, uint8_t *));
2461 			prefetch(mtod(rspq->m, uint8_t *) + L1_CACHE_BYTES);
2462 
2463 			if (eth) {
2464 				t3_rx_eth_lro(adap, rspq, rspq->m, ethpad,
2465 				    rss_hash, rss_csum, lro);
2466 
2467 				rspq->m = NULL;
2468 			} else {
2469 				rspq->m->m_pkthdr.csum_data = rss_csum;
2470 				/*
2471 				 * XXX size mismatch
2472 				 */
2473 				m_set_priority(rspq->m, rss_hash);
2474 
2475 				ngathered = rx_offload(&adap->tdev, rspq, rspq->m,
2476 				    offload_mbufs, ngathered);
2477 			}
2478 			__refill_fl(adap, &qs->fl[0]);
2479 			__refill_fl(adap, &qs->fl[1]);
2480 
2481 		}
2482 		--budget_left;
2483 	}
2484 
2485 	deliver_partial_bundle(&adap->tdev, rspq, offload_mbufs, ngathered);
2486 	t3_lro_flush(adap, qs, &qs->lro);
2487 
2488 	if (sleeping)
2489 		check_ring_db(adap, qs, sleeping);
2490 
2491 	smp_mb();  /* commit Tx queue processed updates */
2492 	if (__predict_false(qs->txq_stopped != 0))
2493 		restart_tx(qs);
2494 
2495 	budget -= budget_left;
2496 	return (budget);
2497 }
2498 
2499 /*
2500  * A helper function that processes responses and issues GTS.
2501  */
2502 static __inline int
2503 process_responses_gts(adapter_t *adap, struct sge_rspq *rq)
2504 {
2505 	int work;
2506 	static int last_holdoff = 0;
2507 
2508 	work = process_responses(adap, rspq_to_qset(rq), -1);
2509 
2510 	if (cxgb_debug && (rq->next_holdoff != last_holdoff)) {
2511 		printf("next_holdoff=%d\n", rq->next_holdoff);
2512 		last_holdoff = rq->next_holdoff;
2513 	}
2514 
2515 	t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) |
2516 		     V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx));
2517 	return work;
2518 }
2519 
2520 
2521 /*
2522  * Interrupt handler for legacy INTx interrupts for T3B-based cards.
2523  * Handles data events from SGE response queues as well as error and other
2524  * async events as they all use the same interrupt pin.  We use one SGE
2525  * response queue per port in this mode and protect all response queues with
2526  * queue 0's lock.
2527  */
2528 void
2529 t3b_intr(void *data)
2530 {
2531 	uint32_t map;
2532 	adapter_t *adap = data;
2533 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
2534 	struct sge_rspq *q1 = &adap->sge.qs[1].rspq;
2535 
2536 	t3_write_reg(adap, A_PL_CLI, 0);
2537 	map = t3_read_reg(adap, A_SG_DATA_INTR);
2538 
2539 	if (!map)
2540 		return;
2541 
2542 	if (__predict_false(map & F_ERRINTR))
2543 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
2544 
2545 	mtx_lock(&q0->lock);
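	/*
	 * Bits 0 and 1 of the data interrupt map indicate which response
	 * queue has new entries.
	 */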
2546 
2547 	if (__predict_true(map & 1))
2548 		process_responses_gts(adap, q0);
2549 
2550 	if (map & 2)
2551 		process_responses_gts(adap, q1);
2552 
2553 	mtx_unlock(&q0->lock);
2554 }
2555 
2556 /*
2557  * The MSI interrupt handler.  This needs to handle data events from SGE
2558  * response queues as well as error and other async events as they all use
2559  * the same MSI vector.  We use one SGE response queue per port in this mode
2560  * and protect all response queues with queue 0's lock.
2561  */
2562 void
2563 t3_intr_msi(void *data)
2564 {
2565 	adapter_t *adap = data;
2566 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
2567 	struct sge_rspq *q1 = &adap->sge.qs[1].rspq;
2568 	int new_packets = 0;
2569 
2570 	mtx_lock(&q0->lock);
2571 	if (process_responses_gts(adap, q0)) {
2572 		new_packets = 1;
2573 	}
2574 
2575 	if (adap->params.nports == 2 &&
2576 	    process_responses_gts(adap, q1)) {
2577 		new_packets = 1;
2578 	}
2579 
2580 	mtx_unlock(&q0->lock);
2581 	if (new_packets == 0)
2582 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
2583 }
2584 
2585 void
2586 t3_intr_msix(void *data)
2587 {
2588 	struct sge_qset *qs = data;
2589 	adapter_t *adap = qs->port->adapter;
2590 	struct sge_rspq *rspq = &qs->rspq;
2591 
2592 	mtx_lock(&rspq->lock);
2593 	if (process_responses_gts(adap, rspq) == 0)
2594 		rspq->unhandled_irqs++;
2595 	mtx_unlock(&rspq->lock);
2596 }
2597 
2598 /*
2599  * broken by recent mbuf changes
2600  */
2601 static int
2602 t3_lro_enable(SYSCTL_HANDLER_ARGS)
2603 {
2604 	adapter_t *sc;
2605 	int i, j, enabled, err, nqsets = 0;
2606 
2607 #ifndef LRO_WORKING
2608 	return (0);
2609 #endif
2610 
2611 	sc = arg1;
2612 	enabled = sc->sge.qs[0].lro.enabled;
2613 	err = sysctl_handle_int(oidp, &enabled, arg2, req);
2614 
2615 	if (err != 0)
2616 		return (err);
2617 	if (enabled == sc->sge.qs[0].lro.enabled)
2618 		return (0);
2619 
2620 	for (i = 0; i < sc->params.nports; i++)
2621 		for (j = 0; j < sc->port[i].nqsets; j++)
2622 			nqsets++;
2623 
2624 	for (i = 0; i < nqsets; i++)
2625 		sc->sge.qs[i].lro.enabled = enabled;
2626 
2627 	return (0);
2628 }
2629 
2630 static int
2631 t3_set_coalesce_nsecs(SYSCTL_HANDLER_ARGS)
2632 {
2633 	adapter_t *sc = arg1;
2634 	struct qset_params *qsp = &sc->params.sge.qset[0];
2635 	int coalesce_nsecs;
2636 	struct sge_qset *qs;
2637 	int i, j, err, nqsets = 0;
2638 	struct mtx *lock;
2639 
2640 	coalesce_nsecs = qsp->coalesce_nsecs;
2641 	err = sysctl_handle_int(oidp, &coalesce_nsecs, arg2, req);
2642 
2643 	if (err != 0) {
2644 		return (err);
2645 	}
2646 	if (coalesce_nsecs == qsp->coalesce_nsecs)
2647 		return (0);
2648 
2649 	for (i = 0; i < sc->params.nports; i++)
2650 		for (j = 0; j < sc->port[i].nqsets; j++)
2651 			nqsets++;
2652 
2653 	coalesce_nsecs = max(100, coalesce_nsecs);
2654 
2655 	for (i = 0; i < nqsets; i++) {
2656 		qs = &sc->sge.qs[i];
2657 		qsp = &sc->params.sge.qset[i];
2658 		qsp->coalesce_nsecs = coalesce_nsecs;
2659 
2660 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
2661 			    &sc->sge.qs[0].rspq.lock;
2662 
2663 		mtx_lock(lock);
2664 		t3_update_qset_coalesce(qs, qsp);
2665 		t3_write_reg(sc, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) |
2666 		    V_NEWTIMER(qs->rspq.holdoff_tmr));
2667 		mtx_unlock(lock);
2668 	}
2669 
2670 	return (0);
2671 }
2672 
2673 
2674 void
2675 t3_add_sysctls(adapter_t *sc)
2676 {
2677 	struct sysctl_ctx_list *ctx;
2678 	struct sysctl_oid_list *children;
2679 
2680 	ctx = device_get_sysctl_ctx(sc->dev);
2681 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
2682 
2683 	/* random information */
2684 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
2685 	    "firmware_version",
2686 	    CTLFLAG_RD, &sc->fw_version,
2687 	    0, "firmware version");
2688 
2689 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
2690 	    "enable_lro",
2691 	    CTLTYPE_INT|CTLFLAG_RW, sc,
2692 	    0, t3_lro_enable,
2693 	    "I", "enable large receive offload");
2694 
2695 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
2696 	    "intr_coal",
2697 	    CTLTYPE_INT|CTLFLAG_RW, sc,
2698 	    0, t3_set_coalesce_nsecs,
2699 	    "I", "interrupt coalescing timer (ns)");
2700 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
2701 	    "enable_debug",
2702 	    CTLFLAG_RW, &cxgb_debug,
2703 	    0, "enable verbose debugging output");
2704 
2705 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
2706 	    "collapse_free",
2707 	    CTLFLAG_RD, &collapse_free,
2708 	    0, "frees during collapse");
2709 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
2710 	    "mb_free_vec_free",
2711 	    CTLFLAG_RD, &mb_free_vec_free,
2712 	    0, "frees during mb_free_vec");
2713 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
2714 	    "collapse_mbufs",
2715 	    CTLFLAG_RW, &collapse_mbufs,
2716 	    0, "collapse mbuf chains into iovecs");
2717 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
2718 	    "txq_overrun",
2719 	    CTLFLAG_RD, &txq_fills,
2720 	    0, "#times txq overrun");
2721 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
2722 	    "bogus_imm",
2723 	    CTLFLAG_RD, &bogus_imm,
2724 	    0, "#times a bogus immediate response was seen");
2725 }
2726 
2727 /**
2728  *	t3_get_desc - dump an SGE descriptor for debugging purposes
2729  *	@qs: the queue set
2730  *	@qnum: identifies the specific queue (0..2: Tx, 3: response, 4..5: Rx)
2731  *	@idx: the descriptor index in the queue
2732  *	@data: where to dump the descriptor contents
2733  *
2734  *	Dumps the contents of a HW descriptor of an SGE queue.  Returns the
2735  *	size of the descriptor.
2736  */
2737 int
2738 t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx,
2739 		unsigned char *data)
2740 {
2741 	if (qnum >= 6)
2742 		return (EINVAL);
2743 
2744 	if (qnum < 3) {
2745 		if (!qs->txq[qnum].desc || idx >= qs->txq[qnum].size)
2746 			return (EINVAL);
2747 		memcpy(data, &qs->txq[qnum].desc[idx], sizeof(struct tx_desc));
2748 		return sizeof(struct tx_desc);
2749 	}
2750 
2751 	if (qnum == 3) {
2752 		if (!qs->rspq.desc || idx >= qs->rspq.size)
2753 			return (EINVAL);
2754 		memcpy(data, &qs->rspq.desc[idx], sizeof(struct rsp_desc));
2755 		return sizeof(struct rsp_desc);
2756 	}
2757 
2758 	qnum -= 4;
2759 	if (!qs->fl[qnum].desc || idx >= qs->fl[qnum].size)
2760 		return (EINVAL);
2761 	memcpy(data, &qs->fl[qnum].desc[idx], sizeof(struct rx_desc));
2762 	return sizeof(struct rx_desc);
2763 }
2764