xref: /freebsd/sys/dev/cxgb/cxgb_sge.c (revision 30d239bc4c510432e65a84fa1c14ed67a3ab1c92)
1 /**************************************************************************
2 
3 Copyright (c) 2007, Chelsio Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Neither the name of the Chelsio Corporation nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15 
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27 
28 ***************************************************************************/
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/kernel.h>
36 #include <sys/module.h>
37 #include <sys/bus.h>
38 #include <sys/conf.h>
39 #include <machine/bus.h>
40 #include <machine/resource.h>
41 #include <sys/bus_dma.h>
42 #include <sys/rman.h>
43 #include <sys/queue.h>
44 #include <sys/sysctl.h>
45 #include <sys/taskqueue.h>
46 
47 #include <sys/proc.h>
48 #include <sys/sched.h>
49 #include <sys/smp.h>
50 #include <sys/systm.h>
51 
52 #include <netinet/in_systm.h>
53 #include <netinet/in.h>
54 #include <netinet/ip.h>
55 #include <netinet/tcp.h>
56 
57 #include <dev/pci/pcireg.h>
58 #include <dev/pci/pcivar.h>
59 
60 #ifdef CONFIG_DEFINED
61 #include <cxgb_include.h>
62 #else
63 #include <dev/cxgb/cxgb_include.h>
64 #endif
65 
66 uint32_t collapse_free = 0;
67 uint32_t mb_free_vec_free = 0;
68 int      txq_fills = 0;
69 int      collapse_mbufs = 0;
70 static int bogus_imm = 0;
71 #ifndef DISABLE_MBUF_IOVEC
72 static int recycle_enable = 1;
73 #endif
74 
75 #define USE_GTS 0
76 
77 #define SGE_RX_SM_BUF_SIZE	1536
78 #define SGE_RX_DROP_THRES	16
79 #define SGE_RX_COPY_THRES	128
80 
81 /*
82  * Period of the Tx buffer reclaim timer.  This timer does not need to run
83  * frequently as Tx buffers are usually reclaimed by new Tx packets.
84  */
85 #define TX_RECLAIM_PERIOD       (hz >> 1)
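/* With hz ticks per second, (hz >> 1) is roughly one reclaim pass every 500 ms. */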
86 
87 /*
88  * work request size in bytes
89  */
90 #define WR_LEN (WR_FLITS * 8)
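/*
 * A "flit" is one 64-bit (8-byte) word -- see the uint64_t flit[] array in
 * struct tx_desc below -- so WR_LEN is the maximum work request size in bytes.
 */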
91 
92 /*
93  * Values for sge_txq.flags
94  */
95 enum {
96 	TXQ_RUNNING	= 1 << 0,  /* fetch engine is running */
97 	TXQ_LAST_PKT_DB = 1 << 1,  /* last packet rang the doorbell */
98 };
99 
100 struct tx_desc {
101 	uint64_t	flit[TX_DESC_FLITS];
102 } __packed;
103 
104 struct rx_desc {
105 	uint32_t	addr_lo;
106 	uint32_t	len_gen;
107 	uint32_t	gen2;
108 	uint32_t	addr_hi;
109 } __packed;
110 
111 struct rsp_desc {               /* response queue descriptor */
112 	struct rss_header	rss_hdr;
113 	uint32_t		flags;
114 	uint32_t		len_cq;
115 	uint8_t			imm_data[47];
116 	uint8_t			intr_gen;
117 } __packed;
118 
119 #define RX_SW_DESC_MAP_CREATED	(1 << 0)
120 #define TX_SW_DESC_MAP_CREATED	(1 << 1)
121 #define RX_SW_DESC_INUSE        (1 << 3)
122 #define TX_SW_DESC_MAPPED       (1 << 4)
123 
124 #define RSPQ_NSOP_NEOP           G_RSPD_SOP_EOP(0)
125 #define RSPQ_EOP                 G_RSPD_SOP_EOP(F_RSPD_EOP)
126 #define RSPQ_SOP                 G_RSPD_SOP_EOP(F_RSPD_SOP)
127 #define RSPQ_SOP_EOP             G_RSPD_SOP_EOP(F_RSPD_SOP|F_RSPD_EOP)
128 
129 struct tx_sw_desc {                /* SW state per Tx descriptor */
130 	struct mbuf	*m;
131 	bus_dmamap_t	map;
132 	int		flags;
133 };
134 
135 struct rx_sw_desc {                /* SW state per Rx descriptor */
136 	void	        *cl;
137 	bus_dmamap_t	map;
138 	int		flags;
139 };
140 
141 struct txq_state {
142 	unsigned int compl;
143 	unsigned int gen;
144 	unsigned int pidx;
145 };
146 
147 struct refill_fl_cb_arg {
148 	int               error;
149 	bus_dma_segment_t seg;
150 	int               nseg;
151 };
152 
153 /*
154  * Maps a number of flits to the number of Tx descriptors that can hold them.
155  * The formula is
156  *
157  * desc = 1 + (flits - 2) / (WR_FLITS - 1).
158  *
159  * HW allows up to 4 descriptors to be combined into a WR.
160  */
161 static uint8_t flit_desc_map[] = {
162 	0,
163 #if SGE_NUM_GENBITS == 1
164 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
165 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
166 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
167 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
168 #elif SGE_NUM_GENBITS == 2
169 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
170 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
171 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
172 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
173 #else
174 # error "SGE_NUM_GENBITS must be 1 or 2"
175 #endif
176 };
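
/*
 * Illustrative only (not used by the driver): the table above is a
 * precomputed lookup of the formula given in the comment.  Assuming WR_FLITS
 * is the per-WR flit limit, the same mapping could be computed directly as
 * sketched here.
 */
static __inline unsigned int
flit_desc_calc(unsigned int flits)
{
	if (flits <= 2)
		return (flits ? 1 : 0);
	return (1 + (flits - 2) / (WR_FLITS - 1));
}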
177 
178 
179 static int lro_default = 0;
180 int cxgb_debug = 0;
181 
182 static void t3_free_qset(adapter_t *sc, struct sge_qset *q);
183 static void sge_timer_cb(void *arg);
184 static void sge_timer_reclaim(void *arg, int ncount);
185 static void sge_txq_reclaim_handler(void *arg, int ncount);
186 static int free_tx_desc(struct sge_txq *q, int n, struct mbuf **m_vec);
187 
188 /**
189  *	reclaim_completed_tx - reclaims completed Tx descriptors
190  *	@q: the Tx queue to reclaim completed descriptors from
191  *	@nbufs: max number of reclaimed mbufs to return in the @mvec array
192  *
193  *	Reclaims Tx descriptors that the SGE has indicated it has processed,
194  *	and frees the associated buffers if possible.  Called with the Tx
195  *	queue's lock held.
196  */
197 static __inline int
198 reclaim_completed_tx(struct sge_txq *q, int nbufs, struct mbuf **mvec)
199 {
200 	int reclaimed, reclaim = desc_reclaimable(q);
201 	int n = 0;
202 
203 	mtx_assert(&q->lock, MA_OWNED);
204 	if (reclaim > 0) {
205 		n = free_tx_desc(q, min(reclaim, nbufs), mvec);
206 		reclaimed = min(reclaim, nbufs);
207 		q->cleaned += reclaimed;
208 		q->in_use -= reclaimed;
209 	}
210 	return (n);
211 }
212 
213 /**
214  *	should_restart_tx - are there enough resources to restart a Tx queue?
215  *	@q: the Tx queue
216  *
217  *	Checks if there are enough descriptors to restart a suspended Tx queue.
218  */
219 static __inline int
220 should_restart_tx(const struct sge_txq *q)
221 {
222 	unsigned int r = q->processed - q->cleaned;
223 
224 	return q->in_use - r < (q->size >> 1);
225 }
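
/*
 * In other words, the queue is considered restartable once fewer than half
 * of its descriptors remain in use, where descriptors the SGE has already
 * processed (but that have not yet been cleaned) count as free.
 */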
226 
227 /**
228  *	t3_sge_init - initialize SGE
229  *	@adap: the adapter
230  *	@p: the SGE parameters
231  *
232  *	Performs SGE initialization needed every time after a chip reset.
233  *	We do not initialize any of the queue sets here, instead the driver
234  *	top-level must request those individually.  We also do not enable DMA
235  *	here, that should be done after the queues have been set up.
236  */
237 void
238 t3_sge_init(adapter_t *adap, struct sge_params *p)
239 {
240 	u_int ctrl, ups;
241 
242 	ups = 0; /* = ffs(pci_resource_len(adap->pdev, 2) >> 12); */
243 
244 	ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL |
245 	       F_CQCRDTCTRL |
246 	       V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS |
247 	       V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING;
248 #if SGE_NUM_GENBITS == 1
249 	ctrl |= F_EGRGENCTRL;
250 #endif
251 	if (adap->params.rev > 0) {
252 		if (!(adap->flags & (USING_MSIX | USING_MSI)))
253 			ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ;
254 		ctrl |= F_CQCRDTCTRL | F_AVOIDCQOVFL;
255 	}
256 	t3_write_reg(adap, A_SG_CONTROL, ctrl);
257 	t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) |
258 		     V_LORCQDRBTHRSH(512));
259 	t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10);
260 	t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) |
261 		     V_TIMEOUT(200 * core_ticks_per_usec(adap)));
262 	t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH, 1000);
263 	t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256);
264 	t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000);
265 	t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256);
266 	t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff));
267 	t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024);
268 }
269 
270 
271 /**
272  *	sgl_len - calculates the size of an SGL of the given capacity
273  *	@n: the number of SGL entries
274  *
275  *	Calculates the number of flits needed for a scatter/gather list that
276  *	can hold the given number of entries.
277  */
278 static __inline unsigned int
279 sgl_len(unsigned int n)
280 {
281 	return ((3 * n) / 2 + (n & 1));
282 }
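
/*
 * Each struct sg_ent packs two address/length pairs (two 64-bit addresses
 * plus two 32-bit lengths) into three flits, and an odd trailing entry
 * occupies two flits, hence 3n/2 + (n & 1).  For example, 3 SGL entries
 * need one full sg_ent (3 flits) plus a partial one (2 flits) = 5 flits.
 */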
283 
284 /**
285  *	get_imm_packet - return the next ingress packet buffer from a response
286  *	@resp: the response descriptor containing the packet data
287  *
288  *	Return a packet containing the immediate data of the given response.
289  */
290 #ifdef DISABLE_MBUF_IOVEC
291 static __inline int
292 get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct t3_mbuf_hdr *mh)
293 {
294 	struct mbuf *m;
295 	int len;
296 	uint32_t flags = ntohl(resp->flags);
297 	uint8_t sopeop = G_RSPD_SOP_EOP(flags);
298 
299 	/*
300 	 * would be a firmware bug
301 	 */
302 	if (sopeop == RSPQ_NSOP_NEOP || sopeop == RSPQ_SOP)
303 		return (0);
304 
305 	m = m_gethdr(M_NOWAIT, MT_DATA);
306 	len = G_RSPD_LEN(ntohl(resp->len_cq));
307 
308 	if (m) {
309 		MH_ALIGN(m, IMMED_PKT_SIZE);
310 		memcpy(m->m_data, resp->imm_data, IMMED_PKT_SIZE);
311 		m->m_len = len;
312 
313 		switch (sopeop) {
314 		case RSPQ_SOP_EOP:
315 			mh->mh_head = mh->mh_tail = m;
316 			m->m_pkthdr.len = len;
317 			m->m_flags |= M_PKTHDR;
318 			break;
319 		case RSPQ_EOP:
320 			m->m_flags &= ~M_PKTHDR;
321 			mh->mh_head->m_pkthdr.len += len;
322 			mh->mh_tail->m_next = m;
323 			mh->mh_tail = m;
324 			break;
325 		}
326 	}
327 	return (m != NULL);
328 }
329 
330 #else
331 static int
332 get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct mbuf *m, void *cl, uint32_t flags)
333 {
334 	int len, error;
335 	uint8_t sopeop = G_RSPD_SOP_EOP(flags);
336 
337 	/*
338 	 * would be a firmware bug
339 	 */
340 	len = G_RSPD_LEN(ntohl(resp->len_cq));
341 	if (sopeop == RSPQ_NSOP_NEOP || sopeop == RSPQ_SOP) {
342 		if (cxgb_debug)
343 			device_printf(sc->dev, "unexpected value sopeop=%d flags=0x%x len=%d in get_imm_packet\n", sopeop, flags, len);
344 		bogus_imm++;
345 		return (EINVAL);
346 	}
347 	error = 0;
348 	switch (sopeop) {
349 	case RSPQ_SOP_EOP:
350 		m->m_len = m->m_pkthdr.len = len;
351 		memcpy(mtod(m, uint8_t *), resp->imm_data, len);
352 		break;
353 	case RSPQ_EOP:
354 		memcpy(cl, resp->imm_data, len);
355 		m_iovappend(m, cl, MSIZE, len, 0);
356 		break;
357 	default:
358 		bogus_imm++;
359 		error = EINVAL;
360 	}
361 
362 	return (error);
363 }
364 #endif
365 
366 static __inline u_int
367 flits_to_desc(u_int n)
368 {
369 	return (flit_desc_map[n]);
370 }
371 
372 void
373 t3_sge_err_intr_handler(adapter_t *adapter)
374 {
375 	unsigned int v, status;
376 
377 
378 	status = t3_read_reg(adapter, A_SG_INT_CAUSE);
379 
380 	if (status & F_RSPQCREDITOVERFOW)
381 		CH_ALERT(adapter, "SGE response queue credit overflow\n");
382 
383 	if (status & F_RSPQDISABLED) {
384 		v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS);
385 
386 		CH_ALERT(adapter,
387 			 "packet delivered to disabled response queue (0x%x)\n",
388 			 (v >> S_RSPQ0DISABLED) & 0xff);
389 	}
390 
391 	t3_write_reg(adapter, A_SG_INT_CAUSE, status);
392 	if (status & (F_RSPQCREDITOVERFOW | F_RSPQDISABLED))
393 		t3_fatal_err(adapter);
394 }
395 
396 void
397 t3_sge_prep(adapter_t *adap, struct sge_params *p)
398 {
399 	int i;
400 
401 	/* XXX Does ETHER_ALIGN need to be accounted for here? */
402 	p->max_pkt_size = MJUM16BYTES - sizeof(struct cpl_rx_data);
403 
404 	for (i = 0; i < SGE_QSETS; ++i) {
405 		struct qset_params *q = p->qset + i;
406 
407 		q->polling = adap->params.rev > 0;
408 
409 		if (adap->params.nports > 2)
410 			q->coalesce_nsecs = 50000;
411 		else
412 			q->coalesce_nsecs = 5000;
413 
414 		q->rspq_size = RSPQ_Q_SIZE;
415 		q->fl_size = FL_Q_SIZE;
416 		q->jumbo_size = JUMBO_Q_SIZE;
417 		q->txq_size[TXQ_ETH] = TX_ETH_Q_SIZE;
418 		q->txq_size[TXQ_OFLD] = 1024;
419 		q->txq_size[TXQ_CTRL] = 256;
420 		q->cong_thres = 0;
421 	}
422 }
423 
424 int
425 t3_sge_alloc(adapter_t *sc)
426 {
427 
428 	/* The parent tag. */
429 	if (bus_dma_tag_create( NULL,			/* parent */
430 				1, 0,			/* algnmnt, boundary */
431 				BUS_SPACE_MAXADDR,	/* lowaddr */
432 				BUS_SPACE_MAXADDR,	/* highaddr */
433 				NULL, NULL,		/* filter, filterarg */
434 				BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
435 				BUS_SPACE_UNRESTRICTED, /* nsegments */
436 				BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
437 				0,			/* flags */
438 				NULL, NULL,		/* lock, lockarg */
439 				&sc->parent_dmat)) {
440 		device_printf(sc->dev, "Cannot allocate parent DMA tag\n");
441 		return (ENOMEM);
442 	}
443 
444 	/*
445 	 * DMA tag for normal sized RX frames
446 	 */
447 	if (bus_dma_tag_create(sc->parent_dmat, MCLBYTES, 0, BUS_SPACE_MAXADDR,
448 		BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, 1,
449 		MCLBYTES, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_dmat)) {
450 		device_printf(sc->dev, "Cannot allocate RX DMA tag\n");
451 		return (ENOMEM);
452 	}
453 
454 	/*
455 	 * DMA tag for jumbo sized RX frames.
456 	 */
457 	if (bus_dma_tag_create(sc->parent_dmat, MJUMPAGESIZE, 0, BUS_SPACE_MAXADDR,
458 		BUS_SPACE_MAXADDR, NULL, NULL, MJUMPAGESIZE, 1, MJUMPAGESIZE,
459 		BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_jumbo_dmat)) {
460 		device_printf(sc->dev, "Cannot allocate RX jumbo DMA tag\n");
461 		return (ENOMEM);
462 	}
463 
464 	/*
465 	 * DMA tag for TX frames.
466 	 */
467 	if (bus_dma_tag_create(sc->parent_dmat, 1, 0, BUS_SPACE_MAXADDR,
468 		BUS_SPACE_MAXADDR, NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
469 		TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
470 		NULL, NULL, &sc->tx_dmat)) {
471 		device_printf(sc->dev, "Cannot allocate TX DMA tag\n");
472 		return (ENOMEM);
473 	}
474 
475 	return (0);
476 }
477 
478 int
479 t3_sge_free(struct adapter * sc)
480 {
481 
482 	if (sc->tx_dmat != NULL)
483 		bus_dma_tag_destroy(sc->tx_dmat);
484 
485 	if (sc->rx_jumbo_dmat != NULL)
486 		bus_dma_tag_destroy(sc->rx_jumbo_dmat);
487 
488 	if (sc->rx_dmat != NULL)
489 		bus_dma_tag_destroy(sc->rx_dmat);
490 
491 	if (sc->parent_dmat != NULL)
492 		bus_dma_tag_destroy(sc->parent_dmat);
493 
494 	return (0);
495 }
496 
497 void
498 t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p)
499 {
500 
501 	qs->rspq.holdoff_tmr = max(p->coalesce_nsecs/100, 1U);
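	/*
	 * The SGE interrupt-holdoff timer ticks in 100ns units (t3_sge_init
	 * programs A_SG_TIMER_TICK to core_ticks_per_usec / 10), so dividing
	 * coalesce_nsecs by 100 converts nanoseconds into timer ticks, with
	 * a floor of one tick.
	 */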
502 	qs->rspq.polling = 0 /* p->polling */;
503 }
504 
505 static void
506 refill_fl_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
507 {
508 	struct refill_fl_cb_arg *cb_arg = arg;
509 
510 	cb_arg->error = error;
511 	cb_arg->seg = segs[0];
512 	cb_arg->nseg = nseg;
513 
514 }
515 
516 /**
517  *	refill_fl - refill an SGE free-buffer list
518  *	@sc: the controller softc
519  *	@q: the free-list to refill
520  *	@n: the number of new buffers to allocate
521  *
522  *	(Re)populate an SGE free-buffer list with up to @n new packet buffers.
523  *	The caller must assure that @n does not exceed the queue's capacity.
524  */
525 static void
526 refill_fl(adapter_t *sc, struct sge_fl *q, int n)
527 {
528 	struct rx_sw_desc *sd = &q->sdesc[q->pidx];
529 	struct rx_desc *d = &q->desc[q->pidx];
530 	struct refill_fl_cb_arg cb_arg;
531 	void *cl;
532 	int err;
533 
534 	cb_arg.error = 0;
535 	while (n--) {
536 		/*
537 		 * We only allocate a cluster, mbuf allocation happens after rx
538 		 */
539 		if ((cl = m_cljget(NULL, M_DONTWAIT, q->buf_size)) == NULL) {
540 			log(LOG_WARNING, "Failed to allocate cluster\n");
541 			goto done;
542 		}
543 		if ((sd->flags & RX_SW_DESC_MAP_CREATED) == 0) {
544 			if ((err = bus_dmamap_create(q->entry_tag, 0, &sd->map))) {
545 				log(LOG_WARNING, "bus_dmamap_create failed %d\n", err);
546 				uma_zfree(q->zone, cl);
547 				goto done;
548 			}
549 			sd->flags |= RX_SW_DESC_MAP_CREATED;
550 		}
551 		err = bus_dmamap_load(q->entry_tag, sd->map, cl, q->buf_size,
552 		    refill_fl_cb, &cb_arg, 0);
553 
554 		if (err != 0 || cb_arg.error) {
555 			log(LOG_WARNING, "failure in refill_fl %d\n", cb_arg.error);
556 			/*
557 			 * XXX free cluster
558 			 */
559 			return;
560 		}
561 
562 		sd->flags |= RX_SW_DESC_INUSE;
563 		sd->cl = cl;
564 		d->addr_lo = htobe32(cb_arg.seg.ds_addr & 0xffffffff);
565 		d->addr_hi = htobe32(((uint64_t)cb_arg.seg.ds_addr >>32) & 0xffffffff);
566 		d->len_gen = htobe32(V_FLD_GEN1(q->gen));
567 		d->gen2 = htobe32(V_FLD_GEN2(q->gen));
568 
569 		d++;
570 		sd++;
571 
572 		if (++q->pidx == q->size) {
573 			q->pidx = 0;
574 			q->gen ^= 1;
575 			sd = q->sdesc;
576 			d = q->desc;
577 		}
578 		q->credits++;
579 	}
580 
581 done:
582 	t3_write_reg(sc, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
583 }
584 
585 
586 /**
587  *	free_rx_bufs - free the Rx buffers on an SGE free list
588  *	@sc: the controller softc
589  *	@q: the SGE free list to clean up
590  *
591  *	Release the buffers on an SGE free-buffer Rx queue.  HW fetching from
592  *	this queue should be stopped before calling this function.
593  */
594 static void
595 free_rx_bufs(adapter_t *sc, struct sge_fl *q)
596 {
597 	u_int cidx = q->cidx;
598 
599 	while (q->credits--) {
600 		struct rx_sw_desc *d = &q->sdesc[cidx];
601 
602 		if (d->flags & RX_SW_DESC_INUSE) {
603 			bus_dmamap_unload(q->entry_tag, d->map);
604 			bus_dmamap_destroy(q->entry_tag, d->map);
605 			uma_zfree(q->zone, d->cl);
606 		}
607 		d->cl = NULL;
608 		if (++cidx == q->size)
609 			cidx = 0;
610 	}
611 }
612 
613 static __inline void
614 __refill_fl(adapter_t *adap, struct sge_fl *fl)
615 {
616 	refill_fl(adap, fl, min(16U, fl->size - fl->credits));
617 }
618 
619 #ifndef DISABLE_MBUF_IOVEC
620 /**
621  *	recycle_rx_buf - recycle a receive buffer
622  *	@adapter: the adapter
623  *	@q: the SGE free list
624  *	@idx: index of buffer to recycle
625  *
626  *	Recycles the specified buffer on the given free list by adding it at
627  *	the next available slot on the list.
628  */
629 static void
630 recycle_rx_buf(adapter_t *adap, struct sge_fl *q, unsigned int idx)
631 {
632 	struct rx_desc *from = &q->desc[idx];
633 	struct rx_desc *to   = &q->desc[q->pidx];
634 
635 	q->sdesc[q->pidx] = q->sdesc[idx];
636 	to->addr_lo = from->addr_lo;	/* already big endian */
637 	to->addr_hi = from->addr_hi;	/* likewise */
638 	wmb();
639 	to->len_gen = htobe32(V_FLD_GEN1(q->gen));
640 	to->gen2 = htobe32(V_FLD_GEN2(q->gen));
641 	q->credits++;
642 
643 	if (++q->pidx == q->size) {
644 		q->pidx = 0;
645 		q->gen ^= 1;
646 	}
647 	t3_write_reg(adap, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
648 }
649 #endif
650 
651 static void
652 alloc_ring_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
653 {
654 	uint32_t *addr;
655 
656 	addr = arg;
657 	*addr = segs[0].ds_addr;
658 }
659 
660 static int
661 alloc_ring(adapter_t *sc, size_t nelem, size_t elem_size, size_t sw_size,
662     bus_addr_t *phys, void *desc, void *sdesc, bus_dma_tag_t *tag,
663     bus_dmamap_t *map, bus_dma_tag_t parent_entry_tag, bus_dma_tag_t *entry_tag)
664 {
665 	size_t len = nelem * elem_size;
666 	void *s = NULL;
667 	void *p = NULL;
668 	int err;
669 
670 	if ((err = bus_dma_tag_create(sc->parent_dmat, PAGE_SIZE, 0,
671 				      BUS_SPACE_MAXADDR_32BIT,
672 				      BUS_SPACE_MAXADDR, NULL, NULL, len, 1,
673 				      len, 0, NULL, NULL, tag)) != 0) {
674 		device_printf(sc->dev, "Cannot allocate descriptor tag\n");
675 		return (ENOMEM);
676 	}
677 
678 	if ((err = bus_dmamem_alloc(*tag, (void **)&p, BUS_DMA_NOWAIT,
679 				    map)) != 0) {
680 		device_printf(sc->dev, "Cannot allocate descriptor memory\n");
681 		return (ENOMEM);
682 	}
683 
684 	bus_dmamap_load(*tag, *map, p, len, alloc_ring_cb, phys, 0);
685 	bzero(p, len);
686 	*(void **)desc = p;
687 
688 	if (sw_size) {
689 		len = nelem * sw_size;
690 		s = malloc(len, M_DEVBUF, M_WAITOK);
691 		bzero(s, len);
692 		*(void **)sdesc = s;
693 	}
694 	if (parent_entry_tag == NULL)
695 		return (0);
696 
697 	if ((err = bus_dma_tag_create(parent_entry_tag, 1, 0,
698 				      BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR,
699 		                      NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
700 				      TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
701 		                      NULL, NULL, entry_tag)) != 0) {
702 		device_printf(sc->dev, "Cannot allocate descriptor entry tag\n");
703 		return (ENOMEM);
704 	}
705 	return (0);
706 }
707 
708 static void
709 sge_slow_intr_handler(void *arg, int ncount)
710 {
711 	adapter_t *sc = arg;
712 
713 	t3_slow_intr_handler(sc);
714 }
715 
716 /**
717  *	sge_timer_cb - perform periodic maintenance of the SGE queue sets
718  *	@arg: the adapter
719  *
720  *	Runs periodically from a timer to perform maintenance of the adapter's
721  *	SGE queue sets.  It performs the following tasks:
722  *
723  *	a) Cleans up any completed Tx descriptors that may still be pending.
724  *	Normal descriptor cleanup happens when new packets are added to a Tx
725  *	queue so this timer is relatively infrequent and does any cleanup only
726  *	if the Tx queue has not seen any new packets in a while.  We make a
727  *	best effort attempt to reclaim descriptors, in that we don't wait
728  *	around if we cannot get a queue's lock (which most likely is because
729  *	someone else is queueing new packets and so will also handle the clean
730  *	up).  Since control queues use immediate data exclusively we don't
731  *	bother cleaning them up here.
732  *
733  *	b) Replenishes Rx queues that have run out due to memory shortage.
734  *	Normally new Rx buffers are added when existing ones are consumed but
735  *	when out of memory a queue can become empty.  We try to add only a few
736  *	buffers here, the queue will be replenished fully as these new buffers
737  *	are used up if memory shortage has subsided.
738  *
739  *	c) Return coalesced response queue credits in case a response queue is
740  *	starved.
741  *
742  *	d) Ring doorbells for T304 tunnel queues since we have seen doorbell
743  *	fifo overflows and the FW doesn't implement any recovery scheme yet.
744  */
745 static void
746 sge_timer_cb(void *arg)
747 {
748 	adapter_t *sc = arg;
749 	struct port_info *p;
750 	struct sge_qset *qs;
751 	struct sge_txq  *txq;
752 	int i, j;
753 	int reclaim_eth, reclaim_ofl, refill_rx;
754 
755 	for (i = 0; i < sc->params.nports; i++)
756 		for (j = 0; j < sc->port[i].nqsets; j++) {
757 			qs = &sc->sge.qs[i + j];
758 			txq = &qs->txq[0];
759 			reclaim_eth = txq[TXQ_ETH].processed - txq[TXQ_ETH].cleaned;
760 			reclaim_ofl = txq[TXQ_OFLD].processed - txq[TXQ_OFLD].cleaned;
761 			refill_rx = ((qs->fl[0].credits < qs->fl[0].size) ||
762 			    (qs->fl[1].credits < qs->fl[1].size));
763 			if (reclaim_eth || reclaim_ofl || refill_rx) {
764 				p = &sc->port[i];
765 				taskqueue_enqueue(p->tq, &p->timer_reclaim_task);
766 				break;
767 			}
768 		}
769 	if (sc->params.nports > 2) {
770 		int i;
771 
772 		for_each_port(sc, i) {
773 			struct port_info *pi = &sc->port[i];
774 
775 			t3_write_reg(sc, A_SG_KDOORBELL,
776 				     F_SELEGRCNTX |
777 				     (FW_TUNNEL_SGEEC_START + pi->first_qset));
778 		}
779 	}
780 	if (sc->open_device_map != 0)
781 		callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
782 }
783 
784 /*
785  * This is meant to be a catch-all function to keep sge state private
786  * to sge.c
787  *
788  */
789 int
790 t3_sge_init_adapter(adapter_t *sc)
791 {
792 	callout_init(&sc->sge_timer_ch, CALLOUT_MPSAFE);
793 	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
794 	TASK_INIT(&sc->slow_intr_task, 0, sge_slow_intr_handler, sc);
795 	return (0);
796 }
797 
798 int
799 t3_sge_init_port(struct port_info *p)
800 {
801 	TASK_INIT(&p->timer_reclaim_task, 0, sge_timer_reclaim, p);
802 	return (0);
803 }
804 
805 void
806 t3_sge_deinit_sw(adapter_t *sc)
807 {
808 	int i;
809 
810 	callout_drain(&sc->sge_timer_ch);
811 	if (sc->tq)
812 		taskqueue_drain(sc->tq, &sc->slow_intr_task);
813 	for (i = 0; i < sc->params.nports; i++)
814 		if (sc->port[i].tq != NULL)
815 			taskqueue_drain(sc->port[i].tq, &sc->port[i].timer_reclaim_task);
816 }
817 
818 /**
819  *	refill_rspq - replenish an SGE response queue
820  *	@adapter: the adapter
821  *	@q: the response queue to replenish
822  *	@credits: how many new responses to make available
823  *
824  *	Replenishes a response queue by making the supplied number of responses
825  *	available to HW.
826  */
827 static __inline void
828 refill_rspq(adapter_t *sc, const struct sge_rspq *q, u_int credits)
829 {
830 
831 	/* mbufs are allocated on demand when a rspq entry is processed. */
832 	t3_write_reg(sc, A_SG_RSPQ_CREDIT_RETURN,
833 		     V_RSPQ(q->cntxt_id) | V_CREDITS(credits));
834 }
835 
836 static __inline void
837 sge_txq_reclaim_(struct sge_txq *txq)
838 {
839 	int reclaimable, i, n;
840 	struct mbuf *m_vec[TX_CLEAN_MAX_DESC];
841 	struct port_info *p;
842 
843 	p = txq->port;
844 reclaim_more:
845 	n = 0;
846 	reclaimable = desc_reclaimable(txq);
847 	if (reclaimable > 0 && mtx_trylock(&txq->lock)) {
848 		n = reclaim_completed_tx(txq, TX_CLEAN_MAX_DESC, m_vec);
849 		mtx_unlock(&txq->lock);
850 	}
851 	if (n == 0)
852 		return;
853 
854 	for (i = 0; i < n; i++) {
855 		m_freem_vec(m_vec[i]);
856 	}
857 	if (p && p->ifp->if_drv_flags & IFF_DRV_OACTIVE &&
858 	    txq->size - txq->in_use >= TX_START_MAX_DESC) {
859 		txq_fills++;
860 		p->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
861 		taskqueue_enqueue(p->tq, &p->start_task);
862 	}
863 
864 	if (n)
865 		goto reclaim_more;
866 }
867 
868 static void
869 sge_txq_reclaim_handler(void *arg, int ncount)
870 {
871 	struct sge_txq *q = arg;
872 
873 	sge_txq_reclaim_(q);
874 }
875 
876 static void
877 sge_timer_reclaim(void *arg, int ncount)
878 {
879 	struct port_info *p = arg;
880 	int i, nqsets = p->nqsets;
881 	adapter_t *sc = p->adapter;
882 	struct sge_qset *qs;
883 	struct sge_txq *txq;
884 	struct mtx *lock;
885 
886 	for (i = 0; i < nqsets; i++) {
887 		qs = &sc->sge.qs[i];
888 		txq = &qs->txq[TXQ_ETH];
889 		sge_txq_reclaim_(txq);
890 
891 		txq = &qs->txq[TXQ_OFLD];
892 		sge_txq_reclaim_(txq);
893 
894 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
895 			    &sc->sge.qs[0].rspq.lock;
896 
897 		if (mtx_trylock(lock)) {
898 			/* XXX currently assume that we are *NOT* polling */
899 			uint32_t status = t3_read_reg(sc, A_SG_RSPQ_FL_STATUS);
900 
901 			if (qs->fl[0].credits < qs->fl[0].size - 16)
902 				__refill_fl(sc, &qs->fl[0]);
903 			if (qs->fl[1].credits < qs->fl[1].size - 16)
904 				__refill_fl(sc, &qs->fl[1]);
905 
906 			if (status & (1 << qs->rspq.cntxt_id)) {
907 				if (qs->rspq.credits) {
908 					refill_rspq(sc, &qs->rspq, 1);
909 					qs->rspq.credits--;
910 					t3_write_reg(sc, A_SG_RSPQ_FL_STATUS,
911 					    1 << qs->rspq.cntxt_id);
912 				}
913 			}
914 			mtx_unlock(lock);
915 		}
916 	}
917 }
918 
919 /**
920  *	init_qset_cntxt - initialize an SGE queue set context info
921  *	@qs: the queue set
922  *	@id: the queue set id
923  *
924  *	Initializes the TIDs and context ids for the queues of a queue set.
925  */
926 static void
927 init_qset_cntxt(struct sge_qset *qs, u_int id)
928 {
929 
930 	qs->rspq.cntxt_id = id;
931 	qs->fl[0].cntxt_id = 2 * id;
932 	qs->fl[1].cntxt_id = 2 * id + 1;
933 	qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id;
934 	qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id;
935 	qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id;
936 	qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id;
937 	qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id;
938 }
939 
940 
941 static void
942 txq_prod(struct sge_txq *txq, unsigned int ndesc, struct txq_state *txqs)
943 {
944 	txq->in_use += ndesc;
945 	/*
946 	 * XXX we don't handle stopping of queue
947 	 * presumably start handles this when we bump against the end
948 	 */
949 	txqs->gen = txq->gen;
950 	txq->unacked += ndesc;
951 	txqs->compl = (txq->unacked & 8) << (S_WR_COMPL - 3);
952 	txq->unacked &= 7;
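	/*
	 * The two statements above request a WR completion from the SGE once
	 * every eight work requests: bit 3 of the unacked counter is shifted
	 * into the WR_COMPL flag position and the counter is then kept
	 * modulo 8.
	 */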
953 	txqs->pidx = txq->pidx;
954 	txq->pidx += ndesc;
955 
956 	if (txq->pidx >= txq->size) {
957 		txq->pidx -= txq->size;
958 		txq->gen ^= 1;
959 	}
960 
961 }
962 
963 /**
964  *	calc_tx_descs - calculate the number of Tx descriptors for a packet
965  *	@m: the packet mbufs
966  *      @nsegs: the number of segments
967  *
968  * 	Returns the number of Tx descriptors needed for the given Ethernet
969  * 	packet.  Ethernet packets require addition of WR and CPL headers.
970  */
971 static __inline unsigned int
972 calc_tx_descs(const struct mbuf *m, int nsegs)
973 {
974 	unsigned int flits;
975 
976 	if (m->m_pkthdr.len <= WR_LEN - sizeof(struct cpl_tx_pkt))
977 		return 1;
978 
979 	flits = sgl_len(nsegs) + 2;
980 #ifdef TSO_SUPPORTED
981 	if  (m->m_pkthdr.csum_flags & (CSUM_TSO))
982 		flits++;
983 #endif
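	/*
	 * The constant 2 above covers the WR header and CPL_TX_PKT command
	 * (one flit each) that precede the SGL in the first descriptor; a TSO
	 * packet needs one extra flit for the LSO header, matching the
	 * flits = 2 / flits = 3 cases in t3_encap().
	 */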
984 	return flits_to_desc(flits);
985 }
986 
987 static unsigned int
988 busdma_map_mbufs(struct mbuf **m, struct sge_txq *txq,
989     struct tx_sw_desc *stx, bus_dma_segment_t *segs, int *nsegs)
990 {
991 	struct mbuf *m0;
992 	int err, pktlen;
993 
994 	m0 = *m;
995 	pktlen = m0->m_pkthdr.len;
996 
997 	err = bus_dmamap_load_mvec_sg(txq->entry_tag, stx->map, m0, segs, nsegs, 0);
998 #ifdef DEBUG
999 	if (err) {
1000 		int n = 0;
1001 		struct mbuf *mtmp = m0;
1002 		while(mtmp) {
1003 			n++;
1004 			mtmp = mtmp->m_next;
1005 		}
1006 		printf("map_mbufs: bus_dmamap_load_mvec_sg failed with %d - pkthdr.len==%d nmbufs=%d\n",
1007 		    err, m0->m_pkthdr.len, n);
1008 	}
1009 #endif
1010 	if (err == EFBIG) {
1011 		/* Too many segments, try to defrag */
1012 		m0 = m_defrag(m0, M_DONTWAIT);
1013 		if (m0 == NULL) {
1014 			m_freem(*m);
1015 			*m = NULL;
1016 			return (ENOBUFS);
1017 		}
1018 		*m = m0;
1019 		err = bus_dmamap_load_mbuf_sg(txq->entry_tag, stx->map, m0, segs, nsegs, 0);
1020 	}
1021 
1022 	if (err == ENOMEM) {
1023 		return (err);
1024 	}
1025 
1026 	if (err) {
1027 		if (cxgb_debug)
1028 			printf("map failure err=%d pktlen=%d\n", err, pktlen);
1029 		m_freem_vec(m0);
1030 		*m = NULL;
1031 		return (err);
1032 	}
1033 
1034 	bus_dmamap_sync(txq->entry_tag, stx->map, BUS_DMASYNC_PREWRITE);
1035 	stx->flags |= TX_SW_DESC_MAPPED;
1036 
1037 	return (0);
1038 }
1039 
1040 /**
1041  *	make_sgl - populate a scatter/gather list for a packet
1042  *	@sgp: the SGL to populate
1043  *	@segs: the packet dma segments
1044  *	@nsegs: the number of segments
1045  *
1046  *	Generates a scatter/gather list for the buffers that make up a packet.
1047  *	The caller must provide an SGL sized for sgl_len(@nsegs) flits
1048  *	(8-byte words).
1049  */
1050 static __inline void
1051 make_sgl(struct sg_ent *sgp, bus_dma_segment_t *segs, int nsegs)
1052 {
1053 	int i, idx;
1054 
1055 	for (idx = 0, i = 0; i < nsegs; i++, idx ^= 1) {
1056 		if (i && idx == 0)
1057 			++sgp;
1058 
1059 		sgp->len[idx] = htobe32(segs[i].ds_len);
1060 		sgp->addr[idx] = htobe64(segs[i].ds_addr);
1061 	}
1062 
1063 	if (idx)
1064 		sgp->len[idx] = 0;
1065 }
1066 
1067 /**
1068  *	check_ring_tx_db - check and potentially ring a Tx queue's doorbell
1069  *	@adap: the adapter
1070  *	@q: the Tx queue
1071  *
1072  *	Ring the doorbell if a Tx queue is asleep.  There is a natural race
1073  *	in which the HW goes to sleep just after we check; in that case the
1074  *	interrupt handler will detect the outstanding Tx packet and ring the
1075  *	doorbell for us.
1076  *
1077  *	When GTS is disabled we unconditionally ring the doorbell.
1078  */
1079 static __inline void
1080 check_ring_tx_db(adapter_t *adap, struct sge_txq *q)
1081 {
1082 #if USE_GTS
1083 	clear_bit(TXQ_LAST_PKT_DB, &q->flags);
1084 	if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) {
1085 		set_bit(TXQ_LAST_PKT_DB, &q->flags);
1086 #ifdef T3_TRACE
1087 		T3_TRACE1(adap->tb[q->cntxt_id & 7], "doorbell Tx, cntxt %d",
1088 			  q->cntxt_id);
1089 #endif
1090 		t3_write_reg(adap, A_SG_KDOORBELL,
1091 			     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1092 	}
1093 #else
1094 	wmb();            /* write descriptors before telling HW */
1095 	t3_write_reg(adap, A_SG_KDOORBELL,
1096 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1097 #endif
1098 }
1099 
1100 static __inline void
1101 wr_gen2(struct tx_desc *d, unsigned int gen)
1102 {
1103 #if SGE_NUM_GENBITS == 2
1104 	d->flit[TX_DESC_FLITS - 1] = htobe64(gen);
1105 #endif
1106 }
1107 
1108 
1109 
1110 /**
1111  *	write_wr_hdr_sgl - write a WR header and, optionally, SGL
1112  *	@ndesc: number of Tx descriptors spanned by the SGL
1113  *	@txd: first Tx descriptor to be written
1114  *	@txqs: txq state (generation and producer index)
1115  *	@txq: the SGE Tx queue
1116  *	@sgl: the SGL
1117  *	@flits: number of flits to the start of the SGL in the first descriptor
1118  *	@sgl_flits: the SGL size in flits
1119  *	@wr_hi: top 32 bits of WR header based on WR type (big endian)
1120  *	@wr_lo: low 32 bits of WR header based on WR type (big endian)
1121  *
1122  *	Write a work request header and an associated SGL.  If the SGL is
1123  *	small enough to fit into one Tx descriptor it has already been written
1124  *	and we just need to write the WR header.  Otherwise we distribute the
1125  *	SGL across the number of descriptors it spans.
1126  */
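/*
 * Layout sketch (descriptive only): for a multi-descriptor WR the first
 * descriptor carries the full WR header plus the first WR_FLITS - flits
 * SGL flits, and every subsequent descriptor begins with a one-flit
 * continuation header followed by up to WR_FLITS - 1 more SGL flits, with
 * the EOP flag set only in the final fragment.
 */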
1127 
1128 static void
1129 write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs,
1130     const struct sge_txq *txq, const struct sg_ent *sgl, unsigned int flits,
1131     unsigned int sgl_flits, unsigned int wr_hi, unsigned int wr_lo)
1132 {
1133 
1134 	struct work_request_hdr *wrp = (struct work_request_hdr *)txd;
1135 	struct tx_sw_desc *txsd = &txq->sdesc[txqs->pidx];
1136 
1137 	if (__predict_true(ndesc == 1)) {
1138 		wrp->wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1139 		    V_WR_SGLSFLT(flits)) | wr_hi;
1140 		wmb();
1141 		wrp->wr_lo = htonl(V_WR_LEN(flits + sgl_flits) |
1142 		    V_WR_GEN(txqs->gen)) | wr_lo;
1143 		/* XXX gen? */
1144 		wr_gen2(txd, txqs->gen);
1145 	} else {
1146 		unsigned int ogen = txqs->gen;
1147 		const uint64_t *fp = (const uint64_t *)sgl;
1148 		struct work_request_hdr *wp = wrp;
1149 
1150 		wrp->wr_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) |
1151 		    V_WR_SGLSFLT(flits)) | wr_hi;
1152 
1153 		while (sgl_flits) {
1154 			unsigned int avail = WR_FLITS - flits;
1155 
1156 			if (avail > sgl_flits)
1157 				avail = sgl_flits;
1158 			memcpy(&txd->flit[flits], fp, avail * sizeof(*fp));
1159 			sgl_flits -= avail;
1160 			ndesc--;
1161 			if (!sgl_flits)
1162 				break;
1163 
1164 			fp += avail;
1165 			txd++;
1166 			txsd++;
1167 			if (++txqs->pidx == txq->size) {
1168 				txqs->pidx = 0;
1169 				txqs->gen ^= 1;
1170 				txd = txq->desc;
1171 				txsd = txq->sdesc;
1172 			}
1173 
1174 			/*
1175 			 * when the head of the mbuf chain
1176 			 * is freed all clusters will be freed
1177 			 * with it
1178 			 */
1179 			txsd->m = NULL;
1180 			wrp = (struct work_request_hdr *)txd;
1181 			wrp->wr_hi = htonl(V_WR_DATATYPE(1) |
1182 			    V_WR_SGLSFLT(1)) | wr_hi;
1183 			wrp->wr_lo = htonl(V_WR_LEN(min(WR_FLITS,
1184 				    sgl_flits + 1)) |
1185 			    V_WR_GEN(txqs->gen)) | wr_lo;
1186 			wr_gen2(txd, txqs->gen);
1187 			flits = 1;
1188 		}
1189 		wrp->wr_hi |= htonl(F_WR_EOP);
1190 		wmb();
1191 		wp->wr_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo;
1192 		wr_gen2((struct tx_desc *)wp, ogen);
1193 	}
1194 }
1195 
1196 
1197 /* sizeof(*eh) + sizeof(*vhdr) + sizeof(*ip) + sizeof(*tcp) */
1198 #define TCPPKTHDRSIZE (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + 20 + 20)
1199 
1200 int
1201 t3_encap(struct port_info *p, struct mbuf **m, int *free)
1202 {
1203 	adapter_t *sc;
1204 	struct mbuf *m0;
1205 	struct sge_qset *qs;
1206 	struct sge_txq *txq;
1207 	struct tx_sw_desc *stx;
1208 	struct txq_state txqs;
1209 	unsigned int ndesc, flits, cntrl, mlen;
1210 	int err, nsegs, tso_info = 0;
1211 
1212 	struct work_request_hdr *wrp;
1213 	struct tx_sw_desc *txsd;
1214 	struct sg_ent *sgp, sgl[TX_MAX_SEGS / 2 + 1];
1215 	bus_dma_segment_t segs[TX_MAX_SEGS];
1216 	uint32_t wr_hi, wr_lo, sgl_flits;
1217 
1218 	struct tx_desc *txd;
1219 	struct cpl_tx_pkt *cpl;
1220 
1221 	m0 = *m;
1222 	sc = p->adapter;
1223 
1224 	DPRINTF("t3_encap port_id=%d qsidx=%d ", p->port_id, p->first_qset);
1225 
1226 	/* port_id=1 qsid=1 txpkt_intf=2 tx_chan=0 */
1227 
1228 	qs = &sc->sge.qs[p->first_qset];
1229 
1230 	txq = &qs->txq[TXQ_ETH];
1231 	stx = &txq->sdesc[txq->pidx];
1232 	txd = &txq->desc[txq->pidx];
1233 	cpl = (struct cpl_tx_pkt *)txd;
1234 	mlen = m0->m_pkthdr.len;
1235 	cpl->len = htonl(mlen | 0x80000000);
1236 
1237 	DPRINTF("mlen=%d txpkt_intf=%d tx_chan=%d\n", mlen, p->txpkt_intf, p->tx_chan);
1238 	/*
1239 	 * XXX handle checksum, TSO, and VLAN here
1240 	 *
1241 	 */
1242 	cntrl = V_TXPKT_INTF(p->txpkt_intf);
1243 
1244 	/*
1245 	 * XXX need to add VLAN support for 6.x
1246 	 */
1247 #ifdef VLAN_SUPPORTED
1248 	if (m0->m_flags & M_VLANTAG)
1249 		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
1250 	if  (m0->m_pkthdr.csum_flags & (CSUM_TSO))
1251 		tso_info = V_LSO_MSS(m0->m_pkthdr.tso_segsz);
1252 #endif
1253 	if (tso_info) {
1254 		int eth_type;
1255 		struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *) cpl;
1256 		struct ip *ip;
1257 		struct tcphdr *tcp;
1258 		char *pkthdr, tmp[TCPPKTHDRSIZE]; /* is this too large for the stack? */
1259 
1260 		txd->flit[2] = 0;
1261 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO);
1262 		hdr->cntrl = htonl(cntrl);
1263 
1264 		if (__predict_false(m0->m_len < TCPPKTHDRSIZE)) {
1265 			pkthdr = &tmp[0];
1266 			m_copydata(m0, 0, TCPPKTHDRSIZE, pkthdr);
1267 		} else {
1268 			pkthdr = mtod(m0, char *);
1269 		}
1270 
1271 		if (__predict_false(m0->m_flags & M_VLANTAG)) {
1272 			eth_type = CPL_ETH_II_VLAN;
1273 			ip = (struct ip *)(pkthdr + ETHER_HDR_LEN +
1274 			    ETHER_VLAN_ENCAP_LEN);
1275 		} else {
1276 			eth_type = CPL_ETH_II;
1277 			ip = (struct ip *)(pkthdr + ETHER_HDR_LEN);
1278 		}
1279 		tcp = (struct tcphdr *)((uint8_t *)ip +
1280 		    sizeof(*ip));
1281 
1282 		tso_info |= V_LSO_ETH_TYPE(eth_type) |
1283 			    V_LSO_IPHDR_WORDS(ip->ip_hl) |
1284 			    V_LSO_TCPHDR_WORDS(tcp->th_off);
1285 		hdr->lso_info = htonl(tso_info);
1286 		flits = 3;
1287 	} else {
1288 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1289 		cpl->cntrl = htonl(cntrl);
1290 
1291 		if (mlen <= WR_LEN - sizeof(*cpl)) {
1292 			txq_prod(txq, 1, &txqs);
1293 			txq->sdesc[txqs.pidx].m = NULL;
1294 
1295 			if (m0->m_len == m0->m_pkthdr.len)
1296 				memcpy(&txd->flit[2], mtod(m0, uint8_t *), mlen);
1297 			else
1298 				m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[2]);
1299 
1300 			*free = 1;
1301 			flits = (mlen + 7) / 8 + 2;
1302 			cpl->wr.wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1303 					  V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1304 					  F_WR_SOP | F_WR_EOP | txqs.compl);
1305 			wmb();
1306 			cpl->wr.wr_lo = htonl(V_WR_LEN(flits) |
1307 			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1308 
1309 			wr_gen2(txd, txqs.gen);
1310 			check_ring_tx_db(sc, txq);
1311 			return (0);
1312 		}
1313 		flits = 2;
1314 	}
1315 
1316 	wrp = (struct work_request_hdr *)txd;
1317 
1318 	if ((err = busdma_map_mbufs(m, txq, stx, segs, &nsegs)) != 0) {
1319 		return (err);
1320 	}
1321 	m0 = *m;
1322 	ndesc = calc_tx_descs(m0, nsegs);
1323 
1324 	sgp = (ndesc == 1) ? (struct sg_ent *)&txd->flit[flits] : sgl;
1325 	make_sgl(sgp, segs, nsegs);
1326 
1327 	sgl_flits = sgl_len(nsegs);
1328 
1329 	DPRINTF("make_sgl success nsegs==%d ndesc==%d\n", nsegs, ndesc);
1330 	txq_prod(txq, ndesc, &txqs);
1331 	txsd = &txq->sdesc[txqs.pidx];
1332 	wr_hi = htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1333 	wr_lo = htonl(V_WR_TID(txq->token));
1334 	txsd->m = m0;
1335 	m_set_priority(m0, txqs.pidx);
1336 
1337 	write_wr_hdr_sgl(ndesc, txd, &txqs, txq, sgl, flits, sgl_flits, wr_hi, wr_lo);
1338 	check_ring_tx_db(p->adapter, txq);
1339 
1340 	return (0);
1341 }
1342 
1343 
1344 /**
1345  *	write_imm - write a packet into a Tx descriptor as immediate data
1346  *	@d: the Tx descriptor to write
1347  *	@m: the packet
1348  *	@len: the length of packet data to write as immediate data
1349  *	@gen: the generation bit value to write
1350  *
1351  *	Writes a packet as immediate data into a Tx descriptor.  The packet
1352  *	contains a work request at its beginning.  We must write the packet
1353  *	carefully so the SGE doesn't read accidentally before it's written in
1354  *	its entirety.
1355  */
1356 static __inline void
1357 write_imm(struct tx_desc *d, struct mbuf *m,
1358 	  unsigned int len, unsigned int gen)
1359 {
1360 	struct work_request_hdr *from = mtod(m, struct work_request_hdr *);
1361 	struct work_request_hdr *to = (struct work_request_hdr *)d;
1362 
1363 	memcpy(&to[1], &from[1], len - sizeof(*from));
1364 	to->wr_hi = from->wr_hi | htonl(F_WR_SOP | F_WR_EOP |
1365 					V_WR_BCNTLFLT(len & 7));
1366 	wmb();
1367 	to->wr_lo = from->wr_lo | htonl(V_WR_GEN(gen) |
1368 					V_WR_LEN((len + 7) / 8));
1369 	wr_gen2(d, gen);
1370 	m_freem(m);
1371 }
1372 
1373 /**
1374  *	check_desc_avail - check descriptor availability on a send queue
1375  *	@adap: the adapter
1376  *	@q: the TX queue
1377  *	@m: the packet needing the descriptors
1378  *	@ndesc: the number of Tx descriptors needed
1379  *	@qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL)
1380  *
1381  *	Checks if the requested number of Tx descriptors is available on an
1382  *	SGE send queue.  If the queue is already suspended or not enough
1383  *	descriptors are available the packet is queued for later transmission.
1384  *	Must be called with the Tx queue locked.
1385  *
1386  *	Returns 0 if enough descriptors are available, 1 if there aren't
1387  *	enough descriptors and the packet has been queued, and 2 if the caller
1388  *	needs to retry because there weren't enough descriptors at the
1389  *	beginning of the call but some freed up in the mean time.
1390  */
1391 static __inline int
1392 check_desc_avail(adapter_t *adap, struct sge_txq *q,
1393 		 struct mbuf *m, unsigned int ndesc,
1394 		 unsigned int qid)
1395 {
1396 	/*
1397 	 * XXX We currently only use this for checking the control queue;
1398 	 * the control queue is only used for binding qsets, which happens
1399 	 * at init time, so we are guaranteed enough descriptors.
1400 	 */
1401 	if (__predict_false(!mbufq_empty(&q->sendq))) {
1402 addq_exit:	mbufq_tail(&q->sendq, m);
1403 		return 1;
1404 	}
1405 	if (__predict_false(q->size - q->in_use < ndesc)) {
1406 
1407 		struct sge_qset *qs = txq_to_qset(q, qid);
1408 
1409 		setbit(&qs->txq_stopped, qid);
1410 		smp_mb();
1411 
1412 		if (should_restart_tx(q) &&
1413 		    test_and_clear_bit(qid, &qs->txq_stopped))
1414 			return 2;
1415 
1416 		q->stops++;
1417 		goto addq_exit;
1418 	}
1419 	return 0;
1420 }
1421 
1422 
1423 /**
1424  *	reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
1425  *	@q: the SGE control Tx queue
1426  *
1427  *	This is a variant of reclaim_completed_tx() that is used for Tx queues
1428  *	that send only immediate data (presently just the control queues) and
1429  *	thus do not have any mbufs to free.
1430  */
1431 static __inline void
1432 reclaim_completed_tx_imm(struct sge_txq *q)
1433 {
1434 	unsigned int reclaim = q->processed - q->cleaned;
1435 
1436 	mtx_assert(&q->lock, MA_OWNED);
1437 
1438 	q->in_use -= reclaim;
1439 	q->cleaned += reclaim;
1440 }
1441 
1442 static __inline int
1443 immediate(const struct mbuf *m)
1444 {
1445 	return m->m_len <= WR_LEN && m->m_pkthdr.len <= WR_LEN;
1446 }
1447 
1448 /**
1449  *	ctrl_xmit - send a packet through an SGE control Tx queue
1450  *	@adap: the adapter
1451  *	@q: the control queue
1452  *	@m: the packet
1453  *
1454  *	Send a packet through an SGE control Tx queue.  Packets sent through
1455  *	a control queue must fit entirely as immediate data in a single Tx
1456  *	descriptor and have no page fragments.
1457  */
1458 static int
1459 ctrl_xmit(adapter_t *adap, struct sge_txq *q, struct mbuf *m)
1460 {
1461 	int ret;
1462 	struct work_request_hdr *wrp = mtod(m, struct work_request_hdr *);
1463 
1464 	if (__predict_false(!immediate(m))) {
1465 		m_freem(m);
1466 		return 0;
1467 	}
1468 
1469 	wrp->wr_hi |= htonl(F_WR_SOP | F_WR_EOP);
1470 	wrp->wr_lo = htonl(V_WR_TID(q->token));
1471 
1472 	mtx_lock(&q->lock);
1473 again:	reclaim_completed_tx_imm(q);
1474 
1475 	ret = check_desc_avail(adap, q, m, 1, TXQ_CTRL);
1476 	if (__predict_false(ret)) {
1477 		if (ret == 1) {
1478 			mtx_unlock(&q->lock);
1479 			return (-1);
1480 		}
1481 		goto again;
1482 	}
1483 
1484 	write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
1485 
1486 	q->in_use++;
1487 	if (++q->pidx >= q->size) {
1488 		q->pidx = 0;
1489 		q->gen ^= 1;
1490 	}
1491 	mtx_unlock(&q->lock);
1492 	wmb();
1493 	t3_write_reg(adap, A_SG_KDOORBELL,
1494 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1495 	return (0);
1496 }
1497 
1498 
1499 /**
1500  *	restart_ctrlq - restart a suspended control queue
1501  *	@qs: the queue set containing the control queue
1502  *
1503  *	Resumes transmission on a suspended Tx control queue.
1504  */
1505 static void
1506 restart_ctrlq(void *data, int npending)
1507 {
1508 	struct mbuf *m;
1509 	struct sge_qset *qs = (struct sge_qset *)data;
1510 	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1511 	adapter_t *adap = qs->port->adapter;
1512 
1513 	mtx_lock(&q->lock);
1514 again:	reclaim_completed_tx_imm(q);
1515 
1516 	while (q->in_use < q->size &&
1517 	       (m = mbufq_dequeue(&q->sendq)) != NULL) {
1518 
1519 		write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
1520 
1521 		if (++q->pidx >= q->size) {
1522 			q->pidx = 0;
1523 			q->gen ^= 1;
1524 		}
1525 		q->in_use++;
1526 	}
1527 	if (!mbufq_empty(&q->sendq)) {
1528 		setbit(&qs->txq_stopped, TXQ_CTRL);
1529 		smp_mb();
1530 
1531 		if (should_restart_tx(q) &&
1532 		    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped))
1533 			goto again;
1534 		q->stops++;
1535 	}
1536 	mtx_unlock(&q->lock);
1537 	t3_write_reg(adap, A_SG_KDOORBELL,
1538 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1539 }
1540 
1541 
1542 /*
1543  * Send a management message through control queue 0
1544  */
1545 int
1546 t3_mgmt_tx(struct adapter *adap, struct mbuf *m)
1547 {
1548 	return ctrl_xmit(adap, &adap->sge.qs[0].txq[TXQ_CTRL], m);
1549 }
1550 
1551 /**
1552  *	free_qset - free the resources of an SGE queue set
1553  *	@sc: the controller owning the queue set
1554  *	@q: the queue set
1555  *
1556  *	Release the HW and SW resources associated with an SGE queue set, such
1557  *	as HW contexts, packet buffers, and descriptor rings.  Traffic to the
1558  *	queue set must be quiesced prior to calling this.
1559  */
1560 static void
1561 t3_free_qset(adapter_t *sc, struct sge_qset *q)
1562 {
1563 	int i;
1564 
1565 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
1566 		if (q->fl[i].desc) {
1567 			mtx_lock(&sc->sge.reg_lock);
1568 			t3_sge_disable_fl(sc, q->fl[i].cntxt_id);
1569 			mtx_unlock(&sc->sge.reg_lock);
1570 			bus_dmamap_unload(q->fl[i].desc_tag, q->fl[i].desc_map);
1571 			bus_dmamem_free(q->fl[i].desc_tag, q->fl[i].desc,
1572 					q->fl[i].desc_map);
1573 			bus_dma_tag_destroy(q->fl[i].desc_tag);
1574 			bus_dma_tag_destroy(q->fl[i].entry_tag);
1575 		}
1576 		if (q->fl[i].sdesc) {
1577 			free_rx_bufs(sc, &q->fl[i]);
1578 			free(q->fl[i].sdesc, M_DEVBUF);
1579 		}
1580 	}
1581 
1582 	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
1583 		if (q->txq[i].desc) {
1584 			mtx_lock(&sc->sge.reg_lock);
1585 			t3_sge_enable_ecntxt(sc, q->txq[i].cntxt_id, 0);
1586 			mtx_unlock(&sc->sge.reg_lock);
1587 			bus_dmamap_unload(q->txq[i].desc_tag,
1588 					q->txq[i].desc_map);
1589 			bus_dmamem_free(q->txq[i].desc_tag, q->txq[i].desc,
1590 					q->txq[i].desc_map);
1591 			bus_dma_tag_destroy(q->txq[i].desc_tag);
1592 			bus_dma_tag_destroy(q->txq[i].entry_tag);
1593 			MTX_DESTROY(&q->txq[i].lock);
1594 		}
1595 		if (q->txq[i].sdesc) {
1596 			free(q->txq[i].sdesc, M_DEVBUF);
1597 		}
1598 	}
1599 
1600 	if (q->rspq.desc) {
1601 		mtx_lock(&sc->sge.reg_lock);
1602 		t3_sge_disable_rspcntxt(sc, q->rspq.cntxt_id);
1603 		mtx_unlock(&sc->sge.reg_lock);
1604 
1605 		bus_dmamap_unload(q->rspq.desc_tag, q->rspq.desc_map);
1606 		bus_dmamem_free(q->rspq.desc_tag, q->rspq.desc,
1607 			        q->rspq.desc_map);
1608 		bus_dma_tag_destroy(q->rspq.desc_tag);
1609 		MTX_DESTROY(&q->rspq.lock);
1610 	}
1611 
1612 	bzero(q, sizeof(*q));
1613 }
1614 
1615 /**
1616  *	t3_free_sge_resources - free SGE resources
1617  *	@sc: the adapter softc
1618  *
1619  *	Frees resources used by the SGE queue sets.
1620  */
1621 void
1622 t3_free_sge_resources(adapter_t *sc)
1623 {
1624 	int i, nqsets;
1625 
1626 	for (nqsets = i = 0; i < (sc)->params.nports; i++)
1627 		nqsets += sc->port[i].nqsets;
1628 
1629 	for (i = 0; i < nqsets; ++i)
1630 		t3_free_qset(sc, &sc->sge.qs[i]);
1631 }
1632 
1633 /**
1634  *	t3_sge_start - enable SGE
1635  *	@sc: the controller softc
1636  *
1637  *	Enables the SGE for DMAs.  This is the last step in starting packet
1638  *	transfers.
1639  */
1640 void
1641 t3_sge_start(adapter_t *sc)
1642 {
1643 	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE);
1644 }
1645 
1646 /**
1647  *	t3_sge_stop - disable SGE operation
1648  *	@sc: the adapter
1649  *
1650  *	Disables the DMA engine.  This can be called in emergencies (e.g.,
1651  *	from error interrupts) or from normal process context.  In the latter
1652  *	case it also disables any pending queue restart tasklets.  Note that
1653  *	if it is called in interrupt context it cannot disable the restart
1654  *	tasklets as it cannot wait; however, the tasklets will have no effect
1655  *	since the doorbells are disabled and the driver will call this again
1656  *	later from process context, at which time the tasklets will be stopped
1657  *	if they are still running.
1658  */
1659 void
1660 t3_sge_stop(adapter_t *sc)
1661 {
1662 	int i, nqsets;
1663 
1664 	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, 0);
1665 
1666 	if (sc->tq == NULL)
1667 		return;
1668 
1669 	for (nqsets = i = 0; i < (sc)->params.nports; i++)
1670 		nqsets += sc->port[i].nqsets;
1671 
1672 	for (i = 0; i < nqsets; ++i) {
1673 		struct sge_qset *qs = &sc->sge.qs[i];
1674 
1675 		taskqueue_drain(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
1676 		taskqueue_drain(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
1677 	}
1678 }
1679 
1680 
1681 /**
1682  *	free_tx_desc - reclaims Tx descriptors and their buffers
1683  *	@q: the Tx queue to reclaim descriptors from
1684  *	@n: the number of descriptors to reclaim
1685  *	@m_vec: array in which to return the reclaimed mbufs
1686  *
1687  *	Reclaims Tx descriptors from an SGE Tx queue and returns the associated
1688  *	mbufs in @m_vec for the caller to free.  Called with the Tx queue lock held.
1689  */
1690 int
1691 free_tx_desc(struct sge_txq *q, int n, struct mbuf **m_vec)
1692 {
1693 	struct tx_sw_desc *d;
1694 	unsigned int cidx = q->cidx;
1695 	int nbufs = 0;
1696 
1697 #ifdef T3_TRACE
1698 	T3_TRACE2(sc->tb[q->cntxt_id & 7],
1699 		  "reclaiming %u Tx descriptors at cidx %u", n, cidx);
1700 #endif
1701 	d = &q->sdesc[cidx];
1702 
1703 	while (n-- > 0) {
1704 		DPRINTF("cidx=%d d=%p\n", cidx, d);
1705 		if (d->m) {
1706 			if (d->flags & TX_SW_DESC_MAPPED) {
1707 				bus_dmamap_unload(q->entry_tag, d->map);
1708 				bus_dmamap_destroy(q->entry_tag, d->map);
1709 				d->flags &= ~TX_SW_DESC_MAPPED;
1710 			}
1711 			if (m_get_priority(d->m) == cidx) {
1712 				m_vec[nbufs] = d->m;
1713 				d->m = NULL;
1714 				nbufs++;
1715 			} else {
1716 				printf("pri=%d cidx=%d\n", (int)m_get_priority(d->m), cidx);
1717 			}
1718 		}
1719 		++d;
1720 		if (++cidx == q->size) {
1721 			cidx = 0;
1722 			d = q->sdesc;
1723 		}
1724 	}
1725 	q->cidx = cidx;
1726 
1727 	return (nbufs);
1728 }
1729 
1730 /**
1731  *	is_new_response - check if a response is newly written
1732  *	@r: the response descriptor
1733  *	@q: the response queue
1734  *
1735  *	Returns true if a response descriptor contains a yet unprocessed
1736  *	response.
1737  */
1738 static __inline int
1739 is_new_response(const struct rsp_desc *r,
1740     const struct sge_rspq *q)
1741 {
1742 	return (r->intr_gen & F_RSPD_GEN2) == q->gen;
1743 }
1744 
1745 #define RSPD_GTS_MASK  (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS)
1746 #define RSPD_CTRL_MASK (RSPD_GTS_MASK | \
1747 			V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \
1748 			V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \
1749 			V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR))
1750 
1751 /* How long to delay the next interrupt in case of memory shortage, in 0.1us. */
1752 #define NOMEM_INTR_DELAY 2500
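/* i.e. 2500 * 0.1us = 250us of holdoff before the next interrupt. */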
1753 
1754 /**
1755  *	write_ofld_wr - write an offload work request
1756  *	@adap: the adapter
1757  *	@m: the packet to send
1758  *	@q: the Tx queue
1759  *	@pidx: index of the first Tx descriptor to write
1760  *	@gen: the generation value to use
1761  *	@ndesc: number of descriptors the packet will occupy
1762  *
1763  *	Write an offload work request to send the supplied packet.  The packet
1764  *	data already carry the work request with most fields populated.
1765  */
1766 static void
1767 write_ofld_wr(adapter_t *adap, struct mbuf *m,
1768     struct sge_txq *q, unsigned int pidx,
1769     unsigned int gen, unsigned int ndesc,
1770     bus_dma_segment_t *segs, unsigned int nsegs)
1771 {
1772 	unsigned int sgl_flits, flits;
1773 	struct work_request_hdr *from;
1774 	struct sg_ent *sgp, sgl[TX_MAX_SEGS / 2 + 1];
1775 	struct tx_desc *d = &q->desc[pidx];
1776 	struct txq_state txqs;
1777 
1778 	if (immediate(m)) {
1779 		q->sdesc[pidx].m = NULL;
1780 		write_imm(d, m, m->m_len, gen);
1781 		return;
1782 	}
1783 
1784 	/* Only TX_DATA builds SGLs */
1785 
1786 	from = mtod(m, struct work_request_hdr *);
1787 	memcpy(&d->flit[1], &from[1],
1788 	    (uint8_t *)m->m_pkthdr.header - mtod(m, uint8_t *) - sizeof(*from));
1789 
1790 	flits = ((uint8_t *)m->m_pkthdr.header - mtod(m, uint8_t *)) / 8;
1791 	sgp = (ndesc == 1) ? (struct sg_ent *)&d->flit[flits] : sgl;
1792 
1793 	make_sgl(sgp, segs, nsegs);
1794 	sgl_flits = sgl_len(nsegs);
1795 
1796 	txqs.gen = q->gen;
1797 	txqs.pidx = q->pidx;
1798 	txqs.compl = (q->unacked & 8) << (S_WR_COMPL - 3);
1799 	write_wr_hdr_sgl(ndesc, d, &txqs, q, sgl, flits, sgl_flits,
1800 	    from->wr_hi, from->wr_lo);
1801 }
1802 
1803 /**
1804  *	calc_tx_descs_ofld - calculate # of Tx descriptors for an offload packet
1805  *	@m: the packet
1806  *
1807  * 	Returns the number of Tx descriptors needed for the given offload
1808  * 	packet.  These packets are already fully constructed.
1809  */
1810 static __inline unsigned int
1811 calc_tx_descs_ofld(struct mbuf *m, unsigned int nsegs)
1812 {
1813 	unsigned int flits, cnt = 0;
1814 
1815 
1816 	if (m->m_len <= WR_LEN)
1817 		return 1;                 /* packet fits as immediate data */
1818 
1819 	if (m->m_flags & M_IOVEC)
1820 		cnt = mtomv(m)->mv_count;
1821 
1822 	flits = ((uint8_t *)m->m_pkthdr.header - mtod(m, uint8_t *)) / 8;   /* headers */
1823 
1824 	return flits_to_desc(flits + sgl_len(cnt));
1825 }
1826 
1827 /**
1828  *	ofld_xmit - send a packet through an offload queue
1829  *	@adap: the adapter
1830  *	@q: the Tx offload queue
1831  *	@m: the packet
1832  *
1833  *	Send an offload packet through an SGE offload queue.
1834  */
1835 static int
1836 ofld_xmit(adapter_t *adap, struct sge_txq *q, struct mbuf *m)
1837 {
1838 	int ret, nsegs;
1839 	unsigned int ndesc;
1840 	unsigned int pidx, gen;
1841 	struct mbuf *m_vec[TX_CLEAN_MAX_DESC];
1842 	bus_dma_segment_t segs[TX_MAX_SEGS];
1843 	int i, cleaned;
1844 	struct tx_sw_desc *stx = &q->sdesc[q->pidx];
1845 
1846 	mtx_lock(&q->lock);
1847 	if ((ret = busdma_map_mbufs(&m, q, stx, segs, &nsegs)) != 0) {
1848 		mtx_unlock(&q->lock);
1849 		return (ret);
1850 	}
1851 	ndesc = calc_tx_descs_ofld(m, nsegs);
1852 again:	cleaned = reclaim_completed_tx(q, TX_CLEAN_MAX_DESC, m_vec);
1853 
1854 	ret = check_desc_avail(adap, q, m, ndesc, TXQ_OFLD);
1855 	if (__predict_false(ret)) {
1856 		if (ret == 1) {
1857 			m_set_priority(m, ndesc);     /* save for restart */
1858 			mtx_unlock(&q->lock);
1859 			return EINTR;
1860 		}
1861 		goto again;
1862 	}
1863 
1864 	gen = q->gen;
1865 	q->in_use += ndesc;
1866 	pidx = q->pidx;
1867 	q->pidx += ndesc;
1868 	if (q->pidx >= q->size) {
1869 		q->pidx -= q->size;
1870 		q->gen ^= 1;
1871 	}
1872 #ifdef T3_TRACE
1873 	T3_TRACE5(adap->tb[q->cntxt_id & 7],
1874 		  "ofld_xmit: ndesc %u, pidx %u, len %u, mlen %u, nsegs %u",
1875 		  ndesc, pidx, m->m_pkthdr.len, m->m_len,
1876 		  nsegs);
1877 #endif
1878 	mtx_unlock(&q->lock);
1879 
1880 	write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
1881 	check_ring_tx_db(adap, q);
1882 
1883 	for (i = 0; i < cleaned; i++) {
1884 		m_freem_vec(m_vec[i]);
1885 	}
1886 	return (0);
1887 }
1888 
1889 /**
1890  *	restart_offloadq - restart a suspended offload queue
1891  *	@qs: the queue set containing the offload queue
1892  *
1893  *	Resumes transmission on a suspended Tx offload queue.
1894  */
1895 static void
1896 restart_offloadq(void *data, int npending)
1897 {
1898 
1899 	struct mbuf *m;
1900 	struct sge_qset *qs = data;
1901 	struct sge_txq *q = &qs->txq[TXQ_OFLD];
1902 	adapter_t *adap = qs->port->adapter;
1903 	struct mbuf *m_vec[TX_CLEAN_MAX_DESC];
1904 	bus_dma_segment_t segs[TX_MAX_SEGS];
1905 	int nsegs, i, cleaned;
1906 	struct tx_sw_desc *stx = &q->sdesc[q->pidx];
1907 
1908 	mtx_lock(&q->lock);
1909 again:	cleaned = reclaim_completed_tx(q, TX_CLEAN_MAX_DESC, m_vec);
1910 
1911 	while ((m = mbufq_peek(&q->sendq)) != NULL) {
1912 		unsigned int gen, pidx;
1913 		unsigned int ndesc = m_get_priority(m);
1914 
1915 		if (__predict_false(q->size - q->in_use < ndesc)) {
1916 			setbit(&qs->txq_stopped, TXQ_OFLD);
1917 			smp_mb();
1918 
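			/*
			 * Recheck after the barrier: if descriptors were
			 * freed while the queue was being marked stopped,
			 * clear the bit and retry instead of stalling.
			 */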
1919 			if (should_restart_tx(q) &&
1920 			    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped))
1921 				goto again;
1922 			q->stops++;
1923 			break;
1924 		}
1925 
1926 		gen = q->gen;
1927 		q->in_use += ndesc;
1928 		pidx = q->pidx;
1929 		q->pidx += ndesc;
1930 		if (q->pidx >= q->size) {
1931 			q->pidx -= q->size;
1932 			q->gen ^= 1;
1933 		}
1934 
1935 		(void)mbufq_dequeue(&q->sendq);
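		/* XXX the return value of busdma_map_mbufs() is ignored here */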
1936 		busdma_map_mbufs(&m, q, stx, segs, &nsegs);
1937 		mtx_unlock(&q->lock);
1938 		write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
1939 		mtx_lock(&q->lock);
1940 	}
1941 	mtx_unlock(&q->lock);
1942 
1943 #if USE_GTS
1944 	set_bit(TXQ_RUNNING, &q->flags);
1945 	set_bit(TXQ_LAST_PKT_DB, &q->flags);
1946 #endif
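	/* Ring the egress doorbell so the SGE processes the new descriptors. */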
1947 	t3_write_reg(adap, A_SG_KDOORBELL,
1948 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1949 
1950 	for (i = 0; i < cleaned; i++) {
1951 		m_freem_vec(m_vec[i]);
1952 	}
1953 }
1954 
1955 /**
1956  *	queue_set - return the queue set a packet should use
1957  *	@m: the packet
1958  *
1959  *	Maps a packet to the SGE queue set it should use.  The desired queue
1960  *	set is carried in bits 1-3 in the packet's priority.
1961  */
1962 static __inline int
1963 queue_set(const struct mbuf *m)
1964 {
1965 	return m_get_priority(m) >> 1;
1966 }
1967 
1968 /**
1969  *	is_ctrl_pkt - return whether an offload packet is a control packet
1970  *	@m: the packet
1971  *
1972  *	Determines whether an offload packet should use an OFLD or a CTRL
1973  *	Tx queue.  This is indicated by bit 0 in the packet's priority.
1974  */
1975 static __inline int
1976 is_ctrl_pkt(const struct mbuf *m)
1977 {
1978 	return m_get_priority(m) & 1;
1979 }
1980 
1981 /**
1982  *	t3_offload_tx - send an offload packet
1983  *	@tdev: the offload device to send to
1984  *	@m: the packet
1985  *
1986  *	Sends an offload packet.  We use the packet priority to select the
1987  *	appropriate Tx queue as follows: bit 0 indicates whether the packet
1988  *	should be sent as regular or control, bits 1-3 select the queue set.
1989  */
1990 int
1991 t3_offload_tx(struct toedev *tdev, struct mbuf *m)
1992 {
1993 	adapter_t *adap = tdev2adap(tdev);
1994 	struct sge_qset *qs = &adap->sge.qs[queue_set(m)];
1995 
1996 	if (__predict_false(is_ctrl_pkt(m)))
1997 		return ctrl_xmit(adap, &qs->txq[TXQ_CTRL], m);
1998 
1999 	return ofld_xmit(adap, &qs->txq[TXQ_OFLD], m);
2000 }
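
/*
 * Illustrative example (not part of the driver): a caller that wants to
 * send a control message on queue set 2 would encode the priority as
 *
 *	m_set_priority(m, (2 << 1) | 1);
 *	t3_offload_tx(tdev, m);
 *
 * so that is_ctrl_pkt() sees bit 0 set and queue_set() recovers 2.
 */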
2001 
2002 /**
2003  *	deliver_partial_bundle - deliver a (partial) bundle of Rx offload pkts
2004  *	@tdev: the offload device that will be receiving the packets
2005  *	@q: the SGE response queue that assembled the bundle
2006  *	@m: the partial bundle
2007  *	@n: the number of packets in the bundle
2008  *
2009  *	Delivers a (partial) bundle of Rx offload packets to an offload device.
2010  */
2011 static __inline void
2012 deliver_partial_bundle(struct toedev *tdev,
2013 			struct sge_rspq *q,
2014 			struct mbuf *mbufs[], int n)
2015 {
2016 	if (n) {
2017 		q->offload_bundles++;
2018 		cxgb_ofld_recv(tdev, mbufs, n);
2019 	}
2020 }
2021 
2022 static __inline int
2023 rx_offload(struct toedev *tdev, struct sge_rspq *rq,
2024     struct mbuf *m, struct mbuf *rx_gather[],
2025     unsigned int gather_idx)
2026 {
2027 	rq->offload_pkts++;
2028 	m->m_pkthdr.header = mtod(m, void *);
2029 
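	/*
	 * Gather packets into rx_gather[]; once a full bundle of
	 * RX_BUNDLE_SIZE mbufs has accumulated it is handed to the offload
	 * device in a single call.  deliver_partial_bundle() flushes any
	 * remainder at the end of response processing.
	 */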
2030 	rx_gather[gather_idx++] = m;
2031 	if (gather_idx == RX_BUNDLE_SIZE) {
2032 		cxgb_ofld_recv(tdev, rx_gather, RX_BUNDLE_SIZE);
2033 		gather_idx = 0;
2034 		rq->offload_bundles++;
2035 	}
2036 	return (gather_idx);
2037 }
2038 
2039 static void
2040 restart_tx(struct sge_qset *qs)
2041 {
2042 	struct adapter *sc = qs->port->adapter;
2043 
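	/*
	 * For each stopped Tx queue that again has enough free descriptors,
	 * clear its stopped bit and schedule its resume task on the
	 * adapter's taskqueue.
	 */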
2044 	if (isset(&qs->txq_stopped, TXQ_OFLD) &&
2045 	    should_restart_tx(&qs->txq[TXQ_OFLD]) &&
2046 	    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) {
2047 		qs->txq[TXQ_OFLD].restarts++;
2048 		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2049 	}
2050 	if (isset(&qs->txq_stopped, TXQ_CTRL) &&
2051 	    should_restart_tx(&qs->txq[TXQ_CTRL]) &&
2052 	    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) {
2053 		qs->txq[TXQ_CTRL].restarts++;
2054 		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2055 	}
2056 }
2057 
2058 /**
2059  *	t3_sge_alloc_qset - initialize an SGE queue set
2060  *	@sc: the controller softc
2061  *	@id: the queue set id
2062  *	@nports: how many Ethernet ports will be using this queue set
2063  *	@irq_vec_idx: the IRQ vector index for response queue interrupts
2064  *	@p: configuration parameters for this queue set
2065  *	@ntxq: number of Tx queues for the queue set
2066  *	@pi: port info for queue set
2067  *
2068  *	Allocate resources and initialize an SGE queue set.  A queue set
2069  *	comprises a response queue, two Rx free-buffer queues, and up to 3
2070  *	Tx queues.  The Tx queues are assigned roles in the order Ethernet
2071  *	queue, offload queue, and control queue.
2072  */
2073 int
2074 t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx,
2075 		  const struct qset_params *p, int ntxq, struct port_info *pi)
2076 {
2077 	struct sge_qset *q = &sc->sge.qs[id];
2078 	int i, ret = 0;
2079 
2080 	init_qset_cntxt(q, id);
2081 
2082 	if ((ret = alloc_ring(sc, p->fl_size, sizeof(struct rx_desc),
2083 		    sizeof(struct rx_sw_desc), &q->fl[0].phys_addr,
2084 		    &q->fl[0].desc, &q->fl[0].sdesc,
2085 		    &q->fl[0].desc_tag, &q->fl[0].desc_map,
2086 		    sc->rx_dmat, &q->fl[0].entry_tag)) != 0) {
2087 		printf("error %d from alloc ring fl0\n", ret);
2088 		goto err;
2089 	}
2090 
2091 	if ((ret = alloc_ring(sc, p->jumbo_size, sizeof(struct rx_desc),
2092 		    sizeof(struct rx_sw_desc), &q->fl[1].phys_addr,
2093 		    &q->fl[1].desc, &q->fl[1].sdesc,
2094 		    &q->fl[1].desc_tag, &q->fl[1].desc_map,
2095 		    sc->rx_jumbo_dmat, &q->fl[1].entry_tag)) != 0) {
2096 		printf("error %d from alloc ring fl1\n", ret);
2097 		goto err;
2098 	}
2099 
2100 	if ((ret = alloc_ring(sc, p->rspq_size, sizeof(struct rsp_desc), 0,
2101 		    &q->rspq.phys_addr, &q->rspq.desc, NULL,
2102 		    &q->rspq.desc_tag, &q->rspq.desc_map,
2103 		    NULL, NULL)) != 0) {
2104 		printf("error %d from alloc ring rspq\n", ret);
2105 		goto err;
2106 	}
2107 
2108 	for (i = 0; i < ntxq; ++i) {
2109 		/*
2110 		 * The control queue always uses immediate data so does not
2111 		 * need to keep track of any mbufs.
2112 		 * XXX Placeholder for future TOE support.
2113 		 */
2114 		size_t sz = i == TXQ_CTRL ? 0 : sizeof(struct tx_sw_desc);
2115 
2116 		if ((ret = alloc_ring(sc, p->txq_size[i],
2117 			    sizeof(struct tx_desc), sz,
2118 			    &q->txq[i].phys_addr, &q->txq[i].desc,
2119 			    &q->txq[i].sdesc, &q->txq[i].desc_tag,
2120 			    &q->txq[i].desc_map,
2121 			    sc->tx_dmat, &q->txq[i].entry_tag)) != 0) {
2122 			printf("error %d from alloc ring tx %i\n", ret, i);
2123 			goto err;
2124 		}
2125 		mbufq_init(&q->txq[i].sendq);
2126 		q->txq[i].gen = 1;
2127 		q->txq[i].size = p->txq_size[i];
2128 		snprintf(q->txq[i].lockbuf, TXQ_NAME_LEN, "t3 txq lock %d:%d:%d",
2129 		    device_get_unit(sc->dev), irq_vec_idx, i);
2130 		MTX_INIT(&q->txq[i].lock, q->txq[i].lockbuf, NULL, MTX_DEF);
2131 	}
2132 
2133 	q->txq[TXQ_ETH].port = pi;
2134 
2135 	TASK_INIT(&q->txq[TXQ_OFLD].qresume_task, 0, restart_offloadq, q);
2136 	TASK_INIT(&q->txq[TXQ_CTRL].qresume_task, 0, restart_ctrlq, q);
2137 	TASK_INIT(&q->txq[TXQ_ETH].qreclaim_task, 0, sge_txq_reclaim_handler, &q->txq[TXQ_ETH]);
2138 	TASK_INIT(&q->txq[TXQ_OFLD].qreclaim_task, 0, sge_txq_reclaim_handler, &q->txq[TXQ_OFLD]);
2139 
2140 	q->fl[0].gen = q->fl[1].gen = 1;
2141 	q->fl[0].size = p->fl_size;
2142 	q->fl[1].size = p->jumbo_size;
2143 
2144 	q->rspq.gen = 1;
2145 	q->rspq.cidx = 0;
2146 	q->rspq.size = p->rspq_size;
2147 
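	/*
	 * Stop the Ethernet Tx queue once fewer descriptors remain than the
	 * worst-case work request (an SGL covering TX_MAX_SEGS + 1 segments
	 * plus 3 header flits) for each port sharing this queue set.
	 */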
2148 	q->txq[TXQ_ETH].stop_thres = nports *
2149 	    flits_to_desc(sgl_len(TX_MAX_SEGS + 1) + 3);
2150 
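	/* Free list 0 holds standard clusters, free list 1 page-sized jumbo clusters. */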
2151 	q->fl[0].buf_size = MCLBYTES;
2152 	q->fl[0].zone = zone_clust;
2153 	q->fl[0].type = EXT_CLUSTER;
2154 	q->fl[1].buf_size = MJUMPAGESIZE;
2155 	q->fl[1].zone = zone_jumbop;
2156 	q->fl[1].type = EXT_JUMBOP;
2157 
2158 	q->lro.enabled = lro_default;
2159 
2160 	mtx_lock(&sc->sge.reg_lock);
2161 	ret = -t3_sge_init_rspcntxt(sc, q->rspq.cntxt_id, irq_vec_idx,
2162 				   q->rspq.phys_addr, q->rspq.size,
2163 				   q->fl[0].buf_size, 1, 0);
2164 	if (ret) {
2165 		printf("error %d from t3_sge_init_rspcntxt\n", ret);
2166 		goto err_unlock;
2167 	}
2168 
2169 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
2170 		ret = -t3_sge_init_flcntxt(sc, q->fl[i].cntxt_id, 0,
2171 					  q->fl[i].phys_addr, q->fl[i].size,
2172 					  q->fl[i].buf_size, p->cong_thres, 1,
2173 					  0);
2174 		if (ret) {
2175 			printf("error %d from t3_sge_init_flcntxt for index i=%d\n", ret, i);
2176 			goto err_unlock;
2177 		}
2178 	}
2179 
2180 	ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_ETH].cntxt_id, USE_GTS,
2181 				 SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr,
2182 				 q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token,
2183 				 1, 0);
2184 	if (ret) {
2185 		printf("error %d from t3_sge_init_ecntxt\n", ret);
2186 		goto err_unlock;
2187 	}
2188 
2189 	if (ntxq > 1) {
2190 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_OFLD].cntxt_id,
2191 					 USE_GTS, SGE_CNTXT_OFLD, id,
2192 					 q->txq[TXQ_OFLD].phys_addr,
2193 					 q->txq[TXQ_OFLD].size, 0, 1, 0);
2194 		if (ret) {
2195 			printf("error %d from t3_sge_init_ecntxt\n", ret);
2196 			goto err_unlock;
2197 		}
2198 	}
2199 
2200 	if (ntxq > 2) {
2201 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_CTRL].cntxt_id, 0,
2202 					 SGE_CNTXT_CTRL, id,
2203 					 q->txq[TXQ_CTRL].phys_addr,
2204 					 q->txq[TXQ_CTRL].size,
2205 					 q->txq[TXQ_CTRL].token, 1, 0);
2206 		if (ret) {
2207 			printf("error %d from t3_sge_init_ecntxt\n", ret);
2208 			goto err_unlock;
2209 		}
2210 	}
2211 
2212 	snprintf(q->rspq.lockbuf, RSPQ_NAME_LEN, "t3 rspq lock %d:%d",
2213 	    device_get_unit(sc->dev), irq_vec_idx);
2214 	MTX_INIT(&q->rspq.lock, q->rspq.lockbuf, NULL, MTX_DEF);
2215 
2216 	mtx_unlock(&sc->sge.reg_lock);
2217 	t3_update_qset_coalesce(q, p);
2218 	q->port = pi;
2219 
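	/*
	 * Prime both free lists to capacity, give the response queue all but
	 * one of its credits, and set the initial interrupt holdoff timer
	 * via the GTS register.
	 */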
2220 	refill_fl(sc, &q->fl[0], q->fl[0].size);
2221 	refill_fl(sc, &q->fl[1], q->fl[1].size);
2222 	refill_rspq(sc, &q->rspq, q->rspq.size - 1);
2223 
2224 	t3_write_reg(sc, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) |
2225 		     V_NEWTIMER(q->rspq.holdoff_tmr));
2226 
2227 	return (0);
2228 
2229 err_unlock:
2230 	mtx_unlock(&sc->sge.reg_lock);
2231 err:
2232 	t3_free_qset(sc, q);
2233 
2234 	return (ret);
2235 }
2236 
2237 void
2238 t3_rx_eth(struct adapter *adap, struct sge_rspq *rq, struct mbuf *m, int ethpad)
2239 {
2240 	struct cpl_rx_pkt *cpl = (struct cpl_rx_pkt *)(mtod(m, uint8_t *) + ethpad);
2241 	struct port_info *pi = &adap->port[adap->rxpkt_map[cpl->iff]];
2242 	struct ifnet *ifp = pi->ifp;
2243 
2244 	DPRINTF("rx_eth m=%p m->m_data=%p cpl->iff=%d\n", m, mtod(m, uint8_t *), cpl->iff);
2245 
2246 	if ((ifp->if_capenable & IFCAP_RXCSUM) && !cpl->fragment &&
2247 	    cpl->csum_valid && cpl->csum == 0xffff) {
2248 		rspq_to_qset(rq)->port_stats[SGE_PSTAT_RX_CSUM_GOOD]++;
2249 		m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED | CSUM_IP_VALID |
2250 		    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2251 		m->m_pkthdr.csum_data = 0xffff;
2252 	}
2253 	/*
2254 	 * XXX need to add VLAN support for 6.x
2255 	 */
2256 #ifdef VLAN_SUPPORTED
2257 	if (__predict_false(cpl->vlan_valid)) {
2258 		m->m_pkthdr.ether_vtag = ntohs(cpl->vlan);
2259 		m->m_flags |= M_VLANTAG;
2260 	}
2261 #endif
2262 
2263 	m->m_pkthdr.rcvif = ifp;
2264 	m->m_pkthdr.header = mtod(m, uint8_t *) + sizeof(*cpl) + ethpad;
2265 	m_explode(m);
2266 	/*
2267 	 * adjust after conversion to mbuf chain
2268 	 */
2269 	m_adj(m, sizeof(*cpl) + ethpad);
2270 
2271 	(*ifp->if_input)(ifp, m);
2272 }
2273 
2274 /**
2275  *	get_packet - return the next ingress packet buffer from a free list
2276  *	@adap: the adapter that received the packet
2277  *	@drop_thres: # of remaining buffers before we start dropping packets
2278  *	@qs: the qset that the SGE free list holding the packet belongs to
2279  *      @mh: the mbuf header, contains a pointer to the head and tail of the mbuf chain
2280  *      @r: response descriptor
2281  *
2282  *	Get the next packet from a free list and complete setup of the
2283  *	mbuf.  If the packet is small we make a copy and recycle the
2284  *	original buffer, otherwise we use the original buffer itself.  If a
2285  *	positive drop threshold is supplied packets are dropped and their
2286  *	buffers recycled if (a) the number of remaining buffers is under the
2287  *	threshold and the packet is too big to copy, or (b) the packet should
2288  *	be copied but there is no memory for the copy.
2289  */
2290 #ifdef DISABLE_MBUF_IOVEC
2291 
2292 static int
2293 get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
2294     struct t3_mbuf_hdr *mh, struct rsp_desc *r, struct mbuf *m)
2295 {
2296 
2297 	unsigned int len_cq =  ntohl(r->len_cq);
2298 	struct sge_fl *fl = (len_cq & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
2299 	struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];
2300 	uint32_t len = G_RSPD_LEN(len_cq);
2301 	uint32_t flags = ntohl(r->flags);
2302 	uint8_t sopeop = G_RSPD_SOP_EOP(flags);
2303 	int ret = 0;
2304 
2305 	prefetch(sd->cl);
2306 
2307 	fl->credits--;
2308 	bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD);
2309 	bus_dmamap_unload(fl->entry_tag, sd->map);
2310 
2311 	m_cljset(m, sd->cl, fl->type);
2312 	m->m_len = len;
2313 
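	/*
	 * SOP starts a new chain in mh, intermediate buffers are appended to
	 * its tail, and EOP completes the packet; a return of 1 tells the
	 * caller that a full packet is ready in mh->mh_head.
	 */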
2314 	switch(sopeop) {
2315 	case RSPQ_SOP_EOP:
2316 		DBG(DBG_RX, ("get_packet: SOP-EOP m %p\n", m));
2317 		mh->mh_head = mh->mh_tail = m;
2318 		m->m_pkthdr.len = len;
2319 		m->m_flags |= M_PKTHDR;
2320 		ret = 1;
2321 		break;
2322 	case RSPQ_NSOP_NEOP:
2323 		DBG(DBG_RX, ("get_packet: NO_SOP-NO_EOP m %p\n", m));
2324 		m->m_flags &= ~M_PKTHDR;
2325 		if (mh->mh_tail == NULL) {
2326 			if (cxgb_debug)
2327 				printf("discarding intermediate descriptor entry\n");
2328 			m_freem(m);
2329 			break;
2330 		}
2331 		mh->mh_tail->m_next = m;
2332 		mh->mh_tail = m;
2333 		mh->mh_head->m_pkthdr.len += len;
2334 		ret = 0;
2335 		break;
2336 	case RSPQ_SOP:
2337 		DBG(DBG_RX, ("get_packet: SOP m %p\n", m));
2338 		m->m_pkthdr.len = len;
2339 		mh->mh_head = mh->mh_tail = m;
2340 		m->m_flags |= M_PKTHDR;
2341 		ret = 0;
2342 		break;
2343 	case RSPQ_EOP:
2344 		DBG(DBG_RX, ("get_packet: EOP m %p\n", m));
2345 		m->m_flags &= ~M_PKTHDR;
2346 		mh->mh_head->m_pkthdr.len += len;
2347 		mh->mh_tail->m_next = m;
2348 		mh->mh_tail = m;
2349 		ret = 1;
2350 		break;
2351 	}
2352 	if (++fl->cidx == fl->size)
2353 		fl->cidx = 0;
2354 
2355 	return (ret);
2356 }
2357 
2358 #else
2359 static int
2360 get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
2361     struct mbuf *m, struct rsp_desc *r)
2362 {
2363 
2364 	unsigned int len_cq =  ntohl(r->len_cq);
2365 	struct sge_fl *fl = (len_cq & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
2366 	struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];
2367 	uint32_t len = G_RSPD_LEN(len_cq);
2368 	uint32_t flags = ntohl(r->flags);
2369 	uint8_t sopeop = G_RSPD_SOP_EOP(flags);
2370 	void *cl;
2371 	int ret = 0;
2372 
2373 	prefetch(sd->cl);
2374 
2375 	fl->credits--;
2376 	bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD);
2377 
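	/*
	 * Small single-buffer packets are copied into the caller's mbuf so
	 * the cluster can be recycled in place; larger packets take
	 * ownership of the cluster and the DMA map is unloaded.
	 */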
2378 	if (recycle_enable && len <= SGE_RX_COPY_THRES && sopeop == RSPQ_SOP_EOP) {
2379 		cl = mtod(m, void *);
2380 		memcpy(cl, sd->cl, len);
2381 		recycle_rx_buf(adap, fl, fl->cidx);
2382 	} else {
2383 		cl = sd->cl;
2384 		bus_dmamap_unload(fl->entry_tag, sd->map);
2385 	}
2386 	switch(sopeop) {
2387 	case RSPQ_SOP_EOP:
2388 		DBG(DBG_RX, ("get_packet: SOP-EOP m %p\n", m));
2389 		if (cl == sd->cl)
2390 			m_cljset(m, cl, fl->type);
2391 		m->m_len = m->m_pkthdr.len = len;
2392 		ret = 1;
2393 		goto done;
2394 		break;
2395 	case RSPQ_NSOP_NEOP:
2396 		DBG(DBG_RX, ("get_packet: NO_SOP-NO_EOP m %p\n", m));
2397 		ret = 0;
2398 		break;
2399 	case RSPQ_SOP:
2400 		DBG(DBG_RX, ("get_packet: SOP m %p\n", m));
2401 		m_iovinit(m);
2402 		ret = 0;
2403 		break;
2404 	case RSPQ_EOP:
2405 		DBG(DBG_RX, ("get_packet: EOP m %p\n", m));
2406 		ret = 1;
2407 		break;
2408 	}
2409 	m_iovappend(m, cl, fl->buf_size, len, 0);
2410 
2411 done:
2412 	if (++fl->cidx == fl->size)
2413 		fl->cidx = 0;
2414 
2415 	return (ret);
2416 }
2417 #endif
2418 /**
2419  *	handle_rsp_cntrl_info - handles control information in a response
2420  *	@qs: the queue set corresponding to the response
2421  *	@flags: the response control flags
2422  *
2423  *	Handles the control information of an SGE response, such as GTS
2424  *	indications and completion credits for the queue set's Tx queues.
2425  *	HW coalesces credits, we don't do any extra SW coalescing.
2426  */
2427 static __inline void
2428 handle_rsp_cntrl_info(struct sge_qset *qs, uint32_t flags)
2429 {
2430 	unsigned int credits;
2431 
2432 #if USE_GTS
2433 	if (flags & F_RSPD_TXQ0_GTS)
2434 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags);
2435 #endif
2436 	credits = G_RSPD_TXQ0_CR(flags);
2437 	if (credits) {
2438 		qs->txq[TXQ_ETH].processed += credits;
2439 		if (desc_reclaimable(&qs->txq[TXQ_ETH]) > TX_START_MAX_DESC)
2440 			taskqueue_enqueue(qs->port->adapter->tq,
2441 			    &qs->port->timer_reclaim_task);
2442 	}
2443 
2444 	credits = G_RSPD_TXQ2_CR(flags);
2445 	if (credits)
2446 		qs->txq[TXQ_CTRL].processed += credits;
2447 
2448 # if USE_GTS
2449 	if (flags & F_RSPD_TXQ1_GTS)
2450 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags);
2451 # endif
2452 	credits = G_RSPD_TXQ1_CR(flags);
2453 	if (credits)
2454 		qs->txq[TXQ_OFLD].processed += credits;
2455 }
2456 
2457 static void
2458 check_ring_db(adapter_t *adap, struct sge_qset *qs,
2459     unsigned int sleeping)
2460 {
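	/* Empty for now: doorbell handling on GTS indications is not implemented. */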
2461 	;
2462 }
2463 
2464 /**
2465  *	process_responses - process responses from an SGE response queue
2466  *	@adap: the adapter
2467  *	@qs: the queue set to which the response queue belongs
2468  *	@budget: how many responses can be processed in this round
2469  *
2470  *	Process responses from an SGE response queue up to the supplied budget.
2471  *	Responses include received packets as well as credits and other events
2472  *	for the queues that belong to the response queue's queue set.
2473  *	A negative budget is effectively unlimited.
2474  *
2475  *	Additionally choose the interrupt holdoff time for the next interrupt
2476  *	on this queue.  If the system is under memory shortage use a fairly
2477  *	long delay to help recovery.
2478  */
2479 static int
2480 process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
2481 {
2482 	struct sge_rspq *rspq = &qs->rspq;
2483 	struct rsp_desc *r = &rspq->desc[rspq->cidx];
2484 	int budget_left = budget;
2485 	unsigned int sleeping = 0;
2486 	int lro = qs->lro.enabled;
2487 	struct mbuf *offload_mbufs[RX_BUNDLE_SIZE];
2488 	int ngathered = 0;
2489 #ifdef DEBUG
2490 	static int last_holdoff = 0;
2491 	if (cxgb_debug && rspq->holdoff_tmr != last_holdoff) {
2492 		printf("next_holdoff=%d\n", rspq->holdoff_tmr);
2493 		last_holdoff = rspq->holdoff_tmr;
2494 	}
2495 #endif
2496 	rspq->next_holdoff = rspq->holdoff_tmr;
2497 
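	/*
	 * Each iteration consumes one response descriptor.  A response may
	 * carry immediate data, reference a free-list buffer, or be a pure
	 * response conveying only credits and control information.
	 */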
2498 	while (__predict_true(budget_left && is_new_response(r, rspq))) {
2499 		int eth, eop = 0, ethpad = 0;
2500 		uint32_t flags = ntohl(r->flags);
2501 		uint32_t rss_csum = *(const uint32_t *)r;
2502 		uint32_t rss_hash = r->rss_hdr.rss_hash_val;
2503 
2504 		eth = (r->rss_hdr.opcode == CPL_RX_PKT);
2505 
2506 		if (__predict_false(flags & F_RSPD_ASYNC_NOTIF)) {
2507 			/* XXX */
2508 			printf("async notification\n");
2509 
2510 		} else if  (flags & F_RSPD_IMM_DATA_VALID) {
2511 #ifdef DISABLE_MBUF_IOVEC
2512 
2513 			if (cxgb_debug)
2514 				printf("IMM DATA VALID opcode=0x%x rspq->cidx=%d\n", r->rss_hdr.opcode, rspq->cidx);
2515 
2516 			if(get_imm_packet(adap, r, &rspq->rspq_mh) == 0) {
2517 				rspq->next_holdoff = NOMEM_INTR_DELAY;
2518 				budget_left--;
2519 				break;
2520 			} else {
2521 				eop = 1;
2522 			}
2523 #else
2524 			struct mbuf *m = NULL;
2525 
2526 			if (rspq->rspq_mbuf == NULL)
2527 				rspq->rspq_mbuf = m_gethdr(M_DONTWAIT, MT_DATA);
2528 			else
2529 				m = m_gethdr(M_DONTWAIT, MT_DATA);
2530 
2531 			/*
2532 			 * XXX revisit me
2533 			 */
2534 			if (rspq->rspq_mbuf == NULL &&  m == NULL) {
2535 				rspq->next_holdoff = NOMEM_INTR_DELAY;
2536 				budget_left--;
2537 				break;
2538 			}
2539 			if (get_imm_packet(adap, r, rspq->rspq_mbuf, m, flags))
2540 				goto skip;
2541 			eop = 1;
2542 #endif
2543 			rspq->imm_data++;
2544 		} else if (r->len_cq) {
2545 			int drop_thresh = eth ? SGE_RX_DROP_THRES : 0;
2546 
2547 #ifdef DISABLE_MBUF_IOVEC
2548 			struct mbuf *m;
2549 			m = m_gethdr(M_NOWAIT, MT_DATA);
2550 
2551 			if (m == NULL) {
2552 				log(LOG_WARNING, "failed to get mbuf for packet\n");
2553 				break;
2554 			}
2555 
2556 			eop = get_packet(adap, drop_thresh, qs, &rspq->rspq_mh, r, m);
2557 #else
2558 			if (rspq->rspq_mbuf == NULL)
2559 				rspq->rspq_mbuf = m_gethdr(M_DONTWAIT, MT_DATA);
2560 			if (rspq->rspq_mbuf == NULL) {
2561 				log(LOG_WARNING, "failed to get mbuf for packet\n");
2562 				break;
2563 			}
2564 			eop = get_packet(adap, drop_thresh, qs, rspq->rspq_mbuf, r);
2565 #endif
2566 			ethpad = 2;
2567 		} else {
2568 			DPRINTF("pure response\n");
2569 			rspq->pure_rsps++;
2570 		}
2571 
2572 		if (flags & RSPD_CTRL_MASK) {
2573 			sleeping |= flags & RSPD_GTS_MASK;
2574 			handle_rsp_cntrl_info(qs, flags);
2575 		}
2576 #ifndef DISABLE_MBUF_IOVEC
2577 	skip:
2578 #endif
2579 		r++;
2580 		if (__predict_false(++rspq->cidx == rspq->size)) {
2581 			rspq->cidx = 0;
2582 			rspq->gen ^= 1;
2583 			r = rspq->desc;
2584 		}
2585 
2586 		prefetch(r);
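		/*
		 * Return response-queue credits to the hardware once a
		 * quarter of the ring has been consumed.
		 */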
2587 		if (++rspq->credits >= (rspq->size / 4)) {
2588 			refill_rspq(adap, rspq, rspq->credits);
2589 			rspq->credits = 0;
2590 		}
2591 
2592 		if (eop) {
2593 			prefetch(mtod(rspq->rspq_mh.mh_head, uint8_t *));
2594 			prefetch(mtod(rspq->rspq_mh.mh_head, uint8_t *) + L1_CACHE_BYTES);
2595 
2596 			if (eth) {
2597 				t3_rx_eth_lro(adap, rspq, rspq->rspq_mh.mh_head, ethpad,
2598 				    rss_hash, rss_csum, lro);
2599 
2600 				rspq->rspq_mh.mh_head = NULL;
2601 			} else {
2602 				rspq->rspq_mh.mh_head->m_pkthdr.csum_data = rss_csum;
2603 				/*
2604 				 * XXX size mismatch
2605 				 */
2606 				m_set_priority(rspq->rspq_mh.mh_head, rss_hash);
2607 
2608 				ngathered = rx_offload(&adap->tdev, rspq,
2609 				    rspq->rspq_mh.mh_head, offload_mbufs, ngathered);
2610 			}
2611 			__refill_fl(adap, &qs->fl[0]);
2612 			__refill_fl(adap, &qs->fl[1]);
2613 
2614 		}
2615 		--budget_left;
2616 	}
2617 
2618 	deliver_partial_bundle(&adap->tdev, rspq, offload_mbufs, ngathered);
2619 	t3_lro_flush(adap, qs, &qs->lro);
2620 
2621 	if (sleeping)
2622 		check_ring_db(adap, qs, sleeping);
2623 
2624 	smp_mb();  /* commit Tx queue processed updates */
2625 	if (__predict_false(qs->txq_stopped != 0))
2626 		restart_tx(qs);
2627 
2628 	budget -= budget_left;
2629 	return (budget);
2630 }
2631 
2632 /*
2633  * A helper function that processes responses and issues GTS.
2634  */
2635 static __inline int
2636 process_responses_gts(adapter_t *adap, struct sge_rspq *rq)
2637 {
2638 	int work;
2639 	static int last_holdoff = 0;
2640 
2641 	work = process_responses(adap, rspq_to_qset(rq), -1);
2642 
2643 	if (cxgb_debug && (rq->next_holdoff != last_holdoff)) {
2644 		printf("next_holdoff=%d\n", rq->next_holdoff);
2645 		last_holdoff = rq->next_holdoff;
2646 	}
2647 	if (work)
2648 		t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) |
2649 		    V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx));
2650 	return work;
2651 }
2652 
2653 
2654 /*
2655  * Interrupt handler for legacy INTx interrupts for T3B-based cards.
2656  * Handles data events from SGE response queues as well as error and other
2657  * async events as they all use the same interrupt pin.  We use one SGE
2658  * response queue per port in this mode and protect all response queues with
2659  * queue 0's lock.
2660  */
2661 void
2662 t3b_intr(void *data)
2663 {
2664 	uint32_t i, map;
2665 	adapter_t *adap = data;
2666 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
2667 
2668 	t3_write_reg(adap, A_PL_CLI, 0);
2669 	map = t3_read_reg(adap, A_SG_DATA_INTR);
2670 
2671 	if (!map)
2672 		return;
2673 
2674 	if (__predict_false(map & F_ERRINTR))
2675 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
2676 
2677 	mtx_lock(&q0->lock);
2678 	for_each_port(adap, i)
2679 	    if (map & (1 << i))
2680 			process_responses_gts(adap, &adap->sge.qs[i].rspq);
2681 	mtx_unlock(&q0->lock);
2682 }
2683 
2684 /*
2685  * The MSI interrupt handler.  This needs to handle data events from SGE
2686  * response queues as well as error and other async events as they all use
2687  * the same MSI vector.  We use one SGE response queue per port in this mode
2688  * and protect all response queues with queue 0's lock.
2689  */
2690 void
2691 t3_intr_msi(void *data)
2692 {
2693 	adapter_t *adap = data;
2694 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
2695 	int i, new_packets = 0;
2696 
2697 	mtx_lock(&q0->lock);
2698 
2699 	for_each_port(adap, i)
2700 	    if (process_responses_gts(adap, &adap->sge.qs[i].rspq))
2701 		    new_packets = 1;
2702 	mtx_unlock(&q0->lock);
2703 	if (new_packets == 0)
2704 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
2705 }
2706 
2707 void
2708 t3_intr_msix(void *data)
2709 {
2710 	struct sge_qset *qs = data;
2711 	adapter_t *adap = qs->port->adapter;
2712 	struct sge_rspq *rspq = &qs->rspq;
2713 
2714 	mtx_lock(&rspq->lock);
2715 	if (process_responses_gts(adap, rspq) == 0)
2716 		rspq->unhandled_irqs++;
2717 	mtx_unlock(&rspq->lock);
2718 }
2719 
2720 /*
2721  * broken by recent mbuf changes
2722  */
2723 static int
2724 t3_lro_enable(SYSCTL_HANDLER_ARGS)
2725 {
2726 	adapter_t *sc;
2727 	int i, j, enabled, err, nqsets = 0;
2728 
2729 #ifndef LRO_WORKING
2730 	return (0);
2731 #endif
2732 
2733 	sc = arg1;
2734 	enabled = sc->sge.qs[0].lro.enabled;
2735 	err = sysctl_handle_int(oidp, &enabled, arg2, req);
2736 
2737 	if (err != 0)
2738 		return (err);
2739 	if (enabled == sc->sge.qs[0].lro.enabled)
2740 		return (0);
2741 
2742 	for (i = 0; i < sc->params.nports; i++)
2743 		for (j = 0; j < sc->port[i].nqsets; j++)
2744 			nqsets++;
2745 
2746 	for (i = 0; i < nqsets; i++)
2747 		sc->sge.qs[i].lro.enabled = enabled;
2748 
2749 	return (0);
2750 }
2751 
2752 static int
2753 t3_set_coalesce_nsecs(SYSCTL_HANDLER_ARGS)
2754 {
2755 	adapter_t *sc = arg1;
2756 	struct qset_params *qsp = &sc->params.sge.qset[0];
2757 	int coalesce_nsecs;
2758 	struct sge_qset *qs;
2759 	int i, j, err, nqsets = 0;
2760 	struct mtx *lock;
2761 
2762 	coalesce_nsecs = qsp->coalesce_nsecs;
2763 	err = sysctl_handle_int(oidp, &coalesce_nsecs, arg2, req);
2764 
2765 	if (err != 0) {
2766 		return (err);
2767 	}
2768 	if (coalesce_nsecs == qsp->coalesce_nsecs)
2769 		return (0);
2770 
2771 	for (i = 0; i < sc->params.nports; i++)
2772 		for (j = 0; j < sc->port[i].nqsets; j++)
2773 			nqsets++;
2774 
2775 	coalesce_nsecs = max(100, coalesce_nsecs);
2776 
2777 	for (i = 0; i < nqsets; i++) {
2778 		qs = &sc->sge.qs[i];
2779 		qsp = &sc->params.sge.qset[i];
2780 		qsp->coalesce_nsecs = coalesce_nsecs;
2781 
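		/*
		 * With MSI-X each response queue has its own lock; otherwise
		 * all response queues are protected by queue 0's lock.
		 */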
2782 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
2783 			    &sc->sge.qs[0].rspq.lock;
2784 
2785 		mtx_lock(lock);
2786 		t3_update_qset_coalesce(qs, qsp);
2787 		t3_write_reg(sc, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) |
2788 		    V_NEWTIMER(qs->rspq.holdoff_tmr));
2789 		mtx_unlock(lock);
2790 	}
2791 
2792 	return (0);
2793 }
2794 
2795 
2796 void
2797 t3_add_sysctls(adapter_t *sc)
2798 {
2799 	struct sysctl_ctx_list *ctx;
2800 	struct sysctl_oid_list *children;
2801 
2802 	ctx = device_get_sysctl_ctx(sc->dev);
2803 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
2804 
2805 	/* random information */
2806 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
2807 	    "firmware_version",
2808 	    CTLFLAG_RD, &sc->fw_version,
2809 	    0, "firmware version");
2810 
2811 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
2812 	    "enable_lro",
2813 	    CTLTYPE_INT|CTLFLAG_RW, sc,
2814 	    0, t3_lro_enable,
2815 	    "I", "enable large receive offload");
2816 
2817 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
2818 	    "intr_coal",
2819 	    CTLTYPE_INT|CTLFLAG_RW, sc,
2820 	    0, t3_set_coalesce_nsecs,
2821 	    "I", "interrupt coalescing timer (ns)");
2822 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
2823 	    "enable_debug",
2824 	    CTLFLAG_RW, &cxgb_debug,
2825 	    0, "enable verbose debugging output");
2826 
2827 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
2828 	    "collapse_free",
2829 	    CTLFLAG_RD, &collapse_free,
2830 	    0, "frees during collapse");
2831 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
2832 	    "mb_free_vec_free",
2833 	    CTLFLAG_RD, &mb_free_vec_free,
2834 	    0, "frees during mb_free_vec");
2835 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
2836 	    "collapse_mbufs",
2837 	    CTLFLAG_RW, &collapse_mbufs,
2838 	    0, "collapse mbuf chains into iovecs");
2839 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
2840 	    "txq_overrun",
2841 	    CTLFLAG_RD, &txq_fills,
2842 	    0, "#times txq overrun");
2843 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
2844 	    "bogus_imm",
2845 	    CTLFLAG_RD, &bogus_imm,
2846 	    0, "#times a bogus immediate response was seen");
2847 }
2848 
2849 /**
2850  *	t3_get_desc - dump an SGE descriptor for debugging purposes
2851  *	@qs: the queue set
2852  *	@qnum: identifies the specific queue (0..2: Tx, 3:response, 4..5: Rx)
2853  *	@idx: the descriptor index in the queue
2854  *	@data: where to dump the descriptor contents
2855  *
2856  *	Dumps the contents of a HW descriptor of an SGE queue.  Returns the
2857  *	size of the descriptor.
2858  */
2859 int
2860 t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx,
2861 		unsigned char *data)
2862 {
2863 	if (qnum >= 6)
2864 		return (EINVAL);
2865 
2866 	if (qnum < 3) {
2867 		if (!qs->txq[qnum].desc || idx >= qs->txq[qnum].size)
2868 			return (EINVAL);
2869 		memcpy(data, &qs->txq[qnum].desc[idx], sizeof(struct tx_desc));
2870 		return sizeof(struct tx_desc);
2871 	}
2872 
2873 	if (qnum == 3) {
2874 		if (!qs->rspq.desc || idx >= qs->rspq.size)
2875 			return (EINVAL);
2876 		memcpy(data, &qs->rspq.desc[idx], sizeof(struct rsp_desc));
2877 		return sizeof(struct rsp_desc);
2878 	}
2879 
2880 	qnum -= 4;
2881 	if (!qs->fl[qnum].desc || idx >= qs->fl[qnum].size)
2882 		return (EINVAL);
2883 	memcpy(data, &qs->fl[qnum].desc[idx], sizeof(struct rx_desc));
2884 	return sizeof(struct rx_desc);
2885 }
2886