xref: /freebsd/sys/dev/cxgb/cxgb_sge.c (revision 2ca7a12a81a5376ef17d3a3ea70f7000c025c579)
1 /**************************************************************************
2 
3 Copyright (c) 2007, Chelsio Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Neither the name of the Chelsio Corporation nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15 
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27 
28 ***************************************************************************/
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/kernel.h>
36 #include <sys/module.h>
37 #include <sys/bus.h>
38 #include <sys/conf.h>
39 #include <machine/bus.h>
40 #include <machine/resource.h>
41 #include <sys/bus_dma.h>
42 #include <sys/rman.h>
43 #include <sys/queue.h>
44 #include <sys/sysctl.h>
45 #include <sys/syslog.h>
46 #include <sys/taskqueue.h>
47 
48 #include <sys/proc.h>
49 #include <sys/sched.h>
50 #include <sys/smp.h>
51 #include <sys/systm.h>
52 
53 #include <netinet/in_systm.h>
54 #include <netinet/in.h>
55 #include <netinet/ip.h>
56 #include <netinet/tcp.h>
57 
58 #include <dev/pci/pcireg.h>
59 #include <dev/pci/pcivar.h>
60 
61 #include <vm/vm.h>
62 #include <vm/vm_page.h>
63 #include <vm/vm_map.h>
64 
65 #ifdef CONFIG_DEFINED
66 #include <cxgb_include.h>
67 #include <sys/mvec.h>
68 #else
69 #include <dev/cxgb/cxgb_include.h>
70 #include <dev/cxgb/sys/mvec.h>
71 #endif
72 
73 uint32_t collapse_free = 0;
74 uint32_t mb_free_vec_free = 0;
75 int      txq_fills = 0;
76 int      collapse_mbufs = 0;
77 static int bogus_imm = 0;
78 #ifndef DISABLE_MBUF_IOVEC
79 static int recycle_enable = 1;
80 #endif
81 
82 #define USE_GTS 0
83 
84 #define SGE_RX_SM_BUF_SIZE	1536
85 #define SGE_RX_DROP_THRES	16
86 #define SGE_RX_COPY_THRES	128
87 
88 /*
89  * Period of the Tx buffer reclaim timer.  This timer does not need to run
90  * frequently as Tx buffers are usually reclaimed by new Tx packets.
91  */
92 #define TX_RECLAIM_PERIOD       (hz >> 1)
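/*
 * (hz >> 1) is half a second's worth of ticks whatever the kernel tick rate
 * is configured to; with the common hz = 1000, for example, the reclaim
 * timer fires roughly every 500 ticks.
 */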
93 
94 /*
95  * work request size in bytes
96  */
97 #define WR_LEN (WR_FLITS * 8)
98 
99 /*
100  * Values for sge_txq.flags
101  */
102 enum {
103 	TXQ_RUNNING	= 1 << 0,  /* fetch engine is running */
104 	TXQ_LAST_PKT_DB = 1 << 1,  /* last packet rang the doorbell */
105 };
106 
107 struct tx_desc {
108 	uint64_t	flit[TX_DESC_FLITS];
109 } __packed;
110 
111 struct rx_desc {
112 	uint32_t	addr_lo;
113 	uint32_t	len_gen;
114 	uint32_t	gen2;
115 	uint32_t	addr_hi;
116 } __packed;
117 
118 struct rsp_desc {               /* response queue descriptor */
119 	struct rss_header	rss_hdr;
120 	uint32_t		flags;
121 	uint32_t		len_cq;
122 	uint8_t			imm_data[47];
123 	uint8_t			intr_gen;
124 } __packed;
125 
126 #define RX_SW_DESC_MAP_CREATED	(1 << 0)
127 #define TX_SW_DESC_MAP_CREATED	(1 << 1)
128 #define RX_SW_DESC_INUSE        (1 << 3)
129 #define TX_SW_DESC_MAPPED       (1 << 4)
130 
131 #define RSPQ_NSOP_NEOP           G_RSPD_SOP_EOP(0)
132 #define RSPQ_EOP                 G_RSPD_SOP_EOP(F_RSPD_EOP)
133 #define RSPQ_SOP                 G_RSPD_SOP_EOP(F_RSPD_SOP)
134 #define RSPQ_SOP_EOP             G_RSPD_SOP_EOP(F_RSPD_SOP|F_RSPD_EOP)
135 
136 struct tx_sw_desc {                /* SW state per Tx descriptor */
137 	struct mbuf	*m;
138 	bus_dmamap_t	map;
139 	int		flags;
140 };
141 
142 struct rx_sw_desc {                /* SW state per Rx descriptor */
143 	void	        *cl;
144 	bus_dmamap_t	map;
145 	int		flags;
146 };
147 
148 struct txq_state {
149 	unsigned int compl;
150 	unsigned int gen;
151 	unsigned int pidx;
152 };
153 
154 struct refill_fl_cb_arg {
155 	int               error;
156 	bus_dma_segment_t seg;
157 	int               nseg;
158 };
159 
160 /*
161  * Maps a number of flits to the number of Tx descriptors that can hold them.
162  * The formula is
163  *
164  * desc = 1 + (flits - 2) / (WR_FLITS - 1).
165  *
166  * HW allows up to 4 descriptors to be combined into a WR.
167  */
168 static uint8_t flit_desc_map[] = {
169 	0,
170 #if SGE_NUM_GENBITS == 1
171 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
172 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
173 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
174 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
175 #elif SGE_NUM_GENBITS == 2
176 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
177 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
178 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
179 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
180 #else
181 # error "SGE_NUM_GENBITS must be 1 or 2"
182 #endif
183 };
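/*
 * Illustrative check of the formula against the table above, assuming
 * WR_FLITS is 15 in the SGE_NUM_GENBITS == 2 configuration (as the table
 * implies): a 16-flit request needs 1 + (16 - 2) / (15 - 1) = 2 descriptors,
 * matching flit_desc_map[16] == 2.
 */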
184 
185 
186 static int lro_default = 0;
187 int cxgb_debug = 0;
188 
189 static void t3_free_qset(adapter_t *sc, struct sge_qset *q);
190 static void sge_timer_cb(void *arg);
191 static void sge_timer_reclaim(void *arg, int ncount);
192 static void sge_txq_reclaim_handler(void *arg, int ncount);
193 static int free_tx_desc(struct sge_txq *q, int n, struct mbuf **m_vec);
194 
195 /**
196  *	reclaim_completed_tx - reclaims completed Tx descriptors
197  *	@q: the Tx queue to reclaim completed descriptors from
198  *	@nbufs: maximum number of mbufs to collect into @mvec
199  *	@mvec: array in which the reclaimed mbufs are returned
200  *
201  *	Reclaims Tx descriptors that the SGE has indicated it has processed and
202  *	collects the associated mbufs into @mvec.  Called with the Tx queue's lock held.
203  */
204 static __inline int
205 reclaim_completed_tx(struct sge_txq *q, int nbufs, struct mbuf **mvec)
206 {
207 	int reclaimed, reclaim = desc_reclaimable(q);
208 	int n = 0;
209 
210 	mtx_assert(&q->lock, MA_OWNED);
211 	if (reclaim > 0) {
212 		n = free_tx_desc(q, min(reclaim, nbufs), mvec);
213 		reclaimed = min(reclaim, nbufs);
214 		q->cleaned += reclaimed;
215 		q->in_use -= reclaimed;
216 	}
217 	return (n);
218 }
219 
220 /**
221  *	should_restart_tx - are there enough resources to restart a Tx queue?
222  *	@q: the Tx queue
223  *
224  *	Checks if there are enough descriptors to restart a suspended Tx queue.
225  */
226 static __inline int
227 should_restart_tx(const struct sge_txq *q)
228 {
229 	unsigned int r = q->processed - q->cleaned;
230 
231 	return q->in_use - r < (q->size >> 1);
232 }
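/*
 * In other words, the queue is restarted once the descriptors still
 * genuinely outstanding (in_use minus those the SGE has processed but we
 * have not yet cleaned) drop below half the queue size.
 */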
233 
234 /**
235  *	t3_sge_init - initialize SGE
236  *	@adap: the adapter
237  *	@p: the SGE parameters
238  *
239  *	Performs SGE initialization needed every time after a chip reset.
240  *	We do not initialize any of the queue sets here, instead the driver
241  *	top-level must request those individually.  We also do not enable DMA
242  *	here, that should be done after the queues have been set up.
243  */
244 void
245 t3_sge_init(adapter_t *adap, struct sge_params *p)
246 {
247 	u_int ctrl, ups;
248 
249 	ups = 0; /* = ffs(pci_resource_len(adap->pdev, 2) >> 12); */
250 
251 	ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL |
252 	       F_CQCRDTCTRL |
253 	       V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS |
254 	       V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING;
255 #if SGE_NUM_GENBITS == 1
256 	ctrl |= F_EGRGENCTRL;
257 #endif
258 	if (adap->params.rev > 0) {
259 		if (!(adap->flags & (USING_MSIX | USING_MSI)))
260 			ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ;
261 		ctrl |= F_CQCRDTCTRL | F_AVOIDCQOVFL;
262 	}
263 	t3_write_reg(adap, A_SG_CONTROL, ctrl);
264 	t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) |
265 		     V_LORCQDRBTHRSH(512));
266 	t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10);
267 	t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) |
268 		     V_TIMEOUT(200 * core_ticks_per_usec(adap)));
269 	t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH, 1000);
270 	t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256);
271 	t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000);
272 	t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256);
273 	t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff));
274 	t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024);
275 }
276 
277 
278 /**
279  *	sgl_len - calculates the size of an SGL of the given capacity
280  *	@n: the number of SGL entries
281  *
282  *	Calculates the number of flits needed for a scatter/gather list that
283  *	can hold the given number of entries.
284  */
285 static __inline unsigned int
286 sgl_len(unsigned int n)
287 {
288 	return ((3 * n) / 2 + (n & 1));
289 }
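/*
 * Worked example: each SGL entry carries an 8-byte address and a 4-byte
 * length, and make_sgl() below packs two entries into three flits.  For
 * nsegs == 3 this gives sgl_len(3) = (3 * 3) / 2 + (3 & 1) = 5 flits:
 * three flits for the first pair plus two for the odd entry.
 */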
290 
291 /**
292  *	get_imm_packet - return the next ingress packet buffer from a response
293  *	@resp: the response descriptor containing the packet data
294  *
295  *	Return a packet containing the immediate data of the given response.
296  */
297 #ifdef DISABLE_MBUF_IOVEC
298 static __inline int
299 get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct t3_mbuf_hdr *mh)
300 {
301 	struct mbuf *m;
302 	int len;
303 	uint32_t flags = ntohl(resp->flags);
304 	uint8_t sopeop = G_RSPD_SOP_EOP(flags);
305 
306 	/*
307 	 * would be a firmware bug
308 	 */
309 	if (sopeop == RSPQ_NSOP_NEOP || sopeop == RSPQ_SOP)
310 		return (0);
311 
312 	m = m_gethdr(M_NOWAIT, MT_DATA);
313 	len = G_RSPD_LEN(ntohl(resp->len_cq));
314 
315 	if (m) {
316 		MH_ALIGN(m, IMMED_PKT_SIZE);
317 		memcpy(m->m_data, resp->imm_data, IMMED_PKT_SIZE);
318 		m->m_len = len;
319 
320 		switch (sopeop) {
321 		case RSPQ_SOP_EOP:
322 			mh->mh_head = mh->mh_tail = m;
323 			m->m_pkthdr.len = len;
324 			m->m_flags |= M_PKTHDR;
325 			break;
326 		case RSPQ_EOP:
327 			m->m_flags &= ~M_PKTHDR;
328 			mh->mh_head->m_pkthdr.len += len;
329 			mh->mh_tail->m_next = m;
330 			mh->mh_tail = m;
331 			break;
332 		}
333 	}
334 	return (m != NULL);
335 }
336 
337 #else
338 static int
339 get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct mbuf *m, void *cl, uint32_t flags)
340 {
341 	int len, error;
342 	uint8_t sopeop = G_RSPD_SOP_EOP(flags);
343 
344 	/*
345 	 * would be a firmware bug
346 	 */
347 	len = G_RSPD_LEN(ntohl(resp->len_cq));
348 	if (sopeop == RSPQ_NSOP_NEOP || sopeop == RSPQ_SOP) {
349 		if (cxgb_debug)
350 			device_printf(sc->dev, "unexpected value sopeop=%d flags=0x%x len=%d in get_imm_packet\n", sopeop, flags, len);
351 		bogus_imm++;
352 		return (EINVAL);
353 	}
354 	error = 0;
355 	switch (sopeop) {
356 	case RSPQ_SOP_EOP:
357 		m->m_len = m->m_pkthdr.len = len;
358 		memcpy(mtod(m, uint8_t *), resp->imm_data, len);
359 		break;
360 	case RSPQ_EOP:
361 		memcpy(cl, resp->imm_data, len);
362 		m_iovappend(m, cl, MSIZE, len, 0);
363 		break;
364 	default:
365 		bogus_imm++;
366 		error = EINVAL;
367 	}
368 
369 	return (error);
370 }
371 #endif
372 
373 static __inline u_int
374 flits_to_desc(u_int n)
375 {
376 	return (flit_desc_map[n]);
377 }
378 
379 void
380 t3_sge_err_intr_handler(adapter_t *adapter)
381 {
382 	unsigned int v, status;
383 
384 
385 	status = t3_read_reg(adapter, A_SG_INT_CAUSE);
386 
387 	if (status & F_RSPQCREDITOVERFOW)
388 		CH_ALERT(adapter, "SGE response queue credit overflow\n");
389 
390 	if (status & F_RSPQDISABLED) {
391 		v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS);
392 
393 		CH_ALERT(adapter,
394 			 "packet delivered to disabled response queue (0x%x)\n",
395 			 (v >> S_RSPQ0DISABLED) & 0xff);
396 	}
397 
398 	t3_write_reg(adapter, A_SG_INT_CAUSE, status);
399 	if (status & (F_RSPQCREDITOVERFOW | F_RSPQDISABLED))
400 		t3_fatal_err(adapter);
401 }
402 
403 void
404 t3_sge_prep(adapter_t *adap, struct sge_params *p)
405 {
406 	int i;
407 
408 	/* XXX Does ETHER_ALIGN need to be accounted for here? */
409 	p->max_pkt_size = MJUM16BYTES - sizeof(struct cpl_rx_data);
410 
411 	for (i = 0; i < SGE_QSETS; ++i) {
412 		struct qset_params *q = p->qset + i;
413 
414 		q->polling = adap->params.rev > 0;
415 
416 		if (adap->params.nports > 2)
417 			q->coalesce_nsecs = 50000;
418 		else
419 			q->coalesce_nsecs = 5000;
420 
421 		q->rspq_size = RSPQ_Q_SIZE;
422 		q->fl_size = FL_Q_SIZE;
423 		q->jumbo_size = JUMBO_Q_SIZE;
424 		q->txq_size[TXQ_ETH] = TX_ETH_Q_SIZE;
425 		q->txq_size[TXQ_OFLD] = 1024;
426 		q->txq_size[TXQ_CTRL] = 256;
427 		q->cong_thres = 0;
428 	}
429 }
430 
431 int
432 t3_sge_alloc(adapter_t *sc)
433 {
434 
435 	/* The parent tag. */
436 	if (bus_dma_tag_create( NULL,			/* parent */
437 				1, 0,			/* algnmnt, boundary */
438 				BUS_SPACE_MAXADDR,	/* lowaddr */
439 				BUS_SPACE_MAXADDR,	/* highaddr */
440 				NULL, NULL,		/* filter, filterarg */
441 				BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
442 				BUS_SPACE_UNRESTRICTED, /* nsegments */
443 				BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
444 				0,			/* flags */
445 				NULL, NULL,		/* lock, lockarg */
446 				&sc->parent_dmat)) {
447 		device_printf(sc->dev, "Cannot allocate parent DMA tag\n");
448 		return (ENOMEM);
449 	}
450 
451 	/*
452 	 * DMA tag for normal sized RX frames
453 	 */
454 	if (bus_dma_tag_create(sc->parent_dmat, MCLBYTES, 0, BUS_SPACE_MAXADDR,
455 		BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, 1,
456 		MCLBYTES, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_dmat)) {
457 		device_printf(sc->dev, "Cannot allocate RX DMA tag\n");
458 		return (ENOMEM);
459 	}
460 
461 	/*
462 	 * DMA tag for jumbo sized RX frames.
463 	 */
464 	if (bus_dma_tag_create(sc->parent_dmat, MJUMPAGESIZE, 0, BUS_SPACE_MAXADDR,
465 		BUS_SPACE_MAXADDR, NULL, NULL, MJUMPAGESIZE, 1, MJUMPAGESIZE,
466 		BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_jumbo_dmat)) {
467 		device_printf(sc->dev, "Cannot allocate RX jumbo DMA tag\n");
468 		return (ENOMEM);
469 	}
470 
471 	/*
472 	 * DMA tag for TX frames.
473 	 */
474 	if (bus_dma_tag_create(sc->parent_dmat, 1, 0, BUS_SPACE_MAXADDR,
475 		BUS_SPACE_MAXADDR, NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
476 		TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
477 		NULL, NULL, &sc->tx_dmat)) {
478 		device_printf(sc->dev, "Cannot allocate TX DMA tag\n");
479 		return (ENOMEM);
480 	}
481 
482 	return (0);
483 }
484 
485 int
486 t3_sge_free(struct adapter * sc)
487 {
488 
489 	if (sc->tx_dmat != NULL)
490 		bus_dma_tag_destroy(sc->tx_dmat);
491 
492 	if (sc->rx_jumbo_dmat != NULL)
493 		bus_dma_tag_destroy(sc->rx_jumbo_dmat);
494 
495 	if (sc->rx_dmat != NULL)
496 		bus_dma_tag_destroy(sc->rx_dmat);
497 
498 	if (sc->parent_dmat != NULL)
499 		bus_dma_tag_destroy(sc->parent_dmat);
500 
501 	return (0);
502 }
503 
504 void
505 t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p)
506 {
507 
508 	qs->rspq.holdoff_tmr = max(p->coalesce_nsecs/100, 1U);
509 	qs->rspq.polling = 0 /* p->polling */;
510 }
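/*
 * Illustrative numbers: the holdoff timer is programmed in units of the SGE
 * timer tick, which t3_sge_init() above sets to what appears to be ~100ns
 * (core ticks per microsecond divided by 10).  A queue set left at the
 * coalesce_nsecs = 5000 default from t3_sge_prep() therefore ends up with
 * holdoff_tmr = 50.
 */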
511 
512 static void
513 refill_fl_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
514 {
515 	struct refill_fl_cb_arg *cb_arg = arg;
516 
517 	cb_arg->error = error;
518 	cb_arg->seg = segs[0];
519 	cb_arg->nseg = nseg;
520 
521 }
522 
523 /**
524  *	refill_fl - refill an SGE free-buffer list
525  *	@sc: the controller softc
526  *	@q: the free-list to refill
527  *	@n: the number of new buffers to allocate
528  *
529  *	(Re)populate an SGE free-buffer list with up to @n new packet buffers.
530  *	The caller must assure that @n does not exceed the queue's capacity.
531  */
532 static void
533 refill_fl(adapter_t *sc, struct sge_fl *q, int n)
534 {
535 	struct rx_sw_desc *sd = &q->sdesc[q->pidx];
536 	struct rx_desc *d = &q->desc[q->pidx];
537 	struct refill_fl_cb_arg cb_arg;
538 	void *cl;
539 	int err;
540 
541 	cb_arg.error = 0;
542 	while (n--) {
543 		/*
544 		 * We only allocate a cluster here; mbuf allocation happens after rx
545 		 */
546 		if ((cl = m_cljget(NULL, M_DONTWAIT, q->buf_size)) == NULL) {
547 			log(LOG_WARNING, "Failed to allocate cluster\n");
548 			goto done;
549 		}
550 		if ((sd->flags & RX_SW_DESC_MAP_CREATED) == 0) {
551 			if ((err = bus_dmamap_create(q->entry_tag, 0, &sd->map))) {
552 				log(LOG_WARNING, "bus_dmamap_create failed %d\n", err);
553 				uma_zfree(q->zone, cl);
554 				goto done;
555 			}
556 			sd->flags |= RX_SW_DESC_MAP_CREATED;
557 		}
558 		err = bus_dmamap_load(q->entry_tag, sd->map, cl, q->buf_size,
559 		    refill_fl_cb, &cb_arg, 0);
560 
561 		if (err != 0 || cb_arg.error) {
562 			log(LOG_WARNING, "failure in refill_fl %d\n", cb_arg.error);
563 			/*
564 			 * XXX free cluster
565 			 */
566 			return;
567 		}
568 
569 		sd->flags |= RX_SW_DESC_INUSE;
570 		sd->cl = cl;
571 		d->addr_lo = htobe32(cb_arg.seg.ds_addr & 0xffffffff);
572 		d->addr_hi = htobe32(((uint64_t)cb_arg.seg.ds_addr >> 32) & 0xffffffff);
573 		d->len_gen = htobe32(V_FLD_GEN1(q->gen));
574 		d->gen2 = htobe32(V_FLD_GEN2(q->gen));
575 
576 		d++;
577 		sd++;
578 
579 		if (++q->pidx == q->size) {
580 			q->pidx = 0;
581 			q->gen ^= 1;
582 			sd = q->sdesc;
583 			d = q->desc;
584 		}
585 		q->credits++;
586 	}
587 
588 done:
589 	t3_write_reg(sc, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
590 }
591 
592 
593 /**
594  *	free_rx_bufs - free the Rx buffers on an SGE free list
595  *	@sc: the controller softc
596  *	@q: the SGE free list to clean up
597  *
598  *	Release the buffers on an SGE free-buffer Rx queue.  HW fetching from
599  *	this queue should be stopped before calling this function.
600  */
601 static void
602 free_rx_bufs(adapter_t *sc, struct sge_fl *q)
603 {
604 	u_int cidx = q->cidx;
605 
606 	while (q->credits--) {
607 		struct rx_sw_desc *d = &q->sdesc[cidx];
608 
609 		if (d->flags & RX_SW_DESC_INUSE) {
610 			bus_dmamap_unload(q->entry_tag, d->map);
611 			bus_dmamap_destroy(q->entry_tag, d->map);
612 			uma_zfree(q->zone, d->cl);
613 		}
614 		d->cl = NULL;
615 		if (++cidx == q->size)
616 			cidx = 0;
617 	}
618 }
619 
620 static __inline void
621 __refill_fl(adapter_t *adap, struct sge_fl *fl)
622 {
623 	refill_fl(adap, fl, min(16U, fl->size - fl->credits));
624 }
625 
626 #ifndef DISABLE_MBUF_IOVEC
627 /**
628  *	recycle_rx_buf - recycle a receive buffer
629  *	@adapter: the adapter
630  *	@q: the SGE free list
631  *	@idx: index of buffer to recycle
632  *
633  *	Recycles the specified buffer on the given free list by adding it at
634  *	the next available slot on the list.
635  */
636 static void
637 recycle_rx_buf(adapter_t *adap, struct sge_fl *q, unsigned int idx)
638 {
639 	struct rx_desc *from = &q->desc[idx];
640 	struct rx_desc *to   = &q->desc[q->pidx];
641 
642 	q->sdesc[q->pidx] = q->sdesc[idx];
643 	to->addr_lo = from->addr_lo;        // already big endian
644 	to->addr_hi = from->addr_hi;        // likewise
645 	wmb();
646 	to->len_gen = htobe32(V_FLD_GEN1(q->gen));
647 	to->gen2 = htobe32(V_FLD_GEN2(q->gen));
648 	q->credits++;
649 
650 	if (++q->pidx == q->size) {
651 		q->pidx = 0;
652 		q->gen ^= 1;
653 	}
654 	t3_write_reg(adap, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
655 }
656 #endif
657 
658 static void
659 alloc_ring_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
660 {
661 	uint32_t *addr;
662 
663 	addr = arg;
664 	*addr = segs[0].ds_addr;
665 }
666 
667 static int
668 alloc_ring(adapter_t *sc, size_t nelem, size_t elem_size, size_t sw_size,
669     bus_addr_t *phys, void *desc, void *sdesc, bus_dma_tag_t *tag,
670     bus_dmamap_t *map, bus_dma_tag_t parent_entry_tag, bus_dma_tag_t *entry_tag)
671 {
672 	size_t len = nelem * elem_size;
673 	void *s = NULL;
674 	void *p = NULL;
675 	int err;
676 
677 	if ((err = bus_dma_tag_create(sc->parent_dmat, PAGE_SIZE, 0,
678 				      BUS_SPACE_MAXADDR_32BIT,
679 				      BUS_SPACE_MAXADDR, NULL, NULL, len, 1,
680 				      len, 0, NULL, NULL, tag)) != 0) {
681 		device_printf(sc->dev, "Cannot allocate descriptor tag\n");
682 		return (ENOMEM);
683 	}
684 
685 	if ((err = bus_dmamem_alloc(*tag, (void **)&p, BUS_DMA_NOWAIT,
686 				    map)) != 0) {
687 		device_printf(sc->dev, "Cannot allocate descriptor memory\n");
688 		return (ENOMEM);
689 	}
690 
691 	bus_dmamap_load(*tag, *map, p, len, alloc_ring_cb, phys, 0);
692 	bzero(p, len);
693 	*(void **)desc = p;
694 
695 	if (sw_size) {
696 		len = nelem * sw_size;
697 		s = malloc(len, M_DEVBUF, M_WAITOK);
698 		bzero(s, len);
699 		*(void **)sdesc = s;
700 	}
701 	if (parent_entry_tag == NULL)
702 		return (0);
703 
704 	if ((err = bus_dma_tag_create(parent_entry_tag, 1, 0,
705 				      BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR,
706 		                      NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
707 				      TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
708 		                      NULL, NULL, entry_tag)) != 0) {
709 		device_printf(sc->dev, "Cannot allocate descriptor entry tag\n");
710 		return (ENOMEM);
711 	}
712 	return (0);
713 }
714 
715 static void
716 sge_slow_intr_handler(void *arg, int ncount)
717 {
718 	adapter_t *sc = arg;
719 
720 	t3_slow_intr_handler(sc);
721 }
722 
723 /**
724  *	sge_timer_cb - perform periodic maintenance of the SGE queue sets
725  *	@arg: the adapter
726  *
727  *	Runs periodically from a timer to perform maintenance of the adapter's
728  *	SGE queue sets.  It performs the following tasks:
729  *
730  *	a) Cleans up any completed Tx descriptors that may still be pending.
731  *	Normal descriptor cleanup happens when new packets are added to a Tx
732  *	queue so this timer is relatively infrequent and does any cleanup only
733  *	if the Tx queue has not seen any new packets in a while.  We make a
734  *	best effort attempt to reclaim descriptors, in that we don't wait
735  *	around if we cannot get a queue's lock (which most likely is because
736  *	someone else is queueing new packets and so will also handle the clean
737  *	up).  Since control queues use immediate data exclusively we don't
738  *	bother cleaning them up here.
739  *
740  *	b) Replenishes Rx queues that have run out due to memory shortage.
741  *	Normally new Rx buffers are added when existing ones are consumed but
742  *	when out of memory a queue can become empty.  We try to add only a few
743  *	buffers here, the queue will be replenished fully as these new buffers
744  *	are used up if memory shortage has subsided.
745  *
746  *	c) Return coalesced response queue credits in case a response queue is
747  *	starved.
748  *
749  *	d) Ring doorbells for T304 tunnel queues since we have seen doorbell
750  *	fifo overflows and the FW doesn't implement any recovery scheme yet.
751  */
752 static void
753 sge_timer_cb(void *arg)
754 {
755 	adapter_t *sc = arg;
756 	struct port_info *p;
757 	struct sge_qset *qs;
758 	struct sge_txq  *txq;
759 	int i, j;
760 	int reclaim_eth, reclaim_ofl, refill_rx;
761 
762 	for (i = 0; i < sc->params.nports; i++)
763 		for (j = 0; j < sc->port[i].nqsets; j++) {
764 			qs = &sc->sge.qs[i + j];
765 			txq = &qs->txq[0];
766 			reclaim_eth = txq[TXQ_ETH].processed - txq[TXQ_ETH].cleaned;
767 			reclaim_ofl = txq[TXQ_OFLD].processed - txq[TXQ_OFLD].cleaned;
768 			refill_rx = ((qs->fl[0].credits < qs->fl[0].size) ||
769 			    (qs->fl[1].credits < qs->fl[1].size));
770 			if (reclaim_eth || reclaim_ofl || refill_rx) {
771 				p = &sc->port[i];
772 				taskqueue_enqueue(p->tq, &p->timer_reclaim_task);
773 				break;
774 			}
775 		}
776 	if (sc->params.nports > 2) {
777 		int i;
778 
779 		for_each_port(sc, i) {
780 			struct port_info *pi = &sc->port[i];
781 
782 			t3_write_reg(sc, A_SG_KDOORBELL,
783 				     F_SELEGRCNTX |
784 				     (FW_TUNNEL_SGEEC_START + pi->first_qset));
785 		}
786 	}
787 	if (sc->open_device_map != 0)
788 		callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
789 }
790 
791 /*
792  * This is meant to be a catch-all function to keep sge state private
793  * to sge.c
794  *
795  */
796 int
797 t3_sge_init_adapter(adapter_t *sc)
798 {
799 	callout_init(&sc->sge_timer_ch, CALLOUT_MPSAFE);
800 	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
801 	TASK_INIT(&sc->slow_intr_task, 0, sge_slow_intr_handler, sc);
802 	return (0);
803 }
804 
805 int
806 t3_sge_init_port(struct port_info *p)
807 {
808 	TASK_INIT(&p->timer_reclaim_task, 0, sge_timer_reclaim, p);
809 	return (0);
810 }
811 
812 void
813 t3_sge_deinit_sw(adapter_t *sc)
814 {
815 	int i;
816 
817 	callout_drain(&sc->sge_timer_ch);
818 	if (sc->tq)
819 		taskqueue_drain(sc->tq, &sc->slow_intr_task);
820 	for (i = 0; i < sc->params.nports; i++)
821 		if (sc->port[i].tq != NULL)
822 			taskqueue_drain(sc->port[i].tq, &sc->port[i].timer_reclaim_task);
823 }
824 
825 /**
826  *	refill_rspq - replenish an SGE response queue
827  *	@adapter: the adapter
828  *	@q: the response queue to replenish
829  *	@credits: how many new responses to make available
830  *
831  *	Replenishes a response queue by making the supplied number of responses
832  *	available to HW.
833  */
834 static __inline void
835 refill_rspq(adapter_t *sc, const struct sge_rspq *q, u_int credits)
836 {
837 
838 	/* mbufs are allocated on demand when a rspq entry is processed. */
839 	t3_write_reg(sc, A_SG_RSPQ_CREDIT_RETURN,
840 		     V_RSPQ(q->cntxt_id) | V_CREDITS(credits));
841 }
842 
843 static __inline void
844 sge_txq_reclaim_(struct sge_txq *txq)
845 {
846 	int reclaimable, i, n;
847 	struct mbuf *m_vec[TX_CLEAN_MAX_DESC];
848 	struct port_info *p;
849 
850 	p = txq->port;
851 reclaim_more:
852 	n = 0;
853 	reclaimable = desc_reclaimable(txq);
854 	if (reclaimable > 0 && mtx_trylock(&txq->lock)) {
855 		n = reclaim_completed_tx(txq, TX_CLEAN_MAX_DESC, m_vec);
856 		mtx_unlock(&txq->lock);
857 	}
858 	if (n == 0)
859 		return;
860 
861 	for (i = 0; i < n; i++) {
862 		m_freem(m_vec[i]);
863 	}
864 	if (p && (p->ifp->if_drv_flags & IFF_DRV_OACTIVE) &&
865 	    txq->size - txq->in_use >= TX_START_MAX_DESC) {
866 		txq_fills++;
867 		p->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
868 		taskqueue_enqueue(p->tq, &p->start_task);
869 	}
870 
871 	if (n)
872 		goto reclaim_more;
873 }
874 
875 static void
876 sge_txq_reclaim_handler(void *arg, int ncount)
877 {
878 	struct sge_txq *q = arg;
879 
880 	sge_txq_reclaim_(q);
881 }
882 
883 static void
884 sge_timer_reclaim(void *arg, int ncount)
885 {
886 	struct port_info *p = arg;
887 	int i, nqsets = p->nqsets;
888 	adapter_t *sc = p->adapter;
889 	struct sge_qset *qs;
890 	struct sge_txq *txq;
891 	struct mtx *lock;
892 
893 	for (i = 0; i < nqsets; i++) {
894 		qs = &sc->sge.qs[i];
895 		txq = &qs->txq[TXQ_ETH];
896 		sge_txq_reclaim_(txq);
897 
898 		txq = &qs->txq[TXQ_OFLD];
899 		sge_txq_reclaim_(txq);
900 
901 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
902 			    &sc->sge.qs[0].rspq.lock;
903 
904 		if (mtx_trylock(lock)) {
905 			/* XXX currently assume that we are *NOT* polling */
906 			uint32_t status = t3_read_reg(sc, A_SG_RSPQ_FL_STATUS);
907 
908 			if (qs->fl[0].credits < qs->fl[0].size - 16)
909 				__refill_fl(sc, &qs->fl[0]);
910 			if (qs->fl[1].credits < qs->fl[1].size - 16)
911 				__refill_fl(sc, &qs->fl[1]);
912 
913 			if (status & (1 << qs->rspq.cntxt_id)) {
914 				if (qs->rspq.credits) {
915 					refill_rspq(sc, &qs->rspq, 1);
916 					qs->rspq.credits--;
917 					t3_write_reg(sc, A_SG_RSPQ_FL_STATUS,
918 					    1 << qs->rspq.cntxt_id);
919 				}
920 			}
921 			mtx_unlock(lock);
922 		}
923 	}
924 }
925 
926 /**
927  *	init_qset_cntxt - initialize an SGE queue set context info
928  *	@qs: the queue set
929  *	@id: the queue set id
930  *
931  *	Initializes the TIDs and context ids for the queues of a queue set.
932  */
933 static void
934 init_qset_cntxt(struct sge_qset *qs, u_int id)
935 {
936 
937 	qs->rspq.cntxt_id = id;
938 	qs->fl[0].cntxt_id = 2 * id;
939 	qs->fl[1].cntxt_id = 2 * id + 1;
940 	qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id;
941 	qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id;
942 	qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id;
943 	qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id;
944 	qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id;
945 }
946 
947 
948 static void
949 txq_prod(struct sge_txq *txq, unsigned int ndesc, struct txq_state *txqs)
950 {
951 	txq->in_use += ndesc;
952 	/*
953 	 * XXX we don't handle stopping of the queue here;
954 	 * presumably start handles this when we bump against the end
955 	 */
956 	txqs->gen = txq->gen;
957 	txq->unacked += ndesc;
958 	txqs->compl = (txq->unacked & 8) << (S_WR_COMPL - 3);
959 	txq->unacked &= 7;
960 	txqs->pidx = txq->pidx;
961 	txq->pidx += ndesc;
962 
963 	if (txq->pidx >= txq->size) {
964 		txq->pidx -= txq->size;
965 		txq->gen ^= 1;
966 	}
967 
968 }
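/*
 * Note on the completion request above: unacked accumulates the number of
 * descriptors issued; once the running total reaches 8 the (unacked & 8)
 * term, shifted into the WR completion-request bit position, asks the SGE
 * for a credit update, and unacked is then truncated back to its low three
 * bits.  The net effect is a completion roughly every eight descriptors.
 */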
969 
970 /**
971  *	calc_tx_descs - calculate the number of Tx descriptors for a packet
972  *	@m: the packet mbufs
973  *      @nsegs: the number of segments
974  *
975  * 	Returns the number of Tx descriptors needed for the given Ethernet
976  * 	packet.  Ethernet packets require addition of WR and CPL headers.
977  */
978 static __inline unsigned int
979 calc_tx_descs(const struct mbuf *m, int nsegs)
980 {
981 	unsigned int flits;
982 
983 	if (m->m_pkthdr.len <= WR_LEN - sizeof(struct cpl_tx_pkt))
984 		return 1;
985 
986 	flits = sgl_len(nsegs) + 2;
987 #ifdef TSO_SUPPORTED
988 	if  (m->m_pkthdr.csum_flags & (CSUM_TSO))
989 		flits++;
990 #endif
991 	return flits_to_desc(flits);
992 }
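/*
 * A small worked example: a 3-segment packet too large for immediate data
 * needs sgl_len(3) = 5 SGL flits plus 2 flits of WR/CPL header, i.e. 7
 * flits, and flit_desc_map[7] == 1, so it still fits in a single Tx
 * descriptor; TSO adds one more flit for the LSO header.
 */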
993 
994 static unsigned int
995 busdma_map_mbufs(struct mbuf **m, struct sge_txq *txq,
996     struct tx_sw_desc *stx, bus_dma_segment_t *segs, int *nsegs)
997 {
998 	struct mbuf *m0;
999 	int err, pktlen;
1000 
1001 	m0 = *m;
1002 	pktlen = m0->m_pkthdr.len;
1003 
1004 	err = bus_dmamap_load_mbuf_sg(txq->entry_tag, stx->map, m0, segs, nsegs, 0);
1005 #ifdef DEBUG
1006 	if (err) {
1007 		int n = 0;
1008 		struct mbuf *mtmp = m0;
1009 		while(mtmp) {
1010 			n++;
1011 			mtmp = mtmp->m_next;
1012 		}
1013 		printf("map_mbufs: bus_dmamap_load_mbuf_sg failed with %d - pkthdr.len==%d nmbufs=%d\n",
1014 		    err, m0->m_pkthdr.len, n);
1015 	}
1016 #endif
1017 	if (err == EFBIG) {
1018 		/* Too many segments, try to defrag */
1019 		m0 = m_defrag(m0, M_DONTWAIT);
1020 		if (m0 == NULL) {
1021 			m_freem(*m);
1022 			*m = NULL;
1023 			return (ENOBUFS);
1024 		}
1025 		*m = m0;
1026 		err = bus_dmamap_load_mbuf_sg(txq->entry_tag, stx->map, m0, segs, nsegs, 0);
1027 	}
1028 
1029 	if (err == ENOMEM) {
1030 		return (err);
1031 	}
1032 
1033 	if (err) {
1034 		if (cxgb_debug)
1035 			printf("map failure err=%d pktlen=%d\n", err, pktlen);
1036 		m_freem(m0);
1037 		*m = NULL;
1038 		return (err);
1039 	}
1040 
1041 	bus_dmamap_sync(txq->entry_tag, stx->map, BUS_DMASYNC_PREWRITE);
1042 	stx->flags |= TX_SW_DESC_MAPPED;
1043 
1044 	return (0);
1045 }
1046 
1047 /**
1048  *	make_sgl - populate a scatter/gather list for a packet
1049  *	@sgp: the SGL to populate
1050  *	@segs: the packet dma segments
1051  *	@nsegs: the number of segments
1052  *
1053  *	Generates a scatter/gather list for the buffers that make up a packet.
1054  *	The caller must compute the SGL size in flits (see sgl_len()) and size
1055  *	the destination SGL appropriately.
1056  */
1057 static __inline void
1058 make_sgl(struct sg_ent *sgp, bus_dma_segment_t *segs, int nsegs)
1059 {
1060 	int i, idx;
1061 
1062 	for (idx = 0, i = 0; i < nsegs; i++, idx ^= 1) {
1063 		if (i && idx == 0)
1064 			++sgp;
1065 
1066 		sgp->len[idx] = htobe32(segs[i].ds_len);
1067 		sgp->addr[idx] = htobe64(segs[i].ds_addr);
1068 	}
1069 
1070 	if (idx)
1071 		sgp->len[idx] = 0;
1072 }
1073 
1074 /**
1075  *	check_ring_tx_db - check and potentially ring a Tx queue's doorbell
1076  *	@adap: the adapter
1077  *	@q: the Tx queue
1078  *
1079  *	Ring the doorbell if a Tx queue is asleep.  There is a natural race
1080  *	where the HW may go to sleep just after we check; in that case the
1081  *	interrupt handler will detect the outstanding TX packet and ring the
1082  *	doorbell for us.
1083  *
1084  *	When GTS is disabled we unconditionally ring the doorbell.
1085  */
1086 static __inline void
1087 check_ring_tx_db(adapter_t *adap, struct sge_txq *q)
1088 {
1089 #if USE_GTS
1090 	clear_bit(TXQ_LAST_PKT_DB, &q->flags);
1091 	if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) {
1092 		set_bit(TXQ_LAST_PKT_DB, &q->flags);
1093 #ifdef T3_TRACE
1094 		T3_TRACE1(adap->tb[q->cntxt_id & 7], "doorbell Tx, cntxt %d",
1095 			  q->cntxt_id);
1096 #endif
1097 		t3_write_reg(adap, A_SG_KDOORBELL,
1098 			     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1099 	}
1100 #else
1101 	wmb();            /* write descriptors before telling HW */
1102 	t3_write_reg(adap, A_SG_KDOORBELL,
1103 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1104 #endif
1105 }
1106 
1107 static __inline void
1108 wr_gen2(struct tx_desc *d, unsigned int gen)
1109 {
1110 #if SGE_NUM_GENBITS == 2
1111 	d->flit[TX_DESC_FLITS - 1] = htobe64(gen);
1112 #endif
1113 }
1114 
1115 
1116 
1117 /**
1118  *	write_wr_hdr_sgl - write a WR header and, optionally, SGL
1119  *	@ndesc: number of Tx descriptors spanned by the SGL
1120  *	@txd: first Tx descriptor to be written
1121  *	@txqs: txq state (generation and producer index)
1122  *	@txq: the SGE Tx queue
1123  *	@sgl: the SGL
1124  *	@flits: number of flits to the start of the SGL in the first descriptor
1125  *	@sgl_flits: the SGL size in flits
1126  *	@wr_hi: top 32 bits of WR header based on WR type (big endian)
1127  *	@wr_lo: low 32 bits of WR header based on WR type (big endian)
1128  *
1129  *	Write a work request header and an associated SGL.  If the SGL is
1130  *	small enough to fit into one Tx descriptor it has already been written
1131  *	and we just need to write the WR header.  Otherwise we distribute the
1132  *	SGL across the number of descriptors it spans.
1133  */
1134 
1135 static void
1136 write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs,
1137     const struct sge_txq *txq, const struct sg_ent *sgl, unsigned int flits,
1138     unsigned int sgl_flits, unsigned int wr_hi, unsigned int wr_lo)
1139 {
1140 
1141 	struct work_request_hdr *wrp = (struct work_request_hdr *)txd;
1142 	struct tx_sw_desc *txsd = &txq->sdesc[txqs->pidx];
1143 
1144 	if (__predict_true(ndesc == 1)) {
1145 		wrp->wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1146 		    V_WR_SGLSFLT(flits)) | wr_hi;
1147 		wmb();
1148 		wrp->wr_lo = htonl(V_WR_LEN(flits + sgl_flits) |
1149 		    V_WR_GEN(txqs->gen)) | wr_lo;
1150 		/* XXX gen? */
1151 		wr_gen2(txd, txqs->gen);
1152 	} else {
1153 		unsigned int ogen = txqs->gen;
1154 		const uint64_t *fp = (const uint64_t *)sgl;
1155 		struct work_request_hdr *wp = wrp;
1156 
1157 		wrp->wr_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) |
1158 		    V_WR_SGLSFLT(flits)) | wr_hi;
1159 
1160 		while (sgl_flits) {
1161 			unsigned int avail = WR_FLITS - flits;
1162 
1163 			if (avail > sgl_flits)
1164 				avail = sgl_flits;
1165 			memcpy(&txd->flit[flits], fp, avail * sizeof(*fp));
1166 			sgl_flits -= avail;
1167 			ndesc--;
1168 			if (!sgl_flits)
1169 				break;
1170 
1171 			fp += avail;
1172 			txd++;
1173 			txsd++;
1174 			if (++txqs->pidx == txq->size) {
1175 				txqs->pidx = 0;
1176 				txqs->gen ^= 1;
1177 				txd = txq->desc;
1178 				txsd = txq->sdesc;
1179 			}
1180 
1181 			/*
1182 			 * when the head of the mbuf chain
1183 			 * is freed all clusters will be freed
1184 			 * with it
1185 			 */
1186 			txsd->m = NULL;
1187 			wrp = (struct work_request_hdr *)txd;
1188 			wrp->wr_hi = htonl(V_WR_DATATYPE(1) |
1189 			    V_WR_SGLSFLT(1)) | wr_hi;
1190 			wrp->wr_lo = htonl(V_WR_LEN(min(WR_FLITS,
1191 				    sgl_flits + 1)) |
1192 			    V_WR_GEN(txqs->gen)) | wr_lo;
1193 			wr_gen2(txd, txqs->gen);
1194 			flits = 1;
1195 		}
1196 		wrp->wr_hi |= htonl(F_WR_EOP);
1197 		wmb();
1198 		wp->wr_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo;
1199 		wr_gen2((struct tx_desc *)wp, ogen);
1200 	}
1201 }
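/*
 * For example (assuming WR_FLITS == 15), a 20-flit request made up of 2
 * header flits and 18 SGL flits spans flit_desc_map[20] == 2 descriptors:
 * the first carries the WR header plus 13 SGL flits, the second gets a
 * one-flit continuation header followed by the remaining 5 SGL flits.
 */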
1202 
1203 
1204 /* sizeof(*eh) + sizeof(*vhdr) + sizeof(*ip) + sizeof(*tcp) */
1205 #define TCPPKTHDRSIZE (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + 20 + 20)
1206 
1207 int
1208 t3_encap(struct port_info *p, struct mbuf **m, int *free)
1209 {
1210 	adapter_t *sc;
1211 	struct mbuf *m0;
1212 	struct sge_qset *qs;
1213 	struct sge_txq *txq;
1214 	struct tx_sw_desc *stx;
1215 	struct txq_state txqs;
1216 	unsigned int ndesc, flits, cntrl, mlen;
1217 	int err, nsegs, tso_info = 0;
1218 
1219 	struct work_request_hdr *wrp;
1220 	struct tx_sw_desc *txsd;
1221 	struct sg_ent *sgp, sgl[TX_MAX_SEGS / 2 + 1];
1222 	bus_dma_segment_t segs[TX_MAX_SEGS];
1223 	uint32_t wr_hi, wr_lo, sgl_flits;
1224 
1225 	struct tx_desc *txd;
1226 	struct cpl_tx_pkt *cpl;
1227 
1228 	m0 = *m;
1229 	sc = p->adapter;
1230 
1231 	DPRINTF("t3_encap port_id=%d qsidx=%d ", p->port_id, p->first_qset);
1232 
1233 	/* port_id=1 qsid=1 txpkt_intf=2 tx_chan=0 */
1234 
1235 	qs = &sc->sge.qs[p->first_qset];
1236 
1237 	txq = &qs->txq[TXQ_ETH];
1238 	stx = &txq->sdesc[txq->pidx];
1239 	txd = &txq->desc[txq->pidx];
1240 	cpl = (struct cpl_tx_pkt *)txd;
1241 	mlen = m0->m_pkthdr.len;
1242 	cpl->len = htonl(mlen | 0x80000000);
1243 
1244 	DPRINTF("mlen=%d txpkt_intf=%d tx_chan=%d\n", mlen, p->txpkt_intf, p->tx_chan);
1245 	/*
1246 	 * XXX handle checksum, TSO, and VLAN here
1247 	 *
1248 	 */
1249 	cntrl = V_TXPKT_INTF(p->txpkt_intf);
1250 
1251 	/*
1252 	 * XXX need to add VLAN support for 6.x
1253 	 */
1254 #ifdef VLAN_SUPPORTED
1255 	if (m0->m_flags & M_VLANTAG)
1256 		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
1257 	if  (m0->m_pkthdr.csum_flags & (CSUM_TSO))
1258 		tso_info = V_LSO_MSS(m0->m_pkthdr.tso_segsz);
1259 #endif
1260 	if (tso_info) {
1261 		int eth_type;
1262 		struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *) cpl;
1263 		struct ip *ip;
1264 		struct tcphdr *tcp;
1265 		char *pkthdr, tmp[TCPPKTHDRSIZE]; /* is this too large for the stack? */
1266 
1267 		txd->flit[2] = 0;
1268 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO);
1269 		hdr->cntrl = htonl(cntrl);
1270 
1271 		if (__predict_false(m0->m_len < TCPPKTHDRSIZE)) {
1272 			pkthdr = &tmp[0];
1273 			m_copydata(m0, 0, TCPPKTHDRSIZE, pkthdr);
1274 		} else {
1275 			pkthdr = mtod(m0, char *);
1276 		}
1277 
1278 		if (__predict_false(m0->m_flags & M_VLANTAG)) {
1279 			eth_type = CPL_ETH_II_VLAN;
1280 			ip = (struct ip *)(pkthdr + ETHER_HDR_LEN +
1281 			    ETHER_VLAN_ENCAP_LEN);
1282 		} else {
1283 			eth_type = CPL_ETH_II;
1284 			ip = (struct ip *)(pkthdr + ETHER_HDR_LEN);
1285 		}
1286 		tcp = (struct tcphdr *)((uint8_t *)ip +
1287 		    sizeof(*ip));
1288 
1289 		tso_info |= V_LSO_ETH_TYPE(eth_type) |
1290 			    V_LSO_IPHDR_WORDS(ip->ip_hl) |
1291 			    V_LSO_TCPHDR_WORDS(tcp->th_off);
1292 		hdr->lso_info = htonl(tso_info);
1293 		flits = 3;
1294 	} else {
1295 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1296 		cpl->cntrl = htonl(cntrl);
1297 
1298 		if (mlen <= WR_LEN - sizeof(*cpl)) {
1299 			txq_prod(txq, 1, &txqs);
1300 			txq->sdesc[txqs.pidx].m = NULL;
1301 
1302 			if (m0->m_len == m0->m_pkthdr.len)
1303 				memcpy(&txd->flit[2], mtod(m0, uint8_t *), mlen);
1304 			else
1305 				m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[2]);
1306 
1307 			*free = 1;
1308 			flits = (mlen + 7) / 8 + 2;
1309 			cpl->wr.wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1310 					  V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1311 					  F_WR_SOP | F_WR_EOP | txqs.compl);
1312 			wmb();
1313 			cpl->wr.wr_lo = htonl(V_WR_LEN(flits) |
1314 			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1315 
1316 			wr_gen2(txd, txqs.gen);
1317 			check_ring_tx_db(sc, txq);
1318 			return (0);
1319 		}
1320 		flits = 2;
1321 	}
1322 
1323 	wrp = (struct work_request_hdr *)txd;
1324 
1325 	if ((err = busdma_map_mbufs(m, txq, stx, segs, &nsegs)) != 0) {
1326 		return (err);
1327 	}
1328 	m0 = *m;
1329 	ndesc = calc_tx_descs(m0, nsegs);
1330 
1331 	sgp = (ndesc == 1) ? (struct sg_ent *)&txd->flit[flits] : sgl;
1332 	make_sgl(sgp, segs, nsegs);
1333 
1334 	sgl_flits = sgl_len(nsegs);
1335 
1336 	DPRINTF("make_sgl success nsegs==%d ndesc==%d\n", nsegs, ndesc);
1337 	txq_prod(txq, ndesc, &txqs);
1338 	txsd = &txq->sdesc[txqs.pidx];
1339 	wr_hi = htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1340 	wr_lo = htonl(V_WR_TID(txq->token));
1341 	txsd->m = m0;
1342 	m_set_priority(m0, txqs.pidx);
1343 
1344 	write_wr_hdr_sgl(ndesc, txd, &txqs, txq, sgl, flits, sgl_flits, wr_hi, wr_lo);
1345 	check_ring_tx_db(p->adapter, txq);
1346 
1347 	return (0);
1348 }
1349 
1350 
1351 /**
1352  *	write_imm - write a packet into a Tx descriptor as immediate data
1353  *	@d: the Tx descriptor to write
1354  *	@m: the packet
1355  *	@len: the length of packet data to write as immediate data
1356  *	@gen: the generation bit value to write
1357  *
1358  *	Writes a packet as immediate data into a Tx descriptor.  The packet
1359  *	contains a work request at its beginning.  We must write the packet
1360  *	carefully so the SGE doesn't read accidentally before it's written in
1361  *	carefully so the SGE doesn't read it accidentally before it's written in
1362  */
1363 static __inline void
1364 write_imm(struct tx_desc *d, struct mbuf *m,
1365 	  unsigned int len, unsigned int gen)
1366 {
1367 	struct work_request_hdr *from = mtod(m, struct work_request_hdr *);
1368 	struct work_request_hdr *to = (struct work_request_hdr *)d;
1369 
1370 	memcpy(&to[1], &from[1], len - sizeof(*from));
1371 	to->wr_hi = from->wr_hi | htonl(F_WR_SOP | F_WR_EOP |
1372 					V_WR_BCNTLFLT(len & 7));
1373 	wmb();
1374 	to->wr_lo = from->wr_lo | htonl(V_WR_GEN(gen) |
1375 					V_WR_LEN((len + 7) / 8));
1376 	wr_gen2(d, gen);
1377 	m_freem(m);
1378 }
1379 
1380 /**
1381  *	check_desc_avail - check descriptor availability on a send queue
1382  *	@adap: the adapter
1383  *	@q: the TX queue
1384  *	@m: the packet needing the descriptors
1385  *	@ndesc: the number of Tx descriptors needed
1386  *	@qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL)
1387  *
1388  *	Checks if the requested number of Tx descriptors is available on an
1389  *	SGE send queue.  If the queue is already suspended or not enough
1390  *	descriptors are available the packet is queued for later transmission.
1391  *	Must be called with the Tx queue locked.
1392  *
1393  *	Returns 0 if enough descriptors are available, 1 if there aren't
1394  *	enough descriptors and the packet has been queued, and 2 if the caller
1395  *	needs to retry because there weren't enough descriptors at the
1396  *	beginning of the call but some freed up in the mean time.
1397  */
1398 static __inline int
1399 check_desc_avail(adapter_t *adap, struct sge_txq *q,
1400 		 struct mbuf *m, unsigned int ndesc,
1401 		 unsigned int qid)
1402 {
1403 	/*
1404 	 * XXX We currently only use this for checking the control queue; the
1405 	 * control queue is only used for binding qsets, which happens at init
1406 	 * time, so we are guaranteed enough descriptors.
1407 	 */
1408 	if (__predict_false(!mbufq_empty(&q->sendq))) {
1409 addq_exit:	mbufq_tail(&q->sendq, m);
1410 		return 1;
1411 	}
1412 	if (__predict_false(q->size - q->in_use < ndesc)) {
1413 
1414 		struct sge_qset *qs = txq_to_qset(q, qid);
1415 
1416 		setbit(&qs->txq_stopped, qid);
1417 		smp_mb();
1418 
1419 		if (should_restart_tx(q) &&
1420 		    test_and_clear_bit(qid, &qs->txq_stopped))
1421 			return 2;
1422 
1423 		q->stops++;
1424 		goto addq_exit;
1425 	}
1426 	return 0;
1427 }
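/*
 * Typical caller pattern, sketched from ctrl_xmit() and ofld_xmit() below:
 *
 *	mtx_lock(&q->lock);
 * again:
 *	reclaim completed descriptors;
 *	ret = check_desc_avail(adap, q, m, ndesc, qid);
 *	if (ret == 2)
 *		goto again;	(descriptors freed up, retry)
 *	if (ret == 1)
 *		return;		(packet queued on q->sendq for later)
 *
 * i.e. a return of 2 sends the caller back to the reclaim step because
 * descriptors became available between the initial check and the queue
 * being marked stopped.
 */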
1428 
1429 
1430 /**
1431  *	reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
1432  *	@q: the SGE control Tx queue
1433  *
1434  *	This is a variant of reclaim_completed_tx() that is used for Tx queues
1435  *	that send only immediate data (presently just the control queues) and
1436  *	thus do not have any mbufs.
1437  */
1438 static __inline void
1439 reclaim_completed_tx_imm(struct sge_txq *q)
1440 {
1441 	unsigned int reclaim = q->processed - q->cleaned;
1442 
1443 	mtx_assert(&q->lock, MA_OWNED);
1444 
1445 	q->in_use -= reclaim;
1446 	q->cleaned += reclaim;
1447 }
1448 
1449 static __inline int
1450 immediate(const struct mbuf *m)
1451 {
1452 	return m->m_len <= WR_LEN && m->m_pkthdr.len <= WR_LEN;
1453 }
1454 
1455 /**
1456  *	ctrl_xmit - send a packet through an SGE control Tx queue
1457  *	@adap: the adapter
1458  *	@q: the control queue
1459  *	@m: the packet
1460  *
1461  *	Send a packet through an SGE control Tx queue.  Packets sent through
1462  *	a control queue must fit entirely as immediate data in a single Tx
1463  *	descriptor and have no page fragments.
1464  */
1465 static int
1466 ctrl_xmit(adapter_t *adap, struct sge_txq *q, struct mbuf *m)
1467 {
1468 	int ret;
1469 	struct work_request_hdr *wrp = mtod(m, struct work_request_hdr *);
1470 
1471 	if (__predict_false(!immediate(m))) {
1472 		m_freem(m);
1473 		return 0;
1474 	}
1475 
1476 	wrp->wr_hi |= htonl(F_WR_SOP | F_WR_EOP);
1477 	wrp->wr_lo = htonl(V_WR_TID(q->token));
1478 
1479 	mtx_lock(&q->lock);
1480 again:	reclaim_completed_tx_imm(q);
1481 
1482 	ret = check_desc_avail(adap, q, m, 1, TXQ_CTRL);
1483 	if (__predict_false(ret)) {
1484 		if (ret == 1) {
1485 			mtx_unlock(&q->lock);
1486 			return (-1);
1487 		}
1488 		goto again;
1489 	}
1490 
1491 	write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
1492 
1493 	q->in_use++;
1494 	if (++q->pidx >= q->size) {
1495 		q->pidx = 0;
1496 		q->gen ^= 1;
1497 	}
1498 	mtx_unlock(&q->lock);
1499 	wmb();
1500 	t3_write_reg(adap, A_SG_KDOORBELL,
1501 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1502 	return (0);
1503 }
1504 
1505 
1506 /**
1507  *	restart_ctrlq - restart a suspended control queue
1508  *	@qs: the queue set containing the control queue
1509  *
1510  *	Resumes transmission on a suspended Tx control queue.
1511  */
1512 static void
1513 restart_ctrlq(void *data, int npending)
1514 {
1515 	struct mbuf *m;
1516 	struct sge_qset *qs = (struct sge_qset *)data;
1517 	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1518 	adapter_t *adap = qs->port->adapter;
1519 
1520 	mtx_lock(&q->lock);
1521 again:	reclaim_completed_tx_imm(q);
1522 
1523 	while (q->in_use < q->size &&
1524 	       (m = mbufq_dequeue(&q->sendq)) != NULL) {
1525 
1526 		write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
1527 
1528 		if (++q->pidx >= q->size) {
1529 			q->pidx = 0;
1530 			q->gen ^= 1;
1531 		}
1532 		q->in_use++;
1533 	}
1534 	if (!mbufq_empty(&q->sendq)) {
1535 		setbit(&qs->txq_stopped, TXQ_CTRL);
1536 		smp_mb();
1537 
1538 		if (should_restart_tx(q) &&
1539 		    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped))
1540 			goto again;
1541 		q->stops++;
1542 	}
1543 	mtx_unlock(&q->lock);
1544 	t3_write_reg(adap, A_SG_KDOORBELL,
1545 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1546 }
1547 
1548 
1549 /*
1550  * Send a management message through control queue 0
1551  */
1552 int
1553 t3_mgmt_tx(struct adapter *adap, struct mbuf *m)
1554 {
1555 	return ctrl_xmit(adap, &adap->sge.qs[0].txq[TXQ_CTRL], m);
1556 }
1557 
1558 /**
1559  *	free_qset - free the resources of an SGE queue set
1560  *	@sc: the controller owning the queue set
1561  *	@q: the queue set
1562  *
1563  *	Release the HW and SW resources associated with an SGE queue set, such
1564  *	as HW contexts, packet buffers, and descriptor rings.  Traffic to the
1565  *	queue set must be quiesced prior to calling this.
1566  */
1567 static void
1568 t3_free_qset(adapter_t *sc, struct sge_qset *q)
1569 {
1570 	int i;
1571 
1572 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
1573 		if (q->fl[i].desc) {
1574 			mtx_lock(&sc->sge.reg_lock);
1575 			t3_sge_disable_fl(sc, q->fl[i].cntxt_id);
1576 			mtx_unlock(&sc->sge.reg_lock);
1577 			bus_dmamap_unload(q->fl[i].desc_tag, q->fl[i].desc_map);
1578 			bus_dmamem_free(q->fl[i].desc_tag, q->fl[i].desc,
1579 					q->fl[i].desc_map);
1580 			bus_dma_tag_destroy(q->fl[i].desc_tag);
1581 			bus_dma_tag_destroy(q->fl[i].entry_tag);
1582 		}
1583 		if (q->fl[i].sdesc) {
1584 			free_rx_bufs(sc, &q->fl[i]);
1585 			free(q->fl[i].sdesc, M_DEVBUF);
1586 		}
1587 	}
1588 
1589 	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
1590 		if (q->txq[i].desc) {
1591 			mtx_lock(&sc->sge.reg_lock);
1592 			t3_sge_enable_ecntxt(sc, q->txq[i].cntxt_id, 0);
1593 			mtx_unlock(&sc->sge.reg_lock);
1594 			bus_dmamap_unload(q->txq[i].desc_tag,
1595 					q->txq[i].desc_map);
1596 			bus_dmamem_free(q->txq[i].desc_tag, q->txq[i].desc,
1597 					q->txq[i].desc_map);
1598 			bus_dma_tag_destroy(q->txq[i].desc_tag);
1599 			bus_dma_tag_destroy(q->txq[i].entry_tag);
1600 			MTX_DESTROY(&q->txq[i].lock);
1601 		}
1602 		if (q->txq[i].sdesc) {
1603 			free(q->txq[i].sdesc, M_DEVBUF);
1604 		}
1605 	}
1606 
1607 	if (q->rspq.desc) {
1608 		mtx_lock(&sc->sge.reg_lock);
1609 		t3_sge_disable_rspcntxt(sc, q->rspq.cntxt_id);
1610 		mtx_unlock(&sc->sge.reg_lock);
1611 
1612 		bus_dmamap_unload(q->rspq.desc_tag, q->rspq.desc_map);
1613 		bus_dmamem_free(q->rspq.desc_tag, q->rspq.desc,
1614 			        q->rspq.desc_map);
1615 		bus_dma_tag_destroy(q->rspq.desc_tag);
1616 		MTX_DESTROY(&q->rspq.lock);
1617 	}
1618 
1619 	bzero(q, sizeof(*q));
1620 }
1621 
1622 /**
1623  *	t3_free_sge_resources - free SGE resources
1624  *	@sc: the adapter softc
1625  *
1626  *	Frees resources used by the SGE queue sets.
1627  */
1628 void
1629 t3_free_sge_resources(adapter_t *sc)
1630 {
1631 	int i, nqsets;
1632 
1633 	for (nqsets = i = 0; i < (sc)->params.nports; i++)
1634 		nqsets += sc->port[i].nqsets;
1635 
1636 	for (i = 0; i < nqsets; ++i)
1637 		t3_free_qset(sc, &sc->sge.qs[i]);
1638 }
1639 
1640 /**
1641  *	t3_sge_start - enable SGE
1642  *	@sc: the controller softc
1643  *
1644  *	Enables the SGE for DMAs.  This is the last step in starting packet
1645  *	transfers.
1646  */
1647 void
1648 t3_sge_start(adapter_t *sc)
1649 {
1650 	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE);
1651 }
1652 
1653 /**
1654  *	t3_sge_stop - disable SGE operation
1655  *	@sc: the adapter
1656  *
1657  *	Disables the DMA engine.  This can be called in emergencies (e.g.,
1658  *	from error interrupts) or from normal process context.  In the latter
1659  *	case it also disables any pending queue restart tasklets.  Note that
1660  *	if it is called in interrupt context it cannot disable the restart
1661  *	tasklets as it cannot wait, however the tasklets will have no effect
1662  *	since the doorbells are disabled and the driver will call this again
1663  *	later from process context, at which time the tasklets will be stopped
1664  *	if they are still running.
1665  */
1666 void
1667 t3_sge_stop(adapter_t *sc)
1668 {
1669 	int i, nqsets;
1670 
1671 	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, 0);
1672 
1673 	if (sc->tq == NULL)
1674 		return;
1675 
1676 	for (nqsets = i = 0; i < (sc)->params.nports; i++)
1677 		nqsets += sc->port[i].nqsets;
1678 
1679 	for (i = 0; i < nqsets; ++i) {
1680 		struct sge_qset *qs = &sc->sge.qs[i];
1681 
1682 		taskqueue_drain(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
1683 		taskqueue_drain(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
1684 	}
1685 }
1686 
1687 
1688 /**
1689  *	free_tx_desc - reclaims Tx descriptors and their buffers
1690  *	@q: the Tx queue to reclaim descriptors from
1691  *	@n: the number of descriptors to reclaim
1692  *	@m_vec: array in which the reclaimed mbufs are returned
1693  *
1694  *	Reclaims Tx descriptors from an SGE Tx queue and collects the associated
1695  *	mbufs into @m_vec for the caller to free.  Called with the Tx queue lock held.
1696  */
1697 int
1698 free_tx_desc(struct sge_txq *q, int n, struct mbuf **m_vec)
1699 {
1700 	struct tx_sw_desc *d;
1701 	unsigned int cidx = q->cidx;
1702 	int nbufs = 0;
1703 
1704 #ifdef T3_TRACE
1705 	T3_TRACE2(sc->tb[q->cntxt_id & 7],
1706 		  "reclaiming %u Tx descriptors at cidx %u", n, cidx);
1707 #endif
1708 	d = &q->sdesc[cidx];
1709 
1710 	while (n-- > 0) {
1711 		DPRINTF("cidx=%d d=%p\n", cidx, d);
1712 		if (d->m) {
1713 			if (d->flags & TX_SW_DESC_MAPPED) {
1714 				bus_dmamap_unload(q->entry_tag, d->map);
1715 				bus_dmamap_destroy(q->entry_tag, d->map);
1716 				d->flags &= ~TX_SW_DESC_MAPPED;
1717 			}
1718 			if (m_get_priority(d->m) == cidx) {
1719 				m_vec[nbufs] = d->m;
1720 				d->m = NULL;
1721 				nbufs++;
1722 			} else {
1723 				printf("pri=%d cidx=%d\n", (int)m_get_priority(d->m), cidx);
1724 			}
1725 		}
1726 		++d;
1727 		if (++cidx == q->size) {
1728 			cidx = 0;
1729 			d = q->sdesc;
1730 		}
1731 	}
1732 	q->cidx = cidx;
1733 
1734 	return (nbufs);
1735 }
1736 
1737 /**
1738  *	is_new_response - check if a response is newly written
1739  *	@r: the response descriptor
1740  *	@q: the response queue
1741  *
1742  *	Returns true if a response descriptor contains a yet unprocessed
1743  *	response.
1744  */
1745 static __inline int
1746 is_new_response(const struct rsp_desc *r,
1747     const struct sge_rspq *q)
1748 {
1749 	return (r->intr_gen & F_RSPD_GEN2) == q->gen;
1750 }
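/*
 * A minimal sketch (not the driver's actual response handler, and assuming
 * the response queue tracks cidx/size/gen the same way the rings above do)
 * of how the generation bit is used to walk the ring:
 *
 *	struct rsp_desc *r = &q->desc[q->cidx];
 *
 *	while (is_new_response(r, q)) {
 *		... process *r ...
 *		if (++q->cidx == q->size) {
 *			q->cidx = 0;
 *			q->gen ^= 1;	(expected generation flips at wrap)
 *		}
 *		r = &q->desc[q->cidx];
 *	}
 */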
1751 
1752 #define RSPD_GTS_MASK  (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS)
1753 #define RSPD_CTRL_MASK (RSPD_GTS_MASK | \
1754 			V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \
1755 			V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \
1756 			V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR))
1757 
1758 /* How long to delay the next interrupt in case of memory shortage, in 0.1us. */
1759 #define NOMEM_INTR_DELAY 2500
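/* At 0.1us per unit this holds off the next interrupt for roughly 250us. */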
1760 
1761 /**
1762  *	write_ofld_wr - write an offload work request
1763  *	@adap: the adapter
1764  *	@m: the packet to send
1765  *	@q: the Tx queue
1766  *	@pidx: index of the first Tx descriptor to write
1767  *	@gen: the generation value to use
1768  *	@ndesc: number of descriptors the packet will occupy
1769  *
1770  *	Write an offload work request to send the supplied packet.  The packet
1771  *	data already carry the work request with most fields populated.
1772  */
1773 static void
1774 write_ofld_wr(adapter_t *adap, struct mbuf *m,
1775     struct sge_txq *q, unsigned int pidx,
1776     unsigned int gen, unsigned int ndesc,
1777     bus_dma_segment_t *segs, unsigned int nsegs)
1778 {
1779 	unsigned int sgl_flits, flits;
1780 	struct work_request_hdr *from;
1781 	struct sg_ent *sgp, sgl[TX_MAX_SEGS / 2 + 1];
1782 	struct tx_desc *d = &q->desc[pidx];
1783 	struct txq_state txqs;
1784 
1785 	if (immediate(m)) {
1786 		q->sdesc[pidx].m = NULL;
1787 		write_imm(d, m, m->m_len, gen);
1788 		return;
1789 	}
1790 
1791 	/* Only TX_DATA builds SGLs */
1792 
1793 	from = mtod(m, struct work_request_hdr *);
1794 	memcpy(&d->flit[1], &from[1],
1795 	    (uint8_t *)m->m_pkthdr.header - mtod(m, uint8_t *) - sizeof(*from));
1796 
1797 	flits = ((uint8_t *)m->m_pkthdr.header - mtod(m, uint8_t *)) / 8;
1798 	sgp = (ndesc == 1) ? (struct sg_ent *)&d->flit[flits] : sgl;
1799 
1800 	make_sgl(sgp, segs, nsegs);
1801 	sgl_flits = sgl_len(nsegs);
1802 
1803 	txqs.gen = q->gen;
1804 	txqs.pidx = q->pidx;
1805 	txqs.compl = (q->unacked & 8) << (S_WR_COMPL - 3);
1806 	write_wr_hdr_sgl(ndesc, d, &txqs, q, sgl, flits, sgl_flits,
1807 	    from->wr_hi, from->wr_lo);
1808 }
1809 
1810 /**
1811  *	calc_tx_descs_ofld - calculate # of Tx descriptors for an offload packet
1812  *	@m: the packet
1813  *
1814  * 	Returns the number of Tx descriptors needed for the given offload
1815  * 	packet.  These packets are already fully constructed.
1816  */
1817 static __inline unsigned int
1818 calc_tx_descs_ofld(struct mbuf *m, unsigned int nsegs)
1819 {
1820 	unsigned int flits, cnt = 0;
1821 
1822 
1823 	if (m->m_len <= WR_LEN)
1824 		return 1;                 /* packet fits as immediate data */
1825 
1826 	if (m->m_flags & M_IOVEC)
1827 		cnt = mtomv(m)->mv_count;
1828 
1829 	flits = ((uint8_t *)m->m_pkthdr.header - mtod(m, uint8_t *)) / 8;   /* headers */
1830 
1831 	return flits_to_desc(flits + sgl_len(cnt));
1832 }
1833 
1834 /**
1835  *	ofld_xmit - send a packet through an offload queue
1836  *	@adap: the adapter
1837  *	@q: the Tx offload queue
1838  *	@m: the packet
1839  *
1840  *	Send an offload packet through an SGE offload queue.
1841  */
1842 static int
1843 ofld_xmit(adapter_t *adap, struct sge_txq *q, struct mbuf *m)
1844 {
1845 	int ret, nsegs;
1846 	unsigned int ndesc;
1847 	unsigned int pidx, gen;
1848 	struct mbuf *m_vec[TX_CLEAN_MAX_DESC];
1849 	bus_dma_segment_t segs[TX_MAX_SEGS];
1850 	int i, cleaned;
1851 	struct tx_sw_desc *stx = &q->sdesc[q->pidx];
1852 
1853 	mtx_lock(&q->lock);
1854 	if ((ret = busdma_map_mbufs(&m, q, stx, segs, &nsegs)) != 0) {
1855 		mtx_unlock(&q->lock);
1856 		return (ret);
1857 	}
1858 	ndesc = calc_tx_descs_ofld(m, nsegs);
1859 again:	cleaned = reclaim_completed_tx(q, TX_CLEAN_MAX_DESC, m_vec);
1860 
1861 	ret = check_desc_avail(adap, q, m, ndesc, TXQ_OFLD);
1862 	if (__predict_false(ret)) {
1863 		if (ret == 1) {
1864 			m_set_priority(m, ndesc);     /* save for restart */
1865 			mtx_unlock(&q->lock);
1866 			return EINTR;
1867 		}
1868 		goto again;
1869 	}
1870 
1871 	gen = q->gen;
1872 	q->in_use += ndesc;
1873 	pidx = q->pidx;
1874 	q->pidx += ndesc;
1875 	if (q->pidx >= q->size) {
1876 		q->pidx -= q->size;
1877 		q->gen ^= 1;
1878 	}
1879 #ifdef T3_TRACE
1880 	T3_TRACE5(adap->tb[q->cntxt_id & 7],
1881 		  "ofld_xmit: ndesc %u, pidx %u, len %u, main %u, frags %u",
1882 		  ndesc, pidx, m->m_pkthdr.len, m->m_len,
1883 		  nsegs);
1884 #endif
1885 	mtx_unlock(&q->lock);
1886 
1887 	write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
1888 	check_ring_tx_db(adap, q);
1889 
1890 	for (i = 0; i < cleaned; i++) {
1891 		m_freem(m_vec[i]);
1892 	}
1893 	return (0);
1894 }
1895 
1896 /**
1897  *	restart_offloadq - restart a suspended offload queue
1898  *	@qs: the queue set containing the offload queue
1899  *
1900  *	Resumes transmission on a suspended Tx offload queue.
1901  */
1902 static void
1903 restart_offloadq(void *data, int npending)
1904 {
1905 
1906 	struct mbuf *m;
1907 	struct sge_qset *qs = data;
1908 	struct sge_txq *q = &qs->txq[TXQ_OFLD];
1909 	adapter_t *adap = qs->port->adapter;
1910 	struct mbuf *m_vec[TX_CLEAN_MAX_DESC];
1911 	bus_dma_segment_t segs[TX_MAX_SEGS];
1912 	int nsegs, i, cleaned;
1913 	struct tx_sw_desc *stx = &q->sdesc[q->pidx];
1914 
1915 	mtx_lock(&q->lock);
1916 again:	cleaned = reclaim_completed_tx(q, TX_CLEAN_MAX_DESC, m_vec);
1917 
1918 	while ((m = mbufq_peek(&q->sendq)) != NULL) {
1919 		unsigned int gen, pidx;
1920 		unsigned int ndesc = m_get_priority(m);
1921 
1922 		if (__predict_false(q->size - q->in_use < ndesc)) {
1923 			setbit(&qs->txq_stopped, TXQ_OFLD);
1924 			smp_mb();
1925 
1926 			if (should_restart_tx(q) &&
1927 			    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped))
1928 				goto again;
1929 			q->stops++;
1930 			break;
1931 		}
1932 
1933 		gen = q->gen;
1934 		q->in_use += ndesc;
1935 		pidx = q->pidx;
1936 		q->pidx += ndesc;
1937 		if (q->pidx >= q->size) {
1938 			q->pidx -= q->size;
1939 			q->gen ^= 1;
1940 		}
1941 
1942 		(void)mbufq_dequeue(&q->sendq);
1943 		busdma_map_mbufs(&m, q, stx, segs, &nsegs);
1944 		mtx_unlock(&q->lock);
1945 		write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
1946 		mtx_lock(&q->lock);
1947 	}
1948 	mtx_unlock(&q->lock);
1949 
1950 #if USE_GTS
1951 	set_bit(TXQ_RUNNING, &q->flags);
1952 	set_bit(TXQ_LAST_PKT_DB, &q->flags);
1953 #endif
1954 	t3_write_reg(adap, A_SG_KDOORBELL,
1955 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1956 
1957 	for (i = 0; i < cleaned; i++) {
1958 		m_freem(m_vec[i]);
1959 	}
1960 }
1961 
1962 /**
1963  *	queue_set - return the queue set a packet should use
1964  *	@m: the packet
1965  *
1966  *	Maps a packet to the SGE queue set it should use.  The desired queue
1967  *	set is carried in bits 1-3 in the packet's priority.
1968  */
1969 static __inline int
1970 queue_set(const struct mbuf *m)
1971 {
1972 	return m_get_priority(m) >> 1;
1973 }
1974 
1975 /**
1976  *	is_ctrl_pkt - return whether an offload packet is a control packet
1977  *	@m: the packet
1978  *
1979  *	Determines whether an offload packet should use an OFLD or a CTRL
1980  *	Tx queue.  This is indicated by bit 0 in the packet's priority.
1981  */
1982 static __inline int
1983 is_ctrl_pkt(const struct mbuf *m)
1984 {
1985 	return m_get_priority(m) & 1;
1986 }
1987 
1988 /**
1989  *	t3_offload_tx - send an offload packet
1990  *	@tdev: the offload device to send to
1991  *	@m: the packet
1992  *
1993  *	Sends an offload packet.  We use the packet priority to select the
1994  *	appropriate Tx queue as follows: bit 0 indicates whether the packet
1995  *	should be sent as regular or control, bits 1-3 select the queue set.
1996  */
1997 int
1998 t3_offload_tx(struct t3cdev *tdev, struct mbuf *m)
1999 {
2000 	adapter_t *adap = tdev2adap(tdev);
2001 	struct sge_qset *qs = &adap->sge.qs[queue_set(m)];
2002 
2003 	if (__predict_false(is_ctrl_pkt(m)))
2004 		return ctrl_xmit(adap, &qs->txq[TXQ_CTRL], m);
2005 
2006 	return ofld_xmit(adap, &qs->txq[TXQ_OFLD], m);
2007 }
2008 
2009 /**
2010  *	deliver_partial_bundle - deliver a (partial) bundle of Rx offload pkts
2011  *	@tdev: the offload device that will be receiving the packets
2012  *	@q: the SGE response queue that assembled the bundle
2013  *	@mbufs: the partial bundle
2014  *	@n: the number of packets in the bundle
2015  *
2016  *	Delivers a (partial) bundle of Rx offload packets to an offload device.
2017  */
2018 static __inline void
2019 deliver_partial_bundle(struct t3cdev *tdev,
2020 			struct sge_rspq *q,
2021 			struct mbuf *mbufs[], int n)
2022 {
2023 	if (n) {
2024 		q->offload_bundles++;
2025 		cxgb_ofld_recv(tdev, mbufs, n);
2026 	}
2027 }
2028 
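/**
 *	rx_offload - queue an Rx offload packet for bundled delivery
 *	@tdev: the offload device that will receive the packet
 *	@rq: the SGE response queue that received the packet
 *	@m: the offload packet
 *	@rx_gather: the array collecting the current bundle
 *	@gather_idx: the number of packets already gathered
 *
 *	Adds an offload packet to the current Rx bundle and hands the bundle
 *	to the offload device once it reaches RX_BUNDLE_SIZE packets.
 *	Returns the updated number of gathered packets.
 */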
2029 static __inline int
2030 rx_offload(struct t3cdev *tdev, struct sge_rspq *rq,
2031     struct mbuf *m, struct mbuf *rx_gather[],
2032     unsigned int gather_idx)
2033 {
2034 	rq->offload_pkts++;
2035 	m->m_pkthdr.header = mtod(m, void *);
2036 
2037 	rx_gather[gather_idx++] = m;
2038 	if (gather_idx == RX_BUNDLE_SIZE) {
2039 		cxgb_ofld_recv(tdev, rx_gather, RX_BUNDLE_SIZE);
2040 		gather_idx = 0;
2041 		rq->offload_bundles++;
2042 	}
2043 	return (gather_idx);
2044 }
2045 
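/**
 *	restart_tx - check whether to restart suspended Tx queues
 *	@qs: the queue set to resume
 *
 *	Restarts suspended offload and control Tx queues of an SGE queue set
 *	if they have queue descriptors available, by scheduling their resume
 *	tasks.
 */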
2046 static void
2047 restart_tx(struct sge_qset *qs)
2048 {
2049 	struct adapter *sc = qs->port->adapter;
2050 
2051 	if (isset(&qs->txq_stopped, TXQ_OFLD) &&
2052 	    should_restart_tx(&qs->txq[TXQ_OFLD]) &&
2053 	    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) {
2054 		qs->txq[TXQ_OFLD].restarts++;
2055 		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2056 	}
2057 	if (isset(&qs->txq_stopped, TXQ_CTRL) &&
2058 	    should_restart_tx(&qs->txq[TXQ_CTRL]) &&
2059 	    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) {
2060 		qs->txq[TXQ_CTRL].restarts++;
2061 		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2062 	}
2063 }
2064 
2065 /**
2066  *	t3_sge_alloc_qset - initialize an SGE queue set
2067  *	@sc: the controller softc
2068  *	@id: the queue set id
2069  *	@nports: how many Ethernet ports will be using this queue set
2070  *	@irq_vec_idx: the IRQ vector index for response queue interrupts
2071  *	@p: configuration parameters for this queue set
2072  *	@ntxq: number of Tx queues for the queue set
2073  *	@pi: port info for queue set
2074  *
2075  *	Allocate resources and initialize an SGE queue set.  A queue set
2076  *	comprises a response queue, two Rx free-buffer queues, and up to 3
2077  *	Tx queues.  The Tx queues are assigned roles in the order Ethernet
2078  *	queue, offload queue, and control queue.
2079  */
2080 int
2081 t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx,
2082 		  const struct qset_params *p, int ntxq, struct port_info *pi)
2083 {
2084 	struct sge_qset *q = &sc->sge.qs[id];
2085 	int i, ret = 0;
2086 
2087 	init_qset_cntxt(q, id);
2088 
2089 	if ((ret = alloc_ring(sc, p->fl_size, sizeof(struct rx_desc),
2090 		    sizeof(struct rx_sw_desc), &q->fl[0].phys_addr,
2091 		    &q->fl[0].desc, &q->fl[0].sdesc,
2092 		    &q->fl[0].desc_tag, &q->fl[0].desc_map,
2093 		    sc->rx_dmat, &q->fl[0].entry_tag)) != 0) {
2094 		printf("error %d from alloc ring fl0\n", ret);
2095 		goto err;
2096 	}
2097 
2098 	if ((ret = alloc_ring(sc, p->jumbo_size, sizeof(struct rx_desc),
2099 		    sizeof(struct rx_sw_desc), &q->fl[1].phys_addr,
2100 		    &q->fl[1].desc, &q->fl[1].sdesc,
2101 		    &q->fl[1].desc_tag, &q->fl[1].desc_map,
2102 		    sc->rx_jumbo_dmat, &q->fl[1].entry_tag)) != 0) {
2103 		printf("error %d from alloc ring fl1\n", ret);
2104 		goto err;
2105 	}
2106 
2107 	if ((ret = alloc_ring(sc, p->rspq_size, sizeof(struct rsp_desc), 0,
2108 		    &q->rspq.phys_addr, &q->rspq.desc, NULL,
2109 		    &q->rspq.desc_tag, &q->rspq.desc_map,
2110 		    NULL, NULL)) != 0) {
2111 		printf("error %d from alloc ring rspq\n", ret);
2112 		goto err;
2113 	}
2114 
2115 	for (i = 0; i < ntxq; ++i) {
2116 		/*
2117 		 * The control queue always uses immediate data so does not
2118 		 * need to keep track of any mbufs.
2119 		 * XXX Placeholder for future TOE support.
2120 		 */
2121 		size_t sz = i == TXQ_CTRL ? 0 : sizeof(struct tx_sw_desc);
2122 
2123 		if ((ret = alloc_ring(sc, p->txq_size[i],
2124 			    sizeof(struct tx_desc), sz,
2125 			    &q->txq[i].phys_addr, &q->txq[i].desc,
2126 			    &q->txq[i].sdesc, &q->txq[i].desc_tag,
2127 			    &q->txq[i].desc_map,
2128 			    sc->tx_dmat, &q->txq[i].entry_tag)) != 0) {
2129 			printf("error %d from alloc ring tx %i\n", ret, i);
2130 			goto err;
2131 		}
2132 		mbufq_init(&q->txq[i].sendq);
2133 		q->txq[i].gen = 1;
2134 		q->txq[i].size = p->txq_size[i];
2135 		snprintf(q->txq[i].lockbuf, TXQ_NAME_LEN, "t3 txq lock %d:%d:%d",
2136 		    device_get_unit(sc->dev), irq_vec_idx, i);
2137 		MTX_INIT(&q->txq[i].lock, q->txq[i].lockbuf, NULL, MTX_DEF);
2138 	}
2139 
2140 	q->txq[TXQ_ETH].port = pi;
2141 
2142 	TASK_INIT(&q->txq[TXQ_OFLD].qresume_task, 0, restart_offloadq, q);
2143 	TASK_INIT(&q->txq[TXQ_CTRL].qresume_task, 0, restart_ctrlq, q);
2144 	TASK_INIT(&q->txq[TXQ_ETH].qreclaim_task, 0, sge_txq_reclaim_handler, &q->txq[TXQ_ETH]);
2145 	TASK_INIT(&q->txq[TXQ_OFLD].qreclaim_task, 0, sge_txq_reclaim_handler, &q->txq[TXQ_OFLD]);
2146 
2147 	q->fl[0].gen = q->fl[1].gen = 1;
2148 	q->fl[0].size = p->fl_size;
2149 	q->fl[1].size = p->jumbo_size;
2150 
2151 	q->rspq.gen = 1;
2152 	q->rspq.cidx = 0;
2153 	q->rspq.size = p->rspq_size;
2154 
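	/*
	 * Stop the Ethernet Tx queue once fewer descriptors remain than a
	 * maximally fragmented packet requires for each port.
	 */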
2155 	q->txq[TXQ_ETH].stop_thres = nports *
2156 	    flits_to_desc(sgl_len(TX_MAX_SEGS + 1) + 3);
2157 
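	/* free list 0 holds regular clusters, free list 1 page-sized jumbo clusters */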
2158 	q->fl[0].buf_size = MCLBYTES;
2159 	q->fl[0].zone = zone_clust;
2160 	q->fl[0].type = EXT_CLUSTER;
2161 	q->fl[1].buf_size = MJUMPAGESIZE;
2162 	q->fl[1].zone = zone_jumbop;
2163 	q->fl[1].type = EXT_JUMBOP;
2164 
2165 	q->lro.enabled = lro_default;
2166 
2167 	mtx_lock(&sc->sge.reg_lock);
2168 	ret = -t3_sge_init_rspcntxt(sc, q->rspq.cntxt_id, irq_vec_idx,
2169 				   q->rspq.phys_addr, q->rspq.size,
2170 				   q->fl[0].buf_size, 1, 0);
2171 	if (ret) {
2172 		printf("error %d from t3_sge_init_rspcntxt\n", ret);
2173 		goto err_unlock;
2174 	}
2175 
2176 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
2177 		ret = -t3_sge_init_flcntxt(sc, q->fl[i].cntxt_id, 0,
2178 					  q->fl[i].phys_addr, q->fl[i].size,
2179 					  q->fl[i].buf_size, p->cong_thres, 1,
2180 					  0);
2181 		if (ret) {
2182 			printf("error %d from t3_sge_init_flcntxt for index i=%d\n", ret, i);
2183 			goto err_unlock;
2184 		}
2185 	}
2186 
2187 	ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_ETH].cntxt_id, USE_GTS,
2188 				 SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr,
2189 				 q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token,
2190 				 1, 0);
2191 	if (ret) {
2192 		printf("error %d from t3_sge_init_ecntxt\n", ret);
2193 		goto err_unlock;
2194 	}
2195 
2196 	if (ntxq > 1) {
2197 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_OFLD].cntxt_id,
2198 					 USE_GTS, SGE_CNTXT_OFLD, id,
2199 					 q->txq[TXQ_OFLD].phys_addr,
2200 					 q->txq[TXQ_OFLD].size, 0, 1, 0);
2201 		if (ret) {
2202 			printf("error %d from t3_sge_init_ecntxt\n", ret);
2203 			goto err_unlock;
2204 		}
2205 	}
2206 
2207 	if (ntxq > 2) {
2208 		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_CTRL].cntxt_id, 0,
2209 					 SGE_CNTXT_CTRL, id,
2210 					 q->txq[TXQ_CTRL].phys_addr,
2211 					 q->txq[TXQ_CTRL].size,
2212 					 q->txq[TXQ_CTRL].token, 1, 0);
2213 		if (ret) {
2214 			printf("error %d from t3_sge_init_ecntxt\n", ret);
2215 			goto err_unlock;
2216 		}
2217 	}
2218 
2219 	snprintf(q->rspq.lockbuf, RSPQ_NAME_LEN, "t3 rspq lock %d:%d",
2220 	    device_get_unit(sc->dev), irq_vec_idx);
2221 	MTX_INIT(&q->rspq.lock, q->rspq.lockbuf, NULL, MTX_DEF);
2222 
2223 	mtx_unlock(&sc->sge.reg_lock);
2224 	t3_update_qset_coalesce(q, p);
2225 	q->port = pi;
2226 
2227 	refill_fl(sc, &q->fl[0], q->fl[0].size);
2228 	refill_fl(sc, &q->fl[1], q->fl[1].size);
2229 	refill_rspq(sc, &q->rspq, q->rspq.size - 1);
2230 
2231 	t3_write_reg(sc, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) |
2232 		     V_NEWTIMER(q->rspq.holdoff_tmr));
2233 
2234 	return (0);
2235 
2236 err_unlock:
2237 	mtx_unlock(&sc->sge.reg_lock);
2238 err:
2239 	t3_free_qset(sc, q);
2240 
2241 	return (ret);
2242 }
2243 
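/**
 *	t3_rx_eth - deliver a received Ethernet packet to the stack
 *	@adap: the adapter that received the packet
 *	@rq: the response queue that reported the packet
 *	@m: the mbuf holding the packet
 *	@ethpad: the padding preceding the Ethernet header
 *
 *	Fills in the checksum and VLAN metadata, strips the CPL header and
 *	padding, and passes the packet to the receiving interface's input
 *	routine.
 */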
2244 void
2245 t3_rx_eth(struct adapter *adap, struct sge_rspq *rq, struct mbuf *m, int ethpad)
2246 {
2247 	struct cpl_rx_pkt *cpl = (struct cpl_rx_pkt *)(mtod(m, uint8_t *) + ethpad);
2248 	struct port_info *pi = &adap->port[adap->rxpkt_map[cpl->iff]];
2249 	struct ifnet *ifp = pi->ifp;
2250 
2251 	DPRINTF("rx_eth m=%p m->m_data=%p p->iff=%d\n", m, mtod(m, uint8_t *), cpl->iff);
2252 
2253 	if ((ifp->if_capenable & IFCAP_RXCSUM) && !cpl->fragment &&
2254 	    cpl->csum_valid && cpl->csum == 0xffff) {
2256 		rspq_to_qset(rq)->port_stats[SGE_PSTAT_RX_CSUM_GOOD]++;
2257 		m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED|CSUM_IP_VALID|CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
2258 		m->m_pkthdr.csum_data = 0xffff;
2259 	}
2260 	/*
2261 	 * XXX need to add VLAN support for 6.x
2262 	 */
2263 #ifdef VLAN_SUPPORTED
2264 	if (__predict_false(cpl->vlan_valid)) {
2265 		m->m_pkthdr.ether_vtag = ntohs(cpl->vlan);
2266 		m->m_flags |= M_VLANTAG;
2267 	}
2268 #endif
2269 
2270 	m->m_pkthdr.rcvif = ifp;
2271 	m->m_pkthdr.header = mtod(m, uint8_t *) + sizeof(*cpl) + ethpad;
2272 	m_explode(m);
2273 	/*
2274 	 * adjust after conversion to mbuf chain
2275 	 */
2276 	m_adj(m, sizeof(*cpl) + ethpad);
2277 
2278 	(*ifp->if_input)(ifp, m);
2279 }
2280 
2281 /**
2282  *	get_packet - return the next ingress packet buffer from a free list
2283  *	@adap: the adapter that received the packet
2284  *	@drop_thres: # of remaining buffers before we start dropping packets
2285  *	@qs: the qset that the SGE free list holding the packet belongs to
2286  *      @mh: the mbuf header, contains a pointer to the head and tail of the mbuf chain
2287  *      @r: response descriptor
2288  *
2289  *	Get the next packet from a free list and complete setup of the
2290  *	sk_buff.  If the packet is small we make a copy and recycle the
2291  *	mbuf.  If the packet is small we make a copy and recycle the
2292  *	original buffer, otherwise we use the original buffer itself.  If a
2293  *	positive drop threshold is supplied, packets are dropped and their
2294  *	threshold and the packet is too big to copy, or (b) the packet should
2295  *	be copied but there is no memory for the copy.
2296  */
2297 #ifdef DISABLE_MBUF_IOVEC
2298 
2299 static int
2300 get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
2301     struct t3_mbuf_hdr *mh, struct rsp_desc *r, struct mbuf *m)
2302 {
2303 
2304 	unsigned int len_cq =  ntohl(r->len_cq);
2305 	struct sge_fl *fl = (len_cq & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
2306 	struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];
2307 	uint32_t len = G_RSPD_LEN(len_cq);
2308 	uint32_t flags = ntohl(r->flags);
2309 	uint8_t sopeop = G_RSPD_SOP_EOP(flags);
2310 	int ret = 0;
2311 
2312 	prefetch(sd->cl);
2313 
2314 	fl->credits--;
2315 	bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD);
2316 	bus_dmamap_unload(fl->entry_tag, sd->map);
2317 
2318 	m_cljset(m, sd->cl, fl->type);
2319 	m->m_len = len;
2320 
2321 	switch(sopeop) {
2322 	case RSPQ_SOP_EOP:
2323 		DBG(DBG_RX, ("get_packet: SOP-EOP m %p\n", m));
2324 		mh->mh_head = mh->mh_tail = m;
2325 		m->m_pkthdr.len = len;
2326 		m->m_flags |= M_PKTHDR;
2327 		ret = 1;
2328 		break;
2329 	case RSPQ_NSOP_NEOP:
2330 		DBG(DBG_RX, ("get_packet: NO_SOP-NO_EOP m %p\n", m));
2331 		m->m_flags &= ~M_PKTHDR;
2332 		if (mh->mh_tail == NULL) {
2333 			if (cxgb_debug)
2334 				printf("discarding intermediate descriptor entry\n");
2335 			m_freem(m);
2336 			break;
2337 		}
2338 		mh->mh_tail->m_next = m;
2339 		mh->mh_tail = m;
2340 		mh->mh_head->m_pkthdr.len += len;
2341 		ret = 0;
2342 		break;
2343 	case RSPQ_SOP:
2344 		DBG(DBG_RX, ("get_packet: SOP m %p\n", m));
2345 		m->m_pkthdr.len = len;
2346 		mh->mh_head = mh->mh_tail = m;
2347 		m->m_flags |= M_PKTHDR;
2348 		ret = 0;
2349 		break;
2350 	case RSPQ_EOP:
2351 		DBG(DBG_RX, ("get_packet: EOP m %p\n", m));
2352 		m->m_flags &= ~M_PKTHDR;
2353 		mh->mh_head->m_pkthdr.len += len;
2354 		mh->mh_tail->m_next = m;
2355 		mh->mh_tail = m;
2356 		ret = 1;
2357 		break;
2358 	}
2359 	if (++fl->cidx == fl->size)
2360 		fl->cidx = 0;
2361 
2362 	return (ret);
2363 }
2364 
2365 #else
2366 static int
2367 get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
2368     struct mbuf *m, struct rsp_desc *r)
2369 {
2370 
2371 	unsigned int len_cq =  ntohl(r->len_cq);
2372 	struct sge_fl *fl = (len_cq & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
2373 	struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];
2374 	uint32_t len = G_RSPD_LEN(len_cq);
2375 	uint32_t flags = ntohl(r->flags);
2376 	uint8_t sopeop = G_RSPD_SOP_EOP(flags);
2377 	void *cl;
2378 	int ret = 0;
2379 
2380 	prefetch(sd->cl);
2381 
2382 	fl->credits--;
2383 	bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD);
2384 
2385 	if (recycle_enable && len <= SGE_RX_COPY_THRES && sopeop == RSPQ_SOP_EOP) {
2386 		cl = mtod(m, void *);
2387 		memcpy(cl, sd->cl, len);
2388 		recycle_rx_buf(adap, fl, fl->cidx);
2389 	} else {
2390 		cl = sd->cl;
2391 		bus_dmamap_unload(fl->entry_tag, sd->map);
2392 	}
2393 	switch(sopeop) {
2394 	case RSPQ_SOP_EOP:
2395 		DBG(DBG_RX, ("get_packet: SOP-EOP m %p\n", m));
2396 		if (cl == sd->cl)
2397 			m_cljset(m, cl, fl->type);
2398 		m->m_len = m->m_pkthdr.len = len;
2399 		ret = 1;
2400 		goto done;
2401 		break;
2402 	case RSPQ_NSOP_NEOP:
2403 		DBG(DBG_RX, ("get_packet: NO_SOP-NO_EOP m %p\n", m));
2404 		ret = 0;
2405 		break;
2406 	case RSPQ_SOP:
2407 		DBG(DBG_RX, ("get_packet: SOP m %p\n", m));
2408 		m_iovinit(m);
2409 		ret = 0;
2410 		break;
2411 	case RSPQ_EOP:
2412 		DBG(DBG_RX, ("get_packet: EOP m %p\n", m));
2413 		ret = 1;
2414 		break;
2415 	}
2416 	m_iovappend(m, cl, fl->buf_size, len, 0);
2417 
2418 done:
2419 	if (++fl->cidx == fl->size)
2420 		fl->cidx = 0;
2421 
2422 	return (ret);
2423 }
2424 #endif
2425 /**
2426  *	handle_rsp_cntrl_info - handles control information in a response
2427  *	@qs: the queue set corresponding to the response
2428  *	@flags: the response control flags
2429  *
2430  *	Handles the control information of an SGE response, such as GTS
2431  *	indications and completion credits for the queue set's Tx queues.
2432  *	HW coalesces credits; we don't do any extra SW coalescing.
2433  */
2434 static __inline void
2435 handle_rsp_cntrl_info(struct sge_qset *qs, uint32_t flags)
2436 {
2437 	unsigned int credits;
2438 
2439 #if USE_GTS
2440 	if (flags & F_RSPD_TXQ0_GTS)
2441 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags);
2442 #endif
2443 	credits = G_RSPD_TXQ0_CR(flags);
2444 	if (credits) {
2445 		qs->txq[TXQ_ETH].processed += credits;
2446 		if (desc_reclaimable(&qs->txq[TXQ_ETH]) > TX_START_MAX_DESC)
2447 			taskqueue_enqueue(qs->port->adapter->tq,
2448 			    &qs->port->timer_reclaim_task);
2449 	}
2450 
2451 	credits = G_RSPD_TXQ2_CR(flags);
2452 	if (credits)
2453 		qs->txq[TXQ_CTRL].processed += credits;
2454 
2455 #if USE_GTS
2456 	if (flags & F_RSPD_TXQ1_GTS)
2457 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags);
2458 #endif
2459 	credits = G_RSPD_TXQ1_CR(flags);
2460 	if (credits)
2461 		qs->txq[TXQ_OFLD].processed += credits;
2462 }
2463 
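/**
 *	check_ring_db - ring the Tx doorbells of queues put to sleep by GTS
 *	@adap: the adapter
 *	@qs: the queue set whose Tx queues need attention
 *	@sleeping: the GTS flags extracted from the response
 *
 *	No-op placeholder: when GTS is not in use the doorbell is rung
 *	directly as work requests are posted, so there is nothing to do here.
 */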
2464 static void
2465 check_ring_db(adapter_t *adap, struct sge_qset *qs,
2466     unsigned int sleeping)
2467 {
2468 	;
2469 }
2470 
2471 /**
2472  *	process_responses - process responses from an SGE response queue
2473  *	@adap: the adapter
2474  *	@qs: the queue set to which the response queue belongs
2475  *	@budget: how many responses can be processed in this round
2476  *
2477  *	Process responses from an SGE response queue up to the supplied budget.
2478  *	Responses include received packets as well as credits and other events
2479  *	for the queues that belong to the response queue's queue set.
2480  *	A negative budget is effectively unlimited.
2481  *
2482  *	Additionally, choose the interrupt holdoff time for the next interrupt
2483  *	on this queue.  If the system is under memory shortage, use a fairly
2484  *	long delay to help recovery.
2485  */
2486 static int
2487 process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
2488 {
2489 	struct sge_rspq *rspq = &qs->rspq;
2490 	struct rsp_desc *r = &rspq->desc[rspq->cidx];
2491 	int budget_left = budget;
2492 	unsigned int sleeping = 0;
2493 	int lro = qs->lro.enabled;
2494 	struct mbuf *offload_mbufs[RX_BUNDLE_SIZE];
2495 	int ngathered = 0;
2496 #ifdef DEBUG
2497 	static int last_holdoff = 0;
2498 	if (cxgb_debug && rspq->holdoff_tmr != last_holdoff) {
2499 		printf("next_holdoff=%d\n", rspq->holdoff_tmr);
2500 		last_holdoff = rspq->holdoff_tmr;
2501 	}
2502 #endif
2503 	rspq->next_holdoff = rspq->holdoff_tmr;
2504 
2505 	while (__predict_true(budget_left && is_new_response(r, rspq))) {
2506 		int eth, eop = 0, ethpad = 0;
2507 		uint32_t flags = ntohl(r->flags);
2508 		uint32_t rss_csum = *(const uint32_t *)r;
2509 		uint32_t rss_hash = r->rss_hdr.rss_hash_val;
2510 
2511 		eth = (r->rss_hdr.opcode == CPL_RX_PKT);
2512 
2513 		if (__predict_false(flags & F_RSPD_ASYNC_NOTIF)) {
2514 			/* XXX */
2515 			printf("async notification\n");
2516 
2517 		} else if  (flags & F_RSPD_IMM_DATA_VALID) {
2518 #ifdef DISABLE_MBUF_IOVEC
2519 
2520 			if (cxgb_debug)
2521 				printf("IMM DATA VALID opcode=0x%x rspq->cidx=%d\n", r->rss_hdr.opcode, rspq->cidx);
2522 
2523 			if (get_imm_packet(adap, r, &rspq->rspq_mh) == 0) {
2524 				rspq->next_holdoff = NOMEM_INTR_DELAY;
2525 				budget_left--;
2526 				break;
2527 			} else {
2528 				eop = 1;
2529 			}
2530 #else
2531 			struct mbuf *m = NULL;
2532 
2533 			if (rspq->rspq_mbuf == NULL)
2534 				rspq->rspq_mbuf = m_gethdr(M_DONTWAIT, MT_DATA);
2535 			else
2536 				m = m_gethdr(M_DONTWAIT, MT_DATA);
2537 
2538 			/*
2539 			 * XXX revisit me
2540 			 */
2541 			if (rspq->rspq_mbuf == NULL &&  m == NULL) {
2542 				rspq->next_holdoff = NOMEM_INTR_DELAY;
2543 				budget_left--;
2544 				break;
2545 			}
2546 			if (get_imm_packet(adap, r, rspq->rspq_mbuf, m, flags))
2547 				goto skip;
2548 			eop = 1;
2549 #endif
2550 			rspq->imm_data++;
2551 		} else if (r->len_cq) {
2552 			int drop_thresh = eth ? SGE_RX_DROP_THRES : 0;
2553 
2554 #ifdef DISABLE_MBUF_IOVEC
2555 			struct mbuf *m;
2556 			m = m_gethdr(M_NOWAIT, MT_DATA);
2557 
2558 			if (m == NULL) {
2559 				log(LOG_WARNING, "failed to get mbuf for packet\n");
2560 				break;
2561 			}
2562 
2563 			eop = get_packet(adap, drop_thresh, qs, &rspq->rspq_mh, r, m);
2564 #else
2565 			if (rspq->rspq_mbuf == NULL)
2566 				rspq->rspq_mbuf = m_gethdr(M_DONTWAIT, MT_DATA);
2567 			if (rspq->rspq_mbuf == NULL) {
2568 				log(LOG_WARNING, "failed to get mbuf for packet\n");
2569 				break;
2570 			}
2571 			eop = get_packet(adap, drop_thresh, qs, rspq->rspq_mbuf, r);
2572 #endif
2573 			ethpad = 2;
2574 		} else {
2575 			DPRINTF("pure response\n");
2576 			rspq->pure_rsps++;
2577 		}
2578 
2579 		if (flags & RSPD_CTRL_MASK) {
2580 			sleeping |= flags & RSPD_GTS_MASK;
2581 			handle_rsp_cntrl_info(qs, flags);
2582 		}
2583 #ifndef DISABLE_MBUF_IOVEC
2584 	skip:
2585 #endif
2586 		r++;
2587 		if (__predict_false(++rspq->cidx == rspq->size)) {
2588 			rspq->cidx = 0;
2589 			rspq->gen ^= 1;
2590 			r = rspq->desc;
2591 		}
2592 
2593 		prefetch(r);
2594 		if (++rspq->credits >= (rspq->size / 4)) {
2595 			refill_rspq(adap, rspq, rspq->credits);
2596 			rspq->credits = 0;
2597 		}
2598 
2599 		if (eop) {
2600 			prefetch(mtod(rspq->rspq_mh.mh_head, uint8_t *));
2601 			prefetch(mtod(rspq->rspq_mh.mh_head, uint8_t *) + L1_CACHE_BYTES);
2602 
2603 			if (eth) {
2604 				t3_rx_eth_lro(adap, rspq, rspq->rspq_mh.mh_head, ethpad,
2605 				    rss_hash, rss_csum, lro);
2606 
2607 				rspq->rspq_mh.mh_head = NULL;
2608 			} else {
2609 				rspq->rspq_mh.mh_head->m_pkthdr.csum_data = rss_csum;
2610 				/*
2611 				 * XXX size mismatch
2612 				 */
2613 				m_set_priority(rspq->rspq_mh.mh_head, rss_hash);
2614 
2615 				ngathered = rx_offload(&adap->tdev, rspq,
2616 				    rspq->rspq_mh.mh_head, offload_mbufs, ngathered);
2617 			}
2618 			__refill_fl(adap, &qs->fl[0]);
2619 			__refill_fl(adap, &qs->fl[1]);
2620 
2621 		}
2622 		--budget_left;
2623 	}
2624 
2625 	deliver_partial_bundle(&adap->tdev, rspq, offload_mbufs, ngathered);
2626 	t3_lro_flush(adap, qs, &qs->lro);
2627 
2628 	if (sleeping)
2629 		check_ring_db(adap, qs, sleeping);
2630 
2631 	smp_mb();  /* commit Tx queue processed updates */
2632 	if (__predict_false(qs->txq_stopped != 0))
2633 		restart_tx(qs);
2634 
2635 	budget -= budget_left;
2636 	return (budget);
2637 }
2638 
2639 /*
2640  * A helper function that processes responses and issues GTS.
2641  */
2642 static __inline int
2643 process_responses_gts(adapter_t *adap, struct sge_rspq *rq)
2644 {
2645 	int work;
2646 	static int last_holdoff = 0;
2647 
2648 	work = process_responses(adap, rspq_to_qset(rq), -1);
2649 
2650 	if (cxgb_debug && (rq->next_holdoff != last_holdoff)) {
2651 		printf("next_holdoff=%d\n", rq->next_holdoff);
2652 		last_holdoff = rq->next_holdoff;
2653 	}
2654 	if (work)
2655 		t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) |
2656 		    V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx));
2657 	return work;
2658 }
2659 
2660 
2661 /*
2662  * Interrupt handler for legacy INTx interrupts for T3B-based cards.
2663  * Handles data events from SGE response queues as well as error and other
2664  * async events as they all use the same interrupt pin.  We use one SGE
2665  * response queue per port in this mode and protect all response queues with
2666  * queue 0's lock.
2667  */
2668 void
2669 t3b_intr(void *data)
2670 {
2671 	uint32_t i, map;
2672 	adapter_t *adap = data;
2673 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
2674 
2675 	t3_write_reg(adap, A_PL_CLI, 0);
2676 	map = t3_read_reg(adap, A_SG_DATA_INTR);
2677 
2678 	if (!map)
2679 		return;
2680 
2681 	if (__predict_false(map & F_ERRINTR))
2682 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
2683 
2684 	mtx_lock(&q0->lock);
2685 	for_each_port(adap, i)
2686 	    if (map & (1 << i))
2687 			process_responses_gts(adap, &adap->sge.qs[i].rspq);
2688 	mtx_unlock(&q0->lock);
2689 }
2690 
2691 /*
2692  * The MSI interrupt handler.  This needs to handle data events from SGE
2693  * response queues as well as error and other async events as they all use
2694  * the same MSI vector.  We use one SGE response queue per port in this mode
2695  * and protect all response queues with queue 0's lock.
2696  */
2697 void
2698 t3_intr_msi(void *data)
2699 {
2700 	adapter_t *adap = data;
2701 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
2702 	int i, new_packets = 0;
2703 
2704 	mtx_lock(&q0->lock);
2705 
2706 	for_each_port(adap, i)
2707 	    if (process_responses_gts(adap, &adap->sge.qs[i].rspq))
2708 		    new_packets = 1;
2709 	mtx_unlock(&q0->lock);
2710 	if (new_packets == 0)
2711 		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
2712 }
2713 
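/*
 * The MSI-X interrupt handler for an SGE response queue.  Each queue set
 * has its own vector, so only that queue set's response queue needs to be
 * serviced here.
 */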
2714 void
2715 t3_intr_msix(void *data)
2716 {
2717 	struct sge_qset *qs = data;
2718 	adapter_t *adap = qs->port->adapter;
2719 	struct sge_rspq *rspq = &qs->rspq;
2720 
2721 	mtx_lock(&rspq->lock);
2722 	if (process_responses_gts(adap, rspq) == 0)
2723 		rspq->unhandled_irqs++;
2724 	mtx_unlock(&rspq->lock);
2725 }
2726 
2727 /*
2728  * broken by recent mbuf changes
2729  */
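/*
 * Sysctl handler that enables or disables LRO for all queue sets.  The
 * handler is currently a no-op unless LRO_WORKING is defined.
 */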
2730 static int
2731 t3_lro_enable(SYSCTL_HANDLER_ARGS)
2732 {
2733 	adapter_t *sc;
2734 	int i, j, enabled, err, nqsets = 0;
2735 
2736 #ifndef LRO_WORKING
2737 	return (0);
2738 #endif
2739 
2740 	sc = arg1;
2741 	enabled = sc->sge.qs[0].lro.enabled;
2742 	err = sysctl_handle_int(oidp, &enabled, arg2, req);
2743 
2744 	if (err != 0)
2745 		return (err);
2746 	if (enabled == sc->sge.qs[0].lro.enabled)
2747 		return (0);
2748 
2749 	for (i = 0; i < sc->params.nports; i++)
2750 		for (j = 0; j < sc->port[i].nqsets; j++)
2751 			nqsets++;
2752 
2753 	for (i = 0; i < nqsets; i++)
2754 		sc->sge.qs[i].lro.enabled = enabled;
2755 
2756 	return (0);
2757 }
2758 
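/*
 * Sysctl handler for the interrupt coalescing timer.  Applies the new
 * holdoff value (in nanoseconds, clamped to a minimum of 100) to every
 * queue set and reprograms the response queue timers.
 */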
2759 static int
2760 t3_set_coalesce_nsecs(SYSCTL_HANDLER_ARGS)
2761 {
2762 	adapter_t *sc = arg1;
2763 	struct qset_params *qsp = &sc->params.sge.qset[0];
2764 	int coalesce_nsecs;
2765 	struct sge_qset *qs;
2766 	int i, j, err, nqsets = 0;
2767 	struct mtx *lock;
2768 
2769 	coalesce_nsecs = qsp->coalesce_nsecs;
2770 	err = sysctl_handle_int(oidp, &coalesce_nsecs, arg2, req);
2771 
2772 	if (err != 0) {
2773 		return (err);
2774 	}
2775 	if (coalesce_nsecs == qsp->coalesce_nsecs)
2776 		return (0);
2777 
2778 	for (i = 0; i < sc->params.nports; i++)
2779 		for (j = 0; j < sc->port[i].nqsets; j++)
2780 			nqsets++;
2781 
2782 	coalesce_nsecs = max(100, coalesce_nsecs);
2783 
2784 	for (i = 0; i < nqsets; i++) {
2785 		qs = &sc->sge.qs[i];
2786 		qsp = &sc->params.sge.qset[i];
2787 		qsp->coalesce_nsecs = coalesce_nsecs;
2788 
2789 		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
2790 			    &sc->sge.qs[0].rspq.lock;
2791 
2792 		mtx_lock(lock);
2793 		t3_update_qset_coalesce(qs, qsp);
2794 		t3_write_reg(sc, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) |
2795 		    V_NEWTIMER(qs->rspq.holdoff_tmr));
2796 		mtx_unlock(lock);
2797 	}
2798 
2799 	return (0);
2800 }
2801 
2802 
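/*
 * Register the driver's sysctl nodes: firmware version, LRO and interrupt
 * coalescing controls, and assorted debug counters.
 */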
2803 void
2804 t3_add_sysctls(adapter_t *sc)
2805 {
2806 	struct sysctl_ctx_list *ctx;
2807 	struct sysctl_oid_list *children;
2808 
2809 	ctx = device_get_sysctl_ctx(sc->dev);
2810 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
2811 
2812 	/* random information */
2813 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
2814 	    "firmware_version",
2815 	    CTLFLAG_RD, &sc->fw_version,
2816 	    0, "firmware version");
2817 
2818 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
2819 	    "enable_lro",
2820 	    CTLTYPE_INT|CTLFLAG_RW, sc,
2821 	    0, t3_lro_enable,
2822 	    "I", "enable large receive offload");
2823 
2824 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
2825 	    "intr_coal",
2826 	    CTLTYPE_INT|CTLFLAG_RW, sc,
2827 	    0, t3_set_coalesce_nsecs,
2828 	    "I", "interrupt coalescing timer (ns)");
2829 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
2830 	    "enable_debug",
2831 	    CTLFLAG_RW, &cxgb_debug,
2832 	    0, "enable verbose debugging output");
2833 
2834 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO,
2835 	    "collapse_free",
2836 	    CTLFLAG_RD, &collapse_free,
2837 	    0, "frees during collapse");
2838 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO,
2839 	    "mb_free_vec_free",
2840 	    CTLFLAG_RD, &mb_free_vec_free,
2841 	    0, "frees during mb_free_vec");
2842 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
2843 	    "collapse_mbufs",
2844 	    CTLFLAG_RW, &collapse_mbufs,
2845 	    0, "collapse mbuf chains into iovecs");
2846 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
2847 	    "txq_overrun",
2848 	    CTLFLAG_RD, &txq_fills,
2849 	    0, "#times txq overrun");
2850 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
2851 	    "bogus_imm",
2852 	    CTLFLAG_RD, &bogus_imm,
2853 	    0, "#times a bogus immediate response was seen");
2854 }
2855 
2856 /**
2857  *	t3_get_desc - dump an SGE descriptor for debugging purposes
2858  *	@qs: the queue set
2859  *	@qnum: identifies the specific queue (0..2: Tx, 3: response, 4..5: Rx)
2860  *	@idx: the descriptor index in the queue
2861  *	@data: where to dump the descriptor contents
2862  *
2863  *	Dumps the contents of a HW descriptor of an SGE queue.  Returns the
2864  *	size of the descriptor.
2865  */
2866 int
2867 t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx,
2868 		unsigned char *data)
2869 {
2870 	if (qnum >= 6)
2871 		return (EINVAL);
2872 
2873 	if (qnum < 3) {
2874 		if (!qs->txq[qnum].desc || idx >= qs->txq[qnum].size)
2875 			return (EINVAL);
2876 		memcpy(data, &qs->txq[qnum].desc[idx], sizeof(struct tx_desc));
2877 		return sizeof(struct tx_desc);
2878 	}
2879 
2880 	if (qnum == 3) {
2881 		if (!qs->rspq.desc || idx >= qs->rspq.size)
2882 			return (EINVAL);
2883 		memcpy(data, &qs->rspq.desc[idx], sizeof(struct rsp_desc));
2884 		return sizeof(struct rsp_desc);
2885 	}
2886 
2887 	qnum -= 4;
2888 	if (!qs->fl[qnum].desc || idx >= qs->fl[qnum].size)
2889 		return (EINVAL);
2890 	memcpy(data, &qs->fl[qnum].desc[idx], sizeof(struct rx_desc));
2891 	return sizeof(struct rx_desc);
2892 }
2893