xref: /linux/drivers/net/ethernet/chelsio/cxgb3/sge.c (revision ff5599816711d2e67da2d7561fd36ac48debd433)
1 /*
2  * Copyright (c) 2005-2008 Chelsio, Inc. All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the
8  * OpenIB.org BSD license below:
9  *
10  *     Redistribution and use in source and binary forms, with or
11  *     without modification, are permitted provided that the following
12  *     conditions are met:
13  *
14  *      - Redistributions of source code must retain the above
15  *        copyright notice, this list of conditions and the following
16  *        disclaimer.
17  *
18  *      - Redistributions in binary form must reproduce the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer in the documentation and/or other materials
21  *        provided with the distribution.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30  * SOFTWARE.
31  */
32 #include <linux/skbuff.h>
33 #include <linux/netdevice.h>
34 #include <linux/etherdevice.h>
35 #include <linux/if_vlan.h>
36 #include <linux/ip.h>
37 #include <linux/tcp.h>
38 #include <linux/dma-mapping.h>
39 #include <linux/slab.h>
40 #include <linux/prefetch.h>
41 #include <net/arp.h>
42 #include "common.h"
43 #include "regs.h"
44 #include "sge_defs.h"
45 #include "t3_cpl.h"
46 #include "firmware_exports.h"
47 #include "cxgb3_offload.h"
48 
49 #define USE_GTS 0
50 
51 #define SGE_RX_SM_BUF_SIZE 1536
52 
53 #define SGE_RX_COPY_THRES  256
54 #define SGE_RX_PULL_LEN    128
55 
56 #define SGE_PG_RSVD SMP_CACHE_BYTES
57 /*
58  * Page chunk size for FL0 buffers if FL0 is to be populated with page chunks.
59  * It must be a divisor of PAGE_SIZE.  If set to 0 FL0 will use sk_buffs
60  * directly.
61  */
62 #define FL0_PG_CHUNK_SIZE  2048
63 #define FL0_PG_ORDER 0
64 #define FL0_PG_ALLOC_SIZE (PAGE_SIZE << FL0_PG_ORDER)
65 #define FL1_PG_CHUNK_SIZE (PAGE_SIZE > 8192 ? 16384 : 8192)
66 #define FL1_PG_ORDER (PAGE_SIZE > 8192 ? 0 : 1)
67 #define FL1_PG_ALLOC_SIZE (PAGE_SIZE << FL1_PG_ORDER)
68 
69 #define SGE_RX_DROP_THRES 16
70 #define RX_RECLAIM_PERIOD (HZ/4)
71 
72 /*
73  * Max number of Rx buffers we replenish at a time.
74  */
75 #define MAX_RX_REFILL 16U
76 /*
77  * Period of the Tx buffer reclaim timer.  This timer does not need to run
78  * frequently as Tx buffers are usually reclaimed by new Tx packets.
79  */
80 #define TX_RECLAIM_PERIOD (HZ / 4)
81 #define TX_RECLAIM_TIMER_CHUNK 64U
82 #define TX_RECLAIM_CHUNK 16U
83 
84 /* WR size in bytes */
85 #define WR_LEN (WR_FLITS * 8)
86 
87 /*
88  * Types of Tx queues in each queue set.  Order here matters; do not change.
89  */
90 enum { TXQ_ETH, TXQ_OFLD, TXQ_CTRL };
91 
92 /* Values for sge_txq.flags */
93 enum {
94 	TXQ_RUNNING = 1 << 0,	/* fetch engine is running */
95 	TXQ_LAST_PKT_DB = 1 << 1,	/* last packet rang the doorbell */
96 };
97 
98 struct tx_desc {
99 	__be64 flit[TX_DESC_FLITS];
100 };
101 
102 struct rx_desc {
103 	__be32 addr_lo;
104 	__be32 len_gen;
105 	__be32 gen2;
106 	__be32 addr_hi;
107 };
108 
109 struct tx_sw_desc {		/* SW state per Tx descriptor */
110 	struct sk_buff *skb;
111 	u8 eop;       /* set if last descriptor for packet */
112 	u8 addr_idx;  /* buffer index of first SGL entry in descriptor */
113 	u8 fragidx;   /* first page fragment associated with descriptor */
114 	s8 sflit;     /* start flit of first SGL entry in descriptor */
115 };
116 
117 struct rx_sw_desc {                /* SW state per Rx descriptor */
118 	union {
119 		struct sk_buff *skb;
120 		struct fl_pg_chunk pg_chunk;
121 	};
122 	DEFINE_DMA_UNMAP_ADDR(dma_addr);
123 };
124 
125 struct rsp_desc {		/* response queue descriptor */
126 	struct rss_header rss_hdr;
127 	__be32 flags;
128 	__be32 len_cq;
129 	u8 imm_data[47];
130 	u8 intr_gen;
131 };
132 
133 /*
134  * Holds unmapping information for Tx packets that need deferred unmapping.
135  * This structure lives at skb->head and must be allocated by callers.
136  */
137 struct deferred_unmap_info {
138 	struct pci_dev *pdev;
139 	dma_addr_t addr[MAX_SKB_FRAGS + 1];
140 };
141 
142 /*
143  * Maps a number of flits to the number of Tx descriptors that can hold them.
144  * The formula is
145  *
146  * desc = 1 + (flits - 2) / (WR_FLITS - 1).
147  *
148  * HW allows up to 4 descriptors to be combined into a WR.
149  */
150 static u8 flit_desc_map[] = {
151 	0,
152 #if SGE_NUM_GENBITS == 1
153 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
154 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
155 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
156 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
157 #elif SGE_NUM_GENBITS == 2
158 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
159 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
160 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
161 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
162 #else
163 # error "SGE_NUM_GENBITS must be 1 or 2"
164 #endif
165 };
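
/*
 * For illustration: checking one entry of the table against the formula,
 * flits = 2 gives desc = 1 + (2 - 2) / (WR_FLITS - 1) = 1, matching
 * flit_desc_map[2] above.  The SGE_NUM_GENBITS == 2 rows are one entry
 * shorter per row because the last flit of each descriptor is reserved for
 * the second generation bit (see wr_gen2() below).
 */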
166 
167 static inline struct sge_qset *fl_to_qset(const struct sge_fl *q, int qidx)
168 {
169 	return container_of(q, struct sge_qset, fl[qidx]);
170 }
171 
172 static inline struct sge_qset *rspq_to_qset(const struct sge_rspq *q)
173 {
174 	return container_of(q, struct sge_qset, rspq);
175 }
176 
177 static inline struct sge_qset *txq_to_qset(const struct sge_txq *q, int qidx)
178 {
179 	return container_of(q, struct sge_qset, txq[qidx]);
180 }
181 
182 /**
183  *	refill_rspq - replenish an SGE response queue
184  *	@adapter: the adapter
185  *	@q: the response queue to replenish
186  *	@credits: how many new responses to make available
187  *
188  *	Replenishes a response queue by making the supplied number of responses
189  *	available to HW.
190  */
191 static inline void refill_rspq(struct adapter *adapter,
192 			       const struct sge_rspq *q, unsigned int credits)
193 {
194 	rmb();
195 	t3_write_reg(adapter, A_SG_RSPQ_CREDIT_RETURN,
196 		     V_RSPQ(q->cntxt_id) | V_CREDITS(credits));
197 }
198 
199 /**
200  *	need_skb_unmap - does the platform need unmapping of sk_buffs?
201  *
202  *	Returns true if the platform needs sk_buff unmapping.  The compiler
203  *	optimizes away the unmapping code when this returns false.
204  */
205 static inline int need_skb_unmap(void)
206 {
207 #ifdef CONFIG_NEED_DMA_MAP_STATE
208 	return 1;
209 #else
210 	return 0;
211 #endif
212 }
213 
214 /**
215  *	unmap_skb - unmap a packet main body and its page fragments
216  *	@skb: the packet
217  *	@q: the Tx queue containing Tx descriptors for the packet
218  *	@cidx: index of Tx descriptor
219  *	@pdev: the PCI device
220  *
221  *	Unmap the main body of an sk_buff and its page fragments, if any.
222  *	Because of the fairly complicated structure of our SGLs and the desire
223  *	to conserve space for metadata, the information necessary to unmap an
224  *	sk_buff is spread across the sk_buff itself (buffer lengths), the HW Tx
225  *	descriptors (the physical addresses of the various data buffers), and
226  *	the SW descriptor state (assorted indices).  The send functions
227  *	initialize the indices for the first packet descriptor so we can unmap
228  *	the buffers held in the first Tx descriptor here, and we have enough
229  *	information at this point to set the state for the next Tx descriptor.
230  *
231  *	Note that it is possible to clean up the first descriptor of a packet
232  *	before the send routines have written the next descriptors, but this
233  *	race does not cause any problem.  We just end up writing the unmapping
234  *	info for the descriptor first.
235  */
236 static inline void unmap_skb(struct sk_buff *skb, struct sge_txq *q,
237 			     unsigned int cidx, struct pci_dev *pdev)
238 {
239 	const struct sg_ent *sgp;
240 	struct tx_sw_desc *d = &q->sdesc[cidx];
241 	int nfrags, frag_idx, curflit, j = d->addr_idx;
242 
243 	sgp = (struct sg_ent *)&q->desc[cidx].flit[d->sflit];
244 	frag_idx = d->fragidx;
245 
246 	if (frag_idx == 0 && skb_headlen(skb)) {
247 		pci_unmap_single(pdev, be64_to_cpu(sgp->addr[0]),
248 				 skb_headlen(skb), PCI_DMA_TODEVICE);
249 		j = 1;
250 	}
251 
252 	curflit = d->sflit + 1 + j;
253 	nfrags = skb_shinfo(skb)->nr_frags;
254 
255 	while (frag_idx < nfrags && curflit < WR_FLITS) {
256 		pci_unmap_page(pdev, be64_to_cpu(sgp->addr[j]),
257 			       skb_frag_size(&skb_shinfo(skb)->frags[frag_idx]),
258 			       PCI_DMA_TODEVICE);
259 		j ^= 1;
260 		if (j == 0) {
261 			sgp++;
262 			curflit++;
263 		}
264 		curflit++;
265 		frag_idx++;
266 	}
267 
268 	if (frag_idx < nfrags) {   /* SGL continues into next Tx descriptor */
269 		d = cidx + 1 == q->size ? q->sdesc : d + 1;
270 		d->fragidx = frag_idx;
271 		d->addr_idx = j;
272 		d->sflit = curflit - WR_FLITS - j; /* sflit can be -1 */
273 	}
274 }
275 
276 /**
277  *	free_tx_desc - reclaims Tx descriptors and their buffers
278  *	@adapter: the adapter
279  *	@q: the Tx queue to reclaim descriptors from
280  *	@n: the number of descriptors to reclaim
281  *
282  *	Reclaims Tx descriptors from an SGE Tx queue and frees the associated
283  *	Tx buffers.  Called with the Tx queue lock held.
284  */
285 static void free_tx_desc(struct adapter *adapter, struct sge_txq *q,
286 			 unsigned int n)
287 {
288 	struct tx_sw_desc *d;
289 	struct pci_dev *pdev = adapter->pdev;
290 	unsigned int cidx = q->cidx;
291 
292 	const int need_unmap = need_skb_unmap() &&
293 			       q->cntxt_id >= FW_TUNNEL_SGEEC_START;
294 
295 	d = &q->sdesc[cidx];
296 	while (n--) {
297 		if (d->skb) {	/* an SGL is present */
298 			if (need_unmap)
299 				unmap_skb(d->skb, q, cidx, pdev);
300 			if (d->eop) {
301 				kfree_skb(d->skb);
302 				d->skb = NULL;
303 			}
304 		}
305 		++d;
306 		if (++cidx == q->size) {
307 			cidx = 0;
308 			d = q->sdesc;
309 		}
310 	}
311 	q->cidx = cidx;
312 }
313 
314 /**
315  *	reclaim_completed_tx - reclaims completed Tx descriptors
316  *	@adapter: the adapter
317  *	@q: the Tx queue to reclaim completed descriptors from
318  *	@chunk: maximum number of descriptors to reclaim
319  *
320  *	Reclaims Tx descriptors that the SGE has indicated it has processed,
321  *	and frees the associated buffers if possible.  Called with the Tx
322  *	queue's lock held.
323  */
324 static inline unsigned int reclaim_completed_tx(struct adapter *adapter,
325 						struct sge_txq *q,
326 						unsigned int chunk)
327 {
328 	unsigned int reclaim = q->processed - q->cleaned;
329 
330 	reclaim = min(chunk, reclaim);
331 	if (reclaim) {
332 		free_tx_desc(adapter, q, reclaim);
333 		q->cleaned += reclaim;
334 		q->in_use -= reclaim;
335 	}
336 	return q->processed - q->cleaned;
337 }
338 
339 /**
340  *	should_restart_tx - are there enough resources to restart a Tx queue?
341  *	@q: the Tx queue
342  *
343  *	Checks if there are enough descriptors to restart a suspended Tx queue.
344  */
345 static inline int should_restart_tx(const struct sge_txq *q)
346 {
347 	unsigned int r = q->processed - q->cleaned;
348 
349 	return q->in_use - r < (q->size >> 1);
350 }
351 
352 static void clear_rx_desc(struct pci_dev *pdev, const struct sge_fl *q,
353 			  struct rx_sw_desc *d)
354 {
355 	if (q->use_pages && d->pg_chunk.page) {
356 		(*d->pg_chunk.p_cnt)--;
357 		if (!*d->pg_chunk.p_cnt)
358 			pci_unmap_page(pdev,
359 				       d->pg_chunk.mapping,
360 				       q->alloc_size, PCI_DMA_FROMDEVICE);
361 
362 		put_page(d->pg_chunk.page);
363 		d->pg_chunk.page = NULL;
364 	} else {
365 		pci_unmap_single(pdev, dma_unmap_addr(d, dma_addr),
366 				 q->buf_size, PCI_DMA_FROMDEVICE);
367 		kfree_skb(d->skb);
368 		d->skb = NULL;
369 	}
370 }
371 
372 /**
373  *	free_rx_bufs - free the Rx buffers on an SGE free list
374  *	@pdev: the PCI device associated with the adapter
375  *	@q: the SGE free list to clean up
376  *
377  *	Release the buffers on an SGE free-buffer Rx queue.  HW fetching from
378  *	this queue should be stopped before calling this function.
379  */
380 static void free_rx_bufs(struct pci_dev *pdev, struct sge_fl *q)
381 {
382 	unsigned int cidx = q->cidx;
383 
384 	while (q->credits--) {
385 		struct rx_sw_desc *d = &q->sdesc[cidx];
386 
387 
388 		clear_rx_desc(pdev, q, d);
389 		if (++cidx == q->size)
390 			cidx = 0;
391 	}
392 
393 	if (q->pg_chunk.page) {
394 		__free_pages(q->pg_chunk.page, q->order);
395 		q->pg_chunk.page = NULL;
396 	}
397 }
398 
399 /**
400  *	add_one_rx_buf - add a packet buffer to a free-buffer list
401  *	@va:  buffer start VA
402  *	@len: the buffer length
403  *	@d: the HW Rx descriptor to write
404  *	@sd: the SW Rx descriptor to write
405  *	@gen: the generation bit value
406  *	@pdev: the PCI device associated with the adapter
407  *
408  *	Add a buffer of the given length to the supplied HW and SW Rx
409  *	descriptors.
410  */
411 static inline int add_one_rx_buf(void *va, unsigned int len,
412 				 struct rx_desc *d, struct rx_sw_desc *sd,
413 				 unsigned int gen, struct pci_dev *pdev)
414 {
415 	dma_addr_t mapping;
416 
417 	mapping = pci_map_single(pdev, va, len, PCI_DMA_FROMDEVICE);
418 	if (unlikely(pci_dma_mapping_error(pdev, mapping)))
419 		return -ENOMEM;
420 
421 	dma_unmap_addr_set(sd, dma_addr, mapping);
422 
423 	d->addr_lo = cpu_to_be32(mapping);
424 	d->addr_hi = cpu_to_be32((u64) mapping >> 32);
425 	wmb();
426 	d->len_gen = cpu_to_be32(V_FLD_GEN1(gen));
427 	d->gen2 = cpu_to_be32(V_FLD_GEN2(gen));
428 	return 0;
429 }
430 
431 static inline int add_one_rx_chunk(dma_addr_t mapping, struct rx_desc *d,
432 				   unsigned int gen)
433 {
434 	d->addr_lo = cpu_to_be32(mapping);
435 	d->addr_hi = cpu_to_be32((u64) mapping >> 32);
436 	wmb();
437 	d->len_gen = cpu_to_be32(V_FLD_GEN1(gen));
438 	d->gen2 = cpu_to_be32(V_FLD_GEN2(gen));
439 	return 0;
440 }
441 
442 static int alloc_pg_chunk(struct adapter *adapter, struct sge_fl *q,
443 			  struct rx_sw_desc *sd, gfp_t gfp,
444 			  unsigned int order)
445 {
446 	if (!q->pg_chunk.page) {
447 		dma_addr_t mapping;
448 
449 		q->pg_chunk.page = alloc_pages(gfp, order);
450 		if (unlikely(!q->pg_chunk.page))
451 			return -ENOMEM;
452 		q->pg_chunk.va = page_address(q->pg_chunk.page);
453 		q->pg_chunk.p_cnt = q->pg_chunk.va + (PAGE_SIZE << order) -
454 				    SGE_PG_RSVD;
455 		q->pg_chunk.offset = 0;
456 		mapping = pci_map_page(adapter->pdev, q->pg_chunk.page,
457 				       0, q->alloc_size, PCI_DMA_FROMDEVICE);
458 		if (unlikely(pci_dma_mapping_error(adapter->pdev, mapping))) {
459 			__free_pages(q->pg_chunk.page, order);
460 			q->pg_chunk.page = NULL;
461 			return -EIO;
462 		}
463 		q->pg_chunk.mapping = mapping;
464 	}
465 	sd->pg_chunk = q->pg_chunk;
466 
467 	prefetch(sd->pg_chunk.p_cnt);
468 
469 	q->pg_chunk.offset += q->buf_size;
470 	if (q->pg_chunk.offset == (PAGE_SIZE << order))
471 		q->pg_chunk.page = NULL;
472 	else {
473 		q->pg_chunk.va += q->buf_size;
474 		get_page(q->pg_chunk.page);
475 	}
476 
477 	if (sd->pg_chunk.offset == 0)
478 		*sd->pg_chunk.p_cnt = 1;
479 	else
480 		*sd->pg_chunk.p_cnt += 1;
481 
482 	return 0;
483 }
484 
485 static inline void ring_fl_db(struct adapter *adap, struct sge_fl *q)
486 {
487 	if (q->pend_cred >= q->credits / 4) {
488 		q->pend_cred = 0;
489 		wmb();
490 		t3_write_reg(adap, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
491 	}
492 }
493 
494 /**
495  *	refill_fl - refill an SGE free-buffer list
496  *	@adap: the adapter
497  *	@q: the free-list to refill
498  *	@n: the number of new buffers to allocate
499  *	@gfp: the gfp flags for allocating new buffers
500  *
501  *	(Re)populate an SGE free-buffer list with up to @n new packet buffers,
502  *	allocated with the supplied gfp flags.  The caller must ensure that
503  *	@n does not exceed the queue's capacity.
504  */
505 static int refill_fl(struct adapter *adap, struct sge_fl *q, int n, gfp_t gfp)
506 {
507 	struct rx_sw_desc *sd = &q->sdesc[q->pidx];
508 	struct rx_desc *d = &q->desc[q->pidx];
509 	unsigned int count = 0;
510 
511 	while (n--) {
512 		dma_addr_t mapping;
513 		int err;
514 
515 		if (q->use_pages) {
516 			if (unlikely(alloc_pg_chunk(adap, q, sd, gfp,
517 						    q->order))) {
518 nomem:				q->alloc_failed++;
519 				break;
520 			}
521 			mapping = sd->pg_chunk.mapping + sd->pg_chunk.offset;
522 			dma_unmap_addr_set(sd, dma_addr, mapping);
523 
524 			add_one_rx_chunk(mapping, d, q->gen);
525 			pci_dma_sync_single_for_device(adap->pdev, mapping,
526 						q->buf_size - SGE_PG_RSVD,
527 						PCI_DMA_FROMDEVICE);
528 		} else {
529 			void *buf_start;
530 
531 			struct sk_buff *skb = alloc_skb(q->buf_size, gfp);
532 			if (!skb)
533 				goto nomem;
534 
535 			sd->skb = skb;
536 			buf_start = skb->data;
537 			err = add_one_rx_buf(buf_start, q->buf_size, d, sd,
538 					     q->gen, adap->pdev);
539 			if (unlikely(err)) {
540 				clear_rx_desc(adap->pdev, q, sd);
541 				break;
542 			}
543 		}
544 
545 		d++;
546 		sd++;
547 		if (++q->pidx == q->size) {
548 			q->pidx = 0;
549 			q->gen ^= 1;
550 			sd = q->sdesc;
551 			d = q->desc;
552 		}
553 		count++;
554 	}
555 
556 	q->credits += count;
557 	q->pend_cred += count;
558 	ring_fl_db(adap, q);
559 
560 	return count;
561 }
562 
563 static inline void __refill_fl(struct adapter *adap, struct sge_fl *fl)
564 {
565 	refill_fl(adap, fl, min(MAX_RX_REFILL, fl->size - fl->credits),
566 		  GFP_ATOMIC | __GFP_COMP);
567 }
568 
569 /**
570  *	recycle_rx_buf - recycle a receive buffer
571  *	@adap: the adapter
572  *	@q: the SGE free list
573  *	@idx: index of buffer to recycle
574  *
575  *	Recycles the specified buffer on the given free list by adding it at
576  *	the next available slot on the list.
577  */
578 static void recycle_rx_buf(struct adapter *adap, struct sge_fl *q,
579 			   unsigned int idx)
580 {
581 	struct rx_desc *from = &q->desc[idx];
582 	struct rx_desc *to = &q->desc[q->pidx];
583 
584 	q->sdesc[q->pidx] = q->sdesc[idx];
585 	to->addr_lo = from->addr_lo;	/* already big endian */
586 	to->addr_hi = from->addr_hi;	/* likewise */
587 	wmb();
588 	to->len_gen = cpu_to_be32(V_FLD_GEN1(q->gen));
589 	to->gen2 = cpu_to_be32(V_FLD_GEN2(q->gen));
590 
591 	if (++q->pidx == q->size) {
592 		q->pidx = 0;
593 		q->gen ^= 1;
594 	}
595 
596 	q->credits++;
597 	q->pend_cred++;
598 	ring_fl_db(adap, q);
599 }
600 
601 /**
602  *	alloc_ring - allocate resources for an SGE descriptor ring
603  *	@pdev: the PCI device
604  *	@nelem: the number of descriptors
605  *	@elem_size: the size of each descriptor
606  *	@sw_size: the size of the SW state associated with each ring element
607  *	@phys: the physical address of the allocated ring
608  *	@metadata: address of the array holding the SW state for the ring
609  *
610  *	Allocates resources for an SGE descriptor ring, such as Tx queues,
611  *	free buffer lists, or response queues.  Each SGE ring requires
612  *	space for its HW descriptors plus, optionally, space for the SW state
613  *	associated with each HW entry (the metadata).  The function returns
614  *	three values: the virtual address for the HW ring (the return value
615  *	of the function), the physical address of the HW ring, and the address
616  *	of the SW ring.
617  */
618 static void *alloc_ring(struct pci_dev *pdev, size_t nelem, size_t elem_size,
619 			size_t sw_size, dma_addr_t * phys, void *metadata)
620 {
621 	size_t len = nelem * elem_size;
622 	void *s = NULL;
623 	void *p = dma_alloc_coherent(&pdev->dev, len, phys, GFP_KERNEL);
624 
625 	if (!p)
626 		return NULL;
627 	if (sw_size && metadata) {
628 		s = kcalloc(nelem, sw_size, GFP_KERNEL);
629 
630 		if (!s) {
631 			dma_free_coherent(&pdev->dev, len, p, *phys);
632 			return NULL;
633 		}
634 		*(void **)metadata = s;
635 	}
636 	memset(p, 0, len);
637 	return p;
638 }
639 
640 /**
641  *	t3_reset_qset - reset a sge qset
642  *	@q: the queue set
643  *
644  *	Reset the qset structure.  The NAPI structure is preserved in
645  *	the event of the qset's reincarnation, for example during EEH
646  *	recovery.
647  */
648 static void t3_reset_qset(struct sge_qset *q)
649 {
650 	if (q->adap &&
651 	    !(q->adap->flags & NAPI_INIT)) {
652 		memset(q, 0, sizeof(*q));
653 		return;
654 	}
655 
656 	q->adap = NULL;
657 	memset(&q->rspq, 0, sizeof(q->rspq));
658 	memset(q->fl, 0, sizeof(struct sge_fl) * SGE_RXQ_PER_SET);
659 	memset(q->txq, 0, sizeof(struct sge_txq) * SGE_TXQ_PER_SET);
660 	q->txq_stopped = 0;
661 	q->tx_reclaim_timer.function = NULL; /* for t3_stop_sge_timers() */
662 	q->rx_reclaim_timer.function = NULL;
663 	q->nomem = 0;
664 	napi_free_frags(&q->napi);
665 }
666 
667 
668 /**
669  *	t3_free_qset - free the resources of an SGE queue set
670  *	@adapter: the adapter owning the queue set
671  *	@q: the queue set
672  *
673  *	Release the HW and SW resources associated with an SGE queue set, such
674  *	as HW contexts, packet buffers, and descriptor rings.  Traffic to the
675  *	queue set must be quiesced prior to calling this.
676  */
677 static void t3_free_qset(struct adapter *adapter, struct sge_qset *q)
678 {
679 	int i;
680 	struct pci_dev *pdev = adapter->pdev;
681 
682 	for (i = 0; i < SGE_RXQ_PER_SET; ++i)
683 		if (q->fl[i].desc) {
684 			spin_lock_irq(&adapter->sge.reg_lock);
685 			t3_sge_disable_fl(adapter, q->fl[i].cntxt_id);
686 			spin_unlock_irq(&adapter->sge.reg_lock);
687 			free_rx_bufs(pdev, &q->fl[i]);
688 			kfree(q->fl[i].sdesc);
689 			dma_free_coherent(&pdev->dev,
690 					  q->fl[i].size *
691 					  sizeof(struct rx_desc), q->fl[i].desc,
692 					  q->fl[i].phys_addr);
693 		}
694 
695 	for (i = 0; i < SGE_TXQ_PER_SET; ++i)
696 		if (q->txq[i].desc) {
697 			spin_lock_irq(&adapter->sge.reg_lock);
698 			t3_sge_enable_ecntxt(adapter, q->txq[i].cntxt_id, 0);
699 			spin_unlock_irq(&adapter->sge.reg_lock);
700 			if (q->txq[i].sdesc) {
701 				free_tx_desc(adapter, &q->txq[i],
702 					     q->txq[i].in_use);
703 				kfree(q->txq[i].sdesc);
704 			}
705 			dma_free_coherent(&pdev->dev,
706 					  q->txq[i].size *
707 					  sizeof(struct tx_desc),
708 					  q->txq[i].desc, q->txq[i].phys_addr);
709 			__skb_queue_purge(&q->txq[i].sendq);
710 		}
711 
712 	if (q->rspq.desc) {
713 		spin_lock_irq(&adapter->sge.reg_lock);
714 		t3_sge_disable_rspcntxt(adapter, q->rspq.cntxt_id);
715 		spin_unlock_irq(&adapter->sge.reg_lock);
716 		dma_free_coherent(&pdev->dev,
717 				  q->rspq.size * sizeof(struct rsp_desc),
718 				  q->rspq.desc, q->rspq.phys_addr);
719 	}
720 
721 	t3_reset_qset(q);
722 }
723 
724 /**
725  *	init_qset_cntxt - initialize an SGE queue set context info
726  *	@qs: the queue set
727  *	@id: the queue set id
728  *
729  *	Initializes the TIDs and context ids for the queues of a queue set.
730  */
731 static void init_qset_cntxt(struct sge_qset *qs, unsigned int id)
732 {
733 	qs->rspq.cntxt_id = id;
734 	qs->fl[0].cntxt_id = 2 * id;
735 	qs->fl[1].cntxt_id = 2 * id + 1;
736 	qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id;
737 	qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id;
738 	qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id;
739 	qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id;
740 	qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id;
741 }
742 
743 /**
744  *	sgl_len - calculates the size of an SGL of the given capacity
745  *	@n: the number of SGL entries
746  *
747  *	Calculates the number of flits needed for a scatter/gather list that
748  *	can hold the given number of entries.
749  */
750 static inline unsigned int sgl_len(unsigned int n)
751 {
752 	/* alternatively: 3 * (n / 2) + 2 * (n & 1) */
753 	return (3 * n) / 2 + (n & 1);
754 }
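
/*
 * Worked example of the formula above: each struct sg_ent packs two 64-bit
 * addresses and two 32-bit lengths into 3 flits, so a 5-entry SGL needs
 * sgl_len(5) = 3 * 5 / 2 + (5 & 1) = 7 + 1 = 8 flits: two full sg_ents
 * (6 flits) plus a half-used third one (2 flits).
 */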
755 
756 /**
757  *	flits_to_desc - returns the num of Tx descriptors for the given flits
758  *	@n: the number of flits
759  *
760  *	Calculates the number of Tx descriptors needed for the supplied number
761  *	of flits.
762  */
763 static inline unsigned int flits_to_desc(unsigned int n)
764 {
765 	BUG_ON(n >= ARRAY_SIZE(flit_desc_map));
766 	return flit_desc_map[n];
767 }
768 
769 /**
770  *	get_packet - return the next ingress packet buffer from a free list
771  *	@adap: the adapter that received the packet
772  *	@fl: the SGE free list holding the packet
773  *	@len: the packet length including any SGE padding
774  *	@drop_thres: # of remaining buffers before we start dropping packets
775  *
776  *	Get the next packet from a free list and complete setup of the
777  *	sk_buff.  If the packet is small we make a copy and recycle the
778  *	original buffer, otherwise we use the original buffer itself.  If a
779  *	positive drop threshold is supplied packets are dropped and their
780  *	buffers recycled if (a) the number of remaining buffers is under the
781  *	threshold and the packet is too big to copy, or (b) the packet should
782  *	be copied but there is no memory for the copy.
783  */
784 static struct sk_buff *get_packet(struct adapter *adap, struct sge_fl *fl,
785 				  unsigned int len, unsigned int drop_thres)
786 {
787 	struct sk_buff *skb = NULL;
788 	struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];
789 
790 	prefetch(sd->skb->data);
791 	fl->credits--;
792 
793 	if (len <= SGE_RX_COPY_THRES) {
794 		skb = alloc_skb(len, GFP_ATOMIC);
795 		if (likely(skb != NULL)) {
796 			__skb_put(skb, len);
797 			pci_dma_sync_single_for_cpu(adap->pdev,
798 					    dma_unmap_addr(sd, dma_addr), len,
799 					    PCI_DMA_FROMDEVICE);
800 			memcpy(skb->data, sd->skb->data, len);
801 			pci_dma_sync_single_for_device(adap->pdev,
802 					    dma_unmap_addr(sd, dma_addr), len,
803 					    PCI_DMA_FROMDEVICE);
804 		} else if (!drop_thres)
805 			goto use_orig_buf;
806 recycle:
807 		recycle_rx_buf(adap, fl, fl->cidx);
808 		return skb;
809 	}
810 
811 	if (unlikely(fl->credits < drop_thres) &&
812 	    refill_fl(adap, fl, min(MAX_RX_REFILL, fl->size - fl->credits - 1),
813 		      GFP_ATOMIC | __GFP_COMP) == 0)
814 		goto recycle;
815 
816 use_orig_buf:
817 	pci_unmap_single(adap->pdev, dma_unmap_addr(sd, dma_addr),
818 			 fl->buf_size, PCI_DMA_FROMDEVICE);
819 	skb = sd->skb;
820 	skb_put(skb, len);
821 	__refill_fl(adap, fl);
822 	return skb;
823 }
824 
825 /**
826  *	get_packet_pg - return the next ingress packet buffer from a free list
827  *	@adap: the adapter that received the packet
828  *	@fl: the SGE free list holding the packet
829  *	@len: the packet length including any SGE padding
830  *	@drop_thres: # of remaining buffers before we start dropping packets
831  *
832  *	Get the next packet from a free list populated with page chunks.
833  *	If the packet is small we make a copy and recycle the original buffer,
834  *	otherwise we attach the original buffer as a page fragment to a fresh
835  *	sk_buff.  If a positive drop threshold is supplied packets are dropped
836  *	and their buffers recycled if (a) the number of remaining buffers is
837  *	under the threshold and the packet is too big to copy, or (b) there's
838  *	no system memory.
839  *
840  * 	Note: this function is similar to @get_packet but deals with Rx buffers
841  * 	that are page chunks rather than sk_buffs.
842  */
843 static struct sk_buff *get_packet_pg(struct adapter *adap, struct sge_fl *fl,
844 				     struct sge_rspq *q, unsigned int len,
845 				     unsigned int drop_thres)
846 {
847 	struct sk_buff *newskb, *skb;
848 	struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];
849 
850 	dma_addr_t dma_addr = dma_unmap_addr(sd, dma_addr);
851 
852 	newskb = skb = q->pg_skb;
853 	if (!skb && (len <= SGE_RX_COPY_THRES)) {
854 		newskb = alloc_skb(len, GFP_ATOMIC);
855 		if (likely(newskb != NULL)) {
856 			__skb_put(newskb, len);
857 			pci_dma_sync_single_for_cpu(adap->pdev, dma_addr, len,
858 					    PCI_DMA_FROMDEVICE);
859 			memcpy(newskb->data, sd->pg_chunk.va, len);
860 			pci_dma_sync_single_for_device(adap->pdev, dma_addr,
861 						       len,
862 						       PCI_DMA_FROMDEVICE);
863 		} else if (!drop_thres)
864 			return NULL;
865 recycle:
866 		fl->credits--;
867 		recycle_rx_buf(adap, fl, fl->cidx);
868 		q->rx_recycle_buf++;
869 		return newskb;
870 	}
871 
872 	if (unlikely(q->rx_recycle_buf || (!skb && fl->credits <= drop_thres)))
873 		goto recycle;
874 
875 	prefetch(sd->pg_chunk.p_cnt);
876 
877 	if (!skb)
878 		newskb = alloc_skb(SGE_RX_PULL_LEN, GFP_ATOMIC);
879 
880 	if (unlikely(!newskb)) {
881 		if (!drop_thres)
882 			return NULL;
883 		goto recycle;
884 	}
885 
886 	pci_dma_sync_single_for_cpu(adap->pdev, dma_addr, len,
887 				    PCI_DMA_FROMDEVICE);
888 	(*sd->pg_chunk.p_cnt)--;
889 	if (!*sd->pg_chunk.p_cnt && sd->pg_chunk.page != fl->pg_chunk.page)
890 		pci_unmap_page(adap->pdev,
891 			       sd->pg_chunk.mapping,
892 			       fl->alloc_size,
893 			       PCI_DMA_FROMDEVICE);
894 	if (!skb) {
895 		__skb_put(newskb, SGE_RX_PULL_LEN);
896 		memcpy(newskb->data, sd->pg_chunk.va, SGE_RX_PULL_LEN);
897 		skb_fill_page_desc(newskb, 0, sd->pg_chunk.page,
898 				   sd->pg_chunk.offset + SGE_RX_PULL_LEN,
899 				   len - SGE_RX_PULL_LEN);
900 		newskb->len = len;
901 		newskb->data_len = len - SGE_RX_PULL_LEN;
902 		newskb->truesize += newskb->data_len;
903 	} else {
904 		skb_fill_page_desc(newskb, skb_shinfo(newskb)->nr_frags,
905 				   sd->pg_chunk.page,
906 				   sd->pg_chunk.offset, len);
907 		newskb->len += len;
908 		newskb->data_len += len;
909 		newskb->truesize += len;
910 	}
911 
912 	fl->credits--;
913 	/*
914 	 * We do not refill FLs here, we let the caller do it to overlap a
915 	 * prefetch.
916 	 */
917 	return newskb;
918 }
919 
920 /**
921  *	get_imm_packet - return the next ingress packet buffer from a response
922  *	@resp: the response descriptor containing the packet data
923  *
924  *	Return a packet containing the immediate data of the given response.
925  */
926 static inline struct sk_buff *get_imm_packet(const struct rsp_desc *resp)
927 {
928 	struct sk_buff *skb = alloc_skb(IMMED_PKT_SIZE, GFP_ATOMIC);
929 
930 	if (skb) {
931 		__skb_put(skb, IMMED_PKT_SIZE);
932 		skb_copy_to_linear_data(skb, resp->imm_data, IMMED_PKT_SIZE);
933 	}
934 	return skb;
935 }
936 
937 /**
938  *	calc_tx_descs - calculate the number of Tx descriptors for a packet
939  *	@skb: the packet
940  *
941  * 	Returns the number of Tx descriptors needed for the given Ethernet
942  * 	packet.  Ethernet packets require addition of WR and CPL headers.
943  */
944 static inline unsigned int calc_tx_descs(const struct sk_buff *skb)
945 {
946 	unsigned int flits;
947 
948 	if (skb->len <= WR_LEN - sizeof(struct cpl_tx_pkt))
949 		return 1;
950 
951 	flits = sgl_len(skb_shinfo(skb)->nr_frags + 1) + 2;
952 	if (skb_shinfo(skb)->gso_size)
953 		flits++;
954 	return flits_to_desc(flits);
955 }
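
/*
 * Example (a sketch): a TSO packet with three page fragments needs
 * sgl_len(3 + 1) = 6 SGL flits, plus 2 flits of WR/CPL header and 1 flit
 * for the LSO information, i.e. 9 flits in total, which flits_to_desc()
 * maps to a single Tx descriptor.
 */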
956 
957 
958 /*	map_skb - map a packet main body and its page fragments
959  *	@pdev: the PCI device
960  *	@skb: the packet
961  *	@addr: placeholder to save the mapped addresses
962  *
963  *	Map the main body of an sk_buff and its page fragments, if any.
964  */
965 static int map_skb(struct pci_dev *pdev, const struct sk_buff *skb,
966 		   dma_addr_t *addr)
967 {
968 	const skb_frag_t *fp, *end;
969 	const struct skb_shared_info *si;
970 
971 	*addr = pci_map_single(pdev, skb->data, skb_headlen(skb),
972 			       PCI_DMA_TODEVICE);
973 	if (pci_dma_mapping_error(pdev, *addr))
974 		goto out_err;
975 
976 	si = skb_shinfo(skb);
977 	end = &si->frags[si->nr_frags];
978 
979 	for (fp = si->frags; fp < end; fp++) {
980 		*++addr = skb_frag_dma_map(&pdev->dev, fp, 0, skb_frag_size(fp),
981 					   DMA_TO_DEVICE);
982 		if (pci_dma_mapping_error(pdev, *addr))
983 			goto unwind;
984 	}
985 	return 0;
986 
987 unwind:
988 	while (fp-- > si->frags)
989 		dma_unmap_page(&pdev->dev, *--addr, skb_frag_size(fp),
990 			       DMA_TO_DEVICE);
991 
992 	pci_unmap_single(pdev, addr[-1], skb_headlen(skb), PCI_DMA_TODEVICE);
993 out_err:
994 	return -ENOMEM;
995 }
996 
997 /**
998  *	write_sgl - populate a scatter/gather list for a packet
999  *	@skb: the packet
1000  *	@sgp: the SGL to populate
1001  *	@start: start address of skb main body data to include in the SGL
1002  *	@len: length of skb main body data to include in the SGL
1003  *	@addr: the list of the mapped addresses
1004  *
1005  *	Copies the scatter/gather list for the buffers that make up a packet
1006  *	and returns the SGL size in 8-byte words.  The caller must size the SGL
1007  *	appropriately.
1008  */
1009 static inline unsigned int write_sgl(const struct sk_buff *skb,
1010 				    struct sg_ent *sgp, unsigned char *start,
1011 				    unsigned int len, const dma_addr_t *addr)
1012 {
1013 	unsigned int i, j = 0, k = 0, nfrags;
1014 
1015 	if (len) {
1016 		sgp->len[0] = cpu_to_be32(len);
1017 		sgp->addr[j++] = cpu_to_be64(addr[k++]);
1018 	}
1019 
1020 	nfrags = skb_shinfo(skb)->nr_frags;
1021 	for (i = 0; i < nfrags; i++) {
1022 		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
1023 
1024 		sgp->len[j] = cpu_to_be32(skb_frag_size(frag));
1025 		sgp->addr[j] = cpu_to_be64(addr[k++]);
1026 		j ^= 1;
1027 		if (j == 0)
1028 			++sgp;
1029 	}
1030 	if (j)
1031 		sgp->len[j] = 0;
1032 	return ((nfrags + (len != 0)) * 3) / 2 + j;
1033 }
1034 
1035 /**
1036  *	check_ring_tx_db - check and potentially ring a Tx queue's doorbell
1037  *	@adap: the adapter
1038  *	@q: the Tx queue
1039  *
1040  *	Ring the doorbell if a Tx queue is asleep.  There is a natural race
1041  *	where the HW could go to sleep just after we checked; in that case
1042  *	the interrupt handler will detect the outstanding Tx packet and
1043  *	ring the doorbell for us.
1044  *
1045  *	When GTS is disabled we unconditionally ring the doorbell.
1046  */
1047 static inline void check_ring_tx_db(struct adapter *adap, struct sge_txq *q)
1048 {
1049 #if USE_GTS
1050 	clear_bit(TXQ_LAST_PKT_DB, &q->flags);
1051 	if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) {
1052 		set_bit(TXQ_LAST_PKT_DB, &q->flags);
1053 		t3_write_reg(adap, A_SG_KDOORBELL,
1054 			     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1055 	}
1056 #else
1057 	wmb();			/* write descriptors before telling HW */
1058 	t3_write_reg(adap, A_SG_KDOORBELL,
1059 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1060 #endif
1061 }
1062 
1063 static inline void wr_gen2(struct tx_desc *d, unsigned int gen)
1064 {
1065 #if SGE_NUM_GENBITS == 2
1066 	d->flit[TX_DESC_FLITS - 1] = cpu_to_be64(gen);
1067 #endif
1068 }
1069 
1070 /**
1071  *	write_wr_hdr_sgl - write a WR header and, optionally, SGL
1072  *	@ndesc: number of Tx descriptors spanned by the SGL
1073  *	@skb: the packet corresponding to the WR
1074  *	@d: first Tx descriptor to be written
1075  *	@pidx: index of above descriptors
1076  *	@q: the SGE Tx queue
1077  *	@sgl: the SGL
1078  *	@flits: number of flits to the start of the SGL in the first descriptor
1079  *	@sgl_flits: the SGL size in flits
1080  *	@gen: the Tx descriptor generation
1081  *	@wr_hi: top 32 bits of WR header based on WR type (big endian)
1082  *	@wr_lo: low 32 bits of WR header based on WR type (big endian)
1083  *
1084  *	Write a work request header and an associated SGL.  If the SGL is
1085  *	small enough to fit into one Tx descriptor it has already been written
1086  *	and we just need to write the WR header.  Otherwise we distribute the
1087  *	SGL across the number of descriptors it spans.
1088  */
1089 static void write_wr_hdr_sgl(unsigned int ndesc, struct sk_buff *skb,
1090 			     struct tx_desc *d, unsigned int pidx,
1091 			     const struct sge_txq *q,
1092 			     const struct sg_ent *sgl,
1093 			     unsigned int flits, unsigned int sgl_flits,
1094 			     unsigned int gen, __be32 wr_hi,
1095 			     __be32 wr_lo)
1096 {
1097 	struct work_request_hdr *wrp = (struct work_request_hdr *)d;
1098 	struct tx_sw_desc *sd = &q->sdesc[pidx];
1099 
1100 	sd->skb = skb;
1101 	if (need_skb_unmap()) {
1102 		sd->fragidx = 0;
1103 		sd->addr_idx = 0;
1104 		sd->sflit = flits;
1105 	}
1106 
1107 	if (likely(ndesc == 1)) {
1108 		sd->eop = 1;
1109 		wrp->wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1110 				   V_WR_SGLSFLT(flits)) | wr_hi;
1111 		wmb();
1112 		wrp->wr_lo = htonl(V_WR_LEN(flits + sgl_flits) |
1113 				   V_WR_GEN(gen)) | wr_lo;
1114 		wr_gen2(d, gen);
1115 	} else {
1116 		unsigned int ogen = gen;
1117 		const u64 *fp = (const u64 *)sgl;
1118 		struct work_request_hdr *wp = wrp;
1119 
1120 		wrp->wr_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) |
1121 				   V_WR_SGLSFLT(flits)) | wr_hi;
1122 
1123 		while (sgl_flits) {
1124 			unsigned int avail = WR_FLITS - flits;
1125 
1126 			if (avail > sgl_flits)
1127 				avail = sgl_flits;
1128 			memcpy(&d->flit[flits], fp, avail * sizeof(*fp));
1129 			sgl_flits -= avail;
1130 			ndesc--;
1131 			if (!sgl_flits)
1132 				break;
1133 
1134 			fp += avail;
1135 			d++;
1136 			sd->eop = 0;
1137 			sd++;
1138 			if (++pidx == q->size) {
1139 				pidx = 0;
1140 				gen ^= 1;
1141 				d = q->desc;
1142 				sd = q->sdesc;
1143 			}
1144 
1145 			sd->skb = skb;
1146 			wrp = (struct work_request_hdr *)d;
1147 			wrp->wr_hi = htonl(V_WR_DATATYPE(1) |
1148 					   V_WR_SGLSFLT(1)) | wr_hi;
1149 			wrp->wr_lo = htonl(V_WR_LEN(min(WR_FLITS,
1150 							sgl_flits + 1)) |
1151 					   V_WR_GEN(gen)) | wr_lo;
1152 			wr_gen2(d, gen);
1153 			flits = 1;
1154 		}
1155 		sd->eop = 1;
1156 		wrp->wr_hi |= htonl(F_WR_EOP);
1157 		wmb();
1158 		wp->wr_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo;
1159 		wr_gen2((struct tx_desc *)wp, ogen);
1160 		WARN_ON(ndesc != 0);
1161 	}
1162 }
1163 
1164 /**
1165  *	write_tx_pkt_wr - write a TX_PKT work request
1166  *	@adap: the adapter
1167  *	@skb: the packet to send
1168  *	@pi: the egress interface
1169  *	@pidx: index of the first Tx descriptor to write
1170  *	@gen: the generation value to use
1171  *	@q: the Tx queue
1172  *	@ndesc: number of descriptors the packet will occupy
1173  *	@compl: the value of the COMPL bit to use
1174  *
1175  *	Generate a TX_PKT work request to send the supplied packet.
1176  */
1177 static void write_tx_pkt_wr(struct adapter *adap, struct sk_buff *skb,
1178 			    const struct port_info *pi,
1179 			    unsigned int pidx, unsigned int gen,
1180 			    struct sge_txq *q, unsigned int ndesc,
1181 			    unsigned int compl, const dma_addr_t *addr)
1182 {
1183 	unsigned int flits, sgl_flits, cntrl, tso_info;
1184 	struct sg_ent *sgp, sgl[MAX_SKB_FRAGS / 2 + 1];
1185 	struct tx_desc *d = &q->desc[pidx];
1186 	struct cpl_tx_pkt *cpl = (struct cpl_tx_pkt *)d;
1187 
1188 	cpl->len = htonl(skb->len);
1189 	cntrl = V_TXPKT_INTF(pi->port_id);
1190 
1191 	if (vlan_tx_tag_present(skb))
1192 		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(vlan_tx_tag_get(skb));
1193 
1194 	tso_info = V_LSO_MSS(skb_shinfo(skb)->gso_size);
1195 	if (tso_info) {
1196 		int eth_type;
1197 		struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *)cpl;
1198 
1199 		d->flit[2] = 0;
1200 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO);
1201 		hdr->cntrl = htonl(cntrl);
1202 		eth_type = skb_network_offset(skb) == ETH_HLEN ?
1203 		    CPL_ETH_II : CPL_ETH_II_VLAN;
1204 		tso_info |= V_LSO_ETH_TYPE(eth_type) |
1205 		    V_LSO_IPHDR_WORDS(ip_hdr(skb)->ihl) |
1206 		    V_LSO_TCPHDR_WORDS(tcp_hdr(skb)->doff);
1207 		hdr->lso_info = htonl(tso_info);
1208 		flits = 3;
1209 	} else {
1210 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1211 		cntrl |= F_TXPKT_IPCSUM_DIS;	/* SW calculates IP csum */
1212 		cntrl |= V_TXPKT_L4CSUM_DIS(skb->ip_summed != CHECKSUM_PARTIAL);
1213 		cpl->cntrl = htonl(cntrl);
1214 
1215 		if (skb->len <= WR_LEN - sizeof(*cpl)) {
1216 			q->sdesc[pidx].skb = NULL;
1217 			if (!skb->data_len)
1218 				skb_copy_from_linear_data(skb, &d->flit[2],
1219 							  skb->len);
1220 			else
1221 				skb_copy_bits(skb, 0, &d->flit[2], skb->len);
1222 
1223 			flits = (skb->len + 7) / 8 + 2;
1224 			cpl->wr.wr_hi = htonl(V_WR_BCNTLFLT(skb->len & 7) |
1225 					      V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT)
1226 					      | F_WR_SOP | F_WR_EOP | compl);
1227 			wmb();
1228 			cpl->wr.wr_lo = htonl(V_WR_LEN(flits) | V_WR_GEN(gen) |
1229 					      V_WR_TID(q->token));
1230 			wr_gen2(d, gen);
1231 			kfree_skb(skb);
1232 			return;
1233 		}
1234 
1235 		flits = 2;
1236 	}
1237 
1238 	sgp = ndesc == 1 ? (struct sg_ent *)&d->flit[flits] : sgl;
1239 	sgl_flits = write_sgl(skb, sgp, skb->data, skb_headlen(skb), addr);
1240 
1241 	write_wr_hdr_sgl(ndesc, skb, d, pidx, q, sgl, flits, sgl_flits, gen,
1242 			 htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | compl),
1243 			 htonl(V_WR_TID(q->token)));
1244 }
1245 
1246 static inline void t3_stop_tx_queue(struct netdev_queue *txq,
1247 				    struct sge_qset *qs, struct sge_txq *q)
1248 {
1249 	netif_tx_stop_queue(txq);
1250 	set_bit(TXQ_ETH, &qs->txq_stopped);
1251 	q->stops++;
1252 }
1253 
1254 /**
1255  *	t3_eth_xmit - add a packet to the Ethernet Tx queue
1256  *	@skb: the packet
1257  *	@dev: the egress net device
1258  *
1259  *	Add a packet to an SGE Tx queue.  Runs with softirqs disabled.
1260  */
1261 netdev_tx_t t3_eth_xmit(struct sk_buff *skb, struct net_device *dev)
1262 {
1263 	int qidx;
1264 	unsigned int ndesc, pidx, credits, gen, compl;
1265 	const struct port_info *pi = netdev_priv(dev);
1266 	struct adapter *adap = pi->adapter;
1267 	struct netdev_queue *txq;
1268 	struct sge_qset *qs;
1269 	struct sge_txq *q;
1270 	dma_addr_t addr[MAX_SKB_FRAGS + 1];
1271 
1272 	/*
1273 	 * The chip min packet length is 9 octets but play safe and reject
1274 	 * anything shorter than an Ethernet header.
1275 	 */
1276 	if (unlikely(skb->len < ETH_HLEN)) {
1277 		dev_kfree_skb(skb);
1278 		return NETDEV_TX_OK;
1279 	}
1280 
1281 	qidx = skb_get_queue_mapping(skb);
1282 	qs = &pi->qs[qidx];
1283 	q = &qs->txq[TXQ_ETH];
1284 	txq = netdev_get_tx_queue(dev, qidx);
1285 
1286 	reclaim_completed_tx(adap, q, TX_RECLAIM_CHUNK);
1287 
1288 	credits = q->size - q->in_use;
1289 	ndesc = calc_tx_descs(skb);
1290 
1291 	if (unlikely(credits < ndesc)) {
1292 		t3_stop_tx_queue(txq, qs, q);
1293 		dev_err(&adap->pdev->dev,
1294 			"%s: Tx ring %u full while queue awake!\n",
1295 			dev->name, q->cntxt_id & 7);
1296 		return NETDEV_TX_BUSY;
1297 	}
1298 
1299 	if (unlikely(map_skb(adap->pdev, skb, addr) < 0)) {
1300 		dev_kfree_skb(skb);
1301 		return NETDEV_TX_OK;
1302 	}
1303 
1304 	q->in_use += ndesc;
1305 	if (unlikely(credits - ndesc < q->stop_thres)) {
1306 		t3_stop_tx_queue(txq, qs, q);
1307 
1308 		if (should_restart_tx(q) &&
1309 		    test_and_clear_bit(TXQ_ETH, &qs->txq_stopped)) {
1310 			q->restarts++;
1311 			netif_tx_start_queue(txq);
1312 		}
1313 	}
1314 
1315 	gen = q->gen;
1316 	q->unacked += ndesc;
1317 	compl = (q->unacked & 8) << (S_WR_COMPL - 3);
1318 	q->unacked &= 7;
1319 	pidx = q->pidx;
1320 	q->pidx += ndesc;
1321 	if (q->pidx >= q->size) {
1322 		q->pidx -= q->size;
1323 		q->gen ^= 1;
1324 	}
1325 
1326 	/* update port statistics */
1327 	if (skb->ip_summed == CHECKSUM_PARTIAL)
1328 		qs->port_stats[SGE_PSTAT_TX_CSUM]++;
1329 	if (skb_shinfo(skb)->gso_size)
1330 		qs->port_stats[SGE_PSTAT_TSO]++;
1331 	if (vlan_tx_tag_present(skb))
1332 		qs->port_stats[SGE_PSTAT_VLANINS]++;
1333 
1334 	/*
1335 	 * We do not use Tx completion interrupts to free DMAd Tx packets.
1336 	 * This is good for performance but means that we rely on new Tx
1337 	 * packets arriving to run the destructors of completed packets,
1338 	 * which open up space in their sockets' send queues.  Sometimes
1339 	 * we do not get such new packets causing Tx to stall.  A single
1340 	 * UDP transmitter is a good example of this situation.  We have
1341 	 * a clean up timer that periodically reclaims completed packets
1342 	 * but it doesn't run often enough (nor do we want it to) to prevent
1343 	 * lengthy stalls.  A solution to this problem is to run the
1344 	 * destructor early, after the packet is queued but before it's DMAd.
1345 	 * A downside is that we lie to socket memory accounting, but the amount
1346 	 * of extra memory is reasonable (limited by the number of Tx
1347 	 * descriptors), the packets do actually get freed quickly by new
1348 	 * packets almost always, and for protocols like TCP that wait for
1349 	 * acks to really free up the data the extra memory is even less.
1350 	 * On the positive side we run the destructors on the sending CPU
1351 	 * rather than on a potentially different completing CPU, usually a
1352 	 * good thing.  We also run them without holding our Tx queue lock,
1353 	 * unlike what reclaim_completed_tx() would otherwise do.
1354 	 *
1355 	 * Run the destructor before telling the DMA engine about the packet
1356 	 * to make sure it doesn't complete and get freed prematurely.
1357 	 */
1358 	if (likely(!skb_shared(skb)))
1359 		skb_orphan(skb);
1360 
1361 	write_tx_pkt_wr(adap, skb, pi, pidx, gen, q, ndesc, compl, addr);
1362 	check_ring_tx_db(adap, q);
1363 	return NETDEV_TX_OK;
1364 }
1365 
1366 /**
1367  *	write_imm - write a packet into a Tx descriptor as immediate data
1368  *	@d: the Tx descriptor to write
1369  *	@skb: the packet
1370  *	@len: the length of packet data to write as immediate data
1371  *	@gen: the generation bit value to write
1372  *
1373  *	Writes a packet as immediate data into a Tx descriptor.  The packet
1374  *	contains a work request at its beginning.  We must write the packet
1375  *	carefully so the SGE doesn't read it accidentally before it's written
1376  *	in its entirety.
1377  */
1378 static inline void write_imm(struct tx_desc *d, struct sk_buff *skb,
1379 			     unsigned int len, unsigned int gen)
1380 {
1381 	struct work_request_hdr *from = (struct work_request_hdr *)skb->data;
1382 	struct work_request_hdr *to = (struct work_request_hdr *)d;
1383 
1384 	if (likely(!skb->data_len))
1385 		memcpy(&to[1], &from[1], len - sizeof(*from));
1386 	else
1387 		skb_copy_bits(skb, sizeof(*from), &to[1], len - sizeof(*from));
1388 
1389 	to->wr_hi = from->wr_hi | htonl(F_WR_SOP | F_WR_EOP |
1390 					V_WR_BCNTLFLT(len & 7));
1391 	wmb();
1392 	to->wr_lo = from->wr_lo | htonl(V_WR_GEN(gen) |
1393 					V_WR_LEN((len + 7) / 8));
1394 	wr_gen2(d, gen);
1395 	kfree_skb(skb);
1396 }
1397 
1398 /**
1399  *	check_desc_avail - check descriptor availability on a send queue
1400  *	@adap: the adapter
1401  *	@q: the send queue
1402  *	@skb: the packet needing the descriptors
1403  *	@ndesc: the number of Tx descriptors needed
1404  *	@qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL)
1405  *
1406  *	Checks if the requested number of Tx descriptors is available on an
1407  *	SGE send queue.  If the queue is already suspended or not enough
1408  *	descriptors are available the packet is queued for later transmission.
1409  *	Must be called with the Tx queue locked.
1410  *
1411  *	Returns 0 if enough descriptors are available, 1 if there aren't
1412  *	enough descriptors and the packet has been queued, and 2 if the caller
1413  *	needs to retry because there weren't enough descriptors at the
1414  *	beginning of the call but some freed up in the meantime.
1415  */
1416 static inline int check_desc_avail(struct adapter *adap, struct sge_txq *q,
1417 				   struct sk_buff *skb, unsigned int ndesc,
1418 				   unsigned int qid)
1419 {
1420 	if (unlikely(!skb_queue_empty(&q->sendq))) {
1421 	      addq_exit:__skb_queue_tail(&q->sendq, skb);
1422 		return 1;
1423 	}
1424 	if (unlikely(q->size - q->in_use < ndesc)) {
1425 		struct sge_qset *qs = txq_to_qset(q, qid);
1426 
1427 		set_bit(qid, &qs->txq_stopped);
1428 		smp_mb__after_clear_bit();
1429 
1430 		if (should_restart_tx(q) &&
1431 		    test_and_clear_bit(qid, &qs->txq_stopped))
1432 			return 2;
1433 
1434 		q->stops++;
1435 		goto addq_exit;
1436 	}
1437 	return 0;
1438 }
1439 
1440 /**
1441  *	reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
1442  *	@q: the SGE control Tx queue
1443  *
1444  *	This is a variant of reclaim_completed_tx() that is used for Tx queues
1445  *	that send only immediate data (presently just the control queues) and
1446  *	thus do not have any sk_buffs to release.
1447  */
1448 static inline void reclaim_completed_tx_imm(struct sge_txq *q)
1449 {
1450 	unsigned int reclaim = q->processed - q->cleaned;
1451 
1452 	q->in_use -= reclaim;
1453 	q->cleaned += reclaim;
1454 }
1455 
1456 static inline int immediate(const struct sk_buff *skb)
1457 {
1458 	return skb->len <= WR_LEN;
1459 }
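
/*
 * Note: assuming WR_FLITS == 16 (which the flit_desc_map table above
 * implies), WR_LEN is 128 bytes, so offload messages of up to 128 bytes
 * are sent as immediate data inside a single Tx descriptor by write_imm().
 */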
1460 
1461 /**
1462  *	ctrl_xmit - send a packet through an SGE control Tx queue
1463  *	@adap: the adapter
1464  *	@q: the control queue
1465  *	@skb: the packet
1466  *
1467  *	Send a packet through an SGE control Tx queue.  Packets sent through
1468  *	a control queue must fit entirely as immediate data in a single Tx
1469  *	descriptor and have no page fragments.
1470  */
1471 static int ctrl_xmit(struct adapter *adap, struct sge_txq *q,
1472 		     struct sk_buff *skb)
1473 {
1474 	int ret;
1475 	struct work_request_hdr *wrp = (struct work_request_hdr *)skb->data;
1476 
1477 	if (unlikely(!immediate(skb))) {
1478 		WARN_ON(1);
1479 		dev_kfree_skb(skb);
1480 		return NET_XMIT_SUCCESS;
1481 	}
1482 
1483 	wrp->wr_hi |= htonl(F_WR_SOP | F_WR_EOP);
1484 	wrp->wr_lo = htonl(V_WR_TID(q->token));
1485 
1486 	spin_lock(&q->lock);
1487       again:reclaim_completed_tx_imm(q);
1488 
1489 	ret = check_desc_avail(adap, q, skb, 1, TXQ_CTRL);
1490 	if (unlikely(ret)) {
1491 		if (ret == 1) {
1492 			spin_unlock(&q->lock);
1493 			return NET_XMIT_CN;
1494 		}
1495 		goto again;
1496 	}
1497 
1498 	write_imm(&q->desc[q->pidx], skb, skb->len, q->gen);
1499 
1500 	q->in_use++;
1501 	if (++q->pidx >= q->size) {
1502 		q->pidx = 0;
1503 		q->gen ^= 1;
1504 	}
1505 	spin_unlock(&q->lock);
1506 	wmb();
1507 	t3_write_reg(adap, A_SG_KDOORBELL,
1508 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1509 	return NET_XMIT_SUCCESS;
1510 }
1511 
1512 /**
1513  *	restart_ctrlq - restart a suspended control queue
1514  *	@qs: the queue set containing the control queue
1515  *
1516  *	Resumes transmission on a suspended Tx control queue.
1517  */
1518 static void restart_ctrlq(unsigned long data)
1519 {
1520 	struct sk_buff *skb;
1521 	struct sge_qset *qs = (struct sge_qset *)data;
1522 	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1523 
1524 	spin_lock(&q->lock);
1525       again:reclaim_completed_tx_imm(q);
1526 
1527 	while (q->in_use < q->size &&
1528 	       (skb = __skb_dequeue(&q->sendq)) != NULL) {
1529 
1530 		write_imm(&q->desc[q->pidx], skb, skb->len, q->gen);
1531 
1532 		if (++q->pidx >= q->size) {
1533 			q->pidx = 0;
1534 			q->gen ^= 1;
1535 		}
1536 		q->in_use++;
1537 	}
1538 
1539 	if (!skb_queue_empty(&q->sendq)) {
1540 		set_bit(TXQ_CTRL, &qs->txq_stopped);
1541 		smp_mb__after_clear_bit();
1542 
1543 		if (should_restart_tx(q) &&
1544 		    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped))
1545 			goto again;
1546 		q->stops++;
1547 	}
1548 
1549 	spin_unlock(&q->lock);
1550 	wmb();
1551 	t3_write_reg(qs->adap, A_SG_KDOORBELL,
1552 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1553 }
1554 
1555 /*
1556  * Send a management message through control queue 0
1557  */
1558 int t3_mgmt_tx(struct adapter *adap, struct sk_buff *skb)
1559 {
1560 	int ret;
1561 	local_bh_disable();
1562 	ret = ctrl_xmit(adap, &adap->sge.qs[0].txq[TXQ_CTRL], skb);
1563 	local_bh_enable();
1564 
1565 	return ret;
1566 }
1567 
1568 /**
1569  *	deferred_unmap_destructor - unmap a packet when it is freed
1570  *	@skb: the packet
1571  *
1572  *	This is the packet destructor used for Tx packets that need to remain
1573  *	mapped until they are freed rather than until their Tx descriptors are
1574  *	freed.
1575  */
1576 static void deferred_unmap_destructor(struct sk_buff *skb)
1577 {
1578 	int i;
1579 	const dma_addr_t *p;
1580 	const struct skb_shared_info *si;
1581 	const struct deferred_unmap_info *dui;
1582 
1583 	dui = (struct deferred_unmap_info *)skb->head;
1584 	p = dui->addr;
1585 
1586 	if (skb_tail_pointer(skb) - skb_transport_header(skb))
1587 		pci_unmap_single(dui->pdev, *p++, skb_tail_pointer(skb) -
1588 				 skb_transport_header(skb), PCI_DMA_TODEVICE);
1589 
1590 	si = skb_shinfo(skb);
1591 	for (i = 0; i < si->nr_frags; i++)
1592 		pci_unmap_page(dui->pdev, *p++, skb_frag_size(&si->frags[i]),
1593 			       PCI_DMA_TODEVICE);
1594 }
1595 
1596 static void setup_deferred_unmapping(struct sk_buff *skb, struct pci_dev *pdev,
1597 				     const struct sg_ent *sgl, int sgl_flits)
1598 {
1599 	dma_addr_t *p;
1600 	struct deferred_unmap_info *dui;
1601 
1602 	dui = (struct deferred_unmap_info *)skb->head;
1603 	dui->pdev = pdev;
1604 	for (p = dui->addr; sgl_flits >= 3; sgl++, sgl_flits -= 3) {
1605 		*p++ = be64_to_cpu(sgl->addr[0]);
1606 		*p++ = be64_to_cpu(sgl->addr[1]);
1607 	}
1608 	if (sgl_flits)
1609 		*p = be64_to_cpu(sgl->addr[0]);
1610 }
1611 
1612 /**
1613  *	write_ofld_wr - write an offload work request
1614  *	@adap: the adapter
1615  *	@skb: the packet to send
1616  *	@q: the Tx queue
1617  *	@pidx: index of the first Tx descriptor to write
1618  *	@gen: the generation value to use
1619  *	@ndesc: number of descriptors the packet will occupy
1620  *
1621  *	Write an offload work request to send the supplied packet.  The packet
1622  *	data already carry the work request with most fields populated.
1623  */
1624 static void write_ofld_wr(struct adapter *adap, struct sk_buff *skb,
1625 			  struct sge_txq *q, unsigned int pidx,
1626 			  unsigned int gen, unsigned int ndesc,
1627 			  const dma_addr_t *addr)
1628 {
1629 	unsigned int sgl_flits, flits;
1630 	struct work_request_hdr *from;
1631 	struct sg_ent *sgp, sgl[MAX_SKB_FRAGS / 2 + 1];
1632 	struct tx_desc *d = &q->desc[pidx];
1633 
1634 	if (immediate(skb)) {
1635 		q->sdesc[pidx].skb = NULL;
1636 		write_imm(d, skb, skb->len, gen);
1637 		return;
1638 	}
1639 
1640 	/* Only TX_DATA builds SGLs */
1641 
1642 	from = (struct work_request_hdr *)skb->data;
1643 	memcpy(&d->flit[1], &from[1],
1644 	       skb_transport_offset(skb) - sizeof(*from));
1645 
1646 	flits = skb_transport_offset(skb) / 8;
1647 	sgp = ndesc == 1 ? (struct sg_ent *)&d->flit[flits] : sgl;
1648 	sgl_flits = write_sgl(skb, sgp, skb_transport_header(skb),
1649 			     skb_tail_pointer(skb) -
1650 			     skb_transport_header(skb), addr);
1651 	if (need_skb_unmap()) {
1652 		setup_deferred_unmapping(skb, adap->pdev, sgp, sgl_flits);
1653 		skb->destructor = deferred_unmap_destructor;
1654 	}
1655 
1656 	write_wr_hdr_sgl(ndesc, skb, d, pidx, q, sgl, flits, sgl_flits,
1657 			 gen, from->wr_hi, from->wr_lo);
1658 }
1659 
1660 /**
1661  *	calc_tx_descs_ofld - calculate # of Tx descriptors for an offload packet
1662  *	@skb: the packet
1663  *
1664  * 	Returns the number of Tx descriptors needed for the given offload
1665  * 	packet.  These packets are already fully constructed.
1666  */
1667 static inline unsigned int calc_tx_descs_ofld(const struct sk_buff *skb)
1668 {
1669 	unsigned int flits, cnt;
1670 
1671 	if (skb->len <= WR_LEN)
1672 		return 1;	/* packet fits as immediate data */
1673 
1674 	flits = skb_transport_offset(skb) / 8;	/* headers */
1675 	cnt = skb_shinfo(skb)->nr_frags;
1676 	if (skb_tail_pointer(skb) != skb_transport_header(skb))
1677 		cnt++;
1678 	return flits_to_desc(flits + sgl_len(cnt));
1679 }
1680 
1681 /**
1682  *	ofld_xmit - send a packet through an offload queue
1683  *	@adap: the adapter
1684  *	@q: the Tx offload queue
1685  *	@skb: the packet
1686  *
1687  *	Send an offload packet through an SGE offload queue.
1688  */
1689 static int ofld_xmit(struct adapter *adap, struct sge_txq *q,
1690 		     struct sk_buff *skb)
1691 {
1692 	int ret;
1693 	unsigned int ndesc = calc_tx_descs_ofld(skb), pidx, gen;
1694 
1695 	spin_lock(&q->lock);
1696 again:	reclaim_completed_tx(adap, q, TX_RECLAIM_CHUNK);
1697 
1698 	ret = check_desc_avail(adap, q, skb, ndesc, TXQ_OFLD);
1699 	if (unlikely(ret)) {
1700 		if (ret == 1) {
1701 			skb->priority = ndesc;	/* save for restart */
1702 			spin_unlock(&q->lock);
1703 			return NET_XMIT_CN;
1704 		}
1705 		goto again;
1706 	}
1707 
1708 	if (map_skb(adap->pdev, skb, (dma_addr_t *)skb->head)) {
1709 		spin_unlock(&q->lock);
1710 		return NET_XMIT_SUCCESS;
1711 	}
1712 
1713 	gen = q->gen;
1714 	q->in_use += ndesc;
1715 	pidx = q->pidx;
1716 	q->pidx += ndesc;
1717 	if (q->pidx >= q->size) {
1718 		q->pidx -= q->size;
1719 		q->gen ^= 1;
1720 	}
1721 	spin_unlock(&q->lock);
1722 
1723 	write_ofld_wr(adap, skb, q, pidx, gen, ndesc, (dma_addr_t *)skb->head);
1724 	check_ring_tx_db(adap, q);
1725 	return NET_XMIT_SUCCESS;
1726 }
1727 
1728 /**
1729  *	restart_offloadq - restart a suspended offload queue
1730  *	@qs: the queue set containing the offload queue
1731  *
1732  *	Resumes transmission on a suspended Tx offload queue.
1733  */
1734 static void restart_offloadq(unsigned long data)
1735 {
1736 	struct sk_buff *skb;
1737 	struct sge_qset *qs = (struct sge_qset *)data;
1738 	struct sge_txq *q = &qs->txq[TXQ_OFLD];
1739 	const struct port_info *pi = netdev_priv(qs->netdev);
1740 	struct adapter *adap = pi->adapter;
1741 	unsigned int written = 0;
1742 
1743 	spin_lock(&q->lock);
1744 again:	reclaim_completed_tx(adap, q, TX_RECLAIM_CHUNK);
1745 
1746 	while ((skb = skb_peek(&q->sendq)) != NULL) {
1747 		unsigned int gen, pidx;
1748 		unsigned int ndesc = skb->priority;
1749 
1750 		if (unlikely(q->size - q->in_use < ndesc)) {
1751 			set_bit(TXQ_OFLD, &qs->txq_stopped);
1752 			smp_mb__after_clear_bit();
1753 
1754 			if (should_restart_tx(q) &&
1755 			    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped))
1756 				goto again;
1757 			q->stops++;
1758 			break;
1759 		}
1760 
1761 		if (map_skb(adap->pdev, skb, (dma_addr_t *)skb->head))
1762 			break;
1763 
1764 		gen = q->gen;
1765 		q->in_use += ndesc;
1766 		pidx = q->pidx;
1767 		q->pidx += ndesc;
1768 		written += ndesc;
1769 		if (q->pidx >= q->size) {
1770 			q->pidx -= q->size;
1771 			q->gen ^= 1;
1772 		}
1773 		__skb_unlink(skb, &q->sendq);
1774 		spin_unlock(&q->lock);
1775 
1776 		write_ofld_wr(adap, skb, q, pidx, gen, ndesc,
1777 			     (dma_addr_t *)skb->head);
1778 		spin_lock(&q->lock);
1779 	}
1780 	spin_unlock(&q->lock);
1781 
1782 #if USE_GTS
1783 	set_bit(TXQ_RUNNING, &q->flags);
1784 	set_bit(TXQ_LAST_PKT_DB, &q->flags);
1785 #endif
1786 	wmb();
1787 	if (likely(written))
1788 		t3_write_reg(adap, A_SG_KDOORBELL,
1789 			     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1790 }
1791 
1792 /**
1793  *	queue_set - return the queue set a packet should use
1794  *	@skb: the packet
1795  *
1796  *	Maps a packet to the SGE queue set it should use.  The desired queue
1797  *	set is carried in bits 1-3 in the packet's priority.
1798  */
1799 static inline int queue_set(const struct sk_buff *skb)
1800 {
1801 	return skb->priority >> 1;
1802 }
1803 
1804 /**
1805  *	is_ctrl_pkt - return whether an offload packet is a control packet
1806  *	@skb: the packet
1807  *
1808  *	Determines whether an offload packet should use an OFLD or a CTRL
1809  *	Tx queue.  This is indicated by bit 0 in the packet's priority.
1810  */
1811 static inline int is_ctrl_pkt(const struct sk_buff *skb)
1812 {
1813 	return skb->priority & 1;
1814 }
1815 
1816 /**
1817  *	t3_offload_tx - send an offload packet
1818  *	@tdev: the offload device to send to
1819  *	@skb: the packet
1820  *
1821  *	Sends an offload packet.  We use the packet priority to select the
1822  *	appropriate Tx queue as follows: bit 0 indicates whether the packet
1823  *	should be sent as regular or control, bits 1-3 select the queue set.
1824  */
1825 int t3_offload_tx(struct t3cdev *tdev, struct sk_buff *skb)
1826 {
1827 	struct adapter *adap = tdev2adap(tdev);
1828 	struct sge_qset *qs = &adap->sge.qs[queue_set(skb)];
1829 
1830 	if (unlikely(is_ctrl_pkt(skb)))
1831 		return ctrl_xmit(adap, &qs->txq[TXQ_CTRL], skb);
1832 
1833 	return ofld_xmit(adap, &qs->txq[TXQ_OFLD], skb);
1834 }
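
/*
 * Minimal sketch (hypothetical helper, not part of the driver) of how an
 * offload client could encode its routing choice in skb->priority before
 * calling t3_offload_tx(); real callers simply set the field directly.
 */
static inline void example_set_ofld_priority(struct sk_buff *skb,
					     unsigned int qset, int ctrl)
{
	/* bit 0 selects the CTRL vs. OFLD queue, bits 1 and up the queue set */
	skb->priority = (qset << 1) | (ctrl & 1);
}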
1835 
1836 /**
1837  *	offload_enqueue - add an offload packet to an SGE offload receive queue
1838  *	@q: the SGE response queue
1839  *	@skb: the packet
1840  *
1841  *	Add a new offload packet to an SGE response queue's offload packet
1842  *	queue.  If the packet is the first on the queue it schedules the RX
1843  *	softirq to process the queue.
1844  */
1845 static inline void offload_enqueue(struct sge_rspq *q, struct sk_buff *skb)
1846 {
1847 	int was_empty = skb_queue_empty(&q->rx_queue);
1848 
1849 	__skb_queue_tail(&q->rx_queue, skb);
1850 
1851 	if (was_empty) {
1852 		struct sge_qset *qs = rspq_to_qset(q);
1853 
1854 		napi_schedule(&qs->napi);
1855 	}
1856 }
1857 
1858 /**
1859  *	deliver_partial_bundle - deliver a (partial) bundle of Rx offload pkts
1860  *	@tdev: the offload device that will be receiving the packets
1861  *	@q: the SGE response queue that assembled the bundle
1862  *	@skbs: the partial bundle
1863  *	@n: the number of packets in the bundle
1864  *
1865  *	Delivers a (partial) bundle of Rx offload packets to an offload device.
1866  */
1867 static inline void deliver_partial_bundle(struct t3cdev *tdev,
1868 					  struct sge_rspq *q,
1869 					  struct sk_buff *skbs[], int n)
1870 {
1871 	if (n) {
1872 		q->offload_bundles++;
1873 		tdev->recv(tdev, skbs, n);
1874 	}
1875 }
1876 
1877 /**
1878  *	ofld_poll - NAPI handler for offload packets in interrupt mode
1879  *	@napi: the napi instance doing the polling
1880  *	@budget: polling budget
1881  *
1882  *	The NAPI handler for offload packets when a response queue is serviced
1883  *	by the hard interrupt handler, i.e., when it's operating in non-polling
1884  *	mode.  Creates small packet batches and sends them through the offload
1885  *	receive handler.  Batches need to be of modest size as we do prefetches
1886  *	on the packets in each.
1887  */
1888 static int ofld_poll(struct napi_struct *napi, int budget)
1889 {
1890 	struct sge_qset *qs = container_of(napi, struct sge_qset, napi);
1891 	struct sge_rspq *q = &qs->rspq;
1892 	struct adapter *adapter = qs->adap;
1893 	int work_done = 0;
1894 
1895 	while (work_done < budget) {
1896 		struct sk_buff *skb, *tmp, *skbs[RX_BUNDLE_SIZE];
1897 		struct sk_buff_head queue;
1898 		int ngathered;
1899 
1900 		spin_lock_irq(&q->lock);
1901 		__skb_queue_head_init(&queue);
1902 		skb_queue_splice_init(&q->rx_queue, &queue);
1903 		if (skb_queue_empty(&queue)) {
1904 			napi_complete(napi);
1905 			spin_unlock_irq(&q->lock);
1906 			return work_done;
1907 		}
1908 		spin_unlock_irq(&q->lock);
1909 
1910 		ngathered = 0;
1911 		skb_queue_walk_safe(&queue, skb, tmp) {
1912 			if (work_done >= budget)
1913 				break;
1914 			work_done++;
1915 
1916 			__skb_unlink(skb, &queue);
1917 			prefetch(skb->data);
1918 			skbs[ngathered] = skb;
1919 			if (++ngathered == RX_BUNDLE_SIZE) {
1920 				q->offload_bundles++;
1921 				adapter->tdev.recv(&adapter->tdev, skbs,
1922 						   ngathered);
1923 				ngathered = 0;
1924 			}
1925 		}
1926 		if (!skb_queue_empty(&queue)) {
1927 			/* splice remaining packets back onto Rx queue */
1928 			spin_lock_irq(&q->lock);
1929 			skb_queue_splice(&queue, &q->rx_queue);
1930 			spin_unlock_irq(&q->lock);
1931 		}
1932 		deliver_partial_bundle(&adapter->tdev, q, skbs, ngathered);
1933 	}
1934 
1935 	return work_done;
1936 }
1937 
1938 /**
1939  *	rx_offload - process a received offload packet
1940  *	@tdev: the offload device receiving the packet
1941  *	@rq: the response queue that received the packet
1942  *	@skb: the packet
1943  *	@rx_gather: a gather list of packets if we are building a bundle
1944  *	@gather_idx: index of the next available slot in the bundle
1945  *
1946  *	Process an ingress offload packet and add it to the offload ingress
1947  *	queue.  Returns the index of the next available slot in the bundle.
1948  */
1949 static inline int rx_offload(struct t3cdev *tdev, struct sge_rspq *rq,
1950 			     struct sk_buff *skb, struct sk_buff *rx_gather[],
1951 			     unsigned int gather_idx)
1952 {
1953 	skb_reset_mac_header(skb);
1954 	skb_reset_network_header(skb);
1955 	skb_reset_transport_header(skb);
1956 
1957 	if (rq->polling) {
1958 		rx_gather[gather_idx++] = skb;
1959 		if (gather_idx == RX_BUNDLE_SIZE) {
1960 			tdev->recv(tdev, rx_gather, RX_BUNDLE_SIZE);
1961 			gather_idx = 0;
1962 			rq->offload_bundles++;
1963 		}
1964 	} else
1965 		offload_enqueue(rq, skb);
1966 
1967 	return gather_idx;
1968 }
1969 
1970 /**
1971  *	restart_tx - check whether to restart suspended Tx queues
1972  *	@qs: the queue set to resume
1973  *
1974  *	Restarts suspended Tx queues of an SGE queue set if they have enough
1975  *	free resources to resume operation.
1976  */
1977 static void restart_tx(struct sge_qset *qs)
1978 {
1979 	if (test_bit(TXQ_ETH, &qs->txq_stopped) &&
1980 	    should_restart_tx(&qs->txq[TXQ_ETH]) &&
1981 	    test_and_clear_bit(TXQ_ETH, &qs->txq_stopped)) {
1982 		qs->txq[TXQ_ETH].restarts++;
1983 		if (netif_running(qs->netdev))
1984 			netif_tx_wake_queue(qs->tx_q);
1985 	}
1986 
1987 	if (test_bit(TXQ_OFLD, &qs->txq_stopped) &&
1988 	    should_restart_tx(&qs->txq[TXQ_OFLD]) &&
1989 	    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) {
1990 		qs->txq[TXQ_OFLD].restarts++;
1991 		tasklet_schedule(&qs->txq[TXQ_OFLD].qresume_tsk);
1992 	}
1993 	if (test_bit(TXQ_CTRL, &qs->txq_stopped) &&
1994 	    should_restart_tx(&qs->txq[TXQ_CTRL]) &&
1995 	    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) {
1996 		qs->txq[TXQ_CTRL].restarts++;
1997 		tasklet_schedule(&qs->txq[TXQ_CTRL].qresume_tsk);
1998 	}
1999 }
2000 
2001 /**
2002  *	cxgb3_arp_process - process an ARP request probing a private IP address
2003  *	@pi: the port the ARP request arrived on
2004  *	@skb: the skbuff containing the ARP request
2005  *
2006  *	Check if the ARP request is probing the private IP address
2007  *	dedicated to iSCSI, and generate an ARP reply if so.
2008  */
2009 static void cxgb3_arp_process(struct port_info *pi, struct sk_buff *skb)
2010 {
2011 	struct net_device *dev = skb->dev;
2012 	struct arphdr *arp;
2013 	unsigned char *arp_ptr;
2014 	unsigned char *sha;
2015 	__be32 sip, tip;
2016 
2017 	if (!dev)
2018 		return;
2019 
2020 	skb_reset_network_header(skb);
2021 	arp = arp_hdr(skb);
2022 
2023 	if (arp->ar_op != htons(ARPOP_REQUEST))
2024 		return;
2025 
2026 	arp_ptr = (unsigned char *)(arp + 1);
2027 	sha = arp_ptr;
2028 	arp_ptr += dev->addr_len;
2029 	memcpy(&sip, arp_ptr, sizeof(sip));
2030 	arp_ptr += sizeof(sip);
2031 	arp_ptr += dev->addr_len;
2032 	memcpy(&tip, arp_ptr, sizeof(tip));
2033 
2034 	if (tip != pi->iscsi_ipv4addr)
2035 		return;
2036 
2037 	arp_send(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha,
2038 		 pi->iscsic.mac_addr, sha);
2039 
2040 }
2041 
2042 static inline int is_arp(struct sk_buff *skb)
2043 {
2044 	return skb->protocol == htons(ETH_P_ARP);
2045 }
2046 
2047 static void cxgb3_process_iscsi_prov_pack(struct port_info *pi,
2048 					struct sk_buff *skb)
2049 {
2050 	if (is_arp(skb)) {
2051 		cxgb3_arp_process(pi, skb);
2052 		return;
2053 	}
2054 
2055 	if (pi->iscsic.recv)
2056 		pi->iscsic.recv(pi, skb);
2057 
2058 }
2059 
2060 /**
2061  *	rx_eth - process an ingress ethernet packet
2062  *	@adap: the adapter
2063  *	@rq: the response queue that received the packet
2064  *	@skb: the packet
2065  *	@pad: amount of padding at the start of the buffer
2066  *
2067  *	Process an ingress ethernet packet and deliver it to the stack.
2068  *	The padding is 2 if the packet was delivered in an Rx buffer and 0
2069  *	if it was immediate data in a response.
2070  */
2071 static void rx_eth(struct adapter *adap, struct sge_rspq *rq,
2072 		   struct sk_buff *skb, int pad, int lro)
2073 {
2074 	struct cpl_rx_pkt *p = (struct cpl_rx_pkt *)(skb->data + pad);
2075 	struct sge_qset *qs = rspq_to_qset(rq);
2076 	struct port_info *pi;
2077 
2078 	skb_pull(skb, sizeof(*p) + pad);
2079 	skb->protocol = eth_type_trans(skb, adap->port[p->iff]);
2080 	pi = netdev_priv(skb->dev);
2081 	if ((skb->dev->features & NETIF_F_RXCSUM) && p->csum_valid &&
2082 	    p->csum == htons(0xffff) && !p->fragment) {
2083 		qs->port_stats[SGE_PSTAT_RX_CSUM_GOOD]++;
2084 		skb->ip_summed = CHECKSUM_UNNECESSARY;
2085 	} else
2086 		skb_checksum_none_assert(skb);
2087 	skb_record_rx_queue(skb, qs - &adap->sge.qs[pi->first_qset]);
2088 
2089 	if (p->vlan_valid) {
2090 		qs->port_stats[SGE_PSTAT_VLANEX]++;
2091 		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ntohs(p->vlan));
2092 	}
2093 	if (rq->polling) {
2094 		if (lro)
2095 			napi_gro_receive(&qs->napi, skb);
2096 		else {
2097 			if (unlikely(pi->iscsic.flags))
2098 				cxgb3_process_iscsi_prov_pack(pi, skb);
2099 			netif_receive_skb(skb);
2100 		}
2101 	} else
2102 		netif_rx(skb);
2103 }
2104 
2105 static inline int is_eth_tcp(u32 rss)
2106 {
2107 	return G_HASHTYPE(ntohl(rss)) == RSS_HASH_4_TUPLE;
2108 }
2109 
2110 /**
2111  *	lro_add_page - add a page chunk to an LRO session
2112  *	@adap: the adapter
2113  *	@qs: the associated queue set
2114  *	@fl: the free list containing the page chunk to add
2115  *	@len: packet length
2116  *	@complete: Indicates the last fragment of a frame
2117  *
2118  *	Add a received packet contained in a page chunk to an existing LRO
2119  *	session.
2120  */
2121 static void lro_add_page(struct adapter *adap, struct sge_qset *qs,
2122 			 struct sge_fl *fl, int len, int complete)
2123 {
2124 	struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];
2125 	struct port_info *pi = netdev_priv(qs->netdev);
2126 	struct sk_buff *skb = NULL;
2127 	struct cpl_rx_pkt *cpl;
2128 	struct skb_frag_struct *rx_frag;
2129 	int nr_frags;
2130 	int offset = 0;
2131 
2132 	if (!qs->nomem) {
2133 		skb = napi_get_frags(&qs->napi);
2134 		qs->nomem = !skb;
2135 	}
2136 
2137 	fl->credits--;
2138 
2139 	pci_dma_sync_single_for_cpu(adap->pdev,
2140 				    dma_unmap_addr(sd, dma_addr),
2141 				    fl->buf_size - SGE_PG_RSVD,
2142 				    PCI_DMA_FROMDEVICE);
2143 
2144 	(*sd->pg_chunk.p_cnt)--;
2145 	if (!*sd->pg_chunk.p_cnt && sd->pg_chunk.page != fl->pg_chunk.page)
2146 		pci_unmap_page(adap->pdev,
2147 			       sd->pg_chunk.mapping,
2148 			       fl->alloc_size,
2149 			       PCI_DMA_FROMDEVICE);
2150 
2151 	if (!skb) {
2152 		put_page(sd->pg_chunk.page);
2153 		if (complete)
2154 			qs->nomem = 0;
2155 		return;
2156 	}
2157 
2158 	rx_frag = skb_shinfo(skb)->frags;
2159 	nr_frags = skb_shinfo(skb)->nr_frags;
2160 
2161 	if (!nr_frags) {
2162 		offset = 2 + sizeof(struct cpl_rx_pkt);
2163 		cpl = qs->lro_va = sd->pg_chunk.va + 2;
2164 
2165 		if ((qs->netdev->features & NETIF_F_RXCSUM) &&
2166 		     cpl->csum_valid && cpl->csum == htons(0xffff)) {
2167 			skb->ip_summed = CHECKSUM_UNNECESSARY;
2168 			qs->port_stats[SGE_PSTAT_RX_CSUM_GOOD]++;
2169 		} else
2170 			skb->ip_summed = CHECKSUM_NONE;
2171 	} else
2172 		cpl = qs->lro_va;
2173 
2174 	len -= offset;
2175 
2176 	rx_frag += nr_frags;
2177 	__skb_frag_set_page(rx_frag, sd->pg_chunk.page);
2178 	rx_frag->page_offset = sd->pg_chunk.offset + offset;
2179 	skb_frag_size_set(rx_frag, len);
2180 
2181 	skb->len += len;
2182 	skb->data_len += len;
2183 	skb->truesize += len;
2184 	skb_shinfo(skb)->nr_frags++;
2185 
2186 	if (!complete)
2187 		return;
2188 
2189 	skb_record_rx_queue(skb, qs - &adap->sge.qs[pi->first_qset]);
2190 
2191 	if (cpl->vlan_valid) {
2192 		qs->port_stats[SGE_PSTAT_VLANEX]++;
2193 		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ntohs(cpl->vlan));
2194 	}
2195 	napi_gro_frags(&qs->napi);
2196 }
2197 
2198 /**
2199  *	handle_rsp_cntrl_info - handles control information in a response
2200  *	@qs: the queue set corresponding to the response
2201  *	@flags: the response control flags
2202  *
2203  *	Handles the control information of an SGE response, such as GTS
2204  *	indications and completion credits for the queue set's Tx queues.
2205  *	HW coalesces credits, we don't do any extra SW coalescing.
2206  */
2207 static inline void handle_rsp_cntrl_info(struct sge_qset *qs, u32 flags)
2208 {
2209 	unsigned int credits;
2210 
2211 #if USE_GTS
2212 	if (flags & F_RSPD_TXQ0_GTS)
2213 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags);
2214 #endif
2215 
2216 	credits = G_RSPD_TXQ0_CR(flags);
2217 	if (credits)
2218 		qs->txq[TXQ_ETH].processed += credits;
2219 
2220 	credits = G_RSPD_TXQ2_CR(flags);
2221 	if (credits)
2222 		qs->txq[TXQ_CTRL].processed += credits;
2223 
2224 # if USE_GTS
2225 	if (flags & F_RSPD_TXQ1_GTS)
2226 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags);
2227 # endif
2228 	credits = G_RSPD_TXQ1_CR(flags);
2229 	if (credits)
2230 		qs->txq[TXQ_OFLD].processed += credits;
2231 }
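
/*
 * Example (illustrative only): a response whose flags contain
 * V_RSPD_TXQ0_CR(3) | V_RSPD_TXQ1_CR(1) credits three descriptors back to
 * the Ethernet Tx queue and one to the offload queue by advancing the
 * corresponding .processed counters; the descriptors themselves are
 * reclaimed later by reclaim_completed_tx().
 */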
2232 
2233 /**
2234  *	check_ring_db - check if we need to ring any doorbells
2235  *	@adapter: the adapter
2236  *	@qs: the queue set whose Tx queues are to be examined
2237  *	@sleeping: indicates which Tx queue sent GTS
2238  *
2239  *	Checks if some of a queue set's Tx queues need to ring their doorbells
2240  *	to resume transmission after idling while they still have unprocessed
2241  *	descriptors.
2242  */
2243 static void check_ring_db(struct adapter *adap, struct sge_qset *qs,
2244 			  unsigned int sleeping)
2245 {
2246 	if (sleeping & F_RSPD_TXQ0_GTS) {
2247 		struct sge_txq *txq = &qs->txq[TXQ_ETH];
2248 
2249 		if (txq->cleaned + txq->in_use != txq->processed &&
2250 		    !test_and_set_bit(TXQ_LAST_PKT_DB, &txq->flags)) {
2251 			set_bit(TXQ_RUNNING, &txq->flags);
2252 			t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX |
2253 				     V_EGRCNTX(txq->cntxt_id));
2254 		}
2255 	}
2256 
2257 	if (sleeping & F_RSPD_TXQ1_GTS) {
2258 		struct sge_txq *txq = &qs->txq[TXQ_OFLD];
2259 
2260 		if (txq->cleaned + txq->in_use != txq->processed &&
2261 		    !test_and_set_bit(TXQ_LAST_PKT_DB, &txq->flags)) {
2262 			set_bit(TXQ_RUNNING, &txq->flags);
2263 			t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX |
2264 				     V_EGRCNTX(txq->cntxt_id));
2265 		}
2266 	}
2267 }
2268 
2269 /**
2270  *	is_new_response - check if a response is newly written
2271  *	@r: the response descriptor
2272  *	@q: the response queue
2273  *
2274  *	Returns true if a response descriptor contains a yet unprocessed
2275  *	response.
2276  */
2277 static inline int is_new_response(const struct rsp_desc *r,
2278 				  const struct sge_rspq *q)
2279 {
2280 	return (r->intr_gen & F_RSPD_GEN2) == q->gen;
2281 }
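
/*
 * Note (added for clarity): the generation bit is what lets SW tell fresh
 * entries from stale ones left over from the previous pass over the ring.
 * Each time the consumer index wraps (see process_responses()) q->gen is
 * flipped, so descriptors written before the wrap still carry the old
 * F_RSPD_GEN2 value and compare as "not new" until HW rewrites them.
 */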
2282 
2283 static inline void clear_rspq_bufstate(struct sge_rspq * const q)
2284 {
2285 	q->pg_skb = NULL;
2286 	q->rx_recycle_buf = 0;
2287 }
2288 
2289 #define RSPD_GTS_MASK  (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS)
2290 #define RSPD_CTRL_MASK (RSPD_GTS_MASK | \
2291 			V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \
2292 			V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \
2293 			V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR))
2294 
2295 /* How long to delay the next interrupt in case of memory shortage, in 0.1us. */
2296 #define NOMEM_INTR_DELAY 2500
2297 
2298 /**
2299  *	process_responses - process responses from an SGE response queue
2300  *	@adap: the adapter
2301  *	@qs: the queue set to which the response queue belongs
2302  *	@budget: how many responses can be processed in this round
2303  *
2304  *	Process responses from an SGE response queue up to the supplied budget.
2305  *	Responses include received packets as well as credits and other events
2306  *	for the queues that belong to the response queue's queue set.
2307  *	A negative budget is effectively unlimited.
2308  *
2309  *	Additionally choose the interrupt holdoff time for the next interrupt
2310  *	on this queue.  If the system is under memory shortage use a fairly
2311  *	long delay to help recovery.
2312  */
2313 static int process_responses(struct adapter *adap, struct sge_qset *qs,
2314 			     int budget)
2315 {
2316 	struct sge_rspq *q = &qs->rspq;
2317 	struct rsp_desc *r = &q->desc[q->cidx];
2318 	int budget_left = budget;
2319 	unsigned int sleeping = 0;
2320 	struct sk_buff *offload_skbs[RX_BUNDLE_SIZE];
2321 	int ngathered = 0;
2322 
2323 	q->next_holdoff = q->holdoff_tmr;
2324 
2325 	while (likely(budget_left && is_new_response(r, q))) {
2326 		int packet_complete, eth, ethpad = 2;
2327 		int lro = !!(qs->netdev->features & NETIF_F_GRO);
2328 		struct sk_buff *skb = NULL;
2329 		u32 len, flags;
2330 		__be32 rss_hi, rss_lo;
2331 
2332 		rmb();
2333 		eth = r->rss_hdr.opcode == CPL_RX_PKT;
2334 		rss_hi = *(const __be32 *)r;
2335 		rss_lo = r->rss_hdr.rss_hash_val;
2336 		flags = ntohl(r->flags);
2337 
2338 		if (unlikely(flags & F_RSPD_ASYNC_NOTIF)) {
2339 			skb = alloc_skb(AN_PKT_SIZE, GFP_ATOMIC);
2340 			if (!skb)
2341 				goto no_mem;
2342 
2343 			memcpy(__skb_put(skb, AN_PKT_SIZE), r, AN_PKT_SIZE);
2344 			skb->data[0] = CPL_ASYNC_NOTIF;
2345 			rss_hi = htonl(CPL_ASYNC_NOTIF << 24);
2346 			q->async_notif++;
2347 		} else if (flags & F_RSPD_IMM_DATA_VALID) {
2348 			skb = get_imm_packet(r);
2349 			if (unlikely(!skb)) {
2350 no_mem:
2351 				q->next_holdoff = NOMEM_INTR_DELAY;
2352 				q->nomem++;
2353 				/* consume one credit since we tried */
2354 				budget_left--;
2355 				break;
2356 			}
2357 			q->imm_data++;
2358 			ethpad = 0;
2359 		} else if ((len = ntohl(r->len_cq)) != 0) {
2360 			struct sge_fl *fl;
2361 
2362 			lro &= eth && is_eth_tcp(rss_hi);
2363 
2364 			fl = (len & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
2365 			if (fl->use_pages) {
2366 				void *addr = fl->sdesc[fl->cidx].pg_chunk.va;
2367 
2368 				prefetch(addr);
2369 #if L1_CACHE_BYTES < 128
2370 				prefetch(addr + L1_CACHE_BYTES);
2371 #endif
2372 				__refill_fl(adap, fl);
2373 				if (lro > 0) {
2374 					lro_add_page(adap, qs, fl,
2375 						     G_RSPD_LEN(len),
2376 						     flags & F_RSPD_EOP);
2377 					goto next_fl;
2378 				}
2379 
2380 				skb = get_packet_pg(adap, fl, q,
2381 						    G_RSPD_LEN(len),
2382 						    eth ?
2383 						    SGE_RX_DROP_THRES : 0);
2384 				q->pg_skb = skb;
2385 			} else
2386 				skb = get_packet(adap, fl, G_RSPD_LEN(len),
2387 						 eth ? SGE_RX_DROP_THRES : 0);
2388 			if (unlikely(!skb)) {
2389 				if (!eth)
2390 					goto no_mem;
2391 				q->rx_drops++;
2392 			} else if (unlikely(r->rss_hdr.opcode == CPL_TRACE_PKT))
2393 				__skb_pull(skb, 2);
2394 next_fl:
2395 			if (++fl->cidx == fl->size)
2396 				fl->cidx = 0;
2397 		} else
2398 			q->pure_rsps++;
2399 
2400 		if (flags & RSPD_CTRL_MASK) {
2401 			sleeping |= flags & RSPD_GTS_MASK;
2402 			handle_rsp_cntrl_info(qs, flags);
2403 		}
2404 
2405 		r++;
2406 		if (unlikely(++q->cidx == q->size)) {
2407 			q->cidx = 0;
2408 			q->gen ^= 1;
2409 			r = q->desc;
2410 		}
2411 		prefetch(r);
2412 
2413 		if (++q->credits >= (q->size / 4)) {
2414 			refill_rspq(adap, q, q->credits);
2415 			q->credits = 0;
2416 		}
2417 
2418 		packet_complete = flags &
2419 				  (F_RSPD_EOP | F_RSPD_IMM_DATA_VALID |
2420 				   F_RSPD_ASYNC_NOTIF);
2421 
2422 		if (skb != NULL && packet_complete) {
2423 			if (eth)
2424 				rx_eth(adap, q, skb, ethpad, lro);
2425 			else {
2426 				q->offload_pkts++;
2427 				/* Preserve the RSS info in csum & priority */
2428 				skb->csum = rss_hi;
2429 				skb->priority = rss_lo;
2430 				ngathered = rx_offload(&adap->tdev, q, skb,
2431 						       offload_skbs,
2432 						       ngathered);
2433 			}
2434 
2435 			if (flags & F_RSPD_EOP)
2436 				clear_rspq_bufstate(q);
2437 		}
2438 		--budget_left;
2439 	}
2440 
2441 	deliver_partial_bundle(&adap->tdev, q, offload_skbs, ngathered);
2442 
2443 	if (sleeping)
2444 		check_ring_db(adap, qs, sleeping);
2445 
2446 	smp_mb();		/* commit Tx queue .processed updates */
2447 	if (unlikely(qs->txq_stopped != 0))
2448 		restart_tx(qs);
2449 
2450 	budget -= budget_left;
2451 	return budget;
2452 }
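
/*
 * Usage note (illustrative only): the return value is the number of
 * responses consumed, i.e. budget minus what is left.  With a NAPI budget
 * of 64, consuming all 64 keeps the queue scheduled for another round,
 * while consuming fewer lets napi_rx_handler() below complete NAPI and
 * re-arm the interrupt via the A_SG_GTS write.
 */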
2453 
2454 static inline int is_pure_response(const struct rsp_desc *r)
2455 {
2456 	__be32 n = r->flags & htonl(F_RSPD_ASYNC_NOTIF | F_RSPD_IMM_DATA_VALID);
2457 
2458 	return (n | r->len_cq) == 0;
2459 }
2460 
2461 /**
2462  *	napi_rx_handler - the NAPI handler for Rx processing
2463  *	@napi: the napi instance
2464  *	@budget: how many packets we can process in this round
2465  *
2466  *	Handler for new data events when using NAPI.
2467  */
2468 static int napi_rx_handler(struct napi_struct *napi, int budget)
2469 {
2470 	struct sge_qset *qs = container_of(napi, struct sge_qset, napi);
2471 	struct adapter *adap = qs->adap;
2472 	int work_done = process_responses(adap, qs, budget);
2473 
2474 	if (likely(work_done < budget)) {
2475 		napi_complete(napi);
2476 
2477 		/*
2478 		 * Because we don't atomically flush the following
2479 		 * write it is possible that in very rare cases it can
2480 		 * reach the device in a way that races with a new
2481 		 * response being written plus an error interrupt
2482 		 * causing the NAPI interrupt handler below to return
2483 		 * unhandled status to the OS.  To protect against
2484 		 * this would require flushing the write and doing
2485 		 * both the write and the flush with interrupts off.
2486 		 * Way too expensive and unjustifiable given the
2487 		 * rarity of the race.
2488 		 *
2489 		 * The race cannot happen at all with MSI-X.
2490 		 */
2491 		t3_write_reg(adap, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) |
2492 			     V_NEWTIMER(qs->rspq.next_holdoff) |
2493 			     V_NEWINDEX(qs->rspq.cidx));
2494 	}
2495 	return work_done;
2496 }
2497 
2498 /*
2499  * Returns true if the device is already scheduled for polling.
2500  */
2501 static inline int napi_is_scheduled(struct napi_struct *napi)
2502 {
2503 	return test_bit(NAPI_STATE_SCHED, &napi->state);
2504 }
2505 
2506 /**
2507  *	process_pure_responses - process pure responses from a response queue
2508  *	@adap: the adapter
2509  *	@qs: the queue set owning the response queue
2510  *	@r: the first pure response to process
2511  *
2512  *	A simpler version of process_responses() that handles only pure (i.e.,
2513  *	non data-carrying) responses.  Such responses are too lightweight to
2514  *	justify calling a softirq under NAPI, so we handle them specially in
2515  *	the interrupt handler.  The function is called with a pointer to a
2516  *	response, which the caller must ensure is a valid pure response.
2517  *
2518  *	Returns 1 if it encounters a valid data-carrying response, 0 otherwise.
2519  */
2520 static int process_pure_responses(struct adapter *adap, struct sge_qset *qs,
2521 				  struct rsp_desc *r)
2522 {
2523 	struct sge_rspq *q = &qs->rspq;
2524 	unsigned int sleeping = 0;
2525 
2526 	do {
2527 		u32 flags = ntohl(r->flags);
2528 
2529 		r++;
2530 		if (unlikely(++q->cidx == q->size)) {
2531 			q->cidx = 0;
2532 			q->gen ^= 1;
2533 			r = q->desc;
2534 		}
2535 		prefetch(r);
2536 
2537 		if (flags & RSPD_CTRL_MASK) {
2538 			sleeping |= flags & RSPD_GTS_MASK;
2539 			handle_rsp_cntrl_info(qs, flags);
2540 		}
2541 
2542 		q->pure_rsps++;
2543 		if (++q->credits >= (q->size / 4)) {
2544 			refill_rspq(adap, q, q->credits);
2545 			q->credits = 0;
2546 		}
2547 		if (!is_new_response(r, q))
2548 			break;
2549 		rmb();
2550 	} while (is_pure_response(r));
2551 
2552 	if (sleeping)
2553 		check_ring_db(adap, qs, sleeping);
2554 
2555 	smp_mb();		/* commit Tx queue .processed updates */
2556 	if (unlikely(qs->txq_stopped != 0))
2557 		restart_tx(qs);
2558 
2559 	return is_new_response(r, q);
2560 }
2561 
2562 /**
2563  *	handle_responses - decide what to do with new responses in NAPI mode
2564  *	@adap: the adapter
2565  *	@q: the response queue
2566  *
2567  *	This is used by the NAPI interrupt handlers to decide what to do with
2568  *	new SGE responses.  If there are no new responses it returns -1.  If
2569  *	there are new responses and they are pure (i.e., non-data carrying)
2570  *	it handles them straight in hard interrupt context as they are very
2571  *	cheap and don't deliver any packets.  Finally, if there are any data
2572  *	signaling responses it schedules the NAPI handler.  Returns 1 if it
2573  *	schedules NAPI, 0 if all new responses were pure.
2574  *
2575  *	The caller must ascertain NAPI is not already running.
2576  */
2577 static inline int handle_responses(struct adapter *adap, struct sge_rspq *q)
2578 {
2579 	struct sge_qset *qs = rspq_to_qset(q);
2580 	struct rsp_desc *r = &q->desc[q->cidx];
2581 
2582 	if (!is_new_response(r, q))
2583 		return -1;
2584 	rmb();
2585 	if (is_pure_response(r) && process_pure_responses(adap, qs, r) == 0) {
2586 		t3_write_reg(adap, A_SG_GTS, V_RSPQ(q->cntxt_id) |
2587 			     V_NEWTIMER(q->holdoff_tmr) | V_NEWINDEX(q->cidx));
2588 		return 0;
2589 	}
2590 	napi_schedule(&qs->napi);
2591 	return 1;
2592 }
2593 
2594 /*
2595  * The MSI-X interrupt handler for an SGE response queue for the non-NAPI case
2596  * (i.e., response queue serviced in hard interrupt).
2597  */
2598 static irqreturn_t t3_sge_intr_msix(int irq, void *cookie)
2599 {
2600 	struct sge_qset *qs = cookie;
2601 	struct adapter *adap = qs->adap;
2602 	struct sge_rspq *q = &qs->rspq;
2603 
2604 	spin_lock(&q->lock);
2605 	if (process_responses(adap, qs, -1) == 0)
2606 		q->unhandled_irqs++;
2607 	t3_write_reg(adap, A_SG_GTS, V_RSPQ(q->cntxt_id) |
2608 		     V_NEWTIMER(q->next_holdoff) | V_NEWINDEX(q->cidx));
2609 	spin_unlock(&q->lock);
2610 	return IRQ_HANDLED;
2611 }
2612 
2613 /*
2614  * The MSI-X interrupt handler for an SGE response queue for the NAPI case
2615  * (i.e., response queue serviced by NAPI polling).
2616  */
2617 static irqreturn_t t3_sge_intr_msix_napi(int irq, void *cookie)
2618 {
2619 	struct sge_qset *qs = cookie;
2620 	struct sge_rspq *q = &qs->rspq;
2621 
2622 	spin_lock(&q->lock);
2623 
2624 	if (handle_responses(qs->adap, q) < 0)
2625 		q->unhandled_irqs++;
2626 	spin_unlock(&q->lock);
2627 	return IRQ_HANDLED;
2628 }
2629 
2630 /*
2631  * The non-NAPI MSI interrupt handler.  This needs to handle data events from
2632  * SGE response queues as well as error and other async events as they all use
2633  * the same MSI vector.  We use one SGE response queue per port in this mode
2634  * and protect all response queues with queue 0's lock.
2635  */
2636 static irqreturn_t t3_intr_msi(int irq, void *cookie)
2637 {
2638 	int new_packets = 0;
2639 	struct adapter *adap = cookie;
2640 	struct sge_rspq *q = &adap->sge.qs[0].rspq;
2641 
2642 	spin_lock(&q->lock);
2643 
2644 	if (process_responses(adap, &adap->sge.qs[0], -1)) {
2645 		t3_write_reg(adap, A_SG_GTS, V_RSPQ(q->cntxt_id) |
2646 			     V_NEWTIMER(q->next_holdoff) | V_NEWINDEX(q->cidx));
2647 		new_packets = 1;
2648 	}
2649 
2650 	if (adap->params.nports == 2 &&
2651 	    process_responses(adap, &adap->sge.qs[1], -1)) {
2652 		struct sge_rspq *q1 = &adap->sge.qs[1].rspq;
2653 
2654 		t3_write_reg(adap, A_SG_GTS, V_RSPQ(q1->cntxt_id) |
2655 			     V_NEWTIMER(q1->next_holdoff) |
2656 			     V_NEWINDEX(q1->cidx));
2657 		new_packets = 1;
2658 	}
2659 
2660 	if (!new_packets && t3_slow_intr_handler(adap) == 0)
2661 		q->unhandled_irqs++;
2662 
2663 	spin_unlock(&q->lock);
2664 	return IRQ_HANDLED;
2665 }
2666 
2667 static int rspq_check_napi(struct sge_qset *qs)
2668 {
2669 	struct sge_rspq *q = &qs->rspq;
2670 
2671 	if (!napi_is_scheduled(&qs->napi) &&
2672 	    is_new_response(&q->desc[q->cidx], q)) {
2673 		napi_schedule(&qs->napi);
2674 		return 1;
2675 	}
2676 	return 0;
2677 }
2678 
2679 /*
2680  * The MSI interrupt handler for the NAPI case (i.e., response queues serviced
2681  * by NAPI polling).  Handles data events from SGE response queues as well as
2682  * error and other async events as they all use the same MSI vector.  We use
2683  * one SGE response queue per port in this mode and protect all response
2684  * queues with queue 0's lock.
2685  */
2686 static irqreturn_t t3_intr_msi_napi(int irq, void *cookie)
2687 {
2688 	int new_packets;
2689 	struct adapter *adap = cookie;
2690 	struct sge_rspq *q = &adap->sge.qs[0].rspq;
2691 
2692 	spin_lock(&q->lock);
2693 
2694 	new_packets = rspq_check_napi(&adap->sge.qs[0]);
2695 	if (adap->params.nports == 2)
2696 		new_packets += rspq_check_napi(&adap->sge.qs[1]);
2697 	if (!new_packets && t3_slow_intr_handler(adap) == 0)
2698 		q->unhandled_irqs++;
2699 
2700 	spin_unlock(&q->lock);
2701 	return IRQ_HANDLED;
2702 }
2703 
2704 /*
2705  * A helper function that processes responses and issues GTS.
2706  */
2707 static inline int process_responses_gts(struct adapter *adap,
2708 					struct sge_rspq *rq)
2709 {
2710 	int work;
2711 
2712 	work = process_responses(adap, rspq_to_qset(rq), -1);
2713 	t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) |
2714 		     V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx));
2715 	return work;
2716 }
2717 
2718 /*
2719  * The legacy INTx interrupt handler.  This needs to handle data events from
2720  * SGE response queues as well as error and other async events as they all use
2721  * the same interrupt pin.  We use one SGE response queue per port in this mode
2722  * and protect all response queues with queue 0's lock.
2723  */
2724 static irqreturn_t t3_intr(int irq, void *cookie)
2725 {
2726 	int work_done, w0, w1;
2727 	struct adapter *adap = cookie;
2728 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
2729 	struct sge_rspq *q1 = &adap->sge.qs[1].rspq;
2730 
2731 	spin_lock(&q0->lock);
2732 
2733 	w0 = is_new_response(&q0->desc[q0->cidx], q0);
2734 	w1 = adap->params.nports == 2 &&
2735 	    is_new_response(&q1->desc[q1->cidx], q1);
2736 
2737 	if (likely(w0 | w1)) {
2738 		t3_write_reg(adap, A_PL_CLI, 0);
2739 		t3_read_reg(adap, A_PL_CLI);	/* flush */
2740 
2741 		if (likely(w0))
2742 			process_responses_gts(adap, q0);
2743 
2744 		if (w1)
2745 			process_responses_gts(adap, q1);
2746 
2747 		work_done = w0 | w1;
2748 	} else
2749 		work_done = t3_slow_intr_handler(adap);
2750 
2751 	spin_unlock(&q0->lock);
2752 	return IRQ_RETVAL(work_done != 0);
2753 }
2754 
2755 /*
2756  * Interrupt handler for legacy INTx interrupts for T3B-based cards.
2757  * Handles data events from SGE response queues as well as error and other
2758  * async events as they all use the same interrupt pin.  We use one SGE
2759  * response queue per port in this mode and protect all response queues with
2760  * queue 0's lock.
2761  */
2762 static irqreturn_t t3b_intr(int irq, void *cookie)
2763 {
2764 	u32 map;
2765 	struct adapter *adap = cookie;
2766 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
2767 
2768 	t3_write_reg(adap, A_PL_CLI, 0);
2769 	map = t3_read_reg(adap, A_SG_DATA_INTR);
2770 
2771 	if (unlikely(!map))	/* shared interrupt, most likely */
2772 		return IRQ_NONE;
2773 
2774 	spin_lock(&q0->lock);
2775 
2776 	if (unlikely(map & F_ERRINTR))
2777 		t3_slow_intr_handler(adap);
2778 
2779 	if (likely(map & 1))
2780 		process_responses_gts(adap, q0);
2781 
2782 	if (map & 2)
2783 		process_responses_gts(adap, &adap->sge.qs[1].rspq);
2784 
2785 	spin_unlock(&q0->lock);
2786 	return IRQ_HANDLED;
2787 }
2788 
2789 /*
2790  * NAPI interrupt handler for legacy INTx interrupts for T3B-based cards.
2791  * Handles data events from SGE response queues as well as error and other
2792  * async events as they all use the same interrupt pin.  We use one SGE
2793  * response queue per port in this mode and protect all response queues with
2794  * queue 0's lock.
2795  */
2796 static irqreturn_t t3b_intr_napi(int irq, void *cookie)
2797 {
2798 	u32 map;
2799 	struct adapter *adap = cookie;
2800 	struct sge_qset *qs0 = &adap->sge.qs[0];
2801 	struct sge_rspq *q0 = &qs0->rspq;
2802 
2803 	t3_write_reg(adap, A_PL_CLI, 0);
2804 	map = t3_read_reg(adap, A_SG_DATA_INTR);
2805 
2806 	if (unlikely(!map))	/* shared interrupt, most likely */
2807 		return IRQ_NONE;
2808 
2809 	spin_lock(&q0->lock);
2810 
2811 	if (unlikely(map & F_ERRINTR))
2812 		t3_slow_intr_handler(adap);
2813 
2814 	if (likely(map & 1))
2815 		napi_schedule(&qs0->napi);
2816 
2817 	if (map & 2)
2818 		napi_schedule(&adap->sge.qs[1].napi);
2819 
2820 	spin_unlock(&q0->lock);
2821 	return IRQ_HANDLED;
2822 }
2823 
2824 /**
2825  *	t3_intr_handler - select the top-level interrupt handler
2826  *	@adap: the adapter
2827  *	@polling: whether using NAPI to service response queues
2828  *
2829  *	Selects the top-level interrupt handler based on the type of interrupts
2830  *	(MSI-X, MSI, or legacy) and whether NAPI will be used to service the
2831  *	response queues.
2832  */
2833 irq_handler_t t3_intr_handler(struct adapter *adap, int polling)
2834 {
2835 	if (adap->flags & USING_MSIX)
2836 		return polling ? t3_sge_intr_msix_napi : t3_sge_intr_msix;
2837 	if (adap->flags & USING_MSI)
2838 		return polling ? t3_intr_msi_napi : t3_intr_msi;
2839 	if (adap->params.rev > 0)
2840 		return polling ? t3b_intr_napi : t3b_intr;
2841 	return t3_intr;
2842 }
2843 
2844 #define SGE_PARERR (F_CPPARITYERROR | F_OCPARITYERROR | F_RCPARITYERROR | \
2845 		    F_IRPARITYERROR | V_ITPARITYERROR(M_ITPARITYERROR) | \
2846 		    V_FLPARITYERROR(M_FLPARITYERROR) | F_LODRBPARITYERROR | \
2847 		    F_HIDRBPARITYERROR | F_LORCQPARITYERROR | \
2848 		    F_HIRCQPARITYERROR)
2849 #define SGE_FRAMINGERR (F_UC_REQ_FRAMINGERROR | F_R_REQ_FRAMINGERROR)
2850 #define SGE_FATALERR (SGE_PARERR | SGE_FRAMINGERR | F_RSPQCREDITOVERFOW | \
2851 		      F_RSPQDISABLED)
2852 
2853 /**
2854  *	t3_sge_err_intr_handler - SGE async event interrupt handler
2855  *	@adapter: the adapter
2856  *
2857  *	Interrupt handler for SGE asynchronous (non-data) events.
2858  */
2859 void t3_sge_err_intr_handler(struct adapter *adapter)
2860 {
2861 	unsigned int v, status = t3_read_reg(adapter, A_SG_INT_CAUSE) &
2862 				 ~F_FLEMPTY;
2863 
2864 	if (status & SGE_PARERR)
2865 		CH_ALERT(adapter, "SGE parity error (0x%x)\n",
2866 			 status & SGE_PARERR);
2867 	if (status & SGE_FRAMINGERR)
2868 		CH_ALERT(adapter, "SGE framing error (0x%x)\n",
2869 			 status & SGE_FRAMINGERR);
2870 
2871 	if (status & F_RSPQCREDITOVERFOW)
2872 		CH_ALERT(adapter, "SGE response queue credit overflow\n");
2873 
2874 	if (status & F_RSPQDISABLED) {
2875 		v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS);
2876 
2877 		CH_ALERT(adapter,
2878 			 "packet delivered to disabled response queue "
2879 			 "(0x%x)\n", (v >> S_RSPQ0DISABLED) & 0xff);
2880 	}
2881 
2882 	if (status & (F_HIPIODRBDROPERR | F_LOPIODRBDROPERR))
2883 		queue_work(cxgb3_wq, &adapter->db_drop_task);
2884 
2885 	if (status & (F_HIPRIORITYDBFULL | F_LOPRIORITYDBFULL))
2886 		queue_work(cxgb3_wq, &adapter->db_full_task);
2887 
2888 	if (status & (F_HIPRIORITYDBEMPTY | F_LOPRIORITYDBEMPTY))
2889 		queue_work(cxgb3_wq, &adapter->db_empty_task);
2890 
2891 	t3_write_reg(adapter, A_SG_INT_CAUSE, status);
2892 	if (status &  SGE_FATALERR)
2893 		t3_fatal_err(adapter);
2894 }
2895 
2896 /**
2897  *	sge_timer_tx - perform periodic maintenance of an SGE qset
2898  *	@data: the SGE queue set to maintain
2899  *
2900  *	Runs periodically from a timer to perform maintenance of an SGE queue
2901  *	set.  It performs two tasks:
2902  *
2903  *	Cleans up any completed Tx descriptors that may still be pending.
2904  *	Normal descriptor cleanup happens when new packets are added to a Tx
2905  *	queue so this timer is relatively infrequent and does any cleanup only
2906  *	if the Tx queue has not seen any new packets in a while.  We make a
2907  *	best effort attempt to reclaim descriptors, in that we don't wait
2908  *	around if we cannot get a queue's lock (which most likely is because
2909  *	someone else is queueing new packets and so will also handle the clean
2910  *	up).  Since control queues use immediate data exclusively we don't
2911  *	bother cleaning them up here.
2912  *
2913  */
2914 static void sge_timer_tx(unsigned long data)
2915 {
2916 	struct sge_qset *qs = (struct sge_qset *)data;
2917 	struct port_info *pi = netdev_priv(qs->netdev);
2918 	struct adapter *adap = pi->adapter;
2919 	unsigned int tbd[SGE_TXQ_PER_SET] = {0, 0};
2920 	unsigned long next_period;
2921 
2922 	if (__netif_tx_trylock(qs->tx_q)) {
2923 		tbd[TXQ_ETH] = reclaim_completed_tx(adap, &qs->txq[TXQ_ETH],
2924 						    TX_RECLAIM_TIMER_CHUNK);
2925 		__netif_tx_unlock(qs->tx_q);
2926 	}
2927 
2928 	if (spin_trylock(&qs->txq[TXQ_OFLD].lock)) {
2929 		tbd[TXQ_OFLD] = reclaim_completed_tx(adap, &qs->txq[TXQ_OFLD],
2930 						     TX_RECLAIM_TIMER_CHUNK);
2931 		spin_unlock(&qs->txq[TXQ_OFLD].lock);
2932 	}
2933 
2934 	next_period = TX_RECLAIM_PERIOD >>
2935 		      (max(tbd[TXQ_ETH], tbd[TXQ_OFLD]) /
2936 		      TX_RECLAIM_TIMER_CHUNK);
2937 	mod_timer(&qs->tx_reclaim_timer, jiffies + next_period);
2938 }
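
/*
 * Worked example for the adaptive period above (illustrative only): with
 * TX_RECLAIM_PERIOD = HZ/4 and TX_RECLAIM_TIMER_CHUNK = 64, a tbd[] value
 * below 64 schedules the next run a full HZ/4 away, a value of 64-127
 * halves that to HZ/8, and so on, so a queue with a larger outstanding
 * reclaim backlog is revisited sooner.
 */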
2939 
2940 /**
2941  *	sge_timer_rx - perform periodic maintenance of an SGE qset
2942  *	@data: the SGE queue set to maintain
2943  *
2944  *	a) Replenishes Rx queues that have run out due to memory shortage.
2945  *	Normally new Rx buffers are added when existing ones are consumed but
2946  *	when out of memory a queue can become empty.  We try to add only a few
2947  *	buffers here, the queue will be replenished fully as these new buffers
2948  *	are used up if memory shortage has subsided.
2949  *
2950  *	b) Returns coalesced response queue credits in case a response queue is
2951  *	starved.
2952  *
2953  */
2954 static void sge_timer_rx(unsigned long data)
2955 {
2956 	spinlock_t *lock;
2957 	struct sge_qset *qs = (struct sge_qset *)data;
2958 	struct port_info *pi = netdev_priv(qs->netdev);
2959 	struct adapter *adap = pi->adapter;
2960 	u32 status;
2961 
2962 	lock = adap->params.rev > 0 ?
2963 	       &qs->rspq.lock : &adap->sge.qs[0].rspq.lock;
2964 
2965 	if (!spin_trylock_irq(lock))
2966 		goto out;
2967 
2968 	if (napi_is_scheduled(&qs->napi))
2969 		goto unlock;
2970 
2971 	if (adap->params.rev < 4) {
2972 		status = t3_read_reg(adap, A_SG_RSPQ_FL_STATUS);
2973 
2974 		if (status & (1 << qs->rspq.cntxt_id)) {
2975 			qs->rspq.starved++;
2976 			if (qs->rspq.credits) {
2977 				qs->rspq.credits--;
2978 				refill_rspq(adap, &qs->rspq, 1);
2979 				qs->rspq.restarted++;
2980 				t3_write_reg(adap, A_SG_RSPQ_FL_STATUS,
2981 					     1 << qs->rspq.cntxt_id);
2982 			}
2983 		}
2984 	}
2985 
2986 	if (qs->fl[0].credits < qs->fl[0].size)
2987 		__refill_fl(adap, &qs->fl[0]);
2988 	if (qs->fl[1].credits < qs->fl[1].size)
2989 		__refill_fl(adap, &qs->fl[1]);
2990 
2991 unlock:
2992 	spin_unlock_irq(lock);
2993 out:
2994 	mod_timer(&qs->rx_reclaim_timer, jiffies + RX_RECLAIM_PERIOD);
2995 }
2996 
2997 /**
2998  *	t3_update_qset_coalesce - update coalescing settings for a queue set
2999  *	@qs: the SGE queue set
3000  *	@p: new queue set parameters
3001  *
3002  *	Update the coalescing settings for an SGE queue set.  Nothing is done
3003  *	if the queue set is not initialized yet.
3004  */
3005 void t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p)
3006 {
3007 	qs->rspq.holdoff_tmr = max(p->coalesce_usecs * 10, 1U);/* can't be 0 */
3008 	qs->rspq.polling = p->polling;
3009 	qs->napi.poll = p->polling ? napi_rx_handler : ofld_poll;
3010 }
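
/*
 * Example (illustrative only): the holdoff timer counts in roughly 0.1 us
 * ticks (see the A_SG_TIMER_TICK setup in t3_sge_init() below), so the
 * default coalesce_usecs of 5 from t3_sge_prep() yields holdoff_tmr = 50,
 * and a request for 0 us is clamped to the 1-tick minimum.
 */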
3011 
3012 /**
3013  *	t3_sge_alloc_qset - initialize an SGE queue set
3014  *	@adapter: the adapter
3015  *	@id: the queue set id
3016  *	@nports: how many Ethernet ports will be using this queue set
3017  *	@irq_vec_idx: the IRQ vector index for response queue interrupts
3018  *	@p: configuration parameters for this queue set
3019  *	@ntxq: number of Tx queues for the queue set
3020  *	@dev: net device associated with this queue set
3021  *	@netdevq: net device TX queue associated with this queue set
3022  *
3023  *	Allocate resources and initialize an SGE queue set.  A queue set
3024  *	comprises a response queue, two Rx free-buffer queues, and up to 3
3025  *	Tx queues.  The Tx queues are assigned roles in the order Ethernet
3026  *	queue, offload queue, and control queue.
3027  */
3028 int t3_sge_alloc_qset(struct adapter *adapter, unsigned int id, int nports,
3029 		      int irq_vec_idx, const struct qset_params *p,
3030 		      int ntxq, struct net_device *dev,
3031 		      struct netdev_queue *netdevq)
3032 {
3033 	int i, avail, ret = -ENOMEM;
3034 	struct sge_qset *q = &adapter->sge.qs[id];
3035 
3036 	init_qset_cntxt(q, id);
3037 	setup_timer(&q->tx_reclaim_timer, sge_timer_tx, (unsigned long)q);
3038 	setup_timer(&q->rx_reclaim_timer, sge_timer_rx, (unsigned long)q);
3039 
3040 	q->fl[0].desc = alloc_ring(adapter->pdev, p->fl_size,
3041 				   sizeof(struct rx_desc),
3042 				   sizeof(struct rx_sw_desc),
3043 				   &q->fl[0].phys_addr, &q->fl[0].sdesc);
3044 	if (!q->fl[0].desc)
3045 		goto err;
3046 
3047 	q->fl[1].desc = alloc_ring(adapter->pdev, p->jumbo_size,
3048 				   sizeof(struct rx_desc),
3049 				   sizeof(struct rx_sw_desc),
3050 				   &q->fl[1].phys_addr, &q->fl[1].sdesc);
3051 	if (!q->fl[1].desc)
3052 		goto err;
3053 
3054 	q->rspq.desc = alloc_ring(adapter->pdev, p->rspq_size,
3055 				  sizeof(struct rsp_desc), 0,
3056 				  &q->rspq.phys_addr, NULL);
3057 	if (!q->rspq.desc)
3058 		goto err;
3059 
3060 	for (i = 0; i < ntxq; ++i) {
3061 		/*
3062 		 * The control queue always uses immediate data so does not
3063 		 * need to keep track of any sk_buffs.
3064 		 */
3065 		size_t sz = i == TXQ_CTRL ? 0 : sizeof(struct tx_sw_desc);
3066 
3067 		q->txq[i].desc = alloc_ring(adapter->pdev, p->txq_size[i],
3068 					    sizeof(struct tx_desc), sz,
3069 					    &q->txq[i].phys_addr,
3070 					    &q->txq[i].sdesc);
3071 		if (!q->txq[i].desc)
3072 			goto err;
3073 
3074 		q->txq[i].gen = 1;
3075 		q->txq[i].size = p->txq_size[i];
3076 		spin_lock_init(&q->txq[i].lock);
3077 		skb_queue_head_init(&q->txq[i].sendq);
3078 	}
3079 
3080 	tasklet_init(&q->txq[TXQ_OFLD].qresume_tsk, restart_offloadq,
3081 		     (unsigned long)q);
3082 	tasklet_init(&q->txq[TXQ_CTRL].qresume_tsk, restart_ctrlq,
3083 		     (unsigned long)q);
3084 
3085 	q->fl[0].gen = q->fl[1].gen = 1;
3086 	q->fl[0].size = p->fl_size;
3087 	q->fl[1].size = p->jumbo_size;
3088 
3089 	q->rspq.gen = 1;
3090 	q->rspq.size = p->rspq_size;
3091 	spin_lock_init(&q->rspq.lock);
3092 	skb_queue_head_init(&q->rspq.rx_queue);
3093 
3094 	q->txq[TXQ_ETH].stop_thres = nports *
3095 	    flits_to_desc(sgl_len(MAX_SKB_FRAGS + 1) + 3);
3096 
3097 #if FL0_PG_CHUNK_SIZE > 0
3098 	q->fl[0].buf_size = FL0_PG_CHUNK_SIZE;
3099 #else
3100 	q->fl[0].buf_size = SGE_RX_SM_BUF_SIZE + sizeof(struct cpl_rx_data);
3101 #endif
3102 #if FL1_PG_CHUNK_SIZE > 0
3103 	q->fl[1].buf_size = FL1_PG_CHUNK_SIZE;
3104 #else
3105 	q->fl[1].buf_size = is_offload(adapter) ?
3106 		(16 * 1024) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) :
3107 		MAX_FRAME_SIZE + 2 + sizeof(struct cpl_rx_pkt);
3108 #endif
3109 
3110 	q->fl[0].use_pages = FL0_PG_CHUNK_SIZE > 0;
3111 	q->fl[1].use_pages = FL1_PG_CHUNK_SIZE > 0;
3112 	q->fl[0].order = FL0_PG_ORDER;
3113 	q->fl[1].order = FL1_PG_ORDER;
3114 	q->fl[0].alloc_size = FL0_PG_ALLOC_SIZE;
3115 	q->fl[1].alloc_size = FL1_PG_ALLOC_SIZE;
3116 
3117 	spin_lock_irq(&adapter->sge.reg_lock);
3118 
3119 	/* FL threshold comparison uses < */
3120 	ret = t3_sge_init_rspcntxt(adapter, q->rspq.cntxt_id, irq_vec_idx,
3121 				   q->rspq.phys_addr, q->rspq.size,
3122 				   q->fl[0].buf_size - SGE_PG_RSVD, 1, 0);
3123 	if (ret)
3124 		goto err_unlock;
3125 
3126 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
3127 		ret = t3_sge_init_flcntxt(adapter, q->fl[i].cntxt_id, 0,
3128 					  q->fl[i].phys_addr, q->fl[i].size,
3129 					  q->fl[i].buf_size - SGE_PG_RSVD,
3130 					  p->cong_thres, 1, 0);
3131 		if (ret)
3132 			goto err_unlock;
3133 	}
3134 
3135 	ret = t3_sge_init_ecntxt(adapter, q->txq[TXQ_ETH].cntxt_id, USE_GTS,
3136 				 SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr,
3137 				 q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token,
3138 				 1, 0);
3139 	if (ret)
3140 		goto err_unlock;
3141 
3142 	if (ntxq > 1) {
3143 		ret = t3_sge_init_ecntxt(adapter, q->txq[TXQ_OFLD].cntxt_id,
3144 					 USE_GTS, SGE_CNTXT_OFLD, id,
3145 					 q->txq[TXQ_OFLD].phys_addr,
3146 					 q->txq[TXQ_OFLD].size, 0, 1, 0);
3147 		if (ret)
3148 			goto err_unlock;
3149 	}
3150 
3151 	if (ntxq > 2) {
3152 		ret = t3_sge_init_ecntxt(adapter, q->txq[TXQ_CTRL].cntxt_id, 0,
3153 					 SGE_CNTXT_CTRL, id,
3154 					 q->txq[TXQ_CTRL].phys_addr,
3155 					 q->txq[TXQ_CTRL].size,
3156 					 q->txq[TXQ_CTRL].token, 1, 0);
3157 		if (ret)
3158 			goto err_unlock;
3159 	}
3160 
3161 	spin_unlock_irq(&adapter->sge.reg_lock);
3162 
3163 	q->adap = adapter;
3164 	q->netdev = dev;
3165 	q->tx_q = netdevq;
3166 	t3_update_qset_coalesce(q, p);
3167 
3168 	avail = refill_fl(adapter, &q->fl[0], q->fl[0].size,
3169 			  GFP_KERNEL | __GFP_COMP);
3170 	if (!avail) {
3171 		CH_ALERT(adapter, "free list queue 0 initialization failed\n");
3172 		goto err;
3173 	}
3174 	if (avail < q->fl[0].size)
3175 		CH_WARN(adapter, "free list queue 0 enabled with %d credits\n",
3176 			avail);
3177 
3178 	avail = refill_fl(adapter, &q->fl[1], q->fl[1].size,
3179 			  GFP_KERNEL | __GFP_COMP);
3180 	if (avail < q->fl[1].size)
3181 		CH_WARN(adapter, "free list queue 1 enabled with %d credits\n",
3182 			avail);
3183 	refill_rspq(adapter, &q->rspq, q->rspq.size - 1);
3184 
3185 	t3_write_reg(adapter, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) |
3186 		     V_NEWTIMER(q->rspq.holdoff_tmr));
3187 
3188 	return 0;
3189 
3190 err_unlock:
3191 	spin_unlock_irq(&adapter->sge.reg_lock);
3192 err:
3193 	t3_free_qset(adapter, q);
3194 	return ret;
3195 }
3196 
3197 /**
3198  *	t3_start_sge_timers - start SGE timer callbacks
3199  *	@adap: the adapter
3200  *
3201  *	Starts each SGE queue set's timer callback
3202  */
3203 void t3_start_sge_timers(struct adapter *adap)
3204 {
3205 	int i;
3206 
3207 	for (i = 0; i < SGE_QSETS; ++i) {
3208 		struct sge_qset *q = &adap->sge.qs[i];
3209 
3210 		if (q->tx_reclaim_timer.function)
3211 			mod_timer(&q->tx_reclaim_timer, jiffies + TX_RECLAIM_PERIOD);
3212 
3213 		if (q->rx_reclaim_timer.function)
3214 			mod_timer(&q->rx_reclaim_timer, jiffies + RX_RECLAIM_PERIOD);
3215 	}
3216 }
3217 
3218 /**
3219  *	t3_stop_sge_timers - stop SGE timer callbacks
3220  *	@adap: the adapter
3221  *
3222  *	Stops each SGE queue set's timer callback
3223  */
3224 void t3_stop_sge_timers(struct adapter *adap)
3225 {
3226 	int i;
3227 
3228 	for (i = 0; i < SGE_QSETS; ++i) {
3229 		struct sge_qset *q = &adap->sge.qs[i];
3230 
3231 		if (q->tx_reclaim_timer.function)
3232 			del_timer_sync(&q->tx_reclaim_timer);
3233 		if (q->rx_reclaim_timer.function)
3234 			del_timer_sync(&q->rx_reclaim_timer);
3235 	}
3236 }
3237 
3238 /**
3239  *	t3_free_sge_resources - free SGE resources
3240  *	@adap: the adapter
3241  *
3242  *	Frees resources used by the SGE queue sets.
3243  */
3244 void t3_free_sge_resources(struct adapter *adap)
3245 {
3246 	int i;
3247 
3248 	for (i = 0; i < SGE_QSETS; ++i)
3249 		t3_free_qset(adap, &adap->sge.qs[i]);
3250 }
3251 
3252 /**
3253  *	t3_sge_start - enable SGE
3254  *	@adap: the adapter
3255  *
3256  *	Enables the SGE for DMAs.  This is the last step in starting packet
3257  *	transfers.
3258  */
3259 void t3_sge_start(struct adapter *adap)
3260 {
3261 	t3_set_reg_field(adap, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE);
3262 }
3263 
3264 /**
3265  *	t3_sge_stop - disable SGE operation
3266  *	@adap: the adapter
3267  *
3268  *	Disables the DMA engine.  This can be called in emergencies (e.g.,
3269  *	from error interrupts) or from normal process context.  In the latter
3270  *	case it also disables any pending queue restart tasklets.  Note that
3271  *	if it is called in interrupt context it cannot disable the restart
3272  *	tasklets as it cannot wait, however the tasklets will have no effect
3273  *	tasklets as it cannot wait; however, the tasklets will have no effect
3274  *	later from process context, at which time the tasklets will be stopped
3275  *	if they are still running.
3276  */
3277 void t3_sge_stop(struct adapter *adap)
3278 {
3279 	t3_set_reg_field(adap, A_SG_CONTROL, F_GLOBALENABLE, 0);
3280 	if (!in_interrupt()) {
3281 		int i;
3282 
3283 		for (i = 0; i < SGE_QSETS; ++i) {
3284 			struct sge_qset *qs = &adap->sge.qs[i];
3285 
3286 			tasklet_kill(&qs->txq[TXQ_OFLD].qresume_tsk);
3287 			tasklet_kill(&qs->txq[TXQ_CTRL].qresume_tsk);
3288 		}
3289 	}
3290 }
3291 
3292 /**
3293  *	t3_sge_init - initialize SGE
3294  *	@adap: the adapter
3295  *	@p: the SGE parameters
3296  *
3297  *	Performs SGE initialization needed every time after a chip reset.
3298  *	We do not initialize any of the queue sets here, instead the driver
3299  *	top-level must request those individually.  We also do not enable DMA
3300  *	here, that should be done after the queues have been set up.
3301  */
3302 void t3_sge_init(struct adapter *adap, struct sge_params *p)
3303 {
3304 	unsigned int ctrl, ups = ffs(pci_resource_len(adap->pdev, 2) >> 12);
3305 
3306 	ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL |
3307 	    F_CQCRDTCTRL | F_CONGMODE | F_TNLFLMODE | F_FATLPERREN |
3308 	    V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS |
3309 	    V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING;
3310 #if SGE_NUM_GENBITS == 1
3311 	ctrl |= F_EGRGENCTRL;
3312 #endif
3313 	if (adap->params.rev > 0) {
3314 		if (!(adap->flags & (USING_MSIX | USING_MSI)))
3315 			ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ;
3316 	}
3317 	t3_write_reg(adap, A_SG_CONTROL, ctrl);
3318 	t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) |
3319 		     V_LORCQDRBTHRSH(512));
3320 	t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10);
3321 	t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) |
3322 		     V_TIMEOUT(200 * core_ticks_per_usec(adap)));
3323 	t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH,
3324 		     adap->params.rev < T3_REV_C ? 1000 : 500);
3325 	t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256);
3326 	t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000);
3327 	t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256);
3328 	t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff));
3329 	t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024);
3330 }
3331 
3332 /**
3333  *	t3_sge_prep - one-time SGE initialization
3334  *	@adap: the associated adapter
3335  *	@p: SGE parameters
3336  *
3337  *	Performs one-time initialization of SGE SW state.  Includes determining
3338  *	defaults for the assorted SGE parameters, which admins can change until
3339  *	they are used to initialize the SGE.
3340  */
3341 void t3_sge_prep(struct adapter *adap, struct sge_params *p)
3342 {
3343 	int i;
3344 
3345 	p->max_pkt_size = (16 * 1024) - sizeof(struct cpl_rx_data) -
3346 	    SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
3347 
3348 	for (i = 0; i < SGE_QSETS; ++i) {
3349 		struct qset_params *q = p->qset + i;
3350 
3351 		q->polling = adap->params.rev > 0;
3352 		q->coalesce_usecs = 5;
3353 		q->rspq_size = 1024;
3354 		q->fl_size = 1024;
3355 		q->jumbo_size = 512;
3356 		q->txq_size[TXQ_ETH] = 1024;
3357 		q->txq_size[TXQ_OFLD] = 1024;
3358 		q->txq_size[TXQ_CTRL] = 256;
3359 		q->cong_thres = 0;
3360 	}
3361 
3362 	spin_lock_init(&adap->sge.reg_lock);
3363 }
3364