xref: /freebsd/sys/dev/sfxge/sfxge_rx.c (revision 596596fec79f04e1f413850b44159224ff1fb8dc)
1 /*-
2  * Copyright (c) 2010-2011 Solarflare Communications, Inc.
3  * All rights reserved.
4  *
5  * This software was developed in part by Philip Paeps under contract for
6  * Solarflare Communications, Inc.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include <sys/types.h>
34 #include <sys/mbuf.h>
35 #include <sys/smp.h>
36 #include <sys/socket.h>
37 #include <sys/sysctl.h>
38 #include <sys/limits.h>
39 
40 #include <net/ethernet.h>
41 #include <net/if.h>
42 #include <net/if_vlan_var.h>
43 
44 #include <netinet/in.h>
45 #include <netinet/ip.h>
46 #include <netinet/ip6.h>
47 #include <netinet/tcp.h>
48 
49 #include <machine/in_cksum.h>
50 
51 #include "common/efx.h"
52 
53 
54 #include "sfxge.h"
55 #include "sfxge_rx.h"
56 
57 #define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)
58 
59 #ifdef SFXGE_LRO
60 
61 /* Size of the LRO hash table.  Must be a power of 2.  A larger table
62  * means we can accelerate a larger number of streams.
63  */
64 static unsigned lro_table_size = 128;
65 
66 /* Maximum length of a hash chain.  If chains get too long then the lookup
67  * cost increases and may outweigh the benefit of LRO.
68  */
69 static unsigned lro_chain_max = 20;
70 
71 /* Maximum time (in ticks) that a connection can be idle before its LRO
72  * state is discarded.
73  */
74 static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
75 
76 /* Number of packets with payload that must arrive in-order before a
77  * connection is eligible for LRO.  The idea is we should avoid coalescing
78  * segments when the sender is in slow-start because reducing the ACK rate
79  * can damage performance.
80  */
81 static int lro_slow_start_packets = 2000;
82 
83 /* Number of packets with payload that must arrive in-order following loss
84  * before a connection is eligible for LRO.  The idea is we should avoid
85  * coalescing segments when the sender is recovering from loss, because
86  * reducing the ACK rate can damage performance.
87  */
88 static int lro_loss_packets = 20;
89 
90 /* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
91 #define	SFXGE_LRO_L2_ID_VLAN 0x4000
92 #define	SFXGE_LRO_L2_ID_IPV6 0x8000
93 #define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
94 #define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
95 
96 /* Compare IPv6 addresses, avoiding conditional branches; zero iff equal */
97 static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
98 				   const struct in6_addr *right)
99 {
100 #if LONG_BIT == 64
101 	const uint64_t *left64 = (const uint64_t *)left;
102 	const uint64_t *right64 = (const uint64_t *)right;
103 	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
104 #else
105 	return (left->s6_addr32[0] - right->s6_addr32[0]) |
106 	       (left->s6_addr32[1] - right->s6_addr32[1]) |
107 	       (left->s6_addr32[2] - right->s6_addr32[2]) |
108 	       (left->s6_addr32[3] - right->s6_addr32[3]);
109 #endif
110 }
111 
112 #endif	/* SFXGE_LRO */
113 
114 void
115 sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
116 {
117 
118 	rxq->flush_state = SFXGE_FLUSH_DONE;
119 }
120 
121 void
122 sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
123 {
124 
125 	rxq->flush_state = SFXGE_FLUSH_FAILED;
126 }
127 
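/* Default Toeplitz hash key; programmed into the controller for
 * receive-side scaling by sfxge_rx_start().
 */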
128 static uint8_t toep_key[] = {
129 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
130 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
131 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
132 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
133 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
134 };
135 
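/* Ask the owning event queue to refill this receive queue by posting a
 * software (magic) event, so that the refill runs in event processing
 * context.
 */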
136 static void
137 sfxge_rx_post_refill(void *arg)
138 {
139 	struct sfxge_rxq *rxq = arg;
140 	struct sfxge_softc *sc;
141 	unsigned int index;
142 	struct sfxge_evq *evq;
143 	uint16_t magic;
144 
145 	sc = rxq->sc;
146 	index = rxq->index;
147 	evq = sc->evq[index];
148 
149 	magic = SFXGE_MAGIC_RX_QREFILL | index;
150 
151 	/* This is guaranteed due to the start/stop order of rx and ev */
152 	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
153 	    ("evq not started"));
154 	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
155 	    ("rxq not started"));
156 	efx_ev_qpost(evq->common, magic);
157 }
158 
159 static void
160 sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
161 {
162 	/* Initially retry after 100 ms, but back off in case of
163 	 * repeated failures as we probably have to wait for the
164 	 * administrator to raise the pool limit. */
165 	if (retrying)
166 		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
167 	else
168 		rxq->refill_delay = hz / 10;
169 
170 	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
171 			     sfxge_rx_post_refill, rxq);
172 }
173 
174 static struct mbuf *sfxge_rx_alloc_mbuf(struct sfxge_softc *sc)
175 {
176 	struct mb_args args;
177 	struct mbuf *m;
178 
179 	/* Allocate mbuf structure */
180 	args.flags = M_PKTHDR;
181 	args.type = MT_DATA;
182 	m = (struct mbuf *)uma_zalloc_arg(zone_mbuf, &args, M_NOWAIT);
183 
184 	/* Allocate (and attach) packet buffer */
185 	if (m != NULL && !uma_zalloc_arg(sc->rx_buffer_zone, m, M_NOWAIT)) {
186 		uma_zfree(zone_mbuf, m);
187 		m = NULL;
188 	}
189 
190 	return (m);
191 }
192 
193 #define	SFXGE_REFILL_BATCH  64
194 
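/* Allocate and post up to 'target' receive buffers, handing descriptors to
 * the hardware in batches of SFXGE_REFILL_BATCH.  If mbuf allocation fails,
 * schedule a delayed retry via the refill callout.
 */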
195 static void
196 sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
197 {
198 	struct sfxge_softc *sc;
199 	unsigned int index;
200 	struct sfxge_evq *evq;
201 	unsigned int batch;
202 	unsigned int rxfill;
203 	unsigned int mblksize;
204 	int ntodo;
205 	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
206 
207 	sc = rxq->sc;
208 	index = rxq->index;
209 	evq = sc->evq[index];
210 
211 	prefetch_read_many(sc->enp);
212 	prefetch_read_many(rxq->common);
213 
214 	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
215 
216 	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
217 		return;
218 
219 	rxfill = rxq->added - rxq->completed;
220 	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
221 	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
222 	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
223 	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
224 	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));
225 
226 	if (ntodo == 0)
227 		return;
228 
229 	batch = 0;
230 	mblksize = sc->rx_buffer_size;
231 	while (ntodo-- > 0) {
232 		unsigned int id;
233 		struct sfxge_rx_sw_desc *rx_desc;
234 		bus_dma_segment_t seg;
235 		struct mbuf *m;
236 
237 		id = (rxq->added + batch) & rxq->ptr_mask;
238 		rx_desc = &rxq->queue[id];
239 		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));
240 
241 		rx_desc->flags = EFX_DISCARD;
242 		m = rx_desc->mbuf = sfxge_rx_alloc_mbuf(sc);
243 		if (m == NULL)
244 			break;
245 		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
246 		addr[batch++] = seg.ds_addr;
247 
248 		if (batch == SFXGE_REFILL_BATCH) {
249 			efx_rx_qpost(rxq->common, addr, mblksize, batch,
250 			    rxq->completed, rxq->added);
251 			rxq->added += batch;
252 			batch = 0;
253 		}
254 	}
255 
256 	if (ntodo != 0)
257 		sfxge_rx_schedule_refill(rxq, retrying);
258 
259 	if (batch != 0) {
260 		efx_rx_qpost(rxq->common, addr, mblksize, batch,
261 		    rxq->completed, rxq->added);
262 		rxq->added += batch;
263 	}
264 
265 	/* Make the descriptors visible to the hardware */
266 	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
267 			BUS_DMASYNC_PREWRITE);
268 
269 	efx_rx_qpush(rxq->common, rxq->added);
270 }
271 
272 void
273 sfxge_rx_qrefill(struct sfxge_rxq *rxq)
274 {
275 
276 	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
277 		return;
278 
279 	/* Make sure the queue is full */
280 	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
281 }
282 
283 static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
284 {
285 	struct ifnet *ifp = sc->ifnet;
286 
287 	m->m_pkthdr.rcvif = ifp;
288 	m->m_pkthdr.csum_data = 0xffff;
289 	ifp->if_input(ifp, m);
290 }
291 
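/* Hand a received packet to the stack, translating the hardware checksum
 * flags and, for TCP packets, taking the RSS hash from the receive prefix.
 */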
292 static void
293 sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
294 {
295 	struct mbuf *m = rx_desc->mbuf;
296 	int csum_flags;
297 
298 	/* Convert checksum flags */
299 	csum_flags = (rx_desc->flags & EFX_CKSUM_IPV4) ?
300 		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
301 	if (rx_desc->flags & EFX_CKSUM_TCPUDP)
302 		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
303 
304 #ifdef SFXGE_HAVE_MQ
305 	/* The hash covers a 4-tuple for TCP only */
306 	if (rx_desc->flags & EFX_PKT_TCP) {
307 		m->m_pkthdr.flowid = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
308 						       mtod(m, uint8_t *));
309 		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
310 	}
311 #endif
312 	m->m_data += sc->rx_prefix_size;
313 	m->m_len = rx_desc->size - sc->rx_prefix_size;
314 	m->m_pkthdr.len = m->m_len;
315 	m->m_pkthdr.csum_flags = csum_flags;
316 	__sfxge_rx_deliver(sc, rx_desc->mbuf);
317 
318 	rx_desc->flags = EFX_DISCARD;
319 	rx_desc->mbuf = NULL;
320 }
321 
322 #ifdef SFXGE_LRO
323 
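/* Deliver a coalesced packet to the stack, restoring the IP length field to
 * network order and recomputing the IPv4 header checksum.
 */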
324 static void
325 sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
326 {
327 	struct sfxge_softc *sc = st->sc;
328 	struct mbuf *m = c->mbuf;
329 	struct tcphdr *c_th;
330 	int csum_flags;
331 
332 	KASSERT(m, ("no mbuf to deliver"));
333 
334 	++st->n_bursts;
335 
336 	/* Finish off packet munging and recalculate IP header checksum. */
337 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
338 		struct ip *iph = c->nh;
339 		iph->ip_len = htons(iph->ip_len);
340 		iph->ip_sum = 0;
341 		iph->ip_sum = in_cksum_hdr(iph);
342 		c_th = (struct tcphdr *)(iph + 1);
343 		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
344 			      CSUM_IP_CHECKED | CSUM_IP_VALID);
345 	} else {
346 		struct ip6_hdr *iph = c->nh;
347 		iph->ip6_plen = htons(iph->ip6_plen);
348 		c_th = (struct tcphdr *)(iph + 1);
349 		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
350 	}
351 
352 	c_th->th_win = c->th_last->th_win;
353 	c_th->th_ack = c->th_last->th_ack;
354 	if (c_th->th_off == c->th_last->th_off) {
355 		/* Copy TCP options (take care to avoid going negative). */
356 		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
357 		memcpy(c_th + 1, c->th_last + 1, optlen);
358 	}
359 
360 #ifdef SFXGE_HAVE_MQ
361 	m->m_pkthdr.flowid = c->conn_hash;
362 	M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
363 #endif
364 	m->m_pkthdr.csum_flags = csum_flags;
365 	__sfxge_rx_deliver(sc, m);
366 
367 	c->mbuf = NULL;
368 	c->delivered = 1;
369 }
370 
371 /* Drop the given connection, and add it to the free list. */
372 static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
373 {
374 	unsigned bucket;
375 
376 	KASSERT(!c->mbuf, ("found orphaned mbuf"));
377 
378 	if (c->next_buf.mbuf != NULL) {
379 		sfxge_rx_deliver(rxq->sc, &c->next_buf);
380 		LIST_REMOVE(c, active_link);
381 	}
382 
383 	bucket = c->conn_hash & rxq->lro.conns_mask;
384 	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
385 	--rxq->lro.conns_n[bucket];
386 	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
387 	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
388 }
389 
390 /* Stop tracking connections that have gone idle in order to keep hash
391  * chains short.
392  */
393 static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
394 {
395 	struct sfxge_lro_conn *c;
396 	unsigned i;
397 
398 	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
399 		("found active connections"));
400 
401 	rxq->lro.last_purge_ticks = now;
402 	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
403 		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
404 			continue;
405 
406 		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
407 		if (now - c->last_pkt_ticks > lro_idle_ticks) {
408 			++rxq->lro.n_drop_idle;
409 			sfxge_lro_drop(rxq, c);
410 		}
411 	}
412 }
413 
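/* Append an in-order segment to an existing coalesced packet, extending the
 * mbuf chain and the (host-order) IP length accordingly.
 */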
414 static void
415 sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
416 		struct mbuf *mbuf, struct tcphdr *th)
417 {
418 	struct tcphdr *c_th;
419 
420 	/* Tack the new mbuf onto the chain. */
421 	KASSERT(!mbuf->m_next, ("mbuf already chained"));
422 	c->mbuf_tail->m_next = mbuf;
423 	c->mbuf_tail = mbuf;
424 
425 	/* Increase length appropriately */
426 	c->mbuf->m_pkthdr.len += mbuf->m_len;
427 
428 	/* Update the connection state flags */
429 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
430 		struct ip *iph = c->nh;
431 		iph->ip_len += mbuf->m_len;
432 		c_th = (struct tcphdr *)(iph + 1);
433 	} else {
434 		struct ip6_hdr *iph = c->nh;
435 		iph->ip6_plen += mbuf->m_len;
436 		c_th = (struct tcphdr *)(iph + 1);
437 	}
438 	c_th->th_flags |= (th->th_flags & TH_PUSH);
439 	c->th_last = th;
440 	++st->n_merges;
441 
442 	/* Pass packet up now if another segment could overflow the IP
443 	 * length.
444 	 */
445 	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
446 		sfxge_lro_deliver(st, c);
447 }
448 
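/* Begin a new coalesced packet with this segment, converting the IP length
 * field to host order so that sfxge_lro_merge() can accumulate into it.
 */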
449 static void
450 sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
451 		struct mbuf *mbuf, void *nh, struct tcphdr *th)
452 {
453 	/* Start the chain */
454 	c->mbuf = mbuf;
455 	c->mbuf_tail = c->mbuf;
456 	c->nh = nh;
457 	c->th_last = th;
458 
459 	mbuf->m_pkthdr.len = mbuf->m_len;
460 
461 	/* Mangle header fields for later processing */
462 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
463 		struct ip *iph = nh;
464 		iph->ip_len = ntohs(iph->ip_len);
465 	} else {
466 		struct ip6_hdr *iph = nh;
467 		iph->ip6_plen = ntohs(iph->ip6_plen);
468 	}
469 }
470 
471 /* Try to merge or otherwise hold or deliver (as appropriate) the
472  * packet buffered for this connection (c->next_buf).  Return a flag
473  * indicating whether the connection is still active for LRO purposes.
474  */
475 static int
476 sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
477 {
478 	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
479 	char *eh = c->next_eh;
480 	int data_length, hdr_length, dont_merge;
481 	unsigned th_seq, pkt_length;
482 	struct tcphdr *th;
483 	unsigned now;
484 
485 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
486 		struct ip *iph = c->next_nh;
487 		th = (struct tcphdr *)(iph + 1);
488 		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
489 	} else {
490 		struct ip6_hdr *iph = c->next_nh;
491 		th = (struct tcphdr *)(iph + 1);
492 		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
493 	}
494 
495 	hdr_length = (char *) th + th->th_off * 4 - eh;
496 	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
497 		       hdr_length);
498 	th_seq = ntohl(th->th_seq);
499 	dont_merge = ((data_length <= 0)
500 		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));
501 
502 	/* Check for options other than aligned timestamp. */
503 	if (th->th_off != 5) {
504 		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
505 		if (th->th_off == 8 &&
506 		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
507 					(TCPOPT_NOP << 16) |
508 					(TCPOPT_TIMESTAMP << 8) |
509 					TCPOLEN_TIMESTAMP)) {
510 			/* timestamp option -- okay */
511 		} else {
512 			dont_merge = 1;
513 		}
514 	}
515 
516 	if (__predict_false(th_seq != c->next_seq)) {
517 		/* Out-of-order, so start counting again. */
518 		if (c->mbuf != NULL)
519 			sfxge_lro_deliver(&rxq->lro, c);
520 		c->n_in_order_pkts -= lro_loss_packets;
521 		c->next_seq = th_seq + data_length;
522 		++rxq->lro.n_misorder;
523 		goto deliver_buf_out;
524 	}
525 	c->next_seq = th_seq + data_length;
526 
527 	now = ticks;
528 	if (now - c->last_pkt_ticks > lro_idle_ticks) {
529 		++rxq->lro.n_drop_idle;
530 		if (c->mbuf != NULL)
531 			sfxge_lro_deliver(&rxq->lro, c);
532 		sfxge_lro_drop(rxq, c);
533 		return (0);
534 	}
535 	c->last_pkt_ticks = ticks;
536 
537 	if (c->n_in_order_pkts < lro_slow_start_packets) {
538 		/* May be in slow-start, so don't merge. */
539 		++rxq->lro.n_slow_start;
540 		++c->n_in_order_pkts;
541 		goto deliver_buf_out;
542 	}
543 
544 	if (__predict_false(dont_merge)) {
545 		if (c->mbuf != NULL)
546 			sfxge_lro_deliver(&rxq->lro, c);
547 		if (th->th_flags & (TH_FIN | TH_RST)) {
548 			++rxq->lro.n_drop_closed;
549 			sfxge_lro_drop(rxq, c);
550 			return (0);
551 		}
552 		goto deliver_buf_out;
553 	}
554 
555 	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;
556 
557 	if (__predict_true(c->mbuf != NULL)) {
558 		/* Remove headers and any padding */
559 		rx_buf->mbuf->m_data += hdr_length;
560 		rx_buf->mbuf->m_len = data_length;
561 
562 		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
563 	} else {
564 		/* Remove any padding */
565 		rx_buf->mbuf->m_len = pkt_length;
566 
567 		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
568 	}
569 
570 	rx_buf->mbuf = NULL;
571 	return (1);
572 
573  deliver_buf_out:
574 	sfxge_rx_deliver(rxq->sc, rx_buf);
575 	return (1);
576 }
577 
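/* Start tracking a new connection, recycling an entry from the free list if
 * possible, unless its hash chain has already reached lro_chain_max.
 */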
578 static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
579 			       uint16_t l2_id, void *nh, struct tcphdr *th)
580 {
581 	unsigned bucket = conn_hash & st->conns_mask;
582 	struct sfxge_lro_conn *c;
583 
584 	if (st->conns_n[bucket] >= lro_chain_max) {
585 		++st->n_too_many;
586 		return;
587 	}
588 
589 	if (!TAILQ_EMPTY(&st->free_conns)) {
590 		c = TAILQ_FIRST(&st->free_conns);
591 		TAILQ_REMOVE(&st->free_conns, c, link);
592 	} else {
593 		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
594 		if (c == NULL)
595 			return;
596 		c->mbuf = NULL;
597 		c->next_buf.mbuf = NULL;
598 	}
599 
600 	/* Create the connection tracking data */
601 	++st->conns_n[bucket];
602 	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
603 	c->l2_id = l2_id;
604 	c->conn_hash = conn_hash;
605 	c->source = th->th_sport;
606 	c->dest = th->th_dport;
607 	c->n_in_order_pkts = 0;
608 	c->last_pkt_ticks = *(volatile int *)&ticks;
609 	c->delivered = 0;
610 	++st->n_new_stream;
611 	/* NB. We don't initialise c->next_seq, and it doesn't matter what
612 	 * value it has.  Most likely the next packet received for this
613 	 * connection will not match -- no harm done.
614 	 */
615 }
616 
617 /* Process mbuf and decide whether to dispatch it to the stack now or
618  * later.
619  */
620 static void
621 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
622 {
623 	struct sfxge_softc *sc = rxq->sc;
624 	struct mbuf *m = rx_buf->mbuf;
625 	struct ether_header *eh;
626 	struct sfxge_lro_conn *c;
627 	uint16_t l2_id;
628 	uint16_t l3_proto;
629 	void *nh;
630 	struct tcphdr *th;
631 	uint32_t conn_hash;
632 	unsigned bucket;
633 
634 	/* Get the hardware hash */
635 	conn_hash = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
636 				      mtod(m, uint8_t *));
637 
638 	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
639 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
640 		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
641 		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
642 			SFXGE_LRO_L2_ID_VLAN;
643 		l3_proto = veh->evl_proto;
644 		nh = veh + 1;
645 	} else {
646 		l2_id = 0;
647 		l3_proto = eh->ether_type;
648 		nh = eh + 1;
649 	}
650 
651 	/* Check whether this is a suitable packet (unfragmented
652 	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
653 	 * length, and compute a hash if necessary.  If not, return.
654 	 */
655 	if (l3_proto == htons(ETHERTYPE_IP)) {
656 		struct ip *iph = nh;
657 		if ((iph->ip_p - IPPROTO_TCP) |
658 		    (iph->ip_hl - (sizeof(*iph) >> 2u)) |
659 		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
660 			goto deliver_now;
661 		th = (struct tcphdr *)(iph + 1);
662 	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
663 		struct ip6_hdr *iph = nh;
664 		if (iph->ip6_nxt != IPPROTO_TCP)
665 			goto deliver_now;
666 		l2_id |= SFXGE_LRO_L2_ID_IPV6;
667 		th = (struct tcphdr *)(iph + 1);
668 	} else {
669 		goto deliver_now;
670 	}
671 
672 	bucket = conn_hash & rxq->lro.conns_mask;
673 
674 	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
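		/* Branch-free mismatch tests: each ORed difference is
		 * non-zero if and only if the corresponding fields differ.
		 */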
675 		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
676 			continue;
677 		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
678 			continue;
679 		if (c->mbuf != NULL) {
680 			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
681 				struct ip *c_iph, *iph = nh;
682 				c_iph = c->nh;
683 				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
684 				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
685 					continue;
686 			} else {
687 				struct ip6_hdr *c_iph, *iph = nh;
688 				c_iph = c->nh;
689 				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
690 				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
691 					continue;
692 			}
693 		}
694 
695 		/* Re-insert at head of list to reduce lookup time. */
696 		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
697 		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);
698 
699 		if (c->next_buf.mbuf != NULL) {
700 			if (!sfxge_lro_try_merge(rxq, c))
701 				goto deliver_now;
702 		} else {
703 			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
704 			    active_link);
705 		}
706 		c->next_buf = *rx_buf;
707 		c->next_eh = eh;
708 		c->next_nh = nh;
709 
710 		rx_buf->mbuf = NULL;
711 		rx_buf->flags = EFX_DISCARD;
712 		return;
713 	}
714 
715 	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
716  deliver_now:
717 	sfxge_rx_deliver(sc, rx_buf);
718 }
719 
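/* At the end of an event queue poll, flush or merge the buffers held for
 * active connections and periodically purge idle connection state.
 */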
720 static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
721 {
722 	struct sfxge_lro_state *st = &rxq->lro;
723 	struct sfxge_lro_conn *c;
724 	unsigned t;
725 
726 	while (!LIST_EMPTY(&st->active_conns)) {
727 		c = LIST_FIRST(&st->active_conns);
728 		if (!c->delivered && c->mbuf != NULL)
729 			sfxge_lro_deliver(st, c);
730 		if (sfxge_lro_try_merge(rxq, c)) {
731 			if (c->mbuf != NULL)
732 				sfxge_lro_deliver(st, c);
733 			LIST_REMOVE(c, active_link);
734 		}
735 		c->delivered = 0;
736 	}
737 
738 	t = *(volatile int *)&ticks;
739 	if (__predict_false(t != st->last_purge_ticks))
740 		sfxge_lro_purge_idle(rxq, t);
741 }
742 
743 #else	/* !SFXGE_LRO */
744 
745 static void
746 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
747 {
748 }
749 
750 static void
751 sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
752 {
753 }
754 
755 #endif	/* SFXGE_LRO */
756 
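/* Process receive completions up to rxq->pending, discarding errored and
 * loopback packets and passing the rest to LRO or directly to the stack,
 * then top up the queue if it has drained below the refill threshold.
 */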
757 void
758 sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
759 {
760 	struct sfxge_softc *sc = rxq->sc;
761 	int lro_enabled = sc->ifnet->if_capenable & IFCAP_LRO;
762 	unsigned int index;
763 	struct sfxge_evq *evq;
764 	unsigned int completed;
765 	unsigned int level;
766 	struct mbuf *m;
767 	struct sfxge_rx_sw_desc *prev = NULL;
768 
769 	index = rxq->index;
770 	evq = sc->evq[index];
771 
772 	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
773 
774 	completed = rxq->completed;
775 	while (completed != rxq->pending) {
776 		unsigned int id;
777 		struct sfxge_rx_sw_desc *rx_desc;
778 
779 		id = completed++ & rxq->ptr_mask;
780 		rx_desc = &rxq->queue[id];
781 		m = rx_desc->mbuf;
782 
783 		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
784 			goto discard;
785 
786 		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
787 			goto discard;
788 
789 		prefetch_read_many(mtod(m, caddr_t));
790 
791 		/* Check for loopback packets */
792 		if (!(rx_desc->flags & EFX_PKT_IPV4) &&
793 		    !(rx_desc->flags & EFX_PKT_IPV6)) {
794 			struct ether_header *etherhp;
795 
796 			/*LINTED*/
797 			etherhp = mtod(m, struct ether_header *);
798 
799 			if (etherhp->ether_type ==
800 			    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
801 				EFSYS_PROBE(loopback);
802 
803 				rxq->loopback++;
804 				goto discard;
805 			}
806 		}
807 
808 		/* Pass packet up the stack or into LRO (pipelined) */
809 		if (prev != NULL) {
810 			if (lro_enabled)
811 				sfxge_lro(rxq, prev);
812 			else
813 				sfxge_rx_deliver(sc, prev);
814 		}
815 		prev = rx_desc;
816 		continue;
817 
818 discard:
819 		/* Return the packet to the pool */
820 		m_free(m);
821 		rx_desc->mbuf = NULL;
822 	}
823 	rxq->completed = completed;
824 
825 	level = rxq->added - rxq->completed;
826 
827 	/* Pass last packet up the stack or into LRO */
828 	if (prev != NULL) {
829 		if (lro_enabled)
830 			sfxge_lro(rxq, prev);
831 		else
832 			sfxge_rx_deliver(sc, prev);
833 	}
834 
835 	/*
836 	 * If there are any pending flows and this is the end of the
837 	 * poll then they must be completed.
838 	 */
839 	if (eop)
840 		sfxge_lro_end_of_burst(rxq);
841 
842 	/* Top up the queue if necessary */
843 	if (level < rxq->refill_threshold)
844 		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
845 }
846 
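/* Stop a receive queue: flush it (retrying if the flush fails), complete any
 * outstanding descriptors and tear down the common code queue state.
 */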
847 static void
848 sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
849 {
850 	struct sfxge_rxq *rxq;
851 	struct sfxge_evq *evq;
852 	unsigned int count;
853 
854 	rxq = sc->rxq[index];
855 	evq = sc->evq[index];
856 
857 	SFXGE_EVQ_LOCK(evq);
858 
859 	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
860 	    ("rxq not started"));
861 
862 	rxq->init_state = SFXGE_RXQ_INITIALIZED;
863 
864 	callout_stop(&rxq->refill_callout);
865 
866 again:
867 	rxq->flush_state = SFXGE_FLUSH_PENDING;
868 
869 	/* Flush the receive queue */
870 	efx_rx_qflush(rxq->common);
871 
872 	SFXGE_EVQ_UNLOCK(evq);
873 
874 	count = 0;
875 	do {
876 		/* Spin for 100 ms */
877 		DELAY(100000);
878 
879 		if (rxq->flush_state != SFXGE_FLUSH_PENDING)
880 			break;
881 
882 	} while (++count < 20);
883 
884 	SFXGE_EVQ_LOCK(evq);
885 
886 	if (rxq->flush_state == SFXGE_FLUSH_FAILED)
887 		goto again;
888 
889 	rxq->flush_state = SFXGE_FLUSH_DONE;
890 
891 	rxq->pending = rxq->added;
892 	sfxge_rx_qcomplete(rxq, B_TRUE);
893 
894 	KASSERT(rxq->completed == rxq->pending,
895 	    ("rxq->completed != rxq->pending"));
896 
897 	rxq->added = 0;
898 	rxq->pending = 0;
899 	rxq->completed = 0;
900 	rxq->loopback = 0;
901 
902 	/* Destroy the common code receive queue. */
903 	efx_rx_qdestroy(rxq->common);
904 
905 	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
906 	    EFX_RXQ_NBUFS(sc->rxq_entries));
907 
908 	SFXGE_EVQ_UNLOCK(evq);
909 }
910 
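/* Start a receive queue: program the buffer table, create and enable the
 * common code queue, and fill it with receive buffers.
 */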
911 static int
912 sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
913 {
914 	struct sfxge_rxq *rxq;
915 	efsys_mem_t *esmp;
916 	struct sfxge_evq *evq;
917 	int rc;
918 
919 	rxq = sc->rxq[index];
920 	esmp = &rxq->mem;
921 	evq = sc->evq[index];
922 
923 	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
924 	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
925 	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
926 	    ("evq->init_state != SFXGE_EVQ_STARTED"));
927 
928 	/* Program the buffer table. */
929 	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
930 	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
931 		return (rc);
932 
933 	/* Create the common code receive queue. */
934 	if ((rc = efx_rx_qcreate(sc->enp, index, index, EFX_RXQ_TYPE_DEFAULT,
935 	    esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
936 	    &rxq->common)) != 0)
937 		goto fail;
938 
939 	SFXGE_EVQ_LOCK(evq);
940 
941 	/* Enable the receive queue. */
942 	efx_rx_qenable(rxq->common);
943 
944 	rxq->init_state = SFXGE_RXQ_STARTED;
945 
946 	/* Try to fill the queue from the pool. */
947 	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);
948 
949 	SFXGE_EVQ_UNLOCK(evq);
950 
951 	return (0);
952 
953 fail:
954 	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
955 	    EFX_RXQ_NBUFS(sc->rxq_entries));
956 	return (rc);
957 }
958 
959 void
960 sfxge_rx_stop(struct sfxge_softc *sc)
961 {
962 	int index;
963 
964 	/* Stop the receive queue(s) */
965 	index = sc->rxq_count;
966 	while (--index >= 0)
967 		sfxge_rx_qstop(sc, index);
968 
969 	sc->rx_prefix_size = 0;
970 	sc->rx_buffer_size = 0;
971 
972 	efx_rx_fini(sc->enp);
973 }
974 
975 int
976 sfxge_rx_start(struct sfxge_softc *sc)
977 {
978 	struct sfxge_intr *intr;
979 	int index;
980 	int rc;
981 
982 	intr = &sc->intr;
983 
984 	/* Initialize the common code receive module. */
985 	if ((rc = efx_rx_init(sc->enp)) != 0)
986 		return (rc);
987 
988 	/* Calculate the receive packet buffer size. */
989 	sc->rx_prefix_size = EFX_RX_PREFIX_SIZE;
990 	sc->rx_buffer_size = (EFX_MAC_PDU(sc->ifnet->if_mtu) +
991 			      sc->rx_prefix_size);
992 
993 	/* Select zone for packet buffers */
994 	if (sc->rx_buffer_size <= MCLBYTES)
995 		sc->rx_buffer_zone = zone_clust;
996 	else if (sc->rx_buffer_size <= MJUMPAGESIZE)
997 		sc->rx_buffer_zone = zone_jumbop;
998 	else if (sc->rx_buffer_size <= MJUM9BYTES)
999 		sc->rx_buffer_zone = zone_jumbo9;
1000 	else
1001 		sc->rx_buffer_zone = zone_jumbo16;
1002 
1003 	/*
1004 	 * Set up the scale table.  Enable all hash types and hash insertion.
1005 	 */
1006 	for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
1007 		sc->rx_indir_table[index] = index % sc->rxq_count;
1008 	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
1009 				       SFXGE_RX_SCALE_MAX)) != 0)
1010 		goto fail;
1011 	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
1012 	    (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
1013 	    (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);
1014 
1015 	if ((rc = efx_rx_scale_toeplitz_ipv4_key_set(sc->enp, toep_key,
1016 	    sizeof(toep_key))) != 0)
1017 		goto fail;
1018 
1019 	/* Start the receive queue(s). */
1020 	for (index = 0; index < sc->rxq_count; index++) {
1021 		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
1022 			goto fail2;
1023 	}
1024 
1025 	return (0);
1026 
1027 fail2:
1028 	while (--index >= 0)
1029 		sfxge_rx_qstop(sc, index);
1030 
1031 fail:
1032 	efx_rx_fini(sc->enp);
1033 
1034 	return (rc);
1035 }
1036 
1037 #ifdef SFXGE_LRO
1038 
1039 static void sfxge_lro_init(struct sfxge_rxq *rxq)
1040 {
1041 	struct sfxge_lro_state *st = &rxq->lro;
1042 	unsigned i;
1043 
1044 	st->conns_mask = lro_table_size - 1;
1045 	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
1046 		("lro_table_size must be a power of 2"));
1047 	st->sc = rxq->sc;
1048 	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
1049 			   M_SFXGE, M_WAITOK);
1050 	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
1051 			     M_SFXGE, M_WAITOK);
1052 	for (i = 0; i <= st->conns_mask; ++i) {
1053 		TAILQ_INIT(&st->conns[i]);
1054 		st->conns_n[i] = 0;
1055 	}
1056 	LIST_INIT(&st->active_conns);
1057 	TAILQ_INIT(&st->free_conns);
1058 }
1059 
1060 static void sfxge_lro_fini(struct sfxge_rxq *rxq)
1061 {
1062 	struct sfxge_lro_state *st = &rxq->lro;
1063 	struct sfxge_lro_conn *c;
1064 	unsigned i;
1065 
1066 	/* Return cleanly if sfxge_lro_init() has not been called. */
1067 	if (st->conns == NULL)
1068 		return;
1069 
1070 	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));
1071 
1072 	for (i = 0; i <= st->conns_mask; ++i) {
1073 		while (!TAILQ_EMPTY(&st->conns[i])) {
1074 			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
1075 			sfxge_lro_drop(rxq, c);
1076 		}
1077 	}
1078 
1079 	while (!TAILQ_EMPTY(&st->free_conns)) {
1080 		c = TAILQ_FIRST(&st->free_conns);
1081 		TAILQ_REMOVE(&st->free_conns, c, link);
1082 		KASSERT(!c->mbuf, ("found orphaned mbuf"));
1083 		free(c, M_SFXGE);
1084 	}
1085 
1086 	free(st->conns_n, M_SFXGE);
1087 	free(st->conns, M_SFXGE);
1088 	st->conns = NULL;
1089 }
1090 
1091 #else
1092 
1093 static void
1094 sfxge_lro_init(struct sfxge_rxq *rxq)
1095 {
1096 }
1097 
1098 static void
1099 sfxge_lro_fini(struct sfxge_rxq *rxq)
1100 {
1101 }
1102 
1103 #endif	/* SFXGE_LRO */
1104 
1105 static void
1106 sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
1107 {
1108 	struct sfxge_rxq *rxq;
1109 
1110 	rxq = sc->rxq[index];
1111 
1112 	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1113 	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1114 
1115 	/* Free the context array and the flow table. */
1116 	free(rxq->queue, M_SFXGE);
1117 	sfxge_lro_fini(rxq);
1118 
1119 	/* Release DMA memory. */
1120 	sfxge_dma_free(&rxq->mem);
1121 
1122 	sc->rxq[index] = NULL;
1123 
1124 	free(rxq, M_SFXGE);
1125 }
1126 
1127 static int
1128 sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
1129 {
1130 	struct sfxge_rxq *rxq;
1131 	struct sfxge_evq *evq;
1132 	efsys_mem_t *esmp;
1133 	int rc;
1134 
1135 	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));
1136 
1137 	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
1138 	rxq->sc = sc;
1139 	rxq->index = index;
1140 	rxq->entries = sc->rxq_entries;
1141 	rxq->ptr_mask = rxq->entries - 1;
1142 	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);
1143 
1144 	sc->rxq[index] = rxq;
1145 	esmp = &rxq->mem;
1146 
1147 	evq = sc->evq[index];
1148 
1149 	/* Allocate and zero DMA space. */
1150 	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
1151 		return (rc);
1152 
1153 	/* Allocate buffer table entries. */
1154 	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
1155 				 &rxq->buf_base_id);
1156 
1157 	/* Allocate the context array and the flow table. */
1158 	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
1159 	    M_SFXGE, M_WAITOK | M_ZERO);
1160 	sfxge_lro_init(rxq);
1161 
1162 	callout_init(&rxq->refill_callout, B_TRUE);
1163 
1164 	rxq->init_state = SFXGE_RXQ_INITIALIZED;
1165 
1166 	return (0);
1167 }
1168 
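/* Per-queue receive statistics, exported via sysctl as sums over all RX queues. */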
1169 static const struct {
1170 	const char *name;
1171 	size_t offset;
1172 } sfxge_rx_stats[] = {
1173 #define	SFXGE_RX_STAT(name, member) \
1174 	{ #name, offsetof(struct sfxge_rxq, member) }
1175 #ifdef SFXGE_LRO
1176 	SFXGE_RX_STAT(lro_merges, lro.n_merges),
1177 	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
1178 	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
1179 	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
1180 	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
1181 	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
1182 	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
1183 	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
1184 #endif
1185 };
1186 
1187 static int
1188 sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
1189 {
1190 	struct sfxge_softc *sc = arg1;
1191 	unsigned int id = arg2;
1192 	unsigned int sum, index;
1193 
1194 	/* Sum across all RX queues */
1195 	sum = 0;
1196 	for (index = 0; index < sc->rxq_count; index++)
1197 		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
1198 					 sfxge_rx_stats[id].offset);
1199 
1200 	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
1201 }
1202 
1203 static void
1204 sfxge_rx_stat_init(struct sfxge_softc *sc)
1205 {
1206 	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
1207 	struct sysctl_oid_list *stat_list;
1208 	unsigned int id;
1209 
1210 	stat_list = SYSCTL_CHILDREN(sc->stats_node);
1211 
1212 	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
1213 		SYSCTL_ADD_PROC(
1214 			ctx, stat_list,
1215 			OID_AUTO, sfxge_rx_stats[id].name,
1216 			CTLTYPE_UINT|CTLFLAG_RD,
1217 			sc, id, sfxge_rx_stat_handler, "IU",
1218 			"");
1219 	}
1220 }
1221 
1222 void
1223 sfxge_rx_fini(struct sfxge_softc *sc)
1224 {
1225 	int index;
1226 
1227 	index = sc->rxq_count;
1228 	while (--index >= 0)
1229 		sfxge_rx_qfini(sc, index);
1230 
1231 	sc->rxq_count = 0;
1232 }
1233 
1234 int
1235 sfxge_rx_init(struct sfxge_softc *sc)
1236 {
1237 	struct sfxge_intr *intr;
1238 	int index;
1239 	int rc;
1240 
1241 #ifdef SFXGE_LRO
1242 	if (lro_idle_ticks == 0)
1243 		lro_idle_ticks = hz / 10 + 1; /* 100 ms */
1244 #endif
1245 
1246 	intr = &sc->intr;
1247 
1248 	sc->rxq_count = intr->n_alloc;
1249 
1250 	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
1251 	    ("intr->state != SFXGE_INTR_INITIALIZED"));
1252 
1253 	/* Initialize the receive queue(s) - one per interrupt. */
1254 	for (index = 0; index < sc->rxq_count; index++) {
1255 		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
1256 			goto fail;
1257 	}
1258 
1259 	sfxge_rx_stat_init(sc);
1260 
1261 	return (0);
1262 
1263 fail:
1264 	/* Tear down the receive queue(s). */
1265 	while (--index >= 0)
1266 		sfxge_rx_qfini(sc, index);
1267 
1268 	sc->rxq_count = 0;
1269 	return (rc);
1270 }
1271