xref: /freebsd/sys/dev/sfxge/sfxge_rx.c (revision f95c930e6701aa675377bc5871ea490dd565eeba)
1 /*-
2  * Copyright (c) 2010-2011 Solarflare Communications, Inc.
3  * All rights reserved.
4  *
5  * This software was developed in part by Philip Paeps under contract for
6  * Solarflare Communications, Inc.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include <sys/types.h>
34 #include <sys/mbuf.h>
35 #include <sys/smp.h>
36 #include <sys/socket.h>
37 #include <sys/sysctl.h>
38 #include <sys/limits.h>
39 #include <sys/syslog.h>
40 
41 #include <net/ethernet.h>
42 #include <net/if.h>
43 #include <net/if_vlan_var.h>
44 
45 #include <netinet/in.h>
46 #include <netinet/ip.h>
47 #include <netinet/ip6.h>
48 #include <netinet/tcp.h>
49 
50 #include <machine/in_cksum.h>
51 
52 #include "common/efx.h"
53 
54 
55 #include "sfxge.h"
56 #include "sfxge_rx.h"
57 
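/* Fill level below which an RX queue is topped up again; set to 90% of the
 * queue limit (see the refill check in sfxge_rx_qcomplete()).
 */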
58 #define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)
59 
60 #ifdef SFXGE_LRO
61 
62 SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
63 	    "Large receive offload (LRO) parameters");
64 
65 #define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)
66 
67 /* Size of the LRO hash table.  Must be a power of 2.  A larger table
68  * means we can accelerate a larger number of streams.
69  */
70 static unsigned lro_table_size = 128;
71 TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
72 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
73 	    &lro_table_size, 0,
74 	    "Size of the LRO hash table (must be a power of 2)");
75 
76 /* Maximum length of a hash chain.  If chains get too long then the lookup
77  * time increases and may exceed the benefit of LRO.
78  */
79 static unsigned lro_chain_max = 20;
80 TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
81 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
82 	    &lro_chain_max, 0,
83 	    "The maximum length of a hash chain");
84 
85 /* Maximum time (in ticks) that a connection can be idle before its LRO
86  * state is discarded.
87  */
88 static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
89 TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
90 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
91 	    &lro_idle_ticks, 0,
92 	    "The maximum time (in ticks) that a connection can be idle "
93 	    "before its LRO state is discarded");
94 
95 /* Number of packets with payload that must arrive in-order before a
96  * connection is eligible for LRO.  The idea is that we should avoid coalescing
97  * segments when the sender is in slow-start because reducing the ACK rate
98  * can damage performance.
99  */
100 static int lro_slow_start_packets = 2000;
101 TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
102 SYSCTL_INT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
103 	    &lro_slow_start_packets, 0,
104 	    "Number of packets with payload that must arrive in-order before "
105 	    "a connection is eligible for LRO");
106 
107 /* Number of packets with payload that must arrive in-order following loss
108  * before a connection is eligible for LRO.  The idea is we should avoid
109  * coalescing segments when the sender is recovering from loss, because
110  * reducing the ACK rate can damage performance.
111  */
112 static int lro_loss_packets = 20;
113 TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
114 SYSCTL_INT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
115 	    &lro_loss_packets, 0,
116 	    "Number of packets with payload that must arrive in-order "
117 	    "following loss before a connection is eligible for LRO");
118 
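/* Example (illustrative values only): the LRO parameters above are read-only
 * tunables (CTLFLAG_RDTUN), so they are normally set at boot time, e.g. in
 * /boot/loader.conf, assuming SFXGE_PARAM() yields the "hw.sfxge." prefix:
 *
 *	hw.sfxge.lro.table_size="256"
 *	hw.sfxge.lro.slow_start_packets="1000"
 */
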
119 /* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
120 #define	SFXGE_LRO_L2_ID_VLAN 0x4000
121 #define	SFXGE_LRO_L2_ID_IPV6 0x8000
122 #define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
123 #define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
124 
125 /* Compare IPv6 addresses, avoiding conditional branches (zero iff equal) */
126 static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
127 				   const struct in6_addr *right)
128 {
129 #if LONG_BIT == 64
130 	const uint64_t *left64 = (const uint64_t *)left;
131 	const uint64_t *right64 = (const uint64_t *)right;
132 	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
133 #else
134 	return (left->s6_addr32[0] - right->s6_addr32[0]) |
135 	       (left->s6_addr32[1] - right->s6_addr32[1]) |
136 	       (left->s6_addr32[2] - right->s6_addr32[2]) |
137 	       (left->s6_addr32[3] - right->s6_addr32[3]);
138 #endif
139 }
140 
141 #endif	/* SFXGE_LRO */
142 
143 void
144 sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
145 {
146 
147 	rxq->flush_state = SFXGE_FLUSH_DONE;
148 }
149 
150 void
151 sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
152 {
153 
154 	rxq->flush_state = SFXGE_FLUSH_FAILED;
155 }
156 
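/* RSS (Toeplitz) hash key; programmed into the controller by sfxge_rx_start(). */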
157 static uint8_t toep_key[] = {
158 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
159 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
160 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
161 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
162 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
163 };
164 
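/* Callout handler: post a software (magic) event to the RX queue's event
 * queue so that the queue is refilled from the event processing context.
 */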
165 static void
166 sfxge_rx_post_refill(void *arg)
167 {
168 	struct sfxge_rxq *rxq = arg;
169 	struct sfxge_softc *sc;
170 	unsigned int index;
171 	struct sfxge_evq *evq;
172 	uint16_t magic;
173 
174 	sc = rxq->sc;
175 	index = rxq->index;
176 	evq = sc->evq[index];
177 
178 	magic = SFXGE_MAGIC_RX_QREFILL | index;
179 
180 	/* This is guaranteed due to the start/stop order of rx and ev */
181 	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
182 	    ("evq not started"));
183 	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
184 	    ("rxq not started"));
185 	efx_ev_qpost(evq->common, magic);
186 }
187 
188 static void
189 sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
190 {
191 	/* Initially retry after 100 ms, but back off in case of
192 	 * repeated failures as we probably have to wait for the
193 	 * administrator to raise the pool limit. */
194 	if (retrying)
195 		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
196 	else
197 		rxq->refill_delay = hz / 10;
198 
199 	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
200 			     sfxge_rx_post_refill, rxq);
201 }
202 
203 static struct mbuf *sfxge_rx_alloc_mbuf(struct sfxge_softc *sc)
204 {
205 	struct mb_args args;
206 	struct mbuf *m;
207 
208 	/* Allocate mbuf structure */
209 	args.flags = M_PKTHDR;
210 	args.type = MT_DATA;
211 	m = (struct mbuf *)uma_zalloc_arg(zone_mbuf, &args, M_NOWAIT);
212 
213 	/* Allocate (and attach) packet buffer */
214 	if (m != NULL && !uma_zalloc_arg(sc->rx_buffer_zone, m, M_NOWAIT)) {
215 		uma_zfree(zone_mbuf, m);
216 		m = NULL;
217 	}
218 
219 	return (m);
220 }
221 
222 #define	SFXGE_REFILL_BATCH  64
223 
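/* Add up to "target" receive buffers to the queue, posting their DMA
 * addresses to the controller in batches of SFXGE_REFILL_BATCH.  If mbuf
 * allocation fails, a delayed retry is scheduled.  Called with the event
 * queue lock held.
 */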
224 static void
225 sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
226 {
227 	struct sfxge_softc *sc;
228 	unsigned int index;
229 	struct sfxge_evq *evq;
230 	unsigned int batch;
231 	unsigned int rxfill;
232 	unsigned int mblksize;
233 	int ntodo;
234 	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
235 
236 	sc = rxq->sc;
237 	index = rxq->index;
238 	evq = sc->evq[index];
239 
240 	prefetch_read_many(sc->enp);
241 	prefetch_read_many(rxq->common);
242 
243 	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
244 
245 	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
246 		return;
247 
248 	rxfill = rxq->added - rxq->completed;
249 	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
250 	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
251 	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
252 	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
253 	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));
254 
255 	if (ntodo == 0)
256 		return;
257 
258 	batch = 0;
259 	mblksize = sc->rx_buffer_size;
260 	while (ntodo-- > 0) {
261 		unsigned int id;
262 		struct sfxge_rx_sw_desc *rx_desc;
263 		bus_dma_segment_t seg;
264 		struct mbuf *m;
265 
266 		id = (rxq->added + batch) & rxq->ptr_mask;
267 		rx_desc = &rxq->queue[id];
268 		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));
269 
270 		rx_desc->flags = EFX_DISCARD;
271 		m = rx_desc->mbuf = sfxge_rx_alloc_mbuf(sc);
272 		if (m == NULL)
273 			break;
274 		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
275 		addr[batch++] = seg.ds_addr;
276 
277 		if (batch == SFXGE_REFILL_BATCH) {
278 			efx_rx_qpost(rxq->common, addr, mblksize, batch,
279 			    rxq->completed, rxq->added);
280 			rxq->added += batch;
281 			batch = 0;
282 		}
283 	}
284 
285 	if (ntodo != 0)
286 		sfxge_rx_schedule_refill(rxq, retrying);
287 
288 	if (batch != 0) {
289 		efx_rx_qpost(rxq->common, addr, mblksize, batch,
290 		    rxq->completed, rxq->added);
291 		rxq->added += batch;
292 	}
293 
294 	/* Make the descriptors visible to the hardware */
295 	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
296 			BUS_DMASYNC_PREWRITE);
297 
298 	efx_rx_qpush(rxq->common, rxq->added);
299 }
300 
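/* Top the queue up to its limit, marking the attempt as a retry so that
 * repeated allocation failures back off.
 */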
301 void
302 sfxge_rx_qrefill(struct sfxge_rxq *rxq)
303 {
304 
305 	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
306 		return;
307 
308 	/* Make sure the queue is full */
309 	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
310 }
311 
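/* Hand a packet to the network stack.  csum_data is set to 0xffff, the value
 * the stack expects when CSUM_DATA_VALID | CSUM_PSEUDO_HDR are reported.
 */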
312 static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
313 {
314 	struct ifnet *ifp = sc->ifnet;
315 
316 	m->m_pkthdr.rcvif = ifp;
317 	m->m_pkthdr.csum_data = 0xffff;
318 	ifp->if_input(ifp, m);
319 }
320 
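/* Translate the hardware receive flags into mbuf checksum flags and an RSS
 * hash type, strip the RX prefix and deliver the packet.
 */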
321 static void
322 sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
323 {
324 	struct mbuf *m = rx_desc->mbuf;
325 	int csum_flags;
326 
327 	/* Convert checksum flags */
328 	csum_flags = (rx_desc->flags & EFX_CKSUM_IPV4) ?
329 		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
330 	if (rx_desc->flags & EFX_CKSUM_TCPUDP)
331 		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
332 
333 	if (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
334 		m->m_pkthdr.flowid = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
335 						       mtod(m, uint8_t *));
336 		/* The hash covers a 4-tuple for TCP only */
337 		M_HASHTYPE_SET(m,
338 		    (rx_desc->flags & EFX_PKT_IPV4) ?
339 			((rx_desc->flags & EFX_PKT_TCP) ?
340 			    M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
341 			((rx_desc->flags & EFX_PKT_TCP) ?
342 			    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
343 	}
344 	m->m_data += sc->rx_prefix_size;
345 	m->m_len = rx_desc->size - sc->rx_prefix_size;
346 	m->m_pkthdr.len = m->m_len;
347 	m->m_pkthdr.csum_flags = csum_flags;
348 	__sfxge_rx_deliver(sc, rx_desc->mbuf);
349 
350 	rx_desc->flags = EFX_DISCARD;
351 	rx_desc->mbuf = NULL;
352 }
353 
354 #ifdef SFXGE_LRO
355 
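/* Deliver the packet coalesced for this connection: restore the IP length
 * field to network byte order, recompute the IPv4 header checksum and copy
 * the window and ACK (and, where the offsets match, the options) from the
 * most recently merged segment.
 */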
356 static void
357 sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
358 {
359 	struct sfxge_softc *sc = st->sc;
360 	struct mbuf *m = c->mbuf;
361 	struct tcphdr *c_th;
362 	int csum_flags;
363 
364 	KASSERT(m, ("no mbuf to deliver"));
365 
366 	++st->n_bursts;
367 
368 	/* Finish off packet munging and recalculate IP header checksum. */
369 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
370 		struct ip *iph = c->nh;
371 		iph->ip_len = htons(iph->ip_len);
372 		iph->ip_sum = 0;
373 		iph->ip_sum = in_cksum_hdr(iph);
374 		c_th = (struct tcphdr *)(iph + 1);
375 		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
376 			      CSUM_IP_CHECKED | CSUM_IP_VALID);
377 	} else {
378 		struct ip6_hdr *iph = c->nh;
379 		iph->ip6_plen = htons(iph->ip6_plen);
380 		c_th = (struct tcphdr *)(iph + 1);
381 		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
382 	}
383 
384 	c_th->th_win = c->th_last->th_win;
385 	c_th->th_ack = c->th_last->th_ack;
386 	if (c_th->th_off == c->th_last->th_off) {
387 		/* Copy TCP options (take care to avoid going negative). */
388 		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
389 		memcpy(c_th + 1, c->th_last + 1, optlen);
390 	}
391 
392 	m->m_pkthdr.flowid = c->conn_hash;
393 	M_HASHTYPE_SET(m,
394 	    SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
395 		M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);
396 
397 	m->m_pkthdr.csum_flags = csum_flags;
398 	__sfxge_rx_deliver(sc, m);
399 
400 	c->mbuf = NULL;
401 	c->delivered = 1;
402 }
403 
404 /* Drop the given connection, and add it to the free list. */
405 static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
406 {
407 	unsigned bucket;
408 
409 	KASSERT(!c->mbuf, ("found orphaned mbuf"));
410 
411 	if (c->next_buf.mbuf != NULL) {
412 		sfxge_rx_deliver(rxq->sc, &c->next_buf);
413 		LIST_REMOVE(c, active_link);
414 	}
415 
416 	bucket = c->conn_hash & rxq->lro.conns_mask;
417 	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
418 	--rxq->lro.conns_n[bucket];
419 	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
420 	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
421 }
422 
423 /* Stop tracking connections that have gone idle in order to keep hash
424  * chains short.
425  */
426 static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
427 {
428 	struct sfxge_lro_conn *c;
429 	unsigned i;
430 
431 	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
432 		("found active connections"));
433 
434 	rxq->lro.last_purge_ticks = now;
435 	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
436 		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
437 			continue;
438 
439 		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
440 		if (now - c->last_pkt_ticks > lro_idle_ticks) {
441 			++rxq->lro.n_drop_idle;
442 			sfxge_lro_drop(rxq, c);
443 		}
444 	}
445 }
446 
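/* Append the payload of "mbuf" to the packet being coalesced for this
 * connection, update the held IP length, and deliver early if a further
 * segment could overflow the 16-bit IP length field.
 */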
447 static void
448 sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
449 		struct mbuf *mbuf, struct tcphdr *th)
450 {
451 	struct tcphdr *c_th;
452 
453 	/* Tack the new mbuf onto the chain. */
454 	KASSERT(!mbuf->m_next, ("mbuf already chained"));
455 	c->mbuf_tail->m_next = mbuf;
456 	c->mbuf_tail = mbuf;
457 
458 	/* Increase length appropriately */
459 	c->mbuf->m_pkthdr.len += mbuf->m_len;
460 
461 	/* Update the connection state flags */
462 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
463 		struct ip *iph = c->nh;
464 		iph->ip_len += mbuf->m_len;
465 		c_th = (struct tcphdr *)(iph + 1);
466 	} else {
467 		struct ip6_hdr *iph = c->nh;
468 		iph->ip6_plen += mbuf->m_len;
469 		c_th = (struct tcphdr *)(iph + 1);
470 	}
471 	c_th->th_flags |= (th->th_flags & TH_PUSH);
472 	c->th_last = th;
473 	++st->n_merges;
474 
475 	/* Pass packet up now if another segment could overflow the IP
476 	 * length.
477 	 */
478 	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
479 		sfxge_lro_deliver(st, c);
480 }
481 
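/* Begin coalescing a new packet with "mbuf" at the head of the chain; the IP
 * length field is kept in host byte order while the packet is held.
 */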
482 static void
483 sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
484 		struct mbuf *mbuf, void *nh, struct tcphdr *th)
485 {
486 	/* Start the chain */
487 	c->mbuf = mbuf;
488 	c->mbuf_tail = c->mbuf;
489 	c->nh = nh;
490 	c->th_last = th;
491 
492 	mbuf->m_pkthdr.len = mbuf->m_len;
493 
494 	/* Mangle header fields for later processing */
495 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
496 		struct ip *iph = nh;
497 		iph->ip_len = ntohs(iph->ip_len);
498 	} else {
499 		struct ip6_hdr *iph = nh;
500 		iph->ip6_plen = ntohs(iph->ip6_plen);
501 	}
502 }
503 
504 /* Try to merge or otherwise hold or deliver (as appropriate) the
505  * packet buffered for this connection (c->next_buf).  Return a flag
506  * indicating whether the connection is still active for LRO purposes.
507  */
508 static int
509 sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
510 {
511 	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
512 	char *eh = c->next_eh;
513 	int data_length, hdr_length, dont_merge;
514 	unsigned th_seq, pkt_length;
515 	struct tcphdr *th;
516 	unsigned now;
517 
518 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
519 		struct ip *iph = c->next_nh;
520 		th = (struct tcphdr *)(iph + 1);
521 		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
522 	} else {
523 		struct ip6_hdr *iph = c->next_nh;
524 		th = (struct tcphdr *)(iph + 1);
525 		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
526 	}
527 
528 	hdr_length = (char *) th + th->th_off * 4 - eh;
529 	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
530 		       hdr_length);
531 	th_seq = ntohl(th->th_seq);
532 	dont_merge = ((data_length <= 0)
533 		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));
534 
535 	/* Check for options other than aligned timestamp. */
536 	if (th->th_off != 5) {
537 		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
538 		if (th->th_off == 8 &&
539 		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
540 					(TCPOPT_NOP << 16) |
541 					(TCPOPT_TIMESTAMP << 8) |
542 					TCPOLEN_TIMESTAMP)) {
543 			/* timestamp option -- okay */
544 		} else {
545 			dont_merge = 1;
546 		}
547 	}
548 
549 	if (__predict_false(th_seq != c->next_seq)) {
550 		/* Out-of-order, so start counting again. */
551 		if (c->mbuf != NULL)
552 			sfxge_lro_deliver(&rxq->lro, c);
553 		c->n_in_order_pkts -= lro_loss_packets;
554 		c->next_seq = th_seq + data_length;
555 		++rxq->lro.n_misorder;
556 		goto deliver_buf_out;
557 	}
558 	c->next_seq = th_seq + data_length;
559 
560 	now = ticks;
561 	if (now - c->last_pkt_ticks > lro_idle_ticks) {
562 		++rxq->lro.n_drop_idle;
563 		if (c->mbuf != NULL)
564 			sfxge_lro_deliver(&rxq->lro, c);
565 		sfxge_lro_drop(rxq, c);
566 		return (0);
567 	}
568 	c->last_pkt_ticks = ticks;
569 
570 	if (c->n_in_order_pkts < lro_slow_start_packets) {
571 		/* May be in slow-start, so don't merge. */
572 		++rxq->lro.n_slow_start;
573 		++c->n_in_order_pkts;
574 		goto deliver_buf_out;
575 	}
576 
577 	if (__predict_false(dont_merge)) {
578 		if (c->mbuf != NULL)
579 			sfxge_lro_deliver(&rxq->lro, c);
580 		if (th->th_flags & (TH_FIN | TH_RST)) {
581 			++rxq->lro.n_drop_closed;
582 			sfxge_lro_drop(rxq, c);
583 			return (0);
584 		}
585 		goto deliver_buf_out;
586 	}
587 
588 	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;
589 
590 	if (__predict_true(c->mbuf != NULL)) {
591 		/* Remove headers and any padding */
592 		rx_buf->mbuf->m_data += hdr_length;
593 		rx_buf->mbuf->m_len = data_length;
594 
595 		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
596 	} else {
597 		/* Remove any padding */
598 		rx_buf->mbuf->m_len = pkt_length;
599 
600 		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
601 	}
602 
603 	rx_buf->mbuf = NULL;
604 	return (1);
605 
606  deliver_buf_out:
607 	sfxge_rx_deliver(rxq->sc, rx_buf);
608 	return (1);
609 }
610 
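/* Start tracking a new connection, reusing an entry from the free list if
 * one is available and respecting the per-bucket chain length limit.
 */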
611 static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
612 			       uint16_t l2_id, void *nh, struct tcphdr *th)
613 {
614 	unsigned bucket = conn_hash & st->conns_mask;
615 	struct sfxge_lro_conn *c;
616 
617 	if (st->conns_n[bucket] >= lro_chain_max) {
618 		++st->n_too_many;
619 		return;
620 	}
621 
622 	if (!TAILQ_EMPTY(&st->free_conns)) {
623 		c = TAILQ_FIRST(&st->free_conns);
624 		TAILQ_REMOVE(&st->free_conns, c, link);
625 	} else {
626 		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
627 		if (c == NULL)
628 			return;
629 		c->mbuf = NULL;
630 		c->next_buf.mbuf = NULL;
631 	}
632 
633 	/* Create the connection tracking data */
634 	++st->conns_n[bucket];
635 	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
636 	c->l2_id = l2_id;
637 	c->conn_hash = conn_hash;
638 	c->source = th->th_sport;
639 	c->dest = th->th_dport;
640 	c->n_in_order_pkts = 0;
641 	c->last_pkt_ticks = *(volatile int *)&ticks;
642 	c->delivered = 0;
643 	++st->n_new_stream;
644 	/* NB. We don't initialise c->next_seq, and it doesn't matter what
645 	 * value it has.  Most likely the next packet received for this
646 	 * connection will not match -- no harm done.
647 	 */
648 }
649 
650 /* Process mbuf and decide whether to dispatch it to the stack now or
651  * later.
652  */
653 static void
654 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
655 {
656 	struct sfxge_softc *sc = rxq->sc;
657 	struct mbuf *m = rx_buf->mbuf;
658 	struct ether_header *eh;
659 	struct sfxge_lro_conn *c;
660 	uint16_t l2_id;
661 	uint16_t l3_proto;
662 	void *nh;
663 	struct tcphdr *th;
664 	uint32_t conn_hash;
665 	unsigned bucket;
666 
667 	/* Get the hardware hash */
668 	conn_hash = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
669 				      mtod(m, uint8_t *));
670 
671 	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
672 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
673 		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
674 		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
675 			SFXGE_LRO_L2_ID_VLAN;
676 		l3_proto = veh->evl_proto;
677 		nh = veh + 1;
678 	} else {
679 		l2_id = 0;
680 		l3_proto = eh->ether_type;
681 		nh = eh + 1;
682 	}
683 
684 	/* Check whether this is a suitable packet (unfragmented
685 	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
686 	 * length, and compute a hash if necessary.  If not, return.
687 	 */
688 	if (l3_proto == htons(ETHERTYPE_IP)) {
689 		struct ip *iph = nh;
690 		if ((iph->ip_p - IPPROTO_TCP) |
691 		    (iph->ip_hl - (sizeof(*iph) >> 2u)) |
692 		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
693 			goto deliver_now;
694 		th = (struct tcphdr *)(iph + 1);
695 	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
696 		struct ip6_hdr *iph = nh;
697 		if (iph->ip6_nxt != IPPROTO_TCP)
698 			goto deliver_now;
699 		l2_id |= SFXGE_LRO_L2_ID_IPV6;
700 		th = (struct tcphdr *)(iph + 1);
701 	} else {
702 		goto deliver_now;
703 	}
704 
705 	bucket = conn_hash & rxq->lro.conns_mask;
706 
707 	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
708 		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
709 			continue;
710 		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
711 			continue;
712 		if (c->mbuf != NULL) {
713 			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
714 				struct ip *c_iph, *iph = nh;
715 				c_iph = c->nh;
716 				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
717 				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
718 					continue;
719 			} else {
720 				struct ip6_hdr *c_iph, *iph = nh;
721 				c_iph = c->nh;
722 				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
723 				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
724 					continue;
725 			}
726 		}
727 
728 		/* Re-insert at head of list to reduce lookup time. */
729 		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
730 		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);
731 
732 		if (c->next_buf.mbuf != NULL) {
733 			if (!sfxge_lro_try_merge(rxq, c))
734 				goto deliver_now;
735 		} else {
736 			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
737 			    active_link);
738 		}
739 		c->next_buf = *rx_buf;
740 		c->next_eh = eh;
741 		c->next_nh = nh;
742 
743 		rx_buf->mbuf = NULL;
744 		rx_buf->flags = EFX_DISCARD;
745 		return;
746 	}
747 
748 	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
749  deliver_now:
750 	sfxge_rx_deliver(sc, rx_buf);
751 }
752 
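/* At the end of an event queue poll, merge or deliver the packet buffered
 * for each active connection and periodically purge idle connections.
 */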
753 static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
754 {
755 	struct sfxge_lro_state *st = &rxq->lro;
756 	struct sfxge_lro_conn *c;
757 	unsigned t;
758 
759 	while (!LIST_EMPTY(&st->active_conns)) {
760 		c = LIST_FIRST(&st->active_conns);
761 		if (!c->delivered && c->mbuf != NULL)
762 			sfxge_lro_deliver(st, c);
763 		if (sfxge_lro_try_merge(rxq, c)) {
764 			if (c->mbuf != NULL)
765 				sfxge_lro_deliver(st, c);
766 			LIST_REMOVE(c, active_link);
767 		}
768 		c->delivered = 0;
769 	}
770 
771 	t = *(volatile int *)&ticks;
772 	if (__predict_false(t != st->last_purge_ticks))
773 		sfxge_lro_purge_idle(rxq, t);
774 }
775 
776 #else	/* !SFXGE_LRO */
777 
778 static void
779 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
780 {
781 }
782 
783 static void
784 sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
785 {
786 }
787 
788 #endif	/* SFXGE_LRO */
789 
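/* Process completed receive descriptors: discard packets flagged by the
 * hardware and loopback packets, pass the remainder to LRO or directly to
 * the stack, and top up the queue if it has drained below the refill
 * threshold.
 */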
790 void
791 sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
792 {
793 	struct sfxge_softc *sc = rxq->sc;
794 	int lro_enabled = sc->ifnet->if_capenable & IFCAP_LRO;
795 	unsigned int index;
796 	struct sfxge_evq *evq;
797 	unsigned int completed;
798 	unsigned int level;
799 	struct mbuf *m;
800 	struct sfxge_rx_sw_desc *prev = NULL;
801 
802 	index = rxq->index;
803 	evq = sc->evq[index];
804 
805 	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
806 
807 	completed = rxq->completed;
808 	while (completed != rxq->pending) {
809 		unsigned int id;
810 		struct sfxge_rx_sw_desc *rx_desc;
811 
812 		id = completed++ & rxq->ptr_mask;
813 		rx_desc = &rxq->queue[id];
814 		m = rx_desc->mbuf;
815 
816 		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
817 			goto discard;
818 
819 		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
820 			goto discard;
821 
822 		prefetch_read_many(mtod(m, caddr_t));
823 
824 		/* Check for loopback packets */
825 		if (!(rx_desc->flags & EFX_PKT_IPV4) &&
826 		    !(rx_desc->flags & EFX_PKT_IPV6)) {
827 			struct ether_header *etherhp;
828 
829 			/*LINTED*/
830 			etherhp = mtod(m, struct ether_header *);
831 
832 			if (etherhp->ether_type ==
833 			    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
834 				EFSYS_PROBE(loopback);
835 
836 				rxq->loopback++;
837 				goto discard;
838 			}
839 		}
840 
841 		/* Pass packet up the stack or into LRO (pipelined) */
842 		if (prev != NULL) {
843 			if (lro_enabled)
844 				sfxge_lro(rxq, prev);
845 			else
846 				sfxge_rx_deliver(sc, prev);
847 		}
848 		prev = rx_desc;
849 		continue;
850 
851 discard:
852 		/* Return the packet to the pool */
853 		m_free(m);
854 		rx_desc->mbuf = NULL;
855 	}
856 	rxq->completed = completed;
857 
858 	level = rxq->added - rxq->completed;
859 
860 	/* Pass last packet up the stack or into LRO */
861 	if (prev != NULL) {
862 		if (lro_enabled)
863 			sfxge_lro(rxq, prev);
864 		else
865 			sfxge_rx_deliver(sc, prev);
866 	}
867 
868 	/*
869 	 * If there are any pending flows and this is the end of the
870 	 * poll then they must be completed.
871 	 */
872 	if (eop)
873 		sfxge_lro_end_of_burst(rxq);
874 
875 	/* Top up the queue if necessary */
876 	if (level < rxq->refill_threshold)
877 		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
878 }
879 
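/* Stop a receive queue: flush the hardware queue (waiting up to 2 seconds
 * and retrying if the flush fails), complete any outstanding descriptors and
 * destroy the common code queue.
 */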
880 static void
881 sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
882 {
883 	struct sfxge_rxq *rxq;
884 	struct sfxge_evq *evq;
885 	unsigned int count;
886 
887 	rxq = sc->rxq[index];
888 	evq = sc->evq[index];
889 
890 	SFXGE_EVQ_LOCK(evq);
891 
892 	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
893 	    ("rxq not started"));
894 
895 	rxq->init_state = SFXGE_RXQ_INITIALIZED;
896 
897 	callout_stop(&rxq->refill_callout);
898 
899 again:
900 	rxq->flush_state = SFXGE_FLUSH_PENDING;
901 
902 	/* Flush the receive queue */
903 	efx_rx_qflush(rxq->common);
904 
905 	SFXGE_EVQ_UNLOCK(evq);
906 
907 	count = 0;
908 	do {
909 		/* Spin for 100 ms */
910 		DELAY(100000);
911 
912 		if (rxq->flush_state != SFXGE_FLUSH_PENDING)
913 			break;
914 
915 	} while (++count < 20);
916 
917 	SFXGE_EVQ_LOCK(evq);
918 
919 	if (rxq->flush_state == SFXGE_FLUSH_FAILED)
920 		goto again;
921 
922 	rxq->flush_state = SFXGE_FLUSH_DONE;
923 
924 	rxq->pending = rxq->added;
925 	sfxge_rx_qcomplete(rxq, B_TRUE);
926 
927 	KASSERT(rxq->completed == rxq->pending,
928 	    ("rxq->completed != rxq->pending"));
929 
930 	rxq->added = 0;
931 	rxq->pending = 0;
932 	rxq->completed = 0;
933 	rxq->loopback = 0;
934 
935 	/* Destroy the common code receive queue. */
936 	efx_rx_qdestroy(rxq->common);
937 
938 	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
939 	    EFX_RXQ_NBUFS(sc->rxq_entries));
940 
941 	SFXGE_EVQ_UNLOCK(evq);
942 }
943 
944 static int
945 sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
946 {
947 	struct sfxge_rxq *rxq;
948 	efsys_mem_t *esmp;
949 	struct sfxge_evq *evq;
950 	int rc;
951 
952 	rxq = sc->rxq[index];
953 	esmp = &rxq->mem;
954 	evq = sc->evq[index];
955 
956 	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
957 	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
958 	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
959 	    ("evq->init_state != SFXGE_EVQ_STARTED"));
960 
961 	/* Program the buffer table. */
962 	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
963 	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
964 		return (rc);
965 
966 	/* Create the common code receive queue. */
967 	if ((rc = efx_rx_qcreate(sc->enp, index, index, EFX_RXQ_TYPE_DEFAULT,
968 	    esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
969 	    &rxq->common)) != 0)
970 		goto fail;
971 
972 	SFXGE_EVQ_LOCK(evq);
973 
974 	/* Enable the receive queue. */
975 	efx_rx_qenable(rxq->common);
976 
977 	rxq->init_state = SFXGE_RXQ_STARTED;
978 
979 	/* Try to fill the queue from the pool. */
980 	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);
981 
982 	SFXGE_EVQ_UNLOCK(evq);
983 
984 	return (0);
985 
986 fail:
987 	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
988 	    EFX_RXQ_NBUFS(sc->rxq_entries));
989 	return (rc);
990 }
991 
992 void
993 sfxge_rx_stop(struct sfxge_softc *sc)
994 {
995 	int index;
996 
997 	/* Stop the receive queue(s) */
998 	index = sc->rxq_count;
999 	while (--index >= 0)
1000 		sfxge_rx_qstop(sc, index);
1001 
1002 	sc->rx_prefix_size = 0;
1003 	sc->rx_buffer_size = 0;
1004 
1005 	efx_rx_fini(sc->enp);
1006 }
1007 
1008 int
1009 sfxge_rx_start(struct sfxge_softc *sc)
1010 {
1011 	struct sfxge_intr *intr;
1012 	int index;
1013 	int rc;
1014 
1015 	intr = &sc->intr;
1016 
1017 	/* Initialize the common code receive module. */
1018 	if ((rc = efx_rx_init(sc->enp)) != 0)
1019 		return (rc);
1020 
1021 	/* Calculate the receive packet buffer size. */
1022 	sc->rx_prefix_size = EFX_RX_PREFIX_SIZE;
1023 	sc->rx_buffer_size = (EFX_MAC_PDU(sc->ifnet->if_mtu) +
1024 			      sc->rx_prefix_size);
1025 
1026 	/* Select zone for packet buffers */
1027 	if (sc->rx_buffer_size <= MCLBYTES)
1028 		sc->rx_buffer_zone = zone_clust;
1029 	else if (sc->rx_buffer_size <= MJUMPAGESIZE)
1030 		sc->rx_buffer_zone = zone_jumbop;
1031 	else if (sc->rx_buffer_size <= MJUM9BYTES)
1032 		sc->rx_buffer_zone = zone_jumbo9;
1033 	else
1034 		sc->rx_buffer_zone = zone_jumbo16;
1035 
1036 	/*
1037 	 * Set up the scale table.  Enable all hash types and hash insertion.
1038 	 */
1039 	for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
1040 		sc->rx_indir_table[index] = index % sc->rxq_count;
1041 	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
1042 				       SFXGE_RX_SCALE_MAX)) != 0)
1043 		goto fail;
1044 	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
1045 	    (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
1046 	    (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);
1047 
1048 	if ((rc = efx_rx_scale_toeplitz_ipv4_key_set(sc->enp, toep_key,
1049 	    sizeof(toep_key))) != 0)
1050 		goto fail;
1051 
1052 	/* Start the receive queue(s). */
1053 	for (index = 0; index < sc->rxq_count; index++) {
1054 		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
1055 			goto fail2;
1056 	}
1057 
1058 	return (0);
1059 
1060 fail2:
1061 	while (--index >= 0)
1062 		sfxge_rx_qstop(sc, index);
1063 
1064 fail:
1065 	efx_rx_fini(sc->enp);
1066 
1067 	return (rc);
1068 }
1069 
1070 #ifdef SFXGE_LRO
1071 
1072 static void sfxge_lro_init(struct sfxge_rxq *rxq)
1073 {
1074 	struct sfxge_lro_state *st = &rxq->lro;
1075 	unsigned i;
1076 
1077 	st->conns_mask = lro_table_size - 1;
1078 	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
1079 		("lro_table_size must be a power of 2"));
1080 	st->sc = rxq->sc;
1081 	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
1082 			   M_SFXGE, M_WAITOK);
1083 	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
1084 			     M_SFXGE, M_WAITOK);
1085 	for (i = 0; i <= st->conns_mask; ++i) {
1086 		TAILQ_INIT(&st->conns[i]);
1087 		st->conns_n[i] = 0;
1088 	}
1089 	LIST_INIT(&st->active_conns);
1090 	TAILQ_INIT(&st->free_conns);
1091 }
1092 
1093 static void sfxge_lro_fini(struct sfxge_rxq *rxq)
1094 {
1095 	struct sfxge_lro_state *st = &rxq->lro;
1096 	struct sfxge_lro_conn *c;
1097 	unsigned i;
1098 
1099 	/* Return cleanly if sfxge_lro_init() has not been called. */
1100 	if (st->conns == NULL)
1101 		return;
1102 
1103 	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));
1104 
1105 	for (i = 0; i <= st->conns_mask; ++i) {
1106 		while (!TAILQ_EMPTY(&st->conns[i])) {
1107 			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
1108 			sfxge_lro_drop(rxq, c);
1109 		}
1110 	}
1111 
1112 	while (!TAILQ_EMPTY(&st->free_conns)) {
1113 		c = TAILQ_FIRST(&st->free_conns);
1114 		TAILQ_REMOVE(&st->free_conns, c, link);
1115 		KASSERT(!c->mbuf, ("found orphaned mbuf"));
1116 		free(c, M_SFXGE);
1117 	}
1118 
1119 	free(st->conns_n, M_SFXGE);
1120 	free(st->conns, M_SFXGE);
1121 	st->conns = NULL;
1122 }
1123 
1124 #else
1125 
1126 static void
1127 sfxge_lro_init(struct sfxge_rxq *rxq)
1128 {
1129 }
1130 
1131 static void
1132 sfxge_lro_fini(struct sfxge_rxq *rxq)
1133 {
1134 }
1135 
1136 #endif	/* SFXGE_LRO */
1137 
1138 static void
1139 sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
1140 {
1141 	struct sfxge_rxq *rxq;
1142 
1143 	rxq = sc->rxq[index];
1144 
1145 	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1146 	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1147 
1148 	/* Free the context array and the flow table. */
1149 	free(rxq->queue, M_SFXGE);
1150 	sfxge_lro_fini(rxq);
1151 
1152 	/* Release DMA memory. */
1153 	sfxge_dma_free(&rxq->mem);
1154 
1155 	sc->rxq[index] = NULL;
1156 
1157 	free(rxq, M_SFXGE);
1158 }
1159 
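/* Allocate and initialise the software state for one receive queue: DMA
 * memory for the descriptor ring, buffer table entries, the software
 * descriptor array and the LRO state.
 */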
1160 static int
1161 sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
1162 {
1163 	struct sfxge_rxq *rxq;
1164 	struct sfxge_evq *evq;
1165 	efsys_mem_t *esmp;
1166 	int rc;
1167 
1168 	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));
1169 
1170 	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
1171 	rxq->sc = sc;
1172 	rxq->index = index;
1173 	rxq->entries = sc->rxq_entries;
1174 	rxq->ptr_mask = rxq->entries - 1;
1175 	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);
1176 
1177 	sc->rxq[index] = rxq;
1178 	esmp = &rxq->mem;
1179 
1180 	evq = sc->evq[index];
1181 
1182 	/* Allocate and zero DMA space. */
1183 	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
1184 		return (rc);
1185 
1186 	/* Allocate buffer table entries. */
1187 	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
1188 				 &rxq->buf_base_id);
1189 
1190 	/* Allocate the context array and the flow table. */
1191 	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
1192 	    M_SFXGE, M_WAITOK | M_ZERO);
1193 	sfxge_lro_init(rxq);
1194 
1195 	callout_init(&rxq->refill_callout, B_TRUE);
1196 
1197 	rxq->init_state = SFXGE_RXQ_INITIALIZED;
1198 
1199 	return (0);
1200 }
1201 
1202 static const struct {
1203 	const char *name;
1204 	size_t offset;
1205 } sfxge_rx_stats[] = {
1206 #define	SFXGE_RX_STAT(name, member) \
1207 	{ #name, offsetof(struct sfxge_rxq, member) }
1208 #ifdef SFXGE_LRO
1209 	SFXGE_RX_STAT(lro_merges, lro.n_merges),
1210 	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
1211 	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
1212 	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
1213 	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
1214 	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
1215 	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
1216 	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
1217 #endif
1218 };
1219 
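/* Sysctl handler: report the statistic identified by arg2, summed across all
 * receive queues.
 */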
1220 static int
1221 sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
1222 {
1223 	struct sfxge_softc *sc = arg1;
1224 	unsigned int id = arg2;
1225 	unsigned int sum, index;
1226 
1227 	/* Sum across all RX queues */
1228 	sum = 0;
1229 	for (index = 0; index < sc->rxq_count; index++)
1230 		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
1231 					 sfxge_rx_stats[id].offset);
1232 
1233 	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
1234 }
1235 
1236 static void
1237 sfxge_rx_stat_init(struct sfxge_softc *sc)
1238 {
1239 	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
1240 	struct sysctl_oid_list *stat_list;
1241 	unsigned int id;
1242 
1243 	stat_list = SYSCTL_CHILDREN(sc->stats_node);
1244 
1245 	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
1246 		SYSCTL_ADD_PROC(
1247 			ctx, stat_list,
1248 			OID_AUTO, sfxge_rx_stats[id].name,
1249 			CTLTYPE_UINT|CTLFLAG_RD,
1250 			sc, id, sfxge_rx_stat_handler, "IU",
1251 			"");
1252 	}
1253 }
1254 
1255 void
1256 sfxge_rx_fini(struct sfxge_softc *sc)
1257 {
1258 	int index;
1259 
1260 	index = sc->rxq_count;
1261 	while (--index >= 0)
1262 		sfxge_rx_qfini(sc, index);
1263 
1264 	sc->rxq_count = 0;
1265 }
1266 
1267 int
1268 sfxge_rx_init(struct sfxge_softc *sc)
1269 {
1270 	struct sfxge_intr *intr;
1271 	int index;
1272 	int rc;
1273 
1274 #ifdef SFXGE_LRO
1275 	if (!ISP2(lro_table_size)) {
1276 		log(LOG_ERR, "%s=%u must be a power of 2\n",
1277 		    SFXGE_LRO_PARAM(table_size), lro_table_size);
1278 		rc = EINVAL;
1279 		goto fail_lro_table_size;
1280 	}
1281 
1282 	if (lro_idle_ticks == 0)
1283 		lro_idle_ticks = hz / 10 + 1; /* 100 ms */
1284 #endif
1285 
1286 	intr = &sc->intr;
1287 
1288 	sc->rxq_count = intr->n_alloc;
1289 
1290 	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
1291 	    ("intr->state != SFXGE_INTR_INITIALIZED"));
1292 
1293 	/* Initialize the receive queue(s) - one per interrupt. */
1294 	for (index = 0; index < sc->rxq_count; index++) {
1295 		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
1296 			goto fail;
1297 	}
1298 
1299 	sfxge_rx_stat_init(sc);
1300 
1301 	return (0);
1302 
1303 fail:
1304 	/* Tear down the receive queue(s). */
1305 	while (--index >= 0)
1306 		sfxge_rx_qfini(sc, index);
1307 
1308 	sc->rxq_count = 0;
1309 
1310 #ifdef SFXGE_LRO
1311 fail_lro_table_size:
1312 #endif
1313 	return (rc);
1314 }
1315