xref: /freebsd/sys/dev/sfxge/sfxge_rx.c (revision 98e0ffaefb0f241cda3a72395d3be04192ae0d47)
1 /*-
2  * Copyright (c) 2010-2011 Solarflare Communications, Inc.
3  * All rights reserved.
4  *
5  * This software was developed in part by Philip Paeps under contract for
6  * Solarflare Communications, Inc.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include <sys/types.h>
34 #include <sys/mbuf.h>
35 #include <sys/smp.h>
36 #include <sys/socket.h>
37 #include <sys/sysctl.h>
38 #include <sys/limits.h>
39 #include <sys/syslog.h>
40 
41 #include <net/ethernet.h>
42 #include <net/if.h>
43 #include <net/if_vlan_var.h>
44 
45 #include <netinet/in.h>
46 #include <netinet/ip.h>
47 #include <netinet/ip6.h>
48 #include <netinet/tcp.h>
49 
50 #include <machine/in_cksum.h>
51 
52 #include "common/efx.h"
53 
55 #include "sfxge.h"
56 #include "sfxge_rx.h"
57 
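/*
 * Refill threshold: the queue is topped up again once its fill level
 * drops below 90% of the queue limit (see sfxge_rx_qcomplete()).
 */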
58 #define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)
59 
60 #ifdef SFXGE_LRO
61 
62 SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
63 	    "Large receive offload (LRO) parameters");
64 
65 #define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)
66 
67 /* Size of the LRO hash table.  Must be a power of 2.  A larger table
68  * means we can accelerate a larger number of streams.
69  */
70 static unsigned lro_table_size = 128;
71 TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
72 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
73 	    &lro_table_size, 0,
74 	    "Size of the LRO hash table (must be a power of 2)");
75 
76 /* Maximum length of a hash chain.  If chains get too long then the lookup
77  * time increases and may exceed the benefit of LRO.
78  */
79 static unsigned lro_chain_max = 20;
80 TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
81 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
82 	    &lro_chain_max, 0,
83 	    "The maximum length of a hash chain");
84 
85 /* Maximum time (in ticks) that a connection can be idle before its LRO
86  * state is discarded.
87  */
88 static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
89 TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
90 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
91 	    &lro_idle_ticks, 0,
92 	    "The maximum time (in ticks) that a connection can be idle "
93 	    "before its LRO state is discarded");
94 
95 /* Number of packets with payload that must arrive in-order before a
96  * connection is eligible for LRO.  The idea is we should avoid coalescing
97  * segments when the sender is in slow-start because reducing the ACK rate
98  * can damage performance.
99  */
100 static int lro_slow_start_packets = 2000;
101 TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
102 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
103 	    &lro_slow_start_packets, 0,
104 	    "Number of packets with payload that must arrive in-order before "
105 	    "a connection is eligible for LRO");
106 
107 /* Number of packets with payload that must arrive in-order following loss
108  * before a connection is eligible for LRO.  The idea is we should avoid
109  * coalescing segments when the sender is recovering from loss, because
110  * reducing the ACK rate can damage performance.
111  */
112 static int lro_loss_packets = 20;
113 TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
114 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
115 	    &lro_loss_packets, 0,
116 	    "Number of packets with payload that must arrive in-order "
117 	    "following loss before a connection is eligible for LRO");
118 
119 /* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
120 #define	SFXGE_LRO_L2_ID_VLAN 0x4000
121 #define	SFXGE_LRO_L2_ID_IPV6 0x8000
122 #define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
123 #define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
124 
125 /* Compare IPv6 addresses, avoiding conditional branches */
126 static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
127 				   const struct in6_addr *right)
128 {
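	/*
	 * The result is used only as a boolean: zero if and only if the two
	 * addresses are equal.
	 */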
129 #if LONG_BIT == 64
130 	const uint64_t *left64 = (const uint64_t *)left;
131 	const uint64_t *right64 = (const uint64_t *)right;
132 	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
133 #else
134 	return (left->s6_addr32[0] - right->s6_addr32[0]) |
135 	       (left->s6_addr32[1] - right->s6_addr32[1]) |
136 	       (left->s6_addr32[2] - right->s6_addr32[2]) |
137 	       (left->s6_addr32[3] - right->s6_addr32[3]);
138 #endif
139 }
140 
141 #endif	/* SFXGE_LRO */
142 
143 void
144 sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
145 {
146 
147 	rxq->flush_state = SFXGE_FLUSH_DONE;
148 }
149 
150 void
151 sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
152 {
153 
154 	rxq->flush_state = SFXGE_FLUSH_FAILED;
155 }
156 
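
/*
 * 40-byte Toeplitz hash key used for receive-side scaling (RSS); it is
 * programmed into the controller by efx_rx_scale_toeplitz_ipv4_key_set()
 * in sfxge_rx_start() below.
 */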
157 static uint8_t toep_key[] = {
158 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
159 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
160 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
161 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
162 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
163 };
164 
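/*
 * Callout handler for a deferred refill: post a software "refill" event
 * tagged with the queue index to the corresponding event queue, so that
 * the refill itself can be performed from the event processing path.
 */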
165 static void
166 sfxge_rx_post_refill(void *arg)
167 {
168 	struct sfxge_rxq *rxq = arg;
169 	struct sfxge_softc *sc;
170 	unsigned int index;
171 	struct sfxge_evq *evq;
172 	uint16_t magic;
173 
174 	sc = rxq->sc;
175 	index = rxq->index;
176 	evq = sc->evq[index];
177 
178 	magic = SFXGE_MAGIC_RX_QREFILL | index;
179 
180 	/* This is guaranteed due to the start/stop order of rx and ev */
181 	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
182 	    ("evq not started"));
183 	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
184 	    ("rxq not started"));
185 	efx_ev_qpost(evq->common, magic);
186 }
187 
188 static void
189 sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
190 {
191 	/* Initially retry after 100 ms, but back off in case of
192 	 * repeated failures as we probably have to wait for the
193 	 * administrator to raise the pool limit. */
194 	if (retrying)
195 		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
196 	else
197 		rxq->refill_delay = hz / 10;
198 
199 	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
200 			     sfxge_rx_post_refill, rxq);
201 }
202 
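/*
 * Allocate an mbuf packet header and attach a receive buffer taken from
 * the cluster zone selected in sfxge_rx_start(); returns NULL if either
 * allocation fails.
 */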
203 static struct mbuf *sfxge_rx_alloc_mbuf(struct sfxge_softc *sc)
204 {
205 	struct mb_args args;
206 	struct mbuf *m;
207 
208 	/* Allocate mbuf structure */
209 	args.flags = M_PKTHDR;
210 	args.type = MT_DATA;
211 	m = (struct mbuf *)uma_zalloc_arg(zone_mbuf, &args, M_NOWAIT);
212 
213 	/* Allocate (and attach) packet buffer */
214 	if (m != NULL && !uma_zalloc_arg(sc->rx_buffer_zone, m, M_NOWAIT)) {
215 		uma_zfree(zone_mbuf, m);
216 		m = NULL;
217 	}
218 
219 	return (m);
220 }
221 
222 #define	SFXGE_REFILL_BATCH  64
223 
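/*
 * Fill the receive queue with up to "target" additional buffers.  DMA
 * addresses are accumulated and posted in batches of SFXGE_REFILL_BATCH
 * via efx_rx_qpost(), and the doorbell is rung once at the end with
 * efx_rx_qpush().  If an mbuf allocation fails, a delayed refill is
 * scheduled for the remainder.
 */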
224 static void
225 sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
226 {
227 	struct sfxge_softc *sc;
228 	unsigned int index;
229 	struct sfxge_evq *evq;
230 	unsigned int batch;
231 	unsigned int rxfill;
232 	unsigned int mblksize;
233 	int ntodo;
234 	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
235 
236 	sc = rxq->sc;
237 	index = rxq->index;
238 	evq = sc->evq[index];
239 
240 	prefetch_read_many(sc->enp);
241 	prefetch_read_many(rxq->common);
242 
243 	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
244 
245 	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
246 		return;
247 
248 	rxfill = rxq->added - rxq->completed;
249 	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
250 	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
251 	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
252 	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
253 	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));
254 
255 	if (ntodo == 0)
256 		return;
257 
258 	batch = 0;
259 	mblksize = sc->rx_buffer_size;
260 	while (ntodo-- > 0) {
261 		unsigned int id;
262 		struct sfxge_rx_sw_desc *rx_desc;
263 		bus_dma_segment_t seg;
264 		struct mbuf *m;
265 
266 		id = (rxq->added + batch) & rxq->ptr_mask;
267 		rx_desc = &rxq->queue[id];
268 		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));
269 
270 		rx_desc->flags = EFX_DISCARD;
271 		m = rx_desc->mbuf = sfxge_rx_alloc_mbuf(sc);
272 		if (m == NULL)
273 			break;
274 		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
275 		addr[batch++] = seg.ds_addr;
276 
277 		if (batch == SFXGE_REFILL_BATCH) {
278 			efx_rx_qpost(rxq->common, addr, mblksize, batch,
279 			    rxq->completed, rxq->added);
280 			rxq->added += batch;
281 			batch = 0;
282 		}
283 	}
284 
285 	if (ntodo != 0)
286 		sfxge_rx_schedule_refill(rxq, retrying);
287 
288 	if (batch != 0) {
289 		efx_rx_qpost(rxq->common, addr, mblksize, batch,
290 		    rxq->completed, rxq->added);
291 		rxq->added += batch;
292 	}
293 
294 	/* Make the descriptors visible to the hardware */
295 	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
296 			BUS_DMASYNC_PREWRITE);
297 
298 	efx_rx_qpush(rxq->common, rxq->added);
299 }
300 
301 void
302 sfxge_rx_qrefill(struct sfxge_rxq *rxq)
303 {
304 
305 	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
306 		return;
307 
308 	/* Make sure the queue is full */
309 	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
310 }
311 
312 static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
313 {
314 	struct ifnet *ifp = sc->ifnet;
315 
316 	m->m_pkthdr.rcvif = ifp;
317 	m->m_pkthdr.csum_data = 0xffff;
318 	ifp->if_input(ifp, m);
319 }
320 
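/*
 * Convert the hardware checksum and RSS hash information carried with
 * the descriptor into mbuf metadata, strip the receive prefix, and hand
 * the packet to the network stack.
 */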
321 static void
322 sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
323 {
324 	struct mbuf *m = rx_desc->mbuf;
325 	int flags = rx_desc->flags;
326 	int csum_flags;
327 
328 	/* Convert checksum flags */
329 	csum_flags = (flags & EFX_CKSUM_IPV4) ?
330 		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
331 	if (flags & EFX_CKSUM_TCPUDP)
332 		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
333 
334 	if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
335 		m->m_pkthdr.flowid = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
336 						       mtod(m, uint8_t *));
337 		/* The hash covers a 4-tuple for TCP only */
338 		M_HASHTYPE_SET(m,
339 		    (flags & EFX_PKT_IPV4) ?
340 			((flags & EFX_PKT_TCP) ?
341 			    M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
342 			((flags & EFX_PKT_TCP) ?
343 			    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
344 	}
345 	m->m_data += sc->rx_prefix_size;
346 	m->m_len = rx_desc->size - sc->rx_prefix_size;
347 	m->m_pkthdr.len = m->m_len;
348 	m->m_pkthdr.csum_flags = csum_flags;
349 	__sfxge_rx_deliver(sc, rx_desc->mbuf);
350 
351 	rx_desc->flags = EFX_DISCARD;
352 	rx_desc->mbuf = NULL;
353 }
354 
355 #ifdef SFXGE_LRO
356 
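/*
 * Deliver a coalesced packet chain: restore the IP length field to
 * network byte order, recompute the IPv4 header checksum, copy the
 * window, ACK and (where the header lengths match) TCP options from the
 * most recent segment, and pass the chain up the stack.
 */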
357 static void
358 sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
359 {
360 	struct sfxge_softc *sc = st->sc;
361 	struct mbuf *m = c->mbuf;
362 	struct tcphdr *c_th;
363 	int csum_flags;
364 
365 	KASSERT(m, ("no mbuf to deliver"));
366 
367 	++st->n_bursts;
368 
369 	/* Finish off packet munging and recalculate IP header checksum. */
370 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
371 		struct ip *iph = c->nh;
372 		iph->ip_len = htons(iph->ip_len);
373 		iph->ip_sum = 0;
374 		iph->ip_sum = in_cksum_hdr(iph);
375 		c_th = (struct tcphdr *)(iph + 1);
376 		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
377 			      CSUM_IP_CHECKED | CSUM_IP_VALID);
378 	} else {
379 		struct ip6_hdr *iph = c->nh;
380 		iph->ip6_plen = htons(iph->ip6_plen);
381 		c_th = (struct tcphdr *)(iph + 1);
382 		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
383 	}
384 
385 	c_th->th_win = c->th_last->th_win;
386 	c_th->th_ack = c->th_last->th_ack;
387 	if (c_th->th_off == c->th_last->th_off) {
388 		/* Copy TCP options (take care to avoid going negative). */
389 		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
390 		memcpy(c_th + 1, c->th_last + 1, optlen);
391 	}
392 
393 	m->m_pkthdr.flowid = c->conn_hash;
394 	M_HASHTYPE_SET(m,
395 	    SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
396 		M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);
397 
398 	m->m_pkthdr.csum_flags = csum_flags;
399 	__sfxge_rx_deliver(sc, m);
400 
401 	c->mbuf = NULL;
402 	c->delivered = 1;
403 }
404 
405 /* Drop the given connection, and add it to the free list. */
406 static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
407 {
408 	unsigned bucket;
409 
410 	KASSERT(!c->mbuf, ("found orphaned mbuf"));
411 
412 	if (c->next_buf.mbuf != NULL) {
413 		sfxge_rx_deliver(rxq->sc, &c->next_buf);
414 		LIST_REMOVE(c, active_link);
415 	}
416 
417 	bucket = c->conn_hash & rxq->lro.conns_mask;
418 	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
419 	--rxq->lro.conns_n[bucket];
420 	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
421 	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
422 }
423 
424 /* Stop tracking connections that have gone idle in order to keep hash
425  * chains short.
426  */
427 static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
428 {
429 	struct sfxge_lro_conn *c;
430 	unsigned i;
431 
432 	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
433 		("found active connections"));
434 
435 	rxq->lro.last_purge_ticks = now;
436 	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
437 		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
438 			continue;
439 
440 		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
441 		if (now - c->last_pkt_ticks > lro_idle_ticks) {
442 			++rxq->lro.n_drop_idle;
443 			sfxge_lro_drop(rxq, c);
444 		}
445 	}
446 }
447 
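/*
 * Append a payload-only mbuf to the connection's coalesced chain, patch
 * up the IP length and TCP PUSH flag, and deliver the chain early if a
 * further segment could overflow the maximum IP datagram length.
 */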
448 static void
449 sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
450 		struct mbuf *mbuf, struct tcphdr *th)
451 {
452 	struct tcphdr *c_th;
453 
454 	/* Tack the new mbuf onto the chain. */
455 	KASSERT(!mbuf->m_next, ("mbuf already chained"));
456 	c->mbuf_tail->m_next = mbuf;
457 	c->mbuf_tail = mbuf;
458 
459 	/* Increase length appropriately */
460 	c->mbuf->m_pkthdr.len += mbuf->m_len;
461 
462 	/* Update the connection state flags */
463 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
464 		struct ip *iph = c->nh;
465 		iph->ip_len += mbuf->m_len;
466 		c_th = (struct tcphdr *)(iph + 1);
467 	} else {
468 		struct ip6_hdr *iph = c->nh;
469 		iph->ip6_plen += mbuf->m_len;
470 		c_th = (struct tcphdr *)(iph + 1);
471 	}
472 	c_th->th_flags |= (th->th_flags & TH_PUSH);
473 	c->th_last = th;
474 	++st->n_merges;
475 
476 	/* Pass packet up now if another segment could overflow the IP
477 	 * length.
478 	 */
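	/* (65536 is 2^16, just above the maximum IP datagram length; 9200 is
	 * presumably an upper bound on the payload that one further
	 * jumbo-frame segment could add.)
	 */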
479 	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
480 		sfxge_lro_deliver(st, c);
481 }
482 
483 static void
484 sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
485 		struct mbuf *mbuf, void *nh, struct tcphdr *th)
486 {
487 	/* Start the chain */
488 	c->mbuf = mbuf;
489 	c->mbuf_tail = c->mbuf;
490 	c->nh = nh;
491 	c->th_last = th;
492 
493 	mbuf->m_pkthdr.len = mbuf->m_len;
494 
495 	/* Mangle header fields for later processing */
496 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
497 		struct ip *iph = nh;
498 		iph->ip_len = ntohs(iph->ip_len);
499 	} else {
500 		struct ip6_hdr *iph = nh;
501 		iph->ip6_plen = ntohs(iph->ip6_plen);
502 	}
503 }
504 
505 /* Try to merge or otherwise hold or deliver (as appropriate) the
506  * packet buffered for this connection (c->next_buf).  Return a flag
507  * indicating whether the connection is still active for LRO purposes.
508  */
509 static int
510 sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
511 {
512 	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
513 	char *eh = c->next_eh;
514 	int data_length, hdr_length, dont_merge;
515 	unsigned th_seq, pkt_length;
516 	struct tcphdr *th;
517 	unsigned now;
518 
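	/* ip_len includes the IPv4 header whereas ip6_plen excludes the IPv6
	 * header, hence the different base pointers used below.
	 */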
519 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
520 		struct ip *iph = c->next_nh;
521 		th = (struct tcphdr *)(iph + 1);
522 		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
523 	} else {
524 		struct ip6_hdr *iph = c->next_nh;
525 		th = (struct tcphdr *)(iph + 1);
526 		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
527 	}
528 
529 	hdr_length = (char *) th + th->th_off * 4 - eh;
530 	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
531 		       hdr_length);
532 	th_seq = ntohl(th->th_seq);
533 	dont_merge = ((data_length <= 0)
534 		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));
535 
536 	/* Check for options other than aligned timestamp. */
537 	if (th->th_off != 5) {
538 		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
539 		if (th->th_off == 8 &&
540 		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
541 					(TCPOPT_NOP << 16) |
542 					(TCPOPT_TIMESTAMP << 8) |
543 					TCPOLEN_TIMESTAMP)) {
544 			/* timestamp option -- okay */
545 		} else {
546 			dont_merge = 1;
547 		}
548 	}
549 
550 	if (__predict_false(th_seq != c->next_seq)) {
551 		/* Out-of-order, so start counting again. */
552 		if (c->mbuf != NULL)
553 			sfxge_lro_deliver(&rxq->lro, c);
554 		c->n_in_order_pkts -= lro_loss_packets;
555 		c->next_seq = th_seq + data_length;
556 		++rxq->lro.n_misorder;
557 		goto deliver_buf_out;
558 	}
559 	c->next_seq = th_seq + data_length;
560 
561 	now = ticks;
562 	if (now - c->last_pkt_ticks > lro_idle_ticks) {
563 		++rxq->lro.n_drop_idle;
564 		if (c->mbuf != NULL)
565 			sfxge_lro_deliver(&rxq->lro, c);
566 		sfxge_lro_drop(rxq, c);
567 		return (0);
568 	}
569 	c->last_pkt_ticks = ticks;
570 
571 	if (c->n_in_order_pkts < lro_slow_start_packets) {
572 		/* May be in slow-start, so don't merge. */
573 		++rxq->lro.n_slow_start;
574 		++c->n_in_order_pkts;
575 		goto deliver_buf_out;
576 	}
577 
578 	if (__predict_false(dont_merge)) {
579 		if (c->mbuf != NULL)
580 			sfxge_lro_deliver(&rxq->lro, c);
581 		if (th->th_flags & (TH_FIN | TH_RST)) {
582 			++rxq->lro.n_drop_closed;
583 			sfxge_lro_drop(rxq, c);
584 			return (0);
585 		}
586 		goto deliver_buf_out;
587 	}
588 
589 	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;
590 
591 	if (__predict_true(c->mbuf != NULL)) {
592 		/* Remove headers and any padding */
593 		rx_buf->mbuf->m_data += hdr_length;
594 		rx_buf->mbuf->m_len = data_length;
595 
596 		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
597 	} else {
598 		/* Remove any padding */
599 		rx_buf->mbuf->m_len = pkt_length;
600 
601 		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
602 	}
603 
604 	rx_buf->mbuf = NULL;
605 	return (1);
606 
607  deliver_buf_out:
608 	sfxge_rx_deliver(rxq->sc, rx_buf);
609 	return (1);
610 }
611 
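/*
 * Start tracking a new connection: take an entry from the free list (or
 * allocate one) and insert it at the head of its hash bucket, unless the
 * bucket's chain has already reached lro_chain_max.
 */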
612 static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
613 			       uint16_t l2_id, void *nh, struct tcphdr *th)
614 {
615 	unsigned bucket = conn_hash & st->conns_mask;
616 	struct sfxge_lro_conn *c;
617 
618 	if (st->conns_n[bucket] >= lro_chain_max) {
619 		++st->n_too_many;
620 		return;
621 	}
622 
623 	if (!TAILQ_EMPTY(&st->free_conns)) {
624 		c = TAILQ_FIRST(&st->free_conns);
625 		TAILQ_REMOVE(&st->free_conns, c, link);
626 	} else {
627 		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
628 		if (c == NULL)
629 			return;
630 		c->mbuf = NULL;
631 		c->next_buf.mbuf = NULL;
632 	}
633 
634 	/* Create the connection tracking data */
635 	++st->conns_n[bucket];
636 	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
637 	c->l2_id = l2_id;
638 	c->conn_hash = conn_hash;
639 	c->source = th->th_sport;
640 	c->dest = th->th_dport;
641 	c->n_in_order_pkts = 0;
642 	c->last_pkt_ticks = *(volatile int *)&ticks;
643 	c->delivered = 0;
644 	++st->n_new_stream;
645 	/* NB. We don't initialise c->next_seq, and it doesn't matter what
646 	 * value it has.  Most likely the next packet received for this
647 	 * connection will not match -- no harm done.
648 	 */
649 }
650 
651 /* Process mbuf and decide whether to dispatch it to the stack now or
652  * later.
653  */
654 static void
655 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
656 {
657 	struct sfxge_softc *sc = rxq->sc;
658 	struct mbuf *m = rx_buf->mbuf;
659 	struct ether_header *eh;
660 	struct sfxge_lro_conn *c;
661 	uint16_t l2_id;
662 	uint16_t l3_proto;
663 	void *nh;
664 	struct tcphdr *th;
665 	uint32_t conn_hash;
666 	unsigned bucket;
667 
668 	/* Get the hardware hash */
669 	conn_hash = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
670 				      mtod(m, uint8_t *));
671 
672 	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
673 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
674 		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
675 		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
676 			SFXGE_LRO_L2_ID_VLAN;
677 		l3_proto = veh->evl_proto;
678 		nh = veh + 1;
679 	} else {
680 		l2_id = 0;
681 		l3_proto = eh->ether_type;
682 		nh = eh + 1;
683 	}
684 
685 	/* Check whether this is a suitable packet (unfragmented
686 	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
687 	 * length, and compute a hash if necessary.  If not, return.
688 	 */
689 	if (l3_proto == htons(ETHERTYPE_IP)) {
690 		struct ip *iph = nh;
691 
692 		KASSERT(iph->ip_p == IPPROTO_TCP,
693 		    ("IPv4 protocol is not TCP, but packet marker is set"));
694 		if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
695 		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
696 			goto deliver_now;
697 		th = (struct tcphdr *)(iph + 1);
698 	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
699 		struct ip6_hdr *iph = nh;
700 
701 		KASSERT(iph->ip6_nxt == IPPROTO_TCP,
702 		    ("IPv6 next header is not TCP, but packet marker is set"));
703 		l2_id |= SFXGE_LRO_L2_ID_IPV6;
704 		th = (struct tcphdr *)(iph + 1);
705 	} else {
706 		goto deliver_now;
707 	}
708 
709 	bucket = conn_hash & rxq->lro.conns_mask;
710 
711 	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
712 		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
713 			continue;
714 		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
715 			continue;
716 		if (c->mbuf != NULL) {
717 			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
718 				struct ip *c_iph, *iph = nh;
719 				c_iph = c->nh;
720 				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
721 				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
722 					continue;
723 			} else {
724 				struct ip6_hdr *c_iph, *iph = nh;
725 				c_iph = c->nh;
726 				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
727 				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
728 					continue;
729 			}
730 		}
731 
732 		/* Re-insert at head of list to reduce lookup time. */
733 		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
734 		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);
735 
736 		if (c->next_buf.mbuf != NULL) {
737 			if (!sfxge_lro_try_merge(rxq, c))
738 				goto deliver_now;
739 		} else {
740 			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
741 			    active_link);
742 		}
743 		c->next_buf = *rx_buf;
744 		c->next_eh = eh;
745 		c->next_nh = nh;
746 
747 		rx_buf->mbuf = NULL;
748 		rx_buf->flags = EFX_DISCARD;
749 		return;
750 	}
751 
752 	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
753  deliver_now:
754 	sfxge_rx_deliver(sc, rx_buf);
755 }
756 
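/*
 * Called at the end of an event-queue poll: flush any packets still held
 * for active connections and, at most once per tick, purge connections
 * that have been idle for longer than lro_idle_ticks.
 */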
757 static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
758 {
759 	struct sfxge_lro_state *st = &rxq->lro;
760 	struct sfxge_lro_conn *c;
761 	unsigned t;
762 
763 	while (!LIST_EMPTY(&st->active_conns)) {
764 		c = LIST_FIRST(&st->active_conns);
765 		if (!c->delivered && c->mbuf != NULL)
766 			sfxge_lro_deliver(st, c);
767 		if (sfxge_lro_try_merge(rxq, c)) {
768 			if (c->mbuf != NULL)
769 				sfxge_lro_deliver(st, c);
770 			LIST_REMOVE(c, active_link);
771 		}
772 		c->delivered = 0;
773 	}
774 
775 	t = *(volatile int *)&ticks;
776 	if (__predict_false(t != st->last_purge_ticks))
777 		sfxge_lro_purge_idle(rxq, t);
778 }
779 
780 #else	/* !SFXGE_LRO */
781 
782 static void
783 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
784 {
785 }
786 
787 static void
788 sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
789 {
790 }
791 
792 #endif	/* SFXGE_LRO */
793 
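/*
 * Process completed receive descriptors from rxq->completed up to
 * rxq->pending: drop discarded and loopback packets, trim the checksum
 * flags according to the interface's receive-checksum capabilities, and
 * pass each packet either into LRO or directly up the stack, pipelined
 * one packet behind the payload prefetch.  Finally, top up the queue if
 * it has fallen below the refill threshold.
 */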
794 void
795 sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
796 {
797 	struct sfxge_softc *sc = rxq->sc;
798 	int if_capenable = sc->ifnet->if_capenable;
799 	int lro_enabled = if_capenable & IFCAP_LRO;
800 	unsigned int index;
801 	struct sfxge_evq *evq;
802 	unsigned int completed;
803 	unsigned int level;
804 	struct mbuf *m;
805 	struct sfxge_rx_sw_desc *prev = NULL;
806 
807 	index = rxq->index;
808 	evq = sc->evq[index];
809 
810 	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
811 
812 	completed = rxq->completed;
813 	while (completed != rxq->pending) {
814 		unsigned int id;
815 		struct sfxge_rx_sw_desc *rx_desc;
816 
817 		id = completed++ & rxq->ptr_mask;
818 		rx_desc = &rxq->queue[id];
819 		m = rx_desc->mbuf;
820 
821 		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
822 			goto discard;
823 
824 		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
825 			goto discard;
826 
827 		prefetch_read_many(mtod(m, caddr_t));
828 
829 		switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
830 		case EFX_PKT_IPV4:
831 			if (~if_capenable & IFCAP_RXCSUM)
832 				rx_desc->flags &=
833 				    ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
834 			break;
835 		case EFX_PKT_IPV6:
836 			if (~if_capenable & IFCAP_RXCSUM_IPV6)
837 				rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
838 			break;
839 		case 0:
840 			/* Check for loopback packets */
841 			{
842 				struct ether_header *etherhp;
843 
844 				/*LINTED*/
845 				etherhp = mtod(m, struct ether_header *);
846 
847 				if (etherhp->ether_type ==
848 				    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
849 					EFSYS_PROBE(loopback);
850 
851 					rxq->loopback++;
852 					goto discard;
853 				}
854 			}
855 			break;
856 		default:
857 			KASSERT(B_FALSE,
858 			    ("Rx descriptor with both IPv4 and IPv6 flags"));
859 			goto discard;
860 		}
861 
862 		/* Pass packet up the stack or into LRO (pipelined) */
863 		if (prev != NULL) {
864 			if (lro_enabled &&
865 			    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
866 			     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
867 				sfxge_lro(rxq, prev);
868 			else
869 				sfxge_rx_deliver(sc, prev);
870 		}
871 		prev = rx_desc;
872 		continue;
873 
874 discard:
875 		/* Return the packet to the pool */
876 		m_free(m);
877 		rx_desc->mbuf = NULL;
878 	}
879 	rxq->completed = completed;
880 
881 	level = rxq->added - rxq->completed;
882 
883 	/* Pass last packet up the stack or into LRO */
884 	if (prev != NULL) {
885 		if (lro_enabled &&
886 		    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
887 		     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
888 			sfxge_lro(rxq, prev);
889 		else
890 			sfxge_rx_deliver(sc, prev);
891 	}
892 
893 	/*
894 	 * If there are any pending flows and this is the end of the
895 	 * poll then they must be completed.
896 	 */
897 	if (eop)
898 		sfxge_lro_end_of_burst(rxq);
899 
900 	/* Top up the queue if necessary */
901 	if (level < rxq->refill_threshold)
902 		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
903 }
904 
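/*
 * Stop a receive queue: request a hardware flush and poll for up to two
 * seconds for it to complete, retrying if the flush fails; then complete
 * and discard any outstanding descriptors, destroy the common code queue
 * and release its buffer table entries.
 */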
905 static void
906 sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
907 {
908 	struct sfxge_rxq *rxq;
909 	struct sfxge_evq *evq;
910 	unsigned int count;
911 
912 	rxq = sc->rxq[index];
913 	evq = sc->evq[index];
914 
915 	SFXGE_EVQ_LOCK(evq);
916 
917 	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
918 	    ("rxq not started"));
919 
920 	rxq->init_state = SFXGE_RXQ_INITIALIZED;
921 
922 	callout_stop(&rxq->refill_callout);
923 
924 again:
925 	rxq->flush_state = SFXGE_FLUSH_PENDING;
926 
927 	/* Flush the receive queue */
928 	efx_rx_qflush(rxq->common);
929 
930 	SFXGE_EVQ_UNLOCK(evq);
931 
932 	count = 0;
933 	do {
934 		/* Spin for 100 ms */
935 		DELAY(100000);
936 
937 		if (rxq->flush_state != SFXGE_FLUSH_PENDING)
938 			break;
939 
940 	} while (++count < 20);
941 
942 	SFXGE_EVQ_LOCK(evq);
943 
944 	if (rxq->flush_state == SFXGE_FLUSH_FAILED)
945 		goto again;
946 
947 	rxq->flush_state = SFXGE_FLUSH_DONE;
948 
949 	rxq->pending = rxq->added;
950 	sfxge_rx_qcomplete(rxq, B_TRUE);
951 
952 	KASSERT(rxq->completed == rxq->pending,
953 	    ("rxq->completed != rxq->pending"));
954 
955 	rxq->added = 0;
956 	rxq->pending = 0;
957 	rxq->completed = 0;
958 	rxq->loopback = 0;
959 
960 	/* Destroy the common code receive queue. */
961 	efx_rx_qdestroy(rxq->common);
962 
963 	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
964 	    EFX_RXQ_NBUFS(sc->rxq_entries));
965 
966 	SFXGE_EVQ_UNLOCK(evq);
967 }
968 
969 static int
970 sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
971 {
972 	struct sfxge_rxq *rxq;
973 	efsys_mem_t *esmp;
974 	struct sfxge_evq *evq;
975 	int rc;
976 
977 	rxq = sc->rxq[index];
978 	esmp = &rxq->mem;
979 	evq = sc->evq[index];
980 
981 	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
982 	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
983 	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
984 	    ("evq->init_state != SFXGE_EVQ_STARTED"));
985 
986 	/* Program the buffer table. */
987 	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
988 	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
989 		return (rc);
990 
991 	/* Create the common code receive queue. */
992 	if ((rc = efx_rx_qcreate(sc->enp, index, index, EFX_RXQ_TYPE_DEFAULT,
993 	    esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
994 	    &rxq->common)) != 0)
995 		goto fail;
996 
997 	SFXGE_EVQ_LOCK(evq);
998 
999 	/* Enable the receive queue. */
1000 	efx_rx_qenable(rxq->common);
1001 
1002 	rxq->init_state = SFXGE_RXQ_STARTED;
1003 
1004 	/* Try to fill the queue from the pool. */
1005 	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);
1006 
1007 	SFXGE_EVQ_UNLOCK(evq);
1008 
1009 	return (0);
1010 
1011 fail:
1012 	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1013 	    EFX_RXQ_NBUFS(sc->rxq_entries));
1014 	return (rc);
1015 }
1016 
1017 void
1018 sfxge_rx_stop(struct sfxge_softc *sc)
1019 {
1020 	int index;
1021 
1022 	/* Stop the receive queue(s) */
1023 	index = sc->rxq_count;
1024 	while (--index >= 0)
1025 		sfxge_rx_qstop(sc, index);
1026 
1027 	sc->rx_prefix_size = 0;
1028 	sc->rx_buffer_size = 0;
1029 
1030 	efx_rx_fini(sc->enp);
1031 }
1032 
1033 int
1034 sfxge_rx_start(struct sfxge_softc *sc)
1035 {
1036 	struct sfxge_intr *intr;
1037 	int index;
1038 	int rc;
1039 
1040 	intr = &sc->intr;
1041 
1042 	/* Initialize the common code receive module. */
1043 	if ((rc = efx_rx_init(sc->enp)) != 0)
1044 		return (rc);
1045 
1046 	/* Calculate the receive packet buffer size. */
1047 	sc->rx_prefix_size = EFX_RX_PREFIX_SIZE;
1048 	sc->rx_buffer_size = (EFX_MAC_PDU(sc->ifnet->if_mtu) +
1049 			      sc->rx_prefix_size);
1050 
1051 	/* Select zone for packet buffers */
1052 	if (sc->rx_buffer_size <= MCLBYTES)
1053 		sc->rx_buffer_zone = zone_clust;
1054 	else if (sc->rx_buffer_size <= MJUMPAGESIZE)
1055 		sc->rx_buffer_zone = zone_jumbop;
1056 	else if (sc->rx_buffer_size <= MJUM9BYTES)
1057 		sc->rx_buffer_zone = zone_jumbo9;
1058 	else
1059 		sc->rx_buffer_zone = zone_jumbo16;
1060 
1061 	/*
1062 	 * Set up the scale table.  Enable all hash types and hash insertion.
1063 	 */
1064 	for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
1065 		sc->rx_indir_table[index] = index % sc->rxq_count;
1066 	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
1067 				       SFXGE_RX_SCALE_MAX)) != 0)
1068 		goto fail;
1069 	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
1070 	    (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
1071 	    (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);
1072 
1073 	if ((rc = efx_rx_scale_toeplitz_ipv4_key_set(sc->enp, toep_key,
1074 	    sizeof(toep_key))) != 0)
1075 		goto fail;
1076 
1077 	/* Start the receive queue(s). */
1078 	for (index = 0; index < sc->rxq_count; index++) {
1079 		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
1080 			goto fail2;
1081 	}
1082 
1083 	return (0);
1084 
1085 fail2:
1086 	while (--index >= 0)
1087 		sfxge_rx_qstop(sc, index);
1088 
1089 fail:
1090 	efx_rx_fini(sc->enp);
1091 
1092 	return (rc);
1093 }
1094 
1095 #ifdef SFXGE_LRO
1096 
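/*
 * Allocate and initialise the LRO hash table for a receive queue:
 * lro_table_size buckets, each a tail queue of tracked connections with
 * a fill counter used to enforce lro_chain_max.
 */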
1097 static void sfxge_lro_init(struct sfxge_rxq *rxq)
1098 {
1099 	struct sfxge_lro_state *st = &rxq->lro;
1100 	unsigned i;
1101 
1102 	st->conns_mask = lro_table_size - 1;
1103 	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
1104 		("lro_table_size must be a power of 2"));
1105 	st->sc = rxq->sc;
1106 	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
1107 			   M_SFXGE, M_WAITOK);
1108 	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
1109 			     M_SFXGE, M_WAITOK);
1110 	for (i = 0; i <= st->conns_mask; ++i) {
1111 		TAILQ_INIT(&st->conns[i]);
1112 		st->conns_n[i] = 0;
1113 	}
1114 	LIST_INIT(&st->active_conns);
1115 	TAILQ_INIT(&st->free_conns);
1116 }
1117 
1118 static void sfxge_lro_fini(struct sfxge_rxq *rxq)
1119 {
1120 	struct sfxge_lro_state *st = &rxq->lro;
1121 	struct sfxge_lro_conn *c;
1122 	unsigned i;
1123 
1124 	/* Return cleanly if sfxge_lro_init() has not been called. */
1125 	if (st->conns == NULL)
1126 		return;
1127 
1128 	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));
1129 
1130 	for (i = 0; i <= st->conns_mask; ++i) {
1131 		while (!TAILQ_EMPTY(&st->conns[i])) {
1132 			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
1133 			sfxge_lro_drop(rxq, c);
1134 		}
1135 	}
1136 
1137 	while (!TAILQ_EMPTY(&st->free_conns)) {
1138 		c = TAILQ_FIRST(&st->free_conns);
1139 		TAILQ_REMOVE(&st->free_conns, c, link);
1140 		KASSERT(!c->mbuf, ("found orphaned mbuf"));
1141 		free(c, M_SFXGE);
1142 	}
1143 
1144 	free(st->conns_n, M_SFXGE);
1145 	free(st->conns, M_SFXGE);
1146 	st->conns = NULL;
1147 }
1148 
1149 #else
1150 
1151 static void
1152 sfxge_lro_init(struct sfxge_rxq *rxq)
1153 {
1154 }
1155 
1156 static void
1157 sfxge_lro_fini(struct sfxge_rxq *rxq)
1158 {
1159 }
1160 
1161 #endif	/* SFXGE_LRO */
1162 
1163 static void
1164 sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
1165 {
1166 	struct sfxge_rxq *rxq;
1167 
1168 	rxq = sc->rxq[index];
1169 
1170 	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1171 	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1172 
1173 	/* Free the context array and the flow table. */
1174 	free(rxq->queue, M_SFXGE);
1175 	sfxge_lro_fini(rxq);
1176 
1177 	/* Release DMA memory. */
1178 	sfxge_dma_free(&rxq->mem);
1179 
1180 	sc->rxq[index] = NULL;
1181 
1182 	free(rxq, M_SFXGE);
1183 }
1184 
1185 static int
1186 sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
1187 {
1188 	struct sfxge_rxq *rxq;
1189 	struct sfxge_evq *evq;
1190 	efsys_mem_t *esmp;
1191 	int rc;
1192 
1193 	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));
1194 
1195 	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
1196 	rxq->sc = sc;
1197 	rxq->index = index;
1198 	rxq->entries = sc->rxq_entries;
1199 	rxq->ptr_mask = rxq->entries - 1;
1200 	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);
1201 
1202 	sc->rxq[index] = rxq;
1203 	esmp = &rxq->mem;
1204 
1205 	evq = sc->evq[index];
1206 
1207 	/* Allocate and zero DMA space. */
1208 	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
1209 		return (rc);
1210 
1211 	/* Allocate buffer table entries. */
1212 	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
1213 				 &rxq->buf_base_id);
1214 
1215 	/* Allocate the context array and the flow table. */
1216 	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
1217 	    M_SFXGE, M_WAITOK | M_ZERO);
1218 	sfxge_lro_init(rxq);
1219 
1220 	callout_init(&rxq->refill_callout, 1);
1221 
1222 	rxq->init_state = SFXGE_RXQ_INITIALIZED;
1223 
1224 	return (0);
1225 }
1226 
1227 static const struct {
1228 	const char *name;
1229 	size_t offset;
1230 } sfxge_rx_stats[] = {
1231 #define	SFXGE_RX_STAT(name, member) \
1232 	{ #name, offsetof(struct sfxge_rxq, member) }
1233 #ifdef SFXGE_LRO
1234 	SFXGE_RX_STAT(lro_merges, lro.n_merges),
1235 	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
1236 	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
1237 	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
1238 	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
1239 	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
1240 	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
1241 	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
1242 #endif
1243 };
1244 
1245 static int
1246 sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
1247 {
1248 	struct sfxge_softc *sc = arg1;
1249 	unsigned int id = arg2;
1250 	unsigned int sum, index;
1251 
1252 	/* Sum across all RX queues */
1253 	sum = 0;
1254 	for (index = 0; index < sc->rxq_count; index++)
1255 		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
1256 					 sfxge_rx_stats[id].offset);
1257 
1258 	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
1259 }
1260 
1261 static void
1262 sfxge_rx_stat_init(struct sfxge_softc *sc)
1263 {
1264 	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
1265 	struct sysctl_oid_list *stat_list;
1266 	unsigned int id;
1267 
1268 	stat_list = SYSCTL_CHILDREN(sc->stats_node);
1269 
1270 	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
1271 		SYSCTL_ADD_PROC(
1272 			ctx, stat_list,
1273 			OID_AUTO, sfxge_rx_stats[id].name,
1274 			CTLTYPE_UINT|CTLFLAG_RD,
1275 			sc, id, sfxge_rx_stat_handler, "IU",
1276 			"");
1277 	}
1278 }
1279 
1280 void
1281 sfxge_rx_fini(struct sfxge_softc *sc)
1282 {
1283 	int index;
1284 
1285 	index = sc->rxq_count;
1286 	while (--index >= 0)
1287 		sfxge_rx_qfini(sc, index);
1288 
1289 	sc->rxq_count = 0;
1290 }
1291 
1292 int
1293 sfxge_rx_init(struct sfxge_softc *sc)
1294 {
1295 	struct sfxge_intr *intr;
1296 	int index;
1297 	int rc;
1298 
1299 #ifdef SFXGE_LRO
1300 	if (!ISP2(lro_table_size)) {
1301 		log(LOG_ERR, "%s=%u must be a power of 2\n",
1302 		    SFXGE_LRO_PARAM(table_size), lro_table_size);
1303 		rc = EINVAL;
1304 		goto fail_lro_table_size;
1305 	}
1306 
1307 	if (lro_idle_ticks == 0)
1308 		lro_idle_ticks = hz / 10 + 1; /* 100 ms */
1309 #endif
1310 
1311 	intr = &sc->intr;
1312 
1313 	sc->rxq_count = intr->n_alloc;
1314 
1315 	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
1316 	    ("intr->state != SFXGE_INTR_INITIALIZED"));
1317 
1318 	/* Initialize the receive queue(s) - one per interrupt. */
1319 	for (index = 0; index < sc->rxq_count; index++) {
1320 		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
1321 			goto fail;
1322 	}
1323 
1324 	sfxge_rx_stat_init(sc);
1325 
1326 	return (0);
1327 
1328 fail:
1329 	/* Tear down the receive queue(s). */
1330 	while (--index >= 0)
1331 		sfxge_rx_qfini(sc, index);
1332 
1333 	sc->rxq_count = 0;
1334 
1335 #ifdef SFXGE_LRO
1336 fail_lro_table_size:
1337 #endif
1338 	return (rc);
1339 }
1340