xref: /freebsd/sys/dev/sfxge/sfxge_rx.c (revision 99b3c9adbc4152ceae234d21b6d0d19e2d0ea7d9)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2010-2016 Solarflare Communications Inc.
5  * All rights reserved.
6  *
7  * This software was developed in part by Philip Paeps under contract for
8  * Solarflare Communications, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright notice,
14  *    this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright notice,
16  *    this list of conditions and the following disclaimer in the documentation
17  *    and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
23  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
29  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  *
31  * The views and conclusions contained in the software and documentation are
32  * those of the authors and should not be interpreted as representing official
33  * policies, either expressed or implied, of the FreeBSD Project.
34  */
35 
36 #include <sys/cdefs.h>
37 #include "opt_rss.h"
38 
39 #include <sys/param.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/smp.h>
43 #include <sys/socket.h>
44 #include <sys/sysctl.h>
45 #include <sys/syslog.h>
46 #include <sys/limits.h>
48 
49 #include <net/ethernet.h>
50 #include <net/if.h>
51 #include <net/if_vlan_var.h>
52 
53 #include <netinet/in.h>
54 #include <netinet/ip.h>
55 #include <netinet/ip6.h>
56 #include <netinet/tcp.h>
57 
58 #include <machine/in_cksum.h>
59 
60 #include <net/rss_config.h>
61 
62 #include "common/efx.h"
63 
64 #include "sfxge.h"
65 #include "sfxge_rx.h"
66 
67 #define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)
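/* Refill is triggered once the number of posted-but-uncompleted descriptors
 * drops below 90% of the queue limit.  For example, a hypothetical ring where
 * EFX_RXQ_LIMIT(entries) evaluates to 1000 would start refilling as soon as
 * fewer than 900 descriptors remain outstanding.
 */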
68 
69 #ifdef SFXGE_LRO
70 
71 SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
72     "Large receive offload (LRO) parameters");
73 
74 #define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)
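/* The LRO parameters below are boot-time tunables (CTLFLAG_RDTUN) exposed
 * under the hw.sfxge.lro sysctl node; assuming SFXGE_PARAM() expands to the
 * "hw.sfxge." prefix, a loader.conf line such as
 * hw.sfxge.lro.table_size="256" would, for example, select a larger table.
 */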
75 
76 /* Size of the LRO hash table.  Must be a power of 2.  A larger table
77  * means we can accelerate a larger number of streams.
78  */
79 static unsigned lro_table_size = 128;
80 TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
81 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
82 	    &lro_table_size, 0,
83 	    "Size of the LRO hash table (must be a power of 2)");
84 
85 /* Maximum length of a hash chain.  If chains get too long then the lookup
86  * time increases and may exceed the benefit of LRO.
87  */
88 static unsigned lro_chain_max = 20;
89 TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
90 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
91 	    &lro_chain_max, 0,
92 	    "The maximum length of a hash chain");
93 
94 /* Maximum time (in ticks) that a connection can be idle before its LRO
95  * state is discarded.
96  */
97 static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
98 TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
99 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
100 	    &lro_idle_ticks, 0,
101 	    "The maximum time (in ticks) that a connection can be idle "
102 	    "before it's LRO state is discarded");
103 
104 /* Number of packets with payload that must arrive in-order before a
105  * connection is eligible for LRO.  The idea is we should avoid coalescing
106  * segments when the sender is in slow-start because reducing the ACK rate
107  * can damage performance.
108  */
109 static int lro_slow_start_packets = 2000;
110 TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
111 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
112 	    &lro_slow_start_packets, 0,
113 	    "Number of packets with payload that must arrive in-order before "
114 	    "a connection is eligible for LRO");
115 
116 /* Number of packets with payload that must arrive in-order following loss
117  * before a connection is eligible for LRO.  The idea is we should avoid
118  * coalescing segments when the sender is recovering from loss, because
119  * reducing the ACK rate can damage performance.
120  */
121 static int lro_loss_packets = 20;
122 TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
123 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
124 	    &lro_loss_packets, 0,
125 	    "Number of packets with payload that must arrive in-order "
126 	    "following loss before a connection is eligible for LRO");
127 
128 /* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
129 #define	SFXGE_LRO_L2_ID_VLAN 0x4000
130 #define	SFXGE_LRO_L2_ID_IPV6 0x8000
131 #define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
132 #define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
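/* l2_id therefore carries the VLAN ID (if any) in its low 12 bits, with the
 * two flag bits placed above EVL_VLID_MASK so they can never be mistaken for
 * part of a VLAN ID.
 */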
133 
134 /* Compare IPv6 addresses, avoiding conditional branches */
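/* The subtraction-and-OR form returns zero iff the two addresses are equal,
 * so callers can OR several comparisons together and branch only once.
 */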
135 static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
136 				   const struct in6_addr *right)
137 {
138 #if LONG_BIT == 64
139 	const uint64_t *left64 = (const uint64_t *)left;
140 	const uint64_t *right64 = (const uint64_t *)right;
141 	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
142 #else
143 	return (left->s6_addr32[0] - right->s6_addr32[0]) |
144 	       (left->s6_addr32[1] - right->s6_addr32[1]) |
145 	       (left->s6_addr32[2] - right->s6_addr32[2]) |
146 	       (left->s6_addr32[3] - right->s6_addr32[3]);
147 #endif
148 }
149 
150 #endif	/* SFXGE_LRO */
151 
152 void
153 sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
154 {
155 
156 	rxq->flush_state = SFXGE_FLUSH_DONE;
157 }
158 
159 void
160 sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
161 {
162 
163 	rxq->flush_state = SFXGE_FLUSH_FAILED;
164 }
165 
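/* Toeplitz hash key programmed into the adapter for RSS; filled from the
 * kernel's RSS key via rss_getkey() in sfxge_rx_start() and applied with
 * efx_rx_scale_key_set().
 */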
166 static uint8_t toep_key[RSS_KEYSIZE];
167 
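/* Callout handler used when a refill could not complete for lack of mbufs:
 * rather than refilling directly, post a "magic" software event to the RX
 * queue's event queue so that the refill runs in the normal event-processing
 * path under the EVQ lock.
 */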
168 static void
169 sfxge_rx_post_refill(void *arg)
170 {
171 	struct sfxge_rxq *rxq = arg;
172 	struct sfxge_softc *sc;
173 	unsigned int index;
174 	struct sfxge_evq *evq;
175 	uint16_t magic;
176 
177 	sc = rxq->sc;
178 	index = rxq->index;
179 	evq = sc->evq[index];
180 	magic = sfxge_sw_ev_rxq_magic(SFXGE_SW_EV_RX_QREFILL, rxq);
181 
182 	/* This is guaranteed due to the start/stop order of rx and ev */
183 	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
184 	    ("evq not started"));
185 	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
186 	    ("rxq not started"));
187 	efx_ev_qpost(evq->common, magic);
188 }
189 
190 static void
191 sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
192 {
193 	/* Initially retry after 100 ms, but back off in case of
194 	 * repeated failures as we probably have to wait for the
195 	 * administrator to raise the pool limit. */
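	/* e.g. successive failures give delays of roughly 100 ms, 200 ms,
	 * 400 ms, ... capped at 10 s (10 * hz ticks).
	 */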
196 	if (retrying)
197 		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
198 	else
199 		rxq->refill_delay = hz / 10;
200 
201 	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
202 			     sfxge_rx_post_refill, rxq);
203 }
204 
205 #define	SFXGE_REFILL_BATCH  64
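/* Buffer addresses are handed to efx_rx_qpost() in batches of up to 64,
 * presumably to amortise the per-call overhead while keeping the on-stack
 * DMA address array small.
 */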
206 
207 static void
208 sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
209 {
210 	struct sfxge_softc *sc;
211 	unsigned int index;
212 	struct sfxge_evq *evq __diagused;
213 	unsigned int batch;
214 	unsigned int rxfill;
215 	unsigned int mblksize;
216 	int ntodo;
217 	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
218 
219 	sc = rxq->sc;
220 	index = rxq->index;
221 	evq = sc->evq[index];
222 
223 	prefetch_read_many(sc->enp);
224 	prefetch_read_many(rxq->common);
225 
226 	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
227 
228 	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
229 		return;
230 
231 	rxfill = rxq->added - rxq->completed;
232 	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
233 	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
234 	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
235 	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
236 	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));
237 
238 	if (ntodo == 0)
239 		return;
240 
241 	batch = 0;
242 	mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
243 	while (ntodo-- > 0) {
244 		unsigned int id;
245 		struct sfxge_rx_sw_desc *rx_desc;
246 		bus_dma_segment_t seg;
247 		struct mbuf *m;
248 
249 		id = (rxq->added + batch) & rxq->ptr_mask;
250 		rx_desc = &rxq->queue[id];
251 		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));
252 
253 		rx_desc->flags = EFX_DISCARD;
254 		m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
255 		    sc->rx_cluster_size);
256 		if (m == NULL)
257 			break;
258 
259 		/* m_len specifies length of area to be mapped for DMA */
260 		m->m_len  = mblksize;
261 		m->m_data = (caddr_t)EFX_P2ROUNDUP(uintptr_t, m->m_data,
262 						   CACHE_LINE_SIZE);
263 		m->m_data += sc->rx_buffer_align;
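		/* After rounding m_data up to a cache line and then adding
		 * rx_buffer_align, the IP header that follows the hardware
		 * prefix and Ethernet header ends up 32-bit aligned;
		 * rx_buffer_align is derived for exactly this purpose in
		 * sfxge_rx_start().
		 */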
264 
265 		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
266 		addr[batch++] = seg.ds_addr;
267 
268 		if (batch == SFXGE_REFILL_BATCH) {
269 			efx_rx_qpost(rxq->common, addr, mblksize, batch,
270 			    rxq->completed, rxq->added);
271 			rxq->added += batch;
272 			batch = 0;
273 		}
274 	}
275 
276 	if (ntodo != 0)
277 		sfxge_rx_schedule_refill(rxq, retrying);
278 
279 	if (batch != 0) {
280 		efx_rx_qpost(rxq->common, addr, mblksize, batch,
281 		    rxq->completed, rxq->added);
282 		rxq->added += batch;
283 	}
284 
285 	/* Make the descriptors visible to the hardware */
286 	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
287 			BUS_DMASYNC_PREWRITE);
288 
289 	efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);
290 
291 	/* The queue could still be empty if no descriptors were actually
292 	 * pushed, in which case there will be no event to cause the next
293 	 * refill, so we must schedule a refill ourselves.
294 	 */
295 	if (rxq->pushed == rxq->completed) {
296 		sfxge_rx_schedule_refill(rxq, retrying);
297 	}
298 }
299 
300 void
301 sfxge_rx_qrefill(struct sfxge_rxq *rxq)
302 {
303 
304 	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
305 		return;
306 
307 	/* Make sure the queue is full */
308 	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
309 }
310 
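/* Common tail of packet delivery: attach the receiving ifnet and hand the
 * mbuf to the stack.  csum_data is set to 0xffff, the conventional value
 * meaning "checksum verified" when the caller also sets
 * CSUM_DATA_VALID | CSUM_PSEUDO_HDR.
 */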
311 static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
312 {
313 	if_t ifp = sc->ifnet;
314 
315 	m->m_pkthdr.rcvif = ifp;
316 	m->m_pkthdr.csum_data = 0xffff;
317 	if_input(ifp, m);
318 }
319 
320 static void
321 sfxge_rx_deliver(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_desc)
322 {
323 	struct sfxge_softc *sc = rxq->sc;
324 	struct mbuf *m = rx_desc->mbuf;
325 	int flags = rx_desc->flags;
326 	int csum_flags;
327 
328 	/* Convert checksum flags */
329 	csum_flags = (flags & EFX_CKSUM_IPV4) ?
330 		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
331 	if (flags & EFX_CKSUM_TCPUDP)
332 		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
333 
334 	if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
335 		m->m_pkthdr.flowid =
336 			efx_pseudo_hdr_hash_get(rxq->common,
337 						EFX_RX_HASHALG_TOEPLITZ,
338 						mtod(m, uint8_t *));
339 		/* The hash covers a 4-tuple for TCP only */
340 		M_HASHTYPE_SET(m,
341 		    (flags & EFX_PKT_IPV4) ?
342 			((flags & EFX_PKT_TCP) ?
343 			    M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
344 			((flags & EFX_PKT_TCP) ?
345 			    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
346 	}
347 	m->m_data += sc->rx_prefix_size;
348 	m->m_len = rx_desc->size - sc->rx_prefix_size;
349 	m->m_pkthdr.len = m->m_len;
350 	m->m_pkthdr.csum_flags = csum_flags;
351 	__sfxge_rx_deliver(sc, rx_desc->mbuf);
352 
353 	rx_desc->flags = EFX_DISCARD;
354 	rx_desc->mbuf = NULL;
355 }
356 
357 #ifdef SFXGE_LRO
358 
359 static void
360 sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
361 {
362 	struct sfxge_softc *sc = st->sc;
363 	struct mbuf *m = c->mbuf;
364 	struct tcphdr *c_th;
365 	int csum_flags;
366 
367 	KASSERT(m, ("no mbuf to deliver"));
368 
369 	++st->n_bursts;
370 
371 	/* Finish off packet munging and recalculate IP header checksum. */
372 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
373 		struct ip *iph = c->nh;
374 		iph->ip_len = htons(iph->ip_len);
375 		iph->ip_sum = 0;
376 		iph->ip_sum = in_cksum_hdr(iph);
377 		c_th = (struct tcphdr *)(iph + 1);
378 		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
379 			      CSUM_IP_CHECKED | CSUM_IP_VALID);
380 	} else {
381 		struct ip6_hdr *iph = c->nh;
382 		iph->ip6_plen = htons(iph->ip6_plen);
383 		c_th = (struct tcphdr *)(iph + 1);
384 		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
385 	}
386 
387 	c_th->th_win = c->th_last->th_win;
388 	c_th->th_ack = c->th_last->th_ack;
389 	if (c_th->th_off == c->th_last->th_off) {
390 		/* Copy TCP options (take care to avoid going negative). */
391 		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
392 		memcpy(c_th + 1, c->th_last + 1, optlen);
393 	}
394 
395 	m->m_pkthdr.flowid = c->conn_hash;
396 	M_HASHTYPE_SET(m,
397 	    SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
398 		M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);
399 
400 	m->m_pkthdr.csum_flags = csum_flags;
401 	__sfxge_rx_deliver(sc, m);
402 
403 	c->mbuf = NULL;
404 	c->delivered = 1;
405 }
406 
407 /* Drop the given connection, and add it to the free list. */
408 static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
409 {
410 	unsigned bucket;
411 
412 	KASSERT(!c->mbuf, ("found orphaned mbuf"));
413 
414 	if (c->next_buf.mbuf != NULL) {
415 		sfxge_rx_deliver(rxq, &c->next_buf);
416 		LIST_REMOVE(c, active_link);
417 	}
418 
419 	bucket = c->conn_hash & rxq->lro.conns_mask;
420 	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
421 	--rxq->lro.conns_n[bucket];
422 	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
423 	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
424 }
425 
426 /* Stop tracking connections that have gone idle in order to keep hash
427  * chains short.
428  */
429 static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
430 {
431 	struct sfxge_lro_conn *c;
432 	unsigned i;
433 
434 	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
435 		("found active connections"));
436 
437 	rxq->lro.last_purge_ticks = now;
438 	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
439 		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
440 			continue;
441 
442 		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
443 		if (now - c->last_pkt_ticks > lro_idle_ticks) {
444 			++rxq->lro.n_drop_idle;
445 			sfxge_lro_drop(rxq, c);
446 		}
447 	}
448 }
449 
450 static void
451 sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
452 		struct mbuf *mbuf, struct tcphdr *th)
453 {
454 	struct tcphdr *c_th;
455 
456 	/* Tack the new mbuf onto the chain. */
457 	KASSERT(!mbuf->m_next, ("mbuf already chained"));
458 	c->mbuf_tail->m_next = mbuf;
459 	c->mbuf_tail = mbuf;
460 
461 	/* Increase length appropriately */
462 	c->mbuf->m_pkthdr.len += mbuf->m_len;
463 
464 	/* Update the connection state flags */
465 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
466 		struct ip *iph = c->nh;
467 		iph->ip_len += mbuf->m_len;
468 		c_th = (struct tcphdr *)(iph + 1);
469 	} else {
470 		struct ip6_hdr *iph = c->nh;
471 		iph->ip6_plen += mbuf->m_len;
472 		c_th = (struct tcphdr *)(iph + 1);
473 	}
474 	tcp_set_flags(c_th, tcp_get_flags(c_th) | (tcp_get_flags(th) & TH_PUSH));
475 	c->th_last = th;
476 	++st->n_merges;
477 
478 	/* Pass packet up now if another segment could overflow the IP
479 	 * length.
480 	 */
481 	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
482 		sfxge_lro_deliver(st, c);
483 }
484 
485 static void
486 sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
487 		struct mbuf *mbuf, void *nh, struct tcphdr *th)
488 {
489 	/* Start the chain */
490 	c->mbuf = mbuf;
491 	c->mbuf_tail = c->mbuf;
492 	c->nh = nh;
493 	c->th_last = th;
494 
495 	mbuf->m_pkthdr.len = mbuf->m_len;
496 
497 	/* Mangle header fields for later processing */
498 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
499 		struct ip *iph = nh;
500 		iph->ip_len = ntohs(iph->ip_len);
501 	} else {
502 		struct ip6_hdr *iph = nh;
503 		iph->ip6_plen = ntohs(iph->ip6_plen);
504 	}
505 }
506 
507 /* Try to merge or otherwise hold or deliver (as appropriate) the
508  * packet buffered for this connection (c->next_buf).  Return a flag
509  * indicating whether the connection is still active for LRO purposes.
510  */
511 static int
512 sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
513 {
514 	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
515 	char *eh = c->next_eh;
516 	int data_length, hdr_length, dont_merge;
517 	unsigned th_seq, pkt_length;
518 	struct tcphdr *th;
519 	unsigned now;
520 
521 	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
522 		struct ip *iph = c->next_nh;
523 		th = (struct tcphdr *)(iph + 1);
524 		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
525 	} else {
526 		struct ip6_hdr *iph = c->next_nh;
527 		th = (struct tcphdr *)(iph + 1);
528 		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
529 	}
530 
531 	hdr_length = (char *) th + th->th_off * 4 - eh;
532 	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
533 		       hdr_length);
534 	th_seq = ntohl(th->th_seq);
535 	dont_merge = ((data_length <= 0)
536 		      | (tcp_get_flags(th) & (TH_URG | TH_SYN | TH_RST | TH_FIN)));
537 
538 	/* Check for options other than aligned timestamp. */
539 	if (th->th_off != 5) {
540 		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
541 		if (th->th_off == 8 &&
542 		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
543 					(TCPOPT_NOP << 16) |
544 					(TCPOPT_TIMESTAMP << 8) |
545 					TCPOLEN_TIMESTAMP)) {
546 			/* timestamp option -- okay */
547 		} else {
548 			dont_merge = 1;
549 		}
550 	}
551 
552 	if (__predict_false(th_seq != c->next_seq)) {
553 		/* Out-of-order, so start counting again. */
554 		if (c->mbuf != NULL)
555 			sfxge_lro_deliver(&rxq->lro, c);
556 		c->n_in_order_pkts -= lro_loss_packets;
557 		c->next_seq = th_seq + data_length;
558 		++rxq->lro.n_misorder;
559 		goto deliver_buf_out;
560 	}
561 	c->next_seq = th_seq + data_length;
562 
563 	now = ticks;
564 	if (now - c->last_pkt_ticks > lro_idle_ticks) {
565 		++rxq->lro.n_drop_idle;
566 		if (c->mbuf != NULL)
567 			sfxge_lro_deliver(&rxq->lro, c);
568 		sfxge_lro_drop(rxq, c);
569 		return (0);
570 	}
571 	c->last_pkt_ticks = ticks;
572 
573 	if (c->n_in_order_pkts < lro_slow_start_packets) {
574 		/* May be in slow-start, so don't merge. */
575 		++rxq->lro.n_slow_start;
576 		++c->n_in_order_pkts;
577 		goto deliver_buf_out;
578 	}
579 
580 	if (__predict_false(dont_merge)) {
581 		if (c->mbuf != NULL)
582 			sfxge_lro_deliver(&rxq->lro, c);
583 		if (tcp_get_flags(th) & (TH_FIN | TH_RST)) {
584 			++rxq->lro.n_drop_closed;
585 			sfxge_lro_drop(rxq, c);
586 			return (0);
587 		}
588 		goto deliver_buf_out;
589 	}
590 
591 	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;
592 
593 	if (__predict_true(c->mbuf != NULL)) {
594 		/* Remove headers and any padding */
595 		rx_buf->mbuf->m_data += hdr_length;
596 		rx_buf->mbuf->m_len = data_length;
597 
598 		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
599 	} else {
600 		/* Remove any padding */
601 		rx_buf->mbuf->m_len = pkt_length;
602 
603 		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
604 	}
605 
606 	rx_buf->mbuf = NULL;
607 	return (1);
608 
609  deliver_buf_out:
610 	sfxge_rx_deliver(rxq, rx_buf);
611 	return (1);
612 }
613 
614 static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
615 			       uint16_t l2_id, void *nh, struct tcphdr *th)
616 {
617 	unsigned bucket = conn_hash & st->conns_mask;
618 	struct sfxge_lro_conn *c;
619 
620 	if (st->conns_n[bucket] >= lro_chain_max) {
621 		++st->n_too_many;
622 		return;
623 	}
624 
625 	if (!TAILQ_EMPTY(&st->free_conns)) {
626 		c = TAILQ_FIRST(&st->free_conns);
627 		TAILQ_REMOVE(&st->free_conns, c, link);
628 	} else {
629 		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
630 		if (c == NULL)
631 			return;
632 		c->mbuf = NULL;
633 		c->next_buf.mbuf = NULL;
634 	}
635 
636 	/* Create the connection tracking data */
637 	++st->conns_n[bucket];
638 	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
639 	c->l2_id = l2_id;
640 	c->conn_hash = conn_hash;
641 	c->source = th->th_sport;
642 	c->dest = th->th_dport;
643 	c->n_in_order_pkts = 0;
644 	c->last_pkt_ticks = *(volatile int *)&ticks;
645 	c->delivered = 0;
646 	++st->n_new_stream;
647 	/* NB. We don't initialise c->next_seq, and it doesn't matter what
648 	 * value it has.  Most likely the next packet received for this
649 	 * connection will not match -- no harm done.
650 	 */
651 }
652 
653 /* Process mbuf and decide whether to dispatch it to the stack now or
654  * later.
655  */
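/* Connection lookup keys on the hardware Toeplitz hash, then confirms l2_id
 * and the TCP ports, and also the IP addresses whenever a merge is already in
 * progress.  Each connection holds at most one pending buffer (next_buf)
 * between event-queue polls.
 */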
656 static void
657 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
658 {
659 	struct sfxge_softc *sc = rxq->sc;
660 	struct mbuf *m = rx_buf->mbuf;
661 	struct ether_header *eh;
662 	struct sfxge_lro_conn *c;
663 	uint16_t l2_id;
664 	uint16_t l3_proto;
665 	void *nh;
666 	struct tcphdr *th;
667 	uint32_t conn_hash;
668 	unsigned bucket;
669 
670 	/* Get the hardware hash */
671 	conn_hash = efx_pseudo_hdr_hash_get(rxq->common,
672 					    EFX_RX_HASHALG_TOEPLITZ,
673 					    mtod(m, uint8_t *));
674 
675 	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
676 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
677 		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
678 		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
679 			SFXGE_LRO_L2_ID_VLAN;
680 		l3_proto = veh->evl_proto;
681 		nh = veh + 1;
682 	} else {
683 		l2_id = 0;
684 		l3_proto = eh->ether_type;
685 		nh = eh + 1;
686 	}
687 
688 	/* Check whether this is a suitable packet (unfragmented
689 	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
690 	 * length, and compute a hash if necessary.  If not, return.
691 	 */
692 	if (l3_proto == htons(ETHERTYPE_IP)) {
693 		struct ip *iph = nh;
694 
695 		KASSERT(iph->ip_p == IPPROTO_TCP,
696 		    ("IPv4 protocol is not TCP, but packet marker is set"));
697 		if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
698 		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
699 			goto deliver_now;
700 		th = (struct tcphdr *)(iph + 1);
701 	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
702 		struct ip6_hdr *iph = nh;
703 
704 		KASSERT(iph->ip6_nxt == IPPROTO_TCP,
705 		    ("IPv6 next header is not TCP, but packet marker is set"));
706 		l2_id |= SFXGE_LRO_L2_ID_IPV6;
707 		th = (struct tcphdr *)(iph + 1);
708 	} else {
709 		goto deliver_now;
710 	}
711 
712 	bucket = conn_hash & rxq->lro.conns_mask;
713 
714 	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
715 		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
716 			continue;
717 		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
718 			continue;
719 		if (c->mbuf != NULL) {
720 			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
721 				struct ip *c_iph, *iph = nh;
722 				c_iph = c->nh;
723 				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
724 				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
725 					continue;
726 			} else {
727 				struct ip6_hdr *c_iph, *iph = nh;
728 				c_iph = c->nh;
729 				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
730 				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
731 					continue;
732 			}
733 		}
734 
735 		/* Re-insert at head of list to reduce lookup time. */
736 		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
737 		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);
738 
739 		if (c->next_buf.mbuf != NULL) {
740 			if (!sfxge_lro_try_merge(rxq, c))
741 				goto deliver_now;
742 		} else {
743 			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
744 			    active_link);
745 		}
746 		c->next_buf = *rx_buf;
747 		c->next_eh = eh;
748 		c->next_nh = nh;
749 
750 		rx_buf->mbuf = NULL;
751 		rx_buf->flags = EFX_DISCARD;
752 		return;
753 	}
754 
755 	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
756  deliver_now:
757 	sfxge_rx_deliver(rxq, rx_buf);
758 }
759 
760 static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
761 {
762 	struct sfxge_lro_state *st = &rxq->lro;
763 	struct sfxge_lro_conn *c;
764 	unsigned t;
765 
766 	while (!LIST_EMPTY(&st->active_conns)) {
767 		c = LIST_FIRST(&st->active_conns);
768 		if (!c->delivered && c->mbuf != NULL)
769 			sfxge_lro_deliver(st, c);
770 		if (sfxge_lro_try_merge(rxq, c)) {
771 			if (c->mbuf != NULL)
772 				sfxge_lro_deliver(st, c);
773 			LIST_REMOVE(c, active_link);
774 		}
775 		c->delivered = 0;
776 	}
777 
778 	t = *(volatile int *)&ticks;
779 	if (__predict_false(t != st->last_purge_ticks))
780 		sfxge_lro_purge_idle(rxq, t);
781 }
782 
783 #else	/* !SFXGE_LRO */
784 
785 static void
786 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
787 {
788 }
789 
790 static void
791 sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
792 {
793 }
794 
795 #endif	/* SFXGE_LRO */
796 
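/* Process descriptors completed by the event queue.  Delivery is pipelined:
 * each accepted descriptor is held for one iteration while the next packet's
 * payload is prefetched, so the data is likely warm in cache by the time it
 * reaches LRO or the stack.
 */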
797 void
798 sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
799 {
800 	struct sfxge_softc *sc = rxq->sc;
801 	int if_capenable = if_getcapenable(sc->ifnet);
802 	int lro_enabled = if_capenable & IFCAP_LRO;
803 	unsigned int index;
804 	struct sfxge_evq *evq __diagused;
805 	unsigned int completed;
806 	unsigned int level;
807 	struct mbuf *m;
808 	struct sfxge_rx_sw_desc *prev = NULL;
809 
810 	index = rxq->index;
811 	evq = sc->evq[index];
812 
813 	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
814 
815 	completed = rxq->completed;
816 	while (completed != rxq->pending) {
817 		unsigned int id;
818 		struct sfxge_rx_sw_desc *rx_desc;
819 
820 		id = completed++ & rxq->ptr_mask;
821 		rx_desc = &rxq->queue[id];
822 		m = rx_desc->mbuf;
823 
824 		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
825 			goto discard;
826 
827 		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
828 			goto discard;
829 
830 		/* Read the length from the pseudo header if required */
831 		if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
832 			uint16_t tmp_size;
833 			int rc __diagused;
834 
835 			rc = efx_pseudo_hdr_pkt_length_get(rxq->common,
836 							   mtod(m, uint8_t *),
837 							   &tmp_size);
838 			KASSERT(rc == 0, ("cannot get packet length: %d", rc));
839 			rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
840 		}
841 
842 		prefetch_read_many(mtod(m, caddr_t));
843 
844 		switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
845 		case EFX_PKT_IPV4:
846 			if (~if_capenable & IFCAP_RXCSUM)
847 				rx_desc->flags &=
848 				    ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
849 			break;
850 		case EFX_PKT_IPV6:
851 			if (~if_capenable & IFCAP_RXCSUM_IPV6)
852 				rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
853 			break;
854 		case 0:
855 			/* Check for loopback packets */
856 			{
857 				struct ether_header *etherhp;
858 
859 				/*LINTED*/
860 				etherhp = mtod(m, struct ether_header *);
861 
862 				if (etherhp->ether_type ==
863 				    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
864 					EFSYS_PROBE(loopback);
865 
866 					rxq->loopback++;
867 					goto discard;
868 				}
869 			}
870 			break;
871 		default:
872 			KASSERT(B_FALSE,
873 			    ("Rx descriptor with both IPv4 and IPv6 flags"));
874 			goto discard;
875 		}
876 
877 		/* Pass packet up the stack or into LRO (pipelined) */
878 		if (prev != NULL) {
879 			if (lro_enabled &&
880 			    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
881 			     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
882 				sfxge_lro(rxq, prev);
883 			else
884 				sfxge_rx_deliver(rxq, prev);
885 		}
886 		prev = rx_desc;
887 		continue;
888 
889 discard:
890 		/* Return the packet to the pool */
891 		m_free(m);
892 		rx_desc->mbuf = NULL;
893 	}
894 	rxq->completed = completed;
895 
896 	level = rxq->added - rxq->completed;
897 
898 	/* Pass last packet up the stack or into LRO */
899 	if (prev != NULL) {
900 		if (lro_enabled &&
901 		    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
902 		     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
903 			sfxge_lro(rxq, prev);
904 		else
905 			sfxge_rx_deliver(rxq, prev);
906 	}
907 
908 	/*
909 	 * If there are any pending flows and this is the end of the
910 	 * poll then they must be completed.
911 	 */
912 	if (eop)
913 		sfxge_lro_end_of_burst(rxq);
914 
915 	/* Top up the queue if necessary */
916 	if (level < rxq->refill_threshold)
917 		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
918 }
919 
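/* Stop and flush a receive queue.  The flush is requested up to three times;
 * each attempt polls for completion for roughly two seconds (20 x 100 ms)
 * before being treated as timed out.
 */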
920 static void
921 sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
922 {
923 	struct sfxge_rxq *rxq;
924 	struct sfxge_evq *evq;
925 	unsigned int count;
926 	unsigned int retry = 3;
927 
928 	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
929 
930 	rxq = sc->rxq[index];
931 	evq = sc->evq[index];
932 
933 	SFXGE_EVQ_LOCK(evq);
934 
935 	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
936 	    ("rxq not started"));
937 
938 	rxq->init_state = SFXGE_RXQ_INITIALIZED;
939 
940 	callout_stop(&rxq->refill_callout);
941 
942 	while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
943 		rxq->flush_state = SFXGE_FLUSH_PENDING;
944 
945 		SFXGE_EVQ_UNLOCK(evq);
946 
947 		/* Flush the receive queue */
948 		if (efx_rx_qflush(rxq->common) != 0) {
949 			SFXGE_EVQ_LOCK(evq);
950 			rxq->flush_state = SFXGE_FLUSH_FAILED;
951 			break;
952 		}
953 
954 		count = 0;
955 		do {
956 			/* Spin for 100 ms */
957 			DELAY(100000);
958 
959 			if (rxq->flush_state != SFXGE_FLUSH_PENDING)
960 				break;
961 
962 		} while (++count < 20);
963 
964 		SFXGE_EVQ_LOCK(evq);
965 
966 		if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
967 			/* Flush timeout - neither done nor failed */
968 			log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
969 			    device_get_nameunit(sc->dev), index);
970 			rxq->flush_state = SFXGE_FLUSH_DONE;
971 		}
972 		retry--;
973 	}
974 	if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
975 		log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
976 		    device_get_nameunit(sc->dev), index);
977 		rxq->flush_state = SFXGE_FLUSH_DONE;
978 	}
979 
980 	rxq->pending = rxq->added;
981 	sfxge_rx_qcomplete(rxq, B_TRUE);
982 
983 	KASSERT(rxq->completed == rxq->pending,
984 	    ("rxq->completed != rxq->pending"));
985 
986 	rxq->added = 0;
987 	rxq->pushed = 0;
988 	rxq->pending = 0;
989 	rxq->completed = 0;
990 	rxq->loopback = 0;
991 
992 	/* Destroy the common code receive queue. */
993 	efx_rx_qdestroy(rxq->common);
994 
995 	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
996 	    EFX_RXQ_NBUFS(sc->rxq_entries));
997 
998 	SFXGE_EVQ_UNLOCK(evq);
999 }
1000 
1001 static int
1002 sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
1003 {
1004 	struct sfxge_rxq *rxq;
1005 	efsys_mem_t *esmp;
1006 	struct sfxge_evq *evq;
1007 	int rc;
1008 
1009 	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
1010 
1011 	rxq = sc->rxq[index];
1012 	esmp = &rxq->mem;
1013 	evq = sc->evq[index];
1014 
1015 	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1016 	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1017 	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
1018 	    ("evq->init_state != SFXGE_EVQ_STARTED"));
1019 
1020 	/* Program the buffer table. */
1021 	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
1022 	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
1023 		return (rc);
1024 
1025 	/* Create the common code receive queue. */
1026 	if ((rc = efx_rx_qcreate(sc->enp, index, 0, EFX_RXQ_TYPE_DEFAULT,
1027 	    esmp, sc->rxq_entries, rxq->buf_base_id, EFX_RXQ_FLAG_NONE,
1028 	    evq->common, &rxq->common)) != 0)
1029 		goto fail;
1030 
1031 	SFXGE_EVQ_LOCK(evq);
1032 
1033 	/* Enable the receive queue. */
1034 	efx_rx_qenable(rxq->common);
1035 
1036 	rxq->init_state = SFXGE_RXQ_STARTED;
1037 	rxq->flush_state = SFXGE_FLUSH_REQUIRED;
1038 
1039 	/* Try to fill the queue from the pool. */
1040 	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);
1041 
1042 	SFXGE_EVQ_UNLOCK(evq);
1043 
1044 	return (0);
1045 
1046 fail:
1047 	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1048 	    EFX_RXQ_NBUFS(sc->rxq_entries));
1049 	return (rc);
1050 }
1051 
1052 void
1053 sfxge_rx_stop(struct sfxge_softc *sc)
1054 {
1055 	int index;
1056 
1057 	efx_mac_filter_default_rxq_clear(sc->enp);
1058 
1059 	/* Stop the receive queue(s) */
1060 	index = sc->rxq_count;
1061 	while (--index >= 0)
1062 		sfxge_rx_qstop(sc, index);
1063 
1064 	sc->rx_prefix_size = 0;
1065 	sc->rx_buffer_size = 0;
1066 
1067 	efx_rx_fini(sc->enp);
1068 }
1069 
1070 int
1071 sfxge_rx_start(struct sfxge_softc *sc)
1072 {
1073 	const efx_nic_cfg_t *encp;
1074 	size_t hdrlen, align, reserved;
1075 	int index;
1076 	int rc;
1077 
1078 	/* Initialize the common code receive module. */
1079 	if ((rc = efx_rx_init(sc->enp)) != 0)
1080 		return (rc);
1081 
1082 	encp = efx_nic_cfg_get(sc->enp);
1083 	sc->rx_buffer_size = EFX_MAC_PDU(if_getmtu(sc->ifnet));
1084 
1085 	/* Calculate the receive packet buffer size. */
1086 	sc->rx_prefix_size = encp->enc_rx_prefix_size;
1087 
1088 	/* Ensure IP headers are 32bit aligned */
1089 	hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
1090 	sc->rx_buffer_align = EFX_P2ROUNDUP(size_t, hdrlen, 4) - hdrlen;
1091 
1092 	sc->rx_buffer_size += sc->rx_buffer_align;
1093 
1094 	/* Align end of packet buffer for RX DMA end padding */
1095 	align = MAX(1, encp->enc_rx_buf_align_end);
1096 	EFSYS_ASSERT(ISP2(align));
1097 	sc->rx_buffer_size = EFX_P2ROUNDUP(size_t, sc->rx_buffer_size, align);
1098 
1099 	/*
1100 	 * Standard mbuf zones only guarantee pointer-size alignment;
1101 	 * we need extra space to align to the cache line
1102 	 */
1103 	reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;
1104 
1105 	/* Select zone for packet buffers */
1106 	if (reserved <= MCLBYTES)
1107 		sc->rx_cluster_size = MCLBYTES;
1108 	else if (reserved <= MJUMPAGESIZE)
1109 		sc->rx_cluster_size = MJUMPAGESIZE;
1110 	else if (reserved <= MJUM9BYTES)
1111 		sc->rx_cluster_size = MJUM9BYTES;
1112 	else
1113 		sc->rx_cluster_size = MJUM16BYTES;
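	/* For example, with a hypothetical 1500-byte MTU the reserved size
	 * normally fits in a standard 2 KB cluster, whereas jumbo MTUs fall
	 * through to the page-sized, 9 KB or 16 KB zones.
	 */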
1114 
1115 	/*
1116 	 * Set up the scale table.  Enable all hash types and hash insertion.
1117 	 */
1118 	for (index = 0; index < nitems(sc->rx_indir_table); index++)
1119 #ifdef RSS
1120 		sc->rx_indir_table[index] =
1121 			rss_get_indirection_to_bucket(index) % sc->rxq_count;
1122 #else
1123 		sc->rx_indir_table[index] = index % sc->rxq_count;
1124 #endif
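	/* Without the RSS option this produces a simple round-robin spread,
	 * e.g. with four queues the table reads 0,1,2,3,0,1,...; with RSS the
	 * kernel's bucket-to-queue mapping is reused instead.
	 */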
1125 	if ((rc = efx_rx_scale_tbl_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT,
1126 				       sc->rx_indir_table,
1127 				       nitems(sc->rx_indir_table))) != 0)
1128 		goto fail;
1129 	(void)efx_rx_scale_mode_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT,
1130 	    EFX_RX_HASHALG_TOEPLITZ,
1131 	    EFX_RX_HASH_IPV4 | EFX_RX_HASH_TCPIPV4 |
1132 	    EFX_RX_HASH_IPV6 | EFX_RX_HASH_TCPIPV6, B_TRUE);
1133 
1134 	rss_getkey(toep_key);
1135 	if ((rc = efx_rx_scale_key_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT,
1136 				       toep_key,
1137 				       sizeof(toep_key))) != 0)
1138 		goto fail;
1139 
1140 	/* Start the receive queue(s). */
1141 	for (index = 0; index < sc->rxq_count; index++) {
1142 		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
1143 			goto fail2;
1144 	}
1145 
1146 	rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
1147 					    sc->intr.n_alloc > 1);
1148 	if (rc != 0)
1149 		goto fail3;
1150 
1151 	return (0);
1152 
1153 fail3:
1154 fail2:
1155 	while (--index >= 0)
1156 		sfxge_rx_qstop(sc, index);
1157 
1158 fail:
1159 	efx_rx_fini(sc->enp);
1160 
1161 	return (rc);
1162 }
1163 
1164 #ifdef SFXGE_LRO
1165 
1166 static void sfxge_lro_init(struct sfxge_rxq *rxq)
1167 {
1168 	struct sfxge_lro_state *st = &rxq->lro;
1169 	unsigned i;
1170 
1171 	st->conns_mask = lro_table_size - 1;
1172 	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
1173 		("lro_table_size must be a power of 2"));
1174 	st->sc = rxq->sc;
1175 	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
1176 			   M_SFXGE, M_WAITOK);
1177 	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
1178 			     M_SFXGE, M_WAITOK);
1179 	for (i = 0; i <= st->conns_mask; ++i) {
1180 		TAILQ_INIT(&st->conns[i]);
1181 		st->conns_n[i] = 0;
1182 	}
1183 	LIST_INIT(&st->active_conns);
1184 	TAILQ_INIT(&st->free_conns);
1185 }
1186 
1187 static void sfxge_lro_fini(struct sfxge_rxq *rxq)
1188 {
1189 	struct sfxge_lro_state *st = &rxq->lro;
1190 	struct sfxge_lro_conn *c;
1191 	unsigned i;
1192 
1193 	/* Return cleanly if sfxge_lro_init() has not been called. */
1194 	if (st->conns == NULL)
1195 		return;
1196 
1197 	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));
1198 
1199 	for (i = 0; i <= st->conns_mask; ++i) {
1200 		while (!TAILQ_EMPTY(&st->conns[i])) {
1201 			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
1202 			sfxge_lro_drop(rxq, c);
1203 		}
1204 	}
1205 
1206 	while (!TAILQ_EMPTY(&st->free_conns)) {
1207 		c = TAILQ_FIRST(&st->free_conns);
1208 		TAILQ_REMOVE(&st->free_conns, c, link);
1209 		KASSERT(!c->mbuf, ("found orphaned mbuf"));
1210 		free(c, M_SFXGE);
1211 	}
1212 
1213 	free(st->conns_n, M_SFXGE);
1214 	free(st->conns, M_SFXGE);
1215 	st->conns = NULL;
1216 }
1217 
1218 #else
1219 
1220 static void
1221 sfxge_lro_init(struct sfxge_rxq *rxq)
1222 {
1223 }
1224 
1225 static void
1226 sfxge_lro_fini(struct sfxge_rxq *rxq)
1227 {
1228 }
1229 
1230 #endif	/* SFXGE_LRO */
1231 
1232 static void
1233 sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
1234 {
1235 	struct sfxge_rxq *rxq;
1236 
1237 	rxq = sc->rxq[index];
1238 
1239 	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1240 	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1241 
1242 	/* Free the context array and the flow table. */
1243 	free(rxq->queue, M_SFXGE);
1244 	sfxge_lro_fini(rxq);
1245 
1246 	/* Release DMA memory. */
1247 	sfxge_dma_free(&rxq->mem);
1248 
1249 	sc->rxq[index] = NULL;
1250 
1251 	free(rxq, M_SFXGE);
1252 }
1253 
1254 static int
1255 sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
1256 {
1257 	struct sfxge_rxq *rxq;
1258 	efsys_mem_t *esmp;
1259 	int rc;
1260 
1261 	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));
1262 
1263 	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
1264 	rxq->sc = sc;
1265 	rxq->index = index;
1266 	rxq->entries = sc->rxq_entries;
1267 	rxq->ptr_mask = rxq->entries - 1;
1268 	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);
1269 
1270 	sc->rxq[index] = rxq;
1271 	esmp = &rxq->mem;
1272 
1273 	/* Allocate and zero DMA space. */
1274 	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
1275 		return (rc);
1276 
1277 	/* Allocate buffer table entries. */
1278 	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
1279 				 &rxq->buf_base_id);
1280 
1281 	/* Allocate the context array and the flow table. */
1282 	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
1283 	    M_SFXGE, M_WAITOK | M_ZERO);
1284 	sfxge_lro_init(rxq);
1285 
1286 	callout_init(&rxq->refill_callout, 1);
1287 
1288 	rxq->init_state = SFXGE_RXQ_INITIALIZED;
1289 
1290 	return (0);
1291 }
1292 
1293 static const struct {
1294 	const char *name;
1295 	size_t offset;
1296 } sfxge_rx_stats[] = {
1297 #define	SFXGE_RX_STAT(name, member) \
1298 	{ #name, offsetof(struct sfxge_rxq, member) }
1299 #ifdef SFXGE_LRO
1300 	SFXGE_RX_STAT(lro_merges, lro.n_merges),
1301 	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
1302 	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
1303 	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
1304 	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
1305 	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
1306 	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
1307 	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
1308 #endif
1309 };
1310 
1311 static int
1312 sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
1313 {
1314 	struct sfxge_softc *sc = arg1;
1315 	unsigned int id = arg2;
1316 	unsigned int sum, index;
1317 
1318 	/* Sum across all RX queues */
1319 	sum = 0;
1320 	for (index = 0; index < sc->rxq_count; index++)
1321 		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
1322 					 sfxge_rx_stats[id].offset);
1323 
1324 	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
1325 }
1326 
1327 static void
1328 sfxge_rx_stat_init(struct sfxge_softc *sc)
1329 {
1330 	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
1331 	struct sysctl_oid_list *stat_list;
1332 	unsigned int id;
1333 
1334 	stat_list = SYSCTL_CHILDREN(sc->stats_node);
1335 
1336 	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
1337 		SYSCTL_ADD_PROC(ctx, stat_list, OID_AUTO,
1338 		    sfxge_rx_stats[id].name,
1339 		    CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
1340 		    sc, id, sfxge_rx_stat_handler, "IU", "");
1341 	}
1342 }
1343 
1344 void
1345 sfxge_rx_fini(struct sfxge_softc *sc)
1346 {
1347 	int index;
1348 
1349 	index = sc->rxq_count;
1350 	while (--index >= 0)
1351 		sfxge_rx_qfini(sc, index);
1352 
1353 	sc->rxq_count = 0;
1354 }
1355 
1356 int
1357 sfxge_rx_init(struct sfxge_softc *sc)
1358 {
1359 	struct sfxge_intr *intr;
1360 	int index;
1361 	int rc;
1362 
1363 #ifdef SFXGE_LRO
1364 	if (!ISP2(lro_table_size)) {
1365 		log(LOG_ERR, "%s=%u must be power of 2",
1366 		    SFXGE_LRO_PARAM(table_size), lro_table_size);
1367 		rc = EINVAL;
1368 		goto fail_lro_table_size;
1369 	}
1370 
1371 	if (lro_idle_ticks == 0)
1372 		lro_idle_ticks = hz / 10 + 1; /* 100 ms */
1373 #endif
1374 
1375 	intr = &sc->intr;
1376 
1377 	sc->rxq_count = intr->n_alloc;
1378 
1379 	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
1380 	    ("intr->state != SFXGE_INTR_INITIALIZED"));
1381 
1382 	/* Initialize the receive queue(s) - one per interrupt. */
1383 	for (index = 0; index < sc->rxq_count; index++) {
1384 		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
1385 			goto fail;
1386 	}
1387 
1388 	sfxge_rx_stat_init(sc);
1389 
1390 	return (0);
1391 
1392 fail:
1393 	/* Tear down the receive queue(s). */
1394 	while (--index >= 0)
1395 		sfxge_rx_qfini(sc, index);
1396 
1397 	sc->rxq_count = 0;
1398 
1399 #ifdef SFXGE_LRO
1400 fail_lro_table_size:
1401 #endif
1402 	return (rc);
1403 }
1404