xref: /freebsd/sys/dev/ixl/ixl_txrx.c (revision 3fc36ee018bb836bd1796067cf4ef8683f166ebc)
1 /******************************************************************************
2 
3   Copyright (c) 2013-2015, Intel Corporation
4   All rights reserved.
5 
6   Redistribution and use in source and binary forms, with or without
7   modification, are permitted provided that the following conditions are met:
8 
9    1. Redistributions of source code must retain the above copyright notice,
10       this list of conditions and the following disclaimer.
11 
12    2. Redistributions in binary form must reproduce the above copyright
13       notice, this list of conditions and the following disclaimer in the
14       documentation and/or other materials provided with the distribution.
15 
16    3. Neither the name of the Intel Corporation nor the names of its
17       contributors may be used to endorse or promote products derived from
18       this software without specific prior written permission.
19 
20   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
24   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30   POSSIBILITY OF SUCH DAMAGE.
31 
32 ******************************************************************************/
33 /*$FreeBSD$*/
34 
35 /*
36 **	IXL driver TX/RX Routines:
37 **	    This was separated to allow usage by
38 ** 	    both the PF and VF drivers.
39 */
40 
41 #ifndef IXL_STANDALONE_BUILD
42 #include "opt_inet.h"
43 #include "opt_inet6.h"
44 #include "opt_rss.h"
45 #endif
46 
47 #include "ixl.h"
48 
49 #ifdef RSS
50 #include <net/rss_config.h>
51 #endif
52 
53 /* Local Prototypes */
54 static void	ixl_rx_checksum(struct mbuf *, u32, u32, u8);
55 static void	ixl_refresh_mbufs(struct ixl_queue *, int);
56 static int      ixl_xmit(struct ixl_queue *, struct mbuf **);
57 static int	ixl_tx_setup_offload(struct ixl_queue *,
58 		    struct mbuf *, u32 *, u32 *);
59 static bool	ixl_tso_setup(struct ixl_queue *, struct mbuf *);
60 
61 static inline void ixl_rx_discard(struct rx_ring *, int);
62 static inline void ixl_rx_input(struct rx_ring *, struct ifnet *,
63 		    struct mbuf *, u8);
64 
65 static inline bool ixl_tso_detect_sparse(struct mbuf *mp);
66 static int	ixl_tx_setup_offload(struct ixl_queue *que,
67     struct mbuf *mp, u32 *cmd, u32 *off);
68 static inline u32 ixl_get_tx_head(struct ixl_queue *que);
69 
70 #ifdef DEV_NETMAP
71 #include <dev/netmap/if_ixl_netmap.h>
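/*
 * Counters and a CRC-strip toggle presumably shared with the netmap
 * glue code in if_ixl_netmap.h.
 */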
72 int ixl_rx_miss, ixl_rx_miss_bufs, ixl_crcstrip = 1;
73 #endif /* DEV_NETMAP */
74 
75 /*
76  * @key: buffer into which the default RSS key is copied
77  */
78 void
79 ixl_get_default_rss_key(u32 *key)
80 {
81 	MPASS(key != NULL);
82 
83 	u32 rss_seed[IXL_RSS_KEY_SIZE_REG] = {0x41b01687,
84 	    0x183cfd8c, 0xce880440, 0x580cbc3c,
85 	    0x35897377, 0x328b25e1, 0x4fa98922,
86 	    0xb7d90c14, 0xd5bad70d, 0xcd15a2c1,
87 	    0x0, 0x0, 0x0};
88 
89 	bcopy(rss_seed, key, IXL_RSS_KEY_SIZE);
90 }
91 
92 /*
93 ** Multiqueue Transmit driver
94 */
95 int
96 ixl_mq_start(struct ifnet *ifp, struct mbuf *m)
97 {
98 	struct ixl_vsi		*vsi = ifp->if_softc;
99 	struct ixl_queue	*que;
100 	struct tx_ring		*txr;
101 	int 			err, i;
102 #ifdef RSS
103 	u32			bucket_id;
104 #endif
105 
106 	/*
107 	** Which queue to use:
108 	**
109 	** When doing RSS, map it to the same outbound
110 	** queue as the incoming flow would be mapped to.
111 	** If everything is set up correctly, it should be
112 	** the same bucket the current CPU is running in.
113 	*/
114 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
115 #ifdef  RSS
116 		if (rss_hash2bucket(m->m_pkthdr.flowid,
117 		    M_HASHTYPE_GET(m), &bucket_id) == 0) {
118 			i = bucket_id % vsi->num_queues;
119                 } else
120 #endif
121                         i = m->m_pkthdr.flowid % vsi->num_queues;
122         } else
123 		i = curcpu % vsi->num_queues;
124 
125 	que = &vsi->queues[i];
126 	txr = &que->txr;
127 
128 	err = drbr_enqueue(ifp, txr->br, m);
129 	if (err)
130 		return (err);
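	/*
	 * If another thread already holds the TX lock, defer draining
	 * the buf_ring to the per-queue taskqueue rather than blocking.
	 */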
131 	if (IXL_TX_TRYLOCK(txr)) {
132 		ixl_mq_start_locked(ifp, txr);
133 		IXL_TX_UNLOCK(txr);
134 	} else
135 		taskqueue_enqueue(que->tq, &que->tx_task);
136 
137 	return (0);
138 }
139 
140 int
141 ixl_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr)
142 {
143 	struct ixl_queue	*que = txr->que;
144 	struct ixl_vsi		*vsi = que->vsi;
145         struct mbuf		*next;
146         int			err = 0;
147 
148 
149 	if (((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) ||
150 	    vsi->link_active == 0)
151 		return (ENETDOWN);
152 
153 	/* Process the transmit queue */
154 	while ((next = drbr_peek(ifp, txr->br)) != NULL) {
155 		if ((err = ixl_xmit(que, &next)) != 0) {
156 			if (next == NULL)
157 				drbr_advance(ifp, txr->br);
158 			else
159 				drbr_putback(ifp, txr->br, next);
160 			break;
161 		}
162 		drbr_advance(ifp, txr->br);
163 		/* Send a copy of the frame to the BPF listener */
164 		ETHER_BPF_MTAP(ifp, next);
165 		if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
166 			break;
167 	}
168 
169 	if (txr->avail < IXL_TX_CLEANUP_THRESHOLD)
170 		ixl_txeof(que);
171 
172 	return (err);
173 }
174 
175 /*
176  * Called from a taskqueue to drain queued transmit packets.
177  */
178 void
179 ixl_deferred_mq_start(void *arg, int pending)
180 {
181 	struct ixl_queue	*que = arg;
182         struct tx_ring		*txr = &que->txr;
183 	struct ixl_vsi		*vsi = que->vsi;
184         struct ifnet		*ifp = vsi->ifp;
185 
186 	IXL_TX_LOCK(txr);
187 	if (!drbr_empty(ifp, txr->br))
188 		ixl_mq_start_locked(ifp, txr);
189 	IXL_TX_UNLOCK(txr);
190 }
191 
192 /*
193 ** Flush all queue ring buffers
194 */
195 void
196 ixl_qflush(struct ifnet *ifp)
197 {
198 	struct ixl_vsi	*vsi = ifp->if_softc;
199 
200         for (int i = 0; i < vsi->num_queues; i++) {
201 		struct ixl_queue *que = &vsi->queues[i];
202 		struct tx_ring	*txr = &que->txr;
203 		struct mbuf	*m;
204 		IXL_TX_LOCK(txr);
205 		while ((m = buf_ring_dequeue_sc(txr->br)) != NULL)
206 			m_freem(m);
207 		IXL_TX_UNLOCK(txr);
208 	}
209 	if_qflush(ifp);
210 }
211 
212 /*
213 ** Find mbuf chains passed to the driver
214 ** that are 'sparse', using more than 8
215 ** mbufs to deliver an mss-size chunk of data
216 */
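/* Callers run m_defrag() on any chain reported as sparse before mapping it. */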
217 static inline bool
218 ixl_tso_detect_sparse(struct mbuf *mp)
219 {
220 	struct mbuf	*m;
221 	int		num = 0, mss;
222 	bool		ret = FALSE;
223 
224 	mss = mp->m_pkthdr.tso_segsz;
225 	for (m = mp->m_next; m != NULL; m = m->m_next) {
226 		num++;
227 		mss -= m->m_len;
228 		if (mss < 1)
229 			break;
230 		if (m->m_next == NULL)
231 			break;
232 	}
233 	if (num > IXL_SPARSE_CHAIN)
234 		ret = TRUE;
235 
236 	return (ret);
237 }
238 
239 
240 /*********************************************************************
241  *
242  *  This routine maps the mbufs to tx descriptors, allowing the
243  *  TX engine to transmit the packets.
244  *  	- return 0 on success, positive on failure
245  *
246  **********************************************************************/
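/* Flags set on the last data descriptor of a frame: End Of Packet plus Report Status */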
247 #define IXL_TXD_CMD (I40E_TX_DESC_CMD_EOP | I40E_TX_DESC_CMD_RS)
248 
249 static int
250 ixl_xmit(struct ixl_queue *que, struct mbuf **m_headp)
251 {
252 	struct ixl_vsi		*vsi = que->vsi;
253 	struct i40e_hw		*hw = vsi->hw;
254 	struct tx_ring		*txr = &que->txr;
255 	struct ixl_tx_buf	*buf;
256 	struct i40e_tx_desc	*txd = NULL;
257 	struct mbuf		*m_head, *m;
258 	int             	i, j, error, nsegs;
259 	int			first, last = 0;
260 	u16			vtag = 0;
261 	u32			cmd, off;
262 	bus_dmamap_t		map;
263 	bus_dma_tag_t		tag;
264 	bus_dma_segment_t	segs[IXL_MAX_TSO_SEGS];
265 
266 	cmd = off = 0;
267 	m_head = *m_headp;
268 
269         /*
270          * Important to capture the first descriptor
271          * used because it will contain the index of
272          * the one we tell the hardware to report back
273          */
274         first = txr->next_avail;
275 	buf = &txr->buffers[first];
276 	map = buf->map;
277 	tag = txr->tx_tag;
278 
279 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
280 		/* Use larger mapping for TSO */
281 		tag = txr->tso_tag;
282 		if (ixl_tso_detect_sparse(m_head)) {
283 			m = m_defrag(m_head, M_NOWAIT);
284 			if (m == NULL) {
285 				m_freem(*m_headp);
286 				*m_headp = NULL;
287 				return (ENOBUFS);
288 			}
289 			*m_headp = m;
290 		}
291 	}
292 
293 	/*
294 	 * Map the packet for DMA.
295 	 */
296 	error = bus_dmamap_load_mbuf_sg(tag, map,
297 	    *m_headp, segs, &nsegs, BUS_DMA_NOWAIT);
298 
299 	if (error == EFBIG) {
300 		struct mbuf *m;
301 
302 		m = m_defrag(*m_headp, M_NOWAIT);
303 		if (m == NULL) {
304 			que->mbuf_defrag_failed++;
305 			m_freem(*m_headp);
306 			*m_headp = NULL;
307 			return (ENOBUFS);
308 		}
309 		*m_headp = m;
310 
311 		/* Try it again */
312 		error = bus_dmamap_load_mbuf_sg(tag, map,
313 		    *m_headp, segs, &nsegs, BUS_DMA_NOWAIT);
314 
315 		if (error == ENOMEM) {
316 			que->tx_dmamap_failed++;
317 			return (error);
318 		} else if (error != 0) {
319 			que->tx_dmamap_failed++;
320 			m_freem(*m_headp);
321 			*m_headp = NULL;
322 			return (error);
323 		}
324 	} else if (error == ENOMEM) {
325 		que->tx_dmamap_failed++;
326 		return (error);
327 	} else if (error != 0) {
328 		que->tx_dmamap_failed++;
329 		m_freem(*m_headp);
330 		*m_headp = NULL;
331 		return (error);
332 	}
333 
334 	/* Make certain there are enough descriptors */
335 	if (nsegs > txr->avail - 2) {
336 		txr->no_desc++;
337 		error = ENOBUFS;
338 		goto xmit_fail;
339 	}
340 	m_head = *m_headp;
341 
342 	/* Set up the TSO/CSUM offload */
343 	if (m_head->m_pkthdr.csum_flags & CSUM_OFFLOAD) {
344 		error = ixl_tx_setup_offload(que, m_head, &cmd, &off);
345 		if (error)
346 			goto xmit_fail;
347 	}
348 
349 	cmd |= I40E_TX_DESC_CMD_ICRC;
350 	/* Grab the VLAN tag */
351 	if (m_head->m_flags & M_VLANTAG) {
352 		cmd |= I40E_TX_DESC_CMD_IL2TAG1;
353 		vtag = htole16(m_head->m_pkthdr.ether_vtag);
354 	}
355 
356 	i = txr->next_avail;
357 	for (j = 0; j < nsegs; j++) {
358 		bus_size_t seglen;
359 
360 		buf = &txr->buffers[i];
361 		buf->tag = tag; /* Keep track of the type tag */
362 		txd = &txr->base[i];
363 		seglen = segs[j].ds_len;
364 
365 		txd->buffer_addr = htole64(segs[j].ds_addr);
366 		txd->cmd_type_offset_bsz =
367 		    htole64(I40E_TX_DESC_DTYPE_DATA
368 		    | ((u64)cmd  << I40E_TXD_QW1_CMD_SHIFT)
369 		    | ((u64)off << I40E_TXD_QW1_OFFSET_SHIFT)
370 		    | ((u64)seglen  << I40E_TXD_QW1_TX_BUF_SZ_SHIFT)
371 		    | ((u64)vtag  << I40E_TXD_QW1_L2TAG1_SHIFT));
372 
373 		last = i; /* descriptor that will get completion IRQ */
374 
375 		if (++i == que->num_desc)
376 			i = 0;
377 
378 		buf->m_head = NULL;
379 		buf->eop_index = -1;
380 	}
381 	/* Set the last descriptor for report */
382 	txd->cmd_type_offset_bsz |=
383 	    htole64(((u64)IXL_TXD_CMD << I40E_TXD_QW1_CMD_SHIFT));
384 	txr->avail -= nsegs;
385 	txr->next_avail = i;
386 
387 	buf->m_head = m_head;
388 	/* Swap the dma map between the first and last descriptor */
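	/*
	 * The mbuf was loaded using the first buffer's map, but the mbuf
	 * itself is stored on the last buffer; exchange the maps so the
	 * loaded map is unloaded together with the mbuf at completion.
	 */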
389 	txr->buffers[first].map = buf->map;
390 	buf->map = map;
391 	bus_dmamap_sync(tag, map, BUS_DMASYNC_PREWRITE);
392 
393         /* Set the index of the descriptor that will be marked done */
394         buf = &txr->buffers[first];
395 	buf->eop_index = last;
396 
397         bus_dmamap_sync(txr->dma.tag, txr->dma.map,
398             BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
399 	/*
400 	 * Advance the Transmit Descriptor Tail (TDT); this tells the
401 	 * hardware that this frame is available to transmit.
402 	 */
403 	++txr->total_packets;
404 	wr32(hw, txr->tail, i);
405 
406 	/* Mark outstanding work */
407 	if (que->busy == 0)
408 		que->busy = 1;
409 	return (0);
410 
411 xmit_fail:
412 	bus_dmamap_unload(tag, buf->map);
413 	return (error);
414 }
415 
416 
417 /*********************************************************************
418  *
419  *  Allocate memory for tx_buffer structures. The tx_buffer stores all
420  *  the information needed to transmit a packet on the wire. This is
421  *  called only once at attach, setup is done every reset.
422  *
423  **********************************************************************/
424 int
425 ixl_allocate_tx_data(struct ixl_queue *que)
426 {
427 	struct tx_ring		*txr = &que->txr;
428 	struct ixl_vsi		*vsi = que->vsi;
429 	device_t		dev = vsi->dev;
430 	struct ixl_tx_buf	*buf;
431 	int			error = 0;
432 
433 	/*
434 	 * Setup DMA descriptor areas.
435 	 */
436 	if ((error = bus_dma_tag_create(NULL,		/* parent */
437 			       1, 0,			/* alignment, bounds */
438 			       BUS_SPACE_MAXADDR,	/* lowaddr */
439 			       BUS_SPACE_MAXADDR,	/* highaddr */
440 			       NULL, NULL,		/* filter, filterarg */
441 			       IXL_TSO_SIZE,		/* maxsize */
442 			       IXL_MAX_TX_SEGS,		/* nsegments */
443 			       PAGE_SIZE,		/* maxsegsize */
444 			       0,			/* flags */
445 			       NULL,			/* lockfunc */
446 			       NULL,			/* lockfuncarg */
447 			       &txr->tx_tag))) {
448 		device_printf(dev,"Unable to allocate TX DMA tag\n");
449 		goto fail;
450 	}
451 
452 	/* Make a special tag for TSO */
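	/* Identical to the tag above except it allows IXL_MAX_TSO_SEGS segments. */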
453 	if ((error = bus_dma_tag_create(NULL,		/* parent */
454 			       1, 0,			/* alignment, bounds */
455 			       BUS_SPACE_MAXADDR,	/* lowaddr */
456 			       BUS_SPACE_MAXADDR,	/* highaddr */
457 			       NULL, NULL,		/* filter, filterarg */
458 			       IXL_TSO_SIZE,		/* maxsize */
459 			       IXL_MAX_TSO_SEGS,	/* nsegments */
460 			       PAGE_SIZE,		/* maxsegsize */
461 			       0,			/* flags */
462 			       NULL,			/* lockfunc */
463 			       NULL,			/* lockfuncarg */
464 			       &txr->tso_tag))) {
465 		device_printf(dev,"Unable to allocate TX TSO DMA tag\n");
466 		goto fail;
467 	}
468 
469 	if (!(txr->buffers =
470 	    (struct ixl_tx_buf *) malloc(sizeof(struct ixl_tx_buf) *
471 	    que->num_desc, M_DEVBUF, M_NOWAIT | M_ZERO))) {
472 		device_printf(dev, "Unable to allocate tx_buffer memory\n");
473 		error = ENOMEM;
474 		goto fail;
475 	}
476 
477         /* Create the descriptor buffer default dma maps */
478 	buf = txr->buffers;
479 	for (int i = 0; i < que->num_desc; i++, buf++) {
480 		buf->tag = txr->tx_tag;
481 		error = bus_dmamap_create(buf->tag, 0, &buf->map);
482 		if (error != 0) {
483 			device_printf(dev, "Unable to create TX DMA map\n");
484 			goto fail;
485 		}
486 	}
487 fail:
488 	return (error);
489 }
490 
491 
492 /*********************************************************************
493  *
494  *  (Re)Initialize a queue transmit ring.
495  *	- called by init, it clears the descriptor ring,
496  *	  and frees any stale mbufs
497  *
498  **********************************************************************/
499 void
500 ixl_init_tx_ring(struct ixl_queue *que)
501 {
502 #ifdef DEV_NETMAP
503 	struct netmap_adapter *na = NA(que->vsi->ifp);
504 	struct netmap_slot *slot;
505 #endif /* DEV_NETMAP */
506 	struct tx_ring		*txr = &que->txr;
507 	struct ixl_tx_buf	*buf;
508 
509 	/* Clear the old ring contents */
510 	IXL_TX_LOCK(txr);
511 
512 #ifdef DEV_NETMAP
513 	/*
514 	 * (under lock): if in netmap mode, do some consistency
515 	 * checks and set slot to entry 0 of the netmap ring.
516 	 */
517 	slot = netmap_reset(na, NR_TX, que->me, 0);
518 #endif /* DEV_NETMAP */
519 
520 	bzero((void *)txr->base,
521 	      (sizeof(struct i40e_tx_desc)) * que->num_desc);
522 
523 	/* Reset indices */
524 	txr->next_avail = 0;
525 	txr->next_to_clean = 0;
526 
527 #ifdef IXL_FDIR
528 	/* Initialize flow director */
529 	txr->atr_rate = ixl_atr_rate;
530 	txr->atr_count = 0;
531 #endif
532 
533 	/* Free any existing tx mbufs. */
534         buf = txr->buffers;
535 	for (int i = 0; i < que->num_desc; i++, buf++) {
536 		if (buf->m_head != NULL) {
537 			bus_dmamap_sync(buf->tag, buf->map,
538 			    BUS_DMASYNC_POSTWRITE);
539 			bus_dmamap_unload(buf->tag, buf->map);
540 			m_freem(buf->m_head);
541 			buf->m_head = NULL;
542 		}
543 #ifdef DEV_NETMAP
544 		/*
545 		 * In netmap mode, set the map for the packet buffer.
546 		 * NOTE: Some drivers (not this one) also need to set
547 		 * the physical buffer address in the NIC ring.
548 		 * netmap_idx_n2k() maps a nic index, i, into the corresponding
549 		 * netmap slot index, si
550 		 */
551 		if (slot) {
552 			int si = netmap_idx_n2k(&na->tx_rings[que->me], i);
553 			netmap_load_map(na, buf->tag, buf->map, NMB(na, slot + si));
554 		}
555 #endif /* DEV_NETMAP */
556 		/* Clear the EOP index */
557 		buf->eop_index = -1;
558         }
559 
560 	/* Set number of descriptors available */
561 	txr->avail = que->num_desc;
562 
563 	bus_dmamap_sync(txr->dma.tag, txr->dma.map,
564 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
565 	IXL_TX_UNLOCK(txr);
566 }
567 
568 
569 /*********************************************************************
570  *
571  *  Free transmit ring related data structures.
572  *
573  **********************************************************************/
574 void
575 ixl_free_que_tx(struct ixl_queue *que)
576 {
577 	struct tx_ring *txr = &que->txr;
578 	struct ixl_tx_buf *buf;
579 
580 	INIT_DBG_IF(que->vsi->ifp, "queue %d: begin", que->me);
581 
582 	for (int i = 0; i < que->num_desc; i++) {
583 		buf = &txr->buffers[i];
584 		if (buf->m_head != NULL) {
585 			bus_dmamap_sync(buf->tag, buf->map,
586 			    BUS_DMASYNC_POSTWRITE);
587 			bus_dmamap_unload(buf->tag,
588 			    buf->map);
589 			m_freem(buf->m_head);
590 			buf->m_head = NULL;
591 			if (buf->map != NULL) {
592 				bus_dmamap_destroy(buf->tag,
593 				    buf->map);
594 				buf->map = NULL;
595 			}
596 		} else if (buf->map != NULL) {
597 			bus_dmamap_unload(buf->tag,
598 			    buf->map);
599 			bus_dmamap_destroy(buf->tag,
600 			    buf->map);
601 			buf->map = NULL;
602 		}
603 	}
604 	if (txr->br != NULL)
605 		buf_ring_free(txr->br, M_DEVBUF);
606 	if (txr->buffers != NULL) {
607 		free(txr->buffers, M_DEVBUF);
608 		txr->buffers = NULL;
609 	}
610 	if (txr->tx_tag != NULL) {
611 		bus_dma_tag_destroy(txr->tx_tag);
612 		txr->tx_tag = NULL;
613 	}
614 	if (txr->tso_tag != NULL) {
615 		bus_dma_tag_destroy(txr->tso_tag);
616 		txr->tso_tag = NULL;
617 	}
618 
619 	INIT_DBG_IF(que->vsi->ifp, "queue %d: end", que->me);
620 	return;
621 }
622 
623 /*********************************************************************
624  *
625  *  Setup descriptor for hw offloads
626  *
627  **********************************************************************/
628 
629 static int
630 ixl_tx_setup_offload(struct ixl_queue *que,
631     struct mbuf *mp, u32 *cmd, u32 *off)
632 {
633 	struct ether_vlan_header	*eh;
634 #ifdef INET
635 	struct ip			*ip = NULL;
636 #endif
637 	struct tcphdr			*th = NULL;
638 #ifdef INET6
639 	struct ip6_hdr			*ip6;
640 #endif
641 	int				elen, ip_hlen = 0, tcp_hlen;
642 	u16				etype;
643 	u8				ipproto = 0;
644 	bool				tso = FALSE;
645 
646 	/* Set up the TSO context descriptor if required */
647 	if (mp->m_pkthdr.csum_flags & CSUM_TSO) {
648 		tso = ixl_tso_setup(que, mp);
649 		if (tso)
650 			++que->tso;
651 		else
652 			return (ENXIO);
653 	}
654 
655 	/*
656 	 * Determine where frame payload starts.
657 	 * Jump over vlan headers if already present,
658 	 * helpful for QinQ too.
659 	 */
660 	eh = mtod(mp, struct ether_vlan_header *);
661 	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
662 		etype = ntohs(eh->evl_proto);
663 		elen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
664 	} else {
665 		etype = ntohs(eh->evl_encap_proto);
666 		elen = ETHER_HDR_LEN;
667 	}
668 
669 	switch (etype) {
670 #ifdef INET
671 		case ETHERTYPE_IP:
672 			ip = (struct ip *)(mp->m_data + elen);
673 			ip_hlen = ip->ip_hl << 2;
674 			ipproto = ip->ip_p;
675 			th = (struct tcphdr *)((caddr_t)ip + ip_hlen);
676 			/* The IP checksum must be recalculated with TSO */
677 			if (tso)
678 				*cmd |= I40E_TX_DESC_CMD_IIPT_IPV4_CSUM;
679 			else
680 				*cmd |= I40E_TX_DESC_CMD_IIPT_IPV4;
681 			break;
682 #endif
683 #ifdef INET6
684 		case ETHERTYPE_IPV6:
685 			ip6 = (struct ip6_hdr *)(mp->m_data + elen);
686 			ip_hlen = sizeof(struct ip6_hdr);
687 			ipproto = ip6->ip6_nxt;
688 			th = (struct tcphdr *)((caddr_t)ip6 + ip_hlen);
689 			*cmd |= I40E_TX_DESC_CMD_IIPT_IPV6;
690 			break;
691 #endif
692 		default:
693 			break;
694 	}
695 
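	/* MACLEN is expressed in 2-byte words, IPLEN and L4LEN in 4-byte dwords. */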
696 	*off |= (elen >> 1) << I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
697 	*off |= (ip_hlen >> 2) << I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
698 
699 	switch (ipproto) {
700 		case IPPROTO_TCP:
701 			tcp_hlen = th->th_off << 2;
702 			if (mp->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_TCP_IPV6)) {
703 				*cmd |= I40E_TX_DESC_CMD_L4T_EOFT_TCP;
704 				*off |= (tcp_hlen >> 2) <<
705 				    I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
706 			}
707 #ifdef IXL_FDIR
708 			ixl_atr(que, th, etype);
709 #endif
710 			break;
711 		case IPPROTO_UDP:
712 			if (mp->m_pkthdr.csum_flags & (CSUM_UDP|CSUM_UDP_IPV6)) {
713 				*cmd |= I40E_TX_DESC_CMD_L4T_EOFT_UDP;
714 				*off |= (sizeof(struct udphdr) >> 2) <<
715 				    I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
716 			}
717 			break;
718 
719 		case IPPROTO_SCTP:
720 			if (mp->m_pkthdr.csum_flags & (CSUM_SCTP|CSUM_SCTP_IPV6)) {
721 				*cmd |= I40E_TX_DESC_CMD_L4T_EOFT_SCTP;
722 				*off |= (sizeof(struct sctphdr) >> 2) <<
723 				    I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
724 			}
725 			/* Fall Thru */
726 		default:
727 			break;
728 	}
729 
730         return (0);
731 }
732 
733 
734 /**********************************************************************
735  *
736  *  Setup context for hardware segmentation offload (TSO)
737  *
738  **********************************************************************/
739 static bool
740 ixl_tso_setup(struct ixl_queue *que, struct mbuf *mp)
741 {
742 	struct tx_ring			*txr = &que->txr;
743 	struct i40e_tx_context_desc	*TXD;
744 	struct ixl_tx_buf		*buf;
745 	u32				cmd, mss, type, tsolen;
746 	u16				etype;
747 	int				idx, elen, ip_hlen, tcp_hlen;
748 	struct ether_vlan_header	*eh;
749 #ifdef INET
750 	struct ip			*ip;
751 #endif
752 #ifdef INET6
753 	struct ip6_hdr			*ip6;
754 #endif
755 #if defined(INET6) || defined(INET)
756 	struct tcphdr			*th;
757 #endif
758 	u64				type_cmd_tso_mss;
759 
760 	/*
761 	 * Determine where frame payload starts.
762 	 * Jump over vlan headers if already present
763 	 */
764 	eh = mtod(mp, struct ether_vlan_header *);
765 	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
766 		elen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
767 		etype = eh->evl_proto;
768 	} else {
769 		elen = ETHER_HDR_LEN;
770 		etype = eh->evl_encap_proto;
771 	}
772 
773         switch (ntohs(etype)) {
774 #ifdef INET6
775 	case ETHERTYPE_IPV6:
776 		ip6 = (struct ip6_hdr *)(mp->m_data + elen);
777 		if (ip6->ip6_nxt != IPPROTO_TCP)
778 			return (FALSE);	/* TSO is only supported for TCP */
779 		ip_hlen = sizeof(struct ip6_hdr);
780 		th = (struct tcphdr *)((caddr_t)ip6 + ip_hlen);
781 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
782 		tcp_hlen = th->th_off << 2;
783 		/*
784 		 * The corresponding flag is set by the stack in the IPv4
785 		 * TSO case, but not in IPv6 (at least in FreeBSD 10.2).
786 		 * So, set it here because the rest of the flow requires it.
787 		 */
788 		mp->m_pkthdr.csum_flags |= CSUM_TCP_IPV6;
789 		break;
790 #endif
791 #ifdef INET
792 	case ETHERTYPE_IP:
793 		ip = (struct ip *)(mp->m_data + elen);
794 		if (ip->ip_p != IPPROTO_TCP)
795 			return (FALSE);	/* TSO is only supported for TCP */
796 		ip->ip_sum = 0;
797 		ip_hlen = ip->ip_hl << 2;
798 		th = (struct tcphdr *)((caddr_t)ip + ip_hlen);
799 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
800 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
801 		tcp_hlen = th->th_off << 2;
802 		break;
803 #endif
804 	default:
805 		printf("%s: CSUM_TSO but no supported IP version (0x%04x)\n",
806 		    __func__, ntohs(etype));
807 		return FALSE;
808         }
809 
810         /* Ensure we have at least the IP+TCP header in the first mbuf. */
811         if (mp->m_len < elen + ip_hlen + sizeof(struct tcphdr))
812 		return FALSE;
813 
814 	idx = txr->next_avail;
815 	buf = &txr->buffers[idx];
816 	TXD = (struct i40e_tx_context_desc *) &txr->base[idx];
817 	tsolen = mp->m_pkthdr.len - (elen + ip_hlen + tcp_hlen);
818 
819 	type = I40E_TX_DESC_DTYPE_CONTEXT;
820 	cmd = I40E_TX_CTX_DESC_TSO;
821 	/* ERJ: this must not be less than 64 */
822 	mss = mp->m_pkthdr.tso_segsz;
823 
824 	type_cmd_tso_mss = ((u64)type << I40E_TXD_CTX_QW1_DTYPE_SHIFT) |
825 	    ((u64)cmd << I40E_TXD_CTX_QW1_CMD_SHIFT) |
826 	    ((u64)tsolen << I40E_TXD_CTX_QW1_TSO_LEN_SHIFT) |
827 	    ((u64)mss << I40E_TXD_CTX_QW1_MSS_SHIFT);
828 	TXD->type_cmd_tso_mss = htole64(type_cmd_tso_mss);
829 
830 	TXD->tunneling_params = htole32(0);
831 	buf->m_head = NULL;
832 	buf->eop_index = -1;
833 
834 	if (++idx == que->num_desc)
835 		idx = 0;
836 
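	/* The context descriptor consumes one ring entry. */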
837 	txr->avail--;
838 	txr->next_avail = idx;
839 
840 	return TRUE;
841 }
842 
843 /*
844 ** ixl_get_tx_head - Retrieve the value from the
845 **    location where the HW records its HEAD index
846 */
847 static inline u32
848 ixl_get_tx_head(struct ixl_queue *que)
849 {
850 	struct tx_ring  *txr = &que->txr;
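	/* The hardware writes its HEAD index into the memory just past the
	 * last descriptor in the ring (head writeback). */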
851 	void *head = &txr->base[que->num_desc];
852 	return LE32_TO_CPU(*(volatile __le32 *)head);
853 }
854 
855 /**********************************************************************
856  *
857  *  Examine each tx_buffer in the used queue. If the hardware is done
858  *  processing the packet then free associated resources. The
859  *  tx_buffer is put back on the free queue.
860  *
861  **********************************************************************/
862 bool
863 ixl_txeof(struct ixl_queue *que)
864 {
865 	struct tx_ring		*txr = &que->txr;
866 	u32			first, last, head, done, processed;
867 	struct ixl_tx_buf	*buf;
868 	struct i40e_tx_desc	*tx_desc, *eop_desc;
869 
870 
871 	mtx_assert(&txr->mtx, MA_OWNED);
872 
873 #ifdef DEV_NETMAP
874 	// XXX todo: implement moderation
875 	if (netmap_tx_irq(que->vsi->ifp, que->me))
876 		return FALSE;
877 #endif /* DEV_NETMAP */
878 
879 	/* These are not the descriptors you seek, move along :) */
880 	if (txr->avail == que->num_desc) {
881 		que->busy = 0;
882 		return FALSE;
883 	}
884 
885 	processed = 0;
886 	first = txr->next_to_clean;
887 	buf = &txr->buffers[first];
888 	tx_desc = (struct i40e_tx_desc *)&txr->base[first];
889 	last = buf->eop_index;
890 	if (last == -1)
891 		return FALSE;
892 	eop_desc = (struct i40e_tx_desc *)&txr->base[last];
893 
894 	/* Get the Head WB value */
895 	head = ixl_get_tx_head(que);
896 
897 	/*
898 	** Get the index of the first descriptor
899 	** BEYOND the EOP and call that 'done'.
900 	** I do this so the comparison in the
901 	** inner while loop below can be simple
902 	*/
903 	if (++last == que->num_desc) last = 0;
904 	done = last;
905 
906         bus_dmamap_sync(txr->dma.tag, txr->dma.map,
907             BUS_DMASYNC_POSTREAD);
908 	/*
909 	** The HEAD index of the ring is written to a
910 	** defined location; this, rather than a done bit,
911 	** is what is used to keep track of what must be
912 	** 'cleaned'.
913 	*/
914 	while (first != head) {
915 		/* We clean the range of the packet */
916 		while (first != done) {
917 			++txr->avail;
918 			++processed;
919 
920 			if (buf->m_head) {
921 				txr->bytes += /* for ITR adjustment */
922 				    buf->m_head->m_pkthdr.len;
923 				txr->tx_bytes += /* for TX stats */
924 				    buf->m_head->m_pkthdr.len;
925 				bus_dmamap_sync(buf->tag,
926 				    buf->map,
927 				    BUS_DMASYNC_POSTWRITE);
928 				bus_dmamap_unload(buf->tag,
929 				    buf->map);
930 				m_freem(buf->m_head);
931 				buf->m_head = NULL;
932 				buf->map = NULL;
933 			}
934 			buf->eop_index = -1;
935 
936 			if (++first == que->num_desc)
937 				first = 0;
938 
939 			buf = &txr->buffers[first];
940 			tx_desc = &txr->base[first];
941 		}
942 		++txr->packets;
943 		/* See if there is more work now */
944 		last = buf->eop_index;
945 		if (last != -1) {
946 			eop_desc = &txr->base[last];
947 			/* Get next done point */
948 			if (++last == que->num_desc) last = 0;
949 			done = last;
950 		} else
951 			break;
952 	}
953 	bus_dmamap_sync(txr->dma.tag, txr->dma.map,
954 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
955 
956 	txr->next_to_clean = first;
957 
958 
959 	/*
960 	** Hang detection: we know there's work
961 	** outstanding, or the first return above
962 	** would have been taken, so indicate an
963 	** unsuccessful pass. In the local timer,
964 	** if this value grows too large the queue
965 	** will be considered hung. If anything has
966 	** been cleaned then reset the state.
967 	*/
968 	if ((processed == 0) && (que->busy != IXL_QUEUE_HUNG))
969 		++que->busy;
970 
971 	if (processed)
972 		que->busy = 1; /* Note this turns off HUNG */
973 
974 	/*
975 	 * If there are no pending descriptors, clear the timeout.
976 	 */
977 	if (txr->avail == que->num_desc) {
978 		que->busy = 0;
979 		return FALSE;
980 	}
981 
982 	return TRUE;
983 }
984 
985 /*********************************************************************
986  *
987  *  Refresh mbuf buffers for RX descriptor rings
988  *   - now keeps its own state so discards due to resource
989  *     exhaustion are unnecessary; if an mbuf cannot be obtained
990  *     it just returns, keeping its placeholder, so it can simply
991  *     be called again to retry later.
992  *
993  **********************************************************************/
994 static void
995 ixl_refresh_mbufs(struct ixl_queue *que, int limit)
996 {
997 	struct ixl_vsi		*vsi = que->vsi;
998 	struct rx_ring		*rxr = &que->rxr;
999 	bus_dma_segment_t	hseg[1];
1000 	bus_dma_segment_t	pseg[1];
1001 	struct ixl_rx_buf	*buf;
1002 	struct mbuf		*mh, *mp;
1003 	int			i, j, nsegs, error;
1004 	bool			refreshed = FALSE;
1005 
1006 	i = j = rxr->next_refresh;
1007 	/* Control the loop with one beyond */
1008 	if (++j == que->num_desc)
1009 		j = 0;
1010 
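	/*
	 * Refresh entries until j (one ahead of i) reaches 'limit',
	 * i.e. stop one descriptor short so next_refresh never laps
	 * the caller-supplied limit.
	 */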
1011 	while (j != limit) {
1012 		buf = &rxr->buffers[i];
1013 		if (rxr->hdr_split == FALSE)
1014 			goto no_split;
1015 
1016 		if (buf->m_head == NULL) {
1017 			mh = m_gethdr(M_NOWAIT, MT_DATA);
1018 			if (mh == NULL)
1019 				goto update;
1020 		} else
1021 			mh = buf->m_head;
1022 
1023 		mh->m_pkthdr.len = mh->m_len = MHLEN;
1024 		mh->m_len = MHLEN;
1025 		mh->m_flags |= M_PKTHDR;
1026 		/* Get the memory mapping */
1027 		error = bus_dmamap_load_mbuf_sg(rxr->htag,
1028 		    buf->hmap, mh, hseg, &nsegs, BUS_DMA_NOWAIT);
1029 		if (error != 0) {
1030 			printf("Refresh mbufs: hdr dmamap load"
1031 			    " failure - %d\n", error);
1032 			m_free(mh);
1033 			buf->m_head = NULL;
1034 			goto update;
1035 		}
1036 		buf->m_head = mh;
1037 		bus_dmamap_sync(rxr->htag, buf->hmap,
1038 		    BUS_DMASYNC_PREREAD);
1039 		rxr->base[i].read.hdr_addr =
1040 		   htole64(hseg[0].ds_addr);
1041 
1042 no_split:
1043 		if (buf->m_pack == NULL) {
1044 			mp = m_getjcl(M_NOWAIT, MT_DATA,
1045 			    M_PKTHDR, rxr->mbuf_sz);
1046 			if (mp == NULL)
1047 				goto update;
1048 		} else
1049 			mp = buf->m_pack;
1050 
1051 		mp->m_pkthdr.len = mp->m_len = rxr->mbuf_sz;
1052 		/* Get the memory mapping */
1053 		error = bus_dmamap_load_mbuf_sg(rxr->ptag,
1054 		    buf->pmap, mp, pseg, &nsegs, BUS_DMA_NOWAIT);
1055 		if (error != 0) {
1056 			printf("Refresh mbufs: payload dmamap load"
1057 			    " failure - %d\n", error);
1058 			m_free(mp);
1059 			buf->m_pack = NULL;
1060 			goto update;
1061 		}
1062 		buf->m_pack = mp;
1063 		bus_dmamap_sync(rxr->ptag, buf->pmap,
1064 		    BUS_DMASYNC_PREREAD);
1065 		rxr->base[i].read.pkt_addr =
1066 		   htole64(pseg[0].ds_addr);
1067 		/* Used only when doing header split */
1068 		rxr->base[i].read.hdr_addr = 0;
1069 
1070 		refreshed = TRUE;
1071 		/* Next is precalculated */
1072 		i = j;
1073 		rxr->next_refresh = i;
1074 		if (++j == que->num_desc)
1075 			j = 0;
1076 	}
1077 update:
1078 	if (refreshed) /* Update hardware tail index */
1079 		wr32(vsi->hw, rxr->tail, rxr->next_refresh);
1080 	return;
1081 }
1082 
1083 
1084 /*********************************************************************
1085  *
1086  *  Allocate memory for rx_buffer structures. Since we use one
1087  *  rx_buffer per descriptor, the maximum number of rx_buffer's
1088  *  rx_buffer per descriptor, the maximum number of rx_buffers
1089  *  that we've defined.
1090  *
1091  **********************************************************************/
1092 int
1093 ixl_allocate_rx_data(struct ixl_queue *que)
1094 {
1095 	struct rx_ring		*rxr = &que->rxr;
1096 	struct ixl_vsi		*vsi = que->vsi;
1097 	device_t 		dev = vsi->dev;
1098 	struct ixl_rx_buf 	*buf;
1099 	int             	i, bsize, error;
1100 
1101 	bsize = sizeof(struct ixl_rx_buf) * que->num_desc;
1102 	if (!(rxr->buffers =
1103 	    (struct ixl_rx_buf *) malloc(bsize,
1104 	    M_DEVBUF, M_NOWAIT | M_ZERO))) {
1105 		device_printf(dev, "Unable to allocate rx_buffer memory\n");
1106 		error = ENOMEM;
1107 		return (error);
1108 	}
1109 
1110 	if ((error = bus_dma_tag_create(NULL,	/* parent */
1111 				   1, 0,	/* alignment, bounds */
1112 				   BUS_SPACE_MAXADDR,	/* lowaddr */
1113 				   BUS_SPACE_MAXADDR,	/* highaddr */
1114 				   NULL, NULL,		/* filter, filterarg */
1115 				   MSIZE,		/* maxsize */
1116 				   1,			/* nsegments */
1117 				   MSIZE,		/* maxsegsize */
1118 				   0,			/* flags */
1119 				   NULL,		/* lockfunc */
1120 				   NULL,		/* lockfuncarg */
1121 				   &rxr->htag))) {
1122 		device_printf(dev, "Unable to create RX DMA htag\n");
1123 		return (error);
1124 	}
1125 
1126 	if ((error = bus_dma_tag_create(NULL,	/* parent */
1127 				   1, 0,	/* alignment, bounds */
1128 				   BUS_SPACE_MAXADDR,	/* lowaddr */
1129 				   BUS_SPACE_MAXADDR,	/* highaddr */
1130 				   NULL, NULL,		/* filter, filterarg */
1131 				   MJUM16BYTES,		/* maxsize */
1132 				   1,			/* nsegments */
1133 				   MJUM16BYTES,		/* maxsegsize */
1134 				   0,			/* flags */
1135 				   NULL,		/* lockfunc */
1136 				   NULL,		/* lockfuncarg */
1137 				   &rxr->ptag))) {
1138 		device_printf(dev, "Unable to create RX DMA ptag\n");
1139 		return (error);
1140 	}
1141 
1142 	for (i = 0; i < que->num_desc; i++) {
1143 		buf = &rxr->buffers[i];
1144 		error = bus_dmamap_create(rxr->htag,
1145 		    BUS_DMA_NOWAIT, &buf->hmap);
1146 		if (error) {
1147 			device_printf(dev, "Unable to create RX head map\n");
1148 			break;
1149 		}
1150 		error = bus_dmamap_create(rxr->ptag,
1151 		    BUS_DMA_NOWAIT, &buf->pmap);
1152 		if (error) {
1153 			device_printf(dev, "Unable to create RX pkt map\n");
1154 			break;
1155 		}
1156 	}
1157 
1158 	return (error);
1159 }
1160 
1161 
1162 /*********************************************************************
1163  *
1164  *  (Re)Initialize the queue receive ring and its buffers.
1165  *
1166  **********************************************************************/
1167 int
1168 ixl_init_rx_ring(struct ixl_queue *que)
1169 {
1170 	struct	rx_ring 	*rxr = &que->rxr;
1171 	struct ixl_vsi		*vsi = que->vsi;
1172 #if defined(INET6) || defined(INET)
1173 	struct ifnet		*ifp = vsi->ifp;
1174 	struct lro_ctrl		*lro = &rxr->lro;
1175 #endif
1176 	struct ixl_rx_buf	*buf;
1177 	bus_dma_segment_t	pseg[1], hseg[1];
1178 	int			rsize, nsegs, error = 0;
1179 #ifdef DEV_NETMAP
1180 	struct netmap_adapter *na = NA(que->vsi->ifp);
1181 	struct netmap_slot *slot;
1182 #endif /* DEV_NETMAP */
1183 
1184 	IXL_RX_LOCK(rxr);
1185 #ifdef DEV_NETMAP
1186 	/* same as in ixl_init_tx_ring() */
1187 	slot = netmap_reset(na, NR_RX, que->me, 0);
1188 #endif /* DEV_NETMAP */
1189 	/* Clear the ring contents */
1190 	rsize = roundup2(que->num_desc *
1191 	    sizeof(union i40e_rx_desc), DBA_ALIGN);
1192 	bzero((void *)rxr->base, rsize);
1193 	/* Cleanup any existing buffers */
1194 	for (int i = 0; i < que->num_desc; i++) {
1195 		buf = &rxr->buffers[i];
1196 		if (buf->m_head != NULL) {
1197 			bus_dmamap_sync(rxr->htag, buf->hmap,
1198 			    BUS_DMASYNC_POSTREAD);
1199 			bus_dmamap_unload(rxr->htag, buf->hmap);
1200 			buf->m_head->m_flags |= M_PKTHDR;
1201 			m_freem(buf->m_head);
1202 		}
1203 		if (buf->m_pack != NULL) {
1204 			bus_dmamap_sync(rxr->ptag, buf->pmap,
1205 			    BUS_DMASYNC_POSTREAD);
1206 			bus_dmamap_unload(rxr->ptag, buf->pmap);
1207 			buf->m_pack->m_flags |= M_PKTHDR;
1208 			m_freem(buf->m_pack);
1209 		}
1210 		buf->m_head = NULL;
1211 		buf->m_pack = NULL;
1212 	}
1213 
1214 	/* header split is off */
1215 	rxr->hdr_split = FALSE;
1216 
1217 	/* Now replenish the mbufs */
1218 	for (int j = 0; j != que->num_desc; ++j) {
1219 		struct mbuf	*mh, *mp;
1220 
1221 		buf = &rxr->buffers[j];
1222 #ifdef DEV_NETMAP
1223 		/*
1224 		 * In netmap mode, fill the map and set the buffer
1225 		 * address in the NIC ring, considering the offset
1226 		 * between the netmap and NIC rings (see comment in
1227 		 * ixgbe_setup_transmit_ring() ). No need to allocate
1228 		 * an mbuf, so end the block with a continue;
1229 		 */
1230 		if (slot) {
1231 			int sj = netmap_idx_n2k(&na->rx_rings[que->me], j);
1232 			uint64_t paddr;
1233 			void *addr;
1234 
1235 			addr = PNMB(na, slot + sj, &paddr);
1236 			netmap_load_map(na, rxr->dma.tag, buf->pmap, addr);
1237 			/* Update descriptor and the cached value */
1238 			rxr->base[j].read.pkt_addr = htole64(paddr);
1239 			rxr->base[j].read.hdr_addr = 0;
1240 			continue;
1241 		}
1242 #endif /* DEV_NETMAP */
1243 		/*
1244 		** Don't allocate a header mbuf if not
1245 		** doing header split; it's wasteful
1246 		*/
1247 		if (rxr->hdr_split == FALSE)
1248 			goto skip_head;
1249 
1250 		/* First the header */
1251 		buf->m_head = m_gethdr(M_NOWAIT, MT_DATA);
1252 		if (buf->m_head == NULL) {
1253 			error = ENOBUFS;
1254 			goto fail;
1255 		}
1256 		m_adj(buf->m_head, ETHER_ALIGN);
1257 		mh = buf->m_head;
1258 		mh->m_len = mh->m_pkthdr.len = MHLEN;
1259 		mh->m_flags |= M_PKTHDR;
1260 		/* Get the memory mapping */
1261 		error = bus_dmamap_load_mbuf_sg(rxr->htag,
1262 		    buf->hmap, buf->m_head, hseg,
1263 		    &nsegs, BUS_DMA_NOWAIT);
1264 		if (error != 0) /* Nothing elegant to do here */
1265 			goto fail;
1266 		bus_dmamap_sync(rxr->htag,
1267 		    buf->hmap, BUS_DMASYNC_PREREAD);
1268 		/* Update descriptor */
1269 		rxr->base[j].read.hdr_addr = htole64(hseg[0].ds_addr);
1270 
1271 skip_head:
1272 		/* Now the payload cluster */
1273 		buf->m_pack = m_getjcl(M_NOWAIT, MT_DATA,
1274 		    M_PKTHDR, rxr->mbuf_sz);
1275 		if (buf->m_pack == NULL) {
1276 			error = ENOBUFS;
1277                         goto fail;
1278 		}
1279 		mp = buf->m_pack;
1280 		mp->m_pkthdr.len = mp->m_len = rxr->mbuf_sz;
1281 		/* Get the memory mapping */
1282 		error = bus_dmamap_load_mbuf_sg(rxr->ptag,
1283 		    buf->pmap, mp, pseg,
1284 		    &nsegs, BUS_DMA_NOWAIT);
1285 		if (error != 0)
1286                         goto fail;
1287 		bus_dmamap_sync(rxr->ptag,
1288 		    buf->pmap, BUS_DMASYNC_PREREAD);
1289 		/* Update descriptor */
1290 		rxr->base[j].read.pkt_addr = htole64(pseg[0].ds_addr);
1291 		rxr->base[j].read.hdr_addr = 0;
1292 	}
1293 
1294 
1295 	/* Setup our descriptor indices */
1296 	rxr->next_check = 0;
1297 	rxr->next_refresh = 0;
1298 	rxr->lro_enabled = FALSE;
1299 	rxr->split = 0;
1300 	rxr->bytes = 0;
1301 	rxr->discard = FALSE;
1302 
1303 	wr32(vsi->hw, rxr->tail, que->num_desc - 1);
1304 	ixl_flush(vsi->hw);
1305 
1306 #if defined(INET6) || defined(INET)
1307 	/*
1308 	** Now set up the LRO interface:
1309 	*/
1310 	if (ifp->if_capenable & IFCAP_LRO) {
1311 		int err = tcp_lro_init(lro);
1312 		if (err) {
1313 			if_printf(ifp, "queue %d: LRO Initialization failed!\n", que->me);
1314 			goto fail;
1315 		}
1316 		INIT_DBG_IF(ifp, "queue %d: RX Soft LRO Initialized", que->me);
1317 		rxr->lro_enabled = TRUE;
1318 		lro->ifp = vsi->ifp;
1319 	}
1320 #endif
1321 
1322 	bus_dmamap_sync(rxr->dma.tag, rxr->dma.map,
1323 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
1324 
1325 fail:
1326 	IXL_RX_UNLOCK(rxr);
1327 	return (error);
1328 }
1329 
1330 
1331 /*********************************************************************
1332  *
1333  *  Free station receive ring data structures
1334  *
1335  **********************************************************************/
1336 void
1337 ixl_free_que_rx(struct ixl_queue *que)
1338 {
1339 	struct rx_ring		*rxr = &que->rxr;
1340 	struct ixl_rx_buf	*buf;
1341 
1342 	INIT_DBG_IF(que->vsi->ifp, "queue %d: begin", que->me);
1343 
1344 	/* Cleanup any existing buffers */
1345 	if (rxr->buffers != NULL) {
1346 		for (int i = 0; i < que->num_desc; i++) {
1347 			buf = &rxr->buffers[i];
1348 			if (buf->m_head != NULL) {
1349 				bus_dmamap_sync(rxr->htag, buf->hmap,
1350 				    BUS_DMASYNC_POSTREAD);
1351 				bus_dmamap_unload(rxr->htag, buf->hmap);
1352 				buf->m_head->m_flags |= M_PKTHDR;
1353 				m_freem(buf->m_head);
1354 			}
1355 			if (buf->m_pack != NULL) {
1356 				bus_dmamap_sync(rxr->ptag, buf->pmap,
1357 				    BUS_DMASYNC_POSTREAD);
1358 				bus_dmamap_unload(rxr->ptag, buf->pmap);
1359 				buf->m_pack->m_flags |= M_PKTHDR;
1360 				m_freem(buf->m_pack);
1361 			}
1362 			buf->m_head = NULL;
1363 			buf->m_pack = NULL;
1364 			if (buf->hmap != NULL) {
1365 				bus_dmamap_destroy(rxr->htag, buf->hmap);
1366 				buf->hmap = NULL;
1367 			}
1368 			if (buf->pmap != NULL) {
1369 				bus_dmamap_destroy(rxr->ptag, buf->pmap);
1370 				buf->pmap = NULL;
1371 			}
1372 		}
1373 		if (rxr->buffers != NULL) {
1374 			free(rxr->buffers, M_DEVBUF);
1375 			rxr->buffers = NULL;
1376 		}
1377 	}
1378 
1379 	if (rxr->htag != NULL) {
1380 		bus_dma_tag_destroy(rxr->htag);
1381 		rxr->htag = NULL;
1382 	}
1383 	if (rxr->ptag != NULL) {
1384 		bus_dma_tag_destroy(rxr->ptag);
1385 		rxr->ptag = NULL;
1386 	}
1387 
1388 	INIT_DBG_IF(que->vsi->ifp, "queue %d: end", que->me);
1389 	return;
1390 }
1391 
1392 static inline void
1393 ixl_rx_input(struct rx_ring *rxr, struct ifnet *ifp, struct mbuf *m, u8 ptype)
1394 {
1395 
1396 #if defined(INET6) || defined(INET)
1397         /*
1398          * At the moment LRO is only used for IPv4/TCP packets whose TCP
1399          * checksum has been computed by hardware, and which must not carry
1400          * a VLAN tag in the Ethernet header.
1401          */
1402         if (rxr->lro_enabled &&
1403             (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) != 0 &&
1404             (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) ==
1405             (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) {
1406                 /*
1407                  * Send to the stack if:
1408                  *  - LRO not enabled, or
1409                  *  - no LRO resources, or
1410                  *  - lro enqueue fails
1411                  */
1412                 if (rxr->lro.lro_cnt != 0)
1413                         if (tcp_lro_rx(&rxr->lro, m, 0) == 0)
1414                                 return;
1415         }
1416 #endif
1417 	IXL_RX_UNLOCK(rxr);
1418         (*ifp->if_input)(ifp, m);
1419 	IXL_RX_LOCK(rxr);
1420 }
1421 
1422 
1423 static inline void
1424 ixl_rx_discard(struct rx_ring *rxr, int i)
1425 {
1426 	struct ixl_rx_buf	*rbuf;
1427 
1428 	rbuf = &rxr->buffers[i];
1429 
1430         if (rbuf->fmp != NULL) {/* Partial chain ? */
1431 		rbuf->fmp->m_flags |= M_PKTHDR;
1432                 m_freem(rbuf->fmp);
1433                 rbuf->fmp = NULL;
1434 	}
1435 
1436 	/*
1437 	** With advanced descriptors the writeback
1438 	** clobbers the buffer addrs, so it's easier
1439 	** to just free the existing mbufs and take
1440 	** the normal refresh path to get new buffers
1441 	** and mapping.
1442 	*/
1443 	if (rbuf->m_head) {
1444 		m_free(rbuf->m_head);
1445 		rbuf->m_head = NULL;
1446 	}
1447 
1448 	if (rbuf->m_pack) {
1449 		m_free(rbuf->m_pack);
1450 		rbuf->m_pack = NULL;
1451 	}
1452 
1453 	return;
1454 }
1455 
1456 #ifdef RSS
1457 /*
1458 ** ixl_ptype_to_hash: parse the packet type
1459 ** to determine the appropriate hash.
1460 */
1461 static inline int
1462 ixl_ptype_to_hash(u8 ptype)
1463 {
1464         struct i40e_rx_ptype_decoded	decoded;
1465 	u8				ex = 0;
1466 
1467 	decoded = decode_rx_desc_ptype(ptype);
1468 	ex = decoded.outer_frag;
1469 
1470 	if (!decoded.known)
1471 		return M_HASHTYPE_OPAQUE_HASH;
1472 
1473 	if (decoded.outer_ip == I40E_RX_PTYPE_OUTER_L2)
1474 		return M_HASHTYPE_OPAQUE_HASH;
1475 
1476 	/* Note: anything that gets to this point is IP */
1477         if (decoded.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV6) {
1478 		switch (decoded.inner_prot) {
1479 			case I40E_RX_PTYPE_INNER_PROT_TCP:
1480 				if (ex)
1481 					return M_HASHTYPE_RSS_TCP_IPV6_EX;
1482 				else
1483 					return M_HASHTYPE_RSS_TCP_IPV6;
1484 			case I40E_RX_PTYPE_INNER_PROT_UDP:
1485 				if (ex)
1486 					return M_HASHTYPE_RSS_UDP_IPV6_EX;
1487 				else
1488 					return M_HASHTYPE_RSS_UDP_IPV6;
1489 			default:
1490 				if (ex)
1491 					return M_HASHTYPE_RSS_IPV6_EX;
1492 				else
1493 					return M_HASHTYPE_RSS_IPV6;
1494 		}
1495 	}
1496         if (decoded.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV4) {
1497 		switch (decoded.inner_prot) {
1498 			case I40E_RX_PTYPE_INNER_PROT_TCP:
1499 					return M_HASHTYPE_RSS_TCP_IPV4;
1500 			case I40E_RX_PTYPE_INNER_PROT_UDP:
1501 				if (ex)
1502 					return M_HASHTYPE_RSS_UDP_IPV4_EX;
1503 				else
1504 					return M_HASHTYPE_RSS_UDP_IPV4;
1505 			default:
1506 					return M_HASHTYPE_RSS_IPV4;
1507 		}
1508 	}
1509 	/* We should never get here!! */
1510 	return M_HASHTYPE_OPAQUE_HASH;
1511 }
1512 #endif /* RSS */
1513 
1514 /*********************************************************************
1515  *
1516  *  This routine executes in interrupt context. It replenishes
1517  *  the mbufs in the descriptor ring and sends data which has been
1518  *  DMA'd into host memory to the upper layer.
1519  *
1520  *  We loop at most count times if count is > 0, or until done if
1521  *  count < 0.
1522  *
1523  *  Return TRUE for more work, FALSE for all clean.
1524  *********************************************************************/
1525 bool
1526 ixl_rxeof(struct ixl_queue *que, int count)
1527 {
1528 	struct ixl_vsi		*vsi = que->vsi;
1529 	struct rx_ring		*rxr = &que->rxr;
1530 	struct ifnet		*ifp = vsi->ifp;
1531 #if defined(INET6) || defined(INET)
1532 	struct lro_ctrl		*lro = &rxr->lro;
1533 #endif
1534 	int			i, nextp, processed = 0;
1535 	union i40e_rx_desc	*cur;
1536 	struct ixl_rx_buf	*rbuf, *nbuf;
1537 
1538 
1539 	IXL_RX_LOCK(rxr);
1540 
1541 #ifdef DEV_NETMAP
1542 	if (netmap_rx_irq(ifp, que->me, &count)) {
1543 		IXL_RX_UNLOCK(rxr);
1544 		return (FALSE);
1545 	}
1546 #endif /* DEV_NETMAP */
1547 
1548 	for (i = rxr->next_check; count != 0;) {
1549 		struct mbuf	*sendmp, *mh, *mp;
1550 		u32		status, error;
1551 		u16		hlen, plen, vtag;
1552 		u64		qword;
1553 		u8		ptype;
1554 		bool		eop;
1555 
1556 		/* Sync the ring. */
1557 		bus_dmamap_sync(rxr->dma.tag, rxr->dma.map,
1558 		    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
1559 
1560 		cur = &rxr->base[i];
1561 		qword = le64toh(cur->wb.qword1.status_error_len);
1562 		status = (qword & I40E_RXD_QW1_STATUS_MASK)
1563 		    >> I40E_RXD_QW1_STATUS_SHIFT;
1564 		error = (qword & I40E_RXD_QW1_ERROR_MASK)
1565 		    >> I40E_RXD_QW1_ERROR_SHIFT;
1566 		plen = (qword & I40E_RXD_QW1_LENGTH_PBUF_MASK)
1567 		    >> I40E_RXD_QW1_LENGTH_PBUF_SHIFT;
1568 		hlen = (qword & I40E_RXD_QW1_LENGTH_HBUF_MASK)
1569 		    >> I40E_RXD_QW1_LENGTH_HBUF_SHIFT;
1570 		ptype = (qword & I40E_RXD_QW1_PTYPE_MASK)
1571 		    >> I40E_RXD_QW1_PTYPE_SHIFT;
1572 
1573 		if ((status & (1 << I40E_RX_DESC_STATUS_DD_SHIFT)) == 0) {
1574 			++rxr->not_done;
1575 			break;
1576 		}
1577 		if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
1578 			break;
1579 
1580 		count--;
1581 		sendmp = NULL;
1582 		nbuf = NULL;
1583 		cur->wb.qword1.status_error_len = 0;
1584 		rbuf = &rxr->buffers[i];
1585 		mh = rbuf->m_head;
1586 		mp = rbuf->m_pack;
1587 		eop = (status & (1 << I40E_RX_DESC_STATUS_EOF_SHIFT));
1588 		if (status & (1 << I40E_RX_DESC_STATUS_L2TAG1P_SHIFT))
1589 			vtag = le16toh(cur->wb.qword0.lo_dword.l2tag1);
1590 		else
1591 			vtag = 0;
1592 
1593 		/*
1594 		** Make sure bad packets are discarded;
1595 		** note that only the EOP descriptor has
1596 		** valid error results.
1597 		*/
1598                 if (eop && (error & (1 << I40E_RX_DESC_ERROR_RXE_SHIFT))) {
1599 			rxr->desc_errs++;
1600 			ixl_rx_discard(rxr, i);
1601 			goto next_desc;
1602 		}
1603 
1604 		/* Prefetch the next buffer */
1605 		if (!eop) {
1606 			nextp = i + 1;
1607 			if (nextp == que->num_desc)
1608 				nextp = 0;
1609 			nbuf = &rxr->buffers[nextp];
1610 			prefetch(nbuf);
1611 		}
1612 
1613 		/*
1614 		** The header mbuf is ONLY used when header
1615 		** split is enabled, otherwise we get normal
1616 		** behavior, i.e., both header and payload
1617 		** are DMA'd into the payload buffer.
1618 		**
1619 		** Rather than using the fmp/lmp global pointers
1620 		** we now keep the head of a packet chain in the
1621 		** buffer struct and pass this along from one
1622 		** descriptor to the next, until we get EOP.
1623 		*/
1624 		if (rxr->hdr_split && (rbuf->fmp == NULL)) {
1625 			if (hlen > IXL_RX_HDR)
1626 				hlen = IXL_RX_HDR;
1627 			mh->m_len = hlen;
1628 			mh->m_flags |= M_PKTHDR;
1629 			mh->m_next = NULL;
1630 			mh->m_pkthdr.len = mh->m_len;
1631 			/* Null buf pointer so it is refreshed */
1632 			rbuf->m_head = NULL;
1633 			/*
1634 			** Check the payload length, this
1635 			** could be zero if it's a small
1636 			** packet.
1637 			*/
1638 			if (plen > 0) {
1639 				mp->m_len = plen;
1640 				mp->m_next = NULL;
1641 				mp->m_flags &= ~M_PKTHDR;
1642 				mh->m_next = mp;
1643 				mh->m_pkthdr.len += mp->m_len;
1644 				/* Null buf pointer so it is refreshed */
1645 				rbuf->m_pack = NULL;
1646 				rxr->split++;
1647 			}
1648 			/*
1649 			** Now create the forward
1650 			** chain so when complete
1651 			** we won't have to.
1652 			*/
1653                         if (eop == 0) {
1654 				/* stash the chain head */
1655                                 nbuf->fmp = mh;
1656 				/* Make forward chain */
1657                                 if (plen)
1658                                         mp->m_next = nbuf->m_pack;
1659                                 else
1660                                         mh->m_next = nbuf->m_pack;
1661                         } else {
1662 				/* Singlet, prepare to send */
1663                                 sendmp = mh;
1664                                 if (vtag) {
1665                                         sendmp->m_pkthdr.ether_vtag = vtag;
1666                                         sendmp->m_flags |= M_VLANTAG;
1667                                 }
1668                         }
1669 		} else {
1670 			/*
1671 			** Either no header split, or a
1672 			** secondary piece of a fragmented
1673 			** split packet.
1674 			*/
1675 			mp->m_len = plen;
1676 			/*
1677 			** See if there is a stored head
1678 			** that determines what we are
1679 			*/
1680 			sendmp = rbuf->fmp;
1681 			rbuf->m_pack = rbuf->fmp = NULL;
1682 
1683 			if (sendmp != NULL) /* secondary frag */
1684 				sendmp->m_pkthdr.len += mp->m_len;
1685 			else {
1686 				/* first desc of a non-ps chain */
1687 				sendmp = mp;
1688 				sendmp->m_flags |= M_PKTHDR;
1689 				sendmp->m_pkthdr.len = mp->m_len;
1690                         }
1691 			/* Pass the head pointer on */
1692 			if (eop == 0) {
1693 				nbuf->fmp = sendmp;
1694 				sendmp = NULL;
1695 				mp->m_next = nbuf->m_pack;
1696 			}
1697 		}
1698 		++processed;
1699 		/* Sending this frame? */
1700 		if (eop) {
1701 			sendmp->m_pkthdr.rcvif = ifp;
1702 			/* gather stats */
1703 			rxr->rx_packets++;
1704 			rxr->rx_bytes += sendmp->m_pkthdr.len;
1705 			/* capture data for dynamic ITR adjustment */
1706 			rxr->packets++;
1707 			rxr->bytes += sendmp->m_pkthdr.len;
1708 			/* Set VLAN tag (field only valid in eop desc) */
1709 			if (vtag) {
1710 				sendmp->m_pkthdr.ether_vtag = vtag;
1711 				sendmp->m_flags |= M_VLANTAG;
1712 			}
1713 			if ((ifp->if_capenable & IFCAP_RXCSUM) != 0)
1714 				ixl_rx_checksum(sendmp, status, error, ptype);
1715 #ifdef RSS
1716 			sendmp->m_pkthdr.flowid =
1717 			    le32toh(cur->wb.qword0.hi_dword.rss);
1718 			M_HASHTYPE_SET(sendmp, ixl_ptype_to_hash(ptype));
1719 #else
1720 			sendmp->m_pkthdr.flowid = que->msix;
1721 			M_HASHTYPE_SET(sendmp, M_HASHTYPE_OPAQUE);
1722 #endif
1723 		}
1724 next_desc:
1725 		bus_dmamap_sync(rxr->dma.tag, rxr->dma.map,
1726 		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
1727 
1728 		/* Advance our pointers to the next descriptor. */
1729 		if (++i == que->num_desc)
1730 			i = 0;
1731 
1732 		/* Now send to the stack or do LRO */
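		/*
		 * ixl_rx_input() drops the RX lock around if_input(),
		 * so stash our position in next_check and reload it
		 * after the call.
		 */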
1733 		if (sendmp != NULL) {
1734 			rxr->next_check = i;
1735 			ixl_rx_input(rxr, ifp, sendmp, ptype);
1736 			i = rxr->next_check;
1737 		}
1738 
1739                /* Every 8 descriptors we go to refresh mbufs */
1740 		if (processed == 8) {
1741 			ixl_refresh_mbufs(que, i);
1742 			processed = 0;
1743 		}
1744 	}
1745 
1746 	/* Refresh any remaining buf structs */
1747 	if (ixl_rx_unrefreshed(que))
1748 		ixl_refresh_mbufs(que, i);
1749 
1750 	rxr->next_check = i;
1751 
1752 #if defined(INET6) || defined(INET)
1753 	/*
1754 	 * Flush any outstanding LRO work
1755 	 */
1756 	tcp_lro_flush_all(lro);
1757 #endif
1758 
1759 	IXL_RX_UNLOCK(rxr);
1760 	return (FALSE);
1761 }
1762 
1763 
1764 /*********************************************************************
1765  *
1766  *  Verify that the hardware indicated that the checksum is valid.
1767  *  Inform the stack about the status of the checksum so that the
1768  *  stack doesn't spend time verifying it.
1769  *
1770  *********************************************************************/
1771 static void
1772 ixl_rx_checksum(struct mbuf * mp, u32 status, u32 error, u8 ptype)
1773 {
1774 	struct i40e_rx_ptype_decoded decoded;
1775 
1776 	decoded = decode_rx_desc_ptype(ptype);
1777 
1778 	/* Errors? */
1779  	if (error & ((1 << I40E_RX_DESC_ERROR_IPE_SHIFT) |
1780 	    (1 << I40E_RX_DESC_ERROR_L4E_SHIFT))) {
1781 		mp->m_pkthdr.csum_flags = 0;
1782 		return;
1783 	}
1784 
1785 	/* IPv6 packets with extension headers likely have a bad csum */
1786 	if (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
1787 	    decoded.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV6)
1788 		if (status &
1789 		    (1 << I40E_RX_DESC_STATUS_IPV6EXADD_SHIFT)) {
1790 			mp->m_pkthdr.csum_flags = 0;
1791 			return;
1792 		}
1793 
1794 
1795 	/* IP Checksum Good */
1796 	mp->m_pkthdr.csum_flags = CSUM_IP_CHECKED;
1797 	mp->m_pkthdr.csum_flags |= CSUM_IP_VALID;
1798 
1799 	if (status & (1 << I40E_RX_DESC_STATUS_L3L4P_SHIFT)) {
1800 		mp->m_pkthdr.csum_flags |=
1801 		    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
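		/* 0xffff in csum_data tells the stack the pseudo-header
		 * checksum has already been verified. */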
1802 		mp->m_pkthdr.csum_data |= htons(0xffff);
1803 	}
1804 	return;
1805 }
1806 
1807 #if __FreeBSD_version >= 1100000
1808 uint64_t
1809 ixl_get_counter(if_t ifp, ift_counter cnt)
1810 {
1811 	struct ixl_vsi *vsi;
1812 
1813 	vsi = if_getsoftc(ifp);
1814 
1815 	switch (cnt) {
1816 	case IFCOUNTER_IPACKETS:
1817 		return (vsi->ipackets);
1818 	case IFCOUNTER_IERRORS:
1819 		return (vsi->ierrors);
1820 	case IFCOUNTER_OPACKETS:
1821 		return (vsi->opackets);
1822 	case IFCOUNTER_OERRORS:
1823 		return (vsi->oerrors);
1824 	case IFCOUNTER_COLLISIONS:
1825 		/* Collisions are by standard impossible in 40G/10G Ethernet */
1826 		return (0);
1827 	case IFCOUNTER_IBYTES:
1828 		return (vsi->ibytes);
1829 	case IFCOUNTER_OBYTES:
1830 		return (vsi->obytes);
1831 	case IFCOUNTER_IMCASTS:
1832 		return (vsi->imcasts);
1833 	case IFCOUNTER_OMCASTS:
1834 		return (vsi->omcasts);
1835 	case IFCOUNTER_IQDROPS:
1836 		return (vsi->iqdrops);
1837 	case IFCOUNTER_OQDROPS:
1838 		return (vsi->oqdrops);
1839 	case IFCOUNTER_NOPROTO:
1840 		return (vsi->noproto);
1841 	default:
1842 		return (if_get_counter_default(ifp, cnt));
1843 	}
1844 }
1845 #endif
1846 
1847