xref: /freebsd/sys/dev/ixl/ixl_txrx.c (revision 15c433351f54e7cd5bec8d36c8e89e6a7fa55b26)
1 /******************************************************************************
2 
3   Copyright (c) 2013-2015, Intel Corporation
4   All rights reserved.
5 
6   Redistribution and use in source and binary forms, with or without
7   modification, are permitted provided that the following conditions are met:
8 
9    1. Redistributions of source code must retain the above copyright notice,
10       this list of conditions and the following disclaimer.
11 
12    2. Redistributions in binary form must reproduce the above copyright
13       notice, this list of conditions and the following disclaimer in the
14       documentation and/or other materials provided with the distribution.
15 
16    3. Neither the name of the Intel Corporation nor the names of its
17       contributors may be used to endorse or promote products derived from
18       this software without specific prior written permission.
19 
20   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
24   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30   POSSIBILITY OF SUCH DAMAGE.
31 
32 ******************************************************************************/
33 /*$FreeBSD$*/
34 
35 /*
36 **	IXL driver TX/RX Routines:
37 **	    This was separated to allow usage by
38 ** 	    both the BASE and the VF drivers.
39 */
40 
41 #ifndef IXL_STANDALONE_BUILD
42 #include "opt_inet.h"
43 #include "opt_inet6.h"
44 #include "opt_rss.h"
45 #endif
46 
47 #include "ixl.h"
48 
49 #ifdef RSS
50 #include <net/rss_config.h>
51 #endif
52 
53 /* Local Prototypes */
54 static void	ixl_rx_checksum(struct mbuf *, u32, u32, u8);
55 static void	ixl_refresh_mbufs(struct ixl_queue *, int);
56 static int      ixl_xmit(struct ixl_queue *, struct mbuf **);
57 static int	ixl_tx_setup_offload(struct ixl_queue *,
58 		    struct mbuf *, u32 *, u32 *);
59 static bool	ixl_tso_setup(struct ixl_queue *, struct mbuf *);
60 
61 static __inline void ixl_rx_discard(struct rx_ring *, int);
62 static __inline void ixl_rx_input(struct rx_ring *, struct ifnet *,
63 		    struct mbuf *, u8);
64 
65 #ifdef DEV_NETMAP
66 #include <dev/netmap/if_ixl_netmap.h>
67 #endif /* DEV_NETMAP */
68 
69 /*
70 ** Multiqueue Transmit driver
71 */
72 int
73 ixl_mq_start(struct ifnet *ifp, struct mbuf *m)
74 {
75 	struct ixl_vsi		*vsi = ifp->if_softc;
76 	struct ixl_queue	*que;
77 	struct tx_ring		*txr;
78 	int 			err, i;
79 #ifdef RSS
80 	u32			bucket_id;
81 #endif
82 
83 	/*
84 	** Which queue to use:
85 	**
86 	** When doing RSS, map it to the same outbound
87 	** queue as the incoming flow would be mapped to.
88 	** If everything is set up correctly, it should be
89 	** the same bucket as the one the CPU we're on maps to.
90 	*/
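	/*
	** Illustrative example (values are arbitrary): with
	** vsi->num_queues = 8 and a flowid of 0x1234abcd, the frame
	** is enqueued on TX queue 0x1234abcd % 8 = 5, which should be
	** the same bucket the reverse direction of the flow hashes to.
	*/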
91 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
92 #ifdef  RSS
93 		if (rss_hash2bucket(m->m_pkthdr.flowid,
94 		    M_HASHTYPE_GET(m), &bucket_id) == 0) {
95 			i = bucket_id % vsi->num_queues;
96                 } else
97 #endif
98                         i = m->m_pkthdr.flowid % vsi->num_queues;
99         } else
100 		i = curcpu % vsi->num_queues;
101 	/*
102 	** This may not be perfect, but until something
103 	** better comes along it keeps us from scheduling
104 	** on stalled queues.
105 	*/
106 	if (((1 << i) & vsi->active_queues) == 0)
107 		i = vsi->active_queues ? ffsl(vsi->active_queues) - 1 : 0;
108 
109 	que = &vsi->queues[i];
110 	txr = &que->txr;
111 
112 	err = drbr_enqueue(ifp, txr->br, m);
113 	if (err)
114 		return (err);
115 	if (IXL_TX_TRYLOCK(txr)) {
116 		ixl_mq_start_locked(ifp, txr);
117 		IXL_TX_UNLOCK(txr);
118 	} else
119 		taskqueue_enqueue(que->tq, &que->tx_task);
120 
121 	return (0);
122 }
123 
124 int
125 ixl_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr)
126 {
127 	struct ixl_queue	*que = txr->que;
128 	struct ixl_vsi		*vsi = que->vsi;
129         struct mbuf		*next;
130         int			err = 0;
131 
132 
133 	if (((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) ||
134 	    vsi->link_active == 0)
135 		return (ENETDOWN);
136 
137 	/* Process the transmit queue */
138 	while ((next = drbr_peek(ifp, txr->br)) != NULL) {
139 		if ((err = ixl_xmit(que, &next)) != 0) {
140 			if (next == NULL)
141 				drbr_advance(ifp, txr->br);
142 			else
143 				drbr_putback(ifp, txr->br, next);
144 			break;
145 		}
146 		drbr_advance(ifp, txr->br);
147 		/* Send a copy of the frame to the BPF listener */
148 		ETHER_BPF_MTAP(ifp, next);
149 		if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
150 			break;
151 	}
152 
153 	if (txr->avail < IXL_TX_CLEANUP_THRESHOLD)
154 		ixl_txeof(que);
155 
156 	return (err);
157 }
158 
159 /*
160  * Called from a taskqueue to drain queued transmit packets.
161  */
162 void
163 ixl_deferred_mq_start(void *arg, int pending)
164 {
165 	struct ixl_queue	*que = arg;
166         struct tx_ring		*txr = &que->txr;
167 	struct ixl_vsi		*vsi = que->vsi;
168         struct ifnet		*ifp = vsi->ifp;
169 
170 	IXL_TX_LOCK(txr);
171 	if (!drbr_empty(ifp, txr->br))
172 		ixl_mq_start_locked(ifp, txr);
173 	IXL_TX_UNLOCK(txr);
174 }
175 
176 /*
177 ** Flush all queue ring buffers
178 */
179 void
180 ixl_qflush(struct ifnet *ifp)
181 {
182 	struct ixl_vsi	*vsi = ifp->if_softc;
183 
184         for (int i = 0; i < vsi->num_queues; i++) {
185 		struct ixl_queue *que = &vsi->queues[i];
186 		struct tx_ring	*txr = &que->txr;
187 		struct mbuf	*m;
188 		IXL_TX_LOCK(txr);
189 		while ((m = buf_ring_dequeue_sc(txr->br)) != NULL)
190 			m_freem(m);
191 		IXL_TX_UNLOCK(txr);
192 	}
193 	if_qflush(ifp);
194 }
195 
196 /*
197 ** Find mbuf chains passed to the driver
198 ** that are 'sparse', i.e. that use more than
199 ** IXL_SPARSE_CHAIN mbufs to deliver an MSS-sized chunk of data.
200 */
201 static inline bool
202 ixl_tso_detect_sparse(struct mbuf *mp)
203 {
204 	struct mbuf	*m;
205 	int		num = 0, mss;
206 	bool		ret = FALSE;
207 
208 	mss = mp->m_pkthdr.tso_segsz;
209 	for (m = mp->m_next; m != NULL; m = m->m_next) {
210 		num++;
211 		mss -= m->m_len;
212 		if (mss < 1)
213 			break;
214 		if (m->m_next == NULL)
215 			break;
216 	}
217 	if (num > IXL_SPARSE_CHAIN)
218 		ret = TRUE;
219 
220 	return (ret);
221 }
222 
223 
224 /*********************************************************************
225  *
226  *  This routine maps the mbufs to tx descriptors, allowing the
227  *  TX engine to transmit the packets.
228  *  	- return 0 on success, positive on failure
229  *
230  **********************************************************************/
231 #define IXL_TXD_CMD (I40E_TX_DESC_CMD_EOP | I40E_TX_DESC_CMD_RS)
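/*
 * IXL_TXD_CMD is OR'd into the last data descriptor of each frame:
 * EOP marks the end of the packet and RS asks the hardware to report
 * completion status for that descriptor.
 */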
232 
233 static int
234 ixl_xmit(struct ixl_queue *que, struct mbuf **m_headp)
235 {
236 	struct ixl_vsi		*vsi = que->vsi;
237 	struct i40e_hw		*hw = vsi->hw;
238 	struct tx_ring		*txr = &que->txr;
239 	struct ixl_tx_buf	*buf;
240 	struct i40e_tx_desc	*txd = NULL;
241 	struct mbuf		*m_head, *m;
242 	int             	i, j, error, nsegs, maxsegs;
243 	int			first, last = 0;
244 	u16			vtag = 0;
245 	u32			cmd, off;
246 	bus_dmamap_t		map;
247 	bus_dma_tag_t		tag;
248 	bus_dma_segment_t	segs[IXL_MAX_TSO_SEGS];
249 
250 
251 	cmd = off = 0;
252 	m_head = *m_headp;
253 
254         /*
255          * It is important to capture the first descriptor
256          * used, because that buffer will later hold the index of
257          * the descriptor we tell the hardware to report back on
258          */
259         first = txr->next_avail;
260 	buf = &txr->buffers[first];
261 	map = buf->map;
262 	tag = txr->tx_tag;
263 	maxsegs = IXL_MAX_TX_SEGS;
264 
265 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
266 		/* Use larger mapping for TSO */
267 		tag = txr->tso_tag;
268 		maxsegs = IXL_MAX_TSO_SEGS;
269 		if (ixl_tso_detect_sparse(m_head)) {
270 			m = m_defrag(m_head, M_NOWAIT);
271 			if (m == NULL) {
272 				m_freem(*m_headp);
273 				*m_headp = NULL;
274 				return (ENOBUFS);
275 			}
276 			*m_headp = m;
277 		}
278 	}
279 
280 	/*
281 	 * Map the packet for DMA.
282 	 */
283 	error = bus_dmamap_load_mbuf_sg(tag, map,
284 	    *m_headp, segs, &nsegs, BUS_DMA_NOWAIT);
285 
286 	if (error == EFBIG) {
287 		struct mbuf *m;
288 
289 		m = m_defrag(*m_headp, M_NOWAIT);
290 		if (m == NULL) {
291 			que->mbuf_defrag_failed++;
292 			m_freem(*m_headp);
293 			*m_headp = NULL;
294 			return (ENOBUFS);
295 		}
296 		*m_headp = m;
297 
298 		/* Try it again */
299 		error = bus_dmamap_load_mbuf_sg(tag, map,
300 		    *m_headp, segs, &nsegs, BUS_DMA_NOWAIT);
301 
302 		if (error == ENOMEM) {
303 			que->tx_dma_setup++;
304 			return (error);
305 		} else if (error != 0) {
306 			que->tx_dma_setup++;
307 			m_freem(*m_headp);
308 			*m_headp = NULL;
309 			return (error);
310 		}
311 	} else if (error == ENOMEM) {
312 		que->tx_dma_setup++;
313 		return (error);
314 	} else if (error != 0) {
315 		que->tx_dma_setup++;
316 		m_freem(*m_headp);
317 		*m_headp = NULL;
318 		return (error);
319 	}
320 
321 	/* Make certain there are enough descriptors */
322 	if (nsegs > txr->avail - 2) {
323 		txr->no_desc++;
324 		error = ENOBUFS;
325 		goto xmit_fail;
326 	}
327 	m_head = *m_headp;
328 
329 	/* Set up the TSO/CSUM offload */
330 	if (m_head->m_pkthdr.csum_flags & CSUM_OFFLOAD) {
331 		error = ixl_tx_setup_offload(que, m_head, &cmd, &off);
332 		if (error)
333 			goto xmit_fail;
334 	}
335 
336 	cmd |= I40E_TX_DESC_CMD_ICRC;
337 	/* Grab the VLAN tag */
338 	if (m_head->m_flags & M_VLANTAG) {
339 		cmd |= I40E_TX_DESC_CMD_IL2TAG1;
340 		vtag = htole16(m_head->m_pkthdr.ether_vtag);
341 	}
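	/*
	 * Each data descriptor packs its fields into a single 64-bit
	 * qword, built below: descriptor type, command bits, offload
	 * offsets, buffer size and L2TAG1 (VLAN), each shifted into
	 * place with the corresponding I40E_TXD_QW1_*_SHIFT constant.
	 */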
342 
343 	i = txr->next_avail;
344 	for (j = 0; j < nsegs; j++) {
345 		bus_size_t seglen;
346 
347 		buf = &txr->buffers[i];
348 		buf->tag = tag; /* Keep track of the type tag */
349 		txd = &txr->base[i];
350 		seglen = segs[j].ds_len;
351 
352 		txd->buffer_addr = htole64(segs[j].ds_addr);
353 		txd->cmd_type_offset_bsz =
354 		    htole64(I40E_TX_DESC_DTYPE_DATA
355 		    | ((u64)cmd  << I40E_TXD_QW1_CMD_SHIFT)
356 		    | ((u64)off << I40E_TXD_QW1_OFFSET_SHIFT)
357 		    | ((u64)seglen  << I40E_TXD_QW1_TX_BUF_SZ_SHIFT)
358 		    | ((u64)vtag  << I40E_TXD_QW1_L2TAG1_SHIFT));
359 
360 		last = i; /* descriptor that will get completion IRQ */
361 
362 		if (++i == que->num_desc)
363 			i = 0;
364 
365 		buf->m_head = NULL;
366 		buf->eop_index = -1;
367 	}
368 	/* Set the last descriptor for report */
369 	txd->cmd_type_offset_bsz |=
370 	    htole64(((u64)IXL_TXD_CMD << I40E_TXD_QW1_CMD_SHIFT));
371 	txr->avail -= nsegs;
372 	txr->next_avail = i;
373 
374 	buf->m_head = m_head;
375 	/* Swap the dma map between the first and last descriptor */
376 	txr->buffers[first].map = buf->map;
377 	buf->map = map;
378 	bus_dmamap_sync(tag, map, BUS_DMASYNC_PREWRITE);
379 
380         /* Set the index of the descriptor that will be marked done */
381         buf = &txr->buffers[first];
382 	buf->eop_index = last;
383 
384         bus_dmamap_sync(txr->dma.tag, txr->dma.map,
385             BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
386 	/*
387 	 * Advance the Transmit Descriptor Tail (TDT); this tells the
388 	 * hardware that this frame is available to transmit.
389 	 */
390 	++txr->total_packets;
391 	wr32(hw, txr->tail, i);
392 
393 	/* Mark outstanding work */
394 	if (que->busy == 0)
395 		que->busy = 1;
396 	return (0);
397 
398 xmit_fail:
399 	bus_dmamap_unload(tag, buf->map);
400 	return (error);
401 }
402 
403 
404 /*********************************************************************
405  *
406  *  Allocate memory for tx_buffer structures. The tx_buffer stores all
407  *  the information needed to transmit a packet on the wire. This is
408  *  called only once at attach; setup is done on every reset.
409  *
410  **********************************************************************/
411 int
412 ixl_allocate_tx_data(struct ixl_queue *que)
413 {
414 	struct tx_ring		*txr = &que->txr;
415 	struct ixl_vsi		*vsi = que->vsi;
416 	device_t		dev = vsi->dev;
417 	struct ixl_tx_buf	*buf;
418 	int			error = 0;
419 
420 	/*
421 	 * Setup DMA descriptor areas.
422 	 */
423 	if ((error = bus_dma_tag_create(NULL,		/* parent */
424 			       1, 0,			/* alignment, bounds */
425 			       BUS_SPACE_MAXADDR,	/* lowaddr */
426 			       BUS_SPACE_MAXADDR,	/* highaddr */
427 			       NULL, NULL,		/* filter, filterarg */
428 			       IXL_TSO_SIZE,		/* maxsize */
429 			       IXL_MAX_TX_SEGS,		/* nsegments */
430 			       PAGE_SIZE,		/* maxsegsize */
431 			       0,			/* flags */
432 			       NULL,			/* lockfunc */
433 			       NULL,			/* lockfuncarg */
434 			       &txr->tx_tag))) {
435 		device_printf(dev,"Unable to allocate TX DMA tag\n");
436 		goto fail;
437 	}
438 
439 	/* Make a special tag for TSO */
440 	if ((error = bus_dma_tag_create(NULL,		/* parent */
441 			       1, 0,			/* alignment, bounds */
442 			       BUS_SPACE_MAXADDR,	/* lowaddr */
443 			       BUS_SPACE_MAXADDR,	/* highaddr */
444 			       NULL, NULL,		/* filter, filterarg */
445 			       IXL_TSO_SIZE,		/* maxsize */
446 			       IXL_MAX_TSO_SEGS,	/* nsegments */
447 			       PAGE_SIZE,		/* maxsegsize */
448 			       0,			/* flags */
449 			       NULL,			/* lockfunc */
450 			       NULL,			/* lockfuncarg */
451 			       &txr->tso_tag))) {
452 		device_printf(dev,"Unable to allocate TX TSO DMA tag\n");
453 		goto fail;
454 	}
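	/*
	 * The TSO tag differs from the default TX tag only in the
	 * number of segments it allows (IXL_MAX_TSO_SEGS vs.
	 * IXL_MAX_TX_SEGS); ixl_xmit() selects the appropriate tag
	 * for each packet.
	 */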
455 
456 	if (!(txr->buffers =
457 	    (struct ixl_tx_buf *) malloc(sizeof(struct ixl_tx_buf) *
458 	    que->num_desc, M_DEVBUF, M_NOWAIT | M_ZERO))) {
459 		device_printf(dev, "Unable to allocate tx_buffer memory\n");
460 		error = ENOMEM;
461 		goto fail;
462 	}
463 
464         /* Create the descriptor buffer default dma maps */
465 	buf = txr->buffers;
466 	for (int i = 0; i < que->num_desc; i++, buf++) {
467 		buf->tag = txr->tx_tag;
468 		error = bus_dmamap_create(buf->tag, 0, &buf->map);
469 		if (error != 0) {
470 			device_printf(dev, "Unable to create TX DMA map\n");
471 			goto fail;
472 		}
473 	}
474 fail:
475 	return (error);
476 }
477 
478 
479 /*********************************************************************
480  *
481  *  (Re)Initialize a queue transmit ring.
482  *	- called by init, it clears the descriptor ring,
483  *	  and frees any stale mbufs
484  *
485  **********************************************************************/
486 void
487 ixl_init_tx_ring(struct ixl_queue *que)
488 {
489 #ifdef DEV_NETMAP
490 	struct netmap_adapter *na = NA(que->vsi->ifp);
491 	struct netmap_slot *slot;
492 #endif /* DEV_NETMAP */
493 	struct tx_ring		*txr = &que->txr;
494 	struct ixl_tx_buf	*buf;
495 
496 	/* Clear the old ring contents */
497 	IXL_TX_LOCK(txr);
498 
499 #ifdef DEV_NETMAP
500 	/*
501 	 * (under lock): if in netmap mode, do some consistency
502 	 * checks and set slot to entry 0 of the netmap ring.
503 	 */
504 	slot = netmap_reset(na, NR_TX, que->me, 0);
505 #endif /* DEV_NETMAP */
506 
507 	bzero((void *)txr->base,
508 	      (sizeof(struct i40e_tx_desc)) * que->num_desc);
509 
510 	/* Reset indices */
511 	txr->next_avail = 0;
512 	txr->next_to_clean = 0;
513 
514 #ifdef IXL_FDIR
515 	/* Initialize flow director */
516 	txr->atr_rate = ixl_atr_rate;
517 	txr->atr_count = 0;
518 #endif
519 
520 	/* Free any existing tx mbufs. */
521         buf = txr->buffers;
522 	for (int i = 0; i < que->num_desc; i++, buf++) {
523 		if (buf->m_head != NULL) {
524 			bus_dmamap_sync(buf->tag, buf->map,
525 			    BUS_DMASYNC_POSTWRITE);
526 			bus_dmamap_unload(buf->tag, buf->map);
527 			m_freem(buf->m_head);
528 			buf->m_head = NULL;
529 		}
530 #ifdef DEV_NETMAP
531 		/*
532 		 * In netmap mode, set the map for the packet buffer.
533 		 * NOTE: Some drivers (not this one) also need to set
534 		 * the physical buffer address in the NIC ring.
535 		 * netmap_idx_n2k() maps a nic index, i, into the corresponding
536 		 * netmap slot index, si
537 		 */
538 		if (slot) {
539 			int si = netmap_idx_n2k(&na->tx_rings[que->me], i);
540 			netmap_load_map(na, buf->tag, buf->map, NMB(na, slot + si));
541 		}
542 #endif /* DEV_NETMAP */
543 		/* Clear the EOP index */
544 		buf->eop_index = -1;
545         }
546 
547 	/* Set number of descriptors available */
548 	txr->avail = que->num_desc;
549 
550 	bus_dmamap_sync(txr->dma.tag, txr->dma.map,
551 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
552 	IXL_TX_UNLOCK(txr);
553 }
554 
555 
556 /*********************************************************************
557  *
558  *  Free transmit ring related data structures.
559  *
560  **********************************************************************/
561 void
562 ixl_free_que_tx(struct ixl_queue *que)
563 {
564 	struct tx_ring *txr = &que->txr;
565 	struct ixl_tx_buf *buf;
566 
567 	INIT_DBG_IF(que->vsi->ifp, "queue %d: begin", que->me);
568 
569 	for (int i = 0; i < que->num_desc; i++) {
570 		buf = &txr->buffers[i];
571 		if (buf->m_head != NULL) {
572 			bus_dmamap_sync(buf->tag, buf->map,
573 			    BUS_DMASYNC_POSTWRITE);
574 			bus_dmamap_unload(buf->tag,
575 			    buf->map);
576 			m_freem(buf->m_head);
577 			buf->m_head = NULL;
578 			if (buf->map != NULL) {
579 				bus_dmamap_destroy(buf->tag,
580 				    buf->map);
581 				buf->map = NULL;
582 			}
583 		} else if (buf->map != NULL) {
584 			bus_dmamap_unload(buf->tag,
585 			    buf->map);
586 			bus_dmamap_destroy(buf->tag,
587 			    buf->map);
588 			buf->map = NULL;
589 		}
590 	}
591 	if (txr->br != NULL)
592 		buf_ring_free(txr->br, M_DEVBUF);
593 	if (txr->buffers != NULL) {
594 		free(txr->buffers, M_DEVBUF);
595 		txr->buffers = NULL;
596 	}
597 	if (txr->tx_tag != NULL) {
598 		bus_dma_tag_destroy(txr->tx_tag);
599 		txr->tx_tag = NULL;
600 	}
601 	if (txr->tso_tag != NULL) {
602 		bus_dma_tag_destroy(txr->tso_tag);
603 		txr->tso_tag = NULL;
604 	}
605 
606 	INIT_DBG_IF(que->vsi->ifp, "queue %d: end", que->me);
607 	return;
608 }
609 
610 /*********************************************************************
611  *
612  *  Setup descriptor for hw offloads
613  *
614  **********************************************************************/
615 
616 static int
617 ixl_tx_setup_offload(struct ixl_queue *que,
618     struct mbuf *mp, u32 *cmd, u32 *off)
619 {
620 	struct ether_vlan_header	*eh;
621 #ifdef INET
622 	struct ip			*ip = NULL;
623 #endif
624 	struct tcphdr			*th = NULL;
625 #ifdef INET6
626 	struct ip6_hdr			*ip6;
627 #endif
628 	int				elen, ip_hlen = 0, tcp_hlen;
629 	u16				etype;
630 	u8				ipproto = 0;
631 	bool				tso = FALSE;
632 
633 
634 	/* Set up the TSO context descriptor if required */
635 	if (mp->m_pkthdr.csum_flags & CSUM_TSO) {
636 		tso = ixl_tso_setup(que, mp);
637 		if (tso)
638 			++que->tso;
639 		else
640 			return (ENXIO);
641 	}
642 
643 	/*
644 	 * Determine where frame payload starts.
645 	 * Jump over vlan headers if already present,
646 	 * helpful for QinQ too.
647 	 */
648 	eh = mtod(mp, struct ether_vlan_header *);
649 	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
650 		etype = ntohs(eh->evl_proto);
651 		elen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
652 	} else {
653 		etype = ntohs(eh->evl_encap_proto);
654 		elen = ETHER_HDR_LEN;
655 	}
656 
657 	switch (etype) {
658 #ifdef INET
659 		case ETHERTYPE_IP:
660 			ip = (struct ip *)(mp->m_data + elen);
661 			ip_hlen = ip->ip_hl << 2;
662 			ipproto = ip->ip_p;
663 			th = (struct tcphdr *)((caddr_t)ip + ip_hlen);
664 			/* The IP checksum must be recalculated with TSO */
665 			if (tso)
666 				*cmd |= I40E_TX_DESC_CMD_IIPT_IPV4_CSUM;
667 			else
668 				*cmd |= I40E_TX_DESC_CMD_IIPT_IPV4;
669 			break;
670 #endif
671 #ifdef INET6
672 		case ETHERTYPE_IPV6:
673 			ip6 = (struct ip6_hdr *)(mp->m_data + elen);
674 			ip_hlen = sizeof(struct ip6_hdr);
675 			ipproto = ip6->ip6_nxt;
676 			th = (struct tcphdr *)((caddr_t)ip6 + ip_hlen);
677 			*cmd |= I40E_TX_DESC_CMD_IIPT_IPV6;
678 			break;
679 #endif
680 		default:
681 			break;
682 	}
683 
684 	*off |= (elen >> 1) << I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
685 	*off |= (ip_hlen >> 2) << I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
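	/*
	 * Note the units: MACLEN is expressed in 2-byte words and
	 * IPLEN in 4-byte words (hence the >> 1 and >> 2 above). For
	 * example, a plain 14-byte Ethernet header becomes 7 and a
	 * 20-byte IPv4 header with no options becomes 5.
	 */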
686 
687 	switch (ipproto) {
688 		case IPPROTO_TCP:
689 			tcp_hlen = th->th_off << 2;
690 			if (mp->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_TCP_IPV6)) {
691 				*cmd |= I40E_TX_DESC_CMD_L4T_EOFT_TCP;
692 				*off |= (tcp_hlen >> 2) <<
693 				    I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
694 			}
695 #ifdef IXL_FDIR
696 			ixl_atr(que, th, etype);
697 #endif
698 			break;
699 		case IPPROTO_UDP:
700 			if (mp->m_pkthdr.csum_flags & (CSUM_UDP|CSUM_UDP_IPV6)) {
701 				*cmd |= I40E_TX_DESC_CMD_L4T_EOFT_UDP;
702 				*off |= (sizeof(struct udphdr) >> 2) <<
703 				    I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
704 			}
705 			break;
706 
707 		case IPPROTO_SCTP:
708 			if (mp->m_pkthdr.csum_flags & (CSUM_SCTP|CSUM_SCTP_IPV6)) {
709 				*cmd |= I40E_TX_DESC_CMD_L4T_EOFT_SCTP;
710 				*off |= (sizeof(struct sctphdr) >> 2) <<
711 				    I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
712 			}
713 			/* Fall Thru */
714 		default:
715 			break;
716 	}
717 
718         return (0);
719 }
720 
721 
722 /**********************************************************************
723  *
724  *  Setup context for hardware segmentation offload (TSO)
725  *
726  **********************************************************************/
727 static bool
728 ixl_tso_setup(struct ixl_queue *que, struct mbuf *mp)
729 {
730 	struct tx_ring			*txr = &que->txr;
731 	struct i40e_tx_context_desc	*TXD;
732 	struct ixl_tx_buf		*buf;
733 	u32				cmd, mss, type, tsolen;
734 	u16				etype;
735 	int				idx, elen, ip_hlen, tcp_hlen;
736 	struct ether_vlan_header	*eh;
737 #ifdef INET
738 	struct ip			*ip;
739 #endif
740 #ifdef INET6
741 	struct ip6_hdr			*ip6;
742 #endif
743 #if defined(INET6) || defined(INET)
744 	struct tcphdr			*th;
745 #endif
746 	u64				type_cmd_tso_mss;
747 
748 	/*
749 	 * Determine where frame payload starts.
750 	 * Jump over vlan headers if already present
751 	 */
752 	eh = mtod(mp, struct ether_vlan_header *);
753 	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
754 		elen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
755 		etype = eh->evl_proto;
756 	} else {
757 		elen = ETHER_HDR_LEN;
758 		etype = eh->evl_encap_proto;
759 	}
760 
761         switch (ntohs(etype)) {
762 #ifdef INET6
763 	case ETHERTYPE_IPV6:
764 		ip6 = (struct ip6_hdr *)(mp->m_data + elen);
765 		if (ip6->ip6_nxt != IPPROTO_TCP)
766 			return (FALSE);
767 		ip_hlen = sizeof(struct ip6_hdr);
768 		th = (struct tcphdr *)((caddr_t)ip6 + ip_hlen);
769 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
770 		tcp_hlen = th->th_off << 2;
771 		break;
772 #endif
773 #ifdef INET
774 	case ETHERTYPE_IP:
775 		ip = (struct ip *)(mp->m_data + elen);
776 		if (ip->ip_p != IPPROTO_TCP)
777 			return (FALSE);
778 		ip->ip_sum = 0;
779 		ip_hlen = ip->ip_hl << 2;
780 		th = (struct tcphdr *)((caddr_t)ip + ip_hlen);
781 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
782 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
783 		tcp_hlen = th->th_off << 2;
784 		break;
785 #endif
786 	default:
787 		printf("%s: CSUM_TSO but no supported IP version (0x%04x)\n",
788 		    __func__, ntohs(etype));
789 		return FALSE;
790         }
791 
792         /* Ensure we have at least the IP+TCP header in the first mbuf. */
793         if (mp->m_len < elen + ip_hlen + sizeof(struct tcphdr))
794 		return FALSE;
795 
796 	idx = txr->next_avail;
797 	buf = &txr->buffers[idx];
798 	TXD = (struct i40e_tx_context_desc *) &txr->base[idx];
799 	tsolen = mp->m_pkthdr.len - (elen + ip_hlen + tcp_hlen);
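	/*
	 * tsolen is the TCP payload length only (headers excluded).
	 * Illustrative example: a 29254-byte TSO packet carrying
	 * 14 + 20 + 20 bytes of headers gives tsolen = 29200, which
	 * at an MSS of 1460 the hardware cuts into 20 segments.
	 */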
800 
801 	type = I40E_TX_DESC_DTYPE_CONTEXT;
802 	cmd = I40E_TX_CTX_DESC_TSO;
803 	mss = mp->m_pkthdr.tso_segsz;
804 
805 	type_cmd_tso_mss = ((u64)type << I40E_TXD_CTX_QW1_DTYPE_SHIFT) |
806 	    ((u64)cmd << I40E_TXD_CTX_QW1_CMD_SHIFT) |
807 	    ((u64)tsolen << I40E_TXD_CTX_QW1_TSO_LEN_SHIFT) |
808 	    ((u64)mss << I40E_TXD_CTX_QW1_MSS_SHIFT);
809 	TXD->type_cmd_tso_mss = htole64(type_cmd_tso_mss);
810 
811 	TXD->tunneling_params = htole32(0);
812 	buf->m_head = NULL;
813 	buf->eop_index = -1;
814 
815 	if (++idx == que->num_desc)
816 		idx = 0;
817 
818 	txr->avail--;
819 	txr->next_avail = idx;
820 
821 	return TRUE;
822 }
823 
824 /*
825 ** ixl_get_tx_head - Retrieve the value from the
826 **    location where the HW records its HEAD index
827 */
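/*
** Head write-back: the TX ring is sized with one extra descriptor
** slot, and the hardware writes its current head index into
** base[num_desc] (write-back is enabled when the TX queue context
** is configured); that slot is what is read below.
*/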
828 static inline u32
829 ixl_get_tx_head(struct ixl_queue *que)
830 {
831 	struct tx_ring  *txr = &que->txr;
832 	void *head = &txr->base[que->num_desc];
833 	return LE32_TO_CPU(*(volatile __le32 *)head);
834 }
835 
836 /**********************************************************************
837  *
838  *  Examine each tx_buffer in the used queue. If the hardware is done
839  *  processing the packet then free associated resources. The
840  *  tx_buffer is put back on the free queue.
841  *
842  **********************************************************************/
843 bool
844 ixl_txeof(struct ixl_queue *que)
845 {
846 	struct tx_ring		*txr = &que->txr;
847 	u32			first, last, head, done, processed;
848 	struct ixl_tx_buf	*buf;
849 	struct i40e_tx_desc	*tx_desc, *eop_desc;
850 
851 
852 	mtx_assert(&txr->mtx, MA_OWNED);
853 
854 #ifdef DEV_NETMAP
855 	// XXX todo: implement moderation
856 	if (netmap_tx_irq(que->vsi->ifp, que->me))
857 		return FALSE;
858 #endif /* DEV_NETMAP */
859 
860 	/* These are not the descriptors you seek, move along :) */
861 	if (txr->avail == que->num_desc) {
862 		que->busy = 0;
863 		return FALSE;
864 	}
865 
866 	processed = 0;
867 	first = txr->next_to_clean;
868 	buf = &txr->buffers[first];
869 	tx_desc = (struct i40e_tx_desc *)&txr->base[first];
870 	last = buf->eop_index;
871 	if (last == -1)
872 		return FALSE;
873 	eop_desc = (struct i40e_tx_desc *)&txr->base[last];
874 
875 	/* Get the Head WB value */
876 	head = ixl_get_tx_head(que);
877 
878 	/*
879 	** Get the index of the first descriptor
880 	** BEYOND the EOP and call that 'done'.
881 	** I do this so the comparison in the
882 	** inner while loop below can be simple
883 	*/
884 	if (++last == que->num_desc) last = 0;
885 	done = last;
886 
887         bus_dmamap_sync(txr->dma.tag, txr->dma.map,
888             BUS_DMASYNC_POSTREAD);
889 	/*
890 	** The HEAD index of the ring is written in a
891 	** defined location; this, rather than a done bit,
892 	** is what is used to keep track of what must be
893 	** 'cleaned'.
894 	*/
895 	while (first != head) {
896 		/* We clean the range of the packet */
897 		while (first != done) {
898 			++txr->avail;
899 			++processed;
900 
901 			if (buf->m_head) {
902 				txr->bytes += /* for ITR adjustment */
903 				    buf->m_head->m_pkthdr.len;
904 				txr->tx_bytes += /* for TX stats */
905 				    buf->m_head->m_pkthdr.len;
906 				bus_dmamap_sync(buf->tag,
907 				    buf->map,
908 				    BUS_DMASYNC_POSTWRITE);
909 				bus_dmamap_unload(buf->tag,
910 				    buf->map);
911 				m_freem(buf->m_head);
912 				buf->m_head = NULL;
913 				buf->map = NULL;
914 			}
915 			buf->eop_index = -1;
916 
917 			if (++first == que->num_desc)
918 				first = 0;
919 
920 			buf = &txr->buffers[first];
921 			tx_desc = &txr->base[first];
922 		}
923 		++txr->packets;
924 		/* See if there is more work now */
925 		last = buf->eop_index;
926 		if (last != -1) {
927 			eop_desc = &txr->base[last];
928 			/* Get next done point */
929 			if (++last == que->num_desc) last = 0;
930 			done = last;
931 		} else
932 			break;
933 	}
934 	bus_dmamap_sync(txr->dma.tag, txr->dma.map,
935 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
936 
937 	txr->next_to_clean = first;
938 
939 
940 	/*
941 	** Hang detection: we know there is work
942 	** outstanding, or the first return above
943 	** would have been taken, so indicate an
944 	** unsuccessful pass. In the local timer,
945 	** if this value grows too large the queue
946 	** will be considered hung. If anything has
947 	** been cleaned then reset the state.
948 	*/
949 	if ((processed == 0) && (que->busy != IXL_QUEUE_HUNG))
950 		++que->busy;
951 
952 	if (processed)
953 		que->busy = 1; /* Note this turns off HUNG */
954 
955 	/*
956 	 * If there are no pending descriptors, clear the timeout.
957 	 */
958 	if (txr->avail == que->num_desc) {
959 		que->busy = 0;
960 		return FALSE;
961 	}
962 
963 	return TRUE;
964 }
965 
966 /*********************************************************************
967  *
968  *  Refresh mbuf buffers for RX descriptor rings
969  *   - now keeps its own state, so discards due to resource
970  *     exhaustion are unnecessary; if an mbuf cannot be obtained
971  *     it just returns, keeping its placeholder, and can simply
972  *     be called again to retry.
973  *
974  **********************************************************************/
975 static void
976 ixl_refresh_mbufs(struct ixl_queue *que, int limit)
977 {
978 	struct ixl_vsi		*vsi = que->vsi;
979 	struct rx_ring		*rxr = &que->rxr;
980 	bus_dma_segment_t	hseg[1];
981 	bus_dma_segment_t	pseg[1];
982 	struct ixl_rx_buf	*buf;
983 	struct mbuf		*mh, *mp;
984 	int			i, j, nsegs, error;
985 	bool			refreshed = FALSE;
986 
987 	i = j = rxr->next_refresh;
988 	/* Control the loop with one beyond */
989 	if (++j == que->num_desc)
990 		j = 0;
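	/*
	 * i is the descriptor being refreshed and j always runs one
	 * slot ahead of it; the loop stops when j reaches 'limit',
	 * the caller's current clean position, so the refresh never
	 * overruns descriptors that are still being processed.
	 */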
991 
992 	while (j != limit) {
993 		buf = &rxr->buffers[i];
994 		if (rxr->hdr_split == FALSE)
995 			goto no_split;
996 
997 		if (buf->m_head == NULL) {
998 			mh = m_gethdr(M_NOWAIT, MT_DATA);
999 			if (mh == NULL)
1000 				goto update;
1001 		} else
1002 			mh = buf->m_head;
1003 
1004 		mh->m_pkthdr.len = mh->m_len = MHLEN;
1006 		mh->m_flags |= M_PKTHDR;
1007 		/* Get the memory mapping */
1008 		error = bus_dmamap_load_mbuf_sg(rxr->htag,
1009 		    buf->hmap, mh, hseg, &nsegs, BUS_DMA_NOWAIT);
1010 		if (error != 0) {
1011 			printf("Refresh mbufs: hdr dmamap load"
1012 			    " failure - %d\n", error);
1013 			m_free(mh);
1014 			buf->m_head = NULL;
1015 			goto update;
1016 		}
1017 		buf->m_head = mh;
1018 		bus_dmamap_sync(rxr->htag, buf->hmap,
1019 		    BUS_DMASYNC_PREREAD);
1020 		rxr->base[i].read.hdr_addr =
1021 		   htole64(hseg[0].ds_addr);
1022 
1023 no_split:
1024 		if (buf->m_pack == NULL) {
1025 			mp = m_getjcl(M_NOWAIT, MT_DATA,
1026 			    M_PKTHDR, rxr->mbuf_sz);
1027 			if (mp == NULL)
1028 				goto update;
1029 		} else
1030 			mp = buf->m_pack;
1031 
1032 		mp->m_pkthdr.len = mp->m_len = rxr->mbuf_sz;
1033 		/* Get the memory mapping */
1034 		error = bus_dmamap_load_mbuf_sg(rxr->ptag,
1035 		    buf->pmap, mp, pseg, &nsegs, BUS_DMA_NOWAIT);
1036 		if (error != 0) {
1037 			printf("Refresh mbufs: payload dmamap load"
1038 			    " failure - %d\n", error);
1039 			m_free(mp);
1040 			buf->m_pack = NULL;
1041 			goto update;
1042 		}
1043 		buf->m_pack = mp;
1044 		bus_dmamap_sync(rxr->ptag, buf->pmap,
1045 		    BUS_DMASYNC_PREREAD);
1046 		rxr->base[i].read.pkt_addr =
1047 		   htole64(pseg[0].ds_addr);
1048 		/* Used only when doing header split */
1049 		rxr->base[i].read.hdr_addr = 0;
1050 
1051 		refreshed = TRUE;
1052 		/* Next is precalculated */
1053 		i = j;
1054 		rxr->next_refresh = i;
1055 		if (++j == que->num_desc)
1056 			j = 0;
1057 	}
1058 update:
1059 	if (refreshed) /* Update hardware tail index */
1060 		wr32(vsi->hw, rxr->tail, rxr->next_refresh);
1061 	return;
1062 }
1063 
1064 
1065 /*********************************************************************
1066  *
1067  *  Allocate memory for rx_buffer structures. Since we use one
1068  *  rx_buffer per descriptor, the maximum number of rx_buffers
1069  *  that we'll need is equal to the number of receive descriptors
1070  *  that we've defined.
1071  *
1072  **********************************************************************/
1073 int
1074 ixl_allocate_rx_data(struct ixl_queue *que)
1075 {
1076 	struct rx_ring		*rxr = &que->rxr;
1077 	struct ixl_vsi		*vsi = que->vsi;
1078 	device_t 		dev = vsi->dev;
1079 	struct ixl_rx_buf 	*buf;
1080 	int             	i, bsize, error;
1081 
1082 	bsize = sizeof(struct ixl_rx_buf) * que->num_desc;
1083 	if (!(rxr->buffers =
1084 	    (struct ixl_rx_buf *) malloc(bsize,
1085 	    M_DEVBUF, M_NOWAIT | M_ZERO))) {
1086 		device_printf(dev, "Unable to allocate rx_buffer memory\n");
1087 		error = ENOMEM;
1088 		return (error);
1089 	}
1090 
1091 	if ((error = bus_dma_tag_create(NULL,	/* parent */
1092 				   1, 0,	/* alignment, bounds */
1093 				   BUS_SPACE_MAXADDR,	/* lowaddr */
1094 				   BUS_SPACE_MAXADDR,	/* highaddr */
1095 				   NULL, NULL,		/* filter, filterarg */
1096 				   MSIZE,		/* maxsize */
1097 				   1,			/* nsegments */
1098 				   MSIZE,		/* maxsegsize */
1099 				   0,			/* flags */
1100 				   NULL,		/* lockfunc */
1101 				   NULL,		/* lockfuncarg */
1102 				   &rxr->htag))) {
1103 		device_printf(dev, "Unable to create RX DMA htag\n");
1104 		return (error);
1105 	}
1106 
1107 	if ((error = bus_dma_tag_create(NULL,	/* parent */
1108 				   1, 0,	/* alignment, bounds */
1109 				   BUS_SPACE_MAXADDR,	/* lowaddr */
1110 				   BUS_SPACE_MAXADDR,	/* highaddr */
1111 				   NULL, NULL,		/* filter, filterarg */
1112 				   MJUM16BYTES,		/* maxsize */
1113 				   1,			/* nsegments */
1114 				   MJUM16BYTES,		/* maxsegsize */
1115 				   0,			/* flags */
1116 				   NULL,		/* lockfunc */
1117 				   NULL,		/* lockfuncarg */
1118 				   &rxr->ptag))) {
1119 		device_printf(dev, "Unable to create RX DMA ptag\n");
1120 		return (error);
1121 	}
1122 
1123 	for (i = 0; i < que->num_desc; i++) {
1124 		buf = &rxr->buffers[i];
1125 		error = bus_dmamap_create(rxr->htag,
1126 		    BUS_DMA_NOWAIT, &buf->hmap);
1127 		if (error) {
1128 			device_printf(dev, "Unable to create RX head map\n");
1129 			break;
1130 		}
1131 		error = bus_dmamap_create(rxr->ptag,
1132 		    BUS_DMA_NOWAIT, &buf->pmap);
1133 		if (error) {
1134 			device_printf(dev, "Unable to create RX pkt map\n");
1135 			break;
1136 		}
1137 	}
1138 
1139 	return (error);
1140 }
1141 
1142 
1143 /*********************************************************************
1144  *
1145  *  (Re)Initialize the queue receive ring and its buffers.
1146  *
1147  **********************************************************************/
1148 int
1149 ixl_init_rx_ring(struct ixl_queue *que)
1150 {
1151 	struct	rx_ring 	*rxr = &que->rxr;
1152 	struct ixl_vsi		*vsi = que->vsi;
1153 #if defined(INET6) || defined(INET)
1154 	struct ifnet		*ifp = vsi->ifp;
1155 	struct lro_ctrl		*lro = &rxr->lro;
1156 #endif
1157 	struct ixl_rx_buf	*buf;
1158 	bus_dma_segment_t	pseg[1], hseg[1];
1159 	int			rsize, nsegs, error = 0;
1160 #ifdef DEV_NETMAP
1161 	struct netmap_adapter *na = NA(que->vsi->ifp);
1162 	struct netmap_slot *slot;
1163 #endif /* DEV_NETMAP */
1164 
1165 	IXL_RX_LOCK(rxr);
1166 #ifdef DEV_NETMAP
1167 	/* same as in ixl_init_tx_ring() */
1168 	slot = netmap_reset(na, NR_RX, que->me, 0);
1169 #endif /* DEV_NETMAP */
1170 	/* Clear the ring contents */
1171 	rsize = roundup2(que->num_desc *
1172 	    sizeof(union i40e_rx_desc), DBA_ALIGN);
1173 	bzero((void *)rxr->base, rsize);
1174 	/* Cleanup any existing buffers */
1175 	for (int i = 0; i < que->num_desc; i++) {
1176 		buf = &rxr->buffers[i];
1177 		if (buf->m_head != NULL) {
1178 			bus_dmamap_sync(rxr->htag, buf->hmap,
1179 			    BUS_DMASYNC_POSTREAD);
1180 			bus_dmamap_unload(rxr->htag, buf->hmap);
1181 			buf->m_head->m_flags |= M_PKTHDR;
1182 			m_freem(buf->m_head);
1183 		}
1184 		if (buf->m_pack != NULL) {
1185 			bus_dmamap_sync(rxr->ptag, buf->pmap,
1186 			    BUS_DMASYNC_POSTREAD);
1187 			bus_dmamap_unload(rxr->ptag, buf->pmap);
1188 			buf->m_pack->m_flags |= M_PKTHDR;
1189 			m_freem(buf->m_pack);
1190 		}
1191 		buf->m_head = NULL;
1192 		buf->m_pack = NULL;
1193 	}
1194 
1195 	/* header split is off */
1196 	rxr->hdr_split = FALSE;
1197 
1198 	/* Now replenish the mbufs */
1199 	for (int j = 0; j != que->num_desc; ++j) {
1200 		struct mbuf	*mh, *mp;
1201 
1202 		buf = &rxr->buffers[j];
1203 #ifdef DEV_NETMAP
1204 		/*
1205 		 * In netmap mode, fill the map and set the buffer
1206 		 * address in the NIC ring, considering the offset
1207 		 * between the netmap and NIC rings (see comment in
1208 		 * ixgbe_setup_transmit_ring() ). No need to allocate
1209 		 * an mbuf, so end the block with a continue;
1210 		 */
1211 		if (slot) {
1212 			int sj = netmap_idx_n2k(&na->rx_rings[que->me], j);
1213 			uint64_t paddr;
1214 			void *addr;
1215 
1216 			addr = PNMB(na, slot + sj, &paddr);
1217 			netmap_load_map(na, rxr->dma.tag, buf->pmap, addr);
1218 			/* Update descriptor and the cached value */
1219 			rxr->base[j].read.pkt_addr = htole64(paddr);
1220 			rxr->base[j].read.hdr_addr = 0;
1221 			continue;
1222 		}
1223 #endif /* DEV_NETMAP */
1224 		/*
1225 		** Don't allocate mbufs if we're not
1226 		** doing header split; it's wasteful.
1227 		*/
1228 		if (rxr->hdr_split == FALSE)
1229 			goto skip_head;
1230 
1231 		/* First the header */
1232 		buf->m_head = m_gethdr(M_NOWAIT, MT_DATA);
1233 		if (buf->m_head == NULL) {
1234 			error = ENOBUFS;
1235 			goto fail;
1236 		}
1237 		m_adj(buf->m_head, ETHER_ALIGN);
1238 		mh = buf->m_head;
1239 		mh->m_len = mh->m_pkthdr.len = MHLEN;
1240 		mh->m_flags |= M_PKTHDR;
1241 		/* Get the memory mapping */
1242 		error = bus_dmamap_load_mbuf_sg(rxr->htag,
1243 		    buf->hmap, buf->m_head, hseg,
1244 		    &nsegs, BUS_DMA_NOWAIT);
1245 		if (error != 0) /* Nothing elegant to do here */
1246 			goto fail;
1247 		bus_dmamap_sync(rxr->htag,
1248 		    buf->hmap, BUS_DMASYNC_PREREAD);
1249 		/* Update descriptor */
1250 		rxr->base[j].read.hdr_addr = htole64(hseg[0].ds_addr);
1251 
1252 skip_head:
1253 		/* Now the payload cluster */
1254 		buf->m_pack = m_getjcl(M_NOWAIT, MT_DATA,
1255 		    M_PKTHDR, rxr->mbuf_sz);
1256 		if (buf->m_pack == NULL) {
1257 			error = ENOBUFS;
1258                         goto fail;
1259 		}
1260 		mp = buf->m_pack;
1261 		mp->m_pkthdr.len = mp->m_len = rxr->mbuf_sz;
1262 		/* Get the memory mapping */
1263 		error = bus_dmamap_load_mbuf_sg(rxr->ptag,
1264 		    buf->pmap, mp, pseg,
1265 		    &nsegs, BUS_DMA_NOWAIT);
1266 		if (error != 0)
1267                         goto fail;
1268 		bus_dmamap_sync(rxr->ptag,
1269 		    buf->pmap, BUS_DMASYNC_PREREAD);
1270 		/* Update descriptor */
1271 		rxr->base[j].read.pkt_addr = htole64(pseg[0].ds_addr);
1272 		rxr->base[j].read.hdr_addr = 0;
1273 	}
1274 
1275 
1276 	/* Setup our descriptor indices */
1277 	rxr->next_check = 0;
1278 	rxr->next_refresh = 0;
1279 	rxr->lro_enabled = FALSE;
1280 	rxr->split = 0;
1281 	rxr->bytes = 0;
1282 	rxr->discard = FALSE;
1283 
1284 	wr32(vsi->hw, rxr->tail, que->num_desc - 1);
1285 	ixl_flush(vsi->hw);
1286 
1287 #if defined(INET6) || defined(INET)
1288 	/*
1289 	** Now set up the LRO interface:
1290 	*/
1291 	if (ifp->if_capenable & IFCAP_LRO) {
1292 		int err = tcp_lro_init(lro);
1293 		if (err) {
1294 			if_printf(ifp, "queue %d: LRO Initialization failed!\n", que->me);
1295 			goto fail;
1296 		}
1297 		INIT_DBG_IF(ifp, "queue %d: RX Soft LRO Initialized", que->me);
1298 		rxr->lro_enabled = TRUE;
1299 		lro->ifp = vsi->ifp;
1300 	}
1301 #endif
1302 
1303 	bus_dmamap_sync(rxr->dma.tag, rxr->dma.map,
1304 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
1305 
1306 fail:
1307 	IXL_RX_UNLOCK(rxr);
1308 	return (error);
1309 }
1310 
1311 
1312 /*********************************************************************
1313  *
1314  *  Free station receive ring data structures
1315  *
1316  **********************************************************************/
1317 void
1318 ixl_free_que_rx(struct ixl_queue *que)
1319 {
1320 	struct rx_ring		*rxr = &que->rxr;
1321 	struct ixl_rx_buf	*buf;
1322 
1323 	INIT_DBG_IF(que->vsi->ifp, "queue %d: begin", que->me);
1324 
1325 	/* Cleanup any existing buffers */
1326 	if (rxr->buffers != NULL) {
1327 		for (int i = 0; i < que->num_desc; i++) {
1328 			buf = &rxr->buffers[i];
1329 			if (buf->m_head != NULL) {
1330 				bus_dmamap_sync(rxr->htag, buf->hmap,
1331 				    BUS_DMASYNC_POSTREAD);
1332 				bus_dmamap_unload(rxr->htag, buf->hmap);
1333 				buf->m_head->m_flags |= M_PKTHDR;
1334 				m_freem(buf->m_head);
1335 			}
1336 			if (buf->m_pack != NULL) {
1337 				bus_dmamap_sync(rxr->ptag, buf->pmap,
1338 				    BUS_DMASYNC_POSTREAD);
1339 				bus_dmamap_unload(rxr->ptag, buf->pmap);
1340 				buf->m_pack->m_flags |= M_PKTHDR;
1341 				m_freem(buf->m_pack);
1342 			}
1343 			buf->m_head = NULL;
1344 			buf->m_pack = NULL;
1345 			if (buf->hmap != NULL) {
1346 				bus_dmamap_destroy(rxr->htag, buf->hmap);
1347 				buf->hmap = NULL;
1348 			}
1349 			if (buf->pmap != NULL) {
1350 				bus_dmamap_destroy(rxr->ptag, buf->pmap);
1351 				buf->pmap = NULL;
1352 			}
1353 		}
1354 		if (rxr->buffers != NULL) {
1355 			free(rxr->buffers, M_DEVBUF);
1356 			rxr->buffers = NULL;
1357 		}
1358 	}
1359 
1360 	if (rxr->htag != NULL) {
1361 		bus_dma_tag_destroy(rxr->htag);
1362 		rxr->htag = NULL;
1363 	}
1364 	if (rxr->ptag != NULL) {
1365 		bus_dma_tag_destroy(rxr->ptag);
1366 		rxr->ptag = NULL;
1367 	}
1368 
1369 	INIT_DBG_IF(que->vsi->ifp, "queue %d: end", que->me);
1370 	return;
1371 }
1372 
1373 static __inline void
1374 ixl_rx_input(struct rx_ring *rxr, struct ifnet *ifp, struct mbuf *m, u8 ptype)
1375 {
1376 
1377 #if defined(INET6) || defined(INET)
1378         /*
1379          * At the moment, LRO is only done for IPv4/TCP packets whose TCP
1380          * checksum has been computed by hardware. The packet also must not
1381          * have a VLAN tag in its Ethernet header.
1382          */
1383         if (rxr->lro_enabled &&
1384             (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) != 0 &&
1385             (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) ==
1386             (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) {
1387                 /*
1388                  * Send to the stack if:
1389                  **  - LRO not enabled, or
1390                  **  - no LRO resources, or
1391                  **  - lro enqueue fails
1392                  */
1393                 if (rxr->lro.lro_cnt != 0)
1394                         if (tcp_lro_rx(&rxr->lro, m, 0) == 0)
1395                                 return;
1396         }
1397 #endif
1398 	IXL_RX_UNLOCK(rxr);
1399         (*ifp->if_input)(ifp, m);
1400 	IXL_RX_LOCK(rxr);
1401 }
1402 
1403 
1404 static __inline void
1405 ixl_rx_discard(struct rx_ring *rxr, int i)
1406 {
1407 	struct ixl_rx_buf	*rbuf;
1408 
1409 	rbuf = &rxr->buffers[i];
1410 
1411         if (rbuf->fmp != NULL) {/* Partial chain ? */
1412 		rbuf->fmp->m_flags |= M_PKTHDR;
1413                 m_freem(rbuf->fmp);
1414                 rbuf->fmp = NULL;
1415 	}
1416 
1417 	/*
1418 	** With advanced descriptors the writeback
1419 	** clobbers the buffer addresses, so it's easier
1420 	** to just free the existing mbufs and take
1421 	** the normal refresh path to get new buffers
1422 	** and mapping.
1423 	*/
1424 	if (rbuf->m_head) {
1425 		m_free(rbuf->m_head);
1426 		rbuf->m_head = NULL;
1427 	}
1428 
1429 	if (rbuf->m_pack) {
1430 		m_free(rbuf->m_pack);
1431 		rbuf->m_pack = NULL;
1432 	}
1433 
1434 	return;
1435 }
1436 
1437 #ifdef RSS
1438 /*
1439 ** ixl_ptype_to_hash: parse the packet type
1440 ** to determine the appropriate hash.
1441 */
1442 static inline int
1443 ixl_ptype_to_hash(u8 ptype)
1444 {
1445         struct i40e_rx_ptype_decoded	decoded;
1446 	u8				ex = 0;
1447 
1448 	decoded = decode_rx_desc_ptype(ptype);
1449 	ex = decoded.outer_frag;
1450 
1451 	if (!decoded.known)
1452 		return M_HASHTYPE_OPAQUE;
1453 
1454 	if (decoded.outer_ip == I40E_RX_PTYPE_OUTER_L2)
1455 		return M_HASHTYPE_OPAQUE;
1456 
1457 	/* Note: anything that gets to this point is IP */
1458         if (decoded.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV6) {
1459 		switch (decoded.inner_prot) {
1460 			case I40E_RX_PTYPE_INNER_PROT_TCP:
1461 				if (ex)
1462 					return M_HASHTYPE_RSS_TCP_IPV6_EX;
1463 				else
1464 					return M_HASHTYPE_RSS_TCP_IPV6;
1465 			case I40E_RX_PTYPE_INNER_PROT_UDP:
1466 				if (ex)
1467 					return M_HASHTYPE_RSS_UDP_IPV6_EX;
1468 				else
1469 					return M_HASHTYPE_RSS_UDP_IPV6;
1470 			default:
1471 				if (ex)
1472 					return M_HASHTYPE_RSS_IPV6_EX;
1473 				else
1474 					return M_HASHTYPE_RSS_IPV6;
1475 		}
1476 	}
1477         if (decoded.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV4) {
1478 		switch (decoded.inner_prot) {
1479 			case I40E_RX_PTYPE_INNER_PROT_TCP:
1480 					return M_HASHTYPE_RSS_TCP_IPV4;
1481 			case I40E_RX_PTYPE_INNER_PROT_UDP:
1482 				if (ex)
1483 					return M_HASHTYPE_RSS_UDP_IPV4_EX;
1484 				else
1485 					return M_HASHTYPE_RSS_UDP_IPV4;
1486 			default:
1487 					return M_HASHTYPE_RSS_IPV4;
1488 		}
1489 	}
1490 	/* We should never get here!! */
1491 	return M_HASHTYPE_OPAQUE;
1492 }
1493 #endif /* RSS */
1494 
1495 /*********************************************************************
1496  *
1497  *  This routine executes in interrupt context. It replenishes
1498  *  the mbufs in the descriptor ring and sends data which has
1499  *  been DMA'd into host memory up to the upper layer.
1500  *
1501  *  We loop at most count times if count is > 0, or until done if
1502  *  count < 0.
1503  *
1504  *  Return TRUE for more work, FALSE for all clean.
1505  *********************************************************************/
1506 bool
1507 ixl_rxeof(struct ixl_queue *que, int count)
1508 {
1509 	struct ixl_vsi		*vsi = que->vsi;
1510 	struct rx_ring		*rxr = &que->rxr;
1511 	struct ifnet		*ifp = vsi->ifp;
1512 #if defined(INET6) || defined(INET)
1513 	struct lro_ctrl		*lro = &rxr->lro;
1514 	struct lro_entry	*queued;
1515 #endif
1516 	int			i, nextp, processed = 0;
1517 	union i40e_rx_desc	*cur;
1518 	struct ixl_rx_buf	*rbuf, *nbuf;
1519 
1520 
1521 	IXL_RX_LOCK(rxr);
1522 
1523 #ifdef DEV_NETMAP
1524 	if (netmap_rx_irq(ifp, que->me, &count)) {
1525 		IXL_RX_UNLOCK(rxr);
1526 		return (FALSE);
1527 	}
1528 #endif /* DEV_NETMAP */
1529 
1530 	for (i = rxr->next_check; count != 0;) {
1531 		struct mbuf	*sendmp, *mh, *mp;
1532 		u32		rsc, status, error;
1533 		u16		hlen, plen, vtag;
1534 		u64		qword;
1535 		u8		ptype;
1536 		bool		eop;
1537 
1538 		/* Sync the ring. */
1539 		bus_dmamap_sync(rxr->dma.tag, rxr->dma.map,
1540 		    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
1541 
1542 		cur = &rxr->base[i];
1543 		qword = le64toh(cur->wb.qword1.status_error_len);
1544 		status = (qword & I40E_RXD_QW1_STATUS_MASK)
1545 		    >> I40E_RXD_QW1_STATUS_SHIFT;
1546 		error = (qword & I40E_RXD_QW1_ERROR_MASK)
1547 		    >> I40E_RXD_QW1_ERROR_SHIFT;
1548 		plen = (qword & I40E_RXD_QW1_LENGTH_PBUF_MASK)
1549 		    >> I40E_RXD_QW1_LENGTH_PBUF_SHIFT;
1550 		hlen = (qword & I40E_RXD_QW1_LENGTH_HBUF_MASK)
1551 		    >> I40E_RXD_QW1_LENGTH_HBUF_SHIFT;
1552 		ptype = (qword & I40E_RXD_QW1_PTYPE_MASK)
1553 		    >> I40E_RXD_QW1_PTYPE_SHIFT;
1554 
1555 		if ((status & (1 << I40E_RX_DESC_STATUS_DD_SHIFT)) == 0) {
1556 			++rxr->not_done;
1557 			break;
1558 		}
1559 		if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
1560 			break;
1561 
1562 		count--;
1563 		sendmp = NULL;
1564 		nbuf = NULL;
1565 		rsc = 0;
1566 		cur->wb.qword1.status_error_len = 0;
1567 		rbuf = &rxr->buffers[i];
1568 		mh = rbuf->m_head;
1569 		mp = rbuf->m_pack;
1570 		eop = (status & (1 << I40E_RX_DESC_STATUS_EOF_SHIFT));
1571 		if (status & (1 << I40E_RX_DESC_STATUS_L2TAG1P_SHIFT))
1572 			vtag = le16toh(cur->wb.qword0.lo_dword.l2tag1);
1573 		else
1574 			vtag = 0;
1575 
1576 		/*
1577 		** Make sure bad packets are discarded;
1578 		** note that only the EOP descriptor has valid
1579 		** error results.
1580 		*/
1581                 if (eop && (error & (1 << I40E_RX_DESC_ERROR_RXE_SHIFT))) {
1582 			rxr->discarded++;
1583 			ixl_rx_discard(rxr, i);
1584 			goto next_desc;
1585 		}
1586 
1587 		/* Prefetch the next buffer */
1588 		if (!eop) {
1589 			nextp = i + 1;
1590 			if (nextp == que->num_desc)
1591 				nextp = 0;
1592 			nbuf = &rxr->buffers[nextp];
1593 			prefetch(nbuf);
1594 		}
1595 
1596 		/*
1597 		** The header mbuf is ONLY used when header
1598 		** split is enabled, otherwise we get normal
1599 		** behavior, i.e., both header and payload
1600 		** are DMA'd into the payload buffer.
1601 		**
1602 		** Rather than using the fmp/lmp global pointers
1603 		** we now keep the head of a packet chain in the
1604 		** buffer struct and pass this along from one
1605 		** descriptor to the next, until we get EOP.
1606 		*/
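		/*
		** Illustrative example (sizes are arbitrary): a
		** 9000-byte jumbo frame received into 2048-byte
		** clusters spans 5 descriptors; the first 4 carry
		** the chain head forward in rbuf->fmp and only the
		** 5th has EOP set.
		*/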
1607 		if (rxr->hdr_split && (rbuf->fmp == NULL)) {
1608 			if (hlen > IXL_RX_HDR)
1609 				hlen = IXL_RX_HDR;
1610 			mh->m_len = hlen;
1611 			mh->m_flags |= M_PKTHDR;
1612 			mh->m_next = NULL;
1613 			mh->m_pkthdr.len = mh->m_len;
1614 			/* Null buf pointer so it is refreshed */
1615 			rbuf->m_head = NULL;
1616 			/*
1617 			** Check the payload length; this
1618 			** could be zero if it's a small
1619 			** packet.
1620 			*/
1621 			if (plen > 0) {
1622 				mp->m_len = plen;
1623 				mp->m_next = NULL;
1624 				mp->m_flags &= ~M_PKTHDR;
1625 				mh->m_next = mp;
1626 				mh->m_pkthdr.len += mp->m_len;
1627 				/* Null buf pointer so it is refreshed */
1628 				rbuf->m_pack = NULL;
1629 				rxr->split++;
1630 			}
1631 			/*
1632 			** Now create the forward
1633 			** chain so that when the packet
1634 			** completes we won't have to.
1635 			*/
1636                         if (eop == 0) {
1637 				/* stash the chain head */
1638                                 nbuf->fmp = mh;
1639 				/* Make forward chain */
1640                                 if (plen)
1641                                         mp->m_next = nbuf->m_pack;
1642                                 else
1643                                         mh->m_next = nbuf->m_pack;
1644                         } else {
1645 				/* Singlet, prepare to send */
1646                                 sendmp = mh;
1647                                 if (vtag) {
1648                                         sendmp->m_pkthdr.ether_vtag = vtag;
1649                                         sendmp->m_flags |= M_VLANTAG;
1650                                 }
1651                         }
1652 		} else {
1653 			/*
1654 			** Either no header split, or a
1655 			** secondary piece of a fragmented
1656 			** split packet.
1657 			*/
1658 			mp->m_len = plen;
1659 			/*
1660 			** See if there is a stored chain head from a
1661 			** previous descriptor that determines what this is
1662 			*/
1663 			sendmp = rbuf->fmp;
1664 			rbuf->m_pack = rbuf->fmp = NULL;
1665 
1666 			if (sendmp != NULL) /* secondary frag */
1667 				sendmp->m_pkthdr.len += mp->m_len;
1668 			else {
1669 				/* first desc of a non-ps chain */
1670 				sendmp = mp;
1671 				sendmp->m_flags |= M_PKTHDR;
1672 				sendmp->m_pkthdr.len = mp->m_len;
1673 				if (vtag) {
1674 					sendmp->m_pkthdr.ether_vtag = vtag;
1675 					sendmp->m_flags |= M_VLANTAG;
1676 				}
1677                         }
1678 			/* Pass the head pointer on */
1679 			if (eop == 0) {
1680 				nbuf->fmp = sendmp;
1681 				sendmp = NULL;
1682 				mp->m_next = nbuf->m_pack;
1683 			}
1684 		}
1685 		++processed;
1686 		/* Sending this frame? */
1687 		if (eop) {
1688 			sendmp->m_pkthdr.rcvif = ifp;
1689 			/* gather stats */
1690 			rxr->rx_packets++;
1691 			rxr->rx_bytes += sendmp->m_pkthdr.len;
1692 			/* capture data for dynamic ITR adjustment */
1693 			rxr->packets++;
1694 			rxr->bytes += sendmp->m_pkthdr.len;
1695 			if ((ifp->if_capenable & IFCAP_RXCSUM) != 0)
1696 				ixl_rx_checksum(sendmp, status, error, ptype);
1697 #ifdef RSS
1698 			sendmp->m_pkthdr.flowid =
1699 			    le32toh(cur->wb.qword0.hi_dword.rss);
1700 			M_HASHTYPE_SET(sendmp, ixl_ptype_to_hash(ptype));
1701 #else
1702 			sendmp->m_pkthdr.flowid = que->msix;
1703 			M_HASHTYPE_SET(sendmp, M_HASHTYPE_OPAQUE);
1704 #endif
1705 		}
1706 next_desc:
1707 		bus_dmamap_sync(rxr->dma.tag, rxr->dma.map,
1708 		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
1709 
1710 		/* Advance our pointers to the next descriptor. */
1711 		if (++i == que->num_desc)
1712 			i = 0;
1713 
1714 		/* Now send to the stack or do LRO */
1715 		if (sendmp != NULL) {
1716 			rxr->next_check = i;
1717 			ixl_rx_input(rxr, ifp, sendmp, ptype);
1718 			i = rxr->next_check;
1719 		}
1720 
1721                /* Every 8 descriptors we go to refresh mbufs */
1722 		if (processed == 8) {
1723 			ixl_refresh_mbufs(que, i);
1724 			processed = 0;
1725 		}
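		/*
		 * Refreshing in batches of 8 amortizes the tail
		 * register write done in ixl_refresh_mbufs().
		 */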
1726 	}
1727 
1728 	/* Refresh any remaining buf structs */
1729 	if (ixl_rx_unrefreshed(que))
1730 		ixl_refresh_mbufs(que, i);
1731 
1732 	rxr->next_check = i;
1733 
1734 #if defined(INET6) || defined(INET)
1735 	/*
1736 	 * Flush any outstanding LRO work
1737 	 */
1738 	while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) {
1739 		SLIST_REMOVE_HEAD(&lro->lro_active, next);
1740 		tcp_lro_flush(lro, queued);
1741 	}
1742 #endif
1743 
1744 	IXL_RX_UNLOCK(rxr);
1745 	return (FALSE);
1746 }
1747 
1748 
1749 /*********************************************************************
1750  *
1751  *  Verify that the hardware indicated that the checksum is valid.
1752  *  Inform the stack about the status of the checksum so that the
1753  *  stack doesn't spend time verifying the checksum itself.
1754  *
1755  *********************************************************************/
1756 static void
1757 ixl_rx_checksum(struct mbuf * mp, u32 status, u32 error, u8 ptype)
1758 {
1759 	struct i40e_rx_ptype_decoded decoded;
1760 
1761 	decoded = decode_rx_desc_ptype(ptype);
1762 
1763 	/* Errors? */
1764  	if (error & ((1 << I40E_RX_DESC_ERROR_IPE_SHIFT) |
1765 	    (1 << I40E_RX_DESC_ERROR_L4E_SHIFT))) {
1766 		mp->m_pkthdr.csum_flags = 0;
1767 		return;
1768 	}
1769 
1770 	/* IPv6 packets with extension headers likely have a bad csum */
1771 	if (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
1772 	    decoded.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV6)
1773 		if (status &
1774 		    (1 << I40E_RX_DESC_STATUS_IPV6EXADD_SHIFT)) {
1775 			mp->m_pkthdr.csum_flags = 0;
1776 			return;
1777 		}
1778 
1779 
1780 	/* IP Checksum Good */
1781 	mp->m_pkthdr.csum_flags = CSUM_IP_CHECKED;
1782 	mp->m_pkthdr.csum_flags |= CSUM_IP_VALID;
1783 
1784 	if (status & (1 << I40E_RX_DESC_STATUS_L3L4P_SHIFT)) {
1785 		mp->m_pkthdr.csum_flags |=
1786 		    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
1787 		mp->m_pkthdr.csum_data |= htons(0xffff);
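		/*
		 * CSUM_DATA_VALID | CSUM_PSEUDO_HDR with a csum_data
		 * of 0xffff is the conventional way to tell the stack
		 * that the L4 checksum was verified in full, so it
		 * can skip its own check.
		 */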
1788 	}
1789 	return;
1790 }
1791 
1792 #if __FreeBSD_version >= 1100000
1793 uint64_t
1794 ixl_get_counter(if_t ifp, ift_counter cnt)
1795 {
1796 	struct ixl_vsi *vsi;
1797 
1798 	vsi = if_getsoftc(ifp);
1799 
1800 	switch (cnt) {
1801 	case IFCOUNTER_IPACKETS:
1802 		return (vsi->ipackets);
1803 	case IFCOUNTER_IERRORS:
1804 		return (vsi->ierrors);
1805 	case IFCOUNTER_OPACKETS:
1806 		return (vsi->opackets);
1807 	case IFCOUNTER_OERRORS:
1808 		return (vsi->oerrors);
1809 	case IFCOUNTER_COLLISIONS:
1810 		/* Collisions are by standard impossible in 40G/10G Ethernet */
1811 		return (0);
1812 	case IFCOUNTER_IBYTES:
1813 		return (vsi->ibytes);
1814 	case IFCOUNTER_OBYTES:
1815 		return (vsi->obytes);
1816 	case IFCOUNTER_IMCASTS:
1817 		return (vsi->imcasts);
1818 	case IFCOUNTER_OMCASTS:
1819 		return (vsi->omcasts);
1820 	case IFCOUNTER_IQDROPS:
1821 		return (vsi->iqdrops);
1822 	case IFCOUNTER_OQDROPS:
1823 		return (vsi->oqdrops);
1824 	case IFCOUNTER_NOPROTO:
1825 		return (vsi->noproto);
1826 	default:
1827 		return (if_get_counter_default(ifp, cnt));
1828 	}
1829 }
1830 #endif
1831 
1832