xref: /freebsd/sys/dev/ixl/ixl_txrx.c (revision 01b792f1f535c12a1a14000cf3360ef6c36cee2d)
1 /******************************************************************************
2 
3   Copyright (c) 2013-2015, Intel Corporation
4   All rights reserved.
5 
6   Redistribution and use in source and binary forms, with or without
7   modification, are permitted provided that the following conditions are met:
8 
9    1. Redistributions of source code must retain the above copyright notice,
10       this list of conditions and the following disclaimer.
11 
12    2. Redistributions in binary form must reproduce the above copyright
13       notice, this list of conditions and the following disclaimer in the
14       documentation and/or other materials provided with the distribution.
15 
16    3. Neither the name of the Intel Corporation nor the names of its
17       contributors may be used to endorse or promote products derived from
18       this software without specific prior written permission.
19 
20   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
24   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30   POSSIBILITY OF SUCH DAMAGE.
31 
32 ******************************************************************************/
33 /*$FreeBSD$*/
34 
35 /*
36 **	IXL driver TX/RX Routines:
37 **	    This was seperated to allow usage by
38 **	    This was separated to allow usage by
39 */
40 
41 #ifndef IXL_STANDALONE_BUILD
42 #include "opt_inet.h"
43 #include "opt_inet6.h"
44 #include "opt_rss.h"
45 #endif
46 
47 #include "ixl.h"
48 
49 #ifdef RSS
50 #include <net/rss_config.h>
51 #endif
52 
53 /* Local Prototypes */
54 static void	ixl_rx_checksum(struct mbuf *, u32, u32, u8);
55 static void	ixl_refresh_mbufs(struct ixl_queue *, int);
56 static int      ixl_xmit(struct ixl_queue *, struct mbuf **);
57 static int	ixl_tx_setup_offload(struct ixl_queue *,
58 		    struct mbuf *, u32 *, u32 *);
59 static bool	ixl_tso_setup(struct ixl_queue *, struct mbuf *);
60 
61 static __inline void ixl_rx_discard(struct rx_ring *, int);
62 static __inline void ixl_rx_input(struct rx_ring *, struct ifnet *,
63 		    struct mbuf *, u8);
64 
65 #ifdef DEV_NETMAP
66 #include <dev/netmap/if_ixl_netmap.h>
67 #endif /* DEV_NETMAP */
68 
69 /*
70 ** Multiqueue Transmit driver
71 **
72 */
73 int
74 ixl_mq_start(struct ifnet *ifp, struct mbuf *m)
75 {
76 	struct ixl_vsi		*vsi = ifp->if_softc;
77 	struct ixl_queue	*que;
78 	struct tx_ring		*txr;
79 	int 			err, i;
80 #ifdef RSS
81 	u32			bucket_id;
82 #endif
83 
84 	/*
85 	** Which queue to use:
86 	**
87 	** When doing RSS, map it to the same outbound
88 	** queue as the incoming flow would be mapped to.
89 	** If everything is set up correctly, it should be
90 	** the same bucket as the CPU we are currently on.
91 	*/
92 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
93 #ifdef  RSS
94 		if (rss_hash2bucket(m->m_pkthdr.flowid,
95 		    M_HASHTYPE_GET(m), &bucket_id) == 0) {
96 			i = bucket_id % vsi->num_queues;
97                 } else
98 #endif
99                         i = m->m_pkthdr.flowid % vsi->num_queues;
100         } else
101 		i = curcpu % vsi->num_queues;
102 	/*
103 	** This may not be perfect, but until something
104 	** better comes along it keeps us from scheduling
105 	** on stalled queues.
106 	*/
107 	if (((1 << i) & vsi->active_queues) == 0)
108 		i = ffsl(vsi->active_queues);
109 
110 	que = &vsi->queues[i];
111 	txr = &que->txr;
112 
113 	err = drbr_enqueue(ifp, txr->br, m);
114 	if (err)
115 		return(err);
116 	if (IXL_TX_TRYLOCK(txr)) {
117 		ixl_mq_start_locked(ifp, txr);
118 		IXL_TX_UNLOCK(txr);
119 	} else
120 		taskqueue_enqueue(que->tq, &que->tx_task);
121 
122 	return (0);
123 }
124 
125 int
126 ixl_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr)
127 {
128 	struct ixl_queue	*que = txr->que;
129 	struct ixl_vsi		*vsi = que->vsi;
130         struct mbuf		*next;
131         int			err = 0;
132 
133 
134 	if (((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) ||
135 	    vsi->link_active == 0)
136 		return (ENETDOWN);
137 
138 	/* Process the transmit queue */
139 	while ((next = drbr_peek(ifp, txr->br)) != NULL) {
140 		if ((err = ixl_xmit(que, &next)) != 0) {
141 			if (next == NULL)
142 				drbr_advance(ifp, txr->br);
143 			else
144 				drbr_putback(ifp, txr->br, next);
145 			break;
146 		}
147 		drbr_advance(ifp, txr->br);
148 		/* Send a copy of the frame to the BPF listener */
149 		ETHER_BPF_MTAP(ifp, next);
150 		if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
151 			break;
152 	}
153 
154 	if (txr->avail < IXL_TX_CLEANUP_THRESHOLD)
155 		ixl_txeof(que);
156 
157 	return (err);
158 }
159 
160 /*
161  * Called from a taskqueue to drain queued transmit packets.
162  */
163 void
164 ixl_deferred_mq_start(void *arg, int pending)
165 {
166 	struct ixl_queue	*que = arg;
167         struct tx_ring		*txr = &que->txr;
168 	struct ixl_vsi		*vsi = que->vsi;
169         struct ifnet		*ifp = vsi->ifp;
170 
171 	IXL_TX_LOCK(txr);
172 	if (!drbr_empty(ifp, txr->br))
173 		ixl_mq_start_locked(ifp, txr);
174 	IXL_TX_UNLOCK(txr);
175 }
176 
177 /*
178 ** Flush all queue ring buffers
179 */
180 void
181 ixl_qflush(struct ifnet *ifp)
182 {
183 	struct ixl_vsi	*vsi = ifp->if_softc;
184 
185         for (int i = 0; i < vsi->num_queues; i++) {
186 		struct ixl_queue *que = &vsi->queues[i];
187 		struct tx_ring	*txr = &que->txr;
188 		struct mbuf	*m;
189 		IXL_TX_LOCK(txr);
190 		while ((m = buf_ring_dequeue_sc(txr->br)) != NULL)
191 			m_freem(m);
192 		IXL_TX_UNLOCK(txr);
193 	}
194 	if_qflush(ifp);
195 }
196 
197 /*
198 ** Find mbuf chains passed to the driver
199 ** that are 'sparse': ones using more than
200 ** IXL_SPARSE_CHAIN mbufs to deliver an MSS-sized chunk of data.
201 */
202 static inline bool
203 ixl_tso_detect_sparse(struct mbuf *mp)
204 {
205 	struct mbuf	*m;
206 	int		num = 0, mss;
207 	bool		ret = FALSE;
208 
209 	mss = mp->m_pkthdr.tso_segsz;
210 	for (m = mp->m_next; m != NULL; m = m->m_next) {
211 		num++;
212 		mss -= m->m_len;
213 		if (mss < 1)
214 			break;
215 		if (m->m_next == NULL)
216 			break;
217 	}
218 	if (num > IXL_SPARSE_CHAIN)
219 		ret = TRUE;
220 
221 	return (ret);
222 }
223 
224 
225 /*********************************************************************
226  *
227  *  This routine maps the mbufs to tx descriptors, allowing the
228  *  TX engine to transmit the packets.
229  *  	- return 0 on success, positive on failure
230  *
231  **********************************************************************/
232 #define IXL_TXD_CMD (I40E_TX_DESC_CMD_EOP | I40E_TX_DESC_CMD_RS)
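/*
** EOP marks the last data descriptor of a frame and RS asks the
** hardware to report (write back) completion for it; ixl_xmit()
** ORs IXL_TXD_CMD into only the final descriptor of each packet,
** e.g. a tagged frame's last descriptor ends up carrying
** I40E_TX_DESC_CMD_ICRC | I40E_TX_DESC_CMD_IL2TAG1 | IXL_TXD_CMD.
*/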
233 
234 static int
235 ixl_xmit(struct ixl_queue *que, struct mbuf **m_headp)
236 {
237 	struct ixl_vsi		*vsi = que->vsi;
238 	struct i40e_hw		*hw = vsi->hw;
239 	struct tx_ring		*txr = &que->txr;
240 	struct ixl_tx_buf	*buf;
241 	struct i40e_tx_desc	*txd = NULL;
242 	struct mbuf		*m_head, *m;
243 	int             	i, j, error, nsegs, maxsegs;
244 	int			first, last = 0;
245 	u16			vtag = 0;
246 	u32			cmd, off;
247 	bus_dmamap_t		map;
248 	bus_dma_tag_t		tag;
249 	bus_dma_segment_t	segs[IXL_MAX_TSO_SEGS];
250 
251 
252 	cmd = off = 0;
253 	m_head = *m_headp;
254 
255         /*
256          * Important to capture the first descriptor
257          * used because it will contain the index of
258          * the one we tell the hardware to report back
259          */
260         first = txr->next_avail;
261 	buf = &txr->buffers[first];
262 	map = buf->map;
263 	tag = txr->tx_tag;
264 	maxsegs = IXL_MAX_TX_SEGS;
265 
266 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
267 		/* Use larger mapping for TSO */
268 		tag = txr->tso_tag;
269 		maxsegs = IXL_MAX_TSO_SEGS;
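		/*
		** Chains that are too sparse (more than IXL_SPARSE_CHAIN
		** mbufs carrying a single MSS worth of data) are
		** linearized with m_defrag() before mapping.
		*/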
270 		if (ixl_tso_detect_sparse(m_head)) {
271 			m = m_defrag(m_head, M_NOWAIT);
272 			if (m == NULL) {
273 				m_freem(*m_headp);
274 				*m_headp = NULL;
275 				return (ENOBUFS);
276 			}
277 			*m_headp = m;
278 		}
279 	}
280 
281 	/*
282 	 * Map the packet for DMA.
283 	 */
284 	error = bus_dmamap_load_mbuf_sg(tag, map,
285 	    *m_headp, segs, &nsegs, BUS_DMA_NOWAIT);
286 
287 	if (error == EFBIG) {
288 		struct mbuf *m;
289 
290 		m = m_collapse(*m_headp, M_NOWAIT, maxsegs);
291 		if (m == NULL) {
292 			que->mbuf_defrag_failed++;
293 			m_freem(*m_headp);
294 			*m_headp = NULL;
295 			return (ENOBUFS);
296 		}
297 		*m_headp = m;
298 
299 		/* Try it again */
300 		error = bus_dmamap_load_mbuf_sg(tag, map,
301 		    *m_headp, segs, &nsegs, BUS_DMA_NOWAIT);
302 
303 		if (error == ENOMEM) {
304 			que->tx_dma_setup++;
305 			return (error);
306 		} else if (error != 0) {
307 			que->tx_dma_setup++;
308 			m_freem(*m_headp);
309 			*m_headp = NULL;
310 			return (error);
311 		}
312 	} else if (error == ENOMEM) {
313 		que->tx_dma_setup++;
314 		return (error);
315 	} else if (error != 0) {
316 		que->tx_dma_setup++;
317 		m_freem(*m_headp);
318 		*m_headp = NULL;
319 		return (error);
320 	}
321 
322 	/* Make certain there are enough descriptors */
323 	if (nsegs > txr->avail - 2) {
324 		txr->no_desc++;
325 		error = ENOBUFS;
326 		goto xmit_fail;
327 	}
328 	m_head = *m_headp;
329 
330 	/* Set up the TSO/CSUM offload */
331 	if (m_head->m_pkthdr.csum_flags & CSUM_OFFLOAD) {
332 		error = ixl_tx_setup_offload(que, m_head, &cmd, &off);
333 		if (error)
334 			goto xmit_fail;
335 	}
336 
337 	cmd |= I40E_TX_DESC_CMD_ICRC;
338 	/* Grab the VLAN tag */
339 	if (m_head->m_flags & M_VLANTAG) {
340 		cmd |= I40E_TX_DESC_CMD_IL2TAG1;
341 		vtag = htole16(m_head->m_pkthdr.ether_vtag);
342 	}
343 
344 	i = txr->next_avail;
345 	for (j = 0; j < nsegs; j++) {
346 		bus_size_t seglen;
347 
348 		buf = &txr->buffers[i];
349 		buf->tag = tag; /* Keep track of the type tag */
350 		txd = &txr->base[i];
351 		seglen = segs[j].ds_len;
352 
353 		txd->buffer_addr = htole64(segs[j].ds_addr);
354 		txd->cmd_type_offset_bsz =
355 		    htole64(I40E_TX_DESC_DTYPE_DATA
356 		    | ((u64)cmd  << I40E_TXD_QW1_CMD_SHIFT)
357 		    | ((u64)off << I40E_TXD_QW1_OFFSET_SHIFT)
358 		    | ((u64)seglen  << I40E_TXD_QW1_TX_BUF_SZ_SHIFT)
359 		    | ((u64)vtag  << I40E_TXD_QW1_L2TAG1_SHIFT));
360 
361 		last = i; /* descriptor that will get completion IRQ */
362 
363 		if (++i == que->num_desc)
364 			i = 0;
365 
366 		buf->m_head = NULL;
367 		buf->eop_index = -1;
368 	}
369 	/* Set the last descriptor for report */
370 	txd->cmd_type_offset_bsz |=
371 	    htole64(((u64)IXL_TXD_CMD << I40E_TXD_QW1_CMD_SHIFT));
372 	txr->avail -= nsegs;
373 	txr->next_avail = i;
374 
375 	buf->m_head = m_head;
376 	/* Swap the dma map between the first and last descriptor */
377 	txr->buffers[first].map = buf->map;
378 	buf->map = map;
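	/*
	** The packet was loaded on the first buffer's map, but the mbuf
	** pointer lives on the last buffer; swapping the maps keeps the
	** map with the mbuf so ixl_txeof() can unload and free both
	** from the same tx_buf.
	*/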
379 	bus_dmamap_sync(tag, map, BUS_DMASYNC_PREWRITE);
380 
381         /* Set the index of the descriptor that will be marked done */
382         buf = &txr->buffers[first];
383 	buf->eop_index = last;
384 
385         bus_dmamap_sync(txr->dma.tag, txr->dma.map,
386             BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
387 	/*
388 	 * Advance the Transmit Descriptor Tail (TDT); this tells the
389 	 * hardware that this frame is available to transmit.
390 	 */
391 	++txr->total_packets;
392 	wr32(hw, txr->tail, i);
393 
394 	ixl_flush(hw);
395 	/* Mark outstanding work */
396 	if (que->busy == 0)
397 		que->busy = 1;
398 	return (0);
399 
400 xmit_fail:
401 	bus_dmamap_unload(tag, buf->map);
402 	return (error);
403 }
404 
405 
406 /*********************************************************************
407  *
408  *  Allocate memory for tx_buffer structures. The tx_buffer stores all
409  *  the information needed to transmit a packet on the wire. This is
410 *  called only once at attach; setup is done on every reset.
411  *
412  **********************************************************************/
413 int
414 ixl_allocate_tx_data(struct ixl_queue *que)
415 {
416 	struct tx_ring		*txr = &que->txr;
417 	struct ixl_vsi		*vsi = que->vsi;
418 	device_t		dev = vsi->dev;
419 	struct ixl_tx_buf	*buf;
420 	int			error = 0;
421 
422 	/*
423 	 * Setup DMA descriptor areas.
424 	 */
425 	if ((error = bus_dma_tag_create(NULL,		/* parent */
426 			       1, 0,			/* alignment, bounds */
427 			       BUS_SPACE_MAXADDR,	/* lowaddr */
428 			       BUS_SPACE_MAXADDR,	/* highaddr */
429 			       NULL, NULL,		/* filter, filterarg */
430 			       IXL_TSO_SIZE,		/* maxsize */
431 			       IXL_MAX_TX_SEGS,		/* nsegments */
432 			       PAGE_SIZE,		/* maxsegsize */
433 			       0,			/* flags */
434 			       NULL,			/* lockfunc */
435 			       NULL,			/* lockfuncarg */
436 			       &txr->tx_tag))) {
437 		device_printf(dev,"Unable to allocate TX DMA tag\n");
438 		goto fail;
439 	}
440 
441 	/* Make a special tag for TSO */
442 	if ((error = bus_dma_tag_create(NULL,		/* parent */
443 			       1, 0,			/* alignment, bounds */
444 			       BUS_SPACE_MAXADDR,	/* lowaddr */
445 			       BUS_SPACE_MAXADDR,	/* highaddr */
446 			       NULL, NULL,		/* filter, filterarg */
447 			       IXL_TSO_SIZE,		/* maxsize */
448 			       IXL_MAX_TSO_SEGS,	/* nsegments */
449 			       PAGE_SIZE,		/* maxsegsize */
450 			       0,			/* flags */
451 			       NULL,			/* lockfunc */
452 			       NULL,			/* lockfuncarg */
453 			       &txr->tso_tag))) {
454 		device_printf(dev,"Unable to allocate TX TSO DMA tag\n");
455 		goto fail;
456 	}
457 
458 	if (!(txr->buffers =
459 	    (struct ixl_tx_buf *) malloc(sizeof(struct ixl_tx_buf) *
460 	    que->num_desc, M_DEVBUF, M_NOWAIT | M_ZERO))) {
461 		device_printf(dev, "Unable to allocate tx_buffer memory\n");
462 		error = ENOMEM;
463 		goto fail;
464 	}
465 
466         /* Create the descriptor buffer default dma maps */
467 	buf = txr->buffers;
468 	for (int i = 0; i < que->num_desc; i++, buf++) {
469 		buf->tag = txr->tx_tag;
470 		error = bus_dmamap_create(buf->tag, 0, &buf->map);
471 		if (error != 0) {
472 			device_printf(dev, "Unable to create TX DMA map\n");
473 			goto fail;
474 		}
475 	}
476 fail:
477 	return (error);
478 }
479 
480 
481 /*********************************************************************
482  *
483  *  (Re)Initialize a queue transmit ring.
484  *	- called by init, it clears the descriptor ring,
485  *	  and frees any stale mbufs
486  *
487  **********************************************************************/
488 void
489 ixl_init_tx_ring(struct ixl_queue *que)
490 {
491 	struct tx_ring *txr = &que->txr;
492 	struct ixl_tx_buf *buf;
493 #ifdef DEV_NETMAP
494 	struct netmap_adapter *na = NA(que->vsi->ifp);
495 	struct netmap_slot *slot;
496 #endif /* DEV_NETMAP */
497 
498 	/* Clear the old ring contents */
499 	IXL_TX_LOCK(txr);
500 #ifdef DEV_NETMAP
501 	/*
502 	 * (under lock): if in netmap mode, do some consistency
503 	 * checks and set slot to entry 0 of the netmap ring.
504 	 */
505 	slot = netmap_reset(na, NR_TX, que->me, 0);
506 #endif /* DEV_NETMAP */
507 
508 	bzero((void *)txr->base,
509 	      (sizeof(struct i40e_tx_desc)) * que->num_desc);
510 
511 	/* Reset indices */
512 	txr->next_avail = 0;
513 	txr->next_to_clean = 0;
514 
515 #ifdef IXL_FDIR
516 	/* Initialize flow director */
517 	txr->atr_rate = ixl_atr_rate;
518 	txr->atr_count = 0;
519 #endif
520 
521 	/* Free any existing tx mbufs. */
522         buf = txr->buffers;
523 	for (int i = 0; i < que->num_desc; i++, buf++) {
524 		if (buf->m_head != NULL) {
525 			bus_dmamap_sync(buf->tag, buf->map,
526 			    BUS_DMASYNC_POSTWRITE);
527 			bus_dmamap_unload(buf->tag, buf->map);
528 			m_freem(buf->m_head);
529 			buf->m_head = NULL;
530 		}
531 #ifdef DEV_NETMAP
532 		/*
533 		 * In netmap mode, set the map for the packet buffer.
534 		 * NOTE: Some drivers (not this one) also need to set
535 		 * the physical buffer address in the NIC ring.
536 		 * netmap_idx_n2k() maps a nic index, i, into the corresponding
537 		 * netmap slot index, si
538 		 */
539 		if (slot) {
540 			int si = netmap_idx_n2k(&na->tx_rings[que->me], i);
541 			netmap_load_map(na, buf->tag, buf->map, NMB(na, slot + si));
542 		}
543 #endif /* DEV_NETMAP */
544 		/* Clear the EOP index */
545 		buf->eop_index = -1;
546         }
547 
548 	/* Set number of descriptors available */
549 	txr->avail = que->num_desc;
550 
551 	bus_dmamap_sync(txr->dma.tag, txr->dma.map,
552 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
553 	IXL_TX_UNLOCK(txr);
554 }
555 
556 
557 /*********************************************************************
558  *
559  *  Free transmit ring related data structures.
560  *
561  **********************************************************************/
562 void
563 ixl_free_que_tx(struct ixl_queue *que)
564 {
565 	struct tx_ring *txr = &que->txr;
566 	struct ixl_tx_buf *buf;
567 
568 	INIT_DBG_IF(que->vsi->ifp, "queue %d: begin", que->me);
569 
570 	for (int i = 0; i < que->num_desc; i++) {
571 		buf = &txr->buffers[i];
572 		if (buf->m_head != NULL) {
573 			bus_dmamap_sync(buf->tag, buf->map,
574 			    BUS_DMASYNC_POSTWRITE);
575 			bus_dmamap_unload(buf->tag,
576 			    buf->map);
577 			m_freem(buf->m_head);
578 			buf->m_head = NULL;
579 			if (buf->map != NULL) {
580 				bus_dmamap_destroy(buf->tag,
581 				    buf->map);
582 				buf->map = NULL;
583 			}
584 		} else if (buf->map != NULL) {
585 			bus_dmamap_unload(buf->tag,
586 			    buf->map);
587 			bus_dmamap_destroy(buf->tag,
588 			    buf->map);
589 			buf->map = NULL;
590 		}
591 	}
592 	if (txr->br != NULL)
593 		buf_ring_free(txr->br, M_DEVBUF);
594 	if (txr->buffers != NULL) {
595 		free(txr->buffers, M_DEVBUF);
596 		txr->buffers = NULL;
597 	}
598 	if (txr->tx_tag != NULL) {
599 		bus_dma_tag_destroy(txr->tx_tag);
600 		txr->tx_tag = NULL;
601 	}
602 	if (txr->tso_tag != NULL) {
603 		bus_dma_tag_destroy(txr->tso_tag);
604 		txr->tso_tag = NULL;
605 	}
606 
607 	INIT_DBG_IF(que->vsi->ifp, "queue %d: end", que->me);
608 	return;
609 }
610 
611 /*********************************************************************
612  *
613  *  Setup descriptor for hw offloads
614  *
615  **********************************************************************/
616 
617 static int
618 ixl_tx_setup_offload(struct ixl_queue *que,
619     struct mbuf *mp, u32 *cmd, u32 *off)
620 {
621 	struct ether_vlan_header	*eh;
622 #ifdef INET
623 	struct ip			*ip = NULL;
624 #endif
625 	struct tcphdr			*th = NULL;
626 #ifdef INET6
627 	struct ip6_hdr			*ip6;
628 #endif
629 	int				elen, ip_hlen = 0, tcp_hlen;
630 	u16				etype;
631 	u8				ipproto = 0;
632 	bool				tso = FALSE;
633 
634 
635 	/* Set up the TSO context descriptor if required */
636 	if (mp->m_pkthdr.csum_flags & CSUM_TSO) {
637 		tso = ixl_tso_setup(que, mp);
638 		if (tso)
639 			++que->tso;
640 		else
641 			return (ENXIO);
642 	}
643 
644 	/*
645 	 * Determine where frame payload starts.
646 	 * Jump over vlan headers if already present,
647 	 * helpful for QinQ too.
648 	 */
649 	eh = mtod(mp, struct ether_vlan_header *);
650 	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
651 		etype = ntohs(eh->evl_proto);
652 		elen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
653 	} else {
654 		etype = ntohs(eh->evl_encap_proto);
655 		elen = ETHER_HDR_LEN;
656 	}
657 
658 	switch (etype) {
659 #ifdef INET
660 		case ETHERTYPE_IP:
661 			ip = (struct ip *)(mp->m_data + elen);
662 			ip_hlen = ip->ip_hl << 2;
663 			ipproto = ip->ip_p;
664 			th = (struct tcphdr *)((caddr_t)ip + ip_hlen);
665 			/* The IP checksum must be recalculated with TSO */
666 			if (tso)
667 				*cmd |= I40E_TX_DESC_CMD_IIPT_IPV4_CSUM;
668 			else
669 				*cmd |= I40E_TX_DESC_CMD_IIPT_IPV4;
670 			break;
671 #endif
672 #ifdef INET6
673 		case ETHERTYPE_IPV6:
674 			ip6 = (struct ip6_hdr *)(mp->m_data + elen);
675 			ip_hlen = sizeof(struct ip6_hdr);
676 			ipproto = ip6->ip6_nxt;
677 			th = (struct tcphdr *)((caddr_t)ip6 + ip_hlen);
678 			*cmd |= I40E_TX_DESC_CMD_IIPT_IPV6;
679 			break;
680 #endif
681 		default:
682 			break;
683 	}
684 
685 	*off |= (elen >> 1) << I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
686 	*off |= (ip_hlen >> 2) << I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
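	/*
	** The offset fields above are in hardware units: MACLEN is in
	** 2-byte words (hence elen >> 1), while IPLEN here and L4LEN
	** below are in 4-byte words (hence the >> 2 shifts).
	*/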
687 
688 	switch (ipproto) {
689 		case IPPROTO_TCP:
690 			tcp_hlen = th->th_off << 2;
691 			if (mp->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_TCP_IPV6)) {
692 				*cmd |= I40E_TX_DESC_CMD_L4T_EOFT_TCP;
693 				*off |= (tcp_hlen >> 2) <<
694 				    I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
695 			}
696 #ifdef IXL_FDIR
697 			ixl_atr(que, th, etype);
698 #endif
699 			break;
700 		case IPPROTO_UDP:
701 			if (mp->m_pkthdr.csum_flags & (CSUM_UDP|CSUM_UDP_IPV6)) {
702 				*cmd |= I40E_TX_DESC_CMD_L4T_EOFT_UDP;
703 				*off |= (sizeof(struct udphdr) >> 2) <<
704 				    I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
705 			}
706 			break;
707 
708 		case IPPROTO_SCTP:
709 			if (mp->m_pkthdr.csum_flags & (CSUM_SCTP|CSUM_SCTP_IPV6)) {
710 				*cmd |= I40E_TX_DESC_CMD_L4T_EOFT_SCTP;
711 				*off |= (sizeof(struct sctphdr) >> 2) <<
712 				    I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
713 			}
714 			/* Fall Thru */
715 		default:
716 			break;
717 	}
718 
719         return (0);
720 }
721 
722 
723 /**********************************************************************
724  *
725  *  Setup context for hardware segmentation offload (TSO)
726  *
727  **********************************************************************/
728 static bool
729 ixl_tso_setup(struct ixl_queue *que, struct mbuf *mp)
730 {
731 	struct tx_ring			*txr = &que->txr;
732 	struct i40e_tx_context_desc	*TXD;
733 	struct ixl_tx_buf		*buf;
734 	u32				cmd, mss, type, tsolen;
735 	u16				etype;
736 	int				idx, elen, ip_hlen, tcp_hlen;
737 	struct ether_vlan_header	*eh;
738 #ifdef INET
739 	struct ip			*ip;
740 #endif
741 #ifdef INET6
742 	struct ip6_hdr			*ip6;
743 #endif
744 #if defined(INET6) || defined(INET)
745 	struct tcphdr			*th;
746 #endif
747 	u64				type_cmd_tso_mss;
748 
749 	/*
750 	 * Determine where frame payload starts.
751 	 * Jump over vlan headers if already present
752 	 */
753 	eh = mtod(mp, struct ether_vlan_header *);
754 	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
755 		elen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
756 		etype = eh->evl_proto;
757 	} else {
758 		elen = ETHER_HDR_LEN;
759 		etype = eh->evl_encap_proto;
760 	}
761 
762         switch (ntohs(etype)) {
763 #ifdef INET6
764 	case ETHERTYPE_IPV6:
765 		ip6 = (struct ip6_hdr *)(mp->m_data + elen);
766 		if (ip6->ip6_nxt != IPPROTO_TCP)
767 			return (FALSE);
768 		ip_hlen = sizeof(struct ip6_hdr);
769 		th = (struct tcphdr *)((caddr_t)ip6 + ip_hlen);
770 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
771 		tcp_hlen = th->th_off << 2;
772 		break;
773 #endif
774 #ifdef INET
775 	case ETHERTYPE_IP:
776 		ip = (struct ip *)(mp->m_data + elen);
777 		if (ip->ip_p != IPPROTO_TCP)
778 			return (FALSE);
779 		ip->ip_sum = 0;
780 		ip_hlen = ip->ip_hl << 2;
781 		th = (struct tcphdr *)((caddr_t)ip + ip_hlen);
782 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
783 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
784 		tcp_hlen = th->th_off << 2;
785 		break;
786 #endif
787 	default:
788 		printf("%s: CSUM_TSO but no supported IP version (0x%04x)\n",
789 		    __func__, ntohs(etype));
790 		return FALSE;
791         }
792 
793         /* Ensure we have at least the IP+TCP header in the first mbuf. */
794         if (mp->m_len < elen + ip_hlen + sizeof(struct tcphdr))
795 		return FALSE;
796 
797 	idx = txr->next_avail;
798 	buf = &txr->buffers[idx];
799 	TXD = (struct i40e_tx_context_desc *) &txr->base[idx];
800 	tsolen = mp->m_pkthdr.len - (elen + ip_hlen + tcp_hlen);
801 
802 	type = I40E_TX_DESC_DTYPE_CONTEXT;
803 	cmd = I40E_TX_CTX_DESC_TSO;
804 	mss = mp->m_pkthdr.tso_segsz;
805 
806 	type_cmd_tso_mss = ((u64)type << I40E_TXD_CTX_QW1_DTYPE_SHIFT) |
807 	    ((u64)cmd << I40E_TXD_CTX_QW1_CMD_SHIFT) |
808 	    ((u64)tsolen << I40E_TXD_CTX_QW1_TSO_LEN_SHIFT) |
809 	    ((u64)mss << I40E_TXD_CTX_QW1_MSS_SHIFT);
810 	TXD->type_cmd_tso_mss = htole64(type_cmd_tso_mss);
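	/*
	** tsolen covers only the TCP payload (packet length minus the
	** L2/L3/L4 headers); the context descriptor consumes one ring
	** slot ahead of the frame's data descriptors.
	*/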
811 
812 	TXD->tunneling_params = htole32(0);
813 	buf->m_head = NULL;
814 	buf->eop_index = -1;
815 
816 	if (++idx == que->num_desc)
817 		idx = 0;
818 
819 	txr->avail--;
820 	txr->next_avail = idx;
821 
822 	return TRUE;
823 }
824 
825 /*
826 ** ixl_get_tx_head - Retrieve the value from the
827 **    location where the HW records its HEAD index
828 */
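/*
** With head write-back enabled, the hardware DMAs its HEAD index
** into host memory just past the last descriptor, which is why
** base[que->num_desc] is read here.
*/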
829 static inline u32
830 ixl_get_tx_head(struct ixl_queue *que)
831 {
832 	struct tx_ring  *txr = &que->txr;
833 	void *head = &txr->base[que->num_desc];
834 	return LE32_TO_CPU(*(volatile __le32 *)head);
835 }
836 
837 /**********************************************************************
838  *
839  *  Examine each tx_buffer in the used queue. If the hardware is done
840  *  processing the packet then free associated resources. The
841  *  tx_buffer is put back on the free queue.
842  *
843  **********************************************************************/
844 bool
845 ixl_txeof(struct ixl_queue *que)
846 {
847 	struct tx_ring		*txr = &que->txr;
848 	u32			first, last, head, done, processed;
849 	struct ixl_tx_buf	*buf;
850 	struct i40e_tx_desc	*tx_desc, *eop_desc;
851 
852 
853 	mtx_assert(&txr->mtx, MA_OWNED);
854 
855 #ifdef DEV_NETMAP
856 	// XXX todo: implement moderation
857 	if (netmap_tx_irq(que->vsi->ifp, que->me))
858 		return FALSE;
859 #endif /* DEV_NETMAP */
860 
861 	/* These are not the descriptors you seek, move along :) */
862 	if (txr->avail == que->num_desc) {
863 		que->busy = 0;
864 		return FALSE;
865 	}
866 
867 	processed = 0;
868 	first = txr->next_to_clean;
869 	buf = &txr->buffers[first];
870 	tx_desc = (struct i40e_tx_desc *)&txr->base[first];
871 	last = buf->eop_index;
872 	if (last == -1)
873 		return FALSE;
874 	eop_desc = (struct i40e_tx_desc *)&txr->base[last];
875 
876 	/* Get the Head WB value */
877 	head = ixl_get_tx_head(que);
878 
879 	/*
880 	** Get the index of the first descriptor
881 	** BEYOND the EOP and call that 'done'.
882 	** I do this so the comparison in the
883 	** inner while loop below can be simple
884 	*/
885 	if (++last == que->num_desc) last = 0;
886 	done = last;
887 
888         bus_dmamap_sync(txr->dma.tag, txr->dma.map,
889             BUS_DMASYNC_POSTREAD);
890 	/*
891 	** The HEAD index of the ring is written to a
892 	** defined location; this, rather than a done bit,
893 	** is what is used to keep track of what must be
894 	** 'cleaned'.
895 	*/
896 	while (first != head) {
897 		/* We clean the range of the packet */
898 		while (first != done) {
899 			++txr->avail;
900 			++processed;
901 
902 			if (buf->m_head) {
903 				txr->bytes += /* for ITR adjustment */
904 				    buf->m_head->m_pkthdr.len;
905 				txr->tx_bytes += /* for TX stats */
906 				    buf->m_head->m_pkthdr.len;
907 				bus_dmamap_sync(buf->tag,
908 				    buf->map,
909 				    BUS_DMASYNC_POSTWRITE);
910 				bus_dmamap_unload(buf->tag,
911 				    buf->map);
912 				m_freem(buf->m_head);
913 				buf->m_head = NULL;
914 				buf->map = NULL;
915 			}
916 			buf->eop_index = -1;
917 
918 			if (++first == que->num_desc)
919 				first = 0;
920 
921 			buf = &txr->buffers[first];
922 			tx_desc = &txr->base[first];
923 		}
924 		++txr->packets;
925 		/* See if there is more work now */
926 		last = buf->eop_index;
927 		if (last != -1) {
928 			eop_desc = &txr->base[last];
929 			/* Get next done point */
930 			if (++last == que->num_desc) last = 0;
931 			done = last;
932 		} else
933 			break;
934 	}
935 	bus_dmamap_sync(txr->dma.tag, txr->dma.map,
936 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
937 
938 	txr->next_to_clean = first;
939 
940 
941 	/*
942 	** Hang detection: we know there's work
943 	** outstanding or the first return above
944 	** would have been taken, so indicate an
945 	** unsuccessful pass. In local_timer, if
946 	** the value grows too large the queue
947 	** will be considered hung. If anything
948 	** has been cleaned, reset the state.
949 	*/
950 	if ((processed == 0) && (que->busy != IXL_QUEUE_HUNG))
951 		++que->busy;
952 
953 	if (processed)
954 		que->busy = 1; /* Note this turns off HUNG */
955 
956 	/*
957 	 * If there are no pending descriptors, clear the timeout.
958 	 */
959 	if (txr->avail == que->num_desc) {
960 		que->busy = 0;
961 		return FALSE;
962 	}
963 
964 	return TRUE;
965 }
966 
967 /*********************************************************************
968  *
969  *  Refresh mbuf buffers for RX descriptor rings
970 *   - now keeps its own state so discards due to resource
971 *     exhaustion are unnecessary; if an mbuf cannot be obtained
972 *     it just returns, keeping its placeholder, so it can simply
973 *     be called again to retry.
974  *
975  **********************************************************************/
976 static void
977 ixl_refresh_mbufs(struct ixl_queue *que, int limit)
978 {
979 	struct ixl_vsi		*vsi = que->vsi;
980 	struct rx_ring		*rxr = &que->rxr;
981 	bus_dma_segment_t	hseg[1];
982 	bus_dma_segment_t	pseg[1];
983 	struct ixl_rx_buf	*buf;
984 	struct mbuf		*mh, *mp;
985 	int			i, j, nsegs, error;
986 	bool			refreshed = FALSE;
987 
988 	i = j = rxr->next_refresh;
989 	/* Control the loop with one beyond */
990 	if (++j == que->num_desc)
991 		j = 0;
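	/*
	** j runs one slot ahead of i and is compared against 'limit',
	** so the refresh never catches up to the descriptor the caller
	** is still working on.
	*/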
992 
993 	while (j != limit) {
994 		buf = &rxr->buffers[i];
995 		if (rxr->hdr_split == FALSE)
996 			goto no_split;
997 
998 		if (buf->m_head == NULL) {
999 			mh = m_gethdr(M_NOWAIT, MT_DATA);
1000 			if (mh == NULL)
1001 				goto update;
1002 		} else
1003 			mh = buf->m_head;
1004 
1005 		mh->m_pkthdr.len = mh->m_len = MHLEN;
1006 		mh->m_len = MHLEN;
1007 		mh->m_flags |= M_PKTHDR;
1008 		/* Get the memory mapping */
1009 		error = bus_dmamap_load_mbuf_sg(rxr->htag,
1010 		    buf->hmap, mh, hseg, &nsegs, BUS_DMA_NOWAIT);
1011 		if (error != 0) {
1012 			printf("Refresh mbufs: hdr dmamap load"
1013 			    " failure - %d\n", error);
1014 			m_free(mh);
1015 			buf->m_head = NULL;
1016 			goto update;
1017 		}
1018 		buf->m_head = mh;
1019 		bus_dmamap_sync(rxr->htag, buf->hmap,
1020 		    BUS_DMASYNC_PREREAD);
1021 		rxr->base[i].read.hdr_addr =
1022 		   htole64(hseg[0].ds_addr);
1023 
1024 no_split:
1025 		if (buf->m_pack == NULL) {
1026 			mp = m_getjcl(M_NOWAIT, MT_DATA,
1027 			    M_PKTHDR, rxr->mbuf_sz);
1028 			if (mp == NULL)
1029 				goto update;
1030 		} else
1031 			mp = buf->m_pack;
1032 
1033 		mp->m_pkthdr.len = mp->m_len = rxr->mbuf_sz;
1034 		/* Get the memory mapping */
1035 		error = bus_dmamap_load_mbuf_sg(rxr->ptag,
1036 		    buf->pmap, mp, pseg, &nsegs, BUS_DMA_NOWAIT);
1037 		if (error != 0) {
1038 			printf("Refresh mbufs: payload dmamap load"
1039 			    " failure - %d\n", error);
1040 			m_free(mp);
1041 			buf->m_pack = NULL;
1042 			goto update;
1043 		}
1044 		buf->m_pack = mp;
1045 		bus_dmamap_sync(rxr->ptag, buf->pmap,
1046 		    BUS_DMASYNC_PREREAD);
1047 		rxr->base[i].read.pkt_addr =
1048 		   htole64(pseg[0].ds_addr);
1049 		/* Used only when doing header split */
1050 		rxr->base[i].read.hdr_addr = 0;
1051 
1052 		refreshed = TRUE;
1053 		/* Next is precalculated */
1054 		i = j;
1055 		rxr->next_refresh = i;
1056 		if (++j == que->num_desc)
1057 			j = 0;
1058 	}
1059 update:
1060 	if (refreshed) /* Update hardware tail index */
1061 		wr32(vsi->hw, rxr->tail, rxr->next_refresh);
1062 	return;
1063 }
1064 
1065 
1066 /*********************************************************************
1067  *
1068  *  Allocate memory for rx_buffer structures. Since we use one
1069 *  rx_buffer per descriptor, the maximum number of rx_buffers
1070 *  that we'll need is equal to the number of receive descriptors
1071  *  that we've defined.
1072  *
1073  **********************************************************************/
1074 int
1075 ixl_allocate_rx_data(struct ixl_queue *que)
1076 {
1077 	struct rx_ring		*rxr = &que->rxr;
1078 	struct ixl_vsi		*vsi = que->vsi;
1079 	device_t 		dev = vsi->dev;
1080 	struct ixl_rx_buf 	*buf;
1081 	int             	i, bsize, error;
1082 
1083 	bsize = sizeof(struct ixl_rx_buf) * que->num_desc;
1084 	if (!(rxr->buffers =
1085 	    (struct ixl_rx_buf *) malloc(bsize,
1086 	    M_DEVBUF, M_NOWAIT | M_ZERO))) {
1087 		device_printf(dev, "Unable to allocate rx_buffer memory\n");
1088 		error = ENOMEM;
1089 		return (error);
1090 	}
1091 
1092 	if ((error = bus_dma_tag_create(NULL,	/* parent */
1093 				   1, 0,	/* alignment, bounds */
1094 				   BUS_SPACE_MAXADDR,	/* lowaddr */
1095 				   BUS_SPACE_MAXADDR,	/* highaddr */
1096 				   NULL, NULL,		/* filter, filterarg */
1097 				   MSIZE,		/* maxsize */
1098 				   1,			/* nsegments */
1099 				   MSIZE,		/* maxsegsize */
1100 				   0,			/* flags */
1101 				   NULL,		/* lockfunc */
1102 				   NULL,		/* lockfuncarg */
1103 				   &rxr->htag))) {
1104 		device_printf(dev, "Unable to create RX DMA htag\n");
1105 		return (error);
1106 	}
1107 
1108 	if ((error = bus_dma_tag_create(NULL,	/* parent */
1109 				   1, 0,	/* alignment, bounds */
1110 				   BUS_SPACE_MAXADDR,	/* lowaddr */
1111 				   BUS_SPACE_MAXADDR,	/* highaddr */
1112 				   NULL, NULL,		/* filter, filterarg */
1113 				   MJUM16BYTES,		/* maxsize */
1114 				   1,			/* nsegments */
1115 				   MJUM16BYTES,		/* maxsegsize */
1116 				   0,			/* flags */
1117 				   NULL,		/* lockfunc */
1118 				   NULL,		/* lockfuncarg */
1119 				   &rxr->ptag))) {
1120 		device_printf(dev, "Unable to create RX DMA ptag\n");
1121 		return (error);
1122 	}
1123 
1124 	for (i = 0; i < que->num_desc; i++) {
1125 		buf = &rxr->buffers[i];
1126 		error = bus_dmamap_create(rxr->htag,
1127 		    BUS_DMA_NOWAIT, &buf->hmap);
1128 		if (error) {
1129 			device_printf(dev, "Unable to create RX head map\n");
1130 			break;
1131 		}
1132 		error = bus_dmamap_create(rxr->ptag,
1133 		    BUS_DMA_NOWAIT, &buf->pmap);
1134 		if (error) {
1135 			device_printf(dev, "Unable to create RX pkt map\n");
1136 			break;
1137 		}
1138 	}
1139 
1140 	return (error);
1141 }
1142 
1143 
1144 /*********************************************************************
1145  *
1146  *  (Re)Initialize the queue receive ring and its buffers.
1147  *
1148  **********************************************************************/
1149 int
1150 ixl_init_rx_ring(struct ixl_queue *que)
1151 {
1152 	struct	rx_ring 	*rxr = &que->rxr;
1153 	struct ixl_vsi		*vsi = que->vsi;
1154 #if defined(INET6) || defined(INET)
1155 	struct ifnet		*ifp = vsi->ifp;
1156 	struct lro_ctrl		*lro = &rxr->lro;
1157 #endif
1158 	struct ixl_rx_buf	*buf;
1159 	bus_dma_segment_t	pseg[1], hseg[1];
1160 	int			rsize, nsegs, error = 0;
1161 #ifdef DEV_NETMAP
1162 	struct netmap_adapter *na = NA(que->vsi->ifp);
1163 	struct netmap_slot *slot;
1164 #endif /* DEV_NETMAP */
1165 
1166 	IXL_RX_LOCK(rxr);
1167 #ifdef DEV_NETMAP
1168 	/* same as in ixl_init_tx_ring() */
1169 	slot = netmap_reset(na, NR_RX, que->me, 0);
1170 #endif /* DEV_NETMAP */
1171 	/* Clear the ring contents */
1172 	rsize = roundup2(que->num_desc *
1173 	    sizeof(union i40e_rx_desc), DBA_ALIGN);
1174 	bzero((void *)rxr->base, rsize);
1175 	/* Cleanup any existing buffers */
1176 	for (int i = 0; i < que->num_desc; i++) {
1177 		buf = &rxr->buffers[i];
1178 		if (buf->m_head != NULL) {
1179 			bus_dmamap_sync(rxr->htag, buf->hmap,
1180 			    BUS_DMASYNC_POSTREAD);
1181 			bus_dmamap_unload(rxr->htag, buf->hmap);
1182 			buf->m_head->m_flags |= M_PKTHDR;
1183 			m_freem(buf->m_head);
1184 		}
1185 		if (buf->m_pack != NULL) {
1186 			bus_dmamap_sync(rxr->ptag, buf->pmap,
1187 			    BUS_DMASYNC_POSTREAD);
1188 			bus_dmamap_unload(rxr->ptag, buf->pmap);
1189 			buf->m_pack->m_flags |= M_PKTHDR;
1190 			m_freem(buf->m_pack);
1191 		}
1192 		buf->m_head = NULL;
1193 		buf->m_pack = NULL;
1194 	}
1195 
1196 	/* header split is off */
1197 	rxr->hdr_split = FALSE;
1198 
1199 	/* Now replenish the mbufs */
1200 	for (int j = 0; j != que->num_desc; ++j) {
1201 		struct mbuf	*mh, *mp;
1202 
1203 		buf = &rxr->buffers[j];
1204 #ifdef DEV_NETMAP
1205 		/*
1206 		 * In netmap mode, fill the map and set the buffer
1207 		 * address in the NIC ring, considering the offset
1208 		 * between the netmap and NIC rings (see comment in
1209 		 * ixgbe_setup_transmit_ring() ). No need to allocate
1210 		 * ixl_init_tx_ring()). No need to allocate
1211 		 */
1212 		if (slot) {
1213 			int sj = netmap_idx_n2k(&na->rx_rings[que->me], j);
1214 			uint64_t paddr;
1215 			void *addr;
1216 
1217 			addr = PNMB(na, slot + sj, &paddr);
1218 			netmap_load_map(na, rxr->dma.tag, buf->pmap, addr);
1219 			/* Update descriptor and the cached value */
1220 			rxr->base[j].read.pkt_addr = htole64(paddr);
1221 			rxr->base[j].read.hdr_addr = 0;
1222 			continue;
1223 		}
1224 #endif /* DEV_NETMAP */
1225 
1226 		/*
1227 		** Don't allocate header mbufs if we're not
1228 		** doing header split; it's wasteful
1229 		*/
1230 		if (rxr->hdr_split == FALSE)
1231 			goto skip_head;
1232 
1233 		/* First the header */
1234 		buf->m_head = m_gethdr(M_NOWAIT, MT_DATA);
1235 		if (buf->m_head == NULL) {
1236 			error = ENOBUFS;
1237 			goto fail;
1238 		}
1239 		m_adj(buf->m_head, ETHER_ALIGN);
1240 		mh = buf->m_head;
1241 		mh->m_len = mh->m_pkthdr.len = MHLEN;
1242 		mh->m_flags |= M_PKTHDR;
1243 		/* Get the memory mapping */
1244 		error = bus_dmamap_load_mbuf_sg(rxr->htag,
1245 		    buf->hmap, buf->m_head, hseg,
1246 		    &nsegs, BUS_DMA_NOWAIT);
1247 		if (error != 0) /* Nothing elegant to do here */
1248 			goto fail;
1249 		bus_dmamap_sync(rxr->htag,
1250 		    buf->hmap, BUS_DMASYNC_PREREAD);
1251 		/* Update descriptor */
1252 		rxr->base[j].read.hdr_addr = htole64(hseg[0].ds_addr);
1253 
1254 skip_head:
1255 		/* Now the payload cluster */
1256 		buf->m_pack = m_getjcl(M_NOWAIT, MT_DATA,
1257 		    M_PKTHDR, rxr->mbuf_sz);
1258 		if (buf->m_pack == NULL) {
1259 			error = ENOBUFS;
1260                         goto fail;
1261 		}
1262 		mp = buf->m_pack;
1263 		mp->m_pkthdr.len = mp->m_len = rxr->mbuf_sz;
1264 		/* Get the memory mapping */
1265 		error = bus_dmamap_load_mbuf_sg(rxr->ptag,
1266 		    buf->pmap, mp, pseg,
1267 		    &nsegs, BUS_DMA_NOWAIT);
1268 		if (error != 0)
1269                         goto fail;
1270 		bus_dmamap_sync(rxr->ptag,
1271 		    buf->pmap, BUS_DMASYNC_PREREAD);
1272 		/* Update descriptor */
1273 		rxr->base[j].read.pkt_addr = htole64(pseg[0].ds_addr);
1274 		rxr->base[j].read.hdr_addr = 0;
1275 	}
1276 
1277 
1278 	/* Setup our descriptor indices */
1279 	rxr->next_check = 0;
1280 	rxr->next_refresh = 0;
1281 	rxr->lro_enabled = FALSE;
1282 	rxr->split = 0;
1283 	rxr->bytes = 0;
1284 	rxr->discard = FALSE;
1285 
1286 	wr32(vsi->hw, rxr->tail, que->num_desc - 1);
1287 	ixl_flush(vsi->hw);
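	/*
	** Tail is written as num_desc - 1 rather than 0, holding one
	** descriptor back from the hardware (the usual convention so a
	** full ring is distinguishable from an empty one).
	*/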
1288 
1289 #if defined(INET6) || defined(INET)
1290 	/*
1291 	** Now set up the LRO interface:
1292 	*/
1293 	if (ifp->if_capenable & IFCAP_LRO) {
1294 		int err = tcp_lro_init(lro);
1295 		if (err) {
1296 			if_printf(ifp, "queue %d: LRO Initialization failed!\n", que->me);
1297 			goto fail;
1298 		}
1299 		INIT_DBG_IF(ifp, "queue %d: RX Soft LRO Initialized", que->me);
1300 		rxr->lro_enabled = TRUE;
1301 		lro->ifp = vsi->ifp;
1302 	}
1303 #endif
1304 
1305 	bus_dmamap_sync(rxr->dma.tag, rxr->dma.map,
1306 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
1307 
1308 fail:
1309 	IXL_RX_UNLOCK(rxr);
1310 	return (error);
1311 }
1312 
1313 
1314 /*********************************************************************
1315  *
1316 *  Free queue receive ring data structures
1317  *
1318  **********************************************************************/
1319 void
1320 ixl_free_que_rx(struct ixl_queue *que)
1321 {
1322 	struct rx_ring		*rxr = &que->rxr;
1323 	struct ixl_rx_buf	*buf;
1324 
1325 	INIT_DBG_IF(que->vsi->ifp, "queue %d: begin", que->me);
1326 
1327 	/* Cleanup any existing buffers */
1328 	if (rxr->buffers != NULL) {
1329 		for (int i = 0; i < que->num_desc; i++) {
1330 			buf = &rxr->buffers[i];
1331 			if (buf->m_head != NULL) {
1332 				bus_dmamap_sync(rxr->htag, buf->hmap,
1333 				    BUS_DMASYNC_POSTREAD);
1334 				bus_dmamap_unload(rxr->htag, buf->hmap);
1335 				buf->m_head->m_flags |= M_PKTHDR;
1336 				m_freem(buf->m_head);
1337 			}
1338 			if (buf->m_pack != NULL) {
1339 				bus_dmamap_sync(rxr->ptag, buf->pmap,
1340 				    BUS_DMASYNC_POSTREAD);
1341 				bus_dmamap_unload(rxr->ptag, buf->pmap);
1342 				buf->m_pack->m_flags |= M_PKTHDR;
1343 				m_freem(buf->m_pack);
1344 			}
1345 			buf->m_head = NULL;
1346 			buf->m_pack = NULL;
1347 			if (buf->hmap != NULL) {
1348 				bus_dmamap_destroy(rxr->htag, buf->hmap);
1349 				buf->hmap = NULL;
1350 			}
1351 			if (buf->pmap != NULL) {
1352 				bus_dmamap_destroy(rxr->ptag, buf->pmap);
1353 				buf->pmap = NULL;
1354 			}
1355 		}
1356 		if (rxr->buffers != NULL) {
1357 			free(rxr->buffers, M_DEVBUF);
1358 			rxr->buffers = NULL;
1359 		}
1360 	}
1361 
1362 	if (rxr->htag != NULL) {
1363 		bus_dma_tag_destroy(rxr->htag);
1364 		rxr->htag = NULL;
1365 	}
1366 	if (rxr->ptag != NULL) {
1367 		bus_dma_tag_destroy(rxr->ptag);
1368 		rxr->ptag = NULL;
1369 	}
1370 
1371 	INIT_DBG_IF(que->vsi->ifp, "queue %d: end", que->me);
1372 	return;
1373 }
1374 
1375 static __inline void
1376 ixl_rx_input(struct rx_ring *rxr, struct ifnet *ifp, struct mbuf *m, u8 ptype)
1377 {
1378 
1379 #if defined(INET6) || defined(INET)
1380         /*
1381          * At the moment LRO is only done for IPv4/TCP packets whose TCP
1382          * checksum has been computed by hardware, and which carry no VLAN
1383          * tag in the Ethernet header.
1384          */
1385         if (rxr->lro_enabled &&
1386             (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) != 0 &&
1387             (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) ==
1388             (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) {
1389                 /*
1390                  * Send to the stack if:
1391                  *  - LRO not enabled, or
1392                  *  - no LRO resources, or
1393                  *  - LRO enqueue fails
1394                  */
1395                 if (rxr->lro.lro_cnt != 0)
1396                         if (tcp_lro_rx(&rxr->lro, m, 0) == 0)
1397                                 return;
1398         }
1399 #endif
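	/*
	** Drop the RX lock around if_input() so the queue is not held
	** locked while the stack processes the packet.
	*/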
1400 	IXL_RX_UNLOCK(rxr);
1401         (*ifp->if_input)(ifp, m);
1402 	IXL_RX_LOCK(rxr);
1403 }
1404 
1405 
1406 static __inline void
1407 ixl_rx_discard(struct rx_ring *rxr, int i)
1408 {
1409 	struct ixl_rx_buf	*rbuf;
1410 
1411 	rbuf = &rxr->buffers[i];
1412 
1413         if (rbuf->fmp != NULL) {/* Partial chain ? */
1414 		rbuf->fmp->m_flags |= M_PKTHDR;
1415                 m_freem(rbuf->fmp);
1416                 rbuf->fmp = NULL;
1417 	}
1418 
1419 	/*
1420 	** With advanced descriptors the writeback
1421 	** clobbers the buffer addresses, so it's easier
1422 	** to just free the existing mbufs and take
1423 	** the normal refresh path to get new buffers
1424 	** and mappings.
1425 	*/
1426 	if (rbuf->m_head) {
1427 		m_free(rbuf->m_head);
1428 		rbuf->m_head = NULL;
1429 	}
1430 
1431 	if (rbuf->m_pack) {
1432 		m_free(rbuf->m_pack);
1433 		rbuf->m_pack = NULL;
1434 	}
1435 
1436 	return;
1437 }
1438 
1439 #ifdef RSS
1440 /*
1441 ** ixl_ptype_to_hash: parse the packet type
1442 ** to determine the appropriate hash.
1443 */
1444 static inline int
1445 ixl_ptype_to_hash(u8 ptype)
1446 {
1447         struct i40e_rx_ptype_decoded	decoded;
1448 	u8				ex = 0;
1449 
1450 	decoded = decode_rx_desc_ptype(ptype);
1451 	ex = decoded.outer_frag;
1452 
1453 	if (!decoded.known)
1454 		return M_HASHTYPE_OPAQUE;
1455 
1456 	if (decoded.outer_ip == I40E_RX_PTYPE_OUTER_L2)
1457 		return M_HASHTYPE_OPAQUE;
1458 
1459 	/* Note: anything that gets to this point is IP */
1460         if (decoded.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV6) {
1461 		switch (decoded.inner_prot) {
1462 			case I40E_RX_PTYPE_INNER_PROT_TCP:
1463 				if (ex)
1464 					return M_HASHTYPE_RSS_TCP_IPV6_EX;
1465 				else
1466 					return M_HASHTYPE_RSS_TCP_IPV6;
1467 			case I40E_RX_PTYPE_INNER_PROT_UDP:
1468 				if (ex)
1469 					return M_HASHTYPE_RSS_UDP_IPV6_EX;
1470 				else
1471 					return M_HASHTYPE_RSS_UDP_IPV6;
1472 			default:
1473 				if (ex)
1474 					return M_HASHTYPE_RSS_IPV6_EX;
1475 				else
1476 					return M_HASHTYPE_RSS_IPV6;
1477 		}
1478 	}
1479         if (decoded.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV4) {
1480 		switch (decoded.inner_prot) {
1481 			case I40E_RX_PTYPE_INNER_PROT_TCP:
1482 					return M_HASHTYPE_RSS_TCP_IPV4;
1483 			case I40E_RX_PTYPE_INNER_PROT_UDP:
1484 				if (ex)
1485 					return M_HASHTYPE_RSS_UDP_IPV4_EX;
1486 				else
1487 					return M_HASHTYPE_RSS_UDP_IPV4;
1488 			default:
1489 					return M_HASHTYPE_RSS_IPV4;
1490 		}
1491 	}
1492 	/* We should never get here!! */
1493 	return M_HASHTYPE_OPAQUE;
1494 }
1495 #endif /* RSS */
1496 
1497 /*********************************************************************
1498  *
1499  *  This routine executes in interrupt context. It replenishes
1500 *  the mbufs in the descriptor ring and sends data that has been
1501 *  DMA'd into host memory to the upper layer.
1502  *
1503  *  We loop at most count times if count is > 0, or until done if
1504  *  count < 0.
1505  *
1506  *  Return TRUE for more work, FALSE for all clean.
1507  *********************************************************************/
1508 bool
1509 ixl_rxeof(struct ixl_queue *que, int count)
1510 {
1511 	struct ixl_vsi		*vsi = que->vsi;
1512 	struct rx_ring		*rxr = &que->rxr;
1513 	struct ifnet		*ifp = vsi->ifp;
1514 #if defined(INET6) || defined(INET)
1515 	struct lro_ctrl		*lro = &rxr->lro;
1516 	struct lro_entry	*queued;
1517 #endif
1518 	int			i, nextp, processed = 0;
1519 	union i40e_rx_desc	*cur;
1520 	struct ixl_rx_buf	*rbuf, *nbuf;
1521 
1522 
1523 	IXL_RX_LOCK(rxr);
1524 
1525 #ifdef DEV_NETMAP
1526 	if (netmap_rx_irq(ifp, que->me, &count)) {
1527 		IXL_RX_UNLOCK(rxr);
1528 		return (FALSE);
1529 	}
1530 #endif /* DEV_NETMAP */
1531 
1532 	for (i = rxr->next_check; count != 0;) {
1533 		struct mbuf	*sendmp, *mh, *mp;
1534 		u32		rsc, status, error;
1535 		u16		hlen, plen, vtag;
1536 		u64		qword;
1537 		u8		ptype;
1538 		bool		eop;
1539 
1540 		/* Sync the ring. */
1541 		bus_dmamap_sync(rxr->dma.tag, rxr->dma.map,
1542 		    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
1543 
1544 		cur = &rxr->base[i];
1545 		qword = le64toh(cur->wb.qword1.status_error_len);
1546 		status = (qword & I40E_RXD_QW1_STATUS_MASK)
1547 		    >> I40E_RXD_QW1_STATUS_SHIFT;
1548 		error = (qword & I40E_RXD_QW1_ERROR_MASK)
1549 		    >> I40E_RXD_QW1_ERROR_SHIFT;
1550 		plen = (qword & I40E_RXD_QW1_LENGTH_PBUF_MASK)
1551 		    >> I40E_RXD_QW1_LENGTH_PBUF_SHIFT;
1552 		hlen = (qword & I40E_RXD_QW1_LENGTH_HBUF_MASK)
1553 		    >> I40E_RXD_QW1_LENGTH_HBUF_SHIFT;
1554 		ptype = (qword & I40E_RXD_QW1_PTYPE_MASK)
1555 		    >> I40E_RXD_QW1_PTYPE_SHIFT;
1556 
1557 		if ((status & (1 << I40E_RX_DESC_STATUS_DD_SHIFT)) == 0) {
1558 			++rxr->not_done;
1559 			break;
1560 		}
1561 		if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
1562 			break;
1563 
1564 		count--;
1565 		sendmp = NULL;
1566 		nbuf = NULL;
1567 		rsc = 0;
1568 		cur->wb.qword1.status_error_len = 0;
1569 		rbuf = &rxr->buffers[i];
1570 		mh = rbuf->m_head;
1571 		mp = rbuf->m_pack;
1572 		eop = (status & (1 << I40E_RX_DESC_STATUS_EOF_SHIFT));
1573 		if (status & (1 << I40E_RX_DESC_STATUS_L2TAG1P_SHIFT))
1574 			vtag = le16toh(cur->wb.qword0.lo_dword.l2tag1);
1575 		else
1576 			vtag = 0;
1577 
1578 		/*
1579 		** Make sure bad packets are discarded;
1580 		** note that only the EOP descriptor has valid
1581 		** error results.
1582 		*/
1583                 if (eop && (error & (1 << I40E_RX_DESC_ERROR_RXE_SHIFT))) {
1584 			rxr->discarded++;
1585 			ixl_rx_discard(rxr, i);
1586 			goto next_desc;
1587 		}
1588 
1589 		/* Prefetch the next buffer */
1590 		if (!eop) {
1591 			nextp = i + 1;
1592 			if (nextp == que->num_desc)
1593 				nextp = 0;
1594 			nbuf = &rxr->buffers[nextp];
1595 			prefetch(nbuf);
1596 		}
1597 
1598 		/*
1599 		** The header mbuf is ONLY used when header
1600 		** split is enabled; otherwise we get normal
1601 		** behavior, i.e., both header and payload
1602 		** are DMA'd into the payload buffer.
1603 		**
1604 		** Rather than using the fmp/lmp global pointers
1605 		** we now keep the head of a packet chain in the
1606 		** buffer struct and pass this along from one
1607 		** descriptor to the next, until we get EOP.
1608 		*/
1609 		if (rxr->hdr_split && (rbuf->fmp == NULL)) {
1610 			if (hlen > IXL_RX_HDR)
1611 				hlen = IXL_RX_HDR;
1612 			mh->m_len = hlen;
1613 			mh->m_flags |= M_PKTHDR;
1614 			mh->m_next = NULL;
1615 			mh->m_pkthdr.len = mh->m_len;
1616 			/* Null buf pointer so it is refreshed */
1617 			rbuf->m_head = NULL;
1618 			/*
1619 			** Check the payload length; this
1620 			** could be zero if it's a small
1621 			** packet.
1622 			*/
1623 			if (plen > 0) {
1624 				mp->m_len = plen;
1625 				mp->m_next = NULL;
1626 				mp->m_flags &= ~M_PKTHDR;
1627 				mh->m_next = mp;
1628 				mh->m_pkthdr.len += mp->m_len;
1629 				/* Null buf pointer so it is refreshed */
1630 				rbuf->m_pack = NULL;
1631 				rxr->split++;
1632 			}
1633 			/*
1634 			** Now create the forward
1635 			** chain so when complete
1636 			** we won't have to.
1637 			*/
1638                         if (eop == 0) {
1639 				/* stash the chain head */
1640                                 nbuf->fmp = mh;
1641 				/* Make forward chain */
1642                                 if (plen)
1643                                         mp->m_next = nbuf->m_pack;
1644                                 else
1645                                         mh->m_next = nbuf->m_pack;
1646                         } else {
1647 				/* Singlet, prepare to send */
1648                                 sendmp = mh;
1649                                 if (vtag) {
1650                                         sendmp->m_pkthdr.ether_vtag = vtag;
1651                                         sendmp->m_flags |= M_VLANTAG;
1652                                 }
1653                         }
1654 		} else {
1655 			/*
1656 			** Either no header split, or a
1657 			** secondary piece of a fragmented
1658 			** split packet.
1659 			*/
1660 			mp->m_len = plen;
1661 			/*
1662 			** See if there is a stored chain head
1663 			** that tells us what this buffer belongs to.
1664 			*/
1665 			sendmp = rbuf->fmp;
1666 			rbuf->m_pack = rbuf->fmp = NULL;
1667 
1668 			if (sendmp != NULL) /* secondary frag */
1669 				sendmp->m_pkthdr.len += mp->m_len;
1670 			else {
1671 				/* first desc of a non-ps chain */
1672 				sendmp = mp;
1673 				sendmp->m_flags |= M_PKTHDR;
1674 				sendmp->m_pkthdr.len = mp->m_len;
1675 				if (vtag) {
1676 					sendmp->m_pkthdr.ether_vtag = vtag;
1677 					sendmp->m_flags |= M_VLANTAG;
1678 				}
1679                         }
1680 			/* Pass the head pointer on */
1681 			if (eop == 0) {
1682 				nbuf->fmp = sendmp;
1683 				sendmp = NULL;
1684 				mp->m_next = nbuf->m_pack;
1685 			}
1686 		}
1687 		++processed;
1688 		/* Sending this frame? */
1689 		if (eop) {
1690 			sendmp->m_pkthdr.rcvif = ifp;
1691 			/* gather stats */
1692 			rxr->rx_packets++;
1693 			rxr->rx_bytes += sendmp->m_pkthdr.len;
1694 			/* capture data for dynamic ITR adjustment */
1695 			rxr->packets++;
1696 			rxr->bytes += sendmp->m_pkthdr.len;
1697 			if ((ifp->if_capenable & IFCAP_RXCSUM) != 0)
1698 				ixl_rx_checksum(sendmp, status, error, ptype);
1699 #ifdef RSS
1700 			sendmp->m_pkthdr.flowid =
1701 			    le32toh(cur->wb.qword0.hi_dword.rss);
1702 			M_HASHTYPE_SET(sendmp, ixl_ptype_to_hash(ptype));
1703 #else
1704 			sendmp->m_pkthdr.flowid = que->msix;
1705 			M_HASHTYPE_SET(sendmp, M_HASHTYPE_OPAQUE);
1706 #endif
1707 		}
1708 next_desc:
1709 		bus_dmamap_sync(rxr->dma.tag, rxr->dma.map,
1710 		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
1711 
1712 		/* Advance our pointers to the next descriptor. */
1713 		if (++i == que->num_desc)
1714 			i = 0;
1715 
1716 		/* Now send to the stack or do LRO */
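		/*
		** ixl_rx_input() drops the RX lock around if_input(), so
		** the ring index is parked in next_check before the call
		** and re-read afterwards.
		*/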
1717 		if (sendmp != NULL) {
1718 			rxr->next_check = i;
1719 			ixl_rx_input(rxr, ifp, sendmp, ptype);
1720 			i = rxr->next_check;
1721 		}
1722 
1723 		/* Refresh mbufs after every 8 processed descriptors */
1724 		if (processed == 8) {
1725 			ixl_refresh_mbufs(que, i);
1726 			processed = 0;
1727 		}
1728 	}
1729 
1730 	/* Refresh any remaining buf structs */
1731 	if (ixl_rx_unrefreshed(que))
1732 		ixl_refresh_mbufs(que, i);
1733 
1734 	rxr->next_check = i;
1735 
1736 #if defined(INET6) || defined(INET)
1737 	/*
1738 	 * Flush any outstanding LRO work
1739 	 */
1740 	while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) {
1741 		SLIST_REMOVE_HEAD(&lro->lro_active, next);
1742 		tcp_lro_flush(lro, queued);
1743 	}
1744 #endif
1745 
1746 	IXL_RX_UNLOCK(rxr);
1747 	return (FALSE);
1748 }
1749 
1750 
1751 /*********************************************************************
1752  *
1753  *  Verify that the hardware indicated that the checksum is valid.
1754 *  Inform the stack about the status of the checksum so that the
1755 *  stack doesn't spend time verifying it.
1756  *
1757  *********************************************************************/
1758 static void
1759 ixl_rx_checksum(struct mbuf * mp, u32 status, u32 error, u8 ptype)
1760 {
1761 	struct i40e_rx_ptype_decoded decoded;
1762 
1763 	decoded = decode_rx_desc_ptype(ptype);
1764 
1765 	/* Errors? */
1766  	if (error & ((1 << I40E_RX_DESC_ERROR_IPE_SHIFT) |
1767 	    (1 << I40E_RX_DESC_ERROR_L4E_SHIFT))) {
1768 		mp->m_pkthdr.csum_flags = 0;
1769 		return;
1770 	}
1771 
1772 	/* IPv6 with extension headers likely have bad csum */
1773 	/* IPv6 packets with extension headers likely have a bad csum */
1774 	    decoded.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV6)
1775 		if (status &
1776 		    (1 << I40E_RX_DESC_STATUS_IPV6EXADD_SHIFT)) {
1777 			mp->m_pkthdr.csum_flags = 0;
1778 			return;
1779 		}
1780 
1781 
1782 	/* IP Checksum Good */
1783 	mp->m_pkthdr.csum_flags = CSUM_IP_CHECKED;
1784 	mp->m_pkthdr.csum_flags |= CSUM_IP_VALID;
1785 
1786 	if (status & (1 << I40E_RX_DESC_STATUS_L3L4P_SHIFT)) {
1787 		mp->m_pkthdr.csum_flags |=
1788 		    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
1789 		mp->m_pkthdr.csum_data |= htons(0xffff);
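		/*
		** CSUM_DATA_VALID | CSUM_PSEUDO_HDR with a csum_data of
		** 0xffff tells the stack the L4 checksum verified all the
		** way through the pseudo-header, so it can skip its own
		** check.
		*/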
1790 	}
1791 	return;
1792 }
1793 
1794 #if __FreeBSD_version >= 1100000
1795 uint64_t
1796 ixl_get_counter(if_t ifp, ift_counter cnt)
1797 {
1798 	struct ixl_vsi *vsi;
1799 
1800 	vsi = if_getsoftc(ifp);
1801 
1802 	switch (cnt) {
1803 	case IFCOUNTER_IPACKETS:
1804 		return (vsi->ipackets);
1805 	case IFCOUNTER_IERRORS:
1806 		return (vsi->ierrors);
1807 	case IFCOUNTER_OPACKETS:
1808 		return (vsi->opackets);
1809 	case IFCOUNTER_OERRORS:
1810 		return (vsi->oerrors);
1811 	case IFCOUNTER_COLLISIONS:
1812 		/* Collisions are by standard impossible in 40G/10G Ethernet */
1813 		return (0);
1814 	case IFCOUNTER_IBYTES:
1815 		return (vsi->ibytes);
1816 	case IFCOUNTER_OBYTES:
1817 		return (vsi->obytes);
1818 	case IFCOUNTER_IMCASTS:
1819 		return (vsi->imcasts);
1820 	case IFCOUNTER_OMCASTS:
1821 		return (vsi->omcasts);
1822 	case IFCOUNTER_IQDROPS:
1823 		return (vsi->iqdrops);
1824 	case IFCOUNTER_OQDROPS:
1825 		return (vsi->oqdrops);
1826 	case IFCOUNTER_NOPROTO:
1827 		return (vsi->noproto);
1828 	default:
1829 		return (if_get_counter_default(ifp, cnt));
1830 	}
1831 }
1832 #endif
1833 
1834